def apply_confirm_events(storage, stats, msg):
    """Apply confirm events (if present)

    (BULK)CONFIRM events can be passed in a file.
    The name of the file is msg['confirms'].

    :param storage:
    :param stats:
    :param msg:
    :return:
    """
    confirms = msg.get('confirms')
    # SKIP confirms for relations
    catalogue = msg['header'].get('catalogue', "")
    if confirms and catalogue != 'rel':
        reader = ContentsReader(confirms)
        with ProgressTicker("Apply CONFIRM events", 10000) as progress:
            for event in reader.items():
                progress.tick()
                action = event['event']
                assert action in ['CONFIRM', 'BULKCONFIRM']
                # get confirm data: BULKCONFIRM => data.confirms, CONFIRM => [data]
                confirm_data = event['data'].get('confirms', [event['data']])
                storage.apply_confirms(confirm_data, msg['header']['timestamp'])
                stats.add_applied('CONFIRM', len(confirm_data))
        reader.close()

    if confirms:
        # Remove the file after it has been handled (or skipped)
        os.remove(confirms)
        del msg['confirms']
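For reference, a minimal sketch of the two event shapes that reader.items() yields, inferred from the handling code above; the payload field names are hypothetical and the on-disk encoding is left to ContentsReader/ContentsWriter.

# Shapes inferred from apply_confirm_events above; payload fields are hypothetical.
confirm_event = {
    "event": "CONFIRM",
    "data": {"_tid": "1234", "_last_event": 42},
}
bulkconfirm_event = {
    "event": "BULKCONFIRM",
    "data": {
        "confirms": [
            {"_tid": "1234", "_last_event": 42},
            {"_tid": "5678", "_last_event": 43},
        ]
    },
}

# Both shapes normalise to a list of confirm records:
for event in (confirm_event, bulkconfirm_event):
    confirm_data = event["data"].get("confirms", [event["data"]])
    assert isinstance(confirm_data, list)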
def apply_events(storage, last_events, start_after, stats):
    """Apply any unhandled events to the database

    :param storage: GOB (events + entities)
    :param last_events:
    :param start_after: the id of the last event that has been applied to the storage
    :param stats: update statistics for this action
    :return:
    """
    with ActiveGarbageCollection("Apply events"), storage.get_session() as session:
        logger.info("Apply events")

        PROCESS_PER = 10000
        add_event_tids = set()
        with ProgressTicker("Apply events", PROCESS_PER) as progress:
            unhandled_events = storage.get_events_starting_after(start_after, PROCESS_PER)
            while unhandled_events:
                with EventApplicator(storage) as event_applicator:
                    for event in unhandled_events:
                        progress.tick()

                        gob_event, count, applied_events = event_applicator.apply(event, last_events, add_event_tids)
                        action = gob_event.action
                        stats.add_applied(action, count)
                        start_after = event.eventid

                        # Remove event from the session, to avoid trying to update the event db object
                        session.expunge(event)

                    event_applicator.apply_all()

                unhandled_events = storage.get_events_starting_after(start_after, PROCESS_PER)
def ndjson_exporter(api, file, format=None, append=False, filter: EntityFilter = None):
    """Exports entities in Newline Delimited JSON (NDJSON) format, one entity per line

    :param api: API reader instance
    :param file: name of the file to write results
    :param format: NA
    :param append: NA
    :return: number of rows exported
    """
    if append:
        raise NotImplementedError("Appending not implemented for this exporter")

    row_count = 0
    with open(file, 'w') as fp, ProgressTicker("Export entities", 10000) as progress:
        for entity in api:
            if filter and not filter.filter(entity):
                continue

            result = json.dumps(entity)
            fp.write(result + '\n')
            row_count += 1
            progress.tick()

    return row_count
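A minimal usage sketch: the function only iterates over the API reader, so any iterable of JSON-serialisable entities can stand in for it here; the file path is illustrative.

entities = [
    {"identificatie": "1", "naam": "Centrum"},
    {"identificatie": "2", "naam": "Noord"},
]

# Any iterable of JSON-serialisable dicts can stand in for the API reader.
count = ndjson_exporter(entities, "/tmp/entities.ndjson")
assert count == 2  # one JSON document per line in the output file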
def _store_events(storage, last_events, events, stats):
    """Store events in GOB

    Only valid events are stored, other events are skipped (with an associated warning).
    The events are added to the database in bulk.

    :param storage: GOB (events + entities)
    :param last_events:
    :param events: the events to process
    :param stats: update statistics for this action
    :return:
    """
    with ActiveGarbageCollection("Store events"), storage.get_session():
        # Use a session to commit all or rollback on any error
        logger.info("Store events")

        with ProgressTicker("Store events", 10000) as progress, \
                EventCollector(storage, last_events) as event_collector:
            for event in events:
                progress.tick()
                if event_collector.collect(event):
                    stats.store_event(event)
                else:
                    stats.skip_event(event)
def bulk_add_entities(self, events):
    """Adds all applied ADD events to the storage

    :param events: list of gob events
    """
    insert_data = []
    progress = ProgressTicker("Bulk add entities", 10000)
    while events:
        progress.tick()
        event = events.pop(0)
        entity = event.get_attribute_dict()
        # Set the _last_event
        entity['_last_event'] = event.id
        insert_data.append(entity)
    table = self.DbEntity.__table__
    self.bulk_insert(table, insert_data)
class TestProgressTicker(TestCase):

    def setUp(self) -> None:
        self.ticker = ProgressTicker('TickerName', 15)

    def test_init(self):
        self.assertEqual(self.ticker._name, 'TickerName')
        self.assertEqual(self.ticker._report_interval, 15)
        self.assertEqual(self.ticker._count, 0)

    @patch("builtins.print")
    def test_enter(self, mock_print):
        res = self.ticker.__enter__()
        self.assertEqual(res, self.ticker, "__enter__ should return self")
        mock_print.assert_called_with("Start TickerName")

    @patch("builtins.print")
    def test_exit(self, mock_print):
        self.ticker._count = 18004
        self.ticker.__exit__()
        mock_print.assert_called_with("End TickerName - 18004")

    @patch("builtins.print")
    def test_tick(self, mock_print):
        self.ticker._report_interval = 3
        ticks = [(i, i % 3 == 0) for i in range(1, 20)]
        for cnt, do_print in ticks:
            self.ticker.tick()
            self.assertEqual(cnt, self.ticker._count)
            if do_print:
                mock_print.assert_called_with(f'TickerName - {cnt}')
                mock_print.reset_mock()
            else:
                mock_print.assert_not_called()
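For context, a minimal ProgressTicker sketch that satisfies the test above. It is reconstructed from the assertions (attribute names, print format, interval behaviour) and is not necessarily the project's actual implementation.

class ProgressTicker:
    """Minimal sketch reconstructed from TestProgressTicker; not the actual GOB implementation."""

    def __init__(self, name, report_interval):
        self._name = name
        self._report_interval = report_interval
        self._count = 0

    def __enter__(self):
        print(f"Start {self._name}")
        return self

    def __exit__(self, *args):
        print(f"End {self._name} - {self._count}")

    def tick(self):
        self._count += 1
        if self._count % self._report_interval == 0:
            print(f"{self._name} - {self._count}")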
def _process_compare_results(storage, model, results, stats):
    """Process the results of the in-database compare

    Creates the ADD, MODIFY, DELETE and CONFIRM events and writes them to two files:
    one for confirms and one for all other events

    :param results: the result rows from the database comparison
    :return: the filename of the events file and the filename of the confirms file
    """
    version = model['version']

    # Take two files: one for confirms and one for other events
    with ProgressTicker("Process compare result", 10000) as progress, \
            ContentsWriter() as contents_writer, \
            ContentsWriter() as confirms_writer, \
            EventCollector(contents_writer, confirms_writer, version) as event_collector:

        filename = contents_writer.filename
        confirms = confirms_writer.filename

        for row in results:
            progress.tick()
            # Get the data for this record and create the event
            entity = row["_original_value"]

            stats.compare(row)

            if row['type'] == 'ADD':
                entity["_last_event"] = row['_last_event']
                event = GOB.ADD.create_event(row['_tid'], entity, version)
            elif row['type'] == 'CONFIRM':
                data = {'_last_event': row['_last_event']}
                event = GOB.CONFIRM.create_event(row['_tid'], data, version)
            elif row['type'] == 'MODIFY':
                current_entity = storage.get_current_entity(entity)
                modifications = get_modifications(current_entity, entity, model['all_fields'])
                event = get_event_for(current_entity, entity, modifications, version)
            elif row['type'] == 'DELETE':
                data = {'_last_event': row['_last_event']}
                event = GOB.DELETE.create_event(row['_entity_tid'], data, version)
            else:
                continue

            event_collector.collect(event)

        return filename, confirms
def import_dataset(self):
    try:
        self.row = None

        with ContentsWriter() as writer, \
                ProgressTicker(f"Import {self.catalogue} {self.entity}", 10000) as progress:

            self.filename = writer.filename

            # DELETE: skip importing rows -> write an empty file,
            # which marks all entities as deleted
            if self.mode != ImportMode.DELETE:
                self.merger.prepare(progress)
                self.import_rows(writer.write, progress)
                self.merger.finish(writer.write)
                self.entity_validator.result()

    except Exception as e:
        # Print the error message, the message that caused the error and a short stacktrace
        stacktrace = traceback.format_exc(limit=-5)
        print(f"Import failed at row {self.n_rows}: {e}", stacktrace)
        # Log the error and a short error description
        self.logger.error(f'Import failed at row {self.n_rows}: {e}')
        self.logger.error(
            "Import has failed",
            {
                "data": {
                    "error": str(e),  # Include a short error description
                    "row number": self.n_rows,
                    self.source_id: "" if self.row is None else self.row[self.source_id],
                }
            })
    return self.get_result_msg()
def csv_exporter(api, file, format=None, append=False, filter: EntityFilter = None):
    """CSV Exporter

    Exports the output of the API to a ;-delimited csv file.

    Format is a dictionary which can have the following attributes:

    columns: A list of attributes which can be mapped 1-on-1 with the API output and csv column name

        Example: ['identificatie', 'code', 'naam']

    reference: Can be found in the _embedded block of the HAL JSON output. Reference will contain
               a dictionary of API attributes with information on how to map them to csv columns.

        Example:
            ligtInBuurt: {
                'ref': 'GBD.SDL',                       -- The abbreviations for this catalog and collection
                'ref_name': 'ligtIn',                   -- A description of the relation used in the csv column name
                'columns': ['identificatie', 'naam'],   -- The columns to be taken from this _embedded reference
            }

    mapping: A dictionary of mappings between API output and CSV columns. This is currently being
             used for the state endpoints as these aren't according to HAL JSON specs yet.

        Example: 'ligtIn:GBD.SDL.identificatie': 'gebieden:stadsdelenIdentificatie'

    :param filter:
    :param api: the API wrapper which can be iterated through
    :param file: the local file to write to
    :param format: format definition, see above for examples
    :param append: the file the result of this export will be appended to, or False
    :return:
    """
    row_count = 0
    mapping = build_mapping_from_format(format)
    fieldnames = [*mapping.keys()]

    if append:
        _ensure_fieldnames_match_existing_file(fieldnames, append)

    with open(file, 'a' if append else 'w', encoding='utf-8-sig') as fp, \
            ProgressTicker("Export entities", 10000) as progress:
        # Get the fieldnames from the mapping
        writer = csv.DictWriter(fp, fieldnames=fieldnames, delimiter=';')

        if not append:
            writer.writeheader()

        for entity in api:
            if filter and not filter.filter(entity):
                continue

            row = {}
            for attribute_name, lookup_key in mapping.items():
                row[attribute_name] = get_entity_value(entity, lookup_key)

            writer.writerow(row)
            row_count += 1
            progress.tick()

    return row_count
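A hypothetical format definition and call, combining the three attributes described in the docstring above; the catalog abbreviations, column names and file path are illustrative only.

example_format = {
    'columns': ['identificatie', 'code', 'naam'],
    'reference': {
        'ligtInBuurt': {
            'ref': 'GBD.SDL',
            'ref_name': 'ligtIn',
            'columns': ['identificatie', 'naam'],
        },
    },
    'mapping': {
        'ligtIn:GBD.SDL.identificatie': 'gebieden:stadsdelenIdentificatie',
    },
}

# api is any iterable API wrapper as described in the docstring.
row_count = csv_exporter(api, '/tmp/export.csv', format=example_format)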
def compare(msg):
    """Compare new data in msg (contents) with the current data

    :param msg: The new data, including header and summary
    :return: result message
    """
    logger.configure(msg, "COMPARE")
    header = msg.get('header', {})
    mode = header.get('mode', FULL_UPLOAD)
    logger.info(f"Compare (mode = {mode}) to GOB Database {GOBStorageHandler.user_name} started")

    # Parse the message header
    message = ImportMessage(msg)
    metadata = message.metadata

    # Get the model for the collection to be compared
    gob_model = GOBModel()
    entity_model = gob_model.get_collection(metadata.catalogue, metadata.entity)

    # Initialize a storage handler for the collection
    storage = GOBStorageHandler(metadata)
    model = f"{metadata.source} {metadata.catalogue} {metadata.entity}"
    logger.info(f"Compare {model}")

    stats = CompareStatistics()

    tmp_table_name = None
    with storage.get_session():
        with ProgressTicker("Collect compare events", 10000) as progress:
            # Check any dependencies
            if not meets_dependencies(storage, msg):
                return {
                    "header": msg["header"],
                    "summary": logger.get_summary(),
                    "contents": None
                }

            enricher = Enricher(storage, msg)
            populator = Populator(entity_model, msg)

            # If there are no records in the database all data are ADD events
            initial_add = not storage.has_any_entity()
            if initial_add:
                logger.info("Initial load of new collection detected")
                # Write ADD events directly, without using a temporary table
                contents_writer = ContentsWriter()
                contents_writer.open()
                # Pass a None confirms_writer because only ADD events are written
                collector = EventCollector(contents_writer, confirms_writer=None, version=entity_model['version'])
                collect = collector.collect_initial_add
            else:
                # Collect entities in a temporary table
                collector = EntityCollector(storage)
                collect = collector.collect
                tmp_table_name = collector.tmp_table_name

            for entity in msg["contents"]:
                progress.tick()
                stats.collect(entity)
                enricher.enrich(entity)
                populator.populate(entity)
                collect(entity)

            collector.close()

    if initial_add:
        filename = contents_writer.filename
        confirms = None
        contents_writer.close()
    else:
        # Compare entities from the temporary table
        with storage.get_session():
            diff = storage.compare_temporary_data(tmp_table_name, mode)
            filename, confirms = _process_compare_results(storage, entity_model, diff, stats)

    # Build result message
    results = stats.results()

    logger.info(f"Compare {model} completed", {'data': results})

    results.update(logger.get_summary())

    message = {
        "header": msg["header"],
        "summary": results,
        "contents_ref": filename,
        "confirms": confirms
    }

    return message
def esri_exporter(api, file, format=None, append=False, filter: EntityFilter = None):
    """ESRI Exporter

    This function transforms the output of an API to ESRI shape files. The result
    is 4 files (.shp, .dbf, .shx and .prj), which all contain some required data.

    It uses the python bindings to the GDAL library.

    :param api: The encapsulated API as an iterator
    :param file: The main file (.shp) to write to
    :param format: The mapping of the API output to ESRI fields as defined in the export config.
                   The max length of an esri fieldname is 10 characters.
    """
    if append:
        raise NotImplementedError("Appending not implemented for this exporter")

    row_count = 0
    driver = ogr.GetDriverByName("ESRI Shapefile")
    dstfile = driver.CreateDataSource(file)

    # Set spatialref to RD
    spatialref = osr.SpatialReference()
    spatialref.ImportFromEPSG(28992)

    geometry_field = format['geometrie'] if 'geometrie' in format else 'geometrie'

    with ProgressTicker("Export entities", 10000) as progress:
        # Get records from the API and build the esri file
        for entity in api:
            if filter and not filter.filter(entity):
                continue

            entity_geometry = get_entity_value(entity, geometry_field)

            # On the first entity determine the type of shapefile we need to export
            if row_count == 0:
                # Please note that it will fail if a file with the same name already exists
                geometry_type = _get_geometry_type(entity_geometry)

                # Auto-reduce field sizes, encode data to utf-8
                # see https://gdal.org/drivers/vector/shapefile.html#layer-creation-options
                dstlayer = dstfile.CreateLayer(
                    "layer",
                    spatialref,
                    geom_type=geometry_type,
                    options=['RESIZE=YES', f'ENCODING={ENCODING}'])

                # Add all field definitions, but skip geometrie
                all_fields = {k: v for k, v in format.items() if k != geometry_field}
                add_field_definitions(dstlayer, all_fields.keys())

            feature = ogr.Feature(dstlayer.GetLayerDefn())
            if entity_geometry:
                feature.SetGeometry(create_geometry(entity_geometry))

            for attribute_name, source in all_fields.items():
                mapping = split_field_reference(source)
                value = get_entity_value(entity, mapping)

                # Esri expects an empty string when value is None
                value = '' if value is None else value
                feature.SetField(attribute_name, value)

            dstlayer.CreateFeature(feature)

            feature.Destroy()
            row_count += 1
            progress.tick()

    # When no rows are returned no layer has been made, so create it afterwards to make sure the files exist
    if row_count == 0:
        dstlayer = dstfile.CreateLayer("layer", spatialref, geom_type=ogr.wkbPolygon)

    dstfile.Destroy()
    _create_cpg(file)

    return row_count
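A hypothetical format mapping for this exporter, based on the docstring and the code above: keys become the ESRI field names (max 10 characters), values reference attributes in the API output, and the 'geometrie' key selects the geometry source field. Field names and the file path are illustrative only.

esri_format = {
    'geometrie': 'geometrie',   # geometry source field
    'id': 'identificatie',      # ESRI field name (max 10 chars) -> API attribute
    'naam': 'naam',
}

# api is any iterable API wrapper; the .dbf, .shx and .prj files are created alongside the .shp.
row_count = esri_exporter(api, '/tmp/export.shp', format=esri_format)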