Example #1
def apply_confirm_events(storage, stats, msg):
    """
    Apply confirm events (if present)

    (BULK)CONFIRM events can be passed in a file.
    The name of the file is msg['confirms'].

    :param storage:
    :param stats:
    :param msg:
    :return:
    """
    confirms = msg.get('confirms')
    # SKIP confirms for relations
    catalogue = msg['header'].get('catalogue', "")
    if confirms and catalogue != 'rel':
        reader = ContentsReader(confirms)
        with ProgressTicker("Apply CONFIRM events", 10000) as progress:
            for event in reader.items():
                progress.tick()
                action = event['event']
                assert action in ['CONFIRM', 'BULKCONFIRM']
                # get confirm data: BULKCONFIRM => data.confirms, CONFIRM => [data]
                confirm_data = event['data'].get('confirms', [event['data']])
                storage.apply_confirms(confirm_data,
                                       msg['header']['timestamp'])
                stats.add_applied('CONFIRM', len(confirm_data))
        reader.close()
    if confirms:
        # Remove file after it has been handled (or skipped)
        os.remove(confirms)
        del msg['confirms']
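For illustration, the two event shapes handled above could look as follows. This is a sketch inferred only from the lookups in the code (event['event'], event['data'], data['confirms']); the fields inside the confirm data are hypothetical.

# Hypothetical items as yielded by ContentsReader(confirms).items()
confirm_event = {
    'event': 'CONFIRM',
    'data': {'_tid': '100', '_last_event': 12345},        # assumed payload fields
}
bulkconfirm_event = {
    'event': 'BULKCONFIRM',
    'data': {
        'confirms': [
            {'_tid': '101', '_last_event': 12346},         # assumed payload fields
            {'_tid': '102', '_last_event': 12347},
        ]
    },
}
# CONFIRM     -> confirm_data == [event['data']]
# BULKCONFIRM -> confirm_data == event['data']['confirms']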
Example #2
def apply_events(storage, last_events, start_after, stats):
    """Apply any unhandled events to the database

    :param storage: GOB (events + entities)
    :param start_after: the id of the last event that has been applied to the storage
    :param stats: update statistics for this action
    :return:
    """
    with ActiveGarbageCollection(
            "Apply events"), storage.get_session() as session:
        logger.info("Apply events")

        PROCESS_PER = 10000
        add_event_tids = set()
        with ProgressTicker("Apply events", PROCESS_PER) as progress:
            unhandled_events = storage.get_events_starting_after(
                start_after, PROCESS_PER)
            while unhandled_events:
                with EventApplicator(storage) as event_applicator:
                    for event in unhandled_events:
                        progress.tick()

                        gob_event, count, applied_events = event_applicator.apply(
                            event, last_events, add_event_tids)
                        action = gob_event.action
                        stats.add_applied(action, count)
                        start_after = event.eventid

                        # Remove event from session, to avoid trying to update event db object
                        session.expunge(event)

                    event_applicator.apply_all()

                unhandled_events = storage.get_events_starting_after(
                    start_after, PROCESS_PER)
Example #3
def ndjson_exporter(api,
                    file,
                    format=None,
                    append=False,
                    filter: EntityFilter = None):
    """
    Exports entities in Newline Delimited JSON format

    :param api: API reader instance
    :param file: name of the file to write results
    :param format: NA
    :param append: NA
    :return: number of rows exported
    """
    if append:
        raise NotImplementedError(
            "Appending not implemented for this exporter")

    row_count = 0
    with open(file, 'w') as fp, ProgressTicker("Export entities",
                                               10000) as progress:
        for entity in api:
            if filter and not filter.filter(entity):
                continue

            result = json.dumps(entity)

            fp.write(result + '\n')

            row_count += 1
            progress.tick()

    return row_count
Example #4
def _store_events(storage, last_events, events, stats):
    """Store events in GOB

    Only valid events are stored, other events are skipped (with an associated warning)
    The events are added in bulk in the database

    :param storage: GOB (events + entities)
    :param events: the events to process
    :param stats: update statistics for this action
    :return:
    """
    with ActiveGarbageCollection("Store events"), storage.get_session():
        # Use a session to commit all or rollback on any error
        logger.info("Store events")

        with ProgressTicker("Store events", 10000) as progress, \
                EventCollector(storage, last_events) as event_collector:

            for event in events:
                progress.tick()

                if event_collector.collect(event):
                    stats.store_event(event)
                else:
                    stats.skip_event(event)
Example #5
    def bulk_add_entities(self, events):
        """Adds all applied ADD events to the storage

        :param events: list of gob events
        """
        insert_data = []
        progress = ProgressTicker("Bulk add entities", 10000)
        while events:
            progress.tick()

            event = events.pop(0)
            entity = event.get_attribute_dict()
            # Set the _last_event
            entity['_last_event'] = event.id
            insert_data.append(entity)
        table = self.DbEntity.__table__
        self.bulk_insert(table, insert_data)
Example #6
class TestProgressTicker(TestCase):
    def setUp(self) -> None:
        self.ticker = ProgressTicker('TickerName', 15)

    def test_init(self):
        self.assertEqual(self.ticker._name, 'TickerName')
        self.assertEqual(self.ticker._report_interval, 15)
        self.assertEqual(self.ticker._count, 0)

    @patch("builtins.print")
    def test_enter(self, mock_print):
        res = self.ticker.__enter__()

        self.assertEqual(res, self.ticker, "__enter__ should return self")
        mock_print.assert_called_with("Start TickerName")

    @patch("builtins.print")
    def test_exit(self, mock_print):
        self.ticker._count = 18004
        self.ticker.__exit__()
        mock_print.assert_called_with("End TickerName - 18004")

    @patch("builtins.print")
    def test_tick(self, mock_print):
        self.ticker._report_interval = 3

        ticks = [(i, i % 3 == 0) for i in range(1, 20)]

        for cnt, do_print in ticks:
            self.ticker.tick()
            self.assertEqual(cnt, self.ticker._count)

            if do_print:
                mock_print.assert_called_with(f'TickerName - {cnt}')
                mock_print.reset_mock()
            else:
                mock_print.assert_not_called()
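The tests above pin down the ProgressTicker behaviour that every example in this collection relies on. A minimal sketch that would satisfy these tests (an illustration inferred from the assertions, not the actual GOB implementation):

class ProgressTicker:
    """Minimal illustrative ProgressTicker, reconstructed from the tests above."""

    def __init__(self, name, report_interval):
        self._name = name
        self._report_interval = report_interval
        self._count = 0

    def __enter__(self):
        print(f"Start {self._name}")
        return self

    def __exit__(self, *args):
        print(f"End {self._name} - {self._count}")

    def tick(self):
        self._count += 1
        # Report progress every _report_interval ticks
        if self._count % self._report_interval == 0:
            print(f"{self._name} - {self._count}")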
Example #7
def _process_compare_results(storage, model, results, stats):
    """Process the results of the in database compare

    Creates the ADD, MODIFY, DELETE and CONFIRM events and writes them to a contents file
    and a separate confirms file

    :param results: the result rows from the database comparison
    :return: filename of the contents (events) file, filename of the confirms file
    """
    version = model['version']
    # Take two files: one for confirms and one for other events
    with ProgressTicker("Process compare result", 10000) as progress, \
            ContentsWriter() as contents_writer, \
            ContentsWriter() as confirms_writer, \
            EventCollector(contents_writer, confirms_writer, version) as event_collector:

        filename = contents_writer.filename
        confirms = confirms_writer.filename

        for row in results:
            progress.tick()
            # Get the data for this record and create the event
            entity = row["_original_value"]

            stats.compare(row)

            if row['type'] == 'ADD':
                entity["_last_event"] = row['_last_event']
                event = GOB.ADD.create_event(row['_tid'], entity, version)
            elif row['type'] == 'CONFIRM':
                data = {'_last_event': row['_last_event']}
                event = GOB.CONFIRM.create_event(row['_tid'], data, version)
            elif row['type'] == 'MODIFY':
                current_entity = storage.get_current_entity(entity)
                modifications = get_modifications(current_entity, entity,
                                                  model['all_fields'])
                event = get_event_for(current_entity, entity, modifications,
                                      version)
            elif row['type'] == 'DELETE':
                data = {'_last_event': row['_last_event']}
                event = GOB.DELETE.create_event(row['_entity_tid'], data,
                                                version)
            else:
                continue

            event_collector.collect(event)

    return filename, confirms
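For reference, the row shapes the loop above distinguishes might look like this. Only the fields that the function actually reads are shown; the values are made up for illustration.

# Hypothetical compare-result rows, one per event type
add_row     = {'type': 'ADD',     '_tid': '100', '_last_event': None,
               '_original_value': {'identificatie': '100', 'naam': 'Nieuw'}}
confirm_row = {'type': 'CONFIRM', '_tid': '101', '_last_event': 12345,
               '_original_value': {'identificatie': '101'}}
delete_row  = {'type': 'DELETE',  '_entity_tid': '102', '_last_event': 12346,
               '_original_value': {}}
# MODIFY rows carry the new data in _original_value; the current entity is
# fetched from storage and the differences are turned into an event.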
Example #8
    def import_dataset(self):
        try:
            self.row = None

            with ContentsWriter() as writer, \
                    ProgressTicker(f"Import {self.catalogue} {self.entity}", 10000) as progress:

                self.filename = writer.filename

                # DELETE: Skip import rows -> write empty file
                # mark all entities as deleted
                if self.mode != ImportMode.DELETE:
                    self.merger.prepare(progress)
                    self.import_rows(writer.write, progress)
                    self.merger.finish(writer.write)
                    self.entity_validator.result()

        except Exception as e:
            # Print error message, the message that caused the error and a short stacktrace
            stacktrace = traceback.format_exc(limit=-5)
            print(f"Import failed at row {self.n_rows}: {e}", stacktrace)
            # Log the error and a short error description
            self.logger.error(f'Import failed at row {self.n_rows}: {e}')
            self.logger.error(
                "Import has failed",
                {
                    "data": {
                        "error": str(e),  # Include a short error description
                        "row number": self.n_rows,
                        self.source_id: "" if self.row is None else self.row[self.source_id],
                    }
                })

        return self.get_result_msg()
Example #9
def csv_exporter(api,
                 file,
                 format=None,
                 append=False,
                 filter: EntityFilter = None):
    """CSV Exporter

    Exports the output of the API to a ; delimited csv file.

    Format is a dictionary which can have the following attributes:

    columns: A list of attributes which can be mapped 1-on-1 with the API output and csv column name

        Example: ['identificatie', 'code', 'naam']

    reference: Can be found in the _embedded block of the HAL JSON output. Reference will contain a
               dictionary of API attributes with information on how to map them to csv columns.

        Example:
            ligtInBuurt: {
                'ref': 'GBD.SDL',   -- The abbreviations for this catalog and collection
                'ref_name': 'ligtIn',  -- A description of the relation used in the csv column name
                'columns': ['identificatie', 'naam'],  -- The columns to be taken from this _embedded reference
            }

    mapping: A dictionary of mappings between API output and CSV columns. This is currently being used for the
             state endpoints as these aren't according to HAL JSON specs yet.

        Example: 'ligtIn:GBD.SDL.identificatie': 'gebieden:stadsdelenIdentificatie',



    :param filter:
    :param api: the API wrapper which can be iterated through
    :param file: the local file to write to
    :param format: format definition, see above for examples
    :param append: the file the result of this export will be appended to, or False
    :return:
    """
    row_count = 0

    mapping = build_mapping_from_format(format)
    fieldnames = [*mapping.keys()]

    if append:
        _ensure_fieldnames_match_existing_file(fieldnames, append)

    with open(file, 'a' if append else 'w', encoding='utf-8-sig') as fp, \
            ProgressTicker("Export entities", 10000) as progress:
        # Get the fieldnames from the mapping
        writer = csv.DictWriter(fp, fieldnames=fieldnames, delimiter=';')

        if not append:
            writer.writeheader()

        for entity in api:
            if filter and not filter.filter(entity):
                continue

            row = {}
            for attribute_name, lookup_key in mapping.items():
                row[attribute_name] = get_entity_value(entity, lookup_key)

            writer.writerow(row)
            row_count += 1
            progress.tick()

    return row_count
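Pulling the docstring fragments together, a complete format definition and call could look like this. The structure follows the attributes described above; the concrete attribute names and file name are illustrative.

# Illustrative format definition for csv_exporter
format = {
    'columns': ['identificatie', 'code', 'naam'],
    'reference': {
        'ligtInBuurt': {
            'ref': 'GBD.SDL',        # abbreviations for the catalog and collection
            'ref_name': 'ligtIn',    # used in the csv column name
            'columns': ['identificatie', 'naam'],
        }
    },
    'mapping': {
        'ligtIn:GBD.SDL.identificatie': 'gebieden:stadsdelenIdentificatie',
    },
}

# api is an API reader that can be iterated, as in the examples above
row_count = csv_exporter(api, 'gebieden_buurten.csv', format=format)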
Example #10
 def setUp(self) -> None:
     self.ticker = ProgressTicker('TickerName', 15)
Example #11
def compare(msg):
    """Compare new data in msg (contents) with the current data

    :param msg: The new data, including header and summary
    :return: result message
    """
    logger.configure(msg, "COMPARE")
    header = msg.get('header', {})
    mode = header.get('mode', FULL_UPLOAD)
    logger.info(
        f"Compare (mode = {mode}) to GOB Database {GOBStorageHandler.user_name} started"
    )

    # Parse the message header
    message = ImportMessage(msg)
    metadata = message.metadata

    # Get the model for the collection to be compared
    gob_model = GOBModel()
    entity_model = gob_model.get_collection(metadata.catalogue,
                                            metadata.entity)

    # Initialize a storage handler for the collection
    storage = GOBStorageHandler(metadata)
    model = f"{metadata.source} {metadata.catalogue} {metadata.entity}"
    logger.info(f"Compare {model}")

    stats = CompareStatistics()

    tmp_table_name = None
    with storage.get_session():
        with ProgressTicker("Collect compare events", 10000) as progress:
            # Check any dependencies
            if not meets_dependencies(storage, msg):
                return {
                    "header": msg["header"],
                    "summary": logger.get_summary(),
                    "contents": None
                }

            enricher = Enricher(storage, msg)
            populator = Populator(entity_model, msg)

            # If there are no records in the database, all data are ADD events
            initial_add = not storage.has_any_entity()
            if initial_add:
                logger.info("Initial load of new collection detected")
                # Write ADD events directly, without using a temporary table
                contents_writer = ContentsWriter()
                contents_writer.open()
                # Pass a None confirms_writer because only ADD events are written
                collector = EventCollector(contents_writer,
                                           confirms_writer=None,
                                           version=entity_model['version'])
                collect = collector.collect_initial_add
            else:
                # Collect entities in a temporary table
                collector = EntityCollector(storage)
                collect = collector.collect
                tmp_table_name = collector.tmp_table_name

            for entity in msg["contents"]:
                progress.tick()
                stats.collect(entity)
                enricher.enrich(entity)
                populator.populate(entity)
                collect(entity)

            collector.close()

    if initial_add:
        filename = contents_writer.filename
        confirms = None
        contents_writer.close()
    else:
        # Compare entities from temporary table
        with storage.get_session():
            diff = storage.compare_temporary_data(tmp_table_name, mode)
            filename, confirms = _process_compare_results(
                storage, entity_model, diff, stats)

    # Build result message
    results = stats.results()

    logger.info(f"Compare {model} completed", {'data': results})

    results.update(logger.get_summary())

    message = {
        "header": msg["header"],
        "summary": results,
        "contents_ref": filename,
        "confirms": confirms
    }

    return message
Example #12
def esri_exporter(api,
                  file,
                  format=None,
                  append=False,
                  filter: EntityFilter = None):
    """ESRI Exporter

    This function will transform the output of an API to ESRI shape files. The
    result will be 4 files (.shp, .dbf, .shx and .prj), which all contain some
    required data.

    It uses the Python bindings to the GDAL library.

    :param api: The encapsulated API as an iterator
    :param file: The main file (.shp) to write to
    :param format: The mapping of the API output to ESRI fields as defined in the
    export config. The max length of an esri fieldname is 10 characters.
    """
    if append:
        raise NotImplementedError(
            "Appending not implemented for this exporter")

    row_count = 0
    driver = ogr.GetDriverByName("ESRI Shapefile")
    dstfile = driver.CreateDataSource(file)

    # Set spatialref to RD
    spatialref = osr.SpatialReference()
    spatialref.ImportFromEPSG(28992)

    geometry_field = format.get('geometrie', 'geometrie')

    with ProgressTicker("Export entities", 10000) as progress:
        # Get records from the API and build the esri file
        for entity in api:
            if filter and not filter.filter(entity):
                continue

            entity_geometry = get_entity_value(entity, geometry_field)

            # On the first entity determine the type of shapefile we need to export
            if row_count == 0:
                # Please note that it will fail if a file with the same name already exists
                geometry_type = _get_geometry_type(entity_geometry)

                # Auto-reduce field sizes, encode data to utf-8
                # see https://gdal.org/drivers/vector/shapefile.html#layer-creation-options
                dstlayer = dstfile.CreateLayer(
                    "layer",
                    spatialref,
                    geom_type=geometry_type,
                    options=['RESIZE=YES', f'ENCODING={ENCODING}'])

                # Add all field definitions, but skip geometrie
                all_fields = {
                    k: v
                    for k, v in format.items() if k != geometry_field
                }
                add_field_definitions(dstlayer, all_fields.keys())

            feature = ogr.Feature(dstlayer.GetLayerDefn())
            if entity_geometry:
                feature.SetGeometry(create_geometry(entity_geometry))

            for attribute_name, source in all_fields.items():
                mapping = split_field_reference(source)
                value = get_entity_value(entity, mapping)

                # Esri expects an empty string when value is None
                value = '' if value is None else value

                feature.SetField(attribute_name, value)

            dstlayer.CreateFeature(feature)

            feature.Destroy()
            row_count += 1
            progress.tick()

    # When no rows are returned no layer has been made, so create it afterwards to make sure files exist
    if row_count == 0:
        dstlayer = dstfile.CreateLayer("layer", spatialref, geom_type=ogr.wkbPolygon)

    dstfile.Destroy()
    _create_cpg(file)

    return row_count
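For the ESRI exporter, the format argument is read as a flat mapping from ESRI field names to references into the API output. A hedged illustration (the attribute names are made up; ESRI field names may be at most 10 characters):

# Illustrative format for esri_exporter: keys become ESRI field names,
# values are looked up in the API output via get_entity_value
format = {
    'geometrie': 'geometrie',
    'id': 'identificatie',
    'naam': 'naam',
}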