Example #1
    # mock_open is supplied by a @mock.patch on open() (decorator not shown
    # in this excerpt), so the Injector reads the injections below instead
    # of a real file.
    def test_injections_with_data(self, mock_open):
        injections = [{"id": 1, "field1": "aap", "field2": 10}]
        mock_open.side_effect = [
            mock.mock_open(read_data=json.dumps(injections)).return_value,
        ]
        inject_spec = {
            "from": "anyfile",
            "on": "id",
            "conversions": {
                "field1": "=",
                "field2": "+"
            }
        }
        data = [{
            "id": 0,
            "field1": "0",
            "field2": 0,
            "field3": "x"
        }, {
            "id": 1,
            "field1": "0",
            "field2": 0,
            "field3": "x"
        }, {
            "id": 2,
            "field1": "0",
            "field2": 0,
            "field3": "x"
        }, {
            "id": 1,
            "field1": "0",
            "field2": 2,
            "field3": "x"
        }]
        expect = [{
            "id": 0,
            "field1": "0",
            "field2": 0,
            "field3": "x"
        }, {
            "id": 1,
            "field1": "aap",
            "field2": 10,
            "field3": "x"
        }, {
            "id": 2,
            "field1": "0",
            "field2": 0,
            "field3": "x"
        }, {
            "id": 1,
            "field1": "aap",
            "field2": 12,
            "field3": "x"
        }]
        injector = Injector(inject_spec)
        for row in data:
            injector.inject(row)
        self.assertEqual(data, expect)
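
Taken together, Examples #1, #2, #3 and #5 below pin down the injection semantics: the inject file is a JSON list of rows keyed by the "on" field, "=" replaces a field's value, "+" adds to it, and a missing or empty spec makes inject a no-op. The sketch below satisfies these tests; it is an illustration under those assumptions, not the actual gobimport implementation.

import json


class Injector:

    def __init__(self, inject_spec):
        self.injections = {}
        if inject_spec:
            self.on = inject_spec["on"]
            self.conversions = inject_spec["conversions"]
            with open(inject_spec["from"]) as file:
                # Index the injection rows by the value of the "on" field
                self.injections = {inj[self.on]: inj for inj in json.load(file)}

    def _apply(self, row, key, conversion, value):
        if conversion == "=":
            row[key] = value   # replace the current value
        elif conversion == "+":
            row[key] += value  # add to the current value

    def inject(self, row):
        if not self.injections:
            return  # no spec, or an empty injections file: nothing to do
        injection = self.injections.get(row[self.on])
        if injection:
            for key, conversion in self.conversions.items():
                self._apply(row, key, conversion, injection[key])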
Example #2
    def test_empty_injections(self, mock_open):
        injections = []
        mock_open.side_effect = [
            mock.mock_open(read_data=json.dumps(injections)).return_value,
        ]
        inject_spec = {
            "from": "anyfile",
            "on": "id",
            "conversions": {
                "field1": "=",
                "field2": "+"
            }
        }
        data = []
        injector = Injector(inject_spec)
        for row in data:
            injector.inject(row)
        self.assertEqual(data, [])
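
With an empty injections file (and empty input data) this test mainly asserts that constructing an Injector from a spec whose file contains no rows is harmless; the loop body never runs.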
Example #3
    def test_apply_injection(self):
        row = {}
        injector = Injector(None)
        injector._apply(row, "key", "=", "aap")
        self.assertEqual(row, {"key": "aap"})

        row = {"key": 1}
        injector._apply(row, "key", "+", 1)
        self.assertEqual(row, {"key": 2})
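
This exercises _apply directly: "=" overwrites the target field, while "+" performs in-place addition, so a "+" conversion presumes the field already holds an addable value (here an int). Note that _apply works even on an Injector constructed with None, since it only touches its arguments.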
Example #4
    def init_dataset(self, dataset):
        self.dataset = dataset
        self.source = self.dataset['source']
        self.source_id = self.dataset['source']['entity_id']
        self.source_app = self.dataset['source'].get(
            'application', self.dataset['source']['name'])
        self.catalogue = self.dataset['catalogue']
        self.entity = self.dataset['entity']

        # Find the functional source id
        # This is the functional field that is mapped onto the source_id
        # or _source_id if no mapping exists
        ids = [
            key for key, value in self.dataset["gob_mapping"].items()
            if value["source_mapping"] == self.source_id
        ]
        self.func_source_id = ids[0] if ids else "_source_id"

        self.injector = Injector(self.source.get("inject"))
        self.enricher = BaseEnricher(self.source_app, self.catalogue,
                                     self.entity)
        self.validator = Validator(self.source_app, self.catalogue,
                                   self.entity, self.dataset)
        self.converter = Converter(self.catalogue, self.entity, self.dataset)
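
The functional source id lookup in the comment above is easiest to see with a concrete mapping. A worked example with hypothetical field and column names:

dataset = {
    "source": {"name": "example_source", "entity_id": "id_column"},
    "gob_mapping": {
        # "identificatie" is mapped onto the configured entity_id ...
        "identificatie": {"source_mapping": "id_column"},
        "name": {"source_mapping": "name_column"},
    },
}

source_id = dataset["source"]["entity_id"]
ids = [
    key for key, value in dataset["gob_mapping"].items()
    if value["source_mapping"] == source_id
]
# ... so it becomes the functional source id
assert (ids[0] if ids else "_source_id") == "identificatie"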
Example #5
    def test_no_injections(self):
        data = []
        injector = Injector(None)
        injector.inject(data)

        self.assertEqual(data, [])
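
Note that inject is called here with the (empty) list itself rather than with a single row; with a None spec the call is a no-op either way, which is exactly what the test asserts.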
Example #6
class ImportClient:
    """Main class for an import client

    This class serves as the main client for which the import can be configured in a dataset.json

    """

    n_rows = 0

    def __init__(self,
                 dataset,
                 msg,
                 logger,
                 mode: ImportMode = ImportMode.FULL):
        self.mode = mode
        self.logger = logger

        self.init_dataset(dataset)

        self.entity_validator = EntityValidator(self.catalogue, self.entity,
                                                self.func_source_id)
        self.merger = Merger(self)

        self.header = msg.get('header', {})
        self.logger.info(
            f"Import dataset {self.entity} from {self.source_app} (mode = {self.mode.name}) started"
        )

    def init_dataset(self, dataset):
        self.dataset = dataset
        self.source = self.dataset['source']
        self.source_id = self.dataset['source']['entity_id']
        self.source_app = self.dataset['source'].get(
            'application', self.dataset['source']['name'])
        self.catalogue = self.dataset['catalogue']
        self.entity = self.dataset['entity']

        # Find the functional source id
        # This is the functional field that is mapped onto the source_id
        # or _source_id if no mapping exists
        ids = [
            key for key, value in self.dataset["gob_mapping"].items()
            if value["source_mapping"] == self.source_id
        ]
        self.func_source_id = ids[0] if ids else "_source_id"

        self.injector = Injector(self.source.get("inject"))
        self.enricher = BaseEnricher(self.source_app, self.catalogue,
                                     self.entity)
        self.validator = Validator(self.source_app, self.catalogue,
                                   self.entity, self.dataset)
        self.converter = Converter(self.catalogue, self.entity, self.dataset)

    def get_result_msg(self):
        """The result of the import needs to be published.

        Publication includes a header, summary and results
        The header is for identification purposes
        The summary is for the interpretation of the results. Was the import successful, what er the metrics, etc
        The results is the imported data in GOB format

        :return:
        """
        header = {
            **self.header,
            "depends_on": self.dataset['source'].get('depends_on', {}),
            "enrich": self.dataset['source'].get('enrich', {}),
            "version": self.dataset['version'],
            "timestamp": datetime.datetime.utcnow().isoformat()
        }

        summary = {'num_records': self.n_rows}

        log_msg = f"Import dataset {self.entity} from {self.source_app} completed. "
        if self.mode == ImportMode.DELETE:
            log_msg += "0 records imported, all known entities will be marked as deleted."
        else:
            log_msg += f"{summary['num_records']} records were read from the source."

        # Log end of import process
        self.logger.info(log_msg, kwargs={"data": summary})

        summary.update(self.logger.get_summary())

        import_message = {
            "header": header,
            "summary": summary,
            "contents_ref": self.filename
        }

        return import_message

    def import_rows(self, write, progress):
        self.logger.info(f"Connect to {self.source_app}")
        reader = Reader(self.source, self.source_app, self.dataset, self.mode)
        reader.connect()

        self.logger.info(f"Start import from {self.source_app}")
        self.n_rows = 0
        for row in reader.read():
            progress.tick()

            self.row = row
            self.n_rows += 1

            self.injector.inject(row)

            self.enricher.enrich(row)

            self.merger.merge(row, write)

            entity = self.converter.convert(row)

            # The validator and entity_validator build up sets of primary keys
            # from the dataset, which increases memory consumption
            self.validator.validate(entity)
            self.entity_validator.validate(entity)

            write(entity)

        self.validator.result()

        self.logger.info(
            f"{self.n_rows} records have been imported from {self.source_app}")

        min_rows = self.dataset.get("min_rows", 1)
        if self.mode == ImportMode.FULL and self.n_rows < min_rows:
            # Default requirement for full imports is a non-empty dataset
            self.logger.error(
                f"Too few records imported: {self.n_rows} < {min_rows}")

    def import_dataset(self):
        try:
            self.row = None

            with ContentsWriter() as writer, \
                    ProgressTicker(f"Import {self.catalogue} {self.entity}", 10000) as progress:

                self.filename = writer.filename

                # DELETE mode: skip importing rows and write an empty file;
                # all known entities will be marked as deleted
                if self.mode != ImportMode.DELETE:
                    self.merger.prepare(progress)
                    self.import_rows(writer.write, progress)
                    self.merger.finish(writer.write)
                    self.entity_validator.result()

        except Exception as e:
            # Print error message, the message that caused the error and a short stacktrace
            stacktrace = traceback.format_exc(limit=-5)
            print(f"Import failed at row {self.n_rows}: {e}", stacktrace)
            # Log the error and a short error description
            self.logger.error(f'Import failed at row {self.n_rows}: {e}')
            self.logger.error(
                "Import has failed",
                {
                    "data": {
                        "error": str(e),  # a short error description
                        "row number": self.n_rows,
                        self.source_id: "" if self.row is None else self.row[self.source_id],
                    }
                })

        return self.get_result_msg()
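
The heart of ImportClient is the per-row pipeline in import_rows. A condensed, hypothetical restatement of just the ordering, using the collaborators constructed in __init__ and init_dataset:

def process_row(row, client, write):
    # Order matters: inject patches raw source values before enrichment,
    # merge may combine or emit rows before conversion, and both validators
    # run on the converted GOB entity before it is written.
    client.injector.inject(row)
    client.enricher.enrich(row)
    client.merger.merge(row, write)
    entity = client.converter.convert(row)
    client.validator.validate(entity)
    client.entity_validator.validate(entity)
    write(entity)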