Example #1
    def test_invalid_data(self):
        validator = Validator('source_app', 'meetbouten', 'meetbouten', self.mock_input_spec)
        for entity in self.invalid_meetbouten:
            validator.validate(entity)

        # Make sure the status code has been listed as invalid
        self.assertEqual(validator.collection_qa['num_invalid_status.code'], 1)
Example #2
    def test_nullpubliceerbaar(self):
        validator = Validator('source_app', 'meetbouten', 'meetbouten', self.mock_input_spec)

        for entity in self.nullpubliceerbaar_meetbouten:
            validator.validate(entity)

        # Make sure a null publiceerbaar has not been counted as invalid
        self.assertEqual(validator.collection_qa['num_invalid_publiceerbaar'], 0)
Example #3
    def test_missing_fatal_data(self):
        # Remove the 'publiceerbaar' attribute from the first entity
        self.valid_meetbouten[0].pop('publiceerbaar')
        validator = Validator('source_app', 'meetbouten', 'meetbouten', self.mock_input_spec)

        for entity in self.valid_meetbouten:
            validator.validate(entity)

        # Make sure the missing publiceerbaar has been listed as invalid
        self.assertEqual(validator.collection_qa['num_invalid_publiceerbaar'], 1)   # Warning
Example #4
    def test_duplicate_primary_key(self):
        self.valid_meetbouten.append(self.valid_meetbouten[0])
        validator = Validator('source_app', 'meetbouten', 'meetbouten', self.mock_input_spec)

        with self.assertRaises(GOBException):
            for entity in self.valid_meetbouten:
                validator.validate(entity)
            validator.result()
Example #5
    def init_dataset(self, dataset):
        self.dataset = dataset
        self.source = self.dataset['source']
        self.source_id = self.dataset['source']['entity_id']
        self.source_app = self.dataset['source'].get(
            'application', self.dataset['source']['name'])
        self.catalogue = self.dataset['catalogue']
        self.entity = self.dataset['entity']

        # Find the functional source id
        # This is the functional field that is mapped onto the source_id
        # or _source_id if no mapping exists
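        # Illustrative example (assumed values, not taken from a real dataset):
        #   with source_id == 'id_kolom' and
        #   gob_mapping == {'identificatie': {'source_mapping': 'id_kolom'}},
        #   func_source_id resolves to 'identificatie'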
        ids = [
            key for key, value in self.dataset["gob_mapping"].items()
            if value["source_mapping"] == self.source_id
        ]
        self.func_source_id = ids[0] if ids else "_source_id"

        self.injector = Injector(self.source.get("inject"))
        self.enricher = BaseEnricher(self.source_app, self.catalogue,
                                     self.entity)
        self.validator = Validator(self.source_app, self.catalogue,
                                   self.entity, self.dataset)
        self.converter = Converter(self.catalogue, self.entity, self.dataset)
Example #6
    def test_fatal_value(self):
        validator = Validator('source_app', 'meetbouten', 'meetbouten', self.mock_input_spec)

        with self.assertRaises(GOBException):
            for entity in self.fatal_meetbouten:
                print(entity)
                validator.validate(entity)
            validator.result()

        # Make sure the identificatie has been listed as invalid
        self.assertEqual(validator.collection_qa['num_invalid_identificatie'], 1)
Example #7
    def test_valid_primary_key_with_states(self):
        mock_input_spec = {
            'catalogue': 'bag',
            'entity': 'woonplaatsen',
            'source': {
                'entity_id': 'identificatie'
            }
        }

        valid_entity = fixtures.get_valid_entity_with_state()
        validator = Validator('source_app', 'bag', 'woonplaatsen', mock_input_spec)

        for entity in valid_entity:
            validator.validate(entity)
        validator.result()
Example #8
    def test_valid_primary_key_with_states_other_seqnr(self):
        mock_input_spec = {
            'catalogue': 'bag',
            'entity': 'woonplaatsen',
            'source': {
                'entity_id': 'identificatie'
            },
            'gob_mapping': {
                'volgnummer': {
                    'source_mapping': 'nummervolg'
                }
            }
        }

        valid_entity = [{'_source_id': '1234.1', 'identificatie': '1234', 'nummervolg': '1'}]
        validator = Validator('source_app', 'bag', 'woonplaatsen', mock_input_spec)

        for entity in valid_entity:
            validator.validate(entity)
        validator.result()
Example #9
    def test_valid_data(self):
        validator = Validator('source_app', 'meetbouten', 'meetbouten', self.mock_input_spec)
        for entity in self.valid_meetbouten:
            validator.validate(entity)
Example #10
class ImportClient:
    """Main class for an import client

    This class serves as the main client for which the import can be configured in a dataset.json

    """

    n_rows = 0

    def __init__(self,
                 dataset,
                 msg,
                 logger,
                 mode: ImportMode = ImportMode.FULL):
        self.mode = mode
        self.logger = logger

        self.init_dataset(dataset)

        self.entity_validator = EntityValidator(self.catalogue, self.entity,
                                                self.func_source_id)
        self.merger = Merger(self)

        self.header = msg.get('header', {})
        self.logger.info(
            f"Import dataset {self.entity} from {self.source_app} (mode = {self.mode.name}) started"
        )

    def init_dataset(self, dataset):
        self.dataset = dataset
        self.source = self.dataset['source']
        self.source_id = self.dataset['source']['entity_id']
        self.source_app = self.dataset['source'].get(
            'application', self.dataset['source']['name'])
        self.catalogue = self.dataset['catalogue']
        self.entity = self.dataset['entity']

        # Find the functional source id
        # This is the functional field that is mapped onto the source_id
        # or _source_id if no mapping exists
        ids = [
            key for key, value in self.dataset["gob_mapping"].items()
            if value["source_mapping"] == self.source_id
        ]
        self.func_source_id = ids[0] if ids else "_source_id"

        self.injector = Injector(self.source.get("inject"))
        self.enricher = BaseEnricher(self.source_app, self.catalogue,
                                     self.entity)
        self.validator = Validator(self.source_app, self.catalogue,
                                   self.entity, self.dataset)
        self.converter = Converter(self.catalogue, self.entity, self.dataset)

    def get_result_msg(self):
        """The result of the import needs to be published.

        Publication includes a header, summary and results
        The header is for identification purposes
        The summary is for the interpretation of the results. Was the import successful, what were the metrics, etc.
        The results are the imported data in GOB format

        :return:
        """
        header = {
            **self.header,
            "depends_on": self.dataset['source'].get('depends_on', {}),
            "enrich": self.dataset['source'].get('enrich', {}),
            "version": self.dataset['version'],
            "timestamp": datetime.datetime.utcnow().isoformat()
        }

        summary = {'num_records': self.n_rows}

        log_msg = f"Import dataset {self.entity} from {self.source_app} completed. "
        if self.mode == ImportMode.DELETE:
            log_msg += "0 records imported, all known entities will be marked as deleted."
        else:
            log_msg += f"{summary['num_records']} records were read from the source."

        # Log end of import process
        self.logger.info(log_msg, kwargs={"data": summary})

        summary.update(self.logger.get_summary())

        import_message = {
            "header": header,
            "summary": summary,
            "contents_ref": self.filename
        }

        return import_message

    def import_rows(self, write, progress):
        self.logger.info(f"Connect to {self.source_app}")
        reader = Reader(self.source, self.source_app, self.dataset, self.mode)
        reader.connect()

        self.logger.info(f"Start import from {self.source_app}")
        self.n_rows = 0
        for row in reader.read():
            progress.tick()

            self.row = row
            self.n_rows += 1
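
            # Per-row pipeline: inject raw values, enrich the row, merge it with
            # prepared data, convert it to a GOB entity, then validate and write it.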

            self.injector.inject(row)

            self.enricher.enrich(row)

            self.merger.merge(row, write)

            entity = self.converter.convert(row)

            # validator and entity_validator build up sets of primary keys from the dataset
            # -> higher memory consumption
            self.validator.validate(entity)
            self.entity_validator.validate(entity)

            write(entity)

        self.validator.result()

        self.logger.info(
            f"{self.n_rows} records have been imported from {self.source_app}")

        min_rows = self.dataset.get("min_rows", 1)
        if self.mode == ImportMode.FULL and self.n_rows < min_rows:
            # Default requirement for full imports is a non-empty dataset
            self.logger.error(
                f"Too few records imported: {self.n_rows} < {min_rows}")

    def import_dataset(self):
        try:
            self.row = None

            with ContentsWriter() as writer, \
                    ProgressTicker(f"Import {self.catalogue} {self.entity}", 10000) as progress:

                self.filename = writer.filename

                # DELETE mode: skip importing rows so an empty file is written;
                # all known entities will then be marked as deleted
                if self.mode != ImportMode.DELETE:
                    self.merger.prepare(progress)
                    self.import_rows(writer.write, progress)
                    self.merger.finish(writer.write)
                    self.entity_validator.result()

        except Exception as e:
            # Print the error message and a short stacktrace
            stacktrace = traceback.format_exc(limit=-5)
            print(f"Import failed at row {self.n_rows}: {e}", stacktrace)
            # Log the error and a short error description
            self.logger.error(f'Import failed at row {self.n_rows}: {e}')
            self.logger.error(
                "Import has failed",
                {
                    "data": {
                        "error": str(e),  # a short error description
                        "row number": self.n_rows,
                        self.source_id: "" if self.row is None else self.row[self.source_id],
                    }
                })

        return self.get_result_msg()
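
A minimal usage sketch of the class above; the concrete dataset, msg and logger values are assumptions based on the constructor signature, not a confirmed calling convention:

    # `dataset` is the parsed dataset.json dict, `msg` is the incoming message (its
    # optional 'header' is echoed in the result) and `logger` must expose
    # info(), error() and get_summary(), as used by ImportClient above.
    client = ImportClient(dataset, msg, logger, mode=ImportMode.FULL)
    result_msg = client.import_dataset()  # runs the import and returns the result message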