def validate_ingest(
        self,
        ingest_info: IngestInfo,
        expected_ingest_info: IngestInfo,
        metadata: IngestMetadata,
    ) -> IngestInfo:
        """This function runs validation on a computed and expected ingest_info.

        Args:
            ingest_info: the computed ingest info object
            expected_ingest_info: the ingest info expected to be returned from
                `populate_data`. If `expected_ingest_info` is `None`, the
                return value of `populate_data` is expected to be `None`.
            metadata: an ingest info metadata struct to pass along to the proto
                converter.

        Returns:
            The computed ingest_info (the result of `populate_data`), in case
            the caller needs to do any extra validation on the output.

        """

        if expected_ingest_info is None:
            assert ingest_info == expected_ingest_info
            return ingest_info

        # Attempt to convert the ingest_info to the ingest info proto,
        # validate the proto, and finally attempt to convert the proto into
        # our entity objects (which includes parsing strings into types).
        ingest_info_proto = serialization.convert_ingest_info_to_proto(
            ingest_info)
        validate(ingest_info_proto)
        res = ingest_info_converter.convert_to_persistence_entities(
            ingest_info_proto, metadata)

        assert res.enum_parsing_errors == 0
        assert res.general_parsing_errors == 0
        assert res.protected_class_errors == 0

        entity_validator.validate(res.people)

        differences = diff_ingest_infos(expected_ingest_info, ingest_info)

        if differences:
            self.fail(  # type: ignore[attr-defined]
                "IngestInfo objects do not match.\n"
                "Expected:\n{}\n"
                "Actual:\n{}\n"
                "Differences:\n{}\n\n"
                "(paste the following) scraped object:"
                "\n{}".format(
                    expected_ingest_info,
                    ingest_info,
                    "\n".join(differences),
                    repr(ingest_info),
                ))

        return ingest_info
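
    # Hedged usage sketch (not in the source): a scraper test subclass could
    # build the expected IngestInfo by hand and validate the scraped result
    # against it. `self.metadata` is assumed to come from the test fixture.
    def test_validate_ingest_usage_sketch(self):
        expected = IngestInfo()
        expected.people.add(person_id="1", booking_ids=["2"])
        expected.bookings.add(booking_id="2")
        scraped = IngestInfo()
        scraped.people.add(person_id="1", booking_ids=["2"])
        scraped.bookings.add(booking_id="2")
        self.validate_ingest(scraped, expected, self.metadata)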

    def test_extra_id(self):
        # Arrange
        ingest_info = IngestInfo()
        ingest_info.bookings.add(booking_id="1")

        # Act
        with pytest.raises(ValidationError) as e:
            ingest_info_validator.validate(ingest_info)
        result = e.value.errors

        # Assert
        expected_result = {
            "bookings": {
                "ids_never_referenced": {"1"},
            }
        }

        self.assertEqual(result, expected_result)

    def test_non_existing_id(self):
        # Arrange
        ingest_info = IngestInfo()
        ingest_info.people.add(person_id="1", booking_ids=["2"])

        # Act
        with pytest.raises(ValidationError) as e:
            ingest_info_validator.validate(ingest_info)
        result = e.value.errors

        # Assert
        expected_result = {
            "bookings": {
                "ids_referenced_that_do_not_exist": {"2"},
            }
        }

        self.assertEqual(result, expected_result)

    def test_duplicate_ids(self):
        # Arrange
        ingest_info = IngestInfo()
        ingest_info.people.extend([
            Person(person_id=PERSON_1),
            Person(person_id=PERSON_1),
            Person(person_id=PERSON_1, booking_ids=[BOOKING_1, BOOKING_2]),
        ])
        ingest_info.bookings.extend([
            Booking(booking_id=BOOKING_1),
            Booking(booking_id=BOOKING_1, charge_ids=[CHARGE_1, CHARGE_2]),
            Booking(booking_id=BOOKING_2, arrest_id=ARREST_1),
            Booking(booking_id=BOOKING_2, arrest_id=ARREST_2),
        ])
        ingest_info.arrests.extend([
            Arrest(arrest_id=ARREST_1),
            Arrest(arrest_id=ARREST_1),
            Arrest(arrest_id=ARREST_2),
        ])
        ingest_info.charges.extend(
            [Charge(charge_id=CHARGE_1),
             Charge(charge_id=CHARGE_2)])

        # Act
        with pytest.raises(ValidationError) as e:
            ingest_info_validator.validate(ingest_info)
        result = e.value.errors

        # Assert
        expected_result = {
            "people": {
                "duplicate_ids": {PERSON_1},
            },
            "bookings": {
                "duplicate_ids": {BOOKING_1, BOOKING_2}
            },
            "arrests": {
                "duplicate_ids": {ARREST_1}
            },
        }

        self.assertEqual(result, expected_result)
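
# A hedged sketch (not the real ingest_info_validator) of the three checks the
# tests above exercise for each entity type: duplicate ids, references to ids
# that were never defined, and defined ids that are never referenced.
from collections import Counter

def check_ids(defined_ids, referenced_ids):
    errors = {}
    duplicates = {i for i, n in Counter(defined_ids).items() if n > 1}
    if duplicates:
        errors["duplicate_ids"] = duplicates
    missing = set(referenced_ids) - set(defined_ids)
    if missing:
        errors["ids_referenced_that_do_not_exist"] = missing
    extra = set(defined_ids) - set(referenced_ids)
    if extra:
        errors["ids_never_referenced"] = extra
    return errors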
Example #5
def _get_proto_from_batch_ingest_info_data_list(
        batch_ingest_info_data_list: List[BatchIngestInfoData],
) -> Tuple[ingest_info_pb2.IngestInfo, Dict[int, BatchIngestInfoData]]:
    """Merges an ingest_info_proto from all of the batched ingest_infos.

    Args:
        batch_ingest_info_data_list: A list of BatchIngestInfoData.
    Returns:
        an IngestInfo proto with data from all of the messages.
    """
    logging.info("Starting generation of proto")
    ingest_infos: List[IngestInfo] = []
    successful_tasks: Set[int] = set()
    failed_tasks: Dict[int, BatchIngestInfoData] = {}
    for batch_ingest_info_datum in batch_ingest_info_data_list:
        # We key on a precomputed task hash because dicts are not hashable in
        # Python, and we want to avoid an O(n^2) scan to find which tasks have
        # been seen previously, which can be on the order of a million
        # operations.
        task_hash = batch_ingest_info_datum.task_hash
        if (not batch_ingest_info_datum.error
                and task_hash not in successful_tasks):
            successful_tasks.add(task_hash)
            if task_hash in failed_tasks:
                del failed_tasks[task_hash]
            if batch_ingest_info_datum.ingest_info:
                ingest_infos.append(batch_ingest_info_datum.ingest_info)
        else:
            # We only add to failed_tasks if we never saw a successful run.
            # It's possible a task ran three times before passing, and we
            # don't want to fail overall because of those earlier attempts.
            if task_hash not in successful_tasks:
                failed_tasks[task_hash] = batch_ingest_info_datum

    deduped_ingest_info = _dedup_people(ingest_infos)
    base_proto = ingest_utils.convert_ingest_info_to_proto(deduped_ingest_info)
    ingest_info_validator.validate(base_proto)
    logging.info("Generated proto for [%s] people", len(base_proto.people))
    return base_proto, failed_tasks
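
# Hedged usage sketch (not in the source): merging results from two batched
# tasks, one of which never succeeded. The BatchIngestInfoData constructor
# arguments shown here are assumptions, not the real field names.
def _merge_batches_example():
    ok = BatchIngestInfoData(task_hash=1, ingest_info=IngestInfo())
    bad = BatchIngestInfoData(task_hash=2, error="scrape failed")
    proto, failed_tasks = _get_proto_from_batch_ingest_info_data_list([ok, bad])
    assert failed_tasks == {2: bad}  # task 2 never had a successful run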

    def test_empty_ingest_info(self):
        ingest_info_validator.validate(IngestInfo())

    def test_reports_all_errors_together(self):
        # Arrange
        ingest_info = IngestInfo()
        ingest_info.people.extend([
            Person(person_id=PERSON_1, booking_ids=[MISSING_BOOKING]),
            Person(person_id=PERSON_1, booking_ids=[BOOKING_1]),
            Person(person_id=PERSON_1, booking_ids=[BOOKING_1, BOOKING_2]),
            Person(person_id=EXTRA_PERSON),
        ])
        ingest_info.bookings.extend([
            Booking(booking_id=BOOKING_1),
            Booking(booking_id=BOOKING_1, arrest_id=MISSING_ARREST),
            Booking(
                booking_id=BOOKING_2,
                arrest_id=ARREST_1,
                charge_ids=[CHARGE_1, CHARGE_2, MISSING_CHARGE],
            ),
            Booking(booking_id=EXTRA_BOOKING),
        ])
        ingest_info.arrests.extend([
            Arrest(arrest_id=ARREST_1),
            Arrest(arrest_id=ARREST_1),
            Arrest(arrest_id=EXTRA_ARREST),
        ])
        ingest_info.charges.extend([
            Charge(charge_id=CHARGE_1),
            Charge(charge_id=CHARGE_1, sentence_id=SENTENCE_1, bond_id=BOND_1),
            Charge(
                charge_id=CHARGE_2,
                sentence_id=MISSING_SENTENCE,
                bond_id=MISSING_BOND,
            ),
            Charge(charge_id=EXTRA_CHARGE),
        ])
        ingest_info.bonds.extend([
            Bond(bond_id=BOND_1),
            Bond(bond_id=BOND_1),
            Bond(bond_id=EXTRA_BOND)
        ])
        ingest_info.sentences.extend([
            Sentence(sentence_id=SENTENCE_1),
            Sentence(sentence_id=SENTENCE_1),
            Sentence(sentence_id=EXTRA_SENTENCE),
        ])

        # Act
        with pytest.raises(ValidationError) as e:
            ingest_info_validator.validate(ingest_info)
        result = e.value.errors

        # Assert
        expected_result = {
            "people": {
                "duplicate_ids": {PERSON_1}
            },
            "bookings": {
                "duplicate_ids": {BOOKING_1},
                "ids_referenced_that_do_not_exist": {MISSING_BOOKING},
                "ids_never_referenced": {EXTRA_BOOKING},
            },
            "arrests": {
                "duplicate_ids": {ARREST_1},
                "ids_referenced_that_do_not_exist": {MISSING_ARREST},
                "ids_never_referenced": {EXTRA_ARREST},
            },
            "charges": {
                "duplicate_ids": {CHARGE_1},
                "ids_referenced_that_do_not_exist": {MISSING_CHARGE},
                "ids_never_referenced": {EXTRA_CHARGE},
            },
            "sentences": {
                "duplicate_ids": {SENTENCE_1},
                "ids_referenced_that_do_not_exist": {MISSING_SENTENCE},
                "ids_never_referenced": {EXTRA_SENTENCE},
            },
            "bonds": {
                "duplicate_ids": {BOND_1},
                "ids_referenced_that_do_not_exist": {MISSING_BOND},
                "ids_never_referenced": {EXTRA_BOND},
            },
        }

        self.assertEqual(result, expected_result)
Example #8
def write(ingest_info, metadata):
    """
    If in prod or if 'PERSIST_LOCALLY' is set to true, persist each person in
    the ingest_info. If a person with the given surname/birthday already exists,
    then update that person.

    Otherwise, simply log the given ingest_info for debugging.
    """
    ingest_info_validator.validate(ingest_info)

    mtags = {
        monitoring.TagKey.SHOULD_PERSIST: _should_persist(),
        monitoring.TagKey.PERSISTED: False
    }
    total_people = _get_total_people(ingest_info, metadata)
    with monitoring.measurements(mtags) as measurements:

        # Convert the people one at a time and count the errors as they happen.
        conversion_result: IngestInfoConversionResult = \
            ingest_info_converter.convert_to_persistence_entities(ingest_info,
                                                                  metadata)

        people, data_validation_errors = entity_validator.validate(
            conversion_result.people)
        logging.info(
            "Converted [%s] people with [%s] enum_parsing_errors, [%s]"
            " general_parsing_errors, [%s] protected_class_errors and "
            "[%s] data_validation_errors", len(people),
            conversion_result.enum_parsing_errors,
            conversion_result.general_parsing_errors,
            conversion_result.protected_class_errors, data_validation_errors)
        measurements.measure_int_put(m_people, len(people))

        if _should_abort(total_root_entities=total_people,
                         conversion_result=conversion_result,
                         data_validation_errors=data_validation_errors):
            #  TODO(#1665): remove once dangling PERSIST session investigation
            #   is complete.
            logging.info("_should_abort_ was true after converting people")
            return False

        if not _should_persist():
            return True

        persisted = False

        session = SessionFactory.for_schema_base(
            schema_base_for_system_level(metadata.system_level))

        try:
            logging.info("Starting entity matching")

            entity_matching_output = entity_matching.match(
                session, metadata.region, people)
            people = entity_matching_output.people
            total_root_entities = total_people \
                if metadata.system_level == SystemLevel.COUNTY \
                else entity_matching_output.total_root_entities
            logging.info("Completed entity matching with [%s] errors",
                         entity_matching_output.error_count)
            logging.info(
                "Completed entity matching and have [%s] total people "
                "to commit to DB", len(people))
            if _should_abort(
                    total_root_entities=total_root_entities,
                    conversion_result=conversion_result,
                    entity_matching_errors=entity_matching_output.error_count,
                    data_validation_errors=data_validation_errors):
                #  TODO(#1665): remove once dangling PERSIST session
                #   investigation is complete.
                logging.info("_should_abort_ was true after entity matching")
                return False

            database.write_people(
                session,
                people,
                metadata,
                orphaned_entities=entity_matching_output.orphaned_entities)
            logging.info("Successfully wrote to the database")
            session.commit()

            persisted = True
            mtags[monitoring.TagKey.PERSISTED] = True
        except Exception as e:
            logging.exception("An exception was raised in write(): [%s]",
                              type(e).__name__)
            # Record the error type that happened and increment the counter
            mtags[monitoring.TagKey.ERROR] = type(e).__name__
            measurements.measure_int_put(m_errors, 1)
            session.rollback()
            raise
        finally:
            session.close()
        return persisted
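
# Hedged sketch (not the real helper): per the docstring above, persistence is
# gated on running in prod or on the 'PERSIST_LOCALLY' environment variable.
# The 'ENV' check here stands in for the real prod detection, which is an
# assumption of this sketch.
import os

def _should_persist_sketch() -> bool:
    in_prod = os.environ.get("ENV") == "production"
    persist_locally = os.environ.get("PERSIST_LOCALLY", "").lower() == "true"
    return in_prod or persist_locally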
Example #9
def write(
    ingest_info: IngestInfo,
    ingest_metadata: IngestMetadata,
    run_txn_fn: Callable[
        [Session, MeasurementMap, Callable[[Session], bool], Optional[int]],
        bool] = retry_transaction,
) -> bool:
    """
    If in prod or if 'PERSIST_LOCALLY' is set to true, persist each person in
    the ingest_info. If a person with the given surname/birthday already exists,
    then update that person.

    Otherwise, simply log the given ingest_info for debugging.

    `run_txn_fn` is exposed primarily for testing and should typically be left
    as `retry_transaction`. `run_txn_fn` must handle the coordination of the
    transaction, including when to run the body of the transaction and when to
    commit, rollback, or close the session.
    """
    ingest_info_validator.validate(ingest_info)

    mtags: Dict[str, Union[bool, str]] = {
        monitoring.TagKey.SHOULD_PERSIST: should_persist(),
        monitoring.TagKey.PERSISTED: False,
    }
    total_people = _get_total_people(ingest_info, ingest_metadata)
    with monitoring.measurements(mtags) as measurements:

        # Convert the people one at a time and count the errors as they happen.
        conversion_result: IngestInfoConversionResult = (
            ingest_info_converter.convert_to_persistence_entities(
                ingest_info, ingest_metadata))

        people, data_validation_errors = entity_validator.validate(
            conversion_result.people)
        logging.info(
            "Converted [%s] people with [%s] enum_parsing_errors, [%s]"
            " general_parsing_errors, [%s] protected_class_errors and "
            "[%s] data_validation_errors",
            len(people),
            conversion_result.enum_parsing_errors,
            conversion_result.general_parsing_errors,
            conversion_result.protected_class_errors,
            data_validation_errors,
        )
        measurements.measure_int_put(m_people, len(people))

        if _should_abort(
                total_root_entities=total_people,
                system_level=ingest_metadata.system_level,
                conversion_result=conversion_result,
                region_code=ingest_metadata.region,
                data_validation_errors=data_validation_errors,
        ):
            #  TODO(#1665): remove once dangling PERSIST session investigation
            #   is complete.
            logging.info("_should_abort_ was true after converting people")
            return False

        if not should_persist():
            return True

        @trace.span
        def match_and_write_people(session: Session) -> bool:
            logging.info("Starting entity matching")

            entity_matching_output = entity_matching.match(
                session, ingest_metadata.region, people)
            output_people = entity_matching_output.people
            total_root_entities = (total_people if ingest_metadata.system_level
                                   == SystemLevel.COUNTY else
                                   entity_matching_output.total_root_entities)
            logging.info(
                "Completed entity matching with [%s] errors",
                entity_matching_output.error_count,
            )
            logging.info(
                "Completed entity matching and have [%s] total people "
                "to commit to DB",
                len(output_people),
            )
            if _should_abort(
                    total_root_entities=total_root_entities,
                    system_level=ingest_metadata.system_level,
                    conversion_result=conversion_result,
                    region_code=ingest_metadata.region,
                    entity_matching_errors=entity_matching_output.error_count,
            ):
                #  TODO(#1665): remove once dangling PERSIST session
                #   investigation is complete.
                logging.info("_should_abort_ was true after entity matching")
                return False

            database_invariant_errors = (
                database_invariant_validator.validate_invariants(
                    session,
                    ingest_metadata.system_level,
                    ingest_metadata.region,
                    output_people,
                ))

            if _should_abort(
                    total_root_entities=total_root_entities,
                    system_level=ingest_metadata.system_level,
                    conversion_result=conversion_result,
                    region_code=ingest_metadata.region,
                    database_invariant_errors=database_invariant_errors,
            ):
                logging.info(
                    "_should_abort_ was true after database invariant validation"
                )
                return False

            database.write_people(
                session,
                output_people,
                ingest_metadata,
                orphaned_entities=entity_matching_output.orphaned_entities,
            )
            logging.info("Successfully wrote to the database")
            return True

        try:
            with SessionFactory.using_database(ingest_metadata.database_key,
                                               autocommit=False) as session:
                if not run_txn_fn(session, measurements,
                                  match_and_write_people, 5):
                    return False
            mtags[monitoring.TagKey.PERSISTED] = True
        except Exception as e:
            logging.exception("An exception was raised in write(): [%s]",
                              type(e).__name__)
            # Record the error type that happened and increment the counter
            mtags[monitoring.TagKey.ERROR] = type(e).__name__
            measurements.measure_int_put(m_errors, 1)
            raise
        return True
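
# Hedged test stub (not in the source) matching the run_txn_fn signature
# above: it runs the transaction body exactly once, committing on success and
# rolling back otherwise, with no retries.
def run_txn_once(
    session: Session,
    measurements: MeasurementMap,
    txn_body: Callable[[Session], bool],
    max_retries: Optional[int],
) -> bool:
    try:
        if txn_body(session):
            session.commit()
            return True
        session.rollback()
        return False
    except Exception:
        session.rollback()
        raise

# A test might then call:
#     write(ingest_info, ingest_metadata, run_txn_fn=run_txn_once)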