def _convert_and_throw_on_errors(
    ingest_info: IngestInfo, metadata: IngestMetadata
) -> List[county_entities.Person]:
    conversion_result: IngestInfoConversionResult = (
        ingest_info_converter.convert_to_persistence_entities(ingest_info, metadata)
    )
    if conversion_result.enum_parsing_errors > 0:
        raise ValueError(
            "Had [{}] enum parsing errors".format(
                conversion_result.enum_parsing_errors
            )
        )
    if conversion_result.general_parsing_errors > 0:
        raise ValueError(
            "Had [{}] general parsing errors".format(
                conversion_result.general_parsing_errors
            )
        )
    if conversion_result.protected_class_errors > 0:
        raise ValueError(
            "Had [{}] protected class errors".format(
                conversion_result.protected_class_errors
            )
        )
    return conversion_result.people
def validate_ingest(
    self,
    ingest_info: IngestInfo,
    expected_ingest_info: IngestInfo,
    metadata: IngestMetadata,
) -> IngestInfo:
    """This function runs validation on a computed and expected ingest_info.

    Args:
        ingest_info: the computed ingest info object
        expected_ingest_info: the ingest info expected to be returned from
            `populate_data`. If `expected_ingest_info` is `None`, then expects
            the return value of `populate_data` to be `None`.
        metadata: an ingest info metadata struct to pass along to the proto
            converter.

    Returns:
        The result from populate_data in case the user needs to do any extra
        validations on the output.
    """
    if expected_ingest_info is None:
        assert ingest_info == expected_ingest_info
        return ingest_info

    # Attempt to convert the ingest_info to the ingest info proto,
    # validate the proto, and finally attempt to convert the proto into
    # our entity objects (which includes parsing strings into types)
    ingest_info_proto = serialization.convert_ingest_info_to_proto(ingest_info)
    validate(ingest_info_proto)
    res = ingest_info_converter.convert_to_persistence_entities(
        ingest_info_proto, metadata
    )

    assert res.enum_parsing_errors == 0
    assert res.general_parsing_errors == 0
    assert res.protected_class_errors == 0
    entity_validator.validate(res.people)

    differences = diff_ingest_infos(expected_ingest_info, ingest_info)

    if differences:
        self.fail(  # type: ignore[attr-defined]
            "IngestInfo objects do not match.\n"
            "Expected:\n{}\n"
            "Actual:\n{}\n"
            "Differences:\n{}\n\n"
            "(paste the following) scraped object:"
            "\n{}".format(
                expected_ingest_info,
                ingest_info,
                "\n".join(differences),
                repr(ingest_info),
            )
        )

    return ingest_info
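# A minimal sketch of how `validate_ingest` might be used from a scraper test,
# assuming the helper above lives on a BaseScraperTest-style mixin combined
# with unittest.TestCase (so `self.fail` is available). The scraper, fixture
# content, task, and metadata attributes referenced here are illustrative
# assumptions, not part of the original source.
class ExampleScraperTest(BaseScraperTest, unittest.TestCase):
    def test_populate_data(self) -> None:
        expected = IngestInfo()
        expected.create_person(person_id="12345", surname="DOE")

        computed = self.scraper.populate_data(
            self.fixture_content, self.task, IngestInfo()
        )
        # Fails the test when the computed IngestInfo differs from the
        # expected one; otherwise returns the computed object so additional
        # assertions can be made on it.
        self.validate_ingest(computed, expected, self.metadata)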
def write(ingest_info, metadata):
    """
    If in prod or if 'PERSIST_LOCALLY' is set to true, persist each person in
    the ingest_info. If a person with the given surname/birthday already
    exists, then update that person.

    Otherwise, simply log the given ingest_infos for debugging
    """
    ingest_info_validator.validate(ingest_info)

    mtags = {
        monitoring.TagKey.SHOULD_PERSIST: _should_persist(),
        monitoring.TagKey.PERSISTED: False
    }
    total_people = _get_total_people(ingest_info, metadata)
    with monitoring.measurements(mtags) as measurements:

        # Convert the people one at a time and count the errors as they happen.
        conversion_result: IngestInfoConversionResult = \
            ingest_info_converter.convert_to_persistence_entities(
                ingest_info, metadata)

        people, data_validation_errors = entity_validator.validate(
            conversion_result.people)
        logging.info(
            "Converted [%s] people with [%s] enum_parsing_errors, [%s]"
            " general_parsing_errors, [%s] protected_class_errors and "
            "[%s] data_validation_errors",
            len(people),
            conversion_result.enum_parsing_errors,
            conversion_result.general_parsing_errors,
            conversion_result.protected_class_errors,
            data_validation_errors)
        measurements.measure_int_put(m_people, len(people))

        if _should_abort(total_root_entities=total_people,
                         conversion_result=conversion_result,
                         data_validation_errors=data_validation_errors):
            # TODO(#1665): remove once dangling PERSIST session investigation
            # is complete.
            logging.info("_should_abort_ was true after converting people")
            return False

        if not _should_persist():
            return True

        persisted = False

        session = SessionFactory.for_schema_base(
            schema_base_for_system_level(metadata.system_level))

        try:
            logging.info("Starting entity matching")

            entity_matching_output = entity_matching.match(
                session, metadata.region, people)
            people = entity_matching_output.people
            total_root_entities = total_people \
                if metadata.system_level == SystemLevel.COUNTY \
                else entity_matching_output.total_root_entities
            logging.info("Completed entity matching with [%s] errors",
                         entity_matching_output.error_count)
            logging.info(
                "Completed entity matching and have [%s] total people "
                "to commit to DB", len(people))

            if _should_abort(
                    total_root_entities=total_root_entities,
                    conversion_result=conversion_result,
                    entity_matching_errors=entity_matching_output.error_count,
                    data_validation_errors=data_validation_errors):
                # TODO(#1665): remove once dangling PERSIST session
                # investigation is complete.
                logging.info("_should_abort_ was true after entity matching")
                return False

            database.write_people(
                session, people, metadata,
                orphaned_entities=entity_matching_output.orphaned_entities)
            logging.info("Successfully wrote to the database")
            session.commit()

            persisted = True
            mtags[monitoring.TagKey.PERSISTED] = True
        except Exception as e:
            logging.exception("An exception was raised in write(): [%s]",
                              type(e).__name__)
            # Record the error type that happened and increment the counter
            mtags[monitoring.TagKey.ERROR] = type(e).__name__
            measurements.measure_int_put(m_errors, 1)
            session.rollback()
            raise
        finally:
            session.close()

        return persisted
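# `_should_abort` is called above but defined elsewhere in the module. The
# following is a plausible, simplified sketch of the idea, assuming a single
# error-ratio threshold and treating protected class errors as always fatal;
# the real implementation, its thresholds, and its exact signature live in the
# actual module and may differ, so this is illustrative only. The name
# `_should_abort_sketch` and `_ERROR_THRESHOLD` are hypothetical.
_ERROR_THRESHOLD = 0.5  # Hypothetical: abort if over half the entities error.


def _should_abort_sketch(
    total_root_entities: int,
    conversion_result: IngestInfoConversionResult,
    entity_matching_errors: int = 0,
    data_validation_errors: int = 0,
) -> bool:
    if total_root_entities == 0:
        return False
    # Protected class errors are treated as fatal regardless of volume.
    if conversion_result.protected_class_errors > 0:
        return True
    total_errors = (
        conversion_result.enum_parsing_errors
        + conversion_result.general_parsing_errors
        + entity_matching_errors
        + data_validation_errors
    )
    return (total_errors / total_root_entities) > _ERROR_THRESHOLD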
def write(
    ingest_info: IngestInfo,
    ingest_metadata: IngestMetadata,
    run_txn_fn: Callable[
        [Session, MeasurementMap, Callable[[Session], bool], Optional[int]], bool
    ] = retry_transaction,
) -> bool:
    """
    If in prod or if 'PERSIST_LOCALLY' is set to true, persist each person in
    the ingest_info. If a person with the given surname/birthday already
    exists, then update that person.

    Otherwise, simply log the given ingest_infos for debugging

    `run_txn_fn` is exposed primarily for testing and should typically be left
    as `retry_transaction`. `run_txn_fn` must handle the coordination of the
    transaction including when to run the body of the transaction and when to
    commit, rollback, or close the session.
    """
    ingest_info_validator.validate(ingest_info)

    mtags: Dict[str, Union[bool, str]] = {
        monitoring.TagKey.SHOULD_PERSIST: should_persist(),
        monitoring.TagKey.PERSISTED: False,
    }
    total_people = _get_total_people(ingest_info, ingest_metadata)
    with monitoring.measurements(mtags) as measurements:

        # Convert the people one at a time and count the errors as they happen.
        conversion_result: IngestInfoConversionResult = (
            ingest_info_converter.convert_to_persistence_entities(
                ingest_info, ingest_metadata
            )
        )

        people, data_validation_errors = entity_validator.validate(
            conversion_result.people
        )
        logging.info(
            "Converted [%s] people with [%s] enum_parsing_errors, [%s]"
            " general_parsing_errors, [%s] protected_class_errors and "
            "[%s] data_validation_errors",
            len(people),
            conversion_result.enum_parsing_errors,
            conversion_result.general_parsing_errors,
            conversion_result.protected_class_errors,
            data_validation_errors,
        )
        measurements.measure_int_put(m_people, len(people))

        if _should_abort(
            total_root_entities=total_people,
            system_level=ingest_metadata.system_level,
            conversion_result=conversion_result,
            region_code=ingest_metadata.region,
            data_validation_errors=data_validation_errors,
        ):
            # TODO(#1665): remove once dangling PERSIST session investigation
            # is complete.
            logging.info("_should_abort_ was true after converting people")
            return False

        if not should_persist():
            return True

        @trace.span
        def match_and_write_people(session: Session) -> bool:
            logging.info("Starting entity matching")

            entity_matching_output = entity_matching.match(
                session, ingest_metadata.region, people
            )
            output_people = entity_matching_output.people
            total_root_entities = (
                total_people
                if ingest_metadata.system_level == SystemLevel.COUNTY
                else entity_matching_output.total_root_entities
            )
            logging.info(
                "Completed entity matching with [%s] errors",
                entity_matching_output.error_count,
            )
            logging.info(
                "Completed entity matching and have [%s] total people "
                "to commit to DB",
                len(output_people),
            )
            if _should_abort(
                total_root_entities=total_root_entities,
                system_level=ingest_metadata.system_level,
                conversion_result=conversion_result,
                region_code=ingest_metadata.region,
                entity_matching_errors=entity_matching_output.error_count,
            ):
                # TODO(#1665): remove once dangling PERSIST session
                # investigation is complete.
                logging.info("_should_abort_ was true after entity matching")
                return False

            database_invariant_errors = (
                database_invariant_validator.validate_invariants(
                    session,
                    ingest_metadata.system_level,
                    ingest_metadata.region,
                    output_people,
                )
            )

            if _should_abort(
                total_root_entities=total_root_entities,
                system_level=ingest_metadata.system_level,
                conversion_result=conversion_result,
                region_code=ingest_metadata.region,
                database_invariant_errors=database_invariant_errors,
            ):
                logging.info(
                    "_should_abort_ was true after database invariant validation"
                )
                return False

            database.write_people(
                session,
                output_people,
                ingest_metadata,
                orphaned_entities=entity_matching_output.orphaned_entities,
            )
            logging.info("Successfully wrote to the database")
            return True

        try:
            with SessionFactory.using_database(
                ingest_metadata.database_key, autocommit=False
            ) as session:
                if not run_txn_fn(session, measurements, match_and_write_people, 5):
                    return False
            mtags[monitoring.TagKey.PERSISTED] = True
        except Exception as e:
            logging.exception(
                "An exception was raised in write(): [%s]", type(e).__name__
            )
            # Record the error type that happened and increment the counter
            mtags[monitoring.TagKey.ERROR] = type(e).__name__
            measurements.measure_int_put(m_errors, 1)
            raise

    return True