def run(apache_beam_pipeline_options: PipelineOptions,
        data_input: str,
        reference_view_input: str,
        static_reference_input: str,
        output: str,
        calculation_month_count: int,
        metric_types: List[str],
        state_code: Optional[str],
        calculation_end_month: Optional[str],
        person_filter_ids: Optional[List[int]]):
    """Runs the supervision calculation pipeline."""

    # Workaround to load SQLAlchemy objects at start of pipeline. This is
    # necessary because the BuildRootEntity function tries to access attributes
    # of relationship properties on the SQLAlchemy root schema class before
    # they have been loaded. However, if *any* SQLAlchemy objects have been
    # instantiated, then the relationship properties are loaded and their
    # attributes can be successfully accessed.
    _ = schema.StatePerson()

    apache_beam_pipeline_options.view_as(SetupOptions).save_main_session = True

    # Get pipeline job details
    all_pipeline_options = apache_beam_pipeline_options.get_all_options()
    project_id = all_pipeline_options['project']

    input_dataset = project_id + '.' + data_input
    reference_dataset = project_id + '.' + reference_view_input
    static_reference_dataset = project_id + '.' + static_reference_input

    person_id_filter_set = set(person_filter_ids) if person_filter_ids else None

    with beam.Pipeline(options=apache_beam_pipeline_options) as p:
        # Get StatePersons
        persons = (p | 'Load Persons' >> BuildRootEntity(
            dataset=input_dataset,
            root_entity_class=entities.StatePerson,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=True,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code))

        # Get StateIncarcerationPeriods
        incarceration_periods = (p | 'Load IncarcerationPeriods' >> BuildRootEntity(
            dataset=input_dataset,
            root_entity_class=entities.StateIncarcerationPeriod,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=True,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code))

        # Get StateSupervisionViolations
        supervision_violations = (p | 'Load SupervisionViolations' >> BuildRootEntity(
            dataset=input_dataset,
            root_entity_class=entities.StateSupervisionViolation,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=True,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code))

        # TODO(#2769): Don't bring this in as a root entity
        # Get StateSupervisionViolationResponses
        supervision_violation_responses = (
            p | 'Load SupervisionViolationResponses' >> BuildRootEntity(
                dataset=input_dataset,
                root_entity_class=entities.StateSupervisionViolationResponse,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=True,
                unifying_id_field_filter_set=person_id_filter_set,
                state_code=state_code))

        # Get StateSupervisionSentences
        supervision_sentences = (
            p | 'Load SupervisionSentences' >> BuildRootEntity(
                dataset=input_dataset,
                root_entity_class=entities.StateSupervisionSentence,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=True,
                unifying_id_field_filter_set=person_id_filter_set,
                state_code=state_code))

        # Get StateIncarcerationSentences
        incarceration_sentences = (
            p | 'Load IncarcerationSentences' >> BuildRootEntity(
                dataset=input_dataset,
                root_entity_class=entities.StateIncarcerationSentence,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=True,
                unifying_id_field_filter_set=person_id_filter_set,
                state_code=state_code))

        # Get StateSupervisionPeriods
        supervision_periods = (
            p | 'Load SupervisionPeriods' >> BuildRootEntity(
                dataset=input_dataset,
                root_entity_class=entities.StateSupervisionPeriod,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=True,
                unifying_id_field_filter_set=person_id_filter_set,
                state_code=state_code))

        # Get StateAssessments
        assessments = (p | 'Load Assessments' >> BuildRootEntity(
            dataset=input_dataset,
            root_entity_class=entities.StateAssessment,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=False,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code))

        # Get StateSupervisionContacts
        supervision_contacts = (
            p | 'Load StateSupervisionContacts' >> BuildRootEntity(
                dataset=input_dataset,
                root_entity_class=entities.StateSupervisionContact,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=False,
                unifying_id_field_filter_set=person_id_filter_set,
                state_code=state_code))

        ssvr_agent_associations_as_kv = (
            p | 'Load ssvr_agent_associations_as_kv' >> ImportTableAsKVTuples(
                dataset_id=reference_dataset,
                table_id=SSVR_TO_AGENT_ASSOCIATION_VIEW_NAME,
                table_key='supervision_violation_response_id',
                state_code_filter=state_code,
                person_id_filter_set=None))

        supervision_period_to_agent_associations_as_kv = (
            p | 'Load supervision_period_to_agent_associations_as_kv' >>
            ImportTableAsKVTuples(
                dataset_id=reference_dataset,
                table_id=SUPERVISION_PERIOD_TO_AGENT_ASSOCIATION_VIEW_NAME,
                table_key='supervision_period_id',
                state_code_filter=state_code,
                person_id_filter_set=None))

        # Bring in the judicial districts associated with supervision_periods
        sp_to_judicial_district_kv = (
            p | 'Load sp_to_judicial_district_kv' >> ImportTableAsKVTuples(
                dataset_id=reference_dataset,
                table_id=SUPERVISION_PERIOD_JUDICIAL_DISTRICT_ASSOCIATION_VIEW_NAME,
                state_code_filter=state_code,
                person_id_filter_set=person_id_filter_set,
                table_key='person_id'))

        state_race_ethnicity_population_counts = (
            p | 'Load state_race_ethnicity_population_counts' >> ImportTable(
                dataset_id=static_reference_dataset,
                table_id='state_race_ethnicity_population_counts',
                state_code_filter=state_code,
                person_id_filter_set=None))

        if state_code is None or state_code == 'US_MO':
            # Bring in the reference table that includes sentence status
            # ranking information
            us_mo_sentence_status_query = select_all_by_person_query(
                reference_dataset,
                US_MO_SENTENCE_STATUSES_VIEW_NAME,
                state_code,
                person_id_filter_set)

            us_mo_sentence_statuses = (
                p | "Read MO sentence status table from BigQuery" >>
                beam.io.Read(beam.io.BigQuerySource(
                    query=us_mo_sentence_status_query,
                    use_standard_sql=True)))
        else:
            us_mo_sentence_statuses = (
                p | f"Generate empty MO statuses list for non-MO state run: {state_code}" >>
                beam.Create([]))

        us_mo_sentence_status_rankings_as_kv = (
            us_mo_sentence_statuses
            | 'Convert MO sentence status ranking table to KV tuples' >>
            beam.ParDo(ConvertDictToKVTuple(), 'person_id'))

        sentences_and_statuses = (
            {'incarceration_sentences': incarceration_sentences,
             'supervision_sentences': supervision_sentences,
             'sentence_statuses': us_mo_sentence_status_rankings_as_kv}
            | 'Group sentences to the sentence statuses for that person' >>
            beam.CoGroupByKey())

        sentences_converted = (
            sentences_and_statuses
            | 'Convert to state-specific sentences' >> beam.ParDo(
                ConvertSentencesToStateSpecificType()).with_outputs(
                    'incarceration_sentences', 'supervision_sentences'))

        # Group StateSupervisionViolationResponses and
        # StateSupervisionViolations by person_id
        supervision_violations_and_responses = (
            {'violations': supervision_violations,
             'violation_responses': supervision_violation_responses}
            | 'Group StateSupervisionViolationResponses to '
              'StateSupervisionViolations' >> beam.CoGroupByKey())

        # Set the fully hydrated StateSupervisionViolation entities on the
        # corresponding StateSupervisionViolationResponses
        violation_responses_with_hydrated_violations = (
            supervision_violations_and_responses
            | 'Set hydrated StateSupervisionViolations on '
              'the StateSupervisionViolationResponses' >>
            beam.ParDo(SetViolationOnViolationsResponse()))

        # Group StateIncarcerationPeriods and
        # StateSupervisionViolationResponses by person_id
        incarceration_periods_and_violation_responses = (
            {'incarceration_periods': incarceration_periods,
             'violation_responses': violation_responses_with_hydrated_violations}
            | 'Group StateIncarcerationPeriods to '
              'StateSupervisionViolationResponses' >> beam.CoGroupByKey())

        # Set the fully hydrated StateSupervisionViolationResponse entities on
        # the corresponding StateIncarcerationPeriods
        incarceration_periods_with_source_violations = (
            incarceration_periods_and_violation_responses
            | 'Set hydrated StateSupervisionViolationResponses on '
              'the StateIncarcerationPeriods' >>
            beam.ParDo(SetViolationResponseOnIncarcerationPeriod()))

        # Group each StatePerson with their related entities
        person_entities = (
            {'person': persons,
             'assessments': assessments,
             'incarceration_periods': incarceration_periods_with_source_violations,
             'supervision_periods': supervision_periods,
             'supervision_sentences': sentences_converted.supervision_sentences,
             'incarceration_sentences': sentences_converted.incarceration_sentences,
             'violation_responses': violation_responses_with_hydrated_violations,
             'supervision_contacts': supervision_contacts,
             'supervision_period_judicial_district_association': sp_to_judicial_district_kv}
            | 'Group StatePerson to all entities' >> beam.CoGroupByKey())

        # Identify SupervisionTimeBuckets from the StatePerson's
        # StateSupervisionSentences and StateIncarcerationPeriods
        person_time_buckets = (
            person_entities
            | 'Get SupervisionTimeBuckets' >> beam.ParDo(
                ClassifySupervisionTimeBuckets(),
                AsDict(ssvr_agent_associations_as_kv),
                AsDict(supervision_period_to_agent_associations_as_kv)))

        person_metadata = (
            persons
            | "Build the person_metadata dictionary" >> beam.ParDo(
                BuildPersonMetadata(),
                AsList(state_race_ethnicity_population_counts)))

        person_time_buckets_with_metadata = (
            {'person_events': person_time_buckets,
             'person_metadata': person_metadata}
            | 'Group SupervisionTimeBuckets with person-level metadata' >>
            beam.CoGroupByKey()
            | 'Organize StatePerson, PersonMetadata and SupervisionTimeBuckets for calculations' >>
            beam.ParDo(ExtractPersonEventsMetadata()))

        # Get pipeline job details for accessing job_id
        all_pipeline_options = apache_beam_pipeline_options.get_all_options()

        # Get the type of metric to calculate
        metric_types_set = set(metric_types)

        # Add timestamp for local jobs
        job_timestamp = datetime.datetime.now().strftime('%Y-%m-%d_%H_%M_%S.%f')
        all_pipeline_options['job_timestamp'] = job_timestamp

        # Get supervision metrics
        supervision_metrics = (
            person_time_buckets_with_metadata
            | 'Get Supervision Metrics' >> GetSupervisionMetrics(
                pipeline_options=all_pipeline_options,
                metric_types=metric_types_set,
                calculation_end_month=calculation_end_month,
                calculation_month_count=calculation_month_count))

        if person_id_filter_set:
            logging.warning(
                "Non-empty person filter set - returning before writing metrics.")
            return

        # Convert the metrics into a format that's writable to BQ
        writable_metrics = (
            supervision_metrics
            | 'Convert to dict to be written to BQ' >>
            beam.ParDo(RecidivizMetricWritableDict()).with_outputs(
                SupervisionMetricType.SUPERVISION_COMPLIANCE.value,
                SupervisionMetricType.SUPERVISION_POPULATION.value,
                SupervisionMetricType.SUPERVISION_REVOCATION.value,
                SupervisionMetricType.SUPERVISION_REVOCATION_ANALYSIS.value,
                SupervisionMetricType.SUPERVISION_REVOCATION_VIOLATION_TYPE_ANALYSIS.value,
                SupervisionMetricType.SUPERVISION_SUCCESS.value,
                SupervisionMetricType.SUPERVISION_SUCCESSFUL_SENTENCE_DAYS_SERVED.value,
                SupervisionMetricType.SUPERVISION_TERMINATION.value))

        # Write the metrics to the output tables in BigQuery
        terminations_table_id = DATAFLOW_METRICS_TO_TABLES.get(
            SupervisionTerminationMetric)
        compliance_table_id = DATAFLOW_METRICS_TO_TABLES.get(
            SupervisionCaseComplianceMetric)
        populations_table_id = DATAFLOW_METRICS_TO_TABLES.get(
            SupervisionPopulationMetric)
        revocations_table_id = DATAFLOW_METRICS_TO_TABLES.get(
            SupervisionRevocationMetric)
        revocation_analysis_table_id = DATAFLOW_METRICS_TO_TABLES.get(
            SupervisionRevocationAnalysisMetric)
        revocation_violation_type_analysis_table_id = DATAFLOW_METRICS_TO_TABLES.get(
            SupervisionRevocationViolationTypeAnalysisMetric)
        successes_table_id = DATAFLOW_METRICS_TO_TABLES.get(
            SupervisionSuccessMetric)
        successful_sentence_lengths_table_id = DATAFLOW_METRICS_TO_TABLES.get(
            SuccessfulSupervisionSentenceDaysServedMetric)

        _ = (writable_metrics.SUPERVISION_POPULATION
             | f"Write population metrics to BQ table: {populations_table_id}" >>
             beam.io.WriteToBigQuery(
                 table=populations_table_id,
                 dataset=output,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
                 method=beam.io.WriteToBigQuery.Method.FILE_LOADS))

        _ = (writable_metrics.SUPERVISION_REVOCATION
             | f"Write revocation metrics to BQ table: {revocations_table_id}" >>
             beam.io.WriteToBigQuery(
                 table=revocations_table_id,
                 dataset=output,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
                 method=beam.io.WriteToBigQuery.Method.FILE_LOADS))

        _ = (writable_metrics.SUPERVISION_SUCCESS
             | f"Write success metrics to BQ table: {successes_table_id}" >>
             beam.io.WriteToBigQuery(
                 table=successes_table_id,
                 dataset=output,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
                 method=beam.io.WriteToBigQuery.Method.FILE_LOADS))

        _ = (writable_metrics.SUPERVISION_SUCCESSFUL_SENTENCE_DAYS_SERVED
             | f"Write supervision successful sentence length metrics to BQ"
               f" table: {successful_sentence_lengths_table_id}" >>
             beam.io.WriteToBigQuery(
                 table=successful_sentence_lengths_table_id,
                 dataset=output,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
                 method=beam.io.WriteToBigQuery.Method.FILE_LOADS))

        _ = (writable_metrics.SUPERVISION_TERMINATION
             | f"Write termination metrics to BQ table: {terminations_table_id}" >>
             beam.io.WriteToBigQuery(
                 table=terminations_table_id,
                 dataset=output,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
                 method=beam.io.WriteToBigQuery.Method.FILE_LOADS))

        _ = (writable_metrics.SUPERVISION_REVOCATION_ANALYSIS
             | f"Write revocation analysis metrics to BQ table: {revocation_analysis_table_id}" >>
             beam.io.WriteToBigQuery(
                 table=revocation_analysis_table_id,
                 dataset=output,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
                 method=beam.io.WriteToBigQuery.Method.FILE_LOADS))

        _ = (writable_metrics.SUPERVISION_REVOCATION_VIOLATION_TYPE_ANALYSIS
             | f"Write revocation violation type analysis metrics to BQ table: "
               f"{revocation_violation_type_analysis_table_id}" >>
             beam.io.WriteToBigQuery(
                 table=revocation_violation_type_analysis_table_id,
                 dataset=output,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
                 method=beam.io.WriteToBigQuery.Method.FILE_LOADS))

        _ = (writable_metrics.SUPERVISION_COMPLIANCE
             | f"Write compliance metrics to BQ table: {compliance_table_id}" >>
             beam.io.WriteToBigQuery(
                 table=compliance_table_id,
                 dataset=output,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
                 method=beam.io.WriteToBigQuery.Method.FILE_LOADS))
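
# Note: ConvertDictToKVTuple, used above and in the pipelines below, is an
# internal DoFn. As a point of reference, a minimal sketch of the behavior the
# call sites imply -- keying each BigQuery row dict by a field name passed as
# a side argument -- might look like the following. The class name and the
# exact error handling here are assumptions, not the canonical implementation.
class _ConvertDictToKVTupleSketch(beam.DoFn):
    """Keys each row dictionary by the value of the given field."""

    def process(self, element, key_field):
        if key_field not in element:
            raise ValueError(f"Row is missing key field: {key_field}")
        # Emit (key, row) so downstream CoGroupByKey / AsDict can consume it.
        yield element[key_field], element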
def run(
    apache_beam_pipeline_options: PipelineOptions,
    data_input: str,
    reference_view_input: str,
    static_reference_input: str,
    output: str,
    calculation_month_count: int,
    metric_types: List[str],
    state_code: str,
    calculation_end_month: Optional[str],
    person_filter_ids: Optional[List[int]],
):
    """Runs the program calculation pipeline."""

    # Workaround to load SQLAlchemy objects at start of pipeline. This is
    # necessary because the BuildRootEntity function tries to access attributes
    # of relationship properties on the SQLAlchemy root schema class before
    # they have been loaded. However, if *any* SQLAlchemy objects have been
    # instantiated, then the relationship properties are loaded and their
    # attributes can be successfully accessed.
    _ = schema.StatePerson()

    apache_beam_pipeline_options.view_as(SetupOptions).save_main_session = True

    # Get pipeline job details
    all_pipeline_options = apache_beam_pipeline_options.get_all_options()
    project_id = all_pipeline_options["project"]

    if project_id is None:
        raise ValueError(f"No project set in pipeline options: {all_pipeline_options}")

    if state_code is None:
        raise ValueError("No state_code set for pipeline")

    input_dataset = project_id + "." + data_input
    reference_dataset = project_id + "." + reference_view_input
    static_reference_dataset = project_id + "." + static_reference_input

    person_id_filter_set = set(person_filter_ids) if person_filter_ids else None

    with beam.Pipeline(options=apache_beam_pipeline_options) as p:
        # Get StatePersons
        persons = p | "Load Persons" >> BuildRootEntity(
            dataset=input_dataset,
            root_entity_class=entities.StatePerson,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=True,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code,
        )

        # Get StateProgramAssignments
        program_assignments = p | "Load Program Assignments" >> BuildRootEntity(
            dataset=input_dataset,
            root_entity_class=entities.StateProgramAssignment,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=True,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code,
        )

        # Get StateAssessments
        assessments = p | "Load Assessments" >> BuildRootEntity(
            dataset=input_dataset,
            root_entity_class=entities.StateAssessment,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=False,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code,
        )

        # Get StateSupervisionPeriods
        supervision_periods = p | "Load SupervisionPeriods" >> BuildRootEntity(
            dataset=input_dataset,
            root_entity_class=entities.StateSupervisionPeriod,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=False,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code,
        )

        supervision_period_to_agent_associations_as_kv = (
            p
            | "Load supervision_period_to_agent_associations_as_kv"
            >> ImportTableAsKVTuples(
                dataset_id=reference_dataset,
                table_id=SUPERVISION_PERIOD_TO_AGENT_ASSOCIATION_VIEW_NAME,
                table_key="person_id",
                state_code_filter=state_code,
                person_id_filter_set=person_id_filter_set,
            )
        )

        state_race_ethnicity_population_counts = (
            p
            | "Load state_race_ethnicity_population_counts"
            >> ImportTable(
                dataset_id=static_reference_dataset,
                table_id="state_race_ethnicity_population_counts",
                state_code_filter=state_code,
                person_id_filter_set=None,
            )
        )

        # Group each StatePerson with their other entities
        persons_entities = {
            "person": persons,
            "program_assignments": program_assignments,
            "assessments": assessments,
            "supervision_periods": supervision_periods,
            "supervision_period_to_agent_association": supervision_period_to_agent_associations_as_kv,
        } | "Group StatePerson to StateProgramAssignments and related entities" >> beam.CoGroupByKey()

        # Identify ProgramEvents from the StatePerson's StateProgramAssignments
        person_program_events = persons_entities | beam.ParDo(
            ClassifyProgramAssignments()
        )

        person_metadata = (
            persons
            | "Build the person_metadata dictionary"
            >> beam.ParDo(
                BuildPersonMetadata(), AsList(state_race_ethnicity_population_counts)
            )
        )

        person_program_events_with_metadata = (
            {"person_events": person_program_events, "person_metadata": person_metadata}
            | "Group ProgramEvents with person-level metadata" >> beam.CoGroupByKey()
            | "Organize StatePerson, PersonMetadata and ProgramEvents for calculations"
            >> beam.ParDo(ExtractPersonEventsMetadata())
        )

        # Get pipeline job details for accessing job_id
        all_pipeline_options = apache_beam_pipeline_options.get_all_options()

        # Add timestamp for local jobs
        job_timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H_%M_%S.%f")
        all_pipeline_options["job_timestamp"] = job_timestamp

        # Get the type of metric to calculate
        metric_types_set = set(metric_types)

        # Get program metrics
        program_metrics = (
            person_program_events_with_metadata
            | "Get Program Metrics"
            >> GetProgramMetrics(
                pipeline_options=all_pipeline_options,
                metric_types=metric_types_set,
                calculation_end_month=calculation_end_month,
                calculation_month_count=calculation_month_count,
            )
        )

        if person_id_filter_set:
            logging.warning(
                "Non-empty person filter set - returning before writing metrics."
            )
            return

        # Convert the metrics into a format that's writable to BQ
        writable_metrics = (
            program_metrics
            | "Convert to dict to be written to BQ"
            >> beam.ParDo(RecidivizMetricWritableDict()).with_outputs(
                ProgramMetricType.PROGRAM_PARTICIPATION.value,
                ProgramMetricType.PROGRAM_REFERRAL.value,
            )
        )

        # Write the metrics to the output tables in BigQuery
        referrals_table_id = DATAFLOW_METRICS_TO_TABLES[ProgramReferralMetric]
        participation_table_id = DATAFLOW_METRICS_TO_TABLES[ProgramParticipationMetric]

        _ = (
            writable_metrics.PROGRAM_REFERRAL
            | f"Write referral metrics to BQ table: {referrals_table_id}"
            >> WriteAppendToBigQuery(
                output_table=referrals_table_id,
                output_dataset=output,
            )
        )

        _ = (
            writable_metrics.PROGRAM_PARTICIPATION
            | f"Write participation metrics to BQ table: {participation_table_id}"
            >> WriteAppendToBigQuery(
                output_table=participation_table_id,
                output_dataset=output,
            )
        )
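
# Note: the write steps above go through WriteAppendToBigQuery, while the
# supervision pipeline earlier in this section spells out the same
# beam.io.WriteToBigQuery arguments inline. Assuming the wrapper simply fixes
# the dispositions shown in those inline calls, a sketch could look like this
# (hypothetical re-implementation for reference, not the canonical one):
class _WriteAppendToBigQuerySketch(beam.PTransform):
    """Appends rows to an existing BigQuery table, never creating the table."""

    def __init__(self, output_table, output_dataset):
        super().__init__()
        self._output_table = output_table
        self._output_dataset = output_dataset

    def expand(self, pcoll):
        return pcoll | beam.io.WriteToBigQuery(
            table=self._output_table,
            dataset=self._output_dataset,
            create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
            write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
            method=beam.io.WriteToBigQuery.Method.FILE_LOADS)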
def run(apache_beam_pipeline_options: PipelineOptions,
        data_input: str,
        reference_view_input: str,
        static_reference_input: str,
        output: str,
        metric_types: List[str],
        state_code: Optional[str],
        person_filter_ids: Optional[List[int]]):
    """Runs the recidivism calculation pipeline."""

    # Workaround to load SQLAlchemy objects at start of pipeline. This is
    # necessary because the BuildRootEntity function tries to access attributes
    # of relationship properties on the SQLAlchemy root schema class before
    # they have been loaded. However, if *any* SQLAlchemy objects have been
    # instantiated, then the relationship properties are loaded and their
    # attributes can be successfully accessed.
    _ = schema.StatePerson()

    apache_beam_pipeline_options.view_as(SetupOptions).save_main_session = True

    # Get pipeline job details
    all_pipeline_options = apache_beam_pipeline_options.get_all_options()
    project_id = all_pipeline_options['project']

    input_dataset = project_id + '.' + data_input
    reference_dataset = project_id + '.' + reference_view_input
    static_reference_dataset = project_id + '.' + static_reference_input

    person_id_filter_set = set(person_filter_ids) if person_filter_ids else None

    with beam.Pipeline(options=apache_beam_pipeline_options) as p:
        # Get StatePersons
        persons = (p | 'Load Persons' >> BuildRootEntity(
            dataset=input_dataset,
            root_entity_class=entities.StatePerson,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=True,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code))

        # Get StateIncarcerationPeriods
        incarceration_periods = (p | 'Load IncarcerationPeriods' >> BuildRootEntity(
            dataset=input_dataset,
            root_entity_class=entities.StateIncarcerationPeriod,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=True,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code))

        # Get StateSupervisionViolations
        supervision_violations = (
            p | 'Load SupervisionViolations' >> BuildRootEntity(
                dataset=input_dataset,
                root_entity_class=entities.StateSupervisionViolation,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=True,
                unifying_id_field_filter_set=person_id_filter_set,
                state_code=state_code))

        # TODO(#2769): Don't bring this in as a root entity
        # Get StateSupervisionViolationResponses
        supervision_violation_responses = (
            p | 'Load SupervisionViolationResponses' >> BuildRootEntity(
                dataset=input_dataset,
                root_entity_class=entities.StateSupervisionViolationResponse,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=True,
                unifying_id_field_filter_set=person_id_filter_set,
                state_code=state_code))

        # Group StateSupervisionViolationResponses and
        # StateSupervisionViolations by person_id
        supervision_violations_and_responses = (
            {'violations': supervision_violations,
             'violation_responses': supervision_violation_responses}
            | 'Group StateSupervisionViolationResponses to '
              'StateSupervisionViolations' >> beam.CoGroupByKey())

        # Set the fully hydrated StateSupervisionViolation entities on
        # the corresponding StateSupervisionViolationResponses
        violation_responses_with_hydrated_violations = (
            supervision_violations_and_responses
            | 'Set hydrated StateSupervisionViolations on '
              'the StateSupervisionViolationResponses' >>
            beam.ParDo(SetViolationOnViolationsResponse()))

        # Group StateIncarcerationPeriods and StateSupervisionViolationResponses
        # by person_id
        incarceration_periods_and_violation_responses = (
            {'incarceration_periods': incarceration_periods,
             'violation_responses': violation_responses_with_hydrated_violations}
            | 'Group StateIncarcerationPeriods to '
              'StateSupervisionViolationResponses' >> beam.CoGroupByKey())

        # Set the fully hydrated StateSupervisionViolationResponse entities on
        # the corresponding StateIncarcerationPeriods
        incarceration_periods_with_source_violations = (
            incarceration_periods_and_violation_responses
            | 'Set hydrated StateSupervisionViolationResponses on '
              'the StateIncarcerationPeriods' >>
            beam.ParDo(SetViolationResponseOnIncarcerationPeriod()))

        # Group each StatePerson with their StateIncarcerationPeriods
        person_and_incarceration_periods = (
            {'person': persons,
             'incarceration_periods': incarceration_periods_with_source_violations}
            | 'Group StatePerson to StateIncarcerationPeriods' >>
            beam.CoGroupByKey())

        # Bring in the table that associates people and their county of residence
        person_id_to_county_kv = (
            p | 'Load person_id_to_county_kv' >> ImportTableAsKVTuples(
                dataset_id=reference_dataset,
                table_id=PERSONS_TO_RECENT_COUNTY_OF_RESIDENCE_VIEW_NAME,
                table_key='person_id',
                state_code_filter=state_code,
                person_id_filter_set=person_id_filter_set))

        state_race_ethnicity_population_counts = (
            p | 'Load state_race_ethnicity_population_counts' >> ImportTable(
                dataset_id=static_reference_dataset,
                table_id='state_race_ethnicity_population_counts',
                state_code_filter=state_code,
                person_id_filter_set=None))

        # Identify ReleaseEvents from the StatePerson's StateIncarcerationPeriods
        person_release_events = (
            person_and_incarceration_periods
            | "ClassifyReleaseEvents" >> beam.ParDo(
                ClassifyReleaseEvents(), AsDict(person_id_to_county_kv)))

        person_metadata = (
            persons
            | "Build the person_metadata dictionary" >> beam.ParDo(
                BuildPersonMetadata(),
                AsList(state_race_ethnicity_population_counts)))

        person_release_events_with_metadata = (
            {'person_events': person_release_events,
             'person_metadata': person_metadata}
            | 'Group ReleaseEvents with person-level metadata' >>
            beam.CoGroupByKey()
            | 'Organize StatePerson, PersonMetadata and ReleaseEvents for calculations' >>
            beam.ParDo(ExtractPersonReleaseEventsMetadata()))

        # Get pipeline job details for accessing job_id
        all_pipeline_options = apache_beam_pipeline_options.get_all_options()

        # Add timestamp for local jobs
        job_timestamp = datetime.datetime.now().strftime('%Y-%m-%d_%H_%M_%S.%f')
        all_pipeline_options['job_timestamp'] = job_timestamp

        # Get the type of metric to calculate
        metric_types_set = set(metric_types)

        # Get recidivism metrics
        recidivism_metrics = (
            person_release_events_with_metadata
            | 'Get Recidivism Metrics' >> GetRecidivismMetrics(
                pipeline_options=all_pipeline_options,
                metric_types=metric_types_set))

        if person_id_filter_set:
            logging.warning(
                "Non-empty person filter set - returning before writing metrics.")
            return

        # Convert the metrics into a format that's writable to BQ
        writable_metrics = (
            recidivism_metrics
            | 'Convert to dict to be written to BQ' >>
            beam.ParDo(RecidivizMetricWritableDict()).with_outputs(
                ReincarcerationRecidivismMetricType.REINCARCERATION_RATE.value,
                ReincarcerationRecidivismMetricType.REINCARCERATION_COUNT.value))

        # Write the recidivism metrics to the output tables in BigQuery
        rates_table_id = DATAFLOW_METRICS_TO_TABLES.get(
            ReincarcerationRecidivismRateMetric)
        counts_table_id = DATAFLOW_METRICS_TO_TABLES.get(
            ReincarcerationRecidivismCountMetric)

        _ = (writable_metrics.REINCARCERATION_RATE
             | f"Write rate metrics to BQ table: {rates_table_id}" >>
             beam.io.WriteToBigQuery(
                 table=rates_table_id,
                 dataset=output,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
                 method=beam.io.WriteToBigQuery.Method.FILE_LOADS))

        _ = (writable_metrics.REINCARCERATION_COUNT
             | f"Write count metrics to BQ table: {counts_table_id}" >>
             beam.io.WriteToBigQuery(
                 table=counts_table_id,
                 dataset=output,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
                 method=beam.io.WriteToBigQuery.Method.FILE_LOADS))
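
# Example of how the recidivism run() above might be invoked locally, assuming
# a DirectRunner and placeholder project/dataset names (every argument value
# here is illustrative, not a real resource):
#
#   options = PipelineOptions(['--project=my-project', '--runner=DirectRunner'])
#   run(apache_beam_pipeline_options=options,
#       data_input='state',
#       reference_view_input='reference_views',
#       static_reference_input='static_reference_tables',
#       output='dataflow_metrics',
#       metric_types=['ALL'],
#       state_code='US_XX',
#       person_filter_ids=None)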
def run_test_pipeline(self,
                      dataset: str,
                      fake_supervision_period_id: int,
                      unifying_id_field_filter_set: Optional[Set[int]] = None,
                      metric_types_filter: Optional[Set[str]] = None):
    """Runs a test version of the program pipeline."""
    test_pipeline = TestPipeline()

    # Get StatePersons
    persons = (
        test_pipeline
        | 'Load Persons' >>  # type: ignore
        extractor_utils.BuildRootEntity(
            dataset=dataset,
            root_entity_class=entities.StatePerson,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=True))

    # Get StateProgramAssignments
    program_assignments = (
        test_pipeline
        | 'Load Program Assignments' >>  # type: ignore
        extractor_utils.BuildRootEntity(
            dataset=dataset,
            root_entity_class=entities.StateProgramAssignment,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=True,
            unifying_id_field_filter_set=unifying_id_field_filter_set))

    # Get StateAssessments
    assessments = (
        test_pipeline
        | 'Load Assessments' >>  # type: ignore
        extractor_utils.BuildRootEntity(
            dataset=dataset,
            root_entity_class=entities.StateAssessment,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=False,
            unifying_id_field_filter_set=unifying_id_field_filter_set))

    # Get StateSupervisionPeriods
    supervision_periods = (
        test_pipeline
        | 'Load SupervisionPeriods' >>  # type: ignore
        extractor_utils.BuildRootEntity(
            dataset=dataset,
            root_entity_class=entities.StateSupervisionPeriod,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=False,
            unifying_id_field_filter_set=unifying_id_field_filter_set))

    supervision_period_to_agent_map = {
        'agent_id': 1010,
        'agent_external_id': 'OFFICER0009',
        'district_external_id': '10',
        'supervision_period_id': fake_supervision_period_id
    }

    supervision_period_to_agent_associations = (
        test_pipeline
        | 'Create SupervisionPeriod to Agent table' >>
        beam.Create([supervision_period_to_agent_map]))

    supervision_period_to_agent_associations_as_kv = (
        supervision_period_to_agent_associations
        | 'Convert SupervisionPeriod to Agent table to KV tuples' >>
        beam.ParDo(ConvertDictToKVTuple(), 'supervision_period_id'))

    state_race_ethnicity_population_count = {
        'state_code': 'US_XX',
        'race_or_ethnicity': 'BLACK',
        'population_count': 1,
        'representation_priority': 1
    }

    state_race_ethnicity_population_counts = (
        test_pipeline
        | 'Create state_race_ethnicity_population_count table' >>
        beam.Create([state_race_ethnicity_population_count]))

    # Group each StatePerson with their other entities
    persons_entities = (
        {'person': persons,
         'program_assignments': program_assignments,
         'assessments': assessments,
         'supervision_periods': supervision_periods}
        | 'Group StatePerson to StateProgramAssignments and related entities' >>
        beam.CoGroupByKey())

    # Identify ProgramEvents from the StatePerson's StateProgramAssignments
    person_program_events = (
        persons_entities
        | beam.ParDo(
            pipeline.ClassifyProgramAssignments(),
            AsDict(supervision_period_to_agent_associations_as_kv)))

    person_metadata = (
        persons
        | "Build the person_metadata dictionary" >> beam.ParDo(
            BuildPersonMetadata(),
            AsList(state_race_ethnicity_population_counts)))

    person_program_events_with_metadata = (
        {'person_events': person_program_events,
         'person_metadata': person_metadata}
        | 'Group ProgramEvents with person-level metadata' >>
        beam.CoGroupByKey()
        | 'Organize StatePerson, PersonMetadata and ProgramEvents for calculations' >>
        beam.ParDo(ExtractPersonEventsMetadata()))

    # Get pipeline job details for accessing job_id
    all_pipeline_options = PipelineOptions().get_all_options()

    # Add timestamp for local jobs
    job_timestamp = datetime.datetime.now().strftime('%Y-%m-%d_%H_%M_%S.%f')
    all_pipeline_options['job_timestamp'] = job_timestamp

    metric_types = metric_types_filter if metric_types_filter else {'ALL'}

    # Get program metrics
    program_metrics = (
        person_program_events_with_metadata
        | 'Get Program Metrics' >>  # type: ignore
        pipeline.GetProgramMetrics(
            pipeline_options=all_pipeline_options,
            metric_types=metric_types,
            calculation_end_month=None,
            calculation_month_count=-1))

    assert_that(program_metrics, AssertMatchers.validate_pipeline_test())

    test_pipeline.run()
def execute_pipeline(
    self,
    pipeline: beam.Pipeline,
    all_pipeline_options: Dict[str, Any],
    state_code: str,
    input_dataset: str,
    reference_dataset: str,
    static_reference_dataset: str,
    metric_types: List[str],
    person_id_filter_set: Optional[Set[int]],
    calculation_month_count: int = -1,
    calculation_end_month: Optional[str] = None,
) -> beam.Pipeline:
    persons = pipeline | "Load Persons" >> BuildRootEntity(
        dataset=input_dataset,
        root_entity_class=entities.StatePerson,
        unifying_id_field=entities.StatePerson.get_class_id_name(),
        build_related_entities=True,
        unifying_id_field_filter_set=person_id_filter_set,
        state_code=state_code,
    )

    # Get StateIncarcerationPeriods
    incarceration_periods = (
        pipeline
        | "Load IncarcerationPeriods"
        >> BuildRootEntity(
            dataset=input_dataset,
            root_entity_class=entities.StateIncarcerationPeriod,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=False,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code,
        )
    )

    # Get StateSupervisionPeriods
    supervision_periods = pipeline | "Load SupervisionPeriods" >> BuildRootEntity(
        dataset=input_dataset,
        root_entity_class=entities.StateSupervisionPeriod,
        unifying_id_field=entities.StatePerson.get_class_id_name(),
        build_related_entities=False,
        unifying_id_field_filter_set=person_id_filter_set,
        state_code=state_code,
    )

    # Bring in the table that associates people and their county of residence
    person_id_to_county_kv = (
        pipeline
        | "Load person_id_to_county_kv"
        >> ImportTableAsKVTuples(
            dataset_id=reference_dataset,
            table_id=PERSONS_TO_RECENT_COUNTY_OF_RESIDENCE_VIEW_NAME,
            table_key="person_id",
            state_code_filter=state_code,
            person_id_filter_set=person_id_filter_set,
        )
    )

    # Group each StatePerson with their StateIncarcerationPeriods
    person_entities = {
        "person": persons,
        "incarceration_periods": incarceration_periods,
        "supervision_periods": supervision_periods,
        "persons_to_recent_county_of_residence": person_id_to_county_kv,
    } | "Group StatePerson to StateIncarcerationPeriods" >> beam.CoGroupByKey()

    state_race_ethnicity_population_counts = (
        pipeline
        | "Load state_race_ethnicity_population_counts"
        >> ImportTable(
            dataset_id=static_reference_dataset,
            table_id="state_race_ethnicity_population_counts",
            state_code_filter=state_code,
            person_id_filter_set=None,
        )
    )

    # Identify ReleaseEvents from the StatePerson's StateIncarcerationPeriods
    person_release_events = person_entities | "ClassifyReleaseEvents" >> beam.ParDo(
        ClassifyEvents(), identifier=self.pipeline_config.identifier
    )

    person_metadata = (
        persons
        | "Build the person_metadata dictionary"
        >> beam.ParDo(
            BuildPersonMetadata(),
            state_race_ethnicity_population_counts=AsList(
                state_race_ethnicity_population_counts
            ),
        )
    )

    person_release_events_with_metadata = (
        {
            "person_events": person_release_events,
            "person_metadata": person_metadata,
        }
        | "Group ReleaseEvents with person-level metadata" >> beam.CoGroupByKey()
        | "Organize StatePerson, PersonMetadata and ReleaseEvents for calculations"
        >> beam.ParDo(ExtractPersonReleaseEventsMetadata())
    )

    # Add timestamp for local jobs
    job_timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H_%M_%S.%f")
    all_pipeline_options["job_timestamp"] = job_timestamp

    # Get the type of metric to calculate
    metric_types_set = set(metric_types)

    # Get recidivism metrics
    recidivism_metrics = (
        person_release_events_with_metadata
        | "Get Recidivism Metrics"
        >> GetMetrics(
            pipeline_options=all_pipeline_options,
            pipeline_config=self.pipeline_config,
            metric_types_to_include=metric_types_set,
            calculation_end_month=calculation_end_month,
            calculation_month_count=calculation_month_count,
        )
    )

    return recidivism_metrics
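
# A minimal sketch of how a class exposing execute_pipeline() above might be
# driven in a test, assuming the object already carries its pipeline_config
# (the delegate name and dataset strings here are illustrative):
#
#   test_pipeline = TestPipeline()
#   metrics = run_delegate.execute_pipeline(
#       pipeline=test_pipeline,
#       all_pipeline_options=PipelineOptions().get_all_options(),
#       state_code='US_XX',
#       input_dataset='project.state',
#       reference_dataset='project.reference_views',
#       static_reference_dataset='project.static_reference_tables',
#       metric_types=['ALL'],
#       person_id_filter_set=None)
#   test_pipeline.run()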
def run_test_pipeline(
        fake_person_id: int,
        state_code: str,
        dataset: str,
        expected_metric_types: Set[IncarcerationMetricType],
        allow_empty: bool = False,
        unifying_id_field_filter_set: Optional[Set[int]] = None,
        metric_types_filter: Optional[Set[str]] = None):
    """Runs a test version of the incarceration pipeline."""
    test_pipeline = TestPipeline()

    # Get StatePersons
    persons = (
        test_pipeline
        | 'Load Persons' >>  # type: ignore
        extractor_utils.BuildRootEntity(
            dataset=dataset,
            root_entity_class=entities.StatePerson,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=True))

    # Get StateSentenceGroups
    sentence_groups = (
        test_pipeline
        | 'Load StateSentenceGroups' >>  # type: ignore
        extractor_utils.BuildRootEntity(
            dataset=dataset,
            root_entity_class=entities.StateSentenceGroup,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=True,
            unifying_id_field_filter_set=unifying_id_field_filter_set))

    # Get StateIncarcerationSentences
    incarceration_sentences = (
        test_pipeline
        | 'Load StateIncarcerationSentences' >>  # type: ignore
        extractor_utils.BuildRootEntity(
            dataset=dataset,
            root_entity_class=entities.StateIncarcerationSentence,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=True,
            unifying_id_field_filter_set=unifying_id_field_filter_set))

    # Get StateSupervisionSentences
    supervision_sentences = (
        test_pipeline
        | 'Load StateSupervisionSentences' >>  # type: ignore
        extractor_utils.BuildRootEntity(
            dataset=dataset,
            root_entity_class=entities.StateSupervisionSentence,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=True,
            unifying_id_field_filter_set=unifying_id_field_filter_set))

    us_mo_sentence_status_rows: List[Dict[str, Any]] = [{
        'person_id': fake_person_id,
        'sentence_external_id': 'XXX',
        'sentence_status_external_id': 'YYY',
        'status_code': 'ZZZ',
        'status_date': 'not_a_date',
        'status_description': 'XYZ'
    }]

    us_mo_sentence_statuses = (
        test_pipeline
        | 'Create MO sentence statuses' >>
        beam.Create(us_mo_sentence_status_rows))

    us_mo_sentence_status_rankings_as_kv = (
        us_mo_sentence_statuses
        | 'Convert sentence status ranking table to KV tuples' >>
        beam.ParDo(ConvertDictToKVTuple(), 'person_id'))

    sentences_and_statuses = (
        {'incarceration_sentences': incarceration_sentences,
         'supervision_sentences': supervision_sentences,
         'sentence_statuses': us_mo_sentence_status_rankings_as_kv}
        | 'Group sentences to the sentence statuses for that person' >>
        beam.CoGroupByKey())

    sentences_converted = (
        sentences_and_statuses
        | 'Convert to state-specific sentences' >> beam.ParDo(
            ConvertSentencesToStateSpecificType()).with_outputs(
                'incarceration_sentences', 'supervision_sentences'))

    sentences_and_sentence_groups = (
        {'sentence_groups': sentence_groups,
         'incarceration_sentences': sentences_converted.incarceration_sentences,
         'supervision_sentences': sentences_converted.supervision_sentences}
        | 'Group sentences to sentence groups' >> beam.CoGroupByKey())

    sentence_groups_with_hydrated_sentences = (
        sentences_and_sentence_groups
        | 'Set hydrated sentences on sentence groups' >> beam.ParDo(
            SetSentencesOnSentenceGroup()))

    # Bring in the associations between people and their county of residence
    fake_person_id_to_county_query_result = [{
        'person_id': fake_person_id,
        'county_of_residence': _COUNTY_OF_RESIDENCE
    }]

    person_id_to_county_kv = (
        test_pipeline
        | "Read person id to county associations from BigQuery" >>
        beam.Create(fake_person_id_to_county_query_result)
        | "Convert person_id to counties to KV" >> beam.ParDo(
            ConvertDictToKVTuple(), 'person_id'))

    incarceration_period_judicial_district_association_row = {
        'person_id': fake_person_id,
        'incarceration_period_id': 123,
        'judicial_district_code': 'NW'
    }

    ip_to_judicial_district_kv = (
        test_pipeline
        | "Read incarceration_period to judicial_district associations from BigQuery" >>
        beam.Create([incarceration_period_judicial_district_association_row])
        | "Convert ips to judicial districts to KV" >> beam.ParDo(
            ConvertDictToKVTuple(), 'person_id'))

    state_race_ethnicity_population_count = {
        'state_code': state_code,
        'race_or_ethnicity': 'BLACK',
        'population_count': 1,
        'representation_priority': 1
    }

    state_race_ethnicity_population_counts = (
        test_pipeline
        | 'Create state_race_ethnicity_population_count table' >>
        beam.Create([state_race_ethnicity_population_count]))

    # Group each StatePerson with their related entities
    person_entities = (
        {'person': persons,
         'sentence_groups': sentence_groups_with_hydrated_sentences,
         'incarceration_period_judicial_district_association':
             ip_to_judicial_district_kv}
        | 'Group StatePerson to SentenceGroups' >> beam.CoGroupByKey())

    # Identify IncarcerationEvents from the StatePerson's
    # StateIncarcerationPeriods
    person_incarceration_events = (
        person_entities
        | 'Classify Incarceration Events' >> beam.ParDo(
            pipeline.ClassifyIncarcerationEvents(),
            AsDict(person_id_to_county_kv)))

    person_metadata = (
        persons
        | "Build the person_metadata dictionary" >> beam.ParDo(
            BuildPersonMetadata(),
            AsList(state_race_ethnicity_population_counts)))

    person_incarceration_events_with_metadata = (
        {'person_events': person_incarceration_events,
         'person_metadata': person_metadata}
        | 'Group IncarcerationEvents with person-level metadata' >>
        beam.CoGroupByKey()
        | 'Organize StatePerson, PersonMetadata and IncarcerationEvents for calculations' >>
        beam.ParDo(ExtractPersonEventsMetadata()))

    # Get pipeline job details for accessing job_id
    all_pipeline_options = PipelineOptions().get_all_options()

    # Add timestamp for local jobs
    job_timestamp = datetime.datetime.now().strftime('%Y-%m-%d_%H_%M_%S.%f')
    all_pipeline_options['job_timestamp'] = job_timestamp

    metric_types = metric_types_filter if metric_types_filter else {'ALL'}

    # Get IncarcerationMetrics
    incarceration_metrics = (
        person_incarceration_events_with_metadata
        | 'Get Incarceration Metrics' >>  # type: ignore
        pipeline.GetIncarcerationMetrics(
            pipeline_options=all_pipeline_options,
            metric_types=metric_types,
            calculation_end_month=None,
            calculation_month_count=-1))

    assert_that(
        incarceration_metrics,
        AssertMatchers.validate_metric_type(allow_empty=allow_empty),
        'Assert that all metrics are of the expected type.')

    assert_that(
        incarceration_metrics,
        AssertMatchers.validate_pipeline_test(expected_metric_types),
        'Assert that the types of metrics produced are expected.')

    test_pipeline.run()
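
# AssertMatchers.validate_metric_type and validate_pipeline_test above are test
# helpers internal to this codebase. A Beam matcher of this shape is just a
# callable over the materialized output; a sketch, under the assumption that it
# verifies every element is an IncarcerationMetric, might be:
#
#   from apache_beam.testing.util import BeamAssertException
#
#   def _validate_metric_type_sketch(allow_empty=False):
#       def _check(output):
#           if not allow_empty and not output:
#               raise BeamAssertException('Expected at least one metric')
#           for metric in output:
#               if not isinstance(metric, IncarcerationMetric):
#                   raise BeamAssertException(f'Unexpected type: {type(metric)}')
#       return _check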
def execute_pipeline(
    self,
    pipeline: beam.Pipeline,
    all_pipeline_options: Dict[str, Any],
    state_code: str,
    input_dataset: str,
    _reference_dataset: str,
    static_reference_dataset: str,
    metric_types: List[str],
    person_id_filter_set: Optional[Set[int]],
    calculation_month_count: int = -1,
    calculation_end_month: Optional[str] = None,
) -> beam.Pipeline:
    # Get StatePersons
    persons = pipeline | "Load Persons" >> BuildRootEntity(
        dataset=input_dataset,
        root_entity_class=entities.StatePerson,
        unifying_id_field=entities.StatePerson.get_class_id_name(),
        build_related_entities=True,
        unifying_id_field_filter_set=person_id_filter_set,
        state_code=state_code,
    )

    # Get StateSupervisionViolations
    supervision_violations = (
        pipeline
        | "Load SupervisionViolations"
        >> BuildRootEntity(
            dataset=input_dataset,
            root_entity_class=entities.StateSupervisionViolation,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=True,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code,
        )
    )

    # Get StateSupervisionViolationResponses
    supervision_violation_responses = (
        pipeline
        | "Load SupervisionViolationResponses"
        >> BuildRootEntity(
            dataset=input_dataset,
            root_entity_class=entities.StateSupervisionViolationResponse,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=True,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code,
        )
    )

    state_race_ethnicity_population_counts = (
        pipeline
        | "Load state_race_ethnicity_population_counts"
        >> ImportTable(
            dataset_id=static_reference_dataset,
            table_id="state_race_ethnicity_population_counts",
            state_code_filter=state_code,
            person_id_filter_set=None,
        )
    )

    # Group StateSupervisionViolationResponses and StateSupervisionViolations
    # by person_id
    supervision_violations_and_responses = (
        {
            "violations": supervision_violations,
            "violation_responses": supervision_violation_responses,
        }
        | "Group StateSupervisionViolationResponses to StateSupervisionViolations"
        >> beam.CoGroupByKey()
    )

    violations_with_hydrated_violation_responses = (
        supervision_violations_and_responses
        | "Set hydrated StateSupervisionViolationResponses on the StateSupervisionViolations"
        >> beam.ParDo(SetViolationResponsesOntoViolations())
    )

    person_entities = {
        "person": persons,
        "violations": violations_with_hydrated_violation_responses,
    } | "Group StatePerson to violation entities" >> beam.CoGroupByKey()

    person_violation_events = person_entities | "Get ViolationEvents" >> beam.ParDo(
        ClassifyEvents(), identifier=self.pipeline_config.identifier
    )

    person_metadata = (
        persons
        | "Build the person_metadata dictionary"
        >> beam.ParDo(
            BuildPersonMetadata(),
            state_race_ethnicity_population_counts=AsList(
                state_race_ethnicity_population_counts
            ),
        )
    )

    person_violation_events_with_metadata = (
        {
            "person_events": person_violation_events,
            "person_metadata": person_metadata,
        }
        | "Group ViolationEvents with person-level metadata" >> beam.CoGroupByKey()
        | "Organize StatePerson, PersonMetadata and ViolationEvents for calculations"
        >> beam.ParDo(ExtractPersonEventsMetadata())
    )

    metric_types_set = set(metric_types)

    job_timestamp = datetime.now().strftime("%Y-%m-%d_%H_%M_%S.%f")
    all_pipeline_options["job_timestamp"] = job_timestamp

    # Get violation metrics
    violation_metrics = (
        person_violation_events_with_metadata
        | "Get Violation Metrics"
        >> GetMetrics(
            pipeline_options=all_pipeline_options,
            pipeline_config=self.pipeline_config,
            metric_types_to_include=metric_types_set,
            calculation_end_month=calculation_end_month,
            calculation_month_count=calculation_month_count,
        )
    )

    return violation_metrics
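
# ClassifyEvents above is shared by the v2 pipelines; the pipeline-specific
# logic lives in the identifier passed in as a side argument. A sketch of that
# delegation, assuming the identifier exposes a find_events(person, kwargs)
# method (a hypothetical signature, shown only to illustrate the pattern):
#
#   class _ClassifyEventsSketch(beam.DoFn):
#       def process(self, element, identifier):
#           _person_id, grouped_entities = element
#           person = grouped_entities['person'][0]
#           events = identifier.find_events(person, grouped_entities)
#           if events:
#               yield person, events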
def run(
    apache_beam_pipeline_options: PipelineOptions,
    data_input: str,
    reference_view_input: str,
    static_reference_input: str,
    output: str,
    metric_types: List[str],
    state_code: str,
    person_filter_ids: Optional[List[int]],
):
    """Runs the recidivism calculation pipeline."""

    # Workaround to load SQLAlchemy objects at start of pipeline. This is
    # necessary because the BuildRootEntity function tries to access attributes
    # of relationship properties on the SQLAlchemy root schema class before
    # they have been loaded. However, if *any* SQLAlchemy objects have been
    # instantiated, then the relationship properties are loaded and their
    # attributes can be successfully accessed.
    _ = schema.StatePerson()

    apache_beam_pipeline_options.view_as(SetupOptions).save_main_session = True

    # Get pipeline job details
    all_pipeline_options = apache_beam_pipeline_options.get_all_options()
    project_id = all_pipeline_options["project"]

    if project_id is None:
        raise ValueError(f"No project set in pipeline options: {all_pipeline_options}")

    if state_code is None:
        raise ValueError("No state_code set for pipeline")

    input_dataset = project_id + "." + data_input
    reference_dataset = project_id + "." + reference_view_input
    static_reference_dataset = project_id + "." + static_reference_input

    person_id_filter_set = set(person_filter_ids) if person_filter_ids else None

    with beam.Pipeline(options=apache_beam_pipeline_options) as p:
        # Get StatePersons
        persons = p | "Load Persons" >> BuildRootEntity(
            dataset=input_dataset,
            root_entity_class=entities.StatePerson,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=True,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code,
        )

        # Get StateIncarcerationPeriods
        incarceration_periods = p | "Load IncarcerationPeriods" >> BuildRootEntity(
            dataset=input_dataset,
            root_entity_class=entities.StateIncarcerationPeriod,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=True,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code,
        )

        # Get StateSupervisionViolations
        supervision_violations = p | "Load SupervisionViolations" >> BuildRootEntity(
            dataset=input_dataset,
            root_entity_class=entities.StateSupervisionViolation,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=True,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code,
        )

        # TODO(#2769): Don't bring this in as a root entity
        # Get StateSupervisionViolationResponses
        supervision_violation_responses = (
            p
            | "Load SupervisionViolationResponses"
            >> BuildRootEntity(
                dataset=input_dataset,
                root_entity_class=entities.StateSupervisionViolationResponse,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=True,
                unifying_id_field_filter_set=person_id_filter_set,
                state_code=state_code,
            )
        )

        # Group StateSupervisionViolationResponses and
        # StateSupervisionViolations by person_id
        supervision_violations_and_responses = (
            {
                "violations": supervision_violations,
                "violation_responses": supervision_violation_responses,
            }
            | "Group StateSupervisionViolationResponses to "
            "StateSupervisionViolations" >> beam.CoGroupByKey()
        )

        # Set the fully hydrated StateSupervisionViolation entities on
        # the corresponding StateSupervisionViolationResponses
        violation_responses_with_hydrated_violations = (
            supervision_violations_and_responses
            | "Set hydrated StateSupervisionViolations on "
            "the StateSupervisionViolationResponses"
            >> beam.ParDo(SetViolationOnViolationsResponse())
        )

        # Group StateIncarcerationPeriods and StateSupervisionViolationResponses
        # by person_id
        incarceration_periods_and_violation_responses = (
            {
                "incarceration_periods": incarceration_periods,
                "violation_responses": violation_responses_with_hydrated_violations,
            }
            | "Group StateIncarcerationPeriods to "
            "StateSupervisionViolationResponses" >> beam.CoGroupByKey()
        )

        # Set the fully hydrated StateSupervisionViolationResponse entities on
        # the corresponding StateIncarcerationPeriods
        incarceration_periods_with_source_violations = (
            incarceration_periods_and_violation_responses
            | "Set hydrated StateSupervisionViolationResponses on "
            "the StateIncarcerationPeriods"
            >> beam.ParDo(SetViolationResponseOnIncarcerationPeriod())
        )

        # Bring in the table that associates people and their county of residence
        person_id_to_county_kv = (
            p
            | "Load person_id_to_county_kv"
            >> ImportTableAsKVTuples(
                dataset_id=reference_dataset,
                table_id=PERSONS_TO_RECENT_COUNTY_OF_RESIDENCE_VIEW_NAME,
                table_key="person_id",
                state_code_filter=state_code,
                person_id_filter_set=person_id_filter_set,
            )
        )

        # Group each StatePerson with their StateIncarcerationPeriods
        person_entities = {
            "person": persons,
            "incarceration_periods": incarceration_periods_with_source_violations,
            "persons_to_recent_county_of_residence": person_id_to_county_kv,
        } | "Group StatePerson to StateIncarcerationPeriods" >> beam.CoGroupByKey()

        state_race_ethnicity_population_counts = (
            p
            | "Load state_race_ethnicity_population_counts"
            >> ImportTable(
                dataset_id=static_reference_dataset,
                table_id="state_race_ethnicity_population_counts",
                state_code_filter=state_code,
                person_id_filter_set=None,
            )
        )

        # Identify ReleaseEvents from the StatePerson's StateIncarcerationPeriods
        person_release_events = person_entities | "ClassifyReleaseEvents" >> beam.ParDo(
            ClassifyReleaseEvents()
        )

        person_metadata = (
            persons
            | "Build the person_metadata dictionary"
            >> beam.ParDo(
                BuildPersonMetadata(), AsList(state_race_ethnicity_population_counts)
            )
        )

        person_release_events_with_metadata = (
            {"person_events": person_release_events, "person_metadata": person_metadata}
            | "Group ReleaseEvents with person-level metadata" >> beam.CoGroupByKey()
            | "Organize StatePerson, PersonMetadata and ReleaseEvents for calculations"
            >> beam.ParDo(ExtractPersonReleaseEventsMetadata())
        )

        # Get pipeline job details for accessing job_id
        all_pipeline_options = apache_beam_pipeline_options.get_all_options()

        # Add timestamp for local jobs
        job_timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H_%M_%S.%f")
        all_pipeline_options["job_timestamp"] = job_timestamp

        # Get the type of metric to calculate
        metric_types_set = set(metric_types)

        # Get recidivism metrics
        recidivism_metrics = (
            person_release_events_with_metadata
            | "Get Recidivism Metrics"
            >> GetRecidivismMetrics(
                pipeline_options=all_pipeline_options, metric_types=metric_types_set
            )
        )

        if person_id_filter_set:
            logging.warning(
                "Non-empty person filter set - returning before writing metrics."
            )
            return

        # Convert the metrics into a format that's writable to BQ
        writable_metrics = (
            recidivism_metrics
            | "Convert to dict to be written to BQ"
            >> beam.ParDo(RecidivizMetricWritableDict()).with_outputs(
                ReincarcerationRecidivismMetricType.REINCARCERATION_RATE.value,
                ReincarcerationRecidivismMetricType.REINCARCERATION_COUNT.value,
            )
        )

        # Write the recidivism metrics to the output tables in BigQuery
        rates_table_id = DATAFLOW_METRICS_TO_TABLES[ReincarcerationRecidivismRateMetric]
        counts_table_id = DATAFLOW_METRICS_TO_TABLES[
            ReincarcerationRecidivismCountMetric
        ]

        _ = (
            writable_metrics.REINCARCERATION_RATE
            | f"Write rate metrics to BQ table: {rates_table_id}"
            >> WriteAppendToBigQuery(
                output_table=rates_table_id,
                output_dataset=output,
            )
        )

        _ = (
            writable_metrics.REINCARCERATION_COUNT
            | f"Write count metrics to BQ table: {counts_table_id}"
            >> WriteAppendToBigQuery(
                output_table=counts_table_id,
                output_dataset=output,
            )
        )
def run(apache_beam_pipeline_options: PipelineOptions,
        data_input: str,
        reference_view_input: str,
        static_reference_input: str,
        output: str,
        calculation_month_count: int,
        metric_types: List[str],
        state_code: Optional[str],
        calculation_end_month: Optional[str],
        person_filter_ids: Optional[List[int]]):
    """Runs the incarceration calculation pipeline."""

    # Workaround to load SQLAlchemy objects at start of pipeline. This is necessary because the BuildRootEntity
    # function tries to access attributes of relationship properties on the SQLAlchemy room_schema_class before they
    # have been loaded. However, if *any* SQLAlchemy objects have been instantiated, then the relationship properties
    # are loaded and their attributes can be successfully accessed.
    _ = schema.StatePerson()

    apache_beam_pipeline_options.view_as(SetupOptions).save_main_session = True

    # Get pipeline job details
    all_pipeline_options = apache_beam_pipeline_options.get_all_options()
    project_id = all_pipeline_options['project']

    input_dataset = project_id + '.' + data_input
    reference_dataset = project_id + '.' + reference_view_input
    static_reference_dataset = project_id + '.' + static_reference_input

    person_id_filter_set = set(person_filter_ids) if person_filter_ids else None

    with beam.Pipeline(options=apache_beam_pipeline_options) as p:
        # Get StatePersons
        persons = (p | 'Load StatePersons' >> BuildRootEntity(
            dataset=input_dataset,
            root_entity_class=entities.StatePerson,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=True,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code))

        # Get StateSentenceGroups
        sentence_groups = (p | 'Load StateSentenceGroups' >> BuildRootEntity(
            dataset=input_dataset,
            root_entity_class=entities.StateSentenceGroup,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=True,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code))

        # Get StateIncarcerationSentences
        incarceration_sentences = (
            p | 'Load StateIncarcerationSentences' >> BuildRootEntity(
                dataset=input_dataset,
                root_entity_class=entities.StateIncarcerationSentence,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=True,
                unifying_id_field_filter_set=person_id_filter_set,
                state_code=state_code))

        # Get StateSupervisionSentences
        supervision_sentences = (
            p | 'Load StateSupervisionSentences' >> BuildRootEntity(
                dataset=input_dataset,
                root_entity_class=entities.StateSupervisionSentence,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=True,
                unifying_id_field_filter_set=person_id_filter_set,
                state_code=state_code))

        if state_code is None or state_code == 'US_MO':
            # Bring in the reference table that includes sentence status ranking information
            us_mo_sentence_status_query = select_all_by_person_query(
                reference_dataset,
                US_MO_SENTENCE_STATUSES_VIEW_NAME,
                state_code,
                person_id_filter_set)

            us_mo_sentence_statuses = (
                p | "Read MO sentence status table from BigQuery" >>
                beam.io.Read(beam.io.BigQuerySource(
                    query=us_mo_sentence_status_query,
                    use_standard_sql=True)))
        else:
            us_mo_sentence_statuses = (
                p | f"Generate empty MO statuses list for non-MO state run: {state_code} " >>
                beam.Create([]))

        us_mo_sentence_status_rankings_as_kv = (
            us_mo_sentence_statuses
            | 'Convert MO sentence status ranking table to KV tuples' >>
            beam.ParDo(ConvertDictToKVTuple(), 'person_id'))

        supervision_sentences_and_statuses = (
            {'incarceration_sentences': incarceration_sentences,
             'supervision_sentences': supervision_sentences,
             'sentence_statuses': us_mo_sentence_status_rankings_as_kv}
            | 'Group sentences to the sentence statuses for that person' >>
            beam.CoGroupByKey())

        sentences_converted = (
            supervision_sentences_and_statuses
            | 'Convert to state-specific sentences' >>
            beam.ParDo(ConvertSentencesToStateSpecificType()).with_outputs(
                'incarceration_sentences', 'supervision_sentences'))

        sentences_and_sentence_groups = (
            {'sentence_groups': sentence_groups,
             'incarceration_sentences': sentences_converted.incarceration_sentences,
             'supervision_sentences': sentences_converted.supervision_sentences}
            | 'Group sentences to sentence groups' >> beam.CoGroupByKey())

        # Set hydrated sentences on the corresponding sentence groups
        sentence_groups_with_hydrated_sentences = (
            sentences_and_sentence_groups
            | 'Set hydrated sentences on sentence groups' >>
            beam.ParDo(SetSentencesOnSentenceGroup()))

        # Bring in the table that associates people and their county of residence
        person_id_to_county_kv = (
            p | 'Load person_id_to_county_kv' >> ImportTableAsKVTuples(
                dataset_id=reference_dataset,
                table_id=PERSONS_TO_RECENT_COUNTY_OF_RESIDENCE_VIEW_NAME,
                table_key='person_id',
                state_code_filter=state_code,
                person_id_filter_set=person_id_filter_set))

        ip_to_judicial_district_kv = (
            p | 'Load ip_to_judicial_district_kv' >> ImportTableAsKVTuples(
                dataset_id=reference_dataset,
                table_id=INCARCERATION_PERIOD_JUDICIAL_DISTRICT_ASSOCIATION_VIEW_NAME,
                table_key='person_id',
                state_code_filter=state_code,
                person_id_filter_set=person_id_filter_set))

        state_race_ethnicity_population_counts = (
            p | 'Load state_race_ethnicity_population_counts' >> ImportTable(
                dataset_id=static_reference_dataset,
                table_id='state_race_ethnicity_population_counts',
                state_code_filter=state_code,
                person_id_filter_set=None))

        # Group each StatePerson with their related entities
        person_entities = (
            {'person': persons,
             'sentence_groups': sentence_groups_with_hydrated_sentences,
             'incarceration_period_judicial_district_association':
                 ip_to_judicial_district_kv}
            | 'Group StatePerson to SentenceGroups' >> beam.CoGroupByKey())

        # Identify IncarcerationEvents from the StatePerson's StateIncarcerationPeriods
        person_incarceration_events = (
            person_entities
            | 'Classify Incarceration Events' >>
            beam.ParDo(ClassifyIncarcerationEvents(),
                       AsDict(person_id_to_county_kv)))

        person_metadata = (
            persons
            | "Build the person_metadata dictionary" >>
            beam.ParDo(BuildPersonMetadata(),
                       AsList(state_race_ethnicity_population_counts)))

        person_incarceration_events_with_metadata = (
            {'person_events': person_incarceration_events,
             'person_metadata': person_metadata}
            | 'Group IncarcerationEvents with person-level metadata' >>
            beam.CoGroupByKey()
            | 'Organize StatePerson, PersonMetadata and IncarcerationEvents for calculations' >>
            beam.ParDo(ExtractPersonEventsMetadata()))

        # Get pipeline job details for accessing job_id
        all_pipeline_options = apache_beam_pipeline_options.get_all_options()

        # Add timestamp for local jobs
        job_timestamp = datetime.datetime.now().strftime('%Y-%m-%d_%H_%M_%S.%f')
        all_pipeline_options['job_timestamp'] = job_timestamp

        # Get the type of metric to calculate
        metric_types_set = set(metric_types)

        # Get IncarcerationMetrics
        incarceration_metrics = (
            person_incarceration_events_with_metadata
            | 'Get Incarceration Metrics' >> GetIncarcerationMetrics(
                pipeline_options=all_pipeline_options,
                metric_types=metric_types_set,
                calculation_end_month=calculation_end_month,
                calculation_month_count=calculation_month_count))

        if person_id_filter_set:
            logging.warning(
                "Non-empty person filter set - returning before writing metrics.")
            return

        # Convert the metrics into a format that's writable to BQ
        writable_metrics = (
            incarceration_metrics
            | 'Convert to dict to be written to BQ' >>
            beam.ParDo(RecidivizMetricWritableDict()).with_outputs(
                IncarcerationMetricType.INCARCERATION_ADMISSION.value,
                IncarcerationMetricType.INCARCERATION_POPULATION.value,
                IncarcerationMetricType.INCARCERATION_RELEASE.value))

        # Write the metrics to the output tables in BigQuery
        admissions_table_id = DATAFLOW_METRICS_TO_TABLES.get(IncarcerationAdmissionMetric)
        population_table_id = DATAFLOW_METRICS_TO_TABLES.get(IncarcerationPopulationMetric)
        releases_table_id = DATAFLOW_METRICS_TO_TABLES.get(IncarcerationReleaseMetric)

        _ = (writable_metrics.INCARCERATION_ADMISSION
             | f"Write admission metrics to BQ table: {admissions_table_id}" >>
             beam.io.WriteToBigQuery(
                 table=admissions_table_id,
                 dataset=output,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
                 method=beam.io.WriteToBigQuery.Method.FILE_LOADS))

        _ = (writable_metrics.INCARCERATION_POPULATION
             | f"Write population metrics to BQ table: {population_table_id}" >>
             beam.io.WriteToBigQuery(
                 table=population_table_id,
                 dataset=output,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
                 method=beam.io.WriteToBigQuery.Method.FILE_LOADS))

        _ = (writable_metrics.INCARCERATION_RELEASE
             | f"Write release metrics to BQ table: {releases_table_id}" >>
             beam.io.WriteToBigQuery(
                 table=releases_table_id,
                 dataset=output,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
                 method=beam.io.WriteToBigQuery.Method.FILE_LOADS))
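# Editor's note: later revisions of this pipeline replace the repeated
# beam.io.WriteToBigQuery calls above with a WriteAppendToBigQuery transform.
# The helper below is only a sketch of what such a wrapper could look like,
# inferred from the call sites in this file; the real transform is defined
# elsewhere in the repo and may differ.
def _write_append_to_big_query_sketch(output_table: str, output_dataset: str):
    import apache_beam as beam

    # Fix the create/write dispositions once so each call site stays short.
    return beam.io.WriteToBigQuery(
        table=output_table,
        dataset=output_dataset,
        create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
        write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
        method=beam.io.WriteToBigQuery.Method.FILE_LOADS,
    )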
def run(
    apache_beam_pipeline_options: PipelineOptions,
    data_input: str,
    reference_view_input: str,
    static_reference_input: str,
    output: str,
    calculation_month_count: int,
    metric_types: List[str],
    state_code: str,
    calculation_end_month: Optional[str],
    person_filter_ids: Optional[List[int]],
):
    """Runs the incarceration calculation pipeline."""

    # Workaround to load SQLAlchemy objects at start of pipeline. This is necessary because the BuildRootEntity
    # function tries to access attributes of relationship properties on the SQLAlchemy room_schema_class before they
    # have been loaded. However, if *any* SQLAlchemy objects have been instantiated, then the relationship properties
    # are loaded and their attributes can be successfully accessed.
    _ = schema.StatePerson()

    apache_beam_pipeline_options.view_as(SetupOptions).save_main_session = True

    # Get pipeline job details
    all_pipeline_options = apache_beam_pipeline_options.get_all_options()
    project_id = all_pipeline_options["project"]

    if project_id is None:
        raise ValueError(f"No project set in pipeline options: {all_pipeline_options}")

    if state_code is None:
        raise ValueError("No state_code set for pipeline")

    input_dataset = project_id + "." + data_input
    reference_dataset = project_id + "." + reference_view_input
    static_reference_dataset = project_id + "." + static_reference_input

    person_id_filter_set = set(person_filter_ids) if person_filter_ids else None

    with beam.Pipeline(options=apache_beam_pipeline_options) as p:
        # Get StatePersons
        persons = p | "Load StatePersons" >> BuildRootEntity(
            dataset=input_dataset,
            root_entity_class=entities.StatePerson,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=True,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code,
        )

        # Get StateSentenceGroups
        sentence_groups = p | "Load StateSentenceGroups" >> BuildRootEntity(
            dataset=input_dataset,
            root_entity_class=entities.StateSentenceGroup,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=True,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code,
        )

        # Get StateIncarcerationSentences
        incarceration_sentences = (
            p
            | "Load StateIncarcerationSentences"
            >> BuildRootEntity(
                dataset=input_dataset,
                root_entity_class=entities.StateIncarcerationSentence,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=True,
                unifying_id_field_filter_set=person_id_filter_set,
                state_code=state_code,
            )
        )

        # Get StateSupervisionSentences
        supervision_sentences = p | "Load StateSupervisionSentences" >> BuildRootEntity(
            dataset=input_dataset,
            root_entity_class=entities.StateSupervisionSentence,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=True,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code,
        )

        if state_code == "US_MO":
            # Bring in the reference table that includes sentence status ranking information
            us_mo_sentence_status_query = select_all_by_person_query(
                reference_dataset,
                US_MO_SENTENCE_STATUSES_VIEW_NAME,
                state_code,
                person_id_filter_set,
            )

            us_mo_sentence_statuses = (
                p
                | "Read MO sentence status table from BigQuery"
                >> ReadFromBigQuery(query=us_mo_sentence_status_query)
            )
        else:
            us_mo_sentence_statuses = (
                p
                | f"Generate empty MO statuses list for non-MO state run: {state_code} "
                >> beam.Create([])
            )

        us_mo_sentence_status_rankings_as_kv = (
            us_mo_sentence_statuses
            | "Convert MO sentence status ranking table to KV tuples"
            >> beam.ParDo(ConvertDictToKVTuple(), "person_id")
        )

        supervision_sentences_and_statuses = (
            {
                "incarceration_sentences": incarceration_sentences,
                "supervision_sentences": supervision_sentences,
                "sentence_statuses": us_mo_sentence_status_rankings_as_kv,
            }
            | "Group sentences to the sentence statuses for that person"
            >> beam.CoGroupByKey()
        )

        sentences_converted = (
            supervision_sentences_and_statuses
            | "Convert to state-specific sentences"
            >> beam.ParDo(ConvertSentencesToStateSpecificType()).with_outputs(
                "incarceration_sentences", "supervision_sentences"
            )
        )

        sentences_and_sentence_groups = {
            "sentence_groups": sentence_groups,
            "incarceration_sentences": sentences_converted.incarceration_sentences,
            "supervision_sentences": sentences_converted.supervision_sentences,
        } | "Group sentences to sentence groups" >> beam.CoGroupByKey()

        # Set hydrated sentences on the corresponding sentence groups
        sentence_groups_with_hydrated_sentences = (
            sentences_and_sentence_groups
            | "Set hydrated sentences on sentence groups"
            >> beam.ParDo(SetSentencesOnSentenceGroup())
        )

        # Bring in the table that associates people and their county of residence
        person_id_to_county_kv = (
            p
            | "Load person_id_to_county_kv"
            >> ImportTableAsKVTuples(
                dataset_id=reference_dataset,
                table_id=PERSONS_TO_RECENT_COUNTY_OF_RESIDENCE_VIEW_NAME,
                table_key="person_id",
                state_code_filter=state_code,
                person_id_filter_set=person_id_filter_set,
            )
        )

        ip_to_judicial_district_kv = (
            p
            | "Load ip_to_judicial_district_kv"
            >> ImportTableAsKVTuples(
                dataset_id=reference_dataset,
                table_id=INCARCERATION_PERIOD_JUDICIAL_DISTRICT_ASSOCIATION_VIEW_NAME,
                table_key="person_id",
                state_code_filter=state_code,
                person_id_filter_set=person_id_filter_set,
            )
        )

        state_race_ethnicity_population_counts = (
            p
            | "Load state_race_ethnicity_population_counts"
            >> ImportTable(
                dataset_id=static_reference_dataset,
                table_id="state_race_ethnicity_population_counts",
                state_code_filter=state_code,
                person_id_filter_set=None,
            )
        )

        # Group each StatePerson with their related entities
        person_entities = {
            "person": persons,
            "sentence_groups": sentence_groups_with_hydrated_sentences,
            "incarceration_period_judicial_district_association": ip_to_judicial_district_kv,
            "persons_to_recent_county_of_residence": person_id_to_county_kv,
        } | "Group StatePerson to SentenceGroups" >> beam.CoGroupByKey()

        # Identify IncarcerationEvents from the StatePerson's StateIncarcerationPeriods
        person_incarceration_events = (
            person_entities
            | "Classify Incarceration Events" >> beam.ParDo(ClassifyIncarcerationEvents())
        )

        person_metadata = (
            persons
            | "Build the person_metadata dictionary"
            >> beam.ParDo(
                BuildPersonMetadata(), AsList(state_race_ethnicity_population_counts)
            )
        )

        person_incarceration_events_with_metadata = (
            {
                "person_events": person_incarceration_events,
                "person_metadata": person_metadata,
            }
            | "Group IncarcerationEvents with person-level metadata" >> beam.CoGroupByKey()
            | "Organize StatePerson, PersonMetadata and IncarcerationEvents for calculations"
            >> beam.ParDo(ExtractPersonEventsMetadata())
        )

        # Get pipeline job details for accessing job_id
        all_pipeline_options = apache_beam_pipeline_options.get_all_options()

        # Add timestamp for local jobs
        job_timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H_%M_%S.%f")
        all_pipeline_options["job_timestamp"] = job_timestamp

        # Get the type of metric to calculate
        metric_types_set = set(metric_types)

        # Get IncarcerationMetrics
        incarceration_metrics = (
            person_incarceration_events_with_metadata
            | "Get Incarceration Metrics"
            >> GetIncarcerationMetrics(
                pipeline_options=all_pipeline_options,
                metric_types=metric_types_set,
                calculation_end_month=calculation_end_month,
                calculation_month_count=calculation_month_count,
            )
        )

        if person_id_filter_set:
            logging.warning(
                "Non-empty person filter set - returning before writing metrics."
            )
            return

        # Convert the metrics into a format that's writable to BQ
        writable_metrics = (
            incarceration_metrics
            | "Convert to dict to be written to BQ"
            >> beam.ParDo(RecidivizMetricWritableDict()).with_outputs(
                IncarcerationMetricType.INCARCERATION_ADMISSION.value,
                IncarcerationMetricType.INCARCERATION_POPULATION.value,
                IncarcerationMetricType.INCARCERATION_RELEASE.value,
            )
        )

        # Write the metrics to the output tables in BigQuery
        admissions_table_id = DATAFLOW_METRICS_TO_TABLES[IncarcerationAdmissionMetric]
        population_table_id = DATAFLOW_METRICS_TO_TABLES[IncarcerationPopulationMetric]
        releases_table_id = DATAFLOW_METRICS_TO_TABLES[IncarcerationReleaseMetric]

        _ = (
            writable_metrics.INCARCERATION_ADMISSION
            | f"Write admission metrics to BQ table: {admissions_table_id}"
            >> WriteAppendToBigQuery(
                output_table=admissions_table_id,
                output_dataset=output,
            )
        )

        _ = (
            writable_metrics.INCARCERATION_POPULATION
            | f"Write population metrics to BQ table: {population_table_id}"
            >> WriteAppendToBigQuery(
                output_table=population_table_id,
                output_dataset=output,
            )
        )

        _ = (
            writable_metrics.INCARCERATION_RELEASE
            | f"Write release metrics to BQ table: {releases_table_id}"
            >> WriteAppendToBigQuery(
                output_table=releases_table_id,
                output_dataset=output,
            )
        )
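# Editor's note: an example of how this run() function might be driven
# locally. All values below are placeholders (hypothetical project, datasets,
# and state code), not real resources; this is a usage sketch, not part of
# the production entrypoint.
def _example_local_incarceration_run() -> None:
    from apache_beam.options.pipeline_options import PipelineOptions

    options = PipelineOptions(runner="DirectRunner", project="my-project-id")
    run(
        apache_beam_pipeline_options=options,
        data_input="state",                      # dataset holding state entity tables
        reference_view_input="reference_views",  # dataset holding reference views
        static_reference_input="static_reference_tables",
        output="dataflow_metrics",               # destination dataset for metrics
        calculation_month_count=36,
        metric_types=["INCARCERATION_ADMISSION"],
        state_code="US_XX",                      # placeholder state code
        calculation_end_month=None,
        person_filter_ids=None,
    )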
def execute_pipeline(
    self,
    pipeline: beam.Pipeline,
    all_pipeline_options: Dict[str, Any],
    state_code: str,
    input_dataset: str,
    reference_dataset: str,
    static_reference_dataset: str,
    metric_types: List[str],
    person_id_filter_set: Optional[Set[int]],
    calculation_month_count: int = -1,
    calculation_end_month: Optional[str] = None,
) -> beam.Pipeline:
    persons = pipeline | "Load StatePersons" >> BuildRootEntity(
        dataset=input_dataset,
        root_entity_class=entities.StatePerson,
        unifying_id_field=entities.StatePerson.get_class_id_name(),
        build_related_entities=True,
        unifying_id_field_filter_set=person_id_filter_set,
        state_code=state_code,
    )

    # Get StateSentenceGroups
    sentence_groups = pipeline | "Load StateSentenceGroups" >> BuildRootEntity(
        dataset=input_dataset,
        root_entity_class=entities.StateSentenceGroup,
        unifying_id_field=entities.StatePerson.get_class_id_name(),
        build_related_entities=True,
        unifying_id_field_filter_set=person_id_filter_set,
        state_code=state_code,
    )

    # Get StateIncarcerationSentences
    incarceration_sentences = (
        pipeline
        | "Load StateIncarcerationSentences"
        >> BuildRootEntity(
            dataset=input_dataset,
            root_entity_class=entities.StateIncarcerationSentence,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=True,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code,
        )
    )

    # Get StateSupervisionSentences
    supervision_sentences = (
        pipeline
        | "Load StateSupervisionSentences"
        >> BuildRootEntity(
            dataset=input_dataset,
            root_entity_class=entities.StateSupervisionSentence,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=True,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code,
        )
    )

    # Get StateSupervisionPeriods
    supervision_periods = (
        pipeline
        | "Load StateSupervisionPeriods"
        >> BuildRootEntity(
            dataset=input_dataset,
            root_entity_class=entities.StateSupervisionPeriod,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=True,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code,
        )
    )

    # Get StateAssessments
    assessments = pipeline | "Load Assessments" >> BuildRootEntity(
        dataset=input_dataset,
        root_entity_class=entities.StateAssessment,
        unifying_id_field=entities.StatePerson.get_class_id_name(),
        build_related_entities=False,
        unifying_id_field_filter_set=person_id_filter_set,
        state_code=state_code,
    )

    # Get StateSupervisionViolations
    supervision_violations = (
        pipeline
        | "Load SupervisionViolations"
        >> BuildRootEntity(
            dataset=input_dataset,
            root_entity_class=entities.StateSupervisionViolation,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=True,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code,
        )
    )

    # Get StateSupervisionViolationResponses
    supervision_violation_responses = (
        pipeline
        | "Load SupervisionViolationResponses"
        >> BuildRootEntity(
            dataset=input_dataset,
            root_entity_class=entities.StateSupervisionViolationResponse,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=True,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code,
        )
    )

    if state_code == "US_MO":
        # Bring in the reference table that includes sentence status ranking information
        us_mo_sentence_status_query = select_all_by_person_query(
            reference_dataset,
            US_MO_SENTENCE_STATUSES_VIEW_NAME,
            state_code,
            person_id_filter_set,
        )

        us_mo_sentence_statuses = (
            pipeline
            | "Read MO sentence status table from BigQuery"
            >> ReadFromBigQuery(query=us_mo_sentence_status_query)
        )
    else:
        us_mo_sentence_statuses = (
            pipeline
            | f"Generate empty MO statuses list for non-MO state run: {state_code} "
            >> beam.Create([])
        )

    us_mo_sentence_status_rankings_as_kv = (
        us_mo_sentence_statuses
        | "Convert MO sentence status ranking table to KV tuples"
        >> beam.ParDo(ConvertDictToKVTuple(), "person_id")
    )

    supervision_sentences_and_statuses = (
        {
            "incarceration_sentences": incarceration_sentences,
            "supervision_sentences": supervision_sentences,
            "sentence_statuses": us_mo_sentence_status_rankings_as_kv,
        }
        | "Group sentences to the sentence statuses for that person"
        >> beam.CoGroupByKey()
    )

    sentences_converted = (
        supervision_sentences_and_statuses
        | "Convert to state-specific sentences"
        >> beam.ParDo(ConvertSentencesToStateSpecificType()).with_outputs(
            "incarceration_sentences", "supervision_sentences"
        )
    )

    # Set hydrated supervision periods on the corresponding incarceration sentences
    incarceration_sentences_with_hydrated_sps = (
        {
            "supervision_periods": supervision_periods,
            "sentences": sentences_converted.incarceration_sentences,
        }
        | "Group supervision periods to incarceration sentences"
        >> beam.CoGroupByKey()
        | "Set hydrated supervision periods on incarceration sentences"
        >> beam.ParDo(SetSupervisionPeriodsOnSentences())
    )

    # Set hydrated supervision periods on the corresponding supervision sentences
    supervision_sentences_with_hydrated_sps = (
        {
            "supervision_periods": supervision_periods,
            "sentences": sentences_converted.supervision_sentences,
        }
        | "Group supervision periods to supervision sentences"
        >> beam.CoGroupByKey()
        | "Set hydrated supervision periods on supervision sentences"
        >> beam.ParDo(SetSupervisionPeriodsOnSentences())
    )

    sentences_and_sentence_groups = {
        "sentence_groups": sentence_groups,
        "incarceration_sentences": incarceration_sentences_with_hydrated_sps,
        "supervision_sentences": supervision_sentences_with_hydrated_sps,
    } | "Group sentences to sentence groups" >> beam.CoGroupByKey()

    # Set hydrated sentences on the corresponding sentence groups
    sentence_groups_with_hydrated_sentences = (
        sentences_and_sentence_groups
        | "Set hydrated sentences on sentence groups"
        >> beam.ParDo(SetSentencesOnSentenceGroup())
    )

    # Bring in the table that associates people and their county of residence
    person_id_to_county_kv = (
        pipeline
        | "Load person_id_to_county_kv"
        >> ImportTableAsKVTuples(
            dataset_id=reference_dataset,
            table_id=PERSONS_TO_RECENT_COUNTY_OF_RESIDENCE_VIEW_NAME,
            table_key="person_id",
            state_code_filter=state_code,
            person_id_filter_set=person_id_filter_set,
        )
    )

    ip_to_judicial_district_kv = (
        pipeline
        | "Load ip_to_judicial_district_kv"
        >> ImportTableAsKVTuples(
            dataset_id=reference_dataset,
            table_id=INCARCERATION_PERIOD_JUDICIAL_DISTRICT_ASSOCIATION_VIEW_NAME,
            table_key="person_id",
            state_code_filter=state_code,
            person_id_filter_set=person_id_filter_set,
        )
    )

    supervision_period_to_agent_associations_as_kv = (
        pipeline
        | "Load supervision_period_to_agent_associations_as_kv"
        >> ImportTableAsKVTuples(
            dataset_id=reference_dataset,
            table_id=SUPERVISION_PERIOD_TO_AGENT_ASSOCIATION_VIEW_NAME,
            table_key="person_id",
            state_code_filter=state_code,
            person_id_filter_set=person_id_filter_set,
        )
    )

    state_race_ethnicity_population_counts = (
        pipeline
        | "Load state_race_ethnicity_population_counts"
        >> ImportTable(
            dataset_id=static_reference_dataset,
            table_id="state_race_ethnicity_population_counts",
            state_code_filter=state_code,
            person_id_filter_set=None,
        )
    )

    # Group StateSupervisionViolationResponses and StateSupervisionViolations by person_id
    supervision_violations_and_responses = (
        {
            "violations": supervision_violations,
            "violation_responses": supervision_violation_responses,
        }
        | "Group StateSupervisionViolationResponses to "
        "StateSupervisionViolations"
        >> beam.CoGroupByKey()
    )

    # Set the fully hydrated StateSupervisionViolation entities on the corresponding
    # StateSupervisionViolationResponses
    violation_responses_with_hydrated_violations = (
        supervision_violations_and_responses
        | "Set hydrated StateSupervisionViolations on "
        "the StateSupervisionViolationResponses"
        >> beam.ParDo(SetViolationOnViolationsResponse())
    )

    # Group each StatePerson with their related entities
    person_entities = {
        "person": persons,
        "assessments": assessments,
        "sentence_groups": sentence_groups_with_hydrated_sentences,
        "violation_responses": violation_responses_with_hydrated_violations,
        "incarceration_period_judicial_district_association": ip_to_judicial_district_kv,
        "supervision_period_to_agent_association": supervision_period_to_agent_associations_as_kv,
        "persons_to_recent_county_of_residence": person_id_to_county_kv,
    } | "Group StatePerson to SentenceGroups" >> beam.CoGroupByKey()

    # Identify IncarcerationEvents from the StatePerson's StateIncarcerationPeriods
    person_incarceration_events = (
        person_entities
        | "Classify Incarceration Events"
        >> beam.ParDo(ClassifyEvents(), identifier=self.pipeline_config.identifier)
    )

    person_metadata = (
        persons
        | "Build the person_metadata dictionary"
        >> beam.ParDo(
            BuildPersonMetadata(),
            state_race_ethnicity_population_counts=AsList(
                state_race_ethnicity_population_counts
            ),
        )
    )

    person_incarceration_events_with_metadata = (
        {
            "person_events": person_incarceration_events,
            "person_metadata": person_metadata,
        }
        | "Group IncarcerationEvents with person-level metadata" >> beam.CoGroupByKey()
        | "Organize StatePerson, PersonMetadata and IncarcerationEvents for calculations"
        >> beam.ParDo(ExtractPersonEventsMetadata())
    )

    # Add timestamp for local jobs
    job_timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H_%M_%S.%f")
    all_pipeline_options["job_timestamp"] = job_timestamp

    # Get the type of metric to calculate
    metric_types_set = set(metric_types)

    # Get IncarcerationMetrics
    incarceration_metrics = (
        person_incarceration_events_with_metadata
        | "Get Incarceration Metrics"
        >> GetMetrics(
            pipeline_options=all_pipeline_options,
            pipeline_config=self.pipeline_config,
            metric_types_to_include=metric_types_set,
            calculation_end_month=calculation_end_month,
            calculation_month_count=calculation_month_count,
        )
    )

    return incarceration_metrics
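# Editor's note: every hydration step above leans on beam.CoGroupByKey over
# dicts of person_id-keyed PCollections. A tiny self-contained illustration
# of what that grouping yields (toy data, hypothetical names):
def _demo_cogroupbykey() -> None:
    import apache_beam as beam

    with beam.Pipeline() as demo_p:
        sentences = demo_p | "sentences" >> beam.Create([(1, "sentence_a")])
        periods = demo_p | "periods" >> beam.Create([(1, "period_a"), (1, "period_b")])
        _ = (
            {"sentences": sentences, "supervision_periods": periods}
            | beam.CoGroupByKey()
            # Materialize the grouped iterables so the output is easy to read.
            | beam.Map(lambda kv: (kv[0], {k: list(v) for k, v in kv[1].items()}))
            # Roughly: (1, {'sentences': ['sentence_a'],
            #               'supervision_periods': ['period_a', 'period_b']})
            | beam.Map(print)
        )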
def run(
    apache_beam_pipeline_options: PipelineOptions,
    data_input: str,
    reference_view_input: str,
    static_reference_input: str,
    output: str,
    calculation_month_count: int,
    metric_types: List[str],
    state_code: str,
    calculation_end_month: Optional[str],
    person_filter_ids: Optional[List[int]],
) -> None:
    """Runs the supervision calculation pipeline."""

    # Workaround to load SQLAlchemy objects at start of pipeline. This is necessary because the BuildRootEntity
    # function tries to access attributes of relationship properties on the SQLAlchemy room_schema_class before they
    # have been loaded. However, if *any* SQLAlchemy objects have been instantiated, then the relationship properties
    # are loaded and their attributes can be successfully accessed.
    _ = schema.StatePerson()

    apache_beam_pipeline_options.view_as(SetupOptions).save_main_session = True

    # Get pipeline job details
    all_pipeline_options = apache_beam_pipeline_options.get_all_options()
    project_id = all_pipeline_options["project"]

    if project_id is None:
        raise ValueError(f"No project set in pipeline options: {all_pipeline_options}")

    if state_code is None:
        raise ValueError("No state_code set for pipeline")

    input_dataset = project_id + "." + data_input
    reference_dataset = project_id + "." + reference_view_input
    static_reference_dataset = project_id + "." + static_reference_input

    person_id_filter_set = set(person_filter_ids) if person_filter_ids else None

    with beam.Pipeline(options=apache_beam_pipeline_options) as p:
        # Get StatePersons
        persons = p | "Load Persons" >> BuildRootEntity(
            dataset=input_dataset,
            root_entity_class=entities.StatePerson,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=True,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code,
        )

        # Get StateIncarcerationPeriods
        incarceration_periods = p | "Load IncarcerationPeriods" >> BuildRootEntity(
            dataset=input_dataset,
            root_entity_class=entities.StateIncarcerationPeriod,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=True,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code,
        )

        # Get StateSupervisionViolations
        supervision_violations = p | "Load SupervisionViolations" >> BuildRootEntity(
            dataset=input_dataset,
            root_entity_class=entities.StateSupervisionViolation,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=True,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code,
        )

        # TODO(#2769): Don't bring this in as a root entity
        # Get StateSupervisionViolationResponses
        supervision_violation_responses = (
            p
            | "Load SupervisionViolationResponses"
            >> BuildRootEntity(
                dataset=input_dataset,
                root_entity_class=entities.StateSupervisionViolationResponse,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=True,
                unifying_id_field_filter_set=person_id_filter_set,
                state_code=state_code,
            )
        )

        # Get StateSupervisionSentences
        supervision_sentences = p | "Load SupervisionSentences" >> BuildRootEntity(
            dataset=input_dataset,
            root_entity_class=entities.StateSupervisionSentence,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=True,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code,
        )

        # Get StateIncarcerationSentences
        incarceration_sentences = p | "Load IncarcerationSentences" >> BuildRootEntity(
            dataset=input_dataset,
            root_entity_class=entities.StateIncarcerationSentence,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=True,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code,
        )

        # Get StateSupervisionPeriods
        supervision_periods = p | "Load SupervisionPeriods" >> BuildRootEntity(
            dataset=input_dataset,
            root_entity_class=entities.StateSupervisionPeriod,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=True,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code,
        )

        # Get StateAssessments
        assessments = p | "Load Assessments" >> BuildRootEntity(
            dataset=input_dataset,
            root_entity_class=entities.StateAssessment,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=False,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code,
        )

        supervision_contacts = p | "Load StateSupervisionContacts" >> BuildRootEntity(
            dataset=input_dataset,
            root_entity_class=entities.StateSupervisionContact,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=False,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code,
        )

        supervision_period_to_agent_associations_as_kv = (
            p
            | "Load supervision_period_to_agent_associations_as_kv"
            >> ImportTableAsKVTuples(
                dataset_id=reference_dataset,
                table_id=SUPERVISION_PERIOD_TO_AGENT_ASSOCIATION_VIEW_NAME,
                table_key="person_id",
                state_code_filter=state_code,
                person_id_filter_set=person_id_filter_set,
            )
        )

        # Bring in the judicial districts associated with supervision_periods
        sp_to_judicial_district_kv = (
            p
            | "Load sp_to_judicial_district_kv"
            >> ImportTableAsKVTuples(
                dataset_id=reference_dataset,
                table_id=SUPERVISION_PERIOD_JUDICIAL_DISTRICT_ASSOCIATION_VIEW_NAME,
                state_code_filter=state_code,
                person_id_filter_set=person_id_filter_set,
                table_key="person_id",
            )
        )

        state_race_ethnicity_population_counts = (
            p
            | "Load state_race_ethnicity_population_counts"
            >> ImportTable(
                dataset_id=static_reference_dataset,
                table_id="state_race_ethnicity_population_counts",
                state_code_filter=state_code,
                person_id_filter_set=None,
            )
        )

        if state_code == "US_MO":
            # Bring in the reference table that includes sentence status ranking information
            us_mo_sentence_status_query = select_all_by_person_query(
                reference_dataset,
                US_MO_SENTENCE_STATUSES_VIEW_NAME,
                state_code,
                person_id_filter_set,
            )

            us_mo_sentence_statuses = (
                p
                | "Read MO sentence status table from BigQuery"
                >> ReadFromBigQuery(query=us_mo_sentence_status_query)
            )
        else:
            us_mo_sentence_statuses = (
                p
                | f"Generate empty MO statuses list for non-MO state run: {state_code} "
                >> beam.Create([])
            )

        us_mo_sentence_status_rankings_as_kv = (
            us_mo_sentence_statuses
            | "Convert MO sentence status ranking table to KV tuples"
            >> beam.ParDo(ConvertDictToKVTuple(), "person_id")
        )

        sentences_and_statuses = (
            {
                "incarceration_sentences": incarceration_sentences,
                "supervision_sentences": supervision_sentences,
                "sentence_statuses": us_mo_sentence_status_rankings_as_kv,
            }
            | "Group sentences to the sentence statuses for that person"
            >> beam.CoGroupByKey()
        )

        sentences_converted = (
            sentences_and_statuses
            | "Convert to state-specific sentences"
            >> beam.ParDo(ConvertSentencesToStateSpecificType()).with_outputs(
                "incarceration_sentences", "supervision_sentences"
            )
        )

        # Group StateSupervisionViolationResponses and StateSupervisionViolations by person_id
        supervision_violations_and_responses = (
            {
                "violations": supervision_violations,
                "violation_responses": supervision_violation_responses,
            }
            | "Group StateSupervisionViolationResponses to "
            "StateSupervisionViolations"
            >> beam.CoGroupByKey()
        )

        # Set the fully hydrated StateSupervisionViolation entities on the corresponding
        # StateSupervisionViolationResponses
        violation_responses_with_hydrated_violations = (
            supervision_violations_and_responses
            | "Set hydrated StateSupervisionViolations on "
            "the StateSupervisionViolationResponses"
            >> beam.ParDo(SetViolationOnViolationsResponse())
        )

        # Group StateIncarcerationPeriods and StateSupervisionViolationResponses by person_id
        incarceration_periods_and_violation_responses = (
            {
                "incarceration_periods": incarceration_periods,
                "violation_responses": violation_responses_with_hydrated_violations,
            }
            | "Group StateIncarcerationPeriods to "
            "StateSupervisionViolationResponses"
            >> beam.CoGroupByKey()
        )

        # Set the fully hydrated StateSupervisionViolationResponse entities on the corresponding
        # StateIncarcerationPeriods
        incarceration_periods_with_source_violations = (
            incarceration_periods_and_violation_responses
            | "Set hydrated StateSupervisionViolationResponses on "
            "the StateIncarcerationPeriods"
            >> beam.ParDo(SetViolationResponseOnIncarcerationPeriod())
        )

        # Group each StatePerson with their related entities
        person_entities = {
            "person": persons,
            "assessments": assessments,
            "incarceration_periods": incarceration_periods_with_source_violations,
            "supervision_periods": supervision_periods,
            "supervision_sentences": sentences_converted.supervision_sentences,
            "incarceration_sentences": sentences_converted.incarceration_sentences,
            "violation_responses": violation_responses_with_hydrated_violations,
            "supervision_contacts": supervision_contacts,
            "supervision_period_judicial_district_association": sp_to_judicial_district_kv,
            "supervision_period_to_agent_association": supervision_period_to_agent_associations_as_kv,
        } | "Group StatePerson to all entities" >> beam.CoGroupByKey()

        # Identify SupervisionTimeBuckets from the StatePerson's StateSupervisionSentences and StateIncarcerationPeriods
        person_time_buckets = (
            person_entities
            | "Get SupervisionTimeBuckets" >> beam.ParDo(ClassifySupervisionTimeBuckets())
        )

        person_metadata = (
            persons
            | "Build the person_metadata dictionary"
            >> beam.ParDo(
                BuildPersonMetadata(), AsList(state_race_ethnicity_population_counts)
            )
        )

        person_time_buckets_with_metadata = (
            {
                "person_events": person_time_buckets,
                "person_metadata": person_metadata,
            }
            | "Group SupervisionTimeBuckets with person-level metadata"
            >> beam.CoGroupByKey()
            | "Organize StatePerson, PersonMetadata and SupervisionTimeBuckets for calculations"
            >> beam.ParDo(ExtractPersonEventsMetadata())
        )

        # Get pipeline job details for accessing job_id
        all_pipeline_options = apache_beam_pipeline_options.get_all_options()

        # Get the type of metric to calculate
        metric_types_set = set(metric_types)

        # Add timestamp for local jobs
        job_timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H_%M_%S.%f")
        all_pipeline_options["job_timestamp"] = job_timestamp

        # Get supervision metrics
        supervision_metrics = (
            person_time_buckets_with_metadata
            | "Get Supervision Metrics"
            >> GetSupervisionMetrics(
                pipeline_options=all_pipeline_options,
                metric_types=metric_types_set,
                calculation_end_month=calculation_end_month,
                calculation_month_count=calculation_month_count,
            )
        )

        if person_id_filter_set:
            logging.warning(
                "Non-empty person filter set - returning before writing metrics."
            )
            return

        # Convert the metrics into a format that's writable to BQ
        writable_metrics = (
            supervision_metrics
            | "Convert to dict to be written to BQ"
            >> beam.ParDo(RecidivizMetricWritableDict()).with_outputs(
                SupervisionMetricType.SUPERVISION_COMPLIANCE.value,
                SupervisionMetricType.SUPERVISION_POPULATION.value,
                SupervisionMetricType.SUPERVISION_REVOCATION.value,
                SupervisionMetricType.SUPERVISION_REVOCATION_ANALYSIS.value,
                SupervisionMetricType.SUPERVISION_START.value,
                SupervisionMetricType.SUPERVISION_SUCCESS.value,
                SupervisionMetricType.SUPERVISION_SUCCESSFUL_SENTENCE_DAYS_SERVED.value,
                SupervisionMetricType.SUPERVISION_TERMINATION.value,
                SupervisionMetricType.SUPERVISION_OUT_OF_STATE_POPULATION.value,
                SupervisionMetricType.SUPERVISION_DOWNGRADE.value,
            )
        )

        terminations_table_id = DATAFLOW_METRICS_TO_TABLES[SupervisionTerminationMetric]
        compliance_table_id = DATAFLOW_METRICS_TO_TABLES[SupervisionCaseComplianceMetric]
        populations_table_id = DATAFLOW_METRICS_TO_TABLES[SupervisionPopulationMetric]
        revocations_table_id = DATAFLOW_METRICS_TO_TABLES[SupervisionRevocationMetric]
        revocation_analysis_table_id = DATAFLOW_METRICS_TO_TABLES[
            SupervisionRevocationAnalysisMetric
        ]
        successes_table_id = DATAFLOW_METRICS_TO_TABLES[SupervisionSuccessMetric]
        successful_sentence_lengths_table_id = DATAFLOW_METRICS_TO_TABLES[
            SuccessfulSupervisionSentenceDaysServedMetric
        ]
        supervision_starts_table_id = DATAFLOW_METRICS_TO_TABLES[SupervisionStartMetric]
        out_of_state_populations_table_id = DATAFLOW_METRICS_TO_TABLES[
            SupervisionOutOfStatePopulationMetric
        ]
        supervision_downgrade_table_id = DATAFLOW_METRICS_TO_TABLES[
            SupervisionDowngradeMetric
        ]

        _ = (
            writable_metrics.SUPERVISION_POPULATION
            | f"Write population metrics to BQ table: {populations_table_id}"
            >> WriteAppendToBigQuery(
                output_table=populations_table_id,
                output_dataset=output,
            )
        )

        _ = (
            writable_metrics.SUPERVISION_OUT_OF_STATE_POPULATION
            | f"Write out of state population metrics to BQ table: "
            f"{out_of_state_populations_table_id}"
            >> WriteAppendToBigQuery(
                output_table=out_of_state_populations_table_id,
                output_dataset=output,
            )
        )

        _ = (
            writable_metrics.SUPERVISION_REVOCATION
            | f"Write revocation metrics to BQ table: {revocations_table_id}"
            >> WriteAppendToBigQuery(
                output_table=revocations_table_id,
                output_dataset=output,
            )
        )

        _ = (
            writable_metrics.SUPERVISION_SUCCESS
            | f"Write success metrics to BQ table: {successes_table_id}"
            >> WriteAppendToBigQuery(
                output_table=successes_table_id,
                output_dataset=output,
            )
        )

        _ = (
            writable_metrics.SUPERVISION_SUCCESSFUL_SENTENCE_DAYS_SERVED
            | f"Write supervision successful sentence length metrics to BQ"
            f" table: {successful_sentence_lengths_table_id}"
            >> WriteAppendToBigQuery(
                output_table=successful_sentence_lengths_table_id,
                output_dataset=output,
            )
        )

        _ = (
            writable_metrics.SUPERVISION_TERMINATION
            | f"Write termination metrics to BQ table: {terminations_table_id}"
            >> WriteAppendToBigQuery(
                output_table=terminations_table_id,
                output_dataset=output,
            )
        )

        _ = (
            writable_metrics.SUPERVISION_REVOCATION_ANALYSIS
            | f"Write revocation analyses metrics to BQ table: "
            f"{revocation_analysis_table_id}"
            >> WriteAppendToBigQuery(
                output_table=revocation_analysis_table_id,
                output_dataset=output,
            )
        )

        _ = (
            writable_metrics.SUPERVISION_COMPLIANCE
            | f"Write compliance metrics to BQ table: {compliance_table_id}"
            >> WriteAppendToBigQuery(
                output_table=compliance_table_id,
                output_dataset=output,
            )
        )

        _ = (
            writable_metrics.SUPERVISION_START
            | f"Write start metrics to BQ table: {supervision_starts_table_id}"
            >> WriteAppendToBigQuery(
                output_table=supervision_starts_table_id,
                output_dataset=output,
            )
        )

        _ = (
            writable_metrics.SUPERVISION_DOWNGRADE
            | f"Write downgrade metrics to BQ table: {supervision_downgrade_table_id}"
            >> WriteAppendToBigQuery(
                output_table=supervision_downgrade_table_id,
                output_dataset=output,
            )
        )
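# Editor's note: the write block above wires each tagged output by hand. An
# equivalent, table-driven wiring is sketched below (hypothetical helper;
# assumes the same DATAFLOW_METRICS_TO_TABLES mapping, SupervisionMetricType
# tags, and WriteAppendToBigQuery transform used above). Shown only to make
# the tag -> metric class -> table dispatch explicit.
def _write_supervision_metrics_sketch(writable_metrics, output: str) -> None:
    metric_classes_by_tag = {
        SupervisionMetricType.SUPERVISION_POPULATION.value: SupervisionPopulationMetric,
        SupervisionMetricType.SUPERVISION_TERMINATION.value: SupervisionTerminationMetric,
        # ... remaining metric types elided for brevity
    }
    for tag, metric_class in metric_classes_by_tag.items():
        table_id = DATAFLOW_METRICS_TO_TABLES[metric_class]
        _ = (
            writable_metrics[tag]  # tagged outputs also support [] access
            | f"Write {tag} metrics to BQ table: {table_id}"
            >> WriteAppendToBigQuery(output_table=table_id, output_dataset=output)
        )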
def execute_pipeline(
    self,
    pipeline: beam.Pipeline,
    all_pipeline_options: Dict[str, Any],
    state_code: str,
    input_dataset: str,
    reference_dataset: str,
    static_reference_dataset: str,
    metric_types: List[str],
    person_id_filter_set: Optional[Set[int]],
    calculation_month_count: int = -1,
    calculation_end_month: Optional[str] = None,
) -> beam.Pipeline:
    # Get StatePersons
    persons = pipeline | "Load Persons" >> BuildRootEntity(
        dataset=input_dataset,
        root_entity_class=entities.StatePerson,
        unifying_id_field=entities.StatePerson.get_class_id_name(),
        build_related_entities=True,
        unifying_id_field_filter_set=person_id_filter_set,
        state_code=state_code,
    )

    # Get StateProgramAssignments
    program_assignments = pipeline | "Load Program Assignments" >> BuildRootEntity(
        dataset=input_dataset,
        root_entity_class=entities.StateProgramAssignment,
        unifying_id_field=entities.StatePerson.get_class_id_name(),
        build_related_entities=True,
        unifying_id_field_filter_set=person_id_filter_set,
        state_code=state_code,
    )

    # Get StateAssessments
    assessments = pipeline | "Load Assessments" >> BuildRootEntity(
        dataset=input_dataset,
        root_entity_class=entities.StateAssessment,
        unifying_id_field=entities.StatePerson.get_class_id_name(),
        build_related_entities=False,
        unifying_id_field_filter_set=person_id_filter_set,
        state_code=state_code,
    )

    # Get StateSupervisionPeriods
    supervision_periods = pipeline | "Load SupervisionPeriods" >> BuildRootEntity(
        dataset=input_dataset,
        root_entity_class=entities.StateSupervisionPeriod,
        unifying_id_field=entities.StatePerson.get_class_id_name(),
        build_related_entities=False,
        unifying_id_field_filter_set=person_id_filter_set,
        state_code=state_code,
    )

    supervision_period_to_agent_associations_as_kv = (
        pipeline
        | "Load supervision_period_to_agent_associations_as_kv"
        >> ImportTableAsKVTuples(
            dataset_id=reference_dataset,
            table_id=SUPERVISION_PERIOD_TO_AGENT_ASSOCIATION_VIEW_NAME,
            table_key="person_id",
            state_code_filter=state_code,
            person_id_filter_set=person_id_filter_set,
        )
    )

    state_race_ethnicity_population_counts = (
        pipeline
        | "Load state_race_ethnicity_population_counts"
        >> ImportTable(
            dataset_id=static_reference_dataset,
            table_id="state_race_ethnicity_population_counts",
            state_code_filter=state_code,
            person_id_filter_set=None,
        )
    )

    # Group each StatePerson with their other entities
    persons_entities = {
        "person": persons,
        "program_assignments": program_assignments,
        "assessments": assessments,
        "supervision_periods": supervision_periods,
        "supervision_period_to_agent_association": supervision_period_to_agent_associations_as_kv,
    } | "Group StatePerson to StateProgramAssignments and related entities" >> beam.CoGroupByKey()

    # Identify ProgramEvents from the StatePerson's StateProgramAssignments
    person_program_events = persons_entities | "Classify Program Events" >> beam.ParDo(
        ClassifyEvents(), identifier=self.pipeline_config.identifier
    )

    person_metadata = (
        persons
        | "Build the person_metadata dictionary"
        >> beam.ParDo(
            BuildPersonMetadata(),
            state_race_ethnicity_population_counts=AsList(
                state_race_ethnicity_population_counts
            ),
        )
    )

    person_program_events_with_metadata = (
        {
            "person_events": person_program_events,
            "person_metadata": person_metadata,
        }
        | "Group ProgramEvents with person-level metadata" >> beam.CoGroupByKey()
        | "Organize StatePerson, PersonMetadata and ProgramEvents for calculations"
        >> beam.ParDo(ExtractPersonEventsMetadata())
    )

    # Add timestamp for local jobs
    job_timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H_%M_%S.%f")
    all_pipeline_options["job_timestamp"] = job_timestamp

    # Get the type of metric to calculate
    metric_types_set = set(metric_types)

    # Get program metrics
    program_metrics = (
        person_program_events_with_metadata
        | "Get Program Metrics"
        >> GetMetrics(
            pipeline_options=all_pipeline_options,
            pipeline_config=self.pipeline_config,
            metric_types_to_include=metric_types_set,
            calculation_end_month=calculation_end_month,
            calculation_month_count=calculation_month_count,
        )
    )

    return program_metrics
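# Editor's note: execute_pipeline only assembles the metric PCollection; it
# does not open the pipeline or write results. The sketch below shows how a
# driver might call it (hypothetical delegate object and placeholder dataset
# names; the real orchestration lives in the pipeline base classes).
def _example_execute_pipeline(delegate, apache_beam_pipeline_options) -> None:
    import apache_beam as beam

    all_options = apache_beam_pipeline_options.get_all_options()
    with beam.Pipeline(options=apache_beam_pipeline_options) as p:
        metrics = delegate.execute_pipeline(
            pipeline=p,
            all_pipeline_options=all_options,
            state_code="US_XX",                                 # placeholder
            input_dataset="my-project-id.state",                # placeholder
            reference_dataset="my-project-id.reference_views",  # placeholder
            static_reference_dataset="my-project-id.static_reference_tables",
            metric_types=["ALL"],
            person_id_filter_set=None,
        )
        # The returned metrics would then be converted to writable dicts and
        # appended to the BigQuery metric tables, as in the run() functions.
        _ = metrics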