def run(argv=None): """Runs the program calculation pipeline.""" # Workaround to load SQLAlchemy objects at start of pipeline. This is necessary because the BuildRootEntity # function tries to access attributes of relationship properties on the SQLAlchemy room_schema_class before they # have been loaded. However, if *any* SQLAlchemy objects have been instantiated, then the relationship properties # are loaded and their attributes can be successfully accessed. _ = schema.StatePerson() # Parse command-line arguments known_args, pipeline_args = parse_arguments(argv) pipeline_options = PipelineOptions(pipeline_args) pipeline_options.view_as(SetupOptions).save_main_session = True # Get pipeline job details all_pipeline_options = pipeline_options.get_all_options() input_dataset = all_pipeline_options['project'] + '.' + known_args.input reference_dataset = all_pipeline_options['project'] + '.' + \ known_args.reference_input with beam.Pipeline(argv=pipeline_args) as p: # Get StatePersons persons = (p | 'Load Persons' >> BuildRootEntity( dataset=input_dataset, data_dict=None, root_schema_class=schema.StatePerson, root_entity_class=entities.StatePerson, unifying_id_field='person_id', build_related_entities=True)) # Get StateProgramAssignments program_assignments = ( p | 'Load Program Assignments' >> BuildRootEntity( dataset=input_dataset, data_dict=None, root_schema_class=schema.StateProgramAssignment, root_entity_class=entities.StateProgramAssignment, unifying_id_field='person_id', build_related_entities=True)) # Get StateAssessments assessments = (p | 'Load Assessments' >> BuildRootEntity( dataset=input_dataset, data_dict=None, root_schema_class=schema.StateAssessment, root_entity_class=entities.StateAssessment, unifying_id_field='person_id', build_related_entities=False)) # Get StateSupervisionPeriods supervision_periods = ( p | 'Load SupervisionPeriods' >> BuildRootEntity( dataset=input_dataset, data_dict=None, root_schema_class=schema.StateSupervisionPeriod, root_entity_class=entities.StateSupervisionPeriod, unifying_id_field='person_id', build_related_entities=False)) supervision_period_to_agent_association_query = \ f"SELECT * FROM `{reference_dataset}.supervision_period_to_agent_association`" supervision_period_to_agent_associations = ( p | "Read Supervision Period to Agent table from BigQuery" >> beam.io.Read( beam.io.BigQuerySource( query=supervision_period_to_agent_association_query, use_standard_sql=True))) # Convert the association table rows into key-value tuples with the value for the supervision_period_id column # as the key supervision_period_to_agent_associations_as_kv = ( supervision_period_to_agent_associations | 'Convert Supervision Period to Agent table to KV tuples' >> beam.ParDo(ConvertDictToKVTuple(), 'supervision_period_id')) # Group each StatePerson with their other entities persons_entities = ({ 'person': persons, 'program_assignments': program_assignments, 'assessments': assessments, 'supervision_periods': supervision_periods } | 'Group StatePerson to StateProgramAssignments and' >> beam.CoGroupByKey()) # Identify ProgramEvents from the StatePerson's StateProgramAssignments person_program_events = ( persons_entities | beam.ParDo( ClassifyProgramAssignments(), AsDict(supervision_period_to_agent_associations_as_kv))) # Get dimensions to include and methodologies to use inclusions, _ = dimensions_and_methodologies(known_args) # Get pipeline job details for accessing job_id all_pipeline_options = pipeline_options.get_all_options() # The number of months to limit the monthly 
calculation output to calculation_month_limit = known_args.calculation_month_limit # Add timestamp for local jobs job_timestamp = datetime.datetime.now().strftime( '%Y-%m-%d_%H_%M_%S.%f') all_pipeline_options['job_timestamp'] = job_timestamp # Get program metrics program_metrics = ( person_program_events | 'Get Program Metrics' >> GetProgramMetrics( pipeline_options=all_pipeline_options, inclusions=inclusions, calculation_month_limit=calculation_month_limit)) # Convert the metrics into a format that's writable to BQ writable_metrics = ( program_metrics | 'Convert to dict to be written to BQ' >> beam.ParDo( ProgramMetricWritableDict()).with_outputs('referrals')) # Write the metrics to the output tables in BigQuery referrals_table = known_args.output + '.program_referral_metrics' _ = (writable_metrics.referrals | f"Write referral metrics to BQ table: {referrals_table}" >> beam.io.WriteToBigQuery( table=referrals_table, create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER, write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))
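

# For illustration: every pipeline in this module re-keys BigQuery rows with
# ConvertDictToKVTuple before a CoGroupByKey join. The DoFn below is a minimal
# sketch of what such a transform presumably does -- an assumption based on its
# call sites, not the project's actual implementation -- emitting
# (row[key_field], row) tuples keyed on a column such as supervision_period_id.
import apache_beam as beam


class ConvertDictToKVTupleSketch(beam.DoFn):  # hypothetical stand-in
    def process(self, element, key_field):
        # Skip rows that are missing the key column rather than raising.
        if element.get(key_field) is not None:
            yield element[key_field], element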


def run(apache_beam_pipeline_options: PipelineOptions,
        data_input: str,
        reference_input: str,
        output: str,
        metric_types: List[str],
        state_code: Optional[str],
        person_filter_ids: Optional[List[int]]):
    """Runs the recidivism calculation pipeline."""

    # Workaround to load SQLAlchemy objects at start of pipeline. This is
    # necessary because the BuildRootEntity function tries to access attributes
    # of relationship properties on the SQLAlchemy root_schema_class before they
    # have been loaded. However, if *any* SQLAlchemy objects have been
    # instantiated, then the relationship properties are loaded and their
    # attributes can be successfully accessed.
    _ = schema.StatePerson()

    apache_beam_pipeline_options.view_as(SetupOptions).save_main_session = True

    # Get pipeline job details
    all_pipeline_options = apache_beam_pipeline_options.get_all_options()

    query_dataset = all_pipeline_options['project'] + '.' + data_input
    reference_dataset = all_pipeline_options['project'] + '.' + reference_input

    person_id_filter_set = set(person_filter_ids) if person_filter_ids else None

    with beam.Pipeline(options=apache_beam_pipeline_options) as p:
        # Get StatePersons
        persons = (p
                   | 'Load Persons' >> BuildRootEntity(
                       dataset=query_dataset,
                       root_entity_class=entities.StatePerson,
                       unifying_id_field=entities.StatePerson.get_class_id_name(),
                       build_related_entities=True,
                       unifying_id_field_filter_set=person_id_filter_set))

        # Get StateIncarcerationPeriods
        incarceration_periods = (
            p
            | 'Load IncarcerationPeriods' >> BuildRootEntity(
                dataset=query_dataset,
                root_entity_class=entities.StateIncarcerationPeriod,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=True,
                unifying_id_field_filter_set=person_id_filter_set,
                state_code=state_code))

        # Get StateSupervisionViolations
        supervision_violations = (
            p
            | 'Load SupervisionViolations' >> BuildRootEntity(
                dataset=query_dataset,
                root_entity_class=entities.StateSupervisionViolation,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=True,
                unifying_id_field_filter_set=person_id_filter_set,
                state_code=state_code))

        # TODO(2769): Don't bring this in as a root entity
        # Get StateSupervisionViolationResponses
        supervision_violation_responses = (
            p
            | 'Load SupervisionViolationResponses' >> BuildRootEntity(
                dataset=query_dataset,
                root_entity_class=entities.StateSupervisionViolationResponse,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=True,
                unifying_id_field_filter_set=person_id_filter_set,
                state_code=state_code))

        # Group StateSupervisionViolationResponses and
        # StateSupervisionViolations by person_id
        supervision_violations_and_responses = (
            {'violations': supervision_violations,
             'violation_responses': supervision_violation_responses}
            | 'Group StateSupervisionViolationResponses to '
              'StateSupervisionViolations' >> beam.CoGroupByKey())

        # Set the fully hydrated StateSupervisionViolation entities on
        # the corresponding StateSupervisionViolationResponses
        violation_responses_with_hydrated_violations = (
            supervision_violations_and_responses
            | 'Set hydrated StateSupervisionViolations on '
              'the StateSupervisionViolationResponses' >> beam.ParDo(
                  SetViolationOnViolationsResponse()))

        # Group StateIncarcerationPeriods and StateSupervisionViolationResponses
        # by person_id
        incarceration_periods_and_violation_responses = (
            {'incarceration_periods': incarceration_periods,
             'violation_responses': violation_responses_with_hydrated_violations}
            | 'Group StateIncarcerationPeriods to '
              'StateSupervisionViolationResponses' >> beam.CoGroupByKey())

        # Set the fully hydrated StateSupervisionViolationResponse entities on
        # the corresponding StateIncarcerationPeriods
        incarceration_periods_with_source_violations = (
            incarceration_periods_and_violation_responses
            | 'Set hydrated StateSupervisionViolationResponses on '
              'the StateIncarcerationPeriods' >> beam.ParDo(
                  SetViolationResponseOnIncarcerationPeriod()))

        # Group each StatePerson with their StateIncarcerationPeriods
        person_and_incarceration_periods = (
            {'person': persons,
             'incarceration_periods': incarceration_periods_with_source_violations}
            | 'Group StatePerson to StateIncarcerationPeriods' >>
            beam.CoGroupByKey())

        # Bring in the table that associates people and their county of residence
        person_id_to_county_query = select_all_by_person_query(
            reference_dataset,
            PERSONS_TO_RECENT_COUNTY_OF_RESIDENCE_VIEW_NAME,
            # TODO(3602): Once we put state_code on StatePerson objects, we can update the
            #  persons_to_recent_county_of_residence query to have a state_code field, allowing us to also filter the
            #  output by state_code.
            state_code_filter=None,
            person_id_filter_set=person_id_filter_set)

        person_id_to_county_kv = (
            p
            | "Read person_id to county associations from BigQuery" >>
            beam.io.Read(beam.io.BigQuerySource(
                query=person_id_to_county_query,
                use_standard_sql=True))
            | "Convert person_id to county association table to KV" >>
            beam.ParDo(ConvertDictToKVTuple(), 'person_id'))

        # Identify ReleaseEvents from the StatePerson's
        # StateIncarcerationPeriods
        person_events = (
            person_and_incarceration_periods
            | "ClassifyReleaseEvents" >> beam.ParDo(
                ClassifyReleaseEvents(), AsDict(person_id_to_county_kv)))

        # Get pipeline job details for accessing job_id
        all_pipeline_options = apache_beam_pipeline_options.get_all_options()

        # Add timestamp for local jobs
        job_timestamp = datetime.datetime.now().strftime('%Y-%m-%d_%H_%M_%S.%f')
        all_pipeline_options['job_timestamp'] = job_timestamp

        # Get the type of metric to calculate
        metric_types_set = set(metric_types)

        # Get recidivism metrics
        recidivism_metrics = (
            person_events
            | 'Get Recidivism Metrics' >> GetRecidivismMetrics(
                pipeline_options=all_pipeline_options,
                metric_types=metric_types_set))

        if person_id_filter_set:
            logging.warning(
                "Non-empty person filter set - returning before writing metrics.")
            return

        # Convert the metrics into a format that's writable to BQ
        writable_metrics = (
            recidivism_metrics
            | 'Convert to dict to be written to BQ' >> beam.ParDo(
                RecidivismMetricWritableDict()).with_outputs('rates', 'counts'))

        # Write the recidivism metrics to the output tables in BigQuery
        rates_table_id = DATAFLOW_METRICS_TO_TABLES.get(
            ReincarcerationRecidivismRateMetric)
        counts_table_id = DATAFLOW_METRICS_TO_TABLES.get(
            ReincarcerationRecidivismCountMetric)

        _ = (writable_metrics.rates
             | f"Write rate metrics to BQ table: {rates_table_id}" >>
             beam.io.WriteToBigQuery(
                 table=rates_table_id,
                 dataset=output,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
                 method=beam.io.WriteToBigQuery.Method.FILE_LOADS))

        _ = (writable_metrics.counts
             | f"Write count metrics to BQ table: {counts_table_id}" >>
             beam.io.WriteToBigQuery(
                 table=counts_table_id,
                 dataset=output,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
                 method=beam.io.WriteToBigQuery.Method.FILE_LOADS))
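

# Usage sketch for the keyword-argument run() above. All values below are
# hypothetical placeholders (project id, dataset names, metric types); the real
# entry point presumably parses these from command-line flags before delegating
# to run().
from apache_beam.options.pipeline_options import PipelineOptions

if __name__ == '__main__':
    example_options = PipelineOptions([
        '--project=my-gcp-project',  # hypothetical GCP project
        '--runner=DirectRunner',     # run locally while testing
    ])

    run(apache_beam_pipeline_options=example_options,
        data_input='state',                  # hypothetical dataset names
        reference_input='reference_views',
        output='dataflow_metrics',
        metric_types=['REINCARCERATION_RATE', 'REINCARCERATION_COUNT'],
        state_code='US_XX',
        person_filter_ids=None)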


def run(apache_beam_pipeline_options: PipelineOptions,
        data_input: str,
        reference_input: str,
        output: str,
        calculation_month_count: int,
        metric_types: List[str],
        state_code: Optional[str],
        calculation_end_month: Optional[str],
        person_filter_ids: Optional[List[int]]):
    """Runs the program calculation pipeline."""

    # Workaround to load SQLAlchemy objects at start of pipeline. This is necessary because the BuildRootEntity
    # function tries to access attributes of relationship properties on the SQLAlchemy root_schema_class before they
    # have been loaded. However, if *any* SQLAlchemy objects have been instantiated, then the relationship properties
    # are loaded and their attributes can be successfully accessed.
    _ = schema.StatePerson()

    apache_beam_pipeline_options.view_as(SetupOptions).save_main_session = True

    # Get pipeline job details
    all_pipeline_options = apache_beam_pipeline_options.get_all_options()

    input_dataset = all_pipeline_options['project'] + '.' + data_input
    reference_dataset = all_pipeline_options['project'] + '.' + reference_input

    person_id_filter_set = set(person_filter_ids) if person_filter_ids else None

    with beam.Pipeline(options=apache_beam_pipeline_options) as p:
        # Get StatePersons
        persons = (p
                   | 'Load Persons' >> BuildRootEntity(
                       dataset=input_dataset,
                       root_entity_class=entities.StatePerson,
                       unifying_id_field=entities.StatePerson.get_class_id_name(),
                       build_related_entities=True,
                       unifying_id_field_filter_set=person_id_filter_set))

        # Get StateProgramAssignments
        program_assignments = (
            p
            | 'Load Program Assignments' >> BuildRootEntity(
                dataset=input_dataset,
                root_entity_class=entities.StateProgramAssignment,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=True,
                unifying_id_field_filter_set=person_id_filter_set,
                state_code=state_code))

        # Get StateAssessments
        assessments = (p
                       | 'Load Assessments' >> BuildRootEntity(
                           dataset=input_dataset,
                           root_entity_class=entities.StateAssessment,
                           unifying_id_field=entities.StatePerson.get_class_id_name(),
                           build_related_entities=False,
                           unifying_id_field_filter_set=person_id_filter_set,
                           state_code=state_code))

        # Get StateSupervisionPeriods
        supervision_periods = (
            p
            | 'Load SupervisionPeriods' >> BuildRootEntity(
                dataset=input_dataset,
                root_entity_class=entities.StateSupervisionPeriod,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=False,
                unifying_id_field_filter_set=person_id_filter_set,
                state_code=state_code))

        supervision_period_to_agent_association_query = \
            f"SELECT * FROM `{reference_dataset}.supervision_period_to_agent_association`"

        supervision_period_to_agent_associations = (
            p
            | "Read Supervision Period to Agent table from BigQuery" >>
            beam.io.Read(beam.io.BigQuerySource(
                query=supervision_period_to_agent_association_query,
                use_standard_sql=True)))

        # Convert the association table rows into key-value tuples with the value for the supervision_period_id column
        # as the key
        supervision_period_to_agent_associations_as_kv = (
            supervision_period_to_agent_associations
            | 'Convert Supervision Period to Agent table to KV tuples' >>
            beam.ParDo(ConvertDictToKVTuple(), 'supervision_period_id'))

        # Group each StatePerson with their other entities
        persons_entities = (
            {'person': persons,
             'program_assignments': program_assignments,
             'assessments': assessments,
             'supervision_periods': supervision_periods}
            | 'Group StatePerson to StateProgramAssignments and' >>
            beam.CoGroupByKey())

        # Identify ProgramEvents from the StatePerson's StateProgramAssignments
        person_program_events = (
            persons_entities
            | beam.ParDo(
                ClassifyProgramAssignments(),
                AsDict(supervision_period_to_agent_associations_as_kv)))

        # Get pipeline job details for accessing job_id
        all_pipeline_options = apache_beam_pipeline_options.get_all_options()

        # Add timestamp for local jobs
        job_timestamp = datetime.datetime.now().strftime('%Y-%m-%d_%H_%M_%S.%f')
        all_pipeline_options['job_timestamp'] = job_timestamp

        # Get the type of metric to calculate
        metric_types_set = set(metric_types)

        # Get program metrics
        program_metrics = (
            person_program_events
            | 'Get Program Metrics' >> GetProgramMetrics(
                pipeline_options=all_pipeline_options,
                metric_types=metric_types_set,
                calculation_end_month=calculation_end_month,
                calculation_month_count=calculation_month_count))

        if person_id_filter_set:
            logging.warning(
                "Non-empty person filter set - returning before writing metrics.")
            return

        # Convert the metrics into a format that's writable to BQ
        writable_metrics = (
            program_metrics
            | 'Convert to dict to be written to BQ' >> beam.ParDo(
                ProgramMetricWritableDict()).with_outputs('referrals'))

        # Write the metrics to the output tables in BigQuery
        referrals_table = output + '.program_referral_metrics'

        _ = (writable_metrics.referrals
             | f"Write referral metrics to BQ table: {referrals_table}" >>
             beam.io.WriteToBigQuery(
                 table=referrals_table,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))


def run(apache_beam_pipeline_options: PipelineOptions,
        data_input: str,
        reference_view_input: str,
        static_reference_input: str,
        output: str,
        metric_types: List[str],
        state_code: Optional[str],
        person_filter_ids: Optional[List[int]]):
    """Runs the recidivism calculation pipeline."""

    # Workaround to load SQLAlchemy objects at start of pipeline. This is
    # necessary because the BuildRootEntity function tries to access attributes
    # of relationship properties on the SQLAlchemy root_schema_class before they
    # have been loaded. However, if *any* SQLAlchemy objects have been
    # instantiated, then the relationship properties are loaded and their
    # attributes can be successfully accessed.
    _ = schema.StatePerson()

    apache_beam_pipeline_options.view_as(SetupOptions).save_main_session = True

    # Get pipeline job details
    all_pipeline_options = apache_beam_pipeline_options.get_all_options()
    project_id = all_pipeline_options['project']

    input_dataset = project_id + '.' + data_input
    reference_dataset = project_id + '.' + reference_view_input
    static_reference_dataset = project_id + '.' + static_reference_input

    person_id_filter_set = set(person_filter_ids) if person_filter_ids else None

    with beam.Pipeline(options=apache_beam_pipeline_options) as p:
        # Get StatePersons
        persons = (p
                   | 'Load Persons' >> BuildRootEntity(
                       dataset=input_dataset,
                       root_entity_class=entities.StatePerson,
                       unifying_id_field=entities.StatePerson.get_class_id_name(),
                       build_related_entities=True,
                       unifying_id_field_filter_set=person_id_filter_set,
                       state_code=state_code))

        # Get StateIncarcerationPeriods
        incarceration_periods = (
            p
            | 'Load IncarcerationPeriods' >> BuildRootEntity(
                dataset=input_dataset,
                root_entity_class=entities.StateIncarcerationPeriod,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=True,
                unifying_id_field_filter_set=person_id_filter_set,
                state_code=state_code))

        # Get StateSupervisionViolations
        supervision_violations = (
            p
            | 'Load SupervisionViolations' >> BuildRootEntity(
                dataset=input_dataset,
                root_entity_class=entities.StateSupervisionViolation,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=True,
                unifying_id_field_filter_set=person_id_filter_set,
                state_code=state_code))

        # TODO(#2769): Don't bring this in as a root entity
        # Get StateSupervisionViolationResponses
        supervision_violation_responses = (
            p
            | 'Load SupervisionViolationResponses' >> BuildRootEntity(
                dataset=input_dataset,
                root_entity_class=entities.StateSupervisionViolationResponse,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=True,
                unifying_id_field_filter_set=person_id_filter_set,
                state_code=state_code))

        # Group StateSupervisionViolationResponses and
        # StateSupervisionViolations by person_id
        supervision_violations_and_responses = (
            {'violations': supervision_violations,
             'violation_responses': supervision_violation_responses}
            | 'Group StateSupervisionViolationResponses to '
              'StateSupervisionViolations' >> beam.CoGroupByKey())

        # Set the fully hydrated StateSupervisionViolation entities on
        # the corresponding StateSupervisionViolationResponses
        violation_responses_with_hydrated_violations = (
            supervision_violations_and_responses
            | 'Set hydrated StateSupervisionViolations on '
              'the StateSupervisionViolationResponses' >> beam.ParDo(
                  SetViolationOnViolationsResponse()))

        # Group StateIncarcerationPeriods and StateSupervisionViolationResponses
        # by person_id
        incarceration_periods_and_violation_responses = (
            {'incarceration_periods': incarceration_periods,
             'violation_responses': violation_responses_with_hydrated_violations}
            | 'Group StateIncarcerationPeriods to '
              'StateSupervisionViolationResponses' >> beam.CoGroupByKey())

        # Set the fully hydrated StateSupervisionViolationResponse entities on
        # the corresponding StateIncarcerationPeriods
        incarceration_periods_with_source_violations = (
            incarceration_periods_and_violation_responses
            | 'Set hydrated StateSupervisionViolationResponses on '
              'the StateIncarcerationPeriods' >> beam.ParDo(
                  SetViolationResponseOnIncarcerationPeriod()))

        # Group each StatePerson with their StateIncarcerationPeriods
        person_and_incarceration_periods = (
            {'person': persons,
             'incarceration_periods': incarceration_periods_with_source_violations}
            | 'Group StatePerson to StateIncarcerationPeriods' >>
            beam.CoGroupByKey())

        # Bring in the table that associates people and their county of residence
        person_id_to_county_kv = (
            p
            | 'Load person_id_to_county_kv' >> ImportTableAsKVTuples(
                dataset_id=reference_dataset,
                table_id=PERSONS_TO_RECENT_COUNTY_OF_RESIDENCE_VIEW_NAME,
                table_key='person_id',
                state_code_filter=state_code,
                person_id_filter_set=person_id_filter_set))

        state_race_ethnicity_population_counts = (
            p
            | 'Load state_race_ethnicity_population_counts' >> ImportTable(
                dataset_id=static_reference_dataset,
                table_id='state_race_ethnicity_population_counts',
                state_code_filter=state_code,
                person_id_filter_set=None))

        # Identify ReleaseEvents from the StatePerson's StateIncarcerationPeriods
        person_release_events = (
            person_and_incarceration_periods
            | "ClassifyReleaseEvents" >> beam.ParDo(
                ClassifyReleaseEvents(), AsDict(person_id_to_county_kv)))

        person_metadata = (
            persons
            | "Build the person_metadata dictionary" >> beam.ParDo(
                BuildPersonMetadata(),
                AsList(state_race_ethnicity_population_counts)))

        person_release_events_with_metadata = (
            {'person_events': person_release_events,
             'person_metadata': person_metadata}
            | 'Group ReleaseEvents with person-level metadata' >>
            beam.CoGroupByKey()
            | 'Organize StatePerson, PersonMetadata and ReleaseEvents for calculations' >>
            beam.ParDo(ExtractPersonReleaseEventsMetadata()))

        # Get pipeline job details for accessing job_id
        all_pipeline_options = apache_beam_pipeline_options.get_all_options()

        # Add timestamp for local jobs
        job_timestamp = datetime.datetime.now().strftime('%Y-%m-%d_%H_%M_%S.%f')
        all_pipeline_options['job_timestamp'] = job_timestamp

        # Get the type of metric to calculate
        metric_types_set = set(metric_types)

        # Get recidivism metrics
        recidivism_metrics = (
            person_release_events_with_metadata
            | 'Get Recidivism Metrics' >> GetRecidivismMetrics(
                pipeline_options=all_pipeline_options,
                metric_types=metric_types_set))

        if person_id_filter_set:
            logging.warning(
                "Non-empty person filter set - returning before writing metrics.")
            return

        # Convert the metrics into a format that's writable to BQ
        writable_metrics = (
            recidivism_metrics
            | 'Convert to dict to be written to BQ' >> beam.ParDo(
                RecidivizMetricWritableDict()).with_outputs(
                    ReincarcerationRecidivismMetricType.REINCARCERATION_RATE.value,
                    ReincarcerationRecidivismMetricType.REINCARCERATION_COUNT.value))

        # Write the recidivism metrics to the output tables in BigQuery
        rates_table_id = DATAFLOW_METRICS_TO_TABLES.get(
            ReincarcerationRecidivismRateMetric)
        counts_table_id = DATAFLOW_METRICS_TO_TABLES.get(
            ReincarcerationRecidivismCountMetric)

        _ = (writable_metrics.REINCARCERATION_RATE
             | f"Write rate metrics to BQ table: {rates_table_id}" >>
             beam.io.WriteToBigQuery(
                 table=rates_table_id,
                 dataset=output,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
                 method=beam.io.WriteToBigQuery.Method.FILE_LOADS))

        _ = (writable_metrics.REINCARCERATION_COUNT
             | f"Write count metrics to BQ table: {counts_table_id}" >>
             beam.io.WriteToBigQuery(
                 table=counts_table_id,
                 dataset=output,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
                 method=beam.io.WriteToBigQuery.Method.FILE_LOADS))
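

# The hydration steps above all follow the same Beam pattern: key two entity
# PCollections by person_id, CoGroupByKey them, then copy the children onto the
# parents. A self-contained toy version of that pattern, with plain dicts
# standing in for the schema entities:
import apache_beam as beam


def set_violations_on_responses(element):
    _person_id, grouped = element
    for response in grouped['violation_responses']:
        # Attach every hydrated violation for this person to the response.
        response['violations'] = list(grouped['violations'])
        yield response


with beam.Pipeline() as demo:
    violations = demo | 'violations' >> beam.Create([(1, {'violation_id': 10})])
    responses = demo | 'responses' >> beam.Create([(1, {'response_id': 20})])

    _ = ({'violations': violations, 'violation_responses': responses}
         | 'group by person_id' >> beam.CoGroupByKey()
         | 'hydrate' >> beam.FlatMap(set_violations_on_responses)
         | beam.Map(print))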


def run(
    apache_beam_pipeline_options: PipelineOptions,
    data_input: str,
    reference_view_input: str,
    static_reference_input: str,
    output: str,
    calculation_month_count: int,
    metric_types: List[str],
    state_code: str,
    calculation_end_month: Optional[str],
    person_filter_ids: Optional[List[int]],
) -> None:
    """Runs the supervision calculation pipeline."""

    # Workaround to load SQLAlchemy objects at start of pipeline. This is necessary because the BuildRootEntity
    # function tries to access attributes of relationship properties on the SQLAlchemy root_schema_class before they
    # have been loaded. However, if *any* SQLAlchemy objects have been instantiated, then the relationship properties
    # are loaded and their attributes can be successfully accessed.
    _ = schema.StatePerson()

    apache_beam_pipeline_options.view_as(SetupOptions).save_main_session = True

    # Get pipeline job details
    all_pipeline_options = apache_beam_pipeline_options.get_all_options()
    project_id = all_pipeline_options["project"]

    if project_id is None:
        raise ValueError(f"No project set in pipeline options: {all_pipeline_options}")

    if state_code is None:
        raise ValueError("No state_code set for pipeline")

    input_dataset = project_id + "." + data_input
    reference_dataset = project_id + "." + reference_view_input
    static_reference_dataset = project_id + "." + static_reference_input

    person_id_filter_set = set(person_filter_ids) if person_filter_ids else None

    with beam.Pipeline(options=apache_beam_pipeline_options) as p:
        # Get StatePersons
        persons = p | "Load Persons" >> BuildRootEntity(
            dataset=input_dataset,
            root_entity_class=entities.StatePerson,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=True,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code,
        )

        # Get StateIncarcerationPeriods
        incarceration_periods = p | "Load IncarcerationPeriods" >> BuildRootEntity(
            dataset=input_dataset,
            root_entity_class=entities.StateIncarcerationPeriod,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=True,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code,
        )

        # Get StateSupervisionViolations
        supervision_violations = p | "Load SupervisionViolations" >> BuildRootEntity(
            dataset=input_dataset,
            root_entity_class=entities.StateSupervisionViolation,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=True,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code,
        )

        # TODO(#2769): Don't bring this in as a root entity
        # Get StateSupervisionViolationResponses
        supervision_violation_responses = (
            p
            | "Load SupervisionViolationResponses" >> BuildRootEntity(
                dataset=input_dataset,
                root_entity_class=entities.StateSupervisionViolationResponse,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=True,
                unifying_id_field_filter_set=person_id_filter_set,
                state_code=state_code,
            )
        )

        # Get StateSupervisionSentences
        supervision_sentences = p | "Load SupervisionSentences" >> BuildRootEntity(
            dataset=input_dataset,
            root_entity_class=entities.StateSupervisionSentence,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=True,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code,
        )

        # Get StateIncarcerationSentences
        incarceration_sentences = p | "Load IncarcerationSentences" >> BuildRootEntity(
            dataset=input_dataset,
            root_entity_class=entities.StateIncarcerationSentence,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=True,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code,
        )

        # Get StateSupervisionPeriods
        supervision_periods = p | "Load SupervisionPeriods" >> BuildRootEntity(
            dataset=input_dataset,
            root_entity_class=entities.StateSupervisionPeriod,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=True,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code,
        )

        # Get StateAssessments
        assessments = p | "Load Assessments" >> BuildRootEntity(
            dataset=input_dataset,
            root_entity_class=entities.StateAssessment,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=False,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code,
        )

        supervision_contacts = p | "Load StateSupervisionContacts" >> BuildRootEntity(
            dataset=input_dataset,
            root_entity_class=entities.StateSupervisionContact,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=False,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code,
        )

        supervision_period_to_agent_associations_as_kv = (
            p
            | "Load supervision_period_to_agent_associations_as_kv"
            >> ImportTableAsKVTuples(
                dataset_id=reference_dataset,
                table_id=SUPERVISION_PERIOD_TO_AGENT_ASSOCIATION_VIEW_NAME,
                table_key="person_id",
                state_code_filter=state_code,
                person_id_filter_set=person_id_filter_set,
            )
        )

        # Bring in the judicial districts associated with supervision_periods
        sp_to_judicial_district_kv = (
            p
            | "Load sp_to_judicial_district_kv"
            >> ImportTableAsKVTuples(
                dataset_id=reference_dataset,
                table_id=SUPERVISION_PERIOD_JUDICIAL_DISTRICT_ASSOCIATION_VIEW_NAME,
                state_code_filter=state_code,
                person_id_filter_set=person_id_filter_set,
                table_key="person_id",
            )
        )

        state_race_ethnicity_population_counts = (
            p
            | "Load state_race_ethnicity_population_counts"
            >> ImportTable(
                dataset_id=static_reference_dataset,
                table_id="state_race_ethnicity_population_counts",
                state_code_filter=state_code,
                person_id_filter_set=None,
            )
        )

        if state_code == "US_MO":
            # Bring in the reference table that includes sentence status ranking information
            us_mo_sentence_status_query = select_all_by_person_query(
                reference_dataset,
                US_MO_SENTENCE_STATUSES_VIEW_NAME,
                state_code,
                person_id_filter_set,
            )

            us_mo_sentence_statuses = (
                p
                | "Read MO sentence status table from BigQuery"
                >> ReadFromBigQuery(query=us_mo_sentence_status_query)
            )
        else:
            us_mo_sentence_statuses = (
                p
                | f"Generate empty MO statuses list for non-MO state run: {state_code} "
                >> beam.Create([])
            )

        us_mo_sentence_status_rankings_as_kv = (
            us_mo_sentence_statuses
            | "Convert MO sentence status ranking table to KV tuples"
            >> beam.ParDo(ConvertDictToKVTuple(), "person_id")
        )

        sentences_and_statuses = (
            {
                "incarceration_sentences": incarceration_sentences,
                "supervision_sentences": supervision_sentences,
                "sentence_statuses": us_mo_sentence_status_rankings_as_kv,
            }
            | "Group sentences to the sentence statuses for that person"
            >> beam.CoGroupByKey()
        )

        sentences_converted = (
            sentences_and_statuses
            | "Convert to state-specific sentences"
            >> beam.ParDo(ConvertSentencesToStateSpecificType()).with_outputs(
                "incarceration_sentences", "supervision_sentences"
            )
        )

        # Group StateSupervisionViolationResponses and StateSupervisionViolations by person_id
        supervision_violations_and_responses = (
            {
                "violations": supervision_violations,
                "violation_responses": supervision_violation_responses,
            }
            | "Group StateSupervisionViolationResponses to "
            "StateSupervisionViolations" >> beam.CoGroupByKey()
        )

        # Set the fully hydrated StateSupervisionViolation entities on the corresponding
        # StateSupervisionViolationResponses
        violation_responses_with_hydrated_violations = (
            supervision_violations_and_responses
            | "Set hydrated StateSupervisionViolations on "
            "the StateSupervisionViolationResponses"
            >> beam.ParDo(SetViolationOnViolationsResponse())
        )

        # Group StateIncarcerationPeriods and StateSupervisionViolationResponses by person_id
        incarceration_periods_and_violation_responses = (
            {
                "incarceration_periods": incarceration_periods,
                "violation_responses": violation_responses_with_hydrated_violations,
            }
            | "Group StateIncarcerationPeriods to "
            "StateSupervisionViolationResponses" >> beam.CoGroupByKey()
        )

        # Set the fully hydrated StateSupervisionViolationResponse entities on the corresponding
        # StateIncarcerationPeriods
        incarceration_periods_with_source_violations = (
            incarceration_periods_and_violation_responses
            | "Set hydrated StateSupervisionViolationResponses on "
            "the StateIncarcerationPeriods"
            >> beam.ParDo(SetViolationResponseOnIncarcerationPeriod())
        )

        # Group each StatePerson with their related entities
        person_entities = (
            {
                "person": persons,
                "assessments": assessments,
                "incarceration_periods": incarceration_periods_with_source_violations,
                "supervision_periods": supervision_periods,
                "supervision_sentences": sentences_converted.supervision_sentences,
                "incarceration_sentences": sentences_converted.incarceration_sentences,
                "violation_responses": violation_responses_with_hydrated_violations,
                "supervision_contacts": supervision_contacts,
                "supervision_period_judicial_district_association": sp_to_judicial_district_kv,
                "supervision_period_to_agent_association": supervision_period_to_agent_associations_as_kv,
            }
            | "Group StatePerson to all entities" >> beam.CoGroupByKey()
        )

        # Identify SupervisionTimeBuckets from the StatePerson's
        # StateSupervisionSentences and StateIncarcerationPeriods
        person_time_buckets = (
            person_entities
            | "Get SupervisionTimeBuckets"
            >> beam.ParDo(ClassifySupervisionTimeBuckets())
        )

        person_metadata = (
            persons
            | "Build the person_metadata dictionary"
            >> beam.ParDo(
                BuildPersonMetadata(), AsList(state_race_ethnicity_population_counts)
            )
        )

        person_time_buckets_with_metadata = (
            {"person_events": person_time_buckets, "person_metadata": person_metadata}
            | "Group SupervisionTimeBuckets with person-level metadata"
            >> beam.CoGroupByKey()
            | "Organize StatePerson, PersonMetadata and SupervisionTimeBuckets for calculations"
            >> beam.ParDo(ExtractPersonEventsMetadata())
        )

        # Get pipeline job details for accessing job_id
        all_pipeline_options = apache_beam_pipeline_options.get_all_options()

        # Get the type of metric to calculate
        metric_types_set = set(metric_types)

        # Add timestamp for local jobs
        job_timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H_%M_%S.%f")
        all_pipeline_options["job_timestamp"] = job_timestamp

        # Get supervision metrics
        supervision_metrics = (
            person_time_buckets_with_metadata
            | "Get Supervision Metrics"
            >> GetSupervisionMetrics(
                pipeline_options=all_pipeline_options,
                metric_types=metric_types_set,
                calculation_end_month=calculation_end_month,
                calculation_month_count=calculation_month_count,
            )
        )

        if person_id_filter_set:
            logging.warning(
                "Non-empty person filter set - returning before writing metrics."
            )
            return

        # Convert the metrics into a format that's writable to BQ
        writable_metrics = (
            supervision_metrics
            | "Convert to dict to be written to BQ"
            >> beam.ParDo(RecidivizMetricWritableDict()).with_outputs(
                SupervisionMetricType.SUPERVISION_COMPLIANCE.value,
                SupervisionMetricType.SUPERVISION_POPULATION.value,
                SupervisionMetricType.SUPERVISION_REVOCATION.value,
                SupervisionMetricType.SUPERVISION_REVOCATION_ANALYSIS.value,
                SupervisionMetricType.SUPERVISION_START.value,
                SupervisionMetricType.SUPERVISION_SUCCESS.value,
                SupervisionMetricType.SUPERVISION_SUCCESSFUL_SENTENCE_DAYS_SERVED.value,
                SupervisionMetricType.SUPERVISION_TERMINATION.value,
                SupervisionMetricType.SUPERVISION_OUT_OF_STATE_POPULATION.value,
                SupervisionMetricType.SUPERVISION_DOWNGRADE.value,
            )
        )

        terminations_table_id = DATAFLOW_METRICS_TO_TABLES[SupervisionTerminationMetric]
        compliance_table_id = DATAFLOW_METRICS_TO_TABLES[SupervisionCaseComplianceMetric]
        populations_table_id = DATAFLOW_METRICS_TO_TABLES[SupervisionPopulationMetric]
        revocations_table_id = DATAFLOW_METRICS_TO_TABLES[SupervisionRevocationMetric]
        revocation_analysis_table_id = DATAFLOW_METRICS_TO_TABLES[
            SupervisionRevocationAnalysisMetric
        ]
        successes_table_id = DATAFLOW_METRICS_TO_TABLES[SupervisionSuccessMetric]
        successful_sentence_lengths_table_id = DATAFLOW_METRICS_TO_TABLES[
            SuccessfulSupervisionSentenceDaysServedMetric
        ]
        supervision_starts_table_id = DATAFLOW_METRICS_TO_TABLES[SupervisionStartMetric]
        out_of_state_populations_table_id = DATAFLOW_METRICS_TO_TABLES[
            SupervisionOutOfStatePopulationMetric
        ]
        supervision_downgrade_table_id = DATAFLOW_METRICS_TO_TABLES[
            SupervisionDowngradeMetric
        ]

        _ = (
            writable_metrics.SUPERVISION_POPULATION
            | f"Write population metrics to BQ table: {populations_table_id}"
            >> WriteAppendToBigQuery(
                output_table=populations_table_id,
                output_dataset=output,
            )
        )

        _ = (
            writable_metrics.SUPERVISION_OUT_OF_STATE_POPULATION
            | f"Write out of state population metrics to BQ table: "
            f"{out_of_state_populations_table_id}"
            >> WriteAppendToBigQuery(
                output_table=out_of_state_populations_table_id,
                output_dataset=output,
            )
        )

        _ = (
            writable_metrics.SUPERVISION_REVOCATION
            | f"Write revocation metrics to BQ table: {revocations_table_id}"
            >> WriteAppendToBigQuery(
                output_table=revocations_table_id,
                output_dataset=output,
            )
        )

        _ = (
            writable_metrics.SUPERVISION_SUCCESS
            | f"Write success metrics to BQ table: {successes_table_id}"
            >> WriteAppendToBigQuery(
                output_table=successes_table_id,
                output_dataset=output,
            )
        )

        _ = (
            writable_metrics.SUPERVISION_SUCCESSFUL_SENTENCE_DAYS_SERVED
            | f"Write supervision successful sentence length metrics to BQ"
            f" table: {successful_sentence_lengths_table_id}"
            >> WriteAppendToBigQuery(
                output_table=successful_sentence_lengths_table_id,
                output_dataset=output,
            )
        )

        _ = (
            writable_metrics.SUPERVISION_TERMINATION
            | f"Write termination metrics to BQ table: {terminations_table_id}"
            >> WriteAppendToBigQuery(
                output_table=terminations_table_id,
                output_dataset=output,
            )
        )

        _ = (
            writable_metrics.SUPERVISION_REVOCATION_ANALYSIS
            | f"Write revocation analyses metrics to BQ table: "
            f"{revocation_analysis_table_id}"
            >> WriteAppendToBigQuery(
                output_table=revocation_analysis_table_id,
                output_dataset=output,
            )
        )

        _ = (
            writable_metrics.SUPERVISION_COMPLIANCE
            | f"Write compliance metrics to BQ table: {compliance_table_id}"
            >> WriteAppendToBigQuery(
                output_table=compliance_table_id,
                output_dataset=output,
            )
        )

        _ = (
            writable_metrics.SUPERVISION_START
            | f"Write start metrics to BQ table: {supervision_starts_table_id}"
            >> WriteAppendToBigQuery(
                output_table=supervision_starts_table_id,
                output_dataset=output,
            )
        )

        _ = (
            writable_metrics.SUPERVISION_DOWNGRADE
            | f"Write downgrade metrics to BQ table: {supervision_downgrade_table_id}"
            >> WriteAppendToBigQuery(
                output_table=supervision_downgrade_table_id,
                output_dataset=output,
            )
        )
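

# The metric fan-out above relies on beam.ParDo(...).with_outputs(): the DoFn
# tags each element, and every tag becomes an attribute on the result that can
# be routed to its own BigQuery table. A minimal generic illustration of tagged
# outputs (not the project's RecidivizMetricWritableDict):
import apache_beam as beam
from apache_beam import pvalue


class TagByParity(beam.DoFn):
    def process(self, element):
        if element % 2 == 0:
            yield pvalue.TaggedOutput("evens", element)
        else:
            yield pvalue.TaggedOutput("odds", element)


with beam.Pipeline() as demo:
    tagged = (
        demo
        | beam.Create([1, 2, 3, 4])
        | beam.ParDo(TagByParity()).with_outputs("evens", "odds")
    )
    # Each tag is addressable as an attribute, exactly like writable_metrics above.
    _ = tagged.evens | "print evens" >> beam.Map(print)
    _ = tagged.odds | "print odds" >> beam.Map(print)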
def run(argv=None): """Runs the incarceration calculation pipeline.""" # Workaround to load SQLAlchemy objects at start of pipeline. This is necessary because the BuildRootEntity # function tries to access attributes of relationship properties on the SQLAlchemy room_schema_class before they # have been loaded. However, if *any* SQLAlchemy objects have been instantiated, then the relationship properties # are loaded and their attributes can be successfully accessed. _ = schema.StatePerson() # Parse command-line arguments known_args, pipeline_args = parse_arguments(argv) pipeline_options = PipelineOptions(pipeline_args) pipeline_options.view_as(SetupOptions).save_main_session = True # Get pipeline job details all_pipeline_options = pipeline_options.get_all_options() query_dataset = all_pipeline_options['project'] + '.' + known_args.input reference_dataset = all_pipeline_options[ 'project'] + '.' + known_args.reference_input with beam.Pipeline(argv=pipeline_args) as p: # Get StatePersons persons = (p | 'Load StatePersons' >> BuildRootEntity( dataset=query_dataset, data_dict=None, root_schema_class=schema.StatePerson, root_entity_class=entities.StatePerson, unifying_id_field='person_id', build_related_entities=True)) # Get StateSentenceGroups sentence_groups = (p | 'Load StateSentenceGroups' >> BuildRootEntity( dataset=query_dataset, data_dict=None, root_schema_class=schema.StateSentenceGroup, root_entity_class=entities.StateSentenceGroup, unifying_id_field='person_id', build_related_entities=True)) # Get StateIncarcerationSentences incarceration_sentences = ( p | 'Load StateIncarcerationSentences' >> BuildRootEntity( dataset=query_dataset, data_dict=None, root_schema_class=schema.StateIncarcerationSentence, root_entity_class=entities.StateIncarcerationSentence, unifying_id_field='person_id', build_related_entities=True)) # Get StateSupervisionSentences supervision_sentences = ( p | 'Load StateSupervisionSentences' >> BuildRootEntity( dataset=query_dataset, data_dict=None, root_schema_class=schema.StateSupervisionSentence, root_entity_class=entities.StateSupervisionSentence, unifying_id_field='person_id', build_related_entities=True)) sentences_and_sentence_groups = ( { 'sentence_groups': sentence_groups, 'incarceration_sentences': incarceration_sentences, 'supervision_sentences': supervision_sentences } | 'Group sentences to sentence groups' >> beam.CoGroupByKey()) # Set hydrated sentences on the corresponding sentence groups sentence_groups_with_hydrated_sentences = ( sentences_and_sentence_groups | 'Set hydrated sentences on sentence groups' >> beam.ParDo( SetSentencesOnSentenceGroup())) # Group each StatePerson with their related entities person_and_sentence_groups = ( { 'person': persons, 'sentence_groups': sentence_groups_with_hydrated_sentences } | 'Group StatePerson to SentenceGroups' >> beam.CoGroupByKey()) # Bring in the table that associates people and their county of residence person_id_to_county_query = \ f"SELECT * FROM `{reference_dataset}.persons_to_recent_county_of_residence`" person_id_to_county_kv = ( p | "Read person_id to county associations from BigQuery" >> beam.io.Read( beam.io.BigQuerySource(query=person_id_to_county_query, use_standard_sql=True)) | "Convert person_id to county association table to KV" >> beam.ParDo(ConvertDictToKVTuple(), 'person_id')) # Identify IncarcerationEvents events from the StatePerson's StateIncarcerationPeriods person_events = ( person_and_sentence_groups | 'Classify Incarceration Events' >> beam.ParDo( ClassifyIncarcerationEvents(), 
AsDict(person_id_to_county_kv))) # Get dimensions to include and methodologies to use inclusions, _ = dimensions_and_methodologies(known_args) # Get pipeline job details for accessing job_id all_pipeline_options = pipeline_options.get_all_options() # The number of months to limit the monthly calculation output to calculation_month_limit = known_args.calculation_month_limit # Add timestamp for local jobs job_timestamp = datetime.datetime.now().strftime( '%Y-%m-%d_%H_%M_%S.%f') all_pipeline_options['job_timestamp'] = job_timestamp # Get IncarcerationMetrics incarceration_metrics = ( person_events | 'Get Incarceration Metrics' >> GetIncarcerationMetrics( pipeline_options=all_pipeline_options, inclusions=inclusions, calculation_month_limit=calculation_month_limit)) # Convert the metrics into a format that's writable to BQ writable_metrics = ( incarceration_metrics | 'Convert to dict to be written to BQ' >> beam.ParDo(IncarcerationMetricWritableDict()).with_outputs( 'admissions', 'populations', 'releases')) # Write the metrics to the output tables in BigQuery admissions_table = known_args.output + '.incarceration_admission_metrics' population_table = known_args.output + '.incarceration_population_metrics' releases_table = known_args.output + '.incarceration_release_metrics' _ = (writable_metrics.admissions | f"Write admission metrics to BQ table: {admissions_table}" >> beam.io.WriteToBigQuery( table=admissions_table, create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER, write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND)) _ = (writable_metrics.populations | f"Write population metrics to BQ table: {population_table}" >> beam.io.WriteToBigQuery( table=population_table, create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER, write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND)) _ = (writable_metrics.releases | f"Write release metrics to BQ table: {releases_table}" >> beam.io.WriteToBigQuery( table=releases_table, create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER, write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))
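

# The newer pipelines replace inline f-string queries like the one above with a
# select_all_by_person_query() helper. The sketch below is a guess at what such
# a helper does, inferred only from its call sites (dataset, view name, optional
# state_code filter, optional person_id filter set); it is not the project's
# actual implementation.
from typing import Optional, Set


def select_all_by_person_query_sketch(dataset: str,
                                      table: str,
                                      state_code_filter: Optional[str],
                                      person_id_filter_set: Optional[Set[int]]) -> str:
    """Builds a SELECT * query over a reference table, optionally filtered."""
    query = f"SELECT * FROM `{dataset}.{table}`"
    clauses = []
    if state_code_filter:
        clauses.append(f"state_code = '{state_code_filter}'")
    if person_id_filter_set:
        ids = ', '.join(str(person_id) for person_id in sorted(person_id_filter_set))
        clauses.append(f"person_id IN ({ids})")
    if clauses:
        query += ' WHERE ' + ' AND '.join(clauses)
    return query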
def run(argv=None): """Runs the supervision calculation pipeline.""" # Workaround to load SQLAlchemy objects at start of pipeline. This is necessary because the BuildRootEntity # function tries to access attributes of relationship properties on the SQLAlchemy room_schema_class before they # have been loaded. However, if *any* SQLAlchemy objects have been instantiated, then the relationship properties # are loaded and their attributes can be successfully accessed. _ = schema.StatePerson() # Parse command-line arguments known_args, pipeline_args = parse_arguments(argv) pipeline_options = PipelineOptions(pipeline_args) pipeline_options.view_as(SetupOptions).save_main_session = True # Get pipeline job details all_pipeline_options = pipeline_options.get_all_options() input_dataset = all_pipeline_options['project'] + '.' + known_args.input reference_dataset = all_pipeline_options['project'] + '.' + \ known_args.reference_input with beam.Pipeline(argv=pipeline_args) as p: # Get StatePersons persons = (p | 'Load Persons' >> BuildRootEntity( dataset=input_dataset, data_dict=None, root_schema_class=schema.StatePerson, root_entity_class=entities.StatePerson, unifying_id_field='person_id', build_related_entities=True)) # Get StateIncarcerationPeriods incarceration_periods = ( p | 'Load IncarcerationPeriods' >> BuildRootEntity( dataset=input_dataset, data_dict=None, root_schema_class=schema.StateIncarcerationPeriod, root_entity_class=entities.StateIncarcerationPeriod, unifying_id_field='person_id', build_related_entities=True)) # Get StateSupervisionViolations supervision_violations = ( p | 'Load SupervisionViolations' >> BuildRootEntity( dataset=input_dataset, data_dict=None, root_schema_class=schema.StateSupervisionViolation, root_entity_class=entities.StateSupervisionViolation, unifying_id_field='person_id', build_related_entities=True)) # TODO(2769): Don't bring this in as a root entity # Get StateSupervisionViolationResponses supervision_violation_responses = ( p | 'Load SupervisionViolationResponses' >> BuildRootEntity( dataset=input_dataset, data_dict=None, root_schema_class=schema.StateSupervisionViolationResponse, root_entity_class=entities.StateSupervisionViolationResponse, unifying_id_field='person_id', build_related_entities=True)) # Get StateSupervisionSentences supervision_sentences = ( p | 'Load SupervisionSentences' >> BuildRootEntity( dataset=input_dataset, data_dict=None, root_schema_class=schema.StateSupervisionSentence, root_entity_class=entities.StateSupervisionSentence, unifying_id_field='person_id', build_related_entities=True)) # Get StateIncarcerationSentences incarceration_sentences = ( p | 'Load IncarcerationSentences' >> BuildRootEntity( dataset=input_dataset, data_dict=None, root_schema_class=schema.StateIncarcerationSentence, root_entity_class=entities.StateIncarcerationSentence, unifying_id_field='person_id', build_related_entities=True)) # Get StateSupervisionPeriods supervision_periods = ( p | 'Load SupervisionPeriods' >> BuildRootEntity( dataset=input_dataset, data_dict=None, root_schema_class=schema.StateSupervisionPeriod, root_entity_class=entities.StateSupervisionPeriod, unifying_id_field='person_id', build_related_entities=True)) # Get StateAssessments assessments = (p | 'Load Assessments' >> BuildRootEntity( dataset=input_dataset, data_dict=None, root_schema_class=schema.StateAssessment, root_entity_class=entities.StateAssessment, unifying_id_field='person_id', build_related_entities=False)) # Bring in the table that associates StateSupervisionViolationResponses 
to information about StateAgents ssvr_to_agent_association_query = f"SELECT * FROM `{reference_dataset}.ssvr_to_agent_association`" ssvr_to_agent_associations = ( p | "Read SSVR to Agent table from BigQuery" >> beam.io.Read( beam.io.BigQuerySource(query=ssvr_to_agent_association_query, use_standard_sql=True))) # Convert the association table rows into key-value tuples with the value for the # supervision_violation_response_id column as the key ssvr_agent_associations_as_kv = ( ssvr_to_agent_associations | 'Convert SSVR to Agent table to KV tuples' >> beam.ParDo( ConvertDictToKVTuple(), 'supervision_violation_response_id')) supervision_period_to_agent_association_query = f"SELECT * FROM `{reference_dataset}." \ f"supervision_period_to_agent_association`" supervision_period_to_agent_associations = ( p | "Read Supervision Period to Agent table from BigQuery" >> beam.io.Read( beam.io.BigQuerySource( query=supervision_period_to_agent_association_query, use_standard_sql=True))) # Convert the association table rows into key-value tuples with the value for the supervision_period_id column # as the key supervision_period_to_agent_associations_as_kv = ( supervision_period_to_agent_associations | 'Convert Supervision Period to Agent table to KV tuples' >> beam.ParDo(ConvertDictToKVTuple(), 'supervision_period_id')) # Group StateSupervisionViolationResponses and StateSupervisionViolations by person_id supervision_violations_and_responses = ( { 'violations': supervision_violations, 'violation_responses': supervision_violation_responses } | 'Group StateSupervisionViolationResponses to ' 'StateSupervisionViolations' >> beam.CoGroupByKey()) # Set the fully hydrated StateSupervisionViolation entities on the corresponding # StateSupervisionViolationResponses violation_responses_with_hydrated_violations = ( supervision_violations_and_responses | 'Set hydrated StateSupervisionViolations on ' 'the StateSupervisionViolationResponses' >> beam.ParDo( SetViolationOnViolationsResponse())) # Group StateIncarcerationPeriods and StateSupervisionViolationResponses by person_id incarceration_periods_and_violation_responses = ( { 'incarceration_periods': incarceration_periods, 'violation_responses': violation_responses_with_hydrated_violations } | 'Group StateIncarcerationPeriods to ' 'StateSupervisionViolationResponses' >> beam.CoGroupByKey()) # Set the fully hydrated StateSupervisionViolationResponse entities on the corresponding # StateIncarcerationPeriods incarceration_periods_with_source_violations = ( incarceration_periods_and_violation_responses | 'Set hydrated StateSupervisionViolationResponses on ' 'the StateIncarcerationPeriods' >> beam.ParDo( SetViolationResponseOnIncarcerationPeriod())) # Group each StatePerson with their StateIncarcerationPeriods and StateSupervisionSentences person_periods_and_sentences = ( { 'person': persons, 'assessments': assessments, 'incarceration_periods': incarceration_periods_with_source_violations, 'supervision_periods': supervision_periods, 'supervision_sentences': supervision_sentences, 'incarceration_sentences': incarceration_sentences, 'violation_responses': violation_responses_with_hydrated_violations } | 'Group StatePerson to all entities' >> beam.CoGroupByKey()) # The state_code to run calculations on state_code = known_args.state_code identifier_options = {'state_code': state_code} # Identify SupervisionTimeBuckets from the StatePerson's StateSupervisionSentences and StateIncarcerationPeriods person_time_buckets = ( person_periods_and_sentences | 'Get 
SupervisionTimeBuckets' >> beam.ParDo( ClassifySupervisionTimeBuckets(), AsDict(ssvr_agent_associations_as_kv), AsDict(supervision_period_to_agent_associations_as_kv), ** identifier_options)) # Get dimensions to include and methodologies to use inclusions, _ = dimensions_and_methodologies(known_args) # Get pipeline job details for accessing job_id all_pipeline_options = pipeline_options.get_all_options() # Get the type of metric to calculate metric_type = known_args.metric_type # The number of months to limit the monthly calculation output to calculation_month_limit = known_args.calculation_month_limit # Add timestamp for local jobs job_timestamp = datetime.datetime.now().strftime( '%Y-%m-%d_%H_%M_%S.%f') all_pipeline_options['job_timestamp'] = job_timestamp # Get supervision metrics supervision_metrics = ( person_time_buckets | 'Get Supervision Metrics' >> GetSupervisionMetrics( pipeline_options=all_pipeline_options, inclusions=inclusions, metric_type=metric_type, calculation_month_limit=calculation_month_limit)) # Convert the metrics into a format that's writable to BQ writable_metrics = ( supervision_metrics | 'Convert to dict to be written to BQ' >> beam.ParDo(SupervisionMetricWritableDict()).with_outputs( 'populations', 'revocations', 'successes', 'assessment_changes', 'revocation_analyses', 'revocation_violation_type_analyses')) # Write the metrics to the output tables in BigQuery populations_table = known_args.output + '.supervision_population_metrics' revocations_table = known_args.output + '.supervision_revocation_metrics' successes_table = known_args.output + '.supervision_success_metrics' assessment_changes_table = known_args.output + '.terminated_supervision_assessment_score_change_metrics' revocation_analysis_table = known_args.output + '.supervision_revocation_analysis_metrics' revocation_violation_type_analysis_table = known_args.output + \ '.supervision_revocation_violation_type_analysis_metrics' _ = (writable_metrics.populations | f"Write population metrics to BQ table: {populations_table}" >> beam.io.WriteToBigQuery( table=populations_table, create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER, write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND)) _ = (writable_metrics.revocations | f"Write revocation metrics to BQ table: {revocations_table}" >> beam.io.WriteToBigQuery( table=revocations_table, create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER, write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND)) _ = (writable_metrics.successes | f"Write success metrics to BQ table: {successes_table}" >> beam.io.WriteToBigQuery( table=successes_table, create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER, write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND)) _ = ( writable_metrics.assessment_changes | f"Write assessment change metrics to BQ table: {assessment_changes_table}" >> beam.io.WriteToBigQuery( table=assessment_changes_table, create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER, write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND)) _ = ( writable_metrics.revocation_analyses | f"Write revocation analyses metrics to BQ table: {revocation_analysis_table}" >> beam.io.WriteToBigQuery( table=revocation_analysis_table, create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER, write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND)) _ = (writable_metrics.revocation_violation_type_analyses | f"Write revocation violation type analyses metrics to BQ table: " f"{revocation_violation_type_analysis_table}" >> 
beam.io.WriteToBigQuery( table=revocation_violation_type_analysis_table, create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER, write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))
def run(argv=None): """Runs the recidivism calculation pipeline.""" # Workaround to load SQLAlchemy objects at start of pipeline. This is # necessary because the BuildRootEntity function tries to access attributes # of relationship properties on the SQLAlchemy root_schema_class before they # have been loaded. However, if *any* SQLAlchemy objects have been # instantiated, then the relationship properties are loaded and their # attributes can be successfully accessed. _ = schema.StatePerson() # Parse command-line arguments known_args, pipeline_args = parse_arguments(argv) pipeline_options = PipelineOptions(pipeline_args) pipeline_options.view_as(SetupOptions).save_main_session = True # Get pipeline job details all_pipeline_options = pipeline_options.get_all_options() query_dataset = all_pipeline_options['project'] + '.' + known_args.input reference_dataset = all_pipeline_options['project'] + '.' + \ known_args.reference_input with beam.Pipeline(argv=pipeline_args) as p: # Get StatePersons persons = (p | 'Load Persons' >> BuildRootEntity( dataset=query_dataset, data_dict=None, root_schema_class=schema.StatePerson, root_entity_class=entities.StatePerson, unifying_id_field='person_id', build_related_entities=True)) # Get StateIncarcerationPeriods incarceration_periods = ( p | 'Load IncarcerationPeriods' >> BuildRootEntity( dataset=query_dataset, data_dict=None, root_schema_class=schema.StateIncarcerationPeriod, root_entity_class=entities.StateIncarcerationPeriod, unifying_id_field='person_id', build_related_entities=True)) # Get StateSupervisionViolations supervision_violations = \ (p | 'Load SupervisionViolations' >> BuildRootEntity( dataset=query_dataset, data_dict=None, root_schema_class=schema.StateSupervisionViolation, root_entity_class=entities.StateSupervisionViolation, unifying_id_field='person_id', build_related_entities=True )) # TODO(#2769): Don't bring this in as a root entity # Get StateSupervisionViolationResponses supervision_violation_responses = \ (p | 'Load SupervisionViolationResponses' >> BuildRootEntity( dataset=query_dataset, data_dict=None, root_schema_class=schema.StateSupervisionViolationResponse, root_entity_class=entities.StateSupervisionViolationResponse, unifying_id_field='person_id', build_related_entities=True )) # Group StateSupervisionViolationResponses and # StateSupervisionViolations by person_id supervision_violations_and_responses = ( { 'violations': supervision_violations, 'violation_responses': supervision_violation_responses } | 'Group StateSupervisionViolationResponses to ' 'StateSupervisionViolations' >> beam.CoGroupByKey()) # Set the fully hydrated StateSupervisionViolation entities on # the corresponding StateSupervisionViolationResponses violation_responses_with_hydrated_violations = ( supervision_violations_and_responses | 'Set hydrated StateSupervisionViolations on ' 'the StateSupervisionViolationResponses' >> beam.ParDo( SetViolationOnViolationsResponse())) # Group StateIncarcerationPeriods and StateSupervisionViolationResponses # by person_id incarceration_periods_and_violation_responses = ( { 'incarceration_periods': incarceration_periods, 'violation_responses': violation_responses_with_hydrated_violations } | 'Group StateIncarcerationPeriods to ' 'StateSupervisionViolationResponses' >> beam.CoGroupByKey()) # Set the fully hydrated StateSupervisionViolationResponse entities on # the corresponding StateIncarcerationPeriods incarceration_periods_with_source_violations = ( incarceration_periods_and_violation_responses | 'Set hydrated
StateSupervisionViolationResponses on ' 'the StateIncarcerationPeriods' >> beam.ParDo( SetViolationResponseOnIncarcerationPeriod())) # Group each StatePerson with their StateIncarcerationPeriods person_and_incarceration_periods = ( { 'person': persons, 'incarceration_periods': incarceration_periods_with_source_violations } | 'Group StatePerson to StateIncarcerationPeriods' >> beam.CoGroupByKey()) # Bring in the table that associates people and their county of # residence person_id_to_county_query = \ f"SELECT * FROM " \ f"`{reference_dataset}.persons_to_recent_county_of_residence`" person_id_to_county_kv = ( p | "Read person_id to county associations from BigQuery" >> beam.io.Read( beam.io.BigQuerySource(query=person_id_to_county_query, use_standard_sql=True)) | "Convert person_id to county association table to KV" >> beam.ParDo(ConvertDictToKVTuple(), 'person_id')) # Identify ReleaseEvents from the StatePerson's # StateIncarcerationPeriods person_events = ( person_and_incarceration_periods | "ClassifyReleaseEvents" >> beam.ParDo( ClassifyReleaseEvents(), AsDict(person_id_to_county_kv))) # Get dimensions to include and methodologies to use inclusions, methodologies = dimensions_and_methodologies(known_args) # Get pipeline job details for accessing job_id all_pipeline_options = pipeline_options.get_all_options() # Add timestamp for local jobs job_timestamp = datetime.datetime.now().strftime( '%Y-%m-%d_%H_%M_%S.%f') all_pipeline_options['job_timestamp'] = job_timestamp # Get recidivism metrics recidivism_metrics = ( person_events | 'Get Recidivism Metrics' >> GetRecidivismMetrics( pipeline_options=all_pipeline_options, inclusions=inclusions)) filter_metrics_kwargs = {'methodologies': methodologies} # Filter out unneeded metrics final_recidivism_metrics = ( recidivism_metrics | 'Filter out unwanted metrics' >> beam.ParDo( FilterMetrics(), **filter_metrics_kwargs)) # Convert the metrics into a format that's writable to BQ writable_metrics = ( final_recidivism_metrics | 'Convert to dict to be written to BQ' >> beam.ParDo( RecidivismMetricWritableDict()).with_outputs( 'rates', 'counts', 'liberties')) # Write the recidivism metrics to the output tables in BigQuery rates_table = known_args.output + '.recidivism_rate_metrics' counts_table = known_args.output + '.recidivism_count_metrics' liberty_table = known_args.output + '.recidivism_liberty_metrics' _ = (writable_metrics.rates | f"Write rate metrics to BQ table: {rates_table}" >> beam.io.WriteToBigQuery( table=rates_table, create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER, write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND)) _ = (writable_metrics.counts | f"Write count metrics to BQ table: {counts_table}" >> beam.io.WriteToBigQuery( table=counts_table, create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER, write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND)) _ = (writable_metrics.liberties | f"Write liberty metrics to BQ table: {liberty_table}" >> beam.io.WriteToBigQuery( table=liberty_table, create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER, write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))
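# A hedged sketch of what a DoFn like ConvertDictToKVTuple is assumed to do: re-key each
# BigQuery row dict by one of its columns (passed as a side argument, e.g. 'person_id')
# so the rows can join the person-keyed collections in a CoGroupByKey. This is an
# illustration, not the production implementation.
import apache_beam as beam


class _ConvertDictToKVTupleSketch(beam.DoFn):
    """Hypothetical stand-in that emits (row[key_field], row) tuples."""

    def process(self, element, key_field):
        # Re-key the row by the requested column so downstream joins line up,
        # mirroring the usage beam.ParDo(ConvertDictToKVTuple(), 'person_id') above.
        yield element[key_field], element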
def execute_pipeline( self, pipeline: beam.Pipeline, all_pipeline_options: Dict[str, Any], state_code: str, input_dataset: str, _reference_dataset: str, static_reference_dataset: str, metric_types: List[str], person_id_filter_set: Optional[Set[int]], calculation_month_count: int = -1, calculation_end_month: Optional[str] = None, ) -> beam.Pipeline: # Get StatePersons persons = pipeline | "Load Persons" >> BuildRootEntity( dataset=input_dataset, root_entity_class=entities.StatePerson, unifying_id_field=entities.StatePerson.get_class_id_name(), build_related_entities=True, unifying_id_field_filter_set=person_id_filter_set, state_code=state_code, ) # Get StateSupervisionViolations supervision_violations = ( pipeline | "Load SupervisionViolations" >> BuildRootEntity( dataset=input_dataset, root_entity_class=entities.StateSupervisionViolation, unifying_id_field=entities.StatePerson.get_class_id_name(), build_related_entities=True, unifying_id_field_filter_set=person_id_filter_set, state_code=state_code, ) ) # Get StateSupervisionViolationResponses supervision_violation_responses = ( pipeline | "Load SupervisionViolationResponses" >> BuildRootEntity( dataset=input_dataset, root_entity_class=entities.StateSupervisionViolationResponse, unifying_id_field=entities.StatePerson.get_class_id_name(), build_related_entities=True, unifying_id_field_filter_set=person_id_filter_set, state_code=state_code, ) ) state_race_ethnicity_population_counts = ( pipeline | "Load state_race_ethnicity_population_counts" >> ImportTable( dataset_id=static_reference_dataset, table_id="state_race_ethnicity_population_counts", state_code_filter=state_code, person_id_filter_set=None, ) ) # Group StateSupervisionViolationResponses and StateSupervisionViolations by person_id supervision_violations_and_responses = ( { "violations": supervision_violations, "violation_responses": supervision_violation_responses, } | "Group StateSupervisionViolationResponses to StateSupervisionViolations" >> beam.CoGroupByKey() ) violations_with_hydrated_violation_responses = ( supervision_violations_and_responses | "Set hydrated StateSupervisionViolationResponses on the StateSupervisionViolations" >> beam.ParDo(SetViolationResponsesOntoViolations()) ) person_entities = { "person": persons, "violations": violations_with_hydrated_violation_responses, } | "Group StatePerson to violation entities" >> beam.CoGroupByKey() person_violation_events = person_entities | "Get ViolationEvents" >> beam.ParDo( ClassifyEvents(), identifier=self.pipeline_config.identifier ) person_metadata = ( persons | "Build the person_metadata dictionary" >> beam.ParDo( BuildPersonMetadata(), state_race_ethnicity_population_counts=AsList( state_race_ethnicity_population_counts ), ) ) person_violation_events_with_metadata = ( { "person_events": person_violation_events, "person_metadata": person_metadata, } | "Group ViolationEvents with person-level metadata" >> beam.CoGroupByKey() | "Organize StatePerson, PersonMetadata and ViolationEvents for calculations" >> beam.ParDo(ExtractPersonEventsMetadata()) ) metric_types_set = set(metric_types) job_timestamp = datetime.now().strftime("%Y-%m-%d_%H_%M_%S.%f") all_pipeline_options["job_timestamp"] = job_timestamp # Get violation metrics violation_metrics = ( person_violation_events_with_metadata | "Get Violation Metrics" >> GetMetrics( pipeline_options=all_pipeline_options, pipeline_config=self.pipeline_config, metric_types_to_include=metric_types_set, calculation_end_month=calculation_end_month, 
calculation_month_count=calculation_month_count, ) ) return violation_metrics
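# For orientation: the dict-of-PCollections CoGroupByKey pattern used throughout these
# pipelines produces, per key, a dict of iterables keyed by the same tags. A
# self-contained toy run on the DirectRunner showing the output shape; the data is made up.
import apache_beam as beam


def _cogroup_shape_sketch():
    with beam.Pipeline() as toy_pipeline:
        violations = toy_pipeline | 'toy violations' >> beam.Create([(12345, 'violation_a')])
        responses = toy_pipeline | 'toy responses' >> beam.Create([(12345, 'response_a')])
        _ = ({'violations': violations, 'violation_responses': responses}
             | beam.CoGroupByKey()
             # Roughly prints: (12345, {'violations': ['violation_a'],
             #                          'violation_responses': ['response_a']})
             | beam.Map(print))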
def run( apache_beam_pipeline_options: PipelineOptions, data_input: str, reference_view_input: str, static_reference_input: str, output: str, metric_types: List[str], state_code: str, person_filter_ids: Optional[List[int]], ): """Runs the recidivism calculation pipeline.""" # Workaround to load SQLAlchemy objects at start of pipeline. This is # necessary because the BuildRootEntity function tries to access attributes # of relationship properties on the SQLAlchemy root_schema_class before they # have been loaded. However, if *any* SQLAlchemy objects have been # instantiated, then the relationship properties are loaded and their # attributes can be successfully accessed. _ = schema.StatePerson() apache_beam_pipeline_options.view_as(SetupOptions).save_main_session = True # Get pipeline job details all_pipeline_options = apache_beam_pipeline_options.get_all_options() project_id = all_pipeline_options["project"] if project_id is None: raise ValueError(f"No project set in pipeline options: {all_pipeline_options}") if state_code is None: raise ValueError("No state_code set for pipeline") input_dataset = project_id + "." + data_input reference_dataset = project_id + "." + reference_view_input static_reference_dataset = project_id + "." + static_reference_input person_id_filter_set = set(person_filter_ids) if person_filter_ids else None with beam.Pipeline(options=apache_beam_pipeline_options) as p: # Get StatePersons persons = p | "Load Persons" >> BuildRootEntity( dataset=input_dataset, root_entity_class=entities.StatePerson, unifying_id_field=entities.StatePerson.get_class_id_name(), build_related_entities=True, unifying_id_field_filter_set=person_id_filter_set, state_code=state_code, ) # Get StateIncarcerationPeriods incarceration_periods = p | "Load IncarcerationPeriods" >> BuildRootEntity( dataset=input_dataset, root_entity_class=entities.StateIncarcerationPeriod, unifying_id_field=entities.StatePerson.get_class_id_name(), build_related_entities=True, unifying_id_field_filter_set=person_id_filter_set, state_code=state_code, ) # Get StateSupervisionViolations supervision_violations = p | "Load SupervisionViolations" >> BuildRootEntity( dataset=input_dataset, root_entity_class=entities.StateSupervisionViolation, unifying_id_field=entities.StatePerson.get_class_id_name(), build_related_entities=True, unifying_id_field_filter_set=person_id_filter_set, state_code=state_code, ) # TODO(#2769): Don't bring this in as a root entity # Get StateSupervisionViolationResponses supervision_violation_responses = ( p | "Load SupervisionViolationResponses" >> BuildRootEntity( dataset=input_dataset, root_entity_class=entities.StateSupervisionViolationResponse, unifying_id_field=entities.StatePerson.get_class_id_name(), build_related_entities=True, unifying_id_field_filter_set=person_id_filter_set, state_code=state_code, ) ) # Group StateSupervisionViolationResponses and # StateSupervisionViolations by person_id supervision_violations_and_responses = ( { "violations": supervision_violations, "violation_responses": supervision_violation_responses, } | "Group StateSupervisionViolationResponses to " "StateSupervisionViolations" >> beam.CoGroupByKey() ) # Set the fully hydrated StateSupervisionViolation entities on # the corresponding StateSupervisionViolationResponses violation_responses_with_hydrated_violations = ( supervision_violations_and_responses | "Set hydrated StateSupervisionViolations on " "the StateSupervisionViolationResponses" >> beam.ParDo(SetViolationOnViolationsResponse()) ) # Group
StateIncarcerationPeriods and StateSupervisionViolationResponses # by person_id incarceration_periods_and_violation_responses = ( { "incarceration_periods": incarceration_periods, "violation_responses": violation_responses_with_hydrated_violations, } | "Group StateIncarcerationPeriods to " "StateSupervisionViolationResponses" >> beam.CoGroupByKey() ) # Set the fully hydrated StateSupervisionViolationResponse entities on # the corresponding StateIncarcerationPeriods incarceration_periods_with_source_violations = ( incarceration_periods_and_violation_responses | "Set hydrated StateSupervisionViolationResponses on " "the StateIncarcerationPeriods" >> beam.ParDo(SetViolationResponseOnIncarcerationPeriod()) ) # Bring in the table that associates people and their county of residence person_id_to_county_kv = ( p | "Load person_id_to_county_kv" >> ImportTableAsKVTuples( dataset_id=reference_dataset, table_id=PERSONS_TO_RECENT_COUNTY_OF_RESIDENCE_VIEW_NAME, table_key="person_id", state_code_filter=state_code, person_id_filter_set=person_id_filter_set, ) ) # Group each StatePerson with their StateIncarcerationPeriods person_entities = { "person": persons, "incarceration_periods": incarceration_periods_with_source_violations, "persons_to_recent_county_of_residence": person_id_to_county_kv, } | "Group StatePerson to StateIncarcerationPeriods" >> beam.CoGroupByKey() state_race_ethnicity_population_counts = ( p | "Load state_race_ethnicity_population_counts" >> ImportTable( dataset_id=static_reference_dataset, table_id="state_race_ethnicity_population_counts", state_code_filter=state_code, person_id_filter_set=None, ) ) # Identify ReleaseEvents from the StatePerson's StateIncarcerationPeriods person_release_events = person_entities | "ClassifyReleaseEvents" >> beam.ParDo( ClassifyReleaseEvents() ) person_metadata = ( persons | "Build the person_metadata dictionary" >> beam.ParDo( BuildPersonMetadata(), AsList(state_race_ethnicity_population_counts) ) ) person_release_events_with_metadata = ( {"person_events": person_release_events, "person_metadata": person_metadata} | "Group ReleaseEvents with person-level metadata" >> beam.CoGroupByKey() | "Organize StatePerson, PersonMetadata and ReleaseEvents for calculations" >> beam.ParDo(ExtractPersonReleaseEventsMetadata()) ) # Get pipeline job details for accessing job_id all_pipeline_options = apache_beam_pipeline_options.get_all_options() # Add timestamp for local jobs job_timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H_%M_%S.%f") all_pipeline_options["job_timestamp"] = job_timestamp # Get the type of metric to calculate metric_types_set = set(metric_types) # Get recidivism metrics recidivism_metrics = ( person_release_events_with_metadata | "Get Recidivism Metrics" >> GetRecidivismMetrics( pipeline_options=all_pipeline_options, metric_types=metric_types_set ) ) if person_id_filter_set: logging.warning( "Non-empty person filter set - returning before writing metrics."
) return # Convert the metrics into a format that's writable to BQ writable_metrics = ( recidivism_metrics | "Convert to dict to be written to BQ" >> beam.ParDo(RecidivizMetricWritableDict()).with_outputs( ReincarcerationRecidivismMetricType.REINCARCERATION_RATE.value, ReincarcerationRecidivismMetricType.REINCARCERATION_COUNT.value, ) ) # Write the recidivism metrics to the output tables in BigQuery rates_table_id = DATAFLOW_METRICS_TO_TABLES[ReincarcerationRecidivismRateMetric] counts_table_id = DATAFLOW_METRICS_TO_TABLES[ ReincarcerationRecidivismCountMetric ] _ = ( writable_metrics.REINCARCERATION_RATE | f"Write rate metrics to BQ table: {rates_table_id}" >> WriteAppendToBigQuery( output_table=rates_table_id, output_dataset=output, ) ) _ = ( writable_metrics.REINCARCERATION_COUNT | f"Write count metrics to BQ table: {counts_table_id}" >> WriteAppendToBigQuery( output_table=counts_table_id, output_dataset=output, ) )
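# WriteAppendToBigQuery is this codebase's own transform, not a Beam builtin. A plausible
# sketch of what it is assumed to wrap, inferred from the older pipelines above that call
# beam.io.WriteToBigQuery directly; the class name here is hypothetical and only the
# Beam calls are standard.
import apache_beam as beam


class _WriteAppendToBigQuerySketch(beam.PTransform):
    """Appends rows to an existing table in the given output dataset."""

    def __init__(self, output_table, output_dataset):
        super().__init__()
        self._destination = f'{output_dataset}.{output_table}'

    def expand(self, pcoll):
        return pcoll | beam.io.WriteToBigQuery(
            table=self._destination,
            create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
            write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND)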
def run(apache_beam_pipeline_options: PipelineOptions, data_input: str, reference_input: str, output: str, calculation_month_count: int, metric_types: List[str], state_code: Optional[str], calculation_end_month: Optional[str], person_filter_ids: Optional[List[int]]): """Runs the supervision calculation pipeline.""" # Workaround to load SQLAlchemy objects at start of pipeline. This is necessary because the BuildRootEntity # function tries to access attributes of relationship properties on the SQLAlchemy root_schema_class before they # have been loaded. However, if *any* SQLAlchemy objects have been instantiated, then the relationship properties # are loaded and their attributes can be successfully accessed. _ = schema.StatePerson() apache_beam_pipeline_options.view_as(SetupOptions).save_main_session = True # Get pipeline job details all_pipeline_options = apache_beam_pipeline_options.get_all_options() input_dataset = all_pipeline_options['project'] + '.' + data_input reference_dataset = all_pipeline_options['project'] + '.' + reference_input person_id_filter_set = set(person_filter_ids) if person_filter_ids else None with beam.Pipeline(options=apache_beam_pipeline_options) as p: # Get StatePersons persons = (p | 'Load Persons' >> BuildRootEntity(dataset=input_dataset, root_entity_class=entities.StatePerson, unifying_id_field=entities.StatePerson.get_class_id_name(), build_related_entities=True, unifying_id_field_filter_set=person_id_filter_set, state_code=state_code)) # Get StateIncarcerationPeriods incarceration_periods = (p | 'Load IncarcerationPeriods' >> BuildRootEntity( dataset=input_dataset, root_entity_class=entities.StateIncarcerationPeriod, unifying_id_field=entities.StatePerson.get_class_id_name(), build_related_entities=True, unifying_id_field_filter_set=person_id_filter_set, state_code=state_code )) # Get StateSupervisionViolations supervision_violations = (p | 'Load SupervisionViolations' >> BuildRootEntity( dataset=input_dataset, root_entity_class=entities.StateSupervisionViolation, unifying_id_field=entities.StatePerson.get_class_id_name(), build_related_entities=True, unifying_id_field_filter_set=person_id_filter_set, state_code=state_code )) # TODO(#2769): Don't bring this in as a root entity # Get StateSupervisionViolationResponses supervision_violation_responses = (p | 'Load SupervisionViolationResponses' >> BuildRootEntity( dataset=input_dataset, root_entity_class=entities.StateSupervisionViolationResponse, unifying_id_field=entities.StatePerson.get_class_id_name(), build_related_entities=True, unifying_id_field_filter_set=person_id_filter_set, state_code=state_code )) # Get StateSupervisionSentences supervision_sentences = (p | 'Load SupervisionSentences' >> BuildRootEntity( dataset=input_dataset, root_entity_class=entities.StateSupervisionSentence, unifying_id_field=entities.StatePerson.get_class_id_name(), build_related_entities=True, unifying_id_field_filter_set=person_id_filter_set, state_code=state_code )) # Get StateIncarcerationSentences incarceration_sentences = (p | 'Load IncarcerationSentences' >> BuildRootEntity( dataset=input_dataset, root_entity_class=entities.StateIncarcerationSentence, unifying_id_field=entities.StatePerson.get_class_id_name(), build_related_entities=True, unifying_id_field_filter_set=person_id_filter_set, state_code=state_code )) # Get StateSupervisionPeriods supervision_periods = (p | 'Load SupervisionPeriods' >> BuildRootEntity( dataset=input_dataset, root_entity_class=entities.StateSupervisionPeriod,
unifying_id_field=entities.StatePerson.get_class_id_name(), build_related_entities=True, unifying_id_field_filter_set=person_id_filter_set, state_code=state_code )) # Get StateAssessments assessments = (p | 'Load Assessments' >> BuildRootEntity( dataset=input_dataset, root_entity_class=entities.StateAssessment, unifying_id_field=entities.StatePerson.get_class_id_name(), build_related_entities=False, unifying_id_field_filter_set=person_id_filter_set, state_code=state_code )) # Bring in the table that associates StateSupervisionViolationResponses to information about StateAgents ssvr_to_agent_association_query = f"SELECT * FROM `{reference_dataset}.ssvr_to_agent_association`" ssvr_to_agent_associations = (p | "Read SSVR to Agent table from BigQuery" >> beam.io.Read(beam.io.BigQuerySource (query=ssvr_to_agent_association_query, use_standard_sql=True))) # Convert the association table rows into key-value tuples with the value for the # supervision_violation_response_id column as the key ssvr_agent_associations_as_kv = (ssvr_to_agent_associations | 'Convert SSVR to Agent table to KV tuples' >> beam.ParDo(ConvertDictToKVTuple(), 'supervision_violation_response_id') ) supervision_period_to_agent_association_query = f"SELECT * FROM `{reference_dataset}." \ f"supervision_period_to_agent_association`" supervision_period_to_agent_associations = (p | "Read Supervision Period to Agent table from BigQuery" >> beam.io.Read(beam.io.BigQuerySource (query=supervision_period_to_agent_association_query, use_standard_sql=True))) # Convert the association table rows into key-value tuples with the value for the supervision_period_id column # as the key supervision_period_to_agent_associations_as_kv = (supervision_period_to_agent_associations | 'Convert Supervision Period to Agent table to KV tuples' >> beam.ParDo(ConvertDictToKVTuple(), 'supervision_period_id') ) if state_code is None or state_code == 'US_MO': # Bring in the reference table that includes sentence status ranking information us_mo_sentence_status_query = f"SELECT * FROM `{reference_dataset}.us_mo_sentence_statuses`" us_mo_sentence_statuses = (p | "Read MO sentence status table from BigQuery" >> beam.io.Read(beam.io.BigQuerySource(query=us_mo_sentence_status_query, use_standard_sql=True))) else: us_mo_sentence_statuses = (p | f"Generate empty MO statuses list for non-MO state run: {state_code} " >> beam.Create([])) us_mo_sentence_status_rankings_as_kv = ( us_mo_sentence_statuses | 'Convert MO sentence status ranking table to KV tuples' >> beam.ParDo(ConvertDictToKVTuple(), 'person_id') ) sentences_and_statuses = ( {'incarceration_sentences': incarceration_sentences, 'supervision_sentences': supervision_sentences, 'sentence_statuses': us_mo_sentence_status_rankings_as_kv} | 'Group sentences to the sentence statuses for that person' >> beam.CoGroupByKey() ) sentences_converted = ( sentences_and_statuses | 'Convert to state-specific sentences' >> beam.ParDo(ConvertSentencesToStateSpecificType()).with_outputs('incarceration_sentences', 'supervision_sentences') ) # Group StateSupervisionViolationResponses and StateSupervisionViolations by person_id supervision_violations_and_responses = ( {'violations': supervision_violations, 'violation_responses': supervision_violation_responses } | 'Group StateSupervisionViolationResponses to ' 'StateSupervisionViolations' >> beam.CoGroupByKey() ) # Set the fully hydrated StateSupervisionViolation entities on the corresponding # StateSupervisionViolationResponses violation_responses_with_hydrated_violations = ( 
supervision_violations_and_responses | 'Set hydrated StateSupervisionViolations on ' 'the StateSupervisionViolationResponses' >> beam.ParDo(SetViolationOnViolationsResponse())) # Group StateIncarcerationPeriods and StateSupervisionViolationResponses by person_id incarceration_periods_and_violation_responses = ( {'incarceration_periods': incarceration_periods, 'violation_responses': violation_responses_with_hydrated_violations} | 'Group StateIncarcerationPeriods to ' 'StateSupervisionViolationResponses' >> beam.CoGroupByKey() ) # Set the fully hydrated StateSupervisionViolationResponse entities on the corresponding # StateIncarcerationPeriods incarceration_periods_with_source_violations = ( incarceration_periods_and_violation_responses | 'Set hydrated StateSupervisionViolationResponses on ' 'the StateIncarcerationPeriods' >> beam.ParDo(SetViolationResponseOnIncarcerationPeriod())) # Group each StatePerson with their StateIncarcerationPeriods and StateSupervisionSentences person_periods_and_sentences = ( {'person': persons, 'assessments': assessments, 'incarceration_periods': incarceration_periods_with_source_violations, 'supervision_periods': supervision_periods, 'supervision_sentences': sentences_converted.supervision_sentences, 'incarceration_sentences': sentences_converted.incarceration_sentences, 'violation_responses': violation_responses_with_hydrated_violations } | 'Group StatePerson to all entities' >> beam.CoGroupByKey() ) # Identify SupervisionTimeBuckets from the StatePerson's StateSupervisionSentences and StateIncarcerationPeriods person_time_buckets = ( person_periods_and_sentences | 'Get SupervisionTimeBuckets' >> beam.ParDo(ClassifySupervisionTimeBuckets(), AsDict(ssvr_agent_associations_as_kv), AsDict(supervision_period_to_agent_associations_as_kv))) # Get pipeline job details for accessing job_id all_pipeline_options = apache_beam_pipeline_options.get_all_options() # Get the type of metric to calculate metric_types_set = set(metric_types) # Add timestamp for local jobs job_timestamp = datetime.datetime.now().strftime('%Y-%m-%d_%H_%M_%S.%f') all_pipeline_options['job_timestamp'] = job_timestamp # Get supervision metrics supervision_metrics = (person_time_buckets | 'Get Supervision Metrics' >> GetSupervisionMetrics( pipeline_options=all_pipeline_options, metric_types=metric_types_set, calculation_end_month=calculation_end_month, calculation_month_count=calculation_month_count)) if person_id_filter_set: logging.warning("Non-empty person filter set - returning before writing metrics.") return # Convert the metrics into a format that's writable to BQ writable_metrics = (supervision_metrics | 'Convert to dict to be written to BQ' >> beam.ParDo( SupervisionMetricWritableDict()).with_outputs( 'populations', 'revocations', 'successes', 'successful_sentence_lengths', 'assessment_changes', 'revocation_analyses', 'revocation_violation_type_analyses' ) ) # Write the metrics to the output tables in BigQuery populations_table = output + '.supervision_population_metrics' revocations_table = output + '.supervision_revocation_metrics' successes_table = output + '.supervision_success_metrics' successful_sentence_lengths_table = output + '.successful_supervision_sentence_days_served_metrics' assessment_changes_table = output + '.terminated_supervision_assessment_score_change_metrics' revocation_analysis_table = output + '.supervision_revocation_analysis_metrics' revocation_violation_type_analysis_table = output + \ '.supervision_revocation_violation_type_analysis_metrics' _ = 
(writable_metrics.populations | f"Write population metrics to BQ table: {populations_table}" >> beam.io.WriteToBigQuery( table=populations_table, create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER, write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND )) _ = (writable_metrics.revocations | f"Write revocation metrics to BQ table: {revocations_table}" >> beam.io.WriteToBigQuery( table=revocations_table, create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER, write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND )) _ = (writable_metrics.successes | f"Write success metrics to BQ table: {successes_table}" >> beam.io.WriteToBigQuery( table=successes_table, create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER, write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND )) _ = (writable_metrics.successful_sentence_lengths | f"Write supervision successful sentence length metrics to BQ" f" table: {successful_sentence_lengths_table}" >> beam.io.WriteToBigQuery( table=successful_sentence_lengths_table, create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER, write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND )) _ = (writable_metrics.assessment_changes | f"Write assessment change metrics to BQ table: {assessment_changes_table}" >> beam.io.WriteToBigQuery( table=assessment_changes_table, create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER, write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND )) _ = (writable_metrics.revocation_analyses | f"Write revocation analyses metrics to BQ table: {revocation_analysis_table}" >> beam.io.WriteToBigQuery( table=revocation_analysis_table, create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER, write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND )) _ = (writable_metrics.revocation_violation_type_analyses | f"Write revocation violation type analyses metrics to BQ table: " f"{revocation_violation_type_analysis_table}" >> beam.io.WriteToBigQuery( table=revocation_violation_type_analysis_table, create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER, write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND ))
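# A hedged example of driving a run() entrypoint like the one above from a local script
# with the DirectRunner. The project, dataset, state, and metric values are placeholders;
# only the keyword names are taken from the signature above.
from apache_beam.options.pipeline_options import PipelineOptions


def _local_run_sketch():
    options = PipelineOptions(['--project=my-project', '--runner=DirectRunner'])
    run(apache_beam_pipeline_options=options,
        data_input='state',                  # placeholder dataset
        reference_input='reference_views',   # placeholder dataset
        output='dataflow_metrics',           # placeholder dataset
        calculation_month_count=36,
        metric_types=['ALL'],                # placeholder metric selection
        state_code='US_XX',                  # placeholder state
        calculation_end_month=None,
        person_filter_ids=None)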
def run( apache_beam_pipeline_options: PipelineOptions, data_input: str, reference_view_input: str, static_reference_input: str, output: str, calculation_month_count: int, metric_types: List[str], state_code: str, calculation_end_month: Optional[str], person_filter_ids: Optional[List[int]], ): """Runs the incarceration calculation pipeline.""" # Workaround to load SQLAlchemy objects at start of pipeline. This is necessary because the BuildRootEntity # function tries to access attributes of relationship properties on the SQLAlchemy root_schema_class before they # have been loaded. However, if *any* SQLAlchemy objects have been instantiated, then the relationship properties # are loaded and their attributes can be successfully accessed. _ = schema.StatePerson() apache_beam_pipeline_options.view_as(SetupOptions).save_main_session = True # Get pipeline job details all_pipeline_options = apache_beam_pipeline_options.get_all_options() project_id = all_pipeline_options["project"] if project_id is None: raise ValueError(f"No project set in pipeline options: {all_pipeline_options}") if state_code is None: raise ValueError("No state_code set for pipeline") input_dataset = project_id + "." + data_input reference_dataset = project_id + "." + reference_view_input static_reference_dataset = project_id + "." + static_reference_input person_id_filter_set = set(person_filter_ids) if person_filter_ids else None with beam.Pipeline(options=apache_beam_pipeline_options) as p: # Get StatePersons persons = p | "Load StatePersons" >> BuildRootEntity( dataset=input_dataset, root_entity_class=entities.StatePerson, unifying_id_field=entities.StatePerson.get_class_id_name(), build_related_entities=True, unifying_id_field_filter_set=person_id_filter_set, state_code=state_code, ) # Get StateSentenceGroups sentence_groups = p | "Load StateSentenceGroups" >> BuildRootEntity( dataset=input_dataset, root_entity_class=entities.StateSentenceGroup, unifying_id_field=entities.StatePerson.get_class_id_name(), build_related_entities=True, unifying_id_field_filter_set=person_id_filter_set, state_code=state_code, ) # Get StateIncarcerationSentences incarceration_sentences = ( p | "Load StateIncarcerationSentences" >> BuildRootEntity( dataset=input_dataset, root_entity_class=entities.StateIncarcerationSentence, unifying_id_field=entities.StatePerson.get_class_id_name(), build_related_entities=True, unifying_id_field_filter_set=person_id_filter_set, state_code=state_code, ) ) # Get StateSupervisionSentences supervision_sentences = p | "Load StateSupervisionSentences" >> BuildRootEntity( dataset=input_dataset, root_entity_class=entities.StateSupervisionSentence, unifying_id_field=entities.StatePerson.get_class_id_name(), build_related_entities=True, unifying_id_field_filter_set=person_id_filter_set, state_code=state_code, ) if state_code == "US_MO": # Bring in the reference table that includes sentence status ranking information us_mo_sentence_status_query = select_all_by_person_query( reference_dataset, US_MO_SENTENCE_STATUSES_VIEW_NAME, state_code, person_id_filter_set, ) us_mo_sentence_statuses = ( p | "Read MO sentence status table from BigQuery" >> ReadFromBigQuery(query=us_mo_sentence_status_query) ) else: us_mo_sentence_statuses = ( p | f"Generate empty MO statuses list for non-MO state run: {state_code} " >> beam.Create([]) ) us_mo_sentence_status_rankings_as_kv = ( us_mo_sentence_statuses | "Convert MO sentence status ranking table to KV tuples" >> beam.ParDo(ConvertDictToKVTuple(), "person_id") )
supervision_sentences_and_statuses = ( { "incarceration_sentences": incarceration_sentences, "supervision_sentences": supervision_sentences, "sentence_statuses": us_mo_sentence_status_rankings_as_kv, } | "Group sentences to the sentence statuses for that person" >> beam.CoGroupByKey() ) sentences_converted = ( supervision_sentences_and_statuses | "Convert to state-specific sentences" >> beam.ParDo(ConvertSentencesToStateSpecificType()).with_outputs( "incarceration_sentences", "supervision_sentences" ) ) sentences_and_sentence_groups = { "sentence_groups": sentence_groups, "incarceration_sentences": sentences_converted.incarceration_sentences, "supervision_sentences": sentences_converted.supervision_sentences, } | "Group sentences to sentence groups" >> beam.CoGroupByKey() # Set hydrated sentences on the corresponding sentence groups sentence_groups_with_hydrated_sentences = ( sentences_and_sentence_groups | "Set hydrated sentences on sentence groups" >> beam.ParDo(SetSentencesOnSentenceGroup()) ) # Bring in the table that associates people and their county of residence person_id_to_county_kv = ( p | "Load person_id_to_county_kv" >> ImportTableAsKVTuples( dataset_id=reference_dataset, table_id=PERSONS_TO_RECENT_COUNTY_OF_RESIDENCE_VIEW_NAME, table_key="person_id", state_code_filter=state_code, person_id_filter_set=person_id_filter_set, ) ) ip_to_judicial_district_kv = ( p | "Load ip_to_judicial_district_kv" >> ImportTableAsKVTuples( dataset_id=reference_dataset, table_id=INCARCERATION_PERIOD_JUDICIAL_DISTRICT_ASSOCIATION_VIEW_NAME, table_key="person_id", state_code_filter=state_code, person_id_filter_set=person_id_filter_set, ) ) state_race_ethnicity_population_counts = ( p | "Load state_race_ethnicity_population_counts" >> ImportTable( dataset_id=static_reference_dataset, table_id="state_race_ethnicity_population_counts", state_code_filter=state_code, person_id_filter_set=None, ) ) # Group each StatePerson with their related entities person_entities = { "person": persons, "sentence_groups": sentence_groups_with_hydrated_sentences, "incarceration_period_judicial_district_association": ip_to_judicial_district_kv, "persons_to_recent_county_of_residence": person_id_to_county_kv, } | "Group StatePerson to SentenceGroups" >> beam.CoGroupByKey() # Identify IncarcerationEvents from the StatePerson's StateIncarcerationPeriods person_incarceration_events = ( person_entities | "Classify Incarceration Events" >> beam.ParDo(ClassifyIncarcerationEvents()) ) person_metadata = ( persons | "Build the person_metadata dictionary" >> beam.ParDo( BuildPersonMetadata(), AsList(state_race_ethnicity_population_counts) ) ) person_incarceration_events_with_metadata = ( { "person_events": person_incarceration_events, "person_metadata": person_metadata, } | "Group IncarcerationEvents with person-level metadata" >> beam.CoGroupByKey() | "Organize StatePerson, PersonMetadata and IncarcerationEvents for calculations" >> beam.ParDo(ExtractPersonEventsMetadata()) ) # Get pipeline job details for accessing job_id all_pipeline_options = apache_beam_pipeline_options.get_all_options() # Add timestamp for local jobs job_timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H_%M_%S.%f") all_pipeline_options["job_timestamp"] = job_timestamp # Get the type of metric to calculate metric_types_set = set(metric_types) # Get IncarcerationMetrics incarceration_metrics = ( person_incarceration_events_with_metadata | "Get Incarceration Metrics" >> GetIncarcerationMetrics( pipeline_options=all_pipeline_options,
metric_types=metric_types_set, calculation_end_month=calculation_end_month, calculation_month_count=calculation_month_count, ) ) if person_id_filter_set: logging.warning( "Non-empty person filter set - returning before writing metrics." ) return # Convert the metrics into a format that's writable to BQ writable_metrics = ( incarceration_metrics | "Convert to dict to be written to BQ" >> beam.ParDo(RecidivizMetricWritableDict()).with_outputs( IncarcerationMetricType.INCARCERATION_ADMISSION.value, IncarcerationMetricType.INCARCERATION_POPULATION.value, IncarcerationMetricType.INCARCERATION_RELEASE.value, ) ) # Write the metrics to the output tables in BigQuery admissions_table_id = DATAFLOW_METRICS_TO_TABLES[IncarcerationAdmissionMetric] population_table_id = DATAFLOW_METRICS_TO_TABLES[IncarcerationPopulationMetric] releases_table_id = DATAFLOW_METRICS_TO_TABLES[IncarcerationReleaseMetric] _ = ( writable_metrics.INCARCERATION_ADMISSION | f"Write admission metrics to BQ table: {admissions_table_id}" >> WriteAppendToBigQuery( output_table=admissions_table_id, output_dataset=output, ) ) _ = ( writable_metrics.INCARCERATION_POPULATION | f"Write population metrics to BQ table: {population_table_id}" >> WriteAppendToBigQuery( output_table=population_table_id, output_dataset=output, ) ) _ = ( writable_metrics.INCARCERATION_RELEASE | f"Write release metrics to BQ table: {releases_table_id}" >> WriteAppendToBigQuery( output_table=releases_table_id, output_dataset=output, ) )
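# The writable_metrics.INCARCERATION_ADMISSION attribute access above works because
# beam.ParDo(...).with_outputs(...) returns a DoOutputsTuple whose tagged streams are
# reachable by attribute when the tag is a valid Python identifier. A self-contained toy
# illustrating the routing; the data and tags are made up.
import apache_beam as beam


class _RouteByParitySketch(beam.DoFn):
    def process(self, element):
        # Route each element to the tagged output matching its parity.
        tag = 'evens' if element % 2 == 0 else 'odds'
        yield beam.pvalue.TaggedOutput(tag, element)


def _tagged_outputs_sketch():
    with beam.Pipeline() as toy_pipeline:
        routed = (toy_pipeline
                  | beam.Create([1, 2, 3, 4])
                  | beam.ParDo(_RouteByParitySketch()).with_outputs('evens', 'odds'))
        _ = routed.evens | 'print evens' >> beam.Map(print)  # prints 2 and 4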
def execute_pipeline( self, pipeline: beam.Pipeline, all_pipeline_options: Dict[str, Any], state_code: str, input_dataset: str, reference_dataset: str, static_reference_dataset: str, metric_types: List[str], person_id_filter_set: Optional[Set[int]], calculation_month_count: int = -1, calculation_end_month: Optional[str] = None, ) -> beam.Pipeline: persons = pipeline | "Load StatePersons" >> BuildRootEntity( dataset=input_dataset, root_entity_class=entities.StatePerson, unifying_id_field=entities.StatePerson.get_class_id_name(), build_related_entities=True, unifying_id_field_filter_set=person_id_filter_set, state_code=state_code, ) # Get StateSentenceGroups sentence_groups = pipeline | "Load StateSentenceGroups" >> BuildRootEntity( dataset=input_dataset, root_entity_class=entities.StateSentenceGroup, unifying_id_field=entities.StatePerson.get_class_id_name(), build_related_entities=True, unifying_id_field_filter_set=person_id_filter_set, state_code=state_code, ) # Get StateIncarcerationSentences incarceration_sentences = ( pipeline | "Load StateIncarcerationSentences" >> BuildRootEntity( dataset=input_dataset, root_entity_class=entities.StateIncarcerationSentence, unifying_id_field=entities.StatePerson.get_class_id_name(), build_related_entities=True, unifying_id_field_filter_set=person_id_filter_set, state_code=state_code, )) # Get StateSupervisionSentences supervision_sentences = ( pipeline | "Load StateSupervisionSentences" >> BuildRootEntity( dataset=input_dataset, root_entity_class=entities.StateSupervisionSentence, unifying_id_field=entities.StatePerson.get_class_id_name(), build_related_entities=True, unifying_id_field_filter_set=person_id_filter_set, state_code=state_code, )) # Get StateSupervisionPeriods supervision_periods = ( pipeline | "Load StateSupervisionPeriods" >> BuildRootEntity( dataset=input_dataset, root_entity_class=entities.StateSupervisionPeriod, unifying_id_field=entities.StatePerson.get_class_id_name(), build_related_entities=True, unifying_id_field_filter_set=person_id_filter_set, state_code=state_code, )) # Get StateAssessments assessments = pipeline | "Load Assessments" >> BuildRootEntity( dataset=input_dataset, root_entity_class=entities.StateAssessment, unifying_id_field=entities.StatePerson.get_class_id_name(), build_related_entities=False, unifying_id_field_filter_set=person_id_filter_set, state_code=state_code, ) # Get StateSupervisionViolations supervision_violations = ( pipeline | "Load SupervisionViolations" >> BuildRootEntity( dataset=input_dataset, root_entity_class=entities.StateSupervisionViolation, unifying_id_field=entities.StatePerson.get_class_id_name(), build_related_entities=True, unifying_id_field_filter_set=person_id_filter_set, state_code=state_code, )) # Get StateSupervisionViolationResponses supervision_violation_responses = ( pipeline | "Load SupervisionViolationResponses" >> BuildRootEntity( dataset=input_dataset, root_entity_class=entities.StateSupervisionViolationResponse, unifying_id_field=entities.StatePerson.get_class_id_name(), build_related_entities=True, unifying_id_field_filter_set=person_id_filter_set, state_code=state_code, )) if state_code == "US_MO": # Bring in the reference table that includes sentence status ranking information us_mo_sentence_status_query = select_all_by_person_query( reference_dataset, US_MO_SENTENCE_STATUSES_VIEW_NAME, state_code, person_id_filter_set, ) us_mo_sentence_statuses = ( pipeline | "Read MO sentence status table from BigQuery" >> ReadFromBigQuery(query=us_mo_sentence_status_query)) else: 
us_mo_sentence_statuses = ( pipeline | f"Generate empty MO statuses list for non-MO state run: {state_code} " >> beam.Create([])) us_mo_sentence_status_rankings_as_kv = ( us_mo_sentence_statuses | "Convert MO sentence status ranking table to KV tuples" >> beam.ParDo(ConvertDictToKVTuple(), "person_id")) supervision_sentences_and_statuses = ( { "incarceration_sentences": incarceration_sentences, "supervision_sentences": supervision_sentences, "sentence_statuses": us_mo_sentence_status_rankings_as_kv, } | "Group sentences to the sentence statuses for that person" >> beam.CoGroupByKey()) sentences_converted = ( supervision_sentences_and_statuses | "Convert to state-specific sentences" >> beam.ParDo( ConvertSentencesToStateSpecificType()).with_outputs( "incarceration_sentences", "supervision_sentences")) # Set hydrated supervision periods on the corresponding incarceration sentences incarceration_sentences_with_hydrated_sps = ( { "supervision_periods": supervision_periods, "sentences": sentences_converted.incarceration_sentences, } | "Group supervision periods to incarceration sentences" >> beam.CoGroupByKey() | "Set hydrated supervision periods on incarceration sentences" >> beam.ParDo(SetSupervisionPeriodsOnSentences())) # Set hydrated supervision periods on the corresponding supervision sentences supervision_sentences_with_hydrated_sps = ( { "supervision_periods": supervision_periods, "sentences": sentences_converted.supervision_sentences, } | "Group supervision periods to supervision sentences" >> beam.CoGroupByKey() | "Set hydrated supervision periods on supervision sentences" >> beam.ParDo(SetSupervisionPeriodsOnSentences())) sentences_and_sentence_groups = { "sentence_groups": sentence_groups, "incarceration_sentences": incarceration_sentences_with_hydrated_sps, "supervision_sentences": supervision_sentences_with_hydrated_sps, } | "Group sentences to sentence groups" >> beam.CoGroupByKey() # Set hydrated sentences on the corresponding sentence groups sentence_groups_with_hydrated_sentences = ( sentences_and_sentence_groups | "Set hydrated sentences on sentence groups" >> beam.ParDo( SetSentencesOnSentenceGroup())) # Bring in the table that associates people and their county of residence person_id_to_county_kv = ( pipeline | "Load person_id_to_county_kv" >> ImportTableAsKVTuples( dataset_id=reference_dataset, table_id=PERSONS_TO_RECENT_COUNTY_OF_RESIDENCE_VIEW_NAME, table_key="person_id", state_code_filter=state_code, person_id_filter_set=person_id_filter_set, )) ip_to_judicial_district_kv = ( pipeline | "Load ip_to_judicial_district_kv" >> ImportTableAsKVTuples( dataset_id=reference_dataset, table_id= INCARCERATION_PERIOD_JUDICIAL_DISTRICT_ASSOCIATION_VIEW_NAME, table_key="person_id", state_code_filter=state_code, person_id_filter_set=person_id_filter_set, )) supervision_period_to_agent_associations_as_kv = ( pipeline | "Load supervision_period_to_agent_associations_as_kv" >> ImportTableAsKVTuples( dataset_id=reference_dataset, table_id=SUPERVISION_PERIOD_TO_AGENT_ASSOCIATION_VIEW_NAME, table_key="person_id", state_code_filter=state_code, person_id_filter_set=person_id_filter_set, )) state_race_ethnicity_population_counts = ( pipeline | "Load state_race_ethnicity_population_counts" >> ImportTable( dataset_id=static_reference_dataset, table_id="state_race_ethnicity_population_counts", state_code_filter=state_code, person_id_filter_set=None, )) # Group StateSupervisionViolationResponses and StateSupervisionViolations by person_id supervision_violations_and_responses = ( { "violations": 
supervision_violations, "violation_responses": supervision_violation_responses, } | "Group StateSupervisionViolationResponses to " "StateSupervisionViolations" >> beam.CoGroupByKey()) # Set the fully hydrated StateSupervisionViolation entities on the corresponding # StateSupervisionViolationResponses violation_responses_with_hydrated_violations = ( supervision_violations_and_responses | "Set hydrated StateSupervisionViolations on " "the StateSupervisionViolationResponses" >> beam.ParDo( SetViolationOnViolationsResponse())) # Group each StatePerson with their related entities person_entities = { "person": persons, "assessments": assessments, "sentence_groups": sentence_groups_with_hydrated_sentences, "violation_responses": violation_responses_with_hydrated_violations, "incarceration_period_judicial_district_association": ip_to_judicial_district_kv, "supervision_period_to_agent_association": supervision_period_to_agent_associations_as_kv, "persons_to_recent_county_of_residence": person_id_to_county_kv, } | "Group StatePerson to SentenceGroups" >> beam.CoGroupByKey() # Identify IncarcerationEvents from the StatePerson's StateIncarcerationPeriods person_incarceration_events = ( person_entities | "Classify Incarceration Events" >> beam.ParDo( ClassifyEvents(), identifier=self.pipeline_config.identifier)) person_metadata = ( persons | "Build the person_metadata dictionary" >> beam.ParDo( BuildPersonMetadata(), state_race_ethnicity_population_counts=AsList( state_race_ethnicity_population_counts), )) person_incarceration_events_with_metadata = ( { "person_events": person_incarceration_events, "person_metadata": person_metadata, } | "Group IncarcerationEvents with person-level metadata" >> beam.CoGroupByKey() | "Organize StatePerson, PersonMetadata and IncarcerationEvents for calculations" >> beam.ParDo(ExtractPersonEventsMetadata())) # Add timestamp for local jobs job_timestamp = datetime.datetime.now().strftime( "%Y-%m-%d_%H_%M_%S.%f") all_pipeline_options["job_timestamp"] = job_timestamp # Get the type of metric to calculate metric_types_set = set(metric_types) # Get IncarcerationMetrics incarceration_metrics = ( person_incarceration_events_with_metadata | "Get Incarceration Metrics" >> GetMetrics( pipeline_options=all_pipeline_options, pipeline_config=self.pipeline_config, metric_types_to_include=metric_types_set, calculation_end_month=calculation_end_month, calculation_month_count=calculation_month_count, )) return incarceration_metrics
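# ClassifyEvents above receives the classifier as a ParDo keyword side argument
# (identifier=self.pipeline_config.identifier), which lets one generic DoFn serve every
# metric pipeline. A hypothetical shape of that contract; the identify_events method
# name is an assumption, not the real API.
import apache_beam as beam


class _ClassifyEventsSketch(beam.DoFn):
    """Delegates all pipeline-specific classification to the injected identifier."""

    def process(self, element, identifier):
        person_id, grouped_entities = element
        # Hypothetical call: the identifier turns the grouped entities into events.
        yield person_id, identifier.identify_events(grouped_entities)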
def run(apache_beam_pipeline_options: PipelineOptions, data_input: str, reference_input: str, output: str, calculation_month_count: int, metric_types: List[str], state_code: Optional[str], calculation_end_month: Optional[str], person_filter_ids: Optional[List[int]]): """Runs the incarceration calculation pipeline.""" # Workaround to load SQLAlchemy objects at start of pipeline. This is necessary because the BuildRootEntity # function tries to access attributes of relationship properties on the SQLAlchemy root_schema_class before they # have been loaded. However, if *any* SQLAlchemy objects have been instantiated, then the relationship properties # are loaded and their attributes can be successfully accessed. _ = schema.StatePerson() apache_beam_pipeline_options.view_as(SetupOptions).save_main_session = True # Get pipeline job details all_apache_beam_pipeline_options = apache_beam_pipeline_options.get_all_options( ) query_dataset = all_apache_beam_pipeline_options[ 'project'] + '.' + data_input reference_dataset = all_apache_beam_pipeline_options[ 'project'] + '.' + reference_input person_id_filter_set = set( person_filter_ids) if person_filter_ids else None with beam.Pipeline(options=apache_beam_pipeline_options) as p: # Get StatePersons persons = (p | 'Load StatePersons' >> BuildRootEntity( dataset=query_dataset, root_entity_class=entities.StatePerson, unifying_id_field=entities.StatePerson.get_class_id_name(), build_related_entities=True, unifying_id_field_filter_set=person_id_filter_set)) # Get StateSentenceGroups sentence_groups = (p | 'Load StateSentenceGroups' >> BuildRootEntity( dataset=query_dataset, root_entity_class=entities.StateSentenceGroup, unifying_id_field=entities.StatePerson.get_class_id_name(), build_related_entities=True, unifying_id_field_filter_set=person_id_filter_set, state_code=state_code)) # Get StateIncarcerationSentences incarceration_sentences = ( p | 'Load StateIncarcerationSentences' >> BuildRootEntity( dataset=query_dataset, root_entity_class=entities.StateIncarcerationSentence, unifying_id_field=entities.StatePerson.get_class_id_name(), build_related_entities=True, unifying_id_field_filter_set=person_id_filter_set, state_code=state_code)) # Get StateSupervisionSentences supervision_sentences = ( p | 'Load StateSupervisionSentences' >> BuildRootEntity( dataset=query_dataset, root_entity_class=entities.StateSupervisionSentence, unifying_id_field=entities.StatePerson.get_class_id_name(), build_related_entities=True, unifying_id_field_filter_set=person_id_filter_set, state_code=state_code)) if state_code is None or state_code == 'US_MO': # Bring in the reference table that includes sentence status ranking information us_mo_sentence_status_query = f"SELECT * FROM `{reference_dataset}.us_mo_sentence_statuses`" us_mo_sentence_statuses = ( p | "Read MO sentence status table from BigQuery" >> beam.io.Read( beam.io.BigQuerySource(query=us_mo_sentence_status_query, use_standard_sql=True))) else: us_mo_sentence_statuses = ( p | f"Generate empty MO statuses list for non-MO state run: {state_code} " >> beam.Create([])) us_mo_sentence_status_rankings_as_kv = ( us_mo_sentence_statuses | 'Convert MO sentence status ranking table to KV tuples' >> beam.ParDo(ConvertDictToKVTuple(), 'person_id')) supervision_sentences_and_statuses = ( { 'incarceration_sentences': incarceration_sentences, 'supervision_sentences': supervision_sentences, 'sentence_statuses': us_mo_sentence_status_rankings_as_kv } | 'Group sentences to the sentence statuses for that person' >> beam.CoGroupByKey())
sentences_converted = ( supervision_sentences_and_statuses | 'Convert to state-specific sentences' >> beam.ParDo( ConvertSentencesToStateSpecificType()).with_outputs( 'incarceration_sentences', 'supervision_sentences')) sentences_and_sentence_groups = ( { 'sentence_groups': sentence_groups, 'incarceration_sentences': sentences_converted.incarceration_sentences, 'supervision_sentences': sentences_converted.supervision_sentences } | 'Group sentences to sentence groups' >> beam.CoGroupByKey()) # Set hydrated sentences on the corresponding sentence groups sentence_groups_with_hydrated_sentences = ( sentences_and_sentence_groups | 'Set hydrated sentences on sentence groups' >> beam.ParDo( SetSentencesOnSentenceGroup())) # Group each StatePerson with their related entities person_and_sentence_groups = ( { 'person': persons, 'sentence_groups': sentence_groups_with_hydrated_sentences } | 'Group StatePerson to SentenceGroups' >> beam.CoGroupByKey()) # Bring in the table that associates people and their county of residence person_id_to_county_query = \ f"SELECT * FROM `{reference_dataset}.persons_to_recent_county_of_residence`" person_id_to_county_kv = ( p | "Read person_id to county associations from BigQuery" >> beam.io.Read( beam.io.BigQuerySource(query=person_id_to_county_query, use_standard_sql=True)) | "Convert person_id to county association table to KV" >> beam.ParDo(ConvertDictToKVTuple(), 'person_id')) # Identify IncarcerationEvents from the StatePerson's StateIncarcerationPeriods person_events = ( person_and_sentence_groups | 'Classify Incarceration Events' >> beam.ParDo( ClassifyIncarcerationEvents(), AsDict(person_id_to_county_kv))) # Get pipeline job details for accessing job_id all_pipeline_options = apache_beam_pipeline_options.get_all_options() # Add timestamp for local jobs job_timestamp = datetime.datetime.now().strftime( '%Y-%m-%d_%H_%M_%S.%f') all_pipeline_options['job_timestamp'] = job_timestamp # Get the type of metric to calculate metric_types_set = set(metric_types) # Get IncarcerationMetrics incarceration_metrics = ( person_events | 'Get Incarceration Metrics' >> GetIncarcerationMetrics( pipeline_options=all_pipeline_options, metric_types=metric_types_set, calculation_end_month=calculation_end_month, calculation_month_count=calculation_month_count)) if person_id_filter_set: logging.warning( "Non-empty person filter set - returning before writing metrics."
) return # Convert the metrics into a format that's writable to BQ writable_metrics = ( incarceration_metrics | 'Convert to dict to be written to BQ' >> beam.ParDo(IncarcerationMetricWritableDict()).with_outputs( 'admissions', 'populations', 'releases')) # Write the metrics to the output tables in BigQuery admissions_table = output + '.incarceration_admission_metrics' population_table = output + '.incarceration_population_metrics' releases_table = output + '.incarceration_release_metrics' _ = (writable_metrics.admissions | f"Write admission metrics to BQ table: {admissions_table}" >> beam.io.WriteToBigQuery( table=admissions_table, create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER, write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND)) _ = (writable_metrics.populations | f"Write population metrics to BQ table: {population_table}" >> beam.io.WriteToBigQuery( table=population_table, create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER, write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND)) _ = (writable_metrics.releases | f"Write release metrics to BQ table: {releases_table}" >> beam.io.WriteToBigQuery( table=releases_table, create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER, write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))
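# The US_MO branch above falls back to beam.Create([]) so that downstream CoGroupByKey
# steps see the same input shape for every state. The pattern in isolation, with
# placeholder data standing in for the BigQuery read.
import apache_beam as beam


def _statuses_or_empty_sketch(p, state_code, mo_status_rows):
    if state_code == 'US_MO':
        # Stand-in for the "Read MO sentence status table from BigQuery" step.
        return p | 'mo statuses' >> beam.Create(mo_status_rows)
    return p | f'empty statuses for {state_code}' >> beam.Create([])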
def execute_pipeline( self, pipeline: beam.Pipeline, all_pipeline_options: Dict[str, Any], state_code: str, input_dataset: str, reference_dataset: str, static_reference_dataset: str, metric_types: List[str], person_id_filter_set: Optional[Set[int]], calculation_month_count: int = -1, calculation_end_month: Optional[str] = None, ) -> beam.Pipeline: persons = pipeline | "Load Persons" >> BuildRootEntity( dataset=input_dataset, root_entity_class=entities.StatePerson, unifying_id_field=entities.StatePerson.get_class_id_name(), build_related_entities=True, unifying_id_field_filter_set=person_id_filter_set, state_code=state_code, ) # Get StateIncarcerationPeriods incarceration_periods = ( pipeline | "Load IncarcerationPeriods" >> BuildRootEntity( dataset=input_dataset, root_entity_class=entities.StateIncarcerationPeriod, unifying_id_field=entities.StatePerson.get_class_id_name(), build_related_entities=False, unifying_id_field_filter_set=person_id_filter_set, state_code=state_code, )) # Get StateSupervisionPeriods supervision_periods = pipeline | "Load SupervisionPeriods" >> BuildRootEntity( dataset=input_dataset, root_entity_class=entities.StateSupervisionPeriod, unifying_id_field=entities.StatePerson.get_class_id_name(), build_related_entities=False, unifying_id_field_filter_set=person_id_filter_set, state_code=state_code, ) # Bring in the table that associates people and their county of residence person_id_to_county_kv = ( pipeline | "Load person_id_to_county_kv" >> ImportTableAsKVTuples( dataset_id=reference_dataset, table_id=PERSONS_TO_RECENT_COUNTY_OF_RESIDENCE_VIEW_NAME, table_key="person_id", state_code_filter=state_code, person_id_filter_set=person_id_filter_set, )) # Group each StatePerson with their StateIncarcerationPeriods person_entities = { "person": persons, "incarceration_periods": incarceration_periods, "supervision_periods": supervision_periods, "persons_to_recent_county_of_residence": person_id_to_county_kv, } | "Group StatePerson to StateIncarcerationPeriods" >> beam.CoGroupByKey( ) state_race_ethnicity_population_counts = ( pipeline | "Load state_race_ethnicity_population_counts" >> ImportTable( dataset_id=static_reference_dataset, table_id="state_race_ethnicity_population_counts", state_code_filter=state_code, person_id_filter_set=None, )) # Identify ReleaseEvents from the StatePerson's StateIncarcerationPeriods person_release_events = person_entities | "ClassifyReleaseEvents" >> beam.ParDo( ClassifyEvents(), identifier=self.pipeline_config.identifier) person_metadata = ( persons | "Build the person_metadata dictionary" >> beam.ParDo( BuildPersonMetadata(), state_race_ethnicity_population_counts=AsList( state_race_ethnicity_population_counts), )) person_release_events_with_metadata = ( { "person_events": person_release_events, "person_metadata": person_metadata } | "Group ReleaseEvents with person-level metadata" >> beam.CoGroupByKey() | "Organize StatePerson, PersonMetadata and ReleaseEvents for calculations" >> beam.ParDo(ExtractPersonReleaseEventsMetadata())) # Add timestamp for local jobs job_timestamp = datetime.datetime.now().strftime( "%Y-%m-%d_%H_%M_%S.%f") all_pipeline_options["job_timestamp"] = job_timestamp # Get the type of metric to calculate metric_types_set = set(metric_types) # Get recidivism metrics recidivism_metrics = ( person_release_events_with_metadata | "Get Recidivism Metrics" >> GetMetrics( pipeline_options=all_pipeline_options, pipeline_config=self.pipeline_config, metric_types_to_include=metric_types_set,
calculation_end_month=calculation_end_month, calculation_month_count=calculation_month_count, )) return recidivism_metrics
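# The execute_pipeline overrides above all share one signature, so a driver can dispatch
# to any of them uniformly. A hedged sketch of such a call; the arguments are passed
# positionally because one override names its unused reference dataset parameter
# _reference_dataset, and every value below is a placeholder.
def _execute_any_pipeline_sketch(delegate, p, all_pipeline_options):
    return delegate.execute_pipeline(
        p,
        all_pipeline_options,
        'US_XX',                               # state_code (placeholder)
        'my-project.state',                    # input_dataset (placeholder)
        'my-project.reference_views',          # reference dataset (placeholder)
        'my-project.static_reference_tables',  # static_reference_dataset (placeholder)
        ['ALL'],                               # metric_types (placeholder)
        None,                                  # person_id_filter_set
    )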
def run( apache_beam_pipeline_options: PipelineOptions, data_input: str, reference_view_input: str, static_reference_input: str, output: str, calculation_month_count: int, metric_types: List[str], state_code: str, calculation_end_month: Optional[str], person_filter_ids: Optional[List[int]], ): """Runs the program calculation pipeline.""" # Workaround to load SQLAlchemy objects at start of pipeline. This is necessary because the BuildRootEntity # function tries to access attributes of relationship properties on the SQLAlchemy root_schema_class before they # have been loaded. However, if *any* SQLAlchemy objects have been instantiated, then the relationship properties # are loaded and their attributes can be successfully accessed. _ = schema.StatePerson() apache_beam_pipeline_options.view_as(SetupOptions).save_main_session = True # Get pipeline job details all_pipeline_options = apache_beam_pipeline_options.get_all_options() project_id = all_pipeline_options["project"] if project_id is None: raise ValueError(f"No project set in pipeline options: {all_pipeline_options}") if state_code is None: raise ValueError("No state_code set for pipeline") input_dataset = project_id + "." + data_input reference_dataset = project_id + "." + reference_view_input static_reference_dataset = project_id + "." + static_reference_input person_id_filter_set = set(person_filter_ids) if person_filter_ids else None with beam.Pipeline(options=apache_beam_pipeline_options) as p: # Get StatePersons persons = p | "Load Persons" >> BuildRootEntity( dataset=input_dataset, root_entity_class=entities.StatePerson, unifying_id_field=entities.StatePerson.get_class_id_name(), build_related_entities=True, unifying_id_field_filter_set=person_id_filter_set, state_code=state_code, ) # Get StateProgramAssignments program_assignments = p | "Load Program Assignments" >> BuildRootEntity( dataset=input_dataset, root_entity_class=entities.StateProgramAssignment, unifying_id_field=entities.StatePerson.get_class_id_name(), build_related_entities=True, unifying_id_field_filter_set=person_id_filter_set, state_code=state_code, ) # Get StateAssessments assessments = p | "Load Assessments" >> BuildRootEntity( dataset=input_dataset, root_entity_class=entities.StateAssessment, unifying_id_field=entities.StatePerson.get_class_id_name(), build_related_entities=False, unifying_id_field_filter_set=person_id_filter_set, state_code=state_code, ) # Get StateSupervisionPeriods supervision_periods = p | "Load SupervisionPeriods" >> BuildRootEntity( dataset=input_dataset, root_entity_class=entities.StateSupervisionPeriod, unifying_id_field=entities.StatePerson.get_class_id_name(), build_related_entities=False, unifying_id_field_filter_set=person_id_filter_set, state_code=state_code, ) supervision_period_to_agent_associations_as_kv = ( p | "Load supervision_period_to_agent_associations_as_kv" >> ImportTableAsKVTuples( dataset_id=reference_dataset, table_id=SUPERVISION_PERIOD_TO_AGENT_ASSOCIATION_VIEW_NAME, table_key="person_id", state_code_filter=state_code, person_id_filter_set=person_id_filter_set, ) ) state_race_ethnicity_population_counts = ( p | "Load state_race_ethnicity_population_counts" >> ImportTable( dataset_id=static_reference_dataset, table_id="state_race_ethnicity_population_counts", state_code_filter=state_code, person_id_filter_set=None, ) ) # Group each StatePerson with their other entities persons_entities = { "person": persons, "program_assignments": program_assignments, "assessments": assessments, "supervision_periods":
supervision_periods, "supervision_period_to_agent_association": supervision_period_to_agent_associations_as_kv, } | "Group StatePerson to StateProgramAssignments and" >> beam.CoGroupByKey() # Identify ProgramEvents from the StatePerson's StateProgramAssignments person_program_events = persons_entities | beam.ParDo( ClassifyProgramAssignments() ) person_metadata = ( persons | "Build the person_metadata dictionary" >> beam.ParDo( BuildPersonMetadata(), AsList(state_race_ethnicity_population_counts) ) ) person_program_events_with_metadata = ( {"person_events": person_program_events, "person_metadata": person_metadata} | "Group ProgramEvents with person-level metadata" >> beam.CoGroupByKey() | "Organize StatePerson, PersonMetadata and ProgramEvents for calculations" >> beam.ParDo(ExtractPersonEventsMetadata()) ) # Get pipeline job details for accessing job_id all_pipeline_options = apache_beam_pipeline_options.get_all_options() # Add timestamp for local jobs job_timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H_%M_%S.%f") all_pipeline_options["job_timestamp"] = job_timestamp # Get the type of metric to calculate metric_types_set = set(metric_types) # Get program metrics program_metrics = ( person_program_events_with_metadata | "Get Program Metrics" >> GetProgramMetrics( pipeline_options=all_pipeline_options, metric_types=metric_types_set, calculation_end_month=calculation_end_month, calculation_month_count=calculation_month_count, ) ) if person_id_filter_set: logging.warning( "Non-empty person filter set - returning before writing metrics." ) return # Convert the metrics into a format that's writable to BQ writable_metrics = ( program_metrics | "Convert to dict to be written to BQ" >> beam.ParDo(RecidivizMetricWritableDict()).with_outputs( ProgramMetricType.PROGRAM_PARTICIPATION.value, ProgramMetricType.PROGRAM_REFERRAL.value, ) ) # Write the metrics to the output tables in BigQuery referrals_table_id = DATAFLOW_METRICS_TO_TABLES[ProgramReferralMetric] participation_table_id = DATAFLOW_METRICS_TO_TABLES[ProgramParticipationMetric] _ = ( writable_metrics.PROGRAM_REFERRAL | f"Write referral metrics to BQ table: {referrals_table_id}" >> WriteAppendToBigQuery( output_table=referrals_table_id, output_dataset=output, ) ) _ = ( writable_metrics.PROGRAM_PARTICIPATION | f"Write participation metrics to BQ table: {participation_table_id}" >> WriteAppendToBigQuery( output_table=participation_table_id, output_dataset=output, ) )
def run(apache_beam_pipeline_options: PipelineOptions,
        data_input: str,
        reference_input: str,
        output: str,
        calculation_month_count: int,
        metric_types: List[str],
        state_code: Optional[str],
        calculation_end_month: Optional[str],
        person_filter_ids: Optional[List[int]]):
    """Runs the incarceration calculation pipeline."""

    # Workaround to load SQLAlchemy objects at start of pipeline. This is necessary because the
    # BuildRootEntity function tries to access attributes of relationship properties on the
    # SQLAlchemy root_schema_class before they have been loaded. However, if *any* SQLAlchemy
    # objects have been instantiated, then the relationship properties are loaded and their
    # attributes can be successfully accessed.
    _ = schema.StatePerson()

    apache_beam_pipeline_options.view_as(SetupOptions).save_main_session = True

    # Get pipeline job details
    all_apache_beam_pipeline_options = apache_beam_pipeline_options.get_all_options()

    query_dataset = all_apache_beam_pipeline_options['project'] + '.' + data_input
    reference_dataset = all_apache_beam_pipeline_options['project'] + '.' + reference_input

    person_id_filter_set = set(person_filter_ids) if person_filter_ids else None

    with beam.Pipeline(options=apache_beam_pipeline_options) as p:
        # Get StatePersons
        persons = (p | 'Load StatePersons' >> BuildRootEntity(
            dataset=query_dataset,
            root_entity_class=entities.StatePerson,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=True,
            unifying_id_field_filter_set=person_id_filter_set))

        # Get StateSentenceGroups
        sentence_groups = (p | 'Load StateSentenceGroups' >> BuildRootEntity(
            dataset=query_dataset,
            root_entity_class=entities.StateSentenceGroup,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=True,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code))

        # Get StateIncarcerationSentences
        incarceration_sentences = (p | 'Load StateIncarcerationSentences' >> BuildRootEntity(
            dataset=query_dataset,
            root_entity_class=entities.StateIncarcerationSentence,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=True,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code))

        # Get StateSupervisionSentences
        supervision_sentences = (p | 'Load StateSupervisionSentences' >> BuildRootEntity(
            dataset=query_dataset,
            root_entity_class=entities.StateSupervisionSentence,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=True,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code))

        if state_code is None or state_code == 'US_MO':
            # Bring in the reference table that includes sentence status ranking information
            us_mo_sentence_status_query = select_all_by_person_query(
                reference_dataset,
                US_MO_SENTENCE_STATUSES_VIEW_NAME,
                state_code,
                person_id_filter_set)

            us_mo_sentence_statuses = (
                p | "Read MO sentence status table from BigQuery" >> beam.io.Read(
                    beam.io.BigQuerySource(query=us_mo_sentence_status_query,
                                           use_standard_sql=True)))
        else:
            us_mo_sentence_statuses = (
                p | f"Generate empty MO statuses list for non-MO state run: {state_code}" >>
                beam.Create([]))

        us_mo_sentence_status_rankings_as_kv = (
            us_mo_sentence_statuses
            | 'Convert MO sentence status ranking table to KV tuples' >>
            beam.ParDo(ConvertDictToKVTuple(), 'person_id'))

        supervision_sentences_and_statuses = (
            {'incarceration_sentences': incarceration_sentences,
             'supervision_sentences': supervision_sentences,
             'sentence_statuses': us_mo_sentence_status_rankings_as_kv}
            | 'Group sentences to the sentence statuses for that person' >>
            beam.CoGroupByKey())

        sentences_converted = (
            supervision_sentences_and_statuses
            | 'Convert to state-specific sentences' >>
            beam.ParDo(ConvertSentencesToStateSpecificType()).with_outputs(
                'incarceration_sentences', 'supervision_sentences'))

        sentences_and_sentence_groups = (
            {'sentence_groups': sentence_groups,
             'incarceration_sentences': sentences_converted.incarceration_sentences,
             'supervision_sentences': sentences_converted.supervision_sentences}
            | 'Group sentences to sentence groups' >> beam.CoGroupByKey())

        # Set hydrated sentences on the corresponding sentence groups
        sentence_groups_with_hydrated_sentences = (
            sentences_and_sentence_groups
            | 'Set hydrated sentences on sentence groups' >>
            beam.ParDo(SetSentencesOnSentenceGroup()))

        # Bring in the table that associates people and their county of residence
        person_id_to_county_query = select_all_by_person_query(
            reference_dataset,
            PERSONS_TO_RECENT_COUNTY_OF_RESIDENCE_VIEW_NAME,
            # TODO(3602): Once we put state_code on StatePerson objects, we can update the
            #  persons_to_recent_county_of_residence query to have a state_code field, allowing
            #  us to also filter the output by state_code.
            state_code_filter=None,
            person_id_filter_set=person_id_filter_set)

        person_id_to_county_kv = (
            p | "Read person_id to county associations from BigQuery" >> beam.io.Read(
                beam.io.BigQuerySource(query=person_id_to_county_query,
                                       use_standard_sql=True))
            | "Convert person_id to county association table to KV" >>
            beam.ParDo(ConvertDictToKVTuple(), 'person_id'))

        # Bring in the judicial districts associated with incarceration_periods
        ip_to_judicial_district_query = select_all_by_person_query(
            reference_dataset,
            INCARCERATION_PERIOD_JUDICIAL_DISTRICT_ASSOCIATION_VIEW_NAME,
            state_code,
            person_id_filter_set)

        ip_to_judicial_district_kv = (
            p | "Read incarceration_period to judicial_district associations from BigQuery" >>
            beam.io.Read(beam.io.BigQuerySource(query=ip_to_judicial_district_query,
                                                use_standard_sql=True))
            | "Convert incarceration_period to judicial_district association table to KV" >>
            beam.ParDo(ConvertDictToKVTuple(), 'person_id'))

        # Group each StatePerson with their related entities
        person_entities = (
            {'person': persons,
             'sentence_groups': sentence_groups_with_hydrated_sentences,
             'incarceration_period_judicial_district_association': ip_to_judicial_district_kv}
            | 'Group StatePerson to SentenceGroups' >> beam.CoGroupByKey())

        # Identify IncarcerationEvents from the StatePerson's StateIncarcerationPeriods
        person_events = (
            person_entities
            | 'Classify Incarceration Events' >>
            beam.ParDo(ClassifyIncarcerationEvents(), AsDict(person_id_to_county_kv)))

        # Get pipeline job details for accessing job_id
        all_pipeline_options = apache_beam_pipeline_options.get_all_options()

        # Add timestamp for local jobs
        job_timestamp = datetime.datetime.now().strftime('%Y-%m-%d_%H_%M_%S.%f')
        all_pipeline_options['job_timestamp'] = job_timestamp

        # Get the type of metric to calculate
        metric_types_set = set(metric_types)

        # Get IncarcerationMetrics
        incarceration_metrics = (
            person_events
            | 'Get Incarceration Metrics' >> GetIncarcerationMetrics(
                pipeline_options=all_pipeline_options,
                metric_types=metric_types_set,
                calculation_end_month=calculation_end_month,
                calculation_month_count=calculation_month_count))

        if person_id_filter_set:
            logging.warning(
                "Non-empty person filter set - returning before writing metrics.")
            return

        # Convert the metrics into a format that's writable to BQ
        writable_metrics = (
            incarceration_metrics
            | 'Convert to dict to be written to BQ' >>
            beam.ParDo(RecidivizMetricWritableDict()).with_outputs(
                IncarcerationMetricType.INCARCERATION_ADMISSION.value,
                IncarcerationMetricType.INCARCERATION_POPULATION.value,
                IncarcerationMetricType.INCARCERATION_RELEASE.value))

        # Write the metrics to the output tables in BigQuery
        admissions_table_id = DATAFLOW_METRICS_TO_TABLES.get(IncarcerationAdmissionMetric)
        population_table_id = DATAFLOW_METRICS_TO_TABLES.get(IncarcerationPopulationMetric)
        releases_table_id = DATAFLOW_METRICS_TO_TABLES.get(IncarcerationReleaseMetric)

        _ = (writable_metrics.INCARCERATION_ADMISSION
             | f"Write admission metrics to BQ table: {admissions_table_id}" >>
             beam.io.WriteToBigQuery(
                 table=admissions_table_id,
                 dataset=output,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
                 method=beam.io.WriteToBigQuery.Method.FILE_LOADS))

        _ = (writable_metrics.INCARCERATION_POPULATION
             | f"Write population metrics to BQ table: {population_table_id}" >>
             beam.io.WriteToBigQuery(
                 table=population_table_id,
                 dataset=output,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
                 method=beam.io.WriteToBigQuery.Method.FILE_LOADS))

        _ = (writable_metrics.INCARCERATION_RELEASE
             | f"Write release metrics to BQ table: {releases_table_id}" >>
             beam.io.WriteToBigQuery(
                 table=releases_table_id,
                 dataset=output,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
                 method=beam.io.WriteToBigQuery.Method.FILE_LOADS))
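
# --- Illustrative sketch (not the actual helper) ---
# select_all_by_person_query, as called above, appears to build a SELECT over a
# reference view with an optional state_code filter and an optional person_id
# filter. This is a guess at its behavior reconstructed from the call sites,
# under the assumption that both filters are simple WHERE clauses; the name is
# suffixed with "_sketch" to mark it as a stand-in.
def select_all_by_person_query_sketch(dataset, table, state_code_filter,
                                      person_id_filter_set):
    query = f"SELECT * FROM `{dataset}.{table}`"
    clauses = []
    if state_code_filter:
        clauses.append(f"state_code = '{state_code_filter}'")
    if person_id_filter_set:
        # Sort for deterministic query text, which makes test assertions easier
        id_list = ', '.join(str(person_id) for person_id in sorted(person_id_filter_set))
        clauses.append(f"person_id IN ({id_list})")
    if clauses:
        query += ' WHERE ' + ' AND '.join(clauses)
    return query
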
def run(apache_beam_pipeline_options: PipelineOptions,
        data_input: str,
        reference_input: str,
        output: str,
        calculation_month_count: int,
        metric_types: List[str],
        state_code: Optional[str],
        calculation_end_month: Optional[str],
        person_filter_ids: Optional[List[int]]):
    """Runs the supervision calculation pipeline."""

    # Workaround to load SQLAlchemy objects at start of pipeline. This is necessary because the
    # BuildRootEntity function tries to access attributes of relationship properties on the
    # SQLAlchemy root_schema_class before they have been loaded. However, if *any* SQLAlchemy
    # objects have been instantiated, then the relationship properties are loaded and their
    # attributes can be successfully accessed.
    _ = schema.StatePerson()

    apache_beam_pipeline_options.view_as(SetupOptions).save_main_session = True

    # Get pipeline job details
    all_pipeline_options = apache_beam_pipeline_options.get_all_options()

    input_dataset = all_pipeline_options['project'] + '.' + data_input
    reference_dataset = all_pipeline_options['project'] + '.' + reference_input

    person_id_filter_set = set(person_filter_ids) if person_filter_ids else None

    with beam.Pipeline(options=apache_beam_pipeline_options) as p:
        # Get StatePersons
        persons = (p | 'Load Persons' >> BuildRootEntity(
            dataset=input_dataset,
            root_entity_class=entities.StatePerson,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=True,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code))

        # Get StateIncarcerationPeriods
        incarceration_periods = (p | 'Load IncarcerationPeriods' >> BuildRootEntity(
            dataset=input_dataset,
            root_entity_class=entities.StateIncarcerationPeriod,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=True,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code))

        # Get StateSupervisionViolations
        supervision_violations = (p | 'Load SupervisionViolations' >> BuildRootEntity(
            dataset=input_dataset,
            root_entity_class=entities.StateSupervisionViolation,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=True,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code))

        # TODO(2769): Don't bring this in as a root entity
        # Get StateSupervisionViolationResponses
        supervision_violation_responses = (p | 'Load SupervisionViolationResponses' >> BuildRootEntity(
            dataset=input_dataset,
            root_entity_class=entities.StateSupervisionViolationResponse,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=True,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code))

        # Get StateSupervisionSentences
        supervision_sentences = (p | 'Load SupervisionSentences' >> BuildRootEntity(
            dataset=input_dataset,
            root_entity_class=entities.StateSupervisionSentence,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=True,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code))

        # Get StateIncarcerationSentences
        incarceration_sentences = (p | 'Load IncarcerationSentences' >> BuildRootEntity(
            dataset=input_dataset,
            root_entity_class=entities.StateIncarcerationSentence,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=True,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code))

        # Get StateSupervisionPeriods
        supervision_periods = (p | 'Load SupervisionPeriods' >> BuildRootEntity(
            dataset=input_dataset,
            root_entity_class=entities.StateSupervisionPeriod,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=True,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code))

        # Get StateAssessments
        assessments = (p | 'Load Assessments' >> BuildRootEntity(
            dataset=input_dataset,
            root_entity_class=entities.StateAssessment,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=False,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code))

        # Get StateSupervisionContacts
        supervision_contacts = (p | 'Load StateSupervisionContacts' >> BuildRootEntity(
            dataset=input_dataset,
            root_entity_class=entities.StateSupervisionContact,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=False,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code))

        # Bring in the table that associates StateSupervisionViolationResponses to
        # information about StateAgents
        ssvr_to_agent_association_query = select_all_by_person_query(
            reference_dataset,
            SSVR_TO_AGENT_ASSOCIATION_VIEW_NAME,
            state_code,
            person_id_filter_set)

        ssvr_to_agent_associations = (
            p | "Read SSVR to Agent table from BigQuery" >> beam.io.Read(
                beam.io.BigQuerySource(query=ssvr_to_agent_association_query,
                                       use_standard_sql=True)))

        # Convert the association table rows into key-value tuples with the value for the
        # supervision_violation_response_id column as the key
        ssvr_agent_associations_as_kv = (
            ssvr_to_agent_associations
            | 'Convert SSVR to Agent table to KV tuples' >>
            beam.ParDo(ConvertDictToKVTuple(), 'supervision_violation_response_id'))

        supervision_period_to_agent_association_query = select_all_by_person_query(
            reference_dataset,
            SUPERVISION_PERIOD_TO_AGENT_ASSOCIATION_VIEW_NAME,
            state_code,
            person_id_filter_set)

        supervision_period_to_agent_associations = (
            p | "Read Supervision Period to Agent table from BigQuery" >> beam.io.Read(
                beam.io.BigQuerySource(query=supervision_period_to_agent_association_query,
                                       use_standard_sql=True)))

        # Convert the association table rows into key-value tuples with the value for the
        # supervision_period_id column as the key
        supervision_period_to_agent_associations_as_kv = (
            supervision_period_to_agent_associations
            | 'Convert Supervision Period to Agent table to KV tuples' >>
            beam.ParDo(ConvertDictToKVTuple(), 'supervision_period_id'))

        if state_code is None or state_code == 'US_MO':
            # Bring in the reference table that includes sentence status ranking information
            us_mo_sentence_status_query = select_all_by_person_query(
                reference_dataset,
                US_MO_SENTENCE_STATUSES_VIEW_NAME,
                state_code,
                person_id_filter_set)

            us_mo_sentence_statuses = (
                p | "Read MO sentence status table from BigQuery" >> beam.io.Read(
                    beam.io.BigQuerySource(query=us_mo_sentence_status_query,
                                           use_standard_sql=True)))
        else:
            us_mo_sentence_statuses = (
                p | f"Generate empty MO statuses list for non-MO state run: {state_code}" >>
                beam.Create([]))

        us_mo_sentence_status_rankings_as_kv = (
            us_mo_sentence_statuses
            | 'Convert MO sentence status ranking table to KV tuples' >>
            beam.ParDo(ConvertDictToKVTuple(), 'person_id'))

        sentences_and_statuses = (
            {'incarceration_sentences': incarceration_sentences,
             'supervision_sentences': supervision_sentences,
             'sentence_statuses': us_mo_sentence_status_rankings_as_kv}
            | 'Group sentences to the sentence statuses for that person' >>
            beam.CoGroupByKey())

        sentences_converted = (
            sentences_and_statuses
            | 'Convert to state-specific sentences' >>
            beam.ParDo(ConvertSentencesToStateSpecificType()).with_outputs(
                'incarceration_sentences', 'supervision_sentences'))

        # Bring in the judicial districts associated with supervision_periods
        sp_to_judicial_district_query = select_all_by_person_query(
            reference_dataset,
            SUPERVISION_PERIOD_JUDICIAL_DISTRICT_ASSOCIATION_VIEW_NAME,
            state_code,
            person_id_filter_set)

        sp_to_judicial_district_kv = (
            p | "Read supervision_period to judicial_district associations from BigQuery" >>
            beam.io.Read(beam.io.BigQuerySource(query=sp_to_judicial_district_query,
                                                use_standard_sql=True))
            | "Convert supervision_period to judicial_district association table to KV" >>
            beam.ParDo(ConvertDictToKVTuple(), 'person_id'))

        # Group StateSupervisionViolationResponses and StateSupervisionViolations by person_id
        supervision_violations_and_responses = (
            {'violations': supervision_violations,
             'violation_responses': supervision_violation_responses}
            | 'Group StateSupervisionViolationResponses to '
              'StateSupervisionViolations' >> beam.CoGroupByKey())

        # Set the fully hydrated StateSupervisionViolation entities on the corresponding
        # StateSupervisionViolationResponses
        violation_responses_with_hydrated_violations = (
            supervision_violations_and_responses
            | 'Set hydrated StateSupervisionViolations on '
              'the StateSupervisionViolationResponses' >>
            beam.ParDo(SetViolationOnViolationsResponse()))

        # Group StateIncarcerationPeriods and StateSupervisionViolationResponses by person_id
        incarceration_periods_and_violation_responses = (
            {'incarceration_periods': incarceration_periods,
             'violation_responses': violation_responses_with_hydrated_violations}
            | 'Group StateIncarcerationPeriods to '
              'StateSupervisionViolationResponses' >> beam.CoGroupByKey())

        # Set the fully hydrated StateSupervisionViolationResponse entities on the corresponding
        # StateIncarcerationPeriods
        incarceration_periods_with_source_violations = (
            incarceration_periods_and_violation_responses
            | 'Set hydrated StateSupervisionViolationResponses on '
              'the StateIncarcerationPeriods' >>
            beam.ParDo(SetViolationResponseOnIncarcerationPeriod()))

        # Group each StatePerson with their related entities
        person_entities = (
            {'person': persons,
             'assessments': assessments,
             'incarceration_periods': incarceration_periods_with_source_violations,
             'supervision_periods': supervision_periods,
             'supervision_sentences': sentences_converted.supervision_sentences,
             'incarceration_sentences': sentences_converted.incarceration_sentences,
             'violation_responses': violation_responses_with_hydrated_violations,
             'supervision_contacts': supervision_contacts,
             'supervision_period_judicial_district_association': sp_to_judicial_district_kv}
            | 'Group StatePerson to all entities' >> beam.CoGroupByKey())

        # Identify SupervisionTimeBuckets from the StatePerson's StateSupervisionSentences
        # and StateIncarcerationPeriods
        person_time_buckets = (
            person_entities
            | 'Get SupervisionTimeBuckets' >>
            beam.ParDo(ClassifySupervisionTimeBuckets(),
                       AsDict(ssvr_agent_associations_as_kv),
                       AsDict(supervision_period_to_agent_associations_as_kv)))

        # Get pipeline job details for accessing job_id
        all_pipeline_options = apache_beam_pipeline_options.get_all_options()

        # Get the type of metric to calculate
        metric_types_set = set(metric_types)

        # Add timestamp for local jobs
        job_timestamp = datetime.datetime.now().strftime('%Y-%m-%d_%H_%M_%S.%f')
        all_pipeline_options['job_timestamp'] = job_timestamp

        # Get supervision metrics
        supervision_metrics = (
            person_time_buckets
            | 'Get Supervision Metrics' >> GetSupervisionMetrics(
                pipeline_options=all_pipeline_options,
                metric_types=metric_types_set,
                calculation_end_month=calculation_end_month,
                calculation_month_count=calculation_month_count))

        if person_id_filter_set:
            logging.warning(
                "Non-empty person filter set - returning before writing metrics.")
            return

        # Convert the metrics into a format that's writable to BQ
        writable_metrics = (
            supervision_metrics
            | 'Convert to dict to be written to BQ' >>
            beam.ParDo(RecidivizMetricWritableDict()).with_outputs(
                SupervisionMetricType.SUPERVISION_COMPLIANCE.value,
                SupervisionMetricType.SUPERVISION_POPULATION.value,
                SupervisionMetricType.SUPERVISION_REVOCATION.value,
                SupervisionMetricType.SUPERVISION_REVOCATION_ANALYSIS.value,
                SupervisionMetricType.SUPERVISION_REVOCATION_VIOLATION_TYPE_ANALYSIS.value,
                SupervisionMetricType.SUPERVISION_SUCCESS.value,
                SupervisionMetricType.SUPERVISION_SUCCESSFUL_SENTENCE_DAYS_SERVED.value,
                SupervisionMetricType.SUPERVISION_TERMINATION.value))

        # Write the metrics to the output tables in BigQuery
        terminations_table_id = DATAFLOW_METRICS_TO_TABLES.get(SupervisionTerminationMetric)
        compliance_table_id = DATAFLOW_METRICS_TO_TABLES.get(SupervisionCaseComplianceMetric)
        populations_table_id = DATAFLOW_METRICS_TO_TABLES.get(SupervisionPopulationMetric)
        revocations_table_id = DATAFLOW_METRICS_TO_TABLES.get(SupervisionRevocationMetric)
        revocation_analysis_table_id = DATAFLOW_METRICS_TO_TABLES.get(
            SupervisionRevocationAnalysisMetric)
        revocation_violation_type_analysis_table_id = DATAFLOW_METRICS_TO_TABLES.get(
            SupervisionRevocationViolationTypeAnalysisMetric)
        successes_table_id = DATAFLOW_METRICS_TO_TABLES.get(SupervisionSuccessMetric)
        successful_sentence_lengths_table_id = DATAFLOW_METRICS_TO_TABLES.get(
            SuccessfulSupervisionSentenceDaysServedMetric)

        _ = (writable_metrics.SUPERVISION_POPULATION
             | f"Write population metrics to BQ table: {populations_table_id}" >>
             beam.io.WriteToBigQuery(
                 table=populations_table_id,
                 dataset=output,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
                 method=beam.io.WriteToBigQuery.Method.FILE_LOADS))

        _ = (writable_metrics.SUPERVISION_REVOCATION
             | f"Write revocation metrics to BQ table: {revocations_table_id}" >>
             beam.io.WriteToBigQuery(
                 table=revocations_table_id,
                 dataset=output,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
                 method=beam.io.WriteToBigQuery.Method.FILE_LOADS))

        _ = (writable_metrics.SUPERVISION_SUCCESS
             | f"Write success metrics to BQ table: {successes_table_id}" >>
             beam.io.WriteToBigQuery(
                 table=successes_table_id,
                 dataset=output,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
                 method=beam.io.WriteToBigQuery.Method.FILE_LOADS))

        _ = (writable_metrics.SUPERVISION_SUCCESSFUL_SENTENCE_DAYS_SERVED
             | f"Write supervision successful sentence length metrics to BQ"
               f" table: {successful_sentence_lengths_table_id}" >>
             beam.io.WriteToBigQuery(
                 table=successful_sentence_lengths_table_id,
                 dataset=output,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
                 method=beam.io.WriteToBigQuery.Method.FILE_LOADS))

        _ = (writable_metrics.SUPERVISION_TERMINATION
             | f"Write termination metrics to BQ table: {terminations_table_id}" >>
             beam.io.WriteToBigQuery(
                 table=terminations_table_id,
                 dataset=output,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
                 method=beam.io.WriteToBigQuery.Method.FILE_LOADS))

        _ = (writable_metrics.SUPERVISION_REVOCATION_ANALYSIS
             | f"Write revocation analyses metrics to BQ table: {revocation_analysis_table_id}" >>
             beam.io.WriteToBigQuery(
                 table=revocation_analysis_table_id,
                 dataset=output,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
                 method=beam.io.WriteToBigQuery.Method.FILE_LOADS))

        _ = (writable_metrics.SUPERVISION_REVOCATION_VIOLATION_TYPE_ANALYSIS
             | f"Write revocation violation type analyses metrics to BQ table: "
               f"{revocation_violation_type_analysis_table_id}" >>
             beam.io.WriteToBigQuery(
                 table=revocation_violation_type_analysis_table_id,
                 dataset=output,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
                 method=beam.io.WriteToBigQuery.Method.FILE_LOADS))

        _ = (writable_metrics.SUPERVISION_COMPLIANCE
             | f"Write compliance metrics to BQ table: {compliance_table_id}" >>
             beam.io.WriteToBigQuery(
                 table=compliance_table_id,
                 dataset=output,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
                 method=beam.io.WriteToBigQuery.Method.FILE_LOADS))
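
# --- Illustrative sketch of the hydration pattern used above ---
# DoFns like SetViolationOnViolationsResponse consume CoGroupByKey output of
# the shape (person_id, {tag: iterable-of-entities}) and stitch one entity
# type onto another before re-emitting keyed values. The entity field names
# below (supervision_violation_id, supervision_violation) are assumptions
# inferred from the schema naming in this file, not the actual DoFn.
class SetViolationOnViolationsResponseSketch(beam.DoFn):
    """Attaches hydrated StateSupervisionViolations to their responses."""

    def process(self, element):
        person_id, groups = element
        violations_by_id = {
            violation.supervision_violation_id: violation
            for violation in groups['violations']
        }
        for response in groups['violation_responses']:
            hydrated = violations_by_id.get(response.supervision_violation_id)
            if hydrated is not None:
                # Swap the shallow reference for the fully hydrated entity
                response.supervision_violation = hydrated
            yield person_id, response
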
def execute_pipeline(
    self,
    pipeline: beam.Pipeline,
    all_pipeline_options: Dict[str, Any],
    state_code: str,
    input_dataset: str,
    reference_dataset: str,
    static_reference_dataset: str,
    metric_types: List[str],
    person_id_filter_set: Optional[Set[int]],
    calculation_month_count: int = -1,
    calculation_end_month: Optional[str] = None,
) -> beam.pvalue.PCollection:
    # Get StatePersons
    persons = pipeline | "Load Persons" >> BuildRootEntity(
        dataset=input_dataset,
        root_entity_class=entities.StatePerson,
        unifying_id_field=entities.StatePerson.get_class_id_name(),
        build_related_entities=True,
        unifying_id_field_filter_set=person_id_filter_set,
        state_code=state_code,
    )

    # Get StateProgramAssignments
    program_assignments = pipeline | "Load Program Assignments" >> BuildRootEntity(
        dataset=input_dataset,
        root_entity_class=entities.StateProgramAssignment,
        unifying_id_field=entities.StatePerson.get_class_id_name(),
        build_related_entities=True,
        unifying_id_field_filter_set=person_id_filter_set,
        state_code=state_code,
    )

    # Get StateAssessments
    assessments = pipeline | "Load Assessments" >> BuildRootEntity(
        dataset=input_dataset,
        root_entity_class=entities.StateAssessment,
        unifying_id_field=entities.StatePerson.get_class_id_name(),
        build_related_entities=False,
        unifying_id_field_filter_set=person_id_filter_set,
        state_code=state_code,
    )

    # Get StateSupervisionPeriods
    supervision_periods = pipeline | "Load SupervisionPeriods" >> BuildRootEntity(
        dataset=input_dataset,
        root_entity_class=entities.StateSupervisionPeriod,
        unifying_id_field=entities.StatePerson.get_class_id_name(),
        build_related_entities=False,
        unifying_id_field_filter_set=person_id_filter_set,
        state_code=state_code,
    )

    supervision_period_to_agent_associations_as_kv = (
        pipeline
        | "Load supervision_period_to_agent_associations_as_kv"
        >> ImportTableAsKVTuples(
            dataset_id=reference_dataset,
            table_id=SUPERVISION_PERIOD_TO_AGENT_ASSOCIATION_VIEW_NAME,
            table_key="person_id",
            state_code_filter=state_code,
            person_id_filter_set=person_id_filter_set,
        )
    )

    state_race_ethnicity_population_counts = (
        pipeline
        | "Load state_race_ethnicity_population_counts"
        >> ImportTable(
            dataset_id=static_reference_dataset,
            table_id="state_race_ethnicity_population_counts",
            state_code_filter=state_code,
            person_id_filter_set=None,
        )
    )

    # Group each StatePerson with their other entities
    persons_entities = {
        "person": persons,
        "program_assignments": program_assignments,
        "assessments": assessments,
        "supervision_periods": supervision_periods,
        "supervision_period_to_agent_association": supervision_period_to_agent_associations_as_kv,
    } | "Group StatePerson to StateProgramAssignments and" >> beam.CoGroupByKey()

    # Identify ProgramEvents from the StatePerson's StateProgramAssignments
    person_program_events = persons_entities | beam.ParDo(
        ClassifyEvents(), identifier=self.pipeline_config.identifier
    )

    person_metadata = (
        persons
        | "Build the person_metadata dictionary"
        >> beam.ParDo(
            BuildPersonMetadata(),
            state_race_ethnicity_population_counts=AsList(
                state_race_ethnicity_population_counts
            ),
        )
    )

    person_program_events_with_metadata = (
        {
            "person_events": person_program_events,
            "person_metadata": person_metadata,
        }
        | "Group ProgramEvents with person-level metadata" >> beam.CoGroupByKey()
        | "Organize StatePerson, PersonMetadata and ProgramEvents for calculations"
        >> beam.ParDo(ExtractPersonEventsMetadata())
    )

    # Add timestamp for local jobs
    job_timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H_%M_%S.%f")
    all_pipeline_options["job_timestamp"] = job_timestamp

    # Get the type of metric to calculate
    metric_types_set = set(metric_types)

    # Get program metrics
    program_metrics = (
        person_program_events_with_metadata
        | "Get Program Metrics"
        >> GetMetrics(
            pipeline_options=all_pipeline_options,
            pipeline_config=self.pipeline_config,
            metric_types_to_include=metric_types_set,
            calculation_end_month=calculation_end_month,
            calculation_month_count=calculation_month_count,
        )
    )

    return program_metrics
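
# --- Hypothetical driver for the class-based execute_pipeline() above ---
# Assumes the surrounding pipeline class wires its datasets the same way the
# standalone run() functions do; the helper name, dataset names, state code,
# and metric selection are placeholders, not real Recidiviz code.
def run_with_class_based_pipeline(pipeline_runner, options: PipelineOptions) -> None:
    all_options = options.get_all_options()
    project_id = all_options["project"]

    with beam.Pipeline(options=options) as p:
        metrics = pipeline_runner.execute_pipeline(
            pipeline=p,
            all_pipeline_options=all_options,
            state_code="US_XX",  # placeholder state code
            input_dataset=f"{project_id}.state",
            reference_dataset=f"{project_id}.reference_views",
            static_reference_dataset=f"{project_id}.static_reference_tables",
            metric_types=["ALL"],  # placeholder metric selection
            person_id_filter_set=None,
        )
        # Writing `metrics` to BigQuery is left to the caller here, mirroring
        # how the standalone run() functions append to the output dataset.
        _ = metrics
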