def create_groups(group_ids, corpus, word, ignore_corpus, ignore_word):
    """Generate groups given the input PCollections.

    Every element of ``group_ids`` is paired with a randomly chosen corpus
    value and then extended with a randomly chosen word value.

    Args:
      group_ids: main-input PCollection of group identifiers.
      corpus: PCollection of dict-like rows, consumed as a list side input;
        the first value of a randomly chosen row is used.
      word: PCollection of dict-like rows, consumed the same way as ``corpus``.
      ignore_corpus: PCollection consumed as a singleton side input; the
        corpus value that must never be selected.
      ignore_word: PCollection consumed as a singleton side input; the word
        value that must never be selected.

    Returns:
      A PCollection of ``(group, corpus_value, word_value)`` tuples.

    Note:
      Selection is done by rejection sampling, so if *every* row equals the
      ignored value the sampling never terminates.
    """

    def pick_value(rows, ignore):
        """Return the first value of a random row that differs from `ignore`."""
        num_rows = len(rows)
        if not num_rows:
            # randrange(0, 0) would raise an opaque "empty range" ValueError.
            raise ValueError('cannot select a value from an empty side input')
        while True:
            candidate = list(rows[randrange(0, num_rows)].values())[0]
            # Compare against `ignore` only; a falsy candidate ('' or 0) is a
            # legitimate pick and must not trigger another draw.
            if candidate != ignore:
                return candidate

    def attach_corpus_fn(group, corpus, ignore):
        yield (group, pick_value(corpus, ignore))

    def attach_word_fn(group, words, ignore):
        # `group` is the (group, corpus) tuple produced by the previous step.
        yield group + (pick_value(words, ignore), )

    return (
        group_ids
        | 'attach corpus' >> beam.FlatMap(
            attach_corpus_fn, AsList(corpus), AsSingleton(ignore_corpus))
        | 'attach word' >> beam.FlatMap(
            attach_word_fn, AsList(word), AsSingleton(ignore_word)))
def main(argv=None):
    """Run a tiny pipeline in which one step's output feeds another's side input.

    'Output1' pairs each element with the full input as a list side input;
    'Output2' logs each element together with the list of 'Output1' results.
    """
    pipeline = Pipeline(options=PipelineOptions(argv))

    numbers = pipeline | 'Input' >> beam.Create([1, 2, 3], reshuffle=False)
    paired = numbers | 'Output1' >> beam.Map(
        lambda x, side: (x, side), AsList(numbers))

    # The result of this step is intentionally unused; it only logs.
    numbers | 'Output2' >> beam.Map(
        lambda x, side: logging.info('x: %s, side: %s', x, side),
        AsList(paired))

    pipeline.run()
def test_sdf_with_side_inputs(self):
    """ExpandStrings with three side inputs yields '<elem>:<1..5>' per element."""
    # Every main element is expected to be combined with the numbers 1..5.
    expected_result = [
        c + ':' + str(number)
        for c in ['a', 'b', 'c']
        for number in range(1, 6)
    ]

    with TestPipeline() as p:
        side1 = p | 'Create1' >> Create(['1', '2'])
        side2 = p | 'Create2' >> Create(['3', '4'])
        side3 = p | 'Create3' >> Create(['5'])

        main_input = p | 'create_main' >> beam.Create(['a', 'b', 'c'])
        result = main_input | beam.ParDo(
            ExpandStrings(),
            AsList(side1),
            AsList(side2),
            AsSingleton(side3))

        assert_that(result, equal_to(expected_result))
def test_side_inputs(self):
    """Visiting a pipeline with one AsList side input records exactly one view."""

    class SplitNumbersFn(DoFn):
        def process(self, element):
            # Negative numbers go to the tagged output, the rest to the main one.
            if element < 0:
                yield pvalue.OutputValue('tag_negative', element)
            else:
                yield element

    class ProcessNumbersFn(DoFn):
        def process(self, element, negatives):
            # `negatives` is the side input; only its presence matters here.
            yield element

    class DummySource(iobase.BoundedSource):
        pass

    root_read = Read(DummySource())

    result = (
        self.pipeline
        | 'read' >> root_read
        | ParDo(SplitNumbersFn()).with_outputs(
            'tag_negative', main='positive'))
    positive, negative = result
    positive | ParDo(ProcessNumbersFn(), AsList(negative))

    self.pipeline.visit(self.visitor)

    root_transforms = sorted(
        [t.transform for t in self.visitor.root_transforms])
    self.assertEqual(root_transforms, sorted([root_read]))
    self.assertEqual(len(self.visitor.step_names), 3)
    self.assertEqual(len(self.visitor.views), 1)
    # assertIsInstance reports the actual type on failure, unlike
    # assertTrue(isinstance(...)) which only says "False is not true".
    self.assertIsInstance(self.visitor.views[0], pvalue.AsList)
def test_side_inputs(self):
    """Visiting a pipeline with one AsList side input records one list view."""

    class SplitNumbersFn(NewDoFn):
        def process(self, element):
            # Negative numbers go to the tagged output, the rest to the main one.
            if element < 0:
                yield pvalue.SideOutputValue('tag_negative', element)
            else:
                yield element

    class ProcessNumbersFn(NewDoFn):
        def process(self, element, negatives):
            # `negatives` is the side input; only its presence matters here.
            yield element

    root_create = Create('create', [[-1, 2, 3]])

    result = (
        self.pipeline
        | root_create
        | ParDo(SplitNumbersFn()).with_outputs(
            'tag_negative', main='positive'))
    positive, negative = result
    positive | ParDo(ProcessNumbersFn(), AsList(negative))

    self.pipeline.visit(self.visitor)

    root_transforms = sorted(
        [t.transform for t in self.visitor.root_transforms])
    self.assertEqual(root_transforms, sorted([root_create]))
    self.assertEqual(len(self.visitor.step_names), 4)
    self.assertEqual(len(self.visitor.views), 1)
    # assertIsInstance reports the actual type on failure, unlike
    # assertTrue(isinstance(...)) which only says "False is not true".
    self.assertIsInstance(
        self.visitor.views[0], pvalue.ListPCollectionView)
def test_side_inputs(self):
    """Visiting a pipeline with one AsList side input records exactly one view."""

    class SplitNumbersFn(DoFn):
        def process(self, element):
            # Negative numbers go to the tagged output, the rest to the main one.
            if element < 0:
                yield pvalue.TaggedOutput('tag_negative', element)
            else:
                yield element

    class ProcessNumbersFn(DoFn):
        def process(self, element, negatives):
            # `negatives` is the side input; only its presence matters here.
            yield element

    root_read = beam.Impulse()

    result = (
        self.pipeline
        | 'read' >> root_read
        | ParDo(SplitNumbersFn()).with_outputs(
            'tag_negative', main='positive'))
    positive, negative = result
    positive | ParDo(ProcessNumbersFn(), AsList(negative))

    self.pipeline.visit(self.visitor)

    root_transforms = [t.transform for t in self.visitor.root_transforms]
    self.assertEqual(root_transforms, [root_read])
    self.assertEqual(len(self.visitor.step_names), 3)
    self.assertEqual(len(self.visitor.views), 1)
    # assertIsInstance reports the actual type on failure, unlike
    # assertTrue(isinstance(...)) which only says "False is not true".
    self.assertIsInstance(self.visitor.views[0], pvalue.AsList)
def test_pcollectionview_not_recreated(self):
    """Views built from the same arguments compare equal; others do not."""
    pipeline = Pipeline('DirectRunner')
    numbers = pipeline | 'create1' >> Create([1, 2, 3])
    pairs = pipeline | 'create2' >> Create([(1, 1), (2, 2), (3, 3)])
    other_pairs = pipeline | 'create3' >> Create([(1, 1), (2, 2), (3, 3)])

    # Same PCollection and same arguments -> equal views.
    self.assertEqual(AsSingleton(numbers), AsSingleton(numbers))
    self.assertEqual(
        AsSingleton('new', numbers, default_value=1),
        AsSingleton('new', numbers, default_value=1))
    self.assertNotEqual(
        AsSingleton(numbers),
        AsSingleton('new', numbers, default_value=1))
    self.assertEqual(AsIter(numbers), AsIter(numbers))
    self.assertEqual(AsList(numbers), AsList(numbers))
    self.assertEqual(AsDict(pairs), AsDict(pairs))

    # Different underlying PCollections -> different views, even when the
    # two PCollections were created from identical contents.
    self.assertNotEqual(AsSingleton(numbers), AsSingleton(pairs))
    self.assertNotEqual(AsIter(numbers), AsIter(pairs))
    self.assertNotEqual(AsList(numbers), AsList(pairs))
    self.assertNotEqual(AsDict(pairs), AsDict(other_pairs))
def test_pardo_unfusable_side_inputs(self):
    """A PCollection can be used as a side input to a ParDo on itself.

    The first pipeline uses the PCollection directly as the side input; the
    second derives the side input from it through Flatten and GroupByKey.
    """

    def cross_product(elem, sides):
        for side in sides:
            yield elem, side

    expected = [('a', 'a'), ('a', 'b'), ('b', 'a'), ('b', 'b')]

    with self.create_pipeline() as p:
        pcoll = p | beam.Create(['a', 'b'])
        assert_that(
            pcoll | beam.FlatMap(cross_product, AsList(pcoll)),
            equal_to(expected))

    with self.create_pipeline() as p:
        pcoll = p | beam.Create(['a', 'b'])
        derived = (
            (pcoll, )
            | beam.Flatten()
            | beam.Map(lambda x: (x, x))
            | beam.GroupByKey()
            # Index into the (key, values) pair instead of unpacking it in the
            # lambda signature: tuple parameters are a SyntaxError on Python 3.
            | 'Unkey' >> beam.Map(lambda kv: kv[0]))
        assert_that(
            pcoll | beam.FlatMap(cross_product, AsList(derived)),
            equal_to(expected))
def test_pardo_side_inputs(self):
    """FlatMap with a list side input yields the full cross product."""

    def cross_product(elem, sides):
        for side in sides:
            yield elem, side

    # Every main element paired with every side element.
    expected = [
        (elem, side)
        for side in ['x', 'y']
        for elem in ['a', 'b', 'c']
    ]

    with self.create_pipeline() as p:
        main_pcoll = p | 'main' >> beam.Create(['a', 'b', 'c'])
        side_pcoll = p | 'side' >> beam.Create(['x', 'y'])
        crossed = main_pcoll | beam.FlatMap(cross_product, AsList(side_pcoll))
        assert_that(crossed, equal_to(expected))
def run(argv=None):
    """Build and run the streaming z-score pipeline.

    Statistics are combined globally from the text file given by
    ``--input_file`` and passed as a list side input to ``calculate_zscore``,
    which is applied to messages read from ``--input_topic``. Results are
    written to BigQuery. Blocks until the pipeline finishes.

    Args:
      argv: command-line arguments; unrecognised ones are forwarded to
        ``PipelineOptions``.
    """
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument(
        '--input_file',
        required=True,
        help=('Input file in the form of gcs bucket url'))
    arg_parser.add_argument(
        '--input_topic',
        required=True,
        help=('Input PubSub topic of the form '))
    args, beam_args = arg_parser.parse_known_args(argv)

    options = PipelineOptions(beam_args)
    pipeline = beam.Pipeline(options=options)
    options.view_as(SetupOptions).save_main_session = True
    options.view_as(StandardOptions).streaming = True

    # Live orders arrive as bytes on PubSub and are decoded to text.
    pubsub_source = beam.io.ReadFromPubSub(
        topic=args.input_topic).with_output_types(bytes)
    live_sales = (
        pipeline
        | 'Read From PubSub' >> pubsub_source
        | 'decode' >> beam.Map(lambda x: x.decode('utf-8')))

    # Historical orders come from a text file whose first line is skipped.
    historical_sales = pipeline | beam.io.ReadFromText(
        args.input_file, skip_header_lines=1)

    order_stats = (
        historical_sales
        | 'Split & Collect order_size' >> beam.ParDo(CollectOrderSize())
        | 'Calcuate Mean & Standard Deviation' >> beam.CombineGlobally(
            StandardDeviation()))

    bigquery_sink = beam.io.WriteToBigQuery(
        'spikey-gcp:spikey_orders.order_zscores', schema=spikey_schema)
    current_asp = (
        live_sales
        | 'Get Current Order size' >> beam.ParDo(CollectOrderTuple())
        | 'Calcuate Z-score' >> beam.Map(
            calculate_zscore, AsList(order_stats))
        | 'Write to Bigquery' >> bigquery_sink)

    result = pipeline.run()
    result.wait_until_finish()
def test_side_inputs(self):
    """One AsList side input shared by two ParDos is recorded as one view."""

    class SplitNumbersFn(DoFn):
        def process(self, element):
            # Negative numbers go to the tagged output, the rest to the main one.
            if element < 0:
                yield pvalue.TaggedOutput('tag_negative', element)
            else:
                yield element

    class ProcessNumbersFn(DoFn):
        def process(self, element, negatives):
            # `negatives` is the side input; only its presence matters here.
            yield element

    def _process_numbers(pcoll, negatives):
        # The same side-input object is handed to both ParDo steps.
        first_output = (
            pcoll
            | 'process numbers step 1' >> ParDo(ProcessNumbersFn(), negatives))
        second_output = (
            first_output
            | 'process numbers step 2' >> ParDo(ProcessNumbersFn(), negatives))
        output_pc = (
            (first_output, second_output)
            | 'flatten results' >> beam.Flatten())
        return output_pc

    root_read = beam.Impulse()

    result = (
        self.pipeline
        | 'read' >> root_read
        | ParDo(SplitNumbersFn()).with_outputs(
            'tag_negative', main='positive'))
    positive, negative = result
    _process_numbers(positive, AsList(negative))

    self.pipeline.visit(self.visitor)

    root_transforms = [t.transform for t in self.visitor.root_transforms]
    self.assertEqual(root_transforms, [root_read])
    self.assertEqual(len(self.visitor.step_names), 5)
    self.assertEqual(len(self.visitor.views), 1)
    # assertIsInstance reports the actual type on failure, unlike
    # assertTrue(isinstance(...)) which only says "False is not true".
    self.assertIsInstance(self.visitor.views[0], pvalue.AsList)
# Computes the schema of the parquet output by applying the column renames to
# the schema of the source file.
def getSchema():
    """Return the source pyarrow schema with the configured columns renamed.

    The schema is read from ``user_options.schema_source`` and the renames
    come from ``user_options.rename_columns``, a literal mapping of
    ``{old_name: new_name}``. Field types are left unchanged.
    """
    source_frame = pd.read_parquet(user_options.schema_source.get())
    schema = pyarrow.Schema.from_pandas(source_frame)
    renames = ast.literal_eval(user_options.rename_columns.get())
    for old_name, new_name in renames.items():
        position = schema.get_field_index(old_name)
        renamed_field = pyarrow.field(new_name, schema.types[position])
        schema = schema.set(position, renamed_field)
    return schema


# Reads the source parquet files and computes the (deduplicated) mapping of
# columns to rename.
map_rename_cols = (
    p
    | "Read for rename cols" >> ReadFromParquet(user_options.url_raw)
    | "Map rename cols" >> beam.Map(mapRenameCols)
    | "Rename cols to string" >> beam.Map(str)
    | "Deduplicate elements" >> beam.Distinct())

# Reads the data from the source files.
data = (
    p
    | "Read parquet for data" >> ReadFromParquet(user_options.url_raw))

# Applies the column-renaming function; the result of the mapping step above
# is passed in as a list side input.
rename_data = (
    data
    | "Rename columns" >> beam.Map(
        reColumns, rename_cols=AsList(map_rename_cols)))

# Writes the data to the destination path, taking the schema from getSchema().
_ = (
    rename_data
    | "Write to storage TRN" >> WriteToParquet(
        user_options.url_trn,
        schema=getSchema(),
        file_name_suffix=".parquet"))

print("End Pipeline")
def main():
    """Build and run the MusicBrainz join pipeline on Dataflow.

    Reads the gender, area, artist, recording and artist_credit_name JSON
    files, joins artists to their credits and then to recordings with
    CoGroupByKey, and writes the result to the BigQuery table selected by
    ``--dataset`` / ``--table`` (truncating any existing contents).
    """
    # Configure logging before anything else so that INFO messages emitted
    # while the pipeline is built and run are not dropped.
    logging.getLogger().setLevel(logging.INFO)

    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--dataset',
        default='musicbrainz',
        help='BigQuery dataset name'
    )
    parser.add_argument(
        '--table',
        default='recordings_by_artists_dataflow',
        help='BiqQuery table'
    )
    args, argv = parser.parse_known_args()

    pipeline_options = PipelineOptions(argv)
    pipeline_options.view_as(StandardOptions).runner = 'DataflowRunner'

    gcp_options = pipeline_options.view_as(GoogleCloudOptions)
    if not gcp_options.job_name:
        gcp_options.job_name = 'music-job'

    worker_options = pipeline_options.view_as(WorkerOptions)
    # Normalise an unset/falsy value to an explicit False.
    if not worker_options.use_public_ips:
        worker_options.use_public_ips = False

    table_spec = bigquery.TableReference(projectId=gcp_options.project,
                                         datasetId=args.dataset,
                                         tableId=args.table)

    table_schema = {
        'fields': [
            {'name': 'id', 'mode': 'NULLABLE', 'type': 'INTEGER'},
            {'name': 'artist_gid', 'mode': 'NULLABLE', 'type': 'STRING'},
            {'name': 'artist_name', 'mode': 'NULLABLE', 'type': 'STRING'},
            {'name': 'area', 'mode': 'NULLABLE', 'type': 'STRING'},
            {'name': 'gender', 'mode': 'NULLABLE', 'type': 'STRING'},
            {'name': 'artist_credit', 'mode': 'NULLABLE', 'type': 'INTEGER'},
            {'name': 'recording_name', 'mode': 'NULLABLE', 'type': 'STRING'},
            {'name': 'length', 'mode': 'NULLABLE', 'type': 'INTEGER'},
            {'name': 'recording_gid', 'mode': 'NULLABLE', 'type': 'STRING'},
            {'name': 'video', 'mode': 'NULLABLE', 'type': 'BOOLEAN'},
        ]
    }

    with beam.Pipeline(options=pipeline_options) as pipeline:
        # Lookup tables, later passed to the artist step as list side inputs.
        gender = (
            pipeline
            | 'Read gender' >> beam.io.ReadFromText(
                'gs://solutions-public-assets/bqetl/gender.json')
            | 'Process gender' >> beam.Map(process_gender_or_area))

        area = (
            pipeline
            | 'Read area' >> beam.io.ReadFromText(
                'gs://solutions-public-assets/bqetl/area.json')
            | 'Process area' >> beam.Map(process_gender_or_area))

        artists = (
            pipeline
            | 'Read Artists' >> beam.io.ReadFromText(
                'gs://solutions-public-assets/bqetl/artist.json')
            | 'Convert artist from json to dict' >> beam.Map(
                lambda e: json.loads(e))
            | 'Process artists' >> beam.Map(
                process_artists, AsList(gender), AsList(area)))

        recordings = (
            pipeline
            | 'Read Recordings' >> beam.io.ReadFromText(
                'gs://solutions-public-assets/bqetl/recording.json')
            | 'Process recording' >> beam.Map(process_recording))

        artist_credit_name = (
            pipeline
            | 'Read Artists Credit Name' >> beam.io.ReadFromText(
                'gs://solutions-public-assets/bqetl/artist_credit_name.json')
            | 'Process artist credit name' >> beam.Map(process_artist_credit))

        # Joining artist and artist_credit_name
        # SELECT artist.id,
        #        artist.gid as artist_gid,
        #        artist.name as artist_name,
        #        artist.area,
        #        artist_credit_name.artist_credit
        # FROM datafusion-dataproc-tutorial.musicbrainz.artist as artist
        # INNER JOIN datafusion-dataproc-tutorial.musicbrainz.artist_credit_name AS artist_credit_name
        # ON artist.id = artist_credit_name.artist
        #
        joined_artist_and_artist_credit_name = (
            ({
                'artists': artists,
                'artist_credit_name': artist_credit_name})
            | 'Merge artist and artist_credit_name to intermitent' >>
            beam.CoGroupByKey()
            | 'UnSetCoGroup intermitent' >> beam.ParDo(
                UnSetCoGroup(), 'artists', 'artist_credit_name', 'artist')
            # Re-key on artist_credit for the join with recordings below.
            | 'Map artist_credit to dict element' >> beam.Map(
                lambda e: (e['artist_credit'], e)))

        # Joining previous table with recordings
        # SELECT intermitent.id,
        #        intermitent.artist_gid,
        #        intermitent.artist_name,
        #        intermitent.area,
        #        intermitent.artist_credit,
        #        recording.recording_name,
        #        recording.length,
        #        recording.video
        # FROM datafusion-dataproc-tutorial.musicbrainz.intermitents as intermitent
        # INNER JOIN datafusion-dataproc-tutorial.musicbrainz.recording AS recording
        # ON intermitent.artist_credit = recording.artist_credit
        #
        joined_artist_and_artist_credit_name_and_recording = (
            ({
                'joined_artist_and_artist_credit_name':
                    joined_artist_and_artist_credit_name,
                'recordings': recordings})
            | 'Merge intermitent and recording' >> beam.CoGroupByKey()
            | 'UnSetCoGroup final' >> beam.ParDo(
                UnSetCoGroup(),
                'joined_artist_and_artist_credit_name',
                'recordings',
                'artist_credit')
            | 'Write To BQ' >> beam.io.WriteToBigQuery(
                table_spec,
                schema=table_schema,
                write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE,
                create_disposition=(
                    beam.io.BigQueryDisposition.CREATE_IF_NEEDED)))
def run_test_pipeline(
        fake_person_id: int,
        state_code: str,
        dataset: str,
        expected_metric_types: Set[IncarcerationMetricType],
        allow_empty: bool = False,
        unifying_id_field_filter_set: Optional[Set[int]] = None,
        metric_types_filter: Optional[Set[str]] = None):
    """Runs a test version of the incarceration pipeline.

    Builds the pipeline on a TestPipeline using in-memory stand-ins for the
    reference tables, asserts on the produced metrics and runs it.
    """
    test_pipeline = TestPipeline()

    def _load_root_entity(label, entity_class, **extra_kwargs):
        """Load `entity_class` entities keyed on the StatePerson id field."""
        return (
            test_pipeline
            | label >>  # type: ignore
            extractor_utils.BuildRootEntity(
                dataset=dataset,
                root_entity_class=entity_class,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=True,
                **extra_kwargs))

    # Get StatePersons (loaded without the id filter)
    persons = _load_root_entity('Load Persons', entities.StatePerson)

    # Get StateSentenceGroups
    sentence_groups = _load_root_entity(
        'Load StateSentenceGroups',
        entities.StateSentenceGroup,
        unifying_id_field_filter_set=unifying_id_field_filter_set)

    # Get StateIncarcerationSentences
    incarceration_sentences = _load_root_entity(
        'Load StateIncarcerationSentences',
        entities.StateIncarcerationSentence,
        unifying_id_field_filter_set=unifying_id_field_filter_set)

    # Get StateSupervisionSentences
    supervision_sentences = _load_root_entity(
        'Load StateSupervisionSentences',
        entities.StateSupervisionSentence,
        unifying_id_field_filter_set=unifying_id_field_filter_set)

    us_mo_sentence_status_rows: List[Dict[str, Any]] = [{
        'person_id': fake_person_id,
        'sentence_external_id': 'XXX',
        'sentence_status_external_id': 'YYY',
        'status_code': 'ZZZ',
        'status_date': 'not_a_date',
        'status_description': 'XYZ'
    }]

    us_mo_sentence_statuses = (
        test_pipeline
        | 'Create MO sentence statuses' >> beam.Create(
            us_mo_sentence_status_rows))

    us_mo_sentence_status_rankings_as_kv = (
        us_mo_sentence_statuses
        | 'Convert sentence status ranking table to KV tuples' >> beam.ParDo(
            ConvertDictToKVTuple(), 'person_id'))

    sentences_and_statuses = (
        {
            'incarceration_sentences': incarceration_sentences,
            'supervision_sentences': supervision_sentences,
            'sentence_statuses': us_mo_sentence_status_rankings_as_kv
        }
        | 'Group sentences to the sentence statuses for that person' >>
        beam.CoGroupByKey())

    sentences_converted = (
        sentences_and_statuses
        | 'Convert to state-specific sentences' >> beam.ParDo(
            ConvertSentencesToStateSpecificType()).with_outputs(
                'incarceration_sentences', 'supervision_sentences'))

    sentences_and_sentence_groups = (
        {
            'sentence_groups': sentence_groups,
            'incarceration_sentences':
                sentences_converted.incarceration_sentences,
            'supervision_sentences':
                sentences_converted.supervision_sentences
        }
        | 'Group sentences to sentence groups' >> beam.CoGroupByKey())

    sentence_groups_with_hydrated_sentences = (
        sentences_and_sentence_groups
        | 'Set hydrated sentences on sentence groups' >> beam.ParDo(
            SetSentencesOnSentenceGroup()))

    # In-memory stand-in for the person_id -> county of residence table
    fake_person_id_to_county_query_result = [{
        'person_id': fake_person_id,
        'county_of_residence': _COUNTY_OF_RESIDENCE
    }]

    person_id_to_county_kv = (
        test_pipeline
        | "Read person id to county associations from BigQuery" >>
        beam.Create(fake_person_id_to_county_query_result)
        | "Convert person_id to counties to KV" >> beam.ParDo(
            ConvertDictToKVTuple(), 'person_id'))

    incarceration_period_judicial_district_association_row = {
        'person_id': fake_person_id,
        'incarceration_period_id': 123,
        'judicial_district_code': 'NW'
    }

    ip_to_judicial_district_kv = (
        test_pipeline
        | "Read incarceration_period to judicial_district associations from BigQuery" >>
        beam.Create(
            [incarceration_period_judicial_district_association_row])
        | "Convert ips to judicial districts to KV" >> beam.ParDo(
            ConvertDictToKVTuple(), 'person_id'))

    state_race_ethnicity_population_count = {
        'state_code': state_code,
        'race_or_ethnicity': 'BLACK',
        'population_count': 1,
        'representation_priority': 1
    }

    state_race_ethnicity_population_counts = (
        test_pipeline
        | 'Create state_race_ethnicity_population_count table' >>
        beam.Create([state_race_ethnicity_population_count]))

    # Group each StatePerson with their related entities
    person_entities = (
        {
            'person': persons,
            'sentence_groups': sentence_groups_with_hydrated_sentences,
            'incarceration_period_judicial_district_association':
                ip_to_judicial_district_kv
        }
        | 'Group StatePerson to SentenceGroups' >> beam.CoGroupByKey())

    # Identify IncarcerationEvents events from the StatePerson's
    # StateIncarcerationPeriods; the county table is a dict side input.
    person_incarceration_events = (
        person_entities
        | 'Classify Incarceration Events' >> beam.ParDo(
            pipeline.ClassifyIncarcerationEvents(),
            AsDict(person_id_to_county_kv)))

    person_metadata = (
        persons
        | "Build the person_metadata dictionary" >> beam.ParDo(
            BuildPersonMetadata(),
            AsList(state_race_ethnicity_population_counts)))

    person_incarceration_events_with_metadata = (
        {
            'person_events': person_incarceration_events,
            'person_metadata': person_metadata
        }
        | 'Group IncarcerationEvents with person-level metadata' >>
        beam.CoGroupByKey()
        | 'Organize StatePerson, PersonMetadata and IncarcerationEvents for calculations' >>
        beam.ParDo(ExtractPersonEventsMetadata()))

    # Get pipeline job details for accessing job_id
    all_pipeline_options = PipelineOptions().get_all_options()

    # Add timestamp for local jobs
    all_pipeline_options['job_timestamp'] = datetime.datetime.now().strftime(
        '%Y-%m-%d_%H_%M_%S.%f')

    # Fall back to all metric types when no (or an empty) filter is given
    metric_types = metric_types_filter or {'ALL'}

    # Get IncarcerationMetrics
    incarceration_metrics = (
        person_incarceration_events_with_metadata
        | 'Get Incarceration Metrics' >>  # type: ignore
        pipeline.GetIncarcerationMetrics(
            pipeline_options=all_pipeline_options,
            metric_types=metric_types,
            calculation_end_month=None,
            calculation_month_count=-1))

    assert_that(
        incarceration_metrics,
        AssertMatchers.validate_metric_type(allow_empty=allow_empty),
        'Assert that all metrics are of the expected type.')

    assert_that(
        incarceration_metrics,
        AssertMatchers.validate_pipeline_test(expected_metric_types),
        'Assert the type of metrics produced are expected')

    test_pipeline.run()
def execute_pipeline(
    self,
    pipeline: beam.Pipeline,
    all_pipeline_options: Dict[str, Any],
    state_code: str,
    input_dataset: str,
    reference_dataset: str,
    static_reference_dataset: str,
    metric_types: List[str],
    person_id_filter_set: Optional[Set[int]],
    calculation_month_count: int = -1,
    calculation_end_month: Optional[str] = None,
) -> beam.Pipeline:
    """Attach the recidivism calculation steps to `pipeline`.

    Adds a ``job_timestamp`` entry to `all_pipeline_options` in place and
    returns the result of applying the final GetMetrics transform.
    """

    def _load_root_entity(label, entity_class, build_related_entities):
        """Load `entity_class` entities keyed on the StatePerson id field."""
        return pipeline | label >> BuildRootEntity(
            dataset=input_dataset,
            root_entity_class=entity_class,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=build_related_entities,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code,
        )

    # Get StatePersons
    persons = _load_root_entity("Load Persons", entities.StatePerson, True)

    # Get StateIncarcerationPeriods
    incarceration_periods = _load_root_entity(
        "Load IncarcerationPeriods", entities.StateIncarcerationPeriod, False)

    # Get StateSupervisionPeriods
    supervision_periods = _load_root_entity(
        "Load SupervisionPeriods", entities.StateSupervisionPeriod, False)

    # Bring in the table that associates people and their county of residence
    person_id_to_county_kv = (
        pipeline
        | "Load person_id_to_county_kv" >> ImportTableAsKVTuples(
            dataset_id=reference_dataset,
            table_id=PERSONS_TO_RECENT_COUNTY_OF_RESIDENCE_VIEW_NAME,
            table_key="person_id",
            state_code_filter=state_code,
            person_id_filter_set=person_id_filter_set,
        ))

    # Group each StatePerson with their StateIncarcerationPeriods
    entities_by_person = {
        "person": persons,
        "incarceration_periods": incarceration_periods,
        "supervision_periods": supervision_periods,
        "persons_to_recent_county_of_residence": person_id_to_county_kv,
    }
    person_entities = (
        entities_by_person
        | "Group StatePerson to StateIncarcerationPeriods" >>
        beam.CoGroupByKey())

    state_race_ethnicity_population_counts = (
        pipeline
        | "Load state_race_ethnicity_population_counts" >> ImportTable(
            dataset_id=static_reference_dataset,
            table_id="state_race_ethnicity_population_counts",
            state_code_filter=state_code,
            person_id_filter_set=None,
        ))

    # Identify ReleaseEvents events from the StatePerson's StateIncarcerationPeriods
    person_release_events = (
        person_entities
        | "ClassifyReleaseEvents" >> beam.ParDo(
            ClassifyEvents(), identifier=self.pipeline_config.identifier))

    # The population counts are handed over as a list side input.
    person_metadata = (
        persons
        | "Build the person_metadata dictionary" >> beam.ParDo(
            BuildPersonMetadata(),
            state_race_ethnicity_population_counts=AsList(
                state_race_ethnicity_population_counts),
        ))

    person_release_events_with_metadata = (
        {
            "person_events": person_release_events,
            "person_metadata": person_metadata
        }
        | "Group ReleaseEvents with person-level metadata" >>
        beam.CoGroupByKey()
        | "Organize StatePerson, PersonMetadata and ReleaseEvents for calculations" >>
        beam.ParDo(ExtractPersonReleaseEventsMetadata()))

    # Add timestamp for local jobs
    all_pipeline_options["job_timestamp"] = datetime.datetime.now().strftime(
        "%Y-%m-%d_%H_%M_%S.%f")

    # Get the type of metric to calculate
    metric_types_set = set(metric_types)

    # Get recidivism metrics
    recidivism_metrics = (
        person_release_events_with_metadata
        | "Get Recidivism Metrics" >> GetMetrics(
            pipeline_options=all_pipeline_options,
            pipeline_config=self.pipeline_config,
            metric_types_to_include=metric_types_set,
            calculation_end_month=calculation_end_month,
            calculation_month_count=calculation_month_count,
        ))

    return recidivism_metrics
def run(
    apache_beam_pipeline_options: PipelineOptions,
    data_input: str,
    reference_view_input: str,
    static_reference_input: str,
    output: str,
    metric_types: List[str],
    state_code: str,
    person_filter_ids: Optional[List[int]],
):
    """Runs the recidivism calculation pipeline.

    Raises:
        ValueError: if the pipeline options carry no project, or if
            `state_code` is None.
    """

    # Workaround to load SQLAlchemy objects at start of pipeline. This is
    # necessary because the BuildRootEntity function tries to access attributes
    # of relationship properties on the SQLAlchemy room_schema_class before they
    # have been loaded. However, if *any* SQLAlchemy objects have been
    # instantiated, then the relationship properties are loaded and their
    # attributes can be successfully accessed.
    _ = schema.StatePerson()

    apache_beam_pipeline_options.view_as(SetupOptions).save_main_session = True

    # Get pipeline job details
    all_pipeline_options = apache_beam_pipeline_options.get_all_options()
    project_id = all_pipeline_options["project"]

    if project_id is None:
        raise ValueError(f"No project set in pipeline options: {all_pipeline_options}")

    if state_code is None:
        raise ValueError("No state_code set for pipeline")

    # Dataset names are qualified with the project id.
    input_dataset = project_id + "." + data_input
    reference_dataset = project_id + "." + reference_view_input
    static_reference_dataset = project_id + "." + static_reference_input

    person_id_filter_set = set(person_filter_ids) if person_filter_ids else None

    with beam.Pipeline(options=apache_beam_pipeline_options) as p:

        def _load_root_entity(label, entity_class):
            """Load `entity_class` entities keyed on the StatePerson id field."""
            return p | label >> BuildRootEntity(
                dataset=input_dataset,
                root_entity_class=entity_class,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=True,
                unifying_id_field_filter_set=person_id_filter_set,
                state_code=state_code,
            )

        # Get StatePersons
        persons = _load_root_entity("Load Persons", entities.StatePerson)

        # Get StateIncarcerationPeriods
        incarceration_periods = _load_root_entity(
            "Load IncarcerationPeriods", entities.StateIncarcerationPeriod
        )

        # Get StateSupervisionViolations
        supervision_violations = _load_root_entity(
            "Load SupervisionViolations", entities.StateSupervisionViolation
        )

        # TODO(#2769): Don't bring this in as a root entity
        # Get StateSupervisionViolationResponses
        supervision_violation_responses = _load_root_entity(
            "Load SupervisionViolationResponses",
            entities.StateSupervisionViolationResponse,
        )

        # Group StateSupervisionViolationResponses and
        # StateSupervisionViolations by person_id
        supervision_violations_and_responses = (
            {
                "violations": supervision_violations,
                "violation_responses": supervision_violation_responses,
            }
            | "Group StateSupervisionViolationResponses to "
            "StateSupervisionViolations" >> beam.CoGroupByKey()
        )

        # Set the fully hydrated StateSupervisionViolation entities on
        # the corresponding StateSupervisionViolationResponses
        violation_responses_with_hydrated_violations = (
            supervision_violations_and_responses
            | "Set hydrated StateSupervisionViolations on "
            "the StateSupervisionViolationResponses"
            >> beam.ParDo(SetViolationOnViolationsResponse())
        )

        # Group StateIncarcerationPeriods and StateSupervisionViolationResponses
        # by person_id
        incarceration_periods_and_violation_responses = (
            {
                "incarceration_periods": incarceration_periods,
                "violation_responses": violation_responses_with_hydrated_violations,
            }
            | "Group StateIncarcerationPeriods to "
            "StateSupervisionViolationResponses" >> beam.CoGroupByKey()
        )

        # Set the fully hydrated StateSupervisionViolationResponse entities on
        # the corresponding StateIncarcerationPeriods
        incarceration_periods_with_source_violations = (
            incarceration_periods_and_violation_responses
            | "Set hydrated StateSupervisionViolationResponses on "
            "the StateIncarcerationPeriods"
            >> beam.ParDo(SetViolationResponseOnIncarcerationPeriod())
        )

        # Bring in the table that associates people and their county of residence
        person_id_to_county_kv = (
            p
            | "Load person_id_to_county_kv"
            >> ImportTableAsKVTuples(
                dataset_id=reference_dataset,
                table_id=PERSONS_TO_RECENT_COUNTY_OF_RESIDENCE_VIEW_NAME,
                table_key="person_id",
                state_code_filter=state_code,
                person_id_filter_set=person_id_filter_set,
            )
        )

        # Group each StatePerson with their StateIncarcerationPeriods
        entities_by_person = {
            "person": persons,
            "incarceration_periods": incarceration_periods_with_source_violations,
            "persons_to_recent_county_of_residence": person_id_to_county_kv,
        }
        person_entities = (
            entities_by_person
            | "Group StatePerson to StateIncarcerationPeriods" >> beam.CoGroupByKey()
        )

        state_race_ethnicity_population_counts = (
            p
            | "Load state_race_ethnicity_population_counts"
            >> ImportTable(
                dataset_id=static_reference_dataset,
                table_id="state_race_ethnicity_population_counts",
                state_code_filter=state_code,
                person_id_filter_set=None,
            )
        )

        # Identify ReleaseEvents events from the StatePerson's StateIncarcerationPeriods
        person_release_events = (
            person_entities
            | "ClassifyReleaseEvents" >> beam.ParDo(ClassifyReleaseEvents())
        )

        # The population counts are handed over as a list side input.
        person_metadata = (
            persons
            | "Build the person_metadata dictionary"
            >> beam.ParDo(
                BuildPersonMetadata(), AsList(state_race_ethnicity_population_counts)
            )
        )

        person_release_events_with_metadata = (
            {"person_events": person_release_events, "person_metadata": person_metadata}
            | "Group ReleaseEvents with person-level metadata" >> beam.CoGroupByKey()
            | "Organize StatePerson, PersonMetadata and ReleaseEvents for calculations"
            >> beam.ParDo(ExtractPersonReleaseEventsMetadata())
        )

        # Get pipeline job details for accessing job_id
        all_pipeline_options = apache_beam_pipeline_options.get_all_options()

        # Add timestamp for local jobs
        all_pipeline_options["job_timestamp"] = datetime.datetime.now().strftime(
            "%Y-%m-%d_%H_%M_%S.%f"
        )

        # Get the type of metric to calculate
        metric_types_set = set(metric_types)

        # Get recidivism metrics
        recidivism_metrics = (
            person_release_events_with_metadata
            | "Get Recidivism Metrics"
            >> GetRecidivismMetrics(
                pipeline_options=all_pipeline_options, metric_types=metric_types_set
            )
        )

        # With a person filter in place no BigQuery write steps are added.
        if person_id_filter_set:
            logging.warning(
                "Non-empty person filter set - returning before writing metrics."
            )
            return

        # Convert the metrics into a format that's writable to BQ
        writable_metrics = (
            recidivism_metrics
            | "Convert to dict to be written to BQ"
            >> beam.ParDo(RecidivizMetricWritableDict()).with_outputs(
                ReincarcerationRecidivismMetricType.REINCARCERATION_RATE.value,
                ReincarcerationRecidivismMetricType.REINCARCERATION_COUNT.value,
            )
        )

        # Write the recidivism metrics to the output tables in BigQuery
        rates_table_id = DATAFLOW_METRICS_TO_TABLES[ReincarcerationRecidivismRateMetric]
        counts_table_id = DATAFLOW_METRICS_TO_TABLES[
            ReincarcerationRecidivismCountMetric
        ]

        _ = (
            writable_metrics.REINCARCERATION_RATE
            | f"Write rate metrics to BQ table: {rates_table_id}"
            >> WriteAppendToBigQuery(
                output_table=rates_table_id,
                output_dataset=output,
            )
        )

        _ = (
            writable_metrics.REINCARCERATION_COUNT
            | f"Write count metrics to BQ table: {counts_table_id}"
            >> WriteAppendToBigQuery(
                output_table=counts_table_id,
                output_dataset=output,
            )
        )
def execute_pipeline(
    self,
    pipeline: beam.Pipeline,
    all_pipeline_options: Dict[str, Any],
    state_code: str,
    input_dataset: str,
    _reference_dataset: str,
    static_reference_dataset: str,
    metric_types: List[str],
    person_id_filter_set: Optional[Set[int]],
    calculation_month_count: int = -1,
    calculation_end_month: Optional[str] = None,
) -> beam.Pipeline:
    """Applies the violation metric steps to `pipeline` and returns their output.

    StatePersons, StateSupervisionViolations and
    StateSupervisionViolationResponses are loaded from `input_dataset`,
    co-grouped, classified into violation events, combined with person-level
    metadata and finally handed to GetMetrics.

    Side effect: `all_pipeline_options["job_timestamp"]` is set to the value
    of `datetime.now()` formatted as "%Y-%m-%d_%H_%M_%S.%f".

    Args:
        pipeline: The pipeline every load step is applied to.
        all_pipeline_options: Options dict forwarded to GetMetrics; mutated
            as described above.
        state_code: Forwarded to every load step as the state filter.
        input_dataset: Dataset the root entities are built from.
        _reference_dataset: Not used by this method.
        static_reference_dataset: Dataset holding the
            state_race_ethnicity_population_counts table.
        metric_types: Metric types to include; de-duplicated into a set.
        person_id_filter_set: Optional person id filter forwarded to the
            root entity loads.
        calculation_month_count: Forwarded to GetMetrics.
        calculation_end_month: Forwarded to GetMetrics.

    Returns:
        The result of applying the "Get Violation Metrics" step.
    """

    def load_root_entity(step_label, entity_class):
        # All root entity loads share the dataset, the person-id unifying
        # field and the same filters; only the label and class differ.
        build_step = BuildRootEntity(
            dataset=input_dataset,
            root_entity_class=entity_class,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=True,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code,
        )
        return pipeline | step_label >> build_step

    # Root entities: persons, violations and violation responses
    person_pcoll = load_root_entity("Load Persons", entities.StatePerson)
    violation_pcoll = load_root_entity(
        "Load SupervisionViolations", entities.StateSupervisionViolation
    )
    response_pcoll = load_root_entity(
        "Load SupervisionViolationResponses",
        entities.StateSupervisionViolationResponse,
    )

    # Static reference table used when building person metadata
    population_counts = (
        pipeline
        | "Load state_race_ethnicity_population_counts"
        >> ImportTable(
            dataset_id=static_reference_dataset,
            table_id="state_race_ethnicity_population_counts",
            state_code_filter=state_code,
            person_id_filter_set=None,
        )
    )

    # Co-group the violation responses with the violations
    violations_and_responses = {
        "violations": violation_pcoll,
        "violation_responses": response_pcoll,
    } | (
        "Group StateSupervisionViolationResponses to StateSupervisionViolations"
        >> beam.CoGroupByKey()
    )

    hydrated_violations = violations_and_responses | (
        "Set hydrated StateSupervisionViolationResponses on the StateSupervisionViolations"
        >> beam.ParDo(SetViolationResponsesOntoViolations())
    )

    # Co-group each person with their hydrated violations
    entities_by_person = {
        "person": person_pcoll,
        "violations": hydrated_violations,
    } | ("Group StatePerson to violation entities" >> beam.CoGroupByKey())

    violation_events = entities_by_person | (
        "Get ViolationEvents"
        >> beam.ParDo(ClassifyEvents(), identifier=self.pipeline_config.identifier)
    )

    # The population counts are handed to BuildPersonMetadata as a list side input
    metadata_by_person = person_pcoll | (
        "Build the person_metadata dictionary"
        >> beam.ParDo(
            BuildPersonMetadata(),
            state_race_ethnicity_population_counts=AsList(population_counts),
        )
    )

    events_and_metadata = {
        "person_events": violation_events,
        "person_metadata": metadata_by_person,
    } | ("Group ViolationEvents with person-level metadata" >> beam.CoGroupByKey())

    events_with_metadata = events_and_metadata | (
        "Organize StatePerson, PersonMetadata and ViolationEvents for calculations"
        >> beam.ParDo(ExtractPersonEventsMetadata())
    )

    requested_metric_types = set(metric_types)

    # Record when this job was built on the shared options dict
    all_pipeline_options["job_timestamp"] = datetime.now().strftime(
        "%Y-%m-%d_%H_%M_%S.%f"
    )

    # Get violation metrics
    return events_with_metadata | (
        "Get Violation Metrics"
        >> GetMetrics(
            pipeline_options=all_pipeline_options,
            pipeline_config=self.pipeline_config,
            metric_types_to_include=requested_metric_types,
            calculation_end_month=calculation_end_month,
            calculation_month_count=calculation_month_count,
        )
    )
def run(apache_beam_pipeline_options: PipelineOptions,
        data_input: str,
        reference_view_input: str,
        static_reference_input: str,
        output: str,
        calculation_month_count: int,
        metric_types: List[str],
        state_code: Optional[str],
        calculation_end_month: Optional[str],
        person_filter_ids: Optional[List[int]]):
    """Runs the incarceration calculation pipeline.

    Args:
        apache_beam_pipeline_options: Options the pipeline is created with.
            Its 'project' option is used as the prefix of every dataset name.
        data_input: Name of the dataset the root entities are loaded from.
        reference_view_input: Name of the dataset holding the reference views.
        static_reference_input: Name of the dataset holding the
            state_race_ethnicity_population_counts table.
        output: Dataset the metrics are written to in BigQuery.
        calculation_month_count: Forwarded to GetIncarcerationMetrics.
        metric_types: Metric types to calculate; de-duplicated into a set.
        state_code: State filter forwarded to the load steps. When it is None
            or 'US_MO' the MO sentence status table is read from BigQuery;
            otherwise an empty collection is used in its place.
        calculation_end_month: Forwarded to GetIncarcerationMetrics.
        person_filter_ids: Optional person ids to restrict the run to. When
            non-empty, the function returns before any metrics are written.

    Raises:
        ValueError: If no project is set in the pipeline options.
    """

    # Workaround to load SQLAlchemy objects at start of pipeline. This is necessary because the BuildRootEntity
    # function tries to access attributes of relationship properties on the SQLAlchemy room_schema_class before they
    # have been loaded. However, if *any* SQLAlchemy objects have been instantiated, then the relationship properties
    # are loaded and their attributes can be successfully accessed.
    _ = schema.StatePerson()

    apache_beam_pipeline_options.view_as(SetupOptions).save_main_session = True

    # Get pipeline job details
    all_pipeline_options = apache_beam_pipeline_options.get_all_options()

    project_id = all_pipeline_options['project']

    # Fail with a clear message instead of an opaque TypeError from the
    # string concatenations below when the project option is unset.
    if project_id is None:
        raise ValueError(f"No project set in pipeline options: {all_pipeline_options}")

    input_dataset = project_id + '.' + data_input
    reference_dataset = project_id + '.' + reference_view_input
    static_reference_dataset = project_id + '.' + static_reference_input

    person_id_filter_set = set(person_filter_ids) if person_filter_ids else None

    with beam.Pipeline(options=apache_beam_pipeline_options) as p:
        # Get StatePersons
        persons = (p | 'Load StatePersons' >> BuildRootEntity(
            dataset=input_dataset,
            root_entity_class=entities.StatePerson,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=True,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code))

        # Get StateSentenceGroups
        sentence_groups = (p | 'Load StateSentenceGroups' >> BuildRootEntity(
            dataset=input_dataset,
            root_entity_class=entities.StateSentenceGroup,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=True,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code))

        # Get StateIncarcerationSentences
        incarceration_sentences = (
            p | 'Load StateIncarcerationSentences' >> BuildRootEntity(
                dataset=input_dataset,
                root_entity_class=entities.StateIncarcerationSentence,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=True,
                unifying_id_field_filter_set=person_id_filter_set,
                state_code=state_code))

        # Get StateSupervisionSentences
        supervision_sentences = (
            p | 'Load StateSupervisionSentences' >> BuildRootEntity(
                dataset=input_dataset,
                root_entity_class=entities.StateSupervisionSentence,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=True,
                unifying_id_field_filter_set=person_id_filter_set,
                state_code=state_code))

        if state_code is None or state_code == 'US_MO':
            # Bring in the reference table that includes sentence status ranking information
            us_mo_sentence_status_query = select_all_by_person_query(
                reference_dataset,
                US_MO_SENTENCE_STATUSES_VIEW_NAME,
                state_code,
                person_id_filter_set)

            us_mo_sentence_statuses = (
                p | "Read MO sentence status table from BigQuery" >>
                beam.io.Read(
                    beam.io.BigQuerySource(query=us_mo_sentence_status_query,
                                           use_standard_sql=True)))
        else:
            # Non-MO runs still need a (empty) collection for the co-group below
            us_mo_sentence_statuses = (
                p | f"Generate empty MO statuses list for non-MO state run: {state_code} " >>
                beam.Create([]))

        us_mo_sentence_status_rankings_as_kv = (
            us_mo_sentence_statuses |
            'Convert MO sentence status ranking table to KV tuples' >>
            beam.ParDo(ConvertDictToKVTuple(), 'person_id'))

        supervision_sentences_and_statuses = (
            {
                'incarceration_sentences': incarceration_sentences,
                'supervision_sentences': supervision_sentences,
                'sentence_statuses': us_mo_sentence_status_rankings_as_kv
            }
            | 'Group sentences to the sentence statuses for that person' >>
            beam.CoGroupByKey())

        sentences_converted = (
            supervision_sentences_and_statuses
            | 'Convert to state-specific sentences' >>
            beam.ParDo(ConvertSentencesToStateSpecificType()).with_outputs(
                'incarceration_sentences', 'supervision_sentences'))

        sentences_and_sentence_groups = (
            {
                'sentence_groups': sentence_groups,
                'incarceration_sentences': sentences_converted.incarceration_sentences,
                'supervision_sentences': sentences_converted.supervision_sentences
            }
            | 'Group sentences to sentence groups' >>
            beam.CoGroupByKey())

        # Set hydrated sentences on the corresponding sentence groups
        sentence_groups_with_hydrated_sentences = (
            sentences_and_sentence_groups
            | 'Set hydrated sentences on sentence groups' >>
            beam.ParDo(SetSentencesOnSentenceGroup()))

        # Bring in the table that associates people and their county of residence
        person_id_to_county_kv = (
            p | 'Load person_id_to_county_kv' >> ImportTableAsKVTuples(
                dataset_id=reference_dataset,
                table_id=PERSONS_TO_RECENT_COUNTY_OF_RESIDENCE_VIEW_NAME,
                table_key='person_id',
                state_code_filter=state_code,
                person_id_filter_set=person_id_filter_set))

        ip_to_judicial_district_kv = (
            p | 'Load ip_to_judicial_district_kv' >> ImportTableAsKVTuples(
                dataset_id=reference_dataset,
                table_id=INCARCERATION_PERIOD_JUDICIAL_DISTRICT_ASSOCIATION_VIEW_NAME,
                table_key='person_id',
                state_code_filter=state_code,
                person_id_filter_set=person_id_filter_set))

        state_race_ethnicity_population_counts = (
            p | 'Load state_race_ethnicity_population_counts' >> ImportTable(
                dataset_id=static_reference_dataset,
                table_id='state_race_ethnicity_population_counts',
                state_code_filter=state_code,
                person_id_filter_set=None))

        # Group each StatePerson with their related entities
        person_entities = (
            {
                'person': persons,
                'sentence_groups': sentence_groups_with_hydrated_sentences,
                'incarceration_period_judicial_district_association': ip_to_judicial_district_kv
            }
            | 'Group StatePerson to SentenceGroups' >>
            beam.CoGroupByKey())

        # Identify IncarcerationEvents events from the StatePerson's StateIncarcerationPeriods
        # (the person -> county table is supplied as a dict side input)
        person_incarceration_events = (
            person_entities
            | 'Classify Incarceration Events' >> beam.ParDo(
                ClassifyIncarcerationEvents(),
                AsDict(person_id_to_county_kv)))

        person_metadata = (
            persons
            | "Build the person_metadata dictionary" >> beam.ParDo(
                BuildPersonMetadata(),
                AsList(state_race_ethnicity_population_counts)))

        person_incarceration_events_with_metadata = (
            {
                'person_events': person_incarceration_events,
                'person_metadata': person_metadata
            }
            | 'Group IncarcerationEvents with person-level metadata' >>
            beam.CoGroupByKey()
            | 'Organize StatePerson, PersonMetadata and IncarcerationEvents for calculations' >>
            beam.ParDo(ExtractPersonEventsMetadata()))

        # Get pipeline job details for accessing job_id
        all_pipeline_options = apache_beam_pipeline_options.get_all_options()

        # Add timestamp for local jobs
        job_timestamp = datetime.datetime.now().strftime(
            '%Y-%m-%d_%H_%M_%S.%f')
        all_pipeline_options['job_timestamp'] = job_timestamp

        # Get the type of metric to calculate
        metric_types_set = set(metric_types)

        # Get IncarcerationMetrics
        incarceration_metrics = (
            person_incarceration_events_with_metadata
            | 'Get Incarceration Metrics' >> GetIncarcerationMetrics(
                pipeline_options=all_pipeline_options,
                metric_types=metric_types_set,
                calculation_end_month=calculation_end_month,
                calculation_month_count=calculation_month_count))

        if person_id_filter_set:
            logging.warning(
                "Non-empty person filter set - returning before writing metrics."
            )
            return

        # Convert the metrics into a format that's writable to BQ
        writable_metrics = (
            incarceration_metrics
            | 'Convert to dict to be written to BQ' >>
            beam.ParDo(RecidivizMetricWritableDict()).with_outputs(
                IncarcerationMetricType.INCARCERATION_ADMISSION.value,
                IncarcerationMetricType.INCARCERATION_POPULATION.value,
                IncarcerationMetricType.INCARCERATION_RELEASE.value))

        # Write the metrics to the output tables in BigQuery
        admissions_table_id = DATAFLOW_METRICS_TO_TABLES.get(
            IncarcerationAdmissionMetric)
        population_table_id = DATAFLOW_METRICS_TO_TABLES.get(
            IncarcerationPopulationMetric)
        releases_table_id = DATAFLOW_METRICS_TO_TABLES.get(
            IncarcerationReleaseMetric)

        _ = (writable_metrics.INCARCERATION_ADMISSION
             | f"Write admission metrics to BQ table: {admissions_table_id}" >>
             beam.io.WriteToBigQuery(
                 table=admissions_table_id,
                 dataset=output,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
                 method=beam.io.WriteToBigQuery.Method.FILE_LOADS))

        _ = (writable_metrics.INCARCERATION_POPULATION
             | f"Write population metrics to BQ table: {population_table_id}" >>
             beam.io.WriteToBigQuery(
                 table=population_table_id,
                 dataset=output,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
                 method=beam.io.WriteToBigQuery.Method.FILE_LOADS))

        _ = (writable_metrics.INCARCERATION_RELEASE
             | f"Write release metrics to BQ table: {releases_table_id}" >>
             beam.io.WriteToBigQuery(
                 table=releases_table_id,
                 dataset=output,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
                 method=beam.io.WriteToBigQuery.Method.FILE_LOADS))
def run(
    apache_beam_pipeline_options: PipelineOptions,
    data_input: str,
    reference_view_input: str,
    static_reference_input: str,
    output: str,
    calculation_month_count: int,
    metric_types: List[str],
    state_code: str,
    calculation_end_month: Optional[str],
    person_filter_ids: Optional[List[int]],
):
    """Runs the incarceration calculation pipeline.

    Args:
        apache_beam_pipeline_options: Options the pipeline is created with.
            Its "project" option prefixes every dataset name.
        data_input: Name of the dataset the root entities are loaded from.
        reference_view_input: Name of the dataset holding the reference views.
        static_reference_input: Name of the dataset holding the
            state_race_ethnicity_population_counts table.
        output: Dataset the metric tables are written to.
        calculation_month_count: Forwarded to GetIncarcerationMetrics.
        metric_types: Metric types to calculate; de-duplicated into a set.
        state_code: State filter forwarded to the load steps. Only "US_MO"
            runs read the MO sentence status table.
        calculation_end_month: Forwarded to GetIncarcerationMetrics.
        person_filter_ids: Optional person ids to restrict the run to. When
            non-empty, the function returns before any metrics are written.

    Raises:
        ValueError: If the pipeline options have no project, or if
            `state_code` is None.
    """

    # Instantiating any SQLAlchemy object up front forces the relationship
    # properties to load; BuildRootEntity reads attributes of those
    # properties and would otherwise touch them before they are available.
    _ = schema.StatePerson()

    apache_beam_pipeline_options.view_as(SetupOptions).save_main_session = True

    # Get pipeline job details
    all_pipeline_options = apache_beam_pipeline_options.get_all_options()
    project_id = all_pipeline_options["project"]

    if project_id is None:
        raise ValueError(f"No project set in pipeline options: {all_pipeline_options}")

    if state_code is None:
        raise ValueError("No state_code set for pipeline")

    def qualify(dataset_name):
        # Dataset names are addressed as "<project>.<dataset>"
        return project_id + "." + dataset_name

    input_dataset = qualify(data_input)
    reference_dataset = qualify(reference_view_input)
    static_reference_dataset = qualify(static_reference_input)

    person_id_filter_set = None
    if person_filter_ids:
        person_id_filter_set = set(person_filter_ids)

    with beam.Pipeline(options=apache_beam_pipeline_options) as p:

        def load_root_entity(step_label, entity_class):
            # Every root entity is unified on the person id and shares the
            # same dataset and filters.
            return p | step_label >> BuildRootEntity(
                dataset=input_dataset,
                root_entity_class=entity_class,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=True,
                unifying_id_field_filter_set=person_id_filter_set,
                state_code=state_code,
            )

        def load_reference_kv(step_label, view_name):
            # Reference views are imported as KV tuples keyed on person_id
            return p | step_label >> ImportTableAsKVTuples(
                dataset_id=reference_dataset,
                table_id=view_name,
                table_key="person_id",
                state_code_filter=state_code,
                person_id_filter_set=person_id_filter_set,
            )

        def write_metrics(tagged_output, step_label, table_id):
            # Send one tagged metric output to its table in the output dataset
            return tagged_output | step_label >> WriteAppendToBigQuery(
                output_table=table_id,
                output_dataset=output,
            )

        # Root entities
        persons = load_root_entity("Load StatePersons", entities.StatePerson)
        sentence_groups = load_root_entity(
            "Load StateSentenceGroups", entities.StateSentenceGroup
        )
        incarceration_sentences = load_root_entity(
            "Load StateIncarcerationSentences", entities.StateIncarcerationSentence
        )
        supervision_sentences = load_root_entity(
            "Load StateSupervisionSentences", entities.StateSupervisionSentence
        )

        if state_code != "US_MO":
            # Non-MO runs get an empty stand-in for the MO sentence statuses
            us_mo_sentence_statuses = (
                p
                | f"Generate empty MO statuses list for non-MO state run: {state_code} "
                >> beam.Create([])
            )
        else:
            # Bring in the reference table that includes sentence status ranking information
            us_mo_sentence_status_query = select_all_by_person_query(
                reference_dataset,
                US_MO_SENTENCE_STATUSES_VIEW_NAME,
                state_code,
                person_id_filter_set,
            )
            us_mo_sentence_statuses = (
                p
                | "Read MO sentence status table from BigQuery"
                >> ReadFromBigQuery(query=us_mo_sentence_status_query)
            )

        sentence_statuses_kv = us_mo_sentence_statuses | (
            "Convert MO sentence status ranking table to KV tuples"
            >> beam.ParDo(ConvertDictToKVTuple(), "person_id")
        )

        sentences_and_statuses = {
            "incarceration_sentences": incarceration_sentences,
            "supervision_sentences": supervision_sentences,
            "sentence_statuses": sentence_statuses_kv,
        } | (
            "Group sentences to the sentence statuses for that person"
            >> beam.CoGroupByKey()
        )

        sentences_converted = sentences_and_statuses | (
            "Convert to state-specific sentences"
            >> beam.ParDo(ConvertSentencesToStateSpecificType()).with_outputs(
                "incarceration_sentences", "supervision_sentences"
            )
        )

        sentences_and_sentence_groups = {
            "sentence_groups": sentence_groups,
            "incarceration_sentences": sentences_converted.incarceration_sentences,
            "supervision_sentences": sentences_converted.supervision_sentences,
        } | ("Group sentences to sentence groups" >> beam.CoGroupByKey())

        # Set hydrated sentences on the corresponding sentence groups
        hydrated_sentence_groups = sentences_and_sentence_groups | (
            "Set hydrated sentences on sentence groups"
            >> beam.ParDo(SetSentencesOnSentenceGroup())
        )

        # Bring in the table that associates people and their county of residence
        person_id_to_county_kv = load_reference_kv(
            "Load person_id_to_county_kv",
            PERSONS_TO_RECENT_COUNTY_OF_RESIDENCE_VIEW_NAME,
        )

        ip_to_judicial_district_kv = load_reference_kv(
            "Load ip_to_judicial_district_kv",
            INCARCERATION_PERIOD_JUDICIAL_DISTRICT_ASSOCIATION_VIEW_NAME,
        )

        population_counts = (
            p
            | "Load state_race_ethnicity_population_counts"
            >> ImportTable(
                dataset_id=static_reference_dataset,
                table_id="state_race_ethnicity_population_counts",
                state_code_filter=state_code,
                person_id_filter_set=None,
            )
        )

        # Group each StatePerson with their related entities
        person_entities = {
            "person": persons,
            "sentence_groups": hydrated_sentence_groups,
            "incarceration_period_judicial_district_association": ip_to_judicial_district_kv,
            "persons_to_recent_county_of_residence": person_id_to_county_kv,
        } | ("Group StatePerson to SentenceGroups" >> beam.CoGroupByKey())

        # Identify IncarcerationEvents events from the StatePerson's StateIncarcerationPeriods
        person_incarceration_events = person_entities | (
            "Classify Incarceration Events"
            >> beam.ParDo(ClassifyIncarcerationEvents())
        )

        # The population counts are handed to BuildPersonMetadata as a list side input
        person_metadata = persons | (
            "Build the person_metadata dictionary"
            >> beam.ParDo(BuildPersonMetadata(), AsList(population_counts))
        )

        events_and_metadata = {
            "person_events": person_incarceration_events,
            "person_metadata": person_metadata,
        } | (
            "Group IncarcerationEvents with person-level metadata"
            >> beam.CoGroupByKey()
        )

        events_with_metadata = events_and_metadata | (
            "Organize StatePerson, PersonMetadata and IncarcerationEvents for calculations"
            >> beam.ParDo(ExtractPersonEventsMetadata())
        )

        # Get pipeline job details for accessing job_id
        all_pipeline_options = apache_beam_pipeline_options.get_all_options()

        # Add timestamp for local jobs
        all_pipeline_options["job_timestamp"] = datetime.datetime.now().strftime(
            "%Y-%m-%d_%H_%M_%S.%f"
        )

        # Get the type of metric to calculate
        requested_metric_types = set(metric_types)

        # Get IncarcerationMetrics
        incarceration_metrics = events_with_metadata | (
            "Get Incarceration Metrics"
            >> GetIncarcerationMetrics(
                pipeline_options=all_pipeline_options,
                metric_types=requested_metric_types,
                calculation_end_month=calculation_end_month,
                calculation_month_count=calculation_month_count,
            )
        )

        if person_id_filter_set:
            logging.warning(
                "Non-empty person filter set - returning before writing metrics."
            )
            return

        # Convert the metrics into a format that's writable to BQ
        writable_metrics = incarceration_metrics | (
            "Convert to dict to be written to BQ"
            >> beam.ParDo(RecidivizMetricWritableDict()).with_outputs(
                IncarcerationMetricType.INCARCERATION_ADMISSION.value,
                IncarcerationMetricType.INCARCERATION_POPULATION.value,
                IncarcerationMetricType.INCARCERATION_RELEASE.value,
            )
        )

        # Resolve every destination table before wiring up any write step
        admissions_table_id = DATAFLOW_METRICS_TO_TABLES[IncarcerationAdmissionMetric]
        population_table_id = DATAFLOW_METRICS_TO_TABLES[IncarcerationPopulationMetric]
        releases_table_id = DATAFLOW_METRICS_TO_TABLES[IncarcerationReleaseMetric]

        # Write the metrics to the output tables in BigQuery
        _ = write_metrics(
            writable_metrics.INCARCERATION_ADMISSION,
            f"Write admission metrics to BQ table: {admissions_table_id}",
            admissions_table_id,
        )

        _ = write_metrics(
            writable_metrics.INCARCERATION_POPULATION,
            f"Write population metrics to BQ table: {population_table_id}",
            population_table_id,
        )

        _ = write_metrics(
            writable_metrics.INCARCERATION_RELEASE,
            f"Write release metrics to BQ table: {releases_table_id}",
            releases_table_id,
        )
def run(argv=None):
    """
    Parses the command line arguments and runs the Beam pipeline.

    A temporary file with `num_records` lines is written to GCS and read back
    as the pipeline's input; one fake row is generated per line. Depending on
    the parsed arguments the rows are written as CSV, as Avro and/or to a
    BigQuery table. The temporary file is deleted when the function exits,
    whether or not the pipeline succeeded.

    Args:
        argv: list containing the command line arguments for this call of the
            script.
    """
    # Keeps track of whether the schema was inferred from the input or output
    # table.
    schema_inferred = False

    data_args, pipeline_args = parse_data_generator_args(argv)
    data_args, schema_inferred = fetch_schema(data_args, schema_inferred)
    pipeline_options = PipelineOptions(pipeline_args)

    temp_location = pipeline_options.display_data()['temp_location']
    temp_blob = write_n_line_file_to_gcs(
        pipeline_options.display_data()['project'], temp_location,
        data_args.num_records)

    # Everything after the temp file is created runs under try/finally so the
    # file is removed even if building or running the pipeline raises.
    try:
        data_gen = DataGenerator(
            bq_schema_filename=data_args.schema_file,
            input_bq_table=data_args.input_bq_table,
            p_null=data_args.p_null,
            min_date=data_args.min_date,
            max_date=data_args.max_date,
            only_pos=data_args.only_pos,
            max_int=data_args.max_int,
            max_float=data_args.max_float,
            float_precision=data_args.float_precision,
            write_disp=data_args.write_disp,
            key_skew=data_args.key_skew,
            primary_key_cols=data_args.primary_key_cols,
            dest_joining_key_col=data_args.dest_joining_key_col)

        # Initiate the pipeline using the pipeline arguments passed in from the
        # command line. This includes information including where Dataflow
        # should store temp files, and what the project id is and what runner
        # to use.
        p = beam.Pipeline(options=pipeline_options)

        # When generating a dimension table we get the distinct keys as a side
        # input from the main table so we generate dimension records that join
        # to the main data table.
        key_set = (
            p
            | 'Query Keys from main table' >> beam.io.Read(
                beam.io.BigQuerySource(
                    query="SELECT DISTINCT({}) FROM `{}`".format(
                        data_args.source_joining_key_col,
                        data_args.fact_table),
                    use_standard_sql=True))
            | 'Extract key values' >> beam.Map(
                lambda x: (x[data_args.source_joining_key_col]))
        )

        rows = (
            p
            # Read the file we created with num_records newlines.
            | 'Read file with num_records lines' >> beam.io.ReadFromText(
                os.path.join('gs://', temp_blob.bucket.name, temp_blob.name))
            # Use our instance of our custom DataGenerator class to generate 1
            # fake datum with the appropriate schema for each element in the
            # PCollection created above.
            | 'Generate Data' >> beam.ParDo(FakeRowGen(data_gen))
            | 'Parse Json Strings' >> beam.FlatMap(
                lambda row: [json.loads(row)])
            | 'Enforce joining keys' >> beam.FlatMap(
                data_gen.enforce_joinable_keys, key_set=AsList(key_set))
        )

        if data_args.primary_key_cols:
            for key in data_args.primary_key_cols.split(','):
                rows |= 'Enforcing primary key: {}'.format(
                    key) >> EnforcePrimaryKeys(key)

        if data_args.csv_schema_order:
            (rows
             | 'Order fields for CSV writing.' >> beam.FlatMap(
                 lambda d: [dict_to_csv(
                     d, data_args.csv_schema_order.split(','))])
             | 'Write to GCS' >> beam.io.textio.WriteToText(
                 file_path_prefix=data_args.output_prefix,
                 file_name_suffix='.csv'))

        if data_args.avro_schema_file:
            # Close the schema file as soon as it has been read instead of
            # leaving the handle open for the lifetime of the process.
            with open(data_args.avro_schema_file, 'rb') as avro_schema_fh:
                avsc = avro.schema.parse(avro_schema_fh.read())

            (rows
             # Need to convert time stamps from strings to timestamp-micros
             | 'Fix date and time Types for Avro.' >> beam.FlatMap(
                 lambda row: fix_record_for_avro(row, avsc))
             | 'Write to Avro.' >> beam.io.avroio.WriteToAvro(
                 file_path_prefix=data_args.output_prefix,
                 codec='null',
                 file_name_suffix='.avro',
                 use_fastavro=True,
                 schema=avsc))

        if data_args.output_bq_table:
            (rows
             | 'Write to BigQuery.' >> beam.io.gcp.bigquery.WriteToBigQuery(
                 # The table name is a required argument for the BigQuery
                 # sink. In this case we use the value passed in from the
                 # command line.
                 data_args.output_bq_table,
                 schema=None if schema_inferred
                 else data_gen.get_bq_schema_string(),
                 # Creates the table in BigQuery if it does not yet exist.
                 create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
                 write_disposition=data_gen.write_disp,
                 # Use the max recommended batch size.
                 batch_size=500))

        p.run().wait_until_finish()
    finally:
        # Manually clean up the temp file because it will be outside this
        # job's directory and Dataflow will not remove it for us.
        temp_blob.delete()
def run(apache_beam_pipeline_options: PipelineOptions,
        data_input: str,
        reference_view_input: str,
        static_reference_input: str,
        output: str,
        calculation_month_count: int,
        metric_types: List[str],
        state_code: Optional[str],
        calculation_end_month: Optional[str],
        person_filter_ids: Optional[List[int]]):
    """Runs the supervision calculation pipeline.

    Loads the person-level entities and reference tables, groups them by
    person, classifies them into supervision time buckets, calculates the
    supervision metrics and, unless a person filter is in effect, appends each
    metric type to its own BigQuery table.

    Args:
        apache_beam_pipeline_options: Options the pipeline is built from.
            `save_main_session` is forced to True on them.
        data_input: Dataset name (without project) holding the entity tables.
        reference_view_input: Dataset name (without project) holding the
            reference views.
        static_reference_input: Dataset name (without project) holding the
            static reference tables.
        output: Dataset the metric tables are written to.
        calculation_month_count: Forwarded to GetSupervisionMetrics.
        metric_types: Metric types to calculate; de-duplicated into a set.
        state_code: State filter applied to every load. When it is None or
            'US_MO' the MO sentence status view is read as well; otherwise an
            empty collection stands in for it.
        calculation_end_month: Forwarded to GetSupervisionMetrics.
        person_filter_ids: Optional person ids to restrict the run to. When
            non-empty the metrics are calculated but nothing is written.
    """
    # Instantiating any SQLAlchemy object up front forces the relationship
    # properties to load; BuildRootEntity reads attributes of those
    # relationship properties and fails if nothing has been instantiated yet.
    _ = schema.StatePerson()

    setup_options = apache_beam_pipeline_options.view_as(SetupOptions)
    setup_options.save_main_session = True

    # Pipeline job details
    all_pipeline_options = apache_beam_pipeline_options.get_all_options()
    project_id = all_pipeline_options['project']

    input_dataset = project_id + '.' + data_input
    reference_dataset = project_id + '.' + reference_view_input
    static_reference_dataset = project_id + '.' + static_reference_input

    if person_filter_ids:
        person_id_filter_set = set(person_filter_ids)
    else:
        person_id_filter_set = None

    with beam.Pipeline(options=apache_beam_pipeline_options) as pipeline:

        def load_root_entity(step_label, entity_class, build_related_entities):
            """Applies one BuildRootEntity load, unified on StatePerson's id field."""
            return pipeline | step_label >> BuildRootEntity(
                dataset=input_dataset,
                root_entity_class=entity_class,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=build_related_entities,
                unifying_id_field_filter_set=person_id_filter_set,
                state_code=state_code)

        def import_reference_kv(step_label, view_name, key_column, id_filter):
            """Applies one ImportTableAsKVTuples load from the reference dataset."""
            return pipeline | step_label >> ImportTableAsKVTuples(
                dataset_id=reference_dataset,
                table_id=view_name,
                table_key=key_column,
                state_code_filter=state_code,
                person_id_filter_set=id_filter)

        # Root entities
        persons = load_root_entity(
            'Load Persons', entities.StatePerson, True)
        incarceration_periods = load_root_entity(
            'Load IncarcerationPeriods',
            entities.StateIncarcerationPeriod, True)
        supervision_violations = load_root_entity(
            'Load SupervisionViolations',
            entities.StateSupervisionViolation, True)
        # TODO(#2769): Don't bring this in as a root entity
        supervision_violation_responses = load_root_entity(
            'Load SupervisionViolationResponses',
            entities.StateSupervisionViolationResponse, True)
        supervision_sentences = load_root_entity(
            'Load SupervisionSentences',
            entities.StateSupervisionSentence, True)
        incarceration_sentences = load_root_entity(
            'Load IncarcerationSentences',
            entities.StateIncarcerationSentence, True)
        supervision_periods = load_root_entity(
            'Load SupervisionPeriods',
            entities.StateSupervisionPeriod, True)
        assessments = load_root_entity(
            'Load Assessments', entities.StateAssessment, False)
        supervision_contacts = load_root_entity(
            'Load StateSupervisionContacts',
            entities.StateSupervisionContact, False)

        # Reference views, keyed for use as side inputs / join inputs
        ssvr_agent_associations_as_kv = import_reference_kv(
            'Load ssvr_agent_associations_as_kv',
            SSVR_TO_AGENT_ASSOCIATION_VIEW_NAME,
            'supervision_violation_response_id',
            None)
        supervision_period_to_agent_associations_as_kv = import_reference_kv(
            'Load supervision_period_to_agent_associations_as_kv',
            SUPERVISION_PERIOD_TO_AGENT_ASSOCIATION_VIEW_NAME,
            'supervision_period_id',
            None)
        # Judicial districts associated with supervision_periods
        sp_to_judicial_district_kv = import_reference_kv(
            'Load sp_to_judicial_district_kv',
            SUPERVISION_PERIOD_JUDICIAL_DISTRICT_ASSOCIATION_VIEW_NAME,
            'person_id',
            person_id_filter_set)

        state_race_ethnicity_population_counts = (
            pipeline
            | 'Load state_race_ethnicity_population_counts' >> ImportTable(
                dataset_id=static_reference_dataset,
                table_id='state_race_ethnicity_population_counts',
                state_code_filter=state_code,
                person_id_filter_set=None))

        if state_code is not None and state_code != 'US_MO':
            # Non-MO run: an empty collection keeps the downstream join shape.
            us_mo_sentence_statuses = (
                pipeline
                | f"Generate empty MO statuses list for non-MO state run: {state_code} "
                >> beam.Create([]))
        else:
            # Reference table that includes sentence status ranking information
            us_mo_sentence_status_query = select_all_by_person_query(
                reference_dataset,
                US_MO_SENTENCE_STATUSES_VIEW_NAME,
                state_code,
                person_id_filter_set)
            us_mo_sentence_statuses = (
                pipeline
                | "Read MO sentence status table from BigQuery" >> beam.io.Read(
                    beam.io.BigQuerySource(
                        query=us_mo_sentence_status_query,
                        use_standard_sql=True)))

        us_mo_sentence_status_rankings_as_kv = (
            us_mo_sentence_statuses
            | 'Convert MO sentence status ranking table to KV tuples'
            >> beam.ParDo(ConvertDictToKVTuple(), 'person_id'))

        sentence_inputs = {
            'incarceration_sentences': incarceration_sentences,
            'supervision_sentences': supervision_sentences,
            'sentence_statuses': us_mo_sentence_status_rankings_as_kv,
        }
        sentences_and_statuses = (
            sentence_inputs
            | 'Group sentences to the sentence statuses for that person'
            >> beam.CoGroupByKey())

        sentences_converted = (
            sentences_and_statuses
            | 'Convert to state-specific sentences'
            >> beam.ParDo(ConvertSentencesToStateSpecificType()).with_outputs(
                'incarceration_sentences', 'supervision_sentences'))

        # Join violations and violation responses per person ...
        violation_inputs = {
            'violations': supervision_violations,
            'violation_responses': supervision_violation_responses,
        }
        supervision_violations_and_responses = (
            violation_inputs
            | 'Group StateSupervisionViolationResponses to StateSupervisionViolations'
            >> beam.CoGroupByKey())

        # ... and set the hydrated violations on their responses.
        violation_responses_with_hydrated_violations = (
            supervision_violations_and_responses
            | 'Set hydrated StateSupervisionViolations on the StateSupervisionViolationResponses'
            >> beam.ParDo(SetViolationOnViolationsResponse()))

        # Join incarceration periods and hydrated violation responses per person ...
        period_inputs = {
            'incarceration_periods': incarceration_periods,
            'violation_responses': violation_responses_with_hydrated_violations,
        }
        incarceration_periods_and_violation_responses = (
            period_inputs
            | 'Group StateIncarcerationPeriods to StateSupervisionViolationResponses'
            >> beam.CoGroupByKey())

        # ... and set the hydrated responses on the incarceration periods.
        incarceration_periods_with_source_violations = (
            incarceration_periods_and_violation_responses
            | 'Set hydrated StateSupervisionViolationResponses on the StateIncarcerationPeriods'
            >> beam.ParDo(SetViolationResponseOnIncarcerationPeriod()))

        # Everything known about each person, grouped under one key
        person_inputs = {
            'person': persons,
            'assessments': assessments,
            'incarceration_periods': incarceration_periods_with_source_violations,
            'supervision_periods': supervision_periods,
            'supervision_sentences': sentences_converted.supervision_sentences,
            'incarceration_sentences': sentences_converted.incarceration_sentences,
            'violation_responses': violation_responses_with_hydrated_violations,
            'supervision_contacts': supervision_contacts,
            'supervision_period_judicial_district_association': sp_to_judicial_district_kv,
        }
        person_entities = (
            person_inputs
            | 'Group StatePerson to all entities' >> beam.CoGroupByKey())

        # SupervisionTimeBuckets; the two agent association tables are passed
        # to the DoFn as dict side inputs.
        person_time_buckets = (
            person_entities
            | 'Get SupervisionTimeBuckets' >> beam.ParDo(
                ClassifySupervisionTimeBuckets(),
                AsDict(ssvr_agent_associations_as_kv),
                AsDict(supervision_period_to_agent_associations_as_kv)))

        person_metadata = (
            persons
            | "Build the person_metadata dictionary" >> beam.ParDo(
                BuildPersonMetadata(),
                AsList(state_race_ethnicity_population_counts)))

        events_and_metadata = {
            'person_events': person_time_buckets,
            'person_metadata': person_metadata,
        }
        person_time_buckets_with_metadata = (
            events_and_metadata
            | 'Group SupervisionTimeBuckets with person-level metadata'
            >> beam.CoGroupByKey()
            | 'Organize StatePerson, PersonMetadata and SupervisionTimeBuckets for calculations'
            >> beam.ParDo(ExtractPersonEventsMetadata()))

        # Re-read the pipeline job details (used for accessing job_id)
        all_pipeline_options = apache_beam_pipeline_options.get_all_options()

        # The type of metric to calculate
        metric_types_set = set(metric_types)

        # Timestamp for local jobs
        all_pipeline_options['job_timestamp'] = datetime.datetime.now().strftime(
            '%Y-%m-%d_%H_%M_%S.%f')

        supervision_metrics = (
            person_time_buckets_with_metadata
            | 'Get Supervision Metrics' >> GetSupervisionMetrics(
                pipeline_options=all_pipeline_options,
                metric_types=metric_types_set,
                calculation_end_month=calculation_end_month,
                calculation_month_count=calculation_month_count))

        if person_id_filter_set:
            logging.warning(
                "Non-empty person filter set - returning before writing metrics.")
            return

        # Convert the metrics into a BQ-writable format, one tagged output
        # per metric type.
        writable_metrics = (
            supervision_metrics
            | 'Convert to dict to be written to BQ'
            >> beam.ParDo(RecidivizMetricWritableDict()).with_outputs(
                SupervisionMetricType.SUPERVISION_COMPLIANCE.value,
                SupervisionMetricType.SUPERVISION_POPULATION.value,
                SupervisionMetricType.SUPERVISION_REVOCATION.value,
                SupervisionMetricType.SUPERVISION_REVOCATION_ANALYSIS.value,
                SupervisionMetricType.SUPERVISION_REVOCATION_VIOLATION_TYPE_ANALYSIS.value,
                SupervisionMetricType.SUPERVISION_SUCCESS.value,
                SupervisionMetricType.SUPERVISION_SUCCESSFUL_SENTENCE_DAYS_SERVED.value,
                SupervisionMetricType.SUPERVISION_TERMINATION.value))

        # Destination table for each metric class
        table_for = DATAFLOW_METRICS_TO_TABLES.get
        terminations_table_id = table_for(SupervisionTerminationMetric)
        compliance_table_id = table_for(SupervisionCaseComplianceMetric)
        populations_table_id = table_for(SupervisionPopulationMetric)
        revocations_table_id = table_for(SupervisionRevocationMetric)
        revocation_analysis_table_id = table_for(
            SupervisionRevocationAnalysisMetric)
        revocation_violation_type_analysis_table_id = table_for(
            SupervisionRevocationViolationTypeAnalysisMetric)
        successes_table_id = table_for(SupervisionSuccessMetric)
        successful_sentence_lengths_table_id = table_for(
            SuccessfulSupervisionSentenceDaysServedMetric)

        def append_to_bq(tagged_metrics, step_label, table_id):
            """Appends one tagged output to an existing table in |output| via file loads."""
            return tagged_metrics | step_label >> beam.io.WriteToBigQuery(
                table=table_id,
                dataset=output,
                create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
                method=beam.io.WriteToBigQuery.Method.FILE_LOADS)

        _ = append_to_bq(
            writable_metrics.SUPERVISION_POPULATION,
            f"Write population metrics to BQ table: {populations_table_id}",
            populations_table_id)
        _ = append_to_bq(
            writable_metrics.SUPERVISION_REVOCATION,
            f"Write revocation metrics to BQ table: {revocations_table_id}",
            revocations_table_id)
        _ = append_to_bq(
            writable_metrics.SUPERVISION_SUCCESS,
            f"Write success metrics to BQ table: {successes_table_id}",
            successes_table_id)
        _ = append_to_bq(
            writable_metrics.SUPERVISION_SUCCESSFUL_SENTENCE_DAYS_SERVED,
            f"Write supervision successful sentence length metrics to BQ table: {successful_sentence_lengths_table_id}",
            successful_sentence_lengths_table_id)
        _ = append_to_bq(
            writable_metrics.SUPERVISION_TERMINATION,
            f"Write termination metrics to BQ table: {terminations_table_id}",
            terminations_table_id)
        _ = append_to_bq(
            writable_metrics.SUPERVISION_REVOCATION_ANALYSIS,
            f"Write revocation analyses metrics to BQ table: {revocation_analysis_table_id}",
            revocation_analysis_table_id)
        _ = append_to_bq(
            writable_metrics.SUPERVISION_REVOCATION_VIOLATION_TYPE_ANALYSIS,
            f"Write revocation violation type analyses metrics to BQ table: {revocation_violation_type_analysis_table_id}",
            revocation_violation_type_analysis_table_id)
        _ = append_to_bq(
            writable_metrics.SUPERVISION_COMPLIANCE,
            f"Write compliance metrics to BQ table: {compliance_table_id}",
            compliance_table_id)
def run(
    apache_beam_pipeline_options: PipelineOptions,
    data_input: str,
    reference_view_input: str,
    static_reference_input: str,
    output: str,
    calculation_month_count: int,
    metric_types: List[str],
    state_code: str,
    calculation_end_month: Optional[str],
    person_filter_ids: Optional[List[int]],
):
    """Runs the program calculation pipeline.

    Loads persons, program assignments, assessments and supervision periods,
    groups them per person, classifies the program assignments into events,
    calculates the program metrics and, unless a person filter is in effect,
    appends the referral and participation metrics to their output tables.

    Args:
        apache_beam_pipeline_options: Options the pipeline is built from.
            `save_main_session` is forced to True on them.
        data_input: Dataset name (without project) holding the entity tables.
        reference_view_input: Dataset name (without project) holding the
            reference views.
        static_reference_input: Dataset name (without project) holding the
            static reference tables.
        output: Dataset the metric tables are written to.
        calculation_month_count: Forwarded to GetProgramMetrics.
        metric_types: Metric types to calculate; de-duplicated into a set.
        state_code: State filter applied to every load. Must not be None.
        calculation_end_month: Forwarded to GetProgramMetrics.
        person_filter_ids: Optional person ids to restrict the run to. When
            non-empty the metrics are calculated but nothing is written.

    Raises:
        ValueError: If the pipeline options carry no project, or if
            `state_code` is None.
    """
    # Instantiating any SQLAlchemy object up front forces the relationship
    # properties to load; BuildRootEntity reads attributes of those
    # relationship properties and fails if nothing has been instantiated yet.
    _ = schema.StatePerson()

    setup_options = apache_beam_pipeline_options.view_as(SetupOptions)
    setup_options.save_main_session = True

    # Pipeline job details
    all_pipeline_options = apache_beam_pipeline_options.get_all_options()
    project_id = all_pipeline_options["project"]

    if project_id is None:
        raise ValueError(f"No project set in pipeline options: {all_pipeline_options}")

    if state_code is None:
        raise ValueError("No state_code set for pipeline")

    input_dataset = project_id + "." + data_input
    reference_dataset = project_id + "." + reference_view_input
    static_reference_dataset = project_id + "." + static_reference_input

    if person_filter_ids:
        person_id_filter_set = set(person_filter_ids)
    else:
        person_id_filter_set = None

    with beam.Pipeline(options=apache_beam_pipeline_options) as pipeline:

        def load_root_entity(step_label, entity_class, build_related_entities):
            """Applies one BuildRootEntity load, unified on StatePerson's id field."""
            return pipeline | step_label >> BuildRootEntity(
                dataset=input_dataset,
                root_entity_class=entity_class,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=build_related_entities,
                unifying_id_field_filter_set=person_id_filter_set,
                state_code=state_code,
            )

        # Root entities
        persons = load_root_entity("Load Persons", entities.StatePerson, True)
        program_assignments = load_root_entity(
            "Load Program Assignments", entities.StateProgramAssignment, True
        )
        assessments = load_root_entity(
            "Load Assessments", entities.StateAssessment, False
        )
        supervision_periods = load_root_entity(
            "Load SupervisionPeriods", entities.StateSupervisionPeriod, False
        )

        supervision_period_to_agent_associations_as_kv = (
            pipeline
            | "Load supervision_period_to_agent_associations_as_kv"
            >> ImportTableAsKVTuples(
                dataset_id=reference_dataset,
                table_id=SUPERVISION_PERIOD_TO_AGENT_ASSOCIATION_VIEW_NAME,
                table_key="person_id",
                state_code_filter=state_code,
                person_id_filter_set=person_id_filter_set,
            )
        )

        state_race_ethnicity_population_counts = (
            pipeline
            | "Load state_race_ethnicity_population_counts"
            >> ImportTable(
                dataset_id=static_reference_dataset,
                table_id="state_race_ethnicity_population_counts",
                state_code_filter=state_code,
                person_id_filter_set=None,
            )
        )

        # Everything known about each person, grouped under one key
        person_inputs = {
            "person": persons,
            "program_assignments": program_assignments,
            "assessments": assessments,
            "supervision_periods": supervision_periods,
            "supervision_period_to_agent_association": supervision_period_to_agent_associations_as_kv,
        }
        persons_entities = (
            person_inputs
            | "Group StatePerson to StateProgramAssignments and"
            >> beam.CoGroupByKey()
        )

        # ProgramEvents from each person's program assignments. This step is
        # applied without an explicit label, as in the original pipeline.
        classify_assignments = beam.ParDo(ClassifyProgramAssignments())
        person_program_events = persons_entities | classify_assignments

        person_metadata = (
            persons
            | "Build the person_metadata dictionary"
            >> beam.ParDo(
                BuildPersonMetadata(),
                AsList(state_race_ethnicity_population_counts),
            )
        )

        events_and_metadata = {
            "person_events": person_program_events,
            "person_metadata": person_metadata,
        }
        person_program_events_with_metadata = (
            events_and_metadata
            | "Group ProgramEvents with person-level metadata"
            >> beam.CoGroupByKey()
            | "Organize StatePerson, PersonMetadata and ProgramEvents for calculations"
            >> beam.ParDo(ExtractPersonEventsMetadata())
        )

        # Re-read the pipeline job details (used for accessing job_id)
        all_pipeline_options = apache_beam_pipeline_options.get_all_options()

        # Timestamp for local jobs
        all_pipeline_options["job_timestamp"] = datetime.datetime.now().strftime(
            "%Y-%m-%d_%H_%M_%S.%f"
        )

        # The type of metric to calculate
        metric_types_set = set(metric_types)

        program_metrics = (
            person_program_events_with_metadata
            | "Get Program Metrics"
            >> GetProgramMetrics(
                pipeline_options=all_pipeline_options,
                metric_types=metric_types_set,
                calculation_end_month=calculation_end_month,
                calculation_month_count=calculation_month_count,
            )
        )

        if person_id_filter_set:
            logging.warning(
                "Non-empty person filter set - returning before writing metrics."
            )
            return

        # Convert the metrics into a BQ-writable format, one tagged output
        # per metric type.
        to_writable = beam.ParDo(RecidivizMetricWritableDict()).with_outputs(
            ProgramMetricType.PROGRAM_PARTICIPATION.value,
            ProgramMetricType.PROGRAM_REFERRAL.value,
        )
        writable_metrics = (
            program_metrics | "Convert to dict to be written to BQ" >> to_writable
        )

        # Destination tables; both lookups happen before either write is applied.
        referrals_table_id = DATAFLOW_METRICS_TO_TABLES[ProgramReferralMetric]
        participation_table_id = DATAFLOW_METRICS_TO_TABLES[ProgramParticipationMetric]

        referral_write = WriteAppendToBigQuery(
            output_table=referrals_table_id,
            output_dataset=output,
        )
        _ = (
            writable_metrics.PROGRAM_REFERRAL
            | f"Write referral metrics to BQ table: {referrals_table_id}"
            >> referral_write
        )

        participation_write = WriteAppendToBigQuery(
            output_table=participation_table_id,
            output_dataset=output,
        )
        _ = (
            writable_metrics.PROGRAM_PARTICIPATION
            | f"Write participation metrics to BQ table: {participation_table_id}"
            >> participation_write
        )
def run(argv=sys.argv):
    """Build and run the pipeline that writes the discounted top-selling products.

    Reads the sales file and the offers file, derives the top-selling items
    (TopSellingProducts) and the offered item ids (OfferedItem), matches the
    two with match_id_fn (the offered ids are supplied as a list side input)
    and writes the result as '.csv' text with a 'Product_ID, Product_Name'
    header. The pipeline runs when the `with` block exits.

    Args:
        argv: Argument list handed to `ArgumentParser.parse_known_args`.
            Recognised flags are --input1, --input2 and --output; everything
            else is forwarded to PipelineOptions.
            NOTE(review): the default is `sys.argv`, whose first element is
            the program name, so that name also ends up among the forwarded
            pipeline arguments - confirm this is intended.
    """
    # Command-line arguments related to data processing
    parser = argparse.ArgumentParser()

    # Input file that contains sales data
    parser.add_argument(
        '--input1',
        dest='input1',
        default='../data/spikey_sales_weekly.txt',
        help='Input Sales Data file')

    # Input file that contains all the offers currently running
    parser.add_argument(
        '--input2',
        dest='input2',
        default='../data/spikey_offers.txt',
        help='Input Offers Data file')

    # Output file prefix that will contain the discounted top selling products.
    # The help text used to be a copy of the --input2 help.
    parser.add_argument(
        '--output',
        dest='output',
        default='../output/output_top_seller_offer',
        help='Output file prefix for discounted top selling products')

    known_args, pipeline_args = parser.parse_known_args(argv)

    # Apache Beam pipeline options; save_main_session is enabled.
    pipeline_option = PipelineOptions(pipeline_args)
    pipeline_option.view_as(SetupOptions).save_main_session = True

    with beam.Pipeline(options=pipeline_option) as p:
        # NOTE: the logging calls below execute while the pipeline graph is
        # being built, i.e. before the `with` block exits and runs it.

        # Read sales data
        logging.info("Input sales data file %s", known_args.input1)
        sold_items = p | 'Read Sales Data' >> ReadFromText(known_args.input1)

        # Read offers data
        logging.info("Input offer data file %s", known_args.input2)
        discounted_items = p | 'Read offers' >> ReadFromText(known_args.input2)

        # Get top selling items
        top_selling_items = (
            sold_items
            | 'Top Selling Products' >> beam.ParDo(TopSellingProducts()))
        logging.info("top_selling_items calculation completed")

        discounted_item_ids = (
            discounted_items
            | 'Offer Items' >> beam.ParDo(OfferedItem()))
        logging.info("discounted_item_ids calculation completed")

        # Match top sellers against the offered ids (passed as a list side input)
        top_discounted_items = (
            top_selling_items
            | 'Discounted Item Match' >> beam.FlatMap(
                match_id_fn, AsList(discounted_item_ids)))

        (top_discounted_items
         | 'Write output File' >> WriteToText(
             known_args.output,
             file_name_suffix='.csv',
             header='Product_ID, Product_Name'))
def execute_pipeline(
    self,
    pipeline: beam.Pipeline,
    all_pipeline_options: Dict[str, Any],
    state_code: str,
    input_dataset: str,
    reference_dataset: str,
    static_reference_dataset: str,
    metric_types: List[str],
    person_id_filter_set: Optional[Set[int]],
    calculation_month_count: int = -1,
    calculation_end_month: Optional[str] = None,
) -> beam.Pipeline:
    """Adds the incarceration calculation steps to |pipeline|.

    Loads persons, sentence groups, sentences, supervision periods,
    assessments, violations and violation responses, hydrates the sentences
    and sentence groups, groups everything per person, classifies the events
    and applies GetMetrics.

    Args:
        pipeline: Pipeline the steps are applied to.
        all_pipeline_options: Option dict forwarded to GetMetrics. A
            'job_timestamp' entry is written into it in place.
        state_code: State filter applied to every load. For "US_MO" the MO
            sentence status view is read; otherwise an empty collection
            stands in for it.
        input_dataset: Dataset the root entities are built from.
        reference_dataset: Dataset holding the reference views.
        static_reference_dataset: Dataset holding the static reference tables.
        metric_types: Metric types to include; de-duplicated into a set.
        person_id_filter_set: Optional person ids to restrict the loads to.
        calculation_month_count: Forwarded to GetMetrics.
        calculation_end_month: Forwarded to GetMetrics.

    Returns:
        The result of applying the "Get Incarceration Metrics" step.
        NOTE(review): the annotation says beam.Pipeline, but the returned
        value is the output of GetMetrics - confirm the intended type.
    """

    def load_root_entity(step_label, entity_class, build_related_entities):
        """Applies one BuildRootEntity load, unified on StatePerson's id field."""
        return pipeline | step_label >> BuildRootEntity(
            dataset=input_dataset,
            root_entity_class=entity_class,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=build_related_entities,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code,
        )

    def import_person_keyed_view(step_label, view_name):
        """Applies one ImportTableAsKVTuples load keyed on person_id."""
        return pipeline | step_label >> ImportTableAsKVTuples(
            dataset_id=reference_dataset,
            table_id=view_name,
            table_key="person_id",
            state_code_filter=state_code,
            person_id_filter_set=person_id_filter_set,
        )

    # Root entities
    persons = load_root_entity(
        "Load StatePersons", entities.StatePerson, True)
    sentence_groups = load_root_entity(
        "Load StateSentenceGroups", entities.StateSentenceGroup, True)
    incarceration_sentences = load_root_entity(
        "Load StateIncarcerationSentences",
        entities.StateIncarcerationSentence, True)
    supervision_sentences = load_root_entity(
        "Load StateSupervisionSentences",
        entities.StateSupervisionSentence, True)
    supervision_periods = load_root_entity(
        "Load StateSupervisionPeriods",
        entities.StateSupervisionPeriod, True)
    assessments = load_root_entity(
        "Load Assessments", entities.StateAssessment, False)
    supervision_violations = load_root_entity(
        "Load SupervisionViolations",
        entities.StateSupervisionViolation, True)
    supervision_violation_responses = load_root_entity(
        "Load SupervisionViolationResponses",
        entities.StateSupervisionViolationResponse, True)

    if state_code != "US_MO":
        # Non-MO run: an empty collection keeps the downstream join shape.
        us_mo_sentence_statuses = (
            pipeline
            | f"Generate empty MO statuses list for non-MO state run: {state_code} "
            >> beam.Create([]))
    else:
        # Reference table that includes sentence status ranking information
        us_mo_sentence_status_query = select_all_by_person_query(
            reference_dataset,
            US_MO_SENTENCE_STATUSES_VIEW_NAME,
            state_code,
            person_id_filter_set,
        )
        us_mo_sentence_statuses = (
            pipeline
            | "Read MO sentence status table from BigQuery"
            >> ReadFromBigQuery(query=us_mo_sentence_status_query))

    us_mo_sentence_status_rankings_as_kv = (
        us_mo_sentence_statuses
        | "Convert MO sentence status ranking table to KV tuples"
        >> beam.ParDo(ConvertDictToKVTuple(), "person_id"))

    sentence_inputs = {
        "incarceration_sentences": incarceration_sentences,
        "supervision_sentences": supervision_sentences,
        "sentence_statuses": us_mo_sentence_status_rankings_as_kv,
    }
    supervision_sentences_and_statuses = (
        sentence_inputs
        | "Group sentences to the sentence statuses for that person"
        >> beam.CoGroupByKey())

    sentences_converted = (
        supervision_sentences_and_statuses
        | "Convert to state-specific sentences"
        >> beam.ParDo(ConvertSentencesToStateSpecificType()).with_outputs(
            "incarceration_sentences", "supervision_sentences"))

    def hydrate_supervision_periods(sentence_kind, sentences):
        """Groups supervision periods with |sentences| and sets them on each sentence."""
        return (
            {
                "supervision_periods": supervision_periods,
                "sentences": sentences,
            }
            | f"Group supervision periods to {sentence_kind} sentences"
            >> beam.CoGroupByKey()
            | f"Set hydrated supervision periods on {sentence_kind} sentences"
            >> beam.ParDo(SetSupervisionPeriodsOnSentences()))

    incarceration_sentences_with_hydrated_sps = hydrate_supervision_periods(
        "incarceration", sentences_converted.incarceration_sentences)
    supervision_sentences_with_hydrated_sps = hydrate_supervision_periods(
        "supervision", sentences_converted.supervision_sentences)

    sentence_group_inputs = {
        "sentence_groups": sentence_groups,
        "incarceration_sentences": incarceration_sentences_with_hydrated_sps,
        "supervision_sentences": supervision_sentences_with_hydrated_sps,
    }
    sentences_and_sentence_groups = (
        sentence_group_inputs
        | "Group sentences to sentence groups" >> beam.CoGroupByKey())

    # Set hydrated sentences on the corresponding sentence groups
    sentence_groups_with_hydrated_sentences = (
        sentences_and_sentence_groups
        | "Set hydrated sentences on sentence groups"
        >> beam.ParDo(SetSentencesOnSentenceGroup()))

    # Person-keyed reference views
    person_id_to_county_kv = import_person_keyed_view(
        "Load person_id_to_county_kv",
        PERSONS_TO_RECENT_COUNTY_OF_RESIDENCE_VIEW_NAME)
    ip_to_judicial_district_kv = import_person_keyed_view(
        "Load ip_to_judicial_district_kv",
        INCARCERATION_PERIOD_JUDICIAL_DISTRICT_ASSOCIATION_VIEW_NAME)
    supervision_period_to_agent_associations_as_kv = import_person_keyed_view(
        "Load supervision_period_to_agent_associations_as_kv",
        SUPERVISION_PERIOD_TO_AGENT_ASSOCIATION_VIEW_NAME)

    state_race_ethnicity_population_counts = (
        pipeline
        | "Load state_race_ethnicity_population_counts" >> ImportTable(
            dataset_id=static_reference_dataset,
            table_id="state_race_ethnicity_population_counts",
            state_code_filter=state_code,
            person_id_filter_set=None,
        ))

    # Join violations and violation responses per person ...
    violation_inputs = {
        "violations": supervision_violations,
        "violation_responses": supervision_violation_responses,
    }
    supervision_violations_and_responses = (
        violation_inputs
        | "Group StateSupervisionViolationResponses to StateSupervisionViolations"
        >> beam.CoGroupByKey())

    # ... and set the hydrated violations on their responses.
    violation_responses_with_hydrated_violations = (
        supervision_violations_and_responses
        | "Set hydrated StateSupervisionViolations on the StateSupervisionViolationResponses"
        >> beam.ParDo(SetViolationOnViolationsResponse()))

    # Everything known about each person, grouped under one key
    person_inputs = {
        "person": persons,
        "assessments": assessments,
        "sentence_groups": sentence_groups_with_hydrated_sentences,
        "violation_responses": violation_responses_with_hydrated_violations,
        "incarceration_period_judicial_district_association": ip_to_judicial_district_kv,
        "supervision_period_to_agent_association": supervision_period_to_agent_associations_as_kv,
        "persons_to_recent_county_of_residence": person_id_to_county_kv,
    }
    person_entities = (
        person_inputs
        | "Group StatePerson to SentenceGroups" >> beam.CoGroupByKey())

    # Classify each person's grouped entities into incarceration events
    person_incarceration_events = (
        person_entities
        | "Classify Incarceration Events" >> beam.ParDo(
            ClassifyEvents(),
            identifier=self.pipeline_config.identifier))

    person_metadata = (
        persons
        | "Build the person_metadata dictionary" >> beam.ParDo(
            BuildPersonMetadata(),
            state_race_ethnicity_population_counts=AsList(
                state_race_ethnicity_population_counts),
        ))

    events_and_metadata = {
        "person_events": person_incarceration_events,
        "person_metadata": person_metadata,
    }
    person_incarceration_events_with_metadata = (
        events_and_metadata
        | "Group IncarcerationEvents with person-level metadata"
        >> beam.CoGroupByKey()
        | "Organize StatePerson, PersonMetadata and IncarcerationEvents for calculations"
        >> beam.ParDo(ExtractPersonEventsMetadata()))

    # Timestamp for local jobs; written into the caller's option dict.
    all_pipeline_options["job_timestamp"] = datetime.datetime.now().strftime(
        "%Y-%m-%d_%H_%M_%S.%f")

    # The type of metric to calculate
    metric_types_set = set(metric_types)

    # IncarcerationMetrics
    return (
        person_incarceration_events_with_metadata
        | "Get Incarceration Metrics" >> GetMetrics(
            pipeline_options=all_pipeline_options,
            pipeline_config=self.pipeline_config,
            metric_types_to_include=metric_types_set,
            calculation_end_month=calculation_end_month,
            calculation_month_count=calculation_month_count,
        ))
def execute_pipeline(
    self,
    pipeline: beam.Pipeline,
    all_pipeline_options: Dict[str, Any],
    state_code: str,
    input_dataset: str,
    reference_dataset: str,
    static_reference_dataset: str,
    metric_types: List[str],
    person_id_filter_set: Optional[Set[int]],
    calculation_month_count: int = -1,
    calculation_end_month: Optional[str] = None,
) -> beam.Pipeline:
    """Assembles the program-metrics stages on top of `pipeline`.

    Loads StatePerson, StateProgramAssignment, StateAssessment and
    StateSupervisionPeriod entities, co-groups them with the
    supervision-period-to-agent association table, classifies events,
    attaches person-level metadata, and finally applies GetMetrics.

    Note that `all_pipeline_options` is modified in place: a
    "job_timestamp" entry is written into it before GetMetrics is built.

    Returns:
        Whatever the "Get Program Metrics" GetMetrics stage produces.
    """

    def load_root_entities(label, entity_class, with_related_entities):
        """Applies a BuildRootEntity read for `entity_class` under `label`."""
        reader = BuildRootEntity(
            dataset=input_dataset,
            root_entity_class=entity_class,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=with_related_entities,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code,
        )
        return pipeline | label >> reader

    # Root entities, all unified on the StatePerson id field.
    persons = load_root_entities("Load Persons", entities.StatePerson, True)
    program_assignments = load_root_entities(
        "Load Program Assignments", entities.StateProgramAssignment, True
    )
    assessments = load_root_entities(
        "Load Assessments", entities.StateAssessment, False
    )
    supervision_periods = load_root_entities(
        "Load SupervisionPeriods", entities.StateSupervisionPeriod, False
    )

    # Reference table associating supervision periods with agents.
    agent_association_reader = ImportTableAsKVTuples(
        dataset_id=reference_dataset,
        table_id=SUPERVISION_PERIOD_TO_AGENT_ASSOCIATION_VIEW_NAME,
        table_key="person_id",
        state_code_filter=state_code,
        person_id_filter_set=person_id_filter_set,
    )
    agent_associations_kv = (
        pipeline
        | "Load supervision_period_to_agent_associations_as_kv"
        >> agent_association_reader
    )

    # Static population counts; deliberately not filtered by person.
    population_counts_reader = ImportTable(
        dataset_id=static_reference_dataset,
        table_id="state_race_ethnicity_population_counts",
        state_code_filter=state_code,
        person_id_filter_set=None,
    )
    population_counts = (
        pipeline
        | "Load state_race_ethnicity_population_counts"
        >> population_counts_reader
    )

    # Group each StatePerson with their other entities
    inputs_by_name = {
        "person": persons,
        "program_assignments": program_assignments,
        "assessments": assessments,
        "supervision_periods": supervision_periods,
        "supervision_period_to_agent_association": agent_associations_kv,
    }
    grouped_entities = (
        inputs_by_name
        | "Group StatePerson to StateProgramAssignments and"
        >> beam.CoGroupByKey()
    )

    # Identify ProgramEvents from the StatePerson's StateProgramAssignments
    classify_events = beam.ParDo(
        ClassifyEvents(), identifier=self.pipeline_config.identifier
    )
    program_events = grouped_entities | classify_events

    # The population counts are supplied to BuildPersonMetadata as a list
    # side input.
    build_metadata = beam.ParDo(
        BuildPersonMetadata(),
        state_race_ethnicity_population_counts=AsList(population_counts),
    )
    person_metadata = (
        persons | "Build the person_metadata dictionary" >> build_metadata
    )

    events_and_metadata = {
        "person_events": program_events,
        "person_metadata": person_metadata,
    }
    grouped_events = (
        events_and_metadata
        | "Group ProgramEvents with person-level metadata"
        >> beam.CoGroupByKey()
    )
    events_with_metadata = (
        grouped_events
        | "Organize StatePerson, PersonMetadata and ProgramEvents for calculations"
        >> beam.ParDo(ExtractPersonEventsMetadata())
    )

    # Add timestamp for local jobs
    timestamp_now = datetime.datetime.now()
    all_pipeline_options["job_timestamp"] = timestamp_now.strftime(
        "%Y-%m-%d_%H_%M_%S.%f"
    )

    # Get the type of metric to calculate
    requested_metric_types = set(metric_types)

    # Get program metrics
    get_metrics = GetMetrics(
        pipeline_options=all_pipeline_options,
        pipeline_config=self.pipeline_config,
        metric_types_to_include=requested_metric_types,
        calculation_end_month=calculation_end_month,
        calculation_month_count=calculation_month_count,
    )
    return events_with_metadata | "Get Program Metrics" >> get_metrics
def run_test_pipeline(self,
                      dataset: str,
                      fake_supervision_period_id: int,
                      unifying_id_field_filter_set: Optional[Set[int]] = None,
                      metric_types_filter: Optional[Set[str]] = None):
    """Builds and runs a test version of the program pipeline.

    Entities are read from `dataset`; the agent-association and population
    count reference tables are replaced by single in-memory rows. The
    resulting metrics are checked with
    AssertMatchers.validate_pipeline_test().
    """
    test_pipeline = TestPipeline()

    def root_entity_reader(entity_class, build_related, **extra_kwargs):
        """Creates a BuildRootEntity read unified on the StatePerson id."""
        return extractor_utils.BuildRootEntity(
            dataset=dataset,
            root_entity_class=entity_class,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=build_related,
            **extra_kwargs)

    # Get StatePersons (this load is built without the id filter set)
    persons = (
        test_pipeline
        | 'Load Persons' >>  # type: ignore
        root_entity_reader(entities.StatePerson, True))

    # Get StateProgramAssignments
    program_assignments = (
        test_pipeline
        | 'Load Program Assignments' >>  # type: ignore
        root_entity_reader(
            entities.StateProgramAssignment, True,
            unifying_id_field_filter_set=unifying_id_field_filter_set))

    # Get StateAssessments
    assessments = (
        test_pipeline
        | 'Load Assessments' >>  # type: ignore
        root_entity_reader(
            entities.StateAssessment, False,
            unifying_id_field_filter_set=unifying_id_field_filter_set))

    # Get StateSupervisionPeriods
    supervision_periods = (
        test_pipeline
        | 'Load SupervisionPeriods' >>  # type: ignore
        root_entity_reader(
            entities.StateSupervisionPeriod, False,
            unifying_id_field_filter_set=unifying_id_field_filter_set))

    # One fake agent-association row, keyed by its supervision period id.
    agent_association_row = {
        'agent_id': 1010,
        'agent_external_id': 'OFFICER0009',
        'district_external_id': '10',
        'supervision_period_id': fake_supervision_period_id
    }
    agent_association_table = (
        test_pipeline
        | 'Create SupervisionPeriod to Agent table' >>
        beam.Create([agent_association_row]))
    agent_associations_as_kv = (
        agent_association_table
        | 'Convert SupervisionPeriod to Agent table to KV tuples' >>
        beam.ParDo(ConvertDictToKVTuple(), 'supervision_period_id'))

    # One fake population-count row.
    population_count_row = {
        'state_code': 'US_XX',
        'race_or_ethnicity': 'BLACK',
        'population_count': 1,
        'representation_priority': 1
    }
    population_counts = (
        test_pipeline
        | 'Create state_race_ethnicity_population_count table' >>
        beam.Create([population_count_row]))

    # Group each StatePerson with their other entities
    entities_by_name = {
        'person': persons,
        'program_assignments': program_assignments,
        'assessments': assessments,
        'supervision_periods': supervision_periods
    }
    persons_entities = (
        entities_by_name
        | 'Group StatePerson to StateProgramAssignments and' >>
        beam.CoGroupByKey())

    # Identify ProgramEvents from the StatePerson's StateProgramAssignments;
    # the agent associations are passed as a dict side input.
    classify_assignments = beam.ParDo(
        pipeline.ClassifyProgramAssignments(),
        AsDict(agent_associations_as_kv))
    person_program_events = persons_entities | classify_assignments

    build_metadata = beam.ParDo(
        BuildPersonMetadata(), AsList(population_counts))
    person_metadata = (
        persons
        | "Build the person_metadata dictionary" >> build_metadata)

    events_and_metadata = {
        'person_events': person_program_events,
        'person_metadata': person_metadata
    }
    grouped_events = (
        events_and_metadata
        | 'Group ProgramEvents with person-level metadata' >>
        beam.CoGroupByKey())
    person_program_events_with_metadata = (
        grouped_events
        | 'Organize StatePerson, PersonMetadata and ProgramEvents for calculations' >>
        beam.ParDo(ExtractPersonEventsMetadata()))

    # Get pipeline job details for accessing job_id
    all_pipeline_options = PipelineOptions().get_all_options()

    # Add timestamp for local jobs
    all_pipeline_options['job_timestamp'] = datetime.datetime.now().strftime(
        '%Y-%m-%d_%H_%M_%S.%f')

    # An empty or missing filter means every metric type is requested.
    metric_types = metric_types_filter or {'ALL'}

    # Get program metrics
    get_program_metrics = pipeline.GetProgramMetrics(
        pipeline_options=all_pipeline_options,
        metric_types=metric_types,
        calculation_end_month=None,
        calculation_month_count=-1)
    program_metrics = (
        person_program_events_with_metadata
        | 'Get Program Metrics' >>  # type: ignore
        get_program_metrics)

    assert_that(program_metrics, AssertMatchers.validate_pipeline_test())

    test_pipeline.run()
def run(apache_beam_pipeline_options: PipelineOptions,
        data_input: str,
        reference_view_input: str,
        static_reference_input: str,
        output: str,
        metric_types: List[str],
        state_code: Optional[str],
        person_filter_ids: Optional[List[int]]):
    """Runs the recidivism calculation pipeline.

    Args:
        apache_beam_pipeline_options: Options used to construct the Beam
            pipeline. Its "project" option is required.
        data_input: Name of the dataset holding the entity tables; it is
            prefixed with the project id.
        reference_view_input: Name of the dataset holding reference views;
            it is prefixed with the project id.
        static_reference_input: Name of the dataset holding static
            reference tables; it is prefixed with the project id.
        output: Dataset the metric tables are written to.
        metric_types: Metric types to calculate; duplicates are collapsed.
        state_code: Passed through as the state filter to every load. It
            may be None and is not validated here.
        person_filter_ids: Optional person ids to restrict the run to. When
            this is non-empty the metrics are calculated but NOT written
            to BigQuery.

    Raises:
        ValueError: If no project is set in the pipeline options.
    """

    # Workaround to load SQLAlchemy objects at start of pipeline. This is
    # necessary because the BuildRootEntity function tries to access attributes
    # of relationship properties on the SQLAlchemy room_schema_class before they
    # have been loaded. However, if *any* SQLAlchemy objects have been
    # instantiated, then the relationship properties are loaded and their
    # attributes can be successfully accessed.
    _ = schema.StatePerson()

    apache_beam_pipeline_options.view_as(SetupOptions).save_main_session = True

    # Get pipeline job details
    all_pipeline_options = apache_beam_pipeline_options.get_all_options()

    project_id = all_pipeline_options['project']

    # Fail with a clear message instead of the TypeError that concatenating
    # None with a str below would otherwise raise.
    if project_id is None:
        raise ValueError(
            f"No project set in pipeline options: {all_pipeline_options}")

    input_dataset = project_id + '.' + data_input
    reference_dataset = project_id + '.' + reference_view_input
    static_reference_dataset = project_id + '.' + static_reference_input

    # An empty or missing id list means "no person filter".
    person_id_filter_set = set(person_filter_ids) if person_filter_ids else None

    with beam.Pipeline(options=apache_beam_pipeline_options) as p:
        # Get StatePersons
        persons = (p | 'Load Persons' >> BuildRootEntity(
            dataset=input_dataset,
            root_entity_class=entities.StatePerson,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=True,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code))

        # Get StateIncarcerationPeriods
        incarceration_periods = (p | 'Load IncarcerationPeriods' >> BuildRootEntity(
            dataset=input_dataset,
            root_entity_class=entities.StateIncarcerationPeriod,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=True,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code))

        # Get StateSupervisionViolations
        supervision_violations = (p | 'Load SupervisionViolations' >> BuildRootEntity(
            dataset=input_dataset,
            root_entity_class=entities.StateSupervisionViolation,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=True,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code))

        # TODO(#2769): Don't bring this in as a root entity
        # Get StateSupervisionViolationResponses
        supervision_violation_responses = (
            p | 'Load SupervisionViolationResponses' >> BuildRootEntity(
                dataset=input_dataset,
                root_entity_class=entities.StateSupervisionViolationResponse,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=True,
                unifying_id_field_filter_set=person_id_filter_set,
                state_code=state_code))

        # Group StateSupervisionViolationResponses and
        # StateSupervisionViolations by person_id
        supervision_violations_and_responses = (
            {'violations': supervision_violations,
             'violation_responses': supervision_violation_responses}
            | 'Group StateSupervisionViolationResponses to '
              'StateSupervisionViolations' >>
            beam.CoGroupByKey()
        )

        # Set the fully hydrated StateSupervisionViolation entities on
        # the corresponding StateSupervisionViolationResponses
        violation_responses_with_hydrated_violations = (
            supervision_violations_and_responses
            | 'Set hydrated StateSupervisionViolations on '
              'the StateSupervisionViolationResponses' >>
            beam.ParDo(SetViolationOnViolationsResponse()))

        # Group StateIncarcerationPeriods and StateSupervisionViolationResponses
        # by person_id
        incarceration_periods_and_violation_responses = (
            {'incarceration_periods': incarceration_periods,
             'violation_responses': violation_responses_with_hydrated_violations}
            | 'Group StateIncarcerationPeriods to '
              'StateSupervisionViolationResponses' >>
            beam.CoGroupByKey()
        )

        # Set the fully hydrated StateSupervisionViolationResponse entities on
        # the corresponding StateIncarcerationPeriods
        incarceration_periods_with_source_violations = (
            incarceration_periods_and_violation_responses
            | 'Set hydrated StateSupervisionViolationResponses on '
              'the StateIncarcerationPeriods' >>
            beam.ParDo(SetViolationResponseOnIncarcerationPeriod()))

        # Group each StatePerson with their StateIncarcerationPeriods
        person_and_incarceration_periods = (
            {'person': persons,
             'incarceration_periods': incarceration_periods_with_source_violations}
            | 'Group StatePerson to StateIncarcerationPeriods' >>
            beam.CoGroupByKey()
        )

        # Bring in the table that associates people and their county of residence
        person_id_to_county_kv = (
            p | 'Load person_id_to_county_kv' >> ImportTableAsKVTuples(
                dataset_id=reference_dataset,
                table_id=PERSONS_TO_RECENT_COUNTY_OF_RESIDENCE_VIEW_NAME,
                table_key='person_id',
                state_code_filter=state_code,
                person_id_filter_set=person_id_filter_set
            ))

        # Static population counts; deliberately not filtered by person.
        state_race_ethnicity_population_counts = (
            p | 'Load state_race_ethnicity_population_counts' >> ImportTable(
                dataset_id=static_reference_dataset,
                table_id='state_race_ethnicity_population_counts',
                state_code_filter=state_code,
                person_id_filter_set=None
            ))

        # Identify ReleaseEvents events from the StatePerson's
        # StateIncarcerationPeriods; the county table is a dict side input.
        person_release_events = (
            person_and_incarceration_periods
            | "ClassifyReleaseEvents" >>
            beam.ParDo(ClassifyReleaseEvents(), AsDict(person_id_to_county_kv))
        )

        person_metadata = (
            persons
            | "Build the person_metadata dictionary" >>
            beam.ParDo(BuildPersonMetadata(),
                       AsList(state_race_ethnicity_population_counts)))

        person_release_events_with_metadata = (
            {
                'person_events': person_release_events,
                'person_metadata': person_metadata
            }
            | 'Group ReleaseEvents with person-level metadata' >> beam.CoGroupByKey()
            | 'Organize StatePerson, PersonMetadata and ReleaseEvents for calculations' >>
            beam.ParDo(ExtractPersonReleaseEventsMetadata())
        )

        # Get pipeline job details for accessing job_id
        all_pipeline_options = apache_beam_pipeline_options.get_all_options()

        # Add timestamp for local jobs
        job_timestamp = datetime.datetime.now().strftime('%Y-%m-%d_%H_%M_%S.%f')
        all_pipeline_options['job_timestamp'] = job_timestamp

        # Get the type of metric to calculate
        metric_types_set = set(metric_types)

        # Get recidivism metrics
        recidivism_metrics = (
            person_release_events_with_metadata
            | 'Get Recidivism Metrics' >>
            GetRecidivismMetrics(
                pipeline_options=all_pipeline_options,
                metric_types=metric_types_set))

        if person_id_filter_set:
            # Leaving the `with` block here still runs the pipeline built so
            # far; only the BigQuery write stages below are never added.
            logging.warning("Non-empty person filter set - returning before writing metrics.")
            return

        # Convert the metrics into a format that's writable to BQ
        writable_metrics = (
            recidivism_metrics
            | 'Convert to dict to be written to BQ' >>
            beam.ParDo(RecidivizMetricWritableDict()).with_outputs(
                ReincarcerationRecidivismMetricType.REINCARCERATION_RATE.value,
                ReincarcerationRecidivismMetricType.REINCARCERATION_COUNT.value
            ))

        # Write the recidivism metrics to the output tables in BigQuery
        rates_table_id = DATAFLOW_METRICS_TO_TABLES.get(ReincarcerationRecidivismRateMetric)
        counts_table_id = DATAFLOW_METRICS_TO_TABLES.get(ReincarcerationRecidivismCountMetric)

        _ = (writable_metrics.REINCARCERATION_RATE
             | f"Write rate metrics to BQ table: {rates_table_id}" >>
             beam.io.WriteToBigQuery(
                 table=rates_table_id,
                 dataset=output,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
                 method=beam.io.WriteToBigQuery.Method.FILE_LOADS
             ))

        _ = (writable_metrics.REINCARCERATION_COUNT
             | f"Write count metrics to BQ table: {counts_table_id}" >>
             beam.io.WriteToBigQuery(
                 table=counts_table_id,
                 dataset=output,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
                 method=beam.io.WriteToBigQuery.Method.FILE_LOADS
             ))
def run(
    apache_beam_pipeline_options: PipelineOptions,
    data_input: str,
    reference_view_input: str,
    static_reference_input: str,
    output: str,
    calculation_month_count: int,
    metric_types: List[str],
    state_code: str,
    calculation_end_month: Optional[str],
    person_filter_ids: Optional[List[int]],
) -> None:
    """Runs the supervision calculation pipeline.

    Builds a Beam pipeline that loads the person-level entities and
    reference tables, hydrates violations and responses, classifies
    supervision time buckets, computes supervision metrics and appends each
    metric type to its BigQuery table in the `output` dataset.

    When `person_filter_ids` is non-empty the metrics are calculated but the
    BigQuery write stages are not added.

    Raises:
        ValueError: If the pipeline options have no project, or if
            `state_code` is None.
    """

    # Workaround to load SQLAlchemy objects at start of pipeline. This is
    # necessary because the BuildRootEntity function tries to access
    # attributes of relationship properties on the SQLAlchemy
    # room_schema_class before they have been loaded. However, if *any*
    # SQLAlchemy objects have been instantiated, then the relationship
    # properties are loaded and their attributes can be successfully accessed.
    _ = schema.StatePerson()

    setup_options = apache_beam_pipeline_options.view_as(SetupOptions)
    setup_options.save_main_session = True

    # Get pipeline job details
    all_pipeline_options = apache_beam_pipeline_options.get_all_options()
    project_id = all_pipeline_options["project"]

    if project_id is None:
        raise ValueError(
            f"No project set in pipeline options: {all_pipeline_options}")

    if state_code is None:
        raise ValueError("No state_code set for pipeline")

    input_dataset = project_id + "." + data_input
    reference_dataset = project_id + "." + reference_view_input
    static_reference_dataset = project_id + "." + static_reference_input

    # An empty or missing id list means "no person filter".
    person_id_filter_set = set(
        person_filter_ids) if person_filter_ids else None

    with beam.Pipeline(options=apache_beam_pipeline_options) as p:

        def load_root_entities(label, entity_class, with_related_entities):
            """Applies a BuildRootEntity read for `entity_class` to `p`."""
            reader = BuildRootEntity(
                dataset=input_dataset,
                root_entity_class=entity_class,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=with_related_entities,
                unifying_id_field_filter_set=person_id_filter_set,
                state_code=state_code,
            )
            return p | label >> reader

        def load_reference_kv(label, view_name):
            """Applies an ImportTableAsKVTuples read keyed on "person_id"."""
            reader = ImportTableAsKVTuples(
                dataset_id=reference_dataset,
                table_id=view_name,
                table_key="person_id",
                state_code_filter=state_code,
                person_id_filter_set=person_id_filter_set,
            )
            return p | label >> reader

        # Root entities, all unified on the StatePerson id field.
        persons = load_root_entities(
            "Load Persons", entities.StatePerson, True)
        incarceration_periods = load_root_entities(
            "Load IncarcerationPeriods",
            entities.StateIncarcerationPeriod, True)
        supervision_violations = load_root_entities(
            "Load SupervisionViolations",
            entities.StateSupervisionViolation, True)

        # TODO(#2769): Don't bring this in as a root entity
        supervision_violation_responses = load_root_entities(
            "Load SupervisionViolationResponses",
            entities.StateSupervisionViolationResponse, True)

        supervision_sentences = load_root_entities(
            "Load SupervisionSentences",
            entities.StateSupervisionSentence, True)
        incarceration_sentences = load_root_entities(
            "Load IncarcerationSentences",
            entities.StateIncarcerationSentence, True)
        supervision_periods = load_root_entities(
            "Load SupervisionPeriods",
            entities.StateSupervisionPeriod, True)
        assessments = load_root_entities(
            "Load Assessments", entities.StateAssessment, False)
        supervision_contacts = load_root_entities(
            "Load StateSupervisionContacts",
            entities.StateSupervisionContact, False)

        # Reference views, imported as KV tuples keyed on person_id.
        agent_associations_kv = load_reference_kv(
            "Load supervision_period_to_agent_associations_as_kv",
            SUPERVISION_PERIOD_TO_AGENT_ASSOCIATION_VIEW_NAME)

        # Bring in the judicial districts associated with supervision_periods
        judicial_districts_kv = load_reference_kv(
            "Load sp_to_judicial_district_kv",
            SUPERVISION_PERIOD_JUDICIAL_DISTRICT_ASSOCIATION_VIEW_NAME)

        # Static population counts; deliberately not filtered by person.
        population_counts = (
            p
            | "Load state_race_ethnicity_population_counts" >> ImportTable(
                dataset_id=static_reference_dataset,
                table_id="state_race_ethnicity_population_counts",
                state_code_filter=state_code,
                person_id_filter_set=None,
            ))

        if state_code == "US_MO":
            # Bring in the reference table that includes sentence status
            # ranking information
            mo_status_query = select_all_by_person_query(
                reference_dataset,
                US_MO_SENTENCE_STATUSES_VIEW_NAME,
                state_code,
                person_id_filter_set,
            )
            mo_sentence_statuses = (
                p
                | "Read MO sentence status table from BigQuery"
                >> ReadFromBigQuery(query=mo_status_query))
        else:
            # Non-MO runs still need this input, so feed an empty collection.
            mo_sentence_statuses = (
                p
                | f"Generate empty MO statuses list for non-MO state run: {state_code} "
                >> beam.Create([]))

        mo_status_rankings_kv = (
            mo_sentence_statuses
            | "Convert MO sentence status ranking table to KV tuples"
            >> beam.ParDo(ConvertDictToKVTuple(), "person_id"))

        sentence_inputs = {
            "incarceration_sentences": incarceration_sentences,
            "supervision_sentences": supervision_sentences,
            "sentence_statuses": mo_status_rankings_kv,
        }
        sentences_and_statuses = (
            sentence_inputs
            | "Group sentences to the sentence statuses for that person"
            >> beam.CoGroupByKey())

        # Two tagged outputs: one per sentence type.
        sentences_converted = (
            sentences_and_statuses
            | "Convert to state-specific sentences"
            >> beam.ParDo(ConvertSentencesToStateSpecificType()).with_outputs(
                "incarceration_sentences", "supervision_sentences"))

        # Group StateSupervisionViolationResponses and
        # StateSupervisionViolations by person_id
        violation_inputs = {
            "violations": supervision_violations,
            "violation_responses": supervision_violation_responses,
        }
        violations_and_responses = (
            violation_inputs
            | "Group StateSupervisionViolationResponses to StateSupervisionViolations"
            >> beam.CoGroupByKey())

        # Set the fully hydrated StateSupervisionViolation entities on the
        # corresponding StateSupervisionViolationResponses
        hydrated_violation_responses = (
            violations_and_responses
            | "Set hydrated StateSupervisionViolations on the StateSupervisionViolationResponses"
            >> beam.ParDo(SetViolationOnViolationsResponse()))

        # Group StateIncarcerationPeriods and
        # StateSupervisionViolationResponses by person_id
        period_inputs = {
            "incarceration_periods": incarceration_periods,
            "violation_responses": hydrated_violation_responses,
        }
        periods_and_responses = (
            period_inputs
            | "Group StateIncarcerationPeriods to StateSupervisionViolationResponses"
            >> beam.CoGroupByKey())

        # Set the fully hydrated StateSupervisionViolationResponse entities
        # on the corresponding StateIncarcerationPeriods
        hydrated_incarceration_periods = (
            periods_and_responses
            | "Set hydrated StateSupervisionViolationResponses on the StateIncarcerationPeriods"
            >> beam.ParDo(SetViolationResponseOnIncarcerationPeriod()))

        # Group each StatePerson with their related entities
        entity_inputs = {
            "person": persons,
            "assessments": assessments,
            "incarceration_periods": hydrated_incarceration_periods,
            "supervision_periods": supervision_periods,
            "supervision_sentences": sentences_converted.supervision_sentences,
            "incarceration_sentences": sentences_converted.incarceration_sentences,
            "violation_responses": hydrated_violation_responses,
            "supervision_contacts": supervision_contacts,
            "supervision_period_judicial_district_association": judicial_districts_kv,
            "supervision_period_to_agent_association": agent_associations_kv,
        }
        person_entities = (
            entity_inputs
            | "Group StatePerson to all entities" >> beam.CoGroupByKey())

        # Identify SupervisionTimeBuckets from the StatePerson's
        # StateSupervisionSentences and StateIncarcerationPeriods
        person_time_buckets = (
            person_entities
            | "Get SupervisionTimeBuckets"
            >> beam.ParDo(ClassifySupervisionTimeBuckets()))

        # The population counts are supplied as a list side input.
        person_metadata = (
            persons
            | "Build the person_metadata dictionary"
            >> beam.ParDo(BuildPersonMetadata(), AsList(population_counts)))

        buckets_and_metadata = {
            "person_events": person_time_buckets,
            "person_metadata": person_metadata,
        }
        grouped_buckets = (
            buckets_and_metadata
            | "Group SupervisionTimeBuckets with person-level metadata"
            >> beam.CoGroupByKey())
        time_buckets_with_metadata = (
            grouped_buckets
            | "Organize StatePerson, PersonMetadata and SupervisionTimeBuckets for calculations"
            >> beam.ParDo(ExtractPersonEventsMetadata()))

        # Get pipeline job details for accessing job_id
        all_pipeline_options = apache_beam_pipeline_options.get_all_options()

        # Get the type of metric to calculate
        requested_metric_types = set(metric_types)

        # Add timestamp for local jobs
        all_pipeline_options["job_timestamp"] = (
            datetime.datetime.now().strftime("%Y-%m-%d_%H_%M_%S.%f"))

        # Get supervision metrics
        supervision_metrics = (
            time_buckets_with_metadata
            | "Get Supervision Metrics" >> GetSupervisionMetrics(
                pipeline_options=all_pipeline_options,
                metric_types=requested_metric_types,
                calculation_end_month=calculation_end_month,
                calculation_month_count=calculation_month_count,
            ))

        if person_id_filter_set:
            # Leaving the `with` block here still runs the pipeline built so
            # far; only the BigQuery write stages below are never added.
            logging.warning(
                "Non-empty person filter set - returning before writing metrics."
            )
            return

        # Convert the metrics into a format that's writable to BQ, with one
        # tagged output per supervision metric type.
        writable_metrics = (
            supervision_metrics
            | "Convert to dict to be written to BQ"
            >> beam.ParDo(RecidivizMetricWritableDict()).with_outputs(
                SupervisionMetricType.SUPERVISION_COMPLIANCE.value,
                SupervisionMetricType.SUPERVISION_POPULATION.value,
                SupervisionMetricType.SUPERVISION_REVOCATION.value,
                SupervisionMetricType.SUPERVISION_REVOCATION_ANALYSIS.value,
                SupervisionMetricType.SUPERVISION_START.value,
                SupervisionMetricType.SUPERVISION_SUCCESS.value,
                SupervisionMetricType.SUPERVISION_SUCCESSFUL_SENTENCE_DAYS_SERVED.value,
                SupervisionMetricType.SUPERVISION_TERMINATION.value,
                SupervisionMetricType.SUPERVISION_OUT_OF_STATE_POPULATION.value,
                SupervisionMetricType.SUPERVISION_DOWNGRADE.value,
            ))

        # Resolve every destination table up front so that a missing mapping
        # raises before any write stage is attached.
        table_id_by_metric = {
            metric_class: DATAFLOW_METRICS_TO_TABLES[metric_class]
            for metric_class in (
                SupervisionTerminationMetric,
                SupervisionCaseComplianceMetric,
                SupervisionPopulationMetric,
                SupervisionRevocationMetric,
                SupervisionRevocationAnalysisMetric,
                SupervisionSuccessMetric,
                SuccessfulSupervisionSentenceDaysServedMetric,
                SupervisionStartMetric,
                SupervisionOutOfStatePopulationMetric,
                SupervisionDowngradeMetric,
            )
        }

        # (tagged output attribute, step-label prefix, metric class), listed
        # in the order the write stages are attached.
        write_plan = (
            ("SUPERVISION_POPULATION",
             "Write population metrics to BQ table: ",
             SupervisionPopulationMetric),
            ("SUPERVISION_OUT_OF_STATE_POPULATION",
             "Write out of state population metrics to BQ table: ",
             SupervisionOutOfStatePopulationMetric),
            ("SUPERVISION_REVOCATION",
             "Write revocation metrics to BQ table: ",
             SupervisionRevocationMetric),
            ("SUPERVISION_SUCCESS",
             "Write success metrics to BQ table: ",
             SupervisionSuccessMetric),
            ("SUPERVISION_SUCCESSFUL_SENTENCE_DAYS_SERVED",
             "Write supervision successful sentence length metrics to BQ table: ",
             SuccessfulSupervisionSentenceDaysServedMetric),
            ("SUPERVISION_TERMINATION",
             "Write termination metrics to BQ table: ",
             SupervisionTerminationMetric),
            ("SUPERVISION_REVOCATION_ANALYSIS",
             "Write revocation analyses metrics to BQ table: ",
             SupervisionRevocationAnalysisMetric),
            ("SUPERVISION_COMPLIANCE",
             "Write compliance metrics to BQ table: ",
             SupervisionCaseComplianceMetric),
            ("SUPERVISION_START",
             "Write start metrics to BQ table: ",
             SupervisionStartMetric),
            ("SUPERVISION_DOWNGRADE",
             "Write downgrade metrics to BQ table: ",
             SupervisionDowngradeMetric),
        )

        for output_attr, label_prefix, metric_class in write_plan:
            table_id = table_id_by_metric[metric_class]
            _ = (getattr(writable_metrics, output_attr)
                 | f"{label_prefix}{table_id}" >> WriteAppendToBigQuery(
                     output_table=table_id,
                     output_dataset=output,
                 ))