Example #1
from random import randrange

import apache_beam as beam
from apache_beam.pvalue import AsList, AsSingleton


def create_groups(group_ids, corpus, word, ignore_corpus, ignore_word):
    """Generate groups given the input PCollections."""
    def attach_corpus_fn(group, corpus, ignore):
        selected = None
        len_corpus = len(corpus)
        # Keep sampling until we draw a value other than the ignored one.
        while not selected:
            c = list(corpus[randrange(0, len_corpus)].values())[0]
            if c != ignore:
                selected = c

        yield (group, selected)

    def attach_word_fn(group, words, ignore):
        selected = None
        len_words = len(words)
        while not selected:
            c = list(words[randrange(0, len_words)].values())[0]
            if c != ignore:
                selected = c

        yield group + (selected, )

    return (group_ids
            | 'attach corpus' >> beam.FlatMap(attach_corpus_fn, AsList(corpus),
                                              AsSingleton(ignore_corpus))
            | 'attach word' >> beam.FlatMap(attach_word_fn, AsList(word),
                                            AsSingleton(ignore_word)))
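
A minimal, hypothetical driver for the function above (the PCollection contents are invented for illustration; it reuses the imports just shown). The two ignore inputs must be single-element PCollections because create_groups wraps them with AsSingleton:

with beam.Pipeline() as p:
    group_ids = p | 'groups' >> beam.Create(['g1', 'g2'])
    corpus = p | 'corpus' >> beam.Create([{'f': 'kinglear'}, {'f': 'hamlet'}])
    word = p | 'word' >> beam.Create([{'f': 'brave'}, {'f': 'noble'}])
    ignore_corpus = p | 'ignore corpus' >> beam.Create(['hamlet'])
    ignore_word = p | 'ignore word' >> beam.Create(['noble'])

    # Expect tuples such as ('g1', 'kinglear', 'brave').
    create_groups(group_ids, corpus, word, ignore_corpus,
                  ignore_word) | beam.Map(print)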
Example #2
import logging

import apache_beam as beam
from apache_beam import Pipeline
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.pvalue import AsList


def main(argv=None):
    options = PipelineOptions(argv)
    p = Pipeline(options=options)

    input = p | 'Input' >> beam.Create([1, 2, 3], reshuffle=False)
    output1 = input | 'Output1' >> beam.Map(lambda x, side:
                                            (x, side), AsList(input))
    input | 'Output2' >> beam.Map(
        lambda x, side: logging.info('x: %s, side: %s', x, side),
        AsList(output1))

    p.run()
Example #3
    def test_sdf_with_side_inputs(self):
        with TestPipeline() as p:
            side1 = p | 'Create1' >> Create(['1', '2'])
            side2 = p | 'Create2' >> Create(['3', '4'])
            side3 = p | 'Create3' >> Create(['5'])
            result = (p
                      | 'create_main' >> beam.Create(['a', 'b', 'c'])
                      | beam.ParDo(ExpandStrings(), AsList(side1),
                                   AsList(side2), AsSingleton(side3)))

            expected_result = []
            for c in ['a', 'b', 'c']:
                for i in range(5):
                    expected_result.append(c + ':' + str(i + 1))
            assert_that(result, equal_to(expected_result))
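
The test above assumes an ExpandStrings DoFn defined elsewhere in the suite. A minimal non-splittable stand-in with the same observable behavior (an assumption, not the original splittable DoFn) would be:

class ExpandStrings(beam.DoFn):
    def process(self, element, side1, side2, side3):
        # side1 and side2 arrive as lists, side3 as a single value.
        for side in list(side1) + list(side2) + [side3]:
            yield element + ':' + side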
Example #4
    def test_side_inputs(self):
        class SplitNumbersFn(DoFn):
            def process(self, element):
                if element < 0:
                    yield pvalue.OutputValue('tag_negative', element)
                else:
                    yield element

        class ProcessNumbersFn(DoFn):
            def process(self, element, negatives):
                yield element

        class DummySource(iobase.BoundedSource):
            pass

        root_read = Read(DummySource())

        result = (self.pipeline
                  | 'read' >> root_read
                  | ParDo(SplitNumbersFn()).with_outputs('tag_negative',
                                                         main='positive'))
        positive, negative = result
        positive | ParDo(ProcessNumbersFn(), AsList(negative))

        self.pipeline.visit(self.visitor)

        root_transforms = sorted(
            [t.transform for t in self.visitor.root_transforms])
        self.assertEqual(root_transforms, sorted([root_read]))
        self.assertEqual(len(self.visitor.step_names), 3)
        self.assertEqual(len(self.visitor.views), 1)
        self.assertTrue(isinstance(self.visitor.views[0], pvalue.AsList))
Example #5
    def test_side_inputs(self):
        class SplitNumbersFn(NewDoFn):
            def process(self, element):
                if element < 0:
                    yield pvalue.SideOutputValue('tag_negative', element)
                else:
                    yield element

        class ProcessNumbersFn(NewDoFn):
            def process(self, element, negatives):
                yield element

        root_create = Create('create', [[-1, 2, 3]])

        result = (self.pipeline
                  | root_create
                  | ParDo(SplitNumbersFn()).with_outputs('tag_negative',
                                                         main='positive'))
        positive, negative = result
        positive | ParDo(ProcessNumbersFn(), AsList(negative))

        self.pipeline.visit(self.visitor)

        root_transforms = sorted(
            [t.transform for t in self.visitor.root_transforms])
        self.assertEqual(root_transforms, sorted([root_create]))
        self.assertEqual(len(self.visitor.step_names), 4)
        self.assertEqual(len(self.visitor.views), 1)
        self.assertTrue(
            isinstance(self.visitor.views[0], pvalue.ListPCollectionView))
Example #6
    def test_side_inputs(self):
        class SplitNumbersFn(DoFn):
            def process(self, element):
                if element < 0:
                    yield pvalue.TaggedOutput('tag_negative', element)
                else:
                    yield element

        class ProcessNumbersFn(DoFn):
            def process(self, element, negatives):
                yield element

        root_read = beam.Impulse()

        result = (self.pipeline
                  | 'read' >> root_read
                  | ParDo(SplitNumbersFn()).with_outputs('tag_negative',
                                                         main='positive'))
        positive, negative = result
        positive | ParDo(ProcessNumbersFn(), AsList(negative))

        self.pipeline.visit(self.visitor)

        root_transforms = [t.transform for t in self.visitor.root_transforms]
        self.assertEqual(root_transforms, [root_read])
        self.assertEqual(len(self.visitor.step_names), 3)
        self.assertEqual(len(self.visitor.views), 1)
        self.assertTrue(isinstance(self.visitor.views[0], pvalue.AsList))
Example #7
    def test_pcollectionview_not_recreated(self):
        pipeline = Pipeline('DirectRunner')
        value = pipeline | 'create1' >> Create([1, 2, 3])
        value2 = pipeline | 'create2' >> Create([(1, 1), (2, 2), (3, 3)])
        value3 = pipeline | 'create3' >> Create([(1, 1), (2, 2), (3, 3)])
        self.assertEqual(AsSingleton(value), AsSingleton(value))
        self.assertEqual(AsSingleton('new', value, default_value=1),
                         AsSingleton('new', value, default_value=1))
        self.assertNotEqual(AsSingleton(value),
                            AsSingleton('new', value, default_value=1))
        self.assertEqual(AsIter(value), AsIter(value))
        self.assertEqual(AsList(value), AsList(value))
        self.assertEqual(AsDict(value2), AsDict(value2))

        self.assertNotEqual(AsSingleton(value), AsSingleton(value2))
        self.assertNotEqual(AsIter(value), AsIter(value2))
        self.assertNotEqual(AsList(value), AsList(value2))
        self.assertNotEqual(AsDict(value2), AsDict(value3))
Example #8
    def test_pardo_unfusable_side_inputs(self):
        def cross_product(elem, sides):
            for side in sides:
                yield elem, side

        with self.create_pipeline() as p:
            pcoll = p | beam.Create(['a', 'b'])
            assert_that(
                pcoll | beam.FlatMap(cross_product, AsList(pcoll)),
                equal_to([('a', 'a'), ('a', 'b'), ('b', 'a'), ('b', 'b')]))

        with self.create_pipeline() as p:
            pcoll = p | beam.Create(['a', 'b'])
            derived = ((pcoll, ) | beam.Flatten()
                       | beam.Map(lambda x: (x, x))
                       | beam.GroupByKey()
                       | 'Unkey' >> beam.Map(lambda kv: kv[0]))
            assert_that(
                pcoll | beam.FlatMap(cross_product, AsList(derived)),
                equal_to([('a', 'a'), ('a', 'b'), ('b', 'a'), ('b', 'b')]))
Example #9
    def test_pardo_side_inputs(self):
        def cross_product(elem, sides):
            for side in sides:
                yield elem, side

        with self.create_pipeline() as p:
            main = p | 'main' >> beam.Create(['a', 'b', 'c'])
            side = p | 'side' >> beam.Create(['x', 'y'])
            assert_that(
                main | beam.FlatMap(cross_product, AsList(side)),
                equal_to([('a', 'x'), ('b', 'x'), ('c', 'x'), ('a', 'y'),
                          ('b', 'y'), ('c', 'y')]))
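
A companion sketch (not from the source) showing the same pattern with AsDict, which materializes a PCollection of key/value pairs as a dict side input; it assumes `from apache_beam.pvalue import AsDict`:

        with self.create_pipeline() as p:
            main = p | 'main' >> beam.Create(['a', 'b'])
            side = p | 'side' >> beam.Create([('a', 1), ('b', 2)])
            assert_that(
                main | beam.Map(lambda x, d: (x, d[x]), AsDict(side)),
                equal_to([('a', 1), ('b', 2)]))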
Example #10
def run(argv=None):
    parser = argparse.ArgumentParser()

    parser.add_argument('--input_file',
                        required=True,
                        help='Input file as a GCS bucket URL')

    parser.add_argument('--input_topic',
                        required=True,
                        help='Input PubSub topic of the form '
                             'projects/<PROJECT>/topics/<TOPIC>')

    known_args, pipeline_args = parser.parse_known_args(argv)

    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    pipeline_options.view_as(StandardOptions).streaming = True

    p = beam.Pipeline(options=pipeline_options)

    live_sales = (p | 'Read From PubSub' >> beam.io.ReadFromPubSub(
        topic=known_args.input_topic).with_output_types(bytes)
                  | 'decode' >> beam.Map(lambda x: x.decode('utf-8')))

    historical_sales = (p | beam.io.ReadFromText(known_args.input_file,
                                                 skip_header_lines=1))

    order_stats = (historical_sales | 'Split & Collect order_size' >>
                   (beam.ParDo(CollectOrderSize()))
                   | 'Calculate Mean & Standard Deviation' >>
                   beam.CombineGlobally(StandardDeviation()))

    current_asp = (
        live_sales
        | 'Get Current Order size' >> (beam.ParDo(CollectOrderTuple()))
        | 'Calculate Z-score' >> beam.Map(calculate_zscore, AsList(order_stats))
        | 'Write to Bigquery' >> beam.io.WriteToBigQuery(
            'spikey-gcp:spikey_orders.order_zscores', schema=spikey_schema))

    result = p.run()
    result.wait_until_finish()
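
The pipeline above assumes CollectOrderSize, CollectOrderTuple, StandardDeviation, spikey_schema, and calculate_zscore are defined elsewhere. A hypothetical shape for calculate_zscore, inferred only from its call site (order_stats arrives as a one-element list holding the global combine result):

def calculate_zscore(order, order_stats):
    # order_stats is a single-element list: the global (mean, stddev) result.
    mean, stddev = order_stats[0]
    order_id, order_size = order
    zscore = (order_size - mean) / stddev if stddev else 0.0
    return {'order_id': order_id, 'order_size': order_size, 'zscore': zscore}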
Example #11
    def test_side_inputs(self):
        class SplitNumbersFn(DoFn):
            def process(self, element):
                if element < 0:
                    yield pvalue.TaggedOutput('tag_negative', element)
                else:
                    yield element

        class ProcessNumbersFn(DoFn):
            def process(self, element, negatives):
                yield element

        def _process_numbers(pcoll, negatives):
            first_output = (pcoll
                            | 'process numbers step 1' >> ParDo(
                                ProcessNumbersFn(), negatives))

            second_output = (first_output
                             | 'process numbers step 2' >> ParDo(
                                 ProcessNumbersFn(), negatives))

            output_pc = ((first_output, second_output)
                         | 'flatten results' >> beam.Flatten())
            return output_pc

        root_read = beam.Impulse()

        result = (self.pipeline
                  | 'read' >> root_read
                  | ParDo(SplitNumbersFn()).with_outputs('tag_negative',
                                                         main='positive'))
        positive, negative = result
        _process_numbers(positive, AsList(negative))

        self.pipeline.visit(self.visitor)

        root_transforms = [t.transform for t in self.visitor.root_transforms]
        self.assertEqual(root_transforms, [root_read])
        self.assertEqual(len(self.visitor.step_names), 5)
        self.assertEqual(len(self.visitor.views), 1)
        self.assertTrue(isinstance(self.visitor.views[0], pvalue.AsList))
Example #12
    # This function computes the schema of the Parquet file to write,
    # applying the column renames to the original schema
    def getSchema():
        df_schema = pyarrow.Schema.from_pandas(
            pd.read_parquet(user_options.schema_source.get()))
        for (key, value) in ast.literal_eval(
                user_options.rename_columns.get()).items():
            df_schema = df_schema.set(
                df_schema.get_field_index(key),
                pyarrow.field(value,
                              df_schema.types[df_schema.get_field_index(key)]))
        return df_schema

    # Read the source Parquet files and build the dictionary mapping the columns to rename
    map_rename_cols = (
        p | "Read for rename cols" >> ReadFromParquet(user_options.url_raw)
        | "Map rename cols" >> beam.Map(mapRenameCols)
        | "Rename cols to string" >> beam.Map(str)
        | "Deduplicate elements" >> beam.Distinct())
    # Read the data from the source files
    data = (p
            | "Read parquet for data" >> ReadFromParquet(user_options.url_raw))
    # Apply the column-renaming function, receiving the result of the previous step as the rename mapping
    rename_data = (data | "Rename columns" >> beam.Map(
        reColumns, rename_cols=AsList(map_rename_cols)))
    # Write the data to the destination path, obtaining the schema from getSchema
    _ = (rename_data | "Write to storage TRN" >> WriteToParquet(
        user_options.url_trn, schema=getSchema(), file_name_suffix=".parquet"))

print("End Pipeline")
Example #13
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--dataset',
        default='musicbrainz',
        help='BigQuery dataset name'
    )
    parser.add_argument(
        '--table',
        default='recordings_by_artists_dataflow',
        help='BigQuery table'
    )
    args, argv = parser.parse_known_args()

    pipeline_options = PipelineOptions(argv)
    pipeline_options.view_as(StandardOptions).runner = 'DataflowRunner'
    gcp_options = pipeline_options.view_as(GoogleCloudOptions)
    if not gcp_options.job_name:
        gcp_options.job_name = 'music-job'
    worker_options = pipeline_options.view_as(WorkerOptions)
    if not worker_options.use_public_ips:
        worker_options.use_public_ips = False

    table_spec = bigquery.TableReference(projectId=gcp_options.project,
                                         datasetId=args.dataset,
                                         tableId=args.table)
    table_schema = {
        'fields': [
            {'name': 'id', 'mode': 'NULLABLE', 'type': 'INTEGER'},
            {'name': 'artist_gid', 'mode': 'NULLABLE', 'type': 'STRING'},
            {'name': 'artist_name', 'mode': 'NULLABLE', 'type': 'STRING'},
            {'name': 'area', 'mode': 'NULLABLE', 'type': 'STRING'},
            {'name': 'gender', 'mode': 'NULLABLE', 'type': 'STRING'},
            {'name': 'artist_credit', 'mode': 'NULLABLE', 'type': 'INTEGER'},
            {'name': 'recording_name', 'mode': 'NULLABLE', 'type': 'STRING'},
            {'name': 'length', 'mode': 'NULLABLE', 'type': 'INTEGER'},
            {'name': 'recording_gid', 'mode': 'NULLABLE', 'type': 'STRING'},
            {'name': 'video', 'mode': 'NULLABLE', 'type': 'BOOLEAN'},
        ]
    }

    with beam.Pipeline(options=pipeline_options) as pipeline:
        gender = pipeline | \
                 'Read gender' >> beam.io.ReadFromText('gs://solutions-public-assets/bqetl/gender.json') | \
                 'Process gender' >> beam.Map(process_gender_or_area)

        area = pipeline | \
               'Read area' >> beam.io.ReadFromText('gs://solutions-public-assets/bqetl/area.json') | \
               'Process area' >> beam.Map(process_gender_or_area)

        artists = pipeline | \
                  'Read Artists' >> beam.io.ReadFromText('gs://solutions-public-assets/bqetl/artist.json') | \
                  'Convert artist from json to dict' >> beam.Map(lambda e: json.loads(e)) | \
                  'Process artists' >> beam.Map(process_artists, AsList(gender), AsList(area))

        recordings = pipeline | \
                     'Read Recordings' >> beam.io.ReadFromText('gs://solutions-public-assets/bqetl/recording.json') | \
                     'Process recording' >> beam.Map(process_recording)

        artist_credit_name = pipeline | \
                             'Read Artists Credit Name' >> beam.io.ReadFromText('gs://solutions-public-assets/bqetl/artist_credit_name.json') | \
                             'Process artist credit name' >> beam.Map(process_artist_credit)

        # Joining artist and artist_credit_name
        # SELECT artist.id,
        #   artist.gid as artist_gid,
        #   artist.name as artist_name,
        #   artist.area,
        #   artist_credit_name.artist_credit
        # FROM datafusion-dataproc-tutorial.musicbrainz.artist as artist
        #  INNER JOIN datafusion-dataproc-tutorial.musicbrainz.artist_credit_name AS artist_credit_name
        #       ON artist.id = artist_credit_name.artist
        #
        joined_artist_and_artist_credit_name = ({
            'artists': artists,
            'artist_credit_name': artist_credit_name}) | \
            'Merge artist and artist_credit_name to intermitent' >> beam.CoGroupByKey() | \
            'UnSetCoGroup intermitent' >> beam.ParDo(UnSetCoGroup(),
                                                     'artists',
                                                     'artist_credit_name',
                                                     'artist') | \
            'Map artist_credit to dict element' >> beam.Map(lambda e: (e['artist_credit'], e))

        # Joining previous table with recordings
        # SELECT intermitent.id,
        #   intermitent.artist_gid,
        #   intermitent.artist_name,
        #   intermitent.area,
        #   intermitent.artist_credit,
        #   recording.recording_name,
        #   recording.length,
        #   recording.video
        # FROM datafusion-dataproc-tutorial.musicbrainz.intermitents as intermitent
        #  INNER JOIN datafusion-dataproc-tutorial.musicbrainz.recording AS recording
        #       ON intermitent.artist_credit = recording.artist_credit
        #
        joined_artist_and_artist_credit_name_and_recording = ({
            'joined_artist_and_artist_credit_name': joined_artist_and_artist_credit_name,
            'recordings': recordings}) | \
            'Merge intermitent and recording' >> beam.CoGroupByKey() | \
            'UnSetCoGroup final' >> beam.ParDo(UnSetCoGroup(),
                                               'joined_artist_and_artist_credit_name',
                                               'recordings',
                                               'artist_credit') | \
            'Write To BQ' >> beam.io.WriteToBigQuery(table_spec,
                                                     schema=table_schema,
                                                     write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE,
                                                     create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED)

    logging.getLogger().setLevel(logging.INFO)
Example #14
    def run_test_pipeline(
            fake_person_id: int,
            state_code: str,
            dataset: str,
            expected_metric_types: Set[IncarcerationMetricType],
            allow_empty: bool = False,
            unifying_id_field_filter_set: Optional[Set[int]] = None,
            metric_types_filter: Optional[Set[str]] = None):
        """Runs a test version of the incarceration pipeline."""
        test_pipeline = TestPipeline()

        # Get StatePersons
        persons = (
            test_pipeline
            | 'Load Persons' >>  # type: ignore
            extractor_utils.BuildRootEntity(
                dataset=dataset,
                root_entity_class=entities.StatePerson,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=True))

        # Get StateSentenceGroups
        sentence_groups = (
            test_pipeline
            | 'Load StateSentenceGroups' >>  # type: ignore
            extractor_utils.BuildRootEntity(
                dataset=dataset,
                root_entity_class=entities.StateSentenceGroup,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=True,
                unifying_id_field_filter_set=unifying_id_field_filter_set))

        # Get StateIncarcerationSentences
        incarceration_sentences = (
            test_pipeline
            | 'Load StateIncarcerationSentences' >>  # type: ignore
            extractor_utils.BuildRootEntity(
                dataset=dataset,
                root_entity_class=entities.StateIncarcerationSentence,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=True,
                unifying_id_field_filter_set=unifying_id_field_filter_set))

        # Get StateSupervisionSentences
        supervision_sentences = (
            test_pipeline | 'Load StateSupervisionSentences' >>  # type: ignore
            extractor_utils.BuildRootEntity(
                dataset=dataset,
                root_entity_class=entities.StateSupervisionSentence,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=True,
                unifying_id_field_filter_set=unifying_id_field_filter_set))

        us_mo_sentence_status_rows: List[Dict[str, Any]] = [{
            'person_id': fake_person_id,
            'sentence_external_id': 'XXX',
            'sentence_status_external_id': 'YYY',
            'status_code': 'ZZZ',
            'status_date': 'not_a_date',
            'status_description': 'XYZ'
        }]

        us_mo_sentence_statuses = (test_pipeline
                                   | 'Create MO sentence statuses' >>
                                   beam.Create(us_mo_sentence_status_rows))

        us_mo_sentence_status_rankings_as_kv = (
            us_mo_sentence_statuses
            | 'Convert sentence status ranking table to KV tuples' >>
            beam.ParDo(ConvertDictToKVTuple(), 'person_id'))

        sentences_and_statuses = (
            {
                'incarceration_sentences': incarceration_sentences,
                'supervision_sentences': supervision_sentences,
                'sentence_statuses': us_mo_sentence_status_rankings_as_kv
            }
            | 'Group sentences to the sentence statuses for that person' >>
            beam.CoGroupByKey())

        sentences_converted = (
            sentences_and_statuses
            | 'Convert to state-specific sentences' >> beam.ParDo(
                ConvertSentencesToStateSpecificType()).with_outputs(
                    'incarceration_sentences', 'supervision_sentences'))

        sentences_and_sentence_groups = (
            {
                'sentence_groups': sentence_groups,
                'incarceration_sentences':
                sentences_converted.incarceration_sentences,
                'supervision_sentences':
                sentences_converted.supervision_sentences
            }
            | 'Group sentences to sentence groups' >> beam.CoGroupByKey())

        sentence_groups_with_hydrated_sentences = (
            sentences_and_sentence_groups
            | 'Set hydrated sentences on sentence groups' >> beam.ParDo(
                SetSentencesOnSentenceGroup()))

        # Set up a fake person_id-to-county-of-residence mapping
        fake_person_id_to_county_query_result = [{
            'person_id': fake_person_id,
            'county_of_residence': _COUNTY_OF_RESIDENCE
        }]
        person_id_to_county_kv = (
            test_pipeline
            | "Read person id to county associations from BigQuery" >>
            beam.Create(fake_person_id_to_county_query_result)
            | "Convert person_id to counties to KV" >> beam.ParDo(
                ConvertDictToKVTuple(), 'person_id'))

        incarceration_period_judicial_district_association_row = {
            'person_id': fake_person_id,
            'incarceration_period_id': 123,
            'judicial_district_code': 'NW'
        }

        ip_to_judicial_district_kv = (
            test_pipeline
            |
            "Read incarceration_period to judicial_district associations from BigQuery"
            >> beam.Create(
                [incarceration_period_judicial_district_association_row])
            | "Convert ips to judicial districts to KV" >> beam.ParDo(
                ConvertDictToKVTuple(), 'person_id'))

        state_race_ethnicity_population_count = {
            'state_code': state_code,
            'race_or_ethnicity': 'BLACK',
            'population_count': 1,
            'representation_priority': 1
        }

        state_race_ethnicity_population_counts = (
            test_pipeline
            | 'Create state_race_ethnicity_population_count table' >>
            beam.Create([state_race_ethnicity_population_count]))

        # Group each StatePerson with their related entities
        person_entities = (
            {
                'person':
                persons,
                'sentence_groups':
                sentence_groups_with_hydrated_sentences,
                'incarceration_period_judicial_district_association':
                ip_to_judicial_district_kv
            }
            | 'Group StatePerson to SentenceGroups' >> beam.CoGroupByKey())

        # Identify IncarcerationEvents from the StatePerson's StateIncarcerationPeriods
        person_incarceration_events = (
            person_entities | 'Classify Incarceration Events' >> beam.ParDo(
                pipeline.ClassifyIncarcerationEvents(),
                AsDict(person_id_to_county_kv)))

        person_metadata = (
            persons
            | "Build the person_metadata dictionary" >> beam.ParDo(
                BuildPersonMetadata(),
                AsList(state_race_ethnicity_population_counts)))

        person_incarceration_events_with_metadata = (
            {
                'person_events': person_incarceration_events,
                'person_metadata': person_metadata
            }
            | 'Group IncarcerationEvents with person-level metadata' >>
            beam.CoGroupByKey()
            |
            'Organize StatePerson, PersonMetadata and IncarcerationEvents for calculations'
            >> beam.ParDo(ExtractPersonEventsMetadata()))

        # Get pipeline job details for accessing job_id
        all_pipeline_options = PipelineOptions().get_all_options()

        # Add timestamp for local jobs
        job_timestamp = datetime.datetime.now().strftime(
            '%Y-%m-%d_%H_%M_%S.%f')
        all_pipeline_options['job_timestamp'] = job_timestamp

        metric_types = metric_types_filter if metric_types_filter else {'ALL'}

        # Get IncarcerationMetrics
        incarceration_metrics = (
            person_incarceration_events_with_metadata
            | 'Get Incarceration Metrics' >>  # type: ignore
            pipeline.GetIncarcerationMetrics(
                pipeline_options=all_pipeline_options,
                metric_types=metric_types,
                calculation_end_month=None,
                calculation_month_count=-1))

        assert_that(
            incarceration_metrics,
            AssertMatchers.validate_metric_type(allow_empty=allow_empty),
            'Assert that all metrics are of the expected type.')

        assert_that(
            incarceration_metrics,
            AssertMatchers.validate_pipeline_test(expected_metric_types),
            'Assert the type of metrics produced are expected')

        test_pipeline.run()
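
ConvertDictToKVTuple appears throughout these pipelines ahead of beam.CoGroupByKey. A plausible implementation, keying each row dict by the named field (an assumption, not necessarily the project's actual DoFn):

class ConvertDictToKVTuple(beam.DoFn):
    def process(self, element, key_field):
        # Emit (key, row) so downstream CoGroupByKey joins can use the field.
        yield (element[key_field], element)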
Example #15
    def execute_pipeline(
        self,
        pipeline: beam.Pipeline,
        all_pipeline_options: Dict[str, Any],
        state_code: str,
        input_dataset: str,
        reference_dataset: str,
        static_reference_dataset: str,
        metric_types: List[str],
        person_id_filter_set: Optional[Set[int]],
        calculation_month_count: int = -1,
        calculation_end_month: Optional[str] = None,
    ) -> beam.Pipeline:
        persons = pipeline | "Load Persons" >> BuildRootEntity(
            dataset=input_dataset,
            root_entity_class=entities.StatePerson,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=True,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code,
        )

        # Get StateIncarcerationPeriods
        incarceration_periods = (
            pipeline
            | "Load IncarcerationPeriods" >> BuildRootEntity(
                dataset=input_dataset,
                root_entity_class=entities.StateIncarcerationPeriod,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=False,
                unifying_id_field_filter_set=person_id_filter_set,
                state_code=state_code,
            ))

        # Get StateSupervisionPeriods
        supervision_periods = pipeline | "Load SupervisionPeriods" >> BuildRootEntity(
            dataset=input_dataset,
            root_entity_class=entities.StateSupervisionPeriod,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=False,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code,
        )

        # Bring in the table that associates people and their county of residence
        person_id_to_county_kv = (
            pipeline
            | "Load person_id_to_county_kv" >> ImportTableAsKVTuples(
                dataset_id=reference_dataset,
                table_id=PERSONS_TO_RECENT_COUNTY_OF_RESIDENCE_VIEW_NAME,
                table_key="person_id",
                state_code_filter=state_code,
                person_id_filter_set=person_id_filter_set,
            ))

        # Group each StatePerson with their StateIncarcerationPeriods
        person_entities = {
            "person": persons,
            "incarceration_periods": incarceration_periods,
            "supervision_periods": supervision_periods,
            "persons_to_recent_county_of_residence": person_id_to_county_kv,
        } | "Group StatePerson to StateIncarcerationPeriods" >> beam.CoGroupByKey(
        )

        state_race_ethnicity_population_counts = (
            pipeline
            | "Load state_race_ethnicity_population_counts" >> ImportTable(
                dataset_id=static_reference_dataset,
                table_id="state_race_ethnicity_population_counts",
                state_code_filter=state_code,
                person_id_filter_set=None,
            ))

        # Identify ReleaseEvents from the StatePerson's StateIncarcerationPeriods
        person_release_events = person_entities | "ClassifyReleaseEvents" >> beam.ParDo(
            ClassifyEvents(), identifier=self.pipeline_config.identifier)

        person_metadata = (
            persons
            | "Build the person_metadata dictionary" >> beam.ParDo(
                BuildPersonMetadata(),
                state_race_ethnicity_population_counts=AsList(
                    state_race_ethnicity_population_counts),
            ))

        person_release_events_with_metadata = (
            {
                "person_events": person_release_events,
                "person_metadata": person_metadata
            }
            | "Group ReleaseEvents with person-level metadata" >>
            beam.CoGroupByKey()
            |
            "Organize StatePerson, PersonMetadata and ReleaseEvents for calculations"
            >> beam.ParDo(ExtractPersonReleaseEventsMetadata()))

        # Add timestamp for local jobs
        job_timestamp = datetime.datetime.now().strftime(
            "%Y-%m-%d_%H_%M_%S.%f")
        all_pipeline_options["job_timestamp"] = job_timestamp

        # Get the type of metric to calculate
        metric_types_set = set(metric_types)

        # Get recidivism metrics
        recidivism_metrics = (
            person_release_events_with_metadata
            | "Get Recidivism Metrics" >> GetMetrics(
                pipeline_options=all_pipeline_options,
                pipeline_config=self.pipeline_config,
                metric_types_to_include=metric_types_set,
                calculation_end_month=calculation_end_month,
                calculation_month_count=calculation_month_count,
            ))
        return recidivism_metrics
Example #16
def run(
    apache_beam_pipeline_options: PipelineOptions,
    data_input: str,
    reference_view_input: str,
    static_reference_input: str,
    output: str,
    metric_types: List[str],
    state_code: str,
    person_filter_ids: Optional[List[int]],
):
    """Runs the recidivism calculation pipeline."""

    # Workaround to load SQLAlchemy objects at start of pipeline. This is
    # necessary because the BuildRootEntity function tries to access attributes
    # of relationship properties on the SQLAlchemy room_schema_class before they
    # have been loaded. However, if *any* SQLAlchemy objects have been
    # instantiated, then the relationship properties are loaded and their
    # attributes can be successfully accessed.
    _ = schema.StatePerson()

    apache_beam_pipeline_options.view_as(SetupOptions).save_main_session = True

    # Get pipeline job details
    all_pipeline_options = apache_beam_pipeline_options.get_all_options()
    project_id = all_pipeline_options["project"]

    if project_id is None:
        raise ValueError(f"No project set in pipeline options: {all_pipeline_options}")

    if state_code is None:
        raise ValueError("No state_code set for pipeline")

    input_dataset = project_id + "." + data_input
    reference_dataset = project_id + "." + reference_view_input
    static_reference_dataset = project_id + "." + static_reference_input

    person_id_filter_set = set(person_filter_ids) if person_filter_ids else None

    with beam.Pipeline(options=apache_beam_pipeline_options) as p:
        # Get StatePersons
        persons = p | "Load Persons" >> BuildRootEntity(
            dataset=input_dataset,
            root_entity_class=entities.StatePerson,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=True,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code,
        )

        # Get StateIncarcerationPeriods
        incarceration_periods = p | "Load IncarcerationPeriods" >> BuildRootEntity(
            dataset=input_dataset,
            root_entity_class=entities.StateIncarcerationPeriod,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=True,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code,
        )

        # Get StateSupervisionViolations
        supervision_violations = p | "Load SupervisionViolations" >> BuildRootEntity(
            dataset=input_dataset,
            root_entity_class=entities.StateSupervisionViolation,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=True,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code,
        )

        # TODO(#2769): Don't bring this in as a root entity
        # Get StateSupervisionViolationResponses
        supervision_violation_responses = (
            p
            | "Load SupervisionViolationResponses"
            >> BuildRootEntity(
                dataset=input_dataset,
                root_entity_class=entities.StateSupervisionViolationResponse,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=True,
                unifying_id_field_filter_set=person_id_filter_set,
                state_code=state_code,
            )
        )

        # Group StateSupervisionViolationResponses and
        # StateSupervisionViolations by person_id
        supervision_violations_and_responses = (
            {
                "violations": supervision_violations,
                "violation_responses": supervision_violation_responses,
            }
            | "Group StateSupervisionViolationResponses to "
            "StateSupervisionViolations" >> beam.CoGroupByKey()
        )

        # Set the fully hydrated StateSupervisionViolation entities on
        # the corresponding StateSupervisionViolationResponses
        violation_responses_with_hydrated_violations = (
            supervision_violations_and_responses
            | "Set hydrated StateSupervisionViolations on "
            "the StateSupervisionViolationResponses"
            >> beam.ParDo(SetViolationOnViolationsResponse())
        )

        # Group StateIncarcerationPeriods and StateSupervisionViolationResponses
        # by person_id
        incarceration_periods_and_violation_responses = (
            {
                "incarceration_periods": incarceration_periods,
                "violation_responses": violation_responses_with_hydrated_violations,
            }
            | "Group StateIncarcerationPeriods to "
            "StateSupervisionViolationResponses" >> beam.CoGroupByKey()
        )

        # Set the fully hydrated StateSupervisionViolationResponse entities on
        # the corresponding StateIncarcerationPeriods
        incarceration_periods_with_source_violations = (
            incarceration_periods_and_violation_responses
            | "Set hydrated StateSupervisionViolationResponses on "
            "the StateIncarcerationPeriods"
            >> beam.ParDo(SetViolationResponseOnIncarcerationPeriod())
        )

        # Bring in the table that associates people and their county of residence
        person_id_to_county_kv = (
            p
            | "Load person_id_to_county_kv"
            >> ImportTableAsKVTuples(
                dataset_id=reference_dataset,
                table_id=PERSONS_TO_RECENT_COUNTY_OF_RESIDENCE_VIEW_NAME,
                table_key="person_id",
                state_code_filter=state_code,
                person_id_filter_set=person_id_filter_set,
            )
        )

        # Group each StatePerson with their StateIncarcerationPeriods
        person_entities = {
            "person": persons,
            "incarceration_periods": incarceration_periods_with_source_violations,
            "persons_to_recent_county_of_residence": person_id_to_county_kv,
        } | "Group StatePerson to StateIncarcerationPeriods" >> beam.CoGroupByKey()

        state_race_ethnicity_population_counts = (
            p
            | "Load state_race_ethnicity_population_counts"
            >> ImportTable(
                dataset_id=static_reference_dataset,
                table_id="state_race_ethnicity_population_counts",
                state_code_filter=state_code,
                person_id_filter_set=None,
            )
        )

        # Identify ReleaseEvents from the StatePerson's StateIncarcerationPeriods
        person_release_events = person_entities | "ClassifyReleaseEvents" >> beam.ParDo(
            ClassifyReleaseEvents()
        )

        person_metadata = (
            persons
            | "Build the person_metadata dictionary"
            >> beam.ParDo(
                BuildPersonMetadata(), AsList(state_race_ethnicity_population_counts)
            )
        )

        person_release_events_with_metadata = (
            {"person_events": person_release_events, "person_metadata": person_metadata}
            | "Group ReleaseEvents with person-level metadata" >> beam.CoGroupByKey()
            | "Organize StatePerson, PersonMetadata and ReleaseEvents for calculations"
            >> beam.ParDo(ExtractPersonReleaseEventsMetadata())
        )

        # Get pipeline job details for accessing job_id
        all_pipeline_options = apache_beam_pipeline_options.get_all_options()

        # Add timestamp for local jobs
        job_timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H_%M_%S.%f")
        all_pipeline_options["job_timestamp"] = job_timestamp

        # Get the type of metric to calculate
        metric_types_set = set(metric_types)

        # Get recidivism metrics
        recidivism_metrics = (
            person_release_events_with_metadata
            | "Get Recidivism Metrics"
            >> GetRecidivismMetrics(
                pipeline_options=all_pipeline_options, metric_types=metric_types_set
            )
        )

        if person_id_filter_set:
            logging.warning(
                "Non-empty person filter set - returning before writing metrics."
            )
            return

        # Convert the metrics into a format that's writable to BQ
        writable_metrics = (
            recidivism_metrics
            | "Convert to dict to be written to BQ"
            >> beam.ParDo(RecidivizMetricWritableDict()).with_outputs(
                ReincarcerationRecidivismMetricType.REINCARCERATION_RATE.value,
                ReincarcerationRecidivismMetricType.REINCARCERATION_COUNT.value,
            )
        )

        # Write the recidivism metrics to the output tables in BigQuery
        rates_table_id = DATAFLOW_METRICS_TO_TABLES[ReincarcerationRecidivismRateMetric]
        counts_table_id = DATAFLOW_METRICS_TO_TABLES[
            ReincarcerationRecidivismCountMetric
        ]

        _ = (
            writable_metrics.REINCARCERATION_RATE
            | f"Write rate metrics to BQ table: {rates_table_id}"
            >> WriteAppendToBigQuery(
                output_table=rates_table_id,
                output_dataset=output,
            )
        )

        _ = (
            writable_metrics.REINCARCERATION_COUNT
            | f"Write count metrics to BQ table: {counts_table_id}"
            >> WriteAppendToBigQuery(
                output_table=counts_table_id,
                output_dataset=output,
            )
        )
Example #17
    def execute_pipeline(
        self,
        pipeline: beam.Pipeline,
        all_pipeline_options: Dict[str, Any],
        state_code: str,
        input_dataset: str,
        _reference_dataset: str,
        static_reference_dataset: str,
        metric_types: List[str],
        person_id_filter_set: Optional[Set[int]],
        calculation_month_count: int = -1,
        calculation_end_month: Optional[str] = None,
    ) -> beam.Pipeline:
        # Get StatePersons
        persons = pipeline | "Load Persons" >> BuildRootEntity(
            dataset=input_dataset,
            root_entity_class=entities.StatePerson,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=True,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code,
        )

        # Get StateSupervisionViolations
        supervision_violations = (
            pipeline
            | "Load SupervisionViolations"
            >> BuildRootEntity(
                dataset=input_dataset,
                root_entity_class=entities.StateSupervisionViolation,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=True,
                unifying_id_field_filter_set=person_id_filter_set,
                state_code=state_code,
            )
        )

        # Get StateSupervisionViolationResponses
        supervision_violation_responses = (
            pipeline
            | "Load SupervisionViolationResponses"
            >> BuildRootEntity(
                dataset=input_dataset,
                root_entity_class=entities.StateSupervisionViolationResponse,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=True,
                unifying_id_field_filter_set=person_id_filter_set,
                state_code=state_code,
            )
        )

        state_race_ethnicity_population_counts = (
            pipeline
            | "Load state_race_ethnicity_population_counts"
            >> ImportTable(
                dataset_id=static_reference_dataset,
                table_id="state_race_ethnicity_population_counts",
                state_code_filter=state_code,
                person_id_filter_set=None,
            )
        )

        # Group StateSupervisionViolationResponses and StateSupervisionViolations by person_id
        supervision_violations_and_responses = (
            {
                "violations": supervision_violations,
                "violation_responses": supervision_violation_responses,
            }
            | "Group StateSupervisionViolationResponses to StateSupervisionViolations"
            >> beam.CoGroupByKey()
        )

        violations_with_hydrated_violation_responses = (
            supervision_violations_and_responses
            | "Set hydrated StateSupervisionViolationResponses on the StateSupervisionViolations"
            >> beam.ParDo(SetViolationResponsesOntoViolations())
        )

        person_entities = {
            "person": persons,
            "violations": violations_with_hydrated_violation_responses,
        } | "Group StatePerson to violation entities" >> beam.CoGroupByKey()

        person_violation_events = person_entities | "Get ViolationEvents" >> beam.ParDo(
            ClassifyEvents(), identifier=self.pipeline_config.identifier
        )

        person_metadata = (
            persons
            | "Build the person_metadata dictionary"
            >> beam.ParDo(
                BuildPersonMetadata(),
                state_race_ethnicity_population_counts=AsList(
                    state_race_ethnicity_population_counts
                ),
            )
        )

        person_violation_events_with_metadata = (
            {
                "person_events": person_violation_events,
                "person_metadata": person_metadata,
            }
            | "Group ViolationEvents with person-level metadata" >> beam.CoGroupByKey()
            | "Organize StatePerson, PersonMetadata and ViolationEvents for calculations"
            >> beam.ParDo(ExtractPersonEventsMetadata())
        )

        metric_types_set = set(metric_types)
        job_timestamp = datetime.now().strftime("%Y-%m-%d_%H_%M_%S.%f")
        all_pipeline_options["job_timestamp"] = job_timestamp

        # Get violation metrics
        violation_metrics = (
            person_violation_events_with_metadata
            | "Get Violation Metrics"
            >> GetMetrics(
                pipeline_options=all_pipeline_options,
                pipeline_config=self.pipeline_config,
                metric_types_to_include=metric_types_set,
                calculation_end_month=calculation_end_month,
                calculation_month_count=calculation_month_count,
            )
        )

        return violation_metrics
Example #18
def run(apache_beam_pipeline_options: PipelineOptions, data_input: str,
        reference_view_input: str, static_reference_input: str, output: str,
        calculation_month_count: int, metric_types: List[str],
        state_code: Optional[str], calculation_end_month: Optional[str],
        person_filter_ids: Optional[List[int]]):
    """Runs the incarceration calculation pipeline."""

    # Workaround to load SQLAlchemy objects at start of pipeline. This is necessary because the BuildRootEntity
    # function tries to access attributes of relationship properties on the SQLAlchemy room_schema_class before they
    # have been loaded. However, if *any* SQLAlchemy objects have been instantiated, then the relationship properties
    # are loaded and their attributes can be successfully accessed.
    _ = schema.StatePerson()

    apache_beam_pipeline_options.view_as(SetupOptions).save_main_session = True

    # Get pipeline job details
    all_pipeline_options = apache_beam_pipeline_options.get_all_options()
    project_id = all_pipeline_options['project']

    input_dataset = project_id + '.' + data_input
    reference_dataset = project_id + '.' + reference_view_input
    static_reference_dataset = project_id + '.' + static_reference_input

    person_id_filter_set = set(
        person_filter_ids) if person_filter_ids else None

    with beam.Pipeline(options=apache_beam_pipeline_options) as p:
        # Get StatePersons
        persons = (p | 'Load StatePersons' >> BuildRootEntity(
            dataset=input_dataset,
            root_entity_class=entities.StatePerson,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=True,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code))

        # Get StateSentenceGroups
        sentence_groups = (p | 'Load StateSentenceGroups' >> BuildRootEntity(
            dataset=input_dataset,
            root_entity_class=entities.StateSentenceGroup,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=True,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code))

        # Get StateIncarcerationSentences
        incarceration_sentences = (
            p | 'Load StateIncarcerationSentences' >> BuildRootEntity(
                dataset=input_dataset,
                root_entity_class=entities.StateIncarcerationSentence,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=True,
                unifying_id_field_filter_set=person_id_filter_set,
                state_code=state_code))

        # Get StateSupervisionSentences
        supervision_sentences = (
            p | 'Load StateSupervisionSentences' >> BuildRootEntity(
                dataset=input_dataset,
                root_entity_class=entities.StateSupervisionSentence,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=True,
                unifying_id_field_filter_set=person_id_filter_set,
                state_code=state_code))

        if state_code is None or state_code == 'US_MO':
            # Bring in the reference table that includes sentence status ranking information
            us_mo_sentence_status_query = select_all_by_person_query(
                reference_dataset, US_MO_SENTENCE_STATUSES_VIEW_NAME,
                state_code, person_id_filter_set)

            us_mo_sentence_statuses = (
                p |
                "Read MO sentence status table from BigQuery" >> beam.io.Read(
                    beam.io.BigQuerySource(query=us_mo_sentence_status_query,
                                           use_standard_sql=True)))
        else:
            us_mo_sentence_statuses = (
                p |
                f"Generate empty MO statuses list for non-MO state run: {state_code} "
                >> beam.Create([]))

        us_mo_sentence_status_rankings_as_kv = (
            us_mo_sentence_statuses
            | 'Convert MO sentence status ranking table to KV tuples' >>
            beam.ParDo(ConvertDictToKVTuple(), 'person_id'))

        supervision_sentences_and_statuses = (
            {
                'incarceration_sentences': incarceration_sentences,
                'supervision_sentences': supervision_sentences,
                'sentence_statuses': us_mo_sentence_status_rankings_as_kv
            }
            | 'Group sentences to the sentence statuses for that person' >>
            beam.CoGroupByKey())

        sentences_converted = (
            supervision_sentences_and_statuses
            | 'Convert to state-specific sentences' >> beam.ParDo(
                ConvertSentencesToStateSpecificType()).with_outputs(
                    'incarceration_sentences', 'supervision_sentences'))

        sentences_and_sentence_groups = (
            {
                'sentence_groups': sentence_groups,
                'incarceration_sentences':
                sentences_converted.incarceration_sentences,
                'supervision_sentences':
                sentences_converted.supervision_sentences
            }
            | 'Group sentences to sentence groups' >> beam.CoGroupByKey())

        # Set hydrated sentences on the corresponding sentence groups
        sentence_groups_with_hydrated_sentences = (
            sentences_and_sentence_groups
            | 'Set hydrated sentences on sentence groups' >> beam.ParDo(
                SetSentencesOnSentenceGroup()))

        # Bring in the table that associates people and their county of residence
        person_id_to_county_kv = (
            p | 'Load person_id_to_county_kv' >> ImportTableAsKVTuples(
                dataset_id=reference_dataset,
                table_id=PERSONS_TO_RECENT_COUNTY_OF_RESIDENCE_VIEW_NAME,
                table_key='person_id',
                state_code_filter=state_code,
                person_id_filter_set=person_id_filter_set))

        ip_to_judicial_district_kv = (
            p | 'Load ip_to_judicial_district_kv' >> ImportTableAsKVTuples(
                dataset_id=reference_dataset,
                table_id=
                INCARCERATION_PERIOD_JUDICIAL_DISTRICT_ASSOCIATION_VIEW_NAME,
                table_key='person_id',
                state_code_filter=state_code,
                person_id_filter_set=person_id_filter_set))

        state_race_ethnicity_population_counts = (
            p | 'Load state_race_ethnicity_population_counts' >> ImportTable(
                dataset_id=static_reference_dataset,
                table_id='state_race_ethnicity_population_counts',
                state_code_filter=state_code,
                person_id_filter_set=None))

        # Group each StatePerson with their related entities
        person_entities = (
            {
                'person':
                persons,
                'sentence_groups':
                sentence_groups_with_hydrated_sentences,
                'incarceration_period_judicial_district_association':
                ip_to_judicial_district_kv
            }
            | 'Group StatePerson to SentenceGroups' >> beam.CoGroupByKey())

        # Identify IncarcerationEvents from the StatePerson's StateIncarcerationPeriods
        person_incarceration_events = (
            person_entities | 'Classify Incarceration Events' >> beam.ParDo(
                ClassifyIncarcerationEvents(), AsDict(person_id_to_county_kv)))

        person_metadata = (
            persons
            | "Build the person_metadata dictionary" >> beam.ParDo(
                BuildPersonMetadata(),
                AsList(state_race_ethnicity_population_counts)))

        person_incarceration_events_with_metadata = (
            {
                'person_events': person_incarceration_events,
                'person_metadata': person_metadata
            }
            | 'Group IncarcerationEvents with person-level metadata' >>
            beam.CoGroupByKey()
            |
            'Organize StatePerson, PersonMetadata and IncarcerationEvents for calculations'
            >> beam.ParDo(ExtractPersonEventsMetadata()))

        # Get pipeline job details for accessing job_id
        all_pipeline_options = apache_beam_pipeline_options.get_all_options()

        # Add timestamp for local jobs
        job_timestamp = datetime.datetime.now().strftime(
            '%Y-%m-%d_%H_%M_%S.%f')
        all_pipeline_options['job_timestamp'] = job_timestamp

        # Get the type of metric to calculate
        metric_types_set = set(metric_types)

        # Get IncarcerationMetrics
        incarceration_metrics = (
            person_incarceration_events_with_metadata
            | 'Get Incarceration Metrics' >> GetIncarcerationMetrics(
                pipeline_options=all_pipeline_options,
                metric_types=metric_types_set,
                calculation_end_month=calculation_end_month,
                calculation_month_count=calculation_month_count))

        if person_id_filter_set:
            logging.warning(
                "Non-empty person filter set - returning before writing metrics."
            )
            return

        # Convert the metrics into a format that's writable to BQ
        writable_metrics = (
            incarceration_metrics | 'Convert to dict to be written to BQ' >>
            beam.ParDo(RecidivizMetricWritableDict()).with_outputs(
                IncarcerationMetricType.INCARCERATION_ADMISSION.value,
                IncarcerationMetricType.INCARCERATION_POPULATION.value,
                IncarcerationMetricType.INCARCERATION_RELEASE.value))

        # Write the metrics to the output tables in BigQuery
        admissions_table_id = DATAFLOW_METRICS_TO_TABLES.get(
            IncarcerationAdmissionMetric)
        population_table_id = DATAFLOW_METRICS_TO_TABLES.get(
            IncarcerationPopulationMetric)
        releases_table_id = DATAFLOW_METRICS_TO_TABLES.get(
            IncarcerationReleaseMetric)

        _ = (writable_metrics.INCARCERATION_ADMISSION
             | f"Write admission metrics to BQ table: {admissions_table_id}" >>
             beam.io.WriteToBigQuery(
                 table=admissions_table_id,
                 dataset=output,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
                 method=beam.io.WriteToBigQuery.Method.FILE_LOADS))

        _ = (writable_metrics.INCARCERATION_POPULATION
             | f"Write population metrics to BQ table: {population_table_id}"
             >> beam.io.WriteToBigQuery(
                 table=population_table_id,
                 dataset=output,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
                 method=beam.io.WriteToBigQuery.Method.FILE_LOADS))

        _ = (writable_metrics.INCARCERATION_RELEASE
             | f"Write release metrics to BQ table: {releases_table_id}" >>
             beam.io.WriteToBigQuery(
                 table=releases_table_id,
                 dataset=output,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
                 method=beam.io.WriteToBigQuery.Method.FILE_LOADS))
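
The CoGroupByKey joins used throughout this pipeline rely on every input
PCollection being keyed by person_id. A minimal, self-contained sketch of the
join pattern with toy data (the names below are illustrative, not the
Recidiviz entities):

import apache_beam as beam

with beam.Pipeline() as p:
    persons = p | 'People' >> beam.Create([(1, 'person-1'), (2, 'person-2')])
    groups = p | 'Groups' >> beam.Create([(1, 'sg-A'), (1, 'sg-B')])

    _ = ({'person': persons, 'sentence_groups': groups}
         | 'Join on person_id' >> beam.CoGroupByKey()
         | beam.Map(print))
    # Emits, roughly: (1, {'person': ['person-1'], 'sentence_groups': ['sg-A', 'sg-B']})
    #                 (2, {'person': ['person-2'], 'sentence_groups': []})
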
Example #19
def run(
    apache_beam_pipeline_options: PipelineOptions,
    data_input: str,
    reference_view_input: str,
    static_reference_input: str,
    output: str,
    calculation_month_count: int,
    metric_types: List[str],
    state_code: str,
    calculation_end_month: Optional[str],
    person_filter_ids: Optional[List[int]],
):
    """Runs the incarceration calculation pipeline."""

    # Workaround to load SQLAlchemy objects at start of pipeline. This is necessary because the BuildRootEntity
    # function tries to access attributes of relationship properties on the SQLAlchemy room_schema_class before they
    # have been loaded. However, if *any* SQLAlchemy objects have been instantiated, then the relationship properties
    # are loaded and their attributes can be successfully accessed.
    _ = schema.StatePerson()

    apache_beam_pipeline_options.view_as(SetupOptions).save_main_session = True

    # Get pipeline job details
    all_pipeline_options = apache_beam_pipeline_options.get_all_options()
    project_id = all_pipeline_options["project"]

    if project_id is None:
        raise ValueError(f"No project set in pipeline options: {all_pipeline_options}")

    if state_code is None:
        raise ValueError("No state_code set for pipeline")

    input_dataset = project_id + "." + data_input
    reference_dataset = project_id + "." + reference_view_input
    static_reference_dataset = project_id + "." + static_reference_input

    person_id_filter_set = set(person_filter_ids) if person_filter_ids else None

    with beam.Pipeline(options=apache_beam_pipeline_options) as p:
        # Get StatePersons
        persons = p | "Load StatePersons" >> BuildRootEntity(
            dataset=input_dataset,
            root_entity_class=entities.StatePerson,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=True,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code,
        )

        # Get StateSentenceGroups
        sentence_groups = p | "Load StateSentenceGroups" >> BuildRootEntity(
            dataset=input_dataset,
            root_entity_class=entities.StateSentenceGroup,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=True,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code,
        )

        # Get StateIncarcerationSentences
        incarceration_sentences = (
            p
            | "Load StateIncarcerationSentences"
            >> BuildRootEntity(
                dataset=input_dataset,
                root_entity_class=entities.StateIncarcerationSentence,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=True,
                unifying_id_field_filter_set=person_id_filter_set,
                state_code=state_code,
            )
        )

        # Get StateSupervisionSentences
        supervision_sentences = p | "Load StateSupervisionSentences" >> BuildRootEntity(
            dataset=input_dataset,
            root_entity_class=entities.StateSupervisionSentence,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=True,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code,
        )

        if state_code == "US_MO":
            # Bring in the reference table that includes sentence status ranking information
            us_mo_sentence_status_query = select_all_by_person_query(
                reference_dataset,
                US_MO_SENTENCE_STATUSES_VIEW_NAME,
                state_code,
                person_id_filter_set,
            )

            us_mo_sentence_statuses = (
                p
                | "Read MO sentence status table from BigQuery"
                >> ReadFromBigQuery(query=us_mo_sentence_status_query)
            )
        else:
            us_mo_sentence_statuses = (
                p
                | f"Generate empty MO statuses list for non-MO state run: {state_code} "
                >> beam.Create([])
            )

        us_mo_sentence_status_rankings_as_kv = (
            us_mo_sentence_statuses
            | "Convert MO sentence status ranking table to KV tuples"
            >> beam.ParDo(ConvertDictToKVTuple(), "person_id")
        )

        supervision_sentences_and_statuses = (
            {
                "incarceration_sentences": incarceration_sentences,
                "supervision_sentences": supervision_sentences,
                "sentence_statuses": us_mo_sentence_status_rankings_as_kv,
            }
            | "Group sentences to the sentence statuses for that person"
            >> beam.CoGroupByKey()
        )

        sentences_converted = (
            supervision_sentences_and_statuses
            | "Convert to state-specific sentences"
            >> beam.ParDo(ConvertSentencesToStateSpecificType()).with_outputs(
                "incarceration_sentences", "supervision_sentences"
            )
        )

        sentences_and_sentence_groups = {
            "sentence_groups": sentence_groups,
            "incarceration_sentences": sentences_converted.incarceration_sentences,
            "supervision_sentences": sentences_converted.supervision_sentences,
        } | "Group sentences to sentence groups" >> beam.CoGroupByKey()

        # Set hydrated sentences on the corresponding sentence groups
        sentence_groups_with_hydrated_sentences = (
            sentences_and_sentence_groups
            | "Set hydrated sentences on sentence groups"
            >> beam.ParDo(SetSentencesOnSentenceGroup())
        )

        # Bring in the table that associates people and their county of residence
        person_id_to_county_kv = (
            p
            | "Load person_id_to_county_kv"
            >> ImportTableAsKVTuples(
                dataset_id=reference_dataset,
                table_id=PERSONS_TO_RECENT_COUNTY_OF_RESIDENCE_VIEW_NAME,
                table_key="person_id",
                state_code_filter=state_code,
                person_id_filter_set=person_id_filter_set,
            )
        )

        ip_to_judicial_district_kv = (
            p
            | "Load ip_to_judicial_district_kv"
            >> ImportTableAsKVTuples(
                dataset_id=reference_dataset,
                table_id=INCARCERATION_PERIOD_JUDICIAL_DISTRICT_ASSOCIATION_VIEW_NAME,
                table_key="person_id",
                state_code_filter=state_code,
                person_id_filter_set=person_id_filter_set,
            )
        )

        state_race_ethnicity_population_counts = (
            p
            | "Load state_race_ethnicity_population_counts"
            >> ImportTable(
                dataset_id=static_reference_dataset,
                table_id="state_race_ethnicity_population_counts",
                state_code_filter=state_code,
                person_id_filter_set=None,
            )
        )

        # Group each StatePerson with their related entities
        person_entities = {
            "person": persons,
            "sentence_groups": sentence_groups_with_hydrated_sentences,
            "incarceration_period_judicial_district_association": ip_to_judicial_district_kv,
            "persons_to_recent_county_of_residence": person_id_to_county_kv,
        } | "Group StatePerson to SentenceGroups" >> beam.CoGroupByKey()

        # Identify IncarcerationEvents from the StatePerson's StateIncarcerationPeriods
        person_incarceration_events = (
            person_entities
            | "Classify Incarceration Events"
            >> beam.ParDo(ClassifyIncarcerationEvents())
        )

        person_metadata = (
            persons
            | "Build the person_metadata dictionary"
            >> beam.ParDo(
                BuildPersonMetadata(), AsList(state_race_ethnicity_population_counts)
            )
        )

        person_incarceration_events_with_metadata = (
            {
                "person_events": person_incarceration_events,
                "person_metadata": person_metadata,
            }
            | "Group IncarcerationEvents with person-level metadata"
            >> beam.CoGroupByKey()
            | "Organize StatePerson, PersonMetadata and IncarcerationEvents for calculations"
            >> beam.ParDo(ExtractPersonEventsMetadata())
        )

        # Get pipeline job details for accessing job_id
        all_pipeline_options = apache_beam_pipeline_options.get_all_options()

        # Add timestamp for local jobs
        job_timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H_%M_%S.%f")
        all_pipeline_options["job_timestamp"] = job_timestamp

        # Get the type of metric to calculate
        metric_types_set = set(metric_types)

        # Get IncarcerationMetrics
        incarceration_metrics = (
            person_incarceration_events_with_metadata
            | "Get Incarceration Metrics"
            >> GetIncarcerationMetrics(
                pipeline_options=all_pipeline_options,
                metric_types=metric_types_set,
                calculation_end_month=calculation_end_month,
                calculation_month_count=calculation_month_count,
            )
        )

        if person_id_filter_set:
            logging.warning(
                "Non-empty person filter set - returning before writing metrics."
            )
            return

        # Convert the metrics into a format that's writable to BQ
        writable_metrics = (
            incarceration_metrics
            | "Convert to dict to be written to BQ"
            >> beam.ParDo(RecidivizMetricWritableDict()).with_outputs(
                IncarcerationMetricType.INCARCERATION_ADMISSION.value,
                IncarcerationMetricType.INCARCERATION_POPULATION.value,
                IncarcerationMetricType.INCARCERATION_RELEASE.value,
            )
        )

        # Write the metrics to the output tables in BigQuery
        admissions_table_id = DATAFLOW_METRICS_TO_TABLES[IncarcerationAdmissionMetric]
        population_table_id = DATAFLOW_METRICS_TO_TABLES[IncarcerationPopulationMetric]
        releases_table_id = DATAFLOW_METRICS_TO_TABLES[IncarcerationReleaseMetric]

        _ = (
            writable_metrics.INCARCERATION_ADMISSION
            | f"Write admission metrics to BQ table: {admissions_table_id}"
            >> WriteAppendToBigQuery(
                output_table=admissions_table_id,
                output_dataset=output,
            )
        )

        _ = (
            writable_metrics.INCARCERATION_POPULATION
            | f"Write population metrics to BQ table: {population_table_id}"
            >> WriteAppendToBigQuery(
                output_table=population_table_id,
                output_dataset=output,
            )
        )

        _ = (
            writable_metrics.INCARCERATION_RELEASE
            | f"Write release metrics to BQ table: {releases_table_id}"
            >> WriteAppendToBigQuery(
                output_table=releases_table_id,
                output_dataset=output,
            )
        )
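
ConvertDictToKVTuple is a Recidiviz DoFn whose body is not shown in these
examples. Judging from how it is invoked with the extra 'person_id' argument,
it presumably re-keys each BigQuery row dict by the named field so the rows
can feed CoGroupByKey; a hypothetical stand-in:

import apache_beam as beam

class ConvertDictToKVTupleSketch(beam.DoFn):
    """Hypothetical stand-in: re-key each row dict by the named field."""

    def process(self, element, key_field):
        # Assumption: every row dict carries the key field.
        yield element[key_field], element

# Usage, mirroring the pipelines above:
#   rows | beam.ParDo(ConvertDictToKVTupleSketch(), 'person_id')
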
Example #20
def run(argv=None):
    """
    This funciton parses the command line arguments and runs the Beam Pipeline.

    Args:
        argv: list containing the commandline arguments for this call of the
         script.
    """
    # Tracks whether the schema was inferred from the input or output table.
    schema_inferred = False

    data_args, pipeline_args = parse_data_generator_args(argv)
    data_args, schema_inferred = fetch_schema(data_args, schema_inferred)

    pipeline_options = PipelineOptions(pipeline_args)

    temp_location = pipeline_options.display_data()['temp_location']
    temp_blob = write_n_line_file_to_gcs(
        pipeline_options.display_data()['project'], temp_location,
        data_args.num_records)

    data_gen = DataGenerator(
        bq_schema_filename=data_args.schema_file,
        input_bq_table=data_args.input_bq_table,
        p_null=data_args.p_null,
        min_date=data_args.min_date,
        max_date=data_args.max_date,
        only_pos=data_args.only_pos,
        max_int=data_args.max_int,
        max_float=data_args.max_float,
        float_precision=data_args.float_precision,
        write_disp=data_args.write_disp,
        key_skew=data_args.key_skew,
        primary_key_cols=data_args.primary_key_cols,
        dest_joining_key_col=data_args.dest_joining_key_col)

    # Initiate the pipeline using the pipeline arguments passed in from the
    # command line.  This includes information including where Dataflow should
    # store temp files, and what the project id is and what runner to use.
    p = beam.Pipeline(options=pipeline_options)

    # When generating a dimension table we get the distinct keys as a side
    # input from the main table so we generate dimension records that join to
    # the main data table.
    key_set = (
        p
        | 'Query Keys from main table' >> beam.io.Read(
            beam.io.BigQuerySource(
                query="SELECT DISTINCT({}) FROM `{}`".format(
                    data_args.source_joining_key_col,
                    data_args.fact_table),
                use_standard_sql=True))
        | 'Extract key values' >> beam.Map(
            lambda x: x[data_args.source_joining_key_col]))

    rows = (
        p

        # Read the file we created with num_records newlines.
        | 'Read file with num_records lines' >> beam.io.ReadFromText(
            os.path.join('gs://', temp_blob.bucket.name, temp_blob.name))

        # Use our instance of the custom DataGenerator class to generate one
        # fake datum with the appropriate schema for each element in the
        # PCollection created above.
        | 'Generate Data' >> beam.ParDo(FakeRowGen(data_gen))
        | 'Parse Json Strings' >> beam.FlatMap(lambda row: [json.loads(row)])
        | 'Enforce joining keys' >> beam.FlatMap(
            data_gen.enforce_joinable_keys, key_set=AsList(key_set)))

    if data_args.primary_key_cols:
        for key in data_args.primary_key_cols.split(','):
            rows |= 'Enforcing primary key: {}'.format(
                key) >> EnforcePrimaryKeys(key)

    if data_args.csv_schema_order:
        (rows
         | 'Order fields for CSV writing.' >> beam.FlatMap(
             lambda d: [dict_to_csv(d, data_args.csv_schema_order.split(','))])
         | 'Write to GCS' >> beam.io.textio.WriteToText(
             file_path_prefix=data_args.output_prefix, file_name_suffix='.csv')
         )

    if data_args.avro_schema_file:
        with open(data_args.avro_schema_file, 'rb') as schema_file:
            avsc = avro.schema.parse(schema_file.read())

        (rows
         # Need to convert time stamps from strings to timestamp-micros
         | 'Fix date and time Types for Avro.' >>
         beam.FlatMap(lambda row: fix_record_for_avro(row, avsc))
         | 'Write to Avro.' >> beam.io.avroio.WriteToAvro(
             file_path_prefix=data_args.output_prefix,
             codec='null',
             file_name_suffix='.avro',
             use_fastavro=True,
             schema=avsc))

    if data_args.output_bq_table:
        (rows
         | 'Write to BigQuery.' >> beam.io.gcp.bigquery.WriteToBigQuery(
             # The table name is a required argument for the BigQuery sink.
             # In this case we use the value passed in from the command line.
             data_args.output_bq_table,
             schema=None
             if schema_inferred else data_gen.get_bq_schema_string(),
             # Creates the table in BigQuery if it does not yet exist.
             create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
             write_disposition=data_gen.write_disp,
             # Use the max recommended batch size.
             batch_size=500))

    p.run().wait_until_finish()

    # Manually clean up temp_num_records.txt because it lives outside this
    # job's directory and Dataflow will not remove it for us.
    temp_blob.delete()
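
enforce_joinable_keys is likewise not shown; the key_set=AsList(key_set) call
suggests it checks each generated row against the materialized key list. A
minimal sketch of that side-input pattern, with invented names:

import apache_beam as beam
from apache_beam.pvalue import AsList

def keep_if_joinable(row, key_set):
    # key_set arrives as a plain Python list materialized from the side input.
    if row['key'] in key_set:
        yield row

with beam.Pipeline() as p:
    keys = p | 'Keys' >> beam.Create(['a', 'b'])
    rows = p | 'Rows' >> beam.Create([{'key': 'a'}, {'key': 'z'}])
    _ = (rows
         | 'Enforce keys' >> beam.FlatMap(keep_if_joinable, key_set=AsList(keys))
         | beam.Map(print))  # only {'key': 'a'} survives
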
Example #21
def run(apache_beam_pipeline_options: PipelineOptions, data_input: str,
        reference_view_input: str, static_reference_input: str, output: str,
        calculation_month_count: int, metric_types: List[str],
        state_code: Optional[str], calculation_end_month: Optional[str],
        person_filter_ids: Optional[List[int]]):
    """Runs the supervision calculation pipeline."""

    # Workaround to load SQLAlchemy objects at start of pipeline. This is necessary because the BuildRootEntity
    # function tries to access attributes of relationship properties on the SQLAlchemy room_schema_class before they
    # have been loaded. However, if *any* SQLAlchemy objects have been instantiated, then the relationship properties
    # are loaded and their attributes can be successfully accessed.
    _ = schema.StatePerson()

    apache_beam_pipeline_options.view_as(SetupOptions).save_main_session = True

    # Get pipeline job details
    all_pipeline_options = apache_beam_pipeline_options.get_all_options()
    project_id = all_pipeline_options['project']

    input_dataset = project_id + '.' + data_input
    reference_dataset = project_id + '.' + reference_view_input
    static_reference_dataset = project_id + '.' + static_reference_input

    person_id_filter_set = set(
        person_filter_ids) if person_filter_ids else None

    with beam.Pipeline(options=apache_beam_pipeline_options) as p:
        # Get StatePersons
        persons = (p | 'Load Persons' >> BuildRootEntity(
            dataset=input_dataset,
            root_entity_class=entities.StatePerson,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=True,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code))

        # Get StateIncarcerationPeriods
        incarceration_periods = (
            p | 'Load IncarcerationPeriods' >> BuildRootEntity(
                dataset=input_dataset,
                root_entity_class=entities.StateIncarcerationPeriod,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=True,
                unifying_id_field_filter_set=person_id_filter_set,
                state_code=state_code))

        # Get StateSupervisionViolations
        supervision_violations = (
            p | 'Load SupervisionViolations' >> BuildRootEntity(
                dataset=input_dataset,
                root_entity_class=entities.StateSupervisionViolation,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=True,
                unifying_id_field_filter_set=person_id_filter_set,
                state_code=state_code))

        # TODO(#2769): Don't bring this in as a root entity
        # Get StateSupervisionViolationResponses
        supervision_violation_responses = (
            p | 'Load SupervisionViolationResponses' >> BuildRootEntity(
                dataset=input_dataset,
                root_entity_class=entities.StateSupervisionViolationResponse,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=True,
                unifying_id_field_filter_set=person_id_filter_set,
                state_code=state_code))

        # Get StateSupervisionSentences
        supervision_sentences = (
            p | 'Load SupervisionSentences' >> BuildRootEntity(
                dataset=input_dataset,
                root_entity_class=entities.StateSupervisionSentence,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=True,
                unifying_id_field_filter_set=person_id_filter_set,
                state_code=state_code))

        # Get StateIncarcerationSentences
        incarceration_sentences = (
            p | 'Load IncarcerationSentences' >> BuildRootEntity(
                dataset=input_dataset,
                root_entity_class=entities.StateIncarcerationSentence,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=True,
                unifying_id_field_filter_set=person_id_filter_set,
                state_code=state_code))

        # Get StateSupervisionPeriods
        supervision_periods = (
            p | 'Load SupervisionPeriods' >> BuildRootEntity(
                dataset=input_dataset,
                root_entity_class=entities.StateSupervisionPeriod,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=True,
                unifying_id_field_filter_set=person_id_filter_set,
                state_code=state_code))

        # Get StateAssessments
        assessments = (p | 'Load Assessments' >> BuildRootEntity(
            dataset=input_dataset,
            root_entity_class=entities.StateAssessment,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=False,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code))

        supervision_contacts = (
            p | 'Load StateSupervisionContacts' >> BuildRootEntity(
                dataset=input_dataset,
                root_entity_class=entities.StateSupervisionContact,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=False,
                unifying_id_field_filter_set=person_id_filter_set,
                state_code=state_code))

        ssvr_agent_associations_as_kv = (
            p | 'Load ssvr_agent_associations_as_kv' >> ImportTableAsKVTuples(
                dataset_id=reference_dataset,
                table_id=SSVR_TO_AGENT_ASSOCIATION_VIEW_NAME,
                table_key='supervision_violation_response_id',
                state_code_filter=state_code,
                person_id_filter_set=None))

        supervision_period_to_agent_associations_as_kv = (
            p | 'Load supervision_period_to_agent_associations_as_kv' >>
            ImportTableAsKVTuples(
                dataset_id=reference_dataset,
                table_id=SUPERVISION_PERIOD_TO_AGENT_ASSOCIATION_VIEW_NAME,
                table_key='supervision_period_id',
                state_code_filter=state_code,
                person_id_filter_set=None))

        # Bring in the judicial districts associated with supervision_periods
        sp_to_judicial_district_kv = (
            p | 'Load sp_to_judicial_district_kv' >> ImportTableAsKVTuples(
                dataset_id=reference_dataset,
                table_id=
                SUPERVISION_PERIOD_JUDICIAL_DISTRICT_ASSOCIATION_VIEW_NAME,
                state_code_filter=state_code,
                person_id_filter_set=person_id_filter_set,
                table_key='person_id'))

        state_race_ethnicity_population_counts = (
            p | 'Load state_race_ethnicity_population_counts' >> ImportTable(
                dataset_id=static_reference_dataset,
                table_id='state_race_ethnicity_population_counts',
                state_code_filter=state_code,
                person_id_filter_set=None,
            ))

        if state_code is None or state_code == 'US_MO':
            # Bring in the reference table that includes sentence status ranking information
            us_mo_sentence_status_query = select_all_by_person_query(
                reference_dataset, US_MO_SENTENCE_STATUSES_VIEW_NAME,
                state_code, person_id_filter_set)

            us_mo_sentence_statuses = (
                p |
                "Read MO sentence status table from BigQuery" >> beam.io.Read(
                    beam.io.BigQuerySource(query=us_mo_sentence_status_query,
                                           use_standard_sql=True)))
        else:
            us_mo_sentence_statuses = (
                p |
                f"Generate empty MO statuses list for non-MO state run: {state_code} "
                >> beam.Create([]))

        us_mo_sentence_status_rankings_as_kv = (
            us_mo_sentence_statuses
            | 'Convert MO sentence status ranking table to KV tuples' >>
            beam.ParDo(ConvertDictToKVTuple(), 'person_id'))

        sentences_and_statuses = (
            {
                'incarceration_sentences': incarceration_sentences,
                'supervision_sentences': supervision_sentences,
                'sentence_statuses': us_mo_sentence_status_rankings_as_kv
            }
            | 'Group sentences to the sentence statuses for that person' >>
            beam.CoGroupByKey())

        sentences_converted = (
            sentences_and_statuses
            | 'Convert to state-specific sentences' >> beam.ParDo(
                ConvertSentencesToStateSpecificType()).with_outputs(
                    'incarceration_sentences', 'supervision_sentences'))

        # Group StateSupervisionViolationResponses and StateSupervisionViolations by person_id
        supervision_violations_and_responses = (
            {
                'violations': supervision_violations,
                'violation_responses': supervision_violation_responses
            } | 'Group StateSupervisionViolationResponses to '
            'StateSupervisionViolations' >> beam.CoGroupByKey())

        # Set the fully hydrated StateSupervisionViolation entities on the corresponding
        # StateSupervisionViolationResponses
        violation_responses_with_hydrated_violations = (
            supervision_violations_and_responses
            | 'Set hydrated StateSupervisionViolations on '
            'the StateSupervisionViolationResponses' >> beam.ParDo(
                SetViolationOnViolationsResponse()))

        # Group StateIncarcerationPeriods and StateSupervisionViolationResponses by person_id
        incarceration_periods_and_violation_responses = (
            {
                'incarceration_periods': incarceration_periods,
                'violation_responses':
                violation_responses_with_hydrated_violations
            }
            | 'Group StateIncarcerationPeriods to '
            'StateSupervisionViolationResponses' >> beam.CoGroupByKey())

        # Set the fully hydrated StateSupervisionViolationResponse entities on the corresponding
        # StateIncarcerationPeriods
        incarceration_periods_with_source_violations = (
            incarceration_periods_and_violation_responses
            | 'Set hydrated StateSupervisionViolationResponses on '
            'the StateIncarcerationPeriods' >> beam.ParDo(
                SetViolationResponseOnIncarcerationPeriod()))

        # Group each StatePerson with their related entities
        person_entities = (
            {
                'person':
                persons,
                'assessments':
                assessments,
                'incarceration_periods':
                incarceration_periods_with_source_violations,
                'supervision_periods':
                supervision_periods,
                'supervision_sentences':
                sentences_converted.supervision_sentences,
                'incarceration_sentences':
                sentences_converted.incarceration_sentences,
                'violation_responses':
                violation_responses_with_hydrated_violations,
                'supervision_contacts':
                supervision_contacts,
                'supervision_period_judicial_district_association':
                sp_to_judicial_district_kv
            }
            | 'Group StatePerson to all entities' >> beam.CoGroupByKey())

        # Identify SupervisionTimeBuckets from the StatePerson's StateSupervisionSentences and StateIncarcerationPeriods
        person_time_buckets = (
            person_entities
            | 'Get SupervisionTimeBuckets' >> beam.ParDo(
                ClassifySupervisionTimeBuckets(),
                AsDict(ssvr_agent_associations_as_kv),
                AsDict(supervision_period_to_agent_associations_as_kv)))

        person_metadata = (
            persons
            | "Build the person_metadata dictionary" >> beam.ParDo(
                BuildPersonMetadata(),
                AsList(state_race_ethnicity_population_counts)))

        person_time_buckets_with_metadata = (
            {
                'person_events': person_time_buckets,
                'person_metadata': person_metadata
            }
            | 'Group SupervisionTimeBuckets with person-level metadata' >>
            beam.CoGroupByKey()
            |
            'Organize StatePerson, PersonMetadata and SupervisionTimeBuckets for calculations'
            >> beam.ParDo(ExtractPersonEventsMetadata()))

        # Get pipeline job details for accessing job_id
        all_pipeline_options = apache_beam_pipeline_options.get_all_options()

        # Get the type of metric to calculate
        metric_types_set = set(metric_types)

        # Add timestamp for local jobs
        job_timestamp = datetime.datetime.now().strftime(
            '%Y-%m-%d_%H_%M_%S.%f')
        all_pipeline_options['job_timestamp'] = job_timestamp

        # Get supervision metrics
        supervision_metrics = (
            person_time_buckets_with_metadata
            | 'Get Supervision Metrics' >> GetSupervisionMetrics(
                pipeline_options=all_pipeline_options,
                metric_types=metric_types_set,
                calculation_end_month=calculation_end_month,
                calculation_month_count=calculation_month_count))
        if person_id_filter_set:
            logging.warning(
                "Non-empty person filter set - returning before writing metrics."
            )
            return

        # Convert the metrics into a format that's writable to BQ
        writable_metrics = (
            supervision_metrics | 'Convert to dict to be written to BQ' >>
            beam.ParDo(RecidivizMetricWritableDict()).with_outputs(
                SupervisionMetricType.SUPERVISION_COMPLIANCE.value,
                SupervisionMetricType.SUPERVISION_POPULATION.value,
                SupervisionMetricType.SUPERVISION_REVOCATION.value,
                SupervisionMetricType.SUPERVISION_REVOCATION_ANALYSIS.value,
                SupervisionMetricType.
                SUPERVISION_REVOCATION_VIOLATION_TYPE_ANALYSIS.value,
                SupervisionMetricType.SUPERVISION_SUCCESS.value,
                SupervisionMetricType.
                SUPERVISION_SUCCESSFUL_SENTENCE_DAYS_SERVED.value,
                SupervisionMetricType.SUPERVISION_TERMINATION.value))

        # Write the metrics to the output tables in BigQuery
        terminations_table_id = DATAFLOW_METRICS_TO_TABLES.get(
            SupervisionTerminationMetric)
        compliance_table_id = DATAFLOW_METRICS_TO_TABLES.get(
            SupervisionCaseComplianceMetric)
        populations_table_id = DATAFLOW_METRICS_TO_TABLES.get(
            SupervisionPopulationMetric)
        revocations_table_id = DATAFLOW_METRICS_TO_TABLES.get(
            SupervisionRevocationMetric)
        revocation_analysis_table_id = DATAFLOW_METRICS_TO_TABLES.get(
            SupervisionRevocationAnalysisMetric)
        revocation_violation_type_analysis_table_id = DATAFLOW_METRICS_TO_TABLES.get(
            SupervisionRevocationViolationTypeAnalysisMetric)
        successes_table_id = DATAFLOW_METRICS_TO_TABLES.get(
            SupervisionSuccessMetric)
        successful_sentence_lengths_table_id = DATAFLOW_METRICS_TO_TABLES.get(
            SuccessfulSupervisionSentenceDaysServedMetric)

        _ = (writable_metrics.SUPERVISION_POPULATION
             | f"Write population metrics to BQ table: {populations_table_id}"
             >> beam.io.WriteToBigQuery(
                 table=populations_table_id,
                 dataset=output,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
                 method=beam.io.WriteToBigQuery.Method.FILE_LOADS))

        _ = (writable_metrics.SUPERVISION_REVOCATION
             | f"Write revocation metrics to BQ table: {revocations_table_id}"
             >> beam.io.WriteToBigQuery(
                 table=revocations_table_id,
                 dataset=output,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
                 method=beam.io.WriteToBigQuery.Method.FILE_LOADS))

        _ = (writable_metrics.SUPERVISION_SUCCESS
             | f"Write success metrics to BQ table: {successes_table_id}" >>
             beam.io.WriteToBigQuery(
                 table=successes_table_id,
                 dataset=output,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
                 method=beam.io.WriteToBigQuery.Method.FILE_LOADS))

        _ = (writable_metrics.SUPERVISION_SUCCESSFUL_SENTENCE_DAYS_SERVED
             | f"Write supervision successful sentence length metrics to BQ"
             f" table: {successful_sentence_lengths_table_id}" >>
             beam.io.WriteToBigQuery(
                 table=successful_sentence_lengths_table_id,
                 dataset=output,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
                 method=beam.io.WriteToBigQuery.Method.FILE_LOADS))

        _ = (writable_metrics.SUPERVISION_TERMINATION
             |
             f"Write termination metrics to BQ table: {terminations_table_id}"
             >> beam.io.WriteToBigQuery(
                 table=terminations_table_id,
                 dataset=output,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
                 method=beam.io.WriteToBigQuery.Method.FILE_LOADS))

        _ = (
            writable_metrics.SUPERVISION_REVOCATION_ANALYSIS
            |
            f"Write revocation analyses metrics to BQ table: {revocation_analysis_table_id}"
            >> beam.io.WriteToBigQuery(
                table=revocation_analysis_table_id,
                dataset=output,
                create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
                method=beam.io.WriteToBigQuery.Method.FILE_LOADS))

        _ = (writable_metrics.SUPERVISION_REVOCATION_VIOLATION_TYPE_ANALYSIS
             |
             f"Write revocation violation type analyses metrics to BQ table: "
             f"{revocation_violation_type_analysis_table_id}" >>
             beam.io.WriteToBigQuery(
                 table=revocation_violation_type_analysis_table_id,
                 dataset=output,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
                 method=beam.io.WriteToBigQuery.Method.FILE_LOADS))

        _ = (writable_metrics.SUPERVISION_COMPLIANCE
             | f"Write compliance metrics to BQ table: {compliance_table_id}"
             >> beam.io.WriteToBigQuery(
                 table=compliance_table_id,
                 dataset=output,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
                 method=beam.io.WriteToBigQuery.Method.FILE_LOADS))
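
RecidivizMetricWritableDict is not shown here, but pairing a ParDo with
with_outputs(...) implies it emits pvalue.TaggedOutput values, one tag per
metric type. A toy version of the tag-and-split pattern:

import apache_beam as beam
from apache_beam import pvalue

class TagByMetricType(beam.DoFn):
    def process(self, metric):
        # Route each metric dict to the output named after its type.
        yield pvalue.TaggedOutput(metric['metric_type'], metric)

with beam.Pipeline() as p:
    metrics = p | beam.Create([
        {'metric_type': 'SUPERVISION_POPULATION', 'value': 10},
        {'metric_type': 'SUPERVISION_SUCCESS', 'value': 2},
    ])
    tagged = metrics | beam.ParDo(TagByMetricType()).with_outputs(
        'SUPERVISION_POPULATION', 'SUPERVISION_SUCCESS')
    _ = tagged.SUPERVISION_POPULATION | 'Print populations' >> beam.Map(print)
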
Example #22
def run(
    apache_beam_pipeline_options: PipelineOptions,
    data_input: str,
    reference_view_input: str,
    static_reference_input: str,
    output: str,
    calculation_month_count: int,
    metric_types: List[str],
    state_code: str,
    calculation_end_month: Optional[str],
    person_filter_ids: Optional[List[int]],
):
    """Runs the program calculation pipeline."""

    # Workaround to load SQLAlchemy objects at start of pipeline. This is necessary because the BuildRootEntity
    # function tries to access attributes of relationship properties on the SQLAlchemy room_schema_class before they
    # have been loaded. However, if *any* SQLAlchemy objects have been instantiated, then the relationship properties
    # are loaded and their attributes can be successfully accessed.
    _ = schema.StatePerson()

    apache_beam_pipeline_options.view_as(SetupOptions).save_main_session = True

    # Get pipeline job details
    all_pipeline_options = apache_beam_pipeline_options.get_all_options()
    project_id = all_pipeline_options["project"]

    if project_id is None:
        raise ValueError(f"No project set in pipeline options: {all_pipeline_options}")

    if state_code is None:
        raise ValueError("No state_code set for pipeline")

    input_dataset = project_id + "." + data_input
    reference_dataset = project_id + "." + reference_view_input
    static_reference_dataset = project_id + "." + static_reference_input

    person_id_filter_set = set(person_filter_ids) if person_filter_ids else None

    with beam.Pipeline(options=apache_beam_pipeline_options) as p:
        # Get StatePersons
        persons = p | "Load Persons" >> BuildRootEntity(
            dataset=input_dataset,
            root_entity_class=entities.StatePerson,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=True,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code,
        )

        # Get StateProgramAssignments
        program_assignments = p | "Load Program Assignments" >> BuildRootEntity(
            dataset=input_dataset,
            root_entity_class=entities.StateProgramAssignment,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=True,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code,
        )

        # Get StateAssessments
        assessments = p | "Load Assessments" >> BuildRootEntity(
            dataset=input_dataset,
            root_entity_class=entities.StateAssessment,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=False,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code,
        )

        # Get StateSupervisionPeriods
        supervision_periods = p | "Load SupervisionPeriods" >> BuildRootEntity(
            dataset=input_dataset,
            root_entity_class=entities.StateSupervisionPeriod,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=False,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code,
        )

        supervision_period_to_agent_associations_as_kv = (
            p
            | "Load supervision_period_to_agent_associations_as_kv"
            >> ImportTableAsKVTuples(
                dataset_id=reference_dataset,
                table_id=SUPERVISION_PERIOD_TO_AGENT_ASSOCIATION_VIEW_NAME,
                table_key="person_id",
                state_code_filter=state_code,
                person_id_filter_set=person_id_filter_set,
            )
        )

        state_race_ethnicity_population_counts = (
            p
            | "Load state_race_ethnicity_population_counts"
            >> ImportTable(
                dataset_id=static_reference_dataset,
                table_id="state_race_ethnicity_population_counts",
                state_code_filter=state_code,
                person_id_filter_set=None,
            )
        )

        # Group each StatePerson with their other entities
        persons_entities = {
            "person": persons,
            "program_assignments": program_assignments,
            "assessments": assessments,
            "supervision_periods": supervision_periods,
            "supervision_period_to_agent_association": supervision_period_to_agent_associations_as_kv,
        } | "Group StatePerson to StateProgramAssignments and" >> beam.CoGroupByKey()

        # Identify ProgramEvents from the StatePerson's StateProgramAssignments
        person_program_events = persons_entities | beam.ParDo(
            ClassifyProgramAssignments()
        )

        person_metadata = (
            persons
            | "Build the person_metadata dictionary"
            >> beam.ParDo(
                BuildPersonMetadata(), AsList(state_race_ethnicity_population_counts)
            )
        )

        person_program_events_with_metadata = (
            {"person_events": person_program_events, "person_metadata": person_metadata}
            | "Group ProgramEvents with person-level metadata" >> beam.CoGroupByKey()
            | "Organize StatePerson, PersonMetadata and ProgramEvents for calculations"
            >> beam.ParDo(ExtractPersonEventsMetadata())
        )

        # Get pipeline job details for accessing job_id
        all_pipeline_options = apache_beam_pipeline_options.get_all_options()

        # Add timestamp for local jobs
        job_timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H_%M_%S.%f")
        all_pipeline_options["job_timestamp"] = job_timestamp

        # Get the type of metric to calculate
        metric_types_set = set(metric_types)

        # Get program metrics
        program_metrics = (
            person_program_events_with_metadata
            | "Get Program Metrics"
            >> GetProgramMetrics(
                pipeline_options=all_pipeline_options,
                metric_types=metric_types_set,
                calculation_end_month=calculation_end_month,
                calculation_month_count=calculation_month_count,
            )
        )

        if person_id_filter_set:
            logging.warning(
                "Non-empty person filter set - returning before writing metrics."
            )
            return

        # Convert the metrics into a format that's writable to BQ
        writable_metrics = (
            program_metrics
            | "Convert to dict to be written to BQ"
            >> beam.ParDo(RecidivizMetricWritableDict()).with_outputs(
                ProgramMetricType.PROGRAM_PARTICIPATION.value,
                ProgramMetricType.PROGRAM_REFERRAL.value,
            )
        )

        # Write the metrics to the output tables in BigQuery
        referrals_table_id = DATAFLOW_METRICS_TO_TABLES[ProgramReferralMetric]
        participation_table_id = DATAFLOW_METRICS_TO_TABLES[ProgramParticipationMetric]

        _ = (
            writable_metrics.PROGRAM_REFERRAL
            | f"Write referral metrics to BQ table: {referrals_table_id}"
            >> WriteAppendToBigQuery(
                output_table=referrals_table_id,
                output_dataset=output,
            )
        )

        _ = (
            writable_metrics.PROGRAM_PARTICIPATION
            | f"Write participation metrics to BQ table: {participation_table_id}"
            >> WriteAppendToBigQuery(
                output_table=participation_table_id,
                output_dataset=output,
            )
        )
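
WriteAppendToBigQuery is a Recidiviz wrapper whose implementation is not
included here, but the raw beam.io.WriteToBigQuery calls in the earlier
examples (CREATE_NEVER, WRITE_APPEND, FILE_LOADS) suggest a plausible reading:

import apache_beam as beam

class WriteAppendToBigQuerySketch(beam.PTransform):
    """Plausible reading of WriteAppendToBigQuery: append-only, never create."""

    def __init__(self, output_table, output_dataset):
        super().__init__()
        self._table = output_table
        self._dataset = output_dataset

    def expand(self, pcoll):
        return pcoll | beam.io.WriteToBigQuery(
            table=self._table,
            dataset=self._dataset,
            create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
            write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
            method=beam.io.WriteToBigQuery.Method.FILE_LOADS)
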
Example #23
def run(argv=sys.argv):

    # Set up the command-line arguments related to data processing
    parser = argparse.ArgumentParser()

    # Input file that contains sales data
    parser.add_argument(
        '--input1',
        dest='input1',
        default='../data/spikey_sales_weekly.txt',
        # required=True,
        help='Input Sales Data file')

    # Input file that contains all the offers currently running
    parser.add_argument(
        '--input2',
        dest='input2',
        default='../data/spikey_offers.txt',
        # required=True,
        help='Input Offers Data file')

    # Output file that will contain the discounted top-selling products
    parser.add_argument(
        '--output',
        dest='output',
        default='../output/output_top_seller_offer',
        # required=True,
        help='Output file for discounted top-selling products')

    known_args, pipeline_args = parser.parse_known_args(argv)

    # Apache Beam pipeline options
    pipeline_option = PipelineOptions(pipeline_args)
    pipeline_option.view_as(SetupOptions).save_main_session = True

    with beam.Pipeline(options=pipeline_option) as p:
        # Read sales data
        logging.info("Input sales data file %s", known_args.input1)
        sold_items = p | 'Read Sales Data' >> ReadFromText(known_args.input1)

        logging.info("Input offer data file %s", known_args.input2)
        discounted_items = p | 'Read offers' >> ReadFromText(known_args.input2)

        # Get Top Selling Items
        top_selling_items = (
            sold_items
            | 'Top Selling Products' >> beam.ParDo(TopSellingProducts()))
        logging.info("top_selling_items calculation completed")

        discounted_item_ids = (discounted_items
                               | 'Offer Items' >> beam.ParDo(OfferedItem()))
        logging.info("discounted_item_ids calculation completed")

        top_discounted_items = (top_selling_items
                                | 'Discounted Item Match' >> beam.FlatMap(
                                    match_id_fn, AsList(discounted_item_ids)))

        (top_discounted_items |
         'Write output File' >> WriteToText(known_args.output,
                                            file_name_suffix='.csv',
                                            header='Product_ID, Product_Name'))
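
This example passes the discounted IDs as an AsList side input; when the
reference data is keyed, the pipelines above use AsDict for per-element
dictionary lookups instead. A minimal sketch of that variant (names invented):

import apache_beam as beam
from apache_beam.pvalue import AsDict

def attach_agent(element, agent_by_id):
    # agent_by_id is the keyed reference table, materialized as a dict.
    key, payload = element
    yield key, (payload, agent_by_id.get(key))

with beam.Pipeline() as p:
    main = p | 'Main' >> beam.Create([(1, 'sp-1'), (2, 'sp-2')])
    ref = p | 'Ref' >> beam.Create([(1, 'agent-A')])
    _ = (main
         | beam.FlatMap(attach_agent, AsDict(ref))
         | beam.Map(print))
    # (1, ('sp-1', 'agent-A')) and (2, ('sp-2', None))
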
Example #24
    def execute_pipeline(
        self,
        pipeline: beam.Pipeline,
        all_pipeline_options: Dict[str, Any],
        state_code: str,
        input_dataset: str,
        reference_dataset: str,
        static_reference_dataset: str,
        metric_types: List[str],
        person_id_filter_set: Optional[Set[int]],
        calculation_month_count: int = -1,
        calculation_end_month: Optional[str] = None,
    ) -> beam.Pipeline:
        persons = pipeline | "Load StatePersons" >> BuildRootEntity(
            dataset=input_dataset,
            root_entity_class=entities.StatePerson,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=True,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code,
        )

        # Get StateSentenceGroups
        sentence_groups = pipeline | "Load StateSentenceGroups" >> BuildRootEntity(
            dataset=input_dataset,
            root_entity_class=entities.StateSentenceGroup,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=True,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code,
        )

        # Get StateIncarcerationSentences
        incarceration_sentences = (
            pipeline
            | "Load StateIncarcerationSentences" >> BuildRootEntity(
                dataset=input_dataset,
                root_entity_class=entities.StateIncarcerationSentence,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=True,
                unifying_id_field_filter_set=person_id_filter_set,
                state_code=state_code,
            ))

        # Get StateSupervisionSentences
        supervision_sentences = (
            pipeline
            | "Load StateSupervisionSentences" >> BuildRootEntity(
                dataset=input_dataset,
                root_entity_class=entities.StateSupervisionSentence,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=True,
                unifying_id_field_filter_set=person_id_filter_set,
                state_code=state_code,
            ))

        # Get StateSupervisionPeriods
        supervision_periods = (
            pipeline
            | "Load StateSupervisionPeriods" >> BuildRootEntity(
                dataset=input_dataset,
                root_entity_class=entities.StateSupervisionPeriod,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=True,
                unifying_id_field_filter_set=person_id_filter_set,
                state_code=state_code,
            ))

        # Get StateAssessments
        assessments = pipeline | "Load Assessments" >> BuildRootEntity(
            dataset=input_dataset,
            root_entity_class=entities.StateAssessment,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=False,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code,
        )

        # Get StateSupervisionViolations
        supervision_violations = (
            pipeline
            | "Load SupervisionViolations" >> BuildRootEntity(
                dataset=input_dataset,
                root_entity_class=entities.StateSupervisionViolation,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=True,
                unifying_id_field_filter_set=person_id_filter_set,
                state_code=state_code,
            ))

        # Get StateSupervisionViolationResponses
        supervision_violation_responses = (
            pipeline
            | "Load SupervisionViolationResponses" >> BuildRootEntity(
                dataset=input_dataset,
                root_entity_class=entities.StateSupervisionViolationResponse,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=True,
                unifying_id_field_filter_set=person_id_filter_set,
                state_code=state_code,
            ))

        if state_code == "US_MO":
            # Bring in the reference table that includes sentence status ranking information
            us_mo_sentence_status_query = select_all_by_person_query(
                reference_dataset,
                US_MO_SENTENCE_STATUSES_VIEW_NAME,
                state_code,
                person_id_filter_set,
            )

            us_mo_sentence_statuses = (
                pipeline
                | "Read MO sentence status table from BigQuery" >>
                ReadFromBigQuery(query=us_mo_sentence_status_query))
        else:
            us_mo_sentence_statuses = (
                pipeline
                |
                f"Generate empty MO statuses list for non-MO state run: {state_code} "
                >> beam.Create([]))

        us_mo_sentence_status_rankings_as_kv = (
            us_mo_sentence_statuses
            | "Convert MO sentence status ranking table to KV tuples" >>
            beam.ParDo(ConvertDictToKVTuple(), "person_id"))

        supervision_sentences_and_statuses = (
            {
                "incarceration_sentences": incarceration_sentences,
                "supervision_sentences": supervision_sentences,
                "sentence_statuses": us_mo_sentence_status_rankings_as_kv,
            }
            | "Group sentences to the sentence statuses for that person" >>
            beam.CoGroupByKey())

        sentences_converted = (
            supervision_sentences_and_statuses
            | "Convert to state-specific sentences" >> beam.ParDo(
                ConvertSentencesToStateSpecificType()).with_outputs(
                    "incarceration_sentences", "supervision_sentences"))

        # Set hydrated supervision periods on the corresponding incarceration sentences
        incarceration_sentences_with_hydrated_sps = (
            {
                "supervision_periods": supervision_periods,
                "sentences": sentences_converted.incarceration_sentences,
            }
            | "Group supervision periods to incarceration sentences" >>
            beam.CoGroupByKey()
            | "Set hydrated supervision periods on incarceration sentences" >>
            beam.ParDo(SetSupervisionPeriodsOnSentences()))

        # Set hydrated supervision periods on the corresponding supervision sentences
        supervision_sentences_with_hydrated_sps = (
            {
                "supervision_periods": supervision_periods,
                "sentences": sentences_converted.supervision_sentences,
            }
            | "Group supervision periods to supervision sentences" >>
            beam.CoGroupByKey()
            | "Set hydrated supervision periods on supervision sentences" >>
            beam.ParDo(SetSupervisionPeriodsOnSentences()))

        sentences_and_sentence_groups = {
            "sentence_groups": sentence_groups,
            "incarceration_sentences":
            incarceration_sentences_with_hydrated_sps,
            "supervision_sentences": supervision_sentences_with_hydrated_sps,
        } | "Group sentences to sentence groups" >> beam.CoGroupByKey()

        # Set hydrated sentences on the corresponding sentence groups
        sentence_groups_with_hydrated_sentences = (
            sentences_and_sentence_groups
            | "Set hydrated sentences on sentence groups" >> beam.ParDo(
                SetSentencesOnSentenceGroup()))

        # Bring in the table that associates people and their county of residence
        person_id_to_county_kv = (
            pipeline
            | "Load person_id_to_county_kv" >> ImportTableAsKVTuples(
                dataset_id=reference_dataset,
                table_id=PERSONS_TO_RECENT_COUNTY_OF_RESIDENCE_VIEW_NAME,
                table_key="person_id",
                state_code_filter=state_code,
                person_id_filter_set=person_id_filter_set,
            ))

        ip_to_judicial_district_kv = (
            pipeline
            | "Load ip_to_judicial_district_kv" >> ImportTableAsKVTuples(
                dataset_id=reference_dataset,
                table_id=
                INCARCERATION_PERIOD_JUDICIAL_DISTRICT_ASSOCIATION_VIEW_NAME,
                table_key="person_id",
                state_code_filter=state_code,
                person_id_filter_set=person_id_filter_set,
            ))

        supervision_period_to_agent_associations_as_kv = (
            pipeline
            | "Load supervision_period_to_agent_associations_as_kv" >>
            ImportTableAsKVTuples(
                dataset_id=reference_dataset,
                table_id=SUPERVISION_PERIOD_TO_AGENT_ASSOCIATION_VIEW_NAME,
                table_key="person_id",
                state_code_filter=state_code,
                person_id_filter_set=person_id_filter_set,
            ))

        state_race_ethnicity_population_counts = (
            pipeline
            | "Load state_race_ethnicity_population_counts" >> ImportTable(
                dataset_id=static_reference_dataset,
                table_id="state_race_ethnicity_population_counts",
                state_code_filter=state_code,
                person_id_filter_set=None,
            ))

        # Group StateSupervisionViolationResponses and StateSupervisionViolations by person_id
        supervision_violations_and_responses = (
            {
                "violations": supervision_violations,
                "violation_responses": supervision_violation_responses,
            }
            | "Group StateSupervisionViolationResponses to "
            "StateSupervisionViolations" >> beam.CoGroupByKey())

        # Set the fully hydrated StateSupervisionViolation entities on the corresponding
        # StateSupervisionViolationResponses
        violation_responses_with_hydrated_violations = (
            supervision_violations_and_responses
            | "Set hydrated StateSupervisionViolations on "
            "the StateSupervisionViolationResponses" >> beam.ParDo(
                SetViolationOnViolationsResponse()))

        # Group each StatePerson with their related entities
        person_entities = {
            "person": persons,
            "assessments": assessments,
            "sentence_groups": sentence_groups_with_hydrated_sentences,
            "violation_responses":
            violation_responses_with_hydrated_violations,
            "incarceration_period_judicial_district_association":
            ip_to_judicial_district_kv,
            "supervision_period_to_agent_association":
            supervision_period_to_agent_associations_as_kv,
            "persons_to_recent_county_of_residence": person_id_to_county_kv,
        } | "Group StatePerson to SentenceGroups" >> beam.CoGroupByKey()

        # Identify IncarcerationEvents from the StatePerson's StateIncarcerationPeriods
        person_incarceration_events = (
            person_entities
            | "Classify Incarceration Events" >> beam.ParDo(
                ClassifyEvents(), identifier=self.pipeline_config.identifier))

        person_metadata = (
            persons
            | "Build the person_metadata dictionary" >> beam.ParDo(
                BuildPersonMetadata(),
                state_race_ethnicity_population_counts=AsList(
                    state_race_ethnicity_population_counts),
            ))
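        # AsList materializes the small population-counts table as an
        # in-memory list side input, so every BuildPersonMetadata.process()
        # call sees the full table alongside its StatePerson element.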

        person_incarceration_events_with_metadata = (
            {
                "person_events": person_incarceration_events,
                "person_metadata": person_metadata,
            }
            | "Group IncarcerationEvents with person-level metadata" >>
            beam.CoGroupByKey()
            |
            "Organize StatePerson, PersonMetadata and IncarcerationEvents for calculations"
            >> beam.ParDo(ExtractPersonEventsMetadata()))

        # Add timestamp for local jobs
        job_timestamp = datetime.datetime.now().strftime(
            "%Y-%m-%d_%H_%M_%S.%f")
        all_pipeline_options["job_timestamp"] = job_timestamp

        # Get the types of metrics to calculate
        metric_types_set = set(metric_types)

        # Get IncarcerationMetrics
        incarceration_metrics = (
            person_incarceration_events_with_metadata
            | "Get Incarceration Metrics" >> GetMetrics(
                pipeline_options=all_pipeline_options,
                pipeline_config=self.pipeline_config,
                metric_types_to_include=metric_types_set,
                calculation_end_month=calculation_end_month,
                calculation_month_count=calculation_month_count,
            ))

        return incarceration_metrics
Example #25
    def execute_pipeline(
        self,
        pipeline: beam.Pipeline,
        all_pipeline_options: Dict[str, Any],
        state_code: str,
        input_dataset: str,
        reference_dataset: str,
        static_reference_dataset: str,
        metric_types: List[str],
        person_id_filter_set: Optional[Set[int]],
        calculation_month_count: int = -1,
        calculation_end_month: Optional[str] = None,
    ) -> beam.Pipeline:
        # Get StatePersons
        persons = pipeline | "Load Persons" >> BuildRootEntity(
            dataset=input_dataset,
            root_entity_class=entities.StatePerson,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=True,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code,
        )

        # Get StateProgramAssignments
        program_assignments = pipeline | "Load Program Assignments" >> BuildRootEntity(
            dataset=input_dataset,
            root_entity_class=entities.StateProgramAssignment,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=True,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code,
        )

        # Get StateAssessments
        assessments = pipeline | "Load Assessments" >> BuildRootEntity(
            dataset=input_dataset,
            root_entity_class=entities.StateAssessment,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=False,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code,
        )

        # Get StateSupervisionPeriods
        supervision_periods = pipeline | "Load SupervisionPeriods" >> BuildRootEntity(
            dataset=input_dataset,
            root_entity_class=entities.StateSupervisionPeriod,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=False,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code,
        )

        supervision_period_to_agent_associations_as_kv = (
            pipeline
            | "Load supervision_period_to_agent_associations_as_kv" >>
            ImportTableAsKVTuples(
                dataset_id=reference_dataset,
                table_id=SUPERVISION_PERIOD_TO_AGENT_ASSOCIATION_VIEW_NAME,
                table_key="person_id",
                state_code_filter=state_code,
                person_id_filter_set=person_id_filter_set,
            ))

        state_race_ethnicity_population_counts = (
            pipeline
            | "Load state_race_ethnicity_population_counts" >> ImportTable(
                dataset_id=static_reference_dataset,
                table_id="state_race_ethnicity_population_counts",
                state_code_filter=state_code,
                person_id_filter_set=None,
            ))

        # Group each StatePerson with their other entities
        persons_entities = {
            "person":
            persons,
            "program_assignments":
            program_assignments,
            "assessments":
            assessments,
            "supervision_periods":
            supervision_periods,
            "supervision_period_to_agent_association":
            supervision_period_to_agent_associations_as_kv,
        } | "Group StatePerson to StateProgramAssignments and" >> beam.CoGroupByKey(
        )

        # Identify ProgramEvents from the StatePerson's StateProgramAssignments
        person_program_events = persons_entities | beam.ParDo(
            ClassifyEvents(), identifier=self.pipeline_config.identifier)
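        # Beam forwards the identifier kwarg above into
        # ClassifyEvents.process() for every element -- the usual way to
        # parameterize a DoFn without baking state into its constructor.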

        person_metadata = (
            persons
            | "Build the person_metadata dictionary" >> beam.ParDo(
                BuildPersonMetadata(),
                state_race_ethnicity_population_counts=AsList(
                    state_race_ethnicity_population_counts),
            ))

        person_program_events_with_metadata = (
            {
                "person_events": person_program_events,
                "person_metadata": person_metadata
            }
            | "Group ProgramEvents with person-level metadata" >>
            beam.CoGroupByKey()
            |
            "Organize StatePerson, PersonMetadata and ProgramEvents for calculations"
            >> beam.ParDo(ExtractPersonEventsMetadata()))

        # Add timestamp for local jobs
        job_timestamp = datetime.datetime.now().strftime(
            "%Y-%m-%d_%H_%M_%S.%f")
        all_pipeline_options["job_timestamp"] = job_timestamp

        # Get the types of metrics to calculate
        metric_types_set = set(metric_types)

        # Get program metrics
        program_metrics = (person_program_events_with_metadata
                           | "Get Program Metrics" >> GetMetrics(
                               pipeline_options=all_pipeline_options,
                               pipeline_config=self.pipeline_config,
                               metric_types_to_include=metric_types_set,
                               calculation_end_month=calculation_end_month,
                               calculation_month_count=calculation_month_count,
                           ))

        return program_metrics
Example #26
    def run_test_pipeline(self,
                          dataset: str,
                          fake_supervision_period_id: int,
                          unifying_id_field_filter_set: Optional[Set[int]] = None,
                          metric_types_filter: Optional[Set[str]] = None):
        """Runs a test version of the program pipeline."""
        test_pipeline = TestPipeline()

        # Get StatePersons
        persons = (test_pipeline
                   | 'Load Persons' >>  # type: ignore
                   extractor_utils.BuildRootEntity(
                       dataset=dataset,
                       root_entity_class=entities.StatePerson,
                       unifying_id_field=entities.StatePerson.get_class_id_name(),
                       build_related_entities=True))

        # Get StateProgramAssignments
        program_assignments = (test_pipeline
                               | 'Load Program Assignments' >>  # type: ignore
                               extractor_utils.BuildRootEntity(
                                   dataset=dataset,
                                   root_entity_class=entities.StateProgramAssignment,
                                   unifying_id_field=entities.StatePerson.get_class_id_name(),
                                   build_related_entities=True,
                                   unifying_id_field_filter_set=unifying_id_field_filter_set))

        # Get StateAssessments
        assessments = (test_pipeline
                       | 'Load Assessments' >>  # type: ignore
                       extractor_utils.BuildRootEntity(
                           dataset=dataset,
                           root_entity_class=entities.StateAssessment,
                           unifying_id_field=entities.StatePerson.get_class_id_name(),
                           build_related_entities=False,
                           unifying_id_field_filter_set=unifying_id_field_filter_set))

        # Get StateSupervisionPeriods
        supervision_periods = (test_pipeline
                               | 'Load SupervisionPeriods' >>  # type: ignore
                               extractor_utils.BuildRootEntity(
                                   dataset=dataset,
                                   root_entity_class=entities.StateSupervisionPeriod,
                                   unifying_id_field=entities.StatePerson.get_class_id_name(),
                                   build_related_entities=False,
                                   unifying_id_field_filter_set=unifying_id_field_filter_set))

        supervision_period_to_agent_map = {
            'agent_id': 1010,
            'agent_external_id': 'OFFICER0009',
            'district_external_id': '10',
            'supervision_period_id': fake_supervision_period_id
        }

        supervision_period_to_agent_associations = (
            test_pipeline
            | 'Create SupervisionPeriod to Agent table' >>
            beam.Create([supervision_period_to_agent_map])
        )

        supervision_period_to_agent_associations_as_kv = (
            supervision_period_to_agent_associations |
            'Convert SupervisionPeriod to Agent table to KV tuples' >>
            beam.ParDo(ConvertDictToKVTuple(),
                       'supervision_period_id')
        )
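        # ConvertDictToKVTuple is assumed to re-key each row dict on the named
        # field, so the single fake row above becomes
        # (fake_supervision_period_id, supervision_period_to_agent_map).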

        state_race_ethnicity_population_count = {
            'state_code': 'US_XX',
            'race_or_ethnicity': 'BLACK',
            'population_count': 1,
            'representation_priority': 1
        }

        state_race_ethnicity_population_counts = (
            test_pipeline | 'Create state_race_ethnicity_population_count table' >> beam.Create(
                [state_race_ethnicity_population_count])
        )
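        # beam.Create turns the in-memory fake row into a bounded
        # PCollection, standing in for the BigQuery import used in the
        # production pipeline.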

        # Group each StatePerson with their other entities
        persons_entities = (
            {'person': persons,
             'program_assignments': program_assignments,
             'assessments': assessments,
             'supervision_periods': supervision_periods
             }
            | 'Group StatePerson to StateProgramAssignments' >>
            beam.CoGroupByKey()
        )

        # Identify ProgramEvents from the StatePerson's
        # StateProgramAssignments
        person_program_events = (
            persons_entities
            | beam.ParDo(pipeline.ClassifyProgramAssignments(),
                         AsDict(
                             supervision_period_to_agent_associations_as_kv
                         ))
        )
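        # AsDict presents the KV side input to the DoFn as a single dict per
        # window, allowing an O(1) agent lookup by supervision_period_id.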

        person_metadata = (persons
                           | "Build the person_metadata dictionary" >>
                           beam.ParDo(BuildPersonMetadata(),
                                      AsList(state_race_ethnicity_population_counts)))

        person_program_events_with_metadata = (
            {
                'person_events': person_program_events,
                'person_metadata': person_metadata
            }
            | 'Group ProgramEvents with person-level metadata' >> beam.CoGroupByKey()
            | 'Organize StatePerson, PersonMetadata and ProgramEvents for calculations' >>
            beam.ParDo(ExtractPersonEventsMetadata())
        )

        # Get pipeline job details for accessing job_id
        all_pipeline_options = PipelineOptions().get_all_options()

        # Add timestamp for local jobs
        job_timestamp = datetime.datetime.now().strftime('%Y-%m-%d_%H_%M_%S.%f')
        all_pipeline_options['job_timestamp'] = job_timestamp

        metric_types = metric_types_filter if metric_types_filter else {'ALL'}

        # Get program metrics
        program_metrics = (person_program_events_with_metadata
                           | 'Get Program Metrics' >>  # type: ignore
                           pipeline.GetProgramMetrics(
                               pipeline_options=all_pipeline_options,
                               metric_types=metric_types,
                               calculation_end_month=None,
                               calculation_month_count=-1))

        assert_that(program_metrics, AssertMatchers.validate_pipeline_test())

        test_pipeline.run()
Example #27
def run(apache_beam_pipeline_options: PipelineOptions,
        data_input: str,
        reference_view_input: str,
        static_reference_input: str,
        output: str,
        metric_types: List[str],
        state_code: Optional[str],
        person_filter_ids: Optional[List[int]]):
    """Runs the recidivism calculation pipeline."""

    # Workaround to load SQLAlchemy objects at start of pipeline. This is
    # necessary because the BuildRootEntity function tries to access attributes
    # of relationship properties on the SQLAlchemy room_schema_class before they
    # have been loaded. However, if *any* SQLAlchemy objects have been
    # instantiated, then the relationship properties are loaded and their
    # attributes can be successfully accessed.
    _ = schema.StatePerson()

    apache_beam_pipeline_options.view_as(SetupOptions).save_main_session = True

    # Get pipeline job details
    all_pipeline_options = apache_beam_pipeline_options.get_all_options()
    project_id = all_pipeline_options['project']

    input_dataset = project_id + '.' + data_input
    reference_dataset = project_id + '.' + reference_view_input
    static_reference_dataset = project_id + '.' + static_reference_input

    person_id_filter_set = set(person_filter_ids) if person_filter_ids else None

    with beam.Pipeline(options=apache_beam_pipeline_options) as p:
        # Get StatePersons
        persons = (p
                   | 'Load Persons' >>
                   BuildRootEntity(dataset=input_dataset, root_entity_class=entities.StatePerson,
                                   unifying_id_field=entities.StatePerson.get_class_id_name(),
                                   build_related_entities=True, unifying_id_field_filter_set=person_id_filter_set,
                                   state_code=state_code))

        # Get StateIncarcerationPeriods
        incarceration_periods = (p
                                 | 'Load IncarcerationPeriods' >>
                                 BuildRootEntity(dataset=input_dataset,
                                                 root_entity_class=entities.StateIncarcerationPeriod,
                                                 unifying_id_field=entities.StatePerson.get_class_id_name(),
                                                 build_related_entities=True,
                                                 unifying_id_field_filter_set=person_id_filter_set,
                                                 state_code=state_code
                                                 ))

        # Get StateSupervisionViolations
        supervision_violations = \
            (p
             | 'Load SupervisionViolations' >>
             BuildRootEntity(dataset=input_dataset, root_entity_class=entities.StateSupervisionViolation,
                             unifying_id_field=entities.StatePerson.get_class_id_name(), build_related_entities=True,
                             unifying_id_field_filter_set=person_id_filter_set,
                             state_code=state_code
                             ))

        # TODO(#2769): Don't bring this in as a root entity
        # Get StateSupervisionViolationResponses
        supervision_violation_responses = \
            (p
             | 'Load SupervisionViolationResponses' >>
             BuildRootEntity(dataset=input_dataset, root_entity_class=entities.StateSupervisionViolationResponse,
                             unifying_id_field=entities.StatePerson.get_class_id_name(), build_related_entities=True,
                             unifying_id_field_filter_set=person_id_filter_set,
                             state_code=state_code
                             ))

        # Group StateSupervisionViolationResponses and
        # StateSupervisionViolations by person_id
        supervision_violations_and_responses = (
            {'violations': supervision_violations,
             'violation_responses': supervision_violation_responses
             } | 'Group StateSupervisionViolationResponses to '
                 'StateSupervisionViolations' >>
            beam.CoGroupByKey()
        )

        # Set the fully hydrated StateSupervisionViolation entities on
        # the corresponding StateSupervisionViolationResponses
        violation_responses_with_hydrated_violations = (
            supervision_violations_and_responses
            | 'Set hydrated StateSupervisionViolations on '
              'the StateSupervisionViolationResponses' >>
            beam.ParDo(SetViolationOnViolationsResponse()))

        # Group StateIncarcerationPeriods and StateSupervisionViolationResponses
        # by person_id
        incarceration_periods_and_violation_responses = (
            {'incarceration_periods': incarceration_periods,
             'violation_responses':
                 violation_responses_with_hydrated_violations}
            | 'Group StateIncarcerationPeriods to '
              'StateSupervisionViolationResponses' >>
            beam.CoGroupByKey()
        )

        # Set the fully hydrated StateSupervisionViolationResponse entities on
        # the corresponding StateIncarcerationPeriods
        incarceration_periods_with_source_violations = (
            incarceration_periods_and_violation_responses
            | 'Set hydrated StateSupervisionViolationResponses on '
            'the StateIncarcerationPeriods' >>
            beam.ParDo(SetViolationResponseOnIncarcerationPeriod()))
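        # Same join-then-hydrate pattern as the violations step: CoGroupByKey
        # joins on person_id, then the DoFn attaches the hydrated children to
        # their parent entities before the next grouping stage.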

        # Group each StatePerson with their StateIncarcerationPeriods
        person_and_incarceration_periods = (
            {'person': persons,
             'incarceration_periods':
                 incarceration_periods_with_source_violations}
            | 'Group StatePerson to StateIncarcerationPeriods' >>
            beam.CoGroupByKey()
        )

        # Bring in the table that associates people and their county of residence
        person_id_to_county_kv = (p | 'Load person_id_to_county_kv' >> ImportTableAsKVTuples(
            dataset_id=reference_dataset,
            table_id=PERSONS_TO_RECENT_COUNTY_OF_RESIDENCE_VIEW_NAME,
            table_key='person_id',
            state_code_filter=state_code,
            person_id_filter_set=person_id_filter_set
        ))

        state_race_ethnicity_population_counts = (
                p | 'Load state_race_ethnicity_population_counts' >>
                ImportTable(
                    dataset_id=static_reference_dataset,
                    table_id='state_race_ethnicity_population_counts',
                    state_code_filter=state_code,
                    person_id_filter_set=None
                ))

        # Identify ReleaseEvents from the StatePerson's StateIncarcerationPeriods
        person_release_events = (
            person_and_incarceration_periods
            | "ClassifyReleaseEvents" >>
            beam.ParDo(ClassifyReleaseEvents(), AsDict(person_id_to_county_kv))
        )

        person_metadata = (persons
                           | "Build the person_metadata dictionary" >>
                           beam.ParDo(BuildPersonMetadata(),
                                      AsList(state_race_ethnicity_population_counts)))

        person_release_events_with_metadata = (
            {
                'person_events': person_release_events,
                'person_metadata': person_metadata
            }
            | 'Group ReleaseEvents with person-level metadata' >> beam.CoGroupByKey()
            | 'Organize StatePerson, PersonMetadata and ReleaseEvents for calculations' >>
            beam.ParDo(ExtractPersonReleaseEventsMetadata())
        )

        # Get pipeline job details for accessing job_id
        all_pipeline_options = apache_beam_pipeline_options.get_all_options()

        # Add timestamp for local jobs
        job_timestamp = datetime.datetime.now().strftime('%Y-%m-%d_%H_%M_%S.%f')
        all_pipeline_options['job_timestamp'] = job_timestamp

        # Get the types of metrics to calculate
        metric_types_set = set(metric_types)

        # Get recidivism metrics
        recidivism_metrics = (person_release_events_with_metadata
                              | 'Get Recidivism Metrics' >>
                              GetRecidivismMetrics(
                                  pipeline_options=all_pipeline_options,
                                  metric_types=metric_types_set))

        if person_id_filter_set:
            logging.warning("Non-empty person filter set - returning before writing metrics.")
            return

        # Convert the metrics into a format that's writable to BQ
        writable_metrics = (recidivism_metrics
                            | 'Convert to dict to be written to BQ' >>
                            beam.ParDo(RecidivizMetricWritableDict()).with_outputs(
                                ReincarcerationRecidivismMetricType.REINCARCERATION_RATE.value,
                                ReincarcerationRecidivismMetricType.REINCARCERATION_COUNT.value
                            ))
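        # with_outputs splits the ParDo into tagged streams; each tag is then
        # addressable as an attribute, e.g. writable_metrics.REINCARCERATION_RATE.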

        # Write the recidivism metrics to the output tables in BigQuery
        rates_table_id = DATAFLOW_METRICS_TO_TABLES.get(ReincarcerationRecidivismRateMetric)
        counts_table_id = DATAFLOW_METRICS_TO_TABLES.get(ReincarcerationRecidivismCountMetric)
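        # Note: dict.get() returns None on a missing key, which would silently
        # yield a "None" table name below; indexing with [] (as in the
        # supervision pipeline) fails fast instead.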

        _ = (writable_metrics.REINCARCERATION_RATE
             | f"Write rate metrics to BQ table: {rates_table_id}" >>
             beam.io.WriteToBigQuery(
                 table=rates_table_id,
                 dataset=output,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
                 method=beam.io.WriteToBigQuery.Method.FILE_LOADS
             ))

        _ = (writable_metrics.REINCARCERATION_COUNT
             | f"Write count metrics to BQ table: {counts_table_id}" >>
             beam.io.WriteToBigQuery(
                 table=counts_table_id,
                 dataset=output,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
                 method=beam.io.WriteToBigQuery.Method.FILE_LOADS
             ))
Example #28
def run(
    apache_beam_pipeline_options: PipelineOptions,
    data_input: str,
    reference_view_input: str,
    static_reference_input: str,
    output: str,
    calculation_month_count: int,
    metric_types: List[str],
    state_code: str,
    calculation_end_month: Optional[str],
    person_filter_ids: Optional[List[int]],
) -> None:
    """Runs the supervision calculation pipeline."""

    # Workaround to load SQLAlchemy objects at start of pipeline. This is necessary because the BuildRootEntity
    # function tries to access attributes of relationship properties on the SQLAlchemy room_schema_class before they
    # have been loaded. However, if *any* SQLAlchemy objects have been instantiated, then the relationship properties
    # are loaded and their attributes can be successfully accessed.
    _ = schema.StatePerson()

    apache_beam_pipeline_options.view_as(SetupOptions).save_main_session = True

    # Get pipeline job details
    all_pipeline_options = apache_beam_pipeline_options.get_all_options()
    project_id = all_pipeline_options["project"]

    if project_id is None:
        raise ValueError(
            f"No project set in pipeline options: {all_pipeline_options}")

    if state_code is None:
        raise ValueError("No state_code set for pipeline")

    input_dataset = project_id + "." + data_input
    reference_dataset = project_id + "." + reference_view_input
    static_reference_dataset = project_id + "." + static_reference_input

    person_id_filter_set = set(
        person_filter_ids) if person_filter_ids else None

    with beam.Pipeline(options=apache_beam_pipeline_options) as p:
        # Get StatePersons
        persons = p | "Load Persons" >> BuildRootEntity(
            dataset=input_dataset,
            root_entity_class=entities.StatePerson,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=True,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code,
        )

        # Get StateIncarcerationPeriods
        incarceration_periods = p | "Load IncarcerationPeriods" >> BuildRootEntity(
            dataset=input_dataset,
            root_entity_class=entities.StateIncarcerationPeriod,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=True,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code,
        )

        # Get StateSupervisionViolations
        supervision_violations = p | "Load SupervisionViolations" >> BuildRootEntity(
            dataset=input_dataset,
            root_entity_class=entities.StateSupervisionViolation,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=True,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code,
        )

        # TODO(#2769): Don't bring this in as a root entity
        # Get StateSupervisionViolationResponses
        supervision_violation_responses = (
            p
            | "Load SupervisionViolationResponses" >> BuildRootEntity(
                dataset=input_dataset,
                root_entity_class=entities.StateSupervisionViolationResponse,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=True,
                unifying_id_field_filter_set=person_id_filter_set,
                state_code=state_code,
            ))

        # Get StateSupervisionSentences
        supervision_sentences = p | "Load SupervisionSentences" >> BuildRootEntity(
            dataset=input_dataset,
            root_entity_class=entities.StateSupervisionSentence,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=True,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code,
        )

        # Get StateIncarcerationSentences
        incarceration_sentences = p | "Load IncarcerationSentences" >> BuildRootEntity(
            dataset=input_dataset,
            root_entity_class=entities.StateIncarcerationSentence,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=True,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code,
        )

        # Get StateSupervisionPeriods
        supervision_periods = p | "Load SupervisionPeriods" >> BuildRootEntity(
            dataset=input_dataset,
            root_entity_class=entities.StateSupervisionPeriod,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=True,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code,
        )

        # Get StateAssessments
        assessments = p | "Load Assessments" >> BuildRootEntity(
            dataset=input_dataset,
            root_entity_class=entities.StateAssessment,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=False,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code,
        )

        supervision_contacts = p | "Load StateSupervisionContacts" >> BuildRootEntity(
            dataset=input_dataset,
            root_entity_class=entities.StateSupervisionContact,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=False,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code,
        )

        supervision_period_to_agent_associations_as_kv = (
            p
            | "Load supervision_period_to_agent_associations_as_kv" >>
            ImportTableAsKVTuples(
                dataset_id=reference_dataset,
                table_id=SUPERVISION_PERIOD_TO_AGENT_ASSOCIATION_VIEW_NAME,
                table_key="person_id",
                state_code_filter=state_code,
                person_id_filter_set=person_id_filter_set,
            ))

        # Bring in the judicial districts associated with supervision_periods
        sp_to_judicial_district_kv = (
            p
            | "Load sp_to_judicial_district_kv" >> ImportTableAsKVTuples(
                dataset_id=reference_dataset,
                table_id=SUPERVISION_PERIOD_JUDICIAL_DISTRICT_ASSOCIATION_VIEW_NAME,
                state_code_filter=state_code,
                person_id_filter_set=person_id_filter_set,
                table_key="person_id",
            ))

        state_race_ethnicity_population_counts = (
            p
            | "Load state_race_ethnicity_population_counts" >> ImportTable(
                dataset_id=static_reference_dataset,
                table_id="state_race_ethnicity_population_counts",
                state_code_filter=state_code,
                person_id_filter_set=None,
            ))

        if state_code == "US_MO":
            # Bring in the reference table that includes sentence status ranking information
            us_mo_sentence_status_query = select_all_by_person_query(
                reference_dataset,
                US_MO_SENTENCE_STATUSES_VIEW_NAME,
                state_code,
                person_id_filter_set,
            )

            us_mo_sentence_statuses = (
                p
                | "Read MO sentence status table from BigQuery" >>
                ReadFromBigQuery(query=us_mo_sentence_status_query))
        else:
            us_mo_sentence_statuses = (
                p
                | f"Generate empty MO statuses list for non-MO state run: {state_code} "
                >> beam.Create([]))
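        # For non-MO states the empty PCollection keeps the downstream
        # CoGroupByKey inputs structurally identical, so the pipeline graph
        # does not branch on state_code beyond this point.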

        us_mo_sentence_status_rankings_as_kv = (
            us_mo_sentence_statuses
            | "Convert MO sentence status ranking table to KV tuples" >>
            beam.ParDo(ConvertDictToKVTuple(), "person_id"))

        sentences_and_statuses = (
            {
                "incarceration_sentences": incarceration_sentences,
                "supervision_sentences": supervision_sentences,
                "sentence_statuses": us_mo_sentence_status_rankings_as_kv,
            }
            | "Group sentences to the sentence statuses for that person" >>
            beam.CoGroupByKey())

        sentences_converted = (
            sentences_and_statuses
            | "Convert to state-specific sentences" >> beam.ParDo(
                ConvertSentencesToStateSpecificType()).with_outputs(
                    "incarceration_sentences", "supervision_sentences"))

        # Group StateSupervisionViolationResponses and StateSupervisionViolations by person_id
        supervision_violations_and_responses = (
            {
                "violations": supervision_violations,
                "violation_responses": supervision_violation_responses,
            }
            | "Group StateSupervisionViolationResponses to "
            "StateSupervisionViolations" >> beam.CoGroupByKey())

        # Set the fully hydrated StateSupervisionViolation entities on the corresponding
        # StateSupervisionViolationResponses
        violation_responses_with_hydrated_violations = (
            supervision_violations_and_responses
            | "Set hydrated StateSupervisionViolations on "
            "the StateSupervisionViolationResponses" >> beam.ParDo(
                SetViolationOnViolationsResponse()))

        # Group StateIncarcerationPeriods and StateSupervisionViolationResponses by person_id
        incarceration_periods_and_violation_responses = (
            {
                "incarceration_periods":
                incarceration_periods,
                "violation_responses":
                violation_responses_with_hydrated_violations,
            }
            | "Group StateIncarcerationPeriods to "
            "StateSupervisionViolationResponses" >> beam.CoGroupByKey())

        # Set the fully hydrated StateSupervisionViolationResponse entities on the corresponding
        # StateIncarcerationPeriods
        incarceration_periods_with_source_violations = (
            incarceration_periods_and_violation_responses
            | "Set hydrated StateSupervisionViolationResponses on "
            "the StateIncarcerationPeriods" >> beam.ParDo(
                SetViolationResponseOnIncarcerationPeriod()))

        # Group each StatePerson with their related entities
        person_entities = {
            "person":
            persons,
            "assessments":
            assessments,
            "incarceration_periods":
            incarceration_periods_with_source_violations,
            "supervision_periods":
            supervision_periods,
            "supervision_sentences":
            sentences_converted.supervision_sentences,
            "incarceration_sentences":
            sentences_converted.incarceration_sentences,
            "violation_responses":
            violation_responses_with_hydrated_violations,
            "supervision_contacts":
            supervision_contacts,
            "supervision_period_judicial_district_association":
            sp_to_judicial_district_kv,
            "supervision_period_to_agent_association":
            supervision_period_to_agent_associations_as_kv,
        } | "Group StatePerson to all entities" >> beam.CoGroupByKey()

        # Identify SupervisionTimeBuckets from the StatePerson's
        # StateSupervisionSentences and StateIncarcerationPeriods
        person_time_buckets = (person_entities
                               | "Get SupervisionTimeBuckets" >> beam.ParDo(
                                   ClassifySupervisionTimeBuckets()))

        person_metadata = (
            persons
            | "Build the person_metadata dictionary" >> beam.ParDo(
                BuildPersonMetadata(),
                AsList(state_race_ethnicity_population_counts)))

        person_time_buckets_with_metadata = (
            {
                "person_events": person_time_buckets,
                "person_metadata": person_metadata
            }
            | "Group SupervisionTimeBuckets with person-level metadata" >>
            beam.CoGroupByKey()
            |
            "Organize StatePerson, PersonMetadata and SupervisionTimeBuckets for calculations"
            >> beam.ParDo(ExtractPersonEventsMetadata()))

        # Get pipeline job details for accessing job_id
        all_pipeline_options = apache_beam_pipeline_options.get_all_options()

        # Get the types of metrics to calculate
        metric_types_set = set(metric_types)

        # Add timestamp for local jobs
        job_timestamp = datetime.datetime.now().strftime(
            "%Y-%m-%d_%H_%M_%S.%f")
        all_pipeline_options["job_timestamp"] = job_timestamp

        # Get supervision metrics
        supervision_metrics = (
            person_time_buckets_with_metadata
            | "Get Supervision Metrics" >> GetSupervisionMetrics(
                pipeline_options=all_pipeline_options,
                metric_types=metric_types_set,
                calculation_end_month=calculation_end_month,
                calculation_month_count=calculation_month_count,
            ))
        if person_id_filter_set:
            logging.warning(
                "Non-empty person filter set - returning before writing metrics."
            )
            return
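        # Metrics are only persisted for unfiltered runs: a person_id filter
        # implies a local or debugging invocation, and the early return above
        # keeps partial results out of the BigQuery output tables.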

        # Convert the metrics into a format that's writable to BQ
        writable_metrics = (
            supervision_metrics
            | "Convert to dict to be written to BQ" >>
            beam.ParDo(RecidivizMetricWritableDict()).with_outputs(
                SupervisionMetricType.SUPERVISION_COMPLIANCE.value,
                SupervisionMetricType.SUPERVISION_POPULATION.value,
                SupervisionMetricType.SUPERVISION_REVOCATION.value,
                SupervisionMetricType.SUPERVISION_REVOCATION_ANALYSIS.value,
                SupervisionMetricType.SUPERVISION_START.value,
                SupervisionMetricType.SUPERVISION_SUCCESS.value,
                SupervisionMetricType.SUPERVISION_SUCCESSFUL_SENTENCE_DAYS_SERVED.value,
                SupervisionMetricType.SUPERVISION_TERMINATION.value,
                SupervisionMetricType.SUPERVISION_OUT_OF_STATE_POPULATION.value,
                SupervisionMetricType.SUPERVISION_DOWNGRADE.value,
            ))

        terminations_table_id = DATAFLOW_METRICS_TO_TABLES[
            SupervisionTerminationMetric]
        compliance_table_id = DATAFLOW_METRICS_TO_TABLES[
            SupervisionCaseComplianceMetric]
        populations_table_id = DATAFLOW_METRICS_TO_TABLES[
            SupervisionPopulationMetric]
        revocations_table_id = DATAFLOW_METRICS_TO_TABLES[
            SupervisionRevocationMetric]
        revocation_analysis_table_id = DATAFLOW_METRICS_TO_TABLES[
            SupervisionRevocationAnalysisMetric]
        successes_table_id = DATAFLOW_METRICS_TO_TABLES[
            SupervisionSuccessMetric]
        successful_sentence_lengths_table_id = DATAFLOW_METRICS_TO_TABLES[
            SuccessfulSupervisionSentenceDaysServedMetric]
        supervision_starts_table_id = DATAFLOW_METRICS_TO_TABLES[
            SupervisionStartMetric]
        out_of_state_populations_table_id = DATAFLOW_METRICS_TO_TABLES[
            SupervisionOutOfStatePopulationMetric]
        supervision_downgrade_table_id = DATAFLOW_METRICS_TO_TABLES[
            SupervisionDowngradeMetric]

        _ = (writable_metrics.SUPERVISION_POPULATION
             | f"Write population metrics to BQ table: {populations_table_id}"
             >> WriteAppendToBigQuery(
                 output_table=populations_table_id,
                 output_dataset=output,
             ))

        _ = (writable_metrics.SUPERVISION_OUT_OF_STATE_POPULATION
             | f"Write out of state population metrics to BQ table: "
             f"{out_of_state_populations_table_id}" >> WriteAppendToBigQuery(
                 output_table=out_of_state_populations_table_id,
                 output_dataset=output,
             ))

        _ = (writable_metrics.SUPERVISION_REVOCATION
             | f"Write revocation metrics to BQ table: {revocations_table_id}"
             >> WriteAppendToBigQuery(
                 output_table=revocations_table_id,
                 output_dataset=output,
             ))

        _ = (writable_metrics.SUPERVISION_SUCCESS
             | f"Write success metrics to BQ table: {successes_table_id}" >>
             WriteAppendToBigQuery(
                 output_table=successes_table_id,
                 output_dataset=output,
             ))

        _ = (writable_metrics.SUPERVISION_SUCCESSFUL_SENTENCE_DAYS_SERVED
             | f"Write supervision successful sentence length metrics to BQ"
             f" table: {successful_sentence_lengths_table_id}" >>
             WriteAppendToBigQuery(
                 output_table=successful_sentence_lengths_table_id,
                 output_dataset=output,
             ))

        _ = (writable_metrics.SUPERVISION_TERMINATION
             | f"Write termination metrics to BQ table: {terminations_table_id}"
             >> WriteAppendToBigQuery(
                 output_table=terminations_table_id,
                 output_dataset=output,
             ))

        _ = (writable_metrics.SUPERVISION_REVOCATION_ANALYSIS
             | f"Write revocation analyses metrics to BQ table: "
             f"{revocation_analysis_table_id}" >> WriteAppendToBigQuery(
                 output_table=revocation_analysis_table_id,
                 output_dataset=output,
             ))

        _ = (writable_metrics.SUPERVISION_COMPLIANCE
             | f"Write compliance metrics to BQ table: {compliance_table_id}"
             >> WriteAppendToBigQuery(
                 output_table=compliance_table_id,
                 output_dataset=output,
             ))

        _ = (writable_metrics.SUPERVISION_START
             | f"Write start metrics to BQ table: {supervision_starts_table_id}"
             >> WriteAppendToBigQuery(
                 output_table=supervision_starts_table_id,
                 output_dataset=output,
             ))

        _ = (writable_metrics.SUPERVISION_DOWNGRADE
             | f"Write downgrade metrics to BQ table: {supervision_downgrade_table_id}"
             >> WriteAppendToBigQuery(
                 output_table=supervision_downgrade_table_id,
                 output_dataset=output,
             ))