Пример #1
0
def run(
    apache_beam_pipeline_options: PipelineOptions,
    data_input: str,
    reference_view_input: str,
    static_reference_input: str,
    output: str,
    calculation_month_count: int,
    metric_types: List[str],
    state_code: str,
    calculation_end_month: Optional[str],
    person_filter_ids: Optional[List[int]],
) -> None:
    """Runs the supervision calculation pipeline.

    Builds a Beam pipeline that loads the state entities for `state_code`,
    groups them with their related entities and reference tables, classifies
    them into supervision time buckets, calculates supervision metrics and
    appends each metric type to its own BigQuery table in `output`.

    Args:
        apache_beam_pipeline_options: Options used to construct the pipeline.
            `save_main_session` is switched on, and the "project" option is
            used as the prefix of every input dataset name.
        data_input: Name (without project prefix) of the dataset the state
            entities are read from.
        reference_view_input: Name (without project prefix) of the dataset
            holding the reference views.
        static_reference_input: Name (without project prefix) of the dataset
            holding the static reference tables.
        output: Dataset the metric tables are written to.
        calculation_month_count: Passed through to GetSupervisionMetrics.
        metric_types: Metric types to calculate; de-duplicated into a set
            before being passed to GetSupervisionMetrics.
        state_code: State to run the pipeline for. For "US_MO" the sentence
            status reference view is read as well; for every other state an
            empty collection is used in its place.
        calculation_end_month: Passed through to GetSupervisionMetrics.
        person_filter_ids: Optional person ids used to filter the loaded
            entities and reference tables. When non-empty, the function
            returns before any BigQuery write step is added.

    Raises:
        ValueError: If the pipeline options have no project set, or if
            `state_code` is None.
    """

    # Workaround to load SQLAlchemy objects at start of pipeline. This is necessary because the BuildRootEntity
    # function tries to access attributes of relationship properties on the SQLAlchemy root schema class before they
    # have been loaded. However, if *any* SQLAlchemy objects have been instantiated, then the relationship properties
    # are loaded and their attributes can be successfully accessed.
    _ = schema.StatePerson()

    apache_beam_pipeline_options.view_as(SetupOptions).save_main_session = True

    # Get pipeline job details
    all_pipeline_options = apache_beam_pipeline_options.get_all_options()
    # .get() so that an absent key reaches the descriptive ValueError below
    # instead of surfacing as a bare KeyError.
    project_id = all_pipeline_options.get("project")

    if project_id is None:
        raise ValueError(
            f"No project set in pipeline options: {all_pipeline_options}")

    if state_code is None:
        raise ValueError("No state_code set for pipeline")

    input_dataset = project_id + "." + data_input
    reference_dataset = project_id + "." + reference_view_input
    static_reference_dataset = project_id + "." + static_reference_input

    # An empty or missing filter list means "no filtering".
    person_id_filter_set = set(
        person_filter_ids) if person_filter_ids else None

    def load_root_entity(
        pipeline: beam.Pipeline,
        label: str,
        root_entity_class: type,
        build_related_entities: bool = True,
    ):
        """Applies a BuildRootEntity step named `label` for one entity class.

        Every load reads from `input_dataset`, uses the StatePerson id field
        as the unifying id, and shares the same person filter and state code.
        """
        return pipeline | label >> BuildRootEntity(
            dataset=input_dataset,
            root_entity_class=root_entity_class,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=build_related_entities,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code,
        )

    with beam.Pipeline(options=apache_beam_pipeline_options) as p:
        # Get StatePersons
        persons = load_root_entity(p, "Load Persons", entities.StatePerson)

        # Get StateIncarcerationPeriods
        incarceration_periods = load_root_entity(
            p, "Load IncarcerationPeriods", entities.StateIncarcerationPeriod)

        # Get StateSupervisionViolations
        supervision_violations = load_root_entity(
            p, "Load SupervisionViolations",
            entities.StateSupervisionViolation)

        # TODO(#2769): Don't bring this in as a root entity
        # Get StateSupervisionViolationResponses
        supervision_violation_responses = load_root_entity(
            p, "Load SupervisionViolationResponses",
            entities.StateSupervisionViolationResponse)

        # Get StateSupervisionSentences
        supervision_sentences = load_root_entity(
            p, "Load SupervisionSentences", entities.StateSupervisionSentence)

        # Get StateIncarcerationSentences
        incarceration_sentences = load_root_entity(
            p, "Load IncarcerationSentences",
            entities.StateIncarcerationSentence)

        # Get StateSupervisionPeriods
        supervision_periods = load_root_entity(
            p, "Load SupervisionPeriods", entities.StateSupervisionPeriod)

        # Get StateAssessments (loaded without related entities)
        assessments = load_root_entity(
            p, "Load Assessments", entities.StateAssessment,
            build_related_entities=False)

        # Get StateSupervisionContacts (loaded without related entities)
        supervision_contacts = load_root_entity(
            p, "Load StateSupervisionContacts",
            entities.StateSupervisionContact,
            build_related_entities=False)

        # Bring in the supervision period to agent associations
        supervision_period_to_agent_associations_as_kv = (
            p
            | "Load supervision_period_to_agent_associations_as_kv" >>
            ImportTableAsKVTuples(
                dataset_id=reference_dataset,
                table_id=SUPERVISION_PERIOD_TO_AGENT_ASSOCIATION_VIEW_NAME,
                table_key="person_id",
                state_code_filter=state_code,
                person_id_filter_set=person_id_filter_set,
            ))

        # Bring in the judicial districts associated with supervision_periods
        sp_to_judicial_district_kv = (
            p
            | "Load sp_to_judicial_district_kv" >> ImportTableAsKVTuples(
                dataset_id=reference_dataset,
                table_id=
                SUPERVISION_PERIOD_JUDICIAL_DISTRICT_ASSOCIATION_VIEW_NAME,
                state_code_filter=state_code,
                person_id_filter_set=person_id_filter_set,
                table_key="person_id",
            ))

        # This table is read without a person filter; it is later handed to
        # BuildPersonMetadata as a side input.
        state_race_ethnicity_population_counts = (
            p
            | "Load state_race_ethnicity_population_counts" >> ImportTable(
                dataset_id=static_reference_dataset,
                table_id="state_race_ethnicity_population_counts",
                state_code_filter=state_code,
                person_id_filter_set=None,
            ))

        if state_code == "US_MO":
            # Bring in the reference table that includes sentence status ranking information
            us_mo_sentence_status_query = select_all_by_person_query(
                reference_dataset,
                US_MO_SENTENCE_STATUSES_VIEW_NAME,
                state_code,
                person_id_filter_set,
            )

            us_mo_sentence_statuses = (
                p
                | "Read MO sentence status table from BigQuery" >>
                ReadFromBigQuery(query=us_mo_sentence_status_query))
        else:
            # Non-MO runs still need a collection here so the grouping below
            # has the same shape for every state.
            us_mo_sentence_statuses = (
                p
                |
                f"Generate empty MO statuses list for non-MO state run: {state_code} "
                >> beam.Create([]))

        us_mo_sentence_status_rankings_as_kv = (
            us_mo_sentence_statuses
            | "Convert MO sentence status ranking table to KV tuples" >>
            beam.ParDo(ConvertDictToKVTuple(), "person_id"))

        sentences_and_statuses = (
            {
                "incarceration_sentences": incarceration_sentences,
                "supervision_sentences": supervision_sentences,
                "sentence_statuses": us_mo_sentence_status_rankings_as_kv,
            }
            | "Group sentences to the sentence statuses for that person" >>
            beam.CoGroupByKey())

        sentences_converted = (
            sentences_and_statuses
            | "Convert to state-specific sentences" >> beam.ParDo(
                ConvertSentencesToStateSpecificType()).with_outputs(
                    "incarceration_sentences", "supervision_sentences"))

        # Group StateSupervisionViolationResponses and StateSupervisionViolations by person_id
        supervision_violations_and_responses = (
            {
                "violations": supervision_violations,
                "violation_responses": supervision_violation_responses,
            }
            | "Group StateSupervisionViolationResponses to "
            "StateSupervisionViolations" >> beam.CoGroupByKey())

        # Set the fully hydrated StateSupervisionViolation entities on the corresponding
        # StateSupervisionViolationResponses
        violation_responses_with_hydrated_violations = (
            supervision_violations_and_responses
            | "Set hydrated StateSupervisionViolations on "
            "the StateSupervisionViolationResponses" >> beam.ParDo(
                SetViolationOnViolationsResponse()))

        # Group StateIncarcerationPeriods and StateSupervisionViolationResponses by person_id
        incarceration_periods_and_violation_responses = (
            {
                "incarceration_periods":
                incarceration_periods,
                "violation_responses":
                violation_responses_with_hydrated_violations,
            }
            | "Group StateIncarcerationPeriods to "
            "StateSupervisionViolationResponses" >> beam.CoGroupByKey())

        # Set the fully hydrated StateSupervisionViolationResponse entities on the corresponding
        # StateIncarcerationPeriods
        incarceration_periods_with_source_violations = (
            incarceration_periods_and_violation_responses
            | "Set hydrated StateSupervisionViolationResponses on "
            "the StateIncarcerationPeriods" >> beam.ParDo(
                SetViolationResponseOnIncarcerationPeriod()))

        # Group each StatePerson with their related entities
        person_entities = {
            "person":
            persons,
            "assessments":
            assessments,
            "incarceration_periods":
            incarceration_periods_with_source_violations,
            "supervision_periods":
            supervision_periods,
            "supervision_sentences":
            sentences_converted.supervision_sentences,
            "incarceration_sentences":
            sentences_converted.incarceration_sentences,
            "violation_responses":
            violation_responses_with_hydrated_violations,
            "supervision_contacts":
            supervision_contacts,
            "supervision_period_judicial_district_association":
            sp_to_judicial_district_kv,
            "supervision_period_to_agent_association":
            supervision_period_to_agent_associations_as_kv,
        } | "Group StatePerson to all entities" >> beam.CoGroupByKey()

        # Identify SupervisionTimeBuckets from the StatePerson's StateSupervisionSentences and StateIncarcerationPeriods
        person_time_buckets = (person_entities
                               | "Get SupervisionTimeBuckets" >> beam.ParDo(
                                   ClassifySupervisionTimeBuckets()))

        person_metadata = (
            persons
            | "Build the person_metadata dictionary" >> beam.ParDo(
                BuildPersonMetadata(),
                AsList(state_race_ethnicity_population_counts)))

        person_time_buckets_with_metadata = (
            {
                "person_events": person_time_buckets,
                "person_metadata": person_metadata
            }
            | "Group SupervisionTimeBuckets with person-level metadata" >>
            beam.CoGroupByKey()
            |
            "Organize StatePerson, PersonMetadata and SupervisionTimeBuckets for calculations"
            >> beam.ParDo(ExtractPersonEventsMetadata()))

        # Get pipeline job details for accessing job_id
        # NOTE(review): the options are re-read here rather than reusing the
        # dict fetched before the pipeline was constructed; kept as-is in case
        # pipeline construction alters the options - confirm.
        all_pipeline_options = apache_beam_pipeline_options.get_all_options()

        # Get the type of metric to calculate
        metric_types_set = set(metric_types)

        # Add timestamp for local jobs
        job_timestamp = datetime.datetime.now().strftime(
            "%Y-%m-%d_%H_%M_%S.%f")
        all_pipeline_options["job_timestamp"] = job_timestamp

        # Get supervision metrics
        supervision_metrics = (
            person_time_buckets_with_metadata
            | "Get Supervision Metrics" >> GetSupervisionMetrics(
                pipeline_options=all_pipeline_options,
                metric_types=metric_types_set,
                calculation_end_month=calculation_end_month,
                calculation_month_count=calculation_month_count,
            ))

        if person_id_filter_set:
            # NOTE(review): returning here only skips adding the write steps;
            # leaving the `with` block normally presumably still runs the
            # steps applied so far - confirm this is the intended behavior.
            logging.warning(
                "Non-empty person filter set - returning before writing metrics."
            )
            return

        # Convert the metrics into a format that's writable to BQ, with one
        # tagged output per metric type.
        writable_metrics = (
            supervision_metrics
            | "Convert to dict to be written to BQ" >>
            beam.ParDo(RecidivizMetricWritableDict()).with_outputs(
                SupervisionMetricType.SUPERVISION_COMPLIANCE.value,
                SupervisionMetricType.SUPERVISION_POPULATION.value,
                SupervisionMetricType.SUPERVISION_REVOCATION.value,
                SupervisionMetricType.SUPERVISION_REVOCATION_ANALYSIS.value,
                SupervisionMetricType.SUPERVISION_START.value,
                SupervisionMetricType.SUPERVISION_SUCCESS.value,
                SupervisionMetricType.
                SUPERVISION_SUCCESSFUL_SENTENCE_DAYS_SERVED.value,
                SupervisionMetricType.SUPERVISION_TERMINATION.value,
                SupervisionMetricType.SUPERVISION_OUT_OF_STATE_POPULATION.
                value,
                SupervisionMetricType.SUPERVISION_DOWNGRADE.value,
            ))

        # One row per metric type: (tagged output, metric class used to look
        # up the destination table, step-label prefix). The table id is
        # appended to the prefix to form the step label.
        metric_writes = (
            (writable_metrics.SUPERVISION_POPULATION,
             SupervisionPopulationMetric,
             "Write population metrics to BQ table: "),
            (writable_metrics.SUPERVISION_OUT_OF_STATE_POPULATION,
             SupervisionOutOfStatePopulationMetric,
             "Write out of state population metrics to BQ table: "),
            (writable_metrics.SUPERVISION_REVOCATION,
             SupervisionRevocationMetric,
             "Write revocation metrics to BQ table: "),
            (writable_metrics.SUPERVISION_SUCCESS,
             SupervisionSuccessMetric,
             "Write success metrics to BQ table: "),
            (writable_metrics.SUPERVISION_SUCCESSFUL_SENTENCE_DAYS_SERVED,
             SuccessfulSupervisionSentenceDaysServedMetric,
             "Write supervision successful sentence length metrics to BQ"
             " table: "),
            (writable_metrics.SUPERVISION_TERMINATION,
             SupervisionTerminationMetric,
             "Write termination metrics to BQ table: "),
            (writable_metrics.SUPERVISION_REVOCATION_ANALYSIS,
             SupervisionRevocationAnalysisMetric,
             "Write revocation analyses metrics to BQ table: "),
            (writable_metrics.SUPERVISION_COMPLIANCE,
             SupervisionCaseComplianceMetric,
             "Write compliance metrics to BQ table: "),
            (writable_metrics.SUPERVISION_START,
             SupervisionStartMetric,
             "Write start metrics to BQ table: "),
            (writable_metrics.SUPERVISION_DOWNGRADE,
             SupervisionDowngradeMetric,
             "Write downgrade metrics to BQ table: "),
        )

        for tagged_metrics, metric_class, label_prefix in metric_writes:
            table_id = DATAFLOW_METRICS_TO_TABLES[metric_class]
            _ = (tagged_metrics
                 | f"{label_prefix}{table_id}" >> WriteAppendToBigQuery(
                     output_table=table_id,
                     output_dataset=output,
                 ))
Пример #2
0
    def execute_pipeline(
        self,
        pipeline: beam.Pipeline,
        all_pipeline_options: Dict[str, Any],
        state_code: str,
        input_dataset: str,
        reference_dataset: str,
        static_reference_dataset: str,
        metric_types: List[str],
        person_id_filter_set: Optional[Set[int]],
        calculation_month_count: int = -1,
        calculation_end_month: Optional[str] = None,
    ) -> beam.Pipeline:
        """Applies the program metric calculation steps to `pipeline`.

        Loads persons, program assignments, assessments and supervision
        periods, groups them together with the supervision period to agent
        associations, classifies the grouped entities into events, attaches
        person-level metadata and applies GetMetrics.

        Args:
            pipeline: Pipeline the steps are applied to.
            all_pipeline_options: Options dict handed to GetMetrics. It is
                modified in place: a "job_timestamp" entry is added.
            state_code: State used to filter every load.
            input_dataset: Dataset the root entities are read from.
            reference_dataset: Dataset holding the supervision period to
                agent association view.
            static_reference_dataset: Dataset holding the
                state_race_ethnicity_population_counts table.
            metric_types: Metric types to include; converted to a set.
            person_id_filter_set: Optional person ids used to filter the
                entity loads and the agent association load.
            calculation_month_count: Passed through to GetMetrics.
            calculation_end_month: Passed through to GetMetrics.

        Returns:
            The result of applying the "Get Program Metrics" step.
        """

        def read_root_entity(step_name: str, entity_class: type,
                             with_related: bool):
            # Every root entity shares the dataset, the StatePerson id field
            # as unifying id, the person filter and the state code.
            loader = BuildRootEntity(
                dataset=input_dataset,
                root_entity_class=entity_class,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=with_related,
                unifying_id_field_filter_set=person_id_filter_set,
                state_code=state_code,
            )
            return pipeline | step_name >> loader

        # Root entities: persons and program assignments are loaded with
        # their related entities, assessments and supervision periods without.
        persons = read_root_entity(
            "Load Persons", entities.StatePerson, True)
        program_assignments = read_root_entity(
            "Load Program Assignments", entities.StateProgramAssignment, True)
        assessments = read_root_entity(
            "Load Assessments", entities.StateAssessment, False)
        supervision_periods = read_root_entity(
            "Load SupervisionPeriods", entities.StateSupervisionPeriod, False)

        # Reference view: supervision period to agent associations, keyed on
        # the "person_id" column.
        agent_association_reader = ImportTableAsKVTuples(
            dataset_id=reference_dataset,
            table_id=SUPERVISION_PERIOD_TO_AGENT_ASSOCIATION_VIEW_NAME,
            table_key="person_id",
            state_code_filter=state_code,
            person_id_filter_set=person_id_filter_set,
        )
        agent_associations_kv = (
            pipeline
            | "Load supervision_period_to_agent_associations_as_kv" >>
            agent_association_reader)

        # Static reference table, read without a person filter.
        population_counts_reader = ImportTable(
            dataset_id=static_reference_dataset,
            table_id="state_race_ethnicity_population_counts",
            state_code_filter=state_code,
            person_id_filter_set=None,
        )
        population_counts = (
            pipeline
            | "Load state_race_ethnicity_population_counts" >>
            population_counts_reader)

        # Group each StatePerson with their other entities
        entities_by_name = {
            "person": persons,
            "program_assignments": program_assignments,
            "assessments": assessments,
            "supervision_periods": supervision_periods,
            "supervision_period_to_agent_association": agent_associations_kv,
        }
        grouped_person_entities = (
            entities_by_name
            | "Group StatePerson to StateProgramAssignments and" >>
            beam.CoGroupByKey())

        # Identify ProgramEvents from the StatePerson's StateProgramAssignments
        classify_step = beam.ParDo(
            ClassifyEvents(), identifier=self.pipeline_config.identifier)
        program_events = grouped_person_entities | classify_step

        # The population counts are supplied to BuildPersonMetadata as a
        # list side input.
        metadata_step = beam.ParDo(
            BuildPersonMetadata(),
            state_race_ethnicity_population_counts=AsList(population_counts),
        )
        metadata = (
            persons
            | "Build the person_metadata dictionary" >> metadata_step)

        events_and_metadata = {
            "person_events": program_events,
            "person_metadata": metadata,
        }
        grouped_events_and_metadata = (
            events_and_metadata
            | "Group ProgramEvents with person-level metadata" >>
            beam.CoGroupByKey())
        calculation_inputs = (
            grouped_events_and_metadata
            |
            "Organize StatePerson, PersonMetadata and ProgramEvents for calculations"
            >> beam.ParDo(ExtractPersonEventsMetadata()))

        # Add timestamp for local jobs
        all_pipeline_options["job_timestamp"] = datetime.datetime.now(
        ).strftime("%Y-%m-%d_%H_%M_%S.%f")

        # Get program metrics for the requested metric types
        metrics_step = GetMetrics(
            pipeline_options=all_pipeline_options,
            pipeline_config=self.pipeline_config,
            metric_types_to_include=set(metric_types),
            calculation_end_month=calculation_end_month,
            calculation_month_count=calculation_month_count,
        )
        return calculation_inputs | "Get Program Metrics" >> metrics_step