def setUp(self) -> None:
        """Prepares the view builder and expected output columns for this test case.

        Collects the ingest view builders for STATE_CODE and keeps the one whose
        file tag is "sci_incarceration_period".
        """
        super().setUp()
        region = get_region(STATE_CODE, is_direct_ingest=True)
        collector = DirectIngestPreProcessedIngestViewCollector(region, [])
        candidate_builders = collector.collect_view_builders()
        self.view_builder = one(
            builder
            for builder in candidate_builders
            if builder.file_tag == "sci_incarceration_period"
        )

        # Column names the query results are expected to have, in order.
        self.expected_result_columns = [
            # Identifiers
            "control_number",
            "inmate_number",
            "sequence_number",
            # Period boundaries and location
            "start_movement_date",
            "end_movement_date",
            "location",
            # Status / movement codes at each boundary
            "start_sentence_status_code",
            "end_sentence_status_code",
            "start_parole_status_code",
            "end_parole_status_code",
            "start_movement_code",
            "end_movement_code",
            # Flags and sentence type
            "start_is_new_revocation",
            "start_is_admin_edge",
            "end_is_admin_edge",
            "sentence_type",
        ]
示例#2
0
def get_ingest_view_configs(
    region_code: str, ) -> List[DataDiscoveryStandardizedFileConfig]:
    """Collect ingest views for region; reads columns from their corresponding fixture csv

    Args:
        region_code: State code of the region; it is lower-cased before being used
            to look up the region and its fixture directory.

    Returns:
        One DataDiscoveryStandardizedFileConfig per ingest view that has a fixture
        csv. Views without a fixture file are skipped.

    Raises:
        ValueError: If `region_code` is not a valid state code.
    """
    if not StateCode.is_state_code(region_code):
        raise ValueError(
            f"Unknown region_code [{region_code}] received, must be a valid state code."
        )

    region_code = region_code.lower()

    views = DirectIngestPreProcessedIngestViewCollector(
        get_region(region_code, True), []).collect_view_builders()

    configs = []
    for view in views:
        # TODO(#6925) Infer columns from the mapping file rather than the fixture csv
        fixture_path = os.path.join(
            os.path.dirname(recidiviz.__file__),
            f"tests/ingest/direct/direct_ingest_fixtures/{region_code}/{view.ingest_view_name}.csv",
        )

        # Only the file read can raise FileNotFoundError, so keep the try minimal.
        try:
            with open(fixture_path, "r") as f:
                header_line = f.readline()
        except FileNotFoundError:
            # No fixture for this view; nothing to infer columns from.
            continue

        # readline() keeps the line terminator; strip it so the last column name
        # does not end with "\n" (or "\r\n").
        columns = header_line.rstrip("\r\n").split(",")

        configs.append(
            DataDiscoveryStandardizedFileConfig(
                file_tag=view.ingest_view_name,
                columns=columns,
            )
        )

    return configs
    def test_collect_and_build_ingest_view_builders(
            self, _name: str, project_id: str,
            environment: GCPEnvironment) -> None:
        """Builds every ingest view of every region under the given project/environment.

        The GCP environment and project id lookups are patched to return the
        supplied `environment` and `project_id` for the duration of the test.
        """
        environment_patcher = patch(
            "recidiviz.utils.environment.get_gcp_environment",
            return_value=environment)
        project_id_patcher = patch("recidiviz.utils.metadata.project_id",
                                   return_value=project_id)

        with environment_patcher, project_id_patcher:
            for region_code in self.region_dir_names:
                region = get_region(
                    region_code,
                    is_direct_ingest=True,
                    region_module_override=self.region_module_override,
                )

                # The project id is temporarily overridden with a fixed value
                # while the bucket path is computed and the controller is built.
                with patch(
                        "recidiviz.utils.metadata.project_id",
                        return_value="recidiviz-456",
                ):
                    bucket_path = self.primary_ingest_bucket_for_region(region)
                    controller = DirectIngestControllerFactory.build(
                        ingest_bucket_path=bucket_path,
                        allow_unlaunched=True,
                    )

                collector = DirectIngestPreProcessedIngestViewCollector(
                    region, controller.get_file_tag_rank_list())
                for view_builder in collector.collect_view_builders():
                    view_builder.build()
示例#4
0
    def test_raw_files_yaml_parses_all_regions(self) -> None:
        """Loads the raw file configs of every region and checks file tags are unique.

        Regions whose ingestor class is not a GcsfsDirectIngestController are
        skipped, as are regions with neither ingest view builders nor raw file
        configs.
        """
        for region_code in self.region_dir_names:
            region = get_region(
                region_code,
                is_direct_ingest=True,
                region_module_override=self.region_module_override,
            )

            controller_class = region.get_ingestor_class()
            if not issubclass(controller_class, GcsfsDirectIngestController):
                continue

            collector = DirectIngestPreProcessedIngestViewCollector(
                region, controller_class.get_file_tag_rank_list()
            )
            builders = collector.collect_view_builders()

            raw_file_manager = DirectIngestRegionRawFileConfig(
                region_code=region.region_code,
                region_module=self.region_module_override,
            )

            if not (builders or raw_file_manager.raw_file_configs):
                continue

            # Raw file configs must exist when the raw data import env is set.
            if region.raw_data_bq_imports_enabled_env is not None:
                self.test.assertTrue(raw_file_manager.raw_file_configs)

            seen_file_tags = set()
            for raw_config in raw_file_manager.raw_file_configs.values():
                self.test.assertTrue(
                    raw_config.file_tag not in seen_file_tags,
                    f"Multiple raw file configs defined with the same "
                    f"file_tag [{raw_config.file_tag}]",
                )
                seen_file_tags.add(raw_config.file_tag)
示例#5
0
    def generate_raw_file_docs_for_region(self, region_code: str) -> Dict[str, str]:
        """Builds the Markdown documentation for every raw file config of a region.

        Returns a dictionary mapping a filename to its Markdown content: one
        "<file_tag>.md" entry per raw file config, plus an entry keyed by
        STATE_RAW_DATA_FILE_HEADER_PATH holding the header followed by the raw
        file table.
        """
        region_config = DirectIngestRegionRawFileConfig(region_code=region_code)

        sorted_file_tags = sorted(region_config.raw_file_tags)

        # The header is only rendered for state codes; other regions get none.
        file_header = ""
        if StateCode.is_state_code(region_code):
            state_code = StateCode(region_code.upper())
            file_header = STATE_RAW_DATA_FILE_HEADER_TEMPLATE.format(
                state_name=state_code.get_state().name,
                state_code_lower=state_code.value.lower(),
            )

        # Raw file configs ordered by file tag.
        raw_file_configs = []
        for file_tag in sorted_file_tags:
            raw_file_configs.append(region_config.raw_file_configs[file_tag])

        config_paths_by_file_tag = {}
        for file_tag, file_config in region_config.raw_file_configs.items():
            config_paths_by_file_tag[file_tag] = file_config.file_path

        file_tags_with_raw_file_configs = []
        for raw_file_config in raw_file_configs:
            file_tags_with_raw_file_configs.append(raw_file_config.file_tag)

        region = regions.get_region(region_code=region_code, is_direct_ingest=True)
        views_by_raw_file = self.get_referencing_views(
            DirectIngestPreProcessedIngestViewCollector(region, [])
        )
        touched_configs = self._get_touched_raw_data_configs(
            region_config.yaml_config_file_dir
        )

        raw_file_table = self._generate_raw_file_table(
            config_paths_by_file_tag,
            file_tags_with_raw_file_configs,
            views_by_raw_file,
            touched_configs,
        )

        docs_per_file: Dict[str, str] = {
            f"{raw_file_config.file_tag}.md": self._generate_docs_for_raw_config(
                raw_file_config
            )
            for raw_file_config in raw_file_configs
        }
        docs_per_file[STATE_RAW_DATA_FILE_HEADER_PATH] = "\n".join(
            [file_header, raw_file_table]
        )

        return docs_per_file
示例#6
0
    def __init__(self,
                 region_name: str,
                 system_level: SystemLevel,
                 ingest_directory_path: Optional[str] = None,
                 storage_directory_path: Optional[str] = None,
                 max_delay_sec_between_files: Optional[int] = None):
        """Sets up the file system, paths and managers used by the controller.

        Args:
            region_name: Passed to the parent constructor and used to derive
                directory paths that are not given explicitly.
            system_level: Passed to the parent constructor and used to derive
                directory paths that are not given explicitly.
            ingest_directory_path: Absolute ingest directory path; derived from
                the region name and system level when missing or empty.
            storage_directory_path: Absolute storage directory path; derived
                from the region name and system level when missing or empty.
            max_delay_sec_between_files: Stored as-is on the instance.
        """
        super().__init__(region_name, system_level)
        self.fs = DirectIngestGCSFileSystem(GcsfsFactory.build())
        self.max_delay_sec_between_files = max_delay_sec_between_files

        resolved_ingest_path = ingest_directory_path or (
            gcsfs_direct_ingest_directory_path_for_region(
                region_name, system_level))
        self.ingest_directory_path = GcsfsDirectoryPath.from_absolute_path(
            resolved_ingest_path)

        resolved_storage_path = storage_directory_path or (
            gcsfs_direct_ingest_storage_directory_path_for_region(
                region_name, system_level))
        self.storage_directory_path = GcsfsDirectoryPath.from_absolute_path(
            resolved_storage_path)

        self.temp_output_directory_path = GcsfsDirectoryPath.from_absolute_path(
            gcsfs_direct_ingest_temporary_output_directory_path())

        # Only restrict jobs to ingest view files when the region distinguishes
        # raw files from ingest view files by name.
        if self.region.is_raw_vs_ingest_file_name_detection_enabled():
            ingest_job_file_type_filter = GcsfsDirectIngestFileType.INGEST_VIEW
        else:
            ingest_job_file_type_filter = None

        self.file_prioritizer = GcsfsDirectIngestJobPrioritizer(
            self.fs,
            self.ingest_directory_path,
            self.get_file_tag_rank_list(),
            ingest_job_file_type_filter,
        )

        self.ingest_file_split_line_limit = self._INGEST_FILE_SPLIT_LINE_LIMIT

        self.file_metadata_manager = PostgresDirectIngestFileMetadataManager(
            region_code=self.region.region_code,
        )

        self.raw_file_import_manager = DirectIngestRawFileImportManager(
            region=self.region,
            fs=self.fs,
            ingest_directory_path=self.ingest_directory_path,
            temp_output_directory_path=self.temp_output_directory_path,
            big_query_client=BigQueryClientImpl(),
        )

        self.ingest_view_export_manager = DirectIngestIngestViewExportManager(
            region=self.region,
            fs=self.fs,
            ingest_directory_path=self.ingest_directory_path,
            file_metadata_manager=self.file_metadata_manager,
            big_query_client=BigQueryClientImpl(),
            view_collector=DirectIngestPreProcessedIngestViewCollector(
                self.region, self.get_file_tag_rank_list()),
        )
示例#7
0
    def __init__(self, ingest_bucket_path: GcsfsBucketPath) -> None:
        """Initialize the controller.

        Wires up the collaborators the controller uses: cloud task manager,
        region lock manager, file system, job prioritizer, file metadata
        manager, raw file import manager, ingest view export manager and
        instance status manager.

        Reads `self.region`, `self.system_level`, `self.ingest_database_key`,
        `self.region_code()` and `self.get_file_tag_rank_list()`, none of which
        are set in this method, so they must be provided elsewhere on the class.

        Args:
            ingest_bucket_path: Bucket this controller ingests from; also used
                to determine the ingest instance.
        """
        self.cloud_task_manager = DirectIngestCloudTaskManagerImpl()
        # The ingest instance is derived from the bucket path.
        self.ingest_instance = DirectIngestInstance.for_ingest_bucket(
            ingest_bucket_path)
        self.region_lock_manager = DirectIngestRegionLockManager.for_direct_ingest(
            region_code=self.region.region_code,
            schema_type=self.system_level.schema_type(),
            ingest_instance=self.ingest_instance,
        )
        self.fs = DirectIngestGCSFileSystem(GcsfsFactory.build())
        self.ingest_bucket_path = ingest_bucket_path
        # Storage path depends on region, system level and ingest instance.
        self.storage_directory_path = (
            gcsfs_direct_ingest_storage_directory_path_for_region(
                region_code=self.region_code(),
                system_level=self.system_level,
                ingest_instance=self.ingest_instance,
            ))

        self.temp_output_directory_path = (
            gcsfs_direct_ingest_temporary_output_directory_path())

        # Prioritizes files in the ingest bucket using the file tag rank list.
        self.file_prioritizer = GcsfsDirectIngestJobPrioritizer(
            self.fs,
            self.ingest_bucket_path,
            self.get_file_tag_rank_list(),
        )

        self.ingest_file_split_line_limit = self._INGEST_FILE_SPLIT_LINE_LIMIT

        self.file_metadata_manager = PostgresDirectIngestFileMetadataManager(
            region_code=self.region.region_code,
            ingest_database_name=self.ingest_database_key.db_name,
        )

        # Each manager below receives its own, newly constructed BigQueryClientImpl.
        self.raw_file_import_manager = DirectIngestRawFileImportManager(
            region=self.region,
            fs=self.fs,
            ingest_bucket_path=self.ingest_bucket_path,
            temp_output_directory_path=self.temp_output_directory_path,
            big_query_client=BigQueryClientImpl(),
        )

        # The same file tag rank list call feeds both the view collector and
        # the launched file tags.
        self.ingest_view_export_manager = DirectIngestIngestViewExportManager(
            region=self.region,
            fs=self.fs,
            output_bucket_name=self.ingest_bucket_path.bucket_name,
            file_metadata_manager=self.file_metadata_manager,
            big_query_client=BigQueryClientImpl(),
            view_collector=DirectIngestPreProcessedIngestViewCollector(
                self.region, self.get_file_tag_rank_list()),
            launched_file_tags=self.get_file_tag_rank_list(),
        )

        self.ingest_instance_status_manager = DirectIngestInstanceStatusManager(
            self.region_code(), self.ingest_instance)
示例#8
0
    def test_collect_ingest_views(self):
        """Collects the ingest views of every existing region with a GCSFS controller.

        The test passes as long as collecting the views does not raise; the
        collected views themselves are discarded.
        """
        with local_project_id_override('project'):
            for region_code in self._get_existing_region_dir_names():
                region = get_region(region_code, is_direct_ingest=True)
                controller_class = region.get_ingestor_class()

                # Only GCSFS-based controllers are exercised here.
                if issubclass(controller_class, GcsfsDirectIngestController):
                    collector = DirectIngestPreProcessedIngestViewCollector(
                        region, controller_class.get_file_tag_rank_list())
                    collector.collect_views()
示例#9
0
    def get_referencing_views(
        view_collector: DirectIngestPreProcessedIngestViewCollector,
    ) -> Dict[str, List[str]]:
        """Maps each raw file tag to the file tags of the ingest views that depend on it.

        Every builder returned by the collector is built, and the view's file tag
        is appended once per entry in its raw table dependency configs. The
        result is a defaultdict(list), so looking up a raw file tag that no view
        references yields an empty list.
        """
        referencing_view_tags = defaultdict(list)

        built_views = (
            view_builder.build()
            for view_builder in view_collector.collect_view_builders()
        )
        for built_view in built_views:
            for raw_config in built_view.raw_table_dependency_configs:
                referencing_view_tags[raw_config.file_tag].append(built_view.file_tag)

        return referencing_view_tags
示例#10
0
    def generate_raw_file_docs_for_region(self, region_code: str) -> str:
        """Builds the documentation for every raw file config of the given region.

        Returns a single string made of the file header, the raw file table and
        the per-file docs (ordered by file tag), in that order.
        """
        region_config = DirectIngestRegionRawFileConfig(
            region_code=region_code)

        sorted_file_tags = sorted(region_config.raw_file_tags)

        # The header is only rendered for state codes; other regions get none.
        file_header = ""
        if StateCode.is_state_code(region_code):
            state_code = StateCode(region_code.upper())
            file_header = STATE_RAW_DATA_FILE_HEADER_TEMPLATE.format(
                state_name=state_code.get_state(),
                state_code_lower=state_code.value.lower(),
            )

        # Raw file configs ordered by file tag.
        raw_file_configs = []
        for file_tag in sorted_file_tags:
            raw_file_configs.append(region_config.raw_file_configs[file_tag])

        config_paths_by_file_tag = {}
        for file_tag, file_config in region_config.raw_file_configs.items():
            config_paths_by_file_tag[file_tag] = file_config.file_path

        file_tags_with_raw_file_configs = []
        for raw_file_config in raw_file_configs:
            file_tags_with_raw_file_configs.append(raw_file_config.file_tag)

        region = regions.get_region(region_code=region_code,
                                    is_direct_ingest=True)
        views_by_raw_file = self.get_referencing_views(
            DirectIngestPreProcessedIngestViewCollector(region, []))

        raw_file_table = self._generate_raw_file_table(
            config_paths_by_file_tag,
            file_tags_with_raw_file_configs,
            views_by_raw_file,
        )

        docs_per_file = []
        for raw_file_config in raw_file_configs:
            docs_per_file.append(
                self._generate_docs_for_raw_config(raw_file_config))

        combined_file_docs = "\n\n".join(docs_per_file)
        return file_header + "\n" + raw_file_table + "\n" + combined_file_docs
    def setUp(self) -> None:
        """Prepares the view builder and expected output columns for this test case.

        Collects the ingest view builders for STATE_CODE and keeps the one whose
        file tag is "person_external_ids".
        """
        super().setUp()
        region = get_region(STATE_CODE, is_direct_ingest=True)
        collector = DirectIngestPreProcessedIngestViewCollector(region, [])
        candidate_builders = collector.collect_view_builders()
        self.view_builder = one(
            builder
            for builder in candidate_builders
            if builder.file_tag == "person_external_ids"
        )

        # Column names the query results are expected to have, in order.
        self.expected_result_columns = [
            "recidiviz_master_person_id",
            "control_numbers",
            "inmate_numbers",
            "parole_numbers",
        ]
示例#12
0
    def test_collect_and_build_ingest_view_builders(
        self, _name: str, project_id: str, environment: GCPEnvironment
    ) -> None:
        """Builds every ingest view of every region under the given project/environment.

        The GCP environment and project id lookups are patched to return the
        supplied `environment` and `project_id`. Regions whose ingestor class is
        not a GcsfsDirectIngestController are skipped.
        """
        environment_patcher = patch(
            "recidiviz.utils.environment.get_gcp_environment", return_value=environment
        )
        project_id_patcher = patch(
            "recidiviz.utils.metadata.project_id", return_value=project_id
        )

        with environment_patcher, project_id_patcher:
            for region_code in self.region_dir_names:
                region = get_region(
                    region_code,
                    is_direct_ingest=True,
                    region_module_override=self.region_module_override,
                )

                controller_class = region.get_ingestor_class()
                if not issubclass(controller_class, GcsfsDirectIngestController):
                    continue

                collector = DirectIngestPreProcessedIngestViewCollector(
                    region, controller_class.get_file_tag_rank_list()
                )
                for view_builder in collector.collect_view_builders():
                    view_builder.build()
    def test_raw_files_yaml_parses_all_regions(self) -> None:
        """Loads the raw file configs of every region and checks file tags are unique.

        A controller is built for each region (with the project id patched to a
        fixed value) to obtain the file tag rank list. Regions with neither
        ingest view builders nor raw file configs are skipped.
        """
        for region_code in self.region_dir_names:
            region = get_region(
                region_code,
                is_direct_ingest=True,
                region_module_override=self.region_module_override,
            )

            # The project id is patched only while the bucket path is computed
            # and the controller is built.
            with patch("recidiviz.utils.metadata.project_id",
                       return_value="recidiviz-456"):
                bucket_path = self.primary_ingest_bucket_for_region(region)
                controller = DirectIngestControllerFactory.build(
                    ingest_bucket_path=bucket_path,
                    allow_unlaunched=True,
                )

            collector = DirectIngestPreProcessedIngestViewCollector(
                region, controller.get_file_tag_rank_list())
            builders = collector.collect_view_builders()

            raw_file_manager = DirectIngestRegionRawFileConfig(
                region_code=region.region_code,
                region_module=self.region_module_override,
            )

            if not (builders or raw_file_manager.raw_file_configs):
                continue

            # NOTE(review): if is_ingest_launched_in_env() returns a bool, this
            # `is not None` check is always true - confirm the intended condition.
            if region.is_ingest_launched_in_env() is not None:
                self.test.assertTrue(raw_file_manager.raw_file_configs)

            seen_file_tags = set()
            for raw_config in raw_file_manager.raw_file_configs.values():
                self.test.assertTrue(
                    raw_config.file_tag not in seen_file_tags,
                    f"Multiple raw file configs defined with the same "
                    f"file_tag [{raw_config.file_tag}]",
                )
                seen_file_tags.add(raw_config.file_tag)
            )
            for metadata in metadata_list
        ]


if __name__ == "__main__":

    # Debugging entry point: edit the values below, then run this module to print
    # an export query that can be pasted into the BigQuery UI.
    region_code_: str = "us_mo"
    ingest_view_name_: str = "tak001_offender_identification"
    upper_bound_datetime_prev_: datetime.datetime = datetime.datetime(2020, 10, 15)
    upper_bound_datetime_to_export_: datetime.datetime = datetime.datetime(2020, 12, 18)

    with local_project_id_override(GCP_PROJECT_STAGING):
        region_ = regions.get_region(region_code_, is_direct_ingest=True)
        view_collector_ = DirectIngestPreProcessedIngestViewCollector(region_, [])

        # Built ingest views for the region, keyed by file tag.
        views_by_tag_ = {
            builder.file_tag: builder.build()
            for builder in view_collector_.collect_view_builders()
        }

        # Export window for the chosen ingest view.
        export_args_ = GcsfsIngestViewExportArgs(
            ingest_view_name=ingest_view_name_,
            upper_bound_datetime_prev=upper_bound_datetime_prev_,
            upper_bound_datetime_to_export=upper_bound_datetime_to_export_,
        )

        debug_query = DirectIngestIngestViewExportManager.debug_query_for_args(
            views_by_tag_, export_args_
        )
        print(debug_query)
            for metadata in metadata_list
        ]


if __name__ == '__main__':

    # Debugging entry point: edit the values below, then run this module to print
    # an export query that can be pasted into the BigQuery UI.
    region_code_: str = 'us_id'
    ingest_view_name_: str = 'movement_facility_location_offstat_supervision_periods'
    upper_bound_datetime_prev_: datetime.datetime = datetime.datetime(
        2020, 6, 29)
    upper_bound_datetime_to_export_: datetime.datetime = datetime.datetime(
        2020, 7, 29)

    with local_project_id_override(GCP_PROJECT_STAGING):
        region_ = regions.get_region(region_code_, is_direct_ingest=True)
        view_collector_ = DirectIngestPreProcessedIngestViewCollector(
            region_, [])

        # Collected ingest views for the region, keyed by file tag.
        views_by_tag_ = {
            view.file_tag: view
            for view in view_collector_.collect_views()
        }

        # Export window for the chosen ingest view.
        export_args_ = GcsfsIngestViewExportArgs(
            ingest_view_name=ingest_view_name_,
            upper_bound_datetime_prev=upper_bound_datetime_prev_,
            upper_bound_datetime_to_export=upper_bound_datetime_to_export_)

        DirectIngestIngestViewExportManager.print_debug_query_for_args(
            views_by_tag_, export_args_)