def get_ingest_view_configs(
    region_code: str,
) -> List[DataDiscoveryStandardizedFileConfig]:
    """Collect ingest views for region; reads columns from their corresponding fixture csv.

    Args:
        region_code: Region to collect ingest views for; must be a valid state code
            (case-insensitive).

    Returns:
        One DataDiscoveryStandardizedFileConfig per ingest view that has a fixture
        csv on disk; views without a fixture are silently skipped.

    Raises:
        ValueError: if `region_code` is not a valid state code.
    """
    if not StateCode.is_state_code(region_code):
        raise ValueError(
            f"Unknown region_code [{region_code}] received, must be a valid state code."
        )

    region_code = region_code.lower()

    views = DirectIngestPreProcessedIngestViewCollector(
        get_region(region_code, True), []
    ).collect_view_builders()

    configs = []
    for view in views:
        try:
            # TODO(#6925) Infer columns from the mapping file rather than the fixture csv
            fixture_path = os.path.join(
                os.path.dirname(recidiviz.__file__),
                f"tests/ingest/direct/direct_ingest_fixtures/{region_code}/{view.ingest_view_name}.csv",
            )

            with open(fixture_path, "r") as f:
                # BUGFIX: readline() keeps the trailing line terminator, so the
                # last column name used to come back as e.g. "col\n". Strip the
                # terminator before splitting.
                columns = f.readline().rstrip("\r\n").split(",")
        except FileNotFoundError:
            # No fixture csv for this view — deliberately skip it (best-effort).
            continue

        configs.append(
            DataDiscoveryStandardizedFileConfig(
                file_tag=view.ingest_view_name,
                columns=columns,
            )
        )

    return configs
def _get_state_code_from_str(state_code_str: str) -> StateCode:
    """Parse a state-code string into its StateCode enum member.

    Raises:
        ValueError: if the string is not a recognized state code.
    """
    if StateCode.is_state_code(state_code_str):
        return StateCode[state_code_str.upper()]
    raise ValueError(
        f"Unknown region_code [{state_code_str}] received, must be a valid state code."
    )
def main(argv: Optional[Sequence[str]] = None) -> int:
    """Generates direct ingest region documentation.

    Returns 1 if any documentation was modified (so pre-commit hooks can flag the
    change), 0 otherwise.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "filenames",
        nargs="*",
        help="Modified files to indicate which regions need their docs to be regenerated. "
        "Paths must be relative to the root of the repository. "
        "If none are provided, will use `git diff` to determine modified files.",
    )
    args = parser.parse_args(argv)

    modified = False
    # Arbitrary project ID - we just need to build views in order to obtain raw table dependencies
    with local_project_id_override(GCP_PROJECT_STAGING):
        for region_code in get_touched_raw_data_regions(args.filenames):
            if not StateCode.is_state_code(region_code):
                logging.info(
                    "Skipping raw data documentation for non-state region [%s]",
                    region_code,
                )
                continue
            logging.info(
                "Generating raw data documentation for region [%s]", region_code
            )
            if generate_raw_data_documentation_for_region(region_code):
                modified = True
        if modified:
            update_summary_file(
                _create_ingest_catalog_summary(), "## State Ingest Catalog"
            )
    return 1 if modified else 0
def generate_raw_file_docs_for_region(self, region_code: str) -> Dict[str, str]:
    """Generates documentation for all raw file configs for the given region and
    returns all of it as a combined string.

    Returns one Markdown-formatted string per raw file, mapped to its filename, as well
    as a header file with a table of contents.
    """
    region_config = DirectIngestRegionRawFileConfig(region_code=region_code)

    # Deterministic ordering for the generated table of contents.
    sorted_file_tags = sorted(region_config.raw_file_tags)

    if StateCode.is_state_code(region_code):
        state_code = StateCode(region_code.upper())
        state_name = state_code.get_state().name

        file_header = STATE_RAW_DATA_FILE_HEADER_TEMPLATE.format(
            state_name=state_name, state_code_lower=state_code.value.lower()
        )
    else:
        # Non-state regions get no header preamble.
        file_header = ""

    raw_file_configs = [
        region_config.raw_file_configs[file_tag] for file_tag in sorted_file_tags
    ]

    config_paths_by_file_tag = {
        file_tag: file_config.file_path
        for file_tag, file_config in region_config.raw_file_configs.items()
    }

    file_tags_with_raw_file_configs = [
        raw_file_config.file_tag for raw_file_config in raw_file_configs
    ]

    region = regions.get_region(region_code=region_code, is_direct_ingest=True)

    # Map each raw file to the ingest views that reference it, so the table can
    # link files to their downstream views.
    view_collector = DirectIngestPreProcessedIngestViewCollector(region, [])
    views_by_raw_file = self.get_referencing_views(view_collector)
    touched_configs = self._get_touched_raw_data_configs(
        region_config.yaml_config_file_dir
    )

    raw_file_table = self._generate_raw_file_table(
        config_paths_by_file_tag,
        file_tags_with_raw_file_configs,
        views_by_raw_file,
        touched_configs,
    )

    # One markdown document per raw file, keyed by output filename...
    docs_per_file: Dict[str, str] = {
        f"{config.file_tag}.md": self._generate_docs_for_raw_config(config)
        for config in raw_file_configs
    }

    # ...plus the header/table-of-contents document under its well-known path.
    docs_per_file[STATE_RAW_DATA_FILE_HEADER_PATH] = (
        file_header + "\n" + raw_file_table
    )

    return docs_per_file
def _ingest_lock_name_for_instance(self) -> str:
    """Builds the GCS-to-Postgres ingest-running lock name for this region.

    State regions get an instance-qualified state lock; all other regions get
    the jails lock.
    """
    region_upper = self.region_code.upper()
    if not StateCode.is_state_code(self.region_code):
        return JAILS_GCS_TO_POSTGRES_INGEST_RUNNING_LOCK_PREFIX + region_upper
    return (
        STATE_GCS_TO_POSTGRES_INGEST_RUNNING_LOCK_PREFIX
        + region_upper
        + f"_{self.ingest_instance.name}"
    )
def generate_raw_file_docs_for_region(self, region_code: str) -> str:
    """Generates documentation for all raw file configs for the given region and
    returns all of it as a combined string."""
    region_config = DirectIngestRegionRawFileConfig(
        region_code=region_code)

    # Deterministic ordering for the generated docs.
    sorted_file_tags = sorted(region_config.raw_file_tags)

    if StateCode.is_state_code(region_code):
        state_code = StateCode(region_code.upper())
        # NOTE(review): a sibling implementation uses `state_code.get_state().name`
        # here; this one passes the `get_state()` result directly into the format
        # template — confirm the rendered header is identical in both cases.
        state_name = state_code.get_state()

        file_header = STATE_RAW_DATA_FILE_HEADER_TEMPLATE.format(
            state_name=state_name, state_code_lower=state_code.value.lower())
    else:
        # Non-state regions get no header preamble.
        file_header = ""

    raw_file_configs = [
        region_config.raw_file_configs[file_tag]
        for file_tag in sorted_file_tags
    ]

    config_paths_by_file_tag = {
        file_tag: file_config.file_path
        for file_tag, file_config in region_config.raw_file_configs.items()
    }

    file_tags_with_raw_file_configs = [
        raw_file_config.file_tag for raw_file_config in raw_file_configs
    ]

    region = regions.get_region(region_code=region_code,
                                is_direct_ingest=True)

    # Map each raw file to the ingest views that reference it.
    view_collector = DirectIngestPreProcessedIngestViewCollector(
        region, [])
    views_by_raw_file = self.get_referencing_views(view_collector)

    raw_file_table = self._generate_raw_file_table(
        config_paths_by_file_tag, file_tags_with_raw_file_configs,
        views_by_raw_file)

    docs_per_file = [
        self._generate_docs_for_raw_config(config)
        for config in raw_file_configs
    ]

    # Header, then the summary table, then one doc section per raw file.
    return file_header + "\n" + raw_file_table + "\n" + "\n\n".join(
        docs_per_file)
def _get_product_enabled_states(self) -> Set[StateCode]:
    """Collects the set of states enabled for any configured product.

    Raises:
        ValueError: if a product lists a string that is not a valid state code.
    """
    state_strs: Set[str] = set()
    for product in self.products:
        if product.states is None:
            continue
        state_strs |= {state.state_code for state in product.states}

    enabled: Set[StateCode] = set()
    for state_code in state_strs:
        if not StateCode.is_state_code(state_code):
            raise ValueError(
                f"Found invalid state code value [{state_code}]"
                f" in product config.")
        enabled.add(StateCode(state_code))
    return enabled
def for_region_code(cls, region_code: str, is_direct_ingest: bool) -> "SystemLevel":
    """Maps a region code plus ingest type to the SystemLevel it belongs to."""
    if is_direct_ingest is None:
        raise ValueError(
            "Region flag is_direct_ingest is None, expected boolean value."
        )

    # There are some scrapers that scrape state jails websites (e.g.
    # recidiviz/ingest/scrape/regions/us_pa/us_pa_scraper.py) which we always
    # write to the Vera county jails database — so only direct-ingest state
    # regions map to STATE.
    if is_direct_ingest and StateCode.is_state_code(region_code.upper()):
        return SystemLevel.STATE
    return SystemLevel.COUNTY
def test_state_codes_match_terraform_config(self) -> None:
    """Every state region directory must be listed in the Terraform state codes YAML."""
    yaml_path = os.path.join(
        os.path.dirname(deploy.__file__),
        "terraform",
        "direct_ingest_state_codes.yaml",
    )
    with open(yaml_path, "r") as ymlfile:
        region_codes_list = yaml.full_load(ymlfile)

    # Non-state regions (e.g. county scrapers) are not tracked in Terraform.
    state_regions = (r for r in self.region_dir_names if StateCode.is_state_code(r))
    for region in state_regions:
        self.assertTrue(
            region.upper() in region_codes_list,
            f"State [{region}] must be listed in [{yaml_path}]",
        )
def get_export_configs_for_job_filter(
    self, export_job_filter: str
) -> List[ProductExportConfig]:
    """Returns the export configs for the given export_job_filter,
    which can be either state_code or export job name."""
    filter_uppercase = export_job_filter.upper()
    # A valid state code filters by state; anything else filters by job name.
    match_key = (
        "state_code"
        if StateCode.is_state_code(filter_uppercase)
        else "export_job_name"
    )
    return [
        export
        for export in self.get_all_export_configs()
        if export[match_key] == filter_uppercase
    ]
def _get_dataflow_pipeline_enabled_states(self) -> Set[StateCode]:
    """Returns the set of StateCodes for all states present in our production
    calc pipeline template.

    Raises:
        ValueError: if any pipeline declares an unrecognized state code.
    """
    states: Set[str] = set()
    for pipeline in self.daily_pipelines:
        states.add(pipeline.peek("state_code", str).upper())
    for pipeline in self.historical_pipelines:
        states.add(pipeline.peek("state_code", str).upper())

    for state_code in states:
        if StateCode.is_state_code(state_code):
            continue
        raise ValueError(
            f"Found invalid state code value [{state_code}]"
            f" in pipeline template config.")
    return {StateCode(state_code) for state_code in states}
def _get_translated_key_column_mask(self) -> int:
    """Returns an integer mask to add to every primary/foreign key column in this
    query. The mask is stable across all tables and derived from the region code.

    Example: 46000000000000

    For the above mask, if a primary key is 123456 in Postgres, then the
    translated primary key would be 46000000123456.
    """
    if not self.region_code:
        raise ValueError(
            "Must have set region code to do primary/foreign key translation."
        )
    if not StateCode.is_state_code(self.region_code):
        raise ValueError(
            "No support yet for doing primary/foreign key translation on non-state "
            "regions.")
    # The FIPS code is always a two-digit code for states
    state_fips = int(StateCode(self.region_code).get_state().fips)
    return state_fips * 10 ** 12
def _create_ingest_catalog_summary() -> List[str]:
    """Creates the State Ingest Catalog portion of SUMMARY.md, as a list of lines."""
    ingest_catalog_states = sorted(
        f.lower()
        for f in listdir(_INGEST_CATALOG_ROOT)
        if isdir(join(_INGEST_CATALOG_ROOT, f))
    )

    summary = ["## State Ingest Catalog\n\n"]
    for state in ingest_catalog_states:
        # Every folder under the catalog root must be a state code.
        if not StateCode.is_state_code(state):
            raise ValueError(
                f"Folder under {_INGEST_CATALOG_ROOT} named {state} is not a valid state code"
            )
        state_name = StateCode(state.upper()).get_state()

        summary += [
            f"- [{state_name}](ingest/{state}/{state}.md)\n",
            f" - [Schema Mappings](ingest/{state}/schema_mappings.md)\n",
            f" - [Raw Data Description](ingest/{state}/raw_data.md)\n",
        ]

        raw_data_dir = join(_INGEST_CATALOG_ROOT, state, "raw_data")
        if not isdir(raw_data_dir):
            # State has no raw-data docs yet; nothing more to link.
            continue
        sorted_raw_files = sorted(
            f for f in listdir(raw_data_dir) if isfile(join(raw_data_dir, f))
        )
        for file_name in sorted_raw_files:
            summary.append(
                f" - [{file_name[:-3]}](ingest/{state}/raw_data/{file_name})\n"
            )
    return summary
def test_regions_are_clean(self) -> None:
    """Check that all existing region directories start with a valid state code."""
    for region in self.region_dir_names:
        # Region dir names look like "us_xx..."; the first five characters
        # should be the state code.
        state_code_prefix = region[:5]
        self.test.assertTrue(StateCode.is_state_code(state_code_prefix))
def _validate_region_code(region_code: str) -> None:
    """Raises ValueError unless region_code (case-insensitive) is a valid state code."""
    if StateCode.is_state_code(region_code.upper()):
        return
    raise ValueError(
        f"Unknown region_code [{region_code}] received, must be a valid state code."
    )