def test_raw_up_to_date_view(self) -> None:
    """Checks the generated *_by_update_date view for a config that mixes
    datetime and non-datetime columns (including an undocumented one)."""
    raw_config = DirectIngestRawFileConfig(
        file_tag="table_name",
        file_path="path/to/file.yaml",
        file_description="file description",
        primary_key_cols=["col1"],
        columns=[
            RawTableColumnInfo(
                name="col1", is_datetime=False, description="col1 description"
            ),
            RawTableColumnInfo(
                name="col2", is_datetime=True, description="col2 description"
            ),
            RawTableColumnInfo(
                name="undocumented_column", is_datetime=True, description=None
            ),
        ],
        supplemental_order_by_clause="",
        encoding="any-encoding",
        separator="@",
        ignore_quotes=False,
        always_historical_export=False,
    )
    view = DirectIngestRawDataTableUpToDateView(
        region_code="us_xx", raw_file_config=raw_config
    )

    # The view address is derived from the region code and file tag.
    self.assertEqual(self.PROJECT_ID, view.project)
    self.assertEqual("us_xx_raw_data_up_to_date_views", view.dataset_id)
    self.assertEqual("table_name_by_update_date", view.table_id)
    self.assertEqual("table_name_by_update_date", view.view_id)

    # Datetime columns are normalized via a COALESCE over several candidate
    # date/timestamp parse formats, falling back to the raw value.
    expected_datetime_cols_clause = """
        COALESCE(
            CAST(SAFE_CAST(col2 AS DATETIME) AS STRING),
            CAST(SAFE_CAST(SAFE.PARSE_DATE('%m/%d/%y', col2) AS DATETIME) AS STRING),
            CAST(SAFE_CAST(SAFE.PARSE_DATE('%m/%d/%Y', col2) AS DATETIME) AS STRING),
            CAST(SAFE_CAST(SAFE.PARSE_TIMESTAMP('%Y-%m-%d %H:%M', col2) AS DATETIME) AS STRING),
            CAST(SAFE_CAST(SAFE.PARSE_TIMESTAMP('%m/%d/%Y %H:%M:%S', col2) AS DATETIME) AS STRING),
            col2
        ) AS col2"""

    expected_view_query = RAW_DATA_UP_TO_DATE_VIEW_QUERY_TEMPLATE.format(
        project_id=self.PROJECT_ID,
        raw_table_primary_key_str="col1",
        raw_table_dataset_id="us_xx_raw_data",
        raw_table_name="table_name",
        columns_clause=f"col1, {expected_datetime_cols_clause}",
        legacy_except_clause="",
        legacy_datetime_cols_clause="",
        supplemental_order_by_clause="",
    )
    self.assertEqual(expected_view_query, view.view_query)
    self.assertEqual(
        "SELECT * FROM `recidiviz-456.us_xx_raw_data_up_to_date_views.table_name_by_update_date`",
        view.select_query,
    )
def test_raw_latest_historical_file_view(self) -> None:
    """Checks the generated *_latest view for an always-historical file that
    uses a supplemental ORDER BY clause."""
    raw_config = DirectIngestRawFileConfig(
        file_tag="table_name",
        file_path="path/to/file.yaml",
        file_description="file description",
        primary_key_cols=["col1", "col2"],
        columns=[
            RawTableColumnInfo(
                name="col1", is_datetime=False, description="col1 description"
            ),
            RawTableColumnInfo(
                name="col2", is_datetime=False, description="col2 description"
            ),
        ],
        supplemental_order_by_clause="CAST(seq_num AS INT64)",
        encoding="any-encoding",
        separator="@",
        ignore_quotes=False,
        always_historical_export=True,
    )
    view = DirectIngestRawDataTableLatestView(
        region_code="us_xx",
        raw_file_config=raw_config,
        dataset_overrides=None,
    )

    # The view address is derived from the region code and file tag.
    self.assertEqual(self.PROJECT_ID, view.project)
    self.assertEqual("us_xx_raw_data_up_to_date_views", view.dataset_id)
    self.assertEqual("table_name_latest", view.table_id)
    self.assertEqual("table_name_latest", view.view_id)

    # Historical exports use the dedicated template; note the supplemental
    # ORDER BY clause is prefixed with ", " when rendered.
    expected_view_query = RAW_DATA_LATEST_HISTORICAL_FILE_VIEW_QUERY_TEMPLATE.format(
        project_id=self.PROJECT_ID,
        raw_table_primary_key_str="col1, col2",
        raw_table_dataset_id="us_xx_raw_data",
        raw_table_name="table_name",
        columns_clause="col1, col2",
        legacy_except_clause="",
        legacy_datetime_cols_clause="",
        supplemental_order_by_clause=", CAST(seq_num AS INT64)",
    )
    self.assertEqual(expected_view_query, view.view_query)
    self.assertEqual(
        "SELECT * FROM `recidiviz-456.us_xx_raw_data_up_to_date_views.table_name_latest`",
        view.select_query,
    )
def _get_columns_by_file(
    state_code: str, project_id: str
) -> Dict[str, List[RawTableColumnInfo]]:
    """Creates a list of RawTableColumnInfo for each raw file in a given state.

    Queries the BigQuery INFORMATION_SCHEMA.COLUMNS view for the state's
    `<state_code>_raw_data` dataset and groups the results by table name.

    Args:
        state_code: State code (e.g. "US_XX"); lowercased to build the
            raw data dataset id.
        project_id: The GCP project that contains the raw data dataset.

    Returns:
        A map of raw file tag (table name) to its columns in ordinal order.
        Recidiviz-added metadata columns (file_id, update_datetime) are
        excluded, and each description is filled with the "TKTK" placeholder
        for a human to complete later.
    """
    columns_by_file: Dict[str, List[RawTableColumnInfo]] = {}
    raw_data_dataset = f"{state_code.lower()}_raw_data"
    query_string = f"""
SELECT * EXCEPT(is_generated, generation_expression, is_stored, is_updatable)
FROM `{project_id}.{raw_data_dataset}.INFORMATION_SCHEMA.COLUMNS`
ORDER BY table_name ASC, ordinal_position ASC
"""
    bq_client = BigQueryClientImpl()
    query_job = bq_client.run_query_async(query_string)
    for row in query_job:
        column_name = row["column_name"]
        # Skip metadata columns added at ingest time; they are not part of
        # the state-provided raw data.
        if column_name in {"file_id", "update_datetime"}:
            continue
        file_name = row["table_name"]
        is_datetime = row["data_type"].upper() == "DATETIME"
        column_info = RawTableColumnInfo(
            name=column_name, is_datetime=is_datetime, description="TKTK"
        )
        # setdefault replaces the manual membership-check-then-append idiom.
        columns_by_file.setdefault(file_name, []).append(column_info)
    return columns_by_file
def test_parse_yaml(self) -> None:
    """Parses the us_xx fixture configs and spot-checks a few files' fields."""
    region_config = DirectIngestRegionRawFileConfig(
        region_code="us_xx",
        yaml_config_file_dir=fixtures.as_filepath("us_xx"),
    )

    # All seven fixture files should be discovered.
    self.assertEqual(7, len(region_config.raw_file_configs))
    self.assertEqual(
        {
            "file_tag_first",
            "file_tag_second",
            "tagC",
            "tagFullHistoricalExport",
            "tagInvalidCharacters",
            "tagNormalizationConflict",
            "tagPipeSeparatedNonUTF8",
        },
        region_config.raw_file_configs.keys(),
    )

    first_config = region_config.raw_file_configs["file_tag_first"]
    self.assertEqual("file_tag_first", first_config.file_tag)
    self.assertEqual("First raw file.", first_config.file_description)
    self.assertEqual(["col_name_1a", "col_name_1b"], first_config.primary_key_cols)
    self.assertEqual("ISO-456-7", first_config.encoding)
    self.assertEqual(",", first_config.separator)

    # YAML literal blocks are stripped of trailing/leading whitespace and
    # keep internal newlines.
    expected_col_1b_description = (
        "A column description that is long enough to take up\nmultiple lines. This"
        " text block will be interpreted\nliterally and trailing/leading whitespace"
        " is removed."
    )
    self.assertEqual(
        [
            RawTableColumnInfo(
                name="col_name_1a", is_datetime=False, description="First column."
            ),
            RawTableColumnInfo(
                name="col_name_1b",
                is_datetime=False,
                description=expected_col_1b_description,
            ),
            RawTableColumnInfo(
                name="undocumented_column", is_datetime=False, description=None
            ),
        ],
        first_config.columns,
    )

    second_config = region_config.raw_file_configs["file_tag_second"]
    expected_second_file_description = (
        "Some special/unusual character's in the description &\nlong enough to"
        " make a second line!\\n Trailing/leading white\nspace is stripped & the"
        " text block is interpreted literally."
    )
    self.assertEqual("file_tag_second", second_config.file_tag)
    self.assertEqual(expected_second_file_description, second_config.file_description)
    self.assertEqual(["col_name_2a"], second_config.primary_key_cols)
    self.assertEqual("UTF-8", second_config.encoding)
    self.assertEqual("$", second_config.separator)
    self.assertEqual(
        [
            RawTableColumnInfo(
                name="col_name_2a",
                is_datetime=False,
                description="column description",
            )
        ],
        second_config.columns,
    )

    tag_c_config = region_config.raw_file_configs["tagC"]
    self.assertEqual("tagC", tag_c_config.file_tag)
    self.assertEqual("tagC file description", tag_c_config.file_description)
    self.assertEqual(["COL1"], tag_c_config.primary_key_cols)
    self.assertEqual("UTF-8", tag_c_config.encoding)
    self.assertEqual(",", tag_c_config.separator)
    self.assertEqual(
        [RawTableColumnInfo(name="COL1", is_datetime=False, description=None)],
        tag_c_config.columns,
    )

    # Non-UTF8, pipe-separated file.
    pipe_config = region_config.raw_file_configs["tagPipeSeparatedNonUTF8"]
    self.assertEqual("tagPipeSeparatedNonUTF8", pipe_config.file_tag)
    self.assertEqual(["PRIMARY_COL1"], pipe_config.primary_key_cols)
    self.assertEqual("ISO-8859-1", pipe_config.encoding)
    self.assertEqual("|", pipe_config.separator)
def _get_raw_data_file_configs(self) -> Dict[str, DirectIngestRawFileConfig]:
    """Returns fixture raw file configs, keyed by file tag."""

    def _mock_key_config(tag: str) -> DirectIngestRawFileConfig:
        # tagA, tagB and tagC share an identical shape apart from the tag.
        return DirectIngestRawFileConfig(
            file_tag=tag,
            file_path=f"path/to/{tag}.yaml",
            file_description="file description",
            primary_key_cols=["mockKey"],
            columns=[
                RawTableColumnInfo(
                    name="mockKey",
                    description="mockKey description",
                    is_datetime=False,
                )
            ],
            supplemental_order_by_clause="",
            encoding="UTF-8",
            separator=",",
            custom_line_terminator=None,
            ignore_quotes=False,
            always_historical_export=False,
        )

    return {
        "tagA": _mock_key_config("tagA"),
        "tagB": _mock_key_config("tagB"),
        "tagC": _mock_key_config("tagC"),
        # This tag has no primary keys or columns - it is never ingested.
        "tagWeDoNotIngest": DirectIngestRawFileConfig(
            file_tag="tagWeDoNotIngest",
            file_path="path/to/tagWeDoNotIngest.yaml",
            file_description="file description",
            primary_key_cols=[],
            columns=[],
            supplemental_order_by_clause="",
            encoding="UTF-8",
            separator=",",
            custom_line_terminator=None,
            ignore_quotes=False,
            always_historical_export=False,
        ),
    }
def test_parse_yaml(self) -> None:
    """Parses the fake-region configs and spot-checks several files' fields."""
    region_config = DirectIngestRegionRawFileConfig(
        region_code="us_xx",
        region_module=fake_regions_module,
    )

    # All thirteen fixture files should be discovered.
    self.assertEqual(13, len(region_config.raw_file_configs))
    self.assertEqual(
        {
            "file_tag_first",
            "file_tag_second",
            "tagC",
            "tagColCapsDoNotMatchConfig",
            "tagFullHistoricalExport",
            "tagInvalidCharacters",
            "tagNormalizationConflict",
            "tagCustomLineTerminatorNonUTF8",
            "tagPipeSeparatedNonUTF8",
            "tagDoubleDaggerWINDOWS1252",
            "tagColumnsMissing",
            "tagRowExtraColumns",
            "tagRowMissingColumns",
        },
        set(region_config.raw_file_configs.keys()),
    )

    first_config = region_config.raw_file_configs["file_tag_first"]
    self.assertEqual("file_tag_first", first_config.file_tag)
    self.assertEqual("First raw file.", first_config.file_description)
    self.assertEqual(["col_name_1a", "col_name_1b"], first_config.primary_key_cols)
    self.assertEqual("ISO-456-7", first_config.encoding)
    self.assertEqual(",", first_config.separator)
    self.assertIsNone(first_config.custom_line_terminator)

    # YAML literal blocks are stripped of trailing/leading whitespace and
    # keep internal newlines.
    expected_col_1b_description = (
        "A column description that is long enough to take up\nmultiple lines. This"
        " text block will be interpreted\nliterally and trailing/leading whitespace"
        " is removed."
    )
    self.assertEqual(
        [
            RawTableColumnInfo(
                name="col_name_1a",
                is_datetime=False,
                description="First column.",
                known_values=[
                    ColumnEnumValueInfo(value="A", description="A description"),
                    ColumnEnumValueInfo(value="B", description=None),
                ],
            ),
            RawTableColumnInfo(
                name="col_name_1b",
                is_datetime=False,
                description=expected_col_1b_description,
            ),
            RawTableColumnInfo(
                name="undocumented_column", is_datetime=False, description=None
            ),
        ],
        first_config.columns,
    )

    second_config = region_config.raw_file_configs["file_tag_second"]
    expected_second_file_description = (
        "Some special/unusual character's in the description &\nlong enough to"
        " make a second line!\\n Trailing/leading white\nspace is stripped & the"
        " text block is interpreted literally."
    )
    self.assertEqual("file_tag_second", second_config.file_tag)
    self.assertEqual(expected_second_file_description, second_config.file_description)
    self.assertEqual(["col_name_2a"], second_config.primary_key_cols)
    self.assertEqual("UTF-8", second_config.encoding)
    self.assertEqual("$", second_config.separator)
    self.assertEqual(
        [
            RawTableColumnInfo(
                name="col_name_2a",
                is_datetime=False,
                description="column description",
            )
        ],
        second_config.columns,
    )

    tag_c_config = region_config.raw_file_configs["tagC"]
    self.assertEqual("tagC", tag_c_config.file_tag)
    self.assertEqual("tagC file description", tag_c_config.file_description)
    self.assertEqual(["COL1"], tag_c_config.primary_key_cols)
    self.assertEqual("UTF-8", tag_c_config.encoding)
    self.assertEqual(",", tag_c_config.separator)
    self.assertEqual(
        [
            RawTableColumnInfo(
                name="COL1", is_datetime=False, description=None, known_values=None
            )
        ],
        tag_c_config.columns,
    )

    # Non-UTF8, pipe-separated file.
    pipe_config = region_config.raw_file_configs["tagPipeSeparatedNonUTF8"]
    self.assertEqual("tagPipeSeparatedNonUTF8", pipe_config.file_tag)
    self.assertEqual(["PRIMARY_COL1"], pipe_config.primary_key_cols)
    self.assertEqual("ISO-8859-1", pipe_config.encoding)
    self.assertEqual("|", pipe_config.separator)