def testWriteToBq_MetadataMissing(mock_csv: mock.MagicMock,
                                  mock_download: mock.MagicMock):
    """write_to_bq should raise when the metadata lookup returns no rows."""
    ingester = CovidTrackingProject()
    attrs = {'filename': 'test_file.csv', 'table_name': 'output_table'}
    expected_msg = r'BigQuery call to dataset returned 0 rows'
    with pytest.raises(RuntimeError, match=expected_msg):
        ingester.write_to_bq('dataset', 'gcs_bucket', **attrs)
def testWriteToBq(mock_append_to_bq: mock.MagicMock,
                  mock_csv: mock.MagicMock,
                  mock_download: mock.MagicMock):
    """write_to_bq should append one table per variable type, each with the
    expected shape, columns, per-race row counts, and dtypes."""
    ctp = CovidTrackingProject()
    kwargs = {
        'filename': 'test_file.csv',
        'metadata_table_id': 'test_metadata',
        'table_name': 'output_table',
    }
    ctp.write_to_bq('dataset', 'gcs_bucket', **kwargs)
    assert mock_append_to_bq.call_count == 4

    var_types = ['cases', 'deaths', 'tests', 'hosp']
    for i, var_type in enumerate(var_types):
        result = mock_append_to_bq.call_args_list[i].args[0]
        expected_rows = (_RACE_CATEGORIES - 1) * _NUM_ROWS
        expected_col_names = [
            'date', 'state_postal_abbreviation', 'race', var_type,
            'reports_race', 'race_ethnicity_separately']
        assert result.shape == (expected_rows, len(expected_col_names))
        assert set(result.columns) == set(expected_col_names)

        # Only cases/deaths report these combined categories in the fixture.
        expected_ind_rows = {'cases': 1, 'deaths': 1}
        assert (len(result.loc[
            result['race'] == col_std.Race.INDIGENOUS.value].index) ==
            expected_ind_rows.get(var_type, 0))
        expected_api_rows = {'cases': 4, 'deaths': 2}
        assert (len(result.loc[
            result['race'] == col_std.Race.API.value].index) ==
            expected_api_rows.get(var_type, 0))

        # np.object was deprecated in NumPy 1.20 and removed in 1.24;
        # the builtin `object` is the equivalent dtype.
        expected_dtypes = {col: object for col in result.columns}
        expected_dtypes[var_type] = np.float64
        for col in result.columns:
            assert result[col].dtype == expected_dtypes[col]
def testWriteToBq(mock_append_to_bq: mock.MagicMock,
                  mock_csv: mock.MagicMock,
                  mock_download: mock.MagicMock):
    """write_to_bq should append a single long-format table with the expected
    shape, columns, per-race row counts, and dtypes."""
    ctp = CovidTrackingProject()
    kwargs = {
        'filename': 'test_file.csv',
        'metadata_table_id': 'test_metadata',
        'table_name': 'output_table',
    }
    ctp.write_to_bq('dataset', 'gcs_bucket', **kwargs)
    mock_append_to_bq.assert_called_once()

    result = mock_append_to_bq.call_args.args[0]
    expected_rows = (_RACE_CATEGORIES - 1) * _VARIABLE_TYPES * _NUM_ROWS
    expected_cols = 7
    assert result.shape == (expected_rows, expected_cols)
    expected_col_names = [
        'date', 'state_postal_abbreviation', 'race', 'variable_type',
        'value', 'reports_race', 'race_ethnicity_separately'
    ]
    assert set(result.columns) == set(expected_col_names)
    assert len(
        result.loc[result['race'] == col_std.Race.INDIGENOUS.value].index) == 2
    assert len(result.loc[result['race'] == col_std.Race.API.value].index) == 6

    # np.object was deprecated in NumPy 1.20 and removed in 1.24;
    # the builtin `object` is the equivalent dtype.
    expected_dtypes = {col: object for col in result.columns}
    expected_dtypes['date'] = np.dtype('datetime64[ns]')
    expected_dtypes['value'] = np.float64
    for col in result.columns:
        assert result[col].dtype == expected_dtypes[col]
def testStandardize():
    """standardize should produce the long-format columns and only emit
    race values drawn from the known Race enum."""
    ingester = CovidTrackingProject()
    standardized = ingester.standardize(get_test_data_as_df())
    expected_cols = {
        'date', col_std.STATE_POSTAL_COL, col_std.RACE_COL,
        'variable_type', 'value'
    }
    assert set(standardized.columns) == expected_cols
    known_races = {member.race for member in col_std.Race}
    assert set(standardized[col_std.RACE_COL]) <= known_races
def testWriteToBq_MissingAttr():
    """Each required kwarg, when absent, should raise a RuntimeError naming
    the missing attribute (checked in dependency order)."""
    ingester = CovidTrackingProject()
    provided = {}
    required = [
        ('filename', 'test_file.csv'),
        ('metadata_table_id', 'test_metadata'),
        ('table_name', 'output_table'),
    ]
    for attr, value in required:
        with pytest.raises(RuntimeError, match=f'{attr} not found'):
            ingester.write_to_bq('dataset', 'gcs_bucket', **provided)
        # Supply this attribute so the next iteration fails on the next one.
        provided[attr] = value
def testMergeWithMetadata():
    """Merging standardized data with metadata should add the metadata
    columns and yield exactly the expected set of race categories."""
    ingester = CovidTrackingProject()
    merged = ingester.merge_with_metadata(
        ingester.standardize(get_test_data_as_df()),
        get_test_metadata_as_df())
    expected_cols = {
        'date', col_std.STATE_POSTAL_COL, col_std.RACE_COL,
        'variable_type', 'value', 'reports_race',
        'race_ethnicity_separately'
    }
    assert set(merged.columns) == expected_cols
    expected_members = [
        col_std.Race.AIAN, col_std.Race.API, col_std.Race.ASIAN,
        col_std.Race.BLACK, col_std.Race.HISP, col_std.Race.INDIGENOUS,
        col_std.Race.NHPI, col_std.Race.MULTI, col_std.Race.WHITE,
        col_std.Race.NH, col_std.Race.ETHNICITY_UNKNOWN,
        col_std.Race.OTHER_NONSTANDARD, col_std.Race.UNKNOWN,
        col_std.Race.TOTAL,
    ]
    expected_race_categories = {member.race for member in expected_members}
    assert set(merged[col_std.RACE_COL]) == expected_race_categories
def testWriteToBq_MissingAttr():
    """Calling write_to_bq with no kwargs should raise for the missing
    filename attribute."""
    ingester = CovidTrackingProject()
    no_attrs = {}
    with pytest.raises(RuntimeError, match=r'filename not found'):
        ingester.write_to_bq('dataset', 'gcs_bucket', **no_attrs)
from datasources.acs_population import ACSPopulation
from datasources.cdc_covid_deaths import CDCCovidDeaths
from datasources.county_adjacency import CountyAdjacency
from datasources.county_names import CountyNames
from datasources.covid_tracking_project import CovidTrackingProject
from datasources.covid_tracking_project_metadata import CtpMetadata
from datasources.household_income import HouseholdIncome
from datasources.manual_uploads import ManualUploads
from datasources.primary_care_access import PrimaryCareAccess
from datasources.state_names import StateNames
from datasources.urgent_care_facilities import UrgentCareFacilities

# All ingestion data source classes, in registration order.
_DATA_SOURCE_CLASSES = [
    ACSPopulation,
    CDCCovidDeaths,
    CountyAdjacency,
    CountyNames,
    CovidTrackingProject,
    CtpMetadata,
    HouseholdIncome,
    ManualUploads,
    PrimaryCareAccess,
    StateNames,
    UrgentCareFacilities,
]

# Map of data source ID to the class that implements the ingestion methods for
# that data source.
DATA_SOURCES_DICT = {cls.get_id(): cls() for cls in _DATA_SOURCE_CLASSES}