def testWriteToBq_MetadataMissing(mock_csv: mock.MagicMock,
                                  mock_download: mock.MagicMock):
    ctp = CovidTrackingProject()
    kwargs = {'filename': 'test_file.csv', 'table_name': 'output_table'}
    with pytest.raises(RuntimeError,
                       match=r'BigQuery call to dataset returned 0 rows'):
        ctp.write_to_bq('dataset', 'gcs_bucket', **kwargs)
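
# Each test in these examples receives mock objects as parameters, which
# implies @mock.patch decorators on the original test functions that were not
# captured in this excerpt. A hedged sketch of what that setup could look like;
# the patch targets are assumptions, not the repository's actual module paths.
# mock.patch injects arguments bottom-up: the decorator closest to the function
# definition supplies the first mock parameter.
#
# @mock.patch('ingestion.gcs_to_bq_util.download_first_blob_to_gcs')
# @mock.patch('ingestion.gcs_to_bq_util.load_csv_as_dataframe_from_gcs')
# @mock.patch('ingestion.gcs_to_bq_util.append_dataframe_to_bq')
# def testWriteToBq(mock_append_to_bq, mock_csv, mock_download):
#     ...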

# Example 2
def testWriteToBq(mock_append_to_bq: mock.MagicMock, mock_csv: mock.MagicMock,
                  mock_download: mock.MagicMock):
    ctp = CovidTrackingProject()
    kwargs = {'filename': 'test_file.csv',
              'metadata_table_id': 'test_metadata',
              'table_name': 'output_table'}
    ctp.write_to_bq('dataset', 'gcs_bucket', **kwargs)
    assert mock_append_to_bq.call_count == 4
    var_types = ['cases', 'deaths', 'tests', 'hosp']
    for i in range(len(var_types)):
        result = mock_append_to_bq.call_args_list[i].args[0]
        expected_rows = (_RACE_CATEGORIES - 1) * _NUM_ROWS
        expected_col_names = [
            'date', 'state_postal_abbreviation', 'race',
            var_types[i], 'reports_race', 'race_ethnicity_separately']
        assert result.shape == (expected_rows, len(expected_col_names))
        assert set(result.columns) == set(expected_col_names)
        expected_ind_rows = {'cases': 1, 'deaths': 1}
        assert (len(result.loc[
            result['race'] == col_std.Race.INDIGENOUS.value].index) ==
            expected_ind_rows.get(var_types[i], 0))
        expected_api_rows = {'cases': 4, 'deaths': 2}
        assert (len(result.loc[
            result['race'] == col_std.Race.API.value].index) ==
            expected_api_rows.get(var_types[i], 0))
        expected_dtypes = {col: object for col in result.columns}
        expected_dtypes[var_types[i]] = np.float64
        for col in result.columns:
            assert result[col].dtype == expected_dtypes[col]
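
# Example 3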
def testWriteToBq(mock_append_to_bq: mock.MagicMock, mock_csv: mock.MagicMock,
                  mock_download: mock.MagicMock):
    ctp = CovidTrackingProject()
    kwargs = {
        'filename': 'test_file.csv',
        'metadata_table_id': 'test_metadata',
        'table_name': 'output_table'
    }
    ctp.write_to_bq('dataset', 'gcs_bucket', **kwargs)
    mock_append_to_bq.assert_called_once()
    result = mock_append_to_bq.call_args.args[0]
    expected_rows = (_RACE_CATEGORIES - 1) * _VARIABLE_TYPES * _NUM_ROWS
    expected_cols = 7
    assert result.shape == (expected_rows, expected_cols)
    expected_col_names = [
        'date', 'state_postal_abbreviation', 'race', 'variable_type', 'value',
        'reports_race', 'race_ethnicity_separately'
    ]
    assert set(result.columns) == set(expected_col_names)
    assert len(
        result.loc[result['race'] == col_std.Race.INDIGENOUS.value].index) == 2
    assert len(result.loc[result['race'] == col_std.Race.API.value].index) == 6
    expected_dtypes = {col: object for col in result.columns}
    expected_dtypes['date'] = np.dtype('datetime64[ns]')
    expected_dtypes['value'] = np.float64
    for col in result.columns:
        assert result[col].dtype == expected_dtypes[col]
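
# The shape assertions above rely on module-level fixture constants that are
# not shown in this excerpt. A minimal sketch of how they might be defined;
# the values are placeholders tied to a hypothetical test CSV, not the
# repository's actual numbers.
#
# _NUM_ROWS = 5            # date/state rows in the raw test fixture
# _RACE_CATEGORIES = 8     # race categories in the fixture; one is excluded on upload
# _VARIABLE_TYPES = 4      # cases, deaths, tests, hosp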

# Example 4
def testStandardize():
    ctp = CovidTrackingProject()
    df = get_test_data_as_df()
    df = ctp.standardize(df)

    expected_cols = {
        'date', col_std.STATE_POSTAL_COL, col_std.RACE_COL, 'variable_type',
        'value'
    }
    assert set(df.columns) == expected_cols

    expected_race_categories = [r.race for r in col_std.Race]
    assert set(df[col_std.RACE_COL]).issubset(set(expected_race_categories))
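
# Example 5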
def testWriteToBq_MissingAttr():
    ctp = CovidTrackingProject()
    kwargs = {}
    with pytest.raises(RuntimeError, match=r'filename not found'):
        ctp.write_to_bq('dataset', 'gcs_bucket', **kwargs)

    kwargs = {'filename': 'test_file.csv'}
    with pytest.raises(RuntimeError, match=r'metadata_table_id not found'):
        ctp.write_to_bq('dataset', 'gcs_bucket', **kwargs)

    kwargs = {
        'filename': 'test_file.csv',
        'metadata_table_id': 'test_metadata'
    }
    with pytest.raises(RuntimeError, match=r'table_name not found'):
        ctp.write_to_bq('dataset', 'gcs_bucket', **kwargs)
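
# The assertions in testWriteToBq_MissingAttr suggest that write_to_bq checks
# its keyword arguments up front and raises "<attr> not found" for the first
# missing one. A minimal sketch of that validation as a standalone helper; the
# helper name and its use inside write_to_bq are assumptions, only the error
# message pattern comes from the tests above.
def _check_required_attrs(attrs, required):
    """Raise RuntimeError for the first required attribute missing from attrs."""
    for attr in required:
        if attr not in attrs:
            raise RuntimeError(f'{attr} not found')

# Hypothetical use inside write_to_bq:
# _check_required_attrs(attrs, ['filename', 'metadata_table_id', 'table_name'])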

# Example 6
def testMergeWithMetadata():
    ctp = CovidTrackingProject()
    df = get_test_data_as_df()
    mdf = get_test_metadata_as_df()

    df = ctp.standardize(df)
    df = ctp.merge_with_metadata(df, mdf)

    expected_cols = {
        'date', col_std.STATE_POSTAL_COL, col_std.RACE_COL, 'variable_type',
        'value', 'reports_race', 'race_ethnicity_separately'
    }
    assert set(df.columns) == expected_cols

    expected_race_categories = {
        col_std.Race.AIAN.race, col_std.Race.API.race, col_std.Race.ASIAN.race,
        col_std.Race.BLACK.race, col_std.Race.HISP.race,
        col_std.Race.INDIGENOUS.race, col_std.Race.NHPI.race,
        col_std.Race.MULTI.race, col_std.Race.WHITE.race, col_std.Race.NH.race,
        col_std.Race.ETHNICITY_UNKNOWN.race,
        col_std.Race.OTHER_NONSTANDARD.race, col_std.Race.UNKNOWN.race,
        col_std.Race.TOTAL.race
    }
    assert set(df[col_std.RACE_COL]) == expected_race_categories
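
# testMergeWithMetadata expects two metadata columns, 'reports_race' and
# 'race_ethnicity_separately', to be attached to every standardized row. A
# minimal sketch of such a merge, assuming the metadata frame is keyed by the
# state postal code column; the function name and the join key are
# assumptions, not the repository's actual implementation.
import pandas as pd

def merge_with_metadata_sketch(df: pd.DataFrame, mdf: pd.DataFrame) -> pd.DataFrame:
    # Left-join so every data row keeps its state-level metadata flags.
    metadata_cols = [col_std.STATE_POSTAL_COL,
                     'reports_race', 'race_ethnicity_separately']
    return df.merge(mdf[metadata_cols], on=col_std.STATE_POSTAL_COL, how='left')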

# Example 7
from datasources.acs_population import ACSPopulation
from datasources.cdc_covid_deaths import CDCCovidDeaths
from datasources.county_adjacency import CountyAdjacency
from datasources.county_names import CountyNames
from datasources.covid_tracking_project import CovidTrackingProject
from datasources.covid_tracking_project_metadata import CtpMetadata
from datasources.household_income import HouseholdIncome
from datasources.manual_uploads import ManualUploads
from datasources.primary_care_access import PrimaryCareAccess
from datasources.state_names import StateNames
from datasources.urgent_care_facilities import UrgentCareFacilities


# Map of data source ID to the class that implements the ingestion methods for
# that data source.
DATA_SOURCES_DICT = {
    ACSPopulation.get_id(): ACSPopulation(),
    CDCCovidDeaths.get_id(): CDCCovidDeaths(),
    CountyAdjacency.get_id(): CountyAdjacency(),
    CountyNames.get_id(): CountyNames(),
    CovidTrackingProject.get_id(): CovidTrackingProject(),
    CtpMetadata.get_id(): CtpMetadata(),
    HouseholdIncome.get_id(): HouseholdIncome(),
    ManualUploads.get_id(): ManualUploads(),
    PrimaryCareAccess.get_id(): PrimaryCareAccess(),
    StateNames.get_id(): StateNames(),
    UrgentCareFacilities.get_id(): UrgentCareFacilities(),
}
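

# A hedged sketch of how this registry might be used to dispatch an ingestion
# run by data source ID; `run_ingestion` and its signature are assumptions and
# are not part of the module above.
def run_ingestion(source_id: str, dataset: str, gcs_bucket: str, **attrs):
    """Look up a data source by ID and write its data to BigQuery."""
    if source_id not in DATA_SOURCES_DICT:
        raise ValueError(f'Unknown data source ID: {source_id}')
    DATA_SOURCES_DICT[source_id].write_to_bq(dataset, gcs_bucket, **attrs)

# Example call, mirroring the kwargs used in the tests above:
# run_ingestion(CovidTrackingProject.get_id(), 'dataset', 'gcs_bucket',
#               filename='test_file.csv',
#               metadata_table_id='test_metadata',
#               table_name='output_table')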