def write_to_bq(self, dataset, gcs_bucket, **attrs):
    gcs_files = self.get_attr(attrs, 'filename')

    # In this instance, we expect filename to be a string with
    # comma-separated CSV filenames.
    if ',' not in gcs_files:
        raise ValueError('filename passed to write_to_bq is not a '
                         'comma-separated list of files')
    files = gcs_files.split(',')
    print("Files that will be written to BQ:", files)

    # For each of the files, we load it as a dataframe and add it as a
    # table in the BigQuery dataset. We expect that all aggregation and
    # standardization of the data has been done by this point.
    int_cols = [std_col.COVID_CASES, std_col.COVID_HOSP_Y,
                std_col.COVID_HOSP_N, std_col.COVID_HOSP_UNKNOWN,
                std_col.COVID_DEATH_Y, std_col.COVID_DEATH_N,
                std_col.COVID_DEATH_UNKNOWN]
    for f in files:
        # Explicitly specify county_fips is a string.
        df = gcs_to_bq_util.load_csv_as_dataframe(
            gcs_bucket, f, dtype={'county_fips': str})

        # All columns are str, except outcome columns.
        column_types = {c: 'STRING' for c in df.columns}
        for col in int_cols:
            if col in column_types:
                column_types[col] = 'INT64'

        # Clean up column names.
        self.clean_frame_column_names(df)

        table_name = f.replace('.csv', '')  # Table name is file name
        gcs_to_bq_util.append_dataframe_to_bq(
            df, dataset, table_name, column_types=column_types)
def write_to_bq(self, dataset, gcs_bucket):
    # Get an ACS mapping of FIPS codes to state names and county codes to
    # county names.
    self.get_state_fips_mapping()
    self.get_county_fips_mapping()

    # Pull data from GCS and aggregate in memory.
    self.get_health_insurance_data_by_sex(
        use_gcs=True, gcs_bucket=gcs_bucket)
    self.get_health_insurance_data_by_race(
        use_gcs=True, gcs_bucket=gcs_bucket)

    # Split internal memory into data frames for sex/race by state/county.
    self.split_data_frames()

    # Create BQ columns and write dataframes to BQ.
    for table_name, df in self.frames.items():
        # All breakdown columns are strings.
        column_types = {c: 'STRING' for c in df.columns}
        column_types[WITH_HEALTH_INSURANCE_COL] = 'INT64'
        column_types[WITHOUT_HEALTH_INSURANCE_COL] = 'INT64'
        column_types[TOTAL_HEALTH_INSURANCE_COL] = 'INT64'

        gcs_to_bq_util.append_dataframe_to_bq(
            df, dataset, table_name, column_types=column_types)
def write_to_bq(self, dataset, gcs_bucket, filename): """Writes state names to BigQuery from the provided GCS bucket dataset: The BigQuery dataset to write to table_name: The name of the biquery table to write to gcs_bucket: The name of the gcs bucket to read the data from filename: The name of the file in the gcs bucket to read from""" try: frame = gcs_to_bq_util.load_values_as_dataframe( gcs_bucket, filename) frame = frame.rename(columns={ 'state': 'state_fips_code', 'NAME': 'state_name' }) column_types = { 'state_fips_code': 'STRING', 'state_name': 'STRING' } gcs_to_bq_util.append_dataframe_to_bq( frame, dataset, self.get_staging_table_name(), column_types=column_types) except json.JSONDecodeError as err: logging.error( 'Unable to write to BigQuery due to improperly formatted data: %s', err)
def write_to_bq(self, dataset, gcs_bucket, **attrs):
    gcs_files = self.get_attr(attrs, 'filename')

    # In this instance, we expect filename to be a string with
    # comma-separated CSV filenames.
    if ',' not in gcs_files:
        raise ValueError('filename passed to write_to_bq is not a '
                         'comma-separated list of files')
    files = gcs_files.split(',')

    # For each of the files, we load it as a dataframe and add it as a
    # table in the BigQuery dataset. We expect that all aggregation and
    # standardization of the data has been done by this point.
    str_cols = [
        std_col.COUNTY_NAME_COL, std_col.STATE_NAME_COL,
        std_col.RACE_OR_HISPANIC_COL, std_col.AGE_COL, std_col.SEX_COL
    ]
    for f in files:
        df = gcs_to_bq_util.load_csv_as_dataframe(gcs_bucket, f)

        # All columns are int, except certain geo and breakdown columns.
        column_types = {c: 'INT64' for c in df.columns}
        for col in str_cols:
            if col in column_types:
                column_types[col] = 'STRING'

        # Clean up column names.
        self.clean_frame_column_names(df)

        table_name = f.removesuffix('.csv')  # Table name is file name
        gcs_to_bq_util.append_dataframe_to_bq(
            df, dataset, table_name, column_types=column_types)
def write_to_bq(self, dataset, gcs_bucket, filename): """Writes county adjacencies to BigQuery from the provided GCS bucket dataset: The BigQuery dataset to write to table_name: The name of the biquery table to write to gcs_bucket: The name of the gcs bucket to read the data from filename: The name of the file in the gcs bucket to read from""" frame = gcs_to_bq_util.load_csv_as_dataframe(gcs_bucket, filename, dtype={ 'fipscounty': 'string', 'fipsneighbor': 'string' }) frame = frame[['fipscounty', 'fipsneighbor']] frame = frame.rename(columns={ 'fipscounty': 'county_geoid', 'fipsneighbor': 'neighbor_geoids' }) frame = frame.groupby('county_geoid', as_index=False).agg(list) column_types = {'county_geoid': 'STRING', 'neighbor_geoids': 'STRING'} col_modes = {'neighbor_geoids': 'REPEATED'} gcs_to_bq_util.append_dataframe_to_bq(frame, dataset, self.get_table_name(), column_types=column_types, col_modes=col_modes)
def testAppendDataframeToBq_IgnoreColModes(self):
    """Tests that col_modes is ignored when no column_types are provided
    to append_dataframe_to_bq."""
    test_frame = DataFrame(
        data=self._test_data[1:],
        columns=self._test_data[0],
        index=[1, 2])

    with patch('ingestion.gcs_to_bq_util.bigquery.Client') as mock_client:
        # Set up mock calls.
        mock_instance = mock_client.return_value
        mock_table = Mock()
        mock_instance.dataset.return_value = mock_table
        mock_table.table.return_value = 'test-project.test-dataset.table'

        gcs_to_bq_util.append_dataframe_to_bq(
            test_frame.copy(deep=True), "test-dataset", "table",
            col_modes={'label1': 'REPEATED', 'label2': 'REQUIRED'})

        mock_instance.load_table_from_json.assert_called()
        call_args = mock_instance.load_table_from_json.call_args
        test_frame['ingestion_ts'] = datetime(
            2020, 1, 1, tzinfo=timezone.utc).strftime(
                "%Y-%m-%d %H:%M:%S.%f %Z")
        self.assertEqual(call_args.args[0],
                         json.loads(test_frame.to_json(orient='records')))

        job_config = call_args.kwargs['job_config']
        self.assertTrue(job_config.autodetect)
def write_to_bq(self, dataset, gcs_bucket, **attrs):
    gcs_file = self.get_attr(attrs, 'filename')

    # Download the raw data.
    df = gcs_to_bq_util.load_csv_as_dataframe(gcs_bucket, gcs_file)
    self.clean_frame_column_names(df)

    # Standardize the data.
    # The metadata currently only has information for cases and deaths,
    # not tests or hospitalizations.
    keep_cols = [
        'state_postal_abbreviation',
        'api_death', 'defines_other_death', 'race_ethnicity_separately_death',
        'race_ethnicity_combined_death', 'race_mutually_exclusive_death',
        'combined_category_other_than_api_death', 'race_death',
        'ethnicity_death',
        'api_cases', 'defines_other_cases', 'race_ethnicity_separately_cases',
        'race_ethnicity_combined_cases', 'race_mutually_exclusive_cases',
        'combined_category_other_than_api_cases', 'race_cases',
        'ethnicity_cases']
    df = df[keep_cols]
    df = df.melt(id_vars=['state_postal_abbreviation'])
    df[['col_name', 'variable_type']] = df.variable.str.rsplit(
        '_', 1, expand=True)
    df.drop('variable', axis=1, inplace=True)
    df = df.pivot(
        index=['state_postal_abbreviation', 'variable_type'],
        columns='col_name', values='value').reset_index()
    df.replace({'variable_type': {'death': 'deaths'}}, inplace=True)
    df.rename_axis(None, inplace=True)
    df.rename(columns=self._metadata_columns_map(), inplace=True)

    # Write to BQ.
    gcs_to_bq_util.append_dataframe_to_bq(df, dataset, self.get_table_name())
def write_to_bq(self, dataset, gcs_bucket, **attrs):
    filename = self.get_attr(attrs, 'filename')
    df = gcs_to_bq_util.load_csv_as_dataframe(
        gcs_bucket, filename, parse_dates=['Date'], thousands=',')
    self.clean_frame_column_names(df)

    # Massage the data into the standard format.
    df.drop(columns=[
        'cases_latinx', 'deaths_latinx', 'hosp_latinx', 'tests_latinx'
    ], inplace=True)
    df = df.melt(id_vars=['date', 'state'])
    df[['variable_type', col_std.RACE_COL]] = df.variable.str.split(
        "_", 1, expand=True)
    df.drop('variable', axis=1, inplace=True)
    df.rename(columns={'state': 'state_postal_abbreviation'}, inplace=True)
    df.replace({col_std.RACE_COL: self.get_standard_columns()}, inplace=True)
    df['date'] = df['date'].map(lambda ts: ts.strftime("%Y-%m-%d"))

    # Get the metadata table.
    metadata = self._download_metadata(dataset)
    if len(metadata.index) == 0:
        raise RuntimeError(
            'BigQuery call to {} returned 0 rows'.format(dataset))

    # Merge the tables.
    merged = pd.merge(df, metadata, how='left',
                      on=['state_postal_abbreviation', 'variable_type'])

    # Rename combined race categories.
    self._rename_race_category(merged, 'reports_api', Race.ASIAN, Race.API)
    self._rename_race_category(merged, 'reports_ind', Race.AIAN,
                               Race.INDIGENOUS)
    merged.drop(columns=['reports_api', 'reports_ind'], inplace=True)

    # Split into separate tables by variable type.
    for variable_type in ['cases', 'deaths', 'tests', 'hosp']:
        result = merged.copy()
        result = result.loc[result['variable_type'] == variable_type]
        result.rename(columns={'value': variable_type}, inplace=True)
        result.drop('variable_type', axis='columns', inplace=True)

        # Write to BQ.
        gcs_to_bq_util.append_dataframe_to_bq(
            result, dataset, self.get_table_name() + '_' + variable_type)
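
# Minimal standalone sketch (made-up data) of the melt + str.split reshaping
# above: wide columns like 'cases_black' are unpivoted into rows and then
# split into a variable_type ('cases') and a race ('black') column. Only the
# reshaping is shown; the real method also merges metadata and renames
# combined race categories.
import pandas as pd

wide = pd.DataFrame({
    'date': ['2020-04-12'],
    'state': ['AK'],
    'cases_black': [10],
    'deaths_black': [1],
})
long = wide.melt(id_vars=['date', 'state'])
long[['variable_type', 'race']] = long.variable.str.split(
    '_', n=1, expand=True)
long = long.drop(columns='variable')
# long now has one row per (date, state, variable_type, race) with its value.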
def write_to_bq_table(self, dataset: str, gcs_bucket: str, filename: str,
                      table_name: str, project=None):
    """Writes source data from GCS bucket to BigQuery

    dataset: The BigQuery dataset to write to
    gcs_bucket: The name of the gcs bucket to read the data from
    filename: The name of the file in the gcs bucket to read from
    table_name: The name of the BigQuery table to write to"""
    chunked_frame = gcs_to_bq_util.load_csv_as_dataframe(
        gcs_bucket, filename, chunksize=1000)
    for chunk in chunked_frame:
        self.clean_frame_column_names(chunk)
        gcs_to_bq_util.append_dataframe_to_bq(
            chunk, dataset, table_name, project=project)
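
# Standalone sketch of the chunking pattern above, assuming
# load_csv_as_dataframe forwards chunksize to pandas.read_csv. With a
# chunksize, pandas returns an iterator of DataFrames rather than a single
# frame, so each chunk can be cleaned and appended to the table without
# holding the whole file in memory. Uses an in-memory CSV instead of GCS.
import io
import pandas as pd

csv_text = "county_fips,population\n01001,55869\n01003,223234\n01005,24686\n"
for chunk in pd.read_csv(io.StringIO(csv_text), chunksize=2, dtype=str):
    # Each chunk is an ordinary DataFrame of at most `chunksize` rows;
    # the real code appends it to the BQ table here.
    print(len(chunk), list(chunk.columns))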
def write_to_bq(self, dataset, gcs_bucket): """Writes population data to BigQuery from the provided GCS bucket dataset: The BigQuery dataset to write to gcs_bucket: The name of the gcs bucket to read the data from""" # TODO change this to have it read metadata from GCS bucket metadata = fetch_acs_metadata(self.base_acs_url) var_map = parse_acs_metadata(metadata, list(GROUPS.keys())) race_and_hispanic_frame = gcs_to_bq_util.load_values_as_dataframe( gcs_bucket, self.get_filename(HISPANIC_BY_RACE_CONCEPT)) race_and_hispanic_frame = update_col_types(race_and_hispanic_frame) race_and_hispanic_frame = standardize_frame( race_and_hispanic_frame, get_vars_for_group(HISPANIC_BY_RACE_CONCEPT, var_map, 2), [HISPANIC_COL, RACE_COL], self.county_level, POPULATION_COL) total_frame = gcs_to_bq_util.load_values_as_dataframe( gcs_bucket, self.add_filename_suffix(TOTAL_POP_VARIABLE_ID)) total_frame = update_col_types(total_frame) total_frame = standardize_frame(total_frame, {TOTAL_POP_VARIABLE_ID: ['Total']}, [RACE_OR_HISPANIC_COL], self.county_level, POPULATION_COL) sex_by_age_frames = {} for concept in SEX_BY_AGE_CONCEPTS_TO_RACE: sex_by_age_frame = gcs_to_bq_util.load_values_as_dataframe( gcs_bucket, self.get_filename(concept)) sex_by_age_frame = update_col_types(sex_by_age_frame) sex_by_age_frames[concept] = sex_by_age_frame frames = { self.get_staging_table_name_by_race(): self.get_all_races_frame(race_and_hispanic_frame, total_frame), self.get_staging_table_name_by_sex_age_race(): self.get_sex_by_age_and_race(var_map, sex_by_age_frames) } for table_name, df in frames.items(): # All breakdown columns are strings column_types = {c: 'STRING' for c in df.columns} column_types[POPULATION_COL] = 'INT64' gcs_to_bq_util.append_dataframe_to_bq(df, dataset, table_name, column_types=column_types)
def write_to_bq(self, dataset, bucket):
    self.getData(bucket)

    # Split internal memory into data frames for sex/race by state/county.
    self.split_data_frames()

    # Create BQ columns and write dataframes to BQ.
    for table_name, df in self.frames.items():
        # All breakdown columns are strings.
        column_types = {c: "STRING" for c in df.columns}
        column_types[POPULATION_COL] = "INT64"

        gcs_to_bq_util.append_dataframe_to_bq(
            df, dataset, table_name, column_types=column_types)
def write_to_bq(self, dataset, gcs_bucket, **attrs):
    filename = self.get_attr(attrs, 'filename')
    metadata_table_id = self.get_attr(attrs, 'metadata_table_id')
    table_name = self.get_attr(attrs, 'table_name')
    df = gcs_to_bq_util.load_csv_as_dataframe(
        gcs_bucket, filename, parse_dates=['Date'], thousands=',')
    self.clean_frame_column_names(df)

    # Massage the data into the standard format.
    df.drop(columns=[
        'cases_latinx', 'deaths_latinx', 'hosp_latinx', 'tests_latinx'
    ], inplace=True)
    df = df.melt(id_vars=['date', 'state'])
    df[['variable_type', col_std.RACE_COL]] = df.variable.str.split(
        "_", 1, expand=True)
    df.drop('variable', axis=1, inplace=True)
    df.rename(columns={'state': 'state_postal_abbreviation'}, inplace=True)
    df.replace({col_std.RACE_COL: self.get_standard_columns()}, inplace=True)

    # Get the metadata table.
    metadata = self._download_metadata(metadata_table_id)
    if len(metadata.index) == 0:
        raise RuntimeError('BigQuery call to {} returned 0 rows'.format(
            metadata_table_id))

    # Merge the tables.
    merged = pd.merge(df, metadata, how='left',
                      on=['state_postal_abbreviation', 'variable_type'])

    # Rename combined race categories.
    self._rename_race_category(merged, 'reports_api', Race.ASIAN, Race.API)
    self._rename_race_category(merged, 'reports_ind', Race.AIAN,
                               Race.INDIGENOUS)
    merged.drop(columns=['reports_api', 'reports_ind'], inplace=True)

    # Write to BQ.
    gcs_to_bq_util.append_dataframe_to_bq(merged, dataset, table_name)
def write_to_bq(self, dataset, gcs_bucket, filename): """Writes primary care access stats to BigQuery from bucket dataset: The BigQuery dataset to write to table_name: The name of the biquery table to write to gcs_bucket: The name of the gcs bucket to read the data from filename: The prefix of files in the landing bucket to read from""" client = storage.Client() bucket = client.get_bucket(gcs_bucket) data = [] for state_name in constants.STATE_NAMES: filename = self._FILEPATH.format(filename, state_name) blob = bucket.blob(filename) local_path = '/tmp/{}'.format(filename) blob.download_to_filename(local_path) frame = read_excel(io=local_path, sheet_name='Ranked Measure Data', skiprows=[0, 2]) for _, row in frame.iterrows(): # These fields may not be set for every county. # If they're not set, we'll use -1 as the numerical value # Number of physicians in the county num_physicians = row[108] if not math.isnan(row[108]) else -1 # Primary Care Physicians per 100,000 population physicians_rate = row[109] if not math.isnan(row[108]) else -1 row = [row[0], row[1], row[2], num_physicians, physicians_rate] data.append(row) new_dataframe = DataFrame(data=data, columns=('county_fips_code', 'state_name', 'county_name', 'num_primary_care_physicians', 'primary_care_physicians_rate')) column_types = { 'county_fips_code': 'INT64', 'state_name': 'STRING', 'county_name': 'STRING', 'num_primary_care_physicians': 'FLOAT64', 'primary_care_physicians_rate': 'FLOAT64', } gcs_to_bq_util.append_dataframe_to_bq(new_dataframe, dataset, self.get_staging_table_name(), column_types=column_types)
def testAppendDataframeToBq_SpecifySchema(self):
    """Tests that the BigQuery schema is properly defined when column_types
    are provided to append_dataframe_to_bq."""
    test_frame = DataFrame(
        data=self._test_data[1:],
        columns=self._test_data[0],
        index=[1, 2])

    with patch('ingestion.gcs_to_bq_util.bigquery.Client') as mock_client:
        # Set up mock calls.
        mock_instance = mock_client.return_value
        mock_table = Mock()
        mock_instance.dataset.return_value = mock_table
        mock_table.table.return_value = 'test-project.test-dataset.table'

        column_types = {label: 'STRING' for label in test_frame.columns}
        col_modes = {'label1': 'REPEATED', 'label2': 'REQUIRED'}
        gcs_to_bq_util.append_dataframe_to_bq(
            test_frame.copy(deep=True), 'test-dataset', 'table',
            column_types=column_types, col_modes=col_modes)

        mock_instance.load_table_from_json.assert_called()
        call_args = mock_instance.load_table_from_json.call_args
        test_frame['ingestion_ts'] = datetime(
            2020, 1, 1, tzinfo=timezone.utc).strftime(
                '%Y-%m-%d %H:%M:%S.%f %Z')
        self.assertEqual(call_args.args[0],
                         json.loads(test_frame.to_json(orient='records')))

        job_config = call_args.kwargs['job_config']
        self.assertFalse(job_config.autodetect)

        expected_cols = ['label1', 'label2', 'label3', 'ingestion_ts']
        expected_types = ['STRING', 'STRING', 'STRING', 'TIMESTAMP']
        expected_modes = ['REPEATED', 'REQUIRED', 'NULLABLE', 'NULLABLE']
        self.assertListEqual(
            [field.name for field in job_config.schema], expected_cols)
        self.assertListEqual(
            [field.field_type for field in job_config.schema], expected_types)
        self.assertListEqual(
            [field.mode for field in job_config.schema], expected_modes)
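
# Standalone sketch (not the library's actual implementation) of how a
# column_types/col_modes pair like the one asserted in the test above can be
# turned into a BigQuery schema for a LoadJobConfig. The helper name
# build_schema is hypothetical; the column names and modes mirror the test.
from google.cloud import bigquery


def build_schema(column_types, col_modes=None):
    # Default any column without an explicit mode to NULLABLE, matching the
    # expected_modes the test checks for label3 and ingestion_ts.
    col_modes = col_modes or {}
    return [
        bigquery.SchemaField(
            name, field_type, mode=col_modes.get(name, 'NULLABLE'))
        for name, field_type in column_types.items()
    ]


column_types = {'label1': 'STRING', 'label2': 'STRING',
                'label3': 'STRING', 'ingestion_ts': 'TIMESTAMP'}
col_modes = {'label1': 'REPEATED', 'label2': 'REQUIRED'}
job_config = bigquery.LoadJobConfig(
    schema=build_schema(column_types, col_modes), autodetect=False)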
def write_to_bq(self, dataset, gcs_bucket, filename): """Fetches all SAIPE blobs from a GCS bucket and uploads to a single BQ table. Also does some preprocessing. dataset: The BigQuery dataset to write to table_name: The name of the BigQuery table to write to gcs_bucket: The name of the GCS bucket to pull from filename: File name prefix used to identify which GCS blobs to fetch""" client = storage.Client() saipe_blobs = client.list_blobs(gcs_bucket, prefix=filename) frames = [] for blob in saipe_blobs: frame = gcs_to_bq_util.load_values_blob_as_dataframe(blob) frames.append(frame) concat = pandas.concat(frames, ignore_index=True) # The SAIPE API includes the query predicate columns, which are duplicates of their # ALL_CAPS counterparts. Toss 'em. concat.drop(columns=['state', 'county', 'time'], inplace=True) gcs_to_bq_util.append_dataframe_to_bq( concat, dataset, self.get_staging_table_name())