Example #1
    def write_to_bq(self, dataset, gcs_bucket, **attrs):
        gcs_files = self.get_attr(attrs, 'filename')

        # In this instance, we expect filename to be a string with
        # comma-separated CSV filenames.
        if ',' not in gcs_files:
            raise ValueError('filename passed to write_to_bq is not a '
                             'comma-separated list of files')
        files = gcs_files.split(',')
        print("Files that will be written to BQ:", files)

        # For each of the files, we load it as a dataframe and add it as a
        # table in the BigQuery dataset. We expect that all aggregation and
        # standardization of the data has been done by this point.
        int_cols = [std_col.COVID_CASES, std_col.COVID_HOSP_Y,
                    std_col.COVID_HOSP_N, std_col.COVID_HOSP_UNKNOWN,
                    std_col.COVID_DEATH_Y, std_col.COVID_DEATH_N,
                    std_col.COVID_DEATH_UNKNOWN]
        for f in files:
            # Explicitly specify county_fips is a string.
            df = gcs_to_bq_util.load_csv_as_dataframe(
                gcs_bucket, f, dtype={'county_fips': str})

            # All columns are str, except outcome columns.
            column_types = {c: 'STRING' for c in df.columns}
            for col in int_cols:
                if col in column_types:
                    column_types[col] = 'INT64'

            # Clean up column names.
            self.clean_frame_column_names(df)

            table_name = f.removesuffix('.csv')  # Table name is file name
            gcs_to_bq_util.append_dataframe_to_bq(
                df, dataset, table_name, column_types=column_types)
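A minimal sketch of how this variant might be invoked. The `MyCovidSource` class and the dataset, bucket, and file names below are placeholders for illustration, not taken from the source:

    # Hypothetical caller; class, dataset, bucket and file names are placeholders.
    source = MyCovidSource()
    source.write_to_bq(
        'covid_dataset', 'my-landing-bucket',
        filename='covid_by_state.csv,covid_by_county.csv')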
Example #2
    def write_to_bq(self, dataset, gcs_bucket):
        # Get ACS mappings of state FIPS codes to state names and
        # county FIPS codes to county names.
        self.get_state_fips_mapping()
        self.get_county_fips_mapping()

        # Pull data from GCS and aggregate in memory
        self.get_health_insurance_data_by_sex(use_gcs=True,
                                              gcs_bucket=gcs_bucket)
        self.get_health_insurance_data_by_race(use_gcs=True,
                                               gcs_bucket=gcs_bucket)

        # Split internal memory into data frames for sex/race by state/county
        self.split_data_frames()

        # Create BQ columns and write dataframes to BQ
        for table_name, df in self.frames.items():
            # All breakdown columns are strings
            column_types = {c: 'STRING' for c in df.columns}

            column_types[WITH_HEALTH_INSURANCE_COL] = 'INT64'
            column_types[WITHOUT_HEALTH_INSURANCE_COL] = 'INT64'
            column_types[TOTAL_HEALTH_INSURANCE_COL] = 'INT64'

            gcs_to_bq_util.append_dataframe_to_bq(df,
                                                  dataset,
                                                  table_name,
                                                  column_types=column_types)
Example #3
    def write_to_bq(self, dataset, gcs_bucket, filename):
        """Writes state names to BigQuery from the provided GCS bucket

        dataset: The BigQuery dataset to write to
        table_name: The name of the biquery table to write to
        gcs_bucket: The name of the gcs bucket to read the data from
        filename: The name of the file in the gcs bucket to read from"""
        try:
            frame = gcs_to_bq_util.load_values_as_dataframe(
                gcs_bucket, filename)
            frame = frame.rename(columns={
                'state': 'state_fips_code',
                'NAME': 'state_name'
            })
            column_types = {
                'state_fips_code': 'STRING',
                'state_name': 'STRING'
            }
            gcs_to_bq_util.append_dataframe_to_bq(
                frame,
                dataset,
                self.get_staging_table_name(),
                column_types=column_types)
        except json.JSONDecodeError as err:
            logging.error(
                'Unable to write to BigQuery due to improperly formatted data: %s',
                err)
Example #4
    def write_to_bq(self, dataset, gcs_bucket, **attrs):
        gcs_files = self.get_attr(attrs, 'filename')

        # In this instance, we expect filename to be a string with
        # comma-separated CSV filenames.
        if ',' not in gcs_files:
            raise ValueError('filename passed to write_to_bq is not a '
                             'comma-separated list of files')
        files = gcs_files.split(',')

        # For each of the files, we load it as a dataframe and add it as a
        # table in the BigQuery dataset. We expect that all aggregation and
        # standardization of the data has been done by this point.
        str_cols = [
            std_col.COUNTY_NAME_COL, std_col.STATE_NAME_COL,
            std_col.RACE_OR_HISPANIC_COL, std_col.AGE_COL, std_col.SEX_COL
        ]
        for f in files:
            df = gcs_to_bq_util.load_csv_as_dataframe(gcs_bucket, f)

            # All columns are int, except certain geo and breakdown columns.
            column_types = {c: 'INT64' for c in df.columns}
            for col in str_cols:
                if col in column_types:
                    column_types[col] = 'STRING'

            # Clean up column names.
            self.clean_frame_column_names(df)

            table_name = f.removesuffix('.csv')  # Table name is file name
            gcs_to_bq_util.append_dataframe_to_bq(df,
                                                  dataset,
                                                  table_name,
                                                  column_types=column_types)
Example #5
    def write_to_bq(self, dataset, gcs_bucket, filename):
        """Writes county adjacencies to BigQuery from the provided GCS bucket.

        dataset: The BigQuery dataset to write to
        gcs_bucket: The name of the GCS bucket to read the data from
        filename: The name of the file in the GCS bucket to read from"""
        frame = gcs_to_bq_util.load_csv_as_dataframe(gcs_bucket,
                                                     filename,
                                                     dtype={
                                                         'fipscounty':
                                                         'string',
                                                         'fipsneighbor':
                                                         'string'
                                                     })
        frame = frame[['fipscounty', 'fipsneighbor']]
        frame = frame.rename(columns={
            'fipscounty': 'county_geoid',
            'fipsneighbor': 'neighbor_geoids'
        })
        frame = frame.groupby('county_geoid', as_index=False).agg(list)

        column_types = {'county_geoid': 'STRING', 'neighbor_geoids': 'STRING'}
        col_modes = {'neighbor_geoids': 'REPEATED'}
        gcs_to_bq_util.append_dataframe_to_bq(frame,
                                              dataset,
                                              self.get_table_name(),
                                              column_types=column_types,
                                              col_modes=col_modes)
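The `groupby(...).agg(list)` step above is what collapses the one-row-per-neighbor CSV into one list-valued row per county, which is why `neighbor_geoids` can be written with the `REPEATED` column mode. A standalone pandas sketch of that reshaping, with made-up FIPS values:

    # Standalone illustration of the groupby/agg(list) step; values are made up.
    import pandas as pd

    frame = pd.DataFrame({
        'county_geoid':    ['01001', '01001', '01003'],
        'neighbor_geoids': ['01021', '01047', '01005'],
    })
    frame = frame.groupby('county_geoid', as_index=False).agg(list)
    # frame now holds one row per county:
    #   county_geoid  neighbor_geoids
    #   01001         ['01021', '01047']
    #   01003         ['01005']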
Example #6
    def testAppendDataframeToBq_IgnoreColModes(self):
        """Tests that col_modes is ignored when no column_types are provided
           to append_dataframe_to_bq."""
        test_frame = DataFrame(
            data=self._test_data[1:], columns=self._test_data[0], index=[1, 2])

        with patch('ingestion.gcs_to_bq_util.bigquery.Client') as mock_client:
            # Set up mock calls
            mock_instance = mock_client.return_value
            mock_table = Mock()
            mock_instance.dataset.return_value = mock_table
            mock_table.table.return_value = 'test-project.test-dataset.table'

            gcs_to_bq_util.append_dataframe_to_bq(
                test_frame.copy(deep=True), "test-dataset", "table",
                col_modes={'label1': 'REPEATED', 'label2': 'REQUIRED'})

            mock_instance.load_table_from_json.assert_called()
            call_args = mock_instance.load_table_from_json.call_args
            test_frame['ingestion_ts'] = datetime(
                2020, 1, 1, tzinfo=timezone.utc).strftime(
                    "%Y-%m-%d %H:%M:%S.%f %Z")
            self.assertEqual(call_args.args[0],
                             json.loads(test_frame.to_json(orient='records')))
            job_config = call_args.kwargs['job_config']
            self.assertTrue(job_config.autodetect)
Example #7
    def write_to_bq(self, dataset, gcs_bucket, **attrs):
        gcs_file = self.get_attr(attrs, 'filename')

        # Download the raw data
        df = gcs_to_bq_util.load_csv_as_dataframe(gcs_bucket, gcs_file)
        self.clean_frame_column_names(df)

        # Standardize the data
        # The metadata currently only has information for cases and deaths,
        # not tests or hospitalizations.
        keep_cols = [
            'state_postal_abbreviation', 'api_death',
            'defines_other_death', 'race_ethnicity_separately_death',
            'race_ethnicity_combined_death', 'race_mutually_exclusive_death',
            'combined_category_other_than_api_death', 'race_death',
            'ethnicity_death', 'api_cases', 'defines_other_cases',
            'race_ethnicity_separately_cases', 'race_ethnicity_combined_cases',
            'race_mutually_exclusive_cases', 'combined_category_other_than_api_cases',
            'race_cases', 'ethnicity_cases']
        df = df[keep_cols]
        df = df.melt(id_vars=['state_postal_abbreviation'])
        df[['col_name', 'variable_type']] = df.variable.str.rsplit(
            '_', n=1, expand=True)
        df.drop('variable', axis=1, inplace=True)
        df = df.pivot(
            index=['state_postal_abbreviation', 'variable_type'],
            columns='col_name', values='value').reset_index()
        df.replace({'variable_type': {'death': 'deaths'}}, inplace=True)
        df.rename_axis(None, inplace=True)
        df.rename(columns=self._metadata_columns_map(), inplace=True)

        # Write to BQ
        gcs_to_bq_util.append_dataframe_to_bq(df, dataset, self.get_table_name())
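The melt / rsplit / pivot sequence above is easier to follow on a toy frame. A standalone sketch with made-up metadata values and only two of the kept columns:

    # Standalone illustration of the metadata reshaping; values are made up.
    import pandas as pd

    meta = pd.DataFrame({
        'state_postal_abbreviation': ['AL'],
        'api_death': [1], 'race_death': [0],
        'api_cases': [1], 'race_cases': [1],
    })
    long = meta.melt(id_vars=['state_postal_abbreviation'])
    long[['col_name', 'variable_type']] = long.variable.str.rsplit(
        '_', n=1, expand=True)
    long = long.drop('variable', axis=1)
    wide = long.pivot(
        index=['state_postal_abbreviation', 'variable_type'],
        columns='col_name', values='value').reset_index()
    # wide has one row per (state, cases/death) pair, with 'api' and 'race' columns.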
Example #8
    def write_to_bq(self, dataset, gcs_bucket, **attrs):
        filename = self.get_attr(attrs, 'filename')

        df = gcs_to_bq_util.load_csv_as_dataframe(gcs_bucket,
                                                  filename,
                                                  parse_dates=['Date'],
                                                  thousands=',')
        self.clean_frame_column_names(df)

        # Massage the data into the standard format.
        df.drop(columns=[
            'cases_latinx', 'deaths_latinx', 'hosp_latinx', 'tests_latinx'
        ],
                inplace=True)
        df = df.melt(id_vars=['date', 'state'])
        df[['variable_type',
            col_std.RACE_COL]] = df.variable.str.split('_', n=1, expand=True)
        df.drop('variable', axis=1, inplace=True)
        df.rename(columns={'state': 'state_postal_abbreviation'}, inplace=True)
        df.replace({col_std.RACE_COL: self.get_standard_columns()},
                   inplace=True)
        df['date'] = df['date'].map(lambda ts: ts.strftime("%Y-%m-%d"))

        # Get the metadata table
        metadata = self._download_metadata(dataset)
        if len(metadata.index) == 0:
            raise RuntimeError(
                'BigQuery call to {} returned 0 rows'.format(dataset))

        # Merge the tables
        merged = pd.merge(df,
                          metadata,
                          how='left',
                          on=['state_postal_abbreviation', 'variable_type'])
        # Rename combined race categories
        self._rename_race_category(merged, 'reports_api', Race.ASIAN, Race.API)
        self._rename_race_category(merged, 'reports_ind', Race.AIAN,
                                   Race.INDIGENOUS)

        merged.drop(columns=['reports_api', 'reports_ind'], inplace=True)

        # Split into separate tables by variable type
        for variable_type in ['cases', 'deaths', 'tests', 'hosp']:
            result = merged.copy()
            result = result.loc[result['variable_type'] == variable_type]
            result.rename(columns={'value': variable_type}, inplace=True)
            result.drop('variable_type', axis='columns', inplace=True)
            # Write to BQ
            gcs_to_bq_util.append_dataframe_to_bq(
                result, dataset,
                self.get_table_name() + '_' + variable_type)
Example #9
    def write_to_bq_table(self, dataset: str, gcs_bucket: str, filename: str, table_name: str, project=None):
        """Writes source data from GCS bucket to BigQuery

        dataset: The BigQuery dataset to write to
        gcs_bucket: The name of the gcs bucket to read the data from
        filename: The name of the file in the gcs bucket to read from
        table_name: The name of the BigQuery table to write to"""
        chunked_frame = gcs_to_bq_util.load_csv_as_dataframe(
            gcs_bucket, filename, chunksize=1000)

        for chunk in chunked_frame:
            self.clean_frame_column_names(chunk)
            gcs_to_bq_util.append_dataframe_to_bq(
                chunk, dataset, table_name, project=project)
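The `chunksize` path here presumably mirrors `pandas.read_csv(..., chunksize=...)`, which yields DataFrames lazily instead of loading the whole file at once, so each chunk can be appended to BigQuery before the next is read. A standalone pandas sketch; the file name and `process` helper are placeholders:

    # Standalone illustration of chunked CSV reading; names are placeholders.
    import pandas as pd

    for chunk in pd.read_csv('large_export.csv', chunksize=1000):
        # Each chunk is a DataFrame of up to 1000 rows; handle it and move on
        # so memory use stays bounded.
        process(chunk)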
Example #10
    def write_to_bq(self, dataset, gcs_bucket):
        """Writes population data to BigQuery from the provided GCS bucket

        dataset: The BigQuery dataset to write to
        gcs_bucket: The name of the gcs bucket to read the data from"""
        # TODO change this to have it read metadata from GCS bucket
        metadata = fetch_acs_metadata(self.base_acs_url)
        var_map = parse_acs_metadata(metadata, list(GROUPS.keys()))

        race_and_hispanic_frame = gcs_to_bq_util.load_values_as_dataframe(
            gcs_bucket, self.get_filename(HISPANIC_BY_RACE_CONCEPT))
        race_and_hispanic_frame = update_col_types(race_and_hispanic_frame)

        race_and_hispanic_frame = standardize_frame(
            race_and_hispanic_frame,
            get_vars_for_group(HISPANIC_BY_RACE_CONCEPT, var_map, 2),
            [HISPANIC_COL, RACE_COL], self.county_level, POPULATION_COL)

        total_frame = gcs_to_bq_util.load_values_as_dataframe(
            gcs_bucket, self.add_filename_suffix(TOTAL_POP_VARIABLE_ID))
        total_frame = update_col_types(total_frame)
        total_frame = standardize_frame(total_frame,
                                        {TOTAL_POP_VARIABLE_ID: ['Total']},
                                        [RACE_OR_HISPANIC_COL],
                                        self.county_level, POPULATION_COL)

        sex_by_age_frames = {}
        for concept in SEX_BY_AGE_CONCEPTS_TO_RACE:
            sex_by_age_frame = gcs_to_bq_util.load_values_as_dataframe(
                gcs_bucket, self.get_filename(concept))
            sex_by_age_frame = update_col_types(sex_by_age_frame)
            sex_by_age_frames[concept] = sex_by_age_frame

        frames = {
            self.get_staging_table_name_by_race():
            self.get_all_races_frame(race_and_hispanic_frame, total_frame),
            self.get_staging_table_name_by_sex_age_race():
            self.get_sex_by_age_and_race(var_map, sex_by_age_frames)
        }

        for table_name, df in frames.items():
            # All breakdown columns are strings
            column_types = {c: 'STRING' for c in df.columns}
            column_types[POPULATION_COL] = 'INT64'
            gcs_to_bq_util.append_dataframe_to_bq(df,
                                                  dataset,
                                                  table_name,
                                                  column_types=column_types)
Example #11
    def write_to_bq(self, dataset, bucket):
        self.getData(bucket)

        # Split internal memory into data frames for sex/race by state/county
        self.split_data_frames()

        # Create BQ columns and write dataframes to BQ
        for table_name, df in self.frames.items():
            # All breakdown columns are strings
            column_types = {c: "STRING" for c in df.columns}

            column_types[POPULATION_COL] = "INT64"

            gcs_to_bq_util.append_dataframe_to_bq(df,
                                                  dataset,
                                                  table_name,
                                                  column_types=column_types)
Example #12
    def write_to_bq(self, dataset, gcs_bucket, **attrs):
        filename = self.get_attr(attrs, 'filename')
        metadata_table_id = self.get_attr(attrs, 'metadata_table_id')
        table_name = self.get_attr(attrs, 'table_name')

        df = gcs_to_bq_util.load_csv_as_dataframe(gcs_bucket,
                                                  filename,
                                                  parse_dates=['Date'],
                                                  thousands=',')
        self.clean_frame_column_names(df)

        # Massage the data into the standard format.
        df.drop(columns=[
            'cases_latinx', 'deaths_latinx', 'hosp_latinx', 'tests_latinx'
        ],
                inplace=True)
        df = df.melt(id_vars=['date', 'state'])
        df[['variable_type',
            col_std.RACE_COL]] = df.variable.str.split('_', n=1, expand=True)
        df.drop('variable', axis=1, inplace=True)
        df.rename(columns={'state': 'state_postal_abbreviation'}, inplace=True)
        df.replace({col_std.RACE_COL: self.get_standard_columns()},
                   inplace=True)

        # Get the metadata table
        metadata = self._download_metadata(metadata_table_id)
        if len(metadata.index) == 0:
            raise RuntimeError('BigQuery call to {} returned 0 rows'.format(
                metadata_table_id))

        # Merge the tables
        merged = pd.merge(df,
                          metadata,
                          how='left',
                          on=['state_postal_abbreviation', 'variable_type'])
        # Rename combined race categories
        self._rename_race_category(merged, 'reports_api', Race.ASIAN, Race.API)
        self._rename_race_category(merged, 'reports_ind', Race.AIAN,
                                   Race.INDIGENOUS)

        merged.drop(columns=['reports_api', 'reports_ind'], inplace=True)

        # Write to BQ
        gcs_to_bq_util.append_dataframe_to_bq(merged, dataset, table_name)
Example #13
    def write_to_bq(self, dataset, gcs_bucket, filename):
        """Writes primary care access stats to BigQuery from the provided GCS bucket.

        dataset: The BigQuery dataset to write to
        gcs_bucket: The name of the GCS bucket to read the data from
        filename: The prefix of files in the landing bucket to read from"""
        client = storage.Client()
        bucket = client.get_bucket(gcs_bucket)

        data = []
        for state_name in constants.STATE_NAMES:
            # Use a separate variable so the incoming filename prefix is not
            # overwritten and reused on later iterations of the loop.
            state_filename = self._FILEPATH.format(filename, state_name)
            blob = bucket.blob(state_filename)
            local_path = '/tmp/{}'.format(state_filename)
            blob.download_to_filename(local_path)

            frame = read_excel(io=local_path,
                               sheet_name='Ranked Measure Data',
                               skiprows=[0, 2])
            for _, row in frame.iterrows():
                # These fields may not be set for every county.
                # If they're not set, we'll use -1 as the numerical value
                # Number of physicians in the county
                num_physicians = row[108] if not math.isnan(row[108]) else -1
                # Primary care physicians per 100,000 population
                physicians_rate = row[109] if not math.isnan(row[109]) else -1
                row = [row[0], row[1], row[2], num_physicians, physicians_rate]
                data.append(row)
        new_dataframe = DataFrame(data=data,
                                  columns=('county_fips_code', 'state_name',
                                           'county_name',
                                           'num_primary_care_physicians',
                                           'primary_care_physicians_rate'))
        column_types = {
            'county_fips_code': 'INT64',
            'state_name': 'STRING',
            'county_name': 'STRING',
            'num_primary_care_physicians': 'FLOAT64',
            'primary_care_physicians_rate': 'FLOAT64',
        }
        gcs_to_bq_util.append_dataframe_to_bq(new_dataframe,
                                              dataset,
                                              self.get_staging_table_name(),
                                              column_types=column_types)
Example #14
    def testAppendDataframeToBq_SpecifySchema(self):
        """Tests that the BigQuery schema is properly defined when column_types
           are provided to append_dataframe_to_bq."""
        test_frame = DataFrame(data=self._test_data[1:],
                               columns=self._test_data[0],
                               index=[1, 2])

        with patch('ingestion.gcs_to_bq_util.bigquery.Client') as mock_client:
            # Set up mock calls
            mock_instance = mock_client.return_value
            mock_table = Mock()
            mock_instance.dataset.return_value = mock_table
            mock_table.table.return_value = 'test-project.test-dataset.table'

            column_types = {label: 'STRING' for label in test_frame.columns}
            col_modes = {'label1': 'REPEATED', 'label2': 'REQUIRED'}
            gcs_to_bq_util.append_dataframe_to_bq(test_frame.copy(deep=True),
                                                  'test-dataset',
                                                  'table',
                                                  column_types=column_types,
                                                  col_modes=col_modes)

            mock_instance.load_table_from_json.assert_called()
            call_args = mock_instance.load_table_from_json.call_args
            test_frame['ingestion_ts'] = datetime(
                2020, 1, 1,
                tzinfo=timezone.utc).strftime('%Y-%m-%d %H:%M:%S.%f %Z')
            self.assertEqual(call_args.args[0],
                             json.loads(test_frame.to_json(orient='records')))
            job_config = call_args.kwargs['job_config']
            self.assertFalse(job_config.autodetect)

            expected_cols = ['label1', 'label2', 'label3', 'ingestion_ts']
            expected_types = ['STRING', 'STRING', 'STRING', 'TIMESTAMP']
            expected_modes = ['REPEATED', 'REQUIRED', 'NULLABLE', 'NULLABLE']
            self.assertListEqual([field.name for field in job_config.schema],
                                 expected_cols)
            self.assertListEqual(
                [field.field_type for field in job_config.schema],
                expected_types)
            self.assertListEqual([field.mode for field in job_config.schema],
                                 expected_modes)
Example #15
    def write_to_bq(self, dataset, gcs_bucket, filename):
        """Fetches all SAIPE blobs from a GCS bucket and uploads to a single BQ table.
        Also does some preprocessing.

        dataset: The BigQuery dataset to write to
        table_name: The name of the BigQuery table to write to
        gcs_bucket: The name of the GCS bucket to pull from
        filename: File name prefix used to identify which GCS blobs to fetch"""
        client = storage.Client()
        saipe_blobs = client.list_blobs(gcs_bucket, prefix=filename)

        frames = []
        for blob in saipe_blobs:
            frame = gcs_to_bq_util.load_values_blob_as_dataframe(blob)
            frames.append(frame)

        concat = pandas.concat(frames, ignore_index=True)
        # The SAIPE API includes the query predicate columns, which are duplicates of their
        # ALL_CAPS counterparts. Toss 'em.
        concat.drop(columns=['state', 'county', 'time'], inplace=True)
        gcs_to_bq_util.append_dataframe_to_bq(
            concat, dataset, self.get_staging_table_name())