    def write_to_bq(self, dataset, gcs_bucket, **attrs):
        gcs_files = self.get_attr(attrs, 'filename')

        # In this instance, we expect filename to be a string with
        # comma-separated CSV filenames.
        if ',' not in gcs_files:
            raise ValueError('filename passed to write_to_bq is not a '
                             'comma-separated list of files')
        files = gcs_files.split(',')
        print("Files that will be written to BQ:", files)

        # For each of the files, we load it as a dataframe and add it as a
        # table in the BigQuery dataset. We expect that all aggregation and
        # standardization of the data has been done by this point.
        int_cols = [std_col.COVID_CASES, std_col.COVID_HOSP_Y,
                    std_col.COVID_HOSP_N, std_col.COVID_HOSP_UNKNOWN,
                    std_col.COVID_DEATH_Y, std_col.COVID_DEATH_N,
                    std_col.COVID_DEATH_UNKNOWN]
        for f in files:
            # Explicitly specify county_fips is a string.
            df = gcs_to_bq_util.load_csv_as_dataframe(
                gcs_bucket, f, dtype={'county_fips': str})

            # All columns are str, except outcome columns.
            column_types = {c: 'STRING' for c in df.columns}
            for col in int_cols:
                if col in column_types:
                    column_types[col] = 'INT64'

            # Clean up column names.
            self.clean_frame_column_names(df)

            table_name = f.replace('.csv', '')  # Table name is the file name without the .csv extension
            gcs_to_bq_util.add_dataframe_to_bq(
                df, dataset, table_name, column_types=column_types)
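
        # Usage sketch (hypothetical names, an assumption about how the pipeline
        # invokes this): the 'filename' attribute is a single string of
        # comma-separated CSV names, e.g.
        #
        #   ingester.write_to_bq('covid_dataset', 'landing-bucket',
        #                        filename='cases_by_county.csv,deaths_by_county.csv')
        #
        # which would yield the tables `cases_by_county` and `deaths_by_county`,
        # with the outcome columns typed as INT64 and everything else as STRING.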
    def testAddDataframeToBq_IgnoreColModes(self):
        """Tests that col_modes is ignored when no column_types are provided
           to add_dataframe_to_bq."""
        test_frame = DataFrame(data=self._test_data[1:],
                               columns=self._test_data[0],
                               index=[1, 2])

        with patch('ingestion.gcs_to_bq_util.bigquery.Client') as mock_client:
            # Set up mock calls
            mock_instance = mock_client.return_value
            mock_table = Mock()
            mock_instance.dataset.return_value = mock_table
            mock_table.table.return_value = 'test-project.test-dataset.table'

            gcs_to_bq_util.add_dataframe_to_bq(test_frame.copy(deep=True),
                                               "test-dataset",
                                               "table",
                                               col_modes={
                                                   'label1': 'REPEATED',
                                                   'label2': 'REQUIRED'
                                               })

            mock_instance.load_table_from_json.assert_called()
            call_args = mock_instance.load_table_from_json.call_args
            test_frame['ingestion_ts'] = datetime(
                2020, 1, 1,
                tzinfo=timezone.utc).strftime("%Y-%m-%d %H:%M:%S.%f %Z")
            self.assertEqual(call_args.args[0],
                             json.loads(test_frame.to_json(orient='records')))
            job_config = call_args.kwargs['job_config']
            self.assertTrue(job_config.autodetect)
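
        # Note: the expected 'ingestion_ts' above is built from a fixed
        # datetime(2020, 1, 1, tzinfo=timezone.utc), so the assertion only holds
        # if the timestamp that add_dataframe_to_bq stamps onto the frame is
        # frozen to that instant. A common way to do that (an assumption, not
        # necessarily what this suite uses) is freezegun:
        #
        #   from freezegun import freeze_time
        #
        #   @freeze_time('2020-01-01')
        #   def testAddDataframeToBq_IgnoreColModes(self):
        #       ...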
    def write_to_bq(self, dataset, gcs_bucket, filename):
        """Writes county adjacencies to BigQuery from the provided GCS bucket

        dataset: The BigQuery dataset to write to
        gcs_bucket: The name of the gcs bucket to read the data from
        filename: The name of the file in the gcs bucket to read from"""
        frame = gcs_to_bq_util.load_csv_as_dataframe(gcs_bucket, filename, dtype={
            'fipscounty': 'string',
            'fipsneighbor': 'string'
        })
        frame = frame[['fipscounty', 'fipsneighbor']]
        frame = frame.rename(columns={
            'fipscounty': 'county_geoid',
            'fipsneighbor': 'neighbor_geoids'
        })
        frame = frame.groupby('county_geoid', as_index=False).agg(list)

        column_types = {
            'county_geoid': 'STRING',
            'neighbor_geoids': 'STRING'
        }
        col_modes = {'neighbor_geoids': 'REPEATED'}
        gcs_to_bq_util.add_dataframe_to_bq(
            frame, dataset, self.get_table_name(), column_types=column_types,
            col_modes=col_modes)
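
        # Worked example of the groupby above (illustrative FIPS codes): the
        # adjacency file lists one neighbor per row, and agg(list) collapses it
        # to one row per county whose 'neighbor_geoids' value is a Python list,
        # which is what allows the column to be declared REPEATED in BigQuery.
        #
        #   county_geoid  neighbor_geoids          county_geoid  neighbor_geoids
        #   01001         01021           ---->    01001         [01021, 01047]
        #   01001         01047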
    def write_to_bq(self, dataset, gcs_bucket):
        # Get ACS mappings of state FIPS codes to state names and of county
        # FIPS codes to county names.
        self.get_state_fips_mapping()
        self.get_county_fips_mapping()

        # Pull data from GCS and aggregate in memory
        self.get_health_insurance_data_by_sex(use_gcs=True,
                                              gcs_bucket=gcs_bucket)
        self.get_health_insurance_data_by_race(use_gcs=True,
                                               gcs_bucket=gcs_bucket)

        # Split internal memory into data frames for sex/race by state/county
        self.split_data_frames()

        # Create BQ columns and write dataframes to BQ
        for table_name, df in self.frames.items():
            # All breakdown columns are strings
            column_types = {c: 'STRING' for c in df.columns}

            column_types[WITH_HEALTH_INSURANCE_COL] = 'INT64'
            column_types[WITHOUT_HEALTH_INSURANCE_COL] = 'INT64'
            column_types[TOTAL_HEALTH_INSURANCE_COL] = 'INT64'

            gcs_to_bq_util.add_dataframe_to_bq(df,
                                               dataset,
                                               table_name,
                                               column_types=column_types)
    def write_to_bq(self, dataset, gcs_bucket, filename):
        """Writes state names to BigQuery from the provided GCS bucket

        dataset: The BigQuery dataset to write to
        gcs_bucket: The name of the gcs bucket to read the data from
        filename: The name of the file in the gcs bucket to read from"""
        try:
            frame = gcs_to_bq_util.load_values_as_dataframe(
                gcs_bucket, filename)
            frame = frame.rename(columns={
                'state': 'state_fips_code',
                'NAME': 'state_name'
            })
            column_types = {
                'state_fips_code': 'STRING',
                'state_name': 'STRING'
            }
            gcs_to_bq_util.add_dataframe_to_bq(frame,
                                               dataset,
                                               self.get_table_name(),
                                               column_types=column_types)
        except json.JSONDecodeError as err:
            logging.error(
                'Unable to write to BigQuery due to improperly formatted data: %s',
                err)
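
        # For context (an assumption based on the 'NAME' and 'state' columns
        # renamed above): load_values_as_dataframe appears to parse a
        # Census-API-style JSON payload whose first row is the header and whose
        # remaining rows are data, e.g.
        #
        #   [["NAME", "state"],
        #    ["Alabama", "01"],
        #    ["Alaska", "02"]]
        #
        # so a malformed or truncated blob surfaces here as json.JSONDecodeError.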
    def write_to_bq_table(self,
                          dataset: str,
                          gcs_bucket: str,
                          filename: str,
                          table_name: str,
                          project=None):
        """Writes source data from GCS bucket to BigQuery

        dataset: The BigQuery dataset to write to
        gcs_bucket: The name of the gcs bucket to read the data from
        filename: The name of the file in the gcs bucket to read from
        table_name: The name of the BigQuery table to write to"""
        chunked_frame = gcs_to_bq_util.load_csv_as_dataframe(gcs_bucket,
                                                             filename,
                                                             chunksize=1000)

        # For the very first chunk, we set the mode to overwrite to clear the
        # previous table. For subsequent chunks we append.
        overwrite = True
        for chunk in chunked_frame:
            self.clean_frame_column_names(chunk)
            gcs_to_bq_util.add_dataframe_to_bq(chunk,
                                               dataset,
                                               table_name,
                                               project=project,
                                               overwrite=overwrite)
            overwrite = False
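
        # A minimal sketch (an assumption, not the util's actual code) of how the
        # overwrite flag presumably maps onto BigQuery write dispositions, which
        # is why flipping it after the first chunk gives truncate-then-append
        # semantics:
        #
        #   from google.cloud import bigquery
        #
        #   write_disposition = (bigquery.WriteDisposition.WRITE_TRUNCATE
        #                        if overwrite
        #                        else bigquery.WriteDisposition.WRITE_APPEND)
        #   job_config = bigquery.LoadJobConfig(
        #       write_disposition=write_disposition)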
    def write_to_bq(self, dataset, gcs_bucket, **attrs):
        gcs_file = self.get_attr(attrs, "filename")

        # Download the raw data
        df = gcs_to_bq_util.load_csv_as_dataframe(gcs_bucket, gcs_file)
        df = self.standardize(df)
        # Write to BQ
        gcs_to_bq_util.add_dataframe_to_bq(df, dataset, self.get_table_name())
    def write_to_bq(self, dataset, gcs_bucket, **attrs):
        filename = self.get_attr(attrs, 'filename')

        df = gcs_to_bq_util.load_csv_as_dataframe(gcs_bucket,
                                                  filename,
                                                  parse_dates=['Date'],
                                                  thousands=',')
        self.clean_frame_column_names(df)

        # Massage the data into the standard format.
        df.drop(columns=[
            'cases_latinx', 'deaths_latinx', 'hosp_latinx', 'tests_latinx'
        ],
                inplace=True)
        df = df.melt(id_vars=['date', 'state'])
        df[['variable_type',
            col_std.RACE_COL]] = df.variable.str.split("_", n=1, expand=True)
        df.drop('variable', axis=1, inplace=True)
        df.rename(columns={'state': 'state_postal_abbreviation'}, inplace=True)
        df.replace({col_std.RACE_COL: self.get_standard_columns()},
                   inplace=True)
        df['date'] = df['date'].map(lambda ts: ts.strftime("%Y-%m-%d"))

        # Get the metadata table
        metadata = self._download_metadata(dataset)
        if len(metadata.index) == 0:
            raise RuntimeError(
                'BigQuery call to {} returned 0 rows'.format(dataset))

        # Merge the tables
        merged = pd.merge(df,
                          metadata,
                          how='left',
                          on=['state_postal_abbreviation', 'variable_type'])
        # Rename combined race categories
        self._rename_race_category(merged, 'reports_api', Race.ASIAN, Race.API)
        self._rename_race_category(merged, 'reports_ind', Race.AIAN,
                                   Race.INDIGENOUS)

        merged.drop(columns=['reports_api', 'reports_ind'], inplace=True)

        # Split into separate tables by variable type
        for variable_type in ['cases', 'deaths', 'tests', 'hosp']:
            result = merged.copy()
            result = result.loc[result['variable_type'] == variable_type]
            result.rename(columns={'value': variable_type}, inplace=True)
            result.drop('variable_type', axis='columns', inplace=True)
            # Write to BQ
            gcs_to_bq_util.add_dataframe_to_bq(
                result, dataset,
                self.get_table_name() + '_' + variable_type)
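
        # Data-shape sketch of the melt/split above (illustrative values): each
        # wide column such as 'cases_white' becomes a long row whose 'variable'
        # is split on the first underscore into the variable type and the race:
        #
        #   date        state  cases_white
        #   2020-05-01  CA     1234
        #
        #   becomes
        #
        #   date        state  variable_type  race   value
        #   2020-05-01  CA     cases          white  1234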
    def write_to_bq(self, dataset, gcs_bucket):
        """Writes population data to BigQuery from the provided GCS bucket

        dataset: The BigQuery dataset to write to
        gcs_bucket: The name of the gcs bucket to read the data from"""
        # TODO change this to have it read metadata from GCS bucket
        metadata = fetch_acs_metadata(self.base_acs_url)
        var_map = parse_acs_metadata(metadata, list(GROUPS.keys()))

        race_and_hispanic_frame = gcs_to_bq_util.load_values_as_dataframe(
            gcs_bucket, self.get_filename(HISPANIC_BY_RACE_CONCEPT))
        race_and_hispanic_frame = update_col_types(race_and_hispanic_frame)

        race_and_hispanic_frame = standardize_frame(
            race_and_hispanic_frame,
            get_vars_for_group(HISPANIC_BY_RACE_CONCEPT, var_map, 2),
            [HISPANIC_COL, RACE_COL],
            self.county_level,
            POPULATION_COL)

        total_frame = gcs_to_bq_util.load_values_as_dataframe(
            gcs_bucket, self.add_filename_suffix(TOTAL_POP_VARIABLE_ID))
        total_frame = update_col_types(total_frame)
        total_frame = standardize_frame(
            total_frame,
            {TOTAL_POP_VARIABLE_ID: ['Total']},
            [RACE_OR_HISPANIC_COL],
            self.county_level,
            POPULATION_COL)

        sex_by_age_frames = {}
        for concept in SEX_BY_AGE_CONCEPTS_TO_RACE:
            sex_by_age_frame = gcs_to_bq_util.load_values_as_dataframe(
                gcs_bucket, self.get_filename(concept))
            sex_by_age_frame = update_col_types(sex_by_age_frame)
            sex_by_age_frames[concept] = sex_by_age_frame

        frames = {
            self.get_table_name_by_race(): self.get_all_races_frame(
                race_and_hispanic_frame, total_frame),
            self.get_table_name_by_sex_age_race(): self.get_sex_by_age_and_race(
                var_map, sex_by_age_frames)
        }

        for table_name, df in frames.items():
            # All breakdown columns are strings
            column_types = {c: 'STRING' for c in df.columns}
            column_types[POPULATION_COL] = 'INT64'
            gcs_to_bq_util.add_dataframe_to_bq(
                df, dataset, table_name, column_types=column_types)
    def write_to_bq(self, dataset, bucket):
        self.getData(bucket)

        # Split internal memory into data frames for sex/race by state/county
        self.split_data_frames()

        # Create BQ columns and write dataframes to BQ
        for table_name, df in self.frames.items():
            # All breakdown columns are strings
            column_types = {c: "STRING" for c in df.columns}

            column_types[ABOVE_POVERTY_COL] = "INT64"
            column_types[BELOW_POVERTY_COL] = "INT64"

            gcs_to_bq_util.add_dataframe_to_bq(
                df, dataset, table_name, column_types=column_types
            )
    def write_to_bq(self, dataset, bucket):
        self.getData(bucket)

        # Split internal memory into data frames for sex/race by state/county
        self.split_data_frames()

        # Create BQ columns and write dataframes to BQ
        for table_name, df in self.frames.items():
            # All breakdown columns are strings
            column_types = {c: "STRING" for c in df.columns}
            if RACE_INCLUDES_HISPANIC_COL in df.columns:
                column_types[RACE_INCLUDES_HISPANIC_COL] = 'BOOL'

            column_types[POPULATION_COL] = "INT64"

            gcs_to_bq_util.add_dataframe_to_bq(
                df, dataset, table_name, column_types=column_types
            )
    def write_to_bq(self, dataset, gcs_bucket):
        self.getData(gcs_bucket=gcs_bucket)

        # Split internal memory into data frames for sex/race by state/county
        self.split_data_frames()

        # Create BQ columns and write dataframes to BQ
        for table_name, df in self.frames.items():
            # All breakdown columns are strings
            column_types = {c: "STRING" for c in df.columns}

            column_types[WITH_HEALTH_INSURANCE_COL] = "INT64"
            column_types[WITHOUT_HEALTH_INSURANCE_COL] = "INT64"
            column_types[TOTAL_HEALTH_INSURANCE_COL] = "INT64"

            gcs_to_bq_util.add_dataframe_to_bq(
                df, dataset, table_name, column_types=column_types
            )
    def write_to_bq(self, dataset, gcs_bucket, filename):
        """Writes primary care access stats to BigQuery from bucket
            dataset: The BigQuery dataset to write to
            table_name: The name of the biquery table to write to
            gcs_bucket: The name of the gcs bucket to read the data from
            filename: The prefix of files in the landing bucket to read from"""
        client = storage.Client()
        bucket = client.get_bucket(gcs_bucket)

        data = []
        for state_name in constants.STATE_NAMES:
            # Build the per-state blob name from the prefix each iteration
            # without clobbering the prefix itself.
            state_filename = self._FILEPATH.format(filename, state_name)
            blob = bucket.blob(state_filename)
            local_path = '/tmp/{}'.format(state_filename)
            blob.download_to_filename(local_path)

            frame = read_excel(io=local_path,
                               sheet_name='Ranked Measure Data',
                               skiprows=[0, 2])
            for _, row in frame.iterrows():
                # These fields may not be set for every county.
                # If they're not set, we'll use -1 as the numerical value
                # Number of physicians in the county
                num_physicians = row[108] if not math.isnan(row[108]) else -1
                # Primary Care Physicians per 100,000 population
                physicians_rate = row[109] if not math.isnan(row[109]) else -1
                row = [row[0], row[1], row[2], num_physicians, physicians_rate]
                data.append(row)
        new_dataframe = DataFrame(data=data,
                                  columns=('county_fips_code', 'state_name',
                                           'county_name',
                                           'num_primary_care_physicians',
                                           'primary_care_physicians_rate'))
        column_types = {
            'county_fips_code': 'INT64',
            'state_name': 'STRING',
            'county_name': 'STRING',
            'num_primary_care_physicians': 'FLOAT64',
            'primary_care_physicians_rate': 'FLOAT64',
        }
        gcs_to_bq_util.add_dataframe_to_bq(new_dataframe,
                                           dataset,
                                           self.get_table_name(),
                                           column_types=column_types)
    def write_to_bq(self, dataset, gcs_bucket, **attrs):
        df = gcs_to_bq_util.load_csv_as_dataframe_from_web(BASE_UHC_URL)

        for breakdown in [
                std_col.RACE_OR_HISPANIC_COL, std_col.AGE_COL, std_col.SEX_COL
        ]:
            breakdown_df = self.generate_breakdown(breakdown, df)

            column_types = {c: 'STRING' for c in breakdown_df.columns}
            for col in [std_col.COPD_PCT, std_col.DIABETES_PCT]:
                column_types[col] = 'FLOAT'

            if std_col.RACE_INCLUDES_HISPANIC_COL in breakdown_df.columns:
                column_types[std_col.RACE_INCLUDES_HISPANIC_COL] = 'BOOL'

            gcs_to_bq_util.add_dataframe_to_bq(breakdown_df,
                                               dataset,
                                               breakdown,
                                               column_types=column_types)
    def testAddDataframeToBq_SpecifySchema(self):
        """Tests that the BigQuery schema is properly defined when column_types
           are provided to add_dataframe_to_bq."""
        test_frame = DataFrame(data=self._test_data[1:],
                               columns=self._test_data[0],
                               index=[1, 2])

        with patch('ingestion.gcs_to_bq_util.bigquery.Client') as mock_client:
            # Set up mock calls
            mock_instance = mock_client.return_value
            mock_table = Mock()
            mock_instance.dataset.return_value = mock_table
            mock_table.table.return_value = 'test-project.test-dataset.table'

            column_types = {label: 'STRING' for label in test_frame.columns}
            col_modes = {'label1': 'REPEATED', 'label2': 'REQUIRED'}
            gcs_to_bq_util.add_dataframe_to_bq(test_frame.copy(deep=True),
                                               'test-dataset',
                                               'table',
                                               column_types=column_types,
                                               col_modes=col_modes)

            mock_instance.load_table_from_json.assert_called()
            call_args = mock_instance.load_table_from_json.call_args
            test_frame['ingestion_ts'] = datetime(
                2020, 1, 1,
                tzinfo=timezone.utc).strftime('%Y-%m-%d %H:%M:%S.%f %Z')
            self.assertEqual(call_args.args[0],
                             json.loads(test_frame.to_json(orient='records')))
            job_config = call_args.kwargs['job_config']
            self.assertFalse(job_config.autodetect)

            expected_cols = ['label1', 'label2', 'label3', 'ingestion_ts']
            expected_types = ['STRING', 'STRING', 'STRING', 'TIMESTAMP']
            expected_modes = ['REPEATED', 'REQUIRED', 'NULLABLE', 'NULLABLE']
            self.assertListEqual([field.name for field in job_config.schema],
                                 expected_cols)
            self.assertListEqual(
                [field.field_type for field in job_config.schema],
                expected_types)
            self.assertListEqual([field.mode for field in job_config.schema],
                                 expected_modes)
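
        # A minimal sketch (an assumption, not the verbatim util) of the schema
        # construction this test pins down: each column_types entry presumably
        # becomes a bigquery.SchemaField whose mode defaults to NULLABLE unless
        # overridden by col_modes, plus an auto-added TIMESTAMP ingestion_ts:
        #
        #   from google.cloud import bigquery
        #
        #   schema = [bigquery.SchemaField(col, col_type,
        #                                  mode=col_modes.get(col, 'NULLABLE'))
        #             for col, col_type in column_types.items()]
        #   schema.append(bigquery.SchemaField('ingestion_ts', 'TIMESTAMP'))
        #   job_config = bigquery.LoadJobConfig(schema=schema)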
    def write_to_bq(self, dataset, gcs_bucket, filename):
        """Fetches all SAIPE blobs from a GCS bucket and uploads to a single BQ table.
        Also does some preprocessing.

        dataset: The BigQuery dataset to write to
        gcs_bucket: The name of the GCS bucket to pull from
        filename: File name prefix used to identify which GCS blobs to fetch"""
        client = storage.Client()
        saipe_blobs = client.list_blobs(gcs_bucket, prefix=filename)

        frames = []
        for blob in saipe_blobs:
            frame = gcs_to_bq_util.load_values_blob_as_dataframe(blob)
            frames.append(frame)

        concat = pandas.concat(frames, ignore_index=True)
        # The SAIPE API includes the query predicate columns, which are duplicates of their
        # ALL_CAPS counterparts. Toss 'em.
        concat.drop(columns=['state', 'county', 'time'], inplace=True)
        gcs_to_bq_util.add_dataframe_to_bq(concat, dataset,
                                           self.get_table_name())
    def write_to_bq(self, dataset, gcs_bucket, **attrs):
        filename = self.get_attr(attrs, "filename")

        df = gcs_to_bq_util.load_csv_as_dataframe(
            gcs_bucket, filename, parse_dates=["Date"], thousands=",")
        df = self.standardize(df)

        # Get the metadata table
        metadata = self._download_metadata(dataset)
        if len(metadata.index) == 0:
            raise RuntimeError("BigQuery call to {} returned 0 rows".format(dataset))
        merged = CovidTrackingProject.merge_with_metadata(df, metadata)

        # Split into separate tables by variable type
        for variable_type in ["cases", "deaths", "tests", "hosp"]:
            result = merged.copy()
            result = result.loc[result["variable_type"] == variable_type]
            result.rename(columns={"value": variable_type}, inplace=True)
            result.drop("variable_type", axis="columns", inplace=True)
            # Write to BQ
            gcs_to_bq_util.add_dataframe_to_bq(
                result, dataset, self.get_table_name() + "_" + variable_type)
    def write_to_bq(self, dataset, gcs_bucket, **attrs):
        gcs_file = self.get_attr(attrs, 'filename')

        # Download the raw data
        df = gcs_to_bq_util.load_csv_as_dataframe(gcs_bucket, gcs_file)
        self.clean_frame_column_names(df)

        # Standardize the data
        # The metadata currently only has information for cases and deaths,
        # not tests or hospitalizations.
        keep_cols = [
            'state_postal_abbreviation', 'api_death', 'defines_other_death',
            'race_ethnicity_separately_death', 'race_ethnicity_combined_death',
            'race_mutually_exclusive_death',
            'combined_category_other_than_api_death', 'race_death',
            'ethnicity_death', 'api_cases', 'defines_other_cases',
            'race_ethnicity_separately_cases', 'race_ethnicity_combined_cases',
            'race_mutually_exclusive_cases',
            'combined_category_other_than_api_cases', 'race_cases',
            'ethnicity_cases'
        ]
        df = df[keep_cols]
        df = df.melt(id_vars=['state_postal_abbreviation'])
        df[['col_name', 'variable_type']] = df.variable.str.rsplit('_',
                                                                   n=1,
                                                                   expand=True)
        df.drop('variable', axis=1, inplace=True)
        df = df.pivot(index=['state_postal_abbreviation', 'variable_type'],
                      columns='col_name',
                      values='value').reset_index()
        df.replace({'variable_type': {'death': 'deaths'}}, inplace=True)
        df.rename_axis(None, inplace=True)
        df.rename(columns=self._metadata_columns_map(), inplace=True)

        # Write to BQ
        gcs_to_bq_util.add_dataframe_to_bq(df, dataset, self.get_table_name())
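
        # Data-shape sketch of the melt/rsplit/pivot above (illustrative values):
        # each metadata column such as 'api_death' is melted to a long row,
        # rsplit on its last underscore into col_name ('api') and variable_type
        # ('death', later renamed 'deaths'), then pivoted back so there is one
        # row per state per variable type:
        #
        #   state_postal_abbreviation  api_cases  api_death
        #   CA                         1          1
        #
        #   becomes
        #
        #   state_postal_abbreviation  variable_type  api
        #   CA                         cases          1
        #   CA                         deaths         1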