def load_parquet(
    client: bigquery.Client,
    dataframe: pandas.DataFrame,
    destination_table_ref: bigquery.TableReference,
    location: Optional[str],
    schema: Optional[Dict[str, Any]],
    billing_project: Optional[str] = None,
):
    job_config = bigquery.LoadJobConfig()
    job_config.write_disposition = "WRITE_APPEND"
    job_config.create_disposition = "CREATE_NEVER"
    job_config.source_format = "PARQUET"

    if schema is not None:
        schema = pandas_gbq.schema.remove_policy_tags(schema)
        job_config.schema = pandas_gbq.schema.to_google_cloud_bigquery(schema)
        dataframe = cast_dataframe_for_parquet(dataframe, schema)

    try:
        client.load_table_from_dataframe(
            dataframe,
            destination_table_ref,
            job_config=job_config,
            location=location,
            project=billing_project,
        ).result()
    except pyarrow.lib.ArrowInvalid as exc:
        raise exceptions.ConversionError(
            "Could not convert DataFrame to Parquet."
        ) from exc
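
# A minimal usage sketch for load_parquet above, assuming the function and its
# imports are already in scope. The project, dataset, and table names are
# placeholders; because the job config uses CREATE_NEVER, the destination table
# is assumed to already exist with a schema compatible with the DataFrame.
import pandas
from google.cloud import bigquery

client = bigquery.Client(project="my-project")  # hypothetical project id
destination = bigquery.TableReference.from_string("my-project.my_dataset.my_table")
df = pandas.DataFrame({"id": [1, 2, 3], "name": ["a", "b", "c"]})

# Passing schema=None skips the policy-tag stripping and Parquet casting branch.
load_parquet(
    client,
    df,
    destination,
    location="US",
    schema=None,
)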
def _upload_entity_df_into_bigquery(
    client: Client,
    project: str,
    dataset_name: str,
    entity_df: Union[pandas.DataFrame, str],
) -> Table:
    """Uploads a Pandas entity dataframe into a BigQuery table and returns the resulting table"""
    table_id = _get_table_id_for_new_entity(client, project, dataset_name)

    if type(entity_df) is str:
        job = client.query(f"CREATE TABLE {table_id} AS ({entity_df})")
        job.result()
    elif isinstance(entity_df, pandas.DataFrame):
        # Drop the index so that we don't have unnecessary columns
        entity_df.reset_index(drop=True, inplace=True)

        # Upload the dataframe into BigQuery, creating a temporary table
        job_config = bigquery.LoadJobConfig()
        job = client.load_table_from_dataframe(
            entity_df, table_id, job_config=job_config
        )
        job.result()
    else:
        raise ValueError(
            f"The entity dataframe you have provided must be a Pandas DataFrame or a BigQuery SQL query, "
            f"but we found: {type(entity_df)}"
        )

    # Ensure that the table expires after some time
    table = client.get_table(table=table_id)
    table.expires = datetime.utcnow() + timedelta(minutes=30)
    client.update_table(table, ["expires"])

    return table
def _upload_entity_df_and_get_entity_schema(
    client: Client,
    table_name: str,
    entity_df: Union[pd.DataFrame, str],
) -> Dict[str, np.dtype]:
    """Uploads a Pandas entity dataframe into a BigQuery table and returns the schema of the resulting table"""

    if type(entity_df) is str:
        job = client.query(f"CREATE TABLE {table_name} AS ({entity_df})")
        block_until_done(client, job)

        limited_entity_df = (
            client.query(f"SELECT * FROM {table_name} LIMIT 1").result().to_dataframe()
        )
        entity_schema = dict(zip(limited_entity_df.columns, limited_entity_df.dtypes))
    elif isinstance(entity_df, pd.DataFrame):
        # Drop the index so that we don't have unnecessary columns
        entity_df.reset_index(drop=True, inplace=True)

        job = client.load_table_from_dataframe(entity_df, table_name)
        block_until_done(client, job)

        entity_schema = dict(zip(entity_df.columns, entity_df.dtypes))
    else:
        raise InvalidEntityType(type(entity_df))

    # Ensure that the table expires after some time
    table = client.get_table(table=table_name)
    table.expires = datetime.utcnow() + timedelta(minutes=30)
    client.update_table(table, ["expires"])

    return entity_schema
def _upload_entity_df(
    client: Client,
    table_name: str,
    entity_df: Union[pd.DataFrame, str],
) -> Table:
    """Uploads a Pandas entity dataframe into a BigQuery table and returns the resulting table"""

    if isinstance(entity_df, str):
        job = client.query(f"CREATE TABLE {table_name} AS ({entity_df})")
    elif isinstance(entity_df, pd.DataFrame):
        # Drop the index so that we don't have unnecessary columns
        entity_df.reset_index(drop=True, inplace=True)

        job = client.load_table_from_dataframe(entity_df, table_name)
    else:
        raise InvalidEntityType(type(entity_df))

    block_until_done(client, job)

    # Ensure that the table expires after some time
    table = client.get_table(table=table_name)
    table.expires = datetime.utcnow() + timedelta(minutes=30)
    client.update_table(table, ["expires"])

    return table
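
# A hedged usage sketch for _upload_entity_df above. The table ids, dataset,
# and driver data are placeholders; Client, block_until_done, and
# InvalidEntityType are assumed to be imported/defined by the surrounding
# module, as in the snippet itself.
import pandas as pd
from google.cloud.bigquery import Client

client = Client(project="my-project")  # hypothetical project id

# Option 1: upload a DataFrame directly via load_table_from_dataframe.
entity_df = pd.DataFrame(
    {
        "driver_id": [1001, 1002],
        "event_timestamp": pd.to_datetime(["2021-04-12", "2021-04-13"]),
    }
)
table = _upload_entity_df(client, "my-project.my_dataset.entity_df", entity_df)

# Option 2: pass a SQL string, which is materialized with CREATE TABLE ... AS.
table = _upload_entity_df(
    client,
    "my-project.my_dataset.entity_df_sql",
    "SELECT * FROM `my-project.my_dataset.drivers`",
)
print(table.expires)  # the helper sets a 30-minute expiry on the created table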
def load_rules_lookup(self, client: bigquery.Client) -> bigquery.job.LoadJob:
    """
    Load rule CSV files into a BigQuery table

    :param client: active BigQuery Client object
    :return: the completed LoadJob object
    """
    job_config = bigquery.LoadJobConfig()
    rules_dataframe = self.create_rules_dataframe()
    job_config.write_disposition = bigquery.WriteDisposition.WRITE_TRUNCATE
    job = client.load_table_from_dataframe(
        rules_dataframe, destination=self.lookup_table, job_config=job_config
    )
    return job.result()
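
# A hedged calling sketch for load_rules_lookup above. The enclosing class is
# not shown, so a SimpleNamespace stands in for `self`; lookup_table and
# create_rules_dataframe are hypothetical stand-ins for whatever that class
# really provides.
from types import SimpleNamespace

import pandas as pd
from google.cloud import bigquery

client = bigquery.Client(project="my-project")  # hypothetical project id
loader = SimpleNamespace(
    lookup_table="my-project.my_dataset.rules_lookup",
    create_rules_dataframe=lambda: pd.DataFrame(
        {"rule_id": [1, 2], "pattern": ["foo", "bar"]}
    ),
)

# Call the method as a plain function, passing the stand-in object as `self`.
job = load_rules_lookup(loader, client)
print(job.output_rows)  # with WRITE_TRUNCATE, this count is the full table size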