def run(
    self,
    uri: str = None,
    dataset_id: str = None,
    table: str = None,
    project: str = None,
    schema: List[bigquery.SchemaField] = None,
    location: str = "US",
    credentials: dict = None,
    **kwargs,
):
    """
    Run method for this Task. Invoked by _calling_ this Task within a Flow context, after
    initialization.

    Args:
        - uri (str, optional): GCS path to load data from
        - dataset_id (str, optional): the id of a destination dataset to write the records to;
            if not provided here, will default to the one provided at initialization
        - table (str, optional): the name of a destination table to write the records to; if
            not provided here, will default to the one provided at initialization
        - project (str, optional): the project to initialize the BigQuery Client with; if not
            provided, will default to the one inferred from your credentials
        - schema (List[bigquery.SchemaField], optional): the schema to use when creating the table
        - location (str, optional): location of the dataset that will be written to; defaults
            to "US"
        - credentials (dict, optional): a JSON document containing Google Cloud credentials.
            You should provide these at runtime with an upstream Secret task. If not provided,
            Prefect will first check `context` for `GCP_CREDENTIALS` and lastly will use default
            Google client logic.
        - **kwargs (optional): additional kwargs to pass to the `bigquery.LoadJobConfig`;
            see the documentation here:
            https://googleapis.github.io/google-cloud-python/latest/bigquery/generated/google.cloud.bigquery.client.Client.html

    Raises:
        - ValueError: if all required arguments haven't been provided
        - ValueError: if the load job results in an error

    Returns:
        - the response from `load_table_from_uri`
    """
    # check for any argument inconsistencies
    if dataset_id is None or table is None:
        raise ValueError("Both dataset_id and table must be provided.")

    # create client
    client = get_bigquery_client(project=project, credentials=credentials)

    # get table reference
    table_ref = client.dataset(dataset_id).table(table)

    # load data
    autodetect = kwargs.pop("autodetect", True)
    job_config = bigquery.LoadJobConfig(autodetect=autodetect, **kwargs)
    if schema:
        job_config.schema = schema
    load_job = client.load_table_from_uri(uri, table_ref, job_config=job_config)
    load_job.result()  # block until job is finished
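# Illustrative usage sketch (not part of the task source): wiring this GCS-to-BigQuery
# load task into a flow with credentials supplied by an upstream Secret task. The class
# name `BigQueryLoadGoogleCloudStorage`, the secret name, and the bucket/dataset/table
# identifiers below are assumptions for illustration only.
from prefect import Flow
from prefect.tasks.secrets import PrefectSecret
from prefect.tasks.gcp.bigquery import BigQueryLoadGoogleCloudStorage

load_gcs = BigQueryLoadGoogleCloudStorage()

with Flow("load-from-gcs") as flow:
    creds = PrefectSecret("GCP_CREDENTIALS")
    load_gcs(
        uri="gs://example-bucket/exports/events-*.json",
        dataset_id="analytics",
        table="events",
        credentials=creds,
    )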
def run(
    self,
    project: str = None,
    credentials: dict = None,
    dataset: str = None,
    table: str = None,
    schema: List[bigquery.SchemaField] = None,
):
    """
    Run method for this Task. Invoked by _calling_ this Task within a Flow context, after
    initialization.

    Args:
        - project (str, optional): the project to initialize the BigQuery Client with; if not
            provided, will default to the one inferred from your credentials
        - credentials (dict, optional): a JSON document containing Google Cloud credentials.
            You should provide these at runtime with an upstream Secret task. If not provided,
            Prefect will first check `context` for `GCP_CREDENTIALS` and lastly will use default
            Google client logic.
        - dataset (str, optional): the name of the dataset in which the table will be created
        - table (str, optional): the name of the table to create
        - schema (List[bigquery.SchemaField], optional): the schema to use when creating the table

    Returns:
        - None

    Raises:
        - SUCCESS: a `SUCCESS` signal if the table already exists
    """
    client = get_bigquery_client(project=project, credentials=credentials)

    try:
        dataset_ref = client.get_dataset(dataset)
    except NotFound:
        self.logger.debug("Dataset {} not found, creating...".format(dataset))
        dataset_ref = client.create_dataset(dataset)

    table_ref = dataset_ref.table(table)
    try:
        client.get_table(table_ref)
        raise SUCCESS(
            "{dataset}.{table} already exists.".format(dataset=dataset, table=table)
        )
    except NotFound:
        self.logger.debug("Table {} not found, creating...".format(table))
        table = bigquery.Table(table_ref, schema=schema)

        # partitioning
        if self.time_partitioning:
            table.time_partitioning = self.time_partitioning

        # cluster for optimal data sorting/access
        if self.clustering_fields:
            table.clustering_fields = self.clustering_fields

        client.create_table(table)
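# Illustrative sketch: creating a partitioned, clustered table with this task. The class
# name `CreateBigQueryTable` and its `time_partitioning` / `clustering_fields` init kwargs
# are assumed from the run() body above; dataset, table, and field names are placeholders.
from google.cloud import bigquery
from prefect import Flow
from prefect.tasks.secrets import PrefectSecret
from prefect.tasks.gcp.bigquery import CreateBigQueryTable

create_table = CreateBigQueryTable(
    time_partitioning=bigquery.TimePartitioning(field="created_at"),
    clustering_fields=["customer_id"],
)

with Flow("create-table") as flow:
    creds = PrefectSecret("GCP_CREDENTIALS")
    create_table(
        dataset="analytics",
        table="events",
        schema=[
            bigquery.SchemaField("customer_id", "STRING"),
            bigquery.SchemaField("created_at", "TIMESTAMP"),
        ],
        credentials=creds,
    )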
def get_client(
    project: str, credentials: dict, credentials_secret: str = None
) -> bigquery.Client:
    creds = None
    if credentials_secret is not None:
        warnings.warn(
            "The `credentials_secret` argument is deprecated. Use a `Secret` task "
            "to pass the credentials value at runtime instead.",
            UserWarning,
        )
        # TODO: make this optional so that the machine can authenticate externally to Prefect
        creds = Secret(credentials_secret).get()
    return get_bigquery_client(credentials=creds, project=project)
def extract_ga_table(
    project: str, gcp_credentials: dict, dataset: str, date: str, output_root: str
):
    """
    Runs a BigQuery extraction job, extracting the Google Analytics `ga_sessions` table
    for a given date to a location in GCS in gzip-compressed JSON format.
    """
    table_name = "ga_sessions_{}".format(date)
    dest_filename = "{}_*.json.gz".format(table_name)
    base_extraction_path = os.path.join(output_root, dataset, date)
    destination_uri = os.path.join(base_extraction_path, dest_filename)

    client = get_bigquery_client(credentials=gcp_credentials, project=project)
    dataset_ref = client.dataset(dataset, project=project)
    table = dataset_ref.table(table_name)

    job_config = bigquery.job.ExtractJobConfig()
    job_config.destination_format = bigquery.DestinationFormat.NEWLINE_DELIMITED_JSON
    job_config.compression = "GZIP"

    extract_job = client.extract_table(table, destination_uri, job_config=job_config)
    extract_job.result()

    return base_extraction_path
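# Illustrative sketch of a single extraction run; the project, GA dataset, and GCS bucket
# below are placeholders, and loading credentials from a local service-account file is an
# assumption for the example only.
import json

with open("service-account.json") as f:
    gcp_creds = json.load(f)

export_path = extract_ga_table(
    project="example-project",
    gcp_credentials=gcp_creds,
    dataset="123456789",          # GA property/view dataset id (placeholder)
    date="20240101",
    output_root="gs://example-bucket/ga-exports",
)
# export_path -> "gs://example-bucket/ga-exports/123456789/20240101"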
def run(
    self,
    query: str = None,
    query_params: List[tuple] = None,
    project: str = None,
    location: str = "US",
    dry_run_max_bytes: int = None,
    credentials: dict = None,
    dataset_dest: str = None,
    table_dest: str = None,
    job_config: dict = None,
):
    """
    Run method for this Task. Invoked by _calling_ this Task within a Flow context, after
    initialization.

    Args:
        - query (str, optional): a string of the query to execute
        - query_params (list[tuple], optional): a list of 3-tuples specifying BigQuery query
            parameters; currently only scalar query parameters are supported. See
            [the Google documentation](https://cloud.google.com/bigquery/docs/parameterized-queries#bigquery-query-params-python)
            for more details on how both the query and the query parameters should be formatted
        - project (str, optional): the project to initialize the BigQuery Client with; if not
            provided, will default to the one inferred from your credentials
        - location (str, optional): location of the dataset that will be queried; defaults to "US"
        - dry_run_max_bytes (int, optional): if provided, the maximum number of bytes the query
            is allowed to process; this will be determined by executing a dry run and raising a
            `ValueError` if the maximum is exceeded
        - credentials (dict, optional): a JSON document containing Google Cloud credentials.
            You should provide these at runtime with an upstream Secret task. If not provided,
            Prefect will first check `context` for `GCP_CREDENTIALS` and lastly will use default
            Google client logic.
        - dataset_dest (str, optional): the optional name of a destination dataset to write the
            query results to, if you don't want them returned; if provided, `table_dest` must
            also be provided
        - table_dest (str, optional): the optional name of a destination table to write the
            query results to, if you don't want them returned; if provided, `dataset_dest` must
            also be provided
        - job_config (dict, optional): an optional dictionary of job configuration parameters;
            note that the parameters provided here must be pickleable (e.g., dataset references
            will be rejected)

    Raises:
        - ValueError: if the `query` is `None`
        - ValueError: if only one of `dataset_dest` / `table_dest` is provided
        - ValueError: if the query will exceed `dry_run_max_bytes`

    Returns:
        - list: a fully populated list of Query results, with one item per row
    """
    ## check for any argument inconsistencies
    if query is None:
        raise ValueError("No query provided.")
    if sum([dataset_dest is None, table_dest is None]) == 1:
        raise ValueError(
            "Both `dataset_dest` and `table_dest` must be provided if writing to a "
            "destination table."
        )

    ## create client
    client = get_bigquery_client(project=project, credentials=credentials)

    ## setup jobconfig; guard against the default of `None`
    job_config = bigquery.QueryJobConfig(**(job_config or {}))
    if query_params is not None:
        hydrated_params = [bigquery.ScalarQueryParameter(*qp) for qp in query_params]
        job_config.query_parameters = hydrated_params

    ## perform dry_run if requested
    if dry_run_max_bytes is not None:
        old_info = dict(
            dry_run=job_config.dry_run, use_query_cache=job_config.use_query_cache
        )
        job_config.dry_run = True
        job_config.use_query_cache = False
        self.logger.debug("Performing a dry run...")
        query_job = client.query(query, location=location, job_config=job_config)
        if query_job.total_bytes_processed > dry_run_max_bytes:
            raise ValueError(
                "Query will process {0} bytes which is above the set maximum of {1} "
                "for this task.".format(
                    query_job.total_bytes_processed, dry_run_max_bytes
                )
            )
        job_config.dry_run = old_info["dry_run"]
        job_config.use_query_cache = old_info["use_query_cache"]

    ## if writing to a destination table
    if dataset_dest is not None:
        table_ref = client.dataset(dataset_dest).table(table_dest)
        job_config.destination = table_ref

    query_job = client.query(query, location=location, job_config=job_config)
    return list(query_job.result())
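# Illustrative sketch of a parameterized query with a dry-run byte cap. The class name
# `BigQueryTask` is assumed from the run() signature above; the public dataset is only an
# example target, and the secret name is a placeholder.
from prefect import Flow
from prefect.tasks.secrets import PrefectSecret
from prefect.tasks.gcp.bigquery import BigQueryTask

bq_query = BigQueryTask()

with Flow("bq-query") as flow:
    creds = PrefectSecret("GCP_CREDENTIALS")
    rows = bq_query(
        query="""
            SELECT name, SUM(number) AS total
            FROM `bigquery-public-data.usa_names.usa_1910_2013`
            WHERE state = @state
            GROUP BY name
            ORDER BY total DESC
            LIMIT 10
        """,
        query_params=[("state", "STRING", "TX")],   # hydrated into ScalarQueryParameters
        dry_run_max_bytes=10 * 1024 ** 3,           # fail before scanning more than ~10 GB
        credentials=creds,
    )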
def run(
    self,
    records: List[dict],
    dataset_id: str = None,
    table: str = None,
    project: str = None,
    location: str = "US",
    credentials: dict = None,
    **kwargs,
):
    """
    Run method for this Task. Invoked by _calling_ this Task within a Flow context, after
    initialization.

    Args:
        - records (list[dict]): the list of records to insert as rows into the BigQuery table;
            each item in the list should be a dictionary whose keys correspond to columns in
            the table
        - dataset_id (str, optional): the id of a destination dataset to write the records to;
            if not provided here, will default to the one provided at initialization
        - table (str, optional): the name of a destination table to write the records to; if
            not provided here, will default to the one provided at initialization
        - project (str, optional): the project to initialize the BigQuery Client with; if not
            provided, will default to the one inferred from your credentials
        - location (str, optional): location of the dataset that will be written to; defaults
            to "US"
        - credentials (dict, optional): a JSON document containing Google Cloud credentials.
            You should provide these at runtime with an upstream Secret task. If not provided,
            Prefect will first check `context` for `GCP_CREDENTIALS` and lastly will use default
            Google client logic.
        - **kwargs (optional): additional kwargs to pass to the `insert_rows_json` method;
            see the documentation here:
            https://googleapis.github.io/google-cloud-python/latest/bigquery/generated/google.cloud.bigquery.client.Client.html

    Raises:
        - ValueError: if all required arguments haven't been provided
        - ValueError: if any of the records result in errors

    Returns:
        - the response from `insert_rows_json`
    """
    ## check for any argument inconsistencies
    if dataset_id is None or table is None:
        raise ValueError("Both dataset_id and table must be provided.")

    ## create client
    client = get_bigquery_client(project=project, credentials=credentials)

    ## get table reference
    table_ref = client.dataset(dataset_id).table(table)

    ## stream data in
    response = client.insert_rows_json(table=table_ref, json_rows=records, **kwargs)

    errors = []
    output = []
    for row in response:
        output.append(row)
        if "errors" in row:
            errors.append(row["errors"])

    if errors:
        raise ValueError(errors)

    return output
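# Illustrative sketch of streaming a handful of rows; `BigQueryStreamingInsert` is the
# assumed class name for the run() method above, and the records, dataset, and table names
# are placeholders.
from prefect import Flow
from prefect.tasks.secrets import PrefectSecret
from prefect.tasks.gcp.bigquery import BigQueryStreamingInsert

stream_rows = BigQueryStreamingInsert()

with Flow("bq-stream") as flow:
    creds = PrefectSecret("GCP_CREDENTIALS")
    stream_rows(
        records=[
            {"customer_id": "c-1", "event": "signup"},
            {"customer_id": "c-2", "event": "purchase"},
        ],
        dataset_id="analytics",
        table="events",
        credentials=creds,
    )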
def run(
    self,
    file: Union[str, Path] = None,
    rewind: bool = False,
    size: int = None,
    num_retries: int = 6,
    dataset_id: str = None,
    table: str = None,
    project: str = None,
    schema: List[bigquery.SchemaField] = None,
    location: str = "US",
    credentials: dict = None,
    **kwargs,
):
    """
    Run method for this Task. Invoked by _calling_ this Task within a Flow context, after
    initialization.

    Args:
        - file (Union[str, path-like object], optional): a string or path-like object of the
            file to be loaded
        - rewind (bool, optional): if True, seek to the beginning of the file handle before
            reading the file
        - size (int, optional): the number of bytes to read from the file handle. If size is
            None or large, resumable upload will be used. Otherwise, multipart upload will be
            used.
        - num_retries (int, optional): the number of upload retries; defaults to 6
        - dataset_id (str, optional): the id of a destination dataset to write the records to;
            if not provided here, will default to the one provided at initialization
        - table (str, optional): the name of a destination table to write the records to; if
            not provided here, will default to the one provided at initialization
        - project (str, optional): the project to initialize the BigQuery Client with; if not
            provided, will default to the one inferred from your credentials
        - schema (List[bigquery.SchemaField], optional): the schema to use when creating the table
        - location (str, optional): location of the dataset that will be written to; defaults
            to "US"
        - credentials (dict, optional): a JSON document containing Google Cloud credentials.
            You should provide these at runtime with an upstream Secret task.
        - **kwargs (optional): additional kwargs to pass to the `bigquery.LoadJobConfig`;
            see the documentation here:
            https://googleapis.github.io/google-cloud-python/latest/bigquery/generated/google.cloud.bigquery.client.Client.html

    Raises:
        - ValueError: if all required arguments haven't been provided or file does not exist
        - IOError: if file can't be opened and read
        - ValueError: if the load job results in an error

    Returns:
        - the response from `load_table_from_file`
    """
    # check for any argument inconsistencies
    if dataset_id is None or table is None:
        raise ValueError("Both dataset_id and table must be provided.")
    try:
        path = Path(file)
    except Exception:
        raise ValueError("A string or path-like object must be provided.")
    if not path.is_file():
        raise ValueError(f"File {path.as_posix()} does not exist.")

    # create client
    client = get_bigquery_client(project=project, credentials=credentials)

    # get table reference
    table_ref = client.dataset(dataset_id).table(table)

    # configure job
    autodetect = kwargs.pop("autodetect", True)
    job_config = bigquery.LoadJobConfig(autodetect=autodetect, **kwargs)
    if schema:
        job_config.schema = schema

    # load data
    try:
        with open(file, "rb") as file_obj:
            load_job = client.load_table_from_file(
                file_obj,
                table_ref,
                rewind,
                size,
                num_retries,
                job_config=job_config,
            )
    except IOError:
        raise IOError(f"Can't open and read from {path.as_posix()}.")

    load_job.result()  # block until job is finished
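# Illustrative sketch of loading a local newline-delimited JSON file; the class name
# `BigQueryLoadFile` is assumed from the run() signature above, and `source_format` is
# simply forwarded to `bigquery.LoadJobConfig` via **kwargs. File and table names are
# placeholders.
from google.cloud import bigquery
from prefect import Flow
from prefect.tasks.secrets import PrefectSecret
from prefect.tasks.gcp.bigquery import BigQueryLoadFile

load_file = BigQueryLoadFile()

with Flow("bq-load-file") as flow:
    creds = PrefectSecret("GCP_CREDENTIALS")
    load_file(
        file="data/events.json",
        dataset_id="analytics",
        table="events",
        credentials=creds,
        source_format=bigquery.SourceFormat.NEWLINE_DELIMITED_JSON,
    )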