import time
from typing import Union

from google.cloud import bigquery

# check_for_bq_job_and_children_errors and the exceptions module are
# project-local helpers assumed to be importable from the surrounding package.


def wait_on_bq_job_id(bq_client: bigquery.Client,
                      job_id: str,
                      polling_timeout: int,
                      polling_interval: int = 1) -> bool:
    """Wait for a BigQuery job ID to complete.

    Args:
        bq_client: bigquery.Client to use for polling.
        job_id: str the BQ job ID to wait on.
        polling_timeout: int number of seconds to poll this job ID.
        polling_interval: frequency (in seconds) to query the job state
            during polling.

    Returns:
        bool: whether the job finished successfully. True if DONE without
        errors, False if still RUNNING or PENDING when the timeout elapses.

    Raises:
        exceptions.BigQueryJobFailure: if the job failed.
        google.api_core.exceptions.NotFound: if the job ID cannot be found.
    """
    start_poll = time.monotonic()
    while time.monotonic() - start_poll < (polling_timeout - polling_interval):
        job: Union[bigquery.LoadJob, bigquery.QueryJob] = bq_client.get_job(job_id)
        if job.state == "DONE":
            check_for_bq_job_and_children_errors(bq_client, job)
            return True
        if job.state in {"RUNNING", "PENDING"}:
            print(f"waiting on BigQuery Job {job.job_id}")
            time.sleep(polling_interval)
    print(f"reached polling timeout waiting for bigquery job {job_id}")
    return False
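# Hedged usage sketch (not part of the original source): submit a query and
# poll it with wait_on_bq_job_id. The query text and the 60-second timeout
# are illustrative assumptions.
def _example_wait_on_job() -> None:
    client = bigquery.Client()
    job = client.query("SELECT 1")  # returns a QueryJob carrying a job_id
    if wait_on_bq_job_id(client, job.job_id, polling_timeout=60):
        print(f"job {job.job_id} finished without errors")
    else:
        print(f"job {job.job_id} did not finish within the polling timeout")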
from google.cloud import bigquery, storage

# `utils` and `logging` below are project-local modules providing
# handle_bq_lock and log_bigquery_job.


def retry_query(gcs_client: storage.Client, bq_client: bigquery.Client,
                lock_blob: storage.Blob, failed_job_id: str,
                table: bigquery.TableReference, retry_attempt_cnt: int):
    """Retry a query that failed."""
    if retry_attempt_cnt > 1:
        # If this is not the first retry, truncate the previous job_id's
        # retry-attempt suffix '_xx' (3 chars) before appending a new one.
        retry_job_id = f"{failed_job_id[:-3]}_{retry_attempt_cnt:02}"  # pad with zero
    else:
        retry_job_id = f"{failed_job_id}_{retry_attempt_cnt:02}"  # pad with zero
    failed_job: bigquery.QueryJob = bq_client.get_job(failed_job_id)
    job_config: bigquery.QueryJobConfig = bigquery.QueryJobConfig(
        table_definitions=failed_job.table_definitions, use_legacy_sql=False)
    retry_job = bq_client.query(failed_job.query,
                                job_config=job_config,
                                job_id=retry_job_id)
    # To keep track of retry attempts between cloud function invocations,
    # the retry count state is kept in the _bqlock lock file.
    utils.handle_bq_lock(gcs_client,
                         lock_blob,
                         retry_job_id,
                         table,
                         retry_attempt_cnt=retry_attempt_cnt)
    logging.log_bigquery_job(
        retry_job, table,
        f"Submitted asynchronous query job: {retry_job_id}")
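# Illustration of the retry job ID scheme above, using a made-up job ID:
# the first retry appends "_01"; later retries strip the previous
# 3-character "_NN" suffix first, so IDs never accumulate suffixes.
def _example_retry_job_ids() -> None:
    failed_job_id = "load_job_abc123"            # hypothetical job ID
    first_retry = f"{failed_job_id}_{1:02}"      # "load_job_abc123_01"
    second_retry = f"{first_retry[:-3]}_{2:02}"  # "load_job_abc123_02"
    assert second_retry == "load_job_abc123_02"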
from typing import Union

from google.cloud import bigquery
from google.cloud.bigquery import Client
from tenacity import (Retrying, retry_if_exception_type, stop_after_delay,
                      wait_fixed)

# flags_helper, BigQueryJobStillRunning, and BigQueryJobCancelled are
# project-local helpers assumed to be importable from the surrounding package.


def block_until_done(
    client: Client,
    bq_job: Union[bigquery.job.query.QueryJob, bigquery.job.load.LoadJob],
    timeout: int = 1800,
    retry_cadence: float = 1,
):
    """Waits for bq_job to finish running, up to a maximum amount of time
    specified by the timeout parameter (defaulting to 30 minutes).

    Args:
        client: A bigquery.client.Client to monitor the bq_job.
        bq_job: The bigquery.job.QueryJob that blocks until done running.
        timeout: An optional number of seconds for setting the time limit
            of the job.
        retry_cadence: An optional number of seconds describing how often
            the job should be checked for completion.

    Raises:
        BigQueryJobStillRunning exception if the function has blocked
            longer than the timeout.
        BigQueryJobCancelled exception to signify that the job has been
            cancelled (i.e. from timeout or KeyboardInterrupt).
    """
    # For test environments, retry more aggressively.
    if flags_helper.is_test():
        retry_cadence = 0.1

    def _wait_until_done(bq_job):
        if client.get_job(bq_job).state in ["PENDING", "RUNNING"]:
            raise BigQueryJobStillRunning(job_id=bq_job.job_id)

    try:
        retryer = Retrying(
            wait=wait_fixed(retry_cadence),
            stop=stop_after_delay(timeout),
            retry=retry_if_exception_type(BigQueryJobStillRunning),
            reraise=True,
        )
        retryer(_wait_until_done, bq_job)
    finally:
        if client.get_job(bq_job).state in ["PENDING", "RUNNING"]:
            client.cancel_job(bq_job)
            raise BigQueryJobCancelled(job_id=bq_job.job_id)

    if bq_job.exception():
        raise bq_job.exception()
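# Minimal sketch of calling block_until_done (the query and the 5-minute
# timeout are assumptions): block on a submitted job and handle the
# cancellation raised when the job outlives the timeout.
def _example_block_until_done(client: Client) -> None:
    job = client.query("SELECT 1")
    try:
        block_until_done(client, job, timeout=300, retry_cadence=2.0)
    except BigQueryJobCancelled:
        print(f"job {job.job_id} was cancelled after exceeding the timeout")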
import json
import logging
import traceback
from typing import List, Optional, Tuple

from google.cloud.bigquery import Client

# Dataset, Source, DbTableSchema, DbColumn, DbTableName, the facet classes,
# and get_from_nullable_chain are assumed to come from the surrounding
# OpenLineage integration package.


class BigQueryDatasetsProvider:
    def __init__(self,
                 client: Optional[Client] = None,
                 logger: Optional[logging.Logger] = None):
        self.client = client
        if client is None:
            self.client = Client()
        self.logger = logger
        if logger is None:
            self.logger = logging.getLogger(__name__)

    def get_facets(self, job_id: str) -> BigQueryFacets:
        inputs = []
        output = None
        run_facets = {}
        try:
            try:
                job = self.client.get_job(job_id=job_id)
                props = job._properties
                run_stat_facet, dataset_stat_facet = \
                    self._get_output_statistics(props)
                run_facets.update({"bigQuery_job": run_stat_facet})
                inputs = self._get_input_from_bq(props)
                output = self._get_output_from_bq(props)
                if output and dataset_stat_facet:
                    output.custom_facets.update({"stats": dataset_stat_facet})
                    output.output_facets.update({
                        'outputStatistics': dataset_stat_facet.to_openlineage()
                    })
            finally:
                # Ensure client has close() defined, otherwise ignore.
                # NOTE: close() was introduced in python-bigquery v1.23.0
                if hasattr(self.client, "close"):
                    self.client.close()
        except Exception as e:
            self.logger.error(
                f"Cannot retrieve job details from BigQuery.Client. {e}",
                exc_info=True)
            run_facets.update({
                "bigQuery_error": BigQueryErrorRunFacet(
                    clientError=f"{e}: {traceback.format_exc()}",
                )
            })
        return BigQueryFacets(run_facets, inputs, output)

    def _get_output_statistics(self, properties) \
            -> Tuple[BigQueryJobRunFacet, Optional[BigQueryStatisticsDatasetFacet]]:
        stages = get_from_nullable_chain(properties,
                                         ['statistics', 'query', 'queryPlan'])
        json_props = json.dumps(properties)
        if not stages:
            if get_from_nullable_chain(
                    properties, ['statistics', 'query', 'statementType']) \
                    in ['CREATE_VIEW', 'CREATE_TABLE', 'ALTER_TABLE']:
                return BigQueryJobRunFacet(cached=False), None

            # We're probably getting cached results.
            if get_from_nullable_chain(properties,
                                       ['statistics', 'query', 'cacheHit']):
                return BigQueryJobRunFacet(cached=True), None
            if get_from_nullable_chain(properties, ['status', 'state']) != "DONE":
                raise ValueError(
                    "Trying to extract data from running bigquery job")
            raise ValueError(
                f"BigQuery properties did not have required data: queryPlan - {json_props}"
            )

        out_stage = stages[-1]
        out_rows = out_stage.get("recordsWritten", None)
        out_bytes = out_stage.get("shuffleOutputBytes", None)
        billed_bytes = get_from_nullable_chain(
            properties, ['statistics', 'query', 'totalBytesBilled'])
        return BigQueryJobRunFacet(
            cached=False,
            billedBytes=int(billed_bytes) if billed_bytes else None,
            properties=json_props
        ), (BigQueryStatisticsDatasetFacet(rowCount=int(out_rows),
                                           size=int(out_bytes))
            if out_bytes and out_rows else None)

    def _get_input_from_bq(self, properties):
        bq_input_tables = get_from_nullable_chain(
            properties, ['statistics', 'query', 'referencedTables'])
        if not bq_input_tables:
            return []

        input_table_names = [
            self._bq_table_name(bq_t) for bq_t in bq_input_tables
        ]
        sources = [self._source() for bq_t in bq_input_tables]
        try:
            return [
                Dataset.from_table_schema(source=source,
                                          table_schema=table_schema)
                for table_schema, source in zip(
                    self._get_table_schemas(input_table_names), sources)
            ]
        except Exception as e:
            self.logger.warning(f'Could not extract schema from bigquery. {e}')
            return [
                Dataset.from_table(source, table)
                for table, source in zip(input_table_names, sources)
            ]

    def _get_output_from_bq(self, properties) -> Optional[Dataset]:
        bq_output_table = get_from_nullable_chain(
            properties, ['configuration', 'query', 'destinationTable'])
        if not bq_output_table:
            return None

        output_table_name = self._bq_table_name(bq_output_table)
        source = self._source()
        table_schema = self._get_table_safely(output_table_name)
        if table_schema:
            return Dataset.from_table_schema(
                source=source,
                table_schema=table_schema,
            )
        else:
            self.logger.warning("Could not resolve output table from bq")
            return Dataset.from_table(source, output_table_name)

    def _get_table_safely(self, output_table_name):
        try:
            return self._get_table(output_table_name)
        except Exception as e:
            self.logger.warning(
                f'Could not extract output schema from bigquery. {e}')
            return None

    def _get_table_schemas(self, tables: List[str]) -> List[DbTableSchema]:
        # Avoid querying BigQuery by returning an empty list
        # if no tables have been provided.
        if not tables:
            return []
        return [self._get_table(table) for table in tables]

    def _get_table(self, table: str) -> Optional[DbTableSchema]:
        bq_table = self.client.get_table(table)
        if not bq_table._properties:
            return None
        # Reuse the name for the raw properties dict of the fetched table.
        table = bq_table._properties
        fields = get_from_nullable_chain(table, ['schema', 'fields'])
        if not fields:
            return None

        columns = [
            DbColumn(name=fields[i].get('name'),
                     type=fields[i].get('type'),
                     description=fields[i].get('description'),
                     ordinal_position=i)
            for i in range(len(fields))
        ]
        return DbTableSchema(
            schema_name=table.get('tableReference').get('projectId') + '.' +
            table.get('tableReference').get('datasetId'),
            table_name=DbTableName(table.get('tableReference').get('tableId')),
            columns=columns)

    def _source(self) -> Source:
        return Source(scheme='bigquery', connection_url='bigquery')

    def _bq_table_name(self, bq_table):
        project = bq_table.get('projectId')
        dataset = bq_table.get('datasetId')
        table = bq_table.get('tableId')
        return f"{project}.{dataset}.{table}"
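# Hedged usage sketch for BigQueryDatasetsProvider (the job ID is a made-up
# placeholder, and the facets attribute names are assumed from the positional
# BigQueryFacets(run_facets, inputs, output) construction above): fetch
# lineage facets for a finished job. Note that get_facets does not raise on
# client errors; failures surface as a "bigQuery_error" run facet instead.
def _example_get_facets() -> None:
    provider = BigQueryDatasetsProvider()
    facets = provider.get_facets(job_id="job_abc123")  # hypothetical job ID
    print(f"run facets: {sorted(facets.run_facets)}")
    print(f"input count: {len(facets.inputs)}, output: {facets.output}")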