def query(self, query, max_results=None, timeout=0, dry_run=False):
    """Submit a query to BigQuery.

    Builds the request body from the arguments and delegates to
    ``self._submit_query_job``, so the API call, the decoding of a dry-run
    error response, and the timeout check live in a single place.

    Args:
        query: BigQuery query string.
        max_results: maximum number of rows to return per page of results.
        timeout: how long to wait for the query to complete, in seconds,
            before the request times out and returns. It is sent to the
            API as ``timeoutMs`` (``timeout * 1000``). A falsy value
            (the default 0) means an unfinished job is not treated as a
            timeout.
        dry_run: if True, the query isn't actually run. A valid query will
            return an empty response, while an invalid one will return the
            same error message it would if it wasn't a dry run.

    Returns:
        A ``(job_id, rows)`` tuple, where rows is the list of transformed
        result rows (empty if the reply carried none). If dry_run is True
        and the query is invalid, ``(None, error_response_dict)`` is
        returned instead.

    Raises:
        BigQueryTimeoutException on timeout
    """
    logging.debug('Executing query: %s', query)

    query_data = {
        'query': query,
        'timeoutMs': timeout * 1000,  # the API expects milliseconds
        'dryRun': dry_run,
        'maxResults': max_results,
    }

    # Delegate instead of duplicating the request/response handling: the
    # shared helper decodes the HTTP error body before parsing it as JSON.
    return self._submit_query_job(query_data)
def _submit_query_job(self, query_data): """ Submit a query job to BigQuery. This is similar to BigQueryClient.query, but gives the user direct access to the query method on the offical BigQuery python client. For fine-grained control over a query job, see: https://google-api-client-libraries.appspot.com/documentation/bigquery/v2/python/latest/bigquery_v2.jobs.html#query Args: query_data: query object as per "configuration.query" in https://cloud.google.com/bigquery/docs/reference/v2/jobs#configuration.query Returns: job id and query results if query completed. If dry_run is True, job id will be None and results will be empty if the query is valid or a dict containing the response if invalid. Raises: BigQueryTimeoutException on timeout """ logging.debug('Submitting query job: %s' % query_data) job_collection = self.bigquery.jobs() try: query_reply = job_collection.query( projectId=self.project_id, body=query_data).execute() except HttpError as e: if query_data.get("dryRun", False): return None, json.loads(e.content.decode('utf8')) raise job_id = query_reply['jobReference'].get('jobId') schema = query_reply.get('schema', {'fields': None})['fields'] rows = query_reply.get('rows', []) job_complete = query_reply.get('jobComplete', False) # raise exceptions if it's not an async query # and job is not completed after timeout if not job_complete and query_data.get("timeoutMs", False): logging.error('BigQuery job %s timeout' % job_id) raise BigQueryTimeoutException() return job_id, [self._transform_row(row, schema) for row in rows]
def wait_for_job(self, job, interval=5, timeout=60):
    """Poll a BigQuery job until it is done or the timeout elapses.

    Each iteration sleeps for ``interval`` seconds, fetches the job
    resource, passes it to ``self._raise_executing_exception_if_error``
    and checks whether its state is ``DONE``. Note that the sleep happens
    before the first poll as well.

    Args:
        job: dict, representing a BigQuery job resource (its id is read
            from ``job['jobReference']['jobId']``),
            or str/bytes/int, representing a BigQuery job id
        interval: optional float polling interval in seconds, default = 5
        timeout: optional float timeout in seconds, default = 60

    Returns:
        dict, final state of the job_resource, as described here:
        https://developers.google.com/resources/api-libraries/documentation/bigquery/v2/python/latest/bigquery_v2.jobs.html#get

    Raises:
        JobExecutingException on http/auth failures or error in result
            (per the original documentation; presumably raised by
            ``self._raise_executing_exception_if_error`` -- confirm there)
        BigQueryTimeoutException on timeout
    """
    if isinstance(job, bytes) and not isinstance(job, str):
        # Python 3 bytes: str(b'abc') would produce the repr "b'abc'" and
        # send a bogus job id, so decode instead. On Python 2 bytes is
        # str, and this branch is never taken.
        job_id = job.decode('utf8')
    elif isinstance(job, (six.binary_type, six.text_type, int)):
        job_id = str(job)
    else:
        job_id = str(job['jobReference']['jobId'])

    complete = False
    job_resource = None
    start_time = time()
    elapsed_time = 0

    while not (complete or elapsed_time > timeout):
        sleep(interval)
        request = self.bigquery.jobs().get(projectId=self.project_id,
                                           jobId=job_id)
        job_resource = request.execute()
        self._raise_executing_exception_if_error(job_resource)
        complete = job_resource.get('status').get('state') == u'DONE'
        elapsed_time = time() - start_time

    # The loop also exits once the timeout has elapsed; tell the two
    # outcomes apart here.
    if not complete:
        logging.error('BigQuery job %s timeout', job_id)
        raise BigQueryTimeoutException()

    return job_resource