示例#1
0
    def query(self, query, max_results=None, timeout=0, dry_run=False):
        """Submit a query to BigQuery and return its job id and rows.

        Args:
            query: BigQuery query string.
            max_results: maximum number of rows to return per page of results;
                         sent to the API as ``maxResults``.
            timeout: how long to wait for the query to complete, in seconds.
                     Sent to the API as ``timeoutMs`` (``timeout * 1000``).
                     A falsy value (the default, 0) disables the timeout
                     check, so an incomplete job is returned without error.
            dry_run: sent to the API as ``dryRun``. It also changes error
                     handling: an HttpError raised by the request is
                     returned as a parsed response instead of propagating.

        Returns:
            A ``(job_id, rows)`` tuple. ``job_id`` is the ``jobId`` entry of
            the reply's ``jobReference`` (None when absent) and ``rows`` is
            the reply's rows, each passed through ``self._transform_row``
            together with the reply's schema fields (empty list when the
            reply carries no rows).
            When ``dry_run`` is true and the request raises HttpError, the
            result is ``(None, error)`` where ``error`` is the JSON-decoded
            body of the error response.

        Raises:
            BigQueryTimeoutException: if ``timeout`` is truthy and the reply
                does not report ``jobComplete``.
            HttpError: re-raised when the request fails and ``dry_run`` is
                false.
        """

        logging.debug('Executing query: %s', query)

        job_collection = self.bigquery.jobs()
        query_data = {
            'query': query,
            'timeoutMs': timeout * 1000,
            'dryRun': dry_run,
            'maxResults': max_results,
        }

        try:
            query_reply = job_collection.query(
                projectId=self.project_id, body=query_data).execute()
        except HttpError as e:
            if not dry_run:
                raise
            # The error body is typically raw bytes on Python 3; decode it
            # first so parsing also works on interpreters whose json.loads
            # only accepts text (Python 3 before 3.6).
            content = e.content
            if isinstance(content, bytes):
                content = content.decode('utf8')
            return None, json.loads(content)

        job_id = query_reply['jobReference'].get('jobId')
        schema = query_reply.get('schema', {'fields': None})['fields']
        rows = query_reply.get('rows', [])
        job_complete = query_reply.get('jobComplete', False)

        # A truthy timeout means the caller asked to wait for completion, so
        # an incomplete job at this point is a timeout. With timeout == 0 the
        # call is treated as asynchronous and the job id is simply returned.
        if not job_complete and timeout:
            logging.error('BigQuery job %s timeout', job_id)
            raise BigQueryTimeoutException()

        return job_id, [self._transform_row(row, schema) for row in rows]
示例#2
0
    def _submit_query_job(self, query_data):
        """ Submit a query job to BigQuery.

            This is similar to BigQueryClient.query, but gives the user
            direct access to the query method on the offical BigQuery
            python client.

            For fine-grained control over a query job, see:
            https://google-api-client-libraries.appspot.com/documentation/bigquery/v2/python/latest/bigquery_v2.jobs.html#query



        Args:
            query_data: query object as per "configuration.query" in
                        https://cloud.google.com/bigquery/docs/reference/v2/jobs#configuration.query

        Returns:
            job id and query results if query completed. If dry_run is True,
            job id will be None and results will be empty if the query is valid
            or a dict containing the response if invalid.

        Raises:
            BigQueryTimeoutException on timeout
        """

        logging.debug('Submitting query job: %s' % query_data)

        job_collection = self.bigquery.jobs()

        try:
            query_reply = job_collection.query(
                projectId=self.project_id, body=query_data).execute()
        except HttpError as e:
            if query_data.get("dryRun", False):
                return None, json.loads(e.content.decode('utf8'))
            raise

        job_id = query_reply['jobReference'].get('jobId')
        schema = query_reply.get('schema', {'fields': None})['fields']
        rows = query_reply.get('rows', [])
        job_complete = query_reply.get('jobComplete', False)

        # raise exceptions if it's not an async query
        # and job is not completed after timeout
        if not job_complete and query_data.get("timeoutMs", False):
            logging.error('BigQuery job %s timeout' % job_id)
            raise BigQueryTimeoutException()

        return job_id, [self._transform_row(row, schema) for row in rows]
示例#3
0
    def wait_for_job(self, job, interval=5, timeout=60):
        """
        Polls a job until it reports the DONE state or the timeout elapses.

        Each iteration first sleeps for ``interval`` seconds and then fetches
        the job resource, so even an already finished job takes at least one
        interval to be reported.

        Args:
            job: dict, representing a BigQuery job resource (its
                 ``jobReference.jobId`` is used)
                 or str/bytes/int, representing a BigQuery job id
            interval: optional float polling interval in seconds, default = 5
            timeout: optional float timeout in seconds, default = 60
        Returns:
            dict, the last job resource fetched, as described here:
            https://developers.google.com/resources/api-libraries/documentation/bigquery/v2/python/latest/bigquery_v2.jobs.html#get
        Raises:
            BigQueryTimeoutException if the job is not DONE once more than
            ``timeout`` seconds have elapsed
            JobExecutingException presumably, via
            ``self._raise_executing_exception_if_error`` when the fetched
            resource carries an error (that helper is not visible here)
        """
        if isinstance(job, (six.binary_type, six.text_type, int)):
            raw_id = job
        else:
            raw_id = job['jobReference']['jobId']
        job_id = str(raw_id)

        began = time()
        elapsed = 0
        done = False
        latest = None

        while not done and not (elapsed > timeout):
            sleep(interval)
            latest = self.bigquery.jobs().get(
                projectId=self.project_id, jobId=job_id).execute()
            # Let the helper inspect every fetched resource before its state
            # is read, exactly once per poll.
            self._raise_executing_exception_if_error(latest)
            done = latest.get('status').get('state') == u'DONE'
            elapsed = time() - began

        if done:
            return latest

        # The loop ended because the deadline passed, not because of DONE.
        logging.error('BigQuery job %s timeout' % job_id)
        raise BigQueryTimeoutException()