def do_query(project_id, project_name, dataset_name, table_name, gene_label, value_field,
             cohort_dataset, cohort_table, cohort_id_array):
    bigquery_service = get_bigquery_service()
    query = build_query(project_name, dataset_name, table_name, gene_label, value_field,
                        cohort_dataset, cohort_table, cohort_id_array)
    query_body = {
        'query': query
    }
    table_data = bigquery_service.jobs()
    query_response = table_data.query(projectId=project_id, body=query_body).execute()

    result = []
    num_result_rows = int(query_response['totalRows'])
    if num_result_rows == 0:
        return result

    # Each BigQuery row carries two (sample, aliquot) pairs that share the same
    # case and value; emit one result entry per pair.
    for row in query_response['rows']:
        result.append({
            'case_id': row['f'][0]['v'],
            'sample_id': row['f'][1]['v'],
            'aliquot_id': row['f'][2]['v'],
            'value': row['f'][5]['v'],
        })
        result.append({
            'case_id': row['f'][0]['v'],
            'sample_id': row['f'][3]['v'],
            'aliquot_id': row['f'][4]['v'],
            'value': row['f'][5]['v'],
        })

    return result
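# A hypothetical call sketch for do_query (every identifier below is an
# illustrative placeholder, not a value from this codebase):
#
#   rows = do_query('my-project', 'my-project', 'tcga_data', 'mrna_expr', 'EGFR',
#                   'normalized_count', 'cohort_dataset', 'cohorts', [101, 102])
#
# Because each BigQuery row fans out into two entries, len(rows) is twice
# query_response['totalRows'].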
def _streaming_insert(self, rows):
    bigquery_service = get_bigquery_service()
    table_data = bigquery_service.tabledata()

    response = None
    index = 0
    logger.info("[STATUS] Beginning row stream...")
    # Insert rows in batches of at most MAX_INSERT to stay under the insertAll
    # payload limits. Note only the final batch's response is returned.
    while index < len(rows):
        body = self._build_request_body_from_rows(rows[index:index + MAX_INSERT])
        response = table_data.insertAll(projectId=self.project_id,
                                        datasetId=self.dataset_id,
                                        tableId=self.table_id,
                                        body=body).execute()
        index += MAX_INSERT
    logger.info("[STATUS] ...done.")

    return response
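# A minimal sketch (an assumption, not the actual helper) of the request body that
# _build_request_body_from_rows is expected to produce: the tabledata().insertAll()
# API takes {'rows': [{'json': <record dict>, 'insertId': <optional dedup key>}]}.
def _example_insert_all_body(rows):
    # rows: a list of plain dicts, one per table row
    return {
        'rows': [{'json': row} for row in rows]
    }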
def submit_jobs_with_user_data(params_array):
    bigquery_service = get_bigquery_service()
    provider_array = []

    cohort_settings = settings.GET_BQ_COHORT_SETTINGS()

    # Submit jobs
    for parameter_object in params_array:
        feature_id = parameter_object.feature_id
        cohort_id_array = parameter_object.cohort_id_array

        user_data = user_feature_handler(feature_id, cohort_id_array)

        if user_data['include_tcga']:
            job_item = submit_tcga_job(parameter_object, bigquery_service, cohort_settings)
            provider_array.append(job_item)

        if len(user_data['user_studies']) > 0:
            converted_feature_id = user_data['converted_feature_id']
            user_feature_id = user_data['user_feature_id']
            logging.debug("user_feature_id: {0}".format(user_feature_id))

            provider = UserDataQueryHandler(converted_feature_id, user_feature_id=user_feature_id)

            # The UserDataQueryHandler instance might not generate a BigQuery query and job at all,
            # given the combination of cohort(s) and feature identifiers. The provider is not added
            # to the array, and therefore to the polling loop below, if it would not submit a
            # BigQuery job.
            if provider.is_queryable(cohort_id_array):
                job_reference = provider.get_data_job_reference(cohort_id_array,
                                                                cohort_settings.dataset_id,
                                                                cohort_settings.table_id)

                logging.info("Submitted USER {job_id}: {fid} - {cohorts}".format(
                    job_id=job_reference['jobId'],
                    fid=feature_id,
                    cohorts=str(cohort_id_array)))

                provider_array.append({
                    'feature_id': feature_id,
                    'provider': provider,
                    'ready': False,
                    'job_reference': job_reference['job_reference'],
                    'tables_used': job_reference['tables_queried']
                })
            else:
                logging.debug("No UserFeatureDefs for '{0}'".format(converted_feature_id))

    return provider_array
def __init__(self, project_id, dataset_id, table_id, executing_project=None, table_schema=None):
    # Project which will execute any jobs run by this class
    self.executing_project = executing_project or settings.BIGQUERY_PROJECT_ID
    # Destination project
    self.project_id = project_id
    # Destination dataset
    self.dataset_id = dataset_id
    # Destination table
    self.table_id = table_id
    self.bq_service = get_bigquery_service()
    self.table_schema = table_schema
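# A hypothetical instantiation sketch; the enclosing class is not named in this
# excerpt, so 'BigQueryExportSupport' and the IDs below are illustrative stand-ins:
#
#   support = BigQueryExportSupport(project_id='dest-project',
#                                   dataset_id='dest_dataset',
#                                   table_id='dest_table')
#
# Passing executing_project lets jobs run under a project other than the
# settings.BIGQUERY_PROJECT_ID default while still writing to the destination table.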
def get_feature_vectors_tcga_only(params_array, poll_retry_limit=20, skip_formatting_for_plot=False):
    bigquery_service = get_bigquery_service()
    provider_array = []

    cohort_settings = settings.GET_BQ_COHORT_SETTINGS()

    # Submit jobs
    for parameter_object in params_array:
        job_item = submit_tcga_job(parameter_object, bigquery_service, cohort_settings)
        provider_array.append(job_item)

    project_id = settings.BQ_PROJECT_ID
    result = get_submitted_job_results(provider_array, project_id, poll_retry_limit,
                                       skip_formatting_for_plot)
    return result
def run_with_schema_object(project_id, dataset_id, table_name, schema, data_path,
                           source_format='NEWLINE_DELIMITED_JSON',
                           write_disposition='WRITE_EMPTY',
                           num_retries=5):
    bigquery_service = get_bigquery_service()

    job = load_table(bigquery_service, project_id, dataset_id, table_name, schema,
                     data_path, source_format, num_retries, write_disposition)

    poll_job(bigquery_service, job)
def run(project_id, dataset_id, table_name, schema_file, data_path,
        source_format='NEWLINE_DELIMITED_JSON',
        write_disposition='WRITE_EMPTY',
        num_retries=5,
        poll_interval=1):
    bigquery_service = get_bigquery_service()

    with open(schema_file, 'r') as f:
        schema = json.load(f)

    job = load_table(bigquery_service, project_id, dataset_id, table_name, schema,
                     data_path, source_format, num_retries, write_disposition)

    poll_job(bigquery_service, job)
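# A hypothetical invocation sketch for run(); paths and IDs are placeholders. The
# schema file is expected to hold a standard BigQuery JSON schema, e.g.:
#
#   [{"name": "case_id", "type": "STRING", "mode": "REQUIRED"},
#    {"name": "value",   "type": "FLOAT",  "mode": "NULLABLE"}]
#
#   run('my-project', 'my_dataset', 'my_table',
#       schema_file='schema.json',
#       data_path='gs://my-bucket/rows.json')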
def do_query(self, project_id, project_name, uniprot_id):
    bigquery_service = get_bigquery_service()

    query = self.build_query(project_name, uniprot_id)
    query_body = {
        'query': query
    }
    table_data = bigquery_service.jobs()
    query_response = table_data.query(projectId=project_id, body=query_body).execute()

    num_result_rows = int(query_response['totalRows'])
    if num_result_rows == 0:
        return None

    row = query_response['rows'][0]
    interpro_literal = row['f'][1]['v']
    # The stored literal uses Python-style single quotes; requote it so it parses as JSON.
    interpro_literal = interpro_literal.replace('\'', '"')
    interpro_literal = json_loads(interpro_literal)

    return interpro_literal
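# A runnable illustration of the quote-normalization step above, assuming json_loads
# is the standard json.loads (the alias is presumably defined elsewhere in this module):
import json

def _parse_python_style_literal(literal):
    # "{'a': 1}" is not valid JSON until single quotes become double quotes; note
    # this simple replace would corrupt values that themselves contain quotes.
    return json.loads(literal.replace('\'', '"'))

# _parse_python_style_literal("{'uniprot_id': 'P00533'}") -> {'uniprot_id': 'P00533'}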
def get_bq_service(self):
    # Lazily build and cache the BigQuery service client
    if self.bigquery_service is None:
        self.bigquery_service = get_bigquery_service()

    return self.bigquery_service
def _table_to_gcs(self, file_format, dataset_and_table, export_type, table_job_id=None):
    bq_service = get_bigquery_service()

    result = {'status': None, 'message': None}

    # The presence of a table_job_id means the export query was still running when this
    # method was called; give it another round of checks.
    if table_job_id:
        job_is_done = bq_service.jobs().get(projectId=settings.BIGQUERY_PROJECT_ID,
                                            jobId=table_job_id).execute()
        retries = 0
        while (job_is_done and job_is_done['status']['state'] != 'DONE') and retries < BQ_ATTEMPT_MAX:
            retries += 1
            sleep(1)
            job_is_done = bq_service.jobs().get(projectId=settings.BIGQUERY_PROJECT_ID,
                                                jobId=table_job_id).execute()

        if job_is_done and job_is_done['status']['state'] != 'DONE':
            logger.debug(str(job_is_done))
            msg = "Export of {} to gs://{}/{} did not complete in the time allowed".format(
                export_type, self.bucket_path, self.file_name)
            logger.error("[ERROR] {}.".format(msg))
            result['status'] = 'error'
            result['message'] = msg + "--please contact the administrator."
            return result
        else:
            dataset_and_table = {
                'dataset_id': job_is_done['configuration']['query']['destinationTable']['datasetId'],
                'table_id': job_is_done['configuration']['query']['destinationTable']['tableId']
            }

    job_id = str(uuid4())

    export_config = {
        'jobReference': {
            'projectId': self.project_id,
            'jobId': job_id
        },
        'configuration': {
            'extract': {
                'sourceTable': {
                    'projectId': self.project_id,
                    'datasetId': dataset_and_table['dataset_id'],
                    'tableId': dataset_and_table['table_id']
                },
                'destinationUris': ['gs://{}/{}'.format(self.bucket_path, self.file_name)],
                'destinationFormat': file_format,
                'compression': 'GZIP'
            }
        }
    }

    export_job = bq_service.jobs().insert(projectId=settings.BIGQUERY_PROJECT_ID,
                                          body=export_config).execute(num_retries=5)

    job_is_done = bq_service.jobs().get(projectId=settings.BIGQUERY_PROJECT_ID,
                                        jobId=job_id).execute()

    retries = 0
    while (job_is_done and job_is_done['status']['state'] != 'DONE') and retries < BQ_ATTEMPT_MAX:
        retries += 1
        sleep(1)
        job_is_done = bq_service.jobs().get(projectId=settings.BIGQUERY_PROJECT_ID,
                                            jobId=job_id).execute()

    logger.debug("[STATUS] extraction job_is_done: {}".format(str(job_is_done)))

    if job_is_done and job_is_done['status']['state'] == 'DONE':
        if 'status' in job_is_done and 'errors' in job_is_done['status']:
            msg = "Export of {} to GCS bucket {} was unsuccessful, reason: {}".format(
                export_type, self.bucket, job_is_done['status']['errors'][0]['message'])
            logger.error("[ERROR] {}".format(msg))
            result['status'] = 'error'
            result['message'] = "Unable to export {} to bucket {}--please contact the administrator.".format(
                export_type, self.bucket)
        else:
            # Check the file
            exported_file = get_storage_resource().objects().get(bucket=self.bucket_path,
                                                                 object=self.file_name).execute()
            if not exported_file:
                msg = "Export file {}/{} not found".format(self.bucket_path, self.file_name)
                logger.error("[ERROR] {}".format(msg))
                export_result = bq_service.jobs().get(projectId=settings.BIGQUERY_PROJECT_ID,
                                                      jobId=job_id).execute()
                if 'errors' in export_result:
                    logger.error('[ERROR] Errors seen: {}'.format(export_result['errors'][0]['message']))
                result['status'] = 'error'
                result['message'] = "Unable to export {} to file {}/{}--please contact the administrator.".format(
                    export_type, self.bucket_path, self.file_name)
            else:
                if int(exported_file['size']) > 0:
                    logger.info("[STATUS] Successfully exported {} into GCS file gs://{}/{}".format(
                        export_type, self.bucket_path, self.file_name))
                    result['status'] = 'success'
                    result['message'] = "{}MB".format(
                        str(round((float(exported_file['size']) / 1000000), 2)))
                else:
                    msg = "File gs://{}/{} created, but appears empty. Export of {} may not have succeeded".format(
                        self.bucket_path, self.file_name, export_type)
                    logger.warn("[WARNING] {}.".format(msg))
                    result['status'] = 'error'
                    result['message'] = msg + "--please contact the administrator."
    else:
        logger.debug(str(job_is_done))
        msg = "Export of {} to gs://{}/{} did not complete in the time allowed".format(
            export_type, self.bucket_path, self.file_name)
        logger.error("[ERROR] {}.".format(msg))
        result['status'] = 'error'
        result['message'] = msg + "--please contact the administrator."

    return result
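# The two polling loops in _table_to_gcs share one shape; a sketch of how they could
# be factored out (illustrative only, not part of the original module):
def _poll_job_until_done(bq_service, job_id, max_attempts=BQ_ATTEMPT_MAX):
    job = bq_service.jobs().get(projectId=settings.BIGQUERY_PROJECT_ID,
                                jobId=job_id).execute()
    attempts = 0
    # Re-fetch the job status once per second until it reports DONE or the
    # attempt budget is exhausted; the caller inspects the returned job dict.
    while job and job['status']['state'] != 'DONE' and attempts < max_attempts:
        attempts += 1
        sleep(1)
        job = bq_service.jobs().get(projectId=settings.BIGQUERY_PROJECT_ID,
                                    jobId=job_id).execute()
    return job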