def do_query(project_id, project_name, dataset_name, table_name, gene_symbol, value_field,
             cohort_dataset, cohort_table, cohort_id_array):
    """Run the query produced by ``build_query`` and return its rows as dicts.

    Every argument except ``project_id`` is forwarded to ``build_query`` to
    produce the query text; ``project_id`` is only used as the ``projectId``
    the query request is issued under.

    Returns:
        A list with one dict per response row, keyed ``case_id``,
        ``sample_id``, ``aliquot_id`` and ``value``. The values are read from
        the first four cells of each row, in that order, and ``value`` is
        converted to ``float``. The list is empty when the response reports
        zero total rows.
    """
    service = get_bigquery_service()
    sql = build_query(project_name, dataset_name, table_name, gene_symbol, value_field,
                      cohort_dataset, cohort_table, cohort_id_array)

    jobs_api = service.jobs()
    response = jobs_api.query(projectId=project_id, body={'query': sql}).execute()

    # 'totalRows' is converted with int() before comparing; when it is zero
    # the 'rows' key is never read.
    if int(response['totalRows']) == 0:
        return []

    def to_record(row):
        # Each row keeps its cells under 'f'; each cell keeps its value under 'v'.
        return {
            'case_id': row['f'][0]['v'],
            'sample_id': row['f'][1]['v'],
            'aliquot_id': row['f'][2]['v'],
            'value': float(row['f'][3]['v']),
        }

    return [to_record(row) for row in response['rows']]
def run(
    project_id,
    dataset_id,
    table_name,
    schema_file,
    data_path,
    source_format="NEWLINE_DELIMITED_JSON",
    write_disposition="WRITE_EMPTY",
    num_retries=5,
    poll_interval=1,
):
    """Start a table load job from a JSON schema file and hand it to ``poll_job``.

    Args:
        project_id, dataset_id, table_name: Forwarded to ``load_table`` to
            identify the destination.
        schema_file: Path to a file whose contents are parsed with
            ``json.load`` and passed to ``load_table`` as the schema.
        data_path: Forwarded to ``load_table`` as the data location.
        source_format: Forwarded to ``load_table``.
        write_disposition: Forwarded to ``load_table``.
        num_retries: Forwarded to ``load_table``.
        poll_interval: Accepted for interface compatibility; this function
            does not use it.

    Returns:
        None.
    """
    # The service is obtained before the schema file is opened, so a missing
    # schema file is only reported after the service lookup has happened.
    service = get_bigquery_service()

    with open(schema_file, "r") as schema_fh:
        table_schema = json.load(schema_fh)

    # NOTE: load_table takes these positionally, with num_retries *before*
    # write_disposition -- not the order they appear in this signature.
    load_job = load_table(
        service,
        project_id,
        dataset_id,
        table_name,
        table_schema,
        data_path,
        source_format,
        num_retries,
        write_disposition,
    )

    poll_job(service, load_job)
def _streaming_insert(self, rows):
    """Send ``rows`` to the table with one ``tabledata().insertAll`` request.

    The request body is produced by ``self._build_request_body_from_rows``
    and the destination is identified by ``self.project_id``,
    ``self.dataset_id`` and ``self.table_id``.

    Returns:
        Whatever ``execute()`` returns for the insert request, unmodified.
    """
    service = get_bigquery_service()
    tabledata_api = service.tabledata()
    payload = self._build_request_body_from_rows(rows)

    insert_request = tabledata_api.insertAll(
        projectId=self.project_id,
        datasetId=self.dataset_id,
        tableId=self.table_id,
        body=payload,
    )
    return insert_request.execute()
def submit_jobs_with_user_data(params_array):
    """Submit the jobs needed for each parameter object and list what was submitted.

    For every element of ``params_array`` (each must expose ``feature_id`` and
    ``cohort_id_array``), ``user_feature_handler`` decides which jobs are
    needed:

    * if its result has a truthy ``'include_tcga'``, the item returned by
      ``submit_tcga_job`` is appended;
    * if its ``'user_studies'`` is non-empty, a ``UserFeatureProvider`` is
      created and, when it reports itself queryable for the cohorts, a dict
      with keys ``feature_id``, ``provider``, ``ready`` (always ``False``
      here) and ``job_reference`` is appended.

    Returns:
        The list of submitted job items, in submission order.
    """
    service = get_bigquery_service()
    submitted = []
    cohort_settings = settings.GET_BQ_COHORT_SETTINGS()

    for params in params_array:
        feature_id = params.feature_id
        cohort_ids = params.cohort_id_array

        user_data = user_feature_handler(feature_id, cohort_ids)

        if user_data['include_tcga']:
            submitted.append(submit_tcga_job(params, service, cohort_settings))

        # Nothing user-specific to submit for this parameter object.
        if len(user_data['user_studies']) == 0:
            continue

        converted_feature_id = user_data['converted_feature_id']
        user_feature_id = user_data['user_feature_id']
        logging.debug("user_feature_id: {0}".format(user_feature_id))
        provider = UserFeatureProvider(converted_feature_id, user_feature_id=user_feature_id)

        # A UserFeatureProvider may not produce a BigQuery query/job at all
        # for a given combination of cohort(s) and feature identifiers. Such
        # providers are left out of the returned list (presumably so that
        # whatever polls these jobs later never waits on a job that was never
        # submitted -- confirm against the caller).
        if not provider.is_queryable(cohort_ids):
            logging.debug("No UserFeatureDefs for '{0}'".format(
                converted_feature_id))
            continue

        job_reference = provider.get_data_job_reference(
            cohort_ids, cohort_settings.dataset_id, cohort_settings.table_id)

        logging.info(
            "Submitted USER {job_id}: {fid} - {cohorts}".format(
                job_id=job_reference['jobId'],
                fid=feature_id,
                cohorts=str(cohort_ids)))

        submitted.append({
            'feature_id': feature_id,
            'provider': provider,
            'ready': False,
            'job_reference': job_reference
        })

    return submitted
def do_query(self, project_id, project_name, uniprot_id):
    """Run the query from ``self.build_query`` and decode a value from its first row.

    Args:
        project_id: Used as the ``projectId`` the query request is issued under.
        project_name: Forwarded to ``self.build_query``.
        uniprot_id: Forwarded to ``self.build_query``.

    Returns:
        ``None`` when the response reports zero total rows. Otherwise the
        value of the second cell of the first row, with every single quote
        replaced by a double quote and the result parsed by ``json_loads``.
    """
    service = get_bigquery_service()
    sql = self.build_query(project_name, uniprot_id)
    response = service.jobs().query(projectId=project_id, body={'query': sql}).execute()

    if int(response['totalRows']) == 0:
        return None

    # Only the first row is consulted; the value of interest is in cell 1.
    first_row = response['rows'][0]
    raw_literal = first_row['f'][1]['v']

    # The stored text uses single quotes; swap them for double quotes before
    # handing it to the JSON parser.
    return json_loads(raw_literal.replace('\'', '"'))
def get_feature_vectors_tcga_only(params_array, poll_retry_limit=20, skip_formatting_for_plot=False):
    """Submit one TCGA job per parameter object and return the collected results.

    Args:
        params_array: Iterable of parameter objects; each is passed to
            ``submit_tcga_job``.
        poll_retry_limit: Forwarded to ``get_submitted_job_results``.
        skip_formatting_for_plot: Forwarded to ``get_submitted_job_results``.

    Returns:
        Whatever ``get_submitted_job_results`` returns for the submitted jobs.
    """
    service = get_bigquery_service()
    cohort_settings = settings.GET_BQ_COHORT_SETTINGS()

    # Submit jobs
    submitted_jobs = [
        submit_tcga_job(params, service, cohort_settings)
        for params in params_array
    ]

    return get_submitted_job_results(
        submitted_jobs,
        settings.BQ_PROJECT_ID,
        poll_retry_limit,
        skip_formatting_for_plot,
    )
def get_bq_service(self):
    """Return ``self.bigquery_service``, creating it on first use.

    When the attribute is ``None`` it is populated from
    ``get_bigquery_service()`` and stored on the instance; otherwise the
    stored value is returned as is.
    """
    if self.bigquery_service is not None:
        return self.bigquery_service

    self.bigquery_service = get_bigquery_service()
    return self.bigquery_service