예제 #1
0
def do_query(project_id, project_name, dataset_name, table_name, gene_symbol, value_field,
             cohort_dataset, cohort_table, cohort_id_array):
    """Run a synchronous BigQuery query for per-sample values of a gene.

    Builds the SQL via build_query(), submits it under *project_id*, and
    returns a list of dicts with keys 'case_id', 'sample_id', 'aliquot_id'
    and 'value' (float). Returns an empty list when the query yields no rows.
    """
    bigquery_service = get_bigquery_service()

    query = build_query(project_name, dataset_name, table_name, gene_symbol, value_field,
                        cohort_dataset, cohort_table, cohort_id_array)
    query_body = {
        'query': query
    }

    table_data = bigquery_service.jobs()
    query_response = table_data.query(projectId=project_id, body=query_body).execute()

    if int(query_response['totalRows']) == 0:
        return []

    # The jobs.query response omits the 'rows' key entirely when there are no
    # results, so fall back to an empty list defensively. Row fields are
    # positional: case, sample, aliquot, value.
    return [{
        'case_id': row['f'][0]['v'],
        'sample_id': row['f'][1]['v'],
        'aliquot_id': row['f'][2]['v'],
        'value': float(row['f'][3]['v'])
    } for row in query_response.get('rows', [])]
예제 #2
0
def run(
    project_id,
    dataset_id,
    table_name,
    schema_file,
    data_path,
    source_format="NEWLINE_DELIMITED_JSON",
    write_disposition="WRITE_EMPTY",
    num_retries=5,
    poll_interval=1,
):
    """Load *data_path* into the given BigQuery table and wait for the job.

    The table schema is read from the JSON file *schema_file* and passed to
    load_table(); poll_job() then blocks until the load job completes.

    NOTE(review): *poll_interval* is accepted but never used in this body —
    confirm whether poll_job() is expected to receive it.
    """
    service = get_bigquery_service()

    with open(schema_file, "r") as schema_fp:
        table_schema = json.load(schema_fp)

    load_job = load_table(
        service,
        project_id,
        dataset_id,
        table_name,
        table_schema,
        data_path,
        source_format,
        num_retries,
        write_disposition,
    )

    poll_job(service, load_job)
예제 #3
0
    def _streaming_insert(self, rows):
        """Stream *rows* into this table via BigQuery's tabledata.insertAll.

        Returns the raw insertAll API response.
        """
        service = get_bigquery_service()

        request = service.tabledata().insertAll(
            projectId=self.project_id,
            datasetId=self.dataset_id,
            tableId=self.table_id,
            body=self._build_request_body_from_rows(rows))

        return request.execute()
def submit_jobs_with_user_data(params_array):
    """Submit BigQuery feature-data jobs for each parameter object.

    For every entry in *params_array*: submits a TCGA job when the feature
    includes TCGA data, and submits a user-data job when the feature maps to
    queryable user studies. Returns a list of provider/job descriptors for
    the caller's polling loop.
    """
    bigquery_service = get_bigquery_service()
    provider_array = []

    cohort_settings = settings.GET_BQ_COHORT_SETTINGS()

    # Submit jobs
    for parameter_object in params_array:
        feature_id = parameter_object.feature_id
        cohort_id_array = parameter_object.cohort_id_array

        user_data = user_feature_handler(feature_id, cohort_id_array)

        if user_data['include_tcga']:
            job_item = submit_tcga_job(parameter_object, bigquery_service,
                                       cohort_settings)
            provider_array.append(job_item)

        # Truthiness check instead of len(...) > 0 (PEP 8 idiom).
        if user_data['user_studies']:
            converted_feature_id = user_data['converted_feature_id']
            user_feature_id = user_data['user_feature_id']
            # Lazy %-style args avoid formatting when DEBUG is disabled.
            logging.debug("user_feature_id: %s", user_feature_id)
            provider = UserFeatureProvider(converted_feature_id,
                                           user_feature_id=user_feature_id)

            # The UserFeatureProvider instance might not generate a BigQuery
            # query and job at all given the combination of cohort(s) and
            # feature identifiers. The provider is not added to the array, and
            # therefore to the polling loop below, if it would not submit a
            # BigQuery job.
            if provider.is_queryable(cohort_id_array):
                job_reference = provider.get_data_job_reference(
                    cohort_id_array, cohort_settings.dataset_id,
                    cohort_settings.table_id)

                logging.info(
                    "Submitted USER %s: %s - %s",
                    job_reference['jobId'], feature_id, str(cohort_id_array))
                provider_array.append({
                    'feature_id': feature_id,
                    'provider': provider,
                    'ready': False,
                    'job_reference': job_reference
                })
            else:
                logging.debug("No UserFeatureDefs for '%s'",
                              converted_feature_id)

    return provider_array
예제 #5
0
    def do_query(self, project_id, project_name, uniprot_id):
        """Query BigQuery for the InterPro literal of *uniprot_id*.

        Returns the parsed InterPro structure from the first result row, or
        None when the query matches nothing.
        """
        service = get_bigquery_service()

        response = service.jobs().query(
            projectId=project_id,
            body={'query': self.build_query(project_name, uniprot_id)}
        ).execute()

        if int(response['totalRows']) == 0:
            return None

        first_row = response['rows'][0]
        # The stored literal uses single quotes; normalize to valid JSON
        # before parsing.
        literal = first_row['f'][1]['v'].replace('\'', '"')
        return json_loads(literal)
예제 #6
0
def get_feature_vectors_tcga_only(params_array,
                                  poll_retry_limit=20,
                                  skip_formatting_for_plot=False):
    """Submit one TCGA BigQuery job per parameter object and collect results.

    Returns whatever get_submitted_job_results() produces after polling all
    submitted jobs (up to *poll_retry_limit* attempts).
    """
    service = get_bigquery_service()
    cohort_settings = settings.GET_BQ_COHORT_SETTINGS()

    # One submitted job descriptor per feature/cohort parameter set.
    provider_array = [
        submit_tcga_job(parameter_object, service, cohort_settings)
        for parameter_object in params_array
    ]

    return get_submitted_job_results(provider_array,
                                     settings.BQ_PROJECT_ID,
                                     poll_retry_limit,
                                     skip_formatting_for_plot)
    def get_bq_service(self):
        """Return the memoized BigQuery service client, building it lazily.

        The client is created on first call and cached on the instance; all
        subsequent calls return the cached object.
        """
        service = self.bigquery_service
        if service is None:
            service = get_bigquery_service()
            self.bigquery_service = service
        return service