def run_query(gcp_project_id, x, y, cohort_id_array, program_array, project_id_array, cohort_table_id, log_transform):
    """Run a feature-vector merge query for the given cohorts/programs/projects.

    Validates that at least one program, cohort ID, and project ID were
    supplied (exiting via sys_exit(0) otherwise — this appears to be a
    command-line entry point rather than a web view), then builds the
    BigQuery support objects and fetches the merged feature vectors.

    Args:
        gcp_project_id: GCP project to bill the BigQuery jobs to.
        x, y: internal feature IDs for the two plot axes.
        cohort_id_array: list of cohort IDs to query.
        program_array: program identifiers; resolved via get_bq_program_set.
        project_id_array: project IDs the cohort samples belong to.
        cohort_table_id: full table ID ("project.dataset.table") of the
            cohort table.
        log_transform: log-transform settings passed through to the
            feature-vector builder.

    Returns:
        The merged feature-vector structure from get_merged_feature_vectors.
    """
    # Local imports — presumably to avoid import cycles or to keep this
    # script-style helper self-contained.
    from bq_data_access.v2.data_access import FeatureVectorBigQueryBuilder
    from bq_data_access.bigquery_cohorts import BigQueryCohortStorageSettings
    from bq_data_access.v2.plot_data_support import get_merged_feature_vectors
    from google_helpers.bigquery.service_v2 import BigQueryServiceSupport

    # Verify the program set
    # ----------------------
    program_set = get_bq_program_set(program_array)
    logger.info("Selected programs: {}".format(program_set))
    if len(program_set) == 0:
        logger.info("No programs set. Please include at least one program.")
        sys_exit(0)

    # Verify the cohort ID array
    # --------------------------
    if len(cohort_id_array) == 0:
        logger.info("No cohort IDs set. Please include at least one cohort ID.")
        sys_exit(0)

    # Verify the project ID array
    # ---------------------------
    if len(project_id_array) == 0:
        logger.info("No project IDs set. Please include at least one project ID.")
        sys_exit(0)

    cohort_settings = BigQueryCohortStorageSettings.build_from_full_table_id(cohort_table_id)
    bqss = BigQueryServiceSupport.build_from_application_default()
    fvb = FeatureVectorBigQueryBuilder(gcp_project_id, cohort_settings, bqss)

    data = get_merged_feature_vectors(fvb, x, y, None, cohort_id_array, log_transform, project_id_array,
                                      program_set=program_set)
    # Fix: the original computed `data` and then discarded it, so the function
    # always returned None. Return the merged vectors to the caller.
    return data
def oncoprint_view_data(request):
    """Django view: assemble OncoPrint plot data for a gene list and cohorts.

    Combines two data sources into tab-separated OncoPrint rows
    ("case\tgene\talteration\ttype"):
      1. somatic mutation calls, fetched via a BigQuery job built from the
         SQL template below, and
      2. copy-number (CNVR) segment means, fetched per-gene through the
         feature-vector builder and bucketed into AMP/GAIN/HETLOSS/HOMDEL.

    Query params: gene_list (comma-separated), genomic_build, cohort_id (repeatable).
    Returns JsonResponse with plot_data/gene_list/bq_tables/plot_message,
    or an error/message payload.
    """
    try:
        gene_list_str = request.GET.get('gene_list', None)
        # NOTE(review): if gene_list is absent this raises AttributeError on
        # None.split and is reported via the generic 500 handler below.
        gene_array = gene_list_str.split(',')
        genomic_build = request.GET.get('genomic_build', None)
        cohort_id_param_array = request.GET.getlist('cohort_id', None)

        if not is_valid_genomic_build(genomic_build):
            return JsonResponse({'error': 'Invalid genomic build'}, status=400)

        # Cohort IDs must all parse as integers; any bad value rejects the request.
        cohort_id_array = []
        for cohort_id in cohort_id_param_array:
            try:
                cohort_id = int(cohort_id)
                cohort_id_array.append(cohort_id)
            except Exception as e:
                return JsonResponse({'error': 'Invalid cohort parameter'}, status=400)

        if len(cohort_id_array) == 0:
            return JsonResponse({'error': 'No cohorts specified'}, status=400)

        program_set = get_program_set_for_oncoprint(cohort_id_array)
        confirmed_project_ids, user_only_study_ids = get_confirmed_project_ids_for_cohorts(cohort_id_array)

        # Only samples in projects from a data type's valid programs should be queried
        projects_this_program_set = Project.objects.filter(
            id__in=confirmed_project_ids, program__in=program_set).values_list('id', flat=True)

        if not len(program_set):
            return JsonResponse(
                {'message': "The chosen cohorts do not contain samples from programs with Gene Mutation data."})

        # Somatic-mutation query: the outer SELECT keeps every case in the
        # cohorts (LEFT JOIN), so cases without mutations still appear.
        # Alteration is a protein-change label (e.g. A123_fs) when position
        # data exists, otherwise a consequence/classification fallback; Type
        # buckets the variant into OncoPrint categories (TRUNC/MISSENSE/
        # INFRAME/etc.).
        query_template = """
            #standardSQL
            SELECT cs.case_barcode, sm.Hugo_Symbol, sm.Alteration, sm.Type
            FROM (
                SELECT case_barcode
                FROM `{cohort_table}`
                WHERE cohort_id IN ({cohort_id_list})
                AND (project_id IS NULL{project_clause})
                GROUP BY case_barcode
            ) cs
            LEFT JOIN (
                SELECT
                  case_barcode,
                  Hugo_Symbol,
                  CASE
                    WHEN Protein_position IS NOT NULL AND Protein_position NOT LIKE '-/%' THEN
                      CONCAT(
                        COALESCE(REGEXP_EXTRACT(Amino_acids,r'^([A-Za-z*\-]+)'),'-'),
                        COALESCE(REGEXP_EXTRACT(Protein_position,r'^([0-9]+)'), '-'),
                        CASE
                          WHEN Variant_Classification IN ('Frame_Shift_Del', 'Frame_Shift_Ins') OR {conseq_col} LIKE '%frameshift%' THEN '_fs'
                          WHEN Variant_Classification IN ('Splice_Site', 'Splice_Region') THEN '_splice'
                          WHEN Amino_acids LIKE '%/%' THEN REGEXP_EXTRACT(Amino_acids,r'^.*/([A-Za-z*-]+)')
                          ELSE '-'
                        END
                      )
                    ELSE
                      CASE
                        WHEN {conseq_col} LIKE '%splice_%_variant%' THEN REGEXP_EXTRACT({conseq_col},r'^(splice_[^_]+_variant)')
                        WHEN {conseq_col} LIKE '%intron_variant%' THEN 'intron_variant'
                        WHEN Variant_Classification = 'IGR' THEN 'Intergenic'
                        ELSE Variant_Classification
                      END
                  END AS Alteration,
                  CASE
                    WHEN (Amino_acids IS NOT NULL AND REGEXP_EXTRACT(Amino_acids,r'^.*/([A-Za-z*-]+)$') = '*') OR Variant_Classification IN ('Frame_Shift_Del', 'Frame_Shift_Ins', 'Splice_Site', 'Splice_Region') THEN 'TRUNC'
                    WHEN Variant_Classification = 'Nonsense_Mutation' AND {conseq_col} LIKE 'stop_gained%' THEN 'TRUNC'
                    WHEN Variant_Classification = 'Nonstop_Mutation' OR (Variant_Classification = 'Missense_Mutation' AND Variant_Type IN ('DEL','INS')) OR (Variant_Classification = 'Translation_Start_Site') THEN 'MISSENSE'
                    WHEN (Variant_Classification = 'Missense_Mutation' AND Variant_Type IN ('ONP','SNP', 'TNP')) OR (Variant_Classification IN ('In_Frame_Del','In_Frame_Ins')) OR {conseq_col} LIKE '%inframe%' THEN 'INFRAME'
                    WHEN Variant_Classification IN ("RNA","IGR", "3\'UTR","3\'Flank","5\'UTR","5\'Flank") THEN
                      CASE
                        WHEN {conseq_col} LIKE '%intergenic%' THEN 'INTERGENIC'
                        WHEN {conseq_col} LIKE '%regulatory%' THEN 'REGULATORY'
                        WHEN {conseq_col} LIKE '%miRNA%' THEN 'miRNA'
                        WHEN {conseq_col} LIKE '%transcript%' THEN 'TRANSCRIPT'
                        WHEN {conseq_col} LIKE '%downstream%' THEN 'DOWNSTREAM'
                        WHEN {conseq_col} LIKE '%upstream%' THEN 'UPSTREAM'
                        ELSE UPPER(Variant_Classification)
                      END
                    ELSE UPPER(Variant_Classification)
                  END AS Type
                FROM `{bq_data_project_id}.{dataset_name}.{table_name}`
                WHERE Variant_Classification NOT IN ('Silent') {filter_clause}
                AND case_barcode IN (
                    SELECT case_barcode
                    FROM `{cohort_table}`
                    WHERE cohort_id IN ({cohort_id_list})
                    AND (project_id IS NULL{project_clause})
                    GROUP BY case_barcode
                )
                GROUP BY case_barcode, Hugo_Symbol, Alteration, Type
                ORDER BY case_barcode
            ) sm
            ON sm.case_barcode = cs.case_barcode
            ;
        """

        # project_id IS NULL is always accepted (older cohorts stored no
        # project ID); the IN-clause is appended only when we have confirmed
        # project IDs for the selected programs.
        project_id_stmt = ""
        if projects_this_program_set and len(projects_this_program_set):
            project_id_stmt = ', '.join([str(project_id) for project_id in projects_this_program_set])
        project_clause = " OR project_id IN ({})".format(project_id_stmt) if projects_this_program_set else ""

        gene_list_stm = ''
        if gene_array is not None:
            gene_list_stm = ', '.join('\'{0}\''.format(gene) for gene in gene_array)
        filter_clause = "AND Hugo_Symbol IN ({})".format(gene_list_stm) if gene_list_stm != "" else ""

        cohort_id_list = ', '.join([str(cohort_id) for cohort_id in cohort_id_array])
        cohort_table_id = "{project_name}.{dataset_id}.{table_id}".format(
            project_name=settings.BIGQUERY_PROJECT_ID,
            dataset_id=settings.BIGQUERY_COHORT_DATASET_ID,
            table_id=settings.BIGQUERY_COHORT_TABLE_ID)

        bq_table_info = BQ_MOLECULAR_ATTR_TABLES['TCGA'][genomic_build]

        somatic_mut_query = query_template.format(
            bq_data_project_id=settings.BIGQUERY_DATA_PROJECT_ID,
            dataset_name=bq_table_info['dataset'],
            table_name=bq_table_info['table'],
            # hg38 MAF tables use 'one_consequence'; hg19 uses 'consequence'.
            conseq_col=("one_consequence" if genomic_build == "hg38" else 'consequence'),
            cohort_table=cohort_table_id,
            filter_clause=filter_clause,
            cohort_id_list=cohort_id_list,
            project_clause=project_clause
        )

        # Kick off the mutation query asynchronously; the CNVR work below
        # runs while BigQuery processes it, and we poll for completion after.
        somatic_mut_query_job = BigQuerySupport.insert_query_job(somatic_mut_query)

        plot_data = []
        genes_with_no_cnvr = []

        # Build the CNVR features
        for gene in gene_array:
            feature = build_feature_ids(
                "CNVR",
                {'value_field': 'segment_mean', 'gene_name': gene, 'genomic_build': genomic_build}
            )
            if not feature or not len(feature):
                logger.warn("[WARNING] No internal feature ID found for CNVR, gene {}, build {}.".format(gene, genomic_build))
                genes_with_no_cnvr.append(gene)
                continue
            feature = feature[0]['internal_feature_id']
            fvb = FeatureVectorBigQueryBuilder.build_from_django_settings(BigQueryServiceSupport.build_from_django_settings())
            data = get_merged_feature_vectors(fvb, feature, None, None, cohort_id_array, None,
                                              projects_this_program_set, program_set=program_set)['items']
            if data and len(data):
                for item in data:
                    # 01A are tumor samples, which is what we want
                    if item['sample_id'].split('-')[-1] == '01A':
                        seg_mean = float(item['x'])
                        # Segment means within (-0.112, 0.112) are treated as no change.
                        if seg_mean > 0.112 or seg_mean < -0.112:
                            # NOTE(review): the final fallback labels everything that is
                            # not >0.62 or <-1 as HETLOSS, including small positive
                            # values in (0.112, 0.62] — confirm these thresholds are
                            # intentional.
                            cnvr_result = "AMP" if seg_mean > 1 else "GAIN" if seg_mean > 0.62 else "HOMDEL" if seg_mean < -1 else "HETLOSS"
                            plot_data.append("{}\t{}\t{}\t{}".format(item['case_id'], gene, cnvr_result, "CNA"))

        # Poll the mutation job until done or BQ_MAX_ATTEMPTS seconds elapse.
        attempts = 0
        job_is_done = BigQuerySupport.check_job_is_done(somatic_mut_query_job)
        while attempts < settings.BQ_MAX_ATTEMPTS and not job_is_done:
            job_is_done = BigQuerySupport.check_job_is_done(somatic_mut_query_job)
            sleep(1)
            attempts += 1

        if job_is_done:
            results = BigQuerySupport.get_job_results(somatic_mut_query_job['jobReference'])
            # Only add plot_data if gene info is not missing
            if results and len(results) > 0:
                for row in results:
                    # row['f'][1] is Hugo_Symbol; NULL means the LEFT JOIN
                    # matched no mutation for this case — skip it.
                    if row['f'][1]['v']:
                        plot_data.append("{}\t{}\t{}\t{}".format(
                            str(row['f'][0]['v']), str(row['f'][1]['v']), str(row['f'][2]['v']), str(row['f'][3]['v'])))

        if len(plot_data):
            plot_message = \
                '' if not genes_with_no_cnvr \
                else "No internal feature ID found for CNVR, gene [{}], build {}."\
                .format(', '.join(genes_with_no_cnvr), genomic_build)
            return JsonResponse({
                'plot_data': plot_data,
                'gene_list': gene_array,
                'bq_tables': ["{bq_data_project_id}:{dataset_name}.{table_name}".format(
                    bq_data_project_id=settings.BIGQUERY_DATA_PROJECT_ID,
                    dataset_name=bq_table_info['dataset'],
                    table_name=bq_table_info['table'])],
                'plot_message': plot_message,
            })
        else:
            return JsonResponse(
                {'message': "The chosen genes and cohorts do not contain any samples with Gene Mutation data."})
    except Exception as e:
        logger.error("[ERROR] In oncoprint_view_data: ")
        logger.exception(e)
        return JsonResponse({'Error': str(e)}, status=500)
def data_access_for_plot(request):
    """ Used by the web application.

    v2 plot-data endpoint: validates the requested feature IDs (x/y/color),
    resolves which public programs and project IDs the chosen cohorts span,
    fetches merged feature vectors from BigQuery, annotates them with cohort
    info, and maps stored categorical indices to display strings.
    """
    try:
        logTransform = None
        ver = request.GET.get('ver', '1')
        x_id = request.GET.get('x_id', None)
        y_id = request.GET.get('y_id', None)
        c_id = request.GET.get('c_id', None)
        try:
            # TODO Use jsonschema to validate logTransform object
            logTransform = json.loads(request.GET.get('log_transform', None))
        except Exception as e:
            # Missing/unparseable log_transform is treated as "no transform".
            logger.warn("[WARNING] Log transform parameter not supplied")
            logTransform = None

        cohort_id_array = request.GET.getlist('cohort_id', None)

        # Check that all requested feature identifiers are valid. Do not check for y_id if it is not
        # supplied in the request.
        feature_ids_to_check = [x_id]
        if c_id is not None:
            feature_ids_to_check.append(c_id)
        if y_id is not None:
            feature_ids_to_check.append(y_id)

        valid_features = get_feature_id_validity_for_array(feature_ids_to_check)

        for feature_id, is_valid in valid_features:
            if not is_valid:
                logging.error("Invalid internal feature ID '{}'".format(feature_id))
                raise Exception('Feature Not Found')

        # Gives the user data handler a chance to map e.g. "v2:USER:343:58901" to "v2:CLIN:case_barcode"
        x_id = _feature_converter(x_id)
        y_id = _feature_converter(y_id)
        c_id = _feature_converter(c_id)

        # Get the project IDs these cohorts' samples come from
        confirmed_study_ids, user_only_study_ids = get_confirmed_project_ids_for_cohorts(cohort_id_array)

        bqss = BigQueryServiceSupport.build_from_django_settings()
        fvb = FeatureVectorBigQueryBuilder.build_from_django_settings(bqss)

        # By extracting info from the cohort, we get the NAMES of the public projects
        # we need to access (public projects have unique name tags, e.g. tcga).
        program_set = get_public_program_name_set_for_cohorts(cohort_id_array)

        # We need to do this for cohorts that contain samples found in user data projects,
        # where those projects are extension of public data. This is because the cohorts
        # only reference the user project, but if we are plotting against pubic data, we
        # have to know which public programs we need to look at.
        prog_set_extended = get_extended_public_program_name_set_for_user_extended_projects(confirmed_study_ids)
        program_set.update(prog_set_extended)

        # Check to see if these programs have data for the requested vectors; if not, there's no reason to plot
        features_without_program_data = []
        for id in [x_id, y_id, c_id]:
            if id:
                # Internal feature IDs look like "v2:<TYPE>:..." — the second
                # segment identifies the data type (clin, gexp, gnab, ...).
                type = id.split(':')[1].lower()
                plot_type = FEATURE_ID_TO_TYPE_MAP[type] if type in FEATURE_ID_TO_TYPE_MAP else None
                if plot_type:
                    programs = FeatureDataTypeHelper.get_supported_programs_from_data_type(plot_type)
                    valid_programs = set(programs).intersection(program_set)

                    if not len(valid_programs):
                        features_without_program_data.append(
                            FeatureDataTypeHelper.get_proper_feature_type_name(plot_type))

        if len(features_without_program_data):
            return JsonResponse({
                'message': "The chosen cohorts do not contain programs with data for these features: {}."
                .format(", ".join(features_without_program_data))
            })

        user_programs = get_user_program_id_set_for_user_only_projects(user_only_study_ids)

        # Fix for #2381: confirmed_study_ids MUST ALWAYS contain the public dataset project IDs, because that's how we
        # enable older cohorts which didn't store project IDs (check for NULL) against ones where we did require the
        # project ID
        if len(user_programs):
            program_set.update(user_programs)
            confirmed_study_ids += user_only_study_ids

        data = get_merged_feature_vectors(fvb, x_id, y_id, c_id, cohort_id_array, logTransform,
                                          confirmed_study_ids, program_set=program_set)

        # Annotate each data point with cohort information
        add_cohort_info_to_merged_vectors(data, x_id, y_id, c_id, cohort_id_array)

        # convert to display strings where needed (eg. categoricals stored as indicies rather than strings)
        programs_by_project = {}     # cache: project id -> program id
        preformatted_vals = {}       # cache: program id -> metadata value-display map
        for item in data['items']:
            programs = []
            for project in item['project']:
                # Fetch the program if we don't know it already
                if project not in programs_by_project:
                    programs_by_project[project] = Project.objects.get(id=project).program.id
                programs.append(programs_by_project[project])
            for program in programs:
                if program not in preformatted_vals:
                    preformatted_vals[program] = fetch_metadata_value_set(program)
                # For each axis: the attribute name is the last segment of the
                # feature ID; if the program defines display values for it and
                # the stored value has one, swap in the display string.
                if x_id is not None and x_id.split(':')[-1] in preformatted_vals[program] and item['x'] in \
                        preformatted_vals[program][x_id.split(':')[-1]]['values']:
                    item['x'] = preformatted_vals[program][x_id.split(':')[-1]]['values'][item['x']]['displ_value']
                if y_id is not None and y_id.split(':')[-1] in preformatted_vals[program] and item['y'] in \
                        preformatted_vals[program][y_id.split(':')[-1]]['values']:
                    item['y'] = preformatted_vals[program][y_id.split(':')[-1]]['values'][item['y']]['displ_value']
                if c_id is not None and c_id.split(':')[-1] in preformatted_vals[program] and item['c'] in \
                        preformatted_vals[program][c_id.split(':')[-1]]['values']:
                    item['c'] = preformatted_vals[program][c_id.split(':')[-1]]['values'][item['c']]['displ_value']

        return JsonResponse(data)
    except Exception as e:
        logger.error("[ERROR] In data access for plot: ")
        logger.exception(e)
        return JsonResponse({'error': str(e)}, status=500)
def seqpeek_view_data(request):
    """Django view: build SeqPeek (per-gene mutation lollipop) plot data.

    Query params: hugo_symbol, genomic_build, cohort_id (repeatable).
    Fetches the GNAB (mutation) feature vector for the gene across the
    selected cohorts and formats it for the SeqPeek client view.
    """
    try:
        hugo_symbol = request.GET.get('hugo_symbol', None)
        genomic_build = request.GET.get('genomic_build', None)
        cohort_id_param_array = request.GET.getlist('cohort_id', None)

        # Cohort IDs must all parse as integers; any bad value rejects the request.
        cohort_id_array = []
        for cohort_id in cohort_id_param_array:
            try:
                cohort_id = int(cohort_id)
                cohort_id_array.append(cohort_id)
            except Exception as e:
                return JsonResponse({'error': 'Invalid cohort parameter'}, status=400)

        if not is_valid_genomic_build(genomic_build):
            return JsonResponse({'error': 'Invalid genomic build'}, status=400)
        genomic_build = genomic_build.lower()

        if len(cohort_id_array) == 0:
            return JsonResponse({'error': 'No cohorts specified'}, status=400)

        # By extracting info from the cohort, we get the NAMES of the public projects
        # we need to access (public projects have unique name tags, e.g. tcga).
        program_set = get_public_program_name_set_for_cohorts(cohort_id_array)

        # Check to see if these programs have data for the requested vectors; if not, there's no reason to plot
        programs = FeatureDataTypeHelper.get_supported_programs_from_data_type(FEATURE_ID_TO_TYPE_MAP['gnab'])
        valid_programs = set(programs).intersection(program_set)

        if not len(valid_programs):
            return JsonResponse({
                'message': "The chosen cohorts do not contain samples from programs with Gene Mutation data."
            })

        gnab_feature_id = build_gnab_feature_id(hugo_symbol, genomic_build)
        logger.debug("GNAB feature ID for SeqPeek: {0}".format(gnab_feature_id))

        # Get the project IDs these cohorts' samples come from
        confirmed_project_ids, user_only_study_ids = get_confirmed_project_ids_for_cohorts(cohort_id_array)

        bqss = BigQueryServiceSupport.build_from_django_settings()
        fvb = FeatureVectorBigQueryBuilder.build_from_django_settings(bqss)

        # NOTE(review): this overwrites the program_set computed above (which
        # was only used for the data-availability check) with the
        # SeqPeek-specific set — confirm that is intentional.
        program_set = get_program_set_for_seqpeek_plot(cohort_id_array)

        # NOTE(review): "genomic_buid" looks like a typo for "genomic_build",
        # but SeqPeekDataQueryHandler may read this exact key — verify before
        # renaming.
        extra_provider_params = {"genomic_buid": genomic_build}

        async_params = [ProviderClassQueryDescription(SeqPeekDataQueryHandler, gnab_feature_id,
                                                      cohort_id_array, confirmed_project_ids,
                                                      program_set, extra_provider_params)]
        maf_data_result = fvb.get_feature_vectors_tcga_only(async_params, skip_formatting_for_plot=True)

        maf_data_vector = maf_data_result[gnab_feature_id]['data']

        # No mutation rows at all -> empty-data response (still lists the
        # tables that were queried).
        if len(maf_data_vector) == 0:
            return JsonResponse(
                build_empty_data_response(hugo_symbol, cohort_id_array, maf_data_result['tables_queried']))

        # NOTE(review): this condition is always true after the early return
        # above; kept as-is to preserve the original structure.
        if len(maf_data_vector) > 0:
            seqpeek_data = SeqPeekMAFDataFormatter().format_maf_vector_for_view(
                maf_data_vector, cohort_id_array, genomic_build)

            # All rows may have been filtered out during formatting.
            if len(seqpeek_data.maf_vector) == 0:
                return JsonResponse(
                    build_empty_data_response(hugo_symbol, cohort_id_array, maf_data_result['tables_queried']))

            # Since the gene (hugo_symbol) parameter is part of the GNAB feature ID,
            # it will be sanity-checked in the SeqPeekMAFDataAccess instance.
            seqpeek_maf_vector = seqpeek_data.maf_vector
            seqpeek_cohort_info = seqpeek_data.cohort_info
            removed_row_statistics_dict = seqpeek_data.removed_row_statistics

            seqpeek_view_data = SeqPeekViewDataBuilder().build_view_data(
                hugo_symbol, seqpeek_maf_vector, seqpeek_cohort_info, cohort_id_array,
                removed_row_statistics_dict, maf_data_result['tables_queried'])

            return JsonResponse(seqpeek_view_data)
    except Exception as e:
        logger.error("[ERROR] In seqpeek_view_data: ")
        logger.exception(e)
        return JsonResponse({'Error': str(e)}, status=500)
def seqpeek_view_data(request):
    """Django view (v1): build SeqPeek plot data for a gene across cohorts.

    Older variant without genomic-build handling. NOTE(review): this appears
    to be a second definition of seqpeek_view_data in the same chunk; if both
    live in one module, the later definition shadows the earlier one —
    presumably these belong to separate v1/v2 API modules.

    Query params: hugo_symbol, cohort_id (repeatable).
    Returns the SeqPeek view data, or an empty plot_data payload when no
    mutation data was found.
    """
    try:
        hugo_symbol = request.GET.get('hugo_symbol', None)
        cohort_id_param_array = request.GET.getlist('cohort_id', None)

        # Cohort IDs must all parse as integers; any bad value rejects the request.
        cohort_id_array = []
        for cohort_id in cohort_id_param_array:
            try:
                cohort_id = int(cohort_id)
                cohort_id_array.append(cohort_id)
            except Exception as e:
                return JsonResponse({'error': 'Invalid cohort parameter'}, status=400)

        if len(cohort_id_array) == 0:
            return JsonResponse({'error': 'No cohorts specified'}, status=400)

        gnab_feature_id = build_gnab_feature_id(hugo_symbol)
        logger.debug("GNAB feature ID for SeqPeek: {0}".format(gnab_feature_id))

        # Get the project IDs these cohorts' samples come from
        confirmed_project_ids = get_confirmed_project_ids_for_cohorts(cohort_id_array)

        bqss = BigQueryServiceSupport.build_from_django_settings()
        fvb = FeatureVectorBigQueryBuilder.build_from_django_settings(bqss)

        program_set = get_program_set_for_seqpeek_plot(cohort_id_array)

        async_params = [ProviderClassQueryDescription(SeqPeekDataQueryHandler, gnab_feature_id,
                                                      cohort_id_array, confirmed_project_ids, program_set)]
        maf_data_result = fvb.get_feature_vectors_tcga_only(async_params, skip_formatting_for_plot=True)

        maf_data_vector = maf_data_result[gnab_feature_id]['data']

        if len(maf_data_vector) > 0:
            # Since the gene (hugo_symbol) parameter is part of the GNAB feature ID,
            # it will be sanity-checked in the SeqPeekMAFDataAccess instance.
            seqpeek_data = SeqPeekMAFDataFormatter().format_maf_vector_for_view(maf_data_vector, cohort_id_array)

            seqpeek_maf_vector = seqpeek_data.maf_vector
            seqpeek_cohort_info = seqpeek_data.cohort_info
            removed_row_statistics_dict = seqpeek_data.removed_row_statistics

            seqpeek_view_data = SeqPeekViewDataBuilder().build_view_data(
                hugo_symbol, seqpeek_maf_vector, seqpeek_cohort_info,
                cohort_id_array, removed_row_statistics_dict)

            return JsonResponse(seqpeek_view_data)
        else:
            # No data found
            return JsonResponse({
                # The SeqPeek client side view detects data availability by checking if
                # the "plot_data" object has the "tracks" key present.
                'plot_data': {},
                'hugo_symbol': hugo_symbol,
                'cohort_id_list': [str(i) for i in cohort_id_array],
                'removed_row_statistics': []
            })
    except Exception as e:
        # Fix: `print >> sys.stdout, ...` is Python-2-only syntax and is a
        # SyntaxError under Python 3; write the traceback in a
        # version-agnostic way (same output: traceback plus newline).
        sys.stdout.write(traceback.format_exc() + "\n")
        logger.exception(e)
        return JsonResponse({'error': str(e)}, status=500)
def data_access_for_plot(request):
    """ Used by the web application.

    v1 plot-data endpoint: validates the requested x/y/color feature IDs,
    resolves the cohorts' project IDs and public program names, fetches the
    merged feature vectors, and annotates them with cohort info.

    NOTE(review): this appears to be a second definition of
    data_access_for_plot in this chunk; if both live in one module, the later
    definition shadows the earlier one — presumably these belong to separate
    v1/v2 API modules.
    """
    try:
        logTransform = None
        ver = request.GET.get('ver', '1')
        x_id = request.GET.get('x_id', None)
        y_id = request.GET.get('y_id', None)
        c_id = request.GET.get('c_id', None)
        try:
            logTransform = json.loads(request.GET.get('log_transform', None))
        except Exception as e:
            # Missing/unparseable log_transform is treated as "no transform".
            # Fix: `print >> sys.stdout, ...` is Python-2-only syntax and is a
            # SyntaxError under Python 3; emit the same message in a
            # version-agnostic way.
            sys.stdout.write("[WARNING] Log transform parameter not supplied\n")
            logger.warn("[WARNING] Log transform parameter not supplied")
            logTransform = None

        cohort_id_array = request.GET.getlist('cohort_id', None)

        # Check that all requested feature identifiers are valid. Do not check for y_id if it is not
        # supplied in the request.
        feature_ids_to_check = [x_id]
        if c_id is not None:
            feature_ids_to_check.append(c_id)
        if y_id is not None:
            feature_ids_to_check.append(y_id)

        valid_features = get_feature_id_validity_for_array(feature_ids_to_check)

        for feature_id, is_valid in valid_features:
            logging.info((feature_id, is_valid))
            if not is_valid:
                logging.error("Invalid internal feature ID '{}'".format(feature_id))
                raise Exception('Feature Not Found')

        # Get the project IDs these cohorts' samples come from
        confirmed_study_ids = get_confirmed_project_ids_for_cohorts(cohort_id_array)

        bqss = BigQueryServiceSupport.build_from_django_settings()
        fvb = FeatureVectorBigQueryBuilder.build_from_django_settings(bqss)

        program_set = get_public_program_name_set_for_cohorts(cohort_id_array)

        data = get_merged_feature_vectors(fvb, x_id, y_id, None, cohort_id_array, logTransform,
                                          confirmed_study_ids, program_set=program_set)

        # Annotate each data point with cohort information
        add_cohort_info_to_merged_vectors(data, x_id, y_id, c_id, cohort_id_array)

        return JsonResponse(data)
    except Exception as e:
        # Fix: same Python-2-only print statement replaced (see above);
        # output is unchanged — the formatted traceback plus a newline.
        sys.stdout.write(traceback.format_exc() + "\n")
        logger.exception(e)
        return JsonResponse({'error': str(e)}, status=500)