def fix_filters(debug):
    """Migrate legacy rows of ``cohorts_filters`` to program-aware filters.

    The migration runs in two phases:

    1. Rename legacy filter names ('Project' -> 'program_name',
       'Study' -> 'disease_code') and strip every ``<prefix>:`` from filter
       names except for names starting with 'MUT:'.
    2. Back-fill ``program_id`` on filters where it is NULL:
       * cohorts whose samples are all CCLE get the CCLE program id,
       * cohorts with no CCLE samples get the TCGA program id,
       * cohorts containing both get TCGA on the existing filter row and,
         when the filter's name and value are both known to the CCLE
         metadata value set, a duplicate row for the CCLE program.

    :param debug: when truthy, the data-modifying statements (UPDATE/INSERT)
        are printed to stdout instead of being executed. The SELECT queries
        and the Django ORM lookups still run so the dry-run can report what
        would be changed.
    :return: None. Any exception is caught and its traceback printed to
        stdout; the cursor and connection are closed in all cases.
    """
    db = None
    cursor = None

    try:
        db = get_mysql_connection()
        cursor = db.cursor()
        db.autocommit(True)

        # NOTE: the three rename statements below are executed WITHOUT a
        # parameter tuple, so their LIKE patterns use a single '%'.

        # Project -> Program
        fix_filters_program = """
            UPDATE cohorts_filters
            SET name='program_name'
            WHERE name='Project';
        """

        # Study -> disease_code
        fix_filters_project = """
            UPDATE cohorts_filters
            SET name='disease_code'
            WHERE name='Study';
        """

        # Remove CLIN: and SAMP:, leave in MUT:
        fix_filters_names = """
            UPDATE cohorts_filters
            SET name=SUBSTR(name,LOCATE(':',name)+1)
            WHERE name LIKE '%:%' AND name NOT LIKE 'MUT:%';
        """

        if debug:
            print >> sys.stdout, "[STATUS] Executing update statement: " + fix_filters_program
            print >> sys.stdout, "[STATUS] Executing update statement: " + fix_filters_project
            print >> sys.stdout, "[STATUS] Executing update statement: " + fix_filters_names
        else:
            cursor.execute(fix_filters_program)
            cursor.execute(fix_filters_project)
            cursor.execute(fix_filters_names)

        # Cohorts containing BOTH CCLE and non-CCLE samples. Executed without
        # parameters, hence the single '%' in the LIKE patterns.
        get_cohots_tcga_and_ccle = """
            SELECT DISTINCT ccle.cohort_id
            FROM cohorts_samples ccle
            JOIN (
                SELECT DISTINCT cs.cohort_id
                FROM cohorts_samples cs
                WHERE cs.sample_barcode NOT LIKE 'CCLE%'
            ) tcga
            ON tcga.cohort_id = ccle.cohort_id
            WHERE ccle.sample_barcode LIKE 'CCLE%';
        """

        # The two statements below ARE executed with a parameter tuple, so
        # literal percent signs in the LIKE patterns are escaped as '%%'.
        fix_tcga_only_filters = """
            UPDATE cohorts_filters cf
            JOIN (
                SELECT id
                FROM cohorts_filters
                WHERE program_id IS NULL AND resulting_cohort_id IN (
                    SELECT DISTINCT tcga.cohort_id
                    FROM cohorts_samples tcga
                    LEFT JOIN (
                        SELECT DISTINCT cs.cohort_id
                        FROM cohorts_samples cs
                        WHERE cs.sample_barcode LIKE 'CCLE%%'
                    ) ccle
                    ON tcga.cohort_id = ccle.cohort_id
                    WHERE tcga.sample_barcode NOT LIKE 'CCLE%%' AND ccle.cohort_id IS NULL
                )
            ) tcga_cf
            ON tcga_cf.id = cf.id
            SET program_id = %s
            WHERE program_id IS NULL;
        """

        fix_ccle_only_filters = """
            UPDATE cohorts_filters cf
            JOIN (
                SELECT id
                FROM cohorts_filters
                WHERE program_id IS NULL AND resulting_cohort_id IN (
                    SELECT DISTINCT ccle.cohort_id
                    FROM cohorts_samples ccle
                    LEFT JOIN (
                        SELECT DISTINCT cs.cohort_id
                        FROM cohorts_samples cs
                        WHERE cs.sample_barcode NOT LIKE 'CCLE%%'
                    ) tcga
                    ON tcga.cohort_id = ccle.cohort_id
                    WHERE ccle.sample_barcode LIKE 'CCLE%%' AND tcga.cohort_id IS NULL
                )
            ) ccle_cf
            ON ccle_cf.id = cf.id
            SET program_id = %s
            WHERE program_id IS NULL;
        """

        get_filters = """
            SELECT id,name,value,resulting_cohort_id
            FROM cohorts_filters
            WHERE program_id IS NULL AND resulting_cohort_id = %s;
        """

        add_filter_program = """
            UPDATE cohorts_filters
            SET program_id = %s
            WHERE id = %s;
        """

        insert_filter = """
            INSERT INTO cohorts_filters(name,value,resulting_cohort_id,program_id)
            VALUES(%s,%s,%s,%s);
        """

        isb_userid = User.objects.get(username='******', is_staff=True, is_superuser=True, is_active=True).id
        tcga_program_id = Program.objects.get(name='TCGA', owner=isb_userid, is_public=True, active=True).id
        ccle_program_id = Program.objects.get(name='CCLE', owner=isb_userid, is_public=True, active=True).id

        ccle_attr = fetch_metadata_value_set(ccle_program_id)

        # Fix CCLE-only cohort filters
        if debug:
            print >> sys.stdout, "Executing statement: " + fix_ccle_only_filters
            print >> sys.stdout, "Values: " + str((ccle_program_id, ))
        else:
            cursor.execute(fix_ccle_only_filters, (ccle_program_id, ))

        # Fix TCGA-only cohort filters
        if debug:
            print >> sys.stdout, "Executing statement: " + fix_tcga_only_filters
            print >> sys.stdout, "Values: " + str((tcga_program_id, ))
        else:
            cursor.execute(fix_tcga_only_filters, (tcga_program_id, ))

        # Fix mixed TCGA/CCLE cohort filters
        cursor.execute(get_cohots_tcga_and_ccle)
        # fetchall() materialises the rows, so re-using the same cursor for
        # the inner queries does not disturb the outer iteration.
        for row in cursor.fetchall():
            cohort_id = row[0]
            cursor.execute(get_filters, (cohort_id, ))
            for filter_row in cursor.fetchall():
                filter_id, name, value = filter_row[0], filter_row[1], filter_row[2]

                # A filter is duplicated to CCLE only when CCLE knows both the
                # attribute AND this particular value. The same test drives the
                # dry-run output so that debug mode reports exactly what a real
                # run would do.
                duplicate_to_ccle = name in ccle_attr and value in ccle_attr[name]['values']

                if debug:
                    print >> sys.stdout, "Executing statement: " + add_filter_program
                    print >> sys.stdout, "Values: " + str((tcga_program_id, filter_id, ))
                    if duplicate_to_ccle:
                        print >> sys.stdout, name + " found in ccle_attr"
                        print >> sys.stdout, "Executing statement: " + insert_filter
                        print >> sys.stdout, "Values: " + str((name, value, cohort_id, ccle_program_id, ))
                else:
                    # First, apply TCGA as the program ID
                    cursor.execute(add_filter_program, (tcga_program_id, filter_id, ))
                    # Then, if CCLE has this attr and value, duplicate the filter to CCLE
                    if duplicate_to_ccle:
                        cursor.execute(insert_filter, (name, value, cohort_id, ccle_program_id, ))

    except Exception:
        # Best-effort maintenance script: report the failure and fall through
        # to the cleanup below rather than propagating.
        print >> sys.stdout, traceback.format_exc()
    finally:
        if cursor:
            cursor.close()
        if db and db.open:
            db.close()
def data_access_for_plot(request):
    """
    Used by the web application.

    Builds the merged feature vectors for a plot and returns them as JSON.

    GET parameters read from ``request``:
        x_id, y_id, c_id -- feature identifiers for the x axis, y axis and
            colour-by dimension; ``y_id`` and ``c_id`` are optional.
        log_transform -- optional JSON-encoded object; when it is missing or
            cannot be parsed, a warning is logged and no transform is passed on.
        cohort_id -- may be repeated; read with ``getlist``.

    :param request: the incoming HTTP request.
    :return: a ``JsonResponse`` holding one of
        * the merged, cohort-annotated plot data,
        * ``{'message': ...}`` when none of the cohorts' programs has data
          for one or more requested feature types,
        * ``{'error': ...}`` with HTTP status 500 when any exception is raised
          (including an invalid feature ID).
    """
    try:
        x_id = request.GET.get('x_id', None)
        y_id = request.GET.get('y_id', None)
        c_id = request.GET.get('c_id', None)

        # TODO Use jsonschema to validate logTransform object
        # A missing parameter and a malformed one are reported separately so
        # the log does not claim "not supplied" for bad client input.
        logTransform = None
        raw_log_transform = request.GET.get('log_transform', None)
        if raw_log_transform is None:
            logger.warning("[WARNING] Log transform parameter not supplied")
        else:
            try:
                logTransform = json.loads(raw_log_transform)
            except (TypeError, ValueError):
                logger.warning("[WARNING] Log transform parameter is not valid JSON; ignoring it")

        cohort_id_array = request.GET.getlist('cohort_id', None)

        # Check that all requested feature identifiers are valid. Do not check for y_id if it is not
        # supplied in the request.
        feature_ids_to_check = [x_id]
        if c_id is not None:
            feature_ids_to_check.append(c_id)
        if y_id is not None:
            feature_ids_to_check.append(y_id)

        valid_features = get_feature_id_validity_for_array(feature_ids_to_check)

        for feature_id, is_valid in valid_features:
            if not is_valid:
                logger.error("Invalid internal feature ID '{}'".format(feature_id))
                raise Exception('Feature Not Found')

        # Gives the user data handler a chance to map e.g. "v2:USER:343:58901" to "v2:CLIN:case_barcode"
        x_id = _feature_converter(x_id)
        y_id = _feature_converter(y_id)
        c_id = _feature_converter(c_id)

        # Get the project IDs these cohorts' samples come from
        confirmed_study_ids, user_only_study_ids = get_confirmed_project_ids_for_cohorts(cohort_id_array)

        bqss = BigQueryServiceSupport.build_from_django_settings()
        fvb = FeatureVectorBigQueryBuilder.build_from_django_settings(bqss)

        # By extracting info from the cohort, we get the NAMES of the public projects
        # we need to access (public projects have unique name tags, e.g. tcga).
        program_set = get_public_program_name_set_for_cohorts(cohort_id_array)

        # We need to do this for cohorts that contain samples found in user data projects,
        # where those projects are extension of public data. This is because the cohorts
        # only reference the user project, but if we are plotting against public data, we
        # have to know which public programs we need to look at.
        prog_set_extended = get_extended_public_program_name_set_for_user_extended_projects(confirmed_study_ids)
        program_set.update(prog_set_extended)

        # Check to see if these programs have data for the requested vectors; if not, there's no reason to plot
        features_without_program_data = []
        for requested_id in [x_id, y_id, c_id]:
            if not requested_id:
                continue
            feature_type = requested_id.split(':')[1].lower()
            plot_type = FEATURE_ID_TO_TYPE_MAP[feature_type] if feature_type in FEATURE_ID_TO_TYPE_MAP else None
            if not plot_type:
                continue
            programs = FeatureDataTypeHelper.get_supported_programs_from_data_type(plot_type)
            valid_programs = set(programs).intersection(program_set)
            if not valid_programs:
                features_without_program_data.append(
                    FeatureDataTypeHelper.get_proper_feature_type_name(plot_type))

        if features_without_program_data:
            return JsonResponse({
                'message': "The chosen cohorts do not contain programs with data for these features: {}."
                .format(", ".join(features_without_program_data))
            })

        user_programs = get_user_program_id_set_for_user_only_projects(user_only_study_ids)

        # Fix for #2381: confirmed_study_ids MUST ALWAYS contain the public dataset project IDs, because that's how we
        # enable older cohorts which didn't store project IDs (check for NULL) against ones where we did require the
        # project ID
        if len(user_programs):
            program_set.update(user_programs)
            confirmed_study_ids += user_only_study_ids

        data = get_merged_feature_vectors(fvb, x_id, y_id, c_id, cohort_id_array, logTransform,
                                          confirmed_study_ids, program_set=program_set)

        # Annotate each data point with cohort information
        add_cohort_info_to_merged_vectors(data, x_id, y_id, c_id, cohort_id_array)

        # Convert to display strings where needed (e.g. categoricals stored as indices rather than strings)
        programs_by_project = {}
        preformatted_vals = {}

        # The attribute name is the last ':'-separated component of each feature
        # ID. It does not change per item, so compute it once; axes without a
        # feature ID are skipped entirely.
        axis_attrs = [
            (axis, axis_feature_id.split(':')[-1])
            for axis, axis_feature_id in (('x', x_id), ('y', y_id), ('c', c_id))
            if axis_feature_id is not None
        ]

        for item in data['items']:
            programs = []
            for project in item['project']:
                # Fetch the program if we don't know it already
                if project not in programs_by_project:
                    programs_by_project[project] = Project.objects.get(id=project).program.id
                programs.append(programs_by_project[project])

            for program in programs:
                if program not in preformatted_vals:
                    preformatted_vals[program] = fetch_metadata_value_set(program)
                program_vals = preformatted_vals[program]

                for axis, attr in axis_attrs:
                    # Replace the stored value with its display value when this
                    # program's metadata knows both the attribute and the value.
                    if attr in program_vals and item[axis] in program_vals[attr]['values']:
                        item[axis] = program_vals[attr]['values'][item[axis]]['displ_value']

        return JsonResponse(data)

    except Exception as e:
        logger.error("[ERROR] In data access for plot: ")
        logger.exception(e)
        return JsonResponse({'error': str(e)}, status=500)