Exemplo n.º 1
0
def fix_filters(debug):
    """
    Migrate legacy rows in the ``cohorts_filters`` table.

    The work is done in two phases:

    1. Rename legacy filter names: 'Project' becomes 'program_name', 'Study'
       becomes 'disease_code', and everything up to and including the first
       ':' is stripped from names containing a colon, except for names that
       start with 'MUT:'.
    2. Fill in ``program_id`` on filters where it is still NULL:
         - cohorts whose samples all have 'CCLE'-prefixed barcodes get the
           CCLE program ID;
         - cohorts with no 'CCLE'-prefixed barcodes get the TCGA program ID;
         - for cohorts containing both kinds of samples, every such filter
           is assigned to TCGA, and a copy is inserted for CCLE when the
           CCLE metadata value set contains both the filter's name and its
           value.

    :param debug: when truthy, UPDATE/INSERT statements and their values are
        printed to stdout instead of being executed. SELECT statements are
        still executed so the dry run can walk the affected rows.

    Nothing is returned. Any exception is caught and its traceback printed to
    stdout; the cursor and connection are closed in every case.
    """
    db = None
    cursor = None

    try:
        db = get_mysql_connection()
        cursor = db.cursor()

        db.autocommit(True)

        # Project -> Program
        fix_filters_program = """
            UPDATE cohorts_filters
            SET name='program_name'
            WHERE name='Project';
        """

        # Study -> disease_code
        fix_filters_project = """
            UPDATE cohorts_filters
            SET name='disease_code'
            WHERE name='Study';
        """

        # Remove CLIN: and SAMP:, leave in MUT:
        fix_filters_names = """
            UPDATE cohorts_filters
            SET name=SUBSTR(name,LOCATE(':',name)+1)
            WHERE name LIKE '%:%' AND name NOT LIKE 'MUT:%';
        """

        # These statements are run without a parameter tuple, which is why
        # their LIKE patterns use a single '%' rather than '%%'.
        for statement in (fix_filters_program, fix_filters_project, fix_filters_names):
            if debug:
                print >> sys.stdout, "[STATUS] Executing update statement: " + statement
            else:
                cursor.execute(statement)

        # Cohorts that contain both CCLE and non-CCLE samples. Executed
        # without parameters, so '%' is not doubled here either.
        get_cohorts_tcga_and_ccle = """
            SELECT DISTINCT ccle.cohort_id
            FROM cohorts_samples ccle
            JOIN (
                SELECT DISTINCT cs.cohort_id
                FROM cohorts_samples cs
                WHERE cs.sample_barcode NOT LIKE 'CCLE%'
            ) tcga
            ON tcga.cohort_id = ccle.cohort_id
            WHERE ccle.sample_barcode LIKE 'CCLE%';
        """

        # The statements below are parameterized, so literal '%' characters
        # inside them are written as '%%'.
        fix_tcga_only_filters = """
            UPDATE cohorts_filters cf
            JOIN (
                SELECT id
                FROM cohorts_filters
                WHERE program_id IS NULL AND resulting_cohort_id IN (
                    SELECT DISTINCT tcga.cohort_id
                    FROM cohorts_samples tcga
                    LEFT JOIN (
                        SELECT DISTINCT cs.cohort_id
                        FROM cohorts_samples cs
                        WHERE cs.sample_barcode LIKE 'CCLE%%'
                    ) ccle
                    ON tcga.cohort_id = ccle.cohort_id
                    WHERE tcga.sample_barcode NOT LIKE 'CCLE%%' AND ccle.cohort_id IS NULL
                )
            ) tcga_cf
            ON tcga_cf.id = cf.id
            SET program_id = %s
            WHERE program_id IS NULL;
        """

        fix_ccle_only_filters = """
            UPDATE cohorts_filters cf
            JOIN (
                SELECT id
                FROM cohorts_filters
                WHERE program_id IS NULL AND resulting_cohort_id IN (
                    SELECT DISTINCT ccle.cohort_id
                    FROM cohorts_samples ccle
                    LEFT JOIN (
                        SELECT DISTINCT cs.cohort_id
                        FROM cohorts_samples cs
                        WHERE cs.sample_barcode NOT LIKE 'CCLE%%'
                    ) tcga
                    ON tcga.cohort_id = ccle.cohort_id
                    WHERE ccle.sample_barcode LIKE 'CCLE%%' AND tcga.cohort_id IS NULL
                )
            ) ccle_cf
            ON ccle_cf.id = cf.id
            SET program_id = %s
            WHERE program_id IS NULL;
        """

        get_filters = """
            SELECT id,name,value,resulting_cohort_id
            FROM cohorts_filters
            WHERE program_id IS NULL AND resulting_cohort_id = %s;
        """

        add_filter_program = """
            UPDATE cohorts_filters
            SET program_id = %s
            WHERE id = %s;
        """

        insert_filter = """
            INSERT INTO cohorts_filters(name,value,resulting_cohort_id,program_id)
            VALUES(%s,%s,%s,%s);
        """

        isb_userid = User.objects.get(username='******',
                                      is_staff=True,
                                      is_superuser=True,
                                      is_active=True).id

        tcga_program_id = Program.objects.get(name='TCGA',
                                              owner=isb_userid,
                                              is_public=True,
                                              active=True).id
        ccle_program_id = Program.objects.get(name='CCLE',
                                              owner=isb_userid,
                                              is_public=True,
                                              active=True).id

        ccle_attr = fetch_metadata_value_set(ccle_program_id)

        # Fix CCLE-only cohort filters
        if debug:
            print >> sys.stdout, "Executing statement: " + fix_ccle_only_filters
            print >> sys.stdout, "Values: " + str((ccle_program_id, ))
        else:
            cursor.execute(fix_ccle_only_filters, (ccle_program_id, ))

        # Fix TCGA-only cohort filters
        if debug:
            print >> sys.stdout, "Executing statement: " + fix_tcga_only_filters
            print >> sys.stdout, "Values: " + str((tcga_program_id, ))
        else:
            cursor.execute(fix_tcga_only_filters, (tcga_program_id, ))

        # Fix mixed TCGA/CCLE cohort filters
        cursor.execute(get_cohorts_tcga_and_ccle)

        # fetchall() materializes the rows, so reusing the same cursor for
        # the nested queries below does not disturb the outer iteration.
        for row in cursor.fetchall():
            cohort_id = row[0]
            cursor.execute(get_filters, (cohort_id, ))

            for filter_row in cursor.fetchall():
                filter_id = filter_row[0]
                filter_name = filter_row[1]
                filter_value = filter_row[2]

                # A filter is copied to CCLE only when CCLE knows both the
                # attribute and the value. The same condition drives the
                # dry-run output so debug mode reports exactly the
                # statements a real run would execute.
                duplicate_to_ccle = (
                    filter_name in ccle_attr
                    and filter_value in ccle_attr[filter_name]['values']
                )

                tcga_values = (
                    tcga_program_id,
                    filter_id,
                )
                ccle_values = (
                    filter_name,
                    filter_value,
                    cohort_id,
                    ccle_program_id,
                )

                if debug:
                    print >> sys.stdout, "Executing statement: " + add_filter_program
                    print >> sys.stdout, "Values: " + str(tcga_values)

                    if duplicate_to_ccle:
                        print >> sys.stdout, filter_name + " found in ccle_attr"
                        print >> sys.stdout, "Executing statement: " + insert_filter
                        print >> sys.stdout, "Values: " + str(ccle_values)
                else:
                    # First, apply TCGA as the program ID
                    cursor.execute(add_filter_program, tcga_values)

                    # Then, if CCLE has this attr and value, duplicate the filter to CCLE
                    if duplicate_to_ccle:
                        cursor.execute(insert_filter, ccle_values)

    except Exception as e:
        # Best-effort maintenance routine: report the failure and fall
        # through to cleanup rather than propagating.
        print >> sys.stdout, traceback.format_exc()
    finally:
        if cursor: cursor.close()
        if db and db.open: db.close()
def data_access_for_plot(request):
    """
    Used by the web application.

    Builds the merged feature-vector data for a plot and returns it as a
    JsonResponse.

    GET parameters read from ``request``:
      - ``x_id``, ``y_id``, ``c_id``: feature identifiers; ``c_id`` and
        ``y_id`` are validated only when supplied.
      - ``log_transform``: JSON-encoded object; if it is missing or cannot
        be parsed, a warning is logged and no log transform is used.
      - ``cohort_id``: read with ``getlist``, so it may be repeated.

    Responses:
      - the merged data (with display values substituted where the program
        metadata provides them) on success;
      - ``{'message': ...}`` when none of the cohorts' programs has data for
        one or more of the requested features;
      - ``{'error': <str(exception)>}`` with HTTP status 500 when any
        exception is raised, including an invalid feature ID.
    """
    try:
        x_id = request.GET.get('x_id', None)
        y_id = request.GET.get('y_id', None)
        c_id = request.GET.get('c_id', None)

        try:
            # TODO Use jsonschema to validate logTransform object
            logTransform = json.loads(request.GET.get('log_transform', None))
        except Exception:
            # Covers both an absent parameter (json.loads(None)) and a value
            # that is not valid JSON; either way fall back to no transform.
            logger.warning("[WARNING] Log transform parameter not supplied")
            logTransform = None

        cohort_id_array = request.GET.getlist('cohort_id', None)

        # Check that all requested feature identifiers are valid. Do not check for y_id if it is not
        # supplied in the request.
        feature_ids_to_check = [x_id]
        if c_id is not None:
            feature_ids_to_check.append(c_id)
        if y_id is not None:
            feature_ids_to_check.append(y_id)

        valid_features = get_feature_id_validity_for_array(
            feature_ids_to_check)

        for feature_id, is_valid in valid_features:
            if not is_valid:
                logger.error(
                    "Invalid internal feature ID '{}'".format(feature_id))
                raise Exception('Feature Not Found')

        # Gives the user data handler a chance to map e.g. "v2:USER:343:58901" to "v2:CLIN:case_barcode"
        x_id = _feature_converter(x_id)
        y_id = _feature_converter(y_id)
        c_id = _feature_converter(c_id)

        # Get the project IDs these cohorts' samples come from
        confirmed_study_ids, user_only_study_ids = get_confirmed_project_ids_for_cohorts(
            cohort_id_array)

        bqss = BigQueryServiceSupport.build_from_django_settings()
        fvb = FeatureVectorBigQueryBuilder.build_from_django_settings(bqss)

        # By extracting info from the cohort, we get the NAMES of the public projects
        # we need to access (public projects have unique name tags, e.g. tcga).
        program_set = get_public_program_name_set_for_cohorts(cohort_id_array)

        # We need to do this for cohorts that contain samples found in user data projects,
        # where those projects are extensions of public data. This is because the cohorts
        # only reference the user project, but if we are plotting against public data, we
        # have to know which public programs we need to look at.
        prog_set_extended = get_extended_public_program_name_set_for_user_extended_projects(
            confirmed_study_ids)
        program_set.update(prog_set_extended)

        # Check to see if these programs have data for the requested vectors; if not, there's no reason to plot
        features_without_program_data = []
        for feature_id in [x_id, y_id, c_id]:
            if not feature_id:
                continue

            # The second ':'-separated field of the feature ID is its type tag
            feature_type = feature_id.split(':')[1].lower()
            plot_type = FEATURE_ID_TO_TYPE_MAP.get(feature_type)
            if not plot_type:
                continue

            programs = FeatureDataTypeHelper.get_supported_programs_from_data_type(
                plot_type)
            valid_programs = set(programs).intersection(program_set)

            if not valid_programs:
                features_without_program_data.append(
                    FeatureDataTypeHelper.get_proper_feature_type_name(
                        plot_type))

        if features_without_program_data:
            return JsonResponse({
                'message':
                "The chosen cohorts do not contain programs with data for these features: {}."
                .format(", ".join(features_without_program_data))
            })

        user_programs = get_user_program_id_set_for_user_only_projects(
            user_only_study_ids)

        # Fix for #2381: confirmed_study_ids MUST ALWAYS contain the public dataset project IDs, because that's how we
        # enable older cohorts which didn't store project IDs (check for NULL) against ones where we did require the
        # project ID
        if len(user_programs):
            program_set.update(user_programs)
            confirmed_study_ids += user_only_study_ids

        data = get_merged_feature_vectors(fvb,
                                          x_id,
                                          y_id,
                                          c_id,
                                          cohort_id_array,
                                          logTransform,
                                          confirmed_study_ids,
                                          program_set=program_set)

        # Annotate each data point with cohort information
        add_cohort_info_to_merged_vectors(data, x_id, y_id, c_id,
                                          cohort_id_array)

        # Convert to display strings where needed (e.g. categoricals stored as indices rather than strings).
        # Each supplied axis is paired with the attribute name taken from the
        # last ':'-separated field of its feature ID.
        axis_attrs = [
            (axis, axis_feature_id.split(':')[-1])
            for axis, axis_feature_id in (('x', x_id), ('y', y_id), ('c', c_id))
            if axis_feature_id is not None
        ]

        programs_by_project = {}   # project ID -> program ID (lookup cache)
        preformatted_vals = {}     # program ID -> metadata value set (lookup cache)
        for item in data['items']:
            programs = []
            for project in item['project']:
                # Fetch the program if we don't know it already
                if project not in programs_by_project:
                    programs_by_project[project] = Project.objects.get(
                        id=project).program.id
                programs.append(programs_by_project[project])

            # NOTE(review): an item is checked against every program it
            # belongs to, so a value already replaced by a display string is
            # looked up again for the next program - confirm this is intended.
            for program in programs:
                if program not in preformatted_vals:
                    preformatted_vals[program] = fetch_metadata_value_set(
                        program)
                program_vals = preformatted_vals[program]

                for axis, attr in axis_attrs:
                    if attr in program_vals and item[axis] in program_vals[attr]['values']:
                        item[axis] = program_vals[attr]['values'][item[axis]]['displ_value']

        return JsonResponse(data)

    except Exception as e:
        logger.error("[ERROR] In data access for plot: ")
        logger.exception(e)
        return JsonResponse({'error': str(e)}, status=500)