예제 #1
0
def read_V2_data(data_file, learn_options=None, verbose=True):
    """Load the V2 screen workbook and assemble feature and target frames.

    Reads the "ResultsFiltered" sheet of ``data_file``, stacks the rows of
    every (drug, gene) pair listed in ``drugs_to_genes`` and appends the drug
    name to the row index. The per-row score is taken from the column named
    after the drug. Rank-style transforms of the score are computed by
    ``util.get_ranks`` per (gene, drug) pair and per drug, and merged into
    the target frame.

    Args:
        data_file: Path of the Excel workbook. ``None`` selects
            "../data/11-15-2014 DeepXPR results_processed.xlsx".
        learn_options: Optional dict. When given, the keys 'extra pairs',
            'all pairs' and 'weighted' are read. 'extra pairs' / 'all pairs'
            add further genes to the per-drug gene lists;
            ``learn_options['weighted'] == 'variance'`` additionally reads
            the "Normalized" sheet and merges a per-row replicate variance
            into the target frame.
        verbose: When true, print one progress line per loaded gene.

    Returns:
        Tuple ``(Xdf, drugs_to_genes, target_genes, Y, gene_position)``:
        ``Xdf`` is the stacked sheet rows without the 'score' and 'test'
        columns; ``drugs_to_genes`` maps each drug to the genes that were
        loaded for it; ``target_genes`` holds the unique 'Target gene'
        values; ``Y`` holds 'score', 'Target gene', 'test', the columns
        produced by ``util.get_ranks`` and, when requested, 'variance';
        ``gene_position`` is the output of ``util.impute_gene_position`` for
        the "Percent Peptide" and "Amino Acid Cut position" columns.

    Raises:
        AssertionError: If 'extra pairs' and 'all pairs' are both set, or if
            the final indices of ``Xdf`` and ``Y`` differ.
    """
    if data_file is None:
        data_file = "../data/11-15-2014 DeepXPR results_processed.xlsx"

    # NOTE(review): `sheetname` is the keyword spelling of the pandas version
    # this code was written against; newer pandas names it `sheet_name` --
    # confirm against the installed version before upgrading.
    data = pandas.read_excel(data_file,
                             sheetname="ResultsFiltered",
                             skiprows=range(0, 6 + 1),
                             index_col=[0, 4])
    # grab data relevant to each of three drugs, which excludes some genes
    # note gene MED12 has two drugs, all others have at most one
    Xdf = pandas.DataFrame()

    # This comes from the "Pairs" tab in their excel sheet,
    # note HPRT/HPRT1 are same thing, and also PLX_2uM/PLcX_2uM
    known_pairs = {
        'AZD_200nM': ['CCDC101', 'MED12', 'TADA2B', 'TADA1'],
        '6TG_2ug/mL': ['HPRT1'],
        'PLX_2uM': ['CUL3', 'NF1', 'NF2', 'MED12']
    }

    # Starts out equal to known_pairs but is a separate set of lists, because
    # the options below extend it in place while known_pairs must stay fixed.
    drugs_to_genes = {
        'AZD_200nM': ['CCDC101', 'MED12', 'TADA2B', 'TADA1'],
        '6TG_2ug/mL': ['HPRT1'],
        'PLX_2uM': ['CUL3', 'NF1', 'NF2', 'MED12']
    }

    if learn_options is not None:
        assert not (
            learn_options['extra pairs'] and learn_options['all pairs']
        ), "extra pairs and all pairs options (in learn_options) can't be active simultaneously."

        if learn_options['extra pairs']:
            drugs_to_genes['AZD_200nM'].extend(['CUL3', 'NF1', 'NF2'])
        elif learn_options['all pairs']:
            drugs_to_genes['AZD_200nM'].extend(['HPRT1', 'CUL3', 'NF1', 'NF2'])
            drugs_to_genes['PLX_2uM'].extend(
                ['HPRT1', 'CCDC101', 'TADA2B', 'TADA1'])
            drugs_to_genes['6TG_2ug/mL'].extend(
                ['CCDC101', 'MED12', 'TADA2B', 'TADA1', 'CUL3', 'NF1', 'NF2'])

    count = 0
    for drug, genes in drugs_to_genes.items():
        for g in genes:
            # Copy only the selected rows (not the whole sheet) so the column
            # assignments below never write back into `data`.
            Xtmp = data.xs(g, level='Target gene', drop_level=False).copy()
            Xtmp['drug'] = drug
            # grab the drug results that are relevant for this gene
            Xtmp['score'] = Xtmp[drug].copy()

            # 'test' is 1 for pairs listed in known_pairs, 0 for pairs that
            # were only added through learn_options.
            Xtmp['test'] = 1. if g in known_pairs[drug] else 0.

            count = count + Xtmp.shape[0]
            Xdf = pandas.concat([Xdf, Xtmp], axis=0)
            if verbose:
                # Single parenthesised argument: prints the same text under
                # both the Python 2 print statement and the print function.
                print("Loaded %d samples for gene %s \ttotal number of samples: %d" % (
                    Xtmp.shape[0], g, count))

    # create new index that includes the drug
    Xdf = Xdf.set_index('drug', append=True)

    Y = pandas.DataFrame(Xdf.pop("score"))
    Y.columns.names = ["score"]

    test_gene = pandas.DataFrame(Xdf.pop('test'))
    target = pandas.DataFrame(Xdf.index.get_level_values('Target gene').values,
                              index=Y.index,
                              columns=["Target gene"])
    Y = pandas.concat((Y, target, test_gene), axis=1)
    target_genes = Y['Target gene'].unique()
    gene_position = Xdf[["Percent Peptide", "Amino Acid Cut position"]].copy()

    # convert to ranks for each (gene, drug combo)
    y_rank = pandas.DataFrame()
    y_threshold = pandas.DataFrame()
    y_quant = pandas.DataFrame()
    for drug, gene_list in drugs_to_genes.items():
        for gene in gene_list:
            ytmp = pandas.DataFrame(
                Y.xs((gene, drug),
                     level=["Target gene", "drug"],
                     drop_level=False)['score'])
            # The second return value of get_ranks is not used here.
            y_ranktmp, _, y_thresholdtmp, y_quanttmp = util.get_ranks(
                ytmp, thresh=0.8, prefix="score_drug_gene", flip=False)
            y_rank = pandas.concat((y_rank, y_ranktmp), axis=0)
            y_threshold = pandas.concat((y_threshold, y_thresholdtmp), axis=0)
            y_quant = pandas.concat((y_quant, y_quanttmp), axis=0)

    yall = pandas.concat((y_rank, y_threshold, y_quant), axis=1)
    Y = pandas.merge(Y, yall, how='inner', left_index=True, right_index=True)

    # convert also by drug only, irrespective of gene
    y_rank = pandas.DataFrame()
    y_threshold = pandas.DataFrame()
    y_quant = pandas.DataFrame()
    for drug in drugs_to_genes:
        ytmp = pandas.DataFrame(
            Y.xs(drug, level="drug", drop_level=False)['score'])
        y_ranktmp, _, y_thresholdtmp, y_quanttmp = util.get_ranks(
            ytmp, thresh=0.8, prefix="score_drug", flip=False)
        y_rank = pandas.concat((y_rank, y_ranktmp), axis=0)
        y_threshold = pandas.concat((y_threshold, y_thresholdtmp), axis=0)
        y_quant = pandas.concat((y_quant, y_quanttmp), axis=0)

    yall = pandas.concat((y_rank, y_threshold, y_quant), axis=1)
    Y = pandas.merge(Y, yall, how='inner', left_index=True, right_index=True)

    # Debugging aid, disabled by default: scatter each target against the
    # 'sgRNA Score' column.
    PLOT = False
    if PLOT:
        # to better understand, try plotting something like:
        labels = [
            "score", "score_drug_gene_rank", "score_drug_rank",
            "score_drug_gene_threshold", "score_drug_threshold"
        ]

        for label in labels:
            plt.figure()
            plt.plot(Xdf['sgRNA Score'].values, Y[label].values, '.')
            r, pearp = sp.stats.pearsonr(Xdf['sgRNA Score'].values.flatten(),
                                         Y[label].values.flatten())
            plt.title(label + ' VS pred. score, $r$=%0.2f (p=%0.2e)' %
                      (r, pearp))
            plt.xlabel("sgRNA prediction score")
            plt.ylabel(label)

    gene_position = util.impute_gene_position(gene_position)

    if learn_options is not None and learn_options["weighted"] == "variance":
        print("computing weights from replicate variance...")
        # compute the variance across replicates so can use it as a weight
        data = pandas.read_excel(data_file,
                                 sheetname="Normalized",
                                 skiprows=range(0, 6 + 1),
                                 index_col=[0, 4])
        data.index.names = ["Sequence", "Target gene"]

        # Replicate columns of the "Normalized" sheet for each drug.
        # NOTE(review): 'Deep 29 ' carries a trailing space; presumably it
        # mirrors the spreadsheet header -- verify before "fixing" it.
        experiments = {
            'AZD_200nM': ['Deep 25', 'Deep 27', 'Deep 29 ', 'Deep 31'],
            '6TG_2ug/mL': ['Deep 33', 'Deep 35', 'Deep 37', 'Deep 39'],
            'PLX_2uM': ['Deep 49', 'Deep 51', 'Deep 53', 'Deep 55'],
        }

        variance = None
        for drug, genes in drugs_to_genes.items():
            in_drug = data.index.get_level_values('Target gene').isin(genes)
            # .copy() so the 'drug' column is added to an independent frame
            # rather than to a chained slice of `data`.
            data_tmp = data.iloc[in_drug][experiments[drug]].copy()
            data_tmp["drug"] = drug
            data_tmp = data_tmp.set_index('drug', append=True)
            # After set_index only the replicate columns remain, so this is
            # the per-row variance across replicates.
            data_tmp["variance"] = np.var(data_tmp.values, axis=1)
            if variance is None:
                variance = data_tmp["variance"].copy()
            else:
                variance = pandas.concat((variance, data_tmp["variance"]),
                                         axis=0)

        orig_index = Y.index.copy()
        Y = pandas.merge(Y,
                         pandas.DataFrame(variance),
                         how="inner",
                         left_index=True,
                         right_index=True)
        # Put the rows back into their pre-merge order so Y stays aligned
        # with Xdf; .loc is the label-based replacement for the removed .ix.
        Y = Y.loc[orig_index]
        print("done.")

    # Make sure to keep this check last in this function
    assert Xdf.index.equals(
        Y.index
    ), "The index of Xdf is different from the index of Y (this can cause inconsistencies/random performance later on)"

    return Xdf, drugs_to_genes, target_genes, Y, gene_position
예제 #2
0
def read_V2_data(data_file, learn_options=None, verbose=True):
    """Load the V2 screen workbook and assemble feature and target frames.

    Reads the "ResultsFiltered" sheet of ``data_file``, stacks the rows of
    every (drug, gene) pair listed in ``drugs_to_genes`` and appends the drug
    name to the row index. The per-row score is taken from the column named
    after the drug. Rank-style transforms of the score are computed by
    ``util.get_ranks`` per (gene, drug) pair and per drug, and merged into
    the target frame.

    Args:
        data_file: Path of the Excel workbook. ``None`` selects
            "../data/11-15-2014 DeepXPR results_processed.xlsx".
        learn_options: Optional dict. When given, the keys 'extra pairs',
            'all pairs' and 'weighted' are read. 'extra pairs' / 'all pairs'
            add further genes to the per-drug gene lists;
            ``learn_options['weighted'] == 'variance'`` additionally reads
            the "Normalized" sheet and merges a per-row replicate variance
            into the target frame.
        verbose: When true, print one progress line per loaded gene.

    Returns:
        Tuple ``(Xdf, drugs_to_genes, target_genes, Y, gene_position)``:
        ``Xdf`` is the stacked sheet rows without the 'score' and 'test'
        columns; ``drugs_to_genes`` maps each drug to the genes that were
        loaded for it; ``target_genes`` holds the unique 'Target gene'
        values; ``Y`` holds 'score', 'Target gene', 'test', the columns
        produced by ``util.get_ranks`` and, when requested, 'variance';
        ``gene_position`` is the output of ``util.impute_gene_position`` for
        the "Percent Peptide" and "Amino Acid Cut position" columns.

    Raises:
        AssertionError: If 'extra pairs' and 'all pairs' are both set, or if
            the final indices of ``Xdf`` and ``Y`` differ.
    """
    if data_file is None:
        data_file = "../data/11-15-2014 DeepXPR results_processed.xlsx"

    # NOTE(review): `sheetname` is the keyword spelling of the pandas version
    # this code was written against; newer pandas names it `sheet_name` --
    # confirm against the installed version before upgrading.
    data = pandas.read_excel(data_file, sheetname="ResultsFiltered", skiprows=range(0, 6+1), index_col=[0, 4])
    # grab data relevant to each of three drugs, which excludes some genes
    # note gene MED12 has two drugs, all others have at most one
    Xdf = pandas.DataFrame()

    # This comes from the "Pairs" tab in their excel sheet,
    # note HPRT/HPRT1 are same thing, and also PLX_2uM/PLcX_2uM
    known_pairs = {'AZD_200nM':  ['CCDC101', 'MED12', 'TADA2B', 'TADA1'],
                   '6TG_2ug/mL': ['HPRT1'],
                   'PLX_2uM':    ['CUL3', 'NF1', 'NF2', 'MED12']}

    # Starts out equal to known_pairs but is a separate set of lists, because
    # the options below extend it in place while known_pairs must stay fixed.
    drugs_to_genes = {'AZD_200nM':  ['CCDC101', 'MED12', 'TADA2B', 'TADA1'],
                      '6TG_2ug/mL': ['HPRT1'],
                      'PLX_2uM':    ['CUL3', 'NF1', 'NF2', 'MED12']}

    if learn_options is not None:
        assert not (learn_options['extra pairs'] and learn_options['all pairs']), "extra pairs and all pairs options (in learn_options) can't be active simultaneously."

        if learn_options['extra pairs']:
            drugs_to_genes['AZD_200nM'].extend(['CUL3', 'NF1', 'NF2'])
        elif learn_options['all pairs']:
            drugs_to_genes['AZD_200nM'].extend(['HPRT1', 'CUL3', 'NF1', 'NF2'])
            drugs_to_genes['PLX_2uM'].extend(['HPRT1', 'CCDC101', 'TADA2B', 'TADA1'])
            drugs_to_genes['6TG_2ug/mL'].extend(['CCDC101', 'MED12', 'TADA2B', 'TADA1', 'CUL3', 'NF1', 'NF2'])

    count = 0
    for drug, genes in drugs_to_genes.items():
        for g in genes:
            # Copy only the selected rows (not the whole sheet) so the column
            # assignments below never write back into `data`.
            Xtmp = data.xs(g, level='Target gene', drop_level=False).copy()
            Xtmp['drug'] = drug
            Xtmp['score'] = Xtmp[drug].copy()  # grab the drug results that are relevant for this gene

            # 'test' is 1 for pairs listed in known_pairs, 0 for pairs that
            # were only added through learn_options.
            Xtmp['test'] = 1. if g in known_pairs[drug] else 0.

            count = count + Xtmp.shape[0]
            Xdf = pandas.concat([Xdf, Xtmp], axis=0)
            if verbose:
                # Single parenthesised argument: prints the same text under
                # both the Python 2 print statement and the print function.
                print("Loaded %d samples for gene %s \ttotal number of samples: %d" % (Xtmp.shape[0], g, count))

    # create new index that includes the drug
    Xdf = Xdf.set_index('drug', append=True)

    Y = pandas.DataFrame(Xdf.pop("score"))
    Y.columns.names = ["score"]

    test_gene = pandas.DataFrame(Xdf.pop('test'))
    target = pandas.DataFrame(Xdf.index.get_level_values('Target gene').values, index=Y.index, columns=["Target gene"])
    Y = pandas.concat((Y, target, test_gene), axis=1)
    target_genes = Y['Target gene'].unique()
    gene_position = Xdf[["Percent Peptide", "Amino Acid Cut position"]].copy()

    # convert to ranks for each (gene, drug combo)
    y_rank = pandas.DataFrame()
    y_threshold = pandas.DataFrame()
    y_quant = pandas.DataFrame()
    for drug, gene_list in drugs_to_genes.items():
        for gene in gene_list:
            ytmp = pandas.DataFrame(Y.xs((gene, drug), level=["Target gene", "drug"], drop_level=False)['score'])
            # The second return value of get_ranks is not used here.
            y_ranktmp, _, y_thresholdtmp, y_quanttmp = util.get_ranks(ytmp, thresh=0.8, prefix="score_drug_gene", flip=False)
            y_rank = pandas.concat((y_rank, y_ranktmp), axis=0)
            y_threshold = pandas.concat((y_threshold, y_thresholdtmp), axis=0)
            y_quant = pandas.concat((y_quant, y_quanttmp), axis=0)

    yall = pandas.concat((y_rank, y_threshold, y_quant), axis=1)
    Y = pandas.merge(Y, yall, how='inner', left_index=True, right_index=True)

    # convert also by drug only, irrespective of gene
    y_rank = pandas.DataFrame()
    y_threshold = pandas.DataFrame()
    y_quant = pandas.DataFrame()
    for drug in drugs_to_genes:
        ytmp = pandas.DataFrame(Y.xs(drug, level="drug", drop_level=False)['score'])
        y_ranktmp, _, y_thresholdtmp, y_quanttmp = util.get_ranks(ytmp, thresh=0.8, prefix="score_drug", flip=False)
        y_rank = pandas.concat((y_rank, y_ranktmp), axis=0)
        y_threshold = pandas.concat((y_threshold, y_thresholdtmp), axis=0)
        y_quant = pandas.concat((y_quant, y_quanttmp), axis=0)

    yall = pandas.concat((y_rank, y_threshold, y_quant), axis=1)
    Y = pandas.merge(Y, yall, how='inner', left_index=True, right_index=True)

    # Debugging aid, disabled by default: scatter each target against the
    # 'sgRNA Score' column.
    PLOT = False
    if PLOT:
        # to better understand, try plotting something like:
        labels = ["score", "score_drug_gene_rank", "score_drug_rank", "score_drug_gene_threshold", "score_drug_threshold"]

        for label in labels:
            plt.figure()
            plt.plot(Xdf['sgRNA Score'].values, Y[label].values, '.')
            r, pearp = sp.stats.pearsonr(Xdf['sgRNA Score'].values.flatten(), Y[label].values.flatten())
            plt.title(label + ' VS pred. score, $r$=%0.2f (p=%0.2e)' % (r, pearp))
            plt.xlabel("sgRNA prediction score")
            plt.ylabel(label)

    gene_position = util.impute_gene_position(gene_position)

    if learn_options is not None and learn_options["weighted"] == "variance":
        print("computing weights from replicate variance...")
        # compute the variance across replicates so can use it as a weight
        data = pandas.read_excel(data_file, sheetname="Normalized", skiprows=range(0, 6+1), index_col=[0, 4])
        data.index.names = ["Sequence", "Target gene"]

        # Replicate columns of the "Normalized" sheet for each drug.
        # NOTE(review): 'Deep 29 ' carries a trailing space; presumably it
        # mirrors the spreadsheet header -- verify before "fixing" it.
        experiments = {}
        experiments['AZD_200nM'] = ['Deep 25', 'Deep 27', 'Deep 29 ', 'Deep 31']
        experiments['6TG_2ug/mL'] = ['Deep 33', 'Deep 35', 'Deep 37', 'Deep 39']
        experiments['PLX_2uM'] = ['Deep 49', 'Deep 51', 'Deep 53', 'Deep 55']

        variance = None
        for drug, genes in drugs_to_genes.items():
            in_drug = data.index.get_level_values('Target gene').isin(genes)
            # .copy() so the 'drug' column is added to an independent frame
            # rather than to a chained slice of `data`.
            data_tmp = data.iloc[in_drug][experiments[drug]].copy()
            data_tmp["drug"] = drug
            data_tmp = data_tmp.set_index('drug', append=True)
            # After set_index only the replicate columns remain, so this is
            # the per-row variance across replicates.
            data_tmp["variance"] = np.var(data_tmp.values, axis=1)
            if variance is None:
                variance = data_tmp["variance"].copy()
            else:
                variance = pandas.concat((variance, data_tmp["variance"]), axis=0)

        orig_index = Y.index.copy()
        Y = pandas.merge(Y, pandas.DataFrame(variance), how="inner", left_index=True, right_index=True)
        # Put the rows back into their pre-merge order so Y stays aligned
        # with Xdf; .loc is the label-based replacement for the removed .ix.
        Y = Y.loc[orig_index]
        print("done.")

    # Make sure to keep this check last in this function
    assert Xdf.index.equals(Y.index), "The index of Xdf is different from the index of Y (this can cause inconsistencies/random performance later on)"

    return Xdf, drugs_to_genes, target_genes, Y, gene_position
                for p in r[
                        'pdl']:  # convert dicts to Counters to make comparisons much more convenient
                    p['actions'] = Counter(p['actions'])
                    p['macros'] = Counter(p['macros'])
                    p['macros'].pop('0', None)  # clear out the dummy macros
                solutions[(r['uuid'], int(r['count']))] = r
                i += 1
        with open(datapath("soln.pickle", pid), 'wb') as fp:
            print("pickling solution data")
            pickle.dump(solutions, fp)
        print()

    return history, solutions


# Module-level data loading: everything below runs once, at import time.
puzzles = get_ranks("data/rprp_puzzle_ranks_v3")

print("loading macro data")
with open("data/macro_families.json") as fp:
    # Each entry of the JSON document becomes one immutable set.
    macro_families = [frozenset(f) for f in json.load(fp)]
# newline="" hands newline handling to the csv module, as its documentation
# requires, so quoted fields containing line breaks are parsed intact.
with open("data/rprp_macros.csv", newline="") as fp:
    # Two parallel lists of pairs keyed by mrid, filled in a single pass.
    mrid_to_mid_raw = []
    mrid_to_shared_raw = []
    for row in csv.DictReader(fp):
        mrid_to_mid_raw.append((row['mrid'], row['mid']))
        mrid_to_shared_raw.append((row['mrid'], row['shared']))
with open('data/rprp_macro_revisions.csv', newline="") as fp:
    c = csv.DictReader(fp)
    # Full row dict per mrid; a repeated mrid keeps the last row read.
    revisions = {row['mrid']: row for row in c}
# Snapshot of the mrid -> mid pairs read so far; a repeated mrid keeps the
# last mid seen.
prelim = dict(mrid_to_mid_raw)
mrid_to_mid_raw.extend(