def _stack_ranks(score_frames, prefix):
    """Run util.get_ranks on each score frame and stack the outputs.

    Args:
        score_frames: iterable of single-column ('score') DataFrames, one per
            group that has to be ranked on its own.
        prefix: forwarded unchanged to util.get_ranks as ``prefix``.

    Returns:
        A DataFrame holding the first, third and fourth return values of
        util.get_ranks (called rank, threshold and quant by this module), each
        concatenated row-wise over all groups and then placed side by side.
    """
    ranks, thresholds, quants = [], [], []
    for scores in score_frames:
        # the second return value (the raw ranks) is not used here
        rank, _rank_raw, threshold, quant = util.get_ranks(
            scores, thresh=0.8, prefix=prefix, flip=False)
        ranks.append(rank)
        thresholds.append(threshold)
        quants.append(quant)

    # Concatenate once per output instead of growing a frame inside the loop.
    return pandas.concat((pandas.concat(ranks, axis=0),
                          pandas.concat(thresholds, axis=0),
                          pandas.concat(quants, axis=0)), axis=1)


def read_V2_data(data_file, learn_options=None, verbose=True):
    """Load the V2 screen data from Excel and build feature/target frames.

    Args:
        data_file: path of the Excel workbook; when None a default relative
            path is used. The "ResultsFiltered" sheet is always read, the
            "Normalized" sheet only when variance weighting is requested.
        learn_options: optional mapping. When given, the keys 'extra pairs',
            'all pairs' and 'weighted' are read (a missing key raises
            KeyError). 'extra pairs' / 'all pairs' add further genes to the
            drugs; 'weighted' == "variance" adds a 'variance' column to Y.
        verbose: when true, print a progress line per (drug, gene) pair.

    Returns:
        Tuple ``(Xdf, drugs_to_genes, target_genes, Y, gene_position)``:
        Xdf is the stacked per-gene rows with 'drug' appended to the index
        (the 'score' and 'test' columns are moved out of it);
        drugs_to_genes maps each drug to the list of genes that was used;
        target_genes holds the unique values of Y['Target gene'];
        Y holds 'score', 'Target gene', 'test', the util.get_ranks outputs for
        the "score_drug_gene" and "score_drug" prefixes and, optionally,
        'variance';
        gene_position is the "Percent Peptide" / "Amino Acid Cut position"
        columns after util.impute_gene_position.

    Raises:
        AssertionError: if 'extra pairs' and 'all pairs' are both set, or if
            the final indexes of Xdf and Y differ.
    """
    if data_file is None:
        data_file = "../data/11-15-2014 DeepXPR results_processed.xlsx"

    # to compare
    # import predict as pr; a1, g1, t1, X1, Y1 = pr.data_setup()
    # a1.index.names

    # NOTE(review): `sheetname` is the spelling used by old pandas releases;
    # it is left untouched so the pandas version this file targets keeps working.
    data = pandas.read_excel(data_file, sheetname="ResultsFiltered",
                             skiprows=range(0, 6 + 1), index_col=[0, 4])

    # grab data relevant to each of three drugs, which excludes some genes
    # note gene MED12 has two drugs, all others have at most one

    # This comes from the "Pairs" tab in their excel sheet,
    # note HPRT/HPRT1 are same thing, and also PLX_2uM/PLcX_2uM
    known_pairs = {'AZD_200nM': ['CCDC101', 'MED12', 'TADA2B', 'TADA1'],
                   '6TG_2ug/mL': ['HPRT1'],
                   'PLX_2uM': ['CUL3', 'NF1', 'NF2', 'MED12']}

    # Starts out identical to known_pairs but may be extended below.
    drugs_to_genes = {'AZD_200nM': ['CCDC101', 'MED12', 'TADA2B', 'TADA1'],
                      '6TG_2ug/mL': ['HPRT1'],
                      'PLX_2uM': ['CUL3', 'NF1', 'NF2', 'MED12']}

    if learn_options is not None:
        assert not (learn_options['extra pairs'] and learn_options['all pairs']), "extra pairs and all pairs options (in learn_options) can't be active simultaneously."

        if learn_options['extra pairs']:
            drugs_to_genes['AZD_200nM'].extend(['CUL3', 'NF1', 'NF2'])
        elif learn_options['all pairs']:
            drugs_to_genes['AZD_200nM'].extend(['HPRT1', 'CUL3', 'NF1', 'NF2'])
            drugs_to_genes['PLX_2uM'].extend(['HPRT1', 'CCDC101', 'TADA2B', 'TADA1'])
            drugs_to_genes['6TG_2ug/mL'].extend(['CCDC101', 'MED12', 'TADA2B', 'TADA1', 'CUL3', 'NF1', 'NF2'])

    frames = []
    count = 0
    for drug in drugs_to_genes:
        for g in drugs_to_genes[drug]:
            # Copy only this gene's rows (not the whole sheet) so that the
            # column assignments below cannot touch `data`.
            Xtmp = data.xs(g, level='Target gene', drop_level=False).copy()
            Xtmp['drug'] = drug
            # grab the drug results that are relevant for this gene
            Xtmp['score'] = Xtmp[drug].copy()
            # 1. for pairs listed in known_pairs, 0. for the added ones
            Xtmp['test'] = 1. if g in known_pairs[drug] else 0.

            count += Xtmp.shape[0]
            frames.append(Xtmp)
            if verbose:
                # single parenthesised argument: valid on Python 2 and 3
                print("Loaded %d samples for gene %s \ttotal number of samples: %d" % (Xtmp.shape[0], g, count))

    Xdf = pandas.concat(frames, axis=0)

    # create new index that includes the drug
    Xdf = Xdf.set_index('drug', append=True)

    Y = pandas.DataFrame(Xdf.pop("score"))
    Y.columns.names = ["score"]

    test_gene = pandas.DataFrame(Xdf.pop('test'))
    target = pandas.DataFrame(Xdf.index.get_level_values('Target gene').values,
                              index=Y.index, columns=["Target gene"])
    Y = pandas.concat((Y, target, test_gene), axis=1)
    target_genes = Y['Target gene'].unique()
    gene_position = Xdf[["Percent Peptide", "Amino Acid Cut position"]].copy()

    # convert to ranks for each (gene, drug combo)
    # (generator: each slice is taken right before it is ranked, and Y is only
    # rebound after every slice has been consumed)
    per_gene_scores = (
        pandas.DataFrame(Y.xs((gene, drug), level=["Target gene", "drug"], drop_level=False)['score'])
        for drug in drugs_to_genes
        for gene in drugs_to_genes[drug]
    )
    Y = pandas.merge(Y, _stack_ranks(per_gene_scores, "score_drug_gene"),
                     how='inner', left_index=True, right_index=True)

    # convert also by drug only, irrespective of gene
    per_drug_scores = (
        pandas.DataFrame(Y.xs(drug, level="drug", drop_level=False)['score'])
        for drug in drugs_to_genes
    )
    Y = pandas.merge(Y, _stack_ranks(per_drug_scores, "score_drug"),
                     how='inner', left_index=True, right_index=True)

    # Debugging aid, switched off by default.
    PLOT = False
    if PLOT:
        # to better understand, try plotting something like:
        labels = ["score", "score_drug_gene_rank", "score_drug_rank", "score_drug_gene_threshold", "score_drug_threshold"]
        for label in labels:
            plt.figure()
            plt.plot(Xdf['sgRNA Score'].values, Y[label].values, '.')
            r, pearp = sp.stats.pearsonr(Xdf['sgRNA Score'].values.flatten(), Y[label].values.flatten())
            plt.title(label + ' VS pred. score, $r$=%0.2f (p=%0.2e)' % (r, pearp))
            plt.xlabel("sgRNA prediction score")
            plt.ylabel(label)

    gene_position = util.impute_gene_position(gene_position)

    if learn_options is not None and learn_options["weighted"] == "variance":
        print("computing weights from replicate variance...")
        # compute the variance across replicates so can use it as a weight
        data = pandas.read_excel(data_file, sheetname="Normalized",
                                 skiprows=range(0, 6 + 1), index_col=[0, 4])
        data.index.names = ["Sequence", "Target gene"]

        # Replicate columns per drug, spelled exactly as they are looked up in
        # the sheet (the trailing blank in 'Deep 29 ' is kept on purpose).
        experiments = {}
        experiments['AZD_200nM'] = ['Deep 25', 'Deep 27', 'Deep 29 ', 'Deep 31']
        experiments['6TG_2ug/mL'] = ['Deep 33', 'Deep 35', 'Deep 37', 'Deep 39']
        experiments['PLX_2uM'] = ['Deep 49', 'Deep 51', 'Deep 53', 'Deep 55']

        variances = []
        for drug in drugs_to_genes:
            gene_mask = data.index.get_level_values('Target gene').isin(drugs_to_genes[drug])
            # explicit copy: columns are added below, which must not be done
            # on a chained-indexing result
            data_tmp = data.iloc[gene_mask][experiments[drug]].copy()
            data_tmp["drug"] = drug
            data_tmp = data_tmp.set_index('drug', append=True)
            # only the replicate columns are left as values at this point
            data_tmp["variance"] = np.var(data_tmp.values, axis=1)
            variances.append(data_tmp["variance"])
        variance = pandas.concat(variances, axis=0)

        # the merge is followed by a re-selection with the pre-merge index so
        # that Y keeps the row order of Xdf
        orig_index = Y.index.copy()
        Y = pandas.merge(Y, pandas.DataFrame(variance), how="inner", left_index=True, right_index=True)
        Y = Y.loc[orig_index]  # label-based replacement for the deprecated .ix
        print("done.")

    # Make sure to keep this check last in this function
    assert Xdf.index.equals(Y.index), "The index of Xdf is different from the index of Y (this can cause inconsistencies/random performance later on)"

    return Xdf, drugs_to_genes, target_genes, Y, gene_position
def _stack_ranks(score_frames, prefix):
    """Run util.get_ranks on each score frame and stack the outputs.

    Args:
        score_frames: iterable of single-column ('score') DataFrames, one per
            group that has to be ranked on its own.
        prefix: forwarded unchanged to util.get_ranks as ``prefix``.

    Returns:
        A DataFrame holding the first, third and fourth return values of
        util.get_ranks (called rank, threshold and quant by this module), each
        concatenated row-wise over all groups and then placed side by side.
    """
    ranks, thresholds, quants = [], [], []
    for scores in score_frames:
        # the second return value (the raw ranks) is not used here
        rank, _rank_raw, threshold, quant = util.get_ranks(
            scores, thresh=0.8, prefix=prefix, flip=False)
        ranks.append(rank)
        thresholds.append(threshold)
        quants.append(quant)

    # Concatenate once per output instead of growing a frame inside the loop.
    return pandas.concat((pandas.concat(ranks, axis=0),
                          pandas.concat(thresholds, axis=0),
                          pandas.concat(quants, axis=0)), axis=1)


def read_V2_data(data_file, learn_options=None, verbose=True):
    """Load the V2 screen data from Excel and build feature/target frames.

    Args:
        data_file: path of the Excel workbook; when None a default relative
            path is used. The "ResultsFiltered" sheet is always read, the
            "Normalized" sheet only when variance weighting is requested.
        learn_options: optional mapping. When given, the keys 'extra pairs',
            'all pairs' and 'weighted' are read (a missing key raises
            KeyError). 'extra pairs' / 'all pairs' add further genes to the
            drugs; 'weighted' == "variance" adds a 'variance' column to Y.
        verbose: when true, print a progress line per (drug, gene) pair.

    Returns:
        Tuple ``(Xdf, drugs_to_genes, target_genes, Y, gene_position)``:
        Xdf is the stacked per-gene rows with 'drug' appended to the index
        (the 'score' and 'test' columns are moved out of it);
        drugs_to_genes maps each drug to the list of genes that was used;
        target_genes holds the unique values of Y['Target gene'];
        Y holds 'score', 'Target gene', 'test', the util.get_ranks outputs for
        the "score_drug_gene" and "score_drug" prefixes and, optionally,
        'variance';
        gene_position is the "Percent Peptide" / "Amino Acid Cut position"
        columns after util.impute_gene_position.

    Raises:
        AssertionError: if 'extra pairs' and 'all pairs' are both set, or if
            the final indexes of Xdf and Y differ.
    """
    if data_file is None:
        data_file = "../data/11-15-2014 DeepXPR results_processed.xlsx"

    # to compare
    # import predict as pr; a1, g1, t1, X1, Y1 = pr.data_setup()
    # a1.index.names

    # NOTE(review): `sheetname` is the spelling used by old pandas releases;
    # it is left untouched so the pandas version this file targets keeps working.
    data = pandas.read_excel(data_file, sheetname="ResultsFiltered",
                             skiprows=range(0, 6 + 1), index_col=[0, 4])

    # grab data relevant to each of three drugs, which excludes some genes
    # note gene MED12 has two drugs, all others have at most one

    # This comes from the "Pairs" tab in their excel sheet,
    # note HPRT/HPRT1 are same thing, and also PLX_2uM/PLcX_2uM
    known_pairs = {'AZD_200nM': ['CCDC101', 'MED12', 'TADA2B', 'TADA1'],
                   '6TG_2ug/mL': ['HPRT1'],
                   'PLX_2uM': ['CUL3', 'NF1', 'NF2', 'MED12']}

    # Starts out identical to known_pairs but may be extended below.
    drugs_to_genes = {'AZD_200nM': ['CCDC101', 'MED12', 'TADA2B', 'TADA1'],
                      '6TG_2ug/mL': ['HPRT1'],
                      'PLX_2uM': ['CUL3', 'NF1', 'NF2', 'MED12']}

    if learn_options is not None:
        assert not (learn_options['extra pairs'] and learn_options['all pairs']), "extra pairs and all pairs options (in learn_options) can't be active simultaneously."

        if learn_options['extra pairs']:
            drugs_to_genes['AZD_200nM'].extend(['CUL3', 'NF1', 'NF2'])
        elif learn_options['all pairs']:
            drugs_to_genes['AZD_200nM'].extend(['HPRT1', 'CUL3', 'NF1', 'NF2'])
            drugs_to_genes['PLX_2uM'].extend(['HPRT1', 'CCDC101', 'TADA2B', 'TADA1'])
            drugs_to_genes['6TG_2ug/mL'].extend(['CCDC101', 'MED12', 'TADA2B', 'TADA1', 'CUL3', 'NF1', 'NF2'])

    frames = []
    count = 0
    for drug in drugs_to_genes:
        for g in drugs_to_genes[drug]:
            # Copy only this gene's rows (not the whole sheet) so that the
            # column assignments below cannot touch `data`.
            Xtmp = data.xs(g, level='Target gene', drop_level=False).copy()
            Xtmp['drug'] = drug
            # grab the drug results that are relevant for this gene
            Xtmp['score'] = Xtmp[drug].copy()
            # 1. for pairs listed in known_pairs, 0. for the added ones
            Xtmp['test'] = 1. if g in known_pairs[drug] else 0.

            count += Xtmp.shape[0]
            frames.append(Xtmp)
            if verbose:
                # single parenthesised argument: valid on Python 2 and 3
                print("Loaded %d samples for gene %s \ttotal number of samples: %d" % (Xtmp.shape[0], g, count))

    Xdf = pandas.concat(frames, axis=0)

    # create new index that includes the drug
    Xdf = Xdf.set_index('drug', append=True)

    Y = pandas.DataFrame(Xdf.pop("score"))
    Y.columns.names = ["score"]

    test_gene = pandas.DataFrame(Xdf.pop('test'))
    target = pandas.DataFrame(Xdf.index.get_level_values('Target gene').values,
                              index=Y.index, columns=["Target gene"])
    Y = pandas.concat((Y, target, test_gene), axis=1)
    target_genes = Y['Target gene'].unique()
    gene_position = Xdf[["Percent Peptide", "Amino Acid Cut position"]].copy()

    # convert to ranks for each (gene, drug combo)
    # (generator: each slice is taken right before it is ranked, and Y is only
    # rebound after every slice has been consumed)
    per_gene_scores = (
        pandas.DataFrame(Y.xs((gene, drug), level=["Target gene", "drug"], drop_level=False)['score'])
        for drug in drugs_to_genes
        for gene in drugs_to_genes[drug]
    )
    Y = pandas.merge(Y, _stack_ranks(per_gene_scores, "score_drug_gene"),
                     how='inner', left_index=True, right_index=True)

    # convert also by drug only, irrespective of gene
    per_drug_scores = (
        pandas.DataFrame(Y.xs(drug, level="drug", drop_level=False)['score'])
        for drug in drugs_to_genes
    )
    Y = pandas.merge(Y, _stack_ranks(per_drug_scores, "score_drug"),
                     how='inner', left_index=True, right_index=True)

    # Debugging aid, switched off by default.
    PLOT = False
    if PLOT:
        # to better understand, try plotting something like:
        labels = ["score", "score_drug_gene_rank", "score_drug_rank", "score_drug_gene_threshold", "score_drug_threshold"]
        for label in labels:
            plt.figure()
            plt.plot(Xdf['sgRNA Score'].values, Y[label].values, '.')
            r, pearp = sp.stats.pearsonr(Xdf['sgRNA Score'].values.flatten(), Y[label].values.flatten())
            plt.title(label + ' VS pred. score, $r$=%0.2f (p=%0.2e)' % (r, pearp))
            plt.xlabel("sgRNA prediction score")
            plt.ylabel(label)

    gene_position = util.impute_gene_position(gene_position)

    if learn_options is not None and learn_options["weighted"] == "variance":
        print("computing weights from replicate variance...")
        # compute the variance across replicates so can use it as a weight
        data = pandas.read_excel(data_file, sheetname="Normalized",
                                 skiprows=range(0, 6 + 1), index_col=[0, 4])
        data.index.names = ["Sequence", "Target gene"]

        # Replicate columns per drug, spelled exactly as they are looked up in
        # the sheet (the trailing blank in 'Deep 29 ' is kept on purpose).
        experiments = {}
        experiments['AZD_200nM'] = ['Deep 25', 'Deep 27', 'Deep 29 ', 'Deep 31']
        experiments['6TG_2ug/mL'] = ['Deep 33', 'Deep 35', 'Deep 37', 'Deep 39']
        experiments['PLX_2uM'] = ['Deep 49', 'Deep 51', 'Deep 53', 'Deep 55']

        variances = []
        for drug in drugs_to_genes:
            gene_mask = data.index.get_level_values('Target gene').isin(drugs_to_genes[drug])
            # explicit copy: columns are added below, which must not be done
            # on a chained-indexing result
            data_tmp = data.iloc[gene_mask][experiments[drug]].copy()
            data_tmp["drug"] = drug
            data_tmp = data_tmp.set_index('drug', append=True)
            # only the replicate columns are left as values at this point
            data_tmp["variance"] = np.var(data_tmp.values, axis=1)
            variances.append(data_tmp["variance"])
        variance = pandas.concat(variances, axis=0)

        # the merge is followed by a re-selection with the pre-merge index so
        # that Y keeps the row order of Xdf
        orig_index = Y.index.copy()
        Y = pandas.merge(Y, pandas.DataFrame(variance), how="inner", left_index=True, right_index=True)
        Y = Y.loc[orig_index]  # label-based replacement for the deprecated .ix
        print("done.")

    # Make sure to keep this check last in this function
    assert Xdf.index.equals(Y.index), "The index of Xdf is different from the index of Y (this can cause inconsistencies/random performance later on)"

    return Xdf, drugs_to_genes, target_genes, Y, gene_position
for p in r[ 'pdl']: # convert dicts to Counters to make comparisons much more convenient p['actions'] = Counter(p['actions']) p['macros'] = Counter(p['macros']) p['macros'].pop('0', None) # clear out the dummy macros solutions[(r['uuid'], int(r['count']))] = r i += 1 with open(datapath("soln.pickle", pid), 'wb') as fp: print("pickling solution data") pickle.dump(solutions, fp) print() return history, solutions puzzles = get_ranks("data/rprp_puzzle_ranks_v3") print("loading macro data") with open("data/macro_families.json") as fp: macro_families = [frozenset(f) for f in json.load(fp)] with open("data/rprp_macros.csv") as fp: mrid_to_mid_raw = [] mrid_to_shared_raw = [] for row in csv.DictReader(fp): mrid_to_mid_raw.append((row['mrid'], row['mid'])) mrid_to_shared_raw.append((row['mrid'], row['shared'])) with open('data/rprp_macro_revisions.csv') as fp: c = csv.DictReader(fp) revisions = {row['mrid']: row for row in c} prelim = dict(mrid_to_mid_raw) mrid_to_mid_raw.extend(