def __init__(self, filename, verboseR=True):
    """Set up the D8C2 challenge scorer.

    :param filename: stored unchanged on the instance as ``self.filename``.
    :param verboseR: forwarded to ``RTools.__init__`` as its ``verboseR``
        argument.
    """
    # Both base initializers are invoked explicitly, in this order.
    Challenge.__init__(self, challenge_name='D8C2')
    RTools.__init__(self, verboseR=verboseR)

    self.filename = filename

    # Directory that contains this module file.
    module_file = os.path.abspath(__file__)
    self._path2data = os.path.dirname(module_file)
def __init__(self, filename, verboseR=True):
    """Set up the D8C2 challenge scorer.

    :param filename: stored unchanged on the instance as ``self.filename``.
    :param verboseR: forwarded to ``RTools.__init__`` as its ``verboseR``
        argument.
    """
    # Initialise the challenge base first, then the R helper base.
    Challenge.__init__(
        self,
        challenge_name='D8C2',
    )
    RTools.__init__(
        self,
        verboseR=verboseR,
    )

    self.filename = filename
def score(self, filename):
    """Score a prediction file against the gold standard.

    Both tables are loaded with ``self._read_challenge``; their first two
    columns are skipped and the remaining values are compared with one-sided
    (``alternative='greater'``) Spearman correlation tests run in R:

    * one test per row, for rows 0..49;
    * one test per column, for columns 0..7.

    The log p-values of each family are summed with ``np.nansum`` and
    divided by the family size (50 and 8), then exponentiated. The final
    score is the average of ``-log10`` of these two aggregated p-values.

    Side effects: sets ``self.G``, ``self.T``, ``self.rho_row``,
    ``self.pval_row``, ``self.rho_col`` and ``self.pval_col``, and prints a
    warning about R versus matlab p-values.

    :param filename: prediction file, passed to ``self._read_challenge``.
    :return: dictionary with a single key ``'score'``.
    """
    self.G = self._read_challenge(self.download_goldstandard())
    self.T = self._read_challenge(filename)

    gold_values = self.G[self.G.columns[2:]].values
    pred_values = self.T[self.T.columns[2:]].values

    # Why R and not scipy: scipy.stats.spearmanr only returns a 2-tail
    # p-value and, more importantly, that p-value is a rough approximation
    # (as mentioned in its documentation) which can differ a lot from
    # matlab. R results also differ from matlab but are much closer
    # (1-2% different).
    from dreamtools.core.rtools import RTools
    rtool = RTools(verboseR=False)

    def spearman_greater(predicted, expected):
        """Run the R test on copies of both vectors; return (rho, p-value)."""
        rtool.session.t = predicted.copy()
        rtool.session.g = expected.copy()
        rtool.session.run(
            "results = cor.test(t, g, method='spearman', alternative='greater', exact=F)"
        )
        return (rtool.session.results['estimate'],
                rtool.session.results['p.value'])

    # All row-wise tests are run first, then all column-wise tests.
    row_tests = [
        spearman_greater(pred_values[r, :], gold_values[r, :])
        for r in range(50)
    ]
    col_tests = [
        spearman_greater(pred_values[:, c], gold_values[:, c])
        for c in range(8)
    ]

    rho_row = [rho for rho, _ in row_tests]
    pval_row = [pval for _, pval in row_tests]
    rho_col = [rho for rho, _ in col_tests]
    pval_col = [pval for _, pval in col_tests]

    print(""" WARNING: the spearman correlation pvalue are computed using R. Pvalues are slightly different from those computed using matlab and therefore the final values may differ by a few percents to the pvlues reported in the original challenge. \n""")

    self.rho_col = rho_col
    self.pval_col = pval_col
    self.rho_row = rho_row
    self.pval_row = pval_row

    # NaN log p-values are skipped in the sums, but the divisors stay fixed.
    aggregated_row_pval = np.exp(np.nansum(np.log(pval_row)) / 50)
    aggregated_col_pval = np.exp(np.nansum(np.log(pval_col)) / 8.)

    score = sum(-np.log10([aggregated_row_pval, aggregated_col_pval])) / 2.
    return {'score': score}
def score_challengeB(self, filenames):
    """Score the three sub-challenge B submissions (B1, B2, B3).

    For each submission, the first two rows of the prediction table (one
    row per phenotype) are compared with the first two rows of the matching
    gold standard using a one-sided (``alternative='greater'``) Spearman
    test run in R. The score of a submission is
    ``-(log(p_phenotype1) + log(p_phenotype2))``.

    An empirical p-value is then estimated for each score: the prediction
    columns ``RIL1`` .. ``RIL30`` are shuffled ``self.N_pvalues`` times,
    each shuffle is scored the same way, and the p-value is the fraction of
    shuffled scores that are greater than or equal to the real score.

    Rows are selected by position (``iloc``) and shuffled columns by label
    (``loc``); the prediction tables must therefore expose columns named
    ``RIL1`` .. ``RIL30``.

    Side effects: sets ``self.golds``, ``self.preds``, ``self.corp1``,
    ``self.corp2``, ``self.pval1``, ``self.pval2``, ``self.scores``,
    ``self.random_scores`` and ``self.pvals``; prints an ordering reminder
    and animates a progress bar.

    :param filenames: sequence of exactly 3 prediction files, ordered as
        B1, B2, B3.
    :return: DataFrame with one column per submission (``SysGenB1``,
        ``SysGenB2``, ``SysGenB3``) and one row per metric (scores,
        correlations, phenotype p-values and empirical p-values).
    :raises AssertionError: if ``filenames`` does not hold 3 entries.
    """
    from dreamtools.core.rtools import RTools
    from easydev import Progress

    # Validate the input before starting an R session.
    assert len(filenames) == 3, "Must provide 3 files"

    rtool = RTools(verboseR=False)
    r_command = ("results = cor.test(t, g, method='spearman', "
                 "alternative='greater')")

    def spearman_greater(predicted, expected):
        """Run the one-sided Spearman test in R and return its results."""
        rtool.session.t = predicted
        rtool.session.g = expected
        rtool.session.run(r_command)
        return rtool.session.results.copy()

    def combined_score(test1, test2):
        """Combine the p-values of the two phenotype tests into a score."""
        return -(np.log(test1['p.value']) + np.log(test2['p.value']))

    cor_pheno1 = []
    cor_pheno2 = []
    pval_pheno1 = []
    pval_pheno2 = []
    scores = []

    self.golds = []
    self.preds = []
    gold_filenames = self.download_goldstandard('B')
    print("Warning: your 3 submissions should be ordered as B1, B2, B3 files")

    for idx in range(3):
        # Assumes predictions and gold standard are sorted the same way.
        gold = pd.read_csv(gold_filenames[idx], sep='[ \t]', engine='python')
        self.golds.append(gold)
        pred = pd.read_csv(filenames[idx], sep='[ \t]', engine='python')
        self.preds.append(pred)

        # Correlation of gold standard versus prediction, per phenotype.
        # ``.ix`` no longer exists in pandas >= 1.0, so use ``.iloc``.
        test1 = spearman_greater(pred.iloc[0].values, gold.iloc[0].values)
        test2 = spearman_greater(pred.iloc[1].values, gold.iloc[1].values)

        cor_pheno1.append(test1['estimate'])
        cor_pheno2.append(test2['estimate'])
        pval_pheno1.append(test1['p.value'])
        pval_pheno2.append(test2['p.value'])
        scores.append(combined_score(test1, test2))

    self.corp1 = cor_pheno1
    self.corp2 = cor_pheno2
    self.pval1 = pval_pheno1
    self.pval2 = pval_pheno2
    self.scores = scores

    # Empirical p-values: score randomly shuffled predictions.
    ril_labels = ['RIL%s' % i for i in range(1, 31)]
    random_scores = {0: [], 1: [], 2: []}
    pb = Progress(self.N_pvalues, interval=1)

    # Draw exactly N_pvalues shuffles so that the number of random scores
    # matches the denominator used for the p-values below.
    for ii in range(self.N_pvalues):
        for tag in (0, 1, 2):
            # Independent random column orders for the two phenotypes.
            coord = random.sample(ril_labels, len(ril_labels))
            coord2 = random.sample(ril_labels, len(ril_labels))

            pred = self.preds[tag]
            gold = self.golds[tag]
            test1 = spearman_greater(pred.iloc[0].loc[coord].values,
                                     gold.iloc[0].values)
            test2 = spearman_greater(pred.iloc[1].loc[coord2].values,
                                     gold.iloc[1].values)
            random_scores[tag].append(combined_score(test1, test2))
        pb.animate(ii + 1)

    self.random_scores = random_scores

    # Fraction of random scores at least as large as the real score. The
    # list is converted explicitly so the comparison is element-wise
    # whatever the numeric type of the score.
    pvals = [
        np.sum(np.asarray(self.random_scores[k]) >= self.scores[k])
        / float(self.N_pvalues)
        for k in (0, 1, 2)
    ]
    self.pvals = pvals

    df = pd.DataFrame({
        'scores': self.scores,
        'correlation_phenotype1': cor_pheno1,
        'correlation_phenotype2': cor_pheno2,
        'pvalues_phenotype1': pval_pheno1,
        'pvalues_phenotype2': pval_pheno2,
        'pvalues': self.pvals,
    })
    df = df.T
    df.columns = ['SysGenB1', 'SysGenB2', 'SysGenB3']
    return df