def plotScalingFactor():
    r = 2 * 1e-8
    l = 5e4
    dpi = 300
    j = 0
    for nu0 in [0.005, 0.1]:
        for s in [0.025, 0.1]:
            t = np.arange(0, 2 * (utl.logit(0.995) - utl.logit(nu0)) / s + 1., 1)
            fig, ax = plt.subplots(2, 1, figsize=(5.5, 2.5), dpi=dpi, sharex=True)
            nu(t, s=s, nu0=nu0).plot(color='k', legend=False, ax=ax[0])
            pplt.annotate(r'$s$={}, $\nu_0=${} ({} Sweep)'.format(s, nu0, ('Soft', 'Hard')[nu0 == 0.005]),
                          fontsize=7, ax=ax[0])
            pplt.setSize(ax=ax[0], fontsize=6)
            ax[0].set_ylabel(r'$\nu_t$')
            H0 = H(t[0], s=s, nu0=nu0)
            Ht = H(t, s=s, nu0=nu0)
            df = pd.DataFrame([np.log(Ht / H0), -2 * r * t * l], columns=t,
                              index=['log(Growth)', r'log(Decay)']).T
            df['log(Growth) + log(Decay)'] = df.sum(1)
            df.plot(ax=ax[1], grid=True, linewidth=2)
            ax[1].set_xlabel('Generations')
            ax[1].set_ylabel('Log(Scaling Factor)')
            ax[1].axvline(df.iloc[1:, 2].abs().idxmin(), color='k', linestyle='--', linewidth=0.5)
            # if j != 3:
            #     ax[1].legend_.remove()
            # else:
            ax[1].legend(['log(Growth)', r'log(Decay)', 'log(Growth) + log(Decay)'],
                         bbox_to_anchor=(1.45, .75), prop={'size': 6})
            pplt.setSize(ax[1], fontsize=6)
            plt.tight_layout(pad=0.1, rect=[0, 0, 0.7, 1])
            plt.gcf().subplots_adjust(bottom=0.15)
            pplt.savefig('decayFactors{}'.format(j), dpi=dpi)
            j += 1
def runHMM(h, stepS=0.05, eps=1e-1, CD=None, E=None, save=True, verbose=1):
    if CD is None:
        CD = pd.read_pickle(utl.outpath + 'real/CDEidx.df').iloc[:]
    if E is None:
        E = pd.read_pickle(utl.outpath + 'real/Emissions.df')
    likes_null = getNullLikelihoods(CD, E)
    likes_thn = mkv.computeLikelihoodReal((CD, E, -stepS, h))
    likes_thp = mkv.computeLikelihoodReal((CD[likes_null > likes_thn], E, stepS, h))
    neg = likes_thn[likes_null <= likes_thn]
    zero = likes_null.loc[(likes_null.loc[likes_thp.index] >= likes_thp).replace({False: None}).dropna().index]
    pos = likes_thp.loc[(likes_null.loc[likes_thp.index] < likes_thp).replace({False: None}).dropna().index]
    if verbose > 0:
        print 'N={}\t Null={} ({:.0f}\%)\t Pos={}\t Neg={}'.format(CD.shape[0], zero.size,
                                                                   zero.size / float(CD.shape[0]) * 100,
                                                                   pos.size, neg.size)
        sys.stdout.flush()
    dfz = pd.DataFrame(zero.values, index=zero.index, columns=['alt'])
    dfz['s'] = 0
    dfn = findML(neg, -stepS, CD.loc[neg.index], E, h, eps, stepS)
    dfp = findML(pos, stepS, CD.loc[pos.index], E, h, eps, stepS)
    df = pd.concat([dfp, dfz, dfn])
    df = pd.concat([df, likes_null], axis=1)
    df.columns = pd.MultiIndex.from_product([[h], df.columns], names=['h', 'stat'])
    if save:
        path = utl.outpath + 'real/HMM/'
        utl.mkdir(path)
        df.to_pickle(path + 'h{:E}.df'.format(h))
    return df
def Power(method, depthRate, nu0, s, numReplicates=3, samplingWindow=50, L=50000, numExperiments=500, numProcess=4):
    param = {'numExperiments': numExperiments, 'method': method, 'numThreads': numProcess,
             'ModelName': 'TimeSeries', 'samplingWindow': samplingWindow, 'L': L,
             'numReplicates': numReplicates, 'depthRate': depthRate}
    print '\nMethod={}\tR={}\twin={}\tnu0={}\ts={}, depthRate={}'.format(method, numReplicates, samplingWindow, nu0, s, depthRate)
    sys.stdout.flush()
    if method in ['CMH', 'HMM'] and depthRate == np.inf:
        return
    if not s and nu0 == 0.1:
        return
    param['nu0'] = nu0
    param['s'] = s
    params = getParamsForExperiments(param)
    if numProcess == 1:
        a = map(runOne, params)
    else:
        pool = Pool(numProcess)
        a = pool.map(runOne, params)
        pool.terminate()
    gc.collect()
    df = pd.concat(a)
    sys.stdout.flush()
    df.sortlevel(inplace=True)
    df.dropna(axis=1, how='all', inplace=True)
    print df
    outpath = utl.outpath + 'ROC/runs/'
    utl.mkdir(outpath)
    df.to_pickle('{}{}.{:.0f}.{:E}.{:E}.df'.format(outpath, method, depthRate, nu0, s))
def computePowerForSandSaveRealData(sh, NumericallyStable=False, TakeLog=False, N=1000, save=True):
    def computeTs(T):
        # transition-matrix powers (T^10, T^12, ..., T^23) for the sampled generation gaps,
        # built up by repeated matrix products
        T2 = T.dot(T).astype(float)
        T3 = T2.dot(T)
        T4 = T2.dot(T2)
        T5 = T3.dot(T2)
        T10 = T5.dot(T5)
        T12 = T10.dot(T2)
        T14 = T4.dot(T10)
        T15 = T5.dot(T10)
        T22 = T12.dot(T10)
        T23 = T22.dot(T)
        if TakeLog:
            return pd.Series(map(utl.numbaLog, [T10, T12, T14, T15, T22, T23]), index=[10, 12, 14, 15, 22, 23])
        else:
            return pd.Series([T10, T12, T14, T15, T22, T23], index=[10, 12, 14, 15, 22, 23])

    s, h = sh
    path = '{}transition/real/'.format(utl.outpath)
    utl.mkdir(path)
    fname = '{}S{:E}.H{:E}.df'.format(path, np.round(s, 2), h)
    # N is the number of diploids
    # OLD NUMERICALLY STABLE version:
    # T = Markov.computeTransition(s, N, h=h, takeLog=True)
    # T = T.apply(lambda x: x - x.max(), axis=1).astype(np.float128).apply(np.exp).apply(lambda x: x / x.sum(), axis=1)
    # Tn = computeTs(T)
    T = Markov.computeTransition(s, N, h=h, takeLog=False)
    Tn = computeTs(T)
    zero = (0, -np.inf)[TakeLog]
    print 'Computed power for s={}, h={}'.format(s, h) + ' Number of zero prob transitions:', (Tn.iloc[-1] == zero).sum(1).iloc[1:-1].sum()
    if save:
        Tn.to_pickle(fname)
    else:
        return Tn
    gc.collect()
def computeStatistics():
    cols = pd.MultiIndex.from_tuples(
        map(lambda x: (x[0], int(x[1])), ' C1 C2 C3 H1 H2 H3 L1 L2 L3'.split()),
        names=['POP', 'REP'])
    a = pd.read_csv(path + 'tot.snp.ref.freqs', sep='\t', header=None, index_col=range(4),
                    names=['CHROM', 'POS', 'REF', 'ALT'] + range(9))
    a.columns = cols
    pairwise = pd.concat([((a[a.columns[i]] + a[a.columns[j]]) / 2).rename(
        ''.join(map(str, a.columns[i])) + ''.join(map(str, a.columns[j])))
        for i in range(a.shape[1]) for j in range(i + 1, a.shape[1])], axis=1)
    pairwise.to_pickle(path + 'pairwise.population.df')
    reload(est)

    def unroll(all):
        all = pd.concat([all.applymap(lambda x: x[k]) for k in all.iloc[0, 0].keys()],
                        keys=all.iloc[0, 0].keys(), axis=1)
        all.columns.names = ['STAT'] + list(all.columns.names[1:])
        return all

    single = unroll(a.groupby(level=[0], axis=1).apply(
        lambda x: utl.scanGenome(x, f=lambda x: est.Estimate.getEstimate(x, n=200, method='all'))[x.name]))
    single.to_pickle(path + 'single.df')
    pairwise = unroll(pairwise.groupby(level=[0], axis=1).apply(
        lambda x: utl.scanGenome(x, f=lambda x: est.Estimate.getEstimate(x, n=400, method='all'))[x.name]))
    pairwise.to_pickle(path + 'pairwise.df')
def plotQuantile(df, kde):
    import Util as utl
    quantiles = np.sort(np.append(np.linspace(0.0, 1, 1000)[:-1], np.linspace(0.999, 1, 10)))
    qq = pd.concat([utl.getQantilePvalues(df.COMALE, kde, quantiles=quantiles),
                    utl.getQantilePvalues(df.COMALENC, kde, quantiles=quantiles)], axis=1)
    qq.columns = ['data', 'null']
    QQPval(qq, fname=utl.paperFiguresPath + 'qq.pdf')
def real():
    G = pd.read_pickle(utl.outpath + 'real/real.replicates.uptoF59.maxLikelihoods.regularized.LowCovRemoved.df')
    G = G.s * (G.alt - G.null)
    R = pd.read_pickle(utl.outpath + 'real/real.replicates.uptoF59.df')
    F = pd.read_pickle(utl.outpath + 'real/negativeControl.Simulations.maxLikelihoods.regularized.df')
    F = F.s * (F.alt - F.null)
    kde = utl.getDensity(F, width=100)
    q = np.sort(np.append(np.linspace(0.0, 1, 100)[:-1], np.linspace(0.999, 1, 1000)))
    qq = pd.concat([utl.getQantilePvalues(G, kde, quantiles=q),
                    utl.getQantilePvalues(F, kde, quantiles=q)], axis=1)
    qq.columns = ['data', 'null']
    pplt.QQPval(qq, fname=utl.paperFiguresPath + 'qq.pdf')
    reload(pplt)
def runOne(args):
    path = utl.outpath + 'markov/simulations/'
    utl.mkdir(path)
    numExp = int(1e5)
    nu0, s = args
    print nu0, s
    for i, batch in enumerate(utl.batch(range(numExp), 10000)):
        print
        print i, batch[0], batch[-1]
        a = pd.concat(map(lambda x: Simulation.simulateSingleLoci(nu0=nu0, s=s)[[1, 10, 100]], batch), axis=1).T
        a.to_pickle(path + 'nu{:E}.s{:E}.{}.df'.format(nu0, s, i))
def plotSNPPval(out):
    scores = rutl.loadScores()
    kde = utl.getDensity(scores, width=1)
    pval = utl.getPvalKDE(out.sort_values(ascending=False).iloc[:1200], kde)
    print pval.sort_values()
    pval[pval >= 3].size
    df = pd.DataFrame(pval)
    df = pd.concat([df[df.index.get_level_values('CHROM') == ch]
                    for ch in ['X', '2L', '2R', '3L', '3R', '4', '2LHet', '2RHet', '3LHet', '3RHet', 'XHet']])
    fig = plt.figure(figsize=(7, 2), dpi=300)
    pplt.Manhattan(df, fig=fig, markerSize=2, ticksize=8, sortedAlready=True)
    [pplt.setSize(ax, 8) for ax in fig.get_axes()]
def outlier():
    scores = rutl.removeHeteroChromatin(rutl.loadScores())
    field = comale
    df = sort(utl.scanGenome(scores.abs(), {field: lambda x: x.abs().mean(),
                                            'Num. of SNPs': lambda x: x.size}))[[field, 'Num. of SNPs']]
    a = df.iloc[:, 0]

    a = a.rename('Global Outliers')
    o = a[a > a.quantile(0.99)]
    o.to_pickle(utl.outpath + 'real/outliers.global.df')
    fig = plt.figure(figsize=(7, 1.5), dpi=300)
    pplt.Manhattan(data=a, Outliers=pd.DataFrame(o), fig=fig, markerSize=2, ticksize=8, sortedAlready=True)
    [pplt.setSize(ax, 5) for ax in fig.get_axes()]
    plt.gcf().subplots_adjust(bottom=0.15)
    plt.savefig(utl.paperPath + 'new/{}.pdf'.format('global'))

    a = a.rename('Chrom Outliers')
    o = a.groupby(level=0).apply(lambda x: x[x > x.quantile(0.99)].loc[x.name])
    o.to_pickle(utl.outpath + 'real/outliers.chrom.df')
    fig = plt.figure(figsize=(7, 1.5), dpi=300)
    pplt.Manhattan(data=a, Outliers=pd.DataFrame(o), fig=fig, markerSize=2, ticksize=8, sortedAlready=True)
    [pplt.setSize(ax, 5) for ax in fig.get_axes()]
    plt.gcf().subplots_adjust(bottom=0.15)
    plt.savefig(utl.paperPath + 'new/{}.pdf'.format('chrom'))

    a = a.rename('Local Outliers')
    o = localOutliers(a)
    o.to_pickle(utl.outpath + 'real/outliers.local.df')
    fig = plt.figure(figsize=(7, 1.5), dpi=300)
    pplt.Manhattan(data=a, Outliers=pd.DataFrame(o), fig=fig, markerSize=2, ticksize=8, sortedAlready=True)
    [pplt.setSize(ax, 5) for ax in fig.get_axes()]
    plt.gcf().subplots_adjust(bottom=0.15)
    plt.savefig(utl.paperPath + 'new/{}.pdf'.format('local'))
def scanSFS():
    scores = rutl.loadScores()
    field = comale
    df = sort(utl.scanGenome(scores.abs(), {field: lambda x: x.abs().mean(),
                                            'Num. of SNPs': lambda x: x.size}))[[field, 'Num. of SNPs']]
    plotOne(df, df[df[field] > df[field].quantile(0.99)], fname='all')
    nu0 = rutl.getNut(0)
    nut = rutl.getNut(59)
    reload(rutl)
    # n = int(pd.read_pickle(utl.outpath + 'real/CD.F59.df').loc[:, pd.IndexSlice[:, 0, 'D']].mean().mean())
    n = 100
    SFSelect = lambda x: est.Estimate.getEstimate(x=x, method='SFSelect', n=n)
    sf0 = scanOne(nu0, SFSelect, 'SFSelect.Base', 'SFSelect.Base')
    sft = scanOne(nut, SFSelect, 'SFSelect.Final', 'SFSelect.Final')
    sfr = pd.concat([(sft.iloc[:, 0] - sf0.iloc[:, 0]).rename('SFS(59)-SFS(0)'),
                     sf0.iloc[:, 0], sft.iloc[:, 0], df.iloc[:, 0]], axis=1)
    outlier = sfr[sfr.iloc[:, 0] > sfr.iloc[:, 0].quantile(0.99)]
    sfr.loc[(sfr.iloc[:, 0] < 0).values, sfr.columns[0]] = None
    fig = plt.figure(figsize=(7, 4.5), dpi=300)
    pplt.Manhattan(data=sfr, Outliers=outlier, fig=fig, markerSize=2, ticksize=8, sortedAlready=True)
    [pplt.setSize(ax, 5) for ax in fig.get_axes()]
    plt.savefig(utl.paperPath + 'new/{}.pdf'.format('sfs-clear'))
def computeIntervals(minSize=500):
    scores = pd.read_pickle(utl.outpath + 'real/scores.df')
    scores = (scores.lrh * scores.sh.apply(np.sign)).rename('H')
    regions = utl.scanGenome(scores.abs(), {'H': lambda x: x.abs().mean()}, minSize=minSize, winSize=50000).H
    regions = regions[regions > regions.quantile(0.99)]
    regions = utl.BED.getIntervals(regions, 25000)
    return regions
def computeLikelihoodRealCDold(args):
    """
    Args (packed in a single tuple; it's more convenient for multiprocessing):
        args: a list of [CD, E, s, h, regLambda].
            CD: a dataframe for which each row is a position and columns are allele frequencies.
                ColumnLevels=[REP, TIME], IndexLevels=[CHROM, POS]
            E: emission probabilities
            s: selection strength
            h: overdominance
            regLambda: regularization coefficient
    Returns:
        a series containing the likelihood of the time series for the specific values of s and h.
    """
    CD, E, s, h, regLambda = args
    print CD.shape, s, h
    if CD.shape[0] > 4 * 1e5:
        # split very large inputs into batches and recurse
        numBatches = 5
        idx = np.arange(CD.shape[0])
        return pd.concat(map(lambda x: computeLikelihoodRealCDold((CD.iloc[x], E, s, h, regLambda)),
                             np.array_split(idx, numBatches)))
    powers = pd.Series(pd.Series(CD[r].columns).diff().values[1:] for r in range(3))
    T = pd.read_pickle(utl.outpath + 'transition/real/S{:02.0f}.H{:02.0f}.df'.format(s * 100, h * 100))
    likes = pd.Series(0, index=CD.index, name=(s, h))
    for rep, df in CD.T.groupby(level=0):
        alpha = E.loc[df.loc[(rep, 0)]]
        for step, power in zip(range(1, df.shape[0]), powers[rep]):
            alpha = alpha.values.dot(T.loc[power].values) * E.loc[df.loc[rep].iloc[step]]
        likes += utl.vectorizedLog(alpha.mean(1).values)
    return likes - regLambda * abs(s)
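# Illustrative only: the single-tuple signature above lets a grid of (s, h) values be mapped directly
# with multiprocessing, mirroring the Pool usage in Power()/PowerForDepth(). A minimal sketch, assuming
# CD and E are the count/depth and emission DataFrames loaded as in runHMM(); the grid values below are
# placeholders, not the values used in the analysis.
# from multiprocessing import Pool
# grid = [(CD, E, s, h, 0.0) for s in [-0.05, 0.05] for h in [0.5]]
# pool = Pool(4)
# likelihoods = pd.concat(pool.map(computeLikelihoodRealCDold, grid), axis=1)
# pool.terminate()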
def computeComale(name='h50.df', recompute=False, q=0.99):
    path = utl.outpath + 'real/HMM/h50.COMALE.df'
    if not os.path.exists(path) or recompute:
        df = pd.read_pickle(utl.outpath + 'real/HMM/' + name)[0.5]
        df['lr'] = (df.alt - df.null) * df.s
        null = df.copy(True)
        np.random.shuffle(null.values)
        fcomale = {'COMALE': lambda x: x[x >= x.quantile(q)].mean(), 'M': lambda x: x.size}
        alt = utl.scanGenome(df.lr, fcomale, minSize=200)
        null = utl.scanGenome(null.lr, fcomale, minSize=200)
        null.columns = ['COMALENC', 'M']
        alt = pd.concat([null.COMALENC, alt], axis=1)
        alt.to_pickle(path)
        return alt
    else:
        return pd.read_pickle(path)
def createOneMSMS(param, forceToHaveSoftFreq):
    theta = 2 * param["Ne"] * param["mu"] * param["L"]
    rho = 2 * param["Ne"] * param["r"] * param["L"]
    path = "{}{}/msms/".format(utl.simoutpath, param["ModelName"])
    utl.mkdir(path)
    if isinstance(param["i"], (int, float, long)):
        filename = "{}L{:E}.{:E}.msms".format(path, param["L"], param["i"])
    else:
        filename = "{}L{:E}.{}.msms".format(path, param["L"], param["i"])
    cmd = "java -jar -Xmx2g ~/bin/msms/lib/msms.jar -ms {} 1 -t {:.0f} -r {:.0f} {:.0f} -oFP 0.000000000000E00 > {}".format(
        param["n"], theta, rho, param["L"], filename)
    subprocess.call(cmd, shell=True)
    if forceToHaveSoftFreq and not (Simulation.MSMS.load(filename)[0].mean(0) == 0.1).sum():
        # make sure an initial frequency of 0.1 exists; otherwise resimulate
        createOneMSMS(param, forceToHaveSoftFreq)
def scanSFS(XX, winSize=10000):
    import popgen.Estimate as est
    return (XX.apply(lambda x: utl.scanGenome(x.dropna(), uf=est.Estimate.getAllEstimatesX, winSize=winSize))
            .unstack("method")
            .stack(["POP", "GEN"]))
def loadAllScores(h=None, scores=True):
    path = utl.outpath + 'real/HMM/'
    if h is None:
        return pd.concat(map(lambda x: pd.read_pickle(path + x), utl.files(path)), axis=1)
    else:
        a = pd.read_pickle('{}h{:E}.df'.format(path, h))[h]
        if scores:
            a = (a.alt - a.null) * a.s.apply(np.sign)
        return a
def one(method):
    ff = lambda x: ((x.alt - x.null) * x.s.apply(np.sign)).fillna(0).sort_index()
    path = utl.outpath + 'ROC/runs/'
    files = pd.Series(utl.files(path))
    files = files[files.apply(lambda x: method in x)]
    if method == 'MarkovChain':
        pd.concat([ff(pd.read_pickle(path + f)) for f in files]).to_pickle(utl.outpath + 'ROC/' + method)
    else:
        pd.concat([pd.read_pickle(path + f) for f in files]).to_pickle(utl.outpath + 'ROC/' + method)
def saveAnnotationUCSC():
    utl.BED.saveBEDGraph(utl.loadSNPID()["ID"], color="0,0,0", name="dbSNP", fout_name=path + "dbSNP")
    ann = (pd.read_csv(utl.home + "storage/Data/Dmelanogaster/Hypoxia/popsss/all.ANN.csv", sep="\t")
           .set_index(["CHROM", "POS"])
           .loc[IH.replace({False: None}).dropna().index])
    ann = ann.iloc[:, :3].reset_index().drop_duplicates().set_index(["CHROM", "POS"])
    ann = ann.Annotation + "(" + ann.REF + ">" + ann.Allele + ")"
    utl.BED.saveBEDGraph(ann, color="0,0,0", name="SNP effect", fout_name=path + "UCSC/effectSNP")
def computeLikelihoodRealBatch(args):
    CD, E, T, powers = args
    likes = pd.Series(0, index=CD.index)
    for rep, df in CD.T.groupby(level=0):
        alpha = E.iloc[df.loc[(rep, 0)]].values
        for step, power in zip(range(1, df.shape[0]), powers[rep]):
            alpha = alpha.dot(T.loc[power].values) * E.values[df.loc[rep].iloc[step].values]
            # likes += utl.vectorizedLog(alpha.mean(1))
        likes += utl.vectorizedLog(alpha.mean(1))  # it should be here, once per replicate after the last step
    return likes
def saveLatex():
    for name in [x for x in utl.files('/home/arya/out/real/gowinda/') if x[-4:] == '.tsv']:
        # name = 'cand.local.damped.0.out.tsv'
        a = pd.read_csv('/home/arya/out/real/gowinda/' + name, sep='\t', header=None)[[0, 4, 5, 6, 7, 8, 9]]
        a.columns = ['GO', '-logPval', 'Hits', 'VarGenes', 'TotGenes', 'Term', 'Genes']
        a = a[a.Hits >= 3]
        a['-logPval'] = -a['-logPval'].apply(np.log10).round(1)
        a['Genes'] = a['Genes'].apply(lambda x: x.replace(',', ' '))
        utl.DataframetolaTexTable(a.iloc[:, 1:], fname=utl.paperPath + 'new/' + name.replace('.tsv', '.tex'),
                                  alignment=list('cccc') + ['p{2in}', 'p{2in}'])
def Final():
    scores = rutl.loadScores(skipHetChroms=True).abs()
    a = sort(utl.scanGenome(scores.abs(), {'H': lambda x: x.abs().mean(), 'M': lambda x: x.size}))
    # note: o (the outlier windows) and shades are assumed to be defined at module level
    intervals = ga.getIntervals(o.H, padding=30000)
    fig = plt.figure(figsize=(7, 1.5), dpi=300)
    pplt.Manhattan(data=a, Outliers=o, shade=intervals.reset_index(), fig=fig, markerSize=2, ticksize=8,
                   sortedAlready=True)
    [pplt.setSize(ax, 5) for ax in fig.get_axes()]
    plt.gcf().subplots_adjust(bottom=0.15)
    plt.suptitle((shades.shape[0], shades['len'].sum() / 1e6), fontsize=8)
    plt.savefig(utl.paperPath + 'new/{}.pdf'.format('CHROM.FDR_0.01'))
def computeTransition(s, N, h=0.5, takeLog=False, nu0_N=None):
    if nu0_N is None:
        nu0 = np.arange(2 * N + 1) / float(2 * N)
    else:
        nu0 = np.arange(2 * nu0_N + 1) / float(2 * nu0_N)
    nu_t = map(lambda x: max(min(utl.fx(x, s, h=h), 1.), 0.), nu0)
    if takeLog:
        # T = pd.DataFrame(computeLogTransition(nu_t, N), index=nu0, columns=nu0)  # TODO: figure out normalization
        pass
    else:
        T = pd.DataFrame(computeTransition(nu_t, N), index=nu0, columns=nu0)
    if nu0_N is not None:
        T = T / T.sum(1)
    return T
def computeBaseSFS(recompute=False):
    path = utl.outpath + 'real/SFS.F0.df'
    if not os.path.exists(path) or recompute:
        x0 = dta.getBaseFreq()
        import popgen.Estimate as est
        sfs = utl.scanGenome(x0, lambda x: est.Estimate.getEstimate(x=x, n=1000, method='all',
                                                                    selectionPredictor=True)).apply(
            lambda x: pd.Series(x[0]), axis=1)
        sfs.to_pickle(path)
        return sfs
    else:
        return pd.read_pickle(path)
def computeLocalPval(X, a):
    # note: the parameter names here are assumptions; the original scratch version relied on
    # module-level X (per-SNP scores) and a (windowed scan used for the Manhattan plot)
    wins = np.array([200]) * 1000
    df = []
    for i in X.index:
        res = []
        for pad in wins:
            x = X[(X.index >= i - pad) & (X.index <= i + pad)]
            kde = utl.getDensity(x[x.index != i])
            res += [utl.getPvalKDE(pd.Series(x.loc[i]), kde)[0]]
        df += [pd.Series(res, index=wins, name=i)]
    df = pd.DataFrame(df)
    pd.concat([df.apply(lambda x: x.idxmax(), 1), df.max(1)], 1).plot.scatter(x=0, y=1)
    a['pval'] = df.max(1).values
    o = a[a.pval > a.pval.quantile(0.999)]
    pplt.Manhattan(a, Outliers=o)
    df.max(1).plot()
    y = utl.scan3way(x, winsize=10, f=np.mean)
    x.sort_values()
    y.sort_values()
def computeIntervalsBED(padding=25000, cutoff=0.9999):
    path = utl.outpath + 'real/HMM/h50.COMALE.df'
    df = pd.read_pickle(path)
    df = df[df.COMALE > df.COMALENC.quantile(cutoff)].COMALE.reset_index()
    df['start'] = df.POS - padding
    df['end'] = df.POS + padding
    df['name'] = '.'
    df = df[['CHROM', 'start', 'end', 'name', 'COMALE']]
    df = utl.mergeIntervals(df)
    df.to_csv(utl.outpath + 'real/intervals.bed', sep='\t', header=None, index=None)
    df['len'] = df.end - df.start
    df.to_pickle(utl.outpath + 'real/intervals.df')
def load(ExperimentName, s=0.1, L=50000, experimentID=0, nu0=0.005, isFolded=False, All=False, startGeneration=0,
         maxGeneration=50, numReplicates=3, numSamples=5, step=10, replicates=None, depthRate=np.inf):
    path = '{}{}/simpop/'.format(utl.simoutpath, ExperimentName) + Simulation.getSimulationName(
        s=s, L=L, experimentID=experimentID, initialCarrierFreq=nu0, isFolded=isFolded) + '.pkl'
    sim = pd.read_pickle(path)
    sim.savedPath = path
    if replicates is not None:
        sim.setReplicates(sorted(replicates))
    elif numReplicates is not None:
        sim.setReplicates(range(numReplicates))
    if depthRate != np.inf:
        sim.Xi = sim.X
        sim.X = sim.C.loc[depthRate] / sim.D.loc[depthRate].astype(float)
        sim.X = np.array(map(lambda x: utl.roundto(x, 5), sim.X.reshape(-1) * 1e4)).reshape(sim.X.shape) / 1e4
    if not All:
        sim.setSamplingTimes(maxGeneration=min(maxGeneration, sim.getGenerationTimes()[-1]),
                             numSamples=numSamples, step=step, startGeneration=startGeneration)
    return sim
def PowerForDepth(method, depthRate, numReplicates=3, samplingWindow=50, L=50000, numExperiments=500, numProcess=4):
    df = []
    Nu = [0.005, 0.1]
    S = [.025, 0.05, 0.075, 0.1]
    param = {'numExperiments': numExperiments, 'method': method, 'numThreads': numProcess,
             'ModelName': 'TimeSeries', 'samplingWindow': samplingWindow, 'L': L,
             'numReplicates': numReplicates, 'depthRate': depthRate}
    print 'Nu={}\tS={}\tnumThreads={}\tmethod={}\tnumExperiments={}'.format(Nu, S, numProcess, method, numExperiments)
    sys.stdout.flush()
    if method == 'HMM' and depthRate == np.inf:
        return
    for nu0 in Nu:
        param['nu0'] = nu0
        for s in S:
            param['s'] = s
            params = getParamsForExperiments(param)
            if numProcess == 1:
                a = map(runOne, params)
            else:
                pool = Pool(numProcess)
                a = pool.map(runOne, params)
                pool.terminate()
            gc.collect()
            df += [pd.concat(a)]
            print '\nMethod={}\tR={}\twin={}\tnu0={}\ts={}, depthRate={}'.format(method, numReplicates, samplingWindow, nu0, s, depthRate)
            sys.stdout.flush()
    for param in params:
        param['s'] = 0
        param['nu0'] = 0.005
    pool = Pool(numProcess)
    df += [pd.concat(pool.map(runOne, params))]
    df = pd.concat(df)
    df.sortlevel(inplace=True)
    df.dropna(axis=1, how='all', inplace=True)
    print df
    outpath = utl.outpath + 'ROC/'
    utl.mkdir(outpath)
    df.to_pickle('{}{}.{:.0f}.df'.format(outpath, method, depthRate))
def gowinda():
    gow = pd.read_csv(utl.outpath + 'real/gowinda/cand.q99.out', sep='\t', header=None)
    arya = np.array("""GO:0004046 GO:0015101 GO:0007501 GO:0004601 GO:0006979 GO:0009312 GO:0004653
                       GO:0040014 GO:0016485 GO:0006030 GO:0020037 GO:0008061 GO:0004702""".split())
    np.intersect1d(gow[0].unique().astype(str), arya).shape
    Genes = pd.read_pickle(utl.outpath + 'real/GO.df')
    pval, cont = utl.getPvalFisher(Genes.reset_index().GO.unique(), gow[0], arya)
def Simulation():
    a = pd.read_pickle('{}ROC/{}.df'.format(utl.outpath, 'COMALE'))
    a = a.s * (a.alt - a.null)
    pos = a.loc[(0.1, 'COMALE', 0.1, 1, 0)]
    neg = a.loc[(0.005, 'COMALE', 0.0, -1, 0)]
    F = pd.read_pickle(utl.outpath + 'real/negativeControl.Simulations.maxLikelihoods.regularized.df').loc[0]
    F = F.s * (F.alt - F.null)
    q = np.linspace(0, 1, 1200)
    kde = utl.getDensity(F, width=50)
    qq = pd.concat([utl.getQantilePvalues(pos, kde, quantiles=q),
                    utl.getQantilePvalues(neg, kde, quantiles=q)], axis=1)
    qq.columns = ['data', 'null']
    pplt.QQPval(qq)
    plt.savefig(utl.paperFiguresPath + 'qqsim.pdf')