def computePowerForSandSaveRealData(sh, NumericallyStable=False, TakeLog=False, N=1000, save=True):
    def computeTs(T):
        # Matrix powers of the transition matrix at the sampled generations.
        T2 = T.dot(T).astype(float)
        T3 = T2.dot(T)
        T4 = T2.dot(T2)
        T5 = T3.dot(T2)
        T10 = T5.dot(T5)
        T12 = T10.dot(T2)
        T14 = T4.dot(T10)
        T15 = T5.dot(T10)
        T22 = T12.dot(T10)
        T23 = T22.dot(T)
        if TakeLog:
            return pd.Series(map(utl.numbaLog, [T10, T12, T14, T15, T22, T23]), index=[10, 12, 14, 15, 22, 23])
        else:
            return pd.Series([T10, T12, T14, T15, T22, T23], index=[10, 12, 14, 15, 22, 23])

    s, h = sh
    path = '{}transition/real/'.format(utl.outpath)
    utl.mkdir(path)
    fname = '{}S{:E}.H{:E}.df'.format(path, np.round(s, 2), h)  # N is the number of diploids
    # OLD NUMERICALLY STABLE version:
    # T = Markov.computeTransition(s, N, h=h, takeLog=True)
    # T = T.apply(lambda x: x - x.max(), axis=1).astype(np.float128).apply(np.exp).apply(lambda x: x / x.sum(), axis=1)
    # Tn = computeTs(T)
    T = Markov.computeTransition(s, N, h=h, takeLog=False)
    Tn = computeTs(T)
    zero = (0, -np.inf)[TakeLog]
    print 'Computed power for s={}, h={}'.format(s, h) + ' Number of zero prob transitions:', (Tn.iloc[-1] == zero).sum(1).iloc[1:-1].sum()
    if save:
        Tn.to_pickle(fname)
    else:
        return Tn
    gc.collect()
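
# A minimal driver sketch (not part of the original script): computePowerForSandSaveRealData
# takes a single (s, h) tuple, so a grid of selection coefficients and dominance values can be
# swept in parallel. The grid values, pool size, and function name below are illustrative
# assumptions, not the authors' settings.
def _sweepTransitionPowersExample():
    from multiprocessing import Pool  # same Pool idiom used by the drivers below
    SH = [(s, h) for s in (0.025, 0.05, 0.1) for h in (0.5, 1.0)]  # hypothetical grid
    pool = Pool(4)
    pool.map(computePowerForSandSaveRealData, SH)
    pool.terminate()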
def runHMM(h, stepS=0.05, eps=1e-1, CD=None, E=None, save=True, verbose=1):
    if CD is None: CD = pd.read_pickle(utl.outpath + 'real/CDEidx.df').iloc[:]
    if E is None: E = pd.read_pickle(utl.outpath + 'real/Emissions.df')
    # Likelihoods under the null (s = 0) and under one step in each direction (s = -stepS, +stepS).
    likes_null = getNullLikelihoods(CD, E)
    likes_thn = mkv.computeLikelihoodReal((CD, E, -stepS, h))
    likes_thp = mkv.computeLikelihoodReal((CD[likes_null > likes_thn], E, stepS, h))
    # Split sites into negatively selected, neutral, and positively selected candidates.
    neg = likes_thn[likes_null <= likes_thn]
    zero = likes_null.loc[(likes_null.loc[likes_thp.index] >= likes_thp).replace({False: None}).dropna().index]
    pos = likes_thp.loc[(likes_null.loc[likes_thp.index] < likes_thp).replace({False: None}).dropna().index]
    if verbose > 0:
        print 'N={}\t Null={} ({:.0f}%)\t Pos={}\t Neg={}'.format(CD.shape[0], zero.size, zero.size / float(CD.shape[0]) * 100, pos.size, neg.size)
        sys.stdout.flush()
    dfz = pd.DataFrame(zero.values, index=zero.index, columns=['alt'])
    dfz['s'] = 0
    dfn = findML(neg, -stepS, CD.loc[neg.index], E, h, eps, stepS)
    dfp = findML(pos, stepS, CD.loc[pos.index], E, h, eps, stepS)
    df = pd.concat([dfp, dfz, dfn])
    df = pd.concat([df, likes_null], axis=1)
    df.columns = pd.MultiIndex.from_product([[h], df.columns], names=['h', 'stat'])
    if save:
        path = utl.outpath + 'real/HMM/'
        utl.mkdir(path)
        df.to_pickle(path + 'h{:E}.df'.format(h))
    return df
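
# A minimal usage sketch (hypothetical, not in the original script): runHMM fits one dominance
# value at a time and pickles the result under real/HMM/, so scanning several h values is just a
# loop. The h grid and the helper name below are illustrative assumptions.
def _scanDominanceExample():
    for h in (0.0, 0.5, 1.0):  # hypothetical dominance values
        runHMM(h, stepS=0.05, eps=1e-1, save=True)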
def Power(method, depthRate, nu0, s, numReplicates=3, samplingWindow=50, L=50000, numExperiments=500, numProcess=4):
    param = {'numExperiments': numExperiments, 'method': method, 'numThreads': numProcess, 'ModelName': 'TimeSeries',
             'samplingWindow': samplingWindow, 'L': L, 'numReplicates': numReplicates, 'depthRate': depthRate}
    print '\nMethod={}\tR={}\twin={}\tnu0={}\ts={}, depthRate={}'.format(method, numReplicates, samplingWindow, nu0, s, depthRate)
    sys.stdout.flush()
    if method in ['CMH', 'HMM'] and depthRate == np.inf: return
    if not s and nu0 == 0.1: return
    param['nu0'] = nu0
    param['s'] = s
    params = getParamsForExperiments(param)
    if numProcess == 1:
        a = map(runOne, params)
    else:
        pool = Pool(numProcess)
        a = pool.map(runOne, params)
        pool.terminate()
        gc.collect()
    df = pd.concat(a)
    sys.stdout.flush()
    df.sortlevel(inplace=True)
    df.dropna(axis=1, how='all', inplace=True)
    print df
    outpath = utl.outpath + 'ROC/runs/'
    utl.mkdir(outpath)
    df.to_pickle('{}{}.{:.0f}.{:E}.{:E}.df'.format(outpath, method, depthRate, nu0, s))
def runOne(args):
    path = utl.outpath + 'markov/simulations/'
    utl.mkdir(path)
    numExp = int(1e5)
    nu0, s = args
    print nu0, s
    # Simulate in batches of 10,000 loci and pickle each batch separately.
    for i, batch in enumerate(utl.batch(range(numExp), 10000)):
        print
        print i, batch[0], batch[-1]
        a = pd.concat(map(lambda x: Simulation.simulateSingleLoci(nu0=nu0, s=s)[[1, 10, 100]], batch), axis=1).T
        a.to_pickle(path + 'nu{:E}.s{:E}.{}.df'.format(nu0, s, i))
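
# A minimal driver sketch (hypothetical): this runOne expects a (nu0, s) tuple, so the
# single-locus simulations can be launched over a grid of starting frequencies and selection
# coefficients. The grid values and the helper name below are illustrative assumptions.
def _simulateGridExample():
    grid = [(nu0, s) for nu0 in (0.005, 0.1) for s in (0.0, 0.025, 0.05, 0.1)]  # hypothetical grid
    map(runOne, grid)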
def createOneMSMS(param, forceToHaveSoftFreq):
    theta = 2 * param["Ne"] * param["mu"] * param["L"]
    rho = 2 * param["Ne"] * param["r"] * param["L"]
    path = "{}{}/msms/".format(utl.simoutpath, param["ModelName"])
    utl.mkdir(path)
    if isinstance(param["i"], (int, float, long)):
        filename = "{}L{:E}.{:E}.msms".format(path, param["L"], param["i"])
    else:
        filename = "{}L{:E}.{}.msms".format(path, param["L"], param["i"])
    cmd = "java -jar -Xmx2g ~/bin/msms/lib/msms.jar -ms {} 1 -t {:.0f} -r {:.0f} {:.0f} -oFP 0.000000000000E00 > {}".format(
        param["n"], theta, rho, param["L"], filename)
    subprocess.call(cmd, shell=True)
    # make sure an initial frequency of 0.1 exists; otherwise resimulate
    if forceToHaveSoftFreq and not (Simulation.MSMS.load(filename)[0].mean(0) == 0.1).sum():
        createOneMSMS(param, forceToHaveSoftFreq)
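
# An illustrative parameter dictionary for createOneMSMS (hypothetical values; only the keys are
# taken from the function body: Ne, mu, r, L, n, ModelName, and i). The helper name is also an
# assumption for the sake of the example.
def _createOneMSMSExample():
    param = {'Ne': 1000, 'mu': 2e-9, 'r': 2e-8, 'L': 50000, 'n': 200,
             'ModelName': 'TimeSeries', 'i': 0}
    createOneMSMS(param, forceToHaveSoftFreq=True)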
def PowerForDepth(method, depthRate, numReplicates=3, samplingWindow=50, L=50000, numExperiments=500, numProcess=4):
    df = []
    Nu = [0.005, 0.1]
    S = [.025, 0.05, 0.075, 0.1]
    param = {'numExperiments': numExperiments, 'method': method, 'numThreads': numProcess, 'ModelName': 'TimeSeries',
             'samplingWindow': samplingWindow, 'L': L, 'numReplicates': numReplicates, 'depthRate': depthRate}
    print 'Nu={}\tS={}\tnumThreads={}\tmethod={}\tnumExperiments={}'.format(Nu, S, numProcess, method, numExperiments)
    sys.stdout.flush()
    if method == 'HMM' and depthRate == np.inf: return
    for nu0 in Nu:
        param['nu0'] = nu0
        for s in S:
            param['s'] = s
            params = getParamsForExperiments(param)
            if numProcess == 1:
                a = map(runOne, params)
            else:
                pool = Pool(numProcess)
                a = pool.map(runOne, params)
                pool.terminate()
                gc.collect()
            df += [pd.concat(a)]
            print '\nMethod={}\tR={}\twin={}\tnu0={}\ts={}, depthRate={}'.format(method, numReplicates, samplingWindow, nu0, s, depthRate)
            sys.stdout.flush()
    # Null runs: reuse the last parameter set with s = 0 and nu0 = 0.005.
    for param in params:
        param['s'] = 0
        param['nu0'] = 0.005
    pool = Pool(numProcess)
    df += [pd.concat(pool.map(runOne, params))]
    df = pd.concat(df)
    df.sortlevel(inplace=True)
    df.dropna(axis=1, how='all', inplace=True)
    print df
    outpath = utl.outpath + 'ROC/'
    utl.mkdir(outpath)
    df.to_pickle('{}{}.{:.0f}.df'.format(outpath, method, depthRate))