def plotScalingFactor():
    """Draw one two-panel figure per (nu0, s) setting and save each of them.

    The settings are every pairing of ``nu0`` in (0.005, 0.1) with ``s`` in
    (0.025, 0.1), visited with ``nu0`` as the outer loop.  The upper panel
    shows ``nu(t, s=s, nu0=nu0)``; the lower panel shows
    ``log(H(t) / H(t[0]))``, the linear term ``-2 * r * t * l`` and the sum of
    the two, with a dashed vertical line at the generation (first one
    excluded) where the absolute sum is smallest.  Figures are written through
    ``pplt.savefig`` as ``decayFactors0`` .. ``decayFactors3``.
    """
    rate = 2 * 1e-8
    span = 5e4
    resolution = 300
    growth_key = 'log(Growth)'
    decay_key = r'log(Decay)'
    total_key = 'log(Growth) + log(Decay)'

    settings = [(nu0, s) for nu0 in [0.005, 0.1] for s in [0.025, 0.1]]
    for counter, (nu0, s) in enumerate(settings):
        stop = 2 * (utl.logit(0.995) - utl.logit(nu0)) / s + 1.
        t = np.arange(0, stop, 1)
        upper, lower = plt.subplots(2, 1, figsize=(5.5, 2.5), dpi=resolution, sharex=True)[1]

        # Upper panel: the trajectory itself.
        nu(t, s=s, nu0=nu0).plot(color='k', legend=False, ax=upper)
        sweep = 'Hard' if nu0 == 0.005 else 'Soft'
        caption = r'$s$={}, $\nu_0=${} ({} Sweep)'.format(s, nu0, sweep)
        pplt.annotate(caption, fontsize=7, ax=upper)
        pplt.setSize(ax=upper, fontsize=6)
        upper.set_ylabel(r'$\nu_t$')

        # Lower panel: the two log terms and their sum.
        H0 = H(t[0], s=s, nu0=nu0)
        Ht = H(t, s=s, nu0=nu0)
        rows = [np.log(Ht / H0), -2 * rate * t * span]
        table = pd.DataFrame(rows, columns=t, index=[growth_key, decay_key]).T
        table[total_key] = table.sum(1)
        table.plot(ax=lower, grid=True, linewidth=2)
        lower.set_xlabel('Generations')
        lower.set_ylabel('Log(Scaling Factor)')
        crossing = table.iloc[1:, 2].abs().idxmin()
        lower.axvline(crossing, color='k', linestyle='--', linewidth=0.5)
        lower.legend([growth_key, decay_key, total_key], bbox_to_anchor=(1.45, .75), prop={'size': 6})
        pplt.setSize(lower, fontsize=6)

        # Leave the right 30% of the canvas free for the outside legend.
        plt.tight_layout(pad=0.1, rect=[0, 0, 0.7, 1])
        plt.gcf().subplots_adjust(bottom=0.15)
        pplt.savefig('decayFactors{}'.format(counter), dpi=resolution)
def plotDepth():
    """Plot the depth distribution stored in ``real/CD.F59.df`` as a 2x2 figure.

    The ``'D'`` slice of the ``READ`` column level is loaded from the pickle
    under ``utl.outpath``.  Panels:

    * (A) number of measurements at each depth value,
    * (B) the running total of (A),
    * (C) number of rows at each value of the row-wise minimum depth,
    * (D) the running total of (C).

    Every panel gets a dashed vertical line at depth 50.  The figure is saved
    through ``pplt.savefig`` as ``depth`` and then shown.
    """
    threshold = 50
    fontsize = 6

    def finish_panel(label):
        # Decoration shared by all four panels: panel letter, threshold line,
        # tick/label sizing of the current axes.
        pplt.annotate(label, xpad=0.85, ypad=0.45, fontsize=fontsize)
        plt.axvline(threshold, linestyle='--', linewidth=1, color='k')
        pplt.setSize(plt.gca(), fontsize)

    sns.set_style("whitegrid", {"grid.color": "1", 'axes.linewidth': .5, "grid.linewidth": ".09"})
    sns.set_context("notebook", font_scale=1.4, rc={"lines.linewidth": 2.5})
    d = pd.read_pickle(utl.outpath + 'real/CD.F59.df').xs('D', level='READ', axis=1)
    z = pd.Series(np.ndarray.flatten(d.values))

    # Each histogram feeds two panels (counts and running total): build it once.
    depth_counts = z.value_counts().sort_index()
    min_depth_counts = d.min(1).value_counts().sort_index()

    mpl.rcParams.update({'font.size': fontsize})
    plt.figure(figsize=(6, 4), dpi=300)

    plt.subplot(2, 2, 1)
    depth_counts.plot()
    plt.xlim([0, 200])
    plt.xlabel('Depth')
    plt.ylabel('Number of Measurements' + '\n (out of {:.1f}M)'.format(z.shape[0] / 1e6))
    plt.ticklabel_format(axis='y', style='sci', scilimits=(0, 0))
    plt.title('Scaled PDF')
    finish_panel('(A)')

    plt.subplot(2, 2, 2)
    depth_counts.cumsum().plot()
    plt.xlim([0, 200])
    plt.ylim([-3e5, 2.05 * 1e7])
    plt.xlabel('Depth')
    plt.title('Scaled CDF')
    finish_panel('(B)')

    plt.subplot(2, 2, 3)
    min_depth_counts.plot()
    plt.xlim([0, 100])
    plt.xlabel('Minimum Depth of each Variant')
    plt.ylabel('Number of Variants' + '\n (out of {:.1f}M)'.format(d.shape[0] / 1e6))
    plt.ticklabel_format(axis='y', style='sci', scilimits=(0, 0))
    plt.rc('font', size=fontsize)
    finish_panel('(C)')

    plt.subplot(2, 2, 4)
    min_depth_counts.cumsum().plot()
    plt.xlim([0, 60])
    # Keep the automatic upper limit, only push the lower limit below zero.
    plt.ylim([0.25 * -1e5, plt.ylim()[1]])
    plt.xlabel('Minimum Depth of each Variant')
    plt.ticklabel_format(axis='y', style='sci', scilimits=(0, 0))
    finish_panel('(D)')

    plt.gcf().subplots_adjust(bottom=0.15)
    plt.gcf().tight_layout(h_pad=0.1)
    mpl.rc('font', **{'family': 'serif', 'serif': ['Computer Modern'], 'size': fontsize})
    mpl.rc('text', usetex=True)
    # NOTE(review): this overrides the font size set on the line above with 1;
    # kept as in the original because the saved figure may rely on it - confirm.
    mpl.rcParams.update({'font.size': 1})
    pplt.savefig('depth', 300)
    plt.show()
def plot():
    """Draw the 4x3 grid comparing Markov, Brownian and observed distributions.

    Rows iterate over ``(s, nu0)`` in ``product([0, 0.1], [0.005, 0.1])`` and
    columns over ``tau`` in (1, 10, 100).  For each cell the observation comes
    from ``getObservation``, the Brownian curve from ``getBrownian`` and the
    Markov curve from the pickled ``markov/simulations/plotData.df``; the
    Brownian curve is blanked whenever ``s`` is non-zero.  The figure is saved
    through ``pplt.savefig`` as ``markovDists`` and then shown.
    """
    fontsize = 5

    def plotOne(x, ax):
        # Blanked curves are stored as None: there is nothing to draw.  The
        # original relied on a bare ``except`` to get past ``None.name``,
        # which would also have hidden unrelated errors.
        if x is None:
            return
        if x.name is None:
            lw, alpha = 0.7, 1
        else:
            lw, alpha = 3, 0.8
        # ``color`` is looked up at module level and indexed by series name.
        x.plot(ax=ax, color=color[x.name], lw=lw, alpha=alpha)

    axes = plt.subplots(4, 3, figsize=(7, 3.9), dpi=300)[1]
    df = pd.read_pickle(utl.outpath + 'markov/simulations/plotData.df')
    # NOTE(review): the last row is lettered K-M, so 'J' is skipped - confirm
    # this is intended before changing it (figure captions may refer to it).
    ABC = [list('ABC'), list('DEF'), list('GHI'), list('KLM')]
    for (s, nu0), axr, titles in zip(itertools.product([0, 0.1], [0.005, 0.1]), axes, ABC):
        for tau, ax, title in zip([1, 10, 100], axr, titles):
            key = (nu0, s, tau)
            observation = getObservation(nu0, s, tau)
            x = observation.index.values
            brownian = getBrownian(x=x, nu0=nu0, tau=tau, mu=nu0)
            markov = df[key].loc['markov']
            df[key] = pd.Series([observation, markov, brownian],
                                index=['observation', 'markov', 'brownian']).rename(key)
            if s:
                df[key].loc['brownian'] = None
            # Drawing order fixes the order of the legend entries below.
            df[key].loc[['markov', 'brownian', 'observation']].apply(lambda x: plotOne(x, ax))
            if nu0 == 0.005 and tau == 100:
                ax.set_xlim([0, 0.02])
            ax.locator_params(nbins=1, axis='y')
            if nu0 == 0.005 and tau == 100 and s == 0:
                ax.legend(['Markov Chain', 'Brownian Motion', 'Empirical Distribution'], fontsize=fontsize)
            # Keep every other x tick to avoid crowding.
            ax.set_xticks(ax.get_xticks()[::2])
            pplt.annotate('(' + title + ')', fontsize=fontsize, ax=ax)
            pplt.setSize(ax, fontsize)
        axr[0].set_ylabel(r'$P(\nu_\tau|\nu_0={},s={}$)'.format(nu0, s), fontsize=fontsize + 2, rotation=0,
                          labelpad=30)
    for tau, ax in zip([1, 10, 100], axes[0]):
        ax.set_title(r'$\tau={}$'.format(tau), fontsize=fontsize)
    # Only the bottom row carries x labels (the original reached it through
    # the loop variable left over from the row loop).
    for ax in axes[-1]:
        ax.set_xlabel(r'$\nu$', fontsize=fontsize)
    plt.gcf().tight_layout(pad=0.1, rect=[0.05, 0, 1, 1])
    pplt.savefig('markovDists', 300)
    plt.show()
def plotPowerCLR(recompute=False):
    """Plot power of the MarkovChain and HMM scores for hard and soft sweeps.

    :param recompute: when true, rebuild the power table from the
        ``ROC/MarkovChain`` and ``ROC/HMM`` pickles under ``utl.outpath`` and
        cache it as ``ROC/PowerCLR.df``; when false, load that cached pickle.

    The 2x3 figure (top row ``nu0=0.005`` labelled 'Hard', bottom row
    ``nu0=0.1`` labelled 'Soft') is saved through ``pplt.savefig`` as
    ``powerCLR`` and then shown.
    """
    cache_path = utl.outpath + 'ROC/PowerCLR.df'

    def upper_tail_mean(group):
        # Mean of the values at or above the quantile that ``Qcoverage``
        # assigns to the first component of the group key.
        cutoff = group.quantile(Qcoverage[group.name[0]])
        return group[group >= cutoff].mean()

    if not recompute:
        df = pd.read_pickle(cache_path)
    else:
        mc = pd.read_pickle('{}ROC/{}'.format(utl.outpath, 'MarkovChain'))
        hmm = f(pd.read_pickle('{}ROC/{}'.format(utl.outpath, 'HMM')))
        a = pd.concat([mc, hmm])
        print(a)
        finite = a.index.get_level_values('coverage') != np.inf
        a = a[finite]
        reduced = a.groupby(level=list(range(6))).apply(upper_tail_mean)
        df = pd.DataFrame(reduced)[0]
        df = getPower(df, groupbyLevels=list(range(4)))
        df.to_pickle(cache_path)

    reload(pplt)
    info = pplt.getNameColorMarker(df)
    # Same assignment order as before: both markers first, then both colours.
    overrides = (
        ('marker', (('HMM', '--o'), ('MarkovChain', '--s'))),
        ('color', (('HMM', 'r'), ('MarkovChain', 'darkblue'))),
    )
    for column, per_method in overrides:
        for method, value in per_method:
            info.loc[info.index.get_level_values('method') == method, column] = value

    # NOTE(review): ``dpi`` is not assigned inside this function, so it has to
    # exist at module level - confirm (sibling functions use pplt.PLOS.dpi).
    axes = plt.subplots(2, 3, sharey=True, sharex=True, figsize=(6, 2.5), dpi=dpi)[1]
    pplt.setStyle(lw=1)
    pplt.plotOnePower(df.xs(0.005, level='nu0'), info, axes[0], legendSubplot=0, ylabel='Hard')
    pplt.plotOnePower(df.xs(0.1, level='nu0'), info, axes[1], ylabel='Soft')
    for position, panel in enumerate(axes.reshape(-1)):
        pplt.annotate('({})'.format('ABCDEF'[position]), ax=panel, fontsize=7)
    plt.gcf().subplots_adjust(bottom=0.15)
    pplt.savefig('powerCLR', dpi=dpi)
    plt.show()
def plotPower(recompute=False):
    """Plot detection power per method and export the average-power tables.

    With ``recompute=True`` the FIT, CMH, GP and HMM scores are read from the
    ``ROC/`` pickles under ``utl.outpath``, reduced per group, handed to
    ``getPower`` and cached as ``ROC/Power.df``; otherwise that cached pickle
    is loaded.  Rows with infinite coverage are dropped, a 2x3 figure is drawn
    (top row ``nu0=0.005`` labelled 'Hard', bottom row ``nu0=0.1`` labelled
    'Soft') and saved as ``power``.  Average power is also written to
    ``ROC/avgPower.df`` and to two LaTeX tables (hard and soft).

    :param recompute: rebuild ``ROC/Power.df`` from the raw score pickles
        instead of loading the cached result.
    """
    if recompute:
        # Keep rows flagged causal plus every row whose 'label' level is -1.
        causal = lambda x: x[(x.index.get_level_values('causal') == True) | (x.index.get_level_values('label') == -1)]
        FIT = pd.read_pickle(utl.outpath + 'ROC/FIT')['FIT'];
        # Missing FIT scores are replaced by uniform random numbers in [0, 1).
        FIT[FIT.isnull()] = np.random.rand(FIT.isnull().sum())
        CMH = causal(pd.read_pickle(utl.outpath + 'ROC/CMH')['CMH'].fillna(0))
        GP = causal(pd.read_pickle(utl.outpath + 'ROC/GP').LR)
        HMM = f(loadHMMAllDepths())
        # HMM = (HMM.alt - HMM.null) ;HMM = HMM.groupby(level=range(6)).mean()
        # HMM = HMM.groupby(level=range(6)).apply(lambda x: x[x >= x.quantile(0.99)].mean())
        # HMM: mean of the values at or above the quantile Qcoverage gives for
        # the first component of the group key; the other methods use the max.
        HMM = HMM.groupby(level=range(6)).apply(lambda x: x[x >= x.quantile(Qcoverage[x.name[0]])].mean())
        GP = GP.groupby(level=range(6)).max()
        FIT = FIT.groupby(level=range(6)).max();  # dont move this line!
        CMH = CMH.groupby(level=range(6)).max();
        df = getPower(pd.concat([GP, HMM, FIT, CMH]), range(4)).sort_index()
        df.to_pickle(utl.outpath + 'ROC/Power.df')
    else:
        df = pd.read_pickle(utl.outpath + 'ROC/Power.df')
    df = df[df.index.get_level_values('coverage') != np.inf]
    df = fixComaleName(df)
    info = fixColor(pplt.getNameColorMarker(df))
    fig, axes = plt.subplots(2, 3, sharey=True, sharex=True, figsize=(6, 2.5), dpi=pplt.PLOS.dpi);
    pplt.setStyle(lw=1);
    reload(pplt)
    pplt.plotOnePower(df.xs(0.005, level='nu0'), info, axes[0], legendSubplot=0, ylabel='Hard', panel=list('ABC'));
    pplt.plotOnePower(df.xs(0.1, level='nu0'), info, axes[1], ylabel='Soft', panel=list('DEF'));
    # Letter the six panels (A)-(F) in row-major order.
    [pplt.annotate('({})'.format(list('ABCDEF')[j]), ax=x, fontsize=7) for j, x in enumerate(axes.reshape(-1))]
    plt.gcf().subplots_adjust(bottom=0.15)
    pplt.savefig('power', pplt.PLOS.dpi)
    # Average over everything but the first three index levels.
    df.groupby(level=range(3)).mean().unstack('method').to_pickle(utl.outpath + 'ROC/avgPower.df')
    csv = df.groupby(level=range(3)).mean().reset_index()
    # csv.replace({'HMM': comaleName}, inplace=True)
    csv.replace({np.inf: r'$\infty$'}, inplace=True)
    csv.nu0.replace({0.005: 'Hard', 0.1: 'Soft'}, inplace=True)
    csv.columns = [r'$\lambda$', 'Sweep', 'Method', 'Avg Power']
    csv.sort_values([r'$\lambda$', 'Sweep', 'Avg Power'], ascending=False, inplace=True)
    csv['Avg Power'] = csv['Avg Power'].round().astype(int)
    csv = csv.set_index(['Sweep'])
    # Cast the lambda column to int wherever it is not a string.
    i = csv[r'$\lambda$'].apply(lambda x: not isinstance(x, str))
    csv.loc[i, r'$\lambda$'] = csv.loc[i, r'$\lambda$'].astype(int)
    soft = csv.loc['Soft'].sort_values([r'$\lambda$', 'Avg Power'], ascending=False)
    hard = csv.loc['Hard'].sort_values([r'$\lambda$', 'Avg Power'], ascending=False)
    # NOTE(review): 'powerHardMathods.tex' looks like a typo for 'Methods';
    # left as is because other documents may include the file by this name.
    utl.DataframetolaTexTable(hard, fname=utl.paperFiguresPath + '../tables/powerHardMathods.tex')
    utl.DataframetolaTexTable(soft, fname=utl.paperFiguresPath + '../tables/powerSoftMethods.tex')
    plt.show()
def plotBias():
    """Draw 2x2 violin plots of estimation bias for depths 30, 100 and 300.

    For each depth the top row is built from ``ROC/bias.df`` and the bottom
    row from ``ROC/bias.h.df`` (both under ``utl.outpath``), one column per
    ``nu0`` in (0.005, 0.1).  A mean/std summary of the top-row data is
    written as ``tables/bias.<depth>.tex`` and the figure is saved as
    ``bias.<depth>``.

    The nested helpers ``computeBias`` and ``biash`` write the two pickles but
    are not called from this function (the ``computeBias()`` call is commented
    out).
    """
    def computeBias():
        # s-shat
        print 'computing bias...'
        a = pd.read_pickle('{}ROC/{}'.format(utl.outpath, 'HMM')).s
        gp = pd.read_pickle(utl.outpath + 'ROC/GP.causal').s
        b = pd.concat([a, gp]).sort_index().xs(True, level='causal')
        # Bias = last component of the group key minus the stored value.
        bias = b.groupby(level=range(4)).apply(lambda x: x.name[-1] - x)
        bias.to_pickle('{}ROC/bias.df'.format(utl.outpath))

    def biash():
        a = pd.read_pickle('{}ROC/bias.df'.format(utl.outpath))
        a = a[a.index.get_level_values('method') == 'HMM']
        # Standard-normal noise is added, then the values are scaled by 1/10
        # (and by a further 1/2 where nu0 == 0.1).
        a = a + np.random.randn(a.size)
        a /= 10
        a[a.index.get_level_values('nu0') == 0.1] /= 2
        # NOTE(review): ``g`` is the object yielded by groupby; whether this
        # in-place subtraction reaches ``a`` is not visible here - confirm.
        for name, g in a.groupby(level=range(4)):
            g -= g.mean();
        a.to_pickle('{}ROC/bias.h.df'.format(utl.outpath))

    fontsize = 6
    # computeBias()
    dpi = 300
    for depth in [30, 100, 300]:
        reload(pplt)
        fig = plt.figure(figsize=(5, 5), dpi=dpi)
        j = 0  # running subplot index: 0-1 top row, 2-3 bottom row
        # Load the bias table for this depth and rename method 'HMM' to
        # ``comaleName`` before restoring the original index levels.
        df = pd.read_pickle(utl.outpath + 'ROC/bias.df').loc[depth].reset_index()
        df.method = df.method.replace({'HMM': comaleName})
        df = df.set_index(pd.read_pickle(utl.outpath + 'ROC/bias.df').loc[depth].index.names).sort_index().s
        dfh = pd.read_pickle(utl.outpath + 'ROC/bias.h.df').loc[depth].reset_index()
        dfh.method = dfh.method.replace({'HMM': comaleName})
        dfh = dfh.set_index(pd.read_pickle(utl.outpath + 'ROC/bias.h.df').loc[depth].index.names).sort_index().s
        # Uniform jitter in [-0.005, 0.005) is added to the (0.1, comaleName) values.
        df[(0.1, comaleName)] += np.random.rand(df[(0.1, comaleName)].shape[0]) / 100 - 0.005
        # df[(0.005, 'HMM')] += np.random.rand(df[(0.005, 'HMM')].shape[0]) / 100 - 0.005
        ax = []
        # Top row: data from ``df``, both methods as split violins.
        for nu0 in [0.005, 0.1]:
            if j == 0:
                ax += [plt.subplot(2, 2, j + 1)]
            else:
                ax += [plt.subplot(2, 2, j + 1, sharex=ax[-1], sharey=ax[-1])]
            a = pd.DataFrame(df.loc[nu0])
            a.columns = ['bias']
            a['method'] = a.index.get_level_values('method')
            a['s'] = a.index.get_level_values('S')
            sns.violinplot(x="s", y="bias", hue="method", data=a, linewidth=1, palette={comaleName: "r", "GP": "darkblue"}, split=True, ax=ax[j]);
            if j < 2: ax[j].set_title(r'$\nu_0=${} ({} Sweep)'.format(nu0, ('Soft', 'Hard')[nu0 == 0.005]), fontsize=fontsize + 2)
            # pplt.annotate(r'$\nu_0=${} ({} Sweep)'.format(nu0, ('Soft', 'Hard')[nu0 == 0.005]),xpad=0.05,ypad=1,fontsize=fontsize)
            plt.locator_params(axis='y', nbins=5)
            pplt.setSize(plt.gca(), fontsize)
            ax[j].set_xlabel('$s$', fontsize=fontsize + 2)
            # plt.xlabel('$s$', fontsize=fontsize + 2)
            ax[j].legend(title='', loc='upper right', fontsize=fontsize + 2)
            # Only even-numbered panels (left column) carry a y label.
            ax[j].set_ylabel(('Bias ($s-\hat{s}$)', '')[j % 2], fontsize=fontsize + 2)
            # The legend is kept on panel 1 only.
            if j != 1: ax[j].legend_.remove()
            pplt.annotate(ABCD[j], fontsize=fontsize + 2, ax=ax[j])
            j += 1
        # Bottom row: data from ``dfh``, restricted to ``comaleName``.
        for nu0 in [0.005, 0.1]:
            if j == 2:
                ax += [plt.subplot(2, 2, j + 1)]
            else:
                ax += [plt.subplot(2, 2, j + 1, sharex=ax[-1], sharey=ax[-1])]
            a = pd.DataFrame(dfh.loc[nu0])
            a.columns = ['bias']
            a['method'] = a.index.get_level_values('method')
            a['s'] = a.index.get_level_values('S')
            sns.violinplot(x="s", y="bias", hue="method", data=a[a['method'] == comaleName], linewidth=1, palette={comaleName: "r", "GP": "g"}, ax=ax[j]);
            plt.locator_params(axis='y', nbins=5)
            pplt.setSize(plt.gca(), fontsize)
            ax[j].set_xlabel('$h$', fontsize=fontsize + 2)
            ax[j].set_ylabel(('Bias ($h-\hat{h}$)', '')[j % 2], fontsize=fontsize + 2)
            pplt.annotate(ABCD[j], fontsize=fontsize + 2, ax=ax[j])
            ax[j].legend_.remove()
            j += 1
        # Mean and standard deviation per (method, nu0), rounded to 3 decimals.
        df = df.groupby(level=['method', 'nu0']).describe().round(3).unstack(['method', 'nu0']).loc[
            ['mean', 'std']].T.reset_index().sort_values('nu0')
        df.columns = ['Method', r'$\nu_0$', 'Mean', 'STD']
        utl.DataframetolaTexTable(df, fname=utl.paperPath + 'tables/bias.{}.tex'.format(depth))
        pplt.savefig('bias.{}'.format(depth), dpi)
def plotPowerCLRQ(recompute=False):
    """Plot average HMM power against the quantile used for the score.

    :param recompute: when true, rebuild the per-quantile score table
        (``ROC/PowerCLRTable.df``) and a table of power over 100 random
        subsamples of 250 out of 1000 indices
        (``ROC/PowerCLRTableBootstrap.df``) from ``loadHMMAllDepths``; when
        false, load both pickles from ``utl.outpath``.

    A 2x3 figure is drawn with one row per ``nu0`` (0.005, 0.1) and one column
    per depth (30, 100, 300); each panel plots power against the quantile (in
    percent) for the plain and the absolute-value score and is annotated with
    the quantile of highest mean power.  The figure is saved through
    ``pplt.savefig`` as ``CLRQ`` and then shown.
    """
    dpi = pplt.PLOS.dpi
    fontsize = 7
    sns.set_style("whitegrid", {"grid.color": "0.9", 'axes.linewidth': .5, "grid.linewidth": ".09"})
    if recompute:
        a = f(loadHMMAllDepths())
        a = a[a.index.get_level_values('coverage') != np.inf]
        # 0, 0.1, ..., 1.0 plus a finer 0.91 .. 0.99 grid near the top.
        Q = np.sort(np.append(np.arange(0, 1.01, 0.1), 0.9 + np.arange(0, 1, 0.1)[1:] / 10))
        levels = list(range(6))

        def upper_tail_mean(scores, q):
            # Per group: mean of the values at or above the q-quantile.
            return scores.groupby(level=levels).apply(lambda x: x[x >= x.quantile(q)].mean())

        df = pd.concat([upper_tail_mean(a, q) for q in Q], axis=1)
        dfa = pd.concat([upper_tail_mean(a.abs(), q) for q in Q], axis=1)
        new_order = [0, 6, 7] + list(range(1, 6))
        df.columns = pd.MultiIndex.from_product([Q, [False]], names=['Quantile', 'ModifiedLR'])
        df = df.stack(df.columns.names).reorder_levels(new_order)
        dfa.columns = pd.MultiIndex.from_product([Q, [True]], names=['Quantile', 'ModifiedLR'])
        dfa = dfa.stack(dfa.columns.names).reorder_levels(new_order)
        df = pd.concat([df, dfa])
        df.to_pickle(utl.outpath + 'ROC/PowerCLRTable.df')
        df = df[df.index.get_level_values("coverage") != np.inf]
        # One column per subsample: 250 distinct indices drawn from range(1000).
        boot = pd.DataFrame([np.sort(np.random.choice(1000, 250, replace=False)) for _ in range(100)]).T
        print(boot)
        dfboot = boot.groupby(level=0, axis=1).apply(
            lambda x: getPower(df.loc[pd.IndexSlice[:, :, :, :, :, :, :, x[x.name].values]].sort_index(),
                               groupbyLevels=list(range(6))).xs('HMM', level='method')
        ).groupby(level=list(range(4))).mean()
        print(dfboot)
        dfboot.columns.name = 'i'
        dfboot = dfboot.stack('i').reset_index(['i', 'ModifiedLR', 'Quantile'])
        print(dfboot)
        dfboot.to_pickle(utl.outpath + 'ROC/PowerCLRTableBootstrap.df')
    else:
        df = pd.read_pickle(utl.outpath + 'ROC/PowerCLRTable.df')
        dfboot = pd.read_pickle(utl.outpath + 'ROC/PowerCLRTableBootstrap.df')
    dfboot.Quantile = (dfboot.Quantile * 100).astype(int)
    sns.set_context(rc={"lines.linewidth": 0.5})
    pistar = {}
    # A real list, so it can be indexed regardless of how ``map`` behaves.
    ABCD = ['({})'.format(letter) for letter in 'ABCDEFG']
    fig, axes = plt.subplots(2, 3, sharey=True, sharex=True, figsize=(6, 2.5), dpi=dpi)
    j = 0
    for nu0, axs in zip([0.005, 0.1], axes):
        for depth, ax in zip([30, 100, 300], axs):
            a = dfboot.loc[depth].loc[nu0]
            if nu0 == 0.005:
                ax.set_title(r'$\lambda$={}'.format(str(depth)).replace('inf', '$\infty$'))
            sns.tsplot(data=a, time='Quantile', unit='i', value=0, condition='ModifiedLR', ci=99.99, legend=False,
                       color=['r', 'darkblue'], ax=ax)
            best = a.groupby('Quantile')[0].mean().idxmax()
            pistar[ax] = (ABCD[j], r'($\pi^*=${})'.format(best))
            pplt.setSize(ax, fontsize)
            ax.set_xlabel('')
            ax.set_ylabel('')
            j += 1
    axes[0][0].locator_params(nbins=3)
    for ax in axes[1]:
        ax.set_xlabel(r'$\pi$')
    # Row 0 is nu0=0.005 (hard) and row 1 is nu0=0.1 (soft).  The original
    # derived the label from the leftover loop variable ``nu0``, which is
    # always 0.1 at this point, so both rows were labelled 'Soft Sweep'.
    for sweep, ax in zip(('Hard', 'Soft'), (axes[0][0], axes[1][0])):
        ax.set_ylabel('Avg. Power\n({} Sweep)'.format(sweep))
    plt.gcf().subplots_adjust(bottom=0.2)
    for position, panel in enumerate(axes.reshape(-1)):
        pplt.annotate('({})'.format('ABCDEF'[position]), ax=panel, fontsize=7)
    for panel, (_, text) in pistar.items():
        pplt.annotate(text, ax=panel, fontsize=7, xpad=0.6)
    axes[1][-1].legend([r'$\mathcal{H}$', '$\mathcal{H}^+$'], loc='lower right', prop={'size': fontsize})
    mpl.rc('font', **{'family': 'serif', 'serif': ['Computer Modern'], 'size': fontsize})
    mpl.rc('text', usetex=True)
    pplt.savefig('CLRQ', dpi)
    plt.show()
def plotNull(subp, nu0=0.005, fontsize=5):
    """Draw three neutral (s=0) panels, for tau = 1, 10 and 100, in a 3x3 grid.

    Each panel overlays a scaled normal curve ('Brownian Motion', red), the
    Markov-chain distribution after ``tau`` steps (blue) and the observed
    distribution read from ``markov/neutral.obs.<nu0>.pkl`` (green).

    :param subp: three subplot positions in the 3x3 grid; ``subp[k] - 1`` is
        also used to index the module-level ``subptitle`` for the panel title.
    :param nu0: starting frequency; selects the row of the transition matrix,
        the observation file and the per-panel limits in ``dfplt``.
    :param fontsize: font size passed to the annotation, title and sizing calls.
    """
    obs = pd.read_pickle(utl.outpath + 'markov/neutral.obs.{}.pkl'.format(nu0))
    T = Markov.computeTransition(0, N=1000)
    # Hand-picked curve scale and axis limits, keyed by (nu0, tau).
    dfplt = pd.concat([pd.Series({'scale': 10, 'xlim': [0.0, 0.01], 'ylim': [0, 1]}, name=(0.005, 1)),
                       pd.Series({'scale': 30, 'xlim': [0.06, 0.14], 'ylim': [0, 0.15]}, name=(0.1, 1)),
                       pd.Series({'scale': 30, 'xlim': [0.0, 0.015], 'ylim': [0, 0.3]}, name=(0.005, 10)),
                       pd.Series({'scale': 45, 'xlim': [0.0, 0.2], 'ylim': [0, 0.025]}, name=(0.1, 10)),
                       pd.Series({'scale':30, 'xlim':[0.0,0.03],'ylim': [0,0.2]},name=(0.005,100)),pd.Series({'scale':50, 'xlim':[0.00,0.4],'ylim': [0,0.004]},name=(0.1,100))
                       ],axis=1).T
    # One-step distribution starting from nu0.
    markov=T.loc[nu0].copy(True);markov.name='Markov Chain'
    xx=np.arange(0,1,0.00001)
    # NOTE(review): ``sig2`` is passed as the second argument of stats.norm,
    # which scipy treats as the scale (standard deviation) - confirm whether a
    # square root is intended; the 'scale' factors above may compensate.
    N=200; tau=1;h=2*nu0*(1-nu0);sig2=h*tau/N;brownian=stats.norm(nu0, sig2).pdf(xx);
    brownian=pd.Series(brownian,index=xx);brownian/=brownian.sum();brownian.name='Brownian Motion';brownian*=dfplt.loc[(nu0,tau)].scale
    # Applies to whichever axes is current before the first panel is created.
    pplt.setSize(plt.gca(), fontsize=fontsize)
    plt.subplot(3, 3, subp[0]);
    brownian.plot(color='r');
    markov.plot(color='b');
    o=pd.Series(obs.X[1].flatten()).value_counts().sort_index();o=o/o.sum();
    # For nu0 == 0.1 the observation is a 500-bin histogram with a hand-set divisor.
    if nu0==0.1:
        counts,limits=np.histogram(obs.X[1].flatten(),bins=500,range=[0,1]);centers = 0.5*(limits[1:]+limits[:-1]);o=pd.Series(counts,index=centers);o=o/(obs.X.shape[1]*obs.X.shape[2]*4)
    o.plot(color='g')
    plt.xlim(dfplt.loc[(nu0, tau)].xlim);
    plt.ylim(dfplt.loc[(nu0, tau)].ylim);
    plt.locator_params(nbins=3)
    pplt.annotate(r'$s=0$, $\nu_0=${}, $\tau$={}'.format(nu0, tau), fontsize=fontsize)
    plt.ylabel(r'$P(\nu_\tau|\nu_0)$')
    tau=10
    # Nine more transitions: ``markov`` is now the 10-step distribution.
    for _ in range(9):
        markov=markov.dot(T)
    N=200;h=2*nu0*(1-nu0);sig2=h*tau/N;brownian=stats.norm(nu0, sig2).pdf(xx)
    brownian=pd.Series(brownian,index=xx);brownian/=brownian.sum();brownian.name='Brownian Motion'; brownian*=dfplt.loc[(nu0,tau)].scale
    # Still acting on the first panel: size and title it before moving on.
    pplt.setSize(plt.gca(), fontsize=fontsize)
    plt.title('({})'.format(subptitle[subp[0] - 1]), fontsize=fontsize)
    plt.subplot(3, 3, subp[1]);
    brownian.plot(color='r');
    markov.plot(color='b');
    o=pd.Series(obs.X[10].flatten()).value_counts().sort_index();o=o/o.sum();
    if nu0==0.1:
        counts,limits=np.histogram(obs.X[10].flatten(),bins=100,range=[0,1]);centers = 0.5*(limits[1:]+limits[:-1]);o=pd.Series(counts,index=centers);o=o/(obs.X.shape[1]*obs.X.shape[2]*20)
    o.plot(color='g')
    plt.xlim(dfplt.loc[(nu0, tau)].xlim);
    plt.ylim(dfplt.loc[(nu0, tau)].ylim);
    plt.locator_params(nbins=3)
    pplt.annotate(r'$s=0$, $\nu_0=${}, $\tau$={}'.format(nu0, tau), loc=1, fontsize=fontsize)
    pplt.setSize(plt.gca(), fontsize=fontsize)
    tau=100
    # Ninety more transitions: ``markov`` is now the 100-step distribution.
    for _ in range(90):
        markov=markov.dot(T)
    N=200;h=2*nu0*(1-nu0);sig2=h*tau/N;brownian=stats.norm(nu0, sig2).pdf(xx)
    brownian=pd.Series(brownian,index=xx);brownian/=brownian.sum();brownian.name='Brownian Motion'; brownian*=dfplt.loc[(nu0,tau)].scale
    # Title of the second panel, set before the third panel becomes current.
    plt.title('({})'.format(subptitle[subp[1] - 1]), fontsize=fontsize)
    plt.subplot(3, 3, subp[2]);
    brownian.plot(color='r');
    markov.plot(color='b')
    o=pd.Series(obs.X[100].flatten()).value_counts().sort_index();o=o/o.sum();
    if nu0==0.1:
        counts,limits=np.histogram(obs.X[100].flatten(),bins=30,range=[0,1]);centers = 0.5*(limits[1:]+limits[:-1]);o=pd.Series(counts,index=centers);o=o/(obs.X.shape[1]*obs.X.shape[2]*60)
    o.name = 'Observation';
    o.plot(color='g')
    plt.xlim(dfplt.loc[(nu0, tau)].xlim);
    plt.ylim(dfplt.loc[(nu0, tau)].ylim);
    plt.locator_params(nbins=3)
    pplt.annotate(r'$s=0$, $\nu_0=${}, $\tau$={}'.format(nu0, tau), loc=1, fontsize=fontsize)
    # The legend is drawn only when the third panel sits at grid position 3.
    if subp[2] == 3:
        plt.legend(loc='center right', fontsize=fontsize)
    pplt.setSize(plt.gca(), fontsize=fontsize)
    plt.title('({})'.format(subptitle[subp[2] - 1]), fontsize=fontsize)
def plotAlternative(subp, s=0.1, fontsize=5):
    """Draw three panels under selection, for tau = 1, 10 and 100, in a 3x3 grid.

    Each panel overlays the Markov-chain distribution after ``tau`` steps
    (blue) and the observed distribution (green) read from
    ``markov/T100.S<1000*s, zero-padded to 3 digits>.obs.df``.  The starting
    frequency is fixed at 0.005.

    :param subp: three subplot positions in the 3x3 grid; ``subp[k] - 1`` is
        also used to index the module-level ``subptitle`` for the panel title.
    :param s: selection strength passed to ``Markov.computeTransition`` and
        used to pick the observation file.
    :param fontsize: font size passed to the annotation, title and sizing calls.
    """
    nu0=0.005
    obs = pd.read_pickle(utl.outpath + 'markov/T100.S{:03.0f}.obs.df'.format(s * 1000))
    T = Markov.computeTransition(s, 1000)
    # Hand-picked axis limits (and an unused-here 'scale'), keyed by (nu0, tau).
    dfplt= pd.concat([pd.Series({'scale':10, 'xlim':[0.0,0.01],'ylim': [0,0.2]},name=(0.005,1)),pd.Series({'scale':30, 'xlim':[0.06,0.14],'ylim': [0,0.15]},name=(0.1,1)),
                      pd.Series({'scale':30, 'xlim':[0.0,0.015],'ylim': [0,0.15]},name=(0.005,10)),pd.Series({'scale':45, 'xlim':[0.0,0.2],'ylim': [0,0.025]},name=(0.1,10)),
                      pd.Series({'scale':30, 'xlim':[0.0,1],'ylim': [0,0.01]},name=(0.005,100)),pd.Series({'scale':50, 'xlim':[0.00,0.4],'ylim': [0,0.004]},name=(0.1,100))
                      ],axis=1).T
    # One-step distribution starting from nu0.
    markov=T.loc[nu0].copy(True);markov.name='Markov Chain'
    plt.subplot(3, 3, subp[0])
    tau=1
    o=(obs[1].value_counts().sort_index()/obs.shape[0])
    # NOTE(review): a hard-coded point is inserted and the index is shifted by
    # half of 0.0005; the reason is not visible in this function - confirm.
    o.loc[0.0055]=0.1211
    o.index=o.index-0.0005/2
    markov.plot(color='b');
    o.plot(color='g');
    plt.xlim(dfplt.loc[(nu0, tau)].xlim);
    plt.ylim(dfplt.loc[(nu0, tau)].ylim);
    plt.locator_params(nbins=3)
    pplt.annotate(r'$s={}$, $\nu_0=${}, $\tau$={}'.format(s, nu0, tau), loc=1, fontsize=fontsize)
    plt.ylabel(r'$P(\nu_\tau|\nu_0,s)$')
    # NOTE(review): the x axis is labelled '$s$' although the plotted index is
    # the index of the distributions - confirm the intended label.
    plt.xlabel('$s$')
    tau=10
    # Nine more transitions: ``markov`` is now the 10-step distribution.
    for _ in range(9):
        markov=markov.dot(T)
    # Still acting on the first panel: size and title it before moving on.
    pplt.setSize(plt.gca(), fontsize=fontsize)
    plt.title('({})'.format(subptitle[subp[0] - 1]), fontsize=fontsize)
    plt.subplot(3, 3, subp[1])
    markov.plot(color='b');
    (obs[10].value_counts().sort_index() / obs.shape[0]).plot(color='g');
    plt.xlim(dfplt.loc[(nu0, tau)].xlim);
    plt.ylim(dfplt.loc[(nu0, tau)].ylim);
    plt.locator_params(nbins=3)
    pplt.annotate(r'$s={}$, $\nu_0=${}, $\tau$={}'.format(s, nu0, tau), loc=1, fontsize=fontsize)
    plt.xlabel('$s$')
    tau=100
    # Ninety more transitions: ``markov`` is now the 100-step distribution.
    for _ in range(90):
        markov=markov.dot(T)
    pplt.setSize(plt.gca(), fontsize=fontsize)
    plt.title('({})'.format(subptitle[subp[1] - 1]), fontsize=fontsize)
    plt.subplot(3, 3, subp[2])
    # Observation at tau=100 as a 50-bin histogram over [0, 1].
    counts,limits=np.histogram(obs[100].values,bins=50,range=[0,1]);centers = 0.5*(limits[1:]+limits[:-1]);o=pd.Series(counts,index=centers);o=o/obs.shape[0]
    # Hand-set divisor, then the left edge is patched: a point is added at 0.0
    # and the second point is overwritten with the third.
    o/=35
    o.loc[0.0] = o.iloc[0]
    o = o.sort_index()
    o.iloc[1] = o.iloc[2]
    # o=(obs[100].value_counts().sort_index()/obs.shape[0])
    o.name = 'Observation';
    o.plot(color='g');
    markov.plot(color='b');
    plt.xlim(dfplt.loc[(nu0, tau)].xlim);
    plt.ylim(dfplt.loc[(nu0, tau)].ylim);
    plt.locator_params(nbins=3)
    pplt.annotate(r'$s={}$, $\nu_0=${}, $\tau$={}'.format(s, nu0, tau), loc=1, fontsize=fontsize)
    plt.xlabel('$s$')
    pplt.setSize(plt.gca(), fontsize=fontsize)
    plt.title('({})'.format(subptitle[subp[2] - 1]), fontsize=fontsize)
def Final():
    """Run the genome-scan analysis and write its figure, tables and data files.

    Steps, in order: load gene/GO data and scores; scan the genome in
    30000-wide windows and pick local outliers (q=0.99); draw and save the
    Manhattan figure; turn outliers into padded intervals and collect the
    genes they intersect; compute per-GO Fisher p-values and write one table
    per ontology; write the table for two specific GO IDs; export candidate
    and background SNP lists for Gowinda; read a Gowinda result file, filter
    and export it; finally write a table of summary counts.
    """
    ############ preparing data
    def saveGOTex(df):
        # Writes one ontology group as CSV (under /data/) and LaTeX (under /tables/).
        name = np.unique(df.index)[0]
        print '*' * 80, name
        df = df.sort_values('-log($p$-value)', ascending=False)
        df['Rank'] = range(1, df.shape[0] + 1);
        # Move the new 'Rank' column (position 6) to the front.
        df = df.iloc[:, [6] + range(6)]
        path = utl.paperPath + '/tables/{}.tex'.format(name);
        df.to_csv(path.replace('.tex', '.csv').replace('/tables/', '/data/'))
        # The last column is left out of the LaTeX table.
        utl.DataframetolaTexTable(df.iloc[:, :-1], alignment=['c', 'c', 'p{3in}', 'c', 'c', 'c'], fname=path)

    # ``allVariantGenes`` and ``g`` are assigned further down; these lambdas
    # are only called after both exist.
    goPvalue = lambda x: utl.getPvalFisher(AllGenes=allVariantGenes.values, putativeList=x.values, myList=g.index.values)
    # First element capped at 6 after rounding to one decimal.
    unpackp = lambda x: [min(6, np.round(x[0], 1)), x[1].loc['Putative', 'myList']]
    # Score = lambda x,f:f(scores.loc[x.CHROM][(scores.loc[x.CHROM].index>=x.start)&(scores.loc[x.CHROM].index<=x.end)])
    # Reorder rows by chromosome (X, 2L, 2R, 3L, 3R) and rename the columns for display.
    sort = lambda df: pd.concat(
        [df[df.index.get_level_values('CHROM') == ch] for ch in ['X', '2L', '2R', '3L', '3R']]).rename(
        columns={'H': r'$\mathcal{H}^+$', 'M': 'Num. of Variants'})
    Genes = loadGeneData().reset_index().set_index('GO')
    # Keep GO entries with more than two distinct FBgn values.
    Genes = Genes.loc[
        (Genes['FBgn'].groupby(level=0).apply(lambda x: len(x.unique())) > 2).replace({False: None}).dropna().index]
    scores = utl.getEuChromatin(rutl.loadScores(skipHetChroms=True))
    ann = pd.DataFrame(scores).join(loadANN(), how='inner')
    allVariantGenes = ann['Gene_ID'].drop_duplicates()
    # f=lambda x: x[x>=x.quantile(0.9)].mean()
    # geneScores=ann.reset_index().set_index('Gene_ID')[['CHROM','POS',0]].drop_duplicates().groupby(level=0)[0].apply(f)
    ############ computing candidate regions
    scan = utl.scanGenome(scores.abs(), {'H': lambda x: x.abs().mean(), 'M': lambda x: x.size}, winSize=30000)
    o = utl.localOutliers(scan.H, q=0.99);
    o = scan.loc[o.index]
    fig = plt.figure(figsize=(7, 2.5), dpi=300);
    pplt.Manhattan(data=sort(scan), Outliers=sort(o), fig=fig, markerSize=2, ticksize=8, sortedAlready=True);
    [pplt.setSize(ax, 6) for ax in fig.get_axes()];
    pplt.annotate('(A)', ax=fig.axes[0], fontsize=8)
    pplt.annotate('(B)', ax=fig.axes[1], fontsize=8)
    plt.gcf().subplots_adjust(bottom=0.15);
    pplt.savefig('manhattan', 300)
    plt.savefig(utl.paperFiguresPath + 'manhattan.pdf')
    regions = utl.BED.getIntervals(o.H, padding=30000);
    print regions.shape
    intervalGenes = utl.BED.intersection(ann, regions).name.drop_duplicates().reset_index().set_index('name');
    print intervalGenes.size
    g = intervalGenes;  # intervalGenes
    # g=g[g>=g.quantile(0.)]; print g.size
    # One row per GO entry: id, term, p-value pieces, ontology, gene count,
    # and the values shared with the interval genes.
    df = Genes.groupby(level=0).apply(lambda x: pd.DataFrame(
        [x.name, x.term.iloc[0]] + unpackp(goPvalue(x.FBgn.drop_duplicates())) + [x.ontology.iloc[0], x.FBgn.unique().size] + [
            np.intersect1d(x.values, g.index.values)],
        index=['GO ID', 'GO Term', '-log($p$-value)', 'Hits', 'Ontology', 'Num of Genes', 'Genes']).T)
    df = df[(df['-log($p$-value)'] >= 3) & (df.Hits >= 3)]
    df['-log($p$-value)'] = df['-log($p$-value)'].astype(str)
    df = df.set_index('Ontology')
    df.groupby(level=0).apply(saveGOTex);
    print df
    # Gene table for the two hard-coded GO IDs.
    tempGenes = Genes.reset_index().set_index('FBgn').loc[
        np.append(df.set_index('GO ID').loc['GO:0009631'].Genes, df.set_index('GO ID').loc['GO:0009408'].Genes)][
        ['term', 'name', 'GO']].reset_index().set_index('GO').loc[['GO:0009631', 'GO:0009408']].drop_duplicates()
    tempGenes.columns = ['FlyBase ID', 'GO Term', 'Gene Name']
    utl.DataframetolaTexTable(tempGenes, fname=utl.paperPath + '/tables/{}.tex'.format('tempGenes'), alignment=['l', 'l', 'l'])
    regions.to_csv(utl.paperPath + 'data/intervals.csv')
    # NOTE(review): ``snps`` is not read again anywhere below in this function.
    snps = utl.BED.intersection(scores.reset_index(), regions, 0);
    snps['POS'] = snps.start;
    snps.set_index('POS', append=True, inplace=True)
    snps = snps['name'].astype(float).reset_index().drop_duplicates().set_index(['CHROM', 'POS']).name

    def ff(x):
        # Per interval: positive scores at or above their own 0.9 quantile.
        y = utl.BED.intersection(scores.reset_index(), x, 0).rename(columns={'start': 'POS'}).set_index('POS', append=True).name.astype(
            float)
        y = y[y > 0]
        y = y[y >= y.quantile(0.9)]
        print x['len'].iloc[0], y.size
        return y

    cands = regions.reset_index().groupby(level=0).apply(ff).reset_index(level=0).name
    # Tab-separated, header-less inputs for Gowinda: candidates and all SNPs.
    cands.sort_index().reset_index().drop_duplicates().dropna().to_csv(utl.outpath + 'real/gowinda/cands.final.txt', sep='\t', header=None, index=False)
    scores.sort_index().reset_index().drop_duplicates().dropna().to_csv(utl.outpath + 'real/gowinda/allsnps.txt', sep='\t', header=None, index=False)
    name = 'cands.final.out.tsv'
    # NOTE(review): absolute, user-specific path, unlike the ``utl.outpath``
    # based paths used for the Gowinda inputs just above - confirm.
    gowinda = pd.read_csv('/home/arya/out/real/gowinda/{}'.format(name), sep='\t', header=None)[[0, 4, 5, 6, 7, 8, 9]]
    gowinda.columns = ['GO ID', '-log($p$-value)', 'Hits', 'Num of Genes', 'Total Genes', 'GO Term', 'Genes']
    gowinda = gowinda[gowinda.Hits >= 3]
    # The column holds raw values on input; convert to -log10, one decimal.
    gowinda['-log($p$-value)'] = -gowinda['-log($p$-value)'].apply(np.log10).round(1)
    gowinda.to_csv(utl.paperPath + 'data/gowinda.all.tsv', sep='\t')
    # Restrict to GO IDs whose ontology is 'biological_process'.
    bp = gowinda.set_index('GO ID').loc[
        Genes[Genes.ontology == 'biological_process'].index.unique().rename('GO ID')].dropna()
    bp.to_csv(utl.paperPath + 'data/gowinda.bp.tsv', sep='\t')
    utl.DataframetolaTexTable(bp.reset_index()[['GO ID', 'GO Term', '-log($p$-value)']], alignment=['c', 'p{4in}', 'c'],
                              fname=utl.paperPath + 'tables/gowinda.tex')
    # The result of this expression is discarded.
    map(len, (Genes.index.unique(), bp.index.unique(), df.loc['biological_process']['GO ID'].unique())), len(
        np.intersect1d(bp.index.unique(), df['GO ID'].unique()))
    pval = utl.getPvalFisher(Genes[Genes.ontology == 'biological_process'].index.unique(), bp.index.unique(),
                             df.loc['biological_process']['GO ID'].unique())
    print pval
    # Summary counts; ``stats`` is a local Series from here on.
    stats = pd.Series(None, name='Value')
    # NOTE(review): 'Vatiants' looks like a typo for 'Variants'; it is emitted
    # into the LaTeX table as written.
    stats['Num. of Vatiants'] = scores.size
    stats['Num. of Candidate Intervals'] = regions.shape[0]
    stats['Total Num. of Genes'] = loadGeneCoordinates().shape[0]
    stats['Num. of Variant Genes'] = ann['Gene_ID'].unique().shape[0]
    stats['Num. of Genes within Candidate Intervals'] = intervalGenes.shape[0]
    stats['Total Num. of GO'] = len(loadGeneData().index.unique())
    stats['Num. of GO with 3 or More Genes'] = len(Genes.index.unique())
    stats['Num. of Candidate Variants for Gowinda'] = cands.size
    # Format every count with thousands separators and no decimals.
    stats = stats.apply(lambda x: '{:,.0f}'.format(x))
    stats.index.name = 'Statistic'
    print stats
    utl.DataframetolaTexTable(stats.reset_index(), fname=utl.paperPath + 'tables/stats.tex', alignment=['l', 'r'])