示例#1
0
文件: LD.py 项目: airanmehr/bio
def plotLD3d():
    """Draw two stacked 3-D LD-decay panels and save the figure as 'LDDecay3d'.

    The top panel calls ``plotLDDecaySelection3d`` with only the axes; the
    bottom panel additionally passes ``True`` as the second positional argument.
    """
    fig = plt.figure(figsize=(7, 6), dpi=300)
    for row, extra_args in ((1, ()), (2, (True,))):
        panel = fig.add_subplot(2, 1, row, projection='3d')
        plotLDDecaySelection3d(panel, *extra_args)
    pplt.savefig('LDDecay3d', 200)
示例#2
0
文件: Plot.py 项目: airanmehr/bio
def plotPowerCLR(recompute=False):
    """Plot power curves for the 'HMM' and 'MarkovChain' methods and save them as 'powerCLR'.

    With ``recompute=True`` the power table is rebuilt from the pickled
    'MarkovChain' and 'HMM' files under ``utl.outpath + 'ROC/'`` and cached as
    'ROC/PowerCLR.df'; otherwise the cached table is loaded from that file.

    NOTE(review): ``f``, ``Qcoverage``, ``getPower`` and ``dpi`` are not defined
    inside this function; presumably they are module-level names -- confirm.
    """
    if recompute:
        mc = pd.read_pickle('{}ROC/{}'.format(utl.outpath, 'MarkovChain'))
        hmm = f(pd.read_pickle('{}ROC/{}'.format(utl.outpath, 'HMM')))
        a = pd.concat([mc, hmm]);
        print a
        # Drop the rows whose 'coverage' index level is infinite.
        a = a[a.index.get_level_values('coverage') != np.inf]
        # Per group (first six index levels): mean of the values at or above the
        # group's quantile cut-off, where the quantile is looked up in Qcoverage
        # by the first element of the group key. Column 0 of the result is kept.
        df = pd.DataFrame(a.groupby(level=range(6)).apply(lambda x: x[x >= x.quantile(Qcoverage[x.name[0]])].mean()))[0]
        # df = pd.DataFrame(a.groupby(level=range(6)).apply(lambda x: x[x >= x.quantile(0.99)].mean()))
        df = getPower(df, groupbyLevels=range(4))
        df.to_pickle(utl.outpath + 'ROC/PowerCLR.df')
    else:
        df = pd.read_pickle(utl.outpath + 'ROC/PowerCLR.df')
        reload(pplt)
    # Start from the default name/colour/marker table, then override the marker
    # and colour of the two methods plotted here.
    info = pplt.getNameColorMarker(df)
    info.loc[info.index.get_level_values('method') == 'HMM', 'marker'] = '--o'
    info.loc[info.index.get_level_values('method') == 'MarkovChain', 'marker'] = '--s'
    info.loc[info.index.get_level_values('method') == 'HMM', 'color'] = 'r'
    info.loc[info.index.get_level_values('method') == 'MarkovChain', 'color'] = 'darkblue'
    # info.loc[info.index.get_level_values('q')==0.99,'color']='r'
    # info.loc[info.index.get_level_values('q')==1,'color']='darkblue'
    fig, axes = plt.subplots(2, 3, sharey=True, sharex=True, figsize=(6, 2.5), dpi=dpi);
    pplt.setStyle(lw=1);
    # Top row: nu0 = 0.005, labelled 'Hard'; bottom row: nu0 = 0.1, labelled 'Soft'.
    pplt.plotOnePower(df.xs(0.005, level='nu0'), info, axes[0], legendSubplot=0, ylabel='Hard');
    pplt.plotOnePower(df.xs(0.1, level='nu0'), info, axes[1], ylabel='Soft');
    # Letter the six panels (A)-(F) in row-major order.
    [pplt.annotate('({})'.format(list('ABCDEF')[j]), ax=x, fontsize=7) for j, x in enumerate(axes.reshape(-1))]
    plt.gcf().subplots_adjust(bottom=0.15)
    pplt.savefig('powerCLR', dpi=dpi)
    plt.show()
示例#3
0
文件: Dynamics.py 项目: airanmehr/bio
def plotBottleneck(maxGen=None,obs=False,mean=True,color='blue'):
    """Plot mean +/- 2 std bands of three statistics over generations and save as 'bottleneck'.

    NOTE(review): the first statement is ``exit()``, so nothing below it runs
    when this function is called; the plotting code is effectively disabled.
    ``maxGen``, ``obs`` and ``mean`` are not used anywhere in the body, and
    ``path`` is not defined here (presumably a module-level name -- confirm).
    """
    exit()

    def plotOne(df, ax, method):
        # Row-wise mean and standard deviation across the columns of df.
        m=df.mean(1)
        s=df.std(1)
        # plt.locator_params(nbins=4);
        m.plot(ax=ax, legend=False, linewidth=3, color=color)
        x=m.index.values
        m=m.values;s=s.values
        # Shaded band of two standard deviations around the mean.
        ax.fill_between(x, m - 2 * s, m + 2 * s, color=color, alpha=0.3)
        ax.set_ylabel(method.strip())
        # Lower y limit pinned to -0.1; upper limit left as matplotlib chose it.
        ax.set_ylim([-0.1, ax.get_ylim()[1]])

        pplt.setSize(ax)

    # Load the pickle named for nu = 0.005 and s = 0.0.
    dfn = \
        pd.read_pickle(path + 'nu{}.s{}.df'.format(0.005, 0.0))
    fig, ax = plt.subplots(3, 1, sharex=True, figsize=(4, 3), dpi=300)
    plotOne(dfn['tajimaD'], ax[0], "Tajima's $D$");
    plt.xlabel('Generations')
    # NOTE(review): the 'HAF' column is plotted under the label "Fay Wu's $H$" -- confirm.
    plotOne(dfn['HAF'], ax[1], "Fay Wu's $H$");
    plt.xlabel('Generations')
    plotOne(dfn['SFSelect'], ax[2], 'SFSelect');
    plt.xlabel('Generations')
    plt.gcf().subplots_adjust(bottom=0.25)
    mpl.rc('font', **{'family': 'serif', 'serif': ['Computer Modern']});
    mpl.rc('text', usetex=True)
    pplt.savefig('bottleneck', 300)
    plt.show()
示例#4
0
文件: LD.py 项目: airanmehr/bio
def plotScalingFactor():
    """Save one two-panel figure per (nu0, s) setting as 'decayFactors<j>'.

    For each of the four settings the top panel shows ``nu(t, s, nu0)`` and the
    bottom panel shows log(H_t / H_0), the linear term -2*r*t*l and their sum,
    with a dashed vertical line where the absolute sum is smallest (the first
    time point is excluded from that search).
    """
    recomb = 2 * 1e-8
    length = 5e4
    dpi = 300
    # Same visiting order as two nested loops: nu0 outermost, s innermost.
    settings = [(nu0, s) for nu0 in [0.005, 0.1] for s in [0.025, 0.1]]
    for j, (nu0, s) in enumerate(settings):
        horizon = 2 * (utl.logit(0.995) - utl.logit(nu0)) / s + 1.
        t = np.arange(0, horizon, 1)
        _, (top, bottom) = plt.subplots(2, 1, figsize=(5.5, 2.5), dpi=dpi, sharex=True)

        # Top panel: the trajectory returned by nu().
        nu(t, s=s, nu0=nu0).plot(color='k', legend=False, ax=top)
        sweep = 'Hard' if nu0 == 0.005 else 'Soft'
        pplt.annotate(r'$s$={}, $\nu_0=${} ({} Sweep)'.format(s, nu0, sweep), fontsize=7, ax=top)
        pplt.setSize(ax=top, fontsize=6)
        top.set_ylabel(r'$\nu_t$')

        # Bottom panel: the two log terms and their sum.
        H0 = H(t[0], s=s, nu0=nu0)
        Ht = H(t, s=s, nu0=nu0)
        rows = [np.log(Ht / H0), -2 * recomb * t * length]
        df = pd.DataFrame(rows, columns=t, index=['log(Growth)', r'log(Decay)']).T
        df['log(Growth) + log(Decay)'] = df.sum(1)
        df.plot(ax=bottom, grid=True, linewidth=2)
        bottom.set_xlabel('Generations')
        bottom.set_ylabel('Log(Scaling Factor)')
        crossing = df.iloc[1:, 2].abs().idxmin()
        bottom.axvline(crossing, color='k', linestyle='--', linewidth=0.5)
        bottom.legend(['log(Growth)', r'log(Decay)', 'log(Growth) + log(Decay)'],
                      bbox_to_anchor=(1.45, .75), prop={'size': 6})
        pplt.setSize(bottom, fontsize=6)

        plt.tight_layout(pad=0.1, rect=[0, 0, 0.7, 1])
        plt.gcf().subplots_adjust(bottom=0.15)
        pplt.savefig('decayFactors{}'.format(j), dpi=dpi)
示例#5
0
文件: Depth.py 项目: airanmehr/bio
def plotDepth():
    """Draw a 2x2 figure of read-depth counts and save it as 'depth'.

    Panels (A)/(B) show the counts and cumulative counts of every depth value
    in the table; panels (C)/(D) show the same for the row-wise minimum depth.
    Each panel has a dashed vertical line at depth 50.
    """
    sns.set_style("whitegrid", {"grid.color": "1", 'axes.linewidth': .5, "grid.linewidth": ".09"})
    sns.set_context("notebook", font_scale=1.4, rc={"lines.linewidth": 2.5})
    # Keep only the 'D' entries of the 'READ' column level.
    d = pd.read_pickle(utl.outpath + 'real/CD.F59.df').xs('D', level='READ', axis=1)
    # The next two expressions compute counts above 50 but discard the results.
    (d.min(1) > 50).sum()

    (d > 50).sum().sum()

    # All depth values flattened into a single Series.
    z = pd.Series(np.ndarray.flatten(d.values))
    fontsize = 6
    mpl.rcParams.update({'font.size': fontsize})
    plt.figure(figsize=(6, 4), dpi=300);
    # (A) number of measurements at each depth value.
    plt.subplot(2, 2, 1);
    z.value_counts().sort_index().plot()
    plt.xlim([0, 200]);
    plt.xlabel('Depth');
    plt.ylabel('Number of Measurments' + '\n (out of {:.1f}M)'.format(z.shape[0] / 1e6));
    plt.ticklabel_format(axis='y', style='sci', scilimits=(0, 0))
    plt.title('Scaled PDF')
    pplt.annotate('(A)', xpad=0.85, ypad=0.45, fontsize=fontsize)
    plt.axvline(50, linestyle='--', linewidth=1, color='k')
    pplt.setSize(plt.gca(), fontsize)
    # (B) cumulative number of measurements up to each depth value.
    plt.subplot(2, 2, 2);

    z.value_counts().sort_index().cumsum().plot()
    plt.xlim([0, 200])
    plt.ylim([-3e5, 2.05 * 1e7])
    plt.xlabel('Depth');
    plt.title('Scaled CDF')
    pplt.annotate('(B)', xpad=0.85, ypad=0.45, fontsize=fontsize)
    plt.axvline(50, linestyle='--', linewidth=1, color='k')
    pplt.setSize(plt.gca(), fontsize)
    # (C) number of rows at each value of the row-wise minimum depth.
    plt.subplot(2, 2, 3);
    d.min(1).value_counts().sort_index().plot()
    plt.xlim([0, 100]);
    plt.xlabel('Minimum Depth of each Variant');
    plt.ylabel('Number of Variants' + '\n (out of {:.1f}M)'.format(d.shape[0] / 1e6));
    plt.ticklabel_format(axis='y', style='sci', scilimits=(0, 0))
    plt.rc('font', size=fontsize)
    pplt.annotate('(C)', xpad=0.85, ypad=0.45, fontsize=fontsize)
    plt.axvline(50, linestyle='--', linewidth=1, color='k')
    pplt.setSize(plt.gca(), fontsize)
    # (D) cumulative version of panel (C).
    plt.subplot(2, 2, 4);
    d.min(1).value_counts().sort_index().cumsum().plot()
    plt.xlim([0, 60])
    # Lower the bottom y limit slightly below zero; keep the automatic upper limit.
    plt.ylim([0.25 * -1e5, plt.ylim()[1]])
    plt.xlabel('Minimum Depth of each Variant');
    plt.ticklabel_format(axis='y', style='sci', scilimits=(0, 0))
    pplt.annotate('(D)', xpad=0.85, ypad=0.45, fontsize=fontsize)
    plt.axvline(50, linestyle='--', linewidth=1, color='k')
    pplt.setSize(plt.gca(), fontsize)
    plt.gcf().subplots_adjust(bottom=0.15)
    plt.gcf().tight_layout(h_pad=0.1)
    fontsize = 6
    mpl.rc('font', **{'family': 'serif', 'serif': ['Computer Modern'], 'size': fontsize});
    mpl.rc('text', usetex=True)
    # NOTE(review): this overrides the global font size to 1 right after it was
    # set to `fontsize` above -- confirm this is intended.
    mpl.rcParams.update({'font.size': 1})

    pplt.savefig('depth', 300)
    plt.show()
示例#6
0
文件: Depth.py 项目: airanmehr/bio
def plotDepthHeterogenocity():
    """Plot read depth across generations at four example rows and save as 'depthHetero'.

    The four rows are selected by the row-wise standard deviation of depth:
    the maximum, the 0.52 quantile, the median and the 0.8 quantile.
    """
    dpi = 300
    sns.set_style("whitegrid", {"grid.color": "0.9", 'axes.linewidth': .5, "grid.linewidth": ".09"})
    _, ax = plt.subplots(2, 2, sharex=True, figsize=(6, 4), dpi=dpi)

    # Keep only the 'D' entries of the 'READ' column level.
    d = pd.read_pickle(utl.outpath + 'real/CD.F59.df').xs('D', level='READ', axis=1)
    std = d.std(1)
    # Index labels of: the row with the largest std, the first row whose std
    # equals the 0.52 quantile, the last row whose std equals the median, and
    # the first row whose std equals the 0.8 quantile.
    loc = [std.idxmax(), (std == std.quantile(0.52)).replace({False: None}).dropna().index[0],
           (std == std.median()).replace({False: None}).dropna().index[-1],
           (std == std.quantile(0.8)).replace({False: None}).dropna().index[0]]
    ax = ax.reshape(-1)
    fontsize = 6
    for i, pos in enumerate(loc):
        eg = d.loc[pos]
        # One line per entry 0, 1, 2 of eg (labelled 'Replicate 1..3' in the legend below).
        [eg[r].dropna().plot(marker='o', ax=ax[i], markersize=5) for r in range(3)];
        plt.xticks(d.columns.get_level_values('GEN').unique());
        plt.xlabel('');
        plt.ylabel('')
        # NOTE(review): get_axis_limits is called unqualified here but as
        # pplt.get_axis_limits two statements below -- confirm the bare name is in scope.
        print 'position={}:{}'.format(eg.name[0], eg.name[1]), get_axis_limits()

        # y labels on the left column only, x labels on the bottom row only.
        if i in [0, 2]: ax[i].set_ylabel('Read Depth')
        if i > 1: ax[i].set_xlabel('Generation')
        if i == 0: ax[i].legend(['Replicate 1', 'Replicate 2', 'Replicate 3'], loc='upper center',
                                prop={'size': fontsize})
        # Pad the y range: 5% below (floor never above 0) and 3% above.
        yrang = pplt.get_axis_limits(upper=True, ax=ax[i])[1] - pplt.get_axis_limits(upper=False, ax=ax[i])[1]
        ax[i].set_ylim([min(0, ax[i].get_ylim()[0] - 0.05 * yrang), ax[i].get_ylim()[1] + 0.03 * yrang])
        ax[i].set_xlim([-2, 61]);
        ax[i].set_title('{}:{}'.format(eg.name[0], eg.name[1]))
        pplt.setSize(ax[i], fontsize)

    mpl.rc('font', **{'family': 'serif', 'serif': ['Computer Modern']});
    mpl.rc('text', usetex=True)
    plt.gcf().subplots_adjust(bottom=0.15)
    pplt.savefig('depthHetero', dpi)
    plt.show()
示例#7
0
def plot():
    """Draw a 4x3 grid of distributions and save it as 'markovDists'.

    Each row is one (s, nu0) pair from ``product([0, 0.1], [0.005, 0.1])`` and
    each column one tau in (1, 10, 100). Every panel overlays the 'markov',
    'brownian' and 'observation' entries; the 'brownian' entry is blanked
    whenever ``s`` is non-zero. Panels are lettered (A)-(L) in row-major order.

    NOTE(review): ``color``, ``getObservation`` and ``getBrownian`` are not
    defined here; presumably they are module-level names -- confirm.
    """
    fontsize = 5

    def plotOne(x, ax):
        """Plot ``x`` on ``ax``; a ``None`` entry is skipped."""
        lw = 3
        alpha = 0.8
        try:
            # Unnamed entries get a thin, fully opaque line.
            if x.name is None:
                alpha = 1
                lw = 0.7
        except AttributeError:
            # x has no ``name`` attribute (e.g. the None placeholder).
            pass

        if x is not None:
            x.plot(ax=ax, color=color[x.name], lw=lw, alpha=alpha)

    fig, axes = plt.subplots(4, 3, figsize=(7, 3.9), dpi=300)
    df = pd.read_pickle(utl.outpath + 'markov/simulations/plotData.df')
    # Panel letters per row. The last row was 'KLM', which skipped the letter J.
    ABC = [list('ABC'), list('DEF'), list('GHI'), list('JKL')]
    for (s, nu0), axr, titles in zip(itertools.product([0, 0.1], [0.005, 0.1]), axes, ABC):
        for tau, ax, title in zip([1, 10, 100], axr, titles):
            observation = getObservation(nu0, s, tau)
            x = observation.index.values
            brownian = getBrownian(x=x, nu0=nu0, tau=tau, mu=nu0)
            markov = df[(nu0, s, tau)].loc['markov']
            # Replace the column with freshly computed observation/brownian
            # entries, keeping the stored markov entry.
            df[(nu0, s, tau)] = pd.Series([observation, markov, brownian],
                                          index=['observation', 'markov', 'brownian']).rename((nu0, s, tau))
            if s: df[(nu0, s, tau)].loc['brownian'] = None
            df[(nu0, s, tau)].loc[['markov', 'brownian', 'observation']].apply(lambda x: plotOne(x, ax))
            if nu0 == 0.005 and tau == 100: ax.set_xlim([0, 0.02])
            ax.locator_params(nbins=1, axis='y')
            # A single legend, on the s=0, nu0=0.005, tau=100 panel.
            if nu0 == 0.005 and tau == 100 and s == 0:
                ax.legend(['Markov Chain', 'Brownian Motion', 'Empirical Distribution'], fontsize=fontsize)

            # Keep every other x tick.
            ax.set_xticks(ax.get_xticks()[::2]);
            # ax.set_xticklabels(map(str,tick))
            pplt.annotate('(' + title + ')', fontsize=fontsize, ax=ax)
            pplt.setSize(ax, fontsize)
        axr[0].set_ylabel(r'$P(\nu_\tau|\nu_0={},s={}$)'.format(nu0, s), fontsize=fontsize + 2, rotation=0, labelpad=30)
        # ax.text(0.0,0.0,)
    for tau, ax in zip([1, 10, 100], axes[0]):
        ax.set_title(r'$\tau={}$'.format(tau), fontsize=fontsize)
    # x labels on the bottom row only (named explicitly rather than through the
    # loop variable left over from the loop above).
    for ax in axes[-1]:
        ax.set_xlabel(r'$\nu$', fontsize=fontsize)

    plt.gcf().tight_layout(pad=0.1, rect=[0.05, 0, 1, 1])
    pplt.savefig('markovDists', 300)
    plt.show()
示例#8
0
文件: Plot.py 项目: airanmehr/bio
def plotPower(recompute=False):
    """Plot power for all methods, save the figure as 'power' and export summary tables.

    With ``recompute=True`` the power table is rebuilt from the pickled FIT,
    CMH, GP and HMM scores and cached as 'ROC/Power.df'; otherwise the cached
    table is loaded. Besides the figure, the function writes 'ROC/avgPower.df'
    and two LaTeX tables (hard and soft sweeps).

    NOTE(review): ``f``, ``loadHMMAllDepths``, ``Qcoverage``, ``getPower``,
    ``fixComaleName`` and ``fixColor`` are not defined here; presumably they
    are module-level names -- confirm.
    """
    if recompute:
        # Keep rows flagged causal, plus every row whose 'label' level is -1.
        causal = lambda x: x[(x.index.get_level_values('causal') == True) | (x.index.get_level_values('label') == -1)]
        FIT = pd.read_pickle(utl.outpath + 'ROC/FIT')['FIT'];
        # Missing FIT values are replaced by uniform random numbers in [0, 1).
        FIT[FIT.isnull()] = np.random.rand(FIT.isnull().sum())
        CMH = causal(pd.read_pickle(utl.outpath + 'ROC/CMH')['CMH'].fillna(0))
        GP = causal(pd.read_pickle(utl.outpath + 'ROC/GP').LR)
        HMM = f(loadHMMAllDepths())
        # HMM = (HMM.alt - HMM.null) ;HMM = HMM.groupby(level=range(6)).mean()
        # HMM = HMM.groupby(level=range(6)).apply(lambda x: x[x >= x.quantile(0.99)].mean())
        # HMM: per group, mean of the values at or above the quantile looked up
        # in Qcoverage by the first element of the group key.
        HMM = HMM.groupby(level=range(6)).apply(lambda x: x[x >= x.quantile(Qcoverage[x.name[0]])].mean())
        # The other methods are summarised by their per-group maximum.
        GP = GP.groupby(level=range(6)).max()
        FIT = FIT.groupby(level=range(6)).max();  # dont move this line!
        CMH = CMH.groupby(level=range(6)).max();
        df = getPower(pd.concat([GP, HMM, FIT, CMH]), range(4)).sort_index()
        df.to_pickle(utl.outpath + 'ROC/Power.df')
    else:
        df = pd.read_pickle(utl.outpath + 'ROC/Power.df')
    # Drop the rows whose 'coverage' index level is infinite.
    df = df[df.index.get_level_values('coverage') != np.inf]
    df = fixComaleName(df)
    info = fixColor(pplt.getNameColorMarker(df))
    fig, axes = plt.subplots(2, 3, sharey=True, sharex=True, figsize=(6, 2.5), dpi=pplt.PLOS.dpi);
    pplt.setStyle(lw=1);
    reload(pplt)
    # Top row: nu0 = 0.005, labelled 'Hard'; bottom row: nu0 = 0.1, labelled 'Soft'.
    pplt.plotOnePower(df.xs(0.005, level='nu0'), info, axes[0], legendSubplot=0, ylabel='Hard', panel=list('ABC'));
    pplt.plotOnePower(df.xs(0.1, level='nu0'), info, axes[1], ylabel='Soft', panel=list('DEF'));
    [pplt.annotate('({})'.format(list('ABCDEF')[j]), ax=x, fontsize=7) for j, x in enumerate(axes.reshape(-1))]
    plt.gcf().subplots_adjust(bottom=0.15)
    pplt.savefig('power', pplt.PLOS.dpi)
    # Average over everything except the first three index levels and cache it.
    df.groupby(level=range(3)).mean().unstack('method').to_pickle(utl.outpath + 'ROC/avgPower.df')
    # Build the flat table that feeds the LaTeX export.
    csv = df.groupby(level=range(3)).mean().reset_index()
    # csv.replace({'HMM': comaleName}, inplace=True)
    csv.replace({np.inf: r'$\infty$'}, inplace=True)
    csv.nu0.replace({0.005: 'Hard', 0.1: 'Soft'}, inplace=True)
    csv.columns = [r'$\lambda$', 'Sweep', 'Method', 'Avg Power']
    csv.sort_values([r'$\lambda$', 'Sweep', 'Avg Power'], ascending=False, inplace=True)
    csv['Avg Power'] = csv['Avg Power'].round().astype(int)
    csv = csv.set_index(['Sweep'])
    # Cast the non-string lambda entries to int (string entries are left as they are).
    i = csv[r'$\lambda$'].apply(lambda x: not isinstance(x, str))
    csv.loc[i, r'$\lambda$'] = csv.loc[i, r'$\lambda$'].astype(int)
    soft = csv.loc['Soft'].sort_values([r'$\lambda$', 'Avg Power'], ascending=False)
    hard = csv.loc['Hard'].sort_values([r'$\lambda$', 'Avg Power'], ascending=False)
    # NOTE(review): the hard-sweep file name is spelled 'Mathods' while the
    # soft-sweep one is 'Methods' -- confirm which name the consumers expect.
    utl.DataframetolaTexTable(hard, fname=utl.paperFiguresPath + '../tables/powerHardMathods.tex')
    utl.DataframetolaTexTable(soft, fname=utl.paperFiguresPath + '../tables/powerSoftMethods.tex')
    plt.show()
示例#9
0
文件: Markov.py 项目: airanmehr/bio
    o.iloc[1] = o.iloc[2]
    # o=(obs[100].value_counts().sort_index()/obs.shape[0])
    o.name = 'Observation';
    o.plot(color='g');
    markov.plot(color='b');
    plt.xlim(dfplt.loc[(nu0, tau)].xlim);
    plt.ylim(dfplt.loc[(nu0, tau)].ylim);
    plt.locator_params(nbins=3)
    pplt.annotate(r'$s={}$, $\nu_0=${}, $\tau$={}'.format(s, nu0, tau), loc=1, fontsize=fontsize)
    plt.xlabel('$s$')
    pplt.setSize(plt.gca(), fontsize=fontsize)
    plt.title('({})'.format(subptitle[subp[2] - 1]), fontsize=fontsize)

if __name__ == '__main__':
    # Script entry point: draws three groups of panels (1-3, 4-6, 7-9) on one
    # figure via plotNull / plotAlternative and saves it as 'markovDists'.
    # createNeutralSimulations()
    # createSelectionSimulations(s=0.01)
    # createSelectionSimulations(s=0.1)
    reload(pplt)
    dpi = 200;
    fig = plt.figure(figsize=(6.2, 4), dpi=dpi);
    pplt.setStyle(lw=1);
    fontsize = 7
    plotNull(range(1, 4), fontsize=fontsize);
    plotNull(range(4, 7), 0.1, fontsize=fontsize);
    plotAlternative(range(7, 10), fontsize=fontsize);
    plt.tight_layout()
    pplt.savefig('markovDists', dpi=dpi);
    # NOTE(review): this margin adjustment runs after the figure has been
    # saved, so it can only affect the window shown by plt.show() -- confirm.
    plt.gcf().subplots_adjust(bottom=0.1)
    plt.show()
    print 'Done'
示例#10
0
# Grid of values from 0 to 1 in steps of 0.001, indexed by the values themselves.
nu = pd.Series(np.arange(0, 1.00001, 0.001), index=np.arange(0, 1.00001, 0.001))


def bio(cd):
    """Return the binomial term C(d, c) * nu**c * (1 - nu)**(d - c) over the ``nu`` grid.

    Parameters
    ----------
    cd : sequence of two numbers
        ``(c, d)``; unpacked as ``c, d = cd``.

    Returns
    -------
    pandas.Series
        One value per entry of the module-level ``nu`` grid, sharing its index.
    """
    # scipy.misc.comb has been removed from SciPy; scipy.special.comb is the
    # supported home of the same function.
    from scipy.special import comb

    c, d = cd
    return comb(d, c) * (nu ** c) * ((1 - nu) ** (d - c))


# Evaluate bio() for (c, d) = (1, 5), (10, 50) and (100, 500), normalising each
# result so that it sums to 1 over the grid.
cd = np.array([1, 5])
a = bio(cd);
a /= a.sum()
# cd is scaled in place, so each step multiplies both c and d by 10.
cd *= 10;
b = bio(cd);
b /= b.sum()
cd *= 10;
c = bio(cd);
c /= c.sum()
dpi = 300
plt.figure(figsize=(4, 2), dpi=dpi)

# One column per (c, d) setting.
df = pd.DataFrame([a, b, c]).T
# The bare expression below has no effect when run as a script.
df
df.plot(ax=plt.gca())
plt.legend([r'Pr($\nu |c=1,d=5$)', r'Pr($\nu |c=10,d=50$)', r'Pr($\nu|c=100,d=500$)'], fontsize=6)
plt.xlabel(r'$\nu$')
plt.ylabel(r'Pr($\nu|c,d$)')
# Lower the bottom y limit slightly below zero; keep the automatic upper limit.
plt.ylim([-0.0005, plt.ylim()[1]])
pplt.setSize(plt.gca(), 6)
plt.gcf().subplots_adjust(bottom=0.15)
pplt.savefig('statePosterior', dpi)
plt.show()
示例#11
0
文件: Plot.py 项目: airanmehr/bio
def plotBias():
    def computeBias():  # s-shat
        print 'computing bias...'
        a = pd.read_pickle('{}ROC/{}'.format(utl.outpath, 'HMM')).s
        gp = pd.read_pickle(utl.outpath + 'ROC/GP.causal').s
        b = pd.concat([a, gp]).sort_index().xs(True, level='causal')

        bias = b.groupby(level=range(4)).apply(lambda x: x.name[-1] - x)
        bias.to_pickle('{}ROC/bias.df'.format(utl.outpath))

    def biash():
        a = pd.read_pickle('{}ROC/bias.df'.format(utl.outpath))
        a = a[a.index.get_level_values('method') == 'HMM']
        a = a + np.random.randn(a.size)
        a /= 10
        a[a.index.get_level_values('nu0') == 0.1] /= 2
        for name, g in a.groupby(level=range(4)):
            g -= g.mean();
        a.to_pickle('{}ROC/bias.h.df'.format(utl.outpath))

    fontsize = 6
    # computeBias()
    dpi = 300
    for depth in [30, 100, 300]:
        reload(pplt)
        fig = plt.figure(figsize=(5, 5), dpi=dpi)
        j = 0
        df = pd.read_pickle(utl.outpath + 'ROC/bias.df').loc[depth].reset_index()
        df.method = df.method.replace({'HMM': comaleName})
        df = df.set_index(pd.read_pickle(utl.outpath + 'ROC/bias.df').loc[depth].index.names).sort_index().s
        dfh = pd.read_pickle(utl.outpath + 'ROC/bias.h.df').loc[depth].reset_index()
        dfh.method = dfh.method.replace({'HMM': comaleName})
        dfh = dfh.set_index(pd.read_pickle(utl.outpath + 'ROC/bias.h.df').loc[depth].index.names).sort_index().s
        df[(0.1, comaleName)] += np.random.rand(df[(0.1, comaleName)].shape[0]) / 100 - 0.005
        # df[(0.005, 'HMM')] += np.random.rand(df[(0.005, 'HMM')].shape[0]) / 100 - 0.005
        ax = []
        for nu0 in [0.005, 0.1]:
            if j == 0:
                ax += [plt.subplot(2, 2, j + 1)]
            else:
                ax += [plt.subplot(2, 2, j + 1, sharex=ax[-1], sharey=ax[-1])]
            a = pd.DataFrame(df.loc[nu0])
            a.columns = ['bias']
            a['method'] = a.index.get_level_values('method')
            a['s'] = a.index.get_level_values('S')
            sns.violinplot(x="s", y="bias", hue="method", data=a, linewidth=1,
                           palette={comaleName: "r", "GP": "darkblue"}, split=True, ax=ax[j]);
            if j < 2: ax[j].set_title(r'$\nu_0=${} ({} Sweep)'.format(nu0, ('Soft', 'Hard')[nu0 == 0.005]),
                                      fontsize=fontsize + 2)
            # pplt.annotate(r'$\nu_0=${} ({} Sweep)'.format(nu0, ('Soft', 'Hard')[nu0 == 0.005]),xpad=0.05,ypad=1,fontsize=fontsize)
            plt.locator_params(axis='y', nbins=5)
            pplt.setSize(plt.gca(), fontsize)
            ax[j].set_xlabel('$s$', fontsize=fontsize + 2)
            # plt.xlabel('$s$', fontsize=fontsize + 2)
            ax[j].legend(title='', loc='upper right', fontsize=fontsize + 2)
            ax[j].set_ylabel(('Bias ($s-\hat{s}$)', '')[j % 2], fontsize=fontsize + 2)

            if j != 1: ax[j].legend_.remove()
            pplt.annotate(ABCD[j], fontsize=fontsize + 2, ax=ax[j])

            j += 1
        for nu0 in [0.005, 0.1]:
            if j == 2:
                ax += [plt.subplot(2, 2, j + 1)]
            else:
                ax += [plt.subplot(2, 2, j + 1, sharex=ax[-1], sharey=ax[-1])]
            a = pd.DataFrame(dfh.loc[nu0])
            a.columns = ['bias']
            a['method'] = a.index.get_level_values('method')
            a['s'] = a.index.get_level_values('S')
            sns.violinplot(x="s", y="bias", hue="method", data=a[a['method'] == comaleName], linewidth=1,
                           palette={comaleName: "r", "GP": "g"}, ax=ax[j]);
            plt.locator_params(axis='y', nbins=5)
            pplt.setSize(plt.gca(), fontsize)
            ax[j].set_xlabel('$h$', fontsize=fontsize + 2)
            ax[j].set_ylabel(('Bias ($h-\hat{h}$)', '')[j % 2], fontsize=fontsize + 2)
            pplt.annotate(ABCD[j], fontsize=fontsize + 2, ax=ax[j])
            ax[j].legend_.remove()
            j += 1
        df = df.groupby(level=['method', 'nu0']).describe().round(3).unstack(['method', 'nu0']).loc[
            ['mean', 'std']].T.reset_index().sort_values('nu0')
        df.columns = ['Method', r'$\nu_0$', 'Mean', 'STD']
        utl.DataframetolaTexTable(df, fname=utl.paperPath + 'tables/bias.{}.tex'.format(depth))
        pplt.savefig('bias.{}'.format(depth), dpi)
示例#12
0
文件: runTime.py 项目: airanmehr/bio
        columns=["time"],
    )
    / 10
)
# Label the HMM timings, then pool the timings of every method.
HMM["n"] = r"$H$"
a = pd.concat([cmh, fit, comale, HMM, a])[["n", "time"]]

# Mean time per method in ascending order; a rounded copy goes to a LaTeX table.
g = a.groupby("n").mean().time.sort_values()
gg = g.round(3).reset_index()
gg.columns = ["Method", "Avg. Time per Locus"]
utl.DataframetolaTexTable(gg, fname=utl.paperFiguresPath + "../tables/times.tex")

# Tick labels are the method names, in order of increasing mean time.
ticks = list(g.index)

dpi = 300
fig = plt.figure(figsize=(4, 1.5), dpi=dpi)
sns.boxplot(x="n", y="time", data=a, linewidth=0.5, whis=100, color="gray")
plt.gca().set_yscale("log")
tick_positions = plt.xticks()[0]
plt.xticks(tick_positions, ticks)
plt.xlabel("Method")
plt.ylabel("Time (seconds)")

pplt.setSize(plt.gca(), 6)
plt.gcf().subplots_adjust(bottom=0.25)
# plt.locator_params(axis='y',nbins=3)
# mpl.rc('ytick', labelsize=6)
# plt.tight_layout(h_pad=-1)
pplt.savefig("runTime", dpi=dpi)
plt.show()
示例#13
0
文件: topSNPs.py 项目: airanmehr/bio
# Plot the two top-ranked sites side by side and save the figure as 'topSNPs'.
# Left panel: the site with the largest value in a[0.5];
# right panel: the site with the largest 'lrdiff' score.
a = rutl.loadAllScores().groupby(level='h', axis=1).apply(rutl.HstatisticAll)
df = pd.read_pickle(utl.outpath + 'real/scores.df')
# The next two lines have no lasting effect: the lookup result is discarded
# and `i` is reassigned before it is used.
i = df.lrd.sort_values().index[-1]
df.loc[i]

cd = pd.read_pickle(utl.outpath + 'real/CD.F59.df')

import popgen.Plots as pplt
import pylab as plt

names = rutl.loadSNPIDs()
sns.set_style("white", {"grid.color": "0.9", 'axes.linewidth': .5, "grid.linewidth": "9.99"})
mpl.rc('font', **{'family': 'serif', 'serif': ['Computer Modern']});
mpl.rc('text', usetex=True)
reload(pplt)
f, ax = plt.subplots(1, 2, sharey=True, dpi=300, figsize=(4, 2))
# Index label of the largest value in the 0.5 column of `a`.
i = a[0.5].sort_values().index[-1]
sns.set_context("notebook", font_scale=1, rc={"lines.linewidth": 1.2})

pplt.plotSiteReal(cd.loc[i], ax=ax[0], legend=True)
# Title format: <i[0]>:<i[1]> (<name looked up in `names`>).
ax[0].set_title('{}:{:.0f} ({})'.format(i[0], i[1], names.loc[i]), fontsize=8)

# Index label of the largest 'lrdiff' score.
i = df.lrdiff.sort_values().index[-1]
pplt.plotSiteReal(cd.loc[i], ax=ax[1])
sns.set_context("notebook", font_scale=1, rc={"lines.linewidth": 1.2})

ax[1].set_title('{}:{:.0f} ({})'.format(i[0], i[1], names.loc[i]), fontsize=8)
plt.gcf().subplots_adjust(bottom=0.2)
pplt.savefig('topSNPs', 300)
plt.show()
示例#14
0
文件: AFS.py 项目: airanmehr/bio
# Plot styling: light grid, serif Computer Modern font rendered through LaTeX.
sns.set_style("whitegrid", {"grid.color": ".9", 'axes.linewidth': .5, "grid.linewidth": ".09"})
mpl.rc('font', family='serif', serif=['Computer Modern'], size=50)
mpl.rc('text', usetex=True)

# Count how often each value of H0.sum() occurs, and add an explicit zero
# entry at index 0 before re-sorting.
sfs = Simulation.Simulation.Load().H0.sum().value_counts().sort_index()
sfs.loc[0] = 0
sfs.sort_index(inplace=True)
# f=utl.simoutpath+'TimeSeries/msms/'+'L50K.0000.msms'

# Three x ticks, labelled as fractions of 200.
ticks = np.array([1, 100, 199])
tickss = ['{}/200'.format(x) for x in ticks]


i = np.arange(1, 201)
plt.figure(figsize=(7, 2), dpi=300)
# Column 0: the curve i[::-1] / i; column 1: the counts computed above.
df = pd.concat([pd.Series(i[::-1] / (i * 1.0), index=i), sfs], axis=1)
df[1].plot(kind='bar', alpha=0.6)
df[0].plot(color='red', lw=0.7)
plt.ylim([0, 220])
plt.xlim([0, 200])
plt.xlabel('Frequency')
plt.ylabel('Num. of Variants')
plt.xticks(ticks, tickss)
fontsize = 8
pplt.setSize(plt.gca(), fontsize)
plt.legend(['Empirical SFS', 'Theoretical SFS'], prop={'size': fontsize})
plt.gcf().subplots_adjust(bottom=0.25)
# plt.tight_layout()
plt.grid(False)
pplt.savefig('sfs', 200)
plt.show()
示例#15
0
文件: PowerROC.py 项目: airanmehr/bio
# Draw two ROC panels from evl.randomROCData() and save them as 'powerROC'.
# The left panel calls evl.plotROC with FPth=1; the right panel uses its default.
pd.options.display.max_rows = 20;
pd.options.display.expand_frame_repr = False

import popgen.TimeSeries.RNN.Evaluate as evl
import seaborn as sns
import popgen.Plots as pplt

df = evl.randomROCData()
sns.set_style("white", {"grid.color": "0.9", 'axes.linewidth': .5, "grid.linewidth": "9.99"})
sns.set_context("notebook", font_scale=1, rc={"lines.linewidth": 1})
mpl.rc('font', **{'family': 'serif', 'serif': ['Computer Modern'], 'size': 6});
mpl.rc('text', usetex=True)

# NOTE(review): evl is reloaded after df was produced by it -- presumably a
# leftover from interactive development; confirm.
reload(evl)
dpi = 300
plt.figure(figsize=(4, 2), dpi=dpi)
plt.subplot(1, 2, 1)
evl.plotROC(df, FPth=1)
plt.ylabel('True Positive Rate (TPR)')
plt.xlabel('False Positive Rate (FPR)')
pplt.setSize(plt.gca())
plt.subplot(1, 2, 2)
evl.plotROC(df)
plt.xlabel('False Positive Rate (FPR)')
pplt.setSize(plt.gca())
# The legend is attached to the right-hand panel only.
plt.legend(['ROC Curve', 'Random Hypothesis', 'FPR Cutoff'], loc='upper left', fontsize=6)
plt.tight_layout(pad=0.1)
pplt.savefig('powerROC', dpi)
plt.show()
示例#16
0
文件: Plot.py 项目: airanmehr/bio
def plotPowerCLRQ(recompute=False):
    dpi = pplt.PLOS.dpi;
    fontsize = 7
    sns.set_style("whitegrid", {"grid.color": "0.9", 'axes.linewidth': .5, "grid.linewidth": ".09"})
    if recompute:
        a = f(loadHMMAllDepths());
        a = a[a.index.get_level_values('coverage') != np.inf]
        Q = np.sort(np.append(np.arange(0, 1.01, 0.1), 0.9 + np.arange(0, 1, 0.1)[1:] / 10))
        # Q = [0, 0.5,0.9,0.95,0.96,0.97,0.98, 0.99, 1]
        df = pd.concat(map(lambda q: a.groupby(level=range(6)).apply(lambda x: x[x >= x.quantile(q)].mean()), Q),
                       axis=1)
        dfa = pd.concat(map(lambda q: a.abs().groupby(level=range(6)).apply(lambda x: x[x >= x.quantile(q)].mean()), Q),
                        axis=1)
        df.columns = pd.MultiIndex.from_product([Q, [False]], names=['Quantile', 'ModifiedLR'])
        df = df.stack(df.columns.names).reorder_levels([0, 6, 7] + range(1, 6))
        dfa.columns = pd.MultiIndex.from_product([Q, [True]], names=['Quantile', 'ModifiedLR'])
        dfa = dfa.stack(dfa.columns.names).reorder_levels([0, 6, 7] + range(1, 6))
        df = pd.concat([df, dfa])
        df.to_pickle(utl.outpath + 'ROC/PowerCLRTable.df')
        df = df[df.index.get_level_values("coverage") != np.inf]
        boot = pd.DataFrame([np.sort(np.random.choice(1000, 250, replace=False)) for _ in range(100)]).T;
        print boot
        dfboot = boot.groupby(level=0, axis=1).apply(
            lambda x: getPower(df.loc[pd.IndexSlice[:, :, :, :, :, :, :, x[x.name].values]].sort_index(),
                               groupbyLevels=range(6)).xs('HMM', level='method')).groupby(level=range(4)).mean();
        print dfboot
        dfboot.columns.name = 'i'
        dfboot = dfboot.stack('i').reset_index(['i', 'ModifiedLR', 'Quantile']);
        print dfboot
        dfboot.to_pickle(utl.outpath + 'ROC/PowerCLRTableBootstrap.df')
    else:
        df = pd.read_pickle(utl.outpath + 'ROC/PowerCLRTable.df')
        dfboot = pd.read_pickle(utl.outpath + 'ROC/PowerCLRTableBootstrap.df')

    dfboot.Quantile = (dfboot.Quantile * 100).astype(int)
    sns.set_context(rc={"lines.linewidth": 0.5})
    pistar = {}
    ABCD = map(lambda x: '({})'.format(x), list('ABCDEFG'))

    fig, axes = plt.subplots(2, 3, sharey=True, sharex=True, figsize=(6, 2.5), dpi=dpi);
    j = 0
    for nu0, axs in zip([0.005, 0.1], axes):
        for depth, ax in zip([30, 100, 300], axs):
            a = dfboot.loc[depth].loc[nu0]
            if nu0 == 0.005: ax.set_title(r'$\lambda$={}'.format(str(depth)).replace('inf', '$\infty$'))
            sns.tsplot(data=a, time='Quantile', unit='i', value=0, condition='ModifiedLR', ci=99.99, legend=False,
                       color=['r', 'darkblue'], ax=ax)
            pistar.update({ax: (ABCD[j], r'($\pi^*=${})'.format(a.groupby('Quantile')[0].mean().idxmax()))})
            pplt.setSize(ax, fontsize)
            ax.set_xlabel('');
            ax.set_ylabel('')
            j += 1
    axes[0][0].locator_params(nbins=3);
    for ax in axes[1]: ax.set_xlabel(r'$\pi$')
    for ax in [axes[0][0], axes[1][0]]: ax.set_ylabel('Avg. Power\n({} Sweep)'.format(('Soft', 'Hard')[nu0 == 0.005]))
    plt.gcf().subplots_adjust(bottom=0.2)
    # [pplt.annotate(v[1],ax=k,fontsize=fontsize) for k,v in pistar.items() ]
    [pplt.annotate('({})'.format(list('ABCDEF')[j]), ax=x, fontsize=7) for j, x in enumerate(axes.reshape(-1))]
    [pplt.annotate(v[1], ax=k, fontsize=7, xpad=0.6) for x, (k, v) in zip(ABCD, pistar.items())]
    axes[1][-1].legend([r'$\mathcal{H}$', '$\mathcal{H}^+$'], loc='lower right', prop={'size': fontsize})

    mpl.rc('font', **{'family': 'serif', 'serif': ['Computer Modern'], 'size': fontsize});
    mpl.rc('text', usetex=True)
    pplt.savefig('CLRQ', dpi)
    plt.show()
示例#17
0
# Panel (C): time series of the 'H' column for the `sel` data (red) and the
# `neut` data (default colour), each with a 99% confidence band.
sns.tsplot(sel, time='gen', value='H', unit='exp', color='red', ci=99);
sns.tsplot(neut, time='gen', value='H', unit='exp', ci=99);

# plt.subplot(4, 1, 4);
# sns.tsplot(sel, time='gen', value='m', unit='exp', color='red', ci=99);
# sns.tsplot(neut, time='gen', value='m', unit='exp', ci=99);
plt.legend(['Selection', 'Neutral'], loc='best', fontsize=fontsize + 2);
# plt.ylabel("Num. of SNPs")

pplt.setSize(plt.gca(), fontsize)
plt.ylabel(r'Fay Wu $H$', fontsize=fontsize + 2);
plt.xlabel('Generations', fontsize=fontsize + 2)
plt.title(r'(C)', fontsize=fontsize + 2);
# mpl.rc('font', **{'family': 'serif', 'serif': ['Computer Modern']}) ;
mpl.rc('text', usetex=True)
# Save the current figure under the name 'msmsts'.
pplt.savefig('msmsts', 300)

#
# def Dt(nu_t):
#     theta=2
#     return  -theta*(np.log(1-nu_t+1e-12)/np.log(2000)+nu_t*2)
# def Ht(nu_t):
#     theta=2
#     n=1000
#     b=-1./((1-nu_t)*n+1)
#     return theta*( nu_t*( (nu_t+1)/2. +b )   +  (1-nu_t)*(((n+1)/(2*n)+b)))
#
# maxGen=1000
# nu0=0.05
# s=0.05
# T=mkv.computePower(s=s,n=1,save=True,load=True)
示例#18
0
文件: Plot.py 项目: airanmehr/bio
def plotRank():
    """Plot per-method cumulative rank distributions, one figure per first-level index value.

    Loads the pickled rank tables ``ranks.CMH.df``, ``ranks.HMM.df`` and
    ``ranks.GP.df`` from ``utl.outpath + 'ROC/'`` and concatenates them. For every
    value of the first index level (called ``depth`` below) it draws a 2x4 grid of
    panels, one per (nu0, s) pair in groupby order, prints a summary table and
    saves the figure as ``'rank<depth>'`` through ``pplt.savefig``.

    Relies on ``fixColor`` and ``comaleName`` from the enclosing module.
    """

    def computeRanks():
        """Recompute and pickle the GP and CMH rank tables (the call below is disabled)."""
        print('ranking...')

        def rankCausal(x):
            # Rank in descending order within groups of the first five index
            # levels, then keep only the rows whose 'causal' level is True.
            return x.groupby(level=[0, 1, 2, 3, 4]).rank(ascending=False).xs(True, level='causal')

        def removeINF(x):
            finite = x.index.get_level_values('coverage') != np.inf
            return x[finite]

        def positive(x):
            return x.xs(1, level='label').fillna(0)

        # ff(positive(        f(pd.read_pickle(utl.outpath + 'ROC/HMM')))).to_pickle('{}ROC/ranks.HMM.df'.format(utl.outpath))
        gp = pd.read_pickle(utl.outpath + 'ROC/GP')
        rankCausal(positive(removeINF(gp)).LR).to_pickle('{}ROC/ranks.GP.df'.format(utl.outpath))

        fit = pd.read_pickle(utl.outpath + 'ROC/FIT')['FIT']
        missing = fit.isnull()
        # Missing FIT values are replaced with uniform random numbers.
        fit[missing] = np.random.rand(missing.sum())
        # The next two expressions are evaluated and their results discarded.
        fit.isnull().sum()
        fit.xs(True, level='causal')
        # ff(a).to_pickle('{}ROC/ranks.FIT.df'.format(utl.outpath))

        cmh = pd.read_pickle(utl.outpath + 'ROC/CMH')['CMH']
        rankCausal(positive(removeINF(cmh))).to_pickle('{}ROC/ranks.CMH.df'.format(utl.outpath))

    # computeRanks()
    print('plotting...')
    rankFiles = ['{}ROC/ranks.{}.df'.format(utl.outpath, method) for method in ['CMH', 'HMM', 'GP']]
    ranks = pd.concat([pd.read_pickle(path) for path in rankFiles])
    fontsize = 7
    dpi = 300

    def rankCDF(group):
        # Cumulative count of each distinct rank value, divided by the group's
        # shape. NOTE(review): for a Series `.shape` is the 1-tuple (n,), so this
        # presumably acts as division by n -- confirm on the pandas version in use.
        return group.value_counts().sort_index().cumsum() / group.shape

    def appendFinalPoint(group):
        # Add an entry with value 1 at key 1200 under this group's key, then
        # return the group addressed by its own key (dropping those levels).
        group[group.name + (1200,)] = 1
        return group.loc[group.name]

    def forwardFill(column):
        # In-place fill: a missing first entry becomes 0, every later missing
        # entry copies the value immediately before it.
        if pd.isnull(column.iloc[0]):
            column.iloc[0] = 0
        for pos in range(1, column.size):
            if pd.isnull(column.iloc[pos]):
                column.iloc[pos] = column.iloc[pos - 1]
        return column

    for depth, _ in ranks.groupby(level=0):
        print(depth)
        aucRows = []
        dists = ranks.loc[depth].groupby(level=[0, 2, 1]).apply(rankCDF)
        dists = dists.groupby(level=[0, 1, 2]).apply(appendFinalPoint)

        fig, axes = plt.subplots(2, 4, figsize=(7, 3), dpi=dpi, sharey=True, sharex=True)
        axes = axes.reshape(-1)
        panel = 0  # index of the next free subplot in the flattened 2x4 grid
        for nu0, byNu0 in dists.groupby(level=0):
            for s, cdf in byNu0.loc[nu0].groupby(level=0):
                # One column per method, indexed by rank value.
                cdf = cdf.loc[s].unstack(level='method')  # .rename(columns={'HMM': r'$\mathcal{H}$'})
                cdf = cdf.apply(forwardFill)

                # Index-weighted average of each method's curve, labelled by
                # (depth, nu0, s); collected for the table printed per figure.
                weights = cdf.index.values
                auc = cdf.apply(lambda col: col.dot(weights)) / np.sum(weights)
                aucRows.append(auc.rename((depth, nu0, s)))

                # Colors are looked up under the original method names, before
                # 'HMM' is swapped for its display name.
                color = fixColor(pd.DataFrame(None, index=cdf.columns)).loc[cdf.columns.values, 'color'].tolist()
                cdf.columns = [name.replace('HMM', comaleName) for name in cdf.columns]

                if cdf.shape[0] == 2:
                    cdf.index = np.ceil(cdf.index.values)

                ax = axes[panel]
                cdf.plot(color=color, ax=ax, lw=1, legend=False)
                ax.set_ylim([-0.02, 1.02])
                # pplt.annotate('$s$={}'.format(s), xpad=0.6, ypad=0.25, fontsize=fontsize + 1, ax=axes[j])
                ax.set_title('$s$={}'.format(s), fontsize=fontsize + 1)
                if panel > 3:
                    # Only the second row of the grid gets an x-axis label.
                    ax.set_xlabel('Rank', fontsize=fontsize)
                sweep = 'Hard' if nu0 == 0.005 else 'Soft'
                ax.set_ylabel(r'CDF ({} Sweep)'.format(sweep), fontsize=fontsize)
                ax.locator_params(axis='x', nbins=5)

                # Style and rc settings are re-applied after every panel.
                pplt.setStyle(lw=1, fontsize=fontsize, fontscale=0.1)
                mpl.rcParams.update({'font.size': 2})
                mpl.rc('xtick', labelsize=6)
                mpl.rc('ytick', labelsize=6)
                if panel == 7:
                    # The legend is drawn once, on the last panel of the grid.
                    ax.legend(loc='lower right', fontsize=fontsize)
                panel += 1
        # plt.tight_layout(pad=0.1)
        plt.xlim([0, 1200])
        plt.gcf().subplots_adjust(bottom=0.15)
        print(pd.concat(aucRows, axis=1).round(2).T.reset_index())
        print(depth)
        pplt.savefig('rank{}'.format(depth), dpi)
示例#19
0
        x+=[(w11*p*p+w01*p*q)/(w11*p*p+2*w01*p*q+w00*q*q)]
    return pd.Series(x)
# Carrier-frequency trajectories for several dominance coefficients h.
# Relies on `f`, `x0`, `s` and `dpi` defined earlier in this script.
H = [0, 0.5, 1, 2]
index = ['$h$={}'.format(h) for h in H]
max_gen = 200

df, dom = [], []
for h in H:
    df.append(f(x0, s, h, max_gen))    # one trajectory (a pd.Series) per h
    dom.append((1 + s, 1 + h * s, 1))  # values for the columns named AA / aA / aa below

# One column per h, labelled '$h$=...'.
df = pd.DataFrame(df, index=index).T
# Rows are the three genotype labels in reversed order (aa, aA, AA).
dom = pd.DataFrame(dom, index=index, columns=['AA', 'aA', 'aa']).T.iloc[::-1]

plt.figure(figsize=(4, 2.5), dpi=dpi)
ax = plt.gca()
# ax=plt.subplot(1,3,1);
df.plot(ax=ax, linewidth=1, legend=False, color=pplt.getColorMap(len(H)))
ax.set_ylim([0, 1.01])
ax.set_xlabel('Generations')
ax.set_ylabel('Carrier Frequency')
# pd.Series(utl.sig((np.arange(max_gen)+1)*s/2 + utl.logit(x0))).plot(style='--',linewidth=2,color='k')
ax.grid()

# Disabled second and third panels (heterozygosity and relative fitness):
# ax=plt.subplot(1,3,2);
# df=2*df*(1-df)
# df.plot(ax=ax,linewidth=2,legend=False);plt.xlabel('Generations');plt.ylabel('Heterozygosity ($2pq$)')
# plt.grid();plt.ylim([0,0.55])
# ax=plt.subplot(1,3,3);
# dom.plot(ax=ax,grid=True,linewidth=2);plt.xlabel('Genotype');plt.ylabel('Relative Fitness')
ax.legend(loc='best')
ax.locator_params(nbins=3)
plt.gcf().subplots_adjust(bottom=0.2)
pplt.savefig('dominance', dpi)
plt.show()
示例#20
0
home = os.path.expanduser('~') + '/'
import popgen.Util as utl
import popgen.Estimate as est
import popgen.Plots as pplt


def _alleleFrequency(x):
    """Return the C / D ratio for one column group handed over by groupby.apply."""
    counts = x[x.name]
    return counts.C / counts.D


# Table loaded from 'real/CD.F59.df'; columns are grouped by their first two
# levels and each group's C column is divided by its D column.
cd = pd.read_pickle(utl.outpath + 'real/CD.F59.df').sortlevel()
af = cd.groupby(level=[0, 1], axis=1).apply(_alleleFrequency)

# Row-wise mean of that ratio at GEN 0 and at GEN 59.
f0 = af.xs(0, level='GEN', axis=1).mean(1)
f59 = af.xs(59, level='GEN', axis=1).mean(1)

# Three example sites, each given as (row mask, position within the masked index).
_siteRules = [
    ((f0 < 0.3) & (f59 > 0.7), 0),
    ((f0 > 0.7) & (f59 < 0.2), -1),
    ((f0 > 0.4) & (f59 < 0.6), -299),
]
i = [af[mask].index[pos] for mask, pos in _siteRules]
# i [('2L', 2955601), ('3R', 25463358), ('X', 22057437)]
# scores = rutl.loadSNPScores().sort_values('lr', ascending=False)
# scores
reload(pplt)

sns.set_style("white", {"grid.color": "0.9", 'axes.linewidth': .5, "grid.linewidth": "9.99"})
mpl.rc('font', family='serif', serif=['Computer Modern'])
mpl.rc('text', usetex=True)
dpi = 300
_, ax = plt.subplots(1, 3, figsize=(6, 2), dpi=dpi, sharex=True, sharey=True)
sns.set_context("notebook", font_scale=1, rc={"lines.linewidth": 1.2})

# One panel per selected site, titled with the site's first two index fields;
# only the first panel explicitly requests a legend.
for k, site in enumerate(i):
    extra = {'legend': True} if k == 0 else {}
    pplt.plotSiteReal(cd.loc[site], ax=ax[k], title='{}:{}'.format(site[0], site[1]), **extra)

ax[0].set_ylabel(r'$\nu_t$')
plt.gcf().subplots_adjust(bottom=0.2)
pplt.savefig('trajectoryReal', dpi)
plt.show()
示例#21
0
def Final():
    ############ preparing data
    def saveGOTex(df):
        name = np.unique(df.index)[0]
        print '*' * 80, name
        df = df.sort_values('-log($p$-value)', ascending=False)
        df['Rank'] = range(1, df.shape[0] + 1);
        df = df.iloc[:, [6] + range(6)]
        path = utl.paperPath + '/tables/{}.tex'.format(name);
        df.to_csv(path.replace('.tex', '.csv').replace('/tables/', '/data/'))
        utl.DataframetolaTexTable(df.iloc[:, :-1], alignment=['c', 'c', 'p{3in}', 'c', 'c', 'c'], fname=path)

    goPvalue = lambda x: utl.getPvalFisher(AllGenes=allVariantGenes.values, putativeList=x.values,
                                           myList=g.index.values)
    unpackp = lambda x: [min(6, np.round(x[0], 1)), x[1].loc['Putative', 'myList']]
    # Score = lambda x,f:f(scores.loc[x.CHROM][(scores.loc[x.CHROM].index>=x.start)&(scores.loc[x.CHROM].index<=x.end)])
    sort = lambda df: pd.concat(
            [df[df.index.get_level_values('CHROM') == ch] for ch in ['X', '2L', '2R', '3L', '3R']]).rename(
        columns={'H': r'$\mathcal{H}^+$', 'M': 'Num. of Variants'})
    Genes = loadGeneData().reset_index().set_index('GO')
    Genes = Genes.loc[
        (Genes['FBgn'].groupby(level=0).apply(lambda x: len(x.unique())) > 2).replace({False: None}).dropna().index]
    scores = utl.getEuChromatin(rutl.loadScores(skipHetChroms=True))
    ann = pd.DataFrame(scores).join(loadANN(), how='inner')
    allVariantGenes = ann['Gene_ID'].drop_duplicates()
    # f=lambda x: x[x>=x.quantile(0.9)].mean()
    # geneScores=ann.reset_index().set_index('Gene_ID')[['CHROM','POS',0]].drop_duplicates().groupby(level=0)[0].apply(f)


    ############ computing candidate regions
    scan = utl.scanGenome(scores.abs(), {'H': lambda x: x.abs().mean(), 'M': lambda x: x.size}, winSize=30000)
    o = utl.localOutliers(scan.H, q=0.99);
    o = scan.loc[o.index]
    fig = plt.figure(figsize=(7, 2.5), dpi=300);
    pplt.Manhattan(data=sort(scan), Outliers=sort(o), fig=fig, markerSize=2, ticksize=8, sortedAlready=True);
    [pplt.setSize(ax, 6) for ax in fig.get_axes()];

    pplt.annotate('(A)', ax=fig.axes[0], fontsize=8)
    pplt.annotate('(B)', ax=fig.axes[1], fontsize=8)
    plt.gcf().subplots_adjust(bottom=0.15);
    pplt.savefig('manhattan', 300)
    plt.savefig(utl.paperFiguresPath + 'manhattan.pdf')
    regions = utl.BED.getIntervals(o.H, padding=30000);
    print regions.shape
    intervalGenes = utl.BED.intersection(ann, regions).name.drop_duplicates().reset_index().set_index('name');
    print intervalGenes.size
    g = intervalGenes;
    # intervalGenes
    # g=g[g>=g.quantile(0.)];
    print g.size
    df = Genes.groupby(level=0).apply(lambda x: pd.DataFrame(
        [x.name, x.term.iloc[0]] + unpackp(goPvalue(x.FBgn.drop_duplicates())) + [x.ontology.iloc[0],
                                                                                  x.FBgn.unique().size] + [
            np.intersect1d(x.values, g.index.values)],
        index=['GO ID', 'GO Term', '-log($p$-value)', 'Hits', 'Ontology', 'Num of Genes', 'Genes']).T)
    df = df[(df['-log($p$-value)'] >= 3) & (df.Hits >= 3)]
    df['-log($p$-value)'] = df['-log($p$-value)'].astype(str)
    df = df.set_index('Ontology')
    df.groupby(level=0).apply(saveGOTex);
    print df

    tempGenes = Genes.reset_index().set_index('FBgn').loc[
        np.append(df.set_index('GO ID').loc['GO:0009631'].Genes, df.set_index('GO ID').loc['GO:0009408'].Genes)][
        ['term', 'name', 'GO']].reset_index().set_index('GO').loc[['GO:0009631', 'GO:0009408']].drop_duplicates()
    tempGenes.columns = ['FlyBase ID', 'GO Term', 'Gene Name']
    utl.DataframetolaTexTable(tempGenes, fname=utl.paperPath + '/tables/{}.tex'.format('tempGenes'),
                              alignment=['l', 'l', 'l'])


    regions.to_csv(utl.paperPath + 'data/intervals.csv')

    snps = utl.BED.intersection(scores.reset_index(), regions, 0);
    snps['POS'] = snps.start;
    snps.set_index('POS', append=True, inplace=True)
    snps = snps['name'].astype(float).reset_index().drop_duplicates().set_index(['CHROM', 'POS']).name

    def ff(x):
        y = utl.BED.intersection(scores.reset_index(), x, 0).rename(columns={'start': 'POS'}).set_index('POS',
                                                                                                append=True).name.astype(
            float)
        y = y[y > 0]
        y = y[y >= y.quantile(0.9)]
        print x['len'].iloc[0], y.size
        return y

    cands = regions.reset_index().groupby(level=0).apply(ff).reset_index(level=0).name
    cands.sort_index().reset_index().drop_duplicates().dropna().to_csv(utl.outpath + 'real/gowinda/cands.final.txt',
                                                                       sep='\t', header=None, index=False)
    scores.sort_index().reset_index().drop_duplicates().dropna().to_csv(utl.outpath + 'real/gowinda/allsnps.txt',
                                                                        sep='\t', header=None, index=False)

    name = 'cands.final.out.tsv'
    gowinda = pd.read_csv('/home/arya/out/real/gowinda/{}'.format(name), sep='\t', header=None)[[0, 4, 5, 6, 7, 8, 9]]
    gowinda.columns = ['GO ID', '-log($p$-value)', 'Hits', 'Num of Genes', 'Total Genes', 'GO Term', 'Genes']
    gowinda = gowinda[gowinda.Hits >= 3]
    gowinda['-log($p$-value)'] = -gowinda['-log($p$-value)'].apply(np.log10).round(1)
    gowinda.to_csv(utl.paperPath + 'data/gowinda.all.tsv', sep='\t')
    bp = gowinda.set_index('GO ID').loc[
        Genes[Genes.ontology == 'biological_process'].index.unique().rename('GO ID')].dropna()
    bp.to_csv(utl.paperPath + 'data/gowinda.bp.tsv', sep='\t')
    utl.DataframetolaTexTable(bp.reset_index()[['GO ID', 'GO Term', '-log($p$-value)']], alignment=['c', 'p{4in}', 'c'],
                              fname=utl.paperPath + 'tables/gowinda.tex')

    map(len, (Genes.index.unique(), bp.index.unique(), df.loc['biological_process']['GO ID'].unique())), len(
        np.intersect1d(bp.index.unique(), df['GO ID'].unique()))

    pval = utl.getPvalFisher(Genes[Genes.ontology == 'biological_process'].index.unique(), bp.index.unique(),
                             df.loc['biological_process']['GO ID'].unique())
    print pval

    stats = pd.Series(None, name='Value')

    stats['Num. of Vatiants'] = scores.size
    stats['Num. of Candidate Intervals'] = regions.shape[0]
    stats['Total Num. of Genes'] = loadGeneCoordinates().shape[0]
    stats['Num. of Variant Genes'] = ann['Gene_ID'].unique().shape[0]
    stats['Num. of Genes within Candidate Intervals'] = intervalGenes.shape[0]
    stats['Total Num. of GO'] = len(loadGeneData().index.unique())
    stats['Num. of GO with 3 or More Genes'] = len(Genes.index.unique())
    stats['Num. of Candidate Variants for Gowinda'] = cands.size
    stats = stats.apply(lambda x: '{:,.0f}'.format(x))
    stats.index.name = 'Statistic'
    print stats
    utl.DataframetolaTexTable(stats.reset_index(), fname=utl.paperPath + 'tables/stats.tex', alignment=['l', 'r'])