示例#1
0
def outlier():
    """Find, pickle and plot three kinds of outlier windows of the genome scan.

    The scores (after ``rutl.removeHeteroChromatin``) are scanned with ``utl.scanGenome``
    and the first column of the sorted result is thresholded in three ways: above its
    overall 0.99 quantile, above the 0.99 quantile within each group of the first index
    level, and through ``localOutliers``. Each outlier set is pickled under
    ``utl.outpath`` and drawn as a Manhattan plot saved under ``utl.paperPath``.
    """

    def report(series, outliers, pickle_name, figure_name):
        # Persist the outliers first, then draw and save the matching Manhattan plot.
        outliers.to_pickle(utl.outpath + pickle_name)
        figure = plt.figure(figsize=(7, 1.5), dpi=300)
        pplt.Manhattan(data=series, Outliers=pd.DataFrame(outliers), fig=figure, markerSize=2, ticksize=8,
                       sortedAlready=True)
        for axis in figure.get_axes():
            pplt.setSize(axis, 5)
        plt.gcf().subplots_adjust(bottom=0.15)
        plt.savefig(utl.paperPath + 'new/{}.pdf'.format(figure_name))

    raw = rutl.removeHeteroChromatin(rutl.loadScores())
    statistics = {comale: lambda x: x.abs().mean(), 'Num. of SNPs': lambda x: x.size}
    windows = sort(utl.scanGenome(raw.abs(), statistics))[[comale, 'Num. of SNPs']]

    # Overall threshold: everything above the 0.99 quantile of the whole series.
    overall = windows.iloc[:, 0].rename('Global Outliers')
    report(overall, overall[overall > overall.quantile(0.99)], 'real/outliers.global.df', 'global')

    # Per-group threshold: the 0.99 quantile is taken inside each group of index level 0.
    grouped = overall.rename('Chrom Outliers')
    per_group = grouped.groupby(level=0).apply(lambda x: x[x > x.quantile(0.99)].loc[x.name])
    report(grouped, per_group, 'real/outliers.chrom.df', 'chrom')

    # Local outliers are delegated to localOutliers with its default settings.
    local = grouped.rename('Local Outliers')
    report(local, localOutliers(local), 'real/outliers.local.df', 'local')
示例#2
0
文件: Depth.py 项目: airanmehr/bio
def plotDepthHeterogenocity():
    """Plot read depth per replicate at four example positions on a 2x2 grid.

    Reads ``utl.outpath + 'real/CD.F59.df'`` and keeps the ``'D'`` cross-section of the
    ``READ`` column level. Four rows are selected by the standard deviation of their
    values (maximum, 0.52 quantile, median and 0.8 quantile); each is drawn on its own
    panel with one line per replicate index 0..2, and the figure is saved through
    ``pplt.savefig('depthHetero', dpi)`` and shown.
    """
    dpi = 300
    sns.set_style("whitegrid", {"grid.color": "0.9", 'axes.linewidth': .5, "grid.linewidth": ".09"})
    _, ax = plt.subplots(2, 2, sharex=True, figsize=(6, 4), dpi=dpi)

    d = pd.read_pickle(utl.outpath + 'real/CD.F59.df').xs('D', level='READ', axis=1)
    std = d.std(1)
    # Each entry is the index label of a row whose std equals the chosen statistic exactly;
    # replace({False: None}).dropna() keeps only the rows where the comparison is True.
    loc = [std.idxmax(), (std == std.quantile(0.52)).replace({False: None}).dropna().index[0],
           (std == std.median()).replace({False: None}).dropna().index[-1],
           (std == std.quantile(0.8)).replace({False: None}).dropna().index[0]]
    ax = ax.reshape(-1)  # flatten the 2x2 axes array so panels can be addressed as ax[i]
    fontsize = 6
    for i, pos in enumerate(loc):
        eg = d.loc[pos]
        [eg[r].dropna().plot(marker='o', ax=ax[i], markersize=5) for r in range(3)];
        # NOTE(review): the three pyplot-level calls below act on pyplot's current axes,
        # which is not necessarily ax[i] -- confirm this is intended.
        plt.xticks(d.columns.get_level_values('GEN').unique());
        plt.xlabel('');
        plt.ylabel('')
        # Python 2 print statement.
        # NOTE(review): get_axis_limits is unqualified here but is called as
        # pplt.get_axis_limits further down -- confirm the bare name is in scope.
        print 'position={}:{}'.format(eg.name[0], eg.name[1]), get_axis_limits()

        # Y label only on the left column, x label only on the bottom row, legend only on the first panel.
        if i in [0, 2]: ax[i].set_ylabel('Read Depth')
        if i > 1: ax[i].set_xlabel('Generation')
        if i == 0: ax[i].legend(['Replicate 1', 'Replicate 2', 'Replicate 3'], loc='upper center',
                                prop={'size': fontsize})
        # Pad the y range by a fraction of yrang and never let the lower limit rise above 0.
        yrang = pplt.get_axis_limits(upper=True, ax=ax[i])[1] - pplt.get_axis_limits(upper=False, ax=ax[i])[1]
        ax[i].set_ylim([min(0, ax[i].get_ylim()[0] - 0.05 * yrang), ax[i].get_ylim()[1] + 0.03 * yrang])
        ax[i].set_xlim([-2, 61]);
        ax[i].set_title('{}:{}'.format(eg.name[0], eg.name[1]))
        pplt.setSize(ax[i], fontsize)

    # Font and LaTeX settings are changed only after all panels were drawn.
    mpl.rc('font', **{'family': 'serif', 'serif': ['Computer Modern']});
    mpl.rc('text', usetex=True)
    plt.gcf().subplots_adjust(bottom=0.15)
    pplt.savefig('depthHetero', dpi)
    plt.show()
示例#3
0
文件: LD.py 项目: airanmehr/bio
def plotScalingFactor():
    """Save one two-panel figure per (nu0, s) combination as 'decayFactors<k>'.

    For nu0 in [0.005, 0.1] and s in [0.025, 0.1] (nu0 varying slowest) the top panel shows
    ``nu(t, s=s, nu0=nu0)`` and the bottom panel shows ``log(H(t)/H(t[0]))``,
    ``-2 * rate * t * length`` and their sum, with a dashed vertical line where the
    absolute value of the sum is smallest (the first time point is excluded).
    """
    rate = 2 * 1e-8
    length = 5e4
    dpi = 300
    legend_labels = ['log(Growth)', r'log(Decay)', 'log(Growth) + log(Decay)']
    settings = [(nu0, s) for nu0 in [0.005, 0.1] for s in [0.025, 0.1]]

    for figure_index, (nu0, s) in enumerate(settings):
        t = np.arange(0, 2 * (utl.logit(0.995) - utl.logit(nu0)) / s + 1., 1)
        _, (top, bottom) = plt.subplots(2, 1, figsize=(5.5, 2.5), dpi=dpi, sharex=True)

        # Top panel: the trajectory itself, annotated with the parameters.
        sweep = 'Hard' if nu0 == 0.005 else 'Soft'
        nu(t, s=s, nu0=nu0).plot(color='k', legend=False, ax=top)
        pplt.annotate(r'$s$={}, $\nu_0=${} ({} Sweep)'.format(s, nu0, sweep), fontsize=7, ax=top)
        pplt.setSize(ax=top, fontsize=6)
        top.set_ylabel(r'$\nu_t$')

        # Bottom panel: the two log factors and their sum.
        start = H(t[0], s=s, nu0=nu0)
        along = H(t, s=s, nu0=nu0)
        rows = [np.log(along / start), -2 * rate * t * length]
        factors = pd.DataFrame(rows, columns=t, index=['log(Growth)', r'log(Decay)']).T
        factors['log(Growth) + log(Decay)'] = factors.sum(1)
        factors.plot(ax=bottom, grid=True, linewidth=2)
        bottom.set_xlabel('Generations')
        bottom.set_ylabel('Log(Scaling Factor)')
        crossing = factors.iloc[1:, 2].abs().idxmin()
        bottom.axvline(crossing, color='k', linestyle='--', linewidth=0.5)
        bottom.legend(legend_labels, bbox_to_anchor=(1.45, .75), prop={'size': 6})
        pplt.setSize(bottom, fontsize=6)

        # Leave the right 30% of the canvas free for the legend placed outside the axes.
        plt.tight_layout(pad=0.1, rect=[0, 0, 0.7, 1])
        plt.gcf().subplots_adjust(bottom=0.15)
        pplt.savefig('decayFactors{}'.format(figure_index), dpi=dpi)
示例#4
0
文件: Dynamics.py 项目: airanmehr/bio
    def plotOne(df, ax, method):
        """Draw the row-wise mean of ``df`` on ``ax`` with a shaded band of two row-wise standard deviations.

        ``method`` (stripped of surrounding whitespace) becomes the y label. The line and
        band colour come from ``color`` in the enclosing scope.
        """
        center = df.mean(1)
        spread = df.std(1).values
        center.plot(ax=ax, legend=False, linewidth=3, color=color)
        xs = center.index.values
        ys = center.values
        ax.fill_between(xs, ys - 2 * spread, ys + 2 * spread, color=color, alpha=0.3)
        ax.set_ylabel(method.strip())
        # Pin the lower y limit at -0.1 and keep whatever upper limit the plot produced.
        upper = ax.get_ylim()[1]
        ax.set_ylim([-0.1, upper])
        pplt.setSize(ax)
示例#5
0
文件: Depth.py 项目: airanmehr/bio
def plotDepth():
    """Draw the four-panel depth summary and save it through ``pplt.savefig('depth', 300)``.

    Reads ``utl.outpath + 'real/CD.F59.df'`` and keeps the ``'D'`` cross-section of the
    ``READ`` column level. Panels:
      (A) number of occurrences of each value over all entries of the table,
      (B) the cumulative sum of (A),
      (C) number of occurrences of each row-wise minimum,
      (D) the cumulative sum of (C).
    Every panel gets a dashed vertical line at 50.
    """
    sns.set_style("whitegrid", {"grid.color": "1", 'axes.linewidth': .5, "grid.linewidth": ".09"})
    sns.set_context("notebook", font_scale=1.4, rc={"lines.linewidth": 2.5})
    d = pd.read_pickle(utl.outpath + 'real/CD.F59.df').xs('D', level='READ', axis=1)
    fontsize = 6

    def finish_panel(label):
        # Shared panel decoration: panel letter, dashed line at depth 50, font sizes.
        pplt.annotate(label, xpad=0.85, ypad=0.45, fontsize=fontsize)
        plt.axvline(50, linestyle='--', linewidth=1, color='k')
        pplt.setSize(plt.gca(), fontsize)

    # Each histogram is computed once and reused for its PDF-like and cumulative panel.
    z = pd.Series(np.ndarray.flatten(d.values))
    depth_counts = z.value_counts().sort_index()
    min_depth_counts = d.min(1).value_counts().sort_index()

    mpl.rcParams.update({'font.size': fontsize})
    plt.figure(figsize=(6, 4), dpi=300)

    plt.subplot(2, 2, 1)
    depth_counts.plot()
    plt.xlim([0, 200])
    plt.xlabel('Depth')
    plt.ylabel('Number of Measurments' + '\n (out of {:.1f}M)'.format(z.shape[0] / 1e6))
    plt.ticklabel_format(axis='y', style='sci', scilimits=(0, 0))
    plt.title('Scaled PDF')
    finish_panel('(A)')

    plt.subplot(2, 2, 2)
    depth_counts.cumsum().plot()
    plt.xlim([0, 200])
    plt.ylim([-3e5, 2.05 * 1e7])
    plt.xlabel('Depth')
    plt.title('Scaled CDF')
    finish_panel('(B)')

    plt.subplot(2, 2, 3)
    min_depth_counts.plot()
    plt.xlim([0, 100])
    plt.xlabel('Minimum Depth of each Variant')
    plt.ylabel('Number of Variants' + '\n (out of {:.1f}M)'.format(d.shape[0] / 1e6))
    plt.ticklabel_format(axis='y', style='sci', scilimits=(0, 0))
    plt.rc('font', size=fontsize)
    finish_panel('(C)')

    plt.subplot(2, 2, 4)
    min_depth_counts.cumsum().plot()
    plt.xlim([0, 60])
    plt.ylim([0.25 * -1e5, plt.ylim()[1]])
    plt.xlabel('Minimum Depth of each Variant')
    plt.ticklabel_format(axis='y', style='sci', scilimits=(0, 0))
    finish_panel('(D)')

    plt.gcf().subplots_adjust(bottom=0.15)
    plt.gcf().tight_layout(h_pad=0.1)
    # Font/LaTeX settings are applied after drawing and before saving, in this exact order.
    mpl.rc('font', **{'family': 'serif', 'serif': ['Computer Modern'], 'size': fontsize})
    mpl.rc('text', usetex=True)
    mpl.rcParams.update({'font.size': 1})

    pplt.savefig('depth', 300)
    plt.show()
示例#6
0
def plot():
    """Draw a 4x3 grid of distributions and save it through ``pplt.savefig('markovDists', 300)``.

    Rows iterate over ``itertools.product([0, 0.1], [0.005, 0.1])`` unpacked as ``(s, nu0)``;
    columns over tau in [1, 10, 100]. Each panel overlays the 'markov' entry read from
    ``utl.outpath + 'markov/simulations/plotData.df'``, the result of ``getBrownian`` and the
    result of ``getObservation``. The Brownian entry is set to None (and therefore not drawn)
    whenever ``s`` is non-zero.
    """
    fontsize = 5

    def plotOne(x, ax):
        # Draw one curve; an entry that is None is skipped.
        lw = 3
        alpha = 0.8
        try:
            if x.name is None:
                alpha = 1
                lw = 0.7
        except AttributeError:
            # x has no .name (e.g. it is None): keep the default style. Only this error is
            # expected here, so anything else is allowed to surface instead of being hidden.
            pass

        if x is not None:
            x.plot(ax=ax, color=color[x.name], lw=lw, alpha=alpha)

    fig, axes = plt.subplots(4, 3, figsize=(7, 3.9), dpi=300)
    df = pd.read_pickle(utl.outpath + 'markov/simulations/plotData.df')
    ABC = [list('ABC'), list('DEF'), list('GHI'), list('KLM')]
    for (s, nu0), axr, titles in zip(itertools.product([0, 0.1], [0.005, 0.1]), axes, ABC):
        for tau, ax, title in zip([1, 10, 100], axr, titles):
            observation = getObservation(nu0, s, tau)
            x = observation.index.values
            brownian = getBrownian(x=x, nu0=nu0, tau=tau, mu=nu0)
            markov = df[(nu0, s, tau)].loc['markov']
            # Replace the stored column by the three curves to be drawn for this panel.
            df[(nu0, s, tau)] = pd.Series([observation, markov, brownian],
                                          index=['observation', 'markov', 'brownian']).rename((nu0, s, tau))
            if s: df[(nu0, s, tau)].loc['brownian'] = None
            df[(nu0, s, tau)].loc[['markov', 'brownian', 'observation']].apply(lambda x: plotOne(x, ax))
            if nu0 == 0.005 and tau == 100: ax.set_xlim([0, 0.02])
            ax.locator_params(nbins=1, axis='y')
            if nu0 == 0.005 and tau == 100 and s == 0:
                ax.legend(['Markov Chain', 'Brownian Motion', 'Empirical Distribution'], fontsize=fontsize)

            # Keep every second x tick only.
            ax.set_xticks(ax.get_xticks()[::2])
            pplt.annotate('(' + title + ')', fontsize=fontsize, ax=ax)
            pplt.setSize(ax, fontsize)
        axr[0].set_ylabel(r'$P(\nu_\tau|\nu_0={},s={}$)'.format(nu0, s), fontsize=fontsize + 2, rotation=0, labelpad=30)
    for tau, ax in zip([1, 10, 100], axes[0]):
        ax.set_title(r'$\tau={}$'.format(tau), fontsize=fontsize)
    # Label the bottom row explicitly rather than through the loop variable left over above.
    for ax in axes[-1]:
        ax.set_xlabel(r'$\nu$', fontsize=fontsize)

    plt.gcf().tight_layout(pad=0.1, rect=[0.05, 0, 1, 1])
    pplt.savefig('markovDists', 300)
    plt.show()
示例#7
0
def plotOne(df, outlier, fname=None, dashedline=True):
    """Draw a Manhattan plot of ``df`` with ``outlier`` highlighted.

    When ``dashedline`` is true, a thin dashed horizontal line is added to the first axes at
    the 0.99 quantile of the first column of ``df``. When ``fname`` is given, the figure is
    saved as ``utl.paperPath + 'new/<fname>.pdf'``.
    """
    canvas = plt.figure(figsize=(7, 2.5), dpi=300)
    pplt.Manhattan(data=df, Outliers=outlier, fig=canvas, markerSize=2, ticksize=8, sortedAlready=True)
    for axis in canvas.get_axes():
        pplt.setSize(axis, 8)

    current = plt.gcf()
    current.subplots_adjust(bottom=0.2)
    if dashedline:
        threshold = df.iloc[:, 0].quantile(0.99)
        current.axes[0].axhline(threshold, linewidth=0.5, linestyle='--', color='k')
    if fname is not None:
        plt.savefig(utl.paperPath + 'new/{}.pdf'.format(fname))
示例#8
0
def scanSFS():
    """Plot the genome scan of the scores and of the change in the SFSelect statistic.

    First the scanned scores are plotted via ``plotOne`` (saved as 'all'). Then ``scanOne`` is
    run with the SFSelect estimator (n fixed at 100) on ``rutl.getNut(0)`` and
    ``rutl.getNut(59)``; the difference of their first columns is combined with both inputs
    and the score column, its top 1% is taken as outliers, negative differences are blanked,
    and the Manhattan plot is saved as 'new/sfs-clear.pdf' under ``utl.paperPath``.
    """
    snp_column = 'Num. of SNPs'
    scores = rutl.loadScores()
    scanned = utl.scanGenome(scores.abs(), {comale: lambda x: x.abs().mean(), snp_column: lambda x: x.size})
    df = sort(scanned)[[comale, snp_column]]
    plotOne(df, df[df[comale] > df[comale].quantile(0.99)], fname='all')

    first_nut = rutl.getNut(0)
    last_nut = rutl.getNut(59)
    reload(rutl)
    n = 100
    SFSelect = lambda x: est.Estimate.getEstimate(x=x, method='SFSelect', n=n)
    sf0 = scanOne(first_nut, SFSelect, 'SFSelect.Base', 'SFSelect.Base')
    sft = scanOne(last_nut, SFSelect, 'SFSelect.Final', 'SFSelect.Final')

    base = sf0.iloc[:, 0]
    final = sft.iloc[:, 0]
    change = (final - base).rename('SFS(59)-SFS(0)')
    sfr = pd.concat([change, base, final, df.iloc[:, 0]], axis=1)

    # Outliers are picked before negative differences are blanked out of the plotted data.
    outlier = sfr[sfr.iloc[:, 0] > sfr.iloc[:, 0].quantile(0.99)]
    negative = (sfr.iloc[:, 0] < 0).values
    sfr.loc[negative, sfr.columns[0]] = None

    fig = plt.figure(figsize=(7, 4.5), dpi=300)
    pplt.Manhattan(data=sfr, Outliers=outlier, fig=fig, markerSize=2, ticksize=8, sortedAlready=True)
    for axis in fig.get_axes():
        pplt.setSize(axis, 5)
    plt.savefig(utl.paperPath + 'new/{}.pdf'.format('sfs-clear'))
示例#9
0
def Final():
    """Draw the Manhattan plot of the scanned scores with shaded intervals and save it.

    The figure is written to ``utl.paperPath + 'new/CHROM.FDR_0.01.pdf'``.

    NOTE(review): ``o`` and ``shades`` are read below but never assigned inside this
    function; unless they exist at module level this raises NameError -- confirm.
    """
    # abs() is applied here and again on the next line; the second call is redundant.
    scores = rutl.loadScores(skipHetChroms=True).abs()
    a = sort(utl.scanGenome(scores.abs(), {'H': lambda x: x.abs().mean(), 'M': lambda x: x.size}))
    intervals = ga.getIntervals(o.H, padding=30000)
    fig = plt.figure(figsize=(7, 1.5), dpi=300);
    pplt.Manhattan(data=a, Outliers=o, shade=intervals.reset_index(), fig=fig, markerSize=2, ticksize=8,
                   sortedAlready=True);
    [pplt.setSize(ax, 5) for ax in fig.get_axes()];
    plt.gcf().subplots_adjust(bottom=0.15);
    # The title is a tuple: (number of rows of shades, sum of its 'len' column divided by 1e6).
    plt.suptitle((shades.shape[0], shades['len'].sum() / 1e6), fontsize=8)
    plt.savefig(utl.paperPath + 'new/{}.pdf'.format('CHROM.FDR_0.01'))
示例#10
0
def plotSNPPval(out):
    scores = rutl.loadScores()
    kde = utl.getDensity(scores, width=1);
    pval = utl.getPvalKDE(out.sort_values(ascending=False).iloc[:1200], kde)
    print pval.sort_values()
    pval[pval >= 3].size
    df = pd.DataFrame(pval)
    df = pd.concat([df[df.index.get_level_values('CHROM') == ch] for ch in
                    ['X', '2L', '2R', '3L', '3R', '4', '2LHet', '2RHet', '3LHet', '3RHet', 'XHet']])
    fig = plt.figure(figsize=(7, 2), dpi=300);
    pplt.Manhattan(df, fig=fig, markerSize=2, ticksize=8, sortedAlready=True);
    [pplt.setSize(ax, 8) for ax in fig.get_axes()]
示例#11
0
def D(nu, n, W0, Pi0):
    """Return ``-log(1 - nu) * W0 / n - Pi0 * nu**2``.

    ``nu`` may be a scalar or anything ``np.log`` and the arithmetic operators accept
    element-wise (the caller in this file passes a pandas Series).
    """
    log_term = -np.log(1 - nu) * W0 / n
    quadratic_term = Pi0 * nu ** 2
    return log_term - quadratic_term


# Script: first two panels of a 3x1 figure comparing the `sel` and `neut` data sets with the
# curve returned by utl.forward.
# NOTE(review): `sel` and `neut` are not defined in the visible part of the file -- they
# presumably are long-form frames with 'gen', 'exp', 'nu' and 'D' columns; confirm.
# NOTE(review): sns.tsplot is not available in newer seaborn releases -- confirm the pinned version.
fontsize = 4

nu = utl.forward(t=1000, s=0.05, x0=0.05)

plt.figure(figsize=(6, 3), dpi=300)
plt.subplot(3, 1, 1);

# Panel (A): 'nu' over 'gen' for both data sets (99% interval), plus the dashed reference curve.
sns.tsplot(sel, time='gen', value='nu', unit='exp', color='red', ci=99);
sns.tsplot(neut, time='gen', value='nu', unit='exp', ci=99);
nu.plot(color='k', linewidth=1, linestyle='--')

pplt.setSize(plt.gca(), fontsize)
plt.ylabel(r'$\nu_t$', fontsize=fontsize + 2);
plt.ylim([0, 1.05])
plt.title(r'(A)', fontsize=fontsize + 2);
plt.tick_params(
        axis='x',  # changes apply to the x-axis
        which='both',  # both major and minor ticks are affected
        bottom='off',  # ticks along the bottom edge are off
        top='off',  # ticks along the top edge are off
        labelbottom='off');
plt.xlabel('')
# Second panel: the 'D' column for both data sets and D(nu, 200., 1, 1) shifted by a constant 0.675.
plt.subplot(3, 1, 2);
sns.tsplot(sel, time='gen', value='D', unit='exp', color='red', ci=99);
sns.tsplot(neut, time='gen', value='D', unit='exp', ci=99);
(D(nu, 200., 1, 1) + 0.675).plot(c='k', linewidth=1, linestyle='--');
示例#12
0
文件: Markov.py 项目: airanmehr/bio
def plotAlternative(subp, s=0.1, fontsize=5):
    """Draw three panels (tau = 1, 10, 100) comparing the Markov-chain prediction with observations.

    Parameters
    ----------
    subp : sequence of three ints
        Positions inside a 3x3 ``plt.subplot`` grid for the three panels. Each value minus
        one also indexes ``subptitle`` (a name defined outside this function) for the title.
    s : float
        Passed to ``Markov.computeTransition`` and used to build the name of the observation
        pickle (``'markov/T100.S{:03.0f}.obs.df'.format(s * 1000)``).
    fontsize : int
        Font size for annotations, titles and ``pplt.setSize``.

    ``nu0`` is fixed at 0.005 inside the function. Nothing is saved or shown here.
    """
    nu0=0.005
    obs = pd.read_pickle(utl.outpath + 'markov/T100.S{:03.0f}.obs.df'.format(s * 1000))
    T = Markov.computeTransition(s, 1000)

    # Per-(nu0, tau) plot settings; only xlim and ylim are read in this function.
    dfplt= pd.concat([pd.Series({'scale':10, 'xlim':[0.0,0.01],'ylim': [0,0.2]},name=(0.005,1)),pd.Series({'scale':30, 'xlim':[0.06,0.14],'ylim': [0,0.15]},name=(0.1,1)),
                        pd.Series({'scale':30, 'xlim':[0.0,0.015],'ylim': [0,0.15]},name=(0.005,10)),pd.Series({'scale':45, 'xlim':[0.0,0.2],'ylim': [0,0.025]},name=(0.1,10)),
                      pd.Series({'scale':30, 'xlim':[0.0,1],'ylim': [0,0.01]},name=(0.005,100)),pd.Series({'scale':50, 'xlim':[0.00,0.4],'ylim': [0,0.004]},name=(0.1,100))
                         ],axis=1).T

    # Row nu0 of the transition table is the Markov curve drawn for tau = 1.
    markov=T.loc[nu0].copy(True);markov.name='Markov Chain'

    plt.subplot(3, 3, subp[0])
    tau=1
    o=(obs[1].value_counts().sort_index()/obs.shape[0])
    # NOTE(review): hard-coded data point and half-bin index shift -- looks hand-tuned; confirm.
    o.loc[0.0055]=0.1211
    o.index=o.index-0.0005/2
    markov.plot(color='b');
    o.plot(color='g');
    plt.xlim(dfplt.loc[(nu0, tau)].xlim);
    plt.ylim(dfplt.loc[(nu0, tau)].ylim);
    plt.locator_params(nbins=3)
    pplt.annotate(r'$s={}$, $\nu_0=${}, $\tau$={}'.format(s, nu0, tau), loc=1, fontsize=fontsize)
    plt.ylabel(r'$P(\nu_\tau|\nu_0,s)$')
    plt.xlabel('$s$')
    # Advance the Markov curve from tau = 1 to tau = 10 (nine more products with T).
    tau=10
    for _ in range(9):
        markov=markov.dot(T)
    # Still issued before the next plt.subplot call, so these style the first panel.
    pplt.setSize(plt.gca(), fontsize=fontsize)
    plt.title('({})'.format(subptitle[subp[0] - 1]), fontsize=fontsize)
    plt.subplot(3, 3, subp[1])

    markov.plot(color='b');
    (obs[10].value_counts().sort_index() / obs.shape[0]).plot(color='g');
    plt.xlim(dfplt.loc[(nu0, tau)].xlim);
    plt.ylim(dfplt.loc[(nu0, tau)].ylim);
    plt.locator_params(nbins=3)
    pplt.annotate(r'$s={}$, $\nu_0=${}, $\tau$={}'.format(s, nu0, tau), loc=1, fontsize=fontsize)
    plt.xlabel('$s$')
    # Advance the Markov curve from tau = 10 to tau = 100 (ninety more products with T).
    tau=100
    for _ in range(90):
        markov=markov.dot(T)
    # Still issued before the next plt.subplot call, so these style the second panel.
    pplt.setSize(plt.gca(), fontsize=fontsize)
    plt.title('({})'.format(subptitle[subp[1] - 1]), fontsize=fontsize)

    plt.subplot(3, 3, subp[2])
    # For tau = 100 the observations are binned into 50 bins on [0, 1] instead of counted per value.
    counts,limits=np.histogram(obs[100].values,bins=50,range=[0,1]);centers = 0.5*(limits[1:]+limits[:-1]);o=pd.Series(counts,index=centers);o=o/obs.shape[0]
    # NOTE(review): the divisor 35 and the two edits of the first entries below look hand-tuned; confirm.
    o/=35
    o.loc[0.0] = o.iloc[0]
    o = o.sort_index()
    o.iloc[1] = o.iloc[2]
    # o=(obs[100].value_counts().sort_index()/obs.shape[0])
    o.name = 'Observation';
    o.plot(color='g');
    markov.plot(color='b');
    plt.xlim(dfplt.loc[(nu0, tau)].xlim);
    plt.ylim(dfplt.loc[(nu0, tau)].ylim);
    plt.locator_params(nbins=3)
    pplt.annotate(r'$s={}$, $\nu_0=${}, $\tau$={}'.format(s, nu0, tau), loc=1, fontsize=fontsize)
    plt.xlabel('$s$')
    pplt.setSize(plt.gca(), fontsize=fontsize)
    plt.title('({})'.format(subptitle[subp[2] - 1]), fontsize=fontsize)
示例#13
0
文件: Plot.py 项目: airanmehr/bio
def plotBias():
    """For each depth in [30, 100, 300], draw a 2x2 grid of violin plots and write a LaTeX summary table.

    Top row: the values from ``ROC/bias.df`` for nu0 = 0.005 and 0.1, split by method.
    Bottom row: the values from ``ROC/bias.h.df`` for the same nu0 values, restricted to
    the method named ``comaleName``. The mean/std per (method, nu0) of the top-row data is
    written to ``utl.paperPath + 'tables/bias.<depth>.tex'`` and the figure is saved as
    ``'bias.<depth>'``.

    ``comaleName`` and ``ABCD`` are read from outside this function. The two nested helpers
    are not called anywhere in the visible body (the ``computeBias()`` call is commented out).
    """
    def computeBias():  # s-shat
        # Builds ROC/bias.df: within each group of the first four index levels, the last
        # element of the group key minus the value.
        print 'computing bias...'
        a = pd.read_pickle('{}ROC/{}'.format(utl.outpath, 'HMM')).s
        gp = pd.read_pickle(utl.outpath + 'ROC/GP.causal').s
        b = pd.concat([a, gp]).sort_index().xs(True, level='causal')

        bias = b.groupby(level=range(4)).apply(lambda x: x.name[-1] - x)
        bias.to_pickle('{}ROC/bias.df'.format(utl.outpath))

    def biash():
        # Builds ROC/bias.h.df from the 'HMM' rows of ROC/bias.df plus standard-normal noise.
        a = pd.read_pickle('{}ROC/bias.df'.format(utl.outpath))
        a = a[a.index.get_level_values('method') == 'HMM']
        a = a + np.random.randn(a.size)
        a /= 10
        a[a.index.get_level_values('nu0') == 0.1] /= 2
        # NOTE(review): g is the group object yielded by groupby; this in-place subtraction
        # may not write back into `a` -- confirm the centering actually takes effect.
        for name, g in a.groupby(level=range(4)):
            g -= g.mean();
        a.to_pickle('{}ROC/bias.h.df'.format(utl.outpath))

    fontsize = 6
    # computeBias()
    dpi = 300
    for depth in [30, 100, 300]:
        reload(pplt)
        fig = plt.figure(figsize=(5, 5), dpi=dpi)
        j = 0  # running panel index 0..3
        # Load each table, rename method 'HMM' to comaleName, and restore the original index.
        df = pd.read_pickle(utl.outpath + 'ROC/bias.df').loc[depth].reset_index()
        df.method = df.method.replace({'HMM': comaleName})
        df = df.set_index(pd.read_pickle(utl.outpath + 'ROC/bias.df').loc[depth].index.names).sort_index().s
        dfh = pd.read_pickle(utl.outpath + 'ROC/bias.h.df').loc[depth].reset_index()
        dfh.method = dfh.method.replace({'HMM': comaleName})
        dfh = dfh.set_index(pd.read_pickle(utl.outpath + 'ROC/bias.h.df').loc[depth].index.names).sort_index().s
        # Adds uniform noise in [-0.005, 0.005) to the (0.1, comaleName) entries; it is unseeded,
        # so the figure and table differ between runs.
        df[(0.1, comaleName)] += np.random.rand(df[(0.1, comaleName)].shape[0]) / 100 - 0.005
        # df[(0.005, 'HMM')] += np.random.rand(df[(0.005, 'HMM')].shape[0]) / 100 - 0.005
        ax = []
        # Top row (panels 0 and 1): both methods as split violins.
        for nu0 in [0.005, 0.1]:
            if j == 0:
                ax += [plt.subplot(2, 2, j + 1)]
            else:
                ax += [plt.subplot(2, 2, j + 1, sharex=ax[-1], sharey=ax[-1])]
            a = pd.DataFrame(df.loc[nu0])
            a.columns = ['bias']
            a['method'] = a.index.get_level_values('method')
            a['s'] = a.index.get_level_values('S')
            sns.violinplot(x="s", y="bias", hue="method", data=a, linewidth=1,
                           palette={comaleName: "r", "GP": "darkblue"}, split=True, ax=ax[j]);
            if j < 2: ax[j].set_title(r'$\nu_0=${} ({} Sweep)'.format(nu0, ('Soft', 'Hard')[nu0 == 0.005]),
                                      fontsize=fontsize + 2)
            # pplt.annotate(r'$\nu_0=${} ({} Sweep)'.format(nu0, ('Soft', 'Hard')[nu0 == 0.005]),xpad=0.05,ypad=1,fontsize=fontsize)
            plt.locator_params(axis='y', nbins=5)
            pplt.setSize(plt.gca(), fontsize)
            ax[j].set_xlabel('$s$', fontsize=fontsize + 2)
            # plt.xlabel('$s$', fontsize=fontsize + 2)
            ax[j].legend(title='', loc='upper right', fontsize=fontsize + 2)
            ax[j].set_ylabel(('Bias ($s-\hat{s}$)', '')[j % 2], fontsize=fontsize + 2)

            # Only panel 1 keeps its legend.
            if j != 1: ax[j].legend_.remove()
            pplt.annotate(ABCD[j], fontsize=fontsize + 2, ax=ax[j])

            j += 1
        # Bottom row (panels 2 and 3): rows of dfh whose method is comaleName, no legend.
        for nu0 in [0.005, 0.1]:
            if j == 2:
                ax += [plt.subplot(2, 2, j + 1)]
            else:
                ax += [plt.subplot(2, 2, j + 1, sharex=ax[-1], sharey=ax[-1])]
            a = pd.DataFrame(dfh.loc[nu0])
            a.columns = ['bias']
            a['method'] = a.index.get_level_values('method')
            a['s'] = a.index.get_level_values('S')
            sns.violinplot(x="s", y="bias", hue="method", data=a[a['method'] == comaleName], linewidth=1,
                           palette={comaleName: "r", "GP": "g"}, ax=ax[j]);
            plt.locator_params(axis='y', nbins=5)
            pplt.setSize(plt.gca(), fontsize)
            ax[j].set_xlabel('$h$', fontsize=fontsize + 2)
            ax[j].set_ylabel(('Bias ($h-\hat{h}$)', '')[j % 2], fontsize=fontsize + 2)
            pplt.annotate(ABCD[j], fontsize=fontsize + 2, ax=ax[j])
            ax[j].legend_.remove()
            j += 1
        # Summary table: mean and std per (method, nu0), rounded to 3 decimals, sorted by nu0.
        df = df.groupby(level=['method', 'nu0']).describe().round(3).unstack(['method', 'nu0']).loc[
            ['mean', 'std']].T.reset_index().sort_values('nu0')
        df.columns = ['Method', r'$\nu_0$', 'Mean', 'STD']
        utl.DataframetolaTexTable(df, fname=utl.paperPath + 'tables/bias.{}.tex'.format(depth))
        pplt.savefig('bias.{}'.format(depth), dpi)
示例#14
0
def scanSFSSNPbased():
    """Scan the absolute scores in SNP-count windows of three sizes and plot the results.

    Steps, in order:
      1. Window means for winSize 200, 500 and 1000 (step 100) via ``utl.scanGenomeSNP``;
         their product is stored in column ``'comb'``. The Manhattan plot of all four
         columns is saved as 'new/SNPbased.pdf'.
      2. ``localOutliers(df.comb, q=0.9)`` gives candidates, saved as 'new/SNPbased.candidates.pdf'.
      3. ``FDR(o, Scores)`` gives cutoff columns; for each of 0.95, 0.99 and 0.999 the rows
         whose ``comb`` exceeds that column are plotted and saved as 'new/SNPbased.fdr<level>.pdf'.

    All files go under ``utl.paperPath``. ``genes``, ``shade``, ``cand`` and ``chroms`` are
    computed but not read again in the visible code.
    """
    scores = rutl.loadScores(skipHetChroms=True)
    # field = comale;
    # df = sort(utl.scanGenome(scores.abs(), {field: lambda x: x.abs().mean(), 'Num. of SNPs': lambda x: x.size}))[
    #     [field, 'Num. of SNPs']]
    # plotOne(df, df[df[field] > df[field].quantile(0.99)], fname='all')
    # Modules are reloaded at run time (Python 2 builtin reload).
    reload(rutl)
    reload(pplt)
    reload(utl)
    # SFSelect = lambda x: est.Estimate.getEstimate(x=x, method='SFSelect', n=100)
    # sfs0 = utl.scanGenomeSNP(rutl.getNut(0, skipHetChroms=True), SFSelect)
    # sfst = utl.scanGenomeSNP(rutl.getNut(59, skipHetChroms=True), SFSelect).rename(59);     sfs=(sfst-sfs0);    sfs[sfs<0]=None
    g = ga.loadGeneCoordinates().set_index('name')
    genes = g.loc[['Ace', 'Cyp6g1', 'CHKov1']].reset_index().set_index('CHROM')

    # The two highest-scoring rows, turned into 100-wide [start, end] intervals.
    shade = scores.sort_values().reset_index().iloc[-2:].rename(columns={'POS': 'start'});
    shade['end'] = shade.start + 100
    cand = pd.concat([scores, scores.rank(ascending=False).rename('rank'), rutl.getNut(0, skipHetChroms=True)],
                     axis=1).sort_values('rank')
    chroms = ['2L', '2R', '3L', '3R']
    reload(utl)

    # reload(pplt);pplt.Genome(sfs.loc[chroms],genes=genes);plt.tight_layout(pad=0.1)
    # One column per window size; each column is named after its window size.
    df = pd.concat(
            [utl.scanGenomeSNP(scores.abs(), lambda x: x.mean(), winSize=200, step=100, skipFromFirst=900).rename(200),
             utl.scanGenomeSNP(scores.abs(), lambda x: x.mean(), winSize=500, step=100, skipFromFirst=750).rename(500),
             utl.scanGenomeSNP(scores.abs(), lambda x: x.mean(), winSize=1000, step=100, skipFromFirst=500).rename(
                 1000)], axis=1)
    df['comb'] = df[200] * df[500] * df[1000]

    fig = plt.figure(figsize=(7, 4.5), dpi=300);
    pplt.Manhattan(data=sort(df.rename(columns={'comb': '200*500*1000'})), fig=fig, markerSize=2, ticksize=8,
                   sortedAlready=True);
    [pplt.setSize(ax, 5) for ax in fig.get_axes()];
    plt.gcf().subplots_adjust(bottom=0.15);
    plt.savefig(utl.paperPath + 'new/{}.pdf'.format('SNPbased'))
    pplt.Genome(df.comb);
    plt.tight_layout(pad=0.1)

    # analyzie()
    # scanSFS()
    # outlier()
    # scanSFSSNPbased()
    a = df.comb
    o = localOutliers(a, q=0.9);
    fig = plt.figure(figsize=(7, 1.5), dpi=300);
    pplt.Manhattan(data=a, Outliers=pd.DataFrame(o), fig=fig, markerSize=2, ticksize=8, sortedAlready=True);
    [pplt.setSize(ax, 5) for ax in fig.get_axes()];
    plt.gcf().subplots_adjust(bottom=0.15);
    plt.savefig(utl.paperPath + 'new/{}.pdf'.format('SNPbased.candidates'))

    # Absolute scores plus column 'i': a running 0-based position within each group of index level 0.
    Scores = pd.concat([scores.rename('scores').abs(), scores.groupby(level=0).apply(
        lambda x: pd.Series(range(x.size), index=x.loc[x.name].index)).rename('i')], axis=1)
    cutoff = FDR(o, Scores);

    # Keep only cutoff rows with a positive row sum; dropna() then keeps rows present in both frames.
    a = pd.concat([df, cutoff[cutoff.sum(1) > 0]], axis=1).dropna();
    for fdr in [0.95, 0.99, 0.999]:
        o = a[a.comb > a[fdr]]
        fig = plt.figure(figsize=(7, 1.5), dpi=300);
        pplt.Manhattan(data=df.comb, Outliers=pd.DataFrame(o), fig=fig, markerSize=2, ticksize=8, sortedAlready=True);
        [pplt.setSize(ax, 5) for ax in fig.get_axes()];
        plt.gcf().subplots_adjust(bottom=0.15);
        plt.savefig(utl.paperPath + 'new/{}.pdf'.format('SNPbased.fdr{}'.format(fdr)))
示例#15
0
文件: Markov.py 项目: airanmehr/bio
def plotNull(subp, nu0=0.005, fontsize=5):
    """Draw three panels (tau = 1, 10, 100) comparing Brownian, Markov-chain and observed distributions.

    Parameters
    ----------
    subp : sequence of three ints
        Positions inside a 3x3 ``plt.subplot`` grid for the three panels. Each value minus
        one also indexes ``subptitle`` (a name defined outside this function) for the title.
    nu0 : float
        Selects the observation pickle (``'markov/neutral.obs.<nu0>.pkl'``), the row of the
        transition table and the row of plot settings. Settings exist for 0.005 and 0.1 only.
    fontsize : int
        Font size for annotations, titles, the legend and ``pplt.setSize``.

    The transition table is ``Markov.computeTransition(0, N=1000)``. Nothing is saved or shown here.
    """
    obs = pd.read_pickle(utl.outpath + 'markov/neutral.obs.{}.pkl'.format(nu0))
    T = Markov.computeTransition(0, N=1000)

    # Per-(nu0, tau) plot settings: Brownian curve scale factor and axis limits.
    dfplt = pd.concat([pd.Series({'scale': 10, 'xlim': [0.0, 0.01], 'ylim': [0, 1]}, name=(0.005, 1)),
                       pd.Series({'scale': 30, 'xlim': [0.06, 0.14], 'ylim': [0, 0.15]}, name=(0.1, 1)),
                       pd.Series({'scale': 30, 'xlim': [0.0, 0.015], 'ylim': [0, 0.3]}, name=(0.005, 10)),
                       pd.Series({'scale': 45, 'xlim': [0.0, 0.2], 'ylim': [0, 0.025]}, name=(0.1, 10)),
                      pd.Series({'scale':30, 'xlim':[0.0,0.03],'ylim': [0,0.2]},name=(0.005,100)),pd.Series({'scale':50, 'xlim':[0.00,0.4],'ylim': [0,0.004]},name=(0.1,100))
                         ],axis=1).T

    markov=T.loc[nu0].copy(True);markov.name='Markov Chain'
    xx=np.arange(0,1,0.00001)
    # NOTE(review): if `stats` is scipy.stats, the second argument of norm() is the standard
    # deviation (scale), while the value passed is named sig2 -- confirm which is intended.
    N=200; tau=1;h=2*nu0*(1-nu0);sig2=h*tau/N;brownian=stats.norm(nu0, sig2).pdf(xx);
    # Normalise the curve to sum 1 over the grid, then multiply by the per-panel scale factor.
    brownian=pd.Series(brownian,index=xx);brownian/=brownian.sum();brownian.name='Brownian Motion';brownian*=dfplt.loc[(nu0,tau)].scale
    # Issued before the first plt.subplot call, so this styles whatever axes is current on entry.
    pplt.setSize(plt.gca(), fontsize=fontsize)
    plt.subplot(3, 3, subp[0]);
    brownian.plot(color='r');
    markov.plot(color='b');
    # Observed distribution: relative frequency of each distinct value in obs.X[1].
    o=pd.Series(obs.X[1].flatten()).value_counts().sort_index();o=o/o.sum();
    if nu0==0.1:
        # For nu0 == 0.1 a 500-bin histogram is used instead; the extra divisor 4 looks hand-tuned (TODO confirm).
        counts,limits=np.histogram(obs.X[1].flatten(),bins=500,range=[0,1]);centers = 0.5*(limits[1:]+limits[:-1]);o=pd.Series(counts,index=centers);o=o/(obs.X.shape[1]*obs.X.shape[2]*4)
    o.plot(color='g')
    plt.xlim(dfplt.loc[(nu0, tau)].xlim);
    plt.ylim(dfplt.loc[(nu0, tau)].ylim);
    plt.locator_params(nbins=3)
    pplt.annotate(r'$s=0$, $\nu_0=${}, $\tau$={}'.format(nu0, tau), fontsize=fontsize)
    plt.ylabel(r'$P(\nu_\tau|\nu_0)$')
    # Advance the Markov curve from tau = 1 to tau = 10 and rebuild the Brownian curve.
    tau=10
    for _ in range(9):
        markov=markov.dot(T)
    N=200;h=2*nu0*(1-nu0);sig2=h*tau/N;brownian=stats.norm(nu0, sig2).pdf(xx)
    brownian=pd.Series(brownian,index=xx);brownian/=brownian.sum();brownian.name='Brownian Motion';
    brownian*=dfplt.loc[(nu0,tau)].scale
    # Still issued before the next plt.subplot call, so these style the first panel.
    pplt.setSize(plt.gca(), fontsize=fontsize)
    plt.title('({})'.format(subptitle[subp[0] - 1]), fontsize=fontsize)
    plt.subplot(3, 3, subp[1]);
    brownian.plot(color='r');
    markov.plot(color='b');
    o=pd.Series(obs.X[10].flatten()).value_counts().sort_index();o=o/o.sum();
    if nu0==0.1:
        counts,limits=np.histogram(obs.X[10].flatten(),bins=100,range=[0,1]);centers = 0.5*(limits[1:]+limits[:-1]);o=pd.Series(counts,index=centers);o=o/(obs.X.shape[1]*obs.X.shape[2]*20)
    o.plot(color='g')
    plt.xlim(dfplt.loc[(nu0, tau)].xlim);
    plt.ylim(dfplt.loc[(nu0, tau)].ylim);
    plt.locator_params(nbins=3)
    pplt.annotate(r'$s=0$, $\nu_0=${}, $\tau$={}'.format(nu0, tau), loc=1, fontsize=fontsize)
    pplt.setSize(plt.gca(), fontsize=fontsize)



    # Advance the Markov curve from tau = 10 to tau = 100 and rebuild the Brownian curve.
    tau=100
    for _ in range(90):
        markov=markov.dot(T)
    N=200;h=2*nu0*(1-nu0);sig2=h*tau/N;brownian=stats.norm(nu0, sig2).pdf(xx)
    brownian=pd.Series(brownian,index=xx);brownian/=brownian.sum();brownian.name='Brownian Motion';
    brownian*=dfplt.loc[(nu0,tau)].scale
    # Title of the second panel is set here, just before switching to the third.
    plt.title('({})'.format(subptitle[subp[1] - 1]), fontsize=fontsize)
    plt.subplot(3, 3, subp[2]);
    brownian.plot(color='r');
    markov.plot(color='b')
    o=pd.Series(obs.X[100].flatten()).value_counts().sort_index();o=o/o.sum();
    if nu0==0.1:
        counts,limits=np.histogram(obs.X[100].flatten(),bins=30,range=[0,1]);centers = 0.5*(limits[1:]+limits[:-1]);o=pd.Series(counts,index=centers);o=o/(obs.X.shape[1]*obs.X.shape[2]*60)
    o.name = 'Observation';
    o.plot(color='g')
    plt.xlim(dfplt.loc[(nu0, tau)].xlim);
    plt.ylim(dfplt.loc[(nu0, tau)].ylim);
    plt.locator_params(nbins=3)
    pplt.annotate(r'$s=0$, $\nu_0=${}, $\tau$={}'.format(nu0, tau), loc=1, fontsize=fontsize)
    # The legend is drawn only when the third panel sits at grid position 3.
    if subp[2] == 3:
        plt.legend(loc='center right', fontsize=fontsize)
    pplt.setSize(plt.gca(), fontsize=fontsize)
    plt.title('({})'.format(subptitle[subp[2] - 1]), fontsize=fontsize)
示例#16
0
文件: Plot.py 项目: airanmehr/bio
def plotPowerCLRQ(recompute=False):
    dpi = pplt.PLOS.dpi;
    fontsize = 7
    sns.set_style("whitegrid", {"grid.color": "0.9", 'axes.linewidth': .5, "grid.linewidth": ".09"})
    if recompute:
        a = f(loadHMMAllDepths());
        a = a[a.index.get_level_values('coverage') != np.inf]
        Q = np.sort(np.append(np.arange(0, 1.01, 0.1), 0.9 + np.arange(0, 1, 0.1)[1:] / 10))
        # Q = [0, 0.5,0.9,0.95,0.96,0.97,0.98, 0.99, 1]
        df = pd.concat(map(lambda q: a.groupby(level=range(6)).apply(lambda x: x[x >= x.quantile(q)].mean()), Q),
                       axis=1)
        dfa = pd.concat(map(lambda q: a.abs().groupby(level=range(6)).apply(lambda x: x[x >= x.quantile(q)].mean()), Q),
                        axis=1)
        df.columns = pd.MultiIndex.from_product([Q, [False]], names=['Quantile', 'ModifiedLR'])
        df = df.stack(df.columns.names).reorder_levels([0, 6, 7] + range(1, 6))
        dfa.columns = pd.MultiIndex.from_product([Q, [True]], names=['Quantile', 'ModifiedLR'])
        dfa = dfa.stack(dfa.columns.names).reorder_levels([0, 6, 7] + range(1, 6))
        df = pd.concat([df, dfa])
        df.to_pickle(utl.outpath + 'ROC/PowerCLRTable.df')
        df = df[df.index.get_level_values("coverage") != np.inf]
        boot = pd.DataFrame([np.sort(np.random.choice(1000, 250, replace=False)) for _ in range(100)]).T;
        print boot
        dfboot = boot.groupby(level=0, axis=1).apply(
            lambda x: getPower(df.loc[pd.IndexSlice[:, :, :, :, :, :, :, x[x.name].values]].sort_index(),
                               groupbyLevels=range(6)).xs('HMM', level='method')).groupby(level=range(4)).mean();
        print dfboot
        dfboot.columns.name = 'i'
        dfboot = dfboot.stack('i').reset_index(['i', 'ModifiedLR', 'Quantile']);
        print dfboot
        dfboot.to_pickle(utl.outpath + 'ROC/PowerCLRTableBootstrap.df')
    else:
        df = pd.read_pickle(utl.outpath + 'ROC/PowerCLRTable.df')
        dfboot = pd.read_pickle(utl.outpath + 'ROC/PowerCLRTableBootstrap.df')

    dfboot.Quantile = (dfboot.Quantile * 100).astype(int)
    sns.set_context(rc={"lines.linewidth": 0.5})
    pistar = {}
    ABCD = map(lambda x: '({})'.format(x), list('ABCDEFG'))

    fig, axes = plt.subplots(2, 3, sharey=True, sharex=True, figsize=(6, 2.5), dpi=dpi);
    j = 0
    for nu0, axs in zip([0.005, 0.1], axes):
        for depth, ax in zip([30, 100, 300], axs):
            a = dfboot.loc[depth].loc[nu0]
            if nu0 == 0.005: ax.set_title(r'$\lambda$={}'.format(str(depth)).replace('inf', '$\infty$'))
            sns.tsplot(data=a, time='Quantile', unit='i', value=0, condition='ModifiedLR', ci=99.99, legend=False,
                       color=['r', 'darkblue'], ax=ax)
            pistar.update({ax: (ABCD[j], r'($\pi^*=${})'.format(a.groupby('Quantile')[0].mean().idxmax()))})
            pplt.setSize(ax, fontsize)
            ax.set_xlabel('');
            ax.set_ylabel('')
            j += 1
    axes[0][0].locator_params(nbins=3);
    for ax in axes[1]: ax.set_xlabel(r'$\pi$')
    for ax in [axes[0][0], axes[1][0]]: ax.set_ylabel('Avg. Power\n({} Sweep)'.format(('Soft', 'Hard')[nu0 == 0.005]))
    plt.gcf().subplots_adjust(bottom=0.2)
    # [pplt.annotate(v[1],ax=k,fontsize=fontsize) for k,v in pistar.items() ]
    [pplt.annotate('({})'.format(list('ABCDEF')[j]), ax=x, fontsize=7) for j, x in enumerate(axes.reshape(-1))]
    [pplt.annotate(v[1], ax=k, fontsize=7, xpad=0.6) for x, (k, v) in zip(ABCD, pistar.items())]
    axes[1][-1].legend([r'$\mathcal{H}$', '$\mathcal{H}^+$'], loc='lower right', prop={'size': fontsize})

    mpl.rc('font', **{'family': 'serif', 'serif': ['Computer Modern'], 'size': fontsize});
    mpl.rc('text', usetex=True)
    pplt.savefig('CLRQ', dpi)
    plt.show()
示例#17
0
文件: PowerROC.py 项目: airanmehr/bio
# Script: draw a two-panel ROC figure from evl.randomROCData() and save it as 'powerROC'.
# NOTE(review): pd, mpl and plt are used below but are not imported in the visible lines --
# presumably imported earlier in the file; confirm.
pd.options.display.max_rows = 20;
pd.options.display.expand_frame_repr = False

import popgen.TimeSeries.RNN.Evaluate as evl
import seaborn as sns
import popgen.Plots as pplt

df = evl.randomROCData()
sns.set_style("white", {"grid.color": "0.9", 'axes.linewidth': .5, "grid.linewidth": "9.99"})
sns.set_context("notebook", font_scale=1, rc={"lines.linewidth": 1})
mpl.rc('font', **{'family': 'serif', 'serif': ['Computer Modern'], 'size': 6});
mpl.rc('text', usetex=True)

# Python 2 builtin reload; df was already produced by the previously loaded module.
reload(evl)
dpi = 300
plt.figure(figsize=(4, 2), dpi=dpi)
# Left panel: evl.plotROC with FPth=1.
plt.subplot(1, 2, 1)
evl.plotROC(df, FPth=1)
plt.ylabel('True Positive Rate (TPR)')
plt.xlabel('False Positive Rate (FPR)')
pplt.setSize(plt.gca())
# Right panel: evl.plotROC with its default FPth; only this panel gets the legend.
plt.subplot(1, 2, 2)
evl.plotROC(df)
plt.xlabel('False Positive Rate (FPR)')
pplt.setSize(plt.gca())
plt.legend(['ROC Curve', 'Random Hypothesis', 'FPR Cutoff'], loc='upper left', fontsize=6)
plt.tight_layout(pad=0.1)
pplt.savefig('powerROC', dpi)
plt.show()
示例#18
0
# Grid of nu values from 0 to 1 in steps of 0.001, stored as a Series indexed by its own values.
# The 1.00001 stop makes np.arange include the endpoint 1.0. `bio` below reads this as a global.
nu = pd.Series(np.arange(0, 1.00001, 0.001), index=np.arange(0, 1.00001, 0.001))


def bio(cd, freqs=None):
    """Evaluate the binomial term C(d, c) * nu**c * (1 - nu)**(d - c) over a grid of nu.

    Parameters
    ----------
    cd : sequence of two numbers
        The pair ``(c, d)``; it is unpacked as ``c, d = cd``.
    freqs : scalar or array-like supporting ``**`` and ``1 - x``, optional
        Values of nu at which to evaluate the term. When omitted, the
        module-level ``nu`` grid is used, which is the original behaviour.

    Returns
    -------
    Same kind of object as the grid (e.g. a pandas Series for the default
    ``nu``), holding the un-normalised term for every grid value.
    """
    # scipy.misc.comb was deprecated in SciPy 1.0 and later removed;
    # scipy.special.comb is the supported replacement with the same default
    # (floating-point, exact=False) behaviour.
    from scipy.special import comb

    c, d = cd
    grid = nu if freqs is None else freqs
    return comb(d, c) * (grid ** c) * ((1 - grid) ** (d - c))


# Script: evaluate bio() on the nu grid for three (c, d) pairs, normalise each curve to sum to 1,
# and plot the three curves in one figure saved as 'statePosterior'.
cd = np.array([1, 5])
a = bio(cd);
a /= a.sum()
# In-place scaling of the array: cd becomes [10, 50], then [100, 500] further down.
cd *= 10;
b = bio(cd);
b /= b.sum()
cd *= 10;
c = bio(cd);
c /= c.sum()
dpi = 300
plt.figure(figsize=(4, 2), dpi=dpi)

# One column per (c, d) pair; rows follow the index of the bio() results.
df = pd.DataFrame([a, b, c]).T
# Bare expression: has no effect when this runs as a script (looks like notebook display residue).
df
df.plot(ax=plt.gca())
# Legend entries are given in column order and match the three (c, d) pairs used above.
plt.legend([r'Pr($\nu |c=1,d=5$)', r'Pr($\nu |c=10,d=50$)', r'Pr($\nu|c=100,d=500$)'], fontsize=6)
plt.xlabel(r'$\nu$')
plt.ylabel(r'Pr($\nu|c,d$)')
# Lower the y-axis floor slightly below zero while keeping the current upper limit.
plt.ylim([-0.0005, plt.ylim()[1]])
pplt.setSize(plt.gca(), 6)
plt.gcf().subplots_adjust(bottom=0.15)
pplt.savefig('statePosterior', dpi)
plt.show()
示例#19
0
def Final():
    """Run the candidate-region / GO-term analysis and write its figures and tables.

    Steps, in the order they appear in the body:
      1. Load GO/gene data, scores and annotations through project helpers.
      2. Scan the genome in 30000-sized windows, pick local outliers (q=0.99),
         draw and save a Manhattan plot, and turn the outliers into intervals.
      3. Build a per-GO-term table from utl.getPvalFisher, keep rows with
         '-log($p$-value)' >= 3 and Hits >= 3, and write one LaTeX/CSV table
         per ontology.
      4. Write candidate and all-SNP files under 'real/gowinda/', then read a
         Gowinda result file and write tables derived from it.
      5. Print and write a small table of summary counts.

    Takes no arguments. The body shown contains no ``return`` statement;
    results are printed and written to files under ``utl.paperPath`` and
    ``utl.outpath``.

    NOTE(review): the body uses Python 2 constructs (``print`` statements,
    ``[6] + range(6)``), so it does not run unchanged on Python 3.
    """
    ############ preparing data
    def saveGOTex(df):
        # Write one ontology's rows as a CSV (under /data/) and a LaTeX table (under /tables/).
        # The file name is the first unique index value of `df`.
        name = np.unique(df.index)[0]
        print '*' * 80, name
        # NOTE(review): the caller converts this column to str before calling, so this sort is lexicographic.
        df = df.sort_values('-log($p$-value)', ascending=False)
        df['Rank'] = range(1, df.shape[0] + 1);
        # Move column 6 to the front; this is the new 'Rank' column only if `df` had exactly six columns.
        df = df.iloc[:, [6] + range(6)]
        path = utl.paperPath + '/tables/{}.tex'.format(name);
        df.to_csv(path.replace('.tex', '.csv').replace('/tables/', '/data/'))
        # The LaTeX table omits the last column; the CSV above keeps every column.
        utl.DataframetolaTexTable(df.iloc[:, :-1], alignment=['c', 'c', 'p{3in}', 'c', 'c', 'c'], fname=path)

    # `allVariantGenes` and `g` are assigned further down; the lambda only looks them up when it is called.
    goPvalue = lambda x: utl.getPvalFisher(AllGenes=allVariantGenes.values, putativeList=x.values,
                                           myList=g.index.values)
    # From a getPvalFisher result: first element rounded to 1 decimal and capped at 6, plus the
    # ('Putative', 'myList') cell of the second element.
    unpackp = lambda x: [min(6, np.round(x[0], 1)), x[1].loc['Putative', 'myList']]
    # Score = lambda x,f:f(scores.loc[x.CHROM][(scores.loc[x.CHROM].index>=x.start)&(scores.loc[x.CHROM].index<=x.end)])
    # Reorder rows by CHROM as X, 2L, 2R, 3L, 3R (other CHROM values are dropped) and rename columns H and M.
    sort = lambda df: pd.concat(
            [df[df.index.get_level_values('CHROM') == ch] for ch in ['X', '2L', '2R', '3L', '3R']]).rename(
        columns={'H': r'$\mathcal{H}^+$', 'M': 'Num. of Variants'})
    Genes = loadGeneData().reset_index().set_index('GO')
    # Keep only GO ids that have more than two distinct FBgn values.
    Genes = Genes.loc[
        (Genes['FBgn'].groupby(level=0).apply(lambda x: len(x.unique())) > 2).replace({False: None}).dropna().index]
    scores = utl.getEuChromatin(rutl.loadScores(skipHetChroms=True))
    # Inner join of scores with the annotation table; 'Gene_ID' values from the join form the background list.
    ann = pd.DataFrame(scores).join(loadANN(), how='inner')
    allVariantGenes = ann['Gene_ID'].drop_duplicates()
    # f=lambda x: x[x>=x.quantile(0.9)].mean()
    # geneScores=ann.reset_index().set_index('Gene_ID')[['CHROM','POS',0]].drop_duplicates().groupby(level=0)[0].apply(f)


    ############ computing candidate regions
    # Per window: H = mean absolute score, M = number of entries in the window.
    scan = utl.scanGenome(scores.abs(), {'H': lambda x: x.abs().mean(), 'M': lambda x: x.size}, winSize=30000)
    o = utl.localOutliers(scan.H, q=0.99);
    o = scan.loc[o.index]
    fig = plt.figure(figsize=(7, 2.5), dpi=300);
    pplt.Manhattan(data=sort(scan), Outliers=sort(o), fig=fig, markerSize=2, ticksize=8, sortedAlready=True);
    [pplt.setSize(ax, 6) for ax in fig.get_axes()];

    pplt.annotate('(A)', ax=fig.axes[0], fontsize=8)
    pplt.annotate('(B)', ax=fig.axes[1], fontsize=8)
    plt.gcf().subplots_adjust(bottom=0.15);
    # The figure is saved twice: through the project helper and directly as a PDF.
    pplt.savefig('manhattan', 300)
    plt.savefig(utl.paperFiguresPath + 'manhattan.pdf')
    regions = utl.BED.getIntervals(o.H, padding=30000);
    print regions.shape
    # Distinct 'name' values from intersecting the annotations with the intervals, indexed by 'name'.
    intervalGenes = utl.BED.intersection(ann, regions).name.drop_duplicates().reset_index().set_index('name');
    print intervalGenes.size
    g = intervalGenes;
    # intervalGenes
    # g=g[g>=g.quantile(0.)];
    print g.size
    # One single-row frame per GO id, holding the seven fields named in `index=` below.
    df = Genes.groupby(level=0).apply(lambda x: pd.DataFrame(
        [x.name, x.term.iloc[0]] + unpackp(goPvalue(x.FBgn.drop_duplicates())) + [x.ontology.iloc[0],
                                                                                  x.FBgn.unique().size] + [
            np.intersect1d(x.values, g.index.values)],
        index=['GO ID', 'GO Term', '-log($p$-value)', 'Hits', 'Ontology', 'Num of Genes', 'Genes']).T)
    df = df[(df['-log($p$-value)'] >= 3) & (df.Hits >= 3)]
    # Stringified after the numeric filter above and before saveGOTex is applied.
    df['-log($p$-value)'] = df['-log($p$-value)'].astype(str)
    df = df.set_index('Ontology')
    df.groupby(level=0).apply(saveGOTex);
    print df

    # Gene table for the two hard-coded GO ids; the .loc lookups raise if either id is missing from `df`.
    tempGenes = Genes.reset_index().set_index('FBgn').loc[
        np.append(df.set_index('GO ID').loc['GO:0009631'].Genes, df.set_index('GO ID').loc['GO:0009408'].Genes)][
        ['term', 'name', 'GO']].reset_index().set_index('GO').loc[['GO:0009631', 'GO:0009408']].drop_duplicates()
    tempGenes.columns = ['FlyBase ID', 'GO Term', 'Gene Name']
    utl.DataframetolaTexTable(tempGenes, fname=utl.paperPath + '/tables/{}.tex'.format('tempGenes'),
                              alignment=['l', 'l', 'l'])


    regions.to_csv(utl.paperPath + 'data/intervals.csv')

    # NOTE(review): `snps` is built here but not referenced again in the rest of the body shown.
    snps = utl.BED.intersection(scores.reset_index(), regions, 0);
    snps['POS'] = snps.start;
    snps.set_index('POS', append=True, inplace=True)
    snps = snps['name'].astype(float).reset_index().drop_duplicates().set_index(['CHROM', 'POS']).name

    def ff(x):
        # For the interval rows in `x`: take the intersecting score values as floats, keep the
        # positive ones, then keep those at or above their 0.9 quantile.
        y = utl.BED.intersection(scores.reset_index(), x, 0).rename(columns={'start': 'POS'}).set_index('POS',
                                                                                                append=True).name.astype(
            float)
        y = y[y > 0]
        y = y[y >= y.quantile(0.9)]
        print x['len'].iloc[0], y.size
        return y

    cands = regions.reset_index().groupby(level=0).apply(ff).reset_index(level=0).name
    # Tab-separated, header-less files for the candidates and for all scores, written under 'real/gowinda/'.
    cands.sort_index().reset_index().drop_duplicates().dropna().to_csv(utl.outpath + 'real/gowinda/cands.final.txt',
                                                                       sep='\t', header=None, index=False)
    scores.sort_index().reset_index().drop_duplicates().dropna().to_csv(utl.outpath + 'real/gowinda/allsnps.txt',
                                                                        sep='\t', header=None, index=False)

    name = 'cands.final.out.tsv'
    # NOTE(review): absolute, user-specific path, unlike the utl.outpath-based writes above. The file is
    # not created in this function, so it must already exist when this line runs.
    gowinda = pd.read_csv('/home/arya/out/real/gowinda/{}'.format(name), sep='\t', header=None)[[0, 4, 5, 6, 7, 8, 9]]
    gowinda.columns = ['GO ID', '-log($p$-value)', 'Hits', 'Num of Genes', 'Total Genes', 'GO Term', 'Genes']
    gowinda = gowinda[gowinda.Hits >= 3]
    # The column is read as a raw value and replaced here by -log10 of it, rounded to one decimal.
    gowinda['-log($p$-value)'] = -gowinda['-log($p$-value)'].apply(np.log10).round(1)
    gowinda.to_csv(utl.paperPath + 'data/gowinda.all.tsv', sep='\t')
    # Restrict the Gowinda rows to GO ids whose ontology in `Genes` is 'biological_process'.
    bp = gowinda.set_index('GO ID').loc[
        Genes[Genes.ontology == 'biological_process'].index.unique().rename('GO ID')].dropna()
    bp.to_csv(utl.paperPath + 'data/gowinda.bp.tsv', sep='\t')
    utl.DataframetolaTexTable(bp.reset_index()[['GO ID', 'GO Term', '-log($p$-value)']], alignment=['c', 'p{4in}', 'c'],
                              fname=utl.paperPath + 'tables/gowinda.tex')

    # NOTE(review): the value of this expression is discarded (looks like interactive-session residue).
    map(len, (Genes.index.unique(), bp.index.unique(), df.loc['biological_process']['GO ID'].unique())), len(
        np.intersect1d(bp.index.unique(), df['GO ID'].unique()))

    # getPvalFisher over three sets of biological_process GO ids: all of them, those in `bp`, and those in `df`.
    pval = utl.getPvalFisher(Genes[Genes.ontology == 'biological_process'].index.unique(), bp.index.unique(),
                             df.loc['biological_process']['GO ID'].unique())
    print pval

    # Summary counts, formatted below as integers with thousands separators.
    stats = pd.Series(None, name='Value')

    # NOTE(review): 'Vatiants' looks like a typo for 'Variants'; this label is written into tables/stats.tex.
    stats['Num. of Vatiants'] = scores.size
    stats['Num. of Candidate Intervals'] = regions.shape[0]
    stats['Total Num. of Genes'] = loadGeneCoordinates().shape[0]
    stats['Num. of Variant Genes'] = ann['Gene_ID'].unique().shape[0]
    stats['Num. of Genes within Candidate Intervals'] = intervalGenes.shape[0]
    stats['Total Num. of GO'] = len(loadGeneData().index.unique())
    stats['Num. of GO with 3 or More Genes'] = len(Genes.index.unique())
    stats['Num. of Candidate Variants for Gowinda'] = cands.size
    stats = stats.apply(lambda x: '{:,.0f}'.format(x))
    stats.index.name = 'Statistic'
    print stats
    utl.DataframetolaTexTable(stats.reset_index(), fname=utl.paperPath + 'tables/stats.tex', alignment=['l', 'r'])