Пример #1
0
def test_multi_pvalcorrection():
    """Compare corrected p-values with reference values from R's multtest.

    The reference table ``res_multtest`` is expected to hold the output of
    ``mt.rawp2adjp`` for ``pval0``; each correction method is checked against
    the column listed for it in ``rmethods``.
    """
    # R method name -> (column in res_multtest, method name used here)
    rmethods = {
        "rawp": (0, "pval"),
        "Bonferroni": (1, "b"),
        "Holm": (2, "h"),
        "Hochberg": (3, "sh"),
        "SidakSS": (4, "s"),
        "SidakSD": (5, "hs"),
        "BH": (6, "fdr_i"),
        "BY": (7, "fdr_n"),
    }
    checked_methods = ["b", "s", "sh", "hs", "h", "fdr_i", "fdr_n"]
    # Fixed permutation applied to our results before the element-wise
    # comparison (presumably the row order of the R output -- TODO confirm).
    r_sortindex = [6, 8, 9, 7, 5, 1, 2, 4, 0, 3]

    for column, method in rmethods.values():
        if method not in checked_methods:
            # the raw p-values ("pval") are not a correction method
            continue
        corrected = multipletests(pval0, alpha=0.1, method=method)[1]
        assert_almost_equal(corrected[r_sortindex], res_multtest[:, column],
                            15)

    # The standalone FDR function is only compared after sorting.
    for fdr_method, column in (("n", 7), ("i", 6)):
        sorted_corrected = np.sort(fdrcorrection0(pval0, method=fdr_method)[1])
        assert_almost_equal(sorted_corrected, res_multtest[:, column], 15)
Пример #2
0
def run_script(data_file, options):
    """Print FDR-corrected over/under results for every gene in ``data_file``.

    Each line is split on whitespace. Lines whose first field is ``'---'``
    (the header) and blank lines are ignored, and records whose first field
    contains ``'|'`` or ``';RP'`` are skipped. The remaining records are
    parsed with ``out_line``; the last element of the parser's ``over`` and
    ``under`` attributes is collected as the p-value and corrected with
    ``mpt.fdrcorrection0`` (alpha=0.05).

    One line per gene is printed for the "over" results, followed by one
    line per gene for the "under" results, with the fields: gene, ``obs``
    value, label, reject flag, corrected p-value, first element of the
    over/under record.

    :param data_file: path of the whitespace-delimited input file
    :param options: unused; kept for interface compatibility
    """
    accepted = 0
    genes, overs, unders, obs = [], [], [], {}
    pm_under, pm_over = [], []
    dex = None

    # The context manager guarantees the file is closed, even on errors.
    with open(data_file) as handle:
        for raw_line in handle:
            line = raw_line.split()
            if not line or line[0] == '---':
                # blank line or header: nothing to parse
                continue
            if accepted == 0:
                # (re)build the parser until the first record is accepted
                dex = out_line(line)
            if '|' in line[0] or ';RP' in line[0]:
                continue

            dex.add_line(line)
            genes.append(dex.gene)
            obs[dex.gene] = dex.OR
            overs.append(dex.over)
            unders.append(dex.under)
            pm_under.append(dex.under[-1])
            pm_over.append(dex.over[-1])
            accepted += 1

    over_fdr_bool, over_fdr_res = mpt.fdrcorrection0(pm_over, alpha=0.05)
    under_fdr_bool, under_fdr_res = mpt.fdrcorrection0(pm_under, alpha=0.05)

    # Single-argument print calls produce the same space-separated output on
    # both Python 2 and Python 3.
    for g, b, f, d in zip(genes, over_fdr_bool, over_fdr_res, overs):
        print("%s %s over %s %s %s" % (g, obs[g], b, f, d[0]))

    for g, b, f, d in zip(genes, under_fdr_bool, under_fdr_res, unders):
        print("%s %s under %s %s %s" % (g, obs[g], b, f, d[0]))
Пример #3
0
def tabulate_week_to_week0_paired_stats(df, sample_type, metric, test_fn=scipy.stats.wilcoxon):
    """Tabulate, per week, a paired test of ``metric`` against week 0.

    Only samples of ``sample_type`` in the 'autism' group are used. For each
    distinct week, rows of week 0 and of that week are grouped by
    'SubjectID', differenced within each subject, and the non-missing
    differences of ``metric`` are handed to ``test_fn``.

    ``test_fn`` takes the differences and returns ``(statistic, p-value)``;
    an alternative is ``partial(scipy.stats.ttest_1samp, popmean=0)``.

    Returns a DataFrame indexed by week with columns 'n', ``metric`` (the
    median difference), 'test-statistic', 'p-value' and 'q-value'.
    """
    asd_data = filter_sample_md(df, [('SampleType', sample_type), ('Group', 'autism')])
    asd_data['week'] = pd.to_numeric(asd_data['week'], errors='coerce')
    asd_data = asd_data.sort_values(by='week')
    weeks = asd_data['week'].unique()

    rows = []
    for week in weeks:
        in_pair = (asd_data['week'] == 0) | (asd_data['week'] == week)
        per_subject = asd_data[in_pair].groupby('SubjectID')
        paired_diffs = per_subject.diff()[metric].dropna()
        statistic, p_value = test_fn(paired_diffs)
        rows.append((len(paired_diffs), np.median(paired_diffs), statistic, p_value))

    result = pd.DataFrame(rows, index=pd.Index(weeks, name='week'),
                          columns=['n', metric, 'test-statistic', 'p-value'])

    # Only finite p-values are corrected; the rest keep a NaN q-value. The
    # masking follows https://github.com/statsmodels/statsmodels/issues/2899
    pvals = result['p-value'].values
    finite = np.isfinite(pvals)
    qvals = np.full(len(pvals), np.nan)
    qvals[finite] = fdrcorrection0(pvals[finite])[1]

    result['q-value'] = qvals
    return result
Пример #4
0
def findBestKnockdownUsingMarginalLinearRegression(X, y, knockdownZeros):
    """Pick the feature whose knockdown has the largest significant effect.

    Every column of ``X`` is regressed separately against ``y[:, 0]``. The
    objective value of a feature is its slope times the shift from the
    column mean to ``knockdownZeros[feat]``. The regression p-values are
    FDR-corrected (alpha=0.05); features that are not rejected get an
    objective value of 0.

    Returns ``(best_knockdown, best_objval, obj_vals)`` where
    ``best_knockdown`` is the index of the rejected feature with the largest
    positive objective value, or ``None`` if there is none.
    """
    n_features = X.shape[1]
    obj_vals = []
    p_vals = []
    for feat in range(n_features):
        column = X[:, feat]
        slope, _, _, p_value, _ = stats.linregress(column, y[:, 0])
        obj_vals.append(slope * (knockdownZeros[feat] - np.mean(column)))
        p_vals.append(p_value)

    rejected, _ = mcp.fdrcorrection0(p_vals, alpha=0.05)

    best_knockdown = None
    best_objval = 0.0
    for feat, is_rejected in enumerate(rejected):
        if not is_rejected:
            # not significant after correction: zero out its effect
            obj_vals[feat] = 0
        elif obj_vals[feat] > best_objval:
            best_objval = obj_vals[feat]
            best_knockdown = feat

    if best_knockdown is None:
        print("warning (indep. marg. regr.): no good knockdown identified")
    return (best_knockdown, best_objval, obj_vals)
Пример #5
0
def tabulate_week_to_week0_paired_stats(df,
                                        sample_type,
                                        metric,
                                        test_fn=scipy.stats.wilcoxon):
    """Run a paired week-vs-week-0 test of ``metric`` for every week.

    Restricted to 'autism' group samples of ``sample_type``. Within each
    'SubjectID', the rows of week 0 and of the current week are differenced;
    ``test_fn`` receives the non-missing differences of ``metric`` and must
    return ``(statistic, p-value)``. An alternative ``test_fn`` is
    ``partial(scipy.stats.ttest_1samp, popmean=0)``.

    The returned DataFrame is indexed by week and has the columns 'n',
    ``metric`` (median difference), 'test-statistic', 'p-value', 'q-value'.
    """
    selection = [('SampleType', sample_type), ('Group', 'autism')]
    asd_data = filter_sample_md(df, selection)
    asd_data['week'] = pd.to_numeric(asd_data['week'], errors='coerce')
    asd_data = asd_data.sort_values(by='week')
    weeks = asd_data['week'].unique()

    summary_rows = []
    for current_week in weeks:
        is_week0 = asd_data['week'] == 0
        is_current = asd_data['week'] == current_week
        subset = asd_data[np.logical_or(is_week0, is_current)]
        diffs = subset.groupby('SubjectID').diff()[metric].dropna()
        stat, pval = test_fn(diffs)
        summary_rows.append((len(diffs), np.median(diffs), stat, pval))

    column_names = ['n', metric, 'test-statistic', 'p-value']
    result = pd.DataFrame(summary_rows,
                          index=pd.Index(weeks, name='week'),
                          columns=column_names)

    # Non-finite p-values are excluded from the correction and keep a NaN
    # q-value; the masking is derived from
    # https://github.com/statsmodels/statsmodels/issues/2899
    pvals = result['p-value'].values
    usable = np.isfinite(pvals)
    qvals = np.repeat(np.nan, len(pvals))
    qvals[usable] = fdrcorrection0(pvals[usable])[1]

    result['q-value'] = qvals
    return result
Пример #6
0
def adjust_r(r, n=3539, **fdr_params):
    """FDR-correct the p-values of correlation coefficients.

    Each correlation ``r`` is converted to a squared t-statistic with
    ``n - 2`` degrees of freedom, and the regularized incomplete beta
    function turns that into a p-value, which is then FDR-corrected.

    :param r: correlation coefficient(s)
    :param n: number of observations the correlations were computed from
    :param fdr_params: extra keyword arguments forwarded to
        ``fdrcorrection0`` (e.g. ``alpha``, ``method``)
    :return: the ``(rejected, corrected p-values)`` result of
        ``fdrcorrection0``
    """
    from statsmodels.sandbox.stats.multicomp import fdrcorrection0
    # scipy.stats.betai was removed from SciPy; scipy.special.betainc is the
    # same function under its current name.
    from scipy.special import betainc
    df = n - 2
    t_squared = r * r * (df / ((1.0 - r) * (1.0 + r)))
    prob = betainc(0.5 * df, 0.5, df / (df + t_squared))
    # The keyword arguments were previously accepted but silently dropped.
    return fdrcorrection0(prob, **fdr_params)
Пример #7
0
def p_adj_map_from_scores(r, n=3539):
    '''Creates a p map with adjusted p values from scores (correlations)

    :param r: correlation coefficient(s)
    :param n: number of observations the correlations were computed from
    :return: the ``(rejected, corrected p-values)`` result of
        ``fdrcorrection0``
    '''
    # scipy.stats.betai was removed from SciPy; scipy.special.betainc is the
    # same function under its current name.
    from scipy.special import betainc
    df = n - 2
    # squared t-statistic of a correlation with df degrees of freedom
    t_squared = r * r * (df / ((1.0 - r) * (1.0 + r)))
    prob = betainc(0.5 * df, 0.5, df / (df + t_squared))
    return fdrcorrection0(prob)
Пример #8
0
def main(algo_sample=None,
         dataset_sample=None,
         tsv_file_name=os.path.join(constants.OUTPUT_GLOBAL_DIR, "emp_fdr",
                                    "MAX", "emp_diff_{}_{}_md.tsv")):
    """Select the FDR-significant rows by HG score and by empirical p-value.

    Reads the tab-separated table at
    ``tsv_file_name.format(dataset_sample, algo_sample)``, keeps rows with
    ``5 < n_genes < 500`` and applies Benjamini-Hochberg FDR (alpha=0.05)
    twice:

    * to the ``hg_pval`` column, which is converted with ``10 ** (-x)``
      before the correction and padded with zeros up to 7435 entries;
    * to the ``emp_pval`` column, where zeros are replaced by 1/1000.

    The cutoffs and the number of significant rows are printed.

    :param algo_sample: second value substituted into ``tsv_file_name``
    :param dataset_sample: first value substituted into ``tsv_file_name``
    :param tsv_file_name: path template with two ``{}`` placeholders
    :return: ``(sig_hg_genes, sig_emp_genes)``, two DataFrames (possibly
        empty) with the significant rows
    """
    # Fixed number of hypotheses used for the HG correction (hard-coded).
    n_total = 7435

    output_md = pd.read_csv(tsv_file_name.format(dataset_sample, algo_sample),
                            sep='\t',
                            index_col=0).dropna()
    output_md = output_md.rename(columns={"filtered_pval": "hg_pval"})
    n_genes = output_md["n_genes"].values
    filtered_genes = output_md.loc[
        (n_genes > 5) & (n_genes < 500),
        ["GO name", "hg_pval", "emp_pval", "passed_oob_permutation_test"]]

    print("total n_genes with pval:{}/{}".format(
        np.size(filtered_genes["hg_pval"].values), n_total))

    # Descending score order == ascending p-value order after 10 ** (-x).
    sorted_genes_hg = filtered_genes.sort_values(by=['hg_pval'],
                                                 ascending=False)
    hg_scores = sorted_genes_hg["hg_pval"].values
    padded_scores = np.append(hg_scores,
                              np.zeros(n_total - np.size(hg_scores)))
    sig_genes_hg_pval = [10**(-x) for x in padded_scores]
    fdr_results = fdrcorrection0(sig_genes_hg_pval,
                                 alpha=0.05,
                                 method='indep',
                                 is_sorted=False)
    n_hg_true = sum(1 for cur in fdr_results[0] if cur)
    # Always keep a DataFrame here; the fallback belongs on the cutoff, not
    # on the frame (the old code crashed on `.iloc` when nothing passed).
    sig_hg_genes = sorted_genes_hg.iloc[:n_hg_true, :]
    HG_CUTOFF = (10**(-sig_hg_genes.iloc[-1]["hg_pval"])
                 if n_hg_true > 0 else 0)
    print("HG cutoff: {}, n={}".format(HG_CUTOFF, len(sig_hg_genes.index)))

    sorted_genes_emp = filtered_genes.sort_values(by=['emp_pval'])
    sorted_genes_emp.loc[sorted_genes_emp['emp_pval'] == 0,
                         'emp_pval'] = 1.0 / 1000
    sig_genes_emp_pval = sorted_genes_emp["emp_pval"].values
    fdr_results = fdrcorrection0(sig_genes_emp_pval,
                                 alpha=0.05,
                                 method='indep',
                                 is_sorted=False)
    n_emp_true = sum(1 for cur in fdr_results[0] if cur)
    sig_emp_genes = sorted_genes_emp.iloc[:n_emp_true, :]
    EMP_CUTOFF = sig_emp_genes.iloc[-1]["emp_pval"] if n_emp_true > 0 else 0
    print("EMP cutoff: {}, n={}".format(EMP_CUTOFF, len(sig_emp_genes.index)))

    return sig_hg_genes, sig_emp_genes
Пример #9
0
def FDR(pvalues):
    """
    Compute FDR-adjusted p-values ("q-values").

    pvalues - list of p-values
    returns the q-values, one per input p-value
    """
    # The first element of the result (the reject flags) is not needed here.
    _, qvalues = fdrcorrection0(pvalues)
    return qvalues
Пример #10
0
def calc_sliding_fdr_correct(feat_df, unannotated_only=False):
    """Return p-values and FDR-corrected p-values of the sliding z-scores.

    Missing values in ``feat_df`` are replaced by 0.0 first. When
    ``unannotated_only`` is set, only rows whose ``annotated`` column is
    false are kept. The p-value is the upper tail of the standard normal
    distribution at ``sliding_zscore``.

    Returns a DataFrame with the columns 'sliding_pvalues' and
    'sliding_pvalues_fdrcor'.
    """
    filled = feat_df.fillna(0.0)
    df = filled[~filled.annotated] if unannotated_only else filled

    # one-sided: only large positive z-scores give small p-values
    zscores = df['sliding_zscore'].values
    df['sliding_pvalues'] = stats.norm.sf(zscores)
    df['sliding_pvalues_fdrcor'] = fdrcorrection0(df['sliding_pvalues'])[1]

    output_columns = ['sliding_pvalues', 'sliding_pvalues_fdrcor']
    return df[output_columns]
Пример #11
0
    def anova(self, sub=None, bottom=0, top=None, step=1, verbose=True):
        """Run a one-way ANOVA (omnibus test for a difference in means) per epoch, with mask as factor.

        The test is computed for every epoch in ``self.epochs``; the p-values
        of all epochs are then FDR-corrected together. ``bottom``, ``top``
        and ``step`` select which epochs are printed and returned.

        Args:
            sub: list of integers, specifies a subsample of the masks
                (defaults to all ``self.masks`` masks)
            bottom: lower bound of the selected epochs (inclusive)
            top: upper bound of the selected epochs (exclusive); a falsy
                value means ``self.epochs``
            step: step size between selected epochs
            verbose: when true, print a table with the per-mask means,
                F, p and corrected p of the selected epochs

        Returns:
            F statistics of the selected epochs
            p-values (uncorrected) of the selected epochs
            p-values (corrected) of the selected epochs
        """
        # a falsy `top` (None, but also 0) selects all epochs
        if not top:
            top = self.epochs
        assert [type(x) for x in [bottom, top, step]
                ] == [int, int, int
                      ], 'bottom, top, step must be of type int, got %s' % str(
                          [type(x) for x in [bottom, top, step]])

        # a falsy `sub` (None or empty) selects every mask
        if not sub:
            sub = range(self.masks)

        res = [
        ]  # one entry per epoch; each entry is the pair [F, p]
        for i in xrange(self.epochs):
            # self.data is indexed as [mask, :, epoch]
            args = np.split(
                self.data[sub, :, i], len(sub), axis=0
            )  # split the selection into one array per mask
            # drop the leading length-1 axis left over from the split
            args = [x.reshape((x.shape[1])) for x in args]
            F, p = stats.f_oneway(*args)  # F: test statistic, p: p-value
            res.append([F, p])

        _, p_corr_list = multicomp.fdrcorrection0(
            [x[1] for x in res], alpha=0.05, method='indep'
        )  # correct the p-values for multiple comparisons (method='indep')

        if verbose:
            # header row: one MEAN(...) column per selected mask
            print "ANOVA: %s" % self.name
            print "Epoch\t",
            for i in sub:
                label = self.labels[i]
                # labels are truncated to 9 characters
                print "MEAN(%s)\t" % label[0:9],
            print "F\tp\tSignif.\tp corr\tSignif"
            # one row per selected epoch
            for i in xrange(bottom, top, step):
                print "%.0f\t" % i,
                for j in sub:
                    print "%.4f\t\t" % self.mean[j, i],
                F, p = res[i]
                p_corr = p_corr_list[i]
                print "%.2f\t%.4f\t%s\t%.2f\t%s" % (F, p, p_symbol(p), p_corr,
                                                    p_symbol(p_corr))

        # the same bottom/top/step selection applies to the returned values
        return [x[0] for x in res][bottom:top:step], [
            x[1] for x in res
        ][bottom:top:step], p_corr_list[bottom:top:step]
Пример #12
0
def test_norm_intensity(df, samp_grps, paired, parametric):
    """
    run t-tests (or nonparametric tests) on dataframe.

    One test per row compares the intensity columns of the first sample
    group with those of the second; missing values are dropped per group.
    The p-values and their FDR-corrected counterparts (method 'indep') are
    appended to the fold-change dataframe.

    Note: zeros in ``df`` are replaced by NaN in place, so the caller's
    dataframe is modified.

    :param df: intensity df, missing values are NaN
    :param samp_grps: is a SampleGroups() object
    :param paired: whether or not to use a paired test
    :param parametric: Whether or not to use a parametric test
    :return: dataframe with appended pvalue columns
    """

    grp1_intcols = samp_grps.sample_names[samp_grps.grp_names[0]]
    grp2_intcols = samp_grps.sample_names[samp_grps.grp_names[1]]

    # change any zeros back to NaN
    df.replace(0, np.nan, inplace=True)

    # the tests run on a copy of the (zero-replaced) dataframe
    test_df = df.copy()

    if parametric:
        # NOTE(review): `paired` is passed as `equal_var`, which switches
        # between Student's and Welch's independent t-test; it does not
        # perform a paired test (that would be ttest_rel). Left unchanged
        # because fixing it alters results -- confirm the intent.
        def row_test(grp1, grp2):
            # sps.ttest_ind replaces the deprecated sps.stats.ttest_ind path
            return sps.ttest_ind(grp1, grp2, equal_var=paired)
    elif paired:
        # wilcoxon is non-parametric equivalent of paired t-test
        row_test = sps.wilcoxon
    else:
        # rank sum test is nonparametric equivalent of unpaired t-test
        row_test = sps.ranksums

    test_results = test_df.apply(
        lambda x: row_test(x[grp1_intcols].dropna(),
                           x[grp2_intcols].dropna()).pvalue,
        axis=1)

    # append fold changes to df
    df_means = log2_fold_change(df, samp_grps)

    # column names carry the comparison name (fc_name without its prefix)
    comparison = samp_grps.fc_name.replace("log2fc_", "")

    # p values, uncorrected for multiple comparisons
    df_means[P_COLNAME + "_" + comparison] = test_results

    # fdr correction
    df_means[P_CORR_COLNAME + "_" + comparison] = mc.fdrcorrection0(
        test_results, method='indep')[1]

    # reset the index to be 'id' - this is mostly for testing, and doesn't affect the output file
    df_means.set_index('id', drop=False, inplace=True)
    return df_means
Пример #13
0
def retain_relevant_slices(G_original, module_sig_th):
    """Keep the connected components ("slices") that pass the significance test.

    Every connected component of the module-level graph ``G_modularity`` is
    scored in parallel with ``pf_filter``; components for which it returns
    ``None`` are dropped. The scores of the remaining components are
    FDR-corrected (Benjamini-Hochberg, ``alpha=module_sig_th``) and only the
    rejected ones are retained.

    :param G_original: the original graph; only its size is used
    :param module_sig_th: FDR level used to accept a component
    :return: a 3-tuple ``(graph, modules, qvalues)``: the union of the
        retained components (an empty ``nx.Graph`` if none passed), the
        node list of each retained component, and the corrected scores of
        all components that survived ``pf_filter``. When no component
        survives ``pf_filter`` the result is ``(nx.Graph(), [], [])``.
    """
    n_pertubed_nodes = sum(
        1 for cur_node in G_modularity.nodes()
        if G_modularity.nodes[cur_node]["pertubed_node"])

    ccs = [
        G_modularity.subgraph(c) for c in connected_components(G_modularity)
    ]
    print(f"number of slices: {len(ccs)}")

    n_G_original = len(G_original)
    # share of perturbed nodes, inflated for small graphs and capped at 0.7
    perturbation_factor = min(0.7, (float(n_pertubed_nodes) / n_G_original) *
                              (1 + 100 / n_G_original**0.5))

    params = [[
        n_G_original, cur_cc, i_cur_cc, n_pertubed_nodes, perturbation_factor
    ] for i_cur_cc, cur_cc in enumerate(ccs)]

    p = multiprocessing.Pool(constants.N_OF_THREADS)
    try:
        res = [a for a in p.map(pf_filter, params) if a is not None]
    finally:
        # release the worker processes even when a worker raised
        p.close()
        p.join()
    print(f'# of slices after perturbation TH: {len(res)}/{len(params)}')

    if len(res) == 0:
        return nx.Graph(), [], []

    large_modules, sig_scores = zip(*res)
    fdr_bh_results = fdrcorrection0(sig_scores,
                                    alpha=module_sig_th,
                                    method='indep',
                                    is_sorted=False)

    passed_modules = [
        cur_cc
        for cur_cc, is_passed_th in zip(large_modules, fdr_bh_results[0])
        if is_passed_th
    ]
    if len(passed_modules) > 0:
        merged = nx.algorithms.operators.union_all(passed_modules)
    else:
        merged = nx.Graph()
    return merged, [list(m.nodes) for m in passed_modules], fdr_bh_results[1]
Пример #14
0
def p_adj_map_from_predictions(preds_pc, data_to_map):
    '''Creates a p map with adjusted p values from predictions

    Both inputs are standardized column-wise; the correlation of each pair
    of matching columns is converted to a p-value (squared t-statistic with
    ``n - 2`` degrees of freedom), and the p-values are FDR-corrected.

    :param preds_pc: predictions, one row per observation
    :param data_to_map: data with the same shape as ``preds_pc``
        (presumably -- the element-wise product requires compatible shapes)
    :return: the ``(rejected, corrected p-values)`` result of
        ``fdrcorrection0``
    '''
    from sklearn.preprocessing import StandardScaler
    # scipy.stats.betai was removed from SciPy; scipy.special.betainc is the
    # same function under its current name.
    from scipy.special import betainc
    mx = StandardScaler().fit_transform(preds_pc)
    my = StandardScaler().fit_transform(data_to_map)
    n = mx.shape[0]
    # 1.0 keeps this a float division on Python 2 as well, where 1/(n-1)
    # would truncate to 0
    r = (1.0 / (n - 1)) * ((mx * my).sum(axis=0))
    df = n - 2
    t_squared = r * r * (df / ((1.0 - r) * (1.0 + r)))
    prob = betainc(0.5 * df, 0.5, df / (df + t_squared))
    return fdrcorrection0(prob)
Пример #15
0
def save_heads(heads_path, t_data, p_data, sensor_type, freqs):
    """Save one topomap image per frequency into ``heads_path``.

    The p-values are FDR-corrected at level 0.3 over the whole array, and
    entries of ``t_data`` that are not rejected are overwritten in place
    with 0.001. One figure is then drawn with ``visualise`` for every column
    of ``t_data`` and written as ``<title>.png``.

    :param heads_path: output directory, created if missing
    :param t_data: 2-D array of statistics, one column per frequency;
        modified in place
    :param p_data: array of p-values with the same shape as ``t_data``
    :param sensor_type: passed through to ``visualise``
    :param freqs: frequency per column of ``t_data``
    """
    if not os.path.isdir(heads_path):
        os.makedirs(heads_path)

    flat_mask = fdrcorrection0(p_data.flatten(), 0.3)[0]
    significant = flat_mask.reshape(p_data.shape)
    # a small non-zero value, because topomap can't draw all-zero heads
    t_data[~significant] = 0.001

    n_freqs = t_data.shape[1]
    for col in range(n_freqs):
        title = visualise(t_data[:, col], p_data[:, col], sensor_type,
                          freqs[col])
        image_path = os.path.join(heads_path, title + '.png')
        plt.savefig(image_path)
        plt.close()
Пример #16
0
    def _fdr_correct(self, pvalue_matrix, dom_shape):
        """FDR-correct a p-value matrix and list the significant cells.

        NaN entries of ``pvalue_matrix`` are replaced by 1.0 in place before
        the correction. The corrected values are reshaped to ``dom_shape``
        and every cell below ``self.threshold`` is reported.

        Returns a list of ``(row, column, corrected p-value)`` tuples in
        row-major order.
        """
        logger.info('FDR correcting the pvalues')
        missing = np.isnan(pvalue_matrix)
        pvalue_matrix[missing] = 1.0
        flat_corrected = fdrcorrection0(pvalue_matrix.flatten(), self.threshold)[1]
        corrected = flat_corrected.reshape(dom_shape)

        n_rows, n_cols = dom_shape[0], dom_shape[1]
        return [(i, j, corrected[i, j])
                for i in range(n_rows)
                for j in range(n_cols)
                if corrected[i, j] < self.threshold]
Пример #17
0
    def calc(self, regs, hicmap):
        """
        Calculates weighted intensities, p-values and q-values (FDR-corrected p-values) for region intensity profiles.

        The results are appended to each region's ``pvalues``,
        ``corrected_pvalues`` and ``weighted`` lists; nothing is returned.

        :param hicmap: HiC map to run calculations for
        :param regs: list of Bins (with regions)
        """

        pvals = []
        ints = []
        for region in regs:
            pvals.append([])
            # one p-value per bin offset in [-num_bins, num_bins]
            for i in range(-self.num_bins, self.num_bins + 1):
                if hicmap.fits[i] is not None:
                    # The intensity profile is selected by the number of
                    # p-value arrays already stored on the region, and the
                    # position within it by the number of p-values collected
                    # so far for this region.
                    pvals[-1].append(
                        ss.exponweib.sf(
                            region.intensities[len(region.pvalues)][len(
                                pvals[-1])], *hicmap.fits[i]))
                else:
                    # no fit for this offset: use a p-value of 1
                    pvals[-1].append(1.0)
            ints.append(region.intensities[len(region.pvalues)])
            p = np.array(pvals[-1])
            pvals[-1] = p
            p[p ==
              0.0] = 0.000000000000000000000000000001  # TODO delete that? useful for log representation
            region.pvalues.append(p)
        logger.debug('Calculated pvalues for map ' + hicmap.get_name())
        # corr_big starts as a second array built from the intensities and is
        # overwritten below with the corrected p-values
        pvals, ints, corr_big = np.array(pvals), np.array(ints), np.array(ints)

        # only p-values at positions with a non-zero intensity are corrected
        corrected = fdrcorrection0(
            np.array(pvals)[ints.nonzero()], self.threshold)[1]
        logger.debug('Calculated qvalues for map ' + hicmap.get_name())
        corrected[
            corrected ==
            0.0] = 0.000000000000000000000000000001  # TODO delete that? useful for log representation
        corr_big[corr_big.nonzero()] = corrected
        # positions with zero intensity get a corrected p-value of 1
        corr_big[np.nonzero(corr_big == 0.0)] = 1.0
        # NOTE(review): reshape returns a new array and the result is
        # discarded, so this line does not change corr_big -- confirm intent.
        corr_big.reshape(pvals.shape)

        for r in range(len(regs)):
            # index of the intensity profile handled in this call
            x = len(regs[r].corrected_pvalues)
            regs[r].corrected_pvalues.append(corr_big[r])
            # Each intensity y is divided by an entry of hicmap.means chosen
            # from y's position in the profile; the result is 0.0 where that
            # mean is NaN.
            # NOTE(review): .index(y) returns the first occurrence, so equal
            # intensities all use the mean of the first one.
            regs[r].weighted.append([
                0.0 if np.isnan(
                    hicmap.means[-(int(len(regs[r].intensities[x]) / 2.0) -
                                   regs[r].intensities[x].index(y))]) else y /
                hicmap.means[-(int(len(regs[r].intensities[x]) / 2.0) -
                               regs[r].intensities[x].index(y))]
                for y in regs[r].intensities[x]
            ])
Пример #18
0
def tip_fdr(a, alpha=0.05):
    """
    Returns adjusted TIP p-values for a particular `alpha`.

    (see :func:`tip_zscores` for more info)

    :param a: NumPy array, where each row is the signal for a feature
    :param alpha: False discovery rate, forwarded to the FDR correction
    :return: the FDR-adjusted values, one per z-score

    """
    zscores = tip_zscores(a)
    # NOTE(review): norm.pdf is the density at the z-score, not a tail
    # probability (norm.sf); kept as is because changing it alters the
    # results -- confirm the intent.
    pvals = stats.norm.pdf(zscores)
    # `alpha` used to be ignored; the reject flags are not needed here.
    _, fdrs = fdrcorrection0(pvals, alpha=alpha)
    return fdrs
Пример #19
0
def tip_fdr(a, alpha=0.05):
    """
    Returns adjusted TIP p-values for a particular `alpha`.

    (see :func:`tip_zscores` for more info)

    :param a: NumPy array, where each row is the signal for a feature
    :param alpha: False discovery rate, forwarded to the FDR correction
    :return: the FDR-adjusted values, one per z-score

    """
    zscores = tip_zscores(a)
    # NOTE(review): this evaluates the normal density at each z-score rather
    # than a tail probability (norm.sf); left unchanged because a switch
    # would alter the results -- confirm the intent.
    pvals = stats.norm.pdf(zscores)
    # Pass `alpha` through instead of silently dropping it; only the
    # adjusted values of the result are returned.
    _, fdrs = fdrcorrection0(pvals, alpha=alpha)
    return fdrs
Пример #20
0
def plot_dists(original,
               bg,
               interactions=None,
               file_name="",
               p_factor=10,
               file_suffix=None,
               use_cache=1):
    """Plot interaction distributions in parallel and summarize their p-values.

    The interactions are split into chunks of at most 9 and each chunk is
    handed to ``plot_interaction_set`` in a worker pool. The workers report
    one empirical p-value per key through a shared dictionary. From these a
    summary table with the columns 'pval', 'qval' (FDR-corrected),
    'enrichment_score' (-log10 of the p-value), 'zscore' and 'rank' is
    built, written as a tab-separated file into ``constants.OUTPUT_FOLDER``
    and returned.

    :param original: table whose index provides the default interactions
    :param bg: background table; p-values of 0 are replaced by
        ``1 / bg.shape[1]``
    :param interactions: interactions to process (default: the sorted index
        of ``original``)
    :param file_name: passed to the workers and used in the output file name
    :param p_factor: number of worker processes
    :param file_suffix: passed to the workers and used in the output file name
    :param use_cache: unused; kept for interface compatibility
    :return: the summary DataFrame
    """
    chunk_size = 9

    if interactions is None:
        interactions = sorted(list(original.index))

    if len(interactions) > chunk_size:
        print("splitting interactions (total: {})".format(len(interactions)))
        # Stepping with range() works with Python 3 division and does not
        # create an empty trailing chunk when the total is a multiple of 9.
        interaction_sets = [
            interactions[start:start + chunk_size]
            for start in range(0, len(interactions), chunk_size)
        ]
    else:
        interaction_sets = [interactions]

    p = multiprocessing.Pool(p_factor)
    try:
        output = multiprocessing.Manager().dict()
        params = [[
            bg, file_name, i_sets, cur_set, original, file_suffix, output
        ] for i_sets, cur_set in enumerate(interaction_sets)]
        p.map(plot_interaction_set, params)
    finally:
        # release the workers even when one of them raised
        p.close()

    print("done!")
    df_emp_pvals = pd.DataFrame()
    for k, v in dict(output).items():
        df_emp_pvals.loc[k, "pval"] = v

    # A single .loc assignment avoids chained indexing, which may write to a
    # temporary copy. Zero p-values are replaced by 1 / bg.shape[1].
    df_emp_pvals.loc[df_emp_pvals['pval'] == 0, 'pval'] = 1.0 / bg.shape[1]
    df_emp_pvals['qval'] = fdrcorrection0(df_emp_pvals.loc[:, 'pval'])[1]
    df_emp_pvals['enrichment_score'] = -np.log10(df_emp_pvals.loc[:, 'pval'])
    df_emp_pvals['zscore'] = zscore(df_emp_pvals.loc[:, 'enrichment_score'])
    df_emp_pvals['rank'] = rankdata(df_emp_pvals.loc[:, 'pval'])
    df_emp_pvals.to_csv(os.path.join(
        constants.OUTPUT_FOLDER,
        "emp_pvals_summary_{}_{}.tsv".format(file_suffix, file_name)),
                        sep='\t')

    return df_emp_pvals
Пример #21
0
def calc_ttest(dataset=constants.DATASET_NAME,
               gene_expression_file_name="ge.tsv"):
    """Run a per-gene t-test between class 1 and class 2 and cache the result.

    For every row of the expression matrix an independent two-sample t-test
    compares the columns labelled 1 with those labelled 2. Rows with a NaN
    p-value are reported and dropped. The remaining p-values are
    FDR-corrected (Benjamini-Hochberg) and written, together with the
    expression values and ``-log10(qval)``, to ``deg_t.tsv`` in
    ``constants.CACHE_DIR``.

    :param dataset: unused; kept for interface compatibility
    :param gene_expression_file_name: name of the expression profile to load
    :return: ``{"result": DataFrame}`` read back from the written file
    """
    h_rows, h_cols, ge_dataset = infra.separate_headers(
        infra.load_gene_expression_profile_by_genes(
            gene_expression_file_name=gene_expression_file_name))
    # np.int was only an alias of the builtin int and is gone from NumPy
    classes = np.array(infra.load_classes()).astype(int)
    pvals = []
    rows_to_delete = []

    for i, cur in enumerate(list(h_rows)):
        case_values = ge_dataset[i, classes == 1]
        wt_values = ge_dataset[i, classes == 2]
        pval = ttest_ind(case_values, wt_values).pvalue
        if np.isnan(pval):
            print("case: {}, wt: {}".format(case_values, wt_values))
            rows_to_delete.append(i)
        else:
            pvals.append(pval)

    # drop the rows without a valid p-value
    ind = np.ones((len(h_rows), ), bool)
    ind[rows_to_delete] = False
    h_rows = h_rows[ind]
    ge_dataset = ge_dataset[ind, :]

    qvals = fdrcorrection0(pvals, alpha=0.05, method='indep',
                           is_sorted=False)[1]
    qscores = [-log10(qvals[i]) for i, _ in enumerate(h_rows)]

    output_h_cols = ["id"] + list(h_cols) + ["pval", "qval", "qscore"]

    output_matrix = np.c_[h_rows, ge_dataset, pvals, qvals, qscores]
    output_matrix = np.r_[np.reshape(output_h_cols, (1, len(output_h_cols))),
                          output_matrix]

    lines = ["\t".join(cur) for cur in output_matrix]

    output_path = os.path.join(constants.CACHE_DIR, "deg_t.tsv")
    # The context manager flushes and closes the file before it is read
    # back; the Python-2-only file() builtin left that to the garbage
    # collector.
    with open(output_path, "w+") as out:
        out.write("\n".join(lines))
    return {"result": pd.read_csv(output_path, sep="\t", index_col=0)}
Пример #22
0
def tabulate_week_to_control_stats(df, sample_type, metric, test_fn=scipy.stats.mannwhitneyu):
    """Tabulate, per week, a test of the autism group against the controls.

    For every distinct week the non-missing ``metric`` values of the
    'autism' samples of ``sample_type`` are compared with the control values
    returned by ``control_metric``.

    ``test_fn`` takes the two samples and returns ``(statistic, p-value)``;
    an alternative is ``partial(scipy.stats.ttest_ind, equal_var=False)``.

    Returns a DataFrame indexed by week with columns 'n', ``metric`` (the
    weekly median), 'test-statistic', 'p-value' and 'q-value'. Non-finite
    p-values get a NaN q-value and do not take part in the correction.
    """
    control_week0 = control_metric(df, sample_type, metric=metric)
    asd_data = filter_sample_md(df, [('SampleType', sample_type), ('Group', 'autism')])
    results = []
    asd_data['week'] = pd.to_numeric(asd_data['week'], errors='coerce')
    asd_data = asd_data.sort_values(by='week')
    weeks = asd_data['week'].unique()
    for i in weeks:
        weeki = asd_data[metric][asd_data['week'] == i].dropna()
        t, p = test_fn(weeki, control_week0)
        results.append((len(weeki), np.median(weeki), t, p))
    result = pd.DataFrame(results, index=pd.Index(weeks, name='week'),
                          columns=['n', metric, 'test-statistic', 'p-value'])

    # Mask non-finite p-values before the correction, as the paired variant
    # of this function does; see
    # https://github.com/statsmodels/statsmodels/issues/2899
    pvals = result['p-value'].values
    mask = np.isfinite(pvals)
    qvals = np.full(len(pvals), np.nan)
    if mask.any():
        qvals[mask] = fdrcorrection0(pvals[mask])[1]

    result['q-value'] = qvals
    return result
Пример #23
0
def calc_pval_dist(mat, aneu_types, chr_interaction, pval_method_name, label):
    """Compute and save p-values for pairs of chromosome arms.

    Chromosomes 1-22 with arms 'p' and 'q' are combined with every pair in
    ``aneu_types``. With ``chr_interaction`` set, only arms of different
    chromosomes are paired; otherwise only the two arms of one chromosome
    (first arm 'q', second arm 'p', given the ``arm_a <= arm_b`` exclusion).
    Pairs whose arm is not a column of ``mat`` are skipped.

    Each p-value comes from ``PVAL_METHODS[pval_method_name]``; the table
    also holds the FDR-corrected 'qval', the 'enrichment_score' (-log10 of
    the p-value), its 'zscore' and the 'rank' of the p-value. The table is
    sorted by p-value and written as a tab-separated file into
    ``constants.OUTPUT_FOLDER``.

    :param mat: table with one column per chromosome arm (e.g. '1p')
    :param aneu_types: iterable of ``(aneu_type_a, aneu_type_b)`` pairs
    :param chr_interaction: selects inter- vs intra-chromosome pairs
    :param pval_method_name: key into ``PVAL_METHODS``
    :param label: suffix of the output file name
    :return: the summary DataFrame, indexed by the pair name
    """
    # Parallel lists replace the former list of single-entry dicts, whose
    # `.values()[0]` / `.keys()[0]` access only works on Python 2.
    names = []
    pvals = []
    for a in np.arange(1, 23):
        for arm_a in ['p', 'q']:
            for b in np.arange(a, 23):
                for arm_b in ['p', 'q']:
                    for aneu_type_a, aneu_type_b in aneu_types:
                        same_chr = b == a
                        if (arm_a <= arm_b and same_chr) or (
                                chr_interaction
                                and same_chr) or (not chr_interaction
                                                  and not same_chr):
                            continue
                        if "{}{}".format(a, arm_a) not in mat.columns:
                            continue
                        if "{}{}".format(b, arm_b) not in mat.columns:
                            continue

                        pval = PVAL_METHODS[pval_method_name](mat, a, arm_a,
                                                              aneu_type_a, b,
                                                              arm_b,
                                                              aneu_type_b)
                        names.append("{}{}{}-{}{}{}".format(
                            a, arm_a, aneu_type_a, b, arm_b, aneu_type_b))
                        pvals.append(pval)

    qvals = fdrcorrection0(pvals)[1]

    df = pd.DataFrame(data=[[pval, qval] for pval, qval in zip(pvals, qvals)],
                      columns=['pval', 'qval'],
                      index=names)
    df["enrichment_score"] = -np.log10(df.loc[:, "pval"])
    df["zscore"] = zscore(df.loc[:, "enrichment_score"])
    df["rank"] = rankdata(df.loc[:, "pval"])
    df = df.sort_values(by=["pval"])
    df.to_csv(os.path.join(
        constants.OUTPUT_FOLDER, "hgs_emp_dist_{}_{}_{}.tsv".format(
            constants.CHR_INTERACTION_NAMES[chr_interaction], pval_method_name,
            label)),
              sep='\t')

    return df
Пример #24
0
def main():
    """Print mean, uncorrected and FDR-corrected p-values per column.

    The formatted results are tested column-wise against 0.5 with a
    one-sample t-test, and the p-values are FDR-corrected.
    """
    res_path = '/home/dsoto/public/Fantasmas_MRI_analysis/images_forAnalysis/Functional/results-univar-FS'
    formatted = format_results(res_path)

    # one-sample t-test of the values in every column against 0.5
    p_values = ttest_1samp(formatted, 0.5)[1]
    # only the corrected p-values are reported, not the reject flags
    _, corr_p_values = fdrcorrection0(p_values)
    summary = pd.DataFrame(
        {
            "mean": formatted.mean(),
            'corrected': corr_p_values,
            'uncorrected': p_values
        },
        index=formatted.columns).T
    # a single-argument print call behaves the same on Python 2 and 3
    print(summary)
Пример #25
0
def ttest_fdr_corrected(X, y, alpha=0.05, return_as_df=False):
    """ FDR corrected ttest pvalue (created 11/20/2015)

    http://statsmodels.sourceforge.net/devel/generated/
    statsmodels.sandbox.stats.multicomp.fdrcorrection0.html#statsmodels.sandbox.stats.multicomp.fdrcorrection0

    Updates
    -------
    - 11/20/2015: created function
    - 01/25/2016: added option ``return_as_df``

    Parameters
    ----------
    X : ndarray of shape [n,p]
        Data matrix (samples as row vectors)
    y : ndarray of shape [n,]
        Label vector
    alpha : float (default=0.05)
        FDR level used to decide which hypotheses are rejected
    return_as_df : bool (default=False)
        Return result as a 3-column pandas DataFrame
        (columns 'tstats', 'pval', 'rejected')

    Returns
    -------
    tstats : ndarray
        Vector of tstats
    pval_corr : ndarray
        pvalues adjusted for multiple hypothesis testing to limit FDR
    idx_rejected : ndarray of int
        1 if a hypothesis is rejected, 0 if not

    NOTE(review): the output vectors presumably hold one entry per feature
    (length p) -- confirm against ``ttest_twosample_fixnan``.
    """
    tstats, pval = ttest_twosample_fixnan(X, y)

    from statsmodels.sandbox.stats.multicomp import fdrcorrection0
    # honour the caller's alpha (it used to be hard-coded to 0.05)
    idx_rejected, pval_corr = fdrcorrection0(pval, alpha=alpha)
    idx_rejected = idx_rejected.astype(int)

    if return_as_df:
        df = pd.DataFrame([tstats, pval_corr, idx_rejected],
                          index=['tstats', 'pval', 'rejected']).T
        return df
    else:
        return tstats, pval_corr, idx_rejected
Пример #26
0
def doHyperG(genelist, allgenes, allterms, assocname):
    """Print a hypergeometric enrichment table for the terms hit by ``genelist``.

    For every term with at least one gene in ``genelist`` the probability of
    observing that many or more genes by chance is computed, and the
    p-values are FDR-corrected. The table is printed tab-separated, one row
    per term.

    :param genelist: genes of interest
    :param allgenes: dict keyed by every gene that has a term (background)
    :param allterms: dict mapping a term to its genes
    :param assocname: dict mapping a term to a display name; terms without
        an entry are printed as is
    """
    genelist_set = set(genelist)

    M = len(allgenes.keys())  # background size
    N = len(set(allgenes.keys()).intersection(genelist_set))  # draws

    pvalues = []
    termsingenelist = []
    termsinbackground = []
    termname = []
    termhits = []

    for t in allterms.keys():
        n = len(allterms[t])
        hits = set(allterms[t]).intersection(genelist_set)
        x = len(hits)
        if x == 0:
            continue

        # P(X >= x): sf(x - 1) includes the observed count itself, whereas
        # 1 - cdf(x) is P(X > x); sf is also more accurate for small tails.
        pvalues.append(hypergeom.sf(x - 1, M, n, N))

        termsingenelist.append(x)
        termsinbackground.append(n)
        termname.append(t)
        termhits.append(hits)

    adjpvalue = list(fdrcorrection0(pvalues)[1])

    print("\t".join(["Term annotation", "pvalue", "fdr adj pvalue","Background","Expected","GeneList","Observed","Genes"]))
    for u, term in enumerate(termname):
        gotermname = assocname.get(term, term)
        print("\t".join([gotermname,
                         str(pvalues[u]),
                         str(adjpvalue[u]),
                         str(M),
                         str(termsinbackground[u]),
                         str(N),
                         str(termsingenelist[u]),
                         ",".join(list(termhits[u]))]
                        )
              )
Пример #27
0
def tabulate_week_to_control_stats(df,
                                   sample_type,
                                   metric,
                                   test_fn=scipy.stats.mannwhitneyu):
    """Test the autism group's ``metric`` against the control, week by week.

    Args:
        df: sample metadata table handed to ``control_metric`` and
            ``filter_sample_md``.
        sample_type: value matched against the ``SampleType`` column.
        metric: name of the column holding the values under test.
        test_fn: two-sample test called as ``test_fn(week_values, control)``
            and expected to return ``(statistic, p_value)``. Another option
            is ``partial(scipy.stats.ttest_ind, equal_var=False)``.

    Returns:
        DataFrame indexed by ``week`` with the columns ``n``, ``<metric>``
        (median of the week's values), ``test-statistic``, ``p-value`` and
        ``q-value`` (second output of ``fdrcorrection0`` on the p-values).
    """
    # Presumably the control group's week-0 values -- confirm in control_metric.
    baseline = control_metric(df, sample_type, metric=metric)

    selection = [('SampleType', sample_type), ('Group', 'autism')]
    asd_data = filter_sample_md(df, selection)
    # Non-numeric week labels become NaN rather than raising.
    asd_data['week'] = pd.to_numeric(asd_data['week'], errors='coerce')
    asd_data = asd_data.sort_values(by='week')
    weeks = asd_data['week'].unique()

    def summarize(week):
        # One output row: sample count, median, test statistic, p-value.
        values = asd_data[metric][asd_data['week'] == week].dropna()
        statistic, p_value = test_fn(values, baseline)
        return len(values), np.median(values), statistic, p_value

    table = pd.DataFrame([summarize(week) for week in weeks],
                         index=pd.Index(weeks, name='week'),
                         columns=['n', metric, 'test-statistic', 'p-value'])
    table['q-value'] = fdrcorrection0(table['p-value'])[1]
    return table
Пример #28
0
def RFE(tested_gene_list_file_name,
        expression_profile_file_name,
        phenotype_file_name,
        rank_method=LOGISTIC_REGRESSION,
        gene_filter_file_name="protein_coding.txt",
        rounds=2,
        recursion_step_size=2,
        start_index=0,
        recursion_number_of_steps=20,
        pval_preprocessing_file_name=None,
        permutation=NORMAL,
        groups=None,
        classification_method="svm_rbf_default",
        tuning_parameters=None):
    """Rank genes by t-test p-value and score classifiers on nested gene sets.

    Genes are ranked by a two-sample t-test between the first two groups
    returned by ``load_svm_data``; only genes that pass FDR correction
    (alpha=0.05) are kept. ``filter_gene_expressions_preprocessed`` then
    builds ``recursion_number_of_steps`` datasets from that ranking, and each
    one is trained and evaluated ``rounds`` times on reshuffled patients.

    Args:
        tested_gene_list_file_name: gene list file, passed to ``load_svm_data``.
        expression_profile_file_name: expression file, passed to
            ``load_svm_data``.
        phenotype_file_name: phenotype file, passed to ``load_svm_data``.
        rank_method: forwarded to ``apply_svm`` and ``print_fre_results``.
        gene_filter_file_name: forwarded to ``load_svm_data``.
        rounds: number of shuffle/train/test repetitions.
        recursion_step_size: forwarded to
            ``filter_gene_expressions_preprocessed``.
        start_index: forwarded to ``filter_gene_expressions_preprocessed``.
        recursion_number_of_steps: number of nested datasets evaluated.
        pval_preprocessing_file_name: file name inside ``CACHE_DIR`` used to
            cache the sorted (gene, p-value) pairs. ``None`` disables caching.
        permutation: ``RANDOMIZED`` shuffles the ranked genes, ``REVERSED``
            reverses them, anything else keeps the p-value order.
        groups: forwarded to ``load_svm_data``.
        classification_method: name of a module-level factory that is called
            with ``tuning_parameters`` to build the classifier.
        tuning_parameters: parameters for that factory; defaults to
            ``{'C': [10], 'kernel': ['rbf']}``.

    Returns:
        Tuple ``(test_auPR_scores, test_auROC_scores)``; each is a list with
        one entry per step holding one score per round.
    """
    # A fresh dict per call avoids sharing one mutable default between calls.
    if tuning_parameters is None:
        tuning_parameters = {'C': [10], 'kernel': ['rbf']}

    thismodule = sys.modules[__name__]
    clf = getattr(thismodule, classification_method)(tuning_parameters)
    print("about ot analyse: {}".format(tested_gene_list_file_name))
    # fetch gene expression by gene_id, divided by tumor type
    data, labels, groups, gene_ids = load_svm_data(
        tested_gene_list_file_name,
        expression_profile_file_name,
        phenotype_file_name,
        gene_filter_file_name=gene_filter_file_name,
        groups=groups)

    # One cache path for both loading and saving. Joining CACHE_DIR twice, as
    # before, only matched the load path when CACHE_DIR was absolute, and the
    # default file name of None made os.path.join raise.
    cache_path = None
    if pval_preprocessing_file_name is not None:
        cache_path = os.path.join(CACHE_DIR, pval_preprocessing_file_name)

    if cache_path is not None and os.path.isfile(cache_path) and USE_CACHE:
        gene_pval_pair = load_sets(cache_path)
        print("pval loaded from file")
    else:
        # test pval for significance differentiation between label values
        # (primary vs metastatic)
        group_0_expression = groups[0]
        group_1_expression = groups[1]
        pvals = []
        gene_symbols = []
        # Row 0 is skipped (presumably a header row -- confirm with the
        # loader); column 0 of every row holds the gene symbol.
        for i in range(1, len(group_0_expression)):
            cur_pval = scipy.stats.ttest_ind(
                [float(c) for c in group_0_expression[i][1:]],
                [float(c) for c in group_1_expression[i][1:]])[1]
            if not math.isnan(cur_pval):
                pvals.append(cur_pval)
                gene_symbols.append(group_0_expression[i][0])

        # sort gene_id-pval pairs by pval; sorted() also works where zip()
        # returns an iterator rather than a list
        gene_pval_pair = sorted(zip(gene_symbols, pvals),
                                key=lambda pair: pair[1])
        if cache_path is not None:
            save_sets(gene_pval_pair, cache_path)
            print("pval saved to file")

    # count the hypotheses rejected after FDR correction
    pvals = [cur[1] for cur in gene_pval_pair]
    fdr_results = fdrcorrection0(pvals,
                                 alpha=0.05,
                                 method='indep',
                                 is_sorted=True)
    true_counter = int(np.count_nonzero(fdr_results[0]))
    print("true hypothesis: {}/{}".format(true_counter,
                                          np.size(fdr_results[0])))

    # Keep only the significant genes, in ascending p-value order.
    gene_ids_ranked = [cur[0] for cur in gene_pval_pair][:true_counter]
    if permutation == RANDOMIZED:
        random.shuffle(gene_ids_ranked)
    elif permutation == REVERSED:
        gene_ids_ranked = list(reversed(gene_ids_ranked))

    test_auPR_scores = [[] for _ in range(recursion_number_of_steps)]
    test_auROC_scores = [[] for _ in range(recursion_number_of_steps)]

    genelist_datasets = filter_gene_expressions_preprocessed(
        data, gene_ids_ranked, recursion_number_of_steps, recursion_step_size,
        start_index, gene_ids)
    for _ in range(rounds):
        # Rotate, shuffle patients, then rotate back to the original layout.
        genelist_datasets = np.rot90(genelist_datasets, k=1, axes=(1, 0))
        genelist_datasets, labels = randonize_patients(genelist_datasets,
                                                       labels)
        genelist_datasets = np.rot90(genelist_datasets, k=-1, axes=(1, 0))
        for j in range(recursion_number_of_steps):
            cur_dataset = genelist_datasets[j]
            data_train, data_test, labels_train, labels_test = divide_train_and_test_groups(
                cur_dataset, labels)
            test_auPR, test_auROC = apply_svm(clf, data_train, labels_train,
                                              data_test, labels_test,
                                              rank_method)
            test_auPR_scores[j].append(test_auPR)
            test_auROC_scores[j].append(test_auROC)

    print("#######################################")
    print("AUPR results:")
    print_fre_results(test_auPR_scores, float(rounds),
                      tested_gene_list_file_name, rank_method, permutation)
    print("AUROC results:")
    print_fre_results(test_auROC_scores, float(rounds),
                      tested_gene_list_file_name, rank_method, permutation)
    return (test_auPR_scores, test_auROC_scores)
Пример #29
0
from scipy.stats import norm, ttest_1samp
from statsmodels.sandbox.stats.multicomp import fdrcorrection0

models = ['logBSC_H200', 'logMFS']
mask = 'temporal_lobe_mask_grp_7T_test.nii.gz'
threshold = 0.001
# Level used both for the FDR correction and for masking the mean map.
fdr_alpha = 0.05

# For every model and principal component: Fisher z-transform the subject
# maps, average them, zero the voxels that do not survive FDR correction of a
# one-sample t-test against 0, then plot and save the thresholded mean map.
for model in models:
    for pc in range(1, 4):
        subject_maps = glob.glob(
            'MaThe/avg_maps/model_{}_p_adj_pc_{}_subj_*'.format(model, pc))
        scores = np.arctanh(apply_mask(subject_maps, mask_img=mask))
        mean_scores = scores.mean(axis=0)
        t_values, p_values = ttest_1samp(scores, 0, axis=0)
        # fdrcorrection0 returns (reject, corrected p-values). Comparing the
        # tuple itself with a float never yields a per-voxel mask, so take
        # the corrected p-values explicitly.
        _, corr_p_values = fdrcorrection0(p_values, alpha=fdr_alpha)
        mean_scores[corr_p_values >= fdr_alpha] = 0
        display = fsf.plot_avg(mean_scores, threshold, vmax=0.27)
        display.savefig('mean_scores_model_{}_pc_{}.svg'.format(model, pc))
        display.savefig('mean_scores_model_{}_pc_{}.png'.format(model, pc))
        fsf.save_map_avg('avg', mean_scores, threshold=threshold, model=model+'_pc_'+str(pc))
Пример #30
0
def run_stats(y_design, coord_mat, design_data, var_type):
    """Smooth the responses and test every covariate for a non-zero effect.

    Args:
        y_design: 3-D response array; its shape is unpacked as ``(n, l, m)``.
        coord_mat: coordinate matrix handed to ``lpks``, ``sif`` and
            ``bstrp_pvalue``.
        design_data: raw design data handed to ``read_x``.
        var_type: covariate types handed to ``read_x``.

    Returns:
        Tuple ``(gpvals, lpvals_fdr, clu_pvals, efit_beta, efity_design,
        efit_eta)``. ``gpvals`` and ``clu_pvals`` have shape ``(1, p - 1)``
        and ``lpvals_fdr`` has shape ``(l, p - 1)``, where ``p`` is the number
        of columns of the design matrix built by ``read_x``.
    """
    _, l, m = y_design.shape

    print("+++++++Construct the design matrix: normalization+++++++")
    x_design = read_x(design_data, var_type)
    p = x_design.shape[1]
    print("The dimension of design matrix is ", str(x_design.shape))

    # +++++++++++++++++++++++++++++++++++
    # Step 2. Statistical analysis: including (1) smoothing and
    # (2) hypothesis testing

    print("+++++++Local linear kernel smoothing+++++++")
    start = timeit.default_timer()
    efit_beta, efity_design, h_opt = lpks(coord_mat, x_design, y_design)
    stop = timeit.default_timer()
    delta_time = str(stop - start)
    # print(h_opt)
    print("Elapsed time is " + delta_time)

    print(
        "+++++++Kernel smoothing (order = 1) for smooth functions (eta)+++++++"
    )
    start = timeit.default_timer()
    resy_design = y_design - efity_design
    print(np.amax(resy_design))
    print(np.amin(resy_design))
    efit_eta, res_eta, esig_eta = sif(coord_mat, resy_design, h_opt)
    print(np.amax(res_eta))
    print(np.amin(res_eta))
    stop = timeit.default_timer()
    delta_time = str(stop - start)
    print("Elapsed time is " + delta_time)

    print("+++++++Hypothesis testing+++++++")
    # hypothesis: beta_pj(d)=0 v.s. beta_pj(d)~=0 for all j and d
    start = timeit.default_timer()
    lpvals = np.zeros((l, p - 1))
    lpvals_fdr = np.zeros((l, p - 1))
    gpvals = np.zeros((1, p - 1))
    clu_pvals = np.zeros((1, p - 1))
    num_bstrp = 500  # number of bootstrap samples
    thres = 2  # cut-off on -log10(local p-value), i.e. p <= 0.01

    # Column 0 of the design matrix is never tested, so covariate ``pp`` is
    # column ``pp + 1``.
    for pp in range(p - 1):
        print("Testing whether the covariate " + str(pp + 1) +
              " is zero or not...")
        # local and global statistics calculation
        cdesign = np.zeros((1, p))
        cdesign[0, pp + 1] = 1
        gstat, lstat = wald_ht(x_design, efit_beta, esig_eta, cdesign)
        # The survival function keeps precision for large statistics, where
        # ``1 - cdf`` rounds to exactly 0 and makes -log10 infinite.
        lpvals[:, pp] = np.squeeze(stats.chi2.sf(lstat, m))
        lpvals_fdr[:, pp] = fdrcorrection0(lpvals[:, pp])[1]
        ind_thres = -np.log10(lpvals[:, pp]) >= thres
        area = np.sum(ind_thres)
        # Generate random samples and calculate the corresponding statistics
        # and p-values
        gpval, clu_pval = bstrp_pvalue(coord_mat, x_design, y_design, cdesign,
                                       gstat, num_bstrp, thres, area)

        gpvals[0, pp] = gpval
        clu_pvals[0, pp] = clu_pval
        print("the global p-value for covariate " + str(pp + 1) + " is " +
              str(gpvals[0, pp]) + "...")
        print("the p-value of most significant subregion for covariate " +
              str(pp + 1) + " is " + str(clu_pvals[0, pp]) + "...")

    stop = timeit.default_timer()
    delta_time = str(stop - start)
    print("Elapsed time is " + delta_time)

    return gpvals, lpvals_fdr, clu_pvals, efit_beta, efity_design, efit_eta
Пример #31
0
def run_script(input_dir, output_dir):
    """
    Run the commandline script for MFSDA.

    Reads ``aligned_shapes.mat``, ``template.mat``, ``design_data.txt`` and
    ``var_type.txt`` from ``input_dir``, runs the smoothing and hypothesis
    tests, and writes ``global_pvalue.txt``, ``local_pvalue_fdr.txt`` and
    ``cluster_pvalue.txt`` to ``output_dir``.

    Args:
        input_dir (str): full path to the data folder
        output_dir (str): full path to the output folder
    """
    # Local import keeps this block self-contained. os.path.join also accepts
    # folder paths given without a trailing separator, which plain string
    # concatenation silently turned into wrong file names.
    import os

    # +++++++++++++++++++++++++++++++++++
    # Step 1. load dataset
    print("loading data ......")
    print("+++++++Read the surface shape data+++++++")
    shape_file_name = os.path.join(input_dir, "aligned_shapes.mat")
    mat = loadmat(shape_file_name)
    y_design = mat['aligned_shape']
    _, l, m = y_design.shape
    print("The dimension of shape matrix is " + str(y_design.shape))
    print("+++++++Read the sphere coordinate data+++++++")
    template_file_name = os.path.join(input_dir, "template.mat")
    mat = loadmat(template_file_name)
    coord_mat = mat['template']
    # d = coord_mat.shape[1]
    print("+++++++Read the design matrix+++++++")
    design_data_file_name = os.path.join(input_dir, "design_data.txt")
    design_data = np.loadtxt(design_data_file_name)
    # read the covariate type
    var_type_file_name = os.path.join(input_dir, "var_type.txt")
    var_type = np.loadtxt(var_type_file_name)
    print("+++++++Construct the design matrix: normalization+++++++")
    x_design = read_x(design_data, var_type)
    p = x_design.shape[1]
    print("The dimension of design matrix is ", str(x_design.shape))

    # +++++++++++++++++++++++++++++++++++
    # Step 2. Statistical analysis: including (1) smoothing and
    # (2) hypothesis testing

    print("+++++++Local linear kernel smoothing+++++++")
    start = timeit.default_timer()
    efit_beta, efity_design, h_opt = lpks(coord_mat, x_design, y_design)
    stop = timeit.default_timer()
    delta_time = str(stop - start)
    # print(h_opt)
    print("Elapsed time is " + delta_time)

    print("+++++++Kernel smoothing (order = 1) for smooth functions (eta)+++++++")
    start = timeit.default_timer()
    resy_design = y_design - efity_design
    print(np.amax(resy_design))
    print(np.amin(resy_design))
    efit_eta, res_eta, esig_eta = sif(coord_mat, resy_design, h_opt)
    print(np.amax(res_eta))
    print(np.amin(res_eta))
    stop = timeit.default_timer()
    delta_time = str(stop - start)
    print("Elapsed time is " + delta_time)

    print("+++++++Hypothesis testing+++++++")
    # hypothesis: beta_pj(d)=0 v.s. beta_pj(d)~=0 for all j and d
    start = timeit.default_timer()
    lpvals = np.zeros((l, p-1))
    lpvals_fdr = np.zeros((l, p-1))
    gpvals = np.zeros((1, p-1))
    clu_pvals = np.zeros((1, p-1))
    num_bstrp = 500  # number of bootstrap samples
    thres = 2  # cut-off on -log10(local p-value), i.e. p <= 0.01

    # Column 0 of the design matrix is never tested, so covariate ``pp`` is
    # column ``pp + 1``.
    for pp in range(p-1):
        print("Testing whether the covariate " + str(pp+1) + " is zero or not...")
        # local and global statistics calculation
        cdesign = np.zeros((1, p))
        cdesign[0, pp+1] = 1
        gstat, lstat = wald_ht(x_design, efit_beta, esig_eta, cdesign)
        # The survival function keeps precision for large statistics, where
        # ``1 - cdf`` rounds to exactly 0 and makes -log10 infinite.
        lpvals[:, pp] = np.squeeze(stats.chi2.sf(lstat, m))
        lpvals_fdr[:, pp] = fdrcorrection0(lpvals[:, pp])[1]
        ind_thres = -np.log10(lpvals[:, pp]) >= thres
        area = np.sum(ind_thres)

        # Generate random samples and calculate the corresponding statistics
        # and p-values
        gpval, clu_pval = bstrp_pvalue(coord_mat, x_design, y_design, cdesign, gstat, num_bstrp, thres, area)

        gpvals[0, pp] = gpval
        clu_pvals[0, pp] = clu_pval
        print("the global p-value for covariate " + str(pp+1) + " is " + str(gpvals[0, pp]) + "...")
        print("the p-value of most significant subregion for covariate " +
              str(pp+1) + " is " + str(clu_pvals[0, pp]) + "...")

    stop = timeit.default_timer()
    delta_time = str(stop - start)
    print("Elapsed time is " + delta_time)

    # +++++++++++++++++++++++++++++++++++
    # Step 3. Save all the results
    gpvals_file_name = os.path.join(output_dir, "global_pvalue.txt")
    np.savetxt(gpvals_file_name, gpvals)
    lpvals_fdr_file_name = os.path.join(output_dir, "local_pvalue_fdr.txt")
    np.savetxt(lpvals_fdr_file_name, lpvals_fdr)
    clu_pvals_file_name = os.path.join(output_dir, "cluster_pvalue.txt")
    np.savetxt(clu_pvals_file_name, clu_pvals)
Пример #32
0
def deg(tested_gene_file_name,
        total_gene_file_name,
        gene_expression_file_name,
        phenotype_file_name,
        gene_filter_file_name=None,
        tested_gene_list_path=None,
        total_gene_list_path=None,
        gene_expression_path=None,
        phenotype_path=None,
        gene_filter_path=None,
        groups=None,
        groups_name=None):
    """Run a per-gene differential-expression t-test between two groups.

    For every gene the two groups are compared with ``scipy.stats.ttest_ind``
    and the p-values are FDR-corrected (alpha=0.05). Results are written to a
    timestamped text report and to ``deg_<CANCER_TYPE>.tsv`` under
    ``<OUTPUT_GLOBAL_DIR>/deg``.

    Args:
        tested_gene_file_name: only used in the progress message.
        total_gene_file_name: gene list forwarded to the expression loader.
        gene_expression_file_name: forwarded to the expression loader.
        phenotype_file_name: forwarded to the expression loader.
        gene_filter_file_name: forwarded to the expression loader.
        tested_gene_list_path: not used by this function.
        total_gene_list_path: forwarded to the loader as ``tested_gene_path``.
        gene_expression_path: forwarded to the expression loader.
        phenotype_path: forwarded to the expression loader.
        gene_filter_path: forwarded to the expression loader.
        groups: forwarded to the expression loader.
        groups_name: label embedded in the text report's file name.

    Returns:
        DataFrame indexed by gene ``id`` with the columns ``direction``,
        ``foldchange``, ``mean_differences``, ``pval`` and ``qval``, ordered
        by ascending p-value.
    """
    print("about ot analyse: {}".format(tested_gene_file_name))
    # fetch gene expression by gene_id, divided by tumor type
    groups_results = load_expression_profile_by_labelling(
        gene_list_file_name=total_gene_file_name,
        gene_expression_file_name=gene_expression_file_name,
        phenotype_file_name=phenotype_file_name,
        gene_filter_file_name=gene_filter_file_name,
        tested_gene_path=total_gene_list_path,
        gene_expression_path=gene_expression_path,
        phenotype_path=phenotype_path,
        gene_filter_path=gene_filter_path,
        groups=groups)

    print("total # of groups; {}".format(len(groups_results)))
    group_0_expression = np.array(groups_results[0]).T
    group_1_expression = np.array(groups_results[1]).T
    print("# patient in groups 1: {}. # of patients in groups #2: {}".format(
        group_0_expression.shape[1], group_1_expression.shape[1]))

    pvals = []
    # Row 0 is skipped (presumably a header row -- confirm with the loader);
    # column 0 of every row holds the gene id.
    for i in range(1, len(group_0_expression)):
        # Convert each row once instead of once per statistic.
        values_0 = [float(c) for c in group_0_expression[i][1:]]
        values_1 = [float(c) for c in group_1_expression[i][1:]]
        mean_0 = np.average(values_0)
        mean_1 = np.average(values_1)

        mean_differences = mean_0 - mean_1
        # Both means are floored at 1 before taking the ratio.
        mean_foldchange = max(mean_0, 1) / max(mean_1, 1)

        cur_pval = scipy.stats.ttest_ind(values_0, values_1)[1]
        if math.isnan(cur_pval):
            continue

        # direction stays None when both means are exactly equal
        direction = None
        if mean_differences > 0:
            direction = "downregulated"
        elif mean_differences < 0:
            direction = "upregulated"
        pvals.append((group_0_expression[i][0], direction,
                      mean_differences, cur_pval, mean_foldchange))

    pvals.sort(key=lambda x: x[3])
    fdr_results = fdrcorrection0([x[3] for x in pvals],
                                 alpha=0.05,
                                 method='indep',
                                 is_sorted=False)
    # Insert the q-value right after the raw p-value.
    pvals = [(gene_id, direction, diff, pval, qval, foldchange)
             for (gene_id, direction, diff, pval, foldchange), qval
             in zip(pvals, fdr_results[1])]
    true_counter = int(np.count_nonzero(fdr_results[0]))
    print("true hypothesis: {}/{}".format(true_counter,
                                          np.size(fdr_results[0])))

    # Build the table in one step; appending one row at a time copies the
    # whole frame on every iteration.
    df_deg = pd.DataFrame(pvals,
                          columns=["id", "direction", "mean_differences",
                                   "pval", "qval", "foldchange"])
    # Columns are kept in alphabetical order so the TSV layout stays stable.
    df_deg = df_deg.set_index("id")[[
        "direction", "foldchange", "mean_differences", "pval", "qval"
    ]]

    deg_dir = os.path.join(constants.OUTPUT_GLOBAL_DIR, "deg")
    report_path = os.path.join(
        deg_dir, "deg_{}_{}_{}.txt".format(constants.CANCER_TYPE, groups_name,
                                           time.time()))
    # Format only the file name, so braces in the output directory cannot
    # break the path.
    table_path = os.path.join(deg_dir,
                              "deg_{}.tsv".format(constants.CANCER_TYPE))

    with open(report_path, "w+") as f:
        df_deg.to_csv(table_path, sep='\t')
        f.write("".join("{}\t{}\t{}\t{}\t{}\t{}\n".format(*row)
                        for row in pvals))
    print("pval saved to file")
    return df_deg
def deg(tested_gene_file_name,
        total_gene_file_name,
        gene_expression_file_name,
        phenotype_file_name,
        gene_filter_file_name=None,
        tested_gene_list_path=None,
        total_gene_list_path=None,
        gene_expression_path=None,
        phenotype_path=None,
        gene_filter_path=None,
        groups=None,
        groups_name=None):
    """Run a per-gene differential-expression t-test and write a text report.

    For every gene the two groups are compared with ``scipy.stats.ttest_ind``
    and the p-values are FDR-corrected (alpha=0.05). One tab-separated line
    per gene (id, direction, mean difference, p-value, q-value, fold change)
    is written to a timestamped file under ``constants.OUTPUT_DIR``, grouped
    by direction and ordered by p-value inside each group.

    Args:
        tested_gene_file_name: only used in the progress message.
        total_gene_file_name: gene list forwarded to the expression loader.
        gene_expression_file_name: forwarded to the expression loader.
        phenotype_file_name: forwarded to the expression loader.
        gene_filter_file_name: forwarded to the expression loader.
        tested_gene_list_path: not used by this function.
        total_gene_list_path: forwarded to the loader as ``tested_gene_path``.
        gene_expression_path: forwarded to the expression loader.
        phenotype_path: forwarded to the expression loader.
        gene_filter_path: forwarded to the expression loader.
        groups: forwarded to the expression loader.
        groups_name: label embedded in the report's file name.

    Returns:
        None.
    """
    print("about ot analyse: {}".format(tested_gene_file_name))
    # fetch gene expression by gene_id, divided by tumor type
    groups_results = load_expression_profile_by_labelling(
        gene_list_file_name=total_gene_file_name,
        gene_expression_file_name=gene_expression_file_name,
        phenotype_file_name=phenotype_file_name,
        gene_filter_file_name=gene_filter_file_name,
        tested_gene_path=total_gene_list_path,
        gene_expression_path=gene_expression_path,
        phenotype_path=phenotype_path,
        gene_filter_path=gene_filter_path,
        groups=groups)
    # For 2-D input, flipping axis 1 and then rotating by 90 degrees (the
    # previous implementation) is exactly a transpose.
    group_0_expression = np.asarray(groups_results[0]).T
    group_1_expression = np.asarray(groups_results[1]).T

    # test pval for significance differentiation between label values
    # (primary vs metastatic)
    pvals = []
    # Row 0 is skipped (presumably a header row -- confirm with the loader);
    # column 0 of every row holds the gene id.
    for i in range(1, len(group_0_expression)):
        # Convert each row once instead of once per statistic.
        values_0 = [float(c) for c in group_0_expression[i][1:]]
        values_1 = [float(c) for c in group_1_expression[i][1:]]
        mean_0 = np.average(values_0)
        mean_1 = np.average(values_1)

        mean_differences = mean_0 - mean_1
        # Both means are floored at 1 before taking the ratio.
        mean_foldchange = max(mean_0, 1) / max(mean_1, 1)

        cur_pval = scipy.stats.ttest_ind(values_0, values_1)[1]
        if math.isnan(cur_pval):
            continue

        # direction stays None when both means are exactly equal
        direction = None
        if mean_differences > 0:
            direction = "downregulated"
        elif mean_differences < 0:
            direction = "upregulated"
        pvals.append((group_0_expression[i][0], direction,
                      mean_differences, cur_pval, mean_foldchange))

    # Group by direction, then order by p-value. The leading flag puts
    # direction=None first and keeps None from being compared with a string,
    # which raises TypeError on Python 3.
    pvals.sort(key=lambda x: (x[1] is not None, x[1], x[3]))
    fdr_results = fdrcorrection0([x[3] for x in pvals],
                                 alpha=0.05,
                                 method='indep',
                                 is_sorted=False)
    # Insert the q-value right after the raw p-value.
    pvals = [(gene_id, direction, diff, pval, qval, foldchange)
             for (gene_id, direction, diff, pval, foldchange), qval
             in zip(pvals, fdr_results[1])]
    true_counter = int(np.count_nonzero(fdr_results[0]))
    print("true hypothesis: {}/{}".format(true_counter,
                                          np.size(fdr_results[0])))

    report_path = os.path.join(
        constants.OUTPUT_DIR,
        "deg_{}_{}_{}.txt".format(constants.CANCER_TYPE, groups_name,
                                  time.time()))
    with open(report_path, "w+") as f:
        f.write("".join("{}\t{}\t{}\t{}\t{}\t{}\n".format(*row)
                        for row in pvals))
        print("pval saved to file")
Пример #34
0
        (t, p) = ttest_ind(vfdb_parameters['scores'],
                           control_parameters['scores'])
        pvalue = p / 2  #one tailed
        test_pvalue[hg_id] = vfdb_parameters
        test_pvalue[hg_id]['pvalue'] = pvalue
        if t < 0:
            false_positives.append(hg_id)

# Pull the one-tailed p-values out of the per-group records, remembering the
# iteration order so corrected values can be matched back to their group.
tmp_pvalue = []
sorted_hg_ids = []
for hg_id in test_pvalue:
    # pop() both reads the p-value and strips it from the stored record.
    tmp_pvalue.append(test_pvalue[hg_id].pop('pvalue'))
    sorted_hg_ids.append(hg_id)

corrected_pvalue = fdrcorrection0(tmp_pvalue)[1]

# Groups already flagged before the correction stay false positives. Every
# hg_id is visited once, so a snapshot taken before the loop is sufficient
# and avoids scanning the list on each iteration.
already_false = set(false_positives)
for hg_id, q_value in zip(sorted_hg_ids, corrected_pvalue):
    if hg_id in already_false:
        continue
    if q_value <= 0.05:
        true_positives[hg_id] = test_pvalue[hg_id]
    elif q_value > 0.05:
        # A NaN q-value matches neither branch and is left unclassified.
        false_positives.append(hg_id)

# The context manager closes the file even if dump() raises.
with open('true_positives.pkl', 'wb') as out:
    dump(true_positives, out)

agreement = []
groups_description = {}
descriptions = set()
for hg_id in true_positives:
def fdr_correction_and_viz(Pvals_path, Tvals_path, C1_path, C2_path, mask_path,
                           save_destination, affine, header, combination):
    """FDR-correct voxel-wise p-values per ROI and save the resulting maps.

    Args:
        Pvals_path: ``.npy`` file with the p-values; the array is indexed as
            ``[:, :, :, roi]``.
        Tvals_path: ``.npy`` file with the t-values (saved unchanged).
        C1_path: ``.npy`` file with the first condition's map.
        C2_path: ``.npy`` file with the second condition's map.
        mask_path: image loaded with ``nib.load``; non-zero voxels select the
            values that are corrected.
        save_destination: parent output directory.
        affine: affine passed to every ``nib.Nifti1Image``.
        header: header passed to every ``nib.Nifti1Image``.
        combination: name of the sub-directory created under
            ``save_destination`` and suffix of the CSV file name.

    Returns:
        None. Six NIfTI files are written to
        ``<save_destination>/<combination>``. The stats CSV is written under
        its bare file name, i.e. relative to the current working directory.
    """
    fdr_alpha = 0.05

    Pvals = np.load(Pvals_path)
    Tvals = np.load(Tvals_path)
    C1 = np.load(C1_path)
    C2 = np.load(C2_path)

    mask = nib.load(mask_path).get_data()
    brain_indices = np.where(mask != 0)

    from statsmodels.sandbox.stats.multicomp import fdrcorrection0

    n_rois = Pvals.shape[3]
    Qvals = np.zeros(Pvals.shape)

    map_C1MinusC2 = C1 - C2
    effect_sign = np.sign(map_C1MinusC2)

    # sign(c1-c2) * -1 * log10(p)
    map_logp = np.multiply(effect_sign, (-1 * np.log10(Pvals)))

    # one row per ROI, 14 statistical attributes per row
    roi_voxel_stats_matrix = np.zeros((n_rois, 14))

    for roi in range(n_rois):
        print('Computing Stats for ROI: ', roi)

        # Only voxels inside the mask take part in the correction.
        pvals_list = Pvals[:, :, :, roi][brain_indices]
        _, qvals_list = fdrcorrection0(pvals_list, fdr_alpha)

        map_logp_list = map_logp[:, :, :, roi][brain_indices]
        diff_list = map_C1MinusC2[:, :, :, roi][brain_indices]

        # Write the corrected values back into the 4-D volume (voxels
        # outside the mask stay 0).
        Qvals[:, :, :, roi][brain_indices] = qvals_list

        map_logq_list = np.multiply(np.sign(diff_list),
                                    (-1 * np.log10(qvals_list)))

        roi_voxel_stats_matrix[roi, :] = count_voxel_stats(
            pvals_list, qvals_list, map_logp_list, map_logq_list)

    # sign(c1-c2) * -1 * log10(q)
    map_logq = np.multiply(effect_sign, (-1 * np.log10(Qvals)))

    save_destination_new = opj(save_destination, combination)
    if not os.path.exists(save_destination_new):
        os.mkdir(save_destination_new)

    print('Saving Files in directory: ', save_destination_new)

    print('Saving Stats CSV : ')
    csv_name = 'roi_voxel_stats_' + combination + '.csv'
    np.savetxt(
        csv_name,
        roi_voxel_stats_matrix,
        delimiter=',',
        header=
        'min_pval,min_qval,p_lt_point_1,p_lt_point_01, p_lt_point_05, q_lt_point_1, q_lt_point_01,q_lt_point_05, logq_gt_1point3, logq_gt_1 ,logq_gt_2 ,logp_gt_1point3, logp_gt_1, logp_gt_2'
    )

    # Every volume is wrapped with the same affine/header and saved under its
    # own file name, in this order.
    volumes = (
        ('Pvals.nii.gz', Pvals),
        ('Tvals.nii.gz', Tvals),
        ('Qvals.nii.gz', Qvals),
        ('C1MinusC2.nii.gz', map_C1MinusC2),
        ('map_logp.nii.gz', map_logp),
        ('map_logq.nii.gz', map_logq),
    )
    for file_name, volume in volumes:
        print('Saving ' + file_name)
        target = opj(save_destination_new, file_name)
        image = nib.Nifti1Image(volume, affine=affine, header=header)
        nib.save(image, target)
Пример #36
0
    def parallel_positive_selection_test(self,
                                         in_dir,
                                         tree_file,
                                         out_dir,
                                         results_file,
                                         seq_type="codons",
                                         codon_frequency="F3X4",
                                         noisy=3,
                                         verbose="concise",
                                         runmode=0,
                                         clock=0,
                                         aminoacid_distance=None,
                                         genetic_code=0,
                                         fix_kappa=False,
                                         kappa=5,
                                         getSE=0,
                                         RateAncestor=0,
                                         small_difference=0.000001,
                                         clean_data=True,
                                         method=0):
        """
        Run the branch-site positive selection test on every alignment in
        in_dir, for the branch labeled in the tree file.

        For each alignment two control files are generated and executed:
        Model_A (omega estimated) and Model_A_null (omega fixed to 1). The
        doubled log-likelihood difference is converted to a p-value with one
        degree of freedom, the p-values are adjusted with fdrcorrection0, and
        one line per alignment is written to results_file. Alignments for
        which a log-likelihood is missing are reported on stdout and left out
        of the results.
        """
        FileRoutines.safe_mkdir(out_dir)
        alignment_files_list = FileRoutines.make_list_of_path_to_files(in_dir)
        tree_file_abs_path = os.path.abspath(tree_file)

        # (model name, whether omega is fixed) in execution order
        models = (("Model_A", False), ("Model_A_null", True))

        # Options shared by both models of every alignment.
        shared_ctl_options = dict(seq_type=seq_type,
                                  codon_frequency=codon_frequency,
                                  noisy=noisy,
                                  verbose=verbose,
                                  runmode=runmode,
                                  clock=clock,
                                  aminoacid_distance=aminoacid_distance,
                                  model=2,
                                  nssites=2,
                                  genetic_code=genetic_code,
                                  fix_kappa=fix_kappa,
                                  kappa=kappa,
                                  omega=1,
                                  getSE=getSE,
                                  RateAncestor=RateAncestor,
                                  Mgene=0,
                                  small_difference=small_difference,
                                  clean_data=clean_data,
                                  method=method)

        options_list = []
        dir_list = []
        basename_dir_list = []

        for filename in alignment_files_list:
            _, basename, _ = FileRoutines.split_filename(filename)
            filename_out_dir = os.path.abspath("%s/%s/" % (out_dir, basename))
            basename_dir_list.append(basename)
            FileRoutines.safe_mkdir(filename_out_dir)

            for model, fix_omega in models:
                model_dir = "%s/%s/" % (filename_out_dir, model)
                FileRoutines.safe_mkdir(model_dir)
                out_file = "%s/%s/%s.out" % (filename_out_dir, model, basename)
                ctl_file = "%s/%s/%s.ctl" % (filename_out_dir, model, basename)

                # The control file is referenced by its bare name together
                # with the directory it lives in.
                options_list.append("%s.ctl" % basename)
                dir_list.append(model_dir)

                self.generate_ctl_file(os.path.abspath(filename),
                                       tree_file_abs_path,
                                       out_file,
                                       ctl_file,
                                       fix_omega=fix_omega,
                                       **shared_ctl_options)

        self.parallel_execute(options_list, dir_list=dir_list)

        # Collect the log-likelihood of both models for every alignment.
        results_dict = OrderedDict()
        for basename in basename_dir_list:
            lnl_by_model = OrderedDict()
            for model, _ in models:
                report_path = "%s/%s/%s/%s.out" % (out_dir, basename, model,
                                                   basename)
                lnl_by_model[model] = CodeMLReport(report_path).LnL
            results_dict[basename] = lnl_by_model

        def lnl_missing(name):
            # True when at least one of the two models produced no LnL.
            return any(results_dict[name][model] is None
                       for model, _ in models)

        double_delta_dict = OrderedDict()
        raw_pvalues_dict = OrderedDict()
        raw_pvalues_list = []

        for basename in basename_dir_list:
            if lnl_missing(basename):
                print("LnL was not calculated for %s" % basename)
                continue
            doubled_delta = 2 * (results_dict[basename]["Model_A"] -
                                 results_dict[basename]["Model_A_null"])
            p_value = chisqprob(doubled_delta, 1)  # degrees of freedom = 1

            double_delta_dict[basename] = doubled_delta
            raw_pvalues_dict[basename] = p_value
            raw_pvalues_list.append(p_value)

        adjusted_pvalues_list = fdrcorrection0(raw_pvalues_list)[1]
        # Adjusted p-values follow the order of the alignments that were not
        # skipped, so they are consumed one by one while writing.
        adjusted_pvalues = iter(adjusted_pvalues_list)

        with open(results_file, "w") as out_fd:
            out_fd.write(
                "id\tmodel_a_null,LnL\tmodel_a,LnL\t2*delta\traw p-value\tadjusted p-value\n"
            )
            for basename in basename_dir_list:
                if lnl_missing(basename):
                    print("LnL was not calculated for %s" % basename)
                    continue
                out_fd.write(
                    "%s\t%f\t%f\t%f\t%f\t%f\n" %
                    (basename, results_dict[basename]["Model_A_null"],
                     results_dict[basename]["Model_A"],
                     double_delta_dict[basename],
                     raw_pvalues_dict[basename], next(adjusted_pvalues)))
Пример #37
0
def calculate_sig(algo_sample=None,
                  dataset_sample=None,
                  n_dist_samples=300,
                  n_total_samples=None,
                  n_start_i=None,
                  limit=10000,
                  md_path=None,
                  dist_path=None,
                  filtered_go_ids_file="",
                  hg_th=0.0001):
    """Compute the hypergeometric (HG) and empirical (EMP) FDR cutoffs.

    The function
      1. reads the whitelist of GO ids from ``filtered_go_ids_file`` (plus
         ``constants.ROOT_GO_ID``),
      2. loads the table at ``md_path``, converts its ``hg_pval_max`` column
         (used here as -log10 p-values) to p-values, pads them with ones up to
         the whitelist size and runs ``fdrcorrection0`` at ``alpha=hg_th``,
      3. keeps the terms that pass the resulting HG cutoff, loads their
         background distributions from ``dist_path`` and turns each into an
         empirical p-value via ``calc_emp_pval`` on a random subset of
         ``n_dist_samples`` background samples,
      4. runs ``fdrcorrection0`` (alpha=0.05) on the per-term minimum empirical
         p-values to obtain the EMP cutoff.

    Args:
        algo_sample: second positional value substituted into ``md_path`` and
            ``dist_path``.
        dataset_sample: first positional value substituted into ``md_path``
            and ``dist_path``.
        n_dist_samples: number of background samples drawn (without
            replacement) per term; ``1 / n_dist_samples`` is also used as the
            floor for empirical p-values equal to zero.
        n_total_samples: number of background samples available per term.
            When ``None`` it is inferred from the first row of the
            distribution table.
        n_start_i: unused; kept for interface compatibility.
        limit: maximum number of terms whose distribution is evaluated.
        md_path: ``str.format`` template of the metadata TSV path.
        dist_path: ``str.format`` template of the distribution TSV path.
        filtered_go_ids_file: path of a whitespace-separated GO id list.
        hg_th: FDR level for the hypergeometric correction.

    Returns:
        ``None`` if the metadata table cannot be loaded, otherwise the tuple
        ``(EMP_TH, n_emp_true, HG_CUTOFF, n_hg_true, go_ids_result,
        go_names_result, mask_ids, mask_terms, emp_pvals_mat)``.
        ``EMP_TH`` / ``HG_CUTOFF`` are ``-1`` when nothing passes the
        respective correction.
    """
    # Context manager so the handle is closed even if reading fails.
    with open(filtered_go_ids_file, 'r') as go_ids_fd:
        filtered_go_ids = go_ids_fd.read().split() + [constants.ROOT_GO_ID]

    try:
        output_md = pd.read_csv(md_path.format(dataset_sample, algo_sample),
                                sep='\t',
                                index_col=0).reindex(filtered_go_ids).dropna()
    except Exception:
        # Deliberate best-effort: a missing/unreadable table means "no result".
        return None

    max_genes_pvals = np.power(10, -output_md.loc[:, "hg_pval_max"])
    print("total n_genes with pval less than one: {}/{}".format(
        np.size(max_genes_pvals), len(filtered_go_ids)))
    # Whitelisted terms absent from the table count as p-value 1.
    max_genes_pvals = np.append(
        max_genes_pvals,
        np.ones(len(filtered_go_ids) - np.size(max_genes_pvals)))

    fdr_results = fdrcorrection0(max_genes_pvals,
                                 alpha=hg_th,
                                 method='indep',
                                 is_sorted=False)
    n_hg_true = int(np.sum(fdr_results[0]))
    if n_hg_true > 0:
        HG_CUTOFF = np.sort(max_genes_pvals)[n_hg_true - 1]
        hg_enrichment_score = -np.log10(HG_CUTOFF)
        passes_hg = (output_md.loc[:, "hg_pval_max"].values >=
                     hg_enrichment_score)
    else:
        # No rejected hypothesis: keep the -1 sentinel but do not feed it to
        # log10 (that yields NaN plus a runtime warning); no term passes.
        HG_CUTOFF = -1
        hg_enrichment_score = float("nan")
        passes_hg = np.zeros(len(output_md.index), dtype=bool)
    print("HG cutoff: {}, (ES={}, n={})".format(HG_CUTOFF,
                                                hg_enrichment_score,
                                                n_hg_true))

    output_md = output_md.loc[passes_hg, :]

    dist_file = dist_path.format(dataset_sample, algo_sample)
    print(dist_file)
    output = pd.read_csv(dist_file, sep='\t', index_col=0).dropna()
    output = output.loc[output_md.index.values, :]

    # Infer the background size from the first row only if there is one;
    # indexing row 0 of an empty table would raise IndexError.
    if n_total_samples is None and len(output.index) > 0:
        n_total_samples = len(
            output.iloc[0].loc["dist_n_samples"][1:-1].split(", "))
    np.random.seed(int(random.random() * 1000))
    if n_total_samples is None:
        # Empty table: the loop below never runs, so no indices are needed.
        i_dist = np.array([], dtype=int)
    else:
        i_dist = np.random.choice(n_total_samples,
                                  n_dist_samples,
                                  replace=False)

    emp_pvals = []
    for counter, (_, cur) in enumerate(output.iterrows()):
        if counter == limit:
            break
        # "dist_n_samples" holds a bracketed, ", "-separated list of floats.
        dist_pvals = np.array([
            float(x) for x in cur["dist_n_samples"][1:-1].split(", ")
        ])[i_dist]
        emp_pvals.append(calc_emp_pval(cur["hg_pval"], dist_pvals))

    mask_ids = output.index.values
    # One array per term: a bracketed string expands to several values,
    # anything else becomes a single-element array.
    emp_pvals_mat = [
        np.array(x[1:-1].split(", ")).astype(np.float32)
        if isinstance(x, str) else np.array([x]) for x in emp_pvals
    ]

    n_modules = emp_pvals_mat[0].shape[0] if emp_pvals_mat else 0

    # Built in one pass instead of growing arrays with repeated np.append.
    max_emp_original_pvals = np.array([np.min(x) for x in emp_pvals_mat],
                                      dtype=np.float64)
    if emp_pvals_mat:
        emp_pvals = np.concatenate([np.ravel(x) for x in emp_pvals_mat
                                    ]).astype(np.float64)
    else:
        emp_pvals = np.array([])

    # An empirical p-value of exactly 0 is floored at 1 / n_dist_samples.
    min_emp_pval = 1.0 / n_dist_samples
    max_emp_pvals = np.sort(
        np.where(max_emp_original_pvals == 0, min_emp_pval,
                 max_emp_original_pvals))
    print("max emp pvals len: {}".format(len(max_emp_pvals)))
    print("min vals", 1.0 / n_dist_samples, np.min(list(max_emp_pvals) + [1]))
    print("max_genes_pvals: {}".format(max_genes_pvals.shape[0]))
    fdr_bh_results = fdrcorrection0(max_emp_pvals,
                                    alpha=0.05,
                                    method='indep',
                                    is_sorted=False)[0]
    n_emp_true = np.sum(fdr_bh_results)
    print("n_emp_true: {}".format(n_emp_true))

    # max_emp_pvals is already sorted, so the cutoff is the n-th smallest.
    EMP_TH = max_emp_pvals[n_emp_true - 1] if n_emp_true > 0 else -1

    mask_terms = np.array(
        [max(a, min_emp_pval) <= EMP_TH for a in emp_pvals])
    go_ids_result = np.array([])
    go_names_result = np.array([])
    n_emp_true_in_modules = 0
    if len(mask_terms) > 0:
        mask_terms = mask_terms.reshape(-1, n_modules)
        term_hit = mask_terms.any(axis=1)
        # Only the first len(term_hit) rows were evaluated (``limit`` may have
        # cut the loop short), so align the lookups with the mask length.
        go_ids_result = output.index.values[:len(term_hit)][term_hit]
        go_names_result = output["GO name"].values[:len(term_hit)][term_hit]
        n_emp_true_in_modules = np.sum(mask_terms)

    print("EMP cutoff: {}. # true terms passed EMP cutoff: {}".format(
        EMP_TH, n_emp_true))
    print("# true terms passed EMP cutoff across modules: {}".format(
        n_emp_true_in_modules))
    # Ratio is undefined when no term passed the HG correction.
    ehr = n_emp_true / float(n_hg_true) if n_hg_true > 0 else float("nan")
    print("EHR :{}".format(ehr))

    return EMP_TH, n_emp_true, HG_CUTOFF, n_hg_true, go_ids_result, go_names_result, mask_ids, mask_terms, emp_pvals_mat
Пример #38
0
    else:
        (t, p) = ttest_ind(vfdb_parameters['scores'], control_parameters['scores'])
        pvalue = p/2 #one tailed
        test_pvalue[hg_id]          = vfdb_parameters
        test_pvalue[hg_id]['pvalue']= pvalue
        if t < 0:
            false_positives.append(hg_id)

# Gather the one-tailed p-values in a fixed order together with the group id
# each one belongs to, removing the 'pvalue' entry from every parameter dict.
tmp_pvalue      = []
sorted_hg_ids   = []
for hg_id, parameters in test_pvalue.items():
    sorted_hg_ids.append(hg_id)
    tmp_pvalue.append(parameters.pop('pvalue'))

# Index 1 of the fdrcorrection0 result is used as the adjusted p-values.
corrected_pvalue = fdrcorrection0(tmp_pvalue)[1]

# Groups rejected before the correction stay rejected; a set makes the
# membership test O(1) instead of scanning the list for every group.
already_rejected = set(false_positives)
for hg_id, adjusted in zip(sorted_hg_ids, corrected_pvalue):
    if hg_id in already_rejected:
        continue
    if adjusted <= 0.05:
        true_positives[hg_id] = test_pvalue[hg_id]
    elif adjusted > 0.05:
        # Explicit comparison (not a bare else) so a NaN p-value lands in
        # neither list, as before.
        false_positives.append(hg_id)

# Context manager guarantees the pickle file is closed even if dump() raises.
with open('true_positives.pkl', 'wb') as out:
    dump(true_positives, out)

agreement = []
groups_description = {}
descriptions = set()
for hg_id in true_positives:
Пример #39
0
def main(dataset="SOC",
         algo="jactivemodules_sa",
         n_permutations=300,
         csv_file_name=os.path.join(constants.OUTPUT_GLOBAL_DIR, "emp_fdr",
                                    "MAX/emp_diff_{dataset}_{algo}.tsv"),
         n_total_terms=7435):
    """Filter GO terms by HG and empirical FDR and write a ``*_md.tsv`` report.

    Steps:
      1. load the expression matrix and the class labels of ``dataset``,
      2. load the per-term table ``csv_file_name`` and annotate every term
         with its gene count and depth,
      3. FDR-correct the hypergeometric scores of terms with 5 < n_genes < 500
         (padded to ``n_total_terms`` hypotheses) to get ``HG_CUTOFF``,
      4. for the terms passing that cutoff compute an empirical p-value and a
         mean difference, FDR-correct the empirical p-values and rank terms,
      5. write the combined table next to the input as ``<name>_md.tsv``.

    Args:
        dataset: dataset name; data is read from ``GE_<dataset>/data`` under
            ``constants.DATASETS_DIR``.
        algo: algorithm name substituted into ``csv_file_name``.
        n_permutations: forwarded to ``calc_empirical_pval``.
        csv_file_name: ``str.format`` template (``{dataset}``, ``{algo}``) of
            the input TSV.
        n_total_terms: size of the hypothesis family the HG p-values are
            padded to before correction (defaults to the previously
            hard-coded 7435).

    Returns:
        ``None`` if the input TSV cannot be read, otherwise
        ``(n_filtered_in_terms, n_emp_significant, HG_CUTOFF, emp_cutoff)``.
    """
    data_dir = os.path.join(constants.DATASETS_DIR, "GE_{}".format(dataset),
                            "data")
    dataset_data = pd.read_csv(os.path.join(data_dir, "ge.tsv"),
                               sep='\t',
                               index_col=0)
    # Only the first line of classes.tsv is used; read it with a context
    # manager instead of the Python-2-only, never-closed ``file(...)``.
    with open(os.path.join(data_dir, "classes.tsv")) as classes_fd:
        classes_line = classes_fd.readline()
    # ``np.int`` was only an alias of the builtin and no longer exists.
    classes_data = np.array(classes_line.strip().split("\t")).astype(int)

    csv_file_name = csv_file_name.format(dataset=dataset, algo=algo)
    try:
        df = pd.read_csv(csv_file_name, sep='\t', index_col=0)
    except Exception:
        # Best-effort: an unreadable table means there is nothing to do.
        # (A bare ``except:`` would also swallow KeyboardInterrupt.)
        return None
    df = df.dropna()

    go_ids = df.index.values
    # The last argument used to be the tautology ``cur_go_id == cur_go_id``.
    n_genes = [
        len(get_all_genes_for_term(vertices, go_id, go_id, True))
        for go_id in go_ids
    ]
    # Look up the vertex table once; skip it when there are no terms so an
    # empty ``dict_result`` is still tolerated in that case.
    term_vertices = (next(iter(dict_result.values()))['vertices']
                     if len(go_ids) > 0 else {})
    depth = [term_vertices[go_id]['D'] for go_id in go_ids]
    df["n_genes"] = pd.Series(n_genes, index=df.index)
    df["depth"] = pd.Series(depth, index=df.index)
    df = df.rename(columns={"filtered_pval": "hg_pval"})

    size_ok = np.logical_and(df["n_genes"].values > 5,
                             df["n_genes"].values < 500)
    hg_scores = df.loc[size_ok, "hg_pval"].values

    print("total n_genes with pval:{}/{}".format(np.size(hg_scores),
                                                 n_total_terms))
    # Pad with score 0 (p-value 1) up to the full family size; "hg_pval" is
    # used here as a -log10 p-value.
    hg_scores = np.append(hg_scores,
                          np.zeros(n_total_terms - np.size(hg_scores)))
    n_genes_pvals = np.power(10.0, -hg_scores)
    fdr_results = fdrcorrection0(n_genes_pvals,
                                 alpha=0.05,
                                 method='indep',
                                 is_sorted=False)
    true_counter = int(np.sum(fdr_results[0]))
    HG_CUTOFF = (-np.log10(np.sort(n_genes_pvals)[true_counter - 1])
                 if true_counter > 0 else 0)
    print("cutoff: {}".format(HG_CUTOFF))

    # Compute the inclusion mask once and use it for both partitions.
    keep = np.logical_and(size_ok, df["hg_pval"].values >= HG_CUTOFF)
    # ``.copy()`` so the column assignments below act on an independent
    # frame rather than on a slice of ``df``.
    df_filtered_in = df.loc[keep, :].copy()
    df_filtered_out = df.loc[~keep, :]

    df_filtered_in["index"] = df_filtered_in.index.values
    df_filtered_in["emp_pval"] = df_filtered_in.apply(
        lambda row: calc_empirical_pval(row, n_permutations), axis=1)
    df_filtered_in["mean_difference"] = df_filtered_in.apply(
        lambda x: mean_difference(x, dataset_data, classes_data), axis=1)

    emp_pvals = df_filtered_in["emp_pval"].values
    fdr_results = fdrcorrection0(emp_pvals,
                                 alpha=0.05,
                                 method='indep',
                                 is_sorted=False)
    true_counter = int(np.sum(fdr_results[0]))
    emp_cutoff = (np.sort(emp_pvals)[true_counter - 1]
                  if true_counter > 0 else 0)
    print("emp true hypothesis: {} (emp cutoff: {}, n={})".format(
        true_counter, emp_cutoff, len(fdr_results[0])))

    df_filtered_in["passed_fdr"] = df_filtered_in["emp_pval"] <= emp_cutoff

    # Smaller empirical p-value ranks first; larger HG score ranks first.
    df_filtered_in["emp_rank"] = df_filtered_in["emp_pval"].rank(
        ascending=True)
    df_filtered_in["hg_rank"] = df_filtered_in["hg_pval"].rank(
        ascending=False)

    df_filtered_in = df_filtered_in.sort_values(by=["emp_rank", "hg_rank"])

    df_all = pd.concat((df_filtered_in, df_filtered_out), axis=0)
    report_columns = [
        "GO name", "hg_pval", "emp_pval", "hg_rank", "emp_rank", "n_genes",
        "depth", "mean_difference", "passed_fdr"
    ]
    df_all.loc[df_all["hg_pval"].values > 0, :][report_columns].to_csv(
        csv_file_name[:-4] + "_md.tsv", sep='\t')
    return len(df_filtered_in.index), true_counter, HG_CUTOFF, emp_cutoff