Exemplo n.º 1
0
def write_drug_rank(tbl_file, fig_file, drug2pro, ind2drug, ind2pro, M,
                    N):
    """Write drugs ranked by hypergeometric enrichment p-value and plot the top 20.

    Parameters
    ----------
    tbl_file : path of the tab-separated table that is (over)written.
    fig_file : forwarded unchanged to ``enrich_plot``.
    drug2pro : dict mapping a drug index to a sequence of items; item[0] is a
        protein index (key of ``ind2pro``) and item[1] a numeric count that is
        summed per drug.
    ind2drug : mapping from drug index to a drug record.  Element 1 is used as
        the label in the plot, element 4 as the background success count, and
        every element except the last is written to the table.
    ind2pro : mapping from protein index to a protein record; element 4 is
        written as the protein label.
    M, N : population size and number of draws passed to ``hypergeom.sf``.

    Returns
    -------
    None.  Side effects: writes ``tbl_file`` and calls ``enrich_plot``.
    """
    drug2pval = {}
    for drug_idx, hits in drug2pro.items():
        q = ind2drug[drug_idx][4]
        m = sum(hit[1] for hit in hits)
        # When the summed count exceeds the recorded background count q, the
        # summed count itself is used as the number of successes.
        successes = m if m > q else q
        # sf(m - 1) is the upper tail P(X >= m).
        drug2pval[drug_idx] = hypergeom.sf(m - 1, M, successes, N)

    drug_name2erich_score = {}  # scores of the 20 best-ranked drugs, for the plot
    ranked = sorted(drug2pval.items(), key=lambda item: item[1])
    # The context manager closes the table even if writing a row fails.
    with open(tbl_file, 'w') as out:
        out.write(
            'Rank\tPubchem_ID\tChemical_Name\tMolecular_Weight\tSMILES\tProteins\tNum_Targets\tP_value\n'
        )
        for rank, (drug_idx, pval) in enumerate(ranked, start=1):
            drug = ind2drug[drug_idx]
            pros = [(ind2pro[hit[0]][4], hit[1]) for hit in drug2pro[drug_idx]]
            columns = [
                str(rank),
                '\t'.join(str(x) for x in drug[0:-1]),
                ';'.join(str(x) for x in pros),
                str(len(pros)),
                str(pval),
            ]
            out.write('\t'.join(columns) + '\n')
            if rank <= 20:
                drug_name2erich_score[drug[1]] = -np.log10(pval)
    enrich_plot(drug_name2erich_score, fig_file, 'drug')
def precision_table(pairs, Ls):
    '''Compute precisions at various numbers of top pairs (all pairs and
    long-range pairs) plus hypergeometric p-values.

    pairs: DataFrame with a boolean column 'LR' (used as a row mask), a column
        'dist.any_struct', and any number of further columns whose name
        contains 'dist'.  A pair counts as a hit when its distance is < 5.
    Ls: iterable of cut-offs; for each L the first L rows are evaluated.

    Returns a DataFrame with one column per L ('<L>' and 'LR <L>') and one row
    per distance column (precision formatted as a percentage string) plus the
    row 'p-value (any struct)'.
    '''
    dists = [k for k in pairs.keys() if 'dist' in k]
    LR = pairs[pairs['LR']]

    M = len(pairs)
    n = np.sum(pairs['dist.any_struct'].lt(5))
    LR_M = len(LR)
    # Successes of the long-range population must be counted on LR itself;
    # counting them on `pairs` can even exceed LR_M and make the test invalid.
    LR_n = np.sum(LR['dist.any_struct'].lt(5))

    prec = {}
    for L in Ls:
        all_key = str(L)
        lr_key = 'LR ' + str(L)
        prec[all_key] = {}
        prec[lr_key] = {}

        for d in dists:
            h = np.sum(pairs[d].head(L).lt(5))
            LR_h = np.sum(LR[d].head(L).lt(5))

            # Percentages are truncated to four characters (e.g. '66.6%').
            prec[all_key][d] = str(100 * h/L)[:4] + '%'
            prec[lr_key][d] = str(100 * LR_h/L)[:4] + '%'

        h = np.sum(pairs['dist.any_struct'].head(L).lt(5))
        LR_h = np.sum(LR['dist.any_struct'].head(L).lt(5))

        # sf(h - 1) = P(at least h hits among L draws).
        prec[all_key]['p-value (any struct)'] = hypergeom.sf(h-1, M, n, L)
        prec[lr_key]['p-value (any struct)'] = hypergeom.sf(LR_h-1, LR_M, LR_n, L)

    return pd.DataFrame(prec)
Exemplo n.º 3
0
def calc_HG_test(genelist_dataset, genes_subset_ranking, ranking, th=1):
    """Print a hypergeometric enrichment test for entries equal to `th`.

    genelist_dataset: sequence whose first element's length is the population size N.
    genes_subset_ranking: sequence of indexable items; its length is B and the
        items whose element 0 equals `th` are counted as b.
    ranking: sequence of values; those equal to `th` are counted as n.
    th: value to match (default 1).

    Prints the four counts and then ``hypergeom.sf(b - 1, N, B, n)``, i.e.
    P(X >= b).  Returns None.
    """
    b = sum(1 for entry in genes_subset_ranking if entry[0] == th)
    B = len(genes_subset_ranking)
    N = len(genelist_dataset[0])
    n = sum(1 for value in ranking if value == th)
    # Single-argument print(...) calls behave the same on Python 2 and 3.
    print("run HG test with {},{},{},{}".format(b, N, B, n))
    print(hypergeom.sf(b - 1, N, B, n))
Exemplo n.º 4
0
    def hg_scoring(self, data_links, type='spec-gcf'):
        """
        Calculate hypergeometric scores from DataLinks() co-occurrence matrices.

        data_links: object exposing the matrices M_gcf_strain and, depending on
            `type`, M_spec_gcf / M_notspec_gcf / M_spec_notgcf or
            M_fam_gcf / M_notfam_gcf / M_fam_notgcf.
        type: 'spec-gcf' or 'fam-gcf'.

        The scores are stored on self.hg_spec_gcf or self.hg_fam_gcf and
        returned.  Raises ValueError for any other `type`.
        """

        # NOTE: the co-occurrence matrices can't be used directly for this scoring
        # method because it seems to require more inclusive counts of the strains
        # in each object.  Instead of "number of strains only in GCF", it requires
        # "number of strains in the GCF PLUS the number shared between the GCF and
        # the other object".
        # e.g. if a spectrum has 3 strains, a GCF has 1 strain and there is 1 shared
        # strain, M_spec_gcf will correctly contain "1", but M_type1_notgcf will
        # contain "2" instead of "3", because the spectrum only has 2 distinct
        # strains vs the GCF.
        # To fix this the M_spec_gcf/M_fam_gcf matrix is added onto the others to
        # give the correct totals.
        if type == 'spec-gcf':
            overlap_counts = data_links.M_spec_gcf
            gcf_counts = overlap_counts + data_links.M_notspec_gcf
            other_counts = overlap_counts + data_links.M_spec_notgcf
        elif type == 'fam-gcf':
            overlap_counts = data_links.M_fam_gcf
            gcf_counts = overlap_counts + data_links.M_notfam_gcf
            other_counts = overlap_counts + data_links.M_fam_notgcf
        else:
            # Previously an unknown type fell through to `return hg_scores`
            # and failed with an UnboundLocalError.
            raise ValueError(
                "type must be 'spec-gcf' or 'fam-gcf', got {!r}".format(type))

        # Population size: number of columns of M_gcf_strain, broadcast to the
        # shape of the overlap matrix.
        num_strains = np.ones(overlap_counts.shape) * data_links.M_gcf_strain.shape[1]
        # loc=1 shifts the distribution by one, so sf(k, ..., loc=1) == P(X >= k).
        hg_scores = hypergeom.sf(overlap_counts, num_strains, gcf_counts, other_counts, loc=1)

        if type == 'spec-gcf':
            self.hg_spec_gcf = hg_scores
        else:
            self.hg_fam_gcf = hg_scores

        return hg_scores
Exemplo n.º 5
0
def hypergeometric_test(directory):
    """Print hypergeometric p-values for relocalisation gains and losses.

    For each location code ('C', then 'M') the files
    ``output_<code>_gain.csv`` and ``output_<code>_loss.csv`` are read from
    `directory`.  For each table ``hypergeom.sf(x - 1, M, n, N)`` is printed
    after a label line, where

    * x = sum of 'reloc_following_dup'
    * M = sum of 'number_of_dups' + sum of 'number_of_specs'
    * n = sum of 'reloc_following_spec' + sum of 'reloc_following_dup'
    * N = sum of 'number_of_dups'

    Nothing is returned; all results go to standard output.
    """
    for code in ('C', 'M'):
        # Both tables are loaded before anything is printed for this code.
        gain_table = pd.read_csv(f"{directory}/output_{code}_gain.csv")
        loss_table = pd.read_csv(f"{directory}/output_{code}_loss.csv")

        for suffix, table in ((' gains', gain_table), (' losses', loss_table)):
            dup_relocs = table['reloc_following_dup'].sum()
            population = (table['number_of_dups'].sum()
                          + table['number_of_specs'].sum())
            all_relocs = (table['reloc_following_spec'].sum()
                          + table['reloc_following_dup'].sum())
            draws = table['number_of_dups'].sum()

            print(code + suffix)
            print(hypergeom.sf(dup_relocs - 1, population, all_relocs, draws))
Exemplo n.º 6
0
def write_tar_rank(tbl_file, fig_file, pro2drug, ind2pro, M,
                   N):
    """Write targets ranked by hypergeometric enrichment p-value and plot the top 20.

    Parameters
    ----------
    tbl_file : path of the tab-separated table that is (over)written.
    fig_file : forwarded unchanged to ``enrich_plot``.
    pro2drug : dict mapping a protein index to a sequence of items; item[0] is
        a drug index and item[1] a numeric count that is summed per protein.
    ind2pro : mapping from protein index to a protein record.  Element 1 is
        used as the label in the plot, element 3 as the background success
        count, and every element except the last is written to the table.
    M, N : population size and number of draws passed to ``hypergeom.sf``.

    NOTE(review): ``ind2drug`` is not a parameter here; it is resolved from
    the enclosing module scope and must exist there when this is called.

    Returns
    -------
    None.  Side effects: writes ``tbl_file`` and calls ``enrich_plot``.
    """
    pro2pval = {}
    for pro_idx, hits in pro2drug.items():
        q = ind2pro[pro_idx][3]
        m = sum(hit[1] for hit in hits)
        # When the summed count exceeds the recorded background count q, the
        # summed count itself is used as the number of successes.
        successes = m if m > q else q
        # sf(m - 1) is the upper tail P(X >= m).
        pro2pval[pro_idx] = hypergeom.sf(m - 1, M, successes, N)

    pro_name2enrich_score = {}  # scores of the 20 best-ranked targets, for the plot
    ranked = sorted(pro2pval.items(), key=lambda item: item[1])
    # The context manager closes the table even if writing a row fails.
    with open(tbl_file, 'w') as out:
        out.write(
            'Rank\tUniprot_ID\tProtein_Name\tEntry_Name\tTot_Num_Chemicals\tGene_Name\tGene_ID\tPDB\tPathway_Ids\tPathway_Names\tGO_Function\tGO_Process\tGO_Component\tChemicals\tNum_Chemicals\tP_value\n'
        )
        for rank, (pro_idx, pval) in enumerate(ranked, start=1):
            pro = ind2pro[pro_idx]
            drugs = [(ind2drug[hit[0]][1], hit[1]) for hit in pro2drug[pro_idx]]
            columns = [
                str(rank),
                '\t'.join(str(x) for x in pro[0:-1]),
                ';'.join(str(x) for x in drugs),
                str(len(drugs)),
                str(pval),
            ]
            out.write('\t'.join(columns) + '\n')
            if rank <= 20:
                pro_name2enrich_score[pro[1]] = -np.log10(pval)
    enrich_plot(pro_name2enrich_score, fig_file, 'target')
Exemplo n.º 7
0
def getMultiplePsFdr(iva, ivb, model, N, win=6):
    """
    For the interval pair (iva, ivb), search nearby windows to estimate FDR and p-values.

    The alternative idea of using only matched nearby windows at a similar
    distance to a & b needs too many windows, so every pair of nearby windows
    returned by getNearbyPairRegions is used as background.

    Parameters: iva, ivb are the two intervals; model is indexable and
    model[0] / model[1] are handed to getCounts (presumably source/target
    indexes - confirm against getCounts); N is the population size used by the
    hypergeometric and binomial tests; win is forwarded to getNearbyPairRegions.

    Returns the 9-tuple (ra, rb, rab, es, fdr, hyp, chyp, pop, nbp):
    counts for a, b and their overlap, enrichment score, local FDR,
    hypergeometric p-value, fraction of nearby windows with a smaller
    hypergeometric p-value, Poisson p-value and binomial p-value.
    When no nearby window pair is usable, the tuple
    (ra, rb, rab, inf, 0.0, hyp, 0.0, 0.0, 0.0) is returned.
    """
    ra, rb, rab = getPETsforRegions(iva, ivb, model)
    # Simple hypergeometric test. Using cis_a + cis_b + trans_a + trans_b as M
    # and cis_a + cis_b as N fails, with all p-values equal to 1.
    hyp = hypergeom.sf(rab - 1.0, N, ra, rb)
    ivas, ivbs = getNearbyPairRegions(iva, ivb, win=win)
    hyps, rabs, nbps = [], [], []
    for na in ivas:
        nraSource = getCounts(na, model[0])
        nraTarget = getCounts(na, model[1])
        nra = nraSource.union(nraTarget)
        nralen = float(len(nra))
        # skip empty nearby windows
        if nralen < 1:
            continue
        for nb in ivbs:
            nrbSource = getCounts(nb, model[0])
            nrbTarget = getCounts(nb, model[1])
            nrb = nrbSource.union(nrbTarget)
            nrblen = len(nrb)
            if nrblen < 1:
                continue
            nrab = float(len(nra.intersection(nrb)))
            #nrab = float(len(nraSource.intersection(nrbTarget)))
            # collect the value for the Poisson test
            rabs.append(nrab)
            # collect the nearby hypergeometric test result
            nhyp = hypergeom.sf(nrab - 1.0, N, nralen, nrblen)
            hyps.append(nhyp)
            # collect the probability for the following binomial test
            den = nrab / (nralen * nrblen)
            nbps.append(den)
    # No usable background: the trailing comma still yields a 9-tuple.
    if len(rabs) == 0:
        return ra, rb, rab, np.inf, 0.0, hyp, 0.0, 0.0, 0.0,
    hyps, rabs = np.array(hyps), np.array(rabs)
    # local FDR: fraction of nearby pairs with strictly more shared items
    fdr = len(rabs[rabs > rab]) / float(len(rabs))
    mrabs = float(np.mean(rabs))
    # enrichment score relative to the mean nearby overlap
    if mrabs > 0:
        es = rab / mrabs
    else:
        es = np.inf
    #es = rab / max([np.mean(rabs),float(np.percentile(rabs,90))])
    #es = rab / float(np.percentile(rabs,90))
    # corrected hypergeometric FDR: fraction of nearby p-values strictly smaller
    chyp = len(hyps[hyps < hyp]) / float(len(hyps))
    # simple Poisson test; the idea of a dynamic lambda is borrowed from MACS
    lam = mrabs
    pop = poisson.sf(rab - 1.0, lam)
    # simple binomial test
    bp = np.mean(nbps) * ra * rb / N
    #nbp = binom.sf(rab, N, bp)
    nbp = binom.sf(rab - 1.0, N - rab, bp)
    return ra, rb, rab, es, fdr, hyp, chyp, pop, nbp
Exemplo n.º 8
0
def calc_HG_test(total_gene_list_N, tests_gene_list_B, total_gene_list_n):
    """Run a hypergeometric enrichment test on three gene lists.

    total_gene_list_N: background genes; its length is the population size N.
    tests_gene_list_B: genes of the tested set; the number of distinct
        entries is B.
    total_gene_list_n: drawn genes; its length is the sample size n.

    b is the number of distinct genes shared by the drawn list and the tested
    set.  Prints the four counts and returns the string
    "<p>\\t(<b> <N> <B> <n>)" with p = hypergeom.sf(b - 1, N, B, n).
    """
    tested = set(tests_gene_list_B)
    b = len(set(total_gene_list_n).intersection(tested))
    B = len(tested)  # not intersected with the background list
    N = len(total_gene_list_N)
    n = len(total_gene_list_n)
    # Single-argument print(...) behaves the same on Python 2 and 3.
    print("run HG test with {},{},{},{}".format(b, N, B, n))
    return "{}\t({} {} {} {})".format(hypergeom.sf(b - 1, N, B, n), b, N, B, n)
Exemplo n.º 9
0
def _prob_hypergeo_fast(y_compute, name, X, M, n, N):
    """Compute hypergeometric Pvalue.

    Description
    -----------
    Suppose you have a lot of 100 floppy disks (M), and you know that 20 of them are defective (n).
    What is the prbability of drawing zero to 2 floppy disks (N=2), if you select 10 at random (N).
    P=hypergeom.sf(2,100,20,10)

    """
    P = np.nan
    logP = np.nan
    # M = len(yc)  # Population size: Total number of samples, eg total number of genes; 10000
    # n = np.sum(datac)  # Number of successes in population, known in pathway, eg 2000
    # N = np.sum(yc)  # sample size: Random variate, eg clustersize or groupsize, over expressed genes, eg 300
    # X = np.sum(np.logical_and(yc, datac.values)) - 1  # Let op, de -1 is belangrijk omdatje P<X wilt weten ipv P<=X. Als je P<=X doet dan kan je vele false positives krijgen als bijvoorbeeld X=1 en n=1 oid

    # Do the hypergeo-test
    if y_compute and (X > 0):
        P = hypergeom.sf(X, M, n, N)
        logP = hypergeom.logsf(X, M, n, N)

    # Store
    out = {}
    out['category_label'] = name
    out['P'] = P
    out['logP'] = logP
    out['overlap_X'] = X
    out['popsize_M'] = M
    out['nr_succes_pop_n'] = n
    out['samplesize_N'] = N
    out['dtype'] = 'categorical'
    return (out)
Exemplo n.º 10
0
def tf_hyper_geom(selected_genes: np.ndarray, epi_data_clustered):
    """Return whether the overlap of two indicator vectors is enriched (p < 0.1).

    The population size is the number of entries of `selected_genes`; the
    successes are the sum of `selected_genes`, the draws the sum of
    `epi_data_clustered`, and the observed overlap the sum of their
    element-wise product.  The p-value is P(X >= overlap).
    """
    population = selected_genes.size
    successes = np.sum(selected_genes)
    draws = np.sum(epi_data_clustered)
    overlap = np.sum(selected_genes * epi_data_clustered)
    # sf(overlap - 1) includes the observed overlap in the upper tail.
    upper_tail = hypergeom.sf(overlap - 1, population, successes, draws)
    return upper_tail < 0.1
Exemplo n.º 11
0
def enrichmentOneSided(subsetGO, backgroundTotal, backgroundGO, subsetTotal):
    """
    One-sided (enrichment) hypergeometric test for a single GO term.

    The p-value is the probability of observing `subsetGO` or more successes
    in `subsetTotal` draws from a population of `backgroundTotal` items that
    contains `backgroundGO` successes.  Because the survival function is
    1 - cdf and the cdf includes its argument, P(X >= k) is obtained as
    sf(k - 1).

    Parameters
    ----------
    subsetGO : int
        Number of genes in the subset of interest associated with the GO term.
    backgroundTotal : int
        Total number of genes in the background set.
    backgroundGO : int
        Number of background genes associated with the GO term.
    subsetTotal : int
        Total number of genes in the subset of interest.

    Returns
    -------
    float
        The p-value of the one-sided hypergeometric test.
    """
    one_below_observed = subsetGO - 1
    return hypergeom.sf(one_below_observed, backgroundTotal,
                        backgroundGO, subsetTotal)
Exemplo n.º 12
0
    def get_param(self):
        """Compute hypergeometric p-values per (file, miRNA) pair and save them to Excel.

        Reads 'Regression_filter.xlsx' and 'Regression_filter_hyper.xlsx' from
        <self.root>/database/Fantom/v5/cell_lines, reads one tab-separated
        '<fname>.xls' GSEA table per row of the second workbook from the
        hard-coded `dirname`, and writes 'p_values.xlsx' next to the inputs.
        Each written cell is the ';'-joined list x;m;n;k;p.  Returns None.
        """
        # NOTE(review): machine-specific absolute path; must exist for listdir to succeed.
        dirname = 'C:/Users/Mingyu/gsea_home/output/mar25/my_analysis.Gsea.1585092763735'
        flist = os.listdir(dirname)

        # NOTE(review): fmap is filled here but never read again in this method.
        fmap = OrderedDict()
        for f in flist:
            if f.endswith('.xls') and 'CHR' in f:
                fmap[os.path.splitext(f)[0]] = os.path.join(dirname, f)

        fpath = os.path.join(self.root, 'database/Fantom/v5/cell_lines', 'Regression_filter.xlsx')
        df_lasso = pd.read_excel(fpath, index_col=0)

        fpath = os.path.join(self.root, 'database/Fantom/v5/cell_lines', 'Regression_filter_hyper.xlsx')
        df = pd.read_excel(fpath, index_col=0)

        df_res = pd.DataFrame(index=df.index)
        for fname in df.index:
            # The 'GENEs', 'Comp' and 'miRNA' cells are ':'-separated lists.
            gset = df.loc[fname, 'GENEs'].split(':')
            comp = df.loc[fname, 'Comp'].split(':')
            mirna = df.loc[fname, 'miRNA'].split(':')

            df_gsea = pd.read_csv(os.path.join(dirname, fname + '.xls'), sep='\t')
            for genes, mir in zip(gset, mirna):
                # Drop the first and last character (presumably brackets) before splitting.
                genes = genes[1:-1].split(',')
                x = len(genes)
                m = df_gsea.shape[0]
                n = len(comp)
                k = len(df_lasso.loc[mir, 'GENEs'].split(';'))
                # NOTE(review): sf(x, ...) is P(X > x); unlike the usual
                # enrichment form sf(x - 1, ...), x itself is excluded - confirm intent.
                p = hypergeom.sf(x, n+m, m, k)
                df_res.loc[fname, mir] = ';'.join(map(str, [x, m, n, k, p]))

        # Columns that are empty for every file are dropped before saving.
        df_res.dropna(how='all', axis=1).to_excel(os.path.join(self.root, 'database/Fantom/v5/cell_lines', 'p_values.xlsx'))
Exemplo n.º 13
0
def calc_all_data(data, diff_gene_hascell, n, N, pvalue, ratio):
    """Test every cell type for enrichment among the differential genes.

    data: DataFrame whose column 0 holds gene identifiers and column 1 cell names.
    diff_gene_hascell: iterable of genes; for each cell type the genes having
        that cell name in `data` are counted as k.
    n: number of draws; N: population size.
    pvalue, ratio: keep only rows with Pvalue < pvalue and oddsRatio >= ratio.

    NOTE(review): `CellName`, `method` and `qvalue` are not parameters; they
    are resolved from the enclosing module scope.

    Returns the filtered DataFrame with columns CellName, Pvalue, p.adjust,
    oddsRatio, ExpCount, Count and Size.
    """
    cellname = []
    pval = []
    odd = []
    exp = []
    cout = []
    size = []
    for i in CellName:
        M = data[data[1] == i].shape[0]  # number of rows annotated with this cell type
        exp_count = n*M/N
        k = 0
        for j in diff_gene_hascell:
            if i in data[data[0] == j][1].tolist():
                k = k+1
        if method == "Fisher":
            # Evaluate the exact test once and unpack both results instead of
            # running it a second time for the p-value.
            OddsRatio, p = fisher_exact([[M-k, N-M-n+k], [k, n-k]])
        else:
            OddsRatio = k/exp_count
            # stats.hypergeom.sf(x, m+n, m, k) corresponds to R's
            # phyper(x, m, n, k, lower.tail=FALSE)
            p = hypergeom.sf(k-1, N, M, n)
        cellname.append(i)
        pval.append(p)
        odd.append(OddsRatio)
        exp.append(exp_count)
        cout.append(k)
        size.append(M)
    qvalues = qvalue(pval)
    fin = pd.DataFrame({"CellName": cellname, "Pvalue": pval, "p.adjust": qvalues,
                        "oddsRatio": odd, "ExpCount": exp, "Count": cout, "Size": size})
    fin_fliter = fin[(fin['Pvalue'] < pvalue) & (fin['oddsRatio'] >= ratio)]
    return fin_fliter
Exemplo n.º 14
0
 def compute_hypergeometric_score(self, complete_list, target_list):
     """Store and return the hypergeometric upper-tail score of the intersection.

     The population size is complete_list.initialLength, the draw count is
     target_list.initialLength, and the success count and observed overlap
     are the first and last entries of self.afterIntersectionLength.
     """
     intersections = self.afterIntersectionLength
     observed = intersections[-1]
     # sf(observed - 1) = P(X >= observed)
     self.hypergeometricScore = hypergeom.sf(
         observed - 1,
         complete_list.initialLength,
         intersections[0],
         target_list.initialLength,
     )
     return self.hypergeometricScore
def get_enriched_properties(nodes, semantic_type, pcut=1e-4):
    """Return the properties that are enriched among `nodes`.

    Small molecules, molecular mixtures and drugs are treated as
    'biolink:ChemicalEntity'; any other semantic type yields an empty list.

    Each result is a tuple
    (p-value, property, number of draws, nodes with the property,
    total node count, curies) and only properties with p-value < pcut are
    kept.  The list is sorted ascending by plain tuple comparison.
    """
    chemical_subtypes = [
        'biolink:SmallMolecule', 'biolink:MolecularMixture', 'biolink:Drug'
    ]
    if semantic_type in chemical_subtypes:
        semantic_type = 'biolink:ChemicalEntity'
    if semantic_type != 'biolink:ChemicalEntity':
        return []

    lookup = PropertyLookup()
    # curies_by_property = {property: (curies with it)}
    curies_by_property = lookup.collect_properties(nodes, semantic_type)

    hits = []
    for prop, curies in curies_by_property.items():
        # Hypergeometric model: the population is every node of this type,
        # the successes are the nodes carrying the property, and `nodes` are
        # the draws (without replacement).
        observed = len(curies)  # draws with the property
        population = lookup.get_nodecount(semantic_type)
        with_property = lookup.total_nodes_with_property(prop, semantic_type)
        draws = len(nodes)
        pval = hypergeom.sf(observed - 1, population, with_property, draws)
        if pval < pcut:
            hits.append((pval, prop, draws, with_property, population, curies))
    hits.sort()
    return hits
Exemplo n.º 16
0
    def enrichment(self, locus_list, pval_cutoff=0.05, max_term_size=300,
                   min_term_size=5):
        '''
            Evaluates enrichment of loci within the locus list in terms within
            the ontology. NOTE: this only tests terms that have at least one
            locus that exists in locus_list.

            Parameters
            ----------
            locus_list : list of co.Locus
                A list of loci for which to test enrichment, i.e. is there
                an over-representation of these loci within the terms of
                the Ontology.
            pval_cutoff : float (default: 0.05)
                Report terms with a p-value lower than or equal to this value
            max_term_size : int (default: 300)
                The maximum term size for which to test enrichment. Useful
                for filtering out large terms that would otherwise be
                uninformative (e.g. top level GO terms)
            min_term_size : int (default: 5)
                The minimum term size for which to test enrichment.

            Returns
            -------
            list of terms whose hypergeometric p-value is <= pval_cutoff.
            Each returned term has attrs['pval'] and attrs['hyper'] set.
        '''
        # NOTE(review): the IN (...) list is built by string formatting from
        # locus ids; an id containing a quote would break (or alter) the query.
        terms = list(
            filter(
                lambda t: (len(t) >= min_term_size) and (len(t) <= max_term_size),
                [self[name] for name, in  self.db.cursor().execute(
                    '''SELECT DISTINCT(term) 
                    FROM term_loci WHERE id IN ('{}')
                    '''.format(
                        "','".join([x.id for x in locus_list])
                    )
                ).fetchall()
                ]
            )
        )
        # The universe is the union of loci over the retained terms only.
        num_universe = len(set(chain(*[x.loci for x in terms])))
        self.log(
            'test loci occur in {} terms, containing {} genes'.format(
                len(terms), num_universe
            )
        )
        significant_terms = []
        for term in terms:
            term_genes = term.loci
            # Size check on the loci set itself (the filter above used len(term)).
            if len(term_genes) > max_term_size:
                continue
            num_common = len(term_genes.intersection(locus_list))
            num_in_term = len(term_genes)
            num_sampled = len(locus_list)
            # sf(num_common - 1) = P(at least num_common loci in common)
            pval = hypergeom.sf(num_common-1,num_universe,num_in_term,num_sampled)
            if pval <= pval_cutoff:
                term.attrs['pval'] = pval
                term.attrs['hyper'] = {
                    'pval'       : pval,
                    'num_common' : num_common,
                    'num_universe' : num_universe,
                    'num_in_term' : num_in_term,
                    'sum_sampled' : num_sampled
                }
                significant_terms.append(term)
        return significant_terms
Exemplo n.º 17
0
    def compute_pvalues_by_hypergeom(self, **kwargs):
        """Compute neighborhood enrichment p-values with the hypergeometric test.

        Reads self.graph, self.attributes, self.node2attribute,
        self.neighborhoods and self.multiple_testing.  Stores the p-value
        matrix in self.pvalues_pos and its -log10 transform in self.nes.
        Keyword arguments are only echoed to standard output.
        """
        print('Using the hypergeometric test to calculate enrichment...')
        if kwargs:
            print('Overwriting global settings:')
            for key, value in kwargs.items():
                print('\t%s=%s' % (key, str(value)))

        num_nodes = self.graph.number_of_nodes()
        num_attributes = len(self.attributes)

        # Population size: the node count in every (node, attribute) cell.
        population = np.zeros([num_nodes, num_attributes]) + num_nodes

        # Per-attribute totals (NaNs ignored), repeated once per node.
        group_totals = np.nansum(self.node2attribute, axis=0)
        in_group = np.tile(group_totals, (num_nodes, 1))

        # Column sums of the neighborhood matrix, repeated once per attribute.
        neighborhood_sizes = np.sum(self.neighborhoods, axis=0)
        in_neighborhood = np.tile(neighborhood_sizes[:, np.newaxis],
                                  (1, num_attributes))

        # Attribute values with NaN replaced by 0, summed over each neighborhood.
        attribute_values = np.where(~np.isnan(self.node2attribute),
                                    self.node2attribute, 0)
        in_neighborhood_in_group = np.dot(self.neighborhoods, attribute_values)

        self.pvalues_pos = hypergeom.sf(in_neighborhood_in_group - 1,
                                        population, in_group, in_neighborhood)

        # Correct for multiple testing
        if self.multiple_testing:
            print('Running FDR-adjustment of p-values...')
            adjusted = np.apply_along_axis(fdrcorrection, 1, self.pvalues_pos)
            self.pvalues_pos = adjusted[:, 1, :]

        # Log-transform into neighborhood enrichment scores (NES)
        self.nes = -np.log10(self.pvalues_pos)
Exemplo n.º 18
0
def calculate_pvalues(nodes, query, background_attribute, M,
        min_category_size=3, max_category_size=500,
        max_category_depth=5, **kwargs):
    """ calculate pvalues for all categories (nodes)

    :param nodes: iterable of dict-like nodes; each must hold its background
        set under `background_attribute` and may hold a 'depth' entry
    :param query: set of identifiers
    :param background_attribute: node attribute assoc. with the background set
    :param M: population size passed to the hypergeometric test
    :param min_category_size: categories of this size or smaller are ignored
    :param max_category_size: categories larger than this number are ignored
    :param max_category_depth: categories deeper than this are ignored
    :returns: zip of three sequences: pvalues, x (overlap), n (category size);
        ignored categories get NaN as p-value
    """
    draws = len(query)
    rows = []
    for node in nodes:
        members = node[background_attribute]
        category_size = len(members)
        overlap = len(query.intersection(members))

        skip = (node.get('depth', 0) > max_category_depth
                or category_size <= min_category_size
                or category_size > max_category_size)
        if skip:
            pvalue = float('NaN')
        else:
            # sf(overlap) is P(X > overlap): the observed overlap is excluded.
            pvalue = hypergeom.sf(overlap, M, category_size, draws)
        rows.append((pvalue, overlap, category_size))
    return zip(*rows)
Exemplo n.º 19
0
def calc_feat_overrep_lin(condition, ttype):
    """Hypergeometric over-representation of non-zero scores per feature group.

    For each of seven cell lines the pickled score dict
    'predict4/results/histScores<condition><cell><ttype>.pkl' is loaded.
    Features are grouped by name: 'H' (starts with 'H', ends with '3'),
    'B' (starts with 'H', ends with '1') and 'C' (starts with 'E').  The
    number of non-zero scores per group is averaged over the cell lines
    (floored) and tested with ``hypergeom.sf``.

    Prints the condition/type and one line per group; returns the list of
    p-values in the order H, B, C.  Group sizes and the feature count are
    taken from the last cell line loaded.
    """
    cells = ['Gm12878', 'H1hesc', 'Helas3', 'Hepg2', 'Huvec', 'K562', 'Nhek']
    keys = ['H', 'B', 'C']
    # One independent counter array per group.  dict.fromkeys(keys, array)
    # would make all groups share (and overwrite) the same array.
    nonzeros = {k: np.zeros(len(cells)) for k in keys}
    f_len = dict.fromkeys(keys, None)
    nz_len = dict.fromkeys(keys, None)
    for c, cell in enumerate(cells):
        path = 'predict4/' + 'results/histScores' + condition + cell + ttype + '.pkl'
        # NOTE(review): pickle.load can execute arbitrary code; only use on
        # trusted result files.
        with open(path, 'rb') as handle:
            hist_scores = pickle.load(handle)
        # Materialise keys/values so indexing also works with Python 3 dict views.
        features = list(hist_scores.keys())
        fs_scores = np.array(list(hist_scores.values()))
        # dtype=int keeps an empty group usable as an index array.
        ind = {
            'H': np.array([i for i, v in enumerate(features)
                           if v.startswith('H') and v.endswith('3')], dtype=int),
            'B': np.array([i for i, v in enumerate(features)
                           if v.startswith('H') and v.endswith('1')], dtype=int),
            'C': np.array([i for i, v in enumerate(features)
                           if v.startswith('E')], dtype=int),
        }
        for k in keys:
            nonzeros[k][c] = len(np.nonzero(fs_scores[ind[k]])[0])
    for k in keys:
        f_len[k] = len(fs_scores[ind[k]])
        nz_len[k] = math.floor(nonzeros[k].mean())
    t_nz = sum(nz_len[k] for k in keys)
    # Formatted single-argument print behaves the same on Python 2 and 3.
    print('{} {}'.format(condition, ttype))
    pvals = []
    for k in keys:
        h_score = hypergeom.sf(nz_len[k], len(features), t_nz, f_len[k])
        pvals.append(h_score)
        print('{} {}'.format(k, h_score))
    return pvals
Exemplo n.º 20
0
def GSEA(refset, quiery, clumpsetbd, genesetdb, coord_gene_dict, gen='no'):
    """Hypergeometric gene-set enrichment over index (clump) -> SNP -> gene mappings.

    refset: only its length is used (N).
    quiery: dict mapping an index to an iterable of SNPs.
    coord_gene_dict: dict mapping a SNP to a record whose element 1 is a gene
        count (or contains " not_set") and whose elements from position 2 on
        are the genes.
    clumpsetbd, genesetdb: dicts keyed by gene-set name; the distinct size of
        clumpsetbd[name] is K and genesetdb[name] holds the set's genes.
    gen: when 'yes', k is capped (see below).

    Returns a DataFrame of the gene sets whose q-value (from the external
    `estimate`, presumably a q-value estimator - confirm) is <= 0.1, sorted by
    q-value, or an empty list when none pass.
    """
    pval_arrays_new = list()
    N = len(refset)
    # NOTE(review): this value of n is overwritten by n_adj further down.
    n = len(set(quiery.keys()))
    genes = defaultdict(list)
    passed_genes = set()
    nums = list()
    n_adj = 0
    for index in quiery.keys():
        index_set = set()
        for snp in quiery[index]:
            if snp in coord_gene_dict.keys(
            ) and " not_set" not in coord_gene_dict[snp][1]:
                indexes = int(coord_gene_dict[snp][1])
                for i in range(indexes):
                    index_set.add(coord_gene_dict[snp][i + 2])
                    genes[index].append(coord_gene_dict[snp][i + 2])
        # Count an index only if it contributes at least one gene not seen before.
        if len(index_set) != 0 and len(
                passed_genes.union(index_set)) > len(passed_genes):
            n_adj += 1
            passed_genes = passed_genes.union(index_set)
    n = n_adj
    for gene_set in clumpsetbd:
        K = len(set(clumpsetbd[gene_set]))
        count = set()
        igenes = set()
        # Collect the indexes hitting this gene set and the genes responsible.
        for index in genes.keys():
            if len(set(genesetdb[gene_set]).intersection(set(
                    genes[index]))) != 0:
                count.add(index)
                igenes.update(
                    set(genesetdb[gene_set]).intersection(set(genes[index])))
        k = len(count)
        if gen == 'yes':
            # Cap k by the number of hit genes, n_adj and K.
            k = min(len(set(genesetdb[gene_set]).intersection(igenes)), n_adj,
                    K, k)


        # disabled adjustment: n=n-(len(count)-k)
        nums.append(
            (N, n, K, k,
             ";".join(list(set(genesetdb[gene_set]).intersection(igenes)))))
        # NOTE(review): the population passed here is N - k, not N - confirm intent.
        pval_arrays_new.append(hypergeom.sf(k - 1, N - k, n, K))
    qval_arrays = list(estimate(np.asarray(pval_arrays_new)))
    results = defaultdict(list)
    i = 0
    for geneset in clumpsetbd:
        if qval_arrays[i] <= 0.1:
            results[geneset].append(pval_arrays_new[i])
            results[geneset].append(qval_arrays[i])
            results[geneset].append(nums[i])
        i += 1
    if len(results) != 0:
        df = pd.DataFrame.from_dict(results, orient='index')
        df.reset_index(inplace=True)
        # Column 1 holds the q-value appended second above.
        df = df.sort_values(by=[1])
        return (df)
    else:

        return (list())
Exemplo n.º 21
0
def calc_feat_overrep_mlp(condition, ttype):
    """Hypergeometric over-representation of non-zero MLP feature scores per group.

    For each of seven cell lines the pickle
    'predict4/results/mlpMaskCoefs<cell><condition><ttype>.pkl' is loaded and
    converted to per-feature scores by ``garson``.  Feature names come from
    the header line of 'predict4/train/Gm12878.matrix' (first column dropped).
    Features are grouped by name: 'H' (starts with 'H', ends with '3'),
    'B' (starts with 'H', ends with '1') and 'C' (starts with 'E').  The
    number of non-zero scores per group is averaged over the cell lines
    (floored) and tested with ``hypergeom.sf``.

    Prints the condition/type and one line per group; returns the list of
    p-values in the order H, B, C.
    """
    cells = ['Gm12878', 'H1hesc', 'Helas3', 'Hepg2', 'Huvec', 'K562', 'Nhek']
    keys = ['H', 'B', 'C']
    folder = 'predict4/'
    # One independent counter array per group.  dict.fromkeys(keys, array)
    # would make all groups share (and overwrite) the same array.
    nonzeros = {k: np.zeros(len(cells)) for k in keys}
    f_len = dict.fromkeys(keys, None)
    nz_len = dict.fromkeys(keys, None)
    # The header does not depend on the cell line, so read it once.
    with open(folder + 'train/Gm12878.matrix', 'r') as f:
        header = f.readline().rstrip('\n').split('\t')[1:]
    for c, cell in enumerate(cells):
        path = folder + 'results/mlpMaskCoefs' + cell + condition + ttype + '.pkl'
        # NOTE(review): pickle.load can execute arbitrary code; only use on
        # trusted result files.
        with open(path, 'rb') as handle:
            mlp_scores = pickle.load(handle)
        hist_scores = dict(zip(header, garson(mlp_scores)))
        # Materialise keys/values so indexing also works with Python 3 dict views.
        features = list(hist_scores.keys())
        fs_scores = np.array(list(hist_scores.values()))
        # dtype=int keeps an empty group usable as an index array.
        ind = {
            'H': np.array([i for i, v in enumerate(features)
                           if v.startswith('H') and v.endswith('3')], dtype=int),
            'B': np.array([i for i, v in enumerate(features)
                           if v.startswith('H') and v.endswith('1')], dtype=int),
            'C': np.array([i for i, v in enumerate(features)
                           if v.startswith('E')], dtype=int),
        }
        for k in keys:
            nonzeros[k][c] = len(np.nonzero(fs_scores[ind[k]])[0])
    for k in keys:
        f_len[k] = len(fs_scores[ind[k]])
        nz_len[k] = math.floor(nonzeros[k].mean())
    t_nz = sum(nz_len[k] for k in keys)
    # Formatted single-argument print behaves the same on Python 2 and 3.
    print('{} {}'.format(condition, ttype))
    pvals = []
    for k in keys:
        h_score = hypergeom.sf(nz_len[k], len(features), t_nz, f_len[k])
        pvals.append(h_score)
        print('{} {}'.format(k, h_score))
    return pvals
Exemplo n.º 22
0
def plot_ratio_VMinOverV_vsN(rO,rV,rangeN,thre,lowV):
    """Find, for each N, the smallest tested draw count whose tail probability is <= thre.

    For every N in `rangeN` that is at least floor(lowV / rV):
    O = floor(N * rO) and V = floor(N * rV) are derived, the draw count
    starts at max(floor(0.001 * V), lowV) and grows in steps of 5 until
    ``hypergeom.sf(floor(draws / 2), N, O, draws)`` is no longer above `thre`.

    Returns (rN, pH): the N values kept and, for each, the last draw count
    tested divided by V.
    """
    step = 5
    ratios = []
    kept = []
    for total in rangeN:
        # Lowest N for which the computation applies given lowV
        smallest_total = int(math.floor(lowV/rV))
        if total < smallest_total:
            continue

        n_O = int(math.floor(total*rO))
        n_V = int(math.floor(total*rV))
        draws = max(math.floor(0.001*n_V), lowV)

        tail = 1
        last_tested = 0
        # Increase the draw count until the tail probability is no longer above thre.
        while tail > thre:
            majority = math.floor(draws/2)
            tail = hypergeom.sf(majority, total, n_O, draws)
            last_tested = draws
            draws = draws + step

        ratios.append(float(last_tested)/float(n_V))
        kept.append(total)

    return (kept, ratios)
def calcPValues(n, N1, params, P_actu):
    """Return one mid-p value per entry of `P_actu`.

    For entry i the result is P(X > P_actu[i]) + 0.5 * P(X == P_actu[i]) for
    a hypergeometric X with population size N1, params[i] successes and n draws.
    """
    mid_pvalues = []
    for i, observed in enumerate(P_actu):
        upper_tail = hypergeom.sf(observed, N1, params[i], n)
        point_mass = hypergeom.pmf(observed, N1, params[i], n)
        mid_pvalues.append(upper_tail + 0.5 * point_mass)
    return mid_pvalues
Exemplo n.º 24
0
def find_hypergeometric(genes, pred_no_training, background_size=10683):
    """Print the enrichment p-value of an overlap and return its fold enrichment.

    genes: gene collection; its length is the number of draws N.
    pred_no_training: predicted genes; its length is the success count n.
    background_size: population size M.  Defaults to 10683, the value that
        used to be hard-coded.

    The p-value is P(X >= overlap) for the distinct genes common to both
    inputs.  The fold enrichment is the overlap divided by the mode of the
    hypergeometric distribution (the most probable overlap); it is returned
    as a list with one entry per mode.  A mode of 0 yields inf or nan.
    """
    M = background_size
    N = len(genes)
    n = len(pred_no_training)
    x = len(set(genes) & set(pred_no_training))
    pval = hypergeom.sf(x - 1, M, n, N)

    # Probability of every possible overlap 0..n; the array index equals the overlap.
    prob = hypergeom(M, n, N).pmf(np.arange(0, n + 1))
    modes = np.where(prob == np.max(prob))[0]

    fold = (x / modes).tolist()
    print('Fold Enrichment', fold)
    print('hypergeometric p-value', pval)
    return fold
Exemplo n.º 25
0
def hypergeom_test(
    positive_samples: int,
    samples: int,
    positive_total: int,
    total: int
) -> float:
    """
    Thin wrapper around scipy's hypergeometric survival function.

    Parameters
    ----------
        positive_samples: int
            Number of successes in the sample set (positive marbles drawn)
        samples: int
            Total number of samples (number of drawn marbles)
        positive_total: int
            Number of positives in the reference set
            (number of positive marbles in the bag)
        total: int
            Total size of the reference set
            (number of marbles in the bag)

    Returns
    -------
    float
        The hypergeometric enrichment score: the probability of drawing
        `positive_samples` or more positives.

    """
    # sf(k) is P(X > k); subtracting one includes the observed count itself.
    # see https://blog.alexlenail.me/understanding-and-implementing-the-hypergeometric-test-in-python-a7db688a7458  # noqa: 501
    threshold = positive_samples - 1
    score = hypergeom.sf(threshold, total, positive_total, samples)
    return float(score)
Exemplo n.º 26
0
def calculate_escore(indices, N, X, L, hgp_thresh, tol):
    """Calculate the XL-mHG E-score, using scipy to calculate HG p-values.

    indices: 1-D uint16 array of positions, iterated in the given order;
        iteration stops at the first position >= L.
    N: population size.  X: minimum number of hits before a cutoff is
    considered.  L: exclusive upper bound on positions.  hgp_thresh: a cutoff
    only counts if its hypergeometric p-value is <= this threshold (within
    `tol`).  tol: tolerance passed to ``mhg.is_equal``.

    Returns the largest fold enrichment found over qualifying cutoffs, or NaN
    when no cutoff qualifies.
    """
    assert isinstance(indices, np.ndarray) and indices.ndim == 1 and \
        np.issubdtype(indices.dtype, np.uint16)
    assert isinstance(N, int)
    assert isinstance(X, int)
    assert isinstance(L, int)
    assert isinstance(hgp_thresh, float)
    assert isinstance(tol, float)

    K = indices.size
    k = 0
    escore = 0.0
    for i in indices:
        if i >= L:
            break
        # Work with a Python int: arithmetic on np.uint16 scalars (i + 1,
        # n * K) can silently wrap around at 65535 under NumPy's scalar
        # promotion rules.
        n = int(i) + 1
        k += 1
        if k >= X:
            # Fold enrichment: observed hits over expected hits among the top n.
            e = k / ((n*K)/float(N))
            if e > escore and not mhg.is_equal(e, escore, tol):
                hgp = hypergeom.sf(k - 1, N, K, n)
                if hgp <= hgp_thresh or mhg.is_equal(hgp, hgp_thresh, tol):
                    escore = e
    if escore == 0.0:
        escore = float('nan')
    return escore
Exemplo n.º 27
0
    def HyperGeometric(self, deg_in_special_go, allgene_has_go,
                       allgene_in_special_go, alldeg_has_go):
        '''
        Store the GO enrichment result for one term, computed from the GO
        annotation of the differentially expressed genes (DEGs) and of all
        background genes, using the hypergeometric distribution:

            hypergeom.sf(deg_in_special_go-1, allgene_has_go,
                         allgene_in_special_go, alldeg_has_go)

        GO example:
            extracellular region    61 in 715 DEGs, 195 in 4564 all GO genes
        The inputs for this example are, in order: 61 DEGs belong to the GO
        term, 4564 genes in total have a GO annotation, 195 of them belong to
        the GO term, and there are 715 DEGs.

                               DEG     non-DEG
            in the GO term     100     1000
            not in the term    500     10000

        i.e. 11600 genes in total, 600 of which are DEGs; 1100 genes belong
        to the GO term, 100 of which are DEGs.
            x=100, m=1100, n=10500, k=600
        Arguments are passed as (x, m+n, m, k):
            stats.hypergeom.sf(99, 11600, 1100, 600)

        Sets self.pValue (upper-tail p-value) and self.enrich (ratio of the
        term's share among DEGs to its share among all annotated genes).
        '''
        self.pValue = float(
            hypergeom.sf(deg_in_special_go - 1, allgene_has_go,
                         allgene_in_special_go, alldeg_has_go))

        share_in_degs = deg_in_special_go / alldeg_has_go
        share_in_background = allgene_in_special_go / allgene_has_go
        self.enrich = float(share_in_degs / share_in_background)
Exemplo n.º 28
0
def calc_hg_enrichment_pval(mat, a, arm_a, aneu_type_a, b, arm_b, aneu_type_b):
    """Upper-tail hypergeometric p-value for two events occurring together.

    Event A is "column ``'{a}{arm_a}'`` of ``mat`` equals ``aneu_type_a``",
    event B the same for ``b`` / ``arm_b`` / ``aneu_type_b``.  With
    ``n_overlap`` rows showing both events, the returned value is
    ``sf(n_overlap) + pmf(n_overlap)``, i.e. P(X >= n_overlap), for a
    population of ``mat.shape[0]`` rows, (rows with A) successes and
    (rows with B) draws.
    """
    has_a = mat.loc[:, "{}{}".format(a, arm_a)].values == aneu_type_a
    has_b = mat.loc[:, "{}{}".format(b, arm_b)].values == aneu_type_b

    # Row counts: both events, A without B, B without A.
    n_overlap = np.sum(np.logical_and(has_a, has_b))
    n_a = np.sum(np.logical_and(has_a, np.logical_not(has_b)))
    n_b = np.sum(np.logical_and(np.logical_not(has_a), has_b))

    population = mat.shape[0]
    total_a = n_overlap + n_a
    total_b = n_overlap + n_b

    # P(X > n_overlap) + P(X == n_overlap)
    strictly_more = hypergeom.sf(n_overlap, population, total_a, total_b)
    exactly = hypergeom.pmf(n_overlap, population, total_a, total_b)
    return strictly_more + exactly
Exemplo n.º 29
0
	def computeP(self,db,length,totWords):
		"""Compute the hypergeometric p-value of this word and store it.

		Also looks up the word's total count in ``db`` and stores it in
		``self.total``.  The word count database must be built by the
		'totalWordCounts' function.  The result, stored in ``self.p``, is
		``hypergeom.sf(self.freq - 1, totWords, self.total, length)``.
		"""
		from scipy.stats import hypergeom
		database_count = db[self.word]
		self.total = database_count
		observed_minus_one = self.freq - 1
		self.p = hypergeom.sf(observed_minus_one, totWords, database_count, length)
Exemplo n.º 30
0
def calculate_pvalues(nodes, query, background_attribute, M,
        min_category_size=3, max_category_size=500,
        max_category_depth=5, **kwargs):
    """ calculate pvalues for all categories in the graph

    :param nodes: iterable of node dictionaries from the ontology graph
        after the background was set
    :param query: set of identifiers for which the p value is calculated
    :param background_attribute: node attribute assoc. with the background set
    :param M: background size, total number of genes in the data
    :param min_category_size: categories with at most this many members
        get a NaN p-value (the comparison is ``n <= min_category_size``)
    :param max_category_size: categories larger than this number get NaN
    :param max_category_depth: categories whose ``depth`` exceeds this
        value (more specific ones) get NaN; a missing depth counts as 0
    :returns: list of three arrays: pvalues, x (hits per category) and
        n (category sizes); an empty list when ``nodes`` is empty
    """
    N = len(query)
    rows = []
    for node in nodes:
        members = node[background_attribute]
        size = len(members)
        overlap = len(query.intersection(members))
        too_deep = node.get('depth', 0) > max_category_depth
        size_ok = min_category_size < size <= max_category_size
        if too_deep or not size_ok:
            pvalue = float('NaN')
        else:
            # sf(overlap - 1) = P(X >= overlap)
            pvalue = hypergeom.sf(overlap - 1, M, size, N)
        rows.append((pvalue, overlap, size))
    return [np.array(column) for column in zip(*rows)]
Exemplo n.º 31
0
def get_pval(setA, setB, population):
    """Return P(X >= overlap) for the overlap between ``setA`` and ``setB``.

    The overlap is the number of elements of ``setA`` that ``np.isin``
    finds in ``setB``.  The hypergeometric model uses ``len(population)``
    as population size, ``len(setB)`` as the number of successes in the
    population and ``len(setA)`` as the sample size.
    """
    overlap = np.isin(setA, setB).sum()
    population_size = len(population)
    successes_in_population = len(setB)
    sample_size = len(setA)
    return hypergeom.sf(overlap - 1, population_size,
                        successes_in_population, sample_size)
Exemplo n.º 32
0
def calculate_pvalues(nodes,
                      query,
                      background_attribute,
                      M,
                      min_category_size=3,
                      max_category_size=500,
                      max_category_depth=5,
                      **kwargs):
    """ calculate pvalues for all categories in the graph

    :param nodes: iterable of node dictionaries of the ontology graph
        after the background was set
    :param query: set of identifiers
    :param background_attribute: node attribute assoc. with the background set
    :param M: population size passed to the hypergeometric test
    :param min_category_size: categories with at most this many members
        get a NaN p-value (the comparison is ``n <= min_category_size``)
    :param max_category_size: categories larger than this number get NaN
    :param max_category_depth: categories whose ``depth`` exceeds this
        value get NaN; a missing depth counts as 0
    :returns: pvalues, x, n (as produced by ``zip``)
    """
    N = len(query)
    vals = []
    for node in nodes:
        background = node[background_attribute]
        n = len(background)
        hits = query.intersection(background)
        x = len(hits)
        if ((node.get('depth', 0) > max_category_depth)
                or (n <= min_category_size) or (n > max_category_size)):
            vals.append((float('NaN'), x, n))
        else:
            # The p-value is P(X >= x).  scipy's sf(k) is P(X > k), so the
            # observed count has to be shifted by one; sf(x) would leave
            # the observed outcome out of the tail.
            vals.append((hypergeom.sf(x - 1, M, n, N), x, n))
    return zip(*vals)
Exemplo n.º 33
0
def _perform_enrichment_bonferroni(clusters,
                                   annotations,
                                   n_cores=1
                                   ):  #TODO implement parallel execution
    N = len(np.unique(annotations.index))
    bonferroni_correction = len(list(np.unique(
        annotations['annotations']))) * len(clusters)
    p_values = []
    genes_to_count = []
    enriched_clusters = 0
    for i, c in enumerate(clusters):
        enriched = False
        genes_at_least_one = list(
            set(annotations.index).intersection(set(list(c.ravel()))))
        n = len(genes_at_least_one)
        ann_c = annotations.loc[genes_at_least_one]
        for a in np.unique(ann_c['annotations']):
            genes_annotated = annotations[annotations['annotations'] == a]
            K = genes_annotated.shape[0]
            k = ann_c[ann_c['annotations'] == a].shape[0]
            pval = hypergeom.sf(k - 1, N, K, n)
            if pval < (0.05 / bonferroni_correction):
                genes_to_count += \
                        list(np.unique(ann_c[ann_c['annotations']==a].index))
                p_values.append((i, pval, a))
                enriched = True
        if enriched:
            enriched_clusters += 1
    return p_values, len(
        set(genes_to_count)) / N, enriched_clusters / len(clusters)
Exemplo n.º 34
0
def plot_ratio_VoverN(rO,rN,thre,lowV):
    """Find, for each population size, the sample ratio V/N meeting ``thre``.

    For every ``Ni`` in ``rN`` (values below ``lowV`` are reported and
    skipped) the sample size V starts at ``max(0.001*Ni, lowV)`` and grows
    in steps of 5 until
    ``hypergeom.sf(floor(V/2) + 1, Ni, Ni*rO, V)`` is no longer greater
    than ``thre``.  The last V tried is recorded as ``V / Ni``; a result
    below ``lowV`` is reported as failed and skipped.

    Returns ``(rangeN, pH)``: the retained ``Ni`` values and their ratios.
    """
    step = 5  # increment applied to V between two probability evaluations
    ratios = []
    kept_sizes = []
    for Ni in rN:
        start_V = max(0.001*Ni, lowV)
        if Ni < lowV:
            print("check your param")
            continue
        O = Ni*rO  # number of "success" items in the population

        tail = 1
        V_thre = 0
        Vi = start_V
        while tail > thre:
            majority = math.floor(Vi/2) + 1
            tail = hypergeom.sf(majority, Ni, O, Vi)
            V_thre = Vi
            Vi = Vi + step

        if V_thre < lowV:
            print("failed for (N,V,O) = (",Ni,", ",V_thre,", ",rO*100,") --> ",float(V_thre)/float(Ni))
            continue
        ratios.append(float(V_thre)/float(Ni))
        kept_sizes.append(Ni)

    return (kept_sizes,ratios)
Exemplo n.º 35
0
def get_xlmhg_stat_slow(v, X, L, tol=1e-12):
    """Calculate the XL-mHG test statistic (inefficient).

    ``v`` is a ``np.uint8`` array whose non-zero entries are the hits.
    Over the first ``L`` positions, every cutoff at which at least ``X``
    hits have been seen is scored with ``hypergeom.sf(k - 1, N, K, n)``
    and the smallest score (beyond ``tol``, per ``mhg.is_equal``) wins.

    Returns ``(stat, n_star)``: the statistic, capped at 1.0, and the
    1-based cutoff achieving it (0 if no cutoff qualified or ``v`` has no
    hits).  Raises ``ValueError`` if ``X`` or ``L`` lies outside 1..N.
    """
    # type checking
    assert isinstance(v, np.ndarray)
    assert v.dtype == np.uint8
    assert isinstance(X, int)
    assert isinstance(L, int)

    # range checking, X first and then L
    N = v.size
    for label, value in (('X', X), ('L', L)):
        if not (1 <= value <= N):
            raise ValueError('Invalid value %s=%d; should be >= 1 and <= N.'
                             % (label, value))

    K = int(np.sum(v != 0))
    if K == 0:
        # special case: no hits at all
        return 1.0, 0

    hits = 0
    best = 1.1
    best_cutoff = 0
    for cutoff, entry in enumerate(v[:L], start=1):
        if entry != 0:
            hits += 1
        if hits < X:
            continue
        hgp = hypergeom.sf(hits - 1, N, K, cutoff)
        if hgp < best and not mhg.is_equal(hgp, best, tol):
            best = hgp
            best_cutoff = cutoff

    return min(best, 1.0), best_cutoff
Exemplo n.º 36
0
    def test_custom_domain_iterable(self, gene_ontology):
        """Check ``enrichment`` when ``domain`` is given as a list.

        The expected table is rebuilt here term by term (domain filter,
        size filters, hypergeometric survival function, Bonferroni
        correction, sort by p-value) and compared with the frame returned
        by ``gene_ontology.enrichment``.
        """
        features_of_interest = gene_ontology.all_genes[:10]

        domain = ['cellular_component', 'molecular_function']

        test_enrichment_df = gene_ontology.enrichment(features_of_interest,
                                                      domain=domain)

        domains = frozenset(domain)
        p_value_cutoff = 1000000
        min_feature_size = 3
        min_background_size = 5
        cross_reference = {}
        background = gene_ontology.all_genes
        n_all_genes = len(background)
        n_features_of_interest = len(features_of_interest)
        enrichment = defaultdict(dict)

        for go_term, go_genes in gene_ontology.ontology.items():
            if go_genes['domain'] not in domains:
                continue

            features_in_go = go_genes['genes'].intersection(
                features_of_interest)
            background_in_go = go_genes['genes'].intersection(background)
            too_few_features = len(features_in_go) < min_feature_size
            too_few_background = len(background_in_go) < min_background_size
            if too_few_features or too_few_background:
                continue

            # Survival function is more accurate on small p-values
            p_value = hypergeom.sf(len(features_in_go), n_all_genes,
                                   len(background_in_go),
                                   n_features_of_interest)
            p_value = 0 if p_value < 0 else p_value
            symbols = [cross_reference[f] if f in cross_reference else f for f
                       in features_in_go]
            enrichment['p_value'][go_term] = p_value
            enrichment['n_features_of_interest_in_go_term'][go_term] = len(
                features_in_go)
            enrichment['n_background_in_go_term'][go_term] = len(
                background_in_go)
            enrichment['n_features_total_in_go_term'][go_term] = len(
                go_genes['genes'])
            enrichment['features_of_interest_in_go_term'][
                go_term] = ','.join(features_in_go)
            enrichment['features_of_interest_in_go_term_gene_symbols'][
                go_term] = ','.join(symbols)
            enrichment['go_domain'][go_term] = go_genes['domain']
            enrichment['go_name'][go_term] = go_genes['name']
        enrichment_df = pd.DataFrame(enrichment)

        # Bonferonni correction
        enrichment_df['bonferonni_corrected_p_value'] = \
            enrichment_df.p_value * enrichment_df.shape[0]
        ind = enrichment_df['bonferonni_corrected_p_value'] < p_value_cutoff
        # ``.ix`` and ``DataFrame.sort`` no longer exist in pandas; ``.loc``
        # with a boolean mask and ``sort_values`` select and order the
        # same rows.
        enrichment_df = enrichment_df.loc[ind]
        true_enrichment_df = enrichment_df.sort_values(by=['p_value'])

        pdt.assert_frame_equal(test_enrichment_df, true_enrichment_df)
Exemplo n.º 37
0
def GO_enrichment(geneList, ontology, expressedGenes = None, printIt=False, pCut = 1000000, xRef = None):
    """Hypergeometric GO-term enrichment with Bonferroni correction.

    Parameters:
    geneList: iterable of gene IDs to test.
    ontology: mapping GO term ID -> dict with a ``'genes'`` set and a
        ``'name'`` entry (a string or an iterable of strings).
    expressedGenes: iterable of background gene IDs.  When None, the union
        of all genes annotated in ``ontology`` is used as background.
    printIt: if true, print every reported row, tab-separated.
    pCut: rows whose corrected p-value exceeds this value are dropped.
    xRef: optional mapping gene ID -> gene symbol; IDs without an entry
        are reported unchanged.

    A term is tested only if more than 3 of its genes are in ``geneList``
    and at least 5 of its genes are in the background.  The raw p-value is
    multiplied by the number of tested terms (Bonferroni); the product is
    not clipped to 1.  Terms whose ID is not a ``str`` are not reported.

    Returns a DataFrame indexed by 'GO Term ID', sorted by corrected
    p-value, with the columns listed in ``columns`` below.
    """
    columns = ['GO Term ID', 'GO Term Description',
               'Bonferroni-corrected Hypergeometric p-Value',
               'N Genes in List and GO Category',
               'N Expressed Genes in GO Category', 'N Genes in GO category',
               'Ensembl Gene IDs in List', 'Gene symbols in List']
    if xRef is None:
        xRef = {}
    geneList = list(geneList)
    if expressedGenes is None:
        # No background supplied: fall back to every gene the ontology knows.
        background = set()
        for GOGenes in ontology.values():
            background.update(GOGenes['genes'])
        expressedGenes = background
    expressedGenes = list(expressedGenes)

    lenAllGenes, lenTheseGenes = len(expressedGenes), len(geneList)

    tested = {}
    for GOTerm, GOGenes in ontology.items():
        inBoth = GOGenes['genes'].intersection(geneList)
        expressedGOGenes = GOGenes['genes'].intersection(expressedGenes)
        if len(inBoth) <= 3 or len(expressedGOGenes) < 5:
            continue

        # NOTE(review): sf(k) is P(X > k), which leaves the observed count
        # out of the tail; use len(inBoth) - 1 if P(X >= k) is intended.
        pVal = hypergeom.sf(len(inBoth), lenAllGenes, len(expressedGOGenes), lenTheseGenes)
        if pVal < 0:
            pVal = 0
        # fixed order so the ID and symbol columns line up and are stable
        ordered = sorted(inBoth, key=str)
        symbols = [xRef[ensg] if ensg in xRef else ensg for ensg in ordered]
        names = GOGenes.get('name', ())
        if isinstance(names, str):
            description = names
        else:
            description = "|".join(str(name) for name in names)
        tested[GOTerm] = (pVal, len(inBoth), len(expressedGOGenes),
                          len(GOGenes['genes']), ordered, symbols,
                          description)

    # Bonferroni: one comparison per term that was actually tested.
    nCmps = len(tested)

    y = []
    for k, v in sorted(tested.items(), key=lambda item: (item[1][0], str(item[0]))):
        if not isinstance(k, str):
            continue
        corrected = v[0] * nCmps
        if corrected > pCut:
            continue
        row = [k, v[6], corrected, v[1], v[2], v[3],
               ",".join(str(g) for g in v[4]),
               ",".join(str(s) for s in v[5])]
        if printIt:
            print("\t".join(str(field) for field in row))
        y.append(row)

    df = pd.DataFrame(y, columns=columns)
    df.set_index('GO Term ID', inplace=True)
    return df
Exemplo n.º 38
0
	def compute_p( i, M, N ):
		"""Return ``hypergeom.sf(i.counts, M, i.all_counts, N)`` for one entry.

		``i.counts`` is the number of black balls in the draw,
		``i.all_counts`` the total number of black balls, ``M`` the
		population size and ``N`` the number drawn.
		"""
		# NOTE(review): sf(k) is P(X > k); confirm that leaving the observed
		# count out of the tail is intended.
		drawn_black = i.counts
		total_black = i.all_counts
		return hypergeom.sf(drawn_black, M, total_black, N)
Exemplo n.º 39
0
def my_static_result(my_v, my_static_genes, my_gene_set):
    """Build a ``StaticGSEResult`` for a fixed gene selection.

    The p-value is P(X >= k), where k is the number of selected genes that
    belong to ``my_gene_set``, with ``my_v.size`` as population size,
    ``my_gene_set.size`` successes and ``len(my_static_genes)`` draws.
    """
    population = my_v.size
    drawn = len(my_static_genes)
    set_size = my_gene_set.size
    overlap = sorted(my_static_genes & my_gene_set.genes)
    pval = hypergeom.sf(len(overlap) - 1, population, set_size, drawn)
    return StaticGSEResult(my_gene_set, population, drawn, overlap, pval)
Exemplo n.º 40
0
	def computeP(self,db,length,totwords):
		"""Compute the hypergeometric p-value of this word and store it.

		Also finds and stores the word's total count from the database; the
		word count database must be built by the 'totalWordCounts' function.

		db - dictionary of the words and how often they occur in the entire database
		length - number of words in all of the genes in the query set
		totwords - total words in the entire database

		Sets ``self.total``, ``self.totwords``, ``self.length`` and ``self.p``.
		"""
		self.total, self.totwords, self.length = db[self.word], totwords, length
		observed_minus_one = self.freq - 1
		self.p = hypergeom.sf(observed_minus_one, self.totwords, self.total, self.length)
Exemplo n.º 41
0
def getPvalHypergeom(go,tot1s,totGenes):
    """Return the hypergeometric survival probability for GO term ``go``.

    Reads the module-level mappings ``GOs`` (term -> genes) and ``scores``
    (gene -> score).  Genes of the term that have no score are ignored;
    for the others the scores are summed into ``hit`` and counted in
    ``tot``.  The result is ``hypergeom.sf(int(hit), totGenes, tot1s, tot)``.

    The original computed this value but never returned it.
    """
    hit = 0
    tot = 0
    for gene in GOs[go]:
        try:
            score = scores[gene]
        except KeyError:
            # unscored gene: counts neither as a draw nor as a hit
            continue
        hit += score
        tot += 1

    # NOTE(review): sf(k) is P(X > k); use int(hit) - 1 if the observed
    # count should be part of the tail.
    return hypergeom.sf(int(hit), totGenes, tot1s, tot, loc=0)
Exemplo n.º 42
0
def compute_p(i, M, N):
    """
    Return the hypergeometric survival probability for one data-frame row
    that provides the columns "counts" and "all_counts".

    parameters:
    i: data entry; ``i.counts`` is the number of black balls in the draw,
       ``i.all_counts`` the total number of black balls
    M: total number of entries of this data type
    N: size of the sub set to compute the p value on (number drawn)
    """
    # NOTE(review): sf(k) is P(X > k); confirm that leaving the observed
    # count out of the tail is intended.
    drawn_black, total_black = i.counts, i.all_counts
    return hypergeom.sf(drawn_black, M, total_black, N)
Exemplo n.º 43
0
def test_set_hypergeom(selected_genes, all_genes, set_genes):
    # Reduce the gene_set to the universe of all_genes,
    # as we can only sample from this set.
    all_genes = set(all_genes)
    set_genes = set(set_genes).intersection(all_genes)
    selected_genes = set(selected_genes).intersection(all_genes)

    # Calculate overlap of the gene_set with selected genes.
    selected_set_genes = set_genes.intersection(selected_genes)

    # Calculate p-value using the hyper-geometric test.
    p_val = hypergeom.sf(M=len(all_genes), n=len(set_genes), N=len(selected_genes), k=len(selected_set_genes), loc=1)

    return p_val
Exemplo n.º 44
0
    def test_no_enrichment(self, gene_ontology):
        """``enrichment`` returns None when nothing passes the filters.

        Only two features are queried, fewer than the minimum feature size
        of 3 used below, so the table rebuilt here must stay empty.
        """
        query = gene_ontology.all_genes[:2]
        observed = gene_ontology.enrichment(query)

        allowed_domains = gene_ontology.domains
        min_feature_size = 3
        min_background_size = 5
        cross_reference = {}
        universe = gene_ontology.all_genes
        n_universe = len(universe)
        n_query = len(query)
        expected = defaultdict(dict)

        for go_term, record in gene_ontology.ontology.items():
            if record['domain'] not in allowed_domains:
                continue

            term_genes = record['genes']
            hits = term_genes.intersection(query)
            in_universe = term_genes.intersection(universe)
            if (len(hits) < min_feature_size
                    or len(in_universe) < min_background_size):
                continue

            # Survival function is more accurate on small p-values
            p_value = hypergeom.sf(len(hits), n_universe, len(in_universe),
                                   n_query)
            if p_value < 0:
                p_value = 0
            symbols = [cross_reference.get(f, f) for f in hits]

            expected['p_value'][go_term] = p_value
            expected['n_features_of_interest_in_go_term'][go_term] = \
                len(hits)
            expected['n_background_in_go_term'][go_term] = len(in_universe)
            expected['n_features_total_in_go_term'][go_term] = \
                len(term_genes)
            expected['features_of_interest_in_go_term'][go_term] = \
                ','.join(hits)
            expected['features_of_interest_in_go_term_gene_symbols'][
                go_term] = ','.join(symbols)
            expected['go_domain'][go_term] = record['domain']
            expected['go_name'][go_term] = record['name']
        expected_df = pd.DataFrame(expected)

        assert expected_df.empty
        assert observed is None
Exemplo n.º 45
0
def test(module, annots_dict, inverse_annots_dict, mode='standard'):
    """Run the hypergeometric test on every function found in a gene module.

    For each function annotated on at least one gene of the module, the
    p-value is P(X >= observed), with ``len(annots_dict)`` as population,
    the number of genes carrying the function as successes and
    ``len(module)`` draws.  When ``mode`` starts with 'c' the p-value is
    divided by P(X >= 1).

    Parameters
    ----------
    module : [string]
        The list of genes in a module.
    annots_dict : {string: [string]} dictionary
        A mapping of genes to functions
    inverse_annots_dict : {string: [string]} dictionary
        A mapping of functions to genes
    mode : {'standard', 'conditional'}, optional
        Whether to use the standard hypergeometric test or the conditional
        one (default: standard).

    Returns
    -------
    d : {string: float} dictionary
        A mapping of functions to p-values.
    """
    candidates = unions([annots_dict[gene] for gene in module])
    population = len(annots_dict)
    draws = len(module)
    pvalues = {}
    for function in candidates:
        labeled_total = len(inverse_annots_dict[function])
        labeled_in_module = sum(
            function in annots_dict[gene] for gene in module)
        pval = hypergeom.sf(labeled_in_module - 1, population,
                            labeled_total, draws)
        if mode.startswith('c'):
            # condition on seeing the function at least once
            pval /= hypergeom.sf(0, population, labeled_total, draws)
        pvalues[function] = pval
    return pvalues
Exemplo n.º 46
0
def enrich(inputgenes,backgroundgenes,dbfilename,verbose=False,returnn=20):
    """
    enrich
    Perform hypergeometric testing of a set of genes drawn from a
    background against the gene sets in a gmt file. The "P values" are the
    hypergeometric probability of observing at least as many genes from a
    pathway as were found in the input set, i.e. 1 - CDF(n - 1) where n is
    the number of genes observed.

    Arguments:
    inputgenes: a numpy.array of gene symbols representing the set to be analyzed
    backgroundgenes: a numpy.array of gene symbols representing the background from which the set was drawn
    dbfilename: the filename of a gmt file (tab-separated: name, link, genes...) containing
    the sets to be enriched against
    verbose=False: if true, print each tested set to standard out
    returnn=20: return at most this many sets

    Only sets sharing more than one gene with the input are tested.

    Returns:
    A list of lists where each inner list contains the name, link and
    p value of a pathway. Entries are sorted by p value in ascending
    order. Example
    [["name","http://link",.0001]
    ]
    """
    genes = np.unique(inputgenes)
    background = np.unique(backgroundgenes)
    ntrys = len(genes)
    total = len(background)
    names = []
    links = []
    probs = []
    # ``with`` closes the file even if a line fails to parse
    with open(dbfilename) as gmtDB:
        for line in gmtDB:
            vs = line.rstrip().split("\t")
            # De-duplicate the set: assume_unique=True below is only valid
            # when both inputs are duplicate-free, and repeated genes would
            # inflate ``npresent``.
            setgenes = np.unique(np.array(vs[2:], dtype=str))
            nfound = np.sum(np.isin(genes, setgenes, assume_unique=True))
            if nfound > 1:
                npresent = np.sum(np.isin(setgenes, background, assume_unique=True))
                prob = hypergeom.sf(nfound - 1, total, npresent, ntrys)
                names.append(vs[0])
                links.append(vs[1])
                probs.append(prob)
                if verbose:
                    print("\t".join([vs[0], vs[1], str(prob)]))
    sortedarray = []
    for i in np.argsort(np.array(probs))[0:returnn]:
        sortedarray.append([names[i], links[i], probs[i]])
    return sortedarray
Exemplo n.º 47
0
 def enrichment(self,gene_list,pval_cutoff=0.05,gene_filter=None,label=None,max_term_size=300):
     """Hypergeometric enrichment of ontology terms for ``gene_list``.

     Every term attached to at least one gene of ``gene_list`` (objects
     with an ``id`` attribute) is tested with
     ``hypergeom.sf(overlap - 1, total genes, genes in term, len(gene_list))``.
     Terms without a row in the ``terms`` table are logged and skipped, as
     are terms with more than ``max_term_size`` genes.  If ``gene_filter``
     is given, only term genes contained in it are counted.

     Returns a DataFrame indexed by TermID, sorted by p-value and limited
     to rows with ``pval <= pval_cutoff``; a 'Label' column is added when
     ``label`` is given.  Returns an empty DataFrame when no term was
     tested.
     """
     if label:
         self.log("Caculating Enrichemnt for {}",label)
     cur = self.db.cursor()
     gene_ids = [x.id for x in gene_list]
     # extract possible terms for genes
     # NOTE(review): the IN (...) list is built by string formatting; gene
     # ids containing a quote would break the statement.
     terms = [ x[0] for x in cur.execute(
         '''SELECT DISTINCT(term) FROM gene_terms 
         WHERE gene IN ('{}');'''.format("','".join(gene_ids))
     )]
     # the population size does not depend on the term: query it once
     num_genes_total, = cur.execute('SELECT COUNT(DISTINCT(gene)) FROM gene_terms;').fetchone()
     query_ids = set(gene_ids)
     # compute hypergeometric for each term
     enrichment = []
     for term in terms:
         info = cur.execute("SELECT * FROM terms WHERE id = ?",(term,)).fetchone()
         if info is None:
             # Skip the term: without this the loop went on with unbound
             # (or stale, from the previous term) name/type/desc values.
             self.log("No information for ontology term {}",term)
             continue
         (term_id,name,term_type,desc) = info
         genes_in_term = [x[0] for x in cur.execute(
             '''SELECT gene FROM gene_terms WHERE term = ?''',(term_id,))
         ]
         if len(genes_in_term) > max_term_size:
             self.log("Skipping {} due to size ({})",name,len(genes_in_term))
             continue
         if gene_filter:
             genes_in_term = [gene for gene in genes_in_term if gene in gene_filter]
         num_genes_in_term = len(genes_in_term)
         overlap = set(genes_in_term).intersection(query_ids)
         pval = hypergeom.sf(len(overlap)-1,num_genes_total,num_genes_in_term,len(gene_list))
         term_genes = ",".join(sorted(overlap))
         enrichment.append(
             (term_id,name,pval,num_genes_in_term,len(overlap),len(gene_list),num_genes_total,term_type,term_genes,desc)
         )
     if not enrichment:
         self.log("No enrichment for {}",",".join(gene_ids))
         return DataFrame()
     # DataFrame.sort was removed from pandas; sort_values is its successor
     enrichment = DataFrame(enrichment,
         columns = ['TermID','Name','pval','LenTerm','LenOverlap','LenList','LenTotal','Type','TermGenes','Desc']
     ).sort_values('pval',ascending=True)
     enrichment.index = enrichment.TermID
     if label:
         enrichment['Label'] = label
     return enrichment[enrichment.pval <= pval_cutoff]
Exemplo n.º 48
0
def calculate_pvalues(G, query, min_hit_size=2, min_category_size=3,
        max_category_size=500, max_category_depth=5, **kwargs):
    """ calculate pvalues for all categories in the graph

    Every node first loses its query-related attributes and gets ``n``
    (size of its ``background`` set).  Nodes that pass the filters also
    get ``query``, ``N``, ``hits``, ``x`` and ``p``.  Each tested node
    must provide the population size under the key ``M``.

    :param G: ontology graph after background was set
    :param query: array_like of identifiers
    :param min_hit_size: minimum intersection size of query and category
    :param min_category_size: categories smaller than this number are ignored
    :param max_category_size: categories larger than this number are ignored
    :param max_category_depth: categories whose ``depth`` exceeds this
        value are ignored; a missing depth counts as -1
    :returns: dictionary of term : pvalue
    """
    query_set = set(query)
    pvalues = {}
    N = len(query_set)
    for i in G:
        node = G.node[i]
        # reset all query related attributes
        for attr in ['query', 'n', 'N', 'hits', 'x', 'p', 'q', 'significant']:
            node.pop(attr, None)
        background = node.get('background', set([]))
        n = len(background)
        node['n'] = n
        hits = query_set.intersection(background)
        x = len(hits)
        depth = node.get('depth', -1) # depth might not be set due to malformed ontology
        if ((depth > max_category_depth)
            or (n < min_category_size)
            or (n > max_category_size)
            or (x < min_hit_size)):
            continue

        node['query'] = query_set
        node['N'] = N
        node['hits'] = hits
        node['x'] = x
        M = node['M']
        # The p-value is P(X >= x).  scipy's sf(k) is P(X > k), so the
        # observed count has to be shifted by one; sf(x) would leave the
        # observed outcome out of the tail.
        p = hypergeom.sf(x - 1, M, n, N)

        node['p'] = p
        pvalues[i] = p
    return pvalues
def hypergeometric_test(x, M, n, N):
    """
    The hypergeometric distribution models drawing objects from a bin.
    - M is total number of objects
    - n is total number of Type I objects.
    - x (random variate) represents the number of Type I objects in N drawn without replacement from the total population

    Returns the pair ``(pv_le, pv_gt)``:
    - pv_le = P(X <= x), the lower tail including the observed count
    - pv_gt = P(X >= x), the upper tail including the observed count

    - http://en.wikipedia.org/wiki/Hypergeometric_distribution
    - https://www.biostars.org/p/66729/
    - http://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.stats.hypergeom.html
    - http://docs.scipy.org/doc/numpy/reference/generated/numpy.random.hypergeometric.html
    - http://stackoverflow.com/questions/6594840/what-are-equivalents-to-rs-phyper-function-in-python
    """

    assert n <= M
    assert x <= n
    assert N <= M
    # cdf(x) already includes x; the former cdf(x + 1) also counted the
    # outcome x + 1, which belongs to neither tail of the observed value.
    pv_le = hypergeom.cdf(x, M, n, N)
    # sf(x - 1) = 1 - cdf(x - 1) = P(X >= x); sf is sometimes more accurate
    pv_gt = hypergeom.sf(x - 1, M, n, N)
    return pv_le, pv_gt
Exemplo n.º 50
0
def test_cross():
    """Compares p-values calculated using PVAL1 and PVAL2.

    For N=50 and K=10, every (X, L) pair is visited; for each distinct
    candidate statistic the two cython implementations must agree within
    ``tol``.
    """
    N = 50
    K = 10
    tol = 1e-8

    W = N-K
    shape = (K+1, W+1)
    table = np.empty(shape, dtype=np.longdouble)

    # hypergeometric p-value of every (k hits, w misses) configuration;
    # cells that are not visited keep the value 1
    configs = np.ones(shape, dtype=np.float64)
    for w in range(W):
        for k in range(1, K+1):
            configs[k, w] = hypergeom.sf(k-1, N, K, k+w)

    tests = 0
    for X in range(1, N+1):
        for L in reversed(range(1, N+1)):
            # collect all test statistics possible for this (X, L)
            S = np.ones(shape, dtype=np.float64)
            for n in range(L+1):
                k = min(K, n)
                w = n-k
                while k >= X and w <= W and n <= L:
                    S[k, w] = configs[k, w]
                    k -= 1
                    w += 1

            # np.unique returns sorted values; reverse for descending order
            all_stat = np.unique(S.ravel())[::-1]

            for stat in all_stat:
                pval1 = mhg_cython.get_xlmhg_pval1(N, K, X, L, stat, table)
                pval2 = mhg_cython.get_xlmhg_pval2(N, K, X, L, stat, table)
                tests += 1
                assert mhg.is_equal(pval1, pval2, tol=tol)

    print('Calculated %d bounds, based on %d configurations.'
          %(tests, configs.size))
Exemplo n.º 51
0
    def copies_in_opening_hand(self, deck, hand_size=7):
        """Build a quiz question about drawing a card in the opening hand.

        A card is picked at random from ``deck.decklist``; the correct
        answer is the probability, in percent with two decimals, that at
        least one of its copies is among ``hand_size`` cards drawn from
        the whole deck.

        Returns ``(question, correct_string, possible, answer_suffix,
        chosen_card)`` where ``possible`` is the shuffled list made of the
        four answers from ``self.gen_wrong`` plus the correct one.
        """
        question_string = "How likely is it that at least one copy of {card} will be in your opening hand?"
        answer_suffix = 'percent'
        chosen_card = random.choice(deck.decklist)
        copies = chosen_card.count
        deck_size = sum(c.count for c in deck.decklist)

        # hypergeom.sf(k, ...) is P(X > k).  "At least one copy" is
        # P(X >= 1) = P(X > 0), so the first argument must be 0; the former
        # value 1 gave the chance of drawing at least two copies.
        # http://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.hypergeom.html
        opening_hand_chance = hypergeom.sf(0, deck_size, copies, hand_size)

        opening_hand_chance = opening_hand_chance * 100
        correct_string = "{:.2f}".format(opening_hand_chance)

        wrongs = self.gen_wrong(opening_hand_chance, 'percent', 4)
        possible = wrongs + [correct_string]
        random.shuffle(possible)

        print("Chance of a copy of {} in opening hand: {}".format(chosen_card.name, correct_string))
        return question_string.format(card=chosen_card.name), correct_string, possible, answer_suffix, chosen_card
Exemplo n.º 52
0
		def post(self):
			"""Handle a POST request with gene lists to test for pathway enrichment.

			The JSON body must contain ``"lists"`` (name -> list of genes) and
			may contain a non-empty ``"background"``; otherwise the module
			level ``defaultbackground`` is used.  For every list, each entry
			of the module-level ``pathways`` sharing more than one gene with
			it is tested with ``hypergeom.sf(nfound-1, total, npresent, ntrys)``.
			The reply is ``{"results": {list name: [{"name", "link", "p"}, ...]}}``
			with entries sorted by ascending p.  A body that is not valid JSON
			is answered with status 400.
			"""
			background = defaultbackground
			try:
				args = tornado.escape.json_decode(self.request.body) 
			except ValueError:
				self.clear()
				self.set_status(400)
				self.finish("<html><head><title>400 Bad Request</title></head><body>Malformed JSON object in POST body.</body></html>")
				# The response is finished: stop here.  Falling through would
				# hit the unbound name ``args`` just below.
				return
			if "background" in args  and len(args["background"])!=0:
				background = np.unique(args["background"])

			total= len(background)

			replydata = {}
			for setname in args["lists"]:
				genes = np.unique(args["lists"][setname])

				ntrys = len(genes)
	
				names =[]
				links =[]
				probs =[]
				for pathway in pathways:
					setgenes=pathway["genes"]
					nfound = np.sum(np.in1d(genes,setgenes,assume_unique=True))
					if nfound > 1:
						npresent = np.sum(np.in1d(setgenes,background,assume_unique=True))
						# P(X >= nfound)
						prob = hypergeom.sf(nfound-1,total,npresent,ntrys)
						names.append(pathway["name"])
						links.append(pathway["link"])
						probs.append(prob)
				sortedarray = []
				for i in  np.argsort(np.array(probs)):
					sortedarray.append({"name":names[i],"link":links[i],"p":probs[i]})
				replydata[setname]=sortedarray

			json.dump({'results': replydata}, self)
Exemplo n.º 53
0
    def fit(self, df_X, df_y):
        """Compute per-cluster motif enrichment and store it in ``self.act_``.

        ``df_X`` holds integer motif counts (rows: regions, columns:
        motifs) and ``df_y`` a single column of cluster labels for the same
        regions.  For each cluster and motif, the hypergeometric
        probability of seeing at least as many regions with the motif
        inside the cluster is computed, corrected over all tests with
        Benjamini-Hochberg, and stored as ``-log10`` in a DataFrame with
        motifs as rows and clusters as columns.

        Raises ``ValueError`` if the numbers of regions differ, if ``df_y``
        does not have exactly one column, or if ``df_X`` is not all-integer.
        """
        if not df_y.shape[0] == df_X.shape[0]:
            raise ValueError("number of regions is not equal")
        if df_y.shape[1] != 1:
            raise ValueError("y needs to have 1 label column")

        if set(df_X.dtypes) != set([np.dtype(int)]):
            raise ValueError("need motif counts, not scores")

        # calculate hypergeometric p-values
        pvals = []
        clusters = df_y[df_y.columns[0]].unique()
        M = df_X.shape[0]
        for cluster in clusters:
            pos = df_X[df_y.iloc[:,0] == cluster]
            neg = df_X[df_y.iloc[:,0] != cluster]

            pos_true = (pos > 0).sum(0)
            pos_false = (pos == 0).sum(0)
            neg_true = (neg > 0).sum(0)

            p = []
            for pt, pf, nt in zip(pos_true, pos_false, neg_true):
                n = pt + nt  # regions with the motif, over all clusters
                N = pt + pf  # regions in this cluster
                x = pt - 1   # sf(pt - 1) = P(X >= pt)
                p.append(hypergeom.sf(x, M, n, N))

            pvals.append(p)

        # correct for multiple testing
        pvals = np.array(pvals)
        fdr = multipletests(pvals.flatten(),
                method="fdr_bh")[1].reshape(pvals.shape)

        # create output DataFrame from the corrected values; the corrected
        # values used to be computed and then dropped in favour of the raw
        # p-values
        self.act_ = pd.DataFrame(-np.log10(fdr.T),
                columns=clusters, index=df_X.columns)
Exemplo n.º 54
0
def hypergeometric_p_value(n_unique_nodes, intsec_card, sources, targets,
                           population_size=220*220):
    """Hypergeometric tail probability for each (source, target) pair.

    Work in progress!

    For pair ``i`` the larger of the two ``n_unique_nodes`` entries is used
    as the number of success states, the smaller one as the number of
    draws, and ``intsec_card[i]`` as the observed count. The returned value
    is ``hypergeom.sf(x, M, n, N)``, i.e. the probability of observing
    strictly more than ``x`` successes.

    NOTE(review): if "at least x" is the intended tail, the call would need
    ``x - 1``; left as in the original -- confirm.

    Parameters
    ----------
    n_unique_nodes : array-like
        Indexed with ``sources`` and ``targets`` to obtain per-pair counts.
    intsec_card : array-like
        Observed count for each pair; only the first ``len(sources)``
        entries are used.
    sources, targets : index arrays
        Indices into ``n_unique_nodes``, one entry per pair.
    population_size : int, optional
        Population size ``M`` of the hypergeometric distribution.
        Defaults to ``220*220``, the value that used to be hard-coded.

    Returns
    -------
    numpy.ndarray
        One probability per pair.
    """
    from scipy.stats import hypergeom

    pair_counts = np.vstack((n_unique_nodes[sources], n_unique_nodes[targets]))

    # number of success states in population
    n = pair_counts.max(axis=0)
    # total draws
    N = pair_counts.min(axis=0)
    # successes, aligned with the pairs
    x = np.asarray(intsec_card)[:n.shape[0]]

    # hypergeom.sf broadcasts over arrays, so one call replaces the
    # per-pair Python loop.
    return np.asarray(hypergeom.sf(x, population_size, n, N), dtype=float)
Exemplo n.º 55
0
def genpoints(prefix,frag):
    '''Compute (fpr, tpr, p-value) points from the shape constraint search
    files in the current directory that match the given prefix and frag.

    The totals are read from ``active.<frag>.cnt`` and ``decoy.<frag>.cnt``.
    For every ``<prefix>.*.<frag>.actives*.out`` file and its matching
    ``decoys`` file, the number of lines in each is taken as the number of
    hits. The p-value is the hypergeometric probability of finding at least
    that many actives among the hits, Bonferroni-scaled by the number of
    file pairs.

    Returns a tuple ``(pts, files)``: ``pts`` is a float array of shape
    (number of file pairs, 3) with columns fpr, tpr, corrected p-value;
    ``files`` maps ``(fpr, tpr)`` to the actives file that produced it.
    '''
    acntf = 'active.%s.cnt' % frag
    dcntf = 'decoy.%s.cnt' % frag
    checkfiles([acntf,dcntf])
    with open(acntf) as fh:
        numactives = float(fh.read())
    with open(dcntf) as fh:
        numdecoys = float(fh.read())

    pts = []
    cnt = 0
    files = {}
    for afile in glob.glob('%s.*.%s.actives*.out' % (prefix,frag)):
        cnt += 1.0
        dfile = afile.replace('actives','decoys')
        checkfiles([dfile])
        # hits = number of lines; close each file as soon as it is counted
        with open(afile) as fh:
            na = float(sum(1 for _ in fh))
        with open(dfile) as fh:
            nd = float(sum(1 for _ in fh))
        fpr = nd/numdecoys
        tpr = na/numactives

        if fpr > 1 or tpr > 1:
            # parenthesised single-argument form works on Python 2 and 3
            print("BAD FPR/TPR: %.2f %.2f  %s %s" % (fpr,tpr,afile,dfile))

        if na == 0:
            pval = 1
        else:
            # P(X >= na) when drawing (nd + na) hits from a population of
            # numdecoys + numactives molecules of which numactives are
            # actives. The third argument is the number of successes in
            # the population, so it must be numactives, not the observed na.
            pval = hypergeom.sf(na-1,numdecoys+numactives,numactives,nd+na)

        pts.append((fpr,tpr, pval))
        files[(fpr,tpr)] = afile

    # bonferroni correction; reshape keeps the column indexing valid even
    # when no file matched the glob pattern
    pts = np.array(pts, dtype=float).reshape(-1, 3)
    pts[:,2] *= cnt
    return pts, files
Exemplo n.º 56
0
    def copies_in_top_five(self, deck):
        """ Another difficult question - but also somewhat difficult to code,
            since it requires that we pick a bunch of cards that have already
            left the deck. Well, it would require that for the serious
            version. For this version - just use a scalar!

            A card with more than one copy is picked at random from
            ``deck.decklist``; one copy is assumed to be in the seven-card
            opening hand. The answer is the chance, in percent, that at
            least one of the remaining copies is among the next five cards.

            Returns a tuple ``(question, correct_string, possible,
            answer_suffix, chosen_card)`` where ``possible`` is the shuffled
            list of four wrong answers from ``self.gen_wrong`` plus the
            correct one.

            Raises IndexError (from ``random.choice``) when no card in the
            deck has more than one copy.
        """
        question_string = "After drawing your opening hand with one copy of {card}, how likely is it that another copy of {card} is in the top five cards of your deck?"
        answer_suffix = 'percent'
        # That's another reason why we don't choose a card earlier: we might be
        # interested in a card with a specific quality.
        chosen_card = random.choice([card for card in deck.decklist if card.count > 1])
        remaining_copies = chosen_card.count - 1
        # seven cards (the opening hand) have already left the deck
        remaining_deck = sum(c.count for c in deck.decklist) - 7

        # sf(0, ...) is P(X > 0), i.e. at least one further copy in the five
        # cards. sf(1, ...) would be the chance of two or more copies, which
        # is not what the question asks.
        in_top_five_chance = hypergeom.sf(0, remaining_deck, remaining_copies, 5)
        in_top_five_chance = in_top_five_chance * 100
        correct_string = "{:.2f}".format(in_top_five_chance)

        wrongs = self.gen_wrong(in_top_five_chance, 'percent', 4)
        possible = wrongs + [correct_string]
        random.shuffle(possible)

        # parenthesised single-argument form works on Python 2 and 3
        print("Chance of a copy of {} in the next five cards: {}".format(chosen_card.name, correct_string))
        return question_string.format(card=chosen_card.name), correct_string, possible, answer_suffix, chosen_card
Exemplo n.º 57
0
    def get_static_enrichment(
            self, genes, pval_thresh, adjust_pval_thresh=True, K_min=3,
            gene_set_ids=None):
        """Find enriched gene sets in a set of genes.

        Each gene set is tested with a one-sided hypergeometric test: the
        p-value is the probability of finding at least the observed number
        of gene set members among the known input genes.

        Parameters
        ----------
        genes : set of str
            The set of genes to test for gene set enrichment. Genes that
            are not found in the genome are counted, reported in a warning
            and ignored.
        pval_thresh : float
            The significance level (p-value threshold) to use in the analysis.
        adjust_pval_thresh : bool, optional
            Whether to adjust the p-value threshold using a Bonferroni
            correction. (Warning: This is a very conservative correction!)
            [True]
        K_min : int, optional
            The minimum number of gene set genes present in the analysis. [3]
        gene_set_ids : Iterable or None
            A list of gene set IDs to test. If ``None``, all gene sets are
            tested that meet the :attr:`K_min` criterion.

        Returns
        -------
        list of `StaticGSEResult`
            A list of all significantly enriched gene sets.
        """
        assert isinstance(genes, set)
        # np.floating replaces the ``np.float`` alias, which current NumPy
        # releases no longer provide.
        assert isinstance(pval_thresh, (float, np.floating))
        assert isinstance(K_min, (int, np.integer))
        if gene_set_ids is not None:
            assert isinstance(gene_set_ids, Iterable)
            # materialise once: the IDs are traversed twice below, which
            # would silently fail for a one-shot iterator
            gene_set_ids = list(gene_set_ids)

        gene_set_coll = self._gene_set_coll
        gene_sets = gene_set_coll.gene_sets
        gene_memberships = self._gene_memberships
        sorted_genes = sorted(genes)

        # test only some terms?
        if gene_set_ids is not None:
            gs_indices = np.int64([gene_set_coll.index(id_)
                                   for id_ in gene_set_ids])
            gene_sets = [gene_set_coll[id_] for id_ in gene_set_ids]
            gene_memberships = gene_memberships[:, gs_indices]  # not a view!

        # determine K's
        K_vec = np.sum(gene_memberships, axis=0, dtype=np.int64)

        # exclude terms with too few genes
        keep = np.nonzero(K_vec >= K_min)[0]
        K_vec = K_vec[keep]
        gene_sets = [gene_sets[j] for j in keep]
        gene_memberships = gene_memberships[:, keep]

        # determine k's, ignoring unknown genes
        unknown = 0
        known_indices = []
        logger.debug('Looking up indices for %d genes...', len(sorted_genes))
        for g in sorted_genes:
            assert isinstance(g, (str, _oldstr))
            try:
                idx = self._genome.index(g)
            except ValueError:
                unknown += 1
            else:
                known_indices.append(idx)

        # row indices (into the genome / membership matrix) of known genes
        gene_indices = np.int64(known_indices)
        k_vec = np.sum(gene_memberships[gene_indices, :], axis=0,
                       dtype=np.int64)
        if unknown > 0:
            logger.warning('%d / %d unknown genes (%.1f %%), will be ignored.',
                           unknown, len(genes),
                           100 * (unknown / float(len(genes))))

        # determine n and N
        n = len(known_indices)
        N, m = gene_memberships.shape
        logger.info('Conducting %d tests.', m)

        # correct p-value threshold, if specified (nothing to correct, and
        # nothing to divide by, when no gene set is left to test)
        final_pval_thresh = pval_thresh
        if adjust_pval_thresh and m > 0:
            final_pval_thresh /= float(m)
            logger.info('Using Bonferroni-corrected p-value threshold: %.1e',
                        final_pval_thresh)

        # calculate p-values and get significantly enriched gene sets
        enriched = []

        logger.debug('N=%d, n=%d', N, n)
        sys.stdout.flush()
        for j in range(m):
            # P(X >= k) for k observed members among n tested genes
            pval = hypergeom.sf(k_vec[j] - 1, N, K_vec[j], n)
            if pval <= final_pval_thresh:
                # found significant enrichment
                # np.nonzero yields positions within ``gene_indices``; map
                # them back to genome indices before looking up the genes,
                # otherwise unrelated genes are reported.
                hits = np.nonzero(gene_memberships[gene_indices, j])[0]
                sel_genes = [self._genome[i] for i in gene_indices[hits]]
                enriched.append(
                    StaticGSEResult(gene_sets[j], N, n, set(sel_genes), pval))

        return enriched
Exemplo n.º 58
0
    def enrichment(self, features_of_interest, background=None,
                   p_value_cutoff=1000000, cross_reference=None,
                   min_feature_size=3, min_background_size=5,
                   domain=None):
        """Bonferroni-corrected hypergeometric p-values of GO enrichment

        Calculates hypergeometric enrichment of the features of interest, in
        each GO category. The p-value of a category is the probability of
        observing at least as many features of interest in it as were
        actually found.

        Parameters
        ----------
        features_of_interest : list-like
            List of features. Must match the identifiers in the ontology
            database exactly, i.e. if your ontology database is ENSEMBL ids,
            then you can only provide those and not common names like "RBFOX2"
        background : list-like, optional
            Background genes to use. It is best to use a relevant background
            such as all expressed genes. If None, defaults to all genes.
        p_value_cutoff : float, optional
            Maximum accepted Bonferroni-corrected p-value
        cross_reference : dict-like, optional
            A mapping of gene ids to gene symbols, e.g. a pandas Series of
            ENSEMBL genes e.g. ENSG00000139675 to gene symbols e.g HNRNPA1L2
        min_feature_size : int, optional
            Minimum number of features of interest overlapping in a GO Term,
            to calculate enrichment
        min_background_size : int, optional
            Minimum number of features in the background overlapping a GO Term
        domain : str or list, optional
            Only calculate GO enrichment for a particular GO category or
            subset of categories. Valid domains:
            'biological_process', 'molecular_function', 'cellular_component'

        Returns
        -------
        enrichment_df : pandas.DataFrame or None
            A (n_go_categories, columns) DataFrame of the enrichment scores,
            sorted by p-value. ``None`` (after a warning) when no GO
            category passes the size filters.

        Raises
        ------
        ValueError
            If features of interest and background do not overlap, or invalid
            GO domains are given
        """
        cross_reference = {} if cross_reference is None else cross_reference
        background = self.all_genes if background is None else background
        if len(set(background) & set(features_of_interest)) == 0:
            raise ValueError('Features of interest and background do not '
                             'overlap! Not calculating GO enrichment')
        if len(set(features_of_interest) & set(self.all_genes)) == 0:
            # trailing space added: the two literals used to be joined as
            # "GO termgene ids"
            raise ValueError('Features of interest do not overlap with GO term '
                             'gene ids. Not calculating GO enrichment.')
        domains = self.domains
        valid_domains = ",".join("'{}'".format(x) for x in self.domains)

        if isinstance(domain, str):
            if domain not in self.domains:
                raise ValueError(
                    "'{}' is not a valid GO domain. "
                    "Only {} are acceptable".format(domain, valid_domains))
            domains = frozenset([domain])
        elif isinstance(domain, Iterable):
            if len(set(domain) & self.domains) == 0:
                raise ValueError(
                    "'{}' are not a valid GO domains. "
                    "Only {} are acceptable".format(
                        ",".join("'{}'".format(x) for x in domain),
                        valid_domains))
            domains = frozenset(domain)

        n_all_genes = len(background)
        n_features_of_interest = len(features_of_interest)
        enrichment = defaultdict(dict)

        for go_term, go_genes in self.ontology.items():
            if go_genes['domain'] not in domains:
                continue

            features_in_go = go_genes['genes'].intersection(
                features_of_interest)
            background_in_go = go_genes['genes'].intersection(background)
            too_few_features = len(features_in_go) < min_feature_size
            too_few_background = len(background_in_go) < min_background_size
            if too_few_features or too_few_background:
                continue

            # Survival function is more accurate on small p-values.
            # sf(k - 1) = P(X >= k): the observed overlap itself must be
            # part of the tail, sf(k) would only cover P(X > k).
            p_value = hypergeom.sf(len(features_in_go) - 1, n_all_genes,
                                   len(background_in_go),
                                   n_features_of_interest)
            p_value = 0 if p_value < 0 else p_value
            symbols = [cross_reference[f] if f in cross_reference else f for f
                       in features_in_go]
            enrichment['p_value'][go_term] = p_value
            enrichment['n_features_of_interest_in_go_term'][go_term] = len(
                features_in_go)
            enrichment['n_background_in_go_term'][go_term] = len(
                background_in_go)
            enrichment['n_features_total_in_go_term'][go_term] = len(
                go_genes['genes'])
            enrichment['features_of_interest_in_go_term'][
                go_term] = ','.join(features_in_go)
            enrichment['features_of_interest_in_go_term_gene_symbols'][
                go_term] = ','.join(symbols)
            enrichment['go_domain'][go_term] = go_genes['domain']
            enrichment['go_name'][go_term] = go_genes['name']
        enrichment_df = pd.DataFrame(enrichment)

        if enrichment_df.empty:
            warnings.warn('No GO categories enriched in provided features')
            return

        # Bonferonni correction: scale by the number of categories tested
        enrichment_df['bonferonni_corrected_p_value'] = \
            enrichment_df.p_value * enrichment_df.shape[0]
        ind = enrichment_df['bonferonni_corrected_p_value'] < p_value_cutoff
        # .loc / sort_values replace .ix / sort(columns=...), which have
        # been removed from pandas
        enrichment_df = enrichment_df.loc[ind]
        enrichment_df = enrichment_df.sort_values(by=['p_value'])

        return enrichment_df
Exemplo n.º 59
0
def main():
    """Compare CLIP peak-bound genes with RIP enrichment from a cuffdiff file.

    Command line: ``[options] <peaks gff> <diff>``. The steps are:

    1. optionally restrict the reference GTF to single gene loci;
    2. collect the genes overlapped by a peak via ``intersectBed``;
    3. hash the RIP statistic (test statistic or fold change) per gene;
    4. plot the statistic for bound vs. unbound genes with the ggplot
       script and write a Mann-Whitney U summary to
       ``<output_pre>_stats.txt``;
    5. write hypergeometric p-values for the overlap of peak genes and
       RIP-bound genes to ``<output_pre>_hyper.txt`` and, when both sets
       have private members, a Venn diagram to ``<output_pre>_venn.pdf``.
    """
    usage = 'usage: %prog [options] <peaks gff> <diff>'
    parser = OptionParser(usage)
    parser.add_option('-c', dest='clip_fpkm_file', help='Control FPKM tracking file')
    parser.add_option('-g', dest='ref_gtf', default='%s/gencode.v18.annotation.gtf'%os.environ['GENCODE'])
    parser.add_option('--ggplot', dest='ggplot_script', default='%s/peaks_diff_compare.r'%os.environ['RDIR'], help='Script to make plots with [Default: %default]')
    parser.add_option('-m', dest='max_stat', default=10, type='float', help='Max cuffdiff stat [Default: %default]')
    parser.add_option('-o', dest='output_pre', default='', help='Output prefix [Default: %default]')
    parser.add_option('-r', dest='rbp', default='RBP', help='RBP name [Default: %default]')
    parser.add_option('-s', dest='single_gene_loci', default=False, action='store_true', help='Only use single gene loci [Default: %default]')
    parser.add_option('-t', dest='test_stat', default=False, action='store_true', help='Use test statistic rather than fold change [Default: %default]')
    parser.add_option('--sample1', dest='sample1', help='Sample_1 name in cuffdiff')
    parser.add_option('--sample2', dest='sample2', help='Sample_2 name in cuffdiff')
    (options,args) = parser.parse_args()

    if len(args) != 2:
        parser.error('Must provide peaks GFF and .diff file')
    else:
        peaks_gff = args[0]
        diff_file = args[1]

    ##################################################
    # process GTF
    ##################################################
    if options.single_gene_loci:
        single_gtf_fd, single_gtf_file = filter_single(options.ref_gtf)
        options.ref_gtf = single_gtf_file

    gtf_genes = gff.gtf_gene_set(options.ref_gtf)

    ##################################################
    # collect CLIP peak bound genes
    ##################################################
    peak_genes = set()
    # Argument list instead of a shell string: file names containing spaces
    # or shell metacharacters are passed through verbatim.
    intersect_cmd = ['intersectBed', '-s', '-u', '-a', options.ref_gtf, '-b', peaks_gff]
    intersect_proc = subprocess.Popen(intersect_cmd, stdout=subprocess.PIPE)
    for line in intersect_proc.stdout:
        # column 9 of a GTF line holds the key/value attributes
        peak_genes.add(gff.gtf_kv(line.split('\t')[8])['gene_id'])
    intersect_proc.communicate()

    # find expressed genes in peak calls
    silent_genes = set()
    if options.clip_fpkm_file:
        silent_genes = find_silent(options.clip_fpkm_file)

    ##################################################
    # collect RIP stats
    ##################################################
    if options.test_stat:
        rip_fold, rip_bound = ripseq.hash_rip(diff_file, just_ok = True, use_fold=False, max_stat=options.max_stat, one_rbp=True)
    else:
        # only rip_bound is kept from this call; the fold changes come from
        # hash_rip_fold below
        rip_fold, rip_bound = ripseq.hash_rip(diff_file, use_fold=True, max_stat=options.max_stat, one_rbp=True)
        rip_fold = ripseq.hash_rip_fold(diff_file, min_fpkm=0.125, max_fold=10, one_rbp=True)

    ##################################################
    # plot bound and unbound distributions
    ##################################################
    # construct data frame
    df_dict = {'Gene':[], 'CLIP':[], 'RIP':[]}
    for gene_id in rip_fold:
        if gene_id in gtf_genes and (len(silent_genes) == 0 or gene_id not in silent_genes):
            df_dict['Gene'].append(gene_id)
            df_dict['RIP'].append(rip_fold[gene_id])
            if gene_id in peak_genes:
                df_dict['CLIP'].append('Bound')
            else:
                df_dict['CLIP'].append('Unbound')

    ggplot.plot(options.ggplot_script, df_dict, [options.output_pre, options.rbp, options.test_stat])

    ##################################################
    # compute stats on bound and unbound distributions
    ##################################################
    bound_fold = [fold for fold, clip in zip(df_dict['RIP'], df_dict['CLIP']) if clip == 'Bound']
    unbound_fold = [fold for fold, clip in zip(df_dict['RIP'], df_dict['CLIP']) if clip == 'Unbound']

    # perform statistical test
    z, p = stats.mannwhitneyu(bound_fold, unbound_fold)

    cols = (options.rbp, len(bound_fold), stats.mean(bound_fold), len(unbound_fold), stats.mean(unbound_fold), z, p)
    with open('%s_stats.txt' % options.output_pre, 'w') as stats_out:
        stats_out.write('%-10s  %5d  %6.2f  %5d  %6.2f  %6.2f  %9.2e\n' % cols)

    ##################################################
    # plot venn diagram
    ##################################################
    rip_genes = set([gene_id for gene_id in df_dict['Gene'] if rip_bound.get(gene_id, False)])

    clip_only = len(peak_genes - rip_genes)
    rip_only = len(rip_genes - peak_genes)
    both = len(peak_genes & rip_genes)

    if options.clip_fpkm_file:
        sys.stderr.write('Ignoring silent genes for hypergeometric test\n')

    # scipy signature: hypergeom.sf(x, M, n, N, loc=0)
    #   x: observed successes - 1 (so that sf gives P(X >= both))
    #   M: population size
    #   n: successes in the population
    #   N: number of draws
    # The test is run both ways round (peak genes as successes and RIP
    # genes as draws, and vice versa).
    p1 = hypergeom.sf(both-1, len(gtf_genes), len(peak_genes), len(rip_genes))
    p2 = hypergeom.sf(both-1, len(gtf_genes), len(rip_genes), len(peak_genes))

    cols = (p1, p2, both, clip_only, rip_only, len(peak_genes), len(rip_genes), len(gtf_genes))
    with open('%s_hyper.txt' % options.output_pre, 'w') as hyper_out:
        hyper_out.write('%7.2e  %7.2e  %5d  %5d  %5d  %5d  %5d %5d\n' % cols)

    if clip_only > 0 and rip_only > 0:
        plt.figure()
        venn2(subsets=(clip_only, rip_only, both), set_labels=['CLIP', 'fRIP'], set_colors=['#e41a1c', '#A1A838'])
        plt.savefig('%s_venn.pdf' % options.output_pre)

    ##################################################
    # clean
    ##################################################
    if options.single_gene_loci:
        os.close(single_gtf_fd)
        os.remove(single_gtf_file)