import math

from scipy.stats import fisher_exact


def snp_count(nucs, species, recomp15, recomp16):
	SNPs = {x: 0 for x in ('A','T','C','G')}
	maxSNP = ""
	maxSNP_value = 0
	countahead = 0
	indel = ""
	consensus_num = len(recomp16.findall(nucs))
	for m in recomp15.finditer(nucs):
		prev = str(m.group(1))
		SNP = str(m.group(2))
		if prev != "None": # nucleotide belongs to an indel
			countahead = int(prev[1]) - 1
			indel = str(prev) + str(SNP)
		elif countahead > 0: # continue adding nucleotides 1 at a time to indel
			indel += str(SNP)
			countahead -= 1
			if countahead == 0:
				if indel in SNPs:
					SNPs[indel] += 1
				else:
					SNPs[indel] = 1
		else: # SNP not part of an indel
			SNPs[SNP] += 1
	SNPcount = sum(SNPs.values())
	maxSNP = max(SNPs, key=SNPs.get)
	maxSNP_value = SNPs[maxSNP]
	totalusedreads = consensus_num + maxSNP_value
	totalreads = consensus_num + SNPcount
	oddsratio = pvalue = nullvalue = 0
	mapping = "-"
	if species == "t":
		# Fisher's Exact Test for correctly mapped reads in a duplicated region on A. thaliana. Testing for 50/50 Con:SNP distribution.
		#
		#				Actual		Ideal (for 50/50)
		#	Consensus	  a			  b
		#		SNPs	  c			  d
		if consensus_num == totalreads and maxSNP_value == 0:
			return mapping # Time-saver
		else:
			oddsratio, pvalue = fisher_exact([[consensus_num, math.ceil(totalreads*0.5)], [maxSNP_value, math.ceil(totalreads*0.5)]])
	elif species == "l":
		# Fisher's Exact Test for ped reads on A. lyrata. Testing for 0/100 Con:SNP distribution.
		#
		#				Actual		Ideal (for 0/100)
		#	Consensus	  a			  0
		#		SNPs	  c			  d
		if consensus_num == 0: # SciPy's Fisher's exact test errors when this is 0 (an internal value becomes infinity and isn't handled), so increment every cell of the test by 1
			consensus_num += 1
			maxSNP_value += 1
			totalreads += 1
			nullvalue += 1
		if consensus_num == totalreads and maxSNP_value == 0:
			return mapping # Time-saver
		else:
			oddsratio, pvalue = fisher_exact([[consensus_num, nullvalue], [maxSNP_value, totalreads]])
	if pvalue < 0: pvalue = 0 # An error sometimes occurs for ratios enormously different from the ideal where p-value is slightly negative. Probably a floating point error
	if pvalue >= 0.10:
		# To a 90% confidence level, cannot reject the hypothesis that the observed SNP frequency is the same as 0/100 (A. lyrata) or 50/50 (A. thaliana) consensus/SNP
		mapping = maxSNP
	return mapping
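For reference, `recomp15` and `recomp16` are compiled patterns supplied by the caller and are not shown in this listing. A guess at their shape, assuming `nucs` is an mpileup-style base column; this is an illustration, not the project's actual patterns:

import re

# Hypothetical patterns: recomp15 captures an optional indel-length prefix
# (group 1) plus a single base (group 2); recomp16 matches consensus characters.
recomp15 = re.compile(r'([+-]\d+)?([ACGTacgt])')
recomp16 = re.compile(r'[.,]')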
Example #2
from decimal import Decimal

from scipy import stats


def getFisher(distPop, racePop, all_punishments, group_punishments):
    # I don't know if this is a valid way to report the Fisher's exact test statistic, but the idea is that if getFisher returns
    # a positive number over .95, there's a 95% chance that the group's better-than-average treatment is not due to chance,
    # and if it returns a number under -.95, there's a 95% chance that the group's worse-than-average treatment is not due to chance.
    # This should make it easier to create a color scale for showing the scores on a map.

    # The getFisher function assumes, wrongly, that everyone can have only one punishment (of each type). If the number of
    # punishments exceeds the number of kids, it reduces the number of punishments (and assumes, wrongly, that every
    # kid has been punished). But maybe the results are still close enough to correct to use for scaling?

    """
    >>> getFisher(20, 5, 20, 10)
    0.904604
    >>> getFisher(20, 0, 20, 0)
    (None, None)
    """

    if max(racePop, group_punishments) == 0:
        return None, None
    elif all_punishments == 0:
        return 1, 0
    else:
        oddsratio, pvalueG = stats.fisher_exact([[racePop, max(distPop - racePop, 0)],
                                                 [group_punishments, max(all_punishments - group_punishments, 0)]],
                                                alternative='greater')
        oddsratio, pvalueL = stats.fisher_exact([[racePop, max(distPop - racePop, 0)],
                                                 [group_punishments, max(all_punishments - group_punishments, 0)]],
                                                alternative='less')
        if pvalueL < pvalueG:
            pv = 1 - pvalueL
        else:
            pv = pvalueG - 1
        pv = Decimal(pv)
        pv = pv.quantize(Decimal('0.000001'))
    return float(pv)
import logging

from scipy import stats


def contam_contig(nmapped_sink, nmapped_source, contam_libs, KS_THRESHOLD=0.001, P_RATIO=0.0001):
    """
    Determine whether a contig is contaminated.
    """
    # count # of sources
    nsource = 0
    prop = 0.
    N = 0
    logging.info(','.join(contam_libs))
    max_f = 0.  # track the library that contributed most weight to prop
    lib = None
    for lib_scr in nmapped_source:
        #print lib_scr
        if lib_scr in contam_libs:
            # found a source lib
            nsource += 1
            f = float(contam_libs[lib_scr][0]) / float(contam_libs[lib_scr][1]) * nmapped_source[lib_scr].n_mapped_reads
            prop += f  # proportion
            N += nmapped_source[lib_scr].n_mapped_reads
            mssg = '%s %d' % (lib_scr, N)
            logging.info(mssg)
            if f >= max_f:  # >= so that lib is always set when at least one source exists
                max_f = f
                lib = lib_scr
    if N!=0:
        prop /= N
    else:
        return False
    mssg='max lib %s, # of sources %d, P is %f, nmap_sink is %d, nmap_src is %d' % (lib, nsource, prop, nmapped_sink[1], N)
    logging.info(mssg)
    mssg='bin.test %g' % stats.binom_test(nmapped_sink[1], nmapped_sink[1]+N, prop/(1.+prop))
    logging.info(mssg)
    mssg='%g ~ %g, p-value %g' % (prop, nmapped_sink[1]/float(N), stats.binom_test(contam_libs[lib][0], contam_libs[lib][0]+contam_libs[lib][1], prop/(1.+prop)))
    logging.info(mssg)
    if nsource == 1:
        mssg = 'fisher exact %g' % stats.fisher_exact([[contam_libs[lib][0], contam_libs[lib][1]], [nmapped_sink[1], nmapped_source[lib].n_mapped_reads]])[1]
    else:
        mssg = 'bin.test %g' % stats.binom_test(nmapped_sink[1], nmapped_sink[1] + N, prop / (1. + prop))
    logging.info(mssg)

    if nsource > 1 and stats.binom_test(contam_libs[lib][0], contam_libs[lib][0]+contam_libs[lib][1], prop/(1.+prop)) > 0.05:
        nsource=1
    if  nsource==0 or (nsource==1 and nmapped_source[lib].similar < KS_THRESHOLD) \
    or (nsource==1 and stats.fisher_exact([[contam_libs[lib][0], contam_libs[lib][1]], [nmapped_sink[1], nmapped_source[lib].n_mapped_reads]])[1] < P_RATIO)\
    or (stats.binom_test(nmapped_sink[1], nmapped_sink[1]+N, prop/(1.+prop)) < P_RATIO):
        # not a contam
        return False

    slist = [lib_scr for lib_scr in nmapped_source if lib_scr in contam_libs]

    logging.info(','.join(slist))
    return True
Example #4
def fisher(donors, gene_1_mutated, other_mutated, cooc):
	a = donors - gene_1_mutated - other_mutated + cooc # both wt (the overlap was subtracted twice)
	b = gene_1_mutated - cooc  # gene 1 (p53) mutated and the other gene (rpl5) wt
	c = other_mutated - cooc # the other gene mutated and gene 1 wt
	d = cooc                 # both mutated
	odds, pval_lt = stats.fisher_exact([[a, b], [c, d]], "less")
	odds, pval_gt = stats.fisher_exact([[a, b], [c, d]], "greater")

	return pval_lt, pval_gt
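A toy call, assuming `from scipy import stats` at module level; the numbers are illustrative, not project data. With 100 donors, 30 mutated in gene 1, 20 mutated in the other gene, and 10 co-occurrences, the table works out to [[60, 20], [10, 10]]:

from scipy import stats

p_less, p_greater = fisher(100, 30, 20, 10)
print(p_less, p_greater)  # a small p_greater would suggest the mutations co-occur more than expected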
Example #5
File: core.py Project: changebio/mamotif
def target_enrichment_peak2peak(peak1_table, peak2_table, motif_table):
    """Perform the enrichment analysis on two samples
    Args:
        peak1_table: pandas dataframe, motifscan result table on sample1
        peak2_table: pandas dataframe, motifscan result table on sample2
        motif_table: pandas dataframe, motif information table

    Returns:
        motif_table: pandas dataframe, table containing both motif information and
                                       Fisher exact test statistics
    """

    n_motif = len(motif_table)
    n_peak1 = len(peak1_table)
    n_peak2 = len(peak2_table)

    fold_change = np.zeros(n_motif)
    enrich_pvalue = np.zeros(n_motif)
    deplete_pvalue = np.zeros(n_motif)
    oddsratio = np.ones(n_motif)
    pvalue_corrected = np.ones(n_motif)

    peak1_tarnum = np.zeros(n_motif)
    peak2_tarnum = np.zeros(n_motif)
    # print pd.Index([i for i in peak1_table.columns if re.search(r'\.tarnum',i)])
    peak1_tarnum_table = peak1_table[
        pd.Index([i for i in peak1_table.columns if re.search(r'\.tarnum', i)])]
    peak2_tarnum_table = peak2_table[
        pd.Index([i for i in peak2_table.columns if re.search(r'\.tarnum', i)])]

    for mti, motif_name in zip(range(n_motif), motif_table['name']):
        peak1_tarnum[mti] = len(
            [i for i in peak1_tarnum_table['%s.tarnum' % motif_name] if i > 0])
        peak2_tarnum[mti] = len(
            [i for i in peak2_tarnum_table['%s.tarnum' % motif_name] if i > 0])
        if peak1_tarnum[mti] != 0 and peak2_tarnum[mti] != 0:
            fold_change[mti] = float(peak1_tarnum[mti] * n_peak2) / (
                peak2_tarnum[mti] * n_peak1)
        else:
            fold_change[mti] = np.nan
        table = [[peak1_tarnum[mti], n_peak1 - peak1_tarnum[mti]],
                 [peak2_tarnum[mti], n_peak2 - peak2_tarnum[mti]]]
        oddsratio[mti], enrich_pvalue[mti] = stats.fisher_exact(table, 'greater')
        oddsratio[mti], deplete_pvalue[mti] = stats.fisher_exact(table, 'less')
        pvalue_corrected[mti] = min(min(deplete_pvalue[mti],
                                        enrich_pvalue[mti]) * n_motif, 1)
    motif_table['peak1_target_number'] = peak1_tarnum
    motif_table['peak2_target_number'] = peak2_tarnum
    motif_table['fold_change'] = fold_change
    motif_table['enrich_pvalue'] = enrich_pvalue
    motif_table['deplete_pvalue'] = deplete_pvalue
    motif_table['oddsratio'] = oddsratio
    motif_table['pvalue_corrected'] = pvalue_corrected
    motif_table.sort_values('enrich_pvalue', inplace=True)
    return motif_table
def statistic_analysis(np_snp_info,np_feature_snp,np_label_classifyProgress,np_label_classifyPhenotype):
    ### proportion
    np_proportion = np.average(np_feature_snp, axis=0).reshape(np_snp_info.shape[0],np_snp_info.shape[1])

    ### get 2X2 matrix
    np_2_2_matrix_classifyProgress = np.zeros([np_snp_info.shape[0],4],dtype='float') # must start at zero: cells are accumulated below
    np_2_2_matrix_classifyPhenotype = np.zeros([np_snp_info.shape[0],4],dtype='float')
    for idxSNP in range(0,np_snp_info.shape[0]):
        for idxSample in range(0,np_feature_snp.shape[0]):
            if np_label_classifyProgress[idxSample] == 0:
                np_2_2_matrix_classifyProgress[idxSNP,0] += np_feature_snp[idxSample,idxSNP*3] * 2 + np_feature_snp[idxSample,idxSNP*3+1]
                np_2_2_matrix_classifyProgress[idxSNP,2] += np_feature_snp[idxSample,idxSNP*3+1] + np_feature_snp[idxSample,idxSNP*3+2] * 2
            else:
                np_2_2_matrix_classifyProgress[idxSNP,1] += np_feature_snp[idxSample,idxSNP*3] * 2 + np_feature_snp[idxSample,idxSNP*3+1]
                np_2_2_matrix_classifyProgress[idxSNP,3] += np_feature_snp[idxSample,idxSNP*3+1] + np_feature_snp[idxSample,idxSNP*3+2] * 2
            if np_label_classifyPhenotype[idxSample] == 0:
                np_2_2_matrix_classifyPhenotype[idxSNP,0] += np_feature_snp[idxSample,idxSNP*3] * 2 + np_feature_snp[idxSample,idxSNP*3+1]
                np_2_2_matrix_classifyPhenotype[idxSNP,2] += np_feature_snp[idxSample,idxSNP*3+1] + np_feature_snp[idxSample,idxSNP*3+2] * 2
            else:
                np_2_2_matrix_classifyPhenotype[idxSNP,1] += np_feature_snp[idxSample,idxSNP*3] * 2 + np_feature_snp[idxSample,idxSNP*3+1]
                np_2_2_matrix_classifyPhenotype[idxSNP,3] += np_feature_snp[idxSample,idxSNP*3+1] + np_feature_snp[idxSample,idxSNP*3+2] * 2
    
    ### chi-square; fisher; oddsratio
    np_chi2 = np.empty([np_snp_info.shape[0],2],dtype='float')
    np_fisher = np.empty([np_snp_info.shape[0],2],dtype='float')
    np_oddsratio = np.empty([np_snp_info.shape[0],2],dtype='float')
    for idxSNP in range(0,np_snp_info.shape[0]):
        np_this_2_2_matrix = np_2_2_matrix_classifyProgress[idxSNP,:].reshape(2,2)
        print(np_this_2_2_matrix)
        chi2, p, dof, ex = st.chi2_contingency(np_this_2_2_matrix, correction=False)
        np_chi2[idxSNP,0] = p
        oddsratio, pvalue = st.fisher_exact(np_this_2_2_matrix)
        np_fisher[idxSNP,0] = pvalue
        np_oddsratio[idxSNP,0] = (np_this_2_2_matrix[0,0]*np_this_2_2_matrix[1,1])/(np_this_2_2_matrix[1,0]*np_this_2_2_matrix[0,1])
        np_this_2_2_matrix = np_2_2_matrix_classifyPhenotype[idxSNP,:].reshape(2,2)
        chi2, p, dof, ex = st.chi2_contingency(np_this_2_2_matrix, correction=False)
        np_chi2[idxSNP,1] = p
        oddsratio, pvalue = st.fisher_exact(np_this_2_2_matrix)
        np_fisher[idxSNP,1] = pvalue
        np_oddsratio[idxSNP,1] = (np_this_2_2_matrix[0,0]*np_this_2_2_matrix[1,1])/(np_this_2_2_matrix[1,0]*np_this_2_2_matrix[0,1])
    
    # proportion (AA:AB:BB); ClassifyProgress (Chi2, Fisher, OddsRatio); ClassifyPhenotype (Chi2, Fisher, OddsRatio)
    np_statistic_result = np.empty([np_snp_info.shape[0],9],dtype='float')
    np_statistic_result[:,:3] = np_proportion
    np_statistic_result[:,3] = np_chi2[:,0]
    np_statistic_result[:,4] = np_fisher[:,0]
    np_statistic_result[:,5] = np_oddsratio[:,0]
    np_statistic_result[:,6] = np_chi2[:,1]
    np_statistic_result[:,7] = np_fisher[:,1]
    np_statistic_result[:,8] = np_oddsratio[:,1]
    
    return np_statistic_result
Example #7
def fisher_test_file(infile, mini_site):
    ofile = open('%s.fisher_test.txt' % (infile), 'w')
    with open(infile, 'r') as filehd:
        for line in filehd:
            line = line.rstrip()
            if len(line) > 2:
                unit = re.split(r'\t', line)
                # the three contexts share one column layout: methylation counts at
                # columns i, i+1 (sample 1) and i+12, i+13 (sample 2), site numbers
                # at i+2 and i+14
                for context, i in (('CG', 3), ('CHG', 6), ('CHH', 9)):
                    fields = [unit[i], unit[i+1], unit[i+12], unit[i+13]]
                    if 'NA' in fields or int(unit[i+2]) <= mini_site or int(unit[i+14]) <= mini_site:
                        pvalue = 'NA'
                    else:
                        c1, mc1, c2, mc2 = map(int, fields)
                        oddsratio, pvalue = fisher_exact([[c1, mc1], [c2, mc2]])
                    ofile.write('%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' % (unit[0], unit[1], unit[2], context, fields[0], fields[1], fields[2], fields[3], pvalue))
    ofile.close()
    return 1
def get_sorted_fisher_dct(clus_genes, go_dct):
    '''
    Returns a list of (GO term, p-value) pairs sorted by the p-value of
    Fisher's test for GO enrichment of the set of genes in the cluster.
    '''
    # Keys are GO terms, and values are the enrichment p-values.
    fisher_dct = {}
    for go_label in go_dct:
        go_genes = set(go_dct[go_label])

        # # Skip bad GO terms.
        # if len(go_genes) > MAX_GO_SIZE or len(go_genes) < MIN_GO_SIZE:
        #     continue

        # Compute the four sets for Fisher's test.
        clus_and_go = len(clus_genes.intersection(go_genes))
        clus_not_go = len(clus_genes.difference(go_genes))
        go_not_clus = len(go_genes.difference(clus_genes))
        neither = len(gene_universe) - len(go_genes.union(clus_genes))

        # Run Fisher's test.
        f_table = ([[clus_and_go, clus_not_go], [go_not_clus, neither]])
        o_r, p_value = fisher_exact(f_table, alternative='greater')

        # Handle overflow issues.
        p_value = max(p_value, 1e-300)

        fisher_dct[go_label] = p_value

    return sorted(fisher_dct.items(), key=operator.itemgetter(1))
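get_sorted_fisher_dct reads `gene_universe`, `operator` and `fisher_exact` from module scope. A toy invocation with made-up names, assuming those globals are set up like this:

import operator

from scipy.stats import fisher_exact

gene_universe = set('gene%d' % i for i in range(100))  # made-up universe
go_dct = {'GO:0000001': ['gene1', 'gene2', 'gene3'],
          'GO:0000002': ['gene4', 'gene5']}
print(get_sorted_fisher_dct({'gene1', 'gene2', 'gene9'}, go_dct))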
Example #9
File: plot.py Project: arahuja/cohorts
def fishers_exact_plot(data, condition1, condition2):
    """
    Perform a Fisher's exact test to compare two binary columns

    Parameters
    ----------
    data: Pandas dataframe
        Dataframe to retrieve information from

    condition1: str
        First binary column to compare

    condition2: str
        Second binary column to compare
    """
    sb.factorplot(
        x=condition1,
        y=condition2,
        kind='bar',
        data=data
    )
    count_table = pd.crosstab(data[condition1], data[condition2])
    print(count_table)
    oddsratio, pvalue = fisher_exact(count_table)
    print("Fisher's Exact Test: OR: {}, p-value={}".format(oddsratio, pvalue))
    return (oddsratio, pvalue)
Example #10
def fishers(ocean, conn, interval, zone, zoneExpected, zoneObserved, globalCount):
    """ The null hypothesis is that the relative proportions of one variable are independent
        of the second variable. For example, if you counted the number of male and female mice
        in two barns, the null hypothesis would be that the proportion of male mice is the
        same in the two barns.
        http://udel.edu/~mcdonald/statfishers.html
        inputs: zone expected, zone observed, not zone expected, not zone observed

                    in zone     not in zone
        observed    a           b
        expected    c           d
    """
    notZoneExpected = globalCount - zoneExpected
    notZoneObserved = globalCount - zoneObserved

    if zoneExpected < 1 and zoneObserved == 0:
        print("** values too low")
    else:
        odds, pval = fisher_exact(np.array([[zoneObserved, notZoneObserved], [zoneExpected, notZoneExpected]]))
        fishers = robjects.r['fisher.test']
        res_r = fishers(np.array([[zoneObserved, notZoneObserved], [zoneExpected, notZoneExpected]]))
        r_p = res_r[0][0]
        r_odds = res_r[2][0]
        sql = "insert into fisherResults (ocean, zone, period, fisher, sig) values('" + ocean + "', '" + zone.get('name') + "', '" + str(interval) + "', " + formatOdds(r_odds) + ", " + str(r_p) + ")"
        utils.executeMysql_All(conn, sql)
        print("** Fishers Exact: %s: odds: %s, r_odds: %s, r_p: %s, p: %s" % (zone.get('name'), formatOdds(odds), r_odds, r_p, pval))
Example #11
def phyper_at_fpr(fg_vals, bg_vals, fpr=0.01):
    """
    Computes the hypergeometric p-value at a specific FPR (default 1%).

    Parameters
    ----------
    fg_vals : array_like
        The list of values for the positive set.

    bg_vals : array_like
        The list of values for the negative set.
    
    fpr : float, optional
        The FPR (between 0.0 and 1.0).
    
    Returns
    -------
    pvalue : float
        The one-sided (greater) Fisher exact p-value at the specified FPR.
    """
    fg_vals = np.array(fg_vals)
    s = scoreatpercentile(bg_vals, 100 - fpr * 100)
    
    table = [
            [sum(fg_vals >= s), sum(bg_vals >= s)],
            [sum(fg_vals < s), sum(bg_vals < s)],
            ]
    
    return fisher_exact(table, alternative="greater")[1]
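A quick synthetic check of phyper_at_fpr, with the module-level imports its body relies on spelled out; the shifted-normal foreground is made up, not data from the original project:

import numpy as np
from scipy.stats import scoreatpercentile, fisher_exact

rng = np.random.default_rng(0)
bg = rng.normal(0.0, 1.0, 1000)  # negative-set scores
fg = rng.normal(1.0, 1.0, 200)   # positive-set scores, shifted upward
print(phyper_at_fpr(fg, bg, fpr=0.01))  # small p-value: positives are enriched above the 1% FPR cutoff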
Example #12
    def calculate_differential_methylation_fisher_exact(self, weighted = False):
        sum_meth_control = 0
        sum_meth_affected = 0
        sum_cov_control = 0
        sum_cov_affected = 0
        for cpg in self.cpgs:
            if weighted:
                sum_meth_control += cpg.weighted_methylation_control
                sum_meth_affected += cpg.weighted_methylation_affected
                sum_cov_control += cpg.cov_control
                sum_cov_affected += cpg.cov_affected
            else:
                sum_meth_control += cpg.meth_control
                sum_meth_affected += cpg.meth_affected
                sum_cov_control += cpg.cov_control
                sum_cov_affected += cpg.cov_affected

        control = sum_meth_control / sum_cov_control
        affected = sum_meth_affected / sum_cov_affected
        control_methylated = sum_cov_control * control / 100
        control_unmethylated = sum_cov_control - control_methylated
        affected_methylated = sum_cov_affected * affected / 100
        affected_unmethylated = sum_cov_affected - affected_methylated
        try:
            #Try to use the much faster fisher module from http://pypi.python.org/pypi/fisher/
            p = fisher_exact.pvalue(control_methylated, control_unmethylated, affected_methylated, affected_unmethylated)
            pvalue = p.two_tail
        except Exception:
            oddsratio, pvalue = stats.fisher_exact([(control_methylated, control_unmethylated), (affected_methylated, affected_unmethylated)], alternative='two-sided')
        return pvalue
Example #13
	def calc(self, c_table): # two-sided
		p = stats.fisher_exact(c_table)[1]
		html = "<b>P-value</b> = %s" % format(p, 'g')
		tex = "$P = %s$" % format(p, 'g')
		res = dict(p=p, X=", ".join(self.x_cats), Y=", ".join(self.y_cats),
				   report=dict(tex=tex, html=html))
		return res
Example #14
def calc_qval_dbl(study_n, pop_n, pop, assoc, term_pop, obo_dag, T=500):
    """
    :param study_n: Integer (number of ANs from sample frequency)
    :param pop_n: Integer (number of ANs from background frequency = sample freq.)
    :param pop:
    :param assoc:
    :param term_pop:
    :param obo_dag:
    :param T:
    :return:
    """
    distribution = []
    for i in range(T):
        new_study = random.sample(pop, study_n) # add pop and study
        new_term_study = count_terms(new_study, assoc, obo_dag)[0] #!!!
        smallest_p = 1
        for term, study_count in list(new_term_study.items()):
            pop_count = term_pop[term]
            a = study_count
            b = study_n - study_count
            c = pop_count
            d = pop_n - pop_count
            p_val = stats.fisher_exact([[a, b], [c, d]], alternative='greater')[1] # one-sided (greater) p-value
            if p_val < smallest_p:
                smallest_p = p_val
        distribution.append(smallest_p)
        if i % 10  == 0:
            print("Sample {0} / {1}: p-value {2}".\
                        format(i, T, smallest_p), file=sys.stderr)
    return distribution
Example #15
def fip(envoSize, clusterSize, tp):
    fn = envoSize - tp
    fp = clusterSize - tp
    tn = n - (tp + fn + fp)
    fi = fisher_exact([[tp,fn], [fp, tn]])[1]
    fscore = prf(tp, fp, fn)[-1]
    return fi, fscore
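`prf` and the background size `n` come from the surrounding module and are not shown. A plausible precision/recall/F-score helper matching the call `prf(tp, fp, fn)[-1]` (an assumption, not the project's code):

def prf(tp, fp, fn):
    # precision, recall, F1; prf(tp, fp, fn)[-1] is the F-score used above
    precision = tp / float(tp + fp) if tp + fp else 0.0
    recall = tp / float(tp + fn) if tp + fn else 0.0
    f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0.0
    return precision, recall, f1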
def computePropTest( data ):
	mC = np.asarray( data[['mC.r1', 'mC.r2']] )
	tC = np.asarray( data[['t.r1','t.r2']] )
	uC = tC - mC
	#chi2, p, dof, ex = stats.chi2_contingency( np.array([mC,uC]), correction=True)
	odr, p = stats.fisher_exact( np.array([mC,uC]) )
	return p
    def call_variant(self):
        """
        mirrors AminoAcidCaller::CallVariants() in
        https://github.com/PacificBiosciences/minorseq/blob/develop/src/AminoAcidCaller.cpp

        For each position (that has sufficient coverage),
         do Fisher exact test w/ correction
         if p-val < threshold, then store it.

        Stores results in self.variant as:

        self.variant[position] = desc list of (base, count).
        NOTE: base must be either all in lower case (which means - strand)
              or all upper case (+ strand).
              If - strand and ('a', 10), it means the ref base is A on the + strand,
              and the transcript should be T on the - strand.

        Only positions with more than the ref base are stored.
        """
        for pos in self.positions_to_call:
            r = self.record_by_pos[pos]
            alt_variant = []
            for base, count in r.clean_counts.most_common()[1:]:
                assert not base.startswith('+') and not base.startswith('-') # clean counts should NOT have indels
                exp = r.clean_cov * self.err_sub
                odds, pval = stats.fisher_exact([[count, r.clean_cov-count], [exp, r.clean_cov-exp]], alternative='greater')
                pval *= self.number_of_tests
                if pval < self.pval_cutoff: # store variant if below cutoff
                    alt_variant.append((base, count))
            if len(alt_variant) > 0: # only record this variant if there's at least two haps
                self.variant[pos] = [r.clean_counts.most_common()[0]] + alt_variant
                self.ref_base[pos] = r.ref
Example #18
def MK_test(SNPs, test_mode):
	'''
	(dict, str) -> dict
	Take a dict of gene : [PN, PS, DN, DS] pairs and a string, fisher or G_test,
	and return a new dict with gene : [PN, PS, DN, DS, p-val] pairs,
	with PN and DN being respectively replacement polymorphisms and divergence,
	PS and DS being respectively synonymous polymorphisms and divergence,
	and p-val being the p-value of the contingency test using either Fisher's
	two-sided exact test or the G-test with Yates' correction
	'''
    
    # create new dict
    MK = {}    
    
    # loop over genes in dict
    for gene in SNPs:
        # initialize list with PN, PS
        polym = [SNPs[gene][0], SNPs[gene][1]]
        # initialize list with DN, DS
        diverg = [SNPs[gene][2], SNPs[gene][3]]
        # perform the MK test according to fisher 2-tailed or G-test
        if test_mode == 'fisher':
            # get the p-value
            P = stats.fisher_exact([polym, diverg])[1]
        elif test_mode == 'G_test':
            P = stats.chi2_contingency([polym, diverg], lambda_ = 'log-likelihood')[1]
        # add p-val to list
        MK[gene] = list(SNPs[gene])
        MK[gene].append(P)
        
    return MK
Example #19
File: GVR.py Project: BRiDGEIris/digest
def scoreGenePair(gene_symbol_pair,variantList):
    
    variantList=list(variantList)
    
    patientsID_dictionnary=patientsID_dictionnary_b.value
    
    patientsID_split_index=patientsID_split_index_b.value
    
    score=0
    if len(variantList)==2:
        (genes,variantList1)=variantList[0]
        (genes,variantList2)=variantList[1]
        
        gene1=genes[0]
        gene2=genes[1]
        
        variantList1=list(variantList1)
        variantList2=list(variantList2)
        
        genoSum1=getGenotypeVectorByGene(gene1,variantList1,patientsID_dictionnary,patientsID_split_index)
        genoSum2=getGenotypeVectorByGene(gene2,variantList2,patientsID_dictionnary,patientsID_split_index)
        
        genoSum=[int(x>0 and y>0) for x,y in zip(genoSum1,genoSum2)]
        
        sumCase=float(sum([int(x>0) for x in genoSum[0:patientsID_split_index]]))
        ratioCase=sumCase/patientsID_split_index
        sumControl=float(sum([int(x>0) for x in genoSum[patientsID_split_index:len(patientsID_dictionnary)]])) # start at split_index so the first control is not skipped
        ratioControl=sumControl/(len(patientsID_dictionnary)-patientsID_split_index)
        
        score=ratioCase-ratioControl
        pvalue=fisher_exact([[sumCase,patientsID_split_index-sumCase],[sumControl,len(patientsID_dictionnary)-patientsID_split_index]],'greater')[1]
        
        if score>0:
            return (gene_symbol_pair,((gene1,gene2),score,pvalue,ratioCase,ratioControl,sumCase,sumControl))
def enrichment_analysis(regions, data_to_scatter, synnonsyn, reference, minor_af):
    '''Enrichment of nonsynonymous mutations at globally variable but intrapatient conserved sites'''
    from scipy.stats import fisher_exact
    E = {}
    for region in regions:
        # returns count matrix [syn/nonsyn, low/high fitness, low/high entropy (><0.1)]
        E[region] = scatter_vs_entropy(region, data_to_scatter, synnonsyn, reference,
                                        xlabel='fitness cost', xlim=(1e-4, 1.2),
                                        enrichment_thresholds = (0.0, 0.03, 10.0))

    print('NonSyn enrichment among variable sites with high fitness costs')
    with open('../data/fitness_pooled/enrichment_st_'+args.subtype+'.tsv', 'w') as ofile:
        ofile.write('\t'.join(['region', 'nonsyn-low cost', 'nonsyn-large cost', 'syn-low cost', 'syn-large cost', 'OR', 'invOR', 'pval'])+'\n')
        for region, ctable in E.items():
            print(region)
            print('non-syn:\n', ctable[0])
            print('syn:\n', ctable[1])
            print('nonsyn/syn among diverse:\n', ctable[:,:,1])
            OR, pval = fisher_exact(ctable[:,:,1])
            print('odds ratio:', OR)
            ofile.write('\t'.join([region]+list(map(str, [ctable[0,0,1], ctable[0,1,1], ctable[1,0,1],
                                                          ctable[1,1,1], OR, np.round(1.0/OR,2), pval])))+'\n')
        ctable = np.sum(list(E.values()), axis=0)
        OR, pval = fisher_exact(ctable[:,:,1]) # recompute on the pooled table instead of reusing the last region's values
        ofile.write('\t'.join(['all']+list(map(str, [ctable[0,0,1], ctable[0,1,1], ctable[1,0,1],
                                                     ctable[1,1,1], OR, np.round(1.0/OR,2), pval])))+'\n')

    return E
Example #21
def fisherExact():
    '''Fisher's Exact Test:
    Data are taken from Altman, Table 10.14
    Spectacle wearing among juvenile delinquents and non-delinquents who failed a vision test
    Spectacle wearers: 1 delinquent, 5 non-delinquents
    Non-spectacle wearers: 8 delinquents, 2 non-delinquents
    '''

    # Enter the data
    obs = np.array([[1,5], [8,2]])

    # --- >>> START stats <<< ---
    # Calculate the Fisher Exact Test
    # Note that by default, the option "alternative='two-sided'" is set;
    # other options are 'less' or 'greater'.
    fisher_result = stats.fisher_exact(obs)
    # --- >>> STOP stats <<< ---

    # Print the result
    print('\nFISHER --------------------------------------------------------')
    print(('The probability of obtaining a distribution at least as extreme '
    + 'as the one that was actually observed, assuming that the null ' +
    'hypothesis is true, is: {0:5.3f}.'.format(fisher_result[1])))
    
    return fisher_result
Example #22
def main (contingency_table):
	""" Compute statistics for a 2x2 contingency table """
	
	SRS_types = set([])
	tables  = {}
	
	for row in csv.reader(open(contingency_table), delimiter = '\t'):
		
		ID, non_can, can = row
		SRS, tag = ID.split("_")
		
		SRS_types.add(SRS)
		tables[ID] = [int(non_can), int(can)]
	
	for srs in SRS_types:
		table = []
		table.append(tables[srs + "_YES"])
		table.append(tables[srs + "_NO"])

		obs = np.array(table)
		chi2, chi2_pvalue, chi2_dof, chi2_ex = chi2_contingency(obs, correction=False)
		chi2_yates, chi2_yates_pvalue, chi2_yates_dof, chi2_yates_ex = chi2_contingency(obs, correction=True)
		fisher_oddsratio, fisher_pvalue = stats.fisher_exact(table)
		
#		print(srs, table, fisher_oddsratio, fisher_pvalue, chi2, chi2_pvalue, chi2_dof, chi2_ex)

		print(srs, fisher_oddsratio, log(fisher_oddsratio, 2), fisher_pvalue, chi2, chi2_pvalue, chi2_yates, chi2_yates_pvalue)
Example #23
File: vectorizer.py Project: fototo/scan
    def get_vocab(self, vectorizer, input_text, input_scores):
        train_mat = vectorizer.transform(input_text)
        input_score_med = np.median(input_scores)
        new_scores = [0 if i <= input_score_med else 1 for i in input_scores]

        pvalues = []
        for i in range(0, train_mat.shape[1]):
            lcol = np.asarray(train_mat.getcol(i).todense().transpose())[0]
            good_lcol = lcol[[n for n in range(0, len(new_scores)) if new_scores[n] == 1]]
            bad_lcol = lcol[[n for n in range(0, len(new_scores)) if new_scores[n] == 0]]
            good_lcol_present = len(good_lcol[good_lcol > 0])
            good_lcol_missing = len(good_lcol[good_lcol == 0])
            bad_lcol_present = len(bad_lcol[bad_lcol > 0])
            bad_lcol_missing = len(bad_lcol[bad_lcol == 0])
            oddsratio, pval = fisher_exact(
                [[good_lcol_present, bad_lcol_present], [good_lcol_missing, bad_lcol_missing]]
            )
            pvalues.append(pval)

        col_inds = range(0, train_mat.shape[1])
        p_frame = np.array([col_inds, pvalues]).transpose()
        p_frame = p_frame[p_frame[:, 1].argsort()]

        rows = p_frame.shape[0]
        selection = self.max_features
        if rows < selection:
            selection = rows

        getVar = lambda searchList, ind: [searchList[int(i)] for i in ind]
        vocab = getVar(vectorizer.get_feature_names(), p_frame[:, 0][-selection:])
        return vocab
Example #24
def scoreVariantPair(variantIDpair,value_GenotypeListPair):
    
    genotypeListPair=list(value_GenotypeListPair)
    
    patientsID_dictionnary=patientsID_dictionnary_b.value
    patientsID_split_index=patientsID_split_index_b.value
    
    score=0
    if len(genotypeListPair)==2:
        (variantID,genotypeList1)=genotypeListPair[0]
        (variantID,genotypeList2)=genotypeListPair[1]
        
        variantID1=variantID[0]
        variantID2=variantID[1]
        
        genotypeList1=list(genotypeList1)
        genotypeList2=list(genotypeList2)
        
        genotypeVector1=getGenotypeVector(genotypeList1)
        genotypeVector2=getGenotypeVector(genotypeList2)
        
        genotypeVector=[int(x>0 and y>0) for x,y in zip(genotypeVector1,genotypeVector2)]
        
        sumCase=float(sum([int(x>0) for x in genotypeVector[0:patientsID_split_index]]))
        ratioCase=sumCase/patientsID_split_index
        sumControl=float(sum([int(x>0) for x in genotypeVector[patientsID_split_index:len(patientsID_dictionnary)]])) # start at split_index so the first control is not skipped
        ratioControl=sumControl/(len(patientsID_dictionnary)-patientsID_split_index)
        
        score=ratioCase-ratioControl
        pvalue=fisher_exact([[sumCase,patientsID_split_index-sumCase],[sumControl,len(patientsID_dictionnary)-patientsID_split_index]],'greater')[1]
        
        #if score>0:
        return (variantIDpair,((variantID1,variantID2),score,pvalue,ratioCase,ratioControl,sumCase,sumControl))
Example #25
def scoreGene(block):
    block=list(block)
    lenb=len(block)
    scores=[]
    
    patientsID_dictionnary=patientsID_dictionnary_b.value
    patientsID_split_index=patientsID_split_index_b.value
    
    if lenb>0:
        for i in range(0,lenb):
            listLoadBlock=block[i]
    
            sumCase=float(sum([int(x>0) for x in listLoadBlock[1][0:patientsID_split_index]]))
            sumControl=float(sum([int(x>0) for x in listLoadBlock[1][patientsID_split_index:len(patientsID_dictionnary)]]))
    
            ratioCase=sumCase/patientsID_split_index
            ratioControl=sumControl/(len(patientsID_dictionnary)-patientsID_split_index)
        
            score=ratioCase-ratioControl
            pvalue=fisher_exact([[sumCase,patientsID_split_index-sumCase],[sumControl,len(patientsID_dictionnary)-patientsID_split_index]],'greater')[1]
            #pvalue=ttest_ind(genotypeVectorByGene[0:patientsID_split_index],genotypeVectorByGene[patientsID_split_index:len(patientsID_dictionnary)])[1]/2
        
            if score>0:
                scores.append((listLoadBlock[0],(score,pvalue,ratioCase,ratioControl,sumCase,sumControl)))
    
    return scores
Example #26
def scoreGenePair(block1,block2):
    block1=list(block1)
    lenb1=len(block1)
    lenb2=len(block2)
    scores=[]

    patientsID_dictionnary=patientsID_dictionnary_b.value
    
    patientsID_split_index=patientsID_split_index_b.value
    
    if lenb1>0 and lenb2>0:
        for i in range(0,lenb1):
            for j in range(0,lenb2):
                listLoadBlock1=block1[i]
                listLoadBlock2=block2[j]
                if listLoadBlock1[0]>listLoadBlock2[0]:
                    genoSum=[int(x>0 and y>0) for x,y in zip(listLoadBlock1[1],listLoadBlock2[1])]
                    sumCase=float(sum([int(x>0) for x in genoSum[0:patientsID_split_index]]))
                    sumControl=float(sum([int(x>0) for x in genoSum[(patientsID_split_index):len(patientsID_dictionnary)]]))
        
                    ratioCase=sumCase/patientsID_split_index
                    ratioControl=sumControl/(len(patientsID_dictionnary)-patientsID_split_index)
        
                    score=ratioCase-ratioControl
                    pvalue=fisher_exact([[sumCase,patientsID_split_index-sumCase],[sumControl,len(patientsID_dictionnary)-patientsID_split_index]],'greater')[1]
        
                    if score>0:
                        scores.append(((listLoadBlock1[0],listLoadBlock2[0]),((listLoadBlock1[0],listLoadBlock2[0]),score,pvalue,ratioCase,ratioControl,sumCase,sumControl)))
    return scores
def chi_mode(data,depth,low=lcut,alpha=chi,f=freq):
    result=dict()
    plus=data['A'][0]+data['T'][0]+data['G'][0]+data['C'][0]
    minus=data['A'][1]+data['T'][1]+data['G'][1]+data['C'][1]
    for key in ['A','T','G','C']:
        if data[key][0] >= low[0]*depth and data[key][1] >=low[1]*depth:
            ndep=data[key][2]
            frequency=ndep/float(data['cover'])
            if frequency >= f:
#add chi square test:
                if frequency > 0.5:
                    result[key]=frequency
                else:
                    a=data[key][0]
                    b=data[key][1]
                    c=plus-data[key][0]
                    d=minus-data[key][1]

                    least = min(a, b, c, d)
                    table = [[a, b], [c, d]]

                    if least < 5:
                        pvalue=stats.fisher_exact(table)[1]
                    else:
                        pvalue=stats.chi2_contingency(table)[1]

                    if pvalue > alpha:
                        result[key]=frequency
    return result
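The `least < 5` branch applies the usual rule of thumb: fall back to Fisher's exact test when any cell of the 2x2 table is small. A standalone illustration with made-up counts:

from scipy import stats

table = [[3, 12], [18, 7]]    # smallest cell is 3 -> use Fisher's exact test
print(stats.fisher_exact(table)[1])
table = [[30, 12], [18, 70]]  # all cells >= 5 -> the chi-square approximation is acceptable
print(stats.chi2_contingency(table)[1])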
Example #28
File: results.py Project: Neurita/darwin
def get_confusion_matrix_fisher_significance(table, alternative='two-sided'):
    """
    Returns the value of fisher_exact test on table.

    Parameters
    ----------
    table : array_like of ints
        A 2x2 contingency table. Elements should be non-negative integers.

    alternative : {'two-sided', 'less', 'greater'}, optional
        Which alternative hypothesis to the null hypothesis the test uses.
        Default is 'two-sided'.

    Returns
    -------
    oddsratio : float
        This is prior odds ratio and not a posterior estimate.

    p_value : float
        P-value, the probability of obtaining a distribution at least as extreme
        as the one that was actually observed, assuming that the null hypothesis
        is true.
    """
    from scipy.stats import fisher_exact
    return fisher_exact(table, alternative)
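A minimal usage sketch; the table values are made up:

table = [[30, 10],
         [5, 25]]
oddsratio, p_value = get_confusion_matrix_fisher_significance(table, alternative='greater')
print(oddsratio, p_value)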
Example #29
def fisherExact(g1r1, g1r2, g2r1, g2r2, nTails=1):
    #returns a p-value
    #Params: group 1 result 1, group 1 result 2, etc.
    
    p = 0
    if nTails == 1:
        pGreater = stats.fisher_exact([[g1r1, g1r2], [g2r1, g2r2]], alternative='greater')[1]
        pLess = stats.fisher_exact([[g1r1, g1r2], [g2r1, g2r2]], alternative='less')[1]
        p = min(pLess, pGreater)
    else:
        # default is two tailed
        p = stats.fisher_exact([[g1r1, g1r2], [g2r1, g2r2]])[1]

    return p
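A quick comparison of the one-tailed branch against a direct two-sided call, with made-up counts and `stats` assumed to be `scipy.stats` as in the snippet:

from scipy import stats

print(fisherExact(8, 2, 1, 5))                  # one-tailed: the smaller of the 'greater'/'less' p-values
print(stats.fisher_exact([[8, 2], [1, 5]])[1])  # two-tailed, for reference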
Example #30
def check_diff(tl, bg, total_num = 100015): # total_num = 100015 with clusters, total_num = 37391 without them
	tl_mean = np.mean(tl)
	bg_mean = np.mean(bg)
	is_different = False
	# fisher_exact expects integer counts, so round the means before building the table
	oddsratio, pvalue = stats.fisher_exact([[round(tl_mean), total_num - round(tl_mean)], [round(bg_mean), total_num - round(bg_mean)]])
	if pvalue < 0.00005:
		is_different = True
	return is_different
Example #31
# Reconstructed head: the original listing starts mid-function, so the
# signature and the a/b loops below are inferred from the commented-out
# prints near the bottom of this snippet.
def contingency_matrix(subject, gene_set, all_genes):
    a, b, c, d = [], [], [], []
    for gene in gene_set:
        if gene in subject:
            a.append(gene)
    for gene in subject:
        if gene not in gene_set:
            b.append(gene)
    for gene in gene_set:
        if gene not in subject:
            c.append(gene)
    for gene in all_genes:
        if gene not in subject and gene not in gene_set:
            d.append(gene)
    return a, b, c, d


subject = create_subject()
gene_sets = create_gene_sets()
all_genes = create_all_genes()

experiments = []  # accumulated below; its initialisation was cut off in the original listing
for set_name in gene_sets:
    experiment = []
    gene_set = create_gene_set(set_name)
    title = gene_set[0]
    description = gene_set[1]
    genes = gene_set[2:]
    a, b, c, d = contingency_matrix(subject, genes, all_genes)
    experiment.append(title)  # 0
    experiment.append(description)  # 1
    experiment.append([[a, b], [c, d]])  # 2
    experiment.append([[len(a), len(b)], [len(c), len(d)]])  # 3
    oddsratio, p_value = fisher_exact(experiment[3])
    experiment.append(p_value)  # 4
    #print('Genes in subject and in gene set:', len(a))
    #print('Genes in subject and not in gene set:', len(b))
    #print('Genes not in subject and in gene set:', len(c))
    #print('Genes not in subject and not in gene set:', len(d))
    experiments.append(experiment)
Example #32
            len(disease_module_genelist), ",".join(disease_module_snplist),
            ",".join(disease_module_genelist)
        ]
    temp += [",".join(module_geneset)]  #module gene set
    fisher_test_matrix = [
        [
            len(disease_module_genelist),
            num_eGene - len(disease_module_genelist)
        ],
        [
            module_nodes.shape[0] - len(disease_module_genelist),
            num_gg_nodeset - num_eGene - module_nodes.shape[0] +
            len(disease_module_genelist)
        ]
    ]
    oddsratio, pvalue = stats.fisher_exact(fisher_test_matrix)
    #	temp += [pvalue, oddsratio, len(set(module_geneset) & PD_familial), ",".join(list(set(module_geneset) & PD_familial))]
    temp += [pvalue, oddsratio]
    print(label, ' : ', fisher_test_matrix)
    result = pd.concat([result,
                        pd.DataFrame([temp],
                                     columns=result.columns)])  #insert row

fdr = statsmodels.stats.multitest.multipletests(result['p-value'],
                                                method='fdr_bh',
                                                is_sorted=False)
result['FDR'] = fdr[1]

##save result
result.sort_values(['p-value'], ascending=[1], inplace=True)  #sort
result.insert(0, 'label', list(range(1, result.shape[0] + 1, 1)))  #insert col
Example #33
def fisher_enrich(sample,
                  annotations,
                  depletions=True,
                  background=None,
                  restrict=False,
                  min_fold=None,
                  min_overlap=None,
                  fdr=None):
    """
    Given a sample of elements ( members ), and an annotation mapping [term][member] ( nested dictionary )
    Compute significant enrichments among the members
    : If not specified, background is all annotated terms
    : If not specified, analysis is "open" ( terms can appear in sample/annotations not in background )
    : Default is not FDR corrected for multiple tests
    : Default is positive enrichment of 1.1+, pvalue < 0.05, and overlap must be at least two members
    """
    if not background:
        background = generate_background(annotations)
    if restrict:
        # restrict sample and annotation space to background; may result in empty annotations ( which are removed )
        sample = [k for k in sample if k in background]
        annotations = {
            term: {k: 1
                   for k in annotations[term] if k in background}
            for term in annotations
        }
        annotations = {
            term: dictMembers
            for term, dictMembers in annotations.items()
            if len(dictMembers) > 0
        }
    # calculate results ( enrichment stats for each term in the annotations map )
    results = []
    for term, members in annotations.items():
        # overlap between members with term and members of sample
        overlap = list(set(sample) & set(members))
        # counts
        count_overlap = len(overlap)
        count_background = len(background)
        count_sample = len(sample)
        count_term = len(members)
        count_sample_not_term = count_sample - count_overlap
        count_term_not_sample = count_term - count_overlap
        count_remainder = count_background - count_overlap - count_term_not_sample - count_sample_not_term
        # frequencies
        freq_sample = count_overlap / float(count_sample)
        freq_background = count_term / float(count_background)
        # fold enrichment
        fold_enrichment = freq_sample / freq_background
        # contingency table for fisher exact
        table = [[count_overlap, count_sample_not_term],
                 [count_term_not_sample, count_remainder]]
        # calculate pvalue and store results
        pvalue = fisher_exact(table)[1]  # [0] is an odds ratio; do not want
        results.append([term, len(overlap), fold_enrichment, pvalue])
    # convert pvalues to qvalues
    pvalues = [r[-1] for r in results]
    qvalues = pvalues2qvalues(pvalues)
    # attach qvalues
    for i, result in enumerate(results):
        results[i].append(qvalues[i])
    # filter results
    results2 = []
    for result in sorted(results, key=lambda x: x[-1]):
        include = True
        term, overlap, fe, pvalue, qvalue = result
        if min_overlap is not None and overlap < min_overlap:
            include = False
        if min_fold is not None and 1 / float(min_fold) < fe < min_fold:
            include = False
        if not depletions and fe < 1:
            include = False
        if fdr is not None and qvalue > fdr:
            include = False
        if include:
            results2.append(result)
    return results2
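`generate_background` and `pvalues2qvalues` come from the surrounding module and are not shown. For `generate_background`, a minimal version consistent with how it is used here (purely an assumption) could be:

def generate_background(annotations):
    # union of every annotated member across all terms
    background = set()
    for members in annotations.values():
        background.update(members)
    return background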
Example #34
def np_fisher(a, b, c, d):
    oddsratio, pvalue = stats.fisher_exact([[a, b], [c, d]])
    return oddsratio, pvalue
Example #35
def do_fisher_exact(x):
	return fisher_exact([[x[0], x[1]], [x[2], x[3]]], alternative='greater')
def mutations_association(args):
    # position to handle (will process interval of 250 positions ahead)
    input_x = args.position

    freqs = pd.read_csv(args.freqs_file, sep="\t")
    freqs = freqs[freqs['Pos'] == np.round(
        freqs['Pos']
    )]  #remove insertions #TODO- maoz- why not simply ref != '-'
    if (input_x < freqs["Pos"].min()) or (input_x > freqs["Pos"].max()):
        sys.exit()

    # blast files (all .fasta.blast files joined together)
    all_mappings = pd.read_csv(args.blast_output,
                               names=["read_id", "start", "end"],
                               sep="\t")
    # summary of all observed mutations from ref, including mappings to origin reads
    all_mutations = pd.read_csv(
        args.mutations_all,
        names=["pos", "read_id", "mutant", "read_positions"],
        sep="\t"
    )  #TODO- what mutations are included in mutations_all? is there a threshold?

    cons = freqs[(freqs["Rank"] == 0) & (freqs["Base"] != "-")]
    cons.insert(0, "pos", pd.to_numeric(cons.loc[:, "Pos"]))

    all_mutations = pd.merge(all_mutations, cons[["pos", "Ref"]],
                             on="pos")  # adding Ref\Cons to all_mutations
    #remove C>A and G>T
    #all_mutations = all_mutations[~(((all_mutations["Ref"]=="C")&(all_mutations["mutant"]=="A")) | ((all_mutations["Ref"]=="G")&(all_mutations["mutant"]=="T")))]

    #variants=all_mutations["pos"].unique()

    variants_combinations = range(input_x + 1, input_x +
                                  2)  # x-> (x+1,x+2) instead of (x+1,x+250)

    for y in variants_combinations:
        #x=pair[0]
        x = input_x
        #y=pair[1]
        maps_for_two_pos = all_mappings[(all_mappings["start"] <= x) & (
            all_mappings["end"] >= y)]  # reads surrounding the [x,y] interval
        merge_read_id = pd.DataFrame(
            {"read_id": maps_for_two_pos["read_id"].unique()})
        merge_x = all_mutations[all_mutations["pos"] == x][["pos", "read_id"]]
        merged = pd.merge(merge_read_id, merge_x, on="read_id", how="left")
        merge_y = all_mutations[all_mutations["pos"] == y][["pos", "read_id"]]
        merged = pd.merge(merged, merge_y, on="read_id", how="left")

        x_label = "pos_" + str(x)
        y_label = "pos_" + str(y)
        merged[x_label] = np.where(merged["pos_x"] == x, 1, 0)
        merged[y_label] = np.where(merged["pos_y"] == y, 1, 0)
        ct = pd.crosstab(merged[x_label], merged[y_label])
        if ct.shape == (2, 2):
            fisher_test = fisher_exact(
                ct, alternative='greater')  ## TODO- review fisher's test
            print('\t'.join([
                str(x) for x in [
                    x, y, fisher_test[0], fisher_test[1], ct[1][1] * 1.0 /
                    (ct[0][0] + ct[0][1] + ct[1][0] + ct[1][1])
                ]
            ]))
        else:
            print('\t'.join([
                str(x) for x in [x, y, 0.0, 1.0, 0.0]
            ]))  # statistic ('odds ratio'), p-value, *shared_freq*
Example #37
# Reconstructed head: the original listing starts mid-function; the signature
# and the countid initialisation are inferred from the call site below.
def return_sums(intid, dic):
    countid = 0
    counttot = 0
    for i in dic:
        countid += dic[i][intid][0]
        counttot += dic[i][intid][1]
    return [countid, counttot]


## runs tests and produces list of pvals corresponding to ids list
## cor_pval is a list of  pvals corresponding to ids list corrected for multiple tests

pval = []
tests = {}

for i in ids:
    test = [pm_dict[i], return_sums(i, names)]
    odds, p = stats.fisher_exact(test)  # run the test once instead of twice
    tests[i] = [p, odds]
    pval.append(tests[i][0])

cor_pval = pcorrect.multipletests(pval, alpha=0.05, method='fdr_bh')[1]

species = []

for i in range(1, len(sys.argv) - 1):
    species.append(sys.argv[i].split('/')[-1][:3])

species.remove('Pma')

outfile.write('InterPro ID\tPma\t' + '\t'.join(species) +
              '\tpvalue\tcorrected P value\tup/down ratio\n')

st = 0
Example #38
ref = items[2].upper()
depth = int(items[3])
if depth > 0:
    match = items[4]
    quality = items[5]
    count, pos_n, in_base, del_base = translate_bases(ref, depth, match)
print(count)
print(pos_n)
print(in_base)
print(del_base)
num_ref = count[gt_ref] + count[gt_ref.lower()]
num_alt = count[gt_alt] + count[gt_alt.lower()]
print(num_ref, num_alt)
ci_lower, ci_upper = wilson_binom_interval(num_alt,
                                           num_alt + num_ref,
                                           alpha=0.05)
print(ci_lower, ci_upper)

import scipy.stats as stats
num_ref_for = count[gt_ref]
num_ref_rev = count[gt_ref.lower()]
num_alt_for = count[gt_alt]
num_alt_rev = count[gt_alt.lower()]
oddsratio, pvalue = stats.fisher_exact([[num_ref_for, num_ref_rev],
                                        [num_alt_for, num_alt_rev]])
print(oddsratio, pvalue)
quality_ref, quality_alt = translate_qualities(gt_ref, gt_alt, pos_n, quality)
print(quality_ref, quality_alt)
scores = compute_table(gt_ref, gt_alt, quality_ref, quality_alt)
print(scores)
# the first three lists below are appended to in the loop but their
# initialisations were cut off in the original listing
p_values_good_bad = []
odds_good_bad = []
p_val_good = []
odds_good = []
p_val_bad = []
odds_bad = []
p_val_pre = []
odds_pre = []
genes_listy = []
for row in matrix_gene_counts:
    row = list(np.array(row).reshape(-1, ))
    genes_listy.append(row[0])
    absent_good = good_total - int(
        row[1])  #total_good_indiduals - individuals with variant gene present
    absent_bad = bad_total - int(row[2])
    absent_pre = pre_total - int(row[3])
    absent_control = control_total - int(row[4])
    oddsratio, pvalue = stats.fisher_exact(
        [[int(row[1]), int(row[2])], [absent_good, absent_bad]]
    )  #, alternative='less') #0 = gene name, 1=good, 2=bad, 3=pre-chemo, 4=control; alternative can be 'less' or 'greater'
    p_values_good_bad.append(pvalue)
    odds_good_bad.append(oddsratio)
    oddsratio, pvalue = stats.fisher_exact(
        [[int(row[1]), int(row[4])], [absent_good, absent_control]]
    )  #, alternative='less') #0 = gene name, 1=good, 2=bad, 3=pre-chemo, 4=control
    p_val_good.append(pvalue)
    odds_good.append(oddsratio)
    oddsratio, pvalue = stats.fisher_exact(
        [[int(row[2]), int(row[4])], [absent_bad, absent_control]]
    )  #, alternative='less') #0 = gene name, 1=good, 2=bad, 3=pre-chemo, 4=control
    p_val_bad.append(pvalue)
    odds_bad.append(oddsratio)
    oddsratio, pvalue = stats.fisher_exact(
        [[int(row[3]), int(row[4])], [absent_pre, absent_control]]
    )  #0 = gene name, 1=good, 2=bad, 3=pre-chemo, 4=control
    p_val_pre.append(pvalue)
    odds_pre.append(oddsratio)
Example #40
def producer(queue, arrs):
    for comAr in arrs:
        com, ar = comAr
        p = fisher_exact(ar, 'greater')[1]  # keep only the p-value; fisher_exact returns (odds ratio, p-value)
        queue.put([com, ar[0, 0], p])
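A minimal driver for producer; everything here is illustrative, and the real caller presumably runs it in a worker process:

import multiprocessing as mp

import numpy as np
from scipy.stats import fisher_exact

queue = mp.Queue()
arrs = [('motifA', np.array([[12, 3], [5, 20]]))]
producer(queue, arrs)
print(queue.get())  # ['motifA', 12, <p-value>]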
Example #41
chromosome_length=np.loadtxt(open("../Data/chromosome_file.txt",'r'),delimiter="\t",dtype=str)
chr_number=np.loadtxt(open("../Data/chr_distribution_number.txt",'r'),delimiter="\t",dtype=str,skiprows=1)

chr_length_list=chromosome_length[:,2].astype(float)
variant_number_list=chr_number[:,3].astype(float) # 1-pos 2-neg 3-all

chr_length_total=np.sum(chr_length_list,axis=0)
variant_number_total=np.sum(variant_number_list,axis=0)

p_value_g_list=[]
p_value_l_list=[]

for x in range(24):
    a=variant_number_list[x]
    b=chr_length_list[x]-a
    c=variant_number_total-a
    d=chr_length_total-variant_number_total-b
    pvalue_greater=fisher_exact([[a,b],[c,d]],alternative="greater")[1]
    pvalue_less=fisher_exact([[a,b],[c,d]],alternative="less")[1]
    p_value_g_list.append(pvalue_greater)
    p_value_l_list.append(pvalue_less)

q_value_g_list=bh_qvalues(p_value_g_list)
q_value_l_list=bh_qvalues(p_value_l_list)

out=[]
out.append(["chromosome","variant number","p-value-g","FDR-g","p-value-l","FDR-l"])
for x in range(24):
    out.append([chr_number[x][0],chr_number[x][3],p_value_g_list[x],q_value_g_list[x],p_value_l_list[x],q_value_l_list[x]])
np.savetxt("all_distribution.txt",np.array(out),delimiter="\t",fmt="%s")
print("Finished.")
Example #42
def fishers_exact(dataset: Dataset, predictions, combined_data: CombinedData):
    assert (len(combined_data.vars) == 2)

    # Compute the contingency table
    xs = combined_data.get_explanatory_variables()
    ys = combined_data.get_explained_variables()
    assert (len(xs) == 1)
    assert (len(ys) == 1)

    x = xs[0]
    y = ys[0]

    # Get the count for each category
    x_cat = [k for k, v in x.metadata[categories].items()]
    y_cat = [k for k, v in y.metadata[categories].items()]

    contingency_table = []
    contingency_table_key = []  # labels for the order in which data is stored in the data array (defined above)

    for xc in x_cat:
        table_row = []
        table_row_key = []
        for yc in y_cat:
            data = dataset.select(y.metadata[name],
                                  where=[
                                      f"{x.metadata[name]} == '{xc}'",
                                      f"{y.metadata[name]} == '{yc}'"
                                  ])
            table_row.append(len(data))

            x_y_key = str(x.metadata[name]) + ':' + str(xc) + ' by ' + str(
                y.metadata[name]) + ':' + str(yc)
            table_row_key.append(x_y_key)

        assert (len(table_row_key) == len(table_row))
        assert (len(table_row) == len(y_cat))
        contingency_table.append(table_row)
        contingency_table_key.append(table_row_key)

    # odds_ratio, p_value = stats.fisher_exact(contingency_table, alternative='two-sided')
    # return FishersResult(odds_ratio, p_value)

    if predictions:
        if isinstance(predictions[0], list):
            prediction = predictions[0][0]
        else:
            prediction = predictions[0]
    else:
        prediction = None
    odds_ratio, p_val = stats.fisher_exact(contingency_table,
                                           alternative='two-sided')
    dof = None
    test_result = TestResult(name=fisher_exact_name,
                             test_statistic=odds_ratio,
                             p_value=p_val,
                             prediction=prediction,
                             dof=dof,
                             alpha=combined_data.alpha,
                             x=x,
                             y=y)

    return test_result
Example #43
def gse_marker_handle(gse_data,
                      organism,
                      cell_marker_dict,
                      odds_ratio_threshold=2,
                      p_value_threshold=0.01,
                      method='greater'):
    """Fisher exact text for every cluster"""

    assert method in {'two-sided', 'less', 'greater'}
    all_gse_data = gse_data
    for count, gse in enumerate(all_gse_data, 1):
        marker_genes_file = os.path.join(gse, 'marker_genes.csv')
        if os.path.isdir(gse) and not os.path.isfile(marker_genes_file):
            text = f'Missing: {marker_genes_file}!'
            print(text)
        else:
            if organism not in cell_marker_dict:
                text = f'{gse}: no marker genes found for {organism} in cell_marker!'
                print(text)
                continue

            text = f'Handling: {gse} {organism} ({count}/{len(all_gse_data)})'
            print(text)
            with open(marker_genes_file, 'r', encoding='utf8') as f:
                marker_genes_data = pd.read_csv(f, sep=',')

            item_list = []
            all_marker = cell_marker_dict['all'][organism]  # all marker
            for cluster, data in marker_genes_data.groupby('cluster'):
                cluster_marker = set(
                    data['gene']) & all_marker  # marker in one cluster
                n_all_marker = len(all_marker)
                n_cluster_marker = len(cluster_marker)
                if n_cluster_marker == 0:
                    continue
                cluster_marker_prop = n_cluster_marker / n_all_marker  # proportion of cluster marker in all marker
                for cell_type, cell_type_marker in cell_marker_dict[
                        organism].items():
                    n_cell_type_marker = len(
                        cell_type_marker)  # marker in one cell type
                    # expected hit in random condition
                    n_expected_hit = cluster_marker_prop * n_cell_type_marker
                    hit_genes = cluster_marker & cell_type_marker
                    n_hit = len(hit_genes)
                    odds_ratio = n_hit / n_expected_hit
                    if odds_ratio > odds_ratio_threshold:
                        n_non_hit_cell_type_marker = n_cell_type_marker - n_hit
                        n_non_hit_cluster_marker = n_cluster_marker - n_hit
                        n_other_marker = n_all_marker - n_hit - n_non_hit_cell_type_marker - n_non_hit_cluster_marker
                        table = [[n_other_marker, n_non_hit_cell_type_marker],
                                 [n_non_hit_cluster_marker, n_hit]]
                        p_value = stats.fisher_exact(table, method)[1]
                        if p_value < p_value_threshold:
                            item = [
                                cluster, organism, cell_type[0], cell_type[1],
                                n_all_marker, n_cluster_marker,
                                n_cell_type_marker, n_hit, n_expected_hit,
                                odds_ratio, p_value, '|'.join(hit_genes)
                            ]
                            item_list.append(item)
            if item_list:
                item_data = pd.DataFrame(item_list)
                columns = [
                    'cluster', 'organism', 'tissueType', 'cellName',
                    'n_all_marker', 'n_cluster_marker', 'n_cell_type_marker',
                    'n_hit', 'n_expected_hit', 'odds_ratio', 'p_value', 'hits'
                ]
                item_data.columns = columns
                item_data.sort_values(by=['cluster', 'p_value'], inplace=True)

                cells_type_file = os.path.join(gse, 'cells_type.csv')
                with open(cells_type_file, 'w', encoding='utf8') as f:
                    item_data.to_csv(f, index=False)
                text = f'Finished: {gse}'
                print(text)
            else:
                text = f'No cluster could be assigned to a cell type: {gse}!'
                print(text)
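
To make the 2x2 layout above concrete, here is a toy run with hypothetical marker counts; note that the four cells sum to n_all_marker:

from scipy import stats

# hypothetical counts for one cluster / cell type pair
n_all_marker, n_cluster_marker, n_cell_type_marker, n_hit = 500, 40, 25, 8
n_non_hit_cell_type_marker = n_cell_type_marker - n_hit  # 17
n_non_hit_cluster_marker = n_cluster_marker - n_hit      # 32
n_other_marker = n_all_marker - n_hit - n_non_hit_cell_type_marker - n_non_hit_cluster_marker  # 443
table = [[n_other_marker, n_non_hit_cell_type_marker],
         [n_non_hit_cluster_marker, n_hit]]
print(stats.fisher_exact(table, 'greater'))  # (odds ratio, p-value)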
Example #44
def rfet(container,
         relative_risk=1,
         min_events=1,
         decision_metric='fdr',
         decision_thres=0.05,
         mid_pval=False,
         expected_method='mantel-haentzel',
         method_alpha=1):
    '''
    Calculate disproportionality signal scores using the reporting
    Fisher's exact test (RFET).

    Arguments:
        container: A DataContainer object produced by the convert()
                    function from data_prep.py

        relative_risk (int/float): The relative risk value

        min_events: The min number of AE reports to be considered a signal

        decision_metric (str): The metric used for detecting signals:
                            {fdr = false discovery rate,
                            signals = number of signals,
                            rank = ranking statistic}

        decision_thres (float): The minimum threshold value for the decision_metric

        mid_pval (bool): Whether to apply the mid-p-value correction

        expected_method: The method of calculating the expected counts for
                        the disproportionality analysis.

        method_alpha: If the expected_method is negative-binomial, this
                    parameter is the alpha parameter of the distribution.

    '''
    DATA = container.data
    N = container.N

    if min_events > 1:
        DATA = DATA[DATA.events >= min_events]

    n11 = np.asarray(DATA['events'], dtype=np.float64)
    n1j = np.asarray(DATA['product_aes'], dtype=np.float64)
    ni1 = np.asarray(DATA['count_across_brands'], dtype=np.float64)
    num_cell = len(n11)
    expected = calculate_expected(N, n1j, ni1, n11, expected_method,
                                  method_alpha)

    n10 = n1j - n11
    n01 = ni1 - n11 + 1e-7
    n00 = N - (n11 + n10 + n01)

    log_rfet = np.log(n11 * n00 / (n10 * n01))
    pval_fish_uni = np.empty((num_cell))
    for p in range(num_cell):
        table = [[n11[p], n10[p]], [n01[p], n00[p]]]
        pval_fish_uni[p] = fisher_exact(table, alternative='greater')[1]

    if mid_pval:
        for p in range(num_cell):
            pval_fish_uni[p] = (pval_fish_uni[p] - .5 * hypergeom.pmf(
                n11[p], n11[p] + n10[p], n11[p] + n01[p], n10[p] + n00[p]))

    pval_uni = pval_fish_uni
    pval_uni[pval_uni > 1] = 1
    pval_uni[pval_uni < 0] = 0
    RankStat = pval_uni

    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        results = lbe(2 * np.minimum(pval_uni, 1 - pval_uni))
    pi_c = results[1]
    fdr = (pi_c * np.sort(pval_uni[pval_uni <= .5]) /
           (np.arange(1, (pval_uni <= .5).sum() + 1) / num_cell))

    fdr = np.concatenate((fdr, (pi_c / (2 * np.arange(
        ((pval_uni <= .5).sum()), num_cell) / num_cell) + 1 -
                                (pval_uni <= .5).sum() / np.arange(
                                    (pval_uni <= .5).sum(), num_cell))),
                         axis=None)

    FDR = np.minimum(fdr, np.ones((len(fdr), )))

    if decision_metric == 'fdr':
        num_signals = (FDR <= decision_thres).sum()
    elif decision_metric == 'signals':
        num_signals = min(decision_thres, num_cell)
    elif decision_metric == 'rank':
        num_signals = (RankStat <= decision_thres).sum()

    RC = Container()
    RC.all_signals = pd.DataFrame(
        {
            'Product': DATA['product_name'].values,
            'Adverse Event': DATA['ae_name'].values,
            'Count': n11,
            'Expected Count': expected,
            'p_value': RankStat,
            'PRR': np.exp(log_rfet),
            'product margin': n1j,
            'event margin': ni1,
            'FDR': FDR
        },
        index=np.arange(len(n11))).sort_values(by=['p_value'])

    RC.signals = RC.all_signals.iloc[0:num_signals, ]
    RC.num_signals = num_signals
    return RC
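
The mid_pval option above subtracts half the point probability of the observed table from the one-sided p-value. A standalone sketch with hypothetical counts, using scipy's documented hypergeom (M, n, N) parameterization:

from scipy.stats import fisher_exact, hypergeom

n11, n10, n01, n00 = 12, 30, 8, 950  # hypothetical 2x2 counts
p = fisher_exact([[n11, n10], [n01, n00]], alternative='greater')[1]
# point mass of the observed n11 given the fixed table margins
point = hypergeom.pmf(n11, n11 + n10 + n01 + n00, n11 + n10, n11 + n01)
mid_p = p - 0.5 * point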
Example #45
                if oro3p[1] == 0:
                    continue
                # if ctable[0][1] > ctable[0][0]:  # this junction has more counts in the mutant
                totalnumsites += 1
                if chrom[0] == '-':
                    closest_annotated = find_wiggle(threeprime1,
                                                    annotmin,
                                                    maxdist=150)
                    if closest_annotated == 150:
                        nocanonical += 1
                    diff = min(closest_annotated,
                               int(oro3p[0]) - int(threeprime1))  # always set diff so the append below never hits a NameError

                    entries += [[
                        chrom[1:], threeprime1, fiveprime,
                        sps.fisher_exact(ctable)[1], chrom[0]
                    ] + ctable[0] + ctable[1] + [name] + [diff] + [oro3p[0]]]
                    # writer.writerow([chrom[1:], threprime1, fiveprime, sps.fisher_exact(ctable)[1],
                    # chrom[0]] + ctable[0] + ctable[1] + [name] + [int(oro3p[0])-int(threeprime1)] + [oro3p[0]])
                else:
                    closest_annotated = find_wiggle(threeprime1,
                                                    annotpos,
                                                    maxdist=150)
                    if closest_annotated == 150:
                        nocanonical += 1
                    diff = min(closest_annotated,
                               int(threeprime1) - int(oro3p[0]))  # always set diff so the append below never hits a NameError
                    entries += [[
                        chrom[1:], fiveprime, threeprime1,
                        sps.fisher_exact(ctable)[1], chrom[0]
                    ] + ctable[0] + ctable[1] + [name] + [diff] + [oro3p[0]]]
Example #46

def bar_plot_enrichment(vals, xticklabels, figname):
    a1, a2, b1, b2, c1, c2 = vals
    plt.figure(figsize=(2.6, 2.6))
    plot_positions = [1, 2, 3]
    width = 0.6
    plot_vals = [100 * a2 / a1, 100 * b2 / b1, 100 * c2 / c1]
    p = plt.bar(plot_positions,
                plot_vals,
                width=width,
                color=['silver', 'k', 'firebrick'])
    ###
    s, p = stats.fisher_exact([[a1 - a2, a2], [b1 - b2, b2]])
    print('Up:', s, p)
    p_label = '{:.1e}'.format(p)
    if p_label[-2] == '0':
        p_label = p_label[:-2] + p_label[-1]
    star_mark = ""  # default so plt.text below never sees an undefined name
    if p < 0.05:
        star_mark = "*"
        if p < 0.001:
            star_mark = "**"
    plt.text(plot_positions[1],
             plot_vals[1] * 1.02,
             star_mark,
             ha='center',
             fontsize=16)

    ###
    s, p = stats.fisher_exact([[a1 - a2, a2], [c1 - c2, c2]])
    print('CTCF Up:', s, p)
    p_label = '{:.1e}'.format(p)
    if p_label[-2] == '0':
        p_label = p_label[:-2] + p_label[-1]
    star_mark = ""  # default so plt.text below never sees an undefined name
    if p < 0.05:
        star_mark = "*"
        if p < 0.001:
            star_mark = "**"

    plt.text(plot_positions[2],
             plot_vals[2] * 1.02,
             star_mark,
             ha='center',
             fontsize=16)
    plt.gca().set_xticks(plot_positions)
    plt.gca().set_xticklabels(xticklabels,
                              rotation=25,
                              ha='right',
                              fontsize=15)
    #     sns.despine(offset=None, trim=False)
    plt.ylabel('Down-regulated \n genes in shCTCF (%)', fontsize=16)
    plt.xlim([0.35, 3.6])
    plt.ylim([0, 46])
    plt.gca().tick_params(axis='x',
                          direction='out',
                          length=3,
                          width=.8,
                          colors='black')
    plt.gca().tick_params(axis='y',
                          direction='out',
                          length=3,
                          width=.8,
                          colors='black')
    plt.savefig(figname,
                bbox_inches='tight',
                pad_inches=0.02,
                dpi=600,
                transparent=True)
    plt.close()
Example #47
def pval_fisher(tbl, *args):
  return stats.fisher_exact(tbl,*args)[1]
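
A hypothetical call; extra positional arguments are forwarded to scipy, so the alternative can be passed through (assumes scipy.stats is imported as stats):

p = pval_fisher([[8, 2], [1, 9]], 'greater')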
Example #48
inner = list()
outer = list()
p_list = list()
df1 = genetic_df[genetic_df['c1'] == genetic_df['c2']]
genetic_inner = df1.shape[0]
genetic_outer = genetic_df.shape[0] - genetic_inner

df1 = snp_df[snp_df['c1'] == snp_df['c2']]
a = df1.shape[0]
b = snp_df.shape[0] - a
inner.append(float(a) / genetic_inner)
outer.append(float(b) / genetic_outer)
c = genetic_inner - a
d = genetic_df.shape[0] - a - b - c
[odds, p] = fisher_exact([[a, b], [c, d]])
print([odds, p])
p_list.append(p)

df1 = gene_df.loc[gene_df['c1'] == gene_df['c2']]
a = df1.shape[0]
b = gene_df.shape[0] - a
inner.append(float(a) / genetic_inner)
outer.append(float(b) / genetic_outer)
c = genetic_inner - a
d = genetic_df.shape[0] - a - b - c
[odds, p] = fisher_exact([[a, b], [c, d]])
print([odds, p])
p_list.append(p)

df1 = ppi_df.loc[ppi_df['c1'] == ppi_df['c2']]
Example #49
async def enrichment(bedfile: pd.DataFrame,
                     bedcol=8,
                     groups=None,
                     correct=True,
                     okpval=10**-3):
    """
	compute pairwise enrichment and correlation for a set of peaks mappe to a conscensus 

	with each columns after the 7th one representing the signal of a given ChIP experiment
	over this conscensus. will present enrichment of row values in col values

	Args:
	----
		bedfile: df bed-like representing a conscensus set of peaks, and a set of values/foldchanges
			over it
		bedcol: int col where the bed information ends and signal information columns start
		correct: bool whether to correct for multiple hypothesis testing or not
		docorrelation: bool whether or not to compute correlation as well as enrichment
		okpval: float max pvalue over which to set the enrichment to 0

	Returns:
	-------
		a dataframe[values_name x values_name] of enrichment of row values in col values
		a dataframe[values_name x values_name] of correlation of values signal over the overlaps
	"""
    dat = bedfile[bedfile.columns[bedcol:]].values
    prob = dat.astype(bool).sum(0) / len(dat)
    enrichment = np.zeros(
        (dat.shape[1] if groups is None else len(set(groups)), dat.shape[1]))
    pvals = np.zeros(
        (dat.shape[1] if groups is None else len(set(groups)), dat.shape[1]))
    if groups is not None:
        for i in set(groups):
            overlapping = dat[groups == i]
            for j, val in enumerate(overlapping.T):
                # enrichment of j in i
                e, p = fisher_exact(
                    [[len(val[val != 0]),
                      len(val[val == 0])],
                     [prob[j] * len(dat), (1 - prob[j]) * len(dat)]])
                enrichment[i, j] = np.log2(e)
                pvals[i, j] = p
    else:
        for i, col in enumerate(dat.T):
            overlapping = np.delete(dat, i, axis=1)[col != 0]
            col = col[col != 0]
            add = 0
            for j, val in enumerate(overlapping.T):
                if j == i:
                    add = 1
                    enrichment[i, i] = 0
                e, p = fisher_exact([[len(val[val != 0]),
                                      len(val[val == 0])],
                                     [
                                         prob[j + add] * len(dat),
                                         (1 - prob[j + add]) * len(dat)
                                     ]])
                enrichment[i, j + add] = np.log2(e)
                pvals[i, j + add] = p
        enrichment[i, i] = 0
    enrichment = pd.DataFrame(
        data=enrichment,
        index=bedfile.columns[bedcol:] if groups is None else set(groups),
        columns=bedfile.columns[bedcol:]).T
    enrichment[enrichment == -np.inf] = -1000
    enrichment[enrichment.isna()] = 0
    enrichment[enrichment == np.inf] = 1000
    if correct:
        pvals = np.reshape(
            multipletests(pvals.ravel(), 0.1, method="bonferroni")[1],
            pvals.shape)
    pvals = pd.DataFrame(
        data=pvals,
        index=bedfile.columns[bedcol:] if groups is None else set(groups),
        columns=bedfile.columns[bedcol:]).T
    enrichment[pvals > okpval] = 0
    return enrichment, pvals
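
Since enrichment is a coroutine, callers have to drive it with an event loop. A toy invocation with an invented three-column bed frame plus two signal columns (column names hypothetical):

import asyncio
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
toy = pd.DataFrame({
    "chrom": ["chr1"] * 100,
    "start": np.arange(100),
    "end": np.arange(1, 101),
    "chip_A": rng.integers(0, 2, 100),  # binary presence/absence signal
    "chip_B": rng.integers(0, 2, 100),
})
enr, pv = asyncio.run(enrichment(toy, bedcol=3, correct=False))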
Example #50

for seg_index in breaks_freq.seg_index.unique():

    a = breaks_freq.loc[breaks_freq.seg_index == seg_index,
                        "enh_id"].to_list()[0]  # num simple enhancers
    b = enh_totals - a  # num complex enhancers
    c = total_shuf_breaks.loc[total_shuf_breaks.seg_index == seg_index,
                              "enh_id"].to_list()
    if len(c) > 0:
        c = c[0]
    else:
        c = 0  # num simple shuffle
    d = total_shuf_breaks.enh_id.sum() - c  # num complex shuffle

    obs = [[a, b], [c, d]]
    OR, P = stats.fisher_exact(obs)
    table = sm.stats.Table2x2(obs)  # get confidence interval
    odds_ci = table.oddsratio_confint()
    newdf = pd.DataFrame({
        "seg_index": [seg_index],
        "a": [a],
        "b": [b],
        "c": [c],
        "d": [d],
        "OR": [OR],
        "P": [P],
        "ci_lower": [odds_ci[0]],
        "ci_upper": [odds_ci[1]]
    })
    OR_dict[seg_index] = newdf
    #print(seg_index, obs, OR, P)
Example #51
def call_differential_editing_sites(config_file):
    stability_value = 0.03  #value below which you may use a lower coverage for adding more samples to increase power
    min_disease_people = 5  #min number of disease people supporting a higher coverage on which stability measurements may be based
    min_control_people = 5  #min number of control people supporting a higher coverage on which stability measurements may be based
    min_disease_people_5_cov = 10  #min number of disease people at 5x coverage you must have if needing to use unstable 5x coverage
    min_control_people_5_cov = 10  #min number of control people at 5x coverage you must have if needing to use unstable 5x coverage
    editing_file = './temp.csv'
    output_file = './editing_sites.with_stats_converted_disease.csv'
    #read in files
    editing_table = pd.read_csv(editing_file, sep='\t')
    #config_table = pd.read_csv(config_file,sep=',',header=None)
    config_table = pd.read_csv(config_file, sep=',', skiprows=1, header=None)
    all_people = config_table[0]
    disease_people = config_table[0][config_table[1] == "DIS"].reset_index(
        drop=True)  #TODO Change to disease!!!
    control_people = config_table[0][config_table[1] == "CTRL"].reset_index(
        drop=True)  #TODO Change to control!!!

    #now get just an editing table and coverage table
    edit_level_table = editing_table[all_people]

    #edit_level_table = editing_table[np.r_[all_people]]

    def get_editing_levels_for_cov_table(i):
        info = i.astype(str).str.split(pat="\\^")
        editing_levels = info.apply(lambda x: float('nan')
                                    if x[0] == "nan" else x[2])
        return editing_levels

    cov_table = edit_level_table.apply(get_editing_levels_for_cov_table)
    cov_table = cov_table.apply(lambda x: pd.to_numeric(
        x))  #TODO check if as.numeric and pandas to_numeric do the same.

    def get_editing_levels(i):
        info = i.astype(str).str.split(pat="\\^")
        editing_levels = info.apply(lambda x: float('nan')
                                    if x[0] == "nan" else x[0])
        return editing_levels

    edit_level_table = edit_level_table.apply(get_editing_levels)
    edit_level_table = edit_level_table.apply(
        lambda x: pd.to_numeric(x))  #TODO check precision on R and python

    #go down line by line and get the prevalence info and mean editing levels based off of stable coverages
    #WARNING I'm using float here, not integer allowing NaN values. Is ok?
    coverage_threshold_used = np.repeat(
        0., edit_level_table.shape[0]
    )  #will hold the coverage threshold required for this editing site
    stability_based_on = np.repeat(
        0., edit_level_table.shape[0]
    )  #will hold what coverage stability requirements were determined
    stable_mean_disease_editing_level = np.repeat(
        0., edit_level_table.shape[0]
    )  #mean autistic editing level using individuals passing coverage threshold
    stable_std_dev_disease_editing_level = np.repeat(
        0., edit_level_table.shape[0]
    )  #standard deviation of autistic editing level using individuals passing coverage threshold
    stable_mean_control_editing_level = np.repeat(
        0., edit_level_table.shape[0]
    )  #mean control editing level using individuals passing coverage threshold
    stable_std_dev_control_editing_level = np.repeat(
        0., edit_level_table.shape[0]
    )  #standard deviation of control editing level using individuals passing coverage threshold
    stable_number_disease_with_at_least_min_coverage = np.repeat(
        0., edit_level_table.shape[0]
    )  #number of autistic individuals passing the coverage threshold
    stable_number_disease_nonzero_editing_and_min_coverage = np.repeat(
        0., edit_level_table.shape[0]
    )  #number of autistic individuals without non zero editing level and passing coverage threshold
    stable_disease_prevalence = np.repeat(
        0., edit_level_table.shape[0]
    )  #proportion autistic individuals with nonzero editing
    stable_number_control_with_at_least_min_coverage = np.repeat(
        0.,
        edit_level_table.shape[0])  #same as disease but for control subjects
    stable_number_control_nonzero_editing_and_min_coverage = np.repeat(
        0., edit_level_table.shape[0])
    stable_control_prevalence = np.repeat(0., edit_level_table.shape[0])
    stable_total_number_individuals_nonzero_editing_and_min_coverage = np.repeat(
        0., edit_level_table.shape[0]
    )  #total number of disease and control subjects passing the coverage threshold and having nonzero editing level
    stable_mann_whitney_p_value = np.repeat(
        0., edit_level_table.shape[0]
    )  #wilcoxon rank sum test p value using individuals passing the coverage threshold
    stable_editing_level_effect_size = np.repeat(
        0., edit_level_table.shape[0]
    )  #difference between mean disease and mean control
    stable_frequency_fishers_p_value = np.repeat(
        0., edit_level_table.shape[0]
    )  #prevalence p value determined using two-tailed fisher's exact test
    stable_frequency_OR = np.repeat(
        0., edit_level_table.shape[0])  #odds ratio of the fisher's exact test
    stable_prevalence_effect_size = np.repeat(
        0., edit_level_table.shape[0]
    )  #difference in editing level prevalences between disease and control subjects
    #WARNING those are np arrays.

    for i in range(0, edit_level_table.shape[0]):
        print(i)  # keep track of progress
        disease_edit_row = edit_level_table.loc[i, disease_people]
        control_edit_row = edit_level_table.loc[i, control_people]
        disease_cov_row = cov_table.loc[i, disease_people]
        control_cov_row = cov_table.loc[i, control_people]
        #find what coverage we can base stability off of
        number_disease_20_cov = disease_cov_row[disease_cov_row >= 20].count()
        number_control_20_cov = control_cov_row[control_cov_row >= 20].count()
        number_disease_15_cov = disease_cov_row[disease_cov_row >= 15].count()
        number_control_15_cov = control_cov_row[control_cov_row >= 15].count()
        number_disease_10_cov = disease_cov_row[disease_cov_row >= 10].count()
        number_control_10_cov = control_cov_row[control_cov_row >= 10].count()
        number_disease_5_cov = disease_cov_row[disease_cov_row >= 5].count()
        number_control_5_cov = control_cov_row[control_cov_row >= 5].count()
        if number_disease_20_cov >= min_disease_people and number_control_20_cov >= min_control_people:
            stability_based_on[i] = 20
        elif number_disease_15_cov >= min_disease_people and number_control_15_cov >= min_control_people:
            stability_based_on[i] = 15
        elif number_disease_10_cov >= min_disease_people and number_control_10_cov >= min_control_people:
            stability_based_on[i] = 10
        elif number_disease_5_cov >= min_disease_people_5_cov and number_control_5_cov >= min_control_people_5_cov:
            stability_based_on[i] = 5
        else:
            #stability_based_on[i] = -99999 # there's no np.nan integer representation, only float. We use an invalid value.
            stability_based_on[i] = float('nan')

        #need to deal with cases where there just are not enough disease individuals or control individuals to calculate mean
        if np.isnan(stability_based_on[i]):

            coverage_threshold_used[
                i] = 5  #I warn users not to use editing sites that don't have any stability_based_on measurement. We include min coverage of 5 just to get statistical information anyways
            #stable_min_cov=5
            #otherwise we can now try to find the stable_min_cov that'll be used for calculation of all statistics'

        else:
            current_stability_cov = stability_based_on[i]
            stability_disease_mean = disease_edit_row[
                disease_cov_row >= current_stability_cov].mean()
            stability_control_mean = control_edit_row[
                control_cov_row >= current_stability_cov].mean()
            #print np.arange(5,stability_based_on[i]+1e-4,5)
            for j in np.arange(
                    5, stability_based_on[i] + 1e-4,
                    5):  #WARNING using 1e-4 allowing to include stop
                disease_mean = disease_edit_row[disease_cov_row >= j].mean()
                control_mean = control_edit_row[control_cov_row >= j].mean()
                if np.absolute(disease_mean - stability_disease_mean
                               ) <= stability_value and np.absolute(
                                   control_mean -
                                   stability_control_mean) <= stability_value:
                    coverage_threshold_used[i] = j
                    break
        #now let's calculate all our statics based on the stable coverage threshold
        stable_min_cov = coverage_threshold_used[i]
        disease_adju_edit_row = disease_edit_row[np.logical_and(
            np.logical_and((~np.isnan(disease_edit_row)),
                           (~np.isnan(disease_cov_row))),
            (disease_cov_row >= stable_min_cov))]
        disease_adju_cov_row = disease_cov_row[np.logical_and(
            (~np.isnan(disease_cov_row)), (disease_cov_row >= stable_min_cov))]
        control_adju_edit_row = control_edit_row[np.logical_and(
            np.logical_and((~np.isnan(control_edit_row)),
                           (~np.isnan(control_cov_row))),
            (control_cov_row >= stable_min_cov))]
        control_adju_cov_row = control_cov_row[np.logical_and(
            (~np.isnan(control_cov_row)), (control_cov_row >= stable_min_cov))]
        stable_mean_disease_editing_level[i] = disease_adju_edit_row.mean()
        stable_std_dev_disease_editing_level[i] = disease_adju_edit_row.std()
        stable_mean_control_editing_level[i] = control_adju_edit_row.mean()
        stable_std_dev_control_editing_level[i] = control_adju_edit_row.std()
        stable_number_disease_with_at_least_min_coverage[
            i] = disease_adju_cov_row[
                disease_adju_cov_row >= stable_min_cov].count()
        stable_number_disease_nonzero_editing_and_min_coverage[
            i] = disease_adju_cov_row[(~np.isnan(disease_adju_cov_row)) &
                                      (disease_adju_cov_row >= stable_min_cov)
                                      & (disease_adju_edit_row > 0)].count()
        stable_disease_prevalence[
            i] = stable_number_disease_nonzero_editing_and_min_coverage[
                i] / stable_number_disease_with_at_least_min_coverage[i]
        stable_number_control_with_at_least_min_coverage[
            i] = control_adju_cov_row[
                control_adju_cov_row >= stable_min_cov].count()
        stable_number_control_nonzero_editing_and_min_coverage[
            i] = control_adju_cov_row[(~np.isnan(control_adju_cov_row)) &
                                      (control_adju_cov_row >= stable_min_cov)
                                      & (control_adju_edit_row > 0)].count()
        stable_control_prevalence[
            i] = stable_number_control_nonzero_editing_and_min_coverage[
                i] / stable_number_control_with_at_least_min_coverage[i]
        stable_total_number_individuals_nonzero_editing_and_min_coverage[i] = (
            stable_number_disease_nonzero_editing_and_min_coverage[i] +
            stable_number_control_nonzero_editing_and_min_coverage[i]).sum()
        if (len(disease_adju_edit_row) >= 1) & (len(control_adju_edit_row) >=
                                                1):
            if (np.all(disease_adju_edit_row.values ==
                       control_adju_edit_row.values)):
                stable_mann_whitney_p_value[i] = float('nan')
            else:
                temp, stable_mann_whitney_p_value[i] = mannwhitneyu(
                    disease_adju_edit_row,
                    control_adju_edit_row,
                    alternative='two-sided')
        else:
            stable_mann_whitney_p_value[i] = float('nan')
        stable_editing_level_effect_size[i] = np.absolute(
            stable_mean_disease_editing_level[i] -
            stable_mean_control_editing_level[i])
        fisher_matrix = np.array(
            [[
                stable_number_disease_nonzero_editing_and_min_coverage[i],
                stable_number_disease_with_at_least_min_coverage[i] -
                stable_number_disease_nonzero_editing_and_min_coverage[i]
            ],
             [
                 stable_number_control_nonzero_editing_and_min_coverage[i],
                 stable_number_control_with_at_least_min_coverage[i] -
                 stable_number_control_nonzero_editing_and_min_coverage[i]
             ]])
        stable_frequency_OR[i], stable_frequency_fishers_p_value[
            i] = fisher_exact(fisher_matrix)
        #print stable_frequency_OR[i]
        #print stable_frequency_fishers_p_value[i]
        stable_prevalence_effect_size[i] = np.absolute(
            stable_disease_prevalence[i] - stable_control_prevalence[i])

    #now put everything back together as a table
    header_info = editing_table[['chromosome', 'position', 'type_editing']]
    stats_table = pd.DataFrame(coverage_threshold_used)
    stats_table = stats_table.rename(
        columns={stats_table.columns[0]: 'coverage_threshold_used'})
    stats_table['stability_based_on'] = pd.DataFrame(stability_based_on)
    stats_table['stable_mean_disease_editing_level'] = pd.DataFrame(
        stable_mean_disease_editing_level)
    stats_table['stable_std_dev_disease_editing_level'] = pd.DataFrame(
        stable_std_dev_disease_editing_level)
    stats_table['stable_mean_control_editing_level'] = pd.DataFrame(
        stable_mean_control_editing_level)
    stats_table['stable_std_dev_control_editing_level'] = pd.DataFrame(
        stable_std_dev_control_editing_level)
    stats_table[
        'stable_number_disease_with_at_least_min_coverage'] = pd.DataFrame(
            stable_number_disease_with_at_least_min_coverage)
    stats_table[
        'stable_number_disease_nonzero_editing_and_min_coverage'] = pd.DataFrame(
            stable_number_disease_nonzero_editing_and_min_coverage)
    stats_table['stable_disease_prevalence'] = pd.DataFrame(
        stable_disease_prevalence)
    stats_table[
        'stable_number_control_with_at_least_min_coverage'] = pd.DataFrame(
            stable_number_control_with_at_least_min_coverage)
    stats_table[
        'stable_number_control_nonzero_editing_and_min_coverage'] = pd.DataFrame(
            stable_number_control_nonzero_editing_and_min_coverage)
    stats_table['stable_control_prevalence'] = pd.DataFrame(
        stable_control_prevalence)
    stats_table[
        'stable_total_number_individuals_nonzero_editing_and_min_coverage'] = pd.DataFrame(
            stable_total_number_individuals_nonzero_editing_and_min_coverage)
    stats_table['stable_mann_whitney_p_value'] = pd.DataFrame(
        stable_mann_whitney_p_value)
    stats_table['stable_editing_level_effect_size'] = pd.DataFrame(
        stable_editing_level_effect_size)
    stats_table['stable_frequency_fishers_p_value'] = pd.DataFrame(
        stable_frequency_fishers_p_value)
    stats_table['stable_frequency_OR'] = pd.DataFrame(stable_frequency_OR)
    stats_table['stable_prevalence_effect_size'] = pd.DataFrame(
        stable_prevalence_effect_size)

    full_table = pd.concat(
        [header_info, stats_table, editing_table[all_people]], axis=1)

    #write the full_table to output
    full_table.to_csv(output_file, sep='\t', index=False)

    print "job completed\n"
Example #52

def pathway_enrichment_analysis_pw_bg(input_genes, input_bg, db, enrichment, p_threshold, exclude_unique_pw):
    enriched_pathways = {}

    input_genes = set(input_genes)
    input_count = len(input_genes)
    pathways = db['dict']
    bg = set(input_bg)

    for pathway in pathways:
        if exclude_unique_pw:
            # Consider only pathway genes annotated in input bg
            pathway_genes = pathways[pathway]['genes'].intersection(bg)
        else:
            pathway_genes = pathways[pathway]['genes']
            bg = bg.union(pathway_genes)

        pathway_count = len(pathway_genes)
        bg_count = len(bg)

        overlap = input_genes.intersection(pathway_genes)
        overlap_count = len(overlap)

        if not overlap_count:
            continue

        pathway_only_count = pathway_count - overlap_count
        input_only_count = input_count - overlap_count

        bg_only_count = bg_count - pathway_count - input_count + overlap_count

        enrichment_coefficient = find_enrichment_coefficient(overlap_count,
                                                             pathway_count,
                                                             input_count,
                                                             bg_count)

        if enrichment_coefficient <= 0 and enrichment == 'p':
            continue

        if enrichment_coefficient > 0 and enrichment == 'n':
            continue

        tail = find_tail(overlap_count, pathway_count, input_count, bg_count)

        try:
            odds_ratio, pvalue = stats.fisher_exact(
                [
                    [overlap_count, pathway_only_count],
                    [input_only_count, bg_only_count]
                ],
                alternative=tail
            )
        except ValueError:
            print('Something is wrong with this 2x2 table: \n')
            print([overlap_count, pathway_only_count], [input_only_count, bg_only_count])
            break

        if pvalue > float(p_threshold):
            continue

        enriched_pathways[pathway] = {
            'input_genes': input_count,
            'db': pathways[pathway]['db'],
            'length': pathway_count,
            'overlap': overlap_count,
            'pval': pvalue,
            'enrichment': enrichment_coefficient,
            'db_u_input': bg_count,
            'metabolites': pathways[pathway]['metabolites'],
            'overlap_genes': overlap
        }

    return enriched_pathways
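
The 2x2 construction above is the standard over-representation layout (overlap / pathway-only / input-only / background-only). A worked toy example with invented counts:

from scipy import stats

# 15 of 60 input genes fall in a 120-gene pathway, background of 10000 genes
overlap_count = 15
pathway_only_count = 120 - overlap_count          # pathway genes not in input
input_only_count = 60 - overlap_count             # input genes not in pathway
bg_only_count = 10000 - 120 - 60 + overlap_count  # everything else
odds_ratio, pvalue = stats.fisher_exact(
    [[overlap_count, pathway_only_count],
     [input_only_count, bg_only_count]],
    alternative='greater')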
Example #53
def compare_groups(labels, results,
                   low=None, high=None, num=100,
                   comp_groups=None, print_skips=False):
    """
    Function to plot proportion of largest and smallest bias groups and
    get relative z scores

    Parameters
    --------
    labels : array_like
        contains categorical values like ['M', 'F']
    results : array_like
        contains real numbers, e.g. threshold scores or floats in (0,1)
    low : float
        lower threshold value
    high : float
        upper threshold value
    num : int
        number of thresholds to check
    comp_groups : list of strings, optional
        subset of labels to compare, e.g. ['white', 'black']
    print_skips : bool
        whether to display thresholds skipped

    Returns
    ---------
    min_props : dict
        contains (key, value) of (threshold : max group/min group proportions)
    z_ps : dict
        contains (key, value) of (threshold : p-value of two tailed z test)
    fisher_ps : dict
        contains (key, value) of (threshold : p-value of fisher exact test)
    chi_ps : dict
        contains (key, value) of (threshold : p-value of chi squared test)
    bayes_facts : dict
        contains (key, value) of (threshold : bayes factor)
    """

    # cast labels and scores to pandas Series
    df = pd.DataFrame(list(zip(labels, results)), columns=['label', 'result'])

    min_props = {}
    fisher_ps = {}
    chi_ps = {}
    z_ps = {}
    bayes_facts = {}

    if comp_groups is not None:
        df = df[df['label'].isin(comp_groups)]

    # define range of values to test over if not provided
    if low is None:
        low = min(results)
    if high is None:
        high = max(results)

    thresholds = np.linspace(low, high, num)

    skip_thresholds = []
    for thresh in thresholds:

        df['dec'] = [i >= thresh for i in results]

        # compare rates of passing across groups
        ctabs = pd.crosstab(df['label'], df['dec'])

        # skip any thresholds for which the crosstabs are one-dimensional
        if 1 in ctabs.shape:
            skip_thresholds.append(thresh)
            continue

        normed_ctabs = ctabs.div(ctabs.sum(axis=1), axis=0)
        true_val = max(set(df['dec']))
        max_group = normed_ctabs[true_val].max()
        normed_proportions = normed_ctabs[true_val] / max_group
        min_proportion = normed_proportions.min()

        # run statistical tests
        if ctabs.shape == (2, 2):
            test_results = test_multiple(df['label'].values, df['dec'].values)
            z_pval = test_results.get('z_score')[1]
            fisher_pval = test_results.get('fisher_p')[1]
            chi2_pval = test_results.get('chi2_p')[1]
            bayes_fact = test_results.get('BF')

        else:
            top_bottom_ctabs = top_bottom_crosstab(df['label'], df['dec'])
            z_pval = crosstab_ztest(top_bottom_ctabs)[1]
            fisher_pval = fisher_exact(top_bottom_ctabs)[1]
            chi2_pval = chi2_contingency(ctabs)[1]
            bayes_fact = crosstab_bayes_factor(ctabs)

        min_props[thresh] = min_proportion
        z_ps[thresh] = z_pval
        fisher_ps[thresh] = fisher_pval
        chi_ps[thresh] = chi2_pval
        bayes_facts[thresh] = bayes_fact

    if len(skip_thresholds) > 0 and print_skips:
        print('One-dimensional thresholds were skipped: %s' % skip_thresholds)

    return min_props, z_ps, fisher_ps, chi_ps, bayes_facts
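
A hypothetical invocation with synthetic labels and scores; the helpers compare_groups calls (test_multiple, crosstab_ztest, etc.) are assumed importable from the same module:

import numpy as np

rng = np.random.default_rng(0)
labels = ['M'] * 200 + ['F'] * 200
results = np.concatenate([rng.normal(0.55, 0.2, 200),
                          rng.normal(0.45, 0.2, 200)])
min_props, z_ps, fisher_ps, chi_ps, bayes_facts = compare_groups(
    labels, results, num=25)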
Example #54
def test_multiple(labels, decisions,
                  tests=('ztest', 'fisher', 'chi2', 'BF', 'prop'),
                  display=False):
    """
    Function that returns p_values for z-score, fisher exact, and chi2 test
    of 2x2 crosstab of passing rate by labels and decisions

    See docs for z_test_ctabs, fisher_exact, chi2_contingency and
    bf_ctabs for details of specific tests

    Parameters
    ----------
    labels : array_like
        categorical labels for each corresponding value of `decision` ie. M/F

    decisions : array_like
        binary decision values, ie. True/False or 0/1

    tests : list
        a list of strings specifying the tests to run, valid options
        are 'ztest', 'fisher', 'chi2', 'BF' and 'prop'. Defaults to all five.
        -ztest: p-value for two-sided z-score for proportions
        -fisher: p-value for Fisher's exact test for proportions
        -chi2: p-value for chi-squared test of independence for proportions
        -BF: bayes factor for independence assuming uniform prior
        -prop: proportion of lowest to highest passing rates by group

    display : bool
        print the results of each test in addition to returning them

    Returns
    -------
    results : dict
        dictionary of values, one for each test.
        Valid keys are: 'z_score', 'fisher_p', 'chi2_p', 'BF', and 'prop'

    Examples
    --------
    >>> # no real difference between groups
    >>> labels = ['group1']*100 + ['group2']*100 + ['group3']*100
    >>> decisions = [1,0,0]*100
    >>> all_test_ctabs(dependent_ctabs)
        (0.0, 1.0, 1.0, 0.26162148804907587)

    >>> # massively biased ratio of hits/misses by group
    >>> ind_ctabs = np.array([[75,50],[25,50]])
    >>> all_test_ctabs(ind_ctabs)
        (-3.651483716701106,
         0.0004203304586999487,
         0.0004558800052056139,
         202.95548692414306)

    >>> # correcting with a biased prior
    >>> biased_prior =  np.array([[5,10],[70,10]])
    >>> all_test_ctabs(ind_ctabs, biased_prior)
        (-3.651483716701106,
         0.0004203304586999487,
         0.0004558800052056139,
         0.00012159518854984268)
    """

    decisions = boolean_array(decisions)
    crosstab = pd.crosstab(pd.Series(labels), pd.Series(decisions))
    crosstab = crosstab.values

    # can only perform 2-group z-tests & fisher tests
    # getting crosstabs for groups with highest and lowest pass rates
    # as any difference between groups is considered biased
    tb_crosstab = top_bottom_crosstab(labels, decisions)

    results = {}
    if 'ztest' in tests:
        results['z_score'] = crosstab_ztest(tb_crosstab)
    if 'fisher' in tests:
        # although fisher's exact can be generalized to multiple groups
        # scipy is limited to shape (2, 2)
        # TODO make generalized fisher's exact test
        # returns oddsratio and p-value
        results['fisher_p'] = fisher_exact(tb_crosstab)[:2]
    if 'chi2' in tests:
        # returns chi2 test statistic and p-value
        results['chi2_p'] = chi2_contingency(crosstab)[:2]
    if 'BF' in tests:
        results['BF'] = crosstab_bayes_factor(crosstab)
    if 'prop' in tests:
        results['prop'] = min(proportion_test(labels, decisions))

    if display:
        for key in results:
            print("%s: %f" % (key, results[key]))

    return results
Example #55
    c = {}

    fraction1 = (len(cna_ais) - cna_ais.count(0)) / len(cna_ais)
    fraction2 = (len(cna_inv) - cna_inv.count(0)) / len(cna_inv)
    ais_gain = (cna_ais.count(1)) / len(cna_ais)
    ais_loss = (cna_ais.count(-1)) / len(cna_ais)
    inv_gain = (cna_inv.count(1)) / len(cna_inv)
    inv_loss = (cna_inv.count(-1)) / len(cna_inv)

    c["Gain"] = inv_gain - ais_gain
    c["Lost"] = inv_loss - ais_loss

    t, pval = stats.fisher_exact(
        [[len(cna_ais) - cna_ais.count(0),
          cna_ais.count(0)],
         [len(cna_inv) - cna_inv.count(0),
          cna_inv.count(0)]])
    print(i, cna_ais.count(0), cna_inv.count(0), c, t, pval)

    t, pval_gain = stats.fisher_exact([[cna_ais.count(1), cna_ais.count(0)],
                                       [cna_inv.count(1), cna_inv.count(0)]])  # integer counts, not fractions
    print(i, pval_gain)

    t, pval_loss = stats.fisher_exact([[cna_ais.count(-1), cna_ais.count(0)],
                                       [cna_inv.count(-1), cna_inv.count(0)]])  # integer counts, not fractions
    print(i, pval_loss)

    p_3[i] = c
    if i in "6p":
        gain = df3[df3["CNA"] == 1]
Example #56
def get_info_gatk(info, format, tumor, normal):
    '''Extracts the information from the MuTect VCF'''
    mutect = Variantcaller()

    mutect.GT = tumor[format.index('GT')]
    if mutect.GT == '0|1' or mutect.GT == '1|0':
        mutect.GT = '0/1'
    if len(mutect.GT.split('/')) > 2:
        mutect.GT = '0/1'

    [mutect.RO,
     mutect.AO] = [int(a) for a in tumor[format.index('AD')].split(',')]
    mutect.DP = mutect.AO + mutect.RO
    mutect.AF = round(float(mutect.AO) / float(mutect.DP), 4)

    try:
        mutect.GQ = float(tumor[format.index('GQ')])
    except:
        mutect.GQ = '.'
    try:
        mutect.PGT = tumor[format.index('PGT')]
    except:
        mutect.PGT = '.'
    try:
        mutect.PID = tumor[format.index('PID')]
    except:
        mutect.PID = '.'

    try:
        mutect.RO_f, mutect.RO_r, mutect.AO_f, mutect.AO_r = tumor[
            format.index('SB')].split(',')
    except:
        print(tumor)

    R = ((float(mutect.RO_f) + 1) * (float(mutect.AO_r) + 1)) / (
        (float(mutect.RO_r) + 1) * (float(mutect.AO_f) + 1))  # denominator must be the full product
    SymmetricRatio = R + 1 / R
    RefRatio = min((float(mutect.RO_f) + 1), (float(mutect.RO_r) + 1)) / max(
        (float(mutect.RO_f) + 1), (float(mutect.RO_r) + 1))
    AltRatio = min((float(mutect.AO_f) + 1), (float(mutect.AO_r) + 1)) / max(
        (float(mutect.AO_f) + 1), (float(mutect.AO_r) + 1))
    mutect.StOR = np.log(SymmetricRatio) + np.log(RefRatio) - np.log(AltRatio)

    mutect.DP_r = float(mutect.RO_r) + float(mutect.AO_r)
    mutect.DP_f = float(mutect.RO_f) + float(mutect.AO_f)

    if opts.amplicon:
        if min(mutect.DP_r, mutect.DP_f) / (mutect.DP_r + mutect.DP_f) >= 0.05:
            mutect.FStBias = 1 - stats.fisher_exact(
                [[mutect.RO_f, mutect.RO_r], [mutect.AO_f, mutect.AO_r]])[1]
        else:
            mutect.FStBias = 1.0  # keep a float, matching the other branch
    else:
        if min(mutect.DP_r, mutect.DP_f) / (mutect.DP_r + mutect.DP_f) > 0:
            mutect.FStBias = 1 - stats.fisher_exact(
                [[mutect.RO_f, mutect.RO_r], [mutect.AO_f, mutect.AO_r]])[1]
        else:
            mutect.FStBias = 1.0  # keep a float, matching the other branch

    for ind in info:
        if ind.startswith("CONTQ="):
            mutect.CONTQ = ind.split('=')[1]
        if ind.startswith("ECNT="):
            mutect.ECNT = ind.split('=')[1]
        if ind.startswith("GERMQ="):
            mutect.GERMQ = ind.split('=')[1]
        if ind.startswith("MBQ="):
            mutect.MBQ_ref, mutect.MBQ_alt = ind.split('=')[1].split(',')
        if ind.startswith("MFRL="):
            mutect.MFRL_ref, mutect.MFRL_alt = ind.split('=')[1].split(',')
        if ind.startswith("MMQ="):
            mutect.MMQ_ref, mutect.MMQ_alt = ind.split('=')[1].split(',')
        if ind.startswith("MPOS="):
            mutect.MPOS = ind.split('=')[1]
        if ind.startswith("POPAF="):
            mutect.POPAF = ind.split('=')[1]
        if ind.startswith("PON"):
            mutect.PON = '1'
        if ind.startswith("RPA="):
            mutect.RPA_ref, mutect.RPA_alt = ind.split('=')[1].split(',')
        if ind.startswith("RU="):
            mutect.RU = ind.split('=')[1]
        if ind == "STR":
            mutect.STR = '1'
        if ind.startswith("SEQQ="):
            mutect.SEQQ = float(ind.split('=')[1])
        if ind.startswith("STRANDQ="):
            mutect.STRANDQ = float(ind.split('=')[1])
        if ind.startswith("STRQ="):
            mutect.STRQ = float(ind.split('=')[1])
        if ind.startswith("TLOD="):
            mutect.tumor_lod = float(ind.split('=')[1])

    return mutect
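
To sanity-check the strand-bias arithmetic above, here is the same StOR computation run on hypothetical strand counts:

import numpy as np

RO_f, RO_r, AO_f, AO_r = 40.0, 35.0, 3.0, 22.0  # hypothetical strand counts
R = ((RO_f + 1) * (AO_r + 1)) / ((RO_r + 1) * (AO_f + 1))
symmetric_ratio = R + 1 / R
ref_ratio = min(RO_f + 1, RO_r + 1) / max(RO_f + 1, RO_r + 1)
alt_ratio = min(AO_f + 1, AO_r + 1) / max(AO_f + 1, AO_r + 1)
StOR = np.log(symmetric_ratio) + np.log(ref_ratio) - np.log(alt_ratio)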
Example #57
def pvalue_fisher(x, a):
    a0 = x - a
    # n1 and n0 are group totals defined at module level in the source script
    oddsratio, pvalue_f = stats.fisher_exact([[a, n1 - a], [a0, n0 - a0]])
    return pvalue_f
Example #58

def frac_correct(bdata):
    '''
    Calculate the fraction of correct trials overall for a bdata object.

    Args:
        bdata (jaratoolbox.loadbehavior.BehaviorData dict): the behavior data to use
    Returns:
        nCorrect (int): Number of correct trials
        nValid (int): Number of valid trials
    '''
    correct = bdata['outcome'] == bdata.labels['outcome']['correct']
    nCorrect = sum(correct)
    valid = bdata['valid']
    nValid = sum(valid)
    # return nCorrect/float(nValid)
    return nCorrect, nValid


ncs, nvs = frac_correct(sdata)
ncm, nvm = frac_correct(mdata)

nis = nvs - ncs
nim = nvm - ncm

from scipy.stats import fisher_exact

oddsratio, pval = fisher_exact([[ncs, nis], [ncm, nim]])

print(oddsratio)
print(pval)
Example #59
File: stats.py  Project: zchelseal/scipy
 def time_fisher_exact(self, alternative):
     oddsratio, pvalue = stats.fisher_exact(self.a, alternative=alternative)
Example #60
def main():
	usage="%prog [options]" + "\n"
	parser = OptionParser(usage,version="%prog " + __version__)
	parser.add_option("-i","--input-file",action="store",type="string",dest="input_file",help="Data file containing methylation proportions (represented by \"methyl_count,total_count\", eg. \"20,30\") with the 1st row containing sample IDs (must be unique) and the 1st column containing CpG positions or probe IDs (must be unique). This file can be a regular text file or compressed file (*.gz, *.bz2) or accessible url.")
	parser.add_option("-g","--group",action="store",type="string",dest="group_file",help="Group file defining the biological groups of each sample. It is a comma-separated 2 columns file with the 1st column containing sample IDs, and the 2nd column containing group IDs.  It must have a header row. Sample IDs should match to the \"Data file\".")
	parser.add_option("-o","--output",action="store",type='string', dest="out_file",help="Prefix of the output file.")
	(options,args)=parser.parse_args()
	
	print ()
	#print (options.paired)
	#print (options.welch_ttest)
	if not (options.input_file):
		print (__doc__)
		parser.print_help()
		sys.exit(101)

	if not (options.group_file):
		print (__doc__)
		parser.print_help()
		sys.exit(102)
				
	if not (options.out_file):
		print (__doc__)
		parser.print_help()
		sys.exit(103)	
	
	FOUT = open(options.out_file + '.pval.txt','w')
	#ROUT = open(options.out_file + '.r','w')
	
	printlog("Read group file \"%s\" ..." % (options.group_file))
	(s,g) = read_grp_file1(options.group_file)
	s2g = dict(zip(s,g))
	g2s = collections.defaultdict(list)
	
	for k,v in s2g.items():
		g2s[v].append(k)
	
	group_IDs = sorted(g2s.keys())
	for g in group_IDs:
		print ("\tGroup %s has %d samples:" % (g, len(g2s[g])))
		print ('\t\t' + ','.join(g2s[g]))
	
	if len(group_IDs) != 2:
		printlog("You must have two groups!", file=sys.stderr)
		sys.exit(1)
	
	line_num = 1
	probe_list = []
	p_list = []
	or_list = []
	for l in ireader.reader(options.input_file):
		f = l.split()
		if line_num == 1:
			sample_IDs = f[1:]
			# check if sample ID matches
			for s in s2g:
				if s not in sample_IDs:
					printlog("Cannot find sample ID \"%s\" from file \"%s\"" % (s, options.input_file))
					sys.exit(3)
		else:
			cg_id = f[0]
			probe_list.append(cg_id)
			proportions = f[1:]
			methyl_reads = 0
			unmethyl_reads = 0
			g2values = collections.defaultdict(dict)
			for g in group_IDs:
				g2values[g]['methyl'] = 0
				g2values[g]['unmethyl'] = 0
			for s,p in zip(sample_IDs, proportions):
				gid = s2g[s]
				m = re.match(r'(\d+)\s*\,\s*(\d+)', p)
				if m is None:
					continue
				else:
					c = int(m.group(1))
					n = int(m.group(2))
					if n >= c and n > 0:
						g2values[gid]['methyl'] += c
						g2values[gid]['unmethyl']  += (n-c)
					else:
						printlog("Incorrect data format!")
						print (f)
						sys.exit(1)		
			(odds, pval) = stats.fisher_exact([ [g2values[group_IDs[0]]['methyl'], g2values[group_IDs[0]]['unmethyl']],[g2values[group_IDs[1]]['methyl'], g2values[group_IDs[1]]['unmethyl']] ])
			#print (g2values[group_IDs[0]]['methyl'], g2values[group_IDs[0]]['unmethyl'],g2values[group_IDs[1]]['methyl'], g2values[group_IDs[1]]['unmethyl'])
			p_list.append(pval)
			or_list.append(odds)				
		line_num += 1
	
	printlog("Perfrom Benjamini-Hochberg (aka FDR) correction ...")
	adjusted_p = {}
	q_list =  padjust.multiple_testing_correction(p_list)
	for id,o,p,q in zip(probe_list, or_list, p_list, q_list):
		adjusted_p[id] = '\t'.join([str(i) for i in (o,p,q)])
	
	printlog("Writing to %s" % (options.out_file + '.pval.txt'))
	line_num = 1
	for l in ireader.reader(options.input_file):
		if line_num == 1:
			print (l + '\tOddsRatio\tpval\tadj.pval', file=FOUT)
		else:
			f = l.split()
			probe_ID = f[0]
			print (l + '\t' + adjusted_p[probe_ID], file=FOUT)
		line_num += 1
	FOUT.close()