def test_regularity(dataset): choice_sets, choices, person_df = dataset.load() unique_choice_sets, idx = np.unique(choice_sets, axis=0, return_inverse=True) tests = 0 for i, j in combinations(range(len(unique_choice_sets)), 2): c1, c2 = unique_choice_sets[i], unique_choice_sets[j] if all(c1 * c2 == c2): c1, c2 = c2, c1 i, j = j, i if any(c1 * c2 != c1): continue c1_counts = np.bincount(choices[idx == i, 0], minlength=len(c1)) c2_counts = np.bincount(choices[idx == j, 0], minlength=len(c1)) c1_tot = np.sum(c1_counts) c2_tot = np.sum(c2_counts) c1_prop = c1_counts / c1_tot c2_prop = c2_counts / c2_tot c1_names = [dataset.item_names[k] for k in range(len(c1)) if c1[k] == 1] c2_names = [dataset.item_names[k] for k in range(len(c1)) if c2[k] == 1] for k in range(len(c1)): if c1[k] == 1: tests += 1 oddsratio, pvalue = stats.fisher_exact( [[c1_counts[k], c1_tot - c1_counts[k]], [c2_counts[k], c2_tot - c2_counts[k]]]) if c1_prop[k] < c2_prop[k] and pvalue < 0.05: print(f'{dataset.item_names[k]} (p={pvalue:.2g}])\n' f'\t{c1_prop[k]:.2f} ({c1_tot} samples) in {c1_names} \n' f'\t{c2_prop[k]:.2f} ({c2_tot} samples) in {c2_names} ')
def FisherScoreCutoff(posScores, negScores):
    """Fisher's exact test for scores above a cutoff taken from the negatives.

    The cutoff is the element at index ``len(negScores) // 100`` of the
    negative scores sorted in descending order.  Positive and negative
    scores are each split into "at or above the cutoff" and "below the
    cutoff", and a two-sided Fisher's exact test is run on the resulting
    2x2 table.  (An earlier variant that scanned a range of cutoffs for
    the best p-value is not implemented here.)

    Args:
        posScores: sequence of scores for the positive set.
        negScores: non-empty sequence of scores for the negative set.

    Returns:
        Tuple ``(p_value, scoreCutoff, counts)`` where ``counts`` is the
        flat list ``[pos >= cutoff, pos < cutoff, neg >= cutoff,
        neg < cutoff]``.

    Raises:
        IndexError: if ``negScores`` is empty.
    """
    negSorted = sorted(negScores, reverse=True)
    # Floor division keeps the index an int on Python 3 (where ``/`` yields
    # a float that cannot index a list) and matches Python 2's integer ``/``.
    scoreCutoff = negSorted[len(negSorted) // 100]

    posCount = sum(1 for score in posScores if score >= scoreCutoff)
    negCount = sum(1 for score in negSorted if score >= scoreCutoff)

    contingencyTable = [
        [posCount, len(posScores) - posCount],
        [negCount, len(negSorted) - negCount],
    ]
    _, p_value = stats.fisher_exact(contingencyTable)
    return p_value, scoreCutoff, contingencyTable[0] + contingencyTable[1]
def generate_rules_for_class(self, general_summary, class_name):
    """Select the summaries relevant to ``class_name`` and attach p-values.

    A summary entry is kept when ``entry[1][class_name]`` is positive.  For
    each kept entry a 2x2 table is built from the frequencies of the rule
    (``entry[0]``), of the class, and of both together, and the p-value of
    Fisher's exact test on it is stored in ``entry[1]['p-value']``.

    Args:
        general_summary: iterable of entries indexable as
            ``(rule_key, per_class_mapping)``; the mapping is updated in
            place for kept entries.
        class_name: key of the class the rules should predict.

    Returns:
        List of the kept entries, in input order.
    """
    kept = []
    for entry in general_summary:
        if not entry[1][class_name] > 0:
            continue
        kept.append(entry)

        # Compute p-value
        antecedent = string_2_itemset(entry[0])
        rule_hits = self.freq_itemset_dict.get_frequency(entry[0])
        rule_misses = self.freq_itemset_dict.ntransactions - rule_hits

        both = self.lookup_frequency(antecedent, class_name)
        rule_only = rule_hits - both

        class_total = self.freq_itemset_dict.get_frequency(class_name)
        class_only = class_total - both

        table = np.array([
            [both, rule_only],
            [class_only, rule_misses - class_only],
        ])
        p_value = stats.fisher_exact(table)[1]
        entry[1]['p-value'] = p_value
    return kept
def _read_motif_hits(path):
    """Map each motif (column 1) to a dict keyed by its sequence ids (column 2).

    The first line of the tab-separated file is treated as a header and
    skipped.
    """
    hits = {}
    with open(path) as handle:
        handle.readline()  # header row
        for line in handle:
            fields = line.strip().split('\t')
            hits.setdefault(fields[0], {})[fields[1]] = 1
    return hits


def main():
    """Compare motif occurrence between a positive and a negative hit table.

    Command-line arguments (read from ``argv``):
        1. positive hit table (tab-separated, header, motif and sequence id)
        2. negative hit table (same layout)
        3. FASTA file; the number of ``>`` characters is the sequence total
        4. output path

    For every motif of the positive table, a two-sided Fisher's exact test
    is run on ``[[pos, total - pos], [neg, total - neg]]``, where ``pos`` and
    ``neg`` are the numbers of distinct sequence ids listed for the motif.
    One tab-separated row per motif (motif, the four counts, odds ratio,
    p-value) is written to the output file, sorted by ascending p-value.

    If a motif is missing from the negative table an error is printed and
    processing stops; rows computed up to that point are still written.
    """
    posfile = argv[1]
    negfile = argv[2]
    fastafile = argv[3]
    outfile = argv[4]

    with open(fastafile) as handle:
        totalseq = handle.read().count(">")

    posdict = _read_motif_hits(posfile)
    negdict = _read_motif_hits(negfile)

    results = []  # (pvalue, formatted row)
    for motif in posdict:
        if motif not in negdict:
            print("ERROR, not the same set of motifs")
            break
        posnum = len(posdict[motif])
        negnum = len(negdict[motif])
        enrichment, pvalue = stats.fisher_exact(
            [[posnum, totalseq - posnum], [negnum, totalseq - negnum]])
        row = [motif, str(posnum), str(totalseq - posnum), str(negnum),
               str(totalseq - negnum), str(enrichment), str(pvalue)]
        results.append((pvalue, '\t'.join(row)))

    # list.sort is stable, so rows with equal p-values keep their order.
    results.sort(key=lambda item: item[0])

    with open(outfile, 'w') as target:
        for _, row in results:
            target.write(row + '\n')
def fisher_exact_two_groups(dataset, target_col, protected_col):
    """
    Runs scipy.stats.fisher_exact() on the 2x2 table of classifier outcome
    by protection status.

    The table rows are the protected group (group=1) and the non-protected
    group (group=0); the columns are accepted (accepted=1) and rejected
    (accepted=0).

    @param dataset: object providing count_classification_and_category()
    @param target_col: name of the column that contains the classifier results
    @param protected_col: name of the column that contains the protection status
    @return: odds ratio and related p-value
    """
    contingency_table = [
        [
            dataset.count_classification_and_category(
                target_col, protected_col, group=group, accepted=accepted)
            for accepted in (1, 0)
        ]
        for group in (1, 0)
    ]
    return stats.fisher_exact(contingency_table)
def test_discrete(a, b):
    """Find the category most enriched in ``a`` relative to ``b``.

    Every category occurring in ``a`` is tested with a one-sided
    (``alternative="greater"``) Fisher's exact test on
    ``[[a1, a0], [b1, b0]]``, where each pair comes from
    ``get_counts(x, category)`` -- presumably the number of items with and
    without the category; confirm against ``get_counts``.

    Returns:
        Tuple ``(p_value, category)`` for the smallest p-value, Bonferroni
        corrected by the number of categories tested and capped at 1.0.

    Raises:
        ValueError: if ``a`` contains no categories (nothing to test).
    """
    all_categories = set(flatten(a))

    pvalues = []
    for category in all_categories:
        # calculate number of items with this category
        a1, a0 = get_counts(a, category)
        b1, b0 = get_counts(b, category)
        # we are only interested in enrichment, so test the right tail
        _, pvalue = stats.fisher_exact([[a1, a0], [b1, b0]],
                                       alternative="greater")
        pvalues.append((pvalue, category))

    min_pvalue, min_category = min(pvalues)
    # Bonferroni correction for the number of tests; a probability cannot
    # exceed 1, so the product is capped.
    corrected = min(min_pvalue * len(pvalues), 1.0)
    return corrected, min_category
def TestPatternInNegSeq(posPatternCovLis, posSeqCnt, negSeqFn, allKmerSet):
    """Keep the patterns whose coverage differs significantly from negatives.

    For each ``(pattern, posCov)`` pair, the pattern's coverage in the
    negative sequences is obtained from ``BioinfoComm.FetchPatternCov`` and a
    two-sided Fisher's exact test is run on
    ``[[posCov, posSeqCnt - posCov], [negCov, negSeqCnt - negCov]]``.  The
    p-value is Bonferroni corrected by the number of patterns (capped at 1)
    and the pattern is kept when the corrected value is below 0.05.

    Args:
        posPatternCovLis: sequence of ``(pattern, coverage count)`` pairs
            for the positive sequences.
        posSeqCnt: number of positive sequences.
        negSeqFn: path passed to ``BioinfoComm.loadSinglelineSeq`` to load
            the negative sequences.
        allKmerSet: k-mers passed to ``BioinfoComm.FetchCovInSeqLisMutliKmer``
            to pre-compute negative coverage.

    Returns:
        Set of patterns that pass the filter.
    """
    patternSet = set()
    patternCnt = len(posPatternCovLis)
    # A real list (not a lazy ``map`` object) so the log below shows the
    # patterns themselves on Python 3 as well as Python 2.
    initPatternSet = [patternCov[0] for patternCov in posPatternCovLis]

    negSeqLis, negSeqCnt, _, _ = BioinfoComm.loadSinglelineSeq(negSeqFn)
    negKmer2seqIdSet = BioinfoComm.FetchCovInSeqLisMutliKmer(
        negSeqLis, allKmerSet)
    negKmer2seqIdInt = BioinfoComm.formatCovId(negKmer2seqIdSet, negSeqCnt)

    for pattern, posCov in posPatternCovLis:
        posUncov = posSeqCnt - posCov
        # FetchPatternCov hands back the coverage mapping, which is fed
        # into the next call.
        negCov, _, negKmer2seqIdInt = BioinfoComm.FetchPatternCov(
            pattern, negSeqLis, negKmer2seqIdInt)
        negUncov = negSeqCnt - negCov

        dataTable = [[posCov, posUncov], [negCov, negUncov]]
        _, rawPValue = stats.fisher_exact(dataTable)
        adjustedPvalue = min(rawPValue * patternCnt, 1)
        if adjustedPvalue < 0.05:
            patternSet.add(pattern)

    INFO('pattern before negative filter')
    INFO(initPatternSet)
    INFO('pattern after negative filter')
    INFO(patternSet)
    return patternSet
def pvalue_calculation(infile):
    """Run Fisher's exact test for every sequence listed in ``infile``.

    The sample name is the part of ``infile`` before ``'_tenmers'`` with
    ``'donor'`` replaced by ``'d'``.  The sample's totals are looked up in
    the tab-separated files named by the module-level ``allseqs_background``
    and ``allseqs_snatched``: the line containing ``'seqs'`` is the header
    that locates the sample's column, and the value is taken from the last
    other line.

    Each line of ``infile`` is tab-separated: sequence, un-snatched count,
    snatched count.  For each one the table
    ``[[snatched, unsnatched], [all_snatched - snatched,
    all_background - unsnatched]]`` is tested and a row
    (sequence, A, B, C, D, p-value, odds ratio) is written to
    ``fishers_output_<sample>.txt`` in the current directory.

    Raises:
        ValueError: if the sample is not a column of a totals header.
    """
    sample = infile.split('_tenmers')[0].replace('donor', 'd')

    with open(allseqs_background) as inF:
        for line in inF:
            if 'seqs' in line:
                index_sample = line.strip().split('\t').index(sample)
            else:
                linea = line.split('\t')
                all_background = float(linea[index_sample])

    with open(allseqs_snatched) as inF:
        for line in inF:
            if 'seqs' in line:
                index_sample = line.strip().split('\t').index(sample)
            else:
                linea = line.split('\t')
                all_snatched = float(linea[index_sample])

    out = 'fishers_output_%s.txt' % sample
    # The output handle is managed by ``with`` so the results are flushed and
    # the file is closed even when a row fails to parse.
    with open(out, 'w') as o, open(infile, 'r') as inF:
        for line in inF:
            linea = line.strip().split('\t')
            sequence = linea[0]
            snatch = float(linea[2])
            unsnatch = float(linea[1])

            A = snatch
            B = unsnatch
            C = all_snatched - snatch
            D = all_background - unsnatch
            oddsratio, pvalue = stats.fisher_exact([[A, B], [C, D]])

            # The trailing '\n' element is joined like a column, so each
            # row ends with a tab before the newline (kept as-is).
            outlist = [sequence, str(A), str(B), str(C), str(D),
                       str(pvalue), str(oddsratio), '\n']
            o.write('\t'.join(outlist))
def FisherScoreCutoff(posScores, negScores):
    """Fisher's exact test for scores above a cutoff taken from the negatives.

    The cutoff is the element at index ``len(negScores) // 100`` of the
    negative scores sorted in descending order.  Positive and negative
    scores are each split into "at or above the cutoff" and "below the
    cutoff", and a two-sided Fisher's exact test is run on the resulting
    2x2 table.  (An earlier variant that scanned a range of cutoffs for
    the best p-value is not implemented here.)

    Args:
        posScores: sequence of scores for the positive set.
        negScores: non-empty sequence of scores for the negative set.

    Returns:
        Tuple ``(p_value, scoreCutoff, counts)`` where ``counts`` is the
        flat list ``[pos >= cutoff, pos < cutoff, neg >= cutoff,
        neg < cutoff]``.

    Raises:
        IndexError: if ``negScores`` is empty.
    """
    negSorted = sorted(negScores, reverse=True)
    # Floor division keeps the index an int on Python 3 (where ``/`` yields
    # a float that cannot index a list) and matches Python 2's integer ``/``.
    scoreCutoff = negSorted[len(negSorted) // 100]

    posCount = sum(1 for score in posScores if score >= scoreCutoff)
    negCount = sum(1 for score in negSorted if score >= scoreCutoff)

    contingencyTable = [
        [posCount, len(posScores) - posCount],
        [negCount, len(negSorted) - negCount],
    ]
    _, p_value = stats.fisher_exact(contingencyTable)
    return p_value, scoreCutoff, contingencyTable[0] + contingencyTable[1]
def main():
    """Compare predicted-binding fractions between two anchor result files.

    Expects two positional command-line arguments (anchor results and null
    anchor results) plus optional labels and a plot title.  Each file is
    summarised with ``count_anchor_results`` into binding and total counts,
    Fisher's exact test is run on
    ``[[binding, binding_null], [non_binding, non_binding_null]]``, the odds
    ratio and p-value are printed, and a bar plot of the two binding
    fractions is shown.

    Exits with status 1 after printing the usage text when the number of
    positional arguments is not two.
    """
    usage = ('usage: %prog anchor_results.txt anchor_results_null.txt\n'
             'Requires two input arguments:\n'
             '1) Interesting anchor results, output from run_anchor_batch.py\n'
             '2) Null anchor results, output from run_anchor_batch.py\n')
    parser = OptionParser(usage=usage)
    parser.add_option('-1', '--exon_label1', dest='exon_label1',
                      default='Exon label 1',
                      help='Exon label of anchor_results.txt.')
    parser.add_option('-2', '--exon_label2', dest='exon_label2',
                      default='Exon label 2',
                      help='Exon label of anchor_results_null.txt')
    parser.add_option(
        '-t', '--title', dest='title',
        default='Fraction of exons with predicted binding regions',
        help='Title of plot.')
    (options, args) = parser.parse_args()

    if len(args) != 2:
        # Parenthesised single-argument prints behave the same on Python 2
        # and Python 3.
        print('Two arguments need to be specified in command line.\n')
        print(usage)
        # A usage error must not report success to the calling shell.
        sys.exit(1)

    anchor_results_path = args[0]
    anchor_results_null_path = args[1]
    exon_label1 = options.exon_label1
    exon_label2 = options.exon_label2
    mytitle = options.title

    # Each list holds [value for results file, value for null file].
    anchor_dic = {'binding': [], 'non_binding': [], 'total': []}
    for results in [anchor_results_path, anchor_results_null_path]:
        binding_count, total_count = count_anchor_results(results)
        anchor_dic['binding'].append(binding_count)
        anchor_dic['non_binding'].append(total_count - binding_count)
        anchor_dic['total'].append(total_count)

    oddsratio, pvalue = fisher_exact(
        [anchor_dic['binding'], anchor_dic['non_binding']])
    print('oddsratio: %s\npvalue: %s' % (oddsratio, pvalue))

    # plot distributions (from plot_meme_motif_null_comparison.py)
    mylabels = [exon_label1, exon_label2]

    # Plot bargraphs
    frac_binding = float(anchor_dic['binding'][0]) / anchor_dic['total'][0]
    frac_binding_null = (float(anchor_dic['binding'][1])
                         / anchor_dic['total'][1])
    myvals = [frac_binding, frac_binding_null]
    plot_barplot(
        myvals, mytitle, mylabels,
        ylabel='Fraction predicted binding regions',
        mytext1="%i/%i" % (anchor_dic['binding'][0],
                           anchor_dic['total'][0]),
        mytext2='%i/%i' % (anchor_dic['binding'][1],
                           anchor_dic['total'][1]),
        mytext3="*Fisher's Exact Test\nP-value=%.2e" % pvalue,
        ymin=0, ymax=1, width=0.5)
    plt.show()
def main():
    """Compare predicted-binding fractions between two anchor result files.

    Expects two positional command-line arguments (anchor results and null
    anchor results) plus optional labels and a plot title.  Each file is
    summarised with ``count_anchor_results`` into binding and total counts,
    Fisher's exact test is run on
    ``[[binding, binding_null], [non_binding, non_binding_null]]``, the odds
    ratio and p-value are printed, and a bar plot of the two binding
    fractions is shown.

    Exits with status 1 after printing the usage text when the number of
    positional arguments is not two.
    """
    usage = ('usage: %prog anchor_results.txt anchor_results_null.txt\n'
             'Requires two input arguments:\n'
             '1) Interesting anchor results, output from run_anchor_batch.py\n'
             '2) Null anchor results, output from run_anchor_batch.py\n')
    parser = OptionParser(usage=usage)
    parser.add_option('-1', '--exon_label1', dest='exon_label1',
                      default='Exon label 1',
                      help='Exon label of anchor_results.txt.')
    parser.add_option('-2', '--exon_label2', dest='exon_label2',
                      default='Exon label 2',
                      help='Exon label of anchor_results_null.txt')
    parser.add_option(
        '-t', '--title', dest='title',
        default='Fraction of exons with predicted binding regions',
        help='Title of plot.')
    (options, args) = parser.parse_args()

    if len(args) != 2:
        # Parenthesised single-argument prints behave the same on Python 2
        # and Python 3.
        print('Two arguments need to be specified in command line.\n')
        print(usage)
        # A usage error must not report success to the calling shell.
        sys.exit(1)

    anchor_results_path = args[0]
    anchor_results_null_path = args[1]
    exon_label1 = options.exon_label1
    exon_label2 = options.exon_label2
    mytitle = options.title

    # Each list holds [value for results file, value for null file].
    anchor_dic = {'binding': [], 'non_binding': [], 'total': []}
    for results in [anchor_results_path, anchor_results_null_path]:
        binding_count, total_count = count_anchor_results(results)
        anchor_dic['binding'].append(binding_count)
        anchor_dic['non_binding'].append(total_count - binding_count)
        anchor_dic['total'].append(total_count)

    oddsratio, pvalue = fisher_exact(
        [anchor_dic['binding'], anchor_dic['non_binding']])
    print('oddsratio: %s\npvalue: %s' % (oddsratio, pvalue))

    # plot distributions (from plot_meme_motif_null_comparison.py)
    mylabels = [exon_label1, exon_label2]

    # Plot bargraphs
    frac_binding = float(anchor_dic['binding'][0]) / anchor_dic['total'][0]
    frac_binding_null = (float(anchor_dic['binding'][1])
                         / anchor_dic['total'][1])
    myvals = [frac_binding, frac_binding_null]
    plot_barplot(
        myvals, mytitle, mylabels,
        ylabel='Fraction predicted binding regions',
        mytext1="%i/%i" % (anchor_dic['binding'][0],
                           anchor_dic['total'][0]),
        mytext2='%i/%i' % (anchor_dic['binding'][1],
                           anchor_dic['total'][1]),
        mytext3="*Fisher's Exact Test\nP-value=%.2e" % pvalue,
        ymin=0, ymax=1, width=0.5)
    plt.show()