def pval_est(pvals, top=None, threshold=3):
    p_est = dict()
    if top is not None:
        pvs = {x: pvals[x] for x in sorted(pvals, key=pvals.get, reverse=False)[:top]}
    else:
        pvs = pvals
    # pool = Pool(processes=4)
    for motif in pvs:
        # res = [(pool.apply_async(pval_est_helper, (x, motif)), pvals[x]) for x in pvals]
        # similar = [y for x, y in res if x.get()]
        similar = [pvals[x] for x in pvals if edit_dist(x, motif) <= threshold]
        p_fast = 1.0
        for p in similar:
            p_fast = p_fast * p
        qfst = qfast(len(similar), p_fast)
        fsher = stats.combine_pvalues(similar, method="fisher")[1]
        stffr = stats.combine_pvalues(similar, method="stouffer")[1]
        p_est[motif] = (qfst, fsher, stffr)
    return p_est
def merge(links):
    result = dict(links[0])
    result['from'] = []
    result['articles'] = []
    for i in links:
        if 'probability' not in i:
            # Do a very rudimentary meta-analysis based on the number of supporting papers
            rates = [base_rate for x in i['articles']]
            if not rates:
                rates = [base_rate]
            p = combine_pvalues(rates, method="stouffer")[1]
            i['probability'] = p
            if 'frequency' in i:
                idf = 10 ** i.get('idf', rdflib.Literal(100)).value
                tfidf = (0.5 + i['frequency'].value) * idf  # old_div is no longer defined!
                i['probability'] = combine_pvalues([tfidf / (1 + tfidf)], method='stouffer')[1]
        else:
            i['probability'] = i['probability'].value
        result['from'].append(i['np'])
        result['articles'].extend(i['articles'])
    result['probability'] = max([i['probability'] for i in links])
    # print "end: "
    return result
def pval(motif, sequences, order=1, dist="binomial", method="qfast", pvals=None, threshold=None):
    mlen = len(motif)
    # default threshold to 20% of the motif length
    if threshold is None:
        threshold = int(len(motif) * .20)
    # generate pvalues if not given
    if pvals is None:
        pvals = pval_all(mlen, sequences, order=order, dist=dist)
    similar = [pvals[x] for x in pvals.keys() if edit_dist(x, motif) <= threshold]
    if len(similar) == 0:
        return 1
    if method == "fisher":
        return stats.combine_pvalues(similar, method="fisher")
    elif method == "stouffer":
        return stats.combine_pvalues(similar, method="stouffer")
    else:
        p_fast = 1
        for p in similar:
            p_fast = p_fast * p
        return qfast(len(similar), p_fast)  # p_fast is the product of p-values for similar sequences
def evaluate(self, weighted=False):
    """
    Use the Stouffer method to combine p-values

    :return: pval
    :test:
        iris = pd.read_csv('/data/iris.csv')
        sample = iris.sample(frac=0.3, random_state=123)
        AssessCombVar(sample, iris, target_col='iris_class').evaluate() = 0.5712377451855659
    """
    if weighted:
        p = combine_pvalues(self.pvals, method='stouffer', weights=self.weights)[1]
    else:
        p = combine_pvalues(self.pvals)[1]  # note: scipy's default method here is Fisher
    return p
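A minimal usage sketch (my addition, not from the source above) of the weighted Stouffer call this method wraps; the p-values and weights are made-up illustrations, and scipy only uses the weights argument with method='stouffer':

import numpy as np
from scipy.stats import combine_pvalues

pvals = [0.04, 0.20, 0.51]
weights = [3.0, 1.0, 1.0]  # illustrative, e.g. proportional to sample sizes

_, p_unweighted = combine_pvalues(pvals, method="stouffer")
_, p_weighted = combine_pvalues(pvals, method="stouffer", weights=weights)
print(p_unweighted, p_weighted)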
def combine_all_pvals(table, indices):
    out = pd.Series(index=table.keys(), data=nan, dtype=float)
    for ix in tqdm(indices):
        out[ix] = combine_pvalues(table[ix], "fisher")[1]
    return out
def append_pval(df, columns, multitest):
    pvalcols = []
    for col in columns:
        pvals = [normcdf(val, np.mean(df[col].values), np.std(df[col].values))
                 for val in df[col].values]
        df.insert(len(df.columns), col + "_pvals", pvals)
        pvalcols.append(col + "_pvals")
    log("P-values calculated for following groups: " + str(columns), 1)
    if multitest:
        combined = []
        for row in df[pvalcols].to_numpy():  # .get_values() was removed in pandas; to_numpy() replaces it
            combined.append(combine_pvalues(row, method='fisher', weights=None)[1])
        log("Fisher combined p_value test completed", 1)
        df.insert(len(df.columns), 'fisher_combined_pval', combined)
        bh_corrected = bh_correct(dict(zip(df.index.values, df['fisher_combined_pval'].values)))
        corrected_vals = []
        for k in df.index.values:
            corrected_vals.append(bh_corrected[k])
        df.insert(len(df.columns), 'benj_hoch_corrected_pval', corrected_vals)
        log("Benjamini-Hochberg correction successfully applied to combined p-values", 1)
    return df
def combine(self, p_values, ref_p_values=None):
    """
    Takes a list of p_values and combines them into a single p_value
    using Stouffer's Z-score method.

    :param p_values: a list of p_values of features.
    :param ref_p_values: p-values of reference observations (if needed).
    :return: The combined p-value.
    """
    p_values = check_p_values(p_values)
    # # assert p_values is a list
    # assert type(p_values) == list
    #
    # # check that p_values is not empty
    # if len(p_values) == 0:
    #     raise ValueError('The given list of p_values is empty')
    #
    # # check that all elements in p_values are floats
    # if not all(isinstance(x, (int, float)) for x in p_values):
    #     raise ValueError('The elements in p_values should all be of the type \'float\'')
    combined_p_values = np.apply_along_axis(
        lambda x: combine_pvalues(x, method='stouffer')[1], axis=1, arr=p_values)
    return combined_p_values
def combine_pvalues(p, nsubs):
    """Combining p-values for meta-analysis statistics

    Function that takes p-values from each experiment and the number of
    subjects, and returns the meta-analysis significance using Stouffer's
    method.

    Parameters
    ----------
    p: DataFrame
        p-values for 2 pipelines computed on different datasets
    nsubs: float
        average number of subjects per dataset

    Returns
    -------
    pval: float
        Estimate of the combined p-value
    """
    if len(p) == 1:
        return p.item()
    else:
        # scipy expects one weight per p-value; with a single average subject
        # count the weights are all equal
        W = np.sqrt(nsubs) * np.ones(len(p))
        out = stats.combine_pvalues(np.array(p), weights=W, method="stouffer")[1]
        return out
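A quick check (illustration only, values made up): an equal-weight vector reproduces the unweighted Stouffer combination, so the nsubs-derived weights above only change the result once per-dataset subject counts differ.

import numpy as np
from scipy import stats

p = np.array([0.03, 0.12, 0.40])
w = np.full(len(p), np.sqrt(24))  # e.g. 24 subjects per dataset (made-up)

print(stats.combine_pvalues(p, method="stouffer")[1])             # unweighted
print(stats.combine_pvalues(p, method="stouffer", weights=w)[1])  # same, up to floating point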
def append_wilcoxmann(df, columns, multitest):
    pvalcols = []
    groups = [ratcol + "_ratio" for ratcol in design.run.ratios]
    cntr = 0
    for col in columns:
        pvals = []
        data = df[col].values
        for vals in data:
            pvals.append(ranksums(
                [v for v in vals],
                [item for sublist in data for item in sublist])[1])
        df.insert(len(df.columns), groups[cntr].replace("_ratio", "") + "^wmannpvals", pvals)
        pvalcols.append(groups[cntr].replace("_ratio", "") + "^wmannpvals")
        cntr += 1
    log("P-values calculated for following groups: " + str(columns), 1)
    if multitest:
        combined = []
        for row in df[pvalcols].to_numpy():  # .get_values() was removed in pandas; to_numpy() replaces it
            combined.append(combine_pvalues(row, method='fisher', weights=None)[1])
        log("Fisher combined p_value test completed", 1)
        df.insert(len(df.columns), 'fisher_combined_wmannpval', combined)
        bh_corrected = bh_correct(dict(zip(df.index.values, df['fisher_combined_wmannpval'].values)))
        corrected_vals = []
        for k in df.index.values:
            corrected_vals.append(bh_corrected[k])
        df.insert(len(df.columns), 'benj_hoch_corrected_wmannpval', corrected_vals)
        log("Benjamini-Hochberg correction successfully applied to combined p-values", 1)
    return df
def windows():
    # Ported from Python 2: urllib2 and print statements replaced; plt.hist's
    # `normed` argument is `density` in current matplotlib.
    import urllib.request
    spy = urllib.request.urlopen(
        'http://real-chart.finance.yahoo.com/table.csv?s=SPY'
    ).read().decode().splitlines()  # decode: bytes -> str under Python 3
    ndays = len(spy) - 30
    print('ndays', ndays)
    spy_r = []
    act = []
    # date = []
    for i in range(1, ndays):
        # Date,Open,High,Low,Close,Volume,Adj Close
        spy_r.append(float(spy[i].split(',')[4]) / float(spy[i + 30].split(',')[4]) - 1)
        act.append(float(spy[i].split(',')[4]))
        # date.append(datetime.datetime.strptime(spy[i].split(',')[0], "%Y-%m-%d").date())
    spy_label = generate_labels(spy_r)
    print(spy_label)
    x = np.array(spy_r, dtype='float')
    # Potential Labels
    y = np.array(spy_label, dtype='float')
    window_length = 30
    window_labels = []
    for i in range(0, ndays - 1):
        z = np.array(y[i:i + 30])
        window_labels.append(stats.combine_pvalues(z)[1])
    print(window_labels)
    # fig, ax = plt.subplots(2, 1, sharex=False, sharey=False)
    plt.hist(window_labels, 1000, density=True, facecolor='green', alpha=0.75)
    plt.title('Window Labels')
    plt.show()
def utest_ficher_score(df, clusters, genes):
    """
    Returns the Fisher-combined p-value for the statistical test comparing
    the expression of the given genes in each cluster vs the rest.
    """
    # Array keeping the cluster names (keys)
    clusterNames = list(zip(*sorted(Counter(clusters).items())))[0]
    ficher_pvals = []
    # iterate through clusters and run the U test: current cluster vs rest
    for igroup in clusterNames:
        idx = np.where(clusters == igroup)[0]
        idx_rest = np.where(clusters != igroup)[0]
        pvals = []
        for gene in genes:
            if gene in df.columns:
                data = np.array(df[gene].tolist())
                stat, p = stats.mannwhitneyu(data[idx], data[idx_rest], alternative='greater')
                pvals.append(p)
        _, fischer_p = stats.combine_pvalues(pvals)
        ficher_pvals.append(fischer_p)
    return np.array(ficher_pvals)
def calculateStoufferCombinedPvalue(pvalues, weights):
    # Each entry may be a list [nFeatures, nRelevantFeatures, pValue];
    # the p-value sits in the third position.
    combinedPvalue = combine_pvalues(
        [pvalue[2] if type(pvalue) is list else pvalue for pvalue in pvalues],
        'stouffer', weights)
    return combinedPvalue[1]
def test_significance(gstar_srnas, gstar_shuffled):
    null_distributions = make_null_distributions(gstar_shuffled)
    median_null_dists = []
    log_ratio_p_vals = []
    allen_score_p_vals = []
    combined_p_vals = []
    for _, srna_id, allen_score, log_ratio in gstar_srnas[TEST_COLUMNS].itertuples():
        log_ratio_dist, allen_score_dist = null_distributions[srna_id]
        lr_p = ((log_ratio_dist > log_ratio).sum() + 1) / (len(log_ratio_dist) + 1)
        as_p = ((allen_score_dist < allen_score).sum() + 1) / (len(allen_score_dist) + 1)
        _, p_val = stats.combine_pvalues([lr_p, as_p], method='fisher')
        median_null_dists.append(np.median(log_ratio_dist))
        log_ratio_p_vals.append(lr_p)
        allen_score_p_vals.append(as_p)
        combined_p_vals.append(p_val)
    gstar_srnas['median_null_log_ratio'] = median_null_dists
    gstar_srnas['log_odds'] = (gstar_srnas.log_ratio - gstar_srnas.median_null_log_ratio)
    gstar_srnas['log_ratio_p_val'] = log_ratio_p_vals
    gstar_srnas['allen_score_p_val'] = allen_score_p_vals
    gstar_srnas['p_val'] = combined_p_vals
    _, gstar_srnas['fdr'], *_ = multipletests(gstar_srnas.p_val, method='fdr_bh')
    gstar_srnas['score'] = np.round(-np.log10(gstar_srnas.fdr))
    return gstar_srnas
def weat_analysis(combined_word2vecs, quadruples, all_sets, steps=-1, effect_size_n=-1):
    if type(quadruples) == tuple:
        quadruples = [quadruples]
    print('This might take some time')
    res = []
    total = len(quadruples) * sum([len(combined_word2vecs[key]) for key in combined_word2vecs])
    counter = 0
    for names in quadruples:
        sets = [all_sets[name] for name in names]
        for wiki in combined_word2vecs:
            ps = []
            ds = []
            ps2 = []
            ds2 = []
            for word2vec in combined_word2vecs[wiki]:
                p, d = WEAT(word2vec, sets[0], sets[1], sets[2], sets[3],
                            steps, effect_size_n=effect_size_n).get_stats()
                ps.append(p)
                ds.append(d)
                p, d = WEATvec(word2vec, sets[0], sets[1], sets[2], sets[3],
                               steps, effect_size_n=effect_size_n).get_stats()
                ps2.append(p)
                ds2.append(d)
                counter += 1
                if counter % 25 == 0:
                    print('Finished', counter, 'of', total)
            res.append((names, wiki,
                        ps, stats.combine_pvalues(ps, 'fisher')[1],
                        ds, np.mean(ds), np.std(ds),
                        ps2, stats.combine_pvalues(ps2, 'fisher')[1],
                        ds2, np.mean(ds2), np.std(ds2)))
    print('Done!')
    return pd.DataFrame(res, columns=['names', 'wiki', 'p', 'fishers_p', 'd', 'mean_d',
                                      'std_d', 'p2', 'fishers_p2', 'd2', 'mean_d2', 'std_d2'])
def cluster_bin(bonferroni_filter_list):
    bonferroni_peak = []
    peak_line = []
    idx = 0
    pre_end_position = 0
    for data in bonferroni_filter_list:
        distance = data[1] - pre_end_position
        if pre_end_position == 0 or distance > 0:
            if peak_line:
                peak_region = peak_line[2] - peak_line[1]
                if peak_region >= 100:
                    bonferroni_peak.append([])
                    bonferroni_peak[idx] = peak_line
                    idx += 1
                peak_line = []
            peak_line = data[:]
            pre_end_position = data[2]
        else:
            peak_line[2] = data[2]
            pre_end_position = data[2]
            peak_line.append(data[3])
    for data in bonferroni_peak:
        statistic, pval = stats.combine_pvalues(data[3:len(data)], method='fisher', weights=None)
        data[3] = pval
        del data[4:len(data)]
    return bonferroni_peak
def combine_pvals(row, reps):
    pvals = np.asarray(list(row[reps]))
    non_na_pvals = pvals[~np.isnan(pvals)]
    if len(non_na_pvals) > 1:
        new_pval = stats.combine_pvalues(non_na_pvals, method="stouffer")[1]
    else:
        new_pval = np.nan
    return new_pval
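A hypothetical call pattern for the helper above (the DataFrame and replicate column names are my illustration): pandas' DataFrame.apply forwards the reps keyword through to combine_pvals for each row.

import numpy as np
import pandas as pd
from scipy import stats

df = pd.DataFrame({"rep1": [0.01, np.nan],
                   "rep2": [0.04, 0.20],
                   "rep3": [0.09, np.nan]})
reps = ["rep1", "rep2", "rep3"]
# Row 0 combines three p-values; row 1 has only one non-NaN value, so it yields NaN.
df["combined_pval"] = df.apply(combine_pvals, axis=1, reps=reps)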
def combine_phreds(phred1, phred2):
    new_pvalue = combine_pvalues([phred2pval(phred1), phred2pval(phred2)], "fisher")[1]
    if new_pvalue < MIN_FLOAT:
        return MAX_PHRED
    else:
        return -10 * mlog10(new_pvalue)
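The helpers and constants this snippet relies on are not shown; here is one plausible set of definitions (hypothetical, based on the standard Phred relation p = 10^(-Q/10)):

from math import log10 as mlog10
from scipy.stats import combine_pvalues

MIN_FLOAT = 1e-300   # guard against log10(0); exact cutoff is an assumption
MAX_PHRED = 3000     # cap returned when the combined p-value underflows (assumption)

def phred2pval(phred):
    # Phred score Q encodes an error probability of 10 ** (-Q / 10)
    return 10 ** (-phred / 10)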
def combine_pvals(row, cols):
    pvals = list(row[cols])
    non_na_pvals = [x for x in pvals if "NA" not in str(x)]
    if len(non_na_pvals) > 0:
        new_pval = stats.combine_pvalues(non_na_pvals, method="stouffer")[1]
    else:
        new_pval = "NA__too_many_rep_NAs"
    return new_pval
def linear_fit_correlation(dataDB, datatype, intervNames=None, trialTypes=None, minTrials=50):
    for mousename in sorted(dataDB.mice):
        for row, kwargs in _sweep_iter(dataDB, mousename, intervNames=intervNames, trialTypes=trialTypes):
            dataRSP = dataDB.get_neuro_data({'session': row['session']}, datatype=datatype, **kwargs)[0]
            dataRP = np.mean(dataRSP, axis=1)
            nTrials, nChannel = dataRP.shape
            results = np.zeros((nChannel, nChannel))

            if nTrials < minTrials:
                print('Too few trials =', nTrials, ' for', row.values, ': skipping')
            else:
                for iChannel in range(nChannel):
                    dataA = dataRP[:, iChannel]
                    dataOther = np.delete(dataRP, iChannel, axis=1)
                    dataSub = dataOther.copy()

                    # Part 1: Fit-subtract A from all other channels
                    for iOther in range(nChannel - 1):
                        dataSub[:, iOther] -= polyfit_transform(dataA, dataSub[:, iOther])

                    # corrOther = corr_2D(dataSub.T, settings={'havePVal': True})[..., 0]
                    # corrMean = np.array([np.mean(np.delete(corrOther[i], i)) for i in range(nChannel - 1)])
                    # results[iChannel] = np.insert(corrMean, iChannel, 1)

                    # Part 2: Compute correlation and its p-value
                    pValsOther = corr_2D(dataSub.T, settings={'havePVal': True})[..., 1]

                    # Part 3: Combine pvalues over all pairs
                    pValsCombined = np.array([
                        combine_pvalues(np.delete(pValsOther[i], i))[1]
                        for i in range(nChannel - 1)
                    ])
                    results[iChannel] = np.insert(pValsCombined, iChannel, 0)

            results = -np.log10(results, where=offdiag_idx(nChannel))
            plt.figure()
            plt.imshow(results)
            plt.title('_'.join(list(row.values)))
            plt.colorbar()
            plt.savefig('pics/corr_subtracted_' + '_'.join(list(row.values)) + '.png')
            plt.close()
def Fisher(self, df):
    '''
    * Helper function, used in pValues()
    * Applies Fisher's method to combine p-values
    '''
    p = df.apply(
        lambda row: stats.combine_pvalues(row[~np.isnan(row)] + 1e-06, method='fisher')[1],
        axis=1)
    return p
def combine_pvals(row, reps):
    pvals = np.asarray(list(row[reps]))
    non_na_pvals = np.asarray([float(x) for x in pvals if "NA" not in str(x)])
    non_na_pvals = non_na_pvals[~np.isnan(non_na_pvals)]
    if len(non_na_pvals) > 1:
        new_pval = stats.combine_pvalues(non_na_pvals, method="stouffer")[1]
    else:
        new_pval = np.nan
    return new_pval
def test_combine_pvalues_hou(pvalues):
    weights = np.array([1, 1, 1, 1])
    cor_mat = np.zeros((4, 4))
    hou = combine_pvalues_hou(pvalues, weights, cor_mat)
    fisher = combine_pvalues(pvalues, method='fisher')[1]
    if np.isnan(hou):
        assert np.isnan(fisher)
    else:
        assert hou == fisher
def _test_distribution(sample_probabilities, predicted_win_percents, num_plays_used):
    """Based on assuming the data at each probability is a Bernoulli distribution."""
    # Get the p-values (note: binom_test is deprecated in newer SciPy in favor of stats.binomtest):
    p_values = [stats.binom_test(np.round(predicted_win_percents[i] * num_plays_used[i]),
                                 np.round(num_plays_used[i]),
                                 p=sample_probabilities[i])
                for i in range(len(sample_probabilities))]
    combined_p_value = stats.combine_pvalues(p_values)[1]
    return combined_p_value
def combine_ps(percentiles, n_samples, method='fisher'):
    assert method in ['fisher', 'stouffer']
    min_percentile = 1 / (n_samples + 1)
    shifted_percentiles = percentiles - (min_percentile / 2)  # center around 0.5
    p = combine_pvalues(shifted_percentiles, method=method)[1]
    return p
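A worked example (my illustration): with n_samples = 99 draws behind each percentile, the smallest attainable percentile is 1/100, so subtracting half of it recenters the null distribution of the percentiles on 0.5 before they are combined.

import numpy as np

# assumes combine_ps (and scipy's combine_pvalues) from the snippet above
percentiles = np.array([0.01, 0.30, 0.75])
print(combine_ps(percentiles, n_samples=99))                     # Fisher by default
print(combine_ps(percentiles, n_samples=99, method='stouffer'))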
def combine_pvalues(p_values_tuple, method='fisher'):
    """
    https://en.wikipedia.org/wiki/Fisher's_method
    """
    p_values = [
        stats.combine_pvalues(row, method=method)[1]
        for row in np.vstack(p_values_tuple).T
    ]
    return p_values
def get_combin_pvalue(moptions, i):
    if moptions["neighborPvalues"] > 0 and len(moptions['sign_test']) > 0:
        # enoughNeighbor = True
        pvalue_neighbors = []
        for j in range(i - moptions["neighborPvalues"], i + moptions["neighborPvalues"] + 1):
            if j < 0 or j > len(moptions['sign_test']) - 1 or (not pos_check(moptions['sign_test'], i, j)):
                # enoughNeighbor = False
                # continue
                pvalue_neighbors.append(1.0)
            else:
                pvalue_neighbors.append(moptions['sign_test'][j][1][2][1])
        # default: fisher, no weights
        if moptions["testMethod"] == 'fisher':
            comb_p_st, comb_p_p = combine_pvalues(pvalue_neighbors)
        # stouffer, weights
        if moptions["testMethod"] == 'stouffer':
            midweight = 100
            mweights = [midweight]
            for k in range(moptions["neighborPvalues"]):
                mweights.insert(0, mweights[0] / moptions["WeightsDif"])
                mweights.append(mweights[-1] / moptions["WeightsDif"])
            comb_p_st, comb_p_p = combine_pvalues(pvalue_neighbors, method='stouffer',
                                                  weights=mweights)  # e.g. [1, 2, 4, 2, 1]
        comb_p_p = m_min_float(comb_p_p)
        comb_p_st = m_max_float(comb_p_st)
        return (comb_p_st, comb_p_p)
        # if not enoughNeighbor:
        #     return combine_pvalues(pvalue_neighbors)[1]
        # else:
        #     return 1.0
    else:
        if moptions["neighborPvalues"] == 0:
            return moptions['sign_test'][i][1][2]
        else:
            return None
def calc_pval(row, base):
    pval_path = stats.fisher_exact(
        [[row['CorrectPath'], row['WrongPath']],
         [base['CorrectPath'], base['WrongPath']]],
        alternative='greater')[1]
    pval_benign = stats.fisher_exact(
        [[row['CorrectBenign'], row['WrongBenign']],
         [base['CorrectBenign'], base['WrongBenign']]],
        alternative='greater')[1]
    return stats.combine_pvalues([pval_path, pval_benign])[1]
def combined_p_test():
    global max_pwm
    prefix = 'csv/robot-'
    m_l = ['m1', 'm2', 'm3', 'm4']
    faulty_df = pd.read_csv('csv/robot-10.csv')
    faulty_data = {'m1': list(faulty_df.to_dict()['m1'].values())}
    faulty_motor_velocities = np.array(faulty_data['m1']).reshape((-1, 70))[:, 10:]
    comb_p_val_l = []
    for i in range(0, 6):
        f_name = prefix + str(i) + '.csv'
        df = pd.read_csv(f_name)
        data = {
            'pwm': list(df.to_dict()['pwm'].values()),
            'm1': list(df.to_dict()['m1'].values()),
            'm2': list(df.to_dict()['m2'].values()),
            'm3': list(df.to_dict()['m3'].values()),
            'm4': list(df.to_dict()['m4'].values())
        }
        for m in m_l:
            motor_velocities = np.array(data[m]).reshape((-1, 70))[:, 10:]
            p_val_l = []
            for pwm in range(0, max_pwm - 10):
                _, p = stats.wilcoxon(motor_velocities[:, pwm], faulty_motor_velocities[:, pwm])
                p_val_l.append(p)
            _, comb_p_val = stats.combine_pvalues(p_val_l)
            comb_p_val_l.append(comb_p_val)
    print(len(comb_p_val_l))
    print('\n\n')
    for p in comb_p_val_l:
        print('%0.4f' % (p))
    _, p = stats.combine_pvalues(comb_p_val_l)
    print('\n\n%0.4f' % (p))
    return comb_p_val_l
def test_correct_sampling(sampler_c, rbm_and_weights, set_pdf_power):
    sampler = set_pdf_power(sampler_c)
    hi = sampler.hilbert
    all_states = hi.all_states()
    n_states = hi.n_states
    ma, w = rbm_and_weights(hi)
    n_samples = max(40 * n_states, 100)

    ps = np.absolute(nk.nn.to_array(hi, ma, w, normalize=False)) ** sampler.machine_pow
    ps /= ps.sum()

    n_rep = 6
    pvalues = np.zeros(n_rep)

    sampler_state = sampler.init_state(ma, w, seed=SAMPLER_SEED)
    for jrep in range(n_rep):
        sampler_state = sampler.reset(ma, w, state=sampler_state)

        # Burnout phase
        samples, sampler_state = sampler.sample(ma, w, state=sampler_state,
                                                chain_length=n_samples // 100)
        assert samples.shape == (n_samples // 100, sampler.n_chains, hi.size)

        samples, sampler_state = sampler.sample(ma, w, state=sampler_state,
                                                chain_length=n_samples)
        assert samples.shape == (n_samples, sampler.n_chains, hi.size)

        sttn = hi.states_to_numbers(np.asarray(samples.reshape(-1, hi.size)))
        n_s = sttn.size

        # fill in the histogram for sampler
        unique, counts = np.unique(sttn, return_counts=True)
        hist_samp = np.zeros(n_states)
        hist_samp[unique] = counts

        # expected frequencies
        f_exp = n_s * ps
        statistics, pvalues[jrep] = chisquare(hist_samp, f_exp=f_exp)

    s, pval = combine_pvalues(pvalues, method="fisher")
    assert pval > 0.01 or np.max(pvalues) > 0.01
def rerun_pvalues(scores, cosine_pval_function):
    p_vals = []
    combined_pvals = []
    for score in scores:
        single_test_pval = [cosine_pval_function(c, 512) for c in list(score)]
        p_vals.append(single_test_pval)
        single_test_combined_pvals = combine_pvalues(single_test_pval)[1]
        combined_pvals.append(single_test_combined_pvals)
    return (p_vals, combined_pvals)
def combine_pvals(pvalues, method="stouffer"):
    """
    :param pvalues: array of p-values (may contain NaNs)
    :return: combined p-value
    """
    pvs = pvalues[~np.isnan(pvalues)]
    if pvs.size != 2:
        comb_pv = np.nan
    else:
        # combine the NaN-filtered values, not the raw input
        comb_pv = stats.combine_pvalues(pvs, method=method)[1]
    return comb_pv
def combine_pvalues(p, nsubs):
    """Takes p-values from each experiment and the number of subjects,
    and returns the meta-analysis significance.
    """
    if len(p) == 1:
        return p.item()
    else:
        # scipy expects one weight per p-value
        W = np.sqrt(nsubs) * np.ones(len(p))
        out = stats.combine_pvalues(np.array(p), weights=W, method="stouffer")[1]
        return out
def test_kstest(self):
    for varname, cdf in self.cdfs.items():
        samples = self.samples[varname]
        if samples.ndim == 1:
            t, p = stats.kstest(samples[::self.ks_thin], cdf=cdf)
            assert self.alpha < p
        elif samples.ndim == 2:
            pvals = []
            for samples_, cdf_ in zip(samples.T, cdf):
                t, p = stats.kstest(samples_[::self.ks_thin], cdf=cdf_)
                pvals.append(p)
            t, p = stats.combine_pvalues(pvals)
            assert self.alpha < p
        else:
            raise NotImplementedError()
def append_pval(df, columns, multitest):
    pvalcols = []
    for col in columns:
        pvals = [normcdf(val, np.mean(df[col].values), np.std(df[col].values))
                 for val in df[col].values]
        df.insert(len(df.columns), col + "_pvals", pvals)
        pvalcols.append(col + "_pvals")
    if multitest:
        combined = []
        for row in df[pvalcols].to_numpy():  # .get_values() was removed in pandas; to_numpy() replaces it
            combined.append(combine_pvalues(row, method="fisher", weights=None)[1])
        df.insert(len(df.columns), "fisher_combined_pval", combined)
        bh_corrected = bh_correct(dict(zip(df.index.values, df["fisher_combined_pval"].values)))
        corrected_vals = []
        for k in df.index.values:
            corrected_vals.append(bh_corrected[k])
        df.insert(len(df.columns), "benj_hoch_corrected_pval", corrected_vals)
    return df
def combine_pvalues(pvalues):
    stat, pval = stats.combine_pvalues(pvalues, method='fisher', weights=None)
    return pval
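Example call of this thin wrapper (values are illustrative); note that it shadows scipy's own combine_pvalues name in the importing module.

print(combine_pvalues([0.02, 0.10, 0.55]))  # Fisher-combined p-value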
def main():
    # NOTE: this is Python-2-era code; under Python 3 the many `map(...)` calls
    # below would need to be wrapped in `list(...)` before being passed to scipy.
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--infile", required=True, help="Tabular file.")
    parser.add_argument("-o", "--outfile", required=True, help="Path to the output file.")
    parser.add_argument("--sample_one_cols", help="Input format, like smi, sdf, inchi")
    parser.add_argument("--sample_two_cols", help="Input format, like smi, sdf, inchi")
    parser.add_argument("--sample_cols", help="Input format, like smi, sdf, inchi; separate arrays using ;")
    parser.add_argument("--test_id", help="statistical test method")
    parser.add_argument(
        "--mwu_use_continuity", action="store_true", default=False,
        help="Whether a continuity correction (1/2.) should be taken into account.",
    )
    parser.add_argument(
        "--equal_var", action="store_true", default=False,
        help="If set perform a standard independent 2 sample test that assumes equal population variances. If not set, perform Welch's t-test, which does not assume equal population variance.",
    )
    parser.add_argument(
        "--reta", action="store_true", default=False,
        help="Whether or not to return the internally computed a values.",
    )
    parser.add_argument("--fisher", action="store_true", default=False,
                        help="if true then Fisher definition is used")
    parser.add_argument(
        "--bias", action="store_true", default=False,
        help="if false, then the calculations are corrected for statistical bias",
    )
    parser.add_argument("--inclusive1", action="store_true", default=False,
                        help="if false, lower_limit will be ignored")
    parser.add_argument("--inclusive2", action="store_true", default=False,
                        help="if false, higher_limit will be ignored")
    parser.add_argument("--inclusive", action="store_true", default=False,
                        help="if false, limit will be ignored")
    parser.add_argument(
        "--printextras", action="store_true", default=False,
        help="If True, if there are extra points a warning is raised saying how many of those points there are",
    )
    parser.add_argument(
        "--initial_lexsort", action="store_true", default=False,
        help="Whether to use lexsort or quicksort as the sorting method for the initial sort of the inputs.",
    )
    parser.add_argument("--correction", action="store_true", default=False, help="continuity correction")
    parser.add_argument(
        "--axis", type=int, default=0,
        help="Axis can equal None (ravel array first), or an integer (the axis over which to operate on a and b)",
    )
    parser.add_argument(
        "--n", type=int, default=0,
        help="the number of trials. This is ignored if x gives both the number of successes and failures",
    )
    parser.add_argument("--b", type=int, default=0, help="The number of bins to use for the histogram")
    parser.add_argument("--N", type=int, default=0, help="Score that is compared to the elements in a.")
    parser.add_argument("--ddof", type=int, default=0, help="Degrees of freedom correction")
    parser.add_argument("--score", type=int, default=0, help="Score that is compared to the elements in a.")
    parser.add_argument("--m", type=float, default=0.0, help="limits")
    parser.add_argument("--mf", type=float, default=2.0, help="lower limit")
    parser.add_argument("--nf", type=float, default=99.9, help="higher limit")
    parser.add_argument(
        "--p", type=float, default=0.5,
        help="The hypothesized probability of success. 0 <= p <= 1. The default value is p = 0.5",
    )
    parser.add_argument("--alpha", type=float, default=0.9, help="probability")
    parser.add_argument("--new", type=float, default=0.0,
                        help="Value to put in place of values in a outside of bounds")
    parser.add_argument(
        "--proportiontocut", type=float, default=0.0,
        help="Proportion (in range 0-1) of total data set to trim of each end.",
    )
    parser.add_argument(
        "--lambda_", type=float, default=1.0,
        help="lambda_ gives the power in the Cressie-Read power divergence statistic",
    )
    parser.add_argument(
        "--imbda", type=float, default=0,
        help="If lmbda is not None, do the transformation for that value. If lmbda is None, find the lambda that maximizes the log-likelihood function and return it as the second output argument.",
    )
    parser.add_argument("--base", type=float, default=1.6, help="The logarithmic base to use, defaults to e")
    parser.add_argument("--dtype", help="dtype")
    parser.add_argument("--med", help="med")
    parser.add_argument("--cdf", help="cdf")
    parser.add_argument("--zero_method", help="zero_method options")
    parser.add_argument("--dist", help="dist options")
    parser.add_argument("--ties", help="ties options")
    parser.add_argument("--alternative", help="alternative options")
    parser.add_argument("--mode", help="mode options")
    parser.add_argument("--method", help="method options")
    parser.add_argument("--md", help="md options")
    parser.add_argument("--center", help="center options")
    parser.add_argument("--kind", help="kind options")
    parser.add_argument("--tail", help="tail options")
    parser.add_argument("--interpolation", help="interpolation options")
    parser.add_argument("--statistic", help="statistic options")

    args = parser.parse_args()
    infile = args.infile
    outfile = open(args.outfile, "w+")
    test_id = args.test_id
    nf = args.nf
    mf = args.mf
    imbda = args.imbda
    inclusive1 = args.inclusive1
    inclusive2 = args.inclusive2
    sample0 = 0
    sample1 = 0
    sample2 = 0
    if args.sample_cols is not None:
        sample0 = 1
        barlett_samples = []
        for sample in args.sample_cols.split(";"):
            barlett_samples.append(map(int, sample.split(",")))
    if args.sample_one_cols is not None:
        sample1 = 1
        sample_one_cols = args.sample_one_cols.split(",")
    if args.sample_two_cols is not None:
        sample_two_cols = args.sample_two_cols.split(",")
        sample2 = 1
    for line in open(infile):
        sample_one = []
        sample_two = []
        cols = line.strip().split("\t")
        if sample0 == 1:
            b_samples = columns_to_values(barlett_samples, line)
        if sample1 == 1:
            for index in sample_one_cols:
                sample_one.append(cols[int(index) - 1])
        if sample2 == 1:
            for index in sample_two_cols:
                sample_two.append(cols[int(index) - 1])

        if test_id.strip() == "describe":
            size, min_max, mean, uv, bs, bk = stats.describe(map(float, sample_one))
            cols.append(size)
            cols.append(min_max)
            cols.append(mean)
            cols.append(uv)
            cols.append(bs)
            cols.append(bk)
        elif test_id.strip() == "mode":
            vals, counts = stats.mode(map(float, sample_one))
            cols.append(vals)
            cols.append(counts)
        elif test_id.strip() == "nanmean":
            m = stats.nanmean(map(float, sample_one))
            cols.append(m)
        elif test_id.strip() == "nanmedian":
            m = stats.nanmedian(map(float, sample_one))
            cols.append(m)
        elif test_id.strip() == "kurtosistest":
            z_value, p_value = stats.kurtosistest(map(float, sample_one))
            cols.append(z_value)
            cols.append(p_value)
        elif test_id.strip() == "variation":
            ra = stats.variation(map(float, sample_one))
            cols.append(ra)
        elif test_id.strip() == "itemfreq":
            freq = stats.itemfreq(map(float, sample_one))
            for list in freq:
                elements = ",".join(map(str, list))
                cols.append(elements)
        elif test_id.strip() == "boxcox_llf":
            IIf = stats.boxcox_llf(imbda, map(float, sample_one))
            cols.append(IIf)
        elif test_id.strip() == "tiecorrect":
            fa = stats.tiecorrect(map(float, sample_one))
            cols.append(fa)
        elif test_id.strip() == "rankdata":
            r = stats.rankdata(map(float, sample_one), method=args.md)
            cols.append(r)
        elif test_id.strip() == "nanstd":
            s = stats.nanstd(map(float, sample_one), bias=args.bias)
            cols.append(s)
        elif test_id.strip() == "anderson":
            A2, critical, sig = stats.anderson(map(float, sample_one), dist=args.dist)
            cols.append(A2)
            for list in critical:
                cols.append(list)
            cols.append(",")
            for list in sig:
                cols.append(list)
        elif test_id.strip() == "binom_test":
            p_value = stats.binom_test(map(float, sample_one), n=args.n, p=args.p)
            cols.append(p_value)
        elif test_id.strip() == "gmean":
            gm = stats.gmean(map(float, sample_one), dtype=args.dtype)
            cols.append(gm)
        elif test_id.strip() == "hmean":
            hm = stats.hmean(map(float, sample_one), dtype=args.dtype)
            cols.append(hm)
        elif test_id.strip() == "kurtosis":
            k = stats.kurtosis(map(float, sample_one), axis=args.axis, fisher=args.fisher, bias=args.bias)
            cols.append(k)
        elif test_id.strip() == "moment":
            n_moment = stats.moment(map(float, sample_one), n=args.n)
            cols.append(n_moment)
        elif test_id.strip() == "normaltest":
            k2, p_value = stats.normaltest(map(float, sample_one))
            cols.append(k2)
            cols.append(p_value)
        elif test_id.strip() == "skew":
            skewness = stats.skew(map(float, sample_one), bias=args.bias)
            cols.append(skewness)
        elif test_id.strip() == "skewtest":
            z_value, p_value = stats.skewtest(map(float, sample_one))
            cols.append(z_value)
            cols.append(p_value)
        elif test_id.strip() == "sem":
            s = stats.sem(map(float, sample_one), ddof=args.ddof)
            cols.append(s)
        elif test_id.strip() == "zscore":
            z = stats.zscore(map(float, sample_one), ddof=args.ddof)
            for list in z:
                cols.append(list)
        elif test_id.strip() == "signaltonoise":
            s2n = stats.signaltonoise(map(float, sample_one), ddof=args.ddof)
            cols.append(s2n)
        elif test_id.strip() == "percentileofscore":
            p = stats.percentileofscore(map(float, sample_one), score=args.score, kind=args.kind)
            cols.append(p)
        elif test_id.strip() == "bayes_mvs":
            c_mean, c_var, c_std = stats.bayes_mvs(map(float, sample_one), alpha=args.alpha)
            cols.append(c_mean)
            cols.append(c_var)
            cols.append(c_std)
        elif test_id.strip() == "sigmaclip":
            c, c_low, c_up = stats.sigmaclip(map(float, sample_one), low=args.m, high=args.n)
            cols.append(c)
            cols.append(c_low)
            cols.append(c_up)
        elif test_id.strip() == "kstest":
            d, p_value = stats.kstest(
                map(float, sample_one), cdf=args.cdf, N=args.N,
                alternative=args.alternative, mode=args.mode
            )
            cols.append(d)
            cols.append(p_value)
        elif test_id.strip() == "chi2_contingency":
            chi2, p, dof, ex = stats.chi2_contingency(
                map(float, sample_one), correction=args.correction, lambda_=args.lambda_
            )
            cols.append(chi2)
            cols.append(p)
            cols.append(dof)
            cols.append(ex)
        elif test_id.strip() == "tmean":
            if nf == 0 and mf == 0:
                mean = stats.tmean(map(float, sample_one))
            else:
                mean = stats.tmean(map(float, sample_one), (mf, nf), (inclusive1, inclusive2))
            cols.append(mean)
        elif test_id.strip() == "tmin":
            if mf == 0:
                min = stats.tmin(map(float, sample_one))
            else:
                min = stats.tmin(map(float, sample_one), lowerlimit=mf, inclusive=args.inclusive)
            cols.append(min)
        elif test_id.strip() == "tmax":
            if nf == 0:
                max = stats.tmax(map(float, sample_one))
            else:
                max = stats.tmax(map(float, sample_one), upperlimit=nf, inclusive=args.inclusive)
            cols.append(max)
        elif test_id.strip() == "tvar":
            if nf == 0 and mf == 0:
                var = stats.tvar(map(float, sample_one))
            else:
                var = stats.tvar(map(float, sample_one), (mf, nf), (inclusive1, inclusive2))
            cols.append(var)
        elif test_id.strip() == "tstd":
            if nf == 0 and mf == 0:
                std = stats.tstd(map(float, sample_one))
            else:
                std = stats.tstd(map(float, sample_one), (mf, nf), (inclusive1, inclusive2))
            cols.append(std)
        elif test_id.strip() == "tsem":
            if nf == 0 and mf == 0:
                s = stats.tsem(map(float, sample_one))
            else:
                s = stats.tsem(map(float, sample_one), (mf, nf), (inclusive1, inclusive2))
            cols.append(s)
        elif test_id.strip() == "scoreatpercentile":
            if nf == 0 and mf == 0:
                s = stats.scoreatpercentile(
                    map(float, sample_one), map(float, sample_two),
                    interpolation_method=args.interpolation
                )
            else:
                s = stats.scoreatpercentile(
                    map(float, sample_one), map(float, sample_two), (mf, nf),
                    interpolation_method=args.interpolation
                )
            for list in s:
                cols.append(list)
        elif test_id.strip() == "relfreq":
            if nf == 0 and mf == 0:
                rel, low_range, binsize, ex = stats.relfreq(map(float, sample_one), args.b)
            else:
                rel, low_range, binsize, ex = stats.relfreq(map(float, sample_one), args.b, (mf, nf))
            for list in rel:
                cols.append(list)
            cols.append(low_range)
            cols.append(binsize)
            cols.append(ex)
        elif test_id.strip() == "binned_statistic":
            if nf == 0 and mf == 0:
                st, b_edge, b_n = stats.binned_statistic(
                    map(float, sample_one), map(float, sample_two),
                    statistic=args.statistic, bins=args.b
                )
            else:
                st, b_edge, b_n = stats.binned_statistic(
                    map(float, sample_one), map(float, sample_two),
                    statistic=args.statistic, bins=args.b, range=(mf, nf),
                )
            cols.append(st)
            cols.append(b_edge)
            cols.append(b_n)
        elif test_id.strip() == "threshold":
            if nf == 0 and mf == 0:
                o = stats.threshold(map(float, sample_one), newval=args.new)
            else:
                o = stats.threshold(map(float, sample_one), mf, nf, newval=args.new)
            for list in o:
                cols.append(list)
        elif test_id.strip() == "trimboth":
            o = stats.trimboth(map(float, sample_one), proportiontocut=args.proportiontocut)
            for list in o:
                cols.append(list)
        elif test_id.strip() == "trim1":
            t1 = stats.trim1(map(float, sample_one), proportiontocut=args.proportiontocut, tail=args.tail)
            for list in t1:
                cols.append(list)
        elif test_id.strip() == "histogram":
            if nf == 0 and mf == 0:
                hi, low_range, binsize, ex = stats.histogram(map(float, sample_one), args.b)
            else:
                hi, low_range, binsize, ex = stats.histogram(map(float, sample_one), args.b, (mf, nf))
            cols.append(hi)
            cols.append(low_range)
            cols.append(binsize)
            cols.append(ex)
        elif test_id.strip() == "cumfreq":
            if nf == 0 and mf == 0:
                cum, low_range, binsize, ex = stats.cumfreq(map(float, sample_one), args.b)
            else:
                cum, low_range, binsize, ex = stats.cumfreq(map(float, sample_one), args.b, (mf, nf))
            cols.append(cum)
            cols.append(low_range)
            cols.append(binsize)
            cols.append(ex)
        elif test_id.strip() == "boxcox_normmax":
            if nf == 0 and mf == 0:
                ma = stats.boxcox_normmax(map(float, sample_one))
            else:
                ma = stats.boxcox_normmax(map(float, sample_one), (mf, nf), method=args.method)
            cols.append(ma)
        elif test_id.strip() == "boxcox":
            if imbda == 0:
                box, ma, ci = stats.boxcox(map(float, sample_one), alpha=args.alpha)
                cols.append(box)
                cols.append(ma)
                cols.append(ci)
            else:
                box = stats.boxcox(map(float, sample_one), imbda, alpha=args.alpha)
                cols.append(box)
        elif test_id.strip() == "histogram2":
            h2 = stats.histogram2(map(float, sample_one), map(float, sample_two))
            for list in h2:
                cols.append(list)
        elif test_id.strip() == "ranksums":
            z_statistic, p_value = stats.ranksums(map(float, sample_one), map(float, sample_two))
            cols.append(z_statistic)
            cols.append(p_value)
        elif test_id.strip() == "ttest_1samp":
            t, prob = stats.ttest_1samp(map(float, sample_one), map(float, sample_two))
            for list in t:
                cols.append(list)
            for list in prob:
                cols.append(list)
        elif test_id.strip() == "ansari":
            AB, p_value = stats.ansari(map(float, sample_one), map(float, sample_two))
            cols.append(AB)
            cols.append(p_value)
        elif test_id.strip() == "linregress":
            slope, intercept, r_value, p_value, stderr = stats.linregress(
                map(float, sample_one), map(float, sample_two)
            )
            cols.append(slope)
            cols.append(intercept)
            cols.append(r_value)
            cols.append(p_value)
            cols.append(stderr)
        elif test_id.strip() == "pearsonr":
            cor, p_value = stats.pearsonr(map(float, sample_one), map(float, sample_two))
            cols.append(cor)
            cols.append(p_value)
        elif test_id.strip() == "pointbiserialr":
            r, p_value = stats.pointbiserialr(map(float, sample_one), map(float, sample_two))
            cols.append(r)
            cols.append(p_value)
        elif test_id.strip() == "ks_2samp":
            d, p_value = stats.ks_2samp(map(float, sample_one), map(float, sample_two))
            cols.append(d)
            cols.append(p_value)
        elif test_id.strip() == "mannwhitneyu":
            mw_stats_u, p_value = stats.mannwhitneyu(
                map(float, sample_one), map(float, sample_two),
                use_continuity=args.mwu_use_continuity
            )
            cols.append(mw_stats_u)
            cols.append(p_value)
        elif test_id.strip() == "zmap":
            z = stats.zmap(map(float, sample_one), map(float, sample_two), ddof=args.ddof)
            for list in z:
                cols.append(list)
        elif test_id.strip() == "ttest_ind":
            mw_stats_u, p_value = stats.ttest_ind(
                map(float, sample_one), map(float, sample_two), equal_var=args.equal_var
            )
            cols.append(mw_stats_u)
            cols.append(p_value)
        elif test_id.strip() == "ttest_rel":
            t, prob = stats.ttest_rel(map(float, sample_one), map(float, sample_two), axis=args.axis)
            cols.append(t)
            cols.append(prob)
        elif test_id.strip() == "mood":
            z, p_value = stats.mood(map(float, sample_one), map(float, sample_two), axis=args.axis)
            cols.append(z)
            cols.append(p_value)
        elif test_id.strip() == "shapiro":
            W, p_value, a = stats.shapiro(map(float, sample_one), map(float, sample_two), args.reta)
            cols.append(W)
            cols.append(p_value)
            for list in a:
                cols.append(list)
        elif test_id.strip() == "kendalltau":
            k, p_value = stats.kendalltau(
                map(float, sample_one), map(float, sample_two),
                initial_lexsort=args.initial_lexsort
            )
            cols.append(k)
            cols.append(p_value)
        elif test_id.strip() == "entropy":
            s = stats.entropy(map(float, sample_one), map(float, sample_two), base=args.base)
            cols.append(s)
        elif test_id.strip() == "spearmanr":
            if sample2 == 1:
                rho, p_value = stats.spearmanr(map(float, sample_one), map(float, sample_two))
            else:
                rho, p_value = stats.spearmanr(map(float, sample_one))
            cols.append(rho)
            cols.append(p_value)
        elif test_id.strip() == "wilcoxon":
            if sample2 == 1:
                T, p_value = stats.wilcoxon(
                    map(float, sample_one), map(float, sample_two),
                    zero_method=args.zero_method, correction=args.correction,
                )
            else:
                T, p_value = stats.wilcoxon(
                    map(float, sample_one),
                    zero_method=args.zero_method, correction=args.correction
                )
            cols.append(T)
            cols.append(p_value)
        elif test_id.strip() == "chisquare":
            if sample2 == 1:
                rho, p_value = stats.chisquare(map(float, sample_one), map(float, sample_two), ddof=args.ddof)
            else:
                rho, p_value = stats.chisquare(map(float, sample_one), ddof=args.ddof)
            cols.append(rho)
            cols.append(p_value)
        elif test_id.strip() == "power_divergence":
            if sample2 == 1:
                stat, p_value = stats.power_divergence(
                    map(float, sample_one), map(float, sample_two),
                    ddof=args.ddof, lambda_=args.lambda_
                )
            else:
                stat, p_value = stats.power_divergence(map(float, sample_one), ddof=args.ddof, lambda_=args.lambda_)
            cols.append(stat)
            cols.append(p_value)
        elif test_id.strip() == "theilslopes":
            if sample2 == 1:
                mpe, met, lo, up = stats.theilslopes(map(float, sample_one), map(float, sample_two), alpha=args.alpha)
            else:
                mpe, met, lo, up = stats.theilslopes(map(float, sample_one), alpha=args.alpha)
            cols.append(mpe)
            cols.append(met)
            cols.append(lo)
            cols.append(up)
        elif test_id.strip() == "combine_pvalues":
            if sample2 == 1:
                stat, p_value = stats.combine_pvalues(
                    map(float, sample_one), method=args.med, weights=map(float, sample_two)
                )
            else:
                stat, p_value = stats.combine_pvalues(map(float, sample_one), method=args.med)
            cols.append(stat)
            cols.append(p_value)
        elif test_id.strip() == "obrientransform":
            ob = stats.obrientransform(*b_samples)
            for list in ob:
                elements = ",".join(map(str, list))
                cols.append(elements)
        elif test_id.strip() == "f_oneway":
            f_value, p_value = stats.f_oneway(*b_samples)
            cols.append(f_value)
            cols.append(p_value)
        elif test_id.strip() == "kruskal":
            h, p_value = stats.kruskal(*b_samples)
            cols.append(h)
            cols.append(p_value)
        elif test_id.strip() == "friedmanchisquare":
            fr, p_value = stats.friedmanchisquare(*b_samples)
            cols.append(fr)
            cols.append(p_value)
        elif test_id.strip() == "fligner":
            xsq, p_value = stats.fligner(center=args.center, proportiontocut=args.proportiontocut, *b_samples)
            cols.append(xsq)
            cols.append(p_value)
        elif test_id.strip() == "bartlett":
            T, p_value = stats.bartlett(*b_samples)
            cols.append(T)
            cols.append(p_value)
        elif test_id.strip() == "levene":
            w, p_value = stats.levene(center=args.center, proportiontocut=args.proportiontocut, *b_samples)
            cols.append(w)
            cols.append(p_value)
        elif test_id.strip() == "median_test":
            stat, p_value, m, table = stats.median_test(
                ties=args.ties, correction=args.correction, lambda_=args.lambda_, *b_samples
            )
            cols.append(stat)
            cols.append(p_value)
            cols.append(m)
            cols.append(table)
            for list in table:
                elements = ",".join(map(str, list))
                cols.append(elements)
        outfile.write("%s\n" % "\t".join(map(str, cols)))
    outfile.close()