def test(self, arr1, arr2):
    """Return the p-value of a two-sample test chosen by ``self.statistics``.

    Supported modes:

    * ``"auto"``: Levene's test checks equality of variances. If the
      variances look equal (p > 0.05) and both samples pass the Shapiro
      normality test (p > 0.05), Student's t-test is used; if the variances
      look equal but normality is rejected, Mann-Whitney U is used; if the
      variances differ, Welch's t-test is used.
    * ``"student"``, ``"welch"``, ``"mann"``: force the corresponding test.

    Any other mode returns 0, the initial value of the result.

    Args:
        arr1: first sample.
        arr2: second sample.

    Returns:
        The p-value of the selected test.
    """
    def student():
        return stats.ttest_ind(arr1, arr2)[1]

    def welch():
        # equal_var must be given by keyword: the third positional parameter
        # of ttest_ind is ``axis``, so a positional ``False`` selected axis 0
        # and silently ran Student's test instead of Welch's.
        return stats.ttest_ind(arr1, arr2, equal_var=False)[1]

    def mann():
        # `equal` is defined elsewhere; when it reports the samples as equal
        # the U test is skipped and p = 1 is returned.
        if equal(arr1, arr2):
            return 1
        return stats.mannwhitneyu(arr1, arr2)[1]

    mode = self.statistics
    if mode == "auto":
        # Levene's test for equality of variances.
        if stats.levene(arr1, arr2)[1] > 0.05:
            # Shapiro test for normality of both samples.
            both_normal = (stats.shapiro(arr1)[1] > 0.05
                           and stats.shapiro(arr2)[1] > 0.05)
            return student() if both_normal else mann()
        return welch()
    if mode == "student":
        return student()
    if mode == "welch":
        return welch()
    if mode == "mann":
        return mann()
    return 0
def rankTest(arg):
    """Return p-values comparing groups 1, 2 and 3 of ``data[arg]``.

    The result is a list of four p-values: the Kruskal-Wallis test across all
    three groups, followed by the pairwise Mann-Whitney U tests for
    (1, 2), (1, 3) and (2, 3). ``data`` is a module-level mapping.
    """
    groups = data[arg]
    first, second, third = groups[1], groups[2], groups[3]
    p_values = [stats.kruskal(first, second, third)[1]]
    for left, right in ((first, second), (first, third), (second, third)):
        p_values.append(stats.mannwhitneyu(left, right)[1])
    return p_values
def target_analysis(mirna2age, mirna2disease, mirna2target, gene2age):
    """Print a Mann-Whitney U comparison of mean target-gene ages.

    Reads a tab-separated table from a hard-coded path; each row label of the
    table is looked up in ``mirna2disease`` and each column label in
    ``gene2age``. For a row, the ages of all columns whose entry equals 1 and
    that are present in ``gene2age`` are averaged.

    ``mirna2age`` and ``mirna2target`` are accepted but not used here.
    Nothing is returned; the test result is printed.

    NOTE(review): the layout below assumes that only the ``mirnanumdis``
    bookkeeping depends on disease membership in the first loop, so the first
    list receives one mean per row of the table. Confirm against the
    original indentation.
    """
    mir_targetdb = pd.read_csv('/Users/virpatel/Desktop/pub_stuff/relevant_data/mir_target_vectordb.txt', sep='\t',index_col=[0], encoding='utf-8')
    # Column labels, used to map positions in a row back to target names.
    target_lst = list(mir_targetdb.columns.values)
    mirnanumdis = []
    mirnanumtar = []
    mir_avg_tar_age_dis = []
    mir_avg_tar_age_nondis = []
    mir_age = []
    for mir in mir_targetdb.index:
        # Number of diseases recorded for this row label (0 when absent).
        if mir not in mirna2disease:
            mirnanumdis.append(0)
        else:
            mirnanumdis.append(len(mirna2disease[mir]))
        bintarlt = mir_targetdb.loc[mir].tolist()
        mirnanumtar.append(sum(bintarlt))
        # Ages of the columns flagged with 1 that have a known age.
        tarages = [float(gene2age[target_lst[ind]]) for ind, a in enumerate(bintarlt) if target_lst[ind] in gene2age and a == 1]
        # mir_avg_tar_age_all.append(median(tarage))
        mir_avg_tar_age_dis.append(mean(tarages))
    for mir in mir_targetdb.index:
        # Second pass: only rows that are absent from mirna2disease.
        if mir not in mirna2disease:
            bintarlt = mir_targetdb.loc[mir].tolist()
            tarages = [float(gene2age[target_lst[ind]]) for ind, a in enumerate(bintarlt) if target_lst[ind] in gene2age and a == 1]
            mir_avg_tar_age_nondis.append(mean(tarages))
    print mannwhitneyu(mir_avg_tar_age_dis,mir_avg_tar_age_nondis)
def violin_nocomp(lst_for_exclusion, binary_data_frame, tipo,xentry,df_name): yes = [] datalst = [] no = [] for alpha in binary_data_frame.index: if alpha in lst_for_exclusion: datalst.append([sum(binary_data_frame.loc[alpha].tolist()),'%s miRNAs' %(tipo)]) yes.append(sum(binary_data_frame.loc[alpha].tolist())) else: datalst.append([sum(binary_data_frame.loc[alpha].tolist()),'Non-%s miRNAs' %(tipo)]) no.append(sum(binary_data_frame.loc[alpha].tolist())) print mean(yes), mean(no) print median(yes), median(no) print mannwhitneyu(yes, no) data_master = pd.DataFrame(datalst,columns=[xentry, 'miRNA Class']) sns.violinplot(x='miRNA Class',y=xentry,data=data_master, cut=0) if 'tis' in df_name: plt.gca().set_ylim([0,20]) if 'tar' in df_name: plt.gca().set_ylim([0,1000]) plt.savefig('figures/nocomp_violin_%s.pdf' %(df_name),bbox_inches='tight') plt.close()
def mann_whitneyu(data, alternative):
    """Mann-Whitney U test for two groups of samples of any length.

    Tests whether the two groups come from the same population, whatever
    its distribution.

    Args:
        data (List[numpy.array]): exactly two arrays of observations,
            group A followed by group B.
        alternative (String): alternative hypothesis for A relative to B:
            ``">"`` (A greater), ``"<"`` (A less) or ``"<>"`` (two-sided).

    Returns:
        The p-value of the test under the requested alternative.

    Raises:
        ValueError: if ``data`` does not hold exactly two groups or
            ``alternative`` is not one of the three supported symbols.

    Note:
        * It's a non-parametric test
    """
    if len(data) != 2:
        raise ValueError("2 groups are needed")

    # Let SciPy compute the requested tail directly. The previous code
    # doubled or complemented the default p-value, which is only meaningful
    # when the default is one-sided and which never told SciPy the direction.
    scipy_alternative = {"<>": "two-sided", ">": "greater", "<": "less"}
    if alternative not in scipy_alternative:
        raise ValueError("alternative must be one of '<>', '>' or '<'")

    a, b = data[0], data[1]
    _, p = stats.mannwhitneyu(a, b, alternative=scipy_alternative[alternative])
    return p
def regionalEffectSizes(subj_group, data_prior, data_post, index_lookup):
    """Summarise per-region uptakes and ranks before/after, with effect sizes.

    Args:
        subj_group: iterable of subject ids; ids missing from a data mapping
            are skipped for that mapping.
        data_prior: ``{subject: {region: value}}``.
        data_post: ``{subject: {region: (value, years)}}``; only the value
            is used.
        index_lookup: iterable of the region keys to report. Every region
            found in the data must be one of these keys.

    Returns:
        ``(uptakes_prior, ranks_prior, uptakes_post, ranks_post,
        group_effects)``. The first four map region -> ``(mean, std)``.
        ``group_effects`` maps region -> ``{'uptake_effect': ...,
        'rank_effect': ...}``, each holding the Mann-Whitney ``'pvalue'`` and
        the ``'rank_biserial'`` value ``1 - 2U / (n_prior * n_post)``.

    A region's rank is its 0-based position when one subject's regions are
    sorted by value in descending order.
    """
    def _collect(data, value_of):
        # Gather, per region, the uptake and the within-subject rank of
        # every subject of the group that is present in `data`.
        uptakes = {k: [] for k in index_lookup}
        ranks = {k: [] for k in index_lookup}
        for rid in subj_group:
            if rid not in data:
                continue
            # .items() instead of .iteritems() keeps this working on
            # Python 3 as well as Python 2.
            regions = list(data[rid].items())
            for region, entry in regions:
                uptakes[region].append(float(value_of(entry)))
            ordered = sorted(regions, key=lambda kv: value_of(kv[1]),
                             reverse=True)
            for rank, (region, _) in enumerate(ordered):
                ranks[region].append(rank)
        return uptakes, ranks

    def _mean_std(groups):
        # Collapse each region's list into (mean, standard deviation).
        return {k: (np.mean(v), np.std(v)) for k, v in groups.items()}

    def _effect(before, after):
        # Mann-Whitney p-value plus the rank-biserial value derived from U.
        u_stat, pvalue = mannwhitneyu(before, after, use_continuity=True)
        u_max = len(before) * len(after)
        return {'pvalue': pvalue,
                'rank_biserial': 1.0 - (2 * u_stat / u_max)}

    group_uptakes_prior, group_ranks_prior = _collect(
        data_prior, lambda entry: entry)
    group_uptakes_post, group_ranks_post = _collect(
        data_post, lambda entry: entry[0])

    uptakes_prior = _mean_std(group_uptakes_prior)
    ranks_prior = _mean_std(group_ranks_prior)
    uptakes_post = _mean_std(group_uptakes_post)
    ranks_post = _mean_std(group_ranks_post)

    group_effects = {}
    for k in index_lookup:
        group_effects[k] = {
            'uptake_effect': _effect(group_uptakes_prior[k],
                                     group_uptakes_post[k]),
            'rank_effect': _effect(group_ranks_prior[k],
                                   group_ranks_post[k]),
        }
    return (uptakes_prior, ranks_prior, uptakes_post, ranks_post, group_effects)
def target_gene_expression_analysis(mirna2age, mirna2disease,mirna2family,gene2age): mir_targetdb = pd.read_csv('/Users/virpatel/Desktop/pub_stuff/relevant_data/mir_target_vectordb.txt', sep='\t',index_col=[0], encoding='utf-8') mir_expdb = pd.read_csv('/Users/virpatel/Desktop/pub_stuff/relevant_data/exp_data_alldmir.txt', sep='\t',index_col=[0]) family_target_hamming = [] family_target_avg_age = [] family_perc_dis = [] # for fam in mirna2family: # family_vector = [] # mirlst = [a for a in mirna2family[fam] if a in mir_targetdb.index] # mirdislst = [a for a in mirna2family[fam] if a in mirna2disease] # if len(mirlst) < 4: continue # if len(mirdislst) < 4: continue # for mir in mirlst: # for other_mir in mirlst: # if mir == other_mir: continue # family_vector.append(hamming(mir_targetdb.loc[mir], mir_targetdb.loc[other_mir],normalized=True)) # family_target_hamming.append(std(family_vector)) # family_target_avg_age.append(round(mean([float(mirna2age[mirna]) for mirna in mirlst if mirna in mirna2age]),1)) # family_perc_dis.append(float(len(mirdislst)) / float(len(mirna2family[fam]))) target_lst = list(mir_targetdb.columns.values) mirnanumdis = [] mirnanumtar = [] mir_avg_tar_age_dis = [] mir_avg_tar_age_nondis = [] mir_age = [] for mir in mirna2disease: if mir in mir_targetdb.index: mirnanumdis.append(len(mirna2disease[mir])) bintarlt = mir_targetdb.loc[mir].tolist() mirnanumtar.append(sum(bintarlt)) tarages = [float(gene2age[target_lst[ind]]) for ind, a in enumerate(bintarlt) if target_lst[ind] in gene2age and a == 1] mir_avg_tar_age_dis.append(mean(tarages)) for mir in mir_targetdb.index: if mir not in mirna2disease: bintarlt = mir_targetdb.loc[mir].tolist() tarages = [float(gene2age[target_lst[ind]]) for ind, a in enumerate(bintarlt) if target_lst[ind] in gene2age and a == 1] mir_avg_tar_age_nondis.append(mean(tarages)) sns.boxplot(x=mir_avg_tar_age_nondis) sns.plt.show() plt.close() print mannwhitneyu(mir_avg_tar_age_dis,mir_avg_tar_age_nondis)
def return_test_results(self, arr1, arr2):
    """Run the two-sample test chosen by ``self.statistics`` and describe it.

    Supported modes:

    * ``"auto"``: if Levene's test finds equal variances (p > 0.05) and both
      samples pass the Shapiro normality test (p > 0.05), Student's t-test is
      used; equal variances without normality -> Mann-Whitney U; unequal
      variances -> Welch's t-test.
    * ``"student"``, ``"welch"``, ``"mann"``: force the corresponding test.

    Args:
        arr1: first sample.
        arr2: second sample.

    Returns:
        ``[test_name, statistic, p_value, df, levene]`` where ``df`` is
        ``len(arr1) + len(arr2) - 2`` and ``levene`` is the p-value of
        Levene's test. For an unrecognised mode the name is ``""`` and the
        statistic and p-value are 0.
    """
    def student():
        result = stats.ttest_ind(arr1, arr2)
        return "Student", result[0], result[1]

    def welch():
        # equal_var must be given by keyword: the third positional parameter
        # of ttest_ind is ``axis``, so a positional ``False`` selected axis 0
        # and silently ran Student's test instead of Welch's.
        result = stats.ttest_ind(arr1, arr2, equal_var=False)
        return "Welch", result[0], result[1]

    def mann():
        # `equal` is defined elsewhere; when it reports the samples as equal
        # the U test is skipped: no statistic, p = 1.
        if equal(arr1, arr2):
            return "Mann", None, 1
        result = stats.mannwhitneyu(arr1, arr2)
        return "Mann", result[0], result[1]

    test_name, t_value, p_value = "", 0, 0
    levene = stats.levene(arr1, arr2)[1]
    mode = self.statistics

    if mode == "auto":
        # Levene's test for equality of variances.
        if levene > 0.05:
            # Shapiro test for normality of both samples.
            both_normal = (stats.shapiro(arr1)[1] > 0.05
                           and stats.shapiro(arr2)[1] > 0.05)
            chosen = student if both_normal else mann
        else:
            chosen = welch
        test_name, t_value, p_value = chosen()
    elif mode == "student":
        test_name, t_value, p_value = student()
    elif mode == "welch":
        test_name, t_value, p_value = welch()
    elif mode == "mann":
        test_name, t_value, p_value = mann()

    df = len(arr1) + len(arr2) - 2
    return [test_name, t_value, p_value, df, levene]
def determine_significance(mesa1, mesa2):
    """
    Determines if two sets of values are statistically significant.

    In the best case, we can determine a normal distribution, and equal
    variance. Once determined we can use the independent t-test function if
    the values are of equal variance. If we have normal data, but the
    variance is unequal, the welch t-test is used.
    http://en.wikipedia.org/wiki/Student%27s_t-test#Independent_two-sample_t-test
    http://en.wikipedia.org/wiki/Student%27s_t-test#Equal_or_unequal_sample_sizes.2C_unequal_variances

    In the case where we cannot determine normality the mann-whitney u-test
    is desired to be used, but this test is only effective when there are
    greater than 20 samples.
    http://en.wikipedia.org/wiki/Mann%E2%80%93Whitney_U_test

    ``args.ttest`` / ``args.mannwhitney`` force the respective test.

    Returns a tuple ``(p, flawed, test_name)``: the two-sided p-value, a flag
    that is True when the assumptions of the chosen test are not met
    (forced t-test on data not known to be normal, or Mann-Whitney with
    fewer than 20 samples in either set), and the name of the test.
    """
    # FIXME: Is it possible to determine these things with fewer samples?
    Distribution = Enum('Distribution', 'Normal, Non_normal Unknown')
    normality = Distribution.Normal
    try:
        for sample in (mesa1, mesa2):
            k2, normal = stats.normaltest(sample)
            # FIXME: Unhardcode
            if (normal < NORMAL_CI):
                normality = Distribution.Non_normal
    except ValueError:
        normality = Distribution.Unknown

    equal_variance = is_equal_variance(mesa1, mesa2)
    t_test_name = "t-test" if equal_variance else "Welch's"

    def mann_whitney():
        # Request the two-sided p-value explicitly instead of doubling the
        # default result, which is already two-sided in current SciPy.
        u, p = stats.mannwhitneyu(mesa1, mesa2, alternative='two-sided')
        flawed = len(mesa1) < 20 or len(mesa2) < 20
        return (p, flawed, "Mann-Whitney")

    if args.ttest:
        t, p = stats.ttest_ind(mesa1, mesa2, equal_var=equal_variance)
        # A forced t-test is flawed when normality was NOT established
        # (the flag used to be inverted).
        return (p, normality != Distribution.Normal, t_test_name)
    elif args.mannwhitney:
        return mann_whitney()

    if normality == Distribution.Normal:
        error_handler = 'raise'
        # Zero variance makes the t statistic divide by zero; tolerate that
        # only in the equal-variance case.
        if np.var(mesa1) == 0 and equal_variance:
            error_handler = 'ignore'
        with np.errstate(divide=error_handler):
            t, p = stats.ttest_ind(mesa1, mesa2, equal_var=equal_variance)
        return (p, False, t_test_name)
    else:
        return mann_whitney()
def violin_comp_rel_ratio(gen_exlus_dic, hamming_df, tipo, xentry, df_name, new_df): yes = [] no = [] datalst = [] flipped_exlus = map_relatives(gen_exlus_dic) genmirtar = [str(a) for a in list(new_df.index)] for alpha in hamming_df.index: print alpha for beta in hamming_df.index: if alpha == beta: continue if alpha in genmirtar and beta in genmirtar: if alpha in flipped_exlus: if beta in flipped_exlus[alpha]: datalst.append([float(hamming_df[alpha][beta]), '%s miRNAs' %(tipo)]) yes.append( float(hamming_df[alpha][beta]) / (float(100) / float(sum(new_df.loc[alpha])))) else: datalst.append([float(hamming_df[alpha][beta]), 'Non-%s miRNAs' %(tipo)]) no.append( float(hamming_df[alpha][beta]) / (float(100) / float(sum(new_df.loc[alpha])))) else: datalst.append([float(hamming_df[alpha][beta]), 'Non-%s miRNAs' %(tipo)]) no.append(float(hamming_df[alpha][beta]) / (float(100) / float(sum(new_df.loc[alpha])))) print mean(yes), mean(no) print median(yes), median(no) print mannwhitneyu(yes, no) data_master = pd.DataFrame(datalst,columns=[xentry, 'miRNA Class']) if 'tis' in df_name: sns.boxplot(x='miRNA Class',y=xentry,data=data_master) plt.savefig('figures/comp_rel_boxplot_%s.pdf' %(df_name),bbox_inches='tight') plt.close() if 'tar' in df_name: sns.violinplot(x='miRNA Class',y=xentry,data=data_master, cut=0) plt.savefig('figures/comp_rel_violinratio_%s.pdf' %(df_name),bbox_inches='tight') plt.close()
def evaluate(x, y):
    """Mann-Whitney U test of ``x`` in the lowest vs. highest 30% of ``y``.

    Pairs in which either value is NaN are dropped. The remaining ``x``
    values are ordered by their ``y`` value, and the ``x`` values belonging
    to the lowest 30% of ``y`` are compared with those belonging to the
    highest 30%.

    Args:
        x: sequence of numbers.
        y: sequence of numbers, paired element-wise with ``x``.

    Returns:
        The result of ``mannwhitneyu(low_tail, high_tail)``.

    Raises:
        ValueError: if fewer than 4 finite pairs remain, so that a 30% tail
            would be empty.
    """
    pairs = [(xi, yi) for xi, yi in zip(x, y)
             if not isnan(xi) and not isnan(yi)]
    # The size must be taken after filtering; using the unfiltered length
    # indexed past the end of the filtered data whenever NaNs were dropped.
    size = len(pairs)
    ssize = int(floor(0.3 * size))
    if ssize == 0:
        raise ValueError("at least 4 finite (x, y) pairs are needed")

    # Apply the ordering by y; it used to be computed and then ignored, so
    # the tails were taken in input order.
    order = sorted(range(size), key=lambda k: pairs[k][1])
    x_by_y = [pairs[k][0] for k in order]
    return mannwhitneyu(x_by_y[:ssize], x_by_y[-ssize:])
def calc_utest(timings_cpu, timings_time):
    """Run two-sided Mann-Whitney U tests on CPU and wall-time samples.

    Each argument holds two samples (index 0 and 1) to compare.

    Returns:
        ``(have_optimal_repetitions, cpu_pvalue, time_pvalue)``. When any of
        the four samples has fewer than ``UTEST_MIN_REPETITIONS`` values the
        tests are skipped and ``(False, None, None)`` is returned. The first
        element is True when the smallest sample has at least
        ``UTEST_OPTIMAL_REPETITIONS`` values.
    """
    samples = (timings_time[0], timings_time[1],
               timings_cpu[0], timings_cpu[1])
    min_rep_cnt = min(len(sample) for sample in samples)

    # Does *everything* have at least UTEST_MIN_REPETITIONS repetitions?
    if min_rep_cnt < UTEST_MIN_REPETITIONS:
        return False, None, None

    def two_sided_pvalue(pair):
        return mannwhitneyu(pair[0], pair[1], alternative='two-sided').pvalue

    time_pvalue = two_sided_pvalue(timings_time)
    cpu_pvalue = two_sided_pvalue(timings_cpu)
    have_optimal_repetitions = min_rep_cnt >= UTEST_OPTIMAL_REPETITIONS
    return have_optimal_repetitions, cpu_pvalue, time_pvalue
def stats(d_lengths,dn,):
    """Append rank tests of even vs. odd length differences to file 'stats'.

    ``d_lengths`` maps a distance to ``{length_difference: count}``. Each
    distance is repeated ``count`` times in the "even" or "odd" sample,
    depending on the parity of the length difference. This is done twice:
    first with all entries, then skipping length difference 1.

    For each pass three lines are appended to the file ``stats`` in the
    current directory: the Mann-Whitney U result, the rank-sums result and
    the averages of both samples. ``dn`` is written on every line as a tag.

    Returns None.
    """
    # The function shadows the module name, so import it locally.
    from scipy import stats

    for bool_skip in [False,True,]:
        even = []
        odd = []
        for dist_min in d_lengths.keys():
            for len_diff in d_lengths[dist_min].keys():
                if bool_skip and len_diff == 1:
                    continue
                repeated = d_lengths[dist_min][len_diff]*[dist_min]
                if len_diff % 2 == 0:
                    even += repeated
                else:
                    odd += repeated

        u,p = stats.mannwhitneyu(even,odd)
        # One managed handle per pass instead of three open/close pairs, so
        # the file is closed even if a later computation raises.
        with open('stats','a') as fd:
            fd.write('mannwhitneyu u %s p %s %s %s\n' %(u,p,dn,bool_skip))

            z,p = stats.ranksums(even,odd)
            fd.write('ranksums z %s p %s %s %s\n' %(z,p,dn,bool_skip))

            average_even = sum(even)/len(even)
            average_odd = sum(odd)/len(odd)
            fd.write('average even %s odd %s %s %s\n' %(average_even,average_odd,dn,bool_skip))

    return
def compareDistributions(self, target, nontarget):
    """Return the Mann-Whitney U statistic of ``nontarget`` vs. ``target``.

    Note that the samples are handed to the test in the order
    (nontarget, target) and that only the statistic, not the p-value, is
    returned.
    """
    outcome = st.mannwhitneyu(nontarget, target)
    u_statistic = outcome[0]
    return u_statistic
def _PValues(arguments): """Performs a simulation of a comparison and returns the p-values. Starts with two normal distributions with a predetermined distance. Randomly pulls values from that distribution and calculates the running p-value as the samples grow in size, up to max_sample_size. Arguments: distance_stddev: The distance between the means of the two normal distributions, in multiples of the standard deviation. max_sample_size: The number of values to pull per sample. Returns: A list of p-values, from N=1 to N=max_sample_size. """ distance_stddev, max_sample_size = arguments a = [] b = [] p_values = [] for _ in xrange(max_sample_size): a.append(stats.norm.rvs()) b.append(stats.norm.rvs(distance_stddev)) p_values.append(stats.mannwhitneyu(a, b, alternative='two-sided').pvalue) return p_values
def regionEffectSizesBetweenGroups(group_prefix, group_one, group_two, data, index_lookup):
    """Tabulate per-region Mann-Whitney effects between two subject groups.

    Args:
        group_prefix: string prepended to the output column names.
        group_one: iterable of subject ids of the first group.
        group_two: iterable of subject ids of the second group.
        data: ``{subject: {region: value}}``; subjects missing from it are
            skipped.
        index_lookup: iterable of the region keys to report. Every region
            found in ``data`` must be one of these keys.

    Returns:
        A DataFrame indexed by region (index name ``'Region'``) with the
        columns ``<prefix>_uptake_effect_pvalue`` and
        ``<prefix>_uptake_effect_rank_biserial``, the latter being
        ``1 - 2U / (n_one * n_two)``.
    """
    def _collect(group):
        # Per-region list of the values of all group members found in data.
        uptakes = {k: [] for k in index_lookup}
        for rid in group:
            if rid not in data:
                continue
            for k in data[rid]:
                uptakes[k].append(float(data[rid][k]))
        return uptakes

    group_uptakes_one = _collect(group_one)
    group_uptakes_two = _collect(group_two)

    # calculate effect sizes
    group_effects = {}
    for k in index_lookup:
        one_uptake = group_uptakes_one[k]
        two_uptake = group_uptakes_two[k]
        u_uptake, pvalue_uptake = mannwhitneyu(one_uptake, two_uptake, use_continuity=True)
        u_max_uptake = len(one_uptake) * len(two_uptake)
        rank_biserial_uptake = 1.0 - (2*u_uptake/u_max_uptake)
        group_effects[k] = {'pvalue': pvalue_uptake,
                            'rank_biserial': rank_biserial_uptake}

    # .items() instead of .iteritems() keeps this working on Python 3 as
    # well as Python 2.
    line_data = defaultdict(dict)
    for k, v in group_effects.items():
        for eff_k, eff_v in v.items():
            line_data[k]['%s_uptake_effect_%s' % (group_prefix,eff_k)] = eff_v

    df = pd.DataFrame(dict(line_data)).T
    df.index.name = 'Region'
    return df
def ranksum(samp1, samp2):
    ''' Rank-sum (Mann-Whitney U) test of two samples.

    The test is non-parametric, so it can be used when the sample
    distributions are not Gaussian. The null hypothesis is that both samples
    come from the same distribution; a p below the chosen cutoff lets you
    reject it and claim that one sample is ranked higher (for instance,
    larger times, higher spiking rates) than the other.

    When both samples have at most 30 values the work is delegated to
    ranksum_small; otherwise scipy.stats.mannwhitneyu is used.

    Parameters
    ----------

    samp1 : array like
        A 1D array of the sample data
    samp2 : array like
        A 1D array of the sample data

    Returns
    -------

    The result of the delegated test: the U statistic and its p-value.

    '''
    size1, size2 = len(samp1), len(samp2)
    if size1 > 30 or size2 > 30:
        return mannwhitneyu(samp1, samp2)
    return ranksum_small(samp1, samp2)
def steady_state_test_rw_to_end(sink,corr=True):
    """Compare every moving-window PDF of ``sink`` with the steady-state PDF.

    The steady-state PDF comes from ``approx_steady_pdf`` and the windowed
    PDFs from ``moving_window_pdfs`` (dt=0.1, wind_dt=1.). Each window is
    compared using Welch's t-test, the two-sample Kolmogorov-Smirnov test and
    the Mann-Whitney U test.

    Returns:
        ``(time_LHS, welch, ks, U)``; each of the last three is a dict with
        numpy arrays under ``'t'`` (statistic) and ``'p'`` (p-value), one
        entry per window. Windows containing non-finite values (or compared
        against a non-finite steady-state PDF) get -1 for both.
    """
    steady_pdf = approx_steady_pdf(sink, corr=corr)
    pdfs, time_LHS = moving_window_pdfs(sink, dt=0.1, wind_dt=1., corr=corr)
    assert(pdfs.shape[0] == len(time_LHS))

    # compare pdfs
    welch = {'t': [], 'p': []}
    ks = {'t': [], 'p': []}
    U = {'t': [], 'p': []}
    comparisons = (
        (welch, lambda a, b: ttest_ind(a, b, equal_var=False)),
        (ks, lambda a, b: ks_2samp(a, b)),
        (U, lambda a, b: mannwhitneyu(a, b)),
    )

    for row in range(len(time_LHS)):
        window_pdf = pdfs[row, :]
        usable = np.all(np.isfinite(window_pdf)) and np.all(np.isfinite(steady_pdf))
        for store, compare in comparisons:
            if usable:
                outcome = compare(window_pdf, steady_pdf)
                store['t'].append(outcome[0])
                store['p'].append(outcome[1])
            else:
                # no data found between tbeg and tbeg+twid
                store['t'].append(-1)
                store['p'].append(-1)

    for store in (welch, ks, U):
        for key in ('t', 'p'):
            store[key] = np.array(store[key])
    return time_LHS, welch, ks, U
def _test(self, deltas):
    """Classify ``deltas`` as PASS, FAIL or UNRESOLVED from their fitness.

    "Passing" behavior is more like the original (slower, more energy).
    "Failing" behavior is more optimized (faster, less energy).

    The fitness samples returned by ``self.get_fitness(deltas)`` are compared
    column by column with ``self.optimized`` using the Mann-Whitney U test.
    PASS is returned as soon as a column differs significantly
    (p < ``options.alpha``) and its mean is below ``self.mean`` for that
    column. UNRESOLVED is returned when there is no fitness data or any
    fitness value is 0. Otherwise the result is FAIL.
    """
    fitness = np.array(self.get_fitness(deltas))
    if len(fitness) == 0 or np.any(fitness == 0):
        return self.UNRESOLVED

    m = np.mean(fitness, axis=0)
    s = np.std(fitness, axis=0)
    sqrtn = np.sqrt(fitness.shape[0])
    n_columns = fitness.shape[1]

    # Report each column as mean +/- 1.96 standard errors.
    for col in range(n_columns):
        infomsg("   ", m[col], "+/-", 1.96 * s[col] / sqrtn)

    for col in range(n_columns):
        reference = self.optimized[::, col]
        candidate = fitness[::, col]
        same_constant = (
            np.ptp(reference) == 0
            and np.ptp(candidate) == 0
            and self.optimized[0, col] == fitness[0, col]
        )
        if same_constant:
            # Optimized and fitness are all the same value, likely because
            # we are comparing the optimized variant to itself. This counts
            # as a fail, since they are clearly drawn from the same distro.
            continue
        pval = mannwhitneyu(reference, candidate)[1]
        if pval < options.alpha and m[col] < self.mean[col]:
            return self.PASS
    return self.FAIL
def fit(self, df_X, df_y):
    """Score every column of ``df_X`` per cluster with a Mann-Whitney U test.

    For each distinct label in the single column of ``df_y``, the rows of
    ``df_X`` carrying that label are compared (one-sided, "greater") with
    all other rows, column by column. The p-values are corrected for
    multiple testing with Benjamini-Hochberg over all cluster/column
    combinations, and ``self.act_`` is set to a DataFrame of
    ``-log10(corrected p-value)`` with one row per column of ``df_X`` and
    one column per cluster.

    A column whose test raises is reported on stderr and assigned p = 1.

    Raises:
        ValueError: if ``df_X`` and ``df_y`` differ in their number of rows,
            or ``df_y`` does not have exactly one column.
    """
    if not df_y.shape[0] == df_X.shape[0]:
        raise ValueError("number of regions is not equal")
    if df_y.shape[1] != 1:
        raise ValueError("y needs to have 1 label column")

    # calculate Mann-Whitney U p-values
    pvals = []
    clusters = df_y[df_y.columns[0]].unique()
    for cluster in clusters:
        pos = df_X[df_y.iloc[:,0] == cluster]
        neg = df_X[df_y.iloc[:,0] != cluster]
        p = []
        for m in pos:
            try:
                p.append(mannwhitneyu(pos[m], neg[m], alternative="greater")[1])
            except Exception as e:
                # Best effort: a failing column must not abort the fit.
                sys.stderr.write(str(e) + "\n")
                sys.stderr.write("motif {} failed, setting to p = 1\n".format(m))
                p.append(1)
        pvals.append(p)

    # correct for multiple testing
    pvals = np.array(pvals)
    fdr = multipletests(pvals.flatten(), method="fdr_bh")[1].reshape(pvals.shape)

    # create output DataFrame from the corrected values; the correction used
    # to be computed and then discarded in favour of the raw p-values
    self.act_ = pd.DataFrame(-np.log10(fdr.T), columns=clusters, index=df_X.columns)
def runCompare(self, objId, labelToAdd, expression1, expression2):
    """Compare one label between two subgroups and write ``report.txt``.

    The experiment is read from ``self.inputExperiment``; the two subgroups
    are selected by ``self.expression1`` and ``self.expression2`` and the
    compared values are those of ``self.labelToCompare``. Three tests of
    H0: mu1 = mu2 are reported through ``self.doublePrint``: Student's
    t-test, Welch's t-test and the Mann-Whitney U test. A test that cannot
    be computed is reported as such instead of being silently skipped.

    The positional parameters are accepted but not used here.
    """
    with open(self._getPath("report.txt"),'w') as fh:
        self.experiment = self.readExperiment(self.inputExperiment.get().fnPKPD)
        x1 = [float(x) for x in self.experiment.getSubGroupLabels(self.expression1.get(),self.labelToCompare.get())]
        x2 = [float(x) for x in self.experiment.getSubGroupLabels(self.expression2.get(),self.labelToCompare.get())]
        self.doublePrint(fh,"Values in SubGroup 1: %s"%str(x1))
        self.doublePrint(fh,"Values in SubGroup 2: %s"%str(x2))
        self.doublePrint(fh,"Testing H0: mu1=mu2")
        self.doublePrint(fh," ")

        # equal_var is passed by keyword: the third positional parameter of
        # ttest_ind is ``axis``, so the former positional True/False never
        # selected the variance assumption.
        comparisons = (
            ("T-test two independent samples (same variance)", "t",
             lambda: stats.ttest_ind(np.asarray(x1,np.double),
                                     np.asarray(x2,np.double),
                                     equal_var=True)),
            ("T-test two independent samples (different variance, Welch's test)", "t",
             lambda: stats.ttest_ind(x1, x2, equal_var=False)),
            ("Mann-Whitney U test for two independent samples", "u",
             lambda: stats.mannwhitneyu(x1, x2, use_continuity=True)),
        )
        for title, letter, compute in comparisons:
            try:
                statistic, pval = compute()
                message = "%s: %s-statistic=%f p-value=%f" % (title, letter, statistic, pval)
            except Exception as error:
                # Keep the report going, but say what went wrong.
                message = "%s could not be computed: %s" % (title, error)
            self.doublePrint(fh, message)
def steady_state_test_rw_to_rw(sink,dt=0.1,wind_dt=1.,corr=True):
    """Compare each moving-window PDF of ``sink`` with the following one.

    The windowed PDFs come from ``moving_window_pdfs``. Consecutive windows
    are compared using Welch's t-test, the two-sample Kolmogorov-Smirnov test
    and the Mann-Whitney U test.

    Returns:
        ``(time_LHS[:-1], welch, ks, U)``; each of the last three is a dict
        with lists under ``'t'`` (statistic) and ``'p'`` (p-value), one
        entry per pair of consecutive windows. Pairs containing non-finite
        values get -1 for both.
    """
    pdfs, time_LHS = moving_window_pdfs(sink, dt=dt, wind_dt=wind_dt, corr=corr)
    assert(pdfs.shape[0] == len(time_LHS))

    # compare pdfs
    welch = {'t': [], 'p': []}
    ks = {'t': [], 'p': []}
    U = {'t': [], 'p': []}
    comparisons = (
        (welch, lambda a, b: ttest_ind(a, b, equal_var=False)),
        (ks, lambda a, b: ks_2samp(a, b)),
        (U, lambda a, b: mannwhitneyu(a, b)),
    )

    for row in range(len(time_LHS) - 1):
        current = pdfs[row, :]
        following = pdfs[row + 1, :]
        usable = np.all(np.isfinite(current)) and np.all(np.isfinite(following))
        for store, compare in comparisons:
            if usable:
                outcome = compare(current, following)
                store['t'].append(outcome[0])
                store['p'].append(outcome[1])
            else:
                # no data found between tbeg and tbeg+twid
                store['t'].append(-1)
                store['p'].append(-1)
    return time_LHS[:-1], welch, ks, U
def test_loh(bins, alpha=0.0025):
    """Test each chromosome's SNP shifts and the combined others'.

    The statistical test is Mann-Whitney, a one-sided non-parametric test
    for difference in means.

    Args:
        bins: mapping of chromosome -> ``{'thisbin': values,
            'otherbins': values}``.
        alpha: significance threshold for the p-value.

    Returns:
        The chromosomes with at least 20 values whose mean shift exceeds
        that of the other bins and whose one-sided p-value is below
        ``alpha``. An empty list is returned when SciPy is not installed.
    """
    # TODO - this doesn't work right if there are many shifted regions
    try:
        from scipy import stats
    except ImportError:
        # SciPy not installed; can't test for significance
        return []

    significant_chroms = []
    for chrom, partitions in iteritems(bins):
        # np.float64 instead of the np.float_ alias, which NumPy 2.0 removed.
        these_shifts = np.array(partitions['thisbin'], np.float64)
        other_shifts = np.array(partitions['otherbins'], np.float64)
        if len(these_shifts) < 20:
            logging.info("Too few points (%d) to test chrom %s",
                         len(these_shifts), chrom)
        elif these_shifts.mean() > other_shifts.mean():
            logging.debug("\nThese ~= %f (N=%d), Other ~= %f (N=%d)",
                          these_shifts.mean(), len(these_shifts),
                          other_shifts.mean(), len(other_shifts))
            # Ask for the one-sided test explicitly; the library default is
            # two-sided in current SciPy releases.
            u, prob = stats.mannwhitneyu(these_shifts, other_shifts,
                                         alternative='greater')
            logging.info("Mann-Whitney - %s: u=%s, p=%s", chrom, u, prob)
            if prob < alpha:
                significant_chroms.append(chrom)

    return significant_chroms
def significance(self, fun, other, test="wilcoxon"):
    """computes stats significance of difference between two sets of scores

    test can be paired wilcoxon, mannwhitney for indep samples, or
    paired ttest; "all" runs the three of them.

    Args:
        fun: scoring function handed to ``map_doc_scores`` of both objects.
        other: object whose scores are compared with those of ``self``.
        test: "wilcoxon", "ttest", "mannwhitney" or "all".

    Returns:
        A dict mapping the name of every test that was run ("wilcoxon",
        "paired ttest", "mannwhitney") to its p-value.

    Raises:
        ValueError: if a paired test is requested for score lists of
            different lengths.
    """
    scores1 = self.map_doc_scores(fun)
    scores2 = other.map_doc_scores(fun)
    if not isinstance(scores1[0], (float, int)):
        # TODO: this is suspicious
        scores1 = [x for x, _ in scores1]
        scores2 = [x for x, _ in scores2]

    # The former check compared scores1 with itself and could never fail.
    # Only the paired tests need one score per document on both sides.
    paired = test in ("wilcoxon", "ttest", "all")
    if paired and len(scores1) != len(scores2):
        raise ValueError("paired tests need score lists of equal length "
                         "(got %d and %d)" % (len(scores1), len(scores2)))

    results = {}
    if test == "wilcoxon" or test == "all":
        results["wilcoxon"] = wilcoxon(scores1, scores2)[1]
    if test == "ttest" or test == "all":
        results["paired ttest"] = ttest_rel(scores1, scores2)[1]
    if test == "mannwhitney" or test == "all":
        results["mannwhitney"] = mannwhitneyu(scores1, scores2)[1]
    return results
def mann_whitney_plus_means(turnstile_weather):
    '''
    Compare hourly entries on rainy and non-rainy records.

    Consumes the turnstile_weather dataframe and splits its ENTRIESn_hourly
    column by the value of the rain column (1 = rain, 0 = no rain).

    Returns, in this order:
        1) the mean of entries with rain
        2) the mean of entries without rain
        3) the Mann-Whitney U-statistic and p-value comparing the number of
           entries with rain and the number of entries without rain

    Function documentation:
    http://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.mannwhitneyu.html
    http://docs.scipy.org/doc/numpy/reference/generated/numpy.mean.html

    The final turnstile weather data can be inspected at:
    https://www.dropbox.com/s/meyki2wl9xfa7yk/turnstile_data_master_with_weather.csv
    '''

    ### YOUR CODE HERE ###
    rainy_rows = turnstile_weather[turnstile_weather['rain'] == 1]
    dry_rows = turnstile_weather[turnstile_weather['rain'] == 0]
    with_rain_data = rainy_rows['ENTRIESn_hourly']
    without_rain_data = dry_rows['ENTRIESn_hourly']

    with_rain_mean = with_rain_data.mean()
    without_rain_mean = without_rain_data.mean()

    outcome = stats.mannwhitneyu(with_rain_data, without_rain_data)
    U, p = outcome[0], outcome[1]

    return with_rain_mean, without_rain_mean, U, p # leave this line for the grader
def do_significance_test(tpx_feature, test="Wilcoxon Ranksum"):
    """
    Do significance testing to see if the two distributions differ
    significantly. If p <= 0.05, we are highly confident that the
    distributions differ significantly.

    The metadata table (``md_csv``) and ``tpx-corpus-counts.csv`` are read
    from ``wdir`` and joined on their index. The values of ``tpx_feature``
    are then split by the ``subgenre_hist`` metadata column into
    "historical" and "not_historical" and compared.

    Arguments:
    tpx_feature (string): Name of the temporal expression feature to test
    test (string): which test to do: "Mann Whitney" for the Mann-Whitney U
        test; any other value runs the Wilcoxon rank-sum test.

    Returns:
    The result object of the chosen SciPy test.
    """
    # pd.DataFrame.from_csv was removed in pandas 1.0; this is the
    # equivalent read_csv call with the same defaults.
    md_table = pd.read_csv(os.path.join(wdir, md_csv), header=0,
                           index_col=0, parse_dates=True)
    ht_table = pd.read_csv(os.path.join(wdir, "tpx-corpus-counts.csv"),
                           header=0, index_col=0, parse_dates=True)
    working_table = ht_table.join(md_table)

    # get data points
    data = copy.copy(working_table[tpx_feature])

    # get ids of historical and of non-historical novels
    idnos_hist = md_table[md_table["subgenre_hist"] == "historical"].index.tolist()
    idnos_not_hist = md_table[md_table["subgenre_hist"] == "not_historical"].index.tolist()

    # split data into subgroups
    data_hist = data[idnos_hist]
    data_not_hist = data[idnos_not_hist]

    if test == "Mann Whitney":
        test_stat = stats.mannwhitneyu(data_hist, data_not_hist)
    else:
        # do Wilcoxon Ranksum by default
        test_stat = stats.ranksums(data_hist, data_not_hist)
    return test_stat
def mannwhitneyu(var1, var2):
    """Print the Mann-Whitney U comparison of two named value sets.

    ``var1`` and ``var2`` are keys of the module-level ``allvals_dict``. The
    result is written to stderr together with an 'NE'/' E' verdict for each
    of the two fixed p-value cut-offs (0.01116, labelled a=10%, and 0.00568,
    labelled a=5%). If anything goes wrong, a failure line is printed to
    stdout instead. Nothing is returned.
    """
    try:
        res = STATS.mannwhitneyu(allvals_dict[var1], allvals_dict[var2])
        p_value = res[1]
        verdict_10 = 'NE' if p_value < .01116 else ' E'
        verdict_5 = 'NE' if p_value < .00568 else ' E'
        line = '%4s vs %s u,p=%r => \t%s @a=10%%, %s @a=5%%' % (
            var1, var2, res, verdict_10, verdict_5)
        print(line, file=sys.stderr)
    except Exception as e:
        print('%4s vs %s failed: %r' % (var1, var2, e))
def analyze_pairwise_mi_dict(mi_dict): """Given an mi_dict as returned by compute_motif_pairwise_mis, pretty print the results""" motif_width = max(j for (i,j) in mi_dict) + 1 tests = len(mi_dict) positives = 0 adjacents = 0 positive_adjacents = 0 obs_p_dict = {} adjacent_mis = [] non_adjacent_mis = [] for i,j in sorted(mi_dict): mi_obs,p_val = mi_dict[(i,j)] positive = p_val < 0.05 positives += positive adjacent = (j == i + 1) adjacents += adjacent if adjacent: adjacent_mis.append(mi_obs) else: non_adjacent_mis.append(mi_obs) positive_adjacents += positive * adjacent mi_test_string = "POSITIVE" if positive else "negative" obs_p_dict[(i,j)] = (mi_obs,p_val) #print i,j,mi_obs,p_val,mi_test_string,("adjacent" if adjacent else "") print "Motif had width:",motif_width print "tests:",tests print "positives:",positives print "positive_rate:",positives/float(tests) print "adjacents:",adjacents print "positive_adjacents",positive_adjacents print "positive_adjacents/positives:",positive_adjacents/float(positives) if positives else 0 print "adjacencts/tests:",adjacents/float(tests) print "Adjacent mis higher:",mean(adjacent_mis),mean(non_adjacent_mis),mannwhitneyu(adjacent_mis,non_adjacent_mis) return obs_p_dict
def utest(self, score):
    """
    Gives the Mann-Withney U test probability that the score is
    random.  See:

        Mason & Graham (2002) Areas beneath the relative operating
        characteristics (ROC) and relative operating levels (ROL)
        curves: Statistical significance and interpretation

    Note (1): P-values below ~1e-16 are reported as 0.0.
    See zprob() in Biskit.Statistics.stats!

    Note (2): the P-value does not distinguish between positive
    and negative deviations from random -- a ROC area of 0.1 will
    get the same P-value as a ROC area of 0.9.

    The scores are split by the mask C{self.positives} into the scores of
    the positive and of the remaining items; these two samples are compared.

    @param score: the score predicted for each item
    @type  score: [ float ]

    @return: 1-tailed P-value
    @rtype: float
    """
    is_positive = self.positives
    # Both samples are handed over in reversed order, as before.
    positive_scores = N.compress(is_positive, score)[::-1].tolist()
    negative_scores = N.compress(N.logical_not(is_positive), score)[::-1].tolist()

    outcome = stats.mannwhitneyu(positive_scores, negative_scores)
    return outcome[1]
def mann_whitney(data1, data2):
    """
    Non-parametric test for two independent samples.

    Returns the result of the Mann-Whitney U test of data1 against data2
    exactly as produced by the library call.
    """
    outcome = st.mannwhitneyu(data1, data2)
    return outcome
# p-values of the two equality-of-variance tests (Levene and Fligner),
# one entry per tested column.
pvarianzalev = []
pvarianzaflig = []
## statistical tests
# Columns are visited by integer label starting at 1; column 0 is skipped.
# df0 and df1 hold the class 0 and class 1 data (per the notes below);
# the remaining result lists are created earlier in the script.
for i in range(1, len(df.iloc[0])):
    stat, p = normaltest(df[i])  # normality of the column
    pvnormal.append(p)
    stat, p = normaltest(df0[i])  # normality of the column, class 0
    pvnormalclass0.append(p)
    stat, p = normaltest(df1[i])  # normality of the column, class 1
    pvnormalclass1.append(p)
    stat, p = mannwhitneyu(df0[i], df1[i])  # test that both classes share the same distribution
    pvdistribution.append(p)
    stat, p = ttest_ind(df0[i], df1[i])  # different means
    pvmean.append(p)
    stat, p = levene(df0[i], df1[i])  # test of equality of variances
    pvarianzalev.append(p)
    stat, p = fligner(df0[i], df1[i])  # test of equality of variances
    pvarianzaflig.append(p)
# Notes:
# The Shapiro test is not used because there are indications that it
# performs poorly with large amounts of data.
def makeBoxPlots(comparison, Map):
    """Compare two groups of histogram-distance results and report on them.

    Loads the two ``*_data_points.txt`` result files selected by
    ``comparison`` for the map ``Map`` and, for the 'hellinger' metric,
    prints both group medians, the Mann-Whitney U result (also for growing
    prefixes of the second group) and the box-plot whisker positions.

    Args:
        comparison: key selecting the pair of result files (see the branches
            below).
        Map: map name; used in the directory, the file names and the titles.

    Raises:
        SystemExit: with a 'Bad comparison' message for an unknown
            ``comparison``; also unconditionally once the statistics have
            been printed (see the NOTE in the body).
    """
    root = '/space/jazz/1/users/gwarner/histograms/' + Map + '/'
    if comparison == 'gesiemens1000':
        names = [
            'histdist_results_GE_manufacturers_1000_%s_data_points.txt' % Map,
            'histdist_results_SIEMENS_manufacturers_1000_%s_data_points.txt' % Map
        ]
    elif comparison == 'between':
        names = [
            'GE_data_points.txt',
            'histdist_results_SIEMENS_%s_data_points.txt' % Map
        ]
    elif comparison == '7001000':
        names = [
            'histdist_results_SIEMENS_manufacturers_700_%s_data_points.txt' % Map,
            'histdist_results_SIEMENS_manufacturers_1000_%s_data_points.txt' % Map
        ]
    elif comparison == 'between1.5T':
        names = [
            'histdist_results_GE_manufacturers_1.5_%s_data_points.txt' % Map,
            'histdist_results_SIEMENS_manufacturers_1.5_%s_data_points.txt' % Map
        ]
    elif comparison == 'Siemens3Tvs1.5T':
        names = [
            'histdist_results_SIEMENS_manufacturers_1.5_%s_data_points.txt' % Map,
            'histdist_results_SIEMENS_manufacturers_3.0_%s_data_points.txt' % Map
        ]
    elif comparison == '1.5vs3.0SiemensDirsBval':
        names = [
            'histdist_results_SIEMENS_1000_bval_30_dirs_1.5T_%s_data_points.txt' % Map,
            'histdist_results_SIEMENS_1000_bval_30_dirs_3.0T_%s_data_points.txt' % Map
        ]
    elif comparison == 'GESiemensBvalFieldDirControlled':
        names = [
            'histdist_results_GE_1000_bval_30_dirs_1.5T_%s_data_points.txt' % Map,
            'histdist_results_SIEMENS_1000_bval_30_dirs_1.5T_%s_data_points.txt' % Map
        ]
    elif comparison == 'GE6vs25DirBvalFieldControlled':
        names = [
            'histdist_results_GE_6_Dirs_1.5T_1000_bval_%s_data_points.txt' % Map,
            'histdist_results_GE_25_Dirs_1.5T_1000_bval_%s_data_points.txt' % Map
        ]
    else:
        sys.exit('Bad comparison')
    files = [root + x for x in names]

    # "canberra" was listed twice; one pass per metric is enough
    metrics = ["canberra", "cityblock", "euclidean", "chebyshev", "hellinger"]
    data = {}
    for x in metrics:
        data[x] = {}
    for spreadsheet in files:
        # each file holds one Python literal (a dict), possibly over several lines
        with open(spreadsheet, 'r') as handle:
            lines = handle.readlines()
        oneline = ' '.join(lines).replace('\n', '')
        dataDict = ast.literal_eval(oneline)
        for x in metrics:
            vals = dataDict[x][1]  # zeroth index is x values, we want y values
            vals.sort()
            data[x][spreadsheet] = np.array(vals)

    for metric in ['hellinger']:
        info, labels = [], []
        # sorted() works on both Python 2 and 3 (keys().sort() is Py2-only)
        for f in sorted(data[metric]):
            values = data[metric][f]
            info.append(values)
            # the group name is read from a fixed "_"-separated position of
            # the full path; the position depends on the file-name pattern
            parts = f.split('_')
            if comparison in ('gesiemens1000', 'between1.5T',
                              'GESiemensBvalFieldDirControlled'):
                man = parts[2]
            elif comparison == '7001000':
                man = parts[4]
            elif comparison == 'between':
                man = 'Siemens' if 'SIEMENS' in f else 'GE'
            elif comparison == 'Siemens3Tvs1.5T':
                man = parts[4] + 'T'
            elif comparison == '1.5vs3.0SiemensDirsBval':
                man = parts[7]
            elif comparison == 'GE6vs25DirBvalFieldControlled':
                man = parts[3] + ' Directions'
            else:
                # sys.exit() accepts a single argument; build one message
                sys.exit('Bad comparison ' + comparison)
            labels.append(man + ' (n=' + str(len(values)) + ')')

        # single-argument print(...) gives the same text on Python 2 and 3
        print('\n\n' + Map)
        print(comparison + ' ' + metric)
        print(info[0])
        print('%s %s' % (len(info[0]), len(info[1])))
        median1, median2 = np.median(info[0]), np.median(info[1])
        print(str(labels[0]) + ': ' + str(median1))
        print(str(labels[1]) + ': ' + str(median2))
        print('Mann-Whitney U: ' + str(mannwhitneyu(info[0], info[1])))
        # p-value against growing prefixes (steps of 200) of the second group
        pprint([(i, mannwhitneyu(info[0], info[1][:i])[1])
                for i in range(200, len(info[1]), 200)])
        m = plt.boxplot(x=info, labels=labels)
        whiskers = [item.get_ydata() for item in m['whiskers']]
        print('Whiskers for ' + str(labels[0]))
        print(whiskers[:2])
        print('Whiskers for ' + str(labels[1]))
        print(whiskers[-2:])
        print('Max minus median for ' + str(labels[0]) + ': ' +
              str(float(whiskers[1][1]) - float(median1)))
        print('Max minus median for ' + str(labels[1]) + ': ' +
              str(float(whiskers[3][1]) - float(median2)))
        # NOTE(review): this unconditional exit ends the program once the
        # statistics are printed, so the title/savefig code below never
        # runs.  Kept to preserve current behaviour -- presumably a
        # debugging stop; confirm before removing.
        sys.exit()

        plt.xticks(fontsize=14)
        plt.yticks(fontsize=14)
        out_dir = '/space/jazz/1/users/gwarner/boxplots/' + Map + '/'
        tag = metric + '_' + Map
        if comparison == 'gesiemens1000':
            plt.title('GE 1000 B-value vs Siemens 1000 B-value ' +
                      metric.title() + ' ' + Map)
            plt.savefig(out_dir + 'Siemens_vs_GE_1000_bval_' + tag +
                        '_boxplot.png')
        elif comparison == '7001000':
            plt.title('Siemens 1000 B-value vs Siemens 700 B-value ' +
                      metric.title() + ' ' + Map)
            plt.savefig(out_dir + 'Siemens_700_vs_Siemens_1000_bval_' + tag +
                        '_boxplot.png')
        elif comparison == 'between1.5T':
            plt.title('GE 1.5T vs Siemens 1.5T ' + metric.title() + ' ' + Map)
            plt.savefig(out_dir + 'GE_1.5T_vs_Siemens_1.5T_' + tag +
                        '_boxplot.png')
        elif comparison == 'Siemens3Tvs1.5T':
            plt.title('Siemens 1.5T vs Siemens 3.0T ' + metric.title() + ' ' +
                      Map)
            plt.savefig(out_dir + 'Siemens_1.5T_vs_Siemens_3.0T_' + tag +
                        '_boxplot.png')
            # NOTE(review): the branch header and title that used to precede
            # the next savefig are commented out, so this second savefig
            # currently belongs to the 'Siemens3Tvs1.5T' branch -- confirm
            # whether the branch should be restored.
            # elif comparison == '1.5vs3.0SiemensDirsBval':
            # plt.title('Siemens 1.5T 30 Gradient Directions 1000 B-Value vs\nSiemens 3.0T 30 Gradient Directions 1000 B-Value '+metric.title()+' '+Map)
            plt.savefig(
                out_dir +
                'Siemens_1.5T_30_Directions_1000_Bval_vs_Siemens_3.0T_30_Directions_1000_Bval_'
                + tag + '_boxplot.png')
        elif comparison == 'GESiemensBvalFieldDirControlled':
            plt.title(
                'GE vs Siemens 1000 B-Value 30 Gradient Directions 1.5T ' +
                metric.title() + ' ' + Map)
            plt.savefig(
                out_dir +
                'GE_vs_Siemens_1000_B-Value_30_Gradient_Directions_1.5T_' +
                tag + 'boxplot.png')
        elif comparison == 'GE6vs25DirBvalFieldControlled':
            plt.title(
                'GE 6 Gradient Directions 1000 b-Value 1.5T vs\nGE 25 Gradient Directions 1000 b-Value 1.5T '
                + metric.title() + ' ' + Map)
            plt.savefig(
                out_dir +
                'GE_6_Gradient_Directions_1000_Bval_1.5T_vs_GE_25_Gradient_Directions_1000_Bval_1.5T_'
                + tag + '_boxplot.png')
        else:
            plt.title('All GE vs All Siemens ' + metric.title() + ' ' + Map)
# Box-Cox transform petal length, then re-run the Shapiro test on the result
petal_length = iris["Petal.Length"]
xt, _ = stats.boxcox(petal_length)
stats.shapiro(xt)

# Visualize the difference: probability plots before and after
fig = plt.figure()

ax1 = fig.add_subplot(211)
prob = stats.probplot(x, dist=stats.norm, plot=ax1)
ax1.set_xlabel('')
ax1.set_title('Prob plot against normal distribution')

# Box-Cox transformed values
ax2 = fig.add_subplot(212)
prob = stats.probplot(xt, dist=stats.norm, plot=ax2)
ax2.set_title('Prob plot after BoxCox transformation')

# Q40 - Test whether median temp of beaver1 and median temp of beaver2 are
# equal or not.
beaver_test = stats.mannwhitneyu(beaver1.temp, beaver2.temp)
u1, p_value1 = beaver_test
print("Test statistic: ", u1, "\np-value: ", p_value1)

# Q41 - Single loop on iris data
datalist = []  # note that this is a list and not a dataframe
# keep only the columns needed from the original data
d9 = iris[["Species", "Petal.Length"]]
# one sub-frame per species
ds = [group for _species, group in d9.groupby('Species')]
# pretty-print the list for inspection; plain `ds` would work as well
pprint(ds)
len(ds)
total = list(freeze_figure['stim']) condition = list(freeze_figure['condition']) ax = sns.swarmplot(x="condition", y="stim", data=freeze_figure, color='black') ax.set_title('Percent freezing during stimulus (all animals included)') ax = sns.boxplot(x=condition, y=total, palette="Set2", showfliers=False) #ax.legend(bbox_to_anchor=(1,1)) #%% freeze_t_shock = freeze_t_shock.T freeze_t_loom = freeze_t_loom.T freeze_tl_filtered = freeze_tl_filtered.T freeze_tone = freeze_tone.T #%% T_SvsT = ss.mannwhitneyu(freeze_t_shock['stim'], freeze_tone['stim']) T_SvsT_L = ss.mannwhitneyu(freeze_t_shock['stim'], freeze_t_loom['stim']) TvsT_L = ss.mannwhitneyu(freeze_tone['stim'], freeze_t_loom['stim']) #%% print('T_SvsT =', T_SvsT) print('T_SvsT_L =', T_SvsT_L) print('TvsT_L =', TvsT_L) #%% freeze_figure_fil = pd.concat( [freeze_tl_filtered, freeze_t_shock, freeze_tone], axis=0) freeze_figure_fil[
def Find_DMR2(context, cutoff, test_method):
    """Scan fixed-size windows for differentially methylated regions.

    For every chromosome in the module-level ``union`` table, the sites of
    the given ``context`` are grouped into windows of ``region`` positions.
    Windows whose least-covered sample column has at least ``qualifiedSite``
    values are tested three ways (t-test on jittered per-sample means,
    ``stats.kstest`` and Mann-Whitney U on the pooled non-NaN site values).
    Windows with the selected p-value <= ``pvalue`` and |DeltaMean| >=
    ``cutoff`` are written to three tab-separated files:
    ``DMR_<context>_{all,hyper,hypo}_<cutoff>.txt``.

    Args:
        context: value matched against ``union['context']``.
        cutoff: minimum absolute difference of group means.
        test_method: 0 -> 'pTTest', 1 -> 'pKS', 2 -> 'pMWU'.

    Relies on the module-level names ``union``, ``region``,
    ``qualifiedSite``, ``expgroup``, ``ctrlgroup``, ``samples`` and
    ``pvalue``.
    """
    data_holder = []
    for chromosome in union['chr'].unique():
        in_scope = (union['context'] == context) & (union['chr'] == chromosome)
        subset = union[in_scope]
        maxPos = subset['pos'].max()
        # NOTE(review): range() stops before maxPos, so sites beyond the
        # last full window edge fall outside every bin -- confirm intended.
        bins = range(0, maxPos, region)
        for sRange, sValues in subset.groupby(pd.cut(subset['pos'], bins)):
            # columns from index 3 onwards are treated as per-sample values
            sample_values = sValues.iloc[:, 3:]
            if sample_values.count().min() < qualifiedSite:
                continue

            # pool every non-NaN site value of each group
            expValue2 = [
                v for row in sValues.loc[:, expgroup].values for v in row
                if not math.isnan(v)
            ]
            ctrlValue2 = [
                v for row in sValues.loc[:, ctrlgroup].values for v in row
                if not math.isnan(v)
            ]
            pKS = stats.kstest(expValue2, ctrlValue2)[1]
            pMWU = stats.mannwhitneyu(expValue2, ctrlValue2)[1]

            meanMeth = sample_values.astype(float).mean()
            meanMeth2 = [("%.3f" % x) for x in meanMeth]
            # tiny random offset added to every per-sample mean before the
            # t-test
            jitter = [random.random() * 0.00001 for _ in meanMeth]
            meanMeth3 = meanMeth + jitter
            if meanMeth.sum() == 0:
                pTTest = 1.0
            else:
                pTTest = stats.ttest_ind(meanMeth3.loc[expgroup],
                                         meanMeth3.loc[ctrlgroup])[1]

            deltaMean = (meanMeth.loc[expgroup].mean() -
                         meanMeth.loc[ctrlgroup].mean())
            row_out = [chromosome, sRange.left, sRange.right]
            row_out = row_out + meanMeth2 + [deltaMean, pTTest, pKS, pMWU]
            data_holder.append(row_out)

    header = (['Chr', 'Start', 'End'] + samples[0].tolist() +
              ['DeltaMean', 'pTTest', 'pKS', 'pMWU'])
    merge = pd.DataFrame(data_holder, columns=header)

    tests_methods = {0: 'pTTest', 1: 'pKS', 2: 'pMWU'}
    pvals = merge.loc[:, tests_methods[test_method]]
    sig = merge[pvals <= pvalue]
    large_gain = sig.DeltaMean >= cutoff
    large_loss = sig.DeltaMean <= -1 * cutoff
    sig_all = sig[large_gain | large_loss]

    prefix = 'DMR_' + context
    suffix = '_' + str(cutoff) + '.txt'
    sig_all.to_csv(prefix + '_all' + suffix, sep='\t', index=False)
    sig_all[sig_all.DeltaMean > 0].to_csv(prefix + '_hyper' + suffix,
                                          sep='\t',
                                          index=False)
    sig_all[sig_all.DeltaMean < 0].to_csv(prefix + '_hypo' + suffix,
                                          sep='\t',
                                          index=False)
def mann_whitney_plot(data,
                      condition,
                      distribution,
                      ax=None,
                      condition_value=None,
                      alternative="two-sided",
                      skip_plot=False,
                      **kwargs):
    """
    Run a Mann Whitney test comparing a distribution between the rows that
    meet a condition and those that do not, optionally drawing a box plot

    Parameters
    ----------
    data: Pandas dataframe
        Dataframe to retrieve information from
    condition: str
        Column to use as the splitting criteria
    distribution: str
        Column to use as the Y-axis or distribution in the test
    ax : Axes, default None
        Axes to plot on
    condition_value:
        If `condition` is not a binary column, split on =/!= to condition_value
    alternative:
        Specify the sidedness of the Mann-Whitney test: "two-sided", "less" or "greater"
    skip_plot:
        Calculate the test statistic and p-value, but don't plot.

    Returns
    -------
    MannWhitneyResults with U, p_value, sided_str, both series and the plot
    (None when skip_plot is set)
    """
    condition_mask = get_condition_mask(data, condition, condition_value)
    with_condition = data[condition_mask][distribution]
    without_condition = data[~condition_mask][distribution]

    U, p_value = mannwhitneyu(with_condition,
                              without_condition,
                              alternative=alternative)

    if skip_plot:
        plot = None
    else:
        plot = stripboxplot(x=condition,
                            y=distribution,
                            data=data,
                            ax=ax,
                            significant=p_value <= 0.05,
                            **kwargs)

    sided_str = sided_str_from_alternative(alternative, condition)
    message = "Mann-Whitney test: U={}, p-value={} ({})".format(
        U, p_value, sided_str)
    print(message)

    return MannWhitneyResults(U=U,
                              p_value=p_value,
                              sided_str=sided_str,
                              with_condition_series=with_condition,
                              without_condition_series=without_condition,
                              plot=plot)
####### Fig 05 b #######
# complexity table is read without a header row and used as a plain array
complexity_table = pd.read_csv('saved/graph_complexity.csv', header=None)
complexity = complexity_table.values

coastalComplexity = complexity[coastalIndices[0], coastalIndices[1] + 1]
# NOTE(review): `coastalCompelxity` (outlier-filtered, with column 0
# appended) and `coastalComplexity` (raw) are two different variables; the
# tests below use the former, the CI/SEM arrays the latter -- confirm this
# is intended and not a typo.
coastal_with_first_column = np.hstack((coastalComplexity, complexity[:, 0]))
coastalCompelxity = RejectOutliers(coastal_with_first_column, 2)
landComplexity = RejectOutliers(
    complexity[landIndices[0], landIndices[1] + 1], 2)
coralComplexity = RejectOutliers(
    complexity[coralIndices[0], coralIndices[1] + 1], 2)
highPlanComplexity = RejectOutliers(
    complexity[highPlanIndicies[0], highPlanIndicies[1] + 4], 2)

# pairwise Mann-Whitney p-values between the three habitat groups
PCoastalL = mannwhitneyu(coastalCompelxity, landComplexity)[1]
PCoralL = mannwhitneyu(landComplexity, coralComplexity)[1]
PCC = mannwhitneyu(coastalCompelxity, coralComplexity)[1]

fig, axs = plt.subplots(1, 1, figsize=(3, 7))
plt.setp(axs.spines.values(), linewidth=2)

# confidence interval of the mean (alpha 0.05) and SEM for each group
plotted_groups = (coastalComplexity, landComplexity, coralComplexity)
yconferror = np.array([
    list(sms.DescrStatsW(group).tconfint_mean(0.05))
    for group in plotted_groups
])
ysem = np.array([sem(group) for group in plotted_groups])
print(dataset.groupby('clase').size()) # Split-out validation dataset for i in Datos: print i sys.exit() #obtengo los x que son todos los elementos menos el id y la clase X = array[:, 1:len(nombres) - 1] resultados = [] for i in range(len(X)): print "###################" try: X[i][np.isnan(X[i])] = 0 Y[i][np.isnan(Y[i])] = 0 Test = [["t estudent", stats.ttest_ind(X[i], Y[i])], ["mannwhitneyu", stats.mannwhitneyu(X[i], Y[i])], ["kruskal", stats.kruskal(X[i], Y[i])]] # print atributosX[i] # print "valor clase 1 : "+str(X[i]) # print "valor clase 2 : "+str(Y[i]) data_to_plot = [X[i], Y[i]] for nombre, test in Test: print "datos" print "para hipotesis " + str(nombre) + " los datos son :" stat, p = test print "stat : " + str(stat) print "p : " + str(p) fig = plt.figure(1, figsize=(9, 6)) ax = fig.add_subplot(111)
import scipy.stats as st

# Level-3 measurements of the two groups; `nan` marks missing trials.
final_test = [165.11899394, 167.42154615, 192.18840315, 188.70493079,
              166.0677011, 200.83600747, 186.27117725, 199.42551454,
              217.20806414, 201.86719385, 95.62154468, 159.49015417,
              126.02467042, nan, nan, 194.70626691, 193.76326486,
              181.54441614, 186.17321256, nan]

ephys_test = [149.24336357, 160.1565535, 127.11240186, 149.11904847,
              127.45482092, 81.52236443, 94.86429959, 145.35673907,
              137.44541069, 126.06250692, 60.97663738, 82.70175177,
              84.6588944, 69.37222471, 87.05228826, 81.77171595,
              78.89516761, 40.03542436, 85.09368774, 89.08363123,
              88.02874388, 115.49876754, 118.60709819, 134.69194744,
              117.82368202, 108.3018049, 136.78586032, 111.54460503,
              132.41798123, 134.87473783]

test = st.ttest_ind(final_test, ephys_test, nan_policy='omit')

# The rank test must not see NaN either: drop missing values first so it is
# run on the same observations as the t-test above (NaN != NaN, so
# `v == v` keeps only real numbers).
final_valid = [v for v in final_test if v == v]
ephys_valid = [v for v in ephys_test if v == v]
test1 = st.mannwhitneyu(final_valid, ephys_valid)

# NOTE(review): only the t-test result is written out; `test1` is computed
# but not reported -- confirm whether it should be added to the file.
with open(main_folder + "level_3_comparison_reaction_time.txt", 'w') as target:
    target.write(str(mean) + str(stds) + str(test) + ' LEVEL 3: dst to poke / time to reward in secons (frame*120)*360  mean +- std, reaction_speed_ephys.py')
if prop_inf > .5: ER_ep = ER_ep + 1 ER_prop_infected.append(prop_inf) PA_ep = 0 PA_prop_infected = [] for k in range(100): prop_inf = problem4a.SIRmodel(PA, [choice(PA.nodes())], beta, delta) if prop_inf > .5: PA_ep = PA_ep + 1 PA_prop_infected.append(prop_inf) print "ER", ER_ep print "PA", PA_ep print "jazz", jazz_ep print chi2 = stats.chi2_contingency([[ER_ep, 100 - ER_ep], [jazz_ep, 100 - jazz_ep]]) print "ER vs jazz X = ", chi2[0], "p = ", chi2[1] print "ER mean proportion = ", np.mean(ER_prop_infected), print "jazz mean proportion = ", np.mean(jazz_prop_infected) mw = stats.mannwhitneyu(ER_prop_infected, jazz_prop_infected) print "U-statistic = ", mw[0], print "p = ", mw[1] print chi2 = stats.chi2_contingency([[PA_ep, 100 - PA_ep], [jazz_ep, 100 - jazz_ep]]) print "PA vs jazz X = ", chi2[0], "p = ", chi2[1] print "PA mean proportion = ", np.mean(PA_prop_infected), print "jazz mean proportion = ", np.mean(jazz_prop_infected) mw = stats.mannwhitneyu(PA_prop_infected, jazz_prop_infected) print "U-statistic = ", mw[0], print "p = ", mw[1]
# Fill cellpvalue: one row per i, one column per pair of classes (j < k).
i = 0
while i < gnumber:
    count = 0
    j = 1  # j, k are class numbers
    while j <= classes:
        k = j + 1
        while k <= classes:
            left = group_label.get_group(j)[i]
            right = group_label.get_group(k)[i]
            identical = left.equals(other=right)
            if identical:
                # identical samples are not tested; 0.5 is stored instead
                cell = 0.5
            else:
                u12, pvalue = stats.mannwhitneyu(left, right)
                cell = pvalue
            cellpvalue.loc[i, count] = cell
            count += 1
            if identical:
                print(count)
            k += 1
        j += 1
    i += 1
#cellpvalue.to_csv('cellpvalue.csv',sep=',',index=False)

# Smallest pairwise p-value per row, sorted ascending, written to pvalue.csv
pvalue = pd.DataFrame()
pvalue['targetid'] = targetid
pvalue['min_value'] = cellpvalue.min(axis=1)
pvalue = pvalue.sort_values(by="min_value", ascending=True)
pvalue.to_csv('pvalue.csv', sep=',', index=False)
def make_significance_plot_homogeneity(X,
                                       homogeneity,
                                       Category_str,
                                       my_rosetta,
                                       thecmap='viridis',
                                       NMFCOMPS=16,
                                       save=True,
                                       filename_addon='',
                                       verbose=True,
                                       maxcats=20):
    """Plot, per category, how significantly its homogeneity is elevated.

    For each of the ``maxcats`` most frequent values of ``X[Category_str]``
    a one-sided ('greater') Mann-Whitney U test compares ``homogeneity`` of
    the members against all non-members.  The p-value is multiplied by the
    number of categories and shown as ``-log10`` in a one-row heat map.

    Args:
        X: DataFrame holding the ``Category_str`` column.
        homogeneity: array of values, indexed by the boolean category mask.
        Category_str: name of the categorical column.
        my_rosetta: indexer applied to ``X[Category_str].values`` to align
            the categories with ``homogeneity``.
        thecmap: colormap name passed to ``imshow``.
        NMFCOMPS: unused here; kept so the signature matches the sibling
            plotting function.
        save: if true, save the figure as
            ``filename_addon + Category_str + 'MWhom_plot.pdf'``.
        filename_addon: prefix of the output file name.
        verbose: print per-category progress and means.
        maxcats: maximum number of categories considered.

    Returns:
        ``(CategoryChartMatrix, CategoryType)``: the float matrix that was
        plotted and the categories in plotting order.
    """
    CategoryType = X[Category_str].value_counts().keys()[0:maxcats]
    CategoryCats = X[Category_str].values[my_rosetta]
    # number of tests used for the multiple-testing factor; bound before the
    # loop so it exists even when there are no categories
    Ncats = min(maxcats, len(CategoryType))

    list_of_sig = []
    for i, cat in enumerate(CategoryType):
        if verbose:
            print('*****************')
            print('Category ', i, cat)
        CatCut = (CategoryCats == cat)
        car = mannwhitneyu(homogeneity[CatCut],
                           homogeneity[~CatCut],
                           alternative='greater')
        if verbose:
            print('mean homogeneity of ', cat, np.mean(homogeneity[CatCut]))
            print('mean homogeneity of anti-', cat,
                  np.mean(homogeneity[~CatCut]))
        # 1e-30 keeps log10 finite when the p-value is exactly zero
        adjustedp = (car[1] + 1e-30) * 1 * Ncats
        list_of_sig.append(
            [cat, len(CatCut[CatCut]), -1 * np.log10(adjustedp)])

    colnames = [Category_str.replace(" ", ""), 'Count'] + ['A']
    CategoryChart = pd.DataFrame(list_of_sig, columns=colnames)
    CategoryChartMatrix = CategoryChart.values[:, 2:].astype(float)

    plt.clf()
    plt.figure(figsize=(len(CategoryType) * 2, 6))
    plt.imshow(CategoryChartMatrix.T, cmap=thecmap, vmin=-3, vmax=15)
    plt.xlabel(Category_str, fontsize=25)
    plt.xticks(np.arange(Ncats),
               CategoryType,
               rotation='vertical',
               fontsize=25)
    # `ticklabel_size` is not a colorbar keyword and made this call fail;
    # the tick label size is set explicitly right below instead
    cbar = plt.colorbar(fraction=0.046, pad=0.04)
    cbar.set_label(r'- $\log_{10} (p*$' + str(Ncats) + r'$)$', fontsize=25)
    cbar_ax = cbar.ax
    cbar_ax.tick_params(labelsize=35)
    for tick_label in cbar_ax.get_yticklabels():
        tick_label.set_fontsize(35)
    if save:
        plt.savefig(filename_addon + Category_str + 'MWhom_plot.pdf')
    plt.show()
    return (CategoryChartMatrix, CategoryType)
result = sm.OLS(y, X).fit() rho, pval = stats.spearmanr(df2["ARM"], df2["IMS"]) print "TCGA spearman", i, rho, pval print result.summary() c["TCGA Leuk. Frac."] = -math.log(float(result.pvalues[1]), 10) p_1[i] = c df3 = df2[df2["TMB"].astype(float) > 0] if i in "6p": gain = df3[df3["ARM"] == 1] lost = df3[df3["ARM"] == -1] neu = df3[df3["ARM"] == 0] t, prob = mannwhitneyu(gain["IMS"].values, neu["IMS"].values) print "gain", i, t, prob t, prob = mannwhitneyu(lost["IMS"].values, neu["IMS"].values) print "loss", i, t, prob df4 = df3.sort(['ARM']) df4["ARM"] = df4["ARM"].map({-1: "loss", 1: "gain", 0: "none"}) df4.to_csv("Analysis/TCR/6p_sm_IMS.source.txt", sep="\t") sns.boxplot(x="ARM", y="IMS", data=df4, palette="Set2") sns.swarmplot(x="ARM", y="IMS", data=df4, color=".25") x1, x2 = 1, 2 y, h, col = 0.7, 0.025, 'k' plt.plot([x1, x1, x2, x2], [y, y + h, y + h, y], lw=1.5, c=col) plt.text((x1 + x2) * .5, y + h,
def make_significance_plot_WSO(X,
                               Basis,
                               Category_str,
                               my_rosetta,
                               thecmap='binary',
                               NMFCOMPS=16,
                               save=True,
                               filename_addon='',
                               PCAmode=False,
                               write_mode=False,
                               verbose=True,
                               maxcats=20):
    """Plot category-vs-component significance as a heat map.

    For each of the (sorted) ``maxcats`` most frequent values of
    ``X[Category_str]`` and each of the first ``NMFCOMPS`` columns of
    ``Basis``, a one-sided ('greater') Mann-Whitney U test compares the
    column values of the members against the non-members.  P-values are
    multiplied by ``NMFCOMPS * Ncats`` and shown as ``-log10``.

    Args:
        X: DataFrame holding the ``Category_str`` column.
        Basis: 2-D array; column ``c`` is tested for component ``c + 1``.
        Category_str: name of the categorical column.
        my_rosetta: indexer applied to ``X[Category_str].values`` to align
            the categories with the rows of ``Basis``.
        thecmap: colormap name passed to ``imshow``.
        NMFCOMPS: number of components (columns of ``Basis``) to test.
        save: if true, save the figure as
            ``filename_addon + Category_str + 'MWplot.pdf'``.
        filename_addon: prefix of the output file names.
        PCAmode: label the x axis 'Principal Component' instead of
            'NMF component'.
        write_mode: if true, also write the table to
            ``filename_addon + Category_str + 'MWmatrix.csv'``.
        verbose: print per-category progress.
        maxcats: maximum number of categories considered.

    Returns:
        ``(CategoryChartMatrix, categories)``: the float matrix that was
        plotted and the categories in row order.
    """
    CategoryType = X[Category_str].value_counts().keys()[0:maxcats]
    CategoryCats = X[Category_str].values[my_rosetta]
    CategoryType = np.sort(CategoryType)
    shown_cats = CategoryType[0:maxcats]
    # loop-invariant; bound up front so it exists even if no test is run
    Ncats = min(maxcats, len(shown_cats))

    list_of_sig = []
    for i, cat in enumerate(shown_cats):
        growthlist = [cat]
        if verbose:
            print('*****************')
            print('Category ', i, cat)
        CatCut = (CategoryCats == cat)
        growthlist += [len(CatCut[CatCut])]
        # separate index name: the component loop no longer shadows `i`
        for comp in range(NMFCOMPS):
            car = mannwhitneyu(Basis[:, comp][CatCut],
                               Basis[:, comp][~CatCut],
                               alternative='greater')
            # 1e-30 keeps log10 finite when the p-value is exactly zero
            adjustedp = (car[1] + 1e-30) * NMFCOMPS * Ncats
            growthlist += [-1 * np.log10(adjustedp)]
        list_of_sig.append(growthlist)

    colnames = [Category_str.replace(" ", ""), 'Count'
                ] + ['Comp' + str(c + 1) for c in range(NMFCOMPS)]
    CategoryChart = pd.DataFrame(list_of_sig, columns=colnames)
    if write_mode:
        CategoryChart.to_csv(filename_addon + Category_str + 'MWmatrix.csv',
                             sep='\t')
    CategoryChartMatrix = CategoryChart.values[:, 2:].astype(float)

    plt.clf()
    myfs = 45
    plt.figure(figsize=(35, len(shown_cats) * 2))
    plt.imshow(CategoryChartMatrix, cmap=thecmap, vmin=-3, vmax=30)
    if PCAmode:
        plt.xlabel('Principal Component', fontsize=myfs)
    else:
        plt.xlabel('NMF component', fontsize=myfs)
    plt.ylabel(Category_str, fontsize=myfs)
    plt.yticks(np.arange(Ncats),
               shown_cats,
               rotation='horizontal',
               fontsize=myfs)
    plt.xticks(np.arange(NMFCOMPS), (np.arange(NMFCOMPS) + 1).astype(str),
               rotation='vertical',
               fontsize=myfs)
    cbar = plt.colorbar(fraction=0.046, pad=0.04)
    cbar.set_label(r'- $\log_{10} (p*$' + str(NMFCOMPS) + r'$*$' +
                   str(Ncats) + r'$)$',
                   fontsize=myfs)
    cbar_ax = cbar.ax
    cbar_ax.tick_params(labelsize=myfs)
    for tick_label in cbar_ax.get_yticklabels():
        tick_label.set_fontsize(myfs)
    if save:
        plt.savefig(filename_addon + Category_str + 'MWplot.pdf',
                    bbox_inches='tight')
    plt.show()
    return (CategoryChartMatrix, shown_cats)
def test_one_cell() -> None:
    """Run the CTCF-insulator analysis for the GM12878 cell line.

    Steps, in order:
      1. Load gene/DHS correlations and keep pairs with |distance| > 5000.
      2. For each pair, write the interval between promoter and DHS to a
         temporary BED-like file and intersect it (``bedtools``) with the
         'Insulator' entries of the cRE file.
      3. Keep one value of the last intersect column per (gene, dhs_id) as
         ``score_ctcf_insulator`` and merge it back (missing -> 0).
      4. Merge with the label file, predict 1 where the score is <= 0, and
         write the feature/label table.
      5. For every feature column, compare labelled vs unlabelled pairs
         among the predicted-1 rows (one-sided Mann-Whitney U) and write the
         result table.

    Uses the module-level names ``path_origin`` and ``path_label``, shells
    out to ``grep``, ``bedtools`` and ``cut``, and writes files under
    ``path_out``.  Returns nothing.
    """
    path_out = '/lustre/tianlab/zhangyu/PEI/mid_data_correct/try_CTCF'
    term = 'GM12878'
    file_pre = '/lustre/tianlab/zhangyu/PEI/mid_data_correct/cell_line/' \
               'model_input/GM12878/correlation.txt'
    file_cre = os.path.join(path_out, 'cRE_GM12878.txt')

    # keep only gene/DHS pairs farther apart than 5000
    df_pre = pd.read_csv(file_pre, sep='\t')
    abs_distance = np.abs(df_pre['distance'])
    df_pre = df_pre.loc[abs_distance > 5000, :]

    file_promoter = path_origin + \
        '/gene/promoters.up2k.protein.gencode.v19.unique.bed'
    df_promoter = pd.read_csv(file_promoter, sep='\t', header=None)
    file_tmp = os.path.join(path_out, "input_file.tmp")

    # attach promoter columns 0, 1, 2 to each pair (promoter column 7 is
    # matched against the gene name)
    df_tmp = df_pre.loc[:, ['gene', 'dhs_id']]
    df_tmp = pd.merge(df_tmp, df_promoter,
                      left_on='gene', right_on=7, how='inner')
    df_tmp = df_tmp.loc[:, ['gene', 'dhs_id', 0, 1, 2]]
    # columns 3 and 4: the two integers of the "<start>-<end>" part that
    # follows the last ':' in dhs_id
    df_tmp[3] = df_tmp['dhs_id'].apply(
        lambda x: int(x.split(':')[-1].split('-')[0]))
    df_tmp[4] = df_tmp['dhs_id'].apply(
        lambda x: int(x.split(':')[-1].split('-')[1]))
    # columns 5 and 6: bounds of the gap between promoter and DHS --
    # (promoter col 2, DHS col 3) when DHS col 3 > promoter col 2,
    # otherwise (DHS col 4, promoter col 1)
    df_tmp[5] = df_tmp.apply(
        lambda x: x[2] if x[3] > x[2] else x[4], axis=1)
    df_tmp[6] = df_tmp.apply(
        lambda x: x[3] if x[3] > x[2] else x[1], axis=1)
    # df_tmp.loc[(df_tmp['gene'] == 'TADA2B') &
    #            (df_tmp['dhs_id'] == 'DHS<-chr4:7045626-7045765'), :]
    df_tmp_out = df_tmp.loc[:, [0, 5, 6, 'gene', 'dhs_id']]
    df_tmp_out.to_csv(file_tmp, sep='\t', header=None, index=None)

    # 'Insulator' lines of the cRE file, intersected with the gap intervals;
    # -wao also reports intervals without any overlap ('.' fields, read as
    # NaN below)
    file_cre_ctcf = os.path.join(path_out, "cRE.CTCF.tmp")
    os.system(f"grep -w 'Insulator' {file_cre} > {file_cre_ctcf}")
    file_ctcf_tmp = os.path.join(path_out, "CTCF.tmp")
    os.system(f"bedtools intersect -a {file_tmp} -b {file_cre_ctcf} -wao | "
              f"cut -f 4,5,9,10,11,17 > {file_ctcf_tmp}")
    df_ctcf = pd.read_csv(file_ctcf_tmp, sep='\t', header=None,
                          na_values='.',
                          dtype={2: 'str', 3: 'str',
                                 4: 'float', 5: 'float'})

    def unique_ctcf(df_in):
        # Reduce one (gene, dhs_id) group to columns 0, 1, 5: a single row
        # is kept as is (NaN in column 5 becomes 0); several rows are
        # reduced to those carrying the maximum of column 5.
        if df_in.shape[0] == 1:
            if np.isnan(df_in.iloc[0, 5]):
                df_in.iloc[0, 5] = 0
                df_out = df_in.loc[:, [0, 1, 5]]
            else:
                df_out = df_in.loc[:, [0, 1, 5]]
        else:
            max_ctcf = np.max(df_in[5])
            # df_out = df_in.loc[df_in[5] == max_ctcf, [0, 1, 4, 5]]
            df_out = df_in.loc[df_in[5] == max_ctcf, [0, 1, 5]]

        return df_out

    df_uniq = df_ctcf.groupby([0, 1]).apply(unique_ctcf)
    df_uniq.index = list(range(df_uniq.shape[0]))
    # df_uniq.columns = \
    #     ['gene', 'dhs_id', 'score_dhs_insulator', 'score_ctcf_insulator']
    df_uniq.columns = ['gene', 'dhs_id', 'score_ctcf_insulator']
    df_uniq = df_uniq.drop_duplicates()

    # pairs that never reached the intersect output get score 0
    df_genome_ctcf = pd.merge(df_pre, df_uniq, on=['gene', 'dhs_id'],
                              how='left')
    df_genome_ctcf = df_genome_ctcf.fillna(0)
    # temporary files are only removed when everything above succeeded
    os.remove(file_tmp)
    os.remove(file_cre_ctcf)
    os.remove(file_ctcf_tmp)
    file_out = os.path.join(path_out, f"{term}_input_file.txt")
    df_genome_ctcf.to_csv(file_out, sep='\t', index=None, na_rep='NA')

    file_corr = file_out
    file_label = os.path.join(path_label, f"{term}/{term}.txt")
    file_fea_label = os.path.join(path_out, f"{term}_feature_label.txt")
    file_res = os.path.join(path_out, f"{term}_result.txt")
    label = term

    df_corr = pd.read_csv(file_corr, sep='\t')
    df_label = pd.read_csv(file_label, sep='\t')
    # only select distal enhancer
    df_corr = \
        df_corr.loc[df_corr['type_cre'] != 'Protein-Promoter(Enhancer)', ]
    # if df_label.shape[0] == 0:
    #     return
    # every row of the label file is a positive; unmatched pairs become 0
    # through the fillna after the left merge
    df_label['label'] = np.full(df_label.shape[0], 1)
    df_combine = pd.merge(
        df_corr, df_label, how='left',
        on=['gene', 'dhs_id', 'type_cre', 'ref_dhs_id'])
    df_combine = df_combine.fillna(0)

    # first step: predict 1 where the insulator score is <= 0
    array_pred = np.full(df_combine.shape[0], 0)
    array_pred[df_combine['score_ctcf_insulator'] <= 0] = 1
    df_combine['pred'] = array_pred
    df_combine.to_csv(file_fea_label, sep='\t', index=None)
    precision = precision_score(df_combine['label'], array_pred)
    recall = recall_score(df_combine['label'], array_pred)

    # feature columns: from position 5 up to, excluding, the last two
    cols = df_combine.columns[5:-2]
    # the first result row reuses 'diff_median' for precision and 'pval'
    # for recall of the prediction above
    list_res = [{'feature': 'CTCF_pred', 'correlation': '',
                 'diff_median': precision, 'pval': recall, 'label': label}]
    df_combine_filter = df_combine.loc[df_combine['pred'] == 1, :]
    for col in cols:
        df_sub = df_combine_filter.loc[:, [col, 'label']]
        array_pos = df_sub.loc[df_sub['label'] == 1, col]
        array_neg = df_sub.loc[df_sub['label'] == 0, col]
        try:
            _, pval = mannwhitneyu(array_pos, array_neg,
                                   alternative='greater')
        except ValueError:
            # the test could not be computed for this feature
            pval = np.nan
        diff_median = np.median(array_pos) - np.median(array_neg)
        # feature, corr = col.split('|')
        list_res.append({'feature': col, 'correlation': 'Spearman',
                         'diff_median': diff_median, 'pval': pval,
                         'label': label})

    df_res = pd.DataFrame(list_res)
    df_res.to_csv(file_res, sep='\t', index=False, na_rep='NA')

    return
Question 12 ''' print_header("Question 12") p_count = 0 index_arr_2 = np.full(len(data), -1, dtype=int ) index_i = 0 for i in range(0, len(genes)): datagene = data[i] allgene = datagene[ALL] amlgene = datagene[AML] u_stat, p_value = st.mannwhitneyu(allgene, amlgene) if p_value < 0.05: index_arr_2[index_i] = i index_i += 1 p_count += 1 print(f"The Number of genes differentially expressed according to rank-sum test: {p_count}") ''' Question 13 ''' print_header("Question 13")
def entropy_over_time(
        entropy_df,
        tmpts=(0, 1, 2),
        use_ranksum=False,
        use_one_sided=True,
        path_to_save='/Users/jendawk/Dropbox (MIT)/C Diff Recurrence Paper/Analyses/'
):
    """Test entropy changes between consecutive time points per outcome.

    For each outcome ('Recurrer', 'Non-recurrer') and each time point ``t``
    in ``tmpts[:-1]``, column ``'Week t'`` of ``entropy_df[t]`` is compared
    with column ``'Week t+1'`` of ``entropy_df[t + 1]``.  All collected
    p-values are then corrected together (Benjamini-Hochberg) and both
    tables are written to a CSV file.

    Args:
        entropy_df: mapping from time point to a DataFrame with an
            'Outcome' column and a 'Week <t>' column.
        tmpts: time points; consecutive entries are compared.
        use_ranksum: if true use ``st.wilcoxon`` on the rows present at both
            time points, otherwise ``st.mannwhitneyu`` on all rows.
        use_one_sided: if true use 'greater' for the comparison starting at
            time point 0 and 'less' otherwise; else 'two-sided'.
        path_to_save: directory prefix for the output CSV.

    Returns:
        DataFrame with ('Uncorrected' | 'Corrected', outcome) columns and
        one row per compared pair of time points.  Comparisons whose test
        could not be computed are left out of both tables.
    """
    outcomes = ['Recurrer', 'Non-recurrer']
    # fixed by the arguments, so the file name is defined even if no test ran
    tname = 'wilcoxnon_ranksum' if use_ranksum else 'mannwhitney'
    alt = 'two-sided'

    pval = {}
    pvals = []
    tested = []  # (outcome, (t, t + 1)) for each entry of pvals, same order
    for cl in outcomes:
        pval[cl] = {}
        for tmpt in tmpts[:-1]:
            entropy_1 = entropy_df[tmpt]
            entropy_2 = entropy_df[tmpt + 1]
            entropy_1 = entropy_1.loc[entropy_1['Outcome'] == cl]
            entropy_2 = entropy_2.loc[entropy_2['Outcome'] == cl]

            if use_one_sided:
                alt = 'greater' if tmpt == 0 else 'less'
            else:
                alt = 'two-sided'

            try:
                if use_ranksum:
                    # paired test: restrict both frames to the rows they
                    # share (an Index, since .loc rejects set indexers in
                    # recent pandas)
                    ix_sim = entropy_1.index.intersection(entropy_2.index)
                    _, p = st.wilcoxon(
                        entropy_1.loc[ix_sim]['Week ' + str(tmpt)],
                        entropy_2.loc[ix_sim]['Week ' + str(tmpt + 1)],
                        alternative=alt)
                else:
                    _, p = st.mannwhitneyu(
                        entropy_1['Week ' + str(tmpt)],
                        entropy_2['Week ' + str(tmpt + 1)],
                        alternative=alt)
            except (ValueError, KeyError):
                # best effort: skip comparisons the test cannot handle
                # (degenerate samples) or whose week column is missing
                continue
            pval[cl][(tmpt, tmpt + 1)] = p
            pvals.append(p)
            tested.append((cl, (tmpt, tmpt + 1)))

    # map every corrected value back to the comparison it came from, so a
    # skipped comparison cannot shift the remaining values
    pval_corr = {cl: {} for cl in outcomes}
    if pvals:
        pvals_corr = multipletests(pvals, alpha=0.05, method='fdr_bh')[1]
        for (cl, pair), p_adj in zip(tested, pvals_corr):
            pval_corr[cl][pair] = p_adj

    # p_df = pd.DataFrame(pval, index = [0]).T
    # p_df.to_csv('paper_figs/entropy_ttest.csv')
    dictionary = {'Uncorrected': pval, 'Corrected': pval_corr}
    reform = {(outerKey, innerKey): values
              for outerKey, innerDict in dictionary.items()
              for innerKey, values in innerDict.items()}
    df_over_time = pd.DataFrame(reform)
    # the file name carries the alternative of the last comparison made
    df_over_time.to_csv(path_to_save +
                        'Fig3_results/entropy/intra_entropy_' + tname + '_' +
                        alt + '.csv')
    return df_over_time
from scipy.stats import mannwhitneyu

data_prefix = '/Users/KANG/geneoscopy_dev/data/20170113_nanostring_project_18731/POP_48_samples_011817_PosNormData_lit'

# Table layout as read below: row 0 = sample names, row 1 = group labels,
# rows 2+ = genes; gene names in column 1, values from column 6 onwards.
data = np.loadtxt(data_prefix + '.txt', dtype=str, delimiter='\t')
samples = data[0, 6:]
groups = data[1, 6:]
genes = data[2:, 1]
expr = np.array(data[2:, 6:], dtype=float)

normal_indx = np.flatnonzero(groups == "Normal")
polyp_indx = np.flatnonzero(groups == "Polyp")
cancer_indx = np.flatnonzero(groups == "Cancer")

# Only the pooled Polyp+Cancer contrast is currently enabled.
for contrast_group_indx in [np.append(polyp_indx, cancer_indx)]:
    # for contrast_group_indx in [polyp_indx, cancer_indx, np.append(polyp_indx,cancer_indx)]:
    utest_results = []
    for i in range(len(genes)):
        control = expr[i, normal_indx]
        contrast = expr[i, contrast_group_indx]
        utest_stats = mannwhitneyu(control, contrast)
        fc = np.mean(contrast) / np.mean(control)
        # one row per gene: [log2 fold change, p-value]
        utest_results.append([np.log2(fc), utest_stats[1]])
    utest_results = np.array(utest_results)

    # genes ordered by ascending p-value
    indx_sorted = np.argsort(utest_results[:, 1])
    # indx_sorted = np.argsort(np.abs(utest_results[:,0]))[::-1]
    out = np.column_stack((genes[indx_sorted], utest_results[indx_sorted, :]))
    np.savetxt(data_prefix + '_DE_analysis.txt', out, fmt="%s", delimiter="\t")
    # print "gene", "log2FC", "p-val"
import pandas as pd
import sys
from scipy import stats

# Command line: <sample> <data dir> <sgRNA> <TF>
SAMPLE_TMP = sys.argv[1]
DD = sys.argv[2]
sgRNA_tmp = sys.argv[3]
TF = sys.argv[4]

# <DD>/<sgRNA>/<sample>_<sgRNA>_<TF>_raw_data.txt
FILE_tmp = '{}/{}/{}_{}_{}_raw_data.txt'.format(DD, sgRNA_tmp, SAMPLE_TMP,
                                                sgRNA_tmp, TF)
data_tmp = pd.read_table(FILE_tmp)

# deviation of the 'sgNTC' rows vs. the rows of the requested sgRNA
is_control = data_tmp['sgRNA'] == 'sgNTC'
is_target = data_tmp['sgRNA'] == sgRNA_tmp
res_WMW = stats.mannwhitneyu(data_tmp['deviation'][is_control],
                             data_tmp['deviation'][is_target])
Pvalue_tmp = res_WMW.pvalue
print(sgRNA_tmp + '__' + TF, Pvalue_tmp)
style='treatment', data=long_data) plt.savefig('mean_line.svg') plt.figure(figsize=(4, 3)) sns.lineplot(x='Time(s)', y='value', hue='treatment', style='repetition', data=long_data) plt.savefig('all_line.svg') ## statistical analysis # Mann Whiteney U man_s, man_p = mannwhitneyu( long_data.value[long_data['treatment'] == 'Control'], long_data.value[long_data['treatment'] == 'IP10']) w_s, w_p = wilcoxon(long_data.value[long_data['treatment'] == 'Control'], long_data.value[long_data['treatment'] == 'IP10']) alpha = 0.05 if w_p < alpha: print( f'The differences are statistically significant with a p value of {w_p}, we reject H0' ) else: print( 'Both groups come from a population with the same distribution, we accept H0' )
# -*- coding: utf-8 -*- import math import random from scipy import stats # Test Mood's równoci median: stats.median_test(dane_1, dane_2) # Test U Manna Whitney'a (nieparametryczny odpowiednik testu t-studenta dla prób niezależnych): stats.mannwhitneyu(dane_1, dane_2) # Test Wilcoxsona (odpowiednik testu t-studenta dla prób zależnych): stats.wilcoxon(dane_1, dane_2) # Test Kurskala - Wallisa (nieparametryczny odpowiednik jednoczynnikowej ANOVA dla prób niezależnych): stats.kruskal(dane_1, dane_2, dane_3) # Test Friedmana (nieparametryczny odpowiednik jednoczynnikowej ANOVA dla prób zależnych): stats.friedmanchisquare(dane_1, dane_2, dane_3)
#ListMajor = ['Mean','STD','Skewness','Kurtosis','Entropy'] #ListMinor = ['Area','MajorAxis','MinorAxis','AxesRatio','mean_R','mean_G','mean_B','Mean Distance','Max Distance','Min Distance'] ## Check these 🚩 #for x in ListMajor: # for y in ListMinor: # labelList.append(x +'_'+ y) labelList = ['Mean_Area', 'Mean_MajorAxis', 'Mean_MinorAxis', 'Mean_AxesRatio', 'Mean_mean_R', 'Mean_mean_G', 'Mean_mean_B', 'Mean_Mean Distance', 'Mean_Max Distance', 'Mean_Min Distance', 'STD_Area', 'STD_MajorAxis', 'STD_MinorAxis', 'STD_AxesRatio', 'STD_mean_R', 'STD_mean_G', 'STD_mean_B', 'STD_Mean Distance', 'STD_Max Distance', 'STD_Min Distance', 'Skewness_Area', 'Skewness_MajorAxis', 'Skewness_MinorAxis', 'Skewness_AxesRatio', 'Skewness_mean_R', 'Skewness_mean_G', 'Skewness_mean_B', 'Skewness_Mean Distance', 'Skewness_Max Distance', 'Skewness_Min Distance', 'Kurtosis_Area', 'Kurtosis_MajorAxis', 'Kurtosis_MinorAxis', 'Kurtosis_AxesRatio', 'Kurtosis_mean_R', 'Kurtosis_mean_G', 'Kurtosis_mean_B', 'Kurtosis_Mean Distance', 'Kurtosis_Max Distance', 'Kurtosis_Min Distance', 'Entropy_Area', 'Entropy_MajorAxis', 'Entropy_MinorAxis', 'Entropy_AxesRatio', 'Entropy_mean_R', 'Entropy_mean_G', 'Entropy_mean_B', 'Entropy_Mean Distance', 'Entropy_Max Distance', 'Entropy_Min Distance', 'Shape.FSD1', 'Shape.FSD2', 'Shape.FSD3', 'Shape.FSD4', 'Shape.FSD5', 'Shape.FSD6', 'Gradient.Mag.Mean', 'Gradient.Mag.Std', 'Gradient.Mag.Skewness', 'Gradient.Mag.Kurtosis', 'Gradient.Mag.HistEntropy', 'Gradient.Mag.HistEnergy', 'Gradient.Canny.Sum', 'Gradient.Canny.Mean', 'Haralick.ASM.Mean', 'Haralick.ASM.Range', 'Haralick.Contrast.Mean', 'Haralick.Contrast.Range', 'Haralick.Correlation.Mean', 'Haralick.Correlation.Range', 'Haralick.SumOfSquares.Mean', 'Haralick.SumOfSquares.Range', 'Haralick.IDM.Mean', 'Haralick.IDM.Range', 'Haralick.SumAverage.Mean', 'Haralick.SumAverage.Range', 'Haralick.SumVariance.Mean', 'Haralick.SumVariance.Range', 'Haralick.SumEntropy.Mean', 'Haralick.SumEntropy.Range', 
'Haralick.Entropy.Mean', 'Haralick.Entropy.Range', 'Haralick.DifferenceVariance.Mean', 'Haralick.DifferenceVariance.Range', 'Haralick.DifferenceEntropy.Mean', 'Haralick.DifferenceEntropy.Range', 'Haralick.IMC1.Mean', 'Haralick.IMC1.Range', 'Haralick.IMC2.Mean', 'Haralick.IMC2.Range', 'Size.Area', 'Size.MajorAxisLength', 'Size.MinorAxisLength', 'Size.Perimeter', 'Shape.Circularity', 'Shape.Eccentricity', 'Shape.EquivalentDiameter', 'Shape.Extent', 'Shape.MinorMajorAxisRatio', 'Shape.Solidity'] failCount = 0 ############# Calculate Stats ############## for i in range(1, data1.shape[1]): ## Iterating over columns # for i in range(51, 91): ## Iterating over columns list1 = data1.iloc[:,i].values list2 = data2.iloc[:,i].values try: stat, p = mannwhitneyu(list1, list2) except ValueError: # import pdb; pdb.set_trace() print('Defect in feature number '+str(i)) continue statsList.append(stat) pList.append(p) # print('Statistics=%.3f, p=%.3f' % (stat, p)) alpha = 0.05 if p > alpha: # print('SAME (fail to reject H0)') resultList.append('Same') failCount +=1 else: # print('Different distribution (reject H0)')
plt.xlabel('Complexity') plt.ylabel('Reward') plt.show() plt.savefig('cr.png') elif command.strip().lower() == 'lean': env = SingleCartPoleEnv() env.lean() elif command.strip().lower() == 'utest': '''执行Mann-Whitney U test''' algs = ['neat', 'hyperneat', 'dqn', 'ddqn', 'policy'] for i, alg1 in enumerate(algs): algs2 = algs[i + 1:] for j, alg2 in enumerate(algs2): complex1, reward1, _ = loadcomplex(alg1, 'noreset') complex2, reward2, _ = loadcomplex(alg2, 'noreset') u_stat, p_val = stats.mannwhitneyu(reward1, reward2) lessthan0_05 = bool(p_val < 0.05) print(alg1 + '-' + alg2 + '的u_stat,pvalue为' + str(u_stat) + ',' + str(p_val) + ',p值小于0.05为' + str(lessthan0_05)) elif command.strip().lower() == 'evolvability': #complexityupperlimit =params['upper'] if 'upper' in params.keys() else 2000.0 complexityupperlimit = params[ 'upper'] if 'upper' in params else 2000.0 '''采用公式8''' algs = ['neat', 'hyperneat', 'dqn', 'ddqn', 'policy'] evolvability1 = {} t = 0. for i, alg in enumerate(algs): complex, reward, _ = loadcomplex(alg, 'noreset') complex = [c for c in complex if c <= complexityupperlimit] reward = reward[:len(complex)]
# Draw the D1 population as open circles with a median line, label the BW10
# axis, then test nD1 vs. D1 and annotate the comparison on the axes.
axBW.plot(pos, D1PopStat, 'o', mec=colorD1, mfc='None', alpha=markerAlpha)
medline(axBW, np.median(D1PopStat), 1, 0.5)
axBW.set_ylabel('BW10', fontsize=fontSizeLabels)

# Tick labels carry the sample size of each population.
# tickLabels = ['nD1:Str', 'D1:Str']
tickLabels = [
    'nD1:Str\nn={}'.format(len(nD1PopStat)),
    'D1:Str\nn={}'.format(len(D1PopStat)),
]
axBW.set_xticks(range(2))
axBW.set_xlim([-0.5, 1.5])
extraplots.boxoff(axBW)
extraplots.set_ticks_fontsize(axBW, fontSizeTicks)
axBW.set_xticklabels(tickLabels, fontsize=fontSizeLabels, rotation=45)

# Nick used stats.ranksum, which is a two-sided test. Pin the alternative so
# this p-value is two-sided as well: with SciPy < 1.7 the default
# (alternative=None) is deprecated and returns half the two-sided p-value.
# NOTE(review): despite its name, `zstat` receives the U statistic.
zstat, pVal = stats.mannwhitneyu(nD1PopStat, D1PopStat,
                                 alternative='two-sided')
messages.append("{} p={}".format(popStatCol, pVal))

# Place the significance marker a fixed fraction above the largest data point.
yDataMax = max(max(D1PopStat), max(nD1PopStat))
yStars = yDataMax + yDataMax * starYfactor
yStarHeight = (yDataMax * starYfactor) * starHeightFactor
plt.sca(axBW)

# starString=None keeps the star marker; 'n.s.' is passed when p >= 0.05.
starString = None if pVal < 0.05 else 'n.s.'
extraplots.significance_stars([0, 1], yStars, yStarHeight,
                              starMarker='*',
                              starSize=fontSizeStars + 2,
                              starString=starString,
                              gapFactor=starGapFactor)
# p2 # print(odd_df_number, # even_df_number) # print(odd_df_l0_number, # odd_df_s0_number, # even_df_l0_number, # even_df_s0_number) # print(odd_df_l0_number+ # odd_df_s0_number+ # even_df_l0_number+ # even_df_s0_number) # print(df.shape[0]) # In[6]: p3 = stats.mannwhitneyu(odd_df['search_count'], even_df['search_count']).pvalue p4 = stats.mannwhitneyu(odd_insdf['search_count'], even_insdf['search_count']).pvalue # In[7]: def main(): # searchdata_file = sys.argv[1] # df = pd.read_json(searchdata_file, orient = 'records', lines = True) # ... # Output print( OUTPUT_TEMPLATE.format( more_users_p=p1,
# NOTE(review): the label says "non-essential genes" but the count is taken
# over `essential_in_ceres`; `non_essential_in_ceres` may have been intended.
# Left unchanged because the preceding output line is not visible here.
print('Numero de genes no esenciales que estan en CERES:',
      str(len(ceres_genes_linea.loc[essential_in_ceres])))

# Record in the essential-gene frequency data frame whether each gene is
# flagged as essential (1) or not (0) for this threshold pair.
print('Añadiendo al df de la frecuencia de esenciales, presencia o ausencia de gen esencial en el modelo')
columna_umbral = th_l + '_' + th_u  # loop-invariant column key
for gene in ceres_genes_linea.index:
    df_frecuencia_esenciales.at[gene, columna_umbral] = int(gene in essential_in_ceres)

print('Mann Whitney')
x = ceres_genes_linea.loc[essential_in_ceres]
y = ceres_genes_linea.loc[non_essential_in_ceres]
# Pin the alternative explicitly: with SciPy < 1.7 the default
# (alternative=None) is deprecated and returns half the two-sided p-value, so
# `p` would otherwise depend on the installed SciPy version.
U, p = mannwhitneyu(x, y, use_continuity=True, alternative='two-sided')

# Mean expression and mean CERES score of the genes predicted as essential,
# plus the sum of their CERES scores.
df_expresion_t = df_expresion.T
expr_genes_linea = df_expresion_t[cell_line]
ScoreCeres_genes_predict = ceres_genes_linea.loc[essential_in_ceres]
Expr_genes_predict = expr_genes_linea.loc[essential_in_ceres]
mean_ScoreCeres_predict = ScoreCeres_genes_predict.mean()
mean_Expr_predict = Expr_genes_predict.mean()
valor_ceres_suma = ScoreCeres_genes_predict.sum()

# Model size and number of essential genes.
total_rx = len(csm2.reactions)
Number_essential = len(essential)
ax.set_xticks([0, 1, 2]) ax.set_xticklabels([ 'Tagged\nN={}'.format(len(dataTagged)), 'Close\nUntagged\nN={}'.format(len(dataCloseUntagged)), 'Far\nUntagged\nN={}'.format(len(dataFarUntagged)) ]) extraplots.boxoff(ax) #0-1 yMin = 0 yMax = 2 yStars = [yMax * 1.1, yMax * 1.2] yStarHeight = (yMax - yMin) * 0.05 starGapFactor = 0.1 fontSizeStars = 9 zVal, pVal = stats.mannwhitneyu(dataTagged, dataCloseUntagged) print "{} Tagged vs. close untagged, p={}".format(feature, pVal) if pVal < 0.05: extraplots.new_significance_stars([0, 0.9], yStars[0], yStarHeight, starMarker='*', fontSize=fontSizeStars, gapFactor=starGapFactor, ax=ax) else: extraplots.new_significance_stars([0, 0.9], yStars[0], yStarHeight, starMarker='n.s.', fontSize=fontSizeStars,
#%% plot enhancer architecture length per age e_colors = ["amber", "faded green"] e_pal = sns.xkcd_palette(e_colors) s_colors = ["greyish", "slate grey"] s_pal = sns.xkcd_palette(s_colors) hue_order = ["FANTOM", "Shuffle"] fig, (ax1) = plt.subplots(figsize=(8, 8)) order = ["Simple", "Complexenh"] sns.barplot(y = "enh_len", x = "taxon2",\ data = enh_lens.sort_values(by = "mrca_2"), ax = ax1,\ hue = "arch", palette = e_pal, estimator = np.median)#showfliers=False) ms, msp = stats.mannwhitneyu( enh_lens.enh_len.loc[enh_lens.arch.str.contains("imple")], shuf_len.enh_len.loc[shuf_len.arch.str.contains("imple")]) print("simple", ms, msp) mc, mcp = stats.mannwhitneyu( enh_lens.enh_len.loc[enh_lens.arch.str.contains("omplex")], shuf_len.enh_len.loc[shuf_len.arch.str.contains("omplex")]) print("complex", mc, mcp) ax1.set(ylabel="Enhancer Length (bp)", ylim=(190, 400), xlabel="") ax1.set_xticklabels(ax1.get_xticklabels(), rotation=90, horizontalalignment="left") ax1.get_legend().remove() plt.savefig("%sfig2c-Fantom_ENH_MRCA_x_LEN_ENH.pdf" % RE, bbox_inches="tight") """ RESULTS enhancer lengths v. expected shuffle lengths for simple, complex
# Average the cycle features within each (group, subject) pair. The feature
# columns are selected *before* aggregating so that mean() is only applied to
# them; aggregating every column first raises on non-numeric columns with
# pandas >= 2.0 and does needless work on older versions.
df_subjects = df_cycles_burst.groupby(['group', 'subject_id'])[features_keep].mean().reset_index()
print(df_subjects)

####################################################################################################

# Column name -> axis label for each cycle feature that is plotted and tested.
feature_names = {'volt_amp': 'Amplitude',
                 'period': 'Period (ms)',
                 'time_rdsym': 'Rise-decay symmetry',
                 'time_ptsym': 'Peak-trough symmetry'}
for feat, feat_name in feature_names.items():
    g = sns.catplot(x='group', y=feat, data=df_subjects)
    plt.xlabel('')
    plt.xticks(size=20)
    plt.ylabel(feat_name, size=20)
    plt.yticks(size=15)
    plt.tight_layout()
    plt.show()

####################################################################################################
#
# Statistical differences in cycle features
# -----------------------------------------

####################################################################################################

for feat, feat_name in feature_names.items():
    x_treatment = df_subjects[df_subjects['group'] == 'patient'][feat]
    x_control = df_subjects[df_subjects['group'] == 'control'][feat]
    # Pin the alternative explicitly: with SciPy < 1.7 the default
    # (alternative=None) is deprecated and returns half the two-sided p-value.
    U, p = stats.mannwhitneyu(x_treatment, x_control, alternative='two-sided')
    print('{:20s} difference between groups, U= {:3.0f}, p={:.5f}'.format(feat_name, U, p))
def generate_significance_dataframe():
    """Compute and plot pairwise Mann-Whitney p-values between classifiers.

    Reads the per-classifier predictions from
    ``../SoccerPrediction/Results/pred.csv`` (``;``-separated), runs a
    two-sided Mann-Whitney U test for every ordered pair of classifiers
    (each classifier is also compared with itself) and writes the long-format
    result (``Class1``, ``Class2``, ``p``) to ``significance.csv`` in the
    current working directory.

    The heatmap is then drawn from
    ``../SoccerPrediction/Results/significance.csv`` -- a different path from
    the file written above. According to the original comment, that copy had
    its numbers rounded and scientific notation removed in Excel.

    Returns:
        None. The pivoted p-value matrix is printed and the heatmap is shown.
    """
    # Classifier identifiers; each one is expected to be a column of pred.csv.
    names = ['RL', 'ADL', 'ADQ', 'KNN', 'NBG', 'NBM',
             'SVML', 'SVMR', 'RF', 'ET', 'ENS']

    # CSV with the predictions made by the classifiers (instances) on the
    # test data set.
    # NOTE(review): the source carried a backslash line continuation inside
    # this path literal; it is written out as a single path here -- confirm
    # that no embedded whitespace was intended.
    pred = pd.read_csv("../SoccerPrediction/Results/pred.csv", sep=';')
    df_pred = pd.DataFrame(pred, columns=[
        'RL', 'ADL', 'ADQ', 'ET', 'KNN', 'NBG', 'NBM', 'RF', 'SVML', 'SVMR',
        'ENS'
    ])

    # Convert every prediction column to a list once instead of once per pair.
    distributions = {name: df_pred[name].tolist() for name in names}

    # Test every classifier against every classifier. One single-row frame is
    # collected per pair and concatenated once at the end: DataFrame.append
    # was removed in pandas 2.0 and re-copied the whole frame on every call.
    # pd.concat keeps each row's index at 0, exactly as the append chain did,
    # so the written CSV is unchanged.
    pair_frames = []
    for class1 in names:
        for class2 in names:
            _, prob = stats.mannwhitneyu(distributions[class1],
                                         distributions[class2],
                                         alternative='two-sided')
            pair_frames.append(pd.DataFrame({
                'Class1': [class1],
                'Class2': [class2],
                'p': [prob]
            }))
    df_significance = pd.concat(pair_frames)
    df_significance.to_csv('significance.csv', sep=';')

    # Numbers were rounded and scientific notation was removed in Excel; the
    # edited copy is what gets plotted.
    df_significance = pd.read_csv(
        "../SoccerPrediction/Results/significance.csv", sep=';')
    df_significance = pd.DataFrame(df_significance,
                                   columns=['Class1', 'Class2', 'p'])

    # Keyword arguments: positional arguments to pivot() are rejected by
    # pandas >= 2.0.
    significance = df_significance.pivot(index='Class1', columns='Class2',
                                         values='p')
    print(significance)

    f, ax = plt.subplots(figsize=(9, 6))
    sns.heatmap(significance, annot=True, linewidths=.5, ax=ax)
    plt.ylabel('Classificador 1', fontsize=16)
    plt.xlabel('Classificador 2', fontsize=16)
    plt.show()