def in_out_mask_ttest_from_conn_map(wpth, rseedz, imsk, omsk):
    """Compare in-mask and out-of-mask values of each seed map with t-tests.

    Every image in ``rseedz`` is masked once with ``imsk`` and once with
    ``omsk`` through the ``fslmaths -mas`` command line.  The two masked
    images are loaded and compared with an equal-variance t-test and with
    Welch's t-test.

    :param wpth: working directory handed to ``pthswp``
    :param rseedz: sequence of seed image paths
    :param imsk: path of the "in" mask image
    :param omsk: path of the "out" mask image
    :return: DataFrame indexed 1..len(rseedz) with columns ``t``/``p``
        (equal-variance test) and ``wt``/``wp`` (Welch test)
    """
    # presumably switches into wpth and returns the previous directory,
    # since the result is fed to os.chdir at the end -- TODO confirm
    oldpth = pthswp(wpth)
    incde = codegen(6)
    outcde = codegen(6)
    print('generating data masks....')
    inmsk = img_2_maskd_array(imsk)
    outmsk = img_2_maskd_array(omsk)
    print('beginning tests')
    ind = list(range(1, len(rseedz) + 1))
    df = pandas.DataFrame(np.zeros((len(rseedz), 4)), index=ind,
                          columns=['t', 'p', 'wt', 'wp'])
    try:
        for row, seed in enumerate(rseedz, 1):
            os.system('fslmaths %s -mas %s %s' % (seed, imsk, incde))
            os.system('fslmaths %s -mas %s %s' % (seed, omsk, outcde))
            invals = np.ma.masked_array(
                ni.load('%s.nii.gz' % (incde)).get_data(),
                mask=inmsk).flatten()
            outvals = np.ma.masked_array(
                ni.load('%s.nii.gz' % (outcde)).get_data(),
                mask=outmsk).flatten()
            t, p = st.ttest_ind(invals, outvals)
            wt, wp = st.ttest_ind(invals, outvals, equal_var=False)
            # .loc replaces the .ix indexer, which pandas 1.0 removed; the
            # index holds the integer labels 1..n so the lookup is the same.
            df.loc[row, 't'] = t
            df.loc[row, 'p'] = p
            df.loc[row, 'wt'] = wt
            df.loc[row, 'wp'] = wp
            print('finished with seed %s' % (seed))
    finally:
        # Remove the temporary masked images and restore the caller's
        # working directory even when a seed fails.
        os.system('rm %s* %s*' % (incde, outcde))
        os.chdir(oldpth)
    return df
def test_continuous(a, b): # simple t-test try: p_value = stats.ttest_ind(a, b)[1] except: p_value = 1 return p_value, "%.2g" % mean(a)
def print_table():
    """Write the global ``Table`` to stdout, one tab-separated row per bin.

    Each row starts with the position returned by ``bin2pos`` (chr, start,
    stop), optionally followed by the t statistic and p-value comparing the
    first ``case_number`` samples against the rest, then one value per
    sample in ``Table`` and one per entry in ``AnnoTable``.
    """
    print >> sys.stderr, "printing table now"
    for i in range(len(Table[0])):
        a = bin2pos(i)
        if ifSlim:
            # slim mode: skip bins whose values sum to zero across samples
            s = 0
            for j in range(len(Table)):
                s = s + Table[j][i]
            if s == 0:
                continue
        print a.chr, "\t", a.start, "\t", a.stop,
        if ifCompare:
            # x: first case_number samples, y: all remaining samples
            x = []
            y = []
            for j in range(case_number):
                x.append(Table[j][i])
            for j in range(case_number, len(Table)):
                y.append(Table[j][i])
            (t, p) = ttest_ind(x, y)
            print "\t", t, "\t", p,
        for j in range(len(Table)):
            if ifNormalize:
                print "\t%.2f" % Table[j][i],
            else:
                print "\t", Table[j][i],
        for j in range(len(AnnoTable)):
            print "\t", AnnoTable[j][i],
        # terminate the row
        print
def permutation_ttest(W, B):
    """Flag whether a permutation test on ``W`` and ``B`` is significant.

    The permutation test runs in 'approximate' mode with 100 rounds and a
    fixed seed of 0, using the result of ``stats.ttest_ind`` as statistic.

    :return: 1 when the permutation p-value is below 0.05, otherwise 0
    """
    def _ttest_statistic(W, B):
        # hands the complete ttest_ind result back to permutation_test
        return stats.ttest_ind(W, B)

    p_value = permutation_test(
        W,
        B,
        method='approximate',
        num_rounds=100,
        func=_ttest_statistic,
        seed=0,
    )
    if p_value < 0.05:
        return 1
    return 0
def _ttest(orig_score, rep_score, rpd=True, pbar=False):
    """
    Yield one t-test p-value per measure.

    @param orig_score: The original scores.
    @param rep_score: The reproduced/replicated scores.
    @param rpd: Boolean indicating if the evaluated runs are reproduced;
                selects the paired test, otherwise the unpaired test is used.
    @param pbar: Boolean value indicating if progress bar should be printed.
    @return: Generator with (measure, p-value) pairs.
    """
    # reproduced runs: paired two-tailed t-test; otherwise unpaired
    significance_test = ttest_rel if rpd else ttest_ind

    orig_by_measure = topic_scores(orig_score)
    rep_by_measure = topic_scores(rep_score)

    measure_items = orig_by_measure.items()
    if pbar:
        measure_items = tqdm(measure_items)

    for measure, scores in measure_items:
        rep_scores = rep_by_measure.get(measure)
        yield measure, significance_test(scores, rep_scores).pvalue
def print_table():
    """Write the global ``Table`` to stdout, one tab-separated row per bin.

    Each row starts with the position returned by ``bin2pos`` (chr, start,
    stop), optionally followed by the t statistic and p-value comparing the
    first ``case_number`` samples against the rest, then one value per
    sample in ``Table`` and one per entry in ``AnnoTable``.
    """
    print >>sys.stderr,"printing table now"
    for i in range(len(Table[0])):
        a=bin2pos(i)
        if ifSlim:
            # slim mode: skip bins whose values sum to zero across samples
            s=0
            for j in range(len(Table)):
                s=s+Table[j][i]
            if s==0:
                continue
        print a.chr,"\t",a.start,"\t",a.stop,
        if ifCompare:
            # x: first case_number samples, y: all remaining samples
            x=[]
            y=[]
            for j in range(case_number):
                x.append(Table[j][i])
            for j in range(case_number,len(Table)):
                y.append(Table[j][i])
            (t,p)=ttest_ind(x,y)
            print "\t",t,"\t",p,
        for j in range(len(Table)):
            if ifNormalize:
                print "\t%.2f"%Table[j][i],
            else:
                print "\t",Table[j][i],
        for j in range(len(AnnoTable)):
            print "\t",AnnoTable[j][i],
        # terminate the row
        print
def jitter_MWU(values, start, mid, end):
    """
    RETURN A BETTER MIDPOINT, ACCOUNTING FOR t-test RESULTS

    Candidate midpoints near `mid` are scored with a two-sided Mann-Whitney U
    test and a Welch t-test on the windows before and after each candidate;
    the candidate with the lowest geometric mean of both p-values wins.

    :return: (MWU result, t-test result, chosen midpoint); the first two are
             `no_good_edge` when no candidate can be scored
    """
    # CONSTRAIN THE RANGE OF CANDIDATE MIDPOINTS
    first_mid = min(mid, max(start + MIN_POINTS, mid - JITTER))
    last_mid = max(mid, min(mid + JITTER, end - MIN_POINTS))
    if first_mid == last_mid:
        return no_good_edge, no_good_edge, mid
    mids = np.array(range(first_mid, last_mid))

    try:
        # MWU SCORES
        mwu_rows = []
        for m in mids:
            before = values[max(start, m - MAX_POINTS):m]
            after = values[m:min(end, m + MAX_POINTS)]
            mwu_rows.append(stats.mannwhitneyu(
                before, after, use_continuity=True, alternative="two-sided"
            ))
        m_score = np.array(mwu_rows)

        # WELCH t-test SCORES
        ttest_rows = []
        for m in mids:
            before = values[max(start, m - MAX_POINTS):m]
            after = values[m:min(end, m + MAX_POINTS)]
            ttest_rows.append(stats.ttest_ind(before, after, equal_var=False))
        t_score = np.array(ttest_rows)
    except Exception as e:
        e = Except.wrap(e)
        if "All numbers are identical" in e:
            return no_good_edge, no_good_edge, mids[0]
        raise e

    # AN EARLIER VERSION COMBINED THE MWU p-value WITH A TOTAL SUM-OF-SQUARES
    # SCORE (cumSS PREFIX + SUFFIX); IT WAS DISABLED BECAUSE ITS PURPOSE WAS
    # UNCLEAR, AND THE t-test p-value IS USED INSTEAD

    # GEOMEAN OF SCORES, PICK LOWEST
    combined = np.sqrt(m_score[:, 1] * t_score[:, 1])
    best = np.argmin(combined)

    mwu_best = Data(pvalue=m_score[best, 1])
    ttest_best = Data(pvalue=t_score[best, 1])
    return mwu_best, ttest_best, mids[best]
def select(self, n, array, elems, result, ref):
    """Collect t-test p-values for every way of adding ``n`` items to ``elems``.

    Recursively extends ``elems`` with each combination of ``n`` items taken
    from ``array`` (in index order) and tests the extended sample against
    ``ref``.

    :param n: number of items still to pick from ``array``
    :param array: candidate items
    :param elems: items picked so far
    :param result: list the p-values are appended to
    :param ref: reference sample for the t-test
    :return: ``result``
    """
    if n != 0:
        for pos, item in enumerate(array):
            # only items after `pos` remain available, so no combination
            # is visited twice
            remaining = array[pos + 1:]
            result = self.select(n - 1, remaining, elems + [item], result, ref)
        return result

    p_value = stats.ttest_ind(elems, ref)[1]
    result.append(p_value)
    return result
def perform_ttest_on_averages(self):
    """
    Run an independent t-test between the average scores of the 'control'
    group and those of the 'patients' group.
    :return: the result of ``stats.ttest_ind(control, patients)``
    """
    group_scores = [self.get_avg_scores(group) for group in ('control', 'patients')]
    return stats.ttest_ind(*group_scores)
def ttest_f(train, trainLabel, loop):
    """Rank feature columns by t statistic and return the top ``loop`` indices.

    Each column of ``train`` is t-tested against ``trainLabel``; columns are
    ordered by descending t statistic.

    :param train: 2-D array, one column per feature
    :param trainLabel: sample passed as the second argument of every test
    :param loop: number of column indices to return
    :return: array holding the indices of the ``loop`` best-ranked columns
    """
    n_features = np.shape(train)[1]
    score = np.array(
        [ttest_ind(train[:, col], trainLabel)[0] for col in range(n_features)],
        dtype=float,
    )
    descending = np.argsort(score)[::-1]
    return descending[:loop]
def roi_ttest():
    """
    compare rsfc difference between ROIs
    scheme: hemi-separately network-wise

    For each hemisphere, loads the pickled RSFC data, drops NaN entries from
    each ROI's sample independently, and computes Cohen's d and an
    independent t-test per target network.  The results are written to one
    CSV file with a row per network.
    """
    import pickle as pkl

    import numpy as np
    import pandas as pd
    # public namespace; `scipy.stats.stats` is a deprecated private alias
    from scipy.stats import ttest_ind

    from cxy_hcp_ffa.lib.predefine import net2label_cole
    from commontool.stats import EffectSize

    # parameters
    hemis = ('lh', 'rh')
    roi_pair = ('pFus-face', 'mFus-face')
    data_file = pjoin(work_dir, 'rsfc_individual2Cole_{}.pkl')
    compare_name = f"{roi_pair[0].split('-')[0]}_vs_" \
                   f"{roi_pair[1].split('-')[0]}"

    # outputs
    out_file = pjoin(work_dir,
                     f"rsfc_individual2Cole_{compare_name}_ttest.csv")

    # start
    trg_names = list(net2label_cole.keys())
    trg_labels = list(net2label_cole.values())
    out_data = {'network': trg_names}
    es = EffectSize()
    for hemi in hemis:
        # the context manager closes the pickle file once it is loaded
        with open(data_file.format(hemi), 'rb') as rf:
            data = pkl.load(rf)
        # columns of the data must follow the same order as trg_names
        assert data['trg_label'] == trg_labels

        out_data[f'CohenD_{hemi}'] = []
        out_data[f't_{hemi}'] = []
        out_data[f'P_{hemi}'] = []
        for trg_idx, trg_name in enumerate(trg_names):
            sample1 = data[roi_pair[0]][:, trg_idx]
            sample2 = data[roi_pair[1]][:, trg_idx]
            nan_vec1 = np.isnan(sample1)
            nan_vec2 = np.isnan(sample2)
            print('#NAN in sample1:', np.sum(nan_vec1))
            print('#NAN in sample2:', np.sum(nan_vec2))
            # NaNs are removed per sample, so the samples may differ in size
            sample1 = sample1[~nan_vec1]
            sample2 = sample2[~nan_vec2]
            d = es.cohen_d(sample1, sample2)
            t, p = ttest_ind(sample1, sample2)
            out_data[f'CohenD_{hemi}'].append(d)
            out_data[f't_{hemi}'].append(t)
            out_data[f'P_{hemi}'].append(p)

    # save out
    out_data = pd.DataFrame(out_data)
    out_data.to_csv(out_file, index=False)
def in_out_mask_ttest_from_conn_map(wpth, rseedz, imsk, omsk):
    """Compare in-mask and out-of-mask values of each seed map with t-tests.

    Every image in ``rseedz`` is masked once with ``imsk`` and once with
    ``omsk`` through the ``fslmaths -mas`` command line.  The two masked
    images are loaded and compared with an equal-variance t-test and with
    Welch's t-test.

    :param wpth: working directory handed to ``pthswp``
    :param rseedz: sequence of seed image paths
    :param imsk: path of the "in" mask image
    :param omsk: path of the "out" mask image
    :return: DataFrame indexed 1..len(rseedz) with columns ``t``/``p``
        (equal-variance test) and ``wt``/``wp`` (Welch test)
    """
    # presumably switches into wpth and returns the previous directory,
    # since the result is fed to os.chdir at the end -- TODO confirm
    oldpth = pthswp(wpth)
    incde = codegen(6)
    outcde = codegen(6)
    print('generating data masks....')
    inmsk = img_2_maskd_array(imsk)
    outmsk = img_2_maskd_array(omsk)
    print('beginning tests')
    ind = list(range(1, len(rseedz) + 1))
    df = pandas.DataFrame(np.zeros((len(rseedz), 4)), index=ind,
                          columns=['t', 'p', 'wt', 'wp'])
    try:
        for row, seed in enumerate(rseedz, 1):
            os.system('fslmaths %s -mas %s %s' % (seed, imsk, incde))
            os.system('fslmaths %s -mas %s %s' % (seed, omsk, outcde))
            invals = np.ma.masked_array(
                ni.load('%s.nii.gz' % (incde)).get_data(),
                mask=inmsk).flatten()
            outvals = np.ma.masked_array(
                ni.load('%s.nii.gz' % (outcde)).get_data(),
                mask=outmsk).flatten()
            t, p = st.ttest_ind(invals, outvals)
            wt, wp = st.ttest_ind(invals, outvals, equal_var=False)
            # .loc replaces the .ix indexer, which pandas 1.0 removed; the
            # index holds the integer labels 1..n so the lookup is the same.
            df.loc[row, 't'] = t
            df.loc[row, 'p'] = p
            df.loc[row, 'wt'] = wt
            df.loc[row, 'wp'] = wp
            print('finished with seed %s' % (seed))
    finally:
        # Remove the temporary masked images and restore the caller's
        # working directory even when a seed fails.
        os.system('rm %s* %s*' % (incde, outcde))
        os.chdir(oldpth)
    return df
def ttest_variables_targetBinary(data, X, y=None, method='data'):
    """
    Calculates the effect of feature variables on target variable using ttest
    and save it on a DataFrame

    :param
        data - DataFrame with relevant columns or dictionary with
               statistics (mean, std)
        X - feature columns
        y - target variable name, only binary y is accepted for now
        method - ttest to be used, either 'data' or 'statistics' (when only
                 mean and std available); with 'statistics' each entry of
                 ``data`` must provide the keys mean1, mean2, std1, std2,
                 nob1 and nob2
    :return
        result - DataFrame with column name, statistic value and p-value
                 (empty when ``method`` is neither of the two options)
    """
    columns = ['col', 'statistic_value', 'pvalue']
    # Rows are collected in a list and turned into a frame once at the end;
    # DataFrame.append was removed in pandas 2.0.
    rows = []

    if method == 'data':
        assert isinstance(data, pd.DataFrame), "Data is not a DataFrame"
        assert y is not None, "Target variable name not provided"
        y_levels = data[y].unique()
        assert len(y_levels) == 2, "More that two levels in target variable"

        for col in X:
            # rows with a missing feature value are left out for this column
            data_col = data.dropna(subset=[col])
            level_0 = data_col.loc[data_col[y] == y_levels[0], col]
            level_1 = data_col.loc[data_col[y] == y_levels[1], col]
            stat, p = ttest_ind(level_0, level_1)
            rows.append({'col': col, 'statistic_value': stat, 'pvalue': p})

    elif method == 'statistics':
        assert isinstance(data, dict), "Data is not a Dictionary"
        for col in X:
            assert col in data, "{} not in data".format(col)
            data_col = data[col]
            stats_ = ['mean1', 'mean2', 'std1', 'std2', 'nob1', 'nob2']
            for stat in stats_:
                assert stat in data_col, "{} not in {} data".format(stat, col)
            mean1, std1, nob1 = (data_col['mean1'], data_col['std1'],
                                 data_col['nob1'])
            # the second group must use its own mean ('mean2'); reading
            # 'mean1' here made every statistic 0 and every p-value 1
            mean2, std2, nob2 = (data_col['mean2'], data_col['std2'],
                                 data_col['nob2'])
            stat, p = ttest_ind_from_stats(mean1, std1, nob1,
                                           mean2, std2, nob2)
            rows.append({'col': col, 'statistic_value': stat, 'pvalue': p})

    return pd.DataFrame(rows, columns=columns)
def jitter_MWU(values, start, mid, end):
    """
    Search the neighbourhood of `mid` for a better split point.

    Candidate midpoints are scored with a two-sided Mann-Whitney U test, a
    Welch t-test, and a sum-of-squares score built from `cumSS`; the
    candidate minimising sqrt(MWU p-value * sum-of-squares score) is chosen.

    :return: (MWU result, t-test result, chosen midpoint); the first two are
             `no_good_edge` when there is no candidate range to test
    """
    # ADD SOME CONSTRAINTS TO THE RANGE OF VALUES TESTED
    m_start = min(mid, max(start + MIN_POINTS, mid - JITTER))
    m_end = max(mid, min(mid + JITTER, end - MIN_POINTS))
    if m_start == m_end:
        return no_good_edge, no_good_edge, mid
    mids = np.array(range(m_start, m_end))

    # MWU SCORES
    m_score = np.array(
        [
            stats.mannwhitneyu(
                values[max(start, m - MAX_POINTS) : m],
                values[m : min(end, m + MAX_POINTS)],
                use_continuity=True,
                alternative="two-sided",
            )
            for m in mids
        ]
    )

    # t-test SCORES (ONLY REPORTED IN THE RETURN VALUE, NOT USED FOR RANKING)
    t_score = np.array(
        [
            stats.ttest_ind(
                values[max(start, m - MAX_POINTS) : m],
                values[m : min(end, m + MAX_POINTS)],
                equal_var=False,
            )
            for m in mids
        ]
    )

    # TOTAL SUM-OF-SQUARES
    # NOTE(review): not_left/not_right presumably drop the given number of
    # leading/trailing elements -- confirm against their definitions
    if m_start - start == 0:
        # WE CAN NOT OFFSET BY ONE, SO WE ADD A DUMMY VALUE
        v_prefix = np.array([np.nan] + list(not_right(cumSS(values[start:m_end]), 1)))
    else:
        # OFFSET BY ONE, WE WANT cumSS OF ALL **PREVIOUS** VALUES
        v_prefix = not_right(
            not_left(cumSS(values[start:m_end]), m_start - start - 1), 1
        )
    v_suffix = not_right(cumSS(values[m_start:end][::-1])[::-1], end - m_end)
    v_score = v_prefix + v_suffix

    # PICK LOWEST
    pvalue = np.sqrt(m_score[:, 1] * v_score)  # GEOMEAN OF SCORES
    best = np.argmin(pvalue)

    return Data(pvalue=m_score[best, 1]), Data(pvalue=t_score[best, 1]), mids[best]
def t_test_ind(dataset, target_col, protected_col, equal_var=0):
    """
    performs the independent two-sample t-Test, or Welch's test if equality
    of the variances is not given

    @param dataset: object providing
        ``get_all_targets_of_group(target_col, protected_col, group)``
    @param target_col: name of the column that contains the classifier results
    @param protected_col: name of the column that contains the protection
        status
    @param equal_var: if True, perform a standard independent 2 sample test
        that assumes equal population variances. If False (default), perform
        Welch's t-test, which does not assume equal population variance
    @return: calculated t-statistic and two-tailed p-value
    """
    protected_targets = dataset.get_all_targets_of_group(
        target_col, protected_col, 1)
    nonprotected_targets = dataset.get_all_targets_of_group(
        target_col, protected_col, 0)

    # equal_var must be passed by keyword: the third positional parameter of
    # scipy's ttest_ind is `axis`, so a positional value silently left
    # equal_var at its default (True) and Welch's test was never run.
    return ttest_ind(protected_targets, nonprotected_targets,
                     equal_var=bool(equal_var))
def transform(self, labels):
    """Run a t-test on ``self.dataset`` along axis 0.

    With ``self.sample_value`` set, a one-sample test against that value is
    run and ``(t, p)`` is returned.  Otherwise the rows labelled
    ``self.conditions[0]`` are compared with the rows labelled
    ``self.conditions[1]`` and only ``t`` is returned, with NaN statistics
    replaced by 1.

    :param labels: array aligned with the rows of ``self.dataset``
    :raises ValueError: if neither conditions nor sample value are set, or
        if more than two conditions are given
    """
    ds = self.dataset
    conditions = self.conditions
    single_value = self.sample_value

    # Identity checks keep None handling safe for array-valued attributes.
    if conditions is None and single_value is None:
        raise ValueError("either conditions or sample_value must be set")
    # Guarded so that a one-sample setup without conditions no longer fails
    # with a TypeError on len(None).
    if conditions is not None and len(conditions) > 2:
        raise ValueError("at most two conditions are supported")

    if single_value is not None:
        t, p = ttest_1samp(ds, single_value, axis=0)
        return t, p

    t, p = ttest_ind(ds[labels == conditions[0]],
                     ds[labels == conditions[1]],
                     axis=0)

    # undefined statistics (e.g. zero variance) are mapped to 1
    t[np.isnan(t)] = 1

    return t
def run(self, labels):
    """Run a t-test on ``self.dataset`` along axis 0.

    With ``self.sample_value`` set, a one-sample test against that value is
    run and ``(t, p)`` is returned.  Otherwise the rows labelled
    ``self.conditions[0]`` are compared with the rows labelled
    ``self.conditions[1]`` and only ``t`` is returned, with NaN statistics
    replaced by 1.

    :param labels: array aligned with the rows of ``self.dataset``
    :raises ValueError: if neither conditions nor sample value are set, or
        if more than two conditions are given
    """
    ds = self.dataset
    conditions = self.conditions
    single_value = self.sample_value

    # Identity checks keep None handling safe for array-valued attributes.
    if conditions is None and single_value is None:
        raise ValueError("either conditions or sample_value must be set")
    # Guarded so that a one-sample setup without conditions no longer fails
    # with a TypeError on len(None).
    if conditions is not None and len(conditions) > 2:
        raise ValueError("at most two conditions are supported")

    if single_value is not None:
        t, p = ttest_1samp(ds, single_value, axis=0)
        return t, p

    t, p = ttest_ind(ds[labels == conditions[0]],
                     ds[labels == conditions[1]],
                     axis=0)

    # undefined statistics (e.g. zero variance) are mapped to 1
    t[np.isnan(t)] = 1

    return t
def main(symbol_dict):
    # Ties the pipeline together: historical data is pulled for every symbol
    # of every fund type, the slopes of each fund type are computed, and the
    # slopes of the 'US_bond' funds are compared with those of the
    # 'emerging_market' funds.
    #
    # The comparison is Welch's t-test rather than Student's t-test, because
    # there is no reason to assume the two populations have equal variances.
    # The return value is the (test statistic, p-value) result of that test.
    #
    # Reading the result against a significance level alpha (0.05 or 0.01
    # should be sufficient here):
    #   * p above the threshold: insufficient evidence to reject the claim
    #     that both samples have the same mean
    #   * p below the threshold and t < 0: the average increase in value of
    #     an emerging market fund is likely greater than that of a leveraged
    #     bond fund over the same period
    #   * p below the threshold and t > 0: it is likely less
    # NOTE(review): ttest_ind reports a two-sided p-value by default, so the
    # threshold is presumably alpha itself rather than alpha/2 -- confirm.
    slopes = {}
    for key, symbols in symbol_dict.items():
        for bond in symbols:
            pull_historical_data(bond, key)
        slopes[key] = get_slopes(symbols, key)

    bond_slopes = slopes['US_bond']
    emerging_slopes = slopes['emerging_market']
    return stats.ttest_ind(bond_slopes, emerging_slopes, equal_var=False)
def student_t_test_ind(approaches, accuracy_values, save_path):
    # Pairwise two-sided unpaired Student's t-test (scipy) between the
    # accuracy columns of all approaches. The p-values fill the upper
    # triangle (diagonal included) of a table that is rendered as a heatmap
    # and saved to save_path / 'students-test_scipy_ind.png'.
    p_value_table = pd.DataFrame()

    for first_pos, row_approach in enumerate(approaches):
        row_values = accuracy_values.loc[:, row_approach]
        # start at first_pos so each unordered pair is tested only once
        for col_approach in approaches[first_pos:]:
            col_values = accuracy_values.loc[:, col_approach]
            p_value = stats.ttest_ind(row_values, col_values)[1]
            p_value_table.at[row_approach, col_approach] = p_value

    save_path.mkdir(parents=True, exist_ok=True)

    fig = plt.figure(figsize=(4, 2))
    sns.heatmap(
        p_value_table,
        ax=fig.subplots(),
        annot=True,
        fmt="0.3f",
        cmap="autumn",
        vmin=0,
        vmax=0.05,
    )
    plt.xticks(rotation=45)
    fig.canvas.start_event_loop(sys.float_info.min)
    fig.savefig(save_path / 'students-test_scipy_ind.png',
                bbox_inches='tight', dpi=100)
    plt.close(fig)
def plotExpBox_Main(inputFiles,headers,valcols,outputFile,sep,startRow,showIndPoints,mark,markMean,showMean,notch,whisker,outliers,plotPvalueCluster,outputClusterPrefix,methodCluster,xlegendrotation,xlabe,ylabe,figsz,titl,showSampleSizes,trimToMinSize,relabels,logb,plotHistogramToFile,plotMedianForGroups,botta):
    """Read value columns from several files, print test tables, draw a box plot.

    Columns listed in ``valcols`` are read from each input file as floats
    (log-transformed when ``logb`` is non-zero).  Tables of one-sample
    t-test, two-sample Student t-test, Welch t-test and Mann-Whitney U
    p-values are printed to stdout, then the box plot is saved to
    ``outputFile``.
    """

    #if plotPvalueCluster:
        #if pvalue cluster is needed:
    #    from Bio.Cluster.cluster import *
    #    from Bio.Cluster import *
    #endif

    #the real deal!
    plotData=[]
    xtickLabels=[]

    minSize=-1

    for inputFile,header,cols in zip(inputFiles,headers,valcols):
        fin=generic_istream(inputFile)

        # columns of this file are appended after those of earlier files
        startIdx=len(plotData)

        for col in cols:
            plotData.append([])
            xtickLabels.append(header[col])

        colIndices=range(startIdx,startIdx+len(cols))

        lino=0
        for lin in fin:
            lino+=1
            if lino<startRow:
                continue
            fields=lin.rstrip("\r\n").split(sep)

            for idx,col in zip(colIndices,cols):
                try:
                    value=float(fields[col])
                    if logb!=0:
                        # zero cannot be log-transformed; the ValueError is
                        # swallowed below so the value is skipped
                        if value==0.0:
                            raise ValueError
                        value=log(value)/logb
                    plotData[idx].append(value)
                except:
                    # missing or unparsable fields are skipped silently
                    pass

        fin.close()

        # idx is the last column index left over from the loop above
        if minSize==-1:
            minSize=len(plotData[idx]) #or startIDX?
        else:
            minSize=min([minSize,len(plotData[idx])])

    if trimToMinSize:
        print >> stderr,"trimming to min size =",minSize
        trimData(plotData,minSize)

    if len(relabels)>0:
        #if len(relabels)!=len(xtickLabels):
        #    print >> stderr,"relabels doesn't have the same length as original label vectors",xtickLabels,"=>",relabels
        #    exit()
        print >> stderr,xtickLabels
        print >> stderr,relabels
        for i,relabel in zip(range(0,len(relabels)),relabels):
            xtickLabels[i]=relabel

    for i in range(0,len(plotMedianForGroups)):
        plotMedianForGroups[i]=getCol0ListFromCol1ListStringAdv(xtickLabels,plotMedianForGroups[i])

    #drawing medians:
    medianToDraw=[]
    for mediangrouper in plotMedianForGroups:
        curD=[]
        for c in mediangrouper:
            curD.extend(plotData[c])
        medianToDraw.append(median(curD))

    # iterate backwards so deletions do not shift pending indices
    for c in range(len(plotData)-1,-1,-1):
        if len(plotData[c])==0:
            print >> stderr,xtickLabels[c],"discarded"
            del plotData[c]
            del xtickLabels[c]

    print >> stdout,"student t-test (1 sample; mean=0)"
    print >> stdout,"sample","mean","p-val"
    for x in range(0,len(plotData)):
        #print >> stderr, len(plotData[x])
        try:
            print >> stdout, xtickLabels[x],mean(plotData[x]),ttest_1samp(plotData[x],0)[1]
        except:
            print >> stdout, xtickLabels[x],"NA","NA"

    pvalueM=[]

    print >> stdout,""

    print >> stdout,"student t-test (2 samples)"
    print >> stdout,"p-val",
    for x in range(0,len(plotData)):
        print >> stdout,xtickLabels[x],

    print >> stdout,""

    for x in range(0,len(plotData)):
        pvalueRow=[]
        pvalueM.append(pvalueRow)
        print >> stdout, xtickLabels[x],
        for y in range(0,len(plotData)):
            if y<=x:
                # lower triangle: blank in the printout, mirrored in pvalueM
                print >> stdout, "",
                if x==y:
                    pvalueRow.append(1.0)
                else:
                    pvalueRow.append(pvalueM[y][x])
            else:
                try:
                    pvalue=ttest_ind(plotData[x],plotData[y])[1]
                except:
                    pvalue=1.0
                print >> stdout, str(pvalue),
                pvalueRow.append(pvalue)
        print >> stdout,"";

    print >> stdout,""

    if plotPvalueCluster:
        makePValueRawPlot(outputClusterPrefix+"_t_raw",xtickLabels,pvalueM)
        makePValueClusterPlot(outputClusterPrefix+"_t",xtickLabels,pvalueM,methodCluster)

    pvalueM=[]

    print >> stdout,"welch t-test"
    print >> stdout,"p-val",
    for x in range(0,len(plotData)):
        print >> stdout,xtickLabels[x],

    print >> stdout,""
    for x in range(0,len(plotData)):
        pvalueRow=[]
        pvalueM.append(pvalueRow)
        print >> stdout, xtickLabels[x],
        for y in range(0,len(plotData)):
            if y<=x:
                print >> stdout, "",
                if x==y:
                    pvalueRow.append(1.0)
                else:
                    pvalueRow.append(pvalueM[y][x])
            else:
                pvalue=welchs_approximate_ttest_arr(plotData[x],plotData[y])[3]
                print >> stdout, str(pvalue),
                pvalueRow.append(pvalue)
        print >> stdout,"";

    if plotPvalueCluster:
        makePValueRawPlot(outputClusterPrefix+"_Welch_raw",xtickLabels,pvalueM)
        makePValueClusterPlot(outputClusterPrefix+"_Welch",xtickLabels,pvalueM,methodCluster)

    print >> stdout,""
    print >> stdout,"non-parametric (Mann-Whitney U)" #"non-parametric (Mann-Whitney U if larger n<=20 else Wilcoxon)"
    print >> stdout,"p-val",
    for x in range(0,len(plotData)):
        print >> stdout,xtickLabels[x],

    pvalueM=[]

    print >> stdout,""
    for x in range(0,len(plotData)):
        pvalueRow=[]
        pvalueM.append(pvalueRow)
        print >> stdout, xtickLabels[x],
        for y in range(0,len(plotData)):
            if y<=x:
                print >> stdout, "",
                if x==y:
                    pvalueRow.append(1.0)
                else:
                    pvalueRow.append(pvalueM[y][x])
            else:
                #if max(len(plotData[x]),len(plotData[y]))<=20:
                try:
                    pvalue=mannwhitneyu(plotData[x],plotData[y])[1]*2
                except:
                    pvalue=1.0
                print >> stdout,pvalue, #mann-whiteney need to mul by 2 (one tail to two tail)
                pvalueRow.append(pvalue)
                #else:
                #    print >> stdout,wilcoxon(plotData[x],plotData[y])[1], # this is two-tailed already stdout, "", #
        print >> stdout,"";

    if plotPvalueCluster:
        makePValueRawPlot(outputClusterPrefix+"_U_raw",xtickLabels,pvalueM)
        makePValueClusterPlot(outputClusterPrefix+"_U",xtickLabels,pvalueM,methodCluster)

    figure(figsize=figsz)
    subplots_adjust(top=0.9, bottom=botta, left=0.2, right=0.8)

    # fall back to the output file name when no title is given
    if len(titl)==0:
        titl=outputFile

    plotExpBox(plotData,xtickLabels,showIndPoints,mark,markMean,showMean,notch,whisker,outliers,xlegendrotation,xlabe,ylabe,titl,showSampleSizes)

    #ylim([0,200])
    for m in medianToDraw:
        axhline(y=m,linestyle=':',color='gray')

    savefig(outputFile,bbox_inches="tight")

    if len(plotHistogramToFile)>0:
        drawHistogram(plotHistogramToFile,plotData,xtickLabels)
####Pearson test for correlations
print('Correlation test for continuous variables')
for i in range(0, len(continuous)):
    for j in range(i + 1, len(continuous)):
        # run the test once per pair and reuse both outputs
        r, p = pearsonr(stud[continuous[i]], stud[continuous[j]])
        print('corr between', continuous[i], 'and', continuous[j],
              ' : r=', round(r, 3),
              ' p=', round(p, 3))

####T-test for G3 mean: binary features
for var in binary:
    print('G3 mean analisys by ' + var)
    # select G3 before averaging: averaging every column fails on
    # non-numeric columns with pandas >= 2.0
    print(stud.groupby([var])['G3'].mean())
    levels = np.unique(stud[var])
    x = stud.where(stud[var] == levels[0]).dropna()['G3']
    y = stud.where(stud[var] == levels[1]).dropna()['G3']
    print(ttest_ind(x, y))
    plt.figure()
    sns.boxplot(x=var, y='G3', data=stud)
    plt.show()
#The null hypothesis is not rejected in the variables:
#famsize, parents cohabitation, scholar support, family support,
#paid, extracurricular activities, and nursery.

####Spearman rho test between G3 and ordinal features
for var in ordinal:
    print('Spearman rho test between G3 and' + ' ' + var)
    print(spearmanr(stud.G3, stud[var]))
#The null hypothesis of zero correlation is not
#rejected in the variables: famrel
numsamples=500 samp_size=10 alpha=0.05 print "\n",lith[f],round(np.mean(curRh),0) smd.append(SMD_analysis(curUCS,numsamples,samp_size,alpha)) curax=plt.gca() if f==len(lith)-1: curax.set_xlabel("UCS, MPa") plt.setp(curax.get_yticklabels(), visible=False) plt.tight_layout() for f in range(len(lith)): cursmd=smd[f] print "" for g in range(f+1,len(lith)): comsmd=smd[g] T,pT=st.ttest_ind(cursmd,comsmd) KS,pKS=st.ks_2samp(cursmd,comsmd) print [lith[f], lith[g], round(pT,3), round(pKS,3)] plt.show()
plt.title('Difference between charges of a smoker and a non-smoker')
plt.show()
plt.close()

# No apparent relationship between gender and charges
###########################################################################

# 1. T-test to check the dependency of smoking and charges
h0 = "Charges of smoker and non-smoker are the same"
h1 = "Charges of smoker and non-smoker are not the same"

# charges of smokers and of non-smokers, each as a plain array
smoker_rows = insurance_df[insurance_df['smoker'] == 'yes']
non_smoker_rows = insurance_df[insurance_df['smoker'] == 'no']
x = np.array(smoker_rows['charges'])
y = np.array(non_smoker_rows['charges'])

t, p_value = stats.ttest_ind(x, y, axis=0)

# Report against a significance level of 5%
print(
    f'{h1} as the p_value {round(p_value, 3)} < 0.05'
    if p_value < 0.05
    else f'{h0} as the p_value {round(p_value, 3)} > 0.05'
)

print(
    'Analysis: Charges of smoker and non-smoker are not the same as p_value < 0.05'
)

#############################################################################
# 2. BMI of males differ from females significantly
nTrialsSuffix = "-nTrials(1)"; wholeSuffix= truncatedSuffix + controlledErrorSuffix + nTrialsSuffix; # trainEvalData = [pickle.load(open(folder + "trainingSetIDs " + str(i) + "truncated-trainSetEvaluated.dat")) for i in setNumbers)]; simpleTestSampleData = [pickle.load(open("simpleData/" + "trainingSetIDs " + str(i) + wholeSuffix + "-evaluated.dat")) for i in setNumbers]; simpleShuffledData = [pickle.load(open("simpleData/" + "trainingSetIDs " + str(i) + wholeSuffix + "-randomizedRSEvaluated.dat")) for i in setNumbers]; simpleRandomANNData = [pickle.load(open("simpleData/" + "trainingSetIDs " + str(i) + wholeSuffix + "-randomANNEvaluated.dat")) for i in setNumbers]; complexTestSampleData = [pickle.load(open("complexData/" + "trainingSetIDs " + str(i) + wholeSuffix + "-evaluated.dat")) for i in setNumbers]; complexShuffledData = [pickle.load(open("complexData/" + "trainingSetIDs " + str(i) + wholeSuffix + "-randomizedRSEvaluated.dat")) for i in setNumbers]; complexRandomANNData = [pickle.load(open("complexData/" + "trainingSetIDs " + str(i) + wholeSuffix + "-randomANNEvaluated.dat")) for i in setNumbers]; ind = np.arange(N); # the x locations for the groups width = 0.2; # the width of the bars print("Ttest simple Exp. vs Permuted", ttest_ind(simpleTestSampleData, simpleShuffledData)); print("Ttest simple Exp. vs Random", ttest_ind(simpleTestSampleData, simpleRandomANNData)); print("Ttest complex Exp. vs Permuted", ttest_ind(complexTestSampleData, complexShuffledData)); print("Ttest complex Exp. vs Random", ttest_ind(complexTestSampleData, complexRandomANNData)); fig, ax = plt.subplots(); x = [.15, .25, .35, .55, .65, .75] point0 = ax.errorbar(x[2], np.mean(simpleTestSampleData), yerr = np.std(simpleTestSampleData), fmt = 'o', markersize = 5, color = 'g', ecolor = 'g'); point1 = ax.errorbar(x[1], np.mean(simpleShuffledData), yerr = np.std(simpleShuffledData),
def poetry_analysis():
    """Compute readability scores for ten poems and compare them by poem size.

    For each poem file in ``allpoems`` the FKGL, CLI and ARI scores are
    computed from its line, word, character and syllable counts.  The scores
    are attached to the matching rows of ``poem_info.csv``, written to
    ``poem_data.csv``, summarised per ``poemsize`` and compared with Welch
    t-tests on the CLI score; finally a bar plot of ARI by size is shown.
    """
    poems_needed = [
        'i-will-sing-you-one-o.txt', 'place-for-a-third.txt',
        'the-runaway.txt', 'wild-grapes.txt', 'a-winter-eden.txt',
        'sitting-by-a-bush-in-broad-sunlight.txt', 'new-hampshire.txt',
        'pea-brush.txt', 'the-most-of-it.txt', 'the-times-table.txt'
    ]
    calculate_cli = []
    calculate_ari = []
    calculate_fkgl = []

    for poem_file in poems_needed:
        with open(os.path.join("allpoems", poem_file), "r") as infile:
            entry = infile.read()

        # everything except word characters, whitespace and '.' is removed
        no_punc = re.sub(r"[^\w\d\s\.]*", "", entry)
        no_lines_punc = no_punc.replace('\n', ' ')

        # Counts are taken once per poem instead of through helper
        # functions that were redefined and re-run for every formula.
        line_count = entry.count('\n')
        word_count = len(entry.split())
        char_count = len(no_lines_punc)

        ##########################################
        # CALCULATE NUMBER OF SYLLABLES IN POEMS
        ##########################################
        no_extra = re.sub(pattern=r"\s{2,}", repl=" ", string=no_lines_punc)
        the_words = no_extra.split(' ')  # every word in the poem
        syllables = 0
        for word in the_words:
            syllables += syllable_count(word)

        ##########################################
        # CALCULATE FKGL, CLI, ARI
        ##########################################
        FKGL = float(0.39 * (word_count / line_count)
                     + 11.8 * (syllables / word_count) - 15.59)
        calculate_fkgl.append(FKGL)

        # NOTE(review): the published Coleman-Liau coefficients differ from
        # the ones below (0.3 in particular); left unchanged -- confirm the
        # intended formula before relying on the absolute values.
        CLI = (5.89 * (char_count / word_count)) - (
            0.3 * (line_count / word_count)) - 15.8
        calculate_cli.append(CLI)

        ARI = (4.71 * (char_count / word_count)
               + 0.5 * (word_count / line_count) - 21.43)
        calculate_ari.append(ARI)

    ##########################################
    # CREATING A CSV FILE
    ##########################################
    df = pd.read_csv('poem_info.csv')
    df.to_csv('poem_data.csv', header=True, index=False)
    # Rows of poem_info.csv that belong to the poems above, in the same
    # order.  .copy() makes the frame independent of df, so the column
    # assignments below are applied to it reliably.
    poem_data = df.loc[[2, 7, 12, 13, 14, 17, 27, 29, 30, 32]].copy()
    poem_data.rename(columns={'poemname': 'poemid'}, inplace=True)
    poem_data['fkgl'] = [float(item) for item in calculate_fkgl]
    poem_data['cli'] = [float(item) for item in calculate_cli]
    poem_data['ari'] = [float(item) for item in calculate_ari]
    poem_data.to_csv('poem_data.csv', header=True, index=False)

    def zprint(*args, **kwargs):
        print(*args, **kwargs, end='\n\n')

    ##########################################
    # SHOW MEAN
    ##########################################
    # numeric_only avoids a TypeError on text columns with pandas >= 2.0
    zprint('___MEANS___:\n',
           poem_data.groupby('poemsize').mean(numeric_only=True))

    ##########################################
    # SHOW STATS
    ##########################################
    small_cli = poem_data.query('poemsize=="small"')['cli']
    medium_cli = poem_data.query('poemsize=="medium"')['cli']
    large_cli = poem_data.query('poemsize=="large"')['cli']
    small_medium = stats.ttest_ind(small_cli, medium_cli, equal_var=False)
    medium_large = stats.ttest_ind(medium_cli, large_cli, equal_var=False)
    small_large = stats.ttest_ind(small_cli, large_cli, equal_var=False)

    # value printed in the t(...) label: number of poems minus one
    N = len(poem_data.index) - 1
    for outcome in (small_medium, medium_large, small_large):
        print('t({})={:0.2f}, p={:0.2f}'.format(N, outcome.statistic,
                                                outcome.pvalue))

    # NOTE(review): factorplot/size= were renamed catplot/height= in newer
    # seaborn releases -- adjust if the installed version requires it.
    seaborn.factorplot(x='poemsize', y='ari', data=poem_data, kind='bar',
                       size=5)
    # The figure object is not a valid argument of show(); the only
    # parameter is `block`, which the truthy figure used to stand in for.
    pyplot.show(block=True)
s1 += [0] if rnd.random() < p2: s2 += [1] else: s2 += [0] a1 = [] a2 = [] for i in xrange(0, num_tosses+1): a1 += [ average( s1[ : (i+1)] ) ] a2 += [ average( s2[ : (i+1)] ) ] ps = [] for i in xrange(0, num_tosses+1): statistic, p = ttest_ind(s1[ : i+1], s2[ : i+1]) ps += [p] subplot(211) line1 = plot(range(0, num_tosses+1), a1, color='blue') line2 = plot(range(0, num_tosses+1), a2, color='black') legend((line1, line2), ('p1', 'p2')) axis([0, num_tosses, 0, 1]) xlabel('Number of tosses') ylabel('Average heads') title('Estimated probability of heads') subplot(212) pline = plot(range(0, num_tosses+1), ps, color='green') threshold = plot(range(0, num_tosses+1), [0.05] * (num_tosses+1), color='red') legend((pline, threshold), ('ttest', '0.05 threshold'))
def _printPairwisePvalueTable(plotData,xtickLabels,pvalueFunc):
    """Print a pairwise p-value table to stdout and return it as a square matrix.

    pvalueFunc(a,b) is evaluated once for every pair of series with x<y.
    The diagonal is stored as 1.0 and the lower triangle is mirrored from
    the upper one; both are printed as blank cells.
    """
    # header row: "p-val" followed by every sample label
    print >> stdout,"p-val",
    for x in range(0,len(plotData)):
        print >> stdout,xtickLabels[x],
    print >> stdout,""
    pvalueM=[]
    for x in range(0,len(plotData)):
        pvalueRow=[]
        pvalueM.append(pvalueRow)
        print >> stdout, xtickLabels[x],
        for y in range(0,len(plotData)):
            if y<=x:
                print >> stdout, "",
                if x==y:
                    pvalueRow.append(1.0)
                else:
                    # already computed while processing row y
                    pvalueRow.append(pvalueM[y][x])
            else:
                pvalue=pvalueFunc(plotData[x],plotData[y])
                print >> stdout, str(pvalue),
                pvalueRow.append(pvalue)
        print >> stdout,""
    return pvalueM


def plotExpBox_Main(inputFile,header,cols,outputFile,sep,startRow,showIndPoints,mark,markMean,showMean,notch,whisker,outliers,plotPvalueCluster,outputClusterPrefix,methodCluster,xlegendrotation,xlabe,ylabe,figsz,titl,showSampleSizes):
    """Read numeric columns from one delimited file, print test tables, and save a box plot.

    inputFile            -- passed to generic_istream() to obtain the line iterator
    header               -- indexable by column number; supplies the sample labels
    cols                 -- column indices to read; one data series per entry
    outputFile           -- target of savefig(); also used as title when titl is empty
    sep                  -- field separator used to split each line
    startRow             -- 1-based number of the first line that is parsed
    plotPvalueCluster    -- when true, makePValueClusterPlot() is called for each table
    outputClusterPrefix  -- filename prefix for the raw and clustered p-value plots
    methodCluster        -- forwarded to makePValueClusterPlot()
    figsz                -- forwarded to figure(figsize=...)
    titl                 -- plot title
    The remaining arguments are forwarded to plotExpBox() unchanged.

    Fields that are missing or not parseable as float are skipped silently,
    so the series may end up with different lengths.
    """
    #if plotPvalueCluster:
        #if pvalue cluster is needed:
    #    from Bio.Cluster.cluster import *
    #    from Bio.Cluster import *
    #endif

    #the real deal!
    fin=generic_istream(inputFile)
    try:
        plotData=[]
        xtickLabels=[]
        for col in cols:
            plotData.append([])
            xtickLabels.append(header[col])

        colIndices=range(0,len(cols))
        lino=0
        for lin in fin:
            lino+=1
            if lino<startRow:
                continue
            fields=lin.rstrip("\r\n").split(sep)
            for idx,col in zip(colIndices,cols):
                try:
                    value=float(fields[col])
                except (ValueError,IndexError):
                    # short line or non-numeric cell: skip this value only
                    continue
                plotData[idx].append(value)
    finally:
        fin.close()

    print >> stdout,"student t-test (1 sample; mean=0)"
    print >> stdout,"sample","mean","p-val"
    for x in range(0,len(plotData)):
        print >> stdout, xtickLabels[x],mean(plotData[x]),ttest_1samp(plotData[x],0)[1]

    print >> stdout,""
    print >> stdout,"student t-test (2 samples)"
    pvalueM=_printPairwisePvalueTable(plotData,xtickLabels,lambda a,b: ttest_ind(a,b)[1])
    print >> stdout,""

    # in this version the raw plot is produced regardless of plotPvalueCluster
    makePValueRawPlot(outputClusterPrefix+"_t_raw",xtickLabels,pvalueM)
    if plotPvalueCluster:
        makePValueClusterPlot(outputClusterPrefix+"_t",xtickLabels,pvalueM,methodCluster)

    print >> stdout,"welch t-test"
    pvalueM=_printPairwisePvalueTable(plotData,xtickLabels,lambda a,b: welchs_approximate_ttest_arr(a,b)[3])

    makePValueRawPlot(outputClusterPrefix+"_Welch_raw",xtickLabels,pvalueM)
    if plotPvalueCluster:
        makePValueClusterPlot(outputClusterPrefix+"_Welch",xtickLabels,pvalueM,methodCluster)

    print >> stdout,""
    print >> stdout,"non-parametric (Mann-Whitney U)" #"non-parametric (Mann-Whitney U if larger n<=20 else Wilcoxon)"
    # the Mann-Whitney p-value is doubled (one tail to two tail), as in the original script
    # NOTE(review): this assumes mannwhitneyu() returns a one-sided p-value -- confirm for the scipy version in use
    pvalueM=_printPairwisePvalueTable(plotData,xtickLabels,lambda a,b: mannwhitneyu(a,b)[1]*2)

    makePValueRawPlot(outputClusterPrefix+"_U_raw",xtickLabels,pvalueM)
    if plotPvalueCluster:
        makePValueClusterPlot(outputClusterPrefix+"_U",xtickLabels,pvalueM,methodCluster)

    figure(figsize=figsz)
    if len(titl)==0:
        titl=outputFile

    plotExpBox(plotData,xtickLabels,showIndPoints,mark,markMean,showMean,notch,whisker,outliers,xlegendrotation,xlabe,ylabe,titl,showSampleSizes)

    #ylim([0,200])
    savefig(outputFile,bbox_inches="tight")
def dropout_pred(model, ref, ref_rc, alt, alt_rc, mutation_positions, out_annotation_all_outputs,
                 output_filter_mask=None, out_annotation=None, dropout_iterations=30):
    """Dropout-based variant effect prediction

    This method is based on the ideas in [Gal et al.](https://arxiv.org/pdf/1506.02142.pdf) where dropout
    layers are also activated in the model prediction phase in order to estimate model uncertainty. The
    advantage of this method is that instead of a point estimate of the model output the distribution of
    the model output is estimated.

    # Arguments
        model: Keras model
        ref: Input sequence with the reference genotype in the mutation position
        ref_rc: Reverse complement of the 'ref' argument
        alt: Input sequence with the alternative genotype in the mutation position
        alt_rc: Reverse complement of the 'alt' argument
        mutation_positions: Position on which the mutation was placed in the forward sequences.
            Must be one-dimensional with one entry per input sequence.
        out_annotation_all_outputs: Output labels of the model.
        output_filter_mask: Mask of boolean values indicating which model outputs should be used.
            Use this or 'out_annotation'
        out_annotation: List of outputs labels for which of the outputs (in case of a multi-task model) the
            predictions should be calculated.
        dropout_iterations: Number of prediction iterations to be performed in order to estimate the
            output distribution. Values greater than 30 are recommended to get a reliable p-value.

    # Returns
        Dictionary with a set of measures of the model uncertainty in the variant position. The ones of interest are:

        - do_{ref, alt}_mean: Mean of the model predictions given the respective input sequence and dropout.
            - Forward or reverse-complement sequences are chosen as for 'do_pv'.
        - do_{ref, alt}_var: Variance of the model predictions given the respective input sequence and dropout.
            - Forward or reverse-complement sequences are chosen as for 'do_pv'.
        - do_diff: 'do_alt_mean' - 'do_ref_mean', which is an estimate similar to ISM using diff_type "diff".
        - do_pv: P-value of a two-sample t-test ('ttest_ind'), comparing the predictions of ref with the
            ones of alt. Forward or reverse-complement sequences are chosen based on which pair has
            the lower p-value.
        - do_logit_pv: Same as 'do_pv' but computed on logit-transformed predictions; only present when
            all predictions lie in [0, 1].
        - do_cvar: Square root of the mean of the ref and alt squared coefficients of variation.
    """
    prefix = "do"

    seqs = {"ref": ref, "ref_rc": ref_rc, "alt": alt, "alt_rc": alt_rc}

    # all four inputs must agree in length with 'ref', and there must be one mutation position per sequence
    assert np.all([
        np.array(get_seq_len(ref)) == np.array(get_seq_len(seqs[k]))
        for k in seqs.keys() if k != "ref"
    ])
    assert get_seq_len(ref)[0] == mutation_positions.shape[0]
    assert len(mutation_positions.shape) == 1

    # determine which outputs should be selected
    if output_filter_mask is None:
        if out_annotation is None:
            output_filter_mask = np.arange(out_annotation_all_outputs.shape[0])
        else:
            output_filter_mask = np.where(
                np.in1d(out_annotation_all_outputs, out_annotation))[0]

    # make sure the labels are assigned correctly
    out_annotation = out_annotation_all_outputs[output_filter_mask]

    # Instead of loading the model from a json file I will transfer the model architecture + weights in memory
    model_config = model._updated_config()
    alt_config = replace_dict_values(model_config, u"Dropout", u"BiDropout")

    # Custom objects have to be added before correctly!
    alt_model = keras.layers.deserialize(alt_config)

    # Transfer weights and biases
    alt_model.set_weights(model.get_weights())

    # ANALOGOUS TO ISM:
    # predict
    preds = {}
    for k in seqs:
        preds[k] = pred_do(alt_model, seqs[k], output_filter_mask=output_filter_mask,
                           dropout_iterations=dropout_iterations)

    # only the p-values are used; the t statistics are discarded
    _, prob = ttest_ind(preds["ref"], preds["alt"], axis=0)
    _, prob_rc = ttest_ind(preds["ref_rc"], preds["alt_rc"], axis=0)

    logit_prob = None
    logit_prob_rc = None
    pred_range = get_range(preds)
    # In case the predictions are bound to [0,1] it might make sense to use logit on the data, as the model output
    # could be probabilities
    if np.all([(pred_range[k] >= 0) and (pred_range[k] <= 1) for k in pred_range]):
        logit_preds = apply_over_single(preds, logit)
        logit_prob = apply_over_double(logit_preds["ref"], logit_preds["alt"], apply_func=ttest_ind,
                                       select_return_elm=1, axis=0)
        logit_prob_rc = apply_over_double(logit_preds["ref_rc"], logit_preds["alt_rc"], apply_func=ttest_ind,
                                          select_return_elm=1, axis=0)
    # fwd and rc are independent here... so this can be done differently here...

    # Select the LOWER p-value among fwd and rc.
    # The builtin int replaces the np.int alias, which NumPy 1.24 removed.
    sel = (np.abs(prob) > np.abs(prob_rc)).astype(int)

    out_dict = {}

    out_dict["%s_pv" % prefix] = pd.DataFrame(overwite_by(prob, prob_rc, sel), columns=out_annotation)

    if logit_prob is not None:
        logit_sel = (np.abs(logit_prob) > np.abs(logit_prob_rc)).astype(int)
        out_dict["%s_logit_pv" % prefix] = pd.DataFrame(overwite_by(
            logit_prob, logit_prob_rc, logit_sel), columns=out_annotation)

    pred_means = {}
    pred_vars = {}
    pred_cvar2 = {}
    for k in preds:
        pred_means[k] = np.mean(preds[k], axis=0)
        pred_vars[k] = np.var(preds[k], axis=0)
        # squared coefficient of variation
        pred_cvar2[k] = pred_vars[k] / (pred_means[k]**2)

    mean_cvar = np.sqrt((pred_cvar2["ref"] + pred_cvar2["alt"]) / 2)
    mean_cvar_rc = np.sqrt((pred_cvar2["ref_rc"] + pred_cvar2["alt_rc"]) / 2)

    # the summary statistics follow the same fwd/rc selection as the raw p-value
    mean_cvar = overwite_by(mean_cvar, mean_cvar_rc, sel)
    ref_mean = overwite_by(pred_means["ref"], pred_means["ref_rc"], sel)
    alt_mean = overwite_by(pred_means["alt"], pred_means["alt_rc"], sel)
    ref_var = overwite_by(pred_vars["ref"], pred_vars["ref_rc"], sel)
    alt_var = overwite_by(pred_vars["alt"], pred_vars["alt_rc"], sel)

    out_dict["%s_ref_mean" % prefix] = pd.DataFrame(ref_mean, columns=out_annotation)
    out_dict["%s_alt_mean" % prefix] = pd.DataFrame(alt_mean, columns=out_annotation)

    out_dict["%s_ref_var" % prefix] = pd.DataFrame(ref_var, columns=out_annotation)
    out_dict["%s_alt_var" % prefix] = pd.DataFrame(alt_var, columns=out_annotation)

    out_dict["%s_cvar" % prefix] = pd.DataFrame(mean_cvar, columns=out_annotation)

    out_dict["%s_diff" % prefix] = out_dict["%s_alt_mean" % prefix] - out_dict["%s_ref_mean" % prefix]

    return out_dict
def one_side_test(first, second):
    """Score how strongly `first` exceeds `second` using Welch's t-test.

    Returns 0.0 when the t statistic is negative; otherwise returns
    1 - p / 2, where p is the two-sided p-value of the unequal-variance
    t-test.
    """
    t_stat, two_sided_p = ttest_ind(first, second, equal_var=False)
    return 0.0 if t_stat < 0 else 1 - two_sided_p / 2
# Run the hand-written independent t-test and echo its four outputs.
t_stat, df, cv, p = independent_ttest(data1, data2, alpha)
print('t=%.3f, df=%d, cv=%.3f, p=%.3f' % (t_stat, df, cv, p))

# Decision based on the critical value.
print('Accept null hypothesis that the means are equal.'
      if abs(t_stat) <= cv
      else 'Reject the null hypothesis that the means are equal.')

# Decision based on the p-value.
print('Accept null hypothesis that the means are equal.'
      if p > alpha
      else 'Reject the null hypothesis that the means are equal.')


# In[6]:

# Same comparison through the library routine; the bare name displays the result in a notebook.
twosample_results = stats.ttest_ind(data1, data2)
twosample_results


# In[7]:

# Header row plus one data row, in the list-of-lists layout passed to ff.create_table below.
matrix_twosample = [
    ['', 'Test Statistic', 'p-value'],
    ['Sample Data', twosample_results[0], twosample_results[1]],
]
matrix_twosample


# In[8]:

twosample_table = ff.create_table(matrix_twosample, index=True)
twosample_table
def compute(self, chromosome, start, end, additional=None):
    """Compute Pearson correlations between pairs of expression columns in a region.

    Args:
        chromosome: chromosome identifier forwarded to Gene_data.
        start: region start forwarded to Gene_data.
        end: region end forwarded to Gene_data.
        additional: opaque parameters decoded by self.unpack_params().

    Returns:
        The list of correlation objects built by build_obj(), sorted by
        their 'value' entry in descending order and expanded through
        pd.Series(...).apply(pd.Series).
    """
    part_type, g_one, g_two, grouping = self.unpack_params(additional)
    expression_data = Gene_data(start, end, chromosome, measurements=self.gene_types)

    corr_list = []

    group_one, group_two = self.partion(part_type, g_one)
    exp_group_one = Gene_data(start, end, chromosome, measurements=group_one.to_dict('records'))
    # only columns whose name contains "_" are kept as measurement columns
    group_one = [c for c in exp_group_one.columns if "_" in c]
    group_one = self.to_list_of_dict(group_one)

    if part_type is not None:
        exp_group_two = Gene_data(
            start, end, chromosome, measurements=group_two.to_dict('records'))
        group_two = [c for c in exp_group_two.columns if "_" in c]
        group_two = self.to_list_of_dict(group_two)
        group_pairs = [(x, y) for x in group_one for y in group_two]
    else:
        group_pairs = itertools.combinations(group_one, 2)

    # all combinations of gene expressions
    # TODO (Kyle?): simplify the above code
    # NOTE(review): this assignment overrides both group_pairs values chosen above;
    # when part_type is None, group_two is still the raw value returned by
    # self.partion() -- confirm that concatenating it with a list is intended.
    group_pairs = itertools.combinations(group_one + group_two, 2)

    # pvalue_list = []
    #for data_source_one, data_source_two in itertools.combinations(self.gene_types, 2):
    for data_source_one, data_source_two in group_pairs:
        exp1 = data_source_one['id']
        exp2 = data_source_two['id']

        # skip pairs for which the region data lacks either column
        if exp1 in expression_data.columns and exp2 in expression_data.columns:
            col_one = expression_data[exp1]
            col_two = expression_data[exp2]

            correlation_coefficient = pearsonr(col_one, col_two)
            corr_obj = build_obj('correlation', 'expression', 'expression', True,
                                 data_source_one, data_source_two,
                                 correlation_coefficient[0], correlation_coefficient[1])
            corr_list.append(corr_obj)
            # The Welch t-test that used to run here was dropped: its result
            # was never stored or returned.

    corr_list = sorted(corr_list, key=lambda x: x['value'], reverse=True)
    corr_res = pd.Series(corr_list)
    corr_res = corr_res.apply(pd.Series)
    parse_res = corr_res
    # corr_res = corr_res.to_json(orient='records')
    # parse_res = json.loads(corr_res)
    return parse_res
def in_out_mask_ttest_from_glm_file(wpth,scale_path,contrast,scale,imsk,omsk,parcel_img,eff='ttest',membership=[],conndf=''): oldpth = pthswp(wpth) tdf = pandas.DataFrame(np.zeros((scale,4)), columns=['t','p','wt','wp']) if not membership: #determine which seeds are in which mask inseedz = [] outseedz = [] cde = codegen(6) print 'determining parcel membership...' for i in range(1,(scale+1)): os.system('fslmaths %s -thr %s -uthr %s %s'%(parcel_img,i,i,cde)) os.system('fslmaths %s.nii.gz -mas %s %s1'%(cde,imsk,cde)) os.system('fslmaths %s.nii.gz -mas %s %s2'%(cde,omsk,cde)) ival = subprocess.check_output('fslstats %s1.nii.gz -V'%(cde),shell = True).rsplit()[0] oval = subprocess.check_output('fslstats %s2.nii.gz -V'%(cde),shell = True).rsplit()[0] # determine membership via winner-takes-all if int(ival) > int(oval): inseedz.append(i) print 'seed %s going inside mask'%(i) elif int(ival) < int(oval): outseedz.append(i) print 'seed %s going outside mask'%(i) else: print 'could not resolve seed %s. In vox = %s, out vox = %s. Excluding from analysis'%(i,ival,oval) os.system('rm %s*') else: inseedz = membership[0] outseedz = membership[1] print 'preparing connectivity map...' 
if type(conndf) == pandas.core.frame.DataFrame: df = conndf else: df =jni.create_df_from_mat(scale_path,scale,pval=0.1,eff_tp=eff,mat_tp = 'glm') for i in range(scale): print 'calculating values for seed %s'%(i+1) ivalz = [] ovalz = [] indz = [] for ind in df.index.tolist(): for x in ind: if x == (i+1): indz.append(ind) indz.remove(indz[i]) for y in indz: if y[0] == (i+1): conn = y[1] else: conn = y[0] if conn in inseedz: ivalz.append(df.ix[y,eff]) elif conn in outseedz: ovalz.append(df.ix[y,eff]) invec = np.array(ivalz) outvec = np.array(ovalz) t,p = st.ttest_ind(invec,outvec) wt,wp = st.ttest_ind(invec,outvec,equal_var = False) tdf.ix[(i+1),'t'] = t tdf.ix[(i+1),'p'] = p tdf.ix[(i+1),'wt'] = wt tdf.ix[(i+1),'wp'] = wp tdf.ix[(i+1),'gof'] = np.mean(invec) / np.mean(outvec) os.chdir(oldpth) return tdf,df,inseedz,outseedz
############################################### # Variance homogeneity # H0 = Variance is homogeneous # H1 = Variance is not homogeneous. stats.levene(AB['T_Purchase'], AB['C_Purchase']) # LeveneResult(statistic=2.6392694728747363, pvalue=0.10828588271874791) # h0 isn't rejected. ###################################################### # Hypothesis Testing test_statistics, pvalue = stats.ttest_ind(AB['T_Purchase'], AB['C_Purchase'], equal_var=True) print('test statistics = %.4f, p-value = %.4f' % (test_statistics, pvalue)) #test statistics = 0.9416, p-value = 0.3493 # h0 isn't rejected. ####################################### # # Create hypothesis (Earning) ####################################### # Control and Test Purchase mean: # C_Earning: 1908.5683 # T_Earning: 2514.890733 # H0: M1 = M2 There is no statistically significant difference between the maximum bidding and average bidding.
def _printPairwisePvalueTable(plotData,xtickLabels,pvalueFunc):
    """Print a pairwise p-value table to stdout and return it as a square matrix.

    pvalueFunc(a,b) is evaluated once for every pair of series with x<y.
    The diagonal is stored as 1.0 and the lower triangle is mirrored from
    the upper one; both are printed as blank cells.
    """
    # header row: "p-val" followed by every sample label
    print >> stdout,"p-val",
    for x in range(0,len(plotData)):
        print >> stdout,xtickLabels[x],
    print >> stdout,""
    pvalueM=[]
    for x in range(0,len(plotData)):
        pvalueRow=[]
        pvalueM.append(pvalueRow)
        print >> stdout, xtickLabels[x],
        for y in range(0,len(plotData)):
            if y<=x:
                print >> stdout, "",
                if x==y:
                    pvalueRow.append(1.0)
                else:
                    # already computed while processing row y
                    pvalueRow.append(pvalueM[y][x])
            else:
                pvalue=pvalueFunc(plotData[x],plotData[y])
                print >> stdout, str(pvalue),
                pvalueRow.append(pvalue)
        print >> stdout,""
    return pvalueM


def plotExpBox_Main(inputFiles,headers,valcols,outputFile,sep,startRow,showIndPoints,mark,markMean,showMean,notch,whisker,outliers,plotPvalueCluster,outputClusterPrefix,methodCluster,xlegendrotation,xlabe,ylabe,figsz,titl,showSampleSizes,trimToMinSize,relabels):
    """Read numeric columns from several delimited files, print test tables, and save a box plot.

    inputFiles, headers, valcols -- parallel sequences: for each file, the name passed to
                            generic_istream(), its header (indexable by column number) and the
                            column indices to read
    outputFile           -- target of savefig(); also used as title when titl is empty
    sep                  -- field separator used to split each line
    startRow             -- 1-based number of the first line that is parsed
    plotPvalueCluster    -- when true, raw and clustered p-value plots are written per table
    outputClusterPrefix  -- filename prefix for those plots
    methodCluster        -- forwarded to makePValueClusterPlot()
    figsz                -- forwarded to figure(figsize=...)
    titl                 -- plot title
    trimToMinSize        -- when true, trimData() is called with the shortest series length
    relabels             -- replacement labels; must match the number of series or the program exits
    The remaining arguments are forwarded to plotExpBox() unchanged.

    Fields that are missing or not parseable as float are skipped silently.
    """
    #if plotPvalueCluster:
        #if pvalue cluster is needed:
    #    from Bio.Cluster.cluster import *
    #    from Bio.Cluster import *
    #endif

    #the real deal!
    plotData=[]
    xtickLabels=[]

    minSize=-1

    for inputFile,header,cols in zip(inputFiles,headers,valcols):
        fin=generic_istream(inputFile)
        try:
            startIdx=len(plotData)

            for col in cols:
                plotData.append([])
                xtickLabels.append(header[col])

            colIndices=range(startIdx,startIdx+len(cols))

            lino=0
            for lin in fin:
                lino+=1
                if lino<startRow:
                    continue
                fields=lin.rstrip("\r\n").split(sep)
                for idx,col in zip(colIndices,cols):
                    try:
                        value=float(fields[col])
                    except (ValueError,IndexError):
                        # short line or non-numeric cell: skip this value only
                        continue
                    plotData[idx].append(value)
        finally:
            fin.close()

        # Track the shortest series over every column of this file. The loop
        # variable left over from parsing was used before, which looked only at
        # the last column and was undefined when a file produced no data rows.
        for idx in colIndices:
            if minSize==-1:
                minSize=len(plotData[idx])
            else:
                minSize=min([minSize,len(plotData[idx])])

    if trimToMinSize:
        print >> stderr,"trimming to min size =",minSize
        trimData(plotData,minSize)

    if len(relabels)>0:
        if len(relabels)!=len(xtickLabels):
            print >> stderr,"relabels doesn't have the same length as original label vectors",xtickLabels,"=>",relabels
            exit()
        xtickLabels=relabels

    print >> stdout,"student t-test (1 sample; mean=0)"
    print >> stdout,"sample","mean","p-val"
    for x in range(0,len(plotData)):
        #print >> stderr, len(plotData[x])
        print >> stdout, xtickLabels[x],mean(plotData[x]),ttest_1samp(plotData[x],0)[1]

    print >> stdout,""
    print >> stdout,"student t-test (2 samples)"
    pvalueM=_printPairwisePvalueTable(plotData,xtickLabels,lambda a,b: ttest_ind(a,b)[1])
    print >> stdout,""

    if plotPvalueCluster:
        makePValueRawPlot(outputClusterPrefix+"_t_raw",xtickLabels,pvalueM)
        makePValueClusterPlot(outputClusterPrefix+"_t",xtickLabels,pvalueM,methodCluster)

    print >> stdout,"welch t-test"
    pvalueM=_printPairwisePvalueTable(plotData,xtickLabels,lambda a,b: welchs_approximate_ttest_arr(a,b)[3])

    if plotPvalueCluster:
        makePValueRawPlot(outputClusterPrefix+"_Welch_raw",xtickLabels,pvalueM)
        makePValueClusterPlot(outputClusterPrefix+"_Welch",xtickLabels,pvalueM,methodCluster)

    print >> stdout,""
    print >> stdout,"non-parametric (Mann-Whitney U)" #"non-parametric (Mann-Whitney U if larger n<=20 else Wilcoxon)"
    # the Mann-Whitney p-value is doubled (one tail to two tail), as in the original script
    # NOTE(review): this assumes mannwhitneyu() returns a one-sided p-value -- confirm for the scipy version in use
    pvalueM=_printPairwisePvalueTable(plotData,xtickLabels,lambda a,b: mannwhitneyu(a,b)[1]*2)

    if plotPvalueCluster:
        makePValueRawPlot(outputClusterPrefix+"_U_raw",xtickLabels,pvalueM)
        makePValueClusterPlot(outputClusterPrefix+"_U",xtickLabels,pvalueM,methodCluster)

    figure(figsize=figsz)
    if len(titl)==0:
        titl=outputFile

    plotExpBox(plotData,xtickLabels,showIndPoints,mark,markMean,showMean,notch,whisker,outliers,xlegendrotation,xlabe,ylabe,titl,showSampleSizes)

    #ylim([0,200])
    savefig(outputFile,bbox_inches="tight")
def get_difference(X, Y):
    """Return one minus the two-sided p-value of an independent t-test on X and Y."""
    test_result = stats.ttest_ind(X, Y)
    p_value = test_result[1]
    return 1 - p_value
def _printPairwiseTestTable(plotData,xtickLabels,pvalueFunc,fallback,minuslog10pvalue):
    """Print a pairwise p-value table to stdout and return it as a square matrix.

    pvalueFunc(a,b) is evaluated once for every pair of series with x<y; if it
    raises, `fallback` is stored instead. With minuslog10pvalue set, every value
    other than "NA" is replaced by -log10(value). The diagonal is stored as 1.0
    (0.0 with minuslog10pvalue) and the lower triangle is mirrored from the
    upper one; both are printed as blank cells.
    """
    # header row: "p-val" followed by every sample label
    print >> stdout,"p-val",
    for x in range(0,len(plotData)):
        print >> stdout,xtickLabels[x],
    print >> stdout,""
    pvalueM=[]
    for x in range(0,len(plotData)):
        pvalueRow=[]
        pvalueM.append(pvalueRow)
        print >> stdout, xtickLabels[x],
        for y in range(0,len(plotData)):
            if y<=x:
                print >> stdout, "",
                if x==y:
                    if minuslog10pvalue:
                        pvalueRow.append(0.0)
                    else:
                        pvalueRow.append(1.0)
                else:
                    # already computed while processing row y
                    pvalueRow.append(pvalueM[y][x])
            else:
                try:
                    pvalue=pvalueFunc(plotData[x],plotData[y])
                except Exception:
                    pvalue=fallback
                if minuslog10pvalue and str(pvalue)!="NA":
                    try:
                        pvalue=-1*log(pvalue,10)
                    except Exception:
                        # NOTE(review): a p-value of 0 ends up here and is stored as -1000.0,
                        # although -log10 of a tiny p-value is large and positive -- confirm the sign
                        pvalue=-1000.0
                print >> stdout, str(pvalue),
                pvalueRow.append(pvalue)
        print >> stdout,""
    return pvalueM


def plotExpBox_Main(inputFiles,headers,valcols,outputFile,sep,startRow,showIndPoints,mark,markMean,showMean,notch,whisker,outliers,plotPvalueCluster,outputClusterPrefix,methodCluster,xlegendrotation,xlabe,ylabe,figsz,titl,showSampleSizes,trimToMinSize,relabels,logb,plotHistogramToFile,plotMedianForGroups,botta,showViolin,showBox,firstColAnnot,plotTrend,showLegend,makePzfxFile,makeBinMatrix,writeDataSummaryStat,summaryStatRange,minuslog10pvalue,minNDataToKeep,vfacecolor,valpha,outXYZPvalues,dividePlots):
    """Read numeric columns from several delimited files, print test tables, and save a box plot.

    inputFiles, headers, valcols -- parallel sequences: for each file, the name passed to
                            generic_istream(), its header (indexable by column number) and the
                            column indices to read
    outputFile           -- target of savefig(); also used as title when titl is empty
    sep                  -- field separator used to split each line
    startRow             -- 1-based number of the first line that is parsed
    plotPvalueCluster    -- when true, raw and clustered p-value plots are written per table
    outputClusterPrefix  -- filename prefix for those plots
    methodCluster        -- forwarded to makePValueClusterPlot()
    figsz                -- forwarded to figure(figsize=...)
    titl                 -- plot title
    trimToMinSize        -- when true, trimData() is called with the shortest series length
    relabels             -- replacement labels, applied positionally to the first len(relabels) series
    logb                 -- when non-zero each value v is stored as log(v)/logb; zeros are rejected
    plotHistogramToFile  -- when non-empty, histogram and density plots are written using this name
    plotMedianForGroups  -- list of group selectors; REWRITTEN IN PLACE into column-index lists,
                            then one horizontal median line is drawn per group
    botta                -- bottom margin passed to subplots_adjust()
    firstColAnnot        -- when true, the first entry of each valcols item is an annotation column
    plotTrend            -- when true, per-line values are collected in trendData (None for lines
                            with any rejected value)
    writeDataSummaryStat -- output filename for the per-sample summary table, or a false value to skip it
    summaryStatRange     -- (low, high) pair passed to filterDataInRangeInclusive()
    minuslog10pvalue     -- report -log10(p) instead of p in the pairwise tables
    minNDataToKeep       -- series with fewer data points are discarded before testing and plotting
    outXYZPvalues        -- when set, prefix for the Welch and Mann-Whitney .xyz outputs
    vfacecolor, valpha   -- accepted but not used in this function body
    The remaining arguments are forwarded to plotExpBox() unchanged.

    Statistics are skipped when the module-level name `skipStat` is true.
    NOTE(review): the extent of the skipStat guard (every table from the
    one-sample test through Bartlett's test) was reconstructed from
    whitespace-collapsed source -- confirm against the original file.
    """
    #if plotPvalueCluster:
        #if pvalue cluster is needed:
    #    from Bio.Cluster.cluster import *
    #    from Bio.Cluster import *
    #endif

    #the real deal!
    plotData=[]
    xtickLabels=[]

    trendData={}
    annot={}

    minSize=-1

    for inputFile,header,cols in zip(inputFiles,headers,valcols):
        fin=generic_istream(inputFile)
        try:
            startIdx=len(plotData)

            if firstColAnnot:
                colAnnot=cols[0]
                cols=cols[1:]
                annotThisFile=[]
                annot[startIdx]=annotThisFile
            else:
                colAnnot=-1
                annotThisFile=None

            for col in cols:
                plotData.append([])
                xtickLabels.append(header[col])

            colIndices=range(startIdx,startIdx+len(cols))

            if plotTrend:
                #print >> stderr,"plotTrend"
                trendDataThisFile=[]
                trendData[startIdx]=trendDataThisFile
            else:
                trendDataThisFile=None

            lino=0
            for lin in fin:
                lino+=1
                if lino<startRow:
                    continue
                fields=lin.rstrip("\r\n").split(sep)

                if plotTrend:
                    #print >> stderr,"a"
                    trendDataThisLine=[]
                else:
                    trendDataThisLine=None

                allDataOKThisLine=True

                if colAnnot>=0:
                    annotThisFile.append(fields[colAnnot])

                for idx,col in zip(colIndices,cols):
                    try:
                        value=float(fields[col])
                        if logb!=0:
                            if value==0.0:
                                raise ValueError
                            value=log(value)/logb
                    except (ValueError,IndexError):
                        # short line, non-numeric cell, or value rejected by the log transform
                        allDataOKThisLine=False
                        continue
                    plotData[idx].append(value)
                    if plotTrend:
                        trendDataThisLine.append(value)
                        #print >> stderr,"value:",value

                if plotTrend:
                    if allDataOKThisLine:
                        trendDataThisFile.append(trendDataThisLine)
                    else:
                        trendDataThisFile.append(None)
        finally:
            fin.close()

        # Track the shortest series over every column of this file. The loop
        # variable left over from parsing was used before, which looked only at
        # the last column and was undefined when a file produced no data rows.
        for idx in colIndices:
            if minSize==-1:
                minSize=len(plotData[idx])
            else:
                minSize=min([minSize,len(plotData[idx])])

    if trimToMinSize:
        print >> stderr,"trimming to min size =",minSize
        trimData(plotData,minSize)

    if len(relabels)>0:
        #if len(relabels)!=len(xtickLabels):
        #    print >> stderr,"relabels doesn't have the same length as original label vectors",xtickLabels,"=>",relabels
        #    exit()
        print >> stderr,xtickLabels
        print >> stderr,relabels
        for i,relabel in zip(range(0,len(relabels)),relabels):
            xtickLabels[i]=relabel

    for i in range(0,len(plotMedianForGroups)):
        plotMedianForGroups[i]=getCol0ListFromCol1ListStringAdv(xtickLabels,plotMedianForGroups[i])

    #drawing medians:
    medianToDraw=[]
    for mediangrouper in plotMedianForGroups:
        curD=[]
        for c in mediangrouper:
            curD.extend(plotData[c])
        medianToDraw.append(median(curD))

    # walk backwards so deletions do not shift the indices still to be visited
    for c in range(len(plotData)-1,-1,-1):
        if len(plotData[c])<minNDataToKeep:
            print >> stderr,xtickLabels[c],"discarded because has only",len(plotData[c]),"data points <",minNDataToKeep
            del plotData[c]
            del xtickLabels[c]

    if not skipStat:
        print >> stdout,"student t-test (1 sample; mean=0)"
        print >> stdout,"sample","mean","p-val","median"
        fDSS=None
        if writeDataSummaryStat:
            fDSS=open(writeDataSummaryStat,"w")
        try:
            if writeDataSummaryStat:
                print >> fDSS,"sample\tmean\tvar\tsd\tmin\tmax\tN\tNInRange["+str(summaryStatRange[0])+","+str(summaryStatRange[1])+"]\t%NInRange\tNbelowRange\t%Nbelow\tNAboveRange\t%NAbove"
            for x in range(0,len(plotData)):
                #print >> stderr, len(plotData[x])
                # compute the p-value before printing so that a failing test cannot
                # leave a partially printed row followed by a repeated one
                try:
                    pvalue1samp=ttest_1samp(plotData[x],0)[1]
                except Exception:
                    pvalue1samp="NA"
                print >> stdout, xtickLabels[x],mean(plotData[x]),pvalue1samp,median(plotData[x])
                if writeDataSummaryStat:
                    sumData,N,NIN,NBelow,NAbove=filterDataInRangeInclusive(plotData[x],summaryStatRange[0],summaryStatRange[1])

                    if NIN>1:
                        #print >> stderr,"sumData=",sumData
                        #print >> stderr,mean
                        mea=mean2(sumData)
                        DDOF=1
                        sd=std(sumData,ddof=DDOF)
                        var=sd*sd
                        mi=min(sumData)
                        ma=max(sumData)
                    else:
                        mea="NA"
                        sd="NA"
                        var="NA"
                        mi="NA"
                        ma="NA"

                    print >> fDSS,xtickLabels[x]+"\t"+str(mea)+"\t"+str(var)+"\t"+str(sd)+"\t"+str(mi)+"\t"+str(ma)+"\t"+str(N)+"\t"+str(NIN)+"\t"+str(float(NIN)*100/N)+"\t"+str(NBelow)+"\t"+str(float(NBelow)*100/N)+"\t"+str(NAbove)+"\t"+str(float(NAbove)*100/N)
        finally:
            if fDSS is not None:
                fDSS.close()

        print >> stdout,""
        print >> stdout,"student t-test (2 samples)"
        pvalueM=_printPairwiseTestTable(plotData,xtickLabels,lambda a,b: ttest_ind(a,b)[1],1.0,minuslog10pvalue)
        print >> stdout,""

        if plotPvalueCluster:
            makePValueRawPlot(outputClusterPrefix+"_t_raw",xtickLabels,pvalueM)
            makePValueClusterPlot(outputClusterPrefix+"_t",xtickLabels,pvalueM,methodCluster)

        print >> stdout,"welch t-test"
        pvalueM=_printPairwiseTestTable(plotData,xtickLabels,lambda a,b: welchs_approximate_ttest_arr(a,b)[3],1.0,minuslog10pvalue)

        if outXYZPvalues:
            writeXYZPvalues(outXYZPvalues+"_Welch.xyz",xtickLabels,pvalueM)

        if plotPvalueCluster:
            makePValueRawPlot(outputClusterPrefix+"_Welch_raw",xtickLabels,pvalueM)
            makePValueClusterPlot(outputClusterPrefix+"_Welch",xtickLabels,pvalueM,methodCluster)

        print >> stdout,""
        print >> stdout,"non-parametric (Mann-Whitney U)" #"non-parametric (Mann-Whitney U if larger n<=20 else Wilcoxon)"
        # the Mann-Whitney p-value is doubled (one tail to two tail), as in the original script
        # NOTE(review): this assumes mannwhitneyu() returns a one-sided p-value -- confirm for the scipy version in use
        pvalueM=_printPairwiseTestTable(plotData,xtickLabels,lambda a,b: mannwhitneyu(a,b)[1]*2,1.0,minuslog10pvalue)

        if outXYZPvalues:
            writeXYZPvalues(outXYZPvalues+"_U.xyz",xtickLabels,pvalueM)

        if plotPvalueCluster:
            makePValueRawPlot(outputClusterPrefix+"_U_raw",xtickLabels,pvalueM)
            makePValueClusterPlot(outputClusterPrefix+"_U",xtickLabels,pvalueM,methodCluster)

        #####now the variance tests
        # these four tables store "NA" (not 1.0) when the test raises

        print >> stdout,""
        print >> stdout,"Ansari-Bradley Two-sample Test for difference in scale parameters "
        pvalueM=_printPairwiseTestTable(plotData,xtickLabels,lambda a,b: ansari(a,b)[1],"NA",minuslog10pvalue)

        if plotPvalueCluster:
            makePValueRawPlot(outputClusterPrefix+"_Ansari_raw",xtickLabels,pvalueM)
            makePValueClusterPlot(outputClusterPrefix+"_Ansari",xtickLabels,pvalueM,methodCluster)

        print >> stdout,""
        print >> stdout,"Fligner's Two-sample Test for equal variance (non-parametrics)"
        pvalueM=_printPairwiseTestTable(plotData,xtickLabels,lambda a,b: fligner(a,b)[1],"NA",minuslog10pvalue)

        if plotPvalueCluster:
            makePValueRawPlot(outputClusterPrefix+"_fligner_raw",xtickLabels,pvalueM)
            makePValueClusterPlot(outputClusterPrefix+"_fligner",xtickLabels,pvalueM,methodCluster)

        print >> stdout,""
        print >> stdout,"Levene's Two-sample Test for equal variance"
        pvalueM=_printPairwiseTestTable(plotData,xtickLabels,lambda a,b: levene(a,b)[1],"NA",minuslog10pvalue)

        if plotPvalueCluster:
            makePValueRawPlot(outputClusterPrefix+"_levene_raw",xtickLabels,pvalueM)
            makePValueClusterPlot(outputClusterPrefix+"_levene",xtickLabels,pvalueM,methodCluster)

        print >> stdout,""
        print >> stdout,"Bartlett's Two-sample Test for equal variance (for normal distributions)"
        pvalueM=_printPairwiseTestTable(plotData,xtickLabels,lambda a,b: bartlett(a,b)[1],"NA",minuslog10pvalue)

        if plotPvalueCluster:
            makePValueRawPlot(outputClusterPrefix+"_bartlett_raw",xtickLabels,pvalueM)
            makePValueClusterPlot(outputClusterPrefix+"_bartlett",xtickLabels,pvalueM,methodCluster)

        #####

    figure(figsize=figsz)
    subplots_adjust(top=0.9, bottom=botta, left=0.2, right=0.8)

    if len(titl)==0:
        titl=outputFile

    plotExpBox(plotData,xtickLabels,showIndPoints,mark,markMean,showMean,notch,whisker,outliers,xlegendrotation,xlabe,ylabe,titl,showSampleSizes,showViolin,showBox,annot,trendData,showLegend,makePzfxFile,makeBinMatrix,dividePlots)

    #ylim([0,200])
    for m in medianToDraw:
        axhline(y=m,linestyle=':',color='gray')

    savefig(outputFile,bbox_inches="tight")

    if len(plotHistogramToFile)>0:
        drawHistogram(plotHistogramToFile,plotData,xtickLabels)
        drawDensigram(plotHistogramToFile+".density.png",plotData,xtickLabels)
# Fix the random seed so the simulation is reproducible (remove for a fresh draw each run)
np.random.seed(123)

# Simulation settings: relative effect size, starting sample size, control mean and sd
effect_size, sample_size, control_mean, control_sd = 0.1, 50, 1, 0.5
sims = 1000

# INSTRUCTIONS
# * For the time spent random variables, set the size such that it has shape sample_size x sims.
# * Calculate power as a fraction of p-values less than 0.05 (statistically significant).
# * If power is greater than or equal to 80%, break out of the while loop.
#   Else, keep incrementing sample_size by 10.

sample_size = 50

# Grow the sample size in steps of 10 until the simulated power reaches 80%
while True:
    # one column per simulated experiment, one row per subject
    control_time_spent = np.random.normal(control_mean, control_sd, (sample_size, sims))
    treatment_time_spent = np.random.normal(
        control_mean * (1 + effect_size), control_sd, (sample_size, sims))

    # ttest_ind works column-wise here, giving one p-value per simulation
    t, p = st.ttest_ind(treatment_time_spent, control_time_spent)

    # Power is the fraction of simulations whose p-value fell below 0.05
    power = (p < 0.05).sum() / sims
    if power >= 0.8:
        break
    sample_size += 10

print("For 80% power, sample size required = {}".format(sample_size))
def _printPairwisePvalueTable(plotData,xtickLabels,pvalueFunc):
    """Print a pairwise p-value table to stdout and return it as a full symmetric matrix.

    The header row ("p-val" followed by every label) and one row per sample are
    written to stdout. Only the cells above the diagonal are computed and
    printed; the diagonal and lower triangle are printed as empty cells.

    plotData    -- list of samples (each a list of numbers)
    xtickLabels -- label of each sample, parallel to plotData
    pvalueFunc  -- callable(sampleA, sampleB) returning the p-value of the pair;
                   if it raises, 1.0 is used for that pair

    Returns a list of rows where row[x][y] is the p-value of samples x and y
    (1.0 on the diagonal, lower triangle mirrored from the upper one).
    """
    numSamples=len(plotData)

    #header row
    print >> stdout,"p-val",
    for x in range(0,numSamples):
        print >> stdout,xtickLabels[x],
    print >> stdout,""

    pvalueM=[]
    for x in range(0,numSamples):
        pvalueRow=[]
        pvalueM.append(pvalueRow)
        print >> stdout, xtickLabels[x],
        for y in range(0,numSamples):
            if y<x:
                #already computed as (y,x) in an earlier row: print empty cell, mirror the value
                print >> stdout, "",
                pvalueRow.append(pvalueM[y][x])
            elif y==x:
                print >> stdout, "",
                pvalueRow.append(1.0)
            else:
                try:
                    pvalue=pvalueFunc(plotData[x],plotData[y])
                except Exception:
                    #a test that cannot be computed is reported as "no evidence"
                    pvalue=1.0
                print >> stdout, str(pvalue),
                pvalueRow.append(pvalue)
        print >> stdout,""

    return pvalueM


def plotExpBox_Main(inputFiles,headers,valcols,outputFile,sep,startRow,showIndPoints,mark,markMean,showMean,notch,whisker,outliers,plotPvalueCluster,outputClusterPrefix,methodCluster,xlegendrotation,xlabe,ylabe,figsz,titl,showSampleSizes,trimToMinSize,relabels,logb,plotHistogramToFile,plotMedianForGroups,botta):
    """Read numeric columns from delimited files, print significance tables and save a box plot.

    Each selected column of each input file becomes one sample. For the samples
    this prints to stdout a 1-sample t-test table (mean=0) followed by pairwise
    Student t-test, Welch t-test and Mann-Whitney U tables, then draws the box
    plot via plotExpBox() and saves it to outputFile.

    inputFiles, headers, valcols -- parallel sequences, one entry per file: the
        file to open with generic_istream(), its header (indexed by column to
        obtain the sample label) and the column indices to read
    sep          -- field separator used to split each line
    startRow     -- lines whose 1-based line number is below this are skipped
    logb         -- if non-zero each value v is replaced by log(v)/logb
    trimToMinSize-- if true, trimData(plotData,minSize) is applied after reading
    relabels     -- replacement labels applied positionally to the first
        len(relabels) samples
    plotMedianForGroups -- list of group specs; each entry is replaced IN PLACE
        by getCol0ListFromCol1ListStringAdv(xtickLabels,entry) and a dotted gray
        horizontal line is drawn at the median of the pooled samples of the group
    plotPvalueCluster, outputClusterPrefix, methodCluster -- if plotPvalueCluster
        is true, raw and clustered p-value plots are made for every pairwise
        table using these as output prefix and clustering method
    figsz, botta -- figure size and bottom margin of the plot
    titl         -- plot title; outputFile is used when empty
    plotHistogramToFile -- if non-empty, drawHistogram() is called with it
    showIndPoints, mark, markMean, showMean, notch, whisker, outliers,
    xlegendrotation, xlabe, ylabe, showSampleSizes -- passed through to plotExpBox()

    Values that cannot be parsed (or whose column is missing on a line) are
    silently skipped; samples left empty are reported on stderr and dropped.
    """

    #the real deal!
    plotData=[]
    xtickLabels=[]

    minSize=-1

    for inputFile,header,cols in zip(inputFiles,headers,valcols):
        fin=generic_istream(inputFile)

        startIdx=len(plotData)

        for col in cols:
            plotData.append([])
            xtickLabels.append(header[col])

        colIndices=range(startIdx,startIdx+len(cols))

        lino=0
        try:
            for lin in fin:
                lino+=1
                if lino<startRow:
                    continue
                fields=lin.rstrip("\r\n").split(sep)

                for idx,col in zip(colIndices,cols):
                    try:
                        value=float(fields[col])
                        if logb!=0:
                            value=log(value)/logb
                        #presumably guards against -inf-like results of the log transform -- TODO confirm
                        if value<-100000:
                            raise ValueError
                    except Exception:
                        #best effort: non-numeric, missing or out-of-range cells are skipped
                        continue
                    plotData[idx].append(value)
        finally:
            #close the stream even if reading fails half way
            fin.close()

        #The size of the LAST column of this file is what is tracked (same as
        #before), but it is looked up explicitly instead of relying on the loop
        #variable idx, which was undefined (or stale from a previous file) when
        #the file produced no data rows.
        if len(cols)>0:
            lastColSize=len(plotData[colIndices[-1]])
            if minSize==-1:
                minSize=lastColSize
            else:
                minSize=min([minSize,lastColSize])

    if trimToMinSize:
        print >> stderr,"trimming to min size =",minSize
        trimData(plotData,minSize)

    #relabels may be shorter than the label vector: only the first len(relabels) are replaced
    for i,relabel in enumerate(relabels):
        xtickLabels[i]=relabel

    #translate each median group spec into sample indices (done in place, callers see the change)
    for i in range(0,len(plotMedianForGroups)):
        plotMedianForGroups[i]=getCol0ListFromCol1ListStringAdv(xtickLabels,plotMedianForGroups[i])

    #drawing medians: computed BEFORE empty samples are discarded so that the indices still match
    medianToDraw=[]
    for mediangrouper in plotMedianForGroups:
        curD=[]
        for c in mediangrouper:
            curD.extend(plotData[c])
        medianToDraw.append(median(curD))

    #drop empty samples, going backwards so that deletions do not shift pending indices
    for c in range(len(plotData)-1,-1,-1):
        if len(plotData[c])==0:
            print >> stderr,xtickLabels[c],"discarded"
            del plotData[c]
            del xtickLabels[c]

    print >> stdout,"student t-test (1 sample; mean=0)"
    print >> stdout,"sample","mean","p-val"
    for x in range(0,len(plotData)):
        #compute first, print afterwards: a print statement writes its items one
        #by one, so a failure in the test used to leave a half-written row
        try:
            sampleMean=mean(plotData[x])
            sampleP=ttest_1samp(plotData[x],0)[1]
        except Exception:
            sampleMean="NA"
            sampleP="NA"
        print >> stdout, xtickLabels[x],sampleMean,sampleP

    print >> stdout,""
    print >> stdout,"student t-test (2 samples)"
    pvalueM=_printPairwisePvalueTable(plotData,xtickLabels,lambda a,b: ttest_ind(a,b)[1])
    print >> stdout,""

    if plotPvalueCluster:
        makePValueRawPlot(outputClusterPrefix+"_t_raw",xtickLabels,pvalueM)
        makePValueClusterPlot(outputClusterPrefix+"_t",xtickLabels,pvalueM,methodCluster)

    print >> stdout,"welch t-test"
    #a failing pair now falls back to 1.0 like in the other tables instead of aborting the report
    pvalueM=_printPairwisePvalueTable(plotData,xtickLabels,lambda a,b: welchs_approximate_ttest_arr(a,b)[3])

    if plotPvalueCluster:
        makePValueRawPlot(outputClusterPrefix+"_Welch_raw",xtickLabels,pvalueM)
        makePValueClusterPlot(outputClusterPrefix+"_Welch",xtickLabels,pvalueM,methodCluster)

    print >> stdout,""
    print >> stdout,"non-parametric (Mann-Whitney U)"
    #mann-whitney p-value is multiplied by 2 (one tail to two tail)
    #NOTE(review): this assumes mannwhitneyu returns a one-sided p-value -- confirm for the scipy version in use
    pvalueM=_printPairwisePvalueTable(plotData,xtickLabels,lambda a,b: mannwhitneyu(a,b)[1]*2)

    if plotPvalueCluster:
        makePValueRawPlot(outputClusterPrefix+"_U_raw",xtickLabels,pvalueM)
        makePValueClusterPlot(outputClusterPrefix+"_U",xtickLabels,pvalueM,methodCluster)

    figure(figsize=figsz)
    subplots_adjust(top=0.9, bottom=botta, left=0.2, right=0.8)

    if len(titl)==0:
        titl=outputFile

    plotExpBox(plotData,xtickLabels,showIndPoints,mark,markMean,showMean,notch,whisker,outliers,xlegendrotation,xlabe,ylabe,titl,showSampleSizes)

    #ylim([0,200])
    for m in medianToDraw:
        axhline(y=m,linestyle=':',color='gray')

    savefig(outputFile,bbox_inches="tight")

    if len(plotHistogramToFile)>0:
        drawHistogram(plotHistogramToFile,plotData,xtickLabels)
# - Gene symbol map
# Keep only the rows whose first ';'-separated UniProt accession is present in
# umap. The explicit .copy() makes the column assignment below write to an
# independent frame rather than to a slice of the original one.
proteomics = proteomics[[i.split(';')[0] in umap for i in proteomics['uniprot']]].copy()
proteomics['genesymbol'] = [umap[i.split(';')[0]] for i in proteomics['uniprot']]

# - Log fold-change
# Rows sharing a gene symbol are averaged into one row per gene
# (values are presumably on a log scale -- TODO confirm against the loader).
proteomics = proteomics.groupby('genesymbol').mean()

# - Differential protein abundance
# One two-sample t-test per gene between the ko and wt columns. `.loc` replaces
# the removed `.ix` indexer; this assumes ko and wt hold column labels.
de_proteomics = {}
for i in proteomics.index:
    ko_values = proteomics.loc[i, ko]
    wt_values = proteomics.loc[i, wt]

    t, p = ttest_ind(ko_values, wt_values)

    de_proteomics[i] = {
        'fc': ko_values.mean() - wt_values.mean(),
        't': t,
        'pval': p
    }

# Genes as rows; genes with any missing statistic are dropped
de_proteomics = DataFrame(de_proteomics).T.dropna()

# - FDR correction
de_proteomics['fdr'] = multipletests(de_proteomics['pval'], method='fdr_bh')[1]

# - Export protein level proteomics
de_proteomics.to_csv('./data/uok262_proteomics_labelfree_processed_fc.csv')
# de_proteomics = read_csv('./data/uok262_proteomics_labelfree_processed_fc.csv', index_col=0)

# Call form prints the same text under both Python 2 and Python 3
print(de_proteomics.sort_values('fdr'))
).reset_index(name='Average value (Treatment villages)'), progresa_df_edited[progresa_df_edited.progresa == '0'].mean(). reset_index(name='Average value (Control villages)'), on=['index'])) #Creating empty lists to append t value, p value and statistical significancy t_value = [] p_value = [] stats_significant = [] #Iterating over the df to calculate t, p value and statistical significancy for i in list(progresa_treatment_control['index']): t_value.append( stats.ttest_ind( list( progresa_df_edited[progresa_df_edited.progresa == 'basal'][i]), list(progresa_df_edited[progresa_df_edited.progresa == '0'][i]), nan_policy='omit').statistic) p_value.append( stats.ttest_ind( progresa_df_edited[progresa_df_edited.progresa == 'basal'][i], progresa_df_edited[progresa_df_edited.progresa == '0'][i], nan_policy='omit').pvalue) if stats.ttest_ind( progresa_df_edited[progresa_df_edited.progresa == 'basal'][i], progresa_df_edited[progresa_df_edited.progresa == '0'][i], nan_policy='omit').pvalue < 0.05: stats_significant.append('TRUE') else: stats_significant.append('FALSE')
# Section banner for the MOM heuristic report
rule = "------------------------------------------"
print()
print(rule)
print("------ STATISTICS FOR MOM HEURISTIC ------")
print(rule)
print()

difficulty = ["Simple", "Easy", "Intermediate", "Expert"]

# statistics: for every difficulty level, compare the baseline splits against
# each strategy's splits with an independent two-sample t-test
for d, level in enumerate(difficulty):
    print()
    print("Testing for difficulty: " + level)

    comparisons = (
        ('T-test for baseline vs. naked-pairs:', pairs_splits),
        ('T-test for baseline vs. naked-triples:', triple_splits),
        ('T-test for baseline vs. x-wing:', x_splits),
        ('T-test for baseline vs. all:', all_splits),
    )
    for heading, strategy_splits in comparisons:
        print(heading)
        x = stats.ttest_ind(normal_splits[d], strategy_splits[d])
        print(x)