def sliding_MWU(values):
    """
    RUN A TWO-SIDED MANN-WHITNEY U TEST ON A WINDOW OF 2 * weight_radius
    POINTS FOR EVERY POSITION IN values

    EACH WINDOW IS RANKED, THE RANKS ARE CENTERED AND MULTIPLIED BY
    median_weight, THEN THE FIRST weight_radius WEIGHTED RANKS ARE TESTED
    AGAINST THE LAST weight_radius WEIGHTED RANKS

    :param values: SEQUENCE OF NUMBERS
    :return: ARRAY WITH ONE stats.mannwhitneyu RESULT PER ELEMENT OF values
    """
    # PAD BOTH ENDS WITH MEDIANS OF THE NEARBY VALUES SO EVERY POSITION HAS A FULL WINDOW
    head_padding = []
    for i in range(weight_radius):
        head_padding.append(np.median(values[: i + weight_radius]))
    tail_padding = []
    for i in reversed(range(weight_radius)):
        tail_padding.append(np.median(values[-i - weight_radius:]))
    padded = np.array(head_padding + list(values) + tail_padding)

    # OVERLAPPING WINDOWS: BOTH AXES ADVANCE BY ONE ITEM, NO DATA IS COPIED
    step = padded.itemsize
    windows = as_strided(
        padded, shape=(len(values), 2 * weight_radius), strides=(step, step)
    )

    center = (len(median_weight) + 1) / 2
    results = []
    for segment in windows:
        weighted = (rankdata(segment) - center) * median_weight
        results.append(
            stats.mannwhitneyu(
                weighted[:weight_radius],
                weighted[-weight_radius:],
                use_continuity=True,
                alternative="two-sided",
            )
        )
    return np.array(results)
def jitter_MWU(values, start, mid, end):
    """
    LOOK FOR A BETTER MIDPOINT NEAR mid, USING BOTH THE MANN-WHITNEY U TEST
    AND WELCH'S t-test ON THE VALUES EITHER SIDE OF EACH CANDIDATE

    :param values: SEQUENCE OF NUMBERS
    :param start: FIRST INDEX OF THE RANGE BEING SPLIT
    :param mid: CURRENT BEST GUESS AT THE SPLIT POINT
    :param end: INDEX ONE PAST THE RANGE BEING SPLIT
    :return: (MWU RESULT, t-test RESULT, CHOSEN MIDPOINT)
    """
    # CANDIDATES STAY WITHIN JITTER OF mid AND AT LEAST MIN_POINTS FROM EITHER END
    lowest = min(mid, max(start + MIN_POINTS, mid - JITTER))
    highest = max(mid, min(mid + JITTER, end - MIN_POINTS))
    if lowest == highest:
        return no_good_edge, no_good_edge, mid
    candidates = np.array(range(lowest, highest))

    def halves(m):
        # AT MOST MAX_POINTS ON EACH SIDE OF THE CANDIDATE m
        return values[max(start, m - MAX_POINTS):m], values[m:min(end, m + MAX_POINTS)]

    try:
        # ALL MWU SCORES FIRST, THEN ALL t-test SCORES
        mwu = []
        for m in candidates:
            before, after = halves(m)
            mwu.append(
                stats.mannwhitneyu(
                    before, after, use_continuity=True, alternative="two-sided"
                )
            )
        mwu = np.array(mwu)

        ttests = []
        for m in candidates:
            before, after = halves(m)
            ttests.append(stats.ttest_ind(before, after, equal_var=False))
        ttests = np.array(ttests)
    except Exception as cause:
        cause = Except.wrap(cause)
        if "All numbers are identical" in cause:
            # NOTHING TO DISTINGUISH THE TWO SIDES
            return no_good_edge, no_good_edge, candidates[0]
        raise cause

    # NOTE: AN EARLIER SUM-OF-SQUARES SCORE (cumSS BASED) WAS DISABLED HERE;
    # ITS PURPOSE WAS RECORDED AS UNKNOWN

    # GEOMETRIC MEAN OF THE TWO P-VALUES, LOWEST WINS
    combined = np.sqrt(mwu[:, 1] * ttests[:, 1])
    winner = np.argmin(combined)
    return (
        Data(pvalue=mwu[winner, 1]),
        Data(pvalue=ttests[winner, 1]),
        candidates[winner],
    )
def square_vs_diag(codon_permutation_f, outdir):
    """Replicate the analysis presented in Fig. 5B and save its box plot.

    Loads the permutation results with ``Utils.Load``, skips the first entry,
    splits the remaining ``'n+_risk'`` values into "square", "neither" and
    "diag" codes, prints the group sizes and the pairwise Mann-Whitney U
    results, and writes ``Squares_diags.png`` into *outdir*.
    """
    loaded = Utils.Load(codon_permutation_f)
    risks = np.array(loaded['n+_risk'])
    codes = loaded['code']

    # The first entry is excluded from every group.
    tail_codes = codes[1:]
    is_square = np.array([issquare(code) for code in tail_codes])
    is_diag = np.array([isdiag(code) for code in tail_codes])
    print('n diags: {}'.format(sum(is_diag)))
    print('n squares: {}'.format(sum(is_square)))

    _, ax = plt.subplots(1, figsize=(4.7, 5.2), dpi=144)

    tail_risks = risks[1:]
    grps = [
        tail_risks[is_square],
        tail_risks[~is_square & ~is_diag],
        tail_risks[is_diag],
    ]

    thin_black = {'color': 'k', 'lw': 0.6}
    ax.boxplot(
        grps,
        showfliers=False,
        whis=(5, 95),
        flierprops={'color': 'k', 'marker': 'x', 'markersize': 2},
        boxprops=dict(thin_black),
        capprops=dict(thin_black),
        whiskerprops=dict(thin_black),
        # NOTE(review): the median colour is an empty string in the source;
        # it looks like a lost colour code - confirm the intended value.
        medianprops={'color': '', 'lw': 1.2},
    )
    ax.set_ylim(0.15, 0.31)
    ax.set_yticks([0.15, 0.2, 0.25, 0.3])

    comparisons = (
        ('squares vs all: {}', 0, 1),
        ('squares vs diags: {}', 0, 2),
        ('diags vs all: {}', 2, 1),
    )
    for template, first, second in comparisons:
        print(template.format(mannwhitneyu(grps[first], grps[second])))

    plt.savefig(join(outdir, 'Squares_diags.png'), dpi=144)
def jitter_MWU(values, start, mid, end):
    """
    LOOK FOR A BETTER MIDPOINT NEAR mid BY COMBINING THE MANN-WHITNEY U
    P-VALUE WITH A SUM-OF-SQUARES SCORE FOR EACH CANDIDATE

    :param values: SEQUENCE OF NUMBERS
    :param start: FIRST INDEX OF THE RANGE BEING SPLIT
    :param mid: CURRENT BEST GUESS AT THE SPLIT POINT
    :param end: INDEX ONE PAST THE RANGE BEING SPLIT
    :return: (MWU RESULT, t-test RESULT, CHOSEN MIDPOINT)
    """
    # CANDIDATES STAY WITHIN JITTER OF mid AND AT LEAST MIN_POINTS FROM EITHER END
    lowest = min(mid, max(start + MIN_POINTS, mid - JITTER))
    highest = max(mid, min(mid + JITTER, end - MIN_POINTS))
    if lowest == highest:
        return no_good_edge, no_good_edge, mid
    candidates = np.array(range(lowest, highest))

    def halves(m):
        # AT MOST MAX_POINTS ON EACH SIDE OF THE CANDIDATE m
        return values[max(start, m - MAX_POINTS):m], values[m:min(end, m + MAX_POINTS)]

    # ALL MWU SCORES FIRST, THEN ALL t-test SCORES
    mwu = []
    for m in candidates:
        before, after = halves(m)
        mwu.append(
            stats.mannwhitneyu(
                before, after, use_continuity=True, alternative="two-sided"
            )
        )
    mwu = np.array(mwu)

    ttests = []
    for m in candidates:
        before, after = halves(m)
        ttests.append(stats.ttest_ind(before, after, equal_var=False))
    ttests = np.array(ttests)

    # TOTAL SUM-OF-SQUARES (cumSS IS DEFINED ELSEWHERE; PRESUMABLY A
    # CUMULATIVE SUM-OF-SQUARES - TODO CONFIRM)
    left_ss = cumSS(values[start:highest])
    if lowest != start:
        # OFFSET BY ONE, WE WANT cumSS OF ALL **PREVIOUS** VALUES
        ss_before = not_right(not_left(left_ss, lowest - start - 1), 1)
    else:
        # CAN NOT OFFSET BY ONE, SO A DUMMY VALUE GOES FIRST
        ss_before = np.array([np.nan] + list(not_right(left_ss, 1)))
    ss_after = not_right(cumSS(values[lowest:end][::-1])[::-1], end - highest)
    spread = ss_before + ss_after

    # GEOMETRIC MEAN OF THE SCORES, LOWEST WINS
    combined = np.sqrt(mwu[:, 1] * spread)
    winner = np.argmin(combined)
    return (
        Data(pvalue=mwu[winner, 1]),
        Data(pvalue=ttests[winner, 1]),
        candidates[winner],
    )
def mannwhitneyu_test(approaches, accuracy_values, save_path):
    """Plot a heatmap of pairwise Mann-Whitney p-values between approaches.

    For every pair ``(approaches[i], approaches[j])`` with ``j >= i`` the
    matching columns of *accuracy_values* are compared with
    ``stats.mannwhitneyu`` and the p-value is stored at
    ``[approaches[i], approaches[j]]``; cells below the diagonal are never
    assigned.  The heatmap is written to ``save_path / 'mannwhitneyu.png'``
    (the directory is created if needed).
    """
    p_values = pd.DataFrame()
    n_approaches = len(approaches)
    for row in range(n_approaches):
        row_name = approaches[row]
        for col in range(row, n_approaches):
            col_name = approaches[col]
            result = stats.mannwhitneyu(
                accuracy_values.loc[:, row_name],
                accuracy_values.loc[:, col_name],
            )
            # result is (statistic, p-value); only the p-value is kept
            p_values.at[row_name, col_name] = result[1]

    save_path.mkdir(parents=True, exist_ok=True)

    fig = plt.figure(figsize=(4, 2))
    sns.heatmap(
        p_values,
        ax=fig.subplots(),
        annot=True,
        fmt="0.3f",
        cmap="autumn",
        vmin=0,
        vmax=0.05,
    )
    plt.xticks(rotation=45)
    # Runs the canvas event loop for the smallest positive float timeout
    # NOTE(review): presumably to force a redraw before saving - confirm.
    fig.canvas.start_event_loop(sys.float_info.min)

    fig.savefig(save_path / 'mannwhitneyu.png', bbox_inches='tight', dpi=100)
    plt.close(fig)
def sliding_MWU(values):
    """
    RUN A TWO-SIDED MANN-WHITNEY U TEST ON A WINDOW OF 2 * weight_radius
    POINTS FOR EVERY POSITION IN values

    EACH WINDOW IS RANKED, THE RANKS ARE CENTERED AND MULTIPLIED BY
    median_weight, THEN THE FIRST weight_radius WEIGHTED RANKS ARE TESTED
    AGAINST THE LAST weight_radius WEIGHTED RANKS

    :param values: SEQUENCE OF NUMBERS
    :return: ARRAY WITH ONE stats.mannwhitneyu RESULT PER ELEMENT OF values;
             ALL ONES WHEN THE TEST REPORTS "All numbers are identical"
    """
    # PAD BOTH ENDS WITH MEDIANS OF THE NEARBY VALUES SO EVERY POSITION HAS A FULL WINDOW
    head_padding = []
    for i in range(weight_radius):
        head_padding.append(np.median(values[:i + weight_radius]))
    tail_padding = []
    for i in reversed(range(weight_radius)):
        tail_padding.append(np.median(values[-i - weight_radius:]))
    padded = np.array(head_padding + list(values) + tail_padding)

    # OVERLAPPING WINDOWS: BOTH AXES ADVANCE BY ONE ITEM, NO DATA IS COPIED
    step = padded.itemsize
    windows = as_strided(
        padded, shape=(len(values), 2 * weight_radius), strides=(step, step)
    )

    center = (len(median_weight) + 1) / 2
    try:
        results = []
        for segment in windows:
            weighted = (rankdata(segment) - center) * median_weight
            results.append(
                stats.mannwhitneyu(
                    weighted[:weight_radius],
                    weighted[-weight_radius:],
                    use_continuity=True,
                    alternative="two-sided",
                )
            )
        return np.array(results)
    except Exception as cause:
        cause = Except.wrap(cause)
        if "All numbers are identical" not in cause:
            raise cause
        # NO DIFFERENCE ANYWHERE: REPORT 1 FOR EVERY WINDOW
        return np.ones((windows.shape[0], 2))
def plotExpBox_Main(inputFiles,headers,valcols,outputFile,sep,startRow,showIndPoints,mark,markMean,showMean,notch,whisker,outliers,plotPvalueCluster,outputClusterPrefix,methodCluster,xlegendrotation,xlabe,ylabe,figsz,titl,showSampleSizes,trimToMinSize,relabels,logb,plotHistogramToFile,plotMedianForGroups,botta,showViolin,showBox,firstColAnnot,plotTrend,showLegend,makePzfxFile,makeBinMatrix,writeDataSummaryStat,summaryStatRange,minuslog10pvalue,minNDataToKeep,vfacecolor,valpha,outXYZPvalues,dividePlots):
    """Load numeric columns from delimited files, print statistics and save a box plot.

    Reading: each entry of inputFiles is opened with generic_istream; the
    matching entry of valcols lists the column indices to parse as floats and
    the matching entry of headers supplies their labels.  Lines before
    startRow are skipped and fields that cannot be parsed are dropped.  When
    logb is non-zero every value becomes log(value)/logb and exact zeros are
    dropped.  When firstColAnnot is true the first index of each valcols entry
    is read as an annotation column instead of data.  When plotTrend is true
    the values of every line are also collected per file (None for a line
    where any column failed).

    Post-processing: trimToMinSize passes the smallest column length to
    trimData; relabels overwrites the first len(relabels) labels; every entry
    of plotMedianForGroups is converted in place to column indices and the
    median of the pooled data is later drawn as a dotted line; columns with
    fewer than minNDataToKeep points are discarded.

    Statistics (skipped when the module-level flag skipStat is true): a
    one-sample t-test per column, an optional summary table written to the
    path writeDataSummaryStat for values within summaryStatRange, and pairwise
    tables for Student t, Welch t, Mann-Whitney U, Ansari-Bradley, Fligner,
    Levene and Bartlett tests.  With minuslog10pvalue the tables hold
    -log10(p).  plotPvalueCluster triggers makePValueRawPlot and
    makePValueClusterPlot for every table; outXYZPvalues is a file prefix for
    writeXYZPvalues (Welch and U tables only).

    Finally the box plot is drawn with plotExpBox and saved to outputFile; a
    non-empty plotHistogramToFile additionally triggers drawHistogram and
    drawDensigram.  vfacecolor and valpha are accepted but not used here.
    """

    def printPairwisePvalueTable(title,pairPvalue,failValue):
        #print one symmetric p-value table to stdout and return it as a list of rows
        #pairPvalue(a,b) computes one p-value; failValue is stored when it raises
        if minuslog10pvalue:
            diagValue=0.0
        else:
            diagValue=1.0
        print >> stdout,title
        print >> stdout,"p-val",
        for label in xtickLabels:
            print >> stdout,label,
        print >> stdout,""
        pvalueM=[]
        for x in range(0,len(plotData)):
            pvalueRow=[]
            pvalueM.append(pvalueRow)
            print >> stdout, xtickLabels[x],
            for y in range(0,len(plotData)):
                if y<=x:
                    #lower triangle is printed blank and mirrored from the rows already computed
                    print >> stdout, "",
                    if x==y:
                        pvalueRow.append(diagValue)
                    else:
                        pvalueRow.append(pvalueM[y][x])
                else:
                    try:
                        pvalue=pairPvalue(plotData[x],plotData[y])
                    except Exception:
                        pvalue=failValue
                    if minuslog10pvalue and str(pvalue)!="NA":
                        try:
                            pvalue=-1*log(pvalue,10)
                        except Exception:
                            pvalue=-1000.0
                    print >> stdout, str(pvalue),
                    pvalueRow.append(pvalue)
            print >> stdout,""
        return pvalueM

    def plotPvalueMatrices(tag,pvalueM):
        #raw and clustered plots of one p-value table, only when requested
        if plotPvalueCluster:
            makePValueRawPlot(outputClusterPrefix+tag+"_raw",xtickLabels,pvalueM)
            makePValueClusterPlot(outputClusterPrefix+tag,xtickLabels,pvalueM,methodCluster)

    def percentOf(count,total):
        #percentage of total; "NA" instead of dividing by zero
        if total==0:
            return "NA"
        return float(count)*100/total

    #the real deal!
    plotData=[]
    xtickLabels=[]
    trendData={}
    annot={}
    minSize=-1

    for inputFile,header,cols in zip(inputFiles,headers,valcols):
        fin=generic_istream(inputFile)
        try:
            startIdx=len(plotData)

            if firstColAnnot:
                colAnnot=cols[0]
                cols=cols[1:]
                annotThisFile=[]
                annot[startIdx]=annotThisFile
            else:
                colAnnot=-1
                annotThisFile=None

            for col in cols:
                plotData.append([])
                xtickLabels.append(header[col])

            colIndices=range(startIdx,startIdx+len(cols))

            if plotTrend:
                trendDataThisFile=[]
                trendData[startIdx]=trendDataThisFile
            else:
                trendDataThisFile=None

            lino=0
            for lin in fin:
                lino+=1
                if lino<startRow:
                    continue
                fields=lin.rstrip("\r\n").split(sep)
                if plotTrend:
                    trendDataThisLine=[]
                else:
                    trendDataThisLine=None

                allDataOKThisLine=True

                if colAnnot>=0:
                    annotThisFile.append(fields[colAnnot])

                for idx,col in zip(colIndices,cols):
                    try:
                        value=float(fields[col])
                        if logb!=0:
                            if value==0.0:
                                raise ValueError
                            value=log(value)/logb
                        plotData[idx].append(value)
                        if plotTrend:
                            trendDataThisLine.append(value)
                    except Exception:
                        #unparsable or missing field: drop it and flag the line
                        allDataOKThisLine=False

                if plotTrend:
                    if allDataOKThisLine:
                        trendDataThisFile.append(trendDataThisLine)
                    else:
                        trendDataThisFile.append(None)
        finally:
            fin.close()

        #smallest column over every column of this file (not just the last one parsed)
        for colIdx in colIndices:
            nData=len(plotData[colIdx])
            if minSize==-1 or nData<minSize:
                minSize=nData

    if trimToMinSize:
        print >> stderr,"trimming to min size =",minSize
        trimData(plotData,minSize)

    if len(relabels)>0:
        print >> stderr,xtickLabels
        print >> stderr,relabels
        for i,relabel in enumerate(relabels):
            xtickLabels[i]=relabel

    for i in range(0,len(plotMedianForGroups)):
        plotMedianForGroups[i]=getCol0ListFromCol1ListStringAdv(xtickLabels,plotMedianForGroups[i])

    #drawing medians (computed before any column is discarded below)
    medianToDraw=[]
    for mediangrouper in plotMedianForGroups:
        curD=[]
        for c in mediangrouper:
            curD.extend(plotData[c])
        medianToDraw.append(median(curD))

    #walk backwards so deletions do not shift the indices still to visit
    for c in range(len(plotData)-1,-1,-1):
        if len(plotData[c])<minNDataToKeep:
            print >> stderr,xtickLabels[c],"discarded because has only",len(plotData[c]),"data points <",minNDataToKeep
            del plotData[c]
            del xtickLabels[c]

    #NOTE(review): skipStat is not a parameter; it is read from module scope.
    #The extent of this guard was reconstructed from flattened source - confirm.
    if not skipStat:
        print >> stdout,"student t-test (1 sample; mean=0)"
        print >> stdout,"sample","mean","p-val","median"

        fDSS=None
        if writeDataSummaryStat:
            fDSS=open(writeDataSummaryStat,"w")
        try:
            if fDSS is not None:
                print >> fDSS,"sample\tmean\tvar\tsd\tmin\tmax\tN\tNInRange["+str(summaryStatRange[0])+","+str(summaryStatRange[1])+"]\t%NInRange\tNbelowRange\t%Nbelow\tNAboveRange\t%NAbove"

            for x in range(0,len(plotData)):
                #compute the p-value before printing so a failure cannot leave a half-written line
                try:
                    onePvalue=ttest_1samp(plotData[x],0)[1]
                except Exception:
                    onePvalue="NA"
                print >> stdout, xtickLabels[x],mean(plotData[x]),onePvalue,median(plotData[x])

                if fDSS is not None:
                    sumData,N,NIN,NBelow,NAbove=filterDataInRangeInclusive(plotData[x],summaryStatRange[0],summaryStatRange[1])
                    if NIN>1:
                        mea=mean2(sumData)
                        sd=std(sumData,ddof=1)
                        var=sd*sd
                        mi=min(sumData)
                        ma=max(sumData)
                    else:
                        mea=sd=var=mi=ma="NA"
                    summaryFields=[mea,var,sd,mi,ma,N,NIN,percentOf(NIN,N),NBelow,percentOf(NBelow,N),NAbove,percentOf(NAbove,N)]
                    print >> fDSS,xtickLabels[x]+"\t"+"\t".join([str(v) for v in summaryFields])
        finally:
            if fDSS is not None:
                fDSS.close()

        print >> stdout,""
        pvalueM=printPairwisePvalueTable("student t-test (2 samples)",lambda a,b: ttest_ind(a,b)[1],1.0)
        print >> stdout,""
        plotPvalueMatrices("_t",pvalueM)

        pvalueM=printPairwisePvalueTable("welch t-test",lambda a,b: welchs_approximate_ttest_arr(a,b)[3],1.0)
        if outXYZPvalues:
            writeXYZPvalues(outXYZPvalues+"_Welch.xyz",xtickLabels,pvalueM)
        plotPvalueMatrices("_Welch",pvalueM)

        print >> stdout,""
        #mann-whitney p-value is multiplied by 2 (one tail to two tail)
        pvalueM=printPairwisePvalueTable("non-parametric (Mann-Whitney U)",lambda a,b: mannwhitneyu(a,b)[1]*2,1.0)
        if outXYZPvalues:
            writeXYZPvalues(outXYZPvalues+"_U.xyz",xtickLabels,pvalueM)
        plotPvalueMatrices("_U",pvalueM)

        #####now the variance tests; a failed test is recorded as "NA"
        #lambdas defer the name lookup so a missing test function only yields "NA"
        varianceTests=[
            ("Ansari-Bradley Two-sample Test for difference in scale parameters ",lambda a,b: ansari(a,b)[1],"_Ansari"),
            ("Fligner's Two-sample Test for equal variance (non-parametrics)",lambda a,b: fligner(a,b)[1],"_fligner"),
            ("Levene's Two-sample Test for equal variance",lambda a,b: levene(a,b)[1],"_levene"),
            ("Bartlett's Two-sample Test for equal variance (for normal distributions)",lambda a,b: bartlett(a,b)[1],"_bartlett"),
        ]
        for testTitle,pairPvalue,tag in varianceTests:
            print >> stdout,""
            pvalueM=printPairwisePvalueTable(testTitle,pairPvalue,"NA")
            plotPvalueMatrices(tag,pvalueM)

    figure(figsize=figsz)
    subplots_adjust(top=0.9, bottom=botta, left=0.2, right=0.8)

    if len(titl)==0:
        titl=outputFile

    plotExpBox(plotData,xtickLabels,showIndPoints,mark,markMean,showMean,notch,whisker,outliers,xlegendrotation,xlabe,ylabe,titl,showSampleSizes,showViolin,showBox,annot,trendData,showLegend,makePzfxFile,makeBinMatrix,dividePlots)

    #ylim([0,200])
    for m in medianToDraw:
        axhline(y=m,linestyle=':',color='gray')

    savefig(outputFile,bbox_inches="tight")

    if len(plotHistogramToFile)>0:
        drawHistogram(plotHistogramToFile,plotData,xtickLabels)
        drawDensigram(plotHistogramToFile+".density.png",plotData,xtickLabels)
def plotExpBox_Main(inputFile,header,cols,outputFile,sep,startRow,showIndPoints,mark,markMean,showMean,notch,whisker,outliers,plotPvalueCluster,outputClusterPrefix,methodCluster,xlegendrotation,xlabe,ylabe,figsz,titl,showSampleSizes):
    """Load numeric columns from one delimited file, print statistics and save a box plot.

    inputFile is opened with generic_istream; cols lists the column indices to
    parse as floats and header supplies their labels.  Lines before startRow
    are skipped and fields that cannot be parsed are dropped.

    Printed to stdout: a one-sample t-test per column, then pairwise tables
    for Student t, Welch t and Mann-Whitney U tests.  A raw p-value plot is
    always written for every table (makePValueRawPlot, prefix
    outputClusterPrefix); the clustered plot is added when plotPvalueCluster
    is true.  A failing test is not caught and propagates to the caller.

    Finally the box plot is drawn with plotExpBox and saved to outputFile.
    """

    def printPairwisePvalueTable(title,pairPvalue):
        #print one symmetric p-value table to stdout and return it as a list of rows
        print >> stdout,title
        print >> stdout,"p-val",
        for label in xtickLabels:
            print >> stdout,label,
        print >> stdout,""
        pvalueM=[]
        for x in range(0,len(plotData)):
            pvalueRow=[]
            pvalueM.append(pvalueRow)
            print >> stdout, xtickLabels[x],
            for y in range(0,len(plotData)):
                if y<=x:
                    #lower triangle is printed blank and mirrored from the rows already computed
                    print >> stdout, "",
                    if x==y:
                        pvalueRow.append(1.0)
                    else:
                        pvalueRow.append(pvalueM[y][x])
                else:
                    pvalue=pairPvalue(plotData[x],plotData[y])
                    print >> stdout, str(pvalue),
                    pvalueRow.append(pvalue)
            print >> stdout,""
        return pvalueM

    def plotPvalueMatrices(tag,pvalueM):
        #raw plot always, clustered plot only when requested
        makePValueRawPlot(outputClusterPrefix+tag+"_raw",xtickLabels,pvalueM)
        if plotPvalueCluster:
            makePValueClusterPlot(outputClusterPrefix+tag,xtickLabels,pvalueM,methodCluster)

    #the real deal!
    plotData=[]
    xtickLabels=[]
    for col in cols:
        plotData.append([])
        xtickLabels.append(header[col])

    fin=generic_istream(inputFile)
    try:
        lino=0
        for lin in fin:
            lino+=1
            if lino<startRow:
                continue
            fields=lin.rstrip("\r\n").split(sep)
            for idx,col in enumerate(cols):
                try:
                    value=float(fields[col])
                except Exception:
                    #unparsable or missing field: leave it out of the column
                    continue
                plotData[idx].append(value)
    finally:
        fin.close()

    print >> stdout,"student t-test (1 sample; mean=0)"
    print >> stdout,"sample","mean","p-val"
    for x in range(0,len(plotData)):
        print >> stdout, xtickLabels[x],mean(plotData[x]),ttest_1samp(plotData[x],0)[1]

    print >> stdout,""
    pvalueM=printPairwisePvalueTable("student t-test (2 samples)",lambda a,b: ttest_ind(a,b)[1])
    print >> stdout,""
    plotPvalueMatrices("_t",pvalueM)

    pvalueM=printPairwisePvalueTable("welch t-test",lambda a,b: welchs_approximate_ttest_arr(a,b)[3])
    plotPvalueMatrices("_Welch",pvalueM)

    print >> stdout,""
    #mann-whitney p-value is multiplied by 2 (one tail to two tail)
    pvalueM=printPairwisePvalueTable("non-parametric (Mann-Whitney U)",lambda a,b: mannwhitneyu(a,b)[1]*2)
    plotPvalueMatrices("_U",pvalueM)

    figure(figsize=figsz)
    if len(titl)==0:
        titl=outputFile

    plotExpBox(plotData,xtickLabels,showIndPoints,mark,markMean,showMean,notch,whisker,outliers,xlegendrotation,xlabe,ylabe,titl,showSampleSizes)

    #ylim([0,200])
    savefig(outputFile,bbox_inches="tight")
def _mwu_auc(positive_scores, negative_scores):
    """Return ``1 - U / (n_pos * n_neg)`` for two score samples, or 0.5 when undefined."""
    n_pos = len(positive_scores)
    n_neg = len(negative_scores)
    if n_pos == 0 or n_neg == 0:
        # one class is absent: no pair to compare, use the chance-level value
        return 0.5
    try:
        u_stat = stats.mannwhitneyu(positive_scores, negative_scores)[0]
    except ValueError:
        # if all function outputs are equal, AUC = 0.5
        return 0.5
    return 1. - u_stat / (n_pos * n_neg)


def compute_auc(train, test, Y, y):
    """Computes measures of accuracy for train and test data.

    Computes an AUC estimate from the Mann-Whitney U statistic of the
    predictions for positive labels versus the predictions for negative
    labels, the fraction of samples whose highest-scoring row matches the
    highest label row, and the exponential loss.

    Arguments
        train : float array
            Array of predictions of the model on training data where rows
            correspond to labels and columns correspond to samples.

        test : float array
            Array of predictions of the model on testing data where rows
            correspond to labels and columns correspond to samples.

        Y : float array
            Training labels where rows correspond to labels and columns to
            samples. This is an array of {1,-1}.

        y : float array
            Testing labels where rows correspond to labels and columns to
            samples. This is an array of {1,-1}.

    Returns
        performance : float array
            Array containing, in that order: exponential loss, AUC and
            classification accuracy for training data, followed by
            exponential loss, AUC and classification accuracy for testing
            data.

    .. note::
        * For binary-class classification (two label rows), only the first
          row is used and AUC is proportional to the Mann-Whitney U test
          statistic, which measures the separation between values of
          positive labels and negative labels.
        * For multi-class classification, all entries are pooled. This
          formula for computing classifier AUC is one of many. A more
          principled way would involve computing the Volume under an ROC
          surface.
        * AUC is reported as 0.5 when the test raises ``ValueError`` or when
          either class has no samples.
        * NOTE(review): the AUC is ``1 - U / (n_pos * n_neg)`` and which U
          value ``scipy.stats.mannwhitneyu`` returns has changed between
          SciPy releases - confirm the orientation against the SciPy version
          in use.
    """

    if Y.shape[0] == 2:
        # BINARY CASE: the second row mirrors the first, so score row 0 only
        train_auc = _mwu_auc(train[0, (Y[0, :] == 1)], train[0, (Y[0, :] == -1)])
        test_auc = _mwu_auc(test[0, (y[0, :] == 1)], test[0, (y[0, :] == -1)])
    else:
        # MULTICLASS CASE: pool every (label, sample) entry
        train_auc = _mwu_auc(train[(Y == 1)], train[(Y == -1)])
        test_auc = _mwu_auc(test[(y == 1)], test[(y == -1)])

    # accuracy = number of examples where argmax of prediction
    # equals true label
    train_accuracy = (train.argmax(0) - Y.argmax(0) == 0).sum() / float(Y.shape[1])
    test_accuracy = (test.argmax(0) - y.argmax(0) == 0).sum() / float(y.shape[1])

    # exponential loss
    train_loss = np.sum(np.exp(-1 * train * Y))
    test_loss = np.sum(np.exp(-1 * test * y))

    return np.array([train_loss, train_auc, train_accuracy,
                     test_loss, test_auc, test_accuracy])
def plotExpBox_Main(inputFiles,headers,valcols,outputFile,sep,startRow,showIndPoints,mark,markMean,showMean,notch,whisker,outliers,plotPvalueCluster,outputClusterPrefix,methodCluster,xlegendrotation,xlabe,ylabe,figsz,titl,showSampleSizes,trimToMinSize,relabels,logb,plotHistogramToFile,plotMedianForGroups,botta):
    """Load numeric columns from delimited files, print statistics and save a box plot.

    Reading: each entry of inputFiles is opened with generic_istream; the
    matching entry of valcols lists the column indices to parse as floats and
    the matching entry of headers supplies their labels.  Lines before
    startRow are skipped and fields that cannot be parsed are dropped.  When
    logb is non-zero every value becomes log(value)/logb and exact zeros are
    dropped.

    Post-processing: trimToMinSize passes the smallest column length to
    trimData; relabels overwrites the first len(relabels) labels; every entry
    of plotMedianForGroups is converted in place to column indices and the
    median of the pooled data is later drawn as a dotted line; empty columns
    are discarded.

    Printed to stdout: a one-sample t-test per column, then pairwise tables
    for Student t, Welch t and Mann-Whitney U tests (a failing pairwise test
    is recorded as 1.0).  plotPvalueCluster triggers makePValueRawPlot and
    makePValueClusterPlot for every table.

    Finally the box plot is drawn with plotExpBox and saved to outputFile; a
    non-empty plotHistogramToFile additionally triggers drawHistogram.
    """

    def printPairwisePvalueTable(title,pairPvalue):
        #print one symmetric p-value table to stdout and return it as a list of rows
        print >> stdout,title
        print >> stdout,"p-val",
        for label in xtickLabels:
            print >> stdout,label,
        print >> stdout,""
        pvalueM=[]
        for x in range(0,len(plotData)):
            pvalueRow=[]
            pvalueM.append(pvalueRow)
            print >> stdout, xtickLabels[x],
            for y in range(0,len(plotData)):
                if y<=x:
                    #lower triangle is printed blank and mirrored from the rows already computed
                    print >> stdout, "",
                    if x==y:
                        pvalueRow.append(1.0)
                    else:
                        pvalueRow.append(pvalueM[y][x])
                else:
                    try:
                        pvalue=pairPvalue(plotData[x],plotData[y])
                    except Exception:
                        #a test that cannot be computed counts as "no difference"
                        pvalue=1.0
                    print >> stdout, str(pvalue),
                    pvalueRow.append(pvalue)
            print >> stdout,""
        return pvalueM

    def plotPvalueMatrices(tag,pvalueM):
        #raw and clustered plots of one p-value table, only when requested
        if plotPvalueCluster:
            makePValueRawPlot(outputClusterPrefix+tag+"_raw",xtickLabels,pvalueM)
            makePValueClusterPlot(outputClusterPrefix+tag,xtickLabels,pvalueM,methodCluster)

    #the real deal!
    plotData=[]
    xtickLabels=[]
    minSize=-1

    for inputFile,header,cols in zip(inputFiles,headers,valcols):
        fin=generic_istream(inputFile)
        try:
            startIdx=len(plotData)

            for col in cols:
                plotData.append([])
                xtickLabels.append(header[col])

            colIndices=range(startIdx,startIdx+len(cols))

            lino=0
            for lin in fin:
                lino+=1
                if lino<startRow:
                    continue
                fields=lin.rstrip("\r\n").split(sep)

                for idx,col in zip(colIndices,cols):
                    try:
                        value=float(fields[col])
                        if logb!=0:
                            if value==0.0:
                                raise ValueError
                            value=log(value)/logb
                    except Exception:
                        #unparsable or missing field: leave it out of the column
                        continue
                    plotData[idx].append(value)
        finally:
            fin.close()

        #smallest column over every column of this file (not just the last one parsed)
        for colIdx in colIndices:
            nData=len(plotData[colIdx])
            if minSize==-1 or nData<minSize:
                minSize=nData

    if trimToMinSize:
        print >> stderr,"trimming to min size =",minSize
        trimData(plotData,minSize)

    if len(relabels)>0:
        print >> stderr,xtickLabels
        print >> stderr,relabels
        for i,relabel in enumerate(relabels):
            xtickLabels[i]=relabel

    for i in range(0,len(plotMedianForGroups)):
        plotMedianForGroups[i]=getCol0ListFromCol1ListStringAdv(xtickLabels,plotMedianForGroups[i])

    #drawing medians (computed before any column is discarded below)
    medianToDraw=[]
    for mediangrouper in plotMedianForGroups:
        curD=[]
        for c in mediangrouper:
            curD.extend(plotData[c])
        medianToDraw.append(median(curD))

    #walk backwards so deletions do not shift the indices still to visit
    for c in range(len(plotData)-1,-1,-1):
        if len(plotData[c])==0:
            print >> stderr,xtickLabels[c],"discarded"
            del plotData[c]
            del xtickLabels[c]

    print >> stdout,"student t-test (1 sample; mean=0)"
    print >> stdout,"sample","mean","p-val"
    for x in range(0,len(plotData)):
        #compute both values before printing so a failure cannot leave a half-written line
        try:
            meanValue=mean(plotData[x])
            onePvalue=ttest_1samp(plotData[x],0)[1]
        except Exception:
            meanValue="NA"
            onePvalue="NA"
        print >> stdout, xtickLabels[x],meanValue,onePvalue

    print >> stdout,""
    pvalueM=printPairwisePvalueTable("student t-test (2 samples)",lambda a,b: ttest_ind(a,b)[1])
    print >> stdout,""
    plotPvalueMatrices("_t",pvalueM)

    pvalueM=printPairwisePvalueTable("welch t-test",lambda a,b: welchs_approximate_ttest_arr(a,b)[3])
    plotPvalueMatrices("_Welch",pvalueM)

    print >> stdout,""
    #mann-whitney p-value is multiplied by 2 (one tail to two tail)
    pvalueM=printPairwisePvalueTable("non-parametric (Mann-Whitney U)",lambda a,b: mannwhitneyu(a,b)[1]*2)
    plotPvalueMatrices("_U",pvalueM)

    figure(figsize=figsz)
    subplots_adjust(top=0.9, bottom=botta, left=0.2, right=0.8)

    if len(titl)==0:
        titl=outputFile

    plotExpBox(plotData,xtickLabels,showIndPoints,mark,markMean,showMean,notch,whisker,outliers,xlegendrotation,xlabe,ylabe,titl,showSampleSizes)

    #ylim([0,200])
    for m in medianToDraw:
        axhline(y=m,linestyle=':',color='gray')

    savefig(outputFile,bbox_inches="tight")

    if len(plotHistogramToFile)>0:
        drawHistogram(plotHistogramToFile,plotData,xtickLabels)
def plotExpBox_Main(inputFiles,headers,valcols,outputFile,sep,startRow,showIndPoints,mark,markMean,showMean,notch,whisker,outliers,plotPvalueCluster,outputClusterPrefix,methodCluster,xlegendrotation,xlabe,ylabe,figsz,titl,showSampleSizes,trimToMinSize,relabels,logb,plotHistogramToFile,plotMedianForGroups,botta):
    """Load the value columns, print the test tables and save the box plot.

    For each (inputFile, header, cols) triple the columns listed in cols are
    parsed as floats; rows before startRow and cells that cannot be parsed
    are skipped. When logb is non-zero every value is replaced by
    log(value)/logb. The columns are then optionally trimmed to a common
    size (trimToMinSize), relabelled (relabels), and columns left without
    data are discarded. A 1-sample t-test table and pairwise student t-test,
    Welch t-test and Mann-Whitney U tables are printed to stdout (and
    optionally plotted when plotPvalueCluster is set) before the figure is
    drawn by plotExpBox and written to outputFile. When plotHistogramToFile
    is non-empty a histogram is drawn as well.
    """
    plotData=[]
    xtickLabels=[]
    minSize=-1

    for inputFile,header,cols in zip(inputFiles,headers,valcols):
        fin=generic_istream(inputFile)
        try:
            startIdx=len(plotData)
            for col in cols:
                plotData.append([])
                xtickLabels.append(header[col])

            colIndices=range(startIdx,startIdx+len(cols))
            lino=0
            for lin in fin:
                lino+=1
                if lino<startRow:
                    continue
                fields=lin.rstrip("\r\n").split(sep)
                for idx,col in zip(colIndices,cols):
                    try:
                        value=float(fields[col])
                        if logb!=0:
                            value=log(value)/logb
                            # presumably rejects log(0) = -inf and similar
                            # degenerate values -- confirm
                            if value<-100000:
                                raise ValueError
                        plotData[idx].append(value)
                    except (ValueError,IndexError):
                        # unparsable or missing cell: skip this data point
                        pass
        finally:
            # release the handle even if reading fails half way
            fin.close()

        # Track the shortest column over ALL columns of this file. The
        # previous code looked only at the loop variable left over from the
        # parsing loop (the last column), which was also unbound when the
        # file produced no data rows.
        for idx in colIndices:
            nData=len(plotData[idx])
            if minSize==-1 or nData<minSize:
                minSize=nData

    if trimToMinSize:
        print >> stderr,"trimming to min size =",minSize
        trimData(plotData,minSize)

    # relabels may be shorter than xtickLabels: only the leading labels are
    # replaced in that case.
    for i,relabel in enumerate(relabels):
        xtickLabels[i]=relabel

    for i in range(0,len(plotMedianForGroups)):
        plotMedianForGroups[i]=getCol0ListFromCol1ListStringAdv(xtickLabels,plotMedianForGroups[i])

    #drawing medians: one pooled median per requested group of columns
    medianToDraw=[]
    for mediangrouper in plotMedianForGroups:
        curD=[]
        for c in mediangrouper:
            curD.extend(plotData[c])
        medianToDraw.append(median(curD))

    # drop empty columns, walking backwards so deletions keep indices valid
    for c in range(len(plotData)-1,-1,-1):
        if len(plotData[c])==0:
            print >> stderr,xtickLabels[c],"discarded"
            del plotData[c]
            del xtickLabels[c]

    print >> stdout,"student t-test (1 sample; mean=0)"
    print >> stdout,"sample","mean","p-val"
    for x in range(0,len(plotData)):
        try:
            print >> stdout, xtickLabels[x],mean(plotData[x]),ttest_1samp(plotData[x],0)[1]
        except Exception:
            print >> stdout, xtickLabels[x],"NA","NA"

    def studentP(a,b):
        # a failing test is reported as "no difference"
        try:
            return ttest_ind(a,b)[1]
        except Exception:
            return 1.0

    def welchP(a,b):
        return welchs_approximate_ttest_arr(a,b)[3]

    def mannWhitneyP(a,b):
        #mann-whitney need to mul by 2 (one tail to two tail)
        try:
            return mannwhitneyu(a,b)[1]*2
        except Exception:
            return 1.0

    print >> stdout,""
    print >> stdout,"student t-test (2 samples)"
    pvalueM=_printPairwisePvalues(plotData,xtickLabels,studentP)
    print >> stdout,""
    if plotPvalueCluster:
        makePValueRawPlot(outputClusterPrefix+"_t_raw",xtickLabels,pvalueM)
        makePValueClusterPlot(outputClusterPrefix+"_t",xtickLabels,pvalueM,methodCluster)

    print >> stdout,"welch t-test"
    pvalueM=_printPairwisePvalues(plotData,xtickLabels,welchP)
    if plotPvalueCluster:
        makePValueRawPlot(outputClusterPrefix+"_Welch_raw",xtickLabels,pvalueM)
        makePValueClusterPlot(outputClusterPrefix+"_Welch",xtickLabels,pvalueM,methodCluster)

    print >> stdout,""
    print >> stdout,"non-parametric (Mann-Whitney U)"
    pvalueM=_printPairwisePvalues(plotData,xtickLabels,mannWhitneyP)
    if plotPvalueCluster:
        makePValueRawPlot(outputClusterPrefix+"_U_raw",xtickLabels,pvalueM)
        makePValueClusterPlot(outputClusterPrefix+"_U",xtickLabels,pvalueM,methodCluster)

    figure(figsize=figsz)
    subplots_adjust(top=0.9, bottom=botta, left=0.2, right=0.8)
    if len(titl)==0:
        titl=outputFile

    plotExpBox(plotData,xtickLabels,showIndPoints,mark,markMean,showMean,notch,whisker,outliers,xlegendrotation,xlabe,ylabe,titl,showSampleSizes)

    for m in medianToDraw:
        axhline(y=m,linestyle=':',color='gray')

    savefig(outputFile,bbox_inches="tight")

    if len(plotHistogramToFile)>0:
        drawHistogram(plotHistogramToFile,plotData,xtickLabels)


def _printPairwisePvalues(plotData,xtickLabels,pairTest):
    """Print one pairwise table to stdout and return it as a full matrix.

    pairTest(a,b) is called once for every pair of columns with x<y and its
    result is printed in the upper triangle. The diagonal is stored as 1.0
    and the lower triangle mirrors the upper one; both are printed as empty
    fields.
    """
    print >> stdout,"p-val",
    for x in range(0,len(plotData)):
        print >> stdout,xtickLabels[x],
    print >> stdout,""

    pvalueM=[]
    for x in range(0,len(plotData)):
        pvalueRow=[]
        pvalueM.append(pvalueRow)
        print >> stdout, xtickLabels[x],
        for y in range(0,len(plotData)):
            if y<=x:
                print >> stdout, "",
                if x==y:
                    pvalueRow.append(1.0)
                else:
                    pvalueRow.append(pvalueM[y][x])
            else:
                pvalue=pairTest(plotData[x],plotData[y])
                print >> stdout, str(pvalue),
                pvalueRow.append(pvalue)
        print >> stdout,""
    return pvalueM
def plotExpBox_Main(inputFiles,headers,valcols,outputFile,sep,startRow,showIndPoints,mark,markMean,showMean,notch,whisker,outliers,plotPvalueCluster,outputClusterPrefix,methodCluster,xlegendrotation,xlabe,ylabe,figsz,titl,showSampleSizes,trimToMinSize,relabels):
    """Load the value columns, print the test tables and save the box plot.

    For each (inputFile, header, cols) triple the columns listed in cols are
    parsed as floats; rows before startRow and cells that cannot be parsed
    are skipped. The columns are optionally trimmed to a common size
    (trimToMinSize) and relabelled (relabels must then match the number of
    columns, otherwise the program exits). A 1-sample t-test table and
    pairwise student t-test, Welch t-test and Mann-Whitney U tables are
    printed to stdout (and optionally plotted when plotPvalueCluster is
    set) before the figure is drawn by plotExpBox and written to outputFile.
    """
    plotData=[]
    xtickLabels=[]
    minSize=-1

    for inputFile,header,cols in zip(inputFiles,headers,valcols):
        fin=generic_istream(inputFile)
        try:
            startIdx=len(plotData)
            for col in cols:
                plotData.append([])
                xtickLabels.append(header[col])

            colIndices=range(startIdx,startIdx+len(cols))
            lino=0
            for lin in fin:
                lino+=1
                if lino<startRow:
                    continue
                fields=lin.rstrip("\r\n").split(sep)
                for idx,col in zip(colIndices,cols):
                    try:
                        plotData[idx].append(float(fields[col]))
                    except (ValueError,IndexError):
                        # unparsable or missing cell: skip this data point
                        pass
        finally:
            # release the handle even if reading fails half way
            fin.close()

        # Track the shortest column over ALL columns of this file. The
        # previous code looked only at the loop variable left over from the
        # parsing loop (the last column), which was also unbound when the
        # file produced no data rows.
        for idx in colIndices:
            nData=len(plotData[idx])
            if minSize==-1 or nData<minSize:
                minSize=nData

    if trimToMinSize:
        print >> stderr,"trimming to min size =",minSize
        trimData(plotData,minSize)

    if len(relabels)>0:
        if len(relabels)!=len(xtickLabels):
            print >> stderr,"relabels doesn't have the same length as original label vectors",xtickLabels,"=>",relabels
            exit()
        xtickLabels=relabels

    print >> stdout,"student t-test (1 sample; mean=0)"
    print >> stdout,"sample","mean","p-val"
    for x in range(0,len(plotData)):
        print >> stdout, xtickLabels[x],mean(plotData[x]),ttest_1samp(plotData[x],0)[1]

    def studentP(a,b):
        return ttest_ind(a,b)[1]

    def welchP(a,b):
        return welchs_approximate_ttest_arr(a,b)[3]

    def mannWhitneyP(a,b):
        #mann-whitney need to mul by 2 (one tail to two tail)
        return mannwhitneyu(a,b)[1]*2

    print >> stdout,""
    print >> stdout,"student t-test (2 samples)"
    pvalueM=_printPairwisePvalues(plotData,xtickLabels,studentP)
    print >> stdout,""
    if plotPvalueCluster:
        makePValueRawPlot(outputClusterPrefix+"_t_raw",xtickLabels,pvalueM)
        makePValueClusterPlot(outputClusterPrefix+"_t",xtickLabels,pvalueM,methodCluster)

    print >> stdout,"welch t-test"
    pvalueM=_printPairwisePvalues(plotData,xtickLabels,welchP)
    if plotPvalueCluster:
        makePValueRawPlot(outputClusterPrefix+"_Welch_raw",xtickLabels,pvalueM)
        makePValueClusterPlot(outputClusterPrefix+"_Welch",xtickLabels,pvalueM,methodCluster)

    print >> stdout,""
    print >> stdout,"non-parametric (Mann-Whitney U)"
    pvalueM=_printPairwisePvalues(plotData,xtickLabels,mannWhitneyP)
    if plotPvalueCluster:
        makePValueRawPlot(outputClusterPrefix+"_U_raw",xtickLabels,pvalueM)
        makePValueClusterPlot(outputClusterPrefix+"_U",xtickLabels,pvalueM,methodCluster)

    figure(figsize=figsz)
    if len(titl)==0:
        titl=outputFile

    plotExpBox(plotData,xtickLabels,showIndPoints,mark,markMean,showMean,notch,whisker,outliers,xlegendrotation,xlabe,ylabe,titl,showSampleSizes)

    savefig(outputFile,bbox_inches="tight")


def _printPairwisePvalues(plotData,xtickLabels,pairTest):
    """Print one pairwise table to stdout and return it as a full matrix.

    pairTest(a,b) is called once for every pair of columns with x<y and its
    result is printed in the upper triangle. The diagonal is stored as 1.0
    and the lower triangle mirrors the upper one; both are printed as empty
    fields.
    """
    print >> stdout,"p-val",
    for x in range(0,len(plotData)):
        print >> stdout,xtickLabels[x],
    print >> stdout,""

    pvalueM=[]
    for x in range(0,len(plotData)):
        pvalueRow=[]
        pvalueM.append(pvalueRow)
        print >> stdout, xtickLabels[x],
        for y in range(0,len(plotData)):
            if y<=x:
                print >> stdout, "",
                if x==y:
                    pvalueRow.append(1.0)
                else:
                    pvalueRow.append(pvalueM[y][x])
            else:
                pvalue=pairTest(plotData[x],plotData[y])
                print >> stdout, str(pvalue),
                pvalueRow.append(pvalue)
        print >> stdout,""
    return pvalueM
import numpy import pandas import datetime import scipy.stats.stats as st # import matplotlib.pyplot as plt format = "%Y_%m_%d" current_date=datetime.datetime.today() print 'dataset path: /Volumes/daelsaid/inquisit/INQUISIT_RERUN/all_launches_dir/scored/clean_copies/scored_data_2017_07_01_comb_mturk.csv' print ' ' print current_date scored_data=pandas.read_csv('/Volumes/daelsaid/inquisit/INQUISIT_RERUN/all_launches_dir/scored/clean_copies/scored_data_2017_07_01_newageranges_comb_mturk.csv') y=['trial1', 'trial2', 'trial3', 'trial4', 'trial5', 'listb', 'trial6', 'trial7'] test_df=pandas.DataFrame(data=scored_data.set_index(['gender', 'age_range']).loc[:,'trial1':'trial7']) for col in y: print ' ' print '*******' + col + '*******' for idx,val in test_df.groupby(level=1): if len(val) >3: u,p= st.mannwhitneyu(val.loc['female'][col],val.loc['male'][col], use_continuity=True,alternative='two-sided') if p < 0.05: print 'age_range ' + list(set(val.loc['female'][col].index.tolist()))[0] + ':', u, p, '**' else: print 'age_range ' + list(set(val.loc['female'][col].index.tolist()))[0] + ':', u, p, 'ns' else: print 'age_range ' + list(set(val.loc['female'][col].index.tolist()))[0]+ ': n is less than 3'
) ax['circ'].scatter(df_py.jitter, df_py.vector_strength, edgecolors='w', lw=.5, color=sns.xkcd_rgb['dark fuchsia'], \ label='pyramidal', s=point_size ) ax['vs_freq_beat'].scatter(df_pu_b.frequency, df_pu_b.vector_strength, edgecolors='w', lw=.5, color=sns.xkcd_rgb['azure'], \ label='p-units', s=point_size) ax['vs_freq_beat'].scatter(df_py_b.frequency, df_py_b.vector_strength, edgecolors='w', lw=.5, color=sns.xkcd_rgb['dark fuchsia'], \ label='pyramidal', s=point_size) # --- circular variance scatter plots ax['circ_beat'].scatter(df_pu_b.jitter, df_pu_b.vector_strength, edgecolors='w', lw=.5, color=sns.xkcd_rgb['azure'], \ label='p-units', s=point_size ) ax['circ_beat'].scatter(df_py_b.jitter, df_py_b.vector_strength, edgecolors='w', lw=.5, color=sns.xkcd_rgb['dark fuchsia'], \ label='pyramidal', s=point_size ) print(stats.mannwhitneyu(df_pu.vector_strength, df_py.vector_strength, use_continuity=True, alternative=None)) print(stats.mannwhitneyu(df_pu_b.vector_strength, df_py_b.vector_strength, use_continuity=True, alternative=None)) print(np.median(df_pu.vector_strength)) print(np.median(df_py.vector_strength)) print(np.median(df_pu_b.vector_strength)) print(np.median(df_py_b.vector_strength))
def stat_test(df,df_num):
    """Compare every column of df_num between the two Response groups of df.

    For each column name in df_num.columns the values of that column in df
    are split into the rows with Response == 1 and the rows with
    Response == 0, and both an independent two-sample t-test and a
    Mann-Whitney U test are run on the two groups.

    Previously both results were discarded (the second test even overwrote
    the p-value of the first) and nothing was returned; the results are now
    collected and returned.

    Returns
        dict mapping each column name to
        {'ttest': (statistic, pvalue), 'mannwhitneyu': (statistic, pvalue)}.
    """
    is_1 = df['Response']==1
    is_0 = df['Response']==0

    results = {}
    for i in df_num.columns:
        df_1 = df[is_1][i]
        df_0 = df[is_0][i]
        t_stat,t_pval = stats.ttest_ind(df_1,df_0)
        u_stat,u_pval = stats.mannwhitneyu(df_1,df_0)
        results[i] = {
            'ttest': (float(t_stat),float(t_pval)),
            'mannwhitneyu': (float(u_stat),float(u_pval)),
        }
    return results
def compute_auc(train, test, Y, y):
    """Computes measures of accuracy for train and test data.

    The AUC is derived from the Mann-Whitney U statistic that separates the
    predictions at positive labels from the predictions at negative labels.
    The classification accuracy is the fraction of samples whose predicted
    label (argmax over rows) equals the true label (argmax over rows).

    Arguments
        train : float array
            Array of predictions of the model on training data
            where rows correspond to labels and columns correspond
            to samples.
        test : float array
            Array of predictions of the model on testing data
            where rows correspond to labels and columns correspond
            to samples.
        Y : float array
            Training labels where rows correspond to labels and
            columns to samples. This is an array of {1,-1}.
        y : float array
            Testing labels where rows correspond to labels and
            columns to samples. This is an array of {1,-1}.

    Returns
        performance : float array
            Array containing the AUC for training data, classification
            accuracy for training data, AUC for testing data and
            classification accuracy for testing data, in that order.

    .. note::
        * The AUC is reported as max(U, n_pairs - U) / n_pairs and is
          therefore always >= 0.5. This is the value the former expression
          ``1 - U / n_pairs`` produced with SciPy releases whose
          ``mannwhitneyu`` returned the smaller of the two U statistics;
          newer releases return the U of the first sample, which inverted
          the former result.
        * When either class is empty, or the test cannot be computed, the
          AUC is reported as 0.5.
        * For multi-class classification, this formula for computing
          classifier AUC is one of many. A more principled way would
          involve computing the Volume under an ROC surface.
    """
    train_auc = _separation_auc(train, Y)
    test_auc = _separation_auc(test, y)

    # accuracy = number of examples where argmax of prediction
    # equals true label
    train_accuracy = (train.argmax(0) == Y.argmax(0)).sum() / float(
        Y.shape[1])
    test_accuracy = (test.argmax(0) == y.argmax(0)).sum() / float(
        y.shape[1])

    return np.array([train_auc, train_accuracy, test_auc, test_accuracy])


def _separation_auc(scores, labels):
    """Return the U-based AUC between scores at +1 labels and at -1 labels."""
    positives = scores[labels == 1]
    negatives = scores[labels == -1]

    n_pairs = positives.size * negatives.size
    if n_pairs == 0:
        # one of the classes is absent: nothing to separate
        return 0.5

    try:
        u = stats.mannwhitneyu(positives, negatives)[0]
    except ValueError:
        return 0.5

    # Independent of which of the two U statistics SciPy returns.
    return max(u, n_pairs - u) / float(n_pairs)
def plotExpBox_Main(inputFiles, headers, valcols, outputFile, sep, startRow,
                    showIndPoints, mark, markMean, showMean, notch, whisker,
                    outliers, plotPvalueCluster, outputClusterPrefix,
                    methodCluster, xlegendrotation, xlabe, ylabe, figsz, titl,
                    showSampleSizes, trimToMinSize, relabels, logb,
                    plotHistogramToFile, plotMedianForGroups, botta,
                    showViolin, showBox, firstColAnnot, plotTrend, showLegend,
                    makePzfxFile, makeBinMatrix, writeDataSummaryStat,
                    summaryStatRange, minuslog10pvalue, minNDataToKeep,
                    vfacecolor, valpha, outXYZPvalues, dividePlots):
    """Load the value columns, print the test tables and save the box plot.

    Steps, in order:
      1. For each (inputFile, header, cols) triple, parse the columns in
         cols as floats (rows before startRow are skipped; when logb is
         non-zero each value becomes log(value)/logb and zeros are
         rejected). With firstColAnnot the first entry of cols is read as a
         per-row annotation instead of data; with plotTrend the parsed
         values of each row are also kept per file.
      2. Optionally trim to a common size, relabel, resolve
         plotMedianForGroups and compute the pooled median of each group.
      3. Discard columns with fewer than minNDataToKeep data points.
      4. Print the 1-sample t-test table and, when writeDataSummaryStat is
         set, write per-column summary statistics to that file.
      5. Print pairwise tables for the student t-test, Welch t-test,
         Mann-Whitney U, Ansari-Bradley, Fligner, Levene and Bartlett
         tests; with minuslog10pvalue the values are -log10 transformed.
      6. Draw the figure with plotExpBox, add the median lines, save it to
         outputFile and optionally draw histogram/density plots.

    NOTE(review): vfacecolor and valpha are not referenced in this body.
    NOTE(review): the indentation of this function was reconstructed;
    please check the extent of the `if not skipStat:` block and of the
    final `if len(plotHistogramToFile) > 0:` block against the original.
    """
    #if plotPvalueCluster:
    #if pvalue cluster is needed:
    #	from Bio.Cluster.cluster import *
    #	from Bio.Cluster import *
    #endif

    #the real deal!
    plotData = []
    xtickLabels = []
    trendData = {}  # first column index of a file -> list of per-row values
    annot = {}  # first column index of a file -> list of row annotations
    minSize = -1
    for inputFile, header, cols in zip(inputFiles, headers, valcols):
        fin = generic_istream(inputFile)
        startIdx = len(plotData)
        if firstColAnnot:
            # the first listed column holds the annotation, the rest is data
            colAnnot = cols[0]
            cols = cols[1:]
            annotThisFile = []
            annot[startIdx] = annotThisFile
        else:
            colAnnot = -1
            annotThisFile = None
        for col in cols:
            plotData.append([])
            xtickLabels.append(header[col])
        colIndices = range(startIdx, startIdx + len(cols))
        if plotTrend:
            #print >> stderr,"plotTrend"
            trendDataThisFile = []
            trendData[startIdx] = trendDataThisFile
        else:
            trendDataThisFile = None
        lino = 0
        for lin in fin:
            lino += 1
            if lino < startRow:
                continue
            fields = lin.rstrip("\r\n").split(sep)
            if plotTrend:
                #print >> stderr,"a"
                trendDataThisLine = []
            else:
                trendDataThisLine = None
            allDataOKThisLine = True
            if colAnnot >= 0:
                annotThisFile.append(fields[colAnnot])
            for idx, col in zip(colIndices, cols):
                try:
                    value = float(fields[col])
                    if logb != 0:
                        # zero cannot be log-transformed: treat as bad cell
                        if value == 0.0:
                            raise ValueError
                        value = log(value) / logb
                    plotData[idx].append(value)
                    if plotTrend:
                        trendDataThisLine.append(value)
                        #print >> stderr,"value:",value
                except:
                    # any failure on this cell marks the whole row as
                    # incomplete for the trend data
                    allDataOKThisLine = False
            if plotTrend:
                # rows with a bad cell are recorded as None
                if allDataOKThisLine:
                    trendDataThisFile.append(trendDataThisLine)
                else:
                    trendDataThisFile.append(None)
        fin.close()
        # NOTE(review): idx is the variable left over from the cell loop
        # above, i.e. the last column of this file; it is unbound (or stale
        # from a previous file) when no data row was read -- confirm intent.
        if minSize == -1:
            minSize = len(plotData[idx])  #or startIDX?
        else:
            minSize = min([minSize, len(plotData[idx])])
    if trimToMinSize:
        print >> stderr, "trimming to min size =", minSize
        trimData(plotData, minSize)
    if len(relabels) > 0:
        #if len(relabels)!=len(xtickLabels):
        #	print >> stderr,"relabels doesn't have the same length as original label vectors",xtickLabels,"=>",relabels
        #	exit()
        print >> stderr, xtickLabels
        print >> stderr, relabels
        # overwrite the leading labels, one per entry of relabels
        for i, relabel in zip(range(0, len(relabels)), relabels):
            xtickLabels[i] = relabel
    for i in range(0, len(plotMedianForGroups)):
        plotMedianForGroups[i] = getCol0ListFromCol1ListStringAdv(
            xtickLabels, plotMedianForGroups[i])
    #drawing medians: one pooled median per group of columns
    medianToDraw = []
    for mediangrouper in plotMedianForGroups:
        curD = []
        for c in mediangrouper:
            curD.extend(plotData[c])
        medianToDraw.append(median(curD))
    # walk backwards so that deleting a column keeps the remaining indices valid
    for c in range(len(plotData) - 1, -1, -1):
        if len(plotData[c]) < minNDataToKeep:
            print >> stderr, xtickLabels[c], "discarded because has only", len(
                plotData[c]), "data points <", minNDataToKeep
            del plotData[c]
            del xtickLabels[c]
    # NOTE(review): skipStat is neither a parameter nor assigned in this
    # function; presumably a module-level flag -- confirm it is defined.
    if not skipStat:
        print >> stdout, "student t-test (1 sample; mean=0)"
        print >> stdout, "sample", "mean", "p-val", "median"
    if writeDataSummaryStat:
        fDSS = open(writeDataSummaryStat, "w")
        print >> fDSS, "sample\tmean\tvar\tsd\tmin\tmax\tN\tNInRange[" + str(summaryStatRange[0]) + "," + str(summaryStatRange[1]) + "]\t%NInRange\tNbelowRange\t%Nbelow\tNAboveRange\t%NAbove"
    for x in range(0, len(plotData)):
        #print >> stderr, len(plotData[x])
        try:
            print >> stdout, xtickLabels[x], mean(plotData[x]), ttest_1samp(plotData[x], 0)[1], median(plotData[x])
        except:
            print >> stdout, xtickLabels[x], mean(plotData[x]), "NA", median(plotData[x])
        if writeDataSummaryStat:
            sumData, N, NIN, NBelow, NAbove = filterDataInRangeInclusive(
                plotData[x], summaryStatRange[0], summaryStatRange[1])
            # statistics are computed only when more than one value is in range
            if NIN > 1:
                #print >> stderr,"sumData=",sumData
                #print >> stderr,mean
                mea = mean2(sumData)
                DDOF = 1
                sd = std(sumData, ddof=DDOF)
                var = sd * sd
                mi = min(sumData)
                ma = max(sumData)
            else:
                mea = "NA"
                sd = "NA"
                var = "NA"
                mi = "NA"
                ma = "NA"
            print >> fDSS, xtickLabels[x] + "\t" + str(mea) + "\t" + str(var) + "\t" + str(sd) + "\t" + str(mi) + "\t" + str(ma) + "\t" + str(N) + "\t" + str(NIN) + "\t" + str(float(NIN) * 100 / N) + "\t" + str(NBelow) + "\t" + str(float(NBelow) * 100 / N) + "\t" + str(NAbove) + "\t" + str(float(NAbove) * 100 / N)
    pvalueM = []
    if writeDataSummaryStat:
        fDSS.close()

    # ---- student t-test (2 samples) ----
    # Each table below follows the same scheme: only cells with y > x are
    # computed and printed; the diagonal is stored as 1.0 (0.0 with
    # minuslog10pvalue) and the lower triangle is copied from the upper one.
    print >> stdout, ""
    print >> stdout, "student t-test (2 samples)"
    print >> stdout, "p-val",
    for x in range(0, len(plotData)):
        print >> stdout, xtickLabels[x],
    print >> stdout, ""
    for x in range(0, len(plotData)):
        pvalueRow = []
        pvalueM.append(pvalueRow)
        print >> stdout, xtickLabels[x],
        for y in range(0, len(plotData)):
            if y <= x:
                print >> stdout, "",
                if x == y:
                    if minuslog10pvalue:
                        pvalueRow.append(0.0)
                    else:
                        pvalueRow.append(1.0)
                else:
                    pvalueRow.append(pvalueM[y][x])
            else:
                try:
                    pvalue = ttest_ind(plotData[x], plotData[y])[1]
                except:
                    pvalue = 1.0
                if minuslog10pvalue and str(pvalue) != "NA":
                    try:
                        pvalue = -1 * log(pvalue, 10)
                    except:
                        # transform failed: store a fixed sentinel
                        pvalue = -1000.0
                print >> stdout, str(pvalue),
                pvalueRow.append(pvalue)
        print >> stdout, ""
    print >> stdout, ""
    if plotPvalueCluster:
        makePValueRawPlot(outputClusterPrefix + "_t_raw", xtickLabels, pvalueM)
        makePValueClusterPlot(outputClusterPrefix + "_t", xtickLabels, pvalueM,
                              methodCluster)

    # ---- Welch t-test ----
    pvalueM = []
    print >> stdout, "welch t-test"
    print >> stdout, "p-val",
    for x in range(0, len(plotData)):
        print >> stdout, xtickLabels[x],
    print >> stdout, ""
    for x in range(0, len(plotData)):
        pvalueRow = []
        pvalueM.append(pvalueRow)
        print >> stdout, xtickLabels[x],
        for y in range(0, len(plotData)):
            if y <= x:
                print >> stdout, "",
                if x == y:
                    if minuslog10pvalue:
                        pvalueRow.append(0.0)
                    else:
                        pvalueRow.append(1.0)
                else:
                    pvalueRow.append(pvalueM[y][x])
            else:
                try:
                    pvalue = welchs_approximate_ttest_arr(
                        plotData[x], plotData[y])[3]
                except:
                    pvalue = 1.0
                if minuslog10pvalue and str(pvalue) != "NA":
                    try:
                        pvalue = -1 * log(pvalue, 10)
                    except:
                        pvalue = -1000.0
                print >> stdout, str(pvalue),
                pvalueRow.append(pvalue)
        print >> stdout, ""
    if outXYZPvalues:
        writeXYZPvalues(outXYZPvalues + "_Welch.xyz", xtickLabels, pvalueM)
    if plotPvalueCluster:
        makePValueRawPlot(outputClusterPrefix + "_Welch_raw", xtickLabels,
                          pvalueM)
        makePValueClusterPlot(outputClusterPrefix + "_Welch", xtickLabels,
                              pvalueM, methodCluster)

    # ---- Mann-Whitney U ----
    print >> stdout, ""
    print >> stdout, "non-parametric (Mann-Whitney U)"  #"non-parametric (Mann-Whitney U if larger n<=20 else Wilcoxon)"
    print >> stdout, "p-val",
    for x in range(0, len(plotData)):
        print >> stdout, xtickLabels[x],
    pvalueM = []
    print >> stdout, ""
    for x in range(0, len(plotData)):
        pvalueRow = []
        pvalueM.append(pvalueRow)
        print >> stdout, xtickLabels[x],
        for y in range(0, len(plotData)):
            if y <= x:
                print >> stdout, "",
                if x == y:
                    if minuslog10pvalue:
                        pvalueRow.append(0.0)
                    else:
                        pvalueRow.append(1.0)
                else:
                    pvalueRow.append(pvalueM[y][x])
            else:
                #if max(len(plotData[x]),len(plotData[y]))<=20:
                try:
                    pvalue = mannwhitneyu(plotData[x], plotData[y])[1] * 2
                except:
                    pvalue = 1.0
                if minuslog10pvalue and str(pvalue) != "NA":
                    try:
                        pvalue = -1 * log(pvalue, 10)
                    except:
                        pvalue = -1000.0
                print >> stdout, pvalue,  #mann-whiteney need to mul by 2 (one tail to two tail)
                pvalueRow.append(pvalue)
                #else:
                #	print >> stdout,wilcoxon(plotData[x],plotData[y])[1], # this is two-tailed already stdout, "", #
        print >> stdout, ""
    if outXYZPvalues:
        writeXYZPvalues(outXYZPvalues + "_U.xyz", xtickLabels, pvalueM)
    if plotPvalueCluster:
        makePValueRawPlot(outputClusterPrefix + "_U_raw", xtickLabels, pvalueM)
        makePValueClusterPlot(outputClusterPrefix + "_U", xtickLabels, pvalueM,
                              methodCluster)

    #####now the variance tests
    # From here on a failing test is stored as the string "NA" instead of 1.0.
    print >> stdout, ""
    print >> stdout, "Ansari-Bradley Two-sample Test for difference in scale parameters "
    print >> stdout, "p-val",
    for x in range(0, len(plotData)):
        print >> stdout, xtickLabels[x],
    pvalueM = []
    print >> stdout, ""
    for x in range(0, len(plotData)):
        pvalueRow = []
        pvalueM.append(pvalueRow)
        print >> stdout, xtickLabels[x],
        for y in range(0, len(plotData)):
            if y <= x:
                print >> stdout, "",
                if x == y:
                    if minuslog10pvalue:
                        pvalueRow.append(0.0)
                    else:
                        pvalueRow.append(1.0)
                else:
                    pvalueRow.append(pvalueM[y][x])
            else:
                #if max(len(plotData[x]),len(plotData[y]))<=20:
                try:
                    pvalue = ansari(plotData[x], plotData[y])[1]
                except:
                    pvalue = "NA"
                if minuslog10pvalue and str(pvalue) != "NA":
                    try:
                        pvalue = -1 * log(pvalue, 10)
                    except:
                        pvalue = -1000.0
                #pvalue=1.0
                print >> stdout, pvalue,
                pvalueRow.append(pvalue)
                #else:
                #	print >> stdout,wilcoxon(plotData[x],plotData[y])[1], # this is two-tailed already stdout, "", #
        print >> stdout, ""
    if plotPvalueCluster:
        makePValueRawPlot(outputClusterPrefix + "_Ansari_raw", xtickLabels,
                          pvalueM)
        makePValueClusterPlot(outputClusterPrefix + "_Ansari", xtickLabels,
                              pvalueM, methodCluster)
    #####

    #####now the variance tests
    print >> stdout, ""
    print >> stdout, "Fligner's Two-sample Test for equal variance (non-parametrics)"
    print >> stdout, "p-val",
    for x in range(0, len(plotData)):
        print >> stdout, xtickLabels[x],
    pvalueM = []
    print >> stdout, ""
    for x in range(0, len(plotData)):
        pvalueRow = []
        pvalueM.append(pvalueRow)
        print >> stdout, xtickLabels[x],
        for y in range(0, len(plotData)):
            if y <= x:
                print >> stdout, "",
                if x == y:
                    if minuslog10pvalue:
                        pvalueRow.append(0.0)
                    else:
                        pvalueRow.append(1.0)
                else:
                    pvalueRow.append(pvalueM[y][x])
            else:
                #if max(len(plotData[x]),len(plotData[y]))<=20:
                try:
                    pvalue = fligner(plotData[x], plotData[y])[1]
                except:
                    pvalue = "NA"
                #pvalue=1.0
                if minuslog10pvalue and str(pvalue) != "NA":
                    try:
                        pvalue = -1 * log(pvalue, 10)
                    except:
                        pvalue = -1000.0
                print >> stdout, pvalue,
                pvalueRow.append(pvalue)
                #else:
                #	print >> stdout,wilcoxon(plotData[x],plotData[y])[1], # this is two-tailed already stdout, "", #
        print >> stdout, ""
    if plotPvalueCluster:
        makePValueRawPlot(outputClusterPrefix + "_fligner_raw", xtickLabels,
                          pvalueM)
        makePValueClusterPlot(outputClusterPrefix + "_fligner", xtickLabels,
                              pvalueM, methodCluster)
    #####

    #####now the variance tests
    print >> stdout, ""
    print >> stdout, "Levene's Two-sample Test for equal variance"
    print >> stdout, "p-val",
    for x in range(0, len(plotData)):
        print >> stdout, xtickLabels[x],
    pvalueM = []
    print >> stdout, ""
    for x in range(0, len(plotData)):
        pvalueRow = []
        pvalueM.append(pvalueRow)
        print >> stdout, xtickLabels[x],
        for y in range(0, len(plotData)):
            if y <= x:
                print >> stdout, "",
                if x == y:
                    if minuslog10pvalue:
                        pvalueRow.append(0.0)
                    else:
                        pvalueRow.append(1.0)
                else:
                    pvalueRow.append(pvalueM[y][x])
            else:
                #if max(len(plotData[x]),len(plotData[y]))<=20:
                try:
                    pvalue = levene(plotData[x], plotData[y])[1]
                except:
                    pvalue = "NA"
                #pvalue=1.0
                if minuslog10pvalue and str(pvalue) != "NA":
                    try:
                        pvalue = -1 * log(pvalue, 10)
                    except:
                        pvalue = -1000.0
                print >> stdout, pvalue,
                pvalueRow.append(pvalue)
                #else:
                #	print >> stdout,wilcoxon(plotData[x],plotData[y])[1], # this is two-tailed already stdout, "", #
        print >> stdout, ""
    if plotPvalueCluster:
        makePValueRawPlot(outputClusterPrefix + "_levene_raw", xtickLabels,
                          pvalueM)
        makePValueClusterPlot(outputClusterPrefix + "_levene", xtickLabels,
                              pvalueM, methodCluster)
    #####

    #####now the variance tests
    print >> stdout, ""
    print >> stdout, "Bartlett's Two-sample Test for equal variance (for normal distributions)"
    print >> stdout, "p-val",
    for x in range(0, len(plotData)):
        print >> stdout, xtickLabels[x],
    pvalueM = []
    print >> stdout, ""
    for x in range(0, len(plotData)):
        pvalueRow = []
        pvalueM.append(pvalueRow)
        print >> stdout, xtickLabels[x],
        for y in range(0, len(plotData)):
            if y <= x:
                print >> stdout, "",
                if x == y:
                    if minuslog10pvalue:
                        pvalueRow.append(0.0)
                    else:
                        pvalueRow.append(1.0)
                else:
                    pvalueRow.append(pvalueM[y][x])
            else:
                #if max(len(plotData[x]),len(plotData[y]))<=20:
                try:
                    pvalue = bartlett(plotData[x], plotData[y])[1]
                except:
                    pvalue = "NA"
                #pvalue=1.0
                if minuslog10pvalue and str(pvalue) != "NA":
                    try:
                        pvalue = -1 * log(pvalue, 10)
                    except:
                        pvalue = -1000.0
                print >> stdout, pvalue,
                pvalueRow.append(pvalue)
                #else:
                #	print >> stdout,wilcoxon(plotData[x],plotData[y])[1], # this is two-tailed already stdout, "", #
        print >> stdout, ""
    if plotPvalueCluster:
        makePValueRawPlot(outputClusterPrefix + "_bartlett_raw", xtickLabels,
                          pvalueM)
        makePValueClusterPlot(outputClusterPrefix + "_bartlett", xtickLabels,
                              pvalueM, methodCluster)
    #####

    # ---- drawing ----
    figure(figsize=figsz)
    subplots_adjust(top=0.9, bottom=botta, left=0.2, right=0.8)
    # an empty title falls back to the output file name
    if len(titl) == 0:
        titl = outputFile
    plotExpBox(plotData, xtickLabels, showIndPoints, mark, markMean, showMean,
               notch, whisker, outliers, xlegendrotation, xlabe, ylabe, titl,
               showSampleSizes, showViolin, showBox, annot, trendData,
               showLegend, makePzfxFile, makeBinMatrix, dividePlots)
    #ylim([0,200])
    # one dotted horizontal line per pooled group median
    for m in medianToDraw:
        axhline(y=m, linestyle=':', color='gray')
    savefig(outputFile, bbox_inches="tight")
    if len(plotHistogramToFile) > 0:
        drawHistogram(plotHistogramToFile, plotData, xtickLabels)
        drawDensigram(plotHistogramToFile + ".density.png", plotData,
                      xtickLabels)