Python welchs_approximate_ttest_arr 예제들

프로그래밍 언어: Python

네임스페이스/패키지 이름: welchttest

메소드/함수: welchs_approximate_ttest_arr

hotexamples.com에서의 예제들: 10

Python welchs_approximate_ttest_arr - 10개의 예제를 찾았습니다. 오픈소스 프로젝트에서 추출한 Python welchttest.welchs_approximate_ttest_arr의 실제 사용 예제 가운데 평가가 가장 높은 것들입니다. 예제를 평가해 주시면 예제의 품질을 높이는 데 도움이 됩니다.

예제 #1

파일 보기

파일: plotExpBox.py 프로젝트: albertwcheng/albert-bioinformatics-scripts

def plotExpBox_Main(inputFile,header,cols,outputFile,sep,startRow,showIndPoints,mark,markMean,showMean,notch,whisker,outliers,plotPvalueCluster,outputClusterPrefix,methodCluster,xlegendrotation,xlabe,ylabe,figsz,titl,showSampleSizes):
	"""Load columns of one delimited file, print significance tests and save a box plot.

	inputFile is opened with generic_istream and every line is split on sep.
	Lines are counted from 1 and lines numbered below startRow are skipped.
	Each index in cols becomes one data series labelled header[col]; cells
	that are missing or do not parse as float are skipped.

	Written to stdout, in this order: a one-sample t-test of every series
	against 0, then pairwise p-value tables for ttest_ind,
	welchs_approximate_ttest_arr (element [3] of its result) and mannwhitneyu
	(its p-value multiplied by 2). Every table is handed to makePValueRawPlot,
	and also to makePValueClusterPlot (with methodCluster) when
	plotPvalueCluster is true; output names are outputClusterPrefix plus a
	per-test suffix.

	Finally a figure of size figsz is created, plotExpBox draws into it and
	the figure is saved to outputFile. An empty titl is replaced by
	outputFile. All remaining arguments are forwarded unchanged to plotExpBox.
	"""

	def printPValueTable(series,labels,pvalueOf):
		"""Print a pairwise p-value table to stdout and return it as a full symmetric matrix.

		Only the upper triangle is computed and printed; the diagonal is
		stored as 1.0 and the lower triangle is mirrored from rows that were
		already filled. pvalueOf(a,b) must return the p-value for two series.
		"""
		print >> stdout,"p-val",
		for x in range(0,len(series)):
			print >> stdout,labels[x],
		print >> stdout,""

		pvalueM=[]
		for x in range(0,len(series)):
			pvalueRow=[]
			pvalueM.append(pvalueRow)
			print >> stdout, labels[x],
			for y in range(0,len(series)):
				if y>x:
					pvalue=pvalueOf(series[x],series[y])
					print >> stdout, str(pvalue),
				else:
					# diagonal / lower triangle: emit an empty cell so the
					# printed columns stay aligned with the header
					print >> stdout, "",
					if x==y:
						pvalue=1.0
					else:
						pvalue=pvalueM[y][x]
				pvalueRow.append(pvalue)
			print >> stdout,""
		return pvalueM

	plotData=[[] for col in cols]
	xtickLabels=[header[col] for col in cols]

	fin=generic_istream(inputFile)
	try:
		for lino,lin in enumerate(fin,1):
			if lino<startRow:
				continue
			fields=lin.rstrip("\r\n").split(sep)

			for idx,col in enumerate(cols):
				try:
					plotData[idx].append(float(fields[col]))
				except (ValueError,IndexError):
					# non-numeric or missing cell: leave it out of the series
					pass
	finally:
		# release the stream even when reading fails part-way
		fin.close()

	print >> stdout,"student t-test (1 sample; mean=0)"
	print >> stdout,"sample","mean","p-val"

	for label,values in zip(xtickLabels,plotData):
		print >> stdout, label,mean(values),ttest_1samp(values,0)[1]

	print >> stdout,""

	print >> stdout,"student t-test (2 samples)"
	pvalueM=printPValueTable(plotData,xtickLabels,lambda a,b: ttest_ind(a,b)[1])

	print >> stdout,""

	makePValueRawPlot(outputClusterPrefix+"_t_raw",xtickLabels,pvalueM)

	if plotPvalueCluster:
		makePValueClusterPlot(outputClusterPrefix+"_t",xtickLabels,pvalueM,methodCluster)

	print >> stdout,"welch t-test"
	pvalueM=printPValueTable(plotData,xtickLabels,lambda a,b: welchs_approximate_ttest_arr(a,b)[3])

	makePValueRawPlot(outputClusterPrefix+"_Welch_raw",xtickLabels,pvalueM)

	if plotPvalueCluster:
		makePValueClusterPlot(outputClusterPrefix+"_Welch",xtickLabels,pvalueM,methodCluster)

	print >> stdout,""
	print >> stdout,"non-parametric (Mann-Whitney U)"
	# The p-value is doubled to turn a one-tailed result into a two-tailed one.
	# NOTE(review): this assumes mannwhitneyu returns a one-sided p-value;
	# confirm against the SciPy version in use.
	pvalueM=printPValueTable(plotData,xtickLabels,lambda a,b: mannwhitneyu(a,b)[1]*2)

	makePValueRawPlot(outputClusterPrefix+"_U_raw",xtickLabels,pvalueM)

	if plotPvalueCluster:
		makePValueClusterPlot(outputClusterPrefix+"_U",xtickLabels,pvalueM,methodCluster)

	figure(figsize=figsz)

	if len(titl)==0:
		titl=outputFile

	plotExpBox(plotData,xtickLabels,showIndPoints,mark,markMean,showMean,notch,whisker,outliers,xlegendrotation,xlabe,ylabe,titl,showSampleSizes)

	savefig(outputFile,bbox_inches="tight")

예제 #2

파일 보기

def plotExpBox_Main(inputFiles,headers,valcols,outputFile,sep,startRow,showIndPoints,mark,markMean,showMean,notch,whisker,outliers,plotPvalueCluster,outputClusterPrefix,methodCluster,xlegendrotation,xlabe,ylabe,figsz,titl,showSampleSizes,trimToMinSize,relabels,logb,plotHistogramToFile,plotMedianForGroups,botta):
	"""Load columns from several delimited files, print significance tests and save a box plot.

	inputFiles, headers and valcols are walked in parallel. Each file is
	opened with generic_istream and split on sep; lines are counted from 1
	and lines numbered below startRow are skipped. Every index in a file's
	column list becomes one data series labelled header[col]. Cells that are
	missing or do not parse as float are skipped. When logb is non-zero each
	value is replaced by log(value)/logb, and results below -100000 are
	dropped.

	trimToMinSize: when true, trimData(plotData,minSize) is called with the
	smallest series length seen across all loaded columns.
	relabels: replaces the leading entries of the label list, one by one.
	plotMedianForGroups: each entry is resolved against the labels with
	getCol0ListFromCol1ListStringAdv (the list is overwritten in place) and
	the median of the pooled series of each group is drawn as a dotted
	horizontal line.
	Series that end up empty are reported on stderr and removed.

	Written to stdout, in this order: a one-sample t-test of every series
	against 0, then pairwise p-value tables for ttest_ind,
	welchs_approximate_ttest_arr (element [3] of its result) and mannwhitneyu
	(its p-value multiplied by 2). A test that raises is reported as "NA"
	(one-sample) or as p-value 1.0 (pairwise). When plotPvalueCluster is true
	every table is handed to makePValueRawPlot and makePValueClusterPlot.

	The figure is created with figsz, its bottom margin set to botta,
	plotExpBox draws into it and it is saved to outputFile. An empty titl is
	replaced by outputFile. A non-empty plotHistogramToFile additionally
	triggers drawHistogram. All remaining arguments are forwarded unchanged
	to plotExpBox.
	"""

	def printPValueTable(series,labels,pvalueOf):
		"""Print a pairwise p-value table to stdout and return it as a full symmetric matrix.

		Only the upper triangle is computed and printed; the diagonal is
		stored as 1.0 and the lower triangle is mirrored from rows that were
		already filled. A failing pvalueOf(a,b) call counts as p-value 1.0.
		"""
		print >> stdout,"p-val",
		for x in range(0,len(series)):
			print >> stdout,labels[x],
		print >> stdout,""

		pvalueM=[]
		for x in range(0,len(series)):
			pvalueRow=[]
			pvalueM.append(pvalueRow)
			print >> stdout, labels[x],
			for y in range(0,len(series)):
				if y>x:
					try:
						pvalue=pvalueOf(series[x],series[y])
					except Exception:
						# best effort: a test that cannot be computed is
						# reported as "no evidence of a difference"
						pvalue=1.0
					print >> stdout, str(pvalue),
				else:
					# diagonal / lower triangle: emit an empty cell so the
					# printed columns stay aligned with the header
					print >> stdout, "",
					if x==y:
						pvalue=1.0
					else:
						pvalue=pvalueM[y][x]
				pvalueRow.append(pvalue)
			print >> stdout,""
		return pvalueM

	plotData=[]
	xtickLabels=[]

	# -1 means "no column seen yet"
	minSize=-1

	for inputFile,header,cols in zip(inputFiles,headers,valcols):
		startIdx=len(plotData)

		for col in cols:
			plotData.append([])
			xtickLabels.append(header[col])

		colIndices=range(startIdx,startIdx+len(cols))

		fin=generic_istream(inputFile)
		try:
			for lino,lin in enumerate(fin,1):
				if lino<startRow:
					continue
				fields=lin.rstrip("\r\n").split(sep)

				for idx,col in zip(colIndices,cols):
					try:
						value=float(fields[col])
						if logb!=0:
							value=log(value)/logb
							if value<-100000:
								raise ValueError
						plotData[idx].append(value)
					except (ValueError,IndexError,ArithmeticError):
						# missing, non-numeric or non-transformable cell:
						# leave it out of the series
						pass
		finally:
			# release the stream even when reading fails part-way
			fin.close()

		# Track the shortest series over every column of this file, not just
		# whichever column index the read loop happened to leave behind.
		for idx in colIndices:
			if minSize==-1:
				minSize=len(plotData[idx])
			else:
				minSize=min(minSize,len(plotData[idx]))

	if trimToMinSize:
		print >> stderr,"trimming to min size =",minSize
		trimData(plotData,minSize)

	if len(relabels)>0:
		# relabels may be shorter than the label list; only the leading
		# labels are replaced
		for i,relabel in enumerate(relabels):
			xtickLabels[i]=relabel

	# NOTE: the caller's list is overwritten in place with the resolved groups
	for i in range(0,len(plotMedianForGroups)):
		plotMedianForGroups[i]=getCol0ListFromCol1ListStringAdv(xtickLabels,plotMedianForGroups[i])

	# medians are computed before empty series are removed, while the
	# resolved column indices still match plotData
	medianToDraw=[]
	for mediangrouper in plotMedianForGroups:
		curD=[]
		for c in mediangrouper:
			curD.extend(plotData[c])
		medianToDraw.append(median(curD))

	# walk backwards so deletions do not shift the indices still to visit
	for c in range(len(plotData)-1,-1,-1):
		if len(plotData[c])==0:
			print >> stderr,xtickLabels[c],"discarded"
			del plotData[c]
			del xtickLabels[c]

	print >> stdout,"student t-test (1 sample; mean=0)"
	print >> stdout,"sample","mean","p-val"

	for x in range(0,len(plotData)):
		# compute first, print afterwards: a failing test must not leave a
		# half-written line on stdout
		try:
			sampleMean=mean(plotData[x])
			pvalue=ttest_1samp(plotData[x],0)[1]
		except Exception:
			sampleMean="NA"
			pvalue="NA"
		print >> stdout, xtickLabels[x],sampleMean,pvalue

	print >> stdout,""

	print >> stdout,"student t-test (2 samples)"
	pvalueM=printPValueTable(plotData,xtickLabels,lambda a,b: ttest_ind(a,b)[1])

	print >> stdout,""

	if plotPvalueCluster:
		makePValueRawPlot(outputClusterPrefix+"_t_raw",xtickLabels,pvalueM)
		makePValueClusterPlot(outputClusterPrefix+"_t",xtickLabels,pvalueM,methodCluster)

	print >> stdout,"welch t-test"
	pvalueM=printPValueTable(plotData,xtickLabels,lambda a,b: welchs_approximate_ttest_arr(a,b)[3])

	if plotPvalueCluster:
		makePValueRawPlot(outputClusterPrefix+"_Welch_raw",xtickLabels,pvalueM)
		makePValueClusterPlot(outputClusterPrefix+"_Welch",xtickLabels,pvalueM,methodCluster)

	print >> stdout,""
	print >> stdout,"non-parametric (Mann-Whitney U)"
	# The p-value is doubled to turn a one-tailed result into a two-tailed one.
	# NOTE(review): this assumes mannwhitneyu returns a one-sided p-value;
	# confirm against the SciPy version in use.
	pvalueM=printPValueTable(plotData,xtickLabels,lambda a,b: mannwhitneyu(a,b)[1]*2)

	if plotPvalueCluster:
		makePValueRawPlot(outputClusterPrefix+"_U_raw",xtickLabels,pvalueM)
		makePValueClusterPlot(outputClusterPrefix+"_U",xtickLabels,pvalueM,methodCluster)

	figure(figsize=figsz)
	subplots_adjust(top=0.9, bottom=botta, left=0.2, right=0.8)

	if len(titl)==0:
		titl=outputFile

	plotExpBox(plotData,xtickLabels,showIndPoints,mark,markMean,showMean,notch,whisker,outliers,xlegendrotation,xlabe,ylabe,titl,showSampleSizes)

	for m in medianToDraw:
		axhline(y=m,linestyle=':',color='gray')

	savefig(outputFile,bbox_inches="tight")

	if len(plotHistogramToFile)>0:
		drawHistogram(plotHistogramToFile,plotData,xtickLabels)

예제 #3

파일 보기

파일: plotExpBox2.py 프로젝트: albertwcheng/albert-bioinformatics-scripts

def plotExpBox_Main(inputFiles,headers,valcols,outputFile,sep,startRow,showIndPoints,mark,markMean,showMean,notch,whisker,outliers,plotPvalueCluster,outputClusterPrefix,methodCluster,xlegendrotation,xlabe,ylabe,figsz,titl,showSampleSizes,trimToMinSize,relabels,logb,plotHistogramToFile,plotMedianForGroups,botta):
	"""Load columns from several delimited files, print significance tests and save a box plot.

	inputFiles, headers and valcols are walked in parallel. Each file is
	opened with generic_istream and split on sep; lines are counted from 1
	and lines numbered below startRow are skipped. Every index in a file's
	column list becomes one data series labelled header[col]. Cells that are
	missing or do not parse as float are skipped. When logb is non-zero,
	values equal to 0.0 are dropped and every other value is replaced by
	log(value)/logb.

	trimToMinSize: when true, trimData(plotData,minSize) is called with the
	smallest series length seen across all loaded columns.
	relabels: when non-empty, the current labels and relabels are echoed to
	stderr and the leading entries of the label list are replaced one by one.
	plotMedianForGroups: each entry is resolved against the labels with
	getCol0ListFromCol1ListStringAdv (the list is overwritten in place) and
	the median of the pooled series of each group is drawn as a dotted
	horizontal line.
	Series that end up empty are reported on stderr and removed.

	Written to stdout, in this order: a one-sample t-test of every series
	against 0, then pairwise p-value tables for ttest_ind,
	welchs_approximate_ttest_arr (element [3] of its result) and mannwhitneyu
	(its p-value multiplied by 2). A test that raises is reported as "NA"
	(one-sample) or as p-value 1.0 (pairwise). When plotPvalueCluster is true
	every table is handed to makePValueRawPlot and makePValueClusterPlot.

	The figure is created with figsz, its bottom margin set to botta,
	plotExpBox draws into it and it is saved to outputFile. An empty titl is
	replaced by outputFile. A non-empty plotHistogramToFile additionally
	triggers drawHistogram. All remaining arguments are forwarded unchanged
	to plotExpBox.
	"""

	def printPValueTable(series,labels,pvalueOf):
		"""Print a pairwise p-value table to stdout and return it as a full symmetric matrix.

		Only the upper triangle is computed and printed; the diagonal is
		stored as 1.0 and the lower triangle is mirrored from rows that were
		already filled. A failing pvalueOf(a,b) call counts as p-value 1.0.
		"""
		print >> stdout,"p-val",
		for x in range(0,len(series)):
			print >> stdout,labels[x],
		print >> stdout,""

		pvalueM=[]
		for x in range(0,len(series)):
			pvalueRow=[]
			pvalueM.append(pvalueRow)
			print >> stdout, labels[x],
			for y in range(0,len(series)):
				if y>x:
					try:
						pvalue=pvalueOf(series[x],series[y])
					except Exception:
						# best effort: a test that cannot be computed is
						# reported as "no evidence of a difference"
						pvalue=1.0
					print >> stdout, str(pvalue),
				else:
					# diagonal / lower triangle: emit an empty cell so the
					# printed columns stay aligned with the header
					print >> stdout, "",
					if x==y:
						pvalue=1.0
					else:
						pvalue=pvalueM[y][x]
				pvalueRow.append(pvalue)
			print >> stdout,""
		return pvalueM

	plotData=[]
	xtickLabels=[]

	# -1 means "no column seen yet"
	minSize=-1

	for inputFile,header,cols in zip(inputFiles,headers,valcols):
		startIdx=len(plotData)

		for col in cols:
			plotData.append([])
			xtickLabels.append(header[col])

		colIndices=range(startIdx,startIdx+len(cols))

		fin=generic_istream(inputFile)
		try:
			for lino,lin in enumerate(fin,1):
				if lino<startRow:
					continue
				fields=lin.rstrip("\r\n").split(sep)

				for idx,col in zip(colIndices,cols):
					try:
						value=float(fields[col])
						if logb!=0:
							if value==0.0:
								# zero has no logarithm: drop the cell
								raise ValueError
							value=log(value)/logb
						plotData[idx].append(value)
					except (ValueError,IndexError,ArithmeticError):
						# missing, non-numeric or non-transformable cell:
						# leave it out of the series
						pass
		finally:
			# release the stream even when reading fails part-way
			fin.close()

		# Track the shortest series over every column of this file, not just
		# whichever column index the read loop happened to leave behind.
		for idx in colIndices:
			if minSize==-1:
				minSize=len(plotData[idx])
			else:
				minSize=min(minSize,len(plotData[idx]))

	if trimToMinSize:
		print >> stderr,"trimming to min size =",minSize
		trimData(plotData,minSize)

	if len(relabels)>0:
		print >> stderr,xtickLabels
		print >> stderr,relabels
		# relabels may be shorter than the label list; only the leading
		# labels are replaced
		for i,relabel in enumerate(relabels):
			xtickLabels[i]=relabel

	# NOTE: the caller's list is overwritten in place with the resolved groups
	for i in range(0,len(plotMedianForGroups)):
		plotMedianForGroups[i]=getCol0ListFromCol1ListStringAdv(xtickLabels,plotMedianForGroups[i])

	# medians are computed before empty series are removed, while the
	# resolved column indices still match plotData
	medianToDraw=[]
	for mediangrouper in plotMedianForGroups:
		curD=[]
		for c in mediangrouper:
			curD.extend(plotData[c])
		medianToDraw.append(median(curD))

	# walk backwards so deletions do not shift the indices still to visit
	for c in range(len(plotData)-1,-1,-1):
		if len(plotData[c])==0:
			print >> stderr,xtickLabels[c],"discarded"
			del plotData[c]
			del xtickLabels[c]

	print >> stdout,"student t-test (1 sample; mean=0)"
	print >> stdout,"sample","mean","p-val"

	for x in range(0,len(plotData)):
		# compute first, print afterwards: a failing test must not leave a
		# half-written line on stdout
		try:
			sampleMean=mean(plotData[x])
			pvalue=ttest_1samp(plotData[x],0)[1]
		except Exception:
			sampleMean="NA"
			pvalue="NA"
		print >> stdout, xtickLabels[x],sampleMean,pvalue

	print >> stdout,""

	print >> stdout,"student t-test (2 samples)"
	pvalueM=printPValueTable(plotData,xtickLabels,lambda a,b: ttest_ind(a,b)[1])

	print >> stdout,""

	if plotPvalueCluster:
		makePValueRawPlot(outputClusterPrefix+"_t_raw",xtickLabels,pvalueM)
		makePValueClusterPlot(outputClusterPrefix+"_t",xtickLabels,pvalueM,methodCluster)

	print >> stdout,"welch t-test"
	pvalueM=printPValueTable(plotData,xtickLabels,lambda a,b: welchs_approximate_ttest_arr(a,b)[3])

	if plotPvalueCluster:
		makePValueRawPlot(outputClusterPrefix+"_Welch_raw",xtickLabels,pvalueM)
		makePValueClusterPlot(outputClusterPrefix+"_Welch",xtickLabels,pvalueM,methodCluster)

	print >> stdout,""
	print >> stdout,"non-parametric (Mann-Whitney U)"
	# The p-value is doubled to turn a one-tailed result into a two-tailed one.
	# NOTE(review): this assumes mannwhitneyu returns a one-sided p-value;
	# confirm against the SciPy version in use.
	pvalueM=printPValueTable(plotData,xtickLabels,lambda a,b: mannwhitneyu(a,b)[1]*2)

	if plotPvalueCluster:
		makePValueRawPlot(outputClusterPrefix+"_U_raw",xtickLabels,pvalueM)
		makePValueClusterPlot(outputClusterPrefix+"_U",xtickLabels,pvalueM,methodCluster)

	figure(figsize=figsz)
	subplots_adjust(top=0.9, bottom=botta, left=0.2, right=0.8)

	if len(titl)==0:
		titl=outputFile

	plotExpBox(plotData,xtickLabels,showIndPoints,mark,markMean,showMean,notch,whisker,outliers,xlegendrotation,xlabe,ylabe,titl,showSampleSizes)

	for m in medianToDraw:
		axhline(y=m,linestyle=':',color='gray')

	savefig(outputFile,bbox_inches="tight")

	if len(plotHistogramToFile)>0:
		drawHistogram(plotHistogramToFile,plotData,xtickLabels)

예제 #4

파일 보기

def attachWelchpValue(filename, cols1, cols2, startRow1, sortByFDR):
    """Append a Welch t-test p-value and an FDR value to every row of a tab-separated file.

    Lines are counted from 1. Lines numbered below startRow1 are treated as
    header lines and echoed to stdout with the two new column titles
    appended. For every other line the cells at the indices in cols1 and
    cols2 form the two samples; cells that are missing or do not parse as
    float are ignored. The p-value is element [3] of
    welchttest.welchs_approximate_ttest_arr(sample1, sample2); when that call
    raises, the p-value is set to 1.0.

    The FDR of a row is totalRows * p / rank, where rank is the number of
    rows whose p-value is less than or equal to p. The value is neither
    capped at 1 nor adjusted to be monotone.

    Rows are written to stdout as "<line>\\t<p-value>\\t<FDR>", in ascending
    p-value order when sortByFDR is true and in file order otherwise.
    """

    def collectFloats(cells, indices):
        """Return the cells at the given indices that parse as float."""
        values = []
        for i0 in indices:
            try:
                values.append(float(cells[i0]))
            except (ValueError, IndexError):
                # non-numeric or missing cell: not part of the sample
                pass
        return values

    # p-value -> list of [line, p-value, FDR] entries sharing that p-value
    pvaluesMap = dict()
    orderedAsInFile = []

    with open(filename) as fin:
        for lino, line in enumerate(fin, 1):
            # Strip only the line ending: removing leading or trailing tabs
            # would shift the column indices of rows with empty outer cells.
            line = line.rstrip("\r\n")
            if lino < startRow1:
                print >> sys.stdout, line + "\tWelch p-value\tWelch FDR"
                continue
            spliton = line.split("\t")
            arr1 = collectFloats(spliton, cols1)
            arr2 = collectFloats(spliton, cols2)

            try:
                pval = welchttest.welchs_approximate_ttest_arr(arr1, arr2)[3]
            except Exception:
                # best effort: an uncomputable test counts as not significant
                pval = 1.0

            # the same list object is shared by both containers, so the FDR
            # filled in below is visible through either of them
            thisEntry = [line, pval, 0]
            pvaluesMap.setdefault(pval, []).append(thisEntry)
            orderedAsInFile.append(thisEntry)

    totalEntry = len(orderedAsInFile)
    nAlready = 0

    sortedpvalues = sorted(pvaluesMap)

    for pval in sortedpvalues:
        copvalues = pvaluesMap[pval]
        # rows tied on a p-value all receive the rank of the last tied row
        nAlready += len(copvalues)
        FDR = totalEntry * float(pval) / nAlready
        for copvalue in copvalues:
            copvalue[2] = FDR

    if sortByFDR:
        outputRows = []
        for pval in sortedpvalues:
            outputRows.extend(pvaluesMap[pval])
    else:
        outputRows = orderedAsInFile

    for line, pval, FDR in outputRows:
        print >> sys.stdout, line + "\t" + str(pval) + "\t" + str(FDR)

예제 #5

파일 보기

def plotExpBox_Main(inputFiles,headers,valcols,outputFile,sep,startRow,showIndPoints,mark,markMean,showMean,notch,whisker,outliers,plotPvalueCluster,outputClusterPrefix,methodCluster,xlegendrotation,xlabe,ylabe,figsz,titl,showSampleSizes,trimToMinSize,relabels):
	"""Load columns from several delimited files, print significance tests and save a box plot.

	inputFiles, headers and valcols are walked in parallel. Each file is
	opened with generic_istream and split on sep; lines are counted from 1
	and lines numbered below startRow are skipped. Every index in a file's
	column list becomes one data series labelled header[col]. Cells that are
	missing or do not parse as float are skipped.

	trimToMinSize: when true, trimData(plotData,minSize) is called with the
	smallest series length seen across all loaded columns.
	relabels: when non-empty it must have exactly one entry per series and
	then replaces the labels; on a length mismatch a message is written to
	stderr and exit() is called.

	Written to stdout, in this order: a one-sample t-test of every series
	against 0, then pairwise p-value tables for ttest_ind,
	welchs_approximate_ttest_arr (element [3] of its result) and mannwhitneyu
	(its p-value multiplied by 2). When plotPvalueCluster is true every table
	is handed to makePValueRawPlot and makePValueClusterPlot.

	Finally a figure of size figsz is created, plotExpBox draws into it and
	the figure is saved to outputFile. An empty titl is replaced by
	outputFile. All remaining arguments are forwarded unchanged to plotExpBox.
	"""

	def printPValueTable(series,labels,pvalueOf):
		"""Print a pairwise p-value table to stdout and return it as a full symmetric matrix.

		Only the upper triangle is computed and printed; the diagonal is
		stored as 1.0 and the lower triangle is mirrored from rows that were
		already filled. pvalueOf(a,b) must return the p-value for two series.
		"""
		print >> stdout,"p-val",
		for x in range(0,len(series)):
			print >> stdout,labels[x],
		print >> stdout,""

		pvalueM=[]
		for x in range(0,len(series)):
			pvalueRow=[]
			pvalueM.append(pvalueRow)
			print >> stdout, labels[x],
			for y in range(0,len(series)):
				if y>x:
					pvalue=pvalueOf(series[x],series[y])
					print >> stdout, str(pvalue),
				else:
					# diagonal / lower triangle: emit an empty cell so the
					# printed columns stay aligned with the header
					print >> stdout, "",
					if x==y:
						pvalue=1.0
					else:
						pvalue=pvalueM[y][x]
				pvalueRow.append(pvalue)
			print >> stdout,""
		return pvalueM

	plotData=[]
	xtickLabels=[]

	# -1 means "no column seen yet"
	minSize=-1

	for inputFile,header,cols in zip(inputFiles,headers,valcols):
		startIdx=len(plotData)

		for col in cols:
			plotData.append([])
			xtickLabels.append(header[col])

		colIndices=range(startIdx,startIdx+len(cols))

		fin=generic_istream(inputFile)
		try:
			for lino,lin in enumerate(fin,1):
				if lino<startRow:
					continue
				fields=lin.rstrip("\r\n").split(sep)

				for idx,col in zip(colIndices,cols):
					try:
						plotData[idx].append(float(fields[col]))
					except (ValueError,IndexError):
						# non-numeric or missing cell: leave it out of the series
						pass
		finally:
			# release the stream even when reading fails part-way
			fin.close()

		# Track the shortest series over every column of this file, not just
		# whichever column index the read loop happened to leave behind.
		for idx in colIndices:
			if minSize==-1:
				minSize=len(plotData[idx])
			else:
				minSize=min(minSize,len(plotData[idx]))

	if trimToMinSize:
		print >> stderr,"trimming to min size =",minSize
		trimData(plotData,minSize)

	if len(relabels)>0:
		if len(relabels)!=len(xtickLabels):
			print >> stderr,"relabels doesn't have the same length as original label vectors",xtickLabels,"=>",relabels
			exit()

		xtickLabels=relabels

	print >> stdout,"student t-test (1 sample; mean=0)"
	print >> stdout,"sample","mean","p-val"

	for x in range(0,len(plotData)):
		print >> stdout, xtickLabels[x],mean(plotData[x]),ttest_1samp(plotData[x],0)[1]

	print >> stdout,""

	print >> stdout,"student t-test (2 samples)"
	pvalueM=printPValueTable(plotData,xtickLabels,lambda a,b: ttest_ind(a,b)[1])

	print >> stdout,""

	if plotPvalueCluster:
		makePValueRawPlot(outputClusterPrefix+"_t_raw",xtickLabels,pvalueM)
		makePValueClusterPlot(outputClusterPrefix+"_t",xtickLabels,pvalueM,methodCluster)

	print >> stdout,"welch t-test"
	pvalueM=printPValueTable(plotData,xtickLabels,lambda a,b: welchs_approximate_ttest_arr(a,b)[3])

	if plotPvalueCluster:
		makePValueRawPlot(outputClusterPrefix+"_Welch_raw",xtickLabels,pvalueM)
		makePValueClusterPlot(outputClusterPrefix+"_Welch",xtickLabels,pvalueM,methodCluster)

	print >> stdout,""
	print >> stdout,"non-parametric (Mann-Whitney U)"
	# The p-value is doubled to turn a one-tailed result into a two-tailed one.
	# NOTE(review): this assumes mannwhitneyu returns a one-sided p-value;
	# confirm against the SciPy version in use.
	pvalueM=printPValueTable(plotData,xtickLabels,lambda a,b: mannwhitneyu(a,b)[1]*2)

	if plotPvalueCluster:
		makePValueRawPlot(outputClusterPrefix+"_U_raw",xtickLabels,pvalueM)
		makePValueClusterPlot(outputClusterPrefix+"_U",xtickLabels,pvalueM,methodCluster)

	figure(figsize=figsz)

	if len(titl)==0:
		titl=outputFile

	plotExpBox(plotData,xtickLabels,showIndPoints,mark,markMean,showMean,notch,whisker,outliers,xlegendrotation,xlabe,ylabe,titl,showSampleSizes)

	savefig(outputFile,bbox_inches="tight")

예제 #6

파일 보기

파일: plotViolinBox.py 프로젝트: jidiazhernandez/albert-bioinformatics-scripts

def plotExpBox_Main(inputFiles, headers, valcols, outputFile, sep, startRow,
                    showIndPoints, mark, markMean, showMean, notch, whisker,
                    outliers, plotPvalueCluster, outputClusterPrefix,
                    methodCluster, xlegendrotation, xlabe, ylabe, figsz, titl,
                    showSampleSizes, trimToMinSize, relabels, logb,
                    plotHistogramToFile, plotMedianForGroups, botta,
                    showViolin, showBox, firstColAnnot, plotTrend, showLegend,
                    makePzfxFile, makeBinMatrix, writeDataSummaryStat,
                    summaryStatRange, minuslog10pvalue, minNDataToKeep,
                    vfacecolor, valpha, outXYZPvalues, dividePlots):

    #if plotPvalueCluster:
    #if pvalue cluster is needed:
    #	from Bio.Cluster.cluster import *
    #	from Bio.Cluster import *
    #endif

    #the real deal!
    plotData = []
    xtickLabels = []

    trendData = {}
    annot = {}

    minSize = -1

    for inputFile, header, cols in zip(inputFiles, headers, valcols):
        fin = generic_istream(inputFile)

        startIdx = len(plotData)

        if firstColAnnot:
            colAnnot = cols[0]
            cols = cols[1:]
            annotThisFile = []
            annot[startIdx] = annotThisFile
        else:
            colAnnot = -1
            annotThisFile = None

        for col in cols:
            plotData.append([])
            xtickLabels.append(header[col])

        colIndices = range(startIdx, startIdx + len(cols))

        if plotTrend:
            #print >> stderr,"plotTrend"
            trendDataThisFile = []
            trendData[startIdx] = trendDataThisFile
        else:
            trendDataThisFile = None

        lino = 0
        for lin in fin:
            lino += 1
            if lino < startRow:
                continue
            fields = lin.rstrip("\r\n").split(sep)

            if plotTrend:
                #print >> stderr,"a"
                trendDataThisLine = []
            else:
                trendDataThisLine = None

            allDataOKThisLine = True

            if colAnnot >= 0:
                annotThisFile.append(fields[colAnnot])

            for idx, col in zip(colIndices, cols):
                try:
                    value = float(fields[col])
                    if logb != 0:
                        if value == 0.0:
                            raise ValueError
                        value = log(value) / logb
                    plotData[idx].append(value)

                    if plotTrend:
                        trendDataThisLine.append(value)
                        #print >> stderr,"value:",value

                except:
                    allDataOKThisLine = False

            if plotTrend:
                if allDataOKThisLine:
                    trendDataThisFile.append(trendDataThisLine)
                else:
                    trendDataThisFile.append(None)

        fin.close()

        if minSize == -1:
            minSize = len(plotData[idx])  #or startIDX?
        else:
            minSize = min([minSize, len(plotData[idx])])

    if trimToMinSize:
        print >> stderr, "trimming to min size =", minSize
        trimData(plotData, minSize)

    if len(relabels) > 0:
        #if len(relabels)!=len(xtickLabels):
        #	print >> stderr,"relabels doesn't have the same length as original label vectors",xtickLabels,"=>",relabels
        #	exit()
        print >> stderr, xtickLabels
        print >> stderr, relabels
        for i, relabel in zip(range(0, len(relabels)), relabels):
            xtickLabels[i] = relabel

    for i in range(0, len(plotMedianForGroups)):
        plotMedianForGroups[i] = getCol0ListFromCol1ListStringAdv(
            xtickLabels, plotMedianForGroups[i])

    #drawing medians:
    medianToDraw = []
    for mediangrouper in plotMedianForGroups:
        curD = []
        for c in mediangrouper:
            curD.extend(plotData[c])
        medianToDraw.append(median(curD))

    for c in range(len(plotData) - 1, -1, -1):
        if len(plotData[c]) < minNDataToKeep:
            print >> stderr, xtickLabels[c], "discarded because has only", len(
                plotData[c]), "data points <", minNDataToKeep
            del plotData[c]
            del xtickLabels[c]

    if not skipStat:
        print >> stdout, "student t-test (1 sample; mean=0)"
        print >> stdout, "sample", "mean", "p-val", "median"

        if writeDataSummaryStat:
            fDSS = open(writeDataSummaryStat, "w")
            print >> fDSS, "sample\tmean\tvar\tsd\tmin\tmax\tN\tNInRange[" + str(
                summaryStatRange[0]) + "," + str(
                    summaryStatRange[1]
                ) + "]\t%NInRange\tNbelowRange\t%Nbelow\tNAboveRange\t%NAbove"

        for x in range(0, len(plotData)):
            #print >> stderr, len(plotData[x])
            try:
                print >> stdout, xtickLabels[x], mean(
                    plotData[x]), ttest_1samp(plotData[x],
                                              0)[1], median(plotData[x])
            except:
                print >> stdout, xtickLabels[x], mean(
                    plotData[x]), "NA", median(plotData[x])

            if writeDataSummaryStat:
                sumData, N, NIN, NBelow, NAbove = filterDataInRangeInclusive(
                    plotData[x], summaryStatRange[0], summaryStatRange[1])

                if NIN > 1:
                    #print >> stderr,"sumData=",sumData
                    #print >> stderr,mean
                    mea = mean2(sumData)
                    DDOF = 1
                    sd = std(sumData, ddof=DDOF)
                    var = sd * sd
                    mi = min(sumData)
                    ma = max(sumData)
                else:
                    mea = "NA"
                    sd = "NA"
                    var = "NA"
                    mi = "NA"
                    ma = "NA"

                print >> fDSS, xtickLabels[x] + "\t" + str(mea) + "\t" + str(
                    var) + "\t" + str(sd) + "\t" + str(mi) + "\t" + str(
                        ma) + "\t" + str(N) + "\t" + str(NIN) + "\t" + str(
                            float(NIN) * 100 /
                            N) + "\t" + str(NBelow) + "\t" + str(
                                float(NBelow) * 100 /
                                N) + "\t" + str(NAbove) + "\t" + str(
                                    float(NAbove) * 100 / N)

        pvalueM = []

        if writeDataSummaryStat:
            fDSS.close()

        print >> stdout, ""

        print >> stdout, "student t-test (2 samples)"
        print >> stdout, "p-val",
        for x in range(0, len(plotData)):
            print >> stdout, xtickLabels[x],

        print >> stdout, ""

        for x in range(0, len(plotData)):
            pvalueRow = []
            pvalueM.append(pvalueRow)
            print >> stdout, xtickLabels[x],
            for y in range(0, len(plotData)):
                if y <= x:
                    print >> stdout, "",
                    if x == y:
                        if minuslog10pvalue:
                            pvalueRow.append(0.0)
                        else:
                            pvalueRow.append(1.0)
                    else:
                        pvalueRow.append(pvalueM[y][x])
                else:
                    try:
                        pvalue = ttest_ind(plotData[x], plotData[y])[1]
                    except:
                        pvalue = 1.0

                    if minuslog10pvalue and str(pvalue) != "NA":
                        try:
                            pvalue = -1 * log(pvalue, 10)
                        except:
                            pvalue = -1000.0

                    print >> stdout, str(pvalue),
                    pvalueRow.append(pvalue)
            print >> stdout, ""

        print >> stdout, ""

        if plotPvalueCluster:
            makePValueRawPlot(outputClusterPrefix + "_t_raw", xtickLabels,
                              pvalueM)
            makePValueClusterPlot(outputClusterPrefix + "_t", xtickLabels,
                                  pvalueM, methodCluster)

        pvalueM = []

        print >> stdout, "welch t-test"
        print >> stdout, "p-val",
        for x in range(0, len(plotData)):
            print >> stdout, xtickLabels[x],

        print >> stdout, ""
        for x in range(0, len(plotData)):
            pvalueRow = []
            pvalueM.append(pvalueRow)
            print >> stdout, xtickLabels[x],
            for y in range(0, len(plotData)):
                if y <= x:
                    print >> stdout, "",
                    if x == y:
                        if minuslog10pvalue:
                            pvalueRow.append(0.0)
                        else:
                            pvalueRow.append(1.0)
                    else:
                        pvalueRow.append(pvalueM[y][x])

                else:
                    try:
                        pvalue = welchs_approximate_ttest_arr(
                            plotData[x], plotData[y])[3]
                    except:
                        pvalue = 1.0

                    if minuslog10pvalue and str(pvalue) != "NA":
                        try:
                            pvalue = -1 * log(pvalue, 10)
                        except:
                            pvalue = -1000.0

                    print >> stdout, str(pvalue),
                    pvalueRow.append(pvalue)
            print >> stdout, ""

        if outXYZPvalues:
            writeXYZPvalues(outXYZPvalues + "_Welch.xyz", xtickLabels, pvalueM)

        if plotPvalueCluster:
            makePValueRawPlot(outputClusterPrefix + "_Welch_raw", xtickLabels,
                              pvalueM)
            makePValueClusterPlot(outputClusterPrefix + "_Welch", xtickLabels,
                                  pvalueM, methodCluster)

        print >> stdout, ""
        print >> stdout, "non-parametric (Mann-Whitney U)"  #"non-parametric (Mann-Whitney U if larger n<=20 else Wilcoxon)"
        print >> stdout, "p-val",
        for x in range(0, len(plotData)):
            print >> stdout, xtickLabels[x],

        pvalueM = []

        print >> stdout, ""
        for x in range(0, len(plotData)):
            pvalueRow = []
            pvalueM.append(pvalueRow)
            print >> stdout, xtickLabels[x],
            for y in range(0, len(plotData)):
                if y <= x:
                    print >> stdout, "",
                    if x == y:
                        if minuslog10pvalue:
                            pvalueRow.append(0.0)
                        else:
                            pvalueRow.append(1.0)
                    else:
                        pvalueRow.append(pvalueM[y][x])
                else:
                    #if max(len(plotData[x]),len(plotData[y]))<=20:
                    try:
                        pvalue = mannwhitneyu(plotData[x], plotData[y])[1] * 2
                    except:
                        pvalue = 1.0

                    if minuslog10pvalue and str(pvalue) != "NA":
                        try:
                            pvalue = -1 * log(pvalue, 10)
                        except:
                            pvalue = -1000.0

                    print >> stdout, pvalue,  #mann-whiteney need to mul by 2 (one tail to two tail)
                    pvalueRow.append(pvalue)
                    #else:
                    #	print >>  stdout,wilcoxon(plotData[x],plotData[y])[1], # this is two-tailed already stdout, "", #
            print >> stdout, ""

        if outXYZPvalues:
            writeXYZPvalues(outXYZPvalues + "_U.xyz", xtickLabels, pvalueM)

        if plotPvalueCluster:
            makePValueRawPlot(outputClusterPrefix + "_U_raw", xtickLabels,
                              pvalueM)
            makePValueClusterPlot(outputClusterPrefix + "_U", xtickLabels,
                                  pvalueM, methodCluster)

        #####now the variance tests

        print >> stdout, ""
        print >> stdout, "Ansari-Bradley Two-sample Test for difference in scale parameters "
        print >> stdout, "p-val",

        for x in range(0, len(plotData)):
            print >> stdout, xtickLabels[x],

        pvalueM = []

        print >> stdout, ""
        for x in range(0, len(plotData)):
            pvalueRow = []
            pvalueM.append(pvalueRow)
            print >> stdout, xtickLabels[x],
            for y in range(0, len(plotData)):
                if y <= x:
                    print >> stdout, "",
                    if x == y:
                        if minuslog10pvalue:
                            pvalueRow.append(0.0)
                        else:
                            pvalueRow.append(1.0)
                    else:
                        pvalueRow.append(pvalueM[y][x])
                else:
                    #if max(len(plotData[x]),len(plotData[y]))<=20:
                    try:
                        pvalue = ansari(plotData[x], plotData[y])[1]
                    except:
                        pvalue = "NA"

                    if minuslog10pvalue and str(pvalue) != "NA":
                        try:
                            pvalue = -1 * log(pvalue, 10)
                        except:
                            pvalue = -1000.0

                        #pvalue=1.0
                    print >> stdout, pvalue,
                    pvalueRow.append(pvalue)
                    #else:
                    #	print >>  stdout,wilcoxon(plotData[x],plotData[y])[1], # this is two-tailed already stdout, "", #
            print >> stdout, ""

        if plotPvalueCluster:
            makePValueRawPlot(outputClusterPrefix + "_Ansari_raw", xtickLabels,
                              pvalueM)
            makePValueClusterPlot(outputClusterPrefix + "_Ansari", xtickLabels,
                                  pvalueM, methodCluster)

        #####

        #####now the variance tests

        print >> stdout, ""
        print >> stdout, "Fligner's Two-sample Test for equal variance (non-parametrics)"
        print >> stdout, "p-val",

        for x in range(0, len(plotData)):
            print >> stdout, xtickLabels[x],

        pvalueM = []

        print >> stdout, ""
        for x in range(0, len(plotData)):
            pvalueRow = []
            pvalueM.append(pvalueRow)
            print >> stdout, xtickLabels[x],
            for y in range(0, len(plotData)):
                if y <= x:
                    print >> stdout, "",
                    if x == y:
                        if minuslog10pvalue:
                            pvalueRow.append(0.0)
                        else:
                            pvalueRow.append(1.0)
                    else:
                        pvalueRow.append(pvalueM[y][x])
                else:
                    #if max(len(plotData[x]),len(plotData[y]))<=20:
                    try:
                        pvalue = fligner(plotData[x], plotData[y])[1]
                    except:
                        pvalue = "NA"
                        #pvalue=1.0

                    if minuslog10pvalue and str(pvalue) != "NA":
                        try:
                            pvalue = -1 * log(pvalue, 10)
                        except:
                            pvalue = -1000.0

                    print >> stdout, pvalue,
                    pvalueRow.append(pvalue)
                    #else:
                    #	print >>  stdout,wilcoxon(plotData[x],plotData[y])[1], # this is two-tailed already stdout, "", #
            print >> stdout, ""

        if plotPvalueCluster:
            makePValueRawPlot(outputClusterPrefix + "_fligner_raw",
                              xtickLabels, pvalueM)
            makePValueClusterPlot(outputClusterPrefix + "_fligner",
                                  xtickLabels, pvalueM, methodCluster)

        #####

        #####now the variance tests

        print >> stdout, ""
        print >> stdout, "Levene's Two-sample Test for equal variance"
        print >> stdout, "p-val",

        for x in range(0, len(plotData)):
            print >> stdout, xtickLabels[x],

        pvalueM = []

        print >> stdout, ""
        for x in range(0, len(plotData)):
            pvalueRow = []
            pvalueM.append(pvalueRow)
            print >> stdout, xtickLabels[x],
            for y in range(0, len(plotData)):
                if y <= x:
                    print >> stdout, "",
                    if x == y:
                        if minuslog10pvalue:
                            pvalueRow.append(0.0)
                        else:
                            pvalueRow.append(1.0)
                    else:
                        pvalueRow.append(pvalueM[y][x])
                else:
                    #if max(len(plotData[x]),len(plotData[y]))<=20:
                    try:
                        pvalue = levene(plotData[x], plotData[y])[1]
                    except:
                        pvalue = "NA"
                        #pvalue=1.0

                    if minuslog10pvalue and str(pvalue) != "NA":
                        try:
                            pvalue = -1 * log(pvalue, 10)
                        except:
                            pvalue = -1000.0

                    print >> stdout, pvalue,
                    pvalueRow.append(pvalue)
                    #else:
                    #	print >>  stdout,wilcoxon(plotData[x],plotData[y])[1], # this is two-tailed already stdout, "", #
            print >> stdout, ""

        if plotPvalueCluster:
            makePValueRawPlot(outputClusterPrefix + "_levene_raw", xtickLabels,
                              pvalueM)
            makePValueClusterPlot(outputClusterPrefix + "_levene", xtickLabels,
                                  pvalueM, methodCluster)

        #####

        #####now the variance tests

        print >> stdout, ""
        print >> stdout, "Bartlett's Two-sample Test for equal variance (for normal distributions)"
        print >> stdout, "p-val",

        for x in range(0, len(plotData)):
            print >> stdout, xtickLabels[x],

        pvalueM = []

        print >> stdout, ""
        for x in range(0, len(plotData)):
            pvalueRow = []
            pvalueM.append(pvalueRow)
            print >> stdout, xtickLabels[x],
            for y in range(0, len(plotData)):
                if y <= x:
                    print >> stdout, "",
                    if x == y:
                        if minuslog10pvalue:
                            pvalueRow.append(0.0)
                        else:
                            pvalueRow.append(1.0)
                    else:
                        pvalueRow.append(pvalueM[y][x])
                else:
                    #if max(len(plotData[x]),len(plotData[y]))<=20:
                    try:
                        pvalue = bartlett(plotData[x], plotData[y])[1]
                    except:
                        pvalue = "NA"
                        #pvalue=1.0

                    if minuslog10pvalue and str(pvalue) != "NA":
                        try:
                            pvalue = -1 * log(pvalue, 10)
                        except:
                            pvalue = -1000.0

                    print >> stdout, pvalue,
                    pvalueRow.append(pvalue)
                    #else:
                    #	print >>  stdout,wilcoxon(plotData[x],plotData[y])[1], # this is two-tailed already stdout, "", #
            print >> stdout, ""

        if plotPvalueCluster:
            makePValueRawPlot(outputClusterPrefix + "_bartlett_raw",
                              xtickLabels, pvalueM)
            makePValueClusterPlot(outputClusterPrefix + "_bartlett",
                                  xtickLabels, pvalueM, methodCluster)

        #####

    figure(figsize=figsz)
    subplots_adjust(top=0.9, bottom=botta, left=0.2, right=0.8)

    if len(titl) == 0:
        titl = outputFile

    plotExpBox(plotData, xtickLabels, showIndPoints, mark, markMean, showMean,
               notch, whisker, outliers, xlegendrotation, xlabe, ylabe, titl,
               showSampleSizes, showViolin, showBox, annot, trendData,
               showLegend, makePzfxFile, makeBinMatrix, dividePlots)

    #ylim([0,200])
    for m in medianToDraw:
        axhline(y=m, linestyle=':', color='gray')

    savefig(outputFile, bbox_inches="tight")

    if len(plotHistogramToFile) > 0:
        drawHistogram(plotHistogramToFile, plotData, xtickLabels)
        drawDensigram(plotHistogramToFile + ".density.png", plotData,
                      xtickLabels)

예제 #7

파일 보기

파일: ttest.py 프로젝트: DevasenaInupakutika/MIT_OCW_DATASCIENCE

    candtomoney[name].append(amount)

obama = candtomoney["Obama Barack"]
mccain = candtomoney["McCain John S"]
print len(obama), len(mccain)

# certainly the means look different...
print "Obama mean, stdev", numpy.mean(obama), numpy.std(obama)
print "McCain mean, stdev", numpy.mean(mccain), numpy.std(mccain)

# running a ttest of independent samples suggests that they aren't the same mean
print "ttest, equal variances", scipy.stats.ttest_ind(obama, mccain)

# small white lie: there is no reason to believe that these two samples have
# equal variance, so let's use a welch test.
print "welch", welchttest.welchs_approximate_ttest_arr(obama, mccain)

# but we've been lying to you.  you should only run ttests on normal data, so run
# the shapiro-wilk test of normalcy
print "obama shapiro", scipy.stats.shapiro(obama)
print "mccain shapiro", scipy.stats.shapiro(mccain)

# ooops...we have to reject the null hypothesis: it's very unlikely these two
# are normally distributed.  it's actually not that bad for a ttest, but just to
# be sure, let's run a non-parametric test called the Mann-Whitney U test.
print "mann-whitney U", scipy.stats.mannwhitneyu(obama, mccain)

# cool!  the p-value is 0.  So if our alpha was .05 or .01, we'd still be below
# it.  it's unlikely these two are from the same distribution, and thus we
# can safely reject the null hypothesis that they have the same mean.  Obama's
# donations were really smaller than McCain's!

예제 #8

파일 보기

파일: ttest.py 프로젝트: vkiran2112/dataiap

    candtomoney[name].append(amount)

obama = candtomoney["Obama Barack"]
mccain = candtomoney["McCain John S"]
print len(obama), len(mccain)

# certainly the means look different...
print "Obama mean, stdev", numpy.mean(obama), numpy.std(obama)
print "McCain mean, stdev", numpy.mean(mccain), numpy.std(mccain)

# running a ttest of independent samples suggests that they aren't the same mean
print "ttest, equal variances", scipy.stats.ttest_ind(obama, mccain)

# small white lie: there is no reason to believe that these two samples have
# equal variance, so let's use a welch test.
print "welch", welchttest.welchs_approximate_ttest_arr(obama, mccain)

# but we've been lying to you.  you should only run ttests on normal data, so run
# the shapiro-wilk test of normalcy
print "obama shapiro", scipy.stats.shapiro(obama)
print "mccain shapiro", scipy.stats.shapiro(mccain)

# ooops...we have to reject the null hypothesis: it's very unlikely these two
# are normally distributed.  it's actually not that bad for a ttest, but just to
# be sure, let's run a non-parametric test called the Mann-Whitney U test.
print "mann-whitney U", scipy.stats.mannwhitneyu(obama, mccain)

# cool!  the p-value is 0.  So if our alpha was .05 or .01, we'd still be below
# it.  it's unlikely these two are from the same distribution, and thus we
# can safely reject the null hypothesis that they have the same mean.  Obama's
# donations were really smaller than McCain's!

예제 #9

파일 보기

파일: attachWelchpValue.py 프로젝트: albertwcheng/albert-bioinformatics-scripts

def _floatsAt(fields, cols):
    """Return the values of fields[c] for c in cols that parse as floats.

    Columns that are missing from the row or that do not hold a number are
    skipped silently (best-effort parsing, as the callers expect).
    """
    values = []
    for c in cols:
        try:
            values.append(float(fields[c]))
        except (ValueError, IndexError):
            pass
    return values


def attachWelchpValue(filename, cols1, cols2, startRow1, sortByFDR):
    """Append a Welch t-test p-value and an FDR estimate to every data row.

    The tab-separated file `filename` is read line by line.  Lines numbered
    below `startRow1` (1-based) are treated as header lines and echoed to
    stdout with two extra column titles.  For every other line the numeric
    values found in columns `cols1` and in columns `cols2` (0-based indices)
    form the two groups compared with welchs_approximate_ttest_arr; the
    p-value is element [3] of its result.

    The FDR written for a row with p-value p is
        (number of data rows) * p / (number of data rows with p-value <= p)
    i.e. a Benjamini-Hochberg style estimate; it is not capped at 1 and no
    monotonicity adjustment is applied.

    If `sortByFDR` is true the rows are written in ascending p-value order,
    otherwise in file order.  Nothing is returned; output goes to sys.stdout.
    """
    write = sys.stdout.write

    pvaluesMap = {}        # p-value -> list of entries sharing that p-value
    orderedAsInFile = []   # the same entry objects, in file order
    # each entry is the mutable list [line, p-value, FDR]

    fin = open(filename)
    try:
        lino = 0
        for line in fin:
            # Only drop the line terminator: stripping all whitespace would
            # eat leading/trailing tabs (empty first/last fields) and shift
            # every column index.
            line = line.rstrip("\r\n")
            lino += 1
            if lino < startRow1:
                # Same tab-joined layout as the data rows (no stray space).
                write(line + "\tWelch p-value\tWelch FDR\n")
                continue

            fields = line.split("\t")
            arr1 = _floatsAt(fields, cols1)
            arr2 = _floatsAt(fields, cols2)

            # Welch's test needs a variance estimate from each group, i.e. at
            # least two values per group; otherwise use the same neutral
            # fallback as for a failed test.
            pval = 1.0
            if len(arr1) >= 2 and len(arr2) >= 2:
                try:
                    pval = welchttest.welchs_approximate_ttest_arr(arr1, arr2)[3]
                except Exception:
                    # best effort: a failing test counts as "no evidence"
                    pval = 1.0

            thisEntry = [line, pval, 0]
            pvaluesMap.setdefault(pval, []).append(thisEntry)
            orderedAsInFile.append(thisEntry)
    finally:
        fin.close()

    # Now calculate the FDR, walking the distinct p-values in ascending order.
    totalEntry = len(orderedAsInFile)
    nAlready = 0   # number of rows whose p-value is <= the current one
    sortedpvalues = sorted(pvaluesMap)

    for pval in sortedpvalues:
        copvalues = pvaluesMap[pval]
        nAlready += len(copvalues)
        FDR = totalEntry * float(pval) / nAlready
        for copvalue in copvalues:
            copvalue[2] = FDR

    if sortByFDR:
        entries = []
        for pval in sortedpvalues:
            entries.extend(pvaluesMap[pval])
    else:
        entries = orderedAsInFile

    for line, pval, FDR in entries:
        write(line + "\t" + str(pval) + "\t" + str(FDR) + "\n")

예제 #10

파일 보기

파일: plotViolinBox.py 프로젝트: albertwcheng/albert-bioinformatics-scripts

def _negLog10OrSentinel(pvalue):
    """Return -log10(pvalue), or -1000.0 when the logarithm cannot be taken.

    NOTE(review): -log10 of a vanishing p-value is large and *positive*, so a
    negative sentinel ranks such a p-value as the least significant entry.
    The value is kept unchanged for backward compatibility -- confirm intent.
    """
    try:
        return -1*log(pvalue,10)
    except Exception:
        return -1000.0


def _percentOrNA(count,total):
    """Return count as a percentage of total, or "NA" when total is 0."""
    if total==0:
        return "NA"
    return float(count)*100/total


def _printPairwisePvalueTable(title,plotData,xtickLabels,pairTest,failValue,minuslog10pvalue):
    """Print one pairwise p-value table to stdout and return it as a square matrix.

    title            -- heading line printed above the table
    plotData         -- list of samples (one list of values per sample)
    xtickLabels      -- sample labels, parallel to plotData
    pairTest         -- callable (sampleA, sampleB) -> p-value
    failValue        -- value recorded when pairTest raises (1.0 or "NA")
    minuslog10pvalue -- if true, store -log10(p) instead of p ("NA" is kept)

    Only the upper triangle is computed and printed; the lower triangle is
    mirrored from it and the diagonal holds 1.0 (0.0 in -log10 mode).
    """
    nSamples=len(plotData)

    print >> stdout,title
    print >> stdout,"p-val",
    for x in range(0,nSamples):
        print >> stdout,xtickLabels[x],
    print >> stdout,""

    if minuslog10pvalue:
        selfValue=0.0
    else:
        selfValue=1.0

    pvalueM=[]
    for x in range(0,nSamples):
        pvalueRow=[]
        pvalueM.append(pvalueRow)
        print >> stdout,xtickLabels[x],
        for y in range(0,nSamples):
            if y<x:
                #already computed as (y,x): mirror it
                print >> stdout,"",
                pvalueRow.append(pvalueM[y][x])
            elif y==x:
                print >> stdout,"",
                pvalueRow.append(selfValue)
            else:
                try:
                    pvalue=pairTest(plotData[x],plotData[y])
                except Exception:
                    pvalue=failValue

                if minuslog10pvalue and str(pvalue)!="NA":
                    pvalue=_negLog10OrSentinel(pvalue)

                print >> stdout,str(pvalue),
                pvalueRow.append(pvalue)
        print >> stdout,""

    return pvalueM


def plotExpBox_Main(inputFiles,headers,valcols,outputFile,sep,startRow,showIndPoints,mark,markMean,showMean,notch,whisker,outliers,plotPvalueCluster,outputClusterPrefix,methodCluster,xlegendrotation,xlabe,ylabe,figsz,titl,showSampleSizes,trimToMinSize,relabels,logb,plotHistogramToFile,plotMedianForGroups,botta,showViolin,showBox,firstColAnnot,plotTrend,showLegend,makePzfxFile,makeBinMatrix,writeDataSummaryStat,summaryStatRange,minuslog10pvalue,minNDataToKeep,vfacecolor,valpha,outXYZPvalues,dividePlots):
    """Load columns from delimited files, print statistics and draw the box/violin plot.

    Data loading:
      inputFiles, headers, valcols -- parallel lists: a file, its header fields
                         and the column indices to read from it.
      sep, startRow   -- field separator; lines numbered below startRow
                         (1-based) are skipped.
      firstColAnnot   -- if true, the first entry of each column list is an
                         annotation column rather than a data column.
      logb            -- if non-zero, each value v is replaced by log(v)/logb
                         (zero values are dropped).  Presumably logb is the log
                         of the desired base -- confirm against the caller.
      plotTrend       -- if true, keep per-line value lists for plotExpBox
                         (None for lines where any column failed to parse).
      trimToMinSize   -- if true, call trimData with the smallest column size.
      relabels        -- replacement labels for the first len(relabels) samples.
      minNDataToKeep  -- samples with fewer values are discarded.
      plotMedianForGroups -- label selections (resolved IN PLACE to column
                         indices); a dotted line is drawn at each group median.

    Statistics (skipped when the module-level flag skipStat is true; skipStat
    is not a parameter of this function):
      writeDataSummaryStat, summaryStatRange -- optional summary file name and
                         the inclusive [low, high] range it reports on.
      minuslog10pvalue -- report -log10(p) instead of p in the pairwise tables.
      outXYZPvalues   -- optional prefix for the Welch / Mann-Whitney XYZ files.
      plotPvalueCluster, outputClusterPrefix, methodCluster -- optional p-value
                         raw/cluster plots for each pairwise table.

    Plotting: figsz, botta, titl (defaults to outputFile when empty),
    outputFile and plotHistogramToFile are used here; the remaining display
    options are handed to plotExpBox unchanged.  vfacecolor and valpha are
    accepted but not used in this function.
    """
    plotData=[]
    xtickLabels=[]

    trendData={}
    annot={}

    minSize=-1

    for inputFile,header,cols in zip(inputFiles,headers,valcols):
        fin=generic_istream(inputFile)
        try:
            startIdx=len(plotData)

            if firstColAnnot:
                colAnnot=cols[0]
                cols=cols[1:]
                annotThisFile=[]
                annot[startIdx]=annotThisFile
            else:
                colAnnot=-1
                annotThisFile=None

            for col in cols:
                plotData.append([])
                xtickLabels.append(header[col])

            colIndices=range(startIdx,startIdx+len(cols))

            if plotTrend:
                trendDataThisFile=[]
                trendData[startIdx]=trendDataThisFile
            else:
                trendDataThisFile=None

            lino=0
            for lin in fin:
                lino+=1
                if lino<startRow:
                    continue
                fields=lin.rstrip("\r\n").split(sep)

                if plotTrend:
                    trendDataThisLine=[]
                else:
                    trendDataThisLine=None

                allDataOKThisLine=True

                if colAnnot>=0:
                    annotThisFile.append(fields[colAnnot])

                for idx,col in zip(colIndices,cols):
                    try:
                        value=float(fields[col])
                        if logb!=0:
                            if value==0.0:
                                #log of zero is undefined: treat as missing
                                raise ValueError
                            value=log(value)/logb
                        plotData[idx].append(value)

                        if plotTrend:
                            trendDataThisLine.append(value)
                    except Exception:
                        #missing / non-numeric field: skip it for this column
                        allDataOKThisLine=False

                if plotTrend:
                    if allDataOKThisLine:
                        trendDataThisFile.append(trendDataThisLine)
                    else:
                        trendDataThisFile.append(None)
        finally:
            fin.close()

        #track the smallest column over ALL columns of this file (columns of
        #one file can differ in length because unparsable fields are skipped)
        for c in colIndices:
            if minSize==-1 or len(plotData[c])<minSize:
                minSize=len(plotData[c])

    if trimToMinSize:
        print >> stderr,"trimming to min size =",minSize
        trimData(plotData,minSize)

    if len(relabels)>0:
        print >> stderr,xtickLabels
        print >> stderr,relabels
        for i,relabel in enumerate(relabels):
            xtickLabels[i]=relabel

    for i in range(0,len(plotMedianForGroups)):
        plotMedianForGroups[i]=getCol0ListFromCol1ListStringAdv(xtickLabels,plotMedianForGroups[i])

    #drawing medians: one pooled median per group of columns
    medianToDraw=[]
    for mediangrouper in plotMedianForGroups:
        curD=[]
        for c in mediangrouper:
            curD.extend(plotData[c])
        medianToDraw.append(median(curD))

    #walk backwards so deleting an entry does not shift the ones still to visit
    for c in range(len(plotData)-1,-1,-1):
        if len(plotData[c])<minNDataToKeep:
            print >> stderr,xtickLabels[c],"discarded because has only",len(plotData[c]),"data points <",minNDataToKeep
            del plotData[c]
            del xtickLabels[c]

    if not skipStat:
        print >> stdout,"student t-test (1 sample; mean=0)"
        print >> stdout,"sample","mean","p-val","median"

        fDSS=None
        if writeDataSummaryStat:
            fDSS=open(writeDataSummaryStat,"w")

        try:
            if fDSS is not None:
                print >> fDSS,"sample\tmean\tvar\tsd\tmin\tmax\tN\tNInRange["+str(summaryStatRange[0])+","+str(summaryStatRange[1])+"]\t%NInRange\tNbelowRange\t%Nbelow\tNAboveRange\t%NAbove"

            for x in range(0,len(plotData)):
                #compute the p-value BEFORE printing: a print statement writes
                #its items one by one, so a failure in the middle of it would
                #leave a half-written line on stdout
                try:
                    pvalue1samp=ttest_1samp(plotData[x],0)[1]
                except Exception:
                    pvalue1samp="NA"
                print >> stdout, xtickLabels[x],mean(plotData[x]),pvalue1samp,median(plotData[x])

                if fDSS is not None:
                    sumData,N,NIN,NBelow,NAbove=filterDataInRangeInclusive(plotData[x],summaryStatRange[0],summaryStatRange[1])

                    if NIN>1:
                        mea=mean2(sumData)
                        sd=std(sumData,ddof=1)
                        var=sd*sd
                        mi=min(sumData)
                        ma=max(sumData)
                    else:
                        mea="NA"
                        sd="NA"
                        var="NA"
                        mi="NA"
                        ma="NA"

                    rowValues=(mea,var,sd,mi,ma,N,NIN,_percentOrNA(NIN,N),NBelow,_percentOrNA(NBelow,N),NAbove,_percentOrNA(NAbove,N))
                    print >> fDSS,"\t".join([xtickLabels[x]]+[str(v) for v in rowValues])
        finally:
            if fDSS is not None:
                fDSS.close()

        #One entry per pairwise test:
        #(title, file/plot tag, p-value function, value on failure,
        # write XYZ file?, blank line before?, blank line after?)
        #NOTE(review): the Mann-Whitney p-value is doubled on the assumption
        #that mannwhitneyu returns a one-tailed value; newer SciPy releases
        #return a two-sided p-value by default -- confirm the SciPy version.
        pairwiseTests=[
            ("student t-test (2 samples)","t",lambda a,b: ttest_ind(a,b)[1],1.0,False,True,True),
            ("welch t-test","Welch",lambda a,b: welchs_approximate_ttest_arr(a,b)[3],1.0,True,False,False),
            ("non-parametric (Mann-Whitney U)","U",lambda a,b: mannwhitneyu(a,b)[1]*2,1.0,True,True,False),
            #####now the variance tests
            ("Ansari-Bradley Two-sample Test for difference in scale parameters ","Ansari",lambda a,b: ansari(a,b)[1],"NA",False,True,False),
            ("Fligner's Two-sample Test for equal variance (non-parametrics)","fligner",lambda a,b: fligner(a,b)[1],"NA",False,True,False),
            ("Levene's Two-sample Test for equal variance","levene",lambda a,b: levene(a,b)[1],"NA",False,True,False),
            ("Bartlett's Two-sample Test for equal variance (for normal distributions)","bartlett",lambda a,b: bartlett(a,b)[1],"NA",False,True,False),
        ]

        for title,tag,pairTest,failValue,writeXYZ,blankBefore,blankAfter in pairwiseTests:
            if blankBefore:
                print >> stdout,""

            pvalueM=_printPairwisePvalueTable(title,plotData,xtickLabels,pairTest,failValue,minuslog10pvalue)

            if blankAfter:
                print >> stdout,""

            if writeXYZ and outXYZPvalues:
                writeXYZPvalues(outXYZPvalues+"_"+tag+".xyz",xtickLabels,pvalueM)

            if plotPvalueCluster:
                makePValueRawPlot(outputClusterPrefix+"_"+tag+"_raw",xtickLabels,pvalueM)
                makePValueClusterPlot(outputClusterPrefix+"_"+tag,xtickLabels,pvalueM,methodCluster)

    figure(figsize=figsz)
    subplots_adjust(top=0.9, bottom=botta, left=0.2, right=0.8)

    if len(titl)==0:
        titl=outputFile

    plotExpBox(plotData,xtickLabels,showIndPoints,mark,markMean,showMean,notch,whisker,outliers,xlegendrotation,xlabe,ylabe,titl,showSampleSizes,showViolin,showBox,annot,trendData,showLegend,makePzfxFile,makeBinMatrix,dividePlots)

    #dotted horizontal line at each requested group median
    for m in medianToDraw:
        axhline(y=m,linestyle=':',color='gray')

    savefig(outputFile,bbox_inches="tight")

    if len(plotHistogramToFile)>0:
        drawHistogram(plotHistogramToFile,plotData,xtickLabels)
        drawDensigram(plotHistogramToFile+".density.png",plotData,xtickLabels)