Example #1
0
    def run(self):
        """
        Draw a manhattan plot from an HDF5 correlation file, highlighting
        the bands covered by the result peaks in self.peak_id_ls.
        """
        if self.debug:
            import pdb
            pdb.set_trace()

        db_250k = self.db_250k

        # Collect (chromosome, start, stop) bands to highlight.  The first
        # peak's association result supplies the locus-type context.
        highlightBandLs = []
        rm = None
        for peak_id in self.peak_id_ls:
            result_peak = Stock_250kDB.ResultPeak.get(peak_id)
            highlightBandLs.append(
                [result_peak.chromosome, result_peak.start, result_peak.stop])
            if rm is None:
                rm = result_peak.result
        if not rm:
            sys.stderr.write(
                "Error: no results_method (association result) fetched from db.\n"
            )
            sys.exit(1)

        # Guard clause: bail out early on a missing input file.
        if not (self.inputFname and os.path.isfile(self.inputFname)):
            #2012.3.28 input is invalid.
            sys.stderr.write("inputFname %s is not valid (inexistent).\n" %
                             (self.inputFname))
            sys.exit(0)  #fake ok as I want pegasus workflow to keep running.

        locus_type_id = self.getLocusTypeIDFromInput(
            self.inputFname, datasetName=self.datasetName)

        pd = PassingData()
        if rm.cnv_method_id and not db_250k._cnv_id2chr_pos:
            # Property assignment triggers the id->(chr,pos) map construction.
            db_250k.cnv_id2chr_pos = rm.cnv_method_id
            pd.db_id2chr_pos = db_250k.cnv_id2chr_pos
        elif rm.call_method_id:
            # Tuple corresponds to (priorTAIRVersion, locus_type_id).
            db_250k.snp_id2chr_pos = (False, locus_type_id)
            pd.db_id2chr_pos = db_250k.snp_id2chr_pos

        # The genome db lives behind a separate connection setting.
        db_genome = GenomeDB.GenomeDatabase(
            drivername=self.genome_drivername, username=self.genome_db_user,
            password=self.genome_db_passwd, hostname=self.genome_hostname,
            database=self.genome_dbname, schema=self.genome_schema)
        db_genome.setup(create_tables=False)

        gwr = SNP.GenomeWideResult(
            name='', construct_chr_pos2index=False,
            construct_data_obj_id2index=False)
        gwr.fillGenomeWideResultFromHDF5CorrelationFile(
            self.inputFname, datasetName=self.datasetName, pdata=pd)
        gwr.drawManhattanPlot(
            db_genome, outputFnamePrefix=self.outputFnamePrefix,
            min_value=None, need_svg=False, ylim_type=2,
            drawBonferroni=False, highlightBandLs=highlightBandLs)
	def getMonkeyIBDCheckData(self, inputFname=None):
		"""
		2012.8.21
			inputFname is output of plink ibd check.
 FID1     IID1 FID2     IID2 RT    EZ      Z0      Z1      Z2  PI_HAT PHE       DST     PPC   RATIO
   1  1996093   1  1995025 OT     0  1.0000  0.0000  0.0000  0.0000  -1  0.654218  0.3630  1.9764
   1  1996093   1  2001039 OT     0  0.9832  0.0000  0.0168  0.0168  -1  0.653608  0.0318  1.8792
   1  1996093   1  1984011 OT     0  1.0000  0.0000  0.0000  0.0000  -1  0.645011  0.0168  1.8624
   1  1996093   1  1987004 OT     0  0.9260  0.0628  0.0113  0.0427  -1  0.660490  0.9999  2.2805
   		
		"""
		sys.stderr.write("Reading PI_hat from %s ... "%(inputFname))
		ibdData = SNP.readAdjacencyListDataIntoMatrix(inputFname=inputFname, rowIDHeader="IID1", colIDHeader="IID2", rowIDIndex=None, colIDIndex=None, \
								dataHeader="PI_HAT", dataIndex=None, hasHeader=True)
		return ibdData
		"""
Example #3
0
    def setup(self, **keywords):
        """
        2012.10.15
            Run before anything else: delegate to the parent walker's
            setup, load the plink IBD-check matrix when configured, and
            prepare the row buffer used by fileWalker().
        """
        AbstractMatrixFileWalker.setup(self, **keywords)
        # Read in the IBD check result only when a filename was supplied.
        if not self.plinkIBDCheckOutputFname:
            self.ibdData = None
        else:
            self.ibdData = SNP.readAdjacencyListDataIntoMatrix(
                inputFname=self.plinkIBDCheckOutputFname,
                rowIDHeader="IID1", colIDHeader="IID2",
                rowIDIndex=None, colIDIndex=None,
                dataHeader="PI_HAT", dataIndex=None, hasHeader=True)
        self.data_matrix = []  #data structure to store all rows during fileWalker()
	def getMonkeyKinshipData(self, inputFname=None):
		"""
		2012.8.22
			use SNP.readAdjacencyListDataIntoMatrix(), and defaultValue=0
		2012.2.10

		Read a headerless 3-column (monkeyId1, monkeyId2, kinship) file into
		a matrix object.  Missing pairs default to 0; the diagonal is forced
		to 1 (every monkey is fully related to itself).
		"""
		sys.stderr.write("Reading kinship from %s ... "%(inputFname))
		kinshipData = SNP.readAdjacencyListDataIntoMatrix(inputFname=inputFname, rowIDHeader=None, colIDHeader=None, rowIDIndex=0, colIDIndex=1, \
								dataHeader=None, dataIndex=2, hasHeader=False, defaultValue=0)
		#set kinshipData diagonal to 1
		for i in xrange(len(kinshipData.row_id_ls)):
			kinshipData.data_matrix[i][i] = 1
		return kinshipData
		# fix: removed an unreachable triple-quoted block (leftover csv-reader
		# code) that sat after the return statement.
		"""
Example #5
0
	def outputSNPDataInNewCoordinate(self, querySNPDataFname=None, querySNPID2NewReferenceCoordinateLs=None,\
									newSNPDataOutputFname=None, newSNPDataOutputFormat=1):
		"""
		2013.07.03 added argument newSNPDataOutputFormat
			
		2012.10.14
			split out of findSNPPositionOnNewRef()
		"""
		sys.stderr.write("Converting querySNPDataFname %s into individual X SNP format, format=%s ... "%\
						(querySNPDataFname, newSNPDataOutputFormat))
		"""
Sample  Geno    SNP
1999010 CC      cs_primer1082_247
1999068 CC      cs_primer1082_247
2000022 CT      cs_primer1082_247
2000064 CT      cs_primer1082_247
2000117 CC      cs_primer1082_247

		"""
		inf = utils.openGzipFile(querySNPDataFname)
		reader = csv.reader(inf, delimiter=figureOutDelimiter(inf))
		col_name2index = getColName2IndexFromHeader(reader.next())
		
		sampleIndex = col_name2index.get("Sample")
		genotypeIndex = col_name2index.get("Geno")
		SNPIDIndex = col_name2index.get("SNP")
		
		row_id2index = {}
		row_id_ls = []
		col_id_ls = []
		col_id2index = {}
		row_col_index2genotype = {}
		for row in reader:
			sampleID = row[sampleIndex]
			genotype = row[genotypeIndex]
			querySNPID = row[SNPIDIndex]
			if querySNPID in querySNPID2NewReferenceCoordinateLs:
				newRefCoordinateLs = querySNPID2NewReferenceCoordinateLs.get(querySNPID)
				if len(newRefCoordinateLs)==1:
					newRefCoordinate = newRefCoordinateLs[0]
					if newSNPDataOutputFormat==2:
						col_id = '%s_%s'%(newRefCoordinate.newChr, newRefCoordinate.newRefStart)
					else:
						col_id = '%s_%s_%s'%(newRefCoordinate.newChr, newRefCoordinate.newRefStart, newRefCoordinate.newRefStop)
					queryStrand = newRefCoordinate.queryStrand
					if col_id not in col_id2index:
						col_id2index[col_id] = len(col_id2index)
						col_id_ls.append(col_id)
					if sampleID not in row_id2index:
						row_id2index[sampleID] = len(row_id2index)
						row_id_ls.append(sampleID)
					if queryStrand == "-":
						genotype = SNP.reverseComplement(genotype)
					row_index = row_id2index[sampleID]
					col_index = col_id2index[col_id]
					row_col_index2genotype[(row_index, col_index)] = genotype
				else:
					continue
		data_matrix = numpy.zeros([len(row_id_ls), len(col_id2index)], dtype=numpy.int8)
		
		for row_col_index, genotype in row_col_index2genotype.iteritems():
			row_index, col_index = row_col_index[:2]
			data_matrix[row_index, col_index] = SNP.nt2number[genotype]
		sys.stderr.write("\n")
		snpData = SNP.SNPData(row_id_ls=row_id_ls, col_id_ls=col_id_ls, data_matrix=data_matrix)
		snpData.tofile(newSNPDataOutputFname)
	def run(self):
		"""
		Identify candidate sample label-swaps.

		Monkeys are ranked by the median absolute delta between their
		pedigree-kinship vector and their genotype-IBD (plink PI_HAT)
		vector.  For each of up to 50 top-ranked source monkeys, every other
		monkey is tried as a swap partner; the 5 swaps that increase the
		source monkey's median delta the least (i.e. drop it the most) are
		written, with DB sex and plink-inferred sex, to self.outputFname
		(tab-delimited).
		"""
		if self.debug:
			import pdb
			pdb.set_trace()
		
		db_vervet = VervetDB.VervetDB(drivername=self.drivername, username=self.db_user, password=self.db_passwd, \
									hostname=self.hostname, database=self.dbname, schema=self.schema, port=self.port)
		db_vervet.setup(create_tables=False)
		self.db_vervet = db_vervet
		
		#pedigree-based kinship; the diagonal is set to 1 inside getMonkeyKinshipData()
		kinshipData = self.getMonkeyKinshipData(inputFname=self.inputFname)
		#fix: use the readAdjacencyListDataIntoMatrix() keyword names used by
		#	every other call site in this file (rowIDHeader/colIDHeader/
		#	rowIDIndex/colIDIndex); the old id1Header/id2Header/id1Index/
		#	id2Index spelling raised TypeError.
		ibdData = SNP.readAdjacencyListDataIntoMatrix(inputFname=self.plinkIBDCheckOutputFname, rowIDHeader="IID1", colIDHeader="IID2", \
								rowIDIndex=None, colIDIndex=None, \
								dataHeader="PI_HAT", dataIndex=None, hasHeader=True)
		monkey_id2plinkSex = SNP.getKey2ValueFromMatrixLikeFile(inputFname=self.plinkSexCheckOutputFname, \
								keyHeaderLs=['IID'], valueHeaderLs=['SNPSEX'], keyIndexLs=None, valueIndexLs=None, \
								hasHeader=True, valueDataType=int)
		
		kinshipIBDDeltaData = self.createDeltaMatrix(kinshipData=kinshipData, ibdData=ibdData)
		queueData = self.createKinshipIBDDeltaQueue(kinshipIBDDeltaData)
		kinshipIBDDeltaQueue = queueData.kinshipIBDDeltaQueue
		monkey_id2medianAbsDelta = queueData.monkey_id2medianAbsDelta
		
		writer = csv.writer(open(self.outputFname, 'w'), delimiter='\t')
		header = ['sourceMonkeyID', 'medianAbsDelta', 'noOfNonMissing', 'sourceMonkeySex', 'sourceMonkeyPlinkSex', \
				'sourceMonkeyMedianAbsDeltaDropAfterSwap', \
			'targetMonkeyID', 'sourceMonkeyNoOfNonMissingAfterSwap', \
			'targetMonkeyMedianAbsDelta', 'targetMonkeyNoOfNonMissing', 'targetMonkeySex', 'targetMonkeyPlinkSex', \
			'targetMonkeyMedianAbsDeltaAfterSwap', 'targetMonkeyNoOfNonMissingAfterSwap']
		writer.writerow(header)
		
		i=0
		while i <50 and len(kinshipIBDDeltaQueue)>0:
			#queue entries are (-medianAbsDelta, monkeyID, noOfNonMissing): the
			#	min-heap pops the monkey with the largest delta first
			negativeMedianAbsDelta, sourceMonkeyID, noOfNonMissing = heapq.heappop(kinshipIBDDeltaQueue)[:3]
			medianAbsDelta = -negativeMedianAbsDelta
			sourceMonkeyDBEntry = self.getMonkeyDBEntry(db_vervet=db_vervet, ucla_id=sourceMonkeyID)
			
			# 2012.8.22 draw some histogram to check what data looks like
			#self.drawKinshipIBDDeltaVectorHistogram(kinshipIBDDeltaData=kinshipIBDDeltaData, row_id=sourceMonkeyID, \
			#							outputFnamePrefix=self.outputFnamePrefix)
			
			medianAbsDeltaIncreaseQueue = []
			for targetMonkeyID in kinshipData.row_id_ls:
				if targetMonkeyID==sourceMonkeyID:
					continue
				#source monkey's median delta if its IBD row were paired with the target's kinship row
				pdata = self.calculateMedianAbsDelta(kinshipData=kinshipData, \
									kinshipDataMonkeyID=targetMonkeyID, ibdData=ibdData, ibdDataMonkeyID=sourceMonkeyID)
				sourceMonkeyMedianAbsDeltaAfterSwap = pdata.medianAbsDelta
				sourceMonkeyNoOfNonMissingAfterSwap = pdata.noOfNonMissing
				
				#the symmetric value for the target monkey after the swap
				pdata = self.calculateMedianAbsDelta(kinshipData=kinshipData, \
									kinshipDataMonkeyID=sourceMonkeyID, ibdData=ibdData, ibdDataMonkeyID=targetMonkeyID)
				targetMonkeyMedianAbsDeltaAfterSwap = pdata.medianAbsDelta
				targetMonkeyNoOfNonMissingAfterSwap = pdata.noOfNonMissing
				
				if sourceMonkeyMedianAbsDeltaAfterSwap is not None:	#add to the queue
					#add the candidate monkey and how much median delta drops into the queue
					pdata = monkey_id2medianAbsDelta.get(targetMonkeyID)
					if pdata:
						targetMonkeyMedianAbsDelta = pdata.medianAbsDelta
						targetMonkeyNoOfNonMissing = pdata.noOfNonMissing
					else:
						targetMonkeyMedianAbsDelta = None
						targetMonkeyNoOfNonMissing = None
					item = [sourceMonkeyMedianAbsDeltaAfterSwap-medianAbsDelta, targetMonkeyID, sourceMonkeyNoOfNonMissingAfterSwap, \
						targetMonkeyMedianAbsDelta, targetMonkeyNoOfNonMissing, targetMonkeyMedianAbsDeltaAfterSwap, \
						targetMonkeyNoOfNonMissingAfterSwap]
					heapq.heappush(medianAbsDeltaIncreaseQueue, item)
			
			#the target monkey that increases the median delta the least (or
			#	drops it the most) is the prime candidate for a label-swap
			i+=1
			#output the top 5 candidates for each source monkey,
			#	with db sex and plink sex-check results for both monkeys
			j = 0
			while j<5 and len(medianAbsDeltaIncreaseQueue)>0:
				sourceMonkeyMedianAbsDeltaDropAfterSwap, targetMonkeyID, sourceMonkeyNoOfNonMissingAfterSwap, \
						targetMonkeyMedianAbsDelta, targetMonkeyNoOfNonMissing, targetMonkeyMedianAbsDeltaAfterSwap, targetMonkeyNoOfNonMissingAfterSwap =\
							heapq.heappop(medianAbsDeltaIncreaseQueue)[:7]
				sourceMonkeySex = sourceMonkeyDBEntry.codeSexInNumber()
				sourceMonkeyPlinkSex = monkey_id2plinkSex.get(sourceMonkeyID)
				
				#fix: fetch the DB entry of the monkey actually popped from the
				#	heap; the old code reused a stale targetMonkeyDBEntry left
				#	over from the candidate loop above (always the last monkey
				#	in row_id_ls), so the wrong sex was reported.
				targetMonkeyDBEntry = self.getMonkeyDBEntry(db_vervet=db_vervet, ucla_id=targetMonkeyID)
				if targetMonkeyDBEntry:
					targetMonkeySex = targetMonkeyDBEntry.codeSexInNumber()
				else:
					targetMonkeySex = None
				targetMonkeyPlinkSex = monkey_id2plinkSex.get(targetMonkeyID)
				
				data_row = [sourceMonkeyID, medianAbsDelta, noOfNonMissing, sourceMonkeySex, sourceMonkeyPlinkSex,\
						sourceMonkeyMedianAbsDeltaDropAfterSwap, targetMonkeyID, sourceMonkeyNoOfNonMissingAfterSwap,\
						targetMonkeyMedianAbsDelta, targetMonkeyNoOfNonMissing, targetMonkeySex, targetMonkeyPlinkSex, targetMonkeyMedianAbsDeltaAfterSwap,\
						targetMonkeyNoOfNonMissingAfterSwap]
				writer.writerow(data_row)
				j+= 1
		del writer
	def run(self):
		"""
		Rank monkeys by how surprising the delta between their pedigree
		kinship vector and their genotype-IBD (plink PI_HAT) vector is,
		using a chi-squared statistic against an estimated delta mean/std.

		Steps visible in this method:
			1. if minAbsDeltaForOutlier>0: optionally restrict to a monkey
				set and output the outlier deltas;
			2. run PCA on the |delta| matrix and output it in PC1 order;
			3. build a chi-sq-stat priority queue over monkeys and write up
				to 5000 rows (rank, monkeyID, stat, #non-missing, p-value,
				DB sex, plink sex) to self.outputFname;
			4. if iterativeAlgorithm is set, rebuild the queue after each
				popped monkey (presumably so later stats exclude it —
				depends on updateKinshipIBDDeltaChiSqStatQueue(); confirm).

		NOTE(review): monkey_id2queueData is assigned but never read here —
			possibly kept for debugging; confirm before removing.
		"""
		
		if self.debug:
			import pdb
			pdb.set_trace()
		
		db_vervet = VervetDB.VervetDB(drivername=self.drivername, username=self.db_user, password=self.db_passwd, \
									hostname=self.hostname, database=self.dbname, schema=self.schema, port=self.port)
		db_vervet.setup(create_tables=False)
		self.db_vervet = db_vervet
		
		
		kinshipData = self.getMonkeyKinshipData(inputFname=self.inputFname)
		#set kinshipData diagonal to 1
		ibdData = SNP.readAdjacencyListDataIntoMatrix(inputFname=self.plinkIBDCheckOutputFname, rowIDHeader="IID1", colIDHeader="IID2", \
										rowIDIndex=None, colIDIndex=None, \
								dataHeader="PI_HAT", dataIndex=None, hasHeader=True)
		
		if self.minAbsDeltaForOutlier>0:
			#2012.8.23 cut data off for Sue
			if self.kinshipMonkeyIDSetFname:
				monkeyID2dataTuple = SNP.getKey2ValueFromMatrixLikeFile(inputFname=self.kinshipMonkeyIDSetFname, keyHeaderLs=['monkeyID'], \
									valueHeaderLs=['noOfMismatches', 'noOfNonMissing'], keyIndexLs=None, valueIndexLs=None, \
									hasHeader=True, valueDataType=float)
				kinshipMonkeyIDSet = set()
				#keep only monkeys with zero mismatches and more than 30 non-missing comparisons
				for monkeyID, dataTuple in monkeyID2dataTuple.iteritems():
					if dataTuple[0]==0 and dataTuple[1]>30:
						kinshipMonkeyIDSet.add(monkeyID)
				sys.stderr.write("%s monkeys in kinshipMonkeyIDSet.\n"%(len(kinshipMonkeyIDSet)))
			else:
				kinshipMonkeyIDSet = None
			if self.outputFnamePrefix:
				self.cutOffKinshipIBDDeltaAndOutput(db_vervet=db_vervet, kinshipData=kinshipData, ibdData=ibdData, \
						outputFnamePrefix=self.outputFnamePrefix, minAbsDelta=self.minAbsDeltaForOutlier, kinshipMonkeyIDSet=kinshipMonkeyIDSet)
		
		#2012.8.24 output the delta matrix in PC1 order
		self.PCAOnAbsKinshipIBDDeltaMatrix(kinshipData=kinshipData,  ibdData=ibdData, outputFnamePrefix=self.outputFnamePrefix)
		
		if self.plinkSexCheckOutputFname:
			monkey_id2plinkSex = SNP.getKey2ValueFromMatrixLikeFile(inputFname=self.plinkSexCheckOutputFname, \
								keyHeaderLs=['IID'], valueHeaderLs=['SNPSEX'], keyIndexLs=None, valueIndexLs=None, \
								hasHeader=True, valueDataType=int)
		else:
			monkey_id2plinkSex = {}
		
		kinshipIBDDeltaData = self.createDeltaMatrix(kinshipData=kinshipData, ibdData=ibdData, takeAbs=False)
		
		#estimate delta mean/std with the top 20% (by magnitude, presumably outliers) excluded
		meanStdData = self.estimateAbsDeltaMeanStd(kinshipIBDDeltaData=kinshipIBDDeltaData, excludeTopFraction=0.2)
		
		queueData = self.createKinshipIBDDeltaChiSqStatQueue(kinshipData=kinshipData, ibdData=ibdData, \
													mean=meanStdData.mean, std=meanStdData.std)
		
		queue = queueData.queue
		monkey_id2queueData = queueData.monkey_id2queueData
		
		writer = csv.writer(open(self.outputFname, 'w'), delimiter='\t')
		header = ['rank', 'monkeyID', 'chiSqStat', 'noOfNonMissing', 'chiSqPvalue', 'monkeySex','monkeyPlinkSex']
		#if self.iterativeAlgorithm:
		#	header.extend(['chiSqStatIter', 'noOfNonMissingIter', 'chiSqPvalueIter'])
		writer.writerow(header)
		
		i=0
		while i<5000 and len(queue)>0:
			
			#queue entries start with -chiSqStat, so the min-heap pops the largest statistic first
			minusChiSqStat, sourceMonkeyID, noOfNonMissing, chiSqPvalue = heapq.heappop(queue)[:4]
			chiSqStat = -minusChiSqStat
			sourceMonkeyDBEntry = self.getMonkeyDBEntry(db_vervet=db_vervet, ucla_id=sourceMonkeyID)
			if sourceMonkeyDBEntry:
				sourceMonkeySex = sourceMonkeyDBEntry.codeSexInNumber()
			else:
				sourceMonkeySex = None	#monkey absent from db
			sourceMonkeyPlinkSex = monkey_id2plinkSex.get(sourceMonkeyID)
			data_row = [i, sourceMonkeyID, chiSqStat, noOfNonMissing, chiSqPvalue, sourceMonkeySex, sourceMonkeyPlinkSex]
			
			
			if self.iterativeAlgorithm:
				"""
				if i>0:	#calculate the new chisq stat and p-value.
					chiSqStatData = self.calculateChiSqStatOfDeltaVector(kinshipData=kinshipData, kinshipDataMonkeyID=sourceMonkeyID, \
						ibdData=ibdData, ibdDataMonkeyID=sourceMonkeyID,\
						mean=meanStdData.mean, std=meanStdData.std)
					noOfNonMissing = chiSqStatData.noOfNonMissing
					chiSqStat = chiSqStatData.chiSqStat
					chiSqPvalue = chiSqStatData.chiSqPvalue
					data_row.extend([chiSqStat,noOfNonMissing,  chiSqPvalue])
				"""
				
				#rebuild the queue with the just-popped monkey dropped (see dropMonkeyID=)
				queueData = self.updateKinshipIBDDeltaChiSqStatQueue(queue=queue, kinshipData=kinshipData, ibdData=ibdData, \
								mean=meanStdData.mean, std=meanStdData.std, dropMonkeyID=sourceMonkeyID)
				#2012.8.23 old way not very efficient
				#remove itself.
#				ibdDataIndex = ibdData.row_id2row_index.get(sourceMonkeyID)
#				if ibdDataIndex:
#					ibdData.data_matrix[ibdDataIndex, :] = numpy.nan
#					ibdData.data_matrix[:, ibdDataIndex] = numpy.nan
#					ibdData.data_matrix.mask[ibdDataIndex, :] = True
#					ibdData.data_matrix.mask[:, ibdDataIndex] = True
				#queueData = self.createKinshipIBDDeltaChiSqStatQueue(kinshipData=kinshipData, ibdData=ibdData, \
				#									mean=meanStdData.mean, std=meanStdData.std,\
				#									given_row_id_ls=[row[1] for row in queue])
		
				queue = queueData.queue
				monkey_id2queueData = queueData.monkey_id2queueData
			writer.writerow(data_row)
			i+= 1
		del writer