def run(self):
    """
    Draw a manhattan plot of an association result stored in HDF5 (self.inputFname),
    highlighting the chromosomal bands of the ResultPeak rows named in self.peak_id_ls.
    Exits 1 when no results_method can be derived from the peaks; exits 0 (deliberately
    "ok") when inputFname is missing so the enclosing pegasus workflow keeps running.
    """
    if self.debug:
        import pdb
        pdb.set_trace()
    db_250k = self.db_250k
    #construct the bands to be highlighted in manhattan plot
    highlightBandLs = []
    rm = None
    for peak_id in self.peak_id_ls:
        result_peak = Stock_250kDB.ResultPeak.get(peak_id)
        highlightBandLs.append( [result_peak.chromosome, result_peak.start, result_peak.stop])
        #take the 1st result_peak's result as the association result to get locus_type_id
        if rm is None:
            rm = result_peak.result
    if not rm:
        sys.stderr.write( "Error: no results_method (association result) fetched from db.\n" )
        sys.exit(1)
    if self.inputFname and os.path.isfile(self.inputFname):
        locus_type_id = self.getLocusTypeIDFromInput( self.inputFname, datasetName=self.datasetName)
        pd = PassingData()
        if rm.cnv_method_id and not db_250k._cnv_id2chr_pos:
            # NOTE(review): assigning a method id to cnv_id2chr_pos, which is then read
            # back as an id->(chr,pos) map — presumably a property setter that triggers
            # construction of the map; confirm against db_250k's implementation.
            db_250k.cnv_id2chr_pos = rm.cnv_method_id
            pd.db_id2chr_pos = db_250k.cnv_id2chr_pos
        elif rm.call_method_id:
            # NOTE(review): tuple assignment likely feeds a property setter as well.
            db_250k.snp_id2chr_pos = ( False, locus_type_id )	#correspond to priorTAIRVersion, locus_type_id
            pd.db_id2chr_pos = db_250k.snp_id2chr_pos
        #need to setup a different db setting (genome db is separate from the 250k db)
        db_genome = GenomeDB.GenomeDatabase(drivername=self.genome_drivername, username=self.genome_db_user,
                        password=self.genome_db_passwd, hostname=self.genome_hostname, database=self.genome_dbname, \
                        schema=self.genome_schema)
        db_genome.setup(create_tables=False)
        gwr_name = ''
        # skip building the lookup indexes — only drawing is needed here
        gwr = SNP.GenomeWideResult(name=gwr_name, construct_chr_pos2index=False, \
                        construct_data_obj_id2index=False)
        gwr.fillGenomeWideResultFromHDF5CorrelationFile(self.inputFname, datasetName=self.datasetName, pdata=pd)
        gwr.drawManhattanPlot(db_genome, outputFnamePrefix=self.outputFnamePrefix,\
                        min_value=None, need_svg=False, ylim_type=2,\
                        drawBonferroni=False, highlightBandLs=highlightBandLs)
    else:	#2012.3.28 input is invalid.
        sys.stderr.write("inputFname %s is not valid (inexistent).\n" % (self.inputFname))
        sys.exit(0)	#fake ok as I want pegasus workflow to keep running.
def getMonkeyIBDCheckData(self, inputFname=None):
    """
    2012.8.21
    Parse a plink IBD-check output file into a matrix of PI_HAT values keyed by
    the IID1 (row) and IID2 (column) sample IDs. Sample input:

    FID1 IID1 FID2 IID2 RT EZ Z0 Z1 Z2 PI_HAT PHE DST PPC RATIO
    1 1996093 1 1995025 OT 0 1.0000 0.0000 0.0000 0.0000 -1 0.654218 0.3630 1.9764
    1 1996093 1 2001039 OT 0 0.9832 0.0000 0.0168 0.0168 -1 0.653608 0.0318 1.8792
    1 1996093 1 1984011 OT 0 1.0000 0.0000 0.0000 0.0000 -1 0.645011 0.0168 1.8624
    1 1996093 1 1987004 OT 0 0.9260 0.0628 0.0113 0.0427 -1 0.660490 0.9999 2.2805
    """
    sys.stderr.write("Reading PI_hat from %s ... "%(inputFname))
    # columns are located by header name, so extra columns in the file are harmless
    return SNP.readAdjacencyListDataIntoMatrix(inputFname=inputFname, rowIDHeader="IID1", colIDHeader="IID2", \
                    rowIDIndex=None, colIDIndex=None, \
                    dataHeader="PI_HAT", dataIndex=None, hasHeader=True)
def setup(self, **keywords):
    """
    2012.10.15 run before anything is run
    Delegate to the parent walker's setup, then pre-load the plink IBD-check
    result (PI_HAT matrix keyed by IID1/IID2) if a filename was supplied.
    """
    AbstractMatrixFileWalker.setup(self, **keywords)
    #read in the IBD check result, if any was given
    if self.plinkIBDCheckOutputFname:
        self.ibdData = SNP.readAdjacencyListDataIntoMatrix(inputFname=self.plinkIBDCheckOutputFname, \
                        rowIDHeader="IID1", colIDHeader="IID2", \
                        rowIDIndex=None, colIDIndex=None, \
                        dataHeader="PI_HAT", dataIndex=None, hasHeader=True)
    else:
        self.ibdData = None
    self.data_matrix = []    #data structure to store all rows during fileWalker()
def getMonkeyKinshipData(self, inputFname=None):
    """
    2012.8.22 use SNP.readAdjacencyListDataIntoMatrix(), and defaultValue=0
    2012.2.10
    Read a headerless adjacency-list kinship file (columns: monkey1, monkey2,
    kinship) into a matrix, then force every diagonal entry to 1 (a monkey's
    kinship with itself).
    """
    sys.stderr.write("Reading kinship from %s ... "%(inputFname))
    kinshipData = SNP.readAdjacencyListDataIntoMatrix(inputFname=inputFname, rowIDHeader=None, colIDHeader=None, \
                    rowIDIndex=0, colIDIndex=1, \
                    dataHeader=None, dataIndex=2, hasHeader=False, defaultValue=0)
    #self-kinship is 1 by definition; input pairs never include it
    matrix = kinshipData.data_matrix
    for diag_index, _ in enumerate(kinshipData.row_id_ls):
        matrix[diag_index][diag_index] = 1
    return kinshipData
def outputSNPDataInNewCoordinate(self, querySNPDataFname=None, querySNPID2NewReferenceCoordinateLs=None,\
                    newSNPDataOutputFname=None, newSNPDataOutputFormat=1):
    """
    2013.07.03 added argument newSNPDataOutputFormat
    2012.10.14 split out of findSNPPositionOnNewRef()

    Convert a long-format genotype file (one row per sample X SNP) into an
    individual-X-SNP SNPData matrix, translating each query SNP ID into its
    coordinate on the new reference via querySNPID2NewReferenceCoordinateLs.
    SNPs with no coordinate or with more than one (ambiguous mapping) are
    skipped. Genotypes on the "-" strand are reverse-complemented.

    newSNPDataOutputFormat=2 names columns "chr_start"; any other value names
    them "chr_start_stop".

    Sample input:
    Sample  Geno    SNP
    1999010 CC      cs_primer1082_247
    1999068 CC      cs_primer1082_247
    2000022 CT      cs_primer1082_247
    2000064 CT      cs_primer1082_247
    2000117 CC      cs_primer1082_247
    """
    sys.stderr.write("Converting querySNPDataFname %s into individual X SNP format, format=%s ... "%\
                    (querySNPDataFname, newSNPDataOutputFormat))
    inf = utils.openGzipFile(querySNPDataFname)
    reader = csv.reader(inf, delimiter=figureOutDelimiter(inf))
    col_name2index = getColName2IndexFromHeader(reader.next())
    sampleIndex = col_name2index.get("Sample")
    genotypeIndex = col_name2index.get("Geno")
    SNPIDIndex = col_name2index.get("SNP")
    row_id2index = {}
    row_id_ls = []
    col_id_ls = []
    col_id2index = {}
    row_col_index2genotype = {}
    for row in reader:
        sampleID = row[sampleIndex]
        genotype = row[genotypeIndex]
        querySNPID = row[SNPIDIndex]
        newRefCoordinateLs = querySNPID2NewReferenceCoordinateLs.get(querySNPID)
        #skip SNPs that did not map, or mapped ambiguously, to the new reference
        if newRefCoordinateLs is None or len(newRefCoordinateLs)!=1:
            continue
        newRefCoordinate = newRefCoordinateLs[0]
        if newSNPDataOutputFormat==2:
            col_id = '%s_%s'%(newRefCoordinate.newChr, newRefCoordinate.newRefStart)
        else:
            col_id = '%s_%s_%s'%(newRefCoordinate.newChr, newRefCoordinate.newRefStart, newRefCoordinate.newRefStop)
        queryStrand = newRefCoordinate.queryStrand
        if col_id not in col_id2index:
            col_id2index[col_id] = len(col_id2index)
            col_id_ls.append(col_id)
        if sampleID not in row_id2index:
            row_id2index[sampleID] = len(row_id2index)
            row_id_ls.append(sampleID)
        if queryStrand == "-":
            genotype = SNP.reverseComplement(genotype)
        row_col_index2genotype[(row_id2index[sampleID], col_id2index[col_id])] = genotype
    del reader
    inf.close()    #2013 fix: the input handle was never closed (resource leak)
    data_matrix = numpy.zeros([len(row_id_ls), len(col_id2index)], dtype=numpy.int8)
    for row_col_index, genotype in row_col_index2genotype.iteritems():
        row_index, col_index = row_col_index[:2]
        data_matrix[row_index, col_index] = SNP.nt2number[genotype]
    sys.stderr.write("\n")
    snpData = SNP.SNPData(row_id_ls=row_id_ls, col_id_ls=col_id_ls, data_matrix=data_matrix)
    snpData.tofile(newSNPDataOutputFname)
def run(self):
    """
    Rank candidate sample-label swaps by how much they would reduce the
    discrepancy (median absolute delta) between pedigree kinship and
    genotype-derived IBD (plink PI_HAT). For each of the 50 worst source
    monkeys, output the top 5 target monkeys whose label swap would shrink
    the source monkey's median |kinship - IBD| the most, alongside DB sex
    and plink-inferred sex for both monkeys.
    """
    if self.debug:
        import pdb
        pdb.set_trace()
    db_vervet = VervetDB.VervetDB(drivername=self.drivername, username=self.db_user, password=self.db_passwd, \
                    hostname=self.hostname, database=self.dbname, schema=self.schema, port=self.port)
    db_vervet.setup(create_tables=False)
    self.db_vervet = db_vervet
    kinshipData = self.getMonkeyKinshipData(inputFname=self.inputFname)    #kinshipData diagonal is set to 1 inside
    #2013 fix: keyword names must be rowIDHeader/colIDHeader/rowIDIndex/colIDIndex to
    #	match readAdjacencyListDataIntoMatrix()'s signature, as every other call site
    #	in this file does; the old id1Header/id2Header/... keywords raise TypeError.
    ibdData = SNP.readAdjacencyListDataIntoMatrix(inputFname=self.plinkIBDCheckOutputFname, rowIDHeader="IID1", colIDHeader="IID2", \
                    rowIDIndex=None, colIDIndex=None, \
                    dataHeader="PI_HAT", dataIndex=None, hasHeader=True)
    monkey_id2plinkSex = SNP.getKey2ValueFromMatrixLikeFile(inputFname=self.plinkSexCheckOutputFname, \
                    keyHeaderLs=['IID'], valueHeaderLs=['SNPSEX'], keyIndexLs=None, valueIndexLs=None, \
                    hasHeader=True, valueDataType=int)
    kinshipIBDDeltaData = self.createDeltaMatrix(kinshipData=kinshipData, ibdData=ibdData)
    queueData = self.createKinshipIBDDeltaQueue(kinshipIBDDeltaData)
    kinshipIBDDeltaQueue = queueData.kinshipIBDDeltaQueue
    monkey_id2medianAbsDelta = queueData.monkey_id2medianAbsDelta
    writer = csv.writer(open(self.outputFname, 'w'), delimiter='\t')
    header = ['sourceMonkeyID', 'medianAbsDelta', 'noOfNonMissing', 'sourceMonkeySex', 'sourceMonkeyPlinkSex', \
            'sourceMonkeyMedianAbsDeltaDropAfterSwap', \
            'targetMonkeyID', 'sourceMonkeyNoOfNonMissingAfterSwap', \
            'targetMonkeyMedianAbsDelta', 'targetMonkeyNoOfNonMissing', 'targetMonkeySex', 'targetMonkeyPlinkSex', \
            'targetMonkeyMedianAbsDeltaAfterSwap', 'targetMonkeyNoOfNonMissingAfterSwap']
    writer.writerow(header)
    i=0
    while i <50 and len(kinshipIBDDeltaQueue)>0:
        #queue items are (-medianAbsDelta, ...), so heappop yields the largest delta first
        negativeMedianAbsDelta, sourceMonkeyID, noOfNonMissing = heapq.heappop(kinshipIBDDeltaQueue)[:3]
        medianAbsDelta = -negativeMedianAbsDelta
        sourceMonkeyDBEntry = self.getMonkeyDBEntry(db_vervet=db_vervet, ucla_id=sourceMonkeyID)
        # 2012.8.22 draw some histogram to check what data looks like
        #self.drawKinshipIBDDeltaVectorHistogram(kinshipIBDDeltaData=kinshipIBDDeltaData, row_id=sourceMonkeyID, \
        #	outputFnamePrefix=self.outputFnamePrefix)
        medianAbsDeltaIncreaseQueue = []
        for targetMonkeyID in kinshipData.row_id_ls:
            if targetMonkeyID==sourceMonkeyID:
                continue
            #get the updated median delta for the source monkey if its label were swapped with the target's
            pdata = self.calculateMedianAbsDelta(kinshipData=kinshipData, \
                            kinshipDataMonkeyID=targetMonkeyID, ibdData=ibdData, ibdDataMonkeyID=sourceMonkeyID)
            sourceMonkeyMedianAbsDeltaAfterSwap = pdata.medianAbsDelta
            sourceMonkeyNoOfNonMissingAfterSwap = pdata.noOfNonMissing
            #get the updated median delta for the target monkey under the same swap
            pdata = self.calculateMedianAbsDelta(kinshipData=kinshipData, \
                            kinshipDataMonkeyID=sourceMonkeyID, ibdData=ibdData, ibdDataMonkeyID=targetMonkeyID)
            targetMonkeyMedianAbsDeltaAfterSwap = pdata.medianAbsDelta
            targetMonkeyNoOfNonMissingAfterSwap = pdata.noOfNonMissing
            if sourceMonkeyMedianAbsDeltaAfterSwap is not None:
                #record the candidate and how much the source's median delta would change
                pdata = monkey_id2medianAbsDelta.get(targetMonkeyID)
                if pdata:
                    targetMonkeyMedianAbsDelta = pdata.medianAbsDelta
                    targetMonkeyNoOfNonMissing = pdata.noOfNonMissing
                else:
                    targetMonkeyMedianAbsDelta = None
                    targetMonkeyNoOfNonMissing = None
                item = [sourceMonkeyMedianAbsDeltaAfterSwap-medianAbsDelta, targetMonkeyID, sourceMonkeyNoOfNonMissingAfterSwap, \
                        targetMonkeyMedianAbsDelta, targetMonkeyNoOfNonMissing, targetMonkeyMedianAbsDeltaAfterSwap, \
                        targetMonkeyNoOfNonMissingAfterSwap]
                #min-heap: the target that increases the median delta the least (or drops it
                #the most) is the prime candidate for a label swap
                heapq.heappush(medianAbsDeltaIncreaseQueue, item)
        i+=1
        #output the top 5 swap candidates for this source monkey
        j = 0
        while j<5 and len(medianAbsDeltaIncreaseQueue)>0:
            sourceMonkeyMedianAbsDeltaDropAfterSwap, targetMonkeyID, sourceMonkeyNoOfNonMissingAfterSwap, \
                targetMonkeyMedianAbsDelta, targetMonkeyNoOfNonMissing, targetMonkeyMedianAbsDeltaAfterSwap, targetMonkeyNoOfNonMissingAfterSwap =\
                heapq.heappop(medianAbsDeltaIncreaseQueue)[:7]
            #2013 fix: fetch the DB entry of the monkey actually popped from the queue;
            #	the original reused the stale targetMonkeyDBEntry left over from the last
            #	iteration of the scoring loop above, reporting the wrong targetMonkeySex.
            targetMonkeyDBEntry = self.getMonkeyDBEntry(db_vervet=db_vervet, ucla_id=targetMonkeyID)
            #guard against missing DB entries, consistent with the chi-sq ranking run()
            sourceMonkeySex = sourceMonkeyDBEntry.codeSexInNumber() if sourceMonkeyDBEntry else None
            sourceMonkeyPlinkSex = monkey_id2plinkSex.get(sourceMonkeyID)
            targetMonkeySex = targetMonkeyDBEntry.codeSexInNumber() if targetMonkeyDBEntry else None
            targetMonkeyPlinkSex = monkey_id2plinkSex.get(targetMonkeyID)
            data_row = [sourceMonkeyID, medianAbsDelta, noOfNonMissing, sourceMonkeySex, sourceMonkeyPlinkSex,\
                    sourceMonkeyMedianAbsDeltaDropAfterSwap, targetMonkeyID, sourceMonkeyNoOfNonMissingAfterSwap,\
                    targetMonkeyMedianAbsDelta, targetMonkeyNoOfNonMissing, targetMonkeySex, targetMonkeyPlinkSex, targetMonkeyMedianAbsDeltaAfterSwap,\
                    targetMonkeyNoOfNonMissingAfterSwap]
            writer.writerow(data_row)
            j+= 1
    del writer
def run(self):
    """
    Rank monkeys by a chi-square statistic of the (kinship - IBD) delta vector,
    to flag samples whose genotype-derived IBD disagrees most with pedigree
    kinship. Optionally (minAbsDeltaForOutlier>0) first cut off and output
    outlier pairs, and PCA-order the |delta| matrix. Writes a ranked TSV with
    DB sex and plink-inferred sex per monkey.
    """
    if self.debug:
        import pdb
        pdb.set_trace()
    db_vervet = VervetDB.VervetDB(drivername=self.drivername, username=self.db_user, password=self.db_passwd, \
                    hostname=self.hostname, database=self.dbname, schema=self.schema, port=self.port)
    db_vervet.setup(create_tables=False)
    self.db_vervet = db_vervet
    kinshipData = self.getMonkeyKinshipData(inputFname=self.inputFname)
    #set kinshipData diagonal to 1
    ibdData = SNP.readAdjacencyListDataIntoMatrix(inputFname=self.plinkIBDCheckOutputFname, rowIDHeader="IID1", colIDHeader="IID2", \
                    rowIDIndex=None, colIDIndex=None, \
                    dataHeader="PI_HAT", dataIndex=None, hasHeader=True)
    if self.minAbsDeltaForOutlier>0:
        #2012.8.23 cut data off for Sue
        if self.kinshipMonkeyIDSetFname:
            #keep only monkeys with zero mismatches and >30 non-missing comparisons
            monkeyID2dataTuple = SNP.getKey2ValueFromMatrixLikeFile(inputFname=self.kinshipMonkeyIDSetFname, keyHeaderLs=['monkeyID'], \
                    valueHeaderLs=['noOfMismatches', 'noOfNonMissing'], keyIndexLs=None, valueIndexLs=None, \
                    hasHeader=True, valueDataType=float)
            kinshipMonkeyIDSet = set()
            for monkeyID, dataTuple in monkeyID2dataTuple.iteritems():
                if dataTuple[0]==0 and dataTuple[1]>30:
                    kinshipMonkeyIDSet.add(monkeyID)
            sys.stderr.write("%s monkeys in kinshipMonkeyIDSet.\n"%(len(kinshipMonkeyIDSet)))
        else:
            kinshipMonkeyIDSet = None
        if self.outputFnamePrefix:
            self.cutOffKinshipIBDDeltaAndOutput(db_vervet=db_vervet, kinshipData=kinshipData, ibdData=ibdData, \
                    outputFnamePrefix=self.outputFnamePrefix, minAbsDelta=self.minAbsDeltaForOutlier, kinshipMonkeyIDSet=kinshipMonkeyIDSet)
            #2012.8.24 output the delta matrix in PC1 order
            self.PCAOnAbsKinshipIBDDeltaMatrix(kinshipData=kinshipData, ibdData=ibdData, outputFnamePrefix=self.outputFnamePrefix)
    if self.plinkSexCheckOutputFname:
        monkey_id2plinkSex = SNP.getKey2ValueFromMatrixLikeFile(inputFname=self.plinkSexCheckOutputFname, \
                    keyHeaderLs=['IID'], valueHeaderLs=['SNPSEX'], keyIndexLs=None, valueIndexLs=None, \
                    hasHeader=True, valueDataType=int)
    else:
        monkey_id2plinkSex = {}
    #signed deltas here (takeAbs=False); abs is applied downstream where needed
    kinshipIBDDeltaData = self.createDeltaMatrix(kinshipData=kinshipData, ibdData=ibdData, takeAbs=False)
    #trim the top 20% most extreme values before estimating mean/std of the delta
    meanStdData = self.estimateAbsDeltaMeanStd(kinshipIBDDeltaData=kinshipIBDDeltaData, excludeTopFraction=0.2)
    queueData = self.createKinshipIBDDeltaChiSqStatQueue(kinshipData=kinshipData, ibdData=ibdData, \
                    mean=meanStdData.mean, std=meanStdData.std)
    queue = queueData.queue
    monkey_id2queueData = queueData.monkey_id2queueData
    writer = csv.writer(open(self.outputFname, 'w'), delimiter='\t')
    header = ['rank', 'monkeyID', 'chiSqStat', 'noOfNonMissing', 'chiSqPvalue', 'monkeySex','monkeyPlinkSex']
    #if self.iterativeAlgorithm:
    #	header.extend(['chiSqStatIter', 'noOfNonMissingIter', 'chiSqPvalueIter'])
    writer.writerow(header)
    i=0
    while i<5000 and len(queue)>0:
        #queue holds (-chiSqStat, ...): pop the monkey with the largest statistic
        minusChiSqStat, sourceMonkeyID, noOfNonMissing, chiSqPvalue = heapq.heappop(queue)[:4]
        chiSqStat = -minusChiSqStat
        sourceMonkeyDBEntry = self.getMonkeyDBEntry(db_vervet=db_vervet, ucla_id=sourceMonkeyID)
        if sourceMonkeyDBEntry:
            sourceMonkeySex = sourceMonkeyDBEntry.codeSexInNumber()
        else:
            sourceMonkeySex = None
        sourceMonkeyPlinkSex = monkey_id2plinkSex.get(sourceMonkeyID)
        data_row = [i, sourceMonkeyID, chiSqStat, noOfNonMissing, chiSqPvalue, sourceMonkeySex, sourceMonkeyPlinkSex]
        if self.iterativeAlgorithm:
            #dead code kept as-is (no-op string statement)
            """
            if i>0:	#calculate the new chisq stat and p-value.
                chiSqStatData = self.calculateChiSqStatOfDeltaVector(kinshipData=kinshipData, kinshipDataMonkeyID=sourceMonkeyID, \
                        ibdData=ibdData, ibdDataMonkeyID=sourceMonkeyID,\
                        mean=meanStdData.mean, std=meanStdData.std)
                noOfNonMissing = chiSqStatData.noOfNonMissing
                chiSqStat = chiSqStatData.chiSqStat
                chiSqPvalue = chiSqStatData.chiSqPvalue
                data_row.extend([chiSqStat,noOfNonMissing, chiSqPvalue])
            """
            #presumably re-scores the remaining queue after dropping this monkey's
            #data from consideration — confirm against updateKinshipIBDDeltaChiSqStatQueue()
            queueData = self.updateKinshipIBDDeltaChiSqStatQueue(queue=queue, kinshipData=kinshipData, ibdData=ibdData, \
                    mean=meanStdData.mean, std=meanStdData.std, dropMonkeyID=sourceMonkeyID)
            #2012.8.23 old way not very efficient
            #remove itself.
            #	ibdDataIndex = ibdData.row_id2row_index.get(sourceMonkeyID)
            #	if ibdDataIndex:
            #		ibdData.data_matrix[ibdDataIndex, :] = numpy.nan
            #		ibdData.data_matrix[:, ibdDataIndex] = numpy.nan
            #		ibdData.data_matrix.mask[ibdDataIndex, :] = True
            #		ibdData.data_matrix.mask[:, ibdDataIndex] = True
            #queueData = self.createKinshipIBDDeltaChiSqStatQueue(kinshipData=kinshipData, ibdData=ibdData, \
            #			mean=meanStdData.mean, std=meanStdData.std,\
            #			given_row_id_ls=[row[1] for row in queue])
            queue = queueData.queue
            monkey_id2queueData = queueData.monkey_id2queueData
        writer.writerow(data_row)
        i+= 1
    del writer