def constructPedigreeGraphFromPOEdgeFile(self, inputFname=None):
    """
    Build a directed parent -> child graph from a parent-offspring edge file.

    2012.8.23
        inputFname is output of vervet/src/pedigree/DiscoverParentOffspringFromPlinkIBD.py

    Arguments:
        inputFname: path of a file whose header contains the columns
            "parentID", "childID" and "distToPOVector".

    Returns:
        PassingData with two attributes:
            DG: networkx DiGraph holding one parentID -> childID edge per row;
                the edge attribute "weight" is float(distToPOVector).
            childNodeSet: set of every childID encountered.
    """
    sys.stderr.write("Constructing pedigree-graph out of %s ..."%(inputFname))
    DG = nx.DiGraph()
    childNodeSet = set()

    reader = MatrixFile(inputFname)
    try:
        reader.constructColName2IndexFromHeader()
        parentIDIndex = reader.getColIndexGivenColHeader("parentID")
        childIDIndex = reader.getColIndexGivenColHeader("childID")
        distToPOVectorIndex = reader.getColIndexGivenColHeader("distToPOVector")
        for row in reader:
            childID = row[childIDIndex]
            childNodeSet.add(childID)
            parentID = row[parentIDIndex]
            distToPOVector = float(row[distToPOVectorIndex])
            DG.add_edge(parentID, childID, weight=distToPOVector)
    finally:
        # release the input file even when a row fails to parse,
        # instead of relying on "del reader" and garbage collection
        reader.close()

    # connected components are only defined on undirected graphs, hence to_undirected()
    sys.stderr.write("%s children, %s nodes. %s edges. %s connected components.\n"%(
        len(childNodeSet), DG.number_of_nodes(), DG.number_of_edges(),
        nx.number_connected_components(DG.to_undirected())))
    return PassingData(DG=DG, childNodeSet=childNodeSet)
def run(self):
    """
    Filter loci by their number of mendel errors and write the survivors out.

    Reads self.inputFname, whose header must contain the columns 'N'
    (number of mendel errors) and 'SNP' (locus ID in the form chromosome_position).
    Every locus with N <= self.maxNoOfMendelError is written to
    self.outputFname as a tab-delimited row (chromosome, position,
    noOfMendelErrors), preceded by a header row.

    If self.debug is set, drops into pdb before doing anything.

    Raises:
        ValueError: if N is not an integer, or if a SNP ID does not split
            into exactly two fields on '_'.
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    reader = MatrixFile(self.inputFname)
    try:
        reader.constructColName2IndexFromHeader()
        noOfMendelErrorColumnIndex = reader.getColIndexGivenColHeader(colHeader='N')
        SNPIDColumnIndex = reader.getColIndexGivenColHeader(colHeader='SNP')

        counter = 0
        real_counter = 0
        # the with-block guarantees the output is flushed and closed;
        # "del writer" alone never closed the underlying file handle
        with open(self.outputFname, 'w') as outputFile:
            writer = csv.writer(outputFile, delimiter='\t')
            writer.writerow(['chromosome', 'position', 'noOfMendelErrors'])
            for row in reader:
                SNPID = row[SNPIDColumnIndex]
                noOfMendelErrors = int(row[noOfMendelErrorColumnIndex])
                if noOfMendelErrors <= self.maxNoOfMendelError:
                    # strict two-way unpacking: an ID with extra '_' raises ValueError
                    chromosome, position = SNPID.split('_')
                    writer.writerow([chromosome, position, noOfMendelErrors])
                    real_counter += 1
                counter += 1
    finally:
        reader.close()

    sys.stderr.write("%s/%s lines outputted.\n"%(real_counter, counter))
def getLocusID2MissingFraction(self, inputFname=None):
    """
    Map each locus to the smallest missing statistic reported for it.

    2014.01.08

    Arguments:
        inputFname: path of a file whose header contains the columns
            "locusID" and "occurrence_byFixedValue".

    Returns:
        dict keyed by (chromosome, start, start), where chromosome and start
        are the first two '_'-separated fields of locusID (start as int).
        The value is the lowest float(occurrence_byFixedValue) seen for that key.
    """
    sys.stderr.write("Reading in the missing statistics from %s ... " % (inputFname))
    locusID2Stat = {}
    matrixReader = MatrixFile(inputFname=inputFname)
    matrixReader.constructColName2IndexFromHeader()
    idColumn = matrixReader.getColIndexGivenColHeader("locusID")
    statColumn = matrixReader.getColIndexGivenColHeader("occurrence_byFixedValue")

    noOfRows = 0
    for row in matrixReader:
        # only the first two fields of the ID are used; any further ones are ignored
        chromosome, startField = row[idColumn].split('_')[:2]
        position = int(startField)
        value = float(row[statColumn])
        locusKey = (chromosome, position, position)
        previous = locusID2Stat.get(locusKey)
        # keep the lowest value when a locus shows up more than once
        if previous is None or value < previous:
            locusID2Stat[locusKey] = value
        noOfRows += 1
    del matrixReader

    sys.stderr.write(" %s unique loci with missing fraction out of %s total loci.\n" %
        (len(locusID2Stat), noOfRows))
    return locusID2Stat
def getMonkeyID2Coverage(self, inputFname):
    """
    Read the target sequencing coverage of each monkey from a ranked list.

    2012.9.4 copied from vervet/src/misc.py
    2012.2.10
        inputFname is output of SequencingStrategy.assignVRCSequencePriorityBasedOnPedigree()
        + manual change of top ones

    Arguments:
        inputFname: path of a file whose header contains the columns
            "UCLAID", "pre-set-coverage" and "future coverage".

    Returns:
        dict mapping UCLAID to max(future coverage, pre-set coverage).
        A coverage cell that is empty (or, for "future coverage", absent
        because the row is too short) counts as 0.
    """
    sys.stderr.write("Reading the list of ranked monkeys from %s ..." % (inputFname))
    to_sequence_monkey_id2coverage = {}

    reader = MatrixFile(inputFname)
    try:
        reader.constructColName2IndexFromHeader()
        monkey_id_index = reader.getColIndexGivenColHeader("UCLAID")
        pre_set_coverage_index = reader.getColIndexGivenColHeader("pre-set-coverage")
        future_coverage_index = reader.getColIndexGivenColHeader("future coverage")

        for row in reader:
            monkey_id = row[monkey_id_index]

            pre_set_coverage = row[pre_set_coverage_index]
            pre_set_coverage = float(pre_set_coverage) if pre_set_coverage else 0

            future_coverage = 0
            # rows may be shorter than the header; an empty cell is treated the same
            # way as for pre-set-coverage instead of crashing on float('')
            if len(row) > future_coverage_index and row[future_coverage_index]:
                future_coverage = float(row[future_coverage_index])

            to_sequence_monkey_id2coverage[monkey_id] = max(future_coverage, pre_set_coverage)
    finally:
        reader.close()

    sys.stderr.write(" %s monkeys are to-be-sequenced.\n" % (len(to_sequence_monkey_id2coverage)))
    return to_sequence_monkey_id2coverage
def getSampleID2IndividualData_UNGC(self, inputFname=None):
    """
    Collect the sample name and library indices of each sample in a UNGC sample sheet.

    2013.04.04
        Format is like this from UNGC = UCLA Neuroscience Genomics Core:
            FCID Lane sample ID sample code sample name Index Description SampleProject
            D1HYNACXX 1 Ilmn Human control pool ( 4plex) IP1 INDEX IS UNKNOWN prepared by Illumina (4 plex pool) 2013-029A
            D1HYNACXX 2 UNGC Human Sample 1 S1 AS001A ATTACTCG TruSeq DNA PCR Free beta kit 2013-029A

    Arguments:
        inputFname: path of a comma-delimited file whose header contains the
            columns "sample ID", "sample name" and "Index".

    Returns:
        dict mapping sampleID (spaces replaced by '_') to a PassingData with
            sampleName: the sample name of that sample ID.
            libraryIndexList: the "Index" value of every row of that sample ID,
                in file order.

    Raises:
        RuntimeError: if one sample ID is associated with two different sample names.
    """
    sys.stderr.write("Getting sampleID2IndividualData from %s ..."%(inputFname))
    sampleID2IndividualData = {}

    reader = MatrixFile(inputFname, openMode='r', delimiter=',')
    try:
        reader.constructColName2IndexFromHeader()
        sampleIDIndex = reader.getColIndexGivenColHeader("sample ID")
        sampleNameIndex = reader.getColIndexGivenColHeader("sample name")
        libraryIndexIndex = reader.getColIndexGivenColHeader("Index")

        for row in reader:
            sampleID = row[sampleIDIndex].replace(' ', '_')    #2013.04.04 stupid quirks
            sampleName = row[sampleNameIndex]
            libraryIndex = row[libraryIndexIndex]
            if sampleID not in sampleID2IndividualData:
                sampleID2IndividualData[sampleID] = PassingData(sampleName=sampleName, libraryIndexList=[])
            individualData = sampleID2IndividualData[sampleID]
            if sampleName != individualData.sampleName:
                errorMessage = "Error: sampleID %s is associated with two different sample names (%s, %s).\n"%\
                    (sampleID, sampleName, individualData.sampleName)
                sys.stderr.write(errorMessage)
                # a bare "raise" has no active exception here; Python 3 turns that into
                # a RuntimeError, so raise that type explicitly, with a real message
                raise RuntimeError(errorMessage.strip())
            individualData.libraryIndexList.append(libraryIndex)
    finally:
        # the reader used to be left open
        reader.close()

    sys.stderr.write("%s entries.\n"%(len(sampleID2IndividualData)))
    return sampleID2IndividualData
def constructPedigreeGraphFromPlinkIBD(self, inputFname=None, maxDistanceToPOVector=0.04, drawDistribution=False, outputFnamePrefix=None):
    """
    Infer parent-offspring edges from pairwise IBD estimates and put them in a graph.

    2012.8.14

    For every row, the Euclidean distance between (Z0, Z1, Z2) and the
    parent-offspring vector (0, 1, 0) is computed. Pairs within
    maxDistanceToPOVector become an edge; the individual with the larger
    integer ID is taken as the child.

    Arguments:
        inputFname: path of a file whose header contains the columns
            "IID1", "IID2", "Z0", "Z1" and "Z2". Both IDs must parse as int.
        maxDistanceToPOVector: largest distance at which a pair is still
            accepted as parent-offspring.
        drawDistribution: if true (and outputFnamePrefix is given), draw a
            histogram of all distances.
        outputFnamePrefix: prefix of the histogram file, which is named
            <prefix>_IBDVector2POVectorDist_hist.png.

    Returns:
        PassingData with
            DG: networkx DiGraph of parentID -> childID edges, weight = distance.
            childNodeSet: set of all child IDs.
    """
    sys.stderr.write("Constructing pedigree-graph out of plink-ibd %s ..."%(inputFname))
    pedigreeGraph = nx.DiGraph()
    children = set()

    ibdReader = MatrixFile(inputFname)
    ibdReader.constructColName2IndexFromHeader()
    id1Column = ibdReader.getColIndexGivenColHeader("IID1")
    id2Column = ibdReader.getColIndexGivenColHeader("IID2")
    z0Column = ibdReader.getColIndexGivenColHeader("Z0")
    z1Column = ibdReader.getColIndexGivenColHeader("Z1")
    z2Column = ibdReader.getColIndexGivenColHeader("Z2")

    poVector = numpy.array([0,1,0.0])
    # distances are only kept when a histogram is going to be drawn
    keepDistances = bool(drawDistribution and outputFnamePrefix)
    distanceList = []
    noOfRows = 0
    noOfPOPairs = 0
    for row in ibdReader:
        # IDs are turned into integers so that they can be compared (as a proxy of age)
        firstID = int(row[id1Column])
        secondID = int(row[id2Column])
        ibdVector = numpy.array([float(row[z0Column]), float(row[z1Column]), float(row[z2Column])])
        distance = numpy.linalg.norm(poVector-ibdVector)
        if keepDistances:
            distanceList.append(distance)
        if distance<=maxDistanceToPOVector:
            # smaller ID = parent, larger ID = child
            parentID, childID = sorted((firstID, secondID))
            pedigreeGraph.add_edge(parentID, childID, weight=distance)
            children.add(childID)
            noOfPOPairs += 1
        noOfRows += 1
    del ibdReader

    noOfComponents = nx.number_connected_components(pedigreeGraph.to_undirected())
    sys.stderr.write("%s out of %s lines become PO pairs. %s children, %s nodes. %s edges. %s connected components.\n"%(
        noOfPOPairs, noOfRows, len(children), pedigreeGraph.number_of_nodes(),
        pedigreeGraph.number_of_edges(), noOfComponents))

    if keepDistances:
        histogramFname = '%s_IBDVector2POVectorDist_hist.png'%(outputFnamePrefix)
        yh_matplotlib.drawHist(distanceList, title='',
            xlabel_1D="dist(ZVector,POVector)", xticks=None,
            outputFname=histogramFname, min_no_of_data_points=10,
            needLog=True,
            dpi=200, min_no_of_bins=25)
    return PassingData(DG=pedigreeGraph, childNodeSet=children)
def getMendelErrorIndividualLocusData(self, mendelErrorFname=None, individualID2Index=None):
    """
    Group the individuals involved in mendel errors by locus.

    2013.1.29

    Arguments:
        mendelErrorFname: path of a file whose header contains the column
            'KID' (individual ID). The locus ID is read from the 4th column.
        individualID2Index: dict mapping individual ID to its index.

    Returns:
        dict mapping locus ID to the list of indices (one per row, in file
        order) of the individuals with a mendel error at that locus.

    Exits the program with status 3 if an individual ID in the file is missing
    from individualID2Index.
    """
    sys.stderr.write("Getting data on loci involved in mendel-errors from %s ..."%(mendelErrorFname))
    locus_id2individual_index_ls = {}

    reader = MatrixFile(inputFname=mendelErrorFname)
    try:
        reader.constructColName2IndexFromHeader()
        # looked up once here rather than once per row
        individualIDColumnIndex = reader.getColIndexGivenColHeader('KID')
        counter = 0
        for row in reader:
            individual_id = row[individualIDColumnIndex]
            if individual_id not in individualID2Index:
                sys.stderr.write("Individual %s not in individualID2Index.\n"%(individual_id))
                sys.exit(3)
            index = individualID2Index[individual_id]
            # NOTE(review): hard-coded position, presumably the SNP column -- confirm against the input format
            snp_id = row[3]
            locus_id2individual_index_ls.setdefault(snp_id, []).append(index)
            counter += 1
    finally:
        # also runs on the sys.exit(3) path, so the file is always released
        reader.close()

    sys.stderr.write(" %s calls of %s loci, involved in mendel errors.\n"%\
        (counter, len(locus_id2individual_index_ls)))
    return locus_id2individual_index_ls
def getMonkeyPair2IBDVector(self, inputFname=None):
    """
    Read the IBD estimates of every pair of monkeys.

    2012.9.10 return monkeyIDSet as well
    2012.9.6

    Arguments:
        inputFname: path of a file whose header contains the columns
            "IID1", "IID2", "PI_HAT", "Z0", "Z1" and "Z2".

    Returns:
        PassingData with
            monkeyPair2IBDVector: dict keyed by the sorted tuple of the two IDs.
                Each value is a PassingData with
                    IBD: float(PI_HAT)
                    IBDVector: list of Z0, Z1, Z2, each formatted as "%.2f"
                    IBDVectorStr: the same three strings joined by ','
                A pair that occurs again overwrites the earlier entry (with a
                warning on stderr).
            monkeyIDSet: set of every ID seen in either column.
    """
    sys.stderr.write("Getting monkey pair 2 IBD vector from %s ..." % (inputFname))
    monkeyPair2IBDVector = {}
    monkeyIDSet = set()

    reader = MatrixFile(inputFname)
    try:
        reader.constructColName2IndexFromHeader()
        monkey1IDIndex = reader.getColIndexGivenColHeader("IID1")
        monkey2IDIndex = reader.getColIndexGivenColHeader("IID2")
        IBDIndex = reader.getColIndexGivenColHeader("PI_HAT")
        Z0Index = reader.getColIndexGivenColHeader("Z0")
        Z1Index = reader.getColIndexGivenColHeader("Z1")
        Z2Index = reader.getColIndexGivenColHeader("Z2")

        for row in reader:
            monkey1ID = row[monkey1IDIndex]
            monkey2ID = row[monkey2IDIndex]
            # sorted, so that (a, b) and (b, a) end up under the same key
            key = tuple(sorted([monkey1ID, monkey2ID]))

            Z0 = float(row[Z0Index])
            Z1 = float(row[Z1Index])
            Z2 = float(row[Z2Index])
            IBD = float(row[IBDIndex])
            # a real list, not map(): on Python 3 map() yields a one-shot iterator
            # which the join below would exhaust, leaving IBDVector empty
            IBDVector = ["%.2f" % (z) for z in (Z0, Z1, Z2)]
            IBDVectorStr = ",".join(IBDVector)
            data = PassingData(IBD=IBD, IBDVector=IBDVector, IBDVectorStr=IBDVectorStr)

            if key in monkeyPair2IBDVector:
                sys.stderr.write(
                    "WARNING: key %s has value %s in monkeyPair2IBDVector already. value overwritten with %s.\n" %
                    (repr(key), monkeyPair2IBDVector.get(key), data)
                )
            monkeyPair2IBDVector[key] = data
            monkeyIDSet.add(monkey1ID)
            monkeyIDSet.add(monkey2ID)
    finally:
        # the reader used to be left open
        reader.close()

    sys.stderr.write(
        " %s pairs of IBD vectors for %s unique monkeys.\n" %
        (len(monkeyPair2IBDVector), len(monkeyIDSet))
    )
    return PassingData(monkeyPair2IBDVector=monkeyPair2IBDVector, monkeyIDSet=monkeyIDSet)
def readInCoordinateMap(self, coordinateMapFname=None):
    """
    Load the old-coordinate -> new-coordinate(s) map.

    2013.07.11
        columns of the input:
            querySNPID queryStrand queryChromosome queryStart queryStop
            queryRefBase queryAltBase queryAlignmentSpan queryAlignmentStart
            queryAlignmentStop newChr newRefStart newRefStop newRefBase
            targetAlignmentSpan targetAlignmentStart targetAlignmentStop

    Arguments:
        coordinateMapFname: path of the coordinate map file (with header).

    Returns:
        dict keyed by (queryChromosome, int(queryStart)). Each value is a list,
        in file order, of PassingData objects with the attributes strand,
        oldRefBase, oldAltBase, newChromosome, newStart (int), newStop (int)
        and newRefBase.
    """
    sys.stderr.write("Reading in the coordinate map from %s ..." % (coordinateMapFname))
    oldCoordinate2newCoordinateDataLs = {}
    mapReader = MatrixFile(inputFname=coordinateMapFname)
    mapReader.constructColName2IndexFromHeader()

    columnIndexOf = mapReader.getColIndexGivenColHeader
    oldChromosomeColumn = columnIndexOf("queryChromosome")
    oldStartColumn = columnIndexOf("queryStart")
    strandColumn = columnIndexOf("queryStrand")
    oldRefBaseColumn = columnIndexOf("queryRefBase")
    oldAltBaseColumn = columnIndexOf("queryAltBase")
    newChromosomeColumn = columnIndexOf("newChr")
    newStartColumn = columnIndexOf("newRefStart")
    newStopColumn = columnIndexOf("newRefStop")
    newRefBaseColumn = columnIndexOf("newRefBase")

    noOfNewCoordinates = 0
    for row in mapReader:
        oldKey = (row[oldChromosomeColumn], int(row[oldStartColumn]))
        strand = row[strandColumn]
        oldRefBase = row[oldRefBaseColumn]
        oldAltBase = row[oldAltBaseColumn]
        newChromosome = row[newChromosomeColumn]
        newStart = int(row[newStartColumn])
        newStop = int(row[newStopColumn])
        newRefBase = row[newRefBaseColumn]
        # one old coordinate may map to several new ones, hence a list per key
        newCoordinateDataLs = oldCoordinate2newCoordinateDataLs.setdefault(oldKey, [])
        newCoordinateDataLs.append(PassingData(strand=strand, oldRefBase=oldRefBase,
            oldAltBase=oldAltBase, newChromosome=newChromosome, newStart=newStart,
            newStop=newStop, newRefBase=newRefBase))
        noOfNewCoordinates += 1
    del mapReader

    sys.stderr.write("%s old coordinates with %s new coordinates.\n" %
        (len(oldCoordinate2newCoordinateDataLs), noOfNewCoordinates))
    return oldCoordinate2newCoordinateDataLs
def getLocusNewID2mapPvalue(self, liftOverLocusMapPvalueFname=None):
    """
    Map each lifted-over locus to the smallest map p-value reported for it.

    2014.01.04
        columns of the input:
            oldChromosome, oldStart, oldStop, oldStrand, newChromosome,
            newStart, newStop, mapPvalue

    Arguments:
        liftOverLocusMapPvalueFname: path of the file (with header).

    Returns:
        dict keyed by (newChromosome, int(newStart), int(newStop)); the value
        is the lowest float(mapPvalue) among all rows sharing that key.
    """
    sys.stderr.write("Reading in the coordinate map from %s ..." % (liftOverLocusMapPvalueFname))
    locusNewID2mapPvalue = {}
    pvalueReader = MatrixFile(inputFname=liftOverLocusMapPvalueFname)
    pvalueReader.constructColName2IndexFromHeader()

    columnIndexOf = pvalueReader.getColIndexGivenColHeader
    strandColumn = columnIndexOf("oldStrand")
    chromosomeColumn = columnIndexOf("newChromosome")
    startColumn = columnIndexOf("newStart")
    stopColumn = columnIndexOf("newStop")
    pvalueColumn = columnIndexOf("mapPvalue")

    noOfRows = 0
    for row in pvalueReader:
        # the strand is read but plays no part in the key or the value
        strand = row[strandColumn]
        locusKey = (row[chromosomeColumn], int(row[startColumn]), int(row[stopColumn]))
        pvalue = float(row[pvalueColumn])
        previous = locusNewID2mapPvalue.get(locusKey)
        # keep the lowest p-value when a locus shows up more than once
        if previous is None or pvalue < previous:
            locusNewID2mapPvalue[locusKey] = pvalue
        noOfRows += 1
    del pvalueReader

    sys.stderr.write("%s unique loci with map p-value out of %s total loci.\n" %
        (len(locusNewID2mapPvalue), noOfRows))
    return locusNewID2mapPvalue
def readInStats(self, inputFname=None):
    """
    Read the per-region switch statistics and total them up.

    2013.07.15

    Arguments:
        inputFname: path of a file whose header contains the columns
            "noOfSwitchPoints_by_noOfLociWithUniqueHit", "regionSpan" and
            "#sitesInInput2".

    Returns:
        PassingData with
            data_matrix: list of [switchFrequency (float), regionSpan (int),
                noOfLoci (int)], one entry per row in which all three cells
                are non-empty.
            totalSpan: sum of regionSpan over those rows.
            totalNoOfLoci: sum of noOfLoci over those rows.
    """
    sys.stderr.write("Reading stats from %s ..." % (inputFname))
    statReader = MatrixFile(inputFname)
    statReader.constructColName2IndexFromHeader()
    frequencyColumn = statReader.getColIndexGivenColHeader(
        "noOfSwitchPoints_by_noOfLociWithUniqueHit")
    spanColumn = statReader.getColIndexGivenColHeader("regionSpan")
    lociColumn = statReader.getColIndexGivenColHeader("#sitesInInput2")

    data_matrix = []
    totalSpan = 0
    totalNoOfLoci = 0
    noOfRows = 0
    for row in statReader:
        noOfRows += 1
        rawFrequency = row[frequencyColumn]
        rawSpan = row[spanColumn]
        rawNoOfLoci = row[lociColumn]
        # a row with any empty cell is counted, but contributes nothing
        if not (rawFrequency and rawSpan and rawNoOfLoci):
            continue
        switchFrequency = float(rawFrequency)
        # via float() first, so that values such as "12.0" are accepted
        regionSpan = int(float(rawSpan))
        noOfLoci = int(float(rawNoOfLoci))
        data_matrix.append([switchFrequency, regionSpan, noOfLoci])
        totalSpan += regionSpan
        totalNoOfLoci += noOfLoci
    statReader.close()

    sys.stderr.write(" %s valid entries (from %s rows) with totalSpan=%s, totalNoOfLoci=%s.\n"%\
        (len(data_matrix), noOfRows, totalSpan, totalNoOfLoci))
    return PassingData(data_matrix=data_matrix, totalSpan=totalSpan, totalNoOfLoci=totalNoOfLoci)
def setup(self, **keywords):
    """
    Run the parent setup, then build self.invariantPData.individualCode2readGroup.

    The read groups are read from self.readGroupFname: from the column named
    self.readGroupHeader if that is set, from the first column otherwise.
    Each read group is parsed by self.db_vervet.parseAlignmentReadGroup();
    when that yields an individualAlignment, the code of its individual is
    mapped to the read group. Read groups without an individualAlignment are
    skipped.

    Returns:
        1
    """
    AbstractMatrixFileWalker.setup(self, **keywords)

    # construct an individualCode2readGroup from readGroupFname
    self.invariantPData.individualCode2readGroup = {}
    readGroupReader = MatrixFile(inputFname=self.readGroupFname)
    readGroupReader.constructColName2IndexFromHeader()
    readGroupColumn = readGroupReader.getColIndexGivenColHeader(self.readGroupHeader) \
        if self.readGroupHeader else 0

    for row in readGroupReader:
        readGroup = row[readGroupColumn]
        individualAlignment = self.db_vervet.parseAlignmentReadGroup(readGroup).individualAlignment
        if not individualAlignment:
            continue
        individualCode = individualAlignment.individual_sequence.individual.code
        self.invariantPData.individualCode2readGroup[individualCode] = readGroup
    del readGroupReader
    return 1