def constructPedigreeGraphFromPOEdgeFile(self, inputFname=None):
		"""
		Build a directed pedigree graph from a parent-offspring edge table.

		inputFname is read through MatrixFile and needs a header containing
		the columns "parentID", "childID" and "distToPOVector" (the original
		note says it is the output of
		vervet/src/pedigree/DiscoverParentOffspringFromPlinkIBD.py).
		Each data row becomes one edge parentID -> childID whose "weight"
		attribute is float(distToPOVector).

		Returns a PassingData with:
			DG: the networkx DiGraph that was built,
			childNodeSet: set of every ID found in the childID column.
		"""
		sys.stderr.write("Constructing pedigree-graph out of %s ..."%(inputFname))
		pedigreeGraph = nx.DiGraph()
		childNodeSet = set()

		reader = MatrixFile(inputFname)
		reader.constructColName2IndexFromHeader()
		# resolve the column positions once, before streaming the rows
		parentColumn = reader.getColIndexGivenColHeader("parentID")
		childColumn = reader.getColIndexGivenColHeader("childID")
		distanceColumn = reader.getColIndexGivenColHeader("distToPOVector")

		for row in reader:
			child = row[childColumn]
			childNodeSet.add(child)
			parent = row[parentColumn]
			pedigreeGraph.add_edge(parent, child, weight=float(row[distanceColumn]))
		del reader

		# connected components are counted on the undirected view of the graph
		noOfComponents = nx.number_connected_components(pedigreeGraph.to_undirected())
		sys.stderr.write("%s children, %s nodes. %s edges. %s connected components.\n"%(\
										len(childNodeSet), pedigreeGraph.number_of_nodes(), \
										pedigreeGraph.number_of_edges(), noOfComponents))
		return PassingData(DG=pedigreeGraph, childNodeSet=childNodeSet)
	def run(self):
		"""
		Keep only the loci whose Mendel-error count is small enough.

		Reads self.inputFname through MatrixFile (its header must contain the
		columns 'SNP' and 'N') and writes a tab-delimited file
		self.outputFname with the header
			chromosome, position, noOfMendelErrors
		holding one row for every input row whose N is
		<= self.maxNoOfMendelError.

		Each SNP ID is split on '_' into exactly two parts
		(chromosome, position); an ID with a different number of '_'
		raises ValueError from the tuple unpacking.

		If self.debug is set, pdb is entered before any work is done.
		"""
		
		if self.debug:
			import pdb
			pdb.set_trace()
		
		reader = MatrixFile(self.inputFname)
		reader.constructColName2IndexFromHeader()
		noOfMendelErrorColumnIndex = reader.getColIndexGivenColHeader(colHeader='N')
		SNPIDColumnIndex = reader.getColIndexGivenColHeader(colHeader='SNP')
		
		counter = 0
		real_counter = 0
		# The "with" block guarantees the output is flushed and closed, even
		# when a row fails to parse; previously the handle was never closed.
		with open(self.outputFname, 'w') as outputFile:
			writer = csv.writer(outputFile, delimiter='\t')
			writer.writerow(['chromosome', 'position', 'noOfMendelErrors'])
			
			for row in reader:
				SNPID = row[SNPIDColumnIndex]
				noOfMendelErrors = int(row[noOfMendelErrorColumnIndex])
				if noOfMendelErrors <= self.maxNoOfMendelError:
					# "chromosome" rather than "chr" to avoid shadowing the builtin
					chromosome, pos = SNPID.split('_')
					writer.writerow([chromosome, pos, noOfMendelErrors])
					real_counter += 1
				counter += 1
		
		del reader
		sys.stderr.write("%s/%s lines outputted.\n"%(real_counter, counter))
示例#3
0
    def getLocusID2MissingFraction(self, inputFname=None):
        """
        Map every locus in inputFname to its smallest statistic.

        The file is read through MatrixFile; its header must contain the
        columns "locusID" and "occurrence_byFixedValue". The first two
        '_'-separated fields of locusID are taken as chromosome and
        (integer) start position.

        Returns a dict keyed by (chromosome, start, start) whose value is the
        lowest float seen in "occurrence_byFixedValue" for that key.
        """
        sys.stderr.write("Reading in the missing statistics from %s ... " %
                         (inputFname))
        locusID2Stat = {}

        reader = MatrixFile(inputFname=inputFname)
        reader.constructColName2IndexFromHeader()
        locusIDColumn = reader.getColIndexGivenColHeader("locusID")
        statColumn = reader.getColIndexGivenColHeader("occurrence_byFixedValue")

        noOfRows = 0
        for row in reader:
            chromosome, position = row[locusIDColumn].split('_')[:2]
            position = int(position)
            value = float(row[statColumn])

            # start and stop are the same position in the key
            key = (chromosome, position, position)
            previous = locusID2Stat.get(key)
            # first sighting, or a strictly lower value: keep the minimum
            if previous is None or value < previous:
                locusID2Stat[key] = value
            noOfRows += 1
        del reader

        sys.stderr.write(
            " %s unique loci with missing fraction out of %s total loci.\n" %
            (len(locusID2Stat), noOfRows))
        return locusID2Stat
示例#4
0
    def getMonkeyID2Coverage(self, inputFname):
        """
        Read the coverage assigned to each to-be-sequenced monkey.

        History (from the original notes):
            2012.9.4  copied from vervet/src/misc.py
            2012.2.10 inputFname is the output of
                SequencingStrategy.assignVRCSequencePriorityBasedOnPedigree()
                plus manual changes of the top ones.

        The file is read through MatrixFile; its header must contain the
        columns "UCLAID", "pre-set-coverage" and "future coverage".

        For every row the coverage is max(future coverage, pre-set coverage).
        A blank cell counts as 0 for either column, and a row that is too
        short to contain the "future coverage" column also counts as 0 there.

        Returns a dict: UCLAID -> coverage.
        """
        sys.stderr.write("Reading the list of ranked monkeys from %s ..." % (inputFname))
        reader = MatrixFile(inputFname)
        reader.constructColName2IndexFromHeader()

        monkey_id_index = reader.getColIndexGivenColHeader("UCLAID")
        pre_set_coverage_index = reader.getColIndexGivenColHeader("pre-set-coverage")
        future_coverage_index = reader.getColIndexGivenColHeader("future coverage")
        to_sequence_monkey_id2coverage = {}
        for row in reader:
            monkey_id = row[monkey_id_index]

            pre_set_cell = row[pre_set_coverage_index]
            pre_set_coverage = float(pre_set_cell) if pre_set_cell else 0

            # Rows may be shorter than the header; a missing cell and a blank
            # cell are both treated as "no future coverage". Previously a
            # blank cell crashed in float('').
            future_cell = ''
            if len(row) >= future_coverage_index + 1:
                future_cell = row[future_coverage_index]
            future_coverage = float(future_cell) if future_cell else 0

            to_sequence_monkey_id2coverage[monkey_id] = max(future_coverage, pre_set_coverage)
        del reader
        sys.stderr.write(" %s monkeys are to-be-sequenced.\n" % (len(to_sequence_monkey_id2coverage)))
        return to_sequence_monkey_id2coverage

        """
	def getSampleID2IndividualData_UNGC(self, inputFname=None):
		"""
		Collect sample name and library indexes per sample ID from a UNGC sheet.

		2013.04.04
			Format is like this from UNGC  = UCLA Neuroscience Genomics Core:
			FCID, Lane, sample ID, sample code, sample name, Index, Description, SampleProject
			D1HYNACXX, 2, UNGC Human Sample 1, S1, AS001A, ATTACTCG, TruSeq DNA PCR Free beta kit, 2013-029A

		The file is parsed with delimiter=','; only the columns "sample ID",
		"sample name" and "Index" are used.
		NOTE(review): the original docstring showed the example rows
		tab-separated while the reader uses ',' -- confirm the real format.

		Spaces in the sample ID are replaced by '_'.

		Returns a dict: sampleID -> PassingData(sampleName, libraryIndexList),
		where libraryIndexList holds the "Index" value of every row of that
		sample, in file order.

		Raises RuntimeError if one sample ID occurs with two different
		sample names (the same message is also written to stderr).
		"""
		sys.stderr.write("Getting  sampleID2IndividualData from %s ..."%(inputFname))
		sampleID2IndividualData = {}
		
		reader = MatrixFile(inputFname, openMode='r', delimiter=',')
		reader.constructColName2IndexFromHeader()
		sampleIDIndex = reader.getColIndexGivenColHeader("sample ID")
		sampleNameIndex = reader.getColIndexGivenColHeader("sample name")
		libraryIndexIndex = reader.getColIndexGivenColHeader("Index")
		
		for row in reader:
			sampleID = row[sampleIDIndex].replace(' ', '_')	#2013.04.04 stupid quirks
			sampleName = row[sampleNameIndex]
			libraryIndex = row[libraryIndexIndex]
			if sampleID not in sampleID2IndividualData:
				sampleID2IndividualData[sampleID] = PassingData(sampleName=sampleName, libraryIndexList=[])
			individualData = sampleID2IndividualData[sampleID]
			if sampleName != individualData.sampleName:
				errorMessage = "Error: sampleID %s is associated with two different sample names (%s, %s).\n"%\
								(sampleID, sampleName, individualData.sampleName)
				sys.stderr.write(errorMessage)
				# A bare "raise" here had no active exception to re-raise and
				# only produced an uninformative interpreter error; raise the
				# real message instead.
				raise RuntimeError(errorMessage.strip())
			individualData.libraryIndexList.append(libraryIndex)
		del reader
		
		sys.stderr.write("%s entries.\n"%(len(sampleID2IndividualData)))
		return sampleID2IndividualData
	def constructPedigreeGraphFromPlinkIBD(self, inputFname=None, maxDistanceToPOVector=0.04, drawDistribution=False, outputFnamePrefix=None):
		"""
		Build a directed parent->child graph from a plink IBD table.

		inputFname is read through MatrixFile; its header must contain the
		columns "IID1", "IID2", "Z0", "Z1" and "Z2".
		For each pair the Euclidean distance between (Z0, Z1, Z2) and the
		parent-offspring vector (0, 1, 0) is computed. A pair whose distance
		is <= maxDistanceToPOVector becomes an edge weighted by that distance.
		Both IDs are converted to int; the larger ID is used as the child and
		the smaller as the parent (the original comment says the integer
		conversion is there "so could compare age").

		When both drawDistribution and outputFnamePrefix are set, a histogram
		of all distances is drawn to
		<outputFnamePrefix>_IBDVector2POVectorDist_hist.png.

		Returns a PassingData with DG (the networkx DiGraph) and childNodeSet
		(set of all IDs used as a child).
		"""
		sys.stderr.write("Constructing pedigree-graph out of plink-ibd %s ..."%(inputFname))
		pedigreeGraph = nx.DiGraph()
		childNodeSet = set()
		# distances are only collected when a histogram was asked for and can be named
		wantHistogram = bool(drawDistribution and outputFnamePrefix)

		reader = MatrixFile(inputFname)
		reader.constructColName2IndexFromHeader()
		id1Column = reader.getColIndexGivenColHeader("IID1")
		id2Column = reader.getColIndexGivenColHeader("IID2")
		z0Column = reader.getColIndexGivenColHeader("Z0")
		z1Column = reader.getColIndexGivenColHeader("Z1")
		z2Column = reader.getColIndexGivenColHeader("Z2")

		poVector = numpy.array([0,1,0.0])
		noOfRows = 0
		noOfPOPairs = 0
		distanceList = []
		for row in reader:
			firstID = int(row[id1Column])
			secondID = int(row[id2Column])
			z0 = float(row[z0Column])
			z1 = float(row[z1Column])
			z2 = float(row[z2Column])
			distance = numpy.linalg.norm(poVector - numpy.array([z0, z1, z2]))
			if wantHistogram:
				distanceList.append(distance)
			noOfRows += 1
			if distance > maxDistanceToPOVector:
				continue
			# larger ID is the child, smaller ID is the parent
			parentID = min(firstID, secondID)
			childID = max(firstID, secondID)
			pedigreeGraph.add_edge(parentID, childID, weight=distance)
			childNodeSet.add(childID)
			noOfPOPairs += 1
		del reader

		noOfComponents = nx.number_connected_components(pedigreeGraph.to_undirected())
		sys.stderr.write("%s out of %s lines become PO pairs. %s children, %s nodes. %s edges. %s connected components.\n"%(\
							noOfPOPairs, noOfRows, len(childNodeSet), pedigreeGraph.number_of_nodes(), \
							pedigreeGraph.number_of_edges(), noOfComponents))
		if wantHistogram:
			histogramFname = '%s_IBDVector2POVectorDist_hist.png'%(outputFnamePrefix)
			yh_matplotlib.drawHist(distanceList, title='', \
								xlabel_1D="dist(ZVector,POVector)", xticks=None, \
								outputFname=histogramFname, min_no_of_data_points=10, \
								needLog=True, \
								dpi=200, min_no_of_bins=25)
		return PassingData(DG=pedigreeGraph, childNodeSet=childNodeSet)
示例#7
0
	def getMendelErrorIndividualLocusData(self, mendelErrorFname=None, individualID2Index=None):
		"""
		Group the individuals involved in Mendel errors by locus.

		mendelErrorFname is read through MatrixFile; its header must contain
		the column 'KID'. The locus ID is taken from the fourth column
		(index 3) of each row.
		NOTE(review): index 3 is hard-coded; presumably it is the SNP column
		of a plink .mendel file -- confirm against the input format.

		individualID2Index maps an individual ID (the 'KID' value) to an index.

		Returns a dict: locus ID -> list of individual indexes, one entry per
		input row, in file order.

		If a 'KID' value is missing from individualID2Index, a message is
		written to stderr and the process exits with status 3.
		"""
		sys.stderr.write("Getting data on loci involved in mendel-errors from %s ..."%(mendelErrorFname))
		locus_id2individual_index_ls = {}
		reader = MatrixFile(inputFname=mendelErrorFname)
		reader.constructColName2IndexFromHeader()
		# the header lookup does not depend on the row: do it once, not per row
		individualIDColumnIndex = reader.getColIndexGivenColHeader('KID')
		snpIDColumnIndex = 3
		counter = 0
		for row in reader:
			individual_id = row[individualIDColumnIndex]
			if individual_id not in individualID2Index:
				sys.stderr.write("Individual %s not in individualID2Index.\n"%(individual_id))
				sys.exit(3)
			index = individualID2Index[individual_id]
			snp_id = row[snpIDColumnIndex]
			locus_id2individual_index_ls.setdefault(snp_id, []).append(index)
			counter += 1
		del reader
		sys.stderr.write(" %s calls of %s loci, involved in mendel errors.\n"%\
						(counter, len(locus_id2individual_index_ls)))
		return locus_id2individual_index_ls
示例#8
0
    def getMonkeyPair2IBDVector(self, inputFname=None):
        """
        Read the IBD estimates of every monkey pair from a plink IBD table.

        History (from the original notes):
            2012.9.10 return monkeyIDSet as well
            2012.9.6

        inputFname is read through MatrixFile; its header must contain the
        columns "IID1", "IID2", "PI_HAT", "Z0", "Z1" and "Z2".

        Returns a PassingData with:
            monkeyPair2IBDVector: dict keyed by the sorted (ID, ID) tuple.
                Each value is a PassingData with
                    IBD: float(PI_HAT),
                    IBDVector: list of Z0, Z1, Z2 formatted as "%.2f" strings,
                    IBDVectorStr: those three strings joined by ','.
                A pair seen more than once is overwritten by its last row,
                and a warning is written to stderr.
            monkeyIDSet: set of every ID seen in either column.
        """
        sys.stderr.write("Getting monkey pair 2 IBD vector from %s  ..." % (inputFname))
        reader = MatrixFile(inputFname)
        reader.constructColName2IndexFromHeader()
        monkey1IDIndex = reader.getColIndexGivenColHeader("IID1")
        monkey2IDIndex = reader.getColIndexGivenColHeader("IID2")
        IBDIndex = reader.getColIndexGivenColHeader("PI_HAT")
        Z0Index = reader.getColIndexGivenColHeader("Z0")
        Z1Index = reader.getColIndexGivenColHeader("Z1")
        Z2Index = reader.getColIndexGivenColHeader("Z2")
        monkeyPair2IBDVector = {}
        monkeyIDSet = set()
        for row in reader:
            monkey1ID = row[monkey1IDIndex]
            monkey2ID = row[monkey2IDIndex]
            # the key is order-independent: (a, b) and (b, a) are the same pair
            key = tuple(sorted([monkey1ID, monkey2ID]))
            Z0 = float(row[Z0Index])
            Z1 = float(row[Z1Index])
            Z2 = float(row[Z2Index])
            IBD = float(row[IBDIndex])
            # Build a real list. With map() on Python 3 this was a one-shot
            # iterator that the join below exhausted, so the stored IBDVector
            # ended up empty.
            IBDVector = ["%.2f" % (z) for z in (Z0, Z1, Z2)]
            IBDVectorStr = ",".join(IBDVector)
            data = PassingData(IBD=IBD, IBDVector=IBDVector, IBDVectorStr=IBDVectorStr)
            if key in monkeyPair2IBDVector:
                sys.stderr.write(
                    "WARNING: key %s has value %s in monkeyPair2IBDVector already. value overwritten with %s.\n"
                    % (repr(key), monkeyPair2IBDVector.get(key), data)
                )
            monkeyPair2IBDVector[key] = data
            monkeyIDSet.add(monkey1ID)
            monkeyIDSet.add(monkey2ID)
        del reader
        sys.stderr.write(
            " %s pairs of IBD vectors for %s unique monkeys.\n" % (len(monkeyPair2IBDVector), len(monkeyIDSet))
        )
        return PassingData(monkeyPair2IBDVector=monkeyPair2IBDVector, monkeyIDSet=monkeyIDSet)
示例#9
0
    def readInCoordinateMap(self, coordinateMapFname=None):
        """
        Load the old-coordinate -> new-coordinate map of loci.

        2013.07.11 columns listed in the original notes:
            querySNPID, queryStrand, queryChromosome, queryStart, queryStop,
            queryRefBase, queryAltBase, queryAlignmentSpan,
            queryAlignmentStart, queryAlignmentStop, newChr, newRefStart,
            newRefStop, newRefBase, targetAlignmentSpan,
            targetAlignmentStart, targetAlignmentStop

        Columns actually read (looked up by header): queryChromosome,
        queryStart, queryStrand, queryRefBase, queryAltBase, newChr,
        newRefStart, newRefStop, newRefBase.

        Returns a dict keyed by (queryChromosome, int(queryStart)). Each
        value is a list with one PassingData per input row of that key,
        carrying strand, oldRefBase, oldAltBase, newChromosome, newStart,
        newStop and newRefBase (the start/stop values as int).
        """
        sys.stderr.write("Reading in the coordinate map from %s ..." %
                         (coordinateMapFname))
        oldCoordinate2newCoordinateDataLs = {}

        reader = MatrixFile(inputFname=coordinateMapFname)
        reader.constructColName2IndexFromHeader()
        lookup = reader.getColIndexGivenColHeader
        # columns describing the old (query) locus
        queryChrColumn = lookup("queryChromosome")
        queryStartColumn = lookup("queryStart")
        queryStrandColumn = lookup("queryStrand")
        queryRefColumn = lookup("queryRefBase")
        queryAltColumn = lookup("queryAltBase")
        # columns describing the new locus
        newChrColumn = lookup("newChr")
        newStartColumn = lookup("newRefStart")
        newStopColumn = lookup("newRefStop")
        newRefColumn = lookup("newRefBase")

        noOfRows = 0
        for row in reader:
            queryChr = row[queryChrColumn]
            queryStart = int(row[queryStartColumn])
            queryStrand = row[queryStrandColumn]
            queryRef = row[queryRefColumn]
            queryAlt = row[queryAltColumn]

            targetChr = row[newChrColumn]
            targetStart = int(row[newStartColumn])
            targetStop = int(row[newStopColumn])
            targetRef = row[newRefColumn]

            newCoordinate = PassingData(
                strand=queryStrand, oldRefBase=queryRef, oldAltBase=queryAlt,
                newChromosome=targetChr, newStart=targetStart,
                newStop=targetStop, newRefBase=targetRef)
            # one old coordinate may map to several new coordinates
            oldCoordinate2newCoordinateDataLs.setdefault(
                (queryChr, queryStart), []).append(newCoordinate)
            noOfRows += 1
        del reader

        sys.stderr.write("%s old coordinates with %s new coordinates.\n" %
                         (len(oldCoordinate2newCoordinateDataLs), noOfRows))
        return oldCoordinate2newCoordinateDataLs
示例#10
0
    def getLocusNewID2mapPvalue(self, liftOverLocusMapPvalueFname=None):
        """
        Map every new locus to its smallest map p-value.

        2014.01.04 columns listed in the original notes:
            oldChromosome, oldStart, oldStop, oldStrand, newChromosome,
            newStart, newStop, mapPvalue

        Columns looked up by header: oldStrand, newChromosome, newStart,
        newStop, mapPvalue. The strand is read but is not part of the result.

        Returns a dict keyed by (newChromosome, int(newStart), int(newStop))
        whose value is the lowest float(mapPvalue) seen for that key.
        """
        sys.stderr.write("Reading in the coordinate map from %s ..." %
                         (liftOverLocusMapPvalueFname))
        locusNewID2mapPvalue = {}

        reader = MatrixFile(inputFname=liftOverLocusMapPvalueFname)
        reader.constructColName2IndexFromHeader()
        lookup = reader.getColIndexGivenColHeader
        strandColumn = lookup("oldStrand")
        chromosomeColumn = lookup("newChromosome")
        startColumn = lookup("newStart")
        stopColumn = lookup("newStop")
        pvalueColumn = lookup("mapPvalue")

        noOfRows = 0
        for row in reader:
            # kept from the original: the strand cell is accessed but unused
            strand = row[strandColumn]
            locusKey = (row[chromosomeColumn], int(row[startColumn]),
                        int(row[stopColumn]))
            pvalue = float(row[pvalueColumn])

            known = locusNewID2mapPvalue.get(locusKey)
            # first sighting, or a strictly lower p-value: keep the minimum
            if known is None or pvalue < known:
                locusNewID2mapPvalue[locusKey] = pvalue
            noOfRows += 1
        del reader

        sys.stderr.write(
            "%s unique loci with map p-value out of %s total loci.\n" %
            (len(locusNewID2mapPvalue), noOfRows))
        return locusNewID2mapPvalue
    def readInStats(self, inputFname=None):
        """
        Read per-row switch statistics and sum up span and locus counts.

        inputFname is read through MatrixFile; its header must contain the
        columns "noOfSwitchPoints_by_noOfLociWithUniqueHit", "regionSpan"
        and "#sitesInInput2". A row is used only when all three cells are
        non-empty; the span and the locus count are converted via
        int(float(...)).

        Returns a PassingData with:
            data_matrix: list of [switchFrequency, regionSpan, noOfLoci],
            totalSpan: sum of regionSpan over the used rows,
            totalNoOfLoci: sum of noOfLoci over the used rows.
        """
        sys.stderr.write("Reading stats from %s ..." % (inputFname))

        reader = MatrixFile(inputFname)
        reader.constructColName2IndexFromHeader()
        frequencyColumn = reader.getColIndexGivenColHeader(
            "noOfSwitchPoints_by_noOfLociWithUniqueHit")
        spanColumn = reader.getColIndexGivenColHeader("regionSpan")
        lociColumn = reader.getColIndexGivenColHeader("#sitesInInput2")

        data_matrix = []
        totalSpan = 0
        totalNoOfLoci = 0
        noOfRows = 0
        for row in reader:
            noOfRows += 1
            frequencyCell = row[frequencyColumn]
            spanCell = row[spanColumn]
            lociCell = row[lociColumn]
            # skip rows in which any of the three cells is empty
            if not (frequencyCell and spanCell and lociCell):
                continue
            frequency = float(frequencyCell)
            span = int(float(spanCell))
            loci = int(float(lociCell))
            data_matrix.append([frequency, span, loci])
            totalSpan += span
            totalNoOfLoci += loci
        reader.close()

        sys.stderr.write(" %s valid entries (from %s rows) with totalSpan=%s, totalNoOfLoci=%s.\n"%\
            (len(data_matrix), noOfRows, totalSpan, totalNoOfLoci))
        return PassingData(data_matrix=data_matrix,
                           totalSpan=totalSpan,
                           totalNoOfLoci=totalNoOfLoci)
	def setup(self, **keywords):
		"""
		Run the parent setup, then index read groups by individual code.

		After AbstractMatrixFileWalker.setup(), self.readGroupFname is read
		through MatrixFile. The read group is taken from the column named
		self.readGroupHeader, or from the first column when that attribute
		is empty. Each read group is parsed with
		self.db_vervet.parseAlignmentReadGroup(); when the result carries an
		individualAlignment, the code of its individual is mapped to the
		read group in self.invariantPData.individualCode2readGroup
		(a later row with the same code overwrites an earlier one).

		Always returns 1.
		"""
		AbstractMatrixFileWalker.setup(self, **keywords)

		# same dict object under both names, so filling one fills the other
		code2readGroup = self.invariantPData.individualCode2readGroup = {}

		reader = MatrixFile(inputFname=self.readGroupFname)
		reader.constructColName2IndexFromHeader()
		# no header name given: fall back to the first column
		readGroupColumn = reader.getColIndexGivenColHeader(self.readGroupHeader) \
			if self.readGroupHeader else 0

		for row in reader:
			readGroup = row[readGroupColumn]
			alignment = self.db_vervet.parseAlignmentReadGroup(readGroup).individualAlignment
			if not alignment:
				continue
			code2readGroup[alignment.individual_sequence.individual.code] = readGroup
		del reader
		return 1