Пример #1
0
    def run(self):
        """
        Convert self.inputFname into a polymorphism table file and a
        chromosome sequence file.

        The first line of the input is stored as the 'commandline' attribute
        of the 'polymorphism' table in self.outputFname. Every following line
        that matches self.iterationPattern starts one iteration (one iteration
        is regarded as one species) and is handed, together with the open
        input file, to self.outputOneIteration().

        Exits with status 3 if self.inputFname is not a regular file.
        All three files are closed even if the conversion raises.
        """
        if self.debug:
            import pdb
            pdb.set_trace()

        if not os.path.isfile(self.inputFname):
            sys.stderr.write("Error: file, %s,  is not a file.\n" %
                             (self.inputFname))
            sys.exit(3)

        inputFile = utils.openGzipFile(self.inputFname, 'r')
        try:
            outputPolymorphismFile = PolymorphismTableFile(
                self.outputFname, openMode='w', isPhased=1,
                ploidy=self.ploidy)
            try:
                with open(self.outputChromosomeSequenceFname,
                          "w") as outputChromosomeSequenceFile:
                    # next() works on both Python 2 and Python 3 iterators
                    commandline = next(inputFile).strip()
                    outputPolymorphismFile.addAttribute(
                        'commandline',
                        value=commandline,
                        overwrite=True,
                        tableName='polymorphism')

                    for line in inputFile:
                        # the same file object is passed on; presumably
                        # outputOneIteration() reads the lines that belong to
                        # this iteration from it -- confirm in that method
                        if self.iterationPattern.search(line):
                            self.outputOneIteration(
                                inputFile=inputFile,
                                iterationLine=line,
                                outputPolymorphismFile=outputPolymorphismFile,
                                outputChromosomeSequenceFile=outputChromosomeSequenceFile,
                                ploidy=self.ploidy)
            finally:
                outputPolymorphismFile.close()
        finally:
            inputFile.close()
    def run(self):
        """
        Convert the simulation output in self.inputFname into the polymorphism
        table file self.outputFname.

        The first line of the input (the simulator command line) is stored as
        the 'commandline' attribute of the 'polymorphism' table; the rest of
        the file is handed to self._convert().

        Exits with status 3 if self.inputFname is not a regular file.
        Both files are closed even if the conversion raises.

        input looks like (inputFileFormat=1)
                msHOT-lite 2 1 -t 4781.50413187402 -r 790.4466018 ...
                //
                segsites: 40567

                positions: 0.0002 0.0003
                001001101011011001...
                101001010100101111...
                ...

        or like this (the command line below is one single line in the file,
        wrapped here for display only):
            ./msHOT-lite 2 1 -t 84989.8346003745 -r 34490.1412746802 30000000 -l -en 0.0013 1 0.0670 -en 0.0022 1 0.3866 -en 0.0032 1 0.3446 -en 0.0044 1 0.21
                79 -en 0.0059 1 0.1513 -en 0.0076 1 0.1144 -en 0.0096 1 0.0910 -en 0.0121 1 0.0757 -en 0.0150 1 0.0662 -en 0.0184 1 0.0609 -en 0.0226 1 0.0583 -en
                 0.0275 1 0.0572 -en 0.0333 1 0.0571 -en 0.0402 1 0.0577 -en 0.0485 1 0.0589 -en 0.0583 1 0.0603 -en 0.0700 1 0.0615 -en 0.0839 1 0.0624 -en 0.100
                5 1 0.0632 -en 0.1202 1 0.0641 -en 0.1437 1 0.0651 -en 0.1716 1 0.0663 -en 0.2048 1 0.0678 -en 0.2444 1 0.0696 -en 0.2914 1 0.0719 -en 0.3475 1 0.
                0752 -en 0.4935 1 0.0794
                //
                @begin 6422

                30000000
                1100    01
                6074    10

                29966899        10
                29971027        01
                29973740        01
                29982767        01
                29985696        10
                @end
        """
        if self.debug:
            import pdb
            pdb.set_trace()

        if not os.path.isfile(self.inputFname):
            sys.stderr.write("Error: file, %s,  is not a file.\n" %
                             (self.inputFname))
            sys.exit(3)

        inputFile = utils.openGzipFile(self.inputFname, 'r')
        try:
            outputPolymorphismFile = PolymorphismTableFile(
                self.outputFname, openMode='w', isPhased=1,
                ploidy=self.ploidy)
            try:
                # next() works on both Python 2 and Python 3 iterators
                commandline = next(inputFile).strip()
                outputPolymorphismFile.addAttribute(
                    'commandline',
                    value=commandline,
                    overwrite=True,
                    tableName='polymorphism')

                self._convert(inputFile=inputFile,
                              outputPolymorphismFile=outputPolymorphismFile,
                              ploidy=self.ploidy)
            finally:
                outputPolymorphismFile.close()
        finally:
            inputFile.close()
# Simulates haplotypes for the members of a pedigree: founders draw haplotypes from an input pool,
# descendants receive recombinant haplotypes sampled from their parents (see run()).
# NOTE: deliberately no class docstring here -- a docstring would become the class-level __doc__
# and "__doc__ = __doc__" below would then stop picking up the module-level __doc__.
class SimulatePedigreeHaplotype(ComparePedigreeFromMultipleInput):
	#no class docstring precedes this line, so the right-hand side resolves to the module's __doc__
	__doc__ = __doc__
	#start from a copy of the parent's options so that the additions below do not modify the parent's dictionary
	option_default_dict = ComparePedigreeFromMultipleInput.option_default_dict.copy()
	#option_default_dict.pop(('inputFname', 0, ))
	#NOTE(review): each key looks like (optionName, isRequired[, type]) and each value like
	#	[default, shortOption, hasArgument, helpText] -- presumably the convention of the option parser
	#	used by the base classes; confirm there.
	#The help texts are continued across physical lines with a trailing backslash inside the string literal,
	#	so the leading whitespace of a continuation line is part of the help text.
	option_default_dict.update({
						('recombinationRate', 1, float):[1.5e-8, '', 1, 'recombination rate per meiosis'],\
						('enableIntraLocusRecombination', 0, int):[0, '', 1, 'toggle to enable intra-locus recombination, \n\
	only applicable to non-single-nucleotide loci'],\
						('inputPedigreeFname', 1, ):['', '', 1, "pedigree file, LINKAGE format, Column 1: pedigree identifier.\n\
Column 2: individual's ID\n\
Column 3: the ID of the individual's father\n\
Column 4: the ID of the individual's mother\n\
Column 5: sex (1=male, 2=female)\n\
Column 6-N: affection status (optional)"],\
						('mutationRate', 0, float):[0, '', 1, 'mutation rate per nucleotide per generation'],\
						
						('ploidy', 0, int):[2, '', 1, 'how many sets of chromosomes one individual carries (for output). ploidy=2 means diploid.'],\
						('speciesName', 0, ):['doggie', '', 1, 'a phantom species name (for output).'],\
						
						})
	
	def __init__(self, inputFnameLs=None, **keywords):
		"""
		Pass the list of input files and every other keyword option on to AbstractMapper.__init__().
		
		NOTE(review): AbstractMapper's initializer is called directly rather than the one of
			the declared base class, ComparePedigreeFromMultipleInput -- confirm this is intended.
		"""
		AbstractMapper.__init__(
			self,
			inputFnameLs=inputFnameLs,
			**keywords
		)
		
	def sampleRecombinantHaplotype(self, parentalHaplotypeList=None, locusPositionList=None, recombinationRate=1.5e-8, enableIntraLocusRecombination=False,\
								chromosomeLength=None):
		"""
		2013.3.6
			Sample the one haplotype a parent passes on, as a mosaic of the parent's own haplotypes.
			
			#. figure out how many recombinations (=n) for this chromosome, from binomial distribution
				(chromosomeLength trials, each succeeding with probability recombinationRate).
			#. uniformly choose n positions (with replacement) from 1 .. chromosomeLength.
			#. walk through the loci in order. The first locus whose position is >= the next pending
				recombination position is the last locus of the current block; all recombination
				positions <= that locus position are applied right after it, each one switching
				to the next parental haplotype (index+1 modulo number of haplotypes).
			#. without any recombination, one parental haplotype is picked at random and returned as is.
			
			argument enableIntraLocusRecombination is not used at this moment.
		
		Arguments:
			parentalHaplotypeList: the haplotypes of one parent. Each must support slice assignment
				(the code does haplotype[start:stop] = ...), e.g. a list or a numpy array.
			locusPositionList: position of each locus, in the same order as the haplotype entries;
				assumed to be sorted ascendingly -- TODO confirm against the caller.
			recombinationRate: per-position probability of one recombination.
			chromosomeLength: number of positions on the chromosome.
		
		Returns:
			PassingData with attributes
				recombinantHaplotype: the sampled haplotype. With recombination it is a new object,
					the parental haplotypes are left unmodified.
				recombinationLocationList: ascendingly sorted list of all recombination positions,
					or None if there was no recombination.
		
		NOTE(review): with recombination, the first block always comes from parentalHaplotypeList[0],
			while the no-recombination branch picks a random haplotype. Left as is; confirm whether
			the starting haplotype should be random as well.
		"""
		import copy	#function-scope import, only this method needs it
		
		noOfRecombinationEvents = numpy.random.binomial(chromosomeLength, recombinationRate)
		if noOfRecombinationEvents<=0:	#no recombination, just pick one haplotype
			haplotypeIndex = numpy.random.randint(0, len(parentalHaplotypeList))
			return PassingData(recombinantHaplotype=parentalHaplotypeList[haplotypeIndex], recombinationLocationList=None)
		
		ploidy = len(parentalHaplotypeList)
		#randint() returns a numpy array (no reverse()/pop()), upper bound chromosomeLength+1 is excluded.
		#turn it into a sorted plain list, which is kept intact so that it can be returned to the caller.
		recombinationLocationList = sorted(numpy.random.randint(1, chromosomeLength+1, \
			size=noOfRecombinationEvents).tolist())
		
		noOfPolymorphicLoci = len(locusPositionList)
		#list of (startLocusIndex, stopLocusIndex, haplotypeIndex), stopLocusIndex is exclusive
		locusIndexSpanAndHaplotypeIndexList = []
		startLocusIndex = 0
		currentHaplotypeIndex = 0	#start from the first haplotype
		nextEventIndex = 0	#index of the first recombination event that is not applied yet
		for i, locusPosition in enumerate(locusPositionList):
			if nextEventIndex>=noOfRecombinationEvents:	#last recombination event has been visited
				break
			if locusPosition<recombinationLocationList[nextEventIndex]:
				continue
			#i is included in current haplotype block, next one starts from i+1
			locusIndexSpanAndHaplotypeIndexList.append((startLocusIndex, i+1, currentHaplotypeIndex))
			startLocusIndex = i+1
			#apply every event up to this locus here, rather than deferring the extra ones to later loci
			while nextEventIndex<noOfRecombinationEvents and recombinationLocationList[nextEventIndex]<=locusPosition:
				#a recombination, alternate to the next parental haplotype index
				currentHaplotypeIndex = (currentHaplotypeIndex+1)%ploidy
				nextEventIndex += 1
		
		#the last block: loci to the right of the last applied recombination event
		if startLocusIndex<noOfPolymorphicLoci:	#make sure not beyond
			locusIndexSpanAndHaplotypeIndexList.append((startLocusIndex, noOfPolymorphicLoci, currentHaplotypeIndex))
		
		#work on a copy, slice assignment on the original would overwrite the parent's own haplotype
		recombinantHaplotype = copy.copy(parentalHaplotypeList[0])
		for blockStartIndex, blockStopIndex, haplotypeIndex in locusIndexSpanAndHaplotypeIndexList:
			sourceHaplotype = parentalHaplotypeList[haplotypeIndex]
			recombinantHaplotype[blockStartIndex:blockStopIndex] = sourceHaplotype[blockStartIndex:blockStopIndex]
		return PassingData(recombinantHaplotype=recombinantHaplotype, recombinationLocationList=recombinationLocationList)
	
	def run(self):
		"""
		Simulate haplotypes for all members of a pedigree and write them to self.outputFname.
		
			#. read the pedigree graph from self.inputPedigreeFname.
			#. open self.inputFname as the haplotype pool for founders, exit with status 4 if it is unphased.
			#. write the individual table and the locus table of the output file.
			#. visit the pedigree members in increasing distance to the founders, so that the haplotypes
				of the parents exist before their children are handled:
				founders (distance 0) get haplotypes sampled from the pool with replacement,
				everyone else gets one haplotype from each parent via sampleRecombinantHaplotype().
			#. write all haplotypes to the output file.
		
		The input and the output file are closed even if the simulation raises.
		"""
		if self.debug:
			import pdb
			pdb.set_trace()
		
		#read in the pedigree graph
		graphData = self.constructPedigreeGraphFromOneFile(inputFname=self.inputPedigreeFname)
		
		# read in the haplotype pool for founders
		self.inputPolymorphismTableFile = PolymorphismTableFile(self.inputFname, openMode='r', constructSNPData=False)
		try:
			if not self.inputPolymorphismTableFile.isPhased:
				sys.stderr.write("Error input file %s has unphased polymorphism data, can't sample haplotypes.\n"%(self.inputFname))
				sys.exit(4)
			
			# set up output
			self.outputPolymorphismFile = PolymorphismTableFile(self.outputFname, openMode='w', isPhased=1, ploidy=self.ploidy)
			try:
				self.outputPolymorphismFile.writePedigreeDiGraph2IndividualTable(diGraph=graphData.DG, \
					populationName="", speciesName=self.speciesName, ploidy=self.ploidy)
				self.outputPolymorphismFile.writeChrStartStopTupleList2LocusTable(\
					chr_start_stop_list=self.inputPolymorphismTableFile.locusChrStartStopList,\
					speciesName=self.speciesName, ploidy=self.ploidy)
				
				# order the pedigree members based on their distance to the founders
				founderDistance2NodeList = graphData.DG.orderMembersByDistanceToFounders()
				#2013.10.16 YH: bug needs to be fixed here. orderMembersByDistanceToFounders() does not return a data structure like founderDistance2NodeList.
				
				individualName2polymorphismData = {}
				#stop of the last locus is chromosomeLength
				#NOTE(review): the input file is opened with constructSNPData=False above; confirm snpData is available here.
				chromosomeLength = self.inputPolymorphismTableFile.snpData.col_id_ls[-1][2]
				#. sample haplotypes for founders / their descendents
				#dictionary order is arbitrary, sort the distances so that founders and parents come first
				for founderDistance in sorted(founderDistance2NodeList):
					for individualName in founderDistance2NodeList[founderDistance]:
						if founderDistance==0:	#sample haplotypes for founders
							if self.inputPolymorphismTableFile.ploidy==2 and self.inputPolymorphismTableFile.isPhased:
								#input is diploid, phased haplotype data
								polymorphismData = self.inputPolymorphismTableFile.sampleOneIndividualPolymorphismWithReplacement()
							else:
								# self.inputPolymorphismTableFile.ploidy==1 or self.inputPolymorphismTableFile.ploidy is None:
								#input is haplotype or unknown ploidy
								polymorphismData = OneIndividualPolymorphismData(isPhased=True, ploidy=self.ploidy)
								for _ in range(self.ploidy):
									haplotype = self.inputPolymorphismTableFile.sampleOneRandomHaplotypeWithReplacement()
									polymorphismData.addHaplotype(haplotype)
						else:	#sample recombinant haplotype based on two parents' four haplotypes
							#NOTE(review): created without isPhased/ploidy, unlike the founders' container; confirm that is intended.
							polymorphismData = OneIndividualPolymorphismData()
							for parentName in graphData.DG.predecessors(individualName):
								#a KeyError here means this parent has not been given haplotypes yet
								parentalHaplotypeList = individualName2polymorphismData[parentName].haplotypeList
								returnData = self.sampleRecombinantHaplotype(parentalHaplotypeList=parentalHaplotypeList, \
									locusPositionList=self.inputPolymorphismTableFile.locusStartPositionList,\
									recombinationRate=self.recombinationRate, \
									enableIntraLocusRecombination=self.enableIntraLocusRecombination, \
									chromosomeLength=chromosomeLength)
								#output recombination events
								#explicit None/length test, the truth value of an array-like would be ambiguous
								recombinationLocationList = returnData.recombinationLocationList
								if recombinationLocationList is not None and len(recombinationLocationList)>0:
									self.outputPolymorphismFile.writeRecombinationEvents(parentName=parentName, \
										childName=individualName, \
										recombinationLocationList=recombinationLocationList)
								#sampleRecombinantHaplotype() returns the haplotype as attribute recombinantHaplotype
								polymorphismData.addHaplotype(returnData.recombinantHaplotype)
						
						individualName2polymorphismData[individualName] = polymorphismData
				#. output
				self.outputPolymorphismFile.writeIndividualName2PolymorphismData(\
					individualName2polymorphismData=individualName2polymorphismData,\
					speciesName=self.speciesName, ploidy=self.ploidy)
			finally:
				self.outputPolymorphismFile.close()
		finally:
			self.inputPolymorphismTableFile.close()
	def run(self):
		"""
		Simulate haplotypes for all members of a pedigree and write them to self.outputFname.
		
			#. read the pedigree graph from self.inputPedigreeFname.
			#. open self.inputFname as the haplotype pool for founders, exit with status 4 if it is unphased.
			#. write the individual table and the locus table of the output file.
			#. visit the pedigree members in increasing distance to the founders, so that the haplotypes
				of the parents exist before their children are handled:
				founders (distance 0) get haplotypes sampled from the pool with replacement,
				everyone else gets one haplotype from each parent via sampleRecombinantHaplotype().
			#. write all haplotypes to the output file.
		
		The input and the output file are closed even if the simulation raises.
		"""
		if self.debug:
			import pdb
			pdb.set_trace()
		
		#read in the pedigree graph
		graphData = self.constructPedigreeGraphFromOneFile(inputFname=self.inputPedigreeFname)
		
		# read in the haplotype pool for founders
		self.inputPolymorphismTableFile = PolymorphismTableFile(self.inputFname, openMode='r', constructSNPData=False)
		try:
			if not self.inputPolymorphismTableFile.isPhased:
				sys.stderr.write("Error input file %s has unphased polymorphism data, can't sample haplotypes.\n"%(self.inputFname))
				sys.exit(4)
			
			# set up output
			self.outputPolymorphismFile = PolymorphismTableFile(self.outputFname, openMode='w', isPhased=1, ploidy=self.ploidy)
			try:
				self.outputPolymorphismFile.writePedigreeDiGraph2IndividualTable(diGraph=graphData.DG, \
					populationName="", speciesName=self.speciesName, ploidy=self.ploidy)
				self.outputPolymorphismFile.writeChrStartStopTupleList2LocusTable(\
					chr_start_stop_list=self.inputPolymorphismTableFile.locusChrStartStopList,\
					speciesName=self.speciesName, ploidy=self.ploidy)
				
				# order the pedigree members based on their distance to the founders
				founderDistance2NodeList = graphData.DG.orderMembersByDistanceToFounders()
				#2013.10.16 YH: bug needs to be fixed here. orderMembersByDistanceToFounders() does not return a data structure like founderDistance2NodeList.
				
				individualName2polymorphismData = {}
				#stop of the last locus is chromosomeLength
				#NOTE(review): the input file is opened with constructSNPData=False above; confirm snpData is available here.
				chromosomeLength = self.inputPolymorphismTableFile.snpData.col_id_ls[-1][2]
				#. sample haplotypes for founders / their descendents
				#dictionary order is arbitrary, sort the distances so that founders and parents come first
				for founderDistance in sorted(founderDistance2NodeList):
					for individualName in founderDistance2NodeList[founderDistance]:
						if founderDistance==0:	#sample haplotypes for founders
							if self.inputPolymorphismTableFile.ploidy==2 and self.inputPolymorphismTableFile.isPhased:
								#input is diploid, phased haplotype data
								polymorphismData = self.inputPolymorphismTableFile.sampleOneIndividualPolymorphismWithReplacement()
							else:
								# self.inputPolymorphismTableFile.ploidy==1 or self.inputPolymorphismTableFile.ploidy is None:
								#input is haplotype or unknown ploidy
								polymorphismData = OneIndividualPolymorphismData(isPhased=True, ploidy=self.ploidy)
								for _ in range(self.ploidy):
									haplotype = self.inputPolymorphismTableFile.sampleOneRandomHaplotypeWithReplacement()
									polymorphismData.addHaplotype(haplotype)
						else:	#sample recombinant haplotype based on two parents' four haplotypes
							#NOTE(review): created without isPhased/ploidy, unlike the founders' container; confirm that is intended.
							polymorphismData = OneIndividualPolymorphismData()
							for parentName in graphData.DG.predecessors(individualName):
								#a KeyError here means this parent has not been given haplotypes yet
								parentalHaplotypeList = individualName2polymorphismData[parentName].haplotypeList
								returnData = self.sampleRecombinantHaplotype(parentalHaplotypeList=parentalHaplotypeList, \
									locusPositionList=self.inputPolymorphismTableFile.locusStartPositionList,\
									recombinationRate=self.recombinationRate, \
									enableIntraLocusRecombination=self.enableIntraLocusRecombination, \
									chromosomeLength=chromosomeLength)
								#output recombination events
								#explicit None/length test, the truth value of an array-like would be ambiguous
								recombinationLocationList = returnData.recombinationLocationList
								if recombinationLocationList is not None and len(recombinationLocationList)>0:
									self.outputPolymorphismFile.writeRecombinationEvents(parentName=parentName, \
										childName=individualName, \
										recombinationLocationList=recombinationLocationList)
								#sampleRecombinantHaplotype() returns the haplotype as attribute recombinantHaplotype
								polymorphismData.addHaplotype(returnData.recombinantHaplotype)
						
						individualName2polymorphismData[individualName] = polymorphismData
				#. output
				self.outputPolymorphismFile.writeIndividualName2PolymorphismData(\
					individualName2polymorphismData=individualName2polymorphismData,\
					speciesName=self.speciesName, ploidy=self.ploidy)
			finally:
				self.outputPolymorphismFile.close()
		finally:
			self.inputPolymorphismTableFile.close()