def outputPedigreeForPlink(self, DG=None, db_vervet=None, inputFname=None, outputFname=None, \
							treatEveryOneIndependent=None, sampleIDFormat=1,\
							addUngenotypedDuoParents=False):
		"""
		Output the pedigree, constrained to the samples present in inputFname (a VCF),
			in PLINK PED format (http://pngu.mgh.harvard.edu/~purcell/plink/data.shtml).
			Either space or tab may be the delimiter; space is used here.
		
		Arguments:
			DG: pedigree directed graph; nodes are individual IDs, edges point parent -> child.
			db_vervet: database handle, used to resolve sample IDs into alignments/individuals.
			inputFname: VCF whose sample-ID list defines which individuals are output.
			outputFname: PED file to write.
			treatEveryOneIndependent: if true, both parent columns are forced to 0
				(everyone becomes a founder).
			sampleIDFormat
				1: individual.ucla_id
				2: input sampleID (alignment.read_group)
			addUngenotypedDuoParents: if True, also output a row for an ungenotyped parent
				whose mate is genotyped (a "duo"). For mendel error detection, if such an
				ungenotyped parent is not present in the genotype file (PED/TPED/BED),
				then plink won't look for its mendel inconsistency.
		
		2013.07.18 added argument addUngenotypedDuoParents
		2013.06.24 added argument sampleIDFormat
		2013.1.2 copied from run()
		"""
		sys.stderr.write("Outputting pedigree constrained by %s to %s, treatEveryOneIndependent=%s, sampleIDFormat=%s, addUngenotypedDuoParents=%s ... "%\
						(inputFname, outputFname, treatEveryOneIndependent, sampleIDFormat, addUngenotypedDuoParents))
		vcfFile = VCFFile(inputFname=inputFname)
		
		alignmentLs = []
		alignmentID2sampleData = {}
		individual_id2alignment = {}
		for sampleID in vcfFile.getSampleIDList():
			alignment = db_vervet.parseAlignmentReadGroup(sampleID).individualAlignment
			alignmentLs.append(alignment)
			if alignment.id in alignmentID2sampleData:
				#fix: was a bare "raise" with no active exception, which itself errors out
				#	("No active exception to re-raise") and hides the real problem.
				errorMessage = "Error: alignment %s (%s) for sample %s already in alignmentID2sampleData, with sampleID=%s."%\
								(alignment.id, alignment.read_group, sampleID, \
								alignmentID2sampleData.get(alignment.id).sampleID)
				sys.stderr.write(errorMessage + "\n")
				raise ValueError(errorMessage)
			alignmentID2sampleData[alignment.id] = PassingData(sampleID=sampleID, alignment=alignment)
			
			individual_id = alignment.individual_sequence.individual_id
			if individual_id in individual_id2alignment:
				#fix: message used to be a copy-paste of the alignment-duplicate message above
				#	and reported the wrong dictionary; it now reports the actual conflict.
				errorMessage = "Error: individual %s (alignment %s, %s, sample %s) already in individual_id2alignment, with alignment id=%s."%\
								(individual_id, alignment.id, alignment.read_group, sampleID, \
								individual_id2alignment.get(individual_id).id)
				sys.stderr.write(errorMessage + "\n")
				raise ValueError(errorMessage)
			individual_id2alignment[individual_id] = alignment
		
		individual_id2individual = {}
		
		#ungenotyped duo-parents collected during the main pass, output at the end
		ungenotypedNodeID2Data = {}
		outputFile = open(outputFname, 'w')
		writer = csv.writer(outputFile, delimiter=' ')
		counter = 0
		family_id = 1	#all in one family
		currentNoOfFakes = 0
		for alignment in alignmentLs:
			nodeID = alignment.individual_sequence.individual_id
			individual = self.getIndividual(db_vervet=db_vervet, individual_id=nodeID, \
										individual_id2individual=individual_id2individual)
			
			if nodeID in DG:
				parents = DG.predecessors(nodeID)
				if len(parents)==2:
					parent1 = self.getIndividual(db_vervet=db_vervet, individual_id=parents[0], \
												individual_id2individual=individual_id2individual)
					parent2 = self.getIndividual(db_vervet=db_vervet, individual_id=parents[1], \
												individual_id2individual=individual_id2individual)
					parent1Sex = parent1.codeSexInNumber()
					parent2Sex = parent2.codeSexInNumber()
					#2013.07.18	if one and only one parent is genotyped, record the other as an ungenotyped duo-parent
					if parents[0] not in individual_id2alignment and parents[1] in individual_id2alignment:
						if parents[0] not in ungenotypedNodeID2Data:
							ungenotypedNodeID2Data[parents[0]] = PassingData(individualDBEntry=parent1, sex=parent1Sex)
					elif parents[0] in individual_id2alignment and parents[1] not in individual_id2alignment:
						if parents[1] not in ungenotypedNodeID2Data:
							ungenotypedNodeID2Data[parents[1]] = PassingData(individualDBEntry=parent2, sex=parent2Sex)
					
					if parent1Sex==2:
						#swap so parent1 occupies the father column, parent2 the mother column
						parent1, parent2 = parent2, parent1
					
					father_id = self.getProperSampleIDForPlinkOutput(individual=parent1, \
									alignmentID2sampleData=alignmentID2sampleData, \
									individual_id2alignment=individual_id2alignment, sampleIDFormat=sampleIDFormat)
					mother_id = self.getProperSampleIDForPlinkOutput(individual=parent2, \
									alignmentID2sampleData=alignmentID2sampleData, \
									individual_id2alignment=individual_id2alignment, sampleIDFormat=sampleIDFormat)
				elif len(parents)==1:
					parent1 = self.getIndividual(db_vervet=db_vervet, individual_id=parents[0], \
										individual_id2individual=individual_id2individual)
					parent1Sex = parent1.codeSexInNumber()
					
					if parent1Sex==2:	#known parent is the mother; father is missing
						parent2Sex = 1
						father_id = 0
						mother_id = self.getProperSampleIDForPlinkOutput(individual=parent1, \
									alignmentID2sampleData=alignmentID2sampleData, \
									individual_id2alignment=individual_id2alignment, sampleIDFormat=sampleIDFormat)
					else:	#known parent is the father; mother is missing
						parent2Sex = 2
						father_id = self.getProperSampleIDForPlinkOutput(individual=parent1, \
									alignmentID2sampleData=alignmentID2sampleData, \
									individual_id2alignment=individual_id2alignment, sampleIDFormat=sampleIDFormat)
						mother_id = 0
					#2013.07.18 parent1 (parents[0]) has to be in individual_id2alignment (genotyped) in order for the other
						#to qualify as an ungenotyped parent in a duo
					if parents[0] in individual_id2alignment:
						fakeParentData = self.generateFakeIndividualID(pedigreeGraph=DG, currentNoOfFakes=currentNoOfFakes)
						currentNoOfFakes = fakeParentData.currentNoOfFakes
						fakeParent2ID = fakeParentData.individualID
						#NOTE(review): the fake parent is queued for output below but the child's
						#	father_id/mother_id stay 0, so its row is never linked to this child.
						#	Confirm this is intended.
						if fakeParent2ID not in individual_id2alignment:
							if fakeParent2ID not in ungenotypedNodeID2Data:
								ungenotypedNodeID2Data[fakeParent2ID] = PassingData(individualDBEntry=None, sex=parent2Sex)
				elif len(parents)==0:
					father_id = 0
					mother_id = 0
				else:
					sys.stderr.write("Error: number of parents (%s) for %s is %s.\n"%(repr(parents), nodeID, len(parents)))
					sys.exit(3)
			else:	# founders
				father_id = 0
				mother_id = 0
			
			if treatEveryOneIndependent:	#force the parents to be 0, everyone becomes founders
				father_id = 0
				mother_id = 0
			individual_id = self.getProperSampleIDForPlinkOutput(individual=individual, \
									alignmentID2sampleData=alignmentID2sampleData, \
									individual_id2alignment=individual_id2alignment, sampleIDFormat=sampleIDFormat)
			#PED columns: family, individual, father, mother, sex, phenotype (1 = unaffected)
			data_row = [family_id, individual_id, father_id, mother_id, \
					individual.codeSexInNumber(), 1]
			writer.writerow(data_row)
			counter += 1
		
		noOfUngenotypedParentsOutputted = 0
		if addUngenotypedDuoParents:
			for ungenotypedNodeID, pdata in ungenotypedNodeID2Data.iteritems():
				individual_id = self.getProperSampleIDForPlinkOutput(individual=pdata.individualDBEntry, \
									alignmentID2sampleData=alignmentID2sampleData, \
									individual_id2alignment=individual_id2alignment, \
									sampleIDFormat=sampleIDFormat, defaultSampleID=ungenotypedNodeID)
				data_row = [family_id, individual_id, 0, 0, pdata.sex, 1]
				writer.writerow(data_row)
				noOfUngenotypedParentsOutputted += 1
		sys.stderr.write("%s individuals and %s ungenotyped duo-parents outputted, number of fake parents %s, addUngenotypedDuoParents=%s.\n"%\
						(counter, noOfUngenotypedParentsOutputted, currentNoOfFakes, addUngenotypedDuoParents))
		del writer
		outputFile.close()	#fix: file handle was previously never closed ("del writer" does not close it)
	def run(self):
		"""
		Export the genotype calls of one genotype method + chromosome from the db
		into a tab-delimited matrix: one row per locus, one column per sample
		(header annotated with each sample's scientific name from the taxonomy db),
		'NA' for missing calls. Writes to self.outputFname; silently does nothing
		if the genotype file is absent on disk.
		
		2012.7.13
		"""
		if self.debug:
			import pdb
			pdb.set_trace()
		session = self.db_vervet.session
		
		session.begin()
		if not self.data_dir:
			self.data_dir = self.db_vervet.data_dir
		data_dir = self.data_dir
		
		genotypeFile = self.db_vervet.getGenotypeFile(genotype_method_id=self.genotypeMethodID, chromosome=self.chromosome, format=self.format)
		if not genotypeFile:
			sys.stderr.write("Error: genotype_method_id %s, chromosome %s does not exist.\n"%(self.genotypeMethodID, self.chromosome))
			sys.exit(2)
		filename = os.path.join(data_dir, genotypeFile.path)
		if os.path.isfile(filename):
			counter = 0
			from pymodule import VCFFile
			
			vcfFile = VCFFile(inputFname=filename, minDepth=0)
			sampleIDList = vcfFile.getSampleIDList()
			outputFile = open(self.outputFname, 'w')
			writer = csv.writer(outputFile, delimiter='\t')
			header = ['Chromosome', 'position', 'ref']
			columnIndexList = []
			for i in xrange(len(sampleIDList)):
				sampleID = sampleIDList[i]
				individualAlignment = self.db_vervet.parseAlignmentReadGroup(sampleID).individualAlignment
				#2012.8.29 get scientific name from the taxonomy db, to annotate the sample column
				scientifcName = self.db_taxonomy.returnScientificNameGivenTaxID(individualAlignment.individual_sequence.individual.tax_id)
				header.append('%s %s'%(sampleID, scientifcName))
				columnIndexList.append(i)
			writer.writerow(header)
			for vcfRecord in vcfFile:
				data_row = [vcfRecord.chr, vcfRecord.pos]
				#data_row[0] is the reference individual (from the ref column of VCF)
				refCall = vcfRecord.data_row[0]
				data_row.append(refCall['GT'])
				#fix: removed unused AF-parsing block; it crashed with AttributeError
				#	whenever a record lacked the 'AF' info tag (.get('AF') -> None -> .split)
				#	and its result was never used.
				for columnIndex in columnIndexList:
					#data_row is a list of dictionary {'GT': base-call, 'DP': depth}, or None if missing;
					#+1 skips the extra reference sample at index 0.
					vcfCall = vcfRecord.data_row[columnIndex+1]
					if vcfCall:
						data_row.append(vcfCall['GT'])
					else:
						data_row.append('NA')
				writer.writerow(data_row)
				counter += 1
			sys.stderr.write("%s loci outputted.\n"%(counter))
			del writer
			outputFile.close()	#fix: file handle was previously never closed
	def run(self):
		"""
		Top-level workflow driver: set up the run, then dispatch on self.run_type.
		
		run_type semantics (as shown by the branches below -- confirm against the
		option declarations elsewhere in the file):
			1: add genotype-calling jobs over the alignments from setup_run()
			2 or 3: add TrioCaller jobs over pre-existing VCF input files
		
		2011-7-11
		"""
		# Interval splitting is only needed by the run_type==1 (genotype-calling) path.
		if self.run_type!=1:
			self.needSplitChrIntervalData = False	#2013.06.21 turn this off before setup_run() to not construct chr2IntervalDataLs
		else:
			self.needSplitChrIntervalData = True
		pdata = self.setup_run()
		workflow = pdata.workflow
		db_vervet = self.db
		
		if self.run_type in [2,3]:
			# Register the input VCF folder, then derive the alignment list from the
			# sample IDs of the first VCF (presumably all VCFs share one sample set --
			# TODO confirm).
			inputData = self.registerAllInputFiles(workflow, self.inputDir, input_site_handler=self.input_site_handler, \
									checkEmptyVCFByReading=self.checkEmptyVCFByReading,\
									pegasusFolderName=self.pegasusFolderName,\
									maxContigID=self.maxContigID, \
									minContigID=self.minContigID,  db_vervet=db_vervet, \
									needToKnowNoOfLoci=abs(1-self.notToKnowNoOfLoci),\
									minNoOfLociInVCF=self.minNoOfLociInVCF)	#ignore files with too few loci
			inputF = inputData.jobDataLs[0].vcfFile
			vcfFile = VCFFile(inputFname=inputF.abspath)
			alignmentLs = db_vervet.getAlignmentsFromVCFSampleIDList(vcfFile.getSampleIDList())
			del vcfFile
		
		# NOTE(review): this uses pdata.alignmentLs (from setup_run), not the
		# alignmentLs derived above for run_type 2/3 -- confirm that is intended.
		cumulativeMedianDepth = db_vervet.getCumulativeAlignmentMedianDepth(alignmentLs=pdata.alignmentLs, \
											defaultSampleAlignmentDepth=self.defaultSampleAlignmentDepth)
		
		registerReferenceData = pdata.registerReferenceData
		
		
		if self.run_type==1:
			#chr2size = set(['Contig149'])	#temporary when testing Contig149
			#chr2size = set(['1MbBAC'])	#temporary when testing the 1Mb-BAC (formerly vervet_path2)
			#2012.6.12
			#self.outputAlignmentDepthAndOthersForFilter(db_vervet=db_vervet, outputFname=self.alnStatForFilterFname, \
			#									ref_ind_seq_id=self.ref_ind_seq_id, \
			#									foldChange=self.depthFoldChange, minGQ=30)	#minGQ doesn't matter anymore.
			self.addGenotypeCallJobs(workflow=workflow, alignmentDataLs=pdata.alignmentDataLs, chr2IntervalDataLs=self.chr2IntervalDataLs, \
						registerReferenceData=registerReferenceData, \
						site_handler=self.site_handler, input_site_handler=self.input_site_handler,\
						needFastaIndexJob=self.needFastaIndexJob, needFastaDictJob=self.needFastaDictJob, \
						intervalSize=self.intervalSize, intervalOverlapSize=self.intervalOverlapSize, \
						site_type=self.site_type, data_dir=self.data_dir,\
						outputDirPrefix="",\
						genotypeCallerType=self.genotypeCallerType,\
						cumulativeMedianDepth=cumulativeMedianDepth,\
						transferOutput=True)
		elif self.run_type in [2, 3]:
			# Pure pass-through of workflow executables/parameters; run_type is forwarded
			# so the callee can distinguish type 2 from type 3.
			self.addTrioCallerJobsONVCFFiles(workflow=workflow, alignmentLs=alignmentLs, inputData=inputData, \
						samtools=workflow.samtools, \
						genotyperJava=workflow.genotyperJava,  SelectVariantsJava=workflow.SelectVariantsJava, \
						GenomeAnalysisTKJar=workflow.GenomeAnalysisTKJar, \
						addOrReplaceReadGroupsJava=workflow.addOrReplaceReadGroupsJava, AddOrReplaceReadGroupsJar=workflow.AddOrReplaceReadGroupsJar, \
						CreateSequenceDictionaryJava=workflow.CreateSequenceDictionaryJava, CreateSequenceDictionaryJar=workflow.CreateSequenceDictionaryJar, \
						MergeSamFilesJar=workflow.MergeSamFilesJar, \
						BuildBamIndexFilesJava=workflow.BuildBamIndexFilesJava, BuildBamIndexJar=workflow.BuildBamIndexJar, \
						mv=workflow.mv, CallVariantBySamtools=workflow.CallVariantBySamtools, \
						trioCallerPath=self.trioCallerPath, trioCallerWrapper=workflow.trioCallerWrapper, \
						replicateIndividualTag=self.replicateIndividualTag, treatEveryOneIndependent=self.treatEveryOneIndependent,\
						bgzip_tabix=workflow.bgzip_tabix, vcf_convert=workflow.vcf_convert, \
						vcf_isec=workflow.vcf_isec, vcf_concat=workflow.vcf_concat, \
						concatGATK=workflow.concatGATK, concatSamtools=workflow.concatSamtools,\
						ligateVcf=self.ligateVcf, ligateVcfExecutableFile=self.ligateVcfExecutableFile,\
						registerReferenceData=registerReferenceData, \
						namespace=workflow.namespace, version=workflow.version, site_handler=self.site_handler, input_site_handler=self.input_site_handler,\
						needFastaIndexJob=self.needFastaIndexJob, needFastaDictJob=self.needFastaDictJob, \
						outputDirPrefix="", \
						intervalSize=self.intervalSize, intervalOverlapSize=self.intervalOverlapSize, \
						site_type=self.site_type, data_dir=self.data_dir,\
						onlyKeepBiAllelicSNP=self.onlyKeepBiAllelicSNP, maxSNPMissingRate=self.maxSNPMissingRate,\
						alnStatForFilterF=None, cumulativeMedianDepth=cumulativeMedianDepth,\
						run_type=self.run_type, transferOutput=True)
		
		self.end_run()