def run(self):
	if self.debug:
		import pdb
		pdb.set_trace()
	
	counter = 0
	no_of_vcf = 0
	real_counter = 0
	for inputFname in self.inputFnameLs:
		counter += 1
		if os.path.isfile(inputFname):
			try:
				if NextGenSeq.isFileNameVCF(inputFname, includeIndelVCF=False):
					no_of_vcf += 1
					if NextGenSeq.isVCFFileEmpty(inputFname, checkContent=self.checkEmptyVCFByReading):
						if self.commit:
							if self.report:
								sys.stderr.write("file %s deleted.\n"%(inputFname))
							commandline = 'rm %s'%(inputFname)
							return_data = runLocalCommand(commandline, report_stderr=True, report_stdout=True)
						real_counter += 1
			except:
				sys.stderr.write('Except type: %s\n'%repr(sys.exc_info()))
				import traceback
				traceback.print_exc()
		if self.report and counter%500==0:
			sys.stderr.write("%s%s\t%s\t%s"%('\x08'*80, counter, no_of_vcf, real_counter))
	sys.stderr.write("%s%s\t%s\t%s\n"%('\x08'*80, counter, no_of_vcf, real_counter))
	sys.stderr.write("%s files in total.\n"%(counter))
	sys.stderr.write("Out of %s VCF files, %s are empty and were deleted.\n"%(no_of_vcf, real_counter))
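# Hedged illustration (not the NextGenSeq implementation): a minimal standalone sketch of what a
# content-based "is this VCF empty" check could look like, assuming an empty VCF is one that is
# missing, zero-length, or contains only meta/header lines starting with '#'. The helper name
# _isVCFFileEmptySketch and its exact semantics are assumptions for illustration only.
import gzip
import os

def _isVCFFileEmptySketch(inputFname, checkContent=False):
	if not os.path.isfile(inputFname):
		return True
	if os.path.getsize(inputFname) == 0:
		return True
	if not checkContent:	#cheap check only: a non-empty file passes
		return False
	if inputFname.endswith('.gz'):
		f = gzip.open(inputFname, 'rb')
	else:
		f = open(inputFname, 'rb')
	try:
		for line in f:
			if isinstance(line, bytes):
				line = line.decode('utf-8', 'replace')
			if line.strip() and not line.startswith('#'):	#first data line found => not empty
				return False
	finally:
		f.close()
	return True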
def run(self): """ 2011-7-11 """ if self.debug: import pdb pdb.set_trace() if NextGenSeq.isVCFFileEmpty(self.inputFname, checkContent=True): sys.stderr.write("Input %s doesn't exist or no variants in it.\n"%(self.inputFname)) #make sure some output files will exist for downstream jobs. self.openOutputFiles(self.outputFnamePrefix, self.windowSize) sys.exit(0) vcfFile = VCFFile(inputFname=self.inputFname, minDepth=self.minDepth) trio_col_index_data = self.findTrioIndex(vcfFile.sample_id2index, self.trio_id) father_index = trio_col_index_data.father_index mother_index = trio_col_index_data.mother_index child_index = trio_col_index_data.child_index outputDStruc = self.openOutputFiles(self.outputFnamePrefix, self.windowSize) if (father_index==-1 and mother_index!=-1) or (father_index!=-1 and mother_index==-1): #one parent is missing. it's duo. self._calculateForDuo(vcfFile, outputDStruc=outputDStruc, trio_col_index_data=trio_col_index_data) else: self._calculateForTrio(vcfFile, outputDStruc=outputDStruc, trio_col_index_data=trio_col_index_data) """
def add2DB(self, db=None, individual_alignment_id=None, inputFname=None, format=None, minDP=None, maxDP=None, minBaseQ=None, minMapQ=None,\
		minRMSMapQ=None, minDistanceToIndel=None, comment=None, data_dir=None, commit=0):
	"""
	2012.11.13
	"""
	session = db.session
	session.begin()
	
	#2012.11.13 check if it's in db already
	db_entry = db.checkIndividualAlignmentConsensusSequence(individual_alignment_id=individual_alignment_id, minDP=minDP, \
					maxDP=maxDP, minBaseQ=minBaseQ, minMapQ=minMapQ,\
					minRMSMapQ=minRMSMapQ, minDistanceToIndel=minDistanceToIndel)
	if db_entry:
		sys.stderr.write("Warning: IndividualAlignmentConsensusSequence of (individual_alignment_id=%s, minDP %s, maxDP %s, etc.) already in db with id=%s.\n"%\
						(individual_alignment_id, minDP, maxDP, db_entry.id))
		sys.exit(3)
	else:
		countData = NextGenSeq.countNoOfChromosomesBasesInFastQFile(inputFname)
		no_of_chromosomes = countData.no_of_chromosomes
		no_of_bases = countData.no_of_bases
		db_entry = db.getIndividualAlignmentConsensusSequence(individual_alignment_id=individual_alignment_id, format=format, \
							minDP=minDP, maxDP=maxDP, minBaseQ=minBaseQ, \
							minMapQ=minMapQ, minRMSMapQ=minRMSMapQ, minDistanceToIndel=minDistanceToIndel, \
							no_of_chromosomes=no_of_chromosomes, no_of_bases=no_of_bases, \
							original_path=os.path.abspath(inputFname), data_dir=data_dir)
	
	if commit:
		inputFileBasename = os.path.basename(inputFname)
		#moveFileIntoDBAffiliatedStorage() will also set db_entry.path
		exitCode = db.moveFileIntoDBAffiliatedStorage(db_entry=db_entry, filename=inputFileBasename, \
						inputDir=os.path.split(inputFname)[0], \
						outputDir=data_dir,\
						relativeOutputDir=None, shellCommand='cp -rL', \
						srcFilenameLs=self.srcFilenameLs, dstFilenameLs=self.dstFilenameLs,\
						constructRelativePathFunction=db_entry.constructRelativePath, data_dir=data_dir)
		if exitCode!=0:
			sys.stderr.write("Error: moveFileIntoDBAffiliatedStorage() exits with code %s.\n"%(exitCode))
			session.rollback()
			self.cleanUpAndExitOnFailure(exitCode=exitCode)
		session.flush()
		session.commit()
	else:
		#nothing to persist; roll back explicitly rather than relying on the implicit default
		session.rollback()
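# Hedged illustration (names are hypothetical, not the NextGenSeq API): a minimal sketch of the kind of
# chromosome/base counting that add2DB() relies on, assuming the consensus file is FASTQ-style with one
# 4-line record (@name, sequence, '+', quality) per chromosome. It only shows where numbers like
# no_of_chromosomes and no_of_bases could come from before they are stored with the db entry.
import gzip

def _countFastQSketch(inputFname):
	no_of_chromosomes = 0
	no_of_bases = 0
	if inputFname.endswith('.gz'):
		f = gzip.open(inputFname, 'rb')
	else:
		f = open(inputFname, 'rb')
	try:
		while True:
			header = f.readline()
			if not header:	#end of file
				break
			sequence = f.readline()
			f.readline()	#the '+' separator line
			f.readline()	#the quality line
			no_of_chromosomes += 1
			no_of_bases += len(sequence.strip())
	finally:
		f.close()
	return no_of_chromosomes, no_of_bases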
def run(self): """ """ if self.debug: import pdb pdb.set_trace() db_vervet = self.db_vervet if not self.data_dir: self.data_dir = db_vervet.data_dir if not self.local_data_dir: self.local_data_dir = db_vervet.data_dir # Create a abstract dag workflowName = os.path.splitext(os.path.basename(self.outputFname))[0] workflow = self.initiateWorkflow(workflowName) self.registerJars(workflow) self.registerCommonExecutables(workflow) self.registerCustomExecutables(workflow) refSequence = VervetDB.IndividualSequence.get(self.ref_ind_seq_id) refFastaFname = os.path.join(self.data_dir, refSequence.path) registerReferenceData = yh_pegasus.registerRefFastaFile(workflow, refFastaFname, registerAffiliateFiles=True, \ input_site_handler=self.input_site_handler,\ checkAffiliateFileExistence=True) refFastaFList = registerReferenceData.refFastaFList self.outputAlignmentDepthAndOthersForFilter(self.alnStatForFilterFname, ref_ind_seq_id=self.ref_ind_seq_id, \ foldChange=self.depthFoldChange, minGQ=self.minGQ) alnStatForFilterF = self.registerOneInputFile(workflow, self.alnStatForFilterFname) #name to distinguish between vcf1Dir, and vcf2Dir vcf1Name = self.findProperVCFDirIdentifier(self.vcf1Dir, defaultName='vcf1') vcf2Name = self.findProperVCFDirIdentifier(self.vcf2Dir, defaultName='vcf2') if vcf2Name==vcf1Name or not vcf2Name: vcf2Name = "vcf2" no_of_jobs = 0 vcf1DepthFilterDir = "%s_DepthFilter"%(vcf1Name) vcf1DepthFilterDirJob = self.addMkDirJob(outputDir=vcf1DepthFilterDir) #vcf2DepthFilterDir = "%s_DepthFilter"%(vcf2Name) #vcf2DepthFilterDirJob = yh_pegasus.addMkDirJob(workflow, mkdir=workflow.mkdirWrap, outputDir=vcf2DepthFilterDir) trioInconsistencyDir = "trioInconsistency" trioInconsistencyDirJob = self.addMkDirJob(outputDir=trioInconsistencyDir) SNPMismatchStatDir = "SNPMismatchStat" SNPMismatchStatDirJob = self.addMkDirJob(outputDir=SNPMismatchStatDir) input_site_handler = self.input_site_handler #whole genome reduction job. 
wholeGenomeSiteStatFile = File('siteStatAndTrioInconsistency.tsv') wholeGenomeSiteStatMergeJob = self.addStatMergeJob(workflow, statMergeProgram=workflow.mergeSameHeaderTablesIntoOne, \ outputF=wholeGenomeSiteStatFile,transferOutput=False) wholeGenomeSiteStatBGzipFile = File("%s.gz"%wholeGenomeSiteStatFile.name) wholeGenomeSiteStatBGZipTabixJob = self.addBGZIP_tabix_Job(workflow, bgzip_tabix=workflow.bgzip_tabix, \ parentJob=wholeGenomeSiteStatMergeJob, inputF=wholeGenomeSiteStatFile, \ outputF=wholeGenomeSiteStatBGzipFile, \ transferOutput=True, tabixArguments="-s 1 -b 2 -e 2") no_of_jobs += 5 #read the trioInconsistencyByPosistionFname and figure out how many contigs in it and add an extraction job for each contig chrLs = self.getChrListInTrioInconsistencyFile(self.tabixPath, self.trioInconsistencyByPosistionFname) chr2tabixRetrieveJob = {} trioInconsistencyByPosistionF = self.registerOneInputFile(workflow, self.trioInconsistencyByPosistionFname) trioInconsistencyByPosistion_tbi_Fname = '%s.tbi'%(self.trioInconsistencyByPosistionFname) trioInconsistencyByPosistion_tbi_F = self.registerOneInputFile(workflow, trioInconsistencyByPosistion_tbi_Fname) for chr in chrLs: outputF = File(os.path.join(trioInconsistencyDir, '%s.trioInconsistency.tsv'%chr)) tabixRetrieveJob = self.addTabixRetrieveJob(workflow, executable=workflow.tabixRetrieve, tabixPath=self.tabixPath, \ inputF=trioInconsistencyByPosistionF, outputF=outputF, regionOfInterest=chr, includeHeader=True,\ parentJobLs=[trioInconsistencyDirJob], job_max_memory=100, extraDependentInputLs=[trioInconsistencyByPosistion_tbi_F], \ transferOutput=False) chr2tabixRetrieveJob[chr] = tabixRetrieveJob no_of_jobs += 1 counter = 0 no_of_vcf = 0 no_of_good_vcf = 0 for inputFname in os.listdir(self.vcf1Dir): counter += 1 if counter%500==0: sys.stderr.write("%s %s jobs %s good vcf, %s total vcf, %s total files"%('\x08'*180, no_of_jobs, \ no_of_good_vcf, no_of_vcf, counter)) vcf1AbsPath = os.path.join(os.path.abspath(self.vcf1Dir), inputFname) vcf2AbsPath = os.path.join(os.path.abspath(self.vcf2Dir), inputFname) if NextGenSeq.isFileNameVCF(inputFname, includeIndelVCF=False) and not NextGenSeq.isVCFFileEmpty(vcf1AbsPath): if not NextGenSeq.isVCFFileEmpty(vcf2AbsPath, checkContent=self.checkEmptyVCFByReading): #make sure the samtools vcf exists no_of_vcf += 1 chr = self.getChrFromFname(inputFname) if not chr or chr not in chr2tabixRetrieveJob: continue no_of_good_vcf += 1 #find the contig id and the matching tabix job commonPrefix = inputFname.split('.')[0] vcf1 = File(os.path.join(vcf1Name, inputFname)) #relative path vcf1.absPath = vcf1AbsPath self.registerVCFAndItsTabixIndex(workflow, vcf1, input_site_handler) vcf2 = File(os.path.join(vcf2Name, inputFname)) #relative path vcf2.absPath = vcf2AbsPath self.registerVCFAndItsTabixIndex(workflow, vcf2, input_site_handler) outputSiteStatF = File(os.path.join(vcf1DepthFilterDir, '%s.siteStat.tsv'%(commonPrefix))) vcf1FilterByDepthJob = self.addFilterVCFByDepthJob(workflow, FilterVCFByDepthJava=workflow.FilterVCFByDepthJava, \ GenomeAnalysisTKJar=workflow.GenomeAnalysisTKJar, \ refFastaFList=refFastaFList, inputVCFF=vcf1, outputVCFF=None, outputSiteStatF=outputSiteStatF,\ parentJobLs=[vcf1DepthFilterDirJob], \ alnStatForFilterF=alnStatForFilterF, \ extraDependentInputLs=[vcf1.tbi_F], onlyKeepBiAllelicSNP=self.onlyKeepBiAllelicSNP) snpMisMatchStatFile = File(os.path.join(SNPMismatchStatDir, '%s_snpMismatchStat.tsv'%(os.path.splitext(commonPrefix)[0]))) calculateSNPMismatchRateOfTwoVCFJob = 
self.addCalculateTwoVCFSNPMismatchRateJob(workflow, \ executable=workflow.CalculateSNPMismatchRateOfTwoVCF, \ vcf1=vcf1, vcf2=vcf2, snpMisMatchStatFile=snpMisMatchStatFile, \ maxSNPMismatchRate=1.0, parentJobLs=[SNPMismatchStatDirJob], \ job_max_memory=1000, extraDependentInputLs=[], \ transferOutput=False) #add a ReduceMatrixByMergeColumnsWithSameKey job chrMergingStatF = File('%s_variantSiteStatAndTrioInconsistencyRate.tsv'%(chr)) chrMergingStatJob = self.addStatMergeJob(workflow, \ statMergeProgram=workflow.ReduceMatrixByMergeColumnsWithSameKey, \ outputF=chrMergingStatF, extraArguments='-k 0,1', transferOutput=False) tabixRetrieveJob = chr2tabixRetrieveJob[chr] self.addInputToStatMergeJob(workflow, statMergeJob=chrMergingStatJob, \ inputF=tabixRetrieveJob.output, \ parentJobLs=[tabixRetrieveJob]) self.addInputToStatMergeJob(workflow, statMergeJob=chrMergingStatJob, \ inputF=outputSiteStatF, \ parentJobLs=[vcf1FilterByDepthJob]) self.addInputToStatMergeJob(workflow, statMergeJob=chrMergingStatJob, \ inputF=snpMisMatchStatFile, \ parentJobLs=[calculateSNPMismatchRateOfTwoVCFJob]) #add to the whole genome reduction job self.addInputToStatMergeJob(workflow, statMergeJob=wholeGenomeSiteStatMergeJob, \ inputF=chrMergingStatJob.output, \ parentJobLs=[chrMergingStatJob]) no_of_jobs += 3 sys.stderr.write("%s %s jobs %s good vcf, %s total vcf, %s total files.\n"%('\x08'*180, no_of_jobs, \ no_of_good_vcf, no_of_vcf, counter)) # Write the DAX to stdout outf = open(self.outputFname, 'w') workflow.writeXML(outf)
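# Hedged illustration of the shell-level equivalent of the addBGZIP_tabix_Job step above: compress the
# merged per-site table with bgzip and index it with tabix using the same "-s 1 -b 2 -e 2" column layout
# (chromosome in column 1, position in column 2). This is only a subprocess sketch of what that job runs;
# in the actual workflow it is executed as a Pegasus job, and the executable paths here are assumptions.
import subprocess

def _bgzipAndTabixSketch(inputFname, bgzipPath='bgzip', tabixPath='tabix'):
	gzFname = '%s.gz'%(inputFname)
	outf = open(gzFname, 'wb')
	try:
		subprocess.check_call([bgzipPath, '-c', inputFname], stdout=outf)	#bgzip to stdout, captured into .gz
	finally:
		outf.close()
	subprocess.check_call([tabixPath, '-s', '1', '-b', '2', '-e', '2', gzFname])	#creates gzFname + '.tbi'
	return gzFname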
def run(self): """ 2012.7.13 """ if self.debug: import pdb pdb.set_trace() session = self.db_vervet.session session.begin() if not self.data_dir: self.data_dir = self.db_vervet.data_dir data_dir = self.data_dir realPath = os.path.realpath(self.inputFname) logMessage = "file %s.\n"%(self.inputFname) if NextGenSeq.isFileNameVCF(realPath, includeIndelVCF=True) and \ not NextGenSeq.isVCFFileEmpty(realPath, checkContent=self.checkEmptyVCFByReading): vcfFile = VCFFile(inputFname=self.inputFname) individualAlignmentLs = self.getAlignmentLsFromVCF(db_vervet=self.db_vervet, vcfFile=vcfFile) genotypeMethod = self.db_vervet.getGenotypeMethod(short_name=self.genotypeMethodShortName, \ individualAlignmentLs=individualAlignmentLs,\ no_of_individuals=len(individualAlignmentLs), no_of_loci=None,\ data_dir=self.data_dir) self.checkIfAlignmentListMatchMethodDBEntry(individualAlignmentLs, genotypeMethod, session) pdata = self.getNoOfLociFromVCFFile(vcfFile) chromosome2noOfLoci = pdata.chromosome2noOfLoci no_of_loci = pdata.no_of_loci if no_of_loci>0: #file with zero loci could have identical md5sum try: md5sum = utils.get_md5sum(realPath) except: sys.stderr.write('Except type: %s\n'%repr(sys.exc_info())) import traceback traceback.print_exc() self.cleanUpAndExitOnFailure(exitCode=4) else: md5sum = None """ db_entry = VervetDB.GenotypeFile.query.filter_by(md5sum=md5sum).first() if db_entry: sys.stderr.write("Warning: another file %s with the identical md5sum %s as this file %s is already in db.\n"%\ (db_entry.path, md5sum, realPath)) session.rollback() #2012.8.3 when the jobs are clustered into one merged job and it failed halfway # and retried elsewhere, the redundancy check should not exit with non-zero. otherwise the merged job would fail again. self.cleanUpAndExitOnFailure(exitCode=0) """ no_of_individuals = len(individualAlignmentLs) no_of_chromosomes = len(chromosome2noOfLoci) if no_of_chromosomes == 1: #2012.8.30 use 1st chromosome chromosome = chromosome2noOfLoci.keys()[0] else: chromosome = None genotypeFile = self.db_vervet.getGenotypeFile(genotype_method=genotypeMethod,\ chromosome=chromosome, format=self.format, path=None, file_size=None, md5sum=md5sum,\ original_path=realPath, no_of_individuals=no_of_individuals, no_of_loci=no_of_loci,\ data_dir=self.data_dir, no_of_chromosomes=no_of_chromosomes) if genotypeFile.id and genotypeFile.path: isPathInDB = self.db_vervet.isPathInDBAffiliatedStorage(relativePath=genotypeFile.path, data_dir=self.data_dir) if isPathInDB==-1: sys.stderr.write("Error while updating genotypeFile.path with the new path, %s.\n"%(genotypeFile.path)) self.cleanUpAndExitOnFailure(exitCode=isPathInDB) elif isPathInDB==1: #successful exit, entry already in db sys.stderr.write("Warning: file %s is already in db.\n"%\ (genotypeFile.path)) session.rollback() self.cleanUpAndExitOnFailure(exitCode=0) else: #not in db affiliated storage, keep going. 
pass #move the file and update the db_entry's path as well inputFileBasename = os.path.basename(self.inputFname) relativePath = genotypeFile.constructRelativePath(sourceFilename=inputFileBasename) exitCode = self.db_vervet.moveFileIntoDBAffiliatedStorage(db_entry=genotypeFile, filename=inputFileBasename, \ inputDir=os.path.split(self.inputFname)[0], dstFilename=os.path.join(self.data_dir, relativePath), \ relativeOutputDir=None, shellCommand='cp -rL', \ srcFilenameLs=self.srcFilenameLs, dstFilenameLs=self.dstFilenameLs,\ constructRelativePathFunction=genotypeFile.constructRelativePath) if exitCode!=0: sys.stderr.write("Error: moveFileIntoDBAffiliatedStorage() exits with %s code.\n"%(exitCode)) session.rollback() self.cleanUpAndExitOnFailure(exitCode=exitCode) #copy the tbi (tabix) index file if it exists tbiFilename = '%s.tbi'%(realPath) if os.path.isfile(tbiFilename): srcFilename = tbiFilename dstFilename = os.path.join(self.data_dir, '%s.tbi'%(genotypeFile.path)) utils.copyFile(srcFilename=srcFilename, dstFilename=dstFilename) logMessage += "tbi file %s has been copied to %s.\n"%(srcFilename, dstFilename) ## 2012.7.17 commented out because md5sum is calcualted above #db_vervet.updateDBEntryMD5SUM(db_entry=genotypeFile, data_dir=data_dir) # #2012.7.17 record the size of db_entry.path (folder or file) self.db_vervet.updateDBEntryPathFileSize(db_entry=genotypeFile, data_dir=self.data_dir) vcfFile.close() logMessage += "%s individuals, %s loci, md5sum=%s.\n"%(no_of_individuals, no_of_loci, md5sum) else: logMessage += " is empty (no loci) or not VCF file.\n" self.outputLogMessage(logMessage) if self.commit: try: session.flush() session.commit() except: sys.stderr.write('Except type: %s\n'%repr(sys.exc_info())) import traceback traceback.print_exc() self.cleanUpAndExitOnFailure(exitCode=3) else: session.rollback() #delete all target files but exit gracefully (exit 0) self.cleanUpAndExitOnFailure(exitCode=0)
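# Hedged illustration (not the actual utils.get_md5sum): a minimal streaming md5 sketch showing the kind
# of checksum run() stores for non-empty genotype files, reading in chunks so large VCFs never have to
# fit in memory. The helper name and chunk size are assumptions for illustration only.
import hashlib

def _get_md5sumSketch(inputFname, chunkSize=16*1024*1024):
	m = hashlib.md5()
	f = open(inputFname, 'rb')
	try:
		chunk = f.read(chunkSize)
		while chunk:
			m.update(chunk)
			chunk = f.read(chunkSize)
	finally:
		f.close()
	return m.hexdigest()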