Пример #1
0
	def copyFileWithAnotherFilePrefix(self, inputFname=None, filenameWithPrefix=None, \
									outputDir=None, outputFileRelativePath=None, \
									logMessage=None, srcFilenameLs=None, dstFilenameLs=None):
		"""
		Copy inputFname into outputDir and return the (possibly extended) logMessage.
		
		The destination is os.path.join(outputDir, outputFileRelativePath). If
		outputFileRelativePath is None and filenameWithPrefix is given, the relative path
		is derived as: (filenameWithPrefix without its extension) + (extension of inputFname).
		
		Arguments:
			inputFname: path of the file to copy.
			filenameWithPrefix: filename whose extension-less part becomes the new prefix.
			outputDir: directory that the relative output path is joined onto.
			outputFileRelativePath: explicit relative destination; overrides filenameWithPrefix.
			logMessage: if not None, a line describing the copy is appended to it.
			srcFilenameLs, dstFilenameLs: if not None, the source/destination path is
				appended (the lists are modified in place).
		Returns:
			logMessage (None if None was passed in).
		Raises:
			ValueError if no destination path can be derived.
			RuntimeError if utils.copyFile returns a non-zero code.
		
		2013.08.08 added argument outputFileRelativePath
		2013.3.18 bugfix in filename. there was extra . between prefix and suffix.
			moved from vervet/src/VervetDB.py
		2012.9.20
		"""
		srcFilename = inputFname
		if outputFileRelativePath is None and filenameWithPrefix:
			suffix = os.path.splitext(os.path.basename(inputFname))[1]
			newPrefix = os.path.splitext(filenameWithPrefix)[0]
			outputFileRelativePath = '%s%s'%(newPrefix, suffix)
		if outputFileRelativePath is None:
			#fail with a readable message rather than an obscure error from os.path.join()
			raise ValueError("Either outputFileRelativePath or filenameWithPrefix must be given to copy %s."%\
							(srcFilename))
		
		dstFilename = os.path.join(outputDir, outputFileRelativePath)
		returnCode = utils.copyFile(srcFilename=srcFilename, dstFilename=dstFilename)
		if returnCode!=0:
			sys.stderr.write("ERROR during utils.copyFile. check stderr message just ahead of this.\n")
			#a bare "raise" here had no active exception to re-raise; raise an explicit error instead
			raise RuntimeError("utils.copyFile failed (return code %s) while copying %s to %s."%\
							(returnCode, srcFilename, dstFilename))
		#compare against None (not truthiness) so that an empty string/list passed in by the caller still gets filled
		if logMessage is not None:
			logMessage += "file %s has been copied to %s.\n"%(srcFilename, dstFilename)
		if srcFilenameLs is not None:
			srcFilenameLs.append(srcFilename)
		if dstFilenameLs is not None:
			dstFilenameLs.append(dstFilename)
		return logMessage
Пример #2
0
	def run(self):
		"""
		Register the VCF file self.inputFname in the db and copy it (plus its tabix index,
		if present) into the db-affiliated storage under self.data_dir.
		
		Steps: open a session, look up/create the genotype method and genotype file entries,
		copy the file via db_vervet.moveFileIntoDBAffiliatedStorage(), copy the .tbi file if it
		exists, update the file size of the db entry, then commit (self.commit) or roll back.
		
		Exit codes handed to self.cleanUpAndExitOnFailure() (presumably terminates the
		process -- its definition is not visible here):
			4: md5sum calculation failed.
			-1 (value of isPathInDB): error reported by isPathInDBAffiliatedStorage().
			0: file already in db, or self.commit is false.
			3: session flush/commit failed.
			other: non-zero exit code of moveFileIntoDBAffiliatedStorage().
		
		2012.7.13
		"""
		if self.debug:
			import pdb
			pdb.set_trace()
		session = self.db_vervet.session
		
		session.begin()
		if not self.data_dir:
			self.data_dir = self.db_vervet.data_dir
		
		realPath = os.path.realpath(self.inputFname)
		logMessage = "file %s.\n"%(self.inputFname)
		if NextGenSeq.isFileNameVCF(realPath, includeIndelVCF=True) and \
				not NextGenSeq.isVCFFileEmpty(realPath, checkContent=self.checkEmptyVCFByReading):
			vcfFile = VCFFile(inputFname=self.inputFname)
			
			individualAlignmentLs = self.getAlignmentLsFromVCF(db_vervet=self.db_vervet, vcfFile=vcfFile)
			
			genotypeMethod = self.db_vervet.getGenotypeMethod(short_name=self.genotypeMethodShortName, \
															individualAlignmentLs=individualAlignmentLs,\
															no_of_individuals=len(individualAlignmentLs), no_of_loci=None,\
															data_dir=self.data_dir)
			self.checkIfAlignmentListMatchMethodDBEntry(individualAlignmentLs, genotypeMethod, session)
			
			pdata = self.getNoOfLociFromVCFFile(vcfFile)
			chromosome2noOfLoci = pdata.chromosome2noOfLoci
			no_of_loci = pdata.no_of_loci
			if no_of_loci>0:	#file with zero loci could have identical md5sum
				try:
					md5sum = utils.get_md5sum(realPath)
				except:
					#NOTE(review): bare except kept so every failure type still reaches cleanUpAndExitOnFailure()
					sys.stderr.write('Except type: %s\n'%repr(sys.exc_info()))
					import traceback
					traceback.print_exc()
					self.cleanUpAndExitOnFailure(exitCode=4)
			else:
				md5sum = None
			#A md5sum-based redundancy check (query GenotypeFile by md5sum, exit 0 if found) used to
			# sit here; it was already disabled and is replaced by the path-based check below.
			no_of_individuals = len(individualAlignmentLs)
			no_of_chromosomes = len(chromosome2noOfLoci)
			if no_of_chromosomes == 1:	#2012.8.30 use 1st chromosome
				#list(...) so that this also works where keys() returns a non-indexable view
				chromosome = list(chromosome2noOfLoci.keys())[0]
			else:
				chromosome = None
			genotypeFile = self.db_vervet.getGenotypeFile(genotype_method=genotypeMethod,\
										chromosome=chromosome, format=self.format, path=None, file_size=None, md5sum=md5sum,\
										original_path=realPath, no_of_individuals=no_of_individuals, no_of_loci=no_of_loci,\
										data_dir=self.data_dir, no_of_chromosomes=no_of_chromosomes)
			if genotypeFile.id and genotypeFile.path:
				isPathInDB = self.db_vervet.isPathInDBAffiliatedStorage(relativePath=genotypeFile.path, data_dir=self.data_dir)
				if isPathInDB==-1:
					sys.stderr.write("Error while updating genotypeFile.path with the new path, %s.\n"%(genotypeFile.path))
					self.cleanUpAndExitOnFailure(exitCode=isPathInDB)
				elif isPathInDB==1:	#successful exit, entry already in db
					sys.stderr.write("Warning: file %s is already in db.\n"%\
										(genotypeFile.path))
					session.rollback()
					self.cleanUpAndExitOnFailure(exitCode=0)
				else:	#not in db affiliated storage, keep going.
					pass
			#move the file and update the db_entry's path as well
			inputFileBasename = os.path.basename(self.inputFname)
			relativePath = genotypeFile.constructRelativePath(sourceFilename=inputFileBasename)
			exitCode = self.db_vervet.moveFileIntoDBAffiliatedStorage(db_entry=genotypeFile, filename=inputFileBasename, \
									inputDir=os.path.split(self.inputFname)[0], dstFilename=os.path.join(self.data_dir, relativePath), \
									relativeOutputDir=None, shellCommand='cp -rL', \
									srcFilenameLs=self.srcFilenameLs, dstFilenameLs=self.dstFilenameLs,\
									constructRelativePathFunction=genotypeFile.constructRelativePath)
			
			if exitCode!=0:
				sys.stderr.write("Error: moveFileIntoDBAffiliatedStorage() exits with %s code.\n"%(exitCode))
				session.rollback()
				self.cleanUpAndExitOnFailure(exitCode=exitCode)
			
			#copy the tbi (tabix) index file if it exists
			tbiFilename = '%s.tbi'%(realPath)
			if os.path.isfile(tbiFilename):
				srcFilename = tbiFilename
				dstFilename = os.path.join(self.data_dir, '%s.tbi'%(genotypeFile.path))
				returnCode = utils.copyFile(srcFilename=srcFilename, dstFilename=dstFilename)
				#record the index file alongside the main file in the src/dst lists
				# (presumably what the cleanup uses to find files to delete -- verify)
				if self.srcFilenameLs is not None:
					self.srcFilenameLs.append(srcFilename)
				if self.dstFilenameLs is not None:
					self.dstFilenameLs.append(dstFilename)
				#presumably utils.copyFile returns 0 on success -- verify against utils
				if returnCode==0:
					logMessage += "tbi file %s has been copied to %s.\n"%(srcFilename, dstFilename)
				else:
					sys.stderr.write("Warning: copying tbi file %s to %s failed with return code %s.\n"%\
									(srcFilename, dstFilename, returnCode))
			## 2012.7.17 md5sum is calculated above, so no separate md5sum update of the db entry here
			# #2012.7.17 record the size of db_entry.path (folder or file)
			self.db_vervet.updateDBEntryPathFileSize(db_entry=genotypeFile, data_dir=self.data_dir)
			
			vcfFile.close()
			logMessage += "%s individuals, %s loci, md5sum=%s.\n"%(no_of_individuals, no_of_loci, md5sum)
		else:
			logMessage += " is empty (no loci) or not VCF file.\n"
		self.outputLogMessage(logMessage)
		
		if self.commit:
			try:
				session.flush()
				session.commit()
			except:
				sys.stderr.write('Except type: %s\n'%repr(sys.exc_info()))
				import traceback
				traceback.print_exc()
				self.cleanUpAndExitOnFailure(exitCode=3)
		else:
			session.rollback()
			#delete all target files but exit gracefully (exit 0)
			self.cleanUpAndExitOnFailure(exitCode=0)
Пример #3
0
	def run(self):
		"""
		Register the alignment file self.inputFname in the db and copy it, its .bai index
		and any extra files in self.inputFnameLs into the db-affiliated storage (self.data_dir).
		
		The IndividualAlignment entry is fetched by self.individual_alignment_id, copied from
		self.parent_individual_alignment_id, or looked up/created via db_vervet.getAlignment().
		At the end the session is committed if self.commit is set, otherwise rolled back.
		
		Exit codes handed to self.cleanUpAndExitOnFailure() (presumably terminates the
		process -- its definition is not visible here):
			4: md5sum calculation failed.
			3: another db entry with the identical md5sum has its file in storage, or commit failed.
			5: exception while copying files, or the .bai index file is missing.
			0: self.commit is false.
			other: non-zero exit code of moveFileIntoDBAffiliatedStorage().
		
		2012.7.13
		"""
		if self.debug:
			import pdb
			pdb.set_trace()
		session = self.db_vervet.session
		
		session.begin()
		if not self.data_dir:
			self.data_dir = self.db_vervet.data_dir
		data_dir = self.data_dir
		
		inputFileRealPath = os.path.realpath(self.inputFname)
		logMessage = "Adding file %s to db .\n"%(self.inputFname)
		
		if os.path.isfile(inputFileRealPath):
			if self.individual_alignment_id:
				individual_alignment = VervetDB.IndividualAlignment.get(self.individual_alignment_id)
			elif self.parent_individual_alignment_id:
				individual_alignment = self.db_vervet.copyParentIndividualAlignment(parent_individual_alignment_id=self.parent_individual_alignment_id,\
																	mask_genotype_method_id=self.mask_genotype_method_id,\
																	data_dir=self.data_dir, local_realigned=self.local_realigned)
			else:
				#alignment for this library of the individual_sequence
				individual_sequence = VervetDB.IndividualSequence.get(self.individual_sequence_id)
				
				individual_alignment = self.db_vervet.getAlignment(individual_sequence_id=self.individual_sequence_id,\
										path_to_original_alignment=None, sequencer=individual_sequence.sequencer,\
										sequence_type=individual_sequence.sequence_type, sequence_format=individual_sequence.format, \
										ref_individual_sequence_id=self.ref_sequence_id, \
										alignment_method_id=self.alignment_method_id, alignment_format=self.format,\
										individual_sequence_filtered=individual_sequence.filtered, read_group_added=1,
										data_dir=data_dir, \
										mask_genotype_method_id=self.mask_genotype_method_id, \
										parent_individual_alignment_id=self.parent_individual_alignment_id,\
										individual_sequence_file_raw_id=self.individual_sequence_file_raw_id,\
										local_realigned=self.local_realigned, read_group=self.read_group)
			#fill in attributes missing/different on the db entry; flush only if something changed
			needSessionFlush = False
			if not individual_alignment.path:
				individual_alignment.path = individual_alignment.constructRelativePath()
				needSessionFlush = True
			
			if self.mask_genotype_method_id and \
					individual_alignment.mask_genotype_method_id!=self.mask_genotype_method_id:
				individual_alignment.mask_genotype_method_id = self.mask_genotype_method_id
				needSessionFlush = True
			if self.individual_sequence_file_raw_id and \
					individual_alignment.individual_sequence_file_raw_id != self.individual_sequence_file_raw_id:
				individual_alignment.individual_sequence_file_raw_id = self.individual_sequence_file_raw_id
				needSessionFlush = True
			
			if needSessionFlush:
				session.add(individual_alignment)
				session.flush()
			
			try:
				md5sum = utils.get_md5sum(inputFileRealPath)
			except:
				sys.stderr.write('Except type: %s\n'%repr(sys.exc_info()))
				import traceback
				traceback.print_exc()
				self.cleanUpAndExitOnFailure(exitCode=4)
			
			#refuse to add the file if a different db entry with the same md5sum already has its file in storage
			db_entry = VervetDB.IndividualAlignment.query.filter_by(md5sum=md5sum).first()
			if db_entry and db_entry.id!=individual_alignment.id and db_entry.path and os.path.isfile(os.path.join(data_dir, db_entry.path)):
				sys.stderr.write("Warning: another file %s with the identical md5sum %s as this file %s, is already in db.\n"%\
								(db_entry.path, md5sum, inputFileRealPath))
				self.sessionRollback(session)
				self.cleanUpAndExitOnFailure(exitCode=3)
			
			
			if individual_alignment.md5sum is None or individual_alignment.md5sum!=md5sum:
				individual_alignment.md5sum = md5sum
				session.add(individual_alignment)
				session.flush()
			try:
				#move the file and update the db_entry's path as well
				exitCode = self.db_vervet.moveFileIntoDBAffiliatedStorage(db_entry=individual_alignment, filename=os.path.basename(inputFileRealPath), \
							inputDir=os.path.split(inputFileRealPath)[0], dstFilename=os.path.join(self.data_dir, individual_alignment.path), \
							relativeOutputDir=None, shellCommand='cp -rL', \
							srcFilenameLs=self.srcFilenameLs, dstFilenameLs=self.dstFilenameLs,\
							constructRelativePathFunction=individual_alignment.constructRelativePath)
			except:
				sys.stderr.write('Except in copying %s to db-storage with except info: %s\n'%(inputFileRealPath, repr(sys.exc_info())))
				import traceback
				traceback.print_exc()
				self.sessionRollback(session)
				self.cleanUpAndExitOnFailure(exitCode=5)
			
			if exitCode!=0:
				sys.stderr.write("Error: moveFileIntoDBAffiliatedStorage() exits with code=%s.\n"%(exitCode))
				self.sessionRollback(session)
				self.cleanUpAndExitOnFailure(exitCode=exitCode)
			
			baiFilename = '%s.bai'%(self.inputFname)
			#set inside the try-block below, acted upon after it (see comment there)
			baiFileMissing = False
			try:
				#make sure these files are stored in self.dstFilenameLs and self.srcFilenameLs
				#copy further files if there are
				if self.inputFnameLs:
					for inputFname in self.inputFnameLs:
						if inputFname!=self.inputFname:	#2013.3.18 make sure it has not been copied.
							logMessage = self.db_vervet.copyFileWithAnotherFilePrefix(inputFname=inputFname, \
												filenameWithPrefix=individual_alignment.path, \
												outputDir=self.data_dir,\
												logMessage=logMessage, srcFilenameLs=self.srcFilenameLs, \
												dstFilenameLs=self.dstFilenameLs)
				
				self.db_vervet.updateDBEntryPathFileSize(db_entry=individual_alignment, data_dir=data_dir)
				
				## 2012.7.17 md5sum is calculated above, so no separate md5sum update of the db entry here
				#copy the bai index file if it exists
				if os.path.isfile(baiFilename):
					srcFilename = baiFilename
					dstFilename = os.path.join(self.data_dir, '%s.bai'%(individual_alignment.path))
					returnCode = utils.copyFile(srcFilename=srcFilename, dstFilename=dstFilename)
					#record first, so that a partially-copied file is also tracked in the lists
					self.srcFilenameLs.append(srcFilename)
					self.dstFilenameLs.append(dstFilename)
					#presumably utils.copyFile returns 0 on success -- verify against utils
					if returnCode!=0:
						raise RuntimeError("utils.copyFile failed (return code %s) while copying %s to %s."%\
										(returnCode, srcFilename, dstFilename))
					logMessage += "bai file %s has been copied to %s.\n"%(srcFilename, dstFilename)
				else:
					baiFileMissing = True
			except:
				sys.stderr.write('Except type: %s\n'%repr(sys.exc_info()))
				import traceback
				traceback.print_exc()
				self.sessionRollback(session)
				self.cleanUpAndExitOnFailure(exitCode=5)
			
			if baiFileMissing:
				#handled outside the try-block above: if cleanUpAndExitOnFailure() exits by raising
				# SystemExit, the bare except there would catch it and run rollback/cleanup a 2nd time.
				sys.stderr.write("Error: bai index file %s does not exist.\n"%(baiFilename))
				self.sessionRollback(session)
				self.cleanUpAndExitOnFailure(exitCode=5)
		else:
			logMessage += "%s doesn't exist.\n"%(inputFileRealPath)
		self.outputLogMessage(logMessage)
		
		if self.commit:
			try:
				session.flush()
				session.commit()
			except:
				sys.stderr.write('Except type: %s\n'%repr(sys.exc_info()))
				import traceback
				traceback.print_exc()
				self.cleanUpAndExitOnFailure(exitCode=3)
		else:
			#delete all target files but exit gracefully (exit 0)
			self.sessionRollback(session)
			self.cleanUpAndExitOnFailure(exitCode=0)