def copyFileWithAnotherFilePrefix(self, inputFname=None, filenameWithPrefix=None, \
    outputDir=None, outputFileRelativePath=None, \
    logMessage=None, srcFilenameLs=None, dstFilenameLs=None):
    """
    Copy inputFname into outputDir, naming the copy after another file's prefix.

    When outputFileRelativePath is not given, the destination name is built from
    filenameWithPrefix with its last extension stripped, plus the last extension
    of inputFname (e.g. prefix "dir/12_x.bam" + input "a.bam.bai" => "dir/12_x.bai").

    Arguments:
        inputFname: path of the file to copy.
        filenameWithPrefix: file name (may include relative directories) whose
            extension-less part becomes the prefix of the destination file.
            Ignored if outputFileRelativePath is given.
        outputDir: directory that the relative destination path is joined to.
        outputFileRelativePath: explicit destination path relative to outputDir.
            Either this or filenameWithPrefix must be supplied; otherwise
            os.path.join() fails on None.
        logMessage: running log string. If not None, a line describing the copy
            is appended to it.
        srcFilenameLs, dstFilenameLs: optional lists; if not None, the source and
            destination paths are appended to them (also when they are empty).

    Returns:
        logMessage (extended by one line if it was not None).

    Raises:
        RuntimeError: utils.copyFile() returned a non-zero code.

    2013.08.08 added argument outputFileRelativePath
    2013.3.18 bugfix in filename. there was extra . between prefix and suffix.
        moved from vervet/src/VervetDB.py
    2012.9.20
    """
    srcFilename = inputFname
    if outputFileRelativePath is None and filenameWithPrefix:
        # only the last extension of the input file is carried over to the new name
        suffix = os.path.splitext(os.path.basename(inputFname))[1]
        newPrefix = os.path.splitext(filenameWithPrefix)[0]
        outputFileRelativePath = '%s%s'%(newPrefix, suffix)

    dstFilename = os.path.join(outputDir, outputFileRelativePath)
    returnCode = utils.copyFile(srcFilename=srcFilename, dstFilename=dstFilename)
    if returnCode!=0:
        sys.stderr.write("ERROR during utils.copyFile. check stderr message just ahead of this.\n")
        # an explicit exception instead of a bare "raise" (there is no active exception here)
        raise RuntimeError("utils.copyFile() from %s to %s returned code %s."%\
            (srcFilename, dstFilename, returnCode))
    # "is not None" rather than truthiness: an empty string/list passed by the caller
    # must still be updated, otherwise the copy goes unrecorded.
    if logMessage is not None:
        logMessage += "file %s has been copied to %s.\n"%(srcFilename, dstFilename)
    if srcFilenameLs is not None:
        srcFilenameLs.append(srcFilename)
    if dstFilenameLs is not None:
        dstFilenameLs.append(dstFilename)
    return logMessage
def run(self):
    """
    Register the VCF file self.inputFname in the db and copy it (plus its tabix
    index, if present) into the db-affiliated storage under self.data_dir.

    Outline:
        1. Begin a db session; default self.data_dir to self.db_vervet.data_dir.
        2. If the input is a non-empty VCF: look up/create the genotype method and
            the genotype file db entry, copy the file into storage, copy the
            .tbi index if it exists, and record the file size.
            Otherwise only a log message is produced.
        3. Output the log message, then commit if self.commit is set; otherwise
            roll back and call cleanUpAndExitOnFailure(exitCode=0).

    Failure paths go through self.cleanUpAndExitOnFailure() with these codes:
        4: md5sum calculation failed.
        -1: isPathInDBAffiliatedStorage() reported an error.
        0: the file is already in db storage (not treated as a failure).
        <exitCode of moveFileIntoDBAffiliatedStorage()>: copying failed.
        3: session flush/commit failed.

    2012.7.13
    """
    if self.debug:
        import pdb
        pdb.set_trace()
    session = self.db_vervet.session

    session.begin()
    if not self.data_dir:
        self.data_dir = self.db_vervet.data_dir

    realPath = os.path.realpath(self.inputFname)
    logMessage = "file %s.\n"%(self.inputFname)
    if NextGenSeq.isFileNameVCF(realPath, includeIndelVCF=True) and \
        not NextGenSeq.isVCFFileEmpty(realPath, checkContent=self.checkEmptyVCFByReading):
        vcfFile = VCFFile(inputFname=self.inputFname)

        individualAlignmentLs = self.getAlignmentLsFromVCF(db_vervet=self.db_vervet, vcfFile=vcfFile)

        genotypeMethod = self.db_vervet.getGenotypeMethod(short_name=self.genotypeMethodShortName,
            individualAlignmentLs=individualAlignmentLs,
            no_of_individuals=len(individualAlignmentLs), no_of_loci=None,
            data_dir=self.data_dir)
        self.checkIfAlignmentListMatchMethodDBEntry(individualAlignmentLs, genotypeMethod, session)

        pdata = self.getNoOfLociFromVCFFile(vcfFile)
        chromosome2noOfLoci = pdata.chromosome2noOfLoci
        no_of_loci = pdata.no_of_loci
        if no_of_loci>0:    #file with zero loci could have identical md5sum
            try:
                md5sum = utils.get_md5sum(realPath)
            except:
                sys.stderr.write('Except type: %s\n'%repr(sys.exc_info()))
                import traceback
                traceback.print_exc()
                self.cleanUpAndExitOnFailure(exitCode=4)
        else:
            md5sum = None
        # Disabled md5sum redundancy check, kept for reference:
        #   db_entry = VervetDB.GenotypeFile.query.filter_by(md5sum=md5sum).first()
        #   if db_entry:
        #       sys.stderr.write("Warning: another file %s with the identical md5sum %s as this file %s is already in db.\n"%\
        #           (db_entry.path, md5sum, realPath))
        #       session.rollback()
        #       #2012.8.3 when the jobs are clustered into one merged job and it failed halfway
        #       # and retried elsewhere, the redundancy check should not exit with non-zero.
        #       # otherwise the merged job would fail again.
        #       self.cleanUpAndExitOnFailure(exitCode=0)

        no_of_individuals = len(individualAlignmentLs)
        no_of_chromosomes = len(chromosome2noOfLoci)
        if no_of_chromosomes == 1:  #2012.8.30 use 1st chromosome
            # next(iter(...)) instead of keys()[0]: dict.keys() is not indexable on Python 3
            chromosome = next(iter(chromosome2noOfLoci))
        else:
            chromosome = None
        genotypeFile = self.db_vervet.getGenotypeFile(genotype_method=genotypeMethod,
            chromosome=chromosome, format=self.format, path=None, file_size=None, md5sum=md5sum,
            original_path=realPath, no_of_individuals=no_of_individuals, no_of_loci=no_of_loci,
            data_dir=self.data_dir, no_of_chromosomes=no_of_chromosomes)
        if genotypeFile.id and genotypeFile.path:
            isPathInDB = self.db_vervet.isPathInDBAffiliatedStorage(relativePath=genotypeFile.path, data_dir=self.data_dir)
            if isPathInDB==-1:
                sys.stderr.write("Error while updating genotypeFile.path with the new path, %s.\n"%(genotypeFile.path))
                self.cleanUpAndExitOnFailure(exitCode=isPathInDB)
            elif isPathInDB==1: #successful exit, entry already in db
                sys.stderr.write("Warning: file %s is already in db.\n"%\
                    (genotypeFile.path))
                session.rollback()
                self.cleanUpAndExitOnFailure(exitCode=0)
            else:   #not in db affiliated storage, keep going.
                pass
        #move the file and update the db_entry's path as well
        inputFileBasename = os.path.basename(self.inputFname)
        relativePath = genotypeFile.constructRelativePath(sourceFilename=inputFileBasename)
        exitCode = self.db_vervet.moveFileIntoDBAffiliatedStorage(db_entry=genotypeFile, filename=inputFileBasename,
            inputDir=os.path.split(self.inputFname)[0], dstFilename=os.path.join(self.data_dir, relativePath),
            relativeOutputDir=None, shellCommand='cp -rL',
            srcFilenameLs=self.srcFilenameLs, dstFilenameLs=self.dstFilenameLs,
            constructRelativePathFunction=genotypeFile.constructRelativePath)

        if exitCode!=0:
            sys.stderr.write("Error: moveFileIntoDBAffiliatedStorage() exits with %s code.\n"%(exitCode))
            session.rollback()
            self.cleanUpAndExitOnFailure(exitCode=exitCode)

        #copy the tbi (tabix) index file if it exists
        tbiFilename = '%s.tbi'%(realPath)
        if os.path.isfile(tbiFilename):
            srcFilename = tbiFilename
            dstFilename = os.path.join(self.data_dir, '%s.tbi'%(genotypeFile.path))
            returnCode = utils.copyFile(srcFilename=srcFilename, dstFilename=dstFilename)
            # Record the index copy alongside the main file so it is not orphaned
            # when the transaction is rolled back.
            # NOTE(review): assumes cleanUpAndExitOnFailure() works off these lists - confirm.
            self.srcFilenameLs.append(srcFilename)
            self.dstFilenameLs.append(dstFilename)
            if returnCode!=0:
                # the index is optional, so a failed copy is reported but not fatal
                sys.stderr.write("Warning: copying tbi file %s to %s returned code %s.\n"%\
                    (srcFilename, dstFilename, returnCode))
            else:
                logMessage += "tbi file %s has been copied to %s.\n"%(srcFilename, dstFilename)

        ## 2012.7.17 commented out because md5sum is calculated above
        #db_vervet.updateDBEntryMD5SUM(db_entry=genotypeFile, data_dir=data_dir)
        #2012.7.17 record the size of db_entry.path (folder or file)
        self.db_vervet.updateDBEntryPathFileSize(db_entry=genotypeFile, data_dir=self.data_dir)

        vcfFile.close()
        logMessage += "%s individuals, %s loci, md5sum=%s.\n"%(no_of_individuals, no_of_loci, md5sum)
    else:
        logMessage += " is empty (no loci) or not VCF file.\n"
    self.outputLogMessage(logMessage)

    if self.commit:
        try:
            session.flush()
            session.commit()
        except:
            sys.stderr.write('Except type: %s\n'%repr(sys.exc_info()))
            import traceback
            traceback.print_exc()
            self.cleanUpAndExitOnFailure(exitCode=3)
    else:
        session.rollback()
        #delete all target files but exit gracefully (exit 0)
        self.cleanUpAndExitOnFailure(exitCode=0)
def run(self):
    """
    Register the alignment file self.inputFname in the db and copy it, its
    companion files (self.inputFnameLs) and its .bai index into the db-affiliated
    storage under self.data_dir.

    Outline:
        1. Begin a db session; default self.data_dir to self.db_vervet.data_dir.
        2. If the input file exists:
            a. fetch the IndividualAlignment entry by id, or copy it from the
                parent alignment, or look it up/create it via getAlignment().
            b. fill in path / mask_genotype_method_id / individual_sequence_file_raw_id
                where they differ and flush.
            c. compute the md5sum; exit (code 3) if a different db entry with the
                same md5sum already has its file in storage.
            d. copy the file into storage, then the extra files and the .bai index.
            Otherwise only a log message is produced.
        3. Output the log message, then commit if self.commit is set; otherwise
            roll back and call cleanUpAndExitOnFailure(exitCode=0).

    Failure paths go through self.cleanUpAndExitOnFailure() with these codes:
        4: md5sum calculation failed.
        3: duplicate md5sum, or session flush/commit failed.
        5: exception while copying files, missing .bai index, or failed .bai copy.
        <exitCode of moveFileIntoDBAffiliatedStorage()>: copying failed.

    2012.7.13
    """
    if self.debug:
        import pdb
        pdb.set_trace()
    session = self.db_vervet.session

    session.begin()
    if not self.data_dir:
        self.data_dir = self.db_vervet.data_dir
    data_dir = self.data_dir

    inputFileRealPath = os.path.realpath(self.inputFname)
    logMessage = "Adding file %s to db .\n"%(self.inputFname)

    if os.path.isfile(inputFileRealPath):
        if self.individual_alignment_id:
            individual_alignment = VervetDB.IndividualAlignment.get(self.individual_alignment_id)
        elif self.parent_individual_alignment_id:
            individual_alignment = self.db_vervet.copyParentIndividualAlignment(
                parent_individual_alignment_id=self.parent_individual_alignment_id,
                mask_genotype_method_id=self.mask_genotype_method_id,
                data_dir=self.data_dir, local_realigned=self.local_realigned)
        else:
            #alignment for this library of the individual_sequence
            individual_sequence = VervetDB.IndividualSequence.get(self.individual_sequence_id)

            individual_alignment = self.db_vervet.getAlignment(individual_sequence_id=self.individual_sequence_id,
                path_to_original_alignment=None, sequencer=individual_sequence.sequencer,
                sequence_type=individual_sequence.sequence_type, sequence_format=individual_sequence.format,
                ref_individual_sequence_id=self.ref_sequence_id,
                alignment_method_id=self.alignment_method_id, alignment_format=self.format,
                individual_sequence_filtered=individual_sequence.filtered, read_group_added=1, data_dir=data_dir,
                mask_genotype_method_id=self.mask_genotype_method_id,
                parent_individual_alignment_id=self.parent_individual_alignment_id,
                individual_sequence_file_raw_id=self.individual_sequence_file_raw_id,
                local_realigned=self.local_realigned, read_group=self.read_group)

        # bring the db entry in line with the command-line arguments; flush only if something changed
        needSessionFlush = False
        if not individual_alignment.path:
            individual_alignment.path = individual_alignment.constructRelativePath()
            needSessionFlush = True

        if self.mask_genotype_method_id and \
            individual_alignment.mask_genotype_method_id!=self.mask_genotype_method_id:
            individual_alignment.mask_genotype_method_id = self.mask_genotype_method_id
            needSessionFlush = True
        if self.individual_sequence_file_raw_id and \
            individual_alignment.individual_sequence_file_raw_id != self.individual_sequence_file_raw_id:
            individual_alignment.individual_sequence_file_raw_id = self.individual_sequence_file_raw_id
            needSessionFlush = True

        if needSessionFlush:
            session.add(individual_alignment)
            session.flush()

        try:
            md5sum = utils.get_md5sum(inputFileRealPath)
        except:
            sys.stderr.write('Except type: %s\n'%repr(sys.exc_info()))
            import traceback
            traceback.print_exc()
            self.cleanUpAndExitOnFailure(exitCode=4)

        # a different entry with the same md5sum only counts as a duplicate
        # if its file actually exists in the db storage
        db_entry = VervetDB.IndividualAlignment.query.filter_by(md5sum=md5sum).first()
        if db_entry and db_entry.id!=individual_alignment.id and db_entry.path and \
            os.path.isfile(os.path.join(data_dir, db_entry.path)):
            sys.stderr.write("Warning: another file %s with the identical md5sum %s as this file %s, is already in db.\n"%\
                (db_entry.path, md5sum, inputFileRealPath))
            self.sessionRollback(session)
            self.cleanUpAndExitOnFailure(exitCode=3)

        if individual_alignment.md5sum is None or individual_alignment.md5sum!=md5sum:
            individual_alignment.md5sum = md5sum
            session.add(individual_alignment)
            session.flush()

        try:
            #move the file and update the db_entry's path as well
            exitCode = self.db_vervet.moveFileIntoDBAffiliatedStorage(db_entry=individual_alignment,
                filename=os.path.basename(inputFileRealPath),
                inputDir=os.path.split(inputFileRealPath)[0],
                dstFilename=os.path.join(self.data_dir, individual_alignment.path),
                relativeOutputDir=None, shellCommand='cp -rL',
                srcFilenameLs=self.srcFilenameLs, dstFilenameLs=self.dstFilenameLs,
                constructRelativePathFunction=individual_alignment.constructRelativePath)
        except:
            sys.stderr.write('Except in copying %s to db-storage with except info: %s\n'%\
                (inputFileRealPath, repr(sys.exc_info())))
            import traceback
            traceback.print_exc()
            self.sessionRollback(session)
            self.cleanUpAndExitOnFailure(exitCode=5)

        if exitCode!=0:
            sys.stderr.write("Error: moveFileIntoDBAffiliatedStorage() exits with code=%s.\n"%(exitCode))
            self.sessionRollback(session)
            self.cleanUpAndExitOnFailure(exitCode=exitCode)
        try:
            #make sure these files are stored in self.dstFilenameLs and self.srcFilenameLs
            #copy further files if there are
            if self.inputFnameLs:
                for inputFname in self.inputFnameLs:
                    if inputFname!=self.inputFname: #2013.3.18 make sure it has not been copied.
                        logMessage = self.db_vervet.copyFileWithAnotherFilePrefix(inputFname=inputFname,
                            filenameWithPrefix=individual_alignment.path,
                            outputDir=self.data_dir,
                            logMessage=logMessage, srcFilenameLs=self.srcFilenameLs,
                            dstFilenameLs=self.dstFilenameLs)

            self.db_vervet.updateDBEntryPathFileSize(db_entry=individual_alignment, data_dir=data_dir)

            ## 2012.7.17 commented out because md5sum is calculated above
            #db_vervet.updateDBEntryMD5SUM(db_entry=genotypeFile, data_dir=data_dir)

            #copy the bai index file; it is mandatory
            baiFilename = '%s.bai'%(self.inputFname)
            if not os.path.isfile(baiFilename):
                # Raise instead of rolling back here: the handler below does the
                # rollback and exits with code 5 exactly once, and reports why.
                raise IOError("bai index file %s does not exist."%(baiFilename))
            srcFilename = baiFilename
            dstFilename = os.path.join(self.data_dir, '%s.bai'%(individual_alignment.path))
            returnCode = utils.copyFile(srcFilename=srcFilename, dstFilename=dstFilename)
            if returnCode!=0:
                raise RuntimeError("utils.copyFile() from %s to %s returned code %s."%\
                    (srcFilename, dstFilename, returnCode))
            logMessage += "bai file %s has been copied to %s.\n"%(srcFilename, dstFilename)
            self.srcFilenameLs.append(srcFilename)
            self.dstFilenameLs.append(dstFilename)
        except:
            # NOTE(review): the bare except is kept so rollback and cleanup run for any
            # failure; narrow it only if cleanUpAndExitOnFailure() is known not to be needed
            # for interrupts.
            sys.stderr.write('Except type: %s\n'%repr(sys.exc_info()))
            import traceback
            traceback.print_exc()
            self.sessionRollback(session)
            self.cleanUpAndExitOnFailure(exitCode=5)
    else:
        logMessage += "%s doesn't exist.\n"%(inputFileRealPath)
    self.outputLogMessage(logMessage)

    if self.commit:
        try:
            session.flush()
            session.commit()
        except:
            sys.stderr.write('Except type: %s\n'%repr(sys.exc_info()))
            import traceback
            traceback.print_exc()
            self.cleanUpAndExitOnFailure(exitCode=3)
    else:
        #delete all target files but exit gracefully (exit 0)
        self.sessionRollback(session)
        self.cleanUpAndExitOnFailure(exitCode=0)