def createSbatchFile(self):
    """Write a slurm sbatch submission script for this analysis.

    The queue, core count and memory-per-core come from the analysis
    object.  The script path is stored on self.batchfile and its text
    on self.batchfiletext before being written to disk.
    """
    # Create batch file in the analysis output directory
    cwd = self.analysis.output_dir
    name = self.analysis.name

    self.batchfile = FileUtils.getDatestampedFilename(
        cwd, "PogPipe." + name, ".sbatch")

    jobname = "PogPipe." + name
    jobout = os.path.join(cwd, jobname + ".out")
    joberr = os.path.join(cwd, jobname + ".err")

    queue = self.analysis.queue
    cores = self.analysis.cores
    mempercore = self.analysis.mempercore
    walltime = "120"  # minutes; currently fixed - TODO take from analysis?

    # Contents of slurm script is something like
    #
    # #!/usr/bin/env bash
    # #SBATCH -J myRjob             <- the analysis name
    # #SBATCH -o myRjob_slurm.out   <- full path here
    # #SBATCH -e myRjob_slurm.err   <- full path here
    # #SBATCH -p informatics-dev    <- queue from analysis
    # #SBATCH -n 1                  <- number of cores
    # #SBATCH -t 5                  <- time in minutes
    # #SBATCH --mem=100             <- memory in Mb
    script = "#!/usr/bin/env bash\n"
    # BUGFIX: was "#SBATCH =J" which slurm does not recognise as the job-name flag
    script += "#SBATCH -J " + jobname + "\n"
    script += "#SBATCH -o " + jobout + "\n"
    script += "#SBATCH -e " + joberr + "\n"
    script += "#SBATCH -p " + queue + "\n"
    script += "#SBATCH -n %d" % (cores) + "\n"
    script += "#SBATCH -t %s" % (walltime) + "\n"
    script += "#SBATCH --mem=%d" % (mempercore) + "\n"

    # If we have an analysis id we're reading from the db. If not then we're
    # on the command line.  Both branches built the identical command, so
    # the id check has been collapsed (re-split if the two paths diverge).
    script += "python " + settings.POGPIPEROOT + "bin/run_analysis.py -a " + name + " -i "
    script += self.analysis.input_files[0].input_file

    self.batchfiletext = script
    FileUtils.writeTextToFile(self.batchfile, script)
def checkDiskSpace(self):
    """Check the working and output directories have enough free space.

    Raises an Exception when free space is below
    self.minimum_space_needed (when set) or below a 10Mb hard floor.
    """
    # The identical check applies to both directories the analysis writes to,
    # so loop instead of duplicating the code (and avoid shadowing the
    # builtins 'bytes' and 'str' as the old version did).
    for dirname in (self.working_dir, self.output_dir):
        free = FileUtils.getFreeDiskSpace(dirname)
        logging.info(" ========> Analysis %20s checked disk space for %s Free space (bytes) %s Needed %s"%(self.name,dirname,free,self.minimum_space_needed))

        # Below the configured minimum, or below the 10Mb absolute floor.
        # Both original branches raised with the identical message, so the
        # conditions are merged; the message text is unchanged.
        if (self.minimum_space_needed and free < self.minimum_space_needed) or free < 10000000:
            msg = "Not enough disk space needed in %s. Needs %s, available %s"%(dirname,self.minimum_space_needed,free)
            logging.info(" ========> Analysis %20s %s"%(self.name,msg))
            raise Exception(msg)
def checkDiskSpace(anaobj): print anaobj.currentstatus print anaobj.working_dir bytes = FileUtils.getFreeDiskSpace(anaobj.working_dir) logging.info(" ========> Analysis %20s checked disk space for %s Free space (bytes) %s Needed %s"%(anaobj.name,anaobj.working_dir,bytes,anaobj.minimum_space_needed)) if anaobj.minimum_space_needed and bytes < anaobj.minimum_space_needed: str = "Not enough disk space needed in %s. Needs %s, available %s"%(anaobj.working_dir,anaobj.minimum_space_needed,bytes) logging.info(" ========> Analysis %20s %s"%(anaobj.name,str)) raise Exception(str) elif bytes < 10000000: str = "Not enough disk space needed in %s. Needs %s, available %s"%(anaobj.working_dir,anaobj.minimum_space_needed,bytes) logging.info(" ========> Analysis %20s %s"%(anaobj.name,str)) raise Exception(str) bytes = FileUtils.getFreeDiskSpace(anaobj.output_dir) logging.info(" ========> Analysis %20s checked disk space for %s Free space (bytes) %s Needed %s"%(anaobj.name,anaobj.output_dir,bytes,anaobj.minimum_space_needed)) if anaobj.minimum_space_needed and bytes < anaobj.minimum_space_needed: str = "Not enough disk space needed in %s. Needs %s, available %s"%(anaobj.output_dir,anaobj.minimum_space_needed,bytes) logging.info(" ========> Analysis %20s %s"%(anaobj.name,str)) raise Exception(str) elif bytes < 10000000: str = "Not enough disk space needed in %s. Needs %s, available %s"%(anaobj.output_dir,anaobj.minimum_space_needed,bytes) logging.info(" ========> Analysis %20s %s"%(anaobj.name,str)) raise Exception(str)
def testGetFreeGbDiskSpace(self): """Check we can find the free disk space for our working directory""" out = FileUtils.getFreeGbDiskSpace("/tmp/") print " GBytes free are %d" % out self.assertTrue(out > 0)
def testGetDiskUsage(self): """Check we can find the free disk space for our working directory""" out = FileUtils.getDiskUsage("/tmp/") print " Disk usage is %s " % out self.assertTrue(out > 0)
def testGetPercentFreeDiskSpace(self): """Check we can find the free disk space for our working directory""" out = FileUtils.getPercentFreeDiskSpace("/tmp/") print " Percent free is %d" % out self.assertTrue(out >= 0 and out <= 100)
def postProcessOutput(anaobj): """ Checks expected output files exist in the working directory and copies them over to the output directory """ logging.info(" ========> Analysis %20s called postProcessOutput:"%(anaobj.name)) if AnalysisUtils.checkExpectedOutputFiles(anaobj) == False: raise Exception("Missing expected output files. Number missing are [%d]"%(len(anaobj.missing_output_files))) FileUtils.checkDirExists(anaobj.output_dir) tmpfiles = [] logging.info(" ========> Analysis %20s called postProcessOutput: Moving files from %s to %s "%(anaobj.name,anaobj.working_dir,anaobj.output_dir)) try: for srcfileobj in anaobj.expected_output_files: srcfile = srcfileobj.expected_output_file fullsrcfile = os.path.join(anaobj.working_dir,srcfile) destfile = os.path.join(anaobj.output_dir,srcfile) FileUtils.checkDirExistsForFile(destfile) res = shutil.move(fullsrcfile,destfile) if res == None: res = "OK" else: res = "FAILED" print "Checking %s"%destfile tmpfiles.append(destfile) logging.info(" ========> Analysis %20s called postProcessOutput: Result of file move for %s = %s" % (anaobj.name,srcfile,res)) except Exception as e: logging.info(" ========> Analysis %20s file move failed %s"%(anaobj.name,e)) raise anaobj.output_files = [] for i in tmpfiles: AnalysisUtils.addOutputFile(anaobj,i) for f in anaobj.temp_output_files: logging.info(" ========> Analysis %20s removing temp file %s "%(anaobj.name,f)) res = os.remove(f)
def setUp(self):
    """Start every test from a fresh, empty test database."""
    # Drop any database file left over from a previous run
    if FileUtils.fileExists(settings.TESTDBNAME):
        os.remove(settings.TESTDBNAME)

    settings.DBNAME = settings.TESTDBNAME
    init_database()

    session_factory = sessionmaker(bind=settings.ENGINE)
    self.session = session_factory()
def __init__(self, deltafile, refseqs, qryseqs):
    """Store the mummer delta file path plus the reference and query
    sequence collections, checking the delta file actually exists."""
    self.deltafile = deltafile
    self.refseqs = refseqs
    self.qryseqs = qryseqs

    # Nothing can be parsed without the delta file - fail immediately
    if not FileUtils.fileExists(self.deltafile):
        raise Exception("Can't parse Mummer delta file. File [%s] doesn't exist" % self.deltafile)
def getCommands(self):
    """Build the samtools mpileup -> bcftools variant-calling commands for
    the first input file, optionally restricted to self.regionstr.

    Returns self.commands, a list of AnalysisCommand objects (raw call
    followed by the vcfutils-filtered call).
    """
    self.commands = []
    self.output_files = []
    self.checkDiskSpace()
    # NOTE(review): seqs is never used below - possibly kept for the side
    # effect of reading/validating the reference genome file; confirm
    seqs = FastaFile.getSequenceDict(self.refgenome, False)
    if self.checkInputFiles() == False:
        raise Exception("Input files [%s] don't exist = can't continue" % (self.input_files))
    fileparts = FileUtils.getFileParts(self.input_files[0])
    self.basename = fileparts['basename']
    # Need to set dbtype somewhere
    # Default output names; replaced below when a region string is set
    outfile1 = self.working_dir + "/" + self.basename + ".raw.vcf"
    outfile2 = self.working_dir + "/" + self.basename + ".flt.vcf"
    regstr = ""
    if self.regionstr != "":
        # Restrict mpileup to the region and embed it in the file names
        regstr = " -r " + self.regionstr
        outfile1 = self.working_dir + "/" + self.basename + "." + self.regionstr + ".raw.vcf"
        outfile2 = self.working_dir + "/" + self.basename + "." + self.regionstr + ".flt.vcf"
    self.expected_output_files.append(outfile1)
    self.expected_output_files.append(outfile2)
    # Raw calls piped through bcftools view, then depth-filtered via vcfutils
    command1 = self.samtools + " mpileup -uf " + self.refgenome + " " + self.input_files[
        0] + " " + regstr + " | " + self.bcftools + " view " + " -bvcg - > " + outfile1
    command2 = self.bcftools + " view " + outfile1 + " | " + self.vcfutils + " varFilter -D100 > " + outfile2
    print "Command %s" % command1
    print "Command %s" % command2
    # command_rank is 1-based and derived from the current list length
    self.commands.append(
        AnalysisCommand(command=command1, command_rank=len(self.commands) + 1))
    self.commands.append(
        AnalysisCommand(command=command2, command_rank=len(self.commands) + 1))
    return self.commands
def getCommands(self): self.commands = [] self.output_files = [] self.checkDiskSpace() print "Reading genome file" seqs = FastaFile.getSequenceDict(self.refgenome,False) if self.checkInputFiles() == False: raise Exception("Input files [%s] don't exist = can't continue"%(self.input_files)) fileparts = FileUtils.getFileParts(self.input_files[0]) self.basename = fileparts['basename'] for seq in seqs: len = seqs[seq]['len'] i = 1 while i < len: end = i + self.chunk -1 if end > len: end = len regionstr = "%s:%d-%d"%(seq,i,end) tmpana = AnalysisFactory.createAnalysisFromModuleName("SamtoolsMpileup") tmpana.setInputFiles(self.input_files,self.input_types) tmpana.refgenome = self.refgenome tmpana.regionstr = regionstr tmpana.init() tmpcmds = tmpana.getCommands() for cmd in tmpcmds: self.commands.append(cmd) i = i + self.chunk return self.commands
def setUp(self):
    """Point the settings at a fresh test database and record the
    fastq fixture used by these tests."""
    # Always start from an empty database file
    if FileUtils.fileExists(settings.TESTDBNAME):
        os.remove(settings.TESTDBNAME)

    settings.DBNAME = settings.TESTDBNAME
    init_database()

    # Single fastq fixture is enough for the FastQC tests
    self.input_files = ["testdata/FoxP2_SL167.fastq"]
    self.input_types = ['fastq']

    session_factory = sessionmaker(bind=settings.ENGINE)
    self.session = session_factory()
def testRunMummer(self):
    """Run mummer end to end and check its delta output appears."""
    mummer = Mummer()
    self.assertTrue(mummer)
    self.assertTrue(AnalysisUtils.setInputFiles(mummer, self.input_files, ['fasta', 'fasta']))

    runner = AnalysisRunner(mummer)
    self.assertTrue(runner.run())

    # Exactly one captured output string, containing the mummer footer
    self.assertEqual(len(mummer.output_strings), 1)
    self.assertTrue(mummer.output_strings[0].output_string.index('4: FINISHING DATA') > 0)

    # The delta file must have been written where we expect it
    self.assertTrue(FileUtils.fileExists('../testout/mummer.delta'))
def getCommands(self): self.checkDiskSpace() if self.checkInputFiles() == False: raise Exception("Input files [%s] don't exist = can't continue" % (self.input_files)) fileparts = FileUtils.getFileParts(self.input_files[0]) self.basename = fileparts['basename'] # Need to set dbtype somewhere command = self.makeblastdb + " -in " + self.input_files[ 0] + " -input_type fasta -dbtype prot -title " + self.basename + " -parse_seqids -out " + fileparts[ 'dirname'] + "/" + self.basename print "Command %s" % command self.commands.append(command) return self.commands
def init(self):
    """Work out the FastQC output directory name from the first input
    file and register the expected output files inside it.

    Raises if the analysis has no input files.
    """
    super(FastQCAnalysis, self).init()

    if len(self.input_files) == 0:
        raise Exception(
            "No input files for FastQCAnalysis module. Can't init")

    fileparts = FileUtils.getFileParts(self.input_files[0].input_file)

    # FastQC names its output directory after the input file with the
    # fastq extension stripped (handle both .fastq and .fastq.gz).
    # Was 'dir = ...' which shadowed the builtin dir()
    if fileparts['fileext'] == ".fastq":
        outdir = fileparts['filestub'] + "_fastqc/"
    elif fileparts['fileext'] == ".gz":
        outdir = fileparts['filestub'].replace(".fastq", "") + "_fastqc/"
    else:
        outdir = fileparts['basename'] + "_fastqc/"

    self.fastqc_dir = outdir

    # Expected outputs all live inside the fastqc directory
    # (removed the dead 'tmp' accumulator and the unused enumerate index)
    for f in self.expected_output_filelist:
        AnalysisUtils.addExpectedOutputFile(self, outdir + f)
def tearDown(self):
    """Remove the delta file a test run may have left behind."""
    leftover = '../testout/mummer.delta'
    if FileUtils.fileExists(leftover):
        os.remove(leftover)
def init(self):
    # Initialise the parser; at least one input file is required.
    if len(self.input_files) == 0:
        raise Exception("No input files for BlastOutput6Parsermodule. Can't init")
    # Split the first input path into its dirname/basename/stub/ext parts
    # NOTE(review): fileparts is not used in the visible code - the rest of
    # this method may be in a part of the file not shown here; confirm
    fileparts = FileUtils.getFileParts(self.input_files[0])
def testGetFilesInDirectory(self):
    """Listing /tmp/ should return a non-empty collection of files."""
    found = FileUtils.getAllFilesInDirectory("/tmp/")
    self.assertTrue(len(found) > 0)
def checkBinary(self, binfile):
    """Return True when the given binary file exists on disk."""
    return FileUtils.fileExists(binfile)
"""Unit test for Analysis.py""" import os import sys import unittest import logging sys.path.append(os.path.dirname(os.path.realpath(__file__)) + "/../") from config import settings from datamodel.FileUtils import FileUtils from datamodel.database.DB import Analysis, init_database from datamodel.database.AnalysisUtils import AnalysisUtils from sqlalchemy.orm import sessionmaker if FileUtils.fileExists(settings.TESTLOGFILE): os.remove(settings.TESTLOGFILE) formatter = logging.Formatter( '%(asctime)s - %(name)s - %(levelname)s - %(message)s') fh = logging.FileHandler(settings.TESTLOGFILE, 'a') fh.setFormatter(formatter) log = logging.getLogger() # root logger for hdlr in log.handlers: # remove all old handlers log.removeHandler(hdlr) log.addHandler(fh) # set the new handler class DBAnalysisTest(unittest.TestCase
def getCommands(self):
    """Build the bowtie | samtools sort pipeline plus a samtools index
    command for every input file, decompressing .gz inputs first.

    Returns self.commands (cached - rebuilt only when empty).  Also
    populates self.expected_output_files (.bam and .bam.bai per input)
    and self.temp_output_files.  Raises if the bowtie or samtools
    binaries, the input files, or self.param are missing.
    """
    # Commands are cached; return the existing list if already built
    if self.commands and len(self.commands) > 0:
        return self.commands
    logging.info(" ========> Analysis %20s Getting commands" % (self.name))
    self.commands = []
    self.expected_output_files = []
    self.temp_output_files = []
    outdir = self.output_dir
    tmpdir = self.working_dir
    btbin = self.bowtiebindir + self.bowtiebinname
    stbin = self.samtoolsbindir + self.samtoolsbinname
    self.calculateSpaceNeeded()
    # Both binaries must exist before any commands are generated
    if FileUtils.fileExists(btbin) == False:
        raise Exception("Binary file [%s] doesn't exist = can't continue" % btbin)
    if FileUtils.fileExists(stbin) == False:
        raise Exception("Binary file [%s] doesn't exist = can't continue" % stbin)
    if AnalysisUtils.checkInputFiles(self) == False:
        raise Exception("Input files [%s] don't exist = can't continue" % (self.input_files))
    AnalysisUtils.checkDiskSpace(self)
    for fobj in self.input_files:
        f = fobj.input_file
        try:
            # Gzipped input: schedule a gunzip into the working dir and
            # point f at the uncompressed copy for the bowtie command
            if f.endswith(".gz"):
                # f = "<( zcat -c " + f + " )"
                tmpf = f.replace(".gz", "")
                fparts = FileUtils.getFileParts(tmpf)
                command = "gunzip -c " + f + " > " + tmpdir + "/" + fparts[
                    'basename']
                self.commands.append(command)
                # NOTE(review): tmpf is the source-dir path but gunzip
                # writes to tmpdir - confirm which path should be removed
                self.temp_output_files.append(tmpf)
                f = tmpdir + "/" + fparts['basename']
            fparts = FileUtils.getFileParts(f)
            fstub = fparts['filestub']
            bowtieoutfile = tmpdir + "/" + fstub + ".sam"
            samtoolsoutfile = tmpdir + "/" + fstub + ".bam"
            # bowtie needs at least -x <genomeindex> in self.param
            if self.param == None:
                raise Exception(
                    "No parameters entered for bowtie = needs -x <genomeindex>"
                )
            # Align, convert to BAM and sort in one pipeline
            command1 = btbin + " " + self.param + " " + f + " | " + stbin + " view -bS - | " + stbin + " sort - " + tmpdir + "/" + fstub
            logging.info(" ========> Analysis %20s command 1 : %s" % (self.name, command1))
            #command2 = stbin + " view -bS " + bowtieoutfile + "| " + stbin + " sort - " + tmpdir + "/" + fstub
            # logging.info(" ========> Analysis %20s command 2 : %s" % (self.name,command2))
            # Index the sorted BAM
            command2 = stbin + " index " + samtoolsoutfile
            logging.info(" ========> Analysis %20s command 3 : %s" % (self.name, command2))
            # self.expected_output_files.append(fstub + ".sam")
            self.expected_output_files.append(
                AnalysisExpectedOutputFile(expected_output_file=fstub + ".bam"))
            self.expected_output_files.append(
                AnalysisExpectedOutputFile(expected_output_file=fstub + ".bam.bai"))
            self.commands.append(AnalysisCommand(command=command1))
            self.commands.append(AnalysisCommand(command=command2))
            #self.commands.append(command3)
        except Exception as e:
            logging.info(
                " ========> Analysis %20s Failed building command list [%s]"
                % (self.name, e))
            raise
    return self.commands