def run(self):
    """Align the assembly against the haplotypes with cactus, then queue stats.

    If ``<outputDir>/cactusAlignment`` does not exist yet, the assembly file
    is copied into the local temp dir (and gunzipped when its name ends in
    ``.gz``), an experiment file is written, the cactus workflow is run on a
    single machine with one thread, the job tree stats are computed, and the
    whole content of the local temp dir is moved into ``self.outputDir``.
    Finally a ``MakeStats1`` child target is added.
    """
    cactusAlignmentName = "cactusAlignment"
    cactusAlignment = os.path.join(self.outputDir, cactusAlignmentName)
    if not os.path.exists(cactusAlignment):
        # Assumes getLocalTempDir() returns the same directory on every call;
        # the paths built below are only consistent if it does.
        localTempDir = self.getLocalTempDir()

        # Prepare the assembly: always work on a private, uncompressed copy.
        if self.assemblyFile.endswith('.gz'):
            tempAssemblyFile = getTempFile(rootDir=localTempDir, suffix=".gz")
            system("cp %s %s" % (self.assemblyFile, tempAssemblyFile))
            system("gunzip %s" % tempAssemblyFile)
            # Drop the ".gz" suffix to get the name of the unpacked file.
            tempAssemblyFile = tempAssemblyFile[:-3]
            assert os.path.exists(tempAssemblyFile)
        else:
            tempAssemblyFile = getTempFile(rootDir=localTempDir, suffix="")
            system("cp %s %s" % (self.assemblyFile, tempAssemblyFile))

        # Make the supporting temporary files
        tempExperimentFile = getTempFile(rootDir=localTempDir)
        tempJobTreeDir = os.path.join(localTempDir, "jobTree")

        # Make the experiment file
        cactusWorkflowExperiment = ExperimentWrapper.createExperimentWrapper(
            sequences=self.haplotypeSequences + [tempAssemblyFile],
            newickTreeString=self.newickTree,
            outputDir=localTempDir,
            configFile=self.configFile)
        cactusWorkflowExperiment.setDbName(cactusAlignmentName)
        # This needs to be set to ensure the thing gets put in the right
        # directory
        cactusWorkflowExperiment.setDbDir(
            os.path.join(localTempDir, cactusWorkflowExperiment.getDbName()))
        cactusWorkflowExperiment.writeXML(tempExperimentFile)

        # Now run cactus workflow
        runCactusWorkflow(experimentFile=tempExperimentFile,
                          jobTreeDir=tempJobTreeDir,
                          buildAvgs=False,
                          buildReference=True,
                          batchSystem="single_machine",
                          maxThreads=1,
                          jobTreeStats=True)
        logger.info("Ran the workflow")

        # Check that the job tree completed successfully.
        runJobTreeStatusAndFailIfNotComplete(tempJobTreeDir)
        logger.info("Checked the job tree dir")

        # Compute the stats
        tempJobTreeStatsFile = os.path.join(localTempDir, "jobTreeStats.xml")
        system("jobTreeStats --jobTree %s --outputFile %s"
               % (tempJobTreeDir, tempJobTreeStatsFile))

        # Move everything produced in the temp dir (alignment db, experiment
        # file, stats, ...) to the output dir in one go.
        system("mv %s/* %s" % (localTempDir, self.outputDir))
        assert os.path.exists(cactusAlignment)

    # We're done!
    # NOTE(review): the stats target is added even when the alignment already
    # existed and the block above was skipped - confirm this is intended.
    self.addChildTarget(
        MakeStats1(self.outputDir, cactusAlignment, self.options))
def run(self):
    """Build the haplotypes-plus-assembly cactus alignment and schedule stats.

    Nothing is rebuilt when ``<outputDir>/cactusAlignment`` already exists.
    Otherwise the assembly is copied (and gunzipped if its name ends in
    ``.gz``) into the local temp dir, an experiment file is written, the
    cactus workflow is run with the single-machine batch system, job tree
    stats are written, and the content of the local temp dir is moved to
    ``self.outputDir``. A ``MakeStats1`` child target is added at the end.
    """
    cactusAlignmentName = "cactusAlignment"
    cactusAlignment = os.path.join(self.outputDir, cactusAlignmentName)
    if not os.path.exists(cactusAlignment):
        # Assumes getLocalTempDir() is stable across calls; the database
        # directory and the final "mv" below must refer to the same place.
        localTempDir = self.getLocalTempDir()

        # Prepare the assembly.
        # First copy it, unpacking it when it is gzip-compressed.
        isCompressed = self.assemblyFile.endswith('.gz')
        tempAssemblyFile = getTempFile(rootDir=localTempDir,
                                       suffix=".gz" if isCompressed else "")
        system("cp %s %s" % (self.assemblyFile, tempAssemblyFile))
        if isCompressed:
            system("gunzip %s" % tempAssemblyFile)
            # Strip ".gz" to get the path of the uncompressed file.
            tempAssemblyFile = tempAssemblyFile[:-3]
            assert os.path.exists(tempAssemblyFile)

        # Make the supporting temporary files
        tempExperimentFile = getTempFile(rootDir=localTempDir)
        tempJobTreeDir = os.path.join(localTempDir, "jobTree")

        # Make the experiment file
        cactusWorkflowExperiment = ExperimentWrapper.createExperimentWrapper(
            sequences=self.haplotypeSequences + [tempAssemblyFile],
            newickTreeString=self.newickTree,
            outputDir=localTempDir,
            configFile=self.configFile)
        cactusWorkflowExperiment.setDbName(cactusAlignmentName)
        # This needs to be set to ensure the thing gets put in the right
        # directory
        cactusWorkflowExperiment.setDbDir(
            os.path.join(localTempDir, cactusWorkflowExperiment.getDbName()))
        cactusWorkflowExperiment.writeXML(tempExperimentFile)

        # Now run cactus workflow
        runCactusWorkflow(experimentFile=tempExperimentFile,
                          jobTreeDir=tempJobTreeDir,
                          buildAvgs=False,
                          buildReference=True,
                          batchSystem="single_machine",
                          maxThreads=1,
                          jobTreeStats=True)
        logger.info("Ran the workflow")

        # Check that the job tree completed successfully.
        runJobTreeStatusAndFailIfNotComplete(tempJobTreeDir)
        logger.info("Checked the job tree dir")

        # Compute the stats
        tempJobTreeStatsFile = os.path.join(localTempDir, "jobTreeStats.xml")
        system("jobTreeStats --jobTree %s --outputFile %s"
               % (tempJobTreeDir, tempJobTreeStatsFile))

        # Now move everything from the temp dir back to the output
        system("mv %s/* %s" % (localTempDir, self.outputDir))
        assert os.path.exists(cactusAlignment)

    # We're done!
    # NOTE(review): runs whether or not the alignment had to be rebuilt -
    # confirm this is the intended placement.
    self.addChildTarget(
        MakeStats1(self.outputDir, cactusAlignment, self.options))
def runVanilla(self):
    """Run a "vanilla" cactus alignment and collect its outputs in outputDir.

    Creates ``self.outputDir`` if needed. Unless
    ``<outputDir>/cactusAlignmentVanilla`` already exists, a fresh scratch
    directory ``<outputDir>/tempVanillaCactusAlignment`` is created, the
    stock workflow config (with ``self.params`` applied) and an experiment
    file are written there, the cactus workflow is run, and the MAF, tree
    stats, job tree stats, alignment database and experiment file are placed
    in ``self.outputDir``.
    """
    logger.debug("Going to put the alignment in %s" % self.outputDir)
    if not os.path.isdir(self.outputDir):
        os.mkdir(self.outputDir)

    if not os.path.exists(os.path.join(self.outputDir,
                                       "cactusAlignmentVanilla")):
        xmlTree = ET.parse(os.path.join(getRootPathString(), "lib",
                                        "cactus_workflow_config.xml"))

        # Start from an empty scratch directory.
        tempLocalDir = os.path.join(self.outputDir,
                                    "tempVanillaCactusAlignment")
        system("rm -rf %s" % tempLocalDir)
        os.mkdir(tempLocalDir)

        # Set the config parameters
        self.params.applyToXml(xmlTree)
        config = xmlTree.getroot()
        assert config is not None

        # Write the config file; the context manager closes the handle even
        # if serialisation fails.
        tempConfigFile = os.path.join(tempLocalDir, "config.xml")
        with open(tempConfigFile, 'w') as fileHandle:
            ET.ElementTree(config).write(fileHandle)

        # Now do standard cactus..
        # Make the experiment file
        tempExperimentFile2 = os.path.join(tempLocalDir, "experiment.xml")
        cactusWorkflowExperiment = CactusWorkflowExperiment(
            sequences=self.sequences,
            newickTreeString=self.newickTree,
            databaseName="cactusAlignmentVanilla",
            outputDir=tempLocalDir,
            configFile=tempConfigFile)
        tempExperimentDir2 = os.path.join(tempLocalDir,
                                          "cactusAlignmentVanilla")
        cactusWorkflowExperiment.writeExperimentFile(tempExperimentFile2)

        # Apply naming to the event tree to be consistent with progressive
        exp = ExperimentWrapper(ET.parse(tempExperimentFile2).getroot())
        cleanEventTree(exp)
        exp.writeXML(tempExperimentFile2)

        # We're done with the progressive, now run the vanilla cactus for
        # comparison
        tempJobTreeDir2 = os.path.join(tempLocalDir, "jobTreeVanilla")
        runCactusWorkflow(tempExperimentFile2, tempJobTreeDir2,
                          jobTreeStats=True,
                          setupAndBuildAlignments=True,
                          buildReference=True,
                          maxThreads=4)
        runJobTreeStatusAndFailIfNotComplete(tempJobTreeDir2)
        logger.info("Checked the job tree dir for the vanilla run")

        runCactusMAFGenerator(
            os.path.join(self.outputDir, "cactusVanilla.maf"),
            getCactusDiskString(tempExperimentDir2))

        # Run the cactus tree stats
        treeStatsFile = os.path.join(self.outputDir, "treeStats.xml")
        system("cactus_treeStats --cactusDisk '%s' --flowerName 0 --outputFile %s"
               % (exp.getDiskDatabaseString(), treeStatsFile))
        system("jobTreeStats --jobTree %s --outputFile %s/jobTreeStats.xml"
               % (tempJobTreeDir2, self.outputDir))

        # Publish the alignment database and the experiment file.
        system("mv %s %s" % (tempExperimentDir2, self.outputDir))
        system("mv %s %s/experiment.xml"
               % (tempExperimentFile2, self.outputDir))
def run(self):
    """Configure and run a cactus alignment, then schedule the stats target.

    Creates ``self.outputDir`` if needed. Unless
    ``<outputDir>/cactusAlignment`` already exists, the stock workflow
    config is loaded and customised from this target's attributes
    (reference algorithm, minimum block degrees, blast strings, chain and
    permutation limits, simulated annealing, theta), written to the local
    temp dir together with an experiment file, and the cactus workflow is
    run on a single machine with one thread. The experiment file, config
    file and alignment database are then moved to ``self.outputDir`` and the
    job tree stats are written there. A ``MakeStats`` child target is added
    at the end.
    """
    if not os.path.isdir(self.outputDir):
        os.mkdir(self.outputDir)
    cactusAlignmentName = "cactusAlignment"
    outputFile = os.path.join(self.outputDir, cactusAlignmentName)
    if not os.path.exists(outputFile):
        # Assumes getLocalTempDir() returns the same directory on every
        # call; the paths built below are only consistent if it does.
        localTempDir = self.getLocalTempDir()

        config = ET.parse(os.path.join(
            getRootPathString(), "lib",
            "cactus_workflow_config.xml")).getroot()

        # Set the reference algorithm
        referenceElem = config.find("reference")
        referenceElem.attrib["matching_algorithm"] = self.referenceAlgorithm

        # Do the minimum block degree configuration. The first <iteration>
        # is treated as the blast iteration and the second as the base one.
        iterations = config.find("alignment").find("iterations")
        iterationElems = iterations.findall("iteration")
        blastIteration = iterationElems[0]
        baseIteration = iterationElems[1]
        # The blast iteration never uses a block degree below 2.
        minimumBlastBlockDegree = self.minimumBlockDegree
        if minimumBlastBlockDegree <= 1:
            minimumBlastBlockDegree = 2
        blastIteration.find("core").attrib["minimumBlockDegree"] = str(
            minimumBlastBlockDegree)
        baseIteration.attrib["minimumBlockDegree"] = str(
            self.minimumBlockDegree)
        baseIteration.attrib["prune_out_stub_alignments"] = str(
            int(self.pruneOutStubAlignments))
        baseIteration.attrib["gap_gamma"] = str(float(self.gapGamma))

        # Set the blast strings: substitute the PARAMETERS placeholder.
        blastElem = blastIteration.find("blast")
        for attribName in ("blastString", "selfBlastString"):
            blastElem.attrib[attribName] = blastElem.attrib[
                attribName].replace("PARAMETERS", self.blastAlignmentString)

        # Get rid of the base level, if needed
        if not self.baseLevel:
            iterations.remove(baseIteration)

        # Set the number of chains to allow in a level, during promotion
        config.find("normal").attrib["max_number_of_chains"] = str(
            self.maxNumberOfChains)

        # Set the number of chains to order per round of the matching
        # algorithm
        referenceElem.attrib["permutations"] = str(self.permutations)

        # Set the chain weight function
        if bool(self.useSimulatedAnnealing):
            referenceElem.attrib["useSimulatedAnnealing"] = "1"
        # NOTE(review): theta is set regardless of useSimulatedAnnealing -
        # confirm this is the intended nesting.
        referenceElem.attrib["theta"] = str(self.theta)

        # Write the config file; the context manager closes the handle even
        # if serialisation fails.
        tempConfigFile = os.path.join(localTempDir, "config.xml")
        with open(tempConfigFile, 'w') as fileHandle:
            ET.ElementTree(config).write(fileHandle)

        # Make the supporting temporary files
        tempExperimentFile = os.path.join(localTempDir, "experiment.xml")
        tempJobTreeDir = os.path.join(localTempDir, "jobTree")

        # Make the experiment file
        cactusWorkflowExperiment = CactusWorkflowExperiment(
            sequences=self.sequences.split(),
            newickTreeString=self.options.newickTree,
            requiredSpecies=[(1, self.requiredSpecies.split())],
            singleCopySpecies=self.singleCopySpecies,
            outgroupEvent=self.options.outgroupEvent,
            databaseName=cactusAlignmentName,
            outputDir=localTempDir,
            configFile=tempConfigFile)
        cactusWorkflowExperiment.writeExperimentFile(tempExperimentFile)

        # Now run cactus workflow
        runCactusWorkflow(experimentFile=tempExperimentFile,
                          jobTreeDir=tempJobTreeDir,
                          setupAndBuildAlignments=True,
                          buildTrees=False,
                          buildFaces=False,
                          buildReference=True,
                          batchSystem="single_machine",
                          maxThreads=1,
                          jobTreeStats=True)
        logger.info("Ran the workflow")

        # Check that the job tree completed successfully.
        runJobTreeStatusAndFailIfNotComplete(tempJobTreeDir)
        logger.info("Checked the job tree dir")

        # Now move the experiment and config files back to the output
        system("mv %s %s/experiment.xml"
               % (tempExperimentFile, self.outputDir))
        system("mv %s %s/config.xml" % (tempConfigFile, self.outputDir))

        # Move the final db
        localCactusDisk = os.path.join(localTempDir, cactusAlignmentName)
        system("mv %s %s" % (localCactusDisk, outputFile))

        # Compute the stats
        system("jobTreeStats --jobTree %s --outputFile %s/jobTreeStats.xml"
               % (tempJobTreeDir, self.outputDir))

    # We're done!
    # NOTE(review): the stats target is added even when the alignment already
    # existed and the block above was skipped - confirm this is intended.
    self.addChildTarget(MakeStats(outputFile, self.outputDir, self.options))
def run(self):
    """Customise the workflow config, run cactus, and queue the stats target.

    ``self.outputDir`` is created when missing. If
    ``<outputDir>/cactusAlignment`` is not there yet, the stock
    ``cactus_workflow_config.xml`` is loaded, its reference / alignment /
    normal sections are updated from this target's attributes, and the
    result is written next to a new experiment file in the local temp dir.
    The cactus workflow then runs on a single machine with one thread; the
    experiment file, config file and alignment database are moved into
    ``self.outputDir`` and the job tree stats are written there. A
    ``MakeStats`` child target is added at the end.
    """
    if not os.path.isdir(self.outputDir):
        os.mkdir(self.outputDir)
    cactusAlignmentName = "cactusAlignment"
    outputFile = os.path.join(self.outputDir, cactusAlignmentName)
    if not os.path.exists(outputFile):
        # Assumes getLocalTempDir() is stable across calls; the temp paths
        # built below must all live in the same directory.
        localTempDir = self.getLocalTempDir()

        configPath = os.path.join(getRootPathString(), "lib",
                                  "cactus_workflow_config.xml")
        config = ET.parse(configPath).getroot()

        # Set the reference algorithm
        reference = config.find("reference")
        reference.attrib["matching_algorithm"] = self.referenceAlgorithm

        # Do the minimum block degree configuration. The first two
        # <iteration> elements are taken as the blast and base iterations.
        iterations = config.find("alignment").find("iterations")
        allIterations = iterations.findall("iteration")
        blastIteration = allIterations[0]
        baseIteration = allIterations[1]
        # Values of 1 or less are raised to 2 for the blast iteration only.
        minimumBlastBlockDegree = self.minimumBlockDegree
        if minimumBlastBlockDegree <= 1:
            minimumBlastBlockDegree = 2
        blastIteration.find("core").attrib["minimumBlockDegree"] = str(
            minimumBlastBlockDegree)
        baseIteration.attrib["minimumBlockDegree"] = str(
            self.minimumBlockDegree)
        baseIteration.attrib["prune_out_stub_alignments"] = str(
            int(self.pruneOutStubAlignments))
        baseIteration.attrib["gap_gamma"] = str(float(self.gapGamma))

        # Set the blast strings by filling in the PARAMETERS placeholder.
        blast = blastIteration.find("blast")
        for key in ("blastString", "selfBlastString"):
            blast.attrib[key] = blast.attrib[key].replace(
                "PARAMETERS", self.blastAlignmentString)

        # Get rid of the base level, if needed
        if not self.baseLevel:
            iterations.remove(baseIteration)

        # Set the number of chains to allow in a level, during promotion
        config.find("normal").attrib["max_number_of_chains"] = str(
            self.maxNumberOfChains)

        # Set the number of chains to order per round of the matching
        # algorithm
        reference.attrib["permutations"] = str(self.permutations)

        # Set the chain weight function
        if bool(self.useSimulatedAnnealing):
            reference.attrib["useSimulatedAnnealing"] = "1"
        # NOTE(review): theta is written whether or not simulated annealing
        # is enabled - confirm this is the intended nesting.
        reference.attrib["theta"] = str(self.theta)

        # Write the config file; "with" guarantees the handle is closed.
        tempConfigFile = os.path.join(localTempDir, "config.xml")
        with open(tempConfigFile, 'w') as fileHandle:
            ET.ElementTree(config).write(fileHandle)

        # Make the supporting temporary files
        tempExperimentFile = os.path.join(localTempDir, "experiment.xml")
        tempJobTreeDir = os.path.join(localTempDir, "jobTree")

        # Make the experiment file
        cactusWorkflowExperiment = CactusWorkflowExperiment(
            sequences=self.sequences.split(),
            newickTreeString=self.options.newickTree,
            requiredSpecies=[(1, self.requiredSpecies.split())],
            singleCopySpecies=self.singleCopySpecies,
            outgroupEvent=self.options.outgroupEvent,
            databaseName=cactusAlignmentName,
            outputDir=localTempDir,
            configFile=tempConfigFile)
        cactusWorkflowExperiment.writeExperimentFile(tempExperimentFile)

        # Now run cactus workflow
        runCactusWorkflow(experimentFile=tempExperimentFile,
                          jobTreeDir=tempJobTreeDir,
                          setupAndBuildAlignments=True,
                          buildTrees=False,
                          buildFaces=False,
                          buildReference=True,
                          batchSystem="single_machine",
                          maxThreads=1,
                          jobTreeStats=True)
        logger.info("Ran the workflow")

        # Check that the job tree completed successfully.
        runJobTreeStatusAndFailIfNotComplete(tempJobTreeDir)
        logger.info("Checked the job tree dir")

        # Now move the experiment and config files back to the output
        system("mv %s %s/experiment.xml"
               % (tempExperimentFile, self.outputDir))
        system("mv %s %s/config.xml" % (tempConfigFile, self.outputDir))

        # Move the final db
        localCactusDisk = os.path.join(localTempDir, cactusAlignmentName)
        system("mv %s %s" % (localCactusDisk, outputFile))

        # Compute the stats
        system("jobTreeStats --jobTree %s --outputFile %s/jobTreeStats.xml"
               % (tempJobTreeDir, self.outputDir))

    # We're done!
    # NOTE(review): runs whether or not the alignment had to be rebuilt -
    # confirm this is the intended placement.
    self.addChildTarget(MakeStats(outputFile, self.outputDir, self.options))
def testCactusWorkflow_Blanchette(self):
    """Runs the workflow on blanchette's simulated (colinear) regions.

    The test is skipped silently when the ``SON_TRACE_DATASETS`` environment
    variable is not set. Each of the ``self.testNo`` rounds cuts a random
    5000-column window out of the true alignment, writes the ungapped
    sequences of the first nine rows to FASTA files, runs the cactus
    workflow on them and checks that the job tree completed. If the
    ``mfaToMaf``, ``cactus_MAFGenerator``, ``mafComparator`` and
    ``cactus_treeStats`` tools are all available, the true and computed
    MAFs, the tree stats and their comparison are also generated and
    printed.
    """
    if "SON_TRACE_DATASETS" not in os.environ:
        return
    for test in xrange(self.testNo):
        tempDir = getTempDirectory(os.getcwd())
        trueAlignment = os.path.join(TestStatus.getPathToDataSets(),
                                     "blanchettesSimulation", "00.job",
                                     "true.mfa")

        # Load the true alignment.
        columnAlignment = list(fastaAlignmentRead(trueAlignment))
        fastaHeaders = list(fastaReadHeaders(trueAlignment))
        sequenceNumber = 9

        # The tree
        newickTreeString = "((((HUMAN:0.006969, CHIMP:0.009727):0.025291, BABOON:0.044568):0.11,(RAT:0.072818, MOUSE:0.081244):0.260342):0.023260,((DOG:0.07, CAT:0.07):0.087381,(PIG:0.06, COW:0.06):0.104728):0.04);"

        # Get random dir
        testDir = getTempDirectory(tempDir)

        # Random alignment: a fixed-length window at a random offset.
        alignmentLength = 5000
        randomStart = random.choice(
            xrange(len(columnAlignment) - alignmentLength))
        subAlignment = columnAlignment[
            randomStart:randomStart + alignmentLength]
        logger.info("Got a sub alignment, it is %i columns long"
                    % len(subAlignment))

        # Get sequences: each row of the window with the gap characters
        # removed.
        sequences = [
            (fastaHeaders[seqNo],
             "".join([column[seqNo] for column in subAlignment
                      if column[seqNo] != '-']))
            for seqNo in xrange(sequenceNumber)]
        logger.info("Got the sequences")

        # Write sequences into temp files
        tempFastaFiles = []
        for seqNo, (header, sequence) in enumerate(sequences):
            logger.info("Making temp file for header: %s, seq: %s"
                        % (header, sequence))
            tempFastaFile = os.path.join(testDir, "%i.fa" % seqNo)
            tempFastaFiles.append(tempFastaFile)
            # "with" closes the handle even if fastaWrite raises.
            with open(tempFastaFile, "w") as fileHandle:
                fastaWrite(fileHandle, header, sequence)
        logger.info("Got the temp sequence files")

        experiment = getCactusWorkflowExperimentForTest(
            tempFastaFiles, newickTreeString, testDir)
        experimentFile = os.path.join(testDir, "experiment.xml")
        experiment.writeXML(experimentFile)
        cactusDiskDatabaseString = experiment.getDiskDatabaseString()

        jobTree = os.path.join(testDir, "jobTree")
        runCactusWorkflow(experimentFile, jobTree)
        logger.info("Ran the the workflow")

        # Check the output alignment
        runJobTreeStatusAndFailIfNotComplete(jobTree)
        logger.info("Checked the job tree dir")

        # Output the 'TRUE' alignment file, but only when every external
        # tool needed for the comparison can be invoked. all() stops at the
        # first unavailable tool.
        requiredTools = ("mfaToMaf", "cactus_MAFGenerator",
                         "mafComparator", "cactus_treeStats")
        if all(os.system("%s --help > /dev/null 2>&1" % tool) == 0
               for tool in requiredTools):
            trueMFAFile = os.path.join(testDir, "true.mfa")
            fastaAlignmentWrite(subAlignment, fastaHeaders,
                                len(fastaHeaders), trueMFAFile)
            trueMAFFile = os.path.join(testDir, "true.maf")
            system("mfaToMaf --mfaFile %s --outputFile %s --logLevel %s"
                   % (trueMFAFile, trueMAFFile, getLogLevelString()))
            system("cat %s" % trueMAFFile)

            # Now get mafs for the region.
            mAFFile = os.path.join(testDir, "flower.maf")
            system("cactus_MAFGenerator --flowerName 0 --cactusDisk '%s' --outputFile %s --logLevel %s"
                   % (cactusDiskDatabaseString, mAFFile,
                      getLogLevelString()))
            logger.info("Got the MAFs from the flower disk")
            system("cat %s" % mAFFile)

            statsFile = os.path.join(testDir, "stats.xml")
            system("cactus_treeStats --cactusDisk '%s' --flowerName 0 --outputFile %s --logLevel %s"
                   % (cactusDiskDatabaseString, statsFile,
                      getLogLevelString()))
            system("cat %s" % statsFile)
            logger.info("Got the cactus tree stats")

            # Now compare the mafs to the output.
            resultsFile = os.path.join(testDir, "results.xml")
            system("mafComparator --mafFile1 %s --mafFile2 %s --outputFile %s --logLevel %s"
                   % (trueMAFFile, mAFFile, resultsFile,
                      getLogLevelString()))
            logger.info("Ran the maf comparator")
            system("cat %s" % resultsFile)

        # Cleanup
        # NOTE(review): clean-up is done whether or not the comparison tools
        # were available - confirm this is the intended nesting.
        experiment.cleanupDb()
        system("rm -rf %s" % testDir)
        logger.info("Successfully ran test for the problem")
        system("rm -rf %s" % tempDir)