def progressiveFunction(self, experimentFile, toilDir,
                        batchSystem, buildAvgs,
                        buildHal, buildFasta,
                        toilStats, subtreeRoot=None, logLevel=None):
    """Build a seq file from an experiment XML and run progressive cactus.

    The seq file is written to a fresh temporary file: the first line is
    the newick string of the experiment's tree, followed by one
    ``<genome> <sequenceID>`` line for every genome returned by
    ``getGenomesWithSequence()``.

    ``buildHal``, ``buildFasta`` and ``subtreeRoot`` are accepted but are
    not used by this body.
    """
    experiment = ExperimentWrapper(ET.parse(experimentFile).getroot())
    seqFilePath = getTempFile()

    with open(seqFilePath, 'w') as seqHandle:
        # Header line: the species tree in newick form.
        speciesTree = experiment.getTree()
        newickString = NXNewick().writeString(speciesTree)
        seqHandle.write('%s\n' % newickString)
        # One line per genome that has sequence attached.
        for genomeName in experiment.getGenomesWithSequence():
            sequenceID = experiment.getSequenceID(genomeName)
            seqHandle.write('%s %s\n' % (genomeName, sequenceID))

    configPath = experiment.getConfigPath()
    runCactusProgressive(seqFilePath, configPath, toilDir,
                         batchSystem=batchSystem,
                         buildAvgs=buildAvgs,
                         toilStats=toilStats,
                         logLevel=logLevel)
def progressiveFunction(self, experimentFile, toilDir,
                        batchSystem, buildAvgs,
                        buildReference,
                        buildHal,
                        buildFasta,
                        toilStats,
                        subtreeRoot=None):
    """Run progressive cactus on an experiment and check the c2h headers.

    Steps:
      1. Create a multi-cactus project for ``experimentFile`` in a fresh
         temporary directory (rooted at ``subtreeRoot`` if given).
      2. Run ``runCactusProgressive`` on the generated project XML.
      3. For every sub-experiment, assert that each sequence header in its
         c2h output that belongs to a non-outgroup input genome also
         appears in that genome's input FASTA.
      4. Fail if the toil run did not complete, then remove the temporary
         directory.

    ``buildReference``, ``buildHal`` and ``buildFasta`` are accepted but
    are not used by this body.

    The temporary directory is only removed when every check passes, so a
    failing run leaves its files behind (the path is logged).
    """
    # Local import: shutil is only needed for the final cleanup.
    import shutil

    tempDir = getTempDirectory(os.getcwd())
    tempExperimentDir = os.path.join(tempDir, "exp")
    runCreateMultiCactusProject(experimentFile, tempExperimentDir,
                                fixNames=False, root=subtreeRoot)
    logger.info("Put the temporary files in %s" % tempExperimentDir)
    runCactusProgressive(os.path.join(tempExperimentDir, "exp_project.xml"),
                         toilDir, batchSystem=batchSystem,
                         buildAvgs=buildAvgs, toilStats=toilStats)

    # Check that the headers and sequences in the output are the
    # same as the sequences in the input (minus differences in
    # repeat-masking)
    exp = ExperimentWrapper(ET.parse(experimentFile).getroot())
    seqMap = exp.buildSequenceMap()
    # Maps genome name -> set of headers in its input fasta(s). A set keeps
    # the per-line membership test below O(1).
    headers = {}
    for genomeName, inputSequencePath in seqMap.items():
        if os.path.isdir(inputSequencePath):
            # Some "input sequence paths" are actually provided as
            # directories containing multiple FASTAs. Read each file
            # directly instead of shelling out to `cat` into a temporary
            # file that was never cleaned up.
            fastaPaths = sorted(glob.glob(os.path.join(inputSequencePath, '*')))
        else:
            fastaPaths = [inputSequencePath]
        genomeHeaders = set()
        for fastaPath in fastaPaths:
            genomeHeaders.update(map(itemgetter(0), fastaRead(fastaPath)))
        headers[genomeName] = genomeHeaders

    # check headers inside .c2h output
    for expPath in glob.glob('%s/*/*_experiment.xml' % (tempExperimentDir)):
        subExp = ExperimentWrapper(ET.parse(expPath).getroot())
        outgroups = subExp.getOutgroupEvents()
        c2hPath = subExp.getHALPath()
        with open(c2hPath) as f:
            for line in f:
                fields = line.split('\t')
                if fields[0] != 's':
                    # Not a sequence line
                    continue
                # The first and last character of each field are dropped
                # (presumably surrounding quotes -- confirm against the
                # c2h format).
                genome = fields[1][1:-1]
                header = fields[2][1:-1]
                if genome in headers and genome not in outgroups:
                    # This genome is an input genome
                    self.assertTrue(
                        header in headers[genome],
                        'Header %s from output c2h %s not found in input fa %s'
                        ' for genome %s' % (header, c2hPath,
                                            seqMap[genome], genome))

    runToilStatusAndFailIfNotComplete(toilDir)
    # Remove the scratch directory without building a shell command line.
    shutil.rmtree(tempDir)
def progressiveFunction(self, experimentFile, jobTreeDir,
                        batchSystem, buildAvgs,
                        buildReference,
                        buildHal,
                        buildFasta,
                        jobTreeStats,
                        subtreeRoot=None):
    """Create a multi-cactus project and run progressive cactus via jobTree.

    The project is generated in an ``exp`` sub-directory of a fresh
    temporary directory under the current working directory. After the
    run, the jobTree status is checked and the temporary directory is
    removed.

    ``buildReference``, ``buildHal`` and ``buildFasta`` are accepted but
    are not used by this body.
    """
    scratchDir = getTempDirectory(os.getcwd())
    projectDir = os.path.join(scratchDir, "exp")

    runCactusCreateMultiCactusProject(experimentFile, projectDir,
                                      fixNames=False, root=subtreeRoot)
    logger.info("Put the temporary files in %s" % projectDir)

    projectXml = os.path.join(projectDir, "exp_project.xml")
    runCactusProgressive(projectXml, jobTreeDir,
                         batchSystem=batchSystem,
                         buildAvgs=buildAvgs,
                         jobTreeStats=jobTreeStats)

    # Fail if the workflow did not finish, then clean up.
    runJobTreeStatusAndFailIfNotComplete(jobTreeDir)
    system("rm -rf %s" % scratchDir)
def runProgressive(self):
    """Run the progressive cactus workflow into ``self.outputDir``.

    Does nothing beyond creating ``self.outputDir`` if
    ``<outputDir>/progressiveCactusAlignment`` already exists. Otherwise:

      1. Builds a config XML from ``lib/cactus_workflow_config.xml`` with
         ``self.params`` applied, and an experiment XML from
         ``self.sequences`` / ``self.newickTree``, inside
         ``<outputDir>/tempProgressiveCactusAlignment``.
      2. Creates the multi-cactus project and runs ``runCactusProgressive``.
      3. Checks the jobTree completed and runs ``cactus_treeStats`` on the
         ``Anc0`` experiment (spawning a ktserver around it when the
         experiment's database type is ``kyoto_tycoon``).
      4. Moves the experiment, project, config and jobTree stats into
         ``self.outputDir`` and leaves a symlink at the project's original
         path.

    Reads ``self.outputDir``, ``self.params``, ``self.sequences`` and
    ``self.newickTree``.
    """
    logger.debug("Going to put the alignment in %s" % self.outputDir)
    if not os.path.isdir(self.outputDir):
        os.mkdir(self.outputDir)
    if os.path.exists(os.path.join(self.outputDir, "progressiveCactusAlignment")):
        # Results are already present; nothing to do.
        return

    xmlTree = ET.parse(os.path.join(getRootPathString(), "lib", "cactus_workflow_config.xml"))
    #Set the parameters
    tempLocalDir = os.path.join(self.outputDir, "tempProgressiveCactusAlignment")
    system("rm -rf %s" % tempLocalDir)
    os.mkdir(tempLocalDir)
    #Set the config parameters
    self.params.applyToXml(xmlTree)
    config = xmlTree.getroot()
    assert config is not None
    #Write the config file
    tempConfigFile = os.path.join(tempLocalDir, "config.xml")
    # Let ElementTree open and close the file itself so that no handle is
    # leaked if serialisation raises.
    ET.ElementTree(config).write(tempConfigFile)

    #Make the experiment file
    tempExperimentFile = os.path.join(tempLocalDir, "experiment.xml")
    # NOTE(review): `== True` is kept because the type of
    # params.kyotoTycoon is not visible here -- confirm it is a bool
    # before simplifying to a plain truth test.
    if self.params.kyotoTycoon == True:
        dbConfElem = ET.Element("st_kv_database_conf", type="kyoto_tycoon")
        # The sub-element is attached to dbConfElem; its handle is not needed.
        ET.SubElement(dbConfElem, "kyoto_tycoon", host="localhost",
                      port="1978", database_dir="dummy")
    else:
        dbConfElem = None

    cactusWorkflowExperiment = CactusWorkflowExperiment(
        sequences=self.sequences,
        newickTreeString=self.newickTree,
        #requiredSpecies=self.requiredSpecies,
        #singleCopySpecies=self.singleCopySpecies,
        databaseName="cactusAlignment",
        outputDir=tempLocalDir,
        configFile=tempConfigFile,
        databaseConf=dbConfElem)
    cactusWorkflowExperiment.writeExperimentFile(tempExperimentFile)

    #The jobtree
    tempJobTreeDir = os.path.join(tempLocalDir, "jobTree")
    #The place to put the temporary experiment dir
    tempExperimentDir = os.path.join(tempLocalDir, "progressiveCactusAlignment")

    #The temporary experiment
    runCactusCreateMultiCactusProject(tempExperimentFile, tempExperimentDir)
    logger.info("Setup the cactus progressive experiment")
    runCactusProgressive(os.path.join(tempExperimentDir, "progressiveCactusAlignment_project.xml"),
                         tempJobTreeDir,
                         #batchSystem=batchSystem,
                         buildMaf=True,
                         joinMaf=True,
                         #buildTrees=buildTrees, buildFaces=buildFaces, buildReference=buildReference,
                         jobTreeStats=True,
                         maxThreads=4,
                         logLevel="DEBUG")
    logger.info("Ran the progressive workflow")

    #Check if the jobtree completed sucessively.
    runJobTreeStatusAndFailIfNotComplete(tempJobTreeDir)
    logger.info("Checked the job tree dir for the progressive run")

    #Run the cactus tree stats
    expPath = os.path.join(tempExperimentDir, "Anc0", "Anc0_experiment.xml")
    exp = ExperimentWrapper(ET.parse(expPath).getroot())
    usesKtserver = exp.getDbType() == "kyoto_tycoon"
    ktserver = None
    if usesKtserver:
        ktserver = KtserverLauncher()
        ktserver.spawnServer(exp)
    treeStatsFile = os.path.join(self.outputDir, "treeStats.xml")
    try:
        system("cactus_treeStats --cactusDisk '%s' --flowerName 0 --outputFile %s"
               % (exp.getDiskDatabaseString(), treeStatsFile))
    finally:
        # Always stop the server we started, even if treeStats failed.
        if usesKtserver:
            ktserver.killServer(exp)

    #Now copy the true assembly back to the output
    system("mv %s %s/experiment.xml" % (tempExperimentFile, self.outputDir))
    system("mv %s %s" % (tempExperimentDir, self.outputDir))
    system("jobTreeStats --jobTree %s --outputFile %s/jobTreeStats.xml" % (tempJobTreeDir, self.outputDir))
    system("mv %s %s/config.xml" % (tempConfigFile, self.outputDir))

    #But keep a link to the multicactus project in its original path so we can navigate
    # the paths in the xml...
    actualResultsDir = os.path.join(os.path.abspath(self.outputDir), "progressiveCactusAlignment")
    tempResultsDir = os.path.join(self.outputDir, "tempProgressiveCactusAlignment")
    system("ln -s %s %s" % (actualResultsDir, tempResultsDir))
def progressiveFunction(self, experimentFile, toilDir,
                        batchSystem, buildAvgs,
                        buildReference,
                        buildHal,
                        buildFasta,
                        toilStats,
                        subtreeRoot=None):
    """Run progressive cactus and verify output headers against the inputs.

    A multi-cactus project is created in a temporary directory and run
    with ``runCactusProgressive``. Afterwards every sequence line of each
    sub-experiment's c2h file is inspected: for genomes that are inputs
    and not outgroups of that sub-experiment, the header must be present
    in the genome's input FASTA. Finally the toil run is checked for
    completion and the temporary directory is deleted.

    ``buildReference``, ``buildHal`` and ``buildFasta`` are accepted but
    are not used by this body.
    """
    scratchDir = getTempDirectory(os.getcwd())
    projectDir = os.path.join(scratchDir, "exp")
    runCreateMultiCactusProject(experimentFile, projectDir,
                                fixNames=False, root=subtreeRoot)
    logger.info("Put the temporary files in %s" % projectDir)
    projectXml = os.path.join(projectDir, "exp_project.xml")
    runCactusProgressive(projectXml, toilDir,
                         batchSystem=batchSystem,
                         buildAvgs=buildAvgs,
                         toilStats=toilStats)

    # Collect the headers of every input genome so the output can be
    # compared against them (sequence content may differ in
    # repeat-masking, headers must not).
    experiment = ExperimentWrapper(ET.parse(experimentFile).getroot())
    seqMap = experiment.buildSequenceMap()
    # genome name -> list of headers found in its input fasta
    headers = {}
    for genomeName, fastaPath in seqMap.items():
        if os.path.isdir(fastaPath):
            # A directory of FASTAs: concatenate them into one temp file.
            mergedPath = getTempFile()
            system("cat %s/* > %s" % (fastaPath, mergedPath))
            fastaPath = mergedPath
        headers[genomeName] = [record[0] for record in fastaRead(fastaPath)]

    # Walk the c2h output of every sub-experiment.
    subExperimentPattern = '%s/*/*_experiment.xml' % (projectDir)
    for subExperimentPath in glob.glob(subExperimentPattern):
        subExperiment = ExperimentWrapper(ET.parse(subExperimentPath).getroot())
        outgroups = subExperiment.getOutgroupEvents()
        c2hPath = subExperiment.getHALPath()
        with open(c2hPath) as c2hHandle:
            for rawLine in c2hHandle:
                columns = rawLine.split('\t')
                if columns[0] != 's':
                    # Only sequence lines carry headers.
                    continue
                genome = columns[1][1:-1]
                header = columns[2][1:-1]
                if genome not in headers or genome in outgroups:
                    # Not an input genome of this sub-experiment.
                    continue
                failureMessage = ('Header %s from output c2h %s not found in input fa %s'
                                  ' for genome %s' % (header, c2hPath,
                                                      seqMap[genome], genome))
                self.assertTrue(header in headers[genome], failureMessage)

    runToilStatusAndFailIfNotComplete(toilDir)
    system("rm -rf %s" % scratchDir)
self.progressiveFunction(experimentFile, toilDir, batchSystem, buildAvgs, buildHal, buildFasta, toilStats, subtreeRoot)
# NOTE(review): the call above is the tail of a definition that starts
# before this chunk; it is preserved verbatim.


def progressiveFunction(self, experimentFile, toilDir,
                        batchSystem, buildAvgs,
                        buildHal, buildFasta,
                        toilStats, subtreeRoot=None):
    """Write a seq file for the experiment and run progressive cactus on it.

    The seq file (a fresh temporary file) holds the newick string of the
    experiment's tree on its first line, then one ``<genome> <sequenceID>``
    line per genome returned by ``getGenomesWithSequence()``.

    ``buildHal``, ``buildFasta`` and ``subtreeRoot`` are accepted but are
    not used by this body.
    """
    eW = ExperimentWrapper(ET.parse(experimentFile).getroot())
    seqFile = getTempFile()
    with open(seqFile, 'w') as f:
        tree = eW.getTree()
        # A stray `o` token sat between these two statements in the
        # original; it was not valid Python and has been removed.
        newick = NXNewick().writeString(tree)
        f.write('%s\n' % newick)
        for genome in eW.getGenomesWithSequence():
            f.write('%s %s\n' % (genome, eW.getSequenceID(genome)))
    config = eW.getConfigPath()
    runCactusProgressive(seqFile, config, toilDir,
                         batchSystem=batchSystem,
                         buildAvgs=buildAvgs,
                         toilStats=toilStats)


if __name__ == '__main__':
    unittest.main()