def run(self, fileStore):
    """Toil job entry point: preprocess the input assemblies, then schedule
    the progressive-down alignment rooted at the tree root.

    Returns the promise (`.rv()`) of the follow-on
    RunCactusPreprocessorThenProgressiveDown2 job.
    """
    # Read and parse the project config once; the wrapper substitutes the
    # predefined constants (e.g. default memory symbols) into the tree in place.
    # FIX: the original re-read and re-parsed the same global config file a
    # second time below — self.configNode already holds the substituted tree,
    # so it is reused instead.
    self.configNode = ET.parse(fileStore.readGlobalFile(self.project.getConfigID())).getroot()
    self.configWrapper = ConfigWrapper(self.configNode)
    self.configWrapper.substituteAllPredefinedConstantsWithLiterals()

    # Log the stats for the un-preprocessed assemblies
    for name, sequence in self.project.getInputSequenceIDMap().items():
        self.addChildJobFn(logAssemblyStats, "Before preprocessing", name, sequence)

    # Create jobs to create the output sequences
    logger.info("Reading config file from: %s" % self.project.getConfigID())
    configNode = self.configNode

    # Add the preprocessor child job. The output is a job promise value that will be
    # converted into a list of the IDs of the preprocessed sequences in the follow on job.
    preprocessorJob = self.addChild(CactusPreprocessor(self.project.getInputSequenceIDs(), configNode))
    self.project.setOutputSequenceIDs([preprocessorJob.rv(i)
                                       for i in range(len(self.project.getInputSequenceIDs()))])

    # Now build the progressive-down job
    schedule = Schedule()
    schedule.loadProject(self.project, fileStore=fileStore)
    schedule.compute()
    # Align from the root of the species tree.
    self.options.event = self.project.mcTree.getRootName()
    leafNames = [ self.project.mcTree.getName(i) for i in self.project.mcTree.getLeaves() ]
    fileStore.logToMaster("Leaf names = %s" % leafNames)
    self.options.globalLeafEventSet = set(leafNames)
    return self.addFollowOn(RunCactusPreprocessorThenProgressiveDown2(options=self.options,
                                                                     project=self.project,
                                                                     event=self.options.event,
                                                                     schedule=schedule,
                                                                     memory=self.configWrapper.getDefaultMemory())).rv()
def main():
    """Set up a multi-cactus project directory from an experiment XML template.

    Parses command-line options, validates the species tree and any requested
    outgroups, then writes the project file structure to disk via
    createFileStructure.  Returns 0 on success; raises RuntimeError on bad
    arguments, an already-existing output path, a degenerate tree, or an
    unknown outgroup name.
    """
    usage = "usage: %prog [options] <experiment> <output project path>"
    description = "Setup a multi-cactus project using an experiment xml as template"
    parser = OptionParser(usage=usage, description=description)
    # NOTE: the default is the *string* "True"; it is converted to a real
    # boolean after parsing (see below).
    parser.add_option("--fixNames", dest="fixNames", default = "True",
                      help="try to make sequence and event names MAF-compliant [default=true]")
    parser.add_option("--outgroupNames", dest="outgroupNames", default = None,
                      help="comma-separated names of high quality assemblies to use as outgroups [default=everything]")
    parser.add_option("--root", dest="root", type=str,
                      help="name of alignment root (must be labeled ancestral node in tree in input experiment). Useful "
                      "for allowing the tree to contain nodes that won't be in the alignment but can still be used for "
                      "outgroups.",
                      default=None)
    parser.add_option("--overwrite", action="store_true",
                      help="Overwrite existing experiment files", default=False)

    options, args = parser.parse_args()

    if len(args) != 2:
        parser.print_help()
        raise RuntimeError("Wrong number of arguments")

    options.expFile = args[0]
    options.path = os.path.abspath(args[1])
    options.name = os.path.basename(options.path)
    # Only the literal string "false" (any case) disables name fixing;
    # every other value enables it.
    options.fixNames = not options.fixNames.lower() == "false"

    # Refuse to clobber an existing project unless --overwrite was given
    # (an existing *file* at the path is always an error).
    if (os.path.isdir(options.path) and not options.overwrite) or os.path.isfile(options.path):
        raise RuntimeError("Output project path %s exists\n" % options.path)

    expTemplate = ExperimentWrapper(ET.parse(options.expFile).getroot())
    configPath = expTemplate.getConfigPath()
    confTemplate = ConfigWrapper(ET.parse(configPath).getroot())
    if options.fixNames:
        cleanEventTree(expTemplate)
    checkInputSequencePaths(expTemplate)
    tree = expTemplate.getTree()

    # Check that the tree is sensible (root has at least 1 child)
    if len(tree.getChildren(tree.getRootId())) == 0:
        raise RuntimeError("Input species tree has only one node.")

    # Validate every requested outgroup against the leaf names of the tree.
    if options.outgroupNames is not None:
        projNames = set([tree.getName(x) for x in tree.getLeaves()])
        options.outgroupNames = set(options.outgroupNames.split(","))
        for outgroupName in options.outgroupNames:
            if outgroupName not in projNames:
                raise RuntimeError("Specified outgroup %s not found in tree" % outgroupName)

    mcProj = createMCProject(tree, expTemplate, confTemplate, options)
    #Replace the sequences with output sequences
    expTemplate.updateTree(mcProj.mcTree, expTemplate.buildSequenceMap())
    expTemplate.setSequences(CactusPreprocessor.getOutputSequenceFiles(mcProj.inputSequences, expTemplate.getOutputSequenceDir()))
    #Now do the file tree creation
    createFileStructure(mcProj, expTemplate, confTemplate, options)
    # mcProj.check()
    return 0
def testCactusPreprocessor(self): #Demo sequences sequenceNames = [ "%s.ENm001.fa" % species for species in 'human', "hedgehog" ] sequenceFiles = [ os.path.join(self.encodePath, self.encodeRegion, sequenceName) for sequenceName in sequenceNames ] #Make config file configFile = os.path.join(self.tempDir, "config.xml") rootElem = ET.Element("preprocessor") #<preprocessor chunkSize="10000" proportionToSample="0.2" memory="littleMemory" preprocessorString="cactus_lastzRepeatMask.py --proportionSampled=PROPORTION_SAMPLED --minPeriod=1 --lastzOpts='--step=1 --ambiguous=iupac,100 --ungapped' IN_FILE OUT_FILE "/> preprocessor = ET.SubElement(rootElem, "preprocessor") preprocessor.attrib["chunkSize"] = "100000" preprocessor.attrib["proportionToSample"] = "0.2" preprocessor.attrib["preprocessorString"] = "cactus_lastzRepeatMask.py --proportionSampled=PROPORTION_SAMPLED --fragment=200 --minPeriod=1 --lastzOpts='--step=1 --ambiguous=iupac,100 --ungapped' IN_FILE OUT_FILE" fileHandle = open(configFile, "w") fileHandle.write(ET.tostring(rootElem)) fileHandle.close() #Run preprocessor command = "cactus_preprocessor.py %s %s %s --jobTree %s" % (self.tempDir, configFile, " ".join(sequenceFiles), os.path.join(self.tempDir, "jobTree")) system(command) for sequenceFile, processedSequenceFile in zip(sequenceFiles, CactusPreprocessor.getOutputSequenceFiles(sequenceFiles, self.tempDir)): #Parse sequences into dictionary originalSequences = getSequences(sequenceFile) #Load the new sequences processedSequences = getSequences(processedSequenceFile) #Check they are the same module masking self.checkSequenceSetsEqualModuloSoftMasking(originalSequences, processedSequences) #Compare the proportion of bases masked by lastz with original repeat masking maskedBasesOriginal = getMaskedBases(originalSequences) maskedBasesLastzMasked = getMaskedBases(processedSequences) #Total bases totalBases = sum([ len(i) for i in originalSequences.values() ]) #Calculate number of hard masked bases totalNBases = 
len([ (header, i, base) for (header, i, base) in maskedBasesOriginal if base.upper() == "N" ]) print " For the sequence file ", sequenceFile, \ " the total number of sequences is ", len(originalSequences), \ " the total number of bases ", totalBases, \ " the number of bases originally masked was: ", len(maskedBasesOriginal),\ " the number of bases masked after running lastz repeat masking is: ", len(maskedBasesLastzMasked), \ " the intersection of these masked sets is: ", len(maskedBasesLastzMasked.intersection(maskedBasesOriginal)), \ " the total number of bases that are Ns ", totalNBases #Now compare to running lastz on its own command = "cactus_lastzRepeatMask.py --proportionSampled=0.2 --minPeriod=1 --lastzOpts='--step=1 --ambiguous=iupac,100 --ungapped --queryhsplimit=keep,nowarn:30' --fragment=200 %s %s" % \ (sequenceFile, self.tempOutputFile) popenPush(command, sequenceFile) lastzSequencesFast = getSequences(self.tempOutputFile) maskedBasesLastzMaskedFast = getMaskedBases(lastzSequencesFast) i = float(len(maskedBasesLastzMaskedFast.intersection(maskedBasesLastzMasked))) print " The number of bases masked after running lastz repeat masking without the preprocessor is: ", len(maskedBasesLastzMaskedFast), \ " the recall of the fast vs. the new is: ", i/len(maskedBasesLastzMasked), \ " the precision of the fast vs. the new is: ", i/len(maskedBasesLastzMaskedFast)
def testCactusPreprocessor(self): #Demo sequences sequenceNames = [ "%s.ENm001.fa" % species for species in ['human', 'hedgehog'] ] sequenceFiles = [ os.path.join(self.encodePath, self.encodeRegion, sequenceName) for sequenceName in sequenceNames ] #Make config file configFile = os.path.join(self.tempDir, "config.xml") rootElem = ET.Element("preprocessor") #<preprocessor chunkSize="10000" proportionToSample="0.2" memory="littleMemory" preprocessorString="cactus_lastzRepeatMask.py --proportionSampled=PROPORTION_SAMPLED --minPeriod=1 --lastzOpts='--step=1 --ambiguous=iupac,100 --ungapped' IN_FILE OUT_FILE "/> preprocessor = ET.SubElement(rootElem, "preprocessor") preprocessor.attrib["chunkSize"] = "100000" preprocessor.attrib["proportionToSample"] = "0.2" preprocessor.attrib["preprocessJob"] = "lastzRepeatMask" preprocessor.attrib["minPeriod"] = "1" preprocessor.attrib["lastzOpts"] = "--step=1 --ambiguous=iupac,100 --ungapped" preprocessor.attrib["fragment"] = "200" fileHandle = open(configFile, "w") fileHandle.write(ET.tostring(rootElem)) fileHandle.close() #Run preprocessor tmpToil = os.path.join(self.tempDir, "toil") runCactusPreprocessor(outputSequenceDir=self.tempDir, configFile=configFile, inputSequences=sequenceFiles, toilDir=tmpToil) for sequenceFile, processedSequenceFile in zip(sequenceFiles, CactusPreprocessor.getOutputSequenceFiles(sequenceFiles, self.tempDir)): #Parse sequences into dictionary originalSequences = getSequences(sequenceFile) #Load the new sequences processedSequences = getSequences(processedSequenceFile) #Check they are the same module masking self.checkSequenceSetsEqualModuloSoftMasking(originalSequences, processedSequences) #Compare the proportion of bases masked by lastz with original repeat masking maskedBasesOriginal = getMaskedBases(originalSequences) maskedBasesLastzMasked = getMaskedBases(processedSequences) #Total bases totalBases = sum([ len(i) for i in originalSequences.values() ]) #Calculate number of hard masked bases 
totalNBases = len([ (header, i, base) for (header, i, base) in maskedBasesOriginal if base.upper() == "N" ]) print " For the sequence file ", sequenceFile, \ " the total number of sequences is ", len(originalSequences), \ " the total number of bases ", totalBases, \ " the number of bases originally masked was: ", len(maskedBasesOriginal),\ " the number of bases masked after running lastz repeat masking is: ", len(maskedBasesLastzMasked), \ " the intersection of these masked sets is: ", len(maskedBasesLastzMasked.intersection(maskedBasesOriginal)), \ " the total number of bases that are Ns ", totalNBases self.assertGreater(maskedBasesLastzMasked, maskedBasesOriginal)
def run(self):
    """jobTree target entry point: schedule preprocessing of the project's
    input sequences as a child target, then set ProgressiveDown as the
    follow-on to align from the chosen (or default root) event.
    """
    #Load the multi-cactus project
    project = MultiCactusProject()
    project.readXML(self.args[0])
    #Create jobs to create the output sequences
    configNode = ET.parse(project.getConfigPath()).getroot()
    ConfigWrapper(configNode).substituteAllPredefinedConstantsWithLiterals() #This is necessary..
    #Create the preprocessor
    self.addChildTarget(CactusPreprocessor(project.getInputSequencePaths(),
                                           CactusPreprocessor.getOutputSequenceFiles(project.getInputSequencePaths(), project.getOutputSequenceDir()),
                                           configNode))
    #Now build the progressive-down target
    schedule = Schedule()
    schedule.loadProject(project)
    schedule.compute()
    # Default the alignment root to the species-tree root when no event
    # was supplied on the command line.
    if self.options.event == None:
        self.options.event = project.mcTree.getRootName()
    # The chosen event must correspond to a sub-experiment of the project.
    assert self.options.event in project.expMap
    leafNames = [ project.mcTree.getName(i) for i in project.mcTree.getLeaves() ]
    self.options.globalLeafEventSet = set(leafNames)
    self.setFollowOnTarget(ProgressiveDown(self.options, project, self.options.event, schedule))
def testCactusPreprocessor(self):
    """End-to-end test of runCactusPreprocessor (Toil variant with direct
    LastzRepeatMaskJob comparison).

    Runs the lastzRepeatMask preprocess job on the two ENCODE demo
    assemblies, checks output equals input modulo soft-masking, then runs
    LastzRepeatMaskJob on its own via Toil and prints recall/precision of
    the standalone masking against the preprocessor's masking.
    """
    #Demo sequences
    sequenceNames = [ "%s.ENm001.fa" % species for species in ['human', 'hedgehog'] ]
    sequenceFiles = [ os.path.join(self.encodePath, self.encodeRegion, sequenceName) for sequenceName in sequenceNames ]
    #Make config file
    configFile = os.path.join(self.tempDir, "config.xml")
    rootElem = ET.Element("preprocessor")
    #<preprocessor chunkSize="10000" proportionToSample="0.2" memory="littleMemory" preprocessorString="cactus_lastzRepeatMask.py --proportionSampled=PROPORTION_SAMPLED --minPeriod=1 --lastzOpts='--step=1 --ambiguous=iupac,100 --ungapped' IN_FILE OUT_FILE "/>
    preprocessor = ET.SubElement(rootElem, "preprocessor")
    preprocessor.attrib["chunkSize"] = "100000"
    preprocessor.attrib["proportionToSample"] = "0.2"
    preprocessor.attrib["preprocessJob"] = "lastzRepeatMask"
    preprocessor.attrib["minPeriod"] = "1"
    preprocessor.attrib["lastzOpts"] = "--step=1 --ambiguous=iupac,100 --ungapped"
    preprocessor.attrib["fragment"] = "200"
    fileHandle = open(configFile, "w")
    fileHandle.write(ET.tostring(rootElem))
    fileHandle.close()
    #Run preprocessor
    tmpToil = os.path.join(self.tempDir, "toil")
    runCactusPreprocessor(outputSequenceDir=self.tempDir, configFile=configFile, inputSequences=sequenceFiles, toilDir=tmpToil)

    for sequenceFile, processedSequenceFile in zip(sequenceFiles, CactusPreprocessor.getOutputSequenceFiles(sequenceFiles, self.tempDir)):
        print "sequenceFile: %s" % sequenceFile
        print "output sequence file: %s" % processedSequenceFile
        #Parse sequences into dictionary
        originalSequences = getSequences(sequenceFile)
        #Load the new sequences
        processedSequences = getSequences(processedSequenceFile)
        #Check they are the same module masking
        self.checkSequenceSetsEqualModuloSoftMasking(originalSequences, processedSequences)
        #Compare the proportion of bases masked by lastz with original repeat masking
        maskedBasesOriginal = getMaskedBases(originalSequences)
        maskedBasesLastzMasked = getMaskedBases(processedSequences)
        #Total bases
        totalBases = sum([ len(i) for i in originalSequences.values() ])
        #Calculate number of hard masked bases
        totalNBases = len([ (header, i, base) for (header, i, base) in maskedBasesOriginal if base.upper() == "N" ])
        print " For the sequence file ", sequenceFile, \
        " the total number of sequences is ", len(originalSequences), \
        " the total number of bases ", totalBases, \
        " the number of bases originally masked was: ", len(maskedBasesOriginal),\
        " the number of bases masked after running lastz repeat masking is: ", len(maskedBasesLastzMasked), \
        " the intersection of these masked sets is: ", len(maskedBasesLastzMasked.intersection(maskedBasesOriginal)), \
        " the total number of bases that are Ns ", totalNBases

        #Now compare to running lastz on its own
        # Run the standalone repeat-mask job in a fresh Toil workflow, masking
        # the sequence against itself (targetIDs = [queryID]).
        toilOptions = Job.Runner.getDefaultOptions(os.path.join(self.tempDir, "lastzRepeatMaskToil"))
        toilOptions.logLevel = "CRITICAL"
        with Toil(toilOptions) as toil:
            queryID = toil.importFile(makeURL(sequenceFile))
            targetIDs = [queryID]
            repeatMaskedID = toil.start(LastzRepeatMaskJob(queryID=queryID,
                                                           targetIDs=targetIDs,
                                                           repeatMaskOptions=RepeatMaskOptions(lastzOpts='--step=1 --ambiguous=iupac,100 --ungapped --queryhsplimit=keep,nowarn:30',
                                                                                               minPeriod=1,
                                                                                               proportionSampled=0.2,
                                                                                               fragment=200)))
            toil.exportFile(repeatMaskedID, makeURL(self.tempOutputFile))
        lastzSequencesFast = getSequences(self.tempOutputFile)
        maskedBasesLastzMaskedFast = getMaskedBases(lastzSequencesFast)
        # i = number of masked positions common to both runs; used for
        # recall (vs. preprocessor run) and precision (vs. standalone run).
        i = float(len(maskedBasesLastzMaskedFast.intersection(maskedBasesLastzMasked)))
        print " The number of bases masked after running lastz repeat masking without the preprocessor is: ", len(maskedBasesLastzMaskedFast), \
        " the recall of the fast vs. the new is: ", i/len(maskedBasesLastzMasked), \
        " the precision of the fast vs. the new is: ", i/len(maskedBasesLastzMaskedFast)