Example #1
 def run(self):
     ##########################################
     #Setup a file tree.
     ##########################################
         
     tempFileTree = TempFileTree(os.path.join(self.getGlobalTempDir(), getRandomAlphaNumericString()))   
     
     fileTreeRootFile = tempFileTree.getTempFile()
 
     makeFileTree(fileTreeRootFile, \
                  self.depth, tempFileTree)
     
     treePointer = tempFileTree.getTempFile()
     
     makeTreePointer(fileTreeRootFile, treePointer)
     
     logger.info("We've set up the file tree")
     
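     #Fail roughly half the time on purpose; this run() is test code, presumably exercising jobTree's job-retry machinery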
     if random.random() > 0.5:
         raise RuntimeError()
     
     ##########################################
     #Issue the child and follow on jobs
     ##########################################
     
     self.addChildTarget(ChildTarget(treePointer))
     
     self.setFollowOnTarget(DestructFileTree(tempFileTree))
     
     logger.info("We've added the child target and finished SetupFileTree.run()")
Example #2
 def run(self):
     tempFileTree = TempFileTree(os.path.join(self.getGlobalTempDir(), "allAgainstAllResults"))
     #Make the list of blast jobs.
     for i in xrange(0, len(self.chunks)):
         for j in xrange(i+1, len(self.chunks)):
             resultsFile = tempFileTree.getTempFile()
             self.resultsFiles.append(resultsFile)
             self.addChildTarget(RunBlast(self.blastOptions, self.chunks[i], self.chunks[j], resultsFile))
     logger.info("Made the list of all-against-all blasts")
     #Set up the job to collate all the results
     self.setFollowOnTarget(CollateBlasts(self.finalResultsFile, self.resultsFiles))
Example #3
def single_copy_wrapper(target, args):
    """
    Main pipeline wrapper. Runs halSingleCopyRegionsExtract once for each region in the conserved_bed file.
    """
    bed_recs = [x.split()[:3] for x in open(args.conserved_bed)]
    result_dir = target.getGlobalTempDir()
    result_tree = TempFileTree(result_dir)
    for chunk in grouper(bed_recs, 10):
        result_path = result_tree.getTempFile()
        target.addChildTargetFn(find_single_copy,
                                args=(args, chunk, result_path))
    target.setFollowOnTargetFn(cat_results,
                               args=(args, result_tree.listFiles()))
Example #4
def extract_maf_wrapper(target, args):
    """
    Main pipeline wrapper. Calls out to hal2maf once for each region in args.conserved_bed
    """
    accelerated_genomes = set(args.accelerated_genomes + [args.ref_genome])
    outgroup_genomes = set(args.target_genomes) - accelerated_genomes
    bed_recs = [x.split() for x in open(args.conserved_bed)]
    result_dir = target.getGlobalTempDir()
    result_tree = TempFileTree(result_dir)
    for chunk in grouper(bed_recs, 50):
        result_path = result_tree.getTempFile()
        target.addChildTargetFn(extract_and_calculate, args=(args, chunk, accelerated_genomes, outgroup_genomes, result_path))
    target.setFollowOnTargetFn(cat_results, args=(args, result_tree.listFiles()))
Example #5
def main_hints_fn(target, bam_paths, db_path, genome, genome_fasta, hints_dir):
    """
    Main driver function. Loops over each BAM, inferring paired-ness, then passing each BAM with one chromosome name
    for filtering. Each BAM will remain separated until the final concatenation and sorting of the hint gffs.
    """
    filtered_bam_tree = TempFileTree(get_tmp(target, global_dir=True, name="filter_file_tree"))
    for bam_path in bam_paths:
        paired = "--paired --pairwiseAlignments" if bam_is_paired(bam_path) is True else ""
        sam_handle = pysam.Samfile(bam_path)
        for references in group_references(sam_handle):
            out_filter = filtered_bam_tree.getTempFile(suffix=".bam")
            target.addChildTargetFn(sort_by_name, memory=8 * 1024 ** 3, cpu=2, 
                                    args=[bam_path, references, out_filter, paired])
    target.setFollowOnTargetFn(build_hints, args=[filtered_bam_tree, genome, db_path, genome_fasta, hints_dir])
Example #6
def dless_wrapper(target, args, split_ss_dict):
    """
    Wrapper for dless function.
    """
    # split_ss_dict is passed in by the caller (e.g. built with read_subalignment_dir)
    output_gff_tree = TempFileTree(
        os.path.join(target.getGlobalTempDir(), 'output_gff'))
    for chromosome, split_ss_dir in split_ss_dict.iteritems():
        for split_ss in os.listdir(split_ss_dir):
            gff_path = output_gff_tree.getTempFile(suffix=split_ss + '.gff')
            split_ss_path = os.path.join(split_ss_dir, split_ss)
            target.addChildTargetFn(dless,
                                    args=(split_ss_path, gff_path, args.model))
    target.setFollowOnTargetFn(cat_dless, args=(args, output_gff_tree))
Example #7
 def run(self):
     chunks1 = self.getChunks(self.sequenceFiles1, makeSubDir(os.path.join(self.getGlobalTempDir(), "chunks1")))
     chunks2 = self.getChunks(self.sequenceFiles2, makeSubDir(os.path.join(self.getGlobalTempDir(), "chunks2")))
     tempFileTree = TempFileTree(os.path.join(self.getGlobalTempDir(), "allAgainstAllResults"))
     resultsFiles = []
     #Make the list of blast jobs.
     for chunk1 in chunks1:
         for chunk2 in chunks2:
             resultsFile = tempFileTree.getTempFile()
             resultsFiles.append(resultsFile)
             #TODO: Make the compression work
             self.blastOptions.compressFiles = False
             self.addChildTarget(RunBlast(self.blastOptions, chunk1, chunk2, resultsFile))
     logger.info("Made the list of blasts")
     #Set up the job to collate all the results
     self.setFollowOnTarget(CollateBlasts(self.finalResultsFile, resultsFiles))
Example #8
 def setUp(self):
     unittest.TestCase.setUp(self)
     self.testNo = TestStatus.getTestSetup(1, 1, 2, 2)
     self.depth = TestStatus.getTestSetup(1, 2, 3, 5)
     self.jobTreeDir = os.path.join(os.getcwd(), "jobTree") #A directory for the job tree to be created in
     self.tempFileTreeDir = os.path.join(os.getcwd(), "tempFileTree") #Ensures that file tree is visible
     self.tempFileTree = TempFileTree(self.tempFileTreeDir) #A place to get temp files from
Example #9
 def setUp(self):
     unittest.TestCase.setUp(self)
     self.testNo = TestStatus.getTestSetup(1, 1, 5, 5)
     self.depth = TestStatus.getTestSetup(1, 2, 2, 3)
     self.jobTreeDir = os.getcwd() + "/jobTree" #A directory for the job tree to be created in
     self.tempFileTreeDir = os.path.join(os.getcwd(), "tempFileTree")
     self.tempFileTree = TempFileTree(self.tempFileTreeDir) #A place to get temp files from
     parasolRestart()
Example #10
def align_augustus(target, genome, ref_fasta, target_fasta, target_fasta_index,
                   out_db):
    file_tree = TempFileTree(target.getGlobalTempDir())
    tgt_ids = [x.split()[0] for x in open(target_fasta_index)]
    for chunk in grouper(tgt_ids, 250):
        target.addChildTargetFn(
            align, args=[target_fasta, chunk, ref_fasta, file_tree])
    target.setFollowOnTargetFn(cat, args=(genome, file_tree, out_db))
Example #11
class TestCase(unittest.TestCase):
    
    def setUp(self):
        unittest.TestCase.setUp(self)
        self.testNo = TestStatus.getTestSetup(1, 1, 5, 5)
        self.depth = TestStatus.getTestSetup(1, 2, 2, 3)
        self.jobTreeDir = os.getcwd() + "/jobTree" #A directory for the job tree to be created in
        self.tempFileTreeDir = os.path.join(os.getcwd(), "tempFileTree")
        self.tempFileTree = TempFileTree(self.tempFileTreeDir) #A place to get temp files from
        parasolRestart()
    
    def tearDown(self):
        unittest.TestCase.tearDown(self)
        self.tempFileTree.destroyTempFiles()
        parasolStop()
        parasolRestart()
        system("rm -rf %s %s" % (self.jobTreeDir, self.tempFileTreeDir)) #Cleanup the job tree in case it hasn't already been cleaned up.

    def testJobTree_Parasol(self):
        """Runs a test program using the job tree, whilst constantly restarting parasol
        by killing the nodes.
        """
        for test in xrange(self.testNo): #Does not run this test when doing short testing
            jobTreeCommand, fileTreeRootFile = setupJobTree(self.tempFileTree, self.jobTreeDir, 
                                                            "parasol", depth=self.depth)
            jobTreeCommand += " --rescueJobsFrequency 20"
            #Run the job
            parasolAndMasterKiller = ParasolAndMasterKiller()
            parasolAndMasterKiller.start()
            while True:
                while True:
                    process = subprocess.Popen(jobTreeCommand, shell=True)
                    sts = os.waitpid(process.pid, 0)
                    if sts[1] == 0:
                        logger.info("The job tree master ended, with an okay exit value (using parasol)")
                        break
                    else:
                        logger.info("The job tree master ended with an error exit value, restarting: %i" % sts[1])
                if checkEndStateOfJobTree(self.jobTreeDir): #Check the state of the job files
                    break
                
                jobTreeCommand = "jobTreeRun --jobTree %s --logDebug" % self.jobTreeDir
            checkFileTreeCounts(fileTreeRootFile)
            os.system("rm -rf %s" % self.jobTreeDir)
            parasolAndMasterKiller.stopKilling()
            logger.info("Test done okay")
Example #12
class TestCase(unittest.TestCase):
    
    def setUp(self):
        unittest.TestCase.setUp(self)
        self.jobTreeDir = os.path.join(os.getcwd(), "testJobTree") #A directory for the job tree to be created in
        self.tempFileTreeDir = os.path.join(os.getcwd(), "tempFileTree") #Ensures that file tree is visible
        self.tempFileTree = TempFileTree(self.tempFileTreeDir) #A place to get temp files from
    
    def tearDown(self):
        unittest.TestCase.tearDown(self)
        self.tempFileTree.destroyTempFiles()
        system("rm -rf %s %s" % (self.jobTreeDir, self.tempFileTreeDir)) #Cleanup the job tree in case it hasn't already been cleaned up.
   
    # only done in singleMachine for now.  Experts can run manually on other systems if they choose
    def dependenciesTest(self, batchSystem="singleMachine", furtherOptionsString=""):
        def fn(tree, maxCpus, maxThreads, size, cpusPerJob, sleepTime):
            system("rm -rf %s" % self.jobTreeDir)
            logName = self.tempFileTree.getTempFile(suffix="_comblog.txt", makeDir=False)
            commandLine = "jobTreeTest_Dependencies.py --jobTree %s --logFile %s --batchSystem '%s' --tree %s --maxCpus %s --maxThreads %s --size %s --cpusPerJob=%s --sleepTime %s %s" % \
            (self.jobTreeDir, logName, batchSystem, tree, maxCpus, maxThreads, size, cpusPerJob, sleepTime, furtherOptionsString)
            system(commandLine)
        
        fn("comb", 10, 100, 100, 1, 10)
        fn("comb", 200, 100, 100, 20, 10)
       
        fn("fly", 10, 8, 100, 1, 10)
        fn("fly", 10, 8, 100, 2, 10)
        
        fn("balanced", 5, 10, 100, 1, 10)
        fn("balanced", 5, 10, 100, 3, 10)
        
    def testJobTree_dependencies_singleMachine(self):
        self.dependenciesTest(batchSystem="singleMachine")
        
    def testJobTree_dependencies_combined(self):
        self.dependenciesTest(batchSystem="singleMachine", furtherOptionsString="--bigBatchSystem singleMachine --bigMemoryThreshold 1000000")
        
    def testJobTree_dependencies_parasol(self):
        return  #Disabled: this unconditional return skips the parasol test below
        if parasolIsInstalled():
            self.dependenciesTest(batchSystem="parasol")
            
    def testJobTree_dependencies_gridengine(self):
        return  #Disabled: this unconditional return skips the gridengine test below
        if gridEngineIsInstalled():
            self.dependenciesTest(batchSystem="gridengine")
Example #13
class TestCase(unittest.TestCase):
    
    def setUp(self):
        unittest.TestCase.setUp(self)
        self.testNo = TestStatus.getTestSetup(1, 1, 2, 2)
        self.depth = TestStatus.getTestSetup(1, 2, 3, 5)
        self.jobTreeDir = os.path.join(os.getcwd(), "jobTree") #A directory for the job tree to be created in
        self.tempFileTreeDir = os.path.join(os.getcwd(), "tempFileTree") #Ensures that file tree is visible
        self.tempFileTree = TempFileTree(self.tempFileTreeDir) #A place to get temp files from
    
    def tearDown(self):
        unittest.TestCase.tearDown(self)
        self.tempFileTree.destroyTempFiles()
        system("rm -rf %s %s" % (self.jobTreeDir, self.tempFileTreeDir)) #Cleanup the job tree in case it hasn't already been cleaned up.
     
    def testJobTree_SingleMachine(self):
        testJobTree(self.testNo, self.depth, self.tempFileTree, self.jobTreeDir, "singleMachine")    
    
    def testJobTree_Parasol(self):
        if parasolIsInstalled():
            testJobTree(self.testNo, self.depth, self.tempFileTree, self.jobTreeDir, "parasol") 
    
    def testJobTree_gridengine(self):
        if gridEngineIsInstalled():
            testJobTree(self.testNo, self.depth, self.tempFileTree, self.jobTreeDir, "gridengine") 
    
    def testJobTree_dependencies(self):
        commandLine = "jobTreeTest_Dependencies.py --jobTree %s --tree comb --maxThreads 100" % self.jobTreeDir
        os.system("rm -rf %s" % self.jobTreeDir)
        system(commandLine)
        commandLine = "jobTreeTest_Dependencies.py --jobTree %s --tree fly --maxThreads 100" % self.jobTreeDir
        os.system("rm -rf %s" % self.jobTreeDir)
        system(commandLine)
        os.system("rm -rf %s" % self.jobTreeDir)
        commandLine = "jobTreeTest_Dependencies.py --jobTree %s --tree balanced --maxThreads 100" % self.jobTreeDir
        system(commandLine)
Example #14
def align_gp(target, genome, ref_genome, ref_tx_fasta, target_genome_fasta, gp,
             mode, out_db, comp_ann_path, chunk_size):
    """
    Initial wrapper job. Constructs a file tree and starts alignment job batches in groups of chunk_size.
    Follow on: concatenates file tree.
    """
    file_tree = TempFileTree(target.getGlobalTempDir())
    for recs in grouper(open(gp), chunk_size):
        target.addChildTargetFn(align_wrapper,
                                args=[
                                    recs, file_tree, ref_tx_fasta,
                                    target_genome_fasta, comp_ann_path,
                                    ref_genome, mode
                                ])
    target.setFollowOnTargetFn(cat, args=[genome, file_tree, out_db, mode])
Example #15
def build_analyses(target, ref_genome, genome, annotation_gp, psl, gp, aug_gp,
                   fasta, ref_fasta, sizes, gencode_attributes, out_dir):
    # find all user-defined classes in the categories of analyses
    out_file_tree = TempFileTree(target.getGlobalTempDir())
    classifiers = classes_in_module(src.augustus_classifiers)
    for classifier in classifiers:
        target.addChildTarget(
            classifier(genome, psl, fasta, ref_fasta, annotation_gp,
                       gencode_attributes, gp, ref_genome, out_file_tree,
                       aug_gp))
    # merge the resulting pickled files into sqlite databases and construct BED tracks
    target.setFollowOnTargetFn(database,
                               memory=8 * (1024**3),
                               args=(out_dir, genome, psl, sizes, gp,
                                     annotation_gp, out_file_tree))
Example #16
def wrapper(target, input_gp, output_gtf, genome, sizes_path, fasta_path):
    """
    Produces one jobTree target per genePred entry. In the future, we could try chunking this per target but during
    initial testing I found that it takes ~15 seconds to extract the RNAseq hints and ~1 minute to run each Augustus
    instance. This seems to be a good time per job to me.
    """
    # create a file tree in the global output directory. This tree will store the gtf created by each Augustus instance
    out_file_tree = TempFileTree(target.getGlobalTempDir())
    unsorted_tmp_file = os.path.join(target.getGlobalTempDir(),
                                     getRandomAlphaNumericString(10))
    for line in open(input_gp):
        target.addChildTargetFn(
            transmap_2_aug,
            args=[line, genome, sizes_path, fasta_path, out_file_tree])
    target.setFollowOnTargetFn(
        cat, args=[genome, output_gtf, unsorted_tmp_file, out_file_tree])
Example #17
def wrapper(target, input_gp, output_gtf, genome, sizes_path, fasta_path,
            hints_db):
    """
    Produces one jobTree target per genePred entry.
    """
    # create a file tree in the global output directory. This tree will store the gtf created by each Augustus instance
    out_file_tree = TempFileTree(target.getGlobalTempDir())
    # this file will be where we reduce the final results to before sorting
    unsorted_tmp_file = os.path.join(target.getGlobalTempDir(),
                                     getRandomAlphaNumericString(10))
    for line in open(input_gp):
        target.addChildTargetFn(transmap_2_aug,
                                memory=8 * (1024**3),
                                args=[
                                    line, genome, sizes_path, fasta_path,
                                    out_file_tree, hints_db
                                ])
    target.setFollowOnTargetFn(
        cat, args=[output_gtf, unsorted_tmp_file, out_file_tree])
Example #18
def build_hints(target, filtered_bam_tree, genome, db_path, genome_fasta, hints_dir):
    """
    Driver function for hint building. Builds intron and exon hints, then calls cat_hints to do the final concatenation
    and sorting.
    """
    bam_files = [x for x in filtered_bam_tree.listFiles() if x.endswith("bam")]
    intron_hints_tree = TempFileTree(get_tmp(target, global_dir=True, name="intron_hints_tree"))
    exon_hints_tree = TempFileTree(get_tmp(target, global_dir=True, name="exon_hints_tree"))
    for bam_file in bam_files:
        intron_hints_path = intron_hints_tree.getTempFile(suffix=".intron.gff")
        target.addChildTargetFn(build_intron_hints, memory=8 * 1024 ** 3, cpu=2, args=[bam_file, intron_hints_path])
        exon_hints_path = exon_hints_tree.getTempFile(suffix=".exon.gff")
        target.addChildTargetFn(build_exon_hints, memory=8 * 1024 ** 3, cpu=2, args=[bam_file, exon_hints_path])
    target.setFollowOnTargetFn(cat_hints, args=[intron_hints_tree, exon_hints_tree, genome, db_path, genome_fasta,
                                                hints_dir])
Example #19
    def testTempFileTree(self):
        for test in range(100):  # was: range(self.testNo)
            levels = random.choice(range(1, 4))
            fileNo = random.choice(range(1, 6))
            maxTempFiles = int(math.pow(fileNo, levels))

            print("Got %s levels, %s fileNo and %s maxTempFiles" %
                  (levels, fileNo, maxTempFiles))

            tempFileTreeRootDir = os.path.join(self.tempDir,
                                               getRandomAlphaNumericString())
            tempFileTree = TempFileTree(tempFileTreeRootDir, fileNo, levels)

            tempFiles = []
            tempDirs = []
            #Check we can create the maximum number of temp files.
            for i in range(maxTempFiles):
                if random.random() > 0.5:
                    tempFile = tempFileTree.getTempFile()
                    assert os.path.isfile(tempFile)
                    tempFiles.append(tempFile)
                else:
                    tempFile = tempFileTree.getTempDirectory()
                    assert os.path.isdir(tempFile)
                    tempDirs.append(tempFile)

            #Check that a RuntimeError is raised once the maximum number of temp files is reached
            try:
                tempFileTree.getTempFile()
                assert False
            except RuntimeError:
                logger.debug("Got expected error message")

            #Now remove a few temp files
            while random.random() > 0.1 and len(tempFiles) > 0:
                tempFile = tempFiles.pop()
                assert os.path.isfile(tempFile)
                tempFileTree.destroyTempFile(tempFile)
                assert not os.path.isfile(tempFile)

            #Now remove a few temp dirs
            while random.random() > 0.1 and len(tempDirs) > 0:
                tempDir = tempDirs.pop()
                assert os.path.isdir(tempDir)
                tempFileTree.destroyTempDir(tempDir)
                assert not os.path.isdir(tempDir)

            #Check that listFiles() reports exactly the remaining files and dirs
            assert set(tempFileTree.listFiles()) == set(tempFiles + tempDirs)

            #Either remove all the temp files or just destroy the whole thing
            if random.random() > 0.5:
                #Remove all temp files and check thing is empty.
                for tempFile in tempFiles:
                    tempFileTree.destroyTempFile(tempFile)
                for tempDir in tempDirs:
                    tempFileTree.destroyTempDir(tempDir)
                os.remove(os.path.join(tempFileTreeRootDir, "lock"))
                os.rmdir(tempFileTreeRootDir)
            else:
                tempFileTree.destroyTempFiles()
                assert not os.path.isdir(tempFileTreeRootDir)
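A minimal standalone sketch of the TempFileTree calls exercised in Example #19 (the sonLib.bioio import path is an assumption; the constructor signature TempFileTree(rootDir, filesPerDir, levels) and the other calls are taken directly from the snippets above):

import os
from sonLib.bioio import TempFileTree, getRandomAlphaNumericString  # assumed import path

rootDir = os.path.join(os.getcwd(), getRandomAlphaNumericString())
tempFileTree = TempFileTree(rootDir, 5, 2)          # holds at most 5**2 = 25 entries, as in Example #19
tempFile = tempFileTree.getTempFile(suffix=".txt")  # allocate a temp file inside the tree
tempDir = tempFileTree.getTempDirectory()           # or a temp directory
print(tempFileTree.listFiles())                     # paths handed out so far
tempFileTree.destroyTempFile(tempFile)              # remove a single file
tempFileTree.destroyTempFiles()                     # remove everything, including rootDir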
Example #20
 def setUp(self):
     unittest.TestCase.setUp(self)
     self.jobTreeDir = os.path.join(os.getcwd(), "testJobTree") #A directory for the job tree to be created in
     self.tempFileTreeDir = os.path.join(os.getcwd(), "tempFileTree") #Ensures that file tree is visible
     self.tempFileTree = TempFileTree(self.tempFileTreeDir) #A place to get temp files from
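Most of the wrapper functions above share the same fan-out/fan-in shape: allocate a TempFileTree under the target's global temp dir, add one child job per chunk of work, and let the follow-on collate result_tree.listFiles(). A condensed sketch of that pattern (process_chunk, merge_results, grouper and args.input_bed are placeholder names rather than identifiers from the snippets; the Target methods are used exactly as in Examples #3 and #4):

import os
from sonLib.bioio import TempFileTree  # assumed import path

def pipeline_wrapper(target, args):
    # grouper, process_chunk and merge_results are placeholders (see note above)
    # fan out: one temp result file and one child job per chunk of records
    result_tree = TempFileTree(os.path.join(target.getGlobalTempDir(), "results"))
    records = [line.split() for line in open(args.input_bed)]
    for chunk in grouper(records, 50):
        result_path = result_tree.getTempFile()
        target.addChildTargetFn(process_chunk, args=(args, chunk, result_path))
    # fan in: the follow-on only runs after every child job has finished
    target.setFollowOnTargetFn(merge_results, args=(args, result_tree.listFiles()))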