Пример #1
0
    def test_get_assets(self):
        """Store a concrete step with three source assets and resurrect them.

        The step is registered in a freshly created cache, then every source
        asset returned by ``cache.get_srcassets`` is resurrected against the
        ``rnaseq`` module. No assertion is made on the resurrected objects:
        the test passes as long as none of the calls raises.
        """
        # an empty model is enough for this step
        empty_model = core.Model(tuple())
        cache = self.cls_to_test(self.cache_file.name,
                                 empty_model,
                                 force_create=True)

        class Activities(core.Enum):
            DATETIME = 'Give date/time'

        python = self.PythonTime('python')
        variant = cache.id_step_variant(python, (Activities.DATETIME, ))

        # sources made of two FASTA files and one sequence of CSV files
        source_fields = [
            core.AssetAttr('reference', rnaseq.FASTAFile, ''),
            core.AssetAttr('otherreference', rnaseq.FASTAFile, ''),
            core.AssetAttr('listoffiles', rnaseq.CSVFileSequence, ''),
        ]
        SrcCls = core.assetfactory('Source', source_fields)
        reference = rnaseq.FASTAFile('foo.fasta')
        otherreference = rnaseq.FASTAFile('bar.fasta')
        listoffiles = rnaseq.CSVFileSequence(
            (rnaseq.CSVFile('baz.csv'), rnaseq.CSVFile('baz2.csv')))
        sources = SrcCls(reference, otherreference, listoffiles)

        # no target assets and no parameters
        targets = core.AssetSet()
        parameters = tuple()
        concrete = cache.id_stepconcrete(variant.id, sources, targets,
                                         parameters)

        for stored in cache.get_srcassets(concrete.id):
            stored.resurrect(rnaseq)
Пример #2
0
    def test_RecipeSimpleIncremental(self):
        """Check that a task already run is seen as done when a project is reopened.

        An index-building task is added to the project and executed. A second
        project is then created on the same model and working directory, and
        the same task is added to it: the new task object must be distinct,
        carry equal source/target assets, be reported as done, and must not
        create an additional concrete step.
        """
        project = self.project
        env = self.env
        reference_fn = self.reference_fn

        # step used
        bowtie2index = env.activities.INDEX.bowtie2build

        from railroadtracks import easy

        # sequence of tasks to run
        torun = list()

        # index for alignment
        Assets = bowtie2index.Assets
        assets = Assets(Assets.Source(rnaseq.FASTAFile(reference_fn)),
                        Assets.Target.createundefined())
        task_index = project.add_task(bowtie2index,
                                      assets)
        # the step is not done
        self.assertEqual(hortator._TASK_TODO, task_index.info[1])
        torun.append(task_index)
        # run the tasks
        for task in torun:
            # run only if not done
            if task.info[1] != hortator._TASK_DONE:
                task.execute()
                task.status = hortator._TASK_DONE

        self.assertEqual(1, project.persistent_graph.nconcrete_steps)
        # now that the tasks have run let's open the same project
        project_same = easy.Project(project.model, wd=project.wd)

        # index for alignment (same step, same source, new asset objects)
        assets = Assets(Assets.Source(rnaseq.FASTAFile(reference_fn)),
                        Assets.Target.createundefined())
        task_index_same = project_same.add_task(bowtie2index,
                                                assets)

        # distinct objects...
        self.assertNotEqual(task_index, task_index_same)
        self.assertNotEqual(task_index.call.assets, task_index_same.call.assets)
        # ...describing the same assets
        self.assertListEqual(list(task_index.call.assets.source.reference),
                             list(task_index_same.call.assets.source.reference))
        self.assertListEqual(list(task_index.call.assets.target.indexfilepattern),
                             list(task_index_same.call.assets.target.indexfilepattern))
        # the reopened project knows the task was already run
        self.assertEqual(hortator._TASK_DONE, task_index_same.info[1])
        # and adding it again did not create a new concrete step
        self.assertEqual(1, project.persistent_graph.nconcrete_steps)
 def setUp(self):
     """Create a project with one bowtie2 index-building task.

     Two temporary directories are created: one is the working directory of
     ``self.project``, the other is kept as ``self.wd2`` for tests needing a
     second project. Both are removed once the test is over.
     """
     import shutil

     wd = tempfile.mkdtemp()
     # cleanups run even when the rest of setUp() or the test itself fails
     self.addCleanup(shutil.rmtree, wd, ignore_errors=True)
     self.wd2 = tempfile.mkdtemp()
     self.addCleanup(shutil.rmtree, self.wd2, ignore_errors=True)
     self.project = easy.Project(rnaseq, wd)
     bowtie2index = rnaseq.Bowtie2Build()
     reference_fn = PHAGEFASTA
     Assets = bowtie2index.Assets
     task = self.project.add_task(
         bowtie2index,
         Assets(Assets.Source(rnaseq.FASTAFile(reference_fn))))
     self.task = task
 def testAddTaskDifferentProject(self):
     """Adding a task from another project to a TaskSetGraph raises ValueError."""
     # a second project, living in its own working directory
     other_project = easy.Project(rnaseq, self.wd2)
     indexer = rnaseq.Bowtie2Build()
     IndexerAssets = indexer.Assets
     other_sources = IndexerAssets.Source(rnaseq.FASTAFile(PHAGEFASTA))
     other_task = other_project.add_task(indexer,
                                         IndexerAssets(other_sources))

     graph = tasksetgraph.TaskSetGraph()
     # the graph is first tied to the task created in setUp()
     graph.add(self.task)
     with self.assertRaises(ValueError):
         graph.add(other_task)
Пример #5
0
    def test_id_stepconcrete(self):
        """Check when ``cache.id_stepconcrete`` returns the same or a new ID.

        Asking twice for the same (step variant, sources, targets,
        parameters) must return the same ID, while changing the tag, the
        sources, or the parameters must return a different one. Asking for a
        known step with different target assets must raise ValueError.
        """
        # an empty model is enough for this step
        model = core.Model(tuple())
        cache = self.cls_to_test(self.cache_file.name,
                                 model,
                                 force_create=True)

        class Activities(core.Enum):
            DATETIME = 'Give date/time'

        activities = (Activities.DATETIME, )
        AssetsIndexer = railroadtracks.model.aligners.AssetsIndexer

        def concrete_id(variant, src, tgt, params, **kwargs):
            # ID under which the cache knows the concrete step
            return cache.id_stepconcrete(variant.id, src, tgt, params,
                                         **kwargs).id

        python = self.PythonTime('python')
        variant = cache.id_step_variant(python, activities)

        # empty sources is a special case
        sources = core.AssetSet()
        targets = core.AssetSet()
        no_params = tuple()
        id_empty = concrete_id(variant, sources, targets, no_params)
        id_empty_again = concrete_id(variant, sources, targets, no_params)
        self.assertEqual(id_empty, id_empty_again)
        # a different tag makes a different concrete step
        id_tagged = concrete_id(variant, sources, targets, no_params, tag=2)
        self.assertNotEqual(id_empty, id_tagged)

        # 1-element sources
        sources = AssetsIndexer.Source(rnaseq.FASTAFile('foo.fasta'))
        id_one_source = concrete_id(variant, sources, targets, no_params)
        self.assertNotEqual(id_empty, id_one_source)
        id_one_source_again = concrete_id(variant, sources, targets,
                                          no_params)
        self.assertEqual(id_one_source_again, id_one_source)

        # 1-element sources, one parameter
        id_one_param = concrete_id(variant, sources, targets, ("%Y", ))
        self.assertNotEqual(id_empty, id_one_param)
        self.assertNotEqual(id_one_source_again, id_one_param)

        # 1-element sources, several parameters
        id_two_params = concrete_id(variant, sources, targets, ("%Y", "Z"))
        id_two_params_again = concrete_id(variant, sources, targets,
                                          ("%Y", "Z"))
        self.assertEqual(id_two_params, id_two_params_again)
        id_other_params = concrete_id(variant, sources, targets, ("%Y", "W"))
        self.assertNotEqual(id_two_params, id_other_params)

        # 2-elements sources
        source_fields = [
            core.AssetAttr('reference', rnaseq.FASTAFile, ''),
            core.AssetAttr('otherreference', rnaseq.FASTAFile, ''),
        ]
        SrcCls = core.assetfactory('Source', source_fields)
        sources = SrcCls(rnaseq.FASTAFile('foo.fasta'),
                         rnaseq.FASTAFile('bar.fasta'))
        id_two_sources = concrete_id(variant, sources, targets, no_params)
        self.assertNotEqual(id_empty, id_two_sources)
        id_two_sources_again = concrete_id(variant, sources, targets,
                                           no_params)
        self.assertEqual(id_two_sources_again, id_two_sources)

        # 1-element source / 1-element target, with another step variant
        sources = AssetsIndexer.Source(rnaseq.FASTAFile('foo.fasta'))
        targets = AssetsIndexer.Target(rnaseq.FilePattern('foo_idx'))
        foo_sh = rnaseq.Anyscript()
        script_variant = cache.id_step_variant(foo_sh, activities)
        id_with_target = concrete_id(script_variant, sources, targets,
                                     no_params)
        self.assertNotEqual(id_empty, id_with_target)
        id_with_target_again = concrete_id(script_variant, sources, targets,
                                           no_params)
        self.assertEqual(id_with_target_again, id_with_target)

        # fail if target assets are suddenly different
        targets_bar = AssetsIndexer.Target(rnaseq.FilePattern('bar_idx'))
        with self.assertRaises(ValueError):
            cache.id_stepconcrete(script_variant.id, sources, targets_bar,
                                  no_params)
Пример #6
0
    def test_RecipeLoop(self):
        """Build and run a recipe looping over aligners and differential expression tools.

        Index-building tasks are added for bowtie, bowtie2 and STAR. Then,
        for each aligner option, every sample is aligned and quantified, the
        counts are merged, and the merged table is handed to each
        differential expression tool. All tasks are finally executed, and
        the outcome of each executed task (done or failed) is recorded in
        the project's persistent graph. Failures of individual tasks do not
        fail the test.
        """
        project = self.project
        env = self.env
        nsamples = self.nsamples
        samplereads = self.samplereads
        sampleinfo_fh = self.sampleinfo_fh
        reference_fn = self.reference_fn
        referenceannotation = self.referenceannotation

        # -- recipeloop-test-begin
        from railroadtracks import easy

        torun = list()

        # bowtie
        bowtie1index = env.activities.INDEX.bowtiebuild
        bowtie1align = env.activities.ALIGN.bowtie
        Assets = bowtie1index.Assets
        fa_file = rnaseq.FASTAFile(reference_fn)
        task_index_bowtie1 = project.add_task(bowtie1index,
                                              Assets(Assets.Source(fa_file),
                                                     None))
        torun.append(task_index_bowtie1)

        # bowtie2
        bowtie2index = env.activities.INDEX.bowtie2build
        bowtie2align = env.activities.ALIGN.bowtie2
        Assets = bowtie2index.Assets
        fa_file = rnaseq.FASTAFile(reference_fn)
        task_index_bowtie2 = project.add_task(bowtie2index,
                                              Assets(Assets.Source(fa_file),
                                                     None))
        torun.append(task_index_bowtie2)

        # STAR
        starindex = env.activities.INDEX.starindex
        staralign = env.activities.ALIGN.staralign
        Assets = starindex.Assets
        fa_file = rnaseq.FASTAFile(reference_fn)
        task_index_star = project.add_task(starindex,
                                           Assets(Assets.Source(fa_file),
                                                  None))
        torun.append(task_index_star)

        # TopHat2
        # (index from bowtie2 used)
        #tophat2 = env.activities.ALIGN.tophat2

        # featureCount
        featurecount = env.activities.QUANTIFY.featurecount

        # Merge columns (obtained from counting)
        merge = env.activities.UTILITY.columnmerger

        # EdgeR, DESeq, DESeq2, and LIMMA voom
        edger = env.activities.DIFFEXP.edger
        deseq = env.activities.DIFFEXP.deseq
        deseq2 = env.activities.DIFFEXP.deseq2
        voom = env.activities.DIFFEXP.limmavoom

        # Now explore the different alignment presets in bowtie2, and vanilla star
        from collections import namedtuple
        Options = namedtuple('Options', 'aligner assets_index parameters')
        # Try various presets for bowtie2
        bowtie2_parameters = (('--very-fast', ), ('--fast', ),
                              ('--sensitive', ), ('--very-sensitive', ))
        options = [Options(bowtie2align,
                           task_index_bowtie2.call.assets.target,
                           parameters)
                   for parameters in bowtie2_parameters]

        # add bowtie
        options.append(Options(bowtie1align,
                               task_index_bowtie1.call.assets.target,
                               tuple()))
        # add STAR (vanilla, no specific options beside the size of index k-mers)
        options.append(Options(staralign,
                               task_index_star.call.assets.target,
                               ('--genomeChrBinNbits', '12')))
        # add TopHat2
        #options.append(Options(tophat2, task_index_bowtie2.call.assets.target, tuple()))

        # loop over the options
        for option in options:
            sample_counts = list()
            # loop over the samples
            for sample_i in range(nsamples):
                read1_fh, read2_fh = samplereads[sample_i]
                # align
                Assets = option.aligner.Assets
                assets = Assets(Assets.Source(option.assets_index.indexfilepattern,
                                              rnaseq.FASTQPossiblyGzipCompressed(read1_fh.name),
                                              rnaseq.FASTQPossiblyGzipCompressed(read2_fh.name)),
                                Assets.Target.createundefined())
                task_align = project.add_task(option.aligner,
                                              assets,
                                              parameters=option.parameters)
                torun.append(task_align)

                # quantify
                # (non-default parameters to fit our demo GFF)
                Assets = featurecount.Assets
                assets = Assets(Assets.Source(task_align.call.assets.target.alignment,
                                              rnaseq.GFFFile(referenceannotation)),
                                Assets.Target.createundefined())
                task_quantify = project.add_task(featurecount,
                                                 assets,
                                                 parameters=('--gtf-featuretype', 'CDS',
                                                             '--gtf-attrtype', 'ID'))
                torun.append(task_quantify)

                # keep a pointer to the counts, as we will use it in the merge step
                sample_counts.append(task_quantify.call.assets)

            # merge the sample data into a table (so differential expression can be computed)
            Assets = merge.Assets
            counts = tuple(x.target.counts for x in sample_counts)
            source = Assets.Source(rnaseq.CSVFileSequence(counts))
            assets_merge = Assets(source,
                                  Assets.Target.createundefined())
            task_merge = project.add_task(merge,
                                          assets_merge,
                                          parameters=("0", "1"))
            torun.append(task_merge)

            # differential expression with edgeR, DESeq, DESeq2, and voom;
            # each tool is paired with the parameters it is run with
            # (a tool failing is tolerated when the tasks are run, below)
            for diffexp, params in ((edger, ()),
                                    (deseq, ('--dispersion-fittype=local', )),
                                    (deseq2, ()),
                                    (voom, ())):
                Assets = diffexp.Assets
                assets = Assets(Assets.Source(task_merge.call.assets.target.counts,
                                              core.File(sampleinfo_fh.name)),
                                Assets.Target.createundefined())
                # the parameters must be handed over, otherwise the ones
                # listed above are silently ignored
                task_de = project.add_task(diffexp,
                                           assets,
                                           parameters=params)
                torun.append(task_de)

        # run the tasks
        # (this is an integration test rather than a unit test - the
        # 3rd-party tools are often brittle and we want to keep the noise level down)
        env_log_level = environment.logger.level
        environment.logger.level = logging.ERROR
        try:
            for task in torun:
                if task.info[1] == hortator._TASK_DONE:
                    # nothing was run: there is no new status to record
                    continue
                try:
                    task.execute()
                    status = easy.hortator._TASK_DONE
                except Exception:
                    # a failing tool is recorded, not propagated
                    # (KeyboardInterrupt and SystemExit still get through)
                    status = easy.hortator._TASK_FAILED
                project.persistent_graph.step_concrete_state(
                    hortator.DbID(task.task_id, False),
                    easy.hortator._TASK_STATUS_LIST[status])
        finally:
            # restore the log level whatever happened
            environment.logger.level = env_log_level
Пример #7
0
    def test_RecipeSimple(self):
        """Build a simple recipe (index, align, quantify, merge, edgeR) and run it.

        The tasks collected in ``torun`` are executed, then the test checks
        that the project knows exactly one target for the DIFFEXP activity,
        that this target is stored as a ``core.File``, and that the task that
        produces it is the EdgeR step.
        """
        project = self.project
        env = self.env
        samplereads = self.samplereads
        sampleinfo_fh = self.sampleinfo_fh
        reference_fn = self.reference_fn
        referenceannotation = self.referenceannotation

        # -- recipesimple-test-begin

        # steps used
        bowtie2index = env.activities.INDEX.bowtie2build
        bowtie2align = env.activities.ALIGN.bowtie2
        htseqcount = env.activities.QUANTIFY.htseqcount
        merge = env.activities.UTILITY.columnmerger
        edger = env.activities.DIFFEXP.edger

        from railroadtracks import easy

        # sequence of tasks to run
        torun = list()

        # index for alignment
        Assets = bowtie2index.Assets
        assets = Assets(Assets.Source(rnaseq.FASTAFile(reference_fn)),
                        Assets.Target.createundefined())
        task_index = project.add_task(bowtie2index, assets)
        torun.append(task_index)

        # process all samples
        sample_counts = list()
        for read1_fh, read2_fh in samplereads:
            # align
            Assets = bowtie2align.Assets
            assets = Assets(Assets.Source(task_index.call.assets.target.indexfilepattern,
                                          rnaseq.FASTQPossiblyGzipCompressed(read1_fh.name),
                                          rnaseq.FASTQPossiblyGzipCompressed(read2_fh.name)),
                            Assets.Target.createundefined())
            task_align = project.add_task(bowtie2align, assets)
            torun.append(task_align)

            # quantify
            # (non-default parameters to fit our demo GFF)
            params = rnaseq.HTSeqCount._noexons_parameters
            Assets = htseqcount.Assets
            assets = Assets(Assets.Source(task_align.call.assets.target.alignment,
                                          rnaseq.GFFFile(referenceannotation)),
                            Assets.Target.createundefined())
            task_quantify = project.add_task(htseqcount,
                                             assets,
                                             parameters=params)
            torun.append(task_quantify)
            # keep a pointer to the counts,
            # as we will use them in the merge step
            sample_counts.append(task_quantify.call.assets)

        # merge the sample data into a table
        # (so differential expression can be computed)
        Assets = merge.Assets
        counts = tuple(x.target.counts for x in sample_counts)
        assets = Assets(Assets.Source(rnaseq.CSVFileSequence(counts)),
                        merge.Assets.Target.createundefined())
        task_merge = project.add_task(merge,
                                      assets,
                                      parameters=("0", "1"))
        torun.append(task_merge)

        # differential expression with edgeR
        Assets = edger.Assets
        assets = Assets(Assets.Source(task_merge.call.assets.target.counts,
                                      rnaseq.CSVFile(sampleinfo_fh.name)),
                        Assets.Target.createundefined())
        # NOTE(review): task_de is added to the project but never appended to
        # torun, so the loop below does not execute it - confirm whether
        # skipping the edgeR run is intended.
        task_de = project.add_task(edger,
                                   assets)

        # run the tasks
        for task in torun:
            # run only if not done
            if task.info[1] != hortator._TASK_DONE:
                task.execute()

        # get results
        final_storedentities = project.get_targetsofactivity(rnaseq.ACTIVITY.DIFFEXP)

        # get the step that created the results files
        final_steps = [
            project.persistent_graph.get_parenttask_of_storedentity(stored_entity)
            for stored_entity in final_storedentities
        ]

        # -- recipesimple-test-end

        self.assertEqual(1, len(final_storedentities))
        self.assertEqual(core.File.__name__, final_storedentities[0].clsname)
        self.assertEqual('railroadtracks.model.diffexp.EdgeR', final_steps[0].clsname)
Пример #8
0
    def _recipesimpleincremental(self, runtasks):
        """Rebuild the simple recipe five times, going one stage further each time.

        Every iteration re-adds the tasks from the start (index, then align,
        quantify, merge, edgeR), stops at the stage matching the iteration
        number, hands the tasks collected so far to ``runtasks``, and checks
        the number of concrete steps known to the project's persistent graph.

        :param runtasks: callable called with the list of tasks collected so
            far (``torun``); its return value is not used.
        """
        project = self.project
        env = self.env
        nsamples = self.nsamples
        samplereads = self.samplereads
        sampleinfo_fh = self.sampleinfo_fh
        reference_fn = self.reference_fn
        referenceannotation = self.referenceannotation
        PHAGEFASTA = self._PHAGEFASTA
        PHAGEGFF = self._PHAGEGFF
        
        # steps used
        bowtie2index = env.activities.INDEX.bowtie2build
        bowtie2align = env.activities.ALIGN.bowtie2
        htseqcount = env.activities.QUANTIFY.htseqcount
        merge = env.activities.UTILITY.columnmerger
        edger = env.activities.DIFFEXP.edger

        for iteration in range(5):
            # set as soon as this iteration has reached its last stage; needed
            # because a `continue` inside the sample loop below only moves on
            # to the next sample, not to the next iteration
            nextiteration = False
            # sequence of tasks to run
            torun = list()

            # index for alignment
            Assets = bowtie2index.Assets
            assets = Assets(Assets.Source(rnaseq.FASTAFile(reference_fn)),
                            Assets.Target.createundefined())
            task_index = project.add_task(bowtie2index, assets)
            torun.append(task_index)
            # iteration 0 stops here: only the index step is known
            if iteration < 1:
                nextiteration = True
                runtasks(torun)
                self.assertEqual(1, project.persistent_graph.nconcrete_steps)
                continue
            # process all samples
            sample_counts = list()
            for sample_i, (read1_fh, read2_fh) in enumerate(samplereads):
                # align
                Assets = bowtie2align.Assets
                assets = Assets(Assets.Source(task_index.call.assets.target.indexfilepattern, 
                                              rnaseq.FASTQPossiblyGzipCompressed(read1_fh.name),
                                              rnaseq.FASTQPossiblyGzipCompressed(read2_fh.name)),
                                Assets.Target.createundefined())
                task_align = project.add_task(bowtie2align, assets)
                torun.append(task_align)
                # iteration 1 stops after the alignment of each sample:
                # 1 index step + 1 alignment step per sample seen so far
                if iteration < 2:
                    nextiteration = True
                    runtasks(torun)
                    self.assertEqual(1+(sample_i+1), project.persistent_graph.nconcrete_steps)
                    continue

                # quantify
                # (non-default parameters to fit our demo GFF)
                params = rnaseq.HTSeqCount._noexons_parameters
                Assets = htseqcount.Assets
                assets = Assets(Assets.Source(task_align.call.assets.target.alignment,
                                              rnaseq.GFFFile(referenceannotation)),
                                Assets.Target.createundefined())
                task_quantify = project.add_task(htseqcount,
                                                 assets,
                                                 parameters=params)
                torun.append(task_quantify)
                # iteration 2 stops after the quantification of each sample:
                # 1 index step + all alignment steps + 1 quantification step
                # per sample seen so far
                if iteration < 3:
                    nextiteration = True
                    runtasks(torun)
                    self.assertEqual(1+len(samplereads)+(sample_i+1), 
                                     project.persistent_graph.nconcrete_steps)
                    continue

                # keep a pointer to the counts, as we will use it in the merge step
                sample_counts.append(task_quantify.call.assets)

            # the sample loop ended the work of this iteration (iterations 1 and 2)
            if nextiteration:
                continue
            # merge the sample data into a table (so differential expression can be computed)
            Assets = merge.Assets
            counts = tuple(x.target.counts for x in sample_counts)
            assets = Assets(Assets.Source(rnaseq.CSVFileSequence(counts)),
                            merge.Assets.Target.createundefined())

            task_merge = project.add_task(merge,
                                          assets,
                                          parameters=("0", "1"))
            torun.append(task_merge)
            # iteration 3 stops here:
            # 1 index step + 2 steps (align, quantify) per sample + 1 merge step
            if iteration < 4:
                nextiteration = True
                runtasks(torun)
                self.assertEqual(1+2*len(samplereads)+1, 
                                 project.persistent_graph.nconcrete_steps)
                continue

            # differential expression with edgeR
            Assets = edger.Assets
            assets = Assets(Assets.Source(task_merge.call.assets.target.counts,
                                          rnaseq.CSVFile(sampleinfo_fh.name)),
                            Assets.Target.createundefined())
            # NOTE(review): task_de is added to the project but not appended
            # to torun, so runtasks() below is not given it - confirm this is
            # intended.
            task_de = project.add_task(edger,
                                       assets)
            # always true within range(5): only iteration 4 gets this far
            if iteration < 5:
                nextiteration = True
                runtasks(torun)
                self.assertEqual(1+2*len(samplereads)+2, # 1 index + 2 steps (align, quantify) per sample + 1 merge + 1 differential expression
                                 project.persistent_graph.nconcrete_steps)
                continue