Exemplo n.º 1
0
 def _default_metadata():
     """Build placeholder sample records from sequence file names.

     Takes the first non-empty entry of the enclosing scope's
     ``maybe_seqs`` and returns one single-field ``Sample`` namedtuple
     per file in it; the ``SampleID`` is the file's base name after
     ``util.rmext(..., all=True)``.  Returns ``None`` when every entry
     of ``maybe_seqs`` is empty.
     """
     Sample = namedtuple("Sample", ['SampleID'])
     # only the first entry that actually contains filenames is used
     first_nonempty = next(iter(filter(bool, maybe_seqs)), None)
     if first_nonempty is None:
         return None
     return [
         Sample(basename(util.rmext(path, all=True)))
         for path in first_nonempty
     ]
Exemplo n.º 2
0
    def _configure(self):
        """Yield this pipeline's tasks in the order they are produced.

        Stages, in order:
          1. raw-sequence handling, when ``raw_seq_files`` or
             ``raw_demuxed_fastq_files`` is non-empty;
          2. demultiplexing, when ``raw_seq_files`` is non-empty;
          3. closed-reference OTU picking for each file in
             ``demuxed_fasta_files`` (the resulting table path is
             appended to ``otu_tables``);
          4. biom-to-tsv conversion of every OTU table;
          5. picrust on every OTU table.
        """
        has_raw_input = self.raw_seq_files or self.raw_demuxed_fastq_files
        if has_raw_input:
            for raw_task in self._handle_raw_seqs():
                yield raw_task
        if self.raw_seq_files:
            for demux_task in self._demultiplex():
                yield demux_task

        # closed reference otu picking; each fasta file gets its own
        # "<name>_otus" directory next to the fasta file
        for fasta_path in self.demuxed_fasta_files:
            fasta_dir, fasta_base = os.path.split(fasta_path)
            otu_dir = join(fasta_dir, util.rmext(fasta_base) + "_otus")
            pick_opts = self.options.get('pick_otus_closed_ref', dict())
            yield sixteen.pick_otus_closed_ref(input_fname=fasta_path,
                                               output_dir=otu_dir,
                                               **pick_opts)
            self.otu_tables.append(join(otu_dir, "otu_table.biom"))

        # tab-separated copy of each biom table
        for biom_path in self.otu_tables:
            yield biom.biom_to_tsv(biom_path, biom_path + ".tsv")

        # infer genes and pathways with picrust
        for biom_path in self.otu_tables:
            picrust_opts = self.options.get('picrust', dict())
            yield sixteen.picrust(biom_path, **picrust_opts)
Exemplo n.º 3
0
 def _default_metadata():
     """Build placeholder sample records from the first populated
     sequence attribute.

     Walks ``self.sequence_attrs`` in order, reads each named attribute
     from ``self`` (missing attributes count as empty) and, for the
     first one that is non-empty, returns one single-field ``Sample``
     namedtuple per file.  The ``SampleID`` is the file's base name
     after ``util.rmext(..., all=True)``.  Returns ``None`` when no
     attribute holds any files.
     """
     Sample = namedtuple("Sample", ['SampleID'])
     for attr_name in self.sequence_attrs:
         seq_files = getattr(self, attr_name, None)
         if not seq_files:
             continue
         return [
             Sample(basename(util.rmext(path, all=True)))
             for path in seq_files
         ]
Exemplo n.º 4
0
    def _configure(self):
        """Yield the tasks of this pipeline: conversion to fastq,
        alignment, and feature counting.

        Stages, in the order their tasks are yielded:
          1. If ``options['infer_pairs']['infer']`` is truthy,
             ``raw_seq_files`` is replaced by the paired plus unpaired
             groups returned by ``infer_pairs``.
          2. Every entry of ``raw_seq_files`` is routed to
             ``paired_fastq_files`` and/or ``unpaired_fastq_files``:
             tuple/list entries are treated as pairs, bam files are
             split with ``samtools.to_paired_fastq``, anything else is
             treated as a single file.
          3. One ``subread.align`` task per pair and per single file;
             the output sam paths are collected in ``align_sams``.
          4. One ``subread.featureCounts`` task per sam file; the
             output paths are collected in ``count_tables``.
        """
        if self.options['infer_pairs'].get('infer'):
            paired, notpaired = infer_pairs(self.raw_seq_files)
            self.raw_seq_files = paired + notpaired

        maybe_tasks = list()
        for maybe_pair in self.raw_seq_files:
            # isinstance rather than an exact type() match, so that
            # tuple/list subclasses (e.g. namedtuple pairs) are still
            # recognized as pairs instead of being handed to
            # guess_seq_filetype as if they were a single file name.
            is_pair = isinstance(maybe_pair, (tuple, list))
            if is_pair:
                pair, tasks = maybe_convert_to_fastq(maybe_pair,
                                                     self.products_dir)
                self.paired_fastq_files.append(pair)
                maybe_tasks.extend(tasks)
            elif util.guess_seq_filetype(maybe_pair) == 'bam':
                prefix = util.new_file(util.rmext(basename(maybe_pair)),
                                       basedir=self.products_dir)
                t = samtools.to_paired_fastq(maybe_pair, prefix)
                # the first two targets are the paired files, the third
                # is the file of unpaired reads
                paired, single = t['targets'][:2], t['targets'][2]
                self.paired_fastq_files.append(paired)
                self.unpaired_fastq_files.append(single)
                maybe_tasks.append(t)
            else:
                single, tasks = maybe_convert_to_fastq([maybe_pair],
                                                       self.products_dir)
                self.unpaired_fastq_files.append(single[0])
                maybe_tasks.extend(tasks)

        # conversion tasks are emitted before any alignment task
        for task in maybe_tasks:
            yield task

        for pair in self.paired_fastq_files:
            # the sam file name is derived from the first mate's name
            align_sam = util.new_file(_to_merged(basename(pair[0]),
                                                 tag="align"),
                                      basedir=self.products_dir)
            align_sam += ".sam"
            self.align_sams.append(align_sam)
            yield subread.align(pair, align_sam,
                                self.options.get('subread_align', dict()))

        for single in self.unpaired_fastq_files:
            align_sam = util.new_file(util.addtag(basename(single), "align"),
                                      basedir=self.products_dir)
            align_sam += ".sam"
            self.align_sams.append(align_sam)
            yield subread.align(single, align_sam,
                                self.options.get('subread_align', dict()))

        for align_sam in self.align_sams:
            count_table = util.new_file(util.addtag(basename(align_sam),
                                                    "count"),
                                        basedir=self.products_dir)
            self.count_tables.append(count_table)
            yield subread.featureCounts([align_sam], count_table,
                                        self.options.get(
                                            'featureCounts', dict()))
0
 def _maybe_mangle():
     """Rewrite ``output_fasta`` in place through ``sequence_convert -m``.

     Does nothing unless ``do_mangle`` is truthy and ``output_fasta``
     exists with a size of at least one byte.  The value passed to
     ``-m`` is ``mangle_to``, or ``rmext(output_fasta, all=True)`` when
     ``mangle_to`` is exactly ``False``.  The converted output is
     written to ``<output_fasta>.tmp`` and then moved over the original.
     """
     if not do_mangle:
         return
     output_is_usable = (os.path.exists(output_fasta)
                         and os.stat(output_fasta).st_size >= 1)
     if not output_is_usable:
         return

     if mangle_to is False:
         mangle_name = rmext(output_fasta, all=True)
     else:
         mangle_name = mangle_to

     convert_cmd = (
         "sequence_convert -m {m} -f fasta -t fasta {o} > {o}.tmp".format(
             m=mangle_name, o=output_fasta))
     rename_cmd = "mv {o}.tmp {o}".format(o=output_fasta)
     # convert into the temporary file first, then replace the original
     for shell_cmd in (convert_cmd, rename_cmd):
         CmdAction(shell_cmd, verbose=verbose).execute()
Exemplo n.º 6
0
    def _configure(self):
        """Yield this pipeline's tasks in the order they are produced.

        First yields whatever ``_handle_raw_seqs_and_demultiplex()``
        returns, then one closed-reference OTU picking task per file in
        ``demuxed_fasta_files`` (each resulting table path is appended
        to ``otu_tables``), then one picrust task per OTU table.
        """
        yield self._handle_raw_seqs_and_demultiplex()

        # each table is named "<fasta name>_tax.biom" inside products_dir
        for fasta_path in self.demuxed_fasta_files:
            table_name = os.path.basename(
                util.rmext(fasta_path) + "_tax.biom")
            biom_path = join(self.products_dir, table_name)
            pick_opts = self.options.get('pick_otus_closed_ref', dict())
            yield pick_otus_closed_ref(fasta_path, biom_path, **pick_opts)
            self.otu_tables.append(biom_path)

        # infer genes and pathways with picrust
        for biom_path in self.otu_tables:
            picrust_opts = self.options.get('picrust', dict())
            yield sixteen.picrust(biom_path, **picrust_opts)
Exemplo n.º 7
0
def assign_taxonomy(in_fasta, out_dir, qiime_opts=None):
    """Build a task dictionary that runs qiime's ``assign_taxonomy.py``.

    :param in_fasta: String; path to the input fasta file (passed to
                     ``-i``).
    :param out_dir: String; output directory (passed to ``-o``). The
                    task's target is
                    ``<in_fasta base name, extension removed>_tax_assignments.txt``
                    inside this directory.
    :keyword qiime_opts: Dictionary; extra command line options. They
                         are merged over the defaults ``r`` and ``t``
                         (taken from
                         ``settings.workflows.sixteen.otu_refseq`` and
                         ``settings.workflows.sixteen.otu_taxonomy``)
                         and rendered with ``dict_to_cmd_opts``.
                         ``None`` (the default) means no extra options.

    :returns: Dictionary with the keys ``name``, ``targets``,
              ``actions`` and ``file_dep``.
    """
    name = rmext(os.path.basename(in_fasta))+"_tax_assignments.txt"
    taxonomy_out = os.path.join(out_dir, name)

    default_opts = {
        "r": settings.workflows.sixteen.otu_refseq,
        "t": settings.workflows.sixteen.otu_taxonomy,
    }
    # caller-supplied options win over the defaults
    if qiime_opts is not None:
        default_opts.update(qiime_opts)

    cmd = ("assign_taxonomy.py -i "+in_fasta+" -o "+out_dir+
           " "+dict_to_cmd_opts(default_opts))

    # the reference and taxonomy files actually used (possibly
    # overridden by the caller) are tracked as dependencies
    return { "name"     : "assign_taxonomy: "+taxonomy_out,
             "targets"  : [taxonomy_out],
             "actions"  : [cmd],
             "file_dep" : [default_opts['r'], default_opts['t'], in_fasta] }
Exemplo n.º 8
0
    def split_illumina_style(self, seqfiles_to_split, barcode_seqfiles):
        """Create the tasks that demultiplex illumina-style sequence files.

        ``seqfiles_to_split`` and ``barcode_seqfiles`` are walked in
        lockstep. For every (sequence file, barcode file) pair a
        ``<sequence file name>_split`` directory under ``products_dir``
        receives a map-writing task and a demultiplexing task.

        Options are read from ``self.options['demultiplex_illumina']``:
          - ``barcode_type``: determined from ``self.sample_metadata``
            when absent;
          - ``group_by_sampleid``: when truthy, one extra task regroups
            the demultiplexed output by sample id. It is removed from
            the options before they are handed on as ``qiime_opts``.

        :param seqfiles_to_split: sequence files to demultiplex
        :param barcode_seqfiles: barcode files, one per sequence file
        :returns: ``(demuxed, tasks)``; the demultiplexed file paths (or
                  the grouping task's targets when grouping is on) and
                  the list of task dictionaries. Both are empty when no
                  input pairs are given.
        """
        demuxed, tasks = list(), list()
        bcode_pairs = zip(seqfiles_to_split, barcode_seqfiles)

        # Work on a copy: the pop() and the barcode_type default below
        # must not alter the dict stored in self.options, otherwise a
        # second call would no longer see 'group_by_sampleid'.
        options = dict(self.options.get("demultiplex_illumina", dict()))
        if 'barcode_type' not in options:
            options['barcode_type'] = _determine_barcode_type(
                self.sample_metadata)

        do_groupby = options.pop("group_by_sampleid", False)
        for seqfile, bcode_file in bcode_pairs:
            sample_dir = join(self.products_dir, basename(seqfile) + "_split")

            map_fname = util.new_file("map.txt", basedir=sample_dir)
            sample_group = self._filter_samples_for_file(
                self.sample_metadata,
                seqfile,
                key=lambda val: val.Run_accession)
            tasks.append(
                sixteen.write_map(sample_group, sample_dir,
                                  **self.options.get('write_map', dict())))

            outfile = util.new_file(util.rmext(basename(seqfile)) +
                                    "_demuxed.fna",
                                    basedir=sample_dir)
            tasks.append(
                sixteen.demultiplex_illumina([seqfile], [bcode_file],
                                             map_fname,
                                             outfile,
                                             qiime_opts=options))
            demuxed.append(outfile)

        # Nothing to group when no file was processed; this also keeps
        # sample_group from being referenced before assignment.
        if do_groupby and demuxed:
            output_dir = join(self.products_dir, "demuxed_by-sampleid")
            # NOTE(review): only the sample group of the LAST sequence
            # file is used here -- confirm this is intended when several
            # files are demultiplexed.
            sample_ids = [s[0] for s in sample_group]
            task_dict = general.group_by_sampleid(demuxed, output_dir,
                                                  sample_ids)
            demuxed = task_dict['targets']
            tasks.append(task_dict)

        return demuxed, tasks
Exemplo n.º 9
0
def pick_otus_closed_ref(input_fname, output_dir, verbose=None,
                         qiime_opts=None):
    """Workflow to perform OTU picking, generates a biom-formatted OTU
    table from demultiplexed 16S reads. This workflow (in general
    terms) wraps qiime's pick_closed_reference_otus.py, which itself
    wraps either uclust or usearch. Note that uclust and usearch
    require a fairly large memory footprint (1.5-2.0G in some cases).

    The command is first run on ``input_fname``. A second strategy is
    registered with ``strategies.backup``: reverse-complement the input
    with ``sequence_convert`` into ``<input>_revcomp.fna`` (next to the
    input file) and run the command on that file. An empty output table
    is passed to ``strategies.backup`` as an extra condition.

    :param input_fname: String; File path to the input,
                        fasta-formatted 16S sequences
    :param output_dir: String; Path to the directory where the output OTU
                       table will be saved as 'otu_table.biom'. Other
                       qiime-specific logs will go there, too.
    :keyword verbose: Boolean: set to true to print the commands that are
                      run as they are run. Defaults to
                      ``settings.workflows.verbose`` when None.
    :keyword qiime_opts: Dictionary; A dictionary of command line options to
                         be passed to the wrapped
                         pick_closed_reference_otus.py script.
                         No - or -- flags are necessary; the correct - or --
                         flags are inferred based on the length of the option.
                         For boolean options, use the key/value pattern
                         of { "my-option": "" }. The options are merged
                         over the defaults ``taxonomy_fp`` and
                         ``reference_fp`` taken from
                         ``settings.workflows.sixteen``. None (the
                         default) means no extra options.

    :returns: Dictionary with the keys ``name``, ``actions``,
              ``targets``, ``file_dep`` and ``title``.

    External dependencies:
      - Qiime 1.8.0: https://github.com/qiime/qiime-deploy
      - USEARCH: (only if using the usearch option)
        http://www.drive5.com/usearch/

    Resource utilization:
      - RAM: >1.5 G

    """

    output_fname = new_file("otu_table.biom", basedir=output_dir)
    revcomp_fname = new_file(
        os.path.basename(rmext(input_fname))+"_revcomp.fna",
        basedir=os.path.dirname(input_fname))

    verbose = settings.workflows.verbose if verbose is None else verbose

    default_opts = {
        "taxonomy_fp": settings.workflows.sixteen.otu_taxonomy,
        "reference_fp": settings.workflows.sixteen.otu_refseq
    }
    # caller-supplied options win over the defaults
    if qiime_opts is not None:
        default_opts.update(qiime_opts)
    opts = dict_to_cmd_opts(default_opts)

    # "{}" is filled in later with either the original or the
    # reverse-complemented input file
    cmd = ("pick_closed_reference_otus.py"+
           " --input_fp={}"+
           " --output_dir="+output_dir+
           " -f"+
           " "+opts)

    revcomp_cmd = ("sequence_convert"+
                   " --format=fasta"+
                   " --to=fasta "+
                   " -r"+
                   " "+input_fname+
                   " > "+revcomp_fname)

    def run(targets):
        # Strategies, in order: the input as given, then the
        # reverse-complemented input. All three commands honour
        # `verbose`.
        strategies.backup(
            (CmdAction(cmd.format(input_fname),verbose=verbose),
             strategies.Group(
                 CmdAction(revcomp_cmd,verbose=verbose),
                 CmdAction(cmd.format(revcomp_fname),verbose=verbose))),
            extra_conditions = [
                # NOTE(review): os.stat raises if the table was never
                # created -- confirm strategies.backup tolerates that.
                lambda ret, output_fname: os.stat(output_fname).st_size == 0
            ],
            output_fname=output_fname,
        )

    return {
        "name": "pick_otus_closed_ref:"+input_fname,
        "actions": [run],
        "targets": [output_fname],
        "file_dep": [input_fname],
        "title": lambda t: t.name+" Estimated mem=3000"
    }
Exemplo n.º 10
0
    def _configure(self):
        """Yield this pipeline's tasks in the order they are produced.

        Stages, in order:
          1. For each attribute named in ``sequence_attrs``: optionally
             regroup its files with ``infer_pairs`` (when
             ``options['infer_pairs']['infer']`` is truthy), pass them
             through ``maybe_concatenate``, store the result back on
             the attribute and yield any resulting tasks.
          2. Convert every file in ``raw_seq_files`` that is not
             already fastq; collect the fastq paths in
             ``intermediate_fastq_files``.
          3. One ``wgs.knead_data`` task per fastq file; its first
             target is collected in ``decontaminated_fastq_files``.
          4. Per decontaminated file: a metaphlan2 task followed by a
             humann2 task.
        """
        for attr_name in self.sequence_attrs:
            seqs = getattr(self, attr_name)

            if self.options['infer_pairs'].get('infer'):
                paired, unpaired = infer_pairs(seqs)
                seqs = paired + unpaired

            seqs, concat_tasks = maybe_concatenate(seqs, self.products_dir)
            setattr(self, attr_name, seqs)
            for concat_task in concat_tasks:
                yield concat_task

        for raw_file in self.raw_seq_files:
            if util.guess_seq_filetype(raw_file) == "fastq":
                # already fastq: use the file as it is
                fastq_path = raw_file
            else:
                fastq_path = util.new_file(
                    basename(raw_file)+"_filtered.fastq",
                    basedir=self.products_dir)
                convert_opts = self.options.get('sequence_convert', dict())
                yield general.sequence_convert([raw_file], fastq_path,
                                               **convert_opts)
            self.intermediate_fastq_files.append(fastq_path)

        for fastq_path in self.intermediate_fastq_files:
            name_base = util.new_file(util.rmext(fastq_path, all=True),
                                      basedir=self.products_dir)
            knead_opts = self.options.get('decontaminate', {})
            # only the first task produced by knead_data is used
            knead_task = next(
                wgs.knead_data([fastq_path], name_base, **knead_opts))
            self.decontaminated_fastq_files.append(
                knead_task['targets'][0])
            yield knead_task

        for clean_fastq in self.decontaminated_fastq_files:
            metaphlan_tsv = util.new_file(
                basename(clean_fastq)+".metaphlan2.tsv",
                basedir=self.products_dir)
            metaphlan_biom = metaphlan_tsv.replace('.tsv', '.biom')
            # first index: first sample matched to this file;
            # second index: the sample id field of that sample
            matching_samples = self._filter_samples_for_file(
                self.sample_metadata, clean_fastq)
            sample_id = matching_samples[0][0]
            metaphlan_opts = self.options.get('metaphlan2', dict())
            yield wgs.metaphlan2([clean_fastq],
                                 output_file=metaphlan_tsv,
                                 biom=metaphlan_biom,
                                 sample_id=sample_id,
                                 input_type="multifastq",
                                 **metaphlan_opts)
            self.metaphlan_results.append(metaphlan_tsv)
            self.otu_tables.append(metaphlan_biom)

            # finally, run HUMAnN2 on the same decontaminated file
            humann_dir = util.new_file(
                util.rmext(basename(clean_fastq), all=True)+"_humann",
                basedir=self.products_dir)
            humann_opts = self.options.get('humann', dict())
            yield wgs.humann2(clean_fastq, humann_dir, **humann_opts)