def _default_metadata():
    """Build placeholder sample metadata from the first non-empty file list.

    Walks the enclosing scope's ``maybe_seqs`` (an iterable of file-name
    collections) and, for the first collection that is truthy, returns one
    ``Sample`` namedtuple per file. The ``SampleID`` field is the file's
    base name after ``util.rmext(..., all=True)``.

    Returns ``None`` when every collection is empty.
    """
    sample_cls = namedtuple("Sample", ['SampleID'])
    for seq_list in maybe_seqs:
        if not seq_list:
            continue
        # return the first one that contains filenames
        sample_ids = (
            basename(util.rmext(fname, all=True)) for fname in seq_list
        )
        return [sample_cls(sample_id) for sample_id in sample_ids]
    return None
def _configure(self):
    """Yield, in order, every task of the closed-reference 16S pipeline.

    Stages:
      1. tasks from ``_handle_raw_seqs`` (when raw or raw-demuxed inputs
         are present) and from ``_demultiplex`` (when raw inputs are
         present);
      2. one closed-reference OTU picking task per demultiplexed fasta,
         whose ``otu_table.biom`` path is recorded in ``self.otu_tables``;
      3. a biom-to-tsv conversion per OTU table;
      4. a picrust task per OTU table.
    """
    # ensure all files are decompressed
    # possibly stitch paired reads, demultiplex, and quality filter
    has_raw_input = self.raw_seq_files or self.raw_demuxed_fastq_files
    if has_raw_input:
        for raw_task in self._handle_raw_seqs():
            yield raw_task
    if self.raw_seq_files:
        for demux_task in self._demultiplex():
            yield demux_task

    # do closed reference otu picking
    for fasta_fname in self.demuxed_fasta_files:
        parent_dir, leaf = os.path.split(fasta_fname)
        # Each fasta gets its own "<name>_otus" directory next to it.
        otu_dir = join(parent_dir, util.rmext(leaf) + "_otus")
        picking_opts = self.options.get('pick_otus_closed_ref', dict())
        yield sixteen.pick_otus_closed_ref(
            input_fname=fasta_fname, output_dir=otu_dir, **picking_opts)
        self.otu_tables.append(join(otu_dir, "otu_table.biom"))

    # convert biom file to tsv
    for biom_path in self.otu_tables:
        yield biom.biom_to_tsv(biom_path, biom_path + ".tsv")

    # infer genes and pathways with picrust
    for biom_path in self.otu_tables:
        picrust_opts = self.options.get('picrust', dict())
        yield sixteen.picrust(biom_path, **picrust_opts)
def _default_metadata():
    """Build placeholder sample metadata from the pipeline's sequence files.

    Looks up each attribute named in ``self.sequence_attrs`` on the
    enclosing scope's ``self`` (missing attributes count as empty). For
    the first attribute holding a truthy collection, returns one
    ``Sample`` namedtuple per file. The ``SampleID`` field is the file's
    base name after ``util.rmext(..., all=True)``.

    Returns ``None`` when no attribute holds any files.
    """
    sample_cls = namedtuple("Sample", ['SampleID'])
    for attr_name in self.sequence_attrs:
        seq_files = getattr(self, attr_name, None)
        if not seq_files:
            continue
        samples = []
        for fname in seq_files:
            sample_id = basename(util.rmext(fname, all=True))
            samples.append(sample_cls(sample_id))
        return samples
    return None
def _configure(self):
    """Yield the tasks that turn raw sequence files into count tables.

    Steps, in the order tasks are yielded:
      1. Optionally (``options['infer_pairs']['infer']``) regroup
         ``self.raw_seq_files`` with ``infer_pairs``.
      2. Classify every input as a pair, a BAM file or a single file;
         collect the resulting fastq paths in ``self.paired_fastq_files``
         / ``self.unpaired_fastq_files`` and yield the conversion tasks.
      3. One ``subread.align`` task per pair and per unpaired fastq; SAM
         paths are recorded in ``self.align_sams``.
      4. One ``subread.featureCounts`` task per SAM file; outputs are
         recorded in ``self.count_tables``.
    """
    if self.options['infer_pairs'].get('infer'):
        paired, notpaired = infer_pairs(self.raw_seq_files)
        self.raw_seq_files = paired + notpaired

    maybe_tasks = list()
    for maybe_pair in self.raw_seq_files:
        # isinstance (rather than an exact type() match) so tuple/list
        # subclasses such as namedtuples are still recognised as pairs
        # instead of being handed to guess_seq_filetype as one filename.
        is_pair = isinstance(maybe_pair, (tuple, list))
        if is_pair:
            pair, tasks = maybe_convert_to_fastq(maybe_pair,
                                                 self.products_dir)
            self.paired_fastq_files.append(pair)
            maybe_tasks.extend(tasks)
        elif util.guess_seq_filetype(maybe_pair) == 'bam':
            prefix = util.new_file(util.rmext(basename(maybe_pair)),
                                   basedir=self.products_dir)
            t = samtools.to_paired_fastq(maybe_pair, prefix)
            # The first two targets are used as the pair, the third as
            # the unpaired fastq.
            paired, single = t['targets'][:2], t['targets'][2]
            self.paired_fastq_files.append(paired)
            self.unpaired_fastq_files.append(single)
            maybe_tasks.append(t)
        else:
            single, tasks = maybe_convert_to_fastq([maybe_pair],
                                                   self.products_dir)
            self.unpaired_fastq_files.append(single[0])
            maybe_tasks.extend(tasks)

    # Conversion tasks are emitted only after every input was classified.
    for task in maybe_tasks:
        yield task

    align_opts_key = 'subread_align'
    for pair in self.paired_fastq_files:
        align_sam = util.new_file(_to_merged(basename(pair[0]), tag="align"),
                                  basedir=self.products_dir)
        align_sam += ".sam"
        self.align_sams.append(align_sam)
        yield subread.align(pair, align_sam,
                            self.options.get(align_opts_key, dict()))

    for single in self.unpaired_fastq_files:
        align_sam = util.new_file(util.addtag(basename(single), "align"),
                                  basedir=self.products_dir)
        align_sam += ".sam"
        self.align_sams.append(align_sam)
        yield subread.align(single, align_sam,
                            self.options.get(align_opts_key, dict()))

    for align_sam in self.align_sams:
        count_table = util.new_file(
            util.addtag(basename(align_sam), "count"),
            basedir=self.products_dir)
        self.count_tables.append(count_table)
        yield subread.featureCounts([align_sam], count_table,
                                    self.options.get('featureCounts', dict()))
def _maybe_mangle():
    """Rewrite ``output_fasta`` in place through ``sequence_convert -m``.

    Does nothing unless the enclosing scope's ``do_mangle`` is truthy and
    ``output_fasta`` exists with at least one byte. The value passed to
    ``-m`` is ``mangle_to``, or the extension-stripped output path when
    ``mangle_to`` is exactly ``False``. The converted sequences are
    written to ``<output_fasta>.tmp`` and then moved over the original.
    """
    if not do_mangle:
        return

    has_content = (os.path.exists(output_fasta)
                   and os.stat(output_fasta).st_size >= 1)
    if not has_content:
        return

    if mangle_to is False:
        tag = rmext(output_fasta, all=True)
    else:
        tag = mangle_to

    convert_cmd = (
        "sequence_convert -m {m} -f fasta -t fasta {o} > {o}.tmp".format(
            m=tag, o=output_fasta))
    CmdAction(convert_cmd, verbose=verbose).execute()

    # Replace the original only after the conversion command has run.
    move_cmd = "mv {o}.tmp {o}".format(o=output_fasta)
    CmdAction(move_cmd, verbose=verbose).execute()
def _configure(self):
    """Yield the tasks of the 16S pipeline in execution order.

    First yields whatever ``_handle_raw_seqs_and_demultiplex`` returns,
    then one OTU-picking task per demultiplexed fasta (recording each
    ``*_tax.biom`` path in ``self.otu_tables``), and finally one picrust
    task per recorded OTU table.
    """
    yield self._handle_raw_seqs_and_demultiplex()

    for fasta_fname in self.demuxed_fasta_files:
        # The OTU table is named after the fasta but lives in products_dir.
        biom_name = os.path.basename(util.rmext(fasta_fname) + "_tax.biom")
        biom_path = join(self.products_dir, biom_name)
        picking_opts = self.options.get('pick_otus_closed_ref', dict())
        yield pick_otus_closed_ref(fasta_fname, biom_path, **picking_opts)
        self.otu_tables.append(biom_path)

    # infer genes and pathways with picrust
    for biom_path in self.otu_tables:
        picrust_opts = self.options.get('picrust', dict())
        yield sixteen.picrust(biom_path, **picrust_opts)
def assign_taxonomy(in_fasta, out_dir, qiime_opts=None):
    """Build the task that assigns taxonomy to a fasta file with QIIME.

    :param in_fasta: String; path to the input fasta file.
    :param out_dir: String; directory handed to the script's ``-o``
                    option. The target is
                    ``<out_dir>/<in_fasta base name>_tax_assignments.txt``.
    :keyword qiime_opts: Dictionary; extra command line options. They are
                         merged over the defaults ``r`` (reference
                         sequences) and ``t`` (taxonomy), both taken from
                         ``settings.workflows.sixteen``.

    :returns: a task dictionary with ``name``, ``targets``, ``actions``
              and ``file_dep`` keys.
    """
    name = rmext(os.path.basename(in_fasta)) + "_tax_assignments.txt"
    taxonomy_out = os.path.join(out_dir, name)

    opts = {
        "r": settings.workflows.sixteen.otu_refseq,
        "t": settings.workflows.sixteen.otu_taxonomy,
    }
    # None instead of a shared mutable {} default; user options win.
    opts.update(qiime_opts or {})

    # The command must start with the executable: a string beginning with
    # " -i" cannot be run by a shell. The flags and the
    # "_tax_assignments.txt" output name are those of QIIME's
    # assign_taxonomy.py.
    cmd = ("assign_taxonomy.py" +
           " -i " + in_fasta +
           " -o " + out_dir +
           " " + dict_to_cmd_opts(opts))

    return {
        "name": "assign_taxonomy: " + taxonomy_out,
        "targets": [taxonomy_out],
        "actions": [cmd],
        "file_dep": [opts['r'], opts['t'], in_fasta],
    }
def split_illumina_style(self, seqfiles_to_split, barcode_seqfiles):
    """Create the tasks that demultiplex illumina-style sequence files.

    For each (sequence file, barcode file) pair this adds a map-writing
    task and a ``demultiplex_illumina`` task, both working in
    ``<products_dir>/<seqfile base name>_split``. When the
    ``demultiplex_illumina`` options contain a truthy
    ``group_by_sampleid``, one extra task regroups the demultiplexed
    files under ``<products_dir>/demuxed_by-sampleid``.

    :param seqfiles_to_split: sequence files to demultiplex.
    :param barcode_seqfiles: barcode files, matched to
                             ``seqfiles_to_split`` by position.
    :returns: ``(demuxed, tasks)``: the demultiplexed output paths (or
              the grouping task's targets when grouping is enabled) and
              the list of task dictionaries that produce them.
    """
    demuxed, tasks = list(), list()

    # Work on a copy: adding 'barcode_type' and popping
    # 'group_by_sampleid' must not alter the pipeline's shared options,
    # otherwise a second call would silently stop grouping.
    options = dict(self.options.get("demultiplex_illumina", dict()))
    if 'barcode_type' not in options:
        options['barcode_type'] = _determine_barcode_type(
            self.sample_metadata)
    # Popped so it is not forwarded to the demultiplexing script.
    do_groupby = options.pop("group_by_sampleid", False)

    sample_group = []
    for seqfile, bcode_file in zip(seqfiles_to_split, barcode_seqfiles):
        sample_dir = join(self.products_dir, basename(seqfile) + "_split")
        map_fname = util.new_file("map.txt", basedir=sample_dir)
        sample_group = self._filter_samples_for_file(
            self.sample_metadata, seqfile,
            key=lambda val: val.Run_accession)
        tasks.append(
            sixteen.write_map(sample_group, sample_dir,
                              **self.options.get('write_map', dict())))
        outfile = util.new_file(
            util.rmext(basename(seqfile)) + "_demuxed.fna",
            basedir=sample_dir)
        tasks.append(
            sixteen.demultiplex_illumina([seqfile], [bcode_file],
                                         map_fname, outfile,
                                         qiime_opts=options))
        demuxed.append(outfile)

    # Skip grouping when nothing was demultiplexed; there is no sample
    # group to read ids from in that case.
    if do_groupby and demuxed:
        output_dir = join(self.products_dir, "demuxed_by-sampleid")
        # NOTE(review): only the samples of the last sequence file are
        # used here, as before -- confirm this is intended when several
        # files carry different samples.
        sample_ids = [s[0] for s in sample_group]
        task_dict = general.group_by_sampleid(demuxed, output_dir,
                                              sample_ids)
        demuxed = task_dict['targets']
        tasks.append(task_dict)

    return demuxed, tasks
def pick_otus_closed_ref(input_fname, output_dir, verbose=None,
                         qiime_opts=None):
    """Workflow to perform OTU picking, generates a biom-formatted OTU
    table from demultiplexed 16S reads. This workflow (in general
    terms) wraps qiime's pick_closed_reference_otus.py, which itself
    wraps either uclust or usearch. Note that uclust and usearch require
    a fairly large memory footprint (1.5-2.0G in some cases).

    The script is first run on the input as given. A second attempt on
    the reverse complement of the input is handed to
    ``strategies.backup`` together with a check for an empty output
    table.

    :param input_fname: String; File path to the input,
                        fasta-formatted 16S sequences
    :param output_dir: String; Path to the directory where the output OTU
                       table will be saved as 'otu_table.biom'. Other
                       qiime-specific logs will go there, too.
    :keyword verbose: Boolean: set to true to print the commands that are
                      run as they are run. Defaults to
                      ``settings.workflows.verbose``.
    :keyword qiime_opts: Dictionary; A dictionary of command line options to
                         be passed to the wrapped script. No - or -- flags
                         are necessary; the correct - or -- flags are
                         inferred based on the length of the option. For
                         boolean options, use the key/value pattern
                         of { "my-option": "" }.

    :returns: a task dictionary with ``name``, ``actions``, ``targets``,
              ``file_dep`` and ``title`` keys.

    External dependencies:
      - Qiime 1.8.0:
      - USEARCH: (only if using the usearch option)

    Resource utilization:
      - RAM: >1.5 G

    """
    output_fname = new_file("otu_table.biom", basedir=output_dir)
    revcomp_fname = new_file(
        os.path.basename(rmext(input_fname)) + "_revcomp.fna",
        basedir=os.path.dirname(input_fname))

    verbose = settings.workflows.verbose if verbose is None else verbose

    default_opts = {
        "taxonomy_fp": settings.workflows.sixteen.otu_taxonomy,
        "reference_fp": settings.workflows.sixteen.otu_refseq,
    }
    # None instead of a shared mutable {} default; user options win.
    default_opts.update(qiime_opts or {})
    opts = dict_to_cmd_opts(default_opts)

    def otu_picking_cmd(fasta_fname):
        # Plain concatenation instead of str.format, so braces inside
        # output_dir or the options cannot be mistaken for placeholders.
        # The command must start with the executable name.
        return ("pick_closed_reference_otus.py" +
                " --input_fp=" + fasta_fname +
                " --output_dir=" + output_dir +
                " -f" +
                " " + opts)

    revcomp_cmd = ("sequence_convert" +
                   " --format=fasta" +
                   " --to=fasta " +
                   " -r" +
                   " " + input_fname +
                   " > " + revcomp_fname)

    def run(targets):
        strategies.backup(
            (CmdAction(otu_picking_cmd(input_fname), verbose=verbose),
             strategies.Group(
                 CmdAction(revcomp_cmd, verbose=verbose),
                 CmdAction(otu_picking_cmd(revcomp_fname),
                           verbose=verbose))),
            # Extra check evaluated on the output table: true when the
            # table file is empty.
            extra_conditions=[
                lambda ret, output_fname:
                    os.stat(output_fname).st_size == 0
            ],
            output_fname=output_fname,
        )

    return {
        "name": "pick_otus_closed_ref:" + input_fname,
        "actions": [run],
        "targets": [output_fname],
        "file_dep": [input_fname],
        "title": lambda t: " Estimated mem=3000"
    }
def _configure(self):
    """Yield the tasks of the whole-genome-shotgun pipeline in order.

    Stages:
      1. For every attribute in ``self.sequence_attrs``: optionally
         regroup its files with ``infer_pairs``, pass them through
         ``maybe_concatenate``, store the result back on ``self`` and
         yield the tasks that produced it.
      2. Convert every non-fastq file in ``self.raw_seq_files`` with
         ``general.sequence_convert``; fastq inputs are used as-is.
      3. One ``wgs.knead_data`` task per intermediate fastq file.
      4. Per decontaminated fastq file: a ``wgs.metaphlan2`` task
         (results recorded in ``self.metaphlan_results`` and
         ``self.otu_tables``) followed by a ``wgs.humann2`` task.
    """
    products_dir = self.products_dir

    for attr in self.sequence_attrs:
        seq_set = getattr(self, attr)
        if self.options['infer_pairs'].get('infer'):
            paired, notpaired = infer_pairs(seq_set)
            seq_set = paired + notpaired
        seq_set, concat_tasks = maybe_concatenate(seq_set, products_dir)
        setattr(self, attr, seq_set)
        for concat_task in concat_tasks:
            yield concat_task

    for raw_file in self.raw_seq_files:
        if util.guess_seq_filetype(raw_file) == "fastq":
            fastq_file = raw_file
        else:
            fastq_file = util.new_file(
                basename(raw_file) + "_filtered.fastq",
                basedir=products_dir)
            convert_opts = self.options.get('sequence_convert', dict())
            yield general.sequence_convert(
                [raw_file], fastq_file, **convert_opts)
        self.intermediate_fastq_files.append(fastq_file)

    for fastq_file in self.intermediate_fastq_files:
        name_base = util.new_file(util.rmext(fastq_file, all=True),
                                  basedir=products_dir)
        knead_opts = self.options.get('decontaminate', {})
        # Only the first task produced by knead_data is used.
        knead_task = next(
            wgs.knead_data([fastq_file], name_base, **knead_opts))
        self.decontaminated_fastq_files.append(knead_task['targets'][0])
        yield knead_task

    for d_fastq in self.decontaminated_fastq_files:
        metaphlan_file = util.new_file(
            basename(d_fastq) + ".metaphlan2.tsv", basedir=products_dir)
        otu_table = metaphlan_file.replace('.tsv', '.biom')
        # first index is for first item in list of samples
        # second index is to get the sample id from the sample
        matching = self._filter_samples_for_file(self.sample_metadata,
                                                 d_fastq)
        sample_id = matching[0][0]
        yield wgs.metaphlan2(
            [d_fastq],
            output_file=metaphlan_file,
            biom=otu_table,
            sample_id=sample_id,
            input_type="multifastq",
            **self.options.get('metaphlan2', dict())
        )
        self.metaphlan_results.append(metaphlan_file)
        self.otu_tables.append(otu_table)

        # Finally, HUMAnN all alignment files
        humann_output_dir = util.new_file(
            util.rmext(basename(d_fastq), all=True) + "_humann",
            basedir=products_dir)
        yield wgs.humann2(d_fastq, humann_output_dir,
                          **self.options.get('humann', dict()))