class Pyrosample(object):
    """A Pyrosample is a legacy object for the few 454 samples we still have
    and that we need to compare against the new Illumina technology.

    Creating an instance only parses the JSON description of the sample.
    Everything that needs the output directory is set up later by `load()`.
    """

    all_paths = """
    /info.json
    /reads.fasta
    /renamed.fasta
    /raw/raw.sff
    /raw/raw.fastq
    /raw/raw.fasta
    /raw/raw.qual
    /raw/manifest.txt
    /fastq/reads.fastq
    """

    kind = "pyrosample"

    def __repr__(self):
        return '<%s object "%s">' % (self.__class__.__name__, self.id_name)

    def __init__(self, json_path, out_dir):
        """Parse the sample description found at `json_path`.

        :param json_path: Path to the JSON file describing this sample.
        :param out_dir: Directory prefix under which `load()` will place
                        this sample's own directory.
        """
        # Attributes #
        self.out_dir = out_dir
        self.json_path = FilePath(json_path)
        # Parse #
        self.info = load_json_path(self.json_path)
        # Basic #
        self.account = "/dev/null"
        self.run_num = self.info['run_num']
        self.run_label = "pyrosample_run_%i" % self.run_num
        self.project_short_name = self.info['project']
        self.project_long_name = self.info['project_name']
        # Own attributes #
        self.num = self.info['sample_num']
        self.short_name = self.info['sample']
        self.long_name = self.info['sample_name']
        self.name = 'run%i_sample%i' % (self.run_num, self.num)
        self.group = self.info['group']
        self.id_name = "run%03d-sample%02d" % (self.run_num, self.num)
        # Hard coded attributes #
        self.machine = "454 GS FLX Titanium"
        # SFF files #
        self.sff_files_info = self.info['files']
        # Pool dummy: this sample stands in as its own pool and parent #
        self.pool, self.parent = self, self
        # Other dummy variables #
        self.bar_len = 0
        self.gzipped = False
        self.used = True
        # Loaded #
        self.loaded = False

    def load(self):
        """A second __init__ that is delayed and called only if needed.

        :returns: `self`, for convenience.
        :raises Exception: if one of the listed SFF files does not exist.
        """
        # Check files are there #
        for f in self.sff_files_info:
            if not os.path.exists(f['path']):
                raise Exception("No file at %s" % f['path'])
        # Automatic paths #
        self.base_dir = self.out_dir + self.id_name + '/'
        self.p = AutoPaths(self.base_dir, self.all_paths)
        # Make an alias to the json #
        self.p.info_json.link_from(self.json_path, safe=True)
        # Primer #
        self.primer_regex = re.compile(self.info['primer'])
        # Raw files #
        self.raw_fasta = FASTA(self.p.raw_fasta)
        self.raw_fastq = FASTQ(self.p.raw_fastq)
        # Standard FASTA #
        self.reads = FASTA(self.p.reads_fasta)
        self.fasta = FASTA(self.p.renamed)
        # Special FASTQ #
        self.fastq = FASTQ(self.p.reads_fastq)
        # A shameless hack for cdhit to work #
        self.renamed = self.fastq
        # Pre-denoised special case #
        # NOTE: the `and False` keeps this branch switched off, but the
        # 'predenoised' key is still looked up, so it must be present.
        if self.info['predenoised'] and False:
            self.sff_files_info = []
            self.reads.link_from(self.info['predenoised'], safe=True)
        # Special submission attributes #
        self.sra = PyroSampleSRA(self)
        # Loaded #
        self.loaded = True
        # Return self for convenience #
        return self

    @property
    def mate(self):
        """The sample referenced by the 'mate' entry of the JSON, or False
        when the JSON has no such entry."""
        if 'mate' not in self.info: return False
        run_num = self.info['mate']['run']
        pool_num = self.info['mate']['pool']
        barcode_num = self.info['mate']['num']
        # Pool and barcode numbers are 1-based in the JSON #
        return illumitag.runs[run_num][pool_num-1][barcode_num-1]

    def extract(self):
        """Dump the raw SFF file with `sffinfo` and build the raw FASTQ."""
        # Call extraction #
        shell_output('sffinfo -s %s > %s' % (self.p.raw_sff, self.p.raw_fasta))
        shell_output('sffinfo -q %s > %s' % (self.p.raw_sff, self.p.raw_qual))
        shell_output('sffinfo -m %s > %s' % (self.p.raw_sff, self.p.manifest))
        # Convert #
        sh.fasta_to_fastq(self.p.raw_fasta, self.p.raw_qual, self.p.raw_fastq)

    def clean_iterator(self, reads, minlength=400, threshold=21, windowsize=20):
        """Yield the reads that survive quality control.

        :param reads: Iterable of reads exposing `.seq`, slicing,
                      `.letter_annotations["phred_quality"]` and
                      `.reverse_complement()`.
        :param minlength: Reads shorter than this (before or after quality
                          trimming) are dropped.
        :param threshold: Minimal acceptable windowed average PHRED score.
        :param windowsize: Window size handed to `moving_average`.
        """
        for read in reads:
            # Length #
            if len(read) < minlength: continue
            # Primer #
            match = self.primer_regex.search(str(read.seq))
            if not match: continue
            # PHRED score: cut the read at the first window that is too low #
            scores = read.letter_annotations["phred_quality"]
            averaged = moving_average(scores, windowsize)
            for i, value in enumerate(averaged):
                if value < threshold:
                    read = read[:i+windowsize-1]
                    break
            # Only a trimmed read can have become too short at this point #
            if len(read) < minlength: continue
            # Undetermined bases #
            if 'N' in read: continue
            # Remove primer #
            read = read[match.end():]
            # Flip them because 454 reads the other end #
            read = read.reverse_complement()
            # Return #
            yield read

    def clean(self, **kwargs):
        """Write the cleaned raw reads to `self.reads`. Keyword arguments
        are forwarded to `clean_iterator`."""
        self.reads.write(self.clean_iterator(self.raw_fastq, **kwargs))

    def report_loss(self):
        """Print how many reads there were before and after cleaning, and
        the percentage of reads that were lost."""
        before = len(self.raw_fastq)
        after = len(self.reads)
        # Float arithmetic on purpose: integer division would truncate #
        if before: loss = 100.0 * (before - after) / before
        else: loss = 0.0
        print("Before cleaning: %i" % before)
        print("After cleaning: %i" % after)
        print("Loss: %.2f%%" % loss)

    def process(self):
        """Rename the cleaned reads into `self.fasta`."""
        self.reads.rename_with_num(self.name + '_read', new_path=self.fasta)

    def make_fastq(self, **kwargs):
        """In some special cases we want the FASTQ"""
        self.fastq.write(self.clean_iterator(self.raw_fastq, **kwargs))
        self.fastq.rename_with_num(self.name + '_read')
        print("make_fastq for sample %s completed" % self.id_name)
class QualityReads(object):
    """A set of sequences determined to be quality controlled"""

    all_paths = """
    /mothur_reads.fasta
    /mothur_reads.qual
    /mothur_groups.tsv
    /qiime_reads.fasta
    /only_used_samples.fasta
    /trimmed.fasta
    """

    def __repr__(self): return '<%s object of %s>' % (self.__class__.__name__, self.parent)
    def __len__(self): return len(self.trimmed)

    def __init__(self, path, parent):
        """
        :param path: Location of the barcoded FASTQ holding the untrimmed reads.
        :param parent: The owning object; it must expose `samples` and
                       `p.quality_dir`, and is also stored as `self.pool`.
        """
        # Save parent #
        self.parent, self.pool = parent, parent
        self.samples = parent.samples
        # Auto paths #
        self.base_dir = parent.p.quality_dir + '/'
        self.p = AutoPaths(self.base_dir, self.all_paths)
        # Files #
        self.untrimmed = BarcodedFASTQ(path, samples=self.samples)
        self.only_used = BarcodedFASTA(self.p.only_used, samples=self.samples)
        self.trimmed = FASTA(self.p.trimmed)
        # Qiime output #
        self.qiime_fasta = FASTA(self.p.qiime_fasta)
        # Mothur #
        self.mothur_fasta = FASTA(self.p.mothur_fasta)
        self.mothur_qual = QualFile(self.p.mothur_qual)
        self.mothur_groups = FilePath(self.p.mothur_groups)
        # Primer size #
        self.trim_fwd = self.pool.samples.trim_fwd
        self.trim_rev = self.pool.samples.trim_rev

    def _without_primers(self, read):
        """Return `read` minus `trim_fwd` leading and `trim_rev` trailing
        positions."""
        # A plain `-self.trim_rev` would give an empty slice when it is 0 #
        stop = -self.trim_rev if self.trim_rev else None
        return read[self.trim_fwd:stop]

    def filter_unused(self):
        """Write to `only_used` the reads whose sample is marked as used."""
        def no_unused_iterator(reads):
            for r in reads.parse_barcodes():
                if r.first.sample.used: yield r.read
        self.only_used.write(no_unused_iterator(self.untrimmed))

    def trim_primers(self):
        """Write to `trimmed` every read of `only_used` without its primers."""
        def no_primers_iterator(reads):
            for read in reads: yield self._without_primers(read)
        self.trimmed.write(no_primers_iterator(self.only_used))

    def make_mothur_output(self):
        """Link the trimmed FASTA and write the tab separated groups file
        (read id, sample short name)."""
        # Trimmed fasta #
        self.mothur_fasta.link_from(self.trimmed.path)
        # The groups file #
        self.mothur_groups.create()
        try:
            for r in self.only_used.parse_barcodes():
                sample_name = r.first.sample.short_name
                read_name = '%s\t%s\n' % (r.read.id, sample_name)
                self.mothur_groups.handle.write(read_name)
        finally:
            # Don't leave the handle open if parsing fails half way #
            self.mothur_groups.close()

    def make_qiime_output(self):
        """Write the primer-trimmed reads to `qiime_fasta`, renamed as
        `<sample>_<counter> <original id>` with a barcode description."""
        # Counter #
        counter = defaultdict(int)
        # The `with` block closes the file even when a record fails #
        with open(self.qiime_fasta.path, 'w') as handle:
            # Prepare fasta writer #
            writer = FastaWriter(handle, wrap=0)
            writer.write_header()
            # Do it #
            for r in self.only_used.parse_barcodes():
                sample_name = r.first.sample.short_name
                counter[sample_name] += 1
                r.read.id = '%s_%i %s' % (sample_name, counter[sample_name], r.read.id)
                bar_seq = r.read.seq[0:self.pool.bar_len]
                r.read.description = "orig_bc=%s new_bc=%s bc_diffs=0" % (bar_seq, bar_seq)
                writer.write_record(self._without_primers(r.read))
            # Close #
            writer.write_footer()
class Cluster(object):
    """Analyzes a group of samples."""

    all_paths = """
    /reads/all_reads.fasta
    /otus/
    /logs/
    /report/report.pdf
    /metadata.csv
    """

    def __repr__(self):
        description = (self.__class__.__name__, self.name, len(self.samples))
        return '<%s object "%s" with %i samples>' % description

    def __iter__(self):
        return iter(self.samples)

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, key):
        """Look up a child by short name (string key), by its `num`
        (integer key, when the children have one) or else by position."""
        if isinstance(key, basestring):
            wanted = key.lower()
            by_name = [child for child in self.children if child.short_name == wanted]
            return by_name[0]
        if isinstance(key, int) and hasattr(self.first, 'num'):
            by_num = [child for child in self.children if child.num == key]
            return by_num[0]
        return self.children[key]

    @property
    def first(self):
        """The first child of this cluster."""
        return self.children[0]

    @property
    def count_seq(self):
        """The sum of the lengths of all samples."""
        return sum(len(sample) for sample in self)

    def __init__(self, samples, name, base_dir=None):
        """
        :param samples: The samples making up this cluster.
        :param name: The name of this cluster.
        :param base_dir: Optional directory for the results; when not given
                         one is derived from `illumitag.view_dir` and `name`.
        """
        # Save samples #
        self.name = name
        self.samples = samples
        self.children = samples
        # Check names are unique #
        used_names = [sample.short_name for sample in samples if sample.used]
        assert len(names_seen(used_names)) == len(used_names) if False else len(used_names) == len(set(used_names))
        # Figure out pools #
        distinct_pools = set(sample.pool for sample in self.samples)
        self.pools = sorted(distinct_pools, key=lambda pool: pool.id_name)
        # Directory #
        if not base_dir:
            base_dir = illumitag.view_dir + "clusters/" + self.name + '/'
        self.base_dir = base_dir
        # Loaded #
        self.loaded = False

    def load(self):
        """A second __init__ that is delayed and called only if needed"""
        # Load the pools and samples #
        for pool in self.pools:
            pool.load()
        for sample in self.samples:
            sample.load()
        # Dir #
        self.p = AutoPaths(self.base_dir, self.all_paths)
        # Figure out if it's a project #
        same_samples = set(self.samples) == set(self.first.pool.project.samples)
        self.project = self.first.pool.project if same_samples else None
        # Runner #
        self.runner = ClusterRunner(self)
        # FASTA #
        self.reads = FASTA(self.p.all_reads_fasta)
        # OTU picking #
        self.otu_uparse = UparseOTUs(self)
        self.otu_uclust = UclustOTUs(self)
        self.otu_cdhit = CdhitOTUs(self)
        # Preferred #
        self.otus = self.otu_uparse
        # Simple reporting #
        self.reporter = ClusterReporter(self)
        # Full report #
        self.report = ClusterReport(self)
        # Loaded #
        self.loaded = True
        # Return self for convenience #
        return self

    def run(self, *args, **kwargs):
        """Forward the call to the runner."""
        self.runner.run(*args, **kwargs)

    def run_slurm(self, *args, **kwargs):
        """Forward the call to the runner's SLURM entry point."""
        self.runner.run_slurm(*args, **kwargs)

    def process_samples(self):
        """Call `process()` on every sample, with a progress bar."""
        for sample in tqdm(self):
            sample.process()

    def combine_reads(self):
        """This is the first function you should call. It will combine all
        the reads of all the samples of this cluster into one big FASTA
        file, and return that file."""
        paths = []
        for sample in self:
            paths.append(sample.fasta.path)
        command = 'cat %s > %s' % (' '.join(paths), self.reads)
        shell_output(command)
        return self.reads

    def set_size(self, length):
        """Trim all sequences to a specific length starting from the end.
        Sequences shorter than `length` are dropped."""
        def trim_iterator(reads):
            for read in reads:
                if len(read) >= length:
                    yield read[-length:]
        self.size_trimmed = FASTA(new_temp_path())
        self.size_trimmed.write(trim_iterator(self.reads))
        self.size_trimmed.close()
        # Replace it #
        self.reads.remove()
        shutil.move(self.size_trimmed, self.reads)

    def run_uparse(self):
        """Run the UPARSE OTU picking."""
        self.otu_uparse.run()

    @property
    def metadata(self):
        """A DataFrame with one row of `info` per sample, indexed by the
        short names of the samples."""
        rows = [sample.info for sample in self]
        labels = [sample.short_name for sample in self]
        return pandas.DataFrame(rows, index=labels)

    def export_metadata(self):
        """Write the metadata table, tab separated, to the metadata path."""
        self.metadata.to_csv(self.p.metadata, sep='\t', encoding='utf-8')
class PrimerGroup(object):
    """A bunch of sequences all having the same type of primer outcome
    (and assembly outcome)"""

    all_paths = """
    /orig.fastq
    /n_filtered.fastq
    /qual_filtered.fastq
    /len_filtered.fastq
    /trimmed_barcodes.fasta
    """

    # Thresholds used by `qual_filter` and `len_filter` #
    qual_threshold = 5
    qual_windowsize = 10
    min_length = 400

    def __repr__(self): return '<%s object of %s>' % (self.__class__.__name__, self.parent)
    def __len__(self): return len(self.orig_reads)

    def create(self): self.orig_reads.create()
    def add_seq(self, read): self.orig_reads.add_seq(read)
    def close(self): self.orig_reads.close()

    def __init__(self, parent):
        """
        :param parent: The owning group; it must expose `samples`, `pool`,
                       `cls` and `p.groups_dir`.

        `self.short_name` is read here but never assigned in this class.
        """
        # Save parent #
        self.parent, self.assemble_group = parent, parent
        self.samples = parent.samples
        self.pool = self.parent.pool
        self.primers = self.pool.primers
        # Auto paths #
        self.base_dir = parent.p.groups_dir + self.short_name + '/'
        self.p = AutoPaths(self.base_dir, self.all_paths)
        # More #
        self.orig_reads = self.parent.cls(self.p.orig_fastq, samples=self.samples)
        self.n_filtered = self.parent.cls(self.p.n_filtered, samples=self.samples)
        # Quality filtered #
        # NOTE(review): the three assignments below are taken to all belong
        # to this condition -- confirm against the upstream file layout.
        if self.parent == 'assembled':
            self.qual_filtered = BarcodedFASTQ(self.p.qual_filtered, samples=self.samples, primers=self.primers)
            self.len_filtered = BarcodedFASTQ(self.p.len_filtered_fastq, samples=self.samples, primers=self.primers)
            self.trimmed_barcodes = FASTA(self.p.trimmed_barcodes)
        # Further #
        self.load()

    def load(self):
        """Hook called at the end of `__init__`; does nothing here."""
        pass

    def n_filter(self):
        """Called from AssembleGroup.discard_reads_with_n

        Writes to `n_filtered` the original reads that have no 'N' between
        the forward and the reverse primer."""
        def no_n_iterator(reads):
            fwd_len = self.pool.primers.fwd_len
            rev_len = self.pool.primers.rev_len
            # A plain `-rev_len` would give an empty slice when it is 0,
            # and every read would pass unchecked
            stop = -rev_len if rev_len else None
            for read in reads:
                if 'N' in read[fwd_len:stop]: continue
                yield read
        self.n_filtered.write(no_n_iterator(self.orig_reads))

    def qual_filter(self):
        """Called from Assemble.quality_filter

        Drops every read having at least one window whose average PHRED
        score is under `qual_threshold`."""
        def good_qual_iterator(reads):
            for read in reads:
                averaged = moving_average(read.letter_annotations["phred_quality"], self.qual_windowsize)
                if any(value < self.qual_threshold for value in averaged): continue
                yield read
        self.qual_filtered.write(good_qual_iterator(self.n_filtered))

    def len_filter(self):
        """Called from Assemble.length_filter

        Drops every read shorter than `min_length`."""
        def good_len_iterator(reads):
            for read in reads:
                if len(read) < self.min_length: continue
                yield read
        self.len_filtered.write(good_len_iterator(self.qual_filtered))

    def trim_bc(self):
        """Called from Assemble.trim_barcodes

        Removes `pool.bar_len` positions from both ends of every read."""
        def no_barcodes_iterator(reads):
            for read in reads:
                yield read[self.pool.bar_len:-self.pool.bar_len]
        # With no barcode there is nothing to cut: just convert the file #
        if self.pool.bar_len == 0: self.len_filtered.to_fasta(self.trimmed_barcodes)
        else: self.trimmed_barcodes.write(no_barcodes_iterator(self.len_filtered))
class UclustOTUs(OTUs):
    """Will use uclust via the qiime wrapper to create OTU clusters from a
    given FASTA file.
    http://qiime.org/scripts/pick_otus.html"""

    short_name = 'uclust'
    title = 'UCLUST-QIIME denovo picking'

    all_paths = """
    /clusters/clusters.uc
    /clusters/qiime.log
    /clusters/all_otus.txt
    /clusters/all_centers.fasta
    /centers.fasta
    /otus.txt
    /taxonomy_silva/
    /taxonomy_fw/
    /graphs/
    """

    def __repr__(self): return '<%s object of %s>' % (self.__class__.__name__, self.parent)

    def __init__(self, cluster):
        """
        :param cluster: The owning cluster; it must expose `samples`,
                        `reads` and `p.otus_dir`.
        """
        # Save parent #
        self.cluster, self.parent = cluster, cluster
        # Inherited #
        self.samples = self.parent.samples
        # Paths #
        self.base_dir = self.parent.p.otus_dir + self.short_name + '/'
        self.p = AutoPaths(self.base_dir, self.all_paths)
        # Main FASTA file #
        self.reads = self.parent.reads
        # Files #
        self.all_otus = FilePath(self.p.all_otus)
        self.all_centers = FASTA(self.p.all_centers)
        self.otus = FilePath(self.base_dir + "otus.txt")
        self.centers = FASTA(self.base_dir + "centers.fasta")
        # Taxonomy #
        self.taxonomy_silva = CrestTaxonomy(self.centers, self, 'silvamod', self.p.silva)
        self.taxonomy_fw = CrestTaxonomy(self.centers, self, 'freshwater', self.p.fw_dir)
        # Preferred one #
        self.taxonomy = self.taxonomy_silva

    def run(self):
        """Pick the OTUs with `pick_otus.py`, drop the OTUs made of a single
        read and write the matching representative sequences."""
        # Clean #
        shutil.rmtree(self.p.clusters_dir)
        # Run command #
        pick_otus = sh.Command('pick_otus.py')
        pick_otus('-m', 'uclust', '-i', self.reads, '-o', self.p.clusters_dir)
        # Move into place #
        base_name = self.p.clusters_dir + self.reads.prefix
        shutil.move(base_name + '_otus.txt', self.all_otus)
        shutil.move(base_name + '_otus.log', self.p.qiime_log)
        shutil.move(base_name + '_clusters.uc', self.p.clusters_uc)
        # Remove OTUs that are only one read #
        def filter_singletons(f):
            for line in f:
                line = line.split()
                # One field for the OTU name plus at least two reads #
                if len(line) > 2: yield '\t'.join(line) + '\n'
        self.otus.writelines(filter_singletons(self.all_otus))
        # Create the centers file that is missing #
        pick_rep = sh.Command('pick_rep_set.py')
        pick_rep('-i', self.all_otus, '-f', self.reads, '-o', self.all_centers)
        # Remake the centers file without the filtered OTUs #
        self.otus_to_keep = [line.split()[0] for line in self.otus]
        # The attribute stays a list; the set only makes the lookups fast #
        kept = set(self.otus_to_keep)
        def filter_otus(f):
            for seq in f:
                if seq.id in kept: yield seq
        self.centers.write(filter_otus(self.all_centers))

    @property_cached
    def cluster_counts_table(self):
        """Create the unfiltered OTU table

        The result is a DataFrame of integer read counts with one column per
        OTU (naturally sorted) and one row per sample short name."""
        # Compile the two read name formats once, not once per read #
        pooled_read = re.compile("run([0-9]+)_pool([0-9]+)_sample([0-9]+)_read([0-9]+)")
        unpooled_read = re.compile("run([0-9]+)_sample([0-9]+)_read([0-9]+)")
        # Short names of unpooled samples, found once per (run, sample) #
        unpooled_names = {}
        # Put results in a dict of dicts #
        result = defaultdict(lambda: defaultdict(int))
        # Loop #
        for line in self.otus:
            # Parse the line #
            contents = line.split()
            otu, reads = contents[0], contents[1:]
            # Parse the hits #
            for r in reads:
                nums = pooled_read.findall(r)
                if nums:
                    run_num, pool_num, sample_num, read_num = map(int, nums[0])
                    sample = illumitag.runs[run_num][pool_num-1][sample_num-1]
                    name = sample.short_name
                else:
                    nums = unpooled_read.findall(r)
                    run_num, sample_num, read_num = map(int, nums[0])
                    key = (run_num, sample_num)
                    if key not in unpooled_names:
                        candidates = illumitag.presamples + illumitag.pyrosamples
                        sample = [s for s in candidates if s.run_num==run_num and s.num==sample_num][0]
                        unpooled_names[key] = sample.short_name
                    name = unpooled_names[key]
                # Count #
                result[otu][name] += 1
        # Return #
        result = pandas.DataFrame(result)
        result = result.fillna(0)
        result = result.astype(int)
        # Same column reordering that `reindex_axis(..., axis=1)` did #
        result = result.reindex(columns=sorted(result.columns, key=natural_sort))
        return result