def combine_rerun_with_orig(self):
    """Merge the reads of a rerun sample into the sample it was rerun for.

    Special case for a sample that had too few reads and was sequenced
    again in another pool. Call this on the *rerun* sample (not on the
    original one), just before the `combine_reads()` method of the
    associated cluster.

    Returns False when this sample carries no 'rerun' entry in its info,
    and True once the original sample's FASTA has been replaced by the
    merged and renumbered reads. The sanity checks are plain assertions.
    """
    # Nothing to do unless this sample is flagged as a rerun #
    rerun_info = self.info.get('rerun')
    if rerun_info is None:
        return False
    # This sample must already have been processed #
    assert self.fasta.count > 0
    # Locate the original sample (pool and sample numbers are shifted by one for indexing) #
    run_key, pool_key, num_key = [rerun_info[field] for field in ('run', 'pool', 'num')]
    original = illumitag.runs[run_key][pool_key - 1][num_key - 1]
    combined = FASTA(original.base_dir + 'rerun_merged.fasta')
    # Guard against merging a second time #
    assert original.count == original.fasta.count
    # Write both read sets into the temporary file #
    combined.create()
    for source in (original.fasta, self.fasta):
        combined.add(source)
    combined.close()
    # Renumber the reads over the original FASTA, then drop the temporary file #
    read_prefix = original.name + '_read'
    combined.rename_with_num(read_prefix, original.fasta)
    combined.remove()
    # Rebuild the FASTA object on the same path and verify the read count grew #
    original.fasta = FASTA(original.fasta.path)
    assert original.count < original.fasta.count
    return True
class UparseOTUs(OTUs):
    """Will use uparse to create OTU clusters from a given FASTA file
    http://www.nature.com/doifinder/10.1038/nmeth.2604"""

    short_name = 'uparse'
    title = 'UPARSE denovo picking'
    article = "http://www.nature.com/doifinder/10.1038/nmeth.2604"
    version = uparse_version
    # Default OTU radius in percent; the mapping identity is derived from it in run() #
    threshold = 3.0

    all_paths = """
    /derep.fasta
    /sorted.fasta
    /centers.fasta
    /readmap.uc
    /taxonomy_silva/
    /taxonomy_fw/
    /taxonomy_unite/
    /taxonomy_rdp/
    /graphs/
    /seqenv/
    """

    def __repr__(self):
        return '<%s object of %s>' % (self.__class__.__name__, self.parent)

    def __len__(self):
        # Always reports a length of zero #
        return 0

    def __init__(self, cluster):
        """Set up paths, file wrappers and taxonomy objects for *cluster*.

        The cluster must expose `samples`, `reads` and `p.otus_dir`.
        """
        # Save parent #
        self.cluster, self.parent = cluster, cluster
        # Inherited #
        self.samples = self.parent.samples
        # Paths #
        self.base_dir = self.parent.p.otus_dir + self.short_name + '/'
        self.p = AutoPaths(self.base_dir, self.all_paths)
        # Main FASTA file #
        self.reads = self.parent.reads
        # Files #
        self.derep = SizesFASTA(self.p.derep)
        self.sorted = SizesFASTA(self.p.sorted)
        self.centers = FASTA(self.p.centers)
        self.readmap = UClusterFile(self.p.readmap)
        # Taxonomy #
        self.taxonomy_silva = CrestTaxonomy(self.centers, self, 'silvamod', self.p.silva_dir)
        self.taxonomy_fw = CrestTaxonomy(self.centers, self, 'freshwater', self.p.fw_dir)
        self.taxonomy_unite = CrestTaxonomy(self.centers, self, 'unite', self.p.unite_dir)
        self.taxonomy_rdp = RdpTaxonomy(self.centers, self)
        # Preferred one #
        self.taxonomy = self.taxonomy_silva
        # Source tracking #
        self.seqenv = Seqenv(self)

    def run(self, threshold=None):
        """Run the clustering pipeline.

        `threshold` is the OTU radius in percent and defaults to the class
        attribute. The identity used for mapping the reads back is
        `(100 - threshold) / 100`, computed in floating point.
        """
        # Optional threshold #
        if threshold is None:
            threshold = self.threshold
        # Divide by a float: with an integer threshold, Python 2 would floor this to 0 #
        identity = (100 - threshold) / 100.0
        # Dereplicate (uparse version 32bit version runs out of memory) #
        # The usearch call is deliberately disabled and kept for reference #
        if False:
            sh.usearch7("--derep_fulllength", self.reads, '-output', self.derep, '-sizeout')
        sh.fasta_make_unique(self.reads, self.derep)
        # Order by size and kill singletons #
        sh.usearch7("--sortbysize", self.derep, '-output', self.sorted, '-minsize', 2)
        # Compute the centers #
        sh.usearch7("--cluster_otus", self.sorted, '-otus', self.centers, '-otu_radius_pct', threshold)
        # Rename the centers #
        self.centers.rename_with_num('OTU-')
        # Map the reads back to the centers #
        sh.usearch7("-usearch_global", self.reads, '-db', self.centers, '-strand', 'plus',
                    '-id', identity, '-uc', self.readmap)

    def checks(self):
        """Assert that the dereplicated file and the read map are as long as the reads."""
        assert len(self.reads) == len(self.derep)
        assert len(self.reads) == len(self.readmap)

    @property_cached
    def cluster_counts_table(self):
        """Parse that custom output for creating the unfiltered OTU table"""
        result = pandas.DataFrame(self.readmap.otu_sample_counts)
        result = result.fillna(0)
        result = result.astype(int)
        # `reindex_axis` is gone from recent pandas; `reindex(columns=...)` is the equivalent #
        result = result.reindex(columns=sorted(result.columns, key=natural_sort))
        return result
class Pyrosample(object):
    """A Pyrosample is a legacy object for the few 454 samples we still have
    and that we need to compare against the new Illumina technology."""

    all_paths = """
    /info.json
    /reads.fasta
    /renamed.fasta
    /raw/raw.sff
    /raw/raw.fastq
    /raw/raw.fasta
    /raw/raw.qual
    /raw/manifest.txt
    /fastq/reads.fastq
    """

    kind = "pyrosample"

    def __repr__(self):
        return '<%s object "%s">' % (self.__class__.__name__, self.id_name)

    def __init__(self, json_path, out_dir):
        """Read the sample description at `json_path`; outputs will go under `out_dir`.

        Only the JSON is parsed here. The paths and file objects are set up
        later by `load()`.
        """
        # Attributes #
        self.out_dir = out_dir
        self.json_path = FilePath(json_path)
        # Parse #
        self.info = load_json_path(self.json_path)
        # Basic #
        self.account = "/dev/null"
        self.run_num = self.info['run_num']
        self.run_label = "pyrosample_run_%i" % self.run_num
        self.project_short_name = self.info['project']
        self.project_long_name = self.info['project_name']
        # Own attributes #
        self.num = self.info['sample_num']
        self.short_name = self.info['sample']
        self.long_name = self.info['sample_name']
        self.name = 'run%i_sample%i' % (self.run_num, self.num)
        self.group = self.info['group']
        self.id_name = "run%03d-sample%02d" % (self.run_num, self.num)
        # Hard coded attributes #
        self.machine = "454 GS FLX Titanium"
        # SFF files #
        self.sff_files_info = self.info['files']
        # Pool dummy: this object stands in for its own pool and parent #
        self.pool, self.parent = self, self
        # Other dummy variables #
        self.bar_len = 0
        self.gzipped = False
        self.used = True
        # Loaded #
        self.loaded = False

    def load(self):
        """A second __init__ that is delayed and called only if needed.

        Raises an Exception if one of the listed SFF files is missing.
        Returns self for convenience.
        """
        # Check files are there #
        for f in self.sff_files_info:
            if not os.path.exists(f['path']):
                raise Exception("No file at %s" % f['path'])
        # Automatic paths #
        self.base_dir = self.out_dir + self.id_name + '/'
        self.p = AutoPaths(self.base_dir, self.all_paths)
        # Make an alias to the json #
        self.p.info_json.link_from(self.json_path, safe=True)
        # Primer #
        self.primer_regex = re.compile(self.info['primer'])
        # Raw files #
        self.raw_fasta = FASTA(self.p.raw_fasta)
        self.raw_fastq = FASTQ(self.p.raw_fastq)
        # Standard FASTA #
        self.reads = FASTA(self.p.reads_fasta)
        self.fasta = FASTA(self.p.renamed)
        # Special FASTQ #
        self.fastq = FASTQ(self.p.reads_fastq)
        # A shameless hack for cdhit to work #
        self.renamed = self.fastq
        # Pre-denoised special case #
        # NOTE: this branch is switched off by the `and False`; the key is still looked up #
        if self.info['predenoised'] and False:
            self.sff_files_info = []
            self.reads.link_from(self.info['predenoised'], safe=True)
        # Special submission attributes #
        self.sra = PyroSampleSRA(self)
        # Loaded #
        self.loaded = True
        # Return self for convenience #
        return self

    @property
    def mate(self):
        """The sample referenced by the 'mate' entry of the info, or False if there is none."""
        if 'mate' not in self.info:
            return False
        run_num = self.info['mate']['run']
        pool_num = self.info['mate']['pool']
        barcode_num = self.info['mate']['num']
        return illumitag.runs[run_num][pool_num-1][barcode_num-1]

    def extract(self):
        """Dump the raw SFF with `sffinfo` and build the raw FASTQ from the results."""
        # Call extraction #
        shell_output('sffinfo -s %s > %s' % (self.p.raw_sff, self.p.raw_fasta))
        shell_output('sffinfo -q %s > %s' % (self.p.raw_sff, self.p.raw_qual))
        shell_output('sffinfo -m %s > %s' % (self.p.raw_sff, self.p.manifest))
        # Convert #
        sh.fasta_to_fastq(self.p.raw_fasta, self.p.raw_qual, self.p.raw_fastq)

    def clean_iterator(self, reads, minlength=400, threshold=21, windowsize=20):
        """Yield the reads that pass filtering, trimmed and reverse-complemented.

        A read is dropped if it is shorter than `minlength`, if the primer
        regex does not match it, if quality trimming leaves fewer than
        `minlength` bases, or if it contains an 'N'. Surviving reads have
        everything up to the end of the primer match removed.
        """
        for read in reads:
            # Length #
            if len(read) < minlength:
                continue
            # Primer #
            match = self.primer_regex.search(str(read.seq))
            if not match:
                continue
            # PHRED score #
            scores = read.letter_annotations["phred_quality"]
            averaged = moving_average(scores, windowsize)
            discard = False
            for i, value in enumerate(averaged):
                if value < threshold:
                    # Cut at the first averaged value under the threshold #
                    # presumably i+windowsize-1 is the last base of that window; confirm against moving_average #
                    read = read[:i+windowsize-1]
                    if len(read) < minlength:
                        discard = True
                    break
            if discard:
                continue
            # Undetermined bases #
            if 'N' in read:
                continue
            # Remove primer #
            read = read[match.end():]
            # Flip them because 454 reads the other end #
            read = read.reverse_complement()
            # Return #
            yield read

    def clean(self, **kwargs):
        """Write the cleaned raw reads to `self.reads`; kwargs go to `clean_iterator`."""
        self.reads.write(self.clean_iterator(self.raw_fastq, **kwargs))

    def report_loss(self):
        """Print the read counts before and after cleaning and the percentage lost."""
        before = len(self.raw_fastq)
        after = len(self.reads)
        # Fraction of the raw reads that did not survive, in floating point #
        # An empty raw file is reported as 0% loss rather than dividing by zero #
        loss = 100.0 * (before - after) / before if before else 0.0
        print("Before cleaning: %i" % before)
        print("After cleaning: %i" % after)
        print("Loss: %.2f%%" % loss)

    def process(self):
        """Renumber the cleaned reads into `self.fasta`, prefixed with the sample name."""
        self.reads.rename_with_num(self.name + '_read', new_path=self.fasta)

    def make_fastq(self, **kwargs):
        """In some special cases we want the FASTQ"""
        self.fastq.write(self.clean_iterator(self.raw_fastq, **kwargs))
        self.fastq.rename_with_num(self.name + '_read')
        print("make_fastq for sample %s completed" % self.id_name)
class CdhitOTUs(OTUs):
    """Will use cd-hit to create OTU clusters from a given FASTQ file
    http://weizhong-lab.ucsd.edu/cd-hit-otu/"""

    short_name = "cdhit"
    title = "CD-HIT Illumina OTU picking"

    all_paths = """
    /all_reads.fastq
    /clusters/OTU.nr2nd.clstr
    /centers.fasta
    /otus.txt
    /taxonomy_silva/
    /taxonomy_fw/
    /graphs/
    """

    def __repr__(self):
        return "<%s object of %s>" % (self.__class__.__name__, self.parent)

    def __init__(self, cluster):
        """Set up paths, file wrappers and taxonomy objects for *cluster*.

        The cluster must expose `samples` and `p.otus_dir`.
        """
        # Save parent #
        self.cluster, self.parent = cluster, cluster
        # Inherited #
        self.samples = self.parent.samples
        # Paths #
        self.base_dir = self.parent.p.otus_dir + self.short_name + "/"
        self.p = AutoPaths(self.base_dir, self.all_paths)
        # Main reads file here FASTQ #
        self.reads = FASTQ(self.p.all_reads)
        # Files #
        self.cdhit_clusters = FilePath(self.p.clstr)
        self.cdhit_centers = FASTA(self.p.clusters_dir + "OTU")
        self.centers = FASTA(self.p.centers)
        # Taxonomy #
        self.taxonomy_silva = CrestTaxonomy(self.centers, self, "silvamod", self.p.silva)
        self.taxonomy_fw = CrestTaxonomy(self.centers, self, "freshwater", self.p.fw_dir)
        # Preferred one #
        self.taxonomy = self.taxonomy_silva

    def run(self):
        """Concatenate the samples' FASTQ files, run cd-hit-otu and rename the centers."""
        # Combine reads but in fastq format this time #
        paths = [sample.renamed for sample in self.cluster]
        shell_output("cat %s > %s" % (" ".join(paths), self.reads))
        # Clean #
        shutil.rmtree(self.p.clusters_dir)
        # Run command #
        cdhit = sh.Command(cdhit_script)
        cdhit("-i", self.reads, "-o", self.p.clusters_dir, "-p", TmpFile.from_string("[ACTG]"))
        # Create the centers file with good names #
        self.cdhit_centers.rename_with_num("OTU-", self.centers)

    @property_cached
    def cluster_counts_table(self):
        """Create the unfiltered OTU table.

        Header lines (starting with '>') of the cluster file name the
        current OTU. Every other line is expected to carry a read name in
        one of two forms, with or without a pool number. An IndexError is
        raised if a line matches neither form or no sample is found for it.
        """
        # Raw strings keep the escaped dots valid; compile once, outside the loop #
        pooled_read = re.compile(r">run([0-9]+)_pool([0-9]+)_sample([0-9]+)_read([0-9]+)\.\.\.")
        legacy_read = re.compile(r">run([0-9]+)_sample([0-9]+)_read([0-9]+)\.\.\.")
        # Short names of pool-less samples, memoised by (run_num, sample_num) #
        legacy_names = {}
        # Put results in a dict of dicts #
        result = defaultdict(lambda: defaultdict(int))
        # Loop #
        for line in self.cdhit_clusters:
            if line.startswith(">"):
                otu = "OTU-%s" % line.split()[1]
                continue
            nums = pooled_read.findall(line)
            if nums:
                run_num, pool_num, sample_num, read_num = map(int, nums[0])
                sample = illumitag.runs[run_num][pool_num - 1][sample_num - 1]
                name = sample.short_name
            else:
                nums = legacy_read.findall(line)
                run_num, sample_num, read_num = map(int, nums[0])
                key = (run_num, sample_num)
                if key not in legacy_names:
                    candidates = illumitag.presamples + illumitag.pyrosamples
                    matching = [s for s in candidates
                                if s.run_num == run_num and s.num == sample_num]
                    legacy_names[key] = matching[0].short_name
                name = legacy_names[key]
            # Count #
            result[otu][name] += 1
        # Build the table #
        result = pandas.DataFrame(result)
        result = result.fillna(0)
        result = result.astype(int)
        # `reindex_axis` is gone from recent pandas; `reindex(columns=...)` is the equivalent #
        result = result.reindex(columns=sorted(result.columns, key=natural_sort))
        # NOTE: single-read OTUs are not removed here; the table is returned unfiltered #
        return result