class QualityReads(object):
    """A set of sequences determined to be quality controlled.

    The object wraps one barcoded input file and a directory of derived
    files (reads from used samples only, primer-trimmed reads, and the
    mothur / QIIME flavoured exports).
    """

    all_paths = """
    /mothur_reads.fasta
    /mothur_reads.qual
    /mothur_groups.tsv
    /qiime_reads.fasta
    /only_used_samples.fasta
    /trimmed.fasta
    """

    def __repr__(self):
        return '<%s object of %s>' % (self.__class__.__name__, self.parent)

    def __len__(self):
        # The size of the collection is the number of trimmed reads.
        return len(self.trimmed)

    def __init__(self, path, parent):
        """Set up every input and output file handle.

        :param path: Location of the barcoded reads handed to BarcodedFASTQ.
        :param parent: The owning object; it must expose `samples` (which in
                       turn carries `trim_fwd` and `trim_rev`) and
                       `p.quality_dir`. `make_qiime_output` also reads
                       `bar_len` from it.
        """
        # Save parent (the same object is reachable under both names) #
        self.parent, self.pool = parent, parent
        self.samples = parent.samples
        # Auto paths #
        self.base_dir = parent.p.quality_dir + '/'
        self.p = AutoPaths(self.base_dir, self.all_paths)
        # Files #
        self.untrimmed = BarcodedFASTQ(path, samples=self.samples)
        self.only_used = BarcodedFASTA(self.p.only_used, samples=self.samples)
        self.trimmed = FASTA(self.p.trimmed)
        # Qiime output #
        self.qiime_fasta = FASTA(self.p.qiime_fasta)
        # Mothur #
        self.mothur_fasta = FASTA(self.p.mothur_fasta)
        self.mothur_qual = QualFile(self.p.mothur_qual)
        self.mothur_groups = FilePath(self.p.mothur_groups)
        # Primer size #
        self.trim_fwd = self.pool.samples.trim_fwd
        self.trim_rev = self.pool.samples.trim_rev

    def _trim(self, read):
        """Return `read` without its first `trim_fwd` and last `trim_rev` positions."""
        # A plain `read[fwd:-rev]` collapses to an empty slice when rev is 0
        # because -0 == 0. An end index of None keeps everything up to the
        # last position instead.
        return read[self.trim_fwd:(-self.trim_rev or None)]

    def filter_unused(self):
        """Write to `only_used` the reads whose first barcode's sample is marked as used."""
        def no_unused_iterator(reads):
            for r in reads.parse_barcodes():
                if r.first.sample.used:
                    yield r.read
        self.only_used.write(no_unused_iterator(self.untrimmed))

    def trim_primers(self):
        """Write to `trimmed` every read of `only_used` with both ends cut off."""
        self.trimmed.write(self._trim(read) for read in self.only_used)

    def make_mothur_output(self):
        """Produce the mothur fasta (a link to the trimmed reads) and the groups file."""
        # Trimmed fasta #
        self.mothur_fasta.link_from(self.trimmed.path)
        # The groups file: one "<read id> TAB <sample short name>" line per read #
        self.mothur_groups.create()
        try:
            for r in self.only_used.parse_barcodes():
                sample_name = r.first.sample.short_name
                self.mothur_groups.handle.write('%s\t%s\n' % (r.read.id, sample_name))
        finally:
            # Close the file even when a read cannot be processed #
            self.mothur_groups.close()

    def make_qiime_output(self):
        """Write the QIIME fasta: renamed, annotated and trimmed reads.

        Each read is renamed to "<sample>_<running count> <original id>" and
        its description records the leading `bar_len` bases as the barcode.
        Note that the records yielded by `parse_barcodes` are modified in
        place (id and description) before being written.
        """
        # Per-sample running counter #
        counter = defaultdict(int)
        # The `with` block guarantees the handle is closed on errors too #
        with open(self.qiime_fasta.path, 'w') as handle:
            writer = FastaWriter(handle, wrap=0)
            writer.write_header()
            for r in self.only_used.parse_barcodes():
                sample_name = r.first.sample.short_name
                counter[sample_name] += 1
                r.read.id = '%s_%i %s' % (sample_name, counter[sample_name], r.read.id)
                # The barcode is taken from the untrimmed sequence #
                bar_seq = r.read.seq[0:self.pool.bar_len]
                r.read.description = "orig_bc=%s new_bc=%s bc_diffs=0" % (bar_seq, bar_seq)
                writer.write_record(self._trim(r.read))
            writer.write_footer()
class Foraminifera(Database):
    """This is a custom database containing exclusively Foraminifera sequences.

    https://genev.unige.ch/research/laboratory/Jan-Pawlowski

    You should place the file "foram_db_cor.fasta" in: ~/databases/foraminifera/
    Then you can run this:

        from seqsearch.databases.foraminifera import foraminifera
        foraminifera.process()
        print(foraminifera.tax_depth_freq)
    """

    short_name = "foraminifera"
    long_name = 'The custom made Foraminifera database as received by email on 7th April 2017'

    all_paths = """
    /foram_db_cor.fasta
    /foram_mothur.fasta
    /foram_mothur.tax
    """

    @property
    def rank_names(self):
        """The names of the ranks. Total 9 ranks."""
        return ['Domain',   # 0
                'Kingdom',  # 1
                'Phylum',   # 2
                'Class',    # 3
                'Order',    # 4
                'Family',   # 5
                'Tribe',    # 6
                'Genus',    # 7
                'Species']  # 8

    def __init__(self, base_dir=None):
        """Set up the paths of the raw and processed database files.

        :param base_dir: Directory prefix under which "databases/<short_name>/"
                         is looked up. It is concatenated as-is, so it should
                         end with a path separator. Defaults to `home`.
        """
        # Base directory #
        if base_dir is None:
            base_dir = home
        self.base_dir = base_dir + 'databases/' + self.short_name + '/'
        self.p = AutoPaths(self.base_dir, self.all_paths)
        # The results #
        self.alignment = FASTA(self.p.mothur_fasta)
        self.taxonomy = FilePath(self.p.mothur_tax)
        # The part that mothur will use for naming files #
        self.nickname = "foram_mothur"

    def process(self):
        """Convert the raw fasta into a mothur style fasta plus taxonomy file.

        Every sequence identifier has its first 11 characters dropped and the
        rest split on '|': the first field becomes the sequence number and
        the next six fields become ranks 3 to 8 (Class down to Species).
        Ranks 0 to 2 are fixed for the whole database.

        :raises AssertionError: if a field contains ';' or a tab, since both
                                are separators in the taxonomy file.
        :raises IndexError: if an identifier has fewer than six rank fields.
        """
        # The file that was received by email without documentation T_T #
        raw = FASTA(self.p.cor)
        # Open files #
        self.alignment.create()
        self.taxonomy.create()
        try:
            for seq in raw:
                # Parse #
                name = seq.id[11:].split('|')
                num = name.pop(0)
                # Check that no field would corrupt the taxonomy line #
                for x in name:
                    assert ';' not in x, "Semicolon in rank %r of %r" % (x, seq.id)
                    assert '\t' not in x, "Tab in rank %r of %r" % (x, seq.id)
                # Make ranks #
                ranks = ['Eukaryota',     # 0 Domain
                         'Rhizaria',      # 1 Kingdom
                         'Foraminifera',  # 2 Phylum
                         name[0],         # 3 Class
                         name[1],         # 4 Order
                         name[2],         # 5 Family
                         name[3],         # 6 Tribe
                         name[4],         # 7 Genus
                         name[5]]         # 8 Species
                # The taxonomy string #
                tax_line = ';'.join(ranks)
                # The same identifier links the two output files #
                seq_name = "foram" + num
                # Add sequence to the new fasta file #
                self.alignment.add_str(str(seq.seq), name=seq_name)
                # Add the taxonomy to the tax file #
                self.taxonomy.add_str(seq_name + '\t' + tax_line + '\n')
        finally:
            # Close files, also when a malformed record aborts the loop #
            self.alignment.close()
            self.taxonomy.close()