def combine_rerun_with_orig(self):
    """Fold the reads of a rerun sample back into the sample it reran.

    Special case for a sample with low reads that was run again in another
    pool. This method is called on the rerun sample, not on the original
    one, and is meant to run just before the combine_reads() method of the
    associated cluster.

    Returns False when this sample carries no 'rerun' information and True
    once the original sample's FASTA file has been replaced by the merge.
    """
    # Only rerun samples have something to merge #
    rerun = self.info.get('rerun')
    if rerun is None:
        return False
    # This sample must already have been processed #
    assert self.fasta.count > 0
    # Find the original sample (pool and num are shifted by one to index) #
    run = rerun['run']
    pool = rerun['pool']
    num = rerun['num']
    orig_sample = illumitag.runs[run][pool-1][num-1]
    merged = FASTA(orig_sample.base_dir + 'rerun_merged.fasta')
    # Refuse to merge twice: the file must still hold the original count #
    assert orig_sample.count == orig_sample.fasta.count
    # Concatenate original reads followed by the rerun reads #
    merged.create()
    for source in (orig_sample.fasta, self.fasta):
        merged.add(source)
    merged.close()
    # Renumber everything into the original's file and drop the temporary #
    merged.rename_with_num(orig_sample.name + '_read', orig_sample.fasta)
    merged.remove()
    # Use a fresh FASTA object on the same path before re-counting #
    orig_sample.fasta = FASTA(orig_sample.fasta.path)
    assert orig_sample.count < orig_sample.fasta.count
    return True
def fasta(self):
    """The FASTA file containing the filtered genes of this cluster.

    Every sequence is written under its gene's name, so the names
    correspond to the long descriptive names. The file is only generated
    when the FASTA object on `self.p.fasta` evaluates as false; otherwise
    it is returned as is.
    """
    genes_file = FASTA(self.p.fasta)
    # Already there: nothing to generate #
    if genes_file:
        return genes_file
    # Write one record per filtered gene #
    genes_file.create()
    for gene in self.filtered_genes:
        genes_file.add_str(str(gene), name=gene.name)
    genes_file.close()
    return genes_file
def fasta(self):
    """Make a FASTA file with all uniprot proteins related to this family.

    The file is built only when it does not exist yet: every sequence of
    `pfam.fasta` whose description contains `self.fam_name` is copied
    into it. The resulting FASTA object must evaluate as true.
    """
    proteins = FASTA(self.p.proteins)
    if not proteins.exists:
        proteins.create()
        # Keep the sequences whose description mentions the family name #
        related = (seq for seq in pfam.fasta if self.fam_name in seq.description)
        for seq in related:
            proteins.add_seq(seq)
        proteins.close()
    # Checked on both the fresh and the pre-existing file #
    assert proteins
    # Return #
    return proteins
def test(self):
    """Search one sequence, and see if it works.

    Writes a single nucleotide sequence to 'input.fasta' in a new
    temporary directory, searches it against `self.blast_db` with
    `SeqSearch` (algorithm 'blast', one thread) and prints the directory
    that holds the input and the 'output.blast' result.
    """
    # New directory #
    directory = new_temp_dir()
    # A randomly chosen sequence (Homo sapiens mRNA for prepro cortistatin) #
    seq = """ACAAGATGCCATTGTCCCCCGGCCTCCTGCTGCTGCTGCTCTCCGGGGCCACGGCCACCGCTGCCCTGCC
             CCTGGAGGGTGGCCCCACCGGCCGAGACAGCGAGCATATGCAGGAAGCGGCAGGAATAAGGAAAAGCAGC
             CTCCTGACTTTCCTCGCTTGGTGGTTTGAGTGGACCTCCCAGGCCAGTGCCGGGCCCCTCATAGGAGAGG
             AAGCTCGGGAGGTGGCCAGGCGGCAGGAAGGCGCACCCCCCCAGCAATCCGCGCGCCGGGACAGAATGCC
             CTGCAGGAACTTCTTCTGGAAGACCTTCTCCTCCTGCAAATAAAACCTCACCCATGAATGCTCACGCAAG
             TTTAATTACAGACCTGAA"""
    # The literal is wrapped for readability: drop line breaks and padding #
    seq = seq.replace('\n','')
    seq = seq.replace(' ','')
    # Make input #
    input_fasta = FASTA(directory + 'input.fasta')
    input_fasta.create()
    input_fasta.add_str(seq, "My test sequence")
    input_fasta.close()
    # Make output #
    out_path = directory + 'output.blast'
    # Make extras parameters #
    params = {'-outfmt': 0,
              '-evalue': 1e-5,
              '-perc_identity': 99}
    # Make the search #
    search = SeqSearch(input_fasta,
                       self.blast_db,
                       'nucl',
                       'blast',
                       num_threads = 1,
                       out_path    = out_path,
                       params      = params)
    # Run it #
    search.run()
    # Print result (single-argument form: same output on Python 2 and 3) #
    print("Success %s" % directory)
def test(self):
    """Run a one-sequence search as a smoke test.

    A single nucleotide sequence is written to 'input.fasta' inside a new
    temporary directory and searched against `self.blast_db` through
    `SeqSearch`; the result path is 'output.blast' in the same directory.
    """
    # Everything lives in a fresh temporary directory #
    directory = new_temp_dir()
    out_path = directory + 'output.blast'
    # Extra parameters handed over to the search #
    params = {'-outfmt': 0, '-evalue': 1e-5, '-perc_identity': 99}
    # A randomly chosen sequence (Homo sapiens mRNA for prepro cortistatin) #
    seq = """ACAAGATGCCATTGTCCCCCGGCCTCCTGCTGCTGCTGCTCTCCGGGGCCACGGCCACCGCTGCCCTGCC
    CCTGGAGGGTGGCCCCACCGGCCGAGACAGCGAGCATATGCAGGAAGCGGCAGGAATAAGGAAAAGCAGC
    CTCCTGACTTTCCTCGCTTGGTGGTTTGAGTGGACCTCCCAGGCCAGTGCCGGGCCCCTCATAGGAGAGG
    AAGCTCGGGAGGTGGCCAGGCGGCAGGAAGGCGCACCCCCCCAGCAATCCGCGCGCCGGGACAGAATGCC
    CTGCAGGAACTTCTTCTGGAAGACCTTCTCCTCCTGCAAATAAAACCTCACCCATGAATGCTCACGCAAG
    TTTAATTACAGACCTGAA"""
    # The literal only contains bases, line breaks and spaces #
    seq = ''.join(seq.split())
    # Write the query file #
    input_fasta = FASTA(directory + 'input.fasta')
    input_fasta.create()
    input_fasta.add_str(seq, "My test sequence")
    input_fasta.close()
    # Build the search and launch it #
    search = SeqSearch(
        input_fasta,
        self.blast_db,
        'nucl',
        'blast',
        num_threads=1,
        out_path=out_path,
        params=params,
    )
    search.run()
    # Tell the user where to look #
    print("Success", directory)
# NOTE(review): the two statements below read `fna`, `faa` and `faas_genes`,
# none of which is assigned in this view. They look like the end of a
# `for faa, fna in zip(faas, fnas):` loop body whose header lies above -- TODO confirm.
    fnas_genes = [strip(seq) for seq in fna]
    # Size of the symmetric difference between the two sets of gene names #
    print faa, len(set(fnas_genes) ^ set(faas_genes)), "discrepancies"
    #print "- in fna but not in faa:", [x for x in set(fnas_genes) - set(faas_genes)]
    #print "- in faa but not in fna:", [x for x in set(faas_genes) - set(fnas_genes)]
    #print ""

# Gene names over all the fna files: total count against distinct count #
fnas_genes = [strip(seq) for fna in fnas for seq in fna]
print len(fnas_genes), len(set(fnas_genes))

# Write one FASTA file per genome, named after the genome's short prefix #
for genome in faas:
    out_path = genomes_dir + genome.short_prefix + '.fasta'
    out_fasta = FASTA(out_path)
    out_fasta.create()
    # Each sequence is stored under its stripped name #
    for seq in genome: out_fasta.add_str(str(seq.seq), strip(seq))
    out_fasta.close()
    # presumably leaves a gzipped copy before the plain file is removed -- TODO confirm #
    out_fasta.gzip_to()
    out_fasta.remove()

def lines():
    """Yield one 'name<TAB>annotation' line per gene of every genome in `faas`."""
    for genome in faas:
        for gene in genome:
            name = strip(gene)
            # The annotation is the description minus its first len(name)
            # characters, with trailing spaces and pipes removed #
            yield name + '\t' + gene.description[len(name):].rstrip( ' |') + '\n'

# Dump all the annotations in a tab separated file #
annotations_path = current_dir + '../ld12/data/annotations.tsv'
with open(annotations_path, 'w') as handle: handle.writelines(lines())
class PairedFASTA(object):
    """Read and write FASTA file pairs without using too much RAM.

    The forward and the reverse file are each wrapped in a `FASTA` object,
    available as `self.fwd` and `self.rev`. Most methods apply the same
    operation to both of them.
    """

    format = 'fasta'

    def __len__(self):
        return self.count

    def __iter__(self):
        return self.parse()

    def __nonzero__(self):
        return bool(self.fwd) and bool(self.rev)

    # Python 3 consults __bool__ rather than __nonzero__ #
    __bool__ = __nonzero__

    def __repr__(self):
        return '<%s object on "%s" and "%s">' % \
               (self.__class__.__name__, self.fwd.path, self.rev.path)

    def __enter__(self):
        # create() returns self, so `with PairedFASTA(...) as pair` works #
        return self.create()

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    @property
    def exists(self):
        """True when both the forward and the reverse file exist."""
        return self.fwd.exists and self.rev.exists

    def __init__(self, fwd, rev, parent=None):
        """Wrap `fwd` and `rev` in FASTA objects and remember `parent`."""
        # FASTA objects #
        self.fwd = FASTA(fwd)
        self.rev = FASTA(rev)
        # Extra (the gzipped flag is taken from the forward file only) #
        self.gzipped = self.fwd.gzipped
        self.parent = parent

    @property_cached
    def count(self):
        """Number of pairs; both files must report the same count."""
        assert self.fwd.count == self.rev.count
        return self.fwd.count

    def open(self):
        self.fwd.open()
        self.rev.open()

    def parse(self):
        """Iterate over (forward, reverse) tuples, read in lock step."""
        return izip(self.fwd.parse(), self.rev.parse())

    def close(self):
        self.fwd.close()
        self.rev.close()

    def create(self):
        """Create both files and return self."""
        self.fwd.create()
        self.rev.create()
        return self

    def add(self, f, r):
        """Add one forward and one reverse sequence."""
        return self.add_pair((f, r))

    def add_pair(self, pair):
        """Add `pair[0]` to the forward file and `pair[1]` to the reverse one."""
        self.fwd.add_seq(pair[0])
        self.rev.add_seq(pair[1])

    def remove(self):
        self.fwd.remove()
        self.rev.remove()

    @property
    def progress(self):
        """Just like self.parse but display a progress bar"""
        return tqdm(self, total=len(self))

    def subsample(self, down_to, dest_pair=None):
        """Write `down_to` pairs chosen by `isubsample` into `dest_pair`.

        `down_to` must be smaller than the number of pairs. When
        `dest_pair` is None a new object of the same class is created next
        to the current files with "subsampled" inserted in their names.
        The destination pair is returned, so a pair created here is not
        lost to the caller.
        """
        # Check size #
        assert down_to < len(self)
        # Make new pair of files #
        if dest_pair is None:
            # NOTE(review): `fwd_path` / `rev_path` are not set by __init__
            # above; presumably a subclass provides them -- TODO confirm.
            dest_fwd_path = self.fwd_path.new_name_insert("subsampled")
            dest_rev_path = self.rev_path.new_name_insert("subsampled")
            dest_pair = self.__class__(dest_fwd_path, dest_rev_path)
        # Do it #
        dest_pair.create()
        for pair in isubsample(self, down_to):
            dest_pair.add_pair(pair)
        # Close the destination (there is no `self.subsampled` attribute),
        # so its content is complete before it gets counted below #
        dest_pair.close()
        # Did it work #
        assert len(dest_pair) == down_to
        return dest_pair

    #------------------------------- Extensions ------------------------------#
    def parse_primers(self, *args, **kwargs):
        """Pair up the primer parsing results of both files.

        The arguments are forwarded to `parse_primers` of the forward and
        of the reverse file; the length reported is that of the forward one.
        """
        fwd_gen = self.fwd.parse_primers(*args, **kwargs)
        rev_gen = self.rev.parse_primers(*args, **kwargs)
        generator = izip(fwd_gen, rev_gen)
        return GenWithLength(generator, len(fwd_gen))
class PairedFASTA(object):
    """Read and write FASTA file pairs without using too much RAM.

    `self.fwd` and `self.rev` are `FASTA` objects on the forward and the
    reverse file. Most methods apply the same operation to both.
    """

    format = 'fasta'

    def __len__(self):
        return self.count

    def __iter__(self):
        return self.parse()

    def __nonzero__(self):
        return bool(self.fwd) and bool(self.rev)

    # Python 3 consults __bool__ rather than __nonzero__ #
    __bool__ = __nonzero__

    def __repr__(self):
        return '<%s object on "%s" and "%s">' % \
               (self.__class__.__name__, self.fwd.path, self.rev.path)

    def __enter__(self):
        # create() returns self, which becomes the `as` target #
        return self.create()

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    @property
    def exists(self):
        """True when both the forward and the reverse file exist."""
        return self.fwd.exists and self.rev.exists

    def __init__(self, fwd, rev, parent=None):
        """Wrap `fwd` and `rev` in FASTA objects and remember `parent`."""
        # FASTA objects #
        self.fwd = FASTA(fwd)
        self.rev = FASTA(rev)
        # Extra (the gzipped flag is taken from the forward file only) #
        self.gzipped = self.fwd.gzipped
        self.parent = parent

    @property_cached
    def count(self):
        """Number of pairs; both files must report the same count."""
        assert self.fwd.count == self.rev.count
        return self.fwd.count

    def open(self):
        self.fwd.open()
        self.rev.open()

    def parse(self):
        """Iterate over (forward, reverse) tuples, read in lock step."""
        return izip(self.fwd.parse(), self.rev.parse())

    def close(self):
        self.fwd.close()
        self.rev.close()

    def create(self):
        """Create both files and return self."""
        self.fwd.create()
        self.rev.create()
        return self

    def add(self, f, r):
        """Add one forward and one reverse sequence."""
        return self.add_pair((f,r))

    def add_pair(self, pair):
        """Add `pair[0]` to the forward file and `pair[1]` to the reverse one."""
        self.fwd.add_seq(pair[0])
        self.rev.add_seq(pair[1])

    def remove(self):
        self.fwd.remove()
        self.rev.remove()

    @property
    def progress(self):
        """Just like self.parse but display a progress bar"""
        return tqdm(self, total=len(self))

    def subsample(self, down_to, dest_pair=None):
        """Write `down_to` pairs chosen by `isubsample` into `dest_pair`.

        `down_to` must be smaller than the number of pairs. When
        `dest_pair` is None a new object of the same class is created with
        "subsampled" inserted in the file names. The destination pair is
        returned, so a pair created here stays reachable by the caller.
        """
        # Check size #
        assert down_to < len(self)
        # Make new pair of files #
        if dest_pair is None:
            # NOTE(review): `fwd_path` / `rev_path` are not set by __init__
            # above; presumably a subclass provides them -- TODO confirm.
            dest_fwd_path = self.fwd_path.new_name_insert("subsampled")
            dest_rev_path = self.rev_path.new_name_insert("subsampled")
            dest_pair = self.__class__(dest_fwd_path, dest_rev_path)
        # Do it #
        dest_pair.create()
        for pair in isubsample(self, down_to):
            dest_pair.add_pair(pair)
        # The destination is what has to be closed: this object has no
        # `subsampled` attribute, and the files must be complete before
        # they are counted below #
        dest_pair.close()
        # Did it work #
        assert len(dest_pair) == down_to
        return dest_pair

    #------------------------------- Extensions ------------------------------#
    def parse_primers(self, *args, **kwargs):
        """Pair up the primer parsing results of both files.

        The arguments are forwarded to `parse_primers` of the forward and
        of the reverse file; the length reported is that of the forward one.
        """
        fwd_gen = self.fwd.parse_primers(*args, **kwargs)
        rev_gen = self.rev.parse_primers(*args, **kwargs)
        generator = izip(fwd_gen, rev_gen)
        return GenWithLength(generator, len(fwd_gen))
# NOTE(review): the two statements below are the end of a function whose
# `def` line lies above this view -- presumably the `strip` helper that is
# called further down; TODO confirm.
    # Keep only what comes before the first '[' #
    seq = seq.split('[')[0]
    return seq

# Compare the gene names of every faa file with those of its fna file #
for faa,fna in zip(faas, fnas):
    faas_genes = [strip(seq) for seq in faa]
    fnas_genes = [strip(seq) for seq in fna]
    # Size of the symmetric difference between the two sets of names #
    print faa, len(set(fnas_genes) ^ set(faas_genes)), "discrepancies"
    #print "- in fna but not in faa:", [x for x in set(fnas_genes) - set(faas_genes)]
    #print "- in faa but not in fna:", [x for x in set(faas_genes) - set(fnas_genes)]
    #print ""

# Gene names over all the fna files: total count against distinct count #
fnas_genes = [strip(seq) for fna in fnas for seq in fna]
print len(fnas_genes), len(set(fnas_genes))

# Write one FASTA file per genome, named after the genome's short prefix #
for genome in faas:
    out_path = genomes_dir + genome.short_prefix + '.fasta'
    out_fasta = FASTA(out_path)
    out_fasta.create()
    # Each sequence is stored under its stripped name #
    for seq in genome: out_fasta.add_str(str(seq.seq), strip(seq))
    out_fasta.close()
    # presumably leaves a gzipped copy before the plain file is removed -- TODO confirm #
    out_fasta.gzip_to()
    out_fasta.remove()

def lines():
    """Yield one 'name<TAB>annotation' line per gene of every genome in `faas`."""
    for genome in faas:
        for gene in genome:
            name = strip(gene)
            # The annotation is the description minus its first len(name)
            # characters, with trailing spaces and pipes removed #
            yield name + '\t' + gene.description[len(name):].rstrip(' |') + '\n'

# Dump all the annotations in a tab separated file #
annotations_path = current_dir + '../ld12/data/annotations.tsv'
with open(annotations_path, 'w') as handle: handle.writelines(lines())
class Foraminifera(Database):
    """
    A custom database containing exclusively Foraminifera sequences.

    https://genev.unige.ch/research/laboratory/Jan-Pawlowski

    You should place the file "foram_db_cor.fasta" in:  ~/databases/foraminifera/
    Then you can run this:

        from seqsearch.databases.foraminifera import foraminifera
        foraminifera.process()
        print foraminifera.tax_depth_freq
    """

    short_name = "foraminifera"
    long_name  = 'The custom made Foraminifera database as received by email on 7th April 2017'

    all_paths = """
    /foram_db_cor.fasta
    /foram_mothur.fasta
    /foram_mothur.tax
    """

    @property
    def rank_names(self):
        """The names of the nine ranks, from the broadest (index 0) to the narrowest (index 8)."""
        return ['Domain', 'Kingdom', 'Phylum',
                'Class',  'Order',   'Family',
                'Tribe',  'Genus',   'Species']

    def __init__(self, base_dir=None):
        """Set up the paths below `base_dir` (falls back on `home` when None)."""
        # Base directory #
        root = home if base_dir is None else base_dir
        self.base_dir = root + 'databases/' + self.short_name + '/'
        self.p = AutoPaths(self.base_dir, self.all_paths)
        # The two files that process() generates #
        self.alignment = FASTA(self.p.mothur_fasta)
        self.taxonomy = FilePath(self.p.mothur_tax)
        # The part that mothur will use for naming files #
        self.nickname = "foram_mothur"

    def process(self):
        """Convert the raw file into a FASTA file plus a taxonomy file.

        Every sequence is renamed "foram" + its number, and its taxonomy
        is written as one tab separated line with nine ranks joined by ';'.
        """
        # The file that was received by email without documentation T_T #
        raw = FASTA(self.p.cor)
        # Open both outputs #
        self.alignment.create()
        self.taxonomy.create()
        # The first three ranks are the same for every sequence #
        fixed_ranks = ['Eukaryota',     # 0 Domain
                       'Rhizaria',      # 1 Kingdom
                       'Foraminifera']  # 2 Phylum
        for seq in raw:
            # Skip the first 11 characters of the ID, then split on pipes #
            fields = seq.id[11:].split('|')
            num = fields.pop(0)
            # Neither separator of the output may occur inside a field #
            assert not any(';' in field for field in fields)
            assert not any('\t' in field for field in fields)
            # Ranks 3 to 8: Class, Order, Family, Tribe, Genus, Species #
            ranks = fixed_ranks + [fields[i] for i in range(6)]
            tax_line = ';'.join(ranks)
            # Same label in the FASTA file and in the taxonomy file #
            label = "foram" + num
            self.alignment.add_str(str(seq.seq), name=label)
            self.taxonomy.add_str(label + '\t' + tax_line + '\n')
        # Close both outputs #
        self.alignment.close()
        self.taxonomy.close()
class Cluster(object):
    """Analyzes a group of samples."""

    all_paths = """
    /reads/all_reads.fasta
    /otus/
    /logs/
    /report/report.pdf
    /metadata.csv
    """

    def __repr__(self):
        return '<%s object "%s" with %i samples>' % \
               (self.__class__.__name__, self.name, len(self.samples))

    def __iter__(self):
        return iter(self.samples)

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, key):
        """Look a sample up by short name, by number or by position.

        A string is lower-cased and compared with `short_name`. An integer
        is compared with `num` when the first sample has such an attribute.
        Anything else indexes the list of samples directly.
        """
        if isinstance(key, basestring):
            return [c for c in self.children if c.short_name == key.lower()][0]
        elif isinstance(key, int) and hasattr(self.first, 'num'):
            return [c for c in self.children if c.num == key][0]
        else:
            return self.children[key]

    @property
    def first(self):
        return self.children[0]

    @property
    def count_seq(self):
        """The sum of the lengths of all samples."""
        return sum([len(sample) for sample in self])

    def __init__(self, samples, name, base_dir=None):
        """Record the samples and work out pools and base directory.

        When `base_dir` is not given, the directory is
        "clusters/<name>/" below `illumitag.view_dir`.
        """
        # Save samples #
        self.name = name
        self.samples, self.children = samples, samples
        # Check names are unique (only among the samples flagged as used) #
        names = [s.short_name for s in samples if s.used]
        assert len(names) == len(set(names))
        # Figure out pools #
        self.pools = list(set([s.pool for s in self.samples]))
        self.pools.sort(key = lambda x: x.id_name)
        # Directory #
        if base_dir: self.base_dir = base_dir
        else:        self.base_dir = illumitag.view_dir + "clusters/" + self.name + '/'
        # Loaded #
        self.loaded = False

    def load(self):
        """A second __init__ that is delayed and called only if needed"""
        # Load the pools and samples #
        for p in self.pools: p.load()
        for s in self.samples: s.load()
        # Dir #
        self.p = AutoPaths(self.base_dir, self.all_paths)
        # Figure out if it's a project #
        if set(self.samples) == set(self.first.pool.project.samples):
            self.project = self.first.pool.project
        else:
            self.project = None
        # Runner #
        self.runner = ClusterRunner(self)
        # FASTA #
        self.reads = FASTA(self.p.all_reads_fasta)
        # OTU picking #
        self.otu_uparse = UparseOTUs(self)
        self.otu_uclust = UclustOTUs(self)
        self.otu_cdhit  = CdhitOTUs(self)
        # Preferred #
        self.otus = self.otu_uparse
        # Simple reporting #
        self.reporter = ClusterReporter(self)
        # Full report #
        self.report = ClusterReport(self)
        # Loaded #
        self.loaded = True
        # Return self for convenience #
        return self

    def run(self, *args, **kwargs):
        self.runner.run(*args, **kwargs)

    def run_slurm(self, *args, **kwargs):
        self.runner.run_slurm(*args, **kwargs)

    def process_samples(self):
        """Call process() on every sample, with a progress bar."""
        for sample in tqdm(self): sample.process()

    def combine_reads(self):
        """This is the first function you should call. It will combine all
        the reads of all the samples of this cluster into one big FASTA
        file, which is returned."""
        paths = [sample.fasta.path for sample in self]
        # Concatenate the bytes in Python instead of building a `cat`
        # command line: paths containing spaces or shell metacharacters
        # are handled, and a cluster without samples gives an empty file
        # rather than a `cat` that waits on standard input #
        with open(self.reads.path, 'wb') as destination:
            for path in paths:
                with open(path, 'rb') as source:
                    shutil.copyfileobj(source, destination)
        return self.reads

    def set_size(self, length):
        """Trim all sequences to a specific length starting from the end.
        Reads shorter than `length` are dropped."""
        self.size_trimmed = FASTA(new_temp_path())
        def trim_iterator(reads):
            for read in reads:
                if len(read) < length: continue
                yield read[-length:]
        self.size_trimmed.write(trim_iterator(self.reads))
        self.size_trimmed.close()
        # Replace it #
        self.reads.remove()
        shutil.move(self.size_trimmed, self.reads)

    def run_uparse(self):
        self.otu_uparse.run()

    @property
    def metadata(self):
        """A DataFrame with one row per sample, built from the `info` of
        each sample and indexed by short name."""
        return pandas.DataFrame([s.info for s in self], index=[s.short_name for s in self])

    def export_metadata(self):
        """Write the metadata to `self.p.metadata`, tab separated, in UTF-8."""
        self.metadata.to_csv(self.p.metadata, sep='\t', encoding='utf-8')
# Get clustering values #
# The projects' pools must belong to exactly two distinct runs #
r1, r2 = list(set([p.run for p in proj]))
r1.parse_report_xml()
r2.parse_report_xml()
# Ratio DensityPF / DensityRaw of the forward read, for each run.
# Single-argument print(...) gives the same output on Python 2 and 3 #
print(float(r1.report_stats['fwd']['DensityPF']) / float(r1.report_stats['fwd']['DensityRaw']))
print(float(r2.report_stats['fwd']['DensityPF']) / float(r2.report_stats['fwd']['DensityRaw']))

# Check below 400 bp sequences #
folder = DirectoryPath(illumitag.projects['evaluation'].base_dir + "below_400/")
over = FASTA(folder + "reads.fasta")
def over_iterator(reads, max_length=400):
    """Yield only the reads that are at most `max_length` long."""
    for read in reads:
        if len(read) <= max_length: yield read
# Collect the short reads of every pool in a single file #
over.create()
for pool in pools: over.add_iterator(over_iterator(pool.good_barcodes.assembled.good_primers.qual_filtered))
over.close()
over.graphs[-1].plot()
# Assign a taxonomy to the short reads with both tools and plot the result #
crest = SimpleCrestTaxonomy(over, folder)
crest.assign()
crest.composition.graph.plot()
rdp = SimpleRdpTaxonomy(over, folder)
rdp.assign()
rdp.composition.graph.plot()

# Check unassembled mate pairs #
unassembled = [p.good_barcodes.unassembled for p in pools]
paths = [u.flipped_reads.path for u in unassembled]
folder = DirectoryPath(illumitag.projects['evaluation'].base_dir + "unassembled_taxonomy/")
all_unassembled = FASTA(folder + 'unassembled_reads.fasta')
# Concatenate the flipped reads of every pool into one file #
shell_output('cat %s > %s' % (' '.join(paths), all_unassembled))
tax = SimpleRdpTaxonomy(all_unassembled, folder)