def __init__(self, query_path,            # The input sequences
             db_path=pfam.hmm_db,          # The database to search
             seq_type='prot' or 'nucl',    # The seq type of the query_path file
             e_value=0.001,                # The search threshold
             params=None,                  # Add extra params for the command line
             out_path=None,                # Where the results will be dropped
             executable=None,              # If you want a specific binary give the path
             cpus=None):                   # The number of threads to use
    # Save attributes #
    self.query      = FASTA(query_path)
    self.db         = FilePath(db_path)
    self.params     = params if params else {}
    self.e_value    = e_value
    self.seq_type   = seq_type
    self.executable = FilePath(executable)
    # Cores to use #
    if cpus is None: self.cpus = min(multiprocessing.cpu_count(), 32)
    else:            self.cpus = cpus
    # Auto detect database short name #
    if db_path == 'pfam':    self.db = pfam.hmm_db
    if db_path == 'tigrfam': self.db = tigrfam.hmm_db
    # Output #
    if out_path is None:         self.out_path = FilePath(self.query.prefix_path + '.hmmout')
    elif out_path.endswith('/'): self.out_path = FilePath(out_path + self.query.prefix + '.hmmout')
    else:                        self.out_path = FilePath(out_path)
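# A minimal usage sketch for the HMMER wrapper above (not from the original
# source). The enclosing class name `HmmQuery` is hypothetical, and run() is
# assumed to behave like the other search wrappers in this section.
query = HmmQuery('proteins.fasta',   # any FASTA of protein sequences
                 db_path='pfam',     # resolved to pfam.hmm_db by __init__
                 e_value=1e-5,       # stricter than the 0.001 default
                 cpus=4)             # cap the thread count explicitly
query.run()                          # results land in proteins.hmmout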
def generate_values(path, progress=False):
    seqs = SeqIO.parse(path, 'fasta')
    if not progress:
        for seq in seqs:
            yield (seq.id, seq.description, str(seq.seq))
    if progress:
        for seq in tqdm(GenWithLength(seqs, len(FASTA(path)))):
            yield (seq.id, seq.description, str(seq.seq))
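# Sketch: one way such a generator is typically consumed, streaming records
# into SQLite without holding the whole file in RAM. The database, table and
# column names here are illustrative only.
import sqlite3

connection = sqlite3.connect('seqs.db')
connection.execute('CREATE TABLE IF NOT EXISTS seqs (id TEXT, description TEXT, seq TEXT)')
connection.executemany('INSERT INTO seqs VALUES (?,?,?)',
                       generate_values('reads.fasta', progress=True))
connection.commit()
connection.close()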
def read_file(self, fp):
    '''
    Read the first FASTA record from the content of fp, and set the
    chromosome name and sequence using the set_chromosome method.
    '''
    if self.verbose:
        print >> stderr, "reading a FASTA record to set a chromosome"
    fasta = FASTA(fp=fp, verbose=self.verbose)
    chr_name, chr_seq = fasta.get_record()
    if chr_name and chr_seq:
        chr_name = chr_name[1:]
        self.set_chromosome(chr_name, chr_seq)
    elif not chr_name and not chr_seq:
        raise NoChromosomeFoundError(fp.name, chr_name, chr_seq)
    else:
        raise ChromosomeFASTAFromatError(fp.name, chr_name, chr_seq)
def fasta(self):
    """The FASTA file containing the filtered genes of this cluster.
    The names will now correspond to long descriptive names."""
    fasta = FASTA(self.p.fasta)
    if not fasta:
        fasta.create()
        for gene in self.filtered_genes: fasta.add_str(str(gene), name=gene.name)
        fasta.close()
    return fasta
def main():
    args      = parse_args()
    dihedrals = read_dihedrals()
    fasta     = FASTA(args.fasta)
    fasta.read()
    (peptides, mhcSeq, mhcAllele) = totalNineMers(fasta)
    #grooves = readGrooves(args.grooves, mhcSeq, peptides)
    universalGrooves = universalGroove(args.grooves, mhcSeq, peptides)
    intersectGrooves = IntersectionGroove(args.grooves, mhcSeq, peptides)
    #for u in universalGrooves:
    #    print(u, universalGrooves[u])
    #for u in intersectGrooves:
    #    print(intersectGrooves[u])
    labels = read_rmsd_file(args.rms)
    pdbids = read_datafile(args.t)
    outputfilehandler = open(args.pdbids, 'w')
    for pdbid in pdbids:
        if pdbid in dihedrals:
            # The two branches differed only in the encoding input #
            if args.pep: finalSeqCode = oneHotEncoding(peptides[pdbid])
            else:        finalSeqCode = oneHotEncoding(universalGrooves[pdbid] + peptides[pdbid])
            finalLabelCode = dihedrals[pdbid]
            if args.label == 'x':
                print(', '.join(finalSeqCode))
                outputfilehandler.write(pdbid + '\n')
            elif args.label == 'y':
                print(', '.join(finalLabelCode))
                outputfilehandler.write(pdbid + '\n')
    outputfilehandler.close()
def to_fasta(self, path, verbose=False):
    # Select verbosity #
    import tqdm
    wrapper = tqdm.tqdm if verbose else lambda x: x
    # Do it #
    with open(path, 'w') as handle:
        for r in wrapper(self): SeqIO.write(r, handle, 'fasta')
    # Return #
    return FASTA(path)
def fresh_fasta(self):
    """A file containing all the fresh water genes."""
    fasta = FASTA(self.p.fresh_fasta)
    if not fasta.exists:
        print "Building fasta file with all fresh genes..."
        fresh = [g for g in genomes.values() if g.fresh]
        shell_output('gunzip -c %s > %s' % (' '.join(fresh), fasta))
        assert len(fasta) == sum(map(len, fresh))
        self.timer.print_elapsed()
    return fasta
def fasta(self):
    """Make a FASTA file with all the UniProt proteins that are related
    to this family."""
    fasta = FASTA(self.p.proteins)
    if not fasta.exists:
        fasta.create()
        for seq in pfam.fasta:
            if self.fam_name in seq.description: fasta.add_seq(seq)
        fasta.close()
        assert fasta
    # Return #
    return fasta
def main():
    args  = parse_args()
    fasta = FASTA(args.fasta)
    fasta.read()
    (peptides, mhcSeq, mhcAllele) = totalNineMers(fasta)
    #grooves = readGrooves(args.grooves, mhcSeq, peptides)
    universalGrooves = universalGroove(args.grooves, mhcSeq, peptides)
    intersectGrooves = IntersectionGroove(args.grooves, mhcSeq, peptides)
    labels  = read_rmsd_file(args.rms)
    pdbids  = read_datafile(args.t)
    aaindex = Aaindex()
    #for result in aaindex.search('charge'):
    #    print(result)
    record = aaindex.get('FASG890101')
    #print(record.title)
    index_data = record.index_data
    #print(index_data)
    charge = aaindex.get('KLEP840101')
    charge_data = charge.index_data
    #print(charge_data)
    for l in labels:
        (pdbid1, pdbid2) = l.split('_')
        #if pdbid1 in pdbids and pdbid2 in pdbids:
        if pdbid1 in pdbids or pdbid2 in pdbids:
            # The two branches differed only in the encoding input #
            if args.pep:
                finalSeqCode, finalLabelCode = oneHotEncoding(
                    peptides[pdbid1] + '|' + peptides[pdbid2],
                    labels[l], index_data, charge_data)
            else:
                finalSeqCode, finalLabelCode = oneHotEncoding(
                    universalGrooves[pdbid1] + peptides[pdbid1] + '|' +
                    universalGrooves[pdbid2] + peptides[pdbid2],
                    labels[l], index_data, charge_data)
            if args.label == 'x':
                print(', '.join(finalSeqCode))
            elif args.label == 'y':
                print(', '.join(finalLabelCode))
def main():
    args  = parse_args()
    fasta = FASTA(args.fasta)
    fasta.read()
    peptides, alleles = totalNineMers(fasta)
    pdbids     = peptides.keys()
    testsetlen = int(args.percent * len(pdbids))
    trainset   = []
    testset    = []
    for p in pdbids:
        r = random()
        if len(testset) < testsetlen and r < 0.5 and alleles[p] == 'A0201':
            testset.append(p)
        else:
            trainset.append(p)
    write_to_file('train.txt', trainset)
    write_to_file('test.txt', testset)
def main():
    args  = parse_args()
    fasta = FASTA(args.fasta)
    fasta.read()
    peptides = totalNineMers(fasta)
    # Wrapped in list() so the keys can be indexed (dict.keys() is a view in Python 3) #
    pdbids     = list(peptides.keys())
    testsetlen = int(args.percent * len(pdbids))
    trainset   = []
    testset    = []
    for pdbid in pdbids:
        r = random()
        if len(testset) < testsetlen and r < 0.5:
            testset.append(pdbid)
        else:
            trainset.append(pdbid)
    write_to_file('train/90_10/train.txt', trainset)
    write_to_file('test/90_10/test.txt', testset)
def main():
    args  = parse_args()
    fasta = FASTA(args.fasta)
    fasta.read()
    (peptides, mhcSeq, mhcAllele) = totalNineMers(fasta)
    #grooves = readGrooves(args.grooves, mhcSeq, peptides)
    universalGrooves = universalGroove(args.grooves, mhcSeq, peptides)
    intersectGrooves = IntersectionGroove(args.grooves, mhcSeq, peptides)
    #for u in universalGrooves:
    #    print(u, universalGrooves[u])
    #for u in intersectGrooves:
    #    print(intersectGrooves[u])
    labels = read_rmsd_file(args.rms)
    pdbids = read_datafile(args.t)
    for l in labels:
        (pdbid1, pdbid2) = l.split('_')
        #if pdbid1 in pdbids and pdbid2 in pdbids:
        if pdbid1 in pdbids or pdbid2 in pdbids:
            # The two branches differed only in the encoding input #
            if args.pep:
                finalSeqCode, finalLabelCode = oneHotEncoding(
                    peptides[pdbid1] + '|' + peptides[pdbid2], labels[l])
            else:
                finalSeqCode, finalLabelCode = oneHotEncoding(
                    universalGrooves[pdbid1] + peptides[pdbid1] + '|' +
                    universalGrooves[pdbid2] + peptides[pdbid2], labels[l])
            if args.label == 'x':
                print(', '.join(finalSeqCode))
            elif args.label == 'y':
                print(', '.join(finalLabelCode))
def __init__(self, query_path, db_path,
             seq_type   = 'prot' or 'nucl',      # The seq type of the query_path file
             params     = None,                  # Add extra params for the command line
             algorithm  = "blastn" or "blastp",  # Will be auto-determined with seq_type
             out_path   = None,                  # Where the results will be dropped
             executable = None,                  # If you want a specific binary give the path
             cpus       = None,                  # The number of threads to use
             num        = None,                  # When parallelized, the number of this thread
             _out       = None,                  # Store the stdout at this path
             _err       = None):                 # Store the stderr at this path
    # Main input #
    self.query = FASTA(query_path)
    # The database to search against #
    self.db = FilePath(db_path)
    # Other attributes #
    self.seq_type  = seq_type
    self.algorithm = algorithm
    self.num       = num
    self.params    = params if params else {}
    # The standard output and error #
    self._out = _out
    self._err = _err
    # Output defaults #
    if out_path is None:         self.out_path = self.query.prefix_path + self.extension
    elif out_path.endswith('/'): self.out_path = out_path + self.query.prefix + self.extension
    else:                        self.out_path = out_path
    # Make it a file path #
    self.out_path = FilePath(self.out_path)
    # Executable #
    self.executable = FilePath(executable)
    # Cores to use #
    if cpus is None: self.cpus = min(multiprocessing.cpu_count(), 32)
    else:            self.cpus = cpus
    # Save the output somewhere #
    if self._out is True: self._out = self.out_path + '.stdout'
    if self._err is True: self._err = self.out_path + '.stderr'
def __init__(self, path, num_parts=None, part_size=None, base_dir=None):
    # Basic #
    self.path = path
    # Directory #
    if base_dir is None: self.base_dir = path + '.parts/'
    else:                self.base_dir = base_dir
    # Num parts #
    if num_parts is not None: self.num_parts = num_parts
    # Evaluate size #
    if part_size is not None:
        self.bytes_target = humanfriendly.parse_size(part_size)
        self.num_parts    = int(math.ceil(self.count_bytes / self.bytes_target))
    # Make parts #
    self.make_name = lambda i: self.base_dir + "%03d/part.fasta" % i
    self.parts     = [FASTA(self.make_name(i)) for i in range(1, self.num_parts + 1)]
    # Give a number to each part #
    for i, part in enumerate(self.parts): part.num = i
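# Standalone sketch of the part-size arithmetic used above: parse a human
# readable size and round the part count up. The numbers are illustrative.
import math
import humanfriendly

count_bytes  = 1234567890                          # e.g. os.path.getsize(path)
bytes_target = humanfriendly.parse_size('100MB')   # -> 100000000
num_parts    = int(math.ceil(count_bytes / float(bytes_target)))
print(num_parts)                                   # -> 13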
def test(self):
    """Search one sequence, and see if it works."""
    # New directory #
    directory = new_temp_dir()
    # A randomly chosen sequence (Homo sapiens mRNA for prepro cortistatin) #
    seq = """ACAAGATGCCATTGTCCCCCGGCCTCCTGCTGCTGCTGCTCTCCGGGGCCACGGCCACCGCTGCCCTGCC
    CCTGGAGGGTGGCCCCACCGGCCGAGACAGCGAGCATATGCAGGAAGCGGCAGGAATAAGGAAAAGCAGC
    CTCCTGACTTTCCTCGCTTGGTGGTTTGAGTGGACCTCCCAGGCCAGTGCCGGGCCCCTCATAGGAGAGG
    AAGCTCGGGAGGTGGCCAGGCGGCAGGAAGGCGCACCCCCCCAGCAATCCGCGCGCCGGGACAGAATGCC
    CTGCAGGAACTTCTTCTGGAAGACCTTCTCCTCCTGCAAATAAAACCTCACCCATGAATGCTCACGCAAG
    TTTAATTACAGACCTGAA"""
    seq = seq.replace('\n', '')
    seq = seq.replace(' ', '')
    # Make input #
    input_fasta = FASTA(directory + 'input.fasta')
    input_fasta.create()
    input_fasta.add_str(seq, "My test sequence")
    input_fasta.close()
    # Make output #
    out_path = directory + 'output.blast'
    # Make extra parameters #
    params = {'-outfmt': 0,
              '-evalue': 1e-5,
              '-perc_identity': 99}
    # Make the search #
    search = SeqSearch(input_fasta,
                       self.blast_db,
                       'nucl',
                       'blast',
                       num_threads = 1,
                       out_path    = out_path,
                       params      = params)
    # Run it #
    search.run()
    # Print result #
    print("Success", directory)
def read_fasta(args):
    fasta = FASTA(args.fasta)
    fasta.read()
    headers   = fasta.get_headers()
    pep_chain = {}
    pep_seq   = {}
    for header in headers:
        fields  = header.split('|')
        pdbid   = fields[0]
        chainid = fields[1]
        seq     = fasta.get_sequence(header)
        if len(seq) == 9:
            pep_chain[pdbid] = chainid
            pep_seq[pdbid]   = seq
    return (pep_chain, pep_seq)
def to_fasta(self, path):
    with open(path, 'w') as handle:
        for r in self: SeqIO.write(r, handle, 'fasta')
    return FASTA(path)
def all_proteins(self):
    """The main FASTA file."""
    return FASTA(self.p.unzipped_proteins)
proj.graphs[-1].plot()

# Get statistics #
proj.reporter.fraction_discarded
proj.reporter.size_fraction_chimeras

# Get clustering values #
r1, r2 = list(set([p.run for p in proj]))
r1.parse_report_xml()
r2.parse_report_xml()
print float(r1.report_stats['fwd']['DensityPF']) / float(r1.report_stats['fwd']['DensityRaw'])
print float(r2.report_stats['fwd']['DensityPF']) / float(r2.report_stats['fwd']['DensityRaw'])

# Check below 400 bp sequences #
folder = DirectoryPath(illumitag.projects['evaluation'].base_dir + "below_400/")
over = FASTA(folder + "reads.fasta")
def over_iterator(reads, max_length=400):
    for read in reads:
        if len(read) <= max_length: yield read
over.create()
for pool in pools:
    over.add_iterator(over_iterator(pool.good_barcodes.assembled.good_primers.qual_filtered))
over.close()
over.graphs[-1].plot()
crest = SimpleCrestTaxonomy(over, folder)
crest.assign()
crest.composition.graph.plot()
rdp = SimpleRdpTaxonomy(over, folder)
rdp.assign()
rdp.composition.graph.plot()

# Check unassembled mate pairs #
class Foraminifera(Database):
    """This is a custom database containing exclusively Foraminifera sequences.
    https://genev.unige.ch/research/laboratory/Jan-Pawlowski

    You should place the file "foram_db_cor.fasta" in: ~/databases/foraminifera/
    Then you can run this:

        from seqsearch.databases.foraminifera import foraminifera
        foraminifera.process()
        print foraminifera.tax_depth_freq
    """

    short_name = "foraminifera"
    long_name  = 'The custom made Foraminifera database as received by email on 7th April 2017'

    all_paths = """
    /foram_db_cor.fasta
    /foram_mothur.fasta
    /foram_mothur.tax
    """

    @property
    def rank_names(self):
        """The names of the ranks. Total 9 ranks."""
        return ['Domain',   # 0
                'Kingdom',  # 1
                'Phylum',   # 2
                'Class',    # 3
                'Order',    # 4
                'Family',   # 5
                'Tribe',    # 6
                'Genus',    # 7
                'Species']  # 8

    def __init__(self, base_dir=None):
        # Base directory #
        if base_dir is None: base_dir = home
        self.base_dir = base_dir + 'databases/' + self.short_name + '/'
        self.p        = AutoPaths(self.base_dir, self.all_paths)
        # The results #
        self.alignment = FASTA(self.p.mothur_fasta)
        self.taxonomy  = FilePath(self.p.mothur_tax)
        # The part that mothur will use for naming files #
        self.nickname = "foram_mothur"

    def process(self):
        # The file that was received by email without documentation T_T #
        raw = FASTA(self.p.cor)
        # Open files #
        self.alignment.create()
        self.taxonomy.create()
        # Loop #
        for seq in raw:
            # Parse #
            name = seq.id[11:].split('|')
            num  = name.pop(0)
            # Check #
            for x in name: assert ';' not in x
            for x in name: assert '\t' not in x
            # Make ranks #
            ranks = ['Eukaryota',     # 0 Domain
                     'Rhizaria',      # 1 Kingdom
                     'Foraminifera',  # 2 Phylum
                     name[0],         # 3 Class
                     name[1],         # 4 Order
                     name[2],         # 5 Family
                     name[3],         # 6 Tribe
                     name[4],         # 7 Genus
                     name[5]]         # 8 Species
            # The taxonomy string #
            tax_line = ';'.join(ranks)
            # Add sequence to the new fasta file #
            self.alignment.add_str(str(seq.seq), name="foram" + num)
            # Add the taxonomy to the tax file #
            self.taxonomy.add_str("foram" + num + '\t' + tax_line + '\n')
        # Close files #
        self.alignment.close()
        self.taxonomy.close()
def fasta(self):
    fasta = FASTA(self.autopaths.fasta)
    return fasta
def combine_rerun_with_orig(self):
    """Special case when a sample with low reads was rerun in another pool.
    Run this just before the combine_reads() method of the associated cluster.
    This method is called on the rerun sample, not the original."""
    # Check we have a rerun #
    if self.info.get('rerun') is None: return False
    # Check we are processed #
    assert self.fasta.count > 0
    # Get the original sample #
    run, pool, num = self.info['rerun']['run'], self.info['rerun']['pool'], self.info['rerun']['num']
    orig_sample = illumitag.runs[run][pool-1][num-1]
    merged = FASTA(orig_sample.base_dir + 'rerun_merged.fasta')
    # Check we don't merge twice #
    assert orig_sample.count == orig_sample.fasta.count
    # Do it #
    merged.create()
    merged.add(orig_sample.fasta)
    merged.add(self.fasta)
    merged.close()
    merged.rename_with_num(orig_sample.name + '_read', orig_sample.fasta)
    merged.remove()
    # Check #
    orig_sample.fasta = FASTA(orig_sample.fasta.path)
    assert orig_sample.count < orig_sample.fasta.count
    return True
class Cluster(object):
    """Analyzes a group of samples."""

    all_paths = """
    /reads/all_reads.fasta
    /otus/
    /logs/
    /report/report.pdf
    /metadata.csv
    """

    def __repr__(self): return '<%s object "%s" with %i samples>' % \
                               (self.__class__.__name__, self.name, len(self.samples))
    def __iter__(self): return iter(self.samples)
    def __len__(self): return len(self.samples)

    def __getitem__(self, key):
        if isinstance(key, basestring):
            return [c for c in self.children if c.short_name == key.lower()][0]
        elif isinstance(key, int) and hasattr(self.first, 'num'):
            return [c for c in self.children if c.num == key][0]
        else:
            return self.children[key]

    @property
    def first(self): return self.children[0]

    @property
    def count_seq(self): return sum([len(sample) for sample in self])

    def __init__(self, samples, name, base_dir=None):
        # Save samples #
        self.name = name
        self.samples, self.children = samples, samples
        # Check names are unique #
        names = [s.short_name for s in samples if s.used]
        assert len(names) == len(set(names))
        # Figure out pools #
        self.pools = list(set([s.pool for s in self.samples]))
        self.pools.sort(key=lambda x: x.id_name)
        # Directory #
        if base_dir: self.base_dir = base_dir
        else:        self.base_dir = illumitag.view_dir + "clusters/" + self.name + '/'
        # Loaded #
        self.loaded = False

    def load(self):
        """A second __init__ that is delayed and called only if needed."""
        # Load the pools and samples #
        for p in self.pools:   p.load()
        for s in self.samples: s.load()
        # Dir #
        self.p = AutoPaths(self.base_dir, self.all_paths)
        # Figure out if it's a project #
        if set(self.samples) == set(self.first.pool.project.samples):
            self.project = self.first.pool.project
        else:
            self.project = None
        # Runner #
        self.runner = ClusterRunner(self)
        # FASTA #
        self.reads = FASTA(self.p.all_reads_fasta)
        # OTU picking #
        self.otu_uparse = UparseOTUs(self)
        self.otu_uclust = UclustOTUs(self)
        self.otu_cdhit  = CdhitOTUs(self)
        # Preferred #
        self.otus = self.otu_uparse
        # Simple reporting #
        self.reporter = ClusterReporter(self)
        # Full report #
        self.report = ClusterReport(self)
        # Loaded #
        self.loaded = True
        # Return self for convenience #
        return self

    def run(self, *args, **kwargs):       self.runner.run(*args, **kwargs)
    def run_slurm(self, *args, **kwargs): self.runner.run_slurm(*args, **kwargs)

    def process_samples(self):
        for sample in tqdm(self): sample.process()

    def combine_reads(self):
        """This is the first function you should call. It will combine all the
        reads of all the samples of this cluster into one big FASTA file."""
        paths = [sample.fasta.path for sample in self]
        shell_output('cat %s > %s' % (' '.join(paths), self.reads))
        return self.reads

    def set_size(self, length):
        """Trim all sequences to a specific length starting from the end."""
        self.size_trimmed = FASTA(new_temp_path())
        def trim_iterator(reads):
            for read in reads:
                if len(read) < length: continue
                yield read[-length:]
        self.size_trimmed.write(trim_iterator(self.reads))
        self.size_trimmed.close()
        # Replace it #
        self.reads.remove()
        shutil.move(self.size_trimmed, self.reads)

    def run_uparse(self): self.otu_uparse.run()

    @property
    def metadata(self):
        return pandas.DataFrame([s.info for s in self], index=[s.short_name for s in self])

    def export_metadata(self):
        self.metadata.to_csv(self.p.metadata, sep='\t', encoding='utf-8')
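# A minimal driver sketch for the class above (not from the original source);
# `samples` stands for a list of sample objects from this pipeline.
cluster = Cluster(samples, 'my_project')
cluster.load()              # the delayed second __init__
cluster.combine_reads()     # concatenate every sample's FASTA into one file
cluster.run_uparse()        # pick OTUs with the preferred method
cluster.export_metadata()   # write the tab-separated metadata table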
from fasta import FASTA, AlignedFASTA
community = FASTA('community.fasta')
alignment = AlignedFASTA('alignment.fasta')
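# Sketch of the basic write/read cycle these wrappers expose, using only
# methods that appear elsewhere in this section; paths are illustrative.
community.create()
community.add_str('ACGTACGTACGT', name='seq_one')
community.close()
print(len(community))       # -> 1
for seq in community:       # iterates Bio.SeqRecord-like objects
    print(seq.id, len(seq))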
class PrimerGroup(object):
    """A bunch of sequences all having the same type of primer outcome
    (and assembly outcome)."""

    all_paths = """
    /orig.fastq
    /n_filtered.fastq
    /qual_filtered.fastq
    /len_filtered.fastq
    /trimmed_barcodes.fasta
    """

    qual_threshold  = 5
    qual_windowsize = 10
    min_length      = 400

    def __repr__(self): return '<%s object of %s>' % (self.__class__.__name__, self.parent)
    def __len__(self): return len(self.orig_reads)

    def create(self): self.orig_reads.create()
    def add_seq(self, read): self.orig_reads.add_seq(read)
    def close(self): self.orig_reads.close()

    def __init__(self, parent):
        # Save parent #
        self.parent, self.assemble_group = parent, parent
        self.samples = parent.samples
        self.pool    = self.parent.pool
        self.primers = self.pool.primers
        # Auto paths #
        self.base_dir = parent.p.groups_dir + self.short_name + '/'
        self.p = AutoPaths(self.base_dir, self.all_paths)
        # More #
        self.orig_reads = self.parent.cls(self.p.orig_fastq, samples=self.samples)
        self.n_filtered = self.parent.cls(self.p.n_filtered, samples=self.samples)
        # Quality filtered #
        if self.parent == 'assembled':
            self.qual_filtered = BarcodedFASTQ(self.p.qual_filtered, samples=self.samples, primers=self.primers)
            self.len_filtered  = BarcodedFASTQ(self.p.len_filtered_fastq, samples=self.samples, primers=self.primers)
            self.trimmed_barcodes = FASTA(self.p.trimmed_barcodes)
        # Further #
        self.load()

    def load(self): pass

    def n_filter(self):
        """Called from AssembleGroup.discard_reads_with_n"""
        def no_n_iterator(reads):
            fwd_len = self.pool.primers.fwd_len
            rev_len = self.pool.primers.rev_len
            for read in reads:
                if 'N' in read[fwd_len:-rev_len]: continue
                yield read
        self.n_filtered.write(no_n_iterator(self.orig_reads))

    def qual_filter(self):
        """Called from Assemble.quality_filter"""
        def good_qual_iterator(reads):
            for read in reads:
                averaged = moving_average(read.letter_annotations["phred_quality"], self.qual_windowsize)
                if any([value < self.qual_threshold for value in averaged]): continue
                yield read
        self.qual_filtered.write(good_qual_iterator(self.n_filtered))

    def len_filter(self):
        """Called from Assemble.length_filter"""
        def good_len_iterator(reads):
            for read in reads:
                if len(read) < self.min_length: continue
                yield read
        self.len_filtered.write(good_len_iterator(self.qual_filtered))

    def trim_bc(self):
        """Called from Assemble.trim_barcodes"""
        def no_barcodes_iterator(reads):
            for read in reads:
                yield read[self.pool.bar_len:-self.pool.bar_len]
        if self.pool.bar_len == 0:
            self.len_filtered.to_fasta(self.trimmed_barcodes)
        else:
            self.trimmed_barcodes.write(no_barcodes_iterator(self.len_filtered))
emeraldo_like = {
    # NB: the opening of this dict is truncated in this excerpt; by analogy
    # with `non_emeraldo` below, the surviving value belongs to 'regions_filename'.
    'regions_filename': 'TTDB/TriTrypDB-46_TcruziCLBrenerEsmeraldo-like_AnnotatedCDSs.fasta',
    'organism': 'TcruziCLBrenerEsmeraldo-like'
}

non_emeraldo = {
    'genome_filename':  'TTDB/TriTrypDB-46_TcruziCLBrenerNon-Esmeraldo-like_Genome.fasta',
    'regions_filename': 'TTDB/TriTrypDB-46_TcruziCLBrenerNon-Esmeraldo-like_AnnotatedCDSs.fasta',
    'organism':         'TcruziCLBrenerNon-Esmeraldo-like'
}

organism = emeraldo_like

if __name__ == "__main__":
    # Load FASTA files
    genome = FASTA(organism['genome_filename'])
    genome.load()
    regions = FASTA(organism['regions_filename'])
    regions.load()

    # Load database file
    sqlite = sqlite3.connect(SQLite_DB)

    # Create MFASeq folder
    Organism_MFASeq_folder = f"{MFASeq_folder}/MFA-Seq_{organism['organism']}"
    if not os.path.isdir(Organism_MFASeq_folder):
        os.mkdir(Organism_MFASeq_folder)

    # Create MFASeq files
    for chromosome_id in genome.data.keys():
class QualityReads(object):
    """A set of sequences determined to be quality controlled."""

    all_paths = """
    /mothur_reads.fasta
    /mothur_reads.qual
    /mothur_groups.tsv
    /qiime_reads.fasta
    /only_used_samples.fasta
    /trimmed.fasta
    """

    def __repr__(self): return '<%s object of %s>' % (self.__class__.__name__, self.parent)
    def __len__(self): return len(self.trimmed)

    def __init__(self, path, parent):
        # Save parent #
        self.parent, self.pool = parent, parent
        self.samples = parent.samples
        # Auto paths #
        self.base_dir = parent.p.quality_dir + '/'
        self.p = AutoPaths(self.base_dir, self.all_paths)
        # Files #
        self.untrimmed = BarcodedFASTQ(path, samples=self.samples)
        self.only_used = BarcodedFASTA(self.p.only_used, samples=self.samples)
        self.trimmed   = FASTA(self.p.trimmed)
        # Qiime output #
        self.qiime_fasta = FASTA(self.p.qiime_fasta)
        # Mothur #
        self.mothur_fasta  = FASTA(self.p.mothur_fasta)
        self.mothur_qual   = QualFile(self.p.mothur_qual)
        self.mothur_groups = FilePath(self.p.mothur_groups)
        # Primer size #
        self.trim_fwd = self.pool.samples.trim_fwd
        self.trim_rev = self.pool.samples.trim_rev

    def filter_unused(self):
        def no_unused_iterator(reads):
            for r in reads.parse_barcodes():
                if r.first.sample.used: yield r.read
        self.only_used.write(no_unused_iterator(self.untrimmed))

    def trim_primers(self):
        def no_primers_iterator(reads):
            for read in reads:
                yield read[self.trim_fwd:-self.trim_rev]
        self.trimmed.write(no_primers_iterator(self.only_used))

    def make_mothur_output(self):
        # Trimmed fasta #
        self.mothur_fasta.link_from(self.trimmed.path)
        # The groups file #
        self.mothur_groups.create()
        for r in self.only_used.parse_barcodes():
            sample_name = r.first.sample.short_name
            read_name   = '%s\t%s\n' % (r.read.id, sample_name)
            self.mothur_groups.handle.write(read_name)
        self.mothur_groups.close()

    def make_qiime_output(self):
        # Prepare fasta writer #
        handle = open(self.qiime_fasta.path, 'w')
        writer = FastaWriter(handle, wrap=0)
        writer.write_header()
        # Counter #
        counter = defaultdict(int)
        # Do it #
        for r in self.only_used.parse_barcodes():
            sample_name = r.first.sample.short_name
            counter[sample_name] += 1
            r.read.id = '%s_%i %s' % (sample_name, counter[sample_name], r.read.id)
            bar_seq = r.read.seq[0:self.pool.bar_len]
            r.read.description = "orig_bc=%s new_bc=%s bc_diffs=0" % (bar_seq, bar_seq)
            writer.write_record(r.read[self.trim_fwd:-self.trim_rev])
        # Close #
        writer.write_footer()
        handle.close()
class Silva(Database):
    """SILVA provides comprehensive, quality-checked and regularly updated
    datasets of aligned small (16S/18S, SSU) and large subunit (23S/28S, LSU)
    ribosomal RNA (rRNA) sequences for all three domains of life
    (Bacteria, Archaea and Eukarya). The SILVA databases are the official
    databases of the software package ARB.

    https://www.arb-silva.de

    To install:

        from seqsearch.databases.silva import silva
        silva.download()
        silva.unzip()

    It will put the files in ~/databases/silva_xxx/
    """

    view_url   = "https://www.arb-silva.de/no_cache/download/archive/"
    base_url   = "https://www.arb-silva.de/fileadmin/silva_databases/"
    short_name = "silva"

    all_paths = """
    /test.txt
    """

    def __init__(self, version, seq_type, base_dir=None):
        # Attributes #
        self.version    = version
        self.seq_type   = seq_type
        self.short_name = self.short_name + "_" + self.version
        # Base directory #
        if base_dir is None: base_dir = home
        self.base_dir = base_dir + 'databases/' + self.short_name + '/'
        self.p = AutoPaths(self.base_dir, self.all_paths)
        # URL #
        self.url = "release_%s/Exports/" % self.version
        # The database #
        self.nr99_name = "SILVA_%s_SSURef_Nr99_tax_silva.fasta.gz" % self.version
        self.nr99_dest = FASTA(self.base_dir + self.nr99_name)
        self.nr99      = FASTA(self.base_dir + self.nr99_name[:-3])
        # The alignment #
        self.aligned_name = "SILVA_%s_SSURef_Nr99_tax_silva_full_align_trunc.fasta.gz" % self.version
        self.aligned_dest = FASTA(self.base_dir + self.aligned_name)
        self.aligned      = FASTA(self.base_dir + self.aligned_name[:-3])

    def download(self):
        self.nr99_dest.directory.create(safe=True)
        self.nr99_dest.remove(safe=True)
        self.aligned_dest.remove(safe=True)
        import wget
        print("\nDownloading", self.base_url + self.url + self.nr99_name)
        wget.download(self.base_url + self.url + self.nr99_name, out=self.nr99_dest.path)
        print("\nDownloading", self.base_url + self.url + self.aligned_name)
        wget.download(self.base_url + self.url + self.aligned_name, out=self.aligned_dest.path)

    def unzip(self):
        self.nr99_dest.ungzip_to(self.nr99)
        self.nr99.permissions.only_readable()
        self.aligned_dest.ungzip_to(self.aligned)
        self.aligned.permissions.only_readable()
class PairedFASTA(object):
    """Read and write FASTA file pairs without using too much RAM."""

    format = 'fasta'

    def __len__(self): return self.count
    def __iter__(self): return self.parse()
    def __nonzero__(self): return bool(self.fwd) and bool(self.rev)

    def __repr__(self): return '<%s object on "%s" and "%s">' % \
                               (self.__class__.__name__, self.fwd.path, self.rev.path)

    def __enter__(self): return self.create()
    def __exit__(self, exc_type, exc_value, traceback): self.close()

    @property
    def exists(self): return self.fwd.exists and self.rev.exists

    def __init__(self, fwd, rev, parent=None):
        # FASTA objects #
        self.fwd = FASTA(fwd)
        self.rev = FASTA(rev)
        # Extra #
        self.gzipped = self.fwd.gzipped
        self.parent  = parent

    @property_cached
    def count(self):
        assert self.fwd.count == self.rev.count
        return self.fwd.count

    def open(self):
        self.fwd.open()
        self.rev.open()

    def parse(self): return izip(self.fwd.parse(), self.rev.parse())

    def close(self):
        self.fwd.close()
        self.rev.close()

    def create(self):
        self.fwd.create()
        self.rev.create()
        return self

    def add(self, f, r): return self.add_pair((f, r))

    def add_pair(self, pair):
        self.fwd.add_seq(pair[0])
        self.rev.add_seq(pair[1])

    def remove(self):
        self.fwd.remove()
        self.rev.remove()

    @property
    def progress(self):
        """Just like self.parse but displays a progress bar."""
        return tqdm(self, total=len(self))

    def subsample(self, down_to, dest_pair=None):
        # Check size #
        assert down_to < len(self)
        # Make a new pair of files #
        if dest_pair is None:
            dest_fwd_path = self.fwd.path.new_name_insert("subsampled")
            dest_rev_path = self.rev.path.new_name_insert("subsampled")
            dest_pair     = self.__class__(dest_fwd_path, dest_rev_path)
        # Do it #
        dest_pair.create()
        for pair in isubsample(self, down_to): dest_pair.add_pair(pair)
        dest_pair.close()
        # Did it work #
        assert len(dest_pair) == down_to

    #------------------------------- Extensions ------------------------------#
    def parse_primers(self, *args, **kwargs):
        fwd_gen   = self.fwd.parse_primers(*args, **kwargs)
        rev_gen   = self.rev.parse_primers(*args, **kwargs)
        generator = izip(fwd_gen, rev_gen)
        return GenWithLength(generator, len(fwd_gen))
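# Usage sketch for PairedFASTA (not from the original source): write a pair
# record by record, then subsample it. `fwd_rec` and `rev_rec` stand for any
# two Bio.SeqRecord objects; file names are illustrative.
pair = PairedFASTA('fwd.fasta', 'rev.fasta').create()
pair.add_pair((fwd_rec, rev_rec))
pair.close()
small = PairedFASTA('fwd.sub.fasta', 'rev.sub.fasta')
pair.subsample(down_to=1000, dest_pair=small)   # create() is called internally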
class UclustOTUs(OTUs):
    """Will use UCLUST via the QIIME wrapper to create OTU clusters
    from a given FASTA file.
    http://qiime.org/scripts/pick_otus.html"""

    short_name = 'uclust'
    title      = 'UCLUST-QIIME denovo picking'

    all_paths = """
    /clusters/clusters.uc
    /clusters/qiime.log
    /clusters/all_otus.txt
    /clusters/all_centers.fasta
    /centers.fasta
    /otus.txt
    /taxonomy_silva/
    /taxonomy_fw/
    /graphs/
    """

    def __repr__(self): return '<%s object of %s>' % (self.__class__.__name__, self.parent)

    def __init__(self, cluster):
        # Save parent #
        self.cluster, self.parent = cluster, cluster
        # Inherited #
        self.samples = self.parent.samples
        # Paths #
        self.base_dir = self.parent.p.otus_dir + self.short_name + '/'
        self.p = AutoPaths(self.base_dir, self.all_paths)
        # Main FASTA file #
        self.reads = self.parent.reads
        # Files #
        self.all_otus    = FilePath(self.p.all_otus)
        self.all_centers = FASTA(self.p.all_centers)
        self.otus        = FilePath(self.base_dir + "otus.txt")
        self.centers     = FASTA(self.base_dir + "centers.fasta")
        # Taxonomy #
        self.taxonomy_silva = CrestTaxonomy(self.centers, self, 'silvamod', self.p.silva)
        self.taxonomy_fw    = CrestTaxonomy(self.centers, self, 'freshwater', self.p.fw_dir)
        # Preferred one #
        self.taxonomy = self.taxonomy_silva

    def run(self):
        # Clean #
        shutil.rmtree(self.p.clusters_dir)
        # Run command #
        pick_otus = sh.Command('pick_otus.py')
        pick_otus('-m', 'uclust', '-i', self.reads, '-o', self.p.clusters_dir)
        # Move into place #
        base_name = self.p.clusters_dir + self.reads.prefix
        shutil.move(base_name + '_otus.txt',    self.all_otus)
        shutil.move(base_name + '_otus.log',    self.p.qiime_log)
        shutil.move(base_name + '_clusters.uc', self.p.clusters_uc)
        # Remove OTUs that are made of only one read #
        def filter_singletons(f):
            for line in f:
                line = line.split()
                if len(line) > 2: yield '\t'.join(line) + '\n'
        self.otus.writelines(filter_singletons(self.all_otus))
        # Create the centers file that is missing #
        pick_rep = sh.Command('pick_rep_set.py')
        pick_rep('-i', self.all_otus, '-f', self.reads, '-o', self.all_centers)
        # Remake the centers file without the filtered OTUs #
        self.otus_to_keep = [line.split()[0] for line in self.otus]
        def filter_otus(f):
            for seq in f:
                if seq.id in self.otus_to_keep: yield seq
        self.centers.write(filter_otus(self.all_centers))

    @property_cached
    def cluster_counts_table(self):
        """Create the unfiltered OTU table."""
        # Put results in a dict of dicts #
        result = defaultdict(lambda: defaultdict(int))
        # Loop #
        for line in self.otus:
            # Parse the line #
            contents = line.split()
            otu, reads = contents[0], contents[1:]
            # Parse the hits #
            for r in reads:
                nums = re.findall("run([0-9]+)_pool([0-9]+)_sample([0-9]+)_read([0-9]+)", r)
                if nums:
                    run_num, pool_num, sample_num, read_num = map(int, nums[0])
                    sample = illumitag.runs[run_num][pool_num-1][sample_num-1]
                    name = sample.short_name
                else:
                    nums = re.findall("run([0-9]+)_sample([0-9]+)_read([0-9]+)", r)
                    run_num, sample_num, read_num = map(int, nums[0])
                    sample = [s for s in illumitag.presamples + illumitag.pyrosamples
                              if s.run_num == run_num and s.num == sample_num][0]
                    name = sample.short_name
                # Count #
                result[otu][name] += 1
        # Return #
        result = pandas.DataFrame(result)
        result = result.fillna(0)
        result = result.astype(int)
        result = result.reindex_axis(sorted(result.columns, key=natural_sort), axis=1)
        return result
""" # Built-in modules # import inspect, os # Internal modules # from seqsearch.databases.ncbi_16s import ncbi_16s from seqsearch.search.blast import BLASTquery # First party modules # from fasta import FASTA # Get current directory # file_name = inspect.getframeinfo(inspect.currentframe()).filename this_dir = os.path.dirname(os.path.abspath(file_name)) + '/' ############################################################################### if __name__ == "__main__": # Main input # seqs = FASTA(this_dir + 'seqs.fasta') # The database to search against # db = ncbi_16s.blast_db # Create search # query = BLASTquery(seqs, db) # Run # query.run()
class UparseOTUs(OTUs):
    """Will use UPARSE to create OTU clusters from a given FASTA file.
    http://www.nature.com/doifinder/10.1038/nmeth.2604"""

    short_name = 'uparse'
    title      = 'UPARSE denovo picking'
    article    = "http://www.nature.com/doifinder/10.1038/nmeth.2604"
    version    = uparse_version
    threshold  = 3.0

    all_paths = """
    /derep.fasta
    /sorted.fasta
    /centers.fasta
    /readmap.uc
    /taxonomy_silva/
    /taxonomy_fw/
    /taxonomy_unite/
    /taxonomy_rdp/
    /graphs/
    /seqenv/
    """

    def __repr__(self): return '<%s object of %s>' % (self.__class__.__name__, self.parent)
    def __len__(self): return 0

    def __init__(self, cluster):
        # Save parent #
        self.cluster, self.parent = cluster, cluster
        # Inherited #
        self.samples = self.parent.samples
        # Paths #
        self.base_dir = self.parent.p.otus_dir + self.short_name + '/'
        self.p = AutoPaths(self.base_dir, self.all_paths)
        # Main FASTA file #
        self.reads = self.parent.reads
        # Files #
        self.derep   = SizesFASTA(self.p.derep)
        self.sorted  = SizesFASTA(self.p.sorted)
        self.centers = FASTA(self.p.centers)
        self.readmap = UClusterFile(self.p.readmap)
        # Taxonomy #
        self.taxonomy_silva = CrestTaxonomy(self.centers, self, 'silvamod',   self.p.silva_dir)
        self.taxonomy_fw    = CrestTaxonomy(self.centers, self, 'freshwater', self.p.fw_dir)
        self.taxonomy_unite = CrestTaxonomy(self.centers, self, 'unite',      self.p.unite_dir)
        self.taxonomy_rdp   = RdpTaxonomy(self.centers, self)
        # Preferred one #
        self.taxonomy = self.taxonomy_silva
        # Source tracking #
        self.seqenv = Seqenv(self)

    def run(self, threshold=None):
        # Optional threshold #
        if threshold is None: threshold = self.threshold
        identity = (100 - threshold) / 100
        # Dereplicate (the 32-bit version of usearch7 runs out of memory) #
        if False: sh.usearch7("--derep_fulllength", self.reads, '-output', self.derep, '-sizeout')
        sh.fasta_make_unique(self.reads, self.derep)
        # Order by size and kill singletons #
        sh.usearch7("--sortbysize", self.derep, '-output', self.sorted, '-minsize', 2)
        # Compute the centers #
        sh.usearch7("--cluster_otus", self.sorted, '-otus', self.centers, '-otu_radius_pct', threshold)
        # Rename the centers #
        self.centers.rename_with_num('OTU-')
        # Map the reads back to the centers #
        sh.usearch7("-usearch_global", self.reads, '-db', self.centers, '-strand', 'plus',
                    '-id', identity, '-uc', self.readmap)

    def checks(self):
        assert len(self.reads) == len(self.derep)
        assert len(self.reads) == len(self.readmap)

    @property_cached
    def cluster_counts_table(self):
        """Parse that custom output to create the unfiltered OTU table."""
        result = pandas.DataFrame(self.readmap.otu_sample_counts)
        result = result.fillna(0)
        result = result.astype(int)
        result = result.reindex_axis(sorted(result.columns, key=natural_sort), axis=1)
        return result
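# Sketch: how the UPARSE picker above is typically driven from a loaded
# Cluster object (see the Cluster class earlier in this section).
otus = UparseOTUs(cluster)
otus.run(threshold=3.0)       # i.e. a 97% identity radius
otus.checks()                 # sanity check: no reads lost along the way
table = otus.cluster_counts_table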
"""We explore the client given inputs, check for problems, then format them and store them in the repository as immutable artifacts (compressed text files)""" import inspect, os, glob, pandas from fasta import FASTA current_script = inspect.getframeinfo(inspect.currentframe()).filename current_dir = os.path.dirname(os.path.abspath(current_script)) + '/' genomes_dir = current_dir + '../ld12/data/genomes/' input_dir = "/proj/b2013274/mcl/" faa_paths = sorted(glob.glob(input_dir + '*.faa')) fna_paths = sorted(glob.glob(input_dir + '*.fna')) faas = [FASTA(faa) for faa in faa_paths if '647533246' not in faa] fnas = [FASTA(fna) for fna in fna_paths if '647533246' not in fna] faas_nums = [int(g.short_prefix) for g in faas] fnas_nums = [int(g.short_prefix) for g in fnas] metadata = pandas.io.parsers.read_csv(current_dir + '../ld12/data/metadata.tsv', sep='\t', index_col=0, encoding='utf-8') meta_nums = list(metadata.index) print set(faas_nums) ^ set(fnas_nums) print set(faas_nums) ^ set(meta_nums) def strip(seq):
class CdhitOTUs(OTUs):
    """Will use cd-hit to create OTU clusters from a given FASTQ file.
    http://weizhong-lab.ucsd.edu/cd-hit-otu/"""

    short_name = "cdhit"
    title      = "CD-HIT Illumina OTU picking"

    all_paths = """
    /all_reads.fastq
    /clusters/OTU.nr2nd.clstr
    /centers.fasta
    /otus.txt
    /taxonomy_silva/
    /taxonomy_fw/
    /graphs/
    """

    def __repr__(self): return "<%s object of %s>" % (self.__class__.__name__, self.parent)

    def __init__(self, cluster):
        # Save parent #
        self.cluster, self.parent = cluster, cluster
        # Inherited #
        self.samples = self.parent.samples
        # Paths #
        self.base_dir = self.parent.p.otus_dir + self.short_name + "/"
        self.p = AutoPaths(self.base_dir, self.all_paths)
        # Main reads file, here FASTQ #
        self.reads = FASTQ(self.p.all_reads)
        # Files #
        self.cdhit_clusters = FilePath(self.p.clstr)
        self.cdhit_centers  = FASTA(self.p.clusters_dir + "OTU")
        self.centers        = FASTA(self.p.centers)
        # Taxonomy #
        self.taxonomy_silva = CrestTaxonomy(self.centers, self, "silvamod", self.p.silva)
        self.taxonomy_fw    = CrestTaxonomy(self.centers, self, "freshwater", self.p.fw_dir)
        # Preferred one #
        self.taxonomy = self.taxonomy_silva

    def run(self):
        # Combine reads, but in FASTQ format this time #
        paths = [sample.renamed for sample in self.cluster]
        shell_output("cat %s > %s" % (" ".join(paths), self.reads))
        # Clean #
        shutil.rmtree(self.p.clusters_dir)
        # Run command #
        cdhit = sh.Command(cdhit_script)
        cdhit("-i", self.reads, "-o", self.p.clusters_dir, "-p", TmpFile.from_string("[ACTG]"))
        # Create the centers file with good names #
        self.cdhit_centers.rename_with_num("OTU-", self.centers)

    @property_cached
    def cluster_counts_table(self):
        """Create the unfiltered OTU table."""
        # Put results in a dict of dicts #
        result = defaultdict(lambda: defaultdict(int))
        # Loop #
        for line in self.cdhit_clusters:
            if line.startswith(">"):
                otu = "OTU-%s" % line.split()[1]
                continue
            nums = re.findall(">run([0-9]+)_pool([0-9]+)_sample([0-9]+)_read([0-9]+)\.\.\.", line)
            if nums:
                run_num, pool_num, sample_num, read_num = map(int, nums[0])
                sample = illumitag.runs[run_num][pool_num - 1][sample_num - 1]
                name = sample.short_name
            else:
                nums = re.findall(">run([0-9]+)_sample([0-9]+)_read([0-9]+)\.\.\.", line)
                run_num, sample_num, read_num = map(int, nums[0])
                sample = [s for s in illumitag.presamples + illumitag.pyrosamples
                          if s.run_num == run_num and s.num == sample_num][0]
                name = sample.short_name
            # Count #
            result[otu][name] += 1
        # Return #
        result = pandas.DataFrame(result)
        result = result.fillna(0)
        result = result.astype(int)
        result = result.reindex_axis(sorted(result.columns, key=natural_sort), axis=1)
        # Remove OTUs that are only one read #
        return result
def __init__(self, fasta_path, seq_type='nucl' or 'prot'):
    # Check if the FASTA already has the seq_type set #
    if hasattr(fasta_path, 'seq_type'): self.seq_type = fasta_path.seq_type
    else:                               self.seq_type = seq_type
    # Call the parent constructor #
    FASTA.__init__(self, fasta_path)
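# Note on the default above: `'nucl' or 'prot'` is evaluated once, and `or`
# returns its first truthy operand, so the default is always 'nucl'. The
# idiom merely documents the two accepted values; callers wanting 'prot'
# must pass it explicitly.
print('nucl' or 'prot')   # -> nucl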
    seq = seq.split('[')[0]
    return seq

for faa, fna in zip(faas, fnas):
    faas_genes = [strip(seq) for seq in faa]
    fnas_genes = [strip(seq) for seq in fna]
    print faa, len(set(fnas_genes) ^ set(faas_genes)), "discrepancies"
    #print "- in fna but not in faa:", [x for x in set(fnas_genes) - set(faas_genes)]
    #print "- in faa but not in fna:", [x for x in set(faas_genes) - set(fnas_genes)]
    #print ""

fnas_genes = [strip(seq) for fna in fnas for seq in fna]
print len(fnas_genes), len(set(fnas_genes))

for genome in faas:
    out_path  = genomes_dir + genome.short_prefix + '.fasta'
    out_fasta = FASTA(out_path)
    out_fasta.create()
    for seq in genome: out_fasta.add_str(str(seq.seq), strip(seq))
    out_fasta.close()
    out_fasta.gzip_to()
    out_fasta.remove()

def lines():
    for genome in faas:
        for gene in genome:
            name = strip(gene)
            yield name + '\t' + gene.description[len(name):].rstrip(' |') + '\n'

annotations_path = current_dir + '../ld12/data/annotations.tsv'
with open(annotations_path, 'w') as handle: handle.writelines(lines())
def subsampled(self):
    subsampled = FASTA(self.p.subsampled)
    if not subsampled.exists:
        self.fasta.subsample(down_to=30, new_path=subsampled)
        self.add_taxonomy(subsampled)
    return subsampled
class Pyrosample(object):
    """A Pyrosample is a legacy object for the few 454 samples we still have
    and that we need to compare against the new Illumina technology."""

    all_paths = """
    /info.json
    /reads.fasta
    /renamed.fasta
    /raw/raw.sff
    /raw/raw.fastq
    /raw/raw.fasta
    /raw/raw.qual
    /raw/manifest.txt
    /fastq/reads.fastq
    """

    kind = "pyrosample"

    def __repr__(self): return '<%s object "%s">' % (self.__class__.__name__, self.id_name)

    def __init__(self, json_path, out_dir):
        # Attributes #
        self.out_dir   = out_dir
        self.json_path = FilePath(json_path)
        # Parse #
        self.info = load_json_path(self.json_path)
        # Basic #
        self.account            = "/dev/null"
        self.run_num            = self.info['run_num']
        self.run_label          = "pyrosample_run_%i" % self.run_num
        self.project_short_name = self.info['project']
        self.project_long_name  = self.info['project_name']
        # Own attributes #
        self.num        = self.info['sample_num']
        self.short_name = self.info['sample']
        self.long_name  = self.info['sample_name']
        self.name       = 'run%i_sample%i' % (self.run_num, self.num)
        self.group      = self.info['group']
        self.id_name    = "run%03d-sample%02d" % (self.run_num, self.num)
        # Hard coded attributes #
        self.machine = "454 GS FLX Titanium"
        # SFF files #
        self.sff_files_info = self.info['files']
        # Pool dummy #
        self.pool, self.parent = self, self
        # Other dummy variables #
        self.bar_len = 0
        self.gzipped = False
        self.used    = True
        # Loaded #
        self.loaded = False

    def load(self):
        """A second __init__ that is delayed and called only if needed."""
        # Check files are there #
        for f in self.sff_files_info:
            if not os.path.exists(f['path']): raise Exception("No file at %s" % f['path'])
        # Automatic paths #
        self.base_dir = self.out_dir + self.id_name + '/'
        self.p = AutoPaths(self.base_dir, self.all_paths)
        # Make an alias to the json #
        self.p.info_json.link_from(self.json_path, safe=True)
        # Primer #
        self.primer_regex = re.compile(self.info['primer'])
        # Raw files #
        self.raw_fasta = FASTA(self.p.raw_fasta)
        self.raw_fastq = FASTQ(self.p.raw_fastq)
        # Standard FASTA #
        self.reads = FASTA(self.p.reads_fasta)
        self.fasta = FASTA(self.p.renamed)
        # Special FASTQ #
        self.fastq = FASTQ(self.p.reads_fastq)
        # A shameless hack for cdhit to work #
        self.renamed = self.fastq
        # Pre-denoised special case #
        if self.info['predenoised'] and False:
            self.sff_files_info = []
            self.reads.link_from(self.info['predenoised'], safe=True)
        # Special submission attributes #
        self.sra = PyroSampleSRA(self)
        # Loaded #
        self.loaded = True
        # Return self for convenience #
        return self

    @property
    def mate(self):
        if not 'mate' in self.info: return False
        run_num     = self.info['mate']['run']
        pool_num    = self.info['mate']['pool']
        barcode_num = self.info['mate']['num']
        return illumitag.runs[run_num][pool_num-1][barcode_num-1]

    def extract(self):
        # Call extraction #
        shell_output('sffinfo -s %s > %s' % (self.p.raw_sff, self.p.raw_fasta))
        shell_output('sffinfo -q %s > %s' % (self.p.raw_sff, self.p.raw_qual))
        shell_output('sffinfo -m %s > %s' % (self.p.raw_sff, self.p.manifest))
        # Convert #
        sh.fasta_to_fastq(self.p.raw_fasta, self.p.raw_qual, self.p.raw_fastq)

    def clean_iterator(self, reads, minlength=400, threshold=21, windowsize=20):
        for read in reads:
            # Length #
            if len(read) < minlength: continue
            # Primer #
            match = self.primer_regex.search(str(read.seq))
            if not match: continue
            # PHRED score #
            scores   = read.letter_annotations["phred_quality"]
            averaged = moving_average(scores, windowsize)
            discard  = False
            for i, value in enumerate(averaged):
                if value < threshold:
                    read = read[:i+windowsize-1]
                    if len(read) < minlength: discard = True
                    break
            if discard: continue
            # Undetermined bases #
            if 'N' in read: continue
            # Remove primer #
            read = read[match.end():]
            # Flip them because 454 reads from the other end #
            read = read.reverse_complement()
            # Return #
            yield read

    def clean(self, **kwargs):
        self.reads.write(self.clean_iterator(self.raw_fastq, **kwargs))

    def report_loss(self):
        print "Before cleaning: %i" % len(self.raw_fastq)
        print "After cleaning: %i" % len(self.reads)
        print "Loss: %.2f%%" % (100 * (1 - (len(self.reads) / float(len(self.raw_fastq)))))

    def process(self):
        self.reads.rename_with_num(self.name + '_read', new_path=self.fasta)

    def make_fastq(self, **kwargs):
        """In some special cases we want the FASTQ."""
        self.fastq.write(self.clean_iterator(self.raw_fastq, **kwargs))
        self.fastq.rename_with_num(self.name + '_read')
        print "make_fastq for sample %s completed" % self.id_name
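# One plausible implementation of the moving_average() helper used by
# clean_iterator above; the real helper lives elsewhere in this codebase.
def moving_average(scores, windowsize):
    """Yield the mean of every consecutive window of `windowsize` scores."""
    for i in range(len(scores) - windowsize + 1):
        window = scores[i:i + windowsize]
        yield sum(window) / float(len(window))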
def seeds(self):
    seeds = FASTA(self.autopaths.seed)
    return seeds
def FASTA_alignment():
    # Example for a talk #
    f  = FASTA.retrieve('1YGV', cache_dir) + FASTA.retrieve('3HQV', cache_dir)
    sa = SequenceAligner.from_FASTA(f)
    print(sa)