def search_species(self, seqs):
    with open(self.species_fp) as f:
        ref_seqs = list(parse_fasta(f, trim_desc=True))
    xt = HitExtender(seqs, ref_seqs)
    with open(self.output_fp) as of:
        hits = VsearchAligner._parse(of)
        for hit in hits:
            yield xt.extend_hit(hit)
def _get_subject_seq(self, subject_id):
    subject_outfile = tempfile.NamedTemporaryFile()
    subject_outfile_fp = subject_outfile.name
    args = [
        "blastdbcmd",
        "-db", self.db,
        "-entry", subject_id,
        "-out", subject_outfile_fp,
    ]
    subprocess.check_call(args)
    with open(subject_outfile_fp) as f:
        return list(parse_fasta(f, trim_desc=True))[0][1]
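# A minimal usage sketch for the helper above. The enclosing class is not
# shown in this excerpt, so the object name, BLAST database name, and the
# entry ID below are hypothetical; blastdbcmd (NCBI BLAST+) must be on the
# PATH.
#
#     retriever.db = "refseq_16S"
#     seq = retriever._get_subject_seq("NR_074334.1")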
def load(self, assemblies):
    with open(self.accession_fp, "r") as f:
        for line in f:
            toks = line.strip().split()
            seqid = toks[0]
            accession = toks[1]
            assembly = assemblies[accession]
            self.assemblies[seqid] = assembly
            self.seqids_by_assembly[assembly.accession].append(seqid)
    with open(self.fasta_fp, "r") as f:
        for seqid, seq in parse_fasta(f):
            self.seqs[seqid] = seq
def test_parse_fasta(self):
    res = parse_fasta([
        ">Seq1 abc def\n",
        "GGCTGCTATCAG\n",
        "CTAGCATCGTCGCATCGAC\n",
        ">Seq2\n",
        "ACGCTAGCTGCAAAA\n",
    ])
    self.assertEqual(
        next(res), ("Seq1 abc def", "GGCTGCTATCAGCTAGCATCGTCGCATCGAC"))
    self.assertEqual(next(res), ("Seq2", "ACGCTAGCTGCAAAA"))
    self.assertRaises(StopIteration, next, res)
def __init__(
        self, species_file, ref_fp, mismatch_file,
        batch_size=10, num_cpus=None):
    self.typestrain_seqs = list(parse_fasta(species_file, trim_desc=True))
    self.reference_fasta_fp = ref_fp
    self.mismatch_file = mismatch_file
    self.min_pct_id = 97.0
    self.num_threads = num_cpus
    self.batch_size = batch_size
    # Kept as a string, presumably for direct use in a command line
    self.max_hits = "10000"
def ssu_seqs(self):
    if self._ssu_seqs is not None:
        return self._ssu_seqs
    if not os.path.exists(self.rna_fp):
        try:
            self.download_rna()
        except urllib.error.HTTPError as e:
            print(self.accession)
            print(e)
            return []
    with open(self.rna_fp, "rt") as f:
        seqs = list(parse_fasta(f))
    res = [(desc, seq) for (desc, seq) in seqs if is_16S(desc)]
    self._ssu_seqs = res
    return res
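# `is_16S` is used above but not defined in this excerpt. A minimal
# hypothetical predicate consistent with how it is called on RNA FASTA
# descriptions (an assumption, not necessarily the project's actual
# implementation):
import re

def is_16S(desc):
    # NCBI RNA-from-genomic FASTA headers carry a product annotation,
    # e.g. "[product=16S ribosomal RNA]".
    return re.search(r"product=16S ribosomal RNA", desc) is not None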
def search_species(self, query_seqs):
    b = VsearchAligner(self.species_fp)
    vsearch_args = {
        "min_id": 0.9,
        "maxaccepts": 5,
    }
    if self.num_cpus:
        vsearch_args["threads"] = self.num_cpus
    hits = b.search(
        query_seqs, self.species_input_fp, self.species_output_fp,
        **vsearch_args)
    with open(self.species_fp) as f:
        ref_seqs = list(parse_fasta(f, trim_desc=True))
    xt = HitExtender(query_seqs, ref_seqs)
    for hit in hits:
        yield xt.extend_hit(hit)
def process_greengenes_seqs(seqs_fp, accessions_fp, output_fp=REFSEQS_FASTA_FP):
    duplicates_fp = GG_DUPLICATE_FP
    if os.path.isdir(output_fp):
        duplicates_fp = os.path.join(output_fp, duplicates_fp)
        output_fp = os.path.join(output_fp, REFSEQS_FASTA_FP)
    # Extract table of accessions
    if accessions_fp.endswith(".gz"):
        subprocess.check_call(["gunzip", "-f", accessions_fp])
        accessions_fp = gunzip_fp(accessions_fp)
    # Load accessions
    gg_accessions = {}
    with open(accessions_fp) as f:
        for ggid, src, acc in parse_greengenes_accessions(f):
            gg_accessions[ggid] = (acc, src)
    # Extract FASTA file
    if seqs_fp.endswith(".gz"):
        subprocess.check_call(["gunzip", "-f", seqs_fp])
        seqs_fp = gunzip_fp(seqs_fp)
    # Remove duplicate reference seqs
    uniq_seqs = collections.defaultdict(list)
    with open(seqs_fp) as f:
        for ggid, seq in parse_fasta(f):
            uniq_seqs[seq].append(ggid)
    with open(duplicates_fp, "w") as dups:
        with open(output_fp, "w") as f:
            for seq, ggids in uniq_seqs.items():
                ggid = ggids[0]
                if len(ggids) > 1:
                    # One line per group of duplicate IDs
                    dups.write(" ".join(ggids) + "\n")
                # Re-label seqs with accession numbers
                acc, src = gg_accessions[ggid]
                f.write(">%s %s %s\n%s\n" % (acc, src, ggid, seq))
    return output_fp
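# A toy illustration of the duplicate-removal step above: sequences are
# keyed by their exact string, and only the first Greengenes ID per unique
# sequence is kept for the output file. The IDs and sequences below are
# invented for illustration.
import collections

uniq_seqs = collections.defaultdict(list)
for ggid, seq in [("4001", "ACGT"), ("4002", "ACGT"), ("4003", "GGTT")]:
    uniq_seqs[seq].append(ggid)
for seq, ggids in uniq_seqs.items():
    print(ggids[0], ggids[1:])  # "4001 ['4002']" then "4003 []"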
def process_ltp_seqs(input_fp, output_fp=SPECIES_FASTA_FP):
    if os.path.isdir(output_fp):
        output_fp = os.path.join(output_fp, SPECIES_FASTA_FP)
    accession_cts = collections.defaultdict(int)
    # Re-format FASTA file
    with open(input_fp) as f_in:
        seqs = parse_fasta(f_in)
        with open(output_fp, "w") as f_out:
            for desc, seq in seqs:
                vals = desc.split("|")
                # Some accessions refer to genomes with more than one 16S
                # gene, so accessions can be legitimately repeated with
                # distinct gene sequences.
                accession = vals[2]
                accession_times_previously_seen = accession_cts[accession]
                accession_cts[accession] += 1
                if accession_times_previously_seen > 0:
                    accession = "{0}_repeat{1}".format(
                        accession, accession_times_previously_seen)
                species_name = vals[3]
                f_out.write(">{0}\t{1}\n{2}\n".format(
                    accession, species_name, seq))
    return output_fp
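# A small worked example of the accession-repeat handling above: the first
# occurrence keeps its accession, and later occurrences get a "_repeatN"
# suffix. The accessions below are invented.
import collections

accession_cts = collections.defaultdict(int)
for accession in ["AB001", "AB002", "AB001", "AB001"]:
    n_seen = accession_cts[accession]
    accession_cts[accession] += 1
    if n_seen > 0:
        accession = "{0}_repeat{1}".format(accession, n_seen)
    print(accession)
# AB001, AB002, AB001_repeat1, AB001_repeat2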
def main(argv=None):
    p = argparse.ArgumentParser()
    p.add_argument(
        "query_fasta", type=argparse.FileType("r"),
        help="Query sequences FASTA file")
    p.add_argument(
        "--output_dir",
        help=(
            "Output directory (default: basename of query sequences FASTA "
            "file, plus '_unassigned')"))
    p.add_argument(
        "--type_strain_fasta", default="unassigner_species.fasta",
        help=(
            "Type strain sequences FASTA file (default: %(default)s). "
            "If the default file is not found, sequences are downloaded "
            "and re-formatted automatically."))
    p.add_argument(
        "--threshold", type=float,
        help=(
            "Sequence identity threshold for ruling out species-level "
            "compatibility. Default value is 0.975 for the standard "
            "algorithm and 0.991 for the soft threshold algorithm."))
    p.add_argument(
        "--ref_mismatch_positions",
        help=(
            "File of mismatch positions in reference database. The file may "
            "be compressed in gzip format."))
    p.add_argument(
        "--num_cpus", type=int,
        help=(
            "Number of CPUs to use during sequence alignment (default: "
            "use all the CPUs)"))
    p.add_argument(
        "--soft_threshold", action="store_true",
        help="Use soft threshold algorithm.")
    p.add_argument(
        "--verbose", action="store_true",
        help="Activate verbose mode.")
    args = p.parse_args(argv)

    if args.threshold is None:
        if args.soft_threshold:
            min_id = 0.991
        else:
            min_id = 0.975
    else:
        min_id = args.threshold

    if args.verbose is True:
        logging.basicConfig(
            format='%(levelname)s: %(message)s', level=logging.INFO)

    query_seqs = list(parse_fasta(args.query_fasta, trim_desc=True))
    if args.output_dir is None:
        output_dir = os.path.splitext(args.query_fasta.name)[0] + "_unassigned"
    else:
        output_dir = args.output_dir

    # Download type strain files if needed
    type_strain_fp_is_default = (
        args.type_strain_fasta == p.get_default("type_strain_fasta"))
    type_strain_fp_is_missing = not os.path.exists(args.type_strain_fasta)
    if type_strain_fp_is_default and type_strain_fp_is_missing:
        download_type_strain_data()

    with open(args.type_strain_fasta) as f:
        species_names = dict(parse_species_names(f))
    writer = OutputWriter(output_dir, species_names)

    alignment_query_fp = writer.output_fp("unassigner_query.fasta")
    alignment_output_fp = writer.output_fp("unassigner_query_hits.txt")
    if os.path.exists(alignment_output_fp):
        a = FileAligner(args.type_strain_fasta, alignment_output_fp)
    else:
        a = UnassignAligner(args.type_strain_fasta)
        a.species_input_fp = alignment_query_fp
        a.species_output_fp = alignment_output_fp
        a.num_cpus = args.num_cpus

    if args.ref_mismatch_positions:
        if args.ref_mismatch_positions.endswith(".gz"):
            mm_db_file = gzip.open(args.ref_mismatch_positions, "rt")
        else:
            mm_db_file = open(args.ref_mismatch_positions)
        VariableMismatchRate.load_database(mm_db_file)

    app = UnassignerApp(
        a, VariableMismatchRate, min_id=min_id,
        soft_threshold=args.soft_threshold)
    for query_id, query_results in app.unassign(query_seqs):
        writer.write_results(query_id, query_results)
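# Example invocation of the entry point above; the file paths are
# hypothetical, and the same arguments would work through whatever
# console script the package installs to call main():
#
#     main(["queries.fasta", "--ref_mismatch_positions", "mismatch_db.txt.gz",
#           "--soft_threshold", "--num_cpus", "4"])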
def test_parse_empty_fasta(self):
    res = parse_fasta([])
    list_res = list(res)
    self.assertEqual(list_res, [])
@classmethod
def from_fasta(cls, f):
    recs = parse_fasta(f)
    return cls(recs)
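# A minimal usage sketch for the classmethod above. The enclosing class is
# not shown in this excerpt, so `SeqCollection` is a hypothetical stand-in;
# parse_fasta accepts any iterable of lines, so an io.StringIO works.
#
#     import io
#     obj = SeqCollection.from_fasta(io.StringIO(">Seq1\nACGT\n"))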
def main(argv=None):
    p = argparse.ArgumentParser()
    p.add_argument(
        "query_fasta", type=argparse.FileType("r"),
        help="Query sequences FASTA file")
    p.add_argument(
        "--output_dir",
        help=(
            "Output directory (default: basename of query sequences FASTA "
            "file, plus '_unassigned')"))
    p.add_argument(
        "--type_strain_fasta", default="unassigner_species.fasta",
        help=(
            "Type strain sequences FASTA file (default: %(default)s). "
            "If the default file is not found, sequences are downloaded "
            "and re-formatted automatically."))
    p.add_argument(
        "--num_cpus", type=int,
        help=(
            "Number of CPUs to use during sequence alignment (default: "
            "use all the CPUs)"))
    p.add_argument(
        "--verbose", action="store_true",
        help="Activate verbose mode.")
    args = p.parse_args(argv)

    if args.verbose is True:
        logging.basicConfig(
            format='%(levelname)s: %(message)s', level=logging.INFO)

    query_seqs = list(parse_fasta(args.query_fasta, trim_desc=True))
    if args.output_dir is None:
        output_dir = os.path.splitext(args.query_fasta.name)[0] + "_unassigned"
    else:
        output_dir = args.output_dir

    # Download type strain files if needed
    type_strain_fp_is_default = (
        args.type_strain_fasta == p.get_default("type_strain_fasta"))
    type_strain_fp_is_missing = not os.path.exists(args.type_strain_fasta)
    if type_strain_fp_is_default and type_strain_fp_is_missing:
        download_type_strain_data()

    with open(args.type_strain_fasta) as f:
        species_names = dict(parse_species_names(f))
    writer = OutputWriter(output_dir, species_names)

    alignment_query_fp = writer.output_fp("unassigner_query.fasta")
    alignment_output_fp = writer.output_fp("unassigner_query_hits.txt")
    if os.path.exists(alignment_output_fp):
        a = FileAligner(args.type_strain_fasta, alignment_output_fp)
    else:
        a = UnassignAligner(args.type_strain_fasta)
        a.species_input_fp = alignment_query_fp
        a.species_output_fp = alignment_output_fp
        a.num_cpus = args.num_cpus

    algorithm = ThresholdAlgorithm(a)
    for query_id, query_results in algorithm.unassign(query_seqs):
        writer.write_results(query_id, query_results)