def gather_guess_tax_at_each_rank(gather_results, num_hashes, taxlist=None,
                                  minimum_matches=3, lowest_rank="genus"):
    # NB: default to None rather than lca_utils.taxlist(...) -- taxlist() is
    # a generator, and a generator used as a default argument would be
    # exhausted after the first call.
    if taxlist is None:
        taxlist = lca_utils.taxlist(include_strain=False)

    rank_results = []
    prev_lineage = ""
    top_lineage = ""
    for rank in taxlist:
        top_lineage, f_ident, f_major = gather_guess_tax_at_rank(
            gather_results, num_hashes, rank,
            minimum_matches=minimum_matches)

        # summarizing at a lower rank than exists will yield the same result
        # as the previous rank. break!
        if not top_lineage or top_lineage == prev_lineage:
            break

        rank_results.append(RankSumGatherResult(lineage=top_lineage,
                                                f_ident=f_ident,
                                                f_major=f_major))
        prev_lineage = top_lineage
        if rank == lowest_rank:
            break
    return rank_results
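A minimal usage sketch for gather_guess_tax_at_each_rank(), mirroring the tests below; make_mh(), gather_at_rank(), and an already-populated lca_db/lin_db are assumed from the surrounding module, not defined here.

# hedged sketch -- assumes make_mh(), gather_at_rank(), and a populated
# lca_db / lin_db from the surrounding module.
search_mh = make_mh([12345678, 87654321])
gather_results = list(gather_at_rank(search_mh, lca_db, lin_db, "genus"))
rank_results = gather_guess_tax_at_each_rank(gather_results,
                                             num_hashes=len(search_mh.hashes),
                                             minimum_matches=1,
                                             lowest_rank="genus")
for res in rank_results:
    # each entry is a RankSumGatherResult(lineage, f_ident, f_major)
    print(lca_utils.display_lineage(res.lineage), res.f_ident, res.f_major)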
def add_hashes_at_ranks(lineage_hashD, hashes_to_add, lineage, match_rank):
    # first add full lineage
    lineage_hashD[lineage].add_many(hashes_to_add)

    for rank in lca_utils.taxlist(include_strain=False):
        # TODO: add check to pop ONLY if needed (no need to pop at genus if
        # lineage only has superk, phyl)
        lin_at_rank = pop_to_rank(lineage, rank)
        lineage_hashD[lin_at_rank].add_many(hashes_to_add)
        if rank == match_rank:
            break
    return lineage_hashD
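A short sketch of how add_hashes_at_ranks() might be driven. The defaultdict-of-MinHash container and the `empty_mh` template are assumptions inferred from the add_many() calls above, not part of this snippet.

from collections import defaultdict

# `empty_mh` is assumed to be a template MinHash (cf. main() below);
# its bound copy_and_clear() serves as a factory for fresh, empty MinHashes.
lineage_hashD = defaultdict(empty_mh.copy_and_clear)

lin = lca_utils.make_lineage('a;b;c')
# records the hashes under 'a', 'a;b', and the full 'a;b;c' lineage
add_hashes_at_ranks(lineage_hashD, [12345678, 87654321], lin,
                    match_rank="class")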
def test_searchfiles_contigs_just_gather(location):
    prefix = os.path.join(location, "pref")
    filelist = [f"{prefix}.contigs.rankgather.csv",
                f"{prefix}.contigs.unmatched.fq"]

    sf = SearchFiles(prefix, search=True, gather=True)

    # two minhashes, share ranks at phylum level
    hashval = 12345678
    ident1 = 'first'
    mh1, sig1, lin1 = make_sig_and_lin([hashval], ident1, 'a;b;c')

    hashval2 = 87654321
    ident2 = 'second'
    mh2, sig2, lin2 = make_sig_and_lin([hashval2], ident2, 'a;b;f')

    # create lca_db w/ sigs
    lca_db = LCA_Database(scaled=1, ksize=3)
    lca_db.insert(sig1, ident=ident1)
    lca_db.insert(sig2, ident=ident2)

    # make lin_db
    lin_db = LineageDB()
    lin_db.insert(ident1, lin1)
    lin_db.insert(ident2, lin2)

    num_hashes = 2

    # search with combined hashvals
    search_mh = make_mh([hashval, hashval2])
    gather_results = list(gather_at_rank(search_mh, lca_db, lin_db, "class"))
    gather_rank_results = gather_guess_tax_at_each_rank(
        gather_results, num_hashes, minimum_matches=1, lowest_rank="class",
        taxlist=lca_utils.taxlist(include_strain=False))

    # write gather results
    name = 'name'
    seq_len = 6
    for gres in gather_rank_results:
        sf.write_result(gres, name, seq_len, result_type="rankgather")
    sf.close()

    # check results are in files
    for f in filelist:
        assert os.path.exists(f)

    with open(f"{prefix}.contigs.rankgather.csv", "r") as gatherres:
        this_gather_csvset = get_csv_set(gatherres)
    with open(utils.get_testfile("test-data/test.contigs.rankgather.csv"), "r") as searchres:
        saved_gather_csvset = get_csv_set(searchres)
    assert saved_gather_csvset == this_gather_csvset
def sort_by_rank_and_containment(summarized_results, match_rank):
    sorted_results = []
    # iterate superkingdom --> match_rank
    for rank in lca_utils.taxlist(include_strain=False):
        rank_res = summarized_results[rank]
        rank_res.sort(key=itemgetter(1), reverse=True)  # sort by containment
        for (lin, containment, intersect_bp, match_sig) in rank_res:
            sorted_results.append(
                RankSumSearchResult(lineage=lin,
                                    contained_at_rank=containment,
                                    contained_bp=intersect_bp,
                                    match=match_sig))
        if rank == match_rank:
            break
    return sorted_results
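A worked sketch of sort_by_rank_and_containment(). The input shape (rank -> list of (lineage, containment, intersect_bp, match_sig) tuples) is inferred from the unpacking above; sig1/sig2 are hypothetical match signatures.

from collections import defaultdict

summarized = defaultdict(list)
summarized['superkingdom'] = [
    (lca_utils.make_lineage('a'), 0.4, 1000, sig1),   # sig1/sig2: hypothetical
    (lca_utils.make_lineage('d'), 0.9, 5000, sig2),
]
ordered = sort_by_rank_and_containment(summarized, match_rank='superkingdom')
assert ordered[0].contained_at_rank == 0.9  # highest containment sorts first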
def test_gather_guess_tax_at_each_rank_1():
    # two minhashes, fully shared ranks

    # first sig
    hashval = 12345678
    ident1 = 'first'
    mh1, sig1, lin1 = make_sig_and_lin([hashval], ident1, 'a;b;c')

    # second sig
    hashval2 = 87654321
    ident2 = 'second'
    mh2, sig2, lin2 = make_sig_and_lin([hashval2], ident2, 'a;b;c')

    # create lca_db w/ sigs
    lca_db = LCA_Database(scaled=1, ksize=3)
    lca_db.insert(sig1, ident=ident1)
    lca_db.insert(sig2, ident=ident2)

    # make lin_db
    lin_db = LineageDB()
    lin_db.insert(ident1, lin1)
    lin_db.insert(ident2, lin2)

    num_hashes = 2
    superk_lin = lca_utils.make_lineage('a')
    phylum_lin = lca_utils.make_lineage('a;b')

    # search with combined hashvals
    search_mh = make_mh([hashval, hashval2])
    gather_results = list(gather_at_rank(search_mh, lca_db, lin_db, "class"))
    rank_results = gather_guess_tax_at_each_rank(
        gather_results, num_hashes, minimum_matches=1, lowest_rank="class",
        taxlist=lca_utils.taxlist(include_strain=False))

    assert len(rank_results) == 3
    assert rank_results[0] == RankSumGatherResult(lineage=superk_lin,
                                                  f_ident=1.0, f_major=1.0)
    assert rank_results[1] == RankSumGatherResult(lineage=phylum_lin,
                                                  f_ident=1.0, f_major=1.0)
    assert rank_results[2] == RankSumGatherResult(lineage=lin1,
                                                  f_ident=1.0, f_major=1.0)
def test_gather_guess_tax_at_each_rank_3():
    # two minhashes, totally distinct ranks

    # first sig
    hashval1 = 12345678
    ident1 = 'first'
    mh1, sig1, lin1 = make_sig_and_lin([hashval1], ident1, 'a;b;c')

    # second sig
    hashval2 = 87654321
    ident2 = 'second'
    mh2, sig2, lin2 = make_sig_and_lin([hashval2], ident2, 'd;e;f')

    # create lca_db w/ sigs
    lca_db = LCA_Database(scaled=1, ksize=3)
    lca_db.insert(sig1, ident=ident1)
    lca_db.insert(sig2, ident=ident2)

    # next, make lin_db
    lin_db = LineageDB()
    lin_db.insert(ident1, lin1)
    lin_db.insert(ident2, lin2)

    num_hashes = 2
    # the winner seems to be the 'd;e;f' lineage .. will this remain true
    # always?
    superk_lin = lca_utils.make_lineage('d')
    phylum_lin = lca_utils.make_lineage('d;e')

    # search with combined hashvals
    search_mh = make_mh([hashval1, hashval2])
    gather_results = list(gather_at_rank(search_mh, lca_db, lin_db, "class"))
    rank_results = gather_guess_tax_at_each_rank(
        gather_results, num_hashes, minimum_matches=1, lowest_rank="class",
        taxlist=lca_utils.taxlist(include_strain=False))

    assert len(rank_results) == 3
    assert rank_results[0] == RankSumGatherResult(lineage=superk_lin,
                                                  f_ident=1.0, f_major=0.5)
    assert rank_results[1] == RankSumGatherResult(lineage=phylum_lin,
                                                  f_ident=1.0, f_major=0.5)
    assert rank_results[2] == RankSumGatherResult(lineage=lin2,
                                                  f_ident=1.0, f_major=0.5)
def pop_to_rank(lin, rank):
    "Remove lineage tuples from given lineage `lin` until `rank` is reached."
    lin = list(lin)

    txl = lca_utils.taxlist()
    before_rank = []
    for txl_rank in txl:
        if txl_rank != rank:
            before_rank.append(txl_rank)
        else:
            break

    # are we already above rank?
    if lin and lin[-1].rank in before_rank:
        return tuple(lin)

    while lin and lin[-1].rank != rank:
        lin.pop()
    return tuple(lin)
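A few worked cases for pop_to_rank(), using the make_lineage helper seen in the tests above:

lin = lca_utils.make_lineage('a;b;c')   # superkingdom;phylum;class

# pop down to phylum: the class entry is removed
assert pop_to_rank(lin, 'phylum') == lca_utils.make_lineage('a;b')

# already at the requested rank: returned unchanged
assert pop_to_rank(lin, 'class') == lin

# lineage ends above the requested rank: also returned unchanged
short_lin = lca_utils.make_lineage('a;b')
assert pop_to_rank(short_lin, 'class') == short_lin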
def make_lca_counts(dblist, lowest_rank='phylum', min_num=0, min_hashes=5,
                    prefix='oddities'):
    """
    Collect counts of all the LCAs in the list of databases.
    """
    assert len(dblist) == 1

    keep_ranks = ['root']
    for rank in lca_utils.taxlist():
        keep_ranks.append(rank)
        if rank == lowest_rank:
            break
    print('keeping hashvals at following ranks:', keep_ranks)
    print('min number of lineages:', min_num)
    print('min number of shared hashes:', min_hashes)
    print('---')

    # gather all hashvalue assignments from across all the databases
    assignments = defaultdict(set)
    for lca_db in dblist:
        for hashval, idx_list in lca_db.hashval_to_idx.items():
            if min_num and len(idx_list) < min_num:
                continue

            for idx in idx_list:
                lid = lca_db.idx_to_lid.get(idx)
                if lid is not None:
                    lineage = lca_db.lid_to_lineage[lid]
                    assignments[hashval].add(lineage)

    # now convert to trees -> do LCA & counts
    counts = defaultdict(int)
    mixdict = defaultdict(set)
    for hashval, lineages in assignments.items():
        # for each list of tuple_info [(rank, name), ...] build
        # a tree that lets us discover lowest-common-ancestor.
        debug("{}", lineages)
        tree = lca_utils.build_tree(lineages)

        # now find either a leaf or the first node with multiple
        # children; that's our lowest-common-ancestor node.
        lca, reason = lca_utils.find_lca(tree)

        # find cross-superkingdom hashes, and record combinations of lineages
        # that have them.
        rank = 'root'
        if lca:
            rank = lca[-1].rank

        if rank in keep_ranks:
            xx = []
            for lineage in lineages:
                xx.append(tuple(lineage))
            xx = tuple(xx)

            mixdict[xx].add(hashval)

        counts[lca] += 1

    # sort on number of confused hash vals by combination of lineages.
    mixdict_items = list(mixdict.items())
    mixdict_items.sort(key=lambda x: -len(x[1]))

    confused_hashvals = set()

    fp = open(prefix + '.csv', 'wt')
    w = csv.writer(fp)
    w.writerow(['cluster', 'num_lineages', 'shared_kmers', 'ksize', 'rank',
                'lca', 'ident1', 'lineage1', 'ident2', 'lineage2'])

    #
    # find candidate lineages, then evaluate pairwise intersections.
    #

    for cluster_n, (lineages, hashvals) in enumerate(mixdict_items):
        # insist on more than N hash vals
        if len(hashvals) < min_hashes:
            continue

        # display summary:
        print('cluster {} has {} assignments for {} hashvals / {} bp'.format(
            cluster_n, len(lineages), len(hashvals),
            dblist[0].scaled * len(hashvals)))
        confused_hashvals.update(hashvals)

        tree = lca_utils.build_tree(lineages)
        lca, reason = lca_utils.find_lca(tree)
        if lca:
            rank = lca[-1].rank
        else:
            rank = 'root'
        print('  rank & lca:', rank, lca_utils.display_lineage(lca))

        # for lineage_n, lineage in enumerate(lineages):
        #     print('* ', lca_utils.display_lineage(lineage))

        # now, identify all members of these lineages by their index:
        all_idxs = []
        for lineage_n, lineage in enumerate(lineages):
            lids = dblist[0].lineage_to_lids[lineage]
            for lid in lids:
                idxs = dblist[0].lid_to_idx[lid]
                all_idxs.extend(idxs)
                for idx in idxs:
                    ident = dblist[0].idx_to_ident[idx]

        # run through and look at all pairs of genomes in these lineages;
        # filter so that we're comparing across lineages with the right
        # LCA, and with significant intersection.
        pair_n = 0
        candidates = []
        for i in range(len(all_idxs)):
            idx1 = all_idxs[i]
            lid1 = dblist[0].idx_to_lid[idx1]
            lin1 = dblist[0].lid_to_lineage[lid1]
            for j in range(i):
                idx2 = all_idxs[j]
                lid2 = dblist[0].idx_to_lid[idx2]
                lin2 = dblist[0].lid_to_lineage[lid2]

                ident1 = dblist[0].idx_to_ident[idx1]
                ident2 = dblist[0].idx_to_ident[idx2]

                debug("{} x {}", ident1, ident2)

                this_tree = lca_utils.build_tree([lin1, lin2])
                this_lca, this_reason = lca_utils.find_lca(this_tree)

                # weed out pairs that don't have the desired intersection
                if lca != this_lca:
                    continue

                mh1 = dblist[0]._signatures[idx1]
                mh2 = dblist[0]._signatures[idx2]

                mins1 = set(mh1.get_mins())
                mins2 = set(mh2.get_mins())
                intersect_size = len(mins1.intersection(mins2))

                # weed out pairs that don't have enough k-mer intersection
                if intersect_size < min_hashes:
                    continue

                candidates.append((pair_n, ident1, lin1, ident2, lin2,
                                   intersect_size))

                # write summary to CSV for find-oddities-examine.py to use
                w.writerow(['cluster{}.{}'.format(cluster_n, pair_n),
                            len(lineages),
                            intersect_size * dblist[0].scaled,
                            dblist[0].ksize,
                            rank,
                            lca_utils.display_lineage(lca),
                            ident1, lca_utils.display_lineage(lin1),
                            ident2, lca_utils.display_lineage(lin2)])
                pair_n += 1

        print('  Candidate genome pairs for these lineages:')
        for pair_n, ident1, lin1, ident2, lin2, intersection_size in candidates:
            print('    cluster.pair {}.{} share {} bases'.format(
                cluster_n, pair_n, intersection_size * dblist[0].scaled))
            print('    - {} ({})'.format(ident1,
                                         lca_utils.display_lineage(lin1)))
            print('    - {} ({})'.format(ident2,
                                         lca_utils.display_lineage(lin2)))
        print('')

    print('')
    return counts, confused_hashvals
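A minimal driver sketch for make_lca_counts(). Loading the LCA database is assumed to happen elsewhere (via sourmash's LCA database loaders), so `lca_db` here stands in for an already-loaded database.

# `lca_db` is assumed to be a loaded LCA_Database; the function requires a
# single-element list.
counts, confused_hashvals = make_lca_counts([lca_db],
                                            lowest_rank='superkingdom',
                                            min_num=2, min_hashes=5,
                                            prefix='oddities')
print('distinct LCA nodes seen:', len(counts))
print('hashvals flagged as confused:', len(confused_hashvals))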
def test_taxlist_2():
    assert list(taxlist(include_strain=False)) == ['superkingdom', 'phylum',
                                                   'class', 'order', 'family',
                                                   'genus', 'species']


def test_taxlist_1():
    assert list(taxlist()) == ['superkingdom', 'phylum', 'class', 'order',
                               'family', 'genus', 'species', 'strain']
def main(args):
    "Main entry point for scripting. Use cmdline for command line entry."
    genomebase = os.path.basename(args.genome)
    match_rank = 'genus'

    # load taxonomy CSV
    tax_assign, _ = load_taxonomy_assignments(args.lineages_csv,
                                              start_column=2)
    print(f'loaded {len(tax_assign)} tax assignments.')

    # load the genome signature
    genome_sig = sourmash.load_one_signature(args.genome_sig,
                                             select_moltype=args.alphabet,
                                             ksize=args.ksize)

    # load all of the matches from search --containment in the database
    with open(args.matches_sig, 'rt') as fp:
        try:
            siglist = list(sourmash.load_signatures(fp, do_raise=True,
                                                    quiet=False))
        except sourmash.exceptions.SourmashError:
            siglist = []
    print(f"loaded {len(siglist)} matches from '{args.matches_sig}'")

    # Hack for examining members of our search database: remove exact matches.
    new_siglist = []
    for ss in siglist:
        if genome_sig.similarity(ss) == 1.0:
            print(f'removing an identical match: {ss.name()}')
        else:
            new_siglist.append(ss)
    siglist = new_siglist

    if not siglist:
        # write empty files so snakemake workflows don't complain; exit.
        print('no non-identical matches for this genome, exiting.')
        if not args.no_search_contigs:
            sf = SearchFiles(args.output_prefix, not args.no_search,
                             args.gather, contigs=True)
            sf.close()
        if args.search_genome:
            gf = SearchFiles(args.output_prefix, not args.no_search,
                             args.gather, contigs=False)
            gf.close()
        return 0

    # construct a template minhash object that we can use to create new 'uns
    empty_mh = siglist[0].minhash.copy_and_clear()
    ksize = empty_mh.ksize
    scaled = empty_mh.scaled
    moltype = empty_mh.moltype

    # create empty LCA database to populate...
    lca_db = LCA_Database(ksize=ksize, scaled=scaled, moltype=moltype)
    lin_db = LineageDB()

    # ...with specific matches.
    for ss in siglist:
        ident = get_ident(ss)
        lineage = tax_assign[ident]

        lca_db.insert(ss, ident=ident)
        lin_db.insert(ident, lineage)

    print(f'loaded {len(siglist)} signatures & created LCA Database')

    print('')
    print(f'reading contigs from {genomebase}')

    screed_iter = screed.open(args.genome)
    genome_len = 0

    if not args.no_search_contigs:
        sf = SearchFiles(args.output_prefix, not args.no_search, args.gather,
                         contigs=True)

        for n, record in enumerate(screed_iter):
            # look at each contig individually
            mh = empty_mh.copy_and_clear()
            mh.add_sequence(record.sequence, force=True)
            # search, optionally aggregate matched hashes to get containment
            # at rank

            seq_len = len(record.sequence)
            genome_len += seq_len
            num_hashes = len(mh.hashes)

            if not args.no_search:
                search_results, search_rank_results = search_containment_at_rank(
                    mh, lca_db, lin_db, match_rank)

                if not search_results:
                    # write to unclassified
                    sf.unmatched.write(">" + record.name + "\n" +
                                       record.sequence + "\n")
                    # if no search results, don't bother with gather
                    continue
                else:
                    # first, print normal search --containment results
                    for sr in search_results:
                        sf.write_result(sr, record.name, seq_len,
                                        result_type="search")
                    # now, print containment at rank results
                    for sr in search_rank_results:
                        sf.write_result(sr, record.name, seq_len,
                                        result_type="ranksearch")

            if args.gather:
                # first, gather at match rank (default genus)
                gather_results = list(gather_at_rank(mh, lca_db, lin_db,
                                                     match_rank))
                # write standard gather_results?

                if not gather_results:
                    # write to unclassified. should only get here if no
                    # search OR gather results
                    sf.unmatched.write(">" + record.name + "\n" +
                                       record.sequence + "\n")
                else:
                    # next, summarize at higher ranks
                    gather_taxonomy_per_rank = gather_guess_tax_at_each_rank(
                        gather_results, num_hashes,
                        minimum_matches=args.gather_min_matches,
                        lowest_rank=match_rank,
                        taxlist=lca_utils.taxlist(include_strain=False))
                    # results = list of RankSumGatherResult =
                    # namedtuple('RankSumGatherResult', 'lineage, f_ident, f_major')

                    # write taxonomy out
                    for gr in gather_taxonomy_per_rank:
                        sf.write_result(gr, record.name, seq_len,
                                        result_type="rankgather")

        print(f"Processed {n+1} contigs.")

        # close contig files
        sf.close()

    if args.search_genome:
        gf = SearchFiles(args.output_prefix, not args.no_search, args.gather,
                         contigs=False)
        # MAG workflow
        entire_mh = genome_sig.minhash
        genome_name = genome_sig.name()
        num_hashes = len(entire_mh.hashes)

        if not genome_len:
            for record in screed_iter:
                genome_len += len(record.sequence)

        if not args.no_search:
            # results are guaranteed; otherwise we would have exited before
            # searching
            search_results, search_rank_results = search_containment_at_rank(
                entire_mh, lca_db, lin_db, match_rank)

            for sr in search_results:
                gf.write_result(sr, genome_name, genome_len,
                                result_type="search")
            for sr in search_rank_results:
                gf.write_result(sr, genome_name, genome_len,
                                result_type="ranksearch")

        if args.gather:
            gather_results = list(gather_at_rank(entire_mh, lca_db, lin_db,
                                                 match_rank))
            # next, summarize at higher ranks
            gather_taxonomy_per_rank = gather_guess_tax_at_each_rank(
                gather_results, num_hashes,
                minimum_matches=args.gather_min_matches,
                lowest_rank=match_rank,
                taxlist=lca_utils.taxlist(include_strain=False))

            for gather_res in gather_taxonomy_per_rank:
                gf.write_result(gather_res, genome_name, genome_len,
                                result_type="rankgather")

        # close genome files
        gf.close()
    return 0
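A hedged sketch of driving main() programmatically: argparse.Namespace stands in for the real command-line parser, and all file paths are hypothetical placeholders. The attribute names match those accessed inside main() above.

import argparse
import sys

# hypothetical paths/values -- only the attribute names are taken from main()
args = argparse.Namespace(
    genome='genome.fa', genome_sig='genome.sig',
    matches_sig='matches.sig', lineages_csv='lineages.csv',
    alphabet='DNA', ksize=31, output_prefix='out/genome',
    no_search=False, no_search_contigs=False, search_genome=True,
    gather=True, gather_min_matches=3)
sys.exit(main(args))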