def test_searchfiles_contigs_just_search(location):
    """SearchFiles contig workflow: written search/ranksearch CSVs match saved results.

    Builds a two-signature LCA database whose lineages share ranks at the
    phylum level, runs search_containment_at_rank, writes the results through
    SearchFiles, and compares the output CSVs against checked-in test data.
    """
    prefix = os.path.join(location, "pref")
    filelist = [
        f"{prefix}.contigs.ranksearch.csv",
        f"{prefix}.contigs.ranksearch.matches.sig",
        f"{prefix}.contigs.search.csv",
        f"{prefix}.contigs.search.matches.sig",
        f"{prefix}.contigs.unmatched.fq",
    ]
    # NOTE(review): gather=True here although this test only writes search
    # results and never checks gather output files — confirm intent.
    sf = SearchFiles(prefix, search=True, gather=True)

    # two minhashes, share ranks at phylum level
    hashval = 12345678
    ident1 = 'first'
    mh1, sig1, lin1 = make_sig_and_lin([hashval], ident1, 'a;b;c')
    hashval2 = 87654321
    ident2 = 'second'
    mh2, sig2, lin2 = make_sig_and_lin([hashval2], ident2, 'a;b;f')

    # create lca_db w sigs
    lca_db = LCA_Database(scaled=1, ksize=3)
    lca_db.insert(sig1, ident=ident1)
    lca_db.insert(sig2, ident=ident2)

    # make lin_db
    lin_db = LineageDB()
    lin_db.insert(ident1, lin1)
    lin_db.insert(ident2, lin2)

    # search with combined hashvals
    search_mh = make_mh([hashval, hashval2])
    results, rank_results = search_containment_at_rank(search_mh, lca_db,
                                                       lin_db, "class")
    # exercised for side effects only; gather output is not written in this test
    gather_results = list(gather_at_rank(search_mh, lca_db, lin_db, "class"))

    # write search results
    name = 'name'
    seq_len = 6
    for res in results:
        sf.write_result(res, name, seq_len, result_type="search")
    for res in rank_results:
        sf.write_result(res, name, seq_len, result_type="ranksearch")
    sf.close()

    # check results are in files
    for f in filelist:
        assert os.path.exists(f)

    with open(f"{prefix}.contigs.search.csv", "r") as searchres:
        this_search_csvset = get_csv_set(searchres)
    with open(utils.get_testfile("test-data/test.contigs.search.csv"), "r") as searchres:
        saved_search_csvset = get_csv_set(searchres)
    assert saved_search_csvset == this_search_csvset

    with open(f"{prefix}.contigs.ranksearch.csv", "r") as searchres:
        this_ranksearch_csvset = get_csv_set(searchres)
    with open(utils.get_testfile("test-data/test.contigs.ranksearch.csv"), "r") as searchres:
        saved_ranksearch_csvset = get_csv_set(searchres)
    assert saved_ranksearch_csvset == this_ranksearch_csvset
def test_gather_guess_tax_at_rank_1():
    """A single classified minhash is fully assigned at the phylum rank."""
    # one minhash, one set of ranks
    mh1, sig1, lin1 = make_sig_and_lin([12345678], 'uniq', 'a;b;c')

    db = LCA_Database(scaled=1, ksize=3)
    db.insert(sig1, ident='uniq')

    lineages = LineageDB()
    lineages.insert('uniq', lin1)

    expected_phylum = lca_utils.make_lineage('a;b')

    gathered = list(gather_at_rank(mh1, db, lineages, "class"))
    phylum_results = gather_guess_tax_at_rank(gathered, 1, "phylum",
                                              minimum_matches=1)

    # result tuple: (lineage, f_ident, f_major)
    assert len(phylum_results) == 3
    assert phylum_results[0] == expected_phylum
    assert phylum_results[1] == 1.0
def test_gather_at_rank_3():
    """gather_at_rank with two totally distinct lineages.

    Each signature should yield its own gather match, covering one hash
    apiece; match order is not asserted.
    """
    # first sig
    hashval1 = 12345678
    ident1 = 'first'
    mh1, sig1, lin1 = make_sig_and_lin([hashval1], ident1, 'a;b;c')

    # second sig
    hashval2 = 87654321
    ident2 = 'second'
    mh2, sig2, lin2 = make_sig_and_lin([hashval2], ident2, 'd;e;f')

    # create lca_db w both sigs
    lca_db = LCA_Database(scaled=1, ksize=3)
    lca_db.insert(sig1, ident=ident1)
    lca_db.insert(sig2, ident=ident2)

    # next, make lin_db
    lin_db = LineageDB()
    lin_db.insert(ident1, lin1)
    lin_db.insert(ident2, lin2)

    # search with combined hashvals
    search_mh = make_mh([hashval1, hashval2])
    gather_results = list(gather_at_rank(search_mh, lca_db, lin_db, "class"))

    assert len(gather_results) == 2
    # both lineages are present (order unspecified), each matching one hash
    assert {gather_results[0][0], gather_results[1][0]} == {lin1, lin2}
    assert {gather_results[0][1], gather_results[1][1]} == {1}
def test_gather_at_rank_2():
    """Two signatures with identical lineages collapse into a single gather match."""
    # two minhashes, fully shared ranks
    hash_a, hash_b = 12345678, 87654321
    mh_a, sig_a, lin_a = make_sig_and_lin([hash_a], 'first', 'a;b;c')
    mh_b, sig_b, lin_b = make_sig_and_lin([hash_b], 'second', 'a;b;c')

    # LCA database with both signatures
    db = LCA_Database(scaled=1, ksize=3)
    db.insert(sig_a, ident='first')
    db.insert(sig_b, ident='second')

    # lineage database mirroring the idents
    lineages = LineageDB()
    lineages.insert('first', lin_a)
    lineages.insert('second', lin_b)

    # query with both hashvals combined
    query = make_mh([hash_a, hash_b])
    results = list(gather_at_rank(query, db, lineages, "class"))

    # one match covering both hashes, reported under the shared lineage
    assert len(results) == 1
    assert results[0][0] == lin_a
    assert results[0][1] == 2
def test_gather_at_rank_1():
    """gather_at_rank on a single classified hash reports one single-hash match."""
    # one minhash, one set of ranks
    mh1, sig1, lin1 = make_sig_and_lin([12345678], 'uniq', 'a;b;c')

    db = LCA_Database(scaled=1, ksize=3)
    db.insert(sig1, ident='uniq')

    lineages = LineageDB()
    lineages.insert('uniq', lin1)

    results = list(gather_at_rank(mh1, db, lineages, "class"))

    assert len(results) == 1
    only_match = results[0]
    assert only_match[0] == lin1
    assert only_match[1] == 1
def test_gather_guess_tax_at_each_rank_1():
    """Fully shared lineages give unanimous calls at every rank down to class."""
    # two minhashes, fully shared ranks
    hv_a, hv_b = 12345678, 87654321
    mh_a, sig_a, lin_a = make_sig_and_lin([hv_a], 'first', 'a;b;c')
    mh_b, sig_b, lin_b = make_sig_and_lin([hv_b], 'second', 'a;b;c')

    # LCA database with both signatures
    db = LCA_Database(scaled=1, ksize=3)
    db.insert(sig_a, ident='first')
    db.insert(sig_b, ident='second')

    # lineage database mirroring the idents
    lineages = LineageDB()
    lineages.insert('first', lin_a)
    lineages.insert('second', lin_b)

    superk_lin = lca_utils.make_lineage('a')
    phylum_lin = lca_utils.make_lineage('a;b')

    # query with both hashvals combined
    query = make_mh([hv_a, hv_b])
    gathered = list(gather_at_rank(query, db, lineages, "class"))
    per_rank = gather_guess_tax_at_each_rank(
        gathered, 2,
        minimum_matches=1,
        lowest_rank="class",
        taxlist=lca_utils.taxlist(include_strain=False),
    )

    # every rank is called with full identification and full majority
    assert len(per_rank) == 3
    assert per_rank[0] == RankSumGatherResult(lineage=superk_lin, f_ident=1.0, f_major=1.0)
    assert per_rank[1] == RankSumGatherResult(lineage=phylum_lin, f_ident=1.0, f_major=1.0)
    assert per_rank[2] == RankSumGatherResult(lineage=lin_a, f_ident=1.0, f_major=1.0)
def test_gather_guess_tax_at_each_rank_3():
    """Disjoint lineages: majority is split, so f_major drops to 0.5."""
    # two minhashes, totally distinct ranks
    hv_a, hv_b = 12345678, 87654321
    mh_a, sig_a, lin_a = make_sig_and_lin([hv_a], 'first', 'a;b;c')
    mh_b, sig_b, lin_b = make_sig_and_lin([hv_b], 'second', 'd;e;f')

    # LCA database with both signatures
    db = LCA_Database(scaled=1, ksize=3)
    db.insert(sig_a, ident='first')
    db.insert(sig_b, ident='second')

    # lineage database mirroring the idents
    lineages = LineageDB()
    lineages.insert('first', lin_a)
    lineages.insert('second', lin_b)

    # winner seems to be def lineage.. will this remain true always?
    superk_lin = lca_utils.make_lineage('d')
    phylum_lin = lca_utils.make_lineage('d;e')

    # query with both hashvals combined
    query = make_mh([hv_a, hv_b])
    gathered = list(gather_at_rank(query, db, lineages, "class"))
    per_rank = gather_guess_tax_at_each_rank(
        gathered, 2,
        minimum_matches=1,
        lowest_rank="class",
        taxlist=lca_utils.taxlist(include_strain=False),
    )

    assert len(per_rank) == 3
    assert per_rank[0] == RankSumGatherResult(lineage=superk_lin, f_ident=1.0, f_major=0.5)
    assert per_rank[1] == RankSumGatherResult(lineage=phylum_lin, f_ident=1.0, f_major=0.5)
    assert per_rank[2] == RankSumGatherResult(lineage=lin_b, f_ident=1.0, f_major=0.5)
def main(args):
    """Main entry point for scripting. Use cmdline for command line entry.

    Loads taxonomy assignments and a genome signature, builds an LCA +
    lineage database from the non-identical matches in args.matches_sig,
    then (optionally) searches/gathers each contig and/or the whole genome,
    writing results through SearchFiles. Returns 0.
    """
    genomebase = os.path.basename(args.genome)
    match_rank = 'genus'

    # load taxonomy CSV
    tax_assign, _ = load_taxonomy_assignments(args.lineages_csv, start_column=2)
    print(f'loaded {len(tax_assign)} tax assignments.')

    # load the genome signature
    genome_sig = sourmash.load_one_signature(args.genome_sig, select_moltype=args.alphabet, ksize=args.ksize)

    # load all of the matches from search --containment in the database
    with open(args.matches_sig, 'rt') as fp:
        try:
            siglist = list(sourmash.load_signatures(fp, do_raise=True, quiet=False))
        except sourmash.exceptions.SourmashError:
            # unreadable/empty matches file -> treat as "no matches"
            siglist = []
    print(f"loaded {len(siglist)} matches from '{args.matches_sig}'")

    # Hack for examining members of our search database: remove exact matches.
    new_siglist = []
    for ss in siglist:
        if genome_sig.similarity(ss) == 1.0:
            print(f'removing an identical match: {ss.name()}')
        else:
            new_siglist.append(ss)
    siglist = new_siglist

    if not siglist:
        # write empty files so snakemake workflows don't complain; exit.
        print('no non-identical matches for this genome, exiting.')
        if not args.no_search_contigs:
            sf = SearchFiles(args.output_prefix, not args.no_search, args.gather, contigs=True)
            sf.close()
        if args.search_genome:
            gf = SearchFiles(args.output_prefix, not args.no_search, args.gather, contigs=False)
            gf.close()
        return 0

    # construct a template minhash object that we can use to create new 'uns
    # (fresh, empty MinHash objects with matching ksize/scaled/moltype)
    empty_mh = siglist[0].minhash.copy_and_clear()
    ksize = empty_mh.ksize
    scaled = empty_mh.scaled
    moltype = empty_mh.moltype

    # create empty LCA database to populate...
    lca_db = LCA_Database(ksize=ksize, scaled=scaled, moltype=moltype)
    lin_db = LineageDB()

    # ...with specific matches.
    for ss in siglist:
        ident = get_ident(ss)
        # NOTE(review): raises KeyError if a match ident is missing from the
        # taxonomy CSV — presumably intentional; confirm.
        lineage = tax_assign[ident]
        lca_db.insert(ss, ident=ident)
        lin_db.insert(ident, lineage)

    print(f'loaded {len(siglist)} signatures & created LCA Database')
    print('')
    print(f'reading contigs from {genomebase}')

    screed_iter = screed.open(args.genome)
    genome_len = 0

    if not args.no_search_contigs:
        sf = SearchFiles(args.output_prefix, not args.no_search, args.gather, contigs=True)
        for n, record in enumerate(screed_iter):
            # look at each contig individually
            mh = empty_mh.copy_and_clear()
            mh.add_sequence(record.sequence, force=True)
            # search, optionally aggregate matched hashes to get containment at rank
            seq_len = len(record.sequence)
            genome_len += seq_len
            num_hashes = len(mh.hashes)
            if not args.no_search:
                search_results, search_rank_results = search_containment_at_rank(mh, lca_db, lin_db, match_rank)
                if not search_results:
                    # write to unclassified
                    sf.unmatched.write(">" + record.name + "\n" + record.sequence + "\n")
                    continue  # if no search results, don't bother with gather
                else:
                    # first, print normal search --containment results
                    for sr in search_results:
                        sf.write_result(sr, record.name, seq_len, result_type="search")
                    # now, print containment at rank results
                    for sr in search_rank_results:
                        sf.write_result(sr, record.name, seq_len, result_type="ranksearch")
            if args.gather:
                # first, gather at match rank (default genus)
                gather_results = list(gather_at_rank(mh, lca_db, lin_db, match_rank))
                # write standard gather_results?
                if not gather_results:
                    # write to unclassified. should only get here if no search OR gather results
                    sf.unmatched.write(">" + record.name + "\n" + record.sequence + "\n")
                else:
                    # next, summarize at higher ranks
                    gather_taxonomy_per_rank = gather_guess_tax_at_each_rank(gather_results, num_hashes, \
                                                                             minimum_matches=args.gather_min_matches, \
                                                                             lowest_rank=match_rank, \
                                                                             taxlist=lca_utils.taxlist(include_strain=False))
                    #results = list of RankSumGatherResult = namedtuple('RankSumGatherResult', 'lineage, f_ident, f_major')
                    # write taxonomy out
                    for gr in gather_taxonomy_per_rank:
                        sf.write_result(gr, record.name, seq_len, result_type="rankgather")
        # NOTE(review): `n` is unbound here if the genome file has zero
        # records — would raise NameError; confirm empty inputs cannot occur.
        print(f"Processed {n+1} contigs.")
        # close contig files
        sf.close()

    if args.search_genome:
        gf = SearchFiles(args.output_prefix, not args.no_search, args.gather, contigs=False)
        # MAG workflow
        entire_mh = genome_sig.minhash
        genome_name = genome_sig.name()
        num_hashes = len(entire_mh.hashes)
        # genome_len is only 0 here when the contig loop above didn't run;
        # in that case screed_iter is still unconsumed and usable.
        if not genome_len:
            for record in screed_iter:
                genome_len += len(record.sequence)
        if not args.no_search:
            #results are guaranteed, otherwise would have exited before searching
            search_results, search_rank_results = search_containment_at_rank(entire_mh, lca_db, lin_db, match_rank)
            for sr in search_results:
                gf.write_result(sr, genome_name, genome_len, result_type="search")
            for sr in search_rank_results:
                gf.write_result(sr, genome_name, genome_len, result_type="ranksearch")
        if args.gather:
            gather_results = list(gather_at_rank(entire_mh, lca_db, lin_db, match_rank))
            # next, summarize at higher ranks
            gather_taxonomy_per_rank = gather_guess_tax_at_each_rank(gather_results, num_hashes, \
                                                                     minimum_matches=args.gather_min_matches, \
                                                                     lowest_rank=match_rank, \
                                                                     taxlist=lca_utils.taxlist(include_strain=False))
            for gather_res in gather_taxonomy_per_rank:
                gf.write_result(gather_res, genome_name, genome_len, result_type="rankgather")
        # close genome files
        gf.close()
    return 0