def make_all_matches(sigdict, tree, threshold):
    """
    Query an SBT (search tree) with every signature in a dictionary.

    Each value of `sigdict` is searched against `tree`; a leaf counts as a
    match when its recomputed similarity to the query is at or above
    `threshold`.

    Return a dictionary of d[signame] -> (match_name, similarity).  If
    several leaves qualify for the same query, the entry is overwritten
    each time, so the last qualifying leaf yielded by the tree is kept.
    """
    matches = {}

    for query_sig in sigdict.values():
        hit = None

        # a new best-match searcher is constructed for every query.
        for node in tree.find(SearchMinHashesFindBest().search,
                              query_sig, threshold):
            # the tree search does not always honor the threshold, so the
            # similarity is recomputed and checked here.
            score = node.data.similarity(query_sig)
            if not score >= threshold:
                continue

            hit = node.data
            print('match:', query_sig.name(), hit.name(), score)
            matches[query_sig.name()] = (hit.name(), score)

        if not hit:
            print('no match found:', query_sig.name())

    return matches
def do_search():
    """
    Run a best-match search of `streamsig` against `tree`.

    `tree`, `streamsig` and `args` are not parameters; they are resolved
    from the surrounding scope.  Returns a list of (similarity, signature)
    tuples, one for each leaf the tree yields at `args.threshold`.
    """
    finder = SearchMinHashesFindBest().search
    return [(streamsig.similarity(node.data), node.data)
            for node in tree.find(finder, streamsig, args.threshold)]
def sbt_search(args):
    """
    Command-line handler: search an SBT with a single query signature.

    `args` is the list of command-line arguments, parsed here with
    argparse.  Every leaf returned by the tree search is printed together
    with its similarity to the query, highest similarity first.  With
    --save-matches, all of the matching signatures are also written to the
    given file.
    """
    from sourmash_lib.sbt import SBT, GraphFactory
    from sourmash_lib.sbtmh import search_minhashes, SigLeaf
    from sourmash_lib.sbtmh import SearchMinHashesFindBest

    parser = argparse.ArgumentParser()
    parser.add_argument('sbt_name', help='name of SBT to load')
    parser.add_argument('query', help='signature to query')
    parser.add_argument('-k', '--ksize', type=int, default=DEFAULT_K)
    parser.add_argument('--threshold', default=0.08, type=float)
    parser.add_argument('--save-matches', type=argparse.FileType('wt'))
    parser.add_argument('--best-only', action='store_true')
    sourmash_args.add_moltype_args(parser)
    args = parser.parse_args(args)

    moltype = sourmash_args.calculate_moltype(args)

    # --best-only swaps in the searcher that tracks the best match so far.
    search_fn = (SearchMinHashesFindBest().search if args.best_only
                 else search_minhashes)

    tree = SBT.load(args.sbt_name, leaf_loader=SigLeaf.load)
    query = sourmash_args.load_query_signature(args.query,
                                               select_ksize=args.ksize,
                                               select_moltype=moltype)
    query_moltype = sourmash_args.get_moltype(query)
    query_ksize = query.estimator.ksize
    notify('loaded query: {}... (k={}, {})', query.name()[:30],
           query_ksize, query_moltype)

    hits = [(query.similarity(node.data), node.data)
            for node in tree.find(search_fn, query, args.threshold)]

    # order by descending similarity.
    hits.sort(key=lambda hit: -hit[0])
    for score, match in hits:
        print('{:.2f} {}'.format(score, match.name()))

    if args.save_matches:
        notify('saving all matches to "{}"', args.save_matches.name)
        matched_sigs = [match for (_, match) in hits]
        sig.save_signatures(matched_sigs, args.save_matches)
def sbt_gather(self, args):
    """
    Command-line handler: greedily decompose a query into SBT matches.

    `args` is the list of command-line arguments, parsed here with
    argparse.  Starting from the query signature, the best match in the
    tree is found, its hashes are removed from the query, and the search is
    repeated on the remainder until nothing matches any more.  The very
    first match must have a similarity to the original query of at least
    --threshold, otherwise the search stops immediately.

    The resulting composition is printed to stdout and, on request, written
    to --output (text) and --csv (fraction, name, similarity, sketch_kmers).

    Exits with status -1 if the query file does not yield exactly one
    signature, and with status 0 if nothing was found.  Raises Exception if
    both --dna and --protein are requested.
    """
    from sourmash_lib.sbt import SBT, GraphFactory
    from sourmash_lib.sbtmh import search_minhashes, SigLeaf
    from sourmash_lib.sbtmh import SearchMinHashesFindBest

    parser = argparse.ArgumentParser()
    parser.add_argument('sbt_name')
    parser.add_argument('query')
    parser.add_argument('-k', '--ksize', type=int, default=DEFAULT_K)
    parser.add_argument('--threshold', default=0.05, type=float)
    parser.add_argument('-o', '--output', type=argparse.FileType('wt'))
    parser.add_argument('--csv', type=argparse.FileType('wt'))

    sourmash_args.add_moltype_args(parser)

    args = parser.parse_args(args)

    if args.protein:
        if args.dna is True:
            raise Exception('cannot specify both --dna and --protein!')
        args.dna = False

    moltype = None
    if args.protein:
        moltype = 'protein'
    elif args.dna:
        moltype = 'dna'

    # the tree is loaded exactly once; it is not modified by the search.
    tree = SBT.load(args.sbt_name, leaf_loader=SigLeaf.load)

    sl = list(sig.load_signatures(args.query, select_ksize=args.ksize,
                                  select_moltype=moltype))
    if len(sl) != 1:
        # both lines of the error message belong on stderr.
        print('When loading query from "{}",'.format(args.query),
              file=sys.stderr)
        print('{} query signatures matching ksize and molecule type; need exactly one.'.format(len(sl)),
              file=sys.stderr)
        sys.exit(-1)

    query = sl[0]

    query_moltype = 'UNKNOWN'
    if query.estimator.is_molecule_type('dna'):
        query_moltype = 'DNA'
    elif query.estimator.is_molecule_type('protein'):
        query_moltype = 'protein'
    query_ksize = query.estimator.ksize
    print('loaded query: {}... (k={}, {})'.format(query.name()[:30],
                                                  query_ksize,
                                                  query_moltype))

    orig_query = query

    # the original query never changes, so its k-mer cardinality is
    # computed once; 0 means "unknown" (no HLL available).
    query_hll = orig_query.estimator.hll
    query_kmers = query_hll.estimate_cardinality() if query_hll else 0

    sum_found = 0.
    found = []
    while True:
        search_fn = SearchMinHashesFindBest().search

        # use super low threshold for this part of the search
        results = [(query.similarity(leaf.data), leaf.data)
                   for leaf in tree.find(search_fn, query, 0.00001)]

        if not results:          # no matches at all!
            break

        # take the best result; max() returns the first of equally good
        # hits, like a stable descending sort would.
        best_sim, best_ss = max(results, key=lambda x: x[0])
        sim = best_ss.similarity(orig_query)

        # adjust by size of leaf (kmer cardinality of original genome);
        # falls back to 0 when either cardinality is unavailable instead
        # of failing on a missing HLL or dividing by zero.
        leaf_hll = best_ss.estimator.hll
        if leaf_hll and query_kmers:
            leaf_kmers = leaf_hll.estimate_cardinality()
            f_of_total = leaf_kmers / query_kmers * sim
        else:
            f_of_total = 0

        if not found and sim < args.threshold:
            print('best match: {}'.format(best_ss.name()))
            print('similarity is {:.5f} of db signature;'.format(sim))
            print('this is below specified threshold => exiting.')
            break

        # subtract found hashes from search hashes, construct new search
        new_mins = set(query.estimator.mh.get_mins())
        found_mins = best_ss.estimator.mh.get_mins()

        # print interim & save
        print('found: {:.2f} {} {}'.format(f_of_total,
                                           len(new_mins),
                                           best_ss.name()))
        found.append((f_of_total, best_ss, sim))
        sum_found += f_of_total

        new_mins -= set(found_mins)
        if not new_mins:
            # every hash of the query is accounted for; there is nothing
            # left to search with, so do not build an empty estimator.
            break

        e = sourmash_lib.Estimators(ksize=args.ksize, n=len(new_mins))
        for m in new_mins:
            e.mh.add_hash(m)
        query = sig.SourmashSignature('foo', e)

    print('found {}, total fraction {:.3f}'.format(len(found), sum_found))
    print('')

    if not found:
        sys.exit(0)

    # largest fraction first.  Sort on the fraction only: comparing whole
    # tuples would fall through to comparing signature objects on ties.
    found.sort(key=lambda x: x[0], reverse=True)

    print('Composition:')
    for (frac, leaf_sketch, sim) in found:
        print('{:.2f} {}'.format(frac, leaf_sketch.name()))

    if args.output:
        print('Composition:', file=args.output)
        for (frac, leaf_sketch, sim) in found:
            print('{:.2f} {}'.format(frac, leaf_sketch.name()),
                  file=args.output)

    if args.csv:
        fieldnames = ['fraction', 'name', 'similarity', 'sketch_kmers']
        w = csv.DictWriter(args.csv, fieldnames=fieldnames)

        w.writeheader()
        for (frac, leaf_sketch, sim) in found:
            # signatures without an HLL get an empty sketch_kmers field.
            sketch_hll = leaf_sketch.estimator.hll
            cardinality = None
            if sketch_hll:
                cardinality = sketch_hll.estimate_cardinality()
            w.writerow(dict(fraction=frac, name=leaf_sketch.name(),
                            similarity=sim,
                            sketch_kmers=cardinality))
def categorize(self, args):
    """
    Command-line handler: find the best SBT match for many signatures.

    `args` is the list of command-line arguments, parsed here with
    argparse.  Each query file (or, with --traverse-directory, each '.sig'
    file found under the given directories) is searched against the tree
    and its best hit, excluding the query itself, is reported.  With --csv
    a row of (query file, best hit name, best hit similarity) is appended
    per query; queries without a hit get an empty name and 0.0.

    File names listed in the first column of --load-csv are skipped.

    Raises Exception if both --dna and --protein are requested.
    """
    from sourmash_lib.sbt import SBT, GraphFactory
    from sourmash_lib.sbtmh import search_minhashes, SigLeaf
    from sourmash_lib.sbtmh import SearchMinHashesFindBest

    parser = argparse.ArgumentParser()
    parser.add_argument('sbt_name')
    parser.add_argument('queries', nargs='+')
    parser.add_argument('-k', '--ksize', type=int, default=DEFAULT_K)
    parser.add_argument('--threshold', default=0.08, type=float)
    parser.add_argument('--traverse-directory', action="store_true")

    sourmash_args.add_moltype_args(parser)

    parser.add_argument('--csv', type=argparse.FileType('at'))
    parser.add_argument('--load-csv', default=None)

    args = parser.parse_args(args)

    if args.protein:
        if args.dna is True:
            raise Exception('cannot specify both --dna and --protein!')
        args.dna = False

    moltype = None
    if args.protein:
        moltype = 'protein'
    elif args.dna:
        moltype = 'dna'

    already_names = set()
    if args.load_csv:
        with open(args.load_csv, 'rt') as fp:
            for row in csv.reader(fp):
                # blank lines come back as empty rows; skip them rather
                # than failing on row[0].
                if row:
                    already_names.add(row[0])

    tree = SBT.load(args.sbt_name, leaf_loader=SigLeaf.load)

    if args.traverse_directory:
        candidates = []
        for dirname in args.queries:
            for root, dirs, files in os.walk(dirname):
                for name in files:
                    if name.endswith('.sig'):
                        candidates.append(os.path.join(root, name))
    else:
        candidates = args.queries

    # honor --load-csv in both modes, keeping the original order.
    inp_files = [name for name in candidates if name not in already_names]

    print('found {} files to query'.format(len(inp_files)))

    loader = sourmash_args.LoadSingleSignatures(inp_files,
                                                args.ksize, moltype)

    # one writer for the whole run instead of one per query.
    csv_writer = csv.writer(args.csv) if args.csv else None

    for queryfile, query, query_moltype, query_ksize in loader:
        print('loaded query: {}... (k={}, {})'.format(query.name()[:30],
                                                      query_ksize,
                                                      query_moltype))

        query_md5 = query.md5sum()
        search_fn = SearchMinHashesFindBest().search
        results = [(query.similarity(leaf.data), leaf.data)
                   for leaf in tree.find(search_fn, query, args.threshold)
                   if leaf.data.md5sum() != query_md5]     # ignore self

        best_hit_sim = 0.0
        best_hit_query_name = ""
        if results:
            # max() returns the first of equally similar hits, matching a
            # stable descending sort.
            best_hit_sim, best_hit_query = max(results, key=lambda x: x[0])
            print('for {}, found: {:.2f} {}'.format(query.name(),
                                                    best_hit_sim,
                                                    best_hit_query.name()))
            best_hit_query_name = best_hit_query.name()
        else:
            print('for {}, no match found'.format(query.name()))

        if csv_writer is not None:
            csv_writer.writerow([queryfile, best_hit_query_name,
                                 best_hit_sim])

    if loader.skipped_ignore:
        print('skipped/ignore: {}'.format(loader.skipped_ignore))
    if loader.skipped_nosig:
        print('skipped/nosig: {}'.format(loader.skipped_nosig))
def sbt_search(self, args):
    """
    Command-line handler: search an SBT with a single query signature.

    `args` is the list of command-line arguments, parsed here with
    argparse.  Every leaf returned by the tree search is printed with its
    similarity to the query, highest similarity first.  With --save-matches
    all matching signatures are also written to the given file; with
    --best-only the best-match search function is used.

    Exits with status -1 if the query file does not yield exactly one
    signature.  Raises Exception if both --dna and --protein are requested.
    """
    from sourmash_lib.sbt import SBT, GraphFactory
    from sourmash_lib.sbtmh import search_minhashes, SigLeaf
    from sourmash_lib.sbtmh import SearchMinHashesFindBest

    parser = argparse.ArgumentParser()
    parser.add_argument('sbt_name')
    parser.add_argument('query')
    parser.add_argument('-k', '--ksize', type=int, default=DEFAULT_K)
    parser.add_argument('--threshold', default=0.08, type=float)
    parser.add_argument('--save-matches', type=argparse.FileType('wt'))
    parser.add_argument('--best-only', action='store_true')

    sourmash_args.add_moltype_args(parser)
    args = parser.parse_args(args)

    if args.protein:
        if args.dna is True:
            raise Exception('cannot specify both --dna and --protein!')
        args.dna = False

    moltype = None
    if args.protein:
        moltype = 'protein'
    elif args.dna:
        moltype = 'dna'

    search_fn = search_minhashes
    if args.best_only:
        search_fn = SearchMinHashesFindBest().search

    tree = SBT.load(args.sbt_name, leaf_loader=SigLeaf.load)
    sl = list(sig.load_signatures(args.query, select_ksize=args.ksize,
                                  select_moltype=moltype))
    if len(sl) != 1:
        # both lines of the error message belong on stderr.
        print('When loading query from "{}",'.format(args.query),
              file=sys.stderr)
        print('{} query signatures matching ksize and molecule type; need exactly one.'.format(len(sl)),
              file=sys.stderr)
        sys.exit(-1)

    query = sl[0]

    query_moltype = 'UNKNOWN'
    if query.estimator.is_molecule_type('dna'):
        query_moltype = 'DNA'
    elif query.estimator.is_molecule_type('protein'):
        query_moltype = 'protein'
    query_ksize = query.estimator.ksize
    print('loaded query: {}... (k={}, {})'.format(query.name()[:30],
                                                  query_ksize,
                                                  query_moltype))

    results = [(query.similarity(leaf.data), leaf.data)
               for leaf in tree.find(search_fn, query, args.threshold)]

    results.sort(key=lambda x: -x[0])   # reverse sort on similarity
    # use a separate loop name so the query signature is not clobbered.
    for (similarity, match) in results:
        print('{:.2f} {}'.format(similarity, match.name()))

    if args.save_matches:
        outname = args.save_matches.name
        print('saving all matches to "{}"'.format(outname))
        sig.save_signatures([m for (sim, m) in results],
                            args.save_matches)
def categorize(args):
    """
    Command-line handler: find the best SBT match for many signatures.

    `args` is the list of command-line arguments, parsed here with
    argparse.  Each query file (or, with --traverse-directory, each file
    returned by sourmash_args.traverse_find_sigs) is searched against the
    tree and its best hit, excluding the query itself, is reported via
    notify().  With --csv a row of (query file, best hit name, best hit
    similarity) is appended per query; queries without a hit get an empty
    name and 0.0.

    File names listed in the first column of --load-csv are skipped.
    Queries are processed in sorted file-name order so that the output is
    reproducible from run to run.
    """
    from sourmash_lib.sbt import SBT, GraphFactory
    from sourmash_lib.sbtmh import search_minhashes, SigLeaf
    from sourmash_lib.sbtmh import SearchMinHashesFindBest

    parser = argparse.ArgumentParser()
    parser.add_argument('sbt_name', help='name of SBT to load')
    parser.add_argument('queries', nargs='+',
                        help='list of signatures to categorize')
    parser.add_argument('-k', '--ksize', type=int, default=None)
    parser.add_argument('--threshold', default=0.08, type=float)
    parser.add_argument('--traverse-directory', action="store_true")

    sourmash_args.add_moltype_args(parser)

    parser.add_argument('--csv', type=argparse.FileType('at'))
    parser.add_argument('--load-csv', default=None)

    args = parser.parse_args(args)
    moltype = sourmash_args.calculate_moltype(args)

    already_names = set()
    if args.load_csv:
        with open(args.load_csv, 'rt') as fp:
            for row in csv.reader(fp):
                # blank lines come back as empty rows; skip them rather
                # than failing on row[0].
                if row:
                    already_names.add(row[0])

    tree = SBT.load(args.sbt_name, leaf_loader=SigLeaf.load)

    if args.traverse_directory:
        candidates = sourmash_args.traverse_find_sigs(args.queries)
    else:
        candidates = args.queries

    # drop duplicates and anything already categorized, once; sorting
    # makes the processing (and CSV row) order deterministic, which plain
    # set iteration is not.
    inp_files = sorted(set(candidates) - already_names)

    notify('found {} files to query', len(inp_files))

    loader = sourmash_args.LoadSingleSignatures(inp_files,
                                                args.ksize, moltype)

    # one writer for the whole run instead of one per query.
    csv_writer = csv.writer(args.csv) if args.csv else None

    for queryfile, query, query_moltype, query_ksize in loader:
        notify('loaded query: {}... (k={}, {})', query.name()[:30],
               query_ksize, query_moltype)

        query_md5 = query.md5sum()
        search_fn = SearchMinHashesFindBest().search
        results = [(query.similarity(leaf.data), leaf.data)
                   for leaf in tree.find(search_fn, query, args.threshold)
                   if leaf.data.md5sum() != query_md5]     # ignore self.

        best_hit_sim = 0.0
        best_hit_query_name = ""
        if results:
            # max() returns the first of equally similar hits, matching a
            # stable descending sort.
            best_hit_sim, best_hit_query = max(results, key=lambda x: x[0])
            notify('for {}, found: {:.2f} {}', query.name(),
                   best_hit_sim,
                   best_hit_query.name())
            best_hit_query_name = best_hit_query.name()
        else:
            notify('for {}, no match found', query.name())

        if csv_writer is not None:
            csv_writer.writerow([queryfile, best_hit_query_name,
                                 best_hit_sim])

    if loader.skipped_ignore:
        notify('skipped/ignore: {}', loader.skipped_ignore)
    if loader.skipped_nosig:
        notify('skipped/nosig: {}', loader.skipped_nosig)
def search(args):
    """
    Command-line handler: search signatures and/or SBTs with one query.

    `args` is the list of command-line arguments, parsed here with
    argparse.  The query is compared against every database given on the
    command line; matches scoring at or above --threshold are collected
    (one per distinct md5sum), sorted best first, and the top
    --num-results are printed through print_results().

    Options of note:
      --containment  score with query.contained_by() instead of
                     query.similarity()
      --scaled       downsample the query first (only possible when its
                     minhash has a non-zero max_hash)
      --best-only    use the best-match SBT search function
      --output       write all matches as CSV (similarity, name, filename,
                     md5)
      --save-matches write all matching signatures to a file

    Exits with status -1 if --scaled is used on a query that cannot be
    downsampled, or if no databases could be loaded.
    """
    from sourmash_lib.sbt import SBT, GraphFactory
    from sourmash_lib.sbtmh import search_minhashes, SigLeaf
    from sourmash_lib.sbtmh import SearchMinHashesFindBest

    parser = argparse.ArgumentParser()
    parser.add_argument('query', help='query signature')
    parser.add_argument('databases', help='signatures/SBTs to search',
                        nargs='+')
    parser.add_argument('-q', '--quiet', action='store_true',
                        help='suppress non-error output')
    parser.add_argument('--threshold', default=0.08, type=float,
                        help='minimum threshold for reporting matches')
    parser.add_argument('--save-matches', type=argparse.FileType('wt'),
                        help='output matching signatures to this file.')
    parser.add_argument(
        '--best-only', action='store_true',
        help='report only the best match (with greater speed).')
    parser.add_argument('-n', '--num-results', default=3, type=int,
                        help='number of results to report')
    parser.add_argument('--containment', action='store_true',
                        help='evaluate containment rather than similarity')
    parser.add_argument(
        '--scaled', type=float,
        help='downsample query to this scaled factor (yields greater speed)')
    parser.add_argument('-o', '--output', type=argparse.FileType('wt'),
                        help='output CSV containing matches to this file')

    sourmash_args.add_ksize_arg(parser, DEFAULT_LOAD_K)
    sourmash_args.add_moltype_args(parser)

    args = parser.parse_args(args)
    set_quiet(args.quiet)
    moltype = sourmash_args.calculate_moltype(args)

    # set up the query.
    query = sourmash_args.load_query_signature(args.query,
                                               select_ksize=args.ksize,
                                               select_moltype=moltype)
    query_moltype = sourmash_args.get_moltype(query)
    query_ksize = query.minhash.ksize
    notify('loaded query: {}... (k={}, {})', query.name()[:30],
           query_ksize, query_moltype)

    # downsample if requested
    if args.scaled:
        if query.minhash.max_hash == 0:
            error('cannot downsample a signature not created with --scaled')
            sys.exit(-1)

        notify('downsampling query from scaled={} to {}',
               query.minhash.scaled, int(args.scaled))
        query.minhash = query.minhash.downsample_scaled(args.scaled)

    # set up the search function(s)
    search_fn = search_minhashes

    # similarity vs containment
    if args.containment:
        def query_similarity(other):
            return query.contained_by(other, downsample=True)
    else:
        def query_similarity(other):
            return query.similarity(other, downsample=True)

    # set up the search databases
    databases = sourmash_args.load_sbts_and_sigs(args.databases,
                                                 query_ksize, query_moltype)

    if not len(databases):
        error('Nothing found to search!')
        sys.exit(-1)

    # collect results across all the trees
    SearchResult = namedtuple('SearchResult',
                              'similarity, match_sig, md5, filename, name')
    results = []
    found_md5 = set()
    for (sbt_or_siglist, filename, is_sbt) in databases:
        if args.best_only:
            # a fresh best-match searcher for each database.
            search_fn = SearchMinHashesFindBest().search

        if is_sbt:
            notify('Searching SBT {}', filename)
            candidates = (leaf.data for leaf in
                          sbt_or_siglist.find(search_fn, query,
                                              args.threshold))
        else:  # list of signatures
            candidates = sbt_or_siglist

        # SBT leaves and plain signatures are filtered identically: the
        # score is recomputed, checked against the threshold, and each
        # distinct md5sum is reported only once.
        for match in candidates:
            similarity = query_similarity(match)
            if not similarity >= args.threshold:
                continue

            md5 = match.md5sum()
            if md5 in found_md5:
                continue

            found_md5.add(md5)
            results.append(SearchResult(similarity=similarity,
                                        match_sig=match,
                                        md5=md5,
                                        filename=filename,
                                        name=match.name()))

    # sort results on similarity (reverse)
    results.sort(key=lambda x: -x.similarity)

    if args.best_only:
        notify("(truncated search because of --best-only; "
               "only trust top result)")

    n_matches = len(results)
    if n_matches <= args.num_results:
        print_results('{} matches:', len(results))
    else:
        print_results('{} matches; showing first {}:',
                      len(results), args.num_results)
        n_matches = args.num_results

    # output!
    print_results("similarity match")
    print_results("---------- -----")
    for sr in results[:n_matches]:
        pct = '{:.1f}%'.format(sr.similarity * 100)
        name = sr.match_sig._display_name(60)
        print_results('{:>6} {}', pct, name)

    if args.output:
        fieldnames = ['similarity', 'name', 'filename', 'md5']
        w = csv.DictWriter(args.output, fieldnames=fieldnames)
        w.writeheader()
        for sr in results:
            # the signature object itself is not a CSV column.
            d = dict(sr._asdict())
            del d['match_sig']
            w.writerow(d)

    # save matching signatures upon request
    if args.save_matches:
        outname = args.save_matches.name
        notify('saving all matched signatures to "{}"', outname)
        sig.save_signatures([sr.match_sig for sr in results],
                            args.save_matches)