Exemplo n.º 1
0
def sbt_combine(args):
    """Combine several saved SBTs into one tree and save the result.

    ``args`` is the list of command-line arguments: the name to save the
    combined SBT under, followed by one or more SBTs to combine.  The first
    input SBT is loaded and each remaining one is combined into it in the
    order given; the resulting tree is then saved.
    """
    from sourmash_lib.sbt import SBT, GraphFactory
    from sourmash_lib.sbtmh import SigLeaf

    cli = argparse.ArgumentParser()
    cli.add_argument('sbt_name', help='name to save SBT into')
    cli.add_argument('sbts', nargs='+', help='SBTs to combine to a new SBT')
    cli.add_argument('-x', '--bf-size', type=float, default=1e5)
    sourmash_args.add_moltype_args(cli)

    args = cli.parse_args(args)
    # The molecule type is computed here but not used further down.
    moltype = sourmash_args.calculate_moltype(args)

    sbt_paths = list(args.sbts)
    notify('combining {} SBTs', len(sbt_paths))

    # nargs='+' guarantees at least one input, so index 0 always exists.
    first_path, remaining_paths = sbt_paths[0], sbt_paths[1:]
    tree = SBT.load(first_path, leaf_loader=SigLeaf.load)

    for path in remaining_paths:
        # TODO: check if parameters are the same for both trees!
        tree.combine(SBT.load(path, leaf_loader=SigLeaf.load))

    notify('saving SBT under "{}".', args.sbt_name)
    tree.save(args.sbt_name)
Exemplo n.º 2
0
def test_sbt_fsstorage():
    """Save an SBT through FSStorage, reload it, and compare search results.

    The search for the last leaf added must return the same set of results
    before saving and after loading, and both the tree's storage path and
    the '.fstree' directory must exist under the temporary location.
    """
    factory = GraphFactory(31, 1e5, 4)
    with utils.TempDirectory() as location:
        tree = SBT(factory)

        for filename in utils.SIG_FILES:
            sig_path = utils.get_test_data(filename)
            loaded_sig = next(signature.load_signatures(sig_path))
            leaf = SigLeaf(os.path.basename(filename), loaded_sig)
            tree.add_node(leaf)
            # the last leaf added ends up being the query
            to_search = leaf

        print('*' * 60)
        print("{}:".format(to_search.metadata))
        hits_before = tree.find(search_minhashes, to_search.data, 0.1)
        old_result = set(str(hit) for hit in hits_before)
        print(*old_result, sep='\n')

        tree_path = os.path.join(location, 'tree')
        storage_dir = os.path.join(location, '.fstree')
        with FSStorage(storage_dir) as storage:
            tree.save(tree_path, storage=storage)

        tree = SBT.load(tree_path, leaf_loader=SigLeaf.load)
        print('*' * 60)
        print("{}:".format(to_search.metadata))
        hits_after = tree.find(search_minhashes, to_search.data, 0.1)
        new_result = set(str(hit) for hit in hits_after)
        print(*new_result, sep='\n')

        assert old_result == new_result

        assert os.path.exists(os.path.join(location, tree.storage.path))
        assert os.path.exists(os.path.join(location, '.fstree'))
Exemplo n.º 3
0
def load_sbts_and_sigs(filenames, query_ksize, query_moltype):
    """Load each file as an SBT, falling back to a signature file.

    Returns a list of ``(data, filename, is_sbt)`` tuples: ``data`` is the
    loaded tree when ``is_sbt`` is True, otherwise a list of signatures
    selected by ``query_ksize`` and ``query_moltype``.

    Exits with status -1 when a loaded tree's ksize differs from
    ``query_ksize``, or when the signature fallback raises
    EnvironmentError.
    """
    databases = []
    for path in filenames:
        # First attempt: treat the file as an SBT.
        try:
            tree = SBT.load(path, leaf_loader=SigLeaf.load)
            tree_ksize = get_ksize(tree)
            if tree_ksize != query_ksize:
                error("ksize on tree '{}' is {};", path, tree_ksize)
                error('this is different from query ksize of {}.', query_ksize)
                sys.exit(-1)

            databases.append((tree, path, True))
            notify('loaded SBT {}', path)
            continue
        except (ValueError, EnvironmentError):
            # not an SBT - try as a .sig
            pass

        # Second attempt: treat the file as a collection of signatures.
        try:
            signatures = list(sig.load_signatures(path,
                                                  select_ksize=query_ksize,
                                                  select_moltype=query_moltype))
            databases.append((list(signatures), path, False))
            notify('loaded {} signatures from {}', len(signatures), path)
        except EnvironmentError:
            error("file '{}' does not exist", path)
            sys.exit(-1)

    return databases
Exemplo n.º 4
0
def test_tree_save_load(n_children):
    """Check that saving and reloading an SBT leaves search results unchanged.

    ``n_children`` is passed to the SBT constructor as ``d``.
    """
    tree = SBT(GraphFactory(31, 1e5, 4), d=n_children)

    for filename in utils.SIG_FILES:
        sig_path = utils.get_test_data(filename)
        loaded_sig = next(signature.load_signatures(sig_path))
        leaf = SigLeaf(os.path.basename(filename), loaded_sig)
        tree.add_node(leaf)
        # the last leaf added ends up being the query
        to_search = leaf

    print('*' * 60)
    print("{}:".format(to_search.metadata))
    hits_before = tree.find(search_minhashes, to_search.data, 0.1)
    old_result = set(str(hit) for hit in hits_before)
    print(*old_result, sep='\n')

    with utils.TempDirectory() as location:
        tree_path = os.path.join(location, 'demo')
        tree.save(tree_path)
        tree = SBT.load(tree_path, leaf_loader=SigLeaf.load)

        print('*' * 60)
        print("{}:".format(to_search.metadata))
        hits_after = tree.find(search_minhashes, to_search.data, 0.1)
        new_result = set(str(hit) for hit in hits_after)
        print(*new_result, sep='\n')

        assert old_result == new_result
Exemplo n.º 5
0
def test_tree_repair():
    """Check that 'leaves.sbt.json' and 'v3.sbt.json' give the same search hits.

    Both trees are searched with the first signature in ``utils.SIG_FILES``;
    the result sets must be equal and contain exactly four entries.
    """
    def load_tree(name):
        return SBT.load(utils.get_test_data(name), leaf_loader=SigLeaf.load)

    def search(tree, query):
        return set(str(hit) for hit in tree.find(search_minhashes, query, 0.1))

    tree_repair = load_tree('leaves.sbt.json')
    tree_cur = load_tree('v3.sbt.json')

    query_path = utils.get_test_data(utils.SIG_FILES[0])
    to_search = next(signature.load_signatures(query_path))

    results_repair = search(tree_repair, to_search)
    results_cur = search(tree_cur, to_search)

    assert results_repair == results_cur
    assert len(results_repair) == 4
Exemplo n.º 6
0
def load_sbts_and_sigs(filenames, query_ksize, query_moltype, traverse=False):
    """Load each entry of ``filenames`` as an SBT or as a signature file.

    Returns a list of ``(data, filename, is_sbt)`` tuples: ``data`` is the
    loaded tree when ``is_sbt`` is True, otherwise a list of signatures
    selected by ``query_ksize`` and ``query_moltype``.

    When ``traverse`` is true and an entry is a directory, every file
    yielded by ``traverse_find_sigs`` for it is loaded as signatures and
    recorded under the directory's name; files that raise an ordinary
    exception while loading are skipped.

    Exits with status -1 when a loaded tree's ksize differs from
    ``query_ksize``, or when the signature fallback raises
    EnvironmentError.
    """
    n_signatures = 0
    n_databases = 0
    databases = []
    for sbt_or_sigfile in filenames:
        if traverse and os.path.isdir(sbt_or_sigfile):
            for sigfile in traverse_find_sigs([sbt_or_sigfile]):
                try:
                    siglist = list(
                        sig.load_signatures(sigfile,
                                            ksize=query_ksize,
                                            select_moltype=query_moltype))
                    databases.append((siglist, sbt_or_sigfile, False))
                    notify('loaded {} signatures from {}',
                           len(siglist),
                           sigfile,
                           end='\r')
                    n_signatures += len(siglist)
                except Exception:
                    # Best effort while traversing: skip files that fail to
                    # load.  'Exception' (rather than a bare except) lets
                    # KeyboardInterrupt and SystemExit propagate.
                    continue
            continue

        try:
            tree = SBT.load(sbt_or_sigfile, leaf_loader=SigLeaf.load)
            ksize = get_ksize(tree)
            if ksize != query_ksize:
                error("ksize on tree '{}' is {};", sbt_or_sigfile, ksize)
                error('this is different from query ksize of {}.', query_ksize)
                sys.exit(-1)

            databases.append((tree, sbt_or_sigfile, True))
            notify('loaded SBT {}', sbt_or_sigfile, end='\r')
            n_databases += 1
        except (ValueError, EnvironmentError):
            # not an SBT - try as a .sig
            try:
                siglist = list(
                    sig.load_signatures(sbt_or_sigfile,
                                        ksize=query_ksize,
                                        select_moltype=query_moltype))
                databases.append((siglist, sbt_or_sigfile, False))
                notify('loaded {} signatures from {}',
                       len(siglist),
                       sbt_or_sigfile,
                       end='\r')
                n_signatures += len(siglist)
            except EnvironmentError:
                error("\nfile '{}' does not exist", sbt_or_sigfile)
                sys.exit(-1)

    # Blank out whatever the last '\r'-terminated progress message left on
    # the line before printing the summary.
    notify(' ' * 79, end='\r')
    # Pass the values as arguments, like every other notify() call here,
    # instead of pre-formatting the message.
    notify('loaded {} signatures and {} databases total.',
           n_signatures, n_databases)

    if databases:
        print('')

    return databases
Exemplo n.º 7
0
def test_tree_v1_load():
    """Check that 'v1.sbt.json' and 'v2.sbt.json' give the same search hits.

    Both trees are searched with the first signature in ``utils.SIG_FILES``;
    the result sets must be equal and contain exactly four entries.
    """
    def load_tree(name):
        return SBT.load(utils.get_test_data(name), leaf_loader=SigLeaf.load)

    def search(tree, query):
        return set(str(hit) for hit in tree.find(search_minhashes, query, 0.1))

    tree_v1 = load_tree('v1.sbt.json')
    tree_v2 = load_tree('v2.sbt.json')

    query_path = utils.get_test_data(utils.SIG_FILES[0])
    to_search = next(signature.load_signatures(query_path))

    results_v1 = search(tree_v1, to_search)
    results_v2 = search(tree_v2, to_search)

    assert results_v1 == results_v2
    assert len(results_v1) == 4
Exemplo n.º 8
0
def test_tree_repair_add_node():
    """Add leaves to the 'leaves.sbt.json' tree and check its structure.

    After adding one leaf per file in ``utils.SIG_FILES``, every non-root
    position must have an internal ``Node`` as parent, and no ``Leaf`` may
    have a child holding a node.
    """
    tree_repair = SBT.load(utils.get_test_data('leaves.sbt.json'),
                           leaf_loader=SigLeaf.load)

    for filename in utils.SIG_FILES:
        sig_path = utils.get_test_data(filename)
        loaded_sig = next(signature.load_signatures(sig_path))
        tree_repair.add_node(SigLeaf(os.path.basename(filename), loaded_sig))

    # Iterate over a snapshot of the (position, node) pairs.
    snapshot = list(tree_repair.nodes.items())
    for pos, node in snapshot:
        # Every parent of a node must be an internal node (and not a leaf),
        # except for node 0 (the root), whose parent is None.
        if pos != 0:
            parent = tree_repair.parent(pos)
            assert isinstance(parent.node, Node)

        # Leaf nodes can't have children
        if isinstance(node, Leaf):
            assert all(child.node is None
                       for child in tree_repair.children(pos))
Exemplo n.º 9
0
def sbt_search(args):
    """Search an SBT with a query signature and print the matches.

    ``args`` is the list of command-line arguments: the SBT to load and the
    query signature file, plus optional ``-k/--ksize``, ``--threshold``,
    ``--save-matches``, ``--best-only`` and the molecule-type options.
    Matches are printed in order of decreasing similarity to the query;
    with ``--save-matches`` the matching signatures are also written out.
    """
    from sourmash_lib.sbt import SBT, GraphFactory
    from sourmash_lib.sbtmh import search_minhashes, SigLeaf
    from sourmash_lib.sbtmh import SearchMinHashesFindBest

    cli = argparse.ArgumentParser()
    cli.add_argument('sbt_name', help='name of SBT to load')
    cli.add_argument('query', help='signature to query')
    cli.add_argument('-k', '--ksize', type=int, default=DEFAULT_K)
    cli.add_argument('--threshold', default=0.08, type=float)
    cli.add_argument('--save-matches', type=argparse.FileType('wt'))
    cli.add_argument('--best-only', action='store_true')
    sourmash_args.add_moltype_args(cli)

    args = cli.parse_args(args)
    moltype = sourmash_args.calculate_moltype(args)

    if args.best_only:
        search_fn = SearchMinHashesFindBest().search
    else:
        search_fn = search_minhashes

    tree = SBT.load(args.sbt_name, leaf_loader=SigLeaf.load)
    query = sourmash_args.load_query_signature(args.query,
                                               select_ksize=args.ksize,
                                               select_moltype=moltype)
    query_moltype = sourmash_args.get_moltype(query)
    query_ksize = query.estimator.ksize
    notify('loaded query: {}... (k={}, {})',
           query.name()[:30], query_ksize, query_moltype)

    # Pair every hit with its similarity to the query.
    results = [(query.similarity(leaf.data), leaf.data)
               for leaf in tree.find(search_fn, query, args.threshold)]

    # Sorting on the negated similarity puts the best match first.
    results.sort(key=lambda pair: -pair[0])
    for similarity, match in results:
        print('{:.2f} {}'.format(similarity, match.name()))

    if args.save_matches:
        outname = args.save_matches.name
        notify('saving all matches to "{}"', outname)
        sig.save_signatures([match for (_, match) in results],
                            args.save_matches)
Exemplo n.º 10
0
def test_save_sparseness(n_children):
    """Save an SBT with ``sparseness=1.0`` and check the reloaded tree.

    ``n_children`` is passed to the SBT constructor as ``d``.  The reloaded
    tree must contain no ``Node`` instances among its stored nodes, must
    return the same search results as the original tree, and must keep the
    parent/child structure checked at the end of the test.
    """
    tree = SBT(GraphFactory(31, 1e5, 4), d=n_children)

    for filename in utils.SIG_FILES:
        sig_path = utils.get_test_data(filename)
        loaded_sig = next(signature.load_signatures(sig_path))
        leaf = SigLeaf(os.path.basename(filename), loaded_sig)
        tree.add_node(leaf)
        # the last leaf added ends up being the query
        to_search = leaf

    print('*' * 60)
    print("{}:".format(to_search.metadata))
    hits_before = tree.find(search_minhashes, to_search.data, 0.1)
    old_result = set(str(hit) for hit in hits_before)
    print(*old_result, sep='\n')

    with utils.TempDirectory() as location:
        tree_path = os.path.join(location, 'demo')
        tree.save(tree_path, sparseness=1.0)
        tree_loaded = SBT.load(tree_path, leaf_loader=SigLeaf.load)
        assert not any(isinstance(stored, Node)
                       for stored in tree_loaded.nodes.values())

        print('*' * 60)
        print("{}:".format(to_search.metadata))
        hits_after = tree_loaded.find(search_minhashes, to_search.data, 0.1)
        new_result = set(str(hit) for hit in hits_after)
        print(*new_result, sep='\n')

        assert old_result == new_result

        # Iterate over a snapshot of the (position, node) pairs.
        snapshot = list(tree_loaded.nodes.items())
        for pos, node in snapshot:
            # Every parent of a node must be an internal node (and not a leaf),
            # except for node 0 (the root), whose parent is None.
            if pos != 0:
                parent = tree_loaded.parent(pos)
                assert isinstance(parent.node, Node)

            # Leaf nodes can't have children
            if isinstance(node, Leaf):
                assert all(child.node is None
                           for child in tree_loaded.children(pos))
Exemplo n.º 11
0
def test_sbt_ipfsstorage():
    """Save an SBT through IPFSStorage, reload it, and compare search results.

    The test is skipped when ``ipfsapi`` cannot be imported and marked as an
    expected failure when saving raises an ipfsapi ConnectionError.
    """
    ipfsapi = pytest.importorskip('ipfsapi')

    factory = GraphFactory(31, 1e5, 4)
    with utils.TempDirectory() as location:
        tree = SBT(factory)

        for filename in utils.SIG_FILES:
            sig_path = utils.get_test_data(filename)
            loaded_sig = next(signature.load_signatures(sig_path))
            leaf = SigLeaf(os.path.basename(filename), loaded_sig)
            tree.add_node(leaf)
            # the last leaf added ends up being the query
            to_search = leaf

        print('*' * 60)
        print("{}:".format(to_search.metadata))
        hits_before = tree.find(search_minhashes, to_search.data, 0.1)
        old_result = set(str(hit) for hit in hits_before)
        print(*old_result, sep='\n')

        tree_path = os.path.join(location, 'tree')
        try:
            with IPFSStorage() as storage:
                tree.save(tree_path, storage=storage)
        except ipfsapi.exceptions.ConnectionError:
            pytest.xfail("ipfs not installed/functioning probably")

        with IPFSStorage() as storage:
            tree = SBT.load(tree_path,
                            leaf_loader=SigLeaf.load,
                            storage=storage)

            print('*' * 60)
            print("{}:".format(to_search.metadata))
            hits_after = tree.find(search_minhashes, to_search.data, 0.1)
            new_result = set(str(hit) for hit in hits_after)
            print(*new_result, sep='\n')

            assert old_result == new_result
Exemplo n.º 12
0
    def categorize(self, args):
        """Find the best SBT match for each query signature.

        ``args`` is the list of command-line arguments: the SBT to load and
        one or more query files (or, with ``--traverse-directory``,
        directories to walk for '.sig' files).  For every loaded query the
        best hit other than the query itself is printed; with ``--csv`` a
        row ``[queryfile, best hit name, best hit similarity]`` is written
        per query.  With ``--load-csv``, names in the first column of that
        file are excluded when traversing directories.

        Raises Exception when both --dna and --protein are requested.
        """
        from sourmash_lib.sbt import SBT, GraphFactory
        from sourmash_lib.sbtmh import search_minhashes, SigLeaf
        from sourmash_lib.sbtmh import SearchMinHashesFindBest

        cli = argparse.ArgumentParser()
        cli.add_argument('sbt_name')
        cli.add_argument('queries', nargs='+')
        cli.add_argument('-k', '--ksize', type=int, default=DEFAULT_K)
        cli.add_argument('--threshold', default=0.08, type=float)
        cli.add_argument('--traverse-directory', action="store_true")

        sourmash_args.add_moltype_args(cli)

        cli.add_argument('--csv', type=argparse.FileType('at'))
        cli.add_argument('--load-csv', default=None)

        args = cli.parse_args(args)

        # Resolve the molecule type; --protein wins and switches --dna off.
        if args.protein:
            if args.dna is True:
                raise Exception('cannot specify both --dna and --protein!')
            args.dna = False
            moltype = 'protein'
        elif args.dna:
            moltype = 'dna'
        else:
            moltype = None

        # Names listed in the first column of a previous CSV.
        already_names = set()
        if args.load_csv:
            with open(args.load_csv, 'rt') as fp:
                for row in csv.reader(fp):
                    already_names.add(row[0])

        tree = SBT.load(args.sbt_name, leaf_loader=SigLeaf.load)

        if not args.traverse_directory:
            inp_files = args.queries
        else:
            inp_files = []
            for dirname in args.queries:
                for root, _dirs, files in os.walk(dirname):
                    for name in files:
                        if not name.endswith('.sig'):
                            continue
                        fullname = os.path.join(root, name)
                        if fullname not in already_names:
                            inp_files.append(fullname)

        print('found {} files to query'.format(len(inp_files)))

        loader = sourmash_args.LoadSingleSignatures(inp_files,
                                                    args.ksize, moltype)
        for queryfile, query, query_moltype, query_ksize in loader:
            print('loaded query: {}... (k={}, {})'.format(query.name()[:30],
                                                          query_ksize,
                                                          query_moltype))

            # A fresh search object is created for every query.
            search_fn = SearchMinHashesFindBest().search

            # Collect (similarity, signature) pairs, ignoring the query
            # itself (same md5sum).
            results = [(query.similarity(leaf.data), leaf.data)
                       for leaf in tree.find(search_fn, query, args.threshold)
                       if leaf.data.md5sum() != query.md5sum()]

            if not results:
                best_hit_sim = 0.0
                best_hit_query_name = ""
                print('for {}, no match found'.format(query.name()))
            else:
                # Sorting on the negated similarity puts the best hit first.
                results.sort(key=lambda pair: -pair[0])
                best_hit_sim, best_hit_query = results[0]
                print('for {}, found: {:.2f} {}'.format(query.name(),
                                                        best_hit_sim,
                                                        best_hit_query.name()))
                best_hit_query_name = best_hit_query.name()

            if args.csv:
                csv.writer(args.csv).writerow(
                    [queryfile, best_hit_query_name, best_hit_sim])

        if loader.skipped_ignore:
            print('skipped/ignore: {}'.format(loader.skipped_ignore))
        if loader.skipped_nosig:
            print('skipped/nosig: {}'.format(loader.skipped_nosig))
Exemplo n.º 13
0
def load_sbts_and_sigs(filenames, query, is_similarity_query, traverse=False):
    """Load each entry of ``filenames`` as an SBT or as a signature file.

    The ksize (``query.minhash.ksize``) and molecule type of ``query`` are
    used to select signatures, and loaded signatures are passed through
    ``filter_compatible_signatures``.  Loaded trees are checked with
    ``check_tree_is_compatible`` using ``query`` and
    ``is_similarity_query``.

    Returns a list of ``(data, filename, is_sbt)`` tuples: ``data`` is the
    loaded tree when ``is_sbt`` is True, otherwise a list of signatures.

    When ``traverse`` is true and an entry is a directory, every file
    yielded by ``traverse_find_sigs`` for it is loaded as signatures and
    recorded under the directory's name; files that raise an ordinary
    exception while loading are skipped.

    Exits with status -1 when a tree is not compatible with the query, when
    an entry can be loaded neither as an SBT nor as a non-empty signature
    file, or when nothing at all was loaded.
    """
    query_ksize = query.minhash.ksize
    query_moltype = get_moltype(query)

    n_signatures = 0
    n_databases = 0
    databases = []
    for sbt_or_sigfile in filenames:
        if traverse and os.path.isdir(sbt_or_sigfile):
            for sigfile in traverse_find_sigs([sbt_or_sigfile]):
                try:
                    siglist = sig.load_signatures(sigfile,
                                                  ksize=query_ksize,
                                                  select_moltype=query_moltype)
                    # NOTE(review): the third argument is 1 here but False
                    # in the non-traverse path below - confirm this is
                    # intended.
                    siglist = filter_compatible_signatures(query, siglist, 1)
                    siglist = list(siglist)
                    databases.append((siglist, sbt_or_sigfile, False))
                    notify('loaded {} signatures from {}',
                           len(siglist),
                           sigfile,
                           end='\r')
                    n_signatures += len(siglist)
                except Exception:
                    # Best effort while traversing: skip files that fail to
                    # load.  'Exception' (rather than a bare except) lets
                    # KeyboardInterrupt and SystemExit propagate.
                    pass

            # done! jump to beginning of main 'for' loop
            continue

        # no traverse? try loading as an SBT.
        try:
            tree = SBT.load(sbt_or_sigfile, leaf_loader=SigLeaf.load)

            if not check_tree_is_compatible(sbt_or_sigfile, tree, query,
                                            is_similarity_query):
                sys.exit(-1)

            databases.append((tree, sbt_or_sigfile, True))
            notify('loaded SBT {}', sbt_or_sigfile, end='\r')
            n_databases += 1

            # done! jump to beginning of main 'for' loop
            continue
        except (ValueError, EnvironmentError):
            # not an SBT - try as a .sig
            pass

        # not a tree? try loading as a signature.
        try:
            siglist = sig.load_signatures(sbt_or_sigfile,
                                          ksize=query_ksize,
                                          select_moltype=query_moltype)
            siglist = list(siglist)
            if not siglist:  # file not found, or parse error?
                raise ValueError

            siglist = filter_compatible_signatures(query, siglist, False)
            siglist = list(siglist)

            databases.append((siglist, sbt_or_sigfile, False))
            notify('loaded {} signatures from {}',
                   len(siglist),
                   sbt_or_sigfile,
                   end='\r')
            n_signatures += len(siglist)
        except (EnvironmentError, ValueError):
            error("\nCannot open file '{}'", sbt_or_sigfile)
            sys.exit(-1)

    # Blank out whatever the last '\r'-terminated progress message left on
    # the line before printing the summary.
    notify(' ' * 79, end='\r')
    if n_signatures and n_databases:
        notify('loaded {} signatures and {} databases total.', n_signatures,
               n_databases)
    elif n_signatures:
        notify('loaded {} signatures.', n_signatures)
    elif n_databases:
        notify('loaded {} databases.', n_databases)
    else:
        # nothing was loaded at all
        sys.exit(-1)

    if databases:
        print('')

    return databases
Exemplo n.º 14
0
    def sbt_search(self, args):
        """Search an SBT with a single query signature and print the matches.

        ``args`` is the list of command-line arguments: the SBT to load and
        the query signature file, plus optional ``-k/--ksize``,
        ``--threshold``, ``--save-matches``, ``--best-only`` and the
        molecule-type options.  Matches are printed in order of decreasing
        similarity to the query; with ``--save-matches`` the matching
        signatures are also written out.

        Exits with status -1 unless exactly one signature in the query file
        matches the requested ksize and molecule type.  Raises Exception
        when both --dna and --protein are requested.
        """
        from sourmash_lib.sbt import SBT, GraphFactory
        from sourmash_lib.sbtmh import search_minhashes, SigLeaf
        from sourmash_lib.sbtmh import SearchMinHashesFindBest

        parser = argparse.ArgumentParser()
        parser.add_argument('sbt_name')
        parser.add_argument('query')
        parser.add_argument('-k', '--ksize', type=int, default=DEFAULT_K)
        parser.add_argument('--threshold', default=0.08, type=float)
        parser.add_argument('--save-matches', type=argparse.FileType('wt'))
        parser.add_argument('--best-only', action='store_true')

        sourmash_args.add_moltype_args(parser)
        args = parser.parse_args(args)

        # --protein wins and switches --dna off.
        if args.protein:
            if args.dna is True:
                raise Exception('cannot specify both --dna and --protein!')
            args.dna = False

        moltype = None
        if args.protein:
            moltype = 'protein'
        elif args.dna:
            moltype = 'dna'

        search_fn = search_minhashes
        if args.best_only:
            search_fn = SearchMinHashesFindBest().search

        tree = SBT.load(args.sbt_name, leaf_loader=SigLeaf.load)
        sl = sig.load_signatures(args.query, select_ksize=args.ksize,
                                 select_moltype=moltype)
        sl = list(sl)
        if len(sl) != 1:
            # Both halves of the error message go to stderr; previously the
            # second half was printed to stdout.
            print('When loading query from "{}",'.format(args.query),
                  file=sys.stderr)
            print('{} query signatures matching ksize and molecule type; need exactly one.'.format(len(sl)),
                  file=sys.stderr)
            sys.exit(-1)

        query = sl[0]

        query_moltype = 'UNKNOWN'
        if query.estimator.is_molecule_type('dna'):
            query_moltype = 'DNA'
        elif query.estimator.is_molecule_type('protein'):
            query_moltype = 'protein'
        query_ksize = query.estimator.ksize
        print('loaded query: {}... (k={}, {})'.format(query.name()[:30],
                                                      query_ksize,
                                                      query_moltype))

        # Pair every hit with its similarity to the query.
        results = []
        for leaf in tree.find(search_fn, query, args.threshold):
            results.append((query.similarity(leaf.data), leaf.data))

        results.sort(key=lambda x: -x[0])   # reverse sort on similarity
        # 'match' rather than 'query' so the query signature is not rebound.
        for (similarity, match) in results:
            print('{:.2f} {}'.format(similarity, match.name()))

        if args.save_matches:
            outname = args.save_matches.name
            print('saving all matches to "{}"'.format(outname))
            sig.save_signatures([m for (sim, m) in results],
                                args.save_matches)
Exemplo n.º 15
0
def index(args):
    """Build an SBT from signature files, or append to one, and save it.

    ``args`` is the list of command-line arguments: the name to save the
    SBT under followed by one or more signature files (or, with
    ``--traverse-directory``, directories to search for signatures).  With
    ``--append`` the existing SBT of that name is loaded first; otherwise a
    new tree is created using the ``--bf-size`` value.

    Exits with status -1 when the loaded signatures span more than one
    k-mer size or molecule type, or when no signature was loaded at all.
    """
    from sourmash_lib.sbt import SBT, GraphFactory
    from sourmash_lib.sbtmh import search_minhashes, SigLeaf

    cli = argparse.ArgumentParser()
    cli.add_argument('sbt_name', help='name to save SBT into')
    cli.add_argument('signatures', nargs='+',
                     help='signatures to load into SBT')
    cli.add_argument('-q', '--quiet', action='store_true',
                     help='suppress non-error output')
    cli.add_argument('-k', '--ksize', type=int, default=None,
                     help='k-mer size for which to build the SBT.')
    cli.add_argument('--traverse-directory', action='store_true',
                     help='load all signatures underneath this directory.')
    cli.add_argument('--append', action='store_true', default=False,
                     help='add signatures to an existing SBT.')
    cli.add_argument('-x', '--bf-size', type=float, default=1e5,
                     help='Bloom filter size used for internal nodes.')
    sourmash_args.add_moltype_args(cli)

    args = cli.parse_args(args)
    set_quiet(args.quiet)
    moltype = sourmash_args.calculate_moltype(args)

    if not args.append:
        tree = SBT(GraphFactory(1, args.bf_size, 4))
    else:
        tree = SBT.load(args.sbt_name, leaf_loader=SigLeaf.load)

    if args.traverse_directory:
        inp_files = list(sourmash_args.traverse_find_sigs(args.signatures))
    else:
        inp_files = list(args.signatures)

    notify('loading {} files into SBT', len(inp_files))

    n_loaded = 0
    ksizes = set()
    moltypes = set()
    for filename in inp_files:
        matching = sig.load_signatures(filename,
                                       select_ksize=args.ksize,
                                       select_moltype=moltype)

        # Add every matching signature in this file to the tree.
        for ss in matching:
            ksizes.add(ss.minhash.ksize)
            moltypes.add(sourmash_args.get_moltype(ss))
            tree.add_node(SigLeaf(ss.md5sum(), ss))
            n_loaded += 1

        # After each file, bail out if incompatible signatures were mixed.
        if len(ksizes) > 1 or len(moltypes) > 1:
            error('multiple k-mer sizes or molecule types present; fail.')
            error('specify --dna/--protein and --ksize as necessary')
            error('ksizes: {}; moltypes: {}', ", ".join(map(str, ksizes)),
                  ", ".join(moltypes))
            sys.exit(-1)

    # Nothing loaded at all is an error too.
    if n_loaded == 0:
        error('no signatures found to load into tree!? failing.')
        sys.exit(-1)

    notify('loaded {} sigs; saving SBT under "{}"', n_loaded, args.sbt_name)
    tree.save(args.sbt_name)
Exemplo n.º 16
0
def categorize(args):
    """Find the best SBT match for each query signature.

    ``args`` is the list of command-line arguments: the SBT to load and one
    or more query signatures (or, with ``--traverse-directory``, locations
    passed to ``traverse_find_sigs``).  Names found in the first column of
    the ``--load-csv`` file are removed from the set of queries.  For every
    loaded query the best hit other than the query itself is reported; with
    ``--csv`` a row ``[queryfile, best hit name, best hit similarity]`` is
    written per query.
    """
    from sourmash_lib.sbt import SBT, GraphFactory
    from sourmash_lib.sbtmh import search_minhashes, SigLeaf
    from sourmash_lib.sbtmh import SearchMinHashesFindBest

    cli = argparse.ArgumentParser()
    cli.add_argument('sbt_name', help='name of SBT to load')
    cli.add_argument('queries', nargs='+',
                     help='list of signatures to categorize')
    cli.add_argument('-k', '--ksize', type=int, default=None)
    cli.add_argument('--threshold', default=0.08, type=float)
    cli.add_argument('--traverse-directory', action="store_true")

    sourmash_args.add_moltype_args(cli)

    cli.add_argument('--csv', type=argparse.FileType('at'))
    cli.add_argument('--load-csv', default=None)

    args = cli.parse_args(args)
    moltype = sourmash_args.calculate_moltype(args)

    # Names listed in the first column of a previous CSV.
    already_names = set()
    if args.load_csv:
        with open(args.load_csv, 'rt') as fp:
            for row in csv.reader(fp):
                already_names.add(row[0])

    tree = SBT.load(args.sbt_name, leaf_loader=SigLeaf.load)

    if args.traverse_directory:
        inp_files = set(sourmash_args.traverse_find_sigs(args.queries))
    else:
        inp_files = set(args.queries) - already_names

    # Applied on both paths (redundant for the non-traverse one).
    inp_files = set(inp_files) - already_names

    notify('found {} files to query', len(inp_files))

    loader = sourmash_args.LoadSingleSignatures(inp_files, args.ksize, moltype)

    for queryfile, query, query_moltype, query_ksize in loader:
        notify('loaded query: {}... (k={}, {})',
               query.name()[:30], query_ksize, query_moltype)

        # A fresh search object is created for every query.
        search_fn = SearchMinHashesFindBest().search

        # Collect (similarity, signature) pairs, ignoring the query itself
        # (same md5sum).
        results = [(query.similarity(leaf.data), leaf.data)
                   for leaf in tree.find(search_fn, query, args.threshold)
                   if leaf.data.md5sum() != query.md5sum()]

        if not results:
            best_hit_sim = 0.0
            best_hit_query_name = ""
            notify('for {}, no match found', query.name())
        else:
            # Sorting on the negated similarity puts the best hit first.
            results.sort(key=lambda pair: -pair[0])
            best_hit_sim, best_hit_query = results[0]
            notify('for {}, found: {:.2f} {}', query.name(), best_hit_sim,
                   best_hit_query.name())
            best_hit_query_name = best_hit_query.name()

        if args.csv:
            csv.writer(args.csv).writerow(
                [queryfile, best_hit_query_name, best_hit_sim])

    if loader.skipped_ignore:
        notify('skipped/ignore: {}', loader.skipped_ignore)
    if loader.skipped_nosig:
        notify('skipped/nosig: {}', loader.skipped_nosig)
Exemplo n.º 17
0
def sbt_gather(args):
    """Greedily decompose a query signature into its best SBT matches.

    ``args`` is the list of command-line arguments to parse: the SBT name,
    the query signature file, and the optional ``-k/--ksize``,
    ``--threshold``, ``-o/--output``, ``--csv`` and ``--save-matches``
    options, plus the molecule-type options added by ``sourmash_args``.

    Each round searches the tree with the current query, takes the most
    similar leaf, records it, and removes that leaf's hashes from the
    query.  The loop ends when the tree yields no results or when the best
    leaf shares fewer than 5 hashes with the query.

    Exits with status -1 when the query has ``max_hash == 0`` or when a
    best match has neither ``max_hash`` nor an ``hll``; exits with status 0
    when no match was recorded.
    """
    from sourmash_lib.sbt import SBT
    from sourmash_lib.sbtmh import SigLeaf
    from sourmash_lib.sbtmh import SearchMinHashesFindBestIgnoreMaxHash

    parser = argparse.ArgumentParser()
    parser.add_argument('sbt_name', help='name of SBT to search')
    parser.add_argument('query', help='query signature')
    parser.add_argument('-k', '--ksize', type=int, default=DEFAULT_K)
    # NOTE: --threshold is accepted but not consulted below; the tree search
    # always uses the fixed, very low threshold passed to tree.find().
    parser.add_argument('--threshold', default=0.05, type=float)
    parser.add_argument('-o', '--output', type=argparse.FileType('wt'))
    parser.add_argument('--csv', type=argparse.FileType('wt'))
    parser.add_argument('--save-matches', type=argparse.FileType('wt'))

    sourmash_args.add_moltype_args(parser)

    args = parser.parse_args(args)
    moltype = sourmash_args.calculate_moltype(args)

    tree = SBT.load(args.sbt_name, leaf_loader=SigLeaf.load)
    query = sourmash_args.load_query_signature(args.query,
                                               select_ksize=args.ksize,
                                               select_moltype=moltype)
    query_moltype = sourmash_args.get_moltype(query)
    query_ksize = query.estimator.ksize
    notify('loaded query: {}... (k={}, {})',
           query.name()[:30], query_ksize, query_moltype)

    if query.estimator.max_hash == 0:
        error('query signature needs to be created with --scaled')
        error('or using --with-cardinality.')
        sys.exit(-1)

    notify('query signature has max_hash: {}', query.estimator.max_hash)
    orig_query = query

    R_metagenome = 2**64 / float(orig_query.estimator.max_hash)

    # The original query never changes, so count its hashes once; this is
    # the denominator for every per-round "fraction of query" value.
    query_n_mins = len(orig_query.estimator.get_hashes())

    # Rebuild the query as a plain estimator sized to hold all of its hashes.
    new_mins = query.estimator.get_hashes()
    e = sourmash_lib.Estimators(ksize=args.ksize, n=len(new_mins))
    e.update(query.estimator)
    query = sig.SourmashSignature('', e)

    sum_found = 0.
    found = []
    while True:
        # NOTE(review): a new search object is built every round, as in the
        # original code; presumably it carries per-search state - confirm
        # before hoisting it out of the loop.
        search_fn = SearchMinHashesFindBestIgnoreMaxHash().search

        # use super low threshold for this part of the search
        results = [
            (query.estimator.similarity_ignore_maxhash(leaf.data.estimator),
             leaf.data)
            for leaf in tree.find(search_fn, query, 0.00001)
        ]

        if not results:  # no matches at all!
            break

        # take the best result
        results.sort(key=lambda x: -x[0])  # reverse sort on similarity
        best_sim, best_ss = results[0]

        # subtract found hashes from search hashes, construct new search
        new_mins = set(query.estimator.get_hashes())
        found_mins = best_ss.estimator.get_hashes()

        if best_ss.estimator.max_hash:
            R_genome = 2**64 / float(best_ss.estimator.max_hash)
        elif best_ss.estimator.hll:
            genome_size = best_ss.estimator.hll.estimate_cardinality()
            genome_max_hash = max(found_mins)
            R_genome = float(genome_size) / float(genome_max_hash)
        else:
            error('Best hash match in sbt_gather has no cardinality')
            error('Please prepare database of sequences with --scaled')
            error('...or with --with-cardinality')
            sys.exit(-1)

        # Compare both sketches at the coarser of the two resolutions by
        # discarding every hash at or above the resulting max hash.
        R_comparison = max(R_metagenome, R_genome)
        new_max_hash = 2**64 / float(R_comparison)
        new_mins = {i for i in new_mins if i < new_max_hash}
        found_mins = {i for i in found_mins if i < new_max_hash}

        # intersection:
        intersect_mins = new_mins.intersection(found_mins)

        if len(intersect_mins) < 5:  # hard cutoff for now
            notify('found only {} hashes in common.', len(intersect_mins))
            notify('this is below a sane threshold => exiting.')
            break

        # first denominator - genome size
        genome_n_mins = len(found_mins)
        f_genome = len(intersect_mins) / float(genome_n_mins)

        # second denominator - metagenome size
        f_query = len(intersect_mins) / float(query_n_mins)

        # print interim & save
        notify('found: {:.2f} {:.2f} {}', f_genome, f_query, best_ss.name())
        found.append((f_genome, best_ss))

        # Matched hashes are removed from the query below, so the per-round
        # intersections never overlap and their fractions of the original
        # query can simply be added up.  (Previously sum_found was never
        # updated and the total was always reported as 0.)
        sum_found += f_query

        new_mins -= found_mins
        e = sourmash_lib.Estimators(ksize=args.ksize, n=len(new_mins))
        e.add_many(new_mins)
        query = sig.SourmashSignature('', e)

    notify('found {}, total fraction {:.3f}', len(found), sum_found)
    notify('')

    if not found:
        sys.exit(0)

    # largest genome fraction first
    found.sort(key=lambda x: x[0])
    found.reverse()

    notify('Composition:')
    for (frac, leaf_sketch) in found:
        notify('{:.2f} {}', frac, leaf_sketch.name())

    if args.output:
        print('Composition:', file=args.output)
        for (frac, leaf_sketch) in found:
            print('{:.2f} {}'.format(frac, leaf_sketch.name()),
                  file=args.output)

    if args.csv:
        fieldnames = ['fraction', 'name', 'sketch_kmers']
        w = csv.DictWriter(args.csv, fieldnames=fieldnames)

        w.writeheader()
        for (frac, leaf_sketch) in found:
            # Matches accepted through their max_hash may carry no hll; the
            # column is then left empty instead of crashing on None.
            hll = leaf_sketch.estimator.hll
            cardinality = hll.estimate_cardinality() if hll else None
            w.writerow(
                dict(fraction=frac,
                     name=leaf_sketch.name(),
                     sketch_kmers=cardinality))
    if args.save_matches:
        outname = args.save_matches.name
        notify('saving all matches to "{}"', outname)
        sig.save_signatures([ss for (f, ss) in found], args.save_matches)
Exemplo n.º 18
0
def _tally_direction(names, matches, contained, ident_threshold,
                     contain_threshold):
    """Count matches, containments and identities for one direction.

    ``names`` is iterated for query names.  ``matches`` maps a query name to
    a ``(best_match, similarity)`` pair and ``contained`` maps a query name
    to a containment value; names missing from either are treated as having
    no match / containment 0.0.

    Returns ``(counts, identical_names)`` where ``counts`` is a dict with
    the keys ``ident``, ``match``, ``contain``, ``no_match`` and
    ``no_contain``, and ``identical_names`` is a list of
    ``(query_name, best_match)`` pairs.
    """
    counts = dict(ident=0, match=0, contain=0, no_match=0, no_contain=0)
    identical_names = []

    for query_name in names:
        best_match = None
        similarity = 0.0
        cont = 0.0

        if query_name in matches:
            (best_match, similarity) = matches[query_name]
        if query_name in contained:
            cont = contained[query_name]

        counts['match' if best_match else 'no_match'] += 1
        counts['contain' if cont >= contain_threshold else 'no_contain'] += 1

        # The printed contract is "similarity must be >=" the threshold, so
        # a similarity exactly at the threshold counts as identical (the
        # earlier strict ">" contradicted that message).
        if similarity >= ident_threshold:
            identical_names.append((query_name, best_match))
            counts['ident'] += 1

    return counts, identical_names


def _print_direction(query_dir, target_dir, n_signatures, counts,
                     identical_names):
    """Print the summary block for one direction of the comparison."""
    print('----')
    print('{} vs {}: {} signatures'.format(query_dir, target_dir,
                                           n_signatures))
    print('identical count:', counts['ident'])
    print('containment count:', counts['contain'])
    print('matches:', counts['match'])

    print('no match:', counts['no_match'])
    print('no contain:', counts['no_contain'])

    print('identical:')
    for (k, v) in identical_names:
        print("{} = {}".format(k, v))


def main():
    """Compare two signature collections against each other's SBTs.

    Reads ``dir1 sbt1 dir2 sbt2`` and ``-k/--ksize`` from the process
    command line, loads the signatures of each directory and both trees,
    finds matches of each collection in the other's tree, computes
    containment, and prints summary statistics for both directions.
    """
    p = argparse.ArgumentParser()
    p.add_argument('dir1')
    p.add_argument('sbt1')
    p.add_argument('dir2')
    p.add_argument('sbt2')
    p.add_argument('-k', '--ksize', type=int, default=31)
    args = p.parse_args()

    print('loading all signatures:', args.dir1)
    sigdict1 = load_all_signatures(args.dir1, args.ksize)
    tree1 = SBT.load(args.sbt1, leaf_loader=SigLeaf.load)
    print('...loaded {} signatures at k={}'.format(len(sigdict1), args.ksize))

    print('loading all signatures:', args.dir2)
    sigdict2 = load_all_signatures(args.dir2, args.ksize)
    tree2 = SBT.load(args.sbt2, leaf_loader=SigLeaf.load)
    print('...loaded {} signatures at k={}'.format(len(sigdict2), args.ksize))

    # first, find all matches in 2 for 1, and 1 for 2
    THRESHOLD = 0.05
    matches_1_in_2 = make_all_matches(sigdict1, tree2, THRESHOLD)
    matches_2_in_1 = make_all_matches(sigdict2, tree1, THRESHOLD)

    # now, do containment
    contained_1_in_2 = containment(matches_1_in_2, sigdict1, sigdict2)
    contained_2_in_1 = containment(matches_2_in_1, sigdict2, sigdict1)

    # summary stats
    CONTAIN_THRESHOLD = 0.95
    IDENT_THRESHOLD = 0.80

    print('thresholds:')
    print('min Jaccard similarity for any match:', THRESHOLD)
    print('to score as identical, similarity must be >=', IDENT_THRESHOLD)
    print('to score as contained, containment must be >=', CONTAIN_THRESHOLD)

    # 1 in 2
    counts, identical_names = _tally_direction(
        sigdict1, matches_1_in_2, contained_1_in_2,
        IDENT_THRESHOLD, CONTAIN_THRESHOLD)
    _print_direction(args.dir1, args.dir2, len(sigdict1), counts,
                     identical_names)

    # 2 in 1
    counts, identical_names = _tally_direction(
        sigdict2, matches_2_in_1, contained_2_in_1,
        IDENT_THRESHOLD, CONTAIN_THRESHOLD)
    _print_direction(args.dir2, args.dir1, len(sigdict2), counts,
                     identical_names)
Exemplo n.º 19
0
    def sbt_gather(self, args):
        """Iteratively pull the best SBT matches out of a query signature.

        ``args`` is the list of command-line arguments to parse: the SBT
        name, the query signature file, and the optional ``-k/--ksize``,
        ``--threshold``, ``-o/--output`` and ``--csv`` options, plus the
        molecule-type options added by ``sourmash_args``.

        Each round searches the tree with the current query, takes the most
        similar leaf, records it, and removes that leaf's mins from the
        query.  The loop ends when the tree yields no results, or when the
        very first best match is below ``--threshold`` against the original
        query.

        Raises ``Exception`` when both ``--dna`` and ``--protein`` are set.
        Exits with status -1 unless exactly one query signature matches the
        requested ksize and molecule type, and with status 0 when nothing
        was found.
        """
        from sourmash_lib.sbt import SBT
        from sourmash_lib.sbtmh import SigLeaf
        from sourmash_lib.sbtmh import SearchMinHashesFindBest

        parser = argparse.ArgumentParser()
        parser.add_argument('sbt_name')
        parser.add_argument('query')
        parser.add_argument('-k', '--ksize', type=int, default=DEFAULT_K)
        parser.add_argument('--threshold', default=0.05, type=float)
        parser.add_argument('-o', '--output', type=argparse.FileType('wt'))
        parser.add_argument('--csv', type=argparse.FileType('wt'))

        sourmash_args.add_moltype_args(parser)

        args = parser.parse_args(args)

        if args.protein:
            if args.dna is True:
                raise Exception('cannot specify both --dna and --protein!')
            args.dna = False

        moltype = None
        if args.protein:
            moltype = 'protein'
        elif args.dna:
            moltype = 'dna'

        # The tree is loaded exactly once (it used to be loaded a second
        # time, identically, right before the search loop).
        tree = SBT.load(args.sbt_name, leaf_loader=SigLeaf.load)
        sl = sig.load_signatures(args.query, select_ksize=args.ksize,
                                 select_moltype=moltype)
        sl = list(sl)
        if len(sl) != 1:
            # both halves of the message belong on stderr
            print('When loading query from "{}",'.format(args.query),
                  file=sys.stderr)
            print('{} query signatures matching ksize and molecule type; need exactly one.'.format(len(sl)),
                  file=sys.stderr)
            sys.exit(-1)

        query = sl[0]

        query_moltype = 'UNKNOWN'
        if query.estimator.is_molecule_type('dna'):
            query_moltype = 'DNA'
        elif query.estimator.is_molecule_type('protein'):
            query_moltype = 'protein'
        query_ksize = query.estimator.ksize
        print('loaded query: {}... (k={}, {})'.format(query.name()[:30],
                                                      query_ksize,
                                                      query_moltype))

        orig_query = query

        # Cardinality of the full query: constant across rounds, and only
        # available when the query carries an hll.  0 means "unknown".
        query_hll = orig_query.estimator.hll
        query_kmers = query_hll.estimate_cardinality() if query_hll else 0

        sum_found = 0.
        found = []
        while True:
            # NOTE(review): a new search object is built every round, as in
            # the original code; presumably it carries per-search state -
            # confirm before hoisting it out of the loop.
            search_fn = SearchMinHashesFindBest().search

            # use super low threshold for this part of the search
            results = [(query.similarity(leaf.data), leaf.data)
                       for leaf in tree.find(search_fn, query, 0.00001)]

            if not results:          # no matches at all!
                break

            # take the best result
            results.sort(key=lambda x: -x[0])   # reverse sort on similarity
            best_sim, best_ss = results[0]
            sim = best_ss.similarity(orig_query)

            # adjust by size of leaf (kmer cardinality of original genome);
            # needs cardinalities on both sides and a non-zero denominator.
            leaf_hll = best_ss.estimator.hll
            if leaf_hll and query_kmers:
                leaf_kmers = leaf_hll.estimate_cardinality()
                # float() keeps this a true division on Python 2 as well
                f_of_total = float(leaf_kmers) / query_kmers * sim
            else:
                f_of_total = 0

            if not found and sim < args.threshold:
                print('best match: {}'.format(best_ss.name()))
                print('similarity is {:.5f} of db signature;'.format(sim))
                print('this is below specified threshold => exiting.')
                break

            # subtract found hashes from search hashes, construct new search
            new_mins = set(query.estimator.mh.get_mins())
            found_mins = best_ss.estimator.mh.get_mins()

            # print interim & save
            print('found: {:.2f} {} {}'.format(f_of_total,
                                               len(new_mins),
                                               best_ss.name()))
            found.append((f_of_total, best_ss, sim))
            sum_found += f_of_total

            new_mins -= set(found_mins)
            e = sourmash_lib.Estimators(ksize=args.ksize, n=len(new_mins))
            for m in new_mins:
                e.mh.add_hash(m)
            query = sig.SourmashSignature('foo', e)

        print('found {}, total fraction {:.3f}'.format(len(found), sum_found))
        print('')

        if not found:
            sys.exit(0)

        # Sort on the fraction only.  Sorting the whole tuples would fall
        # through to comparing signature objects whenever two fractions tie
        # (always the case when no hll is available and every fraction is 0).
        found.sort(key=lambda x: x[0])
        found.reverse()

        print('Composition:')
        for (frac, leaf_sketch, sim) in found:
            print('{:.2f} {}'.format(frac, leaf_sketch.name()))

        if args.output:
            print('Composition:', file=args.output)
            for (frac, leaf_sketch, sim) in found:
                print('{:.2f} {}'.format(frac, leaf_sketch.name()),
                      file=args.output)

        if args.csv:
            fieldnames = ['fraction', 'name', 'similarity', 'sketch_kmers']
            w = csv.DictWriter(args.csv, fieldnames=fieldnames)

            w.writeheader()
            for (frac, leaf_sketch, sim) in found:
                # matches without an hll get an empty sketch_kmers column
                hll = leaf_sketch.estimator.hll
                cardinality = hll.estimate_cardinality() if hll else None
                w.writerow(dict(fraction=frac, name=leaf_sketch.name(),
                                similarity=sim,
                                sketch_kmers=cardinality))
Exemplo n.º 20
0
    def watch(self, args):
        """Build a signature from raw FASTA/FASTQ coming in on stdin, search.

        ``args`` is the list of command-line arguments to parse.  Sequences
        are read from ``/dev/stdin`` and added to a streaming signature;
        every ``WATERMARK_SIZE`` records the SBT is searched and reading
        stops as soon as a search returns any result.  A final search is
        run after reading ends and the best hit (if any) is reported.  With
        ``-o/--output`` the streamed signature is saved.

        Exits with status -1 when both DNA and protein hashing are
        requested.
        """
        from sourmash_lib.sbt import SBT
        from sourmash_lib.sbtmh import SigLeaf
        from sourmash_lib.sbtmh import SearchMinHashesFindBest

        parser = argparse.ArgumentParser()
        parser.add_argument('sbt_name')
        parser.add_argument('-o', '--output', type=argparse.FileType('wt'))
        parser.add_argument('-k', '--ksize', type=int, default=DEFAULT_K)
        parser.add_argument('--threshold', default=0.05, type=float)
        parser.add_argument('--input-is-protein', action='store_true')
        sourmash_args.add_moltype_args(parser, default_dna=True)
        parser.add_argument('-n', '--num-hashes', type=int,
                            default=DEFAULT_N,
                            help='number of hashes to use in each sketch (default: %(default)i)')
        parser.add_argument('--name', type=str, default='stdin')
        args = parser.parse_args(args)

        if args.input_is_protein and args.dna:
            print('WARNING: input is protein, turning off DNA hash computing.',
                  file=sys.stderr)
            args.dna = False
            args.protein = True

        if args.dna and args.protein:
            # This is reported as an error, so stop here instead of silently
            # carrying on with DNA hashing.
            notify('ERROR: cannot use "watch" with both DNA and protein.')
            sys.exit(-1)

        if args.dna:
            moltype = 'DNA'
            is_protein = False
        else:
            moltype = 'protein'
            is_protein = True

        E = sourmash_lib.Estimators(ksize=args.ksize, n=args.num_hashes,
                                    protein=is_protein)
        streamsig = sig.SourmashSignature('', E, filename='stdin',
                                          name=args.name)

        notify('Computing signature for k={}, {} from stdin',
               args.ksize, moltype)

        tree = SBT.load(args.sbt_name, leaf_loader=SigLeaf.load)

        def do_search():
            """Search the tree with the signature built so far.

            Returns a list of (similarity, signature) pairs.
            """
            # a fresh search object is created for every search, as before
            search_fn = SearchMinHashesFindBest().search

            return [(streamsig.similarity(leaf.data), leaf.data)
                    for leaf in tree.find(search_fn, streamsig,
                                          args.threshold)]

        notify('reading sequences from stdin')
        screed_iter = screed.open('/dev/stdin')
        watermark = WATERMARK_SIZE

        # iterate over input records; n_read counts the records actually
        # added to the sketch, so the final report is not off by one the way
        # the last enumerate() index was.
        n_read = 0
        for record in screed_iter:
            # at each watermark, print status & check cardinality
            if n_read >= watermark:
                notify('... read {} sequences', n_read)
                watermark += WATERMARK_SIZE

                if do_search():
                    break

            if args.input_is_protein:
                E.mh.add_protein(record.sequence)
            else:
                E.add_sequence(record.sequence, False)
            n_read += 1

        results = do_search()
        if not results:
            notify('... read {} sequences, no matches found.', n_read)
        else:
            results.sort(key=lambda x: -x[0])   # take best
            similarity, found_sig = results[0]
            notify('FOUND: {}, at {:.3f}', found_sig.name(),
                   similarity)

        if args.output:
            sig.save_signatures([streamsig], args.output)
Exemplo n.º 21
0
def watch(args):
    """Build a signature from raw FASTA/FASTQ coming in on stdin, search.

    ``args`` is the list of command-line arguments to parse.  Sequences are
    read from ``inp_file`` (``/dev/stdin`` by default) and added to a
    streaming signature; every ``WATERMARK_SIZE`` records the SBT is
    searched and reading stops as soon as a search returns any result.  A
    final search is run after reading ends and the best hit (if any) is
    printed.  With ``-o/--output`` the streamed signature is saved.

    When no ksize is given, it is taken from the first ``SigLeaf`` found
    among the tree's nodes.

    Exits with status -1 when both DNA and protein hashing are requested.
    """
    # sys is not referenced elsewhere in this function, so import it locally
    import sys

    from sourmash_lib.sbt import SBT
    from sourmash_lib.sbtmh import SigLeaf
    from sourmash_lib.sbtmh import SearchMinHashesFindBest

    parser = argparse.ArgumentParser()
    parser.add_argument('sbt_name', help='name of SBT to search')
    parser.add_argument('inp_file', nargs='?', default='/dev/stdin')
    parser.add_argument('-q',
                        '--quiet',
                        action='store_true',
                        help='suppress non-error output')
    parser.add_argument('-o',
                        '--output',
                        type=argparse.FileType('wt'),
                        help='save signature generated from data here')
    parser.add_argument('--threshold',
                        default=0.05,
                        type=float,
                        help='minimum threshold for matches')
    parser.add_argument(
        '--input-is-protein',
        action='store_true',
        help='Consume protein sequences - no translation needed')
    sourmash_args.add_construct_moltype_args(parser)
    parser.add_argument(
        '-n',
        '--num-hashes',
        type=int,
        default=DEFAULT_N,
        help='number of hashes to use in each sketch (default: %(default)i)')
    parser.add_argument('--name',
                        type=str,
                        default='stdin',
                        help='name to use for generated signature')
    sourmash_args.add_ksize_arg(parser, DEFAULT_LOAD_K)
    args = parser.parse_args(args)
    set_quiet(args.quiet)

    if args.input_is_protein and args.dna:
        notify('WARNING: input is protein, turning off DNA hashing.')
        args.dna = False
        args.protein = True

    if args.dna and args.protein:
        # This is reported as an error, so stop here instead of silently
        # carrying on with DNA hashing.
        notify('ERROR: cannot use "watch" with both DNA and protein.')
        sys.exit(-1)

    if args.dna:
        moltype = 'DNA'
        is_protein = False
    else:
        moltype = 'protein'
        is_protein = True

    tree = SBT.load(args.sbt_name, leaf_loader=SigLeaf.load)

    def get_ksize(tree):
        """Walk nodes in `tree` to find out ksize.

        Returns the ksize of the first SigLeaf encountered, or None
        (implicitly) when the tree holds no SigLeaf.
        """
        for node in tree.nodes.values():
            if isinstance(node, sourmash_lib.sbtmh.SigLeaf):
                return node.data.minhash.ksize

    # deduce ksize from the SBT we are loading
    ksize = args.ksize
    if ksize is None:
        ksize = get_ksize(tree)

    E = sourmash_lib.MinHash(ksize=ksize,
                             n=args.num_hashes,
                             is_protein=is_protein)
    streamsig = sig.SourmashSignature('', E, filename='stdin', name=args.name)

    notify('Computing signature for k={}, {} from stdin', ksize, moltype)

    def do_search():
        """Search the tree with the signature built so far.

        Returns a list of (similarity, signature) pairs.
        """
        # a fresh search object is created for every search, as before
        search_fn = SearchMinHashesFindBest().search

        return [(streamsig.similarity(leaf.data), leaf.data)
                for leaf in tree.find(search_fn, streamsig, args.threshold)]

    notify('reading sequences from stdin')
    screed_iter = screed.open(args.inp_file)
    watermark = WATERMARK_SIZE

    # iterate over input records; n_read counts the records actually added
    # to the sketch, so the final report is not off by one the way the last
    # enumerate() index was.
    n_read = 0
    for record in screed_iter:
        # at each watermark, print status & check cardinality
        if n_read >= watermark:
            notify('\r... read {} sequences', n_read, end='')
            watermark += WATERMARK_SIZE

            if do_search():
                break

        if args.input_is_protein:
            E.add_protein(record.sequence)
        else:
            E.add_sequence(record.sequence, False)
        n_read += 1

    results = do_search()
    if not results:
        notify('... read {} sequences, no matches found.', n_read)
    else:
        results.sort(key=lambda x: -x[0])  # take best
        similarity, found_sig = results[0]
        print_results('FOUND: {}, at {:.3f}', found_sig.name(), similarity)

    if args.output:
        notify('saving signature to {}', args.output.name)
        sig.save_signatures([streamsig], args.output)