示例#1
0
def sbt_combine(args):
    """Merge several saved SBTs into one tree and save the result.

    ``args`` is the command-line argument list: the name to save the
    combined SBT under, followed by one or more SBTs to combine.
    """
    from sourmash_lib.sbt import SBT, GraphFactory
    from sourmash_lib.sbtmh import SigLeaf

    cli = argparse.ArgumentParser()
    cli.add_argument('sbt_name', help='name to save SBT into')
    cli.add_argument('sbts', nargs='+', help='SBTs to combine to a new SBT')
    cli.add_argument('-x', '--bf-size', type=float, default=1e5)
    sourmash_args.add_moltype_args(cli)

    args = cli.parse_args(args)
    # The value is not used below; the call itself is kept unchanged.
    moltype = sourmash_args.calculate_moltype(args)

    sbt_paths = list(args.sbts)
    notify('combining {} SBTs', len(sbt_paths))

    # The first tree is the merge target; every other tree is folded into it.
    combined = SBT.load(sbt_paths[0], leaf_loader=SigLeaf.load)
    for path in sbt_paths[1:]:
        other = SBT.load(path, leaf_loader=SigLeaf.load)
        # TODO: check if parameters are the same for both trees!
        combined.combine(other)

    notify('saving SBT under "{}".', args.sbt_name)
    combined.save(args.sbt_name)
示例#2
0
def test_binary_nary_tree():
    """Trees with 2, 5 and 10 children per node must return the same hits."""
    factory = GraphFactory(31, 1e5, 4)
    trees = {
        2: SBT(factory),
        5: SBT(factory, d=5),
        10: SBT(factory, d=10),
    }

    n_leaves = 0
    for filename in utils.SIG_FILES:
        sigpath = utils.get_test_data(filename)
        sig = next(signature.load_signatures(sigpath))
        leaf = SigLeaf(os.path.basename(filename), sig)
        # the same leaf object is added to every tree
        for sbt in trees.values():
            sbt.add_node(leaf)
        # the last leaf added ends up being the query
        to_search = leaf
        n_leaves += 1

    assert all([len(sbt.leaves()) == n_leaves for sbt in trees.values()])

    print('*' * 60)
    print("{}:".format(to_search.metadata))
    results = {
        arity: {
            str(found)
            for found in sbt.find(search_minhashes, to_search.data, 0.1)
        }
        for arity, sbt in trees.items()
    }
    print(*results[2], sep='\n')

    assert results[2] == results[5]
    assert results[5] == results[10]
示例#3
0
def test_tree_save_load(n_children):
    """Search results must be identical before and after a save/load cycle."""
    factory = GraphFactory(31, 1e5, 4)
    tree = SBT(factory, d=n_children)

    for filename in utils.SIG_FILES:
        sigpath = utils.get_test_data(filename)
        sig = next(signature.load_signatures(sigpath))
        leaf = SigLeaf(os.path.basename(filename), sig)
        tree.add_node(leaf)
        # the last leaf added ends up being the query
        to_search = leaf

    def search_and_report(sbt):
        # Print a banner and the query, then search ``sbt`` and print the hits.
        print('*' * 60)
        print("{}:".format(to_search.metadata))
        matches = {
            str(found)
            for found in sbt.find(search_minhashes, to_search.data, 0.1)
        }
        print(*matches, sep='\n')
        return matches

    old_result = search_and_report(tree)

    with utils.TempDirectory() as location:
        saved_at = os.path.join(location, 'demo')
        tree.save(saved_at)
        reloaded = SBT.load(saved_at, leaf_loader=SigLeaf.load)

        new_result = search_and_report(reloaded)

        assert old_result == new_result
示例#4
0
def test_sbt_fsstorage():
    """Saving through FSStorage must keep search results and leave files on disk."""
    factory = GraphFactory(31, 1e5, 4)
    with utils.TempDirectory() as location:
        tree = SBT(factory)

        for filename in utils.SIG_FILES:
            sigpath = utils.get_test_data(filename)
            sig = next(signature.load_signatures(sigpath))
            leaf = SigLeaf(os.path.basename(filename), sig)
            tree.add_node(leaf)
            # the last leaf added ends up being the query
            to_search = leaf

        def search_and_report(sbt):
            # Print a banner and the query, then search ``sbt`` and print hits.
            print('*' * 60)
            print("{}:".format(to_search.metadata))
            matches = {
                str(found)
                for found in sbt.find(search_minhashes, to_search.data, 0.1)
            }
            print(*matches, sep='\n')
            return matches

        old_result = search_and_report(tree)

        tree_path = os.path.join(location, 'tree')
        storage_dir = os.path.join(location, '.fstree')
        with FSStorage(storage_dir) as storage:
            tree.save(tree_path, storage=storage)

        reloaded = SBT.load(tree_path, leaf_loader=SigLeaf.load)
        new_result = search_and_report(reloaded)

        assert old_result == new_result

        # both the reloaded tree's storage path and the storage dir must exist
        assert os.path.exists(os.path.join(location, reloaded.storage.path))
        assert os.path.exists(storage_dir)
示例#5
0
def load_sbts_and_sigs(filenames, query_ksize, query_moltype):
    """Load each file as an SBT, or failing that as a signature file.

    Returns a list of ``(database, filename, is_sbt)`` tuples, where
    ``database`` is the loaded tree or a list of signatures.

    Exits the process with status -1 when a tree's ksize differs from
    ``query_ksize``, or when loading signatures raises EnvironmentError.
    """
    loaded = []
    for path in filenames:
        try:
            sbt = SBT.load(path, leaf_loader=SigLeaf.load)
            tree_ksize = get_ksize(sbt)
            if tree_ksize != query_ksize:
                error("ksize on tree '{}' is {};", path, tree_ksize)
                error('this is different from query ksize of {}.', query_ksize)
                sys.exit(-1)

            loaded.append((sbt, path, True))
            notify('loaded SBT {}', path)
            continue
        except (ValueError, EnvironmentError):
            # not an SBT - fall through and try it as a .sig
            pass

        try:
            sigs = list(sig.load_signatures(path,
                                            select_ksize=query_ksize,
                                            select_moltype=query_moltype))
            loaded.append((sigs, path, False))
            notify('loaded {} signatures from {}', len(sigs), path)
        except EnvironmentError:
            error("file '{}' does not exist", path)
            sys.exit(-1)

    return loaded
示例#6
0
def test_tree_repair():
    """The trees in leaves.sbt.json and v3.sbt.json must give the same four hits."""

    def load_tree(filename):
        return SBT.load(utils.get_test_data(filename),
                        leaf_loader=SigLeaf.load)

    def hits(sbt):
        return {
            str(found)
            for found in sbt.find(search_minhashes, to_search, 0.1)
        }

    tree_repair = load_tree('leaves.sbt.json')
    tree_cur = load_tree('v3.sbt.json')

    # the query is the first signature of the first test file
    query_path = utils.get_test_data(utils.SIG_FILES[0])
    to_search = next(signature.load_signatures(query_path))

    results_repair = hits(tree_repair)
    results_cur = hits(tree_cur)

    assert results_repair == results_cur
    assert len(results_repair) == 4
示例#7
0
def load_sbts_and_sigs(filenames, query_ksize, query_moltype, traverse=False):
    """Load SBTs and/or signature files to search against.

    Each entry of ``filenames`` is handled as follows:

    * if ``traverse`` is set and the entry is a directory, every file
      yielded by ``traverse_find_sigs`` is loaded as signatures; files
      that fail to load are skipped;
    * otherwise the entry is loaded as an SBT, and if that raises
      ValueError or EnvironmentError it is loaded as a signature file.

    Exits the process with status -1 when a tree's ksize differs from
    ``query_ksize``, or when the signature fallback raises
    EnvironmentError.

    Returns a list of ``(database, filename, is_sbt)`` tuples, where
    ``database`` is the loaded tree or a list of signatures.
    """
    n_signatures = 0
    n_databases = 0
    databases = []
    for sbt_or_sigfile in filenames:
        if traverse and os.path.isdir(sbt_or_sigfile):
            for sigfile in traverse_find_sigs([sbt_or_sigfile]):
                try:
                    siglist = list(
                        sig.load_signatures(sigfile,
                                            ksize=query_ksize,
                                            select_moltype=query_moltype))
                    # the directory, not the individual file, is recorded
                    databases.append((siglist, sbt_or_sigfile, False))
                    notify('loaded {} signatures from {}',
                           len(siglist),
                           sigfile,
                           end='\r')
                    n_signatures += len(siglist)
                except Exception:
                    # Best effort: skip files that fail to load while
                    # traversing.  ``Exception`` (rather than a bare
                    # ``except``) lets KeyboardInterrupt and SystemExit
                    # propagate.
                    continue
            continue
        try:
            tree = SBT.load(sbt_or_sigfile, leaf_loader=SigLeaf.load)
            ksize = get_ksize(tree)
            if ksize != query_ksize:
                error("ksize on tree '{}' is {};", sbt_or_sigfile, ksize)
                error('this is different from query ksize of {}.', query_ksize)
                sys.exit(-1)

            databases.append((tree, sbt_or_sigfile, True))
            notify('loaded SBT {}', sbt_or_sigfile, end='\r')
            n_databases += 1
        except (ValueError, EnvironmentError):
            # not an SBT - try as a .sig

            try:
                siglist = list(
                    sig.load_signatures(sbt_or_sigfile,
                                        ksize=query_ksize,
                                        select_moltype=query_moltype))
                databases.append((siglist, sbt_or_sigfile, False))
                notify('loaded {} signatures from {}',
                       len(siglist),
                       sbt_or_sigfile,
                       end='\r')
                n_signatures += len(siglist)
            except EnvironmentError:
                error("\nfile '{}' does not exist", sbt_or_sigfile)
                sys.exit(-1)

    # blank out the last '\r'-terminated progress line before the summary
    notify(' ' * 79, end='\r')
    notify('loaded {} signatures and {} databases total.',
           n_signatures, n_databases)

    if databases:
        print('')

    return databases
示例#8
0
def test_tree_v1_load():
    """The trees in v1.sbt.json and v2.sbt.json must give the same four hits."""

    def load_tree(filename):
        return SBT.load(utils.get_test_data(filename),
                        leaf_loader=SigLeaf.load)

    def hits(sbt):
        return {
            str(found)
            for found in sbt.find(search_minhashes, to_search, 0.1)
        }

    tree_v1 = load_tree('v1.sbt.json')
    tree_v2 = load_tree('v2.sbt.json')

    # the query is the first signature of the first test file
    query_path = utils.get_test_data(utils.SIG_FILES[0])
    to_search = next(signature.load_signatures(query_path))

    results_v1 = hits(tree_v1)
    results_v2 = hits(tree_v2)

    assert results_v1 == results_v2
    assert len(results_v1) == 4
示例#9
0
def sbt_index(args):
    """Build an SBT from signature files and save it.

    ``args`` is the command-line argument list: the name to save the SBT
    under, followed by one or more signature files (or directories, with
    ``--traverse-directory``).

    Exits the process with status -1 if the loaded signatures have more
    than one k-mer size or molecule type, or if nothing was loaded.
    """
    from sourmash_lib.sbt import SBT, GraphFactory
    from sourmash_lib.sbtmh import search_minhashes, SigLeaf

    cli = argparse.ArgumentParser()
    cli.add_argument('sbt_name', help='name to save SBT into')
    cli.add_argument('signatures',
                     nargs='+',
                     help='signatures to load into SBT')
    cli.add_argument('-k', '--ksize', type=int, default=None)
    cli.add_argument('--traverse-directory', action='store_true')
    cli.add_argument('-x', '--bf-size', type=float, default=1e5)
    sourmash_args.add_moltype_args(cli)

    args = cli.parse_args(args)
    moltype = sourmash_args.calculate_moltype(args)

    tree = SBT(GraphFactory(1, args.bf_size, 4))

    if args.traverse_directory:
        found = sourmash_args.traverse_find_sigs(args.signatures)
    else:
        found = args.signatures
    inp_files = list(found)

    notify('loading {} files into SBT', len(inp_files))

    n_loaded = 0
    seen_ksizes = set()
    seen_moltypes = set()
    for path in inp_files:
        matching = sig.load_signatures(path,
                                       select_ksize=args.ksize,
                                       select_moltype=moltype)

        # load all matching signatures in this file
        for ss in matching:
            seen_ksizes.add(ss.estimator.ksize)
            seen_moltypes.add(sourmash_args.get_moltype(ss))

            tree.add_node(SigLeaf(ss.md5sum(), ss))
            n_loaded += 1

        # after each file, make sure nothing incompatible was loaded
        if len(seen_ksizes) <= 1 and len(seen_moltypes) <= 1:
            continue

        error('multiple k-mer sizes or molecule types present; fail.')
        error('specify --dna/--protein and --ksize as necessary')
        error('ksizes: {}; moltypes: {}', ", ".join(map(str, seen_ksizes)),
              ", ".join(seen_moltypes))
        sys.exit(-1)

    # did we load any!?
    if not n_loaded:
        error('no signatures found to load into tree!? failing.')
        sys.exit(-1)

    notify('loaded {} sigs; saving SBT under "{}"', n_loaded, args.sbt_name)
    tree.save(args.sbt_name)
示例#10
0
def test_sbt_ipfsstorage():
    """Saving to and loading from IPFSStorage must keep search results.

    Skipped when ``ipfsapi`` cannot be imported; marked xfail when saving
    raises an ipfsapi ConnectionError.
    """
    ipfsapi = pytest.importorskip('ipfsapi')

    factory = GraphFactory(31, 1e5, 4)
    with utils.TempDirectory() as location:
        tree = SBT(factory)

        for filename in utils.SIG_FILES:
            sigpath = utils.get_test_data(filename)
            sig = next(signature.load_signatures(sigpath))
            leaf = SigLeaf(os.path.basename(filename), sig)
            tree.add_node(leaf)
            # the last leaf added ends up being the query
            to_search = leaf

        def search_and_report(sbt):
            # Print a banner and the query, then search ``sbt`` and print hits.
            print('*' * 60)
            print("{}:".format(to_search.metadata))
            matches = {
                str(found)
                for found in sbt.find(search_minhashes, to_search.data, 0.1)
            }
            print(*matches, sep='\n')
            return matches

        old_result = search_and_report(tree)

        tree_path = os.path.join(location, 'tree')
        try:
            with IPFSStorage() as storage:
                tree.save(tree_path, storage=storage)
        except ipfsapi.exceptions.ConnectionError:
            pytest.xfail("ipfs not installed/functioning probably")

        with IPFSStorage() as storage:
            reloaded = SBT.load(tree_path,
                                leaf_loader=SigLeaf.load,
                                storage=storage)

            new_result = search_and_report(reloaded)

            assert old_result == new_result
示例#11
0
def sbt_index(client, db, cell, query, ksize, nsketch, key, file):
    '''Create a sequence Bloom tree from a cell/ database cursor.
    1. select seqs for tree
    2. assign common id (field derivative.minhash.sbt.ids)
    3. minhash seqs, name == UUID, md5? (think about SBT reuse)
    4. query a different collection/ metagenome against this

    --index {raw, minhash}
    input: all of cell or cursor

    \b
    $ zoo sbt_index --db ref --cell ref --ksize 16 --nsketch 1000 \
    reference
    Initialize SBT.
    Compute minhash signatures for selected documents.
    k-mer size: 16, sketch size: 1000
    \\ 9158 Elapsed Time: 0:01:45
    Save SBT.
    Done.

    \b
    $ sourmash sbt_search --ksize 16 reference survey.fa.sig
    # running sourmash subcommand: sbt_search
    loaded query: survey.fa... (k=16, DNA)
    0.11 0ef85591-d464-4953-915f-f673907b7e8e (Zika reference genome)

    TODO: add query
    TODO: --key arg not working?
    '''
    # NOTE(review): the docstring text is left as it was (it looks like CLI
    # help text - confirm); only the invalid escape "\ " was rewritten as
    # "\\ ", which yields the same characters at runtime.
    #
    # Parameters, as used below:
    #   client, db, cell -- MongoClient(client)[db][cell] is the collection
    #                       whose documents are indexed
    #   query            -- not used yet (see the TODO in the docstring)
    #   ksize, nsketch   -- k-mer size and sketch size passed to Estimators
    #   key              -- passed to deep_get() to name each signature/leaf
    #   file             -- passed to tree.save()
    mongo = MongoClient(client)
    try:
        collection = mongo[db][cell]

        print('Initialize SBT.')
        # init SBT
        factory = GraphFactory(ksize=ksize, starting_size=1e5, n_tables=4)
        # 4 .. nt?
        tree = SBT(factory, d=2)  # d .. see "n-ary " in notebook

        print('Compute minhash signatures for selected documents.')
        print('k-mer size: {}, sketch size: {}'.format(ksize, nsketch))
        bar = ProgressBar(max_value=UnknownLength)
        for counter, doc in enumerate(collection.find(), 1):
            e = Estimators(ksize=ksize, n=nsketch)
            e.add_sequence(doc['sequence'], force=True)
            # assumes deep_get is a pure lookup, so one call serves both
            # the signature name and the leaf metadata - TODO confirm
            label = deep_get(doc, key)
            s = SourmashSignature(email='', estimator=e, name=label)
            tree.add_node(node=SigLeaf(metadata=label, data=s))
            bar.update(counter)
        print('\nSave SBT.')
        tree.save(file)
        print('Done.')
    finally:
        # release the database connection even if indexing fails
        mongo.close()
示例#12
0
def test_save_sparseness(n_children):
    """A tree saved with sparseness=1.0 must reload with no internal Node
    objects, give the same search results, and keep a valid structure."""
    factory = GraphFactory(31, 1e5, 4)
    tree = SBT(factory, d=n_children)

    for filename in utils.SIG_FILES:
        sigpath = utils.get_test_data(filename)
        sig = next(signature.load_signatures(sigpath))
        leaf = SigLeaf(os.path.basename(filename), sig)
        tree.add_node(leaf)
        # the last leaf added ends up being the query
        to_search = leaf

    def search_and_report(sbt):
        # Print a banner and the query, then search ``sbt`` and print the hits.
        print('*' * 60)
        print("{}:".format(to_search.metadata))
        matches = {
            str(found)
            for found in sbt.find(search_minhashes, to_search.data, 0.1)
        }
        print(*matches, sep='\n')
        return matches

    old_result = search_and_report(tree)

    with utils.TempDirectory() as location:
        saved_at = os.path.join(location, 'demo')
        tree.save(saved_at, sparseness=1.0)
        tree_loaded = SBT.load(saved_at, leaf_loader=SigLeaf.load)
        assert not any(
            isinstance(n, Node) for n in tree_loaded.nodes.values())

        new_result = search_and_report(tree_loaded)

        assert old_result == new_result

        # iterate over a snapshot of the node table
        for pos, node in list(tree_loaded.nodes.items()):
            # Every parent of a node must be an internal node (and not a
            # leaf), except for node 0 (the root), whose parent is None.
            if pos != 0:
                assert isinstance(tree_loaded.parent(pos).node, Node)

            # Leaf nodes can't have children
            if not isinstance(node, Leaf):
                continue
            assert all(
                child.node is None for child in tree_loaded.children(pos))
示例#13
0
    def sbt_index(self, args):
        """Build an SBT from signature files and save it.

        ``args`` is the command-line argument list: the SBT name followed
        by one or more signature files (or directories to search for
        ``.sig`` files, with ``--traverse-directory``).

        Raises Exception when both --dna and --protein are given.
        """
        from sourmash_lib.sbt import SBT, GraphFactory
        from sourmash_lib.sbtmh import search_minhashes, SigLeaf

        cli = argparse.ArgumentParser()
        cli.add_argument('sbt_name')
        cli.add_argument('signatures', nargs='+')
        cli.add_argument('-k', '--ksize', type=int, default=DEFAULT_K)
        cli.add_argument('--traverse-directory', action='store_true')
        cli.add_argument('-x', '--bf-size', type=float, default=1e5)
        sourmash_args.add_moltype_args(cli)

        args = cli.parse_args(args)

        # protein wins unless --dna was explicitly given as well
        if args.protein and args.dna is True:
            raise Exception('cannot specify both --dna and --protein!')
        args.dna = not args.protein
        moltype = 'protein' if args.protein else 'dna'

        tree = SBT(GraphFactory(1, args.bf_size, 4))

        if args.traverse_directory:
            # collect every *.sig file below each given directory
            inp_files = []
            for dirname in args.signatures:
                for root, dirs, files in os.walk(dirname):
                    inp_files.extend(os.path.join(root, name)
                                     for name in files
                                     if name.endswith('.sig'))
        else:
            inp_files = list(args.signatures)

        print('loading {} files into SBT'.format(len(inp_files)))

        n_loaded = 0
        for path in inp_files:
            matching = sig.load_signatures(path, select_ksize=args.ksize,
                                           select_moltype=moltype)
            for ss in matching:
                tree.add_node(SigLeaf(ss.md5sum(), ss))
                n_loaded += 1

        print('loaded {} sigs; saving SBT under "{}".'.format(n_loaded,
                                                              args.sbt_name))
        tree.save(args.sbt_name)
示例#14
0
def test_tree_repair_add_node():
    """Adding leaves to the tree from leaves.sbt.json must keep it valid."""
    tree_path = utils.get_test_data('leaves.sbt.json')
    tree_repair = SBT.load(tree_path, leaf_loader=SigLeaf.load)

    for filename in utils.SIG_FILES:
        sigpath = utils.get_test_data(filename)
        sig = next(signature.load_signatures(sigpath))
        tree_repair.add_node(SigLeaf(os.path.basename(filename), sig))

    # iterate over a snapshot of the node table
    for pos, node in list(tree_repair.nodes.items()):
        # Every parent of a node must be an internal node (and not a leaf),
        # except for node 0 (the root), whose parent is None.
        if pos != 0:
            assert isinstance(tree_repair.parent(pos).node, Node)

        # Leaf nodes can't have children
        if not isinstance(node, Leaf):
            continue
        assert all(child.node is None for child in tree_repair.children(pos))
示例#15
0
def sbt_search(args):
    """Search an SBT with a query signature and print the matches.

    ``args`` is the command-line argument list: the SBT to load and the
    query signature file.  Matches are printed in order of decreasing
    similarity and, with ``--save-matches``, also written to a file.
    """
    from sourmash_lib.sbt import SBT, GraphFactory
    from sourmash_lib.sbtmh import search_minhashes, SigLeaf
    from sourmash_lib.sbtmh import SearchMinHashesFindBest

    cli = argparse.ArgumentParser()
    cli.add_argument('sbt_name', help='name of SBT to load')
    cli.add_argument('query', help='signature to query')
    cli.add_argument('-k', '--ksize', type=int, default=DEFAULT_K)
    cli.add_argument('--threshold', default=0.08, type=float)
    cli.add_argument('--save-matches', type=argparse.FileType('wt'))
    cli.add_argument('--best-only', action='store_true')
    sourmash_args.add_moltype_args(cli)

    args = cli.parse_args(args)
    moltype = sourmash_args.calculate_moltype(args)

    if args.best_only:
        search_fn = SearchMinHashesFindBest().search
    else:
        search_fn = search_minhashes

    tree = SBT.load(args.sbt_name, leaf_loader=SigLeaf.load)
    query = sourmash_args.load_query_signature(args.query,
                                               select_ksize=args.ksize,
                                               select_moltype=moltype)
    query_moltype = sourmash_args.get_moltype(query)
    query_ksize = query.estimator.ksize
    notify('loaded query: {}... (k={}, {})',
           query.name()[:30], query_ksize, query_moltype)

    # (similarity to the query, matching signature) for every leaf found
    results = [(query.similarity(leaf.data), leaf.data)
               for leaf in tree.find(search_fn, query, args.threshold)]

    results.sort(key=lambda pair: -pair[0])  # reverse sort on similarity
    for score, match in results:
        print('{:.2f} {}'.format(score, match.name()))

    if args.save_matches:
        outname = args.save_matches.name
        notify('saving all matches to "{}"', outname)
        sig.save_signatures([match for (_, match) in results],
                            args.save_matches)
示例#16
0
def test_search_minhashes():
    """Every leaf returned at threshold 0.08 must be at least that similar."""
    tree = SBT(GraphFactory(31, 1e5, 4))

    for filename in utils.SIG_FILES:
        sigpath = utils.get_test_data(filename)
        sig = next(signature.load_signatures(sigpath))
        tree.add_node(SigLeaf(os.path.basename(filename), sig))

    # use the first leaf the tree reports as the query
    to_search = next(iter(tree.leaves()))

    # this fails if 'search_minhashes' is calc containment and not similarity.
    results = tree.find(search_minhashes, to_search.data, 0.08)
    for found in results:
        assert to_search.data.similarity(found.data) >= 0.08

    print(results)
示例#17
0
def test_simple_index(n_children):
    """Searching the SBT must find the same leaves as a LinearIndex."""
    factory = GraphFactory(5, 100, 3)
    root = SBT(factory, d=n_children)

    # leaf label -> the k-mers counted into that leaf's data
    leaf_contents = [
        ("a", ("AAAAA", "AAAAT", "AAAAC")),
        ("b", ("AAAAA", "AAAAT", "AAAAG")),
        ("c", ("AAAAA", "AAAAT", "CAAAA")),
        ("d", ("AAAAA", "CAAAA", "GAAAA")),
        ("e", ("AAAAA", "AAAAT", "GAAAA")),
    ]

    leaves = []
    for label, members in leaf_contents:
        leaf = Leaf(label, factory())
        for member in members:
            leaf.data.count(member)
        leaves.append(leaf)

    for leaf in leaves:
        root.add_node(leaf)

    def search_kmer(obj, seq):
        return obj.data.get(seq)

    kmers = ["AAAAA", "AAAAT", "AAAAG", "CAAAA", "GAAAA"]

    linear = LinearIndex()
    for leaf in leaves:
        linear.insert(leaf)

    for kmer in kmers:
        in_tree = set(root.find(search_kmer, kmer))
        in_linear = set(linear.find(search_kmer, kmer))
        assert in_tree == in_linear

    print("-----")
    for kmer in kmers:
        print([x.metadata for x in root.find(search_kmer, kmer)])
示例#18
0
def load_sbts_and_sigs(filenames, query, is_similarity_query, traverse=False):
    """Load the SBTs and signature files to search ``query`` against.

    Each entry of ``filenames`` is handled as follows:

    * if ``traverse`` is set and the entry is a directory, every file
      yielded by ``traverse_find_sigs`` is loaded as signatures; files
      that fail to load are skipped;
    * otherwise the entry is loaded as an SBT and checked with
      ``check_tree_is_compatible``;
    * if loading it as an SBT raises ValueError or EnvironmentError, it
      is loaded as a signature file instead.

    Exits the process with status -1 when a tree is not compatible, when
    a file yields no signatures or cannot be opened, or when nothing at
    all was loaded.

    Returns a list of ``(database, filename, is_sbt)`` tuples, where
    ``database`` is the loaded tree or a list of signatures.
    """
    query_ksize = query.minhash.ksize
    query_moltype = get_moltype(query)

    n_signatures = 0
    n_databases = 0
    databases = []
    for sbt_or_sigfile in filenames:
        if traverse and os.path.isdir(sbt_or_sigfile):
            for sigfile in traverse_find_sigs([sbt_or_sigfile]):
                try:
                    siglist = sig.load_signatures(sigfile,
                                                  ksize=query_ksize,
                                                  select_moltype=query_moltype)
                    siglist = filter_compatible_signatures(query, siglist, 1)
                    siglist = list(siglist)
                    # the directory, not the individual file, is recorded
                    databases.append((siglist, sbt_or_sigfile, False))
                    notify('loaded {} signatures from {}',
                           len(siglist),
                           sigfile,
                           end='\r')
                    n_signatures += len(siglist)
                except Exception:
                    # Best effort: ignore files that fail to load while
                    # traversing.  ``Exception`` (rather than a bare
                    # ``except``) lets KeyboardInterrupt and SystemExit
                    # propagate.
                    pass

            # done! jump to beginning of main 'for' loop
            continue

        # no traverse? try loading as an SBT.
        try:
            tree = SBT.load(sbt_or_sigfile, leaf_loader=SigLeaf.load)

            if not check_tree_is_compatible(sbt_or_sigfile, tree, query,
                                            is_similarity_query):
                sys.exit(-1)

            databases.append((tree, sbt_or_sigfile, True))
            notify('loaded SBT {}', sbt_or_sigfile, end='\r')
            n_databases += 1

            # done! jump to beginning of main 'for' loop
            continue
        except (ValueError, EnvironmentError):
            # not an SBT - try as a .sig
            pass

        # not a tree? try loading as a signature.
        try:
            siglist = sig.load_signatures(sbt_or_sigfile,
                                          ksize=query_ksize,
                                          select_moltype=query_moltype)
            siglist = list(siglist)
            if len(siglist) == 0:  # file not found, or parse error?
                raise ValueError

            siglist = filter_compatible_signatures(query, siglist, False)
            siglist = list(siglist)

            databases.append((siglist, sbt_or_sigfile, False))
            notify('loaded {} signatures from {}',
                   len(siglist),
                   sbt_or_sigfile,
                   end='\r')
            n_signatures += len(siglist)
        except (EnvironmentError, ValueError):
            error("\nCannot open file '{}'", sbt_or_sigfile)
            sys.exit(-1)

    # blank out the last '\r'-terminated progress line before the summary
    notify(' ' * 79, end='\r')
    if n_signatures and n_databases:
        notify('loaded {} signatures and {} databases total.', n_signatures,
               n_databases)
    elif n_signatures:
        notify('loaded {} signatures.', n_signatures)
    elif n_databases:
        notify('loaded {} databases.', n_databases)
    else:
        # nothing was loaded at all
        sys.exit(-1)

    if databases:
        print('')

    return databases
示例#19
0
def test_longer_search(n_children):
    """Search with sequences longer than one k-mer and a match threshold."""
    ksize = 5
    factory = GraphFactory(ksize, 100, 3)
    root = SBT(factory, d=n_children)

    # leaf label -> the k-mers counted into that leaf's data
    leaf_contents = [
        ("a", ('AAAAA', 'AAAAT', 'AAAAC')),
        ("b", ('AAAAA', 'AAAAT', 'AAAAG')),
        ("c", ('AAAAA', 'AAAAT', 'CAAAA')),
        ("d", ('AAAAA', 'CAAAA', 'GAAAA')),
        ("e", ('AAAAA', 'AAAAT', 'GAAAA')),
    ]

    leaves = []
    for label, members in leaf_contents:
        leaf = Leaf(label, factory())
        for member in members:
            leaf.data.count(member)
        leaves.append(leaf)

    for leaf in leaves:
        root.add_node(leaf)

    def kmers(k, seq):
        # yield every length-k window of seq, left to right
        last_start = len(seq) - k
        for start in range(last_start + 1):
            yield seq[start:start + k]

    def search_transcript(node, seq, threshold):
        # 1 if enough of seq's k-mers are present in the node, else 0
        n_present = sum([node.data.get(kmer) for kmer in kmers(ksize, seq)])
        n_required = int(threshold * (len(seq) - ksize + 1))
        return 1 if n_present >= n_required else 0

    def labels_found(seq, threshold):
        return [x.metadata for x in root.find(search_transcript, seq,
                                              threshold)]

    try1 = labels_found("AAAAT", 1.0)
    assert set(try1) == {'a', 'b', 'c', 'e'}, try1  # no 'd'

    try2 = labels_found("GAAAAAT", 0.6)
    assert set(try2) == {'a', 'b', 'c', 'd', 'e'}

    try3 = labels_found("GAAAA", 1.0)
    assert set(try3) == {'d', 'e'}, try3
示例#20
0
def test_sbt_combine(n_children):
    """Combining two partial trees must match a tree built from all leaves."""
    factory = GraphFactory(31, 1e5, 4)
    tree = SBT(factory, d=n_children)
    tree_1 = SBT(factory, d=n_children)
    tree_2 = SBT(factory, d=n_children)

    n_leaves = 0
    for filename in utils.SIG_FILES:
        sigpath = utils.get_test_data(filename)
        sig = next(signature.load_signatures(sigpath))
        leaf = SigLeaf(os.path.basename(filename), sig)
        tree.add_node(leaf)
        # the first four leaves go to tree_1, the rest to tree_2
        partial = tree_1 if n_leaves < 4 else tree_2
        partial.add_node(leaf)
        n_leaves += 1

    tree_1.combine(tree_2)

    t1_leaves = set(str(found) for found in tree_1.leaves())
    t_leaves = set(str(found) for found in tree.leaves())

    assert len(t1_leaves) == n_leaves
    assert len(t_leaves) == len(t1_leaves)
    assert t1_leaves == t_leaves

    query_path = utils.get_test_data(utils.SIG_FILES[0])
    to_search = next(signature.load_signatures(query_path))
    t1_result = set(
        str(hit) for hit in tree_1.find(search_minhashes, to_search, 0.1))
    tree_result = set(
        str(hit) for hit in tree.find(search_minhashes, to_search, 0.1))
    assert t1_result == tree_result

    # TODO: save and load both trees

    # check if adding a new node will use the next empty position
    next_empty = 0
    for position, node in tree_1.nodes.items():
        if node is None:
            next_empty = position
            break
    if not next_empty:
        # no empty slot found: expect the slot after the last one visited
        next_empty = position + 1

    # re-add the last leaf created in the loop above
    tree_1.add_node(leaf)
    assert tree_1.max_node == next_empty
示例#21
0
# Serialize the result of db.ref.find() with a 4-space indent and write it
# to ref.json in the current directory.
# NOTE(review): `dumps` and `db` are defined outside this fragment;
# presumably a BSON-aware JSON dumper and a MongoDB database - confirm.
with open('ref.json', 'w+') as outjson:
    outjson.write(dumps(db.ref.find(), indent=4))

from sourmash_lib import Estimators
from sourmash_lib.sbt import SBT, GraphFactory
from sourmash_lib.sbtmh import SigLeaf, search_minhashes
from sourmash_lib.signature import SourmashSignature

# Build an SBT with one SigLeaf per document returned by db.ref.find().
KSIZE = 16  # k-mer size passed to GraphFactory and Estimators
N = 1000  # passed to Estimators as n

# init SBT
factory = GraphFactory(ksize=KSIZE, starting_size=1e5, n_tables=4)
# 4 .. nt?
tree = SBT(factory, d=2)  # d .. see "n-ary " in notebook

# total is not known up front, so the bar only shows the running count
bar = progressbar.ProgressBar(max_value=progressbar.UnknownLength)
cursor = db.ref.find()
c = 0  # number of documents processed so far
for i in cursor:
    # this field is used both as signature name and as leaf metadata
    key = deep_get(i, 'metadata.alt_id.gb')
    seq = i['sequence']  # e.g. db.ref.find_one()['sequence'] -> 'ACTG...'
    e = Estimators(ksize=KSIZE, n=N)
    e.add_sequence(seq, force=True)  # hashes then available via e.get_hashes()
    s = SourmashSignature(email='', estimator=e, name=key)

    leaf = SigLeaf(metadata=key, data=s)
    tree.add_node(node=leaf)
    c += 1
    bar.update(c)
示例#22
0
def test_simple(n_children):
    """Searching the SBT must find the same leaves as a scan of the leaf list."""
    factory = GraphFactory(5, 100, 3)
    root = SBT(factory, d=n_children)

    # leaf label -> the k-mers counted into that leaf's data
    leaf_contents = [
        ("a", ('AAAAA', 'AAAAT', 'AAAAC')),
        ("b", ('AAAAA', 'AAAAT', 'AAAAG')),
        ("c", ('AAAAA', 'AAAAT', 'CAAAA')),
        ("d", ('AAAAA', 'CAAAA', 'GAAAA')),
        ("e", ('AAAAA', 'AAAAT', 'GAAAA')),
    ]

    leaves = []
    for label, members in leaf_contents:
        leaf = Leaf(label, factory())
        for member in members:
            leaf.data.count(member)
        leaves.append(leaf)

    for leaf in leaves:
        root.add_node(leaf)

    def search_kmer(obj, seq):
        return obj.data.get(seq)

    kmers = ["AAAAA", "AAAAT", "AAAAG", "CAAAA", "GAAAA"]

    def search_kmer_in_list(kmer):
        # reference answer: scan every leaf directly
        return {candidate for candidate in leaves if candidate.data.get(kmer)}

    for kmer in kmers:
        in_tree = set(root.find(search_kmer, kmer))
        assert in_tree == search_kmer_in_list(kmer)

    print('-----')
    for kmer in kmers:
        print([x.metadata for x in root.find(search_kmer, kmer)])
示例#23
0
def sbt_gather(args):
    """Greedily decompose a query signature into its best matches in an SBT.

    Command-line arguments (parsed from ``args``):

    * ``sbt_name`` -- SBT to search.
    * ``query`` -- file holding the query signature.
    * ``-k/--ksize`` and the molecule-type options added by
      ``sourmash_args.add_moltype_args``.
    * ``--threshold`` -- accepted, but not consulted by this implementation.
    * ``-o/--output``, ``--csv``, ``--save-matches`` -- optional output files.

    Each round finds the best-matching leaf, reports which fraction of that
    leaf and of the original query the shared hashes represent, and removes
    the leaf's hashes from the query before searching again.  The loop stops
    when nothing matches or when fewer than five hashes are shared.

    Exits with status -1 if the query has ``max_hash == 0`` or if a best
    match has neither ``max_hash`` nor ``hll``; exits with status 0 if no
    match was recorded.
    """
    from sourmash_lib.sbt import SBT, GraphFactory
    from sourmash_lib.sbtmh import search_minhashes, SigLeaf
    from sourmash_lib.sbtmh import SearchMinHashesFindBestIgnoreMaxHash

    parser = argparse.ArgumentParser()
    parser.add_argument('sbt_name', help='name of SBT to search')
    parser.add_argument('query', help='query signature')
    parser.add_argument('-k', '--ksize', type=int, default=DEFAULT_K)
    parser.add_argument('--threshold', default=0.05, type=float)
    parser.add_argument('-o', '--output', type=argparse.FileType('wt'))
    parser.add_argument('--csv', type=argparse.FileType('wt'))
    parser.add_argument('--save-matches', type=argparse.FileType('wt'))

    sourmash_args.add_moltype_args(parser)

    args = parser.parse_args(args)
    moltype = sourmash_args.calculate_moltype(args)

    tree = SBT.load(args.sbt_name, leaf_loader=SigLeaf.load)
    query = sourmash_args.load_query_signature(args.query,
                                               select_ksize=args.ksize,
                                               select_moltype=moltype)
    query_moltype = sourmash_args.get_moltype(query)
    query_ksize = query.estimator.ksize
    notify('loaded query: {}... (k={}, {})',
           query.name()[:30], query_ksize, query_moltype)

    if query.estimator.max_hash == 0:
        error('query signature needs to be created with --scaled')
        error('or using --with-cardinality.')
        sys.exit(-1)

    notify('query signature has max_hash: {}', query.estimator.max_hash)
    orig_query = query

    R_metagenome = 2**64 / float(orig_query.estimator.max_hash)

    # the original query never changes, so its hash count is loop-invariant
    query_n_mins = len(orig_query.estimator.get_hashes())

    # rebuild the query as a plain estimator sized to hold all of its hashes
    new_mins = query.estimator.get_hashes()
    e = sourmash_lib.Estimators(ksize=args.ksize, n=len(new_mins))
    e.update(query.estimator)
    query = sig.SourmashSignature('', e)

    sum_found = 0.
    found = []
    while 1:
        # a fresh searcher is created for every round
        search_fn = SearchMinHashesFindBestIgnoreMaxHash().search

        results = []
        # use super low threshold for this part of the search
        for leaf in tree.find(search_fn, query, 0.00001):
            results.append((query.estimator.similarity_ignore_maxhash(
                leaf.data.estimator), leaf.data))

        if not len(results):  # no matches at all!
            break

        # take the best result
        results.sort(key=lambda x: -x[0])  # reverse sort on similarity
        best_sim, best_ss = results[0]

        # subtract found hashes from search hashes, construct new search
        new_mins = set(query.estimator.get_hashes())
        found_mins = best_ss.estimator.get_hashes()

        if best_ss.estimator.max_hash:
            R_genome = 2**64 / float(best_ss.estimator.max_hash)
        elif best_ss.estimator.hll:
            genome_size = best_ss.estimator.hll.estimate_cardinality()
            genome_max_hash = max(found_mins)
            R_genome = float(genome_size) / float(genome_max_hash)
        else:
            error('Best hash match in sbt_gather has no cardinality')
            error('Please prepare database of sequences with --scaled')
            error('...or with --with-cardinality')
            sys.exit(-1)

        # restrict both hash sets to the range implied by the larger ratio
        R_comparison = max(R_metagenome, R_genome)
        new_max_hash = 2**64 / float(R_comparison)
        new_mins = set([i for i in new_mins if i < new_max_hash])
        found_mins = set([i for i in found_mins if i < new_max_hash])

        # intersection:
        intersect_mins = new_mins.intersection(found_mins)

        if len(intersect_mins) < 5:  # hard cutoff for now
            notify('found only {} hashes in common.', len(intersect_mins))
            notify('this is below a sane threshold => exiting.')
            break

        # first denominator - genome size
        genome_n_mins = len(found_mins)
        f_genome = len(intersect_mins) / float(genome_n_mins)

        # second denominator - metagenome size
        f_query = len(intersect_mins) / float(query_n_mins)

        # print interim & save
        notify('found: {:.2f} {:.2f} {}', f_genome, f_query, best_ss.name())
        found.append((f_genome, best_ss))
        # accumulate the share of the original query explained so far, so the
        # "total fraction" summary below is no longer stuck at zero
        sum_found += f_query

        new_mins -= set(found_mins)
        e = sourmash_lib.Estimators(ksize=args.ksize, n=len(new_mins))
        e.add_many(new_mins)
        query = sig.SourmashSignature('', e)

    notify('found {}, total fraction {:.3f}', len(found), sum_found)
    notify('')

    if not found:
        sys.exit(0)

    found.sort(key=lambda x: x[0])
    found.reverse()

    notify('Composition:')
    for (frac, leaf_sketch) in found:
        notify('{:.2f} {}', frac, leaf_sketch.name())

    if args.output:
        print('Composition:', file=args.output)
        for (frac, leaf_sketch) in found:
            print('{:.2f} {}'.format(frac, leaf_sketch.name()),
                  file=args.output)

    if args.csv:
        fieldnames = ['fraction', 'name', 'sketch_kmers']
        w = csv.DictWriter(args.csv, fieldnames=fieldnames)

        w.writeheader()
        for (frac, leaf_sketch) in found:
            # matches accepted through max_hash may carry no hll; leave the
            # cell empty for them instead of failing on a missing attribute
            hll = leaf_sketch.estimator.hll
            cardinality = hll.estimate_cardinality() if hll else None
            w.writerow(
                dict(fraction=frac,
                     name=leaf_sketch.name(),
                     sketch_kmers=cardinality))
    if args.save_matches:
        outname = args.save_matches.name
        notify('saving all matches to "{}"', outname)
        sig.save_signatures([ss for (f, ss) in found], args.save_matches)
示例#24
0
    def sbt_search(self, args):
        """Search an SBT for signatures similar to a single query signature.

        Command-line arguments (parsed from ``args``): ``sbt_name``,
        ``query``, ``-k/--ksize``, ``--threshold``, ``--save-matches``,
        ``--best-only`` and the molecule-type options added by
        ``sourmash_args.add_moltype_args``.

        Prints every match at or above the search threshold, best first, and
        optionally saves the matching signatures.  Exits with status -1
        unless exactly one signature in the query file matches the requested
        ksize and molecule type.

        Raises:
            Exception: if both ``--dna`` and ``--protein`` are requested.
        """
        from sourmash_lib.sbt import SBT, GraphFactory
        from sourmash_lib.sbtmh import search_minhashes, SigLeaf
        from sourmash_lib.sbtmh import SearchMinHashesFindBest

        parser = argparse.ArgumentParser()
        parser.add_argument('sbt_name')
        parser.add_argument('query')
        parser.add_argument('-k', '--ksize', type=int, default=DEFAULT_K)
        parser.add_argument('--threshold', default=0.08, type=float)
        parser.add_argument('--save-matches', type=argparse.FileType('wt'))
        parser.add_argument('--best-only', action='store_true')

        sourmash_args.add_moltype_args(parser)
        args = parser.parse_args(args)

        if args.protein:
            if args.dna is True:
                raise Exception('cannot specify both --dna and --protein!')
            args.dna = False

        moltype = None
        if args.protein:
            moltype = 'protein'
        elif args.dna:
            moltype = 'dna'

        search_fn = search_minhashes
        if args.best_only:
            search_fn = SearchMinHashesFindBest().search

        tree = SBT.load(args.sbt_name, leaf_loader=SigLeaf.load)
        sl = sig.load_signatures(args.query, select_ksize=args.ksize,
                                 select_moltype=moltype)
        sl = list(sl)
        if len(sl) != 1:
            # both halves of the diagnostic belong on stderr
            print('When loading query from "{}",'.format(args.query),
                  file=sys.stderr)
            print('{} query signatures matching ksize and molecule type; need exactly one.'.format(len(sl)),
                  file=sys.stderr)
            sys.exit(-1)

        query = sl[0]

        query_moltype = 'UNKNOWN'
        if query.estimator.is_molecule_type('dna'):
            query_moltype = 'DNA'
        elif query.estimator.is_molecule_type('protein'):
            query_moltype = 'protein'
        query_ksize = query.estimator.ksize
        print('loaded query: {}... (k={}, {})'.format(query.name()[:30],
                                                      query_ksize,
                                                      query_moltype))

        results = []
        for leaf in tree.find(search_fn, query, args.threshold):
            results.append((query.similarity(leaf.data), leaf.data))

        results.sort(key=lambda x: -x[0])   # reverse sort on similarity
        # use a distinct loop name so the query signature is not rebound
        for (similarity, match) in results:
            print('{:.2f} {}'.format(similarity, match.name()))

        if args.save_matches:
            outname = args.save_matches.name
            print('saving all matches to "{}"'.format(outname))
            sig.save_signatures([m for (sim, m) in results],
                                args.save_matches)
示例#25
0
def categorize(args):
    """Report the best SBT match for each of a set of query signatures.

    Command-line arguments (parsed from ``args``):

    * ``sbt_name`` -- SBT to load.
    * ``queries`` -- signature files, or directories to walk when
      ``--traverse-directory`` is given.
    * ``-k/--ksize``, ``--threshold`` and the molecule-type options added by
      ``sourmash_args.add_moltype_args``.
    * ``--csv`` -- file (opened in append mode) receiving one
      ``query file, best match name, similarity`` row per query.
    * ``--load-csv`` -- CSV file whose first column lists query files to skip.

    A leaf whose md5sum equals the query's is ignored, so a signature never
    matches itself.
    """
    from sourmash_lib.sbt import SBT, GraphFactory
    from sourmash_lib.sbtmh import search_minhashes, SigLeaf
    from sourmash_lib.sbtmh import SearchMinHashesFindBest

    parser = argparse.ArgumentParser()
    parser.add_argument('sbt_name', help='name of SBT to load')
    parser.add_argument('queries',
                        nargs='+',
                        help='list of signatures to categorize')
    parser.add_argument('-k', '--ksize', type=int, default=None)
    parser.add_argument('--threshold', default=0.08, type=float)
    parser.add_argument('--traverse-directory', action="store_true")

    sourmash_args.add_moltype_args(parser)

    parser.add_argument('--csv', type=argparse.FileType('at'))
    parser.add_argument('--load-csv', default=None)

    args = parser.parse_args(args)
    moltype = sourmash_args.calculate_moltype(args)

    already_names = set()
    if args.load_csv:
        with open(args.load_csv, 'rt') as fp:
            for row in csv.reader(fp):
                # csv.reader yields [] for blank lines; skip those rather
                # than failing on row[0]
                if row:
                    already_names.add(row[0])

    tree = SBT.load(args.sbt_name, leaf_loader=SigLeaf.load)

    if args.traverse_directory:
        candidates = sourmash_args.traverse_find_sigs(args.queries)
    else:
        candidates = args.queries

    # de-duplicate and drop everything already listed, once, for both modes
    inp_files = set(candidates) - already_names

    notify('found {} files to query', len(inp_files))

    loader = sourmash_args.LoadSingleSignatures(inp_files, args.ksize, moltype)

    # one writer serves every row; no need to rebuild it per query
    csv_writer = csv.writer(args.csv) if args.csv else None

    for queryfile, query, query_moltype, query_ksize in loader:
        notify('loaded query: {}... (k={}, {})',
               query.name()[:30], query_ksize, query_moltype)

        results = []
        search_fn = SearchMinHashesFindBest().search

        for leaf in tree.find(search_fn, query, args.threshold):
            if leaf.data.md5sum() != query.md5sum():  # ignore self.
                results.append((query.similarity(leaf.data), leaf.data))

        best_hit_sim = 0.0
        best_hit_query_name = ""
        if results:
            results.sort(key=lambda x: -x[0])  # reverse sort on similarity
            best_hit_sim, best_hit_query = results[0]
            notify('for {}, found: {:.2f} {}', query.name(), best_hit_sim,
                   best_hit_query.name())
            best_hit_query_name = best_hit_query.name()
        else:
            notify('for {}, no match found', query.name())

        if csv_writer is not None:
            csv_writer.writerow([queryfile, best_hit_query_name,
                                 best_hit_sim])

    if loader.skipped_ignore:
        notify('skipped/ignore: {}', loader.skipped_ignore)
    if loader.skipped_nosig:
        notify('skipped/nosig: {}', loader.skipped_nosig)
示例#26
0
    def sbt_gather(self, args):
        """Greedily decompose a query signature into its best SBT matches.

        Command-line arguments (parsed from ``args``): ``sbt_name``,
        ``query``, ``-k/--ksize``, ``--threshold``, ``-o/--output``,
        ``--csv`` and the molecule-type options added by
        ``sourmash_args.add_moltype_args``.

        Each round finds the best match for the remaining query hashes,
        records it, and removes that match's hashes from the query.  The loop
        ends when nothing matches, or when the very first best match is below
        ``--threshold`` similarity to the original query.

        Exits with status -1 unless exactly one query signature matches the
        requested ksize and molecule type, and with status 0 when nothing
        was found.

        Raises:
            Exception: if both ``--dna`` and ``--protein`` are requested.
        """
        from sourmash_lib.sbt import SBT, GraphFactory
        from sourmash_lib.sbtmh import search_minhashes, SigLeaf
        from sourmash_lib.sbtmh import SearchMinHashesFindBest

        parser = argparse.ArgumentParser()
        parser.add_argument('sbt_name')
        parser.add_argument('query')
        parser.add_argument('-k', '--ksize', type=int, default=DEFAULT_K)
        parser.add_argument('--threshold', default=0.05, type=float)
        parser.add_argument('-o', '--output', type=argparse.FileType('wt'))
        parser.add_argument('--csv', type=argparse.FileType('wt'))

        sourmash_args.add_moltype_args(parser)

        args = parser.parse_args(args)

        if args.protein:
            if args.dna is True:
                raise Exception('cannot specify both --dna and --protein!')
            args.dna = False

        moltype = None
        if args.protein:
            moltype = 'protein'
        elif args.dna:
            moltype = 'dna'

        # load the tree once; it is not modified below
        tree = SBT.load(args.sbt_name, leaf_loader=SigLeaf.load)
        sl = sig.load_signatures(args.query, select_ksize=args.ksize,
                                 select_moltype=moltype)
        sl = list(sl)
        if len(sl) != 1:
            # both halves of the diagnostic belong on stderr
            print('When loading query from "{}",'.format(args.query),
                  file=sys.stderr)
            print('{} query signatures matching ksize and molecule type; need exactly one.'.format(len(sl)),
                  file=sys.stderr)
            sys.exit(-1)

        query = sl[0]

        query_moltype = 'UNKNOWN'
        if query.estimator.is_molecule_type('dna'):
            query_moltype = 'DNA'
        elif query.estimator.is_molecule_type('protein'):
            query_moltype = 'protein'
        query_ksize = query.estimator.ksize
        print('loaded query: {}... (k={}, {})'.format(query.name()[:30],
                                                      query_ksize,
                                                      query_moltype))

        orig_query = query

        # the original query's cardinality is loop-invariant; 0 means
        # "unavailable" (no hll on the query)
        query_hll = orig_query.estimator.hll
        query_kmers = query_hll.estimate_cardinality() if query_hll else 0

        sum_found = 0.
        found = []
        while 1:
            search_fn = SearchMinHashesFindBest().search

            results = []
            # use super low threshold for this part of the search
            for leaf in tree.find(search_fn, query, 0.00001):
                results.append((query.similarity(leaf.data), leaf.data))

            if not len(results):          # no matches at all!
                break

            # take the best result
            results.sort(key=lambda x: -x[0])   # reverse sort on similarity
            best_sim, best_ss = results[0]
            sim = best_ss.similarity(orig_query)

            # adjust by size of leaf (kmer cardinality of original genome);
            # without cardinality on either side the fraction stays 0
            f_of_total = 0
            leaf_hll = best_ss.estimator.hll
            if leaf_hll and query_kmers:
                leaf_kmers = leaf_hll.estimate_cardinality()
                # float() keeps this a true division under Python 2 as well
                f_of_total = float(leaf_kmers) / query_kmers * sim

            if not found and sim < args.threshold:
                print('best match: {}'.format(best_ss.name()))
                print('similarity is {:.5f} of db signature;'.format(sim))
                print('this is below specified threshold => exiting.')
                break

            # subtract found hashes from search hashes, construct new search
            new_mins = set(query.estimator.mh.get_mins())
            found_mins = best_ss.estimator.mh.get_mins()

            # print interim & save
            print('found: {:.2f} {} {}'.format(f_of_total,
                                               len(new_mins),
                                               best_ss.name()))
            found.append((f_of_total, best_ss, sim))
            sum_found += f_of_total

            new_mins -= set(found_mins)
            e = sourmash_lib.Estimators(ksize=args.ksize, n=len(new_mins))
            for m in new_mins:
                e.mh.add_hash(m)
            query = sig.SourmashSignature('foo', e)

        print('found {}, total fraction {:.3f}'.format(len(found), sum_found))
        print('')

        if not found:
            sys.exit(0)

        # sort on the fraction only: a plain tuple sort would fall through to
        # comparing signature objects whenever two fractions tie
        found.sort(key=lambda x: x[0])
        found.reverse()

        print('Composition:')
        for (frac, leaf_sketch, sim) in found:
            print('{:.2f} {}'.format(frac, leaf_sketch.name()))

        if args.output:
            print('Composition:', file=args.output)
            for (frac, leaf_sketch, sim) in found:
                print('{:.2f} {}'.format(frac, leaf_sketch.name()),
                      file=args.output)

        if args.csv:
            fieldnames = ['fraction', 'name', 'similarity', 'sketch_kmers']
            w = csv.DictWriter(args.csv, fieldnames=fieldnames)

            w.writeheader()
            for (frac, leaf_sketch, sim) in found:
                # hll is optional (see the loop above); leave the cell empty
                # when it is missing
                hll = leaf_sketch.estimator.hll
                cardinality = hll.estimate_cardinality() if hll else None
                w.writerow(dict(fraction=frac, name=leaf_sketch.name(),
                                similarity=sim,
                                sketch_kmers=cardinality))
示例#27
0
    def categorize(self, args):
        """Report the best SBT match for each of a set of query signatures.

        Command-line arguments (parsed from ``args``):

        * ``sbt_name`` -- SBT to load.
        * ``queries`` -- signature files, or directories that are walked for
          ``*.sig`` files when ``--traverse-directory`` is given.
        * ``-k/--ksize``, ``--threshold`` and the molecule-type options added
          by ``sourmash_args.add_moltype_args``.
        * ``--csv`` -- file (opened in append mode) receiving one
          ``query file, best match name, similarity`` row per query.
        * ``--load-csv`` -- CSV file whose first column lists query files to
          skip.

        A leaf whose md5sum equals the query's is ignored, so a signature
        never matches itself.

        Raises:
            Exception: if both ``--dna`` and ``--protein`` are requested.
        """
        from sourmash_lib.sbt import SBT, GraphFactory
        from sourmash_lib.sbtmh import search_minhashes, SigLeaf
        from sourmash_lib.sbtmh import SearchMinHashesFindBest

        parser = argparse.ArgumentParser()
        parser.add_argument('sbt_name')
        parser.add_argument('queries', nargs='+')
        parser.add_argument('-k', '--ksize', type=int, default=DEFAULT_K)
        parser.add_argument('--threshold', default=0.08, type=float)
        parser.add_argument('--traverse-directory', action="store_true")

        sourmash_args.add_moltype_args(parser)

        parser.add_argument('--csv', type=argparse.FileType('at'))
        parser.add_argument('--load-csv', default=None)

        args = parser.parse_args(args)

        if args.protein:
            if args.dna is True:
                raise Exception('cannot specify both --dna and --protein!')
            args.dna = False

        moltype = None
        if args.protein:
            moltype = 'protein'
        elif args.dna:
            moltype = 'dna'

        already_names = set()
        if args.load_csv:
            with open(args.load_csv, 'rt') as fp:
                for row in csv.reader(fp):
                    # csv.reader yields [] for blank lines; skip those rather
                    # than failing on row[0]
                    if row:
                        already_names.add(row[0])

        tree = SBT.load(args.sbt_name, leaf_loader=SigLeaf.load)

        if args.traverse_directory:
            candidates = []
            for dirname in args.queries:
                for root, dirs, files in os.walk(dirname):
                    for name in files:
                        if name.endswith('.sig'):
                            candidates.append(os.path.join(root, name))
        else:
            candidates = args.queries

        # honour --load-csv in both modes, keeping the discovery order
        inp_files = [f for f in candidates if f not in already_names]

        print('found {} files to query'.format(len(inp_files)))

        loader = sourmash_args.LoadSingleSignatures(inp_files,
                                                    args.ksize, moltype)

        # one writer serves every row; no need to rebuild it per query
        csv_writer = csv.writer(args.csv) if args.csv else None

        for queryfile, query, query_moltype, query_ksize in loader:
            print('loaded query: {}... (k={}, {})'.format(query.name()[:30],
                                                          query_ksize,
                                                          query_moltype))

            results = []
            search_fn = SearchMinHashesFindBest().search

            for leaf in tree.find(search_fn, query, args.threshold):
                # ignore self
                if leaf.data.md5sum() != query.md5sum():
                    results.append((query.similarity(leaf.data), leaf.data))

            best_hit_sim = 0.0
            best_hit_query_name = ""
            if results:
                results.sort(key=lambda x: -x[0])   # reverse sort on similarity
                best_hit_sim, best_hit_query = results[0]
                print('for {}, found: {:.2f} {}'.format(query.name(),
                                                        best_hit_sim,
                                                        best_hit_query.name()))
                best_hit_query_name = best_hit_query.name()
            else:
                print('for {}, no match found'.format(query.name()))

            if csv_writer is not None:
                csv_writer.writerow([queryfile, best_hit_query_name,
                                     best_hit_sim])

        if loader.skipped_ignore:
            print('skipped/ignore: {}'.format(loader.skipped_ignore))
        if loader.skipped_nosig:
            print('skipped/nosig: {}'.format(loader.skipped_nosig))
示例#28
0
def _summarize_direction(query_dir, target_dir, sigdict, matches, contained,
                         contain_threshold, ident_threshold):
    """Print match, containment and identity counts for one direction.

    ``matches`` maps a query name to ``(best_match, similarity)`` and
    ``contained`` maps a query name to its containment value; names missing
    from either are treated as "no match" / containment 0.0.
    """
    c_ident = 0
    c_match = 0
    c_contain = 0
    c_no_match = 0
    c_no_contain = 0
    identical_names = []

    for query_name in sigdict:
        best_match = None
        similarity = 0.0
        cont = 0.0

        if query_name in matches:
            (best_match, similarity) = matches[query_name]
        if query_name in contained:
            cont = contained[query_name]

        if not best_match:
            c_no_match += 1
        else:
            c_match += 1

        if cont < contain_threshold:
            c_no_contain += 1
        else:
            c_contain += 1

        # inclusive, matching the ">=" rule printed by main() and the
        # inclusive containment test just above
        if similarity >= ident_threshold:
            identical_names.append((query_name, best_match))
            c_ident += 1

    print('----')
    print('{} vs {}: {} signatures'.format(query_dir, target_dir,
                                           len(sigdict)))
    print('identical count:', c_ident)
    print('containment count:', c_contain)
    print('matches:', c_match)
    print('no match:', c_no_match)
    print('no contain:', c_no_contain)

    print('identical:')
    for (k, v) in identical_names:
        print("{} = {}".format(k, v))


def main():
    """Compare two signature directories against each other's SBT.

    Command-line arguments: ``dir1 sbt1 dir2 sbt2`` and ``-k/--ksize``
    (default 31).  Signatures from each directory are searched against the
    other directory's SBT, containment is computed for the matches, and a
    summary is printed for each direction (1 in 2, then 2 in 1).
    """
    p = argparse.ArgumentParser()
    p.add_argument('dir1')
    p.add_argument('sbt1')
    p.add_argument('dir2')
    p.add_argument('sbt2')
    p.add_argument('-k', '--ksize', type=int, default=31)
    args = p.parse_args()

    print('loading all signatures:', args.dir1)
    sigdict1 = load_all_signatures(args.dir1, args.ksize)
    tree1 = SBT.load(args.sbt1, leaf_loader=SigLeaf.load)
    print('...loaded {} signatures at k={}'.format(len(sigdict1), args.ksize))

    print('loading all signatures:', args.dir2)
    sigdict2 = load_all_signatures(args.dir2, args.ksize)
    tree2 = SBT.load(args.sbt2, leaf_loader=SigLeaf.load)
    print('...loaded {} signatures at k={}'.format(len(sigdict2), args.ksize))

    # first, find all matches in 2 for 1, and 1 for 2
    THRESHOLD = 0.05
    matches_1_in_2 = make_all_matches(sigdict1, tree2, THRESHOLD)
    matches_2_in_1 = make_all_matches(sigdict2, tree1, THRESHOLD)

    # now, do containment
    contained_1_in_2 = containment(matches_1_in_2, sigdict1, sigdict2)
    contained_2_in_1 = containment(matches_2_in_1, sigdict2, sigdict1)

    # summary stats
    CONTAIN_THRESHOLD = 0.95
    IDENT_THRESHOLD = 0.80

    print('thresholds:')
    print('min Jaccard similarity for any match:', THRESHOLD)
    print('to score as identical, similarity must be >=', IDENT_THRESHOLD)
    print('to score as contained, containment must be >=', CONTAIN_THRESHOLD)

    # 1 in 2
    _summarize_direction(args.dir1, args.dir2, sigdict1, matches_1_in_2,
                         contained_1_in_2, CONTAIN_THRESHOLD,
                         IDENT_THRESHOLD)

    # 2 in 1
    _summarize_direction(args.dir2, args.dir1, sigdict2, matches_2_in_1,
                         contained_2_in_1, CONTAIN_THRESHOLD,
                         IDENT_THRESHOLD)
示例#29
0
    def watch(self, args):
        """Build a signature from raw FASTA/FASTQ coming in on stdin, search.

        Command-line arguments (parsed from ``args``): ``sbt_name``,
        ``-o/--output``, ``-k/--ksize``, ``--threshold``,
        ``--input-is-protein``, ``-n/--num-hashes``, ``--name`` and the
        molecule-type options added by ``sourmash_args.add_moltype_args``.

        Records are added to a streaming signature; every ``WATERMARK_SIZE``
        records the SBT is searched and reading stops at the first hit.  The
        best match (if any) is reported and the signature is optionally
        saved to ``--output``.
        """
        from sourmash_lib.sbt import SBT, GraphFactory
        from sourmash_lib.sbtmh import search_minhashes, SigLeaf
        from sourmash_lib.sbtmh import SearchMinHashesFindBest

        parser = argparse.ArgumentParser()
        parser.add_argument('sbt_name')
        parser.add_argument('-o', '--output', type=argparse.FileType('wt'))
        parser.add_argument('-k', '--ksize', type=int, default=DEFAULT_K)
        parser.add_argument('--threshold', default=0.05, type=float)
        parser.add_argument('--input-is-protein', action='store_true')
        sourmash_args.add_moltype_args(parser, default_dna=True)
        parser.add_argument('-n', '--num-hashes', type=int,
                            default=DEFAULT_N,
                            help='number of hashes to use in each sketch (default: %(default)i)')
        parser.add_argument('--name', type=str, default='stdin')
        args = parser.parse_args(args)

        if args.input_is_protein and args.dna:
            print('WARNING: input is protein, turning off DNA hash computing.',
                  file=sys.stderr)
            args.dna = False
            args.protein = True

        # NOTE(review): this only reports the conflict; execution continues
        # with the DNA settings below -- confirm whether it should exit.
        if args.dna and args.protein:
            notify('ERROR: cannot use "watch" with both DNA and protein.')

        if args.dna:
            moltype = 'DNA'
            is_protein = False
        else:
            moltype = 'protein'
            is_protein = True

        E = sourmash_lib.Estimators(ksize=args.ksize, n=args.num_hashes,
                                    protein=is_protein)
        streamsig = sig.SourmashSignature('', E, filename='stdin',
                                          name=args.name)

        notify('Computing signature for k={}, {} from stdin',
               args.ksize, moltype)

        tree = SBT.load(args.sbt_name, leaf_loader=SigLeaf.load)

        def do_search():
            # search with whatever the streaming signature holds right now
            search_fn = SearchMinHashesFindBest().search

            results = []
            for leaf in tree.find(search_fn, streamsig, args.threshold):
                results.append((streamsig.similarity(leaf.data),
                                leaf.data))

            return results

        notify('reading sequences from stdin')
        screed_iter = screed.open('/dev/stdin')
        watermark = WATERMARK_SIZE

        # iterate over input records; n_read counts records actually added,
        # whereas the enumerate index is one short once the input runs out
        n_read = 0
        for n, record in enumerate(screed_iter):
            # at each watermark, print status & check cardinality
            if n >= watermark:
                notify('... read {} sequences', n)
                watermark += WATERMARK_SIZE

                if do_search():
                    break

            if args.input_is_protein:
                E.mh.add_protein(record.sequence)
            else:
                E.add_sequence(record.sequence, False)
            n_read += 1

        results = do_search()
        if not results:
            notify('... read {} sequences, no matches found.', n_read)
        else:
            results.sort(key=lambda x: -x[0])   # take best
            similarity, found_sig = results[0]
            notify('FOUND: {}, at {:.3f}', found_sig.name(),
                   similarity)

        if args.output:
            sig.save_signatures([streamsig], args.output)
示例#30
0
def watch(args):
    """Build a signature from raw FASTA/FASTQ coming in on stdin, search.

    Command-line arguments (parsed from ``args``): ``sbt_name``, optional
    ``inp_file`` (default ``/dev/stdin``), ``-q/--quiet``, ``-o/--output``,
    ``--threshold``, ``--input-is-protein``, ``-n/--num-hashes``, ``--name``,
    the molecule-type options added by
    ``sourmash_args.add_construct_moltype_args`` and the ksize option added
    by ``sourmash_args.add_ksize_arg``.

    When no ksize is given it is taken from the first ``SigLeaf`` found in
    the loaded SBT.  Records are added to a streaming signature; every
    ``WATERMARK_SIZE`` records the SBT is searched and reading stops at the
    first hit.  The best match (if any) is reported and the signature is
    optionally saved to ``--output``.
    """
    from sourmash_lib.sbt import SBT, GraphFactory
    from sourmash_lib.sbtmh import search_minhashes, SigLeaf
    from sourmash_lib.sbtmh import SearchMinHashesFindBest

    parser = argparse.ArgumentParser()
    parser.add_argument('sbt_name', help='name of SBT to search')
    parser.add_argument('inp_file', nargs='?', default='/dev/stdin')
    parser.add_argument('-q',
                        '--quiet',
                        action='store_true',
                        help='suppress non-error output')
    parser.add_argument('-o',
                        '--output',
                        type=argparse.FileType('wt'),
                        help='save signature generated from data here')
    parser.add_argument('--threshold',
                        default=0.05,
                        type=float,
                        help='minimum threshold for matches')
    parser.add_argument(
        '--input-is-protein',
        action='store_true',
        help='Consume protein sequences - no translation needed')
    sourmash_args.add_construct_moltype_args(parser)
    parser.add_argument(
        '-n',
        '--num-hashes',
        type=int,
        default=DEFAULT_N,
        help='number of hashes to use in each sketch (default: %(default)i)')
    parser.add_argument('--name',
                        type=str,
                        default='stdin',
                        help='name to use for generated signature')
    sourmash_args.add_ksize_arg(parser, DEFAULT_LOAD_K)
    args = parser.parse_args(args)
    set_quiet(args.quiet)

    if args.input_is_protein and args.dna:
        notify('WARNING: input is protein, turning off DNA hashing.')
        args.dna = False
        args.protein = True

    # NOTE(review): this only reports the conflict; execution continues with
    # the DNA settings below -- confirm whether it should exit.
    if args.dna and args.protein:
        notify('ERROR: cannot use "watch" with both DNA and protein.')

    if args.dna:
        moltype = 'DNA'
        is_protein = False
    else:
        moltype = 'protein'
        is_protein = True

    tree = SBT.load(args.sbt_name, leaf_loader=SigLeaf.load)

    def get_ksize(tree):
        """Walk nodes in `tree` to find out ksize"""
        for node in tree.nodes.values():
            if isinstance(node, sourmash_lib.sbtmh.SigLeaf):
                return node.data.minhash.ksize

    # deduce ksize from the SBT we are loading
    ksize = args.ksize
    if ksize is None:
        ksize = get_ksize(tree)

    E = sourmash_lib.MinHash(ksize=ksize,
                             n=args.num_hashes,
                             is_protein=is_protein)
    streamsig = sig.SourmashSignature('', E, filename='stdin', name=args.name)

    notify('Computing signature for k={}, {} from stdin', ksize, moltype)

    def do_search():
        # search with whatever the streaming signature holds right now
        search_fn = SearchMinHashesFindBest().search

        results = []
        for leaf in tree.find(search_fn, streamsig, args.threshold):
            results.append((streamsig.similarity(leaf.data), leaf.data))

        return results

    notify('reading sequences from stdin')
    screed_iter = screed.open(args.inp_file)
    watermark = WATERMARK_SIZE

    # iterate over input records; n_read counts records actually added,
    # whereas the enumerate index is one short once the input runs out
    n_read = 0
    for n, record in enumerate(screed_iter):
        # at each watermark, print status & check cardinality
        if n >= watermark:
            notify('\r... read {} sequences', n, end='')
            watermark += WATERMARK_SIZE

            if do_search():
                break

        if args.input_is_protein:
            E.add_protein(record.sequence)
        else:
            E.add_sequence(record.sequence, False)
        n_read += 1

    results = do_search()
    if not results:
        notify('... read {} sequences, no matches found.', n_read)
    else:
        results.sort(key=lambda x: -x[0])  # take best
        similarity, found_sig = results[0]
        print_results('FOUND: {}, at {:.3f}', found_sig.name(), similarity)

    if args.output:
        notify('saving signature to {}', args.output.name)
        sig.save_signatures([streamsig], args.output)