Exemplo n.º 1
0
def test_jaccard_on_real_data():
    """Compare two real genome signatures at successively smaller sizes.

    The minhashes are compared in both directions before any downsampling
    and again after each `downsample_n` step; the expected values are the
    literals asserted below.
    """
    from sourmash_lib.signature import load_signatures

    def first_minhash(relpath):
        # Only the first signature stored in each file is used.
        path = utils.get_test_data(relpath)
        return list(load_signatures(path))[0].minhash

    mh1 = first_minhash(
        'n10000/GCF_000005845.2_ASM584v2_genomic.fna.gz.sig.gz')
    mh2 = first_minhash(
        'n10000/GCF_000006945.1_ASM694v1_genomic.fna.gz.sig.gz')

    assert mh1.compare(mh2) == 0.0183
    assert mh2.compare(mh1) == 0.0183

    # Each round downsamples the result of the previous round.
    for num, expected in ((1000, 0.011), (100, 0.01), (10, 0.0)):
        mh1 = mh1.downsample_n(num)
        mh2 = mh2.downsample_n(num)
        assert mh1.compare(mh2) == expected
        assert mh2.compare(mh1) == expected
Exemplo n.º 2
0
def test_scaled_on_real_data_2():
    """Compare two real 'scaled100' signatures at coarser scaled values.

    The comparison is rounded and checked in both directions before any
    downsampling and again after each `downsample_scaled` step.
    """
    from sourmash_lib.signature import load_signatures

    def first_minhash(relpath):
        # Only the first signature stored in each file is used.
        path = utils.get_test_data(relpath)
        return list(load_signatures(path))[0].minhash

    mh1 = first_minhash(
        'scaled100/GCF_000005845.2_ASM584v2_genomic.fna.gz.sig.gz')
    mh2 = first_minhash(
        'scaled100/GCF_000006945.1_ASM694v1_genomic.fna.gz.sig.gz')

    assert round(mh1.compare(mh2), 5) == 0.01644
    assert round(mh2.compare(mh1), 5) == 0.01644

    # (scaled value, rounding digits, expected similarity); every round
    # downsamples the result of the previous one.
    rounds = (
        (1000, 4, 0.0187),
        (10000, 3, 0.01),
        (100000, 2, 0.01),
    )
    for scaled, digits, expected in rounds:
        mh1 = mh1.downsample_scaled(scaled)
        mh2 = mh2.downsample_scaled(scaled)
        assert round(mh1.compare(mh2), digits) == expected
        assert round(mh2.compare(mh1), digits) == expected
Exemplo n.º 3
0
    def search(self, args):
        """Search a query sig against one or more signatures; report top match.

        `args` is the argument list for this subcommand: a query signature
        file, one or more signature files to search against, and the
        optional --threshold, -k/--ksize and -f/--force flags.

        Signatures whose similarity to the query is >= the threshold are
        sorted best-first and the top three are printed to stdout;
        progress and "no matches" messages go to stderr.

        Raises Exception when the query file does not yield exactly one
        signature at the selected ksize.
        """
        parser = argparse.ArgumentParser()
        parser.add_argument('query')
        parser.add_argument('against', nargs='+')
        parser.add_argument('--threshold', default=0.08, type=float)
        parser.add_argument('-k', '--ksize', default=DEFAULT_K, type=int)
        parser.add_argument('-f', '--force', action='store_true')
        args = parser.parse_args(args)

        # get the query signature; the context manager closes the handle,
        # and list() makes sure loading is finished before that happens.
        with open(args.query, 'r') as fp:
            sl = list(sig.load_signatures(fp, select_ksize=args.ksize))
        if len(sl) != 1:
            raise Exception("%d query signatures; need exactly one" % len(sl))
        query = sl[0]

        # get the signatures to query
        print('loading db of signatures from %d files' % len(args.against),
              file=sys.stderr)
        against = []
        for filename in args.against:
            # skip the query file itself unless --force was given
            if filename == args.query and not args.force:
                print('excluding query from database (file %s)' % filename,
                      file=sys.stderr)
                continue

            with open(filename, 'r') as fp:
                sl = list(sig.load_signatures(fp, select_ksize=args.ksize))
            against.extend((x, filename) for x in sl)

        # compute query x db, keeping only matches at or above the threshold
        matches = []
        for x, filename in against:
            similarity = query.similarity(x)
            if similarity >= args.threshold:
                matches.append((similarity, x, filename))

        # any matches? sort, show.
        if matches:
            matches.sort(reverse=True, key=lambda m: m[0])
            print('%d matches:' % len(matches))
            for similarity, match, filename in matches[:3]:
                print('\t', match.name(), '\t', "%.3f" % similarity, '\t',
                      filename)
        else:
            print('** no matches in %d signatures' % len(against),
                  file=sys.stderr)
0
    def load(info, dirname):
        """Build a SigLeaf from the signature file described by `info`.

        The file is info['filename'] inside `dirname`. It must hold exactly
        one signature; any other count makes the unpacking below raise
        ValueError.
        """
        from sourmash_lib import signature

        sigpath = os.path.join(dirname, info['filename'])
        loaded = list(signature.load_signatures(sigpath))
        (only_sig,) = loaded  # should only be one signature
        return SigLeaf(info['metadata'], only_sig, name=info['name'])
Exemplo n.º 5
0
def test_binary_nary_tree(SBTImplementation):
    """Check that trees built with d=2 (default), 5 and 10 agree on a search.

    Every signature matching "urchin/lividus*.sig" is added to all three
    trees; the last leaf added is then searched for in each tree and the
    result sets must be identical and non-empty.
    """
    factory = GraphFactory(31, 1e5, 4)
    trees = {}
    trees[2] = SBTImplementation(factory)
    trees[5] = SBTImplementation(factory, d=5)
    trees[10] = SBTImplementation(factory, d=10)

    for f in glob("urchin/lividus*.sig"):
        with open(f, 'r') as data:
            sig = signature.load_signatures(data)
            # build the leaf while the file handle is still open
            leaf = SigLeaf(os.path.basename(f), sig[0])
        for tree in trees.values():
            tree.add_node(leaf)
        to_search = leaf

    results = {}
    print('*' * 60)
    print("{}:".format(to_search.metadata))
    for d, tree in trees.items():
        results[d] = [
            str(s) for s in tree.find(search_minhashes, to_search.data, 0.1)
        ]
    print(*results[2], sep='\n')

    assert set(results[2]) == set(results[5])
    assert set(results[5]) == set(results[10])
    # `results` always has three keys, so checking its length proved
    # nothing; check that the search actually returned hits instead.
    assert len(results[2]) > 0
Exemplo n.º 6
0
def test_tree_save_load(SBTImplementation):
    """Save a tree as 'urchin', reload it, and compare search results.

    The last leaf added is used as the search target both before saving
    and after loading 'urchin.sbt.json'.
    """
    factory = GraphFactory(31, 1e5, 4)
    tree = SBTImplementation(factory)
    for path in glob("urchin/lividus*.sig"):
        with open(path, 'r') as handle:
            sig = signature.load_signatures(handle)
        leaf = SigLeaf(os.path.basename(path), sig[0])
        tree.add_node(leaf)
        to_search = leaf

    def search_and_report(target):
        # Print the banner, run the search on `target`, print the hits.
        print('*' * 60)
        print("{}:".format(to_search.metadata))
        hits = [
            str(s)
            for s in target.find(search_minhashes, to_search.data, 0.1)
        ]
        print(*hits, sep='\n')
        return hits

    old_result = search_and_report(tree)

    tree.save('urchin')
    tree2 = SBTImplementation.load('urchin.sbt.json',
                                   leaf_loader=SigLeaf.load)

    new_result = search_and_report(tree2)

    assert set(old_result) == set(new_result)
    assert len(old_result) > 0
Exemplo n.º 7
0
def test_tree_save_load(n_children):
    """Save an SBT into a temporary directory, reload it, compare searches.

    `n_children` is passed as the `d` argument of the SBT constructor. The
    last leaf added is the search target before and after the round trip.
    """
    factory = GraphFactory(31, 1e5, 4)
    tree = SBT(factory, d=n_children)

    for sigfile in utils.SIG_FILES:
        loaded = next(signature.load_signatures(utils.get_test_data(sigfile)))
        leaf = SigLeaf(os.path.basename(sigfile), loaded)
        tree.add_node(leaf)
        to_search = leaf

    def search_and_report(target):
        # Print the banner, run the search on `target`, print the hits.
        print('*' * 60)
        print("{}:".format(to_search.metadata))
        hits = {
            str(s)
            for s in target.find(search_minhashes, to_search.data, 0.1)
        }
        print(*hits, sep='\n')
        return hits

    old_result = search_and_report(tree)

    with utils.TempDirectory() as location:
        saved_at = os.path.join(location, 'demo')
        tree.save(saved_at)
        tree = SBT.load(saved_at, leaf_loader=SigLeaf.load)

        new_result = search_and_report(tree)

        assert old_result == new_result
Exemplo n.º 8
0
def test_binary_nary_tree():
    """Check that SBTs built with d=2 (default), 5 and 10 agree on a search.

    Every file in utils.SIG_FILES is added to all three trees; each tree
    must report that many leaves, and searching for the last leaf added
    must return the same set of results from every tree.
    """
    factory = GraphFactory(31, 1e5, 4)
    trees = {}
    trees[2] = SBT(factory)
    trees[5] = SBT(factory, d=5)
    trees[10] = SBT(factory, d=10)

    n_leaves = 0
    for n_leaves, sigfile in enumerate(utils.SIG_FILES, 1):
        loaded = next(signature.load_signatures(utils.get_test_data(sigfile)))
        leaf = SigLeaf(os.path.basename(sigfile), loaded)
        for tree in trees.values():
            tree.add_node(leaf)
        to_search = leaf

    assert all(len(t.leaves()) == n_leaves for t in trees.values())

    print('*' * 60)
    print("{}:".format(to_search.metadata))
    results = {
        d: {
            str(s)
            for s in tree.find(search_minhashes, to_search.data, 0.1)
        }
        for d, tree in trees.items()
    }
    print(*results[2], sep='\n')

    assert results[2] == results[5]
    assert results[5] == results[10]
Exemplo n.º 9
0
def test_sbt_fsstorage():
    """Save an SBT through an FSStorage, reload it, and compare searches.

    Also checks that the storage path reported by the reloaded tree and the
    '.fstree' directory both exist under the temporary location.
    """
    factory = GraphFactory(31, 1e5, 4)
    with utils.TempDirectory() as location:
        tree = SBT(factory)

        for sigfile in utils.SIG_FILES:
            loaded = next(
                signature.load_signatures(utils.get_test_data(sigfile)))
            leaf = SigLeaf(os.path.basename(sigfile), loaded)
            tree.add_node(leaf)
            to_search = leaf

        def search_and_report(target):
            # Print the banner, run the search on `target`, print the hits.
            print('*' * 60)
            print("{}:".format(to_search.metadata))
            hits = {
                str(s)
                for s in target.find(search_minhashes, to_search.data, 0.1)
            }
            print(*hits, sep='\n')
            return hits

        old_result = search_and_report(tree)

        storage_dir = os.path.join(location, '.fstree')
        tree_path = os.path.join(location, 'tree')
        with FSStorage(storage_dir) as storage:
            tree.save(tree_path, storage=storage)

        tree = SBT.load(tree_path, leaf_loader=SigLeaf.load)
        new_result = search_and_report(tree)

        assert old_result == new_result

        assert os.path.exists(os.path.join(location, tree.storage.path))
        assert os.path.exists(storage_dir)
Exemplo n.º 10
0
def test_sbt_combine(n_children):
    """Check that combining two partial SBTs matches one tree built whole.

    `tree` receives every signature in utils.SIG_FILES; the first four also
    go to `tree_1` and the remainder to `tree_2`. After
    `tree_1.combine(tree_2)` the combined tree must have the same leaves
    and the same search results as `tree`. Finally, adding one more node
    must land on the position this test computes as the next empty one.
    """
    factory = GraphFactory(31, 1e5, 4)
    tree = SBT(factory, d=n_children)
    tree_1 = SBT(factory, d=n_children)
    tree_2 = SBT(factory, d=n_children)

    n_leaves = 0
    for f in utils.SIG_FILES:
        sig = next(signature.load_signatures(utils.get_test_data(f)))
        leaf = SigLeaf(os.path.basename(f), sig)
        tree.add_node(leaf)
        # first four leaves go to tree_1, everything after that to tree_2
        if n_leaves < 4:
            tree_1.add_node(leaf)
        else:
            tree_2.add_node(leaf)
        n_leaves += 1

    tree_1.combine(tree_2)

    # compare leaves by their string form
    t1_leaves = {str(l) for l in tree_1.leaves()}
    t_leaves = {str(l) for l in tree.leaves()}

    assert len(t1_leaves) == n_leaves
    assert len(t_leaves) == len(t1_leaves)
    assert t1_leaves == t_leaves

    # search both trees for the first signature in utils.SIG_FILES
    to_search = next(signature.load_signatures(
                        utils.get_test_data(utils.SIG_FILES[0])))
    t1_result = {str(s) for s in tree_1.find(search_minhashes,
                                             to_search, 0.1)}
    tree_result = {str(s) for s in tree.find(search_minhashes,
                                             to_search, 0.1)}
    assert t1_result == tree_result

    # TODO: save and load both trees

    # check if adding a new node will use the next empty position
    # NOTE(review): 0 doubles as the "nothing found" sentinel, so an empty
    # slot at position 0 would be treated as "no empty slot" -- confirm
    # that position 0 can never be None.
    next_empty = 0
    for n, d in tree_1.nodes.items():
        if d is None:
            next_empty = n
            break
    if not next_empty:
        # no empty slot seen: expect the position right after the last key
        # iterated (relies on `n` leaking out of the loop above)
        next_empty = n + 1

    # `leaf` is still the last leaf created in the loading loop above
    tree_1.add_node(leaf)
    assert tree_1.max_node == next_empty
Exemplo n.º 11
0
def test_do_sourmash_check_protein_comparisons():
    """Check 2 x 2 protein comparisons with E. coli genes.

    Runs `sourmash compute` once on a protein file and once on a DNA file
    with --protein, loads the two signatures each run produces, and checks
    the four amino-acid vs. translated similarities.
    """
    def by_name(sigs):
        # I'm not sure why load_signatures is randomizing order, but ok.
        return sorted(sigs, key=lambda s: s.name())

    def first_word(s):
        return s.name().split()[0]

    with utils.TempDirectory() as location:
        testdata1 = utils.get_test_data('ecoli.faa')
        cmd = ['compute', '-k', '21', '--input-is-protein', '--singleton',
               testdata1]
        status, out, err = utils.runscript('sourmash', cmd,
                                           in_directory=location)
        sig1 = os.path.join(location, 'ecoli.faa.sig')
        assert os.path.exists(sig1)

        testdata2 = utils.get_test_data('ecoli.genes.fna')
        cmd = ['compute', '-k', '21', '--protein', '--no-dna', '--singleton',
               testdata2]
        status, out, err = utils.runscript('sourmash', cmd,
                                           in_directory=location)
        sig2 = os.path.join(location, 'ecoli.genes.fna.sig')
        assert os.path.exists(sig2)

        sig1_aa, sig2_aa = by_name(list(signature.load_signatures(sig1)))
        sig1_trans, sig2_trans = by_name(
            list(signature.load_signatures(sig2)))

        name1 = first_word(sig1_aa)
        assert name1 == 'NP_414543.1'
        name2 = first_word(sig2_aa)
        assert name2 == 'NP_414544.1'
        name3 = first_word(sig1_trans)
        assert name3 == 'gi|556503834:2801-3733'
        name4 = first_word(sig2_trans)
        assert name4 == 'gi|556503834:337-2799'

        # (aa name, aa sig, translated name, translated sig, expected)
        pairs = [
            (name1, sig1_aa, name3, sig1_trans, 0.0),
            (name2, sig2_aa, name3, sig1_trans, 0.273),
            (name1, sig1_aa, name4, sig2_trans, 0.174),
            (name2, sig2_aa, name4, sig2_trans, 0.0),
        ]

        # print all four values first so they show up even when one of the
        # assertions below fails
        for aa_name, aa_sig, tr_name, tr_sig, _ in pairs:
            print(aa_name, tr_name, round(aa_sig.similarity(tr_sig), 3))

        for _, aa_sig, _, tr_sig, expected in pairs:
            assert round(aa_sig.similarity(tr_sig), 3) == expected
Exemplo n.º 12
0
def test_load_minified(track_abundance):
    """Re-saving a signature file gives shorter text without newlines."""
    sigfile = utils.get_test_data('genome-s10+s11.sig')
    minified = save_signatures(load_signatures(sigfile))

    with open(sigfile, 'r') as handle:
        original_text = handle.read()

    assert len(minified) < len(original_text)
    assert '\n' not in minified
Exemplo n.º 13
0
    def compare(self, args):
        """Compare multiple signature files and create a distance matrix.

        `args` is the argument list for this subcommand: one or more
        signature files plus optional -k/--ksize and -o/--output.

        Every loaded signature is compared with every other one; each
        matrix cell holds the value of `similarity()` for that pair. One
        row per signature is printed to stdout. With --output, the matrix
        is written with numpy.save and the signature names go to
        '<output>.labels.txt'.

        Exits with status -1 when no signatures were loaded at all.
        """
        import numpy

        parser = argparse.ArgumentParser()
        parser.add_argument('signatures', nargs='+')
        parser.add_argument('-k', '--ksize', type=int, default=DEFAULT_K)
        parser.add_argument('-o', '--output')
        args = parser.parse_args(args)

        # load in the various signatures
        siglist = []
        for filename in args.signatures:
            print('loading', filename, file=sys.stderr)
            # read through a context manager so the handle is closed
            with open(filename) as fp:
                data = fp.read()
            # list() keeps the emptiness check below meaningful even if the
            # loader hands back a lazy iterable
            loaded = list(sig.load_signatures(data, select_ksize=args.ksize))
            if not loaded:
                print('warning: no signatures loaded at given ksize from %s' %
                      filename,
                      file=sys.stderr)
            siglist.extend(loaded)

        if not siglist:
            print('no signatures!', file=sys.stderr)
            sys.exit(-1)

        # build the distance matrix
        D = numpy.zeros([len(siglist), len(siglist)])
        numpy.set_printoptions(precision=3, suppress=True)

        # do all-by-all calculation; enumerate supplies the row index, so
        # no manual counter is needed
        labeltext = []
        for i, E in enumerate(siglist):
            for j, E2 in enumerate(siglist):
                D[i][j] = E.similarity(E2)

            print('%d-%20s\t%s' % (
                i,
                E.name(),
                D[i, :, ],
            ))
            labeltext.append(E.name())

        print('min similarity in matrix:', numpy.min(D), file=sys.stderr)

        # shall we output a matrix?
        if args.output:
            labeloutname = args.output + '.labels.txt'
            print('saving labels to:', labeloutname, file=sys.stderr)
            with open(labeloutname, 'w') as fp:
                fp.write("\n".join(labeltext))

            print('saving distance matrix to:', args.output, file=sys.stderr)
            with open(args.output, 'wb') as fp:
                numpy.save(fp, D)
Exemplo n.º 14
0
def test_do_sourmash_check_knowngood_dna_comparisons():
    """Check a computed DNA signature against a known good signature.

    The known good signature was calculated by
    utils/compute-dna-mh-another-way.py.
    """
    with utils.TempDirectory() as location:
        testdata1 = utils.get_test_data('ecoli.genes.fna')
        cmd = ['compute', '-k', '21', '--singleton', '--dna', testdata1]
        status, out, err = utils.runscript('sourmash', cmd,
                                           in_directory=location)
        sigpath = os.path.join(location, 'ecoli.genes.fna.sig')
        assert os.path.exists(sigpath)

        # exactly two signatures are expected; only the second one in name
        # order is compared with the reference
        computed = list(signature.load_signatures(sigpath))
        sig1, sig2 = sorted(computed, key=lambda s: s.name())

        knowngood = utils.get_test_data('benchmark.dna.sig')
        good = list(signature.load_signatures(knowngood))[0]

        assert sig2.similarity(good) == 1.0
Exemplo n.º 15
0
def test_roundtrip(track_abundance):
    """Save one signature, load it back, and check it matches the original."""
    mh = sourmash_lib.MinHash(n=1, ksize=20, track_abundance=track_abundance)
    mh.add("AT" * 10)
    original = SourmashSignature(mh)

    serialized = save_signatures([original])
    restored = list(load_signatures(serialized))[0]
    restored_mh = restored.minhash  # attribute access only; value unused

    assert original.similarity(restored) == 1.0
    assert restored.similarity(original) == 1.0
Exemplo n.º 16
0
def test_roundtrip(track_abundance):
    """Save one Estimators-based signature, reload it, and compare both ways."""
    est = sourmash_lib.Estimators(n=1, ksize=20,
                                  track_abundance=track_abundance)
    est.add("AT" * 10)
    original = SourmashSignature('*****@*****.**', est)

    serialized = save_signatures([original])
    restored = list(load_signatures(serialized))[0]
    restored_est = restored.estimator  # attribute access only; value unused

    assert original.similarity(restored) == 1.0
    assert restored.similarity(original) == 1.0
Exemplo n.º 17
0
def test_do_sourmash_compute():
    """`sourmash compute` on short.fa writes short.fa.sig named after it."""
    with utils.TempDirectory() as location:
        testdata1 = utils.get_test_data('short.fa')
        cmd = ['compute', testdata1]
        status, out, err = utils.runscript('sourmash', cmd,
                                           in_directory=location)

        expected_path = os.path.join(location, 'short.fa.sig')
        assert os.path.exists(expected_path)

        # only the first signature in the output file is inspected
        first = next(signature.load_signatures(expected_path))
        assert first.name().endswith('short.fa')
Exemplo n.º 18
0
def test_roundtrip_empty(track_abundance):
    """Round-trip a signature whose estimator never had anything added."""
    # edge case, but: empty estimator? :)
    est = sourmash_lib.Estimators(n=1, ksize=20,
                                  track_abundance=track_abundance)
    original = SourmashSignature('*****@*****.**', est)

    serialized = save_signatures([original])
    restored = list(load_signatures(serialized))[0]
    restored_est = restored.estimator  # attribute access only; value unused

    assert original.similarity(restored) == 0
    assert restored.similarity(original) == 0
Exemplo n.º 19
0
    def sbt_index(self, args):
        """Build an SBT from signature files and save it under `sbt_name`.

        `args` is the argument list for this subcommand: the SBT name, one
        or more signature files (or directories when --traverse-directory
        is given), plus -k/--ksize, -x/--bf-size and the molecule-type
        options added by sourmash_args.add_moltype_args.

        Raises Exception when both --dna and --protein are requested.
        """
        from sourmash_lib.sbt import SBT, GraphFactory
        from sourmash_lib.sbtmh import search_minhashes, SigLeaf

        parser = argparse.ArgumentParser()
        parser.add_argument('sbt_name')
        parser.add_argument('signatures', nargs='+')
        parser.add_argument('-k', '--ksize', type=int, default=DEFAULT_K)
        parser.add_argument('--traverse-directory', action='store_true')
        parser.add_argument('-x', '--bf-size', type=float, default=1e5)

        sourmash_args.add_moltype_args(parser)

        args = parser.parse_args(args)

        # settle on a single molecule type; DNA unless --protein was given
        if not args.protein:
            args.dna = True
            moltype = 'dna'
        elif args.dna is True:
            raise Exception('cannot specify both --dna and --protein!')
        else:
            args.dna = False
            moltype = 'protein'

        factory = GraphFactory(1, args.bf_size, 4)
        tree = SBT(factory)

        if args.traverse_directory:
            # treat each argument as a directory and collect every '.sig'
            # file found anywhere beneath it
            inp_files = [
                os.path.join(root, name)
                for dirname in args.signatures
                for root, dirs, files in os.walk(dirname)
                for name in files
                if name.endswith('.sig')
            ]
        else:
            inp_files = list(args.signatures)

        print('loading {} files into SBT'.format(len(inp_files)))

        n = 0
        for f in inp_files:
            loaded = sig.load_signatures(f, select_ksize=args.ksize,
                                         select_moltype=moltype)
            for ss in loaded:
                # each leaf is labelled with the signature's md5sum
                tree.add_node(SigLeaf(ss.md5sum(), ss))
                n += 1

        print('loaded {} sigs; saving SBT under "{}".'.format(n,
                                                              args.sbt_name))
        tree.save(args.sbt_name)
Exemplo n.º 20
0
def test_roundtrip_empty(track_abundance):
    """Round-trip a signature whose minhash never had anything added."""
    # edge case, but: empty minhash? :)
    mh = sourmash_lib.MinHash(n=1, ksize=20, track_abundance=track_abundance)
    original = SourmashSignature(mh)

    serialized = save_signatures([original])
    restored = list(load_signatures(serialized))[0]
    restored_mh = restored.minhash  # attribute access only; value unused

    assert original.similarity(restored) == 0
    assert restored.similarity(original) == 0
Exemplo n.º 21
0
def test_do_sourmash_compute_multik():
    """`compute -k 21,31` writes two signatures, one per ksize."""
    with utils.TempDirectory() as location:
        testdata1 = utils.get_test_data('short.fa')
        cmd = ['compute', '-k', '21,31', testdata1]
        status, out, err = utils.runscript('sourmash', cmd,
                                           in_directory=location)
        outfile = os.path.join(location, 'short.fa.sig')
        assert os.path.exists(outfile)

        siglist = list(signature.load_signatures(outfile))
        assert len(siglist) == 2

        ksizes = {s.estimator.ksize for s in siglist}
        assert 21 in ksizes
        assert 31 in ksizes
Exemplo n.º 22
0
def test_save_minified(track_abundance):
    """Saved signatures form a single line and reload with their names."""
    named = []
    for ksize, label in ((20, "foo"), (25, "bar baz")):
        mh = sourmash_lib.MinHash(n=1, ksize=ksize,
                                  track_abundance=track_abundance)
        named.append(SourmashSignature(mh, name=label))

    text = save_signatures(named)
    assert '\n' not in text
    assert len(text.split('\n')) == 1

    reloaded = list(load_signatures(text))
    assert len(reloaded) == 2
    # order is not checked, only that both names come back
    assert any(s.name() == 'foo' for s in reloaded)
    assert any(s.name() == 'bar baz' for s in reloaded)
Exemplo n.º 23
0
def main():
    """Downsample every signature in a file and write them to stdout.

    Command-line arguments: the signature file, and --scaled (default
    10000), which is passed to `downsample_scaled` for each minhash.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('sigfile')
    parser.add_argument('--scaled', default=10000, type=int)
    args = parser.parse_args()

    sigs = list(signature.load_signatures(args.sigfile))
    print('loaded {} signatures'.format(len(sigs)), file=sys.stderr)

    # Replace each minhash in place; the signatures keep their order.
    for loaded in sigs:
        loaded.minhash = loaded.minhash.downsample_scaled(args.scaled)

    signature.save_signatures(list(sigs), sys.stdout)
Exemplo n.º 24
0
def test_roundtrip_max_hash(track_abundance):
    """A minhash built with max_hash=10 keeps max_hash through save/load."""
    mh = sourmash_lib.MinHash(
        n=0, ksize=20, track_abundance=track_abundance, max_hash=10)
    mh.add_hash(5)
    original = SourmashSignature(mh)

    serialized = save_signatures([original])
    restored = list(load_signatures(serialized))[0]
    restored_mh = restored.minhash

    assert mh.max_hash == restored_mh.max_hash

    assert original.similarity(restored) == 1.0
    assert restored.similarity(original) == 1.0
Exemplo n.º 25
0
def test_roundtrip_seed(track_abundance):
    """An estimator built with seed=10 keeps its seed through save/load."""
    est = sourmash_lib.Estimators(
        n=1, ksize=20, track_abundance=track_abundance, seed=10)
    est.mh.add_hash(5)
    original = SourmashSignature('*****@*****.**', est)

    serialized = save_signatures([original])
    restored = list(load_signatures(serialized))[0]
    restored_est = restored.estimator

    assert est.seed == restored_est.seed

    assert original.similarity(restored) == 1.0
    assert restored.similarity(original) == 1.0
Exemplo n.º 26
0
def test_save_load_multisig(track_abundance):
    """Two distinct signatures saved together are both present on reload."""
    mh_k20 = sourmash_lib.MinHash(n=1, ksize=20,
                                  track_abundance=track_abundance)
    sig1 = SourmashSignature(mh_k20)

    mh_k25 = sourmash_lib.MinHash(n=1, ksize=25,
                                  track_abundance=track_abundance)
    sig2 = SourmashSignature(mh_k25)

    serialized = save_signatures([sig1, sig2])
    reloaded = list(load_signatures(serialized))

    print(serialized)

    assert len(reloaded) == 2
    # order not guaranteed, note.
    assert sig1 in reloaded
    assert sig2 in reloaded
    assert sig1 != sig2
Exemplo n.º 27
0
def test_do_sourmash_compute_with_seed():
    """`compute --seed 43` stores seed 43 on both signatures it writes."""
    with utils.TempDirectory() as location:
        testdata1 = utils.get_test_data('short.fa')
        outfile = os.path.join(location, 'FOO.xxx')
        cmd = ['compute', '-k', '21,31', '--seed', '43', testdata1,
               '-o', outfile]
        status, out, err = utils.runscript('sourmash', cmd,
                                           in_directory=location)
        assert os.path.exists(outfile)

        siglist = list(signature.load_signatures(outfile))
        assert len(siglist) == 2

        seeds = [s.estimator.seed for s in siglist]
        assert len(seeds) == 2
        assert set(seeds) == {43}
Exemplo n.º 28
0
def test_do_sourmash_compute_multik_with_protein():
    """`compute -k 21,30 --protein` writes four signatures over both ksizes."""
    with utils.TempDirectory() as location:
        testdata1 = utils.get_test_data('short.fa')
        cmd = ['compute', '-k', '21,30', '--protein', testdata1]
        status, out, err = utils.runscript('sourmash', cmd,
                                           in_directory=location)
        outfile = os.path.join(location, 'short.fa.sig')
        assert os.path.exists(outfile)

        # the signatures are loaded from the file's text, not from its path
        with open(outfile, 'rt') as fp:
            sigdata = fp.read()

        siglist = list(signature.load_signatures(sigdata))
        assert len(siglist) == 4

        ksizes = {s.estimator.ksize for s in siglist}
        assert 21 in ksizes
        assert 30 in ksizes
Exemplo n.º 29
0
    def dump(self, args):
        """Write the mins of each signature file to '<filename>.dump.txt'.

        `args` is the argument list for this subcommand: one or more
        signature files plus optional -k/--ksize. Each file must yield
        exactly one signature at the selected ksize (checked with an
        assert); its mins are written space-separated on a single line.
        """
        parser = argparse.ArgumentParser()
        parser.add_argument('filenames', nargs='+')
        parser.add_argument('-k', '--ksize', type=int, default=DEFAULT_K)
        # parse the arguments handed to this method instead of reaching
        # into sys.argv, which silently ignored the `args` parameter
        args = parser.parse_args(args)

        for filename in args.filenames:
            # read through a context manager so the handle is closed
            with open(filename) as fp:
                data = fp.read()
            print('loading', filename)
            siglist = list(sig.load_signatures(data, select_ksize=args.ksize))
            assert len(siglist) == 1

            s = siglist[0]

            # `with` closes the output file even if the write fails
            with open(filename + '.dump.txt', 'w') as fp:
                fp.write(" ".join((map(str, s.estimator.mh.get_mins()))))
Exemplo n.º 30
0
    def dump(self, args):
        """Write the mins of each signature file to '<filename>.dump.txt'.

        `args` is the argument list for this subcommand: one or more
        signature files plus optional -k/--ksize. Each file must yield
        exactly one signature at the selected ksize (checked with an
        assert); its mins are written space-separated on a single line.
        """
        parser = argparse.ArgumentParser()
        parser.add_argument('filenames', nargs='+')
        parser.add_argument('-k', '--ksize', type=int, default=DEFAULT_K,
                            help='k-mer size (default: %(default)i)')
        args = parser.parse_args(args)

        for filename in args.filenames:
            print('loading', filename)
            siglist = sig.load_signatures(filename, select_ksize=args.ksize)
            siglist = list(siglist)
            assert len(siglist) == 1

            s = siglist[0]

            # `with` closes the output file even if the write fails, which
            # the previous open()/close() pair did not
            with open(filename + '.dump.txt', 'w') as fp:
                fp.write(" ".join((map(str, s.estimator.mh.get_mins()))))