Пример #1
0
def sbt_index(client, db, cell, query, ksize, nsketch, key, file):
    '''Create a sequence Bloom tree from a cell/ database cursor.

    Every document returned by the cell is minhashed, wrapped in a leaf and
    added to the tree, which is then saved to the given file.

    Planned workflow:

    1. select seqs for tree
    2. assign common id (field derivative.minhash.sbt.ids)
    3. minhash seqs, name == UUID, md5? (think about SBT reuse)
    4. query a different collection/ metagenome against this

    --index {raw, minhash}
    input: all of cell or cursor

    \b
    $ zoo sbt_index --db ref --cell ref --ksize 16 --nsketch 1000 \
    reference
    Initialize SBT.
    Compute minhash signatures for selected documents.
    k-mer size: 16, sketch size: 1000
    \\ 9158 Elapsed Time: 0:01:45
    Save SBT.
    Done.

    \b
    $ sourmash sbt_search --ksize 16 reference survey.fa.sig
    # running sourmash subcommand: sbt_search
    loaded query: survey.fa... (k=16, DNA)
    0.11 0ef85591-d464-4953-915f-f673907b7e8e (Zika reference genome)

    TODO: add query
    TODO: --key arg not working?
    '''
    # Parameters (kept out of the docstring, which presumably doubles as the
    # CLI help text -- note the "\b" markers; confirm against the decorators):
    #   client  -- passed to MongoClient()
    #   db      -- database name looked up on the client
    #   cell    -- collection name looked up on the database
    #   query   -- currently unused; all documents of the cell are processed
    #   ksize   -- k-mer size for both the graph factory and the estimators
    #   nsketch -- sketch size (``n``) of each estimator
    #   key     -- path handed to deep_get() to name each signature/leaf
    #   file    -- destination handed to tree.save()
    c = MongoClient(client)[db][cell]

    print('Initialize SBT.')
    # init SBT
    factory = GraphFactory(ksize=ksize, starting_size=1e5, n_tables=4)
    # 4 .. nt?
    tree = SBT(factory, d=2)  # d .. see "n-ary " in notebook

    print('Compute minhash signatures for selected documents.')
    print('{}{}{}{}'.format(
        'k-mer size: ', ksize, ', sketch size: ', nsketch
        ))
    bar = ProgressBar(max_value=UnknownLength)
    # Count from 1 so the bar shows the number of documents processed so far.
    for counter, d in enumerate(c.find(), start=1):
        # Resolve the name once; it labels both the signature and the leaf.
        name = deep_get(d, key)
        e = Estimators(ksize=ksize, n=nsketch)
        e.add_sequence(d['sequence'], force=True)
        s = SourmashSignature(email='', estimator=e, name=name)
        leaf = SigLeaf(metadata=name, data=s)
        tree.add_node(node=leaf)
        bar.update(counter)
    print('\nSave SBT.')
    tree.save(file)
    print('Done.')
Пример #2
0
def test_deep_set():
    # Walks deep_set()/deep_get() through replace, force and in-place
    # mutation on the module-level fixture ``d``.

    # An existing value is not overwritten by default.
    with pytest.raises(TypeError):
        # Key exists, item assignment not allowed w/ replace=False
        deep_set(d, 'a.b', [1, 2])
    deep_set(d, 'a.b', [1, 2], replace=True)

    # Except when they evaluate to False, i.e. [], {}, "", False, None
    for falsy_path in ('a.c', 'a.d', 'a.e', 'a.f'):
        deep_set(d, falsy_path, 42)
    deep_set(d, 'a.g', {'foo': 'bar'})

    # Objects returned by deep_get() can be modified in place.
    stored_list = deep_get(d, 'a.b')
    stored_list.append(3)
    assert deep_get(d, 'a.b')[2] == 3

    stored_dict = deep_get(d, 'a.g')
    stored_dict.update({'bar': 'foo'})
    assert deep_get(d, 'a.g')['bar'] == 'foo'

    # New, nested keys are only created on request.
    nested_path = 'a.new.nested.path'
    with pytest.raises(KeyError):
        # 'Key not present. Use "force=True" to create key.'
        deep_set(d, nested_path, 5)
    deep_set(d, nested_path, 5, force=True)
    assert deep_get(d, 'a.new') == {'nested': {'path': 5}}

    # A new (nested) key can only be created if the "root" is a dict.
    below_path = nested_path + '.below'
    with pytest.raises(AttributeError):
        # 'int' object has no attribute 'setdefault'
        deep_set(d, below_path, 5, force=True)
    deep_set(d, nested_path, {}, replace=True)
    deep_set(d, below_path, 5, force=True)
    assert deep_get(d, below_path) == 5
Пример #3
0
def add(file, client, db, cell, primkey):
    '''Load a data cell.

    An alternative primary key can be specified to insert documents. This
    is useful in the case where the data cell comes from a collaborator
    who uses a different set of UUIDs than we do. In this case, these
    identifiers do not reflect whether an entry is a duplicate.

    Example:

    \b
    $ zoo add --client localhost:27017 --db zika --cell t5 zoo/data/cell_a.json
    Loading data cell.
    3 documents inserted in cell "t5".
    0 duplicates skipped.
    Done.

    \b
    $ zoo add --db zika --cell t5 --primkey genbank.a zoo/data/cell_b.json
    Loading data cell.
    Index created on field "genbank.a".
    1 documents inserted in cell "t5".
    3 duplicates skipped.
    Done.
    '''
    # Parameters:
    #   file    -- iterable of lines, one JSON document per line
    #   client  -- passed to MongoClient()
    #   db      -- database name looked up on the client
    #   cell    -- collection name looked up on the database
    #   primkey -- field used to detect duplicates; '_id' relies on the
    #              DuplicateKeyError raised by insert_one(), any other value
    #              gets a unique index and an explicit lookup per document
    click.echo('Loading data cell.')
    c = MongoClient(client)[db][cell]
    inserted = 0
    duplicates = 0

    use_default_key = primkey == '_id'
    # index primkey if it does not exist yet
    if not use_default_key and primkey not in c.index_information():
        c.create_index(primkey, unique=True, name=primkey)
        print('Index created on field', '"' + primkey + '".')

    for line in file:
        line = line.strip()
        if not line:
            # Blank lines carry no document and would crash json.loads().
            continue
        d = json.loads(line)
        if use_default_key:
            try:
                c.insert_one(d)
            except DuplicateKeyError:
                duplicates += 1
            else:
                inserted += 1
        elif c.find_one({primkey: deep_get(d, primkey)}):
            # A document with the same primkey value is already stored.
            duplicates += 1
        else:
            c.insert_one(d)
            inserted += 1

    # Always report both counters and the final "Done.", even when no
    # duplicates were seen.
    print(
        inserted, 'documents inserted in cell', '"' + cell + '".')
    print(duplicates, 'duplicates skipped.')
    print('Done.')
Пример #4
0
def minhash(client, db, cell, query, ksize, nsketch, key, file):
    '''Minhash a cell/ database cursor.

    Builds one plain signature per document of the cell and saves the
    whole collection of signatures to the given file.
    '''
    collection = MongoClient(client)[db][cell]

    progress = ProgressBar(max_value=UnknownLength)
    signatures = []
    print('Compute minhash signatures for selected documents.')
    print('{}{}{}{}'.format(
        'k-mer size: ', ksize, ', sketch size: ', nsketch
        ))
    # ``seen`` starts at 1 so the bar reports documents processed so far.
    for seen, doc in enumerate(collection.find(), start=1):
        estimator = Estimators(ksize=ksize, n=nsketch)
        estimator.add_sequence(doc['sequence'], force=True)
        signatures.append(
            SourmashSignature(
                email='', estimator=estimator, name=deep_get(doc, key)))
        progress.update(seen)

    print('\nSave signatures.')
    signature.save_signatures(signatures, fp=file)
    print('Done.')
Пример #5
0
            host = None

        entries = {
            '_id': str(uuid4()),
            'metadata.location': j.country,
            'metadata.date': date,
            'metadata.host': host
            }

        for k, v in entries.items():
            try:  # NaN in host, date
                deep_set(d, k, v, replace=True)
            except AttributeError:
                pass

        deep_get(d, 'metadata.alt_id').append({'genbank': j.genbank})
        deep_get(d, 'metadata.grp_id').append({'segments': j.id})

        # try:  # host: NaN
        #     deep_set(d, 'metadata.host', j.host.lower(),)
        # except AttributeError:
        #     pass

        deep_set(d, 'relative.taxonomy.subtype', j.subtype, force=True)
        deep_set(d, 'derivative.segment_number', j.segment_number, force=True)
        deep_set(d, 'derivative.length', j.seqlen, force=True)
        deep_set(d, 'metadata.age', j.age, force=True)
        deep_set(d, 'metadata.gender', j.gender, force=True)
        deep_set(
            d, 'relative.taxonomy.nomenclature',
            re.search('\((.*)\)', j.isolate).group(1))
Пример #6
0
def test_deep_get():
    # Look up the dotted path 'a.b' on the module-level fixture ``d``.
    found = deep_get(d, 'a.b')
    assert found == 5
Пример #7
0
from sourmash_lib.sbtmh import SigLeaf, search_minhashes
from sourmash_lib.signature import SourmashSignature

# Script: build a sequence Bloom tree (SBT) with one leaf per document
# returned by ``db.ref.find()``.

# k-mer size, used for both the graph factory and every estimator.
KSIZE = 16
# Sketch size passed as ``n`` to every estimator.
N = 1000

# init SBT
factory = GraphFactory(ksize=KSIZE, starting_size=1e5, n_tables=4)
# 4 .. nt?
tree = SBT(factory, d=2)  # d .. see "n-ary " in notebook

# Total number of documents is not known up front, hence UnknownLength.
bar = progressbar.ProgressBar(max_value=progressbar.UnknownLength)
cursor = db.ref.find()
c = 0  # number of documents processed so far, shown by the progress bar
for i in cursor:
    # The value at 'metadata.alt_id.gb' names both the signature and the leaf.
    key = deep_get(i, 'metadata.alt_id.gb')
    seq = i['sequence']  # db.ref.find_one()['sequence']  # 'ACTG...'
    e = Estimators(ksize=KSIZE, n=N)
    e.add_sequence(seq, force=True)  # e.get_hashes()
    s = SourmashSignature(email='', estimator=e, name=key)

    leaf = SigLeaf(metadata=key, data=s)
    tree.add_node(node=leaf)
    c += 1
    bar.update(c)
# Progress bar output recorded from one run:
# \ 9158 Elapsed Time: 0:01:49

# search the last fasta entry against the SBT (">0.95")
# NOTE: ``s`` below is the signature left over from the final loop iteration.
# filtered = tree.find(search_minhashes, s, 0.1)
# matches = [(str(i.metadata), i.data.similarity(s)) for i in filtered]
# [('0.95', 1.0)]  # fasta header, similarity