Example #1
def test_sbt_dayhoff_command_index(c):
    # test command-line creation of SBT database with dayhoff sigs
    sigfile1 = utils.get_test_data(
        'prot/dayhoff/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig')
    sigfile2 = utils.get_test_data(
        'prot/dayhoff/GCA_001593935.1_ASM159393v1_protein.faa.gz.sig')

    db_out = c.output('dayhoff.sbt.zip')

    c.run_sourmash('index', db_out, sigfile1, sigfile2, '--scaled', '100',
                   '-k', '57', '--dayhoff')

    db2 = sourmash.load_sbt_index(db_out)

    sig1 = sourmash.load_one_signature(sigfile1)
    sig2 = sourmash.load_one_signature(sigfile2)

    # check reconstruction --
    mh_list = [x.minhash for x in db2.signatures()]
    assert len(mh_list) == 2
    assert sig1.minhash in mh_list
    assert sig2.minhash in mh_list

    # and search, gather
    results = db2.search(sig1,
                         threshold=0.0,
                         ignore_abundance=True,
                         do_containment=False,
                         best_only=False)
    assert len(results) == 2

    results = db2.gather(sig2)
    assert results[0][0] == 1.0
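The test above drives `sourmash index` through the CLI; the same round trip can be sketched with the Python API directly. A minimal sketch, assuming the `create_sbt_index()` / `insert()` / `save()` interface and placeholder file names:

import sourmash

# load two existing signatures (file names are placeholders)
sig1 = sourmash.load_one_signature('genome1.sig')
sig2 = sourmash.load_one_signature('genome2.sig')

# build the SBT in memory, then write it to disk
tree = sourmash.create_sbt_index()
tree.insert(sig1)
tree.insert(sig2)
tree.save('dayhoff.sbt.zip')

# the saved index round-trips through load_sbt_index, as the test checks
db = sourmash.load_sbt_index('dayhoff.sbt.zip')
assert len(list(db.signatures())) == 2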
Example #2
def load_sbt_file(tree_file):
    if not os.path.exists(tree_file):
        sys.stderr.write(f"no sbt file found at {tree_file}\n")
        sys.exit(-1)

    try:
        sbt = sourmash.load_sbt_index(tree_file)
        sys.stderr.write(f"loaded sbt file at {tree_file}\n")
        return sbt
    except Exception:
        sys.stderr.write(f"cannot load sbt file at {tree_file}\n")
        sys.exit(-1)
Example #3
def create_sbt_or_load_existing(tree_file, load_existing=False):
    # adding to / overwriting an existing SBT seems complicated but is
    # managed by sourmash; see sourmash/sbt_storage.py
    if load_existing:
        try:
            sbt = sourmash.load_sbt_index(tree_file)
        except Exception:
            sys.stderr.write(f"\ncannot load sbt file at {tree_file}\n")
            sys.exit(-1)
    else:
        sbt = sourmash.create_sbt_index()
    return sbt
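A brief usage sketch for the helper above, assuming the SBT's `insert()` and `save()` methods and placeholder file names:

# load an existing index (or start a fresh one), add a signature, save it back
sbt = create_sbt_or_load_existing('phages.sbt.zip', load_existing=True)

sig = sourmash.load_one_signature('new_genome.sig')
sbt.insert(sig)
sbt.save('phages.sbt.zip')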
Example #4
def main():
    p = argparse.ArgumentParser()
    p.add_argument('sbt')
    args = p.parse_args()

    db = sourmash.load_sbt_index(args.sbt)
    threshold = THRESHOLD

    # self-consistency check: every leaf signature should find itself
    # in the SBT at the given threshold
    for leaf in db.leaves():
        query = leaf.data
        matches = db.find(search_minhashes, query, threshold)
        matches = [x.data for x in matches]
        if query not in matches:
            print(query)
            assert 0
Example #5
def load_index(filename):
    import sourmash

    index = None
    try:
        index = sourmash.load_sbt_index(filename)
    except (ValueError, EnvironmentError):
        pass

    if index is None:
        try:
            index = sourmash.lca.lca_utils.LCA_Database()
            index.load(filename)
        except (ValueError, EnvironmentError, TypeError):
            pass

    if index is None:
        raise ValueError(f"cannot load '{filename}' as an SBT or LCA database")

    return index
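A short usage sketch for `load_index()`, assuming a sourmash version where SBT and LCA databases share the `search()` interface (the paths and the 0.1 threshold are placeholders):

import sourmash

idx = load_index('refseq-k31.sbt.zip')
query = sourmash.load_one_signature('query.sig', ksize=31)

# search() yields (similarity, match_signature, filename) tuples
for similarity, match, _ in idx.search(query, threshold=0.1):
    print(f'{similarity:.3f}\t{match.name()}')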
Example #6
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("-k", "--ksize", type=int, default=51)
    parser.add_argument("-o", "--output", type=str, default=None)
    parser.add_argument("query")
    parser.add_argument("sbt", nargs="+")

    args = parser.parse_args()

    query = sourmash.load_one_signature(args.query, ksize=args.ksize)

    query_mins = set(query.minhash.get_mins())

    for index in args.sbt:
        sbt = sourmash.load_sbt_index(index)
        for i, dataset in enumerate(sbt.leaves()):
            dataset_mins = dataset.data.minhash.get_mins()
            # drop the cached signature so memory use stays bounded
            del dataset._data
            query_mins -= set(dataset_mins)
            if not query_mins:
                break

            if i % 100 == 0:
                print(
                    f"Progress: {i} sigs processed, query has {len(query_mins)} hashes left"
                )

    new_mh = query.minhash.copy_and_clear()
    if new_mh.track_abundance:
        # the original snippet is truncated here; assumed continuation that
        # keeps the query's original abundances for the remaining hashes
        abunds = query.minhash.get_mins(with_abundance=True)
        new_mh.set_abundances({h: abunds[h] for h in query_mins})
    else:
        new_mh.add_many(query_mins)
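The snippet parses an `-o/--output` option but is cut off before using it; a hedged sketch of how the leftover hashes might be written out as a new signature (wrapping them in a SourmashSignature and the output handling are assumptions, not part of the original):

    # (assumed continuation) wrap the remaining hashes in a new signature
    new_sig = sourmash.SourmashSignature(new_mh, name=query.name() + ' (unmatched)')

    if args.output:
        with open(args.output, 'wt') as fp:
            sourmash.save_signatures([new_sig], fp)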
Example #7
for k in load_routing_keys(args.tags):
    channel.queue_bind(exchange=args.exchange, queue=queue_name, routing_key=k)

# channel.queue_bind(
#         exchange=args.exchange, queue=queue_name, routing_key='phiweger2.#')
# channel.queue_bind(
#         exchange=args.exchange, queue=queue_name, routing_key='*.*.found.#')
#         # exchange=args.exchange, queue=queue_name, routing_key='hello.#')

# Pass user data to callbacks using partial()
# https://github.com/pika/pika/issues/158
# on_message_callback=callback(download=True)

try:
    db = sourmash.load_sbt_index(args.db)  # 'genomes/context.sbt.json'
except (FileNotFoundError, TypeError):
    db = None

channel.basic_consume(queue=queue_name,
                      on_message_callback=partial(callback,
                                                  download=True,
                                                  db=db,
                                                  threshold=args.threshold,
                                                  outfile=args.log,
                                                  outdir=args.outdir))
# outfile can be e.g. ".log" or "-"

# eprint('Waiting for messages. To exit press CTRL+C')
channel.start_consuming()
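The `partial()` trick referenced above (pika issue #158) binds the extra keyword arguments onto pika's standard delivery callback; a minimal sketch of what the bound `callback` might look like (the keyword defaults and the body are placeholders, only the first four positional parameters are fixed by pika):

def callback(ch, method, properties, body,
             download=False, db=None, threshold=0.1,
             outfile='-', outdir='.'):
    # ch, method, properties, body are supplied by pika for every message;
    # the remaining keyword arguments were bound with functools.partial above
    message = body.decode()
    # ... process the message, optionally searching `db` with sourmash ...
    ch.basic_ack(delivery_tag=method.delivery_tag)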
Example #8
def _load_database(filename, traverse, traverse_yield_all):
    """Load file as a database - list of signatures, LCA, SBT, etc.

    Return (db, dbtype), where dbtype is a DatabaseType enum.

    This is an internal function used by other functions in sourmash_args.
    """
    loaded = False
    dbtype = None

    # special case stdin
    if not loaded and filename == '-':
        db = sourmash.load_signatures(sys.stdin, quiet=True, do_raise=True)
        db = list(db)
        loaded = True
        dbtype = DatabaseType.SIGLIST

    # load signatures from directory
    if not loaded and os.path.isdir(filename) and traverse:
        all_sigs = []
        for thisfile in traverse_find_sigs([filename], traverse_yield_all):
            try:
                with open(thisfile, 'rt') as fp:
                    x = sourmash.load_signatures(fp, quiet=True, do_raise=True)
                    siglist = list(x)
                    all_sigs.extend(siglist)
            except (IOError, sourmash.exceptions.SourmashError):
                if traverse_yield_all:
                    continue
                else:
                    raise

        loaded = True
        db = all_sigs
        dbtype = DatabaseType.SIGLIST

    # load signatures from single file
    try:
        # CTB: could make this a generator, with some trickery; but for
        # now, just force into list.
        with open(filename, 'rt') as fp:
            db = sourmash.load_signatures(fp, quiet=True, do_raise=True)
            db = list(db)

        loaded = True
        dbtype = DatabaseType.SIGLIST
    except Exception:
        pass

    if not loaded:  # try load as SBT
        try:
            db = load_sbt_index(filename)
            loaded = True
            dbtype = DatabaseType.SBT
        except Exception:
            pass

    if not loaded:  # try load as LCA
        try:
            db, _, _ = load_single_database(filename)
            loaded = True
            dbtype = DatabaseType.LCA
        except Exception:
            pass

    if not loaded:
        successful_screed_load = False
        it = None
        try:
            # CTB: could be kind of time consuming for big record, but at the
            # moment screed doesn't expose format detection cleanly.
            with screed.open(filename) as it:
                record = next(iter(it))
            successful_screed_load = True
        except Exception:
            pass

        if successful_screed_load:
            raise OSError(
                "Error while reading signatures from '{}' - got sequences instead! Is this a FASTA/FASTQ file?"
                .format(filename))

    if not loaded:
        raise OSError(
            "Error while reading signatures from '{}'.".format(filename))

    return db, dbtype
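The function above tags its return value with a `DatabaseType` enum and expects callers to dispatch on it; a minimal sketch of that enum and the calling pattern, assuming only the three members referenced in the code (SIGLIST, SBT, LCA) and a placeholder path:

from enum import Enum


class DatabaseType(Enum):
    SIGLIST = 1
    SBT = 2
    LCA = 3


db, dbtype = _load_database('refseq-k31.sbt.zip', traverse=False,
                            traverse_yield_all=False)
if dbtype == DatabaseType.SIGLIST:
    print(f'loaded {len(db)} signatures')
else:
    print(f'loaded a {dbtype.name} database')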
Example #9
                found = ix.search(sig,
                                  do_containment=True,
                                  threshold=threshold)
                if found:
                    for val, sig, _ in found:
                        # uvig_317315   SRR1160888_95 length_20936_VirSor...
                        name = sig.name().split('\t')[0]
                        if name not in phage_names:
                            phage_sigs.append(sig)
                        phage_names.add(name)
    return phage_sigs


# Housekeeping
params = {'ksize': args.k, 'n': 0, 'scaled': args.scaled}
ix = load_sbt_index(args.index)

# Search each (pro)phage candidate in the phage database
phage_sigs = sorted(search_ix(args.candidates, ix, params,
                              args.min_containment),
                    key=lambda x: x.name().split('\t')[0])
'''
Example header from gut phage database .fasta:

uvig_256501\tERR1190858_420 length_42504_VirSorter_cat_2

On VirSorter categories:

> Categories 1 and 4 represent the most confident assignments within each type meaning at least one hallmark viral gene is detected and an enrichment in viral‐like genes, 2 and 5 for ‘likely’ predictions containing either an enrichment in viral‐like genes or a hallmark gene, and 3 and 6 are ‘possible’ predictions. -- https://sfamjournals.onlinelibrary.wiley.com/doi/full/10.1111/1462-2920.15186
'''
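The script collects the matching signatures in `phage_sigs` but the excerpt stops before doing anything with them; a hedged sketch of how they could be written to disk with sourmash's `save_signatures()` (the output path is a placeholder, not part of the original):

from sourmash import save_signatures

# write the deduplicated phage matches to a single signature file
with open('phage_matches.sig', 'wt') as fp:
    save_signatures(phage_sigs, fp)
print(f'saved {len(phage_sigs)} phage signatures')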