Пример #1
0
def classify_signature(query_sig, dblist, threshold):
    """
    Assign a lineage to 'query_sig' from the databases in 'dblist'.

    An LCA is only taken into account when at least 'threshold' hashvals
    support it.

    Returns a pair (lineage, status):

       * 'status' is one of 'nomatch', 'found', or 'disagree';
       * 'lineage' is whatever lca_utils.find_lca reports for the combined
         tree (described by the original author as a tuple of LineagePairs),
         or an empty list when the status is 'nomatch'.

    The work happens in two stages:

       * first, collect the lineage assignments for every hashval in the
         query.  (For e.g. kraken, this is done in the database preparation
         step; here, we do it dynamically each time.)
       * then, across all the hashvals, count how often each lineage
         shows up, drop the low-abundance ones (under threshold), and
         determine the LCA of those that remain.
    """
    # stage 1: per-hashval assignments, pooled over every database
    hashvals = query_sig.minhash.get_mins()
    assignments = lca_utils.gather_assignments(hashvals, dblist)

    # stage 2: one LCA per hashval, tallied; most frequent first
    counts = lca_utils.count_lca_for_assignments(assignments)
    ranked = counts.most_common()
    debug(ranked)

    # Combine the "significant" LCAs into a single tree.  The list is in
    # descending count order, so the first entry below threshold ends it.
    tree = {}
    for lineage, n_hashvals in ranked:
        if n_hashvals < threshold:
            break
        lca_utils.build_tree([lineage], tree)

    # nothing reached the threshold
    if not tree:
        return [], 'nomatch'

    # now find lowest-common-ancestor of the combined tree.
    lca, reason = lca_utils.find_lca(tree)
    if reason == 0:  # leaf node
        label, status = 'END', 'found'
    else:  # internal node => disagreement
        label, status = 'MULTI', 'disagree'
    debug(label, lca)

    debug('lineage is:', lca)

    return lca, status
Пример #2
0
def test_build_tree_4():
    """Adding a lineage to an existing tree merges it under the shared root."""
    root = LineagePair('rank1', 'name1')
    child_a = LineagePair('rank2', 'name2a')
    child_b = LineagePair('rank2', 'name2b')

    # two separate calls: the second one extends the tree from the first.
    initial = build_tree([[root, child_a]])
    merged = build_tree([[root, child_b]], initial)

    expected = {root: {child_a: {}, child_b: {}}}
    assert merged == expected
Пример #3
0
def make_lca_counts(dblist):
    """
    Tally how many hashvals resolve to each LCA across 'dblist'.

    Every database in 'dblist' must provide 'hashval_to_lineage_id'
    (hashval -> iterable of lineage ids) and 'lineage_dict'
    (lineage id -> lineage).

    Returns a defaultdict(int) keyed by the LCA found for a hashval, whose
    values are the number of hashvals with that LCA.

    CTB this could usefully be converted to a generator function.
    """
    # pool the lineages seen for each hashval over all of the databases
    lineages_by_hashval = defaultdict(set)
    for db in dblist:
        id_to_lineage = db.lineage_dict
        for hashval, lineage_ids in db.hashval_to_lineage_id.items():
            lineages_by_hashval[hashval].update(
                id_to_lineage[lid] for lid in lineage_ids)

    # one tree per hashval -> one LCA per hashval -> tally
    lca_counts = defaultdict(int)
    for pooled in lineages_by_hashval.values():
        debug(pooled)

        # the tree built from the pooled lineages exposes their
        # lowest common ancestor: either a leaf, or the first node
        # that has more than one child.
        lca, _reason = lca_utils.find_lca(lca_utils.build_tree(pooled))
        lca_counts[lca] += 1

    return lca_counts
Пример #4
0
def test_find_lca_2():
    """Two lineages that split at rank2 have their LCA at the shared rank1."""
    shared = LineagePair('rank1', 'name1')
    lineages = [
        [shared, LineagePair('rank2', 'name2a')],
        [shared, LineagePair('rank2', 'name2b')],
    ]

    result = find_lca(build_tree(lineages))

    # second element is 2 here; presumably the number of children at the
    # node where the lineages diverge -- confirm against find_lca.
    expected = ((shared,), 2)
    assert result == expected
Пример #5
0
def test_build_tree():
    """A single two-rank lineage becomes a two-level nested dict."""
    top = LineagePair('rank1', 'name1')
    bottom = LineagePair('rank2', 'name2')

    built = build_tree([[top, bottom]])

    expected = {top: {bottom: {}}}
    assert built == expected
Пример #6
0
def test_find_lca():
    """With a single lineage in the tree, the LCA is that whole lineage."""
    top = LineagePair('rank1', 'name1')
    bottom = LineagePair('rank2', 'name2')

    result = find_lca(build_tree([[top, bottom]]))

    # the full path is returned, paired with 0.
    expected = ((top, bottom), 0)
    assert result == expected
Пример #7
0
def test_build_tree_5():
    """build_tree raises ValueError when given no lineages at all."""
    no_lineages = []
    with pytest.raises(ValueError):
        build_tree(no_lineages)
Пример #8
0
def test_build_tree_3():
    """A LineagePair with an empty name ('rank2' here) is left out of the tree."""
    named = LineagePair('rank1', 'name1')
    unnamed = LineagePair('rank2', '')

    built = build_tree([[named, unnamed]])

    expected = {named: {}}
    assert built == expected
Пример #9
0
def main():
    """
    Report the LCA lineage of every hashval that is common across samples.

    Command-line arguments (read from sys.argv):

       * 'revindex' - passed to revindex_utils.HashvalRevindex.
       * 'db' - one or more databases, loaded with lca_utils.load_databases.
       * '--abundance-threshold' - a sample only counts towards a hashval
         if the hashval's abundance in it is at least this value.
       * '--sample-threshold' - hashvals counted in fewer samples than this
         are not examined.
       * '-k/--ksize' - accepted, but not used by this function.

    For each hashval that passes the sample threshold, the lineages assigned
    to it by all databases are pooled and their lowest common ancestor is
    printed to stderr.  Hashvals with no lineage in any database are tallied
    by the number of samples they occur in; that distribution is written to
    stdout as CSV ('commonality,n,sum_n'), where 'sum_n' is a running total.
    """
    p = argparse.ArgumentParser()
    p.add_argument('-k', '--ksize', default=DEFAULT_KSIZE, type=int)
    p.add_argument('--sample-threshold',
                   default=DEFAULT_SAMPLE_THRESHOLD,
                   type=int)
    p.add_argument('--abundance-threshold',
                   default=DEFAULT_ABUND_THRESHOLD,
                   type=int)
    p.add_argument('revindex')
    p.add_argument('db', nargs='+')
    args = p.parse_args()

    idx = revindex_utils.HashvalRevindex(args.revindex)

    # only the database list is needed; the other two return values
    # are not used here.
    lca_db_list, _, _ = lca_utils.load_databases(args.db, SCALED)

    # for each hashval, count the samples in which it is abundant enough.
    sample_counts = collections.Counter()
    for hashval, abunds in idx.hashval_to_abunds.items():
        sample_counts[hashval] += sum(
            1 for abund in abunds if abund >= args.abundance_threshold)

    total = 0
    found = 0
    unknown = collections.defaultdict(int)
    for hashval, count in sample_counts.most_common():
        # most_common() is in descending order, so everything after the
        # first hashval below --sample-threshold is below it as well.
        if count < args.sample_threshold:
            break
        total += 1

        # pool the lineage assignments from every database
        lca_set = set()
        for lca_db in lca_db_list:
            lca_set.update(lca_db.get_lineage_assignments(hashval))

        if not lca_set:
            unknown[count] += 1
            continue

        # for each list of tuple_info [(rank, name), ...] build
        # a tree that lets us discover lowest-common-ancestor.
        tree = lca_utils.build_tree(lca_set)

        # now find either a leaf or the first node with multiple
        # children; that's our lowest-common-ancestor node.
        lca, _reason = lca_utils.find_lca(tree)

        print('hash {}, in {} samples; lineage: {}'.format(
            hashval, count, ";".join(lca_utils.zip_lineage(lca))),
              file=sys.stderr)
        found += 1

    # Guard against division by zero: 'total' stays 0 when no hashval
    # reaches the sample threshold (or the revindex is empty).
    percent = found / total * 100 if total else 0.0
    print('found {} of {} ({:.2f}%)'.format(found, total, percent),
          file=sys.stderr)
    print('outputting distribution of unknowns', file=sys.stderr)
    print('commonality,n,sum_n')

    sofar = 0
    for commonality, n_hashvals in sorted(unknown.items()):
        sofar += n_hashvals
        print('{},{},{}'.format(commonality, n_hashvals, sofar))
Пример #10
0
def compare_csv(args):
    """
    Compare the lineages in a 'classify' spreadsheet with a custom one.

    'args' is the list of command-line arguments to parse: two positional
    spreadsheet paths (csv1 = classify output, csv2 = custom taxonomy), plus
    the options -d/--debug, -C/--start-column, --tabs, --no-headers and
    -f/--force.  Only --debug is applied to both files; the remaining
    options affect how csv2 is loaded, while csv1 is always loaded with
    start_column=3.

    Exits with status -1 if --start-column is less than 2.

    Identifiers present in both spreadsheets are compared.  Where the two
    lineages differ, a tree is built from both and its LCA is found: a
    'reason' of 0 from find_lca is reported as 'compatible', anything else
    as 'incompatible', and incompatibilities are tallied by the rank of the
    last entry in the LCA (or the first rank of lca_utils.taxlist() when the
    LCA is empty).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('csv1',
                        help='taxonomy spreadsheet output by classify')
    parser.add_argument('csv2', help='custom taxonomy spreadsheet')
    parser.add_argument('-d', '--debug', action='store_true')
    parser.add_argument(
        '-C', '--start-column', default=2, type=int,
        help='column at which taxonomic assignments start')
    parser.add_argument(
        '--tabs', action='store_true',
        help='input spreadsheet is tab-delimited (default: commas)')
    parser.add_argument(
        '--no-headers', action='store_true',
        help='no headers present in taxonomy spreadsheet')
    parser.add_argument('-f', '--force', action='store_true')
    args = parser.parse_args(args)

    if args.start_column < 2:
        error('error, --start-column cannot be less than 2')
        sys.exit(-1)

    if args.debug:
        set_debug(args.debug)

    # --- classify-style spreadsheet
    notify('loading classify output from: {}', args.csv1)
    assignments0, num_rows0 = load_taxonomy_assignments(args.csv1,
                                                        start_column=3)
    distinct0 = set(assignments0.values())
    notify('loaded {} distinct lineages, {} rows', len(distinct0), num_rows0)
    notify('----')

    # --- custom taxonomy spreadsheet
    delimiter = '\t' if args.tabs else ','

    notify('loading custom spreadsheet from: {}', args.csv2)
    assignments, num_rows = load_taxonomy_assignments(
        args.csv2,
        delimiter=delimiter,
        start_column=args.start_column,
        use_headers=not args.no_headers,
        force=args.force)
    distinct = set(assignments.values())
    notify('loaded {} distinct lineages, {} rows', len(distinct), num_rows)

    # --- identifiers that appear in only one of the two spreadsheets
    only_in_classify = set(assignments0.keys()) - set(assignments.keys())
    only_in_custom = set(assignments.keys()) - set(assignments0.keys())
    if only_in_custom:
        notify('missing {} assignments in classify spreadsheet.',
               len(only_in_custom))
    if only_in_classify:
        notify('missing {} assignments in custom spreadsheet.',
               len(only_in_classify))
    if only_in_classify or only_in_custom:
        notify('(these will not be evaluated any further)')
    else:
        notify('note: all IDs are in both spreadsheets!')

    # --- lineage differences among the shared identifiers
    common = set(assignments0.keys())
    common.intersection_update(assignments.keys())

    n_total = len(common)
    n_different = 0
    n_compat = 0
    n_incompat = 0
    incompat_rank = defaultdict(int)
    for ident in common:
        lineage0 = assignments0[ident]
        lineage1 = assignments[ident]
        if lineage0 != lineage1:
            n_different += 1

            # put both lineages into one tree and look for where they part.
            tree = lca_utils.build_tree([lineage0])
            lca_utils.build_tree([lineage1], tree)
            lca, reason = lca_utils.find_lca(tree)

            lca_text = ";".join(zip_lineage(lca))
            if reason == 0:  # compatible lineages
                n_compat += 1
                print_results("{},compatible,{}", ident, lca_text)
            else:
                n_incompat += 1
                print_results("{},incompatible,{}", ident, lca_text)

                # file the disagreement under the deepest rank both agree
                # on; with an empty LCA, use the first rank in taxlist().
                first_rank = next(iter(lca_utils.taxlist()))
                rank = lca[-1].rank if lca else first_rank
                incompat_rank[rank] += 1

    notify("{} total assignments, {} differ between spreadsheets.", n_total,
           n_different)
    notify("{} are compatible (one lineage is ancestor of another.", n_compat)
    notify("{} are incompatible (there is a disagreement in the trees).",
           n_incompat)

    if n_incompat:
        for rank in lca_utils.taxlist():
            notify('{} incompatible at rank {}', incompat_rank[rank], rank)