Example #1
def format_lineage(lineage_tup):
    """
    Pretty print lineage.
    """
    # list of ranks present
    present = [l.rank for l in lineage_tup if l.name]
    d = dict(lineage_tup)  # rank: value

    if 'genus' in present:
        genus = d['genus']
        if 'strain' in present:
            name = d['strain']
        elif 'species' in present:
            species = d['species']
            if species.startswith(genus + ' ') or \
              species.startswith(genus + '_'):
                name = species
            else:
                name = '{} {}'.format(genus, species)
        else:
            name = '{} sp.'.format(genus)
    elif len(present) < 3:
        lineage_str = lca_utils.zip_lineage(lineage_tup, truncate_empty=True)
        lineage_str = "; ".join(lineage_str)
        name = lineage_str + ' - (no further assignment)'
    elif len(present) > 1 and 'superkingdom' in present:
        lowest_rank = present[-1]
        name = '{}; {} {}'.format(d['superkingdom'], lowest_rank,
                                   d[lowest_rank])
    else:
        lineage_str = lca_utils.zip_lineage(lineage_tup, truncate_empty=True)
        lineage_str = "; ".join(lineage_str)
        name = lineage_str

    return name
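
A quick usage sketch for reference; assumptions: lineage_tup is a sequence of (rank, name) named tuples, like LineagePair from sourmash's lca_utils, and the values below are hypothetical.

from collections import namedtuple

# stand-in for lca_utils.LineagePair
LineagePair = namedtuple('LineagePair', ['rank', 'name'])

lineage = (LineagePair('superkingdom', 'Bacteria'),
           LineagePair('phylum', 'Proteobacteria'),
           LineagePair('genus', 'Escherichia'),
           LineagePair('species', 'Escherichia coli'))

# genus is present and the species string already starts with it:
print(format_lineage(lineage))  # -> 'Escherichia coli'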
Example #2
def generate_report(record_duplicates, record_no_lineage, record_remnants,
                    unused_lineages, filename):
    """
    Output a report of anomalies from building the index.
    """
    with open(filename, 'wt') as fp:
        print('Duplicate signatures:', file=fp)
        fp.write("\n".join(record_duplicates))
        fp.write("\n")
        print('----\nNo lineage provided for:', file=fp)
        fp.write("\n".join(record_no_lineage))
        fp.write("\n")
        print('----\nNo signatures found for these lineage assignments:', file=fp)
        fp.write('\n'.join(record_remnants))
        fp.write("\n")
        print('----\nUnused lineages:', file=fp)
        for lineage in unused_lineages:
            fp.write(";".join(lca_utils.zip_lineage(lineage)))
            fp.write("\n")
Example #3
def gather_main(args):
    """
    Find the lineage for a given species in a taxonomy spreadsheet, sample
    identifiers that match it at each taxonomic rank, and locate their
    signatures in an SBT, writing matches to a CSV.
    """
    p = argparse.ArgumentParser()
    p.add_argument('--debug', action='store_true')
    p.add_argument('spreadsheet')
    p.add_argument('species')
    p.add_argument('--sbt')
    p.add_argument('-o', '--output', type=argparse.FileType('wt'))
    args = p.parse_args(args)

    if args.debug:
        set_debug(args.debug)

    assignments, num_rows = load_taxonomy_assignments(args.spreadsheet)

    found = False
    found_lineage = None
    for ident, lineage in assignments.items():
        for vv in lineage:
            if vv.rank == 'species' and vv.name == args.species:
                found = True
                found_lineage = lineage
                break
        if found:
            # the inner break only exits the species scan; stop the outer
            # loop too once a match is found.
            break

    if not found:
        print('nothing found for {}; quitting'.format(args.species))
        sys.exit(-1)

    print('found:', ", ".join(lca_utils.zip_lineage(found_lineage)))

    lineage_search = dict(found_lineage)

    rank_found = defaultdict(list)
    rank_idents = defaultdict(list)
    taxlist = list(reversed(list(lca_utils.taxlist())))

    for ident, lineage in assignments.items():
        dd = dict(lineage)
        for k in taxlist:
            if dd.get(k) and dd.get(k) == lineage_search.get(k):
                rank_found[k].append(lineage)
                rank_idents[k].append(ident)
                break

    retrieve_idents = defaultdict(set)
    gimme_idents = {}
    for k in rank_found:
        print('at', k, 'found', len(rank_found.get(k)))

        num_to_extract = min(len(rank_idents[k]), 10)
        gimme = random.sample(rank_idents[k], num_to_extract)
        for g in gimme:
            gimme_idents[g] = k

    if not args.output or not args.sbt:
        print('no output arg or SBT arg given; quitting without extracting')
        sys.exit(-1)

    print('looking for:', len(gimme_idents))

    tree = sourmash_lib.load_sbt_index(args.sbt)

    w = csv.writer(args.output)
    for n, leaf in enumerate(tree.leaves()):
        if n % 1000 == 0:
            print('...', n)
        name = leaf.data.name()
        # hack for NCBI-style names, etc.
        name = name.split(' ')[0].split('.')[0]

        if name in gimme_idents:
            level = gimme_idents[name]
            level_n = taxlist.index(level)
            filename = leaf.data.d['filename']

            w.writerow([level, level_n, filename, leaf.data.name()])
            print('FOUND!', leaf.data.name(), level)
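
The rank-bucketing loop in gather_main files each identifier under the deepest rank its lineage shares with the target lineage, walking ranks from most to least specific. A self-contained sketch of that idea, using plain dicts in place of lineage tuples (all values hypothetical):

taxlist = ['species', 'genus', 'superkingdom']  # most specific first
target = {'superkingdom': 'Bacteria', 'genus': 'Escherichia',
          'species': 'Escherichia coli'}
candidate = {'superkingdom': 'Bacteria', 'genus': 'Escherichia',
             'species': 'Escherichia fergusonii'}

for rank in taxlist:
    if candidate.get(rank) and candidate.get(rank) == target.get(rank):
        print('deepest shared rank:', rank)  # -> genus
        break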
Example #4
def summarize_taxonomic_purity(minhash_collections, lca_db, verbose=False,
                               filenames=None):
    if filenames is None:
        filenames = list(range(len(minhash_collections)))
    total = 0
    n_empty = 0
    pure_at_rank = collections.defaultdict(int)

    # for each group,
    for f_name, g in zip(filenames, minhash_collections):
        print("Results for {}".format(f_name))
        total += 1

        # collect all the hash values for this minhash collection
        mins = g.get_mins()

        multis = set()
        # now, collate the lineage assignments across all the hashes
        assignments = collections.Counter()
        for m in mins:
            x = lca_db.get_lineage_assignments(m)
            for lineage in x:
                assignments[lineage] += 1
            if len(x) > 0:
                multis.add(m)

        if not assignments:
            # empty, no hash assignments for this group!
            n_empty += 1
            continue

        # for each taxonomic level, calculate purity
        for level, rank in enumerate(reversed(taxonomy_levels)):
            index = len(taxonomy_levels) - level

            # track number of different items at this tax rank
            level_counter = collections.Counter()

            for lineage, count in assignments.items():
                # trim the lineage back to the given rank
                sublineage = lineage[:index]
                assert sublineage[-1].rank == rank, lineage

                # count the trimmed lineage!
                level_counter[sublineage] += count

            # ## Note for @sgsmob: at this point, 'level_counter' contains
            # ## all lineages trimmed back to 'rank', with associated counts.
            # ## This is a collections.Counter object, so both .items() and
            # ## the method '.most_common()' will return lists of
            # ## (lineage, count), with 'most_common()' returning them in
            # ## highest-to-lowest order.

            # ## Here is some code to compute the fraction of the bin that
            # ## belongs to the top-ranked lineage:

            top_lineage, top_count = next(iter(level_counter.most_common()))
            total_hashes = sum(level_counter.values())

            # go back and count any k-mer that came from the top lineage plus
            # at least one more as belonging to the top lineage only
            for m in multis:
                x = lca_db.get_lineage_assignments(m)
                total_hashes = total_hashes - (len(x) - 1)
                num_matches = sum(1 for l in x if l[:index] == top_lineage)
                if num_matches > 1:
                    top_count = top_count - (num_matches - 1)

            lineage_display = "; ".join(lca_utils.zip_lineage(top_lineage))
            if verbose:
                print(rank,
                      '{:.1f}%'.format(top_count / total_hashes * 100),
                      lineage_display)

            if len(level_counter) == 0:   # should never get here!
                assert 0, assignments

            # if we have a 100% pure bin, quit now.
            if top_count / total_hashes == 1.0:
                pure_at_rank[rank] += 1

                # print out the egregiously bad ones just to double check...
                if rank in ('phylum', 'superkingdom') and verbose:
                    print('---')
                    for assignment in assignments:
                        print('\t',
                              '; '.join(lca_utils.zip_lineage(assignment)))
                print()
                break
        print()

    print()
    # calculate summary numbers!
    print('total bins: {}'.format(total))
    print('unassigned bins: {}'.format(n_empty))

    sum_pure = 0
    for rank in reversed(taxonomy_levels):
        print('pure at rank {}: {}'.format(rank, pure_at_rank[rank]))
        sum_pure += pure_at_rank[rank]

    print('not pure at any rank:', total - sum_pure - n_empty)
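
The core of the per-rank purity computation: trim every lineage back to the rank under consideration, re-count, and ask what fraction of hashes the most common trimmed lineage accounts for. A standalone sketch with plain (rank, name) tuples standing in for lineage objects:

import collections

assignments = collections.Counter({
    (('phylum', 'Proteobacteria'), ('genus', 'Escherichia')): 8,
    (('phylum', 'Proteobacteria'), ('genus', 'Salmonella')): 2,
})

index = 1  # trim back to phylum
level_counter = collections.Counter()
for lineage, count in assignments.items():
    level_counter[lineage[:index]] += count

(top_lineage, top_count), = level_counter.most_common(1)
print(top_count / sum(level_counter.values()))  # 1.0 -> pure at phylum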
Example #5
def classify(args):
    """
    main single-genome classification function.
    """
    p = argparse.ArgumentParser()
    p.add_argument('--db', nargs='+', action='append')
    p.add_argument('--query', nargs='+', action='append')
    p.add_argument('--threshold', type=int, default=DEFAULT_THRESHOLD)
    p.add_argument('-o',
                   '--output',
                   type=argparse.FileType('wt'),
                   help='output CSV to this file instead of stdout')
    p.add_argument('--scaled', type=float)
    p.add_argument('--traverse-directory',
                   action='store_true',
                   help='load all signatures underneath directories.')
    p.add_argument('-d', '--debug', action='store_true')
    args = p.parse_args(args)

    if not args.db:
        error('Error! must specify at least one LCA database with --db')
        sys.exit(-1)

    if not args.query:
        error('Error! must specify at least one query signature with --query')
        sys.exit(-1)

    if args.debug:
        set_debug(args.debug)

    # flatten --db and --query
    args.db = [item for sublist in args.db for item in sublist]
    args.query = [item for sublist in args.query for item in sublist]

    # load all the databases
    dblist, ksize, scaled = lca_utils.load_databases(args.db, args.scaled)
    notify('ksize={} scaled={}', ksize, scaled)

    # find all the queries
    notify('finding query signatures...')
    if args.traverse_directory:
        inp_files = list(sourmash_args.traverse_find_sigs(args.query))
    else:
        inp_files = list(args.query)

    # set up output
    csvfp = csv.writer(sys.stdout)
    if args.output:
        notify("outputting classifications to '{}'", args.output.name)
        csvfp = csv.writer(args.output)
    else:
        notify("outputting classifications to stdout")
    csvfp.writerow(['ID', 'status'] + list(lca_utils.taxlist()))

    # for each query, gather all the matches across databases
    total_count = 0
    n = 0
    total_n = len(inp_files)
    for query_filename in inp_files:
        n += 1
        for query_sig in sourmash_lib.load_signatures(query_filename,
                                                      ksize=ksize):
            notify(u'\r\033[K', end=u'')
            notify('... classifying {} (file {} of {})',
                   query_sig.name(),
                   n,
                   total_n,
                   end='\r')
            debug('classifying', query_sig.name())
            total_count += 1

            # make sure we're looking at the same scaled value as database
            query_sig.minhash = query_sig.minhash.downsample_scaled(scaled)

            # do the classification
            lineage, status = classify_signature(query_sig, dblist,
                                                 args.threshold)
            debug(lineage)

            # output each classification to the spreadsheet
            row = [query_sig.name(), status]
            row += lca_utils.zip_lineage(lineage)

            # when outputting to stdout, make output intelligible
            if not args.output:
                notify(u'\r\033[K', end=u'')
            csvfp.writerow(row)

    notify(u'\r\033[K', end=u'')
    notify('classified {} signatures total', total_count)
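
Note the flattening step above: nargs='+' combined with action='append' makes argparse collect one sub-list per --db (or --query) occurrence, so the values arrive as a list of lists. A quick self-contained demonstration:

import argparse

p = argparse.ArgumentParser()
p.add_argument('--db', nargs='+', action='append')
args = p.parse_args(['--db', 'a.lca', 'b.lca', '--db', 'c.lca'])
print(args.db)                              # [['a.lca', 'b.lca'], ['c.lca']]
print([x for sub in args.db for x in sub])  # ['a.lca', 'b.lca', 'c.lca']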
Example #7
def main():
    p = argparse.ArgumentParser()
    p.add_argument('-k', '--ksize', default=DEFAULT_KSIZE, type=int)
    p.add_argument('--sample-threshold',
                   default=DEFAULT_SAMPLE_THRESHOLD,
                   type=int)
    p.add_argument('--abundance-threshold',
                   default=DEFAULT_ABUND_THRESHOLD,
                   type=int)
    p.add_argument('revindex')
    p.add_argument('db', nargs='+')
    args = p.parse_args()

    idx = revindex_utils.HashvalRevindex(args.revindex)

    lca_db_list, ksize, scaled = lca_utils.load_databases(args.db, SCALED)

    cnt = collections.Counter()
    for k, v in idx.hashval_to_abunds.items():
        cnt[k] += len([abund for abund in v
                       if abund >= args.abundance_threshold])

    total = 0
    found = 0
    unknown = collections.defaultdict(int)
    for hashval, count in cnt.most_common():
        # stop once we hit hashes seen in fewer than --sample-threshold samples.
        if count < args.sample_threshold:
            break
        total += 1
        lca_set = set()

        for lca_db in lca_db_list:
            lineages = lca_db.get_lineage_assignments(hashval)
            lca_set.update(lineages)

        if not lca_set:
            unknown[count] += 1
            continue

        assert lca_set, lca_set

        # for each list of tuple_info [(rank, name), ...] build
        # a tree that lets us discover lowest-common-ancestor.
        tree = lca_utils.build_tree(lca_set)

        # now find either a leaf or the first node with multiple
        # children; that's our lowest-common-ancestor node.
        lca, reason = lca_utils.find_lca(tree)

        print('hash {}, in {} samples; lineage: {}'.format(
            hashval, count, ";".join(lca_utils.zip_lineage(lca))),
              file=sys.stderr)
        found += 1

    print('found {} of {} ({:.2f}%)'.format(found, total, found / total * 100),
          file=sys.stderr)
    print('outputting distribution of unknowns', file=sys.stderr)
    print('commonality,n,sum_n')

    sofar = 0
    for k, cnt in sorted(unknown.items()):
        sofar += cnt
        print('{},{},{}'.format(k, cnt, sofar))
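
The build_tree()/find_lca() pair does the LCA work here: lineages are merged into a nested dict keyed by (rank, name) pairs, and find_lca() descends until it reaches a leaf (reason 0: one unambiguous lineage) or the first node with several children (reason = child count). A hedged sketch, assuming sourmash's lca_utils module and its LineagePair named tuple:

from sourmash.lca import lca_utils  # 'sourmash_lib' in the era of these snippets

lin1 = (lca_utils.LineagePair('superkingdom', 'Bacteria'),
        lca_utils.LineagePair('phylum', 'Proteobacteria'))
lin2 = (lca_utils.LineagePair('superkingdom', 'Bacteria'),
        lca_utils.LineagePair('phylum', 'Firmicutes'))

tree = lca_utils.build_tree([lin1, lin2])
lca, reason = lca_utils.find_lca(tree)
print(lca)     # (LineagePair(rank='superkingdom', name='Bacteria'),)
print(reason)  # 2: the tree forks into two phyla below Bacteria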
Example #8
def summarize_main(args):
    """
    main summarization function.
    """
    p = argparse.ArgumentParser()
    p.add_argument('--db', nargs='+', action='append')
    p.add_argument('--query', nargs='+', action='append')
    p.add_argument('--threshold', type=int, default=DEFAULT_THRESHOLD)
    p.add_argument('--traverse-directory',
                   action='store_true',
                   help='load all signatures underneath directories.')
    p.add_argument('-o',
                   '--output',
                   type=argparse.FileType('wt'),
                   help='CSV output')
    p.add_argument('--scaled', type=float)
    p.add_argument('-d', '--debug', action='store_true')
    args = p.parse_args(args)

    if not args.db:
        error('Error! must specify at least one LCA database with --db')
        sys.exit(-1)

    if not args.query:
        error('Error! must specify at least one query signature with --query')
        sys.exit(-1)

    if args.debug:
        set_debug(args.debug)

    if args.scaled:
        args.scaled = int(args.scaled)

    # flatten --db and --query
    args.db = [item for sublist in args.db for item in sublist]
    args.query = [item for sublist in args.query for item in sublist]

    # load all the databases
    dblist, ksize, scaled = lca_utils.load_databases(args.db, args.scaled)
    notify('ksize={} scaled={}', ksize, scaled)

    # find all the queries
    notify('finding query signatures...')
    if args.traverse_directory:
        inp_files = list(sourmash_args.traverse_find_sigs(args.query))
    else:
        inp_files = list(args.query)

    # for each query, gather all the hashvals across databases
    total_count = 0
    n = 0
    total_n = len(inp_files)
    hashvals = defaultdict(int)
    for query_filename in inp_files:
        n += 1
        for query_sig in sourmash_lib.load_signatures(query_filename,
                                                      ksize=ksize):
            notify(u'\r\033[K', end=u'')
            notify('... loading {} (file {} of {})',
                   query_sig.name(),
                   n,
                   total_n,
                   end='\r')
            total_count += 1

            mh = query_sig.minhash.downsample_scaled(scaled)
            for hashval in mh.get_mins():
                hashvals[hashval] += 1

    notify(u'\r\033[K', end=u'')
    notify('loaded {} signatures from {} files total.', total_count, n)

    # get the full counted list of lineage counts in this signature
    lineage_counts = summarize(hashvals, dblist, args.threshold)

    # output!
    total = float(len(hashvals))
    for (lineage, count) in lineage_counts.items():
        if lineage:
            lineage = lca_utils.zip_lineage(lineage, truncate_empty=True)
            lineage = ';'.join(lineage)
        else:
            lineage = '(root)'

        p = count / total * 100.
        p = '{:.1f}%'.format(p)

        print_results('{:5} {:>5}   {}'.format(p, count, lineage))

    # CSV:
    if args.output:
        w = csv.writer(args.output)
        headers = ['count'] + list(lca_utils.taxlist())
        w.writerow(headers)

        for (lineage, count) in lineage_counts.items():
            debug('lineage:', lineage)
            row = [count] + lca_utils.zip_lineage(lineage)
            w.writerow(row)
Example #9
def compare_csv(args):
    p = argparse.ArgumentParser()
    p.add_argument('csv1', help='taxonomy spreadsheet output by classify')
    p.add_argument('csv2', help='custom taxonomy spreadsheet')
    p.add_argument('-d', '--debug', action='store_true')
    p.add_argument('-C',
                   '--start-column',
                   default=2,
                   type=int,
                   help='column at which taxonomic assignments start')
    p.add_argument('--tabs',
                   action='store_true',
                   help='input spreadsheet is tab-delimited (default: commas)')
    p.add_argument('--no-headers',
                   action='store_true',
                   help='no headers present in taxonomy spreadsheet')
    p.add_argument('-f', '--force', action='store_true')
    args = p.parse_args(args)

    if args.start_column < 2:
        error('error, --start-column cannot be less than 2')
        sys.exit(-1)

    if args.debug:
        set_debug(args.debug)

    # first, load classify-style spreadsheet
    notify('loading classify output from: {}', args.csv1)
    assignments0, num_rows0 = load_taxonomy_assignments(args.csv1,
                                                        start_column=3)

    notify('loaded {} distinct lineages, {} rows',
           len(set(assignments0.values())), num_rows0)
    notify('----')

    # next, load custom taxonomy spreadsheet
    delimiter = ','
    if args.tabs:
        delimiter = '\t'

    notify('loading custom spreadsheet from: {}', args.csv2)
    assignments, num_rows = load_taxonomy_assignments(
        args.csv2,
        delimiter=delimiter,
        start_column=args.start_column,
        use_headers=not args.no_headers,
        force=args.force)
    notify('loaded {} distinct lineages, {} rows',
           len(set(assignments.values())), num_rows)

    # now, compute basic differences:
    missing_1 = set(assignments0.keys()) - set(assignments.keys())
    missing_2 = set(assignments.keys()) - set(assignments0.keys())
    if missing_2:
        notify('missing {} assignments in classify spreadsheet.',
               len(missing_2))
    if missing_1:
        notify('missing {} assignments in custom spreadsheet.', len(missing_1))
    if missing_1 or missing_2:
        notify('(these will not be evaluated any further)')
    else:
        notify('note: all IDs are in both spreadsheets!')

    # next, look at differences in lineages
    common = set(assignments0.keys())
    common.intersection_update(assignments.keys())

    n_total = 0
    n_different = 0
    n_compat = 0
    n_incompat = 0
    incompat_rank = defaultdict(int)
    for k in common:
        n_total += 1
        v0 = assignments0[k]
        v1 = assignments[k]
        if v0 != v1:
            n_different += 1
            tree = lca_utils.build_tree([v0])
            lca_utils.build_tree([v1], tree)

            lca, reason = lca_utils.find_lca(tree)
            if reason == 0:  # compatible lineages
                n_compat += 1
                print_results("{},compatible,{}", k,
                              ";".join(zip_lineage(lca)))
            else:
                n_incompat += 1
                print_results("{},incompatible,{}", k,
                              ";".join(zip_lineage(lca)))
                rank = next(iter(lca_utils.taxlist()))
                if lca:
                    rank = lca[-1].rank
                incompat_rank[rank] += 1

    notify("{} total assignments, {} differ between spreadsheets.", n_total,
           n_different)
    notify("{} are compatible (one lineage is ancestor of another.", n_compat)
    notify("{} are incompatible (there is a disagreement in the trees).",
           n_incompat)

    if n_incompat:
        for rank in lca_utils.taxlist():
            notify('{} incompatible at rank {}', incompat_rank[rank], rank)