Example #1
def load_taxonomy_assignments(filename,
                              delimiter=',',
                              start_column=2,
                              use_headers=True,
                              force=False):
    # parse the taxonomy spreadsheet
    fp = open(filename, 'rt', newline='')
    r = csv.reader(fp, delimiter=delimiter)
    row_headers = ['identifiers']
    row_headers += ['_skip_'] * (start_column - 2)
    row_headers += list(lca_utils.taxlist())

    # first check that headers are interpretable.
    if use_headers:
        notify('examining spreadsheet headers...')
        first_row = next(iter(r))

        n_disagree = 0
        for (column, value) in zip(row_headers, first_row):
            if column == '_skip_':
                continue

            if column.lower() != value.lower():
                notify("** assuming column '{}' is {} in spreadsheet", value,
                       column)
                n_disagree += 1
                if n_disagree > 2:
                    error('whoa, too many assumptions. are the headers right?')
                    error('expecting {}', ",".join(row_headers))
                    if not force:
                        sys.exit(-1)
                    notify('...continue, because --force was specified.')

    # convert into a lineage pair
    assignments = {}
    num_rows = 0
    for row in r:
        if row and row[0].strip():  # want non-empty row
            num_rows += 1
            lineage = list(zip(row_headers, row))
            lineage = [x for x in lineage if x[0] != '_skip_']

            ident = lineage[0][1]
            lineage = lineage[1:]

            # clean lineage of null names, replace with 'unassigned'
            lineage = [(a, lca_utils.filter_null(b)) for (a, b) in lineage]
            lineage = [LineagePair(a, b) for (a, b) in lineage]

            # remove end nulls
            while lineage and lineage[-1].name == 'unassigned':
                lineage = lineage[:-1]

            # store lineage tuple
            if lineage:
                assignments[ident] = tuple(lineage)

    fp.close()

    return assignments, num_rows
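
# A minimal usage sketch, assuming a hypothetical 'taxonomy.csv' whose first
# column holds identifiers and whose remaining columns follow
# lca_utils.taxlist() order (superkingdom, phylum, ...):
assignments, num_rows = load_taxonomy_assignments('taxonomy.csv')
print('loaded {} assignments from {} rows'.format(len(assignments), num_rows))
for ident, lineage in assignments.items():
    # each value is a tuple of LineagePair(rank, name) namedtuples
    print(ident, ';'.join(pair.name for pair in lineage))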
Example #2
def main(sysv_args):
    set_quiet(False)

    commands = {
        'classify': classify,
        'index': index,
        'summarize': summarize_main,
        'rankinfo': rankinfo_main,
        'gather': gather_main,
        'compare_csv': compare_csv
    }

    parser = argparse.ArgumentParser(
        description='lowest-common ancestor (LCA) utilities', usage=usage)
    parser.add_argument('lca_command', nargs='?')
    args = parser.parse_args(sysv_args[0:1])

    if not args.lca_command:
        print(usage)
        sys.exit(1)

    if args.lca_command not in commands:
        error('Unrecognized command: {}', args.lca_command)
        parser.print_help()
        sys.exit(1)

    cmd = commands.get(args.lca_command)
    cmd(sysv_args[1:])
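
# A brief dispatch sketch with a hypothetical database file: the top-level
# parser consumes only the subcommand token, and the rest of argv is handed
# untouched to the matching handler.
main(['rankinfo', 'genbank.lca.json'])
# ...which is routed above as: rankinfo_main(['genbank.lca.json'])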
Example #3
def rankinfo_main(args):
    """
    rankinfo!
    """
    p = argparse.ArgumentParser(prog="sourmash lca rankinfo")
    p.add_argument('db', nargs='+')
    p.add_argument('--scaled', type=float)
    p.add_argument('-d', '--debug', action='store_true')
    args = p.parse_args(args)

    if not args.db:
        error('Error! must specify at least one LCA database')
        sys.exit(-1)

    if args.debug:
        set_debug(args.debug)

    if args.scaled:
        args.scaled = int(args.scaled)

    # load all the databases
    dblist, ksize, scaled = lca_utils.load_databases(args.db, args.scaled)

    # count all the LCAs across these databases
    counts = make_lca_counts(dblist)

    # collect counts across all ranks
    counts_by_rank = defaultdict(int)
    for lineage, count in counts.items():
        if lineage:
            lineage_tup = lineage[-1]
            counts_by_rank[lineage_tup.rank] += count

    # output!
    total = float(sum(counts_by_rank.values()))
    for rank in lca_utils.taxlist():
        count = counts_by_rank.get(rank, 0)
        print('{}: {} ({:.1f}%)'.format(rank, count, count / total * 100.))
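
# A usage sketch, assuming a hypothetical database built by 'sourmash lca
# index'; rankinfo_main parses its own argv list:
rankinfo_main(['genbank.lca.json', '--scaled', '10000'])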
Example #4
def main(sysv_args):
    set_quiet(False)

    commands = {
        'classify': classify,
        'index': index,
        'summarize': summarize_main,
        'rankinfo': rankinfo_main,
        'compare_csv': compare_csv
    }

    parser = argparse.ArgumentParser(
        description='lowest-common ancestor (LCA) utilities',
        usage='''sourmash lca <command> [<args>]

Commands can be:

index <taxonomy.csv> <output_db name> <signature [...]>  - create LCA database
classify --db <db_name [...]> --query <signature [...]>  - classify genomes
summarize --db <db_name [...]> --query <signature [...]> - summarize mixture
rankinfo <db_name [...]>                                 - database rank info
compare_csv <csv1> <csv2>                                - compare spreadsheets

Use '-h' to get subcommand-specific help, e.g.

sourmash lca index -h
''')
    parser.add_argument('lca_command')
    args = parser.parse_args(sysv_args[0:1])
    if args.lca_command not in commands:
        error('Unrecognized command: {}', args.lca_command)
        parser.print_help()
        sys.exit(1)

    cmd = commands.get(args.lca_command)
    cmd(sysv_args[1:])
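
# A self-contained sketch of the same two-stage parsing pattern; the names
# below are illustrative, not part of the module. The first parser consumes
# only argv[0:1], leaving the subcommand's own flags untouched.
import argparse

def _demo_dispatch(argv):
    top = argparse.ArgumentParser()
    top.add_argument('cmd')
    args = top.parse_args(argv[0:1])    # consume just the subcommand token
    print('dispatching {!r} with {}'.format(args.cmd, argv[1:]))

_demo_dispatch(['summarize', '--db', 'genbank.lca.json'])
# prints: dispatching 'summarize' with ['--db', 'genbank.lca.json']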
Example #5
def classify(args):
    """
    main single-genome classification function.
    """
    p = argparse.ArgumentParser()
    p.add_argument('--db', nargs='+', action='append')
    p.add_argument('--query', nargs='+', action='append')
    p.add_argument('--threshold', type=int, default=DEFAULT_THRESHOLD)
    p.add_argument('-o',
                   '--output',
                   type=argparse.FileType('wt'),
                   help='output CSV to this file instead of stdout')
    p.add_argument('--scaled', type=float)
    p.add_argument('--traverse-directory',
                   action='store_true',
                   help='load all signatures underneath directories.')
    p.add_argument('-d', '--debug', action='store_true')
    args = p.parse_args(args)

    if not args.db:
        error('Error! must specify at least one LCA database with --db')
        sys.exit(-1)

    if not args.query:
        error('Error! must specify at least one query signature with --query')
        sys.exit(-1)

    if args.debug:
        set_debug(args.debug)

    # flatten --db and --query
    args.db = [item for sublist in args.db for item in sublist]
    args.query = [item for sublist in args.query for item in sublist]

    # load all the databases
    dblist, ksize, scaled = lca_utils.load_databases(args.db, args.scaled)
    notify('ksize={} scaled={}', ksize, scaled)

    # find all the queries
    notify('finding query signatures...')
    if args.traverse_directory:
        inp_files = list(sourmash_args.traverse_find_sigs(args.query))
    else:
        inp_files = list(args.query)

    # set up output
    csvfp = csv.writer(sys.stdout)
    if args.output:
        notify("outputting classifications to '{}'", args.output.name)
        csvfp = csv.writer(args.output)
    else:
        notify("outputting classifications to stdout")
    csvfp.writerow(['ID', 'status'] + list(lca_utils.taxlist()))

    # for each query, gather all the matches across databases
    total_count = 0
    n = 0
    total_n = len(inp_files)
    for query_filename in inp_files:
        n += 1
        for query_sig in sourmash_lib.load_signatures(query_filename,
                                                      ksize=ksize):
            notify(u'\r\033[K', end=u'')
            notify('... classifying {} (file {} of {})',
                   query_sig.name(),
                   n,
                   total_n,
                   end='\r')
            debug('classifying', query_sig.name())
            total_count += 1

            # make sure we're looking at the same scaled value as database
            query_sig.minhash = query_sig.minhash.downsample_scaled(scaled)

            # do the classification
            lineage, status = classify_signature(query_sig, dblist,
                                                 args.threshold)
            debug(lineage)

            # output each classification to the spreadsheet
            row = [query_sig.name(), status]
            row += lca_utils.zip_lineage(lineage)

            # when outputting to stdout, make output intelligible
            if not args.output:
                notify(u'\r\033[K', end=u'')
            csvfp.writerow(row)

    notify(u'\r\033[K', end=u'')
    notify('classified {} signatures total', total_count)
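
# A usage sketch with hypothetical file names; --db and --query accept
# multiple values, and repeated flags are flattened by the code above:
classify(['--db', 'genbank.lca.json',
          '--query', 'genome1.sig', 'genome2.sig',
          '--threshold', '10'])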
Example #6
def index(args):
    """
    main function for building an LCA database.
    """
    p = argparse.ArgumentParser()
    p.add_argument('csv', help='taxonomy spreadsheet')
    p.add_argument('lca_db_out', help='name to save database to')
    p.add_argument('signatures', nargs='+',
                   help='one or more sourmash signatures')
    p.add_argument('--scaled', default=10000, type=float)
    p.add_argument('-k', '--ksize', default=31, type=int)
    p.add_argument('-d', '--debug', action='store_true')
    p.add_argument('-C', '--start-column', default=2, type=int,
                   help='column at which taxonomic assignments start')
    p.add_argument('--tabs', action='store_true',
                   help='input spreadsheet is tab-delimited (default: commas)')
    p.add_argument('--no-headers', action='store_true',
                   help='no headers present in taxonomy spreadsheet')
    p.add_argument('--split-identifiers', action='store_true',
                   help='split names in signatures on whitespace and period')
    p.add_argument('-f', '--force', action='store_true')
    p.add_argument('--traverse-directory', action='store_true',
                   help='load all signatures underneath directories.')
    p.add_argument('--report', help='output a report on anomalies, if any.')
    args = p.parse_args(args)

    if args.start_column < 2:
        error('error, --start-column cannot be less than 2')
        sys.exit(-1)

    if args.debug:
        set_debug(args.debug)

    args.scaled = int(args.scaled)

    # first, load taxonomy spreadsheet
    delimiter = ','
    if args.tabs:
        delimiter = '\t'
    assignments, num_rows = load_taxonomy_assignments(
        args.csv,
        delimiter=delimiter,
        start_column=args.start_column,
        use_headers=not args.no_headers,
        force=args.force)

    # convert lineages to numbers.
    next_lineage_index = 0
    lineage_dict = {}

    assignments_idx = {}
    lineage_to_idx = {}
    for (ident, lineage) in assignments.items():
        idx = lineage_to_idx.get(lineage)
        if idx is None:
            idx = next_lineage_index
            next_lineage_index += 1

            lineage_dict[idx] = lineage
            lineage_to_idx[lineage] = idx

        assignments_idx[ident] = idx

    notify('{} distinct lineages in spreadsheet out of {} rows',
           len(lineage_dict), num_rows)

    # load signatures, construct index of hashvals to lineages
    hashval_to_lineage = defaultdict(set)
    md5_to_lineage = {}

    notify('finding signatures...')
    if args.traverse_directory:
        yield_all_files = False           # by default, only pick up *.sig files
        if args.force:
            yield_all_files = True        # with --force, load everything found
        inp_files = list(sourmash_args.traverse_find_sigs(args.signatures,
                                                          yield_all_files=yield_all_files))
    else:
        inp_files = list(args.signatures)

    n = 0
    total_n = len(inp_files)
    record_duplicates = set()
    record_no_lineage = set()
    record_remnants = set(assignments_idx.keys())
    for filename in inp_files:
        n += 1
        for sig in sourmash_lib.load_signatures(filename, ksize=args.ksize):
            notify(u'\r\033[K', end=u'')
            notify('... loading signature {} (file {} of {})',
                   sig.name()[:30], n, total_n, end='\r')
            debug(filename, sig.name())

            if sig.md5sum() in md5_to_lineage:
                notify('\nWARNING: in file {}, duplicate md5sum: {}; skipping',
                       filename, sig.md5sum())
                record_duplicates.add(filename)
                continue

            name = sig.name()
            if args.split_identifiers: # hack for NCBI-style names, etc.
                name = name.split(' ')[0].split('.')[0]

            # is this one for which we have a lineage assigned?
            lineage_idx = assignments_idx.get(name)
            if lineage_idx is None:
                notify('\nWARNING: no lineage assignment for {}.', name)
                record_no_lineage.add(name)
            else:
                # remove from our list of remnants; discard() tolerates the
                # same identifier appearing in more than one signature file
                record_remnants.discard(name)

                # downsample to specified scaled; this has the side effect of
                # making sure they're all at the same scaled value!
                minhash = sig.minhash.downsample_scaled(args.scaled)

                # connect hashvals to lineage
                for hashval in minhash.get_mins():
                    hashval_to_lineage[hashval].add(lineage_idx)

                # store md5 -> lineage too
                md5_to_lineage[sig.md5sum()] = lineage_idx

    notify(u'\r\033[K', end=u'')
    notify('...found {} genomes with lineage assignments!!',
           len(md5_to_lineage))

    # remove those lineages with no genomes associated
    assigned_lineages = set(md5_to_lineage.values())
    lineage_dict_2 = {}
    for idx in assigned_lineages:
        lineage_dict_2[idx] = lineage_dict[idx]

    unused_lineages = set(lineage_dict.values()) - set(lineage_dict_2.values())

    notify('{} assigned lineages out of {} distinct lineages in spreadsheet',
           len(lineage_dict_2), len(lineage_dict))
    lineage_dict = lineage_dict_2

    # now, save!
    db_outfile = args.lca_db_out
    if not db_outfile.endswith(('.lca.json', '.lca.json.gz')):
        db_outfile += '.lca.json'
    notify('saving to LCA DB: {}'.format(db_outfile))

    db = lca_utils.LCA_Database()
    db.lineage_dict = lineage_dict
    db.hashval_to_lineage_id = hashval_to_lineage
    db.ksize = int(args.ksize)
    db.scaled = int(args.scaled)
    db.signatures_to_lineage = md5_to_lineage

    db.save(db_outfile)

    if record_duplicates or record_no_lineage or record_remnants or unused_lineages:
        if record_duplicates:
            notify('WARNING: {} duplicate signatures.', len(record_duplicates))
        if record_no_lineage:
            notify('WARNING: no lineage provided for {} signatures.',
                   len(record_no_lineage))
        if record_remnants:
            notify('WARNING: no signatures for {} lineage assignments.',
                   len(record_remnants))
        if unused_lineages:
            notify('WARNING: {} unused lineages.', len(unused_lineages))

        if args.report:
            notify("generating a report and saving in '{}'", args.report)
            generate_report(record_duplicates, record_no_lineage,
                            record_remnants, unused_lineages, args.report)
        else:
            notify('(You can use --report to generate a detailed report.)')
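
# A usage sketch with hypothetical inputs: a taxonomy spreadsheet, an output
# database name (the '.lca.json' suffix is appended if missing), and one or
# more signature files:
index(['taxonomy.csv', 'my-db', 'sig1.sig', 'sig2.sig',
       '--ksize', '31', '--split-identifiers'])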
Example #7
def summarize_main(args):
    """
    main summarization function.
    """
    p = argparse.ArgumentParser()
    p.add_argument('--db', nargs='+', action='append')
    p.add_argument('--query', nargs='+', action='append')
    p.add_argument('--threshold', type=int, default=DEFAULT_THRESHOLD)
    p.add_argument('--traverse-directory',
                   action='store_true',
                   help='load all signatures underneath directories.')
    p.add_argument('-o',
                   '--output',
                   type=argparse.FileType('wt'),
                   help='CSV output')
    p.add_argument('--scaled', type=float)
    p.add_argument('-d', '--debug', action='store_true')
    args = p.parse_args(args)

    if not args.db:
        error('Error! must specify at least one LCA database with --db')
        sys.exit(-1)

    if not args.query:
        error('Error! must specify at least one query signature with --query')
        sys.exit(-1)

    if args.debug:
        set_debug(args.debug)

    if args.scaled:
        args.scaled = int(args.scaled)

    # flatten --db and --query
    args.db = [item for sublist in args.db for item in sublist]
    args.query = [item for sublist in args.query for item in sublist]

    # load all the databases
    dblist, ksize, scaled = lca_utils.load_databases(args.db, args.scaled)
    notify('ksize={} scaled={}', ksize, scaled)

    # find all the queries
    notify('finding query signatures...')
    if args.traverse_directory:
        inp_files = list(sourmash_args.traverse_find_sigs(args.query))
    else:
        inp_files = list(args.query)

    # for each query, gather all the hashvals across databases
    total_count = 0
    n = 0
    total_n = len(inp_files)
    hashvals = defaultdict(int)
    for query_filename in inp_files:
        n += 1
        for query_sig in sourmash_lib.load_signatures(query_filename,
                                                      ksize=ksize):
            notify(u'\r\033[K', end=u'')
            notify('... loading {} (file {} of {})',
                   query_sig.name(),
                   n,
                   total_n,
                   end='\r')
            total_count += 1

            mh = query_sig.minhash.downsample_scaled(scaled)
            for hashval in mh.get_mins():
                hashvals[hashval] += 1

    notify(u'\r\033[K', end=u'')
    notify('loaded {} signatures from {} files total.', total_count, n)

    # get the full counted list of lineage counts in this signature
    lineage_counts = summarize(hashvals, dblist, args.threshold)

    # output!
    total = float(len(hashvals))
    for (lineage, count) in lineage_counts.items():
        if lineage:
            lineage = lca_utils.zip_lineage(lineage, truncate_empty=True)
            lineage = ';'.join(lineage)
        else:
            lineage = '(root)'

        pcnt = count / total * 100.
        pcnt = '{:.1f}%'.format(pcnt)

        print_results('{:5} {:>5}   {}'.format(pcnt, count, lineage))

    # CSV:
    if args.output:
        w = csv.writer(args.output)
        headers = ['count'] + list(lca_utils.taxlist())
        w.writerow(headers)

        for (lineage, count) in lineage_counts.items():
            debug('lineage:', lineage)
            row = [count] + lca_utils.zip_lineage(lineage)
            w.writerow(row)
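
# A usage sketch with hypothetical file names; per-lineage percentages are
# printed to stdout, and the full counts can also be written as CSV via -o:
summarize_main(['--db', 'genbank.lca.json',
                '--query', 'metagenome.sig',
                '-o', 'summary.csv'])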
Example #8
def compare_csv(args):
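    """
    Compare two taxonomy spreadsheets and report lineage disagreements.
    """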
    p = argparse.ArgumentParser()
    p.add_argument('csv1', help='taxonomy spreadsheet output by classify')
    p.add_argument('csv2', help='custom taxonomy spreadsheet')
    p.add_argument('-d', '--debug', action='store_true')
    p.add_argument('-C',
                   '--start-column',
                   default=2,
                   type=int,
                   help='column at which taxonomic assignments start')
    p.add_argument('--tabs',
                   action='store_true',
                   help='input spreadsheet is tab-delimited (default: commas)')
    p.add_argument('--no-headers',
                   action='store_true',
                   help='no headers present in taxonomy spreadsheet')
    p.add_argument('-f', '--force', action='store_true')
    args = p.parse_args(args)

    if args.start_column < 2:
        error('error, --start-column cannot be less than 2')
        sys.exit(-1)

    if args.debug:
        set_debug(args.debug)

    # first, load classify-style spreadsheet
    notify('loading classify output from: {}', args.csv1)
    assignments0, num_rows0 = load_taxonomy_assignments(args.csv1,
                                                        start_column=3)

    notify('loaded {} distinct lineages, {} rows',
           len(set(assignments0.values())), num_rows0)
    notify('----')

    # next, load custom taxonomy spreadsheet
    delimiter = ','
    if args.tabs:
        delimiter = '\t'

    notify('loading custom spreadsheet from: {}', args.csv2)
    assignments, num_rows = load_taxonomy_assignments(
        args.csv2,
        delimiter=delimiter,
        start_column=args.start_column,
        use_headers=not args.no_headers,
        force=args.force)
    notify('loaded {} distinct lineages, {} rows',
           len(set(assignments.values())), num_rows)

    # now, compute basic differences:
    missing_1 = set(assignments0.keys()) - set(assignments.keys())
    missing_2 = set(assignments.keys()) - set(assignments0.keys())
    if missing_2:
        notify('missing {} assignments in classify spreadsheet.',
               len(missing_2))
    if missing_1:
        notify('missing {} assignments in custom spreadsheet.', len(missing_1))
    if missing_1 or missing_2:
        notify('(these will not be evaluated any further)')
    else:
        notify('note: all IDs are in both spreadsheets!')

    # next, look at differences in lineages
    common = set(assignments0.keys())
    common.intersection_update(assignments.keys())

    n_total = 0
    n_different = 0
    n_compat = 0
    n_incompat = 0
    incompat_rank = defaultdict(int)
    for k in common:
        n_total += 1
        v0 = assignments0[k]
        v1 = assignments[k]
        if v0 != v1:
            n_different += 1
            tree = lca_utils.build_tree([v0])
            lca_utils.build_tree([v1], tree)

            lca, reason = lca_utils.find_lca(tree)
            if reason == 0:  # compatible lineages
                n_compat += 1
                print_results("{},compatible,{}", k,
                              ";".join(zip_lineage(lca)))
            else:
                n_incompat += 1
                print_results("{},incompatible,{}", k,
                              ";".join(zip_lineage(lca)))
                rank = next(iter(lca_utils.taxlist()))
                if lca:
                    rank = lca[-1].rank
                incompat_rank[rank] += 1

    notify("{} total assignments, {} differ between spreadsheets.", n_total,
           n_different)
    notify("{} are compatible (one lineage is ancestor of another.", n_compat)
    notify("{} are incompatible (there is a disagreement in the trees).",
           n_incompat)

    if n_incompat:
        for rank in lca_utils.taxlist():
            notify('{} incompatible at rank {}', incompat_rank[rank], rank)
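
# A usage sketch with hypothetical file names: csv1 is classify-style output
# (lineages start at column 3), csv2 is a custom taxonomy spreadsheet:
compare_csv(['classify-output.csv', 'custom-taxonomy.csv'])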