Example #1
def action(args):
    engine = create_engine(args.url, echo=args.verbosity > 2)
    tax = Taxonomy(engine, schema=args.schema)

    taxids = set()

    if args.taxids:
        if os.access(args.taxids, os.F_OK):
            for line in getlines(args.taxids):
                taxids.update(set(re.split(r'[\s,;]+', line)))
        else:
            taxids.update([x.strip()
                           for x in re.split(r'[\s,;]+', args.taxids)])

    if args.seq_info:
        with args.seq_info:
            reader = csv.DictReader(args.seq_info)
            taxids.update(frozenset(i['tax_id']
                                    for i in reader if i['tax_id']))

    writer = csv.writer(args.out)

    for t in taxids:
        try:
            tax._node(t)
        except ValueError:
            # Check for merged
            m = tax._get_merged(t)
            if m and m != t:
                writer.writerow([t, m])
            else:
                writer.writerow([t, None])

    engine.dispose()
    return 0
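Below is a minimal invocation sketch, not part of the original example: it assumes the imports and helpers used by action() above are in scope, and the Namespace attributes simply mirror the ones the function reads (url, schema, verbosity, taxids, seq_info, out). The database URL and output path are placeholders.

from argparse import Namespace

# Hypothetical driver for action() above; rows are written only for tax_ids
# missing from the nodes table (with their merged replacement, if any).
args = Namespace(
    url='sqlite:///taxonomy.db',   # placeholder taxonomy database URL
    schema=None,
    verbosity=0,
    taxids='1280, 1239',           # inline ids; a file path also works
    seq_info=None,                 # or an open seq_info CSV handle
    out=open('merged_ids.csv', 'w'),
)
exit_code = action(args)
args.out.close()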
Example #2
def action(args):
    engine = sqlalchemy.create_engine(args.url, echo=False)
    tax = Taxonomy(engine, schema=args.schema)

    names = []
    if args.infile:
        names += [line.split('#', 1)[0].strip()
                  for line in args.infile
                  if line.strip() and not line.startswith('#')]

    if args.names:
        names += [x.strip() for x in args.names.split(',')]

    writer = csv.writer(args.outfile)
    writer.writerow(['input', 'tax_name', 'tax_id', 'rank'])

    found = 0
    for name in names:
        try:
            tax_id, tax_name, is_primary = tax.primary_from_name(name)
        except ValueError:
            if args.include_unmatched:
                writer.writerow([name, None, None, None])
        else:
            found += 1
            parent, rank = tax._node(tax_id)
            writer.writerow([name, tax_name, tax_id, rank])

    log.warning('found {} of {} names'.format(found, len(names)))
Example #3
def test_child_of():
    engine = create_engine(
        'sqlite:///../testfiles/small_taxonomy.db', echo=False)
    tax = Taxonomy(engine, taxtastic.ncbi.RANKS)
    assert tax.child_of(None) is None
    assert tax.child_of('1239') == '91061'
    assert tax.children_of('1239', 2) == ['91061', '186801']
Example #4
def test_nary_subtree():
    engine = create_engine(
        'sqlite:///../testfiles/small_taxonomy.db', echo=False)
    tax = Taxonomy(engine, taxtastic.ncbi.RANKS)
    assert tax.nary_subtree(None) is None
    t = tax.nary_subtree('1239')
    assert t == ['1280', '372074', '1579', '1580',
                 '37734', '420335', '166485', '166486']
Example #5
def action(args):

    engine = sqlalchemy.create_engine(args.url, echo=args.verbosity > 3)
    tax = Taxonomy(engine, schema=args.schema)

    writer = csv.writer(args.outfile)
    for row in tax._get_lineage_table(args.tax_ids):
        writer.writerow(row)

    engine.dispose()
Example #6
def action(args):
    engine = sqlalchemy.create_engine(args.url, echo=args.verbosity > 2)
    tax = Taxonomy(engine, schema=args.schema)

    records = list(yaml.load_all(args.new_nodes))

    log.info('adding new nodes')
    retval = None
    for rec in records:
        try:
            record_type = rec.pop('type')
            if record_type not in {'node', 'name'}:
                raise ValueError
        except (KeyError, ValueError):
            log.error(
                ('Error in record for tax_id {tax_id}: "type" is '
                 'required and must be one of "node" or "name"').format(**rec))
            retval = 1
            continue

        tax_id = rec['tax_id']
        rec['source_name'] = rec.get('source_name') or args.source_name

        try:
            if record_type == 'node':
                if not rec['source_name']:
                    log.error('Error: record has no source_name:\n{}'.format(
                        pprint.pformat(rec)))
                    raise ValueError
                if tax.has_node(tax_id):
                    log.info('updating *node* "{tax_id}"'.format(**rec))
                    tax.update_node(**rec)
                else:
                    log.info('new *node* "{tax_id}"'.format(**rec))
                    tax.add_node(**rec)
            elif record_type == 'name':
                for name in rec['names']:
                    name['tax_id'] = tax_id
                    # source_name may be provided at the record or name level
                    name['source_name'] = name.get(
                        'source_name') or rec['source_name']
                    if not name['source_name']:
                        log.error(
                            'Error: record has no source_name:\n {}'.format(
                                pprint.pformat(rec)))
                        raise ValueError

                    log.info('new *name* for "{tax_id}": "{tax_name}"'.format(
                        **name))
                    tax.add_name(**name)
        except (ValueError, TypeError):
            log.error('Error in record with tax_id {}'.format(rec['tax_id']))
            log.error(''.join(traceback.format_exception(*sys.exc_info())))
            retval = 1

    engine.dispose()

    if retval:
        log.error('Error: some records were malformed')
    return retval
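The records consumed by this loop are YAML documents; a hedged sketch of their shape follows. Only the 'type', 'tax_id', 'source_name', and 'names'/'tax_name' keys are taken from the code above; 'parent_id' and 'rank' are assumed fields passed through to tax.add_node() and may differ from the actual schema.

import yaml

# Two hypothetical records: one node record and one name record. Field values
# (ids, rank, names) are placeholders; parent_id and rank are assumptions.
docs = """\
type: node
tax_id: stapha_sg
parent_id: "1279"
rank: species group
source_name: custom_source
---
type: name
tax_id: "1280"
names:
  - tax_name: example alternate name
    source_name: custom_source
"""
records = list(yaml.safe_load_all(docs))  # same shape as yaml.load_all(args.new_nodes)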
Example #7
def action(args):
    log.info('reading tax_ids')
    if args.tax_ids:
        tax_ids = set(args.tax_ids)
    elif args.tax_id_file:
        tax_ids = set(args.tax_id_file.read().split())
    elif args.seq_info:
        tax_ids = {row['tax_id'] for row in csv.DictReader(args.seq_info)}
    else:
        sys.exit('Error: no tax_ids were specified')

    engine = sqlalchemy.create_engine(args.url, echo=args.verbosity > 3)
    tax = Taxonomy(engine, schema=args.schema)

    rows = tax._get_lineage_table(tax_ids)

    log.info('grouping lineages')
    all_ranks = set()
    taxtable = {}
    for tax_id, grp in groupby(rows, lambda row: row[0]):
        ranks, tax_rows = as_taxtable_rows(grp, seen=taxtable)
        taxtable.update(dict(tax_rows))
        all_ranks |= set(ranks)

    # guppy requires that tax_id == parent_id for the root node;
    # identify the root node by calculating an arbitrary lineage.
    root_id = tax.lineage(tax_id)['root']
    taxtable[root_id]['parent_id'] = root_id

    sorted_ranks = sorted(all_ranks, key=order_ranks(tax.ranks[::-1]))

    # guppy requires this column order
    fieldnames = ['tax_id', 'parent_id', 'rank', 'tax_name'] + sorted_ranks

    output = list(taxtable.values())
    log.info('sorting lineages')

    output = sorted(
        output,
        # key=getitems(*sorted_ranks)
        key=lambda row: tuple(row.get(rank) or '' for rank in sorted_ranks)
    )

    log.info('writing taxtable')
    writer = csv.DictWriter(
        args.outfile, fieldnames=fieldnames, quoting=csv.QUOTE_ALL)
    writer.writeheader()
    writer.writerows(output)
Example #8
def action(args):
    taxids = args.tax_ids
    # Add taxids from input file
    if args.input:
        with args.input as h:
            for l in h:
                val = l.split('#')[0].strip()
                taxids.append(val)
    # Connect to the taxonomy
    engine = create_engine('sqlite:///%s' % args.taxdb, echo=False)
    tax = Taxonomy(engine, ncbi.RANKS)
    # Finally, real work...
    if args.cut:
        company = lonely.lonely_company(tax, taxids)
    else:
        company = lonely.solid_company(tax, taxids)
    txt = ""
    for t in company:
        txt += "%s\n" % (t if t else "")
    if args.out:
        with open(args.out, 'w') as h:
            h.write(txt)
    else:
        print(txt)
    return 0
Example #9
def action(args):
    engine = create_engine(
        'sqlite:///%s' % args.database_file, echo=args.verbosity > 2)
    tax = Taxonomy(engine, ncbi.ranks)

    taxids = set()

    if args.taxids:
        if os.access(args.taxids, os.F_OK):
            for line in getlines(args.taxids):
                taxids.update(set(re.split(r'[\s,;]+', line)))
        else:
            taxids.update([x.strip()
                           for x in re.split(r'[\s,;]+', args.taxids)])

    if args.taxnames:
        for taxname in getlines(args.taxnames):
            for name in re.split(r'\s*[,;]\s*', taxname):
                tax_id, primary_name, is_primary = tax.primary_from_name(
                    name.strip())
                taxids.add(tax_id)

    if args.seq_info:
        with args.seq_info:
            reader = csv.DictReader(args.seq_info)
            taxids.update(frozenset(i['tax_id']
                                    for i in reader if i['tax_id']))

    # Before digging into lineages, make sure all the taxids exist in
    # the taxonomy database.
    valid_taxids = True
    for t in taxids:
        try:
            tax._node(t)
        except KeyError:
            # Check for merged
            m = tax._get_merged(t)
            if m and m != t:
                msg = ("Taxid {0} has been replaced by {1}. "
                       "Please update your records").format(t, m)
                print(msg, file=sys.stderr)
            else:
                print("Taxid %s not found in taxonomy." % t, file=sys.stderr)
            valid_taxids = False
    if not valid_taxids:
        print("Some taxids were invalid.  Exiting.", file=sys.stderr)
        return 1  # exits with code 1

    # Extract all the taxids to be exported in the CSV file.
    taxids_to_export = set()
    for t in taxids:
        taxids_to_export.update([y for (x, y) in tax._get_lineage(t)])

    tax.write_table(taxids_to_export, csvfile=args.out_file, full=args.full)

    engine.dispose()
    return 0
Example #10
def action(args):
    engine = sqlalchemy.create_engine(args.url, echo=args.verbosity > 2)
    tax = Taxonomy(engine, schema=args.schema)

    records = list(yaml.load_all(args.new_nodes))

    log.info('adding new nodes')
    retval = None
    for rec in records:
        try:
            record_type = rec.pop('type')
            if record_type not in {'node', 'name'}:
                raise ValueError
        except (KeyError, ValueError):
            log.error(('Error in record for tax_id {tax_id}: "type" is '
                       'required and must be one of "node" or "name"').format(**rec))
            retval = 1
            continue

        tax_id = rec['tax_id']
        rec['source_name'] = rec.get('source_name') or args.source_name

        try:
            if record_type == 'node':
                if not rec['source_name']:
                    log.error('Error: record has no source_name:\n{}'.format(
                        pprint.pformat(rec)))
                    raise ValueError
                if tax.has_node(tax_id):
                    log.info('updating *node* "{tax_id}"'.format(**rec))
                    tax.update_node(**rec)
                else:
                    log.info('new *node* "{tax_id}"'.format(**rec))
                    tax.add_node(**rec)
            elif record_type == 'name':
                for name in rec['names']:
                    name['tax_id'] = tax_id
                    # source_name may be provided at the record or name level
                    name['source_name'] = name.get('source_name') or rec['source_name']
                    if not name['source_name']:
                        log.error(
                            'Error: record has no source_name:\n {}'.format(
                                pprint.pformat(rec)))
                        raise ValueError

                    log.info('new *name* for "{tax_id}": "{tax_name}"'.format(**name))
                    tax.add_name(**name)
        except (ValueError, TypeError):
            log.error('Error in record with tax_id {}'.format(rec['tax_id']))
            log.error(''.join(traceback.format_exception(*sys.exc_info())))
            retval = 1

    engine.dispose()

    if retval:
        log.error('Error: some records were malformed')
    return retval
Example #11
def action(args):
    engine = sqlalchemy.create_engine(args.url, echo=args.verbosity > 2)
    tax = Taxonomy(engine, schema=args.schema)
    nodes = list(get_new_nodes(args.new_nodes))

    # check if there are any new ranks and exit if needed
    node_ranks = set(n['rank'] for n in nodes)
    for r in node_ranks:
        if r not in tax.ranks:
            msg = 'adding new ranks to taxonomy is not yet supported'
            raise TaxonIntegrityError(msg)

    ranksdict = tax.ranksdict()
    ranksdict.update(dict([(n['tax_id'], n['rank']) for n in nodes]))
    nodes = [verify_rank_integrity(n, ranksdict, tax.ranks) for n in nodes]
    nodes = [verify_lineage_integrity(n, ranksdict, tax.ranks, tax) for n in nodes]

    log.info('adding new nodes')
    for d in nodes:
        if args.source_name:
            d['source_name'] = args.source_name
            try:
                tax.add_node(**d)
            except sqlalchemy.exc.IntegrityError:
                if args.update:
                    tax.update_node(**d)
                else:
                    log.warn('node with tax_id %(tax_id)s already exists' % d)
            else:
                log.info('added new node with tax_id %(tax_id)s' % d)

    engine.dispose()
Example #12
def action(args):

    dbfile = args.dbfile
    taxnames_file = args.taxnames_file
    taxnames = args.taxnames

    outfile = args.outfile

    engine = create_engine('sqlite:///%s' % dbfile, echo=False)
    tax = Taxonomy(engine, ncbi.RANKS)

    names = []
    if taxnames_file:
        names += [line.split('#', 1)[0].strip()
                  for line in taxnames_file
                  if line.strip() and not line.startswith('#')]

    if taxnames:
        names += [x.strip() for x in taxnames.split(',')]

    taxa = {}
    for name in set(names):
        tax_id, tax_name, is_primary, rank, note = '', '', '', '', ''

        try:
            tax_id, tax_name, is_primary = tax.primary_from_name(name)
        except ValueError:
            note = 'not found'
        else:
            parent, rank = tax._node(tax_id)
            note = '' if is_primary else 'not primary'

        if note:
            log.warning(
                '%(name)20s | %(tax_id)7s %(tax_name)20s %(note)s' % locals())

        if rank == 'species':
            taxa[tax_id] = dict(tax_id=tax_id, tax_name=tax_name, rank=rank)
        else:
            keys, rows = get_children(engine, [tax_id])
            taxa.update(dict((row['tax_id'], row) for row in rows))

    for d in sorted(taxa.values(), key=lambda x: x['tax_name']):
        outfile.write('%(tax_id)s # %(tax_name)s\n' % d)
Example #13
def test_species_below():
    engine = create_engine('sqlite:///../testfiles/taxonomy.db', echo=False)
    tax = Taxonomy(engine, taxtastic.ncbi.RANKS)
    t = tax.species_below('1239')
    parent_id, rank = tax._node(t)
    for t in [None, '1239', '186801', '1117']:
        s = tax.species_below(t)
        assert t is None or s is None or tax.is_ancestor_of(s, t)
        assert s is None or tax.rank(s) == 'species'
Example #14
def action(args):

    dbname = args.database_file
    new_nodes = args.new_nodes
    source_name = args.source_name

    engine = create_engine('sqlite:///%s' % dbname, echo=args.verbosity > 2)
    tax = Taxonomy(engine, ncbi.RANKS)

    log.warning('adding new nodes')
    nodes = get_new_nodes(new_nodes)
    for d in nodes:
        if source_name:
            d['source_name'] = source_name
            try:
                tax.add_node(**d)
            except IntegrityError:
                log.info('node with tax_id %(tax_id)s already exists' % d)
            else:
                log.info('added new node with tax_id %(tax_id)s' % d)

    engine.dispose()
Example #15
def action(args):

    dbname = args.database_file
    new_nodes = args.new_nodes
    source_name = args.source_name

    engine = create_engine('sqlite:///%s' % dbname, echo=args.verbosity > 2)
    tax = Taxonomy(engine, ncbi.RANKS)

    log.warning('adding new nodes')
    nodes = get_new_nodes(new_nodes)
    for d in nodes:
        if source_name:
            d['source_name'] = source_name
            try:
                tax.add_node(**d)
            except IntegrityError:
                log.info('node with tax_id %(tax_id)s already exists' % d)
            else:
                log.info('added new node with tax_id %(tax_id)s' % d)

    engine.dispose()
Example #16
    def test_new_nodes05(self):
        args = ['add_nodes', self.dbname, data_path('staph_species_group2.yml')]
        self.assertZeroExitStatus(main(args))

        tax = Taxonomy(sqlalchemy.create_engine('sqlite:///' + self.dbname))
        with tax.engine.connect() as con:
            result = con.execute(
                'select * from nodes where parent_id = ?', ('stapha_sg',))
            keys = list(result.keys())
            nodes = [dict(list(zip(keys, row))) for row in result.fetchall()]

        self.assertEqual(len(nodes), 5)
        self.assertEqual([row['source_id'] for row in nodes], [2] * len(nodes))
Example #17
def test_rank_and_parent():
    engine = create_engine('sqlite:///../testfiles/taxonomy.db', echo=False)
    tax = Taxonomy(engine, taxtastic.ncbi.RANKS)
    assert tax.rank(None) is None
    assert tax.rank('1239') == 'phylum'
    assert tax.rank('1280') == 'species'
    assert tax.parent_id(None) is None
    assert tax.parent_id('1239') == '2'
Example #18
def action(args):
    engine = create_engine('sqlite:///%s' %
                           args.database_file, echo=args.verbosity > 2)
    tax = Taxonomy(engine, ncbi.RANKS)

    taxids = set()

    if args.taxids:
        if os.access(args.taxids, os.F_OK):
            for line in getlines(args.taxids):
                taxids.update(set(re.split(r'[\s,;]+', line)))
        else:
            taxids.update([x.strip()
                           for x in re.split(r'[\s,;]+', args.taxids)])

    if args.seq_info:
        with args.seq_info:
            reader = csv.DictReader(args.seq_info)
            taxids.update(frozenset(i['tax_id']
                                    for i in reader if i['tax_id']))

    writer = csv.writer(args.out_file)

    for t in taxids:
        try:
            tax._node(t)
        except ValueError:
            # Check for merged
            m = tax._get_merged(t)
            if m and m != t:
                writer.writerow([t, m])
            else:
                writer.writerow([t, None])

    engine.dispose()
    return 0
Example #19
def test_lonely_company():
    if not os.path.exists('../testfiles/taxonomy.db'):
        return
    engine = create_engine('sqlite:///../testfiles/taxonomy.db', echo=False)
    tax = Taxonomy(engine, ncbi.ranks)
    print(tax.lineage(49896))
    lonely_tax_ids = [None, 816, 1239]
    for a, t in zip(lonely_tax_ids, lonely_company(tax, lonely_tax_ids)):
        assert a is None or t is None or tax.is_ancestor_of(t, tax.parent_id(a))
        assert t is None or tax.rank(t) == 'species'
Example #20
def action(a):
    # Start database
    e = sqlalchemy.create_engine('sqlite:///{0}'.format(a.database))
    taxonomy = Taxonomy(e, ncbi.ranks)
    is_classified = species_is_classified_fn(taxonomy)

    with a.infile as fp, \
            a.output as out_fp, \
            a.fasta_out as fasta_fp:
        records = SeqIO.parse(fp, 'genbank')
        records = util.Counter(records, prefix='Record ')
        taxa = ((record, ncbi_extract_genbank.tax_of_genbank(record))
                for record in records)
        taxa = ((record, tax_id, record.annotations['organism'])
                for record, tax_id in taxa)
        taxa = ((record, update_taxid(tax_id, taxonomy, organism))
                for record, tax_id, organism in taxa)

        writer = csv.writer(out_fp,
                            lineterminator='\n',
                            quoting=csv.QUOTE_NONNUMERIC)
        if a.header:
            header = ('version', 'seqname', 'tax_id', 'accession',
                      'description', 'length', 'ambig_count', 'is_type',
                      'rdp_lineage', 'taxid_classified')
            writer.writerow(header)

        for record, tax_id in taxa:
            accession, version = ncbi_extract_genbank.accession_version_of_genbank(
                record)
            rdp_lineage = ';'.join(record.annotations.get('taxonomy', []))
            rdp_lineage = rdp_lineage.replace('"', '')

            row = (version, record.name, tax_id, accession, record.description,
                   len(record), count_ambiguous(str(record.seq)),
                   str(ncbi_extract_genbank.is_type(record)).upper(),
                   rdp_lineage, is_classified(tax_id))

            writer.writerow(row)
            SeqIO.write([transform_id(record)], fasta_fp, 'fasta')

    logging.info("Total records: %d", records.count)
Example #21
def action(args):
    engine = create_engine('sqlite:///%s' % args.database_file,
                           echo=args.verbosity > 2)
    tax = Taxonomy(engine, ncbi.RANKS)

    if any([args.taxids, args.taxnames, args.seq_info]):
        taxids = set()
        if args.taxids:
            if os.access(args.taxids, os.F_OK):
                for line in getlines(args.taxids):
                    taxids.update(set(re.split(r'[\s,;]+', line)))
            else:
                taxids.update(
                    [x.strip() for x in re.split(r'[\s,;]+', args.taxids)])

        if args.seq_info:
            with args.seq_info:
                reader = csv.DictReader(args.seq_info)
                taxids.update(
                    frozenset(i['tax_id'] for i in reader if i['tax_id']))

        if not are_valid(taxids, tax):
            return "Some taxids were invalid.  Exiting."

        if args.taxnames:
            for taxname in getlines(args.taxnames):
                for name in re.split(r'\s*[,;]\s*', taxname):
                    tax_id, primary_name, is_primary = tax.primary_from_name(
                        name.strip())
                    taxids.add(tax_id)
    else:
        taxids = set(tax.tax_ids())

    # Extract all the taxids to be exported in the CSV file.
    taxids_to_export = set()
    for t in taxids:
        taxids_to_export.update([y for (x, y) in tax._get_lineage(t)])

    tax.write_table(taxids_to_export, csvfile=args.out_file, full=args.full)

    engine.dispose()
    return 0
Example #22
def action(args):
    engine = create_engine(
        'sqlite:///%s' % args.database_file, echo=args.verbosity > 2)
    tax = Taxonomy(engine, ncbi.RANKS)

    if any([args.taxids, args.taxnames, args.seq_info]):
        taxids = set()
        if args.taxids:
            if os.access(args.taxids, os.F_OK):
                for line in getlines(args.taxids):
                    taxids.update(set(re.split(r'[\s,;]+', line)))
            else:
                taxids.update(
                    [x.strip() for x in re.split(r'[\s,;]+', args.taxids)])

        if args.seq_info:
            with args.seq_info:
                reader = csv.DictReader(args.seq_info)
                taxids.update(
                    frozenset(i['tax_id'] for i in reader if i['tax_id']))

        if not are_valid(taxids, tax):
            return "Some taxids were invalid.  Exiting."

        if args.taxnames:
            for taxname in getlines(args.taxnames):
                for name in re.split(r'\s*[,;]\s*', taxname):
                    tax_id, primary_name, is_primary = tax.primary_from_name(
                        name.strip())
                    taxids.add(tax_id)
    else:
        taxids = set(tax.tax_ids())

    # Extract all the taxids to be exported in the CSV file.
    taxids_to_export = set()
    for t in taxids:
        taxids_to_export.update([y for (x, y) in tax._get_lineage(t)])

    tax.write_table(taxids_to_export, csvfile=args.out_file, full=args.full)

    engine.dispose()
    return 0
Example #23
def action(args):
    reader = csv.DictReader(args.infile)
    fieldnames = reader.fieldnames
    taxid_column = args.taxid_column
    drop = args.unknown_action == 'drop'
    error = args.unknown_action == 'error'
    ignore = args.unknown_action == 'ignore'

    if taxid_column not in fieldnames:
        raise ValueError("No column " + args.taxid_column)

    # TODO: remove unless --use-names is implemented
    # if args.use_names and args.name_column not in fieldnames:
    #     raise ValueError("No column " + args.name_column)

    writer = csv.DictWriter(args.outfile,
                            fieldnames=fieldnames,
                            quoting=csv.QUOTE_ALL)
    writer.writeheader()

    if args.unknowns:
        unknowns = csv.DictWriter(args.unknowns,
                                  fieldnames=fieldnames,
                                  quoting=csv.QUOTE_ALL)
        unknowns.writeheader()

    engine = sqlalchemy.create_engine(args.url, echo=args.verbosity > 3)
    tax = Taxonomy(engine, taxtastic.ncbi.RANKS, schema=args.schema)

    with tax.engine.connect() as con:
        log.info('reading table merged')
        result = con.execute(
            'select old_tax_id, new_tax_id from {merged}'.format(
                merged=tax.merged))
        mergedict = dict(result.fetchall())

        log.info('reading tax_ids from table {nodes}'.format(nodes=tax.nodes))
        result = con.execute(
            'select tax_id from {nodes}'.format(nodes=tax.nodes))
        all_tax_ids = {x[0] for x in result.fetchall()}

    log.info('reading input file')
    for row in reader:
        tax_id = row[taxid_column]

        if tax_id in all_tax_ids:
            pass  # write row without modification
        elif tax_id in mergedict:
            row[taxid_column] = mergedict[tax_id]
        else:  # tax_id is unknown
            if args.unknowns:
                unknowns.writerow(row)

            if ignore:
                pass
            elif drop:
                continue
            elif error:
                sys.exit('Error: tax_id {} is unknown'.format(tax_id))

        writer.writerow(row)
Example #24
    def setUp(self):
        self.engine = create_engine('sqlite:///%s' % self.dbname, echo=echo)
        self.tax = Taxonomy(self.engine, taxtastic.ncbi.RANKS)
Example #25
def test_is_below():
    assert Taxonomy.is_below('species', 'family')
    assert Taxonomy.is_below('family', 'kingdom')
    assert not Taxonomy.is_below('kingdom', 'family')
    assert Taxonomy.ranks_below('species') == []
    assert Taxonomy.ranks_below('family') == ['species', 'genus']
Example #26
def action(args):
    engine = sqlalchemy.create_engine(args.url, echo=args.verbosity > 2)
    tax = Taxonomy(engine, schema=args.schema)

    with engine.connect() as con:
        # TODO: need to order nodes so that parents are always created first

        cmd = """
        select nodes.*, source.name as source_name
        from {nodes}
        join {source} on nodes.source_id = source.id
        where source.name = {x}
        """.format(x=tax.placeholder, nodes=tax.nodes, source=tax.source)

        result = con.execute(cmd, (args.source_name, ))
        keys = list(result.keys())
        nodes = [clean_dict(keys, vals) for vals in result.fetchall()]

        # get the complete lineage for each node, and provide an
        # ordering for all nodes so that children may be placed after
        # parents.
        tax_ids = list(map(itemgetter('tax_id'), nodes))
        lineages = tax._get_lineage_table(tax_ids)
        ordering = {}
        for i, lineage in enumerate(lineages):
            tax_id = lineage[1]
            if tax_id not in ordering:
                ordering[tax_id] = i

        nodes = sorted(nodes, key=lambda n: ordering[n['tax_id']])

        cmd = """
        select names.*, source.name as source_name
        from {names}
        join {source} on names.source_id = source.id
        where source.name = {x}
        """.format(x=tax.placeholder, names=tax.names, source=tax.source)

        result = con.execute(cmd, (args.source_name, ))
        keys = list(result.keys())
        names = [clean_dict(keys, vals) for vals in result.fetchall()]
        namedict = {
            key: list(grp)
            for key, grp in groupby(names, itemgetter('tax_id'))
        }

        for node in nodes:
            node['type'] = 'node'
            tax_id = node['tax_id']
            if tax_id in namedict:
                node['names'] = namedict.pop(tax_id)

        yaml.safe_dump_all(nodes,
                           args.outfile,
                           default_flow_style=False,
                           explicit_start=True,
                           indent=2)

        # prepare remaining names
        remaining_names = []
        for tax_id, names in list(namedict.items()):
            for name in names:
                del name['tax_id']

            remaining_names.append({
                'tax_id': tax_id,
                'type': 'name',
                'names': names
            })

        yaml.safe_dump_all(remaining_names,
                           args.outfile,
                           default_flow_style=False,
                           explicit_start=True,
                           indent=2)
Example #27
def action(args):
    rows = pandas.read_csv(args.infile, dtype='str')
    columns = rows.columns.tolist()  # preserve column order

    if args.taxid_column not in columns:
        raise ValueError("No column " + args.taxid_column)

    if args.name_column:
        if args.name_column not in columns:
            msg = '"No "' + args.name_column + '" column'
            raise ValueError(msg)

    con = 'sqlite:///{0}'.format(args.database_file)
    e = sqlalchemy.create_engine(con)
    tax = Taxonomy(e, ncbi.RANKS)

    merged = pandas.read_sql_table('merged', con, index_col='old_tax_id')
    log.info('updating tax_ids')
    rows = rows.join(merged, on=args.taxid_column)

    # overwrite tax_ids where there is a new_tax_id
    inew_tax_ids = ~rows['new_tax_id'].isnull()
    rows.loc[inew_tax_ids, args.taxid_column] = \
        rows[inew_tax_ids]['new_tax_id']
    rows = rows.drop('new_tax_id', axis=1)

    log.info('loading names table')
    names = pandas.read_sql_table('names',
                                  con,
                                  columns=['tax_id', 'tax_name', 'is_primary'])

    if args.name_column:
        """
        use the args.name_column to do a string comparison with
        names.tax_name column to find a suitable tax_id
        """
        unknowns = rows[~rows[args.taxid_column].isin(names['tax_id'])]

        if not unknowns.empty:
            """
            Take any tax_id associated with a string match
            to tax_name prioritizing is_primary=True
            """
            unknowns = unknowns.drop(args.taxid_column, axis=1)
            names = names.sort_values('is_primary', ascending=False)
            names = names.drop_duplicates(subset='tax_name', keep='first')
            names = names.set_index('tax_name')
            found = unknowns.join(names, on=args.name_column, how='inner')
            rows.loc[found.index, args.taxid_column] = found['tax_id']

    if not args.ignore_unknowns:
        unknowns = rows[~rows[args.taxid_column].isin(names['tax_id'])]
        if args.unknowns:
            """
            Output unknown tax_ids
            """
            unknowns.to_csv(args.unknowns,
                            index=False,
                            columns=columns,
                            quoting=csv.QUOTE_NONNUMERIC)
        elif not unknowns.empty:
            raise ValueError('Unknown or missing tax_ids present')

        rows = rows[~rows.index.isin(unknowns.index)]

    if args.taxid_classified:
        """
        """
        if 'taxid_classified' in columns:
            rows = rows.drop('taxid_classified', axis=1)
        else:
            columns.append('taxid_classified')

        def is_classified(row):
            row['taxid_classified'] = species_is_classified(
                row[args.taxid_column], tax)
            return row

        msg = 'validating tax_ids:'
        rows = utils.apply_df_status(is_classified, rows, msg)

    if args.append_lineage:
        """
        Append a column from the taxonomy to seq_info
        """
        if args.append_lineage in columns:
            rows = rows.drop(args.append_lineage, axis=1)
        else:
            columns.append(args.append_lineage)

        def add_rank_column(row):
            try:
                lineage = tax.lineage(row[args.taxid_column])
            except ValueError as e:
                log.warn(e)
                lineage = {}
            row[args.append_lineage] = lineage.get(args.append_lineage, None)
            return row

        msg = 'appending {} column'.format(args.append_lineage)
        rows = utils.apply_df_status(add_rank_column, rows, msg)

    # output seq_info with new tax_ids
    rows.to_csv(args.out_file,
                index=False,
                columns=columns,
                quoting=csv.QUOTE_NONNUMERIC)
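For context, here is a hedged sketch of the kind of seq_info input this action rewrites; only the tax_id column name (args.taxid_column) is implied by the code, while the other column and values are illustrative placeholders.

import io
import pandas

# Hypothetical seq_info: 'tax_id' stands in for args.taxid_column; 'seqname'
# is illustrative. tax_ids found in the 'merged' table are rewritten to their
# new_tax_id, and optional columns (taxid_classified, a lineage rank) may be appended.
seq_info = io.StringIO(
    'seqname,tax_id\n'
    'seq1,1280\n'
    'seq2,1313\n'
)
rows = pandas.read_csv(seq_info, dtype='str')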
Example #28
def action(args):
    engine = sqlalchemy.create_engine(args.url, echo=args.verbosity > 3)

    ranks_df = pandas.read_sql_table('ranks', engine, schema=args.schema)
    # most operations in this script require ordering from 'root' down
    ranks_df = ranks_df.sort_values(by='height', ascending=False)
    ranks = ranks_df['rank'].tolist()

    nodes = None
    subset_ids = set()

    # check tax_ids subsets first before building taxtable
    if any([args.tax_ids, args.taxnames, args.seq_info]):
        tax = Taxonomy(engine, schema=args.schema)
        if args.tax_ids:
            if os.access(args.tax_ids, os.F_OK):
                for line in getlines(args.tax_ids):
                    subset_ids.update(set(re.split(r'[\s,;]+', line)))
            else:
                subset_ids.update(
                    [x.strip() for x in re.split(r'[\s,;]+', args.tax_ids)])

        if args.seq_info:
            log.info('reading tax_ids ' + args.seq_info.name)
            with args.seq_info:
                reader = csv.DictReader(args.seq_info)
                subset_ids.update(
                    frozenset(i['tax_id'] for i in reader if i['tax_id']))

        # this will raise an error if any tax_ids do not exist in database
        all_known(subset_ids, tax)

        if args.taxnames:
            for taxname in getlines(args.taxnames):
                for name in re.split(r'\s*[,;]\s*', taxname):
                    tax_id, primary_name, is_primary = tax.primary_from_name(
                        name.strip())
                    subset_ids.add(tax_id)

        if not subset_ids:
            log.error('no tax_ids to subset taxtable, exiting')
            return

    log.info('loading nodes table from database')
    nodes = pandas.read_sql_table('nodes',
                                  engine,
                                  schema=args.schema,
                                  index_col='tax_id')

    if args.taxtable:
        log.info('using existing taxtable ' + args.taxtable)
        taxtable = pandas.read_csv(args.taxtable, dtype=str)
        taxtable = taxtable.set_index('tax_id')
        taxtable = taxtable.join(nodes[['parent_id', 'is_valid']])
    else:
        log.info('building taxtable')
        names = pandas.read_sql_table(
            'names',
            engine,
            schema=args.schema,
            columns=['tax_id', 'tax_name', 'is_primary'])
        names = names[names['is_primary']].set_index('tax_id')
        len_nodes = len(nodes)
        nodes = nodes.join(names['tax_name'])
        assert len_nodes == len(nodes)
        taxtable = build_taxtable(nodes, ranks)

    # subset taxtable clade lineages
    if args.clade_ids:
        dtypes = taxtable.dtypes
        clades = []
        for i in args.clade_ids.split(','):
            ancestor = taxtable.loc[i]

            # select all rows where rank column == args.from_id
            clade = taxtable[taxtable[ancestor['rank']] == i]

            # build taxtable up to root from args.from_id
            while ancestor.name != '1':  # root
                parent = taxtable.loc[ancestor['parent_id']]
                clade = pandas.concat([pandas.DataFrame(parent).T, clade])
                ancestor = parent
            # reset lost index name after concatenating transposed series
            clades.append(clade)
        taxtable = pandas.concat(clades)
        taxtable = taxtable[~taxtable.index.duplicated()]

        # set index.name and dtypes back after concating transposed series
        taxtable.index.name = 'tax_id'
        for d, t in dtypes.iteritems():
            taxtable[d] = taxtable[d].astype(t)

    # subset taxtable by set of tax_ids
    if subset_ids:
        keepers = taxtable.loc[subset_ids]
        for col in keepers.columns:
            if col in ranks:
                subset_ids.update(keepers[col].dropna().values)
        taxtable = taxtable.loc[subset_ids]

    # drop no rank nodes
    if args.ranked:
        ranks = ranks_df[~ranks_df['no_rank']]['rank'].tolist()
        taxtable = taxtable[taxtable['rank'].isin(ranks)]

    if args.valid:
        invalid = taxtable[~taxtable['is_valid']]
        # remove all invalids from the rank columns
        for r, g in invalid.groupby(by='rank'):
            taxtable.loc[taxtable[r].isin(g.index), r] = None
        # remove invalid rows
        taxtable = taxtable[taxtable['is_valid']]

    # clean up empty rank columns
    taxtable = taxtable.dropna(axis=1, how='all')

    # sort final column output
    taxtable = taxtable[['rank', 'tax_name'] +
                        [r for r in ranks if r in taxtable.columns]]

    # sort rows
    taxtable['rank'] = taxtable['rank'].astype('category', categories=ranks)
    taxtable = taxtable.sort_values('rank')

    # write and close db
    taxtable.to_csv(args.out)
    engine.dispose()
Example #29
def test_sibling_of():
    engine = create_engine('sqlite:///../testfiles/taxonomy.db', echo=False)
    tax = Taxonomy(engine, taxtastic.ncbi.RANKS)
    assert tax.sibling_of(None) is None
    assert tax.sibling_of('91061') == '186801'
    assert tax.sibling_of('1696') is None
Example #30
def test__node():
    engine = create_engine(
        'sqlite:///../testfiles/small_taxonomy.db', echo=False)
    tax = Taxonomy(engine, taxtastic.ncbi.RANKS)
    assert tax._node(None) is None
    assert tax._node('91061') == (u'1239', u'class')
Example #31
def action(args):
    engine = sqlalchemy.create_engine(args.url, echo=args.verbosity > 2)
    tax = Taxonomy(engine, schema=args.schema)

    with engine.connect() as con:
        # TODO: need to order nodes so that parents are always created first

        cmd = """
        select nodes.*, source.name as source_name
        from {nodes}
        join {source} on nodes.source_id = source.id
        where source.name = {x}
        """.format(x=tax.placeholder, nodes=tax.nodes, source=tax.source)

        result = con.execute(cmd, (args.source_name,))
        keys = list(result.keys())
        nodes = [clean_dict(keys, vals) for vals in result.fetchall()]

        # get the complete lineage for each node, and provide an
        # ordering for all nodes so that children may be placed after
        # parents.
        tax_ids = list(map(itemgetter('tax_id'), nodes))
        lineages = tax._get_lineage_table(tax_ids)
        ordering = {}
        for i, lineage in enumerate(lineages):
            tax_id = lineage[1]
            if tax_id not in ordering:
                ordering[tax_id] = i

        nodes = sorted(nodes, key=lambda n: ordering[n['tax_id']])

        cmd = """
        select names.*, source.name as source_name
        from {names}
        join {source} on names.source_id = source.id
        where source.name = {x}
        """.format(x=tax.placeholder, names=tax.names, source=tax.source)

        result = con.execute(cmd, (args.source_name,))
        keys = list(result.keys())
        names = [clean_dict(keys, vals) for vals in result.fetchall()]
        namedict = {key: list(grp)
                    for key, grp in groupby(names, itemgetter('tax_id'))}

        for node in nodes:
            node['type'] = 'node'
            tax_id = node['tax_id']
            if tax_id in namedict:
                node['names'] = namedict.pop(tax_id)

        yaml.safe_dump_all(nodes, args.outfile, default_flow_style=False,
                           explicit_start=True, indent=2)

        # prepare remaining names
        remaining_names = []
        for tax_id, names in list(namedict.items()):
            for name in names:
                del name['tax_id']

            remaining_names.append({
                'tax_id': tax_id,
                'type': 'name',
                'names': names
            })

        yaml.safe_dump_all(remaining_names, args.outfile, default_flow_style=False,
                           explicit_start=True, indent=2)
Example #32
def test_is_ancestor_of():
    engine = create_engine('sqlite:///../testfiles/taxonomy.db', echo=False)
    tax = Taxonomy(engine, taxtastic.ncbi.RANKS)
    assert tax.is_ancestor_of('1280', '1239')
    assert tax.is_ancestor_of(None, '1239') is False
    assert tax.is_ancestor_of('1239', None) is False
Example #33
    def setUp(self):
        self.engine = create_engine('sqlite:///%s' % dbname, echo=echo)
        self.tax = Taxonomy(self.engine)