Example #1
def action(args):
    engine = create_engine(
        'sqlite:///%s' % args.database_file, echo=args.verbosity > 2)
    tax = Taxonomy(engine, ncbi.ranks)

    taxids = set()

    if args.taxids:
        if os.access(args.taxids, os.F_OK):
            for line in getlines(args.taxids):
                taxids.update(set(re.split(r'[\s,;]+', line)))
        else:
            taxids.update([x.strip()
                           for x in re.split(r'[\s,;]+', args.taxids)])

    if args.taxnames:
        for taxname in getlines(args.taxnames):
            for name in re.split(r'\s*[,;]\s*', taxname):
                tax_id, primary_name, is_primary = tax.primary_from_name(
                    name.strip())
                taxids.add(tax_id)

    if args.seq_info:
        with args.seq_info:
            reader = csv.DictReader(args.seq_info)
            taxids.update(frozenset(i['tax_id']
                                    for i in reader if i['tax_id']))

    # Before digging into lineages, make sure all the taxids exist in
    # the taxonomy database.
    valid_taxids = True
    for t in taxids:
        try:
            tax._node(t)
        except KeyError:
            # Check for merged
            m = tax._get_merged(t)
            if m and m != t:
                msg = ("Taxid {0} has been replaced by {1}. "
                       "Please update your records").format(t, m)
                print(msg, file=sys.stderr)
            else:
                print("Taxid %s not found in taxonomy." % t, file=sys.stderr)
            valid_taxids = False
    if not valid_taxids:
        print("Some taxids were invalid.  Exiting.", file=sys.stderr)
        return 1  # exits with code 1

    # Extract all the taxids to be exported in the CSV file.
    taxids_to_export = set()
    for t in taxids:
        taxids_to_export.update([y for (x, y) in tax._get_lineage(t)])

    tax.write_table(taxids_to_export, csvfile=args.out_file, full=args.full)

    engine.dispose()
    return 0
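
A minimal sketch of how Example #1's action() might be invoked directly, assuming the function and its imports above are in scope. The attribute names mirror the args.* accesses in the snippet; the argparse.Namespace construction, the file name, and the values are illustrative assumptions, not part of the original CLI.

import argparse
import sys

# Hypothetical invocation of Example #1; attribute names mirror the args.*
# accesses in the snippet, values and file names are made up for illustration.
args = argparse.Namespace(
    database_file='ncbi_taxonomy.db',  # assumed SQLite taxonomy database
    verbosity=1,                       # SQL echo only when verbosity > 2
    taxids='562,1280;4932',            # inline list; a file path also works
    taxnames=None,                     # no names file in this sketch
    seq_info=None,                     # no seq_info CSV in this sketch
    out_file=sys.stdout,               # target passed to tax.write_table()
    full=False,                        # column selection forwarded to write_table()
)

sys.exit(action(args))
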
Example #2
def action(args):
    engine = create_engine(args.url, echo=args.verbosity > 2)
    tax = Taxonomy(engine, schema=args.schema)

    taxids = set()

    if args.taxids:
        if os.access(args.taxids, os.F_OK):
            for line in getlines(args.taxids):
                taxids.update(set(re.split(r'[\s,;]+', line)))
        else:
            taxids.update([x.strip()
                           for x in re.split(r'[\s,;]+', args.taxids)])

    if args.seq_info:
        with args.seq_info:
            reader = csv.DictReader(args.seq_info)
            taxids.update(frozenset(i['tax_id']
                                    for i in reader if i['tax_id']))

    writer = csv.writer(args.out)

    for t in taxids:
        try:
            tax._node(t)
        except ValueError:
            # Check for merged
            m = tax._get_merged(t)
            if m and m != t:
                writer.writerow([t, m])
            else:
                writer.writerow([t, None])

    engine.dispose()
    return 0
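
Every example iterates over getlines(path) when a file of taxids or names is supplied, but the helper itself is never shown. Below is a plausible stand-in, assuming it yields stripped, non-empty lines and skips comment lines; the real utility in the library may behave differently.

# Plausible stand-in for the getlines() helper used throughout these examples.
# Assumption: yields stripped, non-empty lines and skips '#' comments.
def getlines(path):
    with open(path) as infile:
        for line in infile:
            line = line.strip()
            if line and not line.startswith('#'):
                yield line
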
Example #3
def action(args):
    engine = create_engine(
        'sqlite:///%s' % args.database_file, echo=args.verbosity > 2)
    tax = Taxonomy(engine, ncbi.RANKS)

    if any([args.taxids, args.taxnames, args.seq_info]):
        taxids = set()
        if args.taxids:
            if os.access(args.taxids, os.F_OK):
                for line in getlines(args.taxids):
                    taxids.update(set(re.split(r'[\s,;]+', line)))
            else:
                taxids.update(
                    [x.strip() for x in re.split(r'[\s,;]+', args.taxids)])

        if args.seq_info:
            with args.seq_info:
                reader = csv.DictReader(args.seq_info)
                taxids.update(
                    frozenset(i['tax_id'] for i in reader if i['tax_id']))

        if not are_valid(taxids, tax):
            return "Some taxids were invalid.  Exiting."

        if args.taxnames:
            for taxname in getlines(args.taxnames):
                for name in re.split(r'\s*[,;]\s*', taxname):
                    tax_id, primary_name, is_primary = tax.primary_from_name(
                        name.strip())
                    taxids.add(tax_id)
    else:
        taxids = set(tax.tax_ids())

    # Extract all the taxids to be exported in the CSV file.
    taxids_to_export = set()
    for t in taxids:
        taxids_to_export.update([y for (x, y) in tax._get_lineage(t)])

    tax.write_table(taxids_to_export, csvfile=args.out_file, full=args.full)

    engine.dispose()
    return 0
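
Examples #3 and #4 delegate validation to an are_valid(taxids, tax) helper that is not shown. A hypothetical version, modeled on the inline validation loop from Example #1, could look like this; the real implementation may differ in the exception it catches and in how it reports problems.

import sys

# Hypothetical are_valid() helper modeled on Example #1's validation loop;
# returns False (after reporting) if any taxid is unknown and not merged.
def are_valid(taxids, tax):
    valid = True
    for t in taxids:
        try:
            tax._node(t)
        except (KeyError, ValueError):
            m = tax._get_merged(t)  # the id may have been merged into another
            if m and m != t:
                print('Taxid {} has been replaced by {}.'.format(t, m),
                      file=sys.stderr)
            else:
                print('Taxid {} not found in taxonomy.'.format(t),
                      file=sys.stderr)
            valid = False
    return valid
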
Example #4
def action(args):
    engine = create_engine('sqlite:///%s' % args.database_file,
                           echo=args.verbosity > 2)
    tax = Taxonomy(engine, ncbi.RANKS)

    if any([args.taxids, args.taxnames, args.seq_info]):
        taxids = set()
        if args.taxids:
            if os.access(args.taxids, os.F_OK):
                for line in getlines(args.taxids):
                    taxids.update(set(re.split(r'[\s,;]+', line)))
            else:
                taxids.update(
                    [x.strip() for x in re.split(r'[\s,;]+', args.taxids)])

        if args.seq_info:
            with args.seq_info:
                reader = csv.DictReader(args.seq_info)
                taxids.update(
                    frozenset(i['tax_id'] for i in reader if i['tax_id']))

        if not are_valid(taxids, tax):
            return "Some taxids were invalid.  Exiting."

        if args.taxnames:
            for taxname in getlines(args.taxnames):
                for name in re.split(r'\s*[,;]\s*', taxname):
                    tax_id, primary_name, is_primary = tax.primary_from_name(
                        name.strip())
                    taxids.add(tax_id)
    else:
        taxids = set(tax.tax_ids())

    # Extract all the taxids to be exported in the CSV file.
    taxids_to_export = set()
    for t in taxids:
        taxids_to_export.update([y for (x, y) in tax._get_lineage(t)])

    tax.write_table(taxids_to_export, csvfile=args.out_file, full=args.full)

    engine.dispose()
    return 0
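
All of the examples tokenize args.taxids (whether read from a file or given inline) with the same regular expression, so commas, semicolons, and whitespace are interchangeable separators. A quick check of what that split accepts:

import re

# The delimiter pattern shared by every example: any run of whitespace,
# commas, and/or semicolons separates taxids.
print(re.split(r'[\s,;]+', '562, 1280;4932  573'))
# ['562', '1280', '4932', '573']
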
Example #5
def action(args):
    engine = create_engine('sqlite:///%s' %
                           args.database_file, echo=args.verbosity > 2)
    tax = Taxonomy(engine, ncbi.RANKS)

    taxids = set()

    if args.taxids:
        if os.access(args.taxids, os.F_OK):
            for line in getlines(args.taxids):
                taxids.update(set(re.split(r'[\s,;]+', line)))
        else:
            taxids.update([x.strip()
                           for x in re.split(r'[\s,;]+', args.taxids)])

    if args.seq_info:
        with args.seq_info:
            reader = csv.DictReader(args.seq_info)
            taxids.update(frozenset(i['tax_id']
                                    for i in reader if i['tax_id']))

    writer = csv.writer(args.out_file)

    for t in taxids:
        try:
            tax._node(t)
        except ValueError:
            # Check for merged
            m = tax._get_merged(t)
            if m and m != t:
                writer.writerow([t, m])
            else:
                writer.writerow([t, None])

    engine.dispose()
    return 0
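
Examples #2 and #5 emit a two-column CSV pairing each unknown taxid with its merged replacement, or an empty second field when no replacement exists. One way to consume that output, with a made-up file name for illustration:

import csv

# Build an old -> new mapping from the two-column output of Examples #2 and #5,
# skipping rows whose second field is empty (no known replacement).
with open('updated_taxids.csv') as infile:
    replacements = {old: new for old, new in csv.reader(infile) if new}
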
Example #6
def action(args):
    engine = sqlalchemy.create_engine(args.url, echo=args.verbosity > 3)

    ranks_df = pandas.read_sql_table('ranks', engine, schema=args.schema)
    # most operations in this script require ordering from 'root' down
    ranks_df = ranks_df.sort_values(by='height', ascending=False)
    ranks = ranks_df['rank'].tolist()

    nodes = None
    subset_ids = set()

    # check tax_ids subsets first before building taxtable
    if any([args.tax_ids, args.taxnames, args.seq_info]):
        tax = Taxonomy(engine, schema=args.schema)
        if args.tax_ids:
            if os.access(args.tax_ids, os.F_OK):
                for line in getlines(args.tax_ids):
                    subset_ids.update(set(re.split(r'[\s,;]+', line)))
            else:
                subset_ids.update(
                    [x.strip() for x in re.split(r'[\s,;]+', args.tax_ids)])

        if args.seq_info:
            log.info('reading tax_ids ' + args.seq_info.name)
            with args.seq_info:
                reader = csv.DictReader(args.seq_info)
                subset_ids.update(
                    frozenset(i['tax_id'] for i in reader if i['tax_id']))

        # this will raise an error if any tax_ids do not exist in database
        all_known(subset_ids, tax)

        if args.taxnames:
            for taxname in getlines(args.taxnames):
                for name in re.split(r'\s*[,;]\s*', taxname):
                    tax_id, primary_name, is_primary = tax.primary_from_name(
                        name.strip())
                    subset_ids.add(tax_id)

        if not subset_ids:
            log.error('no tax_ids to subset taxtable, exiting')
            return

    log.info('loading nodes table from database')
    nodes = pandas.read_sql_table('nodes',
                                  engine,
                                  schema=args.schema,
                                  index_col='tax_id')

    if args.taxtable:
        log.info('using existing taxtable ' + args.taxtable)
        taxtable = pandas.read_csv(args.taxtable, dtype=str)
        taxtable = taxtable.set_index('tax_id')
        taxtable = taxtable.join(nodes[['parent_id', 'is_valid']])
    else:
        log.info('building taxtable')
        names = pandas.read_sql_table(
            'names',
            engine,
            schema=args.schema,
            columns=['tax_id', 'tax_name', 'is_primary'])
        names = names[names['is_primary']].set_index('tax_id')
        len_nodes = len(nodes)
        nodes = nodes.join(names['tax_name'])
        assert len_nodes == len(nodes)
        taxtable = build_taxtable(nodes, ranks)

    # subset taxtable clade lineages
    if args.clade_ids:
        dtypes = taxtable.dtypes
        clades = []
        for i in args.clade_ids.split(','):
            ancestor = taxtable.loc[i]

            # select all rows where rank column == args.from_id
            clade = taxtable[taxtable[ancestor['rank']] == i]

            # build taxtable up to root from args.from_id
            while ancestor.name != '1':  # root
                parent = taxtable.loc[ancestor['parent_id']]
                clade = pandas.concat([pandas.DataFrame(parent).T, clade])
                ancestor = parent
            # reset lost index name after concatenating transposed series
            clades.append(clade)
        taxtable = pandas.concat(clades)
        taxtable = taxtable[~taxtable.index.duplicated()]

        # set index.name and dtypes back after concating transposed series
        taxtable.index.name = 'tax_id'
        for d, t in dtypes.items():
            taxtable[d] = taxtable[d].astype(t)

    # subset taxtable by set of tax_ids
    if subset_ids:
        keepers = taxtable.loc[subset_ids]
        for col in keepers.columns:
            if col in ranks:
                subset_ids.update(keepers[col].dropna().values)
        taxtable = taxtable.loc[subset_ids]

    # drop no rank nodes
    if args.ranked:
        ranks = ranks_df[~ranks_df['no_rank']]['rank'].tolist()
        taxtable = taxtable[taxtable['rank'].isin(ranks)]

    if args.valid:
        invalid = taxtable[~taxtable['is_valid']]
        # remove all invalids from the rank columns
        for r, g in invalid.groupby(by='rank'):
            taxtable.loc[taxtable[r].isin(g.index), r] = None
        # remove invalid rows
        taxtable = taxtable[taxtable['is_valid']]

    # clean up empty rank columns
    taxtable = taxtable.dropna(axis=1, how='all')

    # sort final column output
    taxtable = taxtable[['rank', 'tax_name'] +
                        [r for r in ranks if r in taxtable.columns]]

    # sort rows
    taxtable['rank'] = taxtable['rank'].astype(
        pandas.CategoricalDtype(categories=ranks))
    taxtable = taxtable.sort_values('rank')

    # write and close db
    taxtable.to_csv(args.out)
    engine.dispose()
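
Example #6 calls an all_known(subset_ids, tax) helper that is described only by its comment ("this will raise an error if any tax_ids do not exist in database"). A hypothetical version consistent with that comment is sketched below; the actual implementation in the library may differ.

# Hypothetical all_known() helper, based only on the comment in Example #6;
# raises if any of the given tax_ids are absent from the taxonomy database.
def all_known(tax_ids, tax):
    unknown = []
    for t in tax_ids:
        try:
            tax._node(t)
        except (KeyError, ValueError):
            unknown.append(t)
    if unknown:
        raise ValueError('unknown tax_ids: {}'.format(', '.join(sorted(unknown))))
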