# Module-level imports reconstructed so the functions below run as
# written. The `action` functions appear to come from separate
# subcommand modules of one package; `uclust`, `util`, and `wrap` are
# assumed to be package-local modules, `taxtable` is taxtastic's, and
# the helpers `add_cluster_taxids`, `add_to_taxonomy`, and
# `fill_lonely_worker` are assumed to be defined elsewhere in the
# package.
import copy
import csv
import functools
import logging
import shutil
import sys

from concurrent import futures

from Bio import SeqIO
from taxtastic import taxtable

from .. import uclust, util, wrap


def action(args):
    """Add extra nodes from a CSV file to an existing taxtable."""
    with args.taxtable as fp:
        tax = taxtable.read(fp)

    with args.extra_nodes_csv:
        reader = csv.DictReader(args.extra_nodes_csv)
        missing_fields = (frozenset(['tax_id', 'tax_name', 'rank', 'parent_id'])
                          - frozenset(reader.fieldnames))
        if missing_fields:
            raise IOError("Missing expected fields: {0}".format(
                ','.join(missing_fields)))
        for row in reader:
            if row['tax_id'] in tax.index:
                logging.warning(
                    "tax_id %s already represented in taxtable. [row %d]",
                    row['tax_id'], reader.line_num)
                continue
            parent_id = row['parent_id']
            rank = row['rank']
            try:
                parent_node = tax.get_node(parent_id)
            except ValueError:
                raise ValueError(
                    "Parent {parent_id} of {tax_id}[{tax_name}] "
                    "not found.".format(**row))
            if rank not in tax.ranks:
                add_rank(tax, parent_node, rank)
            node = taxtable.TaxNode(tax_id=row['tax_id'],
                                    name=row['tax_name'],
                                    rank=rank)
            parent_node.add_child(node)
            logging.info("Added %s %s[%s] below %s %s[%s]",
                         node.rank, node.tax_id, node.name,
                         parent_node.rank, parent_node.tax_id,
                         parent_node.name)

    tax.write_taxtable(args.out_file)
    return 0
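# A minimal sketch of the `add_rank` helper assumed above, under the
# assumption that `tax.ranks` is an ordered list of rank names and that
# a previously unseen rank belongs immediately below its parent's rank.
# Illustrative only; the package's real helper may differ.
def add_rank(tax, parent_node, rank):
    # Splice the new rank into the ordered rank list just after the
    # parent's rank so write_taxtable emits columns in a valid order.
    parent_index = tax.ranks.index(parent_node.rank)
    tax.ranks.insert(parent_index + 1, rank)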
def add_clusters_to_refpkg(refpkg, **kwargs):
    """Annotate a refpkg's seq_info and taxonomy with cluster tax_ids."""
    with refpkg.open_resource('taxonomy') as tax_fp:
        tax = taxtable.read(tax_fp)
    with refpkg.open_resource('seq_info') as sinfo_fp:
        reader = csv.DictReader(sinfo_fp)
        sinfo = list(reader)

    # Annotate
    add_cluster_taxids(tax, sinfo, **kwargs)

    # Write the updated seq_info and taxonomy to temporary files, then
    # swap them into the reference package in a single transaction.
    with util.ntf(prefix='seq_info-', suffix='.csv') as seqinfo_tf, \
            util.ntf(prefix='taxonomy-', suffix='.csv') as tax_tf:
        w = csv.DictWriter(seqinfo_tf, reader.fieldnames)
        w.writeheader()
        w.writerows(sinfo)
        seqinfo_tf.close()

        tax.write_taxtable(tax_tf)
        tax_tf.close()

        refpkg.start_transaction()
        refpkg.update_file('seq_info', seqinfo_tf.name)
        refpkg.update_file('taxonomy', tax_tf.name)
        refpkg.commit_transaction()
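# Example usage, as a sketch: assumes taxtastic is installed and
# 'my.refpkg' is an existing reference package directory.
#
#     from taxtastic.refpkg import Refpkg
#     add_clusters_to_refpkg(Refpkg('my.refpkg', create=False))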
def action(args):
    """Name refpkg sequences by their best uclust hit among the input
    sequences, updating the refpkg's seq_info and taxonomy."""
    log_writer = None
    if args.log:
        log_writer = csv.DictWriter(
            args.log,
            ['seqname', 'orig_tax_id', 'renamed_tax_id', 'renamed_tax_name',
             'best_hit', 'pct_id', 'applied'],
            quoting=csv.QUOTE_NONNUMERIC,
            lineterminator='\n')
        log_writer.writeheader()

    # Load all tax_ids
    with args.taxtable as fp:
        new_tax = taxtable.read(fp)
    with args.seq_info as fp:
        new_seq_info = {row['seqname']: row for row in csv.DictReader(fp)}

    with args.refpkg.open_resource('aln_fasta') as fp:
        ref_sequences = [ungap(i) for i in SeqIO.parse(fp, 'fasta')]
    with args.refpkg.open_resource('seq_info') as fp:
        ref_seq_info_reader = csv.DictReader(fp)
        ref_seq_info = {row['seqname']: row for row in ref_seq_info_reader}
    with args.refpkg.open_resource('taxonomy') as fp:
        ref_taxonomy = taxtable.read(fp)

    search = functools.partial(uclust.search, pct_id=args.percent_id,
                               search_pct_id=0.9, quiet=True)

    # Search the sequences from the reference package against the input
    # sequences
    with util.as_fasta(ref_sequences) as ref_fasta_path, \
            util.ntf(prefix='uclust') as tf:
        search(args.fasta_file, ref_fasta_path, tf.name)
        input_records = uclust.parse_uclust_out(
            i for i in tf if i.startswith('H'))

        # Also search sequences from the reference package against
        # themselves (would additionally need itertools and operator).
        # TODO: decide if we want to use this
        # with util.ntf(prefix='uclust') as self_tf:
        #     search(ref_fasta_path, ref_fasta_path, self_tf.name,
        #            maxaccepts=10)
        #     ref_records = uclust.parse_uclust_out(
        #         i for i in self_tf if i.startswith('H'))
        #     # Drop self-hits
        #     ref_records = (i for i in ref_records
        #                    if i.query_label != i.target_label)
        #     grouped = itertools.groupby(ref_records,
        #                                 operator.attrgetter('query_label'))
        #     best_hit_id = dict((g, max(i.pct_id for i in v))
        #                        for g, v in grouped)

        for record in input_records:
            ref_si = ref_seq_info[record.query_label]
            target_si = new_seq_info[record.target_label]
            # if record.pct_id > best_hit_id.get(record.query_label, 0.0):
            tax_id = target_si['tax_id']
            node = new_tax.get_node(tax_id)
            if log_writer:
                log_record = {
                    'seqname': record.query_label,
                    'best_hit': record.target_label,
                    'pct_id': record.pct_id,
                    'orig_tax_id': ref_si['tax_id'],
                    'renamed_tax_id': node.tax_id,
                    'renamed_tax_name': node.name,
                    'applied': (not ref_si['tax_id'] or
                                args.conflict_action == 'replace')}
                log_writer.writerow(log_record)
            logging.info('Naming %s %s[%s,%s] based on %s (%.2f%%)',
                         ref_si['seqname'], node.name, node.tax_id,
                         node.rank, record.target_label, record.pct_id)
            if ref_si['tax_id'] and ref_si['tax_id'] != tax_id:
                old_node = ref_taxonomy.get_node(ref_si['tax_id'])
                logging.warning(
                    'Already named: %s[%s,%s]%s',
                    old_node.name, old_node.tax_id, old_node.rank,
                    ' - replacing'
                    if args.conflict_action == 'replace' else '')
            if not ref_si['tax_id'] or args.conflict_action == 'replace':
                ref_si['tax_id'] = target_si['tax_id']
                if tax_id not in ref_taxonomy.index:
                    add_to_taxonomy(ref_taxonomy, node)

    # Write updated taxtable, seqinfo. The temporary files get their own
    # names (tax_tf, seq_info_tf) rather than reusing, and shadowing,
    # new_tax and new_seq_info from above.
    with util.ntf(prefix='taxonomy-', suffix='.csv') as tax_tf, \
            util.ntf(prefix='seq_info-', suffix='.csv') as seq_info_tf:
        ref_taxonomy.write_taxtable(tax_tf)
        tax_tf.close()

        w = csv.DictWriter(seq_info_tf, ref_seq_info_reader.fieldnames)
        w.writeheader()
        w.writerows(ref_seq_info.values())
        seq_info_tf.close()

        args.refpkg.start_transaction()
        args.refpkg.update_file('taxonomy', tax_tf.name)
        args.refpkg.update_file('seq_info', seq_info_tf.name)
        args.refpkg.commit_transaction()
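# A minimal sketch of the `ungap` helper assumed above: strip alignment
# gap characters from a SeqRecord so the reference sequences can be
# searched unaligned. Illustrative only; the package's helper may
# differ (e.g. in which gap characters it removes).
from Bio.Seq import Seq


def ungap(record):
    # Remove '-' gap characters, replacing the record's sequence in place.
    record.seq = Seq(str(record.seq).replace('-', ''))
    return record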
def action(args):
    """Recruit additional reference sequences for 'lonely' taxa in the
    chosen set from the larger search set."""
    logging.info("Loading taxtable")
    with args.search_taxtable as fp:
        full_taxonomy = taxtable.read(fp)

    logging.info("Loading chosen sequence metadata")
    chosen_taxonomy = copy.deepcopy(full_taxonomy)
    chosen_taxonomy.populate_from_seqinfo(args.chosen_seqinfo)
    chosen_taxonomy.prune_unrepresented()

    logging.info("Loading full sequence metadata")
    full_taxonomy.populate_from_seqinfo(args.search_seqinfo)

    if args.exclude_taxids:
        for e in args.exclude_taxids:
            e = e.strip()
            logging.info('ignoring tax_id {}'.format(e))
            full_taxonomy.get_node(e).remove_subtree()

    # Find lonely nodes
    nodes = [i for i in chosen_taxonomy if i.rank == args.lonely_rank]
    lonely_nodes = [i for i in nodes if is_lonely(i)]
    additional_reps = set()
    futs = []
    with futures.ThreadPoolExecutor(args.threads) as executor:
        for node in lonely_nodes:
            futs.append(executor.submit(
                fill_lonely_worker, node.tax_id,
                node.at_rank(args.parent_rank).tax_id, full_taxonomy,
                args.search_fasta, n_reps=args.number_of_reps))
        while futs:
            try:
                done, pending = futures.wait(futs, 1,
                                             futures.FIRST_COMPLETED)
                futs = set(pending)
                for f in done:
                    if f.exception():
                        raise f.exception()
                    additional_reps |= f.result()
                sys.stderr.write("{0:6d}/{1:6d} complete \r".format(
                    len(lonely_nodes) - len(pending), len(lonely_nodes)))
            except futures.TimeoutError:
                pass  # Keep waiting
            except:
                logging.exception("Caught error in child thread - exiting")
                executor.shutdown(False)
                raise

    if args.include_taxids:
        for t in args.include_taxids:
            t = t.strip()
            logging.info('including tax_id {}'.format(t))
            for s in set(full_taxonomy.get_node(t).subtree_sequence_ids()):
                logging.info('sequence {}'.format(s))
                additional_reps.add(s)

    logging.info("%d additional references", len(additional_reps))
    with open(args.chosen_fasta) as fp, args.output as ofp:
        shutil.copyfileobj(fp, ofp)
        wrap.esl_sfetch(args.search_fasta, additional_reps, ofp)

    with args.chosen_seqinfo as fp, args.output_seqinfo as ofp, \
            args.search_seqinfo as sub_fp:
        fp.seek(0)
        r = csv.DictReader(fp)
        w = csv.DictWriter(ofp, r.fieldnames, quoting=csv.QUOTE_NONNUMERIC,
                           lineterminator='\n')
        w.writeheader()
        w.writerows(r)
        sub_fp.seek(0)
        for row in csv.DictReader(sub_fp):
            if row['seqname'] in additional_reps:
                w.writerow(row)
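# A minimal sketch of the `is_lonely` predicate assumed above, under
# the assumption that a node is "lonely" when it is its parent's only
# child at its own rank. Illustrative only; the real predicate may
# apply additional criteria.
def is_lonely(node):
    siblings = [c for c in node.parent.children if c.rank == node.rank]
    return len(siblings) == 1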