def gb2info(seqname, seq, record):
    """Assemble the metadata row describing one extracted sequence.

    Args:
        seqname: Name under which the sequence is written out.
        seq: The extracted sequence; its length and ambiguity count are
            recorded.
        record: The record the sequence came from; supplies ``id`` and
            ``description`` and is passed to ``tax_of_genbank``.

    Returns:
        A dict with the keys ``seqname``, ``tax_id``, ``accession``,
        ``description``, ``length`` and ``ambig_count``.
    """
    row = {'seqname': seqname}
    row['tax_id'] = tax_of_genbank(record)
    row['accession'] = record.id
    row['description'] = record.description
    row['length'] = len(seq)
    row['ambig_count'] = count_ambiguous(seq)
    return row
def gb2info(seqname, seq, record):
    """Assemble the metadata row describing one extracted sequence.

    Args:
        seqname: Name under which the sequence is written out.
        seq: The extracted sequence; its length and ambiguity count are
            recorded.
        record: The record the sequence came from; supplies ``id`` and
            ``description`` and is passed to ``tax_of_genbank``.

    Returns:
        A dict with the keys ``seqname``, ``tax_id``, ``accession``,
        ``description``, ``length`` and ``ambig_count``.
    """
    row = {"seqname": seqname}
    row["tax_id"] = tax_of_genbank(record)
    row["accession"] = record.id
    row["description"] = record.description
    row["length"] = len(seq)
    row["ambig_count"] = count_ambiguous(seq)
    return row
def action(args):
    """Extract sequences from GenBank records and write them out as FASTA.

    At most ``args.limit`` records are read from ``args.infile``. Records are
    optionally restricted to those accepted by ``is_type``
    (``args.type_strains``) and to those whose description does not match
    ``UNCLASSIFIED_REGEX`` (``args.filter``).

    When ``args.features`` is given, one sequence is emitted per feature whose
    ``product`` qualifier intersects ``args.features``; such sequences are
    dropped when shorter than ``args.min_length`` or when they contain more
    than ``args.max_ambiguous`` ambiguous bases. Otherwise the whole record
    sequence (or the ``args.region`` slice of it) is emitted, subject only to
    the ``args.min_length`` check.

    A sequence is reverse-complemented when its feature (or, for whole
    records, the first ``source`` feature) is on strand -1, or on strand 1
    while ``args.minus`` is set.

    Args:
        args: Parsed command-line namespace. Reads ``infile``, ``limit``,
            ``type_strains``, ``filter``, ``features``, ``minus``,
            ``min_length``, ``max_ambiguous``, ``region``, ``out`` and
            ``info_out``. ``args.features`` is replaced by a set when truthy.

    Side effects:
        Writes FASTA entries to ``args.out`` and, if ``args.info_out`` is
        set, a header plus one row per kept sequence to ``args.info_out``.
    """
    records = islice(SeqIO.parse(args.infile, "genbank"), args.limit)

    if args.type_strains:
        records = (r for r in records if is_type(r))

    if args.filter:
        records = (r for r in records
                   if not UNCLASSIFIED_REGEX.search(r.description))

    info = []

    if args.features:
        args.features = set(args.features)

        # Parse out product locations
        for r in records:
            for f in r.features:
                products = set(f.qualifiers.get("product", []))
                if not products & args.features:
                    continue

                tag = f.qualifiers.get("locus_tag", ["unspecified"])[0]
                name = "{}_{}".format(r.name, tag)
                start = f.location.start.position
                end = f.location.end.position
                seq = r.seq[start:end]
                length = len(seq)

                strand = f.location.strand
                if (args.minus and strand == 1) or strand == -1:
                    seq = seq.reverse_complement()

                ambig_count = count_ambiguous(seq)

                if length < args.min_length:
                    log.warning(
                        "dropping seq {} because of length {}".format(
                            name, length))
                    log.debug("Record and Feature information for short seq:")
                    log.debug(r)
                    log.debug(f)
                elif ambig_count > args.max_ambiguous:
                    log.warning(
                        "dropping seq {} because of {} ambiguous bases".format(
                            name, ambig_count))
                else:
                    args.out.write(">{} {} {}\n{}\n".format(
                        name, r.id, r.description, seq))
                    info.append(gb2info(name, seq, r))
    else:
        # if no product specified output entire seq
        for r in records:
            if args.region:
                start, end = args.region[0], args.region[1]
                seq = r.seq[start:end]
                name = "{}_{}_{}".format(r.name, start, end)
            else:
                seq = r.seq
                name = r.name

            length = len(seq)

            if length < args.min_length:
                log.warning("dropping seq {} because of length {}".format(
                    name, length))
                log.debug("Record and Feature information for short seq:")
                # Only the record is logged: no feature variable is bound in
                # this branch, so logging one would raise UnboundLocalError.
                log.debug(r)
                continue

            # Orientation of a whole record is taken from its first
            # "source" feature, when it has one.
            src = next(
                (feat for feat in r.features if feat.type == "source"), None)
            if src and ((args.minus and src.location.strand == 1)
                        or src.location.strand == -1):
                seq = seq.reverse_complement()

            args.out.write(">{} {} {}\n{}\n".format(
                name, r.id, r.description, seq))
            info.append(gb2info(name, seq, r))

    if args.info_out:
        args.info_out.writeheader()
        args.info_out.writerows(info)
def action(args):
    """Extract sequences from GenBank records and write them out as FASTA.

    At most ``args.limit`` records are read from ``args.infile``. Records are
    optionally restricted to those accepted by ``is_type``
    (``args.type_strains``) and to those whose description does not match
    ``UNCLASSIFIED_REGEX`` (``args.filter``).

    When ``args.features`` is given, one sequence is emitted per feature whose
    ``product`` qualifier intersects ``args.features``; such sequences are
    dropped when shorter than ``args.min_length`` or when they contain more
    than ``args.max_ambiguous`` ambiguous bases. Otherwise the whole record
    sequence (or the ``args.region`` slice of it) is emitted, subject only to
    the ``args.min_length`` check.

    A sequence is reverse-complemented when its feature (or, for whole
    records, the first ``source`` feature) is on strand -1, or on strand 1
    while ``args.minus`` is set.

    Args:
        args: Parsed command-line namespace. Reads ``infile``, ``limit``,
            ``type_strains``, ``filter``, ``features``, ``minus``,
            ``min_length``, ``max_ambiguous``, ``region``, ``out`` and
            ``info_out``. ``args.features`` is replaced by a set when truthy.

    Side effects:
        Writes FASTA entries to ``args.out`` and, if ``args.info_out`` is
        set, a header plus one row per kept sequence to ``args.info_out``.
    """
    records = islice(SeqIO.parse(args.infile, 'genbank'), args.limit)

    if args.type_strains:
        records = (r for r in records if is_type(r))

    if args.filter:
        records = (r for r in records
                   if not UNCLASSIFIED_REGEX.search(r.description))

    info = []

    if args.features:
        args.features = set(args.features)

        # Parse out product locations
        for r in records:
            for f in r.features:
                products = set(f.qualifiers.get('product', []))
                if not products & args.features:
                    continue

                tag = f.qualifiers.get('locus_tag', ['unspecified'])[0]
                name = '{}_{}'.format(r.name, tag)
                start = f.location.start.position
                end = f.location.end.position
                seq = r.seq[start:end]
                length = len(seq)

                strand = f.location.strand
                if (args.minus and strand == 1) or strand == -1:
                    seq = seq.reverse_complement()

                ambig_count = count_ambiguous(seq)

                if length < args.min_length:
                    log.warning(
                        'dropping seq {} because of length {}'.format(
                            name, length))
                    log.debug('Record and Feature information for short seq:')
                    log.debug(r)
                    log.debug(f)
                elif ambig_count > args.max_ambiguous:
                    log.warning(
                        'dropping seq {} because of {} ambiguous bases'.format(
                            name, ambig_count))
                else:
                    args.out.write('>{} {} {}\n{}\n'.format(
                        name, r.id, r.description, seq))
                    info.append(gb2info(name, seq, r))
    else:
        # if no product specified output entire seq
        for r in records:
            if args.region:
                start, end = args.region[0], args.region[1]
                seq = r.seq[start:end]
                name = '{}_{}_{}'.format(r.name, start, end)
            else:
                seq = r.seq
                name = r.name

            length = len(seq)

            if length < args.min_length:
                log.warning('dropping seq {} because of length {}'.format(
                    name, length))
                log.debug('Record and Feature information for short seq:')
                # Only the record is logged: no feature variable is bound in
                # this branch, so logging one would raise UnboundLocalError.
                log.debug(r)
                continue

            # Orientation of a whole record is taken from its first
            # 'source' feature, when it has one.
            src = next(
                (feat for feat in r.features if feat.type == 'source'), None)
            if src and ((args.minus and src.location.strand == 1)
                        or src.location.strand == -1):
                seq = seq.reverse_complement()

            args.out.write('>{} {} {}\n{}\n'.format(
                name, r.id, r.description, seq))
            info.append(gb2info(name, seq, r))

    if args.info_out:
        args.info_out.writeheader()
        args.info_out.writerows(info)