Пример #1
0
def gb2info(seqname, seq, record):
    """Build the per-sequence metadata row for one extracted sequence.

    Args:
        seqname: Name assigned to the extracted sequence.
        seq: The extracted sequence; its length and the result of
            ``count_ambiguous(seq)`` are recorded.
        record: The originating record; ``record.id`` and
            ``record.description`` are copied, and ``tax_of_genbank(record)``
            supplies the ``tax_id`` value.

    Returns:
        A dict with the keys ``seqname``, ``tax_id``, ``accession``,
        ``description``, ``length`` and ``ambig_count``.
    """
    # Values are gathered in the same order as the keys appear in the row.
    tax_id = tax_of_genbank(record)
    accession = record.id
    description = record.description
    length = len(seq)
    ambig_count = count_ambiguous(seq)

    row = {'seqname': seqname}
    row['tax_id'] = tax_id
    row['accession'] = accession
    row['description'] = description
    row['length'] = length
    row['ambig_count'] = ambig_count
    return row
Пример #2
0
def gb2info(seqname, seq, record):
    """Assemble the metadata dict describing one extracted sequence.

    Args:
        seqname: Name assigned to the extracted sequence.
        seq: The extracted sequence; used for ``len(seq)`` and
            ``count_ambiguous(seq)``.
        record: The originating record; provides ``id`` and ``description``
            and is passed to ``tax_of_genbank`` for the ``tax_id`` value.

    Returns:
        A dict keyed by ``seqname``, ``tax_id``, ``accession``,
        ``description``, ``length`` and ``ambig_count`` (in that order).
    """
    fields = (
        ("seqname", seqname),
        ("tax_id", tax_of_genbank(record)),
        ("accession", record.id),
        ("description", record.description),
        ("length", len(seq)),
        ("ambig_count", count_ambiguous(seq)),
    )
    return dict(fields)
Пример #3
0
def action(args):
    """Extract sequences from GenBank records and write them as FASTA.

    Records are parsed lazily from ``args.infile`` (at most ``args.limit`` of
    them), optionally filtered, and then handled in one of two modes:

    * ``args.features`` is non-empty: every feature whose ``product``
      qualifier intersects ``args.features`` is sliced out of its record.
      Sequences shorter than ``args.min_length`` or with more than
      ``args.max_ambiguous`` ambiguous bases are dropped with a warning.
    * otherwise: the whole record sequence (or the ``args.region`` slice) is
      used. Only the ``args.min_length`` check applies in this mode.

    A sequence is reverse-complemented when the relevant location has
    ``strand == -1``, or when ``args.minus`` is set and ``strand == 1``.

    Args:
        args: Namespace-like object providing ``infile``, ``limit``,
            ``type_strains``, ``filter``, ``features``, ``minus``,
            ``min_length``, ``max_ambiguous``, ``region``, ``out`` (object
            with ``write``) and ``info_out`` (object with ``writeheader`` and
            ``writerows``, presumably a ``csv.DictWriter``; may be falsy).
            Note that ``args.features`` is replaced by a ``set`` in place.

    Returns:
        None. FASTA text is written to ``args.out`` and, if ``args.info_out``
        is truthy, one ``gb2info`` row per written sequence goes there.
    """
    records = islice(SeqIO.parse(args.infile, "genbank"), args.limit)

    if args.type_strains:
        records = (r for r in records if is_type(r))

    if args.filter:
        # Keep only records whose description does not match the regex.
        records = (r for r in records if not UNCLASSIFIED_REGEX.search(r.description))

    info = []

    if args.features:
        args.features = set(args.features)
        # Parse out product locations
        for r in records:
            for f in r.features:
                products = set(f.qualifiers.get("product", []))
                if not (products & args.features):
                    continue

                tag = f.qualifiers.get("locus_tag", ["unspecified"])[0]
                name = "{}_{}".format(r.name, tag)
                start, end = f.location.start.position, f.location.end.position
                seq = r.seq[start:end]
                length = len(seq)

                if (args.minus and f.location.strand == 1) or f.location.strand == -1:
                    seq = seq.reverse_complement()

                ambig_count = count_ambiguous(seq)

                if length < args.min_length:
                    log.warning("dropping seq {} because of length {}".format(name, length))
                    log.debug("Record and Feature information for short seq:")
                    log.debug(r)
                    log.debug(f)
                elif ambig_count > args.max_ambiguous:
                    log.warning("dropping seq {} because of {} ambiguous bases".format(name, ambig_count))
                else:
                    args.out.write(">{} {} {}\n{}\n".format(name, r.id, r.description, seq))
                    info.append(gb2info(name, seq, r))
    else:
        # if no product specified output entire seq
        for r in records:
            if args.region:
                start, end = args.region[0], args.region[1]
                seq = r.seq[start:end]
                name = "{}_{}_{}".format(r.name, start, end)
            else:
                seq = r.seq
                name = r.name

            length = len(seq)

            if length < args.min_length:
                log.warning("dropping seq {} because of length {}".format(name, length))
                # No feature is being processed in this mode, so only the
                # record is logged. Logging the feature-loop variable here
                # raised UnboundLocalError, because it is never bound on
                # this code path.
                log.debug("Record information for short seq:")
                log.debug(r)
                continue

            # Strand is taken from the record's first "source" feature, if any.
            src = next((feat for feat in r.features if feat.type == "source"), None)
            if src and ((args.minus and src.location.strand == 1) or src.location.strand == -1):
                seq = seq.reverse_complement()

            args.out.write(">{} {} {}\n{}\n".format(name, r.id, r.description, seq))
            info.append(gb2info(name, seq, r))

    if args.info_out:
        args.info_out.writeheader()
        args.info_out.writerows(info)
Пример #4
0
def action(args):
    """Extract sequences from GenBank records and write them as FASTA.

    Records are parsed lazily from ``args.infile`` (at most ``args.limit`` of
    them), optionally filtered, and then handled in one of two modes:

    * ``args.features`` is non-empty: every feature whose ``product``
      qualifier intersects ``args.features`` is sliced out of its record.
      Sequences shorter than ``args.min_length`` or with more than
      ``args.max_ambiguous`` ambiguous bases are dropped with a warning.
    * otherwise: the whole record sequence (or the ``args.region`` slice) is
      used. Only the ``args.min_length`` check applies in this mode.

    A sequence is reverse-complemented when the relevant location has
    ``strand == -1``, or when ``args.minus`` is set and ``strand == 1``.

    Args:
        args: Namespace-like object providing ``infile``, ``limit``,
            ``type_strains``, ``filter``, ``features``, ``minus``,
            ``min_length``, ``max_ambiguous``, ``region``, ``out`` (object
            with ``write``) and ``info_out`` (object with ``writeheader`` and
            ``writerows``, presumably a ``csv.DictWriter``; may be falsy).
            Note that ``args.features`` is replaced by a ``set`` in place.

    Returns:
        None. FASTA text is written to ``args.out`` and, if ``args.info_out``
        is truthy, one ``gb2info`` row per written sequence goes there.
    """
    records = islice(SeqIO.parse(args.infile, 'genbank'), args.limit)

    if args.type_strains:
        records = (r for r in records if is_type(r))

    if args.filter:
        # Keep only records whose description does not match the regex.
        records = (r for r in records
                   if not UNCLASSIFIED_REGEX.search(r.description))

    info = []

    if args.features:
        args.features = set(args.features)
        # Parse out product locations
        for r in records:
            for f in r.features:
                products = set(f.qualifiers.get('product', []))
                if not (products & args.features):
                    continue

                tag = f.qualifiers.get('locus_tag', ['unspecified'])[0]
                name = '{}_{}'.format(r.name, tag)
                start = f.location.start.position
                end = f.location.end.position
                seq = r.seq[start:end]
                length = len(seq)

                strand = f.location.strand
                if (args.minus and strand == 1) or strand == -1:
                    seq = seq.reverse_complement()

                ambig_count = count_ambiguous(seq)

                if length < args.min_length:
                    log.warning(
                        'dropping seq {} because of length {}'.format(
                            name, length))
                    log.debug(
                        'Record and Feature information for short seq:')
                    log.debug(r)
                    log.debug(f)
                elif ambig_count > args.max_ambiguous:
                    log.warning(
                        'dropping seq {} because of {} ambiguous bases'.
                        format(name, ambig_count))
                else:
                    args.out.write('>{} {} {}\n{}\n'.format(
                        name, r.id, r.description, seq))
                    info.append(gb2info(name, seq, r))
    else:
        # if no product specified output entire seq
        for r in records:
            if args.region:
                start, end = args.region[0], args.region[1]
                seq = r.seq[start:end]
                name = '{}_{}_{}'.format(r.name, start, end)
            else:
                seq = r.seq
                name = r.name

            length = len(seq)

            if length < args.min_length:
                log.warning('dropping seq {} because of length {}'.format(
                    name, length))
                # No feature is being processed in this mode, so only the
                # record is logged. Logging the feature-loop variable here
                # raised UnboundLocalError, because it is never bound on
                # this code path.
                log.debug('Record information for short seq:')
                log.debug(r)
                continue

            # Strand is taken from the record's first 'source' feature, if any.
            src = next(
                (feat for feat in r.features if feat.type == 'source'), None)
            if src and ((args.minus and src.location.strand == 1)
                        or src.location.strand == -1):
                seq = seq.reverse_complement()

            args.out.write('>{} {} {}\n{}\n'.format(
                name, r.id, r.description, seq))
            info.append(gb2info(name, seq, r))

    if args.info_out:
        args.info_out.writeheader()
        args.info_out.writerows(info)