예제 #1
0
def hints(args: argparse.Namespace) -> None:
    """Convert the features of a GFF file into hint records and print them.

    Every record of type ``args.group_level`` in ``args.infile`` is treated
    as a group.  Each feature yielded by ``gff.traverse_children`` for that
    group is handed to ``transform_child`` together with the lookup maps
    built from ``args``; any result that is not ``None`` is printed to
    ``args.outfile``.

    Args:
        args: Parsed command line arguments.  Reads ``infile``,
            ``group_level``, ``source``, ``priority`` and ``outfile``
            (plus whatever the three map-building helpers read).

    Raises:
        GPMissingID: If the ID lookup for a selected group record
            returns ``None``.
    """
    hint_types = get_hints_map(args)
    trim_sizes = get_trim_map(args)
    priorities = get_priority_map(args)

    gff = GFF.parse(args.infile)

    for group in gff.select_type(args.group_level):
        # fmap is applied to possibly-missing attributes; a None result
        # means no usable ID was found for this group.
        group_id = fmap(lambda attrs: attrs.id, group.attributes)
        if group_id is None:
            message = (
                "One of the selected records doesn't have an ID. "
                f"The offending line is {group}."
            )
            raise GPMissingID(message)

        for child in gff.traverse_children([group]):
            hint = transform_child(
                child,
                group_id,
                hint_types,
                trim_sizes,
                priorities,
                args.source,
                args.priority,
            )

            # transform_child signals "no hint for this feature" with None.
            if hint is None:
                continue

            print(hint, file=args.outfile)
예제 #2
0
def add_parents(args: argparse.Namespace) -> None:
    """Give parentless mRNA records a gene parent and write the result.

    After ``infer_missing_parents()`` has been run on the parsed GFF, every
    record of type ``"mRNA"`` that still has no parents and has a non-None
    ID gets a new ``"gene"`` record, inferred from the mRNA itself and
    named ``gene.<mRNA id>``.  The whole GFF is then printed to
    ``args.outfile`` behind a ``##gff-version 3`` header.

    Args:
        args: Parsed command line arguments.  Reads ``infile`` and
            ``outfile``.
    """
    gff = GFF.parse(args.infile)
    gff.infer_missing_parents()

    for mrna in gff.select_type("mRNA"):
        # Skip records that already have a parent, and records we cannot
        # derive a gene ID from (no attributes, or no ID).
        if (len(mrna.parents) > 0
                or mrna.attributes is None
                or mrna.attributes.id is None):
            continue

        new_gene = GFF3Record.infer_from_children(
            [mrna],
            id=f"gene.{mrna.attributes.id}",
            type="gene",
        )
        mrna.add_parent(new_gene)
        gff.add_record(new_gene)

    print("##gff-version 3", file=args.outfile)
    for record in gff.traverse_children(sort=True):
        print(record, file=args.outfile)
예제 #3
0
def main():
    """Flag unreliable records by alignment support and split the GFF.

    For every record of type ``args.group_level``:

    * a record whose custom attributes contain ``antifam_match`` is marked
      ``is_unreliable`` and ``should_exclude``;
    * coverages are computed from the record's children, and the record
      counts as supported when at least one coverage key exceeds
      ``args.min_cov`` and is not listed in ``args.exclude``;
    * an unsupported record is marked ``is_unreliable`` and, unless
      ``is_novel_locus`` reports it as novel, also ``should_exclude``.

    When ``args.stats`` is truthy, one JSON object per record is written to
    it.  Finally the GFF is split with ``split_gffs``; the kept part goes
    to ``args.outfile`` and the dropped part to ``args.filtered`` if that
    is not ``None``.
    """
    args = cli(sys.argv[0], sys.argv[1:])
    gff = GFF.parse(args.infile)

    itree = gff_to_itree(gff.select_type(args.group_level))

    for record in gff.select_type(args.group_level):

        # Make sure there is an attributes object to attach flags to.
        if record.attributes is None:
            record.attributes = GFF3Attributes()

        has_antifam = "antifam_match" in record.attributes.custom
        if has_antifam:
            record.attributes.custom["is_unreliable"] = "true"
            record.attributes.custom["should_exclude"] = "true"

        length, aligned = deal_with_kids(
            gff.traverse_children([record]),
            args.type,
            0,
            defaultdict(int),
        )
        coverages = find_coverages(aligned, length)

        supported = [
            key for key, cov in coverages.items() if cov > args.min_cov
        ]
        # Supported means: something passed the coverage cutoff and at
        # least one of those is not in the exclusion list.  An empty list
        # makes any() False, i.e. not supported.
        is_supported = any(key not in args.exclude for key in supported)

        novel = is_novel_locus(record, itree, args.threshold)

        if not is_supported:
            record.attributes.custom["is_unreliable"] = "true"
            if not novel:
                record.attributes.custom["should_exclude"] = "true"

        if args.stats:
            # The coverage mapping itself is reused as the stats row.
            coverages.update({
                "id": record.attributes.id,
                "length": length,
                "is_supported": is_supported,
                "is_novel_locus": novel,
                "antifam_match": has_antifam,
                "excluded": has_antifam or not (is_supported or novel),
            })
            print(json.dumps(coverages), file=args.stats)

    kept, dropped = split_gffs(gff, args.group_level)

    write_gff(kept, args.outfile)

    if args.filtered is not None:
        write_gff(dropped, args.filtered)
예제 #4
0
def ncbi(args: argparse.Namespace) -> None:
    """Annotate a parsed GFF with sequence ontology derived information.

    Parses ``args.infile`` (calling ``break_bubbles()`` on the result),
    loads the ontology named by ``args.so``, indexes its terms by name and
    passes the GFF through ``add_so_as_ontologies``, ``add_ncrna_types``
    and ``add_pseudogene_types`` in that order.

    NOTE(review): nothing is written to an output in this function as
    seen here; presumably the helpers modify ``gff`` in place - confirm
    where the result is emitted.

    Args:
        args: Parsed command line arguments.  Reads ``infile`` and ``so``.
    """
    gff = GFF.parse(args.infile).break_bubbles()
    ontology = Ontology.from_obo_library(args.so)

    # Index ontology terms by name; later terms win on duplicate names.
    terms_by_name = {}
    for term in ontology.values():
        terms_by_name[term.name] = term

    add_so_as_ontologies(gff, terms_by_name)
    add_ncrna_types(gff, terms_by_name, ontology, NCRNA_TYPES)
    add_pseudogene_types(gff, terms_by_name, ontology, PSEUDOGENE_TYPES)
예제 #5
0
def expandcds(args: argparse.Namespace) -> None:
    """Adjust CDS ends, optionally check codons, and write the GFF.

    For every record that is a parent of a feature of type
    ``args.cds_type``, its CDS children are sorted by ``(start, end)`` and
    passed to ``bump_start`` (if ``args.start``) and ``bump_end`` (if
    ``args.stop``).  When a FASTA was supplied and contains the parent's
    ``seqid``, ``check_start`` and ``check_stop`` are run and any non-None
    result is printed to ``args.warnings``.  Afterwards
    ``expand_to_children()`` is called on every record yielded by
    ``gff.traverse_parents`` for the CDS features, and the GFF is printed
    to ``args.outfile`` behind a ``##gff-version 3`` header.

    Args:
        args: Parsed command line arguments.  Reads ``infile``,
            ``infasta``, ``gencode``, ``cds_type``, ``start``, ``stop``,
            ``warnings`` and ``outfile``.
    """
    gff = GFF.parse(args.infile)

    seqs = None
    if args.infasta is not None:
        seqs = SeqIO.to_dict(SeqIO.parse(args.infasta, format="fasta"))

    codon_table = CodonTable.unambiguous_dna_by_id[args.gencode]

    # Collect each distinct parent of a CDS exactly once.
    cds_parents: Set[GFF3Record] = set()
    for cds in gff.select_type(args.cds_type):
        for cds_parent in cds.parents:
            cds_parents.add(cast(GFF3Record, cds_parent))

    for parent in cds_parents:
        cdss = [
            cast(GFF3Record, child)
            for child in parent.children
            if child.type == args.cds_type
        ]
        cdss.sort(key=lambda rec: (rec.start, rec.end))

        strand = find_strand(cdss, parent)

        if args.start:
            bump_start(cdss, strand)

        if args.stop:
            bump_end(cdss, strand)

        # Codon checks need the sequence this parent lives on.
        if seqs is None or parent.seqid not in seqs:
            continue

        # Start is checked (and reported) before stop.
        for check_codon in (check_start, check_stop):
            warning = check_codon(cdss, parent, strand, seqs, codon_table)
            if warning is not None:
                print(warning, file=args.warnings)

    child_cdss: Sequence[GFF3Record] = list(gff.select_type(args.cds_type))
    for ancestor in gff.traverse_parents(child_cdss):
        ancestor.expand_to_children()

    print("##gff-version 3", file=args.outfile)
    for record in gff.traverse_children(sort=True):
        print(record, file=args.outfile)
예제 #6
0
def run_extract(args) -> None:
    """Write sequence blocks with their genes, and optionally their hints.

    Blocks come from ``get_blocks``, which is given the parsed FASTA, the
    parsed GFF, the parsed hints GFF (or ``None``), ``args.pad``,
    ``args.merge`` and the optional set of good IDs.  For each block the
    sequence is written to ``args.outfasta`` in FASTA format, and a
    ``##sequence-region`` line followed by the genes is printed to
    ``args.outgff``.  If ``args.outhints`` is set and the block has a
    non-empty hints object, the same region line and the hints are printed
    to ``args.outhints``.

    Args:
        args: Parsed command line arguments.  Reads ``good``, ``gff``,
            ``fasta``, ``hints``, ``pad``, ``merge``, ``outgff``,
            ``outhints`` and ``outfasta``.
    """
    good_ids: Optional[Set[str]] = None
    if args.good is not None:
        good_ids = get_good_ids(args.good)

    in_gff = GFF.parse(args.gff)
    in_fasta = SeqIO.to_dict(SeqIO.parse(args.fasta, format="fasta"))
    in_hints = GFF.parse(args.hints) if args.hints is not None else None

    blocks = get_blocks(
        in_fasta,
        in_gff,
        in_hints,
        args.pad,
        args.merge,
        good_ids,
    )

    write_hints = args.outhints is not None

    print("##gff-version 3", file=args.outgff)
    if write_hints:
        print("##gff-version 3", file=args.outhints)

    for name, seq, genes, hints in blocks:
        SeqIO.write(seq, args.outfasta, format="fasta")

        region_line = f"##sequence-region   {name} 1 {len(seq)}"
        print(region_line, file=args.outgff)
        print(genes, file=args.outgff)

        # Only emit hints when requested and the block actually has some.
        has_hints = (write_hints
                     and hints is not None
                     and len(hints.inner) > 0)
        if has_hints:
            print(region_line, file=args.outhints)
            print(hints, file=args.outhints)
예제 #7
0
def select(args: argparse.Namespace) -> None:
    """Write the subset of a GFF related to a list of record IDs.

    Every record in ``args.infile`` whose ID appears in ``args.ids`` is
    selected, together with everything yielded by its
    ``traverse_parents()`` and ``traverse_children()``.  The selected
    records are passed to ``prune_gff`` and the result is printed to
    ``args.outfile`` behind a ``##gff-version 3`` header.

    Args:
        args: Parsed command line arguments.  Reads ``infile``, ``ids``
            (an iterable of lines, one ID per line; surrounding whitespace
            is stripped) and ``outfile``.
    """
    gff = GFF.parse(args.infile)
    ids = {line.strip() for line in args.ids}

    to_keep: Set[GFF3Record] = set()

    for record in gff:
        # Records without attributes cannot have an ID to match on.
        if record.attributes is None or record.attributes.id not in ids:
            continue

        to_keep.update(
            cast(Iterator[GFF3Record], record.traverse_parents()))
        to_keep.update(
            cast(Iterator[GFF3Record], record.traverse_children()))

    pruned = prune_gff(to_keep)

    # The version pragma is a GFF3 directive and must begin with "##";
    # a single "#" is only a comment and leaves the file without one.
    print("##gff-version 3", file=args.outfile)
    for feature in pruned.traverse_children(sort=True):
        print(feature, file=args.outfile)
예제 #8
0
def main():
    """Relabel every GFF record and attach counts from its sequence.

    Each record in ``args.ingff`` gets its type and source replaced by
    ``args.type`` and ``args.source``, its strand set to
    ``Strand.UNSTRANDED`` and its ID set to ``args.type`` followed by a
    running number starting at 1.  The record's sequence is extracted from
    the FASTA in ``args.infasta``, passed to ``count_frequencies``, and the
    result replaces the record's custom attributes.  Each record is then
    printed to ``args.outfile``.

    NOTE(review): ``region.attributes`` is used without a None check, so
    this assumes every input record has attributes - confirm against the
    parser.
    """
    args = cli(sys.argv[0], sys.argv[1:])

    gff = GFF.parse(args.ingff)
    seqs = SeqIO.to_dict(SeqIO.parse(args.infasta, format="fasta"))

    for number, region in enumerate(gff, start=1):
        region.type = args.type
        region.source = args.source
        region.strand = Strand.UNSTRANDED
        region.attributes.id = f"{args.type}{number}"

        seq_feature = gff_to_seqfeature(region)
        region_seq = seq_feature.extract(seqs[region.seqid])

        # Any existing custom attributes are replaced, not merged.
        region.attributes.custom = count_frequencies(region_seq)

        print(region, file=args.outfile)