Пример #1
0
def hints(args: argparse.Namespace) -> None:
    """Convert the features under each selected group into hint records.

    Records of type ``args.group_level`` are selected from the GFF parsed
    from ``args.infile``. Every record reached by traversing the children of
    such a group is passed through ``transform_child``; each non-``None``
    result is printed to ``args.outfile``.

    Raises:
        GPMissingID: if the ID lookup for a selected group record yields
            ``None``.
    """
    hint_map = get_hints_map(args)
    trim_map = get_trim_map(args)
    priority_map = get_priority_map(args)

    gff = GFF.parse(args.infile)

    for group in gff.select_type(args.group_level):
        # presumably fmap leaves a None attributes object as None -- confirm
        group_id = fmap(lambda attrs: attrs.id, group.attributes)
        if group_id is None:
            raise GPMissingID(
                "One of the selected records doesn't have an ID. "
                f"The offending line is {group}.")

        for child in gff.traverse_children([group]):
            hint = transform_child(
                child,
                group_id,
                hint_map,
                trim_map,
                priority_map,
                args.source,
                args.priority,
            )

            # transform_child signals "no hint for this feature" with None.
            if hint is None:
                continue

            print(hint, file=args.outfile)

    return
Пример #2
0
def add_parents(args: argparse.Namespace) -> None:
    """Give every parentless, identifiable mRNA an inferred gene parent.

    After parsing ``args.infile`` and calling ``infer_missing_parents``, each
    ``mRNA`` record that has no parents but does have an ID gets a new
    ``gene`` record (ID ``gene.<mRNA id>``) built with
    ``GFF3Record.infer_from_children``. The resulting GFF is printed to
    ``args.outfile`` beneath a ``##gff-version 3`` header.
    """
    gff = GFF.parse(args.infile)
    gff.infer_missing_parents()

    for mrna in gff.select_type("mRNA"):
        attrs = mrna.attributes
        is_orphan_with_id = (
            len(mrna.parents) == 0
            and attrs is not None
            and attrs.id is not None
        )
        if not is_orphan_with_id:
            continue

        new_gene = GFF3Record.infer_from_children(
            [mrna],
            id=f"gene.{attrs.id}",
            type="gene",
        )
        mrna.add_parent(new_gene)
        gff.add_record(new_gene)

    print("##gff-version 3", file=args.outfile)
    for record in gff.traverse_children(sort=True):
        print(record, file=args.outfile)
    return
Пример #3
0
def write_gff(
    gff: GFF,
    outfile: TextIO,
) -> None:
    """Write all records of ``gff`` to ``outfile`` under a GFF3 header.

    Args:
        gff: The collection to write; its records are obtained from
            ``gff.traverse_children(sort=True, breadth=True)``.
        outfile: Writable text handle that receives one printed record per
            line.
    """
    # The GFF3 version pragma is a directive and needs two leading hashes;
    # a single "#" is only an ordinary comment and is ignored by parsers.
    print("##gff-version 3", file=outfile)
    for feature in gff.traverse_children(sort=True, breadth=True):
        print(feature, file=outfile)
    return
Пример #4
0
def make_new_subsequence(
        seqid: str, start: int, end: int, gene_itree: IntervalTree,
        hint_itree: Optional[IntervalTree], gene_features: Sequence[GFFRecord],
        hint_features: Optional[Sequence[GFFRecord]],
        seq: SeqRecord) -> Tuple[str, SeqRecord, GFF, Optional[GFF]]:
    """Cut one block out of ``seq`` together with its genes and hints.

    The requested ``start``/``end`` window is widened so that every gene
    interval overlapping it fits with 10 extra positions on each side, then
    clamped to ``[0, len(seq)]``. Genes (and hints, when both hint arguments
    are given) are looked up through the ``data`` index stored on each
    interval and re-based onto the new block with ``shift_gff``.

    Returns:
        A tuple ``(name, subseq, genes, hints)`` where ``name`` is
        ``"{seqid}:{start}-{end}"`` and ``hints`` is ``None`` when either
        hint argument is ``None``.
    """
    overlapping_genes = gene_itree[start:end]

    # Widen the window so that no overlapping gene is truncated.
    leftmost = min(iv.begin for iv in overlapping_genes) - 10
    rightmost = max(iv.end for iv in overlapping_genes) + 10

    start = max(0, min(start, leftmost))
    end = min(len(seq), max(end, rightmost))

    # Only hints fully contained in the final window are retained.
    hint_intervals = None
    if hint_itree is not None:
        hint_intervals = [
            iv for iv in hint_itree[start:end]
            if start <= iv.begin and iv.end <= end
        ]

    name = f"{seqid}:{start}-{end}"

    subseq = FeatureLocation(start, end, 1).extract(seq)
    subseq.id = name
    subseq.name = name
    subseq.description = name

    # Each interval's ``data`` is used as an index into the feature list.
    subgenes_shifted = shift_gff(
        GFF([gene_features[iv.data] for iv in overlapping_genes]),
        name,
        start,
    )

    subhints_shifted = None
    if hint_intervals is not None and hint_features is not None:
        subhints_shifted = shift_gff(
            GFF([hint_features[iv.data] for iv in hint_intervals]),
            name,
            start,
        )

    return name, subseq, subgenes_shifted, subhints_shifted
Пример #5
0
def main():
    """Flag unsupported or AntiFam-matching records and split the GFF.

    For every record of type ``args.group_level`` the custom attributes
    ``is_unreliable`` / ``should_exclude`` are set to ``"true"`` when the
    record carries an ``antifam_match`` attribute, or when none of its
    coverage sources above ``args.min_cov`` falls outside ``args.exclude``
    (``should_exclude`` is then only set if the locus is not novel).
    Optional per-record statistics are written as JSON lines to
    ``args.stats``. Finally the GFF is split with ``split_gffs`` and written
    to ``args.outfile`` and, when given, ``args.filtered``.
    """
    args = cli(sys.argv[0], sys.argv[1:])
    gff = GFF.parse(args.infile)

    itree = gff_to_itree(gff.select_type(args.group_level))

    for mrna in gff.select_type(args.group_level):

        if mrna.attributes is None:
            mrna.attributes = GFF3Attributes()

        custom = mrna.attributes.custom

        failed_antifam = "antifam_match" in custom
        if failed_antifam:
            custom["is_unreliable"] = "true"
            custom["should_exclude"] = "true"

        length, aligned = deal_with_kids(
            gff.traverse_children([mrna]),
            args.type,
            0,
            defaultdict(int)
        )

        coverages = find_coverages(aligned, length)
        supported = [
            source for source, cov in coverages.items()
            if cov > args.min_cov
        ]
        # all() over an empty list is True, which also covers the
        # "no supporting source at all" case.
        not_supported = all(source in args.exclude for source in supported)

        is_novel = is_novel_locus(mrna, itree, args.threshold)

        if not_supported:
            custom["is_unreliable"] = "true"
            if not is_novel:
                custom["should_exclude"] = "true"

        if args.stats:
            # The coverage mapping itself is reused as the stats row.
            coverages.update({
                "id": mrna.attributes.id,
                "length": length,
                "is_supported": not not_supported,
                "is_novel_locus": is_novel,
                "antifam_match": failed_antifam,
                "excluded": (failed_antifam
                             or (not_supported and not is_novel)),
            })
            print(json.dumps(coverages), file=args.stats)

    kept, dropped = split_gffs(gff, args.group_level)

    write_gff(kept, args.outfile)

    if args.filtered is not None:
        write_gff(dropped, args.filtered)
    return
Пример #6
0
def ncbi(args: argparse.Namespace) -> None:
    """Annotate a parsed GFF with Sequence Ontology derived information.

    The GFF from ``args.infile`` is parsed and passed through
    ``break_bubbles``; the ontology is loaded from ``args.so``. A
    name-to-term lookup is built and handed to the three ``add_*`` helpers.
    Nothing is written out by this function as it stands.
    """
    gff = GFF.parse(args.infile).break_bubbles()
    ontology = Ontology.from_obo_library(args.so)

    # Index ontology terms by their name.
    terms_by_name = {}
    for term in ontology.values():
        terms_by_name[term.name] = term

    add_so_as_ontologies(gff, terms_by_name)
    add_ncrna_types(gff, terms_by_name, ontology, NCRNA_TYPES)
    add_pseudogene_types(gff, terms_by_name, ontology, PSEUDOGENE_TYPES)

    return
Пример #7
0
def get_blocks(
    seqs: Dict[str, SeqRecord], genes: GFF, hints: Optional[GFF], pad: int,
    merge: bool, target_ids: Optional[Set[str]]
) -> Iterator[Tuple[str, SeqRecord, GFF, Optional[GFF]]]:
    """Yield padded (or merged) sequence blocks around target genes.

    ``gene`` records are grouped by sequence ID; sequence IDs missing from
    ``seqs`` are skipped. For each remaining sequence the block coordinates
    come from ``merge_overlapping`` when ``merge`` is true, otherwise from
    ``pad_features``, and each distinct ``(start, end)`` pair is turned into
    a block with ``make_new_subsequence``.

    Args:
        seqs: Sequence records keyed by sequence ID.
        genes: GFF whose ``gene`` records define the blocks.
        hints: Optional GFF of hints to carry along.
        pad: Padding passed to the block-coordinate helper.
        merge: Choose ``merge_overlapping`` over ``pad_features``.
        target_ids: If given, only genes selected by
            ``subset_features_by_id`` seed blocks.
    """
    hints_by_seqid = fmap(groupby_seqid, hints)

    genes_by_seqid = groupby_seqid(genes.select_type("gene"))
    for seqid, gene_features in genes_by_seqid.items():
        if seqid not in seqs:
            continue

        gene_itree = IntervalTree(gffrecords_to_intervals(gene_features))
        try:
            hint_itree = fmap(
                IntervalTree,
                fmap(gffrecords_to_intervals,
                     fmap(lambda h: h.get(seqid, []), hints_by_seqid)))
        except ValueError:
            # Dump the offending hints before propagating the error.
            if hints_by_seqid is not None:
                print(seqid, hints_by_seqid.get(seqid, []))
            raise

        target_features: List[GFFRecord] = (
            gene_features
            if target_ids is None
            else subset_features_by_id(gene_features, target_ids)
        )

        make_coords = merge_overlapping if merge else pad_features

        # Identical coordinates are only emitted once per sequence.
        emitted: Set[Tuple[int, int]] = set()
        for start, end in make_coords(target_features, pad):
            coords = (start, end)
            if coords in emitted:
                continue
            emitted.add(coords)

            yield make_new_subsequence(
                seqid,
                start,
                end,
                gene_itree,
                hint_itree,
                gene_features,
                fmap(lambda h: h.get(seqid, []), hints_by_seqid),
                seqs[seqid],
            )

    return
Пример #8
0
def split_gffs(gff: GFF, group_level: str) -> Tuple[GFF, GFF]:
    """Partition ``gff`` into kept and dropped records.

    A record of type ``group_level`` is dropped when it has attributes whose
    ``custom`` mapping contains ``should_exclude``; otherwise it is kept. The
    records reached by traversing its children and its parents go into the
    same partition.

    Returns:
        ``(kept, dropped)``, each produced by ``prune_gff``.
    """
    keep: Set[GFF3Record] = set()
    drop: Set[GFF3Record] = set()

    for mrna in gff.select_type(group_level):
        attrs = mrna.attributes
        excluded = attrs is not None and "should_exclude" in attrs.custom

        bucket = drop if excluded else keep
        bucket.update(gff.traverse_children([mrna]))
        bucket.update(gff.traverse_parents([mrna]))

    return prune_gff(keep), prune_gff(drop)
Пример #9
0
def shift_gff(gff: GFF, seqid: str, start: int) -> GFF:
    """Return a copy of ``gff`` re-based onto a new sequence.

    Each record from ``gff.traverse_children(sort=True)`` is deep-copied,
    has ``start`` subtracted from both coordinates and its ``seqid``
    replaced. Record IDs and parent IDs are prefixed with ``"<seqid>_"``.

    Args:
        gff: Records to copy and shift.
        seqid: New sequence ID, also used as the ID prefix.
        start: Offset subtracted from ``start`` and ``end`` of each record.
    """
    prefix = seqid + "_"
    shifted = []

    for original in gff.traverse_children(sort=True):
        record = deepcopy(original)
        record.start -= start
        record.end -= start
        record.seqid = seqid

        attrs = record.attributes
        if attrs is not None:
            if attrs.id is not None:
                attrs.id = prefix + attrs.id

            attrs.parent = [prefix + parent_id for parent_id in attrs.parent]

        shifted.append(record)

    return GFF(shifted)
Пример #10
0
def expandcds(args: argparse.Namespace) -> None:
    """Optionally bump CDS start/stop ends, check codons, and print the GFF.

    For every parent of a record of type ``args.cds_type``, its CDS children
    are sorted by ``(start, end)`` and passed to ``bump_start`` /
    ``bump_end`` when ``args.start`` / ``args.stop`` are set. If a FASTA was
    supplied and contains the parent's sequence, the results of
    ``check_start`` and ``check_stop`` are printed to ``args.warnings`` when
    they are not ``None``. All ancestors of CDS records are then expanded to
    their children and the GFF is printed to ``args.outfile``.
    """
    gff = GFF.parse(args.infile)

    seqs = (
        None
        if args.infasta is None
        else SeqIO.to_dict(SeqIO.parse(args.infasta, format="fasta"))
    )

    codon_table = CodonTable.unambiguous_dna_by_id[args.gencode]

    # Collect each distinct parent that owns at least one CDS.
    cds_parents: Set[GFF3Record] = set()
    for cds in gff.select_type(args.cds_type):
        for owner in cds.parents:
            cds_parents.add(cast(GFF3Record, owner))

    for parent in cds_parents:
        cdss = [
            cast(GFF3Record, child)
            for child in parent.children
            if child.type == args.cds_type
        ]
        cdss.sort(key=lambda rec: (rec.start, rec.end))

        strand = find_strand(cdss, parent)

        if args.start:
            bump_start(cdss, strand)

        if args.stop:
            bump_end(cdss, strand)

        # Codon checks need the parent's sequence to be available.
        if seqs is None or parent.seqid not in seqs:
            continue

        for check in (check_start, check_stop):
            warning = check(cdss, parent, strand, seqs, codon_table)
            if warning is not None:
                print(warning, file=args.warnings)

    child_cdss: Sequence[GFF3Record] = list(gff.select_type(args.cds_type))
    for ancestor in gff.traverse_parents(child_cdss):
        ancestor.expand_to_children()

    print("##gff-version 3", file=args.outfile)
    for record in gff.traverse_children(sort=True):
        print(record, file=args.outfile)
    return
Пример #11
0
def run_extract(args) -> None:
    """Extract gene blocks and write their FASTA, GFF and hint records.

    Inputs are read from ``args.gff``, ``args.fasta`` and, optionally,
    ``args.hints`` and ``args.good``. Each block from ``get_blocks`` is
    written as a FASTA record to ``args.outfasta`` and as a
    ``##sequence-region`` line plus genes to ``args.outgff``. Hints are
    written to ``args.outhints`` only when that handle is given and the
    block has a non-empty hint set.
    """
    good_ids: Optional[Set[str]] = (
        get_good_ids(args.good) if args.good is not None else None
    )

    in_gff = GFF.parse(args.gff)
    in_fasta = SeqIO.to_dict(SeqIO.parse(args.fasta, format="fasta"))
    in_hints = GFF.parse(args.hints) if args.hints is not None else None

    blocks = get_blocks(
        in_fasta,
        in_gff,
        in_hints,
        args.pad,
        args.merge,
        good_ids,
    )

    print("##gff-version 3", file=args.outgff)

    write_hints = args.outhints is not None
    if write_hints:
        print("##gff-version 3", file=args.outhints)

    for name, seq, genes, hints in blocks:
        SeqIO.write(seq, args.outfasta, format="fasta")

        region_line = f"##sequence-region   {name} 1 {len(seq)}"
        print(region_line, file=args.outgff)
        print(genes, file=args.outgff)

        has_hints = hints is not None and len(hints.inner) > 0
        if write_hints and has_hints:
            print(region_line, file=args.outhints)
            print(hints, file=args.outhints)

    return
Пример #12
0
def trnascan2gff(args: argparse.Namespace) -> None:
    """Convert tRNAscan matches and secondary structures to GFF3 records.

    Matches are read from ``args.txt`` and secondary-structure records from
    ``args.ss``; the two are paired by ``"<seqid>.<num>"``. Each match yields
    a gene (``pseudogene`` when its note contains ``"pseudo"``, otherwise
    ``tRNA_gene``), a tRNA child, intron records and an anticodon record. All
    records are printed, sorted, to ``args.outfile``.

    Raises:
        KeyError: if a match has no secondary-structure record with the same
            ``"<seqid>.<num>"`` key.
    """
    records: List[GFF3Record] = []

    matches = TRNAScanRecord.from_file(args.txt)

    # Index the secondary structures by "<seqid>.<num>".
    ss_by_key = {}
    for ss_record in TRNAScanSS.from_file(args.ss):
        ss_by_key[f"{ss_record.seqid}.{ss_record.num}"] = ss_record

    for match in matches:
        ss = ss_by_key[f"{match.seqid}.{match.num}"]

        is_pseudo = match.note is not None and "pseudo" in match.note
        gene_type = "pseudogene" if is_pseudo else "tRNA_gene"

        gene = match_to_gene(match, args.source, type=gene_type)
        records.append(gene)

        trna = match_to_trna(
            match,
            ss,
            args.source,
            type_map=TYPE_MAP,
            parents=[gene]
        )
        records.append(trna)

        records.extend(
            match_to_introns(
                match,
                args.source,
                type="tRNA_intron",
                parents=[trna]
            )
        )

        records.append(
            match_to_anticodon(
                match,
                ss,
                args.source,
                type="anticodon",
                parents=[trna]
            )
        )

    for record in GFF(records).traverse_children(sort=True):
        print(record, file=args.outfile)

    return
Пример #13
0
def prune_gff(records: Set[GFF3Record]) -> GFF:
    """Build a new GFF holding copies of ``records`` with links restricted.

    Every record is deep-copied with empty parent/child lists. Links are then
    re-created between the copies, but only for parents and children that are
    themselves members of ``records``. Each copy's ``attributes.parent`` is
    rewritten to list the IDs of its re-created parents.
    """
    # Old record -> detached copy, so relationships can be re-mapped.
    clones: Dict[GFF3Record, GFF3Record] = {}
    for original in records:
        clone = deepcopy(original)
        clone.children = []
        clone.parents = []
        clones[original] = clone

    for original in records:
        clone = clones[original]

        # Relatives outside the requested set are left out.
        for parent in original.parents:
            if parent in records:
                clone.add_parent(clones[cast(GFF3Record, parent)])

        for child in original.children:
            if child in records:
                clone.add_child(clones[cast(GFF3Record, child)])

        if clone.attributes is None:
            # Without attributes there is nothing that could have declared
            # a parent/child relationship.
            assert len(clone.children) == 0
            assert len(clone.parents) == 0
            continue

        # Make the Parent attribute mirror the pruned parent links.
        parent_ids = []
        for parent in clone.parents:
            # A linked parent is expected to always carry an ID.
            assert parent.attributes is not None
            assert parent.attributes.id is not None
            parent_ids.append(parent.attributes.id)
        clone.attributes.parent = parent_ids

    return GFF(list(clones.values()))
Пример #14
0
def rnammer2gff(args: argparse.Namespace) -> None:
    """Convert RNAmmer tab-separated output lines to rRNA gene/rRNA records.

    Lines from ``args.infile`` starting with ``#`` are skipped. For each
    other line, column 9 is looked up (lower-cased) in
    ``TYPE_MAP[args.kingdom]`` to obtain the new type, column 2 is replaced
    by ``args.source`` and column 9 by ``"."``. The parsed record is paired
    with a deep copy of type ``rRNA_gene`` that becomes its parent. Records
    are numbered while traversing and printed to ``args.outfile``.
    """
    records: List[GFFRecord] = []

    for line in args.infile:
        if line.startswith("#"):
            continue

        columns = line.strip().split("\t")
        rrna_type = columns[8]
        columns[1] = args.source
        columns[2] = TYPE_MAP[args.kingdom][rrna_type.lower()]
        columns[8] = "."

        rna = GFFRecord.parse("\t".join(columns))
        gene = deepcopy(rna)
        gene.type = "rRNA_gene"
        gene.add_child(rna)

        records += [gene, rna]

    gene_count = 0
    for record in GFF(records).traverse_children(sort=True):
        attrs = record.attributes
        if attrs is None:
            attrs = GFFAttributes()
            record.attributes = attrs

        if record.type == "rRNA_gene":
            gene_count += 1
            attrs.id = f"rRNA_gene{gene_count}"
        else:
            # Non-gene records reuse the number of the latest gene seen.
            attrs.id = f"rRNA{gene_count}"
            attrs.parent = [
                parent.attributes.id
                for parent in record.parents
                if (parent.attributes is not None
                    and parent.attributes.id is not None)
            ]

        print(record, file=args.outfile)

    return
Пример #15
0
def select(args: argparse.Namespace) -> None:
    """Print the records whose ID is listed, plus their relatives.

    IDs are read one per line from ``args.ids`` (surrounding whitespace is
    stripped). Every record in the GFF parsed from ``args.infile`` whose ID
    is in that set is kept together with the records reached by
    ``traverse_parents`` and ``traverse_children``. The selection is pruned
    with ``prune_gff`` and printed to ``args.outfile`` under a GFF3 version
    header.
    """
    gff = GFF.parse(args.infile)
    ids = {line.strip() for line in args.ids}

    to_keep: Set[GFF3Record] = set()

    for record in gff:
        if record.attributes is not None and record.attributes.id in ids:
            to_keep.update(
                cast(Iterator[GFF3Record], record.traverse_parents()))

            to_keep.update(
                cast(Iterator[GFF3Record], record.traverse_children()))

    pruned = prune_gff(to_keep)

    # The GFF3 version pragma is a directive and needs two leading hashes;
    # a single "#" would be treated as an ordinary comment.
    print("##gff-version 3", file=args.outfile)
    for feature in pruned.traverse_children(sort=True):
        print(feature, file=args.outfile)
Пример #16
0
def main():
    """Retype each GFF record, number it, and attach base frequencies.

    Every record in ``args.ingff`` gets ``args.type`` and ``args.source``,
    is made unstranded and receives the ID ``"<type><n>"`` with ``n``
    counting from 1. The matching sub-sequence is extracted from
    ``args.infasta`` and the result of ``count_frequencies`` replaces the
    record's custom attributes before it is printed to ``args.outfile``.
    """
    args = cli(sys.argv[0], sys.argv[1:])

    gff = GFF.parse(args.ingff)
    seqs = SeqIO.to_dict(SeqIO.parse(args.infasta, format="fasta"))

    for number, region in enumerate(gff, start=1):
        region.type = args.type
        region.source = args.source
        region.strand = Strand.UNSTRANDED

        region.attributes.id = f"{args.type}{number}"

        region_seq = gff_to_seqfeature(region).extract(seqs[region.seqid])
        region.attributes.custom = count_frequencies(region_seq)

        print(region, file=args.outfile)
    return
Пример #17
0
def main():
    """Convert mRNA-style helitron predictions into SO-typed GFF3 records.

    Non-comment lines of ``args.infile`` are parsed as GFF records; each
    record's ``name`` attribute becomes its ``id`` and its custom attributes
    are cleared. After ``infer_missing_parents``, every ``mRNA`` with at
    least two children is written to ``args.outfile`` as a ``repeat_region``
    containing a ``helitron`` with a 5' and a 3' flanking region. The flanks
    are the children whose original IDs end in ``.5.1`` and ``.3``.

    Raises:
        IndexError: if a processed ``mRNA`` has no child whose ID ends in
            ``.3`` or in ``.5.1``.
    """
    args = cli(sys.argv[0], sys.argv[1:])

    rows = list()
    for line in args.infile:
        if line.startswith("#"):
            continue
        record = GFFRecord.parse(line)
        # The input stores the identifier in Name; move it to ID.
        record.attributes.id = record.attributes.name
        record.attributes.name = None
        record.attributes.custom = {}
        rows.append(record)

    gff = GFF(rows)
    gff.infer_missing_parents()

    counter = 1

    for mrna in gff.select_type("mRNA"):
        if len(mrna.children) < 2:
            continue

        region = deepcopy(mrna)
        region.type = "repeat_region"
        region.attributes.id = f"repeat_region{counter}"
        # Must be a plain list of terms: a trailing comma here would wrap
        # the list in a one-element tuple.
        region.attributes.ontology_term = ["SO:0000657"]

        mrna.type = "helitron"
        mrna.parents = [region]
        mrna.attributes.parent = [region.attributes.id]
        mrna.attributes.id = f"helitron{counter}"
        mrna.attributes.ontology_term = ["SO:0000544", "SO:helitron"]
        mrna.attributes.custom = {}

        # The flanks are recognised by the suffix of their original IDs.
        flank3 = [c for c in mrna.children
                  if c.attributes.id.endswith(".3")][0]
        flank5 = [c for c in mrna.children
                  if c.attributes.id.endswith(".5.1")][0]

        flank3.type = "three_prime_flanking_region"
        flank3.attributes.ontology_term = [
            "SO:0001417",
            "SO:three_prime_flanking_region",
            "SO:0000364",
            "SO:transposable_element_flanking_region"
        ]
        flank3.attributes.id = None
        flank3.attributes.parent = [mrna.attributes.id]

        flank5.type = "five_prime_flanking_region"
        flank5.attributes.ontology_term = [
            "SO:0001416",
            "SO:five_prime_flanking_region",
            "SO:0000364",
            "SO:transposable_element_flanking_region"
        ]
        flank5.attributes.id = None
        flank5.attributes.parent = [mrna.attributes.id]

        mrna.source = flank5.source

        print(region, file=args.outfile)
        print(mrna, file=args.outfile)
        # On the minus strand the 3' flank is printed before the 5' flank.
        if mrna.strand == Strand.MINUS:
            print(flank3, file=args.outfile)
            print(flank5, file=args.outfile)
        else:
            print(flank5, file=args.outfile)
            print(flank3, file=args.outfile)

        counter += 1

    return