예제 #1
0
def make_new_subsequence(
        seqid: str, start: int, end: int, gene_itree: IntervalTree,
        hint_itree: Optional[IntervalTree], gene_features: Sequence[GFFRecord],
        hint_features: Optional[Sequence[GFFRecord]],
        seq: SeqRecord) -> Tuple[str, SeqRecord, GFF, Optional[GFF]]:
    """Cut a window out of ``seq`` along with its gene and hint features.

    The requested window is widened so that every interval returned by
    ``gene_itree[start:end]`` fits inside it with 10 positions of padding on
    each side, and is then clamped to ``0 .. len(seq)``. If no gene interval
    is selected the requested bounds are kept (apart from the clamping).

    Args:
        seqid: Identifier of the source sequence; used to build the new name.
        start: Requested window start.
        end: Requested window end.
        gene_itree: Interval tree whose intervals carry, in ``.data``, an
            index into ``gene_features``.
        hint_itree: Optional interval tree whose intervals carry, in
            ``.data``, an index into ``hint_features``.
        gene_features: Gene records addressed by the gene intervals.
        hint_features: Hint records addressed by the hint intervals.
        seq: The sequence record to extract the window from.

    Returns:
        A tuple ``(name, subseq, genes, hints)`` where ``name`` is
        ``"{seqid}:{start}-{end}"`` built from the final bounds, ``subseq``
        is the extracted record renamed to ``name``, ``genes`` is the
        selected gene features passed through ``shift_gff``, and ``hints``
        is the same for hint features, or ``None`` when either
        ``hint_itree`` or ``hint_features`` is ``None``.
    """

    padding = 10
    gene_intervals = gene_itree[start:end]

    # Widen the window to cover each selected gene plus padding. The
    # ``default`` values keep the requested bounds when nothing was selected
    # instead of letting min()/max() raise ValueError on an empty input.
    start = min(
        start,
        min((i.begin - padding for i in gene_intervals), default=start),
    )
    end = max(
        end,
        max((i.end + padding for i in gene_intervals), default=end),
    )

    # The padding may push the window past either end of the sequence.
    start = max(start, 0)
    end = min(end, len(seq))

    if hint_itree is None:
        hint_intervals = None
    else:
        # Only keep hints that lie entirely inside the final window.
        hint_intervals = [
            i for i in hint_itree[start:end]
            if i.begin >= start and i.end <= end
        ]

    name = f"{seqid}:{start}-{end}"

    subseq = FeatureLocation(start, end, 1).extract(seq)
    subseq.id = name
    subseq.name = name
    subseq.description = name

    subgenes = GFF([gene_features[i.data] for i in gene_intervals])
    subgenes_shifted = shift_gff(subgenes, name, start)

    if hint_intervals is None or hint_features is None:
        subhints_shifted = None
    else:
        subhints = GFF([hint_features[i.data] for i in hint_intervals])
        subhints_shifted = shift_gff(subhints, name, start)

    return name, subseq, subgenes_shifted, subhints_shifted
예제 #2
0
def trnascan2gff(args: argparse.Namespace) -> None:
    """Convert tRNAscan results into GFF records and print them.

    Matches are read from ``args.txt`` and secondary-structure records from
    ``args.ss``; the two are paired on ``"{seqid}.{num}"``. For every match a
    gene, a tRNA, its introns and an anticodon record are created, and the
    whole set is printed to ``args.outfile`` in ``traverse_children`` order.
    """
    records: List[GFF3Record] = []

    trna_matches = TRNAScanRecord.from_file(args.txt)
    structures = TRNAScanSS.from_file(args.ss)
    ss_by_key = {f"{s.seqid}.{s.num}": s for s in structures}

    for hit in trna_matches:
        structure = ss_by_key[f"{hit.seqid}.{hit.num}"]

        # Matches whose note mentions "pseudo" are reported as pseudogenes.
        is_pseudo = hit.note is not None and "pseudo" in hit.note
        gene_type = "pseudogene" if is_pseudo else "tRNA_gene"

        gene = match_to_gene(hit, args.source, type=gene_type)
        trna = match_to_trna(
            hit,
            structure,
            args.source,
            type_map=TYPE_MAP,
            parents=[gene],
        )
        introns = match_to_introns(
            hit,
            args.source,
            type="tRNA_intron",
            parents=[trna],
        )
        anticodon = match_to_anticodon(
            hit,
            structure,
            args.source,
            type="anticodon",
            parents=[trna],
        )

        records.extend([gene, trna, *introns, anticodon])

    for record in GFF(records).traverse_children(sort=True):
        print(record, file=args.outfile)
예제 #3
0
def prune_gff(records: Set[GFF3Record]) -> GFF:
    """Build a GFF from copies of ``records`` linked only to each other.

    Every record is deep-copied with its parent and child lists emptied.
    Parent/child links are then re-created between the copies, but only for
    relatives that are themselves members of ``records``. Finally each copy's
    ``attributes.parent`` list is rebuilt from the IDs of its kept parents.
    """

    # Keyed by the original record so relatives can be looked up by the
    # objects the originals refer to.
    copies: Dict[GFF3Record, GFF3Record] = {}
    for original in records:
        clone = deepcopy(original)
        clone.children = []
        clone.parents = []
        copies[original] = clone

    for original in records:
        clone = copies[original]

        # Relatives outside the requested set are dropped.
        for parent in original.parents:
            if parent in records:
                clone.add_parent(copies[cast(GFF3Record, parent)])

        for child in original.children:
            if child in records:
                clone.add_child(copies[cast(GFF3Record, child)])

        if clone.attributes is None:
            # Without attributes there is no ID/Parent to express a
            # relationship, so the copy must have ended up unlinked.
            assert len(clone.children) == 0
            assert len(clone.parents) == 0
            continue

        # Rewrite the Parent IDs so they match the pruned parent list.
        clone.attributes.parent = []
        for kept_parent in clone.parents:
            # A parent can only be referenced through its ID.
            assert kept_parent.attributes is not None
            assert kept_parent.attributes.id is not None
            clone.attributes.parent.append(kept_parent.attributes.id)

    return GFF(list(copies.values()))
예제 #4
0
def rnammer2gff(args: argparse.Namespace) -> None:
    """Convert tab-separated RNAmmer output into gene/rRNA GFF records.

    Each data line of ``args.infile`` is rewritten before parsing: column 2
    becomes ``args.source``, column 3 becomes the type looked up in
    ``TYPE_MAP[args.kingdom]`` from the lower-cased value of column 9, and
    column 9 is reset to ``"."``. The parsed record is kept as the rRNA and a
    deep copy of it, typed ``rRNA_gene``, is added as its parent gene.

    Records are then numbered and printed to ``args.outfile`` in
    ``traverse_children`` order.

    Raises:
        KeyError: If the kingdom or rRNA type is missing from ``TYPE_MAP``.
        IndexError: If a data line has fewer than nine columns.
    """
    records: List[GFFRecord] = []

    for line in args.infile:
        if line.startswith("#"):
            continue

        stripped = line.strip()
        # A blank line would split into [''] and fail on the column lookup
        # below, so skip it rather than crash.
        if not stripped:
            continue

        sline = stripped.split("\t")
        rrna_type = sline[8]
        new_type = TYPE_MAP[args.kingdom][rrna_type.lower()]
        sline[1] = args.source
        sline[2] = new_type
        sline[8] = "."

        rna_record = GFFRecord.parse("\t".join(sline))
        gene_record = deepcopy(rna_record)
        gene_record.type = "rRNA_gene"
        gene_record.add_child(rna_record)

        records.append(gene_record)
        records.append(rna_record)

    num = 0
    for record in GFF(records).traverse_children(sort=True):
        if record.attributes is None:
            attr = GFFAttributes()
            record.attributes = attr
        else:
            attr = record.attributes

        if record.type == "rRNA_gene":
            num += 1
            attr.id = f"rRNA_gene{num}"
        else:
            # Non-gene records reuse the number of the most recently seen
            # gene. NOTE(review): this presumes the traversal yields each
            # gene before its child - confirm against traverse_children.
            attr.id = f"rRNA{num}"
            attr.parent = [
                p.attributes.id for p in record.parents
                if (p.attributes is not None and p.attributes.id is not None)
            ]

        print(record, file=args.outfile)
예제 #5
0
def shift_gff(gff: GFF, seqid: str, start: int) -> GFF:
    """Return a copy of ``gff`` re-based onto a sequence named ``seqid``.

    Each record yielded by ``gff.traverse_children(sort=True)`` is
    deep-copied, has ``start`` subtracted from both of its coordinates, and
    is assigned the new ``seqid``. Where attributes exist, the ID (if set)
    and every parent ID are prefixed with ``"{seqid}_"``.
    """
    prefix = seqid + "_"
    shifted = []

    for record in gff.traverse_children(sort=True):
        moved = deepcopy(record)
        moved.start -= start
        moved.end -= start
        moved.seqid = seqid

        attrs = moved.attributes
        if attrs is not None:
            if attrs.id is not None:
                attrs.id = prefix + attrs.id

            # Parent references get the same prefix as the IDs above.
            attrs.parent = [prefix + parent for parent in attrs.parent]

        shifted.append(moved)

    return GFF(shifted)
예제 #6
0
def main():
    """Rewrite mRNA-style helitron predictions as repeat-region GFF records.

    Every non-comment line of ``args.infile`` is parsed as a GFF record whose
    ``Name`` attribute is moved into ``ID`` and whose custom attributes are
    cleared. After ``infer_missing_parents`` has run, each ``mRNA`` record
    with at least two children is turned into:

    * a ``repeat_region`` record (a deep copy of the mRNA taken before it is
      modified),
    * a ``helitron`` record (the mRNA itself) parented to that region,
    * its 3' and 5' flanking regions, chosen as the children whose IDs end
      in ``".3"`` and ``".5.1"`` respectively.

    The four records are printed to ``args.outfile``; the flank order is
    reversed for minus-strand helitrons.

    Raises:
        IndexError: If a selected mRNA lacks a child with either ID suffix.
    """
    args = cli(sys.argv[0], sys.argv[1:])

    rows = list()
    for line in args.infile:
        if line.startswith("#"):
            continue
        record = GFFRecord.parse(line)
        record.attributes.id = record.attributes.name
        record.attributes.name = None
        record.attributes.custom = {}
        rows.append(record)

    gff = GFF(rows)
    gff.infer_missing_parents()

    counter = 1

    for mrna in gff.select_type("mRNA"):
        if len(mrna.children) < 2:
            continue

        region = deepcopy(mrna)
        region.type = "repeat_region"
        region.attributes.id = f"repeat_region{counter}"
        # Must be a plain list of terms, like the other ontology_term
        # assignments below; a trailing comma here used to wrap the list in
        # a one-element tuple.
        region.attributes.ontology_term = ["SO:0000657"]

        mrna.type = "helitron"
        mrna.parents = [region]
        mrna.attributes.parent = [region.attributes.id]
        mrna.attributes.id = f"helitron{counter}"
        mrna.attributes.ontology_term = ["SO:0000544", "SO:helitron"]
        mrna.attributes.custom = {}

        # The flanks are identified by the suffix of their ID.
        flank3 = [c for c in mrna.children
                  if c.attributes.id.endswith(".3")][0]
        flank5 = [c for c in mrna.children
                  if c.attributes.id.endswith(".5.1")][0]

        flank3.type = "three_prime_flanking_region"
        flank3.attributes.ontology_term = [
            "SO:0001417",
            "SO:three_prime_flanking_region",
            "SO:0000364",
            "SO:transposable_element_flanking_region"
        ]
        flank3.attributes.id = None
        flank3.attributes.parent = [mrna.attributes.id]

        flank5.type = "five_prime_flanking_region"
        flank5.attributes.ontology_term = [
            "SO:0001416",
            "SO:five_prime_flanking_region",
            "SO:0000364",
            "SO:transposable_element_flanking_region"
        ]
        flank5.attributes.id = None
        flank5.attributes.parent = [mrna.attributes.id]

        mrna.source = flank5.source

        print(region, file=args.outfile)
        print(mrna, file=args.outfile)
        # On the minus strand the 3' flank is printed before the 5' flank.
        if mrna.strand == Strand.MINUS:
            print(flank3, file=args.outfile)
            print(flank5, file=args.outfile)
        else:
            print(flank5, file=args.outfile)
            print(flank3, file=args.outfile)

        counter += 1