def make_new_subsequence(
    seqid: str,
    start: int,
    end: int,
    gene_itree: IntervalTree,
    hint_itree: Optional[IntervalTree],
    gene_features: Sequence[GFFRecord],
    hint_features: Optional[Sequence[GFFRecord]],
    seq: SeqRecord
) -> Tuple[str, SeqRecord, GFF, Optional[GFF]]:
    """Cut a window out of `seq` together with its gene and hint features.

    The requested window ``start``..``end`` is widened so that it reaches
    10 positions beyond the outermost gene interval returned by
    ``gene_itree[start:end]``, and is then clamped to ``0``..``len(seq)``.

    Args:
        seqid: Identifier used as the prefix of the new sequence name.
        start: Requested window start.
        end: Requested window end.
        gene_itree: Interval tree of genes; each interval's ``data`` is used
            as an index into `gene_features`.
        hint_itree: Interval tree of hints (``data`` indexes `hint_features`),
            or None when there are no hints.
        gene_features: Gene records addressed by the gene interval data.
        hint_features: Hint records addressed by the hint interval data,
            or None.
        seq: The sequence record to extract from.

    Returns:
        A tuple ``(name, subseq, genes, hints)`` where ``name`` is
        ``"{seqid}:{start}-{end}"`` using the final window, ``subseq`` is the
        extracted record with id/name/description set to ``name``, and
        ``genes``/``hints`` are the selected features passed through
        ``shift_gff``. ``hints`` is None if either `hint_itree` or
        `hint_features` is None.

    Raises:
        ValueError: If ``gene_itree[start:end]`` yields no intervals, because
            ``min``/``max`` are then called on an empty sequence.
    """
    overlapping_genes = gene_itree[start:end]

    # Pad the outermost gene boundaries by 10 on each side.
    padded_gene_start = min(iv.begin for iv in overlapping_genes) - 10
    padded_gene_end = max(iv.end for iv in overlapping_genes) + 10

    # Grow the window to cover the padded genes, but stay inside the sequence.
    start = max(0, min(start, padded_gene_start))
    end = min(len(seq), max(end, padded_gene_end))

    # Only hints lying completely inside the final window are kept.
    hint_intervals = None
    if hint_itree is not None:
        hint_intervals = [
            iv
            for iv in hint_itree[start:end]
            if start <= iv.begin and iv.end <= end
        ]

    name = f"{seqid}:{start}-{end}"

    subseq = FeatureLocation(start, end, 1).extract(seq)
    subseq.id = name
    subseq.name = name
    subseq.description = name

    subgenes = GFF([gene_features[iv.data] for iv in overlapping_genes])
    subgenes_shifted = shift_gff(subgenes, name, start)

    if hint_intervals is not None and hint_features is not None:
        subhints = GFF([hint_features[iv.data] for iv in hint_intervals])
        subhints_shifted = shift_gff(subhints, name, start)
    else:
        subhints_shifted = None

    return name, subseq, subgenes_shifted, subhints_shifted
def trnascan2gff(args: argparse.Namespace) -> None:
    """Convert tRNAscan results into GFF records and print them.

    Matches are read from ``args.txt`` and secondary-structure records from
    ``args.ss``; the two are paired on ``"{seqid}.{num}"``. For every match a
    gene, a tRNA, its introns and an anticodon record are created (using
    ``args.source`` as the source), and all records are finally printed to
    ``args.outfile`` in ``traverse_children(sort=True)`` order.

    Raises:
        KeyError: If a match has no secondary-structure record with the same
            ``seqid``/``num`` key.
    """
    features: List[GFF3Record] = []

    matches = TRNAScanRecord.from_file(args.txt)
    structures = TRNAScanSS.from_file(args.ss)

    # Index the secondary structures so each match can find its partner.
    ss_by_key = {f"{s.seqid}.{s.num}": s for s in structures}

    for match in matches:
        ss = ss_by_key[f"{match.seqid}.{match.num}"]

        # Matches whose note mentions "pseudo" are emitted as pseudogenes.
        is_pseudo = match.note is not None and "pseudo" in match.note
        type_ = "pseudogene" if is_pseudo else "tRNA_gene"

        gene = match_to_gene(match, args.source, type=type_)
        features.append(gene)

        trna = match_to_trna(
            match,
            ss,
            args.source,
            type_map=TYPE_MAP,
            parents=[gene]
        )
        features.append(trna)

        features.extend(
            match_to_introns(
                match,
                args.source,
                type="tRNA_intron",
                parents=[trna]
            )
        )

        features.append(
            match_to_anticodon(
                match,
                ss,
                args.source,
                type="anticodon",
                parents=[trna]
            )
        )

    for record in GFF(features).traverse_children(sort=True):
        print(record, file=args.outfile)
def prune_gff(records: Set[GFF3Record]) -> GFF:
    """Build a GFF from copies of `records` linked only to each other.

    Every record is deep-copied and its parent/child lists are reset; links
    are then re-created only between records that are both members of
    `records`. The ``parent`` attribute of each copy is rewritten to list the
    IDs of the parents that survived.

    Args:
        records: The set of records to keep.

    Returns:
        A GFF holding the copied, re-linked records.

    Raises:
        AssertionError: If a retained parent has no attributes or no ID, or
            if a record without attributes ends up with parents or children.
    """
    # Original -> detached copy. Keyed by the original so that links found on
    # the originals can be translated to the corresponding copies.
    copies: Dict[GFF3Record, GFF3Record] = dict()
    for original in records:
        clone = deepcopy(original)
        clone.children = []
        clone.parents = []
        copies[original] = clone

    for original, clone in copies.items():
        # Re-attach only those relatives that are part of the retained set.
        for parent in original.parents:
            if parent in records:
                clone.add_parent(copies[cast(GFF3Record, parent)])

        for child in original.children:
            if child in records:
                clone.add_child(copies[cast(GFF3Record, child)])

        if clone.attributes is None:
            # Without attributes there is nothing that could have defined a
            # parent/child relationship, so none may exist on the copy.
            assert len(clone.children) == 0
            assert len(clone.parents) == 0
            continue

        # Rewrite the Parent IDs to match the links that were kept.
        clone.attributes.parent = []
        for kept_parent in clone.parents:
            # A parent must carry an ID for the relationship to exist.
            assert kept_parent.attributes is not None
            assert kept_parent.attributes.id is not None
            clone.attributes.parent.append(kept_parent.attributes.id)

    return GFF(list(copies.values()))
def rnammer2gff(args: argparse.Namespace) -> None:
    """Convert tab-separated rRNA predictions into gene/rRNA GFF records.

    Each non-comment line of ``args.infile`` is split on tabs. Column 9
    (index 8) is read as the rRNA type and translated through
    ``TYPE_MAP[args.kingdom]``; the source column is replaced by
    ``args.source``, the type column by the translated type, and column 9 by
    ``"."``. For every line an ``rRNA_gene`` record is created as the parent
    of the rRNA record. Records are then numbered and printed to
    ``args.outfile``.

    Raises:
        IndexError: If a non-comment line has fewer than nine columns.
        KeyError: If the kingdom or the lower-cased rRNA type is not present
            in ``TYPE_MAP``.
    """
    records: List[GFFRecord] = []

    for line in args.infile:
        if line.startswith("#"):
            continue

        columns = line.strip().split("\t")
        rrna_type = columns[8]
        mapped_type = TYPE_MAP[args.kingdom][rrna_type.lower()]

        columns[1] = args.source
        columns[2] = mapped_type
        columns[8] = "."

        rna_record = GFFRecord.parse("\t".join(columns))

        # The gene is a copy of the rRNA record with a different type.
        gene_record = deepcopy(rna_record)
        gene_record.type = "rRNA_gene"
        gene_record.add_child(rna_record)

        records.extend((gene_record, rna_record))

    gene_count = 0
    for record in GFF(records).traverse_children(sort=True):
        attr = record.attributes
        if attr is None:
            attr = GFFAttributes()
            record.attributes = attr

        # A non-gene record reuses the counter value of the most recently
        # seen gene, so a gene and the rRNA following it share a number.
        if record.type == "rRNA_gene":
            gene_count += 1
            attr.id = f"rRNA_gene{gene_count}"
        else:
            attr.id = f"rRNA{gene_count}"

        attr.parent = [
            parent.attributes.id
            for parent in record.parents
            if parent.attributes is not None
            and parent.attributes.id is not None
        ]

        print(record, file=args.outfile)
def shift_gff(gff: GFF, seqid: str, start: int) -> GFF:
    """Return a copy of `gff` re-based onto a sub-sequence.

    Every feature is deep-copied, its ``start`` and ``end`` are reduced by
    `start`, and its ``seqid`` is replaced by `seqid`. Where attributes are
    present, the ID (if any) and every Parent ID are prefixed with
    ``"{seqid}_"``.

    Args:
        gff: The features to shift; iterated via
            ``traverse_children(sort=True)``.
        seqid: New sequence ID, also used as the ID prefix.
        start: Offset subtracted from every feature's coordinates.

    Returns:
        A new GFF built from the shifted copies.
    """

    def with_prefix(identifier):
        return seqid + "_" + identifier

    shifted = []
    for feature in gff.traverse_children(sort=True):
        # NOTE(review): deepcopy copies everything the feature references;
        # if records hold links to parents/children those are presumably
        # copied too and not shifted. Confirm how GFF() treats such links.
        moved = deepcopy(feature)
        moved.start -= start
        moved.end -= start
        moved.seqid = seqid

        attributes = moved.attributes
        if attributes is not None:
            if attributes.id is not None:
                attributes.id = with_prefix(attributes.id)

            attributes.parent = [
                with_prefix(parent_id) for parent_id in attributes.parent
            ]

        shifted.append(moved)

    return GFF(shifted)
def main():
    """Rewrite mRNA features with flanks as helitron repeat-region records.

    Reads GFF lines from ``args.infile`` (lines starting with ``#`` are
    skipped). Each record's ID is replaced by its Name, and its Name and
    custom attributes are cleared. After missing parents are inferred, every
    ``mRNA`` feature with at least two children is converted into:

    * a ``repeat_region`` record (a copy of the mRNA),
    * the mRNA itself, retyped as ``helitron`` with the region as parent,
    * the child whose ID ends in ``.3`` as ``three_prime_flanking_region``,
    * the child whose ID ends in ``.5.1`` as ``five_prime_flanking_region``.

    The four records are printed to ``args.outfile``; the 3' flank is
    printed before the 5' flank on the minus strand, otherwise after it.

    Raises:
        IndexError: If a selected mRNA has no child whose ID ends in ``.3``
            or ``.5.1``.
    """
    args = cli(sys.argv[0], sys.argv[1:])

    rows = list()
    for line in args.infile:
        if line.startswith("#"):
            continue

        record = GFFRecord.parse(line)
        # The Name value becomes the ID; all other custom attributes go.
        record.attributes.id = record.attributes.name
        record.attributes.name = None
        record.attributes.custom = {}
        rows.append(record)

    gff = GFF(rows)
    gff.infer_missing_parents()

    counter = 1
    for mrna in gff.select_type("mRNA"):
        # Both flanks are needed, so anything with fewer children is skipped.
        if len(mrna.children) < 2:
            continue

        region = deepcopy(mrna)
        region.type = "repeat_region"
        region.attributes.id = f"repeat_region{counter}"
        # Must be a plain list of terms. A trailing comma here previously
        # turned the value into a 1-tuple wrapping the list.
        region.attributes.ontology_term = ["SO:0000657"]

        mrna.type = "helitron"
        mrna.parents = [region]
        mrna.attributes.parent = [region.attributes.id]
        mrna.attributes.id = f"helitron{counter}"
        mrna.attributes.ontology_term = ["SO:0000544", "SO:helitron"]
        mrna.attributes.custom = {}

        # The flanks are recognised by the suffix of their ID.
        flank3 = [
            c for c in mrna.children
            if c.attributes.id.endswith(".3")
        ][0]
        flank5 = [
            c for c in mrna.children
            if c.attributes.id.endswith(".5.1")
        ][0]

        flank3.type = "three_prime_flanking_region"
        flank3.attributes.ontology_term = [
            "SO:0001417",
            "SO:three_prime_flanking_region",
            "SO:0000364",
            "SO:transposable_element_flanking_region"
        ]
        flank3.attributes.id = None
        flank3.attributes.parent = [mrna.attributes.id]

        flank5.type = "five_prime_flanking_region"
        flank5.attributes.ontology_term = [
            "SO:0001416",
            "SO:five_prime_flanking_region",
            "SO:0000364",
            "SO:transposable_element_flanking_region"
        ]
        flank5.attributes.id = None
        flank5.attributes.parent = [mrna.attributes.id]

        # The helitron takes its source from the 5' flank.
        mrna.source = flank5.source

        print(region, file=args.outfile)
        print(mrna, file=args.outfile)
        if mrna.strand == Strand.MINUS:
            print(flank3, file=args.outfile)
            print(flank5, file=args.outfile)
        else:
            print(flank5, file=args.outfile)
            print(flank3, file=args.outfile)

        counter += 1

    return