def get_blocks(
    seqs: Dict[str, SeqRecord],
    genes: GFF,
    hints: Optional[GFF],
    pad: int,
    merge: bool,
    target_ids: Optional[Set[str]]
) -> Iterator[Tuple[str, SeqRecord, GFF, Optional[GFF]]]:
    """Yield one sub-sequence per distinct gene block on each sequence.

    Only features of type "gene" are considered. They are grouped by
    seqid, and each group is turned into (start, end) blocks which are
    handed to `make_new_subsequence` together with interval trees of the
    genes (and hints, if given) on that sequence.

    Args:
        seqs: Sequences keyed by seqid. Gene groups whose seqid is not a
            key of this mapping are skipped.
        genes: Annotation set that the "gene" features are selected from.
        hints: Optional extra annotations, grouped by seqid through
            `fmap` (presumably `fmap` leaves `None` untouched and applies
            the function otherwise -- confirm against its definition).
        pad: Passed straight to `merge_overlapping` / `pad_features`.
        merge: When true, blocks come from `merge_overlapping`; otherwise
            from `pad_features`.
        target_ids: When not None, blocks are computed only from the
            features returned by `subset_features_by_id`. The gene
            interval tree is still built from all genes on the sequence.

    Yields:
        The value returned by `make_new_subsequence` for every
        (start, end) pair not already seen on the current sequence.

    Raises:
        ValueError: Re-raised from building the hint interval tree, after
            the seqid and its hint records have been printed.
    """
    hints_by_seqid = fmap(groupby_seqid, hints)
    genes_by_seqid = groupby_seqid(genes.select_type("gene"))

    for seqid, gene_features in genes_by_seqid.items():
        if seqid not in seqs:
            continue

        gene_itree = IntervalTree(gffrecords_to_intervals(gene_features))

        try:
            seqid_hints = fmap(lambda h: h.get(seqid, []), hints_by_seqid)
            hint_intervals = fmap(gffrecords_to_intervals, seqid_hints)
            hint_itree = fmap(IntervalTree, hint_intervals)
        except ValueError as e:
            # Dump the offending records before propagating the error.
            if hints_by_seqid is not None:
                print(seqid, hints_by_seqid.get(seqid, []))
            raise e

        target_features: List[GFFRecord] = (
            gene_features
            if target_ids is None
            else subset_features_by_id(gene_features, target_ids)
        )

        block_iter = (
            merge_overlapping(target_features, pad)
            if merge
            else pad_features(target_features, pad)
        )

        # Identical coordinates are emitted only once per sequence.
        seen: Set[Tuple[int, int]] = set()
        for start, end in block_iter:
            coords = (start, end)
            if coords in seen:
                continue
            seen.add(coords)

            yield make_new_subsequence(
                seqid,
                start,
                end,
                gene_itree,
                hint_itree,
                gene_features,
                fmap(lambda h: h.get(seqid, []), hints_by_seqid),
                seqs[seqid],
            )
def split_gffs(gff: GFF, group_level: str) -> Tuple[GFF, GFF]:
    """Partition a GFF into kept and excluded parts.

    Every feature of type `group_level` is inspected. A feature counts as
    excluded when it has attributes and the key "should_exclude" is
    present in `attributes.custom`; a feature without attributes is kept.
    Each feature contributes the results of `gff.traverse_children` and
    `gff.traverse_parents` (called on that feature alone) to its side of
    the split.

    Args:
        gff: The annotation set to split.
        group_level: Feature type at which the keep/exclude decision is
            made.

    Returns:
        A pair `(kept, dropped)`, each the result of `prune_gff` on the
        corresponding record set.
    """
    keep: Set[GFF3Record] = set()
    drop: Set[GFF3Record] = set()

    for feature in gff.select_type(group_level):
        attrs = feature.attributes
        excluded = attrs is not None and "should_exclude" in attrs.custom

        bucket = drop if excluded else keep
        bucket.update(gff.traverse_children([feature]))
        bucket.update(gff.traverse_parents([feature]))

    return prune_gff(keep), prune_gff(drop)
def main():
    """Rewrite mRNA records from the input GFF as helitron annotations.

    Reads GFF lines from `args.infile` (lines starting with "#" are
    skipped). For every parsed record the ID is replaced by the Name, the
    Name is cleared, and custom attributes are dropped. After missing
    parents are inferred, each "mRNA" feature with at least two children
    is written to `args.outfile` as four records:

    * a "repeat_region" (a deep copy of the mRNA record),
    * the mRNA itself retyped as "helitron" with the region as parent,
    * the two child features whose original IDs end in ".5.1" and ".3",
      retyped as five- and three-prime flanking regions.

    The flanks are printed 5' then 3', or 3' then 5' on the minus strand.

    Raises:
        IndexError: If a qualifying mRNA has no child whose ID ends with
            ".3" or ".5.1".
    """
    args = cli(sys.argv[0], sys.argv[1:])

    rows = []
    for line in args.infile:
        if line.startswith("#"):
            continue

        record = GFFRecord.parse(line)
        # The input keeps its identifier in Name; promote it to ID and
        # discard everything else so the output attributes start clean.
        record.attributes.id = record.attributes.name
        record.attributes.name = None
        record.attributes.custom = {}
        rows.append(record)

    gff = GFF(rows)
    gff.infer_missing_parents()

    counter = 1
    for mrna in gff.select_type("mRNA"):
        if len(mrna.children) < 2:
            continue

        # The enclosing repeat region is a copy taken before the mRNA is
        # renamed, so it keeps the mRNA's original fields.
        region = deepcopy(mrna)
        region.type = "repeat_region"
        region.attributes.id = f"repeat_region{counter}"
        # Must be a plain list of terms like the other features below; a
        # trailing comma here would wrap the list in a 1-tuple.
        region.attributes.ontology_term = ["SO:0000657"]

        mrna.type = "helitron"
        mrna.parents = [region]
        mrna.attributes.parent = [region.attributes.id]
        mrna.attributes.id = f"helitron{counter}"
        mrna.attributes.ontology_term = ["SO:0000544", "SO:helitron"]
        mrna.attributes.custom = {}

        # Flanks are identified by the suffix of their original ID.
        flank3 = [c for c in mrna.children
                  if c.attributes.id.endswith(".3")][0]
        flank5 = [c for c in mrna.children
                  if c.attributes.id.endswith(".5.1")][0]

        flank3.type = "three_prime_flanking_region"
        flank3.attributes.ontology_term = [
            "SO:0001417",
            "SO:three_prime_flanking_region",
            "SO:0000364",
            "SO:transposable_element_flanking_region"
        ]
        flank3.attributes.id = None
        flank3.attributes.parent = [mrna.attributes.id]

        flank5.type = "five_prime_flanking_region"
        flank5.attributes.ontology_term = [
            "SO:0001416",
            "SO:five_prime_flanking_region",
            "SO:0000364",
            "SO:transposable_element_flanking_region"
        ]
        flank5.attributes.id = None
        flank5.attributes.parent = [mrna.attributes.id]

        mrna.source = flank5.source

        print(region, file=args.outfile)
        print(mrna, file=args.outfile)
        if mrna.strand == Strand.MINUS:
            print(flank3, file=args.outfile)
            print(flank5, file=args.outfile)
        else:
            print(flank5, file=args.outfile)
            print(flank3, file=args.outfile)

        counter += 1

    return