def hints(args: argparse.Namespace) -> None:
    """Print hint records for the features below each selected group.

    The GFF in ``args.infile`` is parsed, and for every record of type
    ``args.group_level`` each record yielded by
    ``gff.traverse_children([parent])`` is passed through
    ``transform_child``. Results that are not ``None`` are printed to
    ``args.outfile``.

    Args:
        args: Parsed command line arguments. Reads ``infile``,
            ``group_level``, ``source``, ``priority`` and ``outfile``
            directly; the three ``get_*_map`` helpers receive the whole
            namespace.

    Raises:
        GPMissingID: If the ID looked up for a selected record is ``None``.
    """
    hint_types = get_hints_map(args)
    trim_sizes = get_trim_map(args)
    priorities = get_priority_map(args)

    gff = GFF.parse(args.infile)

    for parent in gff.select_type(args.group_level):
        # NOTE(review): fmap presumably passes None through when the record
        # has no attributes -- confirm against its definition.
        group_name = fmap(lambda attrs: attrs.id, parent.attributes)

        if group_name is None:
            raise GPMissingID(
                "One of the selected records doesn't have an ID. "
                f"The offending line is {parent}.")

        for child in gff.traverse_children([parent]):
            hint = transform_child(
                child,
                group_name,
                hint_types,
                trim_sizes,
                priorities,
                args.source,
                args.priority,
            )

            # transform_child returns None for features that yield no hint.
            if hint is None:
                continue

            print(hint, file=args.outfile)
def add_parents(args: argparse.Namespace) -> None:
    """Give parentless mRNA records an inferred gene parent and print the GFF.

    After parsing ``args.infile`` and calling ``infer_missing_parents``,
    every ``mRNA`` record that has no parents but does have an ID gets a new
    ``gene`` record built with ``GFF3Record.infer_from_children``. The gene's
    ID is ``gene.<mRNA id>``. The full set of records is then printed to
    ``args.outfile`` after a ``##gff-version 3`` header.

    Args:
        args: Parsed command line arguments; reads ``infile`` and
            ``outfile``.
    """
    gff = GFF.parse(args.infile)
    gff.infer_missing_parents()

    for mrna in gff.select_type("mRNA"):
        attrs = mrna.attributes

        # Only parentless mRNAs with a usable ID get a synthetic gene.
        if len(mrna.parents) > 0 or attrs is None or attrs.id is None:
            continue

        gene = GFF3Record.infer_from_children(
            [mrna],
            id=f"gene.{attrs.id}",
            type="gene",
        )
        mrna.add_parent(gene)
        gff.add_record(gene)

    print("##gff-version 3", file=args.outfile)
    for record in gff.traverse_children(sort=True):
        print(record, file=args.outfile)
def write_gff(
    gff: GFF,
    outfile: TextIO,
) -> None:
    """Write a GFF3 version header followed by every record in ``gff``.

    Records are taken from ``gff.traverse_children(sort=True, breadth=True)``
    and printed one per line.

    Args:
        gff: The collection of records to write.
        outfile: Writable text handle that receives the output.
    """
    # GFF3 directives start with two hashes. A single "#" is only a comment,
    # so the version pragma would be ignored by downstream parsers.
    print("##gff-version 3", file=outfile)

    for feature in gff.traverse_children(sort=True, breadth=True):
        print(feature, file=outfile)
def make_new_subsequence(
    seqid: str,
    start: int,
    end: int,
    gene_itree: IntervalTree,
    hint_itree: Optional[IntervalTree],
    gene_features: Sequence[GFFRecord],
    hint_features: Optional[Sequence[GFFRecord]],
    seq: SeqRecord
) -> Tuple[str, SeqRecord, GFF, Optional[GFF]]:
    """Cut one window out of a sequence together with its genes and hints.

    The requested ``start``/``end`` window is widened so that every gene
    interval overlapping it is included with 10 extra positions on each
    side, and is then clamped to ``[0, len(seq)]``. Gene records (and hint
    records, when both ``hint_itree`` and ``hint_features`` are given) are
    re-based onto the new subsequence with ``shift_gff``.

    Args:
        seqid: Name of the source sequence; used to build the window name.
        start: Requested window start.
        end: Requested window end.
        gene_itree: Interval tree whose interval ``data`` values index into
            ``gene_features``.
        hint_itree: Interval tree whose interval ``data`` values index into
            ``hint_features``, or ``None``.
        gene_features: Gene records addressed by ``gene_itree``.
        hint_features: Hint records addressed by ``hint_itree``, or ``None``.
        seq: The full sequence record to extract from.

    Returns:
        A tuple of the window name (``"<seqid>:<start>-<end>"``), the
        extracted subsequence, the shifted genes, and the shifted hints (or
        ``None`` when hints are unavailable).

    Raises:
        ValueError: From ``min``/``max`` if no gene interval overlaps the
            requested window.
    """
    overlapping_genes = gene_itree[start:end]

    # Widen the window to cover whole genes plus a 10 position flank.
    leftmost = min(iv.begin for iv in overlapping_genes) - 10
    rightmost = max(iv.end for iv in overlapping_genes) + 10

    # Clamp to the bounds of the sequence.
    start = max(0, min(start, leftmost))
    end = min(len(seq), max(end, rightmost))

    # Only hints lying entirely inside the final window are kept.
    hint_intervals = None if hint_itree is None else [
        iv
        for iv in hint_itree[start:end]
        if iv.begin >= start and iv.end <= end
    ]

    name = f"{seqid}:{start}-{end}"

    subseq = FeatureLocation(start, end, 1).extract(seq)
    subseq.id = name
    subseq.name = name
    subseq.description = name

    subgenes = GFF([gene_features[iv.data] for iv in overlapping_genes])
    subgenes_shifted = shift_gff(subgenes, name, start)

    if hint_intervals is not None and hint_features is not None:
        subhints = GFF([hint_features[iv.data] for iv in hint_intervals])
        subhints_shifted = shift_gff(subhints, name, start)
    else:
        subhints_shifted = None

    return name, subseq, subgenes_shifted, subhints_shifted
def main():
    """Flag unreliable or excludable records, then write kept/dropped GFFs.

    For every record of type ``args.group_level``:

    * a record with an ``antifam_match`` custom attribute is tagged
      ``is_unreliable`` and ``should_exclude``;
    * a record with no supporting evidence above ``args.min_cov`` (or whose
      only support comes from sources in ``args.exclude``) is tagged
      ``is_unreliable``, and additionally ``should_exclude`` when
      ``is_novel_locus`` reports it is not a novel locus.

    When ``args.stats`` is set, one JSON object per record is written to it.
    The GFF is finally split with ``split_gffs``; kept records go to
    ``args.outfile`` and dropped ones to ``args.filtered`` if given.
    """
    args = cli(sys.argv[0], sys.argv[1:])
    gff = GFF.parse(args.infile)
    itree = gff_to_itree(gff.select_type(args.group_level))

    for mrna in gff.select_type(args.group_level):
        # Flags are stored on the attributes, so make sure they exist.
        if mrna.attributes is None:
            mrna.attributes = GFF3Attributes()
        attrs = mrna.attributes

        failed_antifam = "antifam_match" in attrs.custom
        if failed_antifam:
            attrs.custom["is_unreliable"] = "true"
            attrs.custom["should_exclude"] = "true"

        length, aligned = deal_with_kids(
            gff.traverse_children([mrna]),
            args.type,
            0,
            defaultdict(int)
        )
        coverages = find_coverages(aligned, length)

        supported = [
            source
            for source, cov in coverages.items()
            if cov > args.min_cov
        ]
        not_supported = (
            not supported
            or all(source in args.exclude for source in supported)
        )

        is_novel = is_novel_locus(mrna, itree, args.threshold)

        if not_supported:
            attrs.custom["is_unreliable"] = "true"
            # Unsupported records are only kept if they are a novel locus.
            if not is_novel:
                attrs.custom["should_exclude"] = "true"

        if args.stats:
            # The coverage mapping itself is extended with the summary
            # fields and then serialised.
            line = coverages
            line.update({
                "id": attrs.id,
                "length": length,
                "is_supported": not not_supported,
                "is_novel_locus": is_novel,
                "antifam_match": failed_antifam,
                "excluded": (
                    failed_antifam or (not_supported and not is_novel)
                ),
            })
            print(json.dumps(line), file=args.stats)

    kept, dropped = split_gffs(gff, args.group_level)

    write_gff(kept, args.outfile)
    if args.filtered is not None:
        write_gff(dropped, args.filtered)
def ncbi(args: argparse.Namespace) -> None:
    """Annotate a GFF with sequence ontology information.

    Parses ``args.infile``, calls ``break_bubbles`` on the result, loads the
    ontology named by ``args.so`` and then applies
    ``add_so_as_ontologies``, ``add_ncrna_types`` and
    ``add_pseudogene_types`` in that order.

    Args:
        args: Parsed command line arguments; reads ``infile`` and ``so``.
    """
    gff = GFF.parse(args.infile).break_bubbles()
    so = Ontology.from_obo_library(args.so)

    # Index ontology terms by their name for lookup by feature type.
    name_to_so = {}
    for term in so.values():
        name_to_so[term.name] = term

    add_so_as_ontologies(gff, name_to_so)
    add_ncrna_types(gff, name_to_so, so, NCRNA_TYPES)
    add_pseudogene_types(gff, name_to_so, so, PSEUDOGENE_TYPES)
    # NOTE(review): nothing is written out here; presumably the helpers
    # mutate ``gff`` in place and output is still to be added -- confirm.
def get_blocks(
    seqs: Dict[str, SeqRecord],
    genes: GFF,
    hints: Optional[GFF],
    pad: int,
    merge: bool,
    target_ids: Optional[Set[str]]
) -> Iterator[Tuple[str, SeqRecord, GFF, Optional[GFF]]]:
    """Yield one extracted subsequence per distinct block around genes.

    ``gene`` records are grouped by seqid; seqids absent from ``seqs`` are
    skipped. For each remaining seqid the target genes are turned into
    ``(start, end)`` blocks with ``merge_overlapping`` (when ``merge`` is
    true) or ``pad_features``, and every distinct block is passed to
    ``make_new_subsequence``.

    Args:
        seqs: Sequence records keyed by seqid.
        genes: GFF containing the gene records.
        hints: Optional GFF of hint records.
        pad: Padding forwarded to the block-building helper.
        merge: Whether to merge overlapping blocks rather than pad each
            feature individually.
        target_ids: If given, only genes selected by
            ``subset_features_by_id`` seed blocks.

    Yields:
        The ``(name, subsequence, genes, hints)`` tuple returned by
        ``make_new_subsequence`` for each block.

    Raises:
        ValueError: Re-raised from building the hint interval tree, after
            printing the seqid and its hint records.
    """
    hints_by_seqid = fmap(groupby_seqid, hints)
    genes_by_seqid = groupby_seqid(genes.select_type("gene"))

    for seqid, gene_features in genes_by_seqid.items():
        if seqid not in seqs:
            continue

        gene_itree = IntervalTree(gffrecords_to_intervals(gene_features))

        try:
            # Hint records for this seqid; stays None when no hints given.
            seq_hints = fmap(lambda h: h.get(seqid, []), hints_by_seqid)
            hint_intervals = fmap(gffrecords_to_intervals, seq_hints)
            hint_itree = fmap(IntervalTree, hint_intervals)
        except ValueError:
            # Show the offending records before propagating the error.
            if hints_by_seqid is not None:
                print(seqid, hints_by_seqid.get(seqid, []))
            raise

        target_features: List[GFFRecord]
        if target_ids is None:
            target_features = gene_features
        else:
            target_features = subset_features_by_id(gene_features, target_ids)

        block_iter = (
            merge_overlapping(target_features, pad)
            if merge
            else pad_features(target_features, pad)
        )

        # Each distinct (start, end) window is emitted only once.
        seen: Set[Tuple[int, int]] = set()
        for start, end in block_iter:
            window = (start, end)
            if window in seen:
                continue
            seen.add(window)

            yield make_new_subsequence(
                seqid,
                start,
                end,
                gene_itree,
                hint_itree,
                gene_features,
                seq_hints,
                seqs[seqid],
            )
def split_gffs(gff: GFF, group_level: str) -> Tuple[GFF, GFF]:
    """Split a GFF into kept and dropped sets by the ``should_exclude`` flag.

    Each record of type ``group_level`` is sent, along with everything
    yielded by ``traverse_children`` and ``traverse_parents`` for it, to the
    dropped set when its custom attributes contain ``should_exclude`` and to
    the kept set otherwise (including when it has no attributes). A record
    reachable from both kinds of group record ends up in both sets.

    Args:
        gff: The annotated GFF to split.
        group_level: Record type whose flag decides the split.

    Returns:
        ``(kept, dropped)``, each built with ``prune_gff``.
    """
    keep: Set[GFF3Record] = set()
    drop: Set[GFF3Record] = set()

    for mrna in gff.select_type(group_level):
        attrs = mrna.attributes
        excluded = attrs is not None and "should_exclude" in attrs.custom

        bucket = drop if excluded else keep
        bucket.update(gff.traverse_children([mrna]))
        bucket.update(gff.traverse_parents([mrna]))

    return prune_gff(keep), prune_gff(drop)
def shift_gff(gff: GFF, seqid: str, start: int) -> GFF:
    """Return copies of the records re-based onto a new sequence.

    Every record from ``gff.traverse_children(sort=True)`` is deep-copied,
    its ``start`` and ``end`` are reduced by ``start``, and its ``seqid`` is
    replaced. Where attributes exist, the ID (if set) and every parent ID
    are prefixed with ``"<seqid>_"``.

    Args:
        gff: Records to shift; the originals are left untouched.
        seqid: New sequence name, also used as the ID prefix.
        start: Offset subtracted from each coordinate.

    Returns:
        A new ``GFF`` holding the shifted copies.
    """
    prefix = seqid + "_"
    shifted = []

    for feature in gff.traverse_children(sort=True):
        record = deepcopy(feature)
        record.start -= start
        record.end -= start
        record.seqid = seqid

        attrs = record.attributes
        if attrs is not None:
            if attrs.id is not None:
                attrs.id = prefix + attrs.id
            # Parent references must follow the renamed IDs.
            attrs.parent = [prefix + parent for parent in attrs.parent]

        shifted.append(record)

    return GFF(shifted)
def expandcds(args: argparse.Namespace) -> None:
    """Adjust CDS boundaries, report codon checks, and print the GFF.

    For each parent of a record of type ``args.cds_type``, its CDS children
    are sorted by ``(start, end)`` and passed to ``bump_start`` /
    ``bump_end`` when ``args.start`` / ``args.stop`` are set. If a FASTA was
    supplied and contains the parent's seqid, any non-``None`` result of
    ``check_start`` / ``check_stop`` is printed to ``args.warnings``.
    Afterwards every ancestor of a CDS is expanded to its children and the
    GFF is printed to ``args.outfile`` after a ``##gff-version 3`` header.

    Args:
        args: Parsed command line arguments; reads ``infile``, ``infasta``,
            ``gencode``, ``cds_type``, ``start``, ``stop``, ``warnings`` and
            ``outfile``.
    """
    gff = GFF.parse(args.infile)

    seqs = (
        None
        if args.infasta is None
        else SeqIO.to_dict(SeqIO.parse(args.infasta, format="fasta"))
    )

    codon_table = CodonTable.unambiguous_dna_by_id[args.gencode]

    # Collect every distinct record that directly owns a CDS.
    cds_parents: Set[GFF3Record] = set()
    for cds in gff.select_type(args.cds_type):
        cds_parents.update(cast(GFF3Record, p) for p in cds.parents)

    for parent in cds_parents:
        cdss = [
            cast(GFF3Record, child)
            for child in parent.children
            if child.type == args.cds_type
        ]
        cdss.sort(key=lambda f: (f.start, f.end))

        strand = find_strand(cdss, parent)

        if args.start:
            bump_start(cdss, strand)

        if args.stop:
            bump_end(cdss, strand)

        # Codon checks need the sequence this parent lies on.
        if seqs is None or parent.seqid not in seqs:
            continue

        start_codon = check_start(cdss, parent, strand, seqs, codon_table)
        if start_codon is not None:
            print(start_codon, file=args.warnings)

        stop_codon = check_stop(cdss, parent, strand, seqs, codon_table)
        if stop_codon is not None:
            print(stop_codon, file=args.warnings)

    # CDS coordinates may have changed, so grow the ancestors to match.
    child_cdss: Sequence[GFF3Record] = list(gff.select_type(args.cds_type))
    for ancestor in gff.traverse_parents(child_cdss):
        ancestor.expand_to_children()

    print("##gff-version 3", file=args.outfile)
    for record in gff.traverse_children(sort=True):
        print(record, file=args.outfile)
def run_extract(args) -> None:
    """Extract gene-centred blocks and write FASTA, gene GFF and hint GFF.

    Blocks come from ``get_blocks``. For each one the sequence is written to
    ``args.outfasta`` and a ``##sequence-region`` line plus the genes are
    printed to ``args.outgff``. When ``args.outhints`` is set and the block
    has at least one hint, the same region line and the hints are printed to
    ``args.outhints``. Both GFF outputs begin with ``##gff-version 3``.

    Args:
        args: Parsed command line arguments; reads ``good``, ``gff``,
            ``fasta``, ``hints``, ``pad``, ``merge``, ``outgff``,
            ``outhints`` and ``outfasta``.
    """
    good_ids: Optional[Set[str]] = (
        get_good_ids(args.good) if args.good is not None else None
    )

    in_gff = GFF.parse(args.gff)
    in_fasta = SeqIO.to_dict(SeqIO.parse(args.fasta, format="fasta"))
    in_hints = GFF.parse(args.hints) if args.hints is not None else None

    block_iter = get_blocks(
        in_fasta,
        in_gff,
        in_hints,
        args.pad,
        args.merge,
        good_ids,
    )

    print("##gff-version 3", file=args.outgff)
    if args.outhints is not None:
        print("##gff-version 3", file=args.outhints)

    for name, seq, genes, hints in block_iter:
        SeqIO.write(seq, args.outfasta, format="fasta")

        region_line = f"##sequence-region {name} 1 {len(seq)}"
        print(region_line, file=args.outgff)
        print(genes, file=args.outgff)

        # Skip the hints output when there is nowhere or nothing to write.
        if args.outhints is None or hints is None:
            continue
        if len(hints.inner) == 0:
            continue

        print(region_line, file=args.outhints)
        print(hints, file=args.outhints)
def trnascan2gff(args: argparse.Namespace) -> None:
    """Convert tRNAscan results into GFF3 records and print them.

    Each match read from ``args.txt`` is paired with the secondary structure
    record from ``args.ss`` that has the same ``seqid`` and ``num``. A gene
    record (typed ``pseudogene`` when the match note contains ``"pseudo"``,
    otherwise ``tRNA_gene``), a tRNA record, intron records and an anticodon
    record are built per match, and all are printed to ``args.outfile``.

    Args:
        args: Parsed command line arguments; reads ``txt``, ``ss``,
            ``source`` and ``outfile``.

    Raises:
        KeyError: If a match has no secondary structure record with the same
            ``seqid`` and ``num``.
    """
    records: List[GFF3Record] = []

    matches = TRNAScanRecord.from_file(args.txt)
    sses = TRNAScanSS.from_file(args.ss)
    ss_by_key = {f"{ss.seqid}.{ss.num}": ss for ss in sses}

    for match in matches:
        ss = ss_by_key[f"{match.seqid}.{match.num}"]

        is_pseudo = match.note is not None and "pseudo" in match.note
        type_ = "pseudogene" if is_pseudo else "tRNA_gene"

        gene = match_to_gene(match, args.source, type=type_)

        trna = match_to_trna(
            match,
            ss,
            args.source,
            type_map=TYPE_MAP,
            parents=[gene]
        )

        introns = match_to_introns(
            match,
            args.source,
            type="tRNA_intron",
            parents=[trna]
        )

        anticodon = match_to_anticodon(
            match,
            ss,
            args.source,
            type="anticodon",
            parents=[trna]
        )

        records.extend([gene, trna, *introns, anticodon])

    for record in GFF(records).traverse_children(sort=True):
        print(record, file=args.outfile)
def prune_gff(records: Set[GFF3Record]) -> GFF:
    """Build a self-contained GFF from a subset of linked records.

    Every record is deep-copied with its parent/child links cleared. The
    links are then rebuilt between the copies, but only for relatives that
    are themselves members of ``records``. Finally each copy's
    ``attributes.parent`` list is rewritten from the IDs of its rebuilt
    parents.

    Args:
        records: The records to keep.

    Returns:
        A new ``GFF`` containing the relinked copies.

    Raises:
        AssertionError: If a rebuilt parent lacks attributes or an ID, or if
            a copy without attributes ended up with parents or children.
    """
    # Map each original record to its copy, so relatives can be looked up
    # by the original objects.
    copies: Dict[GFF3Record, GFF3Record] = dict()
    for original in records:
        clone = deepcopy(original)
        clone.children = []
        clone.parents = []
        copies[original] = clone

    for original in records:
        clone = copies[original]

        # Relatives outside the selected set are left out.
        for parent in original.parents:
            if parent in records:
                clone.add_parent(copies[cast(GFF3Record, parent)])

        for child in original.children:
            if child in records:
                clone.add_child(copies[cast(GFF3Record, child)])

        # Make the parent IDs reflect the new, reduced set of links.
        if clone.attributes is None:
            # Without attributes a record cannot take part in parent/child
            # relationships, so it must have ended up unlinked.
            assert len(clone.children) == 0
            assert len(clone.parents) == 0
            continue

        clone.attributes.parent = []
        for linked_parent in clone.parents:
            # A parent always needs an ID to be referenced.
            assert linked_parent.attributes is not None
            assert linked_parent.attributes.id is not None
            clone.attributes.parent.append(linked_parent.attributes.id)

    return GFF(list(copies.values()))
def rnammer2gff(args: argparse.Namespace) -> None:
    """Convert RNAmmer GFF lines into rRNA gene/rRNA record pairs.

    For every non-comment line of ``args.infile`` the source column is set
    to ``args.source``, the type column is looked up in
    ``TYPE_MAP[args.kingdom]`` using the lower-cased ninth column, and the
    ninth column is reset to ``"."``. The parsed record is paired with a
    deep copy typed ``rRNA_gene`` that is made its parent. Records are then
    numbered in traversal order and printed to ``args.outfile``.

    Args:
        args: Parsed command line arguments; reads ``infile``, ``kingdom``,
            ``source`` and ``outfile``.
    """
    records: List[GFFRecord] = []

    for line in args.infile:
        if line.startswith("#"):
            continue

        columns = line.strip().split("\t")

        # The ninth column holds the rRNA type name in the input.
        rrna_type = columns[8]
        columns[1] = args.source
        columns[2] = TYPE_MAP[args.kingdom][rrna_type.lower()]
        columns[8] = "."

        rna_record = GFFRecord.parse("\t".join(columns))

        gene_record = deepcopy(rna_record)
        gene_record.type = "rRNA_gene"
        gene_record.add_child(rna_record)

        records.extend([gene_record, rna_record])

    num = 0
    for record in GFF(records).traverse_children(sort=True):
        if record.attributes is None:
            record.attributes = GFFAttributes()
        attr = record.attributes

        if record.type == "rRNA_gene":
            # Each gene starts a new number, which the following rRNA
            # records reuse.
            num += 1
            attr.id = f"rRNA_gene{num}"
        else:
            attr.id = f"rRNA{num}"
            attr.parent = [
                parent.attributes.id
                for parent in record.parents
                if (parent.attributes is not None
                    and parent.attributes.id is not None)
            ]

        print(record, file=args.outfile)
def select(args: argparse.Namespace) -> None:
    """Print the records with the requested IDs plus their relatives.

    IDs are read one per line from ``args.ids``. Every record in
    ``args.infile`` whose ID is listed is kept together with everything
    yielded by its ``traverse_parents`` and ``traverse_children``. The kept
    set is relinked with ``prune_gff`` and printed to ``args.outfile`` after
    a ``##gff-version 3`` header.

    Args:
        args: Parsed command line arguments; reads ``infile``, ``ids`` and
            ``outfile``.
    """
    gff = GFF.parse(args.infile)
    ids = {line.strip() for line in args.ids}

    to_keep: Set[GFF3Record] = set()

    for record in gff:
        if record.attributes is not None and record.attributes.id in ids:
            to_keep.update(
                cast(Iterator[GFF3Record], record.traverse_parents()))
            to_keep.update(
                cast(Iterator[GFF3Record], record.traverse_children()))

    pruned = prune_gff(to_keep)

    # GFF3 directives start with two hashes. A single "#" is only a comment,
    # so the version pragma would be ignored by downstream parsers.
    print("##gff-version 3", file=args.outfile)

    for feature in pruned.traverse_children(sort=True):
        print(feature, file=args.outfile)
def main():
    """Retype each record as a numbered region annotated with base counts.

    Every record in ``args.ingff`` gets type ``args.type``, source
    ``args.source``, an unstranded strand and the ID ``<type><n>`` with ``n``
    counting from 1. The matching sequence is extracted from
    ``args.infasta``, and the result of ``count_frequencies`` on it replaces
    the record's custom attributes. Records are printed to ``args.outfile``.
    """
    args = cli(sys.argv[0], sys.argv[1:])
    gff = GFF.parse(args.ingff)
    seqs = SeqIO.to_dict(SeqIO.parse(args.infasta, format="fasta"))

    for counter, region in enumerate(gff, start=1):
        region.type = args.type
        region.source = args.source
        region.strand = Strand.UNSTRANDED
        # NOTE(review): assumes every record has attributes -- a record
        # without them would fail here; confirm against the parser.
        region.attributes.id = f"{args.type}{counter}"

        feature = gff_to_seqfeature(region)
        region_seq = feature.extract(seqs[region.seqid])
        region.attributes.custom = count_frequencies(region_seq)

        print(region, file=args.outfile)
def main():
    """Convert mRNA-style helitron predictions into GFF3 repeat annotations.

    Each non-comment line of ``args.infile`` is parsed; its ``Name`` becomes
    its ID and its custom attributes are cleared. After
    ``infer_missing_parents``, every ``mRNA`` record with at least two
    children is rewritten as:

    * a ``repeat_region`` record (a deep copy of the mRNA),
    * the mRNA itself retyped as ``helitron`` with the region as parent,
    * the child whose ID ends in ``.3`` as a ``three_prime_flanking_region``,
    * the child whose ID ends in ``.5.1`` as a
      ``five_prime_flanking_region``.

    The four records are printed to ``args.outfile``, with the flanks in
    3'/5' order on the minus strand and 5'/3' order otherwise.

    Raises:
        IndexError: If a selected mRNA has no child whose ID ends in ``.3``
            or ``.5.1``.
    """
    args = cli(sys.argv[0], sys.argv[1:])

    rows = list()
    for line in args.infile:
        if line.startswith("#"):
            continue

        record = GFFRecord.parse(line)

        # The input carries its identifier in Name; move it to ID.
        record.attributes.id = record.attributes.name
        record.attributes.name = None
        record.attributes.custom = {}
        rows.append(record)

    gff = GFF(rows)
    gff.infer_missing_parents()

    counter = 1
    for mrna in gff.select_type("mRNA"):
        # Both flanks are needed to describe a helitron.
        if len(mrna.children) < 2:
            continue

        region = deepcopy(mrna)
        region.type = "repeat_region"
        region.attributes.id = f"repeat_region{counter}"
        # This must be a plain list like the other ontology_term values.
        # A trailing comma here previously made it a tuple wrapping a list.
        region.attributes.ontology_term = ["SO:0000657"]

        mrna.type = "helitron"
        mrna.parents = [region]
        mrna.attributes.parent = [region.attributes.id]
        mrna.attributes.id = f"helitron{counter}"
        mrna.attributes.ontology_term = ["SO:0000544", "SO:helitron"]
        mrna.attributes.custom = {}

        # The flanks are recognised by the suffix of their original IDs.
        flank3 = [c for c in mrna.children
                  if c.attributes.id.endswith(".3")][0]
        flank5 = [c for c in mrna.children
                  if c.attributes.id.endswith(".5.1")][0]

        flank3.type = "three_prime_flanking_region"
        flank3.attributes.ontology_term = [
            "SO:0001417",
            "SO:three_prime_flanking_region",
            "SO:0000364",
            "SO:transposable_element_flanking_region"
        ]
        flank3.attributes.id = None
        flank3.attributes.parent = [mrna.attributes.id]

        flank5.type = "five_prime_flanking_region"
        flank5.attributes.ontology_term = [
            "SO:0001416",
            "SO:five_prime_flanking_region",
            "SO:0000364",
            "SO:transposable_element_flanking_region"
        ]
        flank5.attributes.id = None
        flank5.attributes.parent = [mrna.attributes.id]

        mrna.source = flank5.source

        print(region, file=args.outfile)
        print(mrna, file=args.outfile)

        # Flank order is swapped on the minus strand.
        if mrna.strand == Strand.MINUS:
            print(flank3, file=args.outfile)
            print(flank5, file=args.outfile)
        else:
            print(flank5, file=args.outfile)
            print(flank3, file=args.outfile)

        counter += 1