def transform_child(
    feature: GFFRecord,
    group_name: str,
    gff_to_hints: Dict[str, str],
    type_to_trim: Dict[str, int],
    type_to_priority: Dict[str, int],
    source: str,
    priority: int,
) -> Optional[GFFRecord]:
    """Turn a regular feature into a hint record, or None if it has no hint.

    A shallow copy of ``feature`` is modified and returned; the name
    ``feature`` is never rebound to anything else.

    Args:
        feature: The record to convert.
        group_name: Stored as the ``group`` custom attribute.
        gff_to_hints: Maps feature types to hint types.
        type_to_trim: Amount passed to ``trim_ends`` per hint type
            (0 when the hint type is absent).
        type_to_priority: Priority boost per hint type. Looked up with
            ``[]``, so a missing hint type raises ``KeyError``.
        source: Stored as the ``source`` custom attribute.
        priority: Base priority; the boost is added to it.

    Returns:
        The converted copy, or None when no hint type could be found.
    """
    hint = copy(feature)

    # Types that are already keys of gff_to_hints are used as they are;
    # anything else is first translated through GFF_TYPE_MAP.
    if hint.type in gff_to_hints:
        lookup_type = hint.type
    else:
        lookup_type = GFF_TYPE_MAP.get(hint.type, None)

    # NOTE(review): `applicative` is defined elsewhere; presumably it only
    # calls the function when lookup_type is not None -- confirm.
    hint_type: Optional[str] = applicative(
        lambda t: gff_to_hints.get(t, None),
        lookup_type,
    )
    if hint_type is None:
        return None

    hint.type = hint_type
    hint.trim_ends(type_to_trim.get(hint.type, 0))

    boosted_priority = priority + type_to_priority[hint.type]
    hint.attributes = GFFAttributes(
        custom={
            "source": source,
            "group": group_name,
            "priority": str(boosted_priority),
        }
    )
    return hint
def main():
    """Rewrite each input GFF record with standard attributes and print it.

    Comment lines (starting with ``#``) are skipped. For every other line the
    custom attributes of the parsed record (``mdlaccn``, ``mdlfrom``,
    ``mdlto``, ``desc``, ``evalue``, ``mdl``, ``gc``, ``bias``, ``trunc``,
    ``olp`` and optionally ``clan``) are turned into Name / Dbxref / Target /
    Note / Ontology_term attributes plus a reduced set of custom attributes.
    The record's source and type come from the command line, and its score
    becomes the e-value.
    """
    args = cli(sys.argv[0], sys.argv[1:])

    # Optional accession -> ontology terms table.
    go = {} if args.go is None else parse_rfam2go(args.go)

    for line in args.infile:
        if line.startswith("#"):
            continue

        record = GFFRecord.parse(line)
        attrs = record.attributes

        # With args.best set, records whose "olp" attribute is "=" are dropped.
        if args.best and attrs.custom["olp"] == "=":
            continue

        name = record.type

        dbxrefs = ["Rfam:" + attrs.custom["mdlaccn"], "Rfam:" + name]
        if "clan" in attrs.custom:
            dbxrefs.append("RfamClan:" + attrs.custom["clan"])

        ontology_terms = go.get(attrs.custom["mdlaccn"], [])
        notes = [attrs.custom["desc"]]

        target = Target(
            attrs.custom["mdlaccn"],
            int(attrs.custom["mdlfrom"]),
            int(attrs.custom["mdlto"]),
        )

        custom = {
            "evalue": attrs.custom["evalue"],
            "model_type": attrs.custom["mdl"],
            "gc": attrs.custom["gc"],
            "bias": attrs.custom["bias"],
            "bitscore": record.score,
        }

        if attrs.custom["trunc"] == "yes":
            custom["truncated_match"] = "true"

        if attrs.custom["olp"] == "=":
            custom["overlap_with_better_score"] = "true"

        record.source = args.source
        record.type = args.type
        # The original score is kept as "bitscore"; the score column now
        # carries the e-value.
        record.score = float(custom["evalue"])
        record.attributes = GFFAttributes(
            name=name,
            dbxref=dbxrefs,
            target=target,
            note=notes,
            ontology_term=ontology_terms,
            custom=custom,
        )

        print(record, file=args.outfile)
def decode_gff(infiles, outfile, map_, column):
    """Translate one column of GFF records through ``map_`` and write them.

    Comment lines (starting with ``#``) are passed through stripped. Output
    is buffered and written in chunks; every chunk is terminated with a
    newline so consecutive chunks never run together on one line.

    Args:
        infiles: Inputs handed to ``join_files``.
        outfile: Writable text handle for the translated GFF.
        map_: Lookup passed as the second argument of the replace function.
        column: One of ``"id"``, ``"name"`` or ``"seqid"``.

    Raises:
        ValueError: If ``column`` is not one of the supported values. This
            is checked before any input is touched.
    """
    # Pick the translation function first so that a bad column fails
    # before join_files is called.
    if column == "id":
        trans_function = replace_gff_id
    elif column == "name":
        trans_function = replace_gff_name
    elif column == "seqid":
        trans_function = replace_gff_seqid
    else:
        raise ValueError("This shouldn't ever happen")

    inhandles = join_files(infiles, header=False)

    record_chunk = []
    for i, line in enumerate(inhandles):
        if line.startswith("#"):
            record_chunk.append(line.strip())
            continue

        old_record = GFFRecord.parse(line)
        new_record = trans_function(old_record, map_)
        record_chunk.append(str(new_record))

        if i % 10000 == 0:
            # The trailing newline keeps the last line of this chunk
            # separate from the first line of the next one.
            outfile.write("\n".join(record_chunk))
            outfile.write("\n")
            record_chunk = []

    if record_chunk:
        outfile.write("\n".join(record_chunk))
        outfile.write("\n")

    return
def match_to_anticodon(
    match: TRNAScanRecord,
    ss: TRNAScanSS,
    source: str,
    type: str = "anticodon",
    parents: Sequence[GFFRecord] = []
) -> GFFRecord:
    """Build the GFF record for the anticodon of a tRNA match.

    Coordinates come from ``ss.anticodon_start`` / ``ss.anticodon_end`` after
    being passed through ``fix_strand``. The ID is
    ``<seqid>.<type><match.num>`` and the Parent attribute lists the IDs of
    all ``parents`` that have one.

    NOTE(review): ``parents`` defaults to a shared list and is handed to
    ``GFFRecord`` as-is -- confirm that ``GFFRecord`` does not mutate it.
    """
    start, end, strand = fix_strand(ss.anticodon_start, ss.anticodon_end)

    # Parents without attributes or without an ID cannot be referenced.
    parent_ids = []
    for parent in parents:
        if parent.attributes is None or parent.attributes.id is None:
            continue
        parent_ids.append(parent.attributes.id)

    attributes = GFFAttributes(
        id=f"{match.seqid}.{type}{match.num}",
        parent=parent_ids,
    )

    return GFFRecord(
        seqid=match.seqid,
        source=source,
        type=type,
        start=start,
        end=end,
        score=match.infernal_score,
        strand=strand,
        phase=Phase.NOT_CDS,
        attributes=attributes,
        parents=parents
    )
def match_to_introns(
    match: TRNAScanRecord,
    source: str,
    type: str = "tRNA_intron",
    parents: Sequence[GFFRecord] = [],
) -> List[GFFRecord]:
    """Build one GFF record per intron of a tRNA match.

    ``match.intron_starts`` and ``match.intron_ends`` are paired up with
    ``zip`` and each pair is passed through ``fix_strand``. Every intron
    record gets the same ID, ``<seqid>.<type><match.num>``, and a Parent
    attribute listing the IDs of all ``parents`` that have one.

    NOTE(review): ``parents`` defaults to a shared list and is handed to
    every ``GFFRecord`` as-is -- confirm that ``GFFRecord`` does not mutate
    it.
    """
    # Parents without attributes or without an ID cannot be referenced.
    parent_ids = []
    for parent in parents:
        if parent.attributes is None or parent.attributes.id is None:
            continue
        parent_ids.append(parent.attributes.id)

    introns = []
    for intron_start, intron_end in zip(match.intron_starts, match.intron_ends):
        start, end, strand = fix_strand(intron_start, intron_end)
        attributes = GFFAttributes(
            id=f"{match.seqid}.{type}{match.num}",
            parent=parent_ids,
        )
        introns.append(
            GFFRecord(
                seqid=match.seqid,
                source=source,
                type=type,
                start=start,
                end=end,
                score=match.infernal_score,
                strand=strand,
                phase=Phase.NOT_CDS,
                attributes=attributes,
                parents=parents
            )
        )

    return introns
def dict_to_record(row, source="gffpal", ext_attributes=None):
    """Convert one tabular search hit into a GFF record.

    Args:
        row: Mapping with the hit columns. The sequence ID is read from
            ``"#target"`` or, failing that, ``"target"``; coordinates from
            ``"tstart"`` / ``"tend"``.
        source: Value for the GFF source column.
        ext_attributes: Optional mapping from query name to a mapping with
            the keys ``Name``, ``Alias``, ``Dbxref``, ``Ontology_term`` and
            ``Note``, copied onto the record's attributes.

    Returns:
        A ``nucleotide_to_protein_match`` record whose score is the e-value.

    Raises:
        ValueError: If ``row`` has neither a ``#target`` nor a ``target``
            column.
    """
    if "#target" in row:
        seqid = row["#target"]
    elif "target" in row:
        seqid = row["target"]
    else:
        raise ValueError("Input file doesn't have a 'target' column.")

    start = int(row["tstart"])
    end = int(row["tend"])

    # Reversed coordinates denote a minus-strand hit; store them ascending.
    if start > end:
        start, end = end, start
        strand = Strand.MINUS
    else:
        strand = Strand.PLUS

    custom_attrs = {
        "pident": row["pident"],
        "alnlen": row["alnlen"],
        "score": row["raw"],
        "bitscore": row["bits"],
        "gapopen": row["gapopen"],
        "query_coverage": row["qcov"],
        "evalue": row["evalue"],
    }

    target = Target(row["query"], int(row["qstart"]), int(row["qend"]))
    gap = parse_cigar(row["cigar"])

    if ext_attributes is not None:
        extra = ext_attributes[row["query"]]
        attributes = GFFAttributes(
            target=target,
            custom=custom_attrs,
            gap=gap,
            name=extra["Name"],
            alias=extra["Alias"],
            dbxref=extra["Dbxref"],
            ontology_term=extra["Ontology_term"],
            note=extra["Note"],
        )
    else:
        attributes = GFFAttributes(target=target, custom=custom_attrs, gap=gap)

    return GFFRecord(
        seqid=seqid,
        source=source,
        type="nucleotide_to_protein_match",
        start=start,
        end=end,
        score=float(row["evalue"]),
        strand=strand,
        attributes=attributes,
    )
def encode_gff(
    infiles,
    outfile,
    mapfile,
    column,
    id_conv,
):
    """Replace one column of GFF records with generated IDs.

    Comment lines (starting with ``#``) are passed through stripped. Each
    record is rewritten by the replace function chosen by ``column``, which
    receives the record, the ``seen`` dict, the list of pending map entries
    and a callable that draws the next value from ``id_conv``. Rewritten
    records go to ``outfile``; map entries are written to ``mapfile`` as
    two tab-separated columns. Both are flushed in chunks.

    Raises:
        ValueError: If ``column`` is not ``"id"``, ``"name"`` or ``"seqid"``.
    """
    inhandles = join_files(infiles, header=False)
    seen = dict()

    if column == "id":
        trans_function = replace_gff_id
    elif column == "name":
        trans_function = replace_gff_name
    elif column == "seqid":
        trans_function = replace_gff_seqid
    else:
        raise ValueError("This shouldn't ever happen")

    def next_id(x):
        # The argument is ignored; every call just draws from id_conv.
        return next(id_conv)

    pending_ids = []
    pending_records = []

    for index, line in enumerate(inhandles):
        if line.startswith("#"):
            pending_records.append(line.strip())
            continue

        converted = trans_function(
            GFFRecord.parse(line),
            seen,
            pending_ids,
            next_id,
        )
        pending_records.append(str(converted))

        if index % 10000 != 0:
            continue

        # Periodic flush of both buffers.
        if pending_records:
            outfile.write("\n".join(pending_records))
            outfile.write("\n")
            pending_records = []

        if pending_ids:
            mapfile.write("".join(f"{n}\t{o}\n" for n, o in pending_ids))
            pending_ids = []

    # Write whatever is left after the last periodic flush.
    if pending_records:
        outfile.write("\n".join(pending_records))
        outfile.write("\n")

    if pending_ids:
        mapfile.write("".join(f"{n}\t{o}\n" for n, o in pending_ids))

    return
def match_to_trna(
    match: TRNAScanRecord,
    ss: TRNAScanSS,
    source: str,
    type_map: Mapping[str, str] = TYPE_MAP,
    parents: Sequence[GFFRecord] = []
) -> GFFRecord:
    """Build the tRNA GFF record for a match.

    The feature type is looked up in ``type_map`` by the lower-cased
    ``match.trna_type`` and falls back to ``"tRNA"``. The secondary
    structure, anticodon and amino acid are stored as custom attributes,
    and a non-empty ``match.note`` becomes the Note attribute.

    NOTE(review): ``parents`` defaults to a shared list and is handed to
    ``GFFRecord`` as-is -- confirm that ``GFFRecord`` does not mutate it.
    """
    start, end, strand = fix_strand(match.start, match.end)

    # Parents without attributes or without an ID cannot be referenced.
    parent_ids = []
    for parent in parents:
        if parent.attributes is None or parent.attributes.id is None:
            continue
        parent_ids.append(parent.attributes.id)

    notes: List[str]
    if match.note is not None and match.note != "":
        notes = [match.note]
    else:
        notes = []

    feature_type = type_map.get(match.trna_type.lower(), "tRNA")

    attributes = GFFAttributes(
        id=f"{match.seqid}.tRNA{match.num}",
        parent=parent_ids,
        note=notes,
        custom={
            "secondary_structure": ss.ss,
            "anticodon": match.anticodon,
            "amino_acid": match.trna_type,
        }
    )

    return GFFRecord(
        seqid=match.seqid,
        source=source,
        type=feature_type,
        start=start,
        end=end,
        score=match.infernal_score,
        strand=strand,
        phase=Phase.NOT_CDS,
        attributes=attributes,
        parents=parents
    )
def rnammer2gff(args: argparse.Namespace) -> None:
    """Convert tab-separated rRNA predictions into gene/rRNA GFF records.

    For every non-comment input line, column 9 holds the rRNA type; it is
    translated through ``TYPE_MAP[args.kingdom]`` into the feature type,
    the source column is replaced by ``args.source`` and column 9 is
    blanked. Each resulting record gets an ``rRNA_gene`` parent that is a
    deep copy of it. Records are then numbered and printed to
    ``args.outfile``.
    """
    records: List[GFFRecord] = []

    for line in args.infile:
        if line.startswith("#"):
            continue

        columns = line.strip().split("\t")
        rrna_type = columns[8]
        new_type = TYPE_MAP[args.kingdom][rrna_type.lower()]

        columns[1] = args.source
        columns[2] = new_type
        columns[8] = "."

        rna_record = GFFRecord.parse("\t".join(columns))

        gene_record = deepcopy(rna_record)
        gene_record.type = "rRNA_gene"
        gene_record.add_child(rna_record)

        records.extend((gene_record, rna_record))

    gene_count = 0
    for record in GFF(records).traverse_children(sort=True):
        attrs = record.attributes
        if attrs is None:
            attrs = GFFAttributes()
            record.attributes = attrs

        # Non-gene records reuse the current counter, i.e. the number of the
        # most recently seen gene.
        if record.type == "rRNA_gene":
            gene_count += 1
            attrs.id = f"rRNA_gene{gene_count}"
        else:
            attrs.id = f"rRNA{gene_count}"

        parent_ids = []
        for parent in record.parents:
            if parent.attributes is None or parent.attributes.id is None:
                continue
            parent_ids.append(parent.attributes.id)
        attrs.parent = parent_ids

        print(record, file=args.outfile)

    return
def get_non_canon_stop_codon(
    seqid: str,
    start: int,
    end: int,
    strand: Strand,
    codon: str,
    parent_id: Optional[str],
) -> GFFRecord:
    """Build a ``stop_codon`` record flagged as non-canonical.

    The codon is stored as the ``codon`` custom attribute; when
    ``parent_id`` is given it is stored as ``cds_parent``. The record has
    source ``"gffpal"`` and no score.
    """
    custom = {"codon": codon}
    if parent_id is not None:
        custom["cds_parent"] = parent_id

    attributes = GFFAttributes(
        ontology_term=["SO:0000319"],
        note=["Non-canonical stop codon"],
        custom=custom,
    )

    score = None
    return GFFRecord(
        seqid,
        "gffpal",
        "stop_codon",
        start,
        end,
        score,
        strand,
        Phase.NOT_CDS,
        attributes,
    )
def match_to_gene(
    match: TRNAScanRecord,
    source: str,
    type: str
) -> GFFRecord:
    """Build the gene-level GFF record for a tRNA match.

    Coordinates come from ``match.start`` / ``match.end`` after being passed
    through ``fix_strand``; the ID is ``<seqid>.<type><match.num>``.
    """
    start, end, strand = fix_strand(match.start, match.end)

    attributes = GFFAttributes(
        id=f"{match.seqid}.{type}{match.num}",
    )

    return GFFRecord(
        seqid=match.seqid,
        source=source,
        type=type,
        start=start,
        end=end,
        score=match.infernal_score,
        strand=strand,
        phase=Phase.NOT_CDS,
        attributes=attributes,
    )
def as_gff3(self, source="RepeatMasker"):
    """Convert this hit into a GFF3 record.

    The feature type and the ontology terms are derived from ``self.kind``
    (and, for simple repeats, from the repeat unit embedded in
    ``self.family`` as ``(UNIT)n``). Alignment statistics are stored as
    custom attributes, and the record's score is the percent divergence.

    Args:
        source: Value for the GFF source column.

    Returns:
        The GFF record for this hit.

    Raises:
        AttributeError: If ``self.kind`` is ``"Simple_repeat"`` but
            ``self.family`` does not start with the ``(UNIT)n`` pattern.
    """
    custom = {
        "smith_waterman_score": str(self.score),
        "percent_divergence": str(self.perc_divergence),
        "percent_deletions": str(self.perc_deletions),
        "percent_insertions": str(self.perc_insertions),
        "family_consensus_length": str(self.tend + self.tremaining),
    }

    ontology_terms = []

    if self.better_hit:
        custom["has_better_overlapping_hit"] = "true"

    kind = self.kind
    kind_lower = kind.lower()

    if kind == "Simple_repeat":
        repeat_unit = re.match(
            r"\((?P<rep>.+)\)n",
            self.family
        ).group("rep")
        custom["repeat_unit"] = repeat_unit

        # Classify by the length of the repeated unit.
        if len(repeat_unit) == 1:
            type_ = "monomeric_repeat"
            ontology_terms.extend(["SO:0001934", "SO:monomeric_repeat"])
        elif len(repeat_unit) < 10:
            type_ = "microsatellite"
            ontology_terms.extend(["SO:0000289", "SO:microsatellite"])
        else:
            type_ = "minisatellite"
            ontology_terms.extend(["SO:0000643", "SO:minisatellite"])

    elif kind == "Low_complexity":
        type_ = "low_complexity_region"
        ontology_terms.extend(["SO:0001005", "SO:low_complexity_region"])

    else:
        type_ = "repeat_region"
        ontology_terms.extend([
            "SO:0000657", "SO:repeat_region",
            "SO:0000347", "SO:nucleotide_match"
        ])
        custom["repeat_family"] = kind

        if kind == "Other/DNA_virus":
            ontology_terms.extend(["SO:0001041", "SO:viral_sequence"])
        elif kind.startswith("snRNA"):
            ontology_terms.extend(["SO:0001268", "SO:snRNA_gene"])
        elif kind.startswith("tRNA"):
            ontology_terms.extend(["SO:0001272", "SO:tRNA_gene"])
        elif kind.startswith("rRNA"):
            ontology_terms.extend(["SO:0001637", "SO:rRNA_gene"])
        elif kind.startswith("scRNA"):
            ontology_terms.extend(["SO:0001266", "SO:scRNA_gene"])
        elif kind.startswith("Segmental"):
            ontology_terms.extend(["SO:1000035", "SO:duplication"])
        elif kind.startswith("Satellite"):
            # Satellites get their own feature type and no repeat_family.
            type_ = "satellite_DNA"
            ontology_terms.extend(["SO:0000005", "SO:satellite_DNA"])
            del custom["repeat_family"]
        elif kind.startswith("Retrotransposon"):
            ontology_terms.extend(["SO:0000180", "SO:retrotransposon"])
        elif kind.startswith("DNA"):
            ontology_terms.extend(["SO:0000182", "SO:DNA_transposon"])
        elif kind.startswith("LTR"):
            ontology_terms.extend(["SO:0000186", "SO:LTR_retrotransposon"])
        elif kind.startswith("LINE"):
            ontology_terms.extend(["SO:0000194", "SO:LINE_element"])
        elif kind.startswith("SINE"):
            ontology_terms.extend(["SO:0000206", "SO:SINE_element"])

        # Substring checks add more specific terms on top of the above.
        if "helitron" in kind_lower:
            ontology_terms.extend(["SO:0000544", "SO:helitron"])

        if "maverick" in kind_lower or "polinton" in kind_lower:
            ontology_terms.extend(["SO:0001170", "SO:polinton"])

        if "mite" in kind_lower:
            ontology_terms.extend(["SO:0000338", "SO:MITE"])

        # The suffixes must be lower case because they are compared with
        # the lower-cased kind; upper-case suffixes could never match.
        if kind_lower.endswith(("/p", "p-fungi")):
            ontology_terms.extend(["SO:0001535", "SO:p_element"])

    attributes = GFFAttributes(
        name=self.family,
        target=Target(self.family, self.tstart, self.tend),
        ontology_term=ontology_terms,
        custom=custom,
    )

    # NOTE(review): "duplication" is listed here but no branch above assigns
    # it to type_ -- confirm whether "Segmental" kinds were meant to.
    unstranded_types = {
        "monomeric_repeat", "microsatellite", "minisatellite",
        "low_complexity_region", "satellite_DNA", "duplication"
    }
    if type_ in unstranded_types:
        strand = Strand.UNSTRANDED
    else:
        strand = Strand.parse(self.strand)

    record = GFFRecord(
        seqid=self.query,
        source=source,
        type=type_,
        start=self.qstart,
        end=self.qend,
        score=self.perc_divergence,
        strand=strand,
        attributes=attributes,
    )
    return record
def main():
    """Rewrite mRNA features and their flanks as helitron features.

    Every non-comment input record has its Name moved to ID and its custom
    attributes cleared. After missing parents are inferred, each ``mRNA``
    with at least two children is printed as:

    * a ``repeat_region`` (a deep copy of the mRNA) acting as the parent,
    * the mRNA itself retyped as ``helitron``,
    * the child whose ID ends in ``.5.1`` as ``five_prime_flanking_region``
      and the child whose ID ends in ``.3`` as
      ``three_prime_flanking_region``, in coordinate-friendly order for the
      strand (3' flank first on the minus strand).

    Raises:
        IndexError: If a selected mRNA lacks a child with one of the
            expected ID suffixes.
    """
    args = cli(sys.argv[0], sys.argv[1:])

    rows = []
    for line in args.infile:
        if line.startswith("#"):
            continue

        record = GFFRecord.parse(line)
        record.attributes.id = record.attributes.name
        record.attributes.name = None
        record.attributes.custom = {}
        rows.append(record)

    gff = GFF(rows)
    gff.infer_missing_parents()

    counter = 1
    for mrna in gff.select_type("mRNA"):
        if len(mrna.children) < 2:
            continue

        region = deepcopy(mrna)
        region.type = "repeat_region"
        region.attributes.id = f"repeat_region{counter}"
        # Must be a plain list of terms; a trailing comma here would wrap
        # the list in a one-element tuple.
        region.attributes.ontology_term = ["SO:0000657"]

        mrna.type = "helitron"
        mrna.parents = [region]
        mrna.attributes.parent = [region.attributes.id]
        mrna.attributes.id = f"helitron{counter}"
        mrna.attributes.ontology_term = ["SO:0000544", "SO:helitron"]
        mrna.attributes.custom = {}

        # The flanks are identified by the suffix of their original ID.
        flank3 = [c for c in mrna.children
                  if c.attributes.id.endswith(".3")][0]
        flank5 = [c for c in mrna.children
                  if c.attributes.id.endswith(".5.1")][0]

        flank3.type = "three_prime_flanking_region"
        flank3.attributes.ontology_term = [
            "SO:0001417",
            "SO:three_prime_flanking_region",
            "SO:0000364",
            "SO:transposable_element_flanking_region"
        ]
        flank3.attributes.id = None
        flank3.attributes.parent = [mrna.attributes.id]

        flank5.type = "five_prime_flanking_region"
        flank5.attributes.ontology_term = [
            "SO:0001416",
            "SO:five_prime_flanking_region",
            "SO:0000364",
            "SO:transposable_element_flanking_region"
        ]
        flank5.attributes.id = None
        flank5.attributes.parent = [mrna.attributes.id]

        mrna.source = flank5.source

        print(region, file=args.outfile)
        print(mrna, file=args.outfile)
        if mrna.strand == Strand.MINUS:
            print(flank3, file=args.outfile)
            print(flank5, file=args.outfile)
        else:
            print(flank5, file=args.outfile)
            print(flank3, file=args.outfile)

        counter += 1

    return
def parse_block(handle, source, type_):
    """Parse one alignment block from ``handle`` into GFF records.

    Lines are consumed until a line starting with ``//`` (which is consumed
    too) or the end of the handle. ``#=GF ID`` sets the record name,
    ``#=GF DE`` lines are collected as notes and ``#=GF TP`` lines are split
    on ``;`` into the ``species`` custom attribute (joined with ``:``).
    Every line not starting with ``#`` contributes its first
    whitespace-separated token as a sequence ID, which ``id_to_loc`` turns
    into a location.

    Args:
        handle: Iterator over the lines of the input.
        source: Value for the GFF source column.
        type_: Value for the GFF type column.

    Returns:
        One record per distinct sequence ID, in order of first appearance.
        All records share the block's name, notes and species.
    """
    note = []
    name = None
    species = []
    ids = []

    for line in handle:
        sline = line.strip()
        if line.startswith("//"):
            break

        if line.startswith("#=GF ID"):
            name = sline.split("ID", maxsplit=1)[-1].strip()
        elif line.startswith("#=GF DE"):
            note.append(sline.split("DE", maxsplit=1)[-1].strip())
        elif line.startswith("#=GF TP"):
            species.extend(
                sline.split("TP", maxsplit=1)[-1].strip().split(";")
            )
        elif not line.startswith("#"):
            ids.append(line.split(maxsplit=1)[0])

    seen = set()
    out = []
    for id_ in ids:
        if id_ in seen:
            continue
        # Remember the sequence ID itself (not the builtin `id`) so that
        # repeated IDs are emitted only once.
        seen.add(id_)

        seqid, start, end, strand = id_to_loc(id_)

        if len(species) == 0:
            custom = {}
        else:
            custom = {"species": ":".join(species)}

        attributes = GFFAttributes(
            name=name,
            note=note,
            custom=custom,
        )

        record = GFFRecord(
            seqid=seqid,
            source=source,
            type=type_,
            start=start,
            end=end,
            score=None,
            strand=strand,
            attributes=attributes,
        )
        out.append(record)

    return out
def deal_with_block(block: List[str], gene_num: int) -> List[GFF3Record]:
    """Convert one block of GTF-style lines into gene, mRNA and CDS records.

    The block must contain exactly one ``gene`` line and exactly one
    ``similarity`` line (enforced with ``assert``), plus ``exon`` lines.
    Each exon becomes a CDS with ID ``CDS<gene_num>`` whose parent is
    ``mRNA<gene_num>``; the mRNA is inferred from the CDSs and attached to
    the gene ``gene<gene_num>``. The ``Query`` attribute of the similarity
    line is copied onto the gene, mRNA and every CDS as ``query``; the
    gene also carries ``identity`` and ``similarity``.

    Returns:
        ``[gene, mrna, cds_1, ..., cds_n]``.
    """
    parsed: Dict[str, List[GFFRecord[GTFAttributes]]] = dict()
    for line in block:
        rec = GFFRecord.parse(line, attr=GTFAttributes)
        parsed.setdefault(rec.type, []).append(rec)

    assert len(parsed["gene"]) == 1
    assert len(parsed["similarity"]) == 1

    gene_parsed = parsed["gene"][0]
    similarity_parsed = parsed["similarity"][0]

    custom: Dict[str, str] = dict()
    if similarity_parsed.attributes is not None:
        custom["query"] = similarity_parsed.attributes.custom["Query"]

    if gene_parsed.attributes is not None:
        custom["identity"] = gene_parsed.attributes.custom["identity"]
        custom["similarity"] = gene_parsed.attributes.custom["similarity"]

    gene = GFF3Record(
        gene_parsed.seqid,
        "exonerate",
        type="gene",
        start=gene_parsed.start,
        end=gene_parsed.end,
        score=gene_parsed.score,
        strand=gene_parsed.strand,
        phase=gene_parsed.phase,
        attributes=GFF3Attributes(
            id=f"gene{gene_num}",
            custom=custom,
        )
    )

    mrna_id = f"mRNA{gene_num}"

    cdss = []
    for exon in parsed["exon"]:
        if exon.attributes is not None:
            exon_custom = exon.attributes.custom
        else:
            exon_custom = None

        cdss.append(
            GFF3Record(
                exon.seqid,
                "exonerate",
                "CDS",
                exon.start,
                exon.end,
                exon.score,
                exon.strand,
                exon.phase,
                attributes=GFF3Attributes(
                    id=f"CDS{gene_num}",
                    parent=[mrna_id],
                    custom=exon_custom,
                )
            )
        )

    for cds in cdss:
        if gene.attributes is None:
            continue
        # This is safe because we added attributes.
        assert cds.attributes is not None
        cds.attributes.custom["query"] = gene.attributes.custom["query"]

    mrna = GFF3Record.infer_from_children(
        cdss,
        id=mrna_id,
        seqid=gene.seqid,
        source="exonerate",
        type="mRNA",
        strand=gene.strand,
        score=gene.score,
    )
    mrna.add_parent(gene)

    if gene.attributes is not None:
        # This is safe because infer_from_children adds an ID to attributes.
        assert mrna.attributes is not None
        if gene.attributes.id is not None:
            mrna.attributes.parent = [gene.attributes.id]
        mrna.attributes.custom["query"] = gene.attributes.custom["query"]

    return [gene, mrna, *cdss]