예제 #1
0
def transform_child(
    feature: GFFRecord,
    group_name: str,
    gff_to_hints: Dict[str, str],
    type_to_trim: Dict[str, int],
    type_to_priority: Dict[str, int],
    source: str,
    priority: int,
) -> Optional[GFFRecord]:
    """Turn a regular feature into a hint record.

    The input feature is not modified directly; a shallow ``copy`` of it is
    edited and returned.

    Args:
        feature: The record to convert.
        group_name: Value stored under the ``group`` attribute of the hint.
        gff_to_hints: Maps feature types onto hint types.
        type_to_trim: Maps hint types onto the amount passed to
            ``trim_ends``; hint types without an entry are trimmed by 0.
        type_to_priority: Maps hint types onto a value added to ``priority``.
        source: Value stored under the ``source`` attribute of the hint.
        priority: Base priority of the hint.

    Returns:
        The converted copy, or ``None`` when no hint type could be found for
        the feature.

    Raises:
        KeyError: If the resolved hint type has no entry in
            ``type_to_priority``.
    """

    hint = copy(feature)

    # Types that are direct keys of gff_to_hints are looked up as they are;
    # anything else is translated through GFF_TYPE_MAP first (possibly to
    # None when the type is unknown there as well).
    if hint.type in gff_to_hints:
        lookup_type = hint.type
    else:
        lookup_type = GFF_TYPE_MAP.get(hint.type, None)

    # NOTE(review): `applicative` presumably skips the lookup when
    # lookup_type is None -- confirm against its definition.
    hint_type: Optional[str] = applicative(
        lambda t: gff_to_hints.get(t, None),
        lookup_type,
    )

    if hint_type is None:
        return None

    hint.type = hint_type
    hint.trim_ends(type_to_trim.get(hint.type, 0))
    boost = type_to_priority[hint.type]

    hint.attributes = GFFAttributes(custom={
        "source": source,
        "group": group_name,
        "priority": str(priority + boost),
    })
    return hint
예제 #2
0
def main():
    """Rewrite the records read from ``args.infile`` and print them.

    Comment lines are skipped. For every other line the record is parsed,
    its custom attributes are redistributed into a new ``GFFAttributes``
    (name, dbxref, target, note, ontology terms and a reduced custom set),
    the source and type are replaced by the command line values, the score
    is replaced by the e-value, and the record is printed to
    ``args.outfile``. When ``args.best`` is set, records whose ``olp``
    attribute is ``"="`` are dropped.
    """

    args = cli(sys.argv[0], sys.argv[1:])

    # Accession -> GO terms lookup; empty when no mapping file was given.
    go = parse_rfam2go(args.go) if args.go is not None else {}

    for line in args.infile:
        if line.startswith("#"):
            continue

        record = GFFRecord.parse(line)
        fields = record.attributes.custom

        if args.best and fields["olp"] == "=":
            continue

        # The original feature type becomes the Name of the new record.
        name = record.type

        dbxrefs = ["Rfam:" + fields["mdlaccn"], "Rfam:" + name]
        if "clan" in fields:
            dbxrefs.append("RfamClan:" + fields["clan"])

        ontology_terms = go.get(fields["mdlaccn"], [])
        notes = [fields["desc"]]

        target = Target(
            fields["mdlaccn"],
            int(fields["mdlfrom"]),
            int(fields["mdlto"]),
        )

        custom = {
            "evalue": fields["evalue"],
            "model_type": fields["mdl"],
            "gc": fields["gc"],
            "bias": fields["bias"],
            # The old score column is kept as an attribute because the
            # column itself is overwritten with the e-value below.
            "bitscore": record.score,
        }

        if fields["trunc"] == "yes":
            custom["truncated_match"] = "true"

        if fields["olp"] == "=":
            custom["overlap_with_better_score"] = "true"

        record.source = args.source
        record.type = args.type
        record.score = float(custom["evalue"])
        record.attributes = GFFAttributes(
            name=name,
            dbxref=dbxrefs,
            target=target,
            note=notes,
            ontology_term=ontology_terms,
            custom=custom,
        )

        print(record, file=args.outfile)
예제 #3
0
def decode_gff(infiles, outfile, map_, column):
    """Translate one column of GFF records using ``map_`` and write them out.

    Comment lines (starting with ``#``) are passed through stripped of
    surrounding whitespace. Every other line is parsed as a GFF record and
    handed, together with ``map_``, to the replacement function selected by
    ``column``. Output is buffered and written to ``outfile`` in chunks, one
    record per line, each line terminated by a newline.

    Args:
        infiles: Input handles, combined with ``join_files``.
        outfile: Writable text handle receiving the translated records.
        map_: Mapping object forwarded to the replacement function.
        column: One of ``"id"``, ``"name"`` or ``"seqid"``.

    Raises:
        ValueError: If ``column`` is not one of the supported values.
    """
    inhandles = join_files(infiles, header=False)

    if column == "id":
        trans_function = replace_gff_id
    elif column == "name":
        trans_function = replace_gff_name
    elif column == "seqid":
        trans_function = replace_gff_seqid
    else:
        raise ValueError("This shouldn't ever happen")

    record_chunk = list()
    for i, line in enumerate(inhandles):
        if line.startswith("#"):
            record_chunk.append(line.strip())
            continue

        old_record = GFFRecord.parse(line)
        new_record = trans_function(old_record, map_)
        record_chunk.append(str(new_record))

        if i % 10000 == 0:
            outfile.write("\n".join(record_chunk))
            # Terminate the chunk; without this the last line of one chunk
            # and the first line of the next end up on the same output line.
            outfile.write("\n")
            record_chunk = list()

    if len(record_chunk) > 0:
        outfile.write("\n".join(record_chunk))
        outfile.write("\n")

    return
예제 #4
0
def match_to_anticodon(
    match: TRNAScanRecord,
    ss: TRNAScanSS,
    source: str,
    type: str = "anticodon",
    parents: Sequence[GFFRecord] = (),
) -> GFFRecord:
    """Build the anticodon feature for a tRNAscan match.

    Args:
        match: The match providing seqid, score and numbering.
        ss: The secondary structure record providing the anticodon
            coordinates.
        source: Value for the GFF source column.
        type: Value for the GFF type column; also used inside the ID.
        parents: Parent records. Only parents that have attributes with a
            non-``None`` ID contribute to the ``Parent`` attribute.

    Returns:
        A new record spanning the anticodon, with ID
        ``<seqid>.<type><num>``.
    """
    start, end, strand = fix_strand(ss.anticodon_start, ss.anticodon_end)

    parent_ids = [
        p.attributes.id
        for p
        in parents
        if (p.attributes is not None
            and p.attributes.id is not None)
    ]

    anticodon = GFFRecord(
        seqid=match.seqid,
        source=source,
        type=type,
        start=start,
        end=end,
        score=match.infernal_score,
        strand=strand,
        phase=Phase.NOT_CDS,
        attributes=GFFAttributes(
            id=f"{match.seqid}.{type}{match.num}",
            parent=parent_ids,
        ),
        # Give the record its own list so that neither the (now immutable)
        # default nor the caller's sequence is shared with the record.
        parents=list(parents)
    )
    return anticodon
예제 #5
0
def match_to_introns(
    match: TRNAScanRecord,
    source: str,
    type: str = "tRNA_intron",
    parents: Sequence[GFFRecord] = (),
) -> List[GFFRecord]:
    """Build one intron feature per intron reported for a tRNAscan match.

    Intron coordinates are taken pairwise from ``match.intron_starts`` and
    ``match.intron_ends``. All introns of a match share the same ID
    (``<seqid>.<type><num>``).

    Args:
        match: The match providing seqid, score, numbering and intron
            coordinates.
        source: Value for the GFF source column.
        type: Value for the GFF type column; also used inside the ID.
        parents: Parent records. Only parents that have attributes with a
            non-``None`` ID contribute to the ``Parent`` attribute.

    Returns:
        The intron records, in the order the coordinates were listed. Empty
        when the match has no introns.
    """
    introns = []

    parent_ids = [
        p.attributes.id
        for p
        in parents
        if (p.attributes is not None
            and p.attributes.id is not None)
    ]

    for istart, iend in zip(match.intron_starts, match.intron_ends):
        start, end, strand = fix_strand(istart, iend)
        intron = GFFRecord(
            seqid=match.seqid,
            source=source,
            type=type,
            start=start,
            end=end,
            score=match.infernal_score,
            strand=strand,
            phase=Phase.NOT_CDS,
            attributes=GFFAttributes(
                id=f"{match.seqid}.{type}{match.num}",
                parent=parent_ids,
            ),
            # Each record gets its own list so that the records do not share
            # one parents object with each other, the default, or the caller.
            parents=list(parents)
        )
        introns.append(intron)
    return introns
예제 #6
0
def dict_to_record(row, source="gffpal", ext_attributes=None):
    """Convert one row of a tabular search result into a GFF record.

    Args:
        row: Mapping from column name to string value. Must contain either
            a ``#target`` or a ``target`` column, plus the coordinate,
            score and ``cigar`` columns read below.
        source: Value for the GFF source column.
        ext_attributes: Optional mapping from query name to a mapping with
            the keys ``Name``, ``Alias``, ``Dbxref``, ``Ontology_term`` and
            ``Note``, copied into the record's attributes.

    Returns:
        A ``nucleotide_to_protein_match`` record on the target sequence
        whose score is the row's e-value.

    Raises:
        ValueError: If the row has neither a ``#target`` nor a ``target``
            column.
    """

    if "#target" in row:
        seqid = row["#target"]
    elif "target" in row:
        seqid = row["target"]
    else:
        raise ValueError("Input file doesn't have a 'target' column.")

    start = int(row["tstart"])
    end = int(row["tend"])

    # Coordinates given in descending order are treated as a minus strand
    # hit and flipped so that start <= end in the record.
    if start > end:
        start, end = end, start
        strand = Strand.MINUS
    else:
        strand = Strand.PLUS

    custom_attrs = {
        "pident": row["pident"],
        "alnlen": row["alnlen"],
        "score": row["raw"],
        "bitscore": row["bits"],
        "gapopen": row["gapopen"],
        "query_coverage": row["qcov"],
        "evalue": row["evalue"],
    }

    target = Target(row["query"], int(row["qstart"]), int(row["qend"]))
    gap = parse_cigar(row["cigar"])

    if ext_attributes is not None:
        extra = ext_attributes[row["query"]]
        attributes = GFFAttributes(
            target=target,
            custom=custom_attrs,
            gap=gap,
            name=extra["Name"],
            alias=extra["Alias"],
            dbxref=extra["Dbxref"],
            ontology_term=extra["Ontology_term"],
            note=extra["Note"],
        )
    else:
        attributes = GFFAttributes(target=target, custom=custom_attrs, gap=gap)

    return GFFRecord(
        seqid=seqid,
        source=source,
        type="nucleotide_to_protein_match",
        start=start,
        end=end,
        score=float(row["evalue"]),
        strand=strand,
        attributes=attributes,
    )
예제 #7
0
def encode_gff(
    infiles,
    outfile,
    mapfile,
    column,
    id_conv,
):
    """Replace one column of GFF records with generated ids.

    Comment lines are passed through stripped of surrounding whitespace.
    Every other line is parsed and handed to the replacement function
    selected by ``column`` together with the ``seen`` mapping, the pending
    id list and a callable that draws the next value from ``id_conv``.
    Records go to ``outfile`` one per line; the collected id pairs go to
    ``mapfile`` as tab separated lines. Both are flushed in chunks.

    Args:
        infiles: Input handles, combined with ``join_files``.
        outfile: Writable text handle for the rewritten records.
        mapfile: Writable text handle for the id table.
        column: One of ``"id"``, ``"name"`` or ``"seqid"``.
        id_conv: Iterator yielding the replacement ids.

    Raises:
        ValueError: If ``column`` is not one of the supported values.
    """
    inhandles = join_files(infiles, header=False)
    seen = dict()

    translators = {
        "id": replace_gff_id,
        "name": replace_gff_name,
        "seqid": replace_gff_seqid,
    }
    if column not in translators:
        raise ValueError("This shouldn't ever happen")
    trans_function = translators[column]

    pending_ids = []
    pending_records = []

    for index, line in enumerate(inhandles):
        if line.startswith("#"):
            pending_records.append(line.strip())
            continue

        parsed = GFFRecord.parse(line)
        converted = trans_function(
            parsed,
            seen,
            pending_ids,
            # The argument is ignored; every call just draws the next id.
            lambda _: next(id_conv),
        )
        pending_records.append(str(converted))

        if index % 10000 != 0:
            continue

        # Periodic flush of whatever has accumulated so far.
        if pending_records:
            outfile.write("\n".join(pending_records))
            outfile.write("\n")
            pending_records = []

        if pending_ids:
            mapfile.write("".join(f"{n}\t{o}\n" for n, o in pending_ids))
            pending_ids = []

    # Final flush of anything left over after the last periodic write.
    if pending_records:
        outfile.write("\n".join(pending_records))
        outfile.write("\n")

    if pending_ids:
        mapfile.write("".join(f"{n}\t{o}\n" for n, o in pending_ids))
    return
예제 #8
0
def match_to_trna(
    match: TRNAScanRecord,
    ss: TRNAScanSS,
    source: str,
    type_map: Mapping[str, str] = TYPE_MAP,
    parents: Sequence[GFFRecord] = ()
) -> GFFRecord:
    """Build the tRNA feature for a tRNAscan match.

    Args:
        match: The match providing coordinates, score, numbering, note,
            anticodon and tRNA type.
        ss: The secondary structure record; its ``ss`` value is stored in
            the ``secondary_structure`` attribute.
        source: Value for the GFF source column.
        type_map: Maps the lower-cased tRNA type onto a GFF feature type.
            Types without an entry fall back to ``"tRNA"``.
        parents: Parent records. Only parents that have attributes with a
            non-``None`` ID contribute to the ``Parent`` attribute.

    Returns:
        A new record with ID ``<seqid>.tRNA<num>``.
    """
    start, end, strand = fix_strand(match.start, match.end)

    parent_ids = [
        p.attributes.id
        for p
        in parents
        if (p.attributes is not None
            and p.attributes.id is not None)
    ]

    # A missing or empty note results in no Note attribute at all.
    if match.note is None or match.note == "":
        notes: List[str] = []
    else:
        notes = [match.note]

    trna = GFFRecord(
        seqid=match.seqid,
        source=source,
        type=type_map.get(match.trna_type.lower(), "tRNA"),
        start=start,
        end=end,
        score=match.infernal_score,
        strand=strand,
        phase=Phase.NOT_CDS,
        attributes=GFFAttributes(
            id=f"{match.seqid}.tRNA{match.num}",
            parent=parent_ids,
            note=notes,
            custom={
                "secondary_structure": ss.ss,
                "anticodon": match.anticodon,
                "amino_acid": match.trna_type,
            }
        ),
        # Give the record its own list so that neither the (now immutable)
        # default nor the caller's sequence is shared with the record.
        parents=list(parents)
    )
    return trna
예제 #9
0
def rnammer2gff(args: argparse.Namespace) -> None:
    """Convert tab separated rRNA predictions into gene/rRNA record pairs.

    For every non-comment line of ``args.infile`` the ninth column is read
    as the rRNA type and translated through ``TYPE_MAP[args.kingdom]``; the
    source column is replaced by ``args.source`` and the ninth column is
    blanked to ``"."``. Each parsed record is paired with a deep copy typed
    ``rRNA_gene`` that is registered as its parent. Finally all records are
    traversed, given sequential ids, and printed to ``args.outfile``.

    Args:
        args: Parsed command line arguments; ``infile``, ``outfile``,
            ``kingdom`` and ``source`` are read.
    """
    records: List[GFFRecord] = []

    for line in args.infile:
        if line.startswith("#"):
            continue

        columns = line.strip().split("\t")
        rrna_type = columns[8]
        columns[2] = TYPE_MAP[args.kingdom][rrna_type.lower()]
        columns[1] = args.source
        columns[8] = "."

        rna_record = GFFRecord.parse("\t".join(columns))

        # The gene is an exact copy of the rRNA apart from its type.
        gene_record = deepcopy(rna_record)
        gene_record.type = "rRNA_gene"
        gene_record.add_child(rna_record)

        records.extend([gene_record, rna_record])

    num = 0
    for record in GFF(records).traverse_children(sort=True):
        attr = record.attributes
        if attr is None:
            attr = GFFAttributes()
            record.attributes = attr

        if record.type == "rRNA_gene":
            num += 1
            attr.id = f"rRNA_gene{num}"
        else:
            # Reuses the counter of the most recently seen gene.
            # NOTE(review): this presumably relies on the traversal yielding
            # each gene before its child -- confirm.
            attr.id = f"rRNA{num}"
            attr.parent = [
                parent.attributes.id
                for parent in record.parents
                if (parent.attributes is not None
                    and parent.attributes.id is not None)
            ]

        print(record, file=args.outfile)

    return
예제 #10
0
def get_non_canon_stop_codon(
    seqid: str,
    start: int,
    end: int,
    strand: Strand,
    codon: str,
    parent_id: Optional[str],
) -> GFFRecord:
    """Create a ``stop_codon`` record flagged as non-canonical.

    Args:
        seqid: Sequence the codon lies on.
        start: Start coordinate of the codon.
        end: End coordinate of the codon.
        strand: Strand of the codon.
        codon: The codon sequence, stored in the ``codon`` attribute.
        parent_id: When not ``None``, stored in the ``cds_parent``
            attribute.

    Returns:
        A record with source ``gffpal``, no score and phase
        ``Phase.NOT_CDS``.
    """

    if parent_id is None:
        custom = {"codon": codon}
    else:
        custom = {"codon": codon, "cds_parent": parent_id}

    attributes = GFFAttributes(
        ontology_term=["SO:0000319"],
        note=["Non-canonical stop codon"],
        custom=custom,
    )

    return GFFRecord(
        seqid,
        "gffpal",
        "stop_codon",
        start,
        end,
        None,
        strand,
        Phase.NOT_CDS,
        attributes,
    )
예제 #11
0
def match_to_gene(
    match: TRNAScanRecord,
    source: str,
    type: str
) -> GFFRecord:
    """Build the gene-level feature for a tRNAscan match.

    Args:
        match: The match providing seqid, coordinates, score and numbering.
        source: Value for the GFF source column.
        type: Value for the GFF type column; also used inside the ID.

    Returns:
        A new record with ID ``<seqid>.<type><num>`` and no parents.
    """
    start, end, strand = fix_strand(match.start, match.end)

    attributes = GFFAttributes(
        id=f"{match.seqid}.{type}{match.num}",
    )

    return GFFRecord(
        seqid=match.seqid,
        source=source,
        type=type,
        start=start,
        end=end,
        score=match.infernal_score,
        strand=strand,
        phase=Phase.NOT_CDS,
        attributes=attributes,
    )
예제 #12
0
    def as_gff3(self, source="RepeatMasker"):
        """ Converts this hit into a GFF record.

        The feature type and ontology terms are derived from ``self.kind``
        (and, for simple repeats, from the repeat unit found in
        ``self.family``). Simple repeats, low complexity regions and
        satellites are written unstranded; everything else keeps the parsed
        strand.

        Args:
            source: Value for the GFF source column.

        Returns:
            A record on ``self.query`` spanning ``qstart``..``qend`` whose
            score is the percent divergence.

        Raises:
            AttributeError: If the kind is ``Simple_repeat`` but the family
                does not start with a ``(UNIT)n`` pattern.
        """

        custom = {
            "smith_waterman_score": str(self.score),
            "percent_divergence": str(self.perc_divergence),
            "percent_deletions": str(self.perc_deletions),
            "percent_insertions": str(self.perc_insertions),
            "family_consensus_length": str(self.tend + self.tremaining),
        }

        ontology_terms = []

        if self.better_hit:
            custom["has_better_overlapping_hit"] = "true"

        # Step 1: pick the feature type.
        if self.kind == "Simple_repeat":
            # The repeat unit is the text between the brackets of "(UNIT)n".
            repeat_unit = re.match(r"\((?P<rep>.+)\)n",
                                   self.family).group("rep")

            custom["repeat_unit"] = repeat_unit

            if len(repeat_unit) == 1:
                type_ = "monomeric_repeat"
                ontology_terms.extend(["SO:0001934", "SO:monomeric_repeat"])
            elif len(repeat_unit) < 10:
                type_ = "microsatellite"
                ontology_terms.extend(["SO:0000289", "SO:microsatellite"])
            else:
                type_ = "minisatellite"
                ontology_terms.extend(["SO:0000643", "SO:minisatellite"])

        elif self.kind == "Low_complexity":
            type_ = "low_complexity_region"
            ontology_terms.extend(["SO:0001005", "SO:low_complexity_region"])

        else:
            type_ = "repeat_region"
            ontology_terms.extend([
                "SO:0000657", "SO:repeat_region", "SO:0000347",
                "SO:nucleotide_match"
            ])
            custom["repeat_family"] = self.kind

        # Step 2: add terms based on the prefix of the kind. Only the first
        # matching prefix applies.
        if self.kind == "Other/DNA_virus":
            ontology_terms.extend(["SO:0001041", "SO:viral_sequence"])

        elif self.kind.startswith("snRNA"):
            ontology_terms.extend(["SO:0001268", "SO:snRNA_gene"])

        elif self.kind.startswith("tRNA"):
            ontology_terms.extend(["SO:0001272", "SO:tRNA_gene"])

        elif self.kind.startswith("rRNA"):
            ontology_terms.extend(["SO:0001637", "SO:rRNA_gene"])

        elif self.kind.startswith("scRNA"):
            ontology_terms.extend(["SO:0001266", "SO:scRNA_gene"])

        elif self.kind.startswith("Segmental"):
            ontology_terms.extend(["SO:1000035", "SO:duplication"])

        elif self.kind.startswith("Satellite"):
            type_ = "satellite_DNA"
            ontology_terms.extend(["SO:0000005", "SO:satellite_DNA"])
            # Always present here: a kind starting with "Satellite" went
            # through the generic repeat_region branch above.
            del custom["repeat_family"]

        elif self.kind.startswith("Retrotransposon"):
            ontology_terms.extend(["SO:0000180", "SO:retrotransposon"])

        elif self.kind.startswith("DNA"):
            ontology_terms.extend(["SO:0000182", "SO:DNA_transposon"])

        elif self.kind.startswith("LTR"):
            ontology_terms.extend(["SO:0000186", "SO:LTR_retrotransposon"])

        elif self.kind.startswith("LINE"):
            ontology_terms.extend(["SO:0000194", "SO:LINE_element"])

        elif self.kind.startswith("SINE"):
            ontology_terms.extend(["SO:0000206", "SO:SINE_element"])

        # Step 3: add terms based on case-insensitive matches anywhere in
        # the kind. These are independent of each other and of step 2.
        kind_lower = self.kind.lower()

        if "helitron" in kind_lower:
            ontology_terms.extend(["SO:0000544", "SO:helitron"])

        if ("maverick" in kind_lower
                or "polinton" in kind_lower):
            ontology_terms.extend(["SO:0001170", "SO:polinton"])

        if "mite" in kind_lower:
            ontology_terms.extend(["SO:0000338", "SO:MITE"])

        # The suffixes must be lower case because they are compared against
        # the lower-cased kind; upper-case suffixes could never match.
        if (kind_lower.endswith("/p")
                or kind_lower.endswith("p-fungi")):
            ontology_terms.extend(["SO:0001535", "SO:p_element"])

        attributes = GFFAttributes(
            name=self.family,
            target=Target(self.family, self.tstart, self.tend),
            ontology_term=ontology_terms,
            custom=custom,
        )

        # NOTE(review): "duplication" is listed here but nothing above ever
        # assigns it to type_ -- confirm whether the "Segmental" branch was
        # meant to set it.
        if type_ in {
                "monomeric_repeat", "microsatellite", "minisatellite",
                "low_complexity_region", "satellite_DNA", "duplication"
        }:
            strand = Strand.UNSTRANDED
        else:
            strand = Strand.parse(self.strand)

        record = GFFRecord(
            seqid=self.query,
            source=source,
            type=type_,
            start=self.qstart,
            end=self.qend,
            score=self.perc_divergence,
            strand=strand,
            attributes=attributes,
        )
        return record
예제 #13
0
def main():
    """Rewrite mRNA features and their flanks as helitron annotations.

    Every non-comment line of ``args.infile`` is parsed; the record's name
    becomes its ID and its name and custom attributes are cleared. Missing
    parents are inferred, and then for each ``mRNA`` with at least two
    children:

    * a deep copy typed ``repeat_region`` is created as the new parent,
    * the mRNA itself is retyped to ``helitron``,
    * the children whose original IDs end in ``.3`` and ``.5.1`` become the
      three and five prime flanking regions.

    The four records are printed to ``args.outfile``, flanks ordered by
    strand. mRNAs with fewer than two children are skipped.

    Raises:
        IndexError: If a selected mRNA has no child with the expected
            ``.3`` or ``.5.1`` ID suffix.
    """
    args = cli(sys.argv[0], sys.argv[1:])

    rows = list()
    for line in args.infile:
        if line.startswith("#"):
            continue
        record = GFFRecord.parse(line)
        record.attributes.id = record.attributes.name
        record.attributes.name = None
        record.attributes.custom = {}
        rows.append(record)

    gff = GFF(rows)
    gff.infer_missing_parents()

    counter = 1

    for mrna in gff.select_type("mRNA"):
        if len(mrna.children) < 2:
            continue

        region = deepcopy(mrna)
        region.type = "repeat_region"
        region.attributes.id = f"repeat_region{counter}"
        # Must be a plain list: a trailing comma here would wrap the list in
        # a one-element tuple.
        region.attributes.ontology_term = ["SO:0000657"]

        mrna.type = "helitron"
        mrna.parents = [region]
        mrna.attributes.parent = [region.attributes.id]
        mrna.attributes.id = f"helitron{counter}"
        mrna.attributes.ontology_term = ["SO:0000544", "SO:helitron"]
        mrna.attributes.custom = {}

        # The flanks are identified by the suffix of their original IDs.
        flank3 = [c for c in mrna.children
                  if c.attributes.id.endswith(".3")][0]
        flank5 = [c for c in mrna.children
                  if c.attributes.id.endswith(".5.1")][0]

        flank3.type = "three_prime_flanking_region"
        flank3.attributes.ontology_term = [
            "SO:0001417",
            "SO:three_prime_flanking_region",
            "SO:0000364",
            "SO:transposable_element_flanking_region"
        ]
        flank3.attributes.id = None
        flank3.attributes.parent = [mrna.attributes.id]

        flank5.type = "five_prime_flanking_region"
        flank5.attributes.ontology_term = [
            "SO:0001416",
            "SO:five_prime_flanking_region",
            "SO:0000364",
            "SO:transposable_element_flanking_region"
        ]
        flank5.attributes.id = None
        flank5.attributes.parent = [mrna.attributes.id]

        mrna.source = flank5.source

        print(region, file=args.outfile)
        print(mrna, file=args.outfile)
        # On the minus strand the three prime flank is printed first.
        if mrna.strand == Strand.MINUS:
            print(flank3, file=args.outfile)
            print(flank5, file=args.outfile)
        else:
            print(flank5, file=args.outfile)
            print(flank3, file=args.outfile)

        counter += 1

    return
예제 #14
0
def parse_block(handle, source, type_):
    """Read one ``//``-terminated alignment block and emit its records.

    Lines are consumed from ``handle`` up to and including the first line
    starting with ``//``. ``#=GF ID`` sets the name, ``#=GF DE`` lines are
    collected as notes, ``#=GF TP`` lines are split on ``;`` and collected
    into the ``species`` attribute, and the first whitespace-separated
    token of every non-comment line is taken as a sequence id. One record
    is created per distinct sequence id, in order of first appearance.

    Args:
        handle: Iterable of lines; left positioned just after the ``//``
            line.
        source: Value for the GFF source column.
        type_: Value for the GFF type column.

    Returns:
        A list of records, one per distinct sequence id. Empty when the
        block contains no sequence lines.
    """

    note = []
    name = None
    species = []

    ids = []

    for line in handle:
        sline = line.strip()
        if line.startswith("//"):
            break

        if line.startswith("#=GF ID"):
            name = sline.split("ID", maxsplit=1)[-1].strip()

        elif line.startswith("#=GF DE"):
            note.append(sline.split("DE", maxsplit=1)[-1].strip())

        elif line.startswith("#=GF TP"):
            species.extend(
                sline.split("TP", maxsplit=1)[-1].strip().split(";")
            )

        elif not line.startswith("#"):
            ids.append(line.split(maxsplit=1)[0])

    seen = set()

    out = []
    for id_ in ids:
        if id_ in seen:
            continue
        # Record the id itself (not the builtin `id`) so that repeated
        # sequence lines produce only one record.
        seen.add(id_)

        seqid, start, end, strand = id_to_loc(id_)

        if len(species) == 0:
            custom = {}
        else:
            custom = {"species": ":".join(species)}

        attributes = GFFAttributes(
            name=name,
            note=note,
            custom=custom,
        )

        record = GFFRecord(
            seqid=seqid,
            source=source,
            type=type_,
            start=start,
            end=end,
            score=None,
            strand=strand,
            attributes=attributes,
        )

        out.append(record)

    return out
예제 #15
0
def deal_with_block(block: List[str], gene_num: int) -> List[GFF3Record]:
    """Convert one block of GTF lines into gene, mRNA and CDS records.

    The block must contain exactly one ``gene`` line and exactly one
    ``similarity`` line. Every ``exon`` line becomes a CDS; the mRNA is
    inferred from the CDSs and attached to the gene. The query name taken
    from the similarity line is copied onto the gene, the mRNA and every
    CDS.

    Args:
        block: The lines belonging to a single gene.
        gene_num: Number used to build the ``gene<N>``, ``mRNA<N>`` and
            ``CDS<N>`` identifiers.

    Returns:
        The gene, then the mRNA, then the CDS records.

    Raises:
        KeyError: If the block has no ``gene``, ``similarity`` or ``exon``
            line, or an expected custom attribute is missing.
        AssertionError: If there is more than one ``gene`` or
            ``similarity`` line.
    """

    # Group the parsed lines by feature type.
    parsed: Dict[str, List[GFFRecord[GTFAttributes]]] = dict()
    for line in block:
        rec = GFFRecord.parse(line, attr=GTFAttributes)
        parsed.setdefault(rec.type, []).append(rec)

    gene_lines = parsed["gene"]
    assert len(gene_lines) == 1
    similarity_lines = parsed["similarity"]
    assert len(similarity_lines) == 1

    gene_parsed = gene_lines[0]
    similarity_parsed = similarity_lines[0]

    custom: Dict[str, str] = dict()
    if similarity_parsed.attributes is not None:
        custom["query"] = similarity_parsed.attributes.custom["Query"]

    if gene_parsed.attributes is not None:
        for key in ("identity", "similarity"):
            custom[key] = gene_parsed.attributes.custom[key]

    gene = GFF3Record(
        gene_parsed.seqid,
        "exonerate",
        type="gene",
        start=gene_parsed.start,
        end=gene_parsed.end,
        score=gene_parsed.score,
        strand=gene_parsed.strand,
        phase=gene_parsed.phase,
        attributes=GFF3Attributes(
            id=f"gene{gene_num}",
            custom=custom,
        )
    )

    cds_id = f"CDS{gene_num}"
    mrna_id = f"mRNA{gene_num}"

    # Every exon becomes a CDS; all of them share one ID and one parent.
    cdss = []
    for exon in parsed["exon"]:
        if exon.attributes is not None:
            exon_custom = exon.attributes.custom
        else:
            exon_custom = None

        cdss.append(GFF3Record(
            exon.seqid,
            "exonerate",
            "CDS",
            exon.start,
            exon.end,
            exon.score,
            exon.strand,
            exon.phase,
            attributes=GFF3Attributes(
                id=cds_id,
                parent=[mrna_id],
                custom=exon_custom,
            )
        ))

    if gene.attributes is not None:
        for cds in cdss:
            # This is safe because we added attributes.
            assert cds.attributes is not None
            cds.attributes.custom["query"] = gene.attributes.custom["query"]

    mrna = GFF3Record.infer_from_children(
        cdss,
        id=mrna_id,
        seqid=gene.seqid,
        source="exonerate",
        type="mRNA",
        strand=gene.strand,
        score=gene.score,
    )

    mrna.add_parent(gene)

    if gene.attributes is not None:
        # This is safe because infer_from_children adds an ID to attributes.
        assert mrna.attributes is not None
        if gene.attributes.id is not None:
            mrna.attributes.parent = [gene.attributes.id]
        mrna.attributes.custom["query"] = gene.attributes.custom["query"]

    return [gene, mrna, *cdss]