Exemplo n.º 1
0
def dict_to_record(row, source="gffpal", ext_attributes=None):
    """ Builds a GFF record from one row of a tabular alignment result.

    The sequence id is read from the "#target" column, falling back to
    "target". The "tstart"/"tend" pair gives the location; when tstart is
    greater than tend the two are swapped and the record is placed on the
    minus strand. The e-value (as a float) becomes the record score.

    Parameters
    ----------
    row
        Mapping from column name to string value for a single hit.
    source
        Value for the GFF source column.
    ext_attributes
        Optional mapping from query name to a mapping holding "Name",
        "Alias", "Dbxref", "Ontology_term" and "Note" entries, which are
        copied onto the record attributes.

    Raises
    ------
    ValueError
        If the row has neither a "#target" nor a "target" column.
    """

    for target_column in ("#target", "target"):
        if target_column in row:
            seqid = row[target_column]
            break
    else:
        raise ValueError("Input file doesn't have a 'target' column.")

    first = int(row["tstart"])
    second = int(row["tend"])

    # GFF needs start <= end, so a reversed pair is flipped and the
    # direction is carried by the strand instead.
    if first > second:
        start, end, strand = second, first, Strand.MINUS
    else:
        start, end, strand = first, second, Strand.PLUS

    # (attribute key, input column) pairs, in output order.
    attribute_columns = (
        ("pident", "pident"),
        ("alnlen", "alnlen"),
        ("score", "raw"),
        ("bitscore", "bits"),
        ("gapopen", "gapopen"),
        ("query_coverage", "qcov"),
        ("evalue", "evalue"),
    )
    custom_attrs = {key: row[column] for key, column in attribute_columns}

    target = Target(row["query"], int(row["qstart"]), int(row["qend"]))
    gap = parse_cigar(row["cigar"])

    attribute_kwargs = dict(target=target, custom=custom_attrs, gap=gap)
    if ext_attributes is not None:
        extra = ext_attributes[row["query"]]
        attribute_kwargs.update(
            name=extra["Name"],
            alias=extra["Alias"],
            dbxref=extra["Dbxref"],
            ontology_term=extra["Ontology_term"],
            note=extra["Note"],
        )
    attributes = GFFAttributes(**attribute_kwargs)

    return GFFRecord(
        seqid=seqid,
        source=source,
        type="nucleotide_to_protein_match",
        start=start,
        end=end,
        score=float(row["evalue"]),
        strand=strand,
        attributes=attributes,
    )
Exemplo n.º 2
0
def main():
    """ Command-line entry point.

    Reads GFF lines from ``args.infile``, rebuilds the attributes of each
    record from its custom fields (Rfam accession, model coordinates,
    e-value, ...) and prints the rewritten record to ``args.outfile``.
    Lines starting with "#" are skipped. When ``args.best`` is set, records
    whose "olp" field is "=" are dropped.
    """

    args = cli(sys.argv[0], sys.argv[1:])

    # Optional mapping from model accession to ontology terms.
    go = {} if args.go is None else parse_rfam2go(args.go)

    for line in args.infile:
        if line.startswith("#"):
            continue

        record = GFFRecord.parse(line)
        fields = record.attributes.custom

        if args.best and fields["olp"] == "=":
            continue

        # The incoming type column is kept as the name; the type itself is
        # replaced by args.type further down.
        name = record.type
        accession = fields["mdlaccn"]

        dbxrefs = ["Rfam:" + accession, "Rfam:" + name]
        if "clan" in fields:
            dbxrefs.append("RfamClan:" + fields["clan"])

        ontology_terms = go.get(accession, [])
        notes = [fields["desc"]]

        target = Target(
            accession,
            int(fields["mdlfrom"]),
            int(fields["mdlto"]),
        )

        # The original score column is preserved as "bitscore" because the
        # record score is overwritten with the e-value below.
        custom = {
            "evalue": fields["evalue"],
            "model_type": fields["mdl"],
            "gc": fields["gc"],
            "bias": fields["bias"],
            "bitscore": record.score,
        }

        if fields["trunc"] == "yes":
            custom["truncated_match"] = "true"

        if fields["olp"] == "=":
            custom["overlap_with_better_score"] = "true"

        record.source = args.source
        record.type = args.type
        record.score = float(custom["evalue"])
        record.attributes = GFFAttributes(
            name=name,
            dbxref=dbxrefs,
            target=target,
            note=notes,
            ontology_term=ontology_terms,
            custom=custom,
        )

        print(record, file=args.outfile)
Exemplo n.º 3
0
def transform_child(
    feature: GFFRecord,
    group_name: str,
    gff_to_hints: Dict[str, str],
    type_to_trim: Dict[str, int],
    type_to_priority: Dict[str, int],
    source: str,
    priority: int,
) -> Optional[GFFRecord]:
    """ Turns a copy of a feature into a hint record.

    The feature type is translated to a hint type via ``gff_to_hints``
    (types missing from that mapping are first looked up in
    ``GFF_TYPE_MAP``). Returns None when no hint type is found. Otherwise
    the copy gets the hint type, has its ends trimmed by the amount
    configured for that type (0 if not configured), and its attributes are
    replaced by source, group and priority entries, where priority is the
    base ``priority`` plus the boost from ``type_to_priority``.

    Raises KeyError if the hint type is absent from ``type_to_priority``.
    The input feature itself is not modified; a shallow copy is.
    """

    hint = copy(feature)

    mapped_type = (
        hint.type
        if hint.type in gff_to_hints
        else GFF_TYPE_MAP.get(hint.type, None)
    )

    # NOTE(review): applicative presumably applies the function only when
    # mapped_type is not None -- confirm against its definition.
    hint_type: Optional[str] = applicative(
        lambda t: gff_to_hints.get(t, None),
        mapped_type,
    )
    if hint_type is None:
        return None

    hint.type = hint_type
    trim_by = type_to_trim.get(hint.type, 0)
    hint.trim_ends(trim_by)

    total_priority = priority + type_to_priority[hint.type]
    hint.attributes = GFFAttributes(
        custom={
            "source": source,
            "group": group_name,
            "priority": str(total_priority),
        }
    )
    return hint
Exemplo n.º 4
0
def match_to_anticodon(
    match: TRNAScanRecord,
    ss: TRNAScanSS,
    source: str,
    type: str = "anticodon",
    parents: Sequence[GFFRecord] = []
) -> GFFRecord:
    """ Creates the anticodon feature for a tRNA match.

    The location comes from the anticodon coordinates of ``ss`` (passed
    through ``fix_strand``); seqid, score and the numeric suffix of the ID
    come from ``match``. The Parent attribute lists the IDs of all
    ``parents`` that have attributes with a non-None id.
    """

    # NOTE(review): the default for `parents` is a single shared list. It is
    # not mutated here, but it is handed to GFFRecord as-is -- confirm that
    # GFFRecord does not modify it in place.
    start, end, strand = fix_strand(ss.anticodon_start, ss.anticodon_end)

    parent_ids = []
    for parent in parents:
        parent_attrs = parent.attributes
        if parent_attrs is None or parent_attrs.id is None:
            continue
        parent_ids.append(parent_attrs.id)

    attributes = GFFAttributes(
        id=f"{match.seqid}.{type}{match.num}",
        parent=parent_ids,
    )

    return GFFRecord(
        seqid=match.seqid,
        source=source,
        type=type,
        start=start,
        end=end,
        score=match.infernal_score,
        strand=strand,
        phase=Phase.NOT_CDS,
        attributes=attributes,
        parents=parents
    )
Exemplo n.º 5
0
def match_to_introns(
    match: TRNAScanRecord,
    source: str,
    type: str = "tRNA_intron",
    parents: Sequence[GFFRecord] = [],
) -> List[GFFRecord]:
    """ Creates one intron feature per (start, end) pair of a tRNA match.

    Intron coordinates are taken pairwise from ``match.intron_starts`` and
    ``match.intron_ends`` and passed through ``fix_strand``. The Parent
    attribute lists the IDs of all ``parents`` that have attributes with a
    non-None id. Returns an empty list when the match has no introns.
    """

    # NOTE(review): the default for `parents` is a single shared list. It is
    # not mutated here, but it is handed to GFFRecord as-is -- confirm that
    # GFFRecord does not modify it in place.
    parent_ids = []
    for parent in parents:
        parent_attrs = parent.attributes
        if parent_attrs is None or parent_attrs.id is None:
            continue
        parent_ids.append(parent_attrs.id)

    def build_intron(istart, iend):
        """ Builds the record for a single intron interval. """
        start, end, strand = fix_strand(istart, iend)
        # Every intron of the same match receives the same ID.
        attributes = GFFAttributes(
            id=f"{match.seqid}.{type}{match.num}",
            parent=parent_ids,
        )
        return GFFRecord(
            seqid=match.seqid,
            source=source,
            type=type,
            start=start,
            end=end,
            score=match.infernal_score,
            strand=strand,
            phase=Phase.NOT_CDS,
            attributes=attributes,
            parents=parents
        )

    return [
        build_intron(istart, iend)
        for istart, iend
        in zip(match.intron_starts, match.intron_ends)
    ]
Exemplo n.º 6
0
def rnammer2gff(args: argparse.Namespace) -> None:
    """ Converts tab-separated rRNA predictions into gene/rRNA GFF pairs.

    For each non-comment line of ``args.infile`` the ninth column is read
    as the rRNA type and translated through ``TYPE_MAP[args.kingdom]``; the
    source column is replaced by ``args.source`` and the ninth column is
    blanked to ".". Each parsed record is paired with a deep copy of type
    "rRNA_gene" that becomes its parent. Records are then numbered in
    sorted traversal order and printed to ``args.outfile``.
    """

    records: List[GFFRecord] = []

    for line in args.infile:
        if line.startswith("#"):
            continue

        columns = line.strip().split("\t")
        rrna_type = columns[8]
        kingdom_types = TYPE_MAP[args.kingdom]
        new_type = kingdom_types[rrna_type.lower()]

        columns[1] = args.source
        columns[2] = new_type
        # The type now lives in column 3, so the last column is emptied
        # before the line is parsed.
        columns[8] = "."

        rna = GFFRecord.parse("\t".join(columns))
        gene = deepcopy(rna)
        gene.type = "rRNA_gene"
        gene.add_child(rna)

        records.extend((gene, rna))

    gene_count = 0
    for record in GFF(records).traverse_children(sort=True):
        attr = record.attributes
        if attr is None:
            attr = GFFAttributes()
            record.attributes = attr

        if record.type != "rRNA_gene":
            # Reuses the counter value of the most recently seen gene.
            attr.id = f"rRNA{gene_count}"
            attr.parent = [
                parent.attributes.id
                for parent in record.parents
                if (parent.attributes is not None
                    and parent.attributes.id is not None)
            ]
        else:
            gene_count += 1
            attr.id = f"rRNA_gene{gene_count}"

        print(record, file=args.outfile)
Exemplo n.º 7
0
def match_to_trna(
    match: TRNAScanRecord,
    ss: TRNAScanSS,
    source: str,
    type_map: Mapping[str, str] = TYPE_MAP,
    parents: Sequence[GFFRecord] = []
) -> GFFRecord:
    """ Creates the tRNA feature for a match.

    The feature type is looked up in ``type_map`` by the lower-cased
    ``match.trna_type`` and falls back to "tRNA". The secondary structure
    string, anticodon and amino acid are stored as custom attributes, and
    a non-empty ``match.note`` becomes the Note attribute. The Parent
    attribute lists the IDs of all ``parents`` that have attributes with a
    non-None id.
    """

    # NOTE(review): the default for `parents` is a single shared list. It is
    # not mutated here, but it is handed to GFFRecord as-is -- confirm that
    # GFFRecord does not modify it in place.
    start, end, strand = fix_strand(match.start, match.end)

    parent_ids = []
    for parent in parents:
        parent_attrs = parent.attributes
        if parent_attrs is None or parent_attrs.id is None:
            continue
        parent_ids.append(parent_attrs.id)

    notes: List[str] = []
    if match.note is not None and match.note != "":
        notes.append(match.note)

    feature_type = type_map.get(match.trna_type.lower(), "tRNA")

    attributes = GFFAttributes(
        id=f"{match.seqid}.tRNA{match.num}",
        parent=parent_ids,
        note=notes,
        custom={
            "secondary_structure": ss.ss,
            "anticodon": match.anticodon,
            "amino_acid": match.trna_type,
        }
    )

    return GFFRecord(
        seqid=match.seqid,
        source=source,
        type=feature_type,
        start=start,
        end=end,
        score=match.infernal_score,
        strand=strand,
        phase=Phase.NOT_CDS,
        attributes=attributes,
        parents=parents
    )
Exemplo n.º 8
0
def get_non_canon_stop_codon(
    seqid: str,
    start: int,
    end: int,
    strand: Strand,
    codon: str,
    parent_id: Optional[str],
) -> GFFRecord:
    """ Builds a "stop_codon" record flagged as non-canonical.

    The codon sequence is stored in the custom "codon" attribute and, when
    ``parent_id`` is given, the id is stored under "cds_parent". The record
    has source "gffpal", no score, and carries the ontology term
    "SO:0000319" plus an explanatory note.
    """

    if parent_id is None:
        custom = {"codon": codon}
    else:
        custom = {"codon": codon, "cds_parent": parent_id}

    attributes = GFFAttributes(
        ontology_term=["SO:0000319"],
        note=["Non-canonical stop codon"],
        custom=custom,
    )

    return GFFRecord(
        seqid,
        "gffpal",
        "stop_codon",
        start,
        end,
        None,
        strand,
        Phase.NOT_CDS,
        attributes,
    )
Exemplo n.º 9
0
def match_to_gene(
    match: TRNAScanRecord,
    source: str,
    type: str
) -> GFFRecord:
    """ Creates the gene-level feature spanning a tRNA match.

    Coordinates and strand come from ``fix_strand(match.start, match.end)``;
    the ID is built from the seqid, the feature type and the match number.
    """

    start, end, strand = fix_strand(match.start, match.end)
    gene_id = f"{match.seqid}.{type}{match.num}"

    return GFFRecord(
        seqid=match.seqid,
        source=source,
        type=type,
        start=start,
        end=end,
        score=match.infernal_score,
        strand=strand,
        phase=Phase.NOT_CDS,
        attributes=GFFAttributes(id=gene_id),
    )
Exemplo n.º 10
0
    def as_gff3(self, source="RepeatMasker"):
        """ Converts this hit into a GFF3 record.

        The feature type and ontology terms are derived from ``self.kind``
        (and, for simple repeats, from the repeat unit embedded in
        ``self.family``). Alignment statistics are stored as custom
        attributes, the repeat family becomes the Name and Target, and the
        percent divergence is used as the record score.

        Parameters
        ----------
        source
            Value for the GFF source column.

        Returns
        -------
        GFFRecord
            The record located at ``qstart``..``qend`` on ``query``.

        Raises
        ------
        AttributeError
            If the kind is "Simple_repeat" but the family does not look
            like "(UNIT)n" (the regular expression does not match).
        """

        custom = {
            "smith_waterman_score": str(self.score),
            "percent_divergence": str(self.perc_divergence),
            "percent_deletions": str(self.perc_deletions),
            "percent_insertions": str(self.perc_insertions),
            "family_consensus_length": str(self.tend + self.tremaining),
        }

        ontology_terms = []

        if self.better_hit:
            custom["has_better_overlapping_hit"] = "true"

        # Step 1: choose the feature type.
        if self.kind == "Simple_repeat":
            repeat_unit = re.match(r"\((?P<rep>.+)\)n",
                                   self.family).group("rep")

            custom["repeat_unit"] = repeat_unit

            # The repeat unit length decides which tandem repeat class
            # is reported.
            if len(repeat_unit) == 1:
                type_ = "monomeric_repeat"
                ontology_terms.extend(["SO:0001934", "SO:monomeric_repeat"])
            elif len(repeat_unit) < 10:
                type_ = "microsatellite"
                ontology_terms.extend(["SO:0000289", "SO:microsatellite"])
            else:
                type_ = "minisatellite"
                ontology_terms.extend(["SO:0000643", "SO:minisatellite"])

        elif self.kind == "Low_complexity":
            type_ = "low_complexity_region"
            ontology_terms.extend(["SO:0001005", "SO:low_complexity_region"])

        else:
            type_ = "repeat_region"
            ontology_terms.extend([
                "SO:0000657", "SO:repeat_region", "SO:0000347",
                "SO:nucleotide_match"
            ])
            custom["repeat_family"] = self.kind

        # Step 2: add the ontology terms implied by the repeat class. At
        # most one of these branches applies.
        if self.kind == "Other/DNA_virus":
            ontology_terms.extend(["SO:0001041", "SO:viral_sequence"])

        elif self.kind.startswith("snRNA"):
            ontology_terms.extend(["SO:0001268", "SO:snRNA_gene"])

        elif self.kind.startswith("tRNA"):
            ontology_terms.extend(["SO:0001272", "SO:tRNA_gene"])

        elif self.kind.startswith("rRNA"):
            ontology_terms.extend(["SO:0001637", "SO:rRNA_gene"])

        elif self.kind.startswith("scRNA"):
            ontology_terms.extend(["SO:0001266", "SO:scRNA_gene"])

        elif self.kind.startswith("Segmental"):
            ontology_terms.extend(["SO:1000035", "SO:duplication"])

        elif self.kind.startswith("Satellite"):
            # Satellites get their own type, so the generic repeat_family
            # attribute set in step 1 is dropped again.
            type_ = "satellite_DNA"
            ontology_terms.extend(["SO:0000005", "SO:satellite_DNA"])
            del custom["repeat_family"]

        elif self.kind.startswith("Retrotransposon"):
            ontology_terms.extend(["SO:0000180", "SO:retrotransposon"])

        elif self.kind.startswith("DNA"):
            ontology_terms.extend(["SO:0000182", "SO:DNA_transposon"])

        elif self.kind.startswith("LTR"):
            ontology_terms.extend(["SO:0000186", "SO:LTR_retrotransposon"])

        elif self.kind.startswith("LINE"):
            ontology_terms.extend(["SO:0000194", "SO:LINE_element"])

        elif self.kind.startswith("SINE"):
            ontology_terms.extend(["SO:0000206", "SO:SINE_element"])

        # Step 3: case-insensitive sub-class checks; these may add terms
        # on top of the ones chosen above.
        kind_lower = self.kind.lower()

        if "helitron" in kind_lower:
            ontology_terms.extend(["SO:0000544", "SO:helitron"])

        if "maverick" in kind_lower or "polinton" in kind_lower:
            ontology_terms.extend(["SO:0001170", "SO:polinton"])

        if "mite" in kind_lower:
            ontology_terms.extend(["SO:0000338", "SO:MITE"])

        # The suffixes must be lower case because they are compared with
        # the lower-cased kind; with upper-case suffixes this test could
        # never succeed.
        if kind_lower.endswith(("/p", "p-fungi")):
            ontology_terms.extend(["SO:0001535", "SO:p_element"])

        attributes = GFFAttributes(
            name=self.family,
            target=Target(self.family, self.tstart, self.tend),
            ontology_term=ontology_terms,
            custom=custom,
        )

        # These feature types are reported without a strand. Note that
        # "duplication" is never assigned to type_ in this method.
        if type_ in {
                "monomeric_repeat", "microsatellite", "minisatellite",
                "low_complexity_region", "satellite_DNA", "duplication"
        }:
            strand = Strand.UNSTRANDED
        else:
            strand = Strand.parse(self.strand)

        record = GFFRecord(
            seqid=self.query,
            source=source,
            type=type_,
            start=self.qstart,
            end=self.qend,
            score=self.perc_divergence,
            strand=strand,
            attributes=attributes,
        )
        return record
Exemplo n.º 11
0
def parse_block(handle, source, type_):
    """ Parses one alignment block into GFF records, one per sequence id.

    Lines are consumed from ``handle`` up to and including the first line
    starting with "//". "#=GF ID" sets the record name, "#=GF DE" lines
    are collected as notes, and "#=GF TP" lines are split on ";" and
    collected under the custom "species" attribute (joined with ":").
    Every other non-blank line not starting with "#" contributes its first
    whitespace-separated token as a sequence id, which ``id_to_loc`` turns
    into a location.

    Parameters
    ----------
    handle
        Iterable of text lines. It is only advanced as far as the block
        terminator, so it can be reused for the next block.
    source
        Value for the GFF source column.
    type_
        Value for the GFF type column.

    Returns
    -------
    list
        One GFFRecord per distinct sequence id, in order of first
        appearance. Empty if the block contains no sequence lines.
    """

    note = []
    name = None
    species = []

    ids = []

    for line in handle:
        sline = line.strip()
        if line.startswith("//"):
            break

        if line.startswith("#=GF ID"):
            name = sline.split("ID", maxsplit=1)[-1].strip()

        elif line.startswith("#=GF DE"):
            note.append(sline.split("DE", maxsplit=1)[-1].strip())

        elif line.startswith("#=GF TP"):
            species.extend(
                sline.split("TP", maxsplit=1)[-1].strip().split(";")
            )

        # Blank lines have no first token; skip them instead of failing
        # with an IndexError.
        elif sline and not line.startswith("#"):
            ids.append(line.split(maxsplit=1)[0])

    out = []

    # dict.fromkeys drops repeated ids while keeping first-seen order.
    # (The previous code added the builtin `id` to its "seen" set, so
    # duplicates were never actually filtered.)
    for id_ in dict.fromkeys(ids):
        seqid, start, end, strand = id_to_loc(id_)

        if len(species) == 0:
            custom = {}
        else:
            custom = {"species": ":".join(species)}

        attributes = GFFAttributes(
            name=name,
            note=note,
            custom=custom,
        )

        record = GFFRecord(
            seqid=seqid,
            source=source,
            type=type_,
            start=start,
            end=end,
            score=None,
            strand=strand,
            attributes=attributes,
        )

        out.append(record)

    return out