Exemplo n.º 1
0
    def _parse_alignment_line(
            line: str) -> Tuple[str, str, int, str, int, int, Optional[int]]:
        """ Split one alignment row into its typed fields.

        The stripped line is split on MULTISPACE_REGEX into at most six
        tokens, which are labelled in order as type, id, ali_start,
        sequence, ali_end and length.

        Returns:
            A tuple of (type, id, ali_start, sequence, ali_end, length,
            seq_begin). `seq_begin` is the end offset of the ALI_REGEX
            match against the unstripped `line`, or None when ALI_REGEX
            does not match.

        Raises:
            LineParseError: If the length value is missing (None).
            Field-level failures are reported by get_and_parse / raise_it,
            which are defined outside this block.
        """
        labels = ["type", "id", "ali_start", "sequence", "ali_end", "length"]
        tokens = MULTISPACE_REGEX.split(line.strip(), maxsplit=5)
        dline = dict(zip(labels, tokens))

        # Drop any leading "(" and trailing ")" from the length token.
        length = fmap(lambda tok: tok.lstrip("(").rstrip(")"),
                      dline.get("length"))
        if length is None:
            raise LineParseError(
                f"Missing 'length' from alignment line: '{line}'.")

        # Matched against the original line so the offset includes any
        # leading whitespace that was stripped for tokenising.
        ali_match = ALI_REGEX.match(line)
        seq_begin: Optional[int] = (
            ali_match.end() if ali_match is not None else None)

        # Fields are parsed in column order so the first bad column is the
        # one that gets reported.
        kind = get_and_parse("type", "type", is_one_of(["T", "Q"]))(dline)
        ident = get_and_parse("id", "id", parse_str)(dline)
        ali_start = get_and_parse("ali_start", "ali_start", parse_int)(dline)
        sequence = get_and_parse("sequence", "sequence", parse_str)(dline)
        ali_end = get_and_parse("ali_end", "ali_end", parse_int)(dline)
        length_parser = raise_it(parse_field(parse_int, "length", "field"))
        parsed_length = length_parser(length)

        return (kind, ident, ali_start, sequence, ali_end, parsed_length,
                seq_begin)
Exemplo n.º 2
0
    def _parse_probab_line(
        field: str
    ) -> Tuple[float, float, float, int, float, float, float, Optional[float]]:
        """ Parse the whitespace-separated statistics row of a hit.

        The stripped row is split on MULTISPACE_REGEX and the tokens are
        paired, in order, with the keys Probab, E-value, Score,
        Aligned_cols, Identities, Similarity, Sum_probs and Template_Neff.
        zip() stops at the shorter sequence, so trailing keys may be absent.

        Returns:
            (probability, evalue, score, aligned_cols, identity, similarity,
            sum_probs, template_neff). `identity` has any trailing "%"
            removed and is divided by 100. `template_neff` is None when no
            Template_Neff token was present.
        """
        keys = [
            "Probab",
            "E-value",
            "Score",
            "Aligned_cols",
            "Identities",
            "Similarity",
            "Sum_probs",
            "Template_Neff",
        ]
        tokens = MULTISPACE_REGEX.split(field.strip())

        # NOTE(review): split_at_eq(parse_str, key) is presumably what
        # checks the key and keeps the text after "=" -- confirm at its
        # definition.
        dline = {}
        for token, key in zip(tokens, keys):
            extract = raise_it(parse_field(split_at_eq(parse_str, key), key))
            dline[key] = extract(token)

        # The optional column is handled first, before the required ones.
        template_neff: Optional[float] = None
        if "Template_Neff" in dline:
            neff_parser = raise_it(parse_field(parse_float, "template_neff"))
            template_neff = neff_parser(dline["Template_Neff"])

        probability = get_and_parse(
            "Probab", "probability", parse_float)(dline)
        evalue = get_and_parse("E-value", "evalue", parse_float)(dline)
        score = get_and_parse("Score", "score", parse_float)(dline)
        aligned_cols = get_and_parse(
            "Aligned_cols", "aligned_cols", parse_int)(dline)
        # Identities carries a "%" suffix; convert it to a fraction.
        identity = get_and_parse(
            "Identities", "identity",
            lambda pct: parse_float(pct.rstrip("%")))(dline) / 100.0
        similarity = get_and_parse(
            "Similarity", "similarity", parse_float)(dline)
        sum_probs = get_and_parse("Sum_probs", "sum_probs", parse_float)(dline)

        return (
            probability,
            evalue,
            score,
            aligned_cols,
            identity,
            similarity,
            sum_probs,
            template_neff,
        )
Exemplo n.º 3
0
    def from_line(cls, line: str) -> "LOCALIZER":
        """ Parse a tab-separated LOCALIZER result line as an object.

        The line must hold exactly four tab-separated columns: the name,
        followed by the fields handed to parse_tp_field for "chloroplast"
        and "mitochondria", and the field handed to parse_nuc_field.

        Raises:
            LineParseError: If the line is the empty string or does not
            split into exactly four columns.
        """

        if line == "":
            raise LineParseError("The line was empty.")

        fields = [col.strip() for col in line.strip().split("\t")]
        n_fields = len(fields)

        if n_fields != 4:
            raise LineParseError("The line had the wrong number of columns. "
                                 f"Expected 4 but got {n_fields}")

        name_field, cp_field, mt_field, nuc_field = fields

        cp, cp_prob, cp_start, cp_end = parse_tp_field(
            cp_field, "chloroplast")
        mt, mt_prob, mt_start, mt_end = parse_tp_field(
            mt_field, "mitochondria")
        nuc, nuc_sigs = parse_nuc_field(nuc_field)

        # The name is validated last, after the three prediction fields.
        name = raise_it(parse_field(parse_str, "name"))(name_field)

        # NOTE(review): start positions are shifted by -1 + 20; the reason
        # for these constants is not visible here -- confirm upstream.
        def shift_start(pos):
            return pos - 1 + 20

        return cls(
            name,
            cp,
            cp_prob,
            fmap(shift_start, cp_start),
            cp_end,
            mt,
            mt_prob,
            fmap(shift_start, mt_start),
            mt_end,
            nuc,
            nuc_sigs,
        )
Exemplo n.º 4
0
    field_name: str = "active_site",
) -> str:
    """ """

    field = field.strip()
    if not field.startswith("predicted_active_site"):
        raise LineParseError(
            f"Invalid value: '{field}' in the column: '{field_name}'. "
            "Must have the form 'predicted_active_site[1,2,3]'.")

    field = field[len("predicted_active_site"):]
    sfield = (f.strip("[],; ") for f in field.split('['))
    return ';'.join(f.replace(' ', '') for f in sfield if len(f) > 0)


# Field parsers for the "ps_"-prefixed record columns. Each one wraps a value
# parser (parse_str / parse_int / parse_float / parse_bool) in parse_field,
# which is also given the column name, and then in raise_it.
# NOTE(review): parse_field and raise_it are defined outside this view;
# presumably the result is a callable that returns the parsed value or raises
# on bad input -- confirm at their definition.
ps_name = raise_it(parse_field(parse_str, "name"))
ps_ali_start = raise_it(parse_field(parse_int, "ali_start"))
ps_ali_end = raise_it(parse_field(parse_int, "ali_end"))
ps_env_start = raise_it(parse_field(parse_int, "env_start"))
ps_env_end = raise_it(parse_field(parse_int, "env_end"))
ps_hmm = raise_it(parse_field(parse_str, "hmm"))
ps_hmm_name = raise_it(parse_field(parse_str, "hmm_name"))
ps_hmm_type = raise_it(parse_field(parse_str, "hmm_type"))
ps_hmm_start = raise_it(parse_field(parse_int, "hmm_start"))
ps_hmm_end = raise_it(parse_field(parse_int, "hmm_end"))
ps_hmm_len = raise_it(parse_field(parse_int, "hmm_len"))
ps_bitscore = raise_it(parse_field(parse_float, "bitscore"))
ps_evalue = raise_it(parse_field(parse_float, "evalue"))
# parse_bool is configured with the two literals "1" and "0"; presumably
# "1" maps to True and "0" to False -- confirm the argument order.
ps_is_significant = raise_it(
    parse_field(parse_bool("1", "0"), "is_significant"))
Exemplo n.º 5
0
#!/usr/bin/env python3

from typing import TextIO
from typing import Iterator

from predectorutils.analyses.base import Analysis
from predectorutils.parsers import (FieldParseError, LineParseError, raise_it,
                                    parse_field, parse_str, parse_float,
                                    is_one_of)

# Only the ApoplastP class is exported from this module.
__all__ = ["ApoplastP"]

# Column parsers for the "name", "prediction" and "prob" fields. Each wraps a
# value parser in parse_field (with the column name) and raise_it.
# NOTE(review): presumably these raise on bad input instead of returning an
# error object -- confirm against predectorutils.parsers.
apo_name = raise_it(parse_field(parse_str, "name"))
# is_one_of is given the two accepted prediction labels.
apo_prediction = raise_it(
    parse_field(is_one_of(["Apoplastic", "Non-apoplastic"]), "prediction"))
apo_prob = raise_it(parse_field(parse_float, "prob"))


class ApoplastP(Analysis):
    """ One ApoplastP result row: a name, a prediction label and a prob. """

    analysis = "apoplastp"
    software = "ApoplastP"
    columns = ["name", "prediction", "prob"]
    types = [str, str, float]

    def __init__(self, name: str, prediction: str, prob: float) -> None:
        """ Store the three column values on the instance. """
        self.name, self.prediction, self.prob = name, prediction, prob
Exemplo n.º 6
0
# Fixed ordering of GFF3 attribute keys.
# NOTE(review): the name suggests this is the order in which these keys are
# emitted when a record is written -- confirm at the use site.
GFF3_WRITE_ORDER: List[str] = [
    "ID",
    "Name",
    "Alias",
    "Parent",
    "Target",
    "Gap",
    "Derives_from",
    "Note",
    "Dbxref",
    "Ontology_term",
    "Is_circular",
]

# Parsers for the record columns seqid, source, type, start, end, score,
# strand and phase. Each wraps a value parser in parse_field (with the column
# name) and raise_it; both helpers are defined outside this view.
rec_seqid = raise_it(parse_field(parse_str, "seqid"))
rec_source = raise_it(parse_field(parse_str, "source"))
rec_type = raise_it(parse_field(parse_str, "type"))
rec_start = raise_it(parse_field(parse_int, "start"))
rec_end = raise_it(parse_field(parse_int, "end"))
# parse_or_none is given "." alongside parse_float; presumably "." yields
# None and anything else is parsed as a float -- confirm at its definition.
rec_score = raise_it(parse_field(parse_or_none(parse_float, "."), "score"))
# strand and phase are restricted to the literal values listed here.
rec_strand = raise_it(parse_field(is_one_of(["-", "+", ".", "?"]), "strand"))
rec_phase = raise_it(parse_field(is_one_of(["0", "1", "2", "."]), "phase"))


def parse_attr_list(string: str) -> List[str]:
    """ Split a comma-separated attribute value into stripped items.

    Leading and trailing commas and spaces are removed from the whole
    string before splitting; each item is then stripped of surrounding
    whitespace. Empty items in the middle are kept, and an empty input
    yields a single empty string.
    """
    trimmed = string.strip(", ")
    return [item.strip() for item in trimmed.split(",")]


attr_is_circular = raise_it(
    parse_field(
Exemplo n.º 7
0
#!/usr/bin/env python3

from typing import TextIO
from typing import Iterator

from predectorutils.analyses.base import Analysis
from predectorutils.parsers import (FieldParseError, LineParseError,
                                    parse_field, raise_it, parse_str,
                                    parse_float, is_one_of)

# Column parsers with the "dl_" prefix: a name, a categorical prediction and
# one float column per category. Each wraps a value parser in parse_field
# (with the column name) and raise_it.
# NOTE(review): presumably these raise on bad input instead of returning an
# error object -- confirm against predectorutils.parsers.
dl_name = raise_it(parse_field(parse_str, "name"))
# is_one_of is given the full list of accepted prediction labels.
dl_prediction = raise_it(
    parse_field(
        is_one_of([
            "Membrane", "Nucleus", "Cytoplasm", "Extracellular",
            "Mitochondrion", "Cell_membrane", "Endoplasmic_reticulum",
            "Plastid", "Golgi_apparatus", "Lysosome/Vacuole", "Peroxisome"
        ]), "prediction"))
dl_membrane = raise_it(parse_field(parse_float, "membrane"))
dl_nucleus = raise_it(parse_field(parse_float, "nucleus"))
dl_cytoplasm = raise_it(parse_field(parse_float, "cytoplasm"))
dl_extracellular = raise_it(parse_field(parse_float, "extracellular"))
dl_mitochondrion = raise_it(parse_field(parse_float, "mitochondrion"))
dl_cell_membrane = raise_it(parse_field(parse_float, "cell_membrane"))
dl_endoplasmic_reticulum = raise_it(
    parse_field(parse_float, "endoplasmic_reticulum"))
dl_plastid = raise_it(parse_field(parse_float, "plastid"))
dl_golgi_apparatus = raise_it(parse_field(parse_float, "golgi_apparatus"))
# The column is named "lysosome_vacuole" although the label accepted above is
# "Lysosome/Vacuole".
dl_lysosome = raise_it(parse_field(parse_float, "lysosome_vacuole"))
dl_peroxisome = raise_it(parse_field(parse_float, "peroxisome"))
Exemplo n.º 8
0
 def _parse_query_length_line(field: str) -> int:
     """ Parse the integer value of a "Match_columns" header line.

     The value parser is built from split_at_multispace(parse_int,
     "Match_columns") and reported under the field name "query_length".
     NOTE(review): split_at_multispace is defined elsewhere; presumably it
     separates the key from the value on runs of whitespace -- confirm.
     """
     value_parser = split_at_multispace(parse_int, "Match_columns")
     line_parser = raise_it(parse_field(value_parser, "query_length"))
     return line_parser(field)
Exemplo n.º 9
0
from typing import Optional

from predectorutils.gff import (GFFRecord, GFFAttributes, Strand, Target, Gap,
                                GapCode, GapElement)
from predectorutils.analyses.base import Analysis, GFFAble
from predectorutils.parsers import (
    FieldParseError,
    LineParseError,
    parse_field,
    raise_it,
    parse_str,
    parse_float,
    parse_int,
)

# Column parsers with the "mm_" prefix, one per column of a tabular
# query/target alignment record. Each wraps a value parser in parse_field
# (with the column name) and raise_it.
# NOTE(review): presumably these raise on bad input instead of returning an
# error object -- confirm against predectorutils.parsers.
mm_query = raise_it(parse_field(parse_str, "query"))
mm_target = raise_it(parse_field(parse_str, "target"))
mm_qstart = raise_it(parse_field(parse_int, "qstart"))
mm_qend = raise_it(parse_field(parse_int, "qend"))
mm_qlen = raise_it(parse_field(parse_int, "qlen"))
mm_tstart = raise_it(parse_field(parse_int, "tstart"))
mm_tend = raise_it(parse_field(parse_int, "tend"))
mm_tlen = raise_it(parse_field(parse_int, "tlen"))
mm_evalue = raise_it(parse_field(parse_float, "evalue"))
mm_gapopen = raise_it(parse_field(parse_int, "gapopen"))
mm_pident = raise_it(parse_field(parse_float, "pident"))
mm_alnlen = raise_it(parse_field(parse_int, "alnlen"))
mm_raw = raise_it(parse_field(parse_float, "raw"))
mm_bits = raise_it(parse_field(parse_float, "bits"))
mm_cigar = raise_it(parse_field(parse_str, "cigar"))
mm_mismatch = raise_it(parse_field(parse_int, "mismatch"))
Exemplo n.º 10
0
from predectorutils.gff import (
    GFFRecord,
    GFFAttributes,
    Strand,
)
from predectorutils.analyses.base import Analysis, GFFAble
from predectorutils.analyses.base import str_or_none
from predectorutils.parsers import (FieldParseError, LineParseError, raise_it,
                                    parse_field, parse_str, parse_float,
                                    parse_int, parse_bool, parse_regex,
                                    MULTISPACE_REGEX, is_one_of, is_value)

# Public classes exported from this module.
__all__ = ["SignalP3NN", "SignalP3HMM", "SignalP4", "SignalP5"]

# Column parsers with the "s3nn_" prefix. The cmax, ymax and smax groups each
# have a float score, an integer position and a Y/N decision; smean has a
# score and a decision only. Each parser wraps a value parser in parse_field
# (with the column name) and raise_it.
# NOTE(review): parse_bool("Y", "N") presumably maps "Y" to True and "N" to
# False -- confirm the argument order in predectorutils.parsers.
s3nn_name = raise_it(parse_field(parse_str, "name"))
s3nn_cmax = raise_it(parse_field(parse_float, "cmax"))
s3nn_cmax_pos = raise_it(parse_field(parse_int, "cmax_pos"))
s3nn_cmax_decision = raise_it(
    parse_field(parse_bool("Y", "N"), "cmax_decision"))
s3nn_ymax = raise_it(parse_field(parse_float, "ymax"))
s3nn_ymax_pos = raise_it(parse_field(parse_int, "ymax_pos"))
s3nn_ymax_decision = raise_it(
    parse_field(parse_bool("Y", "N"), "ymax_decision"))
s3nn_smax = raise_it(parse_field(parse_float, "smax"))
s3nn_smax_pos = raise_it(parse_field(parse_int, "smax_pos"))
s3nn_smax_decision = raise_it(
    parse_field(parse_bool("Y", "N"), "smax_decision"))
s3nn_smean = raise_it(parse_field(parse_float, "smean"))
s3nn_smean_decision = raise_it(
    parse_field(parse_bool("Y", "N"), "smean_decision"))
Exemplo n.º 11
0
from typing import Optional
from typing import TextIO
from typing import Iterator

from predectorutils.gff import (
    GFFRecord,
    GFFAttributes,
    Strand,
)
from predectorutils.analyses.base import Analysis, GFFAble
from predectorutils.analyses.base import float_or_none, str_or_none
from predectorutils.parsers import (FieldParseError, LineParseError,
                                    parse_field, raise_it, parse_str,
                                    parse_regex, parse_float, is_one_of)

# Column parsers with the "tp_" prefix. Each wraps a value parser in
# parse_field (with the column name) and raise_it.
# NOTE(review): presumably these raise on bad input instead of returning an
# error object -- confirm against predectorutils.parsers.
tp_name = raise_it(parse_field(parse_str, "name"))
# This prediction parser accepts "noTP" in addition to the labels accepted by
# pl_prediction below.
tp_prediction = raise_it(
    parse_field(is_one_of(["OTHER", "noTP", "SP", "mTP", "cTP", "luTP"]),
                "prediction"))
tp_other = raise_it(parse_field(parse_float, "OTHER"))
tp_sp = raise_it(parse_field(parse_float, "SP"))
tp_mtp = raise_it(parse_field(parse_float, "mTP"))

# Column parsers with the "pl_" prefix: a prediction parser without "noTP",
# plus float parsers for the "cTP" and "luTP" columns.
pl_prediction = raise_it(
    parse_field(is_one_of(["OTHER", "SP", "mTP", "cTP", "luTP"]),
                "prediction"))
pl_ctp = raise_it(parse_field(parse_float, "cTP"))
pl_lutp = raise_it(parse_field(parse_float, "luTP"))

CS_POS_REGEX = re.compile(r"CS\s+pos:\s+\d+-(?P<cs>\d+)\.?\s+"
                          r"[A-Za-z]+-[A-Za-z]+\.?\s+"
Exemplo n.º 12
0
    parse_str,
    parse_float,
    parse_int,
    MULTISPACE_REGEX,
)


def split_hmm(s: str) -> Union[ValueParseError, str]:
    """ Return `s` cut at its last ".hmm", or the error from parse_str.

    `s` is first passed through parse_str; if that yields a
    ValueParseError the error object is returned unchanged. Otherwise the
    text before the right-most ".hmm" is returned (the whole string when
    ".hmm" does not occur).
    """
    checked = parse_str(s)
    if isinstance(checked, ValueParseError):
        return checked

    # NOTE(review): the split is applied to the raw input `s`, not to the
    # value returned by parse_str -- confirm this is intentional.
    head = s.rsplit(".hmm", maxsplit=1)[0]
    return head


# Column parsers with the "hm_" prefix. Each wraps a value parser in
# parse_field (with the column name) and raise_it. The trailing comments on
# the first four lines give the corresponding names used by the source table.
# NOTE(review): presumably these raise on bad input instead of returning an
# error object -- confirm at the definition of raise_it.
hm_name = raise_it(parse_field(parse_str, "name"))  # query name
hm_hmm = raise_it(parse_field(split_hmm, "hmm"))  # target name
hm_hmm_len = raise_it(parse_field(parse_int, "hmm_len"))  # tlen
hm_query_len = raise_it(parse_field(parse_int, "query_len"))  # qlen
hm_full_evalue = raise_it(parse_field(parse_float, "full_evalue"))
hm_full_score = raise_it(parse_field(parse_float, "full_score"))
hm_full_bias = raise_it(parse_field(parse_float, "full_bias"))
hm_nmatches = raise_it(parse_field(parse_int, "nmatches"))
hm_domain_c_evalue = raise_it(parse_field(parse_float, "domain_c_evalue"))
hm_domain_i_evalue = raise_it(parse_field(parse_float, "domain_i_evalue"))
hm_domain_score = raise_it(parse_field(parse_float, "domain_score"))
hm_domain_bias = raise_it(parse_field(parse_float, "domain_bias"))
hm_hmm_from = raise_it(parse_field(parse_int, "hmm_from"))
hm_hmm_to = raise_it(parse_field(parse_int, "hmm_to"))
hm_query_from = raise_it(parse_field(parse_int, "query_from"))
hm_query_to = raise_it(parse_field(parse_int, "query_to"))
Exemplo n.º 13
0
 def _parse_query_neff_line(field: str) -> float:
     """ Parse the float value of a "Neff" header line.

     The value parser is built from split_at_multispace(parse_float, "Neff")
     and reported under the field name "query_neff".
     NOTE(review): split_at_multispace is defined elsewhere; presumably it
     separates the key from the value on runs of whitespace -- confirm.
     """
     value_parser = split_at_multispace(parse_float, "Neff")
     line_parser = raise_it(parse_field(value_parser, "query_neff"))
     return line_parser(field)
Exemplo n.º 14
0
from predectorutils.parsers import (
    FieldParseError,
    LineParseError,
    raise_it,
    parse_field,
    parse_str,
    parse_float,
    parse_int,
    parse_or_none,
    is_one_of
)

# Only the DeepSig class is exported from this module.
__all__ = ["DeepSig"]


# Column parsers for the "name", "prediction", "prob" and "cs_pos" fields.
# Each wraps a value parser in parse_field (with the column name) and
# raise_it.
ds_name = raise_it(parse_field(parse_str, "name"))
# is_one_of is given the three accepted prediction labels.
ds_prediction = raise_it(parse_field(
    is_one_of(["SignalPeptide", "Transmembrane", "Other"]),
    "prediction"
))
ds_prob = raise_it(parse_field(parse_float, "prob"))
# parse_or_none is given "-" alongside parse_int; presumably "-" yields None
# and anything else is parsed as an int -- confirm at its definition.
ds_cs_pos = raise_it(parse_field(parse_or_none(parse_int, "-"), "cs_pos"))


class DeepSig(Analysis, GFFAble):

    """     """

    columns = ["name", "prediction", "prob", "cs_pos"]
    types = [str, str, float, int_or_none]
    analysis = "deepsig"
Exemplo n.º 15
0
from typing import Optional

from predectorutils.gff import (GFFRecord, Strand)
from predectorutils.analyses.base import Analysis, GFFAble
from predectorutils.parsers import (
    FieldParseError,
    LineParseError,
    parse_field,
    raise_it,
    parse_str,
    parse_float,
    parse_int,
    split_at_eq,
)

# Column parsers with the "tm_" prefix. Apart from the name, every value
# parser is built with split_at_eq and a key ("len", "ExpAA", "First60",
# "PredHel", "Topology"), and is reported under the snake_case column name.
# NOTE(review): split_at_eq is defined elsewhere; presumably it expects
# "<key>=<value>" tokens and parses the value part -- confirm.
tm_name = raise_it(parse_field(parse_str, "name"))
tm_length = raise_it(parse_field(split_at_eq(parse_int, "len"), "length"))
tm_exp_aa = raise_it(parse_field(split_at_eq(parse_float, "ExpAA"), "exp_aa"))
tm_first_60 = raise_it(
    parse_field(split_at_eq(parse_float, "First60"), "first_60"))
tm_pred_hel = raise_it(
    parse_field(split_at_eq(parse_int, "PredHel"), "pred_hel"))
tm_topology = raise_it(
    parse_field(split_at_eq(parse_str, "Topology"), "topology"))


def parse_topology(string: str) -> List[Tuple[int, int]]:
    parts = re.findall(r"(?P<tag>[ncio])(?P<start>\d+)-(?P<end>\d+)", string)
    out = []
    for tag, start, end in parts:
        assert tag in ("i", "o"), string
Exemplo n.º 16
0
#!/usr/bin/env python3

from typing import TextIO
from typing import Iterator
from typing import Optional

from predectorutils.gff import GFFRecord, GFFAttributes, Strand
from predectorutils.analyses.base import Analysis, GFFAble
from predectorutils.parsers import (FieldParseError, LineParseError,
                                    parse_field, raise_it, parse_str,
                                    parse_int)

# Column parsers with the "re_" prefix for the fields name, kind, pattern,
# match, start and end. Each wraps a value parser in parse_field (with the
# column name) and raise_it.
# NOTE: the "re_" prefix is only a naming convention here; these names are
# unrelated to the standard-library `re` module.
re_name = raise_it(parse_field(parse_str, "name"))
re_kind = raise_it(parse_field(parse_str, "kind"))
re_pattern = raise_it(parse_field(parse_str, "pattern"))
re_match = raise_it(parse_field(parse_str, "match"))
re_start = raise_it(parse_field(parse_int, "start"))
re_end = raise_it(parse_field(parse_int, "end"))


class RegexAnalysis(Analysis, GFFAble):

    columns = ["name", "kind", "pattern", "match", "start", "end"]

    types = [str, str, str, str, int, int]

    analysis = "regex"
    software = "predutils"

    def __init__(
        self,
Exemplo n.º 17
0
#!/usr/bin/env python3

from typing import TextIO
from typing import Iterator

from predectorutils.analyses.base import Analysis
from predectorutils.parsers import (FieldParseError, LineParseError,
                                    parse_field, raise_it, parse_str,
                                    parse_float, is_one_of)

# Column parsers for the "name", "s_score" and "prediction" fields. Each
# wraps a value parser in parse_field (with the column name) and raise_it.
dre_name = raise_it(parse_field(parse_str, "name"))
dre_s_score = raise_it(parse_field(parse_float, "s_score"))
# is_one_of is given the two accepted prediction labels (lower case).
dre_prediction = raise_it(
    parse_field(is_one_of(["effector", "non-effector"]), "prediction"))


class Deepredeff(Analysis):
    """ """
    columns = [
        "name",
        "s_score",
        "prediction",
    ]

    types = [
        str,
        float,
        str,
    ]

    software = "deepredeff"
Exemplo n.º 18
0
#!/usr/bin/env python3

import re

from typing import Optional
from typing import TextIO
from typing import Iterator

from predectorutils.analyses.base import Analysis
from predectorutils.parsers import (FieldParseError, LineParseError, raise_it,
                                    parse_field, parse_regex, parse_str,
                                    parse_float, is_one_of)
from predectorutils.analyses.base import float_or_none

# Column parsers for the "name", "prediction" and "prob" fields. Each wraps a
# value parser in parse_field (with the column name) and raise_it.
e1_name = raise_it(parse_field(parse_str, "name"))
# is_one_of is given the two accepted prediction labels (capitalised).
e1_prediction = raise_it(
    parse_field(is_one_of(["Effector", "Non-effector"]), "prediction"))
e1_prob = raise_it(parse_field(parse_float, "prob"))


class EffectorP1(Analysis):
    """ """

    columns = ["name", "prediction", "prob"]
    types = [str, str, float]
    analysis = "effectorp1"
    software = "EffectorP"

    def __init__(self, name: str, prediction: str, prob: float) -> None:
        self.name = name
        self.prediction = prediction
Exemplo n.º 19
0
 def _parse_query_line(field: str) -> str:
     """ Parse the string value of a "Query" header line.

     The value parser is built from split_at_multispace(parse_str, "Query")
     and reported under the field name "query".
     NOTE(review): split_at_multispace is defined elsewhere; presumably it
     separates the key from the value on runs of whitespace -- confirm.
     """
     value_parser = split_at_multispace(parse_str, "Query")
     line_parser = raise_it(parse_field(value_parser, "query"))
     return line_parser(field)