Пример #1
0
    def _parse_alignment_line(
            line: str) -> Tuple[str, str, int, str, int, int, Optional[int]]:
        """ Split one alignment line into its typed fields.

        The stripped line is split on MULTISPACE_REGEX into at most six
        columns: type, id, ali_start, sequence, ali_end and length. The
        length column has leading "(" and trailing ")" removed before it is
        parsed as an integer.

        Returns a tuple of
        (type, id, ali_start, sequence, ali_end, length, seq_begin), where
        seq_begin is the end offset of the ALI_REGEX match against the
        unstripped line, or None if the regex does not match.

        Raises LineParseError when the length column is absent.
        """

        def strip_parens(text):
            return text.lstrip("(").rstrip(")")

        names = ("type", "id", "ali_start", "sequence", "ali_end", "length")
        tokens = MULTISPACE_REGEX.split(line.strip(), maxsplit=5)
        # zip() stops at the shorter input, so short lines simply lack keys.
        fields = dict(zip(names, tokens))

        length = fmap(strip_parens, fields.get("length"))
        if length is None:
            raise LineParseError(
                f"Missing 'length' from alignment line: '{line}'.")

        # Matched against the raw line so the offset indexes into `line`.
        ali_match = ALI_REGEX.match(line)
        seq_begin: Optional[int] = (
            None if ali_match is None else ali_match.end()
        )

        kind = get_and_parse("type", "type", is_one_of(["T", "Q"]))(fields)
        ident = get_and_parse("id", "id", parse_str)(fields)
        ali_start = get_and_parse("ali_start", "ali_start", parse_int)(fields)
        sequence = get_and_parse("sequence", "sequence", parse_str)(fields)
        ali_end = get_and_parse("ali_end", "ali_end", parse_int)(fields)
        parsed_length = raise_it(
            parse_field(parse_int, "length", "field"))(length)

        return (kind, ident, ali_start, sequence, ali_end, parsed_length,
                seq_begin)
Пример #2
0
    def _parse_probab_line(
        field: str
    ) -> Tuple[float, float, float, int, float, float, float, Optional[float]]:
        """ Parse the whitespace-separated statistics line of a hit.

        Tokens are paired positionally with the keys Probab, E-value, Score,
        Aligned_cols, Identities, Similarity, Sum_probs and Template_Neff;
        each token is passed through `split_at_eq` with its key
        (presumably a `key=value` split -- confirm against the parsers
        module).

        Returns a tuple of (probability, evalue, score, aligned_cols,
        identity, similarity, sum_probs, template_neff). The identity has a
        trailing "%" removed and is divided by 100. template_neff is None
        when the line carries fewer than eight tokens.
        """

        def parse_percent(text):
            return parse_float(text.rstrip("%"))

        keys = (
            "Probab",
            "E-value",
            "Score",
            "Aligned_cols",
            "Identities",
            "Similarity",
            "Sum_probs",
            "Template_Neff",
        )
        tokens = MULTISPACE_REGEX.split(field.strip())

        # zip() truncates to the shorter input, so a missing trailing
        # Template_Neff token just leaves that key out of the mapping.
        raw = {
            key: raise_it(parse_field(split_at_eq(parse_str, key), key))(token)
            for key, token in zip(keys, tokens)
        }

        template_neff: Optional[float] = None
        if "Template_Neff" in raw:
            template_neff = raise_it(
                parse_field(parse_float,
                            "template_neff"))(raw["Template_Neff"])

        probability = get_and_parse("Probab", "probability", parse_float)(raw)
        evalue = get_and_parse("E-value", "evalue", parse_float)(raw)
        score = get_and_parse("Score", "score", parse_float)(raw)
        aligned_cols = get_and_parse(
            "Aligned_cols", "aligned_cols", parse_int)(raw)
        identity = get_and_parse(
            "Identities", "identity", parse_percent)(raw) / 100.0
        similarity = get_and_parse("Similarity", "similarity", parse_float)(raw)
        sum_probs = get_and_parse("Sum_probs", "sum_probs", parse_float)(raw)

        return (
            probability,
            evalue,
            score,
            aligned_cols,
            identity,
            similarity,
            sum_probs,
            template_neff,
        )
Пример #3
0
    def from_line(cls, line: str) -> "LOCALIZER":
        """ Parse a LOCALIZER result line as an object.

        The line must hold exactly four tab-separated columns: the sequence
        name, the chloroplast field, the mitochondria field and the nucleus
        field. Surrounding whitespace is stripped from the line and from
        every column.

        Raises LineParseError if the line is empty or does not have four
        columns.
        """

        def shift_start(pos):
            # NOTE(review): the original applies "- 1 + 20" to both transit
            # peptide start positions; the rationale is not visible here.
            return pos - 1 + 20

        if line == "":
            raise LineParseError("The line was empty.")

        columns = list(map(str.strip, line.strip().split("\t")))
        ncols = len(columns)

        if ncols != 4:
            raise LineParseError("The line had the wrong number of columns. "
                                 f"Expected 4 but got {ncols}")

        raw_name, raw_cp, raw_mt, raw_nuc = columns

        cp, cp_prob, cp_start, cp_end = parse_tp_field(raw_cp, "chloroplast")
        mt, mt_prob, mt_start, mt_end = parse_tp_field(raw_mt, "mitochondria")
        nuc, nuc_sigs = parse_nuc_field(raw_nuc)

        # The name is parsed last, after the three localisation fields.
        name = raise_it(parse_field(parse_str, "name"))(raw_name)

        return cls(
            name,
            cp,
            cp_prob,
            fmap(shift_start, cp_start),
            cp_end,
            mt,
            mt_prob,
            fmap(shift_start, mt_start),
            mt_end,
            nuc,
            nuc_sigs,
        )
Пример #4
0
    field_name: str = "active_site",
) -> str:
    """ """

    field = field.strip()
    if not field.startswith("predicted_active_site"):
        raise LineParseError(
            f"Invalid value: '{field}' in the column: '{field_name}'. "
            "Must have the form 'predicted_active_site[1,2,3]'.")

    field = field[len("predicted_active_site"):]
    sfield = (f.strip("[],; ") for f in field.split('['))
    return ';'.join(f.replace(' ', '') for f in sfield if len(f) > 0)


# Column parsers: one callable per field, each built by passing a value parser
# (parse_str / parse_int / parse_float / parse_bool) and the field name to
# parse_field and wrapping the result in raise_it.
# NOTE(review): parse_field and raise_it are defined outside this view;
# presumably the callable takes the raw column text and raises on a parse
# failure instead of returning the error -- confirm.
ps_name = raise_it(parse_field(parse_str, "name"))
ps_ali_start = raise_it(parse_field(parse_int, "ali_start"))
ps_ali_end = raise_it(parse_field(parse_int, "ali_end"))
ps_env_start = raise_it(parse_field(parse_int, "env_start"))
ps_env_end = raise_it(parse_field(parse_int, "env_end"))
ps_hmm = raise_it(parse_field(parse_str, "hmm"))
ps_hmm_name = raise_it(parse_field(parse_str, "hmm_name"))
ps_hmm_type = raise_it(parse_field(parse_str, "hmm_type"))
ps_hmm_start = raise_it(parse_field(parse_int, "hmm_start"))
ps_hmm_end = raise_it(parse_field(parse_int, "hmm_end"))
ps_hmm_len = raise_it(parse_field(parse_int, "hmm_len"))
ps_bitscore = raise_it(parse_field(parse_float, "bitscore"))
ps_evalue = raise_it(parse_field(parse_float, "evalue"))
# NOTE(review): parse_bool("1", "0") presumably maps "1" to True and "0" to
# False -- confirm the argument order.
ps_is_significant = raise_it(
    parse_field(parse_bool("1", "0"), "is_significant"))
Пример #5
0
#!/usr/bin/env python3

from typing import TextIO
from typing import Iterator

from predectorutils.analyses.base import Analysis
from predectorutils.parsers import (FieldParseError, LineParseError, raise_it,
                                    parse_field, parse_str, parse_float,
                                    is_one_of)

__all__ = ["ApoplastP"]

# Column parsers for the three ApoplastP columns (name, prediction, prob).
# NOTE(review): raise_it / parse_field / is_one_of come from
# predectorutils.parsers and are not shown here; presumably each callable
# takes the raw column text, is_one_of restricts the value to the listed
# labels, and a parse failure is raised -- confirm.
apo_name = raise_it(parse_field(parse_str, "name"))
apo_prediction = raise_it(
    parse_field(is_one_of(["Apoplastic", "Non-apoplastic"]), "prediction"))
apo_prob = raise_it(parse_field(parse_float, "prob"))


class ApoplastP(Analysis):
    """ A single ApoplastP prediction record.

    Holds the three values named in `columns`: the sequence name, the
    predicted label and the associated probability.
    """

    columns = ["name", "prediction", "prob"]
    types = [str, str, float]
    analysis = "apoplastp"
    software = "ApoplastP"

    def __init__(self, name: str, prediction: str, prob: float) -> None:
        """ Store the column values on the instance unchanged. """
        self.name, self.prediction, self.prob = name, prediction, prob
Пример #6
0
# Attribute tag names in a fixed order.
# NOTE(review): judging by the name this is the order in which these GFF3
# attribute tags are emitted when a record is written; the consumer is not
# visible here -- confirm.
GFF3_WRITE_ORDER: List[str] = [
    "ID",
    "Name",
    "Alias",
    "Parent",
    "Target",
    "Gap",
    "Derives_from",
    "Note",
    "Dbxref",
    "Ontology_term",
    "Is_circular",
]

# Column parsers for the record fields seqid, source, type, start, end, score,
# strand and phase.
# NOTE(review): raise_it / parse_field / parse_or_none / is_one_of are defined
# outside this view; presumably each callable takes the raw column text and
# raises on a parse failure -- confirm.
rec_seqid = raise_it(parse_field(parse_str, "seqid"))
rec_source = raise_it(parse_field(parse_str, "source"))
rec_type = raise_it(parse_field(parse_str, "type"))
rec_start = raise_it(parse_field(parse_int, "start"))
rec_end = raise_it(parse_field(parse_int, "end"))
# NOTE(review): parse_or_none(parse_float, ".") presumably yields None for the
# "." placeholder and a float otherwise -- confirm.
rec_score = raise_it(parse_field(parse_or_none(parse_float, "."), "score"))
rec_strand = raise_it(parse_field(is_one_of(["-", "+", ".", "?"]), "strand"))
rec_phase = raise_it(parse_field(is_one_of(["0", "1", "2", "."]), "phase"))


def parse_attr_list(string: str) -> List[str]:
    """ Split a comma-separated attribute value into stripped items.

    Leading and trailing commas and spaces are removed from the whole string
    first, then every comma-delimited item is stripped of surrounding
    whitespace. Empty items in the middle are kept, and an empty input yields
    a list holding one empty string.
    """
    trimmed = string.strip(", ")
    return [item.strip() for item in trimmed.split(",")]


attr_is_circular = raise_it(
    parse_field(
Пример #7
0
#!/usr/bin/env python3

from typing import TextIO
from typing import Iterator

from predectorutils.analyses.base import Analysis
from predectorutils.parsers import (FieldParseError, LineParseError,
                                    parse_field, raise_it, parse_str,
                                    parse_float, is_one_of)

# Column parsers: the sequence name, the predicted location label, and one
# float column per location.
# NOTE(review): raise_it / parse_field / is_one_of come from
# predectorutils.parsers and are not shown here; presumably each callable
# takes the raw column text, is_one_of restricts the value to the listed
# labels, and a parse failure is raised -- confirm.
dl_name = raise_it(parse_field(parse_str, "name"))
dl_prediction = raise_it(
    parse_field(
        is_one_of([
            "Membrane", "Nucleus", "Cytoplasm", "Extracellular",
            "Mitochondrion", "Cell_membrane", "Endoplasmic_reticulum",
            "Plastid", "Golgi_apparatus", "Lysosome/Vacuole", "Peroxisome"
        ]), "prediction"))
dl_membrane = raise_it(parse_field(parse_float, "membrane"))
dl_nucleus = raise_it(parse_field(parse_float, "nucleus"))
dl_cytoplasm = raise_it(parse_field(parse_float, "cytoplasm"))
dl_extracellular = raise_it(parse_field(parse_float, "extracellular"))
dl_mitochondrion = raise_it(parse_field(parse_float, "mitochondrion"))
dl_cell_membrane = raise_it(parse_field(parse_float, "cell_membrane"))
dl_endoplasmic_reticulum = raise_it(
    parse_field(parse_float, "endoplasmic_reticulum"))
dl_plastid = raise_it(parse_field(parse_float, "plastid"))
dl_golgi_apparatus = raise_it(parse_field(parse_float, "golgi_apparatus"))
# The "Lysosome/Vacuole" label is reported under the field name
# "lysosome_vacuole".
dl_lysosome = raise_it(parse_field(parse_float, "lysosome_vacuole"))
dl_peroxisome = raise_it(parse_field(parse_float, "peroxisome"))
Пример #8
0
 def _parse_query_length_line(field: str) -> int:
     """ Parse the integer carried by a "Match_columns" line.

     The value parser is `split_at_multispace(parse_int, "Match_columns")`
     (presumably it separates the "Match_columns" key from its value on
     whitespace -- confirm); failures are reported under the field name
     "query_length".
     """
     value_parser = split_at_multispace(parse_int, "Match_columns")
     parse_line = raise_it(parse_field(value_parser, "query_length"))
     return parse_line(field)
Пример #9
0
from typing import Optional

from predectorutils.gff import (GFFRecord, GFFAttributes, Strand, Target, Gap,
                                GapCode, GapElement)
from predectorutils.analyses.base import Analysis, GFFAble
from predectorutils.parsers import (
    FieldParseError,
    LineParseError,
    parse_field,
    raise_it,
    parse_str,
    parse_float,
    parse_int,
)

# Column parsers, one per alignment column: string parsers for query, target
# and cigar, float parsers for evalue, pident, raw and bits, and integer
# parsers for the remaining coordinate and count columns.
# NOTE(review): raise_it / parse_field come from predectorutils.parsers and
# are not shown here; presumably each callable takes the raw column text and
# raises on a parse failure -- confirm.
mm_query = raise_it(parse_field(parse_str, "query"))
mm_target = raise_it(parse_field(parse_str, "target"))
mm_qstart = raise_it(parse_field(parse_int, "qstart"))
mm_qend = raise_it(parse_field(parse_int, "qend"))
mm_qlen = raise_it(parse_field(parse_int, "qlen"))
mm_tstart = raise_it(parse_field(parse_int, "tstart"))
mm_tend = raise_it(parse_field(parse_int, "tend"))
mm_tlen = raise_it(parse_field(parse_int, "tlen"))
mm_evalue = raise_it(parse_field(parse_float, "evalue"))
mm_gapopen = raise_it(parse_field(parse_int, "gapopen"))
mm_pident = raise_it(parse_field(parse_float, "pident"))
mm_alnlen = raise_it(parse_field(parse_int, "alnlen"))
mm_raw = raise_it(parse_field(parse_float, "raw"))
mm_bits = raise_it(parse_field(parse_float, "bits"))
mm_cigar = raise_it(parse_field(parse_str, "cigar"))
mm_mismatch = raise_it(parse_field(parse_int, "mismatch"))
Пример #10
0
from predectorutils.gff import (
    GFFRecord,
    GFFAttributes,
    Strand,
)
from predectorutils.analyses.base import Analysis, GFFAble
from predectorutils.analyses.base import str_or_none
from predectorutils.parsers import (FieldParseError, LineParseError, raise_it,
                                    parse_field, parse_str, parse_float,
                                    parse_int, parse_bool, parse_regex,
                                    MULTISPACE_REGEX, is_one_of, is_value)

__all__ = ["SignalP3NN", "SignalP3HMM", "SignalP4", "SignalP5"]

# Column parsers with the s3nn_ prefix: for each of cmax, ymax and smax there
# is a float score, an integer position and a Y/N decision; smean has a score
# and a decision only.
# NOTE(review): raise_it / parse_field / parse_bool come from
# predectorutils.parsers and are not shown here; presumably each callable
# takes the raw column text and raises on a parse failure, and
# parse_bool("Y", "N") maps "Y" to True and "N" to False -- confirm.
s3nn_name = raise_it(parse_field(parse_str, "name"))
s3nn_cmax = raise_it(parse_field(parse_float, "cmax"))
s3nn_cmax_pos = raise_it(parse_field(parse_int, "cmax_pos"))
s3nn_cmax_decision = raise_it(
    parse_field(parse_bool("Y", "N"), "cmax_decision"))
s3nn_ymax = raise_it(parse_field(parse_float, "ymax"))
s3nn_ymax_pos = raise_it(parse_field(parse_int, "ymax_pos"))
s3nn_ymax_decision = raise_it(
    parse_field(parse_bool("Y", "N"), "ymax_decision"))
s3nn_smax = raise_it(parse_field(parse_float, "smax"))
s3nn_smax_pos = raise_it(parse_field(parse_int, "smax_pos"))
s3nn_smax_decision = raise_it(
    parse_field(parse_bool("Y", "N"), "smax_decision"))
s3nn_smean = raise_it(parse_field(parse_float, "smean"))
s3nn_smean_decision = raise_it(
    parse_field(parse_bool("Y", "N"), "smean_decision"))
Пример #11
0
from typing import Optional
from typing import TextIO
from typing import Iterator

from predectorutils.gff import (
    GFFRecord,
    GFFAttributes,
    Strand,
)
from predectorutils.analyses.base import Analysis, GFFAble
from predectorutils.analyses.base import float_or_none, str_or_none
from predectorutils.parsers import (FieldParseError, LineParseError,
                                    parse_field, raise_it, parse_str,
                                    parse_regex, parse_float, is_one_of)

# Column parsers with the tp_ prefix. The tp_ prediction parser lists "noTP"
# among its labels.
# NOTE(review): raise_it / parse_field / is_one_of come from
# predectorutils.parsers and are not shown here; presumably each callable
# takes the raw column text, is_one_of restricts the value to the listed
# labels, and a parse failure is raised -- confirm.
tp_name = raise_it(parse_field(parse_str, "name"))
tp_prediction = raise_it(
    parse_field(is_one_of(["OTHER", "noTP", "SP", "mTP", "cTP", "luTP"]),
                "prediction"))
tp_other = raise_it(parse_field(parse_float, "OTHER"))
tp_sp = raise_it(parse_field(parse_float, "SP"))
tp_mtp = raise_it(parse_field(parse_float, "mTP"))

# Column parsers with the pl_ prefix: a prediction parser whose label list
# omits "noTP", plus float parsers for the cTP and luTP columns.
pl_prediction = raise_it(
    parse_field(is_one_of(["OTHER", "SP", "mTP", "cTP", "luTP"]),
                "prediction"))
pl_ctp = raise_it(parse_field(parse_float, "cTP"))
pl_lutp = raise_it(parse_field(parse_float, "luTP"))

CS_POS_REGEX = re.compile(r"CS\s+pos:\s+\d+-(?P<cs>\d+)\.?\s+"
                          r"[A-Za-z]+-[A-Za-z]+\.?\s+"
Пример #12
0
    parse_str,
    parse_float,
    parse_int,
    MULTISPACE_REGEX,
)


def split_hmm(s: str) -> Union[ValueParseError, str]:
    """ Validate `s` with parse_str, then cut it at its last ".hmm".

    If parse_str reports a ValueParseError that error is returned unchanged.
    Otherwise the part of `s` before the last occurrence of ".hmm" is
    returned; a string without ".hmm" comes back whole.

    NOTE(review): the result is derived from the raw `s`, not from the value
    parse_str returned -- confirm that this is intended.
    """
    checked = parse_str(s)
    if not isinstance(checked, ValueParseError):
        head, *_ = s.rsplit(".hmm", maxsplit=1)
        return head
    return checked


# Column parsers, one per field. The trailing comments on the first four give
# the corresponding column names of the parsed table.
# NOTE(review): raise_it / parse_field are defined outside this view;
# presumably each callable takes the raw column text and raises on a parse
# failure -- confirm.
hm_name = raise_it(parse_field(parse_str, "name"))  # query name
# split_hmm drops everything from the last ".hmm" onwards.
hm_hmm = raise_it(parse_field(split_hmm, "hmm"))  # target name
hm_hmm_len = raise_it(parse_field(parse_int, "hmm_len"))  # tlen
hm_query_len = raise_it(parse_field(parse_int, "query_len"))  # qlen
hm_full_evalue = raise_it(parse_field(parse_float, "full_evalue"))
hm_full_score = raise_it(parse_field(parse_float, "full_score"))
hm_full_bias = raise_it(parse_field(parse_float, "full_bias"))
hm_nmatches = raise_it(parse_field(parse_int, "nmatches"))
hm_domain_c_evalue = raise_it(parse_field(parse_float, "domain_c_evalue"))
hm_domain_i_evalue = raise_it(parse_field(parse_float, "domain_i_evalue"))
hm_domain_score = raise_it(parse_field(parse_float, "domain_score"))
hm_domain_bias = raise_it(parse_field(parse_float, "domain_bias"))
hm_hmm_from = raise_it(parse_field(parse_int, "hmm_from"))
hm_hmm_to = raise_it(parse_field(parse_int, "hmm_to"))
hm_query_from = raise_it(parse_field(parse_int, "query_from"))
hm_query_to = raise_it(parse_field(parse_int, "query_to"))
Пример #13
0
 def _parse_query_neff_line(field: str) -> float:
     """ Parse the float carried by a "Neff" line.

     The value parser is `split_at_multispace(parse_float, "Neff")`
     (presumably it separates the "Neff" key from its value on whitespace
     -- confirm); failures are reported under the field name "query_neff".
     """
     value_parser = split_at_multispace(parse_float, "Neff")
     parse_line = raise_it(parse_field(value_parser, "query_neff"))
     return parse_line(field)
Пример #14
0
from predectorutils.parsers import (
    FieldParseError,
    LineParseError,
    raise_it,
    parse_field,
    parse_str,
    parse_float,
    parse_int,
    parse_or_none,
    is_one_of
)

__all__ = ["DeepSig"]


# Column parsers for the four DeepSig columns (name, prediction, prob,
# cs_pos).
# NOTE(review): raise_it / parse_field / is_one_of / parse_or_none come from
# predectorutils.parsers and are not shown here; presumably each callable
# takes the raw column text and raises on a parse failure -- confirm.
ds_name = raise_it(parse_field(parse_str, "name"))
ds_prediction = raise_it(parse_field(
    is_one_of(["SignalPeptide", "Transmembrane", "Other"]),
    "prediction"
))
ds_prob = raise_it(parse_field(parse_float, "prob"))
# NOTE(review): parse_or_none(parse_int, "-") presumably yields None for the
# "-" placeholder and an int otherwise -- confirm.
ds_cs_pos = raise_it(parse_field(parse_or_none(parse_int, "-"), "cs_pos"))


class DeepSig(Analysis, GFFAble):

    """ Record type for the "deepsig" analysis.

    The columns are name, prediction, prob and cs_pos; per `types`, cs_pos
    uses int_or_none while the others are str, str and float.
    """

    columns = ["name", "prediction", "prob", "cs_pos"]
    types = [str, str, float, int_or_none]
    analysis = "deepsig"
Пример #15
0
from typing import Optional

from predectorutils.gff import (GFFRecord, Strand)
from predectorutils.analyses.base import Analysis, GFFAble
from predectorutils.parsers import (
    FieldParseError,
    LineParseError,
    parse_field,
    raise_it,
    parse_str,
    parse_float,
    parse_int,
    split_at_eq,
)

# Column parsers. Apart from the name, every parser is built with split_at_eq
# and the key it expects (len, ExpAA, First60, PredHel, Topology).
# NOTE(review): raise_it / parse_field / split_at_eq come from
# predectorutils.parsers and are not shown here; presumably split_at_eq
# splits a "key=value" token and checks the key, and a parse failure is
# raised -- confirm.
tm_name = raise_it(parse_field(parse_str, "name"))
tm_length = raise_it(parse_field(split_at_eq(parse_int, "len"), "length"))
tm_exp_aa = raise_it(parse_field(split_at_eq(parse_float, "ExpAA"), "exp_aa"))
tm_first_60 = raise_it(
    parse_field(split_at_eq(parse_float, "First60"), "first_60"))
tm_pred_hel = raise_it(
    parse_field(split_at_eq(parse_int, "PredHel"), "pred_hel"))
tm_topology = raise_it(
    parse_field(split_at_eq(parse_str, "Topology"), "topology"))


def parse_topology(string: str) -> List[Tuple[int, int]]:
    parts = re.findall(r"(?P<tag>[ncio])(?P<start>\d+)-(?P<end>\d+)", string)
    out = []
    for tag, start, end in parts:
        assert tag in ("i", "o"), string
Пример #16
0
#!/usr/bin/env python3

from typing import TextIO
from typing import Iterator
from typing import Optional

from predectorutils.gff import GFFRecord, GFFAttributes, Strand
from predectorutils.analyses.base import Analysis, GFFAble
from predectorutils.parsers import (FieldParseError, LineParseError,
                                    parse_field, raise_it, parse_str,
                                    parse_int)

# Column parsers for the six regex-analysis columns: four strings (name, kind,
# pattern, match) followed by the integer start and end.
# NOTE(review): raise_it / parse_field come from predectorutils.parsers and
# are not shown here; presumably each callable takes the raw column text and
# raises on a parse failure -- confirm.
re_name = raise_it(parse_field(parse_str, "name"))
re_kind = raise_it(parse_field(parse_str, "kind"))
re_pattern = raise_it(parse_field(parse_str, "pattern"))
re_match = raise_it(parse_field(parse_str, "match"))
re_start = raise_it(parse_field(parse_int, "start"))
re_end = raise_it(parse_field(parse_int, "end"))


class RegexAnalysis(Analysis, GFFAble):

    columns = ["name", "kind", "pattern", "match", "start", "end"]

    types = [str, str, str, str, int, int]

    analysis = "regex"
    software = "predutils"

    def __init__(
        self,
Пример #17
0
#!/usr/bin/env python3

from typing import TextIO
from typing import Iterator

from predectorutils.analyses.base import Analysis
from predectorutils.parsers import (FieldParseError, LineParseError,
                                    parse_field, raise_it, parse_str,
                                    parse_float, is_one_of)

# Column parsers for the three deepredeff columns (name, s_score, prediction).
# NOTE(review): raise_it / parse_field / is_one_of come from
# predectorutils.parsers and are not shown here; presumably each callable
# takes the raw column text, is_one_of restricts the value to the listed
# labels, and a parse failure is raised -- confirm.
dre_name = raise_it(parse_field(parse_str, "name"))
dre_s_score = raise_it(parse_field(parse_float, "s_score"))
dre_prediction = raise_it(
    parse_field(is_one_of(["effector", "non-effector"]), "prediction"))


class Deepredeff(Analysis):
    """ Record type for output of the "deepredeff" software.

    The columns are name (str), s_score (float) and prediction (str).
    """
    columns = [
        "name",
        "s_score",
        "prediction",
    ]

    types = [
        str,
        float,
        str,
    ]

    software = "deepredeff"
Пример #18
0
#!/usr/bin/env python3

import re

from typing import Optional
from typing import TextIO
from typing import Iterator

from predectorutils.analyses.base import Analysis
from predectorutils.parsers import (FieldParseError, LineParseError, raise_it,
                                    parse_field, parse_regex, parse_str,
                                    parse_float, is_one_of)
from predectorutils.analyses.base import float_or_none

# Column parsers for the three EffectorP1 columns (name, prediction, prob).
# NOTE(review): raise_it / parse_field / is_one_of come from
# predectorutils.parsers and are not shown here; presumably each callable
# takes the raw column text, is_one_of restricts the value to the listed
# labels, and a parse failure is raised -- confirm.
e1_name = raise_it(parse_field(parse_str, "name"))
e1_prediction = raise_it(
    parse_field(is_one_of(["Effector", "Non-effector"]), "prediction"))
e1_prob = raise_it(parse_field(parse_float, "prob"))


class EffectorP1(Analysis):
    """ """

    columns = ["name", "prediction", "prob"]
    types = [str, str, float]
    analysis = "effectorp1"
    software = "EffectorP"

    def __init__(self, name: str, prediction: str, prob: float) -> None:
        self.name = name
        self.prediction = prediction
Пример #19
0
 def _parse_query_line(field: str) -> str:
     """ Parse the string carried by a "Query" line.

     The value parser is `split_at_multispace(parse_str, "Query")`
     (presumably it separates the "Query" key from its value on whitespace
     -- confirm); failures are reported under the field name "query".
     """
     value_parser = split_at_multispace(parse_str, "Query")
     parse_line = raise_it(parse_field(value_parser, "query"))
     return parse_line(field)