def _parse_alignment_line(
    line: str
) -> Tuple[str, str, int, str, int, int, Optional[int]]:
    """Parse one alignment row into its typed fields.

    The stripped line is split on MULTISPACE_REGEX into at most six
    columns: type, id, ali_start, sequence, ali_end and length. Leading
    "(" and trailing ")" characters are removed from the length column
    before it is parsed as an integer.

    Returns:
        A tuple (type, id, ali_start, sequence, ali_end, length,
        seq_begin). seq_begin is the end offset of the ALI_REGEX match
        against the unstripped line, or None when there is no match.

    Raises:
        LineParseError: if no length value is available for the line.
    """

    def strip_parens(value: str) -> str:
        return value.lstrip("(").rstrip(")")

    field_names = (
        "type", "id", "ali_start", "sequence", "ali_end", "length",
    )
    tokens = MULTISPACE_REGEX.split(line.strip(), maxsplit=5)
    # zip() stops at the shorter input, so a short line simply lacks the
    # trailing keys.
    fields = dict(zip(field_names, tokens))

    raw_length = fmap(strip_parens, fields.get("length"))
    if raw_length is None:
        raise LineParseError(
            f"Missing 'length' from alignment line: '{line}'.")

    # The match is made on the original line so the offset indexes into
    # the text exactly as it was passed in.
    prefix = ALI_REGEX.match(line)
    seq_begin: Optional[int] = None if prefix is None else prefix.end()

    kind = get_and_parse("type", "type", is_one_of(["T", "Q"]))(fields)
    ident = get_and_parse("id", "id", parse_str)(fields)
    ali_start = get_and_parse("ali_start", "ali_start", parse_int)(fields)
    sequence = get_and_parse("sequence", "sequence", parse_str)(fields)
    ali_end = get_and_parse("ali_end", "ali_end", parse_int)(fields)
    length = raise_it(parse_field(parse_int, "length", "field"))(raw_length)

    return (kind, ident, ali_start, sequence, ali_end, length, seq_begin)
def _parse_probab_line(
    field: str
) -> Tuple[float, float, float, int, float, float, float, Optional[float]]:
    """Parse a whitespace-separated statistics line into typed values.

    The stripped line is split on MULTISPACE_REGEX and the tokens are
    paired positionally with the keys Probab, E-value, Score,
    Aligned_cols, Identities, Similarity, Sum_probs and Template_Neff.
    Each token is passed through split_at_eq for its key.

    Returns:
        A tuple (probability, evalue, score, aligned_cols, identity,
        similarity, sum_probs, template_neff). identity has any trailing
        "%" removed and is divided by 100. template_neff is None when
        the line has fewer than eight tokens.
    """

    def percent(value: str):
        return parse_float(value.rstrip("%"))

    keys = (
        "Probab",
        "E-value",
        "Score",
        "Aligned_cols",
        "Identities",
        "Similarity",
        "Sum_probs",
        "Template_Neff",
    )
    tokens = MULTISPACE_REGEX.split(field.strip())

    # Pairing is positional; zip() drops keys that have no token.
    values = {}
    for token, key in zip(tokens, keys):
        extract = raise_it(parse_field(split_at_eq(parse_str, key), key))
        values[key] = extract(token)

    template_neff: Optional[float] = None
    if "Template_Neff" in values:
        template_neff = raise_it(
            parse_field(parse_float, "template_neff")
        )(values["Template_Neff"])

    probability = get_and_parse("Probab", "probability", parse_float)(values)
    evalue = get_and_parse("E-value", "evalue", parse_float)(values)
    score = get_and_parse("Score", "score", parse_float)(values)
    aligned_cols = get_and_parse(
        "Aligned_cols", "aligned_cols", parse_int)(values)
    identity = get_and_parse("Identities", "identity", percent)(values) / 100.0
    similarity = get_and_parse("Similarity", "similarity", parse_float)(values)
    sum_probs = get_and_parse("Sum_probs", "sum_probs", parse_float)(values)

    return (
        probability,
        evalue,
        score,
        aligned_cols,
        identity,
        similarity,
        sum_probs,
        template_neff,
    )
def from_line(cls, line: str) -> "LOCALIZER":
    """Build an instance from one tab-separated result line.

    The stripped line must contain exactly four tab-separated columns:
    the name, the chloroplast field, the mitochondria field and the
    nuclear field. The chloroplast and mitochondria columns go through
    parse_tp_field and the nuclear column through parse_nuc_field.

    Raises:
        LineParseError: if the line is the empty string or does not
            have four columns.
    """

    def shift(position):
        # NOTE(review): offset reproduced exactly as written; the reason
        # for "- 1 + 20" is not visible in this block.
        return position - 1 + 20

    if line == "":
        raise LineParseError("The line was empty.")

    columns = [column.strip() for column in line.strip().split("\t")]
    if len(columns) != 4:
        raise LineParseError("The line had the wrong number of columns. "
                             f"Expected 4 but got {len(columns)}")

    name_field, cp_field, mt_field, nuc_field = columns

    cp, cp_prob, cp_start, cp_end = parse_tp_field(cp_field, "chloroplast")
    mt, mt_prob, mt_start, mt_end = parse_tp_field(mt_field, "mitochondria")
    nuc, nuc_sigs = parse_nuc_field(nuc_field)

    # The name is parsed only after the three localisation columns,
    # which keeps the order in which parse errors surface.
    name = raise_it(parse_field(parse_str, "name"))(name_field)

    return cls(
        name,
        cp,
        cp_prob,
        fmap(shift, cp_start),
        cp_end,
        mt,
        mt_prob,
        fmap(shift, mt_start),
        mt_end,
        nuc,
        nuc_sigs,
    )
field_name: str = "active_site", ) -> str: """ """ field = field.strip() if not field.startswith("predicted_active_site"): raise LineParseError( f"Invalid value: '{field}' in the column: '{field_name}'. " "Must have the form 'predicted_active_site[1,2,3]'.") field = field[len("predicted_active_site"):] sfield = (f.strip("[],; ") for f in field.split('[')) return ';'.join(f.replace(' ', '') for f in sfield if len(f) > 0) ps_name = raise_it(parse_field(parse_str, "name")) ps_ali_start = raise_it(parse_field(parse_int, "ali_start")) ps_ali_end = raise_it(parse_field(parse_int, "ali_end")) ps_env_start = raise_it(parse_field(parse_int, "env_start")) ps_env_end = raise_it(parse_field(parse_int, "env_end")) ps_hmm = raise_it(parse_field(parse_str, "hmm")) ps_hmm_name = raise_it(parse_field(parse_str, "hmm_name")) ps_hmm_type = raise_it(parse_field(parse_str, "hmm_type")) ps_hmm_start = raise_it(parse_field(parse_int, "hmm_start")) ps_hmm_end = raise_it(parse_field(parse_int, "hmm_end")) ps_hmm_len = raise_it(parse_field(parse_int, "hmm_len")) ps_bitscore = raise_it(parse_field(parse_float, "bitscore")) ps_evalue = raise_it(parse_field(parse_float, "evalue")) ps_is_significant = raise_it( parse_field(parse_bool("1", "0"), "is_significant"))
#!/usr/bin/env python3 from typing import TextIO from typing import Iterator from predectorutils.analyses.base import Analysis from predectorutils.parsers import (FieldParseError, LineParseError, raise_it, parse_field, parse_str, parse_float, is_one_of) __all__ = ["ApoplastP"] apo_name = raise_it(parse_field(parse_str, "name")) apo_prediction = raise_it( parse_field(is_one_of(["Apoplastic", "Non-apoplastic"]), "prediction")) apo_prob = raise_it(parse_field(parse_float, "prob")) class ApoplastP(Analysis): """ """ columns = ["name", "prediction", "prob"] types = [str, str, float] analysis = "apoplastp" software = "ApoplastP" def __init__(self, name: str, prediction: str, prob: float) -> None: self.name = name self.prediction = prediction self.prob = prob return
GFF3_WRITE_ORDER: List[str] = [ "ID", "Name", "Alias", "Parent", "Target", "Gap", "Derives_from", "Note", "Dbxref", "Ontology_term", "Is_circular", ] rec_seqid = raise_it(parse_field(parse_str, "seqid")) rec_source = raise_it(parse_field(parse_str, "source")) rec_type = raise_it(parse_field(parse_str, "type")) rec_start = raise_it(parse_field(parse_int, "start")) rec_end = raise_it(parse_field(parse_int, "end")) rec_score = raise_it(parse_field(parse_or_none(parse_float, "."), "score")) rec_strand = raise_it(parse_field(is_one_of(["-", "+", ".", "?"]), "strand")) rec_phase = raise_it(parse_field(is_one_of(["0", "1", "2", "."]), "phase")) def parse_attr_list(string: str) -> List[str]: return list(f.strip() for f in string.strip(", ").split(",")) attr_is_circular = raise_it( parse_field(
#!/usr/bin/env python3 from typing import TextIO from typing import Iterator from predectorutils.analyses.base import Analysis from predectorutils.parsers import (FieldParseError, LineParseError, parse_field, raise_it, parse_str, parse_float, is_one_of) dl_name = raise_it(parse_field(parse_str, "name")) dl_prediction = raise_it( parse_field( is_one_of([ "Membrane", "Nucleus", "Cytoplasm", "Extracellular", "Mitochondrion", "Cell_membrane", "Endoplasmic_reticulum", "Plastid", "Golgi_apparatus", "Lysosome/Vacuole", "Peroxisome" ]), "prediction")) dl_membrane = raise_it(parse_field(parse_float, "membrane")) dl_nucleus = raise_it(parse_field(parse_float, "nucleus")) dl_cytoplasm = raise_it(parse_field(parse_float, "cytoplasm")) dl_extracellular = raise_it(parse_field(parse_float, "extracellular")) dl_mitochondrion = raise_it(parse_field(parse_float, "mitochondrion")) dl_cell_membrane = raise_it(parse_field(parse_float, "cell_membrane")) dl_endoplasmic_reticulum = raise_it( parse_field(parse_float, "endoplasmic_reticulum")) dl_plastid = raise_it(parse_field(parse_float, "plastid")) dl_golgi_apparatus = raise_it(parse_field(parse_float, "golgi_apparatus")) dl_lysosome = raise_it(parse_field(parse_float, "lysosome_vacuole")) dl_peroxisome = raise_it(parse_field(parse_float, "peroxisome"))
def _parse_query_length_line(field: str) -> int:
    """Parse the integer query length from a "Match_columns" line.

    The value parser is split_at_multispace(parse_int, "Match_columns"),
    registered with parse_field under the name "query_length" and
    wrapped in raise_it.
    """
    value_parser = split_at_multispace(parse_int, "Match_columns")
    parse_query_length = raise_it(parse_field(value_parser, "query_length"))
    return parse_query_length(field)
from typing import Optional

from predectorutils.gff import (GFFRecord, GFFAttributes, Strand, Target, Gap,
                                GapCode, GapElement)
from predectorutils.analyses.base import Analysis, GFFAble
from predectorutils.parsers import (
    FieldParseError,
    LineParseError,
    parse_field,
    raise_it,
    parse_str,
    parse_float,
    parse_int,
)

# One parser per column, built once at import time. Each combines a value
# parser with the column name via parse_field and wraps the result in
# raise_it (presumably so a parse failure is raised rather than returned;
# confirm in predectorutils.parsers).

# String identifiers.
mm_query = raise_it(parse_field(parse_str, "query"))
mm_target = raise_it(parse_field(parse_str, "target"))

# Integer start, end and length columns with a "q" prefix.
mm_qstart = raise_it(parse_field(parse_int, "qstart"))
mm_qend = raise_it(parse_field(parse_int, "qend"))
mm_qlen = raise_it(parse_field(parse_int, "qlen"))

# Integer start, end and length columns with a "t" prefix.
mm_tstart = raise_it(parse_field(parse_int, "tstart"))
mm_tend = raise_it(parse_field(parse_int, "tend"))
mm_tlen = raise_it(parse_field(parse_int, "tlen"))

# Remaining columns: evalue, pident, raw and bits are floats; gapopen,
# alnlen and mismatch are ints; cigar is a string.
mm_evalue = raise_it(parse_field(parse_float, "evalue"))
mm_gapopen = raise_it(parse_field(parse_int, "gapopen"))
mm_pident = raise_it(parse_field(parse_float, "pident"))
mm_alnlen = raise_it(parse_field(parse_int, "alnlen"))
mm_raw = raise_it(parse_field(parse_float, "raw"))
mm_bits = raise_it(parse_field(parse_float, "bits"))
mm_cigar = raise_it(parse_field(parse_str, "cigar"))
mm_mismatch = raise_it(parse_field(parse_int, "mismatch"))
from predectorutils.gff import (
    GFFRecord,
    GFFAttributes,
    Strand,
)
from predectorutils.analyses.base import Analysis, GFFAble
from predectorutils.analyses.base import str_or_none
from predectorutils.parsers import (FieldParseError, LineParseError, raise_it,
                                    parse_field, parse_str, parse_float,
                                    parse_int, parse_bool, parse_regex,
                                    MULTISPACE_REGEX, is_one_of, is_value)

__all__ = ["SignalP3NN", "SignalP3HMM", "SignalP4", "SignalP5"]

# One parser per column with the "s3nn_" prefix, built once at import
# time. Each combines a value parser with the column name via parse_field
# and wraps the result in raise_it (presumably so a parse failure is
# raised rather than returned; confirm in predectorutils.parsers).
#
# The cmax, ymax and smax groups each have a float score, an int position
# and a decision flag; smean has a score and a decision flag only. The
# decision flags use parse_bool("Y", "N") (presumably "Y" is True and "N"
# is False; confirm against parse_bool).
s3nn_name = raise_it(parse_field(parse_str, "name"))

s3nn_cmax = raise_it(parse_field(parse_float, "cmax"))
s3nn_cmax_pos = raise_it(parse_field(parse_int, "cmax_pos"))
s3nn_cmax_decision = raise_it(
    parse_field(parse_bool("Y", "N"), "cmax_decision"))

s3nn_ymax = raise_it(parse_field(parse_float, "ymax"))
s3nn_ymax_pos = raise_it(parse_field(parse_int, "ymax_pos"))
s3nn_ymax_decision = raise_it(
    parse_field(parse_bool("Y", "N"), "ymax_decision"))

s3nn_smax = raise_it(parse_field(parse_float, "smax"))
s3nn_smax_pos = raise_it(parse_field(parse_int, "smax_pos"))
s3nn_smax_decision = raise_it(
    parse_field(parse_bool("Y", "N"), "smax_decision"))

s3nn_smean = raise_it(parse_field(parse_float, "smean"))
s3nn_smean_decision = raise_it(
    parse_field(parse_bool("Y", "N"), "smean_decision"))
from typing import Optional from typing import TextIO from typing import Iterator from predectorutils.gff import ( GFFRecord, GFFAttributes, Strand, ) from predectorutils.analyses.base import Analysis, GFFAble from predectorutils.analyses.base import float_or_none, str_or_none from predectorutils.parsers import (FieldParseError, LineParseError, parse_field, raise_it, parse_str, parse_regex, parse_float, is_one_of) tp_name = raise_it(parse_field(parse_str, "name")) tp_prediction = raise_it( parse_field(is_one_of(["OTHER", "noTP", "SP", "mTP", "cTP", "luTP"]), "prediction")) tp_other = raise_it(parse_field(parse_float, "OTHER")) tp_sp = raise_it(parse_field(parse_float, "SP")) tp_mtp = raise_it(parse_field(parse_float, "mTP")) pl_prediction = raise_it( parse_field(is_one_of(["OTHER", "SP", "mTP", "cTP", "luTP"]), "prediction")) pl_ctp = raise_it(parse_field(parse_float, "cTP")) pl_lutp = raise_it(parse_field(parse_float, "luTP")) CS_POS_REGEX = re.compile(r"CS\s+pos:\s+\d+-(?P<cs>\d+)\.?\s+" r"[A-Za-z]+-[A-Za-z]+\.?\s+"
parse_str, parse_float, parse_int, MULTISPACE_REGEX, ) def split_hmm(s: str) -> Union[ValueParseError, str]: s1 = parse_str(s) if isinstance(s1, ValueParseError): return s1 else: return s.rsplit(".hmm", maxsplit=1)[0] hm_name = raise_it(parse_field(parse_str, "name")) # query name hm_hmm = raise_it(parse_field(split_hmm, "hmm")) # target name hm_hmm_len = raise_it(parse_field(parse_int, "hmm_len")) # tlen hm_query_len = raise_it(parse_field(parse_int, "query_len")) # qlen hm_full_evalue = raise_it(parse_field(parse_float, "full_evalue")) hm_full_score = raise_it(parse_field(parse_float, "full_score")) hm_full_bias = raise_it(parse_field(parse_float, "full_bias")) hm_nmatches = raise_it(parse_field(parse_int, "nmatches")) hm_domain_c_evalue = raise_it(parse_field(parse_float, "domain_c_evalue")) hm_domain_i_evalue = raise_it(parse_field(parse_float, "domain_i_evalue")) hm_domain_score = raise_it(parse_field(parse_float, "domain_score")) hm_domain_bias = raise_it(parse_field(parse_float, "domain_bias")) hm_hmm_from = raise_it(parse_field(parse_int, "hmm_from")) hm_hmm_to = raise_it(parse_field(parse_int, "hmm_to")) hm_query_from = raise_it(parse_field(parse_int, "query_from")) hm_query_to = raise_it(parse_field(parse_int, "query_to"))
def _parse_query_neff_line(field: str) -> float:
    """Parse the float query Neff value from a "Neff" line.

    The value parser is split_at_multispace(parse_float, "Neff"),
    registered with parse_field under the name "query_neff" and wrapped
    in raise_it.
    """
    value_parser = split_at_multispace(parse_float, "Neff")
    parse_query_neff = raise_it(parse_field(value_parser, "query_neff"))
    return parse_query_neff(field)
from predectorutils.parsers import (
    FieldParseError,
    LineParseError,
    raise_it,
    parse_field,
    parse_str,
    parse_float,
    parse_int,
    parse_or_none,
    is_one_of
)

__all__ = ["DeepSig"]

# One parser per column, built once at import time. Each combines a value
# parser with the column name via parse_field and wraps the result in
# raise_it (presumably so a parse failure is raised rather than returned;
# confirm in predectorutils.parsers).
ds_name = raise_it(parse_field(parse_str, "name"))
ds_prediction = raise_it(parse_field(
    is_one_of(["SignalPeptide", "Transmembrane", "Other"]),
    "prediction"
))
ds_prob = raise_it(parse_field(parse_float, "prob"))
# parse_or_none is given "-" as its second argument (presumably the
# placeholder that maps to None; confirm against parse_or_none).
ds_cs_pos = raise_it(parse_field(parse_or_none(parse_int, "-"), "cs_pos"))


class DeepSig(Analysis, GFFAble):

    """ A single DeepSig record with the columns name, prediction, prob
    and cs_pos.
    """

    # Column names and their value types, listed in the same order.
    columns = ["name", "prediction", "prob", "cs_pos"]
    types = [str, str, float, int_or_none]
    analysis = "deepsig"
from typing import Optional from predectorutils.gff import (GFFRecord, Strand) from predectorutils.analyses.base import Analysis, GFFAble from predectorutils.parsers import ( FieldParseError, LineParseError, parse_field, raise_it, parse_str, parse_float, parse_int, split_at_eq, ) tm_name = raise_it(parse_field(parse_str, "name")) tm_length = raise_it(parse_field(split_at_eq(parse_int, "len"), "length")) tm_exp_aa = raise_it(parse_field(split_at_eq(parse_float, "ExpAA"), "exp_aa")) tm_first_60 = raise_it( parse_field(split_at_eq(parse_float, "First60"), "first_60")) tm_pred_hel = raise_it( parse_field(split_at_eq(parse_int, "PredHel"), "pred_hel")) tm_topology = raise_it( parse_field(split_at_eq(parse_str, "Topology"), "topology")) def parse_topology(string: str) -> List[Tuple[int, int]]: parts = re.findall(r"(?P<tag>[ncio])(?P<start>\d+)-(?P<end>\d+)", string) out = [] for tag, start, end in parts: assert tag in ("i", "o"), string
#!/usr/bin/env python3 from typing import TextIO from typing import Iterator from typing import Optional from predectorutils.gff import GFFRecord, GFFAttributes, Strand from predectorutils.analyses.base import Analysis, GFFAble from predectorutils.parsers import (FieldParseError, LineParseError, parse_field, raise_it, parse_str, parse_int) re_name = raise_it(parse_field(parse_str, "name")) re_kind = raise_it(parse_field(parse_str, "kind")) re_pattern = raise_it(parse_field(parse_str, "pattern")) re_match = raise_it(parse_field(parse_str, "match")) re_start = raise_it(parse_field(parse_int, "start")) re_end = raise_it(parse_field(parse_int, "end")) class RegexAnalysis(Analysis, GFFAble): columns = ["name", "kind", "pattern", "match", "start", "end"] types = [str, str, str, str, int, int] analysis = "regex" software = "predutils" def __init__( self,
#!/usr/bin/env python3 from typing import TextIO from typing import Iterator from predectorutils.analyses.base import Analysis from predectorutils.parsers import (FieldParseError, LineParseError, parse_field, raise_it, parse_str, parse_float, is_one_of) dre_name = raise_it(parse_field(parse_str, "name")) dre_s_score = raise_it(parse_field(parse_float, "s_score")) dre_prediction = raise_it( parse_field(is_one_of(["effector", "non-effector"]), "prediction")) class Deepredeff(Analysis): """ """ columns = [ "name", "s_score", "prediction", ] types = [ str, float, str, ] software = "deepredeff"
#!/usr/bin/env python3 import re from typing import Optional from typing import TextIO from typing import Iterator from predectorutils.analyses.base import Analysis from predectorutils.parsers import (FieldParseError, LineParseError, raise_it, parse_field, parse_regex, parse_str, parse_float, is_one_of) from predectorutils.analyses.base import float_or_none e1_name = raise_it(parse_field(parse_str, "name")) e1_prediction = raise_it( parse_field(is_one_of(["Effector", "Non-effector"]), "prediction")) e1_prob = raise_it(parse_field(parse_float, "prob")) class EffectorP1(Analysis): """ """ columns = ["name", "prediction", "prob"] types = [str, str, float] analysis = "effectorp1" software = "EffectorP" def __init__(self, name: str, prediction: str, prob: float) -> None: self.name = name self.prediction = prediction
def _parse_query_line(field: str) -> str:
    """Parse the query string from a "Query" line.

    The value parser is split_at_multispace(parse_str, "Query"),
    registered with parse_field under the name "query" and wrapped in
    raise_it.
    """
    value_parser = split_at_multispace(parse_str, "Query")
    parse_query = raise_it(parse_field(value_parser, "query"))
    return parse_query(field)