Exemplo n.º 1
0
def test_text_regex(name: str,
                    regex_text: str,
                    test_expected_list: List[Tuple[str, List[str]]],
                    verbose: bool = False) -> None:
    """
    Check that a regex yields exactly the expected hits for each sample text.

    Args:
        name: label for the regex; used only in log and failure messages
        regex_text: regex source, handed to ``compile_regex``
        test_expected_list:
            pairs of ``(text, expected_hits)``; for each pair, the result of
            ``get_compiled_regex_results`` on ``text`` is compared for
            equality with ``expected_hits``
        verbose: also log the regex source at debug level?

    Raises:
        AssertionError: for the first sample whose results differ from the
            expected ones (only while assertions are enabled)
    """
    log.info(f"Testing regex named {name}")
    pattern = compile_regex(regex_text)
    if verbose:
        log.debug(f"... regex text:\n{regex_text}")
    for sample, wanted in test_expected_list:
        found = get_compiled_regex_results(pattern, sample)
        # The message is only built if the comparison fails.
        assert found == wanted, (
            f"Regex {name}: Expected {wanted}, got {found}, "
            f"when parsing {sample!r}. Regex text:\n{regex_text}]"
        )
    log.info("... OK")
Exemplo n.º 2
0
    def __init__(self,
                 nlpdef: NlpDefinition,
                 cfgsection: str,
                 regex_str: str,
                 variable: str,
                 target_unit: str,
                 units_to_factor: Dict[typing.Pattern, float],
                 take_absolute: bool = False,
                 commit: bool = False,
                 debug: bool = False) -> None:
        """
        Set up the parser: forward the common settings to the superclass,
        then compile the main regex and the units dictionary.

        This class operates with compiled regexes having this group format:
          - variable
          - tense_indicator
          - relation
          - value
          - units

        Args:
            nlpdef: passed through to the superclass
            cfgsection: passed through to the superclass
            regex_str: regex source; compiled with ``compile_regex`` and also
                passed to the superclass as ``regex_str_for_debugging``
            variable: passed through to the superclass
            target_unit: passed through to the superclass
            units_to_factor: dictionary, mapping
                FROM (regex for units)
                TO EITHER
                    - float [multiplier] to multiply those units by, to get
                      the preferred unit
                OR  - function taking text parameter and returning float
                      value in preferred unit

                - any units present in the regex but absent from
                  units_to_factor will lead the result to be ignored -- for
                  example, allowing you to ignore a relative neutrophil count
                  ("neutrophils 2.2%") while detecting absolute neutrophil
                  counts ("neutrophils 2.2"), or ignoring "docusate sodium
                  100mg" but detecting "sodium 140 mM".

                NOTE(review): the dictionary is run through
                ``compile_regex_dict`` below, so the keys are presumably
                regex *strings* rather than compiled patterns -- confirm
                against ``compile_regex_dict`` and the callers.
            take_absolute: converts negative values to positive ones.
                Typical text requiring this might look like:
                    CRP-4
                    CRP-106
                    CRP -97
                    Blood results for today as follows: Na- 142, K-4.1, ...
                ... occurring in 23 / 8054 for CRP of one test set in our
                data. For many quantities, we know that they cannot be
                negative, so this is just a notation rather than a minus
                sign. We have to account for it, or it'll distort our values.
                Preferable to account for it here rather than later; see
                manual.
            commit: passed through to the superclass
            debug: print the regex source (with the class name) to stdout?
        """
        # NB: the annotation above uses typing.Pattern, not typing.re.Pattern:
        # the typing.re namespace was removed in Python 3.13, and annotations
        # are evaluated when the function is defined.
        super().__init__(nlpdef=nlpdef,
                         cfgsection=cfgsection,
                         variable=variable,
                         target_unit=target_unit,
                         regex_str_for_debugging=regex_str,
                         commit=commit)
        if debug:
            print("Regex for {}: {}".format(type(self).__name__, regex_str))
        self.compiled_regex = compile_regex(regex_str)
        self.units_to_factor = compile_regex_dict(units_to_factor)
        self.take_absolute = take_absolute
Exemplo n.º 3
0
def learning_alternative_regex_groups():
    """
    Experiment: show which groups are reported when a regex contains several
    alternative capture groups inside a repeated non-capturing group.

    For each sample string, prints the match object and its ``groups()``.
    """
    pattern_text = r"""
        (
            (?:
                \s*
                (?: (a) | (b) | (c) | (d) )
                \s*
            )*
            ( fish )?
        )
    """
    pattern = compile_regex(pattern_text)
    samples = ("a", "b", "a c", "d", "e", "a fish", "c c c")
    for sample in samples:
        hit = pattern.match(sample)
        # NOTE(review): assumes match() never returns None here, i.e. that
        # compile_regex compiles in verbose mode so every part is optional.
        print(f"Match: {hit}; groups: {hit.groups()}")
    """
Exemplo n.º 4
0
def test_text_regex(name: str,
                    regex_text: str,
                    test_expected_list: List[Tuple[str, List[str]]],
                    verbose: bool = False) -> None:
    """
    Check that a regex yields exactly the expected hits for each sample text,
    reporting progress on stdout.

    Args:
        name: label for the regex; used only in printed and failure messages
        regex_text: regex source, handed to ``compile_regex``
        test_expected_list:
            pairs of ``(text, expected_hits)``; for each pair, the result of
            ``get_compiled_regex_results`` on ``text`` is compared for
            equality with ``expected_hits``
        verbose: also print the regex source?

    Raises:
        AssertionError: for the first sample whose results differ from the
            expected ones (only while assertions are enabled)
    """
    print(f"Testing regex named {name}")
    pattern = compile_regex(regex_text)
    if verbose:
        print(f"... regex text:\n{regex_text}")
    for sample, wanted in test_expected_list:
        found = get_compiled_regex_results(pattern, sample)
        # The message is only built if the comparison fails.
        assert found == wanted, (
            f"Regex {name}: Expected {wanted}, got {found}, "
            f"when parsing {sample!r}. Regex text:\n{regex_text}]"
        )
    print("... OK")
Exemplo n.º 5
0
    def __init__(self,
                 nlpdef: NlpDefinition,
                 cfgsection: str,
                 regex_str_list: List[str],
                 validated_variable: str,
                 commit: bool = False) -> None:
        """
        Set up a validator built from a list of regexes.

        This class operates with compiled regexes having this group format:
          - variable

        Args:
            nlpdef: passed to the superclass; may be ``None`` for debugging,
                in which case the destination table name is left empty
            cfgsection: passed to the superclass; also the config section
                from which the required ``desttable`` option is read
            regex_str_list: regex sources, each compiled with
                ``compile_regex``; the originals are kept for debugging
            validated_variable: name of the variable being validated; this
                object's own name becomes ``"<validated_variable>_validator"``
            commit: passed to the superclass
        """
        super().__init__(nlpdef=nlpdef, cfgsection=cfgsection, commit=commit)
        self.regex_str_list = regex_str_list  # for debugging only
        self.compiled_regex_list = list(map(compile_regex, regex_str_list))
        self.variable = f"{validated_variable}_validator"
        self.NAME = self.variable

        # nlpdef is only None for debugging, when there is no config to read.
        self.tablename = (
            nlpdef.opt_str(cfgsection, 'desttable', required=True)
            if nlpdef is not None
            else ''
        )
Exemplo n.º 6
0
    def __init__(self,
                 nlpdef: NlpDefinition,
                 cfgsection: str,
                 variable_name: str,  # e.g. "MMSE"
                 variable_regex_str: str,  # e.g. regex for MMSE
                 expected_denominator: int,
                 numerator_text_fieldname: str = "numerator_text",
                 numerator_fieldname: str = "numerator",
                 denominator_text_fieldname: str = "denominator_text",
                 denominator_fieldname: str = "denominator",
                 correct_numerator_fieldname: str = None,  # default below
                 take_absolute: bool = True,
                 commit: bool = False,
                 debug: bool = False) -> None:
        """
        Set up a parser for "variable ... numerator [/ denominator]" text.

        The regex built here has this group format:
          1. variable (thing being measured)
          2. tense indicator (optional)
          3. relation (optional)
          4. numerator
          5. denominator (optional)

        Args:
            nlpdef: passed to the superclass; may be ``None`` for debugging,
                in which case the destination table name is left empty
            cfgsection: passed to the superclass; also the config section
                from which the required ``desttable`` option is read
            variable_name: name of the variable, e.g. "MMSE"
            variable_regex_str: regex source that finds the variable; it is
                substituted into group 1 of the template
            expected_denominator: must be greater than zero (asserted)
            numerator_text_fieldname: stored on the instance
            numerator_fieldname: stored on the instance
            denominator_text_fieldname: stored on the instance
            denominator_fieldname: stored on the instance
            correct_numerator_fieldname: stored on the instance; if falsy,
                defaults to ``"out_of_<expected_denominator>"``
            take_absolute: stored on the instance
            commit: passed to the superclass
            debug: print the assembled regex (with the class name) to stdout?
        """
        self.variable_name = variable_name
        assert(expected_denominator > 0)
        self.expected_denominator = expected_denominator
        self.numerator_text_fieldname = numerator_text_fieldname
        self.numerator_fieldname = numerator_fieldname
        self.denominator_text_fieldname = denominator_text_fieldname
        self.denominator_fieldname = denominator_fieldname
        if not correct_numerator_fieldname:
            correct_numerator_fieldname = f"out_of_{expected_denominator}"
        self.correct_numerator_fieldname = correct_numerator_fieldname
        self.take_absolute = take_absolute

        # The attributes above are set before the superclass initialiser runs.
        super().__init__(nlpdef=nlpdef,
                         cfgsection=cfgsection,
                         commit=commit)
        # nlpdef is only None for debugging, when there is no config to read.
        self.tablename = (
            nlpdef.opt_str(cfgsection, 'desttable', required=True)
            if nlpdef is not None
            else ''
        )

        template = r"""
            ( {variable} )                     # 1. group for variable (thing being measured)
            {OPTIONAL_RESULTS_IGNORABLES}
            {SCORE}?                           # optional "score" or similar
            {OPTIONAL_RESULTS_IGNORABLES}
            ( {TENSE_INDICATOR} )?             # 2. optional group for tense indicator
            {OPTIONAL_RESULTS_IGNORABLES}
            ( {RELATION} )?                    # 3. optional group for relation
            {OPTIONAL_RESULTS_IGNORABLES}
            ( {SIGNED_FLOAT} )                 # 4. group for numerator
            (?:                                # optional "/ denominator"
                \s* {OUT_OF_SEPARATOR} \s*
                ( {UNSIGNED_INTEGER} )         # 5. group for denominator
            )?
        """  # noqa
        regex_str = template.format(
            variable=variable_regex_str,
            OPTIONAL_RESULTS_IGNORABLES=OPTIONAL_RESULTS_IGNORABLES,
            SCORE=SCORE,
            TENSE_INDICATOR=TENSE_INDICATOR,
            RELATION=RELATION,
            SIGNED_FLOAT=SIGNED_FLOAT,
            OUT_OF_SEPARATOR=OUT_OF_SEPARATOR,
            UNSIGNED_INTEGER=UNSIGNED_INTEGER,
        )
        if debug:
            print(f"Regex for {type(self).__name__}: {regex_str}")
        self.regex_str = regex_str
        self.compiled_regex = compile_regex(regex_str)