def test_pattern_matches_sentences():
    """Check matched text and indices from `Sentences.get_pattern` for each sentence."""
    doc = Sentences(' I want this, or that.\n These and those.')

    # first sentence
    found = doc.get_pattern(Pattern('(this|that)'), get_indices=True)
    assert found is not None
    text, begin, stop = found
    assert (text, begin, stop) == ('this', 7, 11)

    # second sentence
    found = doc.get_pattern(Pattern('(these|those)'), get_indices=True)
    assert found is not None
    text, begin, stop = found
    assert (text, begin, stop) == ('These', 23, 28)
def get_pattern(self, pat: Pattern, *, index=0, get_indices=False,
                return_negation=False, return_negation_keyword=False):
    """
    Search this text for `pat` and return the requested parts of the match.

    The outcome is recorded through `_update_last_search`; a successful
    match is also added to `self.matches`.

    :param pat: pattern to search for
    :param index: group handed to the match's `group`/`start`/`end`
    :param get_indices: if True, also return the start and end of the group
        (off by default to maintain backward compatibility)
    :param return_negation: if True return Negation instance rather than
        ignoring negation
    :param return_negation_keyword: if True, append `neg_group()` of the
        match to the returned values
    :return: None when nothing matched; otherwise the group text alone, or
        a tuple of the group text followed by start/end and/or the negation
        keyword, depending on the flags
    """
    # incorporate offset information
    found = pat.matches(self.text, offset=self.start, return_negation=return_negation)
    self._update_last_search(bool(found))
    if not found:
        return None
    self.matches.add(found)

    parts = [found.group(index)]
    if get_indices:
        # offset has already been added in pat.matches
        parts += [found.start(index), found.end(index)]
    if return_negation_keyword:
        parts.append(found.neg_group())
    # a lone group is returned bare, not wrapped in a 1-tuple
    return parts[0] if len(parts) == 1 else tuple(parts)
def test_pattern_matches_sentences_keep_offsets():
    """Same checks as the default-splitter test, but using `keep_offsets_ssplit`."""
    doc = Sentences(' I want this, or that.\n These and those.',
                    ssplit=keep_offsets_ssplit)

    # first sentence
    found = doc.get_pattern(Pattern('(this|that)'), get_indices=True)
    assert found is not None
    text, begin, stop = found
    assert (text, begin, stop) == ('this', 8, 12)

    # second sentence
    found = doc.get_pattern(Pattern('(these|those)'), get_indices=True)
    assert found is not None
    text, begin, stop = found
    assert (text, begin, stop) == ('These', 24, 29)
def has_pattern(self, pat: Pattern, ignore_negation=False):
    """
    Search this text for `pat`, record the outcome, and return the result.

    :param pat: pattern to search for
    :param ignore_negation: forwarded unchanged to `pat.matches`
    :return: whatever `pat.matches` returned (truthy when the pattern matched)
    """
    found = pat.matches(
        self.text,
        offset=self.start,
        ignore_negation=ignore_negation,
    )
    self._update_last_search(bool(found))
    if found:
        # only successful matches are collected
        self.matches.add(found)
    return found
def test_pattern_matches_sentence():
    """Check matched text and indices from `Sentence.get_pattern` on a single sentence."""
    found = Sentence('\t I want this, or that.\n').get_pattern(
        Pattern('(this|that)'), get_indices=True)
    assert found is not None
    text, begin, stop = found
    assert (text, begin, stop) == ('this', 9, 13)
def test_pattern_return_negate():
    """With `return_negation=True`, a negated match comes back as a `Negation`."""
    pattern = Pattern('test', negates=[r'\bnot?\b'])
    result = pattern.matches('do not test this', return_negation=True)
    assert isinstance(result, Negation)
    assert result.neg_group() == 'not'
    assert result.match == 'test'
def test_sentence_return_negation_keyword():
    """`get_pattern` returns both the matched text and the negation keyword."""
    pattern = Pattern('test', negates=[r'\bnot?\b'])
    sentence = Sentence('do not test this')
    result = sentence.get_pattern(
        pattern,
        return_negation=True,
        return_negation_keyword=True,
    )
    text, neg = result
    assert text == 'test'
    assert neg == 'not'
def test_sentence_return_negate():
    """With `return_negation=True`, `get_pattern` still yields the matched text."""
    pattern = Pattern('test', negates=[r'\bnot?\b'])
    sentence = Sentence('do not test this')
    text = sentence.get_pattern(pattern, return_negation=True)
    assert text == 'test'
def test_pattern_no_return_negate():
    """Without `return_negation`, a negated match is reported as `False`."""
    pattern = Pattern('test', negates=[r'\bnot?\b'])
    result = pattern.matches('do not test this')
    assert result is False
assert match is not None s, start, end = match assert s == 'this' assert start == 7 assert end == 11 # second sentence match = sentences.get_pattern(Pattern('(these|those)'), get_indices=True) assert match is not None s, start, end = match assert s == 'These' assert start == 23 assert end == 28 @pytest.mark.parametrize(('pat', 'sentence', 'n_matches'), [ (Pattern('(this|that)'), ' I want this, or that.\n', 2), ]) def test_pattern_finditer_sentence(pat: Pattern, sentence: str, n_matches): sentence = Sentence(sentence) matches = list(x[0] for x in sentence.get_patterns(pat)) # text only assert len(matches) == n_matches @pytest.mark.parametrize(('pat', 'text', 'n_matches'), [ (Pattern('(this|that)'), ' I want this, or that.\n\n But not that', 3), ]) def test_pattern_finditer_sentences(pat: Pattern, text: str, n_matches): sentences = Sentences(text) matches = list(sentences.get_patterns(pat)) assert len(matches) == n_matches
""" Useful phrases for negation when building patterns. """ from runrex.algo import Pattern # date pattern years_ago = r'(?:\d+ (?:year|yr|week|wk|month|mon|day)s? (?:ago|before|previous))' date_pat = r'\d+[-/]\d+[-/]\d+' date2_pat = r'\d+[/]\d+' month_pat = r'\b(?:jan|feb|mar|apr|may|jun|jul|aug|sept|oct|nov|dec)\w*(?:\W*\d{1,2})?\W*\d{4}' month_only_pat = r'in\b(?:jan|feb|mar|apr|may|jun|jul|aug|sept|oct|nov|dec)\w*' DATE_PAT = Pattern( f'({years_ago}|{date_pat}|{date2_pat}|{month_pat}|{month_only_pat})') # avoid 'last' or 'in' or 'since' safe_may = r'(?<!in|st|ce) may (?!\d)' # useful starting phrases for detecting negation, etc. boilerplate = r'\b(pamphlet|warning|information|review|side effect|counsel|\bsign|ensure' \ r'|risk|\bif\b|after your visit|appt|appointment|due (to|for|at)|recommend' \ r'|pamphlet|schedul|doctor|contact|\bhow\b|\bcall|includ|failure|' \ r'associated|avoid|instruct|guideline)' possible = r'\b(unlikely|\bposs\b|possib(ly|le|ility)|improbable|potential|susp(ect|icious)|' \ r'chance|may\b|afraid|concern|tentative|doubt|thought|think)' POSSIBLE_PAT = Pattern(possible) negation = r'(no evidence|without|r/o|rule out|normal|\bnot?\b|\bor\b|denies|negative for)' historical = r'(history|previous|\bhx\b|\bpast\b|\bprior\b|\bh/o\b)' hypothetical = r'(' \ r'option|possib\w+|desire|want|will|\bcan\b|usual' \ r'|\bor\b|like|would|need|until|request|when|you\Wll' \
from runrex.algo import Pattern

# Example pattern: 'burden' or 'debt', subject to negation and required context.
BURDEN = Pattern(
    '(burden|debt)',
    # exclude a match; anchored with word boundaries so that 'no'/'not' is
    # only recognised as a whole word, not inside words such as 'know'
    negates=[r'\bnot?\b'],
    requires=['heavy', r'a\W*lot', 'significant']  # require this for match
)
def test_is_close_to(text, start, end, window, exp):
    """`is_close_to` with a whole-word 'pain' pattern returns exactly `exp`."""
    pain = Pattern(r'\bpain\b')
    outcome = is_close_to(pain, text, start, end, window)
    assert outcome is exp