Example #1
    def test_extract_regex_tok_len_2(self):
        file_name = "test_data/io/test_systemt/test.dict"
        file_text = self._read_file(file_name)

        tokenizer = self._make_tokenizer()
        char_span = make_tokens(file_text, tokenizer)

        match_regex = re.compile(r".*y$")

        result_df = extract_regex_tok(char_span,
                                      match_regex,
                                      min_len=2,
                                      max_len=2,
                                      output_col_name="result")

        self.assertIn("result", result_df.columns)
        # print(f"****\n{result_df}\n****")
        self.assertEqual(
            repr(result_df),
            textwrap.dedent("""\
                                        result
                0  [0, 16): 'Dictionary Entry'
                1      [11, 22): 'Entry Entry'
                2    [50, 63): 'Haiku factory'
                3      [73, 84): 'before they'"""),
        )
Example #2
    def test_extract_dict(self):
        file_name = "test_data/io/test_systemt/test.dict"
        file_text = self._read_file(file_name)

        tokenizer = self._make_tokenizer()
        char_span = make_tokens(file_text, tokenizer)

        dict_df = load_dict(file_name, tokenizer)

        result_df = extract_dict(char_span, dict_df, "result")

        self.assertIn("result", result_df.columns)
        # print(f"****\n{result_df}\n****")
        self.assertEqual(
            repr(result_df),
            textwrap.dedent(
                """\
                                              result
                2        [0, 16): 'Dictionary Entry'
                0                  [11, 16): 'Entry'
                1                  [17, 22): 'Entry'
                5  [23, 44): 'Help me! I am trapped'
                4    [45, 64): 'In a Haiku factory!'
                3    [65, 84): 'Save me before they'"""
            )
        )
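
Both examples above call self._read_file and self._make_tokenizer, which are defined elsewhere on the shared test base class and are not shown here. A minimal sketch of what such helpers could look like, assuming a plain file read and the tokenizer-only SpaCy setup seen in Example #3 (the bodies below are illustrative assumptions, not the original implementations):

    def _read_file(self, file_name):
        # Read an entire test fixture file into a single string
        with open(file_name, "r", encoding="utf-8") as f:
            return f.read()

    def _make_tokenizer(self):
        # Tokenizer-only SpaCy pipeline for English, mirroring Example #3
        from spacy.lang.en import English
        return English().tokenizer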
Example #3
    overlap_join,
)

# SpaCy setup: use only the tokenizer, not the full NLP pipeline
nlp = English()
# Create a Tokenizer with the default settings for English
# including punctuation rules and exceptions
_tokenizer = nlp.tokenizer

# Build up some example relations for the tests in this file
_TEXT = """
In AD 932, King Arthur and his squire, Patsy, travel throughout Britain 
searching for men to join the Knights of the Round Table. Along the way, he 
recruits Sir Bedevere the Wise, Sir Lancelot the Brave, Sir Galahad the Pure...
"""
_TOKENS_SERIES = make_tokens(_TEXT, _tokenizer)
_TOKENS_ARRAY = _TOKENS_SERIES.array  # type: SpanArray
_TOKEN_SPANS_ARRAY = TokenSpanArray.from_char_offsets(_TOKENS_ARRAY)
_CAPS_WORD = extract_regex_tok(_TOKENS_ARRAY, regex.compile("[A-Z][a-z]*"))
_CAPS_WORDS = extract_regex_tok(
    # The trailing 1, 2 are the min_len/max_len token-window bounds (compare the
    # keyword form min_len=2, max_len=2 in Example #1): one- or two-token phrases
    _TOKENS_ARRAY, regex.compile("[A-Z][a-z]*(\\s([A-Z][a-z]*))*"), 1, 2
)
_THE = extract_regex_tok(_TOKENS_ARRAY, regex.compile("[Tt]he"))


class JoinTest(TestBase):
    def setUp(self):
        # Make it easier to see what's going on with join results
        self._prev_token_offsets_flag_value = TokenSpan.USE_TOKEN_OFFSETS_IN_REPR
        TokenSpan.USE_TOKEN_OFFSETS_IN_REPR = True
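
    # setUp saves the previous flag value, which suggests a matching tearDown that
    # restores it. A minimal sketch; this method is an assumption, not copied from
    # the original file.
    def tearDown(self):
        # Put the repr flag back so other test classes are unaffected
        TokenSpan.USE_TOKEN_OFFSETS_IN_REPR = self._prev_token_offsets_flag_value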