Example #1
def test_batch_tokenize_with_character(raw_texts: List[str],
                                       tokenizer_params: Dict):
    tokenizer_name = tokenizer_params["tokenizer"]
    tokenizer = WordTokenizer(**tokenizer_params)
    # Build the expected tokens from recorded fixture data for this backend.
    expect = [[Token.from_dict(token_param) for token_param in token_params]
              for token_params in read_lines(tokenizer_name)]
    result = tokenizer.batch_tokenize(raw_texts)
    assert expect == result
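These tests rely on pytest fixtures `raw_texts` and `tokenizer_params` that are not shown in the snippets. A minimal conftest.py sketch of what they might look like (the fixture bodies and parameter values below are assumptions, not the project's actual test setup):

from typing import Dict, List

import pytest


@pytest.fixture
def raw_texts() -> List[str]:
    # Illustrative inputs only; the real fixture data is not shown here.
    return ["吾輩は猫である", "名前はまだない"]


@pytest.fixture(params=[
    {"tokenizer": "kytea"},
    {"tokenizer": "whitespace"},
])
def tokenizer_params(request) -> Dict:
    # Each param dict configures one WordTokenizer backend. Real parameter
    # sets may carry extra keys (e.g. system_dictionary_path).
    return request.param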
Example #2
def test_tokenize(raw_texts: List[str], tokenizer_params: Dict):
    tokenizer_name = tokenizer_params["tokenizer"]
    tokenizer = WordTokenizer(**tokenizer_params)
    expect = [
        Token.from_dict(token_param)
        for token_param in read_lines(tokenizer_name)[0]
    ]
    result = tokenizer.tokenize(raw_texts[0])
    assert expect == result
Example #3
def test_batch_tokenize(raw_texts: List[str], tokenizer_params: Dict):
    if "AWS_ACCESS_KEY_ID" not in os.environ and tokenizer_params[
            "system_dictionary_path"].startswith("s3://"):
        pytest.skip("AWS credentials not found.")

    tokenizer_name = tokenizer_params["tokenizer"]
    tokenizer = WordTokenizer(**tokenizer_params)
    expect = [[Token.from_dict(token_param) for token_param in token_params]
              for token_params in read_lines(tokenizer_name)]
    result = tokenizer.batch_tokenize(raw_texts)
    assert expect == result
Example #4
def test_batch_tokenize_with_character(raw_texts: List[str],
                                       tokenizer_params: Dict):
    # KyTea requires Python 3.7+, so skip on older interpreters.
    if tokenizer_params["tokenizer"] == "kytea" and sys.version_info < (3, 7):
        pytest.skip("KyTea doesn't work in Python 3.6")

    tokenizer_name = tokenizer_params["tokenizer"]
    tokenizer = WordTokenizer(**tokenizer_params)
    expect = [[Token.from_dict(token_param) for token_param in token_params]
              for token_params in read_lines(tokenizer_name)]
    result = tokenizer.batch_tokenize(raw_texts)
    assert expect == result
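Each test loads its expected output through a `read_lines` helper keyed by the tokenizer name, which is also not shown. A plausible sketch, assuming one JSON list of token dicts per input sentence (the file layout and path are assumptions):

import json
from typing import Dict, List


def read_lines(tokenizer_name: str) -> List[List[Dict]]:
    # Hypothetical helper: data/<tokenizer_name>.jsonl holds, per line, the
    # expected token dicts for the corresponding input sentence.
    with open(f"data/{tokenizer_name}.jsonl", encoding="utf-8") as f:
        return [json.loads(line) for line in f]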
Example #5
    def tokenize(self, text: str) -> List[Token]:
        """Tokenize input text"""

        if isinstance(self._endpoint, str):
            # A remote endpoint is configured: send the text to the
            # tokenization API and rebuild Token objects from the response.
            endpoint = self.get_endpoint("/api/v1/tokenize")
            payload = dict(self.payload, text=text)
            token_params = self._tokenize_with_remote_host(
                endpoint=endpoint, payload=payload, headers=self.headers)
            return [
                Token.from_dict(token_param) for token_param in token_params
            ]

        else:
            # No endpoint: tokenize locally with the configured backend.
            assert self._tokenizer is not None
            return self._tokenizer.tokenize(text)
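The private helper `_tokenize_with_remote_host` is not shown in these snippets. A minimal method-body sketch, assuming a JSON API reachable with `requests` and a response of the form `{"tokens": [...]}` (both assumptions):

import requests


def _tokenize_with_remote_host(self, endpoint: str, payload: dict,
                               headers: dict) -> list:
    # Hypothetical sketch: POST the payload to the remote tokenizer service
    # and return the token dicts from the JSON response (the response shape
    # is an assumption, not the library's documented contract).
    response = requests.post(endpoint, json=payload, headers=headers)
    response.raise_for_status()
    return response.json()["tokens"]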
Example #6
    def batch_tokenize(self, texts: List[str]) -> List[List[Token]]:
        """Tokenize input texts"""

        if isinstance(self._endpoint, str):
            # A remote endpoint is configured: send all texts in one request
            # and rebuild Token objects from the returned token dicts.
            endpoint = self.get_endpoint("/api/v1/batch_tokenize")
            payload = dict(self.payload, texts=texts)
            token_params_list = self._batch_tokenize_with_remote_host(
                endpoint=endpoint,
                payload=payload,
                headers=self.headers,
            )

            tokens_list: List[List[Token]] = []
            for token_params in token_params_list:
                tokens_list.append(
                    [Token.from_dict(token_param)
                     for token_param in token_params])

            return tokens_list

        else:
            # No endpoint: tokenize each text locally.
            assert self._tokenizer is not None
            return [self._tokenizer.tokenize(text) for text in texts]
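Taken together, both methods dispatch to a remote service when an endpoint is configured and fall back to the local backend otherwise. A short usage sketch, assuming these snippets come from the konoha library (the backend name and inputs are illustrative):

from konoha import WordTokenizer

tokenizer = WordTokenizer(tokenizer="whitespace")

# One sentence -> List[Token]
tokens = tokenizer.tokenize("I am a cat")

# Many sentences -> List[List[Token]]
token_lists = tokenizer.batch_tokenize(["I am a cat", "I have no name yet"])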