def test_batch_tokenize_with_character(raw_texts: List[str], tokenizer_params: Dict):
    tokenizer_name = tokenizer_params["tokenizer"]
    tokenizer = WordTokenizer(**tokenizer_params)
    expect = [
        [Token.from_dict(token_param) for token_param in token_params]
        for token_params in read_lines(tokenizer_name)
    ]
    result = tokenizer.batch_tokenize(raw_texts)
    assert expect == result
def test_tokenize(raw_texts: List[str], tokenizer_params: Dict):
    tokenizer_name = tokenizer_params["tokenizer"]
    tokenizer = WordTokenizer(**tokenizer_params)
    expect = [
        Token.from_dict(token_param)
        for token_param in read_lines(tokenizer_name)[0]
    ]
    result = tokenizer.tokenize(raw_texts[0])
    assert expect == result
def test_batch_tokenize(raw_texts: List[str], tokenizer_params: Dict):
    if ("AWS_ACCESS_KEY_ID" not in os.environ
            and tokenizer_params["system_dictionary_path"].startswith("s3://")):
        pytest.skip("AWS credentials not found.")
    tokenizer_name = tokenizer_params["tokenizer"]
    tokenizer = WordTokenizer(**tokenizer_params)
    expect = [
        [Token.from_dict(token_param) for token_param in token_params]
        for token_params in read_lines(tokenizer_name)
    ]
    result = tokenizer.batch_tokenize(raw_texts)
    assert expect == result
def test_batch_tokenize_with_character(raw_texts: List[str], tokenizer_params: Dict):
    if tokenizer_params["tokenizer"] == "kytea" and sys.version_info < (3, 7):
        pytest.skip("KyTea doesn't work in Python3.6")
    tokenizer_name = tokenizer_params["tokenizer"]
    tokenizer = WordTokenizer(**tokenizer_params)
    expect = [
        [Token.from_dict(token_param) for token_param in token_params]
        for token_params in read_lines(tokenizer_name)
    ]
    result = tokenizer.batch_tokenize(raw_texts)
    assert expect == result
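# The tests above rely on pytest fixtures (raw_texts, tokenizer_params) and a
# module-level helper read_lines(tokenizer_name) that returns, for each input
# sentence, the list of token dictionaries the tokenizer is expected to produce.
# A minimal sketch of such a helper, assuming the expected output is stored as
# JSON Lines under data/<tokenizer_name>.jsonl; the real file layout in the
# repository may differ.
import json
from typing import Dict, List


def read_lines(tokenizer_name: str) -> List[List[Dict]]:
    # One JSON array of token dicts per line, aligned with raw_texts.
    expected: List[List[Dict]] = []
    with open(f"data/{tokenizer_name.lower()}.jsonl") as f:
        for line in f:
            expected.append(json.loads(line))
    return expected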
def tokenize(self, text: str) -> List[Token]:
    """Tokenize input text."""
    if isinstance(self._endpoint, str):
        # Remote mode: send the text to the tokenization endpoint and rebuild
        # Token objects from the returned dictionaries.
        endpoint = self.get_endpoint("/api/v1/tokenize")
        payload = dict(self.payload, text=text)
        token_params = self._tokenize_with_remote_host(
            endpoint=endpoint, payload=payload, headers=self.headers)
        return [Token.from_dict(token_param) for token_param in token_params]
    else:
        # Local mode: delegate to the backend tokenizer instance.
        assert self._tokenizer is not None
        return self._tokenizer.tokenize(text)
def batch_tokenize(self, texts: List[str]) -> List[List[Token]]:
    """Tokenize input texts."""
    if isinstance(self._endpoint, str):
        # Remote mode: post all texts to the batch endpoint and rebuild Token
        # objects from the returned dictionaries, one list per input text.
        endpoint = self.get_endpoint("/api/v1/batch_tokenize")
        payload = dict(self.payload, texts=texts)
        token_params_list = self._batch_tokenize_with_remote_host(
            endpoint=endpoint,
            payload=payload,
            headers=self.headers,
        )
        tokens_list: List[List[Token]] = []
        for token_params in token_params_list:
            tokens_list.append(
                [Token.from_dict(token_param) for token_param in token_params])
        return tokens_list
    else:
        # Local mode: tokenize each text with the backend tokenizer.
        assert self._tokenizer is not None
        return [self._tokenizer.tokenize(text) for text in texts]
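# A short usage sketch for the two methods above. It assumes WordTokenizer and
# Token come from the surrounding package, that "mecab" names an installed
# backend, and that Token exposes a surface attribute; the sample sentences are
# illustrative only.
tokenizer = WordTokenizer("mecab")

tokens = tokenizer.tokenize("吾輩は猫である")  # -> List[Token]
batches = tokenizer.batch_tokenize(["吾輩は猫である", "名前はまだない"])  # -> List[List[Token]]

for token in tokens:
    print(token.surface)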