# NOTE(review): this function name is duplicated by a later definition in this
# module; under Python's last-definition-wins semantics the later
# `test_batch_tokenize_with_character` shadows this one, so pytest never
# collects or runs this version. Confirm whether it should be renamed or removed.
def test_batch_tokenize_with_character(raw_texts: List[str], tokenizer_params: Dict):
    """Tokenize a batch of texts and compare against the recorded token fixtures."""
    tokenizer_name = tokenizer_params["tokenizer"]
    tokenizer = WordTokenizer(**tokenizer_params)
    # Expected output: one list of Token objects per input text, rebuilt from
    # the serialized fixture lines keyed by the tokenizer's name.
    expect = [[Token.from_dict(token_param) for token_param in token_params]
              for token_params in read_lines(tokenizer_name)]
    result = tokenizer.batch_tokenize(raw_texts)
    assert expect == result
def test_batch_tokenize(raw_texts: List[str], tokenizer_params: Dict):
    """Check that batch tokenization reproduces the pre-recorded token lists.

    Skipped when the system dictionary lives on S3 and no AWS credentials
    are present in the environment.
    """
    dictionary_path = tokenizer_params["system_dictionary_path"]
    if dictionary_path.startswith("s3://") and "AWS_ACCESS_KEY_ID" not in os.environ:
        pytest.skip("AWS credentials not found.")

    name = tokenizer_params["tokenizer"]
    word_tokenizer = WordTokenizer(**tokenizer_params)

    # Rebuild the expected Token lists from the serialized fixture lines.
    expected = []
    for serialized_tokens in read_lines(name):
        expected.append([Token.from_dict(attrs) for attrs in serialized_tokens])

    assert word_tokenizer.batch_tokenize(raw_texts) == expected
def test_batch_tokenize_with_character(raw_texts: List[str], tokenizer_params: Dict):
    """Check character-level batch tokenization against recorded fixtures.

    Skipped for KyTea on interpreters older than Python 3.7.
    """
    name = tokenizer_params["tokenizer"]
    # KyTea is unsupported on Python 3.6, so bail out early there.
    if name == "kytea" and sys.version_info < (3, 7):
        pytest.skip("KyTea doesn't work in Python3.6")

    word_tokenizer = WordTokenizer(**tokenizer_params)
    expected = [
        [Token.from_dict(attrs) for attrs in serialized_tokens]
        for serialized_tokens in read_lines(name)
    ]
    assert word_tokenizer.batch_tokenize(raw_texts) == expected