예제 #1
0
def test_batch_tokenize_with_character(raw_texts: List[str],
                                       tokenizer_params: Dict):
    """Compare ``WordTokenizer.batch_tokenize`` output with reference tokens.

    A ``WordTokenizer`` is built from ``tokenizer_params`` and applied to
    ``raw_texts``. The result must equal the nested list of ``Token``
    objects rebuilt from what ``read_lines`` yields for the tokenizer name
    stored under ``tokenizer_params["tokenizer"]``.
    """
    name = tokenizer_params["tokenizer"]
    word_tokenizer = WordTokenizer(**tokenizer_params)

    # One inner list of tokens per entry yielded by read_lines.
    expected_batches = []
    for records in read_lines(name):
        expected_batches.append(
            [Token.from_dict(record) for record in records])

    actual_batches = word_tokenizer.batch_tokenize(raw_texts)
    assert expected_batches == actual_batches
예제 #2
0
def test_batch_tokenize(raw_texts: List[str], tokenizer_params: Dict):
    """Compare ``WordTokenizer.batch_tokenize`` output with reference tokens.

    The test is skipped when ``AWS_ACCESS_KEY_ID`` is missing from the
    environment and ``tokenizer_params["system_dictionary_path"]`` starts
    with ``s3://``. Otherwise a ``WordTokenizer`` is built from
    ``tokenizer_params`` and its output for ``raw_texts`` must equal the
    ``Token`` objects rebuilt from what ``read_lines`` yields for the
    tokenizer name.
    """
    credentials_missing = "AWS_ACCESS_KEY_ID" not in os.environ
    if credentials_missing:
        # The dictionary path is only inspected when credentials are absent.
        dictionary_path = tokenizer_params["system_dictionary_path"]
        if dictionary_path.startswith("s3://"):
            pytest.skip("AWS credentials not found.")

    name = tokenizer_params["tokenizer"]
    word_tokenizer = WordTokenizer(**tokenizer_params)

    # One inner list of tokens per entry yielded by read_lines.
    expected_batches = [
        list(map(Token.from_dict, records))
        for records in read_lines(name)
    ]

    actual_batches = word_tokenizer.batch_tokenize(raw_texts)
    assert expected_batches == actual_batches
예제 #3
0
def test_batch_tokenize_with_character(raw_texts: List[str],
                                       tokenizer_params: Dict):
    """Compare ``WordTokenizer.batch_tokenize`` output with reference tokens.

    The test is skipped when the configured tokenizer is ``"kytea"`` and the
    interpreter is older than Python 3.7. Otherwise a ``WordTokenizer`` is
    built from ``tokenizer_params`` and its output for ``raw_texts`` must
    equal the ``Token`` objects rebuilt from what ``read_lines`` yields for
    the tokenizer name.
    """
    name = tokenizer_params["tokenizer"]
    running_old_python = sys.version_info < (3, 7)
    if name == "kytea" and running_old_python:
        pytest.skip("KyTea doesn't work in Python3.6")

    word_tokenizer = WordTokenizer(**tokenizer_params)

    # One inner list of tokens per entry yielded by read_lines.
    expected_batches = []
    for records in read_lines(name):
        tokens = [Token.from_dict(record) for record in records]
        expected_batches.append(tokens)

    actual_batches = word_tokenizer.batch_tokenize(raw_texts)
    assert expected_batches == actual_batches