Example #1
import pandas as pd
import pytest

# Import path assumed: adjust to wherever MultilingualTokenizer is defined in the project.
from spacy_tokenizer import MultilingualTokenizer


def test_tokenize_df_english():
    input_df = pd.DataFrame({
        "input_text": ["I hope nothing. I fear nothing. I am free. 💩 😂 #OMG"]
    })
    tokenizer = MultilingualTokenizer()
    output_df = tokenizer.tokenize_df(df=input_df,
                                      text_column="input_text",
                                      language="en")
    tokenized_document = output_df[tokenizer.tokenized_column][0]
    assert len(tokenized_document) == 15


def test_tokenize_df_multilingual():
    # Each row is tokenized with the language code found in its "language" column.
    input_df = pd.DataFrame({
        "input_text": [
            "I hope nothing. I fear nothing. I am free.",
            " Les sanglots longs des violons d'automne",
            "子曰:“學而不思則罔,思而不學則殆。”",
        ],
        "language": ["en", "fr", "zh"],
    })
    tokenizer = MultilingualTokenizer()
    output_df = tokenizer.tokenize_df(df=input_df,
                                      text_column="input_text",
                                      language_column="language")
    tokenized_documents = output_df[tokenizer.tokenized_column]
    tokenized_documents_length = [len(doc) for doc in tokenized_documents]
    assert tokenized_documents_length == [12, 8, 13]


def test_tokenize_df_long_text():
    # Documents longer than max_num_characters should make tokenize_df raise a ValueError.
    input_df = pd.DataFrame({"input_text": ["Long text"]})
    tokenizer = MultilingualTokenizer(max_num_characters=1)
    with pytest.raises(ValueError):
        tokenizer.tokenize_df(df=input_df, text_column="input_text", language="en")


def test_tokenize_df_japanese():
    input_df = pd.DataFrame({"input_text": ["期一会。 異体同心。 そうです。"]})
    tokenizer = MultilingualTokenizer()
    output_df = tokenizer.tokenize_df(df=input_df, text_column="input_text", language="ja")
    tokenized_document = output_df[tokenizer.tokenized_column][0]
    assert len(tokenized_document) == 9
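

For reference, below is a minimal sketch of the interface these tests exercise. Only the names that appear in the tests (MultilingualTokenizer, tokenize_df, tokenized_column, max_num_characters) come from the source; the default values and the stub body are illustrative assumptions, not the actual implementation.

import pandas as pd


class MultilingualTokenizer:
    """Sketch of the tokenizer interface assumed by the tests above (not the real implementation)."""

    def __init__(self, max_num_characters: int = 10 ** 7) -> None:
        # Documents longer than this should trigger a ValueError in tokenize_df (see the long-text test).
        self.max_num_characters = max_num_characters
        # Name of the output column holding the tokens of each document (exact name assumed).
        self.tokenized_column = "tokenized_text"

    def tokenize_df(
        self,
        df: pd.DataFrame,
        text_column: str,
        language: str = "",
        language_column: str = "",
    ) -> pd.DataFrame:
        """Return a copy of df with self.tokenized_column added.

        Either a single `language` code is applied to every row, or
        `language_column` names a column holding a per-row language code.
        """
        raise NotImplementedError("Illustrative stub only")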