Example #1
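The snippets below are excerpted from a pytest suite, so the shared imports and path fixtures are not repeated in each example. A minimal sketch of the assumed common preamble follows; only spacy_tokenizer and wordcloud_visualizer appear verbatim later in this listing, and the other module paths and fixture values are placeholders for illustration.

import os
from collections import Counter

import pandas as pd
import pytest
from PIL import Image

from spacy_tokenizer import MultilingualTokenizer
from wordcloud_visualizer import WordcloudVisualizer
# The module paths below are assumptions; adjust them to the actual package layout.
from text_cleaning import TextCleaner, UnicodeNormalization
from spell_checking import SpellChecker

# Assumed fixtures pointing at the plugin's resource folders (placeholder paths)
stopwords_folder_path = "/path/to/stopwords"
dictionary_folder_path = "/path/to/dictionaries"
font_folder_path = "/path/to/fonts"
test_resource_folder_path = "/path/to/test_resources"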
def test_tokenize_df_english():
    input_df = pd.DataFrame({
        "input_text": ["I hope nothing. I fear nothing. I am free. 💩 😂 #OMG"]
    })
    tokenizer = MultilingualTokenizer()
    output_df = tokenizer.tokenize_df(df=input_df,
                                      text_column="input_text",
                                      language="en")
    tokenized_document = output_df[tokenizer.tokenized_column][0]
    assert len(tokenized_document) == 15
def test_tokenize_df_multilingual():
    input_df = pd.DataFrame({
        "input_text": [
            "I hope nothing. I fear nothing. I am free.",
            " Les sanglots longs des violons d'automne",
            "子曰:“學而不思則罔,思而不學則殆。”",
        ],
        "language": ["en", "fr", "zh"],
    })
    tokenizer = MultilingualTokenizer()
    output_df = tokenizer.tokenize_df(df=input_df,
                                      text_column="input_text",
                                      language_column="language")
    tokenized_documents = output_df[tokenizer.tokenized_column]
    tokenized_documents_length = [len(doc) for doc in tokenized_documents]
    assert tokenized_documents_length == [12, 8, 13]
Example #3
def test_tokenize_multilingual():
    input_df = pd.DataFrame({
        "input_text": [
            "I hope nothing. I fear nothing. I am free.",
            " Les sanglots longs des violons d'automne",
            "子曰:“學而不思則罔,思而不學則殆。”",
        ],
        "language": ["en", "fr", "zh"],
    })
    tokenizer = MultilingualTokenizer()
    worcloud_visualizer = WordcloudVisualizer(
        tokenizer=tokenizer,
        text_column="input_text",
        font_folder_path="toto",
        language="language_column",
        language_column="language",
        subchart_column="language",
    )
    frequencies = worcloud_visualizer.tokenize_and_count(input_df)
    assert frequencies == [
        ("en",
         Counter({
             "I": 3,
             "hope": 1,
             "nothing": 2,
             ".": 3,
             "fear": 1,
             "am": 1,
             "free": 1
         })),
        ("fr",
         Counter({
             " ": 1,
             "Les": 1,
             "sanglots": 1,
             "longs": 1,
             "des": 1,
             "violons": 1,
             "d'": 1,
             "automne": 1
         })),
        (
            "zh",
            Counter({
                "子": 1,
                "曰": 1,
                ":": 1,
                "“": 1,
                "學而": 1,
                "不思則": 1,
                "罔": 1,
                ",": 1,
                "思而": 1,
                "不學則": 1,
                "殆": 1,
                "。": 1,
                "”": 1,
            }),
        ),
    ]
def test_clean_df_multilingual():
    input_df = pd.DataFrame(
        {
            "input_text": [
                "I did a 10k run this morning at 6h34 follow me @superRunnerdu95 didn't I?",
                "Nous cherchâmes des informations sur https://www.google.com/ le 03/11/2046 l'aventures",
                "#Barcelona Fútbol es la vida [email protected] ℌ ①",
            ],
            "language": ["en", "fr", "es"],
        }
    )
    token_filters = {"is_stop", "is_measure", "is_datetime", "like_url", "like_email", "is_username", "is_hashtag"}
    text_cleaner = TextCleaner(
        tokenizer=MultilingualTokenizer(stopwords_folder_path=stopwords_folder_path),
        token_filters=token_filters,
        lemmatization=True,
        lowercase=False,
        unicode_normalization=UnicodeNormalization.NFKD,
    )
    output_df = text_cleaner.clean_df(df=input_df, text_column="input_text", language_column="language")
    cleaned_text_column = list(text_cleaner.output_column_descriptions.keys())[0]
    cleaned_texts = output_df[cleaned_text_column].values.tolist()
    expected_cleaned_texts = [
        "run morning follow not ?",
        "chercher information aventurer",
        "Fútbol vida H 1",
    ]
    assert cleaned_texts == expected_cleaned_texts
Example #5
def test_tokenize_and_count_multilingual():
    input_df = pd.DataFrame(
        {
            "input_text": [
                "I hope nothing. I fear Nothing. Nothing. I am free.",
                " Les sanglots longs des violons d'automne",
                "子曰:“學而不思則罔,思而不學則殆。”",
            ],
            "language": ["en", "fr", "zh"],
        }
    )
    tokenizer = MultilingualTokenizer(stopwords_folder_path=stopwords_folder_path)
    worcloud_visualizer = WordcloudVisualizer(
        tokenizer=tokenizer,
        text_column="input_text",
        font_folder_path="toto",
        language="language_column",
        language_column="language",
        subchart_column="language",
        remove_stopwords=True,
        remove_punctuation=True,
        case_insensitive=True,
    )
    frequencies = worcloud_visualizer.tokenize_and_count(input_df)
    assert frequencies == [
        ("en", Counter({"hope": 1, "Nothing": 3, "fear": 1, "free": 1})),
        ("fr", Counter({"sanglots": 1, "longs": 1, "violons": 1, "automne": 1})),
        ("zh", Counter({"子": 1, "曰": 1, "學而": 1, "不思則": 1, "罔": 1, "思而": 1, "不學則": 1}),),
    ]
Example #6
def test_wordcloud_multilingual():
    input_df = pd.DataFrame({
        "input_text": [
            "I hope nothing. I fear nothing. I am free.",
            " Les sanglots longs des violons d'automne",
            "子曰:“學而不思則罔,思而不學則殆。”",
        ],
        "language": ["en", "fr", "zh"],
    })
    tokenizer = MultilingualTokenizer()
    worcloud_visualizer = WordcloudVisualizer(
        tokenizer=tokenizer,
        text_column="input_text",
        font_folder_path=font_folder_path,
        language="language_column",
        language_column="language",
        subchart_column="language",
    )
    frequencies = worcloud_visualizer.tokenize_and_count(input_df)
    num_wordclouds = 0
    for temp, name in worcloud_visualizer.generate_wordclouds(frequencies):
        assert temp is not None
        assert "wordcloud_" in name
        num_wordclouds += 1
    assert num_wordclouds == 3
def test_spellcheck_df_multilingual():
    input_df = pd.DataFrame({
        "input_text": [
            "Can yu read tHISs message despite the horible AB1234 sppeling msitakes 😂 #OMG",
            "Les fautes d'orthografe c pas toop #LOOOL PTDR",
            "Toodo lo que puéde ser covfefe es real.",
        ],
        "language": ["en", "fr", "es"],
    })
    tokenizer = MultilingualTokenizer(
        stopwords_folder_path=stopwords_folder_path)
    spellchecker = SpellChecker(tokenizer=tokenizer,
                                dictionary_folder_path=dictionary_folder_path,
                                custom_vocabulary_set={"PTDR"})
    output_df = spellchecker.check_df(df=input_df,
                                      text_column="input_text",
                                      language_column="language")
    corrected_text_column = list(
        spellchecker.output_column_descriptions.keys())[0]
    corrected_texts = output_df[corrected_text_column].values.tolist()
    expected_corrections = [
        "Can you read tHIS message despite the horrible AB1234 spelling mistakes 😂 #OMG",
        "Les fautes d'orthographe c pas trop #LOOOL PTDR",
        "Todo lo que puede ser covfefe es real.",
    ]
    assert corrected_texts == expected_corrections
Example #8
def test_tokenize_and_count_english():
    input_df = pd.DataFrame({"input_text": ["I hope nothing. I fear nothing. I am free. 💩 😂 #OMG"]})
    tokenizer = MultilingualTokenizer(stopwords_folder_path=stopwords_folder_path)
    worcloud_visualizer = WordcloudVisualizer(
        tokenizer=tokenizer, text_column="input_text", font_folder_path=font_folder_path, language="en"
    )
    frequencies = worcloud_visualizer.tokenize_and_count(input_df)
    assert frequencies == [("", {"hope": 1, "nothing": 2, "fear": 1, "free": 1, "💩": 1, "😂": 1, "#OMG": 1})]
def test_clean_df_english():
    input_df = pd.DataFrame({"input_text": ["Hi, I have two apples costing 3$ 😂    \n and unicode has #snowpersons ☃"]})
    token_filters = {"is_punct", "is_stop", "like_num", "is_symbol", "is_currency", "is_emoji"}
    text_cleaner = TextCleaner(tokenizer=MultilingualTokenizer(), token_filters=token_filters, lemmatization=True)
    output_df = text_cleaner.clean_df(df=input_df, text_column="input_text", language="en")
    cleaned_text_column = list(text_cleaner.output_column_descriptions.keys())[0]
    cleaned_text = output_df[cleaned_text_column][0]
    expected_cleaned_text = "apple cost unicode #snowpersons"
    assert cleaned_text == expected_cleaned_text
Example #10
def test_wordcloud_english():
    input_df = pd.DataFrame({"input_text": ["I hope nothing. I fear nothing. I am free. 💩 😂 #OMG"]})
    tokenizer = MultilingualTokenizer(stopwords_folder_path=stopwords_folder_path)
    worcloud_visualizer = WordcloudVisualizer(
        tokenizer=tokenizer, text_column="input_text", font_folder_path=font_folder_path, language="en"
    )
    frequencies = worcloud_visualizer.tokenize_and_count(input_df)
    for temp, output_file_name in worcloud_visualizer.generate_wordclouds(frequencies):
        assert temp is not None
        assert output_file_name == "wordcloud.png"
Example #11
def test_wordcloud_deterministic():
    reference_test_image = Image.open(os.path.join(test_resource_folder_path, "test_image.png"))
    input_df = pd.DataFrame({"input_text": ["I hope nothing. I fear nothing. I am free. 💩 😂 #OMG"]})
    tokenizer = MultilingualTokenizer(stopwords_folder_path=stopwords_folder_path)
    worcloud_visualizer = WordcloudVisualizer(
        tokenizer=tokenizer,
        text_column="input_text",
        font_folder_path=font_folder_path,
        language="en",
        max_words=10,
        color_list=["#ff0000", "#0000ff", "#008000"],
    )
    frequencies = worcloud_visualizer.tokenize_and_count(input_df)
    for temp, output_file_name in worcloud_visualizer.generate_wordclouds(frequencies):
        generated_test_image = Image.open(temp)
        assert list(generated_test_image.getdata()) == list(reference_test_image.getdata())
def test_spellcheck_df_english():
    input_df = pd.DataFrame({
        "input_text": [
            "Can yu read tHISs message despite the horible AB1234 sppeling msitakes 😂 #OMG"
        ]
    })
    spellchecker = SpellChecker(tokenizer=MultilingualTokenizer(),
                                dictionary_folder_path=dictionary_folder_path)
    output_df = spellchecker.check_df(df=input_df,
                                      text_column="input_text",
                                      language="en")
    corrected_text_column = list(
        spellchecker.output_column_descriptions.keys())[0]
    corrected_text = output_df[corrected_text_column][0]
    expected_correction = "Can you read tHIS message despite the horrible AB1234 spelling mistakes 😂 #OMG"
    assert corrected_text == expected_correction
Example #13
import os
import logging
from time import perf_counter

from spacy_tokenizer import MultilingualTokenizer
from wordcloud_visualizer import WordcloudVisualizer
from plugin_config_loading import load_config_and_data_wordcloud

# Load config
params, df = load_config_and_data_wordcloud()
output_folder = params.output_folder
output_partition_path = params.output_partition_path

# Load wordcloud visualizer
worcloud_visualizer = WordcloudVisualizer(
    tokenizer=MultilingualTokenizer(
        stopwords_folder_path=params.stopwords_folder_path),
    text_column=params.text_column,
    font_folder_path=params.font_folder_path,
    language=params.language,
    language_column=params.language_column,
    subchart_column=params.subchart_column,
    remove_stopwords=params.remove_stopwords,
    remove_punctuation=params.remove_punctuation,
    case_insensitive=params.case_insensitive,
    max_words=params.max_words,
    color_list=params.color_list,
)

# Prepare data and count tokens for each subchart
frequencies = worcloud_visualizer.tokenize_and_count(df)
def test_tokenize_df_long_text():
    input_df = pd.DataFrame({"input_text": ["Long text"]})
    tokenizer = MultilingualTokenizer(max_num_characters=1)
    with pytest.raises(ValueError):
        tokenizer.tokenize_df(df=input_df, text_column="input_text", language="en")
def test_tokenize_df_japanese():
    input_df = pd.DataFrame({"input_text": ["期一会。 異体同心。 そうです。"]})
    tokenizer = MultilingualTokenizer()
    output_df = tokenizer.tokenize_df(df=input_df, text_column="input_text", language="ja")
    tokenized_document = output_df[tokenizer.tokenized_column][0]
    assert len(tokenized_document) == 9
Example #16
import os
from time import perf_counter

from dataiku.customrecipe import get_recipe_resource
from spacy_tokenizer import MultilingualTokenizer
from wordcloud_visualizer import WordcloudVisualizer
from plugin_config_loading import load_plugin_config_wordcloud


# Load config
params = load_plugin_config_wordcloud()
font_folder_path = os.path.join(get_recipe_resource(), "fonts")
output_folder = params["output_folder"]
output_partition_path = params["output_partition_path"]
df = params["df"]

# Load wordcloud visualizer
worcloud_visualizer = WordcloudVisualizer(
    tokenizer=MultilingualTokenizer(),
    text_column=params["text_column"],
    font_folder_path=font_folder_path,
    language=params["language"],
    language_column=params["language_column"],
    subchart_column=params["subchart_column"],
)

# Prepare data and count tokens for each subchart
frequencies = worcloud_visualizer.tokenize_and_count(df)

# Clear output folder's target partition
output_folder.delete_path(output_partition_path)

# Save wordclouds to folder
start = perf_counter()
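The recipe script is cut off just after the timer starts. Below is a minimal sketch of how the saving loop could continue, assuming the managed folder exposes an upload_stream method alongside the delete_path call used above; the real recipe's file handling and logging may differ.

# Hypothetical continuation: stream each generated wordcloud into the output folder
for temp, output_file_name in worcloud_visualizer.generate_wordclouds(frequencies):
    output_folder.upload_stream(os.path.join(output_partition_path, output_file_name), temp)
print(f"Saved wordclouds in {perf_counter() - start:.2f} seconds")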