import os
from collections import Counter

import pandas as pd
import pytest
from PIL import Image

from spacy_tokenizer import MultilingualTokenizer
from wordcloud_visualizer import WordcloudVisualizer

# Note: the module paths for TextCleaner, UnicodeNormalization and SpellChecker
# below are assumptions; adjust them to match the plugin's actual layout.
from text_cleaner import TextCleaner, UnicodeNormalization
from spell_checker import SpellChecker

# Resource paths (stopwords_folder_path, dictionary_folder_path,
# font_folder_path, test_resource_folder_path) are assumed to be defined as
# module-level constants or fixtures elsewhere in the test suite.


def test_tokenize_df_english():
    input_df = pd.DataFrame({"input_text": ["I hope nothing. I fear nothing. I am free. 💩 😂 #OMG"]})
    tokenizer = MultilingualTokenizer()
    output_df = tokenizer.tokenize_df(df=input_df, text_column="input_text", language="en")
    tokenized_document = output_df[tokenizer.tokenized_column][0]
    assert len(tokenized_document) == 15
def test_tokenize_df_multilingual():
    input_df = pd.DataFrame(
        {
            "input_text": [
                "I hope nothing. I fear nothing. I am free.",
                " Les sanglots longs des violons d'automne",
                "子曰:“學而不思則罔,思而不學則殆。”",
            ],
            "language": ["en", "fr", "zh"],
        }
    )
    tokenizer = MultilingualTokenizer()
    output_df = tokenizer.tokenize_df(df=input_df, text_column="input_text", language_column="language")
    tokenized_documents = output_df[tokenizer.tokenized_column]
    tokenized_documents_length = [len(doc) for doc in tokenized_documents]
    assert tokenized_documents_length == [12, 8, 13]
def test_tokenize_multilingual():
    input_df = pd.DataFrame(
        {
            "input_text": [
                "I hope nothing. I fear nothing. I am free.",
                " Les sanglots longs des violons d'automne",
                "子曰:“學而不思則罔,思而不學則殆。”",
            ],
            "language": ["en", "fr", "zh"],
        }
    )
    tokenizer = MultilingualTokenizer()
    wordcloud_visualizer = WordcloudVisualizer(
        tokenizer=tokenizer,
        text_column="input_text",
        font_folder_path="toto",
        language="language_column",
        language_column="language",
        subchart_column="language",
    )
    frequencies = wordcloud_visualizer.tokenize_and_count(input_df)
    assert frequencies == [
        ("en", Counter({"I": 3, "hope": 1, "nothing": 2, ".": 3, "fear": 1, "am": 1, "free": 1})),
        ("fr", Counter({" ": 1, "Les": 1, "sanglots": 1, "longs": 1, "des": 1, "violons": 1, "d'": 1, "automne": 1})),
        (
            "zh",
            Counter(
                {
                    "子": 1,
                    "曰": 1,
                    ":": 1,
                    "“": 1,
                    "學而": 1,
                    "不思則": 1,
                    "罔": 1,
                    ",": 1,
                    "思而": 1,
                    "不學則": 1,
                    "殆": 1,
                    "。": 1,
                    "”": 1,
                }
            ),
        ),
    ]
def test_clean_df_multilingual():
    input_df = pd.DataFrame(
        {
            "input_text": [
                "I did a 10k run this morning at 6h34 follow me @superRunnerdu95 didn't I?",
                "Nous cherchâmes des informations sur https://www.google.com/ le 03/11/2046 l'aventures",
                "#Barcelona Fútbol es la vida [email protected] ℌ ①",
            ],
            "language": ["en", "fr", "es"],
        }
    )
    token_filters = {"is_stop", "is_measure", "is_datetime", "like_url", "like_email", "is_username", "is_hashtag"}
    text_cleaner = TextCleaner(
        tokenizer=MultilingualTokenizer(stopwords_folder_path=stopwords_folder_path),
        token_filters=token_filters,
        lemmatization=True,
        lowercase=False,
        unicode_normalization=UnicodeNormalization.NFKD,
    )
    output_df = text_cleaner.clean_df(df=input_df, text_column="input_text", language_column="language")
    cleaned_text_column = list(text_cleaner.output_column_descriptions.keys())[0]
    cleaned_texts = output_df[cleaned_text_column].values.tolist()
    expected_cleaned_texts = [
        "run morning follow not ?",
        "chercher information aventurer",
        "Fútbol vida H 1",
    ]
    assert cleaned_texts == expected_cleaned_texts
def test_tokenize_and_count_multilingual():
    input_df = pd.DataFrame(
        {
            "input_text": [
                "I hope nothing. I fear Nothing. Nothing. I am free.",
                " Les sanglots longs des violons d'automne",
                "子曰:“學而不思則罔,思而不學則殆。”",
            ],
            "language": ["en", "fr", "zh"],
        }
    )
    tokenizer = MultilingualTokenizer(stopwords_folder_path=stopwords_folder_path)
    wordcloud_visualizer = WordcloudVisualizer(
        tokenizer=tokenizer,
        text_column="input_text",
        font_folder_path="toto",
        language="language_column",
        language_column="language",
        subchart_column="language",
        remove_stopwords=True,
        remove_punctuation=True,
        case_insensitive=True,
    )
    frequencies = wordcloud_visualizer.tokenize_and_count(input_df)
    assert frequencies == [
        ("en", Counter({"hope": 1, "Nothing": 3, "fear": 1, "free": 1})),
        ("fr", Counter({"sanglots": 1, "longs": 1, "violons": 1, "automne": 1})),
        ("zh", Counter({"子": 1, "曰": 1, "學而": 1, "不思則": 1, "罔": 1, "思而": 1, "不學則": 1})),
    ]
def test_wordcloud_multilingual():
    input_df = pd.DataFrame(
        {
            "input_text": [
                "I hope nothing. I fear nothing. I am free.",
                " Les sanglots longs des violons d'automne",
                "子曰:“學而不思則罔,思而不學則殆。”",
            ],
            "language": ["en", "fr", "zh"],
        }
    )
    tokenizer = MultilingualTokenizer()
    wordcloud_visualizer = WordcloudVisualizer(
        tokenizer=tokenizer,
        text_column="input_text",
        font_folder_path=font_folder_path,
        language="language_column",
        language_column="language",
        subchart_column="language",
    )
    frequencies = wordcloud_visualizer.tokenize_and_count(input_df)
    num_wordclouds = 0
    for temp, name in wordcloud_visualizer.generate_wordclouds(frequencies):
        assert temp is not None
        assert "wordcloud_" in name
        num_wordclouds += 1
    assert num_wordclouds == 3
def test_spellcheck_df_multilingual():
    input_df = pd.DataFrame(
        {
            "input_text": [
                "Can yu read tHISs message despite the horible AB1234 sppeling msitakes 😂 #OMG",
                "Les fautes d'orthografe c pas toop #LOOOL PTDR",
                "Toodo lo que puéde ser covfefe es real.",
            ],
            "language": ["en", "fr", "es"],
        }
    )
    tokenizer = MultilingualTokenizer(stopwords_folder_path=stopwords_folder_path)
    spellchecker = SpellChecker(
        tokenizer=tokenizer,
        dictionary_folder_path=dictionary_folder_path,
        custom_vocabulary_set={"PTDR"},
    )
    output_df = spellchecker.check_df(df=input_df, text_column="input_text", language_column="language")
    corrected_text_column = list(spellchecker.output_column_descriptions.keys())[0]
    corrected_texts = output_df[corrected_text_column].values.tolist()
    expected_corrections = [
        "Can you read tHIS message despite the horrible AB1234 spelling mistakes 😂 #OMG",
        "Les fautes d'orthographe c pas trop #LOOOL PTDR",
        "Todo lo que puede ser covfefe es real.",
    ]
    assert corrected_texts == expected_corrections
def test_tokenize_and_count_english():
    input_df = pd.DataFrame({"input_text": ["I hope nothing. I fear nothing. I am free. 💩 😂 #OMG"]})
    tokenizer = MultilingualTokenizer(stopwords_folder_path=stopwords_folder_path)
    wordcloud_visualizer = WordcloudVisualizer(
        tokenizer=tokenizer,
        text_column="input_text",
        font_folder_path=font_folder_path,
        language="en",
    )
    frequencies = wordcloud_visualizer.tokenize_and_count(input_df)
    assert frequencies == [("", {"hope": 1, "nothing": 2, "fear": 1, "free": 1, "💩": 1, "😂": 1, "#OMG": 1})]
def test_clean_df_english():
    input_df = pd.DataFrame(
        {"input_text": ["Hi, I have two apples costing 3$ 😂 \n and unicode has #snowpersons ☃"]}
    )
    token_filters = {"is_punct", "is_stop", "like_num", "is_symbol", "is_currency", "is_emoji"}
    text_cleaner = TextCleaner(tokenizer=MultilingualTokenizer(), token_filters=token_filters, lemmatization=True)
    output_df = text_cleaner.clean_df(df=input_df, text_column="input_text", language="en")
    cleaned_text_column = list(text_cleaner.output_column_descriptions.keys())[0]
    cleaned_text = output_df[cleaned_text_column][0]
    expected_cleaned_text = "apple cost unicode #snowpersons"
    assert cleaned_text == expected_cleaned_text
def test_wordcloud_english():
    input_df = pd.DataFrame({"input_text": ["I hope nothing. I fear nothing. I am free. 💩 😂 #OMG"]})
    tokenizer = MultilingualTokenizer(stopwords_folder_path=stopwords_folder_path)
    wordcloud_visualizer = WordcloudVisualizer(
        tokenizer=tokenizer,
        text_column="input_text",
        font_folder_path=font_folder_path,
        language="en",
    )
    frequencies = wordcloud_visualizer.tokenize_and_count(input_df)
    for temp, output_file_name in wordcloud_visualizer.generate_wordclouds(frequencies):
        assert temp is not None
        assert output_file_name == "wordcloud.png"
def test_wordcloud_deterministic():
    reference_test_image = Image.open(os.path.join(test_resource_folder_path, "test_image.png"))
    input_df = pd.DataFrame({"input_text": ["I hope nothing. I fear nothing. I am free. 💩 😂 #OMG"]})
    tokenizer = MultilingualTokenizer(stopwords_folder_path=stopwords_folder_path)
    wordcloud_visualizer = WordcloudVisualizer(
        tokenizer=tokenizer,
        text_column="input_text",
        font_folder_path=font_folder_path,
        language="en",
        max_words=10,
        color_list=["#ff0000", "#0000ff", "#008000"],
    )
    frequencies = wordcloud_visualizer.tokenize_and_count(input_df)
    for temp, output_file_name in wordcloud_visualizer.generate_wordclouds(frequencies):
        generated_test_image = Image.open(temp)
        assert list(generated_test_image.getdata()) == list(reference_test_image.getdata())
def test_spellcheck_df_english():
    input_df = pd.DataFrame(
        {"input_text": ["Can yu read tHISs message despite the horible AB1234 sppeling msitakes 😂 #OMG"]}
    )
    spellchecker = SpellChecker(tokenizer=MultilingualTokenizer(), dictionary_folder_path=dictionary_folder_path)
    output_df = spellchecker.check_df(df=input_df, text_column="input_text", language="en")
    corrected_text_column = list(spellchecker.output_column_descriptions.keys())[0]
    corrected_text = output_df[corrected_text_column][0]
    expected_correction = "Can you read tHIS message despite the horrible AB1234 spelling mistakes 😂 #OMG"
    assert corrected_text == expected_correction
import os
import logging
from time import perf_counter

from spacy_tokenizer import MultilingualTokenizer
from wordcloud_visualizer import WordcloudVisualizer
from plugin_config_loading import load_config_and_data_wordcloud

# Load config
params, df = load_config_and_data_wordcloud()
output_folder = params.output_folder
output_partition_path = params.output_partition_path

# Load wordcloud visualizer
wordcloud_visualizer = WordcloudVisualizer(
    tokenizer=MultilingualTokenizer(stopwords_folder_path=params.stopwords_folder_path),
    text_column=params.text_column,
    font_folder_path=params.font_folder_path,
    language=params.language,
    language_column=params.language_column,
    subchart_column=params.subchart_column,
    remove_stopwords=params.remove_stopwords,
    remove_punctuation=params.remove_punctuation,
    case_insensitive=params.case_insensitive,
    max_words=params.max_words,
    color_list=params.color_list,
)

# Prepare data and count tokens for each subchart
frequencies = wordcloud_visualizer.tokenize_and_count(df)
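# The script stops short of writing its results. Below is a minimal sketch of
# the remaining steps, mirroring the clear-and-save flow of the sibling recipe
# further down; Folder.upload_stream() is a standard Dataiku API call, but its
# use here is an assumption, not confirmed plugin code.

# Clear output folder's target partition
output_folder.delete_path(output_partition_path)

# Save wordclouds to folder, timing the whole loop
start = perf_counter()
for temp, output_file_name in wordcloud_visualizer.generate_wordclouds(frequencies):
    # Each iteration yields an in-memory image buffer and its target file name
    output_folder.upload_stream(os.path.join(output_partition_path, output_file_name), temp)
logging.info(f"Saving wordclouds to folder: done in {perf_counter() - start:.2f} seconds")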
def test_tokenize_df_long_text():
    input_df = pd.DataFrame({"input_text": ["Long text"]})
    tokenizer = MultilingualTokenizer(max_num_characters=1)
    with pytest.raises(ValueError):
        tokenizer.tokenize_df(df=input_df, text_column="input_text", language="en")
def test_tokenize_df_japanese():
    input_df = pd.DataFrame({"input_text": ["期一会。 異体同心。 そうです。"]})
    tokenizer = MultilingualTokenizer()
    output_df = tokenizer.tokenize_df(df=input_df, text_column="input_text", language="ja")
    tokenized_document = output_df[tokenizer.tokenized_column][0]
    assert len(tokenized_document) == 9
import os
import logging
from time import perf_counter

from dataiku.customrecipe import get_recipe_resource
from spacy_tokenizer import MultilingualTokenizer
from wordcloud_visualizer import WordcloudVisualizer
from plugin_config_loading import load_plugin_config_wordcloud

# Load config
params = load_plugin_config_wordcloud()
font_folder_path = os.path.join(get_recipe_resource(), "fonts")
output_folder = params["output_folder"]
output_partition_path = params["output_partition_path"]
df = params["df"]

# Load wordcloud visualizer
wordcloud_visualizer = WordcloudVisualizer(
    tokenizer=MultilingualTokenizer(),
    text_column=params["text_column"],
    font_folder_path=font_folder_path,
    language=params["language"],
    language_column=params["language_column"],
    subchart_column=params["subchart_column"],
)

# Prepare data and count tokens for each subchart
frequencies = wordcloud_visualizer.tokenize_and_count(df)

# Clear output folder's target partition
output_folder.delete_path(output_partition_path)

# Save wordclouds to folder
start = perf_counter()
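# Hedged completion of the save loop started above: the original snippet ends
# right after start = perf_counter(). As in the earlier sketch, the
# Folder.upload_stream() call and the log message are assumptions rather than
# confirmed plugin code.
for temp, output_file_name in wordcloud_visualizer.generate_wordclouds(frequencies):
    # Write each generated image buffer under the target partition path
    output_folder.upload_stream(os.path.join(output_partition_path, output_file_name), temp)
logging.info(f"Saving wordclouds to folder: done in {perf_counter() - start:.2f} seconds")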