def load_config_and_data_wordcloud() -> Tuple[PluginParams, pd.DataFrame]:
    """Utility function to:
        - Validate and load wordcloud parameters into a clean class
        - Validate input data, keep only necessary columns and drop invalid rows

    Returns:
        - Class instance with parameter names as attributes and associated values
        - Pandas DataFrame with necessary input data
    """
    params = PluginParams()

    # Input dataset
    input_dataset_names = get_input_names_for_role("input_dataset")
    if len(input_dataset_names) != 1:
        raise PluginParamValidationError("Please specify one input dataset")
    input_dataset = dataiku.Dataset(input_dataset_names[0])
    input_dataset_columns = [p["name"] for p in input_dataset.read_schema()]

    # Output folder
    output_folder_names = get_output_names_for_role("output_folder")
    if len(output_folder_names) != 1:
        raise PluginParamValidationError("Please specify one output folder")
    params.output_folder = dataiku.Folder(output_folder_names[0])

    # Partition handling
    params.output_partition_path = get_folder_partition_root(params.output_folder)

    # Recipe parameters
    recipe_config = get_recipe_config()

    # Text column
    if recipe_config.get("text_column") not in input_dataset_columns:
        raise PluginParamValidationError(f"Invalid text column selection: {recipe_config.get('text_column')}")
    params.text_column = recipe_config.get("text_column")
    logging.info(f"Text column: {params.text_column}")

    # Language selection
    if recipe_config.get("language") == "language_column":
        if recipe_config.get("language_column") not in input_dataset_columns:
            raise PluginParamValidationError(
                f"Invalid language column selection: {recipe_config.get('language_column')}"
            )
        params.language = recipe_config.get("language")
        params.language_column = recipe_config.get("language_column")
        logging.info(f"Language column: {params.language_column}")
    else:
        if not recipe_config.get("language"):
            raise PluginParamValidationError("Empty language selection")
        if recipe_config.get("language") not in SUPPORTED_LANGUAGES_SPACY:
            raise PluginParamValidationError(f"Unsupported language code: {recipe_config.get('language')}")
        params.language = recipe_config.get("language")
        params.language_column = None
        logging.info(f"Language: {params.language}")

    # Subcharts
    subchart_column = recipe_config.get("subchart_column")
    # If parameter is saved then cleared, config retrieves ""
    subchart_column = None if not subchart_column else subchart_column
    if subchart_column and (subchart_column not in input_dataset_columns + ["order66"]):
        raise PluginParamValidationError(f"Invalid categorical column selection: {subchart_column}")
    params.subchart_column = subchart_column
    logging.info(f"Subcharts column: {params.subchart_column}")

    # Input dataframe
    necessary_columns = [
        column
        for column in set([params.text_column, params.language_column, params.subchart_column])
        if column not in [None, "order66"]
    ]
    df = input_dataset.get_dataframe(columns=necessary_columns).dropna(subset=necessary_columns)
    if df.empty:
        raise PluginParamValidationError("Dataframe is empty")
    # Check for unsupported languages in the multilingual case
    elif params.language_column:
        languages = set(df[params.language_column].unique())
        unsupported_lang = languages - SUPPORTED_LANGUAGES_SPACY.keys()
        if unsupported_lang:
            raise PluginParamValidationError(
                f"Found {len(unsupported_lang)} unsupported languages: {', '.join(sorted(unsupported_lang))}"
            )
    logging.info(f"Read dataset of shape: {df.shape}")

    # Text simplification parameters
    params.remove_stopwords = recipe_config.get("remove_stopwords")
    params.stopwords_folder_path = (
        os.path.join(get_recipe_resource(), "stopwords") if params.remove_stopwords else None
    )
    params.font_folder_path = os.path.join(get_recipe_resource(), "fonts")
    params.remove_punctuation = recipe_config.get("remove_punctuation")
    params.case_insensitive = recipe_config.get("case_insensitive")
    logging.info(f"Remove stopwords: {params.remove_stopwords}")
    logging.info(f"Stopwords folder path: {params.stopwords_folder_path}")
    logging.info(f"Fonts folder path: {params.font_folder_path}")
    logging.info(f"Remove punctuation: {params.remove_punctuation}")
    logging.info(f"Case-insensitive: {params.case_insensitive}")

    # Display parameters
    max_words = recipe_config.get("max_words")
    if not isinstance(max_words, int) or max_words < 1:
        raise PluginParamValidationError("Maximum number of words is not a positive integer")
    params.max_words = max_words
    logging.info(f"Max number of words: {params.max_words}")

    color_palette = recipe_config.get("color_palette")
    if not color_palette:
        raise PluginParamValidationError("Empty color palette selection")
    if color_palette == "custom":
        color_list = recipe_config.get("color_list")
        if not (isinstance(color_list, list) and len(color_list) >= 1):
            raise PluginParamValidationError("Empty custom palette")
        if not all(matplotlib.colors.is_color_like(color) for color in color_list):
            raise PluginParamValidationError(f"Invalid custom palette: {color_list}")
        params.color_list = [matplotlib.colors.to_hex(color) for color in color_list]
        logging.info(f"Custom palette: {params.color_list}")
    else:
        if color_palette not in {builtin_palette["id"] for builtin_palette in DSS_BUILTIN_COLOR_PALETTES}:
            raise PluginParamValidationError(f"Unsupported color palette: {color_palette}")
        selected_palette_dict = next(
            builtin_palette
            for builtin_palette in DSS_BUILTIN_COLOR_PALETTES
            if builtin_palette["id"] == color_palette
        )
        params.color_list = selected_palette_dict["colors"]
        logging.info(
            f"Using built-in DSS palette: '{selected_palette_dict['name']}' with colors: {params.color_list}"
        )

    return params, df
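# The loader above fills a PluginParams instance and raises PluginParamValidationError,
# neither of which is defined in this section. Below is a minimal sketch of what they
# might look like; the attribute names are inferred from the assignments above, and the
# defaults only document the expected types (this is an assumption, not the plugin's
# actual definition).
from typing import List, Optional


class PluginParamValidationError(ValueError):
    """Raised when the recipe parameters entered by the user are invalid."""


class PluginParams:
    """Plain holder for validated wordcloud parameters (hypothetical sketch)."""

    def __init__(self):
        self.output_folder: Optional[dataiku.Folder] = None
        self.output_partition_path: str = ""
        self.text_column: str = ""
        self.language: str = ""
        self.language_column: Optional[str] = None
        self.subchart_column: Optional[str] = None
        self.remove_stopwords: bool = False
        self.stopwords_folder_path: Optional[str] = None
        self.font_folder_path: str = ""
        self.remove_punctuation: bool = False
        self.case_insensitive: bool = False
        self.max_words: int = 100
        self.color_list: List[str] = []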
text_column_name = recipe_config.get("text_column_name", None)
if text_column_name is None:
    raise ValueError("You did not choose a text column.")
predict_polarity = bool(recipe_config.get("predict_polarity", True))
output_probabilities = bool(recipe_config.get("output_confidence", False))

#############################
# Load FastText Model
#############################

model = load_model(
    os.path.join(
        get_recipe_resource(),
        "fasttext",
        "sentiment_analysis",
        "amazon_review_polarity.ftz" if predict_polarity else "amazon_review_full.ftz",
    )
)

#############################
# Score
#############################

CHUNK_SIZE = 10000

# Output dataset
dataset_name = get_output_names_for_role("output_dataset")[0]
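# The fragment stops after defining CHUNK_SIZE and resolving the output dataset
# name. A minimal sketch of how the chunked scoring loop could continue, assuming
# an `input_dataset` handle obtained earlier in the recipe and hypothetical output
# column names; the Dataset methods used (iter_dataframes, write_schema_from_dataframe,
# get_writer, write_dataframe) are standard dataiku API.
import numpy as np

output_dataset = dataiku.Dataset(dataset_name)
writer = None
# Stream the input in chunks of CHUNK_SIZE rows to bound memory usage
for chunk_df in input_dataset.iter_dataframes(chunksize=CHUNK_SIZE):
    texts = [str(t) for t in chunk_df[text_column_name].values]
    labels, confidences = model.predict(texts)
    # fastText labels look like "__label__2"; keep the trailing integer
    chunk_df["predicted_sentiment"] = [int(label[0].split("__")[-1]) for label in labels]
    if output_probabilities:
        chunk_df["prediction_confidence"] = np.array(confidences).ravel()
    if writer is None:
        # Set the output schema from the first scored chunk, then open the writer
        output_dataset.write_schema_from_dataframe(chunk_df)
        writer = output_dataset.get_writer()
    writer.write_dataframe(chunk_df)
if writer is not None:
    writer.close()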
# -*- coding: utf-8 -*-
import logging
import os
from time import perf_counter

from dataiku.customrecipe import get_recipe_resource

from spacy_tokenizer import MultilingualTokenizer
from wordcloud_visualizer import WordcloudVisualizer
from plugin_config_loading import load_plugin_config_wordcloud

# Load config
params = load_plugin_config_wordcloud()
font_folder_path = os.path.join(get_recipe_resource(), "fonts")
output_folder = params["output_folder"]
output_partition_path = params["output_partition_path"]
df = params["df"]

# Load wordcloud visualizer
wordcloud_visualizer = WordcloudVisualizer(
    tokenizer=MultilingualTokenizer(),
    text_column=params["text_column"],
    font_folder_path=font_folder_path,
    language=params["language"],
    language_column=params["language_column"],
    subchart_column=params["subchart_column"],
)

# Prepare data and count tokens for each subchart
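# The script ends at the preparation comment. A sketch of how the remaining steps
# might look, assuming WordcloudVisualizer exposes tokenize_and_count and
# generate_wordclouds methods (hypothetical names inferred from the comment above);
# Folder.upload_data is standard dataiku API.
start = perf_counter()
frequencies = wordcloud_visualizer.tokenize_and_count(df)  # hypothetical method name
for image_bytes, output_file_name in wordcloud_visualizer.generate_wordclouds(frequencies):  # hypothetical
    output_folder.upload_data(os.path.join(output_partition_path, output_file_name), image_bytes)
logging.info(f"Wordclouds saved to folder in {perf_counter() - start:.2f} seconds")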
"""Module with utility functions to annotate images""" import os from typing import List, AnyStr import numpy as np from dataiku.customrecipe import get_recipe_resource from PIL import Image, ImageFont, ImageDraw from io import BytesIO # ============================================================================== # CONSTANT DEFINITION # ============================================================================== BOUNDING_BOX_COLOR = "red" BOUNDING_BOX_FONT_PATH = os.path.join(get_recipe_resource(), "SourceSansPro-Regular.ttf") BOUNDING_BOX_FONT_DEFAULT_SIZE = 18 # ============================================================================== # CLASS AND FUNCTION DEFINITION # ============================================================================== def save_image_bytes(pil_image: Image, path: AnyStr) -> bytes: image_bytes = BytesIO() file_extension = path.split(".")[-1].upper() if file_extension in {"JPG", "JPEG"}: pil_image.save(image_bytes, format="JPEG", quality=100,
def load_plugin_config_cleaning() -> Dict:
    """Utility function to validate and load text cleaning parameters into a clean dictionary

    Returns:
        Dictionary of parameter names (key) and values
    """
    params = {}
    recipe_config = get_recipe_config()

    # Input dataset
    input_dataset_names = get_input_names_for_role("input_dataset")
    if len(input_dataset_names) == 0:
        raise PluginParamValidationError("Please specify input dataset")
    params["input_dataset"] = dataiku.Dataset(input_dataset_names[0])
    input_dataset_columns = [p["name"] for p in params["input_dataset"].read_schema()]

    # Output dataset
    output_dataset_names = get_output_names_for_role("output_dataset")
    if len(output_dataset_names) == 0:
        raise PluginParamValidationError("Please specify output dataset")
    params["output_dataset"] = dataiku.Dataset(output_dataset_names[0])

    # Path to the folder of stopwords
    params["stopwords_folder_path"] = os.path.join(get_recipe_resource(), "stopwords")

    # Text column selection
    params["text_column"] = recipe_config.get("text_column")
    logging.info(f"Text column: {params['text_column']}")
    if params["text_column"] not in input_dataset_columns:
        raise PluginParamValidationError(f"Invalid text column selection: {params['text_column']}")

    # Language selection
    params["language"] = recipe_config.get("language")
    if params["language"] == "language_column":
        params["language_column"] = recipe_config.get("language_column")
        if params["language_column"] not in input_dataset_columns:
            raise PluginParamValidationError(f"Invalid language column selection: {params['language_column']}")
        logging.info(f"Language column: {params['language_column']}")
    else:
        if not params["language"]:
            raise PluginParamValidationError("Empty language selection")
        if params["language"] not in SUPPORTED_LANGUAGES_SPACY:
            raise PluginParamValidationError(f"Unsupported language code: {params['language']}")
        params["language_column"] = ""
        logging.info(f"Language: {params['language']}")

    # Cleaning parameters
    params["token_filters"] = set(recipe_config.get("token_filters", []))
    available_token_filters = set(MultilingualTokenizer.DEFAULT_FILTER_TOKEN_ATTRIBUTES.keys())
    if not params["token_filters"] <= available_token_filters:
        raise PluginParamValidationError(f"Invalid token filters: {params['token_filters'] - available_token_filters}")
    logging.info(f"Token filters: {params['token_filters']}")
    if params["language"] == "language_column":
        params["lemmatization"] = bool(recipe_config.get("lemmatization_multilingual"))
    else:
        params["lemmatization"] = bool(recipe_config.get("lemmatization"))
    logging.info(f"Lemmatization: {params['lemmatization']}")
    params["lowercase"] = bool(recipe_config.get("lowercase"))
    logging.info(f"Lowercase: {params['lowercase']}")

    # Expert mode
    if recipe_config.get("expert"):
        logging.info("Expert mode is enabled")
    else:
        logging.info("Expert mode is disabled")
    params["unicode_normalization"] = UnicodeNormalization[recipe_config.get("unicode_normalization")]
    logging.info(f"Unicode normalization: {params['unicode_normalization']}")
    params["keep_filtered_tokens"] = bool(recipe_config.get("keep_filtered_tokens"))
    logging.info(f"Keep filtered tokens: {params['keep_filtered_tokens']}")

    return params
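# The loader above indexes a UnicodeNormalization enum that is not defined in this
# section. A minimal assumed sketch of its shape; the member names are a guess based
# on the standard unicodedata normalization forms, not the plugin's actual definition.
from enum import Enum


class UnicodeNormalization(Enum):
    NONE = "NONE"
    NFC = "NFC"
    NFD = "NFD"
    NFKC = "NFKC"
    NFKD = "NFKD"


# Usage matching the loader: UnicodeNormalization["NFC"] looks a member up by name.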
def load_plugin_config_spellchecker() -> Dict:
    """Utility function to validate and load spell checker parameters into a clean dictionary

    Returns:
        Dictionary of parameter names (key) and values
    """
    params = {}
    recipe_config = get_recipe_config()

    # Input dataset
    input_dataset_names = get_input_names_for_role("input_dataset")
    if len(input_dataset_names) == 0:
        raise PluginParamValidationError("Please specify input dataset")
    params["input_dataset"] = dataiku.Dataset(input_dataset_names[0])
    input_dataset_columns = [p["name"] for p in params["input_dataset"].read_schema()]

    # Output dataset
    output_dataset_names = get_output_names_for_role("output_dataset")
    if len(output_dataset_names) == 0:
        raise PluginParamValidationError("Please specify output dataset")
    params["output_dataset"] = dataiku.Dataset(output_dataset_names[0])

    # Custom vocabulary (optional input dataset)
    params["custom_vocabulary_set"] = set()
    custom_vocabulary_input = get_input_names_for_role("custom_vocabulary")
    if len(custom_vocabulary_input) != 0:
        custom_vocabulary_dataset = dataiku.Dataset(custom_vocabulary_input[0])
        params["custom_vocabulary_set"] = custom_vocabulary_checker(custom_vocabulary_dataset)
    logging.info(f"Custom vocabulary set: {params['custom_vocabulary_set']}")

    # Custom corrections (optional input dataset)
    params["custom_corrections"] = {}
    custom_corrections_input = get_input_names_for_role("custom_corrections")
    if len(custom_corrections_input) != 0:
        custom_corrections_dataset = dataiku.Dataset(custom_corrections_input[0])
        params["custom_corrections"] = custom_corrections_checker(custom_corrections_dataset)
    logging.info(f"Custom corrections: {params['custom_corrections']}")

    # Diagnosis dataset (optional output dataset)
    diagnosis_dataset_names = get_output_names_for_role("diagnosis_dataset")
    params["diagnosis_dataset"] = None
    params["compute_diagnosis"] = False
    if len(diagnosis_dataset_names) != 0:
        logging.info("Spellchecker diagnosis will be computed")
        params["compute_diagnosis"] = True
        params["diagnosis_dataset"] = dataiku.Dataset(diagnosis_dataset_names[0])
    else:
        logging.info("Spellchecker diagnosis will not be computed")

    # Path to the folder of stopwords
    params["stopwords_folder_path"] = os.path.join(get_recipe_resource(), "stopwords")
    # Path to the folder of dictionaries
    params["dictionary_folder_path"] = os.path.join(get_recipe_resource(), "dictionaries")

    # Text column selection
    params["text_column"] = recipe_config.get("text_column")
    logging.info(f"Text column: {params['text_column']}")
    if params["text_column"] not in input_dataset_columns:
        raise PluginParamValidationError(f"Invalid text column selection: {params['text_column']}")

    # Language selection
    params["language"] = recipe_config.get("language")
    if params["language"] == "language_column":
        params["language_column"] = recipe_config.get("language_column")
        if params["language_column"] not in input_dataset_columns:
            raise PluginParamValidationError(f"Invalid language column selection: {params['language_column']}")
        logging.info(f"Language column: {params['language_column']}")
    else:
        if not params["language"]:
            raise PluginParamValidationError("Empty language selection")
        if params["language"] not in SUPPORTED_LANGUAGES_SYMSPELL:
            raise PluginParamValidationError(f"Unsupported language code: {params['language']}")
        params["language_column"] = ""
        logging.info(f"Language: {params['language']}")

    # Expert mode
    if recipe_config.get("expert"):
        logging.info("Expert mode is enabled")
    else:
        logging.info("Expert mode is disabled")

    # Edit distance
    params["edit_distance"] = recipe_config.get("edit_distance")
    if params["edit_distance"] < 2 or params["edit_distance"] > 100:
        raise PluginParamValidationError("Edit distance must be between 2 and 100")
    logging.info(f"Maximum edit distance: {params['edit_distance']}")

    # Ignore token regex
    if len(recipe_config.get("ignore_word_regex")) == 0:
        logging.info("No regular expression for words not to be corrected")
        params["ignore_word_regex"] = None  # symspellpy wants None
    else:
        params["ignore_word_regex"] = recipe_config.get("ignore_word_regex")
        # Check for valid regex
        try:
            ignore_token_compiled = re.compile(params["ignore_word_regex"])
        except re.error as e:
            raise PluginParamValidationError(f"Ignore pattern parameter is not a valid regex: {e}")
        params["ignore_word_regex"] = ignore_token_compiled.pattern
        logging.info(f"Regular expression for words not to be corrected: {params['ignore_word_regex']}")

    return params
if text_column_name is None:
    raise ValueError("You did not choose a text column.")
texts = df[text_column_name].apply(lambda s: clean_text(str(s)).decode("utf-8")).values
text_language = "english"
output_probabilities = bool(recipe_config.get("output_confidence", False))

#############################
# Load Models
#############################

en_model = load_model(
    os.path.join(get_recipe_resource(), "fasttext", "sentiment_analysis", "amazon_review_polarity.ftz")
)

#############################
# Score
#############################

model = en_model
predicted_polarities, confidence_list = model.predict(list(texts))
predicted_polarities = np.array([int(v[0].split("__")[-1]) for v in predicted_polarities])
if text_language == "english":
    predicted_polarities += -1  # English model predicts 1/2 instead of 0/1
confidence_list = confidence_list.ravel()
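# The fragment ends before the predictions are written anywhere. A minimal assumed
# sketch of how they could be attached to the input dataframe and written out; the
# output column names are hypothetical, while Dataset.write_with_schema is standard
# dataiku API.
df["predicted_polarity"] = predicted_polarities
if output_probabilities:
    df["prediction_confidence"] = confidence_list

output_dataset = dataiku.Dataset(get_output_names_for_role("output_dataset")[0])
output_dataset.write_with_schema(df)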