def _prepare_df_for_cleaning(
    self, df: pd.DataFrame, text_column: AnyStr, language_column: AnyStr, language: AnyStr
) -> None:
    """Private method to prepare a pandas DataFrame in-place before feeding it to the `self.clean_df` method

    Tokenizes the content of the text column into a new column containing spaCy documents
    Adds new columns to hold the future outputs of the cleaner method

    Args:
        df: Input pandas DataFrame
        text_column: Name of the column containing text data
        language_column: Name of the column with language codes in ISO 639-1 format
        language: Language code in ISO 639-1 format
            If equal to "language_column", this parameter is ignored in favor of language_column

    """
    self.output_column_descriptions = {}
    for k, v in self.OUTPUT_COLUMN_DESCRIPTIONS.items():
        if k == "cleaned":
            column_name = generate_unique(k, df.keys(), text_column)
            self.output_column_descriptions[column_name] = v
        elif k in self.token_filters and self.keep_filtered_tokens:
            column_name = generate_unique(f"{v.lower()}s", df.keys(), text_column)
            self.output_column_descriptions[column_name] = f"{v}s in the original text"
    self.tokenizer.tokenize_df(df, text_column, language_column, language)
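# The `generate_unique` helper used throughout this module is not shown here. Below is a
# minimal sketch of what it might look like, inferred from its call sites, which pass a base
# name, the existing column names, and a prefix (either a column prefix or the text column
# name). The exact prefixing and collision-handling scheme are assumptions:

from typing import AnyStr, List


def generate_unique(name: AnyStr, existing_names: List[AnyStr], prefix: AnyStr = None) -> AnyStr:
    """Generate a column name that does not collide with existing column names (sketch)"""
    new_name = f"{prefix}_{name}" if prefix else name
    if new_name not in existing_names:
        return new_name
    # Append a numeric suffix until the name is unique (assumed collision strategy)
    for i in range(1, 1001):
        candidate = f"{new_name}_{i}"
        if candidate not in existing_names:
            return candidate
    raise RuntimeError(f"Failed to generate a unique name for '{name}'")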
def __init__(
    self,
    input_df: pd.DataFrame,
    input_folder: dataiku.Folder = None,
    minimum_score: float = 0.0,
    orientation_correction: bool = True,
    column_prefix: AnyStr = "text_api",
    error_handling: ErrorHandlingEnum = ErrorHandlingEnum.LOG,
    parallel_workers: int = DEFAULT_PARALLEL_WORKERS,
):
    super().__init__(
        input_df=input_df,
        input_folder=input_folder,
        column_prefix=column_prefix,
        error_handling=error_handling,
        parallel_workers=parallel_workers,
    )
    self.minimum_score = float(minimum_score)
    self.orientation_correction = bool(orientation_correction)
    self.orientation_column = generate_unique("orientation_correction", input_df.keys(), column_prefix)
    self.text_column_list = generate_unique("detections_list", input_df.keys(), column_prefix)
    self.text_column_concat = generate_unique("detections_concat", input_df.keys(), column_prefix)
    self._compute_column_description()
def __init__(
    self,
    input_df: pd.DataFrame,
    category_level: UnsafeContentCategoryLevelEnum = UnsafeContentCategoryLevelEnum.TOP,
    content_categories_top_level: List[UnsafeContentCategoryTopLevelEnum] = [],
    content_categories_second_level: List[UnsafeContentCategorySecondLevelEnum] = [],
    column_prefix: AnyStr = "moderation_api",
    error_handling: ErrorHandlingEnum = ErrorHandlingEnum.LOG,
):
    # Note: the mutable list defaults are only read, never mutated, so sharing them
    # across calls is safe here
    super().__init__(
        input_df=input_df,
        column_prefix=column_prefix,
        error_handling=error_handling,
    )
    self.category_level = category_level
    if self.category_level == UnsafeContentCategoryLevelEnum.TOP:
        self.content_category_enum = UnsafeContentCategoryTopLevelEnum
        self.content_categories = content_categories_top_level
    else:
        self.content_category_enum = UnsafeContentCategorySecondLevelEnum
        self.content_categories = content_categories_second_level
    self.is_unsafe_column = generate_unique("unsafe_content", self.input_df.keys(), self.column_prefix)
    self.unsafe_list_column = generate_unique("unsafe_categories", self.input_df.keys(), self.column_prefix)
    self._compute_column_description()
def __init__(
    self,
    input_df: pd.DataFrame,
    num_objects: int,
    orientation_correction: bool = True,
    input_folder: dataiku.Folder = None,
    column_prefix: AnyStr = "object_api",
    error_handling: ErrorHandlingEnum = ErrorHandlingEnum.LOG,
    parallel_workers: int = DEFAULT_PARALLEL_WORKERS,
):
    super().__init__(
        input_df=input_df,
        input_folder=input_folder,
        column_prefix=column_prefix,
        error_handling=error_handling,
        parallel_workers=parallel_workers,
    )
    self.num_objects = int(num_objects)
    self.orientation_correction = bool(orientation_correction)
    self.orientation_column = generate_unique("orientation_correction", input_df.keys(), column_prefix)
    self.label_list_column = generate_unique("label_list", input_df.keys(), column_prefix)
    self.label_name_columns = [
        generate_unique(f"label_{n + 1}_name", input_df.keys(), column_prefix) for n in range(num_objects)
    ]
    self.label_score_columns = [
        generate_unique(f"label_{n + 1}_score", input_df.keys(), column_prefix) for n in range(num_objects)
    ]
    self._compute_column_description()
def _compute_column_description(self):
    """Compute output column names and descriptions"""
    self.score_column = generate_unique("score", self.input_df.keys(), self.column_prefix)
    self.column_description_dict[self.score_column] = "Confidence score in the crop hint from 0 to 1"
    self.importance_column = generate_unique("importance_fraction", self.input_df.keys(), self.column_prefix)
    self.column_description_dict[
        self.importance_column
    ] = "Importance of the crop hint with respect to the original image from 0 to 1"
def _compute_column_description(self):
    """Compute output column names and descriptions"""
    self.text_column_concat = generate_unique("detections_concat", self.input_df.keys(), self.column_prefix)
    self.column_description_dict[self.text_column_concat] = "Concatenated text detections from the API"
    self.language_code_column = generate_unique("language_code", self.input_df.keys(), self.column_prefix)
    self.column_description_dict[self.language_code_column] = "Detected language code from the API"
    self.language_score_column = generate_unique("language_score", self.input_df.keys(), self.column_prefix)
    self.column_description_dict[
        self.language_score_column
    ] = "Confidence score in the detected language from 0 to 1"
def __init__(
    self,
    input_df: pd.DataFrame,
    sentiment_scale: AnyStr = "ternary",
    column_prefix: AnyStr = "sentiment_api",
    error_handling: ErrorHandlingEnum = ErrorHandlingEnum.LOG,
):
    super().__init__(input_df=input_df, column_prefix=column_prefix, error_handling=error_handling)
    self.sentiment_scale = sentiment_scale
    self.sentiment_score_column = generate_unique("score", input_df.keys(), column_prefix)
    self.sentiment_score_scaled_column = generate_unique("score_scaled", input_df.keys(), column_prefix)
    self.sentiment_magnitude_column = generate_unique("magnitude", input_df.keys(), column_prefix)
    self._compute_column_description()
def _compute_column_description(self):
    """Compute output column names and descriptions"""
    for n in range(self.num_categories):
        category_column = generate_unique(
            f"category_{n + 1}_name",
            self.input_df.keys(),
            self.column_prefix,
        )
        confidence_column = generate_unique(
            f"category_{n + 1}_confidence",
            self.input_df.keys(),
            self.column_prefix,
        )
        self.column_description_dict[category_column] = f"Name of the category {n + 1} representing the document"
        self.column_description_dict[confidence_column] = f"Classifier's confidence in the category {n + 1}"
def clean_df(
    self,
    df: pd.DataFrame,
    text_column: AnyStr,
    language_column: AnyStr = "",
    language: AnyStr = "language_column",
) -> pd.DataFrame:
    """Public method to clean a text column in a pandas DataFrame, given language information

    Prepares the dataframe with `self._prepare_df_for_cleaning` to obtain a new column with spaCy documents
    Runs `self.clean_document` on all documents with multithreading
    Formats the output dataframe

    Args:
        df: Input pandas DataFrame
        text_column: Name of the column containing text data
        language_column: Name of the column with language codes in ISO 639-1 format
        language: Language code in ISO 639-1 format
            If equal to "language_column", this parameter is ignored in favor of language_column

    Returns:
        Input dataframe with new columns at the end:
            - Cleaned text after the filtering, lemmatization, lowercasing and Unicode normalization steps
            - One column for each selected `self.token_filters` with a concatenation of filtered tokens

    """
    self._prepare_df_for_cleaning(df, text_column, language_column, language)
    start = perf_counter()
    logging.info(f"Cleaning {len(df.index)} document(s)...")
    doc_iterator = (doc for doc in df[self.tokenizer.tokenized_column])
    with ThreadPoolExecutor(max_workers=self.DEFAULT_NUM_THREADS) as executor:
        output = list(executor.map(self.clean_document, doc_iterator))
    for k, v in self.OUTPUT_COLUMN_DESCRIPTIONS.items():
        if k == "cleaned":
            column_name = generate_unique(k, df.keys(), text_column)
            df[column_name] = [d.get(k, "") for d in output]
        elif k in self.token_filters and self.keep_filtered_tokens:
            column_name = generate_unique(f"{v.lower()}s", df.keys(), text_column)
            df[column_name] = [d.get(k, "") for d in output]
    logging.info(f"Cleaning {len(df.index)} document(s): done in {perf_counter() - start:.2f} seconds")
    del df[self.tokenizer.tokenized_column]
    return df
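# A minimal usage sketch for `clean_df`, assuming the method lives on a `TextCleaner`-style
# class instantiated elsewhere. The class name, constructor arguments and module path below
# are hypothetical:

import pandas as pd

# from text_cleaning import TextCleaner  # hypothetical import path

df = pd.DataFrame(
    {
        "text": ["The cats are running!", "Les chats courent !"],
        "language": ["en", "fr"],
    }
)
# cleaner = TextCleaner(token_filters={"is_stop"}, keep_filtered_tokens=True)  # hypothetical arguments
# output_df = cleaner.clean_df(df, text_column="text", language_column="language")
# `output_df` then gains a cleaned-text column (named e.g. "text_cleaned", deduplicated by
# `generate_unique`) plus one column per selected token filter holding the filtered tokens.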
def format_row(self, row: Dict) -> Dict:
    """Extract unsafe content categories and their confidence scores from a row with an API response"""
    raw_response = row[self.api_column_names.response]
    response = safe_json_loads(raw_response, self.error_handling)
    moderation_labels = response.get("ModerationLabels", [])
    row[self.is_unsafe_column] = False
    row[self.unsafe_list_column] = ""
    unsafe_list = []
    for category in self.content_categories:
        confidence_column = generate_unique(
            category.name.lower() + "_score", self.input_df.keys(), self.column_prefix
        )
        row[confidence_column] = ""
        if self.category_level == UnsafeContentCategoryLevelEnum.TOP:
            scores = [
                label.get("Confidence")
                for label in moderation_labels
                if label.get("ParentName", "") == category.value
            ]
        else:
            scores = [
                label.get("Confidence")
                for label in moderation_labels
                if label.get("Name", "") == category.value
            ]
        if len(scores) != 0:
            unsafe_list.append(str(category.value))
            row[confidence_column] = scores[0]
    if len(unsafe_list) != 0:
        row[self.is_unsafe_column] = True
        row[self.unsafe_list_column] = unsafe_list
    return row
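# For reference, the response shape this `format_row` expects, based on the keys read above
# ("ModerationLabels", "Name", "ParentName", "Confidence"); the example values are
# illustrative only:

example_response = {
    "ModerationLabels": [
        {"Name": "Graphic Violence", "ParentName": "Violence", "Confidence": 92.5},
        {"Name": "Violence", "ParentName": "", "Confidence": 92.5},
    ]
}
# At the top category level, a label matches when "ParentName" equals the category value;
# at the second level, when "Name" equals it. The first matching confidence score is
# written to the per-category score column.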
def _compute_column_description(self):
    """Compute output column names and descriptions"""
    for name, member in UnsafeContentCategory.__members__.items():
        category_column = generate_unique(name.lower() + "_likelihood", self.input_df.keys(), self.column_prefix)
        self.column_description_dict[
            category_column
        ] = f"Likelihood of category '{member.value}' from 1 (VERY_UNLIKELY) to 5 (VERY_LIKELY)"
def format_row(self, row: Dict) -> Dict:
    """Extract the main document categories and their confidence scores from a row with an API response"""
    raw_response = row[self.api_column_names.response]
    response = safe_json_loads(raw_response, self.error_handling)
    categories = sorted(
        response.get("categories", []),
        key=lambda x: x.get("confidence"),
        reverse=True,
    )
    for n in range(self.num_categories):
        category_column = generate_unique(f"category_{n + 1}_name", row.keys(), self.column_prefix)
        confidence_column = generate_unique(f"category_{n + 1}_confidence", row.keys(), self.column_prefix)
        if len(categories) > n:
            row[category_column] = categories[n].get("name", "")
            row[confidence_column] = categories[n].get("confidence")
        else:
            row[category_column] = ""
            row[confidence_column] = None
    return row
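# For reference, the response shape this `format_row` expects, based on the keys read above
# ("categories", "name", "confidence"); the example values are illustrative only:

example_response = {
    "categories": [
        {"name": "/Science/Computer Science", "confidence": 0.92},
        {"name": "/Internet & Telecom", "confidence": 0.61},
    ]
}
# Categories are sorted by descending confidence, then spread over the
# category_1_name / category_1_confidence, category_2_name / ... columns,
# padding with "" / None when fewer than `num_categories` are returned.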
def _compute_column_description(self):
    """Compute output column names and descriptions"""
    for name, member in EntityTypeEnum.__members__.items():
        entity_type_column = generate_unique("entity_type_" + name.lower(), self.input_df.keys(), self.column_prefix)
        self.column_description_dict[
            entity_type_column
        ] = f"List of '{member.value}' entities recognized by the API"
def _compute_column_description(self):
    """Compute output column names and descriptions"""
    if vision.Feature.Type.LABEL_DETECTION in self.content_categories:
        self.label_list_column = generate_unique("label_list", self.input_df.keys(), self.column_prefix)
        self.column_description_dict[self.label_list_column] = "List of labels from the API"
    if vision.Feature.Type.OBJECT_LOCALIZATION in self.content_categories:
        self.object_list_column = generate_unique("object_list", self.input_df.keys(), self.column_prefix)
        self.column_description_dict[self.object_list_column] = "List of objects from the API"
    if vision.Feature.Type.LANDMARK_DETECTION in self.content_categories:
        self.landmark_list_column = generate_unique("landmark_list", self.input_df.keys(), self.column_prefix)
        self.column_description_dict[self.landmark_list_column] = "List of landmarks from the API"
    if vision.Feature.Type.LOGO_DETECTION in self.content_categories:
        self.logo_list_column = generate_unique("logo_list", self.input_df.keys(), self.column_prefix)
        self.column_description_dict[self.logo_list_column] = "List of logos from the API"
    if vision.Feature.Type.WEB_DETECTION in self.content_categories:
        self.web_label_column = generate_unique("web_label", self.input_df.keys(), self.column_prefix)
        self.column_description_dict[self.web_label_column] = "Web label from the API"
        self.web_entity_list_column = generate_unique("web_entity_list", self.input_df.keys(), self.column_prefix)
        self.column_description_dict[self.web_entity_list_column] = "List of Web entities from the API"
        self.web_full_matching_image_list_column = generate_unique(
            "web_full_matching_image_list", self.input_df.keys(), self.column_prefix
        )
        self.column_description_dict[
            self.web_full_matching_image_list_column
        ] = "List of Web images fully matching the input image"
        self.web_partial_matching_image_list_column = generate_unique(
            "web_partial_matching_image_list", self.input_df.keys(), self.column_prefix
        )
        self.column_description_dict[
            self.web_partial_matching_image_list_column
        ] = "List of Web images partially matching the input image"
        self.web_page_match_list_column = generate_unique(
            "web_page_match_list", self.input_df.keys(), self.column_prefix
        )
        self.column_description_dict[
            self.web_page_match_list_column
        ] = "List of Web pages with images matching the input image"
        self.web_similar_image_list_column = generate_unique(
            "web_similar_image_list", self.input_df.keys(), self.column_prefix
        )
        self.column_description_dict[
            self.web_similar_image_list_column
        ] = "List of Web images visually similar to the input image"
def format_row(self, row: Dict) -> Dict:
    """Extract the likelihood of each unsafe content category from a row with an API response"""
    raw_response = row[self.api_column_names.response]
    response = safe_json_loads(raw_response, self.error_handling)
    moderation_labels = response.get("safeSearchAnnotation", {})
    for category in self.unsafe_content_categories:
        category_column = generate_unique(
            category.name.lower() + "_likelihood", self.input_df.keys(), self.column_prefix
        )
        row[category_column] = moderation_labels.get(category.name.lower(), "")
    return row
def tokenize_df(
    self, df: pd.DataFrame, text_column: AnyStr, language_column: AnyStr = "", language: AnyStr = "language_column"
) -> pd.DataFrame:
    """Public method to tokenize a text column in a pandas DataFrame, given language information

    This method adds a new column to the DataFrame, whose name is saved as the `tokenized_column` attribute

    Args:
        df: Input pandas DataFrame
        text_column: Name of the column containing text data
        language_column: Name of the column with language codes in ISO 639-1 format
        language: Language code in ISO 639-1 format, cf. https://spacy.io/usage/models#languages
            If equal to "language_column", this parameter is ignored in favor of language_column

    Returns:
        DataFrame with all columns from the input, plus a new column with tokenized spaCy documents

    """
    self.tokenized_column = generate_unique("tokenized", df.keys(), text_column)
    # Initialize the tokenized column to empty documents
    df[self.tokenized_column] = pd.Series([Doc(Vocab())] * len(df.index), dtype="object")
    if language == "language_column":
        languages = df[language_column].dropna().unique()
        unsupported_languages = set(languages) - set(SUPPORTED_LANGUAGES_SPACY.keys())
        if unsupported_languages:
            raise TokenizationError(
                f"Found {len(unsupported_languages)} unsupported languages in dataset: {unsupported_languages}"
            )
        for lang in languages:  # iterate over languages
            language_indices = df[language_column] == lang
            text_slice = df.loc[language_indices, text_column]  # slice the input dataframe by language
            if len(text_slice) != 0:
                tokenized_list = self.tokenize_list(text_list=text_slice, language=lang)
                df.loc[language_indices, self.tokenized_column] = pd.Series(
                    tokenized_list,
                    dtype="object",
                    index=text_slice.index,  # keep the original index (important)
                )
    else:
        tokenized_list = self.tokenize_list(text_list=df[text_column], language=language)
        df[self.tokenized_column] = tokenized_list
    return df
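# A minimal usage sketch for `tokenize_df`, assuming the method lives on a multilingual
# tokenizer class instantiated elsewhere (the class name and module path are hypothetical):

import pandas as pd

# from spacy_tokenizer import MultilingualTokenizer  # hypothetical import path

df = pd.DataFrame(
    {
        "text": ["Hello world", "Bonjour le monde"],
        "language": ["en", "fr"],
    }
)
# tokenizer = MultilingualTokenizer()
# df = tokenizer.tokenize_df(df, text_column="text", language_column="language")
# `df[tokenizer.tokenized_column]` then holds one spaCy Doc per row, tokenized with the
# language-specific pipeline; any language code missing from SUPPORTED_LANGUAGES_SPACY
# raises a TokenizationError before tokenization starts.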
def _compute_column_description(self):
    """Compute output column names and descriptions"""
    self.column_description_dict[self.is_unsafe_column] = "Unsafe content detected by the API"
    self.column_description_dict[self.unsafe_list_column] = "List of unsafe content categories detected by the API"
    for name, member in self.content_category_enum.__members__.items():
        confidence_column = generate_unique(name.lower() + "_score", self.input_df.keys(), self.column_prefix)
        self.column_description_dict[
            confidence_column
        ] = f"Confidence score in category '{member.value}' from 0 to 1"
def detect_languages_df(self, df: pd.DataFrame, text_column: AnyStr) -> pd.DataFrame:
    """Apply the `detect_language_doc` method to a pandas DataFrame with a text column, with multithreading"""
    self.column_description_dict = OrderedDict()
    for k, v in self.COLUMN_DESCRIPTION_DICT.items():
        self.column_description_dict[generate_unique(k, df.keys(), text_column)] = v
    doc_iterator = (doc for _, doc in df[text_column].astype(str).iteritems())
    output_df = df.copy()
    with ThreadPoolExecutor(max_workers=self.NUM_THREADS) as executor:
        lang_output_tuple_list = list(executor.map(self.detect_language_doc, doc_iterator))
    for i, col in enumerate(self.column_description_dict.keys()):
        output_df[col] = [t[i] for t in lang_output_tuple_list]
    return output_df
def format_row(self, row: Dict) -> Dict:
    raw_response = row[self.api_column_names.response]
    response = safe_json_loads(raw_response, self.error_handling)
    entities = response.get("entities", [])
    selected_entity_types = sorted([e.name for e in self.entity_types])
    for n in selected_entity_types:
        entity_type_column = generate_unique("entity_type_" + n.lower(), row.keys(), self.column_prefix)
        row[entity_type_column] = [
            e.get("name")
            for e in entities
            if e.get("type", "") == n and float(e.get("salience", 0)) >= self.minimum_score
        ]
        if len(row[entity_type_column]) == 0:
            row[entity_type_column] = ""
    return row
def detect_languages_df(self, df: pd.DataFrame, text_column: AnyStr) -> pd.DataFrame:
    """Apply the `detect_language_doc` method to a pandas DataFrame with a text column, with multithreading"""
    self.column_descriptions = {}
    for k, v in self.COLUMN_DESCRIPTIONS.items():
        self.column_descriptions[generate_unique(k, df.keys(), text_column)] = v
    doc_iterator = (doc for _, doc in df[text_column].astype(str).iteritems())
    output_df = df.copy()
    with ThreadPoolExecutor(max_workers=self.NUM_THREADS) as executor:
        lang_output_tuple_list = list(executor.map(self.detect_language_doc, doc_iterator))
    for i, col in enumerate(self.column_descriptions):
        output_df[col] = [t[i] for t in lang_output_tuple_list]
    return output_df
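# A minimal usage sketch for `detect_languages_df`, assuming a detector class exposing it
# (the class name, module path and exact output columns are assumptions):

import pandas as pd

# from language_detection import LanguageDetector  # hypothetical import path

df = pd.DataFrame({"text": ["Hello world", "Bonjour le monde", "Hallo Welt"]})
# detector = LanguageDetector()
# output_df = detector.detect_languages_df(df, text_column="text")
# `output_df` is a copy of the input with one new column per entry in COLUMN_DESCRIPTIONS
# (e.g. a detected language code and a confidence score), filled in row order from the
# tuples returned by `detect_language_doc`.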
def _prepare_df_for_spellchecker(
    self, df: pd.DataFrame, text_column: AnyStr, language_column: AnyStr, language: AnyStr
) -> None:
    """Private method to prepare a pandas DataFrame in-place before feeding it to the spellchecker

    Tokenize the content of the text column into a new column containing spaCy documents
    Add new columns to hold the future outputs of the spellchecker

    Args:
        df: Input pandas DataFrame
        text_column: Name of the column containing text data
        language_column: Name of the column with language codes in ISO 639-1 format
        language: Language code in ISO 639-1 format
            If equal to "language_column", this parameter is ignored in favor of language_column

    """
    self.output_column_descriptions = {}
    for k, v in self.OUTPUT_COLUMN_DESCRIPTIONS.items():
        column_name = generate_unique(k, df.keys(), text_column)
        df[column_name] = pd.Series([""] * len(df.index))
        self.output_column_descriptions[column_name] = v
    self.tokenizer.tokenize_df(df, text_column, language_column, language)