def format_row(self, row: Dict) -> Dict:
    """Parse the translation API response and enrich the row with its results."""
    parsed = safe_json_loads(row[self.api_column_names.response], self.error_handling)
    # The detected language is only meaningful when no source language was fixed by the user
    if not self.source_language:
        row["detected_source_language"] = parsed.get('detectedSourceLanguage', None)
    row[self.translated_text_column_name] = parsed.get('translatedText', None)
    return row
def format_save_pdf_documents(self, output_folder: dataiku.Folder, output_df: pd.DataFrame) -> Tuple[int, int]:
    """Open PDF documents in a `dataiku.Folder`, draw text bounding polygons and save them to another folder

    Pages are processed concurrently; each worker returns a boolean success flag.

    Returns:
        Tuple of (number of pages saved successfully, number of failures)
    """
    rows = (pair[1].to_dict() for pair in output_df.iterrows())
    total = len(output_df.index)
    results = []
    start_time = perf_counter()
    logging.info(f"Formatting and saving {total} PDF page(s) to output folder...")
    with ThreadPoolExecutor(max_workers=self.parallel_workers) as executor:
        submitted = [
            executor.submit(
                self.format_save_pdf_document,
                output_folder=output_folder,
                pdf_path=record[self.doc_handler.SPLITTED_PATH_COLUMN],
                response=safe_json_loads(record[self.api_column_names.response]),
            )
            for record in rows
        ]
        # Collect results as workers finish, with a progress bar over all pages
        for done in tqdm_auto(as_completed(submitted), total=total):
            results.append(done.result())
    num_success = sum(results)
    num_error = len(results) - num_success
    logging.info(
        (
            f"Formatting and saving {total} PDF page(s) to output folder: "
            f"{num_success} succeeded, {num_error} failed in {(perf_counter() - start_time):.2f} seconds."
        )
    )
    return (num_success, num_error)
def format_row(self, row: Dict) -> Dict:
    """Flag unsafe content categories found in the moderation API response of the row."""
    parsed = safe_json_loads(row[self.api_column_names.response], self.error_handling)
    labels = parsed.get("ModerationLabels", [])
    # At TOP level, categories match the label's parent name; otherwise the label's own name
    match_key = "ParentName" if self.category_level == UnsafeContentCategoryLevelEnum.TOP else "Name"
    row[self.is_unsafe_column] = False
    row[self.unsafe_list_column] = ""
    detected = []
    for category in self.content_categories:
        score_column = generate_unique(
            category.name.lower() + "_score", self.input_df.keys(), self.column_prefix)
        row[score_column] = ""
        matching_scores = [
            label.get("Confidence") for label in labels if label.get(match_key, "") == category.value
        ]
        if matching_scores:
            detected.append(str(category.value))
            # Keep the first reported confidence for this category
            row[score_column] = matching_scores[0]
    if detected:
        row[self.is_unsafe_column] = True
        row[self.unsafe_list_column] = detected
    return row
def format_row(self, row: Dict) -> Dict:
    """Extract crop hints annotations from a row with an API response"""
    parsed = safe_json_loads(row[self.api_column_names.response], self.error_handling)
    hints = parsed.get("cropHintsAnnotation", {}).get("cropHints", [])
    row[self.score_column] = None
    row[self.importance_column] = None
    # Only the first (highest-ranked) crop hint is surfaced
    if hints:
        first_hint = hints[0]
        row[self.score_column] = first_hint.get("confidence")
        row[self.importance_column] = first_hint.get("importanceFraction")
    return row
def format_row(self, row: Dict) -> Dict:
    """Extract the likelihood of each unsafe content category from a row with an API response"""
    parsed = safe_json_loads(row[self.api_column_names.response], self.error_handling)
    annotation = parsed.get("safeSearchAnnotation", {})
    for category in self.unsafe_content_categories:
        key = category.name.lower()
        # One output column per category, made unique against the input dataset's columns
        likelihood_column = generate_unique(key + "_likelihood", self.input_df.keys(), self.column_prefix)
        row[likelihood_column] = annotation.get(key, "")
    return row
def format_row(self, row: Dict) -> Dict:
    """Extract content lists for all enabled categories from a row with an API response

    Each enabled detection feature fills its dedicated list column(s); the larger
    web-detection section is delegated to `_format_web_detection`.
    """
    raw_response = row[self.api_column_names.response]
    response = safe_json_loads(raw_response, self.error_handling)
    if vision.Feature.Type.LABEL_DETECTION in self.content_categories:
        row[self.label_list_column] = self._extract_content_list_from_response(
            response, "labelAnnotations", name_key="description", score_key="score"
        )
    if vision.Feature.Type.OBJECT_LOCALIZATION in self.content_categories:
        row[self.object_list_column] = self._extract_content_list_from_response(
            response, "localizedObjectAnnotations", name_key="name", score_key="score"
        )
    if vision.Feature.Type.LANDMARK_DETECTION in self.content_categories:
        row[self.landmark_list_column] = self._extract_content_list_from_response(
            response, "landmarkAnnotations", name_key="description", score_key="score"
        )
    if vision.Feature.Type.LOGO_DETECTION in self.content_categories:
        row[self.logo_list_column] = self._extract_content_list_from_response(
            response, "logoAnnotations", name_key="description", score_key="score"
        )
    if vision.Feature.Type.WEB_DETECTION in self.content_categories:
        self._format_web_detection(row, response)
    return row

def _format_web_detection(self, row: Dict, response: Dict) -> None:
    """Fill the web-detection columns of `row` in place from the API `response`.

    Matching-image URLs pointing at raw image payloads ("x-raw-image:///") are
    filtered out since they are not usable links.
    """
    row[self.web_label_column] = self._extract_content_list_from_response(
        response, "webDetection", subcategory_key="bestGuessLabels", name_key="label"
    )
    # Keep only the single best-guess label, as a scalar rather than a list
    if len(row[self.web_label_column]) != 0:
        row[self.web_label_column] = row[self.web_label_column][0]
    row[self.web_entity_list_column] = self._extract_content_list_from_response(
        response,
        "webDetection",
        subcategory_key="webEntities",
        name_key="description",
        score_key="score",
    )
    row[self.web_full_matching_image_list_column] = self._extract_content_list_from_response(
        response, "webDetection", subcategory_key="fullMatchingImages", name_key="url"
    )
    if len(row[self.web_full_matching_image_list_column]) != 0:
        row[self.web_full_matching_image_list_column] = [
            match
            for match in row[self.web_full_matching_image_list_column]
            if "x-raw-image:///" not in match
        ]
    row[self.web_partial_matching_image_list_column] = self._extract_content_list_from_response(
        response, "webDetection", subcategory_key="partialMatchingImages", name_key="url"
    )
    row[self.web_page_match_list_column] = self._extract_content_list_from_response(
        response, "webDetection", subcategory_key="pagesWithMatchingImages", name_key="url"
    )
    row[self.web_similar_image_list_column] = self._extract_content_list_from_response(
        response, "webDetection", subcategory_key="visuallySimilarImages", name_key="url"
    )
    if len(row[self.web_similar_image_list_column]) != 0:
        row[self.web_similar_image_list_column] = [
            similar
            for similar in row[self.web_similar_image_list_column]
            if "x-raw-image:///" not in similar
        ]
def format_row(self, row: Dict) -> Dict:
    """Add one column per selected entity type, listing entity names above the salience threshold."""
    parsed = safe_json_loads(row[self.api_column_names.response], self.error_handling)
    found_entities = parsed.get("entities", [])
    for type_name in sorted(entity_type.name for entity_type in self.entity_types):
        column = generate_unique("entity_type_" + type_name.lower(), row.keys(), self.column_prefix)
        matches = [
            entity.get("name")
            for entity in found_entities
            if entity.get("type", "") == type_name and float(entity.get("salience", 0)) >= self.minimum_score
        ]
        # Empty string rather than empty list so the cell stays blank in the output dataset
        row[column] = matches if matches else ""
    return row
def format_row(self, row: Dict) -> Dict:
    """Extract the top `self.num_categories` text categories (name + confidence) into the row.

    Categories are ranked by decreasing confidence; missing slots are filled with
    an empty name and a None confidence so the output schema stays constant.
    """
    raw_response = row[self.api_column_names.response]
    response = safe_json_loads(raw_response, self.error_handling)
    # Default missing confidences to 0 so the sort cannot raise a TypeError
    # comparing None with floats (consistent with the language-detection formatter)
    categories = sorted(
        response.get("categories", []), key=lambda x: x.get("confidence", 0), reverse=True,
    )
    for n in range(self.num_categories):
        category_column = generate_unique("category_" + str(n + 1) + "_name", row.keys(), self.column_prefix)
        confidence_column = generate_unique(
            "category_" + str(n + 1) + "_confidence", row.keys(), self.column_prefix
        )
        if len(categories) > n:
            row[category_column] = categories[n].get("name", "")
            row[confidence_column] = categories[n].get("confidence")
        else:
            row[category_column] = ""
            row[confidence_column] = None
    return row
def format_row(self, row: Dict) -> Dict:
    """Write the document sentiment score (raw and scaled) and magnitude into the row."""
    parsed = safe_json_loads(row[self.api_column_names.response], self.error_handling)
    document_sentiment = parsed.get("documentSentiment", {})
    score = document_sentiment.get("score")
    magnitude = document_sentiment.get("magnitude")
    if score is None:
        row[self.sentiment_score_column] = None
        row[self.sentiment_score_scaled_column] = None
    else:
        row[self.sentiment_score_column] = float(score)
        row[self.sentiment_score_scaled_column] = self._scale_sentiment_score(score, self.sentiment_scale)
    row[self.sentiment_magnitude_column] = float(magnitude) if magnitude is not None else None
    return row
def format_save_images(
    self,
    output_folder: dataiku.Folder,
    output_df: pd.DataFrame = None,
    path_column: AnyStr = PATH_COLUMN,
    verbose: bool = True,
) -> Tuple[int, int]:
    """Generic method to apply `self.format_save_image` to all images using an `output_df` with API responses

    Do not override this method!
    """
    if output_df is None:
        output_df = self.output_df
    rows = (pair[1].to_dict() for pair in output_df.iterrows())
    total = len(output_df.index)
    if verbose:
        logging.info(
            f"Formatting and saving {total} image(s) to output folder..."
        )
    start_time = perf_counter()
    results = []
    with ThreadPoolExecutor(max_workers=self.parallel_workers) as executor:
        submitted = [
            executor.submit(
                self.format_save_image,
                output_folder=output_folder,
                image_path=record[path_column],
                response=safe_json_loads(record[self.api_column_names.response]),
            )
            for record in rows
        ]
        # Collect boolean success flags as workers finish, with a progress bar
        for done in tqdm_auto(as_completed(submitted), total=total):
            results.append(done.result())
    num_success = sum(results)
    num_error = len(results) - num_success
    if verbose:
        logging.info((
            f"Formatting and saving {total} image(s) to output folder: "
            f"{num_success} image(s) succeeded, {num_error} failed in {(perf_counter() - start_time):.2f} seconds."
        ))
    return (num_success, num_error)
def format_row(self, row: Dict) -> Dict:
    """Extract detected text and language information from a row with an API response"""
    parsed = safe_json_loads(row[self.api_column_names.response], self.error_handling)
    annotation = parsed.get("fullTextAnnotation", {})
    row[self.text_column_concat] = annotation.get("text", "")
    row[self.language_code_column] = ""
    row[self.language_score_column] = None
    pages = annotation.get("pages", [])
    if pages:
        # Rank the first page's detected languages by decreasing confidence
        languages = sorted(
            pages[0].get("property", {}).get("detectedLanguages", [{}]),
            key=lambda lang: float(lang.get("confidence", 0)),
            reverse=True,
        )
        if languages:
            top_language = languages[0]
            row[self.language_code_column] = top_language.get("languageCode", "")
            row[self.language_score_column] = top_language.get("confidence")
    return row
def format_row(self, row: Dict) -> Dict:
    """Extract detected text (list and concatenation) from a text-detection API response.

    Only top-level detections (no ParentId) with a confidence at or above
    `self.minimum_score` are kept.
    """
    raw_response = row[self.api_column_names.response]
    response = safe_json_loads(raw_response, self.error_handling)
    text_detections = response.get("TextDetections", [])
    # Default the confidence to 0 so detections missing the key are filtered out
    # instead of raising a TypeError on the `>=` comparison with None
    text_detections_filtered = [
        t
        for t in text_detections
        if t.get("Confidence", 0) >= self.minimum_score and t.get("ParentId") is None
    ]
    row[self.text_column_list] = ""
    row[self.text_column_concat] = ""
    if len(text_detections_filtered) != 0:
        row[self.text_column_list] = [
            t.get("DetectedText", "") for t in text_detections_filtered
        ]
        row[self.text_column_concat] = " ".join(row[self.text_column_list])
    if self.orientation_correction:
        row[self.orientation_column] = response.get(
            "OrientationCorrection", "")
    return row
def format_row(self, row: Dict) -> Dict:
    """Extract detected labels, ranked by decreasing confidence, into list and per-rank columns.

    Missing ranks (fewer labels than `self.num_objects`) get an empty name and a
    None score so the output schema stays constant.
    """
    raw_response = row[self.api_column_names.response]
    response = safe_json_loads(raw_response, self.error_handling)
    row[self.label_list_column] = ""
    # Default missing confidences to 0 so the sort cannot raise a TypeError
    # comparing None with floats
    labels = sorted(response.get("Labels", []), key=lambda label: label.get("Confidence", 0), reverse=True)
    if len(labels) != 0:
        row[self.label_list_column] = [label.get("Name") for label in labels]
    for n in range(self.num_objects):
        if len(labels) > n:
            row[self.label_name_columns[n]] = labels[n].get("Name", "")
            row[self.label_score_columns[n]] = labels[n].get(
                "Confidence", "")
        else:
            row[self.label_name_columns[n]] = ""
            row[self.label_score_columns[n]] = None
    if self.orientation_correction:
        row[self.orientation_column] = response.get(
            "OrientationCorrection", "")
    return row
def format_save_images(self, output_folder: dataiku.Folder) -> Tuple[int, int]:
    """Clear the output folder partition, then draw and save bounding boxes for every image.

    Images are processed concurrently; each worker returns a boolean success flag.

    Returns:
        Tuple of (number of successes, number of failures), consistent with the
        other `format_save_images` / `format_save_pdf_documents` implementations
        (previously this method returned nothing).
    """
    partition = output_folder.writePartition if output_folder.writePartition else ""
    output_folder.clear_partition(partition)
    df_iterator = (i[1].to_dict() for i in self.output_df.iterrows())
    len_iterator = len(self.output_df.index)
    logging.info("Saving bounding boxes to output folder...")
    api_results = []
    with ThreadPoolExecutor(max_workers=self.parallel_workers) as pool:
        futures = [
            pool.submit(
                self.format_save_image,
                output_folder=output_folder,
                image_path=row[IMAGE_PATH_COLUMN],
                response=safe_json_loads(row[self.api_column_names.response]),
            )
            for row in df_iterator
        ]
        for f in tqdm_auto(as_completed(futures), total=len_iterator):
            api_results.append(f.result())
    num_success = sum(api_results)
    num_error = len(api_results) - num_success
    logging.info(
        "Saving bounding boxes to output folder: {} images succeeded, {} failed"
        .format(num_success, num_error))
    # Return the counts so callers can report or fail on errors
    return (num_success, num_error)