def format_row(self, row: Dict) -> Dict:
    """Extract the translated text (and the detected source language, if any) from a row with an API response"""
    raw_response = row[self.api_column_names.response]
    response = safe_json_loads(raw_response, self.error_handling)
    if not self.source_language:
        row["detected_source_language"] = response.get("detectedSourceLanguage")
    row[self.translated_text_column_name] = response.get("translatedText")
    return row
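# For reference, Google Translation API responses look like
# {"translatedText": "Hello", "detectedSourceLanguage": "fr"}; the detected source
# language column is only filled in when the user did not specify a source language.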
 def format_save_pdf_documents(self, output_folder: dataiku.Folder, output_df: pd.DataFrame) -> Tuple[int, int]:
     """Open PDF documents in a `dataiku.Folder`, draw text bounding polygons and save them to another folder"""
    df_iterator = (row.to_dict() for _, row in output_df.iterrows())
    len_iterator = len(output_df.index)
     api_results = []
     start = perf_counter()
     logging.info(f"Formatting and saving {len_iterator} PDF page(s) to output folder...")
     with ThreadPoolExecutor(max_workers=self.parallel_workers) as pool:
         futures = [
             pool.submit(
                 self.format_save_pdf_document,
                 output_folder=output_folder,
                 pdf_path=row[self.doc_handler.SPLITTED_PATH_COLUMN],
                 response=safe_json_loads(row[self.api_column_names.response]),
             )
             for row in df_iterator
         ]
         for future in tqdm_auto(as_completed(futures), total=len_iterator):
             api_results.append(future.result())
     num_success = sum(api_results)
     num_error = len(api_results) - num_success
     logging.info(
         (
             f"Formatting and saving {len_iterator} PDF page(s) to output folder: "
             f"{num_success} succeeded, {num_error} failed in {(perf_counter() - start):.2f} seconds."
         )
     )
     return (num_success, num_error)
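# Note: `format_save_pdf_document` (singular, defined elsewhere in the class) is expected to
# return a boolean success flag, which is why summing the futures' results counts successes.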
def format_row(self, row: Dict) -> Dict:
    """Extract moderation labels for each selected content category from a row with an API response"""
    raw_response = row[self.api_column_names.response]
    response = safe_json_loads(raw_response, self.error_handling)
    moderation_labels = response.get("ModerationLabels", [])
    row[self.is_unsafe_column] = False
    row[self.unsafe_list_column] = ""
    unsafe_list = []
    for category in self.content_categories:
        confidence_column = generate_unique(
            category.name.lower() + "_score", self.input_df.keys(), self.column_prefix
        )
        row[confidence_column] = ""
        if self.category_level == UnsafeContentCategoryLevelEnum.TOP:
            scores = [
                label.get("Confidence")
                for label in moderation_labels
                if label.get("ParentName", "") == category.value
            ]
        else:
            scores = [
                label.get("Confidence")
                for label in moderation_labels
                if label.get("Name", "") == category.value
            ]
        if len(scores) != 0:
            unsafe_list.append(str(category.value))
            row[confidence_column] = scores[0]
    if len(unsafe_list) != 0:
        row[self.is_unsafe_column] = True
        row[self.unsafe_list_column] = unsafe_list
    return row
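# For reference, Amazon Rekognition DetectModerationLabels responses look like
# {"ModerationLabels": [{"Name": "Graphic Violence", "ParentName": "Violence", "Confidence": 92.3}, ...]};
# top-level matching groups labels by their "ParentName", while granular matching compares "Name" directly.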
 def format_row(self, row: Dict) -> Dict:
     """Extract crop hints annotations from a row with an API response"""
     raw_response = row[self.api_column_names.response]
     response = safe_json_loads(raw_response, self.error_handling)
     crop_hints = response.get("cropHintsAnnotation", {}).get("cropHints", [])
     row[self.score_column] = None
     row[self.importance_column] = None
     if len(crop_hints) != 0:
         row[self.score_column] = crop_hints[0].get("confidence")
         row[self.importance_column] = crop_hints[0].get("importanceFraction")
     return row
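# For reference, Google Cloud Vision cropHintsAnnotation responses look like
# {"cropHints": [{"boundingPoly": {...}, "confidence": 0.8, "importanceFraction": 1.2}, ...]};
# only the first hint is kept above.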
 def format_row(self, row: Dict) -> Dict:
     """Extract the likelihood of each unsafe content category from a row with an API response"""
     raw_response = row[self.api_column_names.response]
     response = safe_json_loads(raw_response, self.error_handling)
    safe_search_annotation = response.get("safeSearchAnnotation", {})
    for category in self.unsafe_content_categories:
        category_column = generate_unique(
            category.name.lower() + "_likelihood", self.input_df.keys(), self.column_prefix
        )
        row[category_column] = safe_search_annotation.get(category.name.lower(), "")
     return row
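# For reference, Google Cloud Vision safeSearchAnnotation responses map each category to a
# likelihood string, e.g. {"adult": "VERY_UNLIKELY", "violence": "POSSIBLE", ...}.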
 def format_row(self, row: Dict) -> Dict:
     """Extract content lists for all categories from a row with an API response"""
     raw_response = row[self.api_column_names.response]
     response = safe_json_loads(raw_response, self.error_handling)
     if vision.Feature.Type.LABEL_DETECTION in self.content_categories:
         row[self.label_list_column] = self._extract_content_list_from_response(
             response, "labelAnnotations", name_key="description", score_key="score"
         )
     if vision.Feature.Type.OBJECT_LOCALIZATION in self.content_categories:
         row[self.object_list_column] = self._extract_content_list_from_response(
             response, "localizedObjectAnnotations", name_key="name", score_key="score"
         )
     if vision.Feature.Type.LANDMARK_DETECTION in self.content_categories:
         row[self.landmark_list_column] = self._extract_content_list_from_response(
             response, "landmarkAnnotations", name_key="description", score_key="score"
         )
     if vision.Feature.Type.LOGO_DETECTION in self.content_categories:
         row[self.logo_list_column] = self._extract_content_list_from_response(
             response, "logoAnnotations", name_key="description", score_key="score"
         )
     if vision.Feature.Type.WEB_DETECTION in self.content_categories:
         row[self.web_label_column] = self._extract_content_list_from_response(
             response, "webDetection", subcategory_key="bestGuessLabels", name_key="label"
         )
         if len(row[self.web_label_column]) != 0:
             row[self.web_label_column] = row[self.web_label_column][0]
         row[self.web_entity_list_column] = self._extract_content_list_from_response(
             response, "webDetection", subcategory_key="webEntities", name_key="description", score_key="score",
         )
         row[self.web_full_matching_image_list_column] = self._extract_content_list_from_response(
             response, "webDetection", subcategory_key="fullMatchingImages", name_key="url"
         )
        if len(row[self.web_full_matching_image_list_column]) != 0:
            # Discard "x-raw-image:///" URIs, which are not retrievable URLs
            row[self.web_full_matching_image_list_column] = [
                match for match in row[self.web_full_matching_image_list_column] if "x-raw-image:///" not in match
            ]
         row[self.web_partial_matching_image_list_column] = self._extract_content_list_from_response(
             response, "webDetection", subcategory_key="partialMatchingImages", name_key="url"
         )
         row[self.web_page_match_list_column] = self._extract_content_list_from_response(
             response, "webDetection", subcategory_key="pagesWithMatchingImages", name_key="url"
         )
         row[self.web_similar_image_list_column] = self._extract_content_list_from_response(
             response, "webDetection", subcategory_key="visuallySimilarImages", name_key="url"
         )
        if len(row[self.web_similar_image_list_column]) != 0:
            # Same filtering of non-retrievable "x-raw-image:///" URIs as above
            row[self.web_similar_image_list_column] = [
                similar for similar in row[self.web_similar_image_list_column] if "x-raw-image:///" not in similar
            ]
     return row
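# A minimal sketch of what `_extract_content_list_from_response` might look like (hypothetical;
# the real helper is defined elsewhere in the class and may differ). It collects `name_key` values
# from a category of the response, optionally descending into `subcategory_key` and filtering on
# `score_key` against an assumed `minimum_score` threshold. Assumes `List` is imported from
# `typing` alongside `Dict` and `AnyStr`.
def _extract_content_list_sketch(
    response: Dict,
    category_key: AnyStr,
    name_key: AnyStr,
    subcategory_key: AnyStr = None,
    score_key: AnyStr = None,
    minimum_score: float = 0.0,
) -> List:
    items = response.get(category_key, [])
    if subcategory_key is not None:
        items = response.get(category_key, {}).get(subcategory_key, [])
    return [
        item.get(name_key, "")
        for item in items
        if score_key is None or float(item.get(score_key, 0)) >= minimum_score
    ]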
def format_row(self, row: Dict) -> Dict:
    """Extract the recognized entities of each selected type from a row with an API response"""
    raw_response = row[self.api_column_names.response]
    response = safe_json_loads(raw_response, self.error_handling)
    entities = response.get("entities", [])
    selected_entity_types = sorted(e.name for e in self.entity_types)
    for entity_type in selected_entity_types:
        entity_type_column = generate_unique("entity_type_" + entity_type.lower(), row.keys(), self.column_prefix)
        row[entity_type_column] = [
            e.get("name")
            for e in entities
            if e.get("type", "") == entity_type and float(e.get("salience", 0)) >= self.minimum_score
        ]
        if len(row[entity_type_column]) == 0:
            row[entity_type_column] = ""
    return row
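# For reference, Google Cloud Natural Language analyzeEntities responses look like
# {"entities": [{"name": "Paris", "type": "LOCATION", "salience": 0.71, ...}, ...]}.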
def format_row(self, row: Dict) -> Dict:
    """Extract the text classification categories, sorted by confidence, from a row with an API response"""
    raw_response = row[self.api_column_names.response]
    response = safe_json_loads(raw_response, self.error_handling)
    # Default missing confidence to 0 so that sorting cannot fail on incomplete responses
    categories = sorted(response.get("categories", []), key=lambda x: x.get("confidence", 0), reverse=True)
    for n in range(self.num_categories):
        category_column = generate_unique("category_" + str(n + 1) + "_name", row.keys(), self.column_prefix)
        confidence_column = generate_unique("category_" + str(n + 1) + "_confidence", row.keys(), self.column_prefix)
        if len(categories) > n:
            row[category_column] = categories[n].get("name", "")
            row[confidence_column] = categories[n].get("confidence")
        else:
            row[category_column] = ""
            row[confidence_column] = None
    return row
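# For reference, Google Cloud Natural Language classifyText responses look like
# {"categories": [{"name": "/Travel/Tourist Destinations", "confidence": 0.92}, ...]}.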
 def format_row(self, row: Dict) -> Dict:
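    """Extract the document sentiment score (raw and scaled) and its magnitude from a row with an API response"""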
     raw_response = row[self.api_column_names.response]
     response = safe_json_loads(raw_response, self.error_handling)
     sentiment = response.get("documentSentiment", {})
     sentiment_score = sentiment.get("score")
     magnitude_score = sentiment.get("magnitude")
     if sentiment_score is not None:
         row[self.sentiment_score_column] = float(sentiment_score)
         row[self.sentiment_score_scaled_column] = self._scale_sentiment_score(sentiment_score, self.sentiment_scale)
     else:
         row[self.sentiment_score_column] = None
         row[self.sentiment_score_scaled_column] = None
     if magnitude_score is not None:
         row[self.sentiment_magnitude_column] = float(magnitude_score)
     else:
         row[self.sentiment_magnitude_column] = None
     return row
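# A minimal sketch of what `_scale_sentiment_score` might do (hypothetical; the real helper is
# defined elsewhere in the class, and the scale name below is an assumption). Google's
# documentSentiment score ranges from -1.0 to 1.0, so other scales are affine rescalings:
def _scale_sentiment_score_sketch(score: float, sentiment_scale: AnyStr = "ratio") -> float:
    if sentiment_scale == "1 to 5":
        return round(2.0 * score + 3.0, 1)  # maps [-1, 1] onto [1, 5]
    return float(score)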
    def format_save_images(
        self,
        output_folder: dataiku.Folder,
        output_df: pd.DataFrame = None,
        path_column: AnyStr = PATH_COLUMN,
        verbose: bool = True,
    ) -> Tuple[int, int]:
        """Generic method to apply `self.format_save_image` to all images using an `output_df` with API responses

        Do not override this method!

        """
        if output_df is None:
            output_df = self.output_df
        df_iterator = (row.to_dict() for _, row in output_df.iterrows())
        len_iterator = len(output_df.index)
        if verbose:
            logging.info(
                f"Formatting and saving {len_iterator} image(s) to output folder..."
            )
        start = perf_counter()
        api_results = []
        with ThreadPoolExecutor(max_workers=self.parallel_workers) as pool:
            futures = [
                pool.submit(
                    self.format_save_image,
                    output_folder=output_folder,
                    image_path=row[path_column],
                    response=safe_json_loads(
                        row[self.api_column_names.response]),
                ) for row in df_iterator
            ]
            for future in tqdm_auto(as_completed(futures), total=len_iterator):
                api_results.append(future.result())
        num_success = sum(api_results)
        num_error = len(api_results) - num_success
        if verbose:
            logging.info((
                f"Formatting and saving {len_iterator} image(s) to output folder: "
                f"{num_success} image(s) succeeded, {num_error} failed in {(perf_counter() - start):.2f} seconds."
            ))
        return (num_success, num_error)
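# Note: subclasses are expected to override `format_save_image` (singular), which should return
# a boolean success flag so that summing the futures' results counts successes.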
 def format_row(self, row: Dict) -> Dict:
     """Extract detected text and language information from a row with an API response"""
     raw_response = row[self.api_column_names.response]
     response = safe_json_loads(raw_response, self.error_handling)
     text_annotations = response.get("fullTextAnnotation", {})
     row[self.text_column_concat] = text_annotations.get("text", "")
     row[self.language_code_column] = ""
     row[self.language_score_column] = None
     pages = text_annotations.get("pages", [])
     if len(pages) != 0:
         detected_languages = sorted(
             pages[0].get("property", {}).get("detectedLanguages", [{}]),
             key=lambda x: float(x.get("confidence", 0)),
             reverse=True,
         )
         if len(detected_languages) != 0:
             row[self.language_code_column] = detected_languages[0].get("languageCode", "")
             row[self.language_score_column] = detected_languages[0].get("confidence")
     return row
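# For reference, Google Cloud Vision fullTextAnnotation responses nest detected languages under
# each page's "property", e.g. {"text": "...", "pages": [{"property": {"detectedLanguages":
# [{"languageCode": "en", "confidence": 0.99}]}}]}.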
def format_row(self, row: Dict) -> Dict:
    """Extract detected text (and optional orientation correction) from a row with an API response"""
    raw_response = row[self.api_column_names.response]
    response = safe_json_loads(raw_response, self.error_handling)
    text_detections = response.get("TextDetections", [])
    # Keep only top-level (line) detections above the minimum confidence score,
    # defaulting missing confidence to 0 so the comparison cannot fail
    text_detections_filtered = [
        t for t in text_detections
        if t.get("Confidence", 0) >= self.minimum_score and t.get("ParentId") is None
    ]
    row[self.text_column_list] = ""
    row[self.text_column_concat] = ""
    if len(text_detections_filtered) != 0:
        row[self.text_column_list] = [t.get("DetectedText", "") for t in text_detections_filtered]
        row[self.text_column_concat] = " ".join(row[self.text_column_list])
    if self.orientation_correction:
        row[self.orientation_column] = response.get("OrientationCorrection", "")
    return row
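# For reference, Amazon Rekognition DetectText responses look like
# {"TextDetections": [{"DetectedText": "Hello world", "Type": "LINE", "Id": 0, "Confidence": 99.1},
#                     {"DetectedText": "Hello", "Type": "WORD", "Id": 1, "ParentId": 0, ...}, ...]};
# WORD detections carry the "ParentId" of their LINE, so filtering on a missing "ParentId" keeps lines only.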
def format_row(self, row: Dict) -> Dict:
    """Extract detected labels, sorted by confidence, from a row with an API response"""
    raw_response = row[self.api_column_names.response]
    response = safe_json_loads(raw_response, self.error_handling)
    row[self.label_list_column] = ""
    # Default missing confidence to 0 so that sorting cannot fail on incomplete responses
    labels = sorted(response.get("Labels", []), key=lambda x: x.get("Confidence", 0), reverse=True)
    if len(labels) != 0:
        row[self.label_list_column] = [label.get("Name") for label in labels]
    for n in range(self.num_objects):
        if len(labels) > n:
            row[self.label_name_columns[n]] = labels[n].get("Name", "")
            row[self.label_score_columns[n]] = labels[n].get("Confidence")
        else:
            row[self.label_name_columns[n]] = ""
            row[self.label_score_columns[n]] = None
    if self.orientation_correction:
        row[self.orientation_column] = response.get("OrientationCorrection", "")
    return row
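# For reference, Amazon Rekognition DetectLabels responses look like
# {"Labels": [{"Name": "Car", "Confidence": 98.1, "Parents": [{"Name": "Vehicle"}]}, ...]},
# optionally with an "OrientationCorrection" field ("ROTATE_0", "ROTATE_90", ...).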
def format_save_images(self, output_folder: dataiku.Folder):
    """Draw bounding boxes on all images and save them to the (cleared) output folder partition"""
    partition = output_folder.writePartition if output_folder.writePartition else ""
    output_folder.clear_partition(partition)
    df_iterator = (row.to_dict() for _, row in self.output_df.iterrows())
    len_iterator = len(self.output_df.index)
    logging.info("Saving bounding boxes to output folder...")
    api_results = []
    with ThreadPoolExecutor(max_workers=self.parallel_workers) as pool:
        futures = [
            pool.submit(
                self.format_save_image,
                output_folder=output_folder,
                image_path=row[IMAGE_PATH_COLUMN],
                response=safe_json_loads(row[self.api_column_names.response]),
            )
            for row in df_iterator
        ]
        for future in tqdm_auto(as_completed(futures), total=len_iterator):
            api_results.append(future.result())
    num_success = sum(api_results)
    num_error = len(api_results) - num_success
    logging.info(f"Saving bounding boxes to output folder: {num_success} image(s) succeeded, {num_error} failed.")