def _merge_pdf(
    self,
    input_folder: dataiku.Folder,
    output_folder: dataiku.Folder,
    input_path_list: List[AnyStr],
    output_path: AnyStr,
) -> AnyStr:
    """Merge several PDF files into a single one

    Args:
        input_folder: `dataiku.Folder` where the input PDF files are stored
        output_folder: `dataiku.Folder` where the merged PDF file will be saved
        input_path_list: List of PDF file paths in the `input_folder`
        output_path: Path of the merged PDF file

    Returns:
        Path of the merged PDF file

    """
    pdf_writer = PdfFileWriter()
    # Merge all PDF paths in the list
    for path in input_path_list:
        with input_folder.get_download_stream(path) as stream:
            input_pdf = PdfFileReader(BytesIO(stream.read()))
            for page in range(input_pdf.getNumPages()):
                pdf_writer.addPage(input_pdf.getPage(page))
    # Save the merged PDF in the output folder
    pdf_bytes = BytesIO()
    pdf_writer.write(pdf_bytes)
    output_folder.upload_stream(output_path, pdf_bytes.getvalue())
    return output_path

def _split_tiff(self, input_folder: dataiku.Folder, output_folder: dataiku.Folder, input_path: AnyStr) -> List[AnyStr]:
    """Split a TIFF file into multiple pages and save them as separate files in another folder

    Args:
        input_folder: `dataiku.Folder` where the input TIFF file is stored
        output_folder: `dataiku.Folder` where files will be saved
        input_path: path of the input TIFF file in the `input_folder`

    Returns:
        List of paths generated in the `output_folder`

    """
    with input_folder.get_download_stream(input_path) as stream:
        pil_image = Image.open(stream)
        input_path_without_file_name = os.path.split(input_path)[0]
        input_file_name_without_extension = os.path.splitext(os.path.basename(input_path))[0]
        page = 0
        output_path_list = []
        while True:
            try:
                pil_image.seek(page)
                output_path = f"{input_path_without_file_name}/{input_file_name_without_extension}_page_{page+1}.tiff"
                image_bytes = BytesIO()
                pil_image.save(image_bytes, format="TIFF")
                output_folder.upload_stream(output_path, image_bytes.getvalue())
                output_path_list.append(output_path)
                page += 1
            except EOFError:
                break
    return output_path_list

def generate_path_df(folder: dataiku.Folder, file_extensions: List[AnyStr], path_column: AnyStr) -> pd.DataFrame:
    """Generate a dataframe of file paths in a Dataiku Folder matching a list of extensions

    Args:
        folder: Dataiku managed folder where files are stored.
            The folder may be partitioned or not; this function handles both cases
        file_extensions: List of file extensions to match, in lowercase and without the leading ".",
            e.g. ["jpg", "png"]. File paths are matched case-insensitively against this list
        path_column: Name of the column in the output dataframe

    Returns:
        DataFrame with one column named `path_column` containing all the file paths
        matching the list of `file_extensions`

    Raises:
        RuntimeError: If there are no files matching the list of `file_extensions`

    """
    path_list = []
    if folder.read_partitions:
        for partition in folder.read_partitions:
            path_list += folder.list_paths_in_partition(partition)
    else:
        path_list = folder.list_paths_in_partition()
    filtered_path_list = [
        path for path in path_list if os.path.splitext(path)[1][1:].lower().strip() in file_extensions
    ]
    if len(filtered_path_list) == 0:
        raise RuntimeError(f"No files detected with supported extensions {file_extensions}, check input folder")
    path_df = pd.DataFrame(filtered_path_list, columns=[path_column])
    return path_df

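# Hypothetical usage sketch for `generate_path_df`, assuming a managed folder
# named "documents" and an output column named "path" (illustrative names only).
documents_folder = dataiku.Folder("documents")
path_df = generate_path_df(folder=documents_folder, file_extensions=["pdf", "tif", "tiff"], path_column="path")
logging.info(f"Found {len(path_df.index)} matching file(s) in the folder")
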
def format_save_image(self, output_folder: dataiku.Folder, image_path: AnyStr, response: Dict) -> bool:
    """Generic method to apply `self.format_image` to an image in `self.input_folder` and save it to an `output_folder`

    Do not override this method!

    """
    result = False
    with self.input_folder.get_download_stream(image_path) as stream:
        try:
            pil_image = Image.open(stream)
            if len(response) != 0:
                formatted_image = self.format_image(pil_image, response)
            else:
                formatted_image = pil_image.copy()
            image_bytes = save_image_bytes(formatted_image, image_path)
            output_folder.upload_stream(image_path, image_bytes.getvalue())
            result = True
        except self.IMAGE_FORMATTING_EXCEPTIONS as error:
            logging.warning(f"Could not format image on path: {image_path} because of error: {error}")
            if self.error_handling == ErrorHandling.FAIL:
                logging.exception(error)
    return result

def _split_pdf(self, input_folder: dataiku.Folder, output_folder: dataiku.Folder, input_path: AnyStr) -> List[AnyStr]:
    """Split a PDF file into multiple pages and save them as separate files in another folder

    Args:
        input_folder: `dataiku.Folder` where the input PDF file is stored
        output_folder: `dataiku.Folder` where files will be saved
        input_path: path of the input PDF file in the `input_folder`

    Returns:
        List of paths generated in the `output_folder`

    """
    with input_folder.get_download_stream(input_path) as stream:
        input_pdf = PdfFileReader(BytesIO(stream.read()))
    input_path_without_file_name = os.path.split(input_path)[0]
    input_file_name_without_extension = os.path.splitext(os.path.basename(input_path))[0]
    output_path_list = []
    for page in range(input_pdf.getNumPages()):
        pdf_writer = PdfFileWriter()
        pdf_writer.addPage(input_pdf.getPage(page))
        output_path = f"{input_path_without_file_name}/{input_file_name_without_extension}_page_{page + 1}.pdf"
        pdf_bytes = BytesIO()
        pdf_writer.write(pdf_bytes)
        output_folder.upload_stream(output_path, pdf_bytes.getvalue())
        output_path_list.append(output_path)
    return output_path_list

def get_inputs(self):
    self.folder = Folder(get_output_names_for_role("folder_id")[0])
    self.output_file_path = get_recipe_config()['output_model_path']
    self.overwrite_output_model = get_recipe_config()['overwrite_output_model']
    self.batch_size = int(get_recipe_config()['batch_size'])
    if not get_recipe_config()['show_batch_size']:
        self.batch_size = -1
    self.model = Model(get_input_names_for_role("saved_model_id")[0])
    self.float_32 = get_recipe_config()["float_32"]

def do(payload, config, plugin_config, inputs):
    for recipe_input in inputs:
        if recipe_input["role"] == "input_folder_id":
            folder = Folder(recipe_input["fullName"])
            paths = folder.list_paths_in_partition()
            choices = []
            for file_name in paths:
                extension = os.path.splitext(file_name)[1]
                if extension == '.h5':
                    choices.append({"value": file_name, "label": file_name})
            return {"choices": choices}

class Main:
    def __init__(self):
        self.input_folder = None
        self.output_folder = None
        self.output_file_path = None
        self.batch_size = None
        self.overwrite_output_model = None
        self.model_path = None
        self.model_name = None
        self.keras_model = None
        self.onnx_model = None
        self.float_32 = None

    def get_inputs(self):
        self.input_folder = Folder(get_input_names_for_role("input_folder_id")[0])
        output_folder_id = get_output_names_for_role("output_folder_id")[0]
        self.output_folder = Folder(output_folder_id)
        self.output_file_path = get_recipe_config()['output_model_path']
        self.batch_size = int(get_recipe_config()['batch_size'])
        if not get_recipe_config()['show_batch_size']:
            self.batch_size = -1
        self.overwrite_output_model = get_recipe_config()['overwrite_output_model']
        self.model_path = get_recipe_config()['model_path']
        self.model_name = os_splitext(os_split(self.model_path)[1])[0]
        self.float_32 = get_recipe_config()["float_32"]

    def validate(self):
        if self.output_folder.get_path_details(self.output_file_path)['exists'] and not self.overwrite_output_model:
            raise ValueError('Output file already exists, check overwrite box or change output path')
        if not self.output_file_path:
            raise ValueError('Output model path can not be blank')
        check_keras_version(self.input_folder, self.model_path)

    def load_h5_to_keras(self):
        self.keras_model = get_keras_model_from_folder(self.input_folder, self.model_path)

    def write_output(self):
        with self.output_folder.get_writer(self.output_file_path) as w:
            w.write(self.onnx_model.SerializeToString())

    def run(self):
        self.get_inputs()
        self.validate()
        self.load_h5_to_keras()
        self.onnx_model = convert_from_keras_to_onnx(self.keras_model, self.batch_size, self.float_32)
        self.write_output()

def do(payload, config, plugin_config, inputs):
    if config.get('input_folder_id'):
        folder = Folder(config.get('input_folder_id'))
        paths = folder.list_paths_in_partition()
        choices = []
        for file_name in paths:
            extension = os.path.splitext(file_name)[1]
            if extension == '.h5':
                choices.append({"value": file_name, "label": file_name})
        return {"choices": choices}
    return {}

def do(payload, config, plugin_config, inputs):
    for recipe_input in inputs:
        if recipe_input["role"] == "folder":
            folder = Folder(recipe_input["fullName"])
            paths = folder.list_paths_in_partition()
            choices = []
            for fileName in paths:
                if ".json" in fileName:
                    jsonFile = folder.read_json(fileName)
                    choices.append({
                        "value": fileName,
                        "label": fileName + " (" + jsonFile["name"] + " | " + jsonFile["target"] + ")"
                    })
            return {"choices": choices}

def save_array_to_folder(array: np.ndarray, path: AnyStr, folder: dataiku.Folder, compress: bool = True) -> None:
    """Save a numpy array to a Dataiku folder"""
    with NamedTemporaryFile() as tmp:
        if compress:
            np.savez_compressed(tmp, array)
        else:
            np.savez(tmp, array)
        _ = tmp.seek(0)  # Rewind the temporary file before uploading it
        folder.upload_stream(path, tmp)

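# Hypothetical usage sketch for `save_array_to_folder`, assuming a managed
# folder named "arrays" and a target path "/embeddings.npz" (illustrative names).
embeddings = np.random.rand(100, 128)
save_array_to_folder(array=embeddings, path="/embeddings.npz", folder=dataiku.Folder("arrays"), compress=True)
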
def main():
    # Get the folder containing the CSV files
    havas_logs = Folder(customrecipe.get_input_names_for_role("files_folder")[0])
    havas_logs_path = havas_logs.get_path()
    files_to_process = get_files_to_process(havas_logs_path)
    # Prepare the dataset to write into
    havas_cost_data = DatasetWrapper(customrecipe.get_output_names_for_role("cost_data")[0])
    # havas_cost_data.dataset.spec_item['appendMode'] = True
    # Write the files into the dataset
    append_files_to_dataset(files_to_process, havas_cost_data)
    # Close the dataset and save the lines
    havas_cost_data.close()

def merge_document(
    self,
    input_folder: dataiku.Folder,
    output_folder: dataiku.Folder,
    input_path_list: List[AnyStr],
    output_path: AnyStr,
) -> AnyStr:
    """Merge several PDF/TIFF files into a single one

    Args:
        input_folder: `dataiku.Folder` where the input PDF/TIFF files are stored
        output_folder: `dataiku.Folder` where the merged PDF/TIFF file will be saved
        input_path_list: List of PDF/TIFF file paths in the `input_folder`
        output_path: Path of the merged PDF/TIFF file

    Returns:
        Path of the merged PDF/TIFF file

    """
    if len(input_path_list) == 0:
        raise RuntimeError("No documents to merge")
    file_extension = output_path.split(".")[-1].lower()
    try:
        if input_path_list[0] == "":
            raise ValueError("No files to merge")
        if file_extension == "pdf":
            output_path = self._merge_pdf(input_folder, output_folder, input_path_list, output_path)
            logging.info(f"Merged {len(input_path_list)} page(s) of PDF document on path: {output_path}")
        elif file_extension == "tif" or file_extension == "tiff":
            output_path = self._merge_tiff(input_folder, output_folder, input_path_list, output_path)
            logging.info(f"Merged {len(input_path_list)} page(s) of TIFF document on path: {output_path}")
        else:
            raise ValueError("No files with PDF/TIFF extension")
        for path in input_path_list:
            input_folder.delete_path(path)
    except (UnidentifiedImageError, PyPdfError, ValueError, TypeError, OSError) as error:
        logging.warning(f"Could not merge document on path: {output_path} because of error: {error}")
        output_path = ""
        if self.error_handling == ErrorHandling.FAIL:
            logging.exception(error)
    return output_path

def get_inputs(self):
    self.input_folder = Folder(get_input_names_for_role("input_folder_id")[0])
    output_folder_id = get_output_names_for_role("output_folder_id")[0]
    self.output_folder = Folder(output_folder_id)
    self.output_file_path = get_recipe_config()['output_model_path']
    self.batch_size = int(get_recipe_config()['batch_size'])
    if not get_recipe_config()['show_batch_size']:
        self.batch_size = -1
    self.overwrite_output_model = get_recipe_config()['overwrite_output_model']
    self.model_path = get_recipe_config()['model_path']
    self.model_name = os_splitext(os_split(self.model_path)[1])[0]
    self.float_32 = get_recipe_config()["float_32"]

def format_save_pdf_document(self, output_folder: dataiku.Folder, pdf_path: AnyStr, response: Dict) -> bool:
    """Open a PDF file in a `dataiku.Folder`, draw text bounding polygons and save it to another folder"""
    result = False
    with self.input_folder.get_download_stream(pdf_path) as stream:
        try:
            pdf = PdfReader(BytesIO(stream.read()))
            if len(response) != 0:
                pdf = self.format_pdf_document(pdf, response)
            pdf_bytes = self.doc_handler.save_pdf_bytes(pdf)
            output_folder.upload_stream(pdf_path, pdf_bytes.getvalue())
            result = True
        except (PdfError, ValueError, TypeError, OSError) as error:
            logging.warning(f"Could not annotate PDF on path: {pdf_path} because of error: {error}")
            if self.error_handling == ErrorHandling.FAIL:
                logging.exception(error)
    return result

def call_api_document_text_detection(
    folder: dataiku.Folder,
    batch: List[Dict],
    image_context: Dict = {},
    folder_is_gcs: bool = False,
    folder_bucket: AnyStr = "",
    folder_root_path: AnyStr = "",
    **kwargs,
) -> vision.BatchAnnotateFilesResponse:
    """Call the Google Cloud Vision document annotation API with files stored in a Dataiku managed folder

    Used by `parallelizer.parallelizer` as `function` argument
    Activates batching automatically if the Dataiku managed folder is on GCS

    """
    document_path = batch[0].get(PATH_COLUMN, "")  # batch contains only 1 page
    splitted_document_path = batch[0].get(DocumentHandler.SPLITTED_PATH_COLUMN, "")
    if splitted_document_path == "":
        raise DocumentSplitError("Document could not be split")
    extension = os.path.splitext(document_path)[1][1:].lower().strip()
    document_request = {
        "input_config": {"mime_type": "application/pdf" if extension == "pdf" else "image/tiff"},
        "features": [{"type_": vision.Feature.Type.DOCUMENT_TEXT_DETECTION}],
        "image_context": image_context,
    }
    if folder_is_gcs:
        document_request["input_config"]["gcs_source"] = {
            "uri": f"gs://{folder_bucket}/{folder_root_path}{splitted_document_path}"
        }
    else:
        with folder.get_download_stream(splitted_document_path) as stream:
            document_request["input_config"]["content"] = stream.read()
    responses = self.client.batch_annotate_files(requests=[document_request])
    return responses

class Main:
    def __init__(self):
        self.folder = None
        self.output_file_path = None
        self.batch_size = None
        self.overwrite_output_model = None
        self.model = None
        self.keras_model = None
        self.onnx_model = None
        self.float_32 = None

    def get_inputs(self):
        self.folder = Folder(get_output_names_for_role("folder_id")[0])
        self.output_file_path = get_recipe_config()['output_model_path']
        self.overwrite_output_model = get_recipe_config()['overwrite_output_model']
        self.batch_size = int(get_recipe_config()['batch_size'])
        if not get_recipe_config()['show_batch_size']:
            self.batch_size = -1
        self.model = Model(get_input_names_for_role("saved_model_id")[0])
        self.float_32 = get_recipe_config()["float_32"]

    def validation(self):
        if self.folder.get_path_details(self.output_file_path)['exists'] and not self.overwrite_output_model:
            raise ValueError('Output file already exists, check overwrite box or change output path')
        if not self.output_file_path:
            raise ValueError('Output model path can not be blank')

    def write_output(self):
        with self.folder.get_writer(self.output_file_path) as w:
            w.write(self.onnx_model.SerializeToString())

    def run(self):
        self.get_inputs()
        self.validation()
        self.keras_model = get_keras_model_from_saved_model(default_project_key(), self.model)
        self.onnx_model = convert_from_keras_to_onnx(self.keras_model, self.batch_size, self.float_32)
        self.write_output()

def __init__(self, project_key, config, plugin_config):
    """
    :param project_key: the project in which the runnable executes
    :param config: the dict of the configuration of the object
    :param plugin_config: contains the plugin settings
    """
    self.project_key = project_key
    self.config = config
    self.plugin_config = plugin_config
    self.folder = Folder(self._get_folder_id(), project_key=self.project_key)
    self.model = Model(self.config.get('saved_model_id'), project_key=self.project_key)
    self.model.list_versions()
    self.output_file_path = self._get_output_file_path()
    self.overwrite_output_model = self.config.get('overwrite_output_model')
    self.batch_size = self._get_batch_size()
    self.float_32 = self.config.get('float_32')

def format_save_image(self, output_folder: dataiku.Folder, image_path: AnyStr, response: Dict) -> bool:
    """Apply `self.format_image` to an image in `self.input_folder` and save it to the `output_folder`"""
    result = False
    with self.input_folder.get_download_stream(image_path) as stream:
        try:
            pil_image = Image.open(stream)
            if len(response) != 0:
                formatted_image = self.format_image(pil_image, response)
            else:
                formatted_image = pil_image.copy()
            image_bytes = save_image_bytes(formatted_image, image_path)
            output_folder.upload_stream(image_path, image_bytes.getvalue())
            result = True
        except (UnidentifiedImageError, TypeError, OSError) as e:
            logging.warning("Could not load image on path: " + image_path)
            if self.error_handling == ErrorHandlingEnum.FAIL:
                raise e
    return result

def download_file_from_folder_to_tmp(path: AnyStr, folder: dataiku.Folder) -> NamedTemporaryFile:
    """Download a file from a Dataiku Folder into a local temporary file"""
    file_extension = Path(path).suffix
    tmp = NamedTemporaryFile(suffix=file_extension)
    with folder.get_download_stream(path) as stream:
        tmp.write(bytes(stream.read()))
    _ = tmp.seek(0)  # Rewind the file so it can be read from the beginning
    return tmp

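# Hypothetical usage sketch for `download_file_from_folder_to_tmp`, assuming a
# managed folder named "models" containing a file "/model.h5" (illustrative names).
model_folder = dataiku.Folder("models")
tmp_file = download_file_from_folder_to_tmp("/model.h5", model_folder)
logging.info(f"Downloaded file to local path: {tmp_file.name}")
tmp_file.close()  # the temporary file is deleted when closed
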
def do(payload, config, plugin_config, inputs):
    folder = None
    for recipe_input in inputs:
        if recipe_input["role"] == "folder":
            folder = Folder(recipe_input["fullName"])
    if folder:
        paths = folder.list_paths_in_partition()
        choices = []
        for file_name in paths:
            if ".json" in file_name:
                choices.append({"value": file_name, "label": file_name})
        return {"choices": choices}
    else:
        return {
            "choices": [{
                "value": None,
                "label": "Invalid: no input folder"
            }]
        }

def call_api_generic(
    row: Dict,
    api_client: boto3.client,
    api_client_method_name: AnyStr,
    input_folder: dataiku.Folder,
    input_folder_is_s3: bool,
    input_folder_bucket: AnyStr,
    input_folder_root_path: AnyStr,
    orientation_correction: bool = False,
    num_objects: int = None,
    minimum_score: int = None,
) -> AnyStr:
    image_path = row.get(IMAGE_PATH_COLUMN)
    pil_image = None
    if input_folder_is_s3:
        image_request = {"S3Object": {"Bucket": input_folder_bucket, "Name": input_folder_root_path + image_path}}
    else:
        with input_folder.get_download_stream(image_path) as stream:
            image_request = {"Bytes": stream.read()}
            pil_image = Image.open(BytesIO(image_request["Bytes"]))
    if orientation_correction:
        # Need to use another API endpoint to retrieve the estimated orientation
        orientation_response = api_client.recognize_celebrities(Image=image_request)
        detected_orientation = orientation_response.get("OrientationCorrection", "")
        if pil_image is None:
            with input_folder.get_download_stream(image_path) as stream:
                pil_image = Image.open(stream)
        (rotated_image, rotated) = auto_rotate_image(pil_image, detected_orientation)
        if rotated:
            logging.info("Corrected image orientation: {}".format(image_path))
            image_request = {"Bytes": save_image_bytes(rotated_image, image_path).getvalue()}
    request_dict = {"Image": image_request}
    if num_objects:
        request_dict["MaxLabels"] = num_objects
    if minimum_score:
        request_dict["MinConfidence"] = minimum_score
    response = getattr(api_client, api_client_method_name)(**request_dict)
    if orientation_correction:
        response["OrientationCorrection"] = detected_orientation
    return json.dumps(response)

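# Hypothetical sketch of a single-row call to `call_api_generic`, assuming AWS
# credentials are configured, that `IMAGE_PATH_COLUMN` is the path-column constant
# used above, and that the folder and image names below are illustrative.
rekognition_client = boto3.client("rekognition")
sample_row = {IMAGE_PATH_COLUMN: "/images/photo_001.jpg"}
raw_response = call_api_generic(
    row=sample_row,
    api_client=rekognition_client,
    api_client_method_name="detect_labels",  # standard Rekognition object detection endpoint
    input_folder=dataiku.Folder("input_images"),
    input_folder_is_s3=False,
    input_folder_bucket="",
    input_folder_root_path="",
    num_objects=10,
    minimum_score=80,
)
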
def _merge_tiff(
    self,
    input_folder: dataiku.Folder,
    output_folder: dataiku.Folder,
    input_path_list: List[AnyStr],
    output_path: AnyStr,
) -> AnyStr:
    """Merge several TIFF files into a single one

    Args:
        input_folder: `dataiku.Folder` where the input TIFF files are stored
        output_folder: `dataiku.Folder` where the merged TIFF file will be saved
        input_path_list: List of TIFF file paths in the `input_folder`
        output_path: Path of the merged TIFF file

    Returns:
        Path of the merged TIFF file

    """
    # Load all TIFF images in a list
    image_list = []
    for input_path in input_path_list:
        with input_folder.get_download_stream(input_path) as stream:
            image_list.append(Image.open(stream))
    # Save them to a single image object
    image_bytes = BytesIO()
    if len(image_list) > 1:
        image_list[0].save(image_bytes, append_images=image_list[1:], save_all=True, format="TIFF")
    else:
        image_list[0].save(image_bytes, format="TIFF")
    # Save image to output_folder
    output_folder.upload_stream(output_path, image_bytes.getvalue())
    return output_path

def format_save_images(self, output_folder: dataiku.Folder):
    partition = output_folder.writePartition if output_folder.writePartition else ""
    output_folder.clear_partition(partition)
    df_iterator = (i[1].to_dict() for i in self.output_df.iterrows())
    len_iterator = len(self.output_df.index)
    logging.info("Saving bounding boxes to output folder...")
    api_results = []
    with ThreadPoolExecutor(max_workers=self.parallel_workers) as pool:
        futures = [
            pool.submit(
                self.format_save_image,
                output_folder=output_folder,
                image_path=row[IMAGE_PATH_COLUMN],
                response=safe_json_loads(row[self.api_column_names.response]),
            )
            for row in df_iterator
        ]
        for f in tqdm_auto(as_completed(futures), total=len_iterator):
            api_results.append(f.result())
    num_success = sum(api_results)
    num_error = len(api_results) - num_success
    logging.info(
        "Saving bounding boxes to output folder: {} images succeeded, {} failed".format(num_success, num_error)
    )

def call_api_annotate_image(
    folder: dataiku.Folder,
    features: Dict,
    image_context: Dict = {},
    row: Dict = None,
    batch: List[Dict] = None,
    folder_is_gcs: bool = False,
    folder_bucket: AnyStr = "",
    folder_root_path: AnyStr = "",
    **kwargs,
) -> Union[vision.BatchAnnotateImagesResponse, AnyStr]:
    """Call the Google Cloud Vision image annotation API with files stored in a Dataiku managed folder

    Used by `parallelizer.parallelizer` as `function` argument
    Activates batching automatically if the Dataiku managed folder is on GCS

    """
    image_request = {
        "features": features,
        "image_context": image_context,
    }
    if folder_is_gcs:
        image_requests = [
            {
                **{
                    "image": {
                        "source": {"image_uri": f"gs://{folder_bucket}/{folder_root_path}{row[PATH_COLUMN]}"}
                    }
                },
                **image_request,
            }
            for row in batch
        ]
        responses = self.client.batch_annotate_images(requests=image_requests)
        return responses
    else:
        image_path = row[PATH_COLUMN]
        with folder.get_download_stream(image_path) as stream:
            image_request["image"] = {"content": stream.read()}
        response = self.client.annotate_image(request=image_request)
        response_dict = json.loads(response.__class__.to_json(response))
        if "error" in response_dict.keys():
            # Required as annotate_image does not raise exceptions
            raise GoogleAPIError(response_dict.get("error", {}).get("message", ""))
        return json.dumps(response_dict)

class MyRunnable(Runnable):
    """The base interface for a Python runnable"""

    def __init__(self, project_key, config, plugin_config):
        """
        :param project_key: the project in which the runnable executes
        :param config: the dict of the configuration of the object
        :param plugin_config: contains the plugin settings
        """
        self.project_key = project_key
        self.config = config
        self.plugin_config = plugin_config
        self.folder = Folder(self._get_folder_id(), project_key=self.project_key)
        self.model = Model(self.config.get('saved_model_id'), project_key=self.project_key)
        self.model.list_versions()
        self.output_file_path = self._get_output_file_path()
        self.overwrite_output_model = self.config.get('overwrite_output_model')
        self.batch_size = self._get_batch_size()
        self.float_32 = self.config.get('float_32')

    def _get_batch_size(self):
        batch_size = int(self.config.get('batch_size'))
        if not self.config.get('show_batch_size'):
            batch_size = -1
        return batch_size

    def _get_folder_id(self):
        folder_id = self.config.get('folder_id', '')
        if not folder_id:
            raise ValueError('Output folder can not be blank')
        return folder_id

    def _get_output_file_path(self):
        output_file_path = self.config.get('output_model_path', '')
        if not output_file_path:
            raise ValueError('Output model path can not be blank')
        return output_file_path

    def get_progress_target(self):
        return None

    def run(self, progress_callback):
        """
        Gets a saved model, converts it with keras2onnx, saves it back in the folder,
        builds a url for the download
        """
        keras_model = get_keras_model_from_saved_model(self.project_key, self.model)
        onnx_model = convert_from_keras_to_onnx(keras_model, self.batch_size, self.float_32)
        self._write_onnx_model_to_folder(onnx_model)
        return self._build_download_url()

    def _write_onnx_model_to_folder(self, onnx_model):
        if self.folder.get_path_details(self.output_file_path)['exists'] and not self.overwrite_output_model:
            raise ValueError('Output file already exists, check overwrite box or change output path')
        with self.folder.get_writer(self.output_file_path) as w:
            w.write(onnx_model.SerializeToString())

    def _build_download_url(self):
        return "/dip/api/managedfolder/download-item/?contextProjectKey={}&projectKey={}&obdId={}&path=%2F{}".format(
            self.project_key, self.project_key, self.folder.get_id(), self.output_file_path)

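# Hypothetical sketch of how this macro's Runnable might be exercised, assuming a
# project "MY_PROJECT" with a saved model and a managed folder; every identifier
# below is illustrative and only the config keys read by the class are used.
config = {
    "saved_model_id": "my_saved_model",
    "folder_id": "my_output_folder",
    "output_model_path": "model.onnx",
    "overwrite_output_model": True,
    "show_batch_size": False,
    "batch_size": 1,
    "float_32": True,
}
runnable = MyRunnable(project_key="MY_PROJECT", config=config, plugin_config={})
download_url = runnable.run(progress_callback=None)
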
def _get_output_folder(self):
    output_folder_id = self.config.get('output_folder_id', None)
    if output_folder_id and output_folder_id != '?':
        return Folder(output_folder_id, project_key=self.project_key)
    else:
        return self.input_folder

def _get_input_folder(self):
    input_folder_id = self.config.get('input_folder_id', '')
    if not input_folder_id:
        raise ValueError('Input folder has to be selected')
    return Folder(input_folder_id, project_key=self.project_key)

def generate_path_list(folder: dataiku.Folder) -> List[AnyStr]:
    partition = ""
    if folder.read_partitions is not None:
        partition = folder.read_partitions[0]
    path_list = folder.list_paths_in_partition(partition)
    return path_list

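# Hypothetical usage sketch for `generate_path_list`, assuming a managed folder
# named "images" (illustrative name).
image_folder = dataiku.Folder("images")
all_paths = generate_path_list(image_folder)
logging.info(f"Found {len(all_paths)} file(s) in folder")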