def _merge_pdf(
        self,
        input_folder: dataiku.Folder,
        output_folder: dataiku.Folder,
        input_path_list: List[AnyStr],
        output_path: AnyStr,
    ) -> AnyStr:
        """Merge several PDF files into a single one

        Args:
            input_folder: `dataiku.Folder` where the input PDF files are stored
            output_folder: `dataiku.Folder` where the merged PDF file will be saved
            input_path_list:  List of PDF file paths in the `input_folder`
            output_path: Path of the merged PDF file

        Returns:
            Path of the merged PDF file

        """
        pdf_writer = PdfFileWriter()
        # Merge all PDF paths in the list
        for path in input_path_list:
            with input_folder.get_download_stream(path) as stream:
                input_pdf = PdfFileReader(BytesIO(stream.read()))
            for page in range(input_pdf.getNumPages()):
                pdf_writer.addPage(input_pdf.getPage(page))
        # Save the merged PDF in the output folder
        pdf_bytes = BytesIO()
        pdf_writer.write(pdf_bytes)
        output_folder.upload_stream(output_path, pdf_bytes.getvalue())
        return output_path
    def _split_tiff(self, input_folder: dataiku.Folder,
                    output_folder: dataiku.Folder,
                    input_path: AnyStr) -> List[AnyStr]:
        """Split a TIFF file into multiple pages and save them as separate files in another folder

        Args:
            input_folder: `dataiku.Folder` where the input TIFF file is stored
            output_folder: `dataiku.Folder` where files will be saved
            input_path: path of the input TIFF file in the `input_folder`

        Returns:
            List of paths generated in the `output_folder`

        """
        with input_folder.get_download_stream(input_path) as stream:
            pil_image = Image.open(BytesIO(stream.read()))  # read all bytes so frame seek() still works after the stream closes
        input_path_without_file_name = os.path.split(input_path)[0]
        input_file_name_without_extension = os.path.splitext(
            os.path.basename(input_path))[0]
        page = 0
        output_path_list = []
        while True:
            try:
                pil_image.seek(page)
                output_path = f"{input_path_without_file_name}/{input_file_name_without_extension}_page_{page+1}.tiff"
                image_bytes = BytesIO()
                pil_image.save(image_bytes, format="TIFF")
                output_folder.upload_stream(output_path,
                                            image_bytes.getvalue())
                output_path_list.append(output_path)
                page += 1
            except EOFError:
                break
        return output_path_list
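# Note: PdfFileWriter / PdfFileReader and getNumPages() / getPage() / addPage() above are the
# legacy PyPDF2 interface. Below is a minimal standalone sketch of the same merge using the
# newer pypdf (>= 3.0) names; the helper name and its raw-bytes input are illustrative
# assumptions, not part of the plugin code.
from io import BytesIO
from typing import List

from pypdf import PdfReader, PdfWriter


def merge_pdf_bytes(pdf_payloads: List[bytes]) -> bytes:
    """Merge raw PDF byte strings into a single PDF, returned as bytes."""
    writer = PdfWriter()
    for payload in pdf_payloads:
        reader = PdfReader(BytesIO(payload))
        for page in reader.pages:  # reader.pages replaces getNumPages()/getPage()
            writer.add_page(page)  # add_page replaces addPage
    merged = BytesIO()
    writer.write(merged)
    return merged.getvalue()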
Example #3
def generate_path_df(folder: dataiku.Folder, file_extensions: List[AnyStr],
                     path_column: AnyStr) -> pd.DataFrame:
    """Generate a dataframe of file paths in a Dataiku Folder matching a list of extensions

    Args:
        folder: Dataiku managed folder where files are stored
            This folder can be partitioned or not, this function handles both
        file_extensions: list of file extensions to match, ex: ["JPG", "PNG"]
            Matching is case-insensitive, but extensions should not include a leading "."
        path_column: Name of the column in the output dataframe

    Returns:
        DataFrame with one column named `path_column` with all the file paths matching the list of `file_extensions`

    Raises:
        RuntimeError: If there are no files matching the list of `file_extensions`

    """
    path_list = []
    if folder.read_partitions:
        for partition in folder.read_partitions:
            path_list += folder.list_paths_in_partition(partition)
    else:
        path_list = folder.list_paths_in_partition()
    normalized_extensions = {extension.lower() for extension in file_extensions}
    filtered_path_list = [
        path for path in path_list
        if os.path.splitext(path)[1][1:].lower().strip() in normalized_extensions
    ]
    if len(filtered_path_list) == 0:
        raise RuntimeError(
            f"No files detected with supported extensions {file_extensions}, check input folder"
        )
    path_df = pd.DataFrame(filtered_path_list, columns=[path_column])
    return path_df
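# A hedged usage sketch for generate_path_df: the folder id, extension list and column name
# below are illustrative assumptions, not values taken from the snippet above.
import dataiku

input_folder = dataiku.Folder("input_folder_id")  # hypothetical folder id
path_df = generate_path_df(folder=input_folder, file_extensions=["jpg", "jpeg", "png"], path_column="path")
print(path_df.head())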
    def format_save_image(self, output_folder: dataiku.Folder,
                          image_path: AnyStr, response: Dict) -> bool:
        """Generic method to apply `self.format_image` to an image in `self.input_folder` and save it to an `output folder`

        Do not override this method!

        """
        result = False
        with self.input_folder.get_download_stream(image_path) as stream:
            try:
                pil_image = Image.open(stream)
                if len(response) != 0:
                    formatted_image = self.format_image(pil_image, response)
                else:
                    formatted_image = pil_image.copy()
                image_bytes = save_image_bytes(formatted_image, image_path)
                output_folder.upload_stream(image_path, image_bytes.getvalue())
                result = True
            except self.IMAGE_FORMATTING_EXCEPTIONS as error:
                logging.warning(
                    f"Could not format image on path: {image_path} because of error: {error}"
                )
                if self.error_handling == ErrorHandling.FAIL:
                    logging.exception(error)
        return result
    def _split_pdf(self, input_folder: dataiku.Folder,
                   output_folder: dataiku.Folder,
                   input_path: AnyStr) -> List[AnyStr]:
        """Split a PDF file into multiple pages and save them as separate files in another folder

        Args:
            input_folder: `dataiku.Folder` where the input PDF file is stored
            output_folder: `dataiku.Folder` where files will be saved
            input_path: path of the input PDF file in the `input_folder`

        Returns:
            List of paths generated in the `output_folder`

        """
        with input_folder.get_download_stream(input_path) as stream:
            input_pdf = PdfFileReader(BytesIO(stream.read()))
        input_path_without_file_name = os.path.split(input_path)[0]
        input_file_name_without_extension = os.path.splitext(
            os.path.basename(input_path))[0]
        output_path_list = []
        for page in range(input_pdf.getNumPages()):
            pdf_writer = PdfFileWriter()
            pdf_writer.addPage(input_pdf.getPage(page))
            output_path = f"{input_path_without_file_name}/{input_file_name_without_extension}_page_{page + 1}.pdf"
            pdf_bytes = BytesIO()
            pdf_writer.write(pdf_bytes)
            output_folder.upload_stream(output_path, pdf_bytes.getvalue())
            output_path_list.append(output_path)
        return output_path_list
Example #6
 def get_inputs(self):
     self.folder = Folder(get_output_names_for_role("folder_id")[0])
     self.output_file_path = get_recipe_config()['output_model_path']
     self.overwrite_output_model = get_recipe_config(
     )['overwrite_output_model']
     self.batch_size = int(get_recipe_config()['batch_size'])
     if not get_recipe_config()['show_batch_size']:
         self.batch_size = -1
     self.model = Model(get_input_names_for_role("saved_model_id")[0])
     self.float_32 = get_recipe_config()["float_32"]
Example #7
def do(payload, config, plugin_config, inputs):
    for recipe_input in inputs:
        if recipe_input["role"] == "input_folder_id":
            folder = Folder(recipe_input["fullName"])
    paths = folder.list_paths_in_partition()
    choices = []
    for file_name in paths:
        extension = os.path.splitext(file_name)[1]
        if extension == '.h5':
            choices.append({"value": file_name, "label": file_name})
    return {"choices": choices}
Example #8
class Main:
    def __init__(self):
        self.input_folder = None
        self.output_folder = None
        self.output_file_path = None
        self.batch_size = None
        self.overwrite_output_model = None
        self.model_path = None
        self.model_name = None
        self.keras_model = None
        self.onnx_model = None

    def get_inputs(self):
        self.input_folder = Folder(
            get_input_names_for_role("input_folder_id")[0])
        output_folder_id = get_output_names_for_role("output_folder_id")[0]
        self.output_folder = Folder(output_folder_id)
        self.output_file_path = get_recipe_config()['output_model_path']
        self.batch_size = int(get_recipe_config()['batch_size'])
        if not get_recipe_config()['show_batch_size']:
            self.batch_size = -1
        self.overwrite_output_model = get_recipe_config(
        )['overwrite_output_model']
        self.model_path = get_recipe_config()['model_path']
        self.model_name = os_splitext(os_split(self.model_path)[1])[0]
        self.float_32 = get_recipe_config()["float_32"]

    def validate(self):
        if self.output_folder.get_path_details(
                self.output_file_path
        )['exists'] and not self.overwrite_output_model:
            raise ValueError(
                'Output file already exists, check overwrite box or change output path'
            )
        if not self.output_file_path:
            raise ValueError('Output model path can not be blank')
        check_keras_version(self.input_folder, self.model_path)

    def load_h5_to_keras(self):
        self.keras_model = get_keras_model_from_folder(self.input_folder,
                                                       self.model_path)

    def write_output(self):
        with self.output_folder.get_writer(self.output_file_path) as w:
            w.write(self.onnx_model.SerializeToString())

    def run(self):
        self.get_inputs()
        self.validate()
        self.load_h5_to_keras()
        self.onnx_model = convert_from_keras_to_onnx(self.keras_model,
                                                     self.batch_size,
                                                     self.float_32)
        self.write_output()
Example #9
def do(payload, config, plugin_config, inputs):
    if config.get('input_folder_id'):
        folder = Folder(config.get('input_folder_id'))
        paths = folder.list_paths_in_partition()
        choices = []
        for file_name in paths:
            extension = os.path.splitext(file_name)[1]
            if extension == '.h5':
                choices.append({"value": file_name, "label": file_name})
        return {"choices": choices}
    return {}
def do(payload, config, plugin_config, inputs):
    for recipe_input in inputs:
        if recipe_input["role"] == "folder":
            folder = Folder(recipe_input["fullName"])
    paths = folder.list_paths_in_partition()
    choices = []
    for fileName in paths:
        if ".json" in fileName:
            jsonFile = folder.read_json(fileName)
            choices.append({"value": fileName,
                            "label": fileName + " (" + jsonFile["name"] + " | " + jsonFile["target"] + ")"})
    return {"choices": choices}
Example #11
def save_array_to_folder(array: np.ndarray,
                         path: AnyStr,
                         folder: dataiku.Folder,
                         compress: bool = True) -> None:
    """Save a numpy array to a Dataiku folder"""
    with NamedTemporaryFile() as tmp:
        if compress:
            np.savez_compressed(tmp, array)
        else:
            np.savez(tmp, array)
        _ = tmp.seek(0)  # Oh, take me back to the start
        folder.upload_stream(path, tmp)
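# A short usage sketch for save_array_to_folder, plus a hedged read-back helper.
# np.savez / np.savez_compressed store a positional array under the default key "arr_0";
# the folder id and file path below are illustrative assumptions.
import dataiku
import numpy as np
from io import BytesIO

folder = dataiku.Folder("numpy_folder")  # hypothetical folder id
array = np.arange(12).reshape(3, 4)
save_array_to_folder(array, "arrays/example.npz", folder)

# Read the array back from the managed folder
with folder.get_download_stream("arrays/example.npz") as stream:
    restored = np.load(BytesIO(stream.read()))["arr_0"]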
Example #12
def main():
    # getting the csv folder address
    havas_logs = Folder(customrecipe.get_input_names_for_role("files_folder")[0])
    havas_logs_path = havas_logs.get_path()

    files_to_process = get_files_to_process(havas_logs_path)

    # preparing dataset to write into
    havas_cost_data = DatasetWrapper(customrecipe.get_output_names_for_role("cost_data")[0])
    # havas_cost_data.dataset.spec_item['appendMode'] = True
    # writing into dataset
    append_files_to_dataset(files_to_process, havas_cost_data)
    # closing dataset and saving lines
    havas_cost_data.close()
    def merge_document(
        self,
        input_folder: dataiku.Folder,
        output_folder: dataiku.Folder,
        input_path_list: List[AnyStr],
        output_path: AnyStr,
    ) -> AnyStr:
        """Merge several PDF/TIFF files into a single one

        Args:
            input_folder: `dataiku.Folder` where the input PDF/TIFF files are stored
            output_folder: `dataiku.Folder` where the merged PDF/TIFF file will be saved
            input_path_list:  List of PDF/TIFF file paths in the `input_folder`
            output_path: Path of the merged PDF/TIFF file

        Returns:
            Path of the merged PDF/TIFF file

        """
        if len(input_path_list) == 0:
            raise RuntimeError("No documents to merge")
        file_extension = output_path.split(".")[-1].lower()
        try:
            if input_path_list[0] == "":
                raise ValueError("No files to merge")
            if file_extension == "pdf":
                output_path = self._merge_pdf(input_folder, output_folder,
                                              input_path_list, output_path)
                logging.info(
                    f"Merged {len(input_path_list)} page(s) of PDF document on path: {output_path}"
                )
            elif file_extension == "tif" or file_extension == "tiff":
                output_path = self._merge_tiff(input_folder, output_folder,
                                               input_path_list, output_path)
                logging.info(
                    f"Merged {len(input_path_list)} page(s) of TIFF document on path: {output_path}"
                )
            else:
                raise ValueError("No files with PDF/TIFF extension")
            for path in input_path_list:
                input_folder.delete_path(path)
        except (UnidentifiedImageError, PyPdfError, ValueError, TypeError,
                OSError) as error:
            logging.warning(
                f"Could not merge document on path: {output_path} because of error: {error}"
            )
            output_path = ""
            if self.error_handling == ErrorHandling.FAIL:
                logging.exception(error)
        return output_path
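# A hedged usage sketch for merge_document: only the method signature comes from the snippet
# above; the handler constructor, folder ids and file paths are illustrative assumptions.
# Note that the method also deletes the source pages from the input folder after merging.
import dataiku

handler = DocumentHandler()  # hypothetical constructor
input_folder = dataiku.Folder("pages")
output_folder = dataiku.Folder("documents")
merged_path = handler.merge_document(
    input_folder=input_folder,
    output_folder=output_folder,
    input_path_list=["report_page_1.pdf", "report_page_2.pdf"],
    output_path="report.pdf",
)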
Example #14
 def get_inputs(self):
     self.input_folder = Folder(
         get_input_names_for_role("input_folder_id")[0])
     output_folder_id = get_output_names_for_role("output_folder_id")[0]
     self.output_folder = Folder(output_folder_id)
     self.output_file_path = get_recipe_config()['output_model_path']
     self.batch_size = int(get_recipe_config()['batch_size'])
     if not get_recipe_config()['show_batch_size']:
         self.batch_size = -1
     self.overwrite_output_model = get_recipe_config(
     )['overwrite_output_model']
     self.model_path = get_recipe_config()['model_path']
     self.model_name = os_splitext(os_split(self.model_path)[1])[0]
     self.float_32 = get_recipe_config()["float_32"]
 def format_save_pdf_document(self, output_folder: dataiku.Folder, pdf_path: AnyStr, response: Dict) -> bool:
     """Open a PDF file in a `dataiku.Folder`, draw text bounding polygons and save it to another folder"""
     result = False
     with self.input_folder.get_download_stream(pdf_path) as stream:
         try:
             pdf = PdfReader(BytesIO(stream.read()))
             if len(response) != 0:
                 pdf = self.format_pdf_document(pdf, response)
                 pdf_bytes = self.doc_handler.save_pdf_bytes(pdf)
                 output_folder.upload_stream(pdf_path, pdf_bytes.getvalue())
             result = True
         except (PdfError, ValueError, TypeError, OSError) as error:
             logging.warning(f"Could not annotate PDF on path: {pdf_path} because of error: {error}")
             if self.error_handling == ErrorHandling.FAIL:
                 logging.exception(error)
     return result
        def call_api_document_text_detection(
            folder: dataiku.Folder,
            batch: List[Dict],
            image_context: Dict = {},
            folder_is_gcs: bool = False,
            folder_bucket: AnyStr = "",
            folder_root_path: AnyStr = "",
            **kwargs,
        ) -> vision.BatchAnnotateFilesResponse:
            """Call the Google Cloud Vision document annotation API with files stored in a Dataiku managed folder

            Used by `parallelizer.parallelizer` as `function` argument
            Activates batching automatically if the Dataiku managed folder is on GCS

            """
            document_path = batch[0].get(PATH_COLUMN, "")  # batch contains only 1 page
            splitted_document_path = batch[0].get(DocumentHandler.SPLITTED_PATH_COLUMN, "")
            if splitted_document_path == "":
                raise DocumentSplitError(f"Document could not be split")
            extension = os.path.splitext(document_path)[1][1:].lower().strip()
            document_request = {
                "input_config": {"mime_type": "application/pdf" if extension == "pdf" else "image/tiff"},
                "features": [{"type_": vision.Feature.Type.DOCUMENT_TEXT_DETECTION}],
                "image_context": image_context,
            }
            if folder_is_gcs:
                document_request["input_config"]["gcs_source"] = {
                    "uri": f"gs://{folder_bucket}/{folder_root_path}{splitted_document_path}"
                }
            else:
                with folder.get_download_stream(splitted_document_path) as stream:
                    document_request["input_config"]["content"] = stream.read()
            responses = self.client.batch_annotate_files(requests=[document_request])
            return responses
Example #17
class Main:
    def __init__(self):
        self.folder = None
        self.output_file_path = None
        self.batch_size = None
        self.overwrite_output_model = None
        self.model = None
        self.keras_model = None
        self.onnx_model = None
        self.float_32 = None

    def get_inputs(self):
        self.folder = Folder(get_output_names_for_role("folder_id")[0])
        self.output_file_path = get_recipe_config()['output_model_path']
        self.overwrite_output_model = get_recipe_config(
        )['overwrite_output_model']
        self.batch_size = int(get_recipe_config()['batch_size'])
        if not get_recipe_config()['show_batch_size']:
            self.batch_size = -1
        self.model = Model(get_input_names_for_role("saved_model_id")[0])
        self.float_32 = get_recipe_config()["float_32"]

    def validation(self):
        if self.folder.get_path_details(
                self.output_file_path
        )['exists'] and not self.overwrite_output_model:
            raise ValueError(
                'Output file already exists, check overwrite box or change output path'
            )
        if not self.output_file_path:
            raise ValueError('Output model path can not be blank')

    def write_output(self):
        with self.folder.get_writer(self.output_file_path) as w:
            w.write(self.onnx_model.SerializeToString())

    def run(self):
        self.get_inputs()
        self.validation()
        self.keras_model = get_keras_model_from_saved_model(
            default_project_key(), self.model)
        self.onnx_model = convert_from_keras_to_onnx(self.keras_model,
                                                     self.batch_size,
                                                     self.float_32)
        self.write_output()
Example #18
 def __init__(self, project_key, config, plugin_config):
     """
     :param project_key: the project in which the runnable executes
     :param config: the dict of the configuration of the object
     :param plugin_config: contains the plugin settings
     """
     self.project_key = project_key
     self.config = config
     self.plugin_config = plugin_config
     self.folder = Folder(self._get_folder_id(),
                          project_key=self.project_key)
     self.model = Model(self.config.get('saved_model_id'),
                        project_key=self.project_key)
     self.model.list_versions()
     self.output_file_path = self._get_output_file_path()
     self.overwrite_output_model = self.config.get('overwrite_output_model')
     self.batch_size = self._get_batch_size()
     self.float_32 = self.config.get('float_32')
 def format_save_image(self, output_folder: dataiku.Folder,
                       image_path: AnyStr, response: Dict) -> bool:
     result = False
     with self.input_folder.get_download_stream(image_path) as stream:
         try:
             pil_image = Image.open(stream)
             if len(response) != 0:
                 formatted_image = self.format_image(pil_image, response)
             else:
                 formatted_image = pil_image.copy()
             image_bytes = save_image_bytes(formatted_image, image_path)
             output_folder.upload_stream(image_path, image_bytes.getvalue())
             result = True
         except (UnidentifiedImageError, TypeError, OSError) as e:
             logging.warning("Could not load image on path: " + image_path)
             if self.error_handling == ErrorHandlingEnum.FAIL:
                 raise e
     return result
Example #20
def download_file_from_folder_to_tmp(
        path: AnyStr, folder: dataiku.Folder) -> NamedTemporaryFile:
    """Download a file from a Dataiku Folder into a local temporary file"""
    file_extension = Path(path).suffix
    tmp = NamedTemporaryFile(suffix=file_extension)
    with folder.get_download_stream(path) as stream:
        tmp.write(bytes(stream.read()))
    _ = tmp.seek(0)  # Come together, right now
    return tmp
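# A short usage sketch for download_file_from_folder_to_tmp: the temporary file keeps the
# original extension, which matters for libraries that dispatch on the suffix. The folder id
# and path below are illustrative assumptions.
import dataiku

folder = dataiku.Folder("models")  # hypothetical folder id
tmp_file = download_file_from_folder_to_tmp("weights/model.h5", folder)
local_path = tmp_file.name  # pass this path to a library that expects a file on disk
# ... use local_path ...
tmp_file.close()  # closing the NamedTemporaryFile deletes the local copy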
def do(payload, config, plugin_config, inputs):
    folder = None
    for recipe_input in inputs:
        if recipe_input["role"] == "folder":
            folder = Folder(recipe_input["fullName"])
    if folder:
        paths = folder.list_paths_in_partition()
        choices = []
        for file_name in paths:
            if ".json" in file_name:
                choices.append({"value": file_name, "label": file_name})
        return {"choices": choices}
    else:
        return {
            "choices": [{
                "value": None,
                "label": "Invalid : no input folder"
            }]
        }
def call_api_generic(
    row: Dict,
    api_client: boto3.client,
    api_client_method_name: AnyStr,
    input_folder: dataiku.Folder,
    input_folder_is_s3: bool,
    input_folder_bucket: AnyStr,
    input_folder_root_path: AnyStr,
    orientation_correction: bool = False,
    num_objects: int = None,
    minimum_score: int = None,
) -> AnyStr:
    image_path = row.get(IMAGE_PATH_COLUMN)
    pil_image = None
    if input_folder_is_s3:
        image_request = {"S3Object": {"Bucket": input_folder_bucket, "Name": input_folder_root_path + image_path}}
    else:
        with input_folder.get_download_stream(image_path) as stream:
            image_request = {"Bytes": stream.read()}
            pil_image = Image.open(BytesIO(image_request["Bytes"]))
    if orientation_correction:
        # Need to use another API endpoint to retrieve the estimated orientation
        orientation_response = api_client.recognize_celebrities(Image=image_request)
        detected_orientation = orientation_response.get("OrientationCorrection", "")
        if pil_image is None:
            with input_folder.get_download_stream(image_path) as stream:
                pil_image = Image.open(stream)
        (rotated_image, rotated) = auto_rotate_image(pil_image, detected_orientation)
        if rotated:
            logging.info("Corrected image orientation: {}".format(image_path))
            image_request = {"Bytes": save_image_bytes(rotated_image, image_path).getvalue()}
    request_dict = {"Image": image_request}
    if num_objects:
        request_dict["MaxLabels"] = num_objects
    if minimum_score:
        request_dict["MinConfidence"] = minimum_score
    response = getattr(api_client, api_client_method_name)(**request_dict)
    if orientation_correction:
        response["OrientationCorrection"] = detected_orientation
    return json.dumps(response)
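# A hedged sketch of how call_api_generic might be bound to one concrete Rekognition call with
# functools.partial and applied to a single row. The client setup, region, folder id and image
# path are illustrative assumptions; IMAGE_PATH_COLUMN is assumed to be defined as in the
# snippet above.
from functools import partial

import boto3
import dataiku

client = boto3.client("rekognition", region_name="eu-west-1")  # hypothetical region
input_folder = dataiku.Folder("images")  # hypothetical folder id
call_detect_labels = partial(
    call_api_generic,
    api_client=client,
    api_client_method_name="detect_labels",  # Rekognition method accepting MaxLabels / MinConfidence
    input_folder=input_folder,
    input_folder_is_s3=False,
    input_folder_bucket="",
    input_folder_root_path="",
    num_objects=10,
    minimum_score=50,
)
raw_response = call_detect_labels(row={IMAGE_PATH_COLUMN: "photos/cat.jpg"})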
    def _merge_tiff(
        self,
        input_folder: dataiku.Folder,
        output_folder: dataiku.Folder,
        input_path_list: List[AnyStr],
        output_path: AnyStr,
    ) -> AnyStr:
        """Merge several TIFF files into a single one

        Args:
            input_folder: `dataiku.Folder` where the input TIFF files are stored
            output_folder: `dataiku.Folder` where the merged TIFF file will be saved
            input_path_list:  List of TIFF file paths in the `input_folder`
            output_path: Path of the merged TIFF file

        Returns:
            Path of the merged TIFF file

        """
        # Load all TIFF images in a list
        image_list = []
        for input_path in input_path_list:
            with input_folder.get_download_stream(input_path) as stream:
                image_list.append(Image.open(BytesIO(stream.read())))  # read all bytes so the image can still be saved after the stream closes
        # Save them to a single image object
        image_bytes = BytesIO()
        if len(image_list) > 1:
            image_list[0].save(image_bytes,
                               append_images=image_list[1:],
                               save_all=True,
                               format="TIFF")
        else:
            image_list[0].save(image_bytes, format="TIFF")
        # Save image to output_folder
        output_folder.upload_stream(output_path, image_bytes.getvalue())
        return output_path
 def format_save_images(self, output_folder: dataiku.Folder):
     partition = output_folder.writePartition if output_folder.writePartition else ""
     output_folder.clear_partition(partition)
     df_iterator = (i[1].to_dict() for i in self.output_df.iterrows())
     len_iterator = len(self.output_df.index)
     logging.info("Saving bounding boxes to output folder...")
     api_results = []
     with ThreadPoolExecutor(max_workers=self.parallel_workers) as pool:
         futures = [
             pool.submit(
                 self.format_save_image,
                 output_folder=output_folder,
                 image_path=row[IMAGE_PATH_COLUMN],
                 response=safe_json_loads(
                     row[self.api_column_names.response]),
             ) for row in df_iterator
         ]
         for f in tqdm_auto(as_completed(futures), total=len_iterator):
             api_results.append(f.result())
     num_success = sum(api_results)
     num_error = len(api_results) - num_success
     logging.info(
         "Saving bounding boxes to output folder: {} images succeeded, {} failed"
         .format(num_success, num_error))
        def call_api_annotate_image(
            folder: dataiku.Folder,
            features: Dict,
            image_context: Dict = {},
            row: Dict = None,
            batch: List[Dict] = None,
            folder_is_gcs: bool = False,
            folder_bucket: AnyStr = "",
            folder_root_path: AnyStr = "",
            **kwargs,
        ) -> Union[vision.BatchAnnotateImagesResponse, AnyStr]:
            """Call the Google Cloud Vision image annotation API with files stored in a Dataiku managed folder

            Used by `parallelizer.parallelizer` as `function` argument
            Activates batching automatically if the Dataiku managed folder is on GCS

            """
            image_request = {
                "features": features,
                "image_context": image_context,
            }
            if folder_is_gcs:
                image_requests = [
                    {
                        **{
                            "image": {
                                "source": {"image_uri": f"gs://{folder_bucket}/{folder_root_path}{row[PATH_COLUMN]}"}
                            }
                        },
                        **image_request,
                    }
                    for row in batch
                ]
                responses = self.client.batch_annotate_images(requests=image_requests)
                return responses
            else:
                image_path = row[PATH_COLUMN]
                with folder.get_download_stream(image_path) as stream:
                    image_request["image"] = {"content": stream.read()}
                response = self.client.annotate_image(request=image_request)
                response_dict = json.loads(response.__class__.to_json(response))
                if "error" in response_dict.keys():  # Required as annotate_image does not raise exceptions
                    raise GoogleAPIError(response_dict.get("error", {}).get("message", ""))
                return json.dumps(response_dict)
Example #26
class MyRunnable(Runnable):
    """The base interface for a Python runnable"""
    def __init__(self, project_key, config, plugin_config):
        """
        :param project_key: the project in which the runnable executes
        :param config: the dict of the configuration of the object
        :param plugin_config: contains the plugin settings
        """
        self.project_key = project_key
        self.config = config
        self.plugin_config = plugin_config
        self.folder = Folder(self._get_folder_id(),
                             project_key=self.project_key)
        self.model = Model(self.config.get('saved_model_id'),
                           project_key=self.project_key)
        self.model.list_versions()
        self.output_file_path = self._get_output_file_path()
        self.overwrite_output_model = self.config.get('overwrite_output_model')
        self.batch_size = self._get_batch_size()
        self.float_32 = self.config.get('float_32')

    def _get_batch_size(self):
        batch_size = int(self.config.get('batch_size'))
        if not self.config.get('show_batch_size'):
            batch_size = -1
        return batch_size

    def _get_folder_id(self):
        folder_id = self.config.get('folder_id', '')
        if not folder_id:
            raise ValueError('Output folder can not be blank')
        return folder_id

    def _get_output_file_path(self):
        output_file_path = self.config.get('output_model_path', '')
        if not output_file_path:
            raise ValueError('Output model path can not be blank')
        return output_file_path

    def get_progress_target(self):
        return None

    def run(self, progress_callback):
        """
        Gets a saved model, converts it with keras2onnx, saves it back in the folder, builds a url for the download
        """

        keras_model = get_keras_model_from_saved_model(self.project_key,
                                                       self.model)
        onnx_model = convert_from_keras_to_onnx(keras_model, self.batch_size,
                                                self.float_32)
        self._write_onnx_model_to_folder(onnx_model)

        return self._build_download_url()

    def _write_onnx_model_to_folder(self, onnx_model):
        if self.folder.get_path_details(
                self.output_file_path
        )['exists'] and not self.overwrite_output_model:
            raise ValueError(
                'Output file already exists, check overwrite box or change output path'
            )
        with self.folder.get_writer(self.output_file_path) as w:
            w.write(onnx_model.SerializeToString())

    def _build_download_url(self):
        return "/dip/api/managedfolder/download-item/?contextProjectKey={}&projectKey={}&obdId={}&path=%2F{}".format(
            self.project_key, self.project_key, self.folder.get_id(),
            self.output_file_path)
Example #27
 def _get_output_folder(self):
     output_folder_id = self.config.get('output_folder_id', None)
     if output_folder_id and output_folder_id != '?':
         return Folder(output_folder_id, project_key=self.project_key)
     else:
         return self.input_folder
Example #28
 def _get_input_folder(self):
     input_folder_id = self.config.get('input_folder_id', '')
     if not input_folder_id:
         raise ValueError('Input folder has to be selected')
     return Folder(input_folder_id, project_key=self.project_key)
Example #29
def generate_path_list(folder: dataiku.Folder) -> List[AnyStr]:
    partition = ""
    if folder.read_partitions is not None:
        partition = folder.read_partitions[0]
    path_list = folder.list_paths_in_partition(partition)
    return path_list
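# A minimal usage sketch for generate_path_list, assuming a non-partitioned managed folder;
# the folder id is an illustrative assumption.
import dataiku

folder = dataiku.Folder("documents")
for path in generate_path_list(folder):
    with folder.get_download_stream(path) as stream:
        data = stream.read()  # raw bytes of each file in the folder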