Example No. 1
def get_design_input_output() -> tuple:
    """Returns input and output datasets after sanity check

    :raises: :class:`ValueError`: Missing input or output dataset(s)

    :returns: input and output datasets
    :rtype: tuple
    """
    input_names = get_input_names_for_role("user_list")
    if len(input_names) == 0:
        raise ValueError("No input dataset.")
    output_names = get_output_names_for_role("groups")
    if len(output_names) == 0:
        raise ValueError("No output dataset.")

    input_name = input_names[0]
    input_dataset = dataiku.Dataset(input_name)
    folder_ref = get_input_names_for_role('folder')
    if len(folder_ref) == 0:
        folder_name = None
    else:
        folder_name = folder_ref[0]

    output_name = output_names[0]
    output_dataset = dataiku.Dataset(output_name)

    return input_dataset, folder_name, output_dataset
Example No. 2
def load_input_output(config):
    if not get_input_names_for_role("input_dataset"):
        raise ValueError("No input dataset.")
    input_dataset_name = get_input_names_for_role("input_dataset")[0]
    config.input_dataset = Dataset(input_dataset_name)

    output_dataset_name = get_output_names_for_role("output_dataset")[0]
    config.output_dataset = Dataset(output_dataset_name)
Example No. 3
def get_input_output():
    if len(get_input_names_for_role("input_dataset")) == 0:
        raise ValueError("No input dataset.")
    input_dataset_name = get_input_names_for_role("input_dataset")[0]
    input_dataset = dataiku.Dataset(input_dataset_name)

    output_folder_name = get_output_names_for_role("output_folder")[0]
    output_folder = dataiku.Folder(output_folder_name)
    return (input_dataset, output_folder)
Example No. 4
def get_input_output():
    if len(get_input_names_for_role('input_dataset')) == 0:
        raise ValueError('No input dataset.')
    input_dataset_name = get_input_names_for_role('input_dataset')[0]
    input_dataset = dataiku.Dataset(input_dataset_name)
    if len(get_output_names_for_role('output_dataset')) == 0:
        raise ValueError('No output dataset.')
    output_dataset_name = get_output_names_for_role('output_dataset')[0]
    output_dataset = dataiku.Dataset(output_dataset_name)
    return (input_dataset, output_dataset)
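A minimal usage sketch for the helper above (hypothetical, not part of the scraped example): once the role checks pass, the returned Dataset handles can be read and written directly, in the same way Example No. 15 below does.

# Hypothetical usage of get_input_output() from Example No. 4, inside the same recipe script.
input_dataset, output_dataset = get_input_output()
df = input_dataset.get_dataframe()     # read the input dataset as a pandas DataFrame
# ... transform df as needed ...
output_dataset.write_with_schema(df)   # write the result and set the output schema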
Example No. 5
def load_predict_config():
    """Utility function to load, resolve and validate all predict recipe config into a clean `params` dictionary

    Returns:
        Dictionary of parameter names (key) and values
    """
    params = {}
    recipe_config = get_recipe_config()

    # model folder
    model_folder = dataiku.Folder(get_input_names_for_role("model_folder")[0])
    params["model_folder"] = model_folder
    params["partition_root"] = get_folder_partition_root(params["model_folder"], is_input=True)

    params["external_features_future_dataset"] = None
    external_features_future_dataset_names = get_input_names_for_role("external_features_future_dataset")
    if len(external_features_future_dataset_names) > 0:
        params["external_features_future_dataset"] = dataiku.Dataset(external_features_future_dataset_names[0])

    # output dataset
    output_dataset_names = get_output_names_for_role("output_dataset")
    if len(output_dataset_names) == 0:
        raise PluginParamValidationError("Please specify Forecast dataset in the 'Input / Output' tab of the recipe")
    params["output_dataset"] = dataiku.Dataset(output_dataset_names[0])
    check_only_one_read_partition(params["partition_root"], params["model_folder"])
    check_only_one_read_partition(params["partition_root"], params["external_features_future_dataset"])

    params["manual_selection"] = True if recipe_config.get("model_selection_mode") == "manual" else False

    params["performance_metric"] = recipe_config.get("performance_metric")
    params["selected_session"] = recipe_config.get("manually_selected_session", "latest_session")
    params["selected_model_label"] = recipe_config.get("manually_selected_model_label")

    params["prediction_length"] = recipe_config.get("prediction_length", -1)
    params["confidence_interval"] = recipe_config.get("confidence_interval", 95)
    params["quantiles"] = convert_confidence_interval_to_quantiles(params["confidence_interval"])
    params["include_history"] = recipe_config.get("include_history", False)

    params["sampling_method"] = recipe_config.get("sampling_method", "last_records")
    params["history_length_limit"] = None
    if params["sampling_method"] == "last_records":
        params["history_length_limit"] = recipe_config.get("number_records", 1000)
        if params["history_length_limit"] < 1:
            raise PluginParamValidationError("Number of historical records must be higher than 1")

    printable_params = {param: value for param, value in params.items() if "dataset" not in param and "folder" not in param}
    logger.info(f"Recipe parameters: {printable_params}")
    return params
Example No. 6
 def validate_input_params(self) -> Dict:
     """Validate input parameters"""
     input_params = {}
     input_folder_names = get_input_names_for_role("input_folder")
     if len(input_folder_names) == 0:
         raise PluginParamValidationError("Please specify input folder")
     input_params["input_folder"] = dataiku.Folder(input_folder_names[0])
     if self.recipe_id == RecipeID.DOCUMENT_TEXT_DETECTION:
         file_extensions = GoogleCloudVisionAPIWrapper.SUPPORTED_DOCUMENT_FORMATS
         self.batch_support = True
     else:
         file_extensions = GoogleCloudVisionAPIWrapper.SUPPORTED_IMAGE_FORMATS
     input_params["input_df"] = generate_path_df(
         folder=input_params["input_folder"],
         file_extensions=file_extensions,
         path_column=PATH_COLUMN)
     input_folder_type = input_params["input_folder"].get_info().get(
         "type", "")
     input_params["input_folder_is_gcs"] = input_folder_type == "GCS"
     if input_params["input_folder_is_gcs"]:
         self.batch_support = True
         input_folder_access_info = input_params["input_folder"].get_info(
         ).get("accessInfo", {})
         input_params["input_folder_bucket"] = input_folder_access_info.get(
             "bucket")
         input_params["input_folder_root_path"] = str(
             input_folder_access_info.get("root", ""))[1:]
         logging.info(
             "Input folder is stored on GCS, enabling Batch API feature")
     else:
         logging.info(
             f"Input folder is not stored on GCS ({input_folder_type}), disabling Batch API feature"
         )
     return input_params
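The get_info() calls above are what toggle the Batch API support; here is a short standalone sketch of the same check (illustrative only, assuming a managed folder bound to an "input_folder" role):

# Illustrative sketch of the cloud-storage check used above (not part of the original class).
import dataiku
from dataiku.customrecipe import get_input_names_for_role

folder = dataiku.Folder(get_input_names_for_role("input_folder")[0])
folder_info = folder.get_info()                  # connection metadata of the managed folder
folder_type = folder_info.get("type", "")        # e.g. "GCS", "S3", "Filesystem"
if folder_type == "GCS":
    access_info = folder_info.get("accessInfo", {})
    bucket = access_info.get("bucket")                 # GCS bucket backing the folder
    root_path = str(access_info.get("root", ""))[1:]   # root path without the leading "/"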
Example No. 7
def get_config():
    config = {}
    config['input_ds'] = dataiku.Dataset(get_input_names_for_role('input_ds')[0])
    config['output_ds'] = dataiku.Dataset(get_output_names_for_role('output_ds')[0])

    for param in ['lat_column', 'lng_column', 'provider', 'cache_enabled', 'api_key', 'here_app_id', 'here_app_code', 'google_client', 'google_client_secret']:
        config[param] = get_recipe_config().get(param, None)

    config['batch_enabled'] = get_recipe_config().get('batch_enabled', False) \
        and (config['provider'] == 'bing')
    config['batch_size'] = get_recipe_config().get('batch_size_bing', 50)

    config['features'] = []
    prefix = get_recipe_config().get('column_prefix', '')

    for feature in ['address', 'city', 'postal', 'state', 'country']:
        if get_recipe_config().get(feature, False):
            config['features'].append({'name': feature, 'column': prefix + feature})

    if get_plugin_config().get('cache_location', 'original') == 'original':
        config['cache_location'] = os.environ["DIP_HOME"] + '/caches/plugins/geocoder/reverse'
    else:
        config['cache_location'] = get_plugin_config().get('cache_location_custom', '')

    config['cache_size'] = get_plugin_config().get('reverse_cache_size', 1000) * 1000
    config['cache_eviction'] = get_plugin_config().get('reverse_cache_policy', 'least-recently-stored')

    if len(config['features']) == 0:
        raise AttributeError('Please select at least one feature to extract.')

    if config['provider'] is None:
        raise AttributeError('Please select a geocoding provider.')

    return config
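A hypothetical way to consume this config dict in the recipe body (the reverse-geocoding call itself is provider-specific and omitted):

# Hypothetical usage of get_config(); the geocoding provider call is not shown.
config = get_config()
input_df = config['input_ds'].get_dataframe()
feature_columns = [feature['column'] for feature in config['features']]  # prefixed 'city', 'country', etc.
# ... reverse-geocode input_df[config['lat_column']], input_df[config['lng_column']] into feature_columns ...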
Example No. 8
 def validate_input_params(self) -> Dict:
     """Validate input parameters"""
     input_params_dict = {}
     input_folder_names = get_input_names_for_role("input_folder")
     if len(input_folder_names) == 0:
         raise PluginParamValidationError("Please specify input folder")
     input_params_dict["input_folder"] = dataiku.Folder(
         input_folder_names[0])
     image_path_list = [
         p for p in generate_path_list(input_params_dict["input_folder"])
         if p.split(".")[-1].lower() in {"jpeg", "jpg", "png"}
     ]
     if len(image_path_list) == 0:
         raise PluginParamValidationError(
             "No images of supported format (PNG or JPG) were found in input folder"
         )
     input_params_dict["input_df"] = pd.DataFrame(
         image_path_list, columns=[IMAGE_PATH_COLUMN])
     input_params_dict["input_folder_is_s3"] = input_params_dict[
         "input_folder"].get_info().get("type", "") == "S3"
     if input_params_dict["input_folder_is_s3"]:
         input_folder_access_info = input_params_dict[
             "input_folder"].get_info().get("accessInfo", {})
         input_params_dict[
             "input_folder_bucket"] = input_folder_access_info.get("bucket")
         input_params_dict["input_folder_root_path"] = str(
             input_folder_access_info.get("root", ""))[1:]
         logging.info(
             "Input folder is on Amazon S3 with bucket: {} and root path: {}"
             .format(input_params_dict["input_folder_bucket"],
                     input_params_dict["input_folder_root_path"]))
     return input_params_dict
Example No. 9
    def __init__(self):
        """Instanciate class with DkuConfigLoading and add input datasets to dku_config"""

        super().__init__()
        text_input = get_input_names_for_role("document_dataset")[0]
        self.dku_config.add_param(
            name="text_input", value=Dataset(text_input), required=True
        )
        ontology_input = get_input_names_for_role("ontology_dataset")[0]
        self.dku_config.add_param(
            name="ontology_input", value=Dataset(ontology_input), required=True
        )
        self.document_dataset_columns = [
            p["name"] for p in self.dku_config.text_input.read_schema()
        ]
        self.ontology_dataset_columns = [
            p["name"] for p in self.dku_config.ontology_input.read_schema()
        ]
Example No. 10
 def get_inputs(self):
     self.folder = Folder(get_output_names_for_role("folder_id")[0])
     self.output_file_path = get_recipe_config()['output_model_path']
     self.overwrite_output_model = get_recipe_config(
     )['overwrite_output_model']
     self.batch_size = int(get_recipe_config()['batch_size'])
     if not get_recipe_config()['show_batch_size']:
         self.batch_size = -1
     self.model = Model(get_input_names_for_role("saved_model_id")[0])
     self.float_32 = get_recipe_config()["float_32"]
Example No. 11
def load_plugin_config_langdetect() -> Dict:
    """Utility function to validate and load language detection parameters into a clean dictionary

    Returns:
        Dictionary of parameter names (key) and values

    """
    params = {}
    # input dataset
    input_dataset_names = get_input_names_for_role("input_dataset")
    if len(input_dataset_names) == 0:
        raise PluginParamValidationError("Please specify input dataset")
    params["input_dataset"] = dataiku.Dataset(input_dataset_names[0])
    input_dataset_columns = [
        p["name"] for p in params["input_dataset"].read_schema()
    ]

    # output dataset
    output_dataset_names = get_output_names_for_role("output_dataset")
    if len(output_dataset_names) == 0:
        raise PluginParamValidationError("Please specify output dataset")
    params["output_dataset"] = dataiku.Dataset(output_dataset_names[0])

    # Recipe parameters
    recipe_config = get_recipe_config()
    # Text column
    params["text_column"] = recipe_config.get("text_column")
    if params["text_column"] not in input_dataset_columns:
        raise PluginParamValidationError(
            f"Invalid text column selection: {params['text_column']}")
    logging.info(f"Text column: {params['text_column']}")
    # Language scope
    params["language_scope"] = recipe_config.get("language_scope", [])
    if len(params["language_scope"]) == 0:
        params["language_scope"] = SUPPORTED_LANGUAGES_PYCLD3
    if len(params["language_scope"]) == 0:
        raise PluginParamValidationError(
            f"Invalid language scope: {params['language_scope']}")
    logging.info(
        f"Scope of {len(params['language_scope'])} languages: {params['language_scope']}"
    )
    # Minimum score
    params["minimum_score"] = float(recipe_config.get("minimum_score", 0))
    if params["minimum_score"] < 0 or params["minimum_score"] > 1:
        raise PluginParamValidationError(
            "Minimum score must be between 0 and 1")
    logging.info(f"Minimum score for detection: {params['minimum_score']:.2f}")
    # Fallback language
    params["fallback_language"] = recipe_config.get("fallback_language")
    if not params["fallback_language"] or params["fallback_language"] == "None":
        logging.info("No fallback language")
        params["fallback_language"] = ""
    else:
        logging.info(f"Fallback language: {params['fallback_language']}")
    return params
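A hypothetical recipe entry point built on the loader above (assuming the function is importable from the plugin's Python library):

# Hypothetical usage of load_plugin_config_langdetect(); the detection step itself is omitted.
params = load_plugin_config_langdetect()
input_df = params["input_dataset"].get_dataframe()
# ... detect languages in input_df[params["text_column"]], restricted to params["language_scope"] ...
params["output_dataset"].write_with_schema(input_df)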
Example No. 12
def get_input_output(has_model_as_second_input=False):

    if len(get_input_names_for_role('new')) == 0:
        raise ValueError('No new dataset.')
    if len(get_output_names_for_role('output_dataset')) == 0:
        raise ValueError('No output dataset.')

    new_dataset_name = get_input_names_for_role('new')[0]
    new_dataset = dataiku.Dataset(new_dataset_name)

    output_dataset_name = get_output_names_for_role('output_dataset')[0]
    output_dataset = dataiku.Dataset(output_dataset_name)

    if has_model_as_second_input:
        if len(get_input_names_for_role('model')) == 0:
            raise ValueError('No input model.')
        model_name = get_input_names_for_role('model')[0]
        model = dataiku.Model(model_name)
        return (new_dataset, model, output_dataset)
    else:
        if len(get_input_names_for_role('original')) == 0:
            raise ValueError('No original dataset.')

        original_dataset_name = get_input_names_for_role('original')[0]
        original_dataset = dataiku.Dataset(original_dataset_name)
        return (new_dataset, original_dataset, output_dataset)
Example No. 13
def main():
    # getting the csv folder address
    havas_logs = Folder(customrecipe.get_input_names_for_role("files_folder")[0])
    havas_logs_path = havas_logs.get_path()

    files_to_process = get_files_to_process(havas_logs_path)

    # preparing dataset to write into
    havas_cost_data = DatasetWrapper(customrecipe.get_output_names_for_role("cost_data")[0])
    # havas_cost_data.dataset.spec_item['appendMode'] = True
    # writing into dataset
    append_files_to_dataset(files_to_process, havas_cost_data)
    # closing dataset and saving lines
    havas_cost_data.close()
Example No. 14
 def get_inputs(self):
     self.input_folder = Folder(
         get_input_names_for_role("input_folder_id")[0])
     output_folder_id = get_output_names_for_role("output_folder_id")[0]
     self.output_folder = Folder(output_folder_id)
     self.output_file_path = get_recipe_config()['output_model_path']
     self.batch_size = int(get_recipe_config()['batch_size'])
     if not get_recipe_config()['show_batch_size']:
         self.batch_size = -1
     self.overwrite_output_model = get_recipe_config(
     )['overwrite_output_model']
     self.model_path = get_recipe_config()['model_path']
     self.model_name = os_splitext(os_split(self.model_path)[1])[0]
     self.float_32 = get_recipe_config()["float_32"]
Example No. 15
def apply_func(func,
               client=None,
               input_dataset="input_dataset",
               output_dataset="output_dataset"):
    input_dataset_name = get_input_names_for_role(input_dataset)[0]
    input_dataset = dataiku.Dataset(input_dataset_name)
    input_df = input_dataset.get_dataframe()

    output_dataset_name = get_output_names_for_role(output_dataset)[0]
    output_dataset = dataiku.Dataset(output_dataset_name)
    client = client or get_client(get_recipe_config())

    output_df = input_df.dropna().apply(
        lambda row: _safe_call(client, row, func), axis=1)
    output_dataset.write_with_schema(output_df)
Example No. 16
def get_results_input_output() -> tuple:
    """Returns input and output datasets after sanity check

    :raises: :class:`ValueError`: Missing input or output dataset(s)

    :returns: input and output datasets
    :rtype: tuple
    """
    input_names = get_input_names_for_role("results")
    output_names = get_output_names_for_role('statistics')
    if len(input_names) == 0:
        raise ValueError("No input dataset.")
    if len(output_names) == 0:
        raise ValueError("No output dataset.")

    input_dataset = dataiku.Dataset(input_names[0])
    output_dataset = dataiku.Dataset(output_names[0])
    return input_dataset, output_dataset
Example No. 17
def get_config():
    config = {}
    config['input_ds'] = dataiku.Dataset(get_input_names_for_role('input_ds')[0])
    config['output_ds'] = dataiku.Dataset(get_output_names_for_role('output_ds')[0])

    for param in ['address_column', 'cache_enabled', 'provider', 'api_key', 'here_app_id', 'here_app_code', 'google_client', 'google_client_secret']:
        config[param] = get_recipe_config().get(param, None)

    config['batch_enabled'] = get_recipe_config().get('batch_enabled', False) \
        and (config['provider'] == 'bing' or config['provider'] == 'mapquest' or config['provider'] == 'uscensus')

    config['batch_size'] = {
        'bing': get_recipe_config().get('batch_size_bing', 50),
        'mapquest': 100,
        'uscensus': get_recipe_config().get('batch_size_uscensus', 1000)
    }.get(config['provider'], 0)

    config['batch_timeout'] = {
        'bing': 10,
        'mapquest': 30,
        'uscensus': 1800
    }.get(config['provider'], 0)

    if get_plugin_config().get('cache_location', 'original') == 'original':
        config['cache_location'] = os.environ["DIP_HOME"] + '/caches/plugins/geocoder/forward'
    else:
        config['cache_location'] = get_plugin_config().get('cache_location_custom', '')

    config['cache_size'] = get_plugin_config().get('forward_cache_size', 1000) * 1000
    config['cache_eviction'] = get_plugin_config().get('forward_cache_policy', 'least-recently-stored')

    prefix = get_recipe_config().get('column_prefix', '')
    for column_name in ['latitude', 'longitude']:
        config[column_name] = prefix + column_name

    if config['provider'] is None:
        raise AttributeError('Please select a geocoding provider.')

    return config
Example No. 18
def load_training_config(recipe_config):
    """Utility function to load, resolve and validate all training recipe config into a clean `params` dictionary

    Returns:
        Dictionary of parameter names (key) and values
    """
    params = {}

    input_dataset_name = get_input_names_for_role("input_dataset")[0]
    params["training_dataset"] = dataiku.Dataset(input_dataset_name)
    training_dataset_columns = [
        p["name"] for p in params["training_dataset"].read_schema()
    ]

    model_folder_name = get_output_names_for_role("model_folder")[0]
    params["model_folder"] = dataiku.Folder(model_folder_name)
    params["partition_root"] = get_folder_partition_root(
        params["model_folder"])
    check_only_one_read_partition(params["partition_root"],
                                  params["training_dataset"])

    evaluation_dataset_name = get_output_names_for_role(
        "evaluation_dataset")[0]
    params["evaluation_dataset"] = dataiku.Dataset(evaluation_dataset_name)

    params["make_forecasts"] = False
    evaluation_forecasts_dataset_names = get_output_names_for_role(
        "evaluation_forecasts_dataset")
    if len(evaluation_forecasts_dataset_names) > 0:
        params["evaluation_forecasts_dataset"] = dataiku.Dataset(
            evaluation_forecasts_dataset_names[0])
        params["make_forecasts"] = True

    params["time_column_name"] = recipe_config.get("time_column")
    if params["time_column_name"] is None:
        raise PluginParamValidationError(
            "Time column is mandatory, please select one")
    elif params["time_column_name"] not in training_dataset_columns:
        raise PluginParamValidationError(
            f"Invalid time column selection: {params['time_column_name']}")

    params["target_columns_names"] = sanitize_column_list(
        recipe_config.get("target_columns"))
    if len(params["target_columns_names"]) == 0 or not all(
            column in training_dataset_columns
            for column in params["target_columns_names"]):
        raise PluginParamValidationError(
            f"Invalid target column(s) selection: {params['target_columns_names']}"
        )
    params["target_columns_names"] = reorder_column_list(
        params["target_columns_names"], training_dataset_columns)

    long_format = recipe_config.get("additional_columns", False)
    if long_format:
        params["timeseries_identifiers_names"] = sanitize_column_list(
            recipe_config.get("timeseries_identifiers", []))
        if not all(column in training_dataset_columns
                   for column in params["timeseries_identifiers_names"]):
            raise PluginParamValidationError(
                f"Invalid time series identifiers selection: {params['timeseries_identifiers_names']}"
            )
    else:
        params["timeseries_identifiers_names"] = []

    params["is_training_multivariate"] = True if (
        len(params["target_columns_names"]) > 1) or (
            len(params["timeseries_identifiers_names"]) > 0) else False

    if long_format and len(params["timeseries_identifiers_names"]) == 0:
        raise PluginParamValidationError(
            "Long format is activated but no time series identifiers have been provided"
        )

    external_feature_activated = recipe_config.get(
        "external_feature_activated", False)
    if external_feature_activated:
        params["external_features_columns_names"] = sanitize_column_list(
            recipe_config.get("external_feature_columns", []))
    else:
        params["external_features_columns_names"] = []
    if not all(column in training_dataset_columns
               for column in params["external_features_columns_names"]):
        raise PluginParamValidationError(
            f"Invalid external features selection: {params['external_features_columns_names']}"
        )

    params["frequency_unit"] = recipe_config.get("frequency_unit")

    if params["frequency_unit"] == "W":
        params[
            "frequency"] = f"W-{recipe_config.get('frequency_end_of_week', 1)}"
    elif params["frequency_unit"] == "H":
        params[
            "frequency"] = f"{recipe_config.get('frequency_step_hours', 1)}H"
    elif params["frequency_unit"] == "min":
        params[
            "frequency"] = f"{recipe_config.get('frequency_step_minutes', 1)}min"
    else:
        params["frequency"] = params["frequency_unit"]

    params["prediction_length"] = recipe_config.get("prediction_length")
    if not params["prediction_length"]:
        raise PluginParamValidationError("Please specify forecasting horizon")

    params["season_length"] = recipe_config.get(
        f"season_length_{params['frequency_unit']}", 1)
    if params["season_length"] < 1:
        raise PluginParamValidationError("Seasonality must be higher than 1")

    params["use_gpu"] = recipe_config.get("use_gpu", False)
    if params["use_gpu"]:
        params["gpu_location"] = recipe_config.get("gpu_location", "local_gpu")
        if params["gpu_location"] == "local_gpu":
            gpu_devices = recipe_config.get("gpu_devices", [])
            params["gpu_devices"] = parse_gpu_devices(gpu_devices)
        else:
            params["gpu_devices"] = [GPU_CONFIGURATION.CONTAINER_GPU]
    else:
        params["gpu_devices"] = None

    params["forecasting_style"] = recipe_config.get("forecasting_style",
                                                    "auto")
    params["epoch"] = recipe_config.get("epoch", 10)
    params["batch_size"] = recipe_config.get("batch_size", 32)

    params["auto_num_batches_per_epoch"] = recipe_config.get(
        "auto_num_batches_per_epoch", True)
    if params["auto_num_batches_per_epoch"]:
        params["num_batches_per_epoch"] = -1
    else:
        params["num_batches_per_epoch"] = recipe_config.get(
            "num_batches_per_epoch", 50)

    if params["num_batches_per_epoch"] == 0:
        raise PluginParamValidationError(
            "Number of batches per epoch cannot be 0")

    # Overwrite values in case of autoML mode selected
    params = automl_params_overwrite(params)

    params["sampling_method"] = recipe_config.get("sampling_method",
                                                  "last_records")
    params["max_timeseries_length"] = None
    if params["sampling_method"] == "last_records":
        params["max_timeseries_length"] = recipe_config.get(
            "number_records", 10000)
        if params["max_timeseries_length"] < 4:
            raise PluginParamValidationError(
                "Number of records must be higher than 4")

    params["evaluation_strategy"] = "split"
    params["evaluation_only"] = False

    printable_params = {
        param: value
        for param, value in params.items()
        if "dataset" not in param and "folder" not in param
    }
    logger.info(f"Recipe parameters: {printable_params}")
    return params
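A hypothetical call site for this loader, matching its signature above (the recipe config is passed in rather than read inside the function):

# Hypothetical usage of load_training_config(); training itself is out of scope here.
from dataiku.customrecipe import get_recipe_config

params = load_training_config(get_recipe_config())
training_df = params["training_dataset"].get_dataframe()
# ... fit a model on params["target_columns_names"] at frequency params["frequency"] ...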
Example No. 19
# -*- coding: utf-8 -*-
import dataiku
from dataiku.customrecipe import get_input_names_for_role, get_recipe_config, get_output_names_for_role
from jira_client import JiraClient
from utils import de_float_column
import pandas as pd

input_datasets_name = get_input_names_for_role('input_datasets_name')
config = get_recipe_config()

id_column_name = config.get('id_column_name')
id_list_df = dataiku.Dataset(input_datasets_name[0]).get_dataframe()
id_list_df_types = id_list_df.dtypes
de_float_column(id_list_df, id_column_name)

queue_id_column_name = config.get('queue_id_column_name', None)
de_float_column(id_list_df, queue_id_column_name)

access_type = get_recipe_config()['access_type']
connection_details = get_recipe_config()[access_type]
endpoint_name = get_recipe_config()['endpoint_name']
expand = get_recipe_config()['expand']

client = JiraClient(connection_details)
client.start_session(endpoint_name)

results = []
for index in id_list_df.index:
    jira_id = id_list_df[id_column_name][index]
    indexes_columns = {"jira_id": jira_id}
    if queue_id_column_name is not None:
Example No. 20
    "name": "salesforce_record_id",
    "type": "string"
}, {
    "name": "data",
    "type": "object"
}])

# Read configuration
config = get_recipe_config()
object_name = config.get('object_name', None)
if object_name is None:
    raise Exception("Object name has to be set")

client = SalesforceClient(config)

incoming_dataset_name = get_input_names_for_role('incoming_dataset_name')
incoming_dataset = dataiku.Dataset(incoming_dataset_name[0])
incoming_dataset_df = incoming_dataset.get_dataframe()
writer = output.get_writer()
json_dataset = json.loads(
    incoming_dataset_df.to_json(orient="records")
)  # turning row into json would get None int to be replaced by NaN
for salesforce_record in json_dataset:
    salesforce_record_id = salesforce_record.pop("Id", None)
    if salesforce_record_id is None:
        response = client.create_record(object_name, salesforce_record)
        writer.write_row_dict({
            "operation": "Added",
            "error": response.get("error", None),
            "salesforce_record_id": response.get("id", None),
            "data": json.dumps(salesforce_record)
Example No. 21
import dataiku
from dataiku.customrecipe import get_input_names_for_role, get_output_names_for_role, get_recipe_config
import pandas as pd
from dku_idtb_decision_tree.tree import Tree
from dku_idtb_scoring.score import score, write_with_schema
from dku_idtb_compatibility.utils import safe_str
from dataiku.doctor.prediction.reg_evaluation_recipe import compute_multiclass_metrics, compute_binary_classification_metrics

input_dataset = dataiku.Dataset(get_input_names_for_role("input_dataset")[0])
scored_dataset = dataiku.Dataset(
    get_output_names_for_role("scored_dataset")[0])
metrics_dataset = dataiku.Dataset(
    get_output_names_for_role("metrics_dataset")[0])
folder = dataiku.Folder(get_input_names_for_role("folder")[0])
chunk_size_param = get_recipe_config()["chunk_size"]

try:
    tree = folder.read_json(get_recipe_config()["tree_file"])
except ValueError:
    raise Exception("No tree file named " + get_recipe_config()["tree_file"])

tree["df"] = input_dataset.get_dataframe()
tree = Tree(**tree)

scored_df = score(tree, input_dataset, chunk_size_param, True)

target_mapping = {
    safe_str(label): index
    for index, label in enumerate(tree.target_values)
}
scored_df_nona = scored_df.dropna(subset=["prediction"])
y_actual, y_pred = scored_df_nona[tree.target], scored_df_nona.prediction
Example No. 22
def get_input_dataset(role):
    names = get_input_names_for_role(role)
    return dataiku.Dataset(names[0]) if len(names) > 0 else None
# Params for parallelization
column_prefix = "translation_api"
parallel_workers = api_configuration_preset.get("parallel_workers")
error_handling = ErrorHandlingEnum[get_recipe_config().get("error_handling")]

# Params for translation
client = get_client(api_configuration_preset.get("gcp_service_account_key"))
api_quota_rate_limit = api_configuration_preset.get("api_quota_rate_limit")
api_quota_period = api_configuration_preset.get("api_quota_period")

# ==============================================================================
# DEFINITIONS
# ==============================================================================

input_dataset = dataiku.Dataset(get_input_names_for_role("input_dataset")[0])
output_dataset = dataiku.Dataset(
    get_output_names_for_role("output_dataset")[0])
validate_column_input(text_column,
                      [col["name"] for col in input_dataset.read_schema()])
input_df = input_dataset.get_dataframe()


@retry((RateLimitException, OSError), delay=api_quota_period, tries=5)
@limits(calls=api_quota_rate_limit, period=api_quota_period)
def call_translation_api(row: Dict,
                         text_column: AnyStr,
                         target_language: AnyStr,
                         source_language: AnyStr = None) -> AnyStr:
    text = row[text_column]
    if not isinstance(text, str) or str(text).strip() == "":
def load_input_output_params(recipe_id: RecipeID) -> Dict:
    """Load and validate input/output parameters for both indexing and search recipes

    Returns:
        Dictionary of parameter names (key) and values

    Raises:
        PluginParamValidationError: If a parameter is not valid

    """
    params = {}
    # Index folder
    if recipe_id == RecipeID.SIMILARITY_SEARCH_INDEX:
        output_folder_names = get_output_names_for_role("index_folder")
        if len(output_folder_names) == 0:
            raise PluginParamValidationError(
                "Please specify index folder as output")
        params["index_folder"] = dataiku.Folder(output_folder_names[0])
        params["folder_partition_root"] = get_folder_partition_root(
            params["index_folder"])
    elif recipe_id == RecipeID.SIMILARITY_SEARCH_QUERY:
        input_folder_names = get_input_names_for_role("index_folder")
        if len(input_folder_names) == 0:
            raise PluginParamValidationError(
                "Please specify index folder as input")
        params["index_folder"] = dataiku.Folder(input_folder_names[0])
        params["folder_partition_root"] = get_folder_partition_root(
            params["index_folder"], is_input=True)
        check_only_one_read_partition(params["folder_partition_root"],
                                      params["index_folder"])
    # Input dataset
    input_dataset_names = get_input_names_for_role("input_dataset")
    if len(input_dataset_names) == 0:
        raise PluginParamValidationError("Please specify input dataset")
    params["input_dataset"] = dataiku.Dataset(input_dataset_names[0])
    input_dataset_columns = [
        p["name"] for p in params["input_dataset"].read_schema()
    ]
    check_only_one_read_partition(params["folder_partition_root"],
                                  params["input_dataset"])
    if recipe_id == RecipeID.SIMILARITY_SEARCH_QUERY:
        if params["index_folder"].read_partitions != params[
                "input_dataset"].read_partitions:
            raise PluginParamValidationError(
                "Inconsistent partitions between index folder and input dataset, please make sure both are partitioned with the same dimensions"
            )
    # Output dataset - only for search recipe
    if recipe_id == RecipeID.SIMILARITY_SEARCH_QUERY:
        output_dataset_names = get_output_names_for_role("output_dataset")
        if len(output_dataset_names) == 0:
            raise PluginParamValidationError("Please specify output dataset")
        params["output_dataset"] = dataiku.Dataset(output_dataset_names[0])
    # Recipe input parameters
    recipe_config = get_recipe_config()
    params["unique_id_column"] = recipe_config.get("unique_id_column")
    if params["unique_id_column"] not in input_dataset_columns:
        raise PluginParamValidationError(
            f"Invalid unique ID column: {params['unique_id_column']}")
    params["feature_columns"] = recipe_config.get("feature_columns", [])
    if not set(params["feature_columns"]).issubset(set(input_dataset_columns)):
        raise PluginParamValidationError(
            f"Invalid feature column(s): {params['feature_columns']}")
    printable_params = {
        k: v
        for k, v in params.items()
        if k not in {"input_dataset", "index_folder", "output_dataset"}
    }
    logging.info(f"Validated input/output parameters: {printable_params}")
    return params
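A hypothetical usage sketch, assuming RecipeID is the enum referenced by the function above:

# Hypothetical usage of load_input_output_params() for the search recipe.
params = load_input_output_params(RecipeID.SIMILARITY_SEARCH_QUERY)
columns = [params["unique_id_column"]] + params["feature_columns"]
input_df = params["input_dataset"].get_dataframe(columns=columns)  # only the validated columns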
Example No. 25
def get_partitioning_keys(id_list, dku_flow_variables):
    partitioning_keys = {}
    partitioning = id_list.get_config().get("partitioning")
    if partitioning:
        dimensions_types = partitioning.get("dimensions", [])
        dimensions = []
        for dimension_type in dimensions_types:
            dimensions.append(dimension_type.get("name"))
        for dimension in dimensions:
            dimension_src = "DKU_DST_{}".format(dimension)
            if dimension_src in dku_flow_variables:
                partitioning_keys[dimension] = dku_flow_variables.get(dimension_src)
    return partitioning_keys


input_A_names = get_input_names_for_role('input_A_role')
config = get_recipe_config()
dku_flow_variables = dataiku.get_flow_variables()

logger.info("config={}".format(logger.filter_secrets(config)))

credential_parameters = config.get("credential", {})
endpoint_parameters = get_endpoint_parameters(config)
extraction_key = endpoint_parameters.get("extraction_key", "")
is_raw_output = endpoint_parameters.get("raw_output", True)
parameter_columns = [column for column in config.get("parameter_columns", []) if column]
if len(parameter_columns) == 0:
    raise ValueError("There is no parameter column selected.")
parameter_renamings = get_dku_key_values(config.get("parameter_renamings", {}))
custom_key_values = get_dku_key_values(config.get("custom_key_values", {}))
input_parameters_dataset = dataiku.Dataset(input_A_names[0])
Example No. 26
def load_plugin_config_wordcloud() -> Dict:
    """Utility function to validate and load language detection parameters into a clean dictionary

    Returns:
        Dictionary of parameter names (key) and values

    """
    params = {}
    # Input dataset
    input_dataset_names = get_input_names_for_role("input_dataset")
    if len(input_dataset_names) != 1:
        raise PluginParamValidationError("Please specify one input dataset")
    input_dataset = dataiku.Dataset(input_dataset_names[0])
    input_dataset_columns = [p["name"] for p in input_dataset.read_schema()]

    # Output folder
    output_folder_names = get_output_names_for_role("output_folder")
    if len(output_folder_names) != 1:
        raise PluginParamValidationError("Please specify one output folder")
    params["output_folder"] = dataiku.Folder(output_folder_names[0])

    # Partition handling
    params["output_partition_path"] = get_folder_partition_root(
        params["output_folder"])

    # Recipe parameters
    recipe_config = get_recipe_config()

    # Text column
    params["text_column"] = recipe_config.get("text_column")
    if params["text_column"] not in input_dataset_columns:
        raise PluginParamValidationError(
            f"Invalid text column selection: {params['text_column']}")
    logging.info(f"Text column: {params['text_column']}")
    # Language selection
    params["language"] = recipe_config.get("language")
    if params["language"] == "language_column":
        params["language_column"] = recipe_config.get("language_column")
        if params["language_column"] not in input_dataset_columns:
            raise PluginParamValidationError(
                f"Invalid language column selection: {params['language_column']}"
            )
        logging.info(f"Language column: {params['language_column']}")
    else:
        if not params["language"]:
            raise PluginParamValidationError("Empty language selection")
        if params["language"] not in SUPPORTED_LANGUAGES_SPACY:
            raise PluginParamValidationError(
                f"Unsupported language code: {params['language']}")
        params["language_column"] = None
        logging.info(f"Language: {params['language']}")
    # Subcharts
    params["subchart_column"] = recipe_config.get("subchart_column")
    # If the parameter was saved and then cleared, the config returns an empty string
    params["subchart_column"] = None if not params[
        "subchart_column"] else params["subchart_column"]
    if params["subchart_column"] and (
        (params["subchart_column"]
         not in input_dataset_columns + ["order66"])):
        raise PluginParamValidationError(
            f"Invalid categorical column selection: {params['subchart_column']}"
        )
    logging.info(f"Subcharts column: {params['subchart_column']}")

    # Input dataframe
    necessary_columns = [
        column for column in set([
            params["text_column"], params["language_column"],
            params["subchart_column"]
        ]) if (column not in [None, "order66"])
    ]
    params["df"] = input_dataset.get_dataframe(columns=necessary_columns)
    if params["df"].empty:
        raise PluginParamValidationError("Dataframe is empty")
    # Check if unsupported languages in multilingual case
    elif params["language_column"]:
        languages = set(params["df"][params["language_column"]].unique())
        unsupported_lang = languages - SUPPORTED_LANGUAGES_SPACY.keys()
        if unsupported_lang:
            raise PluginParamValidationError(
                f"Found {len(unsupported_lang)} unsupported languages: {', '.join(sorted(unsupported_lang))}"
            )

    logging.info(f"Read dataset of shape: {params['df'].shape}")

    return params
Example No. 27
            write_to_dataset(file_info)
        elif upload_details.json()["processing_status"] == "success":
            response = requests.get(
                endpoint + "/documents/" + file_id +
                "/ocr?include_raw_types=false",
                headers=headers,
            )
            ocr_response = response.json()
            create_dataframe(ocr_response, file_name)
            logger.error("Extracted OCR from document {}".format(file_name))
        else:
            logger.error(
                "Error extracting OCR from document {}".format(file_name))


input_folder = get_input_names_for_role("ocr_file_upload")
output_dataset = get_output_names_for_role("ocr_data")
input_handle = dataiku.Folder(input_folder[0])
findataset = dataiku.Dataset(output_dataset[0])
cred = get_recipe_config()["credentials"]
usr = cred["login_credentials"]["user"]
pwd = cred["login_credentials"]["password"]
endpoint = "https://api.natif.ai"
ocr_dataframe = pd.DataFrame()
allowed_filetypes = ["jpg", "jpeg", "tif", "tiff", "png", "pdf", "gif"]
# Global lists that contain the dataset data width,height,text,file_name,entropy,x1_pos,x2_pos,y1_pos,y2_pos,page_count,box_id
width = list()
height = list()
text = list()
file_name = list()
entropy = list()
Example No. 28
# -*- coding: utf-8 -*-
import datetime
import dataiku
from dataiku.customrecipe import get_input_names_for_role, get_output_names_for_role, get_recipe_config
from googlesheets import get_spreadsheet
from gspread.utils import rowcol_to_a1

# Input
input_name = get_input_names_for_role('input_role')[0]
input_dataset = dataiku.Dataset(input_name)
input_schema = input_dataset.read_schema()

# Output
output_name = get_output_names_for_role('output_role')[0]
output_dataset = dataiku.Dataset(output_name)
output_dataset.write_schema(input_schema)

# Get configuration
config = get_recipe_config()
credentials = config.get("credentials")
doc_id = config.get("doc_id")
tab_id = config.get("tab_id")
insert_format = config.get("insert_format")

# Load worksheet
ws = get_spreadsheet(credentials, doc_id, tab_id)


# Make available a method of later version of gspread (probably 3.4.0)
# from https://github.com/burnash/gspread/pull/556
def append_rows(self, values, value_input_option='RAW'):
Example No. 29
def load_config_and_data_wordcloud() -> Tuple[PluginParams, pd.DataFrame]:
    """Utility function to:
        - Validate and load wordcloud parameters into a clean class
        - Validate input data, keep only necessary columns and drop invalid rows

    Returns:
        - Class instance with parameter names as attributes and associated values
        - Pandas DataFrame with necessary input data
    """

    params = PluginParams()
    # Input dataset
    input_dataset_names = get_input_names_for_role("input_dataset")
    if len(input_dataset_names) != 1:
        raise PluginParamValidationError("Please specify one input dataset")
    input_dataset = dataiku.Dataset(input_dataset_names[0])
    input_dataset_columns = [p["name"] for p in input_dataset.read_schema()]

    # Output folder
    output_folder_names = get_output_names_for_role("output_folder")
    if len(output_folder_names) != 1:
        raise PluginParamValidationError("Please specify one output folder")
    params.output_folder = dataiku.Folder(output_folder_names[0])

    # Partition handling
    params.output_partition_path = get_folder_partition_root(
        params.output_folder)

    # Recipe parameters
    recipe_config = get_recipe_config()

    # Text column
    if recipe_config.get("text_column") not in input_dataset_columns:
        raise PluginParamValidationError(
            f"Invalid text column selection: {recipe_config.get('text_column')}"
        )
    params.text_column = recipe_config.get("text_column")
    logging.info(f"Text column: {params.text_column}")
    # Language selection

    if recipe_config.get("language") == "language_column":
        if recipe_config.get("language_column") not in input_dataset_columns:
            raise PluginParamValidationError(
                f"Invalid language column selection: {recipe_config.get('language_column')}"
            )
        params.language = recipe_config.get("language")
        params.language_column = recipe_config.get("language_column")
        logging.info(f"Language column: {params.language_column}")
    else:
        if not recipe_config.get("language"):
            raise PluginParamValidationError("Empty language selection")
        if recipe_config.get("language") not in SUPPORTED_LANGUAGES_SPACY:
            raise PluginParamValidationError(
                f"Unsupported language code: {recipe_config.get('language')}")
        params.language = recipe_config.get("language")
        params.language_column = None
        logging.info(f"Language: {params.language}")

    # Subcharts
    subchart_column = recipe_config.get("subchart_column")
    # If the parameter was saved and then cleared, the config returns an empty string
    subchart_column = None if not subchart_column else subchart_column
    if subchart_column and ((subchart_column
                             not in input_dataset_columns + ["order66"])):
        raise PluginParamValidationError(
            f"Invalid categorical column selection: {subchart_column}")
    params.subchart_column = subchart_column
    logging.info(f"Subcharts column: {params.subchart_column}")

    # Input dataframe
    necessary_columns = [
        column for column in set([
            params.text_column,
            params.language_column,
            params.subchart_column,
        ]) if (column not in [None, "order66"])
    ]
    df = input_dataset.get_dataframe(columns=necessary_columns).dropna(
        subset=necessary_columns)
    if df.empty:
        raise PluginParamValidationError("Dataframe is empty")
    # Check if unsupported languages in multilingual case
    elif params.language_column:
        languages = set(df[params.language_column].unique())
        unsupported_lang = languages - SUPPORTED_LANGUAGES_SPACY.keys()
        if unsupported_lang:
            raise PluginParamValidationError(
                f"Found {len(unsupported_lang)} unsupported languages: {', '.join(sorted(unsupported_lang))}"
            )

    logging.info(f"Read dataset of shape: {df.shape}")

    # Text simplification parameters
    params.remove_stopwords = recipe_config.get("remove_stopwords")
    params.stopwords_folder_path = os.path.join(
        get_recipe_resource(),
        "stopwords") if params.remove_stopwords else None
    params.font_folder_path = os.path.join(get_recipe_resource(), "fonts")
    params.remove_punctuation = recipe_config.get("remove_punctuation")
    params.case_insensitive = recipe_config.get("case_insensitive")
    logging.info(f"Remove stopwords: {params.remove_stopwords}")
    logging.info(f"Stopwords folder path: {params.stopwords_folder_path}")
    logging.info(f"Fonts folder path: {params.font_folder_path}")
    logging.info(f"Remove punctuation: {params.remove_punctuation}")
    logging.info(f"Case-insensitive: {params.case_insensitive}")

    # Display parameters:
    max_words = recipe_config.get("max_words")
    if (not max_words) or not ((isinstance(max_words, int)) and
                               (max_words >= 1)):
        raise PluginParamValidationError(
            "Maximum number of words is not a positive integer")
    params.max_words = max_words
    logging.info(f"Max number of words: {params.max_words}")

    color_palette = recipe_config.get("color_palette")
    if not color_palette:
        raise PluginParamValidationError("Empty color palette selection")
    if color_palette == "custom":
        color_list = recipe_config.get("color_list")
        if not (isinstance(color_list, list) and (len(color_list) >= 1)):
            raise PluginParamValidationError("Empty custom palette")
        if not all(
            [matplotlib.colors.is_color_like(color) for color in color_list]):
            raise PluginParamValidationError(
                f"Invalid custom palette: {color_list}")
        params.color_list = [
            matplotlib.colors.to_hex(color) for color in color_list
        ]
        logging.info(f"Custom palette: {params.color_list}")
    else:
        if color_palette not in {
                builtin_palette["id"]
                for builtin_palette in DSS_BUILTIN_COLOR_PALETTES
        }:
            raise PluginParamValidationError(
                f"Unsupported color palette: {color_palette}")
        selected_palette_dict = [
            builtin_palette for builtin_palette in DSS_BUILTIN_COLOR_PALETTES
            if builtin_palette["id"] == color_palette
        ][0]
        params.color_list = selected_palette_dict["colors"]
        logging.info(
            f"Using built-in DSS palette: '{selected_palette_dict['name']}' with colors: {params.color_list}"
        )

    return params, df
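A hypothetical call site for the loader above (the word cloud generation itself is not shown in this example):

# Hypothetical usage of load_config_and_data_wordcloud().
params, df = load_config_and_data_wordcloud()
texts = df[params.text_column]   # column already validated against the input schema
# ... build word clouds and save the images into params.output_folder ...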
Example No. 30
import os
import sys

import dataiku
import nltk
from dataiku.customrecipe import get_input_names_for_role, get_recipe_config

# See below for why using NLTK_DATA is not possible
# https://stackoverflow.com/questions/44857382/change-nltk-download-path-directory-from-default-ntlk-data/47082481#47082481
cache_folder = os.getenv("NLTK_HOME")
nltk.data.path.append(cache_folder)

##################################
# Find python version
##################################

PY2 = sys.version_info[0] == 2

##################################
# Input data
##################################

input_dataset = get_input_names_for_role('input_dataset')[0]
df = dataiku.Dataset(input_dataset).get_dataframe()

##################################
# Parameters
##################################

recipe_config = get_recipe_config()

text_column_name = recipe_config.get('text_column_name', None)
if text_column_name is None:
    raise ValueError("You did not choose a text column.")

n_sentences = recipe_config.get('n_sentences', None)
if n_sentences is None:
    raise ValueError("You did not set a number of sentences.")
Example No. 31
import dataiku
import meaningcloud
import pandas as pd
from dataiku.customrecipe import (
    get_input_names_for_role,
    get_output_names_for_role,
    get_recipe_config,
    get_plugin_config,
)
from meaningcloud_common import setRequestSource, isBlockingErrorType

# ==============================================================================
# PLUGIN + RECIPE SETTINGS
# ==============================================================================

input_name = get_input_names_for_role("input_dataset")[0]
output_name = get_output_names_for_role("output_dataset")[0]

input_dataset = dataiku.Dataset(input_name)
output_dataset = dataiku.Dataset(output_name)

meaningcloud_connection = get_plugin_config().get("meaningcloud_connection")

license_key = meaningcloud_connection.get("license_key", None)
server = meaningcloud_connection.get("meaningcloud_server",
                                     "https://api.meaningcloud.com")
sentences = int(get_recipe_config().get("sentences", 5))
text_column = get_recipe_config().get("column_name", None)

# ==============================================================================
# AUXILIARY FUNCTIONS