def get_design_input_output() -> tuple:
    """Returns input dataset, optional folder name and output dataset after sanity check

    :raises: :class:`ValueError`: Missing input or output dataset(s)

    :returns: input dataset, folder name (or None) and output dataset
    :rtype: tuple
    """
    input_names = get_input_names_for_role("user_list")
    if len(input_names) == 0:
        raise ValueError("No input dataset.")
    output_names = get_output_names_for_role("groups")
    if len(output_names) == 0:
        raise ValueError("No output dataset.")
    input_name = input_names[0]
    input_dataset = dataiku.Dataset(input_name)
    folder_ref = get_input_names_for_role('folder')
    if len(folder_ref) == 0:
        folder_name = None
    else:
        folder_name = folder_ref[0]
    output_name = output_names[0]
    output_dataset = dataiku.Dataset(output_name)
    return input_dataset, folder_name, output_dataset
def load_input_output(config):
    if not get_input_names_for_role("input_dataset"):
        raise ValueError("No input dataset.")
    input_dataset_name = get_input_names_for_role("input_dataset")[0]
    config.input_dataset = Dataset(input_dataset_name)
    output_dataset_name = get_output_names_for_role("output_dataset")[0]
    config.output_dataset = Dataset(output_dataset_name)
def get_input_output():
    if len(get_input_names_for_role("input_dataset")) == 0:
        raise ValueError("No input dataset.")
    input_dataset_name = get_input_names_for_role("input_dataset")[0]
    input_dataset = dataiku.Dataset(input_dataset_name)
    output_folder_name = get_output_names_for_role("output_folder")[0]
    output_folder = dataiku.Folder(output_folder_name)
    return (input_dataset, output_folder)
def get_input_output():
    if len(get_input_names_for_role('input_dataset')) == 0:
        raise ValueError('No input dataset.')
    input_dataset_name = get_input_names_for_role('input_dataset')[0]
    input_dataset = dataiku.Dataset(input_dataset_name)
    if len(get_output_names_for_role('output_dataset')) == 0:
        raise ValueError('No output dataset.')
    output_dataset_name = get_output_names_for_role('output_dataset')[0]
    output_dataset = dataiku.Dataset(output_dataset_name)
    return (input_dataset, output_dataset)
def load_predict_config():
    """Utility function to load, resolve and validate all predict recipe config into a clean `params` dictionary

    Returns:
        Dictionary of parameter names (key) and values
    """
    params = {}
    recipe_config = get_recipe_config()

    # Model folder
    model_folder = dataiku.Folder(get_input_names_for_role("model_folder")[0])
    params["model_folder"] = model_folder
    params["partition_root"] = get_folder_partition_root(params["model_folder"], is_input=True)

    # Optional dataset of external features for the forecast horizon
    params["external_features_future_dataset"] = None
    external_features_future_dataset_names = get_input_names_for_role("external_features_future_dataset")
    if len(external_features_future_dataset_names) > 0:
        params["external_features_future_dataset"] = dataiku.Dataset(external_features_future_dataset_names[0])

    # Output dataset
    output_dataset_names = get_output_names_for_role("output_dataset")
    if len(output_dataset_names) == 0:
        raise PluginParamValidationError("Please specify Forecast dataset in the 'Input / Output' tab of the recipe")
    params["output_dataset"] = dataiku.Dataset(output_dataset_names[0])

    check_only_one_read_partition(params["partition_root"], params["model_folder"])
    check_only_one_read_partition(params["partition_root"], params["external_features_future_dataset"])

    params["manual_selection"] = True if recipe_config.get("model_selection_mode") == "manual" else False
    params["performance_metric"] = recipe_config.get("performance_metric")
    params["selected_session"] = recipe_config.get("manually_selected_session", "latest_session")
    params["selected_model_label"] = recipe_config.get("manually_selected_model_label")
    params["prediction_length"] = recipe_config.get("prediction_length", -1)
    params["confidence_interval"] = recipe_config.get("confidence_interval", 95)
    params["quantiles"] = convert_confidence_interval_to_quantiles(params["confidence_interval"])
    params["include_history"] = recipe_config.get("include_history", False)

    params["sampling_method"] = recipe_config.get("sampling_method", "last_records")
    params["history_length_limit"] = None
    if params["sampling_method"] == "last_records":
        params["history_length_limit"] = recipe_config.get("number_records", 1000)
        if params["history_length_limit"] < 1:
            raise PluginParamValidationError("Number of historical records must be at least 1")

    printable_params = {param: value for param, value in params.items() if "dataset" not in param and "folder" not in param}
    logger.info(f"Recipe parameters: {printable_params}")
    return params
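# A minimal sketch of what a helper like `convert_confidence_interval_to_quantiles`
# (called above) could look like. The exact signature and return format used by the
# plugin are assumptions for illustration, not taken from its source.
def convert_confidence_interval_to_quantiles(confidence_interval: float) -> list:
    """For example, a 95% confidence interval maps to the quantiles [0.025, 0.5, 0.975]."""
    if confidence_interval <= 0 or confidence_interval >= 100:
        raise ValueError("Confidence interval must be strictly between 0 and 100")
    alpha = (1 - confidence_interval / 100.0) / 2.0
    return [round(alpha, 4), 0.5, round(1 - alpha, 4)]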
def validate_input_params(self) -> Dict:
    """Validate input parameters"""
    input_params = {}
    input_folder_names = get_input_names_for_role("input_folder")
    if len(input_folder_names) == 0:
        raise PluginParamValidationError("Please specify input folder")
    input_params["input_folder"] = dataiku.Folder(input_folder_names[0])
    if self.recipe_id == RecipeID.DOCUMENT_TEXT_DETECTION:
        file_extensions = GoogleCloudVisionAPIWrapper.SUPPORTED_DOCUMENT_FORMATS
        self.batch_support = True
    else:
        file_extensions = GoogleCloudVisionAPIWrapper.SUPPORTED_IMAGE_FORMATS
    input_params["input_df"] = generate_path_df(
        folder=input_params["input_folder"], file_extensions=file_extensions, path_column=PATH_COLUMN
    )
    input_folder_type = input_params["input_folder"].get_info().get("type", "")
    input_params["input_folder_is_gcs"] = input_folder_type == "GCS"
    if input_params["input_folder_is_gcs"]:
        self.batch_support = True
        input_folder_access_info = input_params["input_folder"].get_info().get("accessInfo", {})
        input_params["input_folder_bucket"] = input_folder_access_info.get("bucket")
        input_params["input_folder_root_path"] = str(input_folder_access_info.get("root", ""))[1:]
        logging.info("Input folder is stored on GCS, enabling Batch API feature")
    else:
        logging.info(f"Input folder is not stored on GCS ({input_folder_type}), disabling Batch API feature")
    return input_params
def get_config():
    config = {}
    config['input_ds'] = dataiku.Dataset(get_input_names_for_role('input_ds')[0])
    config['output_ds'] = dataiku.Dataset(get_output_names_for_role('output_ds')[0])
    for param in ['lat_column', 'lng_column', 'provider', 'cache_enabled', 'api_key',
                  'here_app_id', 'here_app_code', 'google_client', 'google_client_secret']:
        config[param] = get_recipe_config().get(param, None)
    config['batch_enabled'] = get_recipe_config().get('batch_enabled', False) \
        and (config['provider'] == 'bing')
    config['batch_size'] = get_recipe_config().get('batch_size_bing', 50)
    config['features'] = []
    prefix = get_recipe_config().get('column_prefix', '')
    for feature in ['address', 'city', 'postal', 'state', 'country']:
        if get_recipe_config().get(feature, False):
            config['features'].append({'name': feature, 'column': prefix + feature})
    if get_plugin_config().get('cache_location', 'original') == 'original':
        config['cache_location'] = os.environ["DIP_HOME"] + '/caches/plugins/geocoder/reverse'
    else:
        config['cache_location'] = get_plugin_config().get('cache_location_custom', '')
    config['cache_size'] = get_plugin_config().get('reverse_cache_size', 1000) * 1000
    config['cache_eviction'] = get_plugin_config().get('reverse_cache_policy', 'least-recently-stored')
    if len(config['features']) == 0:
        raise AttributeError('Please select at least one feature to extract.')
    if config['provider'] is None:
        raise AttributeError('Please select a geocoding provider.')
    return config
def validate_input_params(self) -> Dict:
    """Validate input parameters"""
    input_params_dict = {}
    input_folder_names = get_input_names_for_role("input_folder")
    if len(input_folder_names) == 0:
        raise PluginParamValidationError("Please specify input folder")
    input_params_dict["input_folder"] = dataiku.Folder(input_folder_names[0])
    image_path_list = [
        p for p in generate_path_list(input_params_dict["input_folder"])
        if p.split(".")[-1].lower() in {"jpeg", "jpg", "png"}
    ]
    if len(image_path_list) == 0:
        raise PluginParamValidationError("No images of supported format (PNG or JPG) were found in input folder")
    input_params_dict["input_df"] = pd.DataFrame(image_path_list, columns=[IMAGE_PATH_COLUMN])
    input_params_dict["input_folder_is_s3"] = input_params_dict["input_folder"].get_info().get("type", "") == "S3"
    if input_params_dict["input_folder_is_s3"]:
        input_folder_access_info = input_params_dict["input_folder"].get_info().get("accessInfo", {})
        input_params_dict["input_folder_bucket"] = input_folder_access_info.get("bucket")
        input_params_dict["input_folder_root_path"] = str(input_folder_access_info.get("root", ""))[1:]
        logging.info(
            "Input folder is on Amazon S3 with bucket: {} and root path: {}".format(
                input_params_dict["input_folder_bucket"], input_params_dict["input_folder_root_path"]
            )
        )
    return input_params_dict
def __init__(self):
    """Instantiate class with DkuConfigLoading and add input datasets to dku_config"""
    super().__init__()
    text_input = get_input_names_for_role("document_dataset")[0]
    self.dku_config.add_param(name="text_input", value=Dataset(text_input), required=True)
    ontology_input = get_input_names_for_role("ontology_dataset")[0]
    self.dku_config.add_param(name="ontology_input", value=Dataset(ontology_input), required=True)
    self.document_dataset_columns = [p["name"] for p in self.dku_config.text_input.read_schema()]
    self.ontology_dataset_columns = [p["name"] for p in self.dku_config.ontology_input.read_schema()]
def get_inputs(self):
    self.folder = Folder(get_output_names_for_role("folder_id")[0])
    self.output_file_path = get_recipe_config()['output_model_path']
    self.overwrite_output_model = get_recipe_config()['overwrite_output_model']
    self.batch_size = int(get_recipe_config()['batch_size'])
    if not get_recipe_config()['show_batch_size']:
        self.batch_size = -1
    self.model = Model(get_input_names_for_role("saved_model_id")[0])
    self.float_32 = get_recipe_config()["float_32"]
def load_plugin_config_langdetect() -> Dict:
    """Utility function to validate and load language detection parameters into a clean dictionary

    Returns:
        Dictionary of parameter names (key) and values
    """
    params = {}

    # Input dataset
    input_dataset_names = get_input_names_for_role("input_dataset")
    if len(input_dataset_names) == 0:
        raise PluginParamValidationError("Please specify input dataset")
    params["input_dataset"] = dataiku.Dataset(input_dataset_names[0])
    input_dataset_columns = [p["name"] for p in params["input_dataset"].read_schema()]

    # Output dataset
    output_dataset_names = get_output_names_for_role("output_dataset")
    if len(output_dataset_names) == 0:
        raise PluginParamValidationError("Please specify output dataset")
    params["output_dataset"] = dataiku.Dataset(output_dataset_names[0])

    # Recipe parameters
    recipe_config = get_recipe_config()

    # Text column
    params["text_column"] = recipe_config.get("text_column")
    if params["text_column"] not in input_dataset_columns:
        raise PluginParamValidationError(f"Invalid text column selection: {params['text_column']}")
    logging.info(f"Text column: {params['text_column']}")

    # Language scope
    params["language_scope"] = recipe_config.get("language_scope", [])
    if len(params["language_scope"]) == 0:
        params["language_scope"] = SUPPORTED_LANGUAGES_PYCLD3
    if len(params["language_scope"]) == 0:
        raise PluginParamValidationError(f"Invalid language scope: {params['language_scope']}")
    logging.info(f"Scope of {len(params['language_scope'])} languages: {params['language_scope']}")

    # Minimum score
    params["minimum_score"] = float(recipe_config.get("minimum_score", 0))
    if params["minimum_score"] < 0 or params["minimum_score"] > 1:
        raise PluginParamValidationError("Minimum score must be between 0 and 1")
    logging.info(f"Minimum score for detection: {params['minimum_score']:.2f}")

    # Fallback language
    params["fallback_language"] = recipe_config.get("fallback_language")
    if not params["fallback_language"] or params["fallback_language"] == "None":
        logging.info("No fallback language")
        params["fallback_language"] = ""
    else:
        logging.info(f"Fallback language: {params['fallback_language']}")

    return params
def get_input_output(has_model_as_second_input=False):
    if len(get_input_names_for_role('new')) == 0:
        raise ValueError('No new dataset.')
    if len(get_output_names_for_role('output_dataset')) == 0:
        raise ValueError('No output dataset.')

    new_dataset_name = get_input_names_for_role('new')[0]
    new_dataset = dataiku.Dataset(new_dataset_name)
    output_dataset_name = get_output_names_for_role('output_dataset')[0]
    output_dataset = dataiku.Dataset(output_dataset_name)

    if has_model_as_second_input:
        if len(get_input_names_for_role('model')) == 0:
            raise ValueError('No input model.')
        model_name = get_input_names_for_role('model')[0]
        model = dataiku.Model(model_name)
        return (new_dataset, model, output_dataset)
    else:
        if len(get_input_names_for_role('original')) == 0:
            raise ValueError('No original dataset.')
        original_dataset_name = get_input_names_for_role('original')[0]
        original_dataset = dataiku.Dataset(original_dataset_name)
        return (new_dataset, original_dataset, output_dataset)
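# Illustrative call of get_input_output above (it would only run inside a DSS recipe);
# with a model as second input the returned tuple is (new_dataset, model, output_dataset),
# otherwise it is (new_dataset, original_dataset, output_dataset). Variable names are
# examples only.
new_dataset, model, output_dataset = get_input_output(has_model_as_second_input=True)
new_df = new_dataset.get_dataframe()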
def main():
    # getting the csv folder address
    havas_logs = Folder(customrecipe.get_input_names_for_role("files_folder")[0])
    havas_logs_path = havas_logs.get_path()
    files_to_process = get_files_to_process(havas_logs_path)
    # preparing dataset to write into
    havas_cost_data = DatasetWrapper(customrecipe.get_output_names_for_role("cost_data")[0])
    # havas_cost_data.dataset.spec_item['appendMode'] = True
    # writing into dataset
    append_files_to_dataset(files_to_process, havas_cost_data)
    # closing dataset and saving lines
    havas_cost_data.close()
def get_inputs(self):
    self.input_folder = Folder(get_input_names_for_role("input_folder_id")[0])
    output_folder_id = get_output_names_for_role("output_folder_id")[0]
    self.output_folder = Folder(output_folder_id)
    self.output_file_path = get_recipe_config()['output_model_path']
    self.batch_size = int(get_recipe_config()['batch_size'])
    if not get_recipe_config()['show_batch_size']:
        self.batch_size = -1
    self.overwrite_output_model = get_recipe_config()['overwrite_output_model']
    self.model_path = get_recipe_config()['model_path']
    self.model_name = os_splitext(os_split(self.model_path)[1])[0]
    self.float_32 = get_recipe_config()["float_32"]
def apply_func(func, client=None, input_dataset="input_dataset", output_dataset="output_dataset"):
    input_dataset_name = get_input_names_for_role(input_dataset)[0]
    input_dataset = dataiku.Dataset(input_dataset_name)
    input_df = input_dataset.get_dataframe()

    output_dataset_name = get_output_names_for_role(output_dataset)[0]
    output_dataset = dataiku.Dataset(output_dataset_name)

    client = client or get_client(get_recipe_config())

    output_df = input_df.dropna().apply(lambda row: _safe_call(client, row, func), axis=1)
    output_dataset.write_with_schema(output_df)
def get_results_input_output() -> tuple:
    """Returns input and output datasets after sanity check

    :raises: :class:`ValueError`: Missing input or output dataset(s)

    :returns: input and output datasets
    :rtype: tuple
    """
    input_names = get_input_names_for_role("results")
    output_names = get_output_names_for_role('statistics')
    if len(input_names) == 0:
        raise ValueError("No input dataset.")
    if len(output_names) == 0:
        raise ValueError("No output dataset.")
    input_dataset = dataiku.Dataset(input_names[0])
    output_dataset = dataiku.Dataset(output_names[0])
    return input_dataset, output_dataset
def get_config():
    config = {}
    config['input_ds'] = dataiku.Dataset(get_input_names_for_role('input_ds')[0])
    config['output_ds'] = dataiku.Dataset(get_output_names_for_role('output_ds')[0])
    for param in ['address_column', 'cache_enabled', 'provider', 'api_key', 'here_app_id',
                  'here_app_code', 'google_client', 'google_client_secret']:
        config[param] = get_recipe_config().get(param, None)
    config['batch_enabled'] = get_recipe_config().get('batch_enabled', False) \
        and (config['provider'] == 'bing' or config['provider'] == 'mapquest' or config['provider'] == 'uscensus')
    config['batch_size'] = {
        'bing': get_recipe_config().get('batch_size_bing', 50),
        'mapquest': 100,
        'uscensus': get_recipe_config().get('batch_size_uscensus', 1000)
    }.get(config['provider'], 0)
    config['batch_timeout'] = {
        'bing': 10,
        'mapquest': 30,
        'uscensus': 1800
    }.get(config['provider'], 0)
    if get_plugin_config().get('cache_location', 'original') == 'original':
        config['cache_location'] = os.environ["DIP_HOME"] + '/caches/plugins/geocoder/forward'
    else:
        config['cache_location'] = get_plugin_config().get('cache_location_custom', '')
    config['cache_size'] = get_plugin_config().get('forward_cache_size', 1000) * 1000
    config['cache_eviction'] = get_plugin_config().get('forward_cache_policy', 'least-recently-stored')
    prefix = get_recipe_config().get('column_prefix', '')
    for column_name in ['latitude', 'longitude']:
        config[column_name] = prefix + column_name
    if config['provider'] is None:
        raise AttributeError('Please select a geocoding provider.')
    return config
def load_training_config(recipe_config):
    """Utility function to load, resolve and validate all training recipe config into a clean `params` dictionary

    Returns:
        Dictionary of parameter names (key) and values
    """
    params = {}

    input_dataset_name = get_input_names_for_role("input_dataset")[0]
    params["training_dataset"] = dataiku.Dataset(input_dataset_name)
    training_dataset_columns = [p["name"] for p in params["training_dataset"].read_schema()]

    model_folder_name = get_output_names_for_role("model_folder")[0]
    params["model_folder"] = dataiku.Folder(model_folder_name)
    params["partition_root"] = get_folder_partition_root(params["model_folder"])
    check_only_one_read_partition(params["partition_root"], params["training_dataset"])

    evaluation_dataset_name = get_output_names_for_role("evaluation_dataset")[0]
    params["evaluation_dataset"] = dataiku.Dataset(evaluation_dataset_name)

    params["make_forecasts"] = False
    evaluation_forecasts_dataset_names = get_output_names_for_role("evaluation_forecasts_dataset")
    if len(evaluation_forecasts_dataset_names) > 0:
        params["evaluation_forecasts_dataset"] = dataiku.Dataset(evaluation_forecasts_dataset_names[0])
        params["make_forecasts"] = True

    params["time_column_name"] = recipe_config.get("time_column")
    if params["time_column_name"] is None:
        raise PluginParamValidationError("Time column is mandatory, please select one")
    elif params["time_column_name"] not in training_dataset_columns:
        raise PluginParamValidationError(f"Invalid time column selection: {params['time_column_name']}")

    params["target_columns_names"] = sanitize_column_list(recipe_config.get("target_columns"))
    if len(params["target_columns_names"]) == 0 or not all(
        column in training_dataset_columns for column in params["target_columns_names"]
    ):
        raise PluginParamValidationError(f"Invalid target column(s) selection: {params['target_columns_names']}")
    params["target_columns_names"] = reorder_column_list(params["target_columns_names"], training_dataset_columns)

    long_format = recipe_config.get("additional_columns", False)
    if long_format:
        params["timeseries_identifiers_names"] = sanitize_column_list(recipe_config.get("timeseries_identifiers", []))
        if not all(column in training_dataset_columns for column in params["timeseries_identifiers_names"]):
            raise PluginParamValidationError(
                f"Invalid time series identifiers selection: {params['timeseries_identifiers_names']}"
            )
    else:
        params["timeseries_identifiers_names"] = []

    params["is_training_multivariate"] = (
        len(params["target_columns_names"]) > 1 or len(params["timeseries_identifiers_names"]) > 0
    )

    if long_format and len(params["timeseries_identifiers_names"]) == 0:
        raise PluginParamValidationError("Long format is activated but no time series identifiers have been provided")

    external_feature_activated = recipe_config.get("external_feature_activated", False)
    if external_feature_activated:
        params["external_features_columns_names"] = sanitize_column_list(recipe_config.get("external_feature_columns", []))
    else:
        params["external_features_columns_names"] = []
    if not all(column in training_dataset_columns for column in params["external_features_columns_names"]):
        raise PluginParamValidationError(
            f"Invalid external features selection: {params['external_features_columns_names']}"
        )

    params["frequency_unit"] = recipe_config.get("frequency_unit")
    if params["frequency_unit"] == "W":
        params["frequency"] = f"W-{recipe_config.get('frequency_end_of_week', 1)}"
    elif params["frequency_unit"] == "H":
        params["frequency"] = f"{recipe_config.get('frequency_step_hours', 1)}H"
    elif params["frequency_unit"] == "min":
        params["frequency"] = f"{recipe_config.get('frequency_step_minutes', 1)}min"
    else:
        params["frequency"] = params["frequency_unit"]

    params["prediction_length"] = recipe_config.get("prediction_length")
    if not params["prediction_length"]:
        raise PluginParamValidationError("Please specify forecasting horizon")

    params["season_length"] = recipe_config.get(f"season_length_{params['frequency_unit']}", 1)
    if params["season_length"] < 1:
        raise PluginParamValidationError("Seasonality must be at least 1")

    params["use_gpu"] = recipe_config.get("use_gpu", False)
    if params["use_gpu"]:
        params["gpu_location"] = recipe_config.get("gpu_location", "local_gpu")
        if params["gpu_location"] == "local_gpu":
            gpu_devices = recipe_config.get("gpu_devices", [])
            params["gpu_devices"] = parse_gpu_devices(gpu_devices)
        else:
            params["gpu_devices"] = [GPU_CONFIGURATION.CONTAINER_GPU]
    else:
        params["gpu_devices"] = None

    params["forecasting_style"] = recipe_config.get("forecasting_style", "auto")
    params["epoch"] = recipe_config.get("epoch", 10)
    params["batch_size"] = recipe_config.get("batch_size", 32)

    params["auto_num_batches_per_epoch"] = recipe_config.get("auto_num_batches_per_epoch", True)
    if params["auto_num_batches_per_epoch"]:
        params["num_batches_per_epoch"] = -1
    else:
        params["num_batches_per_epoch"] = recipe_config.get("num_batches_per_epoch", 50)
    if params["num_batches_per_epoch"] == 0:
        raise PluginParamValidationError("Number of batches per epoch cannot be 0")

    # Overwrite values in case of autoML mode selected
    params = automl_params_overwrite(params)

    params["sampling_method"] = recipe_config.get("sampling_method", "last_records")
    params["max_timeseries_length"] = None
    if params["sampling_method"] == "last_records":
        params["max_timeseries_length"] = recipe_config.get("number_records", 10000)
        if params["max_timeseries_length"] < 4:
            raise PluginParamValidationError("Number of records must be at least 4")

    params["evaluation_strategy"] = "split"
    params["evaluation_only"] = False

    printable_params = {
        param: value for param, value in params.items() if "dataset" not in param and "folder" not in param
    }
    logger.info(f"Recipe parameters: {printable_params}")
    return params
# -*- coding: utf-8 -*-
import dataiku
from dataiku.customrecipe import get_input_names_for_role, get_recipe_config, get_output_names_for_role
from jira_client import JiraClient
from utils import de_float_column
import pandas as pd

input_datasets_name = get_input_names_for_role('input_datasets_name')
config = get_recipe_config()
id_column_name = config.get('id_column_name')

id_list_df = dataiku.Dataset(input_datasets_name[0]).get_dataframe()
id_list_df_types = id_list_df.dtypes
de_float_column(id_list_df, id_column_name)

queue_id_column_name = config.get('queue_id_column_name', None)
de_float_column(id_list_df, queue_id_column_name)

access_type = get_recipe_config()['access_type']
connection_details = get_recipe_config()[access_type]
endpoint_name = get_recipe_config()['endpoint_name']
expand = get_recipe_config()['expand']

client = JiraClient(connection_details)
client.start_session(endpoint_name)

results = []
for index in id_list_df.index:
    jira_id = id_list_df[id_column_name][index]
    indexes_columns = {"jira_id": jira_id}
    if queue_id_column_name is not None:
"name": "salesforce_record_id", "type": "string" }, { "name": "data", "type": "object" }]) # Read configuration config = get_recipe_config() object_name = config.get('object_name', None) if object_name is None: raise Exception("Object name has to be set") client = SalesforceClient(config) incoming_dataset_name = get_input_names_for_role('incoming_dataset_name') incoming_dataset = dataiku.Dataset(incoming_dataset_name[0]) incoming_dataset_df = incoming_dataset.get_dataframe() writer = output.get_writer() json_dataset = json.loads( incoming_dataset_df.to_json(orient="records") ) # turning row into json would get None int to be replaced by NaN for salesforce_record in json_dataset: salesforce_record_id = salesforce_record.pop("Id", None) if salesforce_record_id is None: response = client.create_record(object_name, salesforce_record) writer.write_row_dict({ "operation": "Added", "error": response.get("error", None), "salesforce_record_id": response.get("id", None), "data": json.dumps(salesforce_record)
import dataiku
from dataiku.customrecipe import get_input_names_for_role, get_output_names_for_role, get_recipe_config
import pandas as pd
from dku_idtb_decision_tree.tree import Tree
from dku_idtb_scoring.score import score, write_with_schema
from dku_idtb_compatibility.utils import safe_str
from dataiku.doctor.prediction.reg_evaluation_recipe import compute_multiclass_metrics, compute_binary_classification_metrics

input_dataset = dataiku.Dataset(get_input_names_for_role("input_dataset")[0])
scored_dataset = dataiku.Dataset(get_output_names_for_role("scored_dataset")[0])
metrics_dataset = dataiku.Dataset(get_output_names_for_role("metrics_dataset")[0])
folder = dataiku.Folder(get_input_names_for_role("folder")[0])
chunk_size_param = get_recipe_config()["chunk_size"]

try:
    tree = folder.read_json(get_recipe_config()["tree_file"])
except ValueError:
    raise Exception("No tree file named " + get_recipe_config()["tree_file"])

tree["df"] = input_dataset.get_dataframe()
tree = Tree(**tree)

scored_df = score(tree, input_dataset, chunk_size_param, True)
target_mapping = {
    safe_str(label): index for index, label in enumerate(tree.target_values)
}
scored_df_nona = scored_df.dropna(subset=["prediction"])
y_actual, y_pred = scored_df_nona[tree.target], scored_df_nona.prediction
def get_input_dataset(role):
    names = get_input_names_for_role(role)
    return dataiku.Dataset(names[0]) if len(names) > 0 else None
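# Illustrative usage of the optional-role helper above (it would only run inside a DSS
# recipe); the role names "input_dataset" and "reference_dataset" are assumptions.
main_dataset = get_input_dataset("input_dataset")
reference_dataset = get_input_dataset("reference_dataset")  # None if the optional role is left empty
if reference_dataset is not None:
    reference_df = reference_dataset.get_dataframe()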
# Params for parallelization
column_prefix = "translation_api"
parallel_workers = api_configuration_preset.get("parallel_workers")
error_handling = ErrorHandlingEnum[get_recipe_config().get("error_handling")]

# Params for translation
client = get_client(api_configuration_preset.get("gcp_service_account_key"))
api_quota_rate_limit = api_configuration_preset.get("api_quota_rate_limit")
api_quota_period = api_configuration_preset.get("api_quota_period")

# ==============================================================================
# DEFINITIONS
# ==============================================================================

input_dataset = dataiku.Dataset(get_input_names_for_role("input_dataset")[0])
output_dataset = dataiku.Dataset(get_output_names_for_role("output_dataset")[0])

validate_column_input(text_column, [col["name"] for col in input_dataset.read_schema()])
input_df = input_dataset.get_dataframe()


@retry((RateLimitException, OSError), delay=api_quota_period, tries=5)
@limits(calls=api_quota_rate_limit, period=api_quota_period)
def call_translation_api(row: Dict, text_column: AnyStr, target_language: AnyStr, source_language: AnyStr = None) -> AnyStr:
    text = row[text_column]
    if not isinstance(text, str) or str(text).strip() == "":
def load_input_output_params(recipe_id: RecipeID) -> Dict:
    """Load and validate input/output parameters for both indexing and search recipes

    Returns:
        Dictionary of parameter names (key) and values

    Raises:
        PluginParamValidationError: If a parameter is not valid
    """
    params = {}

    # Index folder
    if recipe_id == RecipeID.SIMILARITY_SEARCH_INDEX:
        output_folder_names = get_output_names_for_role("index_folder")
        if len(output_folder_names) == 0:
            raise PluginParamValidationError("Please specify index folder as output")
        params["index_folder"] = dataiku.Folder(output_folder_names[0])
        params["folder_partition_root"] = get_folder_partition_root(params["index_folder"])
    elif recipe_id == RecipeID.SIMILARITY_SEARCH_QUERY:
        input_folder_names = get_input_names_for_role("index_folder")
        if len(input_folder_names) == 0:
            raise PluginParamValidationError("Please specify index folder as input")
        params["index_folder"] = dataiku.Folder(input_folder_names[0])
        params["folder_partition_root"] = get_folder_partition_root(params["index_folder"], is_input=True)
        check_only_one_read_partition(params["folder_partition_root"], params["index_folder"])

    # Input dataset
    input_dataset_names = get_input_names_for_role("input_dataset")
    if len(input_dataset_names) == 0:
        raise PluginParamValidationError("Please specify input dataset")
    params["input_dataset"] = dataiku.Dataset(input_dataset_names[0])
    input_dataset_columns = [p["name"] for p in params["input_dataset"].read_schema()]
    check_only_one_read_partition(params["folder_partition_root"], params["input_dataset"])
    if recipe_id == RecipeID.SIMILARITY_SEARCH_QUERY:
        if params["index_folder"].read_partitions != params["input_dataset"].read_partitions:
            raise PluginParamValidationError(
                "Inconsistent partitions between index folder and input dataset, please make sure both are partitioned with the same dimensions"
            )

    # Output dataset - only for search recipe
    if recipe_id == RecipeID.SIMILARITY_SEARCH_QUERY:
        output_dataset_names = get_output_names_for_role("output_dataset")
        if len(output_dataset_names) == 0:
            raise PluginParamValidationError("Please specify output dataset")
        params["output_dataset"] = dataiku.Dataset(output_dataset_names[0])

    # Recipe input parameters
    recipe_config = get_recipe_config()
    params["unique_id_column"] = recipe_config.get("unique_id_column")
    if params["unique_id_column"] not in input_dataset_columns:
        raise PluginParamValidationError(f"Invalid unique ID column: {params['unique_id_column']}")
    params["feature_columns"] = recipe_config.get("feature_columns", [])
    if not set(params["feature_columns"]).issubset(set(input_dataset_columns)):
        raise PluginParamValidationError(f"Invalid feature column(s): {params['feature_columns']}")

    printable_params = {
        k: v for k, v in params.items() if k not in {"input_dataset", "index_folder", "output_dataset"}
    }
    logging.info(f"Validated input/output parameters: {printable_params}")
    return params
def get_partitioning_keys(id_list, dku_flow_variables):
    partitioning_keys = {}
    partitioning = id_list.get_config().get("partitioning")
    if partitioning:
        dimensions_types = partitioning.get("dimensions", [])
        dimensions = []
        for dimension_type in dimensions_types:
            dimensions.append(dimension_type.get("name"))
        for dimension in dimensions:
            dimension_src = "DKU_DST_{}".format(dimension)
            if dimension_src in dku_flow_variables:
                partitioning_keys[dimension] = dku_flow_variables.get(dimension_src)
    return partitioning_keys


input_A_names = get_input_names_for_role('input_A_role')
config = get_recipe_config()
dku_flow_variables = dataiku.get_flow_variables()

logger.info("config={}".format(logger.filter_secrets(config)))

credential_parameters = config.get("credential", {})
endpoint_parameters = get_endpoint_parameters(config)
extraction_key = endpoint_parameters.get("extraction_key", "")
is_raw_output = endpoint_parameters.get("raw_output", True)
parameter_columns = [column for column in config.get("parameter_columns", []) if column]
if len(parameter_columns) == 0:
    raise ValueError("There is no parameter column selected.")
parameter_renamings = get_dku_key_values(config.get("parameter_renamings", {}))
custom_key_values = get_dku_key_values(config.get("custom_key_values", {}))
input_parameters_dataset = dataiku.Dataset(input_A_names[0])
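# Pure-Python illustration of the mapping done by get_partitioning_keys above: each
# partitioning dimension "foo" is looked up as the flow variable "DKU_DST_foo".
# The dimension names and values below are made up for the example.
dimensions = ["country", "date"]
dku_flow_variables_example = {"DKU_DST_country": "FR", "DKU_DST_date": "2020-01-01"}
partitioning_keys_example = {
    dimension: dku_flow_variables_example["DKU_DST_{}".format(dimension)]
    for dimension in dimensions
    if "DKU_DST_{}".format(dimension) in dku_flow_variables_example
}
# -> {'country': 'FR', 'date': '2020-01-01'}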
def load_plugin_config_wordcloud() -> Dict:
    """Utility function to validate and load wordcloud parameters into a clean dictionary

    Returns:
        Dictionary of parameter names (key) and values
    """
    params = {}

    # Input dataset
    input_dataset_names = get_input_names_for_role("input_dataset")
    if len(input_dataset_names) != 1:
        raise PluginParamValidationError("Please specify one input dataset")
    input_dataset = dataiku.Dataset(input_dataset_names[0])
    input_dataset_columns = [p["name"] for p in input_dataset.read_schema()]

    # Output folder
    output_folder_names = get_output_names_for_role("output_folder")
    if len(output_folder_names) != 1:
        raise PluginParamValidationError("Please specify one output folder")
    params["output_folder"] = dataiku.Folder(output_folder_names[0])

    # Partition handling
    params["output_partition_path"] = get_folder_partition_root(params["output_folder"])

    # Recipe parameters
    recipe_config = get_recipe_config()

    # Text column
    params["text_column"] = recipe_config.get("text_column")
    if params["text_column"] not in input_dataset_columns:
        raise PluginParamValidationError(f"Invalid text column selection: {params['text_column']}")
    logging.info(f"Text column: {params['text_column']}")

    # Language selection
    params["language"] = recipe_config.get("language")
    if params["language"] == "language_column":
        params["language_column"] = recipe_config.get("language_column")
        if params["language_column"] not in input_dataset_columns:
            raise PluginParamValidationError(f"Invalid language column selection: {params['language_column']}")
        logging.info(f"Language column: {params['language_column']}")
    else:
        if not params["language"]:
            raise PluginParamValidationError("Empty language selection")
        if params["language"] not in SUPPORTED_LANGUAGES_SPACY:
            raise PluginParamValidationError(f"Unsupported language code: {params['language']}")
        params["language_column"] = None
        logging.info(f"Language: {params['language']}")

    # Subcharts
    params["subchart_column"] = recipe_config.get("subchart_column")
    # If parameter is saved then cleared, config retrieves ""
    params["subchart_column"] = None if not params["subchart_column"] else params["subchart_column"]
    if params["subchart_column"] and (params["subchart_column"] not in input_dataset_columns + ["order66"]):
        raise PluginParamValidationError(f"Invalid categorical column selection: {params['subchart_column']}")
    logging.info(f"Subcharts column: {params['subchart_column']}")

    # Input dataframe
    necessary_columns = [
        column
        for column in set([params["text_column"], params["language_column"], params["subchart_column"]])
        if (column not in [None, "order66"])
    ]
    params["df"] = input_dataset.get_dataframe(columns=necessary_columns)
    if params["df"].empty:
        raise PluginParamValidationError("Dataframe is empty")
    # Check if unsupported languages in multilingual case
    elif params["language_column"]:
        languages = set(params["df"][params["language_column"]].unique())
        unsupported_lang = languages - SUPPORTED_LANGUAGES_SPACY.keys()
        if unsupported_lang:
            raise PluginParamValidationError(
                f"Found {len(unsupported_lang)} unsupported languages: {', '.join(sorted(unsupported_lang))}"
            )
    logging.info(f"Read dataset of shape: {params['df'].shape}")

    return params
    write_to_dataset(file_info)
elif upload_details.json()["processing_status"] == "success":
    response = requests.get(
        endpoint + "/documents/" + file_id + "/ocr?include_raw_types=false",
        headers=headers,
    )
    ocr_response = response.json()
    create_dataframe(ocr_response, file_name)
    logger.info("Extracted OCR from document {}".format(file_name))
else:
    logger.error("Error extracting OCR from document {}".format(file_name))


input_folder = get_input_names_for_role("ocr_file_upload")
output_dataset = get_output_names_for_role("ocr_data")
input_handle = dataiku.Folder(input_folder[0])
findataset = dataiku.Dataset(output_dataset[0])

cred = get_recipe_config()["credentials"]
usr = cred["login_credentials"]["user"]
pwd = cred["login_credentials"]["password"]
endpoint = "https://api.natif.ai"

ocr_dataframe = pd.DataFrame()
allowed_filetypes = ["jpg", "jpeg", "tif", "tiff", "png", "pdf", "gif"]

# Global lists that contain the dataset data:
# width, height, text, file_name, entropy, x1_pos, x2_pos, y1_pos, y2_pos, page_count, box_id
width = list()
height = list()
text = list()
file_name = list()
entropy = list()
# -*- coding: utf-8 -*-
import datetime

import dataiku
from dataiku.customrecipe import get_input_names_for_role, get_output_names_for_role, get_recipe_config
from googlesheets import get_spreadsheet
from gspread.utils import rowcol_to_a1

# Input
input_name = get_input_names_for_role('input_role')[0]
input_dataset = dataiku.Dataset(input_name)
input_schema = input_dataset.read_schema()

# Output
output_name = get_output_names_for_role('output_role')[0]
output_dataset = dataiku.Dataset(output_name)
output_dataset.write_schema(input_schema)

# Get configuration
config = get_recipe_config()
credentials = config.get("credentials")
doc_id = config.get("doc_id")
tab_id = config.get("tab_id")
insert_format = config.get("insert_format")

# Load worksheet
ws = get_spreadsheet(credentials, doc_id, tab_id)


# Make available a method of a later version of gspread (probably 3.4.0)
# from https://github.com/burnash/gspread/pull/556
def append_rows(self, values, value_input_option='RAW'):
def load_config_and_data_wordcloud() -> Tuple[PluginParams, pd.DataFrame]:
    """Utility function to:
        - Validate and load wordcloud parameters into a clean class
        - Validate input data, keep only necessary columns and drop invalid rows

    Returns:
        - Class instance with parameter names as attributes and associated values
        - Pandas DataFrame with necessary input data
    """
    params = PluginParams()

    # Input dataset
    input_dataset_names = get_input_names_for_role("input_dataset")
    if len(input_dataset_names) != 1:
        raise PluginParamValidationError("Please specify one input dataset")
    input_dataset = dataiku.Dataset(input_dataset_names[0])
    input_dataset_columns = [p["name"] for p in input_dataset.read_schema()]

    # Output folder
    output_folder_names = get_output_names_for_role("output_folder")
    if len(output_folder_names) != 1:
        raise PluginParamValidationError("Please specify one output folder")
    params.output_folder = dataiku.Folder(output_folder_names[0])

    # Partition handling
    params.output_partition_path = get_folder_partition_root(params.output_folder)

    # Recipe parameters
    recipe_config = get_recipe_config()

    # Text column
    if recipe_config.get("text_column") not in input_dataset_columns:
        raise PluginParamValidationError(f"Invalid text column selection: {recipe_config.get('text_column')}")
    params.text_column = recipe_config.get("text_column")
    logging.info(f"Text column: {params.text_column}")

    # Language selection
    if recipe_config.get("language") == "language_column":
        if recipe_config.get("language_column") not in input_dataset_columns:
            raise PluginParamValidationError(
                f"Invalid language column selection: {recipe_config.get('language_column')}"
            )
        params.language = recipe_config.get("language")
        params.language_column = recipe_config.get("language_column")
        logging.info(f"Language column: {params.language_column}")
    else:
        if not recipe_config.get("language"):
            raise PluginParamValidationError("Empty language selection")
        if recipe_config.get("language") not in SUPPORTED_LANGUAGES_SPACY:
            raise PluginParamValidationError(f"Unsupported language code: {recipe_config.get('language')}")
        params.language = recipe_config.get("language")
        params.language_column = None
        logging.info(f"Language: {params.language}")

    # Subcharts
    subchart_column = recipe_config.get("subchart_column")
    # If parameter is saved then cleared, config retrieves ""
    subchart_column = None if not subchart_column else subchart_column
    if subchart_column and (subchart_column not in input_dataset_columns + ["order66"]):
        raise PluginParamValidationError(f"Invalid categorical column selection: {subchart_column}")
    params.subchart_column = subchart_column
    logging.info(f"Subcharts column: {params.subchart_column}")

    # Input dataframe
    necessary_columns = [
        column
        for column in set([params.text_column, params.language_column, params.subchart_column])
        if (column not in [None, "order66"])
    ]
    df = input_dataset.get_dataframe(columns=necessary_columns).dropna(subset=necessary_columns)
    if df.empty:
        raise PluginParamValidationError("Dataframe is empty")
    # Check if unsupported languages in multilingual case
    elif params.language_column:
        languages = set(df[params.language_column].unique())
        unsupported_lang = languages - SUPPORTED_LANGUAGES_SPACY.keys()
        if unsupported_lang:
            raise PluginParamValidationError(
                f"Found {len(unsupported_lang)} unsupported languages: {', '.join(sorted(unsupported_lang))}"
            )
    logging.info(f"Read dataset of shape: {df.shape}")

    # Text simplification parameters
    params.remove_stopwords = recipe_config.get("remove_stopwords")
    params.stopwords_folder_path = (
        os.path.join(get_recipe_resource(), "stopwords") if params.remove_stopwords else None
    )
    params.font_folder_path = os.path.join(get_recipe_resource(), "fonts")
    params.remove_punctuation = recipe_config.get("remove_punctuation")
    params.case_insensitive = recipe_config.get("case_insensitive")
    logging.info(f"Remove stopwords: {params.remove_stopwords}")
    logging.info(f"Stopwords folder path: {params.stopwords_folder_path}")
    logging.info(f"Fonts folder path: {params.font_folder_path}")
    logging.info(f"Remove punctuation: {params.remove_punctuation}")
    logging.info(f"Case-insensitive: {params.case_insensitive}")

    # Display parameters
    max_words = recipe_config.get("max_words")
    if (not max_words) or not ((isinstance(max_words, int)) and (max_words >= 1)):
        raise PluginParamValidationError("Maximum number of words is not a positive integer")
    params.max_words = max_words
    logging.info(f"Max number of words: {params.max_words}")

    color_palette = recipe_config.get("color_palette")
    if not color_palette:
        raise PluginParamValidationError("Empty color palette selection")
    if color_palette == "custom":
        color_list = recipe_config.get("color_list")
        if not (isinstance(color_list, list) and (len(color_list) >= 1)):
            raise PluginParamValidationError("Empty custom palette")
        if not all([matplotlib.colors.is_color_like(color) for color in color_list]):
            raise PluginParamValidationError(f"Invalid custom palette: {color_list}")
        params.color_list = [matplotlib.colors.to_hex(color) for color in color_list]
        logging.info(f"Custom palette: {params.color_list}")
    else:
        if color_palette not in {builtin_palette["id"] for builtin_palette in DSS_BUILTIN_COLOR_PALETTES}:
            raise PluginParamValidationError(f"Unsupported color palette: {color_palette}")
        selected_palette_dict = [
            builtin_palette for builtin_palette in DSS_BUILTIN_COLOR_PALETTES if builtin_palette["id"] == color_palette
        ][0]
        params.color_list = selected_palette_dict["colors"]
        logging.info(
            f"Using built-in DSS palette: '{selected_palette_dict['name']}' with colors: {params.color_list}"
        )

    return params, df
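# Standalone illustration of the custom-palette validation used above: matplotlib both
# validates and normalises colour specifications to hex strings. The colour values are
# made up for the example.
import matplotlib.colors

example_color_list = ["#1f77b4", "forestgreen", (1.0, 0.5, 0.0)]
assert all(matplotlib.colors.is_color_like(color) for color in example_color_list)
print([matplotlib.colors.to_hex(color) for color in example_color_list])
# -> ['#1f77b4', '#228b22', '#ff8000']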
# See below for why using NLTK_DATA is not possible
# https://stackoverflow.com/questions/44857382/change-nltk-download-path-directory-from-default-ntlk-data/47082481#47082481
cache_folder = os.getenv("NLTK_HOME")
nltk.data.path.append(cache_folder)

##################################
# Find python version
##################################

PY2 = sys.version_info[0] == 2

##################################
# Input data
##################################

input_dataset = get_input_names_for_role('input_dataset')[0]
df = dataiku.Dataset(input_dataset).get_dataframe()

##################################
# Parameters
##################################

recipe_config = get_recipe_config()

text_column_name = recipe_config.get('text_column_name', None)
if text_column_name is None:
    raise ValueError("You did not choose a text column.")

n_sentences = recipe_config.get('n_sentences', None)
if n_sentences is None:
    raise ValueError("You did not set a number of sentences.")
import dataiku
import meaningcloud
import pandas as pd
from dataiku.customrecipe import (
    get_input_names_for_role,
    get_output_names_for_role,
    get_recipe_config,
    get_plugin_config,
)

from meaningcloud_common import setRequestSource, isBlockingErrorType

# ==============================================================================
# PLUGIN + RECIPE SETTINGS
# ==============================================================================

input_name = get_input_names_for_role("input_dataset")[0]
output_name = get_output_names_for_role("output_dataset")[0]
input_dataset = dataiku.Dataset(input_name)
output_dataset = dataiku.Dataset(output_name)

meaningcloud_connection = get_plugin_config().get("meaningcloud_connection")
license_key = meaningcloud_connection.get("license_key", None)
server = meaningcloud_connection.get("meaningcloud_server", "https://api.meaningcloud.com")
sentences = int(get_recipe_config().get("sentences", 5))
text_column = get_recipe_config().get("column_name", None)

# ==============================================================================
# AUXILIARY FUNCTIONS