def validate_output_params(self) -> Dict:
    """Validate and collect the recipe's output roles into a dictionary.

    Always resolves the mandatory "output_dataset" role; for the document
    text-detection and cropping recipes it additionally resolves the
    mandatory "output_folder" role and records whether that folder lives
    on GCS (bucket and root path included when it does).

    Raises:
        PluginParamValidationError: if a mandatory output role is unbound
    """
    params = {}
    # Mandatory output dataset
    dataset_names = get_output_names_for_role("output_dataset")
    if not dataset_names:
        raise PluginParamValidationError("Please specify output dataset")
    params["output_dataset"] = dataiku.Dataset(dataset_names[0])
    # Output folder, mandatory only for these two recipes
    folder_names = get_output_names_for_role("output_folder")
    if self.recipe_id in (RecipeID.DOCUMENT_TEXT_DETECTION, RecipeID.CROPPING):
        if not folder_names:
            raise PluginParamValidationError("Please specify output folder")
        folder = dataiku.Folder(folder_names[0])
        params["output_folder"] = folder
        folder_type = folder.get_info().get("type", "")
        is_gcs = folder_type == "GCS"
        params["output_folder_is_gcs"] = is_gcs
        if is_gcs:
            # GCS folders expose their bucket and root path via accessInfo
            access_info = folder.get_info().get("accessInfo", {})
            params["output_folder_bucket"] = access_info.get("bucket")
            # drop the leading "/" from the root path
            params["output_folder_root_path"] = str(access_info.get("root", ""))[1:]
            logging.info("Output folder is stored on GCS")
        else:
            logging.info(f"Output folder is stored on {folder_type}")
    return params
def get_input_output(has_model_as_second_input=False):
    """Resolve the recipe's bound roles into dataset/model handles.

    Returns a 3-tuple: (new_dataset, model_or_original_dataset, output_dataset).
    The middle element is the "model" input when *has_model_as_second_input*
    is True, otherwise the "original" dataset.

    Raises ValueError when any required role is unbound.
    """
    new_names = get_input_names_for_role('new')
    if not new_names:
        raise ValueError('No new dataset.')
    output_names = get_output_names_for_role('output_dataset')
    if not output_names:
        raise ValueError('No output dataset.')
    new_dataset = dataiku.Dataset(new_names[0])
    output_dataset = dataiku.Dataset(output_names[0])
    if has_model_as_second_input:
        model_names = get_input_names_for_role('model')
        if not model_names:
            raise ValueError('No input model.')
        second = dataiku.Model(model_names[0])
    else:
        original_names = get_input_names_for_role('original')
        if not original_names:
            raise ValueError('No original dataset.')
        second = dataiku.Dataset(original_names[0])
    return (new_dataset, second, output_dataset)
def get_input_output():
    """Return the (input_dataset, output_dataset) pair bound to this recipe.

    Raises ValueError when either role is unbound.
    """
    input_names = get_input_names_for_role('input_dataset')
    if not input_names:
        raise ValueError('No input dataset.')
    output_names = get_output_names_for_role('output_dataset')
    if not output_names:
        raise ValueError('No output dataset.')
    return (dataiku.Dataset(input_names[0]), dataiku.Dataset(output_names[0]))
def validate_output_params(self) -> Dict:
    """Validate output parameters.

    Resolves the mandatory "output_dataset" role and the optional
    "output_folder" role (set to None when unbound).

    Returns:
        Dictionary with keys "output_dataset" and "output_folder"

    Raises:
        PluginParamValidationError: if the output dataset role is unbound
    """
    output_params_dict = {}
    # Mandatory output dataset
    output_dataset_names = get_output_names_for_role("output_dataset")
    if len(output_dataset_names) == 0:
        # Bug fix: the message previously said "output folder" although this
        # branch means the output *dataset* is missing.
        raise PluginParamValidationError("Please specify output dataset")
    output_params_dict["output_dataset"] = dataiku.Dataset(
        output_dataset_names[0])
    # Optional output folder
    output_folder_names = get_output_names_for_role("output_folder")
    output_params_dict["output_folder"] = None
    if len(output_folder_names) != 0:
        output_params_dict["output_folder"] = dataiku.Folder(
            output_folder_names[0])
    return output_params_dict
def get_design_input_output() -> tuple:
    """Resolve the design recipe's bound roles after a sanity check.

    :raises ValueError: if the "user_list" input or "groups" output role is unbound
    :returns: (input_dataset, folder_name_or_None, output_dataset)
    :rtype: tuple
    """
    user_list_names = get_input_names_for_role("user_list")
    if not user_list_names:
        raise ValueError("No input dataset.")
    group_names = get_output_names_for_role("groups")
    if not group_names:
        raise ValueError("No output dataset.")
    input_dataset = dataiku.Dataset(user_list_names[0])
    # The "folder" input role is optional: pass its name through, or None
    folder_refs = get_input_names_for_role('folder')
    folder_name = folder_refs[0] if folder_refs else None
    output_dataset = dataiku.Dataset(group_names[0])
    return input_dataset, folder_name, output_dataset
def _add_output_dataset(self):
    """Register the "tagged_documents" output dataset as a required config param."""
    tagged_documents_name = get_output_names_for_role("tagged_documents")[0]
    tagged_documents = Dataset(tagged_documents_name)
    self.dku_config.add_param(name="output_dataset", value=tagged_documents, required=True)
def get_config():
    """Build the reverse-geocoding recipe configuration dictionary.

    Reads the recipe and plugin configuration once, resolves input/output
    datasets, provider/batch settings, the features to extract and the
    cache location/size/eviction policy.

    Returns:
        dict of configuration values

    Raises:
        AttributeError: if no feature is selected or no provider is chosen
    """
    # Hoisted: the original re-called get_recipe_config()/get_plugin_config()
    # for every single parameter.
    recipe_config = get_recipe_config()
    plugin_config = get_plugin_config()
    config = {}
    config['input_ds'] = dataiku.Dataset(get_input_names_for_role('input_ds')[0])
    config['output_ds'] = dataiku.Dataset(get_output_names_for_role('output_ds')[0])
    for param in ['lat_column', 'lng_column', 'provider', 'cache_enabled', 'api_key',
                  'here_app_id', 'here_app_code', 'google_client', 'google_client_secret']:
        config[param] = recipe_config.get(param, None)
    # Batch mode is only supported by the Bing provider
    config['batch_enabled'] = recipe_config.get('batch_enabled', False) \
        and (config['provider'] == 'bing')
    config['batch_size'] = recipe_config.get('batch_size_bing', 50)
    # One output column per selected feature, prefixed as configured
    prefix = recipe_config.get('column_prefix', '')
    config['features'] = [
        {'name': feature, 'column': prefix + feature}
        for feature in ['address', 'city', 'postal', 'state', 'country']
        if recipe_config.get(feature, False)
    ]
    if plugin_config.get('cache_location', 'original') == 'original':
        config['cache_location'] = os.environ["DIP_HOME"] + '/caches/plugins/geocoder/reverse'
    else:
        config['cache_location'] = plugin_config.get('cache_location_custom', '')
    # Cache size setting is expressed in thousands of entries
    config['cache_size'] = plugin_config.get('reverse_cache_size', 1000) * 1000
    config['cache_eviction'] = plugin_config.get('reverse_cache_policy', 'least-recently-stored')
    if len(config['features']) == 0:
        raise AttributeError('Please select at least one feature to extract.')
    if config['provider'] is None:
        raise AttributeError('Please select a geocoding provider.')
    return config
def load_input_output(config):
    """Attach the recipe's input and output datasets to *config*.

    Args:
        config: object on which `input_dataset` / `output_dataset` are set

    Raises:
        ValueError: if either the input or the output role is unbound
    """
    input_names = get_input_names_for_role("input_dataset")
    if not input_names:
        raise ValueError("No input dataset.")
    config.input_dataset = Dataset(input_names[0])
    # Bug fix: the output role was indexed without a presence check, so a
    # missing output crashed with a bare IndexError instead of a clear error.
    output_names = get_output_names_for_role("output_dataset")
    if not output_names:
        raise ValueError("No output dataset.")
    config.output_dataset = Dataset(output_names[0])
def get_input_output():
    """Return the (input_dataset, output_folder) pair bound to this recipe.

    Raises:
        ValueError: if the input dataset or output folder role is unbound
    """
    if len(get_input_names_for_role("input_dataset")) == 0:
        raise ValueError("No input dataset.")
    input_dataset_name = get_input_names_for_role("input_dataset")[0]
    input_dataset = dataiku.Dataset(input_dataset_name)
    # Bug fix: the output folder role was indexed without a presence check,
    # so a missing folder crashed with a bare IndexError.
    output_folder_names = get_output_names_for_role("output_folder")
    if len(output_folder_names) == 0:
        raise ValueError("No output folder.")
    output_folder = dataiku.Folder(output_folder_names[0])
    return (input_dataset, output_folder)
def get_inputs(self):
    """Read recipe roles and configuration into attributes on self.

    Sets: folder, output_file_path, overwrite_output_model, batch_size
    (-1 means "no explicit batch size"), model, float_32.
    """
    # Hoisted: the original called get_recipe_config() five times.
    recipe_config = get_recipe_config()
    self.folder = Folder(get_output_names_for_role("folder_id")[0])
    self.output_file_path = recipe_config['output_model_path']
    self.overwrite_output_model = recipe_config['overwrite_output_model']
    self.batch_size = int(recipe_config['batch_size'])
    if not recipe_config['show_batch_size']:
        # Sentinel meaning "batch size not specified by the user"
        self.batch_size = -1
    self.model = Model(get_input_names_for_role("saved_model_id")[0])
    self.float_32 = recipe_config["float_32"]
def load_plugin_config_langdetect() -> Dict:
    """Utility function to validate and load language detection parameters into a clean dictionary

    Returns:
        Dictionary of parameter names (key) and values

    Raises:
        PluginParamValidationError: if a role is unbound or a parameter is invalid
    """
    params = {}
    # input dataset
    input_dataset_names = get_input_names_for_role("input_dataset")
    if len(input_dataset_names) == 0:
        raise PluginParamValidationError("Please specify input dataset")
    params["input_dataset"] = dataiku.Dataset(input_dataset_names[0])
    input_dataset_columns = [
        p["name"] for p in params["input_dataset"].read_schema()
    ]
    # output dataset
    output_dataset_names = get_output_names_for_role("output_dataset")
    if len(output_dataset_names) == 0:
        raise PluginParamValidationError("Please specify output dataset")
    params["output_dataset"] = dataiku.Dataset(output_dataset_names[0])
    # Recipe parameters
    recipe_config = get_recipe_config()
    # Text column: must exist in the input dataset schema
    params["text_column"] = recipe_config.get("text_column")
    if params["text_column"] not in input_dataset_columns:
        raise PluginParamValidationError(
            f"Invalid text column selection: {params['text_column']}")
    logging.info(f"Text column: {params['text_column']}")
    # Language scope: empty selection means "all supported languages";
    # the second emptiness check guards against the constant itself being empty
    params["language_scope"] = recipe_config.get("language_scope", [])
    if len(params["language_scope"]) == 0:
        params["language_scope"] = SUPPORTED_LANGUAGES_PYCLD3
    if len(params["language_scope"]) == 0:
        raise PluginParamValidationError(
            f"Invalid language scope: {params['language_scope']}")
    logging.info(
        f"Scope of {len(params['language_scope'])} languages: {params['language_scope']}"
    )
    # Minimum score: confidence threshold, must lie in [0, 1]
    params["minimum_score"] = float(recipe_config.get("minimum_score", 0))
    if params["minimum_score"] < 0 or params["minimum_score"] > 1:
        raise PluginParamValidationError(
            "Minimum score must be between 0 and 1")
    logging.info(f"Minimum score for detection: {params['minimum_score']:.2f}")
    # Fallback language: "" (empty) encodes "no fallback"; the UI may send
    # the literal string "None" for that choice
    params["fallback_language"] = recipe_config.get("fallback_language")
    if not params["fallback_language"] or params["fallback_language"] == "None":
        logging.info("No fallback language")
        params["fallback_language"] = ""
    else:
        logging.info(f"Fallback language: {params['fallback_language']}")
    return params
def get_inputs(self):
    """Read recipe roles and configuration into attributes on self.

    Sets: input_folder, output_folder, output_file_path, batch_size
    (-1 means "no explicit batch size"), overwrite_output_model,
    model_path, model_name (file stem of model_path), float_32.
    """
    # Hoisted: the original called get_recipe_config() six times.
    recipe_config = get_recipe_config()
    self.input_folder = Folder(
        get_input_names_for_role("input_folder_id")[0])
    output_folder_id = get_output_names_for_role("output_folder_id")[0]
    self.output_folder = Folder(output_folder_id)
    self.output_file_path = recipe_config['output_model_path']
    self.batch_size = int(recipe_config['batch_size'])
    if not recipe_config['show_batch_size']:
        # Sentinel meaning "batch size not specified by the user"
        self.batch_size = -1
    self.overwrite_output_model = recipe_config['overwrite_output_model']
    self.model_path = recipe_config['model_path']
    # File name without directory or extension
    self.model_name = os_splitext(os_split(self.model_path)[1])[0]
    self.float_32 = recipe_config["float_32"]
def main():
    """Ingest the Havas CSV log files from the input folder into the cost dataset."""
    # Folder holding the raw CSV files to ingest
    logs_folder = Folder(customrecipe.get_input_names_for_role("files_folder")[0])
    csv_files = get_files_to_process(logs_folder.get_path())
    # Target dataset wrapper: write every file, then close to flush the lines
    cost_dataset = DatasetWrapper(customrecipe.get_output_names_for_role("cost_data")[0])
    append_files_to_dataset(csv_files, cost_dataset)
    cost_dataset.close()
def apply_func(func, client=None, input_dataset="input_dataset", output_dataset="output_dataset"):
    """Apply *func* row-wise to the input dataset and write the result.

    *input_dataset* / *output_dataset* are role names; rows containing any
    NA value are dropped before applying. When *client* is None, one is
    built from the recipe configuration.
    """
    source = dataiku.Dataset(get_input_names_for_role(input_dataset)[0])
    source_df = source.get_dataframe()
    target = dataiku.Dataset(get_output_names_for_role(output_dataset)[0])
    api_client = client or get_client(get_recipe_config())
    result_df = source_df.dropna().apply(
        lambda row: _safe_call(api_client, row, func), axis=1)
    target.write_with_schema(result_df)
def load_predict_config():
    """Utility function to load, resolve and validate all predict recipe config into a clean `params` dictionary

    Returns:
        Dictionary of parameter names (key) and values

    Raises:
        PluginParamValidationError: if the output dataset role is unbound or
            the number of historical records is below 1
    """
    params = {}
    recipe_config = get_recipe_config()
    # model folder (mandatory input)
    model_folder = dataiku.Folder(get_input_names_for_role("model_folder")[0])
    params["model_folder"] = model_folder
    params["partition_root"] = get_folder_partition_root(params["model_folder"], is_input=True)
    # optional dataset of future external features
    params["external_features_future_dataset"] = None
    external_features_future_dataset_names = get_input_names_for_role("external_features_future_dataset")
    if len(external_features_future_dataset_names) > 0:
        params["external_features_future_dataset"] = dataiku.Dataset(external_features_future_dataset_names[0])
    # output dataset (mandatory)
    output_dataset_names = get_output_names_for_role("output_dataset")
    if len(output_dataset_names) == 0:
        raise PluginParamValidationError("Please specify Forecast dataset in the 'Input / Output' tab of the recipe")
    params["output_dataset"] = dataiku.Dataset(output_dataset_names[0])
    check_only_one_read_partition(params["partition_root"], params["model_folder"])
    check_only_one_read_partition(params["partition_root"], params["external_features_future_dataset"])
    # model selection: manual session/model vs. best by performance metric
    params["manual_selection"] = True if recipe_config.get("model_selection_mode") == "manual" else False
    params["performance_metric"] = recipe_config.get("performance_metric")
    params["selected_session"] = recipe_config.get("manually_selected_session", "latest_session")
    params["selected_model_label"] = recipe_config.get("manually_selected_model_label")
    params["prediction_length"] = recipe_config.get("prediction_length", -1)
    params["confidence_interval"] = recipe_config.get("confidence_interval", 95)
    params["quantiles"] = convert_confidence_interval_to_quantiles(params["confidence_interval"])
    params["include_history"] = recipe_config.get("include_history", False)
    # history sampling: "last_records" bounds how much history is used
    params["sampling_method"] = recipe_config.get("sampling_method", "last_records")
    params["history_length_limit"] = None
    if params["sampling_method"] == "last_records":
        params["history_length_limit"] = recipe_config.get("number_records", 1000)
        if params["history_length_limit"] < 1:
            raise PluginParamValidationError("Number of historical records must be higher than 1")
    # log everything except dataset/folder handles
    printable_params = {param: value for param, value in params.items() if "dataset" not in param and "folder" not in param}
    logger.info(f"Recipe parameters: {printable_params}")
    return params
def get_results_input_output() -> tuple:
    """Resolve the "results" input and "statistics" output datasets.

    :raises ValueError: if either role is unbound
    :returns: (input_dataset, output_dataset)
    :rtype: tuple
    """
    results_names = get_input_names_for_role("results")
    statistics_names = get_output_names_for_role('statistics')
    if not results_names:
        raise ValueError("No input dataset.")
    if not statistics_names:
        raise ValueError("No output dataset.")
    return dataiku.Dataset(results_names[0]), dataiku.Dataset(statistics_names[0])
def get_config():
    """Build the forward-geocoding recipe configuration dictionary.

    Reads the recipe and plugin configuration once, resolves input/output
    datasets, provider/batch settings, output column names and the cache
    location/size/eviction policy.

    Returns:
        dict of configuration values

    Raises:
        AttributeError: if no geocoding provider is chosen
    """
    # Hoisted: the original re-called get_recipe_config()/get_plugin_config()
    # for every single parameter.
    recipe_config = get_recipe_config()
    plugin_config = get_plugin_config()
    config = {}
    config['input_ds'] = dataiku.Dataset(get_input_names_for_role('input_ds')[0])
    config['output_ds'] = dataiku.Dataset(get_output_names_for_role('output_ds')[0])
    for param in ['address_column', 'cache_enabled', 'provider', 'api_key',
                  'here_app_id', 'here_app_code', 'google_client', 'google_client_secret']:
        config[param] = recipe_config.get(param, None)
    # Batch mode is only supported by these providers
    config['batch_enabled'] = recipe_config.get('batch_enabled', False) \
        and (config['provider'] == 'bing' or config['provider'] == 'mapquest' or config['provider'] == 'uscensus')
    # Per-provider batch sizes and timeouts (0 when the provider has no batch mode)
    config['batch_size'] = {
        'bing': recipe_config.get('batch_size_bing', 50),
        'mapquest': 100,
        'uscensus': recipe_config.get('batch_size_uscensus', 1000)
    }.get(config['provider'], 0)
    config['batch_timeout'] = {
        'bing': 10,
        'mapquest': 30,
        'uscensus': 1800
    }.get(config['provider'], 0)
    if plugin_config.get('cache_location', 'original') == 'original':
        config['cache_location'] = os.environ["DIP_HOME"] + '/caches/plugins/geocoder/forward'
    else:
        config['cache_location'] = plugin_config.get('cache_location_custom', '')
    # Cache size setting is expressed in thousands of entries
    config['cache_size'] = plugin_config.get('forward_cache_size', 1000) * 1000
    config['cache_eviction'] = plugin_config.get('forward_cache_policy', 'least-recently-stored')
    # Output column names, prefixed as configured
    prefix = recipe_config.get('column_prefix', '')
    for column_name in ['latitude', 'longitude']:
        config[column_name] = prefix + column_name
    if config['provider'] is None:
        raise AttributeError('Please select a geocoding provider.')
    return config
def get_output_dataset(role):
    """Return a Dataset for the first output bound to *role*, or None when unbound."""
    output_names = get_output_names_for_role(role)
    if not output_names:
        return None
    return dataiku.Dataset(output_names[0])
# -*- coding: utf-8 -*-
import dataiku
from dataiku.customrecipe import get_output_names_for_role, get_recipe_config
import oncrawl as oc
from oncrawl import oncrawlDataAPI as ocd
from oncrawl import oncrawlProjectAPI as ocp

# ---------------------------------------------------------------- outputs
output_names = get_output_names_for_role('output')
output_datasets = [dataiku.Dataset(name) for name in output_names]
output = output_datasets[0]

# ---------------------------------------------------------------- config & vars
config = get_recipe_config()

# Validate the configuration up front so the user gets a readable error
# instead of a deep API failure. Later checks overwrite earlier ones, so
# the last failing check determines the message (as before).
e = None
if 'api_key' not in config:
    e = 'Please add your API key'
projects = config.get('list_projects_id_name')
if 'list_projects_id_name' not in config or len(projects.keys()) == 0:
    e = 'Your Oncrawl account seems to have no projects available. Please check with your Oncrawl account.'
if ('list_configs_crawls' not in config
        or len(config['list_configs_crawls'].keys()) == 0
        or 'list_crawls_project' not in config
        or len(config['list_crawls_project'].keys()) == 0):
    e = 'Your Oncrawl account seems to have no crawls available. Please check the choosen project and date range with your Oncrawl account.'
if e is not None:
    raise Exception(e)
format='Warp10 recipe %(levelname)s - %(message)s') recipe_config = get_recipe_config() warp10_connection = recipe_config.get('warp10_connection', None) warpscript = recipe_config.get('code', None) if not warp10_connection: raise ValueError('No Warp10 connection defined') if not warpscript: raise ValueError('No WarpScript code entered') warp10_client = Warp10Client(warp10_connection) logger.info('Appending UPDATE function to end of WarpScript code') warpscript = warpscript + "\n'{}' UPDATE".format(warp10_connection['write_token']) result = warp10_client.exec_warpscript(warpscript) have_folder = get_output_names_for_role('main_output') if have_folder: # Semi-dummy output since there is really nothing to do at this point output_folder_name = get_output_names_for_role('main_output')[0] output_folder = dataiku.Folder(output_folder_name) filename = 'Run_{}.txt'.format(datetime.now().strftime('%Y-%m-%dT%H-%M-%S-%f')[:-3]) logger.info('Writing response file {} in output folder', filename) with open(os.path.join(output_folder.get_path(), filename), 'w') as results_file: results_file.write('Response of successful WarpScript execution:\n' + result)
def load_input_output_params(recipe_id: RecipeID) -> Dict:
    """Load and validate input/output parameters for both indexing and search recipes

    Returns:
        Dictionary of parameter names (key) and values

    Raises:
        PluginParamValidationError: If a parameter is not valid
    """
    params = {}
    # Index folder: an OUTPUT for the index-building recipe, an INPUT for
    # the query recipe
    if recipe_id == RecipeID.SIMILARITY_SEARCH_INDEX:
        output_folder_names = get_output_names_for_role("index_folder")
        if len(output_folder_names) == 0:
            raise PluginParamValidationError(
                "Please specify index folder as output")
        params["index_folder"] = dataiku.Folder(output_folder_names[0])
        params["folder_partition_root"] = get_folder_partition_root(
            params["index_folder"])
    elif recipe_id == RecipeID.SIMILARITY_SEARCH_QUERY:
        input_folder_names = get_input_names_for_role("index_folder")
        if len(input_folder_names) == 0:
            raise PluginParamValidationError(
                "Please specify index folder as input")
        params["index_folder"] = dataiku.Folder(input_folder_names[0])
        params["folder_partition_root"] = get_folder_partition_root(
            params["index_folder"], is_input=True)
        check_only_one_read_partition(params["folder_partition_root"],
                                      params["index_folder"])
    # Input dataset (mandatory for both recipes)
    input_dataset_names = get_input_names_for_role("input_dataset")
    if len(input_dataset_names) == 0:
        raise PluginParamValidationError("Please specify input dataset")
    params["input_dataset"] = dataiku.Dataset(input_dataset_names[0])
    input_dataset_columns = [
        p["name"] for p in params["input_dataset"].read_schema()
    ]
    check_only_one_read_partition(params["folder_partition_root"],
                                  params["input_dataset"])
    # Query recipe: the index folder and the input dataset must read the
    # same partitions
    if recipe_id == RecipeID.SIMILARITY_SEARCH_QUERY:
        if params["index_folder"].read_partitions != params[
                "input_dataset"].read_partitions:
            raise PluginParamValidationError(
                "Inconsistent partitions between index folder and input dataset, please make sure both are partitioned with the same dimensions"
            )
    # Output dataset - only for search recipe
    if recipe_id == RecipeID.SIMILARITY_SEARCH_QUERY:
        output_dataset_names = get_output_names_for_role("output_dataset")
        if len(output_dataset_names) == 0:
            raise PluginParamValidationError("Please specify output dataset")
        params["output_dataset"] = dataiku.Dataset(output_dataset_names[0])
    # Recipe input parameters
    recipe_config = get_recipe_config()
    # Unique ID column: must exist in the input dataset schema
    params["unique_id_column"] = recipe_config.get("unique_id_column")
    if params["unique_id_column"] not in input_dataset_columns:
        raise PluginParamValidationError(
            f"Invalid unique ID column: {params['unique_id_column']}")
    # Feature columns: all must exist in the input dataset schema
    params["feature_columns"] = recipe_config.get("feature_columns", [])
    if not set(params["feature_columns"]).issubset(set(input_dataset_columns)):
        raise PluginParamValidationError(
            f"Invalid feature column(s): {params['feature_columns']}")
    # Log everything except dataset/folder handles
    printable_params = {
        k: v
        for k, v in params.items()
        if k not in {"input_dataset", "index_folder", "output_dataset"}
    }
    logging.info(f"Validated input/output parameters: {printable_params}")
    return params
# Run the REST API recipe: build a session from the recipe config, apply it
# to every row of the input parameters dataset, and write the results.
# NOTE(review): `config`, `input_A_names` and `dku_flow_variables` are
# presumably defined earlier in this script — confirm against the full file.
credential_parameters = config.get("credential", {})
endpoint_parameters = get_endpoint_parameters(config)
extraction_key = endpoint_parameters.get("extraction_key", "")
is_raw_output = endpoint_parameters.get("raw_output", True)
# Keep only non-empty parameter column names
parameter_columns = [column for column in config.get("parameter_columns", []) if column]
if len(parameter_columns) == 0:
    raise ValueError("There is no parameter column selected.")
parameter_renamings = get_dku_key_values(config.get("parameter_renamings", {}))
custom_key_values = get_dku_key_values(config.get("custom_key_values", {}))
input_parameters_dataset = dataiku.Dataset(input_A_names[0])
# Partitioning keys of the input dataset are merged into the custom key/values
partitioning_keys = get_partitioning_keys(input_parameters_dataset, dku_flow_variables)
custom_key_values.update(partitioning_keys)
input_parameters_dataframe = input_parameters_dataset.get_dataframe()
recipe_session = RestApiRecipeSession(
    custom_key_values,
    credential_parameters,
    endpoint_parameters,
    extraction_key,
    parameter_columns,
    parameter_renamings
)
results = recipe_session.process_dataframe(input_parameters_dataframe, is_raw_output)
output_names_stats = get_output_names_for_role('api_output')
odf = pd.DataFrame(results)
# Only write when there is at least one result row/column
if odf.size > 0:
    api_output = dataiku.Dataset(output_names_stats[0])
    api_output.write_with_schema(odf)
from dataiku.customrecipe import get_input_names_for_role, get_output_names_for_role, get_recipe_config
import pandas as pd
from dku_idtb_decision_tree.tree import Tree
from dku_idtb_scoring.score import score, write_with_schema
from dku_idtb_compatibility.utils import safe_str
from dataiku.doctor.prediction.reg_evaluation_recipe import compute_multiclass_metrics, compute_binary_classification_metrics

input_dataset = dataiku.Dataset(get_input_names_for_role("input_dataset")[0])
scored_dataset = dataiku.Dataset(
    get_output_names_for_role("scored_dataset")[0])
metrics_dataset = dataiku.Dataset(
    get_output_names_for_role("metrics_dataset")[0])
folder = dataiku.Folder(get_input_names_for_role("folder")[0])

# Hoisted: the original re-read get_recipe_config() three times (once inside
# the exception handler), which is wasteful and could even diverge.
recipe_config = get_recipe_config()
chunk_size_param = recipe_config["chunk_size"]
tree_file = recipe_config["tree_file"]
try:
    tree = folder.read_json(tree_file)
except ValueError as err:
    # Chain the original parse error so the root cause stays in the traceback
    raise Exception("No tree file named " + tree_file) from err

# Rebuild the Tree object with the input data attached
tree["df"] = input_dataset.get_dataframe()
tree = Tree(**tree)
scored_df = score(tree, input_dataset, chunk_size_param, True)
# Map each target label (as string) to its index for metric computation
target_mapping = {
    safe_str(label): index
    for index, label in enumerate(tree.target_values)
}
# Rows with no prediction are excluded from the metrics
scored_df_nona = scored_df.dropna(subset=["prediction"])
y_actual, y_pred = scored_df_nona[tree.target], scored_df_nona.prediction
# -*- coding: utf-8 -*- import datetime import dataiku from dataiku.customrecipe import get_input_names_for_role, get_output_names_for_role, get_recipe_config from googlesheets import get_spreadsheet from gspread.utils import rowcol_to_a1 # Input input_name = get_input_names_for_role('input_role')[0] input_dataset = dataiku.Dataset(input_name) input_schema = input_dataset.read_schema() # Output output_name = get_output_names_for_role('output_role')[0] output_dataset = dataiku.Dataset(output_name) output_dataset.write_schema(input_schema) # Get configuration config = get_recipe_config() credentials = config.get("credentials") doc_id = config.get("doc_id") tab_id = config.get("tab_id") insert_format = config.get("insert_format") # Load worksheet ws = get_spreadsheet(credentials, doc_id, tab_id) # Make available a method of later version of gspread (probably 3.4.0) # from https://github.com/burnash/gspread/pull/556 def append_rows(self, values, value_input_option='RAW'):
jira_id = id_list_df[id_column_name][index] indexes_columns = {"jira_id": jira_id} if queue_id_column_name is not None: queue_id = id_list_df[queue_id_column_name][index] indexes_columns.update({"queue_id": queue_id}) else: queue_id = None data = client.get_endpoint(endpoint_name, jira_id, "", expand=expand, raise_exception=False, queue_id=queue_id) while len(data) > 0: for result in data: record = dict(indexes_columns) record.update(result) results.append(client.format(record)) if client.pagination.is_next_page(): data = client.get_next_page() else: break output_names_stats = get_output_names_for_role('jira_output') odf = pd.DataFrame(results) if odf.size > 0: jira_output = dataiku.Dataset(output_names_stats[0]) jira_output.write_with_schema(odf)
# -*- coding: utf-8 -*-
import dataiku
from dataiku.customrecipe import get_output_names_for_role, get_recipe_config, get_input_names_for_role
import json
from salesforce import SalesforceClient

# Output dataset: one row per attempted Salesforce operation
output_name = get_output_names_for_role('main')[0]
output = dataiku.Dataset(output_name)
output.write_schema([
    {"name": field_name, "type": field_type}
    for field_name, field_type in (
        ("operation", "string"),
        ("error", "string"),
        ("salesforce_record_id", "string"),
        ("data", "object"),
    )
])

# Read configuration: the target Salesforce object is mandatory
config = get_recipe_config()
object_name = config.get('object_name', None)
if object_name is None:
    raise Exception("Object name has to be set")

client = SalesforceClient(config)
from api_formatting import get_query
import dataiku
from dataiku.customrecipe import (get_recipe_config, get_output_names_for_role)

# ==============================================================================
# SETUP
# ==============================================================================

# The API preset carries the OAuth token; fail fast when it is missing
api_configuration_preset = get_recipe_config().get("api_configuration_preset")
if api_configuration_preset is None or api_configuration_preset == {}:
    raise ValueError("Please specify an API configuration preset")
access_token = api_configuration_preset.get("access_token")
HEADERS = {"authorization": "Bearer " + access_token}

# One output dataset per LinkedIn entity level
groups_name = get_output_names_for_role("campaign_group_dataset")[0]
groups_dataset = dataiku.Dataset(groups_name)

campaigns_names = get_output_names_for_role("campaign_dataset")[0]
campaigns_dataset = dataiku.Dataset(campaigns_names)

creatives_names = get_output_names_for_role("creative_dataset")[0]
creatives_dataset = dataiku.Dataset(creatives_names)

campaigns_analytics_names = get_output_names_for_role("campaign_analytics_dataset")[0]
campaign_analytics_dataset = dataiku.Dataset(campaigns_analytics_names)

creatives_analytics_names = get_output_names_for_role("creatives_analytics_dataset")[0]
creatives_analytics_dataset = dataiku.Dataset(creatives_analytics_names)
def load_training_config(recipe_config):
    """Utility function to load, resolve and validate all training recipe config into a clean `params` dictionary

    Returns:
        Dictionary of parameter names (key) and values

    Raises:
        PluginParamValidationError: if any parameter fails validation
    """
    params = {}
    # Input dataset and its schema (used to validate column selections)
    input_dataset_name = get_input_names_for_role("input_dataset")[0]
    params["training_dataset"] = dataiku.Dataset(input_dataset_name)
    training_dataset_columns = [
        p["name"] for p in params["training_dataset"].read_schema()
    ]
    # Mandatory model folder output
    model_folder_name = get_output_names_for_role("model_folder")[0]
    params["model_folder"] = dataiku.Folder(model_folder_name)
    params["partition_root"] = get_folder_partition_root(
        params["model_folder"])
    check_only_one_read_partition(params["partition_root"],
                                  params["training_dataset"])
    # Mandatory evaluation dataset output
    evaluation_dataset_name = get_output_names_for_role(
        "evaluation_dataset")[0]
    params["evaluation_dataset"] = dataiku.Dataset(evaluation_dataset_name)
    # Optional forecasts output enables forecasting after evaluation
    params["make_forecasts"] = False
    evaluation_forecasts_dataset_names = get_output_names_for_role(
        "evaluation_forecasts_dataset")
    if len(evaluation_forecasts_dataset_names) > 0:
        params["evaluation_forecasts_dataset"] = dataiku.Dataset(
            evaluation_forecasts_dataset_names[0])
        params["make_forecasts"] = True
    # Time column: mandatory and must exist in the dataset
    params["time_column_name"] = recipe_config.get("time_column")
    if params["time_column_name"] is None:
        raise PluginParamValidationError(
            "Time column is mandatory:, please select one")
    elif params["time_column_name"] not in training_dataset_columns:
        raise PluginParamValidationError(
            f"Invalid time column selection: {params['time_column_name']}")
    # Target columns: at least one, all present in the dataset, reordered to
    # match the dataset column order
    params["target_columns_names"] = sanitize_column_list(
        recipe_config.get("target_columns"))
    if len(params["target_columns_names"]) == 0 or not all(
            column in training_dataset_columns
            for column in params["target_columns_names"]):
        raise PluginParamValidationError(
            f"Invalid target column(s) selection: {params['target_columns_names']}"
        )
    params["target_columns_names"] = reorder_column_list(
        params["target_columns_names"], training_dataset_columns)
    # Long format: one row per (timeseries identifier, timestamp)
    long_format = recipe_config.get("additional_columns", False)
    if long_format:
        params["timeseries_identifiers_names"] = sanitize_column_list(
            recipe_config.get("timeseries_identifiers", []))
        if not all(column in training_dataset_columns
                   for column in params["timeseries_identifiers_names"]):
            raise PluginParamValidationError(
                f"Invalid time series identifiers selection: {params['timeseries_identifiers_names']}"
            )
    else:
        params["timeseries_identifiers_names"] = []
    params["is_training_multivariate"] = True if (
        len(params["target_columns_names"]) > 1) or (
            len(params["timeseries_identifiers_names"]) > 0) else False
    if long_format and len(params["timeseries_identifiers_names"]) == 0:
        raise PluginParamValidationError(
            "Long format is activated but no time series identifiers have been provided"
        )
    # Optional external (exogenous) feature columns
    external_feature_activated = recipe_config.get(
        "external_feature_activated", False)
    if external_feature_activated:
        params["external_features_columns_names"] = sanitize_column_list(
            recipe_config.get("external_feature_columns", []))
    else:
        params["external_features_columns_names"] = []
    if not all(column in training_dataset_columns
               for column in params["external_features_columns_names"]):
        raise PluginParamValidationError(
            f"Invalid external features selection: {params['external_features_columns_names']}"
        )
    # Frequency: weekly/hourly/minute units carry an extra step parameter
    params["frequency_unit"] = recipe_config.get("frequency_unit")
    if params["frequency_unit"] == "W":
        params[
            "frequency"] = f"W-{recipe_config.get('frequency_end_of_week', 1)}"
    elif params["frequency_unit"] == "H":
        params[
            "frequency"] = f"{recipe_config.get('frequency_step_hours', 1)}H"
    elif params["frequency_unit"] == "min":
        params[
            "frequency"] = f"{recipe_config.get('frequency_step_minutes', 1)}min"
    else:
        params["frequency"] = params["frequency_unit"]
    # Forecasting horizon (mandatory, non-zero)
    params["prediction_length"] = recipe_config.get("prediction_length")
    if not params["prediction_length"]:
        raise PluginParamValidationError("Please specify forecasting horizon")
    # Seasonality: the config key depends on the chosen frequency unit
    params["season_length"] = recipe_config.get(
        f"season_length_{params['frequency_unit']}", 1)
    if params["season_length"] < 1:
        raise PluginParamValidationError("Seasonality must be higher than 1")
    # GPU selection: local device list or container GPU
    params["use_gpu"] = recipe_config.get("use_gpu", False)
    if params["use_gpu"]:
        params["gpu_location"] = recipe_config.get("gpu_location", "local_gpu")
        if params["gpu_location"] == "local_gpu":
            gpu_devices = recipe_config.get("gpu_devices", [])
            params["gpu_devices"] = parse_gpu_devices(gpu_devices)
        else:
            params["gpu_devices"] = [GPU_CONFIGURATION.CONTAINER_GPU]
    else:
        params["gpu_devices"] = None
    # Training hyperparameters
    params["forecasting_style"] = recipe_config.get("forecasting_style",
                                                    "auto")
    params["epoch"] = recipe_config.get("epoch", 10)
    params["batch_size"] = recipe_config.get("batch_size", 32)
    # -1 means "let the backend choose the number of batches per epoch"
    params["auto_num_batches_per_epoch"] = recipe_config.get(
        "auto_num_batches_per_epoch", True)
    if params["auto_num_batches_per_epoch"]:
        params["num_batches_per_epoch"] = -1
    else:
        params["num_batches_per_epoch"] = recipe_config.get(
            "num_batches_per_epoch", 50)
    if params["num_batches_per_epoch"] == 0:
        raise PluginParamValidationError(
            "Number of batches per epoch cannot be 0")
    # Overwrite values in case of autoML mode selected
    params = automl_params_overwrite(params)
    # History sampling: "last_records" bounds the timeseries length
    params["sampling_method"] = recipe_config.get("sampling_method",
                                                  "last_records")
    params["max_timeseries_length"] = None
    if params["sampling_method"] == "last_records":
        params["max_timeseries_length"] = recipe_config.get(
            "number_records", 10000)
        if params["max_timeseries_length"] < 4:
            raise PluginParamValidationError(
                "Number of records must be higher than 4")
    params["evaluation_strategy"] = "split"
    params["evaluation_only"] = False
    # Log everything except dataset/folder handles
    printable_params = {
        param: value
        for param, value in params.items()
        if "dataset" not in param and "folder" not in param
    }
    logger.info(f"Recipe parameters: {printable_params}")
    return params
import dataiku  # FIX: dataiku.Dataset is used below but dataiku was never imported
import meaningcloud
import pandas as pd
from dataiku.customrecipe import (
    get_input_names_for_role,
    get_output_names_for_role,
    get_recipe_config,
    get_plugin_config,
)

from meaningcloud_common import setRequestSource, isBlockingErrorType

# ==============================================================================
# PLUGIN + RECIPE SETTINGS
# ==============================================================================

# Input/output datasets: take the first dataset bound to each role
input_name = get_input_names_for_role("input_dataset")[0]
output_name = get_output_names_for_role("output_dataset")[0]
input_dataset = dataiku.Dataset(input_name)
output_dataset = dataiku.Dataset(output_name)

# Plugin-level MeaningCloud connection settings (license key + API server)
meaningcloud_connection = get_plugin_config().get("meaningcloud_connection")
license_key = meaningcloud_connection.get("license_key", None)
server = meaningcloud_connection.get("meaningcloud_server",
                                     "https://api.meaningcloud.com")

# Recipe-level settings: number of summary sentences and the text column to use
sentences = int(get_recipe_config().get("sentences", 5))
text_column = get_recipe_config().get("column_name", None)

# ==============================================================================
# AUXILIARY FUNCTIONS
# ==============================================================================
        # NOTE(review): fragment — the enclosing `def summarize(...)` and the
        # outer `if` are above this chunk; indentation reconstructed, confirm
        # against the full file.
        # Build the list of summary sentences produced by the sumy summarizer.
        sentences = [
            str(s)
            for s in summarizer(parser.document, sentences_count=n_sentences)
        ]
        if all_capital:
            # Preserve the all-caps styling detected earlier, then clear the
            # flag so only this call is uppercased.
            # NOTE(review): mutating `all_capital` here implies it is shared
            # (nonlocal/global) state — verify in the enclosing scope.
            output_sentences = ' '.join(sentences).upper()
            all_capital = False
        else:
            output_sentences = ' '.join(sentences)
        return output_sentences
    else:
        # Nothing to summarize — presumably empty/invalid input text.
        return ''


# Checking for existing columns with same name: if "<col>_summary" already
# exists, append "_1", "_2", ... until a free column name is found.
new_column_name = text_column_name + "_summary"
if new_column_name in df.columns:
    j = 1
    while new_column_name + "_{}".format(j) in df.columns:
        j += 1
    new_column_name += "_{}".format(j)

# Adding a new column with computed summaries (one summary per input row)
df[new_column_name] = [summarize(text) for text in df[text_column_name].values]

# Write recipe outputs
output_dataset = get_output_names_for_role('output_dataset')[0]
dataiku.Dataset(output_dataset).write_with_schema(df)
def load_plugin_config_wordcloud() -> Dict:
    """Utility function to validate and load wordcloud recipe parameters into a clean dictionary

    Returns:
        Dictionary of parameter names (key) and values

    Raises:
        PluginParamValidationError: if an input/output role is missing or a
            recipe parameter is empty or refers to a non-existent column
    """
    params = {}
    # Input dataset: exactly one dataset must be bound to the role
    input_dataset_names = get_input_names_for_role("input_dataset")
    if len(input_dataset_names) != 1:
        raise PluginParamValidationError("Please specify one input dataset")
    input_dataset = dataiku.Dataset(input_dataset_names[0])
    input_dataset_columns = [p["name"] for p in input_dataset.read_schema()]
    # Output folder: exactly one folder must be bound to the role
    output_folder_names = get_output_names_for_role("output_folder")
    if len(output_folder_names) != 1:
        raise PluginParamValidationError("Please specify one output folder")
    params["output_folder"] = dataiku.Folder(output_folder_names[0])
    # Partition handling
    params["output_partition_path"] = get_folder_partition_root(
        params["output_folder"])
    # Recipe parameters
    recipe_config = get_recipe_config()
    # Text column: must be one of the input dataset's columns
    params["text_column"] = recipe_config.get("text_column")
    if params["text_column"] not in input_dataset_columns:
        raise PluginParamValidationError(
            f"Invalid text column selection: {params['text_column']}")
    logging.info(f"Text column: {params['text_column']}")
    # Language selection: either a per-row language column or a fixed language
    params["language"] = recipe_config.get("language")
    if params["language"] == "language_column":
        params["language_column"] = recipe_config.get("language_column")
        if params["language_column"] not in input_dataset_columns:
            raise PluginParamValidationError(
                f"Invalid language column selection: {params['language_column']}"
            )
        logging.info(f"Language column: {params['language_column']}")
    else:
        if not params["language"]:
            raise PluginParamValidationError("Empty language selection")
        if params["language"] not in SUPPORTED_LANGUAGES_SPACY:
            raise PluginParamValidationError(
                f"Unsupported language code: {params['language']}")
        params["language_column"] = None
        logging.info(f"Language: {params['language']}")
    # Subcharts: a saved-then-cleared parameter comes back as "" — normalize
    # any falsy value to None
    params["subchart_column"] = recipe_config.get("subchart_column") or None
    # "order66" is accepted although it is not a dataset column — presumably a
    # deliberate hidden value handled downstream; do not remove it
    if params["subchart_column"] and (
            params["subchart_column"] not in input_dataset_columns + ["order66"]):
        raise PluginParamValidationError(
            f"Invalid categorical column selection: {params['subchart_column']}"
        )
    logging.info(f"Subcharts column: {params['subchart_column']}")
    # Input dataframe: read only the columns actually needed (deduplicated,
    # excluding None and the special "order66" value)
    necessary_columns = [
        column for column in set([
            params["text_column"], params["language_column"],
            params["subchart_column"]
        ]) if (column not in [None, "order66"])
    ]
    params["df"] = input_dataset.get_dataframe(columns=necessary_columns)
    if params["df"].empty:
        raise PluginParamValidationError("Dataframe is empty")
    # Check if unsupported languages in multilingual case
    elif params["language_column"]:
        languages = set(params["df"][params["language_column"]].unique())
        unsupported_lang = languages - SUPPORTED_LANGUAGES_SPACY.keys()
        if unsupported_lang:
            raise PluginParamValidationError(
                f"Found {len(unsupported_lang)} unsupported languages: {', '.join(sorted(unsupported_lang))}"
            )
    logging.info(f"Read dataset of shape: {params['df'].shape}")
    return params