def validate_output_params(self) -> Dict:
     """Validate output parameters"""
     output_params = {}
     # Output dataset
     output_dataset_names = get_output_names_for_role("output_dataset")
     if len(output_dataset_names) == 0:
         raise PluginParamValidationError("Please specify output dataset")
     output_params["output_dataset"] = dataiku.Dataset(
         output_dataset_names[0])
     # Output folder
     output_folder_names = get_output_names_for_role("output_folder")
     if self.recipe_id in (RecipeID.DOCUMENT_TEXT_DETECTION, RecipeID.CROPPING):
         if len(output_folder_names) == 0:
             raise PluginParamValidationError(
                 "Please specify output folder")
         output_params["output_folder"] = dataiku.Folder(
             output_folder_names[0])
         output_folder_type = output_params["output_folder"].get_info().get(
             "type", "")
         output_params["output_folder_is_gcs"] = output_folder_type == "GCS"
         if output_params["output_folder_is_gcs"]:
             output_folder_access_info = output_params[
                 "output_folder"].get_info().get("accessInfo", {})
             output_params[
                 "output_folder_bucket"] = output_folder_access_info.get(
                     "bucket")
             output_params["output_folder_root_path"] = str(
                 output_folder_access_info.get("root", ""))[1:]
             logging.info("Output folder is stored on GCS")
         else:
             logging.info(
                 f"Output folder is stored on {output_folder_type}")
     return output_params
def get_input_output(has_model_as_second_input=False):

    if len(get_input_names_for_role('new')) == 0:
        raise ValueError('No new dataset.')
    if len(get_output_names_for_role('output_dataset')) == 0:
        raise ValueError('No output dataset.')

    new_dataset_name = get_input_names_for_role('new')[0]
    new_dataset = dataiku.Dataset(new_dataset_name)

    output_dataset_name = get_output_names_for_role('output_dataset')[0]
    output_dataset = dataiku.Dataset(output_dataset_name)

    if has_model_as_second_input:
        if len(get_input_names_for_role('model')) == 0:
            raise ValueError('No input model.')
        model_name = get_input_names_for_role('model')[0]
        model = dataiku.Model(model_name)
        return (new_dataset, model, output_dataset)
    else:
        if len(get_input_names_for_role('original')) == 0:
            raise ValueError('No original dataset.')

        original_dataset_name = get_input_names_for_role('original')[0]
        original_dataset = dataiku.Dataset(original_dataset_name)
        return (new_dataset, original_dataset, output_dataset)
Example #3
def get_input_output():
    if len(get_input_names_for_role('input_dataset')) == 0:
        raise ValueError('No input dataset.')
    input_dataset_name = get_input_names_for_role('input_dataset')[0]
    input_dataset = dataiku.Dataset(input_dataset_name)
    if len(get_output_names_for_role('output_dataset')) == 0:
        raise ValueError('No output dataset.')
    output_dataset_name = get_output_names_for_role('output_dataset')[0]
    output_dataset = dataiku.Dataset(output_dataset_name)
    return (input_dataset, output_dataset)
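A minimal usage sketch for this helper, assuming it is called from a recipe's main script; the pass-through transform is purely illustrative:

input_dataset, output_dataset = get_input_output()
df = input_dataset.get_dataframe()  # load the input as a pandas DataFrame
# ... transform df here ...
output_dataset.write_with_schema(df)  # write the rows and (re)create the output schema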
Example #4
 def validate_output_params(self) -> Dict:
     """Validate output parameters"""
     output_params_dict = {}
     # Mandatory output dataset
     output_dataset_names = get_output_names_for_role("output_dataset")
     if len(output_dataset_names) == 0:
         raise PluginParamValidationError("Please specify output folder")
     output_params_dict["output_dataset"] = dataiku.Dataset(
         output_dataset_names[0])
     # Optional output folder
     output_folder_names = get_output_names_for_role("output_folder")
     output_params_dict["output_folder"] = None
     if len(output_folder_names) != 0:
         output_params_dict["output_folder"] = dataiku.Folder(
             output_folder_names[0])
     return output_params_dict
Example #5
def get_design_input_output() -> tuple:
    """Returns input and output datasets after sanity check

    :raises: :class:`ValueError`: Missing input or output dataset(s)

    :returns: input and output datasets
    :rtype: tuple
    """
    input_names = get_input_names_for_role("user_list")
    if len(input_names) == 0:
        raise ValueError("No input dataset.")
    output_names = get_output_names_for_role("groups")
    if len(output_names) == 0:
        raise ValueError("No output dataset.")

    input_name = input_names[0]
    input_dataset = dataiku.Dataset(input_name)
    folder_ref = get_input_names_for_role('folder')
    if len(folder_ref) == 0:
        folder_name = None
    else:
        folder_name = folder_ref[0]

    output_name = output_names[0]
    output_dataset = dataiku.Dataset(output_name)

    return input_dataset, folder_name, output_dataset
Example #6
 def _add_output_dataset(self):
     output_dataset_name = get_output_names_for_role("tagged_documents")[0]
     self.dku_config.add_param(
         name="output_dataset",
         value=Dataset(output_dataset_name),
         required=True,
     )
Example #7
def get_config():
    config = {}
    config['input_ds'] = dataiku.Dataset(get_input_names_for_role('input_ds')[0])
    config['output_ds'] = dataiku.Dataset(get_output_names_for_role('output_ds')[0])

    for param in ['lat_column', 'lng_column', 'provider', 'cache_enabled', 'api_key', 'here_app_id', 'here_app_code', 'google_client', 'google_client_secret']:
        config[param] = get_recipe_config().get(param, None)

    config['batch_enabled'] = get_recipe_config().get('batch_enabled', False) \
        and (config['provider'] == 'bing')
    config['batch_size'] = get_recipe_config().get('batch_size_bing', 50)

    config['features'] = []
    prefix = get_recipe_config().get('column_prefix', '')

    for feature in ['address', 'city', 'postal', 'state', 'country']:
        if get_recipe_config().get(feature, False):
            config['features'].append({'name': feature, 'column': prefix + feature})

    if get_plugin_config().get('cache_location', 'original') == 'original':
        config['cache_location'] = os.environ["DIP_HOME"] + '/caches/plugins/geocoder/reverse'
    else:
        config['cache_location'] = get_plugin_config().get('cache_location_custom', '')

    config['cache_size'] = get_plugin_config().get('reverse_cache_size', 1000) * 1000
    config['cache_eviction'] = get_plugin_config().get('reverse_cache_policy', 'least-recently-stored')

    if len(config['features']) == 0:
        raise AttributeError('Please select at least one feature to extract.')

    if config['provider'] is None:
        raise AttributeError('Please select a geocoding provider.')

    return config
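A hedged sketch of how this config dictionary might be consumed in the recipe body; the reverse-geocoding step itself is elided and only the dataset handles and column names from the dictionary are used:

config = get_config()
df = config['input_ds'].get_dataframe()
# ... reverse-geocode each row using config['lat_column'], config['lng_column'] and
#     config['provider'], appending the columns listed in config['features'] ...
config['output_ds'].write_with_schema(df)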
def load_input_output(config):
    if not get_input_names_for_role("input_dataset"):
        raise ValueError("No input dataset.")
    input_dataset_name = get_input_names_for_role("input_dataset")[0]
    config.input_dataset = Dataset(input_dataset_name)

    output_dataset_name = get_output_names_for_role("output_dataset")[0]
    config.output_dataset = Dataset(output_dataset_name)
Example #9
def get_input_output():
    if len(get_input_names_for_role("input_dataset")) == 0:
        raise ValueError("No input dataset.")
    input_dataset_name = get_input_names_for_role("input_dataset")[0]
    input_dataset = dataiku.Dataset(input_dataset_name)

    output_folder_name = get_output_names_for_role("output_folder")[0]
    output_folder = dataiku.Folder(output_folder_name)
    return (input_dataset, output_folder)
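A possible follow-up for this dataset/folder pair, writing a small artifact into the managed folder through its local path (the same pattern as the Warp10 example further down); the file name and content are hypothetical:

import os

input_dataset, output_folder = get_input_output()
df = input_dataset.get_dataframe()
with open(os.path.join(output_folder.get_path(), 'summary.txt'), 'w') as f:
    f.write('Processed {} rows'.format(len(df)))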
Example #10
 def get_inputs(self):
     self.folder = Folder(get_output_names_for_role("folder_id")[0])
     self.output_file_path = get_recipe_config()['output_model_path']
     self.overwrite_output_model = get_recipe_config()['overwrite_output_model']
     self.batch_size = int(get_recipe_config()['batch_size'])
     if not get_recipe_config()['show_batch_size']:
         self.batch_size = -1
     self.model = Model(get_input_names_for_role("saved_model_id")[0])
     self.float_32 = get_recipe_config()["float_32"]
def load_plugin_config_langdetect() -> Dict:
    """Utility function to validate and load language detection parameters into a clean dictionary

    Returns:
        Dictionary of parameter names (key) and values

    """
    params = {}
    # input dataset
    input_dataset_names = get_input_names_for_role("input_dataset")
    if len(input_dataset_names) == 0:
        raise PluginParamValidationError("Please specify input dataset")
    params["input_dataset"] = dataiku.Dataset(input_dataset_names[0])
    input_dataset_columns = [
        p["name"] for p in params["input_dataset"].read_schema()
    ]

    # output dataset
    output_dataset_names = get_output_names_for_role("output_dataset")
    if len(output_dataset_names) == 0:
        raise PluginParamValidationError("Please specify output dataset")
    params["output_dataset"] = dataiku.Dataset(output_dataset_names[0])

    # Recipe parameters
    recipe_config = get_recipe_config()
    # Text column
    params["text_column"] = recipe_config.get("text_column")
    if params["text_column"] not in input_dataset_columns:
        raise PluginParamValidationError(
            f"Invalid text column selection: {params['text_column']}")
    logging.info(f"Text column: {params['text_column']}")
    # Language scope
    params["language_scope"] = recipe_config.get("language_scope", [])
    if len(params["language_scope"]) == 0:
        params["language_scope"] = SUPPORTED_LANGUAGES_PYCLD3
    if len(params["language_scope"]) == 0:
        raise PluginParamValidationError(
            f"Invalid language scope: {params['language_scope']}")
    logging.info(
        f"Scope of {len(params['language_scope'])} languages: {params['language_scope']}"
    )
    # Minimum score
    params["minimum_score"] = float(recipe_config.get("minimum_score", 0))
    if params["minimum_score"] < 0 or params["minimum_score"] > 1:
        raise PluginParamValidationError(
            "Minimum score must be between 0 and 1")
    logging.info(f"Minimum score for detection: {params['minimum_score']:.2f}")
    # Fallback language
    params["fallback_language"] = recipe_config.get("fallback_language")
    if not params["fallback_language"] or params["fallback_language"] == "None":
        logging.info("No fallback language")
        params["fallback_language"] = ""
    else:
        logging.info(f"Fallback language: {params['fallback_language']}")
    return params
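An assumed consumption pattern for the validated parameters, reading only the text column and writing the enriched rows back out; the detection step itself is elided:

params = load_plugin_config_langdetect()
input_df = params["input_dataset"].get_dataframe(columns=[params["text_column"]])
# ... detect the language of each row, honoring params["language_scope"],
#     params["minimum_score"] and params["fallback_language"] ...
params["output_dataset"].write_with_schema(input_df)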
Example #12
 def get_inputs(self):
     self.input_folder = Folder(
         get_input_names_for_role("input_folder_id")[0])
     output_folder_id = get_output_names_for_role("output_folder_id")[0]
     self.output_folder = Folder(output_folder_id)
     self.output_file_path = get_recipe_config()['output_model_path']
     self.batch_size = int(get_recipe_config()['batch_size'])
     if not get_recipe_config()['show_batch_size']:
         self.batch_size = -1
     self.overwrite_output_model = get_recipe_config()['overwrite_output_model']
     self.model_path = get_recipe_config()['model_path']
     self.model_name = os_splitext(os_split(self.model_path)[1])[0]
     self.float_32 = get_recipe_config()["float_32"]
Example #13
def main():
    # getting the csv folder address
    havas_logs = Folder(customrecipe.get_input_names_for_role("files_folder")[0])
    havas_logs_path = havas_logs.get_path()

    files_to_process = get_files_to_process(havas_logs_path)

    # preparing dataset to write into
    havas_cost_data = DatasetWrapper(customrecipe.get_output_names_for_role("cost_data")[0])
    # havas_cost_data.dataset.spec_item['appendMode'] = True
    # writing into dataset
    append_files_to_dataset(files_to_process, havas_cost_data)
    # closing dataset and saving lines
    havas_cost_data.close()
Example #14
def apply_func(func,
               client=None,
               input_dataset="input_dataset",
               output_dataset="output_dataset"):
    input_dataset_name = get_input_names_for_role(input_dataset)[0]
    input_dataset = dataiku.Dataset(input_dataset_name)
    input_df = input_dataset.get_dataframe()

    output_dataset_name = get_output_names_for_role(output_dataset)[0]
    output_dataset = dataiku.Dataset(output_dataset_name)
    client = client or get_client(get_recipe_config())

    output_df = input_df.dropna().apply(
        lambda row: _safe_call(client, row, func), axis=1)
    output_dataset.write_with_schema(output_df)
Example #15
def load_predict_config():
    """Utility function to load, resolve and validate all predict recipe config into a clean `params` dictionary

    Returns:
        Dictionary of parameter names (key) and values
    """
    params = {}
    recipe_config = get_recipe_config()

    # model folder
    model_folder = dataiku.Folder(get_input_names_for_role("model_folder")[0])
    params["model_folder"] = model_folder
    params["partition_root"] = get_folder_partition_root(params["model_folder"], is_input=True)

    params["external_features_future_dataset"] = None
    external_features_future_dataset_names = get_input_names_for_role("external_features_future_dataset")
    if len(external_features_future_dataset_names) > 0:
        params["external_features_future_dataset"] = dataiku.Dataset(external_features_future_dataset_names[0])

    # output dataset
    output_dataset_names = get_output_names_for_role("output_dataset")
    if len(output_dataset_names) == 0:
        raise PluginParamValidationError("Please specify Forecast dataset in the 'Input / Output' tab of the recipe")
    params["output_dataset"] = dataiku.Dataset(output_dataset_names[0])
    check_only_one_read_partition(params["partition_root"], params["model_folder"])
    check_only_one_read_partition(params["partition_root"], params["external_features_future_dataset"])

    params["manual_selection"] = True if recipe_config.get("model_selection_mode") == "manual" else False

    params["performance_metric"] = recipe_config.get("performance_metric")
    params["selected_session"] = recipe_config.get("manually_selected_session", "latest_session")
    params["selected_model_label"] = recipe_config.get("manually_selected_model_label")

    params["prediction_length"] = recipe_config.get("prediction_length", -1)
    params["confidence_interval"] = recipe_config.get("confidence_interval", 95)
    params["quantiles"] = convert_confidence_interval_to_quantiles(params["confidence_interval"])
    params["include_history"] = recipe_config.get("include_history", False)

    params["sampling_method"] = recipe_config.get("sampling_method", "last_records")
    params["history_length_limit"] = None
    if params["sampling_method"] == "last_records":
        params["history_length_limit"] = recipe_config.get("number_records", 1000)
        if params["history_length_limit"] < 1:
            raise PluginParamValidationError("Number of historical records must be higher than 1")

    printable_params = {param: value for param, value in params.items() if "dataset" not in param and "folder" not in param}
    logger.info(f"Recipe parameters: {printable_params}")
    return params
Example #16
def get_results_input_output() -> tuple:
    """Returns input and output datasets after sanity check

    :raises: :class:`ValueError`: Missing input or output dataset(s)

    :returns: input and output datasets
    :rtype: tuple
    """
    input_names = get_input_names_for_role("results")
    output_names = get_output_names_for_role('statistics')
    if len(input_names) == 0:
        raise ValueError("No input dataset.")
    if len(output_names) == 0:
        raise ValueError("No output dataset.")

    input_dataset = dataiku.Dataset(input_names[0])
    output_dataset = dataiku.Dataset(output_names[0])
    return input_dataset, output_dataset
Example #17
def get_config():
    config = {}
    config['input_ds'] = dataiku.Dataset(get_input_names_for_role('input_ds')[0])
    config['output_ds'] = dataiku.Dataset(get_output_names_for_role('output_ds')[0])

    for param in ['address_column', 'cache_enabled', 'provider', 'api_key', 'here_app_id', 'here_app_code', 'google_client', 'google_client_secret']:
        config[param] = get_recipe_config().get(param, None)

    config['batch_enabled'] = get_recipe_config().get('batch_enabled', False) \
        and (config['provider'] == 'bing' or config['provider'] == 'mapquest' or config['provider'] == 'uscensus')

    config['batch_size'] = {
        'bing': get_recipe_config().get('batch_size_bing', 50),
        'mapquest': 100,
        'uscensus': get_recipe_config().get('batch_size_uscensus', 1000)
    }.get(config['provider'], 0)

    config['batch_timeout'] = {
        'bing': 10,
        'mapquest': 30,
        'uscensus': 1800
    }.get(config['provider'], 0)

    if get_plugin_config().get('cache_location', 'original') == 'original':
        config['cache_location'] = os.environ["DIP_HOME"] + '/caches/plugins/geocoder/forward'
    else:
        config['cache_location'] = get_plugin_config().get('cache_location_custom', '')

    config['cache_size'] = get_plugin_config().get('forward_cache_size', 1000) * 1000
    config['cache_eviction'] = get_plugin_config().get('forward_cache_policy', 'least-recently-stored')

    prefix = get_recipe_config().get('column_prefix', '')
    for column_name in ['latitude', 'longitude']:
        config[column_name] = prefix + column_name

    if config['provider'] is None:
        raise AttributeError('Please select a geocoding provider.')

    return config
Example #18
def get_output_dataset(role):
    names = get_output_names_for_role(role)
    return dataiku.Dataset(names[0]) if len(names) > 0 else None
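Usage sketch for this optional-output helper, with a hypothetical role name and DataFrame; writing is simply skipped when the role is not connected:

metrics_dataset = get_output_dataset("metrics_dataset")
if metrics_dataset is not None:
    metrics_dataset.write_with_schema(metrics_df)  # metrics_df built earlier in the recipe (hypothetical)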
Example #19
# -*- coding: utf-8 -*-
import dataiku
from dataiku.customrecipe import get_output_names_for_role, get_recipe_config
import oncrawl as oc
from oncrawl import oncrawlDataAPI as ocd
from oncrawl import oncrawlProjectAPI as ocp

output_names = get_output_names_for_role('output')
output_datasets = [dataiku.Dataset(name) for name in output_names]
output = output_datasets[0]

#------------------------------config & vars
config = get_recipe_config()

#config checker to raise better error
e = None
if 'api_key' not in config.keys():
    e = 'Please add your API key'

if 'list_projects_id_name' not in config.keys() or len(
        config['list_projects_id_name'].keys()) == 0:
    e = 'Your Oncrawl account seems to have no projects available. Please check your Oncrawl account.'

if 'list_configs_crawls' not in config.keys() or len(
        config['list_configs_crawls'].keys(
        )) == 0 or 'list_crawls_project' not in config.keys() or len(
            config['list_crawls_project'].keys()) == 0:
    e = 'Your Oncrawl account seems to have no crawls available. Please check the chosen project and date range with your Oncrawl account.'

if e is not None:
    raise Exception(e)
Example #20
                    format='Warp10 recipe %(levelname)s - %(message)s')

recipe_config = get_recipe_config()

warp10_connection = recipe_config.get('warp10_connection', None)
warpscript = recipe_config.get('code', None)

if not warp10_connection:
    raise ValueError('No Warp10 connection defined')

if not warpscript:
    raise ValueError('No WarpScript code entered')

warp10_client = Warp10Client(warp10_connection)

logger.info('Appending UPDATE function to end of WarpScript code')
warpscript = warpscript + "\n'{}' UPDATE".format(warp10_connection['write_token'])

result = warp10_client.exec_warpscript(warpscript)

output_folder_names = get_output_names_for_role('main_output')
if output_folder_names:
    # Semi-dummy output since there is really nothing to do at this point
    output_folder = dataiku.Folder(output_folder_names[0])

    filename = 'Run_{}.txt'.format(datetime.now().strftime('%Y-%m-%dT%H-%M-%S-%f')[:-3])
    logger.info('Writing response file {} in output folder'.format(filename))
    with open(os.path.join(output_folder.get_path(), filename), 'w') as results_file:
        results_file.write('Response of successful WarpScript execution:\n' + result)
def load_input_output_params(recipe_id: RecipeID) -> Dict:
    """Load and validate input/output parameters for both indexing and search recipes

    Returns:
        Dictionary of parameter names (key) and values

    Raises:
        PluginParamValidationError: If a parameter is not valid

    """
    params = {}
    # Index folder
    if recipe_id == RecipeID.SIMILARITY_SEARCH_INDEX:
        output_folder_names = get_output_names_for_role("index_folder")
        if len(output_folder_names) == 0:
            raise PluginParamValidationError(
                "Please specify index folder as output")
        params["index_folder"] = dataiku.Folder(output_folder_names[0])
        params["folder_partition_root"] = get_folder_partition_root(
            params["index_folder"])
    elif recipe_id == RecipeID.SIMILARITY_SEARCH_QUERY:
        input_folder_names = get_input_names_for_role("index_folder")
        if len(input_folder_names) == 0:
            raise PluginParamValidationError(
                "Please specify index folder as input")
        params["index_folder"] = dataiku.Folder(input_folder_names[0])
        params["folder_partition_root"] = get_folder_partition_root(
            params["index_folder"], is_input=True)
        check_only_one_read_partition(params["folder_partition_root"],
                                      params["index_folder"])
    # Input dataset
    input_dataset_names = get_input_names_for_role("input_dataset")
    if len(input_dataset_names) == 0:
        raise PluginParamValidationError("Please specify input dataset")
    params["input_dataset"] = dataiku.Dataset(input_dataset_names[0])
    input_dataset_columns = [
        p["name"] for p in params["input_dataset"].read_schema()
    ]
    check_only_one_read_partition(params["folder_partition_root"],
                                  params["input_dataset"])
    if recipe_id == RecipeID.SIMILARITY_SEARCH_QUERY:
        if params["index_folder"].read_partitions != params[
                "input_dataset"].read_partitions:
            raise PluginParamValidationError(
                "Inconsistent partitions between index folder and input dataset, please make sure both are partitioned with the same dimensions"
            )
    # Output dataset - only for search recipe
    if recipe_id == RecipeID.SIMILARITY_SEARCH_QUERY:
        output_dataset_names = get_output_names_for_role("output_dataset")
        if len(output_dataset_names) == 0:
            raise PluginParamValidationError("Please specify output dataset")
        params["output_dataset"] = dataiku.Dataset(output_dataset_names[0])
    # Recipe input parameters
    recipe_config = get_recipe_config()
    params["unique_id_column"] = recipe_config.get("unique_id_column")
    if params["unique_id_column"] not in input_dataset_columns:
        raise PluginParamValidationError(
            f"Invalid unique ID column: {params['unique_id_column']}")
    params["feature_columns"] = recipe_config.get("feature_columns", [])
    if not set(params["feature_columns"]).issubset(set(input_dataset_columns)):
        raise PluginParamValidationError(
            f"Invalid feature column(s): {params['feature_columns']}")
    printable_params = {
        k: v
        for k, v in params.items()
        if k not in {"input_dataset", "index_folder", "output_dataset"}
    }
    logging.info(f"Validated input/output parameters: {printable_params}")
    return params
Example #22
credential_parameters = config.get("credential", {})
endpoint_parameters = get_endpoint_parameters(config)
extraction_key = endpoint_parameters.get("extraction_key", "")
is_raw_output = endpoint_parameters.get("raw_output", True)
parameter_columns = [column for column in config.get("parameter_columns", []) if column]
if len(parameter_columns) == 0:
    raise ValueError("There is no parameter column selected.")
parameter_renamings = get_dku_key_values(config.get("parameter_renamings", {}))
custom_key_values = get_dku_key_values(config.get("custom_key_values", {}))
input_parameters_dataset = dataiku.Dataset(input_A_names[0])
partitioning_keys = get_partitioning_keys(input_parameters_dataset, dku_flow_variables)
custom_key_values.update(partitioning_keys)
input_parameters_dataframe = input_parameters_dataset.get_dataframe()

recipe_session = RestApiRecipeSession(
    custom_key_values,
    credential_parameters,
    endpoint_parameters,
    extraction_key,
    parameter_columns,
    parameter_renamings
)
results = recipe_session.process_dataframe(input_parameters_dataframe, is_raw_output)

output_names_stats = get_output_names_for_role('api_output')
odf = pd.DataFrame(results)

if odf.size > 0:
    api_output = dataiku.Dataset(output_names_stats[0])
    api_output.write_with_schema(odf)
Example #23
from dataiku.customrecipe import get_input_names_for_role, get_output_names_for_role, get_recipe_config
import pandas as pd
from dku_idtb_decision_tree.tree import Tree
from dku_idtb_scoring.score import score, write_with_schema
from dku_idtb_compatibility.utils import safe_str
from dataiku.doctor.prediction.reg_evaluation_recipe import compute_multiclass_metrics, compute_binary_classification_metrics

input_dataset = dataiku.Dataset(get_input_names_for_role("input_dataset")[0])
scored_dataset = dataiku.Dataset(
    get_output_names_for_role("scored_dataset")[0])
metrics_dataset = dataiku.Dataset(
    get_output_names_for_role("metrics_dataset")[0])
folder = dataiku.Folder(get_input_names_for_role("folder")[0])
chunk_size_param = get_recipe_config()["chunk_size"]

try:
    tree = folder.read_json(get_recipe_config()["tree_file"])
except ValueError:
    raise Exception("No tree file named " + get_recipe_config()["tree_file"])

tree["df"] = input_dataset.get_dataframe()
tree = Tree(**tree)

scored_df = score(tree, input_dataset, chunk_size_param, True)

target_mapping = {
    safe_str(label): index
    for index, label in enumerate(tree.target_values)
}
scored_df_nona = scored_df.dropna(subset=["prediction"])
y_actual, y_pred = scored_df_nona[tree.target], scored_df_nona.prediction
Example #24
# -*- coding: utf-8 -*-
import datetime
import dataiku
from dataiku.customrecipe import get_input_names_for_role, get_output_names_for_role, get_recipe_config
from googlesheets import get_spreadsheet
from gspread.utils import rowcol_to_a1

# Input
input_name = get_input_names_for_role('input_role')[0]
input_dataset = dataiku.Dataset(input_name)
input_schema = input_dataset.read_schema()

# Output
output_name = get_output_names_for_role('output_role')[0]
output_dataset = dataiku.Dataset(output_name)
output_dataset.write_schema(input_schema)

# Get configuration
config = get_recipe_config()
credentials = config.get("credentials")
doc_id = config.get("doc_id")
tab_id = config.get("tab_id")
insert_format = config.get("insert_format")

# Load worksheet
ws = get_spreadsheet(credentials, doc_id, tab_id)


# Make available a method of later version of gspread (probably 3.4.0)
# from https://github.com/burnash/gspread/pull/556
def append_rows(self, values, value_input_option='RAW'):
Example #25
    jira_id = id_list_df[id_column_name][index]
    indexes_columns = {"jira_id": jira_id}
    if queue_id_column_name is not None:
        queue_id = id_list_df[queue_id_column_name][index]
        indexes_columns.update({"queue_id": queue_id})
    else:
        queue_id = None

    data = client.get_endpoint(endpoint_name,
                               jira_id,
                               "",
                               expand=expand,
                               raise_exception=False,
                               queue_id=queue_id)
    while len(data) > 0:
        for result in data:
            record = dict(indexes_columns)
            record.update(result)
            results.append(client.format(record))
        if client.pagination.is_next_page():
            data = client.get_next_page()
        else:
            break

output_names_stats = get_output_names_for_role('jira_output')
odf = pd.DataFrame(results)

if odf.size > 0:
    jira_output = dataiku.Dataset(output_names_stats[0])
    jira_output.write_with_schema(odf)
Example #26
# -*- coding: utf-8 -*-
import dataiku
from dataiku.customrecipe import get_output_names_for_role, get_recipe_config, get_input_names_for_role
import json
from salesforce import SalesforceClient

# Output
output_name = get_output_names_for_role('main')[0]
output = dataiku.Dataset(output_name)
output.write_schema([{
    "name": "operation",
    "type": "string"
}, {
    "name": "error",
    "type": "string"
}, {
    "name": "salesforce_record_id",
    "type": "string"
}, {
    "name": "data",
    "type": "object"
}])

# Read configuration
config = get_recipe_config()
object_name = config.get('object_name', None)
if object_name is None:
    raise Exception("Object name has to be set")

client = SalesforceClient(config)
from api_formatting import get_query

import dataiku
from dataiku.customrecipe import (get_recipe_config, get_output_names_for_role)

# ==============================================================================
# SETUP
# ==============================================================================
api_configuration_preset = get_recipe_config().get("api_configuration_preset")
if api_configuration_preset is None or api_configuration_preset == {}:
    raise ValueError("Please specify an API configuration preset")
HEADERS = {
    "authorization": "Bearer " + api_configuration_preset.get("access_token")
}

groups_name = get_output_names_for_role("campaign_group_dataset")[0]
groups_dataset = dataiku.Dataset(groups_name)

campaigns_names = get_output_names_for_role("campaign_dataset")[0]
campaigns_dataset = dataiku.Dataset(campaigns_names)

creatives_names = get_output_names_for_role("creative_dataset")[0]
creatives_dataset = dataiku.Dataset(creatives_names)

campaigns_analytics_names = get_output_names_for_role(
    "campaign_analytics_dataset")[0]
campaign_analytics_dataset = dataiku.Dataset(campaigns_analytics_names)

creatives_analytics_names = get_output_names_for_role(
    "creatives_analytics_dataset")[0]
creatives_analytics_dataset = dataiku.Dataset(creatives_analytics_names)
def load_training_config(recipe_config):
    """Utility function to load, resolve and validate all training recipe config into a clean `params` dictionary

    Returns:
        Dictionary of parameter names (key) and values
    """
    params = {}

    input_dataset_name = get_input_names_for_role("input_dataset")[0]
    params["training_dataset"] = dataiku.Dataset(input_dataset_name)
    training_dataset_columns = [
        p["name"] for p in params["training_dataset"].read_schema()
    ]

    model_folder_name = get_output_names_for_role("model_folder")[0]
    params["model_folder"] = dataiku.Folder(model_folder_name)
    params["partition_root"] = get_folder_partition_root(
        params["model_folder"])
    check_only_one_read_partition(params["partition_root"],
                                  params["training_dataset"])

    evaluation_dataset_name = get_output_names_for_role(
        "evaluation_dataset")[0]
    params["evaluation_dataset"] = dataiku.Dataset(evaluation_dataset_name)

    params["make_forecasts"] = False
    evaluation_forecasts_dataset_names = get_output_names_for_role(
        "evaluation_forecasts_dataset")
    if len(evaluation_forecasts_dataset_names) > 0:
        params["evaluation_forecasts_dataset"] = dataiku.Dataset(
            evaluation_forecasts_dataset_names[0])
        params["make_forecasts"] = True

    params["time_column_name"] = recipe_config.get("time_column")
    if params["time_column_name"] is None:
        raise PluginParamValidationError(
            "Time column is mandatory:, please select one")
    elif params["time_column_name"] not in training_dataset_columns:
        raise PluginParamValidationError(
            f"Invalid time column selection: {params['time_column_name']}")

    params["target_columns_names"] = sanitize_column_list(
        recipe_config.get("target_columns"))
    if len(params["target_columns_names"]) == 0 or not all(
            column in training_dataset_columns
            for column in params["target_columns_names"]):
        raise PluginParamValidationError(
            f"Invalid target column(s) selection: {params['target_columns_names']}"
        )
    params["target_columns_names"] = reorder_column_list(
        params["target_columns_names"], training_dataset_columns)

    long_format = recipe_config.get("additional_columns", False)
    if long_format:
        params["timeseries_identifiers_names"] = sanitize_column_list(
            recipe_config.get("timeseries_identifiers", []))
        if not all(column in training_dataset_columns
                   for column in params["timeseries_identifiers_names"]):
            raise PluginParamValidationError(
                f"Invalid time series identifiers selection: {params['timeseries_identifiers_names']}"
            )
    else:
        params["timeseries_identifiers_names"] = []

    params["is_training_multivariate"] = True if (
        len(params["target_columns_names"]) > 1) or (
            len(params["timeseries_identifiers_names"]) > 0) else False

    if long_format and len(params["timeseries_identifiers_names"]) == 0:
        raise PluginParamValidationError(
            "Long format is activated but no time series identifiers have been provided"
        )

    external_feature_activated = recipe_config.get(
        "external_feature_activated", False)
    if external_feature_activated:
        params["external_features_columns_names"] = sanitize_column_list(
            recipe_config.get("external_feature_columns", []))
    else:
        params["external_features_columns_names"] = []
    if not all(column in training_dataset_columns
               for column in params["external_features_columns_names"]):
        raise PluginParamValidationError(
            f"Invalid external features selection: {params['external_features_columns_names']}"
        )

    params["frequency_unit"] = recipe_config.get("frequency_unit")

    if params["frequency_unit"] == "W":
        params[
            "frequency"] = f"W-{recipe_config.get('frequency_end_of_week', 1)}"
    elif params["frequency_unit"] == "H":
        params[
            "frequency"] = f"{recipe_config.get('frequency_step_hours', 1)}H"
    elif params["frequency_unit"] == "min":
        params[
            "frequency"] = f"{recipe_config.get('frequency_step_minutes', 1)}min"
    else:
        params["frequency"] = params["frequency_unit"]

    params["prediction_length"] = recipe_config.get("prediction_length")
    if not params["prediction_length"]:
        raise PluginParamValidationError("Please specify forecasting horizon")

    params["season_length"] = recipe_config.get(
        f"season_length_{params['frequency_unit']}", 1)
    if params["season_length"] < 1:
        raise PluginParamValidationError("Seasonality must be higher than 1")

    params["use_gpu"] = recipe_config.get("use_gpu", False)
    if params["use_gpu"]:
        params["gpu_location"] = recipe_config.get("gpu_location", "local_gpu")
        if params["gpu_location"] == "local_gpu":
            gpu_devices = recipe_config.get("gpu_devices", [])
            params["gpu_devices"] = parse_gpu_devices(gpu_devices)
        else:
            params["gpu_devices"] = [GPU_CONFIGURATION.CONTAINER_GPU]
    else:
        params["gpu_devices"] = None

    params["forecasting_style"] = recipe_config.get("forecasting_style",
                                                    "auto")
    params["epoch"] = recipe_config.get("epoch", 10)
    params["batch_size"] = recipe_config.get("batch_size", 32)

    params["auto_num_batches_per_epoch"] = recipe_config.get(
        "auto_num_batches_per_epoch", True)
    if params["auto_num_batches_per_epoch"]:
        params["num_batches_per_epoch"] = -1
    else:
        params["num_batches_per_epoch"] = recipe_config.get(
            "num_batches_per_epoch", 50)

    if params["num_batches_per_epoch"] == 0:
        raise PluginParamValidationError(
            "Number of batches per epoch cannot be 0")

    # Overwrite values in case of autoML mode selected
    params = automl_params_overwrite(params)

    params["sampling_method"] = recipe_config.get("sampling_method",
                                                  "last_records")
    params["max_timeseries_length"] = None
    if params["sampling_method"] == "last_records":
        params["max_timeseries_length"] = recipe_config.get(
            "number_records", 10000)
        if params["max_timeseries_length"] < 4:
            raise PluginParamValidationError(
                "Number of records must be higher than 4")

    params["evaluation_strategy"] = "split"
    params["evaluation_only"] = False

    printable_params = {
        param: value
        for param, value in params.items()
        if "dataset" not in param and "folder" not in param
    }
    logger.info(f"Recipe parameters: {printable_params}")
    return params
Example #29
import meaningcloud
import pandas as pd
from dataiku.customrecipe import (
    get_input_names_for_role,
    get_output_names_for_role,
    get_recipe_config,
    get_plugin_config,
)
from meaningcloud_common import setRequestSource, isBlockingErrorType

# ==============================================================================
# PLUGIN + RECIPE SETTINGS
# ==============================================================================

input_name = get_input_names_for_role("input_dataset")[0]
output_name = get_output_names_for_role("output_dataset")[0]

input_dataset = dataiku.Dataset(input_name)
output_dataset = dataiku.Dataset(output_name)

meaningcloud_connection = get_plugin_config().get("meaningcloud_connection")

license_key = meaningcloud_connection.get("license_key", None)
server = meaningcloud_connection.get("meaningcloud_server",
                                     "https://api.meaningcloud.com")
sentences = int(get_recipe_config().get("sentences", 5))
text_column = get_recipe_config().get("column_name", None)

# ==============================================================================
# AUXILIARY FUNCTIONS
# ==============================================================================
        sentences = [
            str(s)
            for s in summarizer(parser.document, sentences_count=n_sentences)
        ]

        if all_capital:
            output_sentences = ' '.join(sentences).upper()
            all_capital = False
        else:
            output_sentences = ' '.join(sentences)

        return output_sentences
    else:
        return ''


# Checking for existing columns with same name
new_column_name = text_column_name + "_summary"
if new_column_name in df.columns:
    j = 1
    while new_column_name + "_{}".format(j) in df.columns:
        j += 1
    new_column_name += "_{}".format(j)

# Adding a new column with computed summaries
df[new_column_name] = [summarize(text) for text in df[text_column_name].values]

# Write recipe outputs
output_dataset = get_output_names_for_role('output_dataset')[0]
dataiku.Dataset(output_dataset).write_with_schema(df)
Example #31
def load_plugin_config_wordcloud() -> Dict:
    """Utility function to validate and load language detection parameters into a clean dictionary

    Returns:
        Dictionary of parameter names (key) and values

    """
    params = {}
    # Input dataset
    input_dataset_names = get_input_names_for_role("input_dataset")
    if len(input_dataset_names) != 1:
        raise PluginParamValidationError("Please specify one input dataset")
    input_dataset = dataiku.Dataset(input_dataset_names[0])
    input_dataset_columns = [p["name"] for p in input_dataset.read_schema()]

    # Output folder
    output_folder_names = get_output_names_for_role("output_folder")
    if len(output_folder_names) != 1:
        raise PluginParamValidationError("Please specify one output folder")
    params["output_folder"] = dataiku.Folder(output_folder_names[0])

    # Partition handling
    params["output_partition_path"] = get_folder_partition_root(
        params["output_folder"])

    # Recipe parameters
    recipe_config = get_recipe_config()

    # Text column
    params["text_column"] = recipe_config.get("text_column")
    if params["text_column"] not in input_dataset_columns:
        raise PluginParamValidationError(
            f"Invalid text column selection: {params['text_column']}")
    logging.info(f"Text column: {params['text_column']}")
    # Language selection
    params["language"] = recipe_config.get("language")
    if params["language"] == "language_column":
        params["language_column"] = recipe_config.get("language_column")
        if params["language_column"] not in input_dataset_columns:
            raise PluginParamValidationError(
                f"Invalid language column selection: {params['language_column']}"
            )
        logging.info(f"Language column: {params['language_column']}")
    else:
        if not params["language"]:
            raise PluginParamValidationError("Empty language selection")
        if params["language"] not in SUPPORTED_LANGUAGES_SPACY:
            raise PluginParamValidationError(
                f"Unsupported language code: {params['language']}")
        params["language_column"] = None
        logging.info(f"Language: {params['language']}")
    # Subcharts
    params["subchart_column"] = recipe_config.get("subchart_column")
    # If parameter is saved then cleared, config retrieves ""
    params["subchart_column"] = None if not params[
        "subchart_column"] else params["subchart_column"]
    if params["subchart_column"] and (
        (params["subchart_column"]
         not in input_dataset_columns + ["order66"])):
        raise PluginParamValidationError(
            f"Invalid categorical column selection: {params['subchart_column']}"
        )
    logging.info(f"Subcharts column: {params['subchart_column']}")

    # Input dataframe
    necessary_columns = [
        column for column in set([
            params["text_column"], params["language_column"],
            params["subchart_column"]
        ]) if (column not in [None, "order66"])
    ]
    params["df"] = input_dataset.get_dataframe(columns=necessary_columns)
    if params["df"].empty:
        raise PluginParamValidationError("Dataframe is empty")
    # Check if unsupported languages in multilingual case
    elif params["language_column"]:
        languages = set(params["df"][params["language_column"]].unique())
        unsupported_lang = languages - SUPPORTED_LANGUAGES_SPACY.keys()
        if unsupported_lang:
            raise PluginParamValidationError(
                f"Found {len(unsupported_lang)} unsupported languages: {', '.join(sorted(unsupported_lang))}"
            )

    logging.info(f"Read dataset of shape: {params['df'].shape}")

    return params