def build_scenario(build_plan, filter_on='ready', connection='dataiku_workspace', ref_table='referentialclient',
                   ref_project='DIReferential', add_ecrm_context=True, finish_on_client=None, single_client=None):
    scenario = Scenario()
    if not isinstance(filter_on, list):
        filter_on = [filter_on]
    project_name = dataiku.default_project_key()
    project = dataiku.api_client().get_project(project_name)
    local_variables = project.get_variables()['local']
    env = local_variables['env']
    kut.display_message('reading client context referential')
    executor = SQLExecutor2(connection=connection)
    sql_query_referential_client = "SELECT * FROM " + '_'.join([env, ref_project, ref_table])
    client_ref = executor.query_to_df(sql_query_referential_client)
    filter_query = ' & '.join(filter_on)
    client_ref = client_ref.query(filter_query) if filter_query else client_ref
    kut.display_message('Clients ready for automation: ' + ', '.join(client_ref.clientName.unique()))
    kut.display_message('run configuration')
    print(build_plan)
    if not pd.isnull(finish_on_client):
        finish_client = client_ref[client_ref.clientName == finish_on_client]
        if len(finish_client) == 0:
            kut.display_message(
                'finish client not found in plan ' + finish_on_client + ', is the client name valid?'
            )
        other_clients = client_ref[client_ref.clientName != finish_on_client]
        client_ref = pd.concat([other_clients, finish_client], ignore_index=True)
    success = []
    if single_client is not None:
        requested_client = client_ref[client_ref.clientName == single_client]
        if not len(requested_client):
            kut.display_message('requested single client is not found, building all allowed clients')
        else:
            client_ref = requested_client
    for index, client_row in client_ref.iterrows():
        variables = set_client_context(client_row=client_row, add_ecrm_context=add_ecrm_context, connection=connection)
        client_name = variables['local']['clientName']
        kut.display_message('starting builds on ' + client_name)
        run_scenario(table_plan=build_plan, scenario=scenario)
        success.append(client_name)
        scenario.set_global_variables(successfullRun=success)
        print('done_________________' + client_name)
    return success
def get_token():
    # Read in the existing conf
    dss = dataiku.api_client()
    project = dss.get_project(dataiku.default_project_key())
    variables = project.get_variables()["standard"]
    conf = variables.get("powerbi-settings", None)
    # Decrypt
    key = request.args.get("api-key")
    pbi = {}
    pbi["username"] = conf["username"]
    pbi["password"] = decrypt_string(conf["password"], key)
    pbi["client_id"] = conf["client_id"]
    pbi["client_secret"] = decrypt_string(conf["client_secret"], key)
    pbi["resource"] = conf["resource"]
    pbi["grant_type"] = conf["grant_type"]
    pbi["scope"] = conf["scope"]
    # Get the token
    response = requests.post('https://login.microsoftonline.com/common/oauth2/token', data=pbi)
    o = {}
    o["token"] = response.json().get("access_token")
    return json.dumps(o)
def get_snowflake_datasets():
    project_key = default_project_key()
    project = api_client().get_project(project_key)
    return [
        dataset for dataset in project.list_datasets()
        if dataset.type == 'Snowflake'
    ]
def get_folder_partition_root(folder, is_input=False):
    """Retrieve the partition root path using a dataiku.Folder.

    Args:
        folder (dataiku.Folder): Input or output folder of the recipe used to retrieve the partition path pattern.
        is_input (bool): True if the folder must be considered as an input, False if it is an output.

    Returns:
        Partition path, or an empty string if the folder is not partitioned.
    """
    folder_id = folder.get_id()
    input_id = folder_id if is_input else None
    dku_flow_variables = dataiku.get_flow_variables()
    client = dataiku.api_client()
    project = client.get_project(dataiku.default_project_key())
    folder = project.get_managed_folder(folder_id)
    folder_config = folder.get_definition()
    partitioning_config = folder_config.get("partitioning")
    if not partitioning_config:
        return ""
    file_path_pattern = partitioning_config.get("filePathPattern", None)
    dimensions, types = get_dimensions(partitioning_config)
    partitions = get_partitions(dku_flow_variables, dimensions, input_id=input_id)
    file_path = complete_file_path_pattern(file_path_pattern, partitions, dimensions, types)
    file_path = complete_file_path_time_pattern(dku_flow_variables, file_path, input_id=input_id)
    return file_path
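# A minimal usage sketch (not from the original source) for get_folder_partition_root above,
# assuming it runs inside a partitioned Python recipe whose output is a managed folder named
# "output_folder" (a hypothetical name). The returned root is prepended to any written file
# so that each partition lands under its own sub-path.
import dataiku

output_folder = dataiku.Folder("output_folder")  # hypothetical folder name
partition_root = get_folder_partition_root(output_folder)
with output_folder.get_writer(partition_root + "/result.csv") as writer:
    writer.write(b"col_a,col_b\n1,2\n")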
def save_new_token():
    # Read in the conf and get a token
    conf = json.loads(request.data)
    key = conf["api-key"]
    pbi = {}
    pbi["username"] = conf["powerbi-username"]
    pbi["password"] = conf["powerbi-password"]
    pbi["client_id"] = conf["powerbi-client-id"]
    pbi["client_secret"] = conf["powerbi-client-secret"]
    pbi["resource"] = conf["powerbi-resource"]
    pbi["grant_type"] = conf["powerbi-grant-type"]
    pbi["scope"] = conf["powerbi-scope"]
    response = requests.post('https://login.microsoftonline.com/common/oauth2/token', data=pbi)
    # Save the token
    data = pbi
    data["password"] = encrypt_string(conf["powerbi-password"], key)
    data["client_secret"] = encrypt_string(conf["powerbi-client-secret"], key)
    data["access_token"] = response.json().get("access_token")
    data["created_at"] = str(datetime.datetime.utcnow())
    data["dss_port"] = os.environ["DKU_BACKEND_PORT"]
    data["webapp_project"] = conf["webapp-url"].split("/")[-3]
    data["webapp_id"] = conf["webapp-url"].split("/")[-2]
    data["project_key"] = os.environ["DKU_CURRENT_PROJECT_KEY"]
    set_dss_variables(dataiku.default_project_key(), data)
    # Send back some results
    o = {}
    o["powerbi-access-token"] = data["access_token"]
    return json.dumps(o)
def count_records(dataset: dataiku.Dataset) -> int:
    """Count the number of records of a dataset using the Dataiku dataset metrics API."""
    metric_id = "records:COUNT_RECORDS"
    dataset_name = dataset.name.split(".")[1]
    partitions = dataset.read_partitions
    client = dataiku.api_client()
    project = client.get_project(dataiku.default_project_key())
    logging.info("Counting records of dataset: {}".format(dataset_name))
    if partitions is None or len(partitions) == 0:
        project.get_dataset(dataset_name).compute_metrics(metric_ids=[metric_id])
        metric = dataset.get_last_metric_values()
        record_count = dataiku.ComputedMetrics.get_value_from_data(metric.get_global_data(metric_id=metric_id))
        logging.info("Dataset contains {:d} records and is not partitioned".format(record_count))
    else:
        record_count = 0
        for partition in partitions:
            project.get_dataset(dataset_name).compute_metrics(partition=partition, metric_ids=[metric_id])
            metric = dataset.get_last_metric_values()
            record_count += dataiku.ComputedMetrics.get_value_from_data(
                metric.get_partition_data(partition=partition, metric_id=metric_id)
            )
        logging.info("Dataset contains {:d} records in partition(s) {}".format(record_count, partitions))
    return record_count
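# A minimal usage sketch (not from the original source) for count_records above, assuming it
# runs inside a recipe or notebook of the project that owns a dataset named "transactions"
# (a hypothetical name) and that the standard logging module is already configured.
import dataiku

transactions = dataiku.Dataset("transactions")  # hypothetical dataset name
n_records = count_records(transactions)
print("transactions holds {} records".format(n_records))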
def get_dataset_flow():
    client = dataiku.api_client()
    project_key = dataiku.default_project_key()
    project = client.get_project(project_key)
    datasets = project.list_datasets()
    dataset_names = [dataset["name"] for dataset in datasets]
    return json.dumps({"dataset_names": dataset_names})
def save_data(id, payload, content_type, label=None, project_key=None, encoding=None):
    """
    Saves data as a DSS static insight that can be exposed on the dashboard

    :param str id: Unique identifier of the insight within the project. If an insight with the same
        identifier already exists, it will be replaced
    :param payload: bytes-oriented data, or Base64 string
    :param content_type: the MIME type of the data in payload (example: text/html or image/png)
    :param str label: Optional display label for the insight. If None, the id will be used as label
    :param str project_key: Project key in which the insight must be saved. If None, the contextual
        (current) project is used
    :param str encoding: If the payload was a Base64 string, this must be "base64". Else, this must be None
    """
    if project_key is None:
        project_key = default_project_key()
    backend_void_call(
        "insights/save-static-file-insight", {
            "projectKey": project_key,
            "id": id,
            "payload": _get_payload(payload, encoding),
            "contentType": content_type,
            "label": label
        })
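# A minimal usage sketch (not from the original source) for the save_data insight helper above,
# called through the dataiku.insights module from a recipe or notebook. The insight id
# "sales-summary" and the HTML payload are illustrative values only.
from dataiku import insights

insights.save_data(
    "sales-summary",  # hypothetical insight id
    "<h1>Sales summary</h1><p>42 orders</p>",
    "text/html",
    label="Sales summary",
)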
def list_datasets():
    project_key = dataiku.default_project_key()
    client = dataiku.api_client()
    project = client.get_project(project_key)
    dataset_list = [{"name": dataset_dict['name']} for dataset_dict in project.list_datasets()]
    return json.dumps({'dataset_list': dataset_list})
def get_sql_table(referential_name, project_key='DIReferential'):
    env = get_project_variables(scope='local')['env']
    project_key = dataiku.default_project_key() if not project_key or project_key == 'self' else project_key
    table_name = '_'.join([env, project_key, referential_name.lower()])
    query = 'SELECT * FROM ' + table_name
    connection = SQLExecutor2(connection='dataiku_workspace')
    return connection.query_to_df(query)
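# A minimal usage sketch (not from the original source) for get_sql_table above, assuming the
# project's local variables define 'env' and that a table such as
# "<env>_DIReferential_referentialclient" exists on the 'dataiku_workspace' connection.
# The referential name below is illustrative only.
client_ref_df = get_sql_table('referentialClient')  # hypothetical referential name
print(client_ref_df.head())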
def run(self):
    self.get_inputs()
    self.validation()
    self.keras_model = get_keras_model_from_saved_model(default_project_key(), self.model)
    self.onnx_model = convert_from_keras_to_onnx(self.keras_model, self.batch_size, self.float_32)
    self.write_output()
def do(payload, config, plugin_config, inputs):
    project_key = dataiku.default_project_key()
    project_managed_folders = dataiku.api_client().get_project(project_key).list_managed_folders()
    choices = [{
        'label': '{} ({})'.format(mf['name'], mf['type']),
        'value': mf['id']
    } for mf in project_managed_folders]
    choices.append({'label': 'Create new Filesystem folder...', 'value': 'create_new_folder'})
    return {"choices": choices}
def draw_graph():
    # get data
    project_key = dataiku.default_project_key()
    similarity = float(request.args.get('similarity'))
    node_source = request.args.get('node_source')
    node_target = request.args.get('node_target')
    interactions = request.args.get('interactions')
    dataset = request.args.get('dataset')
    name = project_key + '.' + dataset
    print(name)
    df = dataiku.Dataset(name).get_dataframe()
    df = df[df[interactions] > similarity]
    df = df[[node_source, node_target, interactions]]
    df.columns = ['source', 'target', 'weight']
    print("%d rows" % df.shape[0])
    G = nx.Graph()
    G.add_edges_from(zip(df.source, df.target))
    print(nx.info(G))
    # degree
    for node, val in dict(nx.degree(G)).items():
        G.nodes[node]['degree'] = val
    # pagerank
    for node, val in dict(nx.pagerank(G)).items():
        G.nodes[node]['pagerank'] = val
    # connected components
    components = sorted(nx.connected_components(G), key=len, reverse=True)
    for component, nodes in enumerate(components):
        for node in nodes:
            G.nodes[node]['cc'] = component
    # community
    partition = best_partition(G)
    for node, cluster in dict(partition).items():
        G.nodes[node]['community'] = cluster
    # convert to JSON
    data = json_graph.node_link_data(G)
    # fix for networkx>=2.0 change of API
    if int(nx.__version__.split('.')[0]) >= 2:
        dict_name_id = {data["nodes"][i]["id"]: i for i in range(len(data["nodes"]))}
        for link in data["links"]:
            link["source"] = dict_name_id[link["source"]]
            link["target"] = dict_name_id[link["target"]]
    return json.dumps({"status": "ok", "graph": data})
def do(payload, config, plugin_config, inputs):
    if payload["funtastic"] == "engines":
        client = dataiku.api_client()
        project = client.get_project(dataiku.default_project_key())
        engines = list(project.get_settings().get_raw()['metrics']['engineConfig'].keys())
        return {'engines': engines}
    if payload["funtastic"] == "connections":
        client = dataiku.api_client()
        connections = list(client.list_connections().keys())
        return {'connections': connections}
def add_project_variable(variable, key, scope='local', list_shaped=True, project_key=None, unique=False):
    project_key = dataiku.default_project_key() if not project_key else project_key
    project = dataiku.api_client().get_project(project_key)
    variables = project.get_variables()
    value = kut.unique_values_as_string(array=variable, list_shaped=list_shaped, unique=unique) \
        if type(variable) == list else variable
    variables[scope][key] = value
    project.set_variables(variables)
    return variables
def add_ECRM_context(variables, connection='dataiku_workspace'):
    print('adding ecrm context')
    project_name = dataiku.default_project_key()
    project = dataiku.api_client().get_project(project_name)
    local_variables = project.get_variables()['local']
    env = local_variables['env']
    executor = SQLExecutor2(connection=connection)
    sql_query_client_ecrm = "SELECT * FROM " + env + "_DIReferential_referentialECRMOperation"
    client_ecrm = executor.query_to_df(sql_query_client_ecrm)
    ecrm_info = client_ecrm[client_ecrm.clientName == variables['local']['clientName']]
    print('found', len(ecrm_info), 'relevant entries')
    variables['local']['ecrmOperations'] = {}
    for i, operation_row in ecrm_info.iterrows():
        operation_dict = operation_row.to_dict()
        operation_type = operation_dict['operationType']
        del operation_dict['operationType']
        variables['local']['ecrmOperations'][operation_type] = operation_dict
    return variables
def get_existing_credentials():
    # Read in the existing conf
    dss = dataiku.api_client()
    project = dss.get_project(dataiku.default_project_key())
    variables = project.get_variables()["standard"]
    conf = variables.get("powerbi-settings", None)
    # Decrypt
    key = request.args.get("api-key")
    pbi = {}
    pbi["powerbi-username"] = conf["username"]
    pbi["powerbi-password"] = decrypt_string(conf["password"], key)
    pbi["powerbi-client-id"] = conf["client_id"]
    pbi["powerbi-client-secret"] = decrypt_string(conf["client_secret"], key)
    pbi["powerbi-resource"] = conf["resource"]
    pbi["powerbi-grant-type"] = conf["grant_type"]
    pbi["powerbi-scope"] = conf["scope"]
    # Send back some results
    return json.dumps(pbi)
def set_client_context(client_row, project_key=None, add_ecrm_context=True, connection='dataiku_workspace'):
    kut.display_message('setting context', secondary=True)
    if not project_key:
        project_key = dataiku.default_project_key()
        print('inferring project key:', project_key)
    project = dataiku.api_client().get_project(project_key)
    new_vars = serialize_variables(new_vars=client_row.to_dict(), project=project, context='local')
    if add_ecrm_context:
        new_vars = add_ECRM_context(new_vars, connection=connection)
    project.set_variables(new_vars)
    variables = project.get_variables()
    local_variables = variables['local']
    client_name = local_variables['clientName']
    print('client name:', client_name)
    print(local_variables)
    return variables
def get_default_project(self):
    """
    Get a handle to the current default project, if available (i.e. if dataiku.default_project_key() is valid)
    """
    import dataiku
    return DSSProject(self, dataiku.default_project_key())
        )
        sys.exit("AWS S3 Credential error")
    if input_connection["encryptionMode"] != "NONE":
        print(
            "[-] Found the connection {} but it is configured to use encryption which is not currently supported."
            .format(connection_name))
        sys.exit("AWS S3 Credential error")
    AWS_ACCESS_KEY = input_connection["accessKey"]
    AWS_SECRET_KEY = input_connection["secretKey"]
elif USE_PROJECT_VARIABLES:
    print(
        "[+] Use S3 credentials defined as Local, Project, or Global Variables. First, looking in Local Variables..."
    )
    dss = dataiku.api_client()
    project = dss.get_project(dataiku.default_project_key())
    variables = project.get_variables()
    if "snowflake" in variables["local"]:
        if "aws_access_key" in variables["local"]["snowflake"] and "aws_secret_key" in variables["local"]["snowflake"]:
            print("[+] Found AWS credentials in Local Variables")
            AWS_ACCESS_KEY = variables["local"]["snowflake"]["aws_access_key"]
            AWS_SECRET_KEY = variables["local"]["snowflake"]["aws_secret_key"]
        else:
            print(
                "[-] 'snowflake' key found in Local Variables but could not retrieve aws_access_key and/or aws_secret_key."
            )
            print("[-] Please check and correct your Local Variables.")
            sys.exit("Local Variables error")
    elif "snowflake" in variables["standard"]:
from flask import request
from distutils.util import strtobool
import json
import traceback

import dataiku
from dataiku.customwebapp import get_webapp_config
from design_experiment.sample_size import min_sample_size, z_value
from helpers import save_parameters
from constants import Parameters
from dku_tools import get_output_folder

config_settings = get_webapp_config()
project_key = dataiku.default_project_key()
client = dataiku.api_client()


@app.route('/sample_size', methods=['POST'])
def get_sample_size():
    try:
        config = json.loads(request.data)
        baseline_conversion_rate = float(config.get(Parameters.BCR.value)) / 100
        minimum_detectable_effect = float(config.get(Parameters.MDE.value)) / 100
        alpha = 1 - float(config.get(Parameters.SIG_LEVEL.value)) / 100
        power = float(config.get(Parameters.POWER.value)) / 100
        ratio = float(config.get(Parameters.RATIO.value)) / 100
        reach = float(config.get(Parameters.REACH.value)) / 100
        two_tailed = strtobool(config.get(Parameters.TAIL.value))
        sample_size_A, sample_size_B = min_sample_size(baseline_conversion_rate, minimum_detectable_effect,
                                                       alpha, power, ratio, two_tailed)
        sample_size_A = round(sample_size_A / reach)
        sample_size_B = round(sample_size_B / reach)
def is_dataset_valid(dataset_name):
    project_key = default_project_key()
    project = api_client().get_project(project_key)
    dss_dataset = project.get_dataset(dataset_name)
    return dss_dataset.get_settings().type == 'Snowflake'
import dataiku

INPUT_DATASET = "mydataset"
COLUMN_TO_PARTITION_BY = "mypartitioningcolumn"

dataset = dataiku.Dataset(INPUT_DATASET)
df = dataset.get_dataframe(columns=[COLUMN_TO_PARTITION_BY])
combinations = df[COLUMN_TO_PARTITION_BY].unique()
combinations_str = "/".join(combinations)

client = dataiku.api_client()
project = client.get_project(dataiku.default_project_key())
variables = project.get_variables()
variables["standard"]["myPartitionList"] = combinations_str
project.set_variables(variables)
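# A minimal follow-up sketch (not from the original source): a later recipe or scenario step in
# the same project can read the "myPartitionList" variable saved above and split it back into
# the individual partition identifiers.
import dataiku

client = dataiku.api_client()
project = client.get_project(dataiku.default_project_key())
partition_list = project.get_variables()["standard"]["myPartitionList"]
partitions = partition_list.split("/")
print(partitions)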
def get_project_variables(scope=None, project_key=None):
    project_key = dataiku.default_project_key() if not project_key else project_key
    project = dataiku.api_client().get_project(project_key)
    return project.get_variables()[scope] if scope else project.get_variables()
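# A minimal usage sketch (not from the original source) for get_project_variables above. It
# assumes the current project defines an 'env' key among its local variables; the key name is
# illustrative only.
all_variables = get_project_variables()
env = get_project_variables(scope='local').get('env')  # hypothetical variable name
print(env, list(all_variables.keys()))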