def do(payload, config, plugin_config, inputs): if payload["funtastic"] == "engines": client = dataiku.api_client() project = client.get_project(dataiku.default_project_key()) engines = project.get_settings().get_raw( )['metrics']['engineConfig'].keys() return {'engines': engines} if payload["funtastic"] == "connections": client = dataiku.api_client() connections = client.list_connections().keys() return {'connections': connections}
def get_dataset_flow():
    client = dataiku.api_client()
    project_key = dataiku.default_project_key()
    project = client.get_project(project_key)
    datasets = project.list_datasets()
    dataset_names = [dataset["name"] for dataset in datasets]
    return json.dumps({"dataset_names": dataset_names})
def run(self, progress_callback):
    # Get the remote API key and URL from the runnable configuration
    remote_url = self.config.get('remote_url')
    remote_key = self.config.get('remote_api')
    project_key = self.project_key

    # Get the local and remote wikis for the current project
    rp = dataikuapi.DSSClient(remote_url, api_key=remote_key).get_project(project_key)
    cp = dataiku.api_client().get_project(project_key)
    local_wiki = cp.get_wiki()
    remote_wiki = rp.get_wiki()

    # Delete matching remote articles, then recreate them from the local content
    for l_article in local_wiki.list_articles():
        for r_article in remote_wiki.list_articles():
            if l_article.article_id == r_article.article_id:
                r_article.delete()
        remote_wiki.create_article(l_article.article_id,
                                   content=l_article.get_data().get_body())

    return '<body>Wiki updated on instance running at: ' + remote_url + '</body>'
def count_records(dataset: dataiku.Dataset) -> int:
    """
    Count the number of records of a dataset using the Dataiku dataset metrics API
    """
    metric_id = "records:COUNT_RECORDS"
    dataset_name = dataset.name.split(".")[1]
    partitions = dataset.read_partitions
    client = dataiku.api_client()
    project = client.get_project(dataiku.default_project_key())
    logging.info("Counting records of dataset: {}".format(dataset_name))
    if partitions is None or len(partitions) == 0:
        project.get_dataset(dataset_name).compute_metrics(metric_ids=[metric_id])
        metric = dataset.get_last_metric_values()
        record_count = dataiku.ComputedMetrics.get_value_from_data(
            metric.get_global_data(metric_id=metric_id))
        logging.info("Dataset contains {:d} records and is not partitioned".format(record_count))
    else:
        record_count = 0
        for partition in partitions:
            project.get_dataset(dataset_name).compute_metrics(partition=partition,
                                                              metric_ids=[metric_id])
            metric = dataset.get_last_metric_values()
            record_count += dataiku.ComputedMetrics.get_value_from_data(
                metric.get_partition_data(partition=partition, metric_id=metric_id))
        logging.info("Dataset contains {:d} records in partition(s) {}".format(record_count, partitions))
    return record_count
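
# A minimal usage sketch from a recipe or notebook, assuming a dataset named
# "customers" exists in the current project (the name is illustrative only).
customers = dataiku.Dataset("customers")
print("customers holds {} records".format(count_records(customers)))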
def get_token():
    # Read the existing configuration from the project variables
    dss = dataiku.api_client()
    project = dss.get_project(dataiku.default_project_key())
    variables = project.get_variables()["standard"]
    conf = variables.get("powerbi-settings", None)

    # Decrypt the stored credentials with the API key passed in the request
    key = request.args.get("api-key")
    pbi = {}
    pbi["username"] = conf["username"]
    pbi["password"] = decrypt_string(conf["password"], key)
    pbi["client_id"] = conf["client_id"]
    pbi["client_secret"] = decrypt_string(conf["client_secret"], key)
    pbi["resource"] = conf["resource"]
    pbi["grant_type"] = conf["grant_type"]
    pbi["scope"] = conf["scope"]

    # Get the token
    response = requests.post(
        'https://login.microsoftonline.com/common/oauth2/token', data=pbi)
    o = {}
    o["token"] = response.json().get("access_token")
    return json.dumps(o)
def __init__(self, project_key, config, plugin_config):
    """
    :param project_key: the project in which the runnable executes
    :param config: the dict of the configuration of the object
    :param plugin_config: contains the plugin settings
    """
    self.project_key = project_key
    self.config = config
    self.plugin_config = plugin_config
    self.Helm = Helm()
    self.client = dataiku.api_client()
    cluster = self.client.get_cluster(self.config.get('cluster'))
    self.kubeconfig = cluster.get_settings().get_raw()['data']['kube_config_path']
    cluster_endpoint = cluster.get_settings().get_raw()['data']['cluster']['Endpoint']
    self.cluster_region = re.findall(r'((?:\w+-)+\w+)', cluster_endpoint)[0]
    kubernetes.config.load_kube_config(config_file=self.kubeconfig)
def get_snowflake_datasets():
    project_key = default_project_key()
    project = api_client().get_project(project_key)
    return [
        dataset for dataset in project.list_datasets()
        if dataset.type == 'Snowflake'
    ]
def __init__(self, gds_name, devops_team):
    # Client object for this class to use
    self.__client = dataiku.api_client()
    # Validate user is allowed to do this
    self.gds_name = gds_name
    self.devops_team = devops_team
def do(payload, config, plugin_config, inputs): if "method" not in payload: return {} client = dataiku.api_client() if payload["method"] == "get-valid-csv-filenames": required_columns = ["id", "className"] sep = "," # Retrieving model folder model_folder_full_name = [ inp for inp in inputs if inp["role"] == "modelFolder" ][0]["fullName"] model_folder = dataiku.Folder(model_folder_full_name).get_path() csv_files_root_mf = glob.glob(model_folder + "/*.csv") # Filtering out files without required columns csv_valid_filenames = [] for f in csv_files_root_mf: schema = retrieve_schema_from_pandas_compatible_csv_file(f, sep) if len([col for col in required_columns if col not in schema]) == 0: valid_file = {"path": f, "name": os.path.basename(f)} csv_valid_filenames.append(valid_file) return {"csv_valid_filenames": csv_valid_filenames}
def get_folder_partition_root(folder, is_input=False):
    """Retrieve the partition root path using a dataiku.Folder.

    Args:
        folder (dataiku.Folder): Input or output folder of the recipe used to retrieve the partition path pattern.
        is_input (bool): True if the folder must be considered as an input, False if it is an output.

    Returns:
        Partition path, or an empty string if the folder is not partitioned.
    """
    folder_id = folder.get_id()
    input_id = folder_id if is_input else None
    dku_flow_variables = dataiku.get_flow_variables()
    client = dataiku.api_client()
    project = client.get_project(dataiku.default_project_key())
    folder = project.get_managed_folder(folder_id)
    folder_config = folder.get_definition()
    partitioning_config = folder_config.get("partitioning")
    if not partitioning_config:
        return ""
    file_path_pattern = partitioning_config.get("filePathPattern", None)
    dimensions, types = get_dimensions(partitioning_config)
    partitions = get_partitions(dku_flow_variables, dimensions, input_id=input_id)
    file_path = complete_file_path_pattern(file_path_pattern, partitions, dimensions, types)
    file_path = complete_file_path_time_pattern(dku_flow_variables, file_path, input_id=input_id)
    return file_path
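
# A usage sketch inside a partitioned recipe, assuming an output folder named
# "reports_folder" (illustrative name); files are written below the resolved
# partition root so each partition stays isolated in the managed folder.
output_folder = dataiku.Folder("reports_folder")
partition_root = get_folder_partition_root(output_folder, is_input=False)
output_folder.upload_data(partition_root + "/report.json", b"{}")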
def build_scenario(build_plan,
                   filter_on='ready',
                   connection='dataiku_workspace',
                   ref_table='referentialclient',
                   ref_project='DIReferential',
                   add_ecrm_context=True,
                   finish_on_client=None,
                   single_client=None):
    scenario = Scenario()

    if not isinstance(filter_on, list):
        filter_on = [filter_on]

    project_name = dataiku.default_project_key()
    project = dataiku.api_client().get_project(project_name)
    local_variables = project.get_variables()['local']
    env = local_variables['env']

    kut.display_message('reading client context referential')
    executor = SQLExecutor2(connection=connection)
    sql_query_referential_client = "SELECT * FROM " + '_'.join([env, ref_project, ref_table])
    client_ref = executor.query_to_df(sql_query_referential_client)
    filter_query = ' & '.join(filter_on)
    client_ref = client_ref.query(filter_query) if filter_query else client_ref
    kut.display_message('Clients ready for automation: ' + ', '.join(client_ref.clientName.unique()))

    kut.display_message('run configuration')
    print(build_plan)

    if not pd.isnull(finish_on_client):
        finish_client = client_ref[client_ref.clientName == finish_on_client]
        if len(finish_client) == 0:
            kut.display_message('finish client not found in plan: ' + finish_on_client
                                + ' - is the client name valid?')
        # Move the finish client to the end of the build order
        other_clients = client_ref[client_ref.clientName != finish_on_client]
        client_ref = pd.concat([other_clients, finish_client], ignore_index=True)

    success = []
    if single_client is not None:
        requested_client = client_ref[client_ref.clientName == single_client]
        if not len(requested_client):
            kut.display_message('requested single client is not found, building all allowed clients')
        else:
            client_ref = requested_client

    for index, client_row in client_ref.iterrows():
        variables = set_client_context(client_row=client_row,
                                       add_ecrm_context=add_ecrm_context,
                                       connection=connection)
        client_name = variables['local']['clientName']
        kut.display_message('starting builds on ' + client_name)
        run_scenario(table_plan=build_plan, scenario=scenario)
        success.append(client_name)
        scenario.set_global_variables(successfullRun=success)
        print('done_________________' + client_name)
    return success
def __init__(self, project_key, config):
    self.project_key = project_key
    self.azure_ad_connection = config.get("azure_ad_connection", {})
    self.flag_simulate = config.get("flag_simulate")
    self.auth_method = self.azure_ad_connection.get("auth_method")

    # Read the group configuration data from DSS
    self.groups_dataset = config.get("groups_dataset", None)
    if not self.groups_dataset:
        raise Exception("No groups dataset has been selected.")
    groups_dataset_handle = dataiku.Dataset(self.groups_dataset, self.project_key)
    self.groups_df = groups_dataset_handle.get_dataframe()

    self.client = dataiku.api_client()
    self.run_user = self.client.get_auth_info()["authIdentifier"]
    self.session = requests.Session()

    # Initialize a dataframe that will contain log data
    self.log_df = pd.DataFrame(columns=["date", "user", "type", "message"])

    # Configure auth method
    self.required_credentials = self.get_required_credentials(
        self.azure_ad_connection.get("auth_method"))

    # Read credentials
    if self.azure_ad_connection.get("flag_user_credentials"):
        self.credentials = self.get_credentials("user")
    else:
        self.credentials = self.get_credentials("parameters")

    # Connect to Graph API
    self.set_session_headers()
def asterDo():
    # Recipe inputs
    main_input_name = get_input_names_for_role('main')[0]
    input_dataset = dataiku.Dataset(main_input_name)

    # Recipe outputs
    main_output_name = get_output_names_for_role('main')[0]
    output_dataset = dataiku.Dataset(main_output_name)

    # Recipe function param
    dss_function = get_recipe_config().get('function', None)

    # Dataiku DSS params
    client = dataiku.api_client()
    projectkey = main_input_name.split('.')[0]
    project = client.get_project(projectkey)

    # Output dataset
    try:
        outputTable = outputtableinfo(output_dataset.get_location_info()['info'],
                                      main_output_name,
                                      get_recipe_config() or {})
    except Exception as error:
        raise RuntimeError(
            """Error obtaining connection settings for output table."""
            """ Make sure connection setting is set to 'Read a database table'."""
            """ This plugin only supports Aster tables.""")

    # Input datasets
    try:
        main_input_names = get_input_names_for_role('main')
        inputTables = []
        for inputname in main_input_names:
            inconnectioninfo = dataiku.Dataset(inputname).get_location_info()['info']
            inTable = inputtableinfo(inconnectioninfo, inputname, dss_function)
            inputTables.append(inTable)
    except Exception as error:
        raise RuntimeError(
            """Error obtaining connection settings from one of the input tables."""
            """ Make sure connection setting is set to 'Read a database table'."""
            """ This plugin only supports Aster tables.""")

    # Actual query
    query = getFunctionsQuery(dss_function, inputTables, outputTable)
    print('\n'.join(query))
    executor = SQLExecutor2(dataset=input_dataset)
    if dss_function.get('dropIfExists', False):
        dropAllQuery = getDropOutputTableArgumentsStatements(dss_function.get('arguments', []))
        executor.query_to_df('END TRANSACTION;', pre_queries=dropAllQuery)

    executor.query_to_df("END TRANSACTION;", pre_queries=query)

    # Write the output table schema
    nQuery = '''SELECT * FROM {} LIMIT (1);'''.format(outputTable.tablename)
    selectResult = executor.query_to_df(nQuery)
    output_schema = []
    for column in selectResult.columns:
        output_schema.append({"name": column, "type": "string"})
    output_dataset.write_schema(output_schema)
def create_sync_recipe_from_dataset(project_key, recipe_name, inp_dataset_name,
                                    out_dataset_name, connection):
    client = dataiku.api_client()
    prj = dataikuapi.dss.project.DSSProject(client, project_key)
    # r = SyncRecipeCreator(recipe_name, prj).with_input(inp_dataset_name).\
    r = dataikuapi.dss.recipe.SingleOutputRecipeCreator('shaker', recipe_name, prj) \
        .with_input(inp_dataset_name) \
        .with_new_output(out_dataset_name, connection, format_option_id='PARQUET_HIVE') \
        .build()
    return r
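
# A usage sketch with hypothetical project, dataset and connection names.
create_sync_recipe_from_dataset(project_key="MYPROJECT",
                                recipe_name="compute_orders_prepared",
                                inp_dataset_name="orders_raw",
                                out_dataset_name="orders_prepared",
                                connection="hdfs_managed")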
def get_cluster_from_dss_cluster(dss_cluster_id):
    # Get the public API client
    client = dataiku.api_client()

    # Check that the cluster exists in DSS
    found = False
    for c in client.list_clusters():
        if c['name'] == dss_cluster_id:
            found = True
    if not found:
        raise Exception("DSS cluster %s doesn't exist" % dss_cluster_id)
    dss_cluster = client.get_cluster(dss_cluster_id)

    # Get the settings in it
    dss_cluster_settings = dss_cluster.get_settings()
    dss_cluster_config = dss_cluster_settings.get_raw()['params']['config']

    # Resolve, since we get the config with the raw preset setup
    dss_cluster_config = backend_json_call(
        'plugins/get-resolved-settings',
        data={
            'elementConfig': json.dumps(dss_cluster_config),
            'elementType': dss_cluster_settings.get_raw()['type']
        })
    logging.info("Resolved cluster config: %s" % json.dumps(dss_cluster_config))

    # Build the helper class from the cluster settings (the macro doesn't have the params)
    clusters = get_cluster_from_connection_info(
        dss_cluster_config['config']['connectionInfo'],
        dss_cluster_config['pluginConfig']['connectionInfo'])

    cluster_data = dss_cluster_settings.get_plugin_data()

    return cluster_data, clusters, dss_cluster_settings, dss_cluster_config
def list_datasets():
    project_key = dataiku.default_project_key()
    client = dataiku.api_client()
    project = client.get_project(project_key)
    dataset_list = [{
        "name": dataset_dict['name']
    } for dataset_dict in project.list_datasets()]
    return json.dumps({'dataset_list': dataset_list})
def __init__(self, project_key, config, plugin_config):
    self.project_key = project_key
    self.config = config
    self.plugin_config = plugin_config
    self.client = dataiku.api_client()
    if self.config.get('all_projects'):
        self.projects = self.client.list_project_keys()
    else:
        self.projects = [self.project_key]
def test_recipe(spark_session, scenario, src_project_key, src_recipe_key,
                testbench_project_key, test_params):
    # Store datasets in Dataiku, not parquet
    context.set("BIRGITTA_DATASET_STORAGE", "DATAIKU")
    context.set("BIRGITTA_S3_BUCKET", "birgitta_s3_bucket")
    print('####################################################')
    print('Test recipe: %s (in project %s)' % (src_recipe_key, src_project_key))
    if src_project_key == testbench_project_key:
        raise ValueError('Cannot clone recipe to same project as src project')
    print('Clone dataset schemas')
    schemas = test_params['schemas']
    client = dataiku.api_client()
    cloned_input_datasets = schemas['inputs'].keys()
    cloned_input_datasets = clone_schemas(client, src_project_key,
                                          testbench_project_key,
                                          cloned_input_datasets, 'Inline')
    cloned_output_datasets = schemas['outputs'].keys()
    cloned_output_datasets = clone_schemas(client, src_project_key,
                                           testbench_project_key,
                                           cloned_output_datasets, 'HDFS')
    expected_output_datasets = create_expected_output_schemas(
        client, src_project_key, testbench_project_key, cloned_output_datasets)
    print('Clone recipe')
    recipe_manage.clone(client, src_project_key, src_recipe_key,
                        testbench_project_key, test_name(src_recipe_key),
                        cloned_input_datasets, cloned_output_datasets)
    test_cases = test_params['test_cases']
    for test_case in test_cases:
        print('Setup test case: ' + test_case['name'])
        print('Empty and fill datasets with fixtures')
        empty_and_fill_datasets(testbench_project_key,
                                cloned_input_datasets,
                                schemas['inputs'],
                                test_case['inputs'])
        empty_and_fill_datasets(testbench_project_key,
                                cloned_output_datasets,
                                schemas['outputs'],
                                False)  # empty dataset
        empty_and_fill_datasets(testbench_project_key,
                                expected_output_datasets,
                                expected_params(schemas['outputs']),
                                expected_params(test_case['outputs']))
        print('Run recipe')
        testbench_output_dataset_key = test_params['principal_output_dataset']
        scenario.build_dataset(dataset_name=testbench_output_dataset_key,
                               project_key=testbench_project_key)
        print('Validate output')
        for dataset_name in test_case['outputs']:
            print('Validate output dataset: %s' % dataset_name)
            validate.datasets(spark_session,
                              dataset_name,
                              expected_name(dataset_name),
                              testbench_project_key)
            print('Successfully validated output dataset: %s' % dataset_name)
    print('Delete testbench recipe TODO')
    print('Delete datasets TODO')
    print('Tests successful')
def set_schema(self, dataset_name, dku_dataset, schema, project_key):
    dataiku_schema = dkuschema.to_dataiku(schema)
    client = dataiku.api_client()
    if not project_key:
        project_key = dku_dataset.get_config()['projectKey']
    project = dataikuapi.dss.project.DSSProject(client, project_key)
    dapi_dataset = project.get_dataset(dataset_name)
    ret = dapi_dataset.set_schema(dataiku_schema)
    print(f"dataset.set_schema() for {dataset_name}", repr(ret))
def __init__(self, project_key, config, plugin_config): """ :param project_key: the project in which the runnable executes :param config: the dict of the configuration of the object :param plugin_config: contains the plugin settings """ self.project_key = project_key self.config = config self.plugin_config = plugin_config self.client = dataiku.api_client()
def do(payload, config, plugin_config, inputs):
    if 'method' not in payload:
        return {}
    client = dataiku.api_client()
    if payload['method'] == 'get-fetching-functions':
        return get_fetching_functions()
    return {}
def run(self, progress_callback): dss_cluster = dataiku.api_client().get_cluster( self.config["dss_cluster_id"]) settings = dss_cluster.get_settings() (client, cluster_id) = dku_dataproc.get_client_and_wait(settings) client.scaleCluster( cluster_id, self.config["regular_worker_instances"], numberOfSecondaryInstance=self.config["spot_worker_instances"]) return "Done"
def get_sensitive_data():
    headers = dict(request.headers)

    # Get the auth info of the user performing the request
    auth_info = dataiku.api_client().get_auth_info_from_browser_headers(headers)
    print("User doing the query is %s" % auth_info["authIdentifier"])

    # If the user's group is not TRUSTED_GROUP, raise an exception
    if TRUSTED_GROUP not in auth_info["groups"]:
        raise Exception("You do not belong here, go away")
    else:
        data = {"status": "ok", "you_are": auth_info["authIdentifier"]}
        return json.dumps(data)
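
# A possible way to expose this endpoint in a Dataiku webapp backend, where a
# Flask `app` object is already in scope; the route path is illustrative.
@app.route('/sensitive-data')
def sensitive_data_endpoint():
    return get_sensitive_data()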
def do(payload, config, plugin_config, inputs): """ Create list of options of projects in instance. """ projects = dataiku.api_client().list_projects() choices = [] for project in projects: choices.append({ "value": project.get('projectKey'), "label": project.get('projectKey') }) return {"choices": choices}
def run(self, progress_callback): dss_cluster = dataiku.api_client().get_cluster( self.config["dss_cluster_id"]) settings = dss_cluster.get_settings() (client, emr_cluster_id) = dku_emr.get_client_and_wait(settings) logging.info("retrieving instance groups") instance_groups = client.list_instance_groups(ClusterId=emr_cluster_id) logging.info("retrieving master instance") master_instances = client.list_instances(ClusterId=emr_cluster_id, InstanceGroupTypes=['MASTER'], InstanceStates=[ 'AWAITING_FULFILLMENT', 'PROVISIONING', 'BOOTSTRAPPING', 'RUNNING' ]) master_instance_info = { "privateIpAddress": master_instances['Instances'][0]["PrivateIpAddress"] } logging.info("retrieving slave instances") slave_instances = client.list_instances( ClusterId=emr_cluster_id, InstanceGroupTypes=['CORE', 'TASK'], InstanceStates=[ 'AWAITING_FULFILLMENT', 'PROVISIONING', 'BOOTSTRAPPING', 'RUNNING' ]) slave_instances_info = [{ "privateIpAddress": inst["PrivateIpAddress"] } for inst in slave_instances["Instances"]] return { "masterInstance": master_instance_info, "slaveInstances": slave_instances_info, 'instanceGroups': [{ "instanceGroupId": x["Id"], "runningInstanceCount": x["RunningInstanceCount"], "instanceType": x["InstanceType"], "instanceGroupType": x["InstanceGroupType"], "status": x["Status"]["State"] } for x in instance_groups["InstanceGroups"]] }
def __init__(self, gds_name, base_ad_group):
    # Client object for this class to use
    self.__client = dataiku.api_client()
    # Validate user is allowed to do this
    self.gds_name = AdminValidator(self.__client, gds_name)
    assert (base_ad_group in
            self.__client.get_general_settings().settings['ldapSettings']['authorizedGroups']
            ), 'Whitelist the AD group first / check for AD group typos.'
    self.base_ad_group = base_ad_group
    self.devops_team = base_ad_group.replace('GEN-ZZ-APP-GG-ai-', '').replace('-', '_')
def get_emr_cluster_info(dss_cluster_id):
    """Returns a dictionary containing info about the EMR cluster used by a project."""
    # client = client or dataiku.api_client()
    # cluster_info_macro = proj.get_macro("pyrunnable_emr-clusters_get-cluster-info")
    # dss_cluster_info = {'dss_cluster_id': dss_cluster_id}
    # response = cluster_info_macro.get_result(cluster_info_macro.run(params=dss_cluster_info))
    cluster = dataiku.api_client().get_cluster(dss_cluster_id)
    cluster_settings = cluster.get_settings().settings
    emr_cluster_id = cluster_settings['data']['emrClusterId']
    aws_region_id = cluster_settings['params']['config']['awsRegionId']

    boto_client = boto3.client('emr', region_name=aws_region_id)
    instance_groups = {
        g['Id']: {}
        for g in boto_client.list_instance_groups(ClusterId=emr_cluster_id)['InstanceGroups']
    }
    dss_cluster_info = {
        'dss_cluster_id': dss_cluster_id,
        'emr_cluster_id': emr_cluster_id,
        'instance_groups': instance_groups
    }

    logging.info("retrieving master instance")
    master_instances = boto_client.list_instances(
        ClusterId=emr_cluster_id,
        InstanceGroupTypes=['MASTER'],
        InstanceStates=['AWAITING_FULFILLMENT', 'PROVISIONING', 'BOOTSTRAPPING', 'RUNNING'])
    # NB: master_instance_info is computed but not included in the returned dict
    master_instance_info = {
        "privateIpAddress": master_instances['Instances'][0]["PublicIpAddress"]
    }

    # if cluster_mgr_project_key is not None:
    #     cluster_mgr_info = client.get_project(cluster_mgr_project_key).get_variables(
    #     )['standard']['emr']['clusters'].get(dss_cluster_id)
    #     if cluster_mgr_info is not None:
    #         instance_groups = cluster_mgr_info['instance_groups']
    #         for dss_grp_info in dss_cluster_info['instanceGroups']:
    #             grp_id = dss_grp_info['instanceGroupId']
    #             cluster_mgr_grp_info = instance_groups.get(grp_id)
    #             if cluster_mgr_grp_info:
    #                 dss_grp_info.update(
    #                     {'resizable': cluster_mgr_grp_info.get('resizable', False)})

    return dss_cluster_info
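
# A usage sketch with a hypothetical DSS cluster id: print the backing EMR
# cluster id and its instance group ids.
info = get_emr_cluster_info("my-emr-cluster")
print(info['emr_cluster_id'])
for group_id in info['instance_groups']:
    print(group_id)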
def create_parquet_dataset(project_key, dataset_name, connection_name,
                           hive_db='default', hive_tbl='default', dataset_path='default'):
    client = dataiku.api_client()
    prj = client.get_project(project_key)

    ds_formatparams = {
        'parquetBlockSizeMB': 128,
        'parquetCompressionMethod': 'SNAPPY',
        'parquetFlavor': 'HIVE',
        'parquetLowerCaseIdentifiers': False,
        'representsNullFields': False
    }

    if hive_db == 'default':
        # Use the default settings: the database is stored in ${hive_db_<CONN>}
        meta_sync = True
        hive_database = '${hive_db_' + connection_name + '}'
        if hive_tbl == 'default':
            hive_table_name = '${projectKey}_' + dataset_name
        else:
            hive_table_name = hive_tbl
    elif hive_db is None:
        # No metastore synchronization
        meta_sync = False
        hive_database = None
        hive_table_name = None
    else:
        # Explicit Hive database name
        meta_sync = True
        hive_database = hive_db
        hive_table_name = '${projectKey}_' + dataset_name if hive_tbl == 'default' else hive_tbl

    if dataset_path == 'default':
        path = '/${projectKey}/' + dataset_name
    else:
        path = dataset_path

    ds_params = {
        'path': path,
        'connection': connection_name,
        'notReadyIfEmpty': False,
        'metastoreSynchronizationEnabled': meta_sync,
        'hiveDatabase': hive_database,
        'hiveTableName': hive_table_name,
        'filesSelectionRules': {
            'excludeRules': [],
            'explicitFiles': [],
            'mode': 'ALL',
            'includeRules': []
        },
        'timeout': 10000
    }

    return prj.create_dataset(dataset_name, 'HDFS',
                              params=ds_params,
                              formatType='parquet',
                              formatParams=ds_formatparams)
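
# A usage sketch with hypothetical project, dataset and connection names; the
# returned handle is a dataikuapi dataset object that can then be built.
new_dataset = create_parquet_dataset(project_key="MYPROJECT",
                                     dataset_name="orders_parquet",
                                     connection_name="hdfs_managed")
print(new_dataset.get_definition()['type'])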
def pytest_generate_tests(metafunc):
    if "scenario_id" in metafunc.fixturenames:
        p_host = metafunc.config.getoption('--host')
        p_api = metafunc.config.getoption('--api')
        p_project = metafunc.config.getoption('--project')
        dataiku.set_remote_dss(p_host, p_api)
        client = dataiku.api_client()
        project = client.get_project(p_project)
        list_scenarios = []
        for scenario in project.list_scenarios():
            if scenario["id"].startswith("TEST_"):
                print("Adding scenario to test:", scenario["id"])
                list_scenarios.append(scenario["id"])
        metafunc.parametrize("scenario_id", list_scenarios)
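
# A minimal companion test, assuming the same --project option is passed to
# pytest; run_and_wait() is expected to raise if the scenario run fails.
def test_scenario(scenario_id, request):
    client = dataiku.api_client()
    project = client.get_project(request.config.getoption('--project'))
    scenario = project.get_scenario(scenario_id)
    scenario.run_and_wait()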
def __init__(self, folder_name, host="127.0.0.1", verbosity=logging.WARN):
    Thread.__init__(self)
    self.project_key = os.environ["DKU_CURRENT_PROJECT_KEY"]
    self.folder_name = folder_name
    self.client = dataiku.api_client()
    logging.set_verbosity(verbosity)

    # Getting the app serving the logs stored in the managed folder
    logs_path = self.__get_logs_path()
    app = self.__get_tb_app(logs_path)

    # Setting up the server on a random free port
    self.srv = make_server(host, 0, app)