Example #1
def do(payload, config, plugin_config, inputs):
    if payload["funtastic"] == "engines":
        client = dataiku.api_client()
        project = client.get_project(dataiku.default_project_key())
        engines = list(project.get_settings().get_raw()['metrics']['engineConfig'].keys())
        return {'engines': engines}

    if payload["funtastic"] == "connections":
        client = dataiku.api_client()
        connections = list(client.list_connections().keys())
        return {'connections': connections}
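A minimal sketch of how this parameter helper could be exercised from a DSS notebook; the "funtastic" payload key comes from the example above, while the empty config, plugin_config and inputs are assumptions.

# Hypothetical call, assuming an authenticated DSS environment where do() is importable
payload = {"funtastic": "connections"}
result = do(payload, config={}, plugin_config={}, inputs=[])
print(result.get("connections", []))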
Example #2
def get_dataset_flow():
    client = dataiku.api_client()
    project_key = dataiku.default_project_key()
    project = client.get_project(project_key)
    datasets = project.list_datasets()
    dataset_names = [dataset["name"] for dataset in datasets]
    return json.dumps({"dataset_names": dataset_names})
Example #3
    def run(self, progress_callback):
        # You should really comment your code better

        # get remote key and url
        remote_url = self.config.get('remote_url')
        remote_key = self.config.get('remote_api')
        project_key = self.project_key

        # get local and remote wikis for current project
        rp = dataikuapi.DSSClient(remote_url,
                                  api_key=remote_key).get_project(project_key)
        cp = dataiku.api_client().get_project(project_key)

        local_wiki = cp.get_wiki()
        remote_wiki = rp.get_wiki()

        # replace or create new articles in the project wikis
        for l_article in local_wiki.list_articles():
            for r_article in remote_wiki.list_articles():
                if l_article.article_id == r_article.article_id:
                    r_article.delete()
            remote_wiki.create_article(l_article.article_id,
                                       content=l_article.get_data().get_body())

        return '<body>Wiki Updated on instance running at: ' + remote_url + '</body>'
Example #4
def count_records(dataset: dataiku.Dataset) -> int:
    """
    Count the number of records of a dataset using the Dataiku dataset metrics API
    """
    metric_id = "records:COUNT_RECORDS"
    dataset_name = dataset.name.split(".")[1]
    partitions = dataset.read_partitions
    client = dataiku.api_client()
    project = client.get_project(dataiku.default_project_key())
    logging.info("Counting records of dataset: {}".format(dataset_name))
    if partitions is None or len(partitions) == 0:
        project.get_dataset(dataset_name).compute_metrics(metric_ids=[metric_id])
        metric = dataset.get_last_metric_values()
        record_count = dataiku.ComputedMetrics.get_value_from_data(metric.get_global_data(metric_id=metric_id))
        logging.info("Dataset contains {:d} records and is not partitioned".format(record_count))
    else:
        record_count = 0
        for partition in partitions:
            project.get_dataset(dataset_name).compute_metrics(partition=partition, metric_ids=[metric_id])
            metric = dataset.get_last_metric_values()
            record_count += dataiku.ComputedMetrics.get_value_from_data(
                metric.get_partition_data(partition=partition, metric_id=metric_id)
            )
        logging.info("Dataset contains {:d} records in partition(s) {}".format(record_count, partitions))
    return record_count
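A short usage sketch for count_records(); the dataset name "my_dataset" is an assumption and the call has to run inside a DSS recipe or notebook.

import dataiku

# Hypothetical dataset name; triggers the records:COUNT_RECORDS metric computation
dataset = dataiku.Dataset("my_dataset")
print("record count:", count_records(dataset))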
Example #5
def get_token():

    # Read in the existing conf
    dss = dataiku.api_client()
    project = dss.get_project(dataiku.default_project_key())
    variables = project.get_variables()["standard"]
    conf = variables.get("powerbi-settings", None)

    # Decrypt
    key = request.args.get("api-key")
    pbi = {}
    pbi["username"] = conf["username"]
    pbi["password"] = decrypt_string(conf["password"], key)
    pbi["client_id"] = conf["client_id"]
    pbi["client_secret"] = decrypt_string(conf["client_secret"], key)
    pbi["resource"] = conf["resource"]
    pbi["grant_type"] = conf["grant_type"]
    pbi["scope"] = conf["scope"]

    # Get the token
    response = requests.post(
        'https://login.microsoftonline.com/common/oauth2/token', data=pbi)
    o = {}
    o["token"] = response.json().get("access_token")

    return json.dumps(o)
Example #6
    def __init__(self, project_key, config, plugin_config):
        """
        :param project_key: the project in which the runnable executes
        :param config: the dict of the configuration of the object
        :param plugin_config: contains the plugin settings
        """
        self.project_key = project_key
        self.config = config
        self.plugin_config = plugin_config

        self.Helm = Helm()

        self.client = dataiku.api_client()

        cluster = self.client.get_cluster(self.config.get('cluster'))

        cluster_settings = cluster.get_settings().get_raw()
        self.kubeconfig = cluster_settings['data']['kube_config_path']
        cluster_endpoint = cluster_settings['data']['cluster']['Endpoint']

        self.cluster_region = re.findall(r'((?:\w+-)+\w+)',
                                         cluster_endpoint)[0]

        kubernetes.config.load_kube_config(config_file=self.kubeconfig)
Example #7
def get_snowflake_datasets():
    project_key = default_project_key()
    project = api_client().get_project(project_key)
    return [
        dataset for dataset in project.list_datasets()
        if dataset.type == 'Snowflake'
    ]
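A sketch of consuming the result; it assumes the list items expose a name field, just as they expose the type field used above.

# Print the names of all Snowflake datasets in the current project
for dataset in get_snowflake_datasets():
    print(dataset.name)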
Example #8
    def __init__(self, gds_name, devops_team):
        #client object for this class to use
        self.__client = dataiku.api_client()

        #Validate user is allowed to do this
        self.gds_name = gds_name
        self.devops_team = devops_team
Example #9
def do(payload, config, plugin_config, inputs):
    if "method" not in payload:
        return {}

    client = dataiku.api_client()

    if payload["method"] == "get-valid-csv-filenames":

        required_columns = ["id", "className"]
        sep = ","

        # Retrieving model folder
        model_folder_full_name = [
            inp for inp in inputs if inp["role"] == "modelFolder"
        ][0]["fullName"]
        model_folder = dataiku.Folder(model_folder_full_name).get_path()

        csv_files_root_mf = glob.glob(model_folder + "/*.csv")

        # Filtering out files without required columns
        csv_valid_filenames = []
        for f in csv_files_root_mf:
            schema = retrieve_schema_from_pandas_compatible_csv_file(f, sep)
            if len([col for col in required_columns
                    if col not in schema]) == 0:
                valid_file = {"path": f, "name": os.path.basename(f)}
                csv_valid_filenames.append(valid_file)

    return {"csv_valid_filenames": csv_valid_filenames}
Example #10
def get_folder_partition_root(folder, is_input=False):
    """Retrieve the partition root path using a dataiku.Folder.

    Args:
        folder (dataiku.Folder): Input or output folder of the recipe used to retrieve the partition path pattern.
        is_input (bool): True if the folder should be treated as an input, False if as an output.

    Returns:
        Partition path, or an empty string if the folder is not partitioned.
    """
    folder_id = folder.get_id()
    input_id = folder_id if is_input else None
    dku_flow_variables = dataiku.get_flow_variables()
    client = dataiku.api_client()
    project = client.get_project(dataiku.default_project_key())
    folder = project.get_managed_folder(folder_id)
    folder_config = folder.get_definition()
    partitioning_config = folder_config.get("partitioning")
    if not partitioning_config:
        return ""
    file_path_pattern = partitioning_config.get("filePathPattern", None)
    dimensions, types = get_dimensions(partitioning_config)
    partitions = get_partitions(dku_flow_variables,
                                dimensions,
                                input_id=input_id)
    file_path = complete_file_path_pattern(file_path_pattern, partitions,
                                           dimensions, types)
    file_path = complete_file_path_time_pattern(dku_flow_variables,
                                                file_path,
                                                input_id=input_id)
    return file_path
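A usage sketch under assumed names: an output managed folder called "output_folder" in a partitioned recipe.

import dataiku

# Hypothetical folder name; returns "" when the folder is not partitioned
output_folder = dataiku.Folder("output_folder")
partition_root = get_folder_partition_root(output_folder, is_input=False)
print("Writing files under:", partition_root)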
Example #11
def build_scenario(build_plan,
                   filter_on='ready',
                   connection='dataiku_workspace',
                   ref_table='referentialclient',
                   ref_project='DIReferential',
                   add_ecrm_context=True,
                   finish_on_client=None,
                   single_client=None):
    scenario = Scenario()
    if not isinstance(filter_on, list):
        filter_on = [filter_on]
    project_name = dataiku.default_project_key()
    project = dataiku.api_client().get_project(project_name)
    local_variables = project.get_variables()['local']
    env = local_variables['env']
    kut.display_message('reading client context referential')

    executor = SQLExecutor2(connection=connection)
    sql_query_referential_client = "SELECT * FROM " + '_'.join(
        [env, ref_project, ref_table])
    client_ref = executor.query_to_df(sql_query_referential_client)
    filter_query = ' & '.join(filter_on)
    client_ref = client_ref.query(filter_query) if filter_query else client_ref
    kut.display_message('Clients ready for automation: ' +
                        ', '.join(client_ref.clientName.unique()))

    kut.display_message('run configuration')
    print(build_plan)

    if not pd.isnull(finish_on_client):
        finish_client = client_ref[client_ref.clientName == finish_on_client]
        if len(finish_client) == 0:
            kut.display_message(
                'finish client ' + finish_on_client +
                ' not found in plan; is the client name valid?'
            )
        other_clients = client_ref[client_ref.clientName != finish_on_client]
        client_ref = pd.concat([other_clients, finish_client],
                               ignore_index=True)
    success = []
    if single_client is not None:
        requested_client = client_ref[client_ref.clientName == single_client]
        if not len(requested_client):
            kut.display_message(
                'requested single client not found, building all allowed clients'
            )
        else:
            client_ref = requested_client
    for index, client_row in client_ref.iterrows():
        variables = set_client_context(client_row=client_row,
                                       add_ecrm_context=add_ecrm_context,
                                       connection=connection)
        client_name = variables['local']['clientName']
        kut.display_message('starting builds on ' + client_name)

        run_scenario(table_plan=build_plan, scenario=scenario)
        success.append(client_name)
        scenario.set_global_variables(successfullRun=success)
        print('done_________________' + client_name)
    return success
Example #12
    def __init__(self, project_key, config):
        self.project_key = project_key
        self.azure_ad_connection = config.get("azure_ad_connection", {})
        self.flag_simulate = config.get("flag_simulate")
        self.auth_method = self.azure_ad_connection.get("auth_method")
        # Read the group configuration data from DSS
        self.groups_dataset = config.get("groups_dataset", None)
        if not self.groups_dataset:
            raise Exception("No groups dataset has been selected.")

        groups_dataset_handle = dataiku.Dataset(self.groups_dataset, self.project_key)
        self.groups_df = groups_dataset_handle.get_dataframe()

        self.client = dataiku.api_client()
        self.run_user = self.client.get_auth_info()["authIdentifier"]
        self.session = requests.Session()

        # Initialize a dataframe that will contain log data
        self.log_df = pd.DataFrame(columns=["date", "user", "type", "message"])

        # Configure auth method
        self.required_credentials = self.get_required_credentials(
            self.azure_ad_connection.get("auth_method")
        )

        # Read credentials
        if self.azure_ad_connection.get("flag_user_credentials"):
            self.credentials = self.get_credentials("user")
        else:
            self.credentials = self.get_credentials("parameters")

        # Connect to Graph API
        self.set_session_headers()
Example #13
def asterDo():
    # Recipe inputs
    main_input_name = get_input_names_for_role('main')[0]
    input_dataset = dataiku.Dataset(main_input_name)

    # Recipe outputs
    main_output_name = get_output_names_for_role('main')[0]
    output_dataset = dataiku.Dataset(main_output_name)

    # Recipe function param
    dss_function = get_recipe_config().get('function', None)

    # Dataiku DSS params
    client = dataiku.api_client()
    projectkey = main_input_name.split('.')[0]
    project = client.get_project(projectkey)

    try:
        # output dataset
        outputTable = outputtableinfo(
            output_dataset.get_location_info()['info'], main_output_name,
            get_recipe_config() or {})
    except Exception as error:
        raise RuntimeError(
            """Error obtaining connection settings for output table."""
            """ Make sure connection setting is set to 'Read a database table'."""
            """ This plugin only supports Aster tables.""")

    # input datasets
    try:
        main_input_names = get_input_names_for_role('main')
        inputTables = []
        for inputname in main_input_names:
            inconnectioninfo = dataiku.Dataset(
                inputname).get_location_info()['info']
            inTable = inputtableinfo(inconnectioninfo, inputname, dss_function)
            inputTables.append(inTable)
    except Exception as error:
        raise RuntimeError(
            """Error obtaining connection settings from one of the input tables."""
            """ Make sure connection setting is set to 'Read a database table'."""
            """ This plugin only supports Aster tables.""")

    # actual query
    query = getFunctionsQuery(dss_function, inputTables, outputTable)
    print('\n'.join(query))
    executor = SQLExecutor2(dataset=input_dataset)
    if dss_function.get('dropIfExists', False):
        dropAllQuery = getDropOutputTableArgumentsStatements(
            dss_function.get('arguments', []))
        executor.query_to_df('END TRANSACTION;', dropAllQuery)
    executor.query_to_df("END TRANSACTION;", pre_queries=query)

    # write table schema
    nQuery = '''SELECT * FROM {} LIMIT (1);'''.format(outputTable.tablename)
    selectResult = executor.query_to_df(nQuery)
    output_schema = []
    for column in selectResult.columns:
        output_schema.append({"name": column, "type": "string"})
    output_dataset.write_schema(output_schema)
Example #14
def create_sync_recipe_from_dataset(project_key, recipe_name, inp_dataset_name,
                                    out_dataset_name, connection):
    client = dataiku.api_client()
    prj = dataikuapi.dss.project.DSSProject(client, project_key)
    # r = SyncRecipeCreator(recipe_name, prj).with_input(inp_dataset_name).\
    r = dataikuapi.dss.recipe.SingleOutputRecipeCreator('shaker', recipe_name, prj) \
        .with_input(inp_dataset_name) \
        .with_new_output(out_dataset_name, connection, format_option_id='PARQUET_HIVE') \
        .build()
Example #15
def get_cluster_from_dss_cluster(dss_cluster_id):
    # get the public API client
    client = dataiku.api_client()

    # get the cluster object in DSS
    found = False
    for c in client.list_clusters():
        if c['name'] == dss_cluster_id:
            found = True
    if not found:
        raise Exception("DSS cluster %s doesn't exist" % dss_cluster_id)
    dss_cluster = client.get_cluster(dss_cluster_id)

    # get the settings in it
    dss_cluster_settings = dss_cluster.get_settings()
    dss_cluster_config = dss_cluster_settings.get_raw()['params']['config']
    # resolve since we get the config with the raw preset setup
    dss_cluster_config = backend_json_call(
        'plugins/get-resolved-settings',
        data={
            'elementConfig': json.dumps(dss_cluster_config),
            'elementType': dss_cluster_settings.get_raw()['type']
        })
    logging.info("Resolved cluster config : %s" %
                 json.dumps(dss_cluster_config))
    # build the helper class from the cluster settings (the macro doesn't have the params)
    clusters = get_cluster_from_connection_info(
        dss_cluster_config['config']['connectionInfo'],
        dss_cluster_config['pluginConfig']['connectionInfo'])

    cluster_data = dss_cluster_settings.get_plugin_data()

    return cluster_data, clusters, dss_cluster_settings, dss_cluster_config
Example #16
def list_datasets():
    project_key = dataiku.default_project_key()
    client = dataiku.api_client()
    project = client.get_project(project_key)
    dataset_list = [{
        "name": dataset_dict['name']
    } for dataset_dict in project.list_datasets()]
    return json.dumps({'dataset_list': dataset_list})
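Since the endpoint returns a JSON string, a caller (for example a quick test in a notebook) has to decode it first; a minimal sketch:

import json

# Decode the JSON string returned by list_datasets()
names = [d["name"] for d in json.loads(list_datasets())["dataset_list"]]
print(names)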
Example #17
    def __init__(self, project_key, config, plugin_config):
        self.project_key = project_key
        self.config = config
        self.plugin_config = plugin_config
        self.client = dataiku.api_client()
        if self.config.get('all_projects'):
            self.projects = self.client.list_project_keys()
        else:
            self.projects = [self.project_key]
Example #18
def test_recipe(spark_session, scenario, src_project_key, src_recipe_key,
                testbench_project_key, test_params):
    # Trigger dataiku, not parquet
    context.set("BIRGITTA_DATASET_STORAGE", "DATAIKU")
    context.set("BIRGITTA_S3_BUCKET", "birgitta_s3_bucket")
    print('####################################################')
    print('Test recipe: %s (in project %s)' %
          (src_recipe_key, src_project_key))
    if src_project_key == testbench_project_key:
        raise ValueError('Cannot clone recipe to same project as src project')

    print('Clone dataset schemas')
    schemas = test_params['schemas']
    client = dataiku.api_client()
    cloned_input_datasets = schemas['inputs'].keys()
    cloned_input_datasets = clone_schemas(client, src_project_key,
                                          testbench_project_key,
                                          cloned_input_datasets, 'Inline')
    cloned_output_datasets = schemas['outputs'].keys()
    cloned_output_datasets = clone_schemas(client, src_project_key,
                                           testbench_project_key,
                                           cloned_output_datasets, 'HDFS')
    expected_output_datasets = create_expected_output_schemas(
        client, src_project_key, testbench_project_key, cloned_output_datasets)
    print('Clone recipe')
    recipe_manage.clone(client, src_project_key,
                        src_recipe_key, testbench_project_key,
                        test_name(src_recipe_key), cloned_input_datasets,
                        cloned_output_datasets)

    test_cases = test_params['test_cases']
    for test_case in test_cases:
        print('Setup test case: ' + test_case['name'])
        print('Empty and fill datasets with fixtures')
        empty_and_fill_datasets(testbench_project_key, cloned_input_datasets,
                                schemas['inputs'], test_case['inputs'])
        empty_and_fill_datasets(testbench_project_key, cloned_output_datasets,
                                schemas['outputs'], False)  # empty dataset
        empty_and_fill_datasets(testbench_project_key,
                                expected_output_datasets,
                                expected_params(schemas['outputs']),
                                expected_params(test_case['outputs']))
        print('Run recipe')
        testbench_output_dataset_key = test_params['principal_output_dataset']
        scenario.build_dataset(dataset_name=testbench_output_dataset_key,
                               project_key=testbench_project_key)
        print('Validate output')
        for dataset_name in test_case['outputs']:
            print('Validate output dataset: %s' % (dataset_name))
            validate.datasets(spark_session, dataset_name,
                              expected_name(dataset_name),
                              testbench_project_key)
            print('Successfully validated output dataset: %s' % (dataset_name))
    print('Delete testbench recipe TODO')
    print('Delete datasets TODO')
    print('Tests successful')
Example #19
    def set_schema(self, dataset_name, dku_dataset, schema, project_key):
        dataiku_schema = dkuschema.to_dataiku(schema)
        client = dataiku.api_client()
        if not project_key:
            project_key = dku_dataset.get_config()['projectKey']
        project = dataikuapi.dss.project.DSSProject(client, project_key)
        dapi_dataset = project.get_dataset(dataset_name)
        ret = dapi_dataset.set_schema(dataiku_schema)
        print(f"dataset.set_schema() for {dataset_name}", repr(ret))
Example #20
    def __init__(self, project_key, config, plugin_config):
        """
        :param project_key: the project in which the runnable executes
        :param config: the dict of the configuration of the object
        :param plugin_config: contains the plugin settings
        """
        self.project_key = project_key
        self.config = config
        self.plugin_config = plugin_config
        self.client = dataiku.api_client()
Example #21
def do(payload, config, plugin_config, inputs):
    if 'method' not in payload:
        return {}

    client = dataiku.api_client()

    if payload['method'] == 'get-fetching-functions':
        return get_fetching_functions()

    return {}
Example #22
    def run(self, progress_callback):
        dss_cluster = dataiku.api_client().get_cluster(
            self.config["dss_cluster_id"])
        settings = dss_cluster.get_settings()
        (client, cluster_id) = dku_dataproc.get_client_and_wait(settings)

        client.scaleCluster(
            cluster_id,
            self.config["regular_worker_instances"],
            numberOfSecondaryInstance=self.config["spot_worker_instances"])

        return "Done"
Example #23
def get_sensitive_data():
    headers = dict(request.headers)
    # Get the auth info of the user performing the request
    auth_info = dataiku.api_client().get_auth_info_from_browser_headers(
        headers)
    print("User doing the query is %s" % auth_info["authIdentifier"])
    # If the user's group is not TRUSTED_GROUP, raise an exception
    if TRUSTED_GROUP not in auth_info["groups"]:
        raise Exception("You do not belong here, go away")
    else:
        data = {"status": "ok", "you_are": auth_info["authIdentifier"]}
    return json.dumps(data)
Example #24
def do(payload, config, plugin_config, inputs):
    """
    Create list of options of projects in instance.
    """
    projects = dataiku.api_client().list_projects()
    choices = []
    for project in projects:
        choices.append({
            "value": project.get('projectKey'),
            "label": project.get('projectKey')
        })
    return {"choices": choices}
Example #25
    def run(self, progress_callback):
        dss_cluster = dataiku.api_client().get_cluster(
            self.config["dss_cluster_id"])
        settings = dss_cluster.get_settings()
        (client, emr_cluster_id) = dku_emr.get_client_and_wait(settings)

        logging.info("retrieving instance groups")
        instance_groups = client.list_instance_groups(ClusterId=emr_cluster_id)

        logging.info("retrieving master instance")
        master_instances = client.list_instances(ClusterId=emr_cluster_id,
                                                 InstanceGroupTypes=['MASTER'],
                                                 InstanceStates=[
                                                     'AWAITING_FULFILLMENT',
                                                     'PROVISIONING',
                                                     'BOOTSTRAPPING', 'RUNNING'
                                                 ])
        master_instance_info = {
            "privateIpAddress":
            master_instances['Instances'][0]["PrivateIpAddress"]
        }

        logging.info("retrieving slave instances")
        slave_instances = client.list_instances(
            ClusterId=emr_cluster_id,
            InstanceGroupTypes=['CORE', 'TASK'],
            InstanceStates=[
                'AWAITING_FULFILLMENT', 'PROVISIONING', 'BOOTSTRAPPING',
                'RUNNING'
            ])
        slave_instances_info = [{
            "privateIpAddress": inst["PrivateIpAddress"]
        } for inst in slave_instances["Instances"]]

        return {
            "masterInstance": master_instance_info,
            "slaveInstances": slave_instances_info,
            "instanceGroups": [{
                "instanceGroupId": x["Id"],
                "runningInstanceCount": x["RunningInstanceCount"],
                "instanceType": x["InstanceType"],
                "instanceGroupType": x["InstanceGroupType"],
                "status": x["Status"]["State"]
            } for x in instance_groups["InstanceGroups"]]
        }
Example #26
    def __init__(self, gds_name, base_ad_group):
        #client object for this class to use
        self.__client = dataiku.api_client()

        #Validate user is allowed to do this
        self.gds_name = AdminValidator(self.__client, gds_name)

        ldap_settings = self.__client.get_general_settings().settings['ldapSettings']
        assert base_ad_group in ldap_settings['authorizedGroups'], \
            'Whitelist the AD group first / check for AD group typos.'
        self.base_ad_group = base_ad_group
        self.devops_team = base_ad_group.replace('GEN-ZZ-APP-GG-ai-',
                                                 '').replace('-', '_')
Example #27
def get_emr_cluster_info(dss_cluster_id):
    """Returns dictionary containing info about the EMR cluster used by a project."""
    # client = client or dataiku.api_client()
    # cluster_info_macro = proj.get_macro("pyrunnable_emr-clusters_get-cluster-info")
    # dss_cluster_info = {'dss_cluster_id': dss_cluster_id}
    # response = cluster_info_macro.get_result(cluster_info_macro.run(params=dss_cluster_info))

    cluster = dataiku.api_client().get_cluster(dss_cluster_id)
    cluster_settings = cluster.get_settings().settings
    emr_cluster_id = cluster_settings['data']['emrClusterId']
    aws_region_id = cluster_settings['params']['config']['awsRegionId']

    boto_client = boto3.client('emr', region_name=aws_region_id)
    instance_groups = {
        g['Id']: {}
        for g in boto_client.list_instance_groups(
            ClusterId=emr_cluster_id)['InstanceGroups']
    }
    for gid in instance_groups.keys():
        pass

    dss_cluster_info = {
        'dss_cluster_id': dss_cluster_id,
        'emr_cluster_id': emr_cluster_id,
        'instance_groups': instance_groups
    }

    logging.info("retrieving master instance")
    master_instances = boto_client.list_instances(
        ClusterId=emr_cluster_id,
        InstanceGroupTypes=['MASTER'],
        InstanceStates=[
            'AWAITING_FULFILLMENT', 'PROVISIONING', 'BOOTSTRAPPING', 'RUNNING'
        ])
    master_instance_info = {
        "privateIpAddress": master_instances['Instances'][0]["PrivateIpAddress"]
    }

    # if cluster_mgr_project_key is not None:
    #     cluster_mgr_info = client.get_project(cluster_mgr_project_key).get_variables(
    #     )['standard']['emr']['clusters'].get(dss_cluster_id)
    #     if cluster_mgr_info is not None:
    #         instance_groups = cluster_mgr_info['instance_groups']
    #         for dss_grp_info in dss_cluster_info['instanceGroups']:
    #             grp_id = dss_grp_info['instanceGroupId']
    #             cluster_mgr_grp_info = instance_groups.get(grp_id)
    #             if cluster_mgr_grp_info:
    #                 dss_grp_info.update(
    #                     {'resizable': cluster_mgr_grp_info.get('resizable', False)}
    #                 )

    return dss_cluster_info
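A sketch of calling the helper with a hypothetical DSS cluster id; it assumes boto3 credentials are available on the node running the code.

# "emr-cluster-dev" is an assumed cluster id as declared in DSS
info = get_emr_cluster_info("emr-cluster-dev")
print(info["emr_cluster_id"], list(info["instance_groups"].keys()))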
Example #28
def create_parquet_dataset(project_key,
                           dataset_name,
                           connection_name,
                           hive_db='default',
                           hive_tbl='default',
                           dataset_path='default'):
    client = dataiku.api_client()
    prj = client.get_project(project_key)
    ds_formatparams = {
        'parquetBlockSizeMB': 128,
        'parquetCompressionMethod': 'SNAPPY',
        'parquetFlavor': 'HIVE',
        'parquetLowerCaseIdentifiers': False,
        'representsNullFields': False
    }
    if hive_db == 'default':
        # We use the default settings, the database is stored in ${hive_db_<CONN>}
        meta_sync = True
        hive_database = '${hive_db_' + connection_name + '}'
        if hive_tbl == 'default':
            hive_table_name = '${projectKey}_' + dataset_name
        else:
            hive_table_name = hive_tbl
    elif hive_db is None:
        # No Hive metastore synchronization for this dataset
        hive_database = None
        hive_table_name = None
        meta_sync = False
    else:
        # Custom Hive database name (assumed handling for non-default values)
        meta_sync = True
        hive_database = hive_db
        if hive_tbl == 'default':
            hive_table_name = '${projectKey}_' + dataset_name
        else:
            hive_table_name = hive_tbl
    if dataset_path == 'default':
        path = '/${projectKey}/' + dataset_name
    else:
        path = dataset_path
    ds_params = {
        u'path': path,
        u'connection': connection_name,
        u'notReadyIfEmpty': False,
        u'metastoreSynchronizationEnabled': meta_sync,
        u'hiveDatabase': hive_database,
        u'hiveTableName': hive_table_name,
        u'filesSelectionRules': {
            u'excludeRules': [],
            u'explicitFiles': [],
            u'mode': u'ALL',
            u'includeRules': []
        },
        u'timeout': 10000
    }
    return prj.create_dataset(dataset_name,
                              'HDFS',
                              params=ds_params,
                              formatType='parquet',
                              formatParams=ds_formatparams)
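A usage sketch; the project key, dataset name and HDFS connection name below are assumptions.

# Creates a SNAPPY-compressed Parquet dataset on a hypothetical HDFS connection
new_dataset = create_parquet_dataset("MYPROJECT", "orders_parquet", "hdfs_managed")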
Example #29
def pytest_generate_tests(metafunc):
    if "scenario_id" in metafunc.fixturenames:
        p_host = metafunc.config.getoption('--host')
        p_api = metafunc.config.getoption('--api')
        p_project = metafunc.config.getoption('--project')
        dataiku.set_remote_dss(p_host, p_api)
        client = dataiku.api_client()
        project = client.get_project(p_project)
        list_scenarios = []
        for scenario in project.list_scenarios():
            if scenario["id"].startswith("TEST_"):
                print("Adding scenario to test :", scenario["id"])
                list_scenarios.append(scenario["id"])
        metafunc.parametrize("scenario_id", list_scenarios)
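For completeness, a sketch of the conftest.py options the hook above reads; the option names follow the metafunc.config.getoption() calls, the help strings are assumptions.

def pytest_addoption(parser):
    # Command-line options consumed by pytest_generate_tests above
    parser.addoption("--host", action="store", help="Base URL of the remote DSS instance")
    parser.addoption("--api", action="store", help="DSS API key")
    parser.addoption("--project", action="store", help="Project key holding the TEST_* scenarios")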
Example #30
    def __init__(self, folder_name, host="127.0.0.1", verbosity=logging.WARN):
        Thread.__init__(self)
        self.project_key = os.environ["DKU_CURRENT_PROJECT_KEY"]
        self.folder_name = folder_name
        self.client = dataiku.api_client()

        logging.set_verbosity(verbosity)

        # Getting app
        logs_path = self.__get_logs_path()
        app = self.__get_tb_app(logs_path)

        # Setting server
        self.srv = make_server(host, 0, app)