def build_scenario(build_plan,
                   filter_on='ready',
                   connection='dataiku_workspace',
                   ref_table='referentialclient',
                   ref_project='DIReferential',
                   add_ecrm_context=True,
                   finish_on_client=None,
                   single_client=None):
    scenario = Scenario()
    if not isinstance(filter_on, list):
        filter_on = [filter_on]
    project_name = dataiku.default_project_key()
    project = dataiku.api_client().get_project(project_name)
    local_variables = project.get_variables()['local']
    env = local_variables['env']
    kut.display_message('reading client context referential')

    executor = SQLExecutor2(connection=connection)
    sql_query_referential_client = "SELECT * FROM " + '_'.join(
        [env, ref_project, ref_table])
    client_ref = executor.query_to_df(sql_query_referential_client)
    filter_query = ' & '.join(filter_on)
    client_ref = client_ref.query(filter_query) if filter_query else client_ref
    kut.display_message('Clients ready for automation: ' +
                        ', '.join(client_ref.clientName.unique()))

    kut.display_message('run configuration')
    print(build_plan)

    if not pd.isnull(finish_on_client):
        finish_client = client_ref[client_ref.clientName == finish_on_client]
        if len(finish_client) == 0:
            kut.display_message(
                'finish client ' + finish_on_client +
                ' not found in plan; is the client name valid?'
            )
        other_clients = client_ref[client_ref.clientName != finish_on_client]
        client_ref = pd.concat([other_clients, finish_client],
                               ignore_index=True)
    success = []
    if single_client is not None:
        requested_client = client_ref[client_ref.clientName == single_client]
        if not len(requested_client):
            kut.display_message(
                'requested single client not found, building all allowed clients'
            )
        else:
            client_ref = requested_client
    for index, client_row in client_ref.iterrows():
        variables = set_client_context(client_row=client_row,
                                       add_ecrm_context=add_ecrm_context,
                                       connection=connection)
        client_name = variables['local']['clientName']
        kut.display_message('starting builds on ' + client_name)

        run_scenario(table_plan=build_plan, scenario=scenario)
        success.append(client_name)
        scenario.set_global_variables(successfullRun=success)
        print('done_________________' + client_name)
    return success
Example #2
def asterDo():
    # Recipe inputs
    main_input_name = get_input_names_for_role('main')[0]
    input_dataset = dataiku.Dataset(main_input_name)

    # Recipe outputs
    main_output_name = get_output_names_for_role('main')[0]
    output_dataset = dataiku.Dataset(main_output_name)

    # Recipe function param
    dss_function = get_recipe_config().get('function', None)

    # Dataiku DSS params
    client = dataiku.api_client()
    projectkey = main_input_name.split('.')[0]
    project = client.get_project(projectkey)

    try:
        # output dataset
        outputTable = outputtableinfo(
            output_dataset.get_location_info()['info'], main_output_name,
            get_recipe_config() or {})
    except Exception as error:
        raise RuntimeError(
            """Error obtaining connection settings for output table."""
            """ Make sure connection setting is set to 'Read a database table'."""
            """ This plugin only supports Aster tables.""")

    # input datasets
    try:
        main_input_names = get_input_names_for_role('main')
        inputTables = []
        for inputname in main_input_names:
            inconnectioninfo = dataiku.Dataset(
                inputname).get_location_info()['info']
            inTable = inputtableinfo(inconnectioninfo, inputname, dss_function)
            inputTables.append(inTable)
    except Exception as error:
        raise RuntimeError(
            """Error obtaining connection settings from one of the input tables."""
            """ Make sure connection setting is set to 'Read a database table'."""
            """ This plugin only supports Aster tables.""")

    # actual query
    query = getFunctionsQuery(dss_function, inputTables, outputTable)
    print('\n'.join(query))
    executor = SQLExecutor2(dataset=input_dataset)
    if dss_function.get('dropIfExists', False):
        dropAllQuery = getDropOutputTableArgumentsStatements(
            dss_function.get('arguments', []))
        executor.query_to_df('END TRANSACTION;', pre_queries=dropAllQuery)
    executor.query_to_df("END TRANSACTION;", pre_queries=query)

    # write table schema
    nQuery = '''SELECT * FROM {} LIMIT (1);'''.format(outputTable.tablename)
    selectResult = executor.query_to_df(nQuery)
    output_schema = []
    for column in selectResult.columns:
        output_schema.append({"name": column, "type": "string"})
    output_dataset.write_schema(output_schema)
def get_sql_table(referential_name, project_key='DIReferential'):
    env = get_project_variables(scope='local')['env']
    if not project_key or project_key == 'self':
        project_key = dataiku.default_project_key()
    table_name = '_'.join([env, project_key, referential_name.lower()])
    query = 'SELECT * FROM ' + table_name
    executor = SQLExecutor2(connection='dataiku_workspace')
    return executor.query_to_df(query)
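

# Illustrative usage sketch of get_sql_table (an assumption, not from the
# original snippet): 'referentialClient' mirrors the referential table used in
# build_scenario above; substitute the referential name from your own project.
client_referential = get_sql_table('referentialClient')
print(client_referential.head())
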
def add_ECRM_context(variables, connection='dataiku_workspace'):
    print('adding ecrm context')
    project_name = dataiku.default_project_key()
    project = dataiku.api_client().get_project(project_name)
    local_variables = project.get_variables()['local']
    env = local_variables['env']
    executor = SQLExecutor2(connection=connection)
    sql_query_client_ecrm = "SELECT * FROM " + env + "_DIReferential_referentialECRMOperation"
    client_ecrm = executor.query_to_df(sql_query_client_ecrm)
    ecrm_info = client_ecrm[client_ecrm.clientName == variables['local']
                            ['clientName']]
    print('found', len(ecrm_info), 'relevant entries')
    variables['local']['ecrmOperations'] = {}
    for i, operation_row in ecrm_info.iterrows():
        operation_dict = operation_row.to_dict()
        operation_type = operation_dict['operationType']
        del operation_dict['operationType']
        variables['local']['ecrmOperations'][operation_type] = operation_dict
    return variables
def store_name_of_views(view_ids=None, silent_mode=True):
    env = nxauto.get_project_variables(scope='local')['env']
    table_name = '_'.join([env, 'DIReferential', 'referentialstorename'])
    query = 'SELECT * FROM ' + table_name
    executor = SQLExecutor2(connection='dataiku_workspace')
    store_referential = executor.query_to_df(query)
    store_referential = store_referential[['storeName', 'viewId']]
    store_referential = store_referential[~pd.isnull(store_referential.viewId)]
    if view_ids is not None:
        store_referential = kut.filter_on_values(
            df=store_referential,
            value_list=view_ids,
            column_name='viewId'
        )
    store_referential['viewId'] = store_referential.viewId.astype(int)
    store_name_dic = store_referential.set_index('viewId')['storeName'].to_dict()
    if not silent_mode:
        print(store_referential)
        print(store_referential.dtypes)
        print(store_name_dic)

    return store_name_dic
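

# Illustrative usage sketch of store_name_of_views (an assumption, not from
# the original snippet): the view ids below are placeholders for the viewId
# values you actually want to resolve to store names.
store_names = store_name_of_views(view_ids=[101, 102], silent_mode=False)
for view_id, store_name in store_names.items():
    print(view_id, '->', store_name)
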
Example #6
# -*- coding: utf-8 -*-
import dataiku
import pandas as pd, numpy as np
from dataiku import pandasutils as pdu

# Import the class that allows us to execute SQL on the Studio connections
from dataiku.core.sql import SQLExecutor2

# Read recipe inputs
sfo_prepared = dataiku.Dataset("sfo_prepared")
sfo_prepared_df = sfo_prepared.get_dataframe()


# We create an executor. We pass to it the dataset instance. This way, the
# executor knows which SQL database should be targeted
executor = SQLExecutor2(dataset=sfo_prepared)

# Get the 5 most frequent manufacturers by total landing count
# (over the whole period)
mf_manufacturers = executor.query_to_df(
    """
    select      "Aircraft Manufacturer" as manufacturer,
                sum("Landing Count") as count
            from sfo_prepared
            group by "Aircraft Manufacturer"
            order by count desc limit 5
    """)

# The "query_to_df" method returns a Pandas dataframe that
# contains the manufacturers
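# For instance (an illustrative continuation, assuming an output dataset named
# "manufacturers_top5" exists in the Flow), the dataframe can be inspected and
# written out with its schema.
print(mf_manufacturers)
manufacturers_top5 = dataiku.Dataset("manufacturers_top5")
manufacturers_top5.write_with_schema(mf_manufacturers)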
Example #7
columns_right = get_recipe_config()['columns_2']


#############################
# Original recipe
#############################

#Start the loop
joins = ['LEFT', 'RIGHT', 'INNER']
join_conds = []

for key in range(len(key_a)):
    join_cond = Expression().and_(
        Column(key_a[key], input_A_names[0]).eq_null_unsafe(
            Column(key_b[key], input_B_names[0])))
    join_conds.append(join_cond)
    
for i in joins:
    query = SelectQuery()
    query.select_from(input_A_datasets[0], alias=input_A_names[0])
    for j in columns_left:
        query.select(Column(j, input_A_names[0]), alias=j)
    for k in columns_right:
        query.select(Column(k, input_B_names[0]), alias=k)
    query.join(input_B_datasets[0], i, join_conds,
               operatorBetweenConditions=operator, alias=input_B_names[0])
    globals()['sql_' + str(i)] = toSQL(query, input_A_datasets[0])


e = SQLExecutor2()
e.exec_recipe_fragment(output_A_datasets[0], sql_LEFT)
e.exec_recipe_fragment(output_B_datasets[0], sql_RIGHT)
e.exec_recipe_fragment(output_C_datasets[0], sql_INNER)
def asterDo():
    """
    Takes the parameters set by the user from the UI, creates the query, and then executes it.
    """
    # Formatting options.
    SEP_LENGTH = 80
    SEP = "=" * SEP_LENGTH

    # Recipe inputs
    main_input_name = get_input_names_for_role('main')[0]
    input_dataset = dataiku.Dataset(main_input_name)

    # Recipe outputs
    main_output_name = get_output_names_for_role('main')[0]
    output_dataset = dataiku.Dataset(main_output_name)

    # Dataiku DSS params
    client = dataiku.api_client()
    projectkey = main_input_name.split('.')[0]
    project = client.get_project(projectkey)

    # Connection properties.
    # TODO: Handle Input and Output Table connection properties.
    properties = input_dataset.get_location_info(
        sensitive_info=True)['info'].get('connectionParams').get('properties')
    autocommit = input_dataset.get_location_info(
        sensitive_info=True)['info'].get('connectionParams').get(
            'autocommitMode')

    # SQL Executor.
    executor = SQLExecutor2(dataset=input_dataset)

    # Handle pre- and post-query additions.
    # Assume autocommit TERA mode by default.
    pre_query = None
    post_query = None

    print(SEP)
    if not autocommit:
        print("NOT AUTOCOMMIT MODE.")
        print("Assuming TERA mode.")
        pre_query = ["BEGIN TRANSACTION;"]
        post_query = ["END TRANSACTION;"]
        for prop in properties:
            if prop['name'] == 'TMODE':
                if prop['value'] == 'ANSI':
                    print("ANSI mode.")
                    pre_query = [";"]
                    post_query = ["COMMIT WORK;"]

    else:
        print("AUTOCOMMIT MODE.")
        print("No pre- and post-query.")
    print(SEP)

    # Recipe function param
    dss_function = get_recipe_config().get('function', None)
    pp = pprint.PrettyPrinter(indent=4)

    print(SEP)
    print('DSS Function:')
    pp.pprint(dss_function)
    print(SEP)

    # output dataset
    try:
        outputTable = outputtableinfo(
            output_dataset.get_location_info()['info'], main_output_name,
            get_recipe_config() or {})
    except Exception as error:
        raise RuntimeError(
            """Error obtaining connection settings for output table."""
            """ This plugin only supports Teradata tables.""")

    # input datasets
    try:
        main_input_names = get_input_names_for_role('main')
        inputTables = []
        for inputname in main_input_names:
            inconnectioninfo = dataiku.Dataset(
                inputname).get_location_info()['info']
            inTable = inputtableinfo(inconnectioninfo, inputname, dss_function)
            inputTables.append(inTable)
    except Exception as error:
        raise RuntimeError(
            """Error obtaining connection settings from one of the input tables."""
            """ This plugin only supports Teradata tables.""")

    # Handle dropping of output tables.
    if dss_function.get('dropIfExists', False):
        print("Preparing to drop tables.")
        drop_query = dropTableStatement(outputTable)

        print(SEP)
        print("DROP query:")
        print(drop_query)
        print(SEP)
        try:
            # dataiku's query_to_df's pre_query parameter seems to not work. This is a work-around to ensure that the
            # "START TRANSACTION;" block applies for non-autocommit TERA mode connections.
            if not autocommit:
                executor.query_to_df(pre_query)
            executor.query_to_df(drop_query, post_queries=post_query)
        except Exception as e:
            print(e)

        # Drop other output tables if they exist.
        drop_all_query = getDropOutputTableArgumentsStatements(
            dss_function.get('output_tables', []))

        print(SEP)
        print('Drop ALL Query:')
        print(drop_all_query)
        print(SEP)

        for drop_q in drop_all_query:
            # dataiku's query_to_df's pre_query parameter seems to not work. This is a work-around to ensure that the
            # "START TRANSACTION;" block applies for non-autocommit TERA mode connections.
            if not autocommit:
                executor.query_to_df(pre_query)
            executor.query_to_df(drop_q, post_queries=post_query)

    # CREATE query.
    create_query = getFunctionsQuery(dss_function, inputTables, outputTable,
                                     get_recipe_config() or {})
    print(SEP)
    print("CREATE query:")
    print(create_query)
    print(SEP)
    # Detect error
    try:
        # dataiku's query_to_df's pre_query parameter seems to not work. This is a work-around to ensure that the
        # "START TRANSACTION;" block applies for non-autocommit TERA mode connections.
        if not autocommit:
            executor.query_to_df(pre_query)
        executor.query_to_df(create_query, post_queries=post_query)
    except Exception as error:
        err_str = str(error)
        err_str_list = err_str.split(" ")
        # trying to shorten the error for the modal in front-end
        if len(err_str_list) > 15:
            print(SEP)
            print(error)
            print(SEP)
            new_err_str = err_str_list[:15]
            new_err_str.append("\n\n")
            new_err_str.append("...")
            new_err_str = " ".join(new_err_str)
            raise RuntimeError(new_err_str)
        else:
            raise RuntimeError(err_str)

    print('Moving results to output...')

    select_sample_query = 'SELECT * from ' + outputTable.tablename + ' SAMPLE 0'
    # dataiku's query_to_df's pre_query parameter seems to not work. This is a work-around to ensure that the
    # "START TRANSACTION;" block applies for non-autocommit TERA mode connections.
    if not autocommit:
        executor.query_to_df(pre_query)
    sel_res = executor.query_to_df(select_sample_query,
                                   post_queries=post_query)

    pythonrecipe_out = output_dataset
    pythonrecipe_out.write_schema_from_dataframe(sel_res)

    # Additional Tables
    outtables = dss_function.get('output_tables', [])
    if outtables:
        tableCounter = 1
        print('Working on output tables')
        print(get_output_names_for_role('main'))
        print(outtables)
        for table in outtables:
            if table.get('value') not in ('', None):
                # try:
                print('Table')
                print(table)
                #Need to change this to max of split in order for multiple database or no-database table inputs
                #Change below might fix issue 4 of Jan 4 2018 for Nico. For non-drop tables
                try:
                    target_table = table.get('value').split('.')[-1].strip('"')
                    main_output_name2 = list(
                        filter(
                            lambda name: name.split('.')[-1] == target_table,
                            get_output_names_for_role('main')))[0]
                except Exception as error:
                    # print(error.message)
                    raise RuntimeError(
                        'Unable to find an output dataset for ' +
                        table.get('value') +
                        '. It may not exist as an output Dataset: ' +
                        table.get('value') + "\n\nRuntime Error: " +
                        str(error))
                print('Output name 2')
                print(main_output_name2)
                output_dataset2 = dataiku.Dataset(main_output_name2)
                # print("od2 printer")
                tableNamePrefix = output_dataset2.get_location_info(
                    sensitive_info=True)['info'].get('connectionParams').get(
                        'namingRule').get('tableNameDatasetNamePrefix')
                if tableNamePrefix:
                    print('Table prefix is not empty:' + tableNamePrefix)
                # except:
                #     #Need to change this error
                #     print('Error: Dataset for' + table.get('name') + ' not found')
                #     raise Value
                customOutputTableSQL = 'SELECT * from ' + table.get(
                    'value') + ' SAMPLE 0'
                print('Working on table number:')
                print(tableCounter)
                print(customOutputTableSQL)
                # dataiku's query_to_df's pre_query parameter seems to not work. This is a work-around to ensure that the
                # "START TRANSACTION;" block applies for non-autocommit TERA mode connections.
                if not autocommit:
                    executor.query_to_df(pre_query)
                sel_res = executor.query_to_df(customOutputTableSQL,
                                               post_queries=post_query)
                # selRes = executor.query_to_df(customOutputTableSQL, pre_queries=pre_query, post_queries=post_query)
                tableCounter += 1
                pythonrecipe_out2 = output_dataset2
                pythonrecipe_out2.write_schema_from_dataframe(sel_res)
    print('Complete!')


    def __init__(self, dataset):
        self.dataset = dataset
        self.is_hdfs = 'hiveTableName' in dataset.get_config().get('params')
        self.executor = (HiveExecutor(dataset=dataset)
                         if self.is_hdfs else SQLExecutor2(dataset=dataset))
Example #10
    "date": date_columns,
    "boolean": bool_columns
}
for col in dataset.read_schema():
    dispatch.get(col["type"], str_columns).append(col["name"])

#==============================================================================
# RUN
#==============================================================================

logging.info("DSS features: %s" % dss_settings["features"])

is_hivelike = False
if dataset_config["type"] in SUPPORTED_DB:
    q = '"'
    sqlexec = SQLExecutor2(dataset=dataset)
    logging.info("Dataset config: %s" % dataset_config)
    #table = dataset_config["params"].get("table", dataset.short_name)
    table = '_'.join(dataset.name.split('.'))
elif dataset_config["type"] == "HDFS":
    q = '`'
    if use_impala and compute_distinct:
        raise ValueError("Cannot compute distinct values on Impala")
    if "IMPALA" in dss_settings["features"] and use_impala:
        sqlexec = ImpalaExecutor(dataset=dataset)
    else:
        sqlexec = HiveExecutor(dataset=dataset)
    is_hivelike = True
    table = dataset.short_name
else:
    raise Exception("Unsupported input dataset type: %s" %
                    dataset_config["type"])
Example #11
    def _execute(self, table, output_dataset):
        query = toSQL(table, dataset=output_dataset)
        logger.info(f"Executing query:\n{query}")
        sql_executor = SQLExecutor2(dataset=output_dataset)
        sql_executor.exec_recipe_fragment(output_dataset, query)
        logger.info("Done executing query!")
Example #12
def asterDo():
    # Recipe inputs
    main_input_name = get_input_names_for_role('main')[0]
    input_dataset = dataiku.Dataset(main_input_name)    

    # print('Connection info:')
    # print(input_dataset.get_location_info(sensitive_info=True)['info'])
    properties = input_dataset.get_location_info(sensitive_info=True)['info'].get('connectionParams').get('properties')
    autocommit = input_dataset.get_location_info(sensitive_info=True)['info'].get('connectionParams').get('autocommitMode')
    
    requiresTransactions = True
     
    print('I am assuming TERA mode by default')
    if autocommit:
        print('I am assuming autocommit TERA mode')
        requiresTransactions = False
        tmode = 'TERA'
        stTxn = ';'
        edTxn = ';'
    else:
        #Detected TERA
        print('I am assuming non-autocommit TERA MODE')
        tmode = 'TERA'
        stTxn = 'BEGIN TRANSACTION;'
        edTxn = 'END TRANSACTION;'

    for prop in properties:
        if prop['name'] == 'TMODE' and prop['value'] == 'TERA':
            if autocommit:
                print('I am in autocommit TERA mode')
                requiresTransactions = False
                tmode = 'TERA'
                stTxn = ';'
                edTxn = ';'
            else:
                #Detected TERA
                print('I am in TERA MODE')
                tmode = 'TERA'
                stTxn = 'BEGIN TRANSACTION;'
                edTxn = 'END TRANSACTION;'
        elif prop['name'] == 'TMODE' and prop['value'] == 'ANSI':
            #Detected ANSI
            print('I am in ANSI MODE')
            tmode = 'ANSI'
            stTxn = ';'
            edTxn = 'COMMIT WORK;'
    print(properties)
    


    # Recipe outputs
    main_output_name = get_output_names_for_role('main')[0]
    output_dataset =  dataiku.Dataset(main_output_name)
    
    # Recipe function param
    dss_function = get_recipe_config().get('function', None)
    print ("\n=================================\n")
    print ('Showing DSS Function')
    print (dss_function)
    print ("\n=================================\n")
    
    # Dataiku DSS params
    client = dataiku.api_client()
    projectkey = main_input_name.split('.')[0]
    project = client.get_project(projectkey)

    try:
        # output dataset
        outputTable = outputtableinfo(output_dataset.get_location_info()['info'], main_output_name,
                                  get_recipe_config() or {})
    except Exception as error:
        raise RuntimeError("""Error obtaining connection settings for output table."""                           
                           """ This plugin only supports Teradata tables.""")
        # raise RuntimeError("""Error obtaining connection settings for output table."""
        #                    """ Make sure connection setting is set to 'Read a database table'."""
        #                    """ This plugin only supports Aster tables.""")

    # input datasets
    try:
        main_input_names = get_input_names_for_role('main')
        inputTables = []
        for inputname in main_input_names:
            inconnectioninfo = dataiku.Dataset(inputname).get_location_info()['info']
            inTable = inputtableinfo(inconnectioninfo, inputname, dss_function)
            inputTables.append(inTable)
    except Exception as error:
        raise RuntimeError("""Error obtaining connection settings from one of the input tables."""                           
                           """ This plugin only supports Teradata tables.""")
        
    # actual query
    query = getFunctionsQuery(dss_function, inputTables, outputTable, get_recipe_config() or {})
    # print("=============================")
    # print (query)
    # print("=============================")
    # raise RuntimeError("""I Just wanted to make this execution stop: """)
    
    # Uncomment below
    executor = SQLExecutor2(dataset=input_dataset)   
    
   

    if dss_function.get('dropIfExists', False):

        # 
        dropTableStr = dropTableStatement(outputTable)
        # executor.query_to_df(dropTableStr)
        if requiresTransactions:
            print('Start transaction for drop')
            print(stTxn)
            executor.query_to_df(stTxn)

            print('End transaction for drop')
            print(edTxn)
            executor.query_to_df(edTxn,[dropTableStr])
        else:
            executor.query_to_df([dropTableStr])

        #Start transaction
        #Move to drop query and make each DROP run separately
        dropAllQuery = getDropOutputTableArgumentsStatements(dss_function.get('output_tables', []))
        print('Drop ALL Query:')
        print(dropAllQuery)        
        
        # if requiresTransactions:
            # print('Start transaction for drop')
            # print(stTxn)
            # executor.query_to_df(stTxn)
        # dropAllQuery = getDropOutputTableArgumentsStatements(dss_function.get('arguments', []))
        
        #Change dropAllQuery to string/s? or execute one drop per item in list.            
        if requiresTransactions:
            for dropTblStmt in dropAllQuery:
                print('Start transaction for drop')
                print(stTxn)
                executor.query_to_df(stTxn)

                print('End transaction for drop')
                print(edTxn)
                executor.query_to_df(edTxn,[dropTblStmt])
            # executor.query_to_df(edTxn)
        else:
            for dropTblStmt in dropAllQuery:
                executor.query_to_df([dropTblStmt])
    # executor.query_to_df("END TRANSACTION;", pre_queries=query)
    
    # write table schema ACTUAL COMMENT
    # nQuery = '''SELECT * FROM {} LIMIT (1);'''.format(outputTable.tablename)
    # selectResult = executor.query_to_df(nQuery);
    # output_schema = []
    # for column in selectResult.columns:
    #     output_schema.append({"name":column, "type":"string"})
    # output_dataset.write_schema(output_schema)
    print('\n'.join(query))
    # print(dss_function)

    # recipe_output_table = dss_function.get('recipeOutputTable', "")
    # print('recipe_output_table before IF')
    # print(recipe_output_table)
    if requiresTransactions:
        print('Start transaction for create table')
        print(stTxn)
        executor.query_to_df(stTxn)

    # Detect error
    try:
        selectResult = executor.query_to_df(query)
    except Exception as error:

        err_str = str(error)
        err_str_list = err_str.split(" ")
        # trying to shorten the error for the modal in front-end
        if len(err_str_list) > 15:
            print ("\n=================================\n")
            print (error)
            print ("\n=================================\n")
            new_err_str = err_str_list[:15]
            new_err_str.append("\n\n")
            new_err_str.append("...")
            new_err_str = " ".join(new_err_str)
            raise RuntimeError(new_err_str)
        else:
            raise RuntimeError(err_str)

    if requiresTransactions:
        print('End transaction for create table')
        print(edTxn)
        executor.query_to_df(edTxn)

    
    print('Moving results to output...')

    if requiresTransactions:
        print('Start transaction for schema building')
        print(stTxn)
        executor.query_to_df(stTxn)

    # pythonrecipe_out = output_dataset
    # pythonrecipe_out.write_with_schema(selectResult)
    customOutputTableSQL = 'SELECT * from '+ outputTable.tablename + ' SAMPLE 0'
    selRes = executor.query_to_df(customOutputTableSQL)

    pythonrecipe_out = output_dataset
    pythonrecipe_out.write_schema_from_dataframe(selRes)

    # Additional Tables
    outtables = dss_function.get('output_tables', [])
    if outtables:
        tableCounter = 1
        print('Working on output tables')
        print(get_output_names_for_role('main'))
        print(outtables)
        for table in outtables:
            if table.get('value') not in ('', None):
                # try:
                print('Table')
                print(table)
                #Need to change this to max of split in order for multiple database or no-database table inputs
                #Change below might fix issue 4 of Jan 4 2018 for Nico. For non-drop tables
                try:
                    target_table = table.get('value').split('.')[-1].strip('"')
                    main_output_name2 = list(
                        filter(
                            lambda name: name.split('.')[-1] == target_table,
                            get_output_names_for_role('main')))[0]
                except Exception as error:
                    # print(error.message)                    
                    raise RuntimeError(
                        'Unable to find an output dataset for ' +
                        table.get('value') +
                        '. It may not exist as an output Dataset: ' +
                        table.get('value') + "\n\nRuntime Error: " +
                        str(error))
                print('Output name 2')
                print(main_output_name2)
                output_dataset2 =  dataiku.Dataset(main_output_name2)   
                # print("od2 printer")
                tableNamePrefix = output_dataset2.get_location_info(sensitive_info=True)['info'].get('connectionParams').get('namingRule').get('tableNameDatasetNamePrefix')
                if tableNamePrefix:
                    print('Table prefix is not empty:' + tableNamePrefix)
                # except:
                #     #Need to change this error
                #     print('Error: Dataset for' + table.get('name') + ' not found')  
                #     raise Value              
                customOutputTableSQL = 'SELECT * from '+ table.get('value') + ' SAMPLE 0'
                print('Working on table number:')
                print(tableCounter)
                print(customOutputTableSQL)
                selRes = executor.query_to_df(customOutputTableSQL)
                tableCounter += 1
                pythonrecipe_out2 = output_dataset2
                pythonrecipe_out2.write_schema_from_dataframe(selRes)
    if requiresTransactions:
        print('End transaction for schema building')
        print(edTxn)
        executor.query_to_df(edTxn)
    print('Complete!')  


# Uncomment end
sf_output = dataiku.Dataset(get_output_names_for_role('sf_output')[0])
sf_output_config = sf_output.get_config()

sf_stage_name = get_stage_name(get_recipe_config(), get_plugin_config())
sf_table_name = get_table_name(sf_output_config)
sf_connection_name = get_connection_name(sf_output_config)
# When creating parquet, we want to prefix with "part". But when importing parquet, we don't care.
sf_location = get_hdfs_location(hdfs_input_config, sf_stage_name)

logger.info(f'SF Stage Location: {sf_location}')
logger.info(f'SF Stage: {sf_stage_name}')
logger.info(f'SF Table: {sf_table_name}')
logger.info(f'SF Connection Name: {sf_connection_name}')

# write the schema for our Snowflake table
sf_output.write_schema(hdfs_input.read_schema(), dropAndCreate=True)

# open a writer and close it so that it'll force DSS to create the table.
with sf_output.get_writer():
    pass

sql = get_hdfs_to_snowflake_query(sf_location, sf_table_name,
                                  hdfs_input.read_schema())

logger.info(f'SF Query: {sql}')

executor = SQLExecutor2(connection=sf_connection_name)
results = executor.query_to_df(sql, post_queries=['COMMIT'])

logger.info(f'COPY results: {results.to_string()}')
Example #14
        #Detected TERA
        print('I am in TERA MODE')
        tmode = 'TERA'
        stTxn = 'BEGIN TRANSACTION;'
        edTxn = 'END TRANSACTION;'

    elif prop['name'] == 'TMODE' and prop['value'] == 'ANSI':
        #Detected ANSI
        print('I am in ANSI MODE')
        tmode = 'ANSI'
        stTxn = ';'
        edTxn = 'COMMIT WORK;'

empty_table = input_A_datasets[0]
#SQL Executor
executor = SQLExecutor2(dataset=empty_table)

defaultDB = default_database()
if default_database(input_A_datasets[0]) != default_database():
    raise RuntimeError(
        'Input dataset and output dataset have different connection details')

output_location = output_A_datasets[0].get_location_info()['info']

print('Getting Function Config')
scriptAlias = function_config.get('script_alias', '')
scriptFileName = function_config.get('script_filename', '')
scriptLocation = function_config.get('script_location', '')
searchPath = default_database()
outputTable = output_A_datasets[0].get_location_info()['info'].get('table', '')
partitionOrHash = function_config.get('partitionOrHash', '')