Example #1
def model_dict_load(model_dict, config_path, config_name, verbose=True):
    """ Load the model dict from the python config file.
        ### Issue with passing functions during pickle on disk
    :return:
    """
    if model_dict is None:
        log("#### Model Params Dynamic loading  ###############################################")
        model_dict_fun = load_function_uri(uri_name=config_path + "::" + config_name)
        model_dict     = model_dict_fun()  ### params

    else:
        ### A dict was passed directly.
        ### When the model is saved on disk, the function definitions are LOST: reload them dynamically.
        path_config = model_dict['global_pars']['config_path']

        p1 = path_config + "::" + model_dict['model_pars']['post_process_fun'].__name__
        model_dict['model_pars']['post_process_fun'] = load_function_uri(p1)

        p1 = path_config + "::" + model_dict['model_pars']['pre_process_pars']['y_norm_fun'].__name__
        model_dict['model_pars']['pre_process_pars']['y_norm_fun'] = load_function_uri(p1)

    return model_dict
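
The "path::name" URI convention used above is resolved by load_function_uri from util_feature. As a rough illustration only, a minimal stand-in could look like the sketch below; the helper name load_function_uri_sketch is hypothetical, not the project's actual implementation.

import importlib.util

def load_function_uri_sketch(uri_name):
    ### Hypothetical stand-in: split "path/to/file.py::func_name" and
    ### import func_name from that file. The real loader lives in util_feature.
    module_path, func_name = uri_name.split("::")
    spec   = importlib.util.spec_from_file_location("dyn_config", module_path)
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return getattr(module, func_name)

### e.g. model_dict_fun = load_function_uri_sketch("source/config_model.py::config1")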
Example #2
def pd_ts_generic(df, col=None, pars=None):
    """
       pars example:
       { 'name': 'deltapy.transform::robust_scaler',   'pars': {'drop': ["Close_1"]} }
    """
    ###### Custom code ################################################################
    model_name = pars['name']
    model_pars = pars.get('pars', {})

    dfin = df[col]
    dfin = dfin.ffill()  ### fillna(method='ffill') is deprecated in recent pandas

    if 'a_chi' in model_name:
        ### Normalize the input for the chi transform
        dfin = (dfin - dfin.min()) / (dfin.max() - dfin.min())

    ##### Transform Data  ############################################################
    model  = load_function_uri(model_name)
    df_out = model(dfin, **model_pars)

    if 'extract' in model_name:
        ### Extract returns only one value, so there are no columns to loop over.
        col_out = "0_" + model_name

    else:
        model_name2    = model_name.replace("::", "-")
        col_out        = [coli + "_" + model_name2 for coli in df_out.columns]
        df_out.columns = col_out
        df_out.index   = df.index

    return df_out
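
A hedged usage sketch following the pars format shown in the docstring above; the dataframe and column names are illustrative, and deltapy must be installed for this particular transform.

pars   = {'name': 'deltapy.transform::robust_scaler', 'pars': {'drop': ["Close_1"]}}
df_out = pd_ts_generic(df, col=["Close", "Close_1"], pars=pars)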
Example #3
def model_dict_load(model_dict, config_path, config_name, verbose=True):
    if model_dict is None:
        log("#### Model Params Dynamic loading  ###############################################")
        model_dict_fun = load_function_uri(uri_name=config_path + "::" + config_name)
        model_dict     = model_dict_fun()  ### params
    if verbose: log(model_dict)
    return model_dict
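
For context, a hedged sketch of the kind of config file these loaders point at; the file path and the function name config1 are hypothetical, and the dict keys echo the fields accessed in the surrounding examples.

### Hypothetical contents of source/config_model.py ; config_name would be "config1".
def config1():
    model_dict = {
        'global_pars': {'config_path': 'source/config_model.py'},
        'model_pars':  {'pre_process_pars': {}},
        'data_pars':   {'filter_pars': {}},
    }
    return model_dict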
Example #4
def model_dict_load(model_dict, config_path, config_name, verbose=True):
    """ load the model dict from the python config file.
    :return:
    """
    if model_dict is None:
        log("#### Model Params Dynamic loading  ###############################################"
            )
        model_dict_fun = load_function_uri(uri_name=config_path + "::" +
                                           config_name)
        model_dict = model_dict_fun()  ### params
    log3(model_dict)
    return model_dict
Example #5
def pd_colts_generate(df=None, col=None, pars={}):
    """
       pars : {  'model_name' :  "transform.robust_scaler",
                 'model_pars'  :  {}


       }
    """
    prefix = 'colts_generate'

    ###### Custom code ################################################################
    dfin = df[col].ffill()  ### fillna(method='ffill') is deprecated in recent pandas
    model_name = pars['model_name']
    model_pars = pars.get('model_pars', {})

    if 'path_pipeline' in pars:  #### Prediction time
        model = load(pars['path_pipeline'] + f"/{prefix}_model.pkl")
        pars = load(pars['path_pipeline'] + f"/{prefix}_pars.pkl")

    else:  ### Training time  : Dynamic function load
        from util_feature import load_function_uri
        ##### transform.robust_scaler(df, drop=["Close_1"])
        model = load_function_uri(model_name)

    model_name = model_name.replace(".", "_")

    ##### Transform Data  ############################################################
    df_out = model(dfin, col, **model_pars)
    col_out = [coli + "_" + model_name for coli in df_out.columns]
    df_out.columns = col_out
    df_out.index = df.index  ### align with the input index (train_X was undefined here)
    col_new = col_out

    ###### Export #####################################################################
    if 'path_features_store' in pars and 'path_pipeline_export' in pars:
        save_features(df_out, 'df_' + prefix, pars['path_features_store'])
        save(model, pars['path_pipeline_export'] + f"/{prefix}_model.pkl")
        save(col_new, pars['path_pipeline_export'] + f"/{prefix}.pkl")
        save(pars, pars['path_pipeline_export'] + f"/{prefix}_pars.pkl")

    col_pars = {
        'prefix': prefix,
        'path': pars.get('path_pipeline_export',
                         pars.get('path_pipeline', None))
    }
    col_pars['cols_new'] = {
        prefix: col_new  ### list of columns
    }
    return df_out, col_pars
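
A hedged sketch of the two call modes of pd_colts_generate (paths and column names are illustrative): at training time the transformer is resolved dynamically and exported, at prediction time it is reloaded from path_pipeline.

### Training time : dynamic load + export of model/pars next to the pipeline
pars = {'model_name': "transform.robust_scaler", 'model_pars': {},
        'path_features_store': "data/features/", 'path_pipeline_export': "data/pipeline/"}
df_out, col_pars = pd_colts_generate(df, col=["Close", "Close_1"], pars=pars)

### Prediction time : model and pars reloaded from disk
pars_pred = {'model_name': "transform.robust_scaler", 'path_pipeline': "data/pipeline/"}
df_pred, _ = pd_colts_generate(df_new, col=["Close", "Close_1"], pars=pars_pred)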
Example #6
def run_preprocess(model_name, path_data, path_output, path_config_model="source/config_model.py", n_sample=5000,
                   mode='run_preprocess',):   ### "run_"/"load_" prefix on mode enables the if/elif branching below
    """
      Configuration of the model is in config_model.py file
    """
    path_output         = root + path_output
    path_data           = root + path_data
    path_features_store = path_output + "/features_store/"
    path_pipeline_out   = path_output + "/pipeline/"
    path_model_out      = path_output + "/model/"
    path_check_out      = path_output + "/check/"
    path_train_X        = path_data   + "/features*"    ### Can be a list of zip or parquet files
    path_train_y        = path_data   + "/target*"      ### Can be a list of zip or parquet files
    log(path_output)


    log("#### load input column family  ###################################################")
    cols_group = json.load(open(path_data + "/cols_group.json", mode='r'))
    log(cols_group)


    log("#### Model parameters Dynamic loading  ############################################")
    model_dict_fun = load_function_uri(uri_name= path_config_model + "::" + model_name)
    model_dict     = model_dict_fun(path_model_out)   ### params


    log("#### Preprocess  #################################################################")
    preprocess_pars = model_dict['model_pars']['pre_process_pars']
    filter_pars     = model_dict['data_pars']['filter_pars']

    if mode == "run_preprocess" :
        dfXy, cols      = preprocess(path_train_X, path_train_y, path_pipeline_out, cols_group, n_sample,
                                 preprocess_pars, filter_pars, path_features_store)

    elif mode == "load_preprocess" :
        dfXy, cols      = preprocess_load(path_train_X, path_train_y, path_pipeline_out, cols_group, n_sample,
                                 preprocess_pars, filter_pars, path_features_store)


    model_dict['data_pars']['coly'] = cols['coly']
    
    ### Generate actual column names from column groups : colnum , colcat
    model_dict['data_pars']['cols_model'] = sum([cols[colgroup] for colgroup in model_dict['data_pars']['cols_model_group']], [])
    log(model_dict['data_pars']['cols_model'], model_dict['data_pars']['coly'])
    
   
    log("######### finish #################################", )
Example #7
def model_dict_load(model_dict, config_path, config_name, verbose=True):
    """model_dict_load
    Args:
        model_dict ([type]): [description]
        config_path ([type]): [description]
        config_name ([type]): [description]
        verbose (bool, optional): [description]. Defaults to True.

    Returns:
        [type]: [description]
    """
    if model_dict is None:
        log("#### Model Params Dynamic loading  ###############################################")
        model_dict_fun = load_function_uri(uri_name=config_path + "::" + config_name)
        model_dict     = model_dict_fun()  ### params
    if verbose: log(model_dict)
    return model_dict
Example #8
def preprocess(path_train_X="",
               path_train_y="",
               path_pipeline_export="",
               cols_group=None,
               n_sample=5000,
               preprocess_pars={},
               path_features_store=None):
    """
      Used for trainiing only
      Save params on disk

    :param path_train_X:
    :param path_train_y:
    :param path_pipeline_export:
    :param cols_group:
    :param n_sample:
    :param preprocess_pars:
    :param path_features_store:
    :return:
    """
    ##### column names for feature generation #####################################################
    log(cols_group)
    coly   = cols_group['coly']    # 'salary'
    colid  = cols_group['colid']   # "jobId"
    colcat = cols_group['colcat']  # [ 'companyId', 'jobType', 'degree', 'major', 'industry' ]
    colnum = cols_group['colnum']  # ['yearsExperience', 'milesFromMetropolis']
    os.makedirs(path_pipeline_export, exist_ok=True)
    log(path_pipeline_export)
    save(colid, f'{path_pipeline_export}/colid.pkl')

    ### Pipeline Execution ##########################################
    pipe_default = [
        {'uri': 'source/prepro.py::pd_coly',                'pars': {}, 'cols_family': 'coly',       'type': 'coly'},
        {'uri': 'source/prepro.py::pd_colnum_bin',          'pars': {}, 'cols_family': 'colnum',     'type': ''},
        {'uri': 'source/prepro.py::pd_colnum_binto_onehot', 'pars': {}, 'cols_family': 'colnum_bin', 'type': ''},
        {'uri': 'source/prepro.py::pd_colcat_bin',          'pars': {}, 'cols_family': 'colcat',     'type': ''},
        {'uri': 'source/prepro.py::pd_colcat_to_onehot',    'pars': {}, 'cols_family': 'colcat_bin', 'type': ''},
        {'uri': 'source/prepro.py::pd_colcross',            'pars': {}, 'cols_family': 'colcross',   'type': 'cross'},
    ]

    pipe_list   = preprocess_pars.get('pipe_list', pipe_default)
    pipe_list_X = [task for task in pipe_list if task.get('type', '') not in ['coly', 'filter']]
    pipe_list_y = [task for task in pipe_list if task.get('type', '') in ['coly']]
    pipe_filter = [task for task in pipe_list if task.get('type', '') in ['filter']]
    ##### Load data #################################################################################
    df = load_dataset(path_train_X, path_train_y, colid, n_sample=n_sample)

    ##### Generate features ##########################################################################
    dfi_all = {}  ### Dict of all features
    cols_family_all = {'colid': colid, 'colnum': colnum, 'colcat': colcat}

    if len(pipe_filter) > 0:
        log("#####  Filter  #########################################################################")
        pipe_i   = pipe_filter[0]
        pipe_fun = load_function_uri(pipe_i['uri'])
        df, col_pars = pipe_fun(df, list(df.columns), pars=pipe_i.get('pars', {}))

    if len(pipe_list_y) > 0:
        log("#####  coly  ###########################################################################")
        pipe_i   = pipe_list_y[0]
        pipe_fun = load_function_uri(pipe_i['uri'])
        logs("----------df----------\n", df)
        pars = pipe_i.get('pars', {})
        pars['path_features_store']  = path_features_store
        pars['path_pipeline_export'] = path_pipeline_export
        df, col_pars = pipe_fun(df, cols_group['coly'], pars=pars)  ### coly can remove rows

        logs("----------df----------\n", df)
        dfi_all['coly']         = df[cols_group['coly']]
        cols_family_all['coly'] = cols_group['coly']
        save_features(df[cols_group['coly']], "coly", path_features_store)  ### already saved
        save(coly, f'{path_pipeline_export}/coly.pkl')

    #####  Processors  ###############################################################################
    dfi_all['coly'] = df[cols_group['coly']]
    #for colg, colg_list in cols_group.items() :
    #   if colg not in  ['colid']:
    #      dfi_all[colg]   = df[colg_list]   ## colnum colcat, coly

    for pipe_i in pipe_list_X:
        log("###################", pipe_i, "##########################################################")
        pipe_fun  = load_function_uri(pipe_i['uri'])  ### Load the code definition into pipe_fun
        cols_name = pipe_i['cols_family']
        col_type  = pipe_i['type']

        pars = pipe_i.get('pars', {})
        pars['path_features_store']  = path_features_store   ### intermediate dataframes
        pars['path_pipeline_export'] = path_pipeline_export  ### Store pipeline

        if col_type == 'cross':
            log("###################  Adding Cross ###################################################")
            pars['dfnum_hot']       = dfi_all['colnum_onehot']  ### dfnum_hot --> dfcross
            pars['dfcat_hot']       = dfi_all['colcat_onehot']
            pars['colid']           = colid
            pars['colcross_single'] = cols_group.get('colcross', [])

        elif col_type == 'add_coly':
            log('add_coly genetic', cols_group['coly'])
            pars['coly'] = cols_group['coly']
            pars['dfy']  = dfi_all['coly']  ### Transformed dfy

        ### Input columns or previously computed columns ( colnum_bin )
        cols_list = cols_group[cols_name] if cols_name in cols_group else list(dfi_all[cols_name].columns)
        df_       = df[cols_list] if cols_name in cols_group else dfi_all[cols_name]
        #cols_list  = list(dfi_all[cols_name].columns)
        #df_        = dfi_all[cols_name]

        dfi, col_pars = pipe_fun(df_, cols_list, pars=pars)

        ### Concatenate colnum, colnum_bin into cols_family_all , dfi_all  ###########################
        for colj, colist in col_pars['cols_new'].items():
            ### Merge sub-family
            cols_family_all[colj] = cols_family_all.get(colj, []) + colist
            dfi_all[colj] = pd.concat((dfi_all[colj], dfi), axis=1) if colj in dfi_all else dfi
            # save_features(dfi_all[colj], colj, path_features_store)

    ######  Merge all into dfXy  #################################################################
    dfXy = df[[coly] + colnum + colcat]
    #dfXy = df[ [coly]  ]

    for t in dfi_all.keys():
        if t not in ['coly', 'colnum', 'colcat']:
            dfXy = pd.concat((dfXy, dfi_all[t]), axis=1)
    save_features(dfXy, 'dfX', path_features_store)

    colXy = list(dfXy.columns)
    colXy.remove(coly)  ##### Only X columns
    if len(colid) > 0:
        cols_family_all['colid'] = colid
    cols_family_all['colX'] = colXy

    ####  Cols group for model input  ###########################################################

    save(colXy, f'{path_pipeline_export}/colsX.pkl')
    save(cols_family_all, f'{path_pipeline_export}/cols_family.pkl')

    ###### Return values  #######################################################################
    return dfXy, cols_family_all
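
A hedged end-to-end sketch of a training-time call; the paths are illustrative and cols_group reuses the families from the inline comments above.

cols_group = {'coly': 'salary', 'colid': 'jobId',
              'colcat': ['companyId', 'jobType', 'degree', 'major', 'industry'],
              'colnum': ['yearsExperience', 'milesFromMetropolis']}
dfXy, cols_family_all = preprocess(path_train_X="data/input/features*",
                                   path_train_y="data/input/target*",
                                   path_pipeline_export="data/pipeline/pipe_01/",
                                   cols_group=cols_group, n_sample=5000,
                                   path_features_store="data/features/")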
Example #9
def preprocess_inference(df,
                         path_pipeline="data/pipeline/pipe_01/",
                         preprocess_pars={},
                         cols_group=None):
    """
       At Inference time, load model, params and preprocess data.
       Not saving the data, only output final dataframe
    :param df: input dataframe
    :param path_pipeline:  path where processors are stored
    :param preprocess_pars: dict of params specific to preprocessing
    :param cols_group:  dict of column family
    :return: dfXy  Final dataframe,
             cols_family_full : dict of column family
    """
    from util_feature import load, load_function_uri, load_dataset

    #### Pipeline Execution  ####################################################
    pipe_default = [
        {'uri': 'source/prepro.py::pd_colnum_bin',          'pars': {}, 'cols_family': 'colnum',     'type': ''},
        {'uri': 'source/prepro.py::pd_colnum_binto_onehot', 'pars': {}, 'cols_family': 'colnum_bin', 'type': ''},
        {'uri': 'source/prepro.py::pd_colcat_bin',          'pars': {}, 'cols_family': 'colcat',     'type': ''},
        {'uri': 'source/prepro.py::pd_colcat_to_onehot',    'pars': {}, 'cols_family': 'colcat_bin', 'type': ''},
        {'uri': 'source/prepro.py::pd_colcross',            'pars': {}, 'cols_family': 'colcross',   'type': 'cross'},
    ]
    pipe_list   = preprocess_pars.get('pipe_list', pipe_default)
    pipe_list_X = [task for task in pipe_list if task.get('type', '') not in ['coly', 'filter']]
    pipe_filter = [task for task in pipe_list if task.get('type', '') in ['filter']]

    log("########### Load column by column type ##################################"
        )
    cols_group = preprocess_pars['cols_group']
    log(cols_group)  ### list of model columns familty
    colid = cols_group['colid']  # "jobId"
    coly = cols_group['coly']
    colcat = cols_group[
        'colcat']  # [ 'companyId', 'jobType', 'degree', 'major', 'industry' ]
    colnum = cols_group['colnum']  # ['yearsExperience', 'milesFromMetropolis']

    ##### Generate features ########################################################################
    dfi_all = {}  ### Dict of all features
    cols_family_full = {'coly': coly}

    if len(pipe_filter) > 0:
        log("#####  Filter  #######################################################################")
        pipe_i   = pipe_filter[0]
        pipe_fun = load_function_uri(pipe_i['uri'])
        df, col_pars = pipe_fun(df, list(df.columns), pars=pipe_i.get('pars', {}))

    #####  Processors  #############################################################################
    #for colg, colg_list in cols_group.items() :
    #   if colg not in  ['colid', 'coly' ]:
    #      dfi_all[colg]   = df[colg_list]   ## colnum colcat, coly

    for pipe_i in pipe_list_X:
        log("###################", pipe_i, "#######################################################")
        pipe_fun  = load_function_uri(pipe_i['uri'])  ### Load the code definition into pipe_fun
        cols_name = pipe_i['cols_family']
        col_type  = pipe_i['type']
        pars      = pipe_i.get('pars', {})

        ### Load data from disk : inference time
        pars['path_pipeline'] = path_pipeline

        cols_list = cols_group[cols_name] if cols_name in cols_group else cols_family_full[cols_name]
        df_       = df[cols_group[cols_name]] if cols_name in cols_group else dfi_all[cols_name]
        # cols_list  = list(dfi_all[cols_name].columns)
        # df_        = dfi_all[cols_name]
        logs(df_, cols_list)

        if col_type == 'cross':
            pars['dfnum_hot']       = dfi_all['colnum_onehot']  ### dfnum_hot --> dfcross
            pars['dfcat_hot']       = dfi_all['colcat_onehot']
            pars['colid']           = colid
            pars['colcross_single'] = cols_group.get('colcross', [])
        elif col_type == 'add_coly':
            pass

        dfi, col_pars = pipe_fun(df_, cols_list, pars=pars)

        ### Concatenate colnum, colnum_bin into cols_family_full
        for colj, colist in col_pars['cols_new'].items():
            ### Merge sub-family
            cols_family_full[colj] = cols_family_full.get(colj, []) + colist
            dfi_all[colj] = pd.concat((dfi_all[colj], dfi), axis=1) if colj in dfi_all else dfi

    log("######  Merge AlL int dfXy  #############################################################"
        )
    dfXy = df[colnum + colcat]
    for t in dfi_all.keys():
        if t not in ['colnum', 'colcat']:
            dfXy = pd.concat((dfXy, dfi_all[t]), axis=1)

    colXy = list(dfXy.columns)
    if len(colid) > 0:
        cols_family_full['colid'] = colid
    cols_family_full['colX'] = colXy

    return dfXy, cols_family_full
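
A matching hedged inference-time sketch (paths illustrative). Note that the cols_group argument is overridden inside the function by preprocess_pars['cols_group'], so the column families must travel in preprocess_pars.

preprocess_pars = {'cols_group': cols_group}   ### same column families as at training time
dfX_new, cols_family_full = preprocess_inference(df_new,
                                                 path_pipeline="data/pipeline/pipe_01/",
                                                 preprocess_pars=preprocess_pars)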