Example #1
def pd_colcat_encoder_generic(df, col, pars):
    """
        Create a Class or decorator
        https://pypi.org/project/category-encoders/
        encoder = ce.BackwardDifferenceEncoder(cols=[...])
        encoder = ce.BaseNEncoder(cols=[...])
        encoder = ce.BinaryEncoder(cols=[...])
        encoder = ce.CatBoostEncoder(cols=[...])
        encoder = ce.CountEncoder(cols=[...])
        encoder = ce.GLMMEncoder(cols=[...])
        encoder = ce.HashingEncoder(cols=[...])
        encoder = ce.HelmertEncoder(cols=[...])
        encoder = ce.JamesSteinEncoder(cols=[...])
        encoder = ce.LeaveOneOutEncoder(cols=[...])
        encoder = ce.MEstimateEncoder(cols=[...])
        encoder = ce.OneHotEncoder(cols=[...])
        encoder = ce.OrdinalEncoder(cols=[...])
        encoder = ce.SumEncoder(cols=[...])
        encoder = ce.PolynomialEncoder(cols=[...])
        encoder = ce.TargetEncoder(cols=[...])
        encoder = ce.WOEEncoder(cols=[...])
    """
    prefix = "colcat_encoder_generic"
    pars_model = None
    if 'path_pipeline' in pars:  ### Load during Inference
        colcat_encoder = load(pars['path_pipeline'] + f"/{prefix}.pkl")
        pars_model = load(pars['path_pipeline'] + f"/{prefix}_pars.pkl")
        #model         = load( pars['path_pipeline'] + f"/{prefix}_model.pkl" )

    ####### Custom Code ###############################################################
    from category_encoders import HashingEncoder, WOEEncoder
    pars_model = pars.get('model_pars',
                          {}) if pars_model is None else pars_model
    pars_model['cols'] = col
    model_name = pars.get('model_name', 'HashingEncoder')

    model_class = {'HashingEncoder': HashingEncoder}[model_name]
    model = model_class(**pars_model)
    dfcat_encoder = model.fit_transform(df[col])

    dfcat_encoder.columns = [t + "_cod" for t in dfcat_encoder.columns]
    colcat_encoder = list(dfcat_encoder.columns)

    ###################################################################################
    if 'path_features_store' in pars and 'path_pipeline_export' in pars:
        save_features(dfcat_encoder, 'dfcat_encoder',
                      pars['path_features_store'])
        save(model, pars['path_pipeline_export'] + f"/{prefix}_model.pkl")
        save(pars_model, pars['path_pipeline_export'] + f"/{prefix}_pars.pkl")
        save(colcat_encoder, pars['path_pipeline_export'] + f"/{prefix}.pkl")

    col_pars = {
        'prefix': prefix,
        'path': pars.get('path_pipeline_export',
                         pars.get('path_pipeline', None))
    }
    col_pars['cols_new'] = {
        'colcat_encoder_generic': colcat_encoder  ### list
    }
    return dfcat_encoder, col_pars
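A minimal usage sketch of this preprocessor at training time (no export paths are passed, so nothing is persisted). It assumes category-encoders is installed; the toy dataframe and the n_components value are illustrative only:

import pandas as pd

df   = pd.DataFrame({"city": ["tokyo", "paris", "tokyo"], "sex": ["m", "f", "f"]})
pars = {"model_name": "HashingEncoder", "model_pars": {"n_components": 4}}

dfcat_encoder, col_pars = pd_colcat_encoder_generic(df, col=["city", "sex"], pars=pars)
print(dfcat_encoder.head())                                  # hashed columns, suffixed "_cod"
print(col_pars["cols_new"]["colcat_encoder_generic"])        # list of the new column names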
Example #2
def predict(model_name, path_model, dfX, cols_family):
    """
    if config_name in ['ElasticNet', 'ElasticNetCV', 'LGBMRegressor', 'LGBMModel', 'TweedieRegressor', 'Ridge']:
        from models import model_sklearn as modelx

    elif config_name == 'model_bayesian_pyro':
        from models import model_bayesian_pyro as modelx

    elif config_name == 'model_widedeep':
        from models import model_widedeep as modelx
    """
    modelx = map_model(model_name)
    modelx.reset()
    log(modelx, path_model)
    #log(os.getcwd())
    sys.path.append(root)  #### Needed due to import source error

    modelx.model = load(path_model + "/model/model.pkl")
    # stats = load(path_model + "/model/info.pkl")
    colsX = load(path_model + "/model/colsX.pkl")  ## column name
    # coly  = load( path_model + "/model/coly.pkl"   )
    assert colsX is not None
    assert modelx.model is not None

    log(modelx.model.model)

    ### Prediction
    dfX1 = dfX.reindex(columns=colsX)  #reindex included
    ypred = modelx.predict(dfX1)

    return ypred
Example #3
def pd_colcat_symbolic(df, col, pars):
    """
       https://github.com/arita37/deltapy

       pip install deltapy

    """
    pars_encoder = pars
    pars_encoder['cols'] = col
    if 'path_pipeline_export' in pars:
        try:
            pars_encoder = load(pars['path_pipeline_export'] +
                                '/col_genetic_pars.pkl')
            model_encoder = load(pars['path_pipeline_export'] +
                                 '/col_genetic_model.pkl')
            col_encoder = load(pars['path_pipeline_export'] +
                               '/col_genetic.pkl')
        except:
            pass

    ###################################################################################
    coly = pars['coly']
    from gplearn.genetic import SymbolicTransformer
    function_set = [
        'add', 'sub', 'mul', 'div', 'sqrt', 'log', 'abs', 'neg', 'inv', 'tan'
    ]

    gp = SymbolicTransformer(generations=20,
                             population_size=200,
                             hall_of_fame=100,
                             n_components=10,
                             function_set=function_set,
                             parsimony_coefficient=0.0005,
                             max_samples=0.9,
                             verbose=1,
                             random_state=0,
                             n_jobs=6)

    gen_feats = gp.fit_transform(df[col], df[coly])
    gen_feats = pd.DataFrame(
        gen_feats,
        columns=["gen_" + str(a) for a in range(gen_feats.shape[1])])
    gen_feats.index = df.index
    dfnew = gen_feats
    dfnew.columns = [t for t in dfnew.columns]

    ###################################################################################
    colnew = list(dfnew.columns)
    if 'path_features_store' in pars and 'path_pipeline_export' in pars:
        save_features(dfnew, 'dfgen', pars['path_features_store'])
        save(gp, pars['path_pipeline_export'] + "/col_genetic_model.pkl")
        save(pars_encoder,
             pars['path_pipeline_export'] + "/col_genetic_pars.pkl")
        save(colnew, pars['path_pipeline_export'] + "/col_genetic.pkl")

    col_pars = {'model': gp}
    col_pars['cols_new'] = {
        'col_genetic': colnew  ### list
    }
    return dfnew, col_pars
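The heavy lifting here is gplearn's SymbolicTransformer. A stripped-down sketch of that step on synthetic data, with the search parameters reduced so it finishes quickly (all values illustrative):

import numpy as np
import pandas as pd
from gplearn.genetic import SymbolicTransformer

rng = np.random.RandomState(0)
X   = pd.DataFrame(rng.rand(200, 3), columns=["a", "b", "c"])
y   = X["a"] * X["b"] + X["c"]                     # target driven by a simple hidden formula

gp = SymbolicTransformer(generations=5, population_size=100, n_components=3,
                         function_set=["add", "sub", "mul", "div"], random_state=0)
gen_feats = pd.DataFrame(gp.fit_transform(X, y),
                         columns=["gen_" + str(i) for i in range(3)], index=X.index)
print(gen_feats.head())                            # engineered features, as in gen_feats above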
Example #4
def predict(model_name, path_model, dfX, cols_family):
    """
    """
    modelx = map_model(model_name)
    modelx.reset()
    log(modelx, path_model)
    #log(os.getcwd())
    sys.path.append(root)  #### Needed due to import source error

    log("#### Load model  ############################################")
    print(path_model + "/model/model.pkl")
    # modelx.model = load(path_model + "/model//model.pkl")
    modelx.model = load(path_model + "/model.pkl")

    # stats = load(path_model + "/model/info.pkl")
    # colsX       = load(path_model + "/model/colsX.pkl")   ## column name
    colsX = load(path_model + "/colsX.pkl")  ## column name

    # coly  = load( path_model + "/model/coly.pkl"   )
    assert colsX is not None, "cannot load colsx, " + path_model
    assert modelx.model is not None, "cannot load modelx, " + path_model
    log("#### modelx\n", modelx.model.model)

    log("### Prediction  ############################################")
    dfX1 = dfX.reindex(columns=colsX)  #reindex included

    ypred = modelx.predict(dfX1)

    return ypred
Example #5
def pd_ts_deltapy2(
    df=None,
    col=None,
    pars={},
):
    """
       Delta py
       pars : {  'name' :  "robust_scaler",
                 'pars'  :  {}
       }
    """
    prefix = 'colts_deltapy'

    ###### Custom code ################################################################
    dfin = df.fillna(method='ffill')
    model_name = pars['name']
    model_pars = pars.get('pars', {})

    if 'path_pipeline' in pars:  #### Prediction time
        model = load(pars['path_pipeline'] + f"/{prefix}_model.pkl")
        pars = load(pars['path_pipeline'] + f"/{prefix}_pars.pkl")

    else:  ### Training time  : Dynamic function load
        from util_feature import load_function_uri
        ##### transform.robust_scaler(df, drop=["Close_1"])
        model = load_function_uri(model_name)

    ##### Transform Data  ############################################################
    df_out = model(dfin, **model_pars)

    # Extract only returns one value, so no columns to loop over.
    model_name2 = model_name.replace("::", "-")
    if 'extract' in model_name:
        col_out = "0_" + model_name
    else:
        col_out = [coli + "_" + model_name for coli in df_out.columns]
        df_out.columns = col_out
        df_out.index = dfin.index
    col_new = col_out

    ###### Export #####################################################################
    if 'path_features_store' in pars and 'path_pipeline_export' in pars:
        save_features(df_out, 'df_' + prefix, pars['path_features_store'])
        save(model, pars['path_pipeline_export'] + f"/{prefix}_model.pkl")
        save(col_new, pars['path_pipeline_export'] + f"/{prefix}.pkl")
        save(pars, pars['path_pipeline_export'] + f"/{prefix}_pars.pkl")

    col_pars = {
        'prefix': prefix,
        'path': pars.get('path_pipeline_export',
                         pars.get('path_pipeline', None))
    }
    col_pars['cols_new'] = {
        prefix: col_new  ### list of columns
    }
    return df_out, col_pars
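The dynamically loaded `model` here is just a deltapy function. A direct call outside the pipeline, following the commented example above, might look like the sketch below (assumes `pip install deltapy`; the dataframe and the "Close_1" column to exclude are illustrative):

import pandas as pd
from deltapy import transform                         # deltapy transformation module

df  = pd.DataFrame({"Close_1": [1.0, 2.0, 3.0, 4.0], "Open": [0.9, 2.1, 2.9, 4.2]})
out = transform.robust_scaler(df, drop=["Close_1"])   # scale every column except "Close_1"
print(out.head())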
Example #6
def pd_colts_generate(df=None, col=None, pars={}):
    """
       pars : {  'model_name' :  "transform.robust_scaler",
                 'model_pars'  :  {}


       }
    """
    prefix = 'colts_generate'

    ###### Custom code ################################################################
    dfin = df[col].fillna(method='ffill')
    model_name = pars['model_name']
    model_pars = pars.get('model_pars', {})

    if 'path_pipeline' in pars:  #### Prediction time
        model = load(pars['path_pipeline'] + f"/{prefix}_model.pkl")
        pars = load(pars['path_pipeline'] + f"/{prefix}_pars.pkl")

    else:  ### Training time  : Dynamic function load
        from util_feature import load_function_uri
        ##### transform.robust_scaler(df, drop=["Close_1"])
        model = load_function_uri(model_name)

    model_name = model_name.replace(".", "_")

    ##### Transform Data  ############################################################
    df_out = model(dfin, col, **model_pars)
    col_out = [coli + "_" + model_name for coli in df_out.columns]
    df_out.columns = col_out
    df_out.index = dfin.index
    col_new = col_out

    ###### Export #####################################################################
    if 'path_features_store' in pars and 'path_pipeline_export' in pars:
        save_features(df_out, 'df_' + prefix, pars['path_features_store'])
        save(model, pars['path_pipeline_export'] + f"/{prefix}_model.pkl")
        save(col_new, pars['path_pipeline_export'] + f"/{prefix}.pkl")
        save(pars, pars['path_pipeline_export'] + f"/{prefix}_pars.pkl")

    col_pars = {
        'prefix': prefix,
        'path': pars.get('path_pipeline_export',
                         pars.get('path_pipeline', None))
    }
    col_pars['cols_new'] = {
        prefix: col_new  ### list of columns
    }
    return df_out, col_pars
Example #7
def prepro_load(prefix, pars):
    """  Load previously savec preprocessors
    :param prefix:
    :param pars:
    :return:
    """
    prepro = None
    pars_saved = None
    cols_saved = None
    if "path_pipeline" in pars:
        prepro = load(pars["path_pipeline"] + f"/{prefix}_model.pkl")
        pars_saved = load(pars["path_pipeline"] + f"/{prefix}_pars.pkl")
        cols_saved = load(pars["path_pipeline"] + f"/{prefix}_cols.pkl")

    return prepro, pars_saved, cols_saved
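A short usage sketch at inference time. The pipeline path and prefix below are hypothetical; the call assumes a matching save step previously wrote "<prefix>_model.pkl", "<prefix>_pars.pkl" and "<prefix>_cols.pkl" into that folder:

pars = {"path_pipeline": "data/output/titanic/pipeline"}      # hypothetical pipeline folder
prepro, pars_saved, cols_saved = prepro_load("colnum_norm", pars)
if prepro is not None:
    print("reloaded preprocessor:", prepro, "for columns:", cols_saved)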
Example #8
def run_model_check(path_output, scoring):
    """
    :param path_output:
    :param scoring:
    :return:
    """
    import pandas as pd
    try:
        #### Load model
        from source.util_feature import load
        from source.models import model_sklearn as modelx
        import sys
        from source import models
        sys.modules['models'] = models

        dir_model = path_output
        modelx.model = load(dir_model + "/model/model.pkl")
        stats = load(dir_model + "/model/info.pkl")
        colsX = load(dir_model + "/model/colsX.pkl")
        coly = load(dir_model + "/model/coly.pkl")
        print(stats)
        print(modelx.model.model)

        ### Metrics on test data
        log(stats['metrics_test'])

        #### Loading training data  ######################################################
        dfX = pd.read_csv(dir_model + "/check/dfX.csv")  #to load csv
        #dfX = pd.read_parquet(dir_model + "/check/dfX.parquet")    #to load parquet
        dfy = dfX[coly]
        colused = colsX

        dfXtest = pd.read_csv(dir_model + "/check/dfXtest.csv")  #to load csv
        #dfXtest = pd.read_parquet(dir_model + "/check/dfXtest.parquet")   #to load parquet
        dfytest = dfXtest[coly]
        print(dfX.shape, dfXtest.shape)

        #### Feature importance on training data  #######################################
        from util_feature import feature_importance_perm
        lgb_featimpt_train, _ = feature_importance_perm(modelx,
                                                        dfX[colused],
                                                        dfy,
                                                        colused,
                                                        n_repeats=1,
                                                        scoring=scoring)
        print(lgb_featimpt_train)
    except:
        pass
Example #9
def preprocess_load(path_train_X="",
                    path_train_y="",
                    path_pipeline_export="",
                    cols_group=None,
                    n_sample=5000,
                    preprocess_pars={},
                    path_features_store=None):
    """
        Load pre-computed dataframe
    :param path_train_X:
    :param path_train_y:
    :param path_pipeline_export:
    :param cols_group:
    :param n_sample:
    :param preprocess_pars:
    :param path_features_store:
    :return:
    """
    from source.util_feature import load

    dfXy = pd.read_parquet(path_features_store + "/dfX/features.parquet")

    try:
        dfy = pd.read_parquet(path_features_store + "/dfy/features.parquet")
        dfXy = dfXy.join(dfy, on=cols_group['colid'], how="left")

    except:
        log('Error no label', path_features_store + "/dfy/features.parquet")

    cols_family = load(f'{path_pipeline_export}/cols_family.pkl')

    return dfXy, cols_family
Example #10
def pd_colnum_binto_onehot(df, col=None, pars=None):
    assert isinstance(col, list) and isinstance(df, pd.DataFrame)

    dfnum_bin = df[col]
    colnum_bin = col

    path_pipeline = pars.get('path_pipeline', False)
    colnum_onehot = load(
        f'{path_pipeline}/colnum_onehot.pkl') if path_pipeline else None

    log("###### colnum bin to One Hot  #################################################"
        )
    from util_feature import pd_col_to_onehot
    dfnum_hot, colnum_onehot = pd_col_to_onehot(dfnum_bin[colnum_bin],
                                                colname=colnum_bin,
                                                colonehot=colnum_onehot,
                                                return_val="dataframe,param")
    log(colnum_onehot)

    if 'path_features_store' in pars:
        save_features(dfnum_hot, 'colnum_onehot', pars['path_features_store'])
        save(colnum_onehot,
             pars['path_pipeline_export'] + "/colnum_onehot.pkl")

    col_pars = {}
    col_pars['colnum_onehot'] = colnum_onehot
    col_pars['cols_new'] = {
        # 'colnum'        :  col ,    ###list
        'colnum_onehot': colnum_onehot  ### list
    }
    return dfnum_hot, col_pars
Example #11
def run_predict(model_name, path_model, path_data, path_output, n_sample=-1):
    path_output = root + path_output
    path_data = root + path_data + "/features.zip"  #.zip
    path_model = root + path_model
    path_pipeline = path_model + "/pipeline/"
    path_test_X = path_data + "/features.zip"  #.zip #added path to testing features
    log(path_data, path_model, path_output)

    colid = load(f'{path_pipeline}/colid.pkl')

    df = load_dataset(path_data,
                      path_data_y=None,
                      colid=colid,
                      n_sample=n_sample)

    dfX, cols_family = preprocess(df, path_pipeline)

    ypred, yproba = predict(model_name, path_model, dfX, cols_family)

    log("Saving prediction", ypred.shape, path_output)
    os.makedirs(path_output, exist_ok=True)
    df[cols_family["coly"] + "_pred"] = ypred
    if yproba is not None:
        df[cols_family["coly"] + "_pred_proba"] = yproba
    df.to_csv(f"{path_output}/prediction.csv")
    log(df.head(8))

    #####  Export Specific
    df[cols_family["coly"]] = ypred
    df[[cols_family["coly"]]].to_csv(f"{path_output}/pred_only.csv")
Example #12
File: prepro.py Project: iamrehman/dsa2
def pd_colcat_minhash(df, col, pars):
    """
       MinHash Algo for category
       https://booking.ai/dont-be-tricked-by-the-hashing-trick-192a6aae3087

    """
    prefix = 'colcat_minhash'
    colcat              = col

    pars_minhash = {'n_component' : [4, 2], 'model_pretrain_dict' : None,}
    if 'path_pipeline_export' in pars :
        try :
            pars_minhash = load( pars['path_pipeline_export'] + '/colcat_minhash_pars.pkl')
        except : pass

    log("#### Colcat to Hash encoding #############################################")
    from utils import util_text
    dfcat_bin, col_hash_model= util_text.pd_coltext_minhash(df[colcat], colcat,
                                                            return_val="dataframe,param", **pars_minhash )
    colcat_minhash = list(dfcat_bin.columns)
    log(col_hash_model)

    ###################################################################################
    if 'path_features_store' in pars and 'path_pipeline_export' in pars:
       save_features(dfcat_bin, prefix, pars['path_features_store'])
       save(colcat_minhash, pars['path_pipeline_export'] + f"/{prefix}.pkl" )
       save(pars_minhash,   pars['path_pipeline_export'] + f"/{prefix}_pars.pkl" )
       save(col_hash_model, pars['path_pipeline_export'] + f"/{prefix}_model.pkl" )

    col_pars = {}
    col_pars['col_hash_model'] = col_hash_model
    col_pars['cols_new'] = {
     'colcat_minhash' :  colcat_minhash  ### list
    }
    return dfcat_bin, col_pars
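util_text.pd_coltext_minhash itself is not shown here. To make the idea concrete (each category value mapped to a short fixed-length signature of hashes), here is an illustrative sketch using the datasketch library; it is not the dsa2 implementation:

import pandas as pd
from datasketch import MinHash

def minhash_signature(value, n_component=4):
    """Return n_component MinHash values for one category string."""
    m = MinHash(num_perm=n_component)
    for token in str(value):
        m.update(token.encode("utf8"))
    return list(m.hashvalues[:n_component])

df  = pd.DataFrame({"city": ["tokyo", "paris", "berlin"]})
sig = df["city"].apply(minhash_signature)
print(pd.DataFrame(sig.tolist(), columns=["city_hash_" + str(i) for i in range(4)]))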
Example #13
def pd_colcat_bin(df, col=None, pars=None):
    # dfbum_bin = df[col]
    path_pipeline = pars.get('path_pipeline', False)
    colcat_bin_map = load(
        f'{path_pipeline}/colcat_bin_map.pkl') if path_pipeline else None
    colcat = [col] if isinstance(col, str) else col

    log("#### Colcat to integer encoding ")
    dfcat_bin, colcat_bin_map = util_feature.pd_colcat_toint(
        df[colcat], colname=colcat, colcat_map=colcat_bin_map, suffix="_int")
    colcat_bin = list(dfcat_bin.columns)
    ##### Colcat processing   ################################################################
    colcat_map = util_feature.pd_colcat_mapping(df, colcat)
    log(df[colcat].dtypes, colcat_map)

    if 'path_features_store' in pars:
        save_features(dfcat_bin, 'dfcat_bin', pars['path_features_store'])
        save(colcat_bin_map,
             pars['path_pipeline_export'] + "/colcat_bin_map.pkl")
        save(colcat_bin, pars['path_pipeline_export'] + "/colcat_bin.pkl")

    col_pars = {}
    col_pars['colcat_bin_map'] = colcat_bin_map
    col_pars['cols_new'] = {
        'colcat': col,  ###list
        'colcat_bin': colcat_bin  ### list
    }

    return dfcat_bin, col_pars
Example #14
File: prepro.py Project: iamrehman/dsa2
def pd_colnum_bin(df, col, pars):
    from util_feature import  pd_colnum_tocat

    path_pipeline = pars.get('path_pipeline', False)
    colnum_binmap  = load(f'{path_pipeline}/colnum_binmap.pkl') if  path_pipeline else None
    log(colnum_binmap)

    colnum = col

    log("### colnum Map numerics to Category bin  ###########################################")
    dfnum_bin, colnum_binmap = pd_colnum_tocat(df, colname=colnum, colexclude=None, colbinmap=colnum_binmap,
                                               bins=10, suffix="_bin", method="uniform",
                                               return_val="dataframe,param")
    log(colnum_binmap)
    ### Renaming colunm_bin with suffix
    colnum_bin = [x + "_bin" for x in list(colnum_binmap.keys())]
    log(colnum_bin)

    if 'path_features_store' in pars:
        scol = "_".join(col[:5])
        save_features(dfnum_bin, 'colnum_bin' + "-" + scol, pars['path_features_store'])
        save(colnum_binmap,  pars['path_pipeline_export'] + "/colnum_binmap.pkl" )
        save(colnum_bin,     pars['path_pipeline_export'] + "/colnum_bin.pkl" )


    col_pars = {}
    col_pars['colnumbin_map'] = colnum_binmap
    col_pars['cols_new'] = {
     'colnum'     :  col ,    ###list
     'colnum_bin' :  colnum_bin       ### list
    }
    return dfnum_bin, col_pars
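pd_colnum_tocat lives in util_feature and is not reproduced here; the uniform binning it performs (10 equal-width bins, "_bin" suffix) is roughly equivalent to this pandas sketch, shown only to make the transformation concrete:

import numpy as np
import pandas as pd

df    = pd.DataFrame({"age": [18, 25, 31, 47, 63, 80]})
edges = np.linspace(df["age"].min(), df["age"].max(), 10 + 1)          # 10 uniform bin edges
df["age_bin"] = pd.cut(df["age"], bins=edges, labels=False, include_lowest=True)
binmap = {"age": edges.tolist()}                                       # analogue of colnum_binmap
print(df)
print(binmap)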
Example #15
def pd_colcross(df, col, pars):
    """
     cross_feature_new =  feat1 X feat2  (pair feature)

    """
    log("#####  Cross Features From OneHot Features   ######################################"
        )
    prefix = 'colcross_onehot'

    # params_check(pars,  [('dfcat_hot', pd.DataFrame), 'colid',   ])
    from util_feature import pd_feature_generate_cross

    dfcat_hot = pars['dfcat_hot']
    colid = pars['colid']

    try:
        dfnum_hot = pars['dfnum_hot']
        df_onehot = dfcat_hot.join(dfnum_hot, on=colid, how='left')
    except:
        df_onehot = copy.deepcopy(dfcat_hot)

    colcross_single = pars['colcross_single']
    pars_model = {'pct_threshold': 0.02, 'm_combination': 2}
    if 'path_pipeline' in pars:  #### Load existing column list
        colcross_single = load(pars['path_pipeline'] + f'/{prefix}_select.pkl')
        # pars_model      = load( pars['path_pipeline']  + f'/{prefix}_pars.pkl')

    colcross_single_onehot_select = []  ## Select existing columns
    for t in list(df_onehot.columns):
        for c1 in colcross_single:
            if c1 in t:
                colcross_single_onehot_select.append(t)

    df_onehot = df_onehot[colcross_single_onehot_select]
    dfcross_hot, colcross_pair = pd_feature_generate_cross(
        df_onehot, colcross_single_onehot_select, **pars_model)
    log(dfcross_hot.head(2).T)
    colcross_pair_onehot = list(dfcross_hot.columns)

    model = None
    ##############################################################################
    if 'path_features_store' in pars:
        save_features(dfcross_hot, 'colcross_onehot',
                      pars['path_features_store'])
        save(colcross_single_onehot_select,
             pars['path_pipeline_export'] + f'/{prefix}_select.pkl')
        save(colcross_pair,
             pars['path_pipeline_export'] + f'/{prefix}_stats.pkl')
        save(colcross_pair_onehot,
             pars['path_pipeline_export'] + f'/{prefix}_pair.pkl')
        save(model, pars['path_pipeline_export'] + f'/{prefix}_pars.pkl')

    col_pars = {'model': model, 'stats': colcross_pair}
    col_pars['cols_new'] = {
        # 'colcross_single'     :  col ,    ###list
        'colcross_pair': colcross_pair_onehot  ### list
    }
    return dfcross_hot, col_pars
Example #16
def pd_col_atemplate(df=None, col=None, pars={}):
    """
    Example of custom Processor
    Used at prediction time
        "path_pipeline"  : 

    Training time :
        "path_features_store" :  to store intermediate dataframe
        "path_pipeline_export":  to store pipeline  for later usage

    """
    from source.util_feature import save, load
    prefix = "col_myfun"
    #### Inference time LOAD previous pars  ###########################################
    if "path_pipeline" in pars:
        prepro = load(pars["path_pipeline"] + f"/{prefix}_model.pkl")
        pars = load(pars["path_pipeline"] + f"/{prefix}_pars.pkl")
        pars = {} if pars is None else pars

    #### Do something #################################################################
    df_new = df[col]  ### identity: keep the selected columns as-is
    df_new.columns = [t + "_myfun" for t in df_new.columns]
    cols_new = list(df_new.columns)

    prepro = None  ### model
    pars_new = None  ### new params

    ###################################################################################
    ###### Training time save all #####################################################
    if "path_features_store" in pars and "path_pipeline_export" in pars:
        save(prepro, pars["path_pipeline_export"] + f"/{prefix}_model.pkl")
        save(cols_new, pars["path_pipeline_export"] + f"/{prefix}.pkl")
        save(pars_new, pars["path_pipeline_export"] + f"/{prefix}_pars.pkl")

    ###### Training & Inference time : df + new column names ##########################
    col_pars = {
        "prefix": prefix,
        "path": pars.get("path_pipeline_export",
                         pars.get("path_pipeline", None))
    }
    col_pars["cols_new"] = {
        "col_myfun": cols_new  ### new column list
    }
    return df_new, col_pars
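A direct call of this template at training time with a toy dataframe; no export paths are passed, so nothing is saved. It assumes the `source` package is importable (the function imports source.util_feature unconditionally):

import pandas as pd

df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
df_new, col_pars = pd_col_atemplate(df, col=["a", "b"], pars={})
print(df_new.columns.tolist())          # ['a_myfun', 'b_myfun']
print(col_pars["cols_new"])             # {'col_myfun': ['a_myfun', 'b_myfun']}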
Example #17
def pd_colcat_encoder_generic(df, col, pars):
    """
       https://pypi.org/project/category-encoders/
       encoder = ce.BackwardDifferenceEncoder(cols=[...])
encoder = ce.BaseNEncoder(cols=[...])
encoder = ce.BinaryEncoder(cols=[...])
encoder = ce.CatBoostEncoder(cols=[...])
encoder = ce.CountEncoder(cols=[...])
encoder = ce.GLMMEncoder(cols=[...])
encoder = ce.HashingEncoder(cols=[...])
encoder = ce.HelmertEncoder(cols=[...])
encoder = ce.JamesSteinEncoder(cols=[...])
encoder = ce.LeaveOneOutEncoder(cols=[...])
encoder = ce.MEstimateEncoder(cols=[...])
encoder = ce.OneHotEncoder(cols=[...])
encoder = ce.OrdinalEncoder(cols=[...])
encoder = ce.SumEncoder(cols=[...])
encoder = ce.PolynomialEncoder(cols=[...])
encoder = ce.TargetEncoder(cols=[...])
encoder = ce.WOEEncoder(cols=[...])


    """
    colcat = col
    import category_encoders as ce
    pars_encoder = pars
    pars_encoder['cols'] = col
    if 'path_pipeline_export' in pars:
        try:
            pars_encoder = load(pars['path_pipeline_export'] +
                                '/colcat_encoder_pars.pkl')
        except:
            pass

    encoder = ce.HashingEncoder(**pars_encoder)
    dfcat_bin = encoder.fit_transform(df[col])

    dfcat_bin.columns = [t for t in dfcat_bin.columns]
    colcat_encoder = list(dfcat_bin.columns)

    ###################################################################################
    if 'path_features_store' in pars and 'path_pipeline_export' in pars:
        save_features(dfcat_bin, 'dfcat_encoder', pars['path_features_store'])
        save(encoder,
             pars['path_pipeline_export'] + "/colcat_encoder_model.pkl")
        save(pars_encoder,
             pars['path_pipeline_export'] + "/colcat_encoder_pars.pkl")
        save(colcat_encoder,
             pars['path_pipeline_export'] + "/colcat_encoder.pkl")

    col_pars = {}
    col_pars['col_encode_model'] = encoder
    col_pars['cols_new'] = {
        'colcat_encoder': colcat_encoder  ### list
    }
    return dfcat_bin, col_pars
Example #18
def pd_colnum_normalize(df: pd.DataFrame, col: list = None, pars: dict = None):
    """ Float num INTO [0,1]
      'quantile_cutoff', 'quantile_cutoff_2', 'minmax'      
      'name': 'fillna', 'na_val' : 0.0 

    """
    prefix = 'colnum_norm'  ### == cols_out
    df = df[col]
    log2(
        "### colnum normalize  #############################################################"
    )
    from util_feature import pd_colnum_normalize as pd_normalize_fun
    colnum = col
    if pars is None:
        pars = {
            'pipe_list': [
                {
                    'name': 'quantile_cutoff'
                },  #  
                {
                    'name': 'fillna',
                    'na_val': 0.0
                },
            ]
        }
    if 'path_pipeline' in pars:  #### Load existing column list
        pars = load(pars['path_pipeline'] + f'/{prefix}_pars.pkl')

    dfnum_norm, colnum_norm = pd_normalize_fun(df,
                                               colname=colnum,
                                               pars=pars,
                                               suffix="_norm",
                                               return_val="dataframe,param")
    log3('dfnum_norm', dfnum_norm.head(4), colnum_norm)
    log3('dfnum_norn NA', dfnum_norm.isna().sum())
    colnew = colnum_norm

    log3(
        "##### Export ######################################################################"
    )
    if 'path_features_store' in pars and 'path_pipeline_export' in pars:
        save_features(dfnum_norm, prefix, pars['path_features_store'])
        save(pars, pars['path_pipeline_export'] + f"/{prefix}_pars.pkl")

    col_pars = {
        'prefix': prefix,
        'path': pars.get('path_pipeline_export',
                         pars.get('path_pipeline', None))
    }
    col_pars['cols_new'] = {
        prefix: colnew  ### list
    }
    return dfnum_norm, col_pars
Example #19
def run_predict(config_name,
                config_path,
                n_sample=-1,
                path_data=None,
                path_output=None,
                pars={},
                model_dict=None):

    model_dict = model_dict_load(model_dict,
                                 config_path,
                                 config_name,
                                 verbose=True)
    m = model_dict['global_pars']

    model_class = model_dict['model_pars']['model_class']
    path_data = m['path_pred_data'] if path_data is None else path_data
    path_pipeline = m['path_pred_pipeline']  #   path_output + "/pipeline/" )
    path_model = m['path_pred_model']

    path_output = m['path_pred_output'] if path_output is None else path_output
    log(path_data, path_model, path_output)

    pars = {
        'cols_group': model_dict['data_pars']['cols_input_type'],
        'pipe_list': model_dict['model_pars']['pre_process_pars']['pipe_list']
    }

    ##########################################################################################
    colid = load(f'{path_pipeline}/colid.pkl')
    df = load_dataset(path_data,
                      path_data_y=None,
                      colid=colid,
                      n_sample=n_sample)

    from run_preprocess import preprocess_inference as preprocess
    dfX, cols_family = preprocess(df, path_pipeline, preprocess_pars=pars)
    ypred, yproba = predict(model_class, path_model, dfX, cols_family)

    log("############ Saving prediction  ###################################################"
        )
    log(ypred.shape, path_output)
    os.makedirs(path_output, exist_ok=True)
    df[cols_family["coly"] + "_pred"] = ypred
    if yproba is not None:
        df[cols_family["coly"] + "_pred_proba"] = yproba
    df.to_csv(f"{path_output}/prediction.csv")
    log(df.head(8))

    log("###########  Export Specific ######################################################"
        )
    df[cols_family["coly"]] = ypred
    df[[cols_family["coly"]]].to_csv(f"{path_output}/pred_only.csv")
Example #20
def predict(model_name, path_model, dfX, cols_family):
    """

    Arguments:
        model_name {[str]} -- [description]
        path_model {[str]} -- [description]
        dfX {[DataFrame]} -- [description]
        cols_family {[dict]} -- [description]

    Returns: ypred
        [numpy.array] -- [vector of prediction]
    """

    modelx = map_model(model_name)
    modelx.reset()
    log(modelx, path_model)
    #log(os.getcwd())
    sys.path.append(root)  #### Needed due to import source error

    log("#### Load model  ############################################")
    print(path_model + "/model/model.pkl")
    # modelx.model = load(path_model + "/model//model.pkl")
    modelx.model = load(path_model + "/model.pkl")

    # stats = load(path_model + "/model/info.pkl")
    # colsX       = load(path_model + "/model/colsX.pkl")   ## column name
    colsX = load(path_model + "/colsX.pkl")  ## column name

    # coly  = load( path_model + "/model/coly.pkl"   )
    assert colsX is not None, "cannot load colsx, " + path_model
    assert modelx.model is not None, "cannot load modelx, " + path_model
    log("#### modelx\n", modelx.model.model)

    log("### Prediction  ############################################")
    dfX1 = dfX.reindex(columns=colsX)  #reindex included

    ypred = modelx.predict(dfX1)

    return ypred
Example #21
def pd_colcat_to_onehot(df, col=None, pars=None):
    """

    """
    log("#### colcat to onehot")
    col = [col] if isinstance(col, str) else col
    if len(col) == 1:
        colnew = [col[0] + "_onehot"]
        df[colnew] = df[col]
        col_pars = {}
        col_pars['colcat_onehot'] = colnew
        col_pars['cols_new'] = {
            # 'colnum'        :  col ,    ###list
            'colcat_onehot': colnew  ### list
        }
        return df[colnew], col_pars

    colcat_onehot = None
    if 'path_pipeline' in pars:
        colcat_onehot = load(pars['path_pipeline'] + '/colcat_onehot.pkl')

    ######################################################################################
    colcat = col
    dfcat_hot, colcat_onehot = util_feature.pd_col_to_onehot(
        df[colcat],
        colname=colcat,
        colonehot=colcat_onehot,
        return_val="dataframe,param")
    log(dfcat_hot[colcat_onehot].head(5))

    ######################################################################################
    if 'path_features_store' in pars:
        save_features(dfcat_hot, 'colcat_onehot', pars['path_features_store'])
        save(colcat_onehot,
             pars['path_pipeline_export'] + "/colcat_onehot.pkl")
        save(colcat, pars['path_pipeline_export'] + "/colcat.pkl")

    col_pars = {}
    col_pars['colcat_onehot'] = colcat_onehot
    col_pars['cols_new'] = {
        # 'colnum'        :  col ,    ###list
        'colcat_onehot': colcat_onehot  ### list
    }

    print("ok ------------")
    return dfcat_hot, col_pars
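util_feature.pd_col_to_onehot is not shown here; conceptually it is close to pandas get_dummies with a fixed column list, and reusing the saved colcat_onehot list at inference is what keeps train and predict columns aligned. An illustrative sketch:

import pandas as pd

df_train = pd.DataFrame({"color": ["red", "blue"]})
df_pred  = pd.DataFrame({"color": ["blue", "green"]})

hot_train     = pd.get_dummies(df_train, columns=["color"])
colcat_onehot = list(hot_train.columns)                       # saved at training time
hot_pred      = (pd.get_dummies(df_pred, columns=["color"])
                   .reindex(columns=colcat_onehot, fill_value=0))
print(hot_pred)                                               # unseen 'green' dropped, missing 'red' filled with 0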
Example #22
def pd_colcat_to_onehot(df, col=None, pars=None):
    dfbum_bin = df[col]
    if len(col) == 1:

        colnew = [col[0] + "_onehot"]
        df[colnew] = df[col]
        col_pars = {}
        col_pars['colcat_onehot'] = colnew
        col_pars['cols_new'] = {
            # 'colnum'        :  col ,    ###list
            'colcat_onehot': colnew  ### list
        }
        return df[colnew], col_pars

    path_pipeline = pars.get('path_pipeline', False)
    colcat_onehot = load(
        f'{path_pipeline}/colcat_onehot.pkl') if path_pipeline else None

    colcat = col
    log("#### colcat to onehot")
    dfcat_hot, colcat_onehot = util_feature.pd_col_to_onehot(
        df[colcat],
        colname=colcat,
        colonehot=colcat_onehot,
        return_val="dataframe,param")
    log(dfcat_hot[colcat_onehot].head(5))

    if 'path_features_store' in pars:
        path_features_store = pars['path_features_store']
        save_features(dfcat_hot, 'colcat_onehot', path_features_store)
        save(colcat_onehot,
             pars['path_pipeline_export'] + "/colcat_onehot.pkl")
        save(colcat, pars['path_pipeline_export'] + "/colcat.pkl")

    col_pars = {}
    col_pars['colcat_onehot'] = colcat_onehot
    col_pars['cols_new'] = {
        # 'colnum'        :  col ,    ###list
        'colcat_onehot': colcat_onehot  ### list
    }

    print("ok ------------")
    return dfcat_hot, col_pars
Example #23
def transform(model_name, path_model, dfX, model_dict, task_type='transform'):
    """
    Arguments:
        model_name {[str]} -- [description]
        path_model {[str]} -- [description]
        dfX {[DataFrame]} -- [description]
        cols_family {[dict]} -- [description]

    Returns: ypred
        [numpy.array] -- [vector of prediction]
    """
    modelx = map_model(model_name)
    modelx.reset()
    log(modelx, path_model)
    sys.path.append(root)  #### Needed due to import source error

    log("#### Load model  ############################################")
    log2(path_model + "/model/model.pkl")
    modelx.model = modelx.load(path_model + "/model.pkl")
    colsX = load(path_model + "/colsX.pkl")  ## column name
    # coly  = load( path_model + "/model/coly.pkl"   )
    assert colsX is not None, "cannot load colsx, " + path_model
    assert modelx.model is not None, "cannot load modelx, " + path_model
    log("#### modelx\n", modelx.model.model)

    log("### Task Inference  #############################################")
    # dfX1  = dfX.reindex(columns=colsX)   #reindex included

    if task_type == 'encode':
        dfX = modelx.encode(dfX,
                            data_pars=model_dict['data_pars'],
                            compute_pars=model_dict['compute_pars'])

    elif task_type == 'decode':
        dfX = modelx.decode(dfX,
                            data_pars=model_dict['data_pars'],
                            compute_pars=model_dict['compute_pars'])
    else:
        dfX = modelx.transform(dfX,
                               data_pars=model_dict['data_pars'],
                               compute_pars=model_dict['compute_pars'])

    return dfX
Example #24
def run_data_check(path_data,
                   path_data_ref,
                   path_model,
                   path_output,
                   sample_ratio=0.5):
    """
     Calcualata Dataset Shift before prediction.
    """
    from run_preprocess import preprocess_inference as preprocess
    path_output = root + path_output
    path_data = root + path_data
    path_data_ref = root + path_data_ref
    path_pipeline = root + path_model + "/pipeline/"

    os.makedirs(path_output, exist_ok=True)
    colid = load(f'{path_pipeline}/colid.pkl')

    df1 = load_dataset(path_data_ref, colid=colid)
    dfX1, cols_family1 = preprocess(df1, path_pipeline)

    df2 = load_dataset(path_data, colid=colid)
    dfX2, cols_family2 = preprocess(df2, path_pipeline)

    colsX = cols_family1["colnum_bin"] + cols_family1["colcat_bin"]
    dfX1 = dfX1[colsX]
    dfX2 = dfX2[colsX]

    from util_feature import pd_stat_dataset_shift
    nsample = int(min(len(dfX1), len(dfX2)) * sample_ratio)
    metrics_psi = pd_stat_dataset_shift(dfX2,
                                        dfX1,
                                        colsX,
                                        nsample=nsample,
                                        buckets=7,
                                        axis=0)
    metrics_psi.to_csv(f"{path_output}/prediction_features_metrics.csv")
    log(metrics_psi)
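pd_stat_dataset_shift is internal to util_feature; judging from the metrics_psi name, the shift measure is a Population Stability Index (PSI) per feature. A minimal PSI sketch using the usual definition sum((p - q) * ln(p / q)) over quantile buckets (illustrative only, not the dsa2 implementation):

import numpy as np

def psi(ref, new, buckets=7, eps=1e-6):
    """PSI of `new` against the reference distribution `ref`."""
    edges = np.quantile(ref, np.linspace(0, 1, buckets + 1))
    p = np.histogram(ref, bins=edges)[0] / len(ref) + eps
    q = np.histogram(np.clip(new, edges[0], edges[-1]), bins=edges)[0] / len(new) + eps
    return float(np.sum((p - q) * np.log(p / q)))

rng = np.random.RandomState(0)
print(psi(rng.normal(0, 1, 5000), rng.normal(0.3, 1, 5000)))  # shifted data -> larger PSI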
Example #25
def predict(model_name, path_model, dfX, cols_family, model_dict):
    """
    Arguments:
        model_name {[str]} -- [description]
        path_model {[str]} -- [description]
        dfX {[DataFrame]} -- [description]
        cols_family {[dict]} -- [description]

    Returns: ypred
        [numpy.array] -- [vector of prediction]
    """
    log("#### Load  model class  ############################################")
    modelx = map_model(model_name)
    assert modelx is not None, "cannot load modelx, " + path_model
    modelx.reset()
    log2(modelx, path_model)
    sys.path.append(root)  #### Needed due to import source error

    log("#### Load existing model weights  #################################")
    log2(path_model + "/model/")
    # modelx.model = load(path_model + "/model//model.pkl")
    # modelx.model = load(path_model + "/model.pkl")
    modelx.load_model(path_model)
    colsX = load(path_model + "/colsX.pkl")  ## column name

    assert colsX is not None, "cannot load colsx, " + path_model
    assert modelx.model is not None, "cannot load modelx, " + path_model
    log2("#### modelx\n", modelx.model)

    log("### Prediction  ###################################################")
    dfX = dfX.reindex(columns=colsX)  #reindex included
    ypred_tuple = modelx.predict(dfX,
                                 data_pars=model_dict['data_pars'],
                                 compute_pars=model_dict['compute_pars'])
    log2('ypred shape', str(ypred_tuple)[:100])
    return ypred_tuple
Example #26
def pd_coltext_universal_google(df, col, pars={}):
    """
     # Universal sentence encoding from Tensorflow
       Text ---> Vectors
    from source.preprocessors import  pd_coltext_universal_google
    https://tfhub.dev/google/universal-sentence-encoder-multilingual/3

    #latest Tensorflow that supports sentencepiece is 1.13.1
    !pip uninstall --quiet --yes tensorflow
    !pip install --quiet tensorflow-gpu==1.13.1
    !pip install --quiet tensorflow-hub
    pip install --quiet tf-sentencepiece, simpleneighbors
    !pip install --quiet simpleneighbors

    # df : dataframe
    # col : list of text colnum names
    pars
    """
    prefix = "coltext_universal_google"
    if 'path_pipeline' in  pars  :   ### Load during Inference
       coltext_embed = load( pars['path_pipeline'] + f"/{prefix}.pkl" )
       pars_model    = load( pars['path_pipeline'] + f"/{prefix}_pars.pkl" )

    ####### Custom Code ###############################################################
    import tensorflow as tf
    import tensorflow_hub as hub
    import tensorflow_text
    #from tqdm import tqdm #progress bar
    uri_list = [
    ]
    url_default = "https://tfhub.dev/google/universal-sentence-encoder-multilingual/3"
    url         = pars.get("model_uri", url_default )
    model       = hub.load( url )
    pars_model  = {}
    dfall       = None
    for coli in col[:1] :
        X = []
        for r in (df[coli]):
            if pd.isnull(r)==True :
                r=""
            emb = model(r)
            review_emb = tf.reshape(emb, [-1]).numpy()
            X.append(review_emb)

        dfi   = pd.DataFrame(X, columns= [ coli + "_" + str(i) for i in range( len(X[0]))   ] ,
                             index = df.index)
        dfall = pd.concat((dfall, dfi), axis=1)  if dfall is not None else dfi

    coltext_embed = list(dfall.columns)


    ##### Export ####################################################################
    if 'path_features_store' in pars and 'path_pipeline_export' in pars:
       save_features(dfall, 'dftext_embed', pars['path_features_store'])
       save(coltext_embed,  pars['path_pipeline_export'] + f"/{prefix}.pkl" )
       save(pars_model,     pars['path_pipeline_export'] + f"/{prefix}_pars.pkl" )
       # save(model,          pars['path_pipeline_export'] + f"/{prefix}_model.pkl" )
       # model_uri = pars['path_pipeline_export'] + f"/{prefix}_model.pkl"


    # col_pars = {'model_uri' :  model_uri, 'pars': pars_model}
    col_pars = {'model_uri' :  url , 'pars': pars_model} # model_uri
    col_pars['cols_new']      = {
       'coltext_universal_google' :  coltext_embed ### list
    }
    return dfall, col_pars
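Outside the pipeline plumbing, the embedding step is plain TensorFlow Hub. A minimal sketch (requires compatible tensorflow, tensorflow_hub and tensorflow_text installs as noted in the docstring, and downloads the model on first use):

import tensorflow_hub as hub
import tensorflow_text                                            # registers the SentencePiece ops

url   = "https://tfhub.dev/google/universal-sentence-encoder-multilingual/3"
model = hub.load(url)
emb   = model(["this film was great", "ce film etait nul"])       # one 512-dim vector per sentence
print(emb.shape)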
Example #27
def pd_col_genetic_transform(df=None, col=None, pars=None):
    """
        Find symbolic formulae for feature engineering

    """
    prefix = 'col_genetic'
    ######################################################################################
    from gplearn.genetic import SymbolicTransformer
    from gplearn.functions import make_function
    import random

    colX = col  # [col_ for col_ in col if col_ not in coly]
    train_X = df[colX].fillna(method='ffill')
    feature_name_ = colX

    def squaree(x):
        return x * x

    square_ = make_function(function=squaree, name='square_', arity=1)

    function_set = pars.get('function_set', [
        'add', 'sub', 'mul', 'div', 'sqrt', 'log', 'abs', 'neg', 'inv', 'tan',
        square_
    ])
    pars_genetic = pars.get(
        'pars_genetic',
        {
            'generations': 5,
            'population_size': 10,  ### Higher than nb_features
            'metric': 'spearman',
            'tournament_size': 20,
            'stopping_criteria': 1.0,
            'const_range': (-1., 1.),
            'p_crossover': 0.9,
            'p_subtree_mutation': 0.01,
            'p_hoist_mutation': 0.01,
            'p_point_mutation': 0.01,
            'p_point_replace': 0.05,
            'parsimony_coefficient': 0.005,  ####   0.00005 Control Complexity
            'max_samples': 0.9,
            'verbose': 1,

            #'n_components'      ### Control number of outtput features  : n_components
            'random_state': 0,
            'n_jobs': 4,
        })

    if 'path_pipeline' in pars:  #### Inference time
        gp = load(pars['path_pipeline'] + f"/{prefix}_model.pkl")
        pars = load(pars['path_pipeline'] + f"/{prefix}_pars.pkl")
    else:  ### Training time
        coly = pars['coly']
        train_y = pars['dfy']
        gp = SymbolicTransformer(
            hall_of_fame=train_X.shape[1] + 1,  ### Buggy
            n_components=pars_genetic.get('n_components', train_X.shape[1]),
            feature_names=feature_name_,
            function_set=function_set,
            **pars_genetic)
        gp.fit(train_X, train_y)

    ##### Transform Data  #########################################
    df_genetic = gp.transform(train_X)
    tag = random.randint(0, 10)  #### UNIQUE TAG
    col_genetic = [f"gen_{tag}_{i}" for i in range(df_genetic.shape[1])]
    df_genetic = pd.DataFrame(df_genetic,
                              columns=col_genetic,
                              index=train_X.index)
    df_genetic.index = train_X.index
    pars_gen_all = {'pars_genetic': pars_genetic, 'function_set': function_set}

    ##### Formulae Extraction ####################################
    formula = str(gp).replace("[", "").replace("]", "")
    flist = formula.split(",\n")
    form_dict = {x: flist[i] for i, x in enumerate(col_genetic)}
    pars_gen_all['formulae_dict'] = form_dict
    log("########## Formulae ", form_dict)
    # col_pars['map_dict'] = dict(zip(train_X.columns.to_list(), feature_name_))

    col_new = col_genetic

    ###################################################################################
    if 'path_features_store' in pars and 'path_pipeline_export' in pars:
        save_features(df_genetic, 'df_genetic', pars['path_features_store'])
        save(gp, pars['path_pipeline_export'] + f"/{prefix}_model.pkl")
        save(col_genetic, pars['path_pipeline_export'] + f"/{prefix}.pkl")
        save(pars_gen_all,
             pars['path_pipeline_export'] + f"/{prefix}_pars.pkl")
        # save(form_dict,      pars['path_pipeline_export'] + f"/{prefix}_formula.pkl")
        save_json(form_dict, pars['path_pipeline_export'] +
                  f"/{prefix}_formula.json")  ### Human readable

    col_pars = {
        'prefix': prefix,
        'path': pars.get('path_pipeline_export',
                         pars.get('path_pipeline', None))
    }
    col_pars['cols_new'] = {
        prefix: col_new  ### list
    }
    return df_genetic, col_pars
Example #28
def pd_coltext(df, col, pars={}):
    """
    df : DataFrame
    col : list of columns
    pars : dict of pars

    """
    from utils import util_text, util_model

    #### Load pars ###################################################################
    path_pipeline        = pars.get('path_pipeline', None)
    word_tokeep_dict_all = load(  path_pipeline + "/word_tokeep_dict_all.pkl" )  if path_pipeline is not None else {}
    # dftext_tdidf_all = load(f'{path_pipeline}/dftext_tdidf.pkl') if  path_pipeline else None
    # dftext_svd_list_all      = load(f'{path_pipeline}/dftext_svd.pkl')   if  path_pipeline else None
    dimpca       = pars.get('dimpca', 2)
    word_minfreq = pars.get('word_minfreq', 3)

    #### Process  ####################################################################
    stopwords           = nlp_get_stopwords()
    dftext              = pd_coltext_clean(df, col, stopwords= stopwords , pars=pars)
    dftext_svd_list_all = None
    dftext_tdidf_all    = None

    ### Process each text column: create (or load at inference) its bag of words -> tf-idf -> SVD
    for col_ in col:

            if path_pipeline is not None:
                ### If it is in Inference step, use the saved bag of word for the column `col_`
                word_tokeep = word_tokeep_dict_all[col_]

            else:
                ### If it is not, create a bag of word
                coltext_freq, word_tokeep = pd_coltext_wordfreq(df, col_, stopwords, ntoken=100)  ## nb of words to keep
                word_tokeep_dict_all[col_] = word_tokeep  ## save the bag of words for `col_` in a dict

            dftext_tdidf_dict, word_tokeep_dict = util_text.pd_coltext_tdidf(dftext, coltext=col_, word_minfreq= word_minfreq,
                                                                             word_tokeep = word_tokeep,
                                                                             return_val  = "dataframe,param")

            dftext_tdidf_all = pd.DataFrame(dftext_tdidf_dict) if dftext_tdidf_all is None else pd.concat((dftext_tdidf_all,pd.DataFrame(dftext_tdidf_dict)),axis=1)
            log(word_tokeep_dict)

            ###  Dimension reduction for the sparse matrix
            dftext_svd_list, svd_list = util_model.pd_dim_reduction(dftext_tdidf_dict,
                                                           colname        = None,
                                                           model_pretrain = None,
                                                           colprefix      = col_ + "_svd",
                                                           method         = "svd",  dimpca=dimpca,  return_val="dataframe,param")

            dftext_svd_list_all = dftext_svd_list if dftext_svd_list_all is None else pd.concat((dftext_svd_list_all,dftext_svd_list),axis=1)
    #################################################################################

    ###### Save and Export ##########################################################
    if 'path_features_store' in pars:
            save_features(dftext_svd_list_all, 'dftext_svd' + "-" + str(col), pars['path_features_store'])
            # save(dftext_svd_list_all,  pars['path_pipeline_export'] + "/dftext_svd.pkl")
            # save(dftext_tdidf_all,     pars['path_pipeline_export'] + "/dftext_tdidf.pkl" )
            save(word_tokeep_dict_all,     pars['path_pipeline_export'] + "/word_tokeep_dict_all.pkl" )

    col_pars = {}
    col_pars['cols_new'] = {
     # 'coltext_tdidf'    : dftext_tdidf_all.columns.tolist(),       ### list
     'coltext_svd'      : dftext_svd_list_all.columns.tolist()      ### list
    }

    dftext_svd_list_all.index = dftext.index
    # return pd.concat((dftext_svd_list_all,dftext_svd_list_all),axis=1), col_pars
    return dftext_svd_list_all, col_pars
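util_text.pd_coltext_tdidf and util_model.pd_dim_reduction are dsa2 helpers. The same "bag of words -> tf-idf -> truncated SVD" chain can be sketched with scikit-learn to show what the output columns contain (illustrative only, not the dsa2 implementation):

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

df    = pd.DataFrame({"review": ["good movie", "bad movie", "good plot bad acting"]})
tfidf = TfidfVectorizer(max_features=100).fit_transform(df["review"])
svd   = TruncatedSVD(n_components=2, random_state=0).fit_transform(tfidf)
dftext_svd = pd.DataFrame(svd, columns=["review_svd_0", "review_svd_1"], index=df.index)
print(dftext_svd)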
Example #29
def pd_augmentation_sdv(df, col=None, pars={})  :
    '''
    Using SDV Variational Autoencoders, the function augments the dataset with synthetic data
    params:
            df          : (pandas dataframe) original dataframe
            col         : column names for data enhancement
            pars        : (dict - optional) contains:
                n_samples      : (int - optional) number of samples to add, default is 10%
                primary_key    : (String - optional) the primary key of the dataframe
                aggregate      : (boolean - optional) if False, prints the SDV metrics, else averages them
                path_model_save: saving location if save_model is set to True
                path_model_load: saved model location, to skip training
                path_data_new  : location where the new data is saved
    returns:
            df_new      : (pandas dataframe) df with the augmented data appended
            col         : (list of strings) same columns
    '''
    n_samples       = pars.get('n_samples', max(1, int(len(df) * 0.10) ) )   ## Add 10% or 1 sample by default value
    primary_key     = pars.get('colid', None)  ### Custom can be created on the fly
    metrics_type    = pars.get('aggregate', False)
    path_model_save = pars.get('path_model_save', 'data/output/ztmp/')
    model_name      = pars.get('model_name', "TVAE")

    # model fitting
    if 'path_model_load' in pars:
            model = load(pars['path_model_load'])
    else:
            log('##### Training Started #####')

            model = {'TVAE' : TVAE, 'CTGAN' : CTGAN, 'PAR' : PAR}[model_name]
            if model_name == 'PAR':
                model = model(entity_columns = pars['entity_columns'],
                              context_columns = pars['context_columns'],
                              sequence_index = pars['sequence_index'])
            else:
                model = model(primary_key=primary_key)
            model.fit(df)
            log('##### Training Finished #####')
            try:
                 save(model, path_model_save )
                 log('model saved at: ', path_model_save  )
            except:
                 log('saving model failed: ', path_model_save)

    log('##### Generating Samples #############')
    new_data = model.sample(n_samples)
    log_pd( new_data, n=7)


    log('######### Evaluation Results #########')
    if metrics_type == True:
      evals = evaluate(new_data, df, aggregate= True )
      log(evals)
    else:
      evals = evaluate(new_data, df, aggregate= False )
      log_pd(evals, n=7)

    # appending new data
    df_new = df.append(new_data)
    log(str(len(df_new) - len(df)) + ' new data added')

    if 'path_newdata' in pars :
        new_data.to_parquet( pars['path_newdata'] + '/features.parquet' )
        log('###### df augmentation save on disk', pars['path_newdata'] )

    log('###### augmentation complete ######')
    return df_new, col
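TVAE, CTGAN, PAR and evaluate come from the sdv package (the 0.x API used above; newer SDV releases renamed these classes). A stand-alone sketch of the same flow on a toy dataframe, with an illustrative sample size:

import pandas as pd
from sdv.tabular import TVAE                      # sdv < 1.0 API, matching the code above
from sdv.evaluation import evaluate

df    = pd.DataFrame({"id": range(100), "age": range(18, 118), "income": range(100)})
model = TVAE(primary_key="id")
model.fit(df)
new_data = model.sample(10)                       # 10 synthetic rows
print(evaluate(new_data, df, aggregate=True))     # single aggregated quality score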
Example #30
def pd_colnum_quantile_norm(df, col, pars={}):
    """
     colnum normalization by quantile
  """
    prefix = "colnum_quantile_norm"
    df = df[col]
    num_col = col

    ##### Grab previous computed params  ################################################
    pars2 = {}
    if 'path_pipeline' in pars:  #### Load existing column list
        colnum_quantile_norm = load(pars['path_pipeline'] + f'/{prefix}.pkl')
        model = load(pars['path_pipeline'] + f'/{prefix}_model.pkl')
        pars2 = load(pars['path_pipeline'] + f'/{prefix}_pars.pkl')

    lower_bound_sparse = pars2.get('lower_bound_sparse', None)
    upper_bound_sparse = pars2.get('upper_bound_sparse', None)
    lower_bound = pars2.get('lower_bound', None)
    upper_bound = pars2.get('upper_bound', None)
    sparse_col = pars2.get('colsparse', ['capital-gain', 'capital-loss'])

    ####### Find IQR and implement to numericals and sparse columns seperately ##########
    Q1 = df.quantile(0.25)
    Q3 = df.quantile(0.75)
    IQR = Q3 - Q1

    for col in num_col:
        if col in sparse_col:
            df_nosparse = pd.DataFrame(df[df[col] != df[col].mode()[0]][col])

            if lower_bound_sparse is not None:
                pass

            elif df_nosparse[col].quantile(
                    0.25) < df[col].mode()[0]:  #Unexpected case
                lower_bound_sparse = df_nosparse[col].quantile(0.25)

            else:
                lower_bound_sparse = df[col].mode()[0]

            if upper_bound_sparse is not None:
                pass

            elif df_nosparse[col].quantile(
                    0.75) < df[col].mode()[0]:  #Unexpected case
                upper_bound_sparse = df[col].mode()[0]

            else:
                upper_bound_sparse = df_nosparse[col].quantile(0.75)

            n_outliers = len(df[(df[col] < lower_bound_sparse) |
                                (df[col] > upper_bound_sparse)][col])

            if n_outliers > 0:
                df.loc[df[col] < lower_bound_sparse,
                       col] = lower_bound_sparse * 0.75  #--> MAIN DF CHANGED
                df.loc[df[col] > upper_bound_sparse,
                       col] = upper_bound_sparse * 1.25  # --> MAIN DF CHANGED

        else:
            if lower_bound is None or upper_bound is None:
                lower_bound = df[col].quantile(0.25) - 1.5 * IQR[col]
                upper_bound = df[col].quantile(0.75) + 1.5 * IQR[col]

            df[col] = np.where(df[col] > upper_bound, 1.25 * upper_bound,
                               df[col])
            df[col] = np.where(df[col] < lower_bound, 0.75 * lower_bound,
                               df[col])

    df.columns = [t + "_qt_norm" for t in df.columns]
    pars_new = {
        'lower_bound': lower_bound,
        'upper_bound': upper_bound,
        'lower_bound_sparse': lower_bound_sparse,
        'upper_bound_sparse': upper_bound_sparse
    }
    dfnew = df
    model = None
    colnew = list(df.columns)

    ##### Export ##############################################################################
    if 'path_features_store' in pars and 'path_pipeline_export' in pars:
        save_features(df, prefix, pars['path_features_store'])
        save(colnew, pars['path_pipeline_export'] + f"/{prefix}.pkl")
        save(pars_new, pars['path_pipeline_export'] + f"/{prefix}_pars.pkl")
        save(model, pars['path_pipeline_export'] + f"/{prefix}_model.pkl")

    col_pars = {
        'prefix': prefix,
        'path': pars.get('path_pipeline_export',
                         pars.get('path_pipeline', None))
    }
    col_pars['cols_new'] = {
        prefix: colnew  ### list
    }
    return dfnew, col_pars
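For a plain (non-sparse) numeric column the rule above reduces to IQR-based winsorizing: values above Q3 + 1.5*IQR are pulled down to 1.25 * upper_bound, values below Q1 - 1.5*IQR pushed up to 0.75 * lower_bound. A compact sketch of just that rule:

import numpy as np
import pandas as pd

s  = pd.Series([1, 2, 3, 4, 5, 100])               # 100 is an outlier
q1, q3 = s.quantile(0.25), s.quantile(0.75)
iqr    = q3 - q1
lower, upper = q1 - 1.5 * iqr, q3 + 1.5 * iqr
s_clipped = np.where(s > upper, 1.25 * upper, np.where(s < lower, 0.75 * lower, s))
print(s_clipped)                                    # outlier capped at 1.25 * upper bound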