def pd_colcat_encoder_generic(df, col, pars):
    """ Generic categorical encoder, wrapping category_encoders.
        https://pypi.org/project/category-encoders/
        encoder = ce.BackwardDifferenceEncoder(cols=[...])
        encoder = ce.BaseNEncoder(cols=[...])
        encoder = ce.BinaryEncoder(cols=[...])
        encoder = ce.CatBoostEncoder(cols=[...])
        encoder = ce.CountEncoder(cols=[...])
        encoder = ce.GLMMEncoder(cols=[...])
        encoder = ce.HashingEncoder(cols=[...])
        encoder = ce.HelmertEncoder(cols=[...])
        encoder = ce.JamesSteinEncoder(cols=[...])
        encoder = ce.LeaveOneOutEncoder(cols=[...])
        encoder = ce.MEstimateEncoder(cols=[...])
        encoder = ce.OneHotEncoder(cols=[...])
        encoder = ce.OrdinalEncoder(cols=[...])
        encoder = ce.SumEncoder(cols=[...])
        encoder = ce.PolynomialEncoder(cols=[...])
        encoder = ce.TargetEncoder(cols=[...])
        encoder = ce.WOEEncoder(cols=[...])
    """
    prefix     = "colcat_encoder_generic"
    pars_model = None
    if 'path_pipeline' in pars:  ### Load during Inference
        colcat_encoder = load(pars['path_pipeline'] + f"/{prefix}.pkl")
        pars_model     = load(pars['path_pipeline'] + f"/{prefix}_pars.pkl")
        # model        = load(pars['path_pipeline'] + f"/{prefix}_model.pkl")

    ####### Custom Code ###############################################################
    from category_encoders import HashingEncoder, WOEEncoder
    pars_model         = pars.get('model_pars', {}) if pars_model is None else pars_model
    pars_model['cols'] = col
    model_name         = pars.get('model_name', 'HashingEncoder')

    model_class   = {'HashingEncoder': HashingEncoder}[model_name]
    model         = model_class(**pars_model)
    dfcat_encoder = model.fit_transform(df[col])

    dfcat_encoder.columns = [t + "_cod" for t in dfcat_encoder.columns]
    colcat_encoder        = list(dfcat_encoder.columns)

    ###################################################################################
    if 'path_features_store' in pars and 'path_pipeline_export' in pars:
        save_features(dfcat_encoder, 'dfcat_encoder', pars['path_features_store'])
        save(model,          pars['path_pipeline_export'] + f"/{prefix}_model.pkl")
        save(pars_model,     pars['path_pipeline_export'] + f"/{prefix}_pars.pkl")
        save(colcat_encoder, pars['path_pipeline_export'] + f"/{prefix}.pkl")

    col_pars = {'prefix': prefix,
                'path': pars.get('path_pipeline_export', pars.get('path_pipeline', None))}
    col_pars['cols_new'] = {
        'colcat_encoder_generic': colcat_encoder  ### list
    }
    return dfcat_encoder, col_pars
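# ----------------------------------------------------------------------------------
# Minimal usage sketch for pd_colcat_encoder_generic (illustrative only; column
# names below are made up). Assumes category_encoders is installed; no
# 'path_features_store' / 'path_pipeline_export' keys are passed, so the save()
# branch is skipped and nothing is persisted.
def example_colcat_encoder_generic():
    import pandas as pd
    df   = pd.DataFrame({"city": ["tokyo", "paris", "tokyo", "lima"],
                         "sex" : ["m", "f", "f", "m"]})
    pars = {"model_name": "HashingEncoder",
            "model_pars": {"n_components": 4}}  # HashingEncoder kwargs
    df_enc, col_pars = pd_colcat_encoder_generic(df, ["city", "sex"], pars)
    print(df_enc.head(), col_pars["cols_new"])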
def predict(model_name, path_model, dfX, cols_family):
    """
       if config_name in ['ElasticNet', 'ElasticNetCV', 'LGBMRegressor', 'LGBMModel', 'TweedieRegressor', 'Ridge']:
           from models import model_sklearn as modelx
       elif config_name == 'model_bayesian_pyro':
           from models import model_bayesian_pyro as modelx
       elif config_name == 'model_widedeep':
           from models import model_widedeep as modelx
    """
    modelx = map_model(model_name)
    modelx.reset()
    log(modelx, path_model)
    # log(os.getcwd())
    sys.path.append(root)  #### Needed due to import source error

    modelx.model = load(path_model + "/model/model.pkl")
    # stats      = load(path_model + "/model/info.pkl")
    colsX = load(path_model + "/model/colsX.pkl")  ## column names
    # coly = load(path_model + "/model/coly.pkl")
    assert colsX is not None
    assert modelx.model is not None
    log(modelx.model.model)

    ### Prediction
    dfX1  = dfX.reindex(columns=colsX)  # reindex included
    ypred = modelx.predict(dfX1)
    return ypred
def pd_colcat_symbolic(df, col, pars):
    """ Symbolic feature generation with gplearn.
        https://github.com/arita37/deltapy
        pip install deltapy
    """
    pars_encoder         = pars
    pars_encoder['cols'] = col
    if 'path_pipeline_export' in pars:
        try:
            pars_encoder  = load(pars['path_pipeline_export'] + '/col_genetic_pars.pkl')
            model_encoder = load(pars['path_pipeline_export'] + '/col_genetic_model.pkl')
            col_encoder   = load(pars['path_pipeline_export'] + '/col_genetic.pkl')
        except Exception:
            pass

    ###################################################################################
    coly = pars['coly']
    from gplearn.genetic import SymbolicTransformer
    function_set = ['add', 'sub', 'mul', 'div', 'sqrt', 'log',
                    'abs', 'neg', 'inv', 'tan']

    gp = SymbolicTransformer(generations=20, population_size=200,
                             hall_of_fame=100, n_components=10,
                             function_set=function_set,
                             parsimony_coefficient=0.0005,
                             max_samples=0.9, verbose=1,
                             random_state=0, n_jobs=6)

    gen_feats = gp.fit_transform(df[col], df[coly])
    gen_feats = pd.DataFrame(gen_feats,
                             columns=["gen_" + str(a) for a in range(gen_feats.shape[1])])
    gen_feats.index = df.index
    dfnew = gen_feats

    ###################################################################################
    colnew = list(dfnew.columns)
    if 'path_features_store' in pars and 'path_pipeline_export' in pars:
        save_features(dfnew, 'dfgen', pars['path_features_store'])
        save(gp,           pars['path_pipeline_export'] + "/col_genetic_model.pkl")
        save(pars_encoder, pars['path_pipeline_export'] + "/col_genetic_pars.pkl")
        save(colnew,       pars['path_pipeline_export'] + "/col_genetic.pkl")

    col_pars = {'model': gp}
    col_pars['cols_new'] = {
        'col_genetic': colnew  ### list
    }
    return dfnew, col_pars
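# ----------------------------------------------------------------------------------
# Usage sketch for pd_colcat_symbolic (illustrative data; assumes gplearn is
# installed). The target column name is passed through pars['coly']; no export
# path is given, so nothing is persisted.
def example_colcat_symbolic():
    import numpy as np, pandas as pd
    df = pd.DataFrame(np.random.rand(100, 3), columns=["f1", "f2", "f3"])
    df["y"] = (df["f1"] * df["f2"] > 0.25).astype(int)
    dfnew, col_pars = pd_colcat_symbolic(df, ["f1", "f2", "f3"], {"coly": "y"})
    print(dfnew.shape, col_pars["cols_new"]["col_genetic"])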
def predict(model_name, path_model, dfX, cols_family):
    """
    """
    modelx = map_model(model_name)
    modelx.reset()
    log(modelx, path_model)
    # log(os.getcwd())
    sys.path.append(root)  #### Needed due to import source error

    log("#### Load model ############################################")
    print(path_model + "/model/model.pkl")
    # modelx.model = load(path_model + "/model//model.pkl")
    modelx.model = load(path_model + "/model.pkl")
    # stats      = load(path_model + "/model/info.pkl")
    # colsX      = load(path_model + "/model/colsX.pkl")  ## column names
    colsX = load(path_model + "/colsX.pkl")  ## column names
    # coly = load(path_model + "/model/coly.pkl")
    assert colsX is not None, "cannot load colsx, " + path_model
    assert modelx.model is not None, "cannot load modelx, " + path_model
    log("#### modelx\n", modelx.model.model)

    log("### Prediction ############################################")
    dfX1  = dfX.reindex(columns=colsX)  # reindex included
    ypred = modelx.predict(dfX1)
    return ypred
def pd_ts_deltapy2(df=None, col=None, pars={}):
    """ Deltapy time-series transforms.
        pars : {'name': "robust_scaler", 'pars': {}}
    """
    prefix = 'colts_deltapy'

    ###### Custom code ################################################################
    dfin       = df.fillna(method='ffill')
    model_name = pars['name']
    model_pars = pars.get('pars', {})

    if 'path_pipeline' in pars:  #### Prediction time
        model = load(pars['path_pipeline'] + f"/{prefix}_model.pkl")
        pars  = load(pars['path_pipeline'] + f"/{prefix}_pars.pkl")
    else:  ### Training time : Dynamic function load
        from util_feature import load_function_uri
        ##### transform.robust_scaler(df, drop=["Close_1"])
        model = load_function_uri(model_name)  # was load_function_uri2, which is never imported

    ##### Transform Data ############################################################
    df_out = model(dfin, **model_pars)

    # "::" is stripped from the model name so it can be used in column names.
    model_name2 = model_name.replace("::", "-")
    if 'extract' in model_name:
        # Extract only returns one value, so there are no columns to loop over.
        col_out = ["0_" + model_name2]
    else:
        col_out = [coli + "_" + model_name2 for coli in df_out.columns]
    df_out.columns = col_out
    col_new        = col_out

    ###### Export #####################################################################
    if 'path_features_store' in pars and 'path_pipeline_export' in pars:
        save_features(df_out, 'df_' + prefix, pars['path_features_store'])
        save(model,   pars['path_pipeline_export'] + f"/{prefix}_model.pkl")
        save(col_new, pars['path_pipeline_export'] + f"/{prefix}.pkl")
        save(pars,    pars['path_pipeline_export'] + f"/{prefix}_pars.pkl")

    col_pars = {'prefix': prefix,
                'path': pars.get('path_pipeline_export', pars.get('path_pipeline', None))}
    col_pars['cols_new'] = {
        prefix: col_new  ### list of columns
    }
    return df_out, col_pars
def pd_colts_generate(df=None, col=None, pars={}):
    """ pars : {'model_name': "transform.robust_scaler", 'model_pars': {}}
    """
    prefix = 'colts_generate'

    ###### Custom code ################################################################
    dfin       = df[col].fillna(method='ffill')
    model_name = pars['model_name']
    model_pars = pars.get('model_pars', {})

    if 'path_pipeline' in pars:  #### Prediction time
        model = load(pars['path_pipeline'] + f"/{prefix}_model.pkl")
        pars  = load(pars['path_pipeline'] + f"/{prefix}_pars.pkl")
    else:  ### Training time : Dynamic function load
        from util_feature import load_function_uri
        ##### transform.robust_scaler(df, drop=["Close_1"])
        model = load_function_uri(model_name)

    model_name = model_name.replace(".", "_")

    ##### Transform Data ############################################################
    df_out         = model(dfin, col, **model_pars)
    col_out        = [coli + "_" + model_name for coli in df_out.columns]
    df_out.columns = col_out
    df_out.index   = dfin.index  # was train_X.index: train_X is undefined in this scope
    col_new        = col_out

    ###### Export #####################################################################
    if 'path_features_store' in pars and 'path_pipeline_export' in pars:
        save_features(df_out, 'df_' + prefix, pars['path_features_store'])
        save(model,   pars['path_pipeline_export'] + f"/{prefix}_model.pkl")
        save(col_new, pars['path_pipeline_export'] + f"/{prefix}.pkl")
        save(pars,    pars['path_pipeline_export'] + f"/{prefix}_pars.pkl")

    col_pars = {'prefix': prefix,
                'path': pars.get('path_pipeline_export', pars.get('path_pipeline', None))}
    col_pars['cols_new'] = {
        prefix: col_new  ### list of columns
    }
    return df_out, col_pars
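# ----------------------------------------------------------------------------------
# Hypothetical usage sketch for pd_colts_generate. Two assumptions, not verified
# here: load_function_uri can resolve the URI below, and the resolved callable
# accepts (df, cols, **model_pars), which is how pd_colts_generate invokes it.
def example_colts_generate():
    import numpy as np, pandas as pd
    df   = pd.DataFrame({"Close": np.random.rand(50).cumsum()})
    pars = {"model_name": "transform.robust_scaler",  # URI format assumed
            "model_pars": {}}
    df_out, col_pars = pd_colts_generate(df, ["Close"], pars)
    print(df_out.head(), col_pars["cols_new"])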
def prepro_load(prefix, pars):
    """ Load previously saved preprocessors
    :param prefix:
    :param pars:
    :return:
    """
    prepro     = None
    pars_saved = None
    cols_saved = None
    if "path_pipeline" in pars:
        prepro     = load(pars["path_pipeline"] + f"/{prefix}_model.pkl")
        pars_saved = load(pars["path_pipeline"] + f"/{prefix}_pars.pkl")
        cols_saved = load(pars["path_pipeline"] + f"/{prefix}_cols.pkl")
    return prepro, pars_saved, cols_saved
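# ----------------------------------------------------------------------------------
# Save-side counterpart sketch (an assumption, not an existing helper): prepro_load
# expects three pickles written with the module-level save() under exactly these names.
def prepro_save_sketch(prefix, pars, prepro, pars_used, cols_used):
    if "path_pipeline_export" in pars:
        save(prepro,    pars["path_pipeline_export"] + f"/{prefix}_model.pkl")
        save(pars_used, pars["path_pipeline_export"] + f"/{prefix}_pars.pkl")
        save(cols_used, pars["path_pipeline_export"] + f"/{prefix}_cols.pkl")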
def run_model_check(path_output, scoring):
    """
    :param path_output:
    :param scoring:
    :return:
    """
    import pandas as pd
    try:
        #### Load model
        from source.util_feature import load
        from source.models import model_sklearn as modelx
        import sys
        from source import models
        sys.modules['models'] = models

        dir_model    = path_output
        modelx.model = load(dir_model + "/model/model.pkl")
        stats        = load(dir_model + "/model/info.pkl")
        colsX        = load(dir_model + "/model/colsX.pkl")
        coly         = load(dir_model + "/model/coly.pkl")
        print(stats)
        print(modelx.model.model)

        ### Metrics on test data
        log(stats['metrics_test'])

        #### Loading training data ######################################################
        dfX = pd.read_csv(dir_model + "/check/dfX.csv")  # to load csv
        # dfX = pd.read_parquet(dir_model + "/check/dfX.parquet")  # to load parquet
        dfy     = dfX[coly]
        colused = colsX

        dfXtest = pd.read_csv(dir_model + "/check/dfXtest.csv")  # to load csv
        # dfXtest = pd.read_parquet(dir_model + "/check/dfXtest.parquet")  # to load parquet
        dfytest = dfXtest[coly]
        print(dfX.shape, dfXtest.shape)

        #### Feature importance on training data #######################################
        from util_feature import feature_importance_perm
        lgb_featimpt_train, _ = feature_importance_perm(modelx, dfX[colused], dfy,
                                                        colused,
                                                        n_repeats=1,
                                                        scoring=scoring)
        print(lgb_featimpt_train)
    except Exception as e:
        log("run_model_check failed:", e)  # was a silent `except: pass`
def preprocess_load(path_train_X="", path_train_y="", path_pipeline_export="",
                    cols_group=None, n_sample=5000, preprocess_pars={},
                    path_features_store=None):
    """ Load the pre-computed dataframe
    :param path_train_X:
    :param path_train_y:
    :param path_pipeline_export:
    :param cols_group:
    :param n_sample:
    :param preprocess_pars:
    :param path_features_store:
    :return:
    """
    from source.util_feature import load

    dfXy = pd.read_parquet(path_features_store + "/dfX/features.parquet")
    try:
        dfy  = pd.read_parquet(path_features_store + "/dfy/features.parquet")
        dfXy = dfXy.join(dfy, on=cols_group['colid'], how="left")
    except Exception:
        log('Error no label', path_features_store + "/dfy/features.parquet")

    cols_family = load(f'{path_pipeline_export}/cols_family.pkl')
    return dfXy, cols_family
def pd_colnum_binto_onehot(df, col=None, pars=None):
    assert isinstance(col, list) and isinstance(df, pd.DataFrame)

    dfnum_bin  = df[col]
    colnum_bin = col

    path_pipeline = pars.get('path_pipeline', False)
    colnum_onehot = load(f'{path_pipeline}/colnum_onehot.pkl') if path_pipeline else None

    log("###### colnum bin to One Hot  #################################################")
    from util_feature import pd_col_to_onehot
    dfnum_hot, colnum_onehot = pd_col_to_onehot(dfnum_bin[colnum_bin], colname=colnum_bin,
                                                colonehot=colnum_onehot,
                                                return_val="dataframe,param")
    log(colnum_onehot)

    if 'path_features_store' in pars:
        save_features(dfnum_hot, 'colnum_onehot', pars['path_features_store'])
        save(colnum_onehot, pars['path_pipeline_export'] + "/colnum_onehot.pkl")

    col_pars = {}
    col_pars['colnum_onehot'] = colnum_onehot
    col_pars['cols_new'] = {
        # 'colnum'       : col,           ### list
        'colnum_onehot': colnum_onehot  ### list
    }
    return dfnum_hot, col_pars
def run_predict(model_name, path_model, path_data, path_output, n_sample=-1):
    path_output   = root + path_output
    path_data     = root + path_data + "/features.zip"  # .zip
    path_model    = root + path_model
    path_pipeline = path_model + "/pipeline/"
    path_test_X   = path_data  # path to testing features; the .zip suffix was already appended above
    log(path_data, path_model, path_output)

    colid = load(f'{path_pipeline}/colid.pkl')
    df    = load_dataset(path_data, path_data_y=None, colid=colid, n_sample=n_sample)

    dfX, cols_family = preprocess(df, path_pipeline)
    ypred, yproba    = predict(model_name, path_model, dfX, cols_family)

    log("Saving prediction", ypred.shape, path_output)
    os.makedirs(path_output, exist_ok=True)
    df[cols_family["coly"] + "_pred"] = ypred
    if yproba is not None:
        df[cols_family["coly"] + "_pred_proba"] = yproba
    df.to_csv(f"{path_output}/prediction.csv")
    log(df.head(8))

    ##### Export Specific
    df[cols_family["coly"]] = ypred
    df[[cols_family["coly"]]].to_csv(f"{path_output}/pred_only.csv")
def pd_colcat_minhash(df, col, pars):
    """ MinHash encoding for categorical columns
        https://booking.ai/dont-be-tricked-by-the-hashing-trick-192a6aae3087
    """
    prefix = 'colcat_minhash'
    colcat = col

    pars_minhash = {'n_component': [4, 2], 'model_pretrain_dict': None, }
    if 'path_pipeline_export' in pars:
        try:
            pars_minhash = load(pars['path_pipeline_export'] + '/colcat_minhash_pars.pkl')
        except Exception:
            pass

    log("#### Colcat to Hash encoding #############################################")
    from utils import util_text
    dfcat_bin, col_hash_model = util_text.pd_coltext_minhash(df[colcat], colcat,
                                                             return_val="dataframe,param",
                                                             **pars_minhash)
    colcat_minhash = list(dfcat_bin.columns)
    log(col_hash_model)

    ###################################################################################
    if 'path_features_store' in pars and 'path_pipeline_export' in pars:
        save_features(dfcat_bin, prefix, pars['path_features_store'])
        save(colcat_minhash, pars['path_pipeline_export'] + f"/{prefix}.pkl")
        save(pars_minhash,   pars['path_pipeline_export'] + f"/{prefix}_pars.pkl")
        save(col_hash_model, pars['path_pipeline_export'] + f"/{prefix}_model.pkl")

    col_pars = {}
    col_pars['col_hash_model'] = col_hash_model
    col_pars['cols_new'] = {
        'colcat_minhash': colcat_minhash  ### list
    }
    return dfcat_bin, col_pars
def pd_colcat_bin(df, col=None, pars=None):
    # dfcat_bin = df[col]
    path_pipeline  = pars.get('path_pipeline', False)
    colcat_bin_map = load(f'{path_pipeline}/colcat_bin_map.pkl') if path_pipeline else None
    colcat         = [col] if isinstance(col, str) else col

    log("#### Colcat to integer encoding ")
    dfcat_bin, colcat_bin_map = util_feature.pd_colcat_toint(df[colcat], colname=colcat,
                                                             colcat_map=colcat_bin_map,
                                                             suffix="_int")
    colcat_bin = list(dfcat_bin.columns)

    ##### Colcat processing ################################################################
    colcat_map = util_feature.pd_colcat_mapping(df, colcat)
    log(df[colcat].dtypes, colcat_map)

    if 'path_features_store' in pars:
        save_features(dfcat_bin, 'dfcat_bin', pars['path_features_store'])
        save(colcat_bin_map, pars['path_pipeline_export'] + "/colcat_bin_map.pkl")
        save(colcat_bin,     pars['path_pipeline_export'] + "/colcat_bin.pkl")

    col_pars = {}
    col_pars['colcat_bin_map'] = colcat_bin_map
    col_pars['cols_new'] = {
        'colcat'    : col,        ### list
        'colcat_bin': colcat_bin  ### list
    }
    return dfcat_bin, col_pars
def pd_colnum_bin(df, col, pars):
    from util_feature import pd_colnum_tocat

    path_pipeline = pars.get('path_pipeline', False)
    colnum_binmap = load(f'{path_pipeline}/colnum_binmap.pkl') if path_pipeline else None
    log(colnum_binmap)
    colnum = col

    log("### colnum Map numerics to Category bin  ###########################################")
    dfnum_bin, colnum_binmap = pd_colnum_tocat(df, colname=colnum, colexclude=None,
                                               colbinmap=colnum_binmap,
                                               bins=10, suffix="_bin",
                                               method="uniform",
                                               return_val="dataframe,param")
    log(colnum_binmap)

    ### Renaming column_bin with suffix
    colnum_bin = [x + "_bin" for x in list(colnum_binmap.keys())]
    log(colnum_bin)

    if 'path_features_store' in pars:
        scol = "_".join(col[:5])
        save_features(dfnum_bin, 'colnum_bin' + "-" + scol, pars['path_features_store'])
        save(colnum_binmap, pars['path_pipeline_export'] + "/colnum_binmap.pkl")
        save(colnum_bin,    pars['path_pipeline_export'] + "/colnum_bin.pkl")

    col_pars = {}
    col_pars['colnumbin_map'] = colnum_binmap
    col_pars['cols_new'] = {
        'colnum'    : col,        ### list
        'colnum_bin': colnum_bin  ### list
    }
    return dfnum_bin, col_pars
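# ----------------------------------------------------------------------------------
# Usage sketch for pd_colnum_bin (column names are made up): maps two numeric
# columns into 10 uniform bins. pars={} means no pipeline state is loaded and
# nothing is saved.
def example_colnum_bin():
    import numpy as np, pandas as pd
    df = pd.DataFrame({"age" : np.random.randint(18, 90, 200),
                       "fare": np.random.rand(200) * 100})
    dfnum_bin, col_pars = pd_colnum_bin(df, ["age", "fare"], pars={})
    print(dfnum_bin.head(), col_pars["cols_new"]["colnum_bin"])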
def pd_colcross(df, col, pars):
    """ cross_feature_new = feat1 X feat2  (pair features)
    """
    log("##### Cross Features From OneHot Features   ######################################")
    prefix = 'colcross_onehot'
    # params_check(pars, [('dfcat_hot', pd.DataFrame), 'colid', ])
    import copy
    from util_feature import pd_feature_generate_cross

    dfcat_hot = pars['dfcat_hot']
    colid     = pars['colid']
    try:
        dfnum_hot = pars['dfnum_hot']
        df_onehot = dfcat_hot.join(dfnum_hot, on=colid, how='left')
    except Exception:
        df_onehot = copy.deepcopy(dfcat_hot)

    colcross_single = pars['colcross_single']
    pars_model      = {'pct_threshold': 0.02, 'm_combination': 2}
    if 'path_pipeline' in pars:  #### Load existing column list
        colcross_single = load(pars['path_pipeline'] + f'/{prefix}_select.pkl')
        # pars_model    = load(pars['path_pipeline'] + f'/{prefix}_pars.pkl')

    colcross_single_onehot_select = []  ## Select existing columns
    for t in list(df_onehot.columns):
        for c1 in colcross_single:
            if c1 in t:
                colcross_single_onehot_select.append(t)

    df_onehot = df_onehot[colcross_single_onehot_select]
    dfcross_hot, colcross_pair = pd_feature_generate_cross(df_onehot,
                                                           colcross_single_onehot_select,
                                                           **pars_model)
    log(dfcross_hot.head(2).T)
    colcross_pair_onehot = list(dfcross_hot.columns)
    model = None

    ##############################################################################
    if 'path_features_store' in pars:
        save_features(dfcross_hot, 'colcross_onehot', pars['path_features_store'])
        save(colcross_single_onehot_select, pars['path_pipeline_export'] + f'/{prefix}_select.pkl')
        save(colcross_pair,        pars['path_pipeline_export'] + f'/{prefix}_stats.pkl')
        save(colcross_pair_onehot, pars['path_pipeline_export'] + f'/{prefix}_pair.pkl')
        save(model,                pars['path_pipeline_export'] + f'/{prefix}_pars.pkl')

    col_pars = {'model': model, 'stats': colcross_pair}
    col_pars['cols_new'] = {
        # 'colcross_single': col,              ### list
        'colcross_pair': colcross_pair_onehot  ### list
    }
    return dfcross_hot, col_pars
def pd_col_atemplate(df=None, col=None, pars={}):
    """ Example of a custom processor.
        Used at prediction time:
            "path_pipeline"        : load previously saved state
        Training time:
            "path_features_store"  : to store intermediate dataframes
            "path_pipeline_export" : to store the pipeline for later usage
    """
    from source.util_feature import save, load
    prefix = "col_myfun"

    #### Inference time LOAD previous pars ###########################################
    if "path_pipeline" in pars:
        prepro = load(pars["path_pipeline"] + f"/{prefix}_model.pkl")
        pars   = load(pars["path_pipeline"] + f"/{prefix}_pars.pkl")
        pars   = {} if pars is None else pars

    #### Do something #################################################################
    df_new         = df[col]  ### Do nothing, just copy the selected columns
    df_new.columns = [coli + "_myfun" for coli in df_new.columns]  # was df.columns: length mismatch when col is a subset
    cols_new       = list(df_new.columns)

    prepro   = None  ### model
    pars_new = None  ### new params
    ###################################################################################

    ###### Training time save all ####################################################
    if "path_features_store" in pars and "path_pipeline_export" in pars:
        save(prepro,   pars["path_pipeline_export"] + f"/{prefix}_model.pkl")
        save(cols_new, pars["path_pipeline_export"] + f"/{prefix}.pkl")
        save(pars_new, pars["path_pipeline_export"] + f"/{prefix}_pars.pkl")

    ###### Training & Inference time : df + new column names #########################
    col_pars = {"prefix": prefix,
                "path": pars.get("path_pipeline_export", pars.get("path_pipeline", None))}
    col_pars["cols_new"] = {
        "col_myfun": cols_new  ### new column list
    }
    return df_new, col_pars
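# ----------------------------------------------------------------------------------
# How a custom processor like pd_col_atemplate is typically referenced from a
# preprocessing config. The exact pipe_list schema is an assumption inferred from
# how the other processors in this file consume pars, not verified against the runner:
pipe_list_example = [
    {"uri" : "source/prepro.py::pd_col_atemplate",  # "file::function" convention (assumed)
     "pars": {},
     "cols_family": "colnum",
     "cols_out"   : "col_myfun",
     "type": ""},
]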
def pd_colcat_encoder_generic(df, col, pars):
    """
    https://pypi.org/project/category-encoders/
    encoder = ce.BackwardDifferenceEncoder(cols=[...])
    encoder = ce.BaseNEncoder(cols=[...])
    encoder = ce.BinaryEncoder(cols=[...])
    encoder = ce.CatBoostEncoder(cols=[...])
    encoder = ce.CountEncoder(cols=[...])
    encoder = ce.GLMMEncoder(cols=[...])
    encoder = ce.HashingEncoder(cols=[...])
    encoder = ce.HelmertEncoder(cols=[...])
    encoder = ce.JamesSteinEncoder(cols=[...])
    encoder = ce.LeaveOneOutEncoder(cols=[...])
    encoder = ce.MEstimateEncoder(cols=[...])
    encoder = ce.OneHotEncoder(cols=[...])
    encoder = ce.OrdinalEncoder(cols=[...])
    encoder = ce.SumEncoder(cols=[...])
    encoder = ce.PolynomialEncoder(cols=[...])
    encoder = ce.TargetEncoder(cols=[...])
    encoder = ce.WOEEncoder(cols=[...])
    """
    colcat = col
    import category_encoders as ce

    # NB: pars is forwarded to HashingEncoder(**pars_encoder) below, so it must
    # contain only valid encoder kwargs (plus 'cols'); extra keys raise TypeError.
    pars_encoder         = pars
    pars_encoder['cols'] = col
    if 'path_pipeline_export' in pars:
        try:
            pars_encoder = load(pars['path_pipeline_export'] + '/colcat_encoder_pars.pkl')
        except Exception:
            pass

    encoder        = ce.HashingEncoder(**pars_encoder)
    dfcat_bin      = encoder.fit_transform(df[col])
    colcat_encoder = list(dfcat_bin.columns)

    ###################################################################################
    if 'path_features_store' in pars and 'path_pipeline_export' in pars:
        save_features(dfcat_bin, 'dfcat_encoder', pars['path_features_store'])
        save(encoder,        pars['path_pipeline_export'] + "/colcat_encoder_model.pkl")
        save(pars_encoder,   pars['path_pipeline_export'] + "/colcat_encoder_pars.pkl")
        save(colcat_encoder, pars['path_pipeline_export'] + "/colcat_encoder.pkl")

    col_pars = {}
    col_pars['col_encode_model'] = encoder
    col_pars['cols_new'] = {
        'colcat_encoder': colcat_encoder  ### list
    }
    return dfcat_bin, col_pars
def pd_colnum_normalize(df: pd.DataFrame, col: list = None, pars: dict = None):
    """ Normalize float columns into [0,1]
        pipe names: 'quantile_cutoff', 'quantile_cutoff_2', 'minmax',
                    {'name': 'fillna', 'na_val': 0.0}
    """
    prefix = 'colnum_norm'  ### == cols_out
    df = df[col]
    log2("### colnum normalize  #############################################################")
    from util_feature import pd_colnum_normalize as pd_normalize_fun
    colnum = col

    if pars is None:
        pars = {'pipe_list': [{'name': 'quantile_cutoff'},
                              # {'name': 'fillna', 'na_val': 0.0},
                              ]}

    if 'path_pipeline' in pars:  #### Load existing column list
        pars = load(pars['path_pipeline'] + f'/{prefix}_pars.pkl')

    dfnum_norm, colnum_norm = pd_normalize_fun(df, colname=colnum, pars=pars,
                                               suffix="_norm",
                                               return_val="dataframe,param")
    log3('dfnum_norm', dfnum_norm.head(4), colnum_norm)
    log3('dfnum_norm NA', dfnum_norm.isna().sum())
    colnew = colnum_norm

    log3("##### Export ######################################################################")
    if 'path_features_store' in pars and 'path_pipeline_export' in pars:
        save_features(dfnum_norm, prefix, pars['path_features_store'])
        save(pars, pars['path_pipeline_export'] + f"/{prefix}_pars.pkl")

    col_pars = {'prefix': prefix,
                'path': pars.get('path_pipeline_export', pars.get('path_pipeline', None))}
    col_pars['cols_new'] = {
        prefix: colnew  ### list
    }
    return dfnum_norm, col_pars
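# ----------------------------------------------------------------------------------
# Example pars for pd_colnum_normalize (illustrative): chain a quantile cutoff with
# NA filling, matching the pipe_list format the function defaults to when pars is
# None. Assumes util_feature.pd_colnum_normalize is importable.
def example_colnum_normalize():
    import numpy as np, pandas as pd
    df   = pd.DataFrame({"x1": np.random.randn(100), "x2": np.random.randn(100)})
    pars = {"pipe_list": [{"name": "quantile_cutoff"},
                          {"name": "fillna", "na_val": 0.0}]}
    dfnum_norm, col_pars = pd_colnum_normalize(df, ["x1", "x2"], pars)
    print(dfnum_norm.head())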
def run_predict(config_name, config_path, n_sample=-1,
                path_data=None, path_output=None, pars={}, model_dict=None):

    model_dict  = model_dict_load(model_dict, config_path, config_name, verbose=True)
    m           = model_dict['global_pars']
    model_class = model_dict['model_pars']['model_class']

    path_data     = m['path_pred_data'] if path_data is None else path_data
    path_pipeline = m['path_pred_pipeline']  # path_output + "/pipeline/"
    path_model    = m['path_pred_model']
    path_output   = m['path_pred_output'] if path_output is None else path_output
    log(path_data, path_model, path_output)

    pars = {'cols_group': model_dict['data_pars']['cols_input_type'],
            'pipe_list':  model_dict['model_pars']['pre_process_pars']['pipe_list']}

    ##########################################################################################
    colid = load(f'{path_pipeline}/colid.pkl')
    df    = load_dataset(path_data, path_data_y=None, colid=colid, n_sample=n_sample)

    from run_preprocess import preprocess_inference as preprocess
    dfX, cols_family = preprocess(df, path_pipeline, preprocess_pars=pars)
    ypred, yproba    = predict(model_class, path_model, dfX, cols_family)

    log("############ Saving prediction ###################################################")
    log(ypred.shape, path_output)
    os.makedirs(path_output, exist_ok=True)
    df[cols_family["coly"] + "_pred"] = ypred
    if yproba is not None:
        df[cols_family["coly"] + "_pred_proba"] = yproba
    df.to_csv(f"{path_output}/prediction.csv")
    log(df.head(8))

    log("########### Export Specific ######################################################")
    df[cols_family["coly"]] = ypred
    df[[cols_family["coly"]]].to_csv(f"{path_output}/pred_only.csv")
def predict(model_name, path_model, dfX, cols_family):
    """
    Arguments:
        model_name {str}    -- model class name
        path_model {str}    -- path to the saved model
        dfX {DataFrame}     -- feature dataframe
        cols_family {dict}  -- column groups
    Returns:
        ypred [numpy.array] -- vector of predictions
    """
    modelx = map_model(model_name)
    modelx.reset()
    log(modelx, path_model)
    # log(os.getcwd())
    sys.path.append(root)  #### Needed due to import source error

    log("#### Load model ############################################")
    print(path_model + "/model/model.pkl")
    # modelx.model = load(path_model + "/model//model.pkl")
    modelx.model = load(path_model + "/model.pkl")
    # stats      = load(path_model + "/model/info.pkl")
    # colsX      = load(path_model + "/model/colsX.pkl")  ## column names
    colsX = load(path_model + "/colsX.pkl")  ## column names
    # coly = load(path_model + "/model/coly.pkl")
    assert colsX is not None, "cannot load colsx, " + path_model
    assert modelx.model is not None, "cannot load modelx, " + path_model
    log("#### modelx\n", modelx.model.model)

    log("### Prediction ############################################")
    dfX1  = dfX.reindex(columns=colsX)  # reindex included
    ypred = modelx.predict(dfX1)
    return ypred
def pd_colcat_to_onehot(df, col=None, pars=None):
    """
    """
    log("#### colcat to onehot")
    col = [col] if isinstance(col, str) else col

    if len(col) == 1:
        colnew     = [col[0] + "_onehot"]
        df[colnew] = df[col]
        col_pars = {}
        col_pars['colcat_onehot'] = colnew
        col_pars['cols_new'] = {
            # 'colnum'       : col,     ### list
            'colcat_onehot': colnew  ### list
        }
        return df[colnew], col_pars

    colcat_onehot = None
    if 'path_pipeline' in pars:
        colcat_onehot = load(pars['path_pipeline'] + '/colcat_onehot.pkl')

    ######################################################################################
    colcat = col
    dfcat_hot, colcat_onehot = util_feature.pd_col_to_onehot(df[colcat], colname=colcat,
                                                             colonehot=colcat_onehot,
                                                             return_val="dataframe,param")
    log(dfcat_hot[colcat_onehot].head(5))

    ######################################################################################
    if 'path_features_store' in pars:
        save_features(dfcat_hot, 'colcat_onehot', pars['path_features_store'])
        save(colcat_onehot, pars['path_pipeline_export'] + "/colcat_onehot.pkl")
        save(colcat,        pars['path_pipeline_export'] + "/colcat.pkl")

    col_pars = {}
    col_pars['colcat_onehot'] = colcat_onehot
    col_pars['cols_new'] = {
        # 'colnum'       : col,          ### list
        'colcat_onehot': colcat_onehot  ### list
    }

    print("ok ------------")
    return dfcat_hot, col_pars
def pd_colcat_to_onehot(df, col=None, pars=None):
    if len(col) == 1:
        colnew     = [col[0] + "_onehot"]
        df[colnew] = df[col]
        col_pars = {}
        col_pars['colcat_onehot'] = colnew
        col_pars['cols_new'] = {
            # 'colnum'       : col,     ### list
            'colcat_onehot': colnew  ### list
        }
        return df[colnew], col_pars

    path_pipeline = pars.get('path_pipeline', False)
    colcat_onehot = load(f'{path_pipeline}/colcat_onehot.pkl') if path_pipeline else None
    colcat        = col

    log("#### colcat to onehot")
    dfcat_hot, colcat_onehot = util_feature.pd_col_to_onehot(df[colcat], colname=colcat,
                                                             colonehot=colcat_onehot,
                                                             return_val="dataframe,param")
    log(dfcat_hot[colcat_onehot].head(5))

    if 'path_features_store' in pars:
        path_features_store = pars['path_features_store']
        save_features(dfcat_hot, 'colcat_onehot', path_features_store)
        save(colcat_onehot, pars['path_pipeline_export'] + "/colcat_onehot.pkl")
        save(colcat,        pars['path_pipeline_export'] + "/colcat.pkl")

    col_pars = {}
    col_pars['colcat_onehot'] = colcat_onehot
    col_pars['cols_new'] = {
        # 'colnum'       : col,          ### list
        'colcat_onehot': colcat_onehot  ### list
    }

    print("ok ------------")
    return dfcat_hot, col_pars
def transform(model_name, path_model, dfX, model_dict, task_type='transform'):
    """
    Arguments:
        model_name {str}   -- model class name
        path_model {str}   -- path to the saved model
        dfX {DataFrame}    -- input data
        model_dict {dict}  -- holds data_pars / compute_pars
    Returns:
        dfX [DataFrame]    -- encoded / decoded / transformed data
    """
    modelx = map_model(model_name)
    modelx.reset()
    log(modelx, path_model)
    sys.path.append(root)  #### Needed due to import source error

    log("#### Load model ############################################")
    log2(path_model + "/model/model.pkl")
    modelx.model = modelx.load(path_model + "/model.pkl")
    colsX = load(path_model + "/colsX.pkl")  ## column names
    # coly = load(path_model + "/model/coly.pkl")
    assert colsX is not None, "cannot load colsx, " + path_model
    assert modelx.model is not None, "cannot load modelx, " + path_model
    log("#### modelx\n", modelx.model.model)

    log("### Task Inference #############################################")
    # dfX1 = dfX.reindex(columns=colsX)  # reindex included
    if task_type == 'encode':
        dfX = modelx.encode(dfX, data_pars=model_dict['data_pars'],
                            compute_pars=model_dict['compute_pars'])
    elif task_type == 'decode':
        dfX = modelx.decode(dfX, data_pars=model_dict['data_pars'],  # was modelx.encode: decode task must call decode
                            compute_pars=model_dict['compute_pars'])
    else:
        dfX = modelx.transform(dfX, data_pars=model_dict['data_pars'],
                               compute_pars=model_dict['compute_pars'])
    return dfX
def run_data_check(path_data, path_data_ref, path_model, path_output, sample_ratio=0.5):
    """ Calculate dataset shift (PSI) between reference and incoming data, before prediction.
    """
    from run_preprocess import preprocess_inference as preprocess

    path_output   = root + path_output
    path_data     = root + path_data
    path_data_ref = root + path_data_ref
    path_pipeline = root + path_model + "/pipeline/"

    os.makedirs(path_output, exist_ok=True)
    colid = load(f'{path_pipeline}/colid.pkl')

    df1 = load_dataset(path_data_ref, colid=colid)
    dfX1, cols_family1 = preprocess(df1, path_pipeline)

    df2 = load_dataset(path_data, colid=colid)
    dfX2, cols_family2 = preprocess(df2, path_pipeline)

    colsX = cols_family1["colnum_bin"] + cols_family1["colcat_bin"]
    dfX1  = dfX1[colsX]
    dfX2  = dfX2[colsX]

    from util_feature import pd_stat_dataset_shift
    nsample     = int(min(len(dfX1), len(dfX2)) * sample_ratio)
    metrics_psi = pd_stat_dataset_shift(dfX2, dfX1, colsX,
                                        nsample=nsample, buckets=7, axis=0)
    metrics_psi.to_csv(f"{path_output}/prediction_features_metrics.csv")
    log(metrics_psi)
def predict(model_name, path_model, dfX, cols_family, model_dict):
    """
    Arguments:
        model_name {str}    -- model class name
        path_model {str}    -- path to the saved model
        dfX {DataFrame}     -- feature dataframe
        cols_family {dict}  -- column groups
    Returns:
        ypred [numpy.array] -- vector of predictions
    """
    log("#### Load model class ############################################")
    modelx = map_model(model_name)
    assert modelx is not None, "cannot load modelx, " + path_model
    modelx.reset()
    log2(modelx, path_model)
    sys.path.append(root)  #### Needed due to import source error

    log("#### Load existing model weights #################################")
    log2(path_model + "/model/")
    # modelx.model = load(path_model + "/model//model.pkl")
    # modelx.model = load(path_model + "/model.pkl")
    modelx.load_model(path_model)
    colsX = load(path_model + "/colsX.pkl")  ## column names
    assert colsX is not None, "cannot load colsx, " + path_model
    assert modelx.model is not None, "cannot load modelx, " + path_model
    log2("#### modelx\n", modelx.model)

    log("### Prediction ###################################################")
    dfX = dfX.reindex(columns=colsX)  # reindex included
    ypred_tuple = modelx.predict(dfX, data_pars=model_dict['data_pars'],
                                 compute_pars=model_dict['compute_pars'])
    log2('ypred shape', str(ypred_tuple)[:100])
    return ypred_tuple
def pd_coltext_universal_google(df, col, pars={}):
    """ Universal Sentence Encoder from TensorFlow Hub: text ---> vectors
        from source.preprocessors import pd_coltext_universal_google
        https://tfhub.dev/google/universal-sentence-encoder-multilingual/3
        # latest Tensorflow that supports sentencepiece is 1.13.1
        !pip uninstall --quiet --yes tensorflow
        !pip install --quiet tensorflow-gpu==1.13.1
        !pip install --quiet tensorflow-hub
        !pip install --quiet tf-sentencepiece simpleneighbors
        df   : dataframe
        col  : list of text column names
        pars : dict of parameters
    """
    prefix = "coltext_universal_google"
    if 'path_pipeline' in pars:  ### Load during Inference
        coltext_embed = load(pars['path_pipeline'] + f"/{prefix}.pkl")       # was missing the f-string prefix
        pars_model    = load(pars['path_pipeline'] + f"/{prefix}_pars.pkl")

    ####### Custom Code ###############################################################
    import tensorflow as tf
    import tensorflow_hub as hub
    import tensorflow_text  # registers the SentencePiece ops needed by the multilingual model
    # from tqdm import tqdm  # progress bar

    url_default = "https://tfhub.dev/google/universal-sentence-encoder-multilingual/3"
    url         = pars.get("model_uri", url_default)
    model       = hub.load(url)
    pars_model  = {}

    dfall = None
    for coli in col[:1]:  # only the first text column is embedded
        X = []
        for r in df[coli]:
            if pd.isnull(r):
                r = ""
            emb        = model(r)
            review_emb = tf.reshape(emb, [-1]).numpy()
            X.append(review_emb)

        dfi = pd.DataFrame(X,
                           columns=[coli + "_" + str(i) for i in range(len(X[0]))],
                           index=df.index)
        dfall = pd.concat((dfall, dfi), axis=1) if dfall is not None else dfi  # one block of columns per text column

    coltext_embed = list(dfall.columns)

    ##### Export ####################################################################
    if 'path_features_store' in pars and 'path_pipeline_export' in pars:
        save_features(dfall, 'dftext_embed', pars['path_features_store'])
        save(coltext_embed, pars['path_pipeline_export'] + f"/{prefix}.pkl")       # was missing the f-string prefix
        save(pars_model,    pars['path_pipeline_export'] + f"/{prefix}_pars.pkl")
        # save(model,       pars['path_pipeline_export'] + f"/{prefix}_model.pkl")
        # model_uri = pars['path_pipeline_export'] + f"/{prefix}_model.pkl"
        # col_pars  = {'model_uri': model_uri, 'pars': pars_model}

    col_pars = {'model_uri': url, 'pars': pars_model}  # model_uri
    col_pars['cols_new'] = {
        'coltext_universal_google': coltext_embed  ### list
    }
    return dfall, col_pars
def pd_col_genetic_transform(df=None, col=None, pars=None):
    """ Find symbolic formulae for feature engineering
    """
    prefix = 'col_genetic'

    ######################################################################################
    from gplearn.genetic import SymbolicTransformer
    from gplearn.functions import make_function
    import random

    colX          = col  # [col_ for col_ in col if col_ not in coly]
    train_X       = df[colX].fillna(method='ffill')
    feature_name_ = colX

    def squaree(x):
        return x * x
    square_ = make_function(function=squaree, name='square_', arity=1)

    function_set = pars.get('function_set',
                            ['add', 'sub', 'mul', 'div', 'sqrt', 'log',
                             'abs', 'neg', 'inv', 'tan', square_])
    pars_genetic = pars.get('pars_genetic',
                            {'generations': 5,
                             'population_size': 10,  ### Higher than nb_features
                             'metric': 'spearman',
                             'tournament_size': 20,
                             'stopping_criteria': 1.0,
                             'const_range': (-1., 1.),
                             'p_crossover': 0.9,
                             'p_subtree_mutation': 0.01,
                             'p_hoist_mutation': 0.01,
                             'p_point_mutation': 0.01,
                             'p_point_replace': 0.05,
                             'parsimony_coefficient': 0.005,  #### 0.00005 Control Complexity
                             'max_samples': 0.9,
                             'verbose': 1,
                             # 'n_components'  ### Control number of output features
                             'random_state': 0,
                             'n_jobs': 4,
                             })

    if 'path_pipeline' in pars:  #### Inference time
        gp   = load(pars['path_pipeline'] + f"/{prefix}_model.pkl")
        pars = load(pars['path_pipeline'] + f"/{prefix}_pars.pkl")
    else:  ### Training time
        coly    = pars['coly']
        train_y = pars['dfy']
        gp = SymbolicTransformer(hall_of_fame=train_X.shape[1] + 1,  ### Buggy
                                 n_components=pars_genetic.get('n_components', train_X.shape[1]),
                                 feature_names=feature_name_,
                                 function_set=function_set,
                                 **pars_genetic)
        gp.fit(train_X, train_y)

    ##### Transform Data #########################################
    df_genetic   = gp.transform(train_X)
    tag          = random.randint(0, 10)  #### UNIQUE TAG
    col_genetic  = [f"gen_{tag}_{i}" for i in range(df_genetic.shape[1])]
    df_genetic   = pd.DataFrame(df_genetic, columns=col_genetic, index=train_X.index)
    pars_gen_all = {'pars_genetic': pars_genetic, 'function_set': function_set}

    ##### Formulae Extraction #####################################
    formula   = str(gp).replace("[", "").replace("]", "")
    flist     = formula.split(",\n")
    form_dict = {x: flist[i] for i, x in enumerate(col_genetic)}
    pars_gen_all['formulae_dict'] = form_dict
    log("########## Formulae ", form_dict)
    # col_pars['map_dict'] = dict(zip(train_X.columns.to_list(), feature_name_))
    col_new = col_genetic

    ###################################################################################
    if 'path_features_store' in pars and 'path_pipeline_export' in pars:
        save_features(df_genetic, 'df_genetic', pars['path_features_store'])
        save(gp,           pars['path_pipeline_export'] + f"/{prefix}_model.pkl")
        save(col_genetic,  pars['path_pipeline_export'] + f"/{prefix}.pkl")
        save(pars_gen_all, pars['path_pipeline_export'] + f"/{prefix}_pars.pkl")
        # save(form_dict,  pars['path_pipeline_export'] + f"/{prefix}_formula.pkl")
        save_json(form_dict, pars['path_pipeline_export'] + f"/{prefix}_formula.json")  ### Human readable

    col_pars = {'prefix': prefix,
                'path': pars.get('path_pipeline_export', pars.get('path_pipeline', None))}
    col_pars['cols_new'] = {
        prefix: col_new  ### list
    }
    return df_genetic, col_pars
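# ----------------------------------------------------------------------------------
# Training-time usage sketch for pd_col_genetic_transform (illustrative data): the
# target is passed through pars['coly'] / pars['dfy']; pars_genetic is kept tiny
# here so the example runs quickly.
def example_col_genetic_transform():
    import numpy as np, pandas as pd
    df = pd.DataFrame(np.random.rand(200, 3), columns=["a", "b", "c"])
    y  = (df["a"] * df["b"] + df["c"] > 1.0).astype(int)
    pars = {"coly": "y", "dfy": y,
            "pars_genetic": {"generations": 2, "population_size": 20,
                             "random_state": 0, "n_jobs": 1}}
    df_gen, col_pars = pd_col_genetic_transform(df, ["a", "b", "c"], pars)
    print(df_gen.head(), col_pars["cols_new"])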
def pd_coltext(df, col, pars={}):
    """ Text columns -> bag of words -> TF-IDF -> SVD reduction
    df   : Dataframe
    col  : list of columns
    pars : dict of pars
    """
    from utils import util_text, util_model

    #### Load pars ###################################################################
    path_pipeline        = pars.get('path_pipeline', None)
    word_tokeep_dict_all = load(path_pipeline + "/word_tokeep_dict_all.pkl") if path_pipeline is not None else {}
    # dftext_tdidf_all    = load(f'{path_pipeline}/dftext_tdidf.pkl') if path_pipeline else None
    # dftext_svd_list_all = load(f'{path_pipeline}/dftext_svd.pkl')   if path_pipeline else None
    dimpca       = pars.get('dimpca', 2)
    word_minfreq = pars.get('word_minfreq', 3)

    #### Process #####################################################################
    stopwords = nlp_get_stopwords()
    dftext    = pd_coltext_clean(df, col, stopwords=stopwords, pars=pars)
    dftext_svd_list_all = None
    dftext_tdidf_all    = None

    ### For each text column: create/load the bag of words -> tf-idf -> svd
    for col_ in col:
        if path_pipeline is not None:
            ### Inference step: reuse the saved bag of words for column `col_`
            word_tokeep = word_tokeep_dict_all[col_]
        else:
            ### Training step: create the bag of words
            coltext_freq, word_tokeep = pd_coltext_wordfreq(df, col_, stopwords, ntoken=100)  ## nb of words to keep
            word_tokeep_dict_all[col_] = word_tokeep  ## save the bag of words for `col_` in a dict

        dftext_tdidf_dict, word_tokeep_dict = util_text.pd_coltext_tdidf(
            dftext, coltext=col_, word_minfreq=word_minfreq,
            word_tokeep=word_tokeep, return_val="dataframe,param")

        dftext_tdidf_all = pd.DataFrame(dftext_tdidf_dict) if dftext_tdidf_all is None else \
                           pd.concat((dftext_tdidf_all, pd.DataFrame(dftext_tdidf_dict)), axis=1)
        log(word_tokeep_dict)

        ### Dimension reduction for the sparse matrix
        dftext_svd_list, svd_list = util_model.pd_dim_reduction(
            dftext_tdidf_dict, colname=None, model_pretrain=None,
            colprefix=col_ + "_svd", method="svd", dimpca=dimpca,
            return_val="dataframe,param")

        dftext_svd_list_all = dftext_svd_list if dftext_svd_list_all is None else \
                              pd.concat((dftext_svd_list_all, dftext_svd_list), axis=1)
    #################################################################################

    ###### Save and Export ##########################################################
    if 'path_features_store' in pars:
        save_features(dftext_svd_list_all, 'dftext_svd' + "-" + str(col), pars['path_features_store'])
        # save(dftext_svd_list_all, pars['path_pipeline_export'] + "/dftext_svd.pkl")
        # save(dftext_tdidf_all,    pars['path_pipeline_export'] + "/dftext_tdidf.pkl")
        save(word_tokeep_dict_all, pars['path_pipeline_export'] + "/word_tokeep_dict_all.pkl")

    col_pars = {}
    col_pars['cols_new'] = {
        # 'coltext_tdidf': dftext_tdidf_all.columns.tolist(),  ### list
        'coltext_svd': dftext_svd_list_all.columns.tolist()  ### list
    }

    dftext_svd_list_all.index = dftext.index
    # return pd.concat((dftext_svd_list_all, dftext_svd_list_all), axis=1), col_pars
    return dftext_svd_list_all, col_pars
def pd_augmentation_sdv(df, col=None, pars={}):
    """ Augment the dataset with synthetic rows using SDV generative models
        (TVAE / CTGAN / PAR).
        params:
            df   : (pandas dataframe) original dataframe
            col  : column names for data enhancement
            pars : (dict - optional) contains:
                n_samples       : (int - optional) number of samples to add, default is 10%
                primary_key     : (string - optional) the primary key of the dataframe
                aggregate       : (boolean - optional) if False, prints per-metric SDV scores, else averages them
                path_model_save : saving location if save_model is set to True
                path_model_load : saved model location, to skip training
                path_data_new   : where the new data is saved
        returns:
            df_new : (pandas dataframe) df with the augmented rows appended
            col    : (list of strings) same columns
    """
    # sdv < 1.0 module layout (assumption; newer sdv releases moved these classes).
    from sdv.tabular import TVAE, CTGAN
    from sdv.timeseries import PAR
    from sdv.evaluation import evaluate

    n_samples       = pars.get('n_samples', max(1, int(len(df) * 0.10)))  ## Add 10% or 1 sample by default
    primary_key     = pars.get('colid', None)  ### Custom can be created on the fly
    metrics_type    = pars.get('aggregate', False)
    path_model_save = pars.get('path_model_save', 'data/output/ztmp/')
    model_name      = pars.get('model_name', "TVAE")

    # model fitting
    if 'path_model_load' in pars:
        model = load(pars['path_model_load'])
    else:
        log('##### Training Started #####')
        model = {'TVAE': TVAE, 'CTGAN': CTGAN, 'PAR': PAR}[model_name]
        if model_name == 'PAR':
            model = model(entity_columns=pars['entity_columns'],
                          context_columns=pars['context_columns'],
                          sequence_index=pars['sequence_index'])
        else:
            model = model(primary_key=primary_key)
        model.fit(df)
        log('##### Training Finished #####')
        try:
            save(model, path_model_save)
            log('model saved at: ', path_model_save)
        except Exception as e:
            log('saving model failed: ', path_model_save, e)

    log('##### Generating Samples #############')
    new_data = model.sample(n_samples)
    log_pd(new_data, n=7)

    log('######### Evaluation Results #########')
    if metrics_type == True:
        evals = evaluate(new_data, df, aggregate=True)
        log(evals)
    else:
        evals = evaluate(new_data, df, aggregate=False)
        log_pd(evals, n=7)

    # appending new data (df.append was removed in pandas 2.x)
    df_new = pd.concat([df, new_data])
    log(str(len(df_new) - len(df)) + ' new data added')

    if 'path_newdata' in pars:
        new_data.to_parquet(pars['path_newdata'] + '/features.parquet')
        log('###### df augmentation save on disk', pars['path_newdata'])

    log('###### augmentation complete ######')
    return df_new, col
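# ----------------------------------------------------------------------------------
# Example pars for pd_augmentation_sdv (sketch, assuming the sdv<1.0 API imported
# above): augment a toy dataframe with 20 TVAE-sampled rows and print the averaged
# SDV evaluation score.
def example_augmentation_sdv():
    import numpy as np, pandas as pd
    df = pd.DataFrame({"id": range(100),
                       "x" : np.random.rand(100),
                       "y" : np.random.randint(0, 2, 100)})
    pars = {"n_samples": 20, "colid": "id", "model_name": "TVAE", "aggregate": True}
    df_new, col = pd_augmentation_sdv(df, col=["x", "y"], pars=pars)
    print(len(df_new))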
def pd_colnum_quantile_norm(df, col, pars={}):
    """ Numerical column normalization by quantile clipping (IQR based),
        with separate bounds for sparse columns dominated by one mode value.
    """
    prefix  = "colnum_quantile_norm"
    df      = df[col]
    num_col = col

    ##### Grab previously computed params ###############################################
    pars2              = {}
    model              = None
    lower_bound_sparse = None  # defaults initialised here: the loop below reads them
    upper_bound_sparse = None  # even when no pipeline is loaded (was a NameError)
    lower_bound        = None
    upper_bound        = None
    sparse_col         = pars.get('colsparse', ['capital-gain', 'capital-loss'])

    if 'path_pipeline' in pars:  #### Load existing column list
        colnum_quantile_norm = load(pars['path_pipeline'] + f'/{prefix}.pkl')
        model                = load(pars['path_pipeline'] + f'/{prefix}_model.pkl')
        pars2                = load(pars['path_pipeline'] + f'/{prefix}_pars.pkl')
        lower_bound_sparse   = pars2.get('lower_bound_sparse', None)
        upper_bound_sparse   = pars2.get('upper_bound_sparse', None)
        lower_bound          = pars2.get('lower_bound', None)  # was keyed on 'lower_bound_sparse'
        upper_bound          = pars2.get('upper_bound', None)  # was keyed on 'upper_bound_sparse'
        sparse_col           = pars2.get('colsparse', sparse_col)

    ####### Find IQR and apply to dense and sparse columns separately ###################
    Q1  = df.quantile(0.25)
    Q3  = df.quantile(0.75)
    IQR = Q3 - Q1

    for col in num_col:
        if col in sparse_col:
            df_nosparse = pd.DataFrame(df[df[col] != df[col].mode()[0]][col])

            if lower_bound_sparse is not None:
                pass
            elif df_nosparse[col].quantile(0.25) < df[col].mode()[0]:  # Unexpected case
                lower_bound_sparse = df_nosparse[col].quantile(0.25)
            else:
                lower_bound_sparse = df[col].mode()[0]

            if upper_bound_sparse is not None:
                pass
            elif df_nosparse[col].quantile(0.75) < df[col].mode()[0]:  # Unexpected case
                upper_bound_sparse = df[col].mode()[0]
            else:
                upper_bound_sparse = df_nosparse[col].quantile(0.75)

            n_outliers = len(df[(df[col] < lower_bound_sparse) |
                                (df[col] > upper_bound_sparse)][col])
            if n_outliers > 0:
                df.loc[df[col] < lower_bound_sparse, col] = lower_bound_sparse * 0.75  # --> MAIN DF CHANGED
                df.loc[df[col] > upper_bound_sparse, col] = upper_bound_sparse * 1.25  # --> MAIN DF CHANGED

        else:
            if lower_bound is None or upper_bound is None:
                lower_bound = df[col].quantile(0.25) - 1.5 * IQR[col]
                upper_bound = df[col].quantile(0.75) + 1.5 * IQR[col]

            df[col] = np.where(df[col] > upper_bound, 1.25 * upper_bound, df[col])
            df[col] = np.where(df[col] < lower_bound, 0.75 * lower_bound, df[col])

    df.columns = [t + "_qt_norm" for t in df.columns]
    pars_new   = {'lower_bound': lower_bound, 'upper_bound': upper_bound,
                  'lower_bound_sparse': lower_bound_sparse,
                  'upper_bound_sparse': upper_bound_sparse}
    dfnew  = df
    colnew = list(df.columns)

    ##### Export ########################################################################
    if 'path_features_store' in pars and 'path_pipeline_export' in pars:
        save_features(df, prefix, pars['path_features_store'])
        save(colnew,   pars['path_pipeline_export'] + f"/{prefix}.pkl")
        save(pars_new, pars['path_pipeline_export'] + f"/{prefix}_pars.pkl")
        save(model,    pars['path_pipeline_export'] + f"/{prefix}_model.pkl")

    col_pars = {'prefix': prefix,
                'path': pars.get('path_pipeline_export', pars.get('path_pipeline', None))}
    col_pars['cols_new'] = {
        prefix: colnew  ### list
    }
    return dfnew, col_pars
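# ----------------------------------------------------------------------------------
# Usage sketch for pd_colnum_quantile_norm on a column dominated by a single mode
# value at 0.0, which the default 'colsparse' list ('capital-gain') treats as sparse.
# pars={} means nothing is loaded or saved; the data below is made up.
def example_colnum_quantile_norm():
    import numpy as np, pandas as pd
    rnd = np.random.rand(300)
    df  = pd.DataFrame({"capital-gain": np.where(rnd < 0.9, 0.0, rnd * 1e4),
                        "hours"       : np.random.randint(20, 60, 300).astype(float)})
    dfnew, col_pars = pd_colnum_quantile_norm(df, ["capital-gain", "hours"], pars={})
    print(dfnew.describe())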