def pd_colcat_bin(df, col=None, pars=None):
    path_pipeline  = pars.get('path_pipeline', False)
    colcat_bin_map = load(f'{path_pipeline}/colcat_bin_map.pkl') if path_pipeline else None
    colcat         = [col] if isinstance(col, str) else col

    log("#### Colcat to integer encoding ")
    dfcat_bin, colcat_bin_map = util_feature.pd_colcat_toint(df[colcat], colname=colcat,
                                                             colcat_map=colcat_bin_map, suffix="_int")
    colcat_bin = list(dfcat_bin.columns)

    ##### Colcat processing   ################################################################
    colcat_map = util_feature.pd_colcat_mapping(df, colcat)
    log(df[colcat].dtypes, colcat_map)

    if 'path_features_store' in pars:
        save_features(dfcat_bin, 'dfcat_bin', pars['path_features_store'])
        save(colcat_bin_map, pars['path_pipeline_export'] + "/colcat_bin_map.pkl")
        save(colcat_bin,     pars['path_pipeline_export'] + "/colcat_bin.pkl")

    col_pars = {}
    col_pars['colcat_bin_map'] = colcat_bin_map
    col_pars['cols_new'] = {
        'colcat'    : col,          ### list
        'colcat_bin': colcat_bin    ### list
    }
    return dfcat_bin, col_pars
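############################################################################################
# Usage sketch for pd_colcat_bin (illustrative only): assumes this module's `log` helper
# and `util_feature` are importable; the export folders below are scratch paths, not part
# of the original API.
def example_pd_colcat_bin():
    import pandas as pd
    df   = pd.DataFrame({'color': ['red', 'blue', 'red', 'green'],
                         'size' : ['S', 'M', 'L', 'M']})
    pars = {'path_pipeline_export': './ztmp/pipeline',
            'path_features_store' : './ztmp/features'}
    dfcat_bin, col_pars = pd_colcat_bin(df, col=['color', 'size'], pars=pars)
    log(dfcat_bin.head(), col_pars['cols_new'])   # expect integer-encoded columns such as 'color_int' (per suffix="_int")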
def pd_colnum_binto_onehot(df, col=None, pars=None):
    assert isinstance(col, list) and isinstance(df, pd.DataFrame)

    dfnum_bin     = df[col]
    colnum_bin    = col
    path_pipeline = pars.get('path_pipeline', False)
    colnum_onehot = load(f'{path_pipeline}/colnum_onehot.pkl') if path_pipeline else None

    log("###### colnum bin to One Hot  #################################################")
    from util_feature import pd_col_to_onehot
    dfnum_hot, colnum_onehot = pd_col_to_onehot(dfnum_bin[colnum_bin], colname=colnum_bin,
                                                colonehot=colnum_onehot, return_val="dataframe,param")
    log(colnum_onehot)

    if 'path_features_store' in pars:
        save_features(dfnum_hot, 'colnum_onehot', pars['path_features_store'])
        save(colnum_onehot, pars['path_pipeline_export'] + "/colnum_onehot.pkl")

    col_pars = {}
    col_pars['colnum_onehot'] = colnum_onehot
    col_pars['cols_new'] = {
        # 'colnum'      : col,           ### list
        'colnum_onehot': colnum_onehot   ### list
    }
    return dfnum_hot, col_pars
def pd_colnum_bin(df, col, pars):
    from util_feature import pd_colnum_tocat

    path_pipeline = pars.get('path_pipeline', False)
    colnum_binmap = load(f'{path_pipeline}/colnum_binmap.pkl') if path_pipeline else None
    log(colnum_binmap)

    colnum = col
    log("### colnum Map numerics to Category bin  ###########################################")
    dfnum_bin, colnum_binmap = pd_colnum_tocat(df, colname=colnum, colexclude=None,
                                               colbinmap=colnum_binmap,
                                               bins=10, suffix="_bin", method="uniform",
                                               return_val="dataframe,param")
    log(colnum_binmap)

    ### Renaming colnum_bin with suffix
    colnum_bin = [x + "_bin" for x in list(colnum_binmap.keys())]
    log(colnum_bin)

    if 'path_features_store' in pars:
        scol = "_".join(col[:5])
        save_features(dfnum_bin, 'colnum_bin' + "-" + scol, pars['path_features_store'])
        save(colnum_binmap, pars['path_pipeline_export'] + "/colnum_binmap.pkl")
        save(colnum_bin,    pars['path_pipeline_export'] + "/colnum_bin.pkl")

    col_pars = {}
    col_pars['colnumbin_map'] = colnum_binmap
    col_pars['cols_new'] = {
        'colnum'    : col,         ### list
        'colnum_bin': colnum_bin   ### list
    }
    return dfnum_bin, col_pars
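############################################################################################
# Usage sketch for pd_colnum_bin (illustrative only): assumes `util_feature.pd_colnum_tocat`
# is available; the paths are scratch folders, not part of the original API.
def example_pd_colnum_bin():
    import numpy as np
    import pandas as pd
    df   = pd.DataFrame({'age': np.random.randint(18, 90, 200).astype(float)})
    pars = {'path_pipeline_export': './ztmp/pipeline',
            'path_features_store' : './ztmp/features'}
    dfnum_bin, col_pars = pd_colnum_bin(df, col=['age'], pars=pars)
    log(dfnum_bin.head(), col_pars['cols_new']['colnum_bin'])   # e.g. ['age_bin']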
def pd_coltext_universal_google(df, col, pars={}):
    """ Universal sentence encoding from TensorFlow: text ---> vectors
        from source.preprocessors import pd_coltext_universal_google
        https://tfhub.dev/google/universal-sentence-encoder-multilingual/3

        #@title Setup Environment
        # latest TensorFlow that supports sentencepiece is 1.13.1
        !pip uninstall --quiet --yes tensorflow
        !pip install --quiet tensorflow-gpu==1.13.1
        !pip install --quiet tensorflow-hub
        !pip install --quiet tf-sentencepiece
        !pip install --quiet simpleneighbors

        df   : dataframe
        col  : list of text column names
        pars : dict of params
    """
    import tensorflow as tf
    import tensorflow_hub as hub
    import tensorflow_text
    #from tqdm import tqdm   # progress bar

    uri_default = "https://tfhub.dev/google/universal-sentence-encoder-multilingual/3"
    uri         = pars.get("url_model", uri_default)
    use         = hub.load(uri)

    dfall = None
    for coli in col[:1]:   ### only the first text column is processed
        X = []
        for r in df[coli]:
            if pd.isnull(r):
                r = ""
            emb        = use(r)
            review_emb = tf.reshape(emb, [-1]).numpy()
            X.append(review_emb)

        dfi   = pd.DataFrame(X, columns=[coli + "_" + str(i) for i in range(len(X[0]))],
                             index=df.index)
        dfall = pd.concat((dfall, dfi)) if dfall is not None else dfi

    coltext_embed = list(dfall.columns)

    ###################################################################################
    if 'path_features_store' in pars and 'path_pipeline_export' in pars:
        save_features(dfall, 'dftext_embed', pars['path_features_store'])
        save(coltext_embed, pars['path_pipeline_export'] + "/coltext_universal_google.pkl")

    col_pars = {'model_encoder': uri}
    col_pars['cols_new'] = {
        'coltext_universal_google': coltext_embed   ### list
    }
    return dfall, col_pars
def pd_colnum_normalize(df: pd.DataFrame, col: list = None, pars: dict = None):
    """ Normalize float columns INTO [0,1]
        pipe names: 'quantile_cutoff', 'quantile_cutoff_2', 'minmax'
                    {'name': 'fillna', 'na_val': 0.0}
    """
    prefix = 'colnum_norm'   ### == cols_out
    df     = df[col]
    log2("### colnum normalize  #############################################################")
    from util_feature import pd_colnum_normalize as pd_normalize_fun
    colnum = col

    if pars is None:
        pars = {
            'pipe_list': [
                {'name': 'quantile_cutoff'},
                # {'name': 'fillna', 'na_val': 0.0},
            ]
        }

    if 'path_pipeline' in pars:   #### Load existing column list
        pars = load(pars['path_pipeline'] + f'/{prefix}_pars.pkl')

    dfnum_norm, colnum_norm = pd_normalize_fun(df, colname=colnum, pars=pars, suffix="_norm",
                                               return_val="dataframe,param")
    log3('dfnum_norm', dfnum_norm.head(4), colnum_norm)
    log3('dfnum_norm NA', dfnum_norm.isna().sum())
    colnew = colnum_norm

    log3("##### Export  ######################################################################")
    if 'path_features_store' in pars and 'path_pipeline_export' in pars:
        save_features(dfnum_norm, prefix, pars['path_features_store'])
        save(pars, pars['path_pipeline_export'] + f"/{prefix}_pars.pkl")

    col_pars = {'prefix': prefix,
                'path': pars.get('path_pipeline_export', pars.get('path_pipeline', None))}
    col_pars['cols_new'] = {
        prefix: colnew   ### list
    }
    return dfnum_norm, col_pars
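############################################################################################
# Usage sketch for pd_colnum_normalize (illustrative only): the 'minmax' / 'fillna'
# pipe names are taken from the docstring above; no export paths, so nothing is saved.
def example_pd_colnum_normalize():
    import pandas as pd
    df   = pd.DataFrame({'income': [100.0, 250.0, 80.0, None, 300.0]})
    pars = {'pipe_list': [{'name': 'fillna', 'na_val': 0.0},
                          {'name': 'minmax'}]}
    dfnum_norm, col_pars = pd_colnum_normalize(df, col=['income'], pars=pars)
    log3(dfnum_norm.head(), col_pars['cols_new'])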
def pd_colcross(df, col, pars):
    """ cross_feature_new = feat1 X feat2  (pair feature) """
    log("##### Cross Features From OneHot Features   ######################################")
    prefix = 'colcross_onehot'

    # params_check(pars, [('dfcat_hot', pd.DataFrame), 'colid', ])
    from util_feature import pd_feature_generate_cross

    dfcat_hot = pars['dfcat_hot']
    colid     = pars['colid']
    try:
        dfnum_hot = pars['dfnum_hot']
        df_onehot = dfcat_hot.join(dfnum_hot, on=colid, how='left')
    except Exception:
        df_onehot = copy.deepcopy(dfcat_hot)

    colcross_single = pars['colcross_single']
    pars_model      = {'pct_threshold': 0.02, 'm_combination': 2}
    if 'path_pipeline' in pars:   #### Load existing column list
        colcross_single = load(pars['path_pipeline'] + f'/{prefix}_select.pkl')
        # pars_model    = load(pars['path_pipeline'] + f'/{prefix}_pars.pkl')

    colcross_single_onehot_select = []   ## Select existing columns
    for t in list(df_onehot.columns):
        for c1 in colcross_single:
            if c1 in t:
                colcross_single_onehot_select.append(t)

    df_onehot = df_onehot[colcross_single_onehot_select]
    dfcross_hot, colcross_pair = pd_feature_generate_cross(df_onehot, colcross_single_onehot_select,
                                                           **pars_model)
    log(dfcross_hot.head(2).T)
    colcross_pair_onehot = list(dfcross_hot.columns)
    model = None

    ##############################################################################
    if 'path_features_store' in pars:
        save_features(dfcross_hot, 'colcross_onehot', pars['path_features_store'])
        save(colcross_single_onehot_select, pars['path_pipeline_export'] + f'/{prefix}_select.pkl')
        save(colcross_pair,                 pars['path_pipeline_export'] + f'/{prefix}_stats.pkl')
        save(colcross_pair_onehot,          pars['path_pipeline_export'] + f'/{prefix}_pair.pkl')
        save(model,                         pars['path_pipeline_export'] + f'/{prefix}_pars.pkl')

    col_pars = {'model': model, 'stats': colcross_pair}
    col_pars['cols_new'] = {
        # 'colcross_single': col,               ### list
        'colcross_pair': colcross_pair_onehot   ### list
    }
    return dfcross_hot, col_pars
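############################################################################################
# Usage sketch for pd_colcross (illustrative only): builds the one-hot inputs by hand; in
# the real pipeline they come from pd_colnum_binto_onehot / pd_colcat_to_onehot. Assumes
# util_feature.pd_feature_generate_cross is importable.
def example_pd_colcross():
    import pandas as pd
    dfcat_hot = pd.DataFrame({'sex_male': [1, 0, 1, 0], 'sex_female': [0, 1, 0, 1],
                              'class_a' : [1, 1, 0, 0], 'class_b'   : [0, 0, 1, 1]})
    pars = {'dfcat_hot': dfcat_hot, 'colid': 'id',
            'colcross_single': ['sex', 'class']}   ### single columns to cross, matched by substring
    dfcross_hot, col_pars = pd_colcross(dfcat_hot, list(dfcat_hot.columns), pars)
    log(col_pars['cols_new']['colcross_pair'])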
def pd_colcat_to_onehot(df, col=None, pars=None):
    """ Categorical columns into one-hot encoded columns. """
    log("#### colcat to onehot")
    col = [col] if isinstance(col, str) else col

    if len(col) == 1:
        colnew     = [col[0] + "_onehot"]
        df[colnew] = df[col]
        col_pars   = {}
        col_pars['colcat_onehot'] = colnew
        col_pars['cols_new'] = {
            # 'colnum'       : col,    ### list
            'colcat_onehot': colnew    ### list
        }
        return df[colnew], col_pars

    colcat_onehot = None
    if 'path_pipeline' in pars:
        colcat_onehot = load(pars['path_pipeline'] + '/colcat_onehot.pkl')

    ######################################################################################
    colcat = col
    dfcat_hot, colcat_onehot = util_feature.pd_col_to_onehot(df[colcat], colname=colcat,
                                                             colonehot=colcat_onehot,
                                                             return_val="dataframe,param")
    log(dfcat_hot[colcat_onehot].head(5))

    ######################################################################################
    if 'path_features_store' in pars:
        save_features(dfcat_hot, 'colcat_onehot', pars['path_features_store'])
        save(colcat_onehot, pars['path_pipeline_export'] + "/colcat_onehot.pkl")
        save(colcat,        pars['path_pipeline_export'] + "/colcat.pkl")

    col_pars = {}
    col_pars['colcat_onehot'] = colcat_onehot
    col_pars['cols_new'] = {
        # 'colnum'       : col,           ### list
        'colcat_onehot': colcat_onehot    ### list
    }
    log("ok  ------------")
    return dfcat_hot, col_pars
def pd_colcat_minhash(df, col, pars):
    """ MinHash encoding for categorical columns
        https://booking.ai/dont-be-tricked-by-the-hashing-trick-192a6aae3087
    """
    prefix = 'colcat_minhash'
    colcat = col

    pars_minhash = {'n_component': [4, 2], 'model_pretrain_dict': None}
    if 'path_pipeline_export' in pars:
        try:
            pars_minhash = load(pars['path_pipeline_export'] + '/colcat_minhash_pars.pkl')
        except Exception:
            pass

    log("#### Colcat to Hash encoding  #############################################")
    from utils import util_text
    dfcat_bin, col_hash_model = util_text.pd_coltext_minhash(df[colcat], colcat,
                                                             return_val="dataframe,param",
                                                             **pars_minhash)
    colcat_minhash = list(dfcat_bin.columns)
    log(col_hash_model)

    ###################################################################################
    if 'path_features_store' in pars and 'path_pipeline_export' in pars:
        save_features(dfcat_bin, prefix, pars['path_features_store'])
        save(colcat_minhash, pars['path_pipeline_export'] + f"/{prefix}.pkl")
        save(pars_minhash,   pars['path_pipeline_export'] + f"/{prefix}_pars.pkl")
        save(col_hash_model, pars['path_pipeline_export'] + f"/{prefix}_model.pkl")

    col_pars = {}
    col_pars['col_hash_model'] = col_hash_model
    col_pars['cols_new'] = {
        'colcat_minhash': colcat_minhash   ### list
    }
    return dfcat_bin, col_pars
def pd_colcat_symbolic(df, col, pars):
    """ Genetic (symbolic) feature generation
        https://github.com/arita37/deltapy
        pip install deltapy
    """
    pars_encoder         = pars
    pars_encoder['cols'] = col
    if 'path_pipeline_export' in pars:
        try:
            pars_encoder  = load(pars['path_pipeline_export'] + '/col_genetic_pars.pkl')
            model_encoder = load(pars['path_pipeline_export'] + '/col_genetic_model.pkl')
            col_encoder   = load(pars['path_pipeline_export'] + '/col_genetic.pkl')
        except Exception:
            pass

    ###################################################################################
    coly = pars['coly']
    from gplearn.genetic import SymbolicTransformer
    function_set = ['add', 'sub', 'mul', 'div', 'sqrt', 'log', 'abs', 'neg', 'inv', 'tan']

    gp = SymbolicTransformer(generations=20, population_size=200,
                             hall_of_fame=100, n_components=10,
                             function_set=function_set,
                             parsimony_coefficient=0.0005,
                             max_samples=0.9, verbose=1,
                             random_state=0, n_jobs=6)

    gen_feats       = gp.fit_transform(df[col], df[coly])
    gen_feats       = pd.DataFrame(gen_feats, columns=["gen_" + str(a) for a in range(gen_feats.shape[1])])
    gen_feats.index = df.index
    dfnew           = gen_feats

    ###################################################################################
    colnew = list(dfnew.columns)
    if 'path_features_store' in pars and 'path_pipeline_export' in pars:
        save_features(dfnew, 'dfgen', pars['path_features_store'])
        save(gp,           pars['path_pipeline_export'] + "/col_genetic_model.pkl")
        save(pars_encoder, pars['path_pipeline_export'] + "/col_genetic_pars.pkl")
        save(colnew,       pars['path_pipeline_export'] + "/col_genetic.pkl")

    col_pars = {'model': gp}
    col_pars['cols_new'] = {
        'col_genetic': colnew   ### list
    }
    return dfnew, col_pars
def pd_colcat_encoder_generic(df, col, pars):
    """ Create a Class or decorator
        https://pypi.org/project/category-encoders/
        encoder = ce.BackwardDifferenceEncoder(cols=[...])
        encoder = ce.BaseNEncoder(cols=[...])
        encoder = ce.BinaryEncoder(cols=[...])
        encoder = ce.CatBoostEncoder(cols=[...])
        encoder = ce.CountEncoder(cols=[...])
        encoder = ce.GLMMEncoder(cols=[...])
        encoder = ce.HashingEncoder(cols=[...])
        encoder = ce.HelmertEncoder(cols=[...])
        encoder = ce.JamesSteinEncoder(cols=[...])
        encoder = ce.LeaveOneOutEncoder(cols=[...])
        encoder = ce.MEstimateEncoder(cols=[...])
        encoder = ce.OneHotEncoder(cols=[...])
        encoder = ce.OrdinalEncoder(cols=[...])
        encoder = ce.SumEncoder(cols=[...])
        encoder = ce.PolynomialEncoder(cols=[...])
        encoder = ce.TargetEncoder(cols=[...])
        encoder = ce.WOEEncoder(cols=[...])
    """
    prefix     = "colcat_encoder_generic"
    pars_model = None
    if 'path_pipeline' in pars:   ### Load during Inference
        colcat_encoder = load(pars['path_pipeline'] + f"/{prefix}.pkl")
        pars_model     = load(pars['path_pipeline'] + f"/{prefix}_pars.pkl")
        # model        = load(pars['path_pipeline'] + f"/{prefix}_model.pkl")

    ####### Custom Code  ###############################################################
    from category_encoders import HashingEncoder, WOEEncoder
    pars_model         = pars.get('model_pars', {}) if pars_model is None else pars_model
    pars_model['cols'] = col
    model_name         = pars.get('model_name', 'HashingEncoder')
    model_class        = {'HashingEncoder': HashingEncoder}[model_name]
    model              = model_class(**pars_model)

    dfcat_encoder         = model.fit_transform(df[col])
    dfcat_encoder.columns = [t + "_cod" for t in dfcat_encoder.columns]
    colcat_encoder        = list(dfcat_encoder.columns)

    ###################################################################################
    if 'path_features_store' in pars and 'path_pipeline_export' in pars:
        save_features(dfcat_encoder, 'dfcat_encoder', pars['path_features_store'])
        save(model,          pars['path_pipeline_export'] + f"/{prefix}_model.pkl")
        save(pars_model,     pars['path_pipeline_export'] + f"/{prefix}_pars.pkl")
        save(colcat_encoder, pars['path_pipeline_export'] + f"/{prefix}.pkl")

    col_pars = {'prefix': prefix,
                'path': pars.get('path_pipeline_export', pars.get('path_pipeline', None))}
    col_pars['cols_new'] = {
        'colcat_encoder_generic': colcat_encoder   ### list
    }
    return dfcat_encoder, col_pars
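############################################################################################
# Usage sketch for pd_colcat_encoder_generic (illustrative only): 'n_components' is a
# standard category_encoders.HashingEncoder argument; no paths, so nothing is persisted.
def example_pd_colcat_encoder_generic():
    import pandas as pd
    df   = pd.DataFrame({'city': ['tokyo', 'paris', 'tokyo', 'lima', 'paris']})
    pars = {'model_name': 'HashingEncoder',
            'model_pars': {'n_components': 4}}
    dfcat_encoder, col_pars = pd_colcat_encoder_generic(df, col=['city'], pars=pars)
    log(dfcat_encoder.head(), col_pars['cols_new'])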
def prepro_save(prefix, pars, df_new, cols_new, prepro) -> (pd.DataFrame, dict):
    """ Save the preprocessor, its params and new column names; return df + col_pars.
    :param prefix:   name prefix used for the exported files
    :param pars:     dict of params (paths, ...)
    :param df_new:   transformed dataframe
    :param cols_new: list of new column names
    :param prepro:   fitted preprocessor object
    :return: (df_new, col_pars)
    """
    ### Clean pars of extra heavy data
    pars2 = {}
    for k, val in pars.items():
        if isinstance(val, pd.DataFrame):
            continue
        pars2[k] = val

    if "path_features_store" in pars and "path_pipeline_export" in pars:
        save(prepro,   pars["path_pipeline_export"] + f"/{prefix}_model.pkl")
        save(cols_new, pars["path_pipeline_export"] + f"/{prefix}_cols.pkl")
        save(pars2,    pars["path_pipeline_export"] + f"/{prefix}_pars.pkl")

    ###### Training & Inference time : df + new column names  ##########################
    col_pars = {"prefix": prefix,
                "path": pars.get("path_pipeline_export", pars.get("path_pipeline", None))}
    col_pars["cols_new"] = {
        prefix: cols_new   ### new column list
    }
    return df_new, col_pars
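############################################################################################
# Sketch of a custom processor built on prepro_save (illustrative only): the "col_demo"
# prefix and "_demo" suffix are made up; any fitted object could be passed as `prepro`.
def pd_col_demo(df, col, pars):
    df_new         = df[col].copy()
    df_new.columns = [c + "_demo" for c in df_new.columns]
    cols_new       = list(df_new.columns)
    prepro         = None   ### the fitted transformer would go here
    return prepro_save("col_demo", pars, df_new, cols_new, prepro)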
def pd_colcat_encoder_generic(df, col, pars):
    """ Generic categorical encoder using category_encoders (HashingEncoder variant);
        see the full encoder list in the docstring of the version above.
        https://pypi.org/project/category-encoders/
    """
    colcat = col
    import category_encoders as ce
    pars_encoder         = pars
    pars_encoder['cols'] = col
    if 'path_pipeline_export' in pars:
        try:
            pars_encoder = load(pars['path_pipeline_export'] + '/colcat_encoder_pars.pkl')
        except Exception:
            pass

    encoder        = ce.HashingEncoder(**pars_encoder)
    dfcat_bin      = encoder.fit_transform(df[col])
    colcat_encoder = list(dfcat_bin.columns)

    ###################################################################################
    if 'path_features_store' in pars and 'path_pipeline_export' in pars:
        save_features(dfcat_bin, 'dfcat_encoder', pars['path_features_store'])
        save(encoder,        pars['path_pipeline_export'] + "/colcat_encoder_model.pkl")
        save(pars_encoder,   pars['path_pipeline_export'] + "/colcat_encoder_pars.pkl")
        save(colcat_encoder, pars['path_pipeline_export'] + "/colcat_encoder.pkl")

    col_pars = {}
    col_pars['col_encode_model'] = encoder
    col_pars['cols_new'] = {
        'colcat_encoder': colcat_encoder   ### list
    }
    return dfcat_bin, col_pars
def pd_ts_deltapy2(df=None, col=None, pars={}):
    """ Time series feature generation via deltapy
        pars : {'name': "robust_scaler", 'pars': {}}
    """
    prefix = 'colts_deltapy'

    ###### Custom code  ################################################################
    dfin       = df.fillna(method='ffill')
    model_name = pars['name']
    model_pars = pars.get('pars', {})

    if 'path_pipeline' in pars:   #### Prediction time
        model = load(pars['path_pipeline'] + f"/{prefix}_model.pkl")
        pars  = load(pars['path_pipeline'] + f"/{prefix}_pars.pkl")
    else:                         ### Training time : Dynamic function load
        from util_feature import load_function_uri
        ##### transform.robust_scaler(df, drop=["Close_1"])
        model = load_function_uri(model_name)

    ##### Transform Data  ############################################################
    df_out      = model(dfin, **model_pars)
    model_name2 = model_name.replace("::", "-")   ### "::" is not safe in file/column names

    # 'extract' functions only return one value, so there are no columns to loop over;
    # the name is wrapped in a list so a 1-column DataFrame accepts it.
    if 'extract' in model_name:
        col_out = ["0_" + model_name2]
    else:
        col_out = [coli + "_" + model_name2 for coli in df_out.columns]

    df_out.columns = col_out
    col_new        = col_out

    ###### Export  #####################################################################
    if 'path_features_store' in pars and 'path_pipeline_export' in pars:
        save_features(df_out, 'df_' + prefix, pars['path_features_store'])
        save(model,   pars['path_pipeline_export'] + f"/{prefix}_model.pkl")
        save(col_new, pars['path_pipeline_export'] + f"/{prefix}.pkl")
        save(pars,    pars['path_pipeline_export'] + f"/{prefix}_pars.pkl")

    col_pars = {'prefix': prefix,
                'path': pars.get('path_pipeline_export', pars.get('path_pipeline', None))}
    col_pars['cols_new'] = {
        prefix: col_new   ### list of columns
    }
    return df_out, col_pars
def pd_sample_imblearn(df=None, col=None, pars=None):
    """ Over-sample / re-sample rows using imbalanced-learn. """
    params_check(pars, ['model_name', 'pars_resample', 'coly'])   # , 'dfy'
    prefix = '_sample_imblearn'

    ######################################################################################
    from imblearn.over_sampling import SMOTE
    from imblearn.combine import SMOTEENN, SMOTETomek
    from imblearn.under_sampling import NearMiss

    model_resample = {'SMOTE': SMOTE, 'SMOTEENN': SMOTEENN, 'SMOTETomek': SMOTETomek,
                      'NearMiss': NearMiss}[pars.get("model_name", 'SMOTEENN')]
    pars_resample = pars.get('pars_resample',
                             {'sampling_strategy': 'auto', 'random_state': 0})   # , 'n_jobs': 2

    if 'path_pipeline' in pars:   #### Inference time
        return df, {'col_new': col}

    else:                         ### Training time
        colX    = col   # [col_ for col_ in col if col_ not in coly]
        coly    = pars['coly']
        train_y = pars['dfy']     ## df[coly]
        train_X = df[colX].fillna(method='ffill')
        gp      = model_resample(**pars_resample)
        X_resample, y_resample = gp.fit_resample(train_X, train_y)

        col_new   = [t + f"_{prefix}" for t in col]
        df2       = pd.DataFrame(X_resample, columns=col_new)   # , index=train_X.index
        df2[coly] = y_resample

    ###################################################################################
    if 'path_features_store' in pars and 'path_pipeline_export' in pars:
        save_features(df2, prefix.replace("col_", "df_"), pars['path_features_store'])
        save(gp,            pars['path_pipeline_export'] + f"/{prefix}_model.pkl")
        save(col,           pars['path_pipeline_export'] + f"/{prefix}.pkl")
        save(pars_resample, pars['path_pipeline_export'] + f"/{prefix}_pars.pkl")

    col_pars = {'prefix': prefix,
                'path': pars.get('path_pipeline_export', pars.get('path_pipeline', None))}
    col_pars['cols_new'] = {
        prefix: col_new   ### for training input data
    }
    return df2, col_pars
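############################################################################################
# Usage sketch for pd_sample_imblearn (illustrative only): small toy imbalance;
# k_neighbors is lowered because SMOTE needs k_neighbors < minority class size.
def example_pd_sample_imblearn():
    import pandas as pd
    df   = pd.DataFrame({'x1': range(20), 'x2': range(20, 40)})
    dfy  = pd.Series([0] * 15 + [1] * 5)
    pars = {'model_name': 'SMOTE', 'coly': 'y', 'dfy': dfy,
            'pars_resample': {'random_state': 0, 'k_neighbors': 3}}
    df2, col_pars = pd_sample_imblearn(df, col=['x1', 'x2'], pars=pars)
    log(df2.shape, col_pars['cols_new'])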
def pd_col_genetic_transform(df=None, col=None, pars=None):
    """ Find symbolic formulae for feature engineering """
    prefix = 'col_genetic'

    ######################################################################################
    from gplearn.genetic import SymbolicTransformer
    coly    = pars['coly']
    colX    = [t for t in col if t not in [coly]]
    train_X = df[colX]
    train_y = df[coly]

    function_set = ['add', 'sub', 'mul', 'div', 'sqrt', 'log', 'abs', 'neg', 'inv', 'tan']
    pars_genetic = pars.get('pars_genetic',
                            {'generations': 20, 'n_components': 10, 'population_size': 200})

    gp = SymbolicTransformer(hall_of_fame=100,
                             function_set=function_set,
                             parsimony_coefficient=0.0005,
                             max_samples=0.9, verbose=1,
                             random_state=0, n_jobs=6,
                             **pars_genetic)
    gp.fit(train_X, train_y)

    df_genetic = gp.transform(train_X)
    df_genetic = pd.DataFrame(df_genetic,
                              columns=["gen_" + str(a) for a in range(df_genetic.shape[1])])
    df_genetic.index = train_X.index
    col_genetic = list(df_genetic.columns)

    ###################################################################################
    if 'path_features_store' in pars and 'path_pipeline_export' in pars:
        save_features(df_genetic, 'df_genetic', pars['path_features_store'])
        save(gp,           pars['path_pipeline_export'] + f"/{prefix}_model.pkl")
        save(col_genetic,  pars['path_pipeline_export'] + f"/{prefix}.pkl")
        save(pars_genetic, pars['path_pipeline_export'] + f"/{prefix}_pars.pkl")

    col_pars = {'model': gp, 'pars': pars_genetic}
    col_pars['cols_new'] = {
        'col_genetic': col_genetic   ### list
    }
    return df_genetic, col_pars
def pd_colts_generate(df=None, col=None, pars={}):
    """ pars : {'model_name': "transform.robust_scaler", 'model_pars': {}} """
    prefix = 'colts_generate'

    ###### Custom code  ################################################################
    dfin       = df[col].fillna(method='ffill')
    model_name = pars['model_name']
    model_pars = pars.get('model_pars', {})

    if 'path_pipeline' in pars:   #### Prediction time
        model = load(pars['path_pipeline'] + f"/{prefix}_model.pkl")
        pars  = load(pars['path_pipeline'] + f"/{prefix}_pars.pkl")
    else:                         ### Training time : Dynamic function load
        from util_feature import load_function_uri
        ##### transform.robust_scaler(df, drop=["Close_1"])
        model      = load_function_uri(model_name)
        model_name = model_name.replace(".", "_")

    ##### Transform Data  ############################################################
    df_out         = model(dfin, col, **model_pars)
    col_out        = [coli + "_" + model_name for coli in df_out.columns]
    df_out.columns = col_out
    df_out.index   = dfin.index   ### was `train_X.index`, which is undefined here
    col_new        = col_out

    ###### Export  #####################################################################
    if 'path_features_store' in pars and 'path_pipeline_export' in pars:
        save_features(df_out, 'df_' + prefix, pars['path_features_store'])
        save(model,   pars['path_pipeline_export'] + f"/{prefix}_model.pkl")
        save(col_new, pars['path_pipeline_export'] + f"/{prefix}.pkl")
        save(pars,    pars['path_pipeline_export'] + f"/{prefix}_pars.pkl")

    col_pars = {'prefix': prefix,
                'path': pars.get('path_pipeline_export', pars.get('path_pipeline', None))}
    col_pars['cols_new'] = {
        prefix: col_new   ### list of columns
    }
    return df_out, col_pars
def pd_col_atemplate(df=None, col=None, pars={}):
    """ Example of a custom processor.
        Used at prediction time:
            "path_pipeline"        : load the saved model/params
        Training time:
            "path_features_store"  : to store intermediate dataframe
            "path_pipeline_export" : to store pipeline for later usage
    """
    from source.util_feature import save, load
    prefix = "col_myfun"

    #### Inference time : LOAD previous pars  ##########################################
    if "path_pipeline" in pars:
        prepro = load(pars["path_pipeline"] + f"/{prefix}_model.pkl")
        pars   = load(pars["path_pipeline"] + f"/{prefix}_pars.pkl")
        pars   = {} if pars is None else pars

    #### Do something  #################################################################
    df_new         = df[col]   ### Do nothing here, just copy and rename
    df_new.columns = [coli + "_myfun" for coli in df.columns]
    cols_new       = list(df_new.columns)

    prepro   = None   ### model
    pars_new = None   ### new params

    ###################################################################################
    ###### Training time : save all  ###################################################
    if "path_features_store" in pars and "path_pipeline_export" in pars:
        save(prepro,   pars["path_pipeline_export"] + f"/{prefix}_model.pkl")
        save(cols_new, pars["path_pipeline_export"] + f"/{prefix}.pkl")
        save(pars_new, pars["path_pipeline_export"] + f"/{prefix}_pars.pkl")

    ###### Training & Inference time : df + new column names  ##########################
    col_pars = {"prefix": prefix,
                "path": pars.get("path_pipeline_export", pars.get("path_pipeline", None))}
    col_pars["cols_new"] = {
        "col_myfun": cols_new   ### new column list
    }
    return df_new, col_pars
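############################################################################################
# Hedged sketch: how a custom processor such as pd_col_atemplate would be referenced from
# a preprocess pipe_list (format matches pipe_default below; the uri path is illustrative).
example_pipe_custom = [
    {'uri': 'source/prepro.py::pd_col_atemplate', 'pars': {}, 'cols_family': 'colnum', 'type': ''},
]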
def pd_filter_resample(df=None, col=None, pars=None):
    """ Over-sample, Under-sample """
    prefix = 'col_imbalance'

    ######################################################################################
    from imblearn.over_sampling import SMOTE
    model_resample = {'SMOTE': SMOTE}[pars.get("model_name", 'SMOTE')]
    pars_resample  = pars.get('pars_resample',
                              {'sampling_strategy': 'auto', 'random_state': 0,
                               'k_neighbors': 5, 'n_jobs': 2})

    if 'path_pipeline' in pars:   #### Inference time
        return df, {'col_new': col}
        # gp   = load(pars['path_pipeline'] + f"/{prefix}_model.pkl")
        # pars = load(pars['path_pipeline'] + f"/{prefix}_pars.pkl")
    else:                         ### Training time
        colX    = col   # [col_ for col_ in col if col_ not in coly]
        train_X = df[colX].fillna(method='ffill')
        coly    = pars['coly']
        train_y = pars['dfy']
        gp      = model_resample(**pars_resample)
        X_resample, y_resample = gp.fit_resample(train_X, train_y)

        df2       = pd.DataFrame(X_resample, columns=col)   ### resampling changes the row count, so train_X.index cannot be reused
        df2[coly] = y_resample
        col_new   = col

    ###################################################################################
    if 'path_features_store' in pars and 'path_pipeline_export' in pars:
        save_features(df2, 'df_resample', pars['path_features_store'])
        save(gp,            pars['path_pipeline_export'] + f"/{prefix}_model.pkl")
        save(col,           pars['path_pipeline_export'] + f"/{prefix}.pkl")
        save(pars_resample, pars['path_pipeline_export'] + f"/{prefix}_pars.pkl")

    col_pars = {'prefix': prefix,
                'path': pars.get('path_pipeline_export', pars.get('path_pipeline', None))}
    col_pars['cols_new'] = {
        prefix: col_new   ### list
    }
    return df2, col_pars
def pd_coltext(df, col, pars={}):
    """ df   : DataFrame
        col  : list of columns
        pars : dict of pars
    """
    from utils import util_text, util_model

    #### Load pars  ###################################################################
    path_pipeline        = pars.get('path_pipeline', None)
    word_tokeep_dict_all = load(path_pipeline + "/word_tokeep_dict_all.pkl") if path_pipeline is not None else {}
    # dftext_tdidf_all    = load(f'{path_pipeline}/dftext_tdidf.pkl') if path_pipeline else None
    # dftext_svd_list_all = load(f'{path_pipeline}/dftext_svd.pkl')   if path_pipeline else None
    dimpca       = pars.get('dimpca', 2)
    word_minfreq = pars.get('word_minfreq', 3)

    #### Process  #####################################################################
    stopwords = nlp_get_stopwords()
    dftext    = pd_coltext_clean(df, col, stopwords=stopwords, pars=pars)
    dftext_svd_list_all = None
    dftext_tdidf_all    = None

    ### For each text column: create/load the bag of words -> tf-idf -> SVD
    for col_ in col:
        if path_pipeline is not None:
            ### At inference time, reuse the saved bag of words for column `col_`
            word_tokeep = word_tokeep_dict_all[col_]
        else:
            ### At training time, create the bag of words
            coltext_freq, word_tokeep = pd_coltext_wordfreq(df, col_, stopwords, ntoken=100)   ## nb of words to keep
            word_tokeep_dict_all[col_] = word_tokeep   ## save the bag of words for `col_` in a dict

        dftext_tdidf_dict, word_tokeep_dict = util_text.pd_coltext_tdidf(
            dftext, coltext=col_, word_minfreq=word_minfreq,
            word_tokeep=word_tokeep, return_val="dataframe,param")

        dftext_tdidf_all = pd.DataFrame(dftext_tdidf_dict) if dftext_tdidf_all is None else \
                           pd.concat((dftext_tdidf_all, pd.DataFrame(dftext_tdidf_dict)), axis=1)
        log(word_tokeep_dict)

        ###  Dimension reduction for the sparse matrix
        dftext_svd_list, svd_list = util_model.pd_dim_reduction(
            dftext_tdidf_dict, colname=None, model_pretrain=None,
            colprefix=col_ + "_svd", method="svd", dimpca=dimpca,
            return_val="dataframe,param")

        dftext_svd_list_all = dftext_svd_list if dftext_svd_list_all is None else \
                              pd.concat((dftext_svd_list_all, dftext_svd_list), axis=1)

    #################################################################################
    ###### Save and Export  ##########################################################
    if 'path_features_store' in pars:
        save_features(dftext_svd_list_all, 'dftext_svd' + "-" + str(col), pars['path_features_store'])
        # save(dftext_svd_list_all, pars['path_pipeline_export'] + "/dftext_svd.pkl")
        # save(dftext_tdidf_all,    pars['path_pipeline_export'] + "/dftext_tdidf.pkl")
        save(word_tokeep_dict_all,  pars['path_pipeline_export'] + "/word_tokeep_dict_all.pkl")

    col_pars = {}
    col_pars['cols_new'] = {
        # 'coltext_tdidf': dftext_tdidf_all.columns.tolist(),   ### list
        'coltext_svd': dftext_svd_list_all.columns.tolist()     ### list
    }
    dftext_svd_list_all.index = dftext.index
    # return pd.concat((dftext_svd_list_all, dftext_svd_list_all), axis=1), col_pars
    return dftext_svd_list_all, col_pars
def preprocess(path_train_X="", path_train_y="", path_pipeline_export="", cols_group=None,
               n_sample=5000, preprocess_pars={}, path_features_store=None):
    """ Used for training only; saves the pipeline params on disk.
    :param path_train_X:        path to the feature data
    :param path_train_y:        path to the target data
    :param path_pipeline_export: where the fitted pipeline is exported
    :param cols_group:          dict of column groups (coly, colid, colcat, colnum, ...)
    :param n_sample:            nb of rows used
    :param preprocess_pars:     dict, contains 'pipe_list'
    :param path_features_store: where intermediate feature dataframes are stored
    :return: (dfXy, cols_family_all)
    """
    ##### column names for feature generation  ####################################################
    log(cols_group)
    coly   = cols_group['coly']    # 'salary'
    colid  = cols_group['colid']   # "jobId"
    colcat = cols_group['colcat']  # [ 'companyId', 'jobType', 'degree', 'major', 'industry' ]
    colnum = cols_group['colnum']  # ['yearsExperience', 'milesFromMetropolis']
    os.makedirs(path_pipeline_export, exist_ok=True)
    log(path_pipeline_export)
    save(colid, f'{path_pipeline_export}/colid.pkl')

    ### Pipeline Execution  ##########################################
    pipe_default = [
        {'uri': 'source/prepro.py::pd_coly',                'pars': {}, 'cols_family': 'coly',       'type': 'coly'},
        {'uri': 'source/prepro.py::pd_colnum_bin',          'pars': {}, 'cols_family': 'colnum',     'type': ''},
        {'uri': 'source/prepro.py::pd_colnum_binto_onehot', 'pars': {}, 'cols_family': 'colnum_bin', 'type': ''},
        {'uri': 'source/prepro.py::pd_colcat_bin',          'pars': {}, 'cols_family': 'colcat',     'type': ''},
        {'uri': 'source/prepro.py::pd_colcat_to_onehot',    'pars': {}, 'cols_family': 'colcat_bin', 'type': ''},
        {'uri': 'source/prepro.py::pd_colcross',            'pars': {}, 'cols_family': 'colcross',   'type': 'cross'},
    ]
    pipe_list   = preprocess_pars.get('pipe_list', pipe_default)
    pipe_list_X = [task for task in pipe_list if task.get('type', '') not in ['coly', 'filter']]
    pipe_list_y = [task for task in pipe_list if task.get('type', '') in ['coly']]
    pipe_filter = [task for task in pipe_list if task.get('type', '') in ['filter']]

    ##### Load data  ###############################################################################
    df = load_dataset(path_train_X, path_train_y, colid, n_sample=n_sample)

    ##### Generate features  ######################################################################
    dfi_all         = {}   ### Dict of all features
    cols_family_all = {'colid': colid, 'colnum': colnum, 'colcat': colcat}

    if len(pipe_filter) > 0:
        log("##### Filter  #########################################################################")
        pipe_i       = pipe_filter[0]
        pipe_fun     = load_function_uri(pipe_i['uri'])
        df, col_pars = pipe_fun(df, list(df.columns), pars=pipe_i.get('pars', {}))

    if len(pipe_list_y) > 0:
        log("##### coly  ###########################################################################")
        pipe_i   = pipe_list_y[0]
        pipe_fun = load_function_uri(pipe_i['uri'])
        logs("----------df----------\n", df)
        pars = pipe_i.get('pars', {})
        pars['path_features_store']  = path_features_store
        pars['path_pipeline_export'] = path_pipeline_export
        df, col_pars = pipe_fun(df, cols_group['coly'], pars=pars)   ### coly can remove rows
        logs("----------df----------\n", df)

        dfi_all['coly']         = df[cols_group['coly']]
        cols_family_all['coly'] = cols_group['coly']
        save_features(df[cols_group['coly']], "coly", path_features_store)   ### already saved
        save(coly, f'{path_pipeline_export}/coly.pkl')

    ##### Processors  ##############################################################################
    dfi_all['coly'] = df[cols_group['coly']]
    #for colg, colg_list in cols_group.items():
    #   if colg not in ['colid']:
    #      dfi_all[colg] = df[colg_list]   ## colnum colcat, coly

    for pipe_i in pipe_list_X:
        log("###################", pipe_i, "##########################################################")
        pipe_fun  = load_function_uri(pipe_i['uri'])   ### Load the code definition into pipe_fun
        cols_name = pipe_i['cols_family']
        col_type  = pipe_i['type']

        pars = pipe_i.get('pars', {})
        pars['path_features_store']  = path_features_store    ### intermediate dataframe
        pars['path_pipeline_export'] = path_pipeline_export   ### Store pipeline

        if col_type == 'cross':
            log("################### Adding Cross ###################################################")
            pars['dfnum_hot']       = dfi_all['colnum_onehot']   ### dfnum_hot --> dfcross
            pars['dfcat_hot']       = dfi_all['colcat_onehot']
            pars['colid']           = colid
            pars['colcross_single'] = cols_group.get('colcross', [])

        elif col_type == 'add_coly':
            log('add_coly genetic', cols_group['coly'])
            pars['coly'] = cols_group['coly']
            pars['dfy']  = dfi_all['coly']   ### Transformed dfy

        ### Input columns or previously computed columns ( colnum_bin )
        cols_list = cols_group[cols_name] if cols_name in cols_group else list(dfi_all[cols_name].columns)
        df_       = df[cols_list]          if cols_name in cols_group else dfi_all[cols_name]
        #cols_list = list(dfi_all[cols_name].columns)
        #df_       = dfi_all[cols_name]

        dfi, col_pars = pipe_fun(df_, cols_list, pars=pars)

        ### Concatenate colnum, colnum_bin into cols_family_all , dfi_all  ########################
        for colj, colist in col_pars['cols_new'].items():
            ### Merge sub-family
            cols_family_all[colj] = cols_family_all.get(colj, []) + colist
            dfi_all[colj] = pd.concat((dfi_all[colj], dfi), axis=1) if colj in dfi_all else dfi
            # save_features(dfi_all[colj], colj, path_features_store)

    ######  Merge All into dfXy  ##################################################################
    dfXy = df[[coly] + colnum + colcat]
    #dfXy = df[ [coly]  ]
    for t in dfi_all.keys():
        if t not in ['coly', 'colnum', 'colcat']:
            dfXy = pd.concat((dfXy, dfi_all[t]), axis=1)
    save_features(dfXy, 'dfX', path_features_store)

    colXy = list(dfXy.columns)
    colXy.remove(coly)   ##### Only X columns
    if len(colid) > 0:
        cols_family_all['colid'] = colid
    cols_family_all['colX'] = colXy

    #### Cols group for model input  ##########################################################
    save(colXy,           f'{path_pipeline_export}/colsX.pkl')
    save(cols_family_all, f'{path_pipeline_export}/cols_family.pkl')

    ###### Return values  ######################################################################
    return dfXy, cols_family_all
def run_preprocess(config_name, config_path, n_sample=5000,
                   mode='run_preprocess', model_dict=None):
    """ Run the preprocessing pipeline from a config. ("pre" prefix added to enable the mode if-loop)
    :param config_name: titanic_lightgbm
    :param config_path: titanic_classifier.py
    :param n_sample:    nb of rows used
    :param mode:        'run_preprocess' / 'load_preprocess'
    :param model_dict:  Optional, provide the model dict directly
    :return: None, only shows and saves the dataframe
    """
    model_dict = model_dict_load(model_dict, config_path, config_name, verbose=True)

    m = model_dict['global_pars']
    path_data           = m['path_data_preprocess']
    path_train_X        = m.get('path_data_prepro_X', path_data + "/features.zip")   ### Can be a list of zip or parquet files
    path_train_y        = m.get('path_data_prepro_y', path_data + "/target.zip")     ### Can be a list of zip or parquet files
    path_output         = m['path_train_output']
    path_pipeline       = m.get('path_pipeline',       path_output + "/pipeline/")
    path_features_store = m.get('path_features_store', path_output + '/features_store/')   # under path_output, because preprocessed files are stored there
    path_check_out      = m.get('path_check_out',      path_output + "/check/")
    log(path_output)

    log("#### load input column family  ###################################################")
    try:
        cols_group = model_dict['data_pars']['cols_input_type']   ### the model config file
    except Exception:
        cols_group = json.load(open(path_data + "/cols_group.json", mode='r'))

    #pars_download = model_dict['data_pars'].get('download_pars', None)
    #if pars_download:
    #    for url, target_path in pars_download['']:
    #        pass

    log("#### Preprocess  #################################################################")
    preprocess_pars = model_dict['model_pars']['pre_process_pars']

    if mode == "run_preprocess":
        dfXy, cols = preprocess(path_train_X, path_train_y, path_pipeline, cols_group,
                                n_sample, preprocess_pars, path_features_store)
    elif mode == "load_preprocess":
        dfXy, cols = preprocess_load(path_train_X, path_train_y, path_pipeline, cols_group,
                                     n_sample, preprocess_pars, path_features_store)

    model_dict['data_pars']['coly'] = cols['coly']

    ### Generate actual column names from column groups INTO a single list of columns
    model_dict['data_pars']['cols_model'] = sum(
        [cols[colgroup] for colgroup in model_dict['data_pars']['cols_model_group']], [])
    log(model_dict['data_pars']['cols_model'], model_dict['data_pars']['coly'])

    log("#### Save data on disk  #############################")
    dfXy.to_parquet(path_output + "/dfXy.parquet")
    save(model_dict, path_output + "/model_dict.pkl")

    log("######### finish #################################")
def pd_col_genetic_transform(df=None, col=None, pars=None):
    """ Find symbolic formulae for feature engineering """
    prefix = 'col_genetic'

    ######################################################################################
    from gplearn.genetic import SymbolicTransformer
    from gplearn.functions import make_function
    import random

    colX          = col   # [col_ for col_ in col if col_ not in coly]
    train_X       = df[colX].fillna(method='ffill')
    feature_name_ = colX

    def squaree(x):
        return x * x
    square_ = make_function(function=squaree, name='square_', arity=1)

    function_set = pars.get('function_set',
                            ['add', 'sub', 'mul', 'div', 'sqrt', 'log',
                             'abs', 'neg', 'inv', 'tan', square_])
    pars_genetic = pars.get('pars_genetic',
                            {'generations': 5,
                             'population_size': 10,   ### Higher than nb_features
                             'metric': 'spearman',
                             'tournament_size': 20,
                             'stopping_criteria': 1.0,
                             'const_range': (-1., 1.),
                             'p_crossover': 0.9,
                             'p_subtree_mutation': 0.01,
                             'p_hoist_mutation': 0.01,
                             'p_point_mutation': 0.01,
                             'p_point_replace': 0.05,
                             'parsimony_coefficient': 0.005,   #### 0.00005 Control Complexity
                             'max_samples': 0.9,
                             'verbose': 1,
                             #'n_components'   ### Controls number of output features : n_components
                             'random_state': 0,
                             'n_jobs': 4,
                             })

    if 'path_pipeline' in pars:   #### Inference time
        gp   = load(pars['path_pipeline'] + f"/{prefix}_model.pkl")
        pars = load(pars['path_pipeline'] + f"/{prefix}_pars.pkl")
    else:                         ### Training time
        coly    = pars['coly']
        train_y = pars['dfy']
        gp = SymbolicTransformer(hall_of_fame=train_X.shape[1] + 1,   ### Buggy
                                 n_components=pars_genetic.get('n_components', train_X.shape[1]),
                                 feature_names=feature_name_,
                                 function_set=function_set,
                                 **pars_genetic)
        gp.fit(train_X, train_y)

    ##### Transform Data  #########################################
    df_genetic   = gp.transform(train_X)
    tag          = random.randint(0, 10)   #### random tag to limit column-name collisions
    col_genetic  = [f"gen_{tag}_{i}" for i in range(df_genetic.shape[1])]
    df_genetic   = pd.DataFrame(df_genetic, columns=col_genetic, index=train_X.index)
    pars_gen_all = {'pars_genetic': pars_genetic, 'function_set': function_set}

    ##### Formulae Extraction  #####################################
    formula   = str(gp).replace("[", "").replace("]", "")
    flist     = formula.split(",\n")
    form_dict = {x: flist[i] for i, x in enumerate(col_genetic)}
    pars_gen_all['formulae_dict'] = form_dict
    log("########## Formulae ", form_dict)
    # col_pars['map_dict'] = dict(zip(train_X.columns.to_list(), feature_name_))
    col_new = col_genetic

    ###################################################################################
    if 'path_features_store' in pars and 'path_pipeline_export' in pars:
        save_features(df_genetic, 'df_genetic', pars['path_features_store'])
        save(gp,           pars['path_pipeline_export'] + f"/{prefix}_model.pkl")
        save(col_genetic,  pars['path_pipeline_export'] + f"/{prefix}.pkl")
        save(pars_gen_all, pars['path_pipeline_export'] + f"/{prefix}_pars.pkl")
        # save(form_dict,  pars['path_pipeline_export'] + f"/{prefix}_formula.pkl")
        save_json(form_dict, pars['path_pipeline_export'] + f"/{prefix}_formula.json")   ### Human readable

    col_pars = {'prefix': prefix,
                'path': pars.get('path_pipeline_export', pars.get('path_pipeline', None))}
    col_pars['cols_new'] = {
        prefix: col_new   ### list
    }
    return df_genetic, col_pars
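############################################################################################
# Usage sketch for pd_col_genetic_transform (illustrative only): tiny random data and a
# reduced pars_genetic so the run is fast; hall_of_fame / n_components stay <= population_size.
def example_pd_col_genetic_transform():
    import numpy as np
    import pandas as pd
    np.random.seed(0)
    df   = pd.DataFrame(np.random.rand(100, 3), columns=['f1', 'f2', 'f3'])
    dfy  = pd.Series(np.random.randint(0, 2, 100))
    pars = {'coly': 'y', 'dfy': dfy,
            'pars_genetic': {'generations': 3, 'population_size': 20,
                             'random_state': 0, 'n_jobs': 1}}
    df_genetic, col_pars = pd_col_genetic_transform(df, col=['f1', 'f2', 'f3'], pars=pars)
    log(df_genetic.shape, col_pars['cols_new'])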
def pd_augmentation_sdv(df, col=None, pars={}):
    ''' Using SDV Variational Autoencoder / GAN models, augment the dataset with synthetic rows
        params:
            df   : (pandas dataframe) original dataframe
            col  : column names for data enhancement
            pars : (dict - optional) contains:
                n_samples       : (int - optional) number of samples to add, default is 10%
                primary_key     : (String - optional) the primary key of the dataframe
                aggregate       : (boolean - optional) if False, prints the SDV metrics, else averages them
                path_model_save : saving location if save_model is set to True
                path_model_load : saved model location, to skip training
                path_data_new   : where the new data is saved
        returns:
            df_new : (pandas dataframe) df with the augmented rows appended
            col    : (list of strings) same columns
    '''
    n_samples       = pars.get('n_samples', max(1, int(len(df) * 0.10)))   ## Add 10% or at least 1 sample by default
    primary_key     = pars.get('colid', None)   ### Custom can be created on the fly
    metrics_type    = pars.get('aggregate', False)
    path_model_save = pars.get('path_model_save', 'data/output/ztmp/')
    model_name      = pars.get('model_name', "TVAE")

    # importing libraries
    try:
        #from sdv.demo import load_tabular_demo
        from sdv.tabular import TVAE
        from sdv.tabular import CTGAN
        from sdv.timeseries import PAR
        from sdv.evaluation import evaluate
        import ctgan
        if ctgan.__version__ != '0.3.1.dev0':
            raise Exception('ctgan outdated, updating...')
    except Exception:
        os.system("pip install sdv")
        os.system('pip install ctgan==0.3.1.dev0')
        from sdv.tabular import TVAE
        from sdv.tabular import CTGAN
        from sdv.timeseries import PAR
        from sdv.evaluation import evaluate

    # model fitting
    if 'path_model_load' in pars:
        model = load(pars['path_model_load'])
    else:
        log('##### Training Started #####')

        model = {'TVAE': TVAE, 'CTGAN': CTGAN, 'PAR': PAR}[model_name]
        if model_name == 'PAR':
            model = model(entity_columns=pars['entity_columns'],
                          context_columns=pars['context_columns'],
                          sequence_index=pars['sequence_index'])
        else:
            model = model(primary_key=primary_key)

        model.fit(df)
        log('##### Training Finished #####')
        try:
            save(model, path_model_save)
            log('model saved at: ', path_model_save)
        except Exception:
            log('saving model failed: ', path_model_save)

    log('##### Generating Samples #############')
    new_data = model.sample(n_samples)
    log_pd(new_data, n=7)

    log('######### Evaluation Results #########')
    if metrics_type is True:
        evals = evaluate(new_data, df, aggregate=True)
        log(evals)
    else:
        evals = evaluate(new_data, df, aggregate=False)
        log_pd(evals, n=7)

    # appending new data
    df_new = df.append(new_data)
    log(str(len(df_new) - len(df)) + ' new rows added')

    if 'path_newdata' in pars:
        new_data.to_parquet(pars['path_newdata'] + '/features.parquet')
        log('###### df augmentation saved on disk', pars['path_newdata'])

    log('###### augmentation complete ######')
    return df_new, col
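############################################################################################
# Usage sketch for pd_augmentation_sdv (illustrative only): TVAE on a toy table; training a
# real SDV model on 50 rows is only meant to exercise the code path, not to produce useful
# synthetic data. Assumes the sdv / ctgan versions pinned above are installable.
def example_pd_augmentation_sdv():
    import pandas as pd
    df   = pd.DataFrame({'id'    : range(50),
                         'amount': [float(i % 7) for i in range(50)]})
    pars = {'model_name': 'TVAE', 'colid': 'id', 'n_samples': 10}
    df_new, col = pd_augmentation_sdv(df, col=list(df.columns), pars=pars)
    log(len(df_new) - len(df), 'synthetic rows added')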
def train(model_dict, dfX, cols_family, post_process_fun):
    """ Train the model using model_dict, save the model, save the predictions.
    :param model_dict:       dict containing params
    :param dfX:              pd.DataFrame
    :param cols_family:      dict of lists containing column names
    :param post_process_fun:
    :return: dfXtrain, dfXval DataFrames containing the predictions, plus stats.
    """
    model_pars, compute_pars = model_dict['model_pars'], model_dict['compute_pars']
    data_pars                = model_dict['data_pars']
    model_name, model_path   = model_pars['model_class'], model_dict['global_pars']['path_train_model']
    metric_list              = compute_pars['metric_list']

    assert 'cols_model_type2' in data_pars, 'Missing cols_model_type2, split of columns by data type '
    log2(data_pars['cols_model_type2'])

    log("#### Model Input preparation  ##################################################")
    log2(dfX.shape)
    dfX    = dfX.sample(frac=1.0)
    itrain = int(0.6 * len(dfX))
    ival   = int(0.8 * len(dfX))
    colsX  = data_pars['cols_model']
    coly   = data_pars['coly']
    log2('Model colsX', colsX)
    log2('Model coly', coly)
    log2('Model column type: ', data_pars['cols_model_type2'])

    ### Only Parameters
    data_pars_ref = copy.deepcopy(data_pars)

    #### TODO : Lazy Dict to handle large datasets
    data_pars['data_type'] = 'ram'
    data_pars['train'] = {'Xtrain': dfX[colsX].iloc[:itrain, :],
                          'ytrain': dfX[coly].iloc[:itrain],
                          'Xtest':  dfX[colsX].iloc[itrain:ival, :],
                          'ytest':  dfX[coly].iloc[itrain:ival],
                          'Xval':   dfX[colsX].iloc[ival:, :],
                          'yval':   dfX[coly].iloc[ival:],
                          }

    log("#### Init, Train  ############################################################")
    # from config_model import map_model
    modelx = map_model(model_name)
    log2(modelx)
    modelx.reset()
    ### data_pars_ref has NO data.
    modelx.init(model_pars, data_pars=data_pars_ref, compute_pars=compute_pars)

    ### Using actual data in data_pars['train']
    modelx.fit(data_pars, compute_pars)

    log("#### Predict  ################################################################")
    ypred, ypred_proba  = modelx.predict(dfX[colsX], data_pars=data_pars_ref, compute_pars=compute_pars)
    dfX[coly + '_pred'] = ypred   # y_norm(ypred, inverse=True)

    dfX[coly]           = dfX[coly].apply(lambda x: post_process_fun(x))
    dfX[coly + '_pred'] = dfX[coly + '_pred'].apply(lambda x: post_process_fun(x))

    if ypred_proba is None:              ### No proba
        ypred_proba_val = None
    elif len(ypred_proba.shape) <= 1:    #### Single dim proba
        ypred_proba_val      = ypred_proba[ival:]
        dfX[coly + '_proba'] = ypred_proba
    elif len(ypred_proba.shape) > 1:     ## Multiple proba
        from util_feature import np_conv_to_one_col
        ypred_proba_val      = ypred_proba[ival:, :]
        dfX[coly + '_proba'] = np_conv_to_one_col(ypred_proba, ";")   ### merge into string "p1;p2;p3;p4"
    log(dfX.head(3).T)
    log2("Actual    : ", dfX[coly])
    log2("Prediction: ", dfX[coly + '_pred'])

    log("#### Metrics  ###############################################################")
    from util_feature import metrics_eval
    metrics_test = metrics_eval(metric_list,
                                ytrue=dfX[coly].iloc[ival:],
                                ypred=dfX[coly + '_pred'].iloc[ival:],
                                ypred_proba=ypred_proba_val)
    stats = {'metrics_test': metrics_test}
    log(stats)

    log("### Saving model, dfX, columns  #############################################")
    log2(model_path + "/model.pkl")
    os.makedirs(model_path, exist_ok=True)
    save(colsX, model_path + "/colsX.pkl")
    save(coly,  model_path + "/coly.pkl")
    modelx.save(model_path, stats)

    log("### Reload model  ###############################################")
    log2(modelx.model.model_pars, modelx.model.compute_pars)
    modelx = map_model(model_name)
    modelx.load_model(model_path)
    log("Reload model pars", modelx.model.model_pars)
    log2("Reload model", modelx.model)

    return dfX.iloc[:ival, :].reset_index(), dfX.iloc[ival:, :].reset_index(), stats
def preprocess(path_train_X="", path_train_y="", path_pipeline_export="", cols_group=None, n_sample=5000, preprocess_pars={}, filter_pars={}, path_features_store=None): """ :param path_train_X: :param path_train_y: :param path_pipeline_export: :param cols_group: :param n_sample: :param preprocess_pars: :param filter_pars: :param path_features_store: :return: """ from util_feature import (pd_colnum_tocat, pd_col_to_onehot, pd_colcat_mapping, pd_colcat_toint, pd_feature_generate_cross) ##### column names for feature generation ##################################################### log(cols_group) coly = cols_group['coly'] # 'salary' colid = cols_group['colid'] # "jobId" colcat = cols_group['colcat'] # [ 'companyId', 'jobType', 'degree', 'major', 'industry' ] colnum = cols_group['colnum'] # ['yearsExperience', 'milesFromMetropolis'] colcross_single = cols_group.get('colcross', []) ### List of single columns coltext = cols_group.get('coltext', []) coldate = cols_group.get('coldate', []) colall = colnum + colcat + coltext + coldate log(colall) #### Pipeline Execution pipe_default = [ 'filter', 'label', 'dfnum_bin', 'dfnum_hot', 'dfcat_bin', 'dfcat_hot', 'dfcross_hot', ] pipe_list = preprocess_pars.get('pipe_list', pipe_default) pipe_list.append('dfdate') pipe_list_pars = preprocess_pars.get('pipe_pars', []) ##### Load data ############################################################################## df = load_dataset(path_train_X, path_train_y, colid, n_sample= n_sample) ##### Filtering / cleaning rows : ######################################################### if "filter" in pipe_list : def isfloat(x): try : a= float(x) return 1 except: return 0 ymin, ymax = filter_pars.get('ymin', -9999999999.0), filter_pars.get('ymax', 999999999.0) print(coly) df['_isfloat'] = df[ coly ].apply(lambda x : isfloat(x)) print(df['_isfloat']) df = df[ df['_isfloat'] > 0 ] df = df[df[coly] > ymin] df = df[df[coly] < ymax] ##### Label processing #################################################################### y_norm_fun = None if "label" in pipe_list : # Target coly processing, Normalization process , customize by model log("y_norm_fun preprocess_pars") y_norm_fun = preprocess_pars.get('y_norm_fun', None) if y_norm_fun is not None: df[coly] = df[coly].apply(lambda x: y_norm_fun(x)) save(y_norm_fun, f'{path_pipeline_export}/y_norm.pkl' ) save_features(df[coly], 'dfy', path_features_store) ########### colnum procesing ############################################################# for x in colnum: print('bam',x) df[x] = df[x].astype("float") log(df[colall].dtypes) if "dfnum" in pipe_list : pass if "dfnum_norm" in pipe_list : log("### colnum normalize ###############################################################") from util_feature import pd_colnum_normalize pars = { 'pipe_list': [ {'name': 'fillna', 'naval' : 0.0 }, {'name': 'minmax'} ]} dfnum_norm, colnum_norm = pd_colnum_normalize(df, colname=colnum, pars=pars, suffix = "_norm", return_val="dataframe,param") log(colnum_norm) save_features(dfnum_norm, 'dfnum_norm', path_features_store) if "dfnum_bin" in pipe_list : log("### colnum Map numerics to Category bin ###########################################") dfnum_bin, colnum_binmap = pd_colnum_tocat(df, colname=colnum, colexclude=None, colbinmap=None, bins=10, suffix="_bin", method="uniform", return_val="dataframe,param") log(colnum_binmap) ### Renaming colunm_bin with suffix colnum_bin = [x + "_bin" for x in list(colnum_binmap.keys())] log(colnum_bin) save_features(dfnum_bin, 'dfnum_binmap', path_features_store) 
if "dfnum_hot" in pipe_list and "dfnum_bin" in pipe_list : log("### colnum bin to One Hot") dfnum_hot, colnum_onehot = pd_col_to_onehot(dfnum_bin[colnum_bin], colname=colnum_bin, colonehot=None, return_val="dataframe,param") log(colnum_onehot) save_features(dfnum_hot, 'dfnum_onehot', path_features_store) ##### Colcat processing ################################################################ colcat_map = pd_colcat_mapping(df, colcat) log(df[colcat].dtypes, colcat_map) if "dfcat_hot" in pipe_list : log("#### colcat to onehot") dfcat_hot, colcat_onehot = pd_col_to_onehot(df[colcat], colname=colcat, colonehot=None, return_val="dataframe,param") log(dfcat_hot[colcat_onehot].head(5)) save_features(dfcat_hot, 'dfcat_onehot', path_features_store) if "dfcat_bin" in pipe_list : log("#### Colcat to integer encoding ") dfcat_bin, colcat_bin_map = pd_colcat_toint(df[colcat], colname=colcat, colcat_map=None, suffix="_int") colcat_bin = list(dfcat_bin.columns) save_features(dfcat_bin, 'dfcat_bin', path_features_store) if "dfcross_hot" in pipe_list : log("##### Cross Features From OneHot Features ######################################") try : df_onehot = dfcat_hot.join(dfnum_hot, on=colid, how='left') except : df_onehot = copy.deepcopy(dfcat_hot) colcross_single_onehot_select = [] for t in list(df_onehot) : for c1 in colcross_single : if c1 in t : colcross_single_onehot_select.append(t) df_onehot = df_onehot[colcross_single_onehot_select ] dfcross_hot, colcross_pair = pd_feature_generate_cross(df_onehot, colcross_single_onehot_select, pct_threshold=0.02, m_combination=2) log(dfcross_hot.head(2).T) colcross_pair_onehot = list(dfcross_hot.columns) save_features(dfcross_hot, 'dfcross_onehot', path_features_store) del df_onehot ,colcross_pair_onehot if "dftext" in pipe_list : log("##### Coltext processing ###############################################################") stopwords = nlp_get_stopwords() pars = {'n_token' : 100 , 'stopwords': stopwords} dftext = None for coltext_i in coltext : ##### Run the text processor on each column text ############################# dftext_i = pipe_text( df[[coltext_i ]], coltext_i, pars ) dftext = pd.concat((dftext, dftext_i), axis=1) if dftext is not None else dftext_i save_features(dftext_i, 'dftext_' + coltext_i, path_features_store) log(dftext.head(6)) save_features(dftext, 'dftext', path_features_store) if "dfdate" in pipe_list : log("##### Coldate processing #############################################################") from utils import util_date dfdate = None for coldate_i in coldate : dfdate_i = util_date.pd_datestring_split( df[[coldate_i]] , coldate_i, fmt="auto", return_val= "split" ) dfdate = pd.concat((dfdate, dfdate_i), axis=1) if dfdate is not None else dfdate_i save_features(dfdate_i, 'dfdate_' + coldate_i, path_features_store) save_features(dfdate, 'dfdate', path_features_store) print('spoo',dfdate) ################################################################################### # ############### ##### Save pre-processor meta-parameters os.makedirs(path_pipeline_export, exist_ok=True) log(path_pipeline_export) cols_family = {} for t in ['colid', "colnum", "colnum_bin", "colnum_onehot", "colnum_binmap", #### Colnum columns "colcat", "colcat_bin", "colcat_onehot", "colcat_bin_map", #### colcat columns 'colcross_single_onehot_select', "colcross_pair_onehot", 'colcross_pair', #### colcross columns 'coldate', 'coltext', "coly", "y_norm_fun" ]: tfile = f'{path_pipeline_export}/{t}.pkl' log(tfile) t_val = locals().get(t, None) if t_val is not None : 
            save(t_val, tfile)
            cols_family[t] = t_val

    ###### Merge all #############################################################################
    dfXy = df[colnum + colcat + [coly]]
    for t in ['dfnum_bin', 'dfnum_hot', 'dfcat_bin', 'dfcat_hot', 'dfcross_hot', 'dfdate', 'dftext']:
        if t in locals():
            dfXy = pd.concat((dfXy, locals()[t]), axis=1)
    save_features(dfXy, 'dfX', path_features_store)

    colXy = list(dfXy.columns)
    colXy.remove(coly)    ##### Only X columns
    cols_family['colX'] = colXy
    save(colXy,       f'{path_pipeline_export}/colsX.pkl')
    save(cols_family, f'{path_pipeline_export}/cols_family.pkl')

    ###### Return values #########################################################################
    return dfXy, cols_family
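##################################################################################################
# Usage sketch (illustrative only): how preprocess() is typically wired together. All paths
# below are hypothetical placeholders; the column groups mirror the salary/jobId example used
# in the comments above.
def example_preprocess_usage():
    cols_group = {'coly':   'salary',
                  'colid':  'jobId',
                  'colcat': ['companyId', 'jobType', 'degree', 'major', 'industry'],
                  'colnum': ['yearsExperience', 'milesFromMetropolis'],
                  }
    dfXy, cols_family = preprocess(path_train_X         = "data/input/train_features.csv",  # hypothetical path
                                   path_train_y         = "data/input/train_target.csv",    # hypothetical path
                                   path_pipeline_export = "data/output/pipeline/",
                                   cols_group           = cols_group,
                                   n_sample             = 5000,
                                   preprocess_pars      = {'pipe_list': ['filter', 'label', 'dfnum_bin',
                                                                         'dfnum_hot', 'dfcat_bin',
                                                                         'dfcat_hot', 'dfcross_hot']},
                                   filter_pars          = {'ymin': 0.0, 'ymax': 1e9},
                                   path_features_store  = "data/output/features/")
    return dfXy, cols_family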
def pd_colnum_quantile_norm(df, col, pars={}):
    """ Normalize numeric columns by clipping to quantile-based (IQR) bounds,
        with a separate rule for sparse columns dominated by a single mode value.
    """
    prefix  = "colnum_quantile_norm"
    df      = df[col]
    num_col = col

    ##### Grab previously computed params ########################################################
    pars2 = {}
    if 'path_pipeline' in pars:   #### Load existing column list
        colnum_quantile_norm = load(pars['path_pipeline'] + f'/{prefix}.pkl')
        model                = load(pars['path_pipeline'] + f'/{prefix}_model.pkl')
        pars2                = load(pars['path_pipeline'] + f'/{prefix}_pars.pkl')

    lower_bound_sparse = pars2.get('lower_bound_sparse', None)
    upper_bound_sparse = pars2.get('upper_bound_sparse', None)
    lower_bound        = pars2.get('lower_bound', None)
    upper_bound        = pars2.get('upper_bound', None)
    sparse_col         = pars2.get('colsparse', ['capital-gain', 'capital-loss'])

    ####### Find IQR and apply to dense and sparse columns separately ############################
    Q1  = df.quantile(0.25)
    Q3  = df.quantile(0.75)
    IQR = Q3 - Q1

    for col in num_col:
        if col in sparse_col:
            ### Exclude the dominant mode value before computing sparse-column bounds
            df_nosparse = pd.DataFrame(df[df[col] != df[col].mode()[0]][col])

            if lower_bound_sparse is not None:
                pass
            elif df_nosparse[col].quantile(0.25) < df[col].mode()[0]:   # Unexpected case
                lower_bound_sparse = df_nosparse[col].quantile(0.25)
            else:
                lower_bound_sparse = df[col].mode()[0]

            if upper_bound_sparse is not None:
                pass
            elif df_nosparse[col].quantile(0.75) < df[col].mode()[0]:   # Unexpected case
                upper_bound_sparse = df[col].mode()[0]
            else:
                upper_bound_sparse = df_nosparse[col].quantile(0.75)

            n_outliers = len(df[(df[col] < lower_bound_sparse) | (df[col] > upper_bound_sparse)][col])
            if n_outliers > 0:
                df.loc[df[col] < lower_bound_sparse, col] = lower_bound_sparse * 0.75   # --> MAIN DF CHANGED
                df.loc[df[col] > upper_bound_sparse, col] = upper_bound_sparse * 1.25   # --> MAIN DF CHANGED

        else:
            if lower_bound is None or upper_bound is None:
                lower_bound = df[col].quantile(0.25) - 1.5 * IQR[col]
                upper_bound = df[col].quantile(0.75) + 1.5 * IQR[col]

            df[col] = np.where(df[col] > upper_bound, 1.25 * upper_bound, df[col])
            df[col] = np.where(df[col] < lower_bound, 0.75 * lower_bound, df[col])

    df.columns = [t + "_qt_norm" for t in df.columns]
    pars_new   = {'lower_bound': lower_bound,               'upper_bound': upper_bound,
                  'lower_bound_sparse': lower_bound_sparse, 'upper_bound_sparse': upper_bound_sparse}
    dfnew  = df
    model  = None
    colnew = list(df.columns)

    ##### Export #################################################################################
    if 'path_features_store' in pars and 'path_pipeline_export' in pars:
        save_features(df, prefix, pars['path_features_store'])
        save(colnew,   pars['path_pipeline_export'] + f"/{prefix}.pkl")
        save(pars_new, pars['path_pipeline_export'] + f"/{prefix}_pars.pkl")
        save(model,    pars['path_pipeline_export'] + f"/{prefix}_model.pkl")

    col_pars = {'prefix': prefix,
                'path':   pars.get('path_pipeline_export', pars.get('path_pipeline', None))}
    col_pars['cols_new'] = {
        prefix: colnew   ### list
    }
    return dfnew, col_pars
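##################################################################################################
# Minimal self-contained sketch of the dense-column IQR rule used in pd_colnum_quantile_norm
# above, assuming a plain pandas Series with no sparse columns and no previously saved
# pipeline bounds. Values beyond [Q1 - 1.5*IQR, Q3 + 1.5*IQR] are pulled back toward the
# bounds with the same 0.75x / 1.25x factors as the function above.
def example_iqr_clip(s):
    import numpy as np
    q1, q3       = s.quantile(0.25), s.quantile(0.75)
    iqr          = q3 - q1
    lower, upper = q1 - 1.5 * iqr, q3 + 1.5 * iqr
    s = np.where(s > upper, 1.25 * upper, s)   # clip high outliers toward the upper bound
    s = np.where(s < lower, 0.75 * lower, s)   # clip low outliers toward the lower bound
    return s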
def text_preprocess(path_train_X="", path_train_y="", path_pipeline_export="", cols_group=None, n_sample=5000,
                    preprocess_pars={}, filter_pars={}, path_features_store=None):
    """ Run the text preprocessing pipeline: cleaning, stopword removal, TF-IDF, SVD reduction.
    :param path_train_X:          path to the training features
    :param path_train_y:          path to the training target
    :param path_pipeline_export:  folder where fitted pipeline parameters (.pkl) are saved
    :param cols_group:            dict of column groups: coly, colid, colcat, colnum, colcross, coltext, coldate
    :param n_sample:              number of rows sampled from the dataset
    :param preprocess_pars:       preprocessing parameters (kept for interface parity)
    :param filter_pars:           row-filtering parameters (kept for interface parity)
    :param path_features_store:   folder where intermediate feature frames are saved
    :return: (dftext1, cols_family) text feature dataframe and dict of column families
    """
    from util_feature import (pd_colnum_tocat, pd_col_to_onehot, pd_colcat_mapping, pd_colcat_toint,
                              pd_feature_generate_cross)

    ##### Column names for feature generation ####################################################
    log(cols_group)
    coly            = cols_group['coly']    # 'salary'
    colid           = cols_group['colid']   # "jobId"
    colcat          = cols_group['colcat']  # ['companyId', 'jobType', 'degree', 'major', 'industry']
    colnum          = cols_group['colnum']  # ['yearsExperience', 'milesFromMetropolis']
    colcross_single = cols_group.get('colcross', [])   ### List of single columns
    coltext         = cols_group.get('coltext', [])
    coldate         = cols_group.get('coldate', [])
    colall          = colnum + colcat + coltext + coldate
    log(colall)

    ##### Load data ##############################################################################
    df = load_dataset(path_train_X, path_train_y, colid, n_sample=n_sample)

    log("##### Coltext processing ###############################################################")
    from utils import util_text, util_model

    ### Remove common words ######################################################################
    import json
    import re
    import string
    stopwords = json.load(open("stopwords_en.json"))["word"]
    stopwords = [t for t in string.punctuation] + stopwords
    stopwords = ["", " ", ",", ".", "-", "*", '€', "+", "/"] + stopwords
    stopwords = list(set(stopwords))
    stopwords.sort()
    log(stopwords)
    stopwords = set(stopwords)

    def pipe_text(df, col, pars={}):
        ### Clean one text column, extract word frequencies, TF-IDF, then reduce with SVD
        ntoken = pars['n_token']
        df     = df.fillna("")
        dftext = df
        log(dftext)
        log(col)
        list1 = [col]

        for col_n in list1:
            dftext[col_n] = dftext[col_n].fillna("")
            dftext[col_n] = dftext[col_n].str.lower()
            ### str.translate needs a translation table, not a raw string
            dftext[col_n] = dftext[col_n].apply(lambda x: x.translate(str.maketrans("", "", string.punctuation)))
            dftext[col_n] = dftext[col_n].apply(lambda x: x.translate(str.maketrans("", "", string.digits)))
            dftext[col_n] = dftext[col_n].apply(lambda x: re.sub("[!@,#$+%*:()'-]", " ", x))
            dftext[col_n] = dftext[col_n].apply(lambda x: coltext_stopwords(x, stopwords=stopwords))
        log(dftext.head(6))

        ### Word frequency of the text column: keep the ntoken most frequent non-stopwords
        sep = " "
        coltext_freq = df[col].apply(lambda x: pd.value_counts(x.split(sep))).sum(axis=0).reset_index()
        coltext_freq.columns = ["word", "freq"]
        coltext_freq = coltext_freq.sort_values("freq", ascending=False)
        log(coltext_freq)

        word_tokeep = coltext_freq["word"].values[:ntoken]
        word_tokeep = [t for t in word_tokeep if t not in stopwords]

        dftext_tdidf_dict, word_tokeep_dict = util_text.pd_coltext_tdidf(
            dftext, coltext=col, word_minfreq=1, word_tokeep=word_tokeep,
            return_val="dataframe,param")
        log(word_tokeep_dict)

        ### Dimension reduction for the sparse TF-IDF matrix
        dftext_svd_list, svd_list = util_model.pd_dim_reduction(
            dftext_tdidf_dict, colname=None, model_pretrain=None,
            colprefix=col + "_svd", method="svd", dimpca=2,
            return_val="dataframe,param")
        return dftext_svd_list

    pars    = {'n_token': 100}
    dftext1 = None
    for coltext_i in coltext:
        dftext_i = pipe_text(df[[coltext_i]], coltext_i, pars)
        save_features(dftext_i, 'dftext_' + coltext_i, path_features_store)
        dftext1 = pd.concat((dftext1, dftext_i), axis=1) if dftext1 is not None else dftext_i
    log(dftext1.head(6))
    dftext1.to_csv(os.path.join(path_features_store, "dftext.csv"), index=False)

    ##############################################################################################
    ##### Save pre-processor meta-parameters
    os.makedirs(path_pipeline_export, exist_ok=True)
    log(path_pipeline_export)
    cols_family = {}
    for t in ['coltext']:
        tfile = f'{path_pipeline_export}/{t}.pkl'
        log(tfile)
        t_val = locals().get(t, None)
        if t_val is not None:
            save(t_val, tfile)
            cols_family[t] = t_val

    return dftext1, cols_family
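##################################################################################################
# Illustrative equivalent of the TF-IDF + SVD step inside pipe_text() above, written with
# scikit-learn directly (the real pipeline goes through the util_text / util_model wrappers;
# this sketch only shows the underlying transform on a list of raw documents).
def example_tfidf_svd(texts, n_token=100, dim=2):
    from sklearn.decomposition import TruncatedSVD
    from sklearn.feature_extraction.text import TfidfVectorizer
    tfidf = TfidfVectorizer(max_features=n_token)          # keep the n_token most frequent terms
    X     = tfidf.fit_transform(texts)                     # sparse document-term matrix
    svd   = TruncatedSVD(n_components=dim, random_state=0)
    return svd.fit_transform(X)                            # dense (n_docs, dim) embedding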
def train(model_dict, dfX, cols_family, post_process_fun):
    """ Train the model defined by model_dict, save the model, save the predictions.
    :param model_dict:        dict containing model_pars, compute_pars, data_pars, global_pars
    :param dfX:               pd.DataFrame with features and target
    :param cols_family:       dict of lists containing column names
    :param post_process_fun:  function applied to target and predictions (inverse of y_norm)
    :return: (dfXtrain, dfXval) DataFrames containing the predictions.
    """
    model_pars, compute_pars = model_dict['model_pars'], model_dict['compute_pars']
    data_pars                = model_dict['data_pars']
    model_name, model_path   = model_pars['model_class'], model_dict['global_pars']['path_train_model']
    metric_list              = compute_pars['metric_list']

    log("#### Data preparation #########################################################")
    log(dfX.shape)
    dfX    = dfX.sample(frac=1.0)    ### Shuffle rows before the 60/20/20 split
    itrain = int(0.6 * len(dfX))
    ival   = int(0.8 * len(dfX))
    colsX  = data_pars['cols_model']
    coly   = data_pars['coly']
    log('Model colsX', colsX)
    log('Model coly',  coly)

    data_pars['data_type'] = 'ram'
    data_pars['train'] = {'Xtrain': dfX[colsX].iloc[:itrain, :],
                          'ytrain': dfX[coly].iloc[:itrain],
                          'Xtest':  dfX[colsX].iloc[itrain:ival, :],
                          'ytest':  dfX[coly].iloc[itrain:ival],
                          'Xval':   dfX[colsX].iloc[ival:, :],
                          'yval':   dfX[coly].iloc[ival:],
                          }

    log("#### Init, Train ############################################################")
    # from config_model import map_model
    modelx = map_model(model_name)
    log(modelx)
    modelx.reset()
    modelx.init(model_pars, compute_pars=compute_pars)
    ### Optuna-based models share the same fit() entry point
    modelx.fit(data_pars, compute_pars)

    log("#### Predict ################################################################")
    ypred, ypred_proba  = modelx.predict(dfX[colsX], compute_pars=compute_pars)
    dfX[coly + '_pred'] = ypred   # y_norm(ypred, inverse=True)
    dfX[coly]           = dfX[coly].apply(lambda x: post_process_fun(x))
    dfX[coly + '_pred'] = dfX[coly + '_pred'].apply(lambda x: post_process_fun(x))

    if ypred_proba is None:
        ypred_proba_val = None
    elif len(ypred_proba.shape) <= 1:
        ypred_proba_val      = ypred_proba[ival:]
        dfX[coly + '_proba'] = ypred_proba
    else:
        from util_feature import np_conv_to_one_col
        ypred_proba_val      = ypred_proba[ival:, :]
        dfX[coly + '_proba'] = np_conv_to_one_col(ypred_proba, ";")   ### Merge into string "p1;p2;p3"
    log(dfX.head(3).T)
    log("Actual    : ", dfX[coly])
    log("Prediction: ", dfX[coly + '_pred'])

    log("#### Metrics #############################################################")
    from util_feature import metrics_eval
    metrics_test = metrics_eval(metric_list,
                                ytrue=dfX[coly].iloc[ival:],
                                ypred=dfX[coly + '_pred'].iloc[ival:],
                                ypred_proba=ypred_proba_val)
    stats = {'metrics_test': metrics_test}
    log(stats)

    log("### Saving model, dfX, columns ###########################################")
    log(model_path + "/model.pkl")
    os.makedirs(model_path, exist_ok=True)
    save(colsX, model_path + "/colsX.pkl")
    save(coly,  model_path + "/coly.pkl")
    modelx.save(model_path, stats)

    log("### Reload model ############################################")
    log(modelx.model.model_pars, modelx.model.compute_pars)
    a = load(model_path + "/model.pkl")
    log("Reload model pars", a.model_pars)

    return dfX.iloc[:ival, :].reset_index(), dfX.iloc[ival:, :].reset_index()
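##################################################################################################
# Sketch of the 60/20/20 split convention used by train() above: after shuffling, the first
# 60% of rows is train, the next 20% test (fit-time feedback), and the last 20% val (held out
# for the reported metrics and the returned dfXval). random_state is an added assumption for
# reproducibility; train() itself shuffles without a fixed seed.
def example_split_60_20_20(dfX):
    dfX    = dfX.sample(frac=1.0, random_state=0)
    itrain = int(0.6 * len(dfX))
    ival   = int(0.8 * len(dfX))
    return dfX.iloc[:itrain], dfX.iloc[itrain:ival], dfX.iloc[ival:]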