def run_predict(model_name, path_model, path_data, path_output, n_sample=-1):
    path_output   = root + path_output
    path_data     = root + path_data + "/features.zip"   # .zip
    path_model    = root + path_model
    path_pipeline = path_model + "/pipeline/"
    path_test_X   = path_data + "/features.zip"          # .zip  path to testing features (currently unused)
    log(path_data, path_model, path_output)

    colid = load(f'{path_pipeline}/colid.pkl')
    df    = load_dataset(path_data, path_data_y=None, colid=colid, n_sample=n_sample)

    dfX, cols_family = preprocess(df, path_pipeline)
    ypred, yproba    = predict(model_name, path_model, dfX, cols_family)

    log("Saving prediction", ypred.shape, path_output)
    os.makedirs(path_output, exist_ok=True)
    df[cols_family["coly"] + "_pred"] = ypred
    if yproba is not None:
        df[cols_family["coly"] + "_pred_proba"] = yproba
    df.to_csv(f"{path_output}/prediction.csv")
    log(df.head(8))

    ##### Export Specific
    df[cols_family["coly"]] = ypred
    df[[cols_family["coly"]]].to_csv(f"{path_output}/pred_only.csv")

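# Hedged usage sketch (illustrative, not part of the original pipeline): the model name and
# relative paths below are hypothetical examples of the path-based signature above; `root`
# is assumed to be the module-level base directory prepended inside the function.
#
#   run_predict(model_name="lightgbm",
#               path_model="data/output/model/",
#               path_data="data/input/test/",
#               path_output="data/output/pred/",
#               n_sample=-1)          # -1 keeps every row of the test set

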
def run_predict(config_name, config_path, n_sample=-1,
                path_data=None, path_output=None, pars={}, model_dict=None):
    model_dict  = model_dict_load(model_dict, config_path, config_name, verbose=True)
    m           = model_dict['global_pars']
    model_class = model_dict['model_pars']['model_class']

    path_data     = m['path_pred_data'] if path_data is None else path_data
    path_pipeline = m['path_pred_pipeline']   # path_output + "/pipeline/"
    path_model    = m['path_pred_model']
    path_output   = m['path_pred_output'] if path_output is None else path_output
    log(path_data, path_model, path_output)

    pars = {'cols_group': model_dict['data_pars']['cols_input_type'],
            'pipe_list' : model_dict['model_pars']['pre_process_pars']['pipe_list']}

    ##########################################################################################
    colid = load(f'{path_pipeline}/colid.pkl')
    df    = load_dataset(path_data, path_data_y=None, colid=colid, n_sample=n_sample)

    from run_preprocess import preprocess_inference as preprocess
    dfX, cols_family = preprocess(df, path_pipeline, preprocess_pars=pars)
    ypred, yproba    = predict(model_class, path_model, dfX, cols_family)

    log("############ Saving prediction ###################################################")
    log(ypred.shape, path_output)
    os.makedirs(path_output, exist_ok=True)
    df[cols_family["coly"] + "_pred"] = ypred
    if yproba is not None:
        df[cols_family["coly"] + "_pred_proba"] = yproba
    df.to_csv(f"{path_output}/prediction.csv")
    log(df.head(8))

    log("########### Export Specific ######################################################")
    df[cols_family["coly"]] = ypred
    df[[cols_family["coly"]]].to_csv(f"{path_output}/pred_only.csv")

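# Sketch of the `global_pars` keys this config-driven run_predict reads (drawn from the
# lookups above); the literal path values are hypothetical placeholders.
#
#   model_dict['global_pars'] = {
#       'path_pred_data'    : "data/input/pred/",       # input features for prediction
#       'path_pred_pipeline': "data/output/pipeline/",  # fitted preprocessing pipeline
#       'path_pred_model'   : "data/output/model/",     # trained model artefacts
#       'path_pred_output'  : "data/output/pred/",      # where prediction.csv is written
#   }

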
def run_data_check(path_data, path_data_ref, path_model, path_output, sample_ratio=0.5):
    """ Calculate dataset shift before prediction.
    """
    from run_preprocess import preprocess_inference as preprocess
    path_output   = root + path_output
    path_data     = root + path_data
    path_data_ref = root + path_data_ref
    path_pipeline = root + path_model + "/pipeline/"
    os.makedirs(path_output, exist_ok=True)

    colid = load(f'{path_pipeline}/colid.pkl')

    df1                = load_dataset(path_data_ref, colid=colid)
    dfX1, cols_family1 = preprocess(df1, path_pipeline)

    df2                = load_dataset(path_data, colid=colid)
    dfX2, cols_family2 = preprocess(df2, path_pipeline)

    colsX = cols_family1["colnum_bin"] + cols_family1["colcat_bin"]
    dfX1  = dfX1[colsX]
    dfX2  = dfX2[colsX]

    from util_feature import pd_stat_dataset_shift
    nsample     = int(min(len(dfX1), len(dfX2)) * sample_ratio)
    metrics_psi = pd_stat_dataset_shift(dfX2, dfX1, colsX, nsample=nsample, buckets=7, axis=0)
    metrics_psi.to_csv(f"{path_output}/prediction_features_metrics.csv")
    log(metrics_psi)

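# Hedged usage sketch: compares a new scoring dataset against the training reference using
# pd_stat_dataset_shift (PSI metrics). Paths are hypothetical and relative to `root`.
#
#   run_data_check(path_data="data/input/pred/",
#                  path_data_ref="data/input/train/",
#                  path_model="data/output/model/",
#                  path_output="data/output/check/",
#                  sample_ratio=0.5)

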
def preprocess(path_train_X="", path_train_y="", path_pipeline_export="", cols_group=None,
               n_sample=5000, preprocess_pars={}, path_features_store=None):
    """ Used for training only; saves pipeline params on disk.
    :param path_train_X:
    :param path_train_y:
    :param path_pipeline_export:
    :param cols_group:
    :param n_sample:
    :param preprocess_pars:
    :param path_features_store:
    :return:
    """
    ##### column names for feature generation #####################################################
    log(cols_group)
    coly   = cols_group['coly']    # 'salary'
    colid  = cols_group['colid']   # "jobId"
    colcat = cols_group['colcat']  # [ 'companyId', 'jobType', 'degree', 'major', 'industry' ]
    colnum = cols_group['colnum']  # ['yearsExperience', 'milesFromMetropolis']
    os.makedirs(path_pipeline_export, exist_ok=True)
    log(path_pipeline_export)
    save(colid, f'{path_pipeline_export}/colid.pkl')

    ### Pipeline Execution ##########################################
    pipe_default = [
        {'uri': 'source/prepro.py::pd_coly',                'pars': {}, 'cols_family': 'coly',       'type': 'coly'},
        {'uri': 'source/prepro.py::pd_colnum_bin',          'pars': {}, 'cols_family': 'colnum',     'type': ''},
        {'uri': 'source/prepro.py::pd_colnum_binto_onehot', 'pars': {}, 'cols_family': 'colnum_bin', 'type': ''},
        {'uri': 'source/prepro.py::pd_colcat_bin',          'pars': {}, 'cols_family': 'colcat',     'type': ''},
        {'uri': 'source/prepro.py::pd_colcat_to_onehot',    'pars': {}, 'cols_family': 'colcat_bin', 'type': ''},
        {'uri': 'source/prepro.py::pd_colcross',            'pars': {}, 'cols_family': 'colcross',   'type': 'cross'},
    ]

    pipe_list   = preprocess_pars.get('pipe_list', pipe_default)
    pipe_list_X = [task for task in pipe_list if task.get('type', '') not in ['coly', 'filter']]
    pipe_list_y = [task for task in pipe_list if task.get('type', '') in ['coly']]
    pipe_filter = [task for task in pipe_list if task.get('type', '') in ['filter']]

    ##### Load data ################################################################################
    df = load_dataset(path_train_X, path_train_y, colid, n_sample=n_sample)

    ##### Generate features ########################################################################
    dfi_all         = {}   ### Dict of all features
    cols_family_all = {'colid': colid, 'colnum': colnum, 'colcat': colcat}

    if len(pipe_filter) > 0:
        log("##### Filter #########################################################################")
        pipe_i       = pipe_filter[0]
        pipe_fun     = load_function_uri(pipe_i['uri'])
        df, col_pars = pipe_fun(df, list(df.columns), pars=pipe_i.get('pars', {}))

    if len(pipe_list_y) > 0:
        log("##### coly ###########################################################################")
        pipe_i   = pipe_list_y[0]
        pipe_fun = load_function_uri(pipe_i['uri'])
        logs("----------df----------\n", df)
        pars                         = pipe_i.get('pars', {})
        pars['path_features_store']  = path_features_store
        pars['path_pipeline_export'] = path_pipeline_export
        df, col_pars = pipe_fun(df, cols_group['coly'], pars=pars)   ### coly can remove rows
        logs("----------df----------\n", df)

        dfi_all['coly']         = df[cols_group['coly']]
        cols_family_all['coly'] = cols_group['coly']
        save_features(df[cols_group['coly']], "coly", path_features_store)   ### already saved
        save(coly, f'{path_pipeline_export}/coly.pkl')

    ##### Processors ###############################################################################
    dfi_all['coly'] = df[cols_group['coly']]
    # for colg, colg_list in cols_group.items():
    #     if colg not in ['colid']:
    #         dfi_all[colg] = df[colg_list]   ## colnum, colcat, coly

    for pipe_i in pipe_list_X:
        log("###################", pipe_i, "##########################################################")
        pipe_fun  = load_function_uri(pipe_i['uri'])   ### Load the code definition into pipe_fun
        cols_name = pipe_i['cols_family']
        col_type  = pipe_i['type']

        pars                         = pipe_i.get('pars', {})
        pars['path_features_store']  = path_features_store    ### intermediate dataframe
        pars['path_pipeline_export'] = path_pipeline_export   ### Store pipeline

        if col_type == 'cross':
            log("################### Adding Cross ###################################################")
            pars['dfnum_hot']       = dfi_all['colnum_onehot']   ### dfnum_hot --> dfcross
            pars['dfcat_hot']       = dfi_all['colcat_onehot']
            pars['colid']           = colid
            pars['colcross_single'] = cols_group.get('colcross', [])

        elif col_type == 'add_coly':
            log('add_coly genetic', cols_group['coly'])
            pars['coly'] = cols_group['coly']
            pars['dfy']  = dfi_all['coly']    ### Transformed dfy

        ### Input columns or previously computed columns ( colnum_bin )
        cols_list = cols_group[cols_name] if cols_name in cols_group else list(dfi_all[cols_name].columns)
        df_       = df[cols_list]          if cols_name in cols_group else dfi_all[cols_name]
        # cols_list = list(dfi_all[cols_name].columns)
        # df_       = dfi_all[cols_name]

        dfi, col_pars = pipe_fun(df_, cols_list, pars=pars)

        ### Concatenate colnum, colnum_bin into cols_family_all, dfi_all ###########################
        for colj, colist in col_pars['cols_new'].items():
            ### Merge sub-family
            cols_family_all[colj] = cols_family_all.get(colj, []) + colist
            dfi_all[colj]         = pd.concat((dfi_all[colj], dfi), axis=1) if colj in dfi_all else dfi
            # save_features(dfi_all[colj], colj, path_features_store)

    ###### Merge all into dfXy ##################################################################
    dfXy = df[[coly] + colnum + colcat]
    # dfXy = df[[coly]]
    for t in dfi_all.keys():
        if t not in ['coly', 'colnum', 'colcat']:
            dfXy = pd.concat((dfXy, dfi_all[t]), axis=1)
    save_features(dfXy, 'dfX', path_features_store)

    colXy = list(dfXy.columns)
    colXy.remove(coly)    ##### Only X columns
    if len(colid) > 0:
        cols_family_all['colid'] = colid
    cols_family_all['colX'] = colXy

    #### Cols group for model input ###########################################################
    save(colXy,           f'{path_pipeline_export}/colsX.pkl')
    save(cols_family_all, f'{path_pipeline_export}/cols_family.pkl')

    ###### Return values #######################################################################
    return dfXy, cols_family_all

def text_preprocess(path_train_X="", path_train_y="", path_pipeline_export="", cols_group=None,
                    n_sample=5000, preprocess_pars={}, filter_pars={}, path_features_store=None):
    """
    :param path_train_X:
    :param path_train_y:
    :param path_pipeline_export:
    :param cols_group:
    :param n_sample:
    :param preprocess_pars:
    :param filter_pars:
    :param path_features_store:
    :return:
    """
    from util_feature import (pd_colnum_tocat, pd_col_to_onehot, pd_colcat_mapping,
                              pd_colcat_toint, pd_feature_generate_cross)

    ##### column names for feature generation ###############################################
    log(cols_group)
    coly   = cols_group['coly']    # 'salary'
    colid  = cols_group['colid']   # "jobId"
    colcat = cols_group['colcat']  # [ 'companyId', 'jobType', 'degree', 'major', 'industry' ]
    colnum = cols_group['colnum']  # ['yearsExperience', 'milesFromMetropolis']

    colcross_single = cols_group.get('colcross', [])   ### List of single columns
    coltext         = cols_group.get('coltext', [])
    coldate         = cols_group.get('coldate', [])
    colall          = colnum + colcat + coltext + coldate
    log(colall)

    ##### Load data ########################################################################
    df = load_dataset(path_train_X, path_train_y, colid, n_sample=n_sample)

    log("##### Coltext processing ###############################################################")
    from utils import util_text, util_model

    ### Remove common words #############################################
    import json
    import string
    punctuations = string.punctuation
    stopwords = json.load(open("stopwords_en.json"))["word"]
    stopwords = [t for t in string.punctuation] + stopwords
    stopwords = ["", " ", ",", ".", "-", "*", '€', "+", "/"] + stopwords
    stopwords = list(set(stopwords))
    stopwords.sort()
    log(stopwords)
    stopwords = set(stopwords)

    def pipe_text(df, col, pars={}):
        ntoken = pars['n_token']
        df     = df.fillna("")
        dftext = df
        log(dftext)
        log(col)
        list1 = [col]

        # fromword = [ r"\b({w})\b".format(w=w) for w in fromword ]
        for col_n in list1:
            dftext[col_n] = dftext[col_n].fillna("")
            dftext[col_n] = dftext[col_n].str.lower()
            ### Strip punctuation and digits, then residual symbols and stopwords
            dftext[col_n] = dftext[col_n].apply(lambda x: x.translate(str.maketrans('', '', string.punctuation)))
            dftext[col_n] = dftext[col_n].apply(lambda x: x.translate(str.maketrans('', '', string.digits)))
            dftext[col_n] = dftext[col_n].apply(lambda x: re.sub("[!@,#$+%*:()'-]", " ", x))
            dftext[col_n] = dftext[col_n].apply(lambda x: coltext_stopwords(x, stopwords=stopwords))
        log(dftext.head(6))

        ### Word frequency of the text column
        sep = " "
        coltext_freq = df[col].apply(lambda x: pd.Series(x.split(sep)).value_counts()).sum(axis=0).reset_index()
        coltext_freq.columns = ["word", "freq"]
        coltext_freq = coltext_freq.sort_values("freq", ascending=False)
        log(coltext_freq)

        word_tokeep = coltext_freq["word"].values[:ntoken]
        word_tokeep = [t for t in word_tokeep if t not in stopwords]

        dftext_tdidf_dict, word_tokeep_dict = util_text.pd_coltext_tdidf(
            dftext, coltext=col, word_minfreq=1, word_tokeep=word_tokeep,
            return_val="dataframe,param")
        log(word_tokeep_dict)

        ### Dimension reduction for sparse matrix
        dftext_svd_list, svd_list = util_model.pd_dim_reduction(
            dftext_tdidf_dict, colname=None, model_pretrain=None,
            colprefix=col + "_svd", method="svd", dimpca=2,
            return_val="dataframe,param")
        return dftext_svd_list

    pars    = {'n_token': 100}
    dftext1 = None
    for coltext_i in coltext:
        dftext_i = pipe_text(df[[coltext_i]], coltext_i, pars)
        save_features(dftext_i, 'dftext_' + coltext_i, path_features_store)
        ### Concatenate per-column text features side by side
        dftext1 = pd.concat((dftext1, dftext_i), axis=1) if dftext1 is not None else dftext_i
    log(dftext1.head(6))
    dftext1.to_csv(f"{path_features_store}/dftext.csv", index=False)

    ##################################################################################################
    ##### Save pre-processor meta-parameters
    os.makedirs(path_pipeline_export, exist_ok=True)
    log(path_pipeline_export)
    cols_family = {}

    for t in ['coltext']:
        tfile = f'{path_pipeline_export}/{t}.pkl'
        log(tfile)
        t_val = locals().get(t, None)
        if t_val is not None:
            save(t_val, tfile)
            cols_family[t] = t_val

    return dftext1, cols_family

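# Hedged usage sketch for text_preprocess(); it expects a stopwords_en.json file in the
# working directory and a 'coltext' entry in cols_group. The column name and paths below
# are hypothetical.
#
#   cols_group['coltext'] = ['jobDescription']
#   dftext, cols_family = text_preprocess(path_train_X="data/input/train/features.zip",
#                                         path_train_y="data/input/train/target.zip",
#                                         path_pipeline_export="data/output/pipeline/",
#                                         cols_group=cols_group, n_sample=5000,
#                                         path_features_store="data/output/features/")

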
def preprocess(path_train_X="", path_train_y="", path_pipeline_export="", cols_group=None,
               n_sample=5000, preprocess_pars={}, filter_pars={}, path_features_store=None):
    """
    :param path_train_X:
    :param path_train_y:
    :param path_pipeline_export:
    :param cols_group:
    :param n_sample:
    :param preprocess_pars:
    :param filter_pars:
    :param path_features_store:
    :return:
    """
    from util_feature import (pd_colnum_tocat, pd_col_to_onehot, pd_colcat_mapping,
                              pd_colcat_toint, pd_feature_generate_cross)

    ##### column names for feature generation #####################################################
    log(cols_group)
    coly   = cols_group['coly']    # 'salary'
    colid  = cols_group['colid']   # "jobId"
    colcat = cols_group['colcat']  # [ 'companyId', 'jobType', 'degree', 'major', 'industry' ]
    colnum = cols_group['colnum']  # ['yearsExperience', 'milesFromMetropolis']

    colcross_single = cols_group.get('colcross', [])   ### List of single columns
    coltext         = cols_group.get('coltext', [])
    coldate         = cols_group.get('coldate', [])
    colall          = colnum + colcat + coltext + coldate
    log(colall)

    #### Pipeline Execution
    pipe_default = ['filter', 'label', 'dfnum_bin', 'dfnum_hot',
                    'dfcat_bin', 'dfcat_hot', 'dfcross_hot', ]
    pipe_list = preprocess_pars.get('pipe_list', pipe_default)
    pipe_list.append('dfdate')
    pipe_list_pars = preprocess_pars.get('pipe_pars', [])

    ##### Load data ##############################################################################
    df = load_dataset(path_train_X, path_train_y, colid, n_sample=n_sample)

    ##### Filtering / cleaning rows ##############################################################
    if "filter" in pipe_list:
        def isfloat(x):
            try:
                a = float(x)
                return 1
            except Exception:
                return 0

        ymin, ymax = filter_pars.get('ymin', -9999999999.0), filter_pars.get('ymax', 999999999.0)
        log(coly)
        df['_isfloat'] = df[coly].apply(lambda x: isfloat(x))
        df = df[df['_isfloat'] > 0]
        df = df[df[coly] > ymin]
        df = df[df[coly] < ymax]

    ##### Label processing #######################################################################
    y_norm_fun = None
    if "label" in pipe_list:
        # Target coly processing, normalization process, customized by model
        log("y_norm_fun preprocess_pars")
        y_norm_fun = preprocess_pars.get('y_norm_fun', None)
        if y_norm_fun is not None:
            df[coly] = df[coly].apply(lambda x: y_norm_fun(x))
            save(y_norm_fun, f'{path_pipeline_export}/y_norm.pkl')
            save_features(df[coly], 'dfy', path_features_store)

    ########### colnum processing ################################################################
    for x in colnum:
        df[x] = df[x].astype("float")
    log(df[colall].dtypes)

    if "dfnum" in pipe_list:
        pass

    if "dfnum_norm" in pipe_list:
        log("### colnum normalize ###############################################################")
        from util_feature import pd_colnum_normalize
        pars = {'pipe_list': [{'name': 'fillna', 'naval': 0.0}, {'name': 'minmax'}]}
        dfnum_norm, colnum_norm = pd_colnum_normalize(df, colname=colnum, pars=pars,
                                                      suffix="_norm",
                                                      return_val="dataframe,param")
        log(colnum_norm)
        save_features(dfnum_norm, 'dfnum_norm', path_features_store)

    if "dfnum_bin" in pipe_list:
        log("### colnum Map numerics to Category bin ###########################################")
        dfnum_bin, colnum_binmap = pd_colnum_tocat(df, colname=colnum, colexclude=None,
                                                   colbinmap=None, bins=10, suffix="_bin",
                                                   method="uniform",
                                                   return_val="dataframe,param")
        log(colnum_binmap)
        ### Rename colnum_bin with suffix
        colnum_bin = [x + "_bin" for x in list(colnum_binmap.keys())]
        log(colnum_bin)
        save_features(dfnum_bin, 'dfnum_binmap', path_features_store)
if "dfnum_hot" in pipe_list and "dfnum_bin" in pipe_list : log("### colnum bin to One Hot") dfnum_hot, colnum_onehot = pd_col_to_onehot(dfnum_bin[colnum_bin], colname=colnum_bin, colonehot=None, return_val="dataframe,param") log(colnum_onehot) save_features(dfnum_hot, 'dfnum_onehot', path_features_store) ##### Colcat processing ################################################################ colcat_map = pd_colcat_mapping(df, colcat) log(df[colcat].dtypes, colcat_map) if "dfcat_hot" in pipe_list : log("#### colcat to onehot") dfcat_hot, colcat_onehot = pd_col_to_onehot(df[colcat], colname=colcat, colonehot=None, return_val="dataframe,param") log(dfcat_hot[colcat_onehot].head(5)) save_features(dfcat_hot, 'dfcat_onehot', path_features_store) if "dfcat_bin" in pipe_list : log("#### Colcat to integer encoding ") dfcat_bin, colcat_bin_map = pd_colcat_toint(df[colcat], colname=colcat, colcat_map=None, suffix="_int") colcat_bin = list(dfcat_bin.columns) save_features(dfcat_bin, 'dfcat_bin', path_features_store) if "dfcross_hot" in pipe_list : log("##### Cross Features From OneHot Features ######################################") try : df_onehot = dfcat_hot.join(dfnum_hot, on=colid, how='left') except : df_onehot = copy.deepcopy(dfcat_hot) colcross_single_onehot_select = [] for t in list(df_onehot) : for c1 in colcross_single : if c1 in t : colcross_single_onehot_select.append(t) df_onehot = df_onehot[colcross_single_onehot_select ] dfcross_hot, colcross_pair = pd_feature_generate_cross(df_onehot, colcross_single_onehot_select, pct_threshold=0.02, m_combination=2) log(dfcross_hot.head(2).T) colcross_pair_onehot = list(dfcross_hot.columns) save_features(dfcross_hot, 'dfcross_onehot', path_features_store) del df_onehot ,colcross_pair_onehot if "dftext" in pipe_list : log("##### Coltext processing ###############################################################") stopwords = nlp_get_stopwords() pars = {'n_token' : 100 , 'stopwords': stopwords} dftext = None for coltext_i in coltext : ##### Run the text processor on each column text ############################# dftext_i = pipe_text( df[[coltext_i ]], coltext_i, pars ) dftext = pd.concat((dftext, dftext_i), axis=1) if dftext is not None else dftext_i save_features(dftext_i, 'dftext_' + coltext_i, path_features_store) log(dftext.head(6)) save_features(dftext, 'dftext', path_features_store) if "dfdate" in pipe_list : log("##### Coldate processing #############################################################") from utils import util_date dfdate = None for coldate_i in coldate : dfdate_i = util_date.pd_datestring_split( df[[coldate_i]] , coldate_i, fmt="auto", return_val= "split" ) dfdate = pd.concat((dfdate, dfdate_i), axis=1) if dfdate is not None else dfdate_i save_features(dfdate_i, 'dfdate_' + coldate_i, path_features_store) save_features(dfdate, 'dfdate', path_features_store) print('spoo',dfdate) ################################################################################### # ############### ##### Save pre-processor meta-parameters os.makedirs(path_pipeline_export, exist_ok=True) log(path_pipeline_export) cols_family = {} for t in ['colid', "colnum", "colnum_bin", "colnum_onehot", "colnum_binmap", #### Colnum columns "colcat", "colcat_bin", "colcat_onehot", "colcat_bin_map", #### colcat columns 'colcross_single_onehot_select', "colcross_pair_onehot", 'colcross_pair', #### colcross columns 'coldate', 'coltext', "coly", "y_norm_fun" ]: tfile = f'{path_pipeline_export}/{t}.pkl' log(tfile) t_val = locals().get(t, None) if t_val is not None : 
save(t_val, tfile) cols_family[t] = t_val ###### Merge AlL ############################################################################# dfXy = df[colnum + colcat + [coly] ] print('localTT',dfXy) for t in [ 'dfnum_bin', 'dfnum_hot', 'dfcat_bin', 'dfcat_hot', 'dfcross_hot', 'dfdate', 'dftext' ] : if t in locals() : print('localT', t, locals()[t]) dfXy = pd.concat((dfXy, locals()[t] ), axis=1) save_features(dfXy, 'dfX', path_features_store) colXy = list(dfXy.columns) colXy.remove(coly) ##### Only X columns cols_family['colX'] = colXy save(colXy, f'{path_pipeline_export}/colsX.pkl' ) save(cols_family, f'{path_pipeline_export}/cols_family.pkl' ) ###### Return values ######################################################################### return dfXy, cols_family
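# Sketch of a custom `preprocess_pars` for the name-based pipeline above: the step names are
# the ones the function tests for ('filter', 'label', 'dfnum_bin', ...); which steps to keep
# is a per-project choice, so this particular selection is illustrative only.
#
#   preprocess_pars = {
#       'pipe_list' : ['filter', 'label', 'dfnum_bin', 'dfnum_hot',
#                      'dfcat_bin', 'dfcat_hot', 'dfcross_hot'],
#       'y_norm_fun': None,          # optional target normalization function
#   }

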
def run_predict(config_name, config_path, n_sample=-1,
                path_data=None, path_output=None, pars={}, model_dict=None):

    log("#### Run predict ###############################################################")
    model_dict  = model_dict_load(model_dict, config_path, config_name, verbose=True)
    model_class = model_dict['model_pars']['model_class']

    m             = model_dict['global_pars']
    path_data     = m['path_pred_data'] if path_data is None else path_data
    path_pipeline = m['path_pred_pipeline']   # path_output + "/pipeline/"
    path_model    = m['path_pred_model']
    path_output   = m['path_pred_output'] if path_output is None else path_output
    log(path_data, path_model, path_output)

    pars = {'cols_group': model_dict['data_pars']['cols_input_type'],
            'pipe_list' : model_dict['model_pars']['pre_process_pars']['pipe_list']}

    log("#### Run preprocess ###########################################################")
    from run_preprocess import preprocess_inference as preprocess
    colid     = load(f'{path_pipeline}/colid.pkl')
    df        = load_dataset(path_data, path_data_y=None, colid=colid, n_sample=n_sample)
    dfX, cols = preprocess(df, path_pipeline, preprocess_pars=pars)
    coly      = cols["coly"]

    log("#### Extract column names #########################################################")
    ### Actual column names for model input: label y and input X (colnum, colcat), remove duplicate names
    ### e.g. [ 'colcat', 'colnum' ]
    model_dict['data_pars']['coly']       = cols['coly']
    model_dict['data_pars']['cols_model'] = list(set(sum(
        [cols[colgroup] for colgroup in model_dict['data_pars']['cols_model_group']], [])))

    #### Flatten col group by column type: sparse, continuous, ... (i.e. Neural Network feed input), remove duplicates
    ##   'coldense' = [ 'colnum' ]   'colsparse' = [ 'colcat' ]
    model_dict['data_pars']['cols_model_type2'] = {}
    for colg, colg_list in model_dict['data_pars'].get('cols_model_type', {}).items():
        model_dict['data_pars']['cols_model_type2'][colg] = list(set(sum(
            [cols[colgroup] for colgroup in colg_list], [])))

    log("############ Prediction ##########################################################")
    ypred, yproba    = predict(model_class, path_model, dfX, cols, model_dict)
    post_process_fun = model_dict['model_pars']['post_process_fun']
    df[coly + "_pred"] = ypred
    df[coly + "_pred"] = df[coly + "_pred"].apply(lambda x: post_process_fun(x))
    if yproba is not None:
        df[coly + "_pred_proba"] = yproba

    log("############ Saving prediction ###################################################")
    log(ypred.shape, path_output)
    os.makedirs(path_output, exist_ok=True)
    df.to_csv(f"{path_output}/prediction.csv")
    log(df.head(8))

    log("########### Export Specific ######################################################")
    df[cols["coly"]] = ypred
    df[[cols["coly"]]].to_csv(f"{path_output}/pred_only.csv")

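# Hedged usage sketch of the config-driven entry point: the config name, config file and
# sample size below are hypothetical; global_pars in the config supplies the data, model
# and output paths.
#
#   run_predict(config_name="config1",
#               config_path="titanic_classifier.py",
#               n_sample=1000)

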
def run_transform(config_name, config_path, n_sample=1,
                  path_data=None, path_output=None, pars={}, model_dict=None, return_mode=""):

    log("##### Run transform ###############################################################")
    model_dict  = model_dict_load(model_dict, config_path, config_name, verbose=True)
    model_class = model_dict['model_pars']['model_class']

    m             = model_dict['global_pars']
    path_data     = m['path_pred_data'] if path_data is None else path_data
    path_pipeline = m['path_pred_pipeline']   # path_output + "/pipeline/"
    path_model    = m['path_pred_model']
    model_file    = m.get('model_file', "")   ### New
    path_output   = m['path_pred_output'] if path_output is None else path_output
    log(path_data, path_model, path_output)

    pars = {'cols_group': model_dict['data_pars']['cols_input_type'],
            'pipe_list' : model_dict['model_pars']['pre_process_pars']['pipe_list']}

    log("##### Load Preprocess ############################################################")
    from run_preprocess import preprocess_inference as preprocess
    colid = load(f'{path_pipeline}/colid.pkl')
    if model_class in SUPERVISED_MODELS:
        path_pred_X = m.get('path_pred_X', path_data + "/features.zip")   # .zip
        path_pred_y = m.get('path_pred_y', path_data + "/target.zip")     # .zip
        df = load_dataset(path_pred_X, path_pred_y, colid, n_sample=n_sample)
    else:
        df = load_dataset(path_data, None, colid, n_sample=n_sample)

    dfX, cols = preprocess(df, path_pipeline, preprocess_pars=pars)
    coly      = cols["coly"]

    log("#### Extract column names #######################################################")
    ### Actual column names for model input: label y and input X (colnum, colcat), remove duplicate names
    model_dict['data_pars']['coly']       = cols['coly']
    model_dict['data_pars']['cols_model'] = list(set(sum(
        [cols[colgroup] for colgroup in model_dict['data_pars']['cols_model_group']], [])))

    #### Col group by column type: sparse, continuous, ... (i.e. Neural Network feed input), remove duplicates
    ##   'coldense' = [ 'colnum' ]   'colsparse' = [ 'colcat' ]
    model_dict['data_pars']['cols_model_type2'] = {}
    for colg, colg_list in model_dict['data_pars'].get('cols_model_type', {}).items():
        model_dict['data_pars']['cols_model_type2'][colg] = list(set(sum(
            [cols[colgroup] for colgroup in colg_list], [])))

    log("############ Task Inference ###################################################")
    task_type = model_dict['compute_pars'].get('task_inference', 'transform')
    if model_class in SUPERVISED_MODELS:
        dfXy = transform(model_file, path_model,
                         (dfX[[c for c in dfX.columns if c != coly]], df[coly]),
                         model_dict, task_type=task_type)
    else:
        dfXy = transform(model_file, path_model, dfX, model_dict, task_type=task_type)

    post_process_fun = model_dict['model_pars']['post_process_fun']

    if return_mode == 'dict':
        return {'dfXy': dfXy}
    else:
        log("#### Export ##################################################################")
        path_check_out = m.get('path_check_out', path_output + "/check/")
        os.makedirs(path_check_out, exist_ok=True)
        dfX.to_parquet(path_check_out + "/dfX.parquet")   # transform input data saved as parquet
        log("######### Finish #############################################################")

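# Hedged usage sketch for run_transform(); the config name/path are hypothetical. With
# return_mode='dict' the transformed frame is returned instead of being exported to disk.
#
#   out  = run_transform(config_name="config1",
#                        config_path="titanic_classifier.py",
#                        n_sample=100, return_mode='dict')
#   dfXy = out['dfXy']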