def run_predict(config_name, config_path, n_sample=-1,
                path_data=None, path_output=None, pars={}, model_dict=None):
    model_dict  = model_dict_load(model_dict, config_path, config_name, verbose=True)
    m           = model_dict['global_pars']
    model_class = model_dict['model_pars']['model_class']

    path_data     = m['path_pred_data']   if path_data   is None else path_data
    path_pipeline = m['path_pred_pipeline']   # path_output + "/pipeline/"
    path_model    = m['path_pred_model']
    path_output   = m['path_pred_output'] if path_output is None else path_output
    log(path_data, path_model, path_output)

    pars = {'cols_group': model_dict['data_pars']['cols_input_type'],
            'pipe_list' : model_dict['model_pars']['pre_process_pars']['pipe_list']}

    ##########################################################################################
    colid = load(f'{path_pipeline}/colid.pkl')
    df    = load_dataset(path_data, path_data_y=None, colid=colid, n_sample=n_sample)

    from run_preprocess import preprocess_inference as preprocess
    dfX, cols_family = preprocess(df, path_pipeline, preprocess_pars=pars)
    ypred, yproba    = predict(model_class, path_model, dfX, cols_family)

    log("############ Saving prediction ###################################################")
    log(ypred.shape, path_output)
    os.makedirs(path_output, exist_ok=True)
    df[cols_family["coly"] + "_pred"] = ypred
    if yproba is not None:
        df[cols_family["coly"] + "_pred_proba"] = yproba
    df.to_csv(f"{path_output}/prediction.csv")
    log(df.head(8))

    log("########### Export Specific ######################################################")
    df[cols_family["coly"]] = ypred
    df[[cols_family["coly"]]].to_csv(f"{path_output}/pred_only.csv")
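
# --- Usage sketch (illustrative only) ---------------------------------------------------
# Hypothetical example of calling run_predict from a driver script. The config name
# "config1" and the output path are placeholders, not part of this module; a real call
# must match an entry defined in your config_model.py, whose 'global_pars' provide
# path_pred_data / path_pred_pipeline / path_pred_model / path_pred_output.
def example_run_predict():
    run_predict(config_name = "config1",
                config_path = "source/config_model.py",
                n_sample    = 1000,
                path_output = "data/output/pred/")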
def run_data_check(path_data, path_data_ref, path_model, path_output, sample_ratio=0.5):
    """ Calculate Dataset Shift before prediction.
    """
    from run_preprocess import preprocess_inference as preprocess
    path_output   = root + path_output
    path_data     = root + path_data
    path_data_ref = root + path_data_ref
    path_pipeline = root + path_model + "/pipeline/"
    os.makedirs(path_output, exist_ok=True)

    colid = load(f'{path_pipeline}/colid.pkl')

    df1 = load_dataset(path_data_ref, colid=colid)
    dfX1, cols_family1 = preprocess(df1, path_pipeline)

    df2 = load_dataset(path_data, colid=colid)
    dfX2, cols_family2 = preprocess(df2, path_pipeline)

    #### Keep only the binned numeric and categorical columns for the shift metric
    colsX = cols_family1["colnum_bin"] + cols_family1["colcat_bin"]
    dfX1  = dfX1[colsX]
    dfX2  = dfX2[colsX]

    from util_feature import pd_stat_dataset_shift
    nsample     = int(min(len(dfX1), len(dfX2)) * sample_ratio)
    metrics_psi = pd_stat_dataset_shift(dfX2, dfX1, colsX, nsample=nsample, buckets=7, axis=0)
    metrics_psi.to_csv(f"{path_output}/prediction_features_metrics.csv")
    log(metrics_psi)
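
# --- Illustration: Population Stability Index (PSI) -------------------------------------
# run_data_check delegates the shift metric to util_feature.pd_stat_dataset_shift.
# The sketch below is NOT that implementation; it is a minimal, self-contained reminder
# of how a PSI-style drift score is typically computed on one numeric column: bucket the
# reference values, compare bucket frequencies between reference and new data, and sum
# (p_new - p_ref) * ln(p_new / p_ref). Function and variable names are hypothetical.
import numpy as np
import pandas as pd

def psi_sketch(ref: pd.Series, new: pd.Series, buckets: int = 7) -> float:
    """Toy PSI on one numeric column; a small epsilon avoids log(0)."""
    eps   = 1e-6
    refv  = ref.dropna()
    newv  = new.dropna()
    edges = np.histogram_bin_edges(refv, bins=buckets)
    p_ref = np.histogram(refv, bins=edges)[0] / max(len(refv), 1) + eps
    p_new = np.histogram(newv, bins=edges)[0] / max(len(newv), 1) + eps
    return float(np.sum((p_new - p_ref) * np.log(p_new / p_ref)))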
def run_train(config_name, path_data_train=None, path_output=None,
              config_path="source/config_model.py", n_sample=5000,
              mode="run_preprocess", model_dict=None):
    """ Configuration of the model is in the config_model.py file.
    :param config_name:
    :param config_path:
    :param n_sample:
    :return:
    """
    model_dict = model_dict_load(model_dict, config_path, config_name, verbose=True)

    m = model_dict['global_pars']
    path_data_train = m['path_data_train']   if path_data_train is None else path_data_train
    path_train_X    = m.get('path_train_X', path_data_train + "/features.zip")  #.zip
    path_train_y    = m.get('path_train_y', path_data_train + "/target.zip")    #.zip
    path_output     = m['path_train_output'] if path_output is None else path_output
    # path_model    = m.get('path_model',    path_output + "/model/" )
    path_pipeline       = m.get('path_pipeline',       path_output + "/pipeline/")
    path_features_store = m.get('path_features_store', path_output + '/features_store/')  # path_data_train replaced with path_output, because preprocessed files are stored there
    path_check_out      = m.get('path_check_out',      path_output + "/check/")
    log(path_output)

    log("#### load input column family ##################################################")
    try:
        cols_group = model_dict['data_pars']['cols_input_type']  ### the model config file
    except Exception:
        cols_group = json.load(open(path_data_train + "/cols_group.json", mode='r'))
    log(cols_group)

    log("#### Preprocess ################################################################")
    preprocess_pars = model_dict['model_pars']['pre_process_pars']
    # filter_pars   = model_dict['data_pars']['filter_pars']
    if mode == "run_preprocess":
        dfXy, cols = preprocess(path_train_X, path_train_y, path_pipeline, cols_group, n_sample,
                                preprocess_pars, path_features_store=path_features_store)
    elif mode == "load_preprocess":  #### Load existing data
        dfXy, cols = preprocess_load(path_train_X, path_train_y, path_pipeline, cols_group, n_sample,
                                     preprocess_pars, path_features_store=path_features_store)

    ### Actual columns for label y and Input X (colnum, colcat)
    model_dict['data_pars']['coly']       = cols['coly']
    model_dict['data_pars']['cols_model'] = sum(
        [cols[colgroup] for colgroup in model_dict['data_pars']['cols_model_group']], [])

    log("#### Train model: #############################################################")
    log(str(model_dict)[:1000])
    post_process_fun = model_dict['model_pars']['post_process_fun']
    dfXy, dfXytest   = train(model_dict, dfXy, cols, post_process_fun)

    log("#### Export ##################################################################")
    os.makedirs(path_check_out, exist_ok=True)
    colexport = [cols['colid'], cols['coly'], cols['coly'] + "_pred"]
    dfXy[colexport].reset_index().to_csv(path_check_out + "/pred_check.csv")  # Only results
    dfXy.to_parquet(path_check_out + "/dfX.parquet")                          # train input data, parquet
    # dfXy.to_csv(path_check_out + "/dfX.csv")                                # train input data, csv
    dfXytest.to_parquet(path_check_out + "/dfXtest.parquet")                  # test input data, parquet
    # dfXytest.to_csv(path_check_out + "/dfXtest.csv")                        # test input data, csv
    log("######### Finish #############################################################", )
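
# --- Config sketch (assumption, for orientation only) -----------------------------------
# run_train expects config_model.py to expose an entry (here a function with the
# hypothetical name "my_config") returning a model_dict containing at least the keys
# read above: global_pars, data_pars, model_pars, compute_pars. All values below are
# placeholders, not a working configuration; the exact shape of cols_input_type
# depends on your dataset.
def my_config():
    return {
        'global_pars': {
            'config_name'      : 'my_config',
            'path_data_train'  : 'data/input/train/',
            'path_train_output': 'data/output/my_config/',
        },
        'data_pars': {
            'cols_input_type' : {'coly'  : 'target', 'colid': 'id',
                                 'colnum': ['num1'], 'colcat': ['cat1']},
            'cols_model_group': ['colnum', 'colcat'],
        },
        'model_pars': {
            'model_class'     : 'LGBMClassifier',
            'pre_process_pars': {'pipe_list': []},
            'post_process_fun': lambda y: y,
        },
        'compute_pars': {},
    }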
def run_train(config_name, config_path="source/config_model.py", n_sample=5000,
              mode="run_preprocess", model_dict=None, return_mode='file', **kw):
    """ Configuration of the model is in the config_model.py file.
    :param config_name:
    :param config_path:
    :param n_sample:
    :return:
    """
    model_dict = model_dict_load(model_dict, config_path, config_name, verbose=True)

    m = model_dict['global_pars']
    path_data_train = m['path_data_train']
    path_train_X    = m.get('path_train_X', path_data_train + "/features.zip")  #.zip
    path_train_y    = m.get('path_train_y', path_data_train + "/target.zip")    #.zip
    path_output     = m['path_train_output']
    # path_model    = m.get('path_model',    path_output + "/model/" )
    path_pipeline       = m.get('path_pipeline',       path_output + "/pipeline/")
    path_features_store = m.get('path_features_store', path_output + '/features_store/')  # path_data_train replaced with path_output, because preprocessed files are stored there
    path_check_out      = m.get('path_check_out',      path_output + "/check/")
    log(path_output)

    log("#### Load raw data column family, column check #################################")
    cols_validate(model_dict)
    cols_group = model_dict['data_pars']['cols_input_type']  ### Raw
    log2(cols_group)

    log("#### Preprocess ################################################################")
    preprocess_pars = model_dict['model_pars']['pre_process_pars']
    if mode == "run_preprocess":
        dfXy, cols = preprocess(path_train_X, path_train_y,
                                path_pipeline,        ### path to save preprocessing pipeline
                                cols_group,           ### dict of column family
                                n_sample,
                                preprocess_pars,
                                path_features_store   ### Store intermediate dataframe
                                )
    elif mode == "load_preprocess":  #### Load existing data
        dfXy, cols = preprocess_load(path_train_X, path_train_y, path_pipeline, cols_group, n_sample,
                                     preprocess_pars, path_features_store=path_features_store)

    log("#### Extract column names #####################################################")
    ### Actual column names for Model Input: label y and Input X (colnum, colcat), remove duplicate names
    model_dict['data_pars']['coly']       = cols['coly']
    model_dict['data_pars']['cols_model'] = list(set(sum(
        [cols[colgroup] for colgroup in model_dict['data_pars']['cols_model_group']], [])))

    #### Flatten Col Group by column type: Sparse, continuous, ... (ie Neural Network feed Input), remove duplicate names
    ####   'coldense'  = ['colnum']
    ####   'colsparse' = ['colcat']
    model_dict['data_pars']['cols_model_type2'] = {}
    for colg, colg_list in model_dict['data_pars'].get('cols_model_type', {}).items():
        model_dict['data_pars']['cols_model_type2'][colg] = list(set(sum(
            [cols[colgroup] for colgroup in colg_list], [])))

    log("#### Train model: #############################################################")
    log3(str(model_dict)[:1000])
    post_process_fun      = model_dict['model_pars']['post_process_fun']
    dfXy, dfXytest, stats = train(model_dict, dfXy, cols, post_process_fun)

    log("#### Register model ##########################################################")
    mlflow_pars = model_dict.get('compute_pars', {}).get('mlflow_pars', None)
    if mlflow_pars is not None:
        mlflow_register(dfXy, model_dict, stats, mlflow_pars)

    log("#### Export ##################################################################")
    if return_mode == 'dict':
        return {'dfXy': dfXy, 'dfXytest': dfXytest, 'stats': stats}
    else:
        os.makedirs(path_check_out, exist_ok=True)
        colexport = [cols['colid'], cols['coly'], cols['coly'] + "_pred"]
        if cols['coly'] + '_proba' in dfXy.columns:
            colexport.append(cols['coly'] + '_proba')
        dfXy[colexport].to_csv(path_check_out + "/pred_check.csv", sep="\t")  # Only results
        dfXy.to_parquet(path_check_out + "/dfX.parquet")                      # train input data, parquet
        dfXytest.to_parquet(path_check_out + "/dfXtest.parquet")              # test input data, parquet
        # dfXy.to_csv(path_check_out + "/dfX.csv")                            # train input data, csv
        # dfXytest.to_csv(path_check_out + "/dfXtest.csv")                    # test input data, csv
    log("######### Finish #############################################################", )
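
# --- Illustration of the column-group flattening above (toy values, not project data) ---
# sum(list_of_lists, []) concatenates the per-group column lists and list(set(...))
# removes duplicate names; note that set() does not preserve the original column order.
def example_cols_model_flatten():
    cols_demo   = {'colnum': ['age', 'fare'], 'colcat': ['sex', 'pclass'], 'colcross': ['age', 'sex']}
    groups_demo = ['colnum', 'colcat', 'colcross']
    # e.g. ['fare', 'sex', 'age', 'pclass']  (order not guaranteed, duplicates removed)
    return list(set(sum([cols_demo[g] for g in groups_demo], [])))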
def run_train(config_name, config_path="source/config_model.py", n_sample=5000,
              mode="run_preprocess", model_dict=None, return_mode='file', **kw):
    """ Configuration of the model is in the config_model.py file.
    :param config_name:
    :param config_path:
    :param n_sample:
    :return:
    """
    model_dict  = model_dict_load(model_dict, config_path, config_name, verbose=True)
    mlflow_pars = model_dict.get('compute_pars', {}).get('mlflow_pars', None)

    m = model_dict['global_pars']
    path_data_train = m['path_data_train']
    path_train_X    = m.get('path_train_X', path_data_train + "/features.zip")  #.zip
    path_train_y    = m.get('path_train_y', path_data_train + "/target.zip")    #.zip
    path_output     = m['path_train_output']
    # path_model    = m.get('path_model',    path_output + "/model/" )
    path_pipeline       = m.get('path_pipeline',       path_output + "/pipeline/")
    path_features_store = m.get('path_features_store', path_output + '/features_store/')  # path_data_train replaced with path_output, because preprocessed files are stored there
    path_check_out      = m.get('path_check_out',      path_output + "/check/")
    log(path_output)

    log("#### load input column family ##################################################")
    try:
        cols_group = model_dict['data_pars']['cols_input_type']  ### the model config file
    except Exception:
        cols_group = json.load(open(path_data_train + "/cols_group.json", mode='r'))
    log(cols_group)

    log("#### Preprocess ################################################################")
    preprocess_pars = model_dict['model_pars']['pre_process_pars']
    if mode == "run_preprocess":
        dfXy, cols = preprocess(path_train_X, path_train_y,
                                path_pipeline,        ### path to save preprocessing pipeline
                                cols_group,           ### dict of column family
                                n_sample,
                                preprocess_pars,
                                path_features_store   ### Store intermediate dataframe
                                )
    elif mode == "load_preprocess":  #### Load existing data
        dfXy, cols = preprocess_load(path_train_X, path_train_y, path_pipeline, cols_group, n_sample,
                                     preprocess_pars, path_features_store=path_features_store)

    ### Actual column names for label y and Input X (colnum, colcat)
    model_dict['data_pars']['coly']       = cols['coly']
    model_dict['data_pars']['cols_model'] = sum(
        [cols[colgroup] for colgroup in model_dict['data_pars']['cols_model_group']], [])

    #### Col Group to model input: Sparse, continuous, ... (ie Neural Network)
    ####   'coldense'  = ['colnum']
    ####   'colsparse' = ['colcat']
    model_dict['data_pars']['cols_model_type2'] = {}
    for colg, colg_list in model_dict['data_pars'].get('cols_model_type', {}).items():
        model_dict['data_pars']['cols_model_type2'][colg] = sum(
            [cols[colgroup] for colgroup in colg_list], [])

    log("#### Train model: #############################################################")
    log(str(model_dict)[:1000])
    post_process_fun      = model_dict['model_pars']['post_process_fun']
    dfXy, dfXytest, stats = train(model_dict, dfXy, cols, post_process_fun)

    if mlflow_pars is not None:
        log("#### Using mlflow #########################################################")
        # def register(run_name, params, metrics, signature, model_class, tracking_uri="sqlite:///local.db"):
        from run_mlflow import register
        from mlflow.models.signature import infer_signature
        train_signature = dfXy[model_dict['data_pars']['cols_model']]
        y_signature     = dfXy[model_dict['data_pars']['coly']]
        signature       = infer_signature(train_signature, y_signature)

        register(run_name     = model_dict['global_pars']['config_name'],
                 params       = model_dict['global_pars'],
                 metrics      = stats["metrics_test"],
                 signature    = signature,
                 model_class  = model_dict['model_pars']["model_class"],
                 tracking_uri = mlflow_pars.get('tracking_db', "sqlite:///mlflow_local.db"))

    if return_mode == 'dict':
        return {'dfXy': dfXy, 'dfXytest': dfXytest, 'stats': stats}
    else:
        log("#### Export ##################################################################")
        os.makedirs(path_check_out, exist_ok=True)
        colexport = [cols['colid'], cols['coly'], cols['coly'] + "_pred"]
        dfXy[colexport].reset_index().to_csv(path_check_out + "/pred_check.csv")  # Only results
        dfXy.to_parquet(path_check_out + "/dfX.parquet")                          # train input data, parquet
        # dfXy.to_csv(path_check_out + "/dfX.csv")                                # train input data, csv
        dfXytest.to_parquet(path_check_out + "/dfXtest.parquet")                  # test input data, parquet
        # dfXytest.to_csv(path_check_out + "/dfXtest.csv")                        # test input data, csv
    log("######### Finish #############################################################", )
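
# --- Usage sketch (illustrative only) ---------------------------------------------------
# Hypothetical in-memory call: with return_mode='dict' the train/test frames and the
# training metrics are returned instead of being written under path_train_output.
# The config name "my_config" is a placeholder.
def example_run_train():
    out = run_train(config_name = "my_config",
                    config_path = "source/config_model.py",
                    n_sample    = 5000,
                    mode        = "run_preprocess",
                    return_mode = 'dict')
    return out['dfXy'], out['dfXytest'], out['stats']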
def run_predict(config_name, config_path, n_sample=-1,
                path_data=None, path_output=None, pars={}, model_dict=None):
    log("#### Run predict ###############################################################")
    model_dict  = model_dict_load(model_dict, config_path, config_name, verbose=True)
    model_class = model_dict['model_pars']['model_class']

    m = model_dict['global_pars']
    path_data     = m['path_pred_data']   if path_data   is None else path_data
    path_pipeline = m['path_pred_pipeline']   # path_output + "/pipeline/"
    path_model    = m['path_pred_model']
    path_output   = m['path_pred_output'] if path_output is None else path_output
    log(path_data, path_model, path_output)

    pars = {'cols_group': model_dict['data_pars']['cols_input_type'],
            'pipe_list' : model_dict['model_pars']['pre_process_pars']['pipe_list']}

    log("#### Run preprocess ###########################################################")
    from run_preprocess import preprocess_inference as preprocess
    colid = load(f'{path_pipeline}/colid.pkl')
    df    = load_dataset(path_data, path_data_y=None, colid=colid, n_sample=n_sample)

    dfX, cols = preprocess(df, path_pipeline, preprocess_pars=pars)
    coly      = cols["coly"]

    log("#### Extract column names #########################################################")
    ### Actual column names for Model Input: label y and Input X (colnum, colcat), remove duplicate names
    ### [ 'colcat', 'colnum' ]
    model_dict['data_pars']['coly']       = cols['coly']
    model_dict['data_pars']['cols_model'] = list(set(sum(
        [cols[colgroup] for colgroup in model_dict['data_pars']['cols_model_group']], [])))

    #### Flatten Col Group by column type: Sparse, continuous, ... (ie Neural Network feed Input), remove duplicate names
    ####   'coldense'  = ['colnum']
    ####   'colsparse' = ['colcat']
    model_dict['data_pars']['cols_model_type2'] = {}
    for colg, colg_list in model_dict['data_pars'].get('cols_model_type', {}).items():
        model_dict['data_pars']['cols_model_type2'][colg] = list(set(sum(
            [cols[colgroup] for colgroup in colg_list], [])))

    log("############ Prediction ##########################################################")
    ypred, yproba = predict(model_class, path_model, dfX, cols, model_dict)

    post_process_fun   = model_dict['model_pars']['post_process_fun']
    df[coly + "_pred"] = ypred
    df[coly + "_pred"] = df[coly + '_pred'].apply(lambda x: post_process_fun(x))
    if yproba is not None:
        df[coly + "_pred_proba"] = yproba

    log("############ Saving prediction ###################################################")
    log(ypred.shape, path_output)
    os.makedirs(path_output, exist_ok=True)
    df.to_csv(f"{path_output}/prediction.csv")
    log(df.head(8))

    log("########### Export Specific ######################################################")
    df[cols["coly"]] = ypred
    df[[cols["coly"]]].to_csv(f"{path_output}/pred_only.csv")
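
# --- Illustration: post_process_fun (hypothetical) ---------------------------------------
# post_process_fun is applied element-wise above to map each raw model output back to the
# target scale. The example below is an assumption about a common regression setup, not a
# project requirement: a target trained on log1p(y) is inverted at prediction time.
import numpy as np

def example_post_process_fun(ypred_value):
    """Invert a log1p transform applied to the target during preprocessing."""
    return float(np.expm1(ypred_value))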
def run_transform(config_name, config_path, n_sample=1,
                  path_data=None, path_output=None, pars={}, model_dict=None, return_mode=""):
    log("##### Run transform ###############################################################")
    model_dict  = model_dict_load(model_dict, config_path, config_name, verbose=True)
    model_class = model_dict['model_pars']['model_class']

    m = model_dict['global_pars']
    path_data     = m['path_pred_data']   if path_data   is None else path_data
    path_pipeline = m['path_pred_pipeline']   # path_output + "/pipeline/"
    path_model    = m['path_pred_model']
    model_file    = m.get('model_file', "")   ### New
    path_output   = m['path_pred_output'] if path_output is None else path_output
    log(path_data, path_model, path_output)

    pars = {'cols_group': model_dict['data_pars']['cols_input_type'],
            'pipe_list' : model_dict['model_pars']['pre_process_pars']['pipe_list']}

    log("##### Load Preprocess ############################################################")
    from run_preprocess import preprocess_inference as preprocess
    colid = load(f'{path_pipeline}/colid.pkl')
    if model_class in SUPERVISED_MODELS:
        path_pred_X = m.get('path_pred_X', path_data + "/features.zip")  #.zip
        path_pred_y = m.get('path_pred_y', path_data + "/target.zip")    #.zip
        df = load_dataset(path_pred_X, path_pred_y, colid, n_sample=n_sample)
    else:
        df = load_dataset(path_data, None, colid, n_sample=n_sample)

    dfX, cols = preprocess(df, path_pipeline, preprocess_pars=pars)
    coly      = cols["coly"]

    log("#### Extract column names #######################################################")
    ### Actual column names for Model Input: label y and Input X (colnum, colcat), remove duplicate names
    model_dict['data_pars']['coly']       = cols['coly']
    model_dict['data_pars']['cols_model'] = list(set(sum(
        [cols[colgroup] for colgroup in model_dict['data_pars']['cols_model_group']], [])))

    #### Col Group by column type: Sparse, continuous, ... (ie Neural Network feed Input), remove duplicate names
    ####   'coldense'  = ['colnum']
    ####   'colsparse' = ['colcat']
    model_dict['data_pars']['cols_model_type2'] = {}
    for colg, colg_list in model_dict['data_pars'].get('cols_model_type', {}).items():
        model_dict['data_pars']['cols_model_type2'][colg] = list(set(sum(
            [cols[colgroup] for colgroup in colg_list], [])))

    log("############ Task Inference ###################################################")
    task_type = model_dict['compute_pars'].get('task_inference', 'transform')
    if model_class in SUPERVISED_MODELS:
        dfXy = transform(model_file, path_model,
                         (dfX[[c for c in dfX.columns if c not in coly]], df[coly]),
                         model_dict, task_type=task_type)
    else:
        dfXy = transform(model_file, path_model, dfX, model_dict, task_type=task_type)
    post_process_fun = model_dict['model_pars']['post_process_fun']

    if return_mode == 'dict':
        return {'dfXy': dfXy}
    else:
        log("#### Export ##################################################################")
        path_check_out = m.get('path_check_out', path_output + "/check/")
        os.makedirs(path_check_out, exist_ok=True)
        dfX.to_parquet(path_check_out + "/dfX.parquet")  # train input data generate parquet
    log("######### Finish #############################################################", )
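
# --- Usage sketch (illustrative only) ---------------------------------------------------
# Hypothetical call retrieving the transformed frame in memory; with return_mode='dict'
# the function returns {'dfXy': ...} instead of exporting to disk. The config name
# "my_config" and the sample size are placeholders.
def example_run_transform():
    out = run_transform(config_name = "my_config",
                        config_path = "source/config_model.py",
                        n_sample    = 100,
                        return_mode = 'dict')
    return out['dfXy']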