def model_load(prefix='sl', data_dir=None, training=True):
    """
    Load the serialized models matching ``prefix`` and the engineered data.

    Parameters
    ----------
    prefix : str
        Pattern searched (``re.search``) in each file name under ``./models``.
    data_dir : str or None
        Directory passed to ``fetch_ts``; defaults to ``<cwd>/data/cs-train``.
    training : bool
        Forwarded to ``engineer_features``.

    Returns
    -------
    tuple
        ``(all_data, all_models)``. ``all_data`` maps each key returned by
        ``fetch_ts`` to ``{"X", "y", "dates"}``; ``all_models`` maps the second
        ``-``-separated token of each model file name to the loaded object.

    Raises
    ------
    Exception
        If no file under ``./models`` matches ``prefix``.
    """
    if not data_dir:
        data_dir = os.path.join(os.getcwd(), "data", "cs-train")

    model_dir = os.path.join(".", "models")
    # Honour the caller's prefix: the filter used to be hard-coded to "sl",
    # so any other prefix was silently ignored.
    models = [f for f in os.listdir(model_dir) if re.search(prefix, f)]
    if not models:
        raise Exception(
            "Models with prefix '{}' cannot be found did you train?".format(
                prefix))

    all_models = {
        re.split("-", model)[1]: joblib.load(os.path.join(model_dir, model))
        for model in models
    }

    ## load data
    ts_data = fetch_ts(data_dir)
    all_data = {}
    for country, df in ts_data.items():
        X, y, dates = engineer_features(df, training=training)
        # dates are stored as strings
        dates = np.array([str(d) for d in dates])
        all_data[country] = {"X": X, "y": y, "dates": dates}

    return (all_data, all_models)
def _model_train(df=None, prefix='sl', country=None, test=False):
    """
    Grid-search a scaled random-forest pipeline and serialize the result.

    The 'test' flag when set to 'True' subsets the data to a random 30% and
    serializes the model under a ``test-`` file name instead of ``prefix``.

    Parameters
    ----------
    df : object
        Input handed to ``engineer_features``.
    prefix : str
        File-name prefix used for the non-test model.
    country : str or None
        Label embedded in the saved file name.
    test : bool
        See above.
    """
    ## start timer for runtime
    time_start = time.time()

    X, y, dates = engineer_features(df)

    if test:
        n_samples = int(np.round(0.3 * X.shape[0]))
        subset_indices = np.random.choice(np.arange(X.shape[0]), n_samples,
                                          replace=False).astype(int)
        mask = np.in1d(np.arange(y.size), subset_indices)
        y = y[mask]
        X = X[mask]
        dates = dates[mask]

    ## Perform a train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25,
                                                        shuffle=True,
                                                        random_state=42)

    ## train a random forest model
    param_grid_rf = {
        'rf__criterion': ['mse', 'mae'],
        'rf__n_estimators': [10, 15, 20, 25]
    }
    pipe_rf = Pipeline(steps=[('scaler', StandardScaler()),
                              ('rf', RandomForestRegressor())])

    # NOTE(review): `iid` was removed from GridSearchCV in scikit-learn 0.24;
    # confirm the pinned version before upgrading.
    grid = GridSearchCV(pipe_rf, param_grid=param_grid_rf, cv=5, iid=False,
                        n_jobs=-1)
    # Fit on the whole training split. The previous `[:5]` slice left a
    # single sample per CV fold, so the hold-out RMSE came from a model
    # trained on five rows.
    grid.fit(X_train, y_train)
    y_pred = grid.predict(X_test)
    eval_rmse = round(np.sqrt(mean_squared_error(y_test, y_pred)))

    ## retrain using all data
    grid.fit(X, y)
    model_name = re.sub(r"\.", "_", str(MODEL_VERSION))
    if test:
        saved_model = os.path.join(MODEL_DIR,
                                   "test-{}-{}.joblib".format(country, model_name))
        print("... saving test version of model: {}".format(saved_model))
    else:
        saved_model = os.path.join(MODEL_DIR,
                                   "{}-{}-{}.joblib".format(prefix, country, model_name))
        print("... saving model: {}".format(saved_model))

    joblib.dump(grid, saved_model)

    runtime = convert((time.time() - time_start))

    ## update log
    # first and last date of the (possibly subsetted) data
    date_range = f"{str(dates[0])} - {str(dates[-1])}"
def model_load(prefix='sl', data_dir=None, training=True):
    """
    Load the serialized models matching ``prefix`` and the cleaned,
    engineered data.

    Parameters
    ----------
    prefix : str
        Pattern searched (``re.search``) in each file name under ``./models``.
    data_dir : str or None
        Directory passed to ``fetch_ts``; defaults to
        ``../capstone-w/cs-train``.
    training : bool
        Forwarded to ``engineer_features``.

    Returns
    -------
    tuple
        ``(all_data, all_models)``. ``all_data`` maps each key returned by
        ``fetch_ts`` to ``{"X", "y", "dates"}``; ``all_models`` maps the second
        ``-``-separated token of each model file name to the loaded object.

    Raises
    ------
    Exception
        If no file under ``./models`` matches ``prefix``.
    """
    if not data_dir:
        data_dir = os.path.join("..", "capstone-w", "cs-train")

    model_dir = os.path.join(".", "models")
    # Honour the caller's prefix: the filter used to be hard-coded to "sl",
    # so any other prefix was silently ignored.
    models = [f for f in os.listdir(model_dir) if re.search(prefix, f)]
    if not models:
        raise Exception(
            "Models with prefix '{}' cannot be found did you train?".format(
                prefix))

    all_models = {
        re.split("-", model)[1]: joblib.load(os.path.join(model_dir, model))
        for model in models
    }

    # load data
    ts_data = fetch_ts(data_dir)
    all_data = {}
    for country, df in ts_data.items():
        df = clean_data(df)
        X, y, dates = engineer_features(df, training=training)
        # dates are stored as strings
        dates = np.array([str(d) for d in dates])
        all_data[country] = {"X": X, "y": y, "dates": dates}

    return (all_data, all_models)
def _model_train(df, tag, pipe, param_grid, test=False):
    """
    Grid-search ``pipe`` over ``param_grid``, serialize it and log the run.

    The 'test' flag when set to 'True':
        (1) subsets the data to a random 30% and serializes a test version
        (2) is forwarded to ``update_train_log``

    Parameters
    ----------
    df : object
        Input handed to ``engineer_features``.
    tag : str
        Label embedded in the saved file name and passed to the log.
    pipe : estimator
        Estimator/pipeline given to ``GridSearchCV``.
    param_grid : dict or list of dict
        Search space given to ``GridSearchCV``.
    test : bool
        See above.
    """
    ## start timer for runtime
    time_start = time.time()

    X, y, dates = engineer_features(df)

    if test:
        n_samples = int(np.round(0.3 * X.shape[0]))
        subset_indices = np.random.choice(np.arange(X.shape[0]), n_samples,
                                          replace=False).astype(int)
        mask = np.in1d(np.arange(y.size), subset_indices)
        y = y[mask]
        X = X[mask]
        dates = dates[mask]

    ## Perform a train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25,
                                                        shuffle=True,
                                                        random_state=42)

    grid = GridSearchCV(pipe, param_grid=param_grid, cv=5, n_jobs=-1)
    grid.fit(X_train, y_train)
    y_pred = grid.predict(X_test)
    eval_rmse = round(np.sqrt(mean_squared_error(y_test, y_pred)))

    ## retrain using all data
    grid.fit(X, y)
    # raw string: "\." in a plain literal is an invalid escape sequence
    model_name = re.sub(r"\.", "_", str(MODEL_VERSION))
    if test:
        saved_model = os.path.join(MODEL_DIR,
                                   "test-{}-{}.joblib".format(tag, model_name))
        print("... saving test version of model: {}".format(saved_model))
    else:
        saved_model = os.path.join(MODEL_DIR,
                                   "sl-{}-{}.joblib".format(tag, model_name))
        print("... saving model: {}".format(saved_model))

    joblib.dump(grid, saved_model)

    # elapsed wall-clock time as HHH:MM:SS
    m, s = divmod(time.time() - time_start, 60)
    h, m = divmod(m, 60)
    runtime = "%03d:%02d:%02d" % (h, m, s)

    ## update log
    update_train_log(tag, (str(dates[0]), str(dates[-1])),
                     {'rmse': eval_rmse}, runtime, MODEL_VERSION, test=test)
def model_monitor(country="all", dev=DEV, training=True):
    """
    Monitor model performance on simulated samples.

    Optionally retrains, prints the RMSE of ``model_predict`` on simulated
    samples of several sizes, then returns a DataFrame with Wasserstein
    distances and outlier fractions (``EllipticEnvelope``) per sample size,
    computed on standard-scaled features.
    """
    print("Monitor Model")

    ## import data
    X, y, dates, labels = engineer_features(training=training)[country]
    dates = pd.to_datetime(dates)
    print(X.shape)

    ## train the model
    if training:
        _model_train(X, y, labels, tag=country, dev=dev)

    ## monitor RMSE
    for size in (10, 20, 30, 50, 60):
        X_new, y_new, dates_new = simulate_samples(size, X, y, dates)
        predictions = []
        for stamp in dates_new:
            answer = model_predict(year=str(stamp.year),
                                   month=str(stamp.month),
                                   day=str(stamp.day),
                                   country=country,
                                   verbose=False,
                                   dev=dev)
            predictions.append(answer["y_pred"][0].round(2))
        rmse = np.sqrt(mean_squared_error(y_new.tolist(), predictions))
        print("sample size: {}, RSME: {}".format(size, rmse.round(2)))

    ## monitor performance
    ## scaling
    X = StandardScaler().fit_transform(X)

    clf_y = EllipticEnvelope(random_state=0, contamination=0.01)
    clf_X = EllipticEnvelope(random_state=0, contamination=0.01)
    clf_X.fit(X)
    clf_y.fit(y.reshape(y.size, 1))

    results = {
        "sample_size": [],
        "wasserstein_X": [],
        "wasserstein_y": [],
        "outlier_percent_X": [],
        "outlier_percent_y": [],
    }
    for size in (25, 50, 75, 90):
        X_new, y_new, dates_new = simulate_samples(size, X, y, dates)

        dist_X = wasserstein_distance(X.flatten(), X_new.flatten())
        dist_y = wasserstein_distance(y, y_new)

        # EllipticEnvelope.predict labels inliers as 1
        flags_X = clf_X.predict(X_new)
        flags_y = clf_y.predict(y_new.reshape(y_new.size, 1))
        inlier_share_X = np.count_nonzero(flags_X == 1) / flags_X.size
        inlier_share_y = np.count_nonzero(flags_y == 1) / flags_y.size

        results["sample_size"].append(size)
        results["wasserstein_X"].append(np.round(dist_X, 2))
        results["wasserstein_y"].append(np.round(dist_y, 2))
        results["outlier_percent_X"].append(np.round(1.0 - inlier_share_X, 2))
        results["outlier_percent_y"].append(np.round(1.0 - inlier_share_y, 2))

    return pd.DataFrame(results)
def _model_train_gradient_boost(df, tag, test=False):
    """
    Grid-search a scaled gradient-boosting pipeline and print its hold-out
    MSE and RMSE.

    When ``test`` is true, only a random 30% of the rows is used.
    Nothing is saved or returned.
    """
    ## start timer for runtime
    time_start = time.time()

    X, y, dates = engineer_features(df)

    if test:
        row_count = X.shape[0]
        n_samples = int(np.round(0.3 * row_count))
        picked = np.random.choice(np.arange(row_count), n_samples,
                                  replace=False).astype(int)
        keep = np.in1d(np.arange(y.size), picked)
        X, y, dates = X[keep], y[keep], dates[keep]

    ## hold out a quarter of the rows for evaluation
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, shuffle=True, random_state=42)

    ## gradient boost regressor behind a standard scaler
    pipeline = Pipeline(steps=[
        ('scaler', StandardScaler()),
        ('gb', GradientBoostingRegressor()),
    ])
    search_space = {
        'gb__criterion': ['mse', 'mae'],
        'gb__n_estimators': [10, 15, 20, 25],
    }
    grid = GridSearchCV(pipeline, param_grid=search_space, cv=5, n_jobs=-1)
    grid.fit(X_train, y_train)

    mse = mean_squared_error(y_test, grid.predict(X_test))
    eval_mse = round(mse)
    eval_rmse = round(np.sqrt(mse))

    print("country", tag, "-- gradient boost regressor eval --")
    print("sme:", eval_mse)
    print("rsme:", eval_rmse)
def model_load(prefix='sl', data_dir=None, training=True, country=None):
    """
    Load serialized models (all, or those for one country) and the
    engineered data.

    Parameters
    ----------
    prefix : str
        Pattern searched (``re.search``) in each file name under ``./models``.
    data_dir : str or None
        Directory passed to ``fetch_ts``; defaults to ``./data/cs-train``.
    training : bool
        Forwarded to ``engineer_features``.
    country : str or None
        When given, only files that also match the lower-cased,
        underscore-joined country name are loaded, and the value is
        forwarded to ``fetch_ts``.

    Returns
    -------
    tuple
        ``(all_data, all_models)``. ``all_data`` maps each key returned by
        ``fetch_ts`` to ``{"X", "y", "dates"}``; ``all_models`` maps the second
        ``-``-separated token of each model file name to the loaded object.

    Raises
    ------
    Exception
        If no model file matches.
    """
    ## if data path not specified, use generic
    if not data_dir:
        data_dir = os.path.join(".", "data", "cs-train")

    ## load all models (or filter for country)
    model_dir = os.path.join(".", "models")
    models = [f for f in os.listdir(model_dir) if re.search(prefix, f)]
    if country is not None:
        country_id = re.sub(r"\s+", "_", country.lower())
        models = [f for f in models if re.search(country_id, f)]

    if not models:
        if country is None:
            raise Exception(
                "Models with prefix '{}' cannot be found, did you train?".
                format(prefix))
        # The arguments were previously passed as (prefix, country), so the
        # message named the prefix as the country and vice versa.
        raise Exception(
            "Model for '{0}' with prefix '{1}' cannot be found, did you train it?"
            .format(country, prefix))

    ## store model in dictionary
    ## key = model name
    ## value = model
    all_models = {
        re.split("-", model)[1]: joblib.load(os.path.join(model_dir, model))
        for model in models
    }

    ## load data
    ts_data = fetch_ts(data_dir, country=country)
    all_data = {}
    # a distinct loop name keeps the `country` argument intact
    for name, df in ts_data.items():
        X, y, dates = engineer_features(df, training=training)
        dates = np.array([str(d) for d in dates])
        all_data[name] = {"X": X, "y": y, "dates": dates}

    return (all_data, all_models)
def get_latest_train_data(prod_dir=None):
    """
    Load the production data and return its engineered features.

    Parameters
    ----------
    prod_dir : str or None
        Directory passed to ``fetch_data``. Defaults to the original
        hard-coded production folder.

    Returns
    -------
    tuple
        ``(X, y)`` as produced by ``engineer_features(..., training=False)``.

    Raises
    ------
    Exception
        If ``prod_dir`` does not exist.
    """
    if prod_dir is None:
        prod_dir = r"C:\Users\AshwiniShitole\Desktop\Ashwini\Personal\Data Science\AI Academy\AI Enterprise Workflow Certification\AI in Production\Capstone_Project\case-study-soln\data\cs-production"

    if not os.path.exists(prod_dir):
        # Report the path that was checked. The message used to reference
        # `data_file`, which is not defined here, so this branch raised
        # NameError instead.
        raise Exception(
            "cannot find {}-- did you train the model?".format(prod_dir))

    prod_data = fetch_data(prod_dir)
    prod_data = prod_data.drop(columns=['customer_id'])
    prod_ts = convert_to_ts(prod_data)
    X, y, _ = engineer_features(prod_ts, training=False)

    return (X, y)
def model_load(prefix='sl', data_dir=None, training=True):
    """
    Return ``(all_data, all_models)`` for the given model prefix.

    Models come from ``model_load_only``. The data comes from ``fetch_ts``
    on ``data_dir`` (default: ``Final_Capstone/cs-train`` under
    ``PARENT_DIR``), run through ``engineer_features`` per country.
    """
    source_dir = data_dir
    if not source_dir:
        source_dir = os.path.join(PARENT_DIR, "Final_Capstone/cs-train")

    all_models = model_load_only(prefix=prefix)

    ## build the per-country feature dictionaries
    all_data = {}
    for name, frame in fetch_ts(source_dir).items():
        features, target, stamps = engineer_features(frame, training=training)
        all_data[name] = {
            "X": features,
            "y": target,
            "dates": np.array(list(map(str, stamps))),
        }

    return (all_data, all_models)
def model_load(country, prefix='sl', data_dir=None, training=True):
    """
    Load the models whose file name matches ``<prefix>-<country>`` together
    with the engineered data for every country returned by ``fetch_ts``.

    Returns ``(all_data, all_models)``; raises ``Exception`` when no model
    file in ``MODEL_DIR`` matches. Warnings are silenced globally as a side
    effect.
    """
    warnings.filterwarnings("ignore")

    data_dir = data_dir or os.path.join(DATA_DIR)

    # file-name pattern identifying the requested model(s)
    pattern = prefix + '-' + country
    matches = []
    for fname in os.listdir(os.path.join(MODEL_DIR)):
        if re.search(pattern, fname):
            matches.append(fname)

    if not matches:
        raise Exception(
            "Models with prefix '{}' cannot be found did you train?".format(
                prefix))

    all_models = {
        re.split("-", fname)[1]: joblib.load(os.path.join(MODEL_DIR, fname))
        for fname in matches
    }

    # load data
    all_data = {}
    for name, frame in fetch_ts(data_dir).items():
        features, target, stamps = engineer_features(frame, training=training)
        all_data[name] = {
            "X": features,
            "y": target,
            "dates": np.array(list(map(str, stamps))),
        }

    return (all_data, all_models)
def _model_train(df, tag, test=False):
    """
    Grid-search four regressors, keep the one with the lowest hold-out RMSE,
    refit it on all data, serialize it and log the run.

    The 'test' flag when set to 'True':
        (1) subsets the data to a random 30% and serializes a test version
        (2) is forwarded to ``update_train_log``

    Also pickles the training arrays to ``models/latest-train.pickle`` (in
    non-test mode) and shows a bar plot of the per-model RMSE.
    """
    ## start timer for runtime
    time_start = time.time()

    X, y, dates = engineer_features(df)
    rs = 42

    if test:
        n_samples = int(np.round(0.3 * X.shape[0]))
        subset_indices = np.random.choice(np.arange(X.shape[0]), n_samples,
                                          replace=False).astype(int)
        mask = np.in1d(np.arange(y.size), subset_indices)
        y = y[mask]
        X = X[mask]
        dates = dates[mask]

    ## Perform a train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25,
                                                        shuffle=True,
                                                        random_state=42)

    ## build models
    regressor_names = [
        "SGDRegressor", "RandomForestRegressor", "GradientBoostingRegressor",
        "AdaBoostRegressor"
    ]
    regressors = (SGDRegressor(random_state=rs),
                  RandomForestRegressor(random_state=rs),
                  GradientBoostingRegressor(random_state=rs),
                  AdaBoostRegressor(random_state=rs))
    params = [{
        "reg__penalty": ["l1", "l2", "elasticnet"],
        "reg__learning_rate": ["constant", "optimal", "invscaling"]
    }, {
        "reg__n_estimators": [10, 30, 50],
        "reg__max_features": [3, 4, 5],
        "reg__bootstrap": [True, False]
    }, {
        "reg__n_estimators": [10, 30, 50],
        "reg__max_features": [3, 4, 5],
        "reg__learning_rate": [1, 0.1, 0.01, 0.001]
    }, {
        "reg__n_estimators": [10, 30, 50],
        "reg__learning_rate": [1, 0.1, 0.01, 0.001]
    }]

    ## train models
    models = {}
    for name, regressor, param in zip(regressor_names, regressors, params):
        pipe = Pipeline(steps=[('scaler', StandardScaler()),
                               ("reg", regressor)])
        grid = GridSearchCV(pipe, param_grid=param,
                            scoring="neg_mean_squared_error", cv=5,
                            n_jobs=-1, return_train_score=True)
        grid.fit(X_train, y_train)
        models[name] = grid

    ## evaluation on the validation set (same order as regressor_names)
    val_scores = [
        np.sqrt(mean_squared_error(model.predict(X_test), y_test))
        for model in models.values()
    ]

    ## select best model
    bm = regressor_names[np.argmin(val_scores)]
    opt_model = models[bm]
    print("cuurent optimal model is: ", bm)

    ## retrain best model using all data
    opt_model.fit(X, y)
    model_name = re.sub(r"\.", "_", str(MODEL_VERSION))
    if test:
        saved_model = os.path.join(MODEL_DIR,
                                   "test-{}-{}.joblib".format(tag, model_name))
        print("... saving test version of model: {}".format(saved_model))
    else:
        saved_model = os.path.join(MODEL_DIR,
                                   "sl-{}-{}.joblib".format(tag, model_name))
        print("... saving model: {}".format(saved_model))
        print("... saving latest data")
        data_file = os.path.join("models", 'latest-train.pickle')
        with open(data_file, 'wb') as tmp:
            pickle.dump({'y': y, 'X': X}, tmp)

    joblib.dump(opt_model, saved_model)

    # elapsed wall-clock time as HHH:MM:SS
    m, s = divmod(time.time() - time_start, 60)
    h, m = divmod(m, 60)
    runtime = "%03d:%02d:%02d" % (h, m, s)

    ## plot the figure of rmse
    model_names = ['SGD', 'RF', 'GBM', 'ADA']
    plt.figure(figsize=(10, 5))
    plt.bar(model_names, val_scores, width=0.4)
    plt.xlabel("Model Names")
    plt.ylabel("Model Errors")
    plt.title("Model Training RMSE Comparisons")
    plt.show()

    ## update log
    # Forward the caller's flag: it was hard-coded to True, so every run was
    # logged as a test run.
    update_train_log(tag, (str(dates[0]), str(dates[-1])),
                     {'rmse': min(val_scores)}, runtime, MODEL_VERSION,
                     MODEL_VERSION_NOTE, test=test)
def _model_train(df, tag, test=False):
    """
    Grid-search a random-forest and a bagging (SVR) pipeline, keep the one
    with the best mean cross-validation score, refit it on all data,
    serialize it and log the run.

    The 'test' flag when set to 'True':
        (1) subsets the data to a random 30% and serializes a test version
        (2) is forwarded to ``update_train_log``
    """
    ## start timer for runtime
    time_start = time.time()

    X, y, dates = engineer_features(df)

    if test:
        n_samples = int(np.round(0.3 * X.shape[0]))
        subset_indices = np.random.choice(np.arange(X.shape[0]), n_samples,
                                          replace=False).astype(int)
        mask = np.in1d(np.arange(y.size), subset_indices)
        y = y[mask]
        X = X[mask]
        dates = dates[mask]

    ## Perform a train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25,
                                                        shuffle=True,
                                                        random_state=42)

    def _fit_and_score(grid):
        """Fit ``grid``, print its summary and return its rank-1 CV rows."""
        grid.fit(X_train, y_train)
        scores = pd.DataFrame(grid.cv_results_).sort_values(by='rank_test_score')
        scores = scores[scores['rank_test_score'] == 1].copy()
        scores['model'] = [grid] * len(scores)
        print(grid.best_params_)
        print(grid.best_score_)
        rmse = round(np.sqrt(mean_squared_error(y_test, grid.predict(X_test))))
        scores['eval_rmse'] = rmse
        print("eval_rmse: {}".format(rmse))
        return scores

    # NOTE(review): `iid` was removed from GridSearchCV in scikit-learn 0.24;
    # confirm the pinned version before upgrading.

    ## train a random forest model
    print("\nTRAINING MODELS: RANDOM FOREST MODEL")
    param_grid_rf = {
        'rf__criterion': ['mse', 'mae'],
        'rf__n_estimators': [10, 15, 20, 25]
    }
    pipe_rf = Pipeline(steps=[('scaler', StandardScaler()),
                              ('rf', RandomForestRegressor())])
    grid_rf = GridSearchCV(pipe_rf, param_grid=param_grid_rf, cv=5, iid=False,
                           n_jobs=-1, return_train_score=True)
    scores_df_rf = _fit_and_score(grid_rf)
    print("\nEND OF TRAINING MODELS: RANDOM FOREST MODEL")

    ## train a bagging model
    print("\nTRAINING MODELS: BAGGING MODEL")
    pipe_bag = Pipeline(steps=[('scaler', StandardScaler()),
                               ('bag', BaggingRegressor(base_estimator=SVR(),
                                                        random_state=0))])
    param_grid_bag = {'bag__n_estimators': [10, 15, 20, 25]}
    grid_bag = GridSearchCV(pipe_bag, param_grid=param_grid_bag, cv=5,
                            iid=False, n_jobs=-1)
    scores_df_bag = _fit_and_score(grid_bag)
    print("\nEND OF TRAINING MODELS: BAGGING MODEL")

    ## Compare models
    # pd.concat replaces DataFrame.append, which pandas 2.0 removed
    results_df = pd.concat([scores_df_rf, scores_df_bag],
                           ignore_index=True).sort_values(
                               by='mean_test_score', ascending=False)
    print(results_df[['model', 'params', 'mean_test_score', 'mean_fit_time',
                      'mean_score_time', 'eval_rmse']])
    # Take the first row by position: `.loc[0]` selected index label 0, which
    # is always the random-forest row regardless of the sort.
    best_row = results_df.iloc[0]
    best_model = best_row['model']
    eval_rmse = best_row['eval_rmse']
    print("best_model: {}".format(best_model))

    ## retrain the selected model using all data
    best_model.fit(X, y)
    model_name = re.sub(r"\.", "_", str(MODEL_VERSION))
    if test:
        saved_model = os.path.join(MODEL_DIR,
                                   "test-{}-{}.joblib".format(tag, model_name))
        print("... saving test version of model: {}".format(saved_model))
    else:
        saved_model = os.path.join(MODEL_DIR,
                                   "sl-{}-{}.joblib".format(tag, model_name))
        print("... saving model: {}".format(saved_model))

    joblib.dump(best_model, saved_model)

    # elapsed wall-clock time as HHH:MM:SS
    m, s = divmod(time.time() - time_start, 60)
    h, m = divmod(m, 60)
    runtime = "%03d:%02d:%02d" % (h, m, s)

    ## update log
    # Log the selected model's RMSE and forward the caller's flag (it was
    # hard-coded to True).
    update_train_log(tag, (str(dates[0]), str(dates[-1])),
                     {'rmse': eval_rmse}, runtime, MODEL_VERSION,
                     MODEL_VERSION_NOTE, test=test)
def _model_train(prefix,
                 df,
                 tag,
                 test=False,
                 model=DEFAULT_MODEL,
                 model_param_grid=DEFAULT_PARAM_GRID,
                 scaler=DEFAULT_SCALER):
    """
    Grid-search a ``scaler`` + ``model`` pipeline, serialize it and log the
    run.

    The 'test' flag when set to 'True':
        (1) subsets the data to a random 30% and serializes a test version
        (2) is forwarded to ``update_train_log``

    Parameters
    ----------
    prefix : str
        File-name prefix for the non-test model and for the pickled data.
    df : object
        Input handed to ``engineer_features``.
    tag : str
        Label embedded in the file names and passed to the log.
    test : bool
        See above.
    model, model_param_grid, scaler
        Pipeline steps (named ``'rf'`` and ``'scaler'``) and the search space
        given to ``GridSearchCV``.
    """
    ## start timer for runtime
    time_start = time.time()

    X, y, dates = engineer_features(df)

    if test:
        n_samples = int(np.round(0.3 * X.shape[0]))
        subset_indices = np.random.choice(np.arange(X.shape[0]), n_samples,
                                          replace=False).astype(int)
        mask = np.in1d(np.arange(y.size), subset_indices)
        y = y[mask]
        X = X[mask]
        dates = dates[mask]

    ## Perform a train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25,
                                                        shuffle=True,
                                                        random_state=42)

    pipe_rf = Pipeline(steps=[('scaler', scaler), ('rf', model)])
    grid = GridSearchCV(pipe_rf, param_grid=model_param_grid, cv=5, n_jobs=-1)
    grid.fit(X_train, y_train)
    y_pred = grid.predict(X_test)
    eval_rmse = round(np.sqrt(mean_squared_error(y_test, y_pred)))

    ## retrain using all data
    grid.fit(X, y)
    # raw string: "\." in a plain literal is an invalid escape sequence
    model_name = re.sub(r"\.", "_", str(MODEL_VERSION))
    if test:
        saved_model = os.path.join(MODEL_DIR,
                                   "test-{}-{}.joblib".format(tag, model_name))
        print("... saving test version of model: {}".format(saved_model))
    else:
        saved_model = os.path.join(
            MODEL_DIR, "{}-{}-{}.joblib".format(prefix, tag, model_name))
        print("... saving model: {}".format(saved_model))

    # keep the arrays the saved model was fitted on next to it
    data_file = os.path.join(
        MODEL_DIR, '{}-{}-{}-train.pickle'.format(prefix, tag, model_name))
    with open(data_file, 'wb') as tmp:
        pickle.dump({'y': y, 'X': X}, tmp)
    print("... saving latest data")

    joblib.dump(grid, saved_model)

    # elapsed wall-clock time as HHH:MM:SS
    m, s = divmod(time.time() - time_start, 60)
    h, m = divmod(m, 60)
    runtime = "%03d:%02d:%02d" % (h, m, s)

    ## update log
    update_train_log(tag, (str(dates[0]), str(dates[-1])),
                     {'rmse': eval_rmse}, runtime, MODEL_VERSION,
                     MODEL_VERSION_NOTE, test=test)
def _model_train(df, tag, test=False):
    """
    Train, serialize and log a random-forest model and an XGBoost model.

    The 'test' flag when set to 'True':
        (1) subsets the data to a random 30% and serializes a test version
        (2) is forwarded to ``update_train_log``

    NOTE(review): in test mode both models are written to the same
    ``test-<tag>-<version>.joblib`` path, so the XGBoost model overwrites the
    random forest. File names are left unchanged because loaders may depend
    on them.
    """
    # start timer for runtime
    time_start = time.time()

    x, y, dates = engineer_features(df)

    if test:
        n_samples = int(np.round(0.3 * x.shape[0]))
        subset_indices = np.random.choice(np.arange(x.shape[0]), n_samples,
                                          replace=False).astype(int)
        mask = np.in1d(np.arange(y.size), subset_indices)
        y = y[mask]
        x = x[mask]
        dates = dates[mask]

    # Perform a train-test split
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25,
                                                        shuffle=True,
                                                        random_state=42)

    period = (str(dates[0]), str(dates[-1]))
    # raw string: "\." in a plain literal is an invalid escape sequence
    model_name = re.sub(r"\.", "_", str(MODEL_VERSION))

    print("... training random forest for {}".format(tag))
    # train a random forest model
    param_grid_rf = {
        'rf__criterion': ['mse', 'mae'],
        'rf__n_estimators': [10, 15, 20, 25]
    }
    pipe_rf = Pipeline(steps=[('scaler', StandardScaler()),
                              ('rf', RandomForestRegressor())])
    grid_rf = GridSearchCV(pipe_rf, param_grid=param_grid_rf, cv=5, n_jobs=1)
    grid_rf.fit(x_train, y_train)
    y_pred_rf = grid_rf.predict(x_test)
    eval_rmse_rf = round(np.sqrt(mean_squared_error(y_test, y_pred_rf)))

    # retrain using all data
    grid_rf.fit(x, y)
    if test:
        saved_model = os.path.join(MODEL_DIR,
                                   "test-{}-{}.joblib".format(tag, model_name))
        print("... saving test version of model: {}".format(saved_model))
    else:
        saved_model = os.path.join(MODEL_DIR,
                                   "rf-{}-{}.joblib".format(tag, model_name))
        print("... saving model: {}".format(saved_model))
    joblib.dump(grid_rf, saved_model)

    m, s = divmod(time.time() - time_start, 60)
    h, m = divmod(m, 60)
    runtime = "%03d:%02d:%02d" % (h, m, s)

    # update log; forward the caller's flag, which was previously dropped
    update_train_log(tag, period, {'rmse': eval_rmse_rf}, runtime,
                     MODEL_VERSION, MODEL_VERSION_NOTE, test=test)

    print("... training XGBRegressor for {}".format(tag))
    # time the XGBoost run on its own instead of re-logging the RF runtime
    xgb_start = time.time()
    pipe_xgb = Pipeline(steps=[('scaler', StandardScaler()),
                               ('xgb_model', xgb.XGBRegressor())])
    param_grid_xgb = {
        'xgb_model__subsample': np.arange(.05, 1, .05),
        'xgb_model__max_depth': np.arange(3, 20, 1),
        'xgb_model__colsample_bytree': np.arange(.1, 1.05, .05)
    }
    grid_xgb = RandomizedSearchCV(estimator=pipe_xgb,
                                  param_distributions=param_grid_xgb,
                                  n_iter=10,
                                  scoring='neg_mean_squared_error',
                                  cv=4)
    grid_xgb.fit(x_train, y_train)
    y_pred_xgb = grid_xgb.predict(x_test)
    eval_rmse_xgb = round(np.sqrt(mean_squared_error(y_test, y_pred_xgb)))

    # retrain using all data
    grid_xgb.fit(x, y)
    if test:
        saved_model = os.path.join(MODEL_DIR,
                                   "test-{}-{}.joblib".format(tag, model_name))
        print("... saving test version of model: {}".format(saved_model))
    else:
        saved_model = os.path.join(MODEL_DIR,
                                   "xgb-{}-{}.joblib".format(tag, model_name))
        print("... saving model: {}".format(saved_model))
    joblib.dump(grid_xgb, saved_model)

    m, s = divmod(time.time() - xgb_start, 60)
    h, m = divmod(m, 60)
    runtime = "%03d:%02d:%02d" % (h, m, s)

    update_train_log(tag, period, {'rmse': eval_rmse_xgb}, runtime,
                     MODEL_VERSION, MODEL_VERSION_NOTE, test=test)
def _model_train(df, tag, test=False):
    """
    Grid-search a scaled regressor (random forest or SVR), serialize the
    result and log the run.

    The 'test' flag when set to 'True':
        (1) subsets the data to a random 30% and serializes a test version
        (2) is forwarded to ``update_train_log``
    """
    ## start timer for runtime
    time_start = time.time()

    X, y, dates = engineer_features(df)

    if test:
        n_samples = int(np.round(0.3 * X.shape[0]))
        subset_indices = np.random.choice(np.arange(X.shape[0]), n_samples,
                                          replace=False).astype(int)
        mask = np.in1d(np.arange(y.size), subset_indices)
        y = y[mask]
        X = X[mask]
        dates = dates[mask]

    ## Perform a train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25,
                                                        shuffle=True,
                                                        random_state=42)

    ## search over two regressor families; the 'regr' step is swapped per grid
    param_grid_rf = [{
        'regr': [RandomForestRegressor()],
        'regr__criterion': ['mse', 'mae'],
        'regr__n_estimators': [10, 15, 20, 25]
    }, {
        'regr': [SVR()],
        'regr__kernel': ['rbf', 'linear'],
        'regr__C': [1.0, 1.5]
    }]
    pipe_rf = Pipeline(steps=[('scaler', StandardScaler()),
                              ('regr', RandomForestRegressor())])

    # NOTE(review): `iid` was removed from GridSearchCV in scikit-learn 0.24;
    # confirm the pinned version before upgrading.
    grid = GridSearchCV(pipe_rf, param_grid=param_grid_rf, cv=5, iid=False,
                        n_jobs=-1)
    grid.fit(X_train, y_train)
    y_pred = grid.predict(X_test)
    eval_rmse = round(np.sqrt(mean_squared_error(y_test, y_pred)))

    ## retrain using all data
    grid.fit(X, y)
    model_name = re.sub(r"\.", "_", str(MODEL_VERSION))
    if test:
        saved_model = os.path.join(MODEL_DIR,
                                   "test-{}-{}.joblib".format(tag, model_name))
        print("... saving test version of model: {}".format(saved_model))
    else:
        saved_model = os.path.join(MODEL_DIR,
                                   "sl-{}-{}.joblib".format(tag, model_name))
        print("... saving model: {}".format(saved_model))

    joblib.dump(grid, saved_model)

    # elapsed wall-clock time as HHH:MM:SS
    m, s = divmod(time.time() - time_start, 60)
    h, m = divmod(m, 60)
    runtime = "%03d:%02d:%02d" % (h, m, s)

    ## update log
    # Forward the caller's flag: it was hard-coded to True, so every run was
    # logged as a test run.
    update_train_log(tag, (str(dates[0]), str(dates[-1])),
                     {'rmse': eval_rmse}, runtime, MODEL_VERSION,
                     MODEL_VERSION_NOTE, test=test)
def _model_train(df, tag, test=False):
    """
    Grid-search a scaled random-forest pipeline, serialize it and log the
    run.

    When ``test`` is true, only a random 30% of the rows is used, the model
    is saved under a ``test-`` file name, and the flag is forwarded to
    ``update_train_log``.
    """
    time_start = time.time()

    X, y, dates = engineer_features(df)

    if test:
        n_samples = int(np.round(0.3 * X.shape[0]))
        subset_indices = np.random.choice(np.arange(X.shape[0]), n_samples,
                                          replace=False).astype(int)
        mask = np.in1d(np.arange(y.size), subset_indices)
        y = y[mask]
        X = X[mask]
        dates = dates[mask]

    # hold out a quarter of the rows for evaluation
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25,
                                                        shuffle=True,
                                                        random_state=42)

    param_grid_rf = {
        'rf__criterion': ['mse', 'mae'],
        'rf__n_estimators': [10, 15, 20, 25]
    }
    pipe_rf = Pipeline(steps=[('scaler', StandardScaler()),
                              ('rf', RandomForestRegressor())])

    # NOTE(review): `iid` was removed from GridSearchCV in scikit-learn 0.24;
    # confirm the pinned version before upgrading.
    grid = GridSearchCV(pipe_rf, param_grid=param_grid_rf, cv=5, iid=False,
                        n_jobs=-1)
    grid.fit(X_train, y_train)
    y_pred = grid.predict(X_test)
    eval_rmse = round(np.sqrt(mean_squared_error(y_test, y_pred)))

    # retrain using all data
    grid.fit(X, y)
    model_name = re.sub(r"\.", "_", str(MODEL_VERSION))
    if test:
        saved_model = os.path.join(MODEL_DIR,
                                   "test-{}-{}.joblib".format(tag, model_name))
        print("... saving test version of model: {}".format(saved_model))
    else:
        saved_model = os.path.join(MODEL_DIR,
                                   "sl-{}-{}.joblib".format(tag, model_name))
        print("... saving model: {}".format(saved_model))

    joblib.dump(grid, saved_model)

    # elapsed wall-clock time as HHH:MM:SS
    m, s = divmod(time.time() - time_start, 60)
    h, m = divmod(m, 60)
    runtime = "%03d:%02d:%02d" % (h, m, s)

    # Forward the caller's flag: it was hard-coded to True, so every run was
    # logged as a test run.
    update_train_log(tag, (str(dates[0]), str(dates[-1])),
                     {'rmse': eval_rmse}, runtime, MODEL_VERSION,
                     MODEL_VERSION_NOTE, test=test)
def _model_train(df, tag, test=False, regressor=None):
    """
    Grid-search the selected scaled regressor and serialize the result.

    The 'test' flag when set to 'True' subsets the data to a random 30% and
    serializes a test version.

    Parameters
    ----------
    df : object
        Input handed to ``engineer_features``.
    tag : str
        Label embedded in the saved file name.
    test : bool
        See above.
    regressor : str or None
        Case-insensitive model name, ``'randomforest'`` or ``'extratrees'``.
        ``None`` selects ``'randomforest'``.

    Raises
    ------
    Exception
        If ``regressor`` is not one of the available names.
    """
    # Models available for training
    regressorsList = {
        'randomforest': RandomForestRegressor(),
        'extratrees': ExtraTreesRegressor()
    }

    # The signature default is None, which used to crash on `.lower()`.
    if regressor is None:
        regressor = 'randomforest'

    if regressor.lower() not in regressorsList:
        raise Exception(
            "Regressor with name '{}' not found (available: {})".format(
                regressor, ', '.join(regressorsList)))

    regressor = regressor.lower()  # match is case insensitive

    ## start timer for runtime
    time_start = time.time()

    X, y, dates = engineer_features(df)

    if test:
        n_samples = int(np.round(0.3 * X.shape[0]))
        subset_indices = np.random.choice(np.arange(X.shape[0]), n_samples,
                                          replace=False).astype(int)
        mask = np.in1d(np.arange(y.size), subset_indices)
        y = y[mask]
        X = X[mask]
        dates = dates[mask]

    ## Perform a train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25,
                                                        shuffle=True,
                                                        random_state=42)

    ## train a regression model
    param_grid_rf = {
        'reg__criterion': ['mse', 'mae'],
        'reg__n_estimators': [10, 15, 20, 25]
    }
    pipe_rf = Pipeline(steps=[('scaler', StandardScaler()),
                              ('reg', regressorsList[regressor])])

    # NOTE(review): `iid` was removed from GridSearchCV in scikit-learn 0.24;
    # confirm the pinned version before upgrading.
    grid = GridSearchCV(pipe_rf, param_grid=param_grid_rf, cv=5, iid=False,
                        n_jobs=-1)
    print("... using model: {}".format(regressor))
    grid.fit(X_train, y_train)
    y_pred = grid.predict(X_test)
    eval_rmse = round(np.sqrt(mean_squared_error(y_test, y_pred)))

    ## retrain using all data
    grid.fit(X, y)
    model_name = re.sub(r"\.", "_", str(MODEL_VERSION))
    if test:
        saved_model = os.path.join(MODEL_DIR,
                                   "test-{}-{}.joblib".format(tag, model_name))
        print("... saving test version of model: {}".format(saved_model))
    else:
        saved_model = os.path.join(MODEL_DIR,
                                   "sl-{}-{}.joblib".format(tag, model_name))
        print("... saving model: {}".format(saved_model))

    joblib.dump(grid, saved_model)

    # elapsed wall-clock time as HHH:MM:SS
    m, s = divmod(time.time() - time_start, 60)
    h, m = divmod(m, 60)
    runtime = "%03d:%02d:%02d" % (h, m, s)