def model_predict(year, month, day, country='all'):
    """
    Make predictions based on a country.
    """
    ## start timer for runtime
    time_start = time.time()

    ## load all data and engineer features for every country
    ts = load_ts()
    eng_datasets = {c: engineer_features(ts[c], training=False)
                    for c in ts.keys()}

    ## load all models
    models = model_load()

    ## check that a model exists for this country
    if country not in models:
        raise Exception(
            f"ERROR: (model_predict) model for country '{country}' is unavailable.")

    ## check that a dataset exists for this country
    if country not in eng_datasets:
        raise Exception(
            f"ERROR: (model_predict) dataset for country '{country}' is unavailable.")

    ## select the data and model for this country
    model = models[country]
    eng_dataset = eng_datasets[country]

    ## build the target date string (zero-padded month and day)
    target_date = f"{year}-{str(month).zfill(2)}-{str(day).zfill(2)}"
    print(target_date)

    ## data to predict on
    X_pred = eng_dataset[eng_dataset['dates'] == target_date].drop(
        ['target', 'dates'], axis=1)

    ## prediction
    y_pred = model.predict(X_pred)
    _update_predict_log(tag=country, y_pred=y_pred, target_date=target_date,
                        MODEL_VERSION=MODEL_VERSION,
                        MODEL_VERSION_NOTE=MODEL_VERSION_NOTE)

    return y_pred
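
## Usage sketch (hedged): predict revenue for a single day. This assumes a
## model has been trained and saved for 'all' and that the engineered
## dataset contains the target date; the date below is illustrative only.
def example_predict():
    y_pred = model_predict(2019, 7, 1, country='all')
    print(f"predicted value(s): {y_pred}")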
def model_load(prefix='sl', data_dir=None, training=True, save_pickle=False):
    """
    example function to load model

    The prefix allows the loading of different models
    """
    if not data_dir:
        data_dir = os.path.join(".", "data", "cs-train")

    ## find saved models matching the prefix
    models = [f for f in os.listdir(os.path.join(".", "models"))
              if re.search(prefix, f)]

    if len(models) == 0:
        raise Exception(
            f"Models with prefix '{prefix}' cannot be found; did you train?")

    all_models = {}
    for model in models:
        all_models[re.split("-", model)[1]] = joblib.load(
            os.path.join(".", "models", model))

    ## load data
    ts_data = fetch_ts(data_dir)
    all_data = {}
    for country, df in ts_data.items():
        X, y, dates = engineer_features(df, training=training)
        dates = np.array([str(d) for d in dates])
        all_data[country] = {"X": X, "y": y, "dates": dates}

    if save_pickle:
        version_ = re.sub(r"\.", "_", str(MODEL_VERSION))
        pickle_path = os.path.join("models", f"all_data_model-{version_}.pickle")
        with open(pickle_path, "wb") as f:
            pickle.dump((all_data, all_models), f)
        print('Pickle file saved.')

    return (all_data, all_models)
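
## Usage sketch (hedged): load data and models once at start-up so request
## handlers can reuse them. The function name is an illustrative assumption,
## not part of the original module.
def warm_start_cache(prefix='sl', data_dir=None):
    all_data, all_models = model_load(prefix=prefix, data_dir=data_dir)
    print(f"loaded {len(all_models)} models for: {sorted(all_models.keys())}")
    return all_data, all_models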
def model_compare(data_dir, country='United Kingdom'):
    '''
    Train all models for one country using grid search and return a
    DataFrame that compares the performance of the models.
    '''
    print('Ingesting data')
    df = fetch_data(data_dir)
    df_country = convert_to_ts(df, country)
    X, y, dates = engineer_features(df_country)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, shuffle=True, random_state=42)

    ## define all pipelines and the param grids
    pipe_lr = Pipeline(steps=[('scaler', StandardScaler()), ('lr', ElasticNet())])
    pipe_sgd = Pipeline(steps=[('scaler', StandardScaler()), ('sgd', SGDRegressor())])
    pipe_svr = Pipeline(steps=[('scaler', StandardScaler()), ('svr', SVR())])
    pipe_rf = Pipeline(steps=[('scaler', StandardScaler()),
                              ('rf', RandomForestRegressor())])
    pipe_gbt = Pipeline(steps=[('scaler', StandardScaler()),
                               ('gbt', GradientBoostingRegressor())])

    param_grid_lr = {
        'lr__max_iter': [10000],
        'lr__alpha': np.logspace(-3, 0, 5),
        'lr__l1_ratio': np.linspace(0, 1, 5)
    }
    param_grid_sgd = {
        'sgd__penalty': ['elasticnet'],
        'sgd__alpha': np.logspace(-4, 1, 5),
        'sgd__l1_ratio': np.linspace(0, 1, 5),
        'sgd__max_iter': np.linspace(50, 250, 5, dtype='int'),
        'sgd__learning_rate': ['optimal', 'invscaling']
    }
    param_grid_svr = {
        'svr__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
        'svr__C': np.logspace(-2, 2, 5),
        'svr__gamma': np.logspace(-3, 0, 4),
    }
    param_grid_rf = {
        'rf__n_estimators': np.linspace(25, 100, 4, dtype='int'),
        'rf__max_depth': np.linspace(6, 15, 4, dtype='int'),
        'rf__min_samples_split': np.linspace(2, 8, 4, dtype='int')
    }
    param_grid_gbt = {
        'gbt__learning_rate': np.logspace(-3, -1.5, 5),
        'gbt__n_estimators': np.linspace(25, 100, 4, dtype='int'),
        'gbt__max_depth': np.linspace(6, 15, 4, dtype='int'),
        'gbt__min_samples_split': np.linspace(2, 8, 4, dtype='int'),
    }

    all_pipes = {
        pipe_lr: param_grid_lr,
        pipe_sgd: param_grid_sgd,
        pipe_svr: param_grid_svr,
        pipe_rf: param_grid_rf,
        pipe_gbt: param_grid_gbt
    }

    ## train each model
    time_start_all = time.time()
    results = []
    for pipe in all_pipes:
        time_start = time.time()
        pipe_name = '->'.join([step[0] for step in pipe.steps])
        print(f'Training {pipe_name}')
        grid = GridSearchCV(pipe, param_grid=all_pipes[pipe], cv=5,
                            n_jobs=-1, verbose=0)
        grid.fit(X_train, y_train)
        y_pred = grid.predict(X_test)
        run_time = time.time() - time_start
        rmse = mse(y_test, y_pred, squared=False)

        ## divide the total run time by the number of possible param combinations
        comb = 1
        for param in all_pipes[pipe].values():
            comb = comb * len(param)
        avg_time = run_time / comb

        results.append([pipe_name, rmse, run_time, avg_time, grid.best_params_])

    run_time_all = time.time() - time_start_all
    print(f'Training finished! Total training time {round(run_time_all)}s')

    df = pd.DataFrame(results, columns=[
        'pipeline', 'test_rmse', 'total_time', 'avg_time', 'best_params'
    ])
    return df
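
## Usage sketch (hedged): run the comparison and rank pipelines by hold-out
## RMSE. The data directory path is an illustrative assumption.
def example_compare(data_dir=os.path.join('.', 'data', 'cs-train')):
    comparison = model_compare(data_dir, country='United Kingdom')
    print(comparison.sort_values('test_rmse').to_string(index=False))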
def _model_train(df, tag, test=False):
    """
    example function to train model

    The 'test' flag when set to 'True':
        (1) subsets the data and serializes a test version
        (2) specifies the use of the 'test' log file
    """
    ## start timer for runtime
    time_start = time.time()

    X, y, dates = engineer_features(df)

    if test:
        n_samples = int(np.round(0.3 * X.shape[0]))
        subset_indices = np.random.choice(np.arange(X.shape[0]),
                                          n_samples,
                                          replace=False).astype(int)
        mask = np.in1d(np.arange(y.size), subset_indices)
        y = y[mask]
        X = X[mask]
        dates = dates[mask]

    ## perform a train-test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, shuffle=True, random_state=42)

    ## train a random forest model
    param_grid_rf = {
        'rf__n_estimators': np.linspace(25, 100, 4, dtype='int'),
        'rf__max_depth': np.linspace(6, 15, 4, dtype='int'),
        'rf__min_samples_split': np.linspace(2, 8, 4, dtype='int')
    }
    pipe_rf = Pipeline(steps=[('scaler', StandardScaler()),
                              ('rf', RandomForestRegressor())])
    grid = GridSearchCV(pipe_rf, param_grid=param_grid_rf, cv=5, n_jobs=-1)
    grid.fit(X_train, y_train)
    y_pred = grid.predict(X_test)
    eval_rmse = round(np.sqrt(mse(y_test, y_pred)))

    ## retrain using all data
    grid.fit(X, y)
    model_name = re.sub(r"\.", "_", str(MODEL_VERSION))
    if test:
        saved_model = os.path.join(MODEL_DIR, f"test-{tag}-{model_name}.joblib")
        print(f"... saving test version of model: {saved_model}")
    else:
        saved_model = os.path.join(MODEL_DIR, f"sl-{tag}-{model_name}.joblib")
        print(f"... saving model: {saved_model}")
    joblib.dump(grid, saved_model)

    m, s = divmod(time.time() - time_start, 60)
    h, m = divmod(m, 60)
    runtime = "%03d:%02d:%02d" % (h, m, s)

    ## update the train log
    update_train_log(tag, (str(dates[0]), str(dates[-1])), {'rmse': eval_rmse},
                     runtime, MODEL_VERSION, MODEL_VERSION_NOTE, test=test)
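
## Driver sketch (hedged): train one model per country so that model_load
## can later discover them by country tag. `fetch_ts` and `MODEL_DIR` are
## assumed to exist as used elsewhere in this module; this wrapper is an
## illustration, not the original training entry point.
def model_train(data_dir, test=False):
    if not os.path.isdir(MODEL_DIR):
        os.mkdir(MODEL_DIR)
    for country, df in fetch_ts(data_dir).items():
        _model_train(df, country, test=test)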
def _model_train(dataset, tag, test=TEST):
    """
    Train models and select the best one out of DecisionTreeRegressor,
    RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
    and XGBRegressor, using the engineered time-series datasets.
    """
    ## start timer for runtime
    time_start = time.time()

    dataset = engineer_features(dataset, training=True)
    X = dataset.drop(['target', 'dates'], axis=1)
    y = dataset.target

    ## train-test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.33, random_state=0)

    ## parameter grids, one per regressor
    params = {
        'DTR_P': {'criterion': ['mse', 'mae', 'friedman_mse'],
                  'max_depth': [None, 10, 20, 50],
                  'max_features': ['auto', 'sqrt', 'log2']},
        'RFR_P': {'criterion': ['mse', 'mae'],
                  'max_features': ['auto', 'sqrt']},
        'GBR_P': {'loss': ['ls', 'lad', 'huber', 'quantile'],
                  'learning_rate': [0.1, 0.01, 0.001]},
        'ADA_P': {'loss': ['linear', 'square'],
                  'learning_rate': [0.05, 0.1, 0.01]},
        'XGB_P': {'learning_rate': [0.05, 0.1, 0.01],
                  'max_depth': [1, 5, 50],
                  'n_estimators': [100, 500, 1000]}
    }

    regressor_dict = {
        'DTR': DecisionTreeRegressor(random_state=42),
        'RFR': RandomForestRegressor(random_state=42),
        'GBR': GradientBoostingRegressor(random_state=42),
        'ADA': AdaBoostRegressor(random_state=42),
        'XGB': xgb.XGBRegressor(random_state=42)
    }

    ## grid-search each regressor
    models = {}
    for model_name in regressor_dict:
        grid = GridSearchCV(regressor_dict[model_name],
                            param_grid=params[model_name + '_P'],
                            cv=5)
        grid.fit(X_train, y_train)
        models[model_name] = grid

    ## evaluate each model on the hold-out set to find the optimal one
    model_scores = []
    for model in models:
        y_pred = models[model].predict(X_test)
        rmse = np.sqrt(mse(y_test, y_pred))
        model_scores.append(rmse)

    model_index = np.argmin(model_scores)
    model_score = min(model_scores)
    model_name = list(models.keys())[model_index]
    best_model = list(models.values())[model_index]
    print(f'The best model for {tag} is {model_name}.')

    ## retrain the best model on all data
    best_model.fit(X, y)

    ## save the model
    if not os.path.isdir(MODEL_DIR):
        os.mkdir(MODEL_DIR)
    if test:
        saved_model = os.path.join(MODEL_DIR, f'test-{tag}-{model_name}.joblib')
    else:
        saved_model = os.path.join(MODEL_DIR, f'sl-{tag}-{model_name}.joblib')
    joblib.dump(best_model, saved_model)

    m, s = divmod(time.time() - time_start, 60)
    h, m = divmod(m, 60)
    runtime = "%03d:%02d:%02d" % (h, m, s)

    ## update the train log
    _update_train_log(tag, best_model, model_index, model_score, dataset.shape,
                      runtime, MODEL_VERSION, MODEL_VERSION_NOTE, test)
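
## Driver sketch (hedged): train and select a best model for every country.
## `load_ts` is assumed to return a dict of country -> DataFrame, as it is
## used in model_predict above; illustrative only.
def model_train_all(test=TEST):
    for country, df in load_ts().items():
        _model_train(df, country, test=test)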