def trainClf(clf, param_grid_rf):
    """Grid-search *clf* on the 'united_kingdom' series and report RMSE.

    The raw training data is ingested from ``./data/cs-train``, converted to
    per-country time series, and the features engineered for the
    ``'united_kingdom'`` entry are split 75/25 into train and test sets.
    *clf* is wrapped in a pipeline behind a ``StandardScaler`` and tuned with
    a 5-fold ``GridSearchCV``.

    Args:
        clf: estimator placed in the pipeline under the step name ``'clf'``.
        param_grid_rf: parameter grid handed to ``GridSearchCV``; since the
            estimator lives in a pipeline, keys are expected to use the
            ``clf__<param>`` form.

    Returns:
        The fitted ``GridSearchCV`` instance.

    Side effects:
        Prints the rounded test RMSE, the rounded train RMSE and the best
        estimator found by the search.
    """
    import inspect

    base_dir = os.path.join('.', 'data')
    data_dir = os.path.join(base_dir, 'cs-train')
    work_dir = os.path.join(base_dir, 'work-data')

    aDf = ingestTrainData(data_dir)
    data = getAllTS(aDf, work_dir)
    X, y, _ = engineer_features(data['united_kingdom'])

    # Perform a train-test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, shuffle=True, random_state=42)

    # Scale the features, then fit the supplied estimator
    pipe_rf = Pipeline(steps=[('scaler', StandardScaler()), ('clf', clf)])

    search_kwargs = {'param_grid': param_grid_rf, 'cv': 5, 'n_jobs': -1}
    # `iid` was removed from GridSearchCV in scikit-learn 0.24 and passing it
    # there raises TypeError. Only forward it to releases that still accept
    # it, so older installations keep the exact behaviour they had before.
    if 'iid' in inspect.signature(GridSearchCV).parameters:
        search_kwargs['iid'] = False
    grid = GridSearchCV(pipe_rf, **search_kwargs)
    grid.fit(X_train, y_train)

    y_train_pred = grid.predict(X_train)
    y_pred = grid.predict(X_test)
    eval_train_rmse = round(np.sqrt(mean_squared_error(y_train, y_train_pred)))
    eval_rmse = round(np.sqrt(mean_squared_error(y_test, y_pred)))

    print('eval_rmse_test', eval_rmse)
    print('eval_rmse_train', eval_train_rmse)
    print(grid.best_estimator_)
    return grid
def load_data(inp_dir=None, work_dir=None, training=None):
    """Build the engineered feature set for every country time series.

    Args:
        inp_dir: directory passed to ``ingestTrainData``.
        work_dir: directory passed to ``getAllTS``.
        training: forwarded to ``engineer_features`` as ``training=``.

    Returns:
        dict mapping each key of the time-series collection to a dict with
        the entries ``"X"``, ``"y"`` and ``"dates"``; ``"dates"`` is a numpy
        array holding the string form of each date.
    """

    def _bundle(frame):
        # Engineer features for one series and stringify its dates.
        features, target, raw_dates = engineer_features(frame, training=training)
        date_strings = np.array([str(stamp) for stamp in raw_dates])
        return {"X": features, "y": target, "dates": date_strings}

    raw_frame = ingestTrainData(inp_dir)
    series_by_country = getAllTS(raw_frame, work_dir)
    return {
        country: _bundle(frame)
        for country, frame in series_by_country.items()
    }
def getFeatures():
    """Return ``(X, y, dates)`` engineered from the combined time series.

    The series is read from ``<data_dir>/work-data/ts-data-all.csv`` when that
    file exists; otherwise it is rebuilt by ingesting ``./data`` and passing
    the result to ``getTimeSeries``.

    NOTE(review): ``data_dir`` is not assigned inside this function, so it is
    resolved from an enclosing or module scope - confirm it is defined there.
    """
    cached_series = join(join(data_dir, 'work-data'), 'ts-data-all.csv')

    if exists(cached_series):
        # Re-use the time series cached on disk.
        series = pd.read_csv(cached_series)
    else:
        # No cache yet: create the time series from the raw data.
        series = getTimeSeries(ingestTrainData(join('.', 'data')))

    features, target, stamps = engineer_features(series)
    return features, target, stamps
def model_train(data_dir, test=False, model_dir=None, force_data_load=True):
    """
    Train one model per time series found under *data_dir*.

    Args:
        data_dir: base data directory. Raw training files are read from
            ``<data_dir>/cs-train`` and the working directory is
            ``<data_dir>/<TSDIR>``.
        test: when true, only the ``'all'`` and ``'united_kingdom'`` series
            are trained; the flag is also forwarded to ``_model_train``.
        model_dir: directory forwarded to ``_model_train``; falls back to
            ``MODEL_DIR`` when empty or ``None``. It is created if missing.
        force_data_load: when true the raw data is re-ingested with
            ``ingestTrainData``; otherwise ``train-data-cleaned.csv`` is read
            from the working directory.

    Returns:
        None. Each series is handed to ``_model_train``.
    """
    work_dir = os.path.join(data_dir, TSDIR)
    inp_dir = os.path.join(data_dir, 'cs-train')

    if not model_dir:
        model_dir = MODEL_DIR
    # Create the directory that is actually handed to _model_train. The
    # previous check only ever looked at MODEL_DIR, so a caller-supplied
    # directory was never created.
    os.makedirs(model_dir, exist_ok=True)

    if test:
        print("running training in test mode only uk will be trained")

    if force_data_load:
        idf = ingestTrainData(inp_dir)
    else:
        idf = pd.read_csv(os.path.join(work_dir, 'train-data-cleaned.csv'))

    # fetch time-series formatted data
    ts_data = getAllTS(idf, work_dir)

    # train a different model for each data set
    for country, df in ts_data.items():
        if test and country not in ('all', 'united_kingdom'):
            continue
        _model_train(df, country, model_dir=model_dir, test=test)
def _record_monitor_mse(statistics_path, date_key, mse):
    """Insert or overwrite the MSE stored for *date_key* in the statistics CSV.

    Only the ``date`` and ``mse`` columns are kept. Stray index columns that
    earlier runs wrote into the file are dropped on load, and the file is
    written without an index so it no longer grows an extra column per run.
    """
    columns = ['date', 'mse']
    if exists(statistics_path):
        stats = pd.read_csv(statistics_path).reindex(columns=columns)
    else:
        stats = pd.DataFrame(columns=columns)

    same_day = stats['date'] == date_key
    if same_day.any():
        stats.loc[same_day, 'mse'] = mse
    else:
        stats.loc[len(stats)] = [date_key, mse]
    stats.to_csv(statistics_path, index=False)


def monitoring():
    """Score the 'all' model on every known date and persist the results.

    Steps:
      1. Load the combined time series from
         ``./data/work-data/ts-data-all.csv``, or rebuild it from ``./data``
         when the file is missing, and engineer features from it.
      2. Call ``model_predict('all', ...)`` for every date and collect the
         prediction, the target value and their absolute difference.
      3. Write the per-date results to
         ``./monitor/model-monitoring-<Y>-<M>-<D>.csv`` and a plot of the
         error distribution to the matching ``.png``.
      4. Store the mean squared error under today's date in
         ``./monitor/monitor_statistics.csv``.

    Dates whose prediction raised are kept in the results CSV with empty
    ``y_pred``/``diff`` values but are excluded from the plot and the MSE.

    Returns:
        None.
    """
    data_dir = join('.', 'data')
    model_dir = join('.', 'models')
    monitor_dir = join('.', 'monitor')
    os.makedirs(monitor_dir, exist_ok=True)

    # load time series
    work_dir = join(data_dir, 'work-data')
    ts_file_path = join(work_dir, 'ts-data-all.csv')
    if exists(ts_file_path):
        df = pd.read_csv(ts_file_path)
    else:
        # create time series
        df = getTimeSeries(ingestTrainData(join('.', 'data')))
    _, y, dates = engineer_features(df)

    all_data, all_models = model_load(training=False, data_dir=data_dir,
                                      model_dir=model_dir, test=False)

    # Collect plain dicts and build the frame once: DataFrame.append copied
    # the whole frame on every iteration and was removed in pandas 2.0.
    rows = []
    for idx, d in enumerate(dates):
        date = pd.to_datetime(d)
        yt = y[idx]
        y_pred = None
        diff = None
        try:
            answ = model_predict('all', str(date.year), str(date.month),
                                 str(date.day), test=False,
                                 all_data=all_data, all_models=all_models)
        except Exception as exc:
            # Best effort: report the failure and keep the row unscored.
            print('system error:' + str(exc))
        else:
            y_pred = answ['y_pred'][0]
            diff = abs(y_pred - yt)
        rows.append({'date': date, 'y_pred': y_pred, 'y': yt, 'diff': diff})

    results = pd.DataFrame(rows, columns=['date', 'y_pred', 'y', 'diff'])

    today = dt.datetime.today()
    monname = "model-monitoring-{}-{}-{}".format(today.year, today.month, today.day)
    results.to_csv(join(monitor_dir, monname + ".csv"))

    # Only rows with a successful prediction can be plotted and scored.
    scored = results.dropna(subset=['y_pred', 'diff'])
    if scored.empty:
        print('monitoring: no successful predictions, statistics not updated')
        return

    fig, ax = plt.subplots(1, 1)
    fig.set_size_inches(15, 8)
    ax.set_title('prediction error distribution')
    sns.distplot(scored['diff'].astype(float), bins=50, color='#008899', ax=ax)
    fig.savefig(join(monitor_dir, monname) + '.png', dpi=200)
    plt.close(fig)  # release the figure once it is on disk

    statistics_path = join(monitor_dir, 'monitor_statistics.csv')
    # NOTE(review): '%y' yields a two-digit year, which is not ISO-8601.
    # Left unchanged because existing statistics files are keyed on it.
    today_iso = today.strftime('%y-%m-%d')
    mse = mean_squared_error(scored['y'].astype(float).values,
                             scored['y_pred'].astype(float).values)
    _record_monitor_mse(statistics_path, today_iso, mse)