def objective(params):
    """
    Score one hyper-parameter set with a 5-fold cross-validated XGBoost regressor.
    params -- dictionary of XGBRegressor keyword arguments; numeric entries are cast
              in place to the type the regressor expects.
    return -- the 'mae' entry of the regression metrics (the value to minimise).
    """
    # training data lives at module level
    global X, y, best

    # numeric hyper-parameters, cast in this exact order
    numeric_casts = (
        ('learning_rate', float),
        ('max_depth', int),
        ('min_child_weight', float),
        ('subsample', float),
        ('gamma', float),
        ('colsample_bytree', float),
        ('n_estimators', int),
        ('reg_alpha', float),
        ('reg_lambda', float),
    )
    for key, cast in numeric_casts:
        params[key] = cast(params[key])

    # remaining settings are passed through untouched; the lookup still raises
    # KeyError when one of them is missing
    passthrough_keys = ('objective', 'eval_metric', 'nthread', 'booster',
                        'tree_method', 'silent')
    for key in passthrough_keys:
        params[key] = params[key]

    # out-of-fold predictions for every sample
    model = xgb.XGBRegressor(n_jobs=2, **params)
    predictions = cross_val_predict(model, X, y, method='predict', cv=5)
    mae = metrics_regression(y, predictions)['mae']

    # trace
    print("############### Score: {0}".format(mae))
    print("############### Prms: ", params)
    print('..........................')
    return mae
def metrics(self, y: 'array', y_hat: 'array', msg: str = '') -> dict:
    """
    Compute the regression metrics of a prediction and print a one-line summary.
    y -- real data.
    y_hat -- predicted data.
    msg -- word or sentence shown between parentheses in the printed summary.
    return -- dictionary of metrics as returned by metrics_regression.
    """
    # the number of features is taken from the stored X_ attribute
    n_features = self.X_.shape[1]
    scores = metrics_regression(y, y_hat, n_features)
    # report
    summary = '[info] Metrics(%s): bias = %.3f mae = %.3f r2 = %.3f' % (
        msg, scores['bias'], scores['mae'], scores['r2'])
    print(summary)
    return scores
def main(exp_id, alpha, l1_ratio):
    """
    Launch an experiment and track its parameters and metrics with mlflow.
    exp_id -- id of the experiment; must be one of the ids in `lidexp`.
    alpha -- model argument, forwarded to the launcher and logged.
    l1_ratio -- model argument, forwarded to the launcher and logged.
    """
    warnings.filterwarnings("ignore")

    # guard: unknown experiment id
    if exp_id not in lidexp:
        click.secho('[error] an experiment with id "%s" is not available.' %
                    exp_id,
                    bold=True,
                    fg='red')
        quit('Aborted!')
        # nothing else is executed for an unknown id
        return

    # header
    header = 'Experiment: id=%s name=%s' % (exp_id, didexp[exp_id]['name'])
    click.secho(header, bold=True, underline=True, bg='blue')

    # confirmation (aborts when the answer is negative)
    click.confirm('Do you want to continue?',
                  default=False,
                  abort=True,
                  prompt_suffix=': ',
                  show_default=True,
                  err=False)

    # ARGUMENTS: EXPERIMENT
    click.secho('[arg] alpha = %s' % alpha, fg='green')
    click.secho('[arg] l1_ratio = %s' % l1_ratio, fg='green')

    # runner
    with mlflow.start_run(experiment_id=exp_id):
        # MODEL
        from experiments import model
        test_y, test_yhat, k = model.launcher(dfdata, alpha, l1_ratio)

        # SCORES
        scores = metrics_regression(test_y, test_yhat, k)

        # print metrics
        click.echo("[info] Metrics: ")
        report_lines = (
            (" BIAS: %s", 'bias'),
            (" MAE: %s", 'mae'),
            (" R2: %s", 'r2'),
        )
        for template, key in report_lines:
            click.secho(template % scores[key], fg='blue')

        # tracking
        mlflow.log_param("alpha", alpha)
        mlflow.log_param("l1_ratio", l1_ratio)
        for key in ("bias", "mae", "r2"):
            mlflow.log_metric(key, scores[key])
# replace training dataset X = X_train y = y_train """ ESTIMATOR WITH BAYESIAN TUNING """ from hpsklearn import HyperoptEstimator, any_regressor, any_preprocessing from hyperopt import tpe # Instantiate a HyperoptEstimator with the search space and number of evaluations clf = HyperoptEstimator(regressor=any_regressor('my_clf'), preprocessing=any_preprocessing('my_pre'), algo=tpe.suggest, max_evals=250, trial_timeout=300) clf.fit(X, y) print(clf.best_model()) y_hat = clf.predict(X_test) dscores = metrics_regression(y_test, y_hat, X.shape[1]) tf = t.since('test') print( '\nBayesian tuning -test: bias = %.3f mae = %.3f r2 = %.3f (time: %s)' % (dscores['bias'], dscores['mae'], dscores['r2'], format_duration(tf))) # training y_hat = clf.predict(X) dscores = metrics_regression(y, y_hat, X.shape[1]) print( 'Bayesian tuning - train: bias = %.3f mae = %.3f r2 = %.3f (time: %s)' % (dscores['bias'], dscores['mae'], dscores['r2'], format_duration(tf)))
def _pie_share(value, scale):
    """
    Return |value| / scale limited to the range [0, 1].
    value -- quantity to be represented (its absolute value is used).
    scale -- reference the quantity is compared with.
    return -- share as a float; 0.0 when scale is not positive or the ratio is NaN.
    """
    # a non-positive (or NaN) reference gives no meaningful share
    if not scale > 0:
        return 0.0
    ratio = np.abs(value) / scale
    if np.isnan(ratio):
        return 0.0
    # a share above 1 would make the complementary wedge negative
    return float(min(ratio, 1.0))


def _draw_share_pie(ax, value, scale, title, colors, startangle):
    """
    Draw a two-wedge pie: the share of `value` over `scale` and its complement.
    ax -- matplotlib axes to draw on.
    value -- quantity to be represented; printed with 3 decimals as wedge label.
    scale -- reference used to convert the quantity into a share.
    title -- title of the axes.
    colors -- colors of the two wedges.
    startangle -- rotation of the start of the pie.
    """
    share = _pie_share(value, scale)
    ax.pie([share * 100., (1. - share) * 100.],
           explode=(0.1, 0),
           labels=('%.3f' % value, ''),
           autopct='%1.3f%%',
           shadow=False,
           startangle=startangle,
           textprops=dict(fontsize=18),
           colors=colors)
    ax.axis('equal')
    ax.set_title(title, fontsize=18)


def total_metrics(df: 'df', sobservation: str, sprediction: str, nX=None):
    """
    Plot an error analysis overview for whole data.
    df -- df where is included the data to be validated.
    sobservation -- column name of real data.
    sprediction -- column name of predicted data.
    nX -- number of features used to calculate the prediction (default None).

    The figure has six panels: three pies (BIAS and MAE relative to the maximum
    observation, R2 relative to 1), residues vs observation, a normal probability
    plot of the residues and the KDE of real vs predicted values. Pie shares are
    limited to [0, 1] so that out-of-range metrics (e.g. R2 < -1, or an error
    larger than the maximum observation) are drawn as a full pie instead of
    making the pie call fail on a negative wedge size.
    """
    import matplotlib.pyplot as plt
    import scipy.stats as stats

    # copy data (rows with a missing observation or prediction are dropped)
    data = df[[sobservation, sprediction]].dropna()
    observed = data[sobservation].values
    predicted = data[sprediction].values

    # calculate total metrics
    dmetrics = metrics_regression(observed, predicted, k=nX)
    bias, mae, r2 = dmetrics['bias'], dmetrics['mae'], dmetrics['r2']

    # calculate residues
    residues = observed - predicted
    res_avg, res_std = np.mean(residues), np.std(residues)

    # range of the observations: reference of the pies and extent of the lines
    obs_min, obs_max = np.min(observed), np.max(observed)

    # PLOT
    plt.figure(figsize=(15, 10))

    # pie1: BIAS
    ax1 = plt.subplot2grid((2, 3), (0, 0))
    _draw_share_pie(ax1, bias, obs_max, "BIAS", ['red', 'yellow'], 65)

    # pie2: MAE
    ax2 = plt.subplot2grid((2, 3), (0, 1))
    _draw_share_pie(ax2, mae, obs_max, "MAE", ['red', 'purple'], 65)

    # pie3: R2
    ax3 = plt.subplot2grid((2, 3), (0, 2))
    _draw_share_pie(ax3, r2, 1., "R2", ['green', 'red'], 265)

    # scatter: RESIDUES vs Y
    ax4 = plt.subplot2grid((2, 3), (1, 0))
    ax4.scatter(observed,
                residues,
                s=10,
                facecolors='none',
                edgecolors='black')
    ax4.hlines(res_avg,
               obs_min,
               obs_max,
               colors='red',
               linestyles='solid',
               label='average')
    ax4.hlines(res_avg + res_std,
               obs_min,
               obs_max,
               colors='red',
               linestyles='--',
               label='std')
    ax4.hlines(res_avg - res_std,
               obs_min,
               obs_max,
               colors='red',
               linestyles='--')
    ax4.legend(loc='best', fontsize=12, shadow=True)
    ax4.set_title("RESIDUES = %.3f +/- %.3f" % (res_avg, res_std),
                  fontsize=18)
    ax4.set_xlabel(sobservation, fontsize=14)
    ax4.set_ylabel('')
    ax4.set_facecolor('xkcd:white')

    # probplot: RESIDUES (vs theoretical Norm distribution)
    ax5 = plt.subplot2grid((2, 3), (1, 1))
    stats.probplot(residues, dist="norm", plot=ax5, fit=True)
    ax5.set_title("Probplot: RESIDUES\n(vs Norm Dist.)", fontsize=18)
    ax5.set_xlabel('Theoretical Quantiles', fontsize=14)
    ax5.set_ylabel('Ordered Values', fontsize=14)

    # kde: DISTRIBUTION (real vs prediction)
    ax6 = plt.subplot2grid((2, 3), (1, 2))
    data.rename(columns={
        sobservation: 'real',
        sprediction: 'prediction'
    }).plot(kind='kde', ax=ax6, style=['b--', 'r--'], linewidth=2.)
    ax6.set_title("KDE: real vs prediction", fontsize=18)
    ax6.set_xlabel('Values', fontsize=14)
    ax6.set_ylabel('Density', fontsize=14)

    # display
    plt.subplots_adjust(wspace=0.5)
    plt.show()
def per_reference_metrics(data: 'df',
                          sobservation: str,
                          sprediction: str,
                          sreference: str,
                          nX=None):
    """
    Compute and plot the regression scores for each value of a reference variable.
    data -- df where is included the data to be validated.
    sobservation -- column name of real data.
    sprediction -- column name of predicted data.
    sreference -- column name of the reference (categorical) variable.
    nX -- number of features used to calculate the prediction (default None).
    return -- dataframe indexed by the reference values with the columns
              'bias', 'mae', 'r2', 'res_avg' and 'res_std'.
    """
    # METRICS CALCULATION: one entry per (sorted) value of the reference variable
    collected = {
        name: []
        for name in ('level', 'bias', 'mae', 'r2', 'res_avg', 'res_std')
    }
    for level in sorted(data[sreference].unique()):
        # rows that belong to this value
        subset = data[data[sreference] == level]
        observed = subset[sobservation].values
        predicted = subset[sprediction].values
        # scores
        scores = metrics_regression(observed, predicted, k=nX)
        bias, mae, r2 = scores['bias'], scores['mae'], scores['r2']
        # residues
        errors = observed - predicted
        # store
        collected['level'].append(level)
        collected['bias'].append(bias)
        collected['mae'].append(mae)
        collected['r2'].append(r2)
        collected['res_avg'].append(np.mean(errors))
        collected['res_std'].append(np.std(errors))

    table = pd.DataFrame({
        sreference: collected['level'],
        'bias': collected['bias'],
        'mae': collected['mae'],
        'r2': collected['r2'],
        'res_avg': collected['res_avg'],
        'res_std': collected['res_std']
    })
    table = table.set_index(sreference)
    # infinite scores are turned into NaN
    table = table.replace([np.inf, -np.inf], np.nan)

    # PLOT PER THE REFERENCE VARIABLE: five stacked line panels sharing the ticks
    import matplotlib.pyplot as plt
    plt.figure(figsize=(20, 15))
    ticks = table.index.tolist()
    panels = (
        ('bias', 'BIAS', 'blue'),
        ('mae', 'MAE', 'orange'),
        ('r2', 'R2', 'green'),
        ('res_avg', 'RESIDUES (avg)', 'red'),
        ('res_std', 'RESIDUES (std)', 'red'),
    )
    # NOTE: per-point value annotations were disabled in this plot and are not drawn
    for row, (column, title, color) in enumerate(panels):
        ax = plt.subplot2grid((5, 1), (row, 0))
        values = table[column].tolist()
        ax.plot(ticks, values, linestyle='--', color=color, linewidth=3)
        ax.scatter(ticks, values, color=color)
        ax.set_title(title, fontsize=18)
        if column == 'res_std':
            # only the bottom panel carries the x label
            ax.set_xlabel(sreference, fontsize=14)
        ax.set_xticks(ticks)
        ax.set_xticklabels(ticks, fontsize=14)
        if column == 'r2':
            # R2 panel is shown on a fixed [0, 1] scale
            ax.set_ylim([0, 1])

    # display
    plt.subplots_adjust(hspace=0.6)
    plt.show()

    # return
    return table
def metrics(self, X, y):
    """
    Calculate the regression metrics of the estimator on the given data.
    X -- features; validated together with y by check_X_y.
    y -- real target values.
    return -- result of metrics_regression for y and the predictions on X.
    """
    # validation: check that X and y have correct shape
    checked_X, checked_y = check_X_y(X, y)
    # predict on the validated features and score against the validated target
    predictions = self.predict(checked_X)
    return metrics_regression(checked_y, predictions)
def main():
    """
    Tune an XGBoost regressor with hyperopt (TPE) on the 11h subset of the solar
    data and print its bias / mae / r2 on the test and training splits.
    """
    # init timer
    timer = Timer()
    timer.add('test')

    # DATA PREPARATION
    # load data
    frame, columns = solar.load()
    # select data
    targets = ['y']
    features = [
        'doy', 'hour', 'LCDC267', 'MCDC267', 'HCDC267', 'TCDC267',
        'logAPCP267', 'RH267', 'TMP267', 'DSWRF267'
    ]
    frame = frame[features + targets]
    columns = get_dcol(frame, ltarget=targets)
    # select one hour data
    hour = 11
    hourly = frame[frame.hour == hour]
    hourly.drop('hour', axis=1, inplace=True)
    hourly_columns = get_dcol(hourly, ltarget=['y'])
    # clean
    del frame, columns

    # filtering outliers (ghi vs power)
    from preprocessing.outliers import median2D
    outlier_flags = median2D.launch(hourly['DSWRF267'].values,
                                    hourly.y.values,
                                    percent=20.)
    hourly['isoutlier'] = outlier_flags
    # comparison with False kept as is: the type of the flags is not known here
    hourly = hourly[hourly.isoutlier == False]
    hourly.drop('isoutlier', axis=1, inplace=True)

    # prepare data (the target is scaled, the features are not)
    X = hourly[hourly_columns['lx']].values
    scaler = Scaler()
    y = scaler.fit_transform(hourly[hourly_columns['ly']].values).ravel()
    print('Prepared data: X: %s y: %s' % (X.shape, y.shape))
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.3,
                                                        random_state=42)
    print('Prepared data: X_train: %s y_train: %s' %
          (X_train.shape, y_train.shape))
    print('Prepared data: X_test: %s y_test: %s' %
          (X_test.shape, y_test.shape))
    # from here on only the training split is used to fit
    n_features = X_train.shape[1]

    # ESTIMATOR WITH BAYESIAN TUNING
    from hpsklearn import HyperoptEstimator, xgboost_regression
    from hyperopt import tpe
    # Instantiate a HyperoptEstimator with the search space and number of evaluations
    search = HyperoptEstimator(regressor=xgboost_regression('my_clf'),
                               preprocessing=[],
                               algo=tpe.suggest,
                               max_evals=250,
                               trial_timeout=300)
    search.fit(X_train, y_train)
    print(search.best_model())

    # test
    test_predictions = search.predict(X_test)
    test_scores = metrics_regression(y_test, test_predictions, n_features)
    elapsed = timer.since('test')
    print(
        '\nBayesian tuning -test: bias = %.3f mae = %.3f r2 = %.3f (time: %s)'
        % (test_scores['bias'], test_scores['mae'], test_scores['r2'],
           format_duration(elapsed)))

    # training (the elapsed time read above is printed again)
    train_predictions = search.predict(X_train)
    train_scores = metrics_regression(y_train, train_predictions, n_features)
    print(
        'Bayesian tuning - train: bias = %.3f mae = %.3f r2 = %.3f (time: %s)'
        % (train_scores['bias'], train_scores['mae'], train_scores['r2'],
           format_duration(elapsed)))