Exemplo n.º 1
0
def objective(params):
    """
    Score one hyper-parameter set with a cross-validated XGBRegressor.
    params -- dictionary of keyword arguments for xgb.XGBRegressor; it is
              updated in place (numeric entries are cast to float/int).
    return -- the 'mae' entry of metrics_regression, computed on the
              predictions of cross_val_predict with cv=5.
    """
    global X, y, best

    def unchanged(value):
        # identity caster: the entry is only read back, so a missing key
        # still raises KeyError
        return value

    # (key, caster) pairs, applied in this exact order
    casters = (
        ('learning_rate', float),
        ('max_depth', int),
        ('min_child_weight', float),
        ('subsample', float),
        ('gamma', float),
        ('colsample_bytree', float),
        ('n_estimators', int),
        ('reg_alpha', float),
        ('reg_lambda', float),
        ('objective', unchanged),
        ('eval_metric', unchanged),
        ('nthread', unchanged),
        ('booster', unchanged),
        ('tree_method', unchanged),
        ('silent', unchanged),
    )
    for key, cast in casters:
        params[key] = cast(params[key])

    # out-of-fold predictions over the module-level X, y
    model = xgb.XGBRegressor(n_jobs=2, **params)
    predictions = cross_val_predict(model, X, y, method='predict', cv=5)
    score = metrics_regression(y, predictions)['mae']

    # trace the evaluated point
    print("############### Score: {0}".format(score))
    print("############### Prms: ", params)
    print('..........................')

    return score
Exemplo n.º 2
0
 def metrics(self, y: 'array', y_hat: 'array', msg: str='')->dict:
     """
     Compute regression metrics and print a one-line summary.
     y -- real data.
     y_hat -- predicted data.
     msg -- word or sentence shown between parentheses in the printed line.
     return -- dictionary returned by metrics_regression; its 'bias', 'mae'
               and 'r2' entries are the ones printed.
     """
     # second dimension of the stored X_, passed as third argument
     n_columns = self.X_.shape[1]
     scores = metrics_regression(y, y_hat, n_columns)
     summary = (msg, scores['bias'], scores['mae'], scores['r2'])
     print('[info] Metrics(%s): bias = %.3f  mae = %.3f   r2 = %.3f' % summary)
     return scores
Exemplo n.º 3
0
def main(exp_id, alpha, l1_ratio):
    """
    Launch an experiment and track it with mlflow.
    exp_id -- experiment id; it must be contained in the module-level `lidexp`.
    alpha -- value forwarded to model.launcher and logged as parameter "alpha".
    l1_ratio -- value forwarded to model.launcher and logged as parameter "l1_ratio".
    raise -- SystemExit('Aborted!') when exp_id is not available.
    """
    warnings.filterwarnings("ignore")

    # validate experiment availability
    if exp_id not in lidexp:
        click.secho('[error] an experiment with id "%s" is not available.' % exp_id, bold=True, fg='red')
        # same effect as quit('Aborted!') without depending on the helper
        # that the `site` module injects for interactive sessions
        raise SystemExit('Aborted!')
    # header
    click.secho('Experiment: id=%s  name=%s' % (exp_id, didexp[exp_id]['name']), bold=True, underline=True, bg='blue')
    # confirmation (abort=True: click aborts when the answer is negative)
    click.confirm('Do you want to continue?', default=False, abort=True, prompt_suffix=': ', show_default=True, err=False)

    # arguments of the experiment
    click.secho('[arg] alpha = %s' % alpha, fg='green')
    click.secho('[arg] l1_ratio = %s' % l1_ratio, fg='green')

    # runner
    with mlflow.start_run(experiment_id=exp_id):

        # model
        from experiments import model
        test_y, test_yhat, k = model.launcher(dfdata, alpha, l1_ratio)

        # scores
        dmetrics = metrics_regression(test_y, test_yhat, k)
        metric_names = ('bias', 'mae', 'r2')
        # print metrics
        click.echo("[info] Metrics: ")
        for name in metric_names:
            click.secho("  %s: %s" % (name.upper(), dmetrics[name]), fg='blue')

        # tracking
        mlflow.log_param("alpha", alpha)
        mlflow.log_param("l1_ratio", l1_ratio)
        for name in metric_names:
            mlflow.log_metric(name, dmetrics[name])
    # NOTE(review): the statements from here on read X_train, y_train, X_test,
    # y_test and the timer `t`, none of which is assigned in the visible code
    # above -- presumably this is the tail of a separate example whose
    # beginning is missing; confirm before relying on it.
    # replace training dataset
    X = X_train
    y = y_train
    """ ESTIMATOR WITH BAYESIAN TUNING """

    # local imports: the hpsklearn estimator wrapper and hyperopt's `tpe` module
    from hpsklearn import HyperoptEstimator, any_regressor, any_preprocessing
    from hyperopt import tpe
    # Instantiate a HyperoptEstimator with the search space and number of evaluations
    clf = HyperoptEstimator(regressor=any_regressor('my_clf'),
                            preprocessing=any_preprocessing('my_pre'),
                            algo=tpe.suggest,
                            max_evals=250,
                            trial_timeout=300)

    # fit on the training split and print the best model found
    clf.fit(X, y)
    print(clf.best_model())
    # test: metrics on the held-out split; X.shape[1] is passed as third argument
    y_hat = clf.predict(X_test)
    dscores = metrics_regression(y_test, y_hat, X.shape[1])
    # time since the 'test' mark of the timer `t`
    tf = t.since('test')
    print(
        '\nBayesian tuning -test:  bias = %.3f  mae = %.3f  r2 = %.3f (time: %s)'
        %
        (dscores['bias'], dscores['mae'], dscores['r2'], format_duration(tf)))
    # training
    y_hat = clf.predict(X)
    dscores = metrics_regression(y, y_hat, X.shape[1])
    # NOTE(review): `tf` is reused here, so this line prints the same duration
    # as the test line above -- confirm that this is intended.
    print(
        'Bayesian tuning - train:  bias = %.3f  mae = %.3f  r2 = %.3f (time: %s)'
        %
        (dscores['bias'], dscores['mae'], dscores['r2'], format_duration(tf)))
Exemplo n.º 5
0
def _plot_metric_pie(ax, value, sizes, title, startangle, colors):
    """
    Draw a two-wedge pie for one metric on the given axes.
    ax -- matplotlib axes to draw on.
    value -- metric value, used as label of the first wedge.
    sizes -- the two wedge sizes, as percentages.
    title -- axes title.
    startangle -- start angle forwarded to ax.pie.
    colors -- the two wedge colors.
    """
    # Axes.pie rejects negative wedge sizes (e.g. r2 < -1, or an error larger
    # than the maximum observation); clamp so the figure is still drawn.
    # Values already inside [0, 100] are left untouched.
    sizes = np.clip(sizes, 0., 100.)
    ax.pie(sizes,
           explode=(0.1, 0),
           labels=('%.3f' % value, ''),
           autopct='%1.3f%%',
           shadow=False,
           startangle=startangle,
           textprops=dict(fontsize=18),
           colors=colors)
    ax.axis('equal')
    ax.set_title(title, fontsize=18)


def total_metrics(df: 'df', sobservation: str, sprediction: str, nX=None):
    """
    Plot an error analysis overview for whole data.
    df -- df where is included the data to be validated.
    sobservation -- column name of real data.
    sprediction -- column name of predicted data.
    nX -- number of features used to calculate the prediction (default None).

    The figure is a 2x3 grid: pies for BIAS, MAE and R2 on the first row;
    residues scatter, normal probability plot of the residues and KDE of real
    vs prediction on the second row. Rows with NaN in either column are
    dropped before any calculation. Nothing is returned.
    """
    # copy data
    data = df[[sobservation, sprediction]].dropna()
    observed = data[sobservation].values
    predicted = data[sprediction].values
    # calculate total metrics
    dmetrics = metrics_regression(observed, predicted, k=nX)
    bias, mae, r2 = dmetrics['bias'], dmetrics['mae'], dmetrics['r2']
    # calculate residues
    residues = observed - predicted
    res_avg, res_std = np.mean(residues), np.std(residues)
    # range of the observations (pie scaling and extent of the horizontal lines)
    obs_min, obs_max = np.min(observed), np.max(observed)

    # PLOT
    import matplotlib.pyplot as plt
    import scipy.stats as stats
    plt.figure(figsize=(15, 10))

    # pie1: BIAS (as percentage of the maximum observation)
    ax1 = plt.subplot2grid((2, 3), (0, 0))
    _plot_metric_pie(ax1, bias,
                     [np.abs(bias) * 100. / obs_max,
                      (obs_max - np.abs(bias)) * 100. / obs_max],
                     "BIAS", 65, ['red', 'yellow'])

    # pie2: MAE (as percentage of the maximum observation)
    ax2 = plt.subplot2grid((2, 3), (0, 1))
    _plot_metric_pie(ax2, mae,
                     [np.abs(mae) * 100. / obs_max,
                      (obs_max - np.abs(mae)) * 100. / obs_max],
                     "MAE", 65, ['red', 'purple'])

    # pie3: R2
    ax3 = plt.subplot2grid((2, 3), (0, 2))
    _plot_metric_pie(ax3, r2,
                     [np.abs(r2) * 100, (1 - np.abs(r2)) * 100.],
                     "R2", 265, ['green', 'red'])

    # scatter: RESIDUES vs Y
    ax4 = plt.subplot2grid((2, 3), (1, 0))
    ax4.scatter(observed,
                residues,
                s=10,
                facecolors='none',
                edgecolors='black')
    ax4.hlines(res_avg,
               obs_min,
               obs_max,
               colors='red',
               linestyles='solid',
               label='average')
    ax4.hlines(res_avg + res_std,
               obs_min,
               obs_max,
               colors='red',
               linestyles='--',
               label='std')
    ax4.hlines(res_avg - res_std,
               obs_min,
               obs_max,
               colors='red',
               linestyles='--')
    ax4.legend(loc='best', fontsize=12, shadow=True)
    ax4.set_title("RESIDUES = %.3f +/- %.3f" % (res_avg, res_std), fontsize=18)
    ax4.set_xlabel(sobservation, fontsize=14)
    ax4.set_ylabel('')
    ax4.set_facecolor('xkcd:white')

    # probplot: RESIDUES (vs theoretical Norm distribution)
    ax5 = plt.subplot2grid((2, 3), (1, 1))
    stats.probplot(residues, dist="norm", plot=ax5, fit=True)
    ax5.set_title("Probplot: RESIDUES\n(vs Norm Dist.)", fontsize=18)
    ax5.set_xlabel('Theoretical Quantiles', fontsize=14)
    ax5.set_ylabel('Ordered Values', fontsize=14)

    # kde: DISTRIBUTION (real vs prediction)
    ax6 = plt.subplot2grid((2, 3), (1, 2))
    data.rename(columns={
        sobservation: 'real',
        sprediction: 'prediction'
    }).plot(kind='kde', ax=ax6, style=['b--', 'r--'], linewidth=2.)
    ax6.set_title("KDE: real vs prediction", fontsize=18)
    ax6.set_xlabel('Values', fontsize=14)
    ax6.set_ylabel('Density', fontsize=14)

    # display
    plt.subplots_adjust(wspace=0.5)
    plt.show()
Exemplo n.º 6
0
def _plot_reference_line(ax, scores, column, title, color):
    """
    Draw one column of the scores dataframe against its index.
    ax -- matplotlib axes to draw on.
    scores -- dataframe indexed by the values of the reference variable.
    column -- name of the column to be drawn.
    title -- axes title.
    color -- color of both the dashed line and the markers.
    """
    xs = scores.index.tolist()
    ys = scores[column].tolist()
    ax.plot(xs, ys, linestyle='--', color=color, linewidth=3)
    ax.scatter(xs, ys, color=color)
    ax.set_title(title, fontsize=18)
    ax.set_xticks(xs)
    ax.set_xticklabels(xs, fontsize=14)


def per_reference_metrics(data: 'df',
                          sobservation: str,
                          sprediction: str,
                          sreference: str,
                          nX=None):
    """
    Plot metrics per each value of a reference categorical variable.
    data -- df where is included the data to be validated.
    sobservation -- column name of real data.
    sprediction -- column name of predicted data.
    sreference -- column name of the reference variable.
    nX -- number of features used to calculate the prediction (default None).
    return -- dataframe with scores per values of the reference variable
              (columns: bias, mae, r2, res_avg, res_std; infinite values are
              replaced by NaN).
    """

    # METRICS CALCULATION

    lvar_values = sorted(data[sreference].unique())
    # one list per score, filled in the order of the reference values
    scores = {name: [] for name in ('bias', 'mae', 'r2', 'res_avg', 'res_std')}
    for ivar_value in lvar_values:
        # collect data
        idata = data[data[sreference] == ivar_value]
        observed = idata[sobservation].values
        predicted = idata[sprediction].values
        # calculate metrics
        dmetrics = metrics_regression(observed, predicted, k=nX)
        # calculate residues
        iresidues = observed - predicted
        # store
        scores['bias'].append(dmetrics['bias'])
        scores['mae'].append(dmetrics['mae'])
        scores['r2'].append(dmetrics['r2'])
        scores['res_avg'].append(np.mean(iresidues))
        scores['res_std'].append(np.std(iresidues))
    resfolk = pd.DataFrame({sreference: lvar_values,
                            **scores}).set_index(sreference)
    # drop inf values
    resfolk = resfolk.replace([np.inf, -np.inf], np.nan)

    # PLOT PER THE REFERENCE VARIABLE
    import matplotlib.pyplot as plt
    plt.figure(figsize=(20, 15))

    # one row per score: (column, title, color)
    panels = (
        ('bias', 'BIAS', 'blue'),
        ('mae', 'MAE', 'orange'),
        ('r2', 'R2', 'green'),
        ('res_avg', 'RESIDUES (avg)', 'red'),
        ('res_std', 'RESIDUES (std)', 'red'),
    )
    axes = dict()
    for irow, (column, title, color) in enumerate(panels):
        axes[column] = plt.subplot2grid((len(panels), 1), (irow, 0))
        _plot_reference_line(axes[column], resfolk, column, title, color)
    # R2 panel is shown on a fixed [0, 1] range
    axes['r2'].set_ylim([0, 1])
    # only the bottom panel carries the x label
    axes['res_std'].set_xlabel(sreference, fontsize=14)

    # display
    plt.subplots_adjust(hspace=0.6)
    plt.show()

    # return
    return resfolk
Exemplo n.º 7
0
 def metrics(self, X, y):
     """
     Return the regression metrics of this estimator on X, y.
     X -- input data, validated together with y through check_X_y.
     y -- real data.
     return -- result of metrics_regression(real, predicted).
     """
     # validation: check that X and y have correct shape
     X_checked, y_checked = check_X_y(X, y)
     # predict on the validated inputs, then score against the real data
     y_pred = self.predict(X_checked)
     return metrics_regression(y_checked, y_pred)
Exemplo n.º 8
0
def main():
    """
    Tune an XGBoost regressor with HyperoptEstimator on one hour of solar data.

    Steps: load the data with solar.load(), keep the selected columns and the
    rows of a single hour, drop the rows flagged by median2D.launch, scale the
    target, split train/test (70/30, random_state=42), fit the estimator and
    print bias, mae and r2 for the test and the training split.
    """
    # init timer
    t = Timer()
    t.add('test')

    # DATA PREPARATION

    # load data
    data, dcol = solar.load()
    # select data
    ly = ['y']
    lx = [
        'doy', 'hour', 'LCDC267', 'MCDC267', 'HCDC267', 'TCDC267',
        'logAPCP267', 'RH267', 'TMP267', 'DSWRF267'
    ]
    data = data[lx + ly]
    dcol = get_dcol(data, ltarget=ly)
    # select one hour data; a non-inplace drop yields a new frame, so the
    # filtered slice of `data` is never modified in place
    hour = 11
    idata = data[data.hour == hour].drop('hour', axis=1)
    idcol = get_dcol(idata, ltarget=['y'])
    # clean
    del data
    del dcol

    # filtering outliers (ghi vs power)
    from preprocessing.outliers import median2D
    isoutlier = median2D.launch(idata['DSWRF267'].values,
                                idata.y.values,
                                percent=20.)
    # assign() returns a new frame instead of writing a column into a slice
    idata = idata.assign(isoutlier=isoutlier)
    # explicit comparison kept on purpose: only rows whose flag equals False
    # are retained, whatever the dtype of the flags is
    idata = idata[idata.isoutlier == False].drop('isoutlier', axis=1)

    # prepare data
    X = idata[idcol['lx']].values
    scaler = Scaler()
    y = scaler.fit_transform(idata[idcol['ly']].values).ravel()
    print('Prepared data: X: %s  y: %s' % (X.shape, y.shape))
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.3,
                                                        random_state=42)
    print('Prepared data: X_train: %s  y_train: %s' %
          (X_train.shape, y_train.shape))
    print('Prepared data: X_test: %s  y_test: %s' %
          (X_test.shape, y_test.shape))
    # replace training dataset
    X = X_train
    y = y_train

    # ESTIMATOR WITH BAYESIAN TUNING

    from hpsklearn import HyperoptEstimator, xgboost_regression
    from hyperopt import tpe
    # Instantiate a HyperoptEstimator with the search space and number of evaluations
    clf = HyperoptEstimator(regressor=xgboost_regression('my_clf'),
                            preprocessing=[],
                            algo=tpe.suggest,
                            max_evals=250,
                            trial_timeout=300)

    clf.fit(X, y)
    print(clf.best_model())
    # test
    y_hat = clf.predict(X_test)
    dscores = metrics_regression(y_test, y_hat, X.shape[1])
    tf = t.since('test')
    print(
        '\nBayesian tuning -test:  bias = %.3f  mae = %.3f  r2 = %.3f (time: %s)'
        %
        (dscores['bias'], dscores['mae'], dscores['r2'], format_duration(tf)))
    # training
    y_hat = clf.predict(X)
    dscores = metrics_regression(y, y_hat, X.shape[1])
    # NOTE(review): `tf` is reused, so both lines print the same duration
    print(
        'Bayesian tuning - train:  bias = %.3f  mae = %.3f  r2 = %.3f (time: %s)'
        %
        (dscores['bias'], dscores['mae'], dscores['r2'], format_duration(tf)))