def plot_decompositions(
    signal,
    figsize=None,
    save_path=None,
    measurement_time='month',
    measurement_unit="$10^8m^3$",
):
    """Plot every sub-signal of a decomposition next to its FFT amplitude spectrum.

    Each column of ``signal`` gets one row of two subplots: the time-domain
    sub-signal (left) and its frequency-domain amplitude (right).

    Args:
        signal: pd.DataFrame whose columns are the decomposed sub-signals.
        figsize: Figure size (width, height) in inches; defaults to
            (7.48, number_of_subsignals).
        save_path: Optional path to save the figure to; if None the figure
            is only drawn, not saved.
        measurement_time: Time-unit label used on the x axes.
        measurement_unit: Streamflow-unit label used on the left y axes.
    """
    cols = signal.columns.values
    logger.info('cols={}'.format(cols))
    T = signal.shape[0]
    # np.float was deprecated in NumPy 1.20 and removed in 1.24; the builtin
    # float is the documented replacement and is behaviorally identical here.
    t = np.arange(start=1, stop=T + 1, step=1, dtype=float) / T
    # Frequency axis centered around zero (fftshift-style layout).
    freqs = t - 0.5 - 1 / T
    if figsize is None:
        figsize = (7.48, 1 * len(cols))
    plt.figure(figsize=figsize)
    for i in range(len(cols)):
        subsignal = signal[cols[i]].values
        # Left column: time domain.
        plt.subplot(len(cols), 2, 2 * i + 1)
        plt.title(cols[i])
        plt.plot(subsignal, c='b')
        if i == len(cols) - 1:
            plt.xlabel('Time(' + measurement_time + ')')
        else:
            # Only the bottom row carries x tick labels.
            plt.xticks([])
        plt.ylabel(r"Streamflow(" + measurement_unit + ")", )
        # Right column: frequency domain (FFT amplitude).
        plt.subplot(len(cols), 2, 2 * i + 2)
        plt.title(cols[i])
        plt.plot(freqs, abs(fft(subsignal)), c='b', lw=0.8, zorder=0)
        if i == len(cols) - 1:
            plt.xlabel('Frequency(1/' + measurement_time + ')')
        else:
            plt.xticks([])
        plt.ylabel('Amplitude')
    plt.tight_layout()
    # Fixed: save_path was previously accepted but ignored.
    if save_path is not None:
        plt.savefig(save_path)
def dum_pred_results(path,
                     train_y,
                     train_predictions,
                     dev_y,
                     dev_predictions,
                     test_y,
                     test_predictions,
                     time_cost=None):
    """Dump real records (labels), predictions and evaluation criteria
    (NSE, MSE, NRMSE, MAE, MAPE, PPTS, time_cost) to a csv file.

    Args:
        path: The local disk path to dump data into.
        train_y: records of training set with numpy array type.
        train_predictions: predictions of training set with numpy array type.
        dev_y: records of development set with numpy array type.
        dev_predictions: predictions of development set with numpy array type.
        test_y: records of testing set with numpy array type.
        test_predictions: predictions of testing set with numpy array type.
        time_cost: Time cost for profiling.

    Return:
        A csv file written via dump_train_dev_test_to_csv.
    """
    logger.info('Dump records, predictions and evaluation criteria...')
    # The three data sets share identical metric computations; evaluate
    # each metric across (records, predictions) pairs in train/dev/test order.
    pairs = [(train_y, train_predictions), (dev_y, dev_predictions),
             (test_y, test_predictions)]
    logger.info('Compute Nash-Sutcliffe efficiency (NSE)...')
    train_nse, dev_nse, test_nse = (r2_score(y, p) for y, p in pairs)
    logger.info('Compute Mean Square Error (MSE)...')
    train_mse, dev_mse, test_mse = (
        mean_squared_error(y_true=y, y_pred=p) for y, p in pairs)
    logger.info('Compute normalized mean square error (NRMSE)...')
    # RMSE normalized by the mean of the records.
    train_nrmse, dev_nrmse, test_nrmse = (
        math.sqrt(mean_squared_error(y, p)) / (sum(y) / len(y))
        for y, p in pairs)
    logger.info('Compute mean absolute error (MAE)...')
    train_mae, dev_mae, test_mae = (
        mean_absolute_error(y, p) for y, p in pairs)
    logger.info('Compute mean absolute percentage error (MAPE)...')
    train_mape, dev_mape, test_mape = (
        np.mean(np.abs((y - p) / y)) * 100 for y, p in pairs)
    logger.info('Compute peak percentage of threshold statistic (PPTS)...')
    train_ppts, dev_ppts, test_ppts = (PPTS(y, p, 5) for y, p in pairs)
    logger.info('Dumping the model results.')
    dump_train_dev_test_to_csv(
        path=path,
        train_y=train_y,
        train_pred=train_predictions,
        train_nse=train_nse,
        train_mse=train_mse,
        train_nrmse=train_nrmse,
        train_mae=train_mae,
        train_mape=train_mape,
        train_ppts=train_ppts,
        dev_y=dev_y,
        dev_pred=dev_predictions,
        dev_nse=dev_nse,
        dev_mse=dev_mse,
        dev_nrmse=dev_nrmse,
        dev_mae=dev_mae,
        dev_mape=dev_mape,
        dev_ppts=dev_ppts,
        test_y=test_y,
        test_pred=test_predictions,
        test_nse=test_nse,
        test_mse=test_mse,
        test_nrmse=test_nrmse,
        test_mae=test_mae,
        test_mape=test_mape,
        test_ppts=test_ppts,
        time_cost=time_cost,
    )
Пример #3
0
def ensemble(root_path,
             original_series,
             station,
             predictor,
             predict_pattern,
             variables,
             decomposer=None,
             wavelet_level='db10-2',
             framework='WDDFF'):
    """Ensemble the optimal model results for one station/predictor project.

    For one-step patterns, the model with the minimum development MSE is
    selected from the tuning history and its results are dumped and plotted.
    For 'multi_step' patterns, the optimal model is selected per sub-signal,
    the sub-signal predictions are summed into an ensemble forecast,
    evaluated against the original series, then dumped and plotted.

    Args:
        root_path: Root directory holding all station project folders.
        original_series: Original (un-decomposed) streamflow series.
        station: Station name used to build the project path.
        predictor: Predictor name, e.g. 'esvr', 'gbrt' or 'lstm'.
        predict_pattern: Prediction pattern; containing 'multi_step'
            triggers the per-sub-signal ensembling branch.
        variables: Dict with keys 'lags_dict', 'full_len', 'train_len',
            'dev_len' and 'test_len'.
        decomposer: Decomposition method ('modwt', 'dwt', ...) or None.
        wavelet_level: Wavelet and decomposition level for wavelet methods.
        framework: 'WDDFF' or 'TSDP'; only consulted when decomposer='modwt'.
    """
    # Resolve the lag dictionary; for wavelet decomposers it is nested
    # under the wavelet level.
    if decomposer == 'modwt':
        if framework == 'TSDP':
            lags_dict = variables['lags_dict'][wavelet_level]
        else:
            # NOTE(review): lags_dict stays None here, so a 'multi_step'
            # pattern with modwt+WDDFF would fail on len(None) below --
            # confirm that combination is never used.
            lags_dict = None
    elif decomposer == 'dwt':
        lags_dict = variables['lags_dict'][wavelet_level]
    else:
        lags_dict = variables['lags_dict']
    full_len = variables['full_len']
    train_len = variables['train_len']
    dev_len = variables['dev_len']
    test_len = variables['test_len']
    logger.info('Ensemble forecasting results...')
    logger.info('Root path:{}'.format(root_path))
    logger.info('original series:\n{}'.format(original_series))
    logger.info('Station:{}'.format(station))
    logger.info('Decomposer:{}'.format(decomposer))
    logger.info('Lags dict:{}'.format(lags_dict))
    logger.info('Predictor:{}'.format(predictor))
    logger.info('Predict pattern:{}'.format(predict_pattern))
    logger.info('Training length:{}'.format(train_len))
    # Fixed: this line previously logged test_len as the development length.
    logger.info('Development length:{}'.format(dev_len))
    logger.info('Testing length:{}'.format(test_len))
    logger.info('Entire length:{}'.format(full_len))
    logger.info(
        'Wavelet and decomposition level of WA:{}'.format(wavelet_level))

    # Build the project directory for this station/decomposer/predictor.
    if decomposer == 'modwt':
        models_path = root_path + '/' + station + '_' + decomposer + '/projects/' + predictor + '-' + framework.lower(
        ) + '/' + wavelet_level + '/' + predict_pattern + '/'
    elif decomposer == 'dwt':
        models_path = root_path + '/' + station + '_' + decomposer + '/projects/' + predictor + '/' + wavelet_level + '/' + predict_pattern + '/'
    elif decomposer is None:
        models_path = root_path + '/' + station + '/projects/' + predictor + '/' + predict_pattern + '/'
    else:
        models_path = root_path + '/' + station + '_' + decomposer + '/projects/' + predictor + '/' + predict_pattern + '/'
    logger.info("Model path:{}".format(models_path))

    if 'multi_step' not in predict_pattern:
        # One-step pattern: select the optimal model by minimum dev MSE.
        models_history = models_path + 'history/'
        optimal_model = ''
        min_dev_mse = np.inf
        for file_ in os.listdir(models_history):
            if '.csv' in file_ and 'optimized_params' not in file_:
                logger.info('read model results:{}'.format(file_))
                dev_mse = pd.read_csv(models_history + file_)['dev_mse'][0]
                if dev_mse < min_dev_mse:
                    min_dev_mse = dev_mse
                    optimal_model = file_
        logger.info('Optimal model:{}'.format(optimal_model))
        logger.info('Minimum MSE={}'.format(min_dev_mse))
        # Copy the pickled result of the optimal model to the project root.
        res = load(models_history +
                   (optimal_model.split('.csv')[0] + '_result.pkl'))
        dump(res, models_path + 'result.pkl')
        optimal_model = pd.DataFrame([optimal_model],
                                     columns=['optimal_model'])
        optimal_results = pd.read_csv(models_history +
                                      optimal_model['optimal_model'][0])
        if predictor == 'esvr' or predictor == 'gbrt':
            # SVR/GBRT models also store their tuned hyper-parameters.
            optimal_params = pd.read_csv(
                models_history +
                optimal_model['optimal_model'][0].split('.csv')[0] +
                '_optimized_params.csv')
            optimal_results = pd.concat(
                [optimal_model, optimal_params, optimal_results], axis=1)
        elif predictor == 'lstm':
            optimal_results = pd.concat([optimal_model, optimal_results],
                                        axis=1)
        optimal_results.to_csv(models_path + 'optimal_model_results.csv')
        plot_rela_pred(optimal_results['train_y'],
                       optimal_results['train_pred'],
                       models_path + 'train_pred.png')
        # Fixed: previously indexed an undefined `data_part` dict (NameError);
        # use the dev/test lengths resolved from `variables` above.
        plot_rela_pred(optimal_results['dev_y'][0:dev_len],
                       optimal_results['dev_pred'][0:dev_len],
                       models_path + 'dev_pred.png')
        plot_rela_pred(optimal_results['test_y'][0:test_len],
                       optimal_results['test_pred'][0:test_len],
                       models_path + 'test_pred.png')
    else:
        # Multi-step pattern: select the optimal model for every sub-signal.
        for i in range(len(lags_dict)):
            model_path = models_path + 's' + str(i + 1) + '/'
            models_history = model_path + 'history/'
            optimal_model = ''
            min_dev_mse = np.inf
            for file_ in os.listdir(models_history):
                if '.csv' in file_ and 'optimized_params' not in file_:
                    logger.info('read model results:{}'.format(file_))
                    dev_mse = pd.read_csv(models_history + file_)['dev_mse'][0]
                    if dev_mse < min_dev_mse:
                        min_dev_mse = dev_mse
                        optimal_model = file_
            logger.info('Optimal model:{}'.format(optimal_model))
            logger.info('Minimum MSE={}'.format(min_dev_mse))
            res = load(models_history +
                       (optimal_model.split('.csv')[0] + '_result.pkl'))
            dump(res, model_path + 'result.pkl')
            optimal_model = pd.DataFrame([optimal_model],
                                         columns=['optimal_model'])
            optimal_results = pd.read_csv(models_history +
                                          optimal_model['optimal_model'][0])
            if predictor == 'esvr' or predictor == 'gbrt':
                optimal_params = pd.read_csv(
                    models_history +
                    optimal_model['optimal_model'][0].split('.csv')[0] +
                    '_optimized_params.csv')
                optimal_results = pd.concat(
                    [optimal_model, optimal_params, optimal_results], axis=1)
            elif predictor == 'lstm':
                optimal_results = pd.concat([optimal_model, optimal_results],
                                            axis=1)
            optimal_results.to_csv(model_path + 'optimal_model_results.csv')
            plot_rela_pred(optimal_results['train_y'],
                           optimal_results['train_pred'],
                           model_path + 'train_pred.png')
            # Fixed: same undefined `data_part` reference as above.
            plot_rela_pred(optimal_results['dev_y'][0:dev_len],
                           optimal_results['dev_pred'][0:dev_len],
                           model_path + 'dev_pred.png')
            plot_rela_pred(
                optimal_results['test_y'][0:test_len],
                optimal_results['test_pred'][0:test_len],
                model_path + 'test_pred.png')
        # The longest lag shortens the usable training samples.
        train_len_ = train_len - max(lags_dict.values())
        train_sum_pred = pd.DataFrame()
        dev_sum_pred = pd.DataFrame()
        test_sum_pred = pd.DataFrame()
        time_cost_sum = 0.0
        # Collect per-sub-signal predictions and total time cost.
        for i in range(len(lags_dict)):
            model_path = models_path + 's' + str(i + 1) + '/'
            results = pd.read_csv(model_path + 'optimal_model_results.csv')
            time_cost_sum = time_cost_sum + results['time_cost'][0]
            train_pred = results['train_pred']
            # Align training predictions on their last train_len_ samples.
            train_pred = train_pred[train_pred.shape[0] - train_len_:]
            train_pred = train_pred.reset_index(drop=True)
            dev_pred = results['dev_pred'][0:dev_len]
            test_pred = results['test_pred'][0:test_len]
            train_sum_pred = pd.concat([train_sum_pred, train_pred], axis=1)
            dev_sum_pred = pd.concat([dev_sum_pred, dev_pred], axis=1)
            test_sum_pred = pd.concat([test_sum_pred, test_pred], axis=1)
        # Sum the sub-signal predictions into the ensemble forecast and
        # clip negative streamflow to zero.
        train_sum_pred = train_sum_pred.sum(axis=1)
        dev_sum_pred = dev_sum_pred.sum(axis=1)
        test_sum_pred = test_sum_pred.sum(axis=1)
        train_sum_pred[train_sum_pred < 0.0] = 0.0
        dev_sum_pred[dev_sum_pred < 0.0] = 0.0
        test_sum_pred[test_sum_pred < 0.0] = 0.0
        # Slice the matching records out of the original series.
        original_series = original_series.reset_index(drop=True)
        train_y = original_series[train_len - train_len_:train_len]
        dev_y = original_series[train_len:train_len + dev_len]
        test_y = original_series[train_len + dev_len:]
        train_y = train_y.reset_index(drop=True)
        dev_y = dev_y.reset_index(drop=True)
        test_y = test_y.reset_index(drop=True)

        # Evaluate the ensemble on all three data sets.
        train_nse = r2_score(train_y.values, train_sum_pred.values)
        train_mse = mean_squared_error(train_y.values, train_sum_pred.values)
        train_nrmse = math.sqrt(
            mean_squared_error(train_y.values, train_sum_pred.values)) / (
                sum(train_y.values) / len(train_y.values))
        train_ppts = PPTS(train_y.values, train_sum_pred.values, 5)

        dev_nse = r2_score(dev_y.values, dev_sum_pred.values)
        dev_mse = mean_squared_error(dev_y.values, dev_sum_pred.values)
        dev_nrmse = math.sqrt(
            mean_squared_error(dev_y.values, dev_sum_pred.values)) / (
                sum(dev_y.values) / len(dev_y.values))
        dev_ppts = PPTS(dev_y.values, dev_sum_pred.values, 5)

        test_nse = r2_score(test_y.values, test_sum_pred.values)
        test_mse = mean_squared_error(test_y.values, test_sum_pred.values)
        test_nrmse = math.sqrt(
            mean_squared_error(test_y.values, test_sum_pred.values)) / (
                sum(test_y.values) / len(test_y.values))
        test_ppts = PPTS(test_y.values, test_sum_pred.values, 5)

        metrics = {
            'train_nse': train_nse,
            'train_mse': train_mse,
            'train_nrmse': train_nrmse,
            'train_ppts': train_ppts,
            'dev_nse': dev_nse,
            'dev_mse': dev_mse,
            'dev_nrmse': dev_nrmse,
            'dev_ppts': dev_ppts,
            'test_nse': test_nse,
            'test_mse': test_mse,
            'test_nrmse': test_nrmse,
            'test_ppts': test_ppts,
            'time_cost': time_cost_sum,
        }
        metrics_df = pd.DataFrame(metrics, index=[0])
        print(metrics_df)
        # Assemble records and predictions side by side for dumping.
        train_results = pd.concat([train_y, train_sum_pred], axis=1)
        train_results = pd.DataFrame(train_results.values,
                                     columns=['train_y', 'train_pred'])
        dev_results = pd.concat([dev_y, dev_sum_pred], axis=1)
        dev_results = pd.DataFrame(dev_results.values,
                                   columns=['dev_y', 'dev_pred'])
        test_results = pd.concat([test_y, test_sum_pred], axis=1)
        test_results = pd.DataFrame(test_results.values,
                                    columns=['test_y', 'test_pred'])
        optimal_results = pd.concat(
            [train_results, dev_results, test_results, metrics_df], axis=1)
        optimal_results.to_csv(models_path + 'optimal_results.csv')
        plot_rela_pred(train_y, train_sum_pred, models_path + 'train_pred.png')
        plot_rela_pred(dev_y, dev_sum_pred, models_path + 'dev_pred.png')
        plot_rela_pred(test_y, test_sum_pred, models_path + 'test_pred.png')
def plot_cv_error(data_path, labels, mode='avg'):
    """Plot cross-validation NRMSE of development and testing sets.

    Scans each results directory for per-fold csv files (named with
    ``cv<N>``), reads their dev/test NRMSE, and plots them against the
    fold number.  ``mode`` is accepted but not used by this function.
    """
    logger.info('Plot cross validation MSE...')
    logger.info('Data path:{}'.format(data_path))
    logger.info('Labels:{}'.format(labels))
    # Allow a single path/label pair to be passed as plain strings.
    if isinstance(data_path, str):
        data_path, labels = [data_path], [labels]
    plt.figure(figsize=(7.48, 7.48))
    plt.xlabel('CV')
    plt.ylabel('MSE')
    for folder, tag in zip(data_path, labels):
        logger.info('Read cv results of {}'.format(folder))
        dev_cv, test_cv = {}, {}
        for fname in os.listdir(folder):
            # Skip non-csv files, seed-averaged files and parameter dumps.
            if ('.csv' not in fname or 'seed' in fname
                    or 'optimized_params' in fname):
                continue
            logger.info('cv-file:{}'.format(fname))
            fold = int(re.findall(r"(?<=cv)\d+", fname)[0])
            logger.info('cv={}'.format(fold))
            frame = pd.read_csv(folder + fname)
            dev_metric = frame['dev_nrmse'][0]
            test_metric = frame['test_nrmse'][0]
            logger.info('Development metrics={}'.format(dev_metric))
            dev_cv[fold] = dev_metric
            test_cv[fold] = test_metric
        logger.debug('Development cv dict before sort:{}'.format(dev_cv))
        logger.debug('Testing cv dict before sort:{}'.format(test_cv))
        # Sort by fold number so the lines run left to right.
        dev_cv = dict(sorted(dev_cv.items()))
        test_cv = dict(sorted(test_cv.items()))
        logger.info('Cross validation development dict:{}'.format(dev_cv))
        logger.info('Cross validation folds:{}'.format(dev_cv.keys()))
        logger.info('Cross validation MSE:{}'.format(dev_cv.values()))
        plt.plot(list(dev_cv.keys()),
                 list(dev_cv.values()),
                 marker='o',
                 label=tag + 'dev')
        plt.plot(list(test_cv.keys()),
                 list(test_cv.values()),
                 marker='o',
                 label=tag + 'test')
        plt.legend()
    plt.tight_layout()
# NOTE(review): top-level fragment of an ARIMA tuning script; `cal_pred`,
# `train_len`, `model_fit`, `cal`, `test`, `order`, `model_path` and
# `start` are defined earlier, outside this chunk.
print(len(cal_pred))
# Split the calibration-period predictions into training and development parts.
train_pred = cal_pred[0:train_len]
dev_pred = cal_pred[train_len:]
print(model_fit.summary())
print(len(train_pred))
print(len(dev_pred))

# Inspect the residuals of the fitted model: line plot, kernel density
# estimate, and summary statistics.
residuals = pd.DataFrame(model_fit.resid)
residuals.plot()
plt.show()
residuals.plot(kind='kde')
plt.show()
print(residuals.describe())

# NOTE(review): this guard only logs; it does not skip the re-tuning
# below -- confirm whether an early exit was intended here.
if os.path.exists(model_path + 'arima' + str(order) + '_results.csv'):
    logger.info("The arima" + str(order) + " was already tuned")

# Walk-forward (rolling) forecast over the test set: refit ARIMA on the
# growing history and forecast one step at a time.
history = [x for x in cal]
test_pred = list()
for t in range(len(test)):
    model = ARIMA(history, order=order)
    model_fit = model.fit(disp=0)
    output = model_fit.forecast()
    yhat = output[0]
    test_pred.append(yhat)
    obs = test[t]
    # Append the observed value so the next fit sees the true history.
    history.append(obs)
    print('predicted=%f, expected=%f' % (yhat, obs))
end = time.process_time()
time_cost = end - start
def plot_rela_pred(records,
                   predictions,
                   fig_savepath,
                   measurement_time='month',
                   measurement_unit="$10^8m^3$",
                   figsize=(7.48, 3),
                   format='PNG',
                   dpi=300):
    """Plot the relations between the records and predictions.

    Draws two panels: a time-series comparison of records vs predictions,
    and a scatter plot of predictions against records with a linear fit
    and the ideal 1:1 line.

    Args:
        records: the actual measured records.
        predictions: the predictions obtained by model.
        fig_savepath: the path where the plot figure will be saved.
        measurement_time: time-unit label for the x axis.
        measurement_unit: streamflow-unit label for the y axes.
        figsize: figure size (width, height) in inches.
        format: image format passed to plt.savefig.
        dpi: resolution passed to plt.savefig.
    """
    logger.info('Plot predictions and correlations...')
    # Fixed: these two conversions were previously chained with `elif`,
    # so `predictions` was never converted whenever `records` was a
    # pandas object; each input must be converted independently.
    if isinstance(records, (pd.DataFrame, pd.Series)):
        records = records.values
    if isinstance(predictions, (pd.DataFrame, pd.Series)):
        predictions = predictions.values
    length = records.size
    t = np.linspace(start=1, stop=length, num=length)
    plt.figure(figsize=figsize)
    ax1 = plt.subplot2grid((1, 5), (0, 0), colspan=3)
    ax2 = plt.subplot2grid((1, 5), (0, 3), colspan=2, aspect='equal')

    # Left panel: time series of records vs predictions.
    ax1.set_xlabel('Time(' + measurement_time + ')', )
    ax1.set_ylabel(r'Streamflow(' + measurement_unit + ')', )
    ax1.plot(t, records, '-', color='blue', label='Records', linewidth=1.0)
    ax1.plot(t,
             predictions,
             '--',
             color='red',
             label='Predictions',
             linewidth=1.0)
    ax1.legend(
        loc=0,
        shadow=False,
        frameon=False,
    )
    logger.info('records=\n{}'.format(records))
    logger.info('predictions=\n{}'.format(predictions))

    # Common axis range covering both records and predictions.
    pred_min = predictions.min()
    pred_max = predictions.max()
    record_min = records.min()
    record_max = records.max()
    if pred_min < record_min:
        xymin = pred_min
    else:
        xymin = record_min
    if pred_max > record_max:
        xymax = pred_max
    else:
        xymax = record_max

    logger.info('xymin={}'.format(xymin))
    logger.info('xymax={}'.format(xymax))

    # Right panel: scatter with a first-order linear fit and the 1:1 line.
    xx = np.arange(start=xymin, stop=xymax + 1, step=1.0)
    coeff = np.polyfit(predictions, records, 1)
    linear_fit = coeff[0] * xx + coeff[1]
    ax2.set_xlabel(r'Predictions(' + measurement_unit + ')', )
    ax2.set_ylabel(r'Records(' + measurement_unit + ')', )
    ax2.plot(predictions,
             records,
             'o',
             markerfacecolor='w',
             markeredgecolor='blue',
             markersize=6.5)
    ax2.plot(xx,
             linear_fit,
             '--',
             color='red',
             label='Linear fit',
             linewidth=1.0)
    ax2.plot([xymin, xymax], [xymin, xymax],
             '-',
             color='black',
             label='Ideal fit',
             linewidth=1.0)
    ax2.set_xlim([xymin, xymax])
    ax2.set_ylim([xymin, xymax])
    ax2.legend(
        loc=0,
        shadow=False,
        frameon=False,
    )
    plt.tight_layout()
    plt.savefig(fig_savepath, format=format, dpi=dpi)
def dump_pred_results(path,
                      train_y=None,
                      train_predictions=None,
                      dev_y=None,
                      dev_predictions=None,
                      test_y=None,
                      test_predictions=None,
                      time_cost=None):
    """Dump real records (labels), predictions and evaluation criteria
    (NSE, MSE, NRMSE, MAE, MAPE, PPTS, time_cost) to a csv file.

    Args:
        path: The local disk path to dump data into.
        train_y: records of training set with numpy array type.
        train_predictions: predictions of training set with numpy array type.
        dev_y: records of development set with numpy array type.
        dev_predictions: predictions of development set with numpy array type.
        test_y: records of testing set with numpy array type.
        test_predictions: predictions of testing set with numpy array type.
        time_cost: Time cost for profiling.

    Return:
        A csv file written via dump_train_dev_test_to_csv.

    Note:
        Despite the None defaults, all record/prediction arrays must be
        provided; the metric computations below do not accept None.
        (Removed the commented-out None-guard blocks that were never
        re-enabled.)
    """
    logger.info('Dump records, predictions and evaluation criteria...')
    logger.info('Compute Nash-Sutcliffe efficiency (NSE)...')
    # Training-set metrics; NRMSE is RMSE normalized by the record mean.
    train_nse = r2_score(train_y, train_predictions)
    train_mse = mean_squared_error(y_true=train_y, y_pred=train_predictions)
    train_nrmse = math.sqrt(mean_squared_error(
        train_y, train_predictions)) / (sum(train_y) / len(train_y))
    train_mae = mean_absolute_error(train_y, train_predictions)
    train_mape = np.mean(np.abs((train_y - train_predictions) / train_y)) * 100
    train_ppts = PPTS(train_y, train_predictions, 5)

    # Development-set metrics.
    dev_nse = r2_score(dev_y, dev_predictions)
    dev_mse = mean_squared_error(y_true=dev_y, y_pred=dev_predictions)
    dev_nrmse = math.sqrt(mean_squared_error(
        dev_y, dev_predictions)) / (sum(dev_y) / len(dev_y))
    dev_mae = mean_absolute_error(dev_y, dev_predictions)
    dev_mape = np.mean(np.abs((dev_y - dev_predictions) / dev_y)) * 100
    dev_ppts = PPTS(dev_y, dev_predictions, 5)

    # Testing-set metrics.
    test_nse = r2_score(test_y, test_predictions)
    test_mse = mean_squared_error(y_true=test_y, y_pred=test_predictions)
    test_nrmse = math.sqrt(mean_squared_error(
        test_y, test_predictions)) / (sum(test_y) / len(test_y))
    test_mae = mean_absolute_error(test_y, test_predictions)
    test_mape = np.mean(np.abs((test_y - test_predictions) / test_y)) * 100
    test_ppts = PPTS(test_y, test_predictions, 5)

    dump_train_dev_test_to_csv(
        path=path,
        train_y=train_y,
        train_pred=train_predictions,
        train_nse=train_nse,
        train_mse=train_mse,
        train_nrmse=train_nrmse,
        train_mae=train_mae,
        train_mape=train_mape,
        train_ppts=train_ppts,
        dev_y=dev_y,
        dev_pred=dev_predictions,
        dev_nse=dev_nse,
        dev_mse=dev_mse,
        dev_nrmse=dev_nrmse,
        dev_mae=dev_mae,
        dev_mape=dev_mape,
        dev_ppts=dev_ppts,
        test_y=test_y,
        test_pred=test_predictions,
        test_nse=test_nse,
        test_mse=test_mse,
        test_nrmse=test_nrmse,
        test_mae=test_mae,
        test_mape=test_mape,
        test_ppts=test_ppts,
        time_cost=time_cost,
    )
def ensemble(root_path,original_series,station,predictor,predict_pattern,variables,decomposer=None,wavelet_level='db10-2'):
    """Ensemble the optimal one-step model results for a project.

    Selects the model with minimum development MSE from the tuning
    history, dumps its results to csv and plots train/dev/test
    predictions.  Only handles patterns without 'multi_step'.

    NOTE(review): this definition shares its name with the earlier
    `ensemble` (a collection/scrape artifact); in one module the later
    definition shadows the earlier one -- confirm which is intended.

    Args:
        root_path: Root directory holding all station project folders.
        original_series: Original (un-decomposed) streamflow series.
        station: Station name used to build the project path.
        predictor: Predictor name, e.g. 'esvr', 'gbrt' or 'lstm'.
        predict_pattern: Prediction pattern folder name.
        variables: Dict with keys 'lags_dict', 'full_len', 'train_len',
            'dev_len' and 'test_len'.
        decomposer: Decomposition method ('dwt', 'modwt', ...) or None.
        wavelet_level: Wavelet and decomposition level for wavelet methods.
    """
    lags_dict = variables['lags_dict']
    full_len = variables['full_len']
    train_len = variables['train_len']
    dev_len = variables['dev_len']
    test_len = variables['test_len']
    logger.info('Ensemble forecasting results...')
    logger.info('Root path:{}'.format(root_path))
    logger.info('original series:\n{}'.format(original_series))
    logger.info('Station:{}'.format(station))
    logger.info('Decomposer:{}'.format(decomposer))
    logger.info('Lags dict:{}'.format(lags_dict))
    logger.info('Predictor:{}'.format(predictor))
    logger.info('Predict pattern:{}'.format(predict_pattern))
    logger.info('Training length:{}'.format(train_len))
    # Fixed: this line previously logged test_len as the development length.
    logger.info('Development length:{}'.format(dev_len))
    logger.info('Testing length:{}'.format(test_len))
    logger.info('Entire length:{}'.format(full_len))
    logger.info('Wavelet and decomposition level of WA:{}'.format(wavelet_level))

    original = original_series
    if decomposer=='dwt' or decomposer=='modwt':
        models_path = root_path+'/'+station+'_'+decomposer+'/projects/'+predictor+'/'+wavelet_level+'/'+predict_pattern+'/'
    elif decomposer is None:
        models_path = root_path+'/'+station+'/projects/'+predictor+'/'+predict_pattern+'/'
    else:
        models_path = root_path+'/'+station+'_'+decomposer+'/projects/'+predictor+'/'+predict_pattern+'/'
    logger.info("Model path:{}".format(models_path))

    if 'multi_step' not in predict_pattern:
        models_history = models_path+'history/'
        optimal_model = ''
        min_dev_mse = np.inf
        # Scan the tuning history for the model with the minimum dev MSE.
        for file_ in os.listdir(models_history):
            if '.csv' in file_ and 'optimized_params' not in file_:
                logger.info('read model results:{}'.format(file_))
                dev_mse = pd.read_csv(models_history+file_)['dev_mse'][0]
                if dev_mse < min_dev_mse:
                    min_dev_mse = dev_mse
                    optimal_model = file_
        logger.info('Optimal model:{}'.format(optimal_model))
        logger.info('Minimum MSE={}'.format(min_dev_mse))
        optimal_model = pd.DataFrame([optimal_model],columns=['optimal_model'])
        optimal_results = pd.read_csv(models_history+optimal_model['optimal_model'][0])
        if predictor=='esvr' or predictor=='gbrt':
            # SVR/GBRT models also store their tuned hyper-parameters.
            optimal_params = pd.read_csv(models_history+optimal_model['optimal_model'][0].split('.csv')[0]+'_optimized_params.csv')
            optimal_results = pd.concat([optimal_model,optimal_params,optimal_results],axis=1)
        elif predictor=='lstm':
            optimal_results = pd.concat([optimal_model,optimal_results],axis=1)
        optimal_results.to_csv(models_path+'optimal_model_results.csv')
        plot_rela_pred(optimal_results['train_y'],optimal_results['train_pred'],models_path+'train_pred.png')
        # Fixed: previously indexed an undefined `data_part` dict (NameError);
        # use the dev/test lengths resolved from `variables` above.
        plot_rela_pred(optimal_results['dev_y'][0:dev_len],optimal_results['dev_pred'][0:dev_len],models_path+'dev_pred.png')
        plot_rela_pred(optimal_results['test_y'][0:test_len],optimal_results['test_pred'][0:test_len],models_path+'test_pred.png')
Пример #9
0
def read_long_leading_time(station,
                           decomposer,
                           mode='pearson',
                           pearson_threshold=0.2,
                           wavelet_level="db10-2"):
    """Collect test records, predictions and metrics of long-lead-time models.

    Reads the 1-ahead two-stage model first, then the 3/5/7/9-ahead
    models, and returns their test records, predictions, NSE, NRMSE and
    PPTS as parallel lists keyed in a dict.
    """
    logger.info('reading long lead time model results...')
    logger.info('station:{}'.format(station))
    logger.info('decomposer:{}'.format(decomposer))
    logger.info('mode:{}'.format(mode))
    logger.info('pearson threshold:{}'.format(pearson_threshold))
    logger.info('wavelet level:{}'.format(wavelet_level))

    # Seed the result lists with the 1-ahead (two-stage) model.
    if decomposer == 'modwt':
        base = read_two_stage(
            station=station,
            decomposer=decomposer,
            predict_pattern="single_hybrid_1_ahead_mi_ts0.1",
        )
    else:
        base = read_two_stage(
            station=station,
            decomposer=decomposer,
            predict_pattern="one_step_1_ahead_forecast_pacf",
        )
    records = [base['test_y']]
    predictions = [base['test_pred']]
    nse = [base['test_nse']]
    nrmse = [base['test_nrmse']]
    ppts = [base['test_ppts']]

    # Append the longer leading times, averaging over trained svr seeds.
    for leading_time in (3, 5, 7, 9):
        if decomposer == 'modwt':
            model_path = f"{root_path}\\{station}_{decomposer}\\projects\\esvr-wddff\\{wavelet_level}\\"
        elif decomposer == "dwt":
            model_path = f"{root_path}\\{station}_{decomposer}\\projects\\esvr\\{wavelet_level}\\"
        else:
            model_path = f"{root_path}\\{station}_{decomposer}\\projects\\esvr\\"
        print("Reading  mode:{}".format(mode))
        if mode == 'pacf':
            model_path = f"{model_path}one_step_{leading_time}_ahead_forecast_pacf//"
        elif mode == 'pearson':
            model_path = f"{model_path}one_step_{leading_time}_ahead_forecast_pearson{pearson_threshold}//"
        elif mode == 'mi':
            model_path = f"{model_path}single_hybrid_{leading_time}_ahead_mi_ts0.1//"
        logger.info('model path:{}'.format(model_path))
        results = pd.read_csv(model_path + 'optimal_model_results.csv')
        records.append((results['test_y'][0:120]).values.flatten())
        predictions.append((results['test_pred'][0:120]).values.flatten())
        nse.append(results['test_nse'][0])
        nrmse.append(results['test_nrmse'][0])
        ppts.append(results['test_ppts'][0])

    results = {
        'records': records,
        'predictions': predictions,
        'nse': nse,
        'nrmse': nrmse,
        'ppts': ppts,
    }

    logger.info('results.records:{}'.format(pd.DataFrame(results)['records']))
    logger.info('results.predictions:{}'.format(
        pd.DataFrame(results)['predictions']))

    return results
Пример #10
0
import pandas as pd
import numpy as np
import math
from statistics import mean
from sklearn.decomposition import PCA
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error, mean_squared_log_error
import os
root_path = os.path.dirname(os.path.abspath('__file__'))
import sys
sys.path.append(root_path)
from tools.metrics_ import PPTS, mean_absolute_percentage_error
from config.globalLog import logger

logger.info('results_reader')


def read_two_stage(station,
                   decomposer,
                   predict_pattern,
                   wavelet_level="db10-2",
                   framework='WDDFF'):
    """Read the optimal two-stage model results for one project.

    Args:
        station: Station name used to build the project path.
        decomposer: Decomposition method ('modwt', 'dwt', ...).
        predict_pattern: Prediction pattern folder name.
        wavelet_level: Wavelet and decomposition level for wavelet methods.
        framework: 'WDDFF' or 'TSDP'; only consulted when decomposer='modwt'.

    Returns:
        Dict with the first 120 test records ('test_y') and predictions
        ('test_pred') plus the stored 'test_nse', 'test_nrmse' and
        'test_ppts' metrics.
    """
    if decomposer == 'modwt':
        model_path = root_path + "\\" + station + "_" + decomposer + "\\projects\\esvr-" + framework.lower(
        ) + "\\" + wavelet_level + "\\" + predict_pattern + "\\"
    elif decomposer == "dwt":
        model_path = root_path + "\\" + station + "_" + decomposer + "\\projects\\esvr\\" + wavelet_level + "\\" + predict_pattern + "\\"
    else:
        model_path = root_path + "\\" + station + "_" + decomposer + "\\projects\\esvr\\" + predict_pattern + "\\"
    results = pd.read_csv(model_path + 'optimal_model_results.csv')
    test_pred = (results['test_pred'][0:120]).values.flatten()
    test_y = (results['test_y'][0:120]).values.flatten()
    # Fixed: the function previously computed test_y/test_pred but returned
    # None, while read_long_leading_time indexes the returned dict for
    # 'test_y', 'test_pred', 'test_nse', 'test_nrmse' and 'test_ppts'.
    return {
        'test_y': test_y,
        'test_pred': test_pred,
        'test_nse': results['test_nse'][0],
        'test_nrmse': results['test_nrmse'][0],
        'test_ppts': results['test_ppts'][0],
    }
Пример #11
0
# cb_ax = fig.add_axes([0.85, 0.06, 0.05, 0.38])#[x,y,width,height]
# cbar = fig.colorbar(im, cax=cb_ax)
# cbar.set_ticks(np.arange(0, 1.1, 0.5))
# cbar.set_label(r"$Corr_{i,j}$")
# # cbar.set_ticklabels(['low', 'medium', 'high'])
# plt.savefig(graphs_path+"Fig.9.Pearson corr of Huaxian.tif",format="TIFF",dpi=1200)
# plt.savefig(graphs_path+"Fig.9.Pearson corr of Huaxian.pdf",format="PDF",dpi=1200)
# plt.show()

fig = plt.figure(figsize=(7.4861,1.7))
for i in range(len(corrs)):
    ax = plt.subplot(1,5,i+1)
    ax.set_title(titles[i],fontsize=6)
    sign_num=corrs[i].shape[1]
    logger.info('Number of sub-signals:{}'.format(sign_num))
    ticks = list(range(sign_num))
    logger.info('ticks:{}'.format(ticks))
    labels=[]
    for j in ticks:
        if titles[i].find('VMD')>=0:
            labels.append(r'$IMF_{'+str(j+1)+'}$')
        elif titles[i].find('EEMD')>=0:
            if j==sign_num-1:
                labels.append(r'$R$')
            else:
                labels.append(r'$IMF_{'+str(j+1)+'}$')
        elif titles[i].find('MODWT')>=0:
            if j==sign_num-1:
                labels.append(r'$V_{'+str(j)+'}$')
            else: