Example #1
def main():
    if not FLAGS.csv_path:
        raise ValueError("must set --csv_path")

    villani = reader.read_data_sets(FLAGS.csv_path)

    config = Config()
    model = Model(config)
    with open(os.path.join(config.save_dir, 'config.pkl'), 'wb') as f:
        cPickle.dump(config, f)

    with tf.Session() as sess:
        writer = tf.train.SummaryWriter(config.log_dir, sess.graph)
        tf.initialize_all_variables().run()
        saver = tf.train.Saver(tf.all_variables())
        step = 1
        for step in range(config.training_iters):
            new_lr = config.lr * (config.decay_rate**
                                  villani.train.epochs_completed)
            sess.run(tf.assign(model.lr, new_lr))
            batch_xs, batch_ys = villani.train.next_batch(config.batch_size)
            batch_xs = batch_xs.reshape((-1, config.n_steps, config.n_input))
            batch_ys = batch_ys.reshape(
                (-1, config.n_steps, config.n_classes))[:, config.n_steps - 1]
            _ = sess.run(model.optimizer,
                         feed_dict={
                             model.input_data: batch_xs,
                             model.targets: batch_ys,
                             model.initial_state:
                                 np.zeros((batch_xs.shape[0], config.n_hidden))
                         })

            if step % config.display_step == 0:
                # Calculate batch accuracy
                acc, loss, summary = sess.run(
                    [model.accuracy, model.cost, model.summary],
                    feed_dict={
                        model.input_data: batch_xs,
                        model.targets: batch_ys,
                        model.initial_state:
                            np.zeros((batch_xs.shape[0], config.n_hidden))
                    })
                print(
                    "Index %d, Minibatch Loss= %f, Training Accuracy %f, Learning Rate %f"
                    % ((step, loss, acc, new_lr)))
                writer.add_summary(summary)
                writer.flush()

            if step % config.save_frequency == 0:
                checkpoint_path = os.path.join(config.save_dir, 'model.ckpt')
                saver.save(sess, checkpoint_path, global_step=step)

        print("Optimization Finished!")
Example #3
            % (target_site, training_year[0], training_begining, training_year[-1], training_deadline, interval_hours))
print(filename)
print('Training for %s/%s to %s/%s' % (training_year[0], training_duration[0], training_year[-1], training_duration[-1]))
print('Testing for %s/%s to %s/%s' % (testing_year[0], testing_duration[0], testing_year[-1], testing_duration[-1]))

if is_training:
    print('Training ..')
else:
    print('Testing ..')

# reading data
print('Reading data .. ')
start_time = time.time()
print('preparing training set ..')
X_train = read_data_sets(sites=site_list+[target_site], date_range=np.atleast_1d(training_year),
                         beginning=training_duration[0], finish=training_duration[-1],
                         feature_selection=pollution_kind, update=data_update)
X_train = missing_check(X_train)
Y_train = np.array(X_train)[:, -len(pollution_kind):]
Y_train = Y_train[:, pollution_kind.index(target_kind)]
X_train = np.array(X_train)[:, :-len(pollution_kind)]

print('preparing testing set ..')
X_test = read_data_sets(sites=site_list + [target_site], date_range=np.atleast_1d(testing_year),
                        beginning=testing_duration[0], finish=testing_duration[-1],
                        feature_selection=pollution_kind, update=data_update)
Y_test = np.array(X_test)[:, -len(pollution_kind):]
Y_test = Y_test[:, pollution_kind.index(target_kind)]
X_test = missing_check(np.array(X_test)[:, :-len(pollution_kind)])

final_time = time.time()
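In this snippet the labels are taken from the tail of each row: read_data_sets apparently appends the target site's pollutant readings as the last len(pollution_kind) columns, so Y is sliced off the end and the remaining columns stay as features. A toy illustration of that split (hypothetical numbers, not the real reader output):

import numpy as np

pollution_kind = ['PM2.5', 'O3', 'AMB_TEMP', 'RH', 'WIND_SPEED', 'WIND_DIREC']
# pretend row: 4 source sites x 6 pollutants of features, then the 6 readings
# of the target site appended at the end
row = np.arange(4 * 6 + 6, dtype=float)

features = row[:-len(pollution_kind)]          # all but the last 6 values
targets = row[-len(pollution_kind):]           # the last 6 values
pm25 = targets[pollution_kind.index('PM2.5')]  # the single target pollutant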
Example #4
    def load_data(self,
                  data_file,
                  site_list,
                  target_site,
                  target_kind,
                  training_year,
                  training_duration,
                  pollution_kind,
                  SEQ_LENGTH_1,
                  SEQ_LENGTH_2,
                  data_update=False):
        print('Reading data .. ')
        X = read_data_sets(sites=site_list + [target_site],
                           date_range=np.atleast_1d(training_year),
                           beginning=training_duration[0],
                           finish=training_duration[-1],
                           feature_selection=pollution_kind,
                           update=data_update)
        X = missing_check(X)
        Y = np.array(X)[:, -len(pollution_kind):]
        Y = Y[:, pollution_kind.index(target_kind)]
        SeqY = []
        for y in range(len(Y)):
            if (y + (SEQ_LENGTH_2 - 1)) < len(Y):
                Seqy = []
                for time_step in range(SEQ_LENGTH_2):
                    Seqy.append(Y[y + time_step])
                SeqY.append(Seqy)
                del Seqy
            else:
                break
        X = np.array(X)[:, :-len(pollution_kind)]

        # feature process
        if 'WIND_DIREC' in pollution_kind:
            index_of_kind = pollution_kind.index('WIND_DIREC')
            length_of_kind_list = len(pollution_kind)
            len_of_sites_list = len(site_list)
            X = X.tolist()
            for i in range(len(X)):
                for j in range(len_of_sites_list):
                    specific_index = index_of_kind + j * length_of_kind_list
                    coordin = data_coordinate_angle(
                        X[i].pop(specific_index + j))
                    X[i].insert(specific_index + j, coordin[1])
                    X[i].insert(specific_index + j, coordin[0])
            X = np.array(X)

        X = construct_time_steps(X[:-1], SEQ_LENGTH_1)

        if SEQ_LENGTH_1 < SEQ_LENGTH_2:
            self.X = X[0:len(SeqY)]
        elif SEQ_LENGTH_1 > SEQ_LENGTH_2:
            SeqY = SeqY[:len(X)]

        with open(data_file, 'w') as f:
            for line in SeqY:
                for elem_no in range(len(line)):
                    f.write(str(line[elem_no]))
                    if elem_no < (len(line) - 1):
                        f.write(' ')
                f.write('\n')
def ensemble_model(target_kind, local, city, target_site, training_year,
                   testing_year, training_duration, testing_duration,
                   interval_hours, data, is_training):
    print('is_training(%s) = %s' % (target_site, is_training))

    site_list = pollution_site_map[local][city]  # ['中山', '古亭', '士林', '松山', '萬華']

    # change format from   2014-2015   to   ['2014', '2015']
    training_year = [
        training_year[:training_year.index('-')],
        training_year[training_year.index('-') + 1:]
    ]
    testing_year = [
        testing_year[:testing_year.index('-')],
        testing_year[testing_year.index('-') + 1:]
    ]

    training_duration = [
        training_duration[:training_duration.index('-')],
        training_duration[training_duration.index('-') + 1:]
    ]
    testing_duration = [
        testing_duration[:testing_duration.index('-')],
        testing_duration[testing_duration.index('-') + 1:]
    ]
    interval_hours = int(interval_hours)
    # is_training = False

    # clear redundancy work
    if training_year[0] == training_year[1]:
        training_year.pop(1)
    if testing_year[0] == testing_year[1]:
        testing_year.pop(1)
    else:
        input('The testing range should not span more than one year or cross a year boundary.')

    # checking years
    rangeofYear = int(training_year[-1]) - int(training_year[0])
    for i in range(rangeofYear):
        if not (str(i + int(training_year[0])) in training_year):
            training_year.insert(i, str(i + int(training_year[0])))

    # Training Parameters
    # WIND_DIREC is a special feature that needs extra processing; for now it can only appear as an element of the input vector.
    if target_kind == 'PM2.5':
        pollution_kind = [
            'PM2.5', 'O3', 'AMB_TEMP', 'RH', 'WIND_SPEED', 'WIND_DIREC'
        ]
    # target_kind = 'PM2.5'
    data_update = False
    # batch_size = 24 * 7
    seed = 0

    # Network Parameters
    input_size = (len(site_list) * len(pollution_kind) + len(site_list)
                  if 'WIND_DIREC' in pollution_kind
                  else len(site_list) * len(pollution_kind))
    time_steps = 12
    # hidden_size = 20
    output_size = 1

    testing_month = testing_duration[0][:testing_duration[0].index('/')]
    folder = root_path + "model/%s/%s/%sh/%s/" % (local, city, interval_hours,
                                                  target_kind)
    training_begining = training_duration[0][:training_duration[0].index('/')]
    training_deadline = training_duration[-1][:training_duration[-1].index('/')]
    print('site: %s' % target_site)
    print('Training for %s/%s to %s/%s' %
          (training_year[0], training_duration[0], training_year[-1],
           training_duration[-1]))
    print('Testing for %s/%s to %s/%s' %
          (testing_year[0], testing_duration[0], testing_year[-1],
           testing_duration[-1]))

    # for interval
    def ave(X, Y, interval_hours):
        reserve_hours = interval_hours - 1
        deadline = 0
        for i in range(len(Y)):
            # check whether enough future hours remain for this window
            if (len(Y) - i - 1) < reserve_hours:
                deadline = i
                break  # not enough
            for j in range(reserve_hours):
                Y[i] += Y[i + j + 1]
            Y[i] /= interval_hours
        if deadline:
            X = X[:deadline]
            Y = Y[:deadline]
        return X, Y

    # for interval
    def higher(X, Y, interval_hours):
        reserve_hours = 1  # keep the n largest values in each window
        if interval_hours > reserve_hours:
            deadline = 0
            for i in range(len(Y)):
                # check whether enough future hours remain for this window
                if (len(Y) - i) < interval_hours:
                    deadline = i
                    break  # not enough
                higher_list = []
                for j in range(interval_hours):
                    if len(higher_list) < reserve_hours:
                        higher_list.append(Y[i + j])
                    elif Y[i + j] > higher_list[0]:
                        higher_list[0] = Y[i + j]
                    higher_list = sorted(higher_list)
                Y[i] = np.array(higher_list).sum() / reserve_hours
            if deadline:
                X = X[:deadline]
                Y = Y[:deadline]
        return X, Y
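    # For example (hypothetical numbers): with interval_hours = 3, ave() replaces
    # Y[i] with the mean of Y[i:i+3], while higher() replaces it with the largest
    # of those three hours; both trim the trailing rows that no longer have three
    # future hours available, e.g. X, Y = higher(X, Y, 3).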

    if is_training:
        # reading data
        print('Reading data .. ')
        start_time = time.time()
        print('preparing training set ..')
        X_train = read_data_sets(sites=site_list + [target_site],
                                 date_range=np.atleast_1d(training_year),
                                 beginning=training_duration[0],
                                 finish=training_duration[-1],
                                 feature_selection=pollution_kind,
                                 update=data_update)
        X_train = missing_check(X_train)
        Y_train = np.array(X_train)[:, -len(pollution_kind):]
        Y_train = Y_train[:, pollution_kind.index(target_kind)]
        X_train = np.array(X_train)[:, :-len(pollution_kind)]

        print('preparing testing set ..')
        X_test = read_data_sets(sites=site_list + [target_site],
                                date_range=np.atleast_1d(testing_year),
                                beginning=testing_duration[0],
                                finish=testing_duration[-1],
                                feature_selection=pollution_kind,
                                update=data_update)
        Y_test = np.array(X_test)[:, -len(pollution_kind):]
        Y_test = Y_test[:, pollution_kind.index(target_kind)]
        X_test = missing_check(np.array(X_test)[:, :-len(pollution_kind)])

        final_time = time.time()
        print('Reading data .. ok, ', end='')
        time_spent_printer(start_time, final_time)

        print(len(X_train), 'train sequences')
        print(len(X_test), 'test sequences')

        if (len(X_train) < time_steps) or (len(X_test) < time_steps):
            input('time_steps(%d) too long.' % time_steps)

        # normalize
        print('Normalize ..')
        mean_X_train = np.mean(X_train, axis=0)
        std_X_train = np.std(X_train, axis=0)
        if 0 in std_X_train:
            input("Denominator can't be 0.")
        X_train = np.array([(x_train - mean_X_train) / std_X_train
                            for x_train in X_train])
        X_test = np.array([(x_test - mean_X_train) / std_X_train
                           for x_test in X_test])

        mean_y_train = np.mean(Y_train)
        std_y_train = np.std(Y_train)
        if not std_y_train:
            input("Denominator can't be 0.")
        Y_train = [(y - mean_y_train) / std_y_train for y in Y_train]
        print('mean_y_train: %f  std_y_train: %f' %
              (mean_y_train, std_y_train))

        fw = open(folder + "%s_parameter.pickle" % target_site, 'wb')
        cPickle.dump(
            str(mean_X_train) + ',' + str(std_X_train) + ',' +
            str(mean_y_train) + ',' + str(std_y_train), fw)
        fw.close()

        # feature process
        if 'WIND_DIREC' in pollution_kind:
            index_of_kind = pollution_kind.index('WIND_DIREC')
            length_of_kind_list = len(pollution_kind)
            len_of_sites_list = len(site_list)
            X_train = X_train.tolist()
            X_test = X_test.tolist()
            for i in range(len(X_train)):
                for j in range(len_of_sites_list):
                    specific_index = index_of_kind + j * length_of_kind_list
                    coordin = data_coordinate_angle(
                        (X_train[i].pop(specific_index + j)) *
                        std_X_train[specific_index] +
                        mean_X_train[specific_index])
                    X_train[i].insert(specific_index, coordin[1])
                    X_train[i].insert(specific_index, coordin[0])
                    if i < len(X_test):
                        coordin = data_coordinate_angle(
                            (X_test[i].pop(specific_index + j)) *
                            std_X_train[specific_index] +
                            mean_X_train[specific_index])
                        X_test[i].insert(specific_index, coordin[1])
                        X_test[i].insert(specific_index, coordin[0])
            X_train = np.array(X_train)
            X_test = np.array(X_test)
        Y_test = np.array(Y_test, dtype=np.float)

        # --

        print('Constructing time series data set ..')
        # for rnn
        X_rnn_train = construct_time_steps(X_train[:-1], time_steps)
        X_rnn_test = construct_time_steps(X_test[:-1], time_steps)

        X_train = concatenate_time_steps(X_train[:-1], time_steps)
        Y_train = Y_train[time_steps:]

        X_test = concatenate_time_steps(X_test[:-1], time_steps)
        Y_test = Y_test[time_steps:]

        [X_train, Y_train] = higher(X_train, Y_train, interval_hours)
        [X_test, Y_test] = higher(X_test, Y_test, interval_hours)
        X_rnn_train = X_rnn_train[:len(X_train)]
        X_rnn_test = X_rnn_test[:len(X_test)]

        # delete data which have missing values
        i = 0
        while i < len(Y_test):
            # if Y_test[i] is missing (e.g. NaN), the comparison fails and
            # "not (...)" evaluates to True, so the row is dropped
            if not (Y_test[i] > -10000):
                Y_test = np.delete(Y_test, i, 0)
                X_test = np.delete(X_test, i, 0)
                X_rnn_test = np.delete(X_rnn_test, i, 0)
                i = -1
            i += 1
        Y_test = np.array(Y_test, dtype=np.float)

        # --

        X_rnn_train = np.array(X_rnn_train)
        X_rnn_test = np.array(X_rnn_test)
        X_train = np.array(X_train)
        Y_train = np.array(Y_train)
        X_test = np.array(X_test)

        np.random.seed(seed)
        np.random.shuffle(X_train)
        np.random.seed(seed)
        np.random.shuffle(Y_train)

        np.random.seed(seed)
        np.random.shuffle(X_rnn_train)

    else:  # is_training = false
        # mean and std
        fr = open(folder + "%s_parameter.pickle" % target_site, 'rb')
        [mean_X_train, std_X_train, mean_y_train,
         std_y_train] = (cPickle.load(fr)).split(',')
        mean_X_train = mean_X_train.replace('[', '').replace(']', '').replace(
            '\n', '').split(' ')
        while '' in mean_X_train:
            mean_X_train.pop(mean_X_train.index(''))
        mean_X_train = np.array(mean_X_train, dtype=np.float)
        std_X_train = std_X_train.replace('[', '').replace(']', '').replace(
            '\n', '').split(' ')
        while '' in std_X_train:
            std_X_train.pop(std_X_train.index(''))
        std_X_train = np.array(std_X_train, dtype=np.float)
        mean_y_train = float(mean_y_train)
        std_y_train = float(std_y_train)
        fr.close()

        # reading data
        print('preparing testing set ..')
        X_test = data
        X_test = missing_check(np.array(X_test))

        # normalize
        print('Normalize ..')
        if 0 in std_X_train:
            input("Denominator can't be 0.")
        X_test = np.array([(x_test - mean_X_train) / std_X_train
                           for x_test in X_test])

        # feature process
        if 'WIND_DIREC' in pollution_kind:
            index_of_kind = pollution_kind.index('WIND_DIREC')
            length_of_kind_list = len(pollution_kind)
            len_of_sites_list = len(site_list)
            X_test = X_test.tolist()
            for i in range(len(X_test)):
                for j in range(len_of_sites_list):
                    specific_index = index_of_kind + j * length_of_kind_list
                    coordin = data_coordinate_angle(
                        (X_test[i].pop(specific_index + j)) *
                        std_X_train[specific_index] +
                        mean_X_train[specific_index])
                    X_test[i].insert(specific_index, coordin[1])
                    X_test[i].insert(specific_index, coordin[0])
            X_test = np.array(X_test)

        # --

        print('Constructing time series data set ..')
        X_rnn_test = construct_time_steps(X_test, time_steps)
        X_test = concatenate_time_steps(X_test, time_steps)

        # --

        X_rnn_test = np.array(X_rnn_test)
        X_test = np.array(X_test)

    # -- xgboost --
    print('- xgboost -')

    filename = ("xgboost_%s_training_%s_m%s_to_%s_m%s_interval_%s" %
                (target_site, training_year[0], training_begining,
                 training_year[-1], training_deadline, interval_hours))
    print(filename)

    if is_training:
        xgb_model = xgb.XGBRegressor().fit(X_train, Y_train)

        fw = open(folder + filename, 'wb')
        cPickle.dump(xgb_model, fw)
        fw.close()
    else:
        fr = open(folder + filename, 'rb')
        xgb_model = cPickle.load(fr)
        fr.close()

    xgb_pred = xgb_model.predict(X_test)

    # print('rmse(xgboost): %.5f' % (np.mean((Y_test - (mean_y_train + std_y_train * xgb_pred))**2, 0)**0.5))

    # -- rnn --
    print('- rnn -')

    filename = ("sa_DropoutLSTM_%s_training_%s_m%s_to_%s_m%s_interval_%s" %
                (target_site, training_year[0], training_begining,
                 training_year[-1], training_deadline, interval_hours))
    print(filename)

    # Network Parameters
    time_steps = 12
    hidden_size = 20

    print("Expected args: p_W, p_U, p_dense, p_emb, weight_decay, batch_size")
    print("Using default args:")
    param = ["", "0.5", "0.5", "0.5", "0.5", "1e-6", "128"]
    args = [float(a) for a in param[1:]]
    print(args)
    p_W, p_U, p_dense, p_emb, weight_decay, batch_size = args
    batch_size = int(batch_size)

    # --

    print('Build rnn model...')
    start_time = time.time()
    rnn_model = Sequential()

    # layer 1
    rnn_model.add(
        BatchNormalization(epsilon=0.001,
                           mode=0,
                           axis=-1,
                           momentum=0.99,
                           weights=None,
                           beta_init='zero',
                           gamma_init='one',
                           gamma_regularizer=None,
                           beta_regularizer=None,
                           input_shape=(time_steps, input_size)))
    rnn_model.add(
        LSTM(hidden_size,
             W_regularizer=l2(weight_decay),
             U_regularizer=l2(weight_decay),
             b_regularizer=l2(weight_decay),
             dropout_W=p_W,
             dropout_U=p_U))  # return_sequences=True
    rnn_model.add(Dropout(p_dense))

    # output layer
    rnn_model.add(
        BatchNormalization(epsilon=0.001,
                           mode=0,
                           axis=-1,
                           momentum=0.99,
                           weights=None,
                           beta_init='zero',
                           gamma_init='one',
                           gamma_regularizer=None,
                           beta_regularizer=None))
    rnn_model.add(
        Dense(output_size,
              W_regularizer=l2(weight_decay),
              b_regularizer=l2(weight_decay)))

    # optimiser = SGD(lr=0.1, decay=1e-6, momentum=0.9, nesterov=False)
    optimiser = 'adam'
    rnn_model.compile(loss='mean_squared_error', optimizer=optimiser)

    final_time = time.time()
    time_spent_printer(start_time, final_time)

    if is_training:
        print("Train...")
        start_time = time.time()
        rnn_model.fit(X_rnn_train, Y_train, batch_size=batch_size, epochs=50)

        # Potentially save weights
        rnn_model.save_weights(folder + filename, overwrite=True)

        final_time = time.time()
        time_spent_printer(start_time, final_time)

    else:
        print('loading model ..')
        # print('loading model from %s' % (folder + filename + ".hdf5"))
        rnn_model.load_weights(folder + filename)

    rnn_pred = rnn_model.predict(X_rnn_test, batch_size=500, verbose=1)
    final_time = time.time()
    time_spent_printer(start_time, final_time)
    # print('rmse(rnn): %.5f' % (np.mean((np.atleast_2d(Y_test).T - (mean_y_train + std_y_train * rnn_pred))**2, 0)**0.5))

    # --  ensemble --

    print('stacking ..')
    if is_training:
        xgb_output = xgb_model.predict(X_train).reshape(len(X_train), 1)
        # rf_output = rf_model.predict(X_train).reshape(len(X_train), 1)
        rnn_output = rnn_model.predict(X_rnn_train, batch_size=500, verbose=1)
        # ensemble_X_train = np.hstack((X_train, xgb_output, rf_output, rnn_output))
        ensemble_X_train = np.hstack((X_train, xgb_output, rnn_output))

        Y_alert_train = [y * std_y_train + mean_y_train for y in Y_train]
        for element in range(len(Y_train)):
            if Y_alert_train[element] > high_alert:
                Y_alert_train[element] = 1  # [1, 0] = [high, low]
            else:
                Y_alert_train[element] = 0

    xgb_pred = xgb_pred.reshape(len(X_test), 1)
    # rf_pred = rf_pred.reshape(len(X_test), 1)
    rnn_pred = rnn_pred.reshape(len(X_test), 1)
    # ensemble_X_test = np.hstack((X_test, xgb_pred, rf_pred, rnn_pred))
    ensemble_X_test = np.hstack((X_test, xgb_pred, rnn_pred))

    # Y_alert_test = np.zeros(len(Y_test))
    # for element in range(len(Y_test)):
    #     if Y_test[element] > high_alert:
    #         Y_alert_test[element] = 1  # [1, 0] = [high, low]

    print('- ensemble -')
    filename = ("ensemble_%s_training_%s_m%s_to_%s_m%s_interval_%s" %
                (target_site, training_year[0], training_begining,
                 training_year[-1], training_deadline, interval_hours))
    filename2 = ("classification_%s_training_%s_m%s_to_%s_m%s_interval_%s" %
                 (target_site, training_year[0], training_begining,
                  training_year[-1], training_deadline, interval_hours))

    if is_training:
        ensemble_model = xgb.XGBRegressor().fit(ensemble_X_train, Y_train)
        classification_model = xgb.XGBClassifier().fit(ensemble_X_train,
                                                       Y_alert_train)

        fw = open(folder + filename, 'wb')
        cPickle.dump(ensemble_model, fw)
        fw.close()

        fw2 = open(folder + filename2, 'wb')
        cPickle.dump(classification_model, fw2)
        fw2.close()
    else:
        fr = open(folder + filename, 'rb')
        ensemble_model = cPickle.load(fr)
        fr.close()

        fr2 = open(folder + filename2, 'rb')
        classification_model = cPickle.load(fr2)
        fr2.close()

    pred = ensemble_model.predict(ensemble_X_test)
    alert_pred = classification_model.predict(ensemble_X_test)

    # --

    predictions = mean_y_train + std_y_train * pred
    # print('mse: %.5f' % mean_squared_error(Y_test, predictions))

    if is_training:
        print('rmse: %.5f' % (np.mean((Y_test - predictions)**2, 0)**0.5))

        def target_level(target, kind='PM2.5'):
            # target is a single pollutant reading (scalar)
            if kind == 'PM2.5':
                if (target >= 0) and (target < 11.5):  # 0-11
                    return 1
                elif (target >= 11.5) and (target < 23.5):  # 12-23
                    return 2
                elif (target >= 23.5) and (target < 35.5):  # 24-35
                    return 3
                elif (target >= 35.5) and (target < 41.5):  # 36-41
                    return 4
                elif (target >= 41.5) and (target < 47.5):  # 42-47
                    return 5
                elif (target >= 47.5) and (target < 53.5):  # 48-53
                    return 6
                elif (target >= 53.5) and (target < 58.5):  # 54-58
                    return 7
                elif (target >= 58.5) and (target < 64.5):  # 59-64
                    return 8
                elif (target >= 64.5) and (target < 70.5):  # 65-70
                    return 9
                elif target >= 70.5:  # others(71+)
                    return 10
                else:
                    print('error value: %d' % target)
                    return 1

        pred_label = np.zeros(len(predictions))
        real_target = np.zeros(len(Y_test))

        pred_label_true = 0.
        pred_label_false = 0.

        four_label_true = 0.0
        four_label_false = 0.0

        # calculate the accuracy of ten level
        for i in range(len(predictions)):
            pred_label[i] = target_level(predictions[i])
            real_target[i] = target_level(Y_test[i])

            if real_target[i] == pred_label[i]:
                pred_label_true += 1
            else:
                pred_label_false += 1

            # four label
            if 1 <= real_target[i] <= 3 and 1 <= pred_label[i] <= 3:
                four_label_true += 1
            elif 4 <= real_target[i] <= 6 and 4 <= pred_label[i] <= 6:
                four_label_true += 1
            elif 7 <= real_target[i] <= 9 and 7 <= pred_label[i] <= 9:
                four_label_true += 1
            elif real_target[i] >= 10 and pred_label[i] >= 10:
                four_label_true += 1
            else:
                four_label_false += 1

        # print('standard_prob_accuracy: %.5f' % (standard_prob_true / (standard_prob_true + standard_prob_false)))
        print('Ten level accuracy: %.5f' %
              (pred_label_true / (pred_label_true + pred_label_false)))
        print('Four level accuracy: %.5f' %
              (four_label_true / (four_label_true + four_label_false)))
        print('--')

        # --

        ha = 0.0  # observation high, predict high
        hb = 0.0  # observation low, predict high
        hc = 0.0  # observation high, predict low
        hd = 0.0  # observation low, predict low
        la = 0.0  # observation above low_alert, predicted above low_alert
        lb = 0.0
        lc = 0.0
        ld = 0.0
        alert_a = 0.0
        alert_b = 0.0
        alert_c = 0.0
        alert_d = 0.0
        integration_a = 0.0
        integration_b = 0.0
        integration_c = 0.0
        integration_d = 0.0

        for each_value in range(len(Y_test)):
            if Y_test[each_value] >= high_alert:  # observation high
                # regression
                if predictions[each_value] >= high_alert:  # forecast high (with tolerance)
                    ha += 1
                else:
                    hc += 1

                # classification
                if alert_pred[each_value]:  # [1, 0] = [high, low]
                    alert_a += 1
                else:
                    alert_c += 1

                # integration
                if alert_pred[each_value] or (predictions[each_value] >=
                                              high_alert):
                    integration_a += 1
                else:
                    integration_c += 1

            else:  # observation low
                # regression
                if predictions[each_value] >= high_alert:
                    hb += 1
                else:
                    hd += 1

                # classification
                if alert_pred[each_value]:
                    alert_b += 1
                else:
                    alert_d += 1

                # integration
                if alert_pred[each_value] or (predictions[each_value] >=
                                              high_alert):
                    integration_b += 1
                else:
                    integration_d += 1

            # --------------------------------------------------------

            if Y_test[each_value] >= low_alert:  # observation above low_alert
                if predictions[each_value] >= low_alert:
                    la += 1
                else:
                    lc += 1
            else:  # observation below low_alert
                if predictions[each_value] >= low_alert:
                    lb += 1
                else:
                    ld += 1

        # print('Two level accuracy: %f' % (two_label_true / (two_label_true + two_label_false)))
        print('high label: (%d, %d, %d, %d)' % (ha, hb, hc, hd))
        print('low label: (%d, %d, %d, %d)' % (la, lb, lc, ld))
        print('alert: (%d, %d, %d, %d)' % (alert_a, alert_b, alert_c, alert_d))

    return predictions
Example #6
# Scheduled sampling [optional]
if config.use_sched_samp:
    config.sample_prob = tf.get_variable("sample_prob",
                                         shape=(),
                                         initializer=tf.zeros_initializer())
sampling_burn_in = 400

# Training Parameters
training_steps = config.training_steps
batch_size = config.batch_size
display_step = 20
inp_steps = FLAGS.inp_steps
out_steps = FLAGS.out_steps

# Read Dataset
dataset, stats = read_data_sets(FLAGS.data_path, True, inp_steps, out_steps)

# Network Parameters
num_input = stats['num_input']  # dataset data input (time series dimension: 3)
num_steps = stats['num_steps']

if out_steps is None:
    # Forecast for the rest if horizon is not set
    out_steps = num_steps - inp_steps

# Print training config
print('-' * 100)
print('model', FLAGS.model, '|dataset|', FLAGS.data_path, '|input steps|',
      inp_steps, '|out steps|', out_steps, '|hidden size|', config.hidden_size,
      '|hidden layer|', config.num_layers, '|learning rate|',
      config.learning_rate, '|decay rate|', config.decay_rate, '|rank val|',
Example #7
# Training Parameters
config = TrainConfig()
# Update config with cmd args
config.use_error_prop = FLAGS.use_error_prop
config.hidden_size = FLAGS.hidden_size
config.learning_rate = FLAGS.learning_rate
config.num_steps = FLAGS.num_steps

training_steps = config.training_steps
display_step = 200
num_steps = config.num_steps
num_test_steps = config.num_test_steps
batch_size = config.batch_size

# Construct dataset
dataset, stats = read_data_sets(FLAGS.data_path, num_steps, num_steps)

# Network Parameters
num_input = stats['num_input']  # dataset data input (time series dimension: 3)

# Print exp settings
print('=' * 80)

print('|model|', FLAGS.model, '|batch size|', batch_size, '|learn rate|',
      FLAGS.learning_rate)

print('_' * 80)
print('|train steps|', num_steps, '|test steps|', num_test_steps,
      '|error prop|', config.use_error_prop)

print('=' * 80)
Example #8
A recurrent neural network (LSTM) implementation for multivariate time series forecasting.
Minimalist example using the TensorFlow library.

Links:
    [Long Short Term Memory](http://deeplearning.cs.cmu.edu/pdfs/Hochreiter97_lstm.pdf)
    [dataset Dataset](http://yann.lecun.com/exdb/dataset/).
"""

from __future__ import print_function

import tensorflow as tf
from tensorflow.contrib import rnn

# Import dataset data
from reader import read_data_sets
dataset = read_data_sets("./data.npy")
'''
To forecast time series with a recurrent neural network, we treat every row
as a sequence of short time series. Because this dataset's time series has 9
dimensions, we handle 9 sequences for every sample.
'''

# Training Parameters
learning_rate = 0.01
training_steps = 1000
batch_size = 128
display_step = 200

# Network Parameters
num_input = 3  # dataset data input (time series dimension: 3)
timesteps = 10  # timesteps
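The reader module that produces these sequences is not shown. As a rough sketch only (the make_windows helper below is hypothetical, not part of the original reader), a long multivariate series can be cut into LSTM-ready windows of shape (samples, timesteps, num_input) like this:

import numpy as np

def make_windows(series, timesteps):
    # series: array of shape (total_steps, num_input)
    windows = [series[i:i + timesteps]
               for i in range(len(series) - timesteps + 1)]
    return np.array(windows)  # shape: (samples, timesteps, num_input)

# e.g. 100 hourly steps with num_input = 3 and timesteps = 10
# yield an array of shape (91, 10, 3)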
Example #9
def rnn(pollution_kind, local, city, target_site, training_year, testing_year,
        training_duration, testing_duration, interval_hours, data,
        is_training):
    print('is_training(%s) = %s' % (target_site, is_training))
    # format of training_year and testing_year should be (start year)-(end year), like 2014-2015
    # format of training_duration and testing_duration should be (start date)-(end date), like 1/1-12/31

    # local = os.sys.argv[1]
    # city = os.sys.argv[2]
    site_list = pollution_site_map[local][city]

    # change format from   2014-2015   to   ['2014', '2015']
    training_year = [
        training_year[:training_year.index('-')],
        training_year[training_year.index('-') + 1:]
    ]
    testing_year = [
        testing_year[:testing_year.index('-')],
        testing_year[testing_year.index('-') + 1:]
    ]

    training_duration = [
        training_duration[:training_duration.index('-')],
        training_duration[training_duration.index('-') + 1:]
    ]
    testing_duration = [
        testing_duration[:testing_duration.index('-')],
        testing_duration[testing_duration.index('-') + 1:]
    ]
    # the label is the average over the next interval_hours hours; default is 1
    interval_hours = int(interval_hours)
    # is_training = os.sys.argv[9]   # True False

    # clear redundancy work
    if training_year[0] == training_year[1]:
        training_year.pop(1)
    if testing_year[0] == testing_year[1]:
        testing_year.pop(1)

    # Training Parameters
    # WIND_DIREC is a special feature that needs extra processing; for now it can only appear as an element of the input vector.
    # pollution_kind = ['PM2.5', 'O3', 'AMB_TEMP', 'RH', 'WIND_SPEED', 'WIND_DIREC']
    target_kind = 'PM2.5'
    data_update = False
    # batch_size = 24 * 7
    seed = 0

    # Network Parameters
    input_size = (len(site_list) * len(pollution_kind) + len(site_list)
                  if 'WIND_DIREC' in pollution_kind
                  else len(site_list) * len(pollution_kind))
    time_steps = 12
    hidden_size = 20
    output_size = 1

    # print("Expected args: p_W, p_U, p_dense, p_emb, weight_decay, batch_size, maxlen")
    # print("Using default args:")
    param = ["", "0.5", "0.5", "0.5", "0.5", "1e-6", "128", "200"]
    # args = [float(a) for a in sys.argv[1:]]
    args = [float(a) for a in param[1:]]
    # print(args)
    p_W, p_U, p_dense, p_emb, weight_decay, batch_size, maxlen = args
    batch_size = int(batch_size)
    maxlen = int(maxlen)
    testing_month = testing_duration[0][:testing_duration[0].index('/')]
    folder = root_path + "model/%s/%s/" % (local, city)
    filename = (
        "sa_DropoutLSTM_pW_%.2f_pU_%.2f_pDense_%.2f_pEmb_%.2f_reg_%f_batch_size_%d_cutoff_%d_epochs_%s_%sm_%sh"
        % (p_W, p_U, p_dense, p_emb, weight_decay, batch_size, maxlen,
           target_site, testing_month, interval_hours))
    print(filename)

    if is_training:
        # reading data
        print('Reading data for %s .. ' % target_site)
        start_time = time.time()
        print('preparing training set for %s ..' % target_site)
        X_train = read_data_sets(sites=site_list + [target_site],
                                 date_range=np.atleast_1d(training_year),
                                 beginning=training_duration[0],
                                 finish=training_duration[-1],
                                 feature_selection=pollution_kind,
                                 update=data_update)
        X_train = missing_check(X_train)
        Y_train = np.array(X_train)[:, -len(pollution_kind):]
        Y_train = Y_train[:, pollution_kind.index(target_kind)]
        X_train = np.array(X_train)[:, :-len(pollution_kind)]

        print('preparing testing set for %s..' % target_site)
        X_test = read_data_sets(sites=site_list + [target_site],
                                date_range=np.atleast_1d(testing_year),
                                beginning=testing_duration[0],
                                finish=testing_duration[-1],
                                feature_selection=pollution_kind,
                                update=data_update)
        Y_test = np.array(X_test)[:, -len(pollution_kind):]
        Y_test = Y_test[:, pollution_kind.index(target_kind)]
        X_test = missing_check(np.array(X_test)[:, :-len(pollution_kind)])

        final_time = time.time()
        print('Reading data for %s.. ok, ' % target_site, end='')
        time_spent_printer(start_time, final_time)

        print(len(X_train), 'train sequences')
        print(len(X_test), 'test sequences')

        if (len(X_train) < time_steps) or (len(X_test) < time_steps):
            input('time_steps(%d) too long.' % time_steps)

        # normalize
        print('Normalize for %s ..' % target_site)
        mean_X_train = np.mean(X_train, axis=0)
        std_X_train = np.std(X_train, axis=0)
        if 0 in std_X_train:
            input("Denominator can't be 0.(%s)" % target_site)
        X_train = np.array([(x_train - mean_X_train) / std_X_train
                            for x_train in X_train])
        X_test = np.array([(x_test - mean_X_train) / std_X_train
                           for x_test in X_test])

        mean_y_train = np.mean(Y_train)
        std_y_train = np.std(Y_train)
        if not std_y_train:
            input("Denominator can't be 0.(%s)" % target_site)
        Y_train = [(y - mean_y_train) / std_y_train for y in Y_train]
        print('mean_y_train: %f  std_y_train: %f (%s)' %
              (mean_y_train, std_y_train, target_site))

        fw = open(folder + filename + ".pickle", 'wb')
        cPickle.dump(
            str(mean_X_train) + ',' + str(std_X_train) + ',' +
            str(mean_y_train) + ',' + str(std_y_train), fw)
        fw.close()

        # feature process
        if 'WIND_DIREC' in pollution_kind:
            index_of_kind = pollution_kind.index('WIND_DIREC')
            length_of_kind_list = len(pollution_kind)
            len_of_sites_list = len(site_list)
            X_train = X_train.tolist()
            X_test = X_test.tolist()
            for i in range(len(X_train)):
                for j in range(len_of_sites_list):
                    specific_index = index_of_kind + j * length_of_kind_list
                    coordin = data_coordinate_angle(
                        (X_train[i].pop(specific_index + j)) *
                        std_X_train[specific_index] +
                        mean_X_train[specific_index])
                    X_train[i].insert(specific_index, coordin[1])
                    X_train[i].insert(specific_index, coordin[0])
                    if i < len(X_test):
                        coordin = data_coordinate_angle(
                            (X_test[i].pop(specific_index + j)) *
                            std_X_train[specific_index] +
                            mean_X_train[specific_index])
                        X_test[i].insert(specific_index, coordin[1])
                        X_test[i].insert(specific_index, coordin[0])
            X_train = np.array(X_train)
            X_test = np.array(X_test)
        Y_test = np.array(Y_test, dtype=np.float)

        # --
        print('Constructing time series data set for %s ..' % target_site)
        X_train = construct_time_steps(X_train[:-1], time_steps)
        Y_train = Y_train[time_steps:]
        reserve_hours = interval_hours - 1
        deadline = 0
        for i in range(len(Y_train)):
            # check whether enough future hours remain for this window
            if (len(Y_train) - i - 1) < reserve_hours:
                deadline = i
                break  # not enough
            for j in range(reserve_hours):
                Y_train[i] += Y_train[i + j + 1]
            Y_train[i] /= interval_hours
        if deadline:
            X_train = X_train[:deadline]
            Y_train = Y_train[:deadline]

        X_test = construct_time_steps(X_test[:-1], time_steps)
        Y_test = Y_test[time_steps:]
        deadline = 0
        for i in range(len(Y_test)):
            # check whether enough future hours remain for this window
            if (len(Y_test) - i - 1) < reserve_hours:
                deadline = i
                break  # not enough
            for j in range(reserve_hours):
                Y_test[i] += Y_test[i + j + 1]
            Y_test[i] /= interval_hours
        if deadline:
            X_test = X_test[:deadline]
            Y_test = Y_test[:deadline]

        # delete data which have missing values
        i = 0
        while i < len(Y_test):
            # if Y_test[i] is missing (e.g. NaN), the comparison fails and
            # "not (...)" evaluates to True, so the row is dropped
            if not (Y_test[i] > -10000):
                Y_test = np.delete(Y_test, i, 0)
                X_test = np.delete(X_test, i, 0)
                i = -1
            i += 1
        Y_test = np.array(Y_test, dtype=np.float)
        # --
        X_train = np.array(X_train)
        Y_train = np.array(Y_train)
        X_test = np.array(X_test)

        np.random.seed(seed)
        np.random.shuffle(X_train)
        np.random.seed(seed)
        np.random.shuffle(Y_train)

    # ------------------------------------
    else:
        fr = open(folder + filename + ".pickle", 'rb')
        [mean_X_train, std_X_train, mean_y_train,
         std_y_train] = (cPickle.load(fr)).split(',')
        mean_X_train = mean_X_train.replace('[', '').replace(']', '').replace(
            '\n', '').split(' ')
        while '' in mean_X_train:
            mean_X_train.pop(mean_X_train.index(''))
        mean_X_train = np.array(mean_X_train, dtype=np.float)
        std_X_train = std_X_train.replace('[', '').replace(']', '').replace(
            '\n', '').split(' ')
        while '' in std_X_train:
            std_X_train.pop(std_X_train.index(''))
        std_X_train = np.array(std_X_train, dtype=np.float)
        mean_y_train = float(mean_y_train)
        std_y_train = float(std_y_train)
        fr.close()

        # input data
        X_test = data

        # normalize
        print('Normalize for %s ..' % target_site)
        X_test = np.array([(x_test - mean_X_train) / std_X_train
                           for x_test in X_test])

        # feature process
        if 'WIND_DIREC' in pollution_kind:
            index_of_kind = pollution_kind.index('WIND_DIREC')
            length_of_kind_list = len(pollution_kind)
            len_of_sites_list = len(site_list)
            X_test = X_test.tolist()
            for i in range(len(X_test)):
                for j in range(len_of_sites_list):
                    specific_index = index_of_kind + j * length_of_kind_list
                    coordin = data_coordinate_angle(
                        (X_test[i].pop(specific_index + j)) *
                        std_X_train[specific_index] +
                        mean_X_train[specific_index])
                    X_test[i].insert(specific_index, coordin[1])
                    X_test[i].insert(specific_index, coordin[0])
            X_test = np.array([X_test])

    print('Build model for %s ..' % target_site)
    start_time = time.time()
    model = Sequential()
    model.add(
        DropoutLSTM(input_size,
                    hidden_size,
                    truncate_gradient=maxlen,
                    W_regularizer=l2(weight_decay),
                    U_regularizer=l2(weight_decay),
                    b_regularizer=l2(weight_decay),
                    p_W=p_W,
                    p_U=p_U))
    model.add(Dropout(p_dense))
    model.add(
        Dense(hidden_size,
              output_size,
              W_regularizer=l2(weight_decay),
              b_regularizer=l2(weight_decay)))

    # optimiser = SGD(lr=0.1, decay=1e-6, momentum=0.9, nesterov=False)
    optimiser = 'adam'
    model.compile(loss='mean_squared_error', optimizer=optimiser)
    final_time = time.time()
    time_spent_printer(start_time, final_time)

    # --

    if is_training:
        print("Train for %s .." % target_site)
        start_time = time.time()
        checkpointer = ModelCheckpoint(filepath=folder + filename + ".hdf5",
                                       verbose=1,
                                       append_epoch_name=False,
                                       save_every_X_epochs=50)
        modeltest_1 = ModelTest(X_train[:100],
                                mean_y_train +
                                std_y_train * np.atleast_2d(Y_train[:100]).T,
                                test_every_X_epochs=1,
                                verbose=0,
                                loss='euclidean',
                                mean_y_train=mean_y_train,
                                std_y_train=std_y_train,
                                tau=0.1)
        modeltest_2 = ModelTest(X_test,
                                np.atleast_2d(Y_test).T,
                                test_every_X_epochs=1,
                                verbose=0,
                                loss='euclidean',
                                mean_y_train=mean_y_train,
                                std_y_train=std_y_train,
                                tau=0.1)
        model.fit(X_train,
                  Y_train,
                  batch_size=batch_size,
                  nb_epoch=251,
                  callbacks=[checkpointer, modeltest_1, modeltest_2])
        # score, acc = model.evaluate(X_test, y_test, batch_size=batch_size, show_accuracy=True)
        # print('Test score:', score)
        # print('Test accuracy:', acc)

        # model.save_weights(folder+filename+"_250.hdf5", overwrite=True)
        final_time = time.time()
        time_spent_printer(start_time, final_time)

        # --

        print("Test for %s .." % target_site)
        standard_prob = model.predict(X_train, batch_size=500, verbose=1)
        print(
            np.mean(((mean_y_train + std_y_train * np.atleast_2d(Y_train).T) -
                     (mean_y_train + std_y_train * standard_prob))**2, 0)**0.5)

        # --

        standard_prob = model.predict(X_test, batch_size=500, verbose=1)
        T = 50
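        # Monte Carlo dropout: run T stochastic forward passes (dropout kept
        # active at test time) and average them to estimate the predictive mean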
        prob = np.array([
            model.predict_stochastic(X_test, batch_size=500, verbose=0)
            for _ in xrange(T)
        ])
        prob_mean = np.mean(prob, 0)
        print(
            np.mean((np.atleast_2d(Y_test).T -
                     (mean_y_train + std_y_train * standard_prob))**2, 0)**0.5)
        print(
            np.mean((np.atleast_2d(Y_test).T -
                     (mean_y_train + std_y_train * prob_mean))**2, 0)**0.5)

        standard_prob_pred = np.zeros(len(standard_prob))
        prob_mean_pred = np.zeros(len(prob_mean))
        real_target = np.zeros(len(Y_test))

        standard_prob_true = 0.
        standard_prob_false = 0.
        prob_mean_true = 0.
        prob_mean_false = 0.

        # calculate the accuracy of ten level
        for i in range(len(prob_mean)):
            standard_prob_pred[i] = target_level(mean_y_train +
                                                 std_y_train * standard_prob[i])
            prob_mean_pred[i] = target_level(mean_y_train +
                                             std_y_train * prob_mean[i])
            real_target[i] = target_level(Y_test[i])

            if real_target[i] == standard_prob_pred[i]:
                standard_prob_true += 1
            else:
                standard_prob_false += 1

            if real_target[i] == prob_mean_pred[i]:
                prob_mean_true += 1
            else:
                prob_mean_false += 1

        print('standard_prob_accuracy(%s): %.5f' %
              (target_site, standard_prob_true /
               ((standard_prob_true + standard_prob_false))))
        print('prob_mean_accuracy(%s): %.5f' %
              (target_site,
               (prob_mean_true / (prob_mean_true + prob_mean_false))))

        print('--')

        ha = 0.0  # observation high, predict high
        hb = 0.0  # observation low, predict high
        hc = 0.0  # observation high, predict low
        hd = 0.0  # observation low, predict low
        vha = 0.0  # observation very high, predict very high
        vhb = 0.0
        vhc = 0.0
        vhd = 0.0
        two_label_true = 0.0
        two_label_false = 0.0
        # statistic of status of prediction by forecast & observation
        for each_label in np.arange(len(real_target)):
            if real_target[each_label] >= 7:  # observation high
                if prob_mean_pred[each_label] >= 7:
                    ha += 1
                    two_label_true += 1
                else:
                    hc += 1
                    two_label_false += 1
            else:  # observation low
                if prob_mean_pred[each_label] >= 7:
                    hb += 1
                    two_label_false += 1
                else:
                    hd += 1
                    two_label_true += 1

            if real_target[each_label] >= 10:  # observation very high
                if prob_mean_pred[each_label] >= 10:
                    vha += 1
                else:
                    vhc += 1
            else:  # observation below the very-high level
                if prob_mean_pred[each_label] >= 10:
                    vhb += 1
                else:
                    vhd += 1

        print('Two level accuracy of %s : %f' %
              (target_site,
               (two_label_true / (two_label_true + two_label_false))))
        print('high label of %s: (%d, %d, %d, %d)' %
              (target_site, ha, hb, hc, hd))
        print('very high label of %s: (%d, %d, %d, %d)' %
              (target_site, vha, vhb, vhc, vhd))

        # plot the real trend and trend of prediction
        prediction = mean_y_train + std_y_train * prob_mean
        plt.plot(np.arange(len(prediction)),
                 Y_test[:len(prediction)],
                 c='gray')
        plt.plot(np.arange(len(prediction)), prediction, color='pink')

        plt.xticks(np.arange(0, len(prediction), 24))
        plt.yticks(np.arange(0, max(Y_test), 10))
        plt.grid(True)
        plt.rc('axes', labelsize=4)

    else:
        print('loading model for %s ..' % target_site)
        model.load_weights(folder + filename + ".hdf5")

        standard_prob = model.predict(X_test, batch_size=1, verbose=1)
        T = 50
        prob = np.array([
            model.predict_stochastic(X_test, batch_size=1, verbose=0)
            for _ in xrange(T)
        ])
        prob_mean = np.mean(prob, 0)

    return mean_y_train + std_y_train * prob_mean