Example #1
def process_data(df_raw_data, time_steps, train_num):
    """
    processing raw data
    :param df_raw_data:
    :param time_steps:
    :param train_num:
    :return:
    """
    # df_raw_data['PM25'].astype(float)
    # df_raw_data = df_raw_data[df_raw_data['PM25'].astype(float) < 100]
    # df_raw_data = drop_outlier(df_raw_data, ['PM25'], 6)
    # draw.draw_time_series(df_raw_data, ['PM25'])
    # time_steps = data.get_time_steps()
    df_raw_data = group_by_diff_time_span(df_raw_data, 'hour')
    max_time_step = max(time_steps.values())
    # pop the date features
    df_date = df_raw_data.pop('Month')
    if 'Day' in df_raw_data.columns:
        df_date = pd.concat([df_date, df_raw_data.pop('Day')], axis=1)
    if 'Hour' in df_raw_data.columns:
        df_date = pd.concat([df_date, df_raw_data.pop('Hour')], axis=1)
    df_date = df_date.loc[max_time_step:]

    # processing the sequence features
    # df_raw_data = df_raw_data[list(time_steps.keys())]
    df_raw_data = data.process_sequence_features(df_raw_data, 0, time_steps, max_time_step, padding_value=-1)
    df_raw_data = df_raw_data.loc[max_time_step:]

    # encoding the date features
    df_date_encoded = data.encoding_features(df_date, ['Month', 'Hour', 'Day'])

    # normalization
    y_scaled, y_scaler = data.min_max_scale(np.array(df_raw_data.pop('PM25')).reshape(-1, 1))
    X_scaled, X_scaler = data.min_max_scale(df_raw_data)
    date_encoded = np.array(df_date_encoded)

    # split the samples into train and test sets
    X_train = np.append(X_scaled[:train_num, :], date_encoded[:train_num, :], axis=1)
    X_test = np.append(X_scaled[train_num:, :], date_encoded[train_num:, :], axis=1)
    y_train = np.array(y_scaled[:train_num]).ravel()
    y_test = np.array(y_scaled[train_num:]).ravel()

    return X_train, X_test, y_train, y_test, y_scaler
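
A minimal call sketch for this example, assuming the project's helpers (`data`, `group_by_diff_time_span`) are importable and the CSV carries Month/Day/Hour columns plus the sequence features; the path and per-feature windows below are hypothetical:

import pandas as pd

# hypothetical input file; any CSV with Month/Day/Hour and the sequence
# features named in time_steps would do
df = pd.read_csv('../DataSet/Processed/Train/pm25.csv', dtype=str)
time_steps = {'PM25': 4, 'Press': 4}  # hypothetical per-feature look-back windows
X_train, X_test, y_train, y_test, y_scaler = process_data(
    df, time_steps, train_num=365 * 24)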
Example #2
def load_data(path, time_steps, lstm_num, cols=None, dtype=str):
    """
    load data
    :param path: data file path
    :param cols: which features
    :return: X, X_scaler, y, y_scaler
    """
    df_raw = pd.read_csv(path, usecols=cols, dtype=dtype)
    df_date = df_raw.pop('Month')
    df_date = pd.concat([df_date, df_raw.pop('Day')], axis=1)
    df_date = pd.concat([df_date, df_raw.pop('Hour')], axis=1)
    df_date = df_date.loc[time_steps:]

    # processing the sequence features
    df_raw = data.process_sequence_features(df_raw, time_steps=time_steps)
    # encoding the date features
    df_date_encode = data.encoding_features(df_date, ['Month', 'Hour', 'Day'])

    # normalization
    y_scaled, y_scaler = data.min_max_scale(
        np.array(df_raw.pop('PM25')).reshape(-1, 1))
    X_scaled, X_scaler = data.min_max_scale(df_raw)
    date_encode = np.array(df_date_encode)

    # reshape y
    y = y_scaled.reshape((y_scaled.shape[0], 1, y_scaled.shape[1]))
    # reshape X
    X_scaled = X_scaled.reshape((X_scaled.shape[0], 1, X_scaled.shape[1]))
    date_encode = date_encode.reshape(
        (date_encode.shape[0], 1, date_encode.shape[1]))
    X = []
    # split: each group of time-series features (PM2.5, Press, etc.) feeds
    # its own LSTM input
    split_size = X_scaled.shape[2] // lstm_num
    for i in range(lstm_num):
        X.append(X_scaled[:, :, i * split_size:(i + 1) * split_size])
    # the date/time features go to a separate input
    X.append(date_encode)

    return X, X_scaler, y, y_scaler
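
The loop above just slices the scaled feature matrix into lstm_num equal-width blocks along the feature axis, one per LSTM branch. A self-contained numpy illustration of that split (shapes are made up):

import numpy as np

X_scaled = np.random.rand(100, 1, 12)        # (samples, 1, features)
lstm_num = 3
split_size = X_scaled.shape[2] // lstm_num   # 4 features per branch
X = [X_scaled[:, :, i * split_size:(i + 1) * split_size]
     for i in range(lstm_num)]
# each X[i] has shape (100, 1, 4); appending the encoded date features
# gives the lstm_num + 1 inputs the multi-input model expects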
Example #3
def process_data(df_raw, time_steps, train_num):
    max_time_step = max(time_steps.values())
    # pop the date features
    df_date = df_raw.pop('Month')
    df_date = pd.concat([df_date, df_raw.pop('Day')], axis=1)
    df_date = pd.concat([df_date, df_raw.pop('Hour')], axis=1)
    df_date = df_date.loc[max_time_step:]

    # processing the sequence features
    df_raw = data.process_sequence_features(df_raw, time_steps=time_steps)
    df_raw = df_raw.loc[max_time_step:]
    # encoding the date features
    df_date_encode = data.encoding_features(df_date, ['Month', 'Hour', 'Day'])

    # normalization
    y_scaled, y_scaler = data.min_max_scale(np.array(df_raw.pop('PM25')).reshape(-1, 1))
    X_scaled, X_scaler = data.min_max_scale(df_raw)
    date_encode = np.array(df_date_encode)

    # reshape y
    train_y = y_scaled[:train_num]
    test_y = y_scaled[train_num:]
    train_y = train_y.reshape((train_y.shape[0], 1, train_y.shape[1]))
    test_y = test_y.reshape((test_y.shape[0], 1, test_y.shape[1]))
    # reshape X
    X_scaled = X_scaled.reshape((X_scaled.shape[0], 1, X_scaled.shape[1]))
    date_encode = date_encode.reshape((date_encode.shape[0], 1, date_encode.shape[1]))
    train_X = []
    test_X = []
    # the time-series features (PM2.5, Press, etc.) feed a single LSTM input
    train_X.append(X_scaled[:train_num, :, :])
    test_X.append(X_scaled[train_num:, :, :])
    # the date/time features go to a separate input
    train_X.append(date_encode[:train_num, :, :])
    test_X.append(date_encode[train_num:, :, :])

    return train_X, test_X, train_y, test_y, y_scaler
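
process_data returns y_scaler so the scaled predictions can be mapped back to real PM2.5 values. A self-contained sketch of that inverse step, assuming data.min_max_scale wraps sklearn's MinMaxScaler (the fitted range and prediction array below are made up):

import numpy as np
from sklearn.preprocessing import MinMaxScaler

# stand-in for the scaler that data.min_max_scale presumably returns
y_scaler = MinMaxScaler().fit(np.array([[0.0], [100.0]]))
y_pred_scaled = np.random.rand(24, 1, 1)     # stand-in for model.predict(test_X)
y_pred = y_scaler.inverse_transform(y_pred_scaled.reshape(-1, 1)).ravel()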
Example #4
def train(df_raw, time_steps=1, train_num=365 * 24):
    # processing the sequence features
    df_raw = data.process_sequence_features(df_raw, time_steps=time_steps)

    # normalization
    y_scaled, y_scaler = data.min_max_scale(
        np.array(df_raw.pop('PM25')).reshape(-1, 1))
    X_scaled, X_scaler = data.min_max_scale(df_raw)

    # split data to train data and test data
    train_X, train_y, test_X, test_y = data.split_data(X_scaled,
                                                       y_scaled,
                                                       train_num=train_num)

    # reshape data
    train_X = train_X.reshape((train_X.shape[0], 1, train_X.shape[1]))
    train_y = train_y.reshape((train_y.shape[0], 1, train_y.shape[1]))
    test_X = test_X.reshape((test_X.shape[0], 1, test_X.shape[1]))
    test_y = test_y.reshape((test_y.shape[0], 1, test_y.shape[1]))

    # build a Sequential model
    model = Sequential()
    model.add(
        LSTM(50,
             input_shape=(train_X.shape[1], train_X.shape[2]),
             return_sequences=True))
    model.add(Dropout(0.3))
    model.add(LSTM(100, return_sequences=True))
    model.add(Dropout(0.3))
    model.add(LSTM(100, return_sequences=True))
    model.add(Dropout(0.3))
    model.add(LSTM(50, return_sequences=True))
    model.add(Dropout(0.3))
    model.add(Dense(units=1024, activation='linear'))
    # model.add(BatchNormalization())
    model.add(Dropout(0.5))
    model.add(Dense(units=1024, activation='linear'))
    # model.add(BatchNormalization())
    model.add(Dropout(0.5))
    model.add(Dense(units=1))
    model.compile(loss='mse', optimizer='RMSprop')
    history = model.fit(train_X,
                        train_y,
                        epochs=100,
                        batch_size=1024,
                        validation_data=(test_X, test_y),
                        verbose=2,
                        shuffle=False)

    # draw the loss curve
    plt.figure(1)
    plt.plot(history.history['loss'], label='train')
    plt.plot(history.history['val_loss'], label='test')

    # draw to compare the original data and the predicted data, and print the evaluation metrics
    pred_y = model.predict(test_X)
    test_y = data.inverse_to_original_data(train_y.reshape(1, -1),
                                           test_y.reshape(1, -1),
                                           scaler=y_scaler,
                                           train_num=train_num)
    pred_y = data.inverse_to_original_data(train_y.reshape(1, -1),
                                           pred_y.reshape(1, -1),
                                           scaler=y_scaler,
                                           train_num=train_num)
    evaluate.print_metrics(test_y, pred_y)
    evaluate.print_curve(test_y, pred_y)
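
A stripped-down, runnable shape check of the stacked-LSTM pattern used above, on random data (layer sizes reduced; the keras imports are assumed to match the originals):

import numpy as np
from keras.models import Sequential
from keras.layers import LSTM, Dropout, Dense

train_X = np.random.rand(32, 1, 8)   # (samples, timesteps=1, features)
train_y = np.random.rand(32, 1, 1)   # targets keep the same rank
model = Sequential()
model.add(LSTM(16, input_shape=(1, 8), return_sequences=True))
model.add(Dropout(0.3))
model.add(Dense(units=1))            # applied per time step -> (32, 1, 1)
model.compile(loss='mse', optimizer='RMSprop')
model.fit(train_X, train_y, epochs=1, batch_size=8, verbose=0)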
Example #5
def main(is_train=True):
    data_path = '../DataSet/Processed/Train/261630033_2016_2017_v1.csv'
    # model_path = '../Models/Test/model.best.json'
    # weight_path = '../Models/Test/weights.best.hdf5'
    model_path = '../Models/Test/model_epochs10_batch24.best.json'
    weight_path = '../Models/Test/weights_epochs10_batch24.best.hdf5'
    df_raw = data.get_raw_data(data_path, ['PM25'], dtype=float)
    seq_data = np.array(df_raw).reshape(1, -1)[0]
    test_split = 0.4
    time_steps = 4
    new_data = []
    for i in range(len(df_raw) - time_steps):
        new_data.append(list(seq_data[i:i + time_steps + 1]))
    new_data = np.array(new_data)
    train_num = int(len(new_data) * (1 - test_split))

    y_scaled, y_scaler = data.min_max_scale(new_data[:, -1].reshape(-1, 1))
    X_scaled, X_scaler = data.min_max_scale(new_data[:, 0:time_steps])

    y_train = y_scaled[:train_num, :].reshape(1, -1)[0]
    y_test = y_scaled[train_num:, :].reshape(1, -1)[0]
    X_train = X_scaled[:train_num, :]
    X_test = X_scaled[train_num:, :]
    X_train = X_train.reshape(X_train.shape[0], time_steps, 1)
    X_test = X_test.reshape(X_test.shape[0], time_steps, 1)

    if is_train:
        if os.path.exists(model_path):
            with open(model_path) as f:
                json_string = f.read()
            model = model_from_json(json_string)
            # load the saved weights if they exist
            if os.path.exists(weight_path):
                print('load weights ' + weight_path)
                model.load_weights(weight_path)
        else:
            model = Sequential()
            model.add(
                LSTM(32,
                     input_shape=(X_train.shape[1], X_train.shape[2]),
                     return_sequences=True))
            model.add(LSTM(32, return_sequences=False))
            model.add(Dense(units=64, activation='linear'))
            model.add(Dense(units=1))
            with open(model_path, 'w') as f:
                f.write(model.to_json())
        model.compile(loss='mse', optimizer='RMSprop')
        checkpoint = ModelCheckpoint(weight_path,
                                     monitor='val_loss',
                                     verbose=1,
                                     save_best_only=True,
                                     mode='min')
        callbacks_list = [checkpoint]
        history = model.fit(X_train,
                            y_train,
                            epochs=20,
                            batch_size=24,
                            validation_data=(X_test, y_test),
                            verbose=1,
                            callbacks=callbacks_list,
                            shuffle=False)

        evaluate.draw_loss_curve(figure_num='PM2.5',
                                 train_loss=history.history['loss'],
                                 val_loss=history.history['val_loss'])
    else:
        with open(model_path) as f:
            json_string = f.read()
        model = model_from_json(json_string)
        model.load_weights(weight_path)
        y_pred = model.predict(X_test)
        y_true = y_scaler.inverse_transform(y_test.reshape(-1, 1))
        y_pred = y_scaler.inverse_transform(y_pred.reshape(-1, 1))
        # align: compare y_true[t] against y_pred[t + 1]
        df_all_metrics = evaluate.all_metrics(y_true[:-1], y_pred[1:])
        evaluate.draw_fitting_curve(y_true[:-1], y_pred[1:])
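
The windowing loop at the top of this example turns a univariate series into supervised samples: each row holds time_steps past readings plus the value to predict. A self-contained numpy version:

import numpy as np

seq = np.arange(10, dtype=float)     # stand-in for the PM2.5 series
time_steps = 4
windows = np.array([seq[i:i + time_steps + 1]
                    for i in range(len(seq) - time_steps)])
X, y = windows[:, :time_steps], windows[:, -1]
# X[k] holds the 4 readings before y[k]; X is later reshaped to
# (samples, time_steps, 1) for the LSTM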
Example #6
def train(df_raw,
          model_path,
          weight_path,
          lstm_config,
          dense_config,
          epochs=100,
          batch_size=100,
          time_steps=1,
          test_split=0.3):
    # pop the date features
    df_date = df_raw.pop('Month')
    df_date = pd.concat([df_date, df_raw.pop('Day')], axis=1)
    df_date = pd.concat([df_date, df_raw.pop('Hour')], axis=1)
    df_date = df_date.loc[time_steps:]

    # processing the sequence features
    df_raw = data.process_sequence_features(df_raw, time_steps=time_steps)
    # encoding the date features
    df_date_encode = data.encoding_features(df_date, ['Month', 'Hour', 'Day'])

    # normalization
    y_scaled, y_scaler = data.min_max_scale(
        np.array(df_raw.pop('PM25')).reshape(-1, 1))
    X_scaled, X_scaler = data.min_max_scale(df_raw)
    date_encode = np.array(df_date_encode)

    # index where the train/test split falls
    train_num = int(len(X_scaled) * (1 - test_split))
    # reshape y
    train_y = y_scaled[:train_num]
    test_y = y_scaled[train_num:]
    train_y = train_y.reshape((train_y.shape[0], 1, train_y.shape[1]))
    test_y = test_y.reshape((test_y.shape[0], 1, test_y.shape[1]))
    # reshape X
    X_scaled = X_scaled.reshape((X_scaled.shape[0], 1, X_scaled.shape[1]))
    date_encode = date_encode.reshape(
        (date_encode.shape[0], 1, date_encode.shape[1]))
    train_X = []
    test_X = []
    # split: each group of time-series features (PM2.5, Press, etc.) feeds
    # its own LSTM input
    for i in range(lstm_config['num']):
        train_X.append(X_scaled[:train_num, :,
                                i * time_steps:(i + 1) * time_steps])
        test_X.append(X_scaled[train_num:, :,
                               i * time_steps:(i + 1) * time_steps])
    # the date/time features go to a separate input
    train_X.append(date_encode[:train_num, :, :])
    test_X.append(date_encode[train_num:, :, :])

    # build model
    model = build_model(model_path, weight_path, lstm_config, dense_config,
                        time_steps)

    # checkpoint
    checkpoint = ModelCheckpoint(weight_path,
                                 monitor='val_loss',
                                 verbose=1,
                                 save_best_only=True,
                                 mode='min')
    callbacks_list = [checkpoint]
    history = model.fit(train_X,
                        train_y,
                        epochs=epochs,
                        batch_size=batch_size,
                        validation_data=(test_X, test_y),
                        verbose=1,
                        callbacks=callbacks_list,
                        shuffle=False)

    # draw the loss curve
    plt.figure(0)
    plt.plot(history.history['loss'], label='train')
    plt.plot(history.history['val_loss'], label='test')

    # draw to compare the original data and the predicted data, and print the evaluation metrics
    pred_y = model.predict(test_X)
    test_y = data.inverse_to_original_data(train_y.reshape(1, -1),
                                           test_y.reshape(1, -1),
                                           scaler=y_scaler,
                                           train_num=train_num)
    pred_y = data.inverse_to_original_data(train_y.reshape(1, -1),
                                           pred_y.reshape(1, -1),
                                           scaler=y_scaler,
                                           train_num=train_num)
    return test_y, pred_y