Example #1
def main():
    # load data
    print("loading data...")
    ts = time.time()
    datapath = os.path.join(Paramater.DATAPATH, "2016", month)
    if is_mmn:
        fname = os.path.join(datapath, 'CACHE',
                             'TaxiBJ_C{}_P{}_T{}_{}_mmn_speed.h5'.format(len_closeness, len_period, len_trend,
                                                                         "External" if hasExternal else "noExternal"))
    else:
        fname = os.path.join(datapath, 'CACHE',
                             'TaxiBJ_C{}_P{}_T{}_{}_speed.h5'.format(len_closeness, len_period, len_trend,
                                                                     "External" if hasExternal else "noExternal"))
    pkl = fname + '.preprocessing_speed.pkl'
    fn = "48_48_20_LinearInterpolationFixed"
    if os.path.exists(fname) and CACHEDATA:
        X_train, Y_train, X_test, Y_test, mmn, external_dim, \
        timestamp_train, timestamp_test, noConditionRegions, x_num, y_num, z_num = read_cache(fname, is_mmn,
                                                                                              pkl)
        print("load %s successfully" % fname)
    else:
        datapaths = [os.path.join(datapath, fn)]
        noConditionRegionsPath = os.path.join(datapath, "48_48_20_noSpeedRegion_0.05")
        X_train, Y_train, X_test, Y_test, mmn, external_dim, timestamp_train, timestamp_test, noConditionRegions, \
        x_num, y_num, z_num = Data.loadDataFromRaw(
                paths=datapaths, noSpeedRegionPath=noConditionRegionsPath, nb_flow=nb_flow, len_closeness=len_closeness,
                len_period=len_period, len_trend=len_trend
                , len_test=len_test, maxMinNormalization=is_mmn, preprocess_name=pkl,
                meta_data=hasExternal,
                meteorol_data=hasExternal,
                holiday_data=hasExternal, isComplete=False)

        if CACHEDATA:
            cache(fname, X_train, Y_train, X_test, Y_test,
                  external_dim, timestamp_train, timestamp_test, noConditionRegions, is_mmn, x_num, y_num,
                  nb_flow)
    z_num = nb_flow
    # print("\n days (test): ", [v[:8] for v in timestamp_test[0::72]])
    print("\nelapsed time (loading data): %.3f seconds\n" % (time.time() - ts))

    print('=' * 10)
    print("compiling model_train...")
    print("**the first compilation may take a few minutes if you use [Theano] as the backend**")

    ts = time.time()

    X_train, Y_train = Data.getSequenceXY(X_train, Y_train, step)
    Y_train_final = Y_train[:, -1]
    X_test, Y_test = Data.getSequenceXY(X_test, Y_test, step)
    Y_test_final = Y_test[:, -1]
    X_train.append(Y_train)
    X_test.append(Y_test)
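    # The target sequences are appended to the model inputs because the eRNN
    # head built below takes [predict_sequence, input_targets]: it needs the
    # ground-truth sequence as an extra input while training.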

    timestamp_train = timestamp_train[step - 1:]
    timestamp_test = timestamp_test[step - 1:]

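    # use_diff_test: rebuild the test set from "hard" timestamps only, i.e. rows
    # of <fn>_diff.csv whose "diff" value exceeds 200; matching samples are
    # gathered from both the original train and test splits.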
    if use_diff_test:
        X_test_old = X_test
        Y_test_old = Y_test
        import pandas as pd
        df_diff = pd.read_csv("./data/2016/all/" + fn + "_diff.csv", index_col=0)
        # timestamps with diff > 200 (335 of them) are used as the test set
        test_time = df_diff[df_diff["diff"] > 200]["time"].values
        timestamp_train_dict = dict(zip(timestamp_train, range(len(timestamp_train))))
        timestamp_test_dict = dict(zip(timestamp_test, range(len(timestamp_test))))
        new_X_test = []
        new_Y_test = []
        if isinstance(X_train, list):
            for _ in range(len(X_train)):
                new_X_test.append([])
        for _test_time in test_time:
            _test_time = str(_test_time)
            if (_test_time in timestamp_train_dict):
                index = timestamp_train_dict[_test_time]
                if isinstance(X_train, list):
                    for i in range(len(X_train)):
                        new_X_test[i].append(X_train[i][index])
                else:
                    new_X_test.append(X_train[index])
                new_Y_test.append(Y_train[index])

            if (_test_time in timestamp_test_dict):
                index = timestamp_test_dict[_test_time]
                if isinstance(X_test_old, list):
                    for i in range(len(X_test_old)):
                        new_X_test[i].append(X_test_old[i][index])
                else:
                    new_X_test.append(X_test_old[index])
                new_Y_test.append(Y_test_old[index])

                # if (_test_time not in timestamp_train_dict and _test_time not in timestamp_test_dict):
                #     print(_test_time)

        if isinstance(new_X_test, list):
            for i in range(len(new_X_test)):
                new_X_test[i] = np.stack(new_X_test[i], axis=0)
        else:
            new_X_test = np.stack(new_X_test, axis=0)
        new_Y_test = np.stack(new_Y_test, axis=0)

        # if isinstance(new_X_test, list):
        #     for i in range(len(new_X_test)):
        #         print(new_X_test[i].shape)
        # else:
        #     print(new_X_test.shape)
        # print(new_Y_test.shape)
        X_test = new_X_test
        Y_test = new_Y_test
        Y_test_final = Y_test[:, -1]

    # print "X_test len:", len(X_test)
    # for x in X_test:
    #     print x.shape
    # print Y_test.shape
    # print z_num, x_num, y_num
    print "start build model_train"

    outputs = []
    inputs = []

    resUnit_share_layers = []
    resUnit_share_layers2 = []
    resUnit_share_layers3 = []
    # merge() no longer exists in Keras 2; the functional add/concatenate
    # helpers are used instead when fusing component outputs below.
    from keras.layers import add, concatenate

    shared_conv1 = Convolution2D(filters=64, kernel_size=(3, 3), padding="same")
    shared_conv2 = Convolution2D(filters=nb_flow, kernel_size=(3, 3), padding="same")

    shared_conv3 = Convolution2D(filters=64, kernel_size=(3, 3), padding="same")
    shared_conv4 = Convolution2D(filters=nb_flow, kernel_size=(3, 3), padding="same")
    shared_conv5 = Convolution2D(filters=64, kernel_size=(3, 3), padding="same")
    shared_conv6 = Convolution2D(filters=nb_flow, kernel_size=(3, 3), padding="same")

    shared_convLSTM_period = ConvLSTM2D(filters=32, kernel_size=(3, 3), padding="same")
    shared_conv_period = Convolution2D(filters=nb_flow, kernel_size=(3, 3), padding="same")

    shared_convLSTM_trend = ConvLSTM2D(filters=32, kernel_size=(3, 3), padding="same")
    shared_conv_trend = Convolution2D(filters=nb_flow, kernel_size=(3, 3), padding="same")

    shared_ilayers = []

    shared_embedding = Dense(units=10)
    shared_embedding2 = Dense(units=nb_flow * x_num * y_num)

    assert l < step
    for _ in range(step):
        main_outputs = []
        if len_closeness > 0:
            input = Input(shape=(nb_flow * len_closeness, x_num, y_num))
            inputs.append(input)
            # Conv1
            conv1 = shared_conv1(input)
            # [nb_residual_unit] Residual Units
            resUnit_share_index = [0]
            residual_output = ResUnits(_residual_unit, nb_filter=64, repetations=nb_residual_unit, share=True,
                                       shareIndex=resUnit_share_index, shares=resUnit_share_layers)(conv1)
            # Conv2
            activation = Activation('relu')(residual_output)
            conv2 = shared_conv2(activation)
            main_outputs.append(conv2)

            # input = Input(shape=(nb_flow * len_closeness, x_num, y_num))
            # inputs.append(input)
            # # conv1 = Convolution2D(nb_filter=64, nb_row=3, nb_col=3, border_mode="same")(input)
            # # act1 = Activation("relu")(conv1)
            # reshape = Reshape((len_closeness, nb_flow, x_num, y_num))(input)
            # convLSTM = ConvLSTM2D(nb_filter=32, nb_row=3, nb_col=3, border_mode="same")(reshape)
            # act2 = Activation("relu")(convLSTM)
            # conv2 = Convolution2D(nb_filter=nb_flow, nb_row=3, nb_col=3, border_mode="same")(act2)
            # main_outputs.append(conv2)

        if len_period > 0:
            input = Input(shape=(nb_flow * len_period, x_num, y_num))
            inputs.append(input)
            # Conv1
            conv1 = shared_conv3(input)
            # [nb_residual_unit] Residual Units
            resUnit_share_index = [0]
            residual_output = ResUnits(_residual_unit, nb_filter=64, repetations=nb_residual_unit, share=True,
                                       shareIndex=resUnit_share_index, shares=resUnit_share_layers2)(conv1)
            # Conv2
            activation = Activation('relu')(residual_output)
            conv2 = shared_conv4(activation)
            main_outputs.append(conv2)
            # input = Input(shape=(nb_flow * len_period, x_num, y_num))
            # inputs.append(input)
            # # conv1 = Convolution2D(nb_filter=64, nb_row=3, nb_col=3, border_mode="same")(input)
            # # act1 = Activation("relu")(conv1)
            # input = Reshape((len_period, nb_flow, x_num, y_num))(input)
            # convLSTM = shared_convLSTM_period(input)
            # act2 = Activation("relu")(convLSTM)
            # conv2 = shared_conv_period(act2)
            # main_outputs.append(conv2)

        if len_trend > 0:
            input = Input(shape=(nb_flow * len_trend, x_num, y_num))
            inputs.append(input)
            # Conv1
            conv1 = shared_conv5(input)
            # [nb_residual_unit] Residual Units
            resUnit_share_index = [0]
            residual_output = ResUnits(_residual_unit, nb_filter=64, repetations=nb_residual_unit, share=True,
                                       shareIndex=resUnit_share_index, shares=resUnit_share_layers3)(conv1)
            # Conv2
            activation = Activation('relu')(residual_output)
            conv2 = shared_conv6(activation)
            main_outputs.append(conv2)
            # input = Input(shape=(nb_flow * len_trend, x_num, y_num))
            # inputs.append(input)
            # # conv1 = Convolution2D(nb_filter=64, nb_row=3, nb_col=3, border_mode="same")(input)
            # # act1 = Activation("relu")(conv1)
            # reshape = Reshape((len_trend, nb_flow, x_num, y_num))(input)
            # convLSTM = shared_convLSTM_trend(reshape)
            # act2 = Activation("relu")(convLSTM)
            # conv2 = shared_conv_trend(act2)
            # main_outputs.append(conv2)

        if len(main_outputs) == 1:
            main_output = main_outputs[0]
        else:
            new_outputs = []
            for index, output in enumerate(main_outputs):
                if (len(shared_ilayers) <= index):
                    shared_ilayers.append(iLayer())

                new_outputs.append(shared_ilayers[index](output))
            main_output = add(new_outputs)

        if external_dim is not None and external_dim > 0:
            # external input
            external_input = Input(shape=(external_dim,))
            inputs.append(external_input)
            embedding = shared_embedding(external_input)
            embedding = Activation('relu')(embedding)
            h1 = shared_embedding2(embedding)
            activation = Activation('relu')(h1)
            external_output = Reshape((nb_flow, x_num, y_num))(activation)
            main_output = add([main_output, external_output])

        main_output = Activation('tanh')(main_output)
        outputs.append(main_output)

    main_output = concatenate(outputs, axis=1)
    predict_sequence = Reshape((step, z_num, x_num, y_num))(main_output)

    input_targets = Input(shape=(step, z_num, x_num, y_num), name="input_targets")
    inputs.append(input_targets)
    main_output = eRNN(error_hidden_dim, (z_num, x_num, y_num), l, False)([predict_sequence, input_targets])
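    # The model has two outputs: the raw per-step prediction sequence and the
    # eRNN-corrected final prediction. The corrected output dominates training
    # (loss_weights=[0.2, 1] below).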

    model_train = Model(inputs=inputs, outputs=[predict_sequence, main_output])
    adam = Adam(lr=lr)
    model_train.compile(loss=['mse', 'mse'],
                        loss_weights=[0.2, 1],
                        optimizer=adam,
                        metrics=[metrics.rmse])
    # model_train.compile(loss=lambda y_true,y_preiod: K.mean(K.square(y_preiod - y_true), axis=-1), optimizer=adam, metrics=[metrics.rmse])
    # model_predict = Model(input=inputs, output=main_output)
    # model_predict.compile(optimizer=adam,loss="mse",metrics=metrics.rmse)
    model_train.summary()
    print "finish build model_train"

    hyperparams_name = 'testMyModel3_speed.c{}.p{}.t{}.resunit{}.lr{}.{}.{}'.format(
            len_closeness, len_period, len_trend, nb_residual_unit, lr,
            "External" if hasExternal else "noExternal",
            "MMN" if is_mmn else "noMMN")

    fname_param = os.path.join(path_model, '{}.best.h5'.format(hyperparams_name))
    early_stopping = EarlyStopping(monitor='val_e_rnn_1_rmse', patience=4, mode='min')
    model_checkpoint = ModelCheckpoint(fname_param, monitor='val_e_rnn_1_rmse', verbose=0, save_best_only=True,
                                       mode='min',
                                       save_weights_only=True)

    print("\nelapsed time (compiling model_train): %.3f seconds\n" %
          (time.time() - ts))

    print('=' * 10)
    print("training model_train...")
    ts = time.time()

    history = model_train.fit(X_train, [Y_train, Y_train_final],
                              epochs=nb_epoch,
                              batch_size=batch_size,
                              validation_split=0.1,
                              callbacks=[early_stopping, model_checkpoint],
                              verbose=1)

    model_train.save_weights(os.path.join(
            path_model, '{}.h5'.format(hyperparams_name)), overwrite=True)
    pickle.dump((history.history), open(os.path.join(
            path_result, '{}.history.pkl'.format(hyperparams_name)), 'wb'))
    print("\nelapsed time (training): %.3f seconds\n" % (time.time() - ts))

    print('=' * 10)
    print('evaluating using the model_train that has the best loss on the valid set')
    ts = time.time()
    model_train.load_weights(fname_param)
    score = model_train.evaluate(X_train, [Y_train, Y_train_final], batch_size=Y_train.shape[0] // 48, verbose=0)

    if is_mmn:
        print('Train score: %.6f rmse (norm): %.6f rmse (real): %.6f' %
              (score[0], score[1], score[1] * (mmn._max - mmn._min) / 2.))
    else:
        print('Train score: %.6f rmse (real): %.6f' %
              (score[0], score[1]))

    score = model_train.evaluate(X_test, [Y_test, Y_test_final], batch_size=Y_test.shape[0] // 12, verbose=0)

    if is_mmn:
        print('Test score: %.6f rmse (norm): %.6f rmse (real): %.6f' %
              (score[0], score[1], score[1] * (mmn._max - mmn._min) / 2.))
    else:
        print('Test score: %.6f rmse (real): %.6f' %
              (score[0], score[1]))

    if not is_mmn:
        predict = model_train.predict(X_test)[1]
    else:
        predict = mmn.inverse_transform(model_train.predict(X_test)[1])
        Y_test_final = mmn.inverse_transform(Y_test_final)

    # predict = predict[:, -1]
    # Y_test = Y_test[:, -1]

    # print("predict", predict)
    # print("test", Y_test_final)
    rmse = round(Metric.RMSE(predict, Y_test_final, noConditionRegions), 5)
    save_result(predict, Y_test_final, timestamp_test,
                "./result/{}_predict_rmse{}".format(hyperparams_name, str(rmse)))
    print("RMSE:", rmse)
    # print("accuracy", Metric.accuracy(predict, Y_test, noConditionRegions))

    print("\nelapsed time (eval): %.3f seconds\n" % (time.time() - ts))
    exit(1)
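
# iLayer above is imported from elsewhere in this project. As a rough sketch,
# assuming it is the parametric elementwise fusion layer popularized by
# ST-ResNet (one trainable weight per output element), a Keras 2 version could
# look like the following; the name ILayerSketch and the details are
# illustrative only, not the project's actual implementation.
from keras.layers import Layer


class ILayerSketch(Layer):
    """Scales its input by a tensor of per-element trainable weights."""

    def build(self, input_shape):
        self.w = self.add_weight(name="w",
                                 shape=input_shape[1:],
                                 initializer="uniform",
                                 trainable=True)
        super(ILayerSketch, self).build(input_shape)

    def call(self, x):
        return x * self.w

    def compute_output_shape(self, input_shape):
        return input_shape
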
Example #2
def main():
    # load data
    print("loading data...")
    ts = time.time()
    if is_mmn:
        fname = os.path.join(
            Paramater.DATAPATH, 'CACHE', 'TaxiBJ_C{}_P{}_T{}_{}_mmn.h5'.format(
                len_closeness, len_period, len_trend,
                "External" if hasExternal else "noExternal"))
    else:
        fname = os.path.join(
            Paramater.DATAPATH, 'CACHE', 'TaxiBJ_C{}_P{}_T{}_{}.h5'.format(
                len_closeness, len_period, len_trend,
                "External" if hasExternal else "noExternal"))
    x_num = y_num = 48
    if os.path.exists(fname) and CACHEDATA:
        X_train, Y_train, X_test, Y_test, mmn, external_dim, \
        timestamp_train, timestamp_test, noConditionRegions, x_num, y_num, z_num = read_cache(fname, is_mmn)
        print("load %s successfully" % fname)
    else:
        datapaths = [
            Paramater.DATAPATH + "48_48_20_LinearInterpolationFixed_condition"
        ]
        noConditionRegionsPath = Paramater.PROJECTPATH + "data/48_48_20_noSpeedRegion_0.05"
        X_train, Y_train, X_test, Y_test, mmn, external_dim, timestamp_train, timestamp_test, noConditionRegions, x_num, y_num, z_num = Data.loadDataFromRaw(
            paths=datapaths,
            noSpeedRegionPath=noConditionRegionsPath,
            nb_flow=nb_flow,
            len_closeness=len_closeness,
            len_period=len_period,
            len_trend=len_trend,
            len_test=len_test,
            maxMinNormalization=is_mmn,
            preprocess_name='preprocessing.pkl',
            meta_data=hasExternal,
            meteorol_data=hasExternal,
            holiday_data=hasExternal)
        if CACHEDATA:
            cache(fname, X_train, Y_train, X_test, Y_test, external_dim,
                  timestamp_train, timestamp_test, noConditionRegions, is_mmn,
                  x_num, y_num, Paramater.Z_NUM)

    # print("\n days (test): ", [v[:8] for v in timestamp_test[0::72]])
    print("\nelapsed time (loading data): %.3f seconds\n" % (time.time() - ts))

    print('=' * 10)
    print("compiling model...")
    print("**the first compilation may take a few minutes if you use [Theano] as the backend**")

    ts = time.time()
    model = build_model(external_dim, x_num=x_num, y_num=y_num)
    hyperparams_name = 'c{}.p{}.t{}.resunit{}.lr{}.{}.{}'.format(
        len_closeness, len_period, len_trend, nb_residual_unit, lr,
        "External" if hasExternal else "noExternal",
        "MMN" if is_mmn else "noMMN")

    fname_param = os.path.join(path_model,
                               '{}.best.h5'.format(hyperparams_name))

    early_stopping = EarlyStopping(monitor='val_rmse', patience=2, mode='min')
    model_checkpoint = ModelCheckpoint(fname_param,
                                       monitor='val_rmse',
                                       verbose=0,
                                       save_best_only=True,
                                       mode='min')

    print("\nelapsed time (compiling model): %.3f seconds\n" %
          (time.time() - ts))

    print('=' * 10)
    print("training model...")
    ts = time.time()
    history = model.fit(X_train,
                        Y_train,
                        nb_epoch=nb_epoch,
                        batch_size=batch_size,
                        validation_split=0.1,
                        callbacks=[early_stopping, model_checkpoint],
                        verbose=1)
    model.save_weights(os.path.join(path_model,
                                    '{}.h5'.format(hyperparams_name)),
                       overwrite=True)
    pickle.dump((history.history),
                open(
                    os.path.join(path_result,
                                 '{}.history.pkl'.format(hyperparams_name)),
                    'wb'))
    print("\nelapsed time (training): %.3f seconds\n" % (time.time() - ts))

    print('=' * 10)
    print('evaluating using the model that has the best loss on the valid set')
    ts = time.time()
    model.load_weights(fname_param)
    score = model.evaluate(X_train,
                           Y_train,
                           batch_size=Y_train.shape[0] // 48,
                           verbose=0)

    if mmn is not None:
        print('Train score: %.6f rmse (norm): %.6f rmse (real): %.6f' %
              (score[0], score[1], score[1] * (mmn._max - mmn._min) / 2.))
    else:
        print('Train score: %.6f rmse (real): %.6f' % (score[0], score[1]))

    score = model.evaluate(X_test,
                           Y_test,
                           batch_size=Y_test.shape[0],
                           verbose=0)

    if mmn is not None:
        print('Test score: %.6f rmse (norm): %.6f rmse (real): %.6f' %
              (score[0], score[1], score[1] * (mmn._max - mmn._min) / 2.))
    else:
        print('Test score: %.6f rmse (real): %.6f' % (score[0], score[1]))

    if not is_mmn:
        predict = matrixsRounding(model.predict(X_test))
    else:
        predict = matrixsRounding(mmn.inverse_transform(model.predict(X_test)))
    print("RMSE:", Metric.RMSE(predict, Y_test, noConditionRegions))
    print("accuracy", Metric.accuracy(predict, Y_test, noConditionRegions))

    print("\nelapsed time (eval): %.3f seconds\n" % (time.time() - ts))

    print('=' * 10)
    print("training model (cont)...")
    ts = time.time()
    fname_param = os.path.join(path_model,
                               '{}.cont.best.h5'.format(hyperparams_name))
    model_checkpoint = ModelCheckpoint(fname_param,
                                       monitor='rmse',
                                       verbose=0,
                                       save_best_only=True,
                                       mode='min')
    history = model.fit(X_train,
                        Y_train,
                        nb_epoch=nb_epoch_cont,
                        verbose=2,
                        batch_size=batch_size,
                        callbacks=[model_checkpoint])
    pickle.dump(
        (history.history),
        open(
            os.path.join(path_result,
                         '{}.cont.history.pkl'.format(hyperparams_name)),
            'wb'))
    model.save_weights(os.path.join(path_model,
                                    '{}_cont.h5'.format(hyperparams_name)),
                       overwrite=True)
    print("\nelapsed time (training cont): %.3f seconds\n" %
          (time.time() - ts))

    print('=' * 10)
    print('evaluating using the final model')
    score = model.evaluate(X_train,
                           Y_train,
                           batch_size=Y_train.shape[0] // 48,
                           verbose=0)

    if (mmn is not None):
        print('Train score: %.6f rmse (norm): %.6f rmse (real): %.6f' %
              (score[0], score[1], score[1] * (mmn._max - mmn._min) / 2.))
    else:
        print('Train score: %.6f rmse (real): %.6f' % (score[0], score[1]))
    ts = time.time()
    score = model.evaluate(X_test,
                           Y_test,
                           batch_size=Y_test.shape[0],
                           verbose=0)
    if mmn is not None:
        print('Test score: %.6f rmse (norm): %.6f rmse (real): %.6f' %
              (score[0], score[1], score[1] * (mmn._max - mmn._min) / 2.))
    else:
        print('Test score: %.6f rmse (real): %.6f' % (score[0], score[1]))

    if not is_mmn:
        predict = matrixsRounding(model.predict(X_test))
    else:
        predict = matrixsRounding(mmn.inverse_transform(model.predict(X_test)))
    print("RMSE:", Metric.RMSE(predict, Y_test, noConditionRegions))
    print("accuracy", Metric.accuracy(predict, Y_test, noConditionRegions))

    print("\nelapsed time (eval cont): %.3f seconds\n" % (time.time() - ts))
Example #3
def main():
    all_result = []
    # load data
    for _c in c_list:
        for _p in p_list:
            for _t in t_list:
                for _ex in ex_list:
                    if _c == 0 and _p == 0 and _t == 0:
                        continue
                    len_period = _p
                    len_closeness = _c
                    len_trend = _t
                    hasExternal = _ex

                    print("loading data...")
                    ts = time.time()
                    datapath = os.path.join(Paramater.DATAPATH, "2016", "all")
                    if is_mmn:
                        fname = os.path.join(
                            datapath, 'CACHE',
                            'TaxiBJ_C{}_P{}_T{}_{}_mmn_speed.h5'.format(
                                len_closeness, len_period, len_trend,
                                "External" if hasExternal else "noExternal"))
                    else:
                        fname = os.path.join(
                            datapath, 'CACHE',
                            'TaxiBJ_C{}_P{}_T{}_{}_speed.h5'.format(
                                len_closeness, len_period, len_trend,
                                "External" if hasExternal else "noExternal"))
                    x_num = y_num = 48
                    pkl = fname + '.preprocessing_speed.pkl'
                    fn = "48_48_20_LinearInterpolationFixed"
                    if os.path.exists(fname) and CACHEDATA:
                        X_train, Y_train, X_test, Y_test, mmn, external_dim, \
                        timestamp_train, timestamp_test, noConditionRegions, x_num, y_num, z_num = read_cache(fname,
                                                                                                              is_mmn,
                                                                                                              pkl)
                        print("load %s successfully" % fname)
                    else:
                        datapaths = [os.path.join(datapath, fn)]
                        noConditionRegionsPath = os.path.join(
                            datapath, "48_48_20_noSpeedRegion_0.05")
                        X_train, Y_train, X_test, Y_test, mmn, external_dim, timestamp_train, timestamp_test, noConditionRegions, x_num, y_num, z_num = Data.loadDataFromRaw(
                            paths=datapaths,
                            noSpeedRegionPath=noConditionRegionsPath,
                            nb_flow=nb_flow,
                            len_closeness=len_closeness,
                            len_period=len_period,
                            len_trend=len_trend,
                            len_test=len_test,
                            maxMinNormalization=is_mmn,
                            preprocess_name=pkl,
                            meta_data=hasExternal,
                            meteorol_data=hasExternal,
                            holiday_data=hasExternal,
                            isComplete=False)
                        if CACHEDATA:
                            cache(fname, X_train, Y_train, X_test, Y_test,
                                  external_dim, timestamp_train,
                                  timestamp_test, noConditionRegions, is_mmn,
                                  x_num, y_num, Paramater.Z_NUM)

                    if use_diff_test:
                        X_test_old = X_test
                        Y_test_old = Y_test
                        import pandas as pd
                        df_diff = pd.read_csv("./data/2016/all/" + fn +
                                              "_diff.csv",
                                              index_col=0)
                        # timestamps with diff > 200 (335 of them) are used as the test set
                        test_time = df_diff[
                            df_diff["diff"] > 200]["time"].values
                        timestamp_train_dict = dict(
                            zip(timestamp_train, range(len(timestamp_train))))
                        timestamp_test_dict = dict(
                            zip(timestamp_test, range(len(timestamp_test))))
                        new_X_test = []
                        new_Y_test = []
                        if isinstance(X_train, list):
                            for _ in range(len(X_train)):
                                new_X_test.append([])
                        for _test_time in test_time:
                            _test_time = str(_test_time)
                            if (_test_time in timestamp_train_dict):
                                index = timestamp_train_dict[_test_time]
                                if isinstance(X_train, list):
                                    for i in range(len(X_train)):
                                        new_X_test[i].append(X_train[i][index])
                                else:
                                    new_X_test.append(X_train[index])
                                new_Y_test.append(Y_train[index])

                            if (_test_time in timestamp_test_dict):
                                index = timestamp_test_dict[_test_time]
                                if isinstance(X_test_old, list):
                                    for i in range(len(X_test_old)):
                                        new_X_test[i].append(
                                            X_test_old[i][index])
                                else:
                                    new_X_test.append(X_test_old[index])
                                new_Y_test.append(Y_test_old[index])

                                # if (_test_time not in timestamp_train_dict and _test_time not in timestamp_test_dict):
                                #     print(_test_time)

                        if isinstance(new_X_test, list):
                            for i in range(len(new_X_test)):
                                new_X_test[i] = np.stack(new_X_test[i], axis=0)
                        else:
                            new_X_test = np.stack(new_X_test, axis=0)
                        new_Y_test = np.stack(new_Y_test, axis=0)

                        # if isinstance(new_X_test, list):
                        #     for i in range(len(new_X_test)):
                        #         print(new_X_test[i].shape)
                        # else:
                        #     print(new_X_test.shape)
                        # print(new_Y_test.shape)
                        X_test = new_X_test
                        Y_test = new_Y_test

                    # print("\n days (test): ", [v[:8] for v in timestamp_test[0::72]])
                    print("\nelapsed time (loading data): %.3f seconds\n" %
                          (time.time() - ts))

                    print('=' * 10)
                    print("compiling model...")
                    print("**the first compilation may take a few minutes if you use [Theano] as the backend**")

                    ts = time.time()
                    model = build_model(external_dim, x_num, y_num,
                                        len_closeness, len_period, len_trend)
                    hyperparams_name = 'speed.c{}.p{}.t{}.resunit{}.lr{}.{}.{}'.format(
                        len_closeness, len_period, len_trend, nb_residual_unit,
                        lr, "External" if hasExternal else "noExternal",
                        "MMN" if is_mmn else "noMMN")

                    fname_param = os.path.join(
                        path_model, '{}.best.h5'.format(hyperparams_name))

                    early_stopping = EarlyStopping(monitor='val_rmse',
                                                   patience=2,
                                                   mode='min')
                    model_checkpoint = ModelCheckpoint(fname_param,
                                                       monitor='val_rmse',
                                                       verbose=0,
                                                       save_best_only=True,
                                                       mode='min',
                                                       save_weights_only=True)

                    print("\nelapsed time (compiling model): %.3f seconds\n" %
                          (time.time() - ts))

                    print('=' * 10)
                    print("training model...")
                    ts = time.time()
                    history = model.fit(
                        X_train,
                        Y_train,
                        nb_epoch=nb_epoch,
                        batch_size=batch_size,
                        validation_split=0.1,
                        callbacks=[early_stopping, model_checkpoint],
                        verbose=1)
                    model.save_weights(os.path.join(
                        path_model, '{}.h5'.format(hyperparams_name)),
                                       overwrite=True)
                    pickle.dump(
                        (history.history),
                        open(
                            os.path.join(
                                path_result,
                                '{}.history.pkl'.format(hyperparams_name)),
                            'wb'))
                    print("\nelapsed time (training): %.3f seconds\n" %
                          (time.time() - ts))

                    print('=' * 10)
                    print(
                        'evaluating using the model that has the best loss on the valid set'
                    )
                    ts = time.time()
                    model.load_weights(fname_param)
                    score = model.evaluate(X_train,
                                           Y_train,
                                           batch_size=Y_train.shape[0] // 48,
                                           verbose=0)

                    if is_mmn:
                        print(
                            'Train score: %.6f rmse (norm): %.6f rmse (real): %.6f'
                            % (score[0], score[1], score[1] *
                               (mmn._max - mmn._min) / 2.))
                    else:
                        print('Train score: %.6f rmse (real): %.6f' %
                              (score[0], score[1]))

                    score = model.evaluate(X_test,
                                           Y_test,
                                           batch_size=Y_test.shape[0],
                                           verbose=0)

                    if is_mmn:
                        print(
                            'Test score: %.6f rmse (norm): %.6f rmse (real): %.6f'
                            % (score[0], score[1], score[1] *
                               (mmn._max - mmn._min) / 2.))
                    else:
                        print('Test score: %.6f rmse (real): %.6f' %
                              (score[0], score[1]))

                    if not is_mmn:
                        predict = model.predict(X_test)
                    else:
                        predict = mmn.inverse_transform(model.predict(X_test))
                        Y_test = mmn.inverse_transform(Y_test)
                    # print("predict", predict)
                    # print("test", Y_test)
                    # print("RMSE:", Metric.RMSE(predict, Y_test, noConditionRegions))
                    # # print("accuracy", Metric.accuracy(predict, Y_test, noConditionRegions))
                    #
                    # print("\nelapsed time (eval): %.3f seconds\n" % (time.time() - ts))
                    #
                    # print('=' * 10)
                    # print("training model (cont)...")
                    # ts = time.time()
                    # fname_param = os.path.join(
                    #         path_model, '{}.cont.best.h5'.format(hyperparams_name))
                    # model_checkpoint = ModelCheckpoint(
                    #         fname_param, monitor='rmse', verbose=0, save_best_only=True, mode='min')
                    # history = model.fit(X_train, Y_train, nb_epoch=nb_epoch_cont, verbose=2, batch_size=batch_size,
                    #                     callbacks=[
                    #                         model_checkpoint])
                    # pickle.dump((history.history), open(os.path.join(
                    #         path_result, '{}.cont.history.pkl'.format(hyperparams_name)), 'wb'))
                    # model.save_weights(os.path.join(
                    #         path_model, '{}_cont.h5'.format(hyperparams_name)), overwrite=True)
                    # print("\nelapsed time (training cont): %.3f seconds\n" % (time.time() - ts))
                    #
                    # print('=' * 10)
                    # print('evaluating using the final model')
                    # score = model.evaluate(X_train, Y_train, batch_size=Y_train.shape[
                    #                                                         0] // 48, verbose=0)

                    # if (mmn is not None):
                    #     print('Train score: %.6f rmse (norm): %.6f rmse (real): %.6f' %
                    #           (score[0], score[1], score[1] * (mmn._max - mmn._min) / 2.))
                    # else:
                    #     print('Train score: %.6f rmse (real): %.6f' %
                    #           (score[0], score[1]))
                    # ts = time.time()
                    # score = model.evaluate(
                    #         X_test, Y_test, batch_size=Y_test.shape[0], verbose=0)
                    # if mmn is not None:
                    #     print('Test score: %.6f rmse (norm): %.6f rmse (real): %.6f' %
                    #           (score[0], score[1], score[1] * (mmn._max - mmn._min) / 2.))
                    # else:
                    #     print('Test score: %.6f rmse (real): %.6f' %
                    #           (score[0], score[1]))
                    #
                    # if not is_mmn:
                    #     predict = model.predict(X_test)
                    # else:
                    #     predict = mmn.inverse_transform(model.predict(X_test))

                    rmse = round(
                        Metric.RMSE(predict, Y_test, noConditionRegions), 5)
                    # np.save("./result/{}_predict_rmse{}".format(hyperparams_name, str(rmse)),
                    #         np.stack([predict, Y_test], axis=0))
                    save_result(
                        predict, Y_test, timestamp_test,
                        "./result/{}_predict_rmse{}".format(
                            hyperparams_name, str(rmse)))

                    print("RMSE:", rmse)

                    # print("accuracy", Metric.accuracy(predict, Y_test, noConditionRegions))
                    all_result.append("{}c_{}p_{}t_{}External_{}rmse".format(
                        len_closeness, len_period, len_trend, hasExternal,
                        rmse))
                    print("\nelapsed time (eval cont): %.3f seconds\n" %
                          (time.time() - ts))

    for _v in all_result:
        print(_v)
Example #4
"""
@Create Date: 17-9-18, 09:56

@Description:

@Update Date: 17-9-18, 09:56
"""

import matplotlib

matplotlib.use("Agg")
import matplotlib.pyplot as plt
import os
import time
from jampredict.utils.Cache import *
from jampredict.utils import Paramater
from jampredict.feature import Data
import seaborn as ses

if __name__ == '__main__':
    datas, times, x_num, y_num, interval, startTime, endTime, nospeed_regions = \
        Data.loadRawData(os.path.join(Paramater.DATAPATH, "2016/all/48_48_20_LinearInterpolationFixed"),
                         os.path.join(Paramater.DATAPATH, "48_48_20_noSpeedRegion_0.05"), False)
    x_index = 24
    y_index = 24
    datas = datas[:, 0]
    f, ax = plt.subplots(1, 1, figsize=(15, 7))
    print(datas[:500, x_index, y_index])
    ses.tsplot(datas[:500, x_index, y_index], ax=ax)
    plt.savefig(os.path.join(Paramater.PROJECTPATH, "fig/test2.jpg"))
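
# Note: seaborn.tsplot was deprecated in seaborn 0.8 and removed in later
# releases; on a recent seaborn the same series could be drawn with, e.g.:
#     ses.lineplot(data=datas[:500, x_index, y_index], ax=ax)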
Example #5
def main():
    # load data
    print("loading data...")
    ts = time.time()
    if is_mmn:
        fname = os.path.join(
            Paramater.DATAPATH, 'CACHE', 'TaxiBJ_C{}_P{}_T{}_{}_mmn.h5'.format(
                len_closeness, len_period, len_trend,
                "External" if hasExternal else "noExternal"))
    else:
        fname = os.path.join(
            Paramater.DATAPATH, 'CACHE', 'TaxiBJ_C{}_P{}_T{}_{}.h5'.format(
                len_closeness, len_period, len_trend,
                "External" if hasExternal else "noExternal"))

    x_num = y_num = 48
    z_num = Paramater.Z_NUM
    if os.path.exists(fname) and CACHEDATA:
        X_train, Y_train, X_test, Y_test, mmn, external_dim, timestamp_train, timestamp_test, noConditionRegions, x_num, y_num, z_num = read_cache(
            fname, is_mmn)
        print("load %s successfully" % fname)
    else:
        datapaths = [
            Paramater.DATAPATH + "48_48_20_LinearInterpolationFixed_condition"
        ]
        noConditionRegionsPath = Paramater.PROJECTPATH + "data/48_48_20_noSpeedRegion_0.05"
        X_train, Y_train, X_test, Y_test, mmn, external_dim, timestamp_train, timestamp_test, noConditionRegions, x_num, y_num, z_num = Data.loadDataFromRaw(
            paths=datapaths,
            noSpeedRegionPath=noConditionRegionsPath,
            nb_flow=nb_flow,
            len_closeness=len_closeness,
            len_period=len_period,
            len_trend=len_trend,
            len_test=len_test,
            preprocess_name='preprocessing.pkl',
            meta_data=hasExternal,
            meteorol_data=hasExternal,
            holiday_data=hasExternal)
        if CACHEDATA:
            cache(fname, X_train, Y_train, X_test, Y_test, external_dim,
                  timestamp_train, timestamp_test, noConditionRegions, is_mmn,
                  x_num, y_num, Paramater.Z_NUM)

    # print("\n days (test): ", [v[:8] for v in timestamp_test[0::72]])
    print("\nelapsed time (loading data): %.3f seconds\n" % (time.time() - ts))

    print('=' * 10)
    print("compiling model...")
    print("**the first compilation may take a few minutes if you use [Theano] as the backend**")

    ts = time.time()
    model = build_model(external_dim, x_num=x_num, y_num=y_num)

    model.load_weights(
        Paramater.PROJECTPATH +
        "/MODEL/c3.p1.t1.resunit6.lr0.0002.External.MMN.cont.best.h5")
    if not is_mmn:
        predict = matrixsRounding(model.predict(X_test))
    else:
        predict = mmn.inverse_transform(model.predict(X_test))
        # print(predict)
        predict = matrixsRounding(predict)
        # print(predict)
    print("RMSE:", Metric.RMSE(predict, Y_test, noConditionRegions))
    print("accuracy", Metric.accuracy(predict, Y_test, noConditionRegions))
Example #6
def main():
    all_results = {}
    for _c in clossesness:
        len_closeness = _c
        print "closeness is", len_closeness
        # load data
        print("loading data...")
        ts = time.time()

        datapath = os.path.join(Paramater.DATAPATH, "2016", month)
        if is_mmn:
            fname = os.path.join(datapath, 'CACHE',
                                 'TaxiBJ_C{}_P{}_T{}_{}_mmn_speed.h5'.format(len_closeness,
                                                                             len_period,
                                                                             len_trend,
                                                                             "External" if hasExternal else "noExternal"))
        else:
            fname = os.path.join(datapath, 'CACHE',
                                 'TaxiBJ_C{}_P{}_T{}_{}_speed.h5'.format(len_closeness,
                                                                         len_period,
                                                                         len_trend,
                                                                         "External" if hasExternal else "noExternal"))
        x_num = y_num = 48
        pkl = fname + '.preprocessing_speed.pkl'
        if os.path.exists(fname) and CACHEDATA:
            X_train, Y_train, X_test, Y_test, mmn, external_dim, \
            timestamp_train, timestamp_test, noConditionRegions, \
            x_num, y_num, z_num = read_cache(fname, is_mmn, pkl)
            print("load %s successfully" % fname)
        else:
            datapaths = [os.path.join(datapath, "48_48_20_MaxSpeedFillingFixed_5")]
            noConditionRegionsPath = os.path.join(datapath, "48_48_20_noSpeedRegion_0.05")
            X_train, Y_train, X_test, Y_test, \
            mmn, external_dim, timestamp_train, \
            timestamp_test, noConditionRegions, \
            x_num, y_num, z_num = Data.loadDataFromRaw(
                    paths=datapaths,
                    noSpeedRegionPath=noConditionRegionsPath,
                    nb_flow=nb_flow,
                    len_closeness=len_closeness,
                    len_period=len_period,
                    len_trend=len_trend,
                    len_test=len_test,
                    maxMinNormalization=is_mmn,
                    preprocess_name=pkl,
                    meta_data=hasExternal,
                    meteorol_data=hasExternal,
                    holiday_data=hasExternal,
                    isComplete=False)
            if CACHEDATA:
                cache(fname, X_train, Y_train, X_test, Y_test,
                      external_dim, timestamp_train, timestamp_test, noConditionRegions, is_mmn, x_num, y_num,
                      Paramater.Z_NUM)

        # print("\n days (test): ", [v[:8] for v in timestamp_test[0::72]])
        print("\nelapsed time (loading data): %.3f seconds\n" % (time.time() - ts))

        if isinstance(X_train, list):
            print("X_train len:", len(X_train))
            for i, _x_train in enumerate(X_train):
                print("x_train_{} shape:".format(i), _x_train.shape)
        else:
            print("X_train shape:", X_train.shape)
        print("Y_train shape:", Y_train.shape)

        if isinstance(X_test, list):
            print("X_test len:", len(X_test))
            for i, _x_test in enumerate(X_test):
                print("x_test_{} shape:".format(i), _x_test.shape)
        else:
            print("X_test shape:", X_test.shape)
        print("Y_test shape:", Y_test.shape)

        # if not use_CNN_model:
        #     if (isinstance(X_train, list) and len(X_train) == 1):
        #         X_train = X_train[0]
        #         X_test = X_test[0]
        #     X = np.vstack([X_train, X_test])
        #     Y = np.vstack([Y_train, Y_test])
        #
        #     print "X", X.shape
        #     print "Y", Y.shape
        #     X, Y = Data.getSequenceXY(X, Y, len_period)
        #     Y = Y[:, -1]
        #     print "after sequence:"
        #     print "X", X.shape
        #     print "Y", Y.shape
        #
        #     X_train = X[:-800]
        #     X_test = X[-800:]
        #     Y_train = Y[:-800]
        #     Y_test = Y[-800:]
        print('=' * 10)
        print("compiling model...")
        print("**at the first time, it takes a few minites to compile if you use [Theano] as the backend**")

        # predict = mmn.inverse_transform(average_method(X_test))
        # Y_test = mmn.inverse_transform(Y_test)
        # # print("predict", predict)
        # # print("test", Y_test)
        # rmse = Metric.RMSE(predict, Y_test, noConditionRegions)
        # # results["avg_method"] = {"rmse": rmse}
        # print rmse
        # exit(1)

        results = {}
        if isinstance(X_test, list) and len(X_test) == 1:
            X_test = X_test[0]
        if isinstance(X_train, list) and len(X_train) == 1:
            X_train = X_train[0]
        if isinstance(Y_train, list) and len(Y_train) == 1:
            Y_train = Y_train[0]
        if isinstance(Y_test, list) and len(Y_test) == 1:
            Y_test = Y_test[0]
        X_test_copy = X_test.copy()
        X_train_copy = X_train.copy()
        Y_train_copy = Y_train.copy()
        Y_test_copy = Y_test.copy()
        for model_method, name in model_methods:
            X_test = X_test_copy.copy()
            X_train = X_train_copy.copy()
            Y_train = Y_train_copy.copy()
            Y_test = Y_test_copy.copy()
            print(name)
            result = {}
            results[name] = result
            ts = time.time()
            # print(X_train)
            print "start build model"

            # input = Input(shape=(nb_flow * len_period, x_num, y_num))
            # reshape = Reshape((len_period, nb_flow, x_num, y_num))(input)
            # convLSTM = ConvLSTM2D(nb_filter=32, nb_row=3, nb_col=3, border_mode="same")(reshape)
            # act2 = Activation("relu")(convLSTM)
            # main_output = Convolution2D(nb_filter=nb_flow, nb_row=3, nb_col=3, border_mode="same")(act2)
            model = model_method(x_num, y_num, _c)
            adam = Adam(lr=lr)
            from keras.optimizers import SGD, RMSprop
            sgd = SGD(lr, clipvalue=0.01)
            rmsprop = RMSprop()
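            # sgd and rmsprop are alternatives kept around for experimentation;
            # only adam is passed to compile() below.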
            model.compile(loss='mse', optimizer=adam, metrics=[metrics.rmse])
            # model.summary()
            # exit(1)
            print "finish build model"
            result["build_time"] = time.time() - ts

            print("\nelapsed time (compiling model): %.3f seconds\n" %
                  (time.time() - ts))

            hyperparams_name = 'closenesstest_{}_speed.c{}.p{}.t{}.resunit{}.lr{}.{}.{}'.format(name,
                                                                                                len_closeness,
                                                                                                len_period, len_trend,
                                                                                                nb_residual_unit, lr,
                                                                                                "External" if hasExternal else "noExternal",
                                                                                                "MMN" if is_mmn else "noMMN")

            fname_param = os.path.join(path_model, '{}.best.h5'.format(hyperparams_name))

            early_stopping = EarlyStopping(monitor='val_rmse', patience=4, mode='min')
            model_checkpoint = ModelCheckpoint(fname_param,
                                               monitor='val_rmse',
                                               verbose=1,
                                               save_best_only=True,
                                               mode='min',
                                               save_weights_only=True)
            print('=' * 10)
            time.sleep(20)
            print("training model...")
            ts = time.time()
            history = model.fit(X_train, Y_train,
                                nb_epoch=nb_epoch,
                                batch_size=batch_size,
                                validation_split=0.1,
                                callbacks=[early_stopping, model_checkpoint],
                                verbose=1)

            result["train_time"] = time.time() - ts
            print("\nelapsed time (training): %.3f seconds\n" % (time.time() - ts))

            model.save_weights(os.path.join(path_model, '{}.h5'.format(hyperparams_name)), overwrite=True)
            pickle.dump((history.history), open(os.path.join(
                    path_result, '{}.history.pkl'.format(hyperparams_name)), 'wb'))
            print('=' * 10)
            print('evaluating using the model that has the best loss on the valid set')
            ts = time.time()
            model.load_weights(fname_param)
            score = model.evaluate(X_train, Y_train, batch_size=Y_train.shape[0] // 48, verbose=0)

            if is_mmn:
                print('Train score: %.6f rmse (norm): %.6f rmse (real): %.6f' %
                      (score[0], score[1], score[1] * (mmn._max - mmn._min) / 2.))
            else:
                print('Train score: %.6f rmse (real): %.6f' %
                      (score[0], score[1]))

            score = model.evaluate(
                    X_test, Y_test, batch_size=Y_test.shape[0], verbose=0)

            if is_mmn:
                print('Test score: %.6f rmse (norm): %.6f rmse (real): %.6f' %
                      (score[0], score[1], score[1] * (mmn._max - mmn._min) / 2.))
            else:
                print('Test score: %.6f rmse (real): %.6f' %
                      (score[0], score[1]))

            if not is_mmn:
                predict = model.predict(X_test)
            else:
                predict = mmn.inverse_transform(model.predict(X_test))
                Y_test = mmn.inverse_transform(Y_test)
            # print("predict", predict)
            # print("test", Y_test)
            rmse = Metric.RMSE(predict, Y_test, noConditionRegions)
            save_result(predict, Y_test, timestamp_test,
                        "./result/{}_predict_rmse{}".format(hyperparams_name, str(rmse)))
            result["rmse"] = rmse
            print("RMSE:", rmse)
            # print("accuracy", Metric.accuracy(predict, Y_test, noConditionRegions))
            print("\nelapsed time (eval): %.3f seconds\n" % (time.time() - ts))

        # X_test = X_test_copy.copy()
        # Y_test = Y_test_copy.copy()
        # average
        # predict = mmn.inverse_transform(average_method(X_test))
        # Y_test = mmn.inverse_transform(Y_test)
        # print("predict", predict)
        # print("test", Y_test)
        # rmse = Metric.RMSE(predict, Y_test, noConditionRegions)
        # results["avg_method"] = {"rmse": rmse}
        print "closeness is {} and the final result is:".format(_c)
        for method_name, vs in results.items():
            print method_name, ":"
            for _m, _v in vs.items():
                print "    ", _m, _v
        print ""

    print "all finish"

    for _p, _rs in all_results.items():
        print("closeness is {} and the final result is:".format(_p))
        for method_name, vs in _rs.items():
            print(method_name, ":")
            for _m, _v in vs.items():
                print("    ", _m, _v)
        print("")

    d = {}
    # model_methods holds (method, name) pairs; key the summary dict by name so
    # the lookup d[method_name] below matches
    for _, _name in model_methods:
        d[_name] = {}
    d["avg_method"] = {}

    for _p, _rs in all_results.items():
        for method_name, vs in _rs.items():
            for _m, _v in vs.items():
                if _m == "rmse":
                    d[method_name][_p] = _v
    closeness_df = pd.DataFrame(d)
    closeness_df.to_csv("./result/clossness_rmse.csv", float_format="%.5f")
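
# model_methods is defined outside this snippet. From its usage above it is a
# list of (builder, name) pairs whose builders take (x_num, y_num, len_closeness)
# and return an uncompiled Keras model, e.g. (names here are hypothetical):
#     model_methods = [(build_cnn_model, "CNN"), (build_convlstm_model, "ConvLSTM")]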
Example #7
def main():
    # load data
    print("loading data...")

    ts = time.time()
    if is_mmn:
        fname = os.path.join(Paramater.DATAPATH, 'CACHE',
                             'TaxiBJ_C{}_P{}_T{}_{}_mmn.h5'.format(len_closeness, len_period, len_trend,
                                                                   "External" if hasExternal else "noExternal"))
    else:
        fname = os.path.join(Paramater.DATAPATH, 'CACHE',
                             'TaxiBJ_C{}_P{}_T{}_{}.h5'.format(len_closeness, len_period, len_trend,
                                                               "External" if hasExternal else "noExternal"))

    f2name = fname.replace(".h5", "_cell.h5")
    if CACHEDATA and os.path.exists(f2name):
        print(f2name)
        X_train, Y_train, X_test, Y_test, mmn, external_dim, timestamp_train, timestamp_test, noConditionRegions, x_num, y_num, z_num = read_cache(
            f2name, is_mmn)
        print("load %s successfully" % f2name)
    else:
        if os.path.exists(fname) and CACHEDATA:
            X_train, Y_train, X_test, Y_test, mmn, external_dim, timestamp_train, timestamp_test, noConditionRegions, x_num, y_num, z_num = read_cache(
                fname, is_mmn)

            print("load %s successfully" % fname)
        else:
            datapaths = [Paramater.DATAPATH + "48_48_20_LinearInterpolationFixed_condition"]
            noConditionRegionsPath = Paramater.PROJECTPATH + "data/48_48_20_noSpeedRegion_0.05"
            X_train, Y_train, X_test, Y_test, mmn, external_dim, timestamp_train, timestamp_test, noConditionRegions, x_num, y_num, z_num = Data.loadDataFromRaw(
                paths=datapaths, noSpeedRegionPath=noConditionRegionsPath, nb_flow=nb_flow, len_closeness=len_closeness,
                len_period=len_period, len_trend=len_trend
                , len_test=len_test, maxMinNormalization=is_mmn, preprocess_name='preprocessing.pkl',
                meta_data=hasExternal,
                meteorol_data=hasExternal,
                holiday_data=hasExternal)
            if CACHEDATA:
                cache(fname, X_train, Y_train, X_test, Y_test,
                      external_dim, timestamp_train, timestamp_test, noConditionRegions, is_mmn, x_num, y_num, z_num)

        X_train, Y_train = Data.transformMatrixToCell(X_train, Y_train, noConditionRegions, hasExternal)
        X_test, Y_test = Data.transformMatrixToCell(X_test, Y_test, noConditionRegions, hasExternal)

        if CACHEDATA:
            cache(f2name, X_train, Y_train, X_test, Y_test, external_dim, timestamp_train, timestamp_test,
                  list(noConditionRegions), is_mmn, x_num, y_num, z_num)

    # grid cv
    if grid_cv:
        max_depth = [None, 5, 10, 15]
        min_samples_split = [2, 4, 6]
        min_samples_leaf = [1, 2, 3]
        criterion = ["gini", "entropy"]
        param_grid = dict(max_depth=max_depth, min_samples_split=min_samples_split,
                          min_samples_leaf=min_samples_leaf,
                          criterion=criterion)

        grid = GridSearchCV(estimator=DecisionTreeClassifier(random_state=random_state), scoring="accuracy",
                            param_grid=param_grid,
                            n_jobs=-1, verbose=1)
        grid.refit = False
        grid_result = grid.fit(X_train, Y_train)

        print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

        max_depth = grid_result.best_params_['max_depth']
        min_samples_split = grid_result.best_params_['min_samples_split']
        min_samples_leaf = grid_result.best_params_['min_samples_leaf']
        criterion = grid_result.best_params_["criterion"]

    else:
        max_depth = 10
        min_samples_split = 4
        min_samples_leaf = 1
        criterion = "gini"

    classifier = DecisionTreeClassifier(criterion=criterion,
                                        max_depth=max_depth,
                                        min_samples_leaf=min_samples_leaf,
                                        min_samples_split=min_samples_split,
                                        random_state=random_state)
    print("DT training...")
    classifier.fit(X_train, Y_train)
    print("training finished")
    score = classifier.score(X_test, Y_test)
    print(score)

    predict = classifier.predict(X_test)
    predict = Data.transformCellToMatrix(predict, Data.getMatrixSize(predict.shape[0], x_num, y_num, z_num,
                                                                     len(noConditionRegions)), x_num, y_num, z_num,
                                         noConditionRegions)
    Y_test = Data.transformCellToMatrix(Y_test, Data.getMatrixSize(Y_test.shape[0], x_num, y_num, z_num,
                                                                   len(noConditionRegions)), x_num, y_num, z_num,
                                        noConditionRegions)
    print("RMSE:", Metric.RMSE(predict, Y_test, noConditionRegions))
    print("accuracy", Metric.accuracy(predict, Y_test, noConditionRegions))
Example #8
def main():
    # load data
    print("loading data...")

    ts = time.time()
    if is_mmn:
        fname = os.path.join(
            Paramater.DATAPATH, 'CACHE', 'TaxiBJ_C{}_P{}_T{}_{}_mmn.h5'.format(
                len_closeness, len_period, len_trend,
                "External" if hasExternal else "noExternal"))
    else:
        fname = os.path.join(
            Paramater.DATAPATH, 'CACHE', 'TaxiBJ_C{}_P{}_T{}_{}.h5'.format(
                len_closeness, len_period, len_trend,
                "External" if hasExternal else "noExternal"))

    f2name = fname.replace(".h5", "_cell.h5")
    if CACHEDATA and os.path.exists(f2name):
        print(f2name)
        X_train, Y_train, X_test, Y_test, mmn, external_dim, timestamp_train, timestamp_test, noConditionRegions, x_num, y_num, z_num = read_cache(
            f2name, is_mmn)
        print("load %s successfully" % f2name)
    else:
        if os.path.exists(fname) and CACHEDATA:
            X_train, Y_train, X_test, Y_test, mmn, external_dim, timestamp_train, timestamp_test, noConditionRegions, x_num, y_num, z_num = read_cache(
                fname, is_mmn)

            print("load %s successfully" % fname)
        else:
            datapaths = [
                Paramater.DATAPATH +
                "48_48_20_LinearInterpolationFixed_condition"
            ]
            noConditionRegionsPath = Paramater.PROJECTPATH + "data/48_48_20_noSpeedRegion_0.05"
            X_train, Y_train, X_test, Y_test, mmn, external_dim, timestamp_train, timestamp_test, noConditionRegions, x_num, y_num, z_num = Data.loadDataFromRaw(
                paths=datapaths,
                noSpeedRegionPath=noConditionRegionsPath,
                nb_flow=nb_flow,
                len_closeness=len_closeness,
                len_period=len_period,
                len_trend=len_trend,
                len_test=len_test,
                maxMinNormalization=is_mmn,
                preprocess_name='preprocessing.pkl',
                meta_data=hasExternal,
                meteorol_data=hasExternal,
                holiday_data=hasExternal)
            if CACHEDATA:
                cache(fname, X_train, Y_train, X_test, Y_test, external_dim,
                      timestamp_train, timestamp_test, noConditionRegions,
                      is_mmn, x_num, y_num, z_num)
        X_train, Y_train = Data.transformMatrixToCell(X_train, Y_train,
                                                      noConditionRegions,
                                                      hasExternal)
        X_test, Y_test = Data.transformMatrixToCell(X_test, Y_test,
                                                    noConditionRegions,
                                                    hasExternal)

        if CACHEDATA:
            cache(f2name, X_train, Y_train, X_test, Y_test,
                  external_dim, timestamp_train, timestamp_test,
                  list(noConditionRegions), is_mmn, x_num, y_num, z_num)

    print "baseline start train ing.."
    bl = BaseLine(maxC=3, maxD=1, maxW=1, minSupport=10, minConfidence=0.9)
    bl.fit(X_train, Y_train, len_closeness, len_period, len_trend)
    print "baseline train finish"
    predict = bl.predict(X_test)
    predict = Data.transformCellToMatrix(
        predict,
        Data.getMatrixSize(predict.shape[0], x_num, y_num, z_num,
                           len(noConditionRegions)), x_num, y_num, z_num,
        noConditionRegions)
    Y_test = Data.transformCellToMatrix(
        Y_test,
        Data.getMatrixSize(Y_test.shape[0], x_num, y_num, z_num,
                           len(noConditionRegions)), x_num, y_num, z_num,
        noConditionRegions)
    print("RMSE:", Metric.RMSE(predict, Y_test, noConditionRegions))
    print("accuracy", Metric.accuracy(predict, Y_test, noConditionRegions))
Example #9
def main():
    # load data
    print("loading data...")
    ts = time.time()
    datapath = os.path.join(Paramater.DATAPATH, "2016", "all")
    if is_mmn:
        fname = os.path.join(
            datapath, 'CACHE', 'TaxiBJ_C{}_P{}_T{}_{}_mmn_speed.h5'.format(
                len_closeness, len_period, len_trend,
                "External" if hasExternal else "noExternal"))
    else:
        fname = os.path.join(
            datapath, 'CACHE', 'TaxiBJ_C{}_P{}_T{}_{}_speed.h5'.format(
                len_closeness, len_period, len_trend,
                "External" if hasExternal else "noExternal"))
    x_num = y_num = 48
    pkl = fname + '.preprocessing_speed.pkl'
    if os.path.exists(fname) and CACHEDATA:
        X_train, Y_train, X_test, Y_test, mmn, external_dim, \
        timestamp_train, timestamp_test, noConditionRegions, x_num, y_num, z_num = read_cache(fname, is_mmn,
                                                                                              pkl)
        print("load %s successfully" % fname)
    else:
        datapaths = [os.path.join(datapath, "48_48_20_MaxSpeedFillingFixed_5")]
        noConditionRegionsPath = os.path.join(datapath,
                                              "48_48_20_noSpeedRegion_0.05")
        X_train, Y_train, X_test, Y_test, mmn, external_dim, timestamp_train, timestamp_test, noConditionRegions, x_num, y_num, z_num = Data.loadDataFromRaw(
            paths=datapaths,
            noSpeedRegionPath=noConditionRegionsPath,
            nb_flow=nb_flow,
            len_closeness=len_closeness,
            len_period=len_period,
            len_trend=len_trend,
            len_test=len_test,
            maxMinNormalization=is_mmn,
            preprocess_name=pkl,
            meta_data=hasExternal,
            meteorol_data=hasExternal,
            holiday_data=hasExternal,
            isComplete=False)
        if CACHEDATA:
            cache(fname, X_train, Y_train, X_test, Y_test, external_dim,
                  timestamp_train, timestamp_test, noConditionRegions, is_mmn,
                  x_num, y_num, Paramater.Z_NUM)

    # print("\n days (test): ", [v[:8] for v in timestamp_test[0::72]])
    print("\nelapsed time (loading data): %.3f seconds\n" % (time.time() - ts))

    print('=' * 10)
    print("compiling model...")
    print(
        "**the first time, it may take a few minutes to compile if you use [Theano] as the backend**"
    )

    ts = time.time()
    print(X_train)

    print "start build model"
    # input = Input(shape=(nb_flow * len_closeness, x_num, y_num))
    # # Conv1
    # conv1 = Convolution2D(
    #     nb_filter=64, nb_row=3, nb_col=3, border_mode="same")(input)
    # # [nb_residual_unit] Residual Units
    # residual_output = ResUnits(_residual_unit, nb_filter=64,
    #                            repetations=nb_residual_unit)(conv1)
    # # Conv2
    # activation = Activation('relu')(residual_output)
    # conv2 = Convolution2D(
    #     nb_filter=nb_flow, nb_row=3, nb_col=3, border_mode="same")(activation)
    # main_output = Activation('tanh')(conv2)

    input = Input(shape=(nb_flow * len_closeness, x_num, y_num))
    # conv1 = Convolution2D(nb_filter=64, nb_row=3, nb_col=3, border_mode="same")(input)
    # act1 = Activation("relu")(conv1)
    reshape = Reshape((len_closeness, nb_flow, x_num, y_num))(input)

    convLSTM = ConvLSTM2D(nb_filter=32,
                          nb_row=3,
                          nb_col=3,
                          border_mode="same",
                          inner_activation="relu")(reshape)
    act2 = Activation("relu")(convLSTM)
    conv2 = Convolution2D(nb_filter=nb_flow,
                          nb_row=3,
                          nb_col=3,
                          border_mode="same")(act2)
    main_output = Activation('tanh')(conv2)

    model = Model(input=input, output=main_output)
    adam = Adam(lr=lr)
    model.compile(loss='mse', optimizer=adam, metrics=[metrics.rmse])
    model.summary()
    print "finish build model"

    hyperparams_name = 'testMyModel_speed.c{}.p{}.t{}.resunit{}.lr{}.{}.{}'.format(
        len_closeness, len_period, len_trend, nb_residual_unit, lr,
        "External" if hasExternal else "noExternal",
        "MMN" if is_mmn else "noMMN")

    fname_param = os.path.join(path_model,
                               '{}.best.h5'.format(hyperparams_name))

    early_stopping = EarlyStopping(monitor='val_rmse', patience=2, mode='min')
    model_checkpoint = ModelCheckpoint(fname_param,
                                       monitor='val_rmse',
                                       verbose=0,
                                       save_best_only=True,
                                       mode='min')

    print("\nelapsed time (compiling model): %.3f seconds\n" %
          (time.time() - ts))

    print('=' * 10)
    print("training model...")
    ts = time.time()
    history = model.fit(X_train,
                        Y_train,
                        nb_epoch=nb_epoch,
                        batch_size=batch_size,
                        validation_split=0.1,
                        callbacks=[early_stopping, model_checkpoint],
                        verbose=1)
    model.save_weights(os.path.join(path_model,
                                    '{}.h5'.format(hyperparams_name)),
                       overwrite=True)
    pickle.dump((history.history),
                open(
                    os.path.join(path_result,
                                 '{}.history.pkl'.format(hyperparams_name)),
                    'wb'))
    print("\nelapsed time (training): %.3f seconds\n" % (time.time() - ts))

    print('=' * 10)
    print('evaluating using the model that has the best loss on the valid set')
    ts = time.time()
    model.load_weights(fname_param)
    score = model.evaluate(X_train,
                           Y_train,
                           batch_size=Y_train.shape[0] // 48,
                           verbose=0)

    if is_mmn:
        print('Train score: %.6f rmse (norm): %.6f rmse (real): %.6f' %
              (score[0], score[1], score[1] * (mmn._max - mmn._min) / 2.))
    else:
        print('Train score: %.6f rmse (real): %.6f' % (score[0], score[1]))

    score = model.evaluate(X_test,
                           Y_test,
                           batch_size=Y_test.shape[0],
                           verbose=0)

    if is_mmn:
        print('Test score: %.6f rmse (norm): %.6f rmse (real): %.6f' %
              (score[0], score[1], score[1] * (mmn._max - mmn._min) / 2.))
    else:
        print('Test score: %.6f rmse (real): %.6f' % (score[0], score[1]))

    if not is_mmn:
        predict = model.predict(X_test)
    else:
        predict = mmn.inverse_transform(model.predict(X_test))
        Y_test = mmn.inverse_transform(Y_test)
    print("predict", predict)
    print("test", Y_test)
    print("RMSE:", Metric.RMSE(predict, Y_test, noConditionRegions))
    # print("accuracy", Metric.accuracy(predict, Y_test, noConditionRegions))

    print("\nelapsed time (eval): %.3f seconds\n" % (time.time() - ts))
    exit(1)
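
The network above is written against the Keras 1 API (Convolution2D, nb_filter/nb_row/nb_col, border_mode, inner_activation, nb_epoch). Under Keras 2 the same stack would look roughly like the sketch below; the shape and learning-rate values are illustrative stand-ins for the snippet's globals:

from keras.layers import Input, Reshape, ConvLSTM2D, Conv2D, Activation
from keras.models import Model
from keras.optimizers import Adam

nb_flow, len_closeness, x_num, y_num, lr = 1, 4, 48, 48, 0.0002  # assumed values

inp = Input(shape=(nb_flow * len_closeness, x_num, y_num))
x = Reshape((len_closeness, nb_flow, x_num, y_num))(inp)
# data_format="channels_first" matches the (time, channels, rows, cols) reshape
x = ConvLSTM2D(filters=32, kernel_size=(3, 3), padding="same",
               recurrent_activation="relu",  # was inner_activation
               data_format="channels_first")(x)
x = Activation("relu")(x)
x = Conv2D(filters=nb_flow, kernel_size=(3, 3), padding="same",
           data_format="channels_first")(x)
out = Activation("tanh")(x)

model = Model(inputs=inp, outputs=out)
model.compile(loss="mse", optimizer=Adam(lr=lr))
model.summary()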
Example #10
def main():
    # load data
    print("loading data...")

    datapath = os.path.join(Paramater.DATAPATH, "2016", "all")
    ts = time.time()
    if is_mmn:
        fname = os.path.join(
            datapath, 'CACHE', 'TaxiBJ_C{}_P{}_T{}_{}_mmn_speed.h5'.format(
                len_closeness, len_period, len_trend,
                "External" if hasExternal else "noExternal"))
    else:
        fname = os.path.join(
            datapath, 'CACHE', 'TaxiBJ_C{}_P{}_T{}_{}_speed.h5'.format(
                len_closeness, len_period, len_trend,
                "External" if hasExternal else "noExternal"))

    f2name = fname.replace(".h5", "_cell.h5")
    pkl_fname = fname + '.preprocessing_speed.pkl'
    if CACHEDATA and os.path.exists(f2name):
        # print(f2name)
        print("load %s successfully" % f2name)
        X_train, Y_train, X_test, Y_test, mmn, external_dim, timestamp_train, timestamp_test, noConditionRegions, x_num, y_num, z_num = read_cache(
            f2name, is_mmn, pkl_fname)
    else:
        if os.path.exists(fname) and CACHEDATA:
            X_train, Y_train, X_test, Y_test, mmn, external_dim, timestamp_train, timestamp_test, noConditionRegions, x_num, y_num, z_num = read_cache(
                fname, is_mmn, pkl_fname)

            print("load %s successfully" % fname)
        else:
            datapaths = [
                os.path.join(datapath, "48_48_20_MaxSpeedFillingFixed_5")
            ]
            noConditionRegionsPath = os.path.join(
                datapath, "48_48_20_noSpeedRegion_0.05")
            X_train, Y_train, X_test, Y_test, mmn, external_dim, timestamp_train, timestamp_test, noConditionRegions, x_num, y_num, z_num = Data.loadDataFromRaw(
                paths=datapaths,
                noSpeedRegionPath=noConditionRegionsPath,
                nb_flow=nb_flow,
                len_closeness=len_closeness,
                len_period=len_period,
                len_trend=len_trend,
                len_test=len_test,
                maxMinNormalization=is_mmn,
                preprocess_name=pkl_fname,
                meta_data=hasExternal,
                meteorol_data=hasExternal,
                holiday_data=hasExternal,
                isComplete=False)
            if CACHEDATA:
                cache(fname, X_train, Y_train, X_test, Y_test, external_dim,
                      timestamp_train, timestamp_test, noConditionRegions,
                      is_mmn, x_num, y_num, z_num)

        X_train, Y_train = Data.transformMatrixToCell(X_train, Y_train,
                                                      noConditionRegions,
                                                      hasExternal)
        X_test, Y_test = Data.transformMatrixToCell(X_test, Y_test,
                                                    noConditionRegions,
                                                    hasExternal)

        if CACHEDATA:
            cache(f2name, X_train, Y_train, X_test, Y_test,
                  external_dim, timestamp_train, timestamp_test,
                  list(noConditionRegions), is_mmn, x_num, y_num, z_num)

    # print "X_train", X_train
    # print "Y_train", Y_train
    # print "X_test", X_test
    # print "Y_test", Y_test

    # grid cv
    if grid_cv:
        max_depth = [None, 5, 10, 15]
        min_samples_split = [2, 4, 6]
        min_samples_leaf = [1, 2, 3]
        criterion = ["mse", "mae"]
        param_grid = dict(max_depth=max_depth,
                          min_samples_split=min_samples_split,
                          min_samples_leaf=min_samples_leaf,
                          criterion=criterion)

        grid = GridSearchCV(
            estimator=DecisionTreeRegressor(random_state=random_state),
            scoring="neg_mean_squared_error",
            param_grid=param_grid,
            n_jobs=-1,
            verbose=1)
        grid.refit = False
        grid_result = grid.fit(X_train, Y_train)

        print("Best: %f using %s" %
              (grid_result.best_score_, grid_result.best_params_))

        max_depth = grid_result.best_params_['max_depth']
        min_samples_split = grid_result.best_params_['min_samples_split']
        min_samples_leaf = grid_result.best_params_['min_samples_leaf']
        criterion = grid_result.best_params_["criterion"]

    else:
        max_depth = 10
        min_samples_split = 4
        min_samples_leaf = 1
        criterion = "mse"

    regressor = DecisionTreeRegressor(criterion=criterion,
                                      max_depth=max_depth,
                                      min_samples_leaf=min_samples_leaf,
                                      min_samples_split=min_samples_split,
                                      random_state=random_state)

    print("DT training...")
    regressor.fit(X_train, Y_train)
    print("training finished")
    score = regressor.score(X_test, Y_test)
    print(score)

    predict = regressor.predict(X_test)

    # print "p", predict, x_num, y_num, z_num, noConditionRegions
    predict = Data.transformCellToMatrix(
        predict,
        Data.getMatrixSize(predict.shape[0], x_num, y_num, z_num,
                           len(noConditionRegions)), x_num, y_num, z_num,
        noConditionRegions, Y_test.min())
    Y_test = Data.transformCellToMatrix(
        Y_test,
        Data.getMatrixSize(Y_test.shape[0], x_num, y_num, z_num,
                           len(noConditionRegions)), x_num, y_num, z_num,
        noConditionRegions, Y_test.min())
    # print predict
    # print Y_test
    if is_mmn:
        mmn.printMinMax()
        predict = mmn.inverse_transform(predict)
        Y_test = mmn.inverse_transform(Y_test)

    print "predict", predict
    print "Y_test", Y_test
    print("RMSE:", Metric.RMSE(predict, Y_test, noConditionRegions))