Exemplo n.º 1
0
def evaluation(method, dataset, user, device_source):
    """Train a CNN on one feature set and log confusion-matrix metrics.

    The storage root selected by ``user`` is combined with ``device_source``,
    ``dataset`` and ``method`` to locate the saved-feature list.  A ``CNN`` is
    fitted on the training split, scored on that same split and then on each
    test split.  Every result line is printed and appended to
    ``log/cnn_<dataset>_<method>_evaluation.txt``; an existing log with that
    name is deleted first.

    Args:
        method: Feature-extraction method name; used in paths and the log name.
        dataset: Dataset name; used in paths and the log name.
        user: Account selecting the storage root, ``'mlsnrs'`` or
            ``'shellhand'``.
        device_source: Path component placed directly under the storage root.

    Raises:
        ValueError: If ``user`` is not one of the known accounts.
    """
    root_dirs = {'mlsnrs': '/home/mlsnrs/apks', 'shellhand': '/mnt'}
    if user not in root_dirs:
        # Fail with a clear message instead of an UnboundLocalError later on.
        raise ValueError(
            "unknown user %r (expected 'mlsnrs' or 'shellhand')" % (user,))
    root_dir_prefix = root_dirs[user]

    log_name = 'log/cnn_%s_%s_evaluation.txt' % (dataset, method)
    if os.path.exists(log_name):
        os.remove(log_name)

    def report(label, y_true, y_score):
        # Scores are binarised at 0.5 for both the train and the test splits.
        # labels=[0, 1] keeps the matrix 2x2 even when only one class occurs.
        cm = confusion_matrix(y_true, np.int32(y_score >= 0.5), labels=[0, 1])
        TN, FP, FN, TP = cm.ravel()
        denominator = 2 * TP + FN + FP
        # With no positives at all (true or predicted) F1 is reported as 0.
        F1 = float(2 * TP) / denominator if denominator else 0.0
        line = '%s TP FP TN FN F1: %d %d %d %d %.4f' % (label, TP, FP, TN, FN,
                                                       F1)
        print(line)
        with open(log_name, 'a') as f:
            f.write(line + '\n')

    save_feature_path = '%s/%s/mamadroid/%s/%s/%s_save_feature_list.csv' % (
        root_dir_prefix, device_source, dataset, method, method)
    save_feature_dict = get_save_feature_dict(save_feature_path)
    print('have read save_feature_dict: %d' % len(save_feature_dict))
    x_train, y_train = get_train_data(dataset, method, save_feature_dict,
                                      root_dir_prefix, device_source)
    print('x_train shape: %s y_train shape: %s' %
          (str(x_train.shape), str(y_train.shape)))

    start = time.time()
    print('start train')
    clf = CNN(layer_num=3, kernel_size=3, gpu_id=2)
    clf.fit(x_train, y_train, epoch=5, batch_size=500, lr=0.01)
    end = time.time()
    print('Training  model time used: %f s' % (end - start))

    # Score the model on the data it was trained on.
    print(x_train.shape)
    y_pred = clf.predict(x_train, batch_size=20)
    print(y_pred.shape)
    report('train data', y_train, y_pred)

    # Drop the training arrays before the test splits are loaded.
    del x_train, y_train

    for test_id in range(0, 1):  # only the first test split is evaluated
        x_test, y_test = get_test_data(dataset, test_id, method,
                                       save_feature_dict, root_dir_prefix,
                                       device_source)
        print('x_test shape: %s y_test shape: %s' %
              (str(x_test.shape), str(y_test.shape)))
        y_pred = clf.predict(x_test, batch_size=500)
        report('test_id %d' % test_id, y_test, y_pred)
def train_cnn_model(emb_layer, x_train, y_train, x_val, y_val, opt):
    """Build, compile and fit the two-class CNN model.

    The model is built by ``CNN(...).build_model()`` from the settings on
    ``opt``, compiled with categorical cross-entropy and Adam, and fitted with
    early stopping on ``val_loss`` (patience 2).  The training history is
    written to ``CNN_train_history.txt`` in the current directory.

    Args:
        emb_layer: Embedding layer handed to ``CNN`` as ``embedding_layer``.
        x_train: Training inputs.
        y_train: Training targets; presumably one-hot encoded, since the loss
            is categorical cross-entropy (confirm against the caller).
        x_val: Validation inputs.
        y_val: Validation targets.
        opt: Options object; reads ``n_words``, ``embed_dim``,
            ``cnn_filter_shapes``, ``filter_sizes``, ``sent_len``,
            ``dropout_ratio``, ``cnn_epoch`` and ``batch_size``.

    Returns:
        The fitted model.
    """
    # NOTE(review): opt.cnn_filter_shapes feeds ``filter_sizes`` while
    # opt.filter_sizes feeds ``feature_maps``; kept as found, confirm intent.
    model = CNN(embedding_layer=emb_layer,
                num_words=opt.n_words,
                embedding_dim=opt.embed_dim,
                filter_sizes=opt.cnn_filter_shapes,
                feature_maps=opt.filter_sizes,
                max_seq_length=opt.sent_len,
                dropout_rate=opt.dropout_ratio,
                hidden_units=200,
                nb_classes=2).build_model()

    model.compile(loss='categorical_crossentropy',
                  optimizer=optimizers.Adam(),
                  metrics=['accuracy'])

    # summary() prints the table itself and returns None, so wrapping it in
    # print() only added a stray "None" line to the output.
    model.summary()

    # Stop once validation loss has not improved for two consecutive epochs.
    early_stopping = EarlyStopping(monitor='val_loss', patience=2)
    history = model.fit(x_train,
                        y_train,
                        epochs=opt.cnn_epoch,
                        batch_size=opt.batch_size,
                        verbose=1,
                        validation_data=(x_val, y_val),
                        callbacks=[early_stopping])

    with open("CNN_train_history.txt", "w") as f:
        print(history.history, file=f)
    return model
def train_baseline_cnn(emb_layer, x_train, y_train, x_val, y_val, opt):
    """Build, compile and fit the baseline two-class CNN model.

    The model is built by ``CNN(...).build_model()`` from the baseline
    settings on ``opt`` and compiled with categorical cross-entropy and Adam.
    Fitting uses three callbacks: early stopping on ``val_loss`` (patience 2),
    TensorBoard logging under ``opt.tbpath``, and a checkpoint written to
    ``baseline_cnn.h5``.  The training history is written to
    ``CNN_train_baseline_history.txt`` in the current directory.

    Args:
        emb_layer: Embedding layer handed to ``CNN`` as ``embedding_layer``.
        x_train: Training inputs.
        y_train: Training targets; presumably one-hot encoded, since the loss
            is categorical cross-entropy (confirm against the caller).
        x_val: Validation inputs.
        y_val: Validation targets.
        opt: Options object; reads ``transfer_n_words``,
            ``baseline_embed_dim``, ``cnn_filter_shapes``, ``filter_sizes``,
            ``baseline_sent_len``, ``baseline_drop_out_ratio``, ``tbpath``,
            ``baseline_epochs`` and ``baseline_batchsize``.

    Returns:
        The fitted model.
    """
    # NOTE(review): opt.cnn_filter_shapes feeds ``filter_sizes`` while
    # opt.filter_sizes feeds ``feature_maps``; kept as found, confirm intent.
    model = CNN(embedding_layer=emb_layer,
                num_words=opt.transfer_n_words,
                embedding_dim=opt.baseline_embed_dim,
                filter_sizes=opt.cnn_filter_shapes,
                feature_maps=opt.filter_sizes,
                max_seq_length=opt.baseline_sent_len,
                dropout_rate=opt.baseline_drop_out_ratio,
                hidden_units=200,
                nb_classes=2).build_model()

    model.compile(loss='categorical_crossentropy',
                  optimizer=optimizers.Adam(),
                  metrics=['accuracy'])

    # summary() prints the table itself and returns None, so wrapping it in
    # print() only added a stray "None" line to the output.
    model.summary()

    # The current timestamp gives every run its own TensorBoard directory.
    tb_call_back = TensorBoard(log_dir=f'{opt.tbpath}/baseline_cnn_{time()}',
                               histogram_freq=1,
                               write_graph=True,
                               write_images=True)

    # NOTE(review): whether validation accuracy is logged as 'val_acc' or
    # 'val_accuracy' depends on the Keras version in use; if the name does
    # not match, save_best_only has nothing to compare. Confirm.
    checkpoint = ModelCheckpoint("baseline_cnn.h5",
                                 monitor='val_acc',
                                 verbose=1,
                                 save_best_only=True,
                                 save_weights_only=False,
                                 mode='auto',
                                 period=1)
    # Stop once validation loss has not improved for two consecutive epochs.
    early_stopping = EarlyStopping(monitor='val_loss', patience=2)
    history = model.fit(x_train,
                        y_train,
                        epochs=opt.baseline_epochs,
                        batch_size=opt.baseline_batchsize,
                        verbose=1,
                        validation_data=(x_val, y_val),
                        callbacks=[early_stopping, tb_call_back, checkpoint])

    with open("CNN_train_baseline_history.txt", "w") as f:
        print(history.history, file=f)
    return model
Exemplo n.º 4
0
    ).build_model()

    model.compile(
        loss='categorical_crossentropy',
        optimizer=keras.optimizers.Adam(),
        metrics=['accuracy']
    )

    # model.summary()

    history = model.fit(
        X_train, y_train,
        epochs=NB_EPOCHS,
        batch_size=BATCH_SIZE,
        validation_data=(X_val, y_val),
        callbacks=[
            keras.callbacks.ModelCheckpoint(
                'model-%i.h5' % (i + 1), monitor='val_loss', verbose=1, save_best_only=True, mode='min'
            ),
            # keras.callbacks.TensorBoard(log_dir='./logs/temp', write_graph=True)
        ]
    )
    print()
    histories.append(history.history)


# EVALUATION -------------------------------------------------------------------

def get_avg(histories, his_key):
    """Collect ``his_key`` at each run's lowest-``val_loss`` epoch.

    NOTE(review): the part of this function visible here only builds the
    list; the averaging/return step that the name suggests is not shown.
    Confirm against the full source.
    """
    # One entry per training history.
    tmp = []
    for history in histories:
        # np.argmin gives the index of the smallest validation loss; the
        # requested metric is read at that same index.
        tmp.append(history[his_key][np.argmin(history['val_loss'])])
Exemplo n.º 5
0
def optimize_para(method, dataset, user, device_source):
    """Sweep CNN hyper-parameters and log metrics after every training step.

    For each combination of batch size (50..500 in steps of 50), kernel size
    (5) and learning rate (0.01, 0.1, 0.001) a fresh ``CNN`` is created and
    ``fit`` is called repeatedly with ``epoch=10``.  After each call the model
    is scored on the 2012 training data and on the 2013 / month-index-0 test
    data.  Each result line is printed and appended to
    ``log/optimize_cnn_<dataset>_<method>_evaluation_v2.txt``, which is never
    truncated.

    Args:
        method: Feature-extraction method name; used in paths and the log name.
        dataset: Dataset name; used in paths and the log name.
        user: Account selecting the storage root, ``'mlsnrs'`` or
            ``'shellhand'``.
        device_source: Path component placed directly under the storage root.

    Raises:
        ValueError: If ``user`` is not one of the known accounts.
    """
    root_dirs = {'mlsnrs': '/home/mlsnrs/apks', 'shellhand': '/mnt'}
    if user not in root_dirs:
        # Fail with a clear message instead of an UnboundLocalError later on.
        raise ValueError(
            "unknown user %r (expected 'mlsnrs' or 'shellhand')" % (user,))
    root_dir_prefix = root_dirs[user]

    log_name = 'log/optimize_cnn_%s_%s_evaluation_v2.txt' % (dataset, method)

    def report(label, batch_size, kernel_size, lr, epoch, y_true, y_score):
        # labels=[0, 1] keeps the matrix 2x2 even when only one class occurs.
        cm = confusion_matrix(y_true, np.int32(y_score >= 0.5), labels=[0, 1])
        TN, FP, FN, TP = cm.ravel()
        denominator = 2 * TP + FN + FP
        # With no positives at all (true or predicted) F1 is reported as 0.
        F1 = float(2 * TP) / denominator if denominator else 0.0
        # %g keeps small learning rates readable; the previous %.2f rendered
        # lr=0.001 as "0.00".
        line = ('%s batch_size=%d kernel_size=%d lr=%g epoch=%d '
                'TP FP TN FN F1: %d %d %d %d %.4f' %
                (label, batch_size, kernel_size, lr, epoch, TP, FP, TN, FN,
                 F1))
        print(line)
        with open(log_name, 'a') as f:
            f.write(line + '\n')

    save_feature_path = '%s/%s/mamadroid/%s/%s/%s_save_feature_list.csv' % (
        root_dir_prefix, device_source, dataset, method, method)
    save_feature_dict = get_save_feature_dict(save_feature_path)
    print('have read save_feature_dict: %d' % len(save_feature_dict))
    # Positional order: dataset, train_year, method, save_feature_dict,
    # root_dir_prefix.
    x_train, y_train = get_train_data(dataset, 2012, method,
                                      save_feature_dict, root_dir_prefix)
    print('x_train shape: %s y_train shape: %s' %
          (str(x_train.shape), str(y_train.shape)))
    # The test request never changes inside the sweep, so it is loaded once.
    # Positional order: dataset, test_year, test_month, method,
    # save_feature_dict, root_dir_prefix.
    # NOTE(review): assumes get_test_data returns the same data on every call.
    x_test, y_test = get_test_data(dataset, 2013, 0, method,
                                   save_feature_dict, root_dir_prefix)

    print('start train')
    step_size = 10
    for b in range(50, 501, 50):
        for k in [5]:  # 3 was also tried
            for lr in [0.01, 0.1, 0.001]:
                clf = CNN(layer_num=3, kernel_size=k, gpu_id=2)
                # Each fit() call runs step_size more epochs on the same
                # object; e is the running total used as the log label.
                for e in range(10, 501, step_size):
                    clf.fit(x_train,
                            y_train,
                            epoch=step_size,
                            batch_size=b,
                            lr=lr)
                    y_pred = clf.predict(x_train, batch_size=20)
                    report('train data', b, k, lr, e, y_train, y_pred)
                    for test_id in range(0, 1):  # single test split for now
                        y_pred = clf.predict(x_test, batch_size=20)
                        report('test_id %d' % test_id, b, k, lr, e, y_test,
                               y_pred)
Exemplo n.º 6
0
def evaluation(method, dataset, user, device_source):
    """Train one CNN per year and score it on every later month.

    For each training year from 2012 to 2017 a ``CNN`` is fitted on that
    year's data and scored on the same data.  It is then scored on every
    month of each following year up to and including 2018.  Result lines are
    printed and appended to
    ``log/cnn_<dataset>_<method>_<train_year>train_evaluation.txt``; an
    existing log with that name is deleted first.

    Args:
        method: Feature-extraction method name; used in paths and log names.
        dataset: Dataset name; used in paths and log names.
        user: Account selecting the storage root, ``'mlsnrs'`` or
            ``'shellhand'``.
        device_source: Not used by this function (the path hard-codes
            ``ssd_1T``); kept so the signature stays unchanged.

    Raises:
        ValueError: If ``user`` is not one of the known accounts.
    """
    root_dirs = {'mlsnrs': '/home/mlsnrs/apks', 'shellhand': '/mnt'}
    if user not in root_dirs:
        # Fail with a clear message instead of an UnboundLocalError later on.
        raise ValueError(
            "unknown user %r (expected 'mlsnrs' or 'shellhand')" % (user,))
    root_dir_prefix = root_dirs[user]

    def predict_scores(model, x):
        # Start at 20 and grow the batch size until the final batch no longer
        # holds exactly one sample.  The previous single fallback to 21 still
        # left such a batch when the count was also 1 modulo 21 (e.g. 421).
        # NOTE(review): presumably the model cannot process a one-sample
        # batch; confirm against CNN.predict.
        n_samples = x.shape[0]
        batch_size = 20
        while n_samples > 1 and n_samples % batch_size == 1:
            batch_size += 1
        return model.predict(x, batch_size=batch_size)

    def report(log_path, label, y_true, y_score):
        # labels=[0, 1] keeps the matrix 2x2 even when only one class occurs.
        cm = confusion_matrix(y_true, np.int32(y_score >= 0.5), labels=[0, 1])
        TN, FP, FN, TP = cm.ravel()
        denominator = 2 * TP + FN + FP
        # With no positives at all (true or predicted) F1 is reported as 0.
        F1 = float(2 * TP) / denominator if denominator else 0.0
        line = '%s TP FP TN FN F1: %d %d %d %d %.4f' % (label, TP, FP, TN, FN,
                                                       F1)
        print(line)
        with open(log_path, 'a') as f:
            f.write(line + '\n')

    save_feature_path = '%s/ssd_1T/mamadroid/%s/%s/%s_save_feature_list.csv' % (
        root_dir_prefix, dataset, method, method)
    save_feature_dict = get_save_feature_dict(save_feature_path)
    print('have read save_feature_dict: %d' % len(save_feature_dict))

    for train_year in range(2012, 2018):
        log_name = 'log/cnn_%s_%s_%dtrain_evaluation.txt' % (dataset, method,
                                                             train_year)
        if os.path.exists(log_name):
            os.remove(log_name)
        x_train, y_train = get_train_data(dataset, train_year, method,
                                          save_feature_dict, root_dir_prefix)
        print('x_train shape: %s y_train shape: %s' %
              (str(x_train.shape), str(y_train.shape)))

        start = time.time()
        print('start train')
        clf = CNN(layer_num=3, kernel_size=5, gpu_id=3)
        clf.fit(x_train, y_train, epoch=260, batch_size=350, lr=0.01)
        end = time.time()
        print('Training  model time used: %f s' % (end - start))

        # Score the model on the data it was trained on.
        print(x_train.shape)
        y_pred = predict_scores(clf, x_train)
        print(y_pred.shape)
        report(log_name, 'train %d data' % train_year, y_train, y_pred)

        # Drop the training arrays before the monthly test sets are loaded.
        del x_train, y_train

        for test_year in range(train_year + 1, 2019):
            # test_month is zero-based; it is shown as 01..12 in the output.
            for test_month in range(0, 12):
                x_test, y_test = get_test_data(dataset, test_year, test_month,
                                               method, save_feature_dict,
                                               root_dir_prefix)
                print('%d-%02d x_test shape: %s y_test shape: %s' %
                      (test_year, test_month + 1, str(
                          x_test.shape), str(y_test.shape)))
                y_pred = predict_scores(clf, x_test)
                report(log_name,
                       'test %d-%02d' % (test_year, test_month + 1), y_test,
                       y_pred)