def evaluation(method, dataset, user, device_source):
    """Train one CNN on a dataset/method pair and log train and test metrics.

    The feature list is read from
    ``<root>/<device_source>/mamadroid/<dataset>/<method>/<method>_save_feature_list.csv``.
    A ``CNN(layer_num=3, kernel_size=3, gpu_id=2)`` is fitted on the training
    data, and TP/FP/TN/FN/F1 (scores thresholded at 0.5) are printed and
    appended to ``log/cnn_<dataset>_<method>_evaluation.txt`` for the
    training data and for test split 0. An existing log file of that name is
    deleted first.

    Args:
        method: Method name; used in the paths and forwarded to the loaders.
        dataset: Dataset name; used in the paths and forwarded to the loaders.
        user: Account that selects the data root: ``'mlsnrs'`` or
            ``'shellhand'``.
        device_source: Path component between the data root and
            ``mamadroid``; also forwarded to the data loaders.

    Raises:
        ValueError: If ``user`` is not one of the known accounts.
    """
    # Validate before touching the filesystem: previously an unknown user
    # deleted the old log and then crashed with UnboundLocalError.
    root_dirs = {'mlsnrs': '/home/mlsnrs/apks', 'shellhand': '/mnt'}
    if user not in root_dirs:
        raise ValueError('unknown user %r; expected one of: %s' %
                         (user, ', '.join(sorted(root_dirs))))
    root_dir_prefix = root_dirs[user]

    log_name = 'log/cnn_%s_%s_evaluation.txt' % (dataset, method)
    if os.path.exists(log_name):
        os.remove(log_name)

    def log_metrics(label, y_true, y_score):
        """Print and append one 'TP FP TN FN F1' line for a prediction set."""
        # labels=[0, 1] keeps the matrix 2x2 when only one class occurs.
        # Assumes labels are encoded 0 (negative) / 1 (positive), which the
        # cm[1][1]-is-TP indexing already relies on.
        cm = confusion_matrix(y_true, np.int32(y_score >= 0.5), labels=[0, 1])
        tn, fp = cm[0][0], cm[0][1]
        fn, tp = cm[1][0], cm[1][1]
        denom = 2 * tp + fn + fp
        # F1 is reported as 0 when there are no positives at all.
        f1 = float(2 * tp) / denom if denom else 0.0
        line = '%s TP FP TN FN F1: %d %d %d %d %.4f' % (label, tp, fp, tn, fn, f1)
        print(line)
        with open(log_name, 'a') as f:
            f.write(line + '\n')

    save_feature_path = '%s/%s/mamadroid/%s/%s/%s_save_feature_list.csv' % (
        root_dir_prefix, device_source, dataset, method, method)
    save_feature_dict = get_save_feature_dict(save_feature_path)
    print('have read save_feature_dict: %d' % len(save_feature_dict))

    x_train, y_train = get_train_data(dataset, method, save_feature_dict,
                                      root_dir_prefix, device_source)
    print('x_train shape: %s y_train shape: %s' %
          (str(x_train.shape), str(y_train.shape)))

    start = time.time()
    print('start train')
    clf = CNN(layer_num=3, kernel_size=3, gpu_id=2)
    clf.fit(x_train, y_train, epoch=5, batch_size=500, lr=0.01)
    end = time.time()
    print('Training model time used: %f s' % (end - start))

    print(x_train.shape)
    y_pred = clf.predict(x_train, batch_size=20)
    print(y_pred.shape)
    log_metrics('train data', y_train, y_pred)

    # Drop the training arrays before loading test data (presumably to lower
    # peak memory).
    del x_train, y_train

    for test_id in range(0, 1):  # upper bound was 13; only split 0 is run
        x_test, y_test = get_test_data(dataset, test_id, method,
                                       save_feature_dict, root_dir_prefix,
                                       device_source)
        print('x_test shape: %s y_test shape: %s' %
              (str(x_test.shape), str(y_test.shape)))
        y_pred = clf.predict(x_test, batch_size=500)
        # The scores are thresholded inside log_metrics, exactly as for the
        # training data; previously the raw scores were passed straight to
        # confusion_matrix here.
        log_metrics('test_id %d' % test_id, y_test, y_pred)
def evaluation(method, dataset, user, device_source):
    """Train one CNN per training year and log metrics for every later month.

    For each ``train_year`` in 2012..2017 a fresh
    ``CNN(layer_num=3, kernel_size=5, gpu_id=3)`` is fitted on that year's
    data. TP/FP/TN/FN/F1 (scores thresholded at 0.5) are printed and appended
    to ``log/cnn_<dataset>_<method>_<train_year>train_evaluation.txt``, first
    for the training data and then for every month of every year from
    ``train_year + 1`` through 2018. An existing log file of that name is
    deleted before each training year starts.

    Args:
        method: Method name; used in the paths and forwarded to the loaders.
        dataset: Dataset name; used in the paths and forwarded to the loaders.
        user: Account that selects the data root: ``'mlsnrs'`` or
            ``'shellhand'``.
        device_source: Not used by this function; the feature-list path
            hard-codes ``ssd_1T`` for that path component.

    Raises:
        ValueError: If ``user`` is not one of the known accounts.
    """
    # An unknown user previously left root_dir_prefix unbound and crashed
    # with UnboundLocalError on the next line.
    root_dirs = {'mlsnrs': '/home/mlsnrs/apks', 'shellhand': '/mnt'}
    if user not in root_dirs:
        raise ValueError('unknown user %r; expected one of: %s' %
                         (user, ', '.join(sorted(root_dirs))))
    root_dir_prefix = root_dirs[user]

    def pick_batch_size(n_samples, preferred=20):
        """Return a batch size >= preferred whose last batch is not one sample.

        The original code fell back from 20 to 21 only, which still leaves a
        single-sample final batch when n_samples % 21 == 1 as well (e.g.
        421). A lone sample (n_samples == 1) cannot be avoided; 21 is
        returned then, as before.
        """
        batch_size = preferred
        if n_samples % batch_size != 1:
            return batch_size
        batch_size += 1
        while n_samples > 1 and n_samples % batch_size == 1:
            batch_size += 1
        return batch_size

    def log_metrics(log_name, label, y_true, y_score):
        """Print and append one 'TP FP TN FN F1' line for a prediction set."""
        # labels=[0, 1] keeps the matrix 2x2 when a month holds one class.
        # Assumes labels are encoded 0 (negative) / 1 (positive), which the
        # cm[1][1]-is-TP indexing already relies on.
        cm = confusion_matrix(y_true, np.int32(y_score >= 0.5), labels=[0, 1])
        tn, fp = cm[0][0], cm[0][1]
        fn, tp = cm[1][0], cm[1][1]
        denom = 2 * tp + fn + fp
        # F1 is reported as 0 when there are no positives at all.
        f1 = float(2 * tp) / denom if denom else 0.0
        line = '%s TP FP TN FN F1: %d %d %d %d %.4f' % (label, tp, fp, tn, fn, f1)
        print(line)
        with open(log_name, 'a') as f:
            f.write(line + '\n')

    save_feature_path = '%s/ssd_1T/mamadroid/%s/%s/%s_save_feature_list.csv' % (
        root_dir_prefix, dataset, method, method)
    save_feature_dict = get_save_feature_dict(save_feature_path)
    print('have read save_feature_dict: %d' % len(save_feature_dict))

    for train_year in range(2012, 2018):
        log_name = 'log/cnn_%s_%s_%dtrain_evaluation.txt' % (
            dataset, method, train_year)
        if os.path.exists(log_name):
            os.remove(log_name)

        x_train, y_train = get_train_data(dataset, train_year, method,
                                          save_feature_dict, root_dir_prefix)
        print('x_train shape: %s y_train shape: %s' %
              (str(x_train.shape), str(y_train.shape)))

        start = time.time()
        print('start train')
        clf = CNN(layer_num=3, kernel_size=5, gpu_id=3)
        clf.fit(x_train, y_train, epoch=260, batch_size=350, lr=0.01)
        end = time.time()
        print('Training model time used: %f s' % (end - start))

        print(x_train.shape)
        y_pred = clf.predict(
            x_train, batch_size=pick_batch_size(x_train.shape[0]))
        print(y_pred.shape)
        log_metrics(log_name, 'train %d data' % train_year, y_train, y_pred)

        # Drop the training arrays before loading test data (presumably to
        # lower peak memory).
        del x_train, y_train

        for test_year in range(train_year + 1, 2019):
            for test_month in range(0, 12):  # zero-based; logged as 1..12
                x_test, y_test = get_test_data(dataset, test_year, test_month,
                                               method, save_feature_dict,
                                               root_dir_prefix)
                print('%d-%02d x_test shape: %s y_test shape: %s' %
                      (test_year, test_month + 1, str(x_test.shape),
                       str(y_test.shape)))
                y_pred = clf.predict(
                    x_test, batch_size=pick_batch_size(x_test.shape[0]))
                log_metrics(log_name,
                            'test %d-%02d' % (test_year, test_month + 1),
                            y_test, y_pred)
def optimize_para(method, dataset, user, device_source):
    """Grid-search CNN batch size and learning rate, logging metrics per step.

    Trains on the 2012 data and evaluates on the 2013 / month-index-0 test
    data. For every batch size in 50..500 (step 50) and every learning rate
    in (0.01, 0.1, 0.001) a fresh ``CNN(layer_num=3, kernel_size=5,
    gpu_id=2)`` is created and ``fit`` is called 50 times with 10 epochs
    each. After each call TP/FP/TN/FN/F1 (scores thresholded at 0.5) are
    printed and appended to
    ``log/optimize_cnn_<dataset>_<method>_evaluation_v2.txt`` for both the
    training and the test data. The log file is appended to, never truncated.

    Args:
        method: Method name; used in the paths and forwarded to the loaders.
        dataset: Dataset name; used in the paths and forwarded to the loaders.
        user: Account that selects the data root: ``'mlsnrs'`` or
            ``'shellhand'``.
        device_source: Path component between the data root and
            ``mamadroid`` in the feature-list path.

    Raises:
        ValueError: If ``user`` is not one of the known accounts.
    """
    # An unknown user previously left root_dir_prefix unbound and crashed
    # with UnboundLocalError further down.
    root_dirs = {'mlsnrs': '/home/mlsnrs/apks', 'shellhand': '/mnt'}
    if user not in root_dirs:
        raise ValueError('unknown user %r; expected one of: %s' %
                         (user, ', '.join(sorted(root_dirs))))
    root_dir_prefix = root_dirs[user]

    log_name = 'log/optimize_cnn_%s_%s_evaluation_v2.txt' % (dataset, method)

    def log_metrics(label, y_true, y_score):
        """Print and append one 'TP FP TN FN F1' line for a prediction set."""
        # labels=[0, 1] keeps the matrix 2x2 when only one class occurs.
        # Assumes labels are encoded 0 (negative) / 1 (positive), which the
        # cm[1][1]-is-TP indexing already relies on.
        cm = confusion_matrix(y_true, np.int32(y_score >= 0.5), labels=[0, 1])
        tn, fp = cm[0][0], cm[0][1]
        fn, tp = cm[1][0], cm[1][1]
        denom = 2 * tp + fn + fp
        # F1 is reported as 0 when there are no positives at all.
        f1 = float(2 * tp) / denom if denom else 0.0
        line = '%s TP FP TN FN F1: %d %d %d %d %.4f' % (label, tp, fp, tn, fn, f1)
        print(line)
        with open(log_name, 'a') as f:
            f.write(line + '\n')

    save_feature_path = '%s/%s/mamadroid/%s/%s/%s_save_feature_list.csv' % (
        root_dir_prefix, device_source, dataset, method, method)
    save_feature_dict = get_save_feature_dict(save_feature_path)
    print('have read save_feature_dict: %d' % len(save_feature_dict))

    # Arguments: dataset, train_year, method, save_feature_dict, root_dir_prefix
    x_train, y_train = get_train_data(dataset, 2012, method,
                                      save_feature_dict, root_dir_prefix)
    print('x_train shape: %s y_train shape: %s' %
          (str(x_train.shape), str(y_train.shape)))

    # The test data does not depend on any grid parameter, so load it once
    # instead of once per training step.
    # Arguments: dataset, test_year, test_month, method, save_feature_dict,
    # root_dir_prefix
    x_test, y_test = get_test_data(dataset, 2013, 0, method,
                                   save_feature_dict, root_dir_prefix)
    test_id = 0  # only one test set is evaluated; kept for the log format

    print('start train')
    step_size = 10
    for b in range(50, 501, 50):
        for k in [5]:  # 3, 5
            for lr in [0.01, 0.1, 0.001]:
                clf = CNN(layer_num=3, kernel_size=k, gpu_id=2)
                # e is the cumulative epoch count, assuming fit() continues
                # training the same model on each call -- TODO confirm.
                for e in range(10, 501, step_size):
                    clf.fit(x_train, y_train, epoch=step_size,
                            batch_size=b, lr=lr)
                    # %g instead of %.2f so lr=0.001 is not logged as 0.00.
                    setting = 'batch_size=%d kernel_size=%d lr=%g epoch=%d' % (
                        b, k, lr, e)
                    y_pred = clf.predict(x_train, batch_size=20)
                    log_metrics('train data %s' % setting, y_train, y_pred)
                    y_pred = clf.predict(x_test, batch_size=20)
                    log_metrics('test_id %d %s' % (test_id, setting),
                                y_test, y_pred)
# Training script: fit `model` on `num_batches` mini-batches with Adam, then
# print the accuracy on the data loader's evaluation split and plot the
# recorded loss values.
num_batches = 1000
batch_size = 50
learning_rate = 0.01
# model = MLP()
model = CNN()
data_loader = DataLoader()
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
losses = []
for batch_index in range(num_batches):
    X, y = data_loader.get_batch(batch_size)
    # Run the forward pass under the tape so the loss can be differentiated
    # with respect to the model variables below.
    with tf.GradientTape() as tape:
        y_logit_pred = model(tf.convert_to_tensor(X))
        loss = tf.losses.sparse_softmax_cross_entropy(labels=y, logits=y_logit_pred)
        # Report progress every 100th batch only.
        if batch_index % 100 == 0:
            print("batch %d: loss %f" % (batch_index, loss.numpy()))
        # NOTE(review): the loss is recorded for every batch here so the plot
        # shows the full curve -- confirm this is the intended sampling.
        losses.append(loss.numpy())
    grads = tape.gradient(loss, model.variables)
    optimizer.apply_gradients(grads_and_vars=zip(grads, model.variables))
num_eval_samples = np.shape(data_loader.eval_labels)[0]
y_pred = model.predict(data_loader.eval_data).numpy()
# Accuracy = number of predictions equal to the labels / number of samples.
print("Test accuracy: {}".format(
    sum(y_pred == data_loader.eval_labels) / num_eval_samples))
plt.plot(losses)
plt.show()