if __name__ == '__main__':
    # Entry point: build the user/item rating structures from the Jester
    # dataset and evaluate collaborative-filtering predictions.
    filename = 'data/jester-data-1.csv'
    items = {}
    users = {}
    matrix = []
    matrix, users, items = parse(filename)
    f = Filter(matrix, users, items)
    if len(sys.argv) == 3:
        # Usage: <script> <method> <testFile> — evaluate one method on one file.
        method = sys.argv[1]
        testFile = sys.argv[2]
        # BUG FIX: file handles were opened and never closed; use `with`.
        with open(testFile, "r") as fh:
            testData = csv.reader(fh)
            results = f.execute(method, testData)
        print_evaluation(f, method, results)
    elif len(sys.argv) == 2 and sys.argv[1] == 'all':
        # Usage: <script> all — run adj_weighted_sum over the four canned
        # test sets.
        method = "adj_weighted_sum"
        for name in ("TestSet01", "TestSet02", "TestSet03", "TestSet04"):
            with open("data/{}.csv".format(name), "r") as fh:
                results = f.execute(method, csv.reader(fh))
            # BUG FIX: the original computed predictions for TestSet04 but
            # only printed evaluations for TestSet01-03; the loop covers all
            # four and removes the copy-pasted blocks.
            print_evaluation(f, name, results)
# NOTE(review): orphaned fragment — this text begins in the middle of a
# feed_dict literal, so its enclosing definition is not visible here. It
# duplicates the validation/testing tail of the `run` method found later in
# this dump, except that it references `char2vec` / `output_char2vec` /
# `seq_length` without the `self.` prefix. Presumably a stale copy/paste
# remnant of a module-level version of that training loop; confirm it is
# unused and delete it. Left byte-identical below.
X: valid_dataset.input_batch, Y: valid_dataset.target_batch, Sequences: valid_dataset.seq_lens, keep_prob: 1 }) accuracy = tf.reduce_mean( tf.cast(tf.equal(result, tf.cast(Y, tf.int64)), tf.float32)) accuracy_ret = sess.run(accuracy, feed_dict={Y: valid_dataset.target_batch}) speed = time.time() - last_time print('Epoch:', '%04d ' % (epoch + 1), 'accuracy =', '{:.6f} '.format(accuracy_ret), 'cost =', '{:.6f}'.format(loss), 'speed =', '{:.2f}'.format(speed), 'sec') last_time = time.time() avg_p, avg_r, avg_f = utils.print_evaluation( valid_dataset.target_batch, result, output_char2vec.char_dict) eval.set(epoch, accuracy_ret, loss, speed, avg_p, avg_r, avg_f) print('') print('\n------------ Testing ------------ ') test_sentences = data.read_data("data/test/BHXX0035.txt", 30) test_dataset, _ = data.make_sequences(test_sentences, char2vec, output_char2vec, seq_length, make_valid=False) result = sess.run(prediction, feed_dict={ X: test_dataset.input_batch, Y: test_dataset.target_batch,
def process(dataroot, classifier_name, file_ending):
    """Run stratified K-fold classification over the data under *dataroot*.

    Reads and preprocesses the dataset, trains/evaluates one model per fold
    via classify_fold(), and writes per-fold and fold-averaged confusion
    matrices plus evaluation/recall CSVs under a config-derived
    "fingerprint" directory.

    Args:
        dataroot: root directory for input data and all outputs.
        classifier_name: 'cnn', 'softmax', or another name understood by
            classify_fold() (lr/reg stay None for other classifiers).
        file_ending: file suffix/pattern forwarded to read_data().
    """
    global K
    K = 5
    data = read_data(dataroot, file_ending)
    print("data is read ", data.shape)
    X, ID, Y = correct_data(data, K)
    print("data is corrected")
    labels, labels_d = get_labels(Y)
    X = normalize_data(X)
    print("data is normalized")
    Y = encode_label(Y, labels_d)
    input_dim = X.shape[1]
    num_class = len(np.unique(Y))
    confusion_matrix_sum = np.zeros((num_class, num_class), dtype=int)

    # Per-classifier hyper-parameters; non-NN classifiers ignore lr/reg.
    if classifier_name == 'cnn':
        lr = 1e-3
        reg = 1e-5
    elif classifier_name == 'softmax':
        lr = 1e-1
        reg = 1e-6
    else:
        lr = None
        reg = None
    batch_size = 5120
    device = 'cuda:0'

    if use_class_weight_to_balance:
        pre_fingerprint = os_join(dataroot, '{}_k_{}_w'.format(classifier_name, str(K)))
    else:
        pre_fingerprint = os_join(dataroot, '{}_k_{}'.format(classifier_name, str(K)))

    optim = 'Adam'
    # 10 epochs of whole data; the train split is 80% of the whole data.
    num_iters = int(Y.shape[0] * 10 * .8 // batch_size)
    classifier_args = (classifier_name, optim, lr, reg, batch_size, input_dim,
                       num_class, num_iters, device)
    config = '_optim_{}_lr_{}_reg_{}_bs_{}'.format(optim, lr, reg, batch_size)
    fingerprint = pre_fingerprint + config
    logdir = os_join(fingerprint, 'log')
    ensure_dir(fingerprint)
    ensure_dir(logdir)

    kfold_pred_time = 0  # accumulated classify_fold durations across folds
    # BUG FIX: modern scikit-learn raises ValueError for random_state without
    # shuffle=True (and without shuffling the seed had no effect at all).
    skf = StratifiedKFold(n_splits=K, shuffle=True, random_state=SEED)
    for fold_index, (train_index, test_index) in enumerate(skf.split(X, Y)):
        X_train = X[train_index]
        y_train = Y[train_index]
        X_test = X[test_index]
        # NOTE: the original also assigned test_id = ID[test_index], but it
        # was never used; dropped.
        y_test = Y[test_index]
        runs_dir = os_join(logdir, 'fold_{}'.format(fold_index))
        pred, duration = classify_fold(classifier_name, X_train, y_train, X_test,
                                       fold_index, classifier_args, runs_dir)
        acc = metrics.balanced_accuracy_score(y_test, pred)
        print("Balanced accuracy including benign: {}".format(acc))
        kfold_pred_time += duration
        assert pred.shape == y_test.shape, \
            "y_true={} and pred.shape={} should be same ".format(y_test.shape, pred.shape)
        # BUG FIX: the non-normalized per-fold plot was mistitled
        # "with normalization".
        plot_confusion_matrix(os_join(fingerprint, 'cm_fold_{}.jpg'.format(fold_index)),
                              y_test, pred, classes=labels, normalize=False,
                              title='Confusion matrix, without normalization')
        plot_confusion_matrix(os_join(fingerprint, 'cm_norm_fold_{}.jpg'.format(fold_index)),
                              y_test, pred, classes=labels, normalize=True,
                              title='Confusion matrix, with normalization')
        cm_i = confusion_matrix(y_test, pred)
        confusion_matrix_sum += cm_i

    # Average the summed confusion matrix over the K folds.
    # BUG FIX: np.float was removed in NumPy 1.24; the builtin float is the
    # documented replacement.
    cm = np.array(confusion_matrix_sum / K).astype(float)
    print(dataroot, classifier_name)
    plot_confusion_matrix(os_join(fingerprint, 'cm_nonnorm_fold_avg.jpg'), [], [],
                          cm=cm, classes=labels,
                          title='Confusion matrix, without normalization')
    plot_confusion_matrix(os_join(fingerprint, 'cm_norm_fold_avg.jpg'), [], [],
                          cm=cm, classes=labels, normalize=True,
                          title='Confusion matrix, with normalization')
    print_evaluation(cm, labels, os_join(fingerprint, 'evaluation.csv'))
    print_absolute_recall(cm, labels, os_join(fingerprint, 'absolute_recall.csv'))
def classify(dataroot, classifier_name):
    """K-fold evaluation of *classifier_name* on pre-materialized fold CSVs.

    Loads K fold files from folds_fraction_<fraction>/, trains on K-1 folds
    and evaluates flow-level predictions on the held-out fold under two
    aggregation rules ("any" and "majority"), writing confusion matrices,
    recall and evaluation CSVs under a config-derived fingerprint directory.

    Args:
        dataroot: root directory containing the fold CSVs; outputs go under
            <dataroot>/classifiers/kfold/.
        classifier_name: 'cnn', 'softmax', 'forest', or another name
            understood by train_fold()/predict_fold().
    """
    global K
    K = 5
    fraction = 0.001  # sub-sampling fraction of the dataset (1 = full data)
    # Records contained in the folds (whole dataset has 6907723).
    total_records = 6907705

    # Load the K pre-split folds and collect their label columns.
    folds_df = []
    ds_list = []
    fold_root = join(dataroot, 'folds_fraction_{}'.format(fraction))
    for fold_index in range(K):
        df = pd.read_csv(join(fold_root, 'fold_{}.csv'.format(fold_index)))
        folds_df.append(df)
        ds_list.append(df.Label)
    total_label_df = pd.concat(ds_list, sort=False)
    labels, labels_d = get_labels(total_label_df.unique())
    class_weight = get_class_weights(encode_label(total_label_df.values, labels_d))

    # Class-balancing strategy; alternatives tried: 'sample_per_batch',
    # 'with_loss'.
    balance = 'explicit'
    input_dim = folds_df[0].shape[1] - 2  # because we remove Label and FlowID columns from X
    labels, labels_d = get_labels(folds_df[0].Label.unique())
    num_class = len(labels)

    if classifier_name in ['cnn', 'softmax']:
        batch_size = 256
        # NOTE(review): the original comment said "10 epochs for total
        # dataset" but the factor is 0.1 — confirm the intended iteration
        # count.
        num_iters = 0.1 * (total_records * .8 * .9) // batch_size
        optim = 'Adam'
        if classifier_name == 'cnn':
            lr = 1e-3
            reg = 0
            device = [0, 1]
        elif classifier_name == 'softmax':
            lr = 1e-3
            reg = 0
            device = 'cuda:0'
        classifier_args = {
            'classifier_name': classifier_name, 'optim': optim, 'lr': lr,
            'reg': reg, 'batch_size': batch_size, 'input_dim': input_dim,
            'num_class': num_class, 'num_iters': num_iters, 'device': device,
            'balance': balance, 'class_weight': class_weight}
        config = '_optim_{}_lr_{}_reg_{}_bs_{}_b_{}'.format(optim, lr, reg, batch_size, balance)
    else:
        lr = None
        reg = None
        batch_size = None
        device = None
        classifier_args = {'classifier_name': classifier_name, 'balance': balance}
        config = '_b_{}'.format(balance)

    pre_fingerprint = join(dataroot, 'classifiers', 'kfold',
                           'r_{}_c_{}_k_{}'.format(fraction, classifier_name, str(K)))
    fingerprint = pre_fingerprint + config
    print("Running experiment \n ", fingerprint)
    logdir = join(fingerprint, 'log')
    cmdir = join(fingerprint, 'cm')
    recalldir = join(fingerprint, 'recall')
    evaldir = join(fingerprint, 'eval')
    ensure_dirs(fingerprint, logdir, cmdir, recalldir, evaldir)

    confusion_matrix_sum = np.zeros((num_class, num_class), dtype=float)
    majority_confusion_matrix_sum = np.zeros((num_class, num_class), dtype=float)
    kfold_pred_time = 0
    # BUG FIX: np.float was removed in NumPy 1.24; use the builtin float.
    kfold_feature_importance = np.zeros(input_dim, dtype=float)
    # BUG FIX: removed a dead `StratifiedKFold(...)` instance — the folds
    # are pre-materialized on disk and the splitter object was never used.
    for fold_index in range(K):
        print("Fold ", fold_index)
        test_df = folds_df[fold_index]
        train_df = pd.concat([folds_df[i] for i in range(K) if i != fold_index], sort=False)
        X_train, y_train = df_to_array(train_df)
        y_train = encode_label(y_train, labels_d)
        runs_dir = join(logdir, 'fold_{}'.format(fold_index))
        clf, duration = train_fold(X_train, y_train, fold_index, classifier_args, runs_dir)
        if classifier_name == 'forest':
            kfold_feature_importance += clf.feature_importances_
        # Flow-level ground truth for the held-out fold.
        flowids_test, y_flowid_test = group_data(test_df)
        y_flowid_test = encode_label(y_flowid_test, labels_d)
        pred_any, pred_majority = predict_fold(classifier_name, clf, test_df,
                                               flowids_test, y_flowid_test)
        assert pred_any.shape == pred_majority.shape, \
            "any and majority shapes should be same {},{}".format(pred_any.shape, pred_majority.shape)
        acc_pred_any = metrics.balanced_accuracy_score(y_flowid_test, pred_any)
        acc_pred_majority = metrics.balanced_accuracy_score(y_flowid_test, pred_majority)
        print("Fold Balanced accuracy(any,majority): ({:.2f},{:.2f})".format(acc_pred_any, acc_pred_majority))
        # NOTE(review): `duration` comes from train_fold — confirm whether
        # it actually measures prediction time as the accumulator name says.
        kfold_pred_time += duration

        # "any" aggregation: per-fold confusion matrices and reports.
        # BUG FIX: the non-normalized plots were mistitled
        # "with normalization".
        plot_confusion_matrix(join(cmdir, 'any_fold_{}.jpg'.format(fold_index)),
                              y_flowid_test, pred_any, classes=labels, normalize=False,
                              title='Confusion matrix, without normalization')
        plot_confusion_matrix(join(cmdir, 'any_norm_fold_{}.jpg'.format(fold_index)),
                              y_flowid_test, pred_any, classes=labels, normalize=True,
                              title='Confusion matrix, with normalization')
        cm_i = confusion_matrix(y_flowid_test, pred_any)
        print_absolute_recall(cm_i, labels,
                              join(recalldir, 'any_fold_{}.csv'.format(fold_index)), fold_root)
        print_evaluation(cm_i, labels, join(evaldir, 'any_fold_{}.csv'.format(fold_index)))
        confusion_matrix_sum += cm_i

        # "majority" aggregation: same per-fold reports.
        plot_confusion_matrix(join(cmdir, 'majority_fold_{}.jpg'.format(fold_index)),
                              y_flowid_test, pred_majority, classes=labels, normalize=False,
                              title='Confusion matrix, without normalization')
        plot_confusion_matrix(join(cmdir, 'majority_norm_fold_{}.jpg'.format(fold_index)),
                              y_flowid_test, pred_majority, classes=labels, normalize=True,
                              title='Confusion matrix, with normalization')
        majority_cm_i = confusion_matrix(y_flowid_test, pred_majority)
        print_absolute_recall(majority_cm_i, labels,
                              join(recalldir, 'majority_fold_{}.csv'.format(fold_index)), fold_root)
        print_evaluation(majority_cm_i, labels,
                         join(evaldir, 'majority_fold_{}.csv'.format(fold_index)))
        majority_confusion_matrix_sum += majority_cm_i

    if classifier_name == 'forest':
        print_feature_importance(kfold_feature_importance,
                                 join(dataroot, 'folds_fraction_{}'.format(fraction),
                                      'feature_selection.csv'))

    # K-fold aggregate reports over the summed confusion matrices.
    cm = confusion_matrix_sum
    cm_majority = majority_confusion_matrix_sum
    print(dataroot, classifier_name)
    plot_confusion_matrix(join(cmdir, 'avg_any_fold.jpg'), [], [], cm=cm,
                          classes=labels, title='Confusion matrix, without normalization')
    plot_confusion_matrix(join(cmdir, 'avg_any_norm_fold.jpg'), [], [], cm=cm,
                          classes=labels, normalize=True,
                          title='Confusion matrix, with normalization')
    plot_confusion_matrix(join(cmdir, 'avg_majority_fold.jpg'), [], [], cm=cm_majority,
                          classes=labels, title='Confusion matrix, without normalization')
    plot_confusion_matrix(join(cmdir, 'avg_majority_norm_fold.jpg'), [], [], cm=cm_majority,
                          classes=labels, normalize=True,
                          title='Confusion matrix, with normalization')
    print_evaluation(cm, labels, join(fingerprint, 'evaluation_any.csv'))
    print_evaluation(cm_majority, labels, join(fingerprint, 'evaluation_majority.csv'))
    print_absolute_recall(cm, labels, join(fingerprint, 'recall_any.csv'), fold_root, kfold=True)
    print_absolute_recall(cm_majority, labels, join(fingerprint, 'recall_majority.csv'),
                          fold_root, kfold=True)
if __name__ == '__main__':
    # Entry point: evaluate one (or all) collaborative-filtering methods on
    # randomly generated test cases drawn from the Jester dataset.
    if len(sys.argv) != 3:
        # BUG FIX: Python 2 `print` statements are syntax errors under
        # Python 3; converted to print() calls.
        print('Expected input format: python EvaluateCFList.py <method> <testList>')
    else:
        filename = 'data/jester-data-1.csv'
        items = {}
        users = {}
        matrix = []
        size = int(sys.argv[2])
        matrix, users, items = parse(filename)
        testData = gen_tests(users, size)
        f = Filter(matrix, users, items)
        method = sys.argv[1]
        print("Starting predictions")
        if method == 'all':
            # Run every supported prediction method on the same test data.
            w_results = f.execute('weighted_sum', testData)
            a_w_results = f.execute('adj_weighted_sum', testData)
            c_w_results = f.execute('cosine_weighted_sum', testData)
            c_a_w_results = f.execute('cosine_adj_weighted_sum', testData)
            print_evaluation(f, "Weighted Sum", w_results)
            print_evaluation(f, "Adjusted Weighted Sum", a_w_results)
            print_evaluation(f, "Cosine Weighted Sum", c_w_results)
            print_evaluation(f, "Cosine Adjusted Weighted Sum", c_a_w_results)
        else:
            results = f.execute(method, testData)
            print_evaluation(f, method, results)
def run(self):
    """Build, train, validate and test the character-tagging LSTM graph.

    Constructs either a two-layer LSTM (self.type == "multi") or a
    bidirectional LSTM (self.type == "bimul") over one-hot encoded input
    characters, trains it with Adam, reports validation accuracy every 25
    epochs, then evaluates once on a held-out test file and prints two
    sample target/prediction sentence pairs.

    Returns:
        The utils.Evaluation object accumulated over the validation
        checkpoints.
    """
    # Split the raw input into training and validation sequence batches.
    training_dataset, valid_dataset = data.make_sequences(
        self.input, self.char2vec, self.output_char2vec, self.seq_length)
    input_batch = training_dataset.input_batch
    target_batch = training_dataset.target_batch
    seq_lens = training_dataset.seq_lens
    hidden_size = self.modelconfig.hidden_size

    # Graph inputs (TF1 placeholders, fed per sess.run call).
    X = tf.placeholder(tf.int32, [None, self.seq_length])  # X data
    X_onehot = tf.one_hot(
        X, self.modelconfig.input_size)  # one hot: 1 -> 0 1 0 0 0 0 0 0 0 0
    Y = tf.placeholder(tf.int32, [None, self.seq_length])  # Y label
    Sequences = tf.placeholder(tf.int32, [None])  # true length of each sequence
    keep_prob = tf.placeholder(tf.float32)  # dropout keep probability

    if self.type == "multi":
        # Two stacked LSTM cells, each wrapped with output dropout.
        print('\n****** MultiLayer LSTM Initialize ******')
        with tf.variable_scope('cell_def'):
            cell1 = tf.nn.rnn_cell.BasicLSTMCell(num_units=hidden_size,
                                                 state_is_tuple=True)
            cell1 = tf.nn.rnn_cell.DropoutWrapper(
                cell1, output_keep_prob=keep_prob)
            cell2 = tf.nn.rnn_cell.BasicLSTMCell(num_units=hidden_size,
                                                 state_is_tuple=True)
            cell2 = tf.nn.rnn_cell.DropoutWrapper(
                cell2, output_keep_prob=keep_prob)
            multi_cell = tf.nn.rnn_cell.MultiRNNCell([cell1, cell2])
        with tf.variable_scope('rnn_def'):
            outputs, _states = tf.nn.dynamic_rnn(multi_cell,
                                                 X_onehot,
                                                 dtype=tf.float32,
                                                 sequence_length=Sequences)
    elif self.type == "bimul":
        # Forward + backward LSTM; their outputs are concatenated on the
        # feature axis, so the dense layer below sees 2*hidden_size features.
        print('\n****** Bidirectional LSTM Initialize ******')
        with tf.variable_scope('cell_def'):
            forward = tf.nn.rnn_cell.BasicLSTMCell(num_units=hidden_size,
                                                   state_is_tuple=True)
            forward = tf.nn.rnn_cell.DropoutWrapper(
                forward, output_keep_prob=keep_prob)
            backward = tf.nn.rnn_cell.BasicLSTMCell(num_units=hidden_size,
                                                    state_is_tuple=True)
            backward = tf.nn.rnn_cell.DropoutWrapper(
                backward, output_keep_prob=keep_prob)
        with tf.variable_scope('rnn_def'):
            outputs, states = tf.nn.bidirectional_dynamic_rnn(
                forward, backward, inputs=X_onehot, dtype=tf.float32,
                sequence_length=Sequences)
            outputs = tf.concat(values=outputs, axis=2)

    # Per-timestep logits over the output alphabet, and the training loss.
    # NOTE(review): if self.type is neither "multi" nor "bimul", `outputs`
    # is unbound here and this line raises — confirm callers restrict type.
    model = tf.layers.dense(outputs, self.modelconfig.output_size,
                            activation=None)
    cost = tf.reduce_mean(
        tf.nn.sparse_softmax_cross_entropy_with_logits(logits=model, labels=Y))
    prediction = tf.argmax(model, axis=2)
    optimizer = tf.train.AdamOptimizer(learning_rate=0.005).minimize(cost)
    # optimizer = tf.train.AdamOptimizer(learning_rate=0.0001).minimize(cost)

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    # NOTE(review): `eval` shadows the Python builtin; consider renaming.
    # The 25 matches the validation interval used below.
    eval = utils.Evaluation(self.type, self.modelconfig.epoch, 25)

    print('\n------------ Training ------------ ')
    last_time = time.time()
    for epoch in range(self.modelconfig.epoch):
        # One full-batch training step with dropout enabled.
        _, loss = sess.run(
            [optimizer, cost],
            feed_dict={
                X: input_batch,
                Y: target_batch,
                Sequences: seq_lens,
                keep_prob: 0.8
            })
        if epoch % 25 == 24:
            # Validation checkpoint every 25 epochs (dropout disabled).
            result = sess.run(prediction,
                              feed_dict={
                                  X: valid_dataset.input_batch,
                                  Y: valid_dataset.target_batch,
                                  Sequences: valid_dataset.seq_lens,
                                  keep_prob: 1
                              })
            # NOTE(review): this builds new graph ops (result is a numpy
            # array embedded as a constant) on every checkpoint, so the
            # graph grows over training — hoisting the accuracy op out of
            # the loop would avoid that.
            accuracy = tf.reduce_mean(
                tf.cast(tf.equal(result, tf.cast(Y, tf.int64)), tf.float32))
            accuracy_ret = sess.run(
                accuracy, feed_dict={Y: valid_dataset.target_batch})
            speed = time.time() - last_time
            print('Epoch:', '%04d ' % (epoch + 1), 'accuracy =',
                  '{:.6f} '.format(accuracy_ret), 'cost =',
                  '{:.6f}'.format(loss), 'speed =', '{:.2f}'.format(speed),
                  'sec')
            last_time = time.time()
            avg_p, avg_r, avg_f = utils.print_evaluation(
                valid_dataset.target_batch, result,
                self.output_char2vec.char_dict)
            eval.set(epoch, accuracy_ret, loss, speed, avg_p, avg_r, avg_f)
            print('')

    print('\n------------ Testing ------------ ')
    # Evaluate once on the held-out test file (batch size 30 sentences).
    test_sentences = data.read_data("data/test/BHXX0035.txt", 30)
    test_dataset, _ = data.make_sequences(test_sentences, self.char2vec,
                                          self.output_char2vec,
                                          self.seq_length,
                                          make_valid=False)
    result = sess.run(prediction,
                      feed_dict={
                          X: test_dataset.input_batch,
                          Y: test_dataset.target_batch,
                          Sequences: test_dataset.seq_lens,
                          keep_prob: 1
                      })
    accuracy = tf.reduce_mean(
        tf.cast(tf.equal(result, tf.cast(Y, tf.int64)), tf.float32))
    accuracy_ret = sess.run(accuracy,
                            feed_dict={Y: test_dataset.target_batch})
    print('Accuracy =', '{:.6f}'.format(accuracy_ret))
    # Print the first two decoded target/prediction sentence pairs.
    for index, predict_sequence in enumerate(result):
        target_output, prediction_output = data.compare_sentence(
            self.output_char2vec, test_dataset.target_batch[index],
            test_dataset.input_source[index], predict_sequence)
        if index < 2:
            print("target sentence: ", target_output[1])
            print("prediction sentence:", prediction_output[1])
    return eval