예제 #1
0
if __name__ == '__main__':
    # Entry point: build a collaborative-filtering recommender from the
    # Jester ratings dump and evaluate it against one or more test sets.
    filename = 'data/jester-data-1.csv'
    items = {}
    users = {}
    matrix = []

    # parse() returns the ratings matrix plus user/item lookup tables;
    # the empty containers above are immediately overwritten.
    matrix, users, items = parse(filename)
    f = Filter(matrix, users, items)

    if len(sys.argv) == 3:
        # Usage: <script> <method> <testFile> — run one method on one file.
        method = sys.argv[1]
        testFile = sys.argv[2]
        # NOTE(review): file handle from open() is never closed.
        testData = csv.reader(open(testFile, "r"))
        results = f.execute(method, testData)
        print_evaluation(f, method, results)
    elif len(sys.argv) == 2 and sys.argv[1] == 'all':
        # Usage: <script> all — run adj_weighted_sum over the four bundled
        # test sets.
        method = "adj_weighted_sum"
        testData1 = csv.reader(open("data/TestSet01.csv", "r"))
        testData2 = csv.reader(open("data/TestSet02.csv", "r"))
        testData3 = csv.reader(open("data/TestSet03.csv", "r"))
        testData4 = csv.reader(open("data/TestSet04.csv", "r"))

        results1 = f.execute(method, testData1)
        results2 = f.execute(method, testData2)
        results3 = f.execute(method, testData3)
        results4 = f.execute(method, testData4)
        
        # NOTE(review): results4 is computed but no TestSet04 evaluation is
        # printed — this excerpt appears truncated.
        print_evaluation(f, "TestSet01", results1)
        print_evaluation(f, "TestSet02", results2)
        print_evaluation(f, "TestSet03", results3)
예제 #2
0
                              X: valid_dataset.input_batch,
                              Y: valid_dataset.target_batch,
                              Sequences: valid_dataset.seq_lens,
                              keep_prob: 1
                          })
        accuracy = tf.reduce_mean(
            tf.cast(tf.equal(result, tf.cast(Y, tf.int64)), tf.float32))
        accuracy_ret = sess.run(accuracy,
                                feed_dict={Y: valid_dataset.target_batch})
        speed = time.time() - last_time
        print('Epoch:', '%04d  ' % (epoch + 1), 'accuracy =',
              '{:.6f}  '.format(accuracy_ret), 'cost =', '{:.6f}'.format(loss),
              'speed =', '{:.2f}'.format(speed), 'sec')
        last_time = time.time()

        avg_p, avg_r, avg_f = utils.print_evaluation(
            valid_dataset.target_batch, result, output_char2vec.char_dict)
        eval.set(epoch, accuracy_ret, loss, speed, avg_p, avg_r, avg_f)
        print('')

print('\n------------ Testing ------------ ')
test_sentences = data.read_data("data/test/BHXX0035.txt", 30)
test_dataset, _ = data.make_sequences(test_sentences,
                                      char2vec,
                                      output_char2vec,
                                      seq_length,
                                      make_valid=False)

result = sess.run(prediction,
                  feed_dict={
                      X: test_dataset.input_batch,
                      Y: test_dataset.target_batch,
def process(dataroot, classifier_name, file_ending):
    """Run stratified K-fold classification over data found in *dataroot*.

    Reads and normalizes the data, encodes labels, trains/evaluates
    ``classifier_name`` per fold via ``classify_fold``, and writes
    per-fold and averaged confusion-matrix plots plus evaluation/recall
    CSVs under a fingerprint directory derived from the hyper-parameters.

    Args:
        dataroot: root directory holding the input data and all outputs.
        classifier_name: 'cnn', 'softmax', or another classifier name
            (non-NN classifiers take no lr/reg hyper-parameters).
        file_ending: filename suffix passed through to ``read_data``.
    """
    global K
    K = 5
    data = read_data(dataroot, file_ending)
    print("data is read ", data.shape)
    X, ID, Y = correct_data(data, K)
    print("data is corrected")
    labels, labels_d = get_labels(Y)

    X = normalize_data(X)
    print("data is normalized")
    Y = encode_label(Y, labels_d)

    input_dim = X.shape[1]
    num_class = len(np.unique(Y))

    confusion_matrix_sum = np.zeros((num_class, num_class), dtype=int)
    # Per-classifier optimizer hyper-parameters; non-NN classifiers
    # take neither.
    if classifier_name == 'cnn':
        lr = 1e-3
        reg = 1e-5
    elif classifier_name == 'softmax':
        lr = 1e-1
        reg = 1e-6
    else:
        lr = None
        reg = None

    batch_size = 5120
    device = 'cuda:0'
    # '_w' suffix marks runs that balance classes via class weights.
    if use_class_weight_to_balance:
        pre_fingerprint = os_join(dataroot,
                                  '{}_k_{}_w'.format(classifier_name, str(K)))
    else:
        pre_fingerprint = os_join(dataroot,
                                  '{}_k_{}'.format(classifier_name, str(K)))

    optim = 'Adam'
    # 10 epochs over the training split (train data is 80% of the whole).
    num_iters = int(Y.shape[0] * 10 * .8 // batch_size)
    classifier_args = (classifier_name, optim, lr, reg, batch_size, input_dim,
                       num_class, num_iters, device)
    config = '_optim_{}_lr_{}_reg_{}_bs_{}'.format(optim, lr, reg, batch_size)
    fingerprint = pre_fingerprint + config
    logdir = os_join(fingerprint, 'log')
    ensure_dir(fingerprint)
    ensure_dir(logdir)

    kfold_pred_time = 0
    # shuffle=True is required for random_state to have any effect; newer
    # scikit-learn raises a ValueError when random_state is set with
    # shuffle=False (the original silently ignored SEED).
    skf = StratifiedKFold(n_splits=K, shuffle=True, random_state=SEED)
    for fold_index, (train_index, test_index) in enumerate(skf.split(X, Y)):
        X_train = X[train_index]
        y_train = Y[train_index]
        X_test = X[test_index]
        test_id = ID[test_index]
        y_test = Y[test_index]
        runs_dir = os_join(logdir, 'fold_{}'.format(fold_index))
        pred, duration = classify_fold(classifier_name, X_train, y_train,
                                       X_test, fold_index, classifier_args,
                                       runs_dir)
        acc = metrics.balanced_accuracy_score(y_test, pred)
        print("Balanced accuracy including benign: {}".format(acc))
        kfold_pred_time += duration
        assert pred.shape == y_test.shape, "y_true={} and pred.shape={} should be same ".format(
            y_test.shape, pred.shape)
        # NOTE(review): normalize=False but the title says "with
        # normalization" — looks like a copy-paste slip; kept to avoid
        # changing rendered output.
        plot_confusion_matrix(os_join(fingerprint,
                                      'cm_fold_{}.jpg'.format(fold_index)),
                              y_test,
                              pred,
                              classes=labels,
                              normalize=False,
                              title='Confusion matrix, with normalization')
        plot_confusion_matrix(os_join(
            fingerprint, 'cm_norm_fold_{}.jpg'.format(fold_index)),
                              y_test,
                              pred,
                              classes=labels,
                              normalize=True,
                              title='Confusion matrix, with normalization')
        cm_i = confusion_matrix(y_test, pred)
        confusion_matrix_sum += cm_i
    # Average the summed confusion matrix over folds.  np.float was removed
    # from NumPy (>=1.24); the builtin float is the documented replacement.
    cm = (confusion_matrix_sum / K).astype(float)
    print(dataroot, classifier_name)
    plot_confusion_matrix(os_join(fingerprint, 'cm_nonnorm_fold_avg.jpg'), [],
                          [],
                          cm=cm,
                          classes=labels,
                          title='Confusion matrix, without normalization')
    plot_confusion_matrix(os_join(fingerprint, 'cm_norm_fold_avg.jpg'), [], [],
                          cm=cm,
                          classes=labels,
                          normalize=True,
                          title='Confusion matrix, with normalization')

    print_evaluation(cm, labels, os_join(fingerprint, 'evaluation.csv'))
    print_absolute_recall(cm, labels,
                          os_join(fingerprint, 'absolute_recall.csv'))
def classify(dataroot, classifier_name):
    """K-fold cross-validation over pre-materialized fold CSV files.

    For each of K folds (stored as ``fold_<i>.csv`` under the fold root)
    the remaining K-1 folds are used for training; per-flow predictions
    are scored with both "any" and "majority" voting.  Confusion-matrix
    plots, recall tables and evaluation CSVs are written under a
    fingerprint directory derived from the sampling fraction, classifier
    name and hyper-parameters.

    Args:
        dataroot: root directory holding the fold CSVs and all outputs.
        classifier_name: 'cnn', 'softmax', 'forest', or another name;
            non-NN classifiers take no optimizer hyper-parameters.
    """
    global K
    K = 5
    fraction = 0.001  # fraction of the full dataset materialized into folds

    total_records = 6907705  # records across the folds (full set: 6907723)
    folds_df = []
    fold_root = join(dataroot, 'folds_fraction_{}'.format(fraction))

    # Load every fold once and collect labels across all folds so the
    # class weights see the complete label set.
    ds_list = []
    for fold_index in range(K):
        df = pd.read_csv(join(fold_root, 'fold_{}.csv'.format(fold_index)))
        folds_df.append(df)
        ds_list.append(df.Label)
    total_label_df = pd.concat(ds_list, sort=False)
    labels, labels_d = get_labels(total_label_df.unique())
    class_weight = get_class_weights(
        encode_label(total_label_df.values, labels_d))

    # Class-imbalance handling mode; alternatives seen in this codebase:
    # 'sample_per_batch', 'with_loss'.
    balance = 'explicit'

    # Label and FlowID columns are removed from X, hence the -2.
    input_dim = folds_df[0].shape[1] - 2
    # NOTE(review): this overwrites the all-fold label mapping computed
    # above with one derived from fold 0 only; if fold 0 lacks a class or
    # orders labels differently, encode_label below disagrees with
    # class_weight — confirm against get_labels' semantics.
    labels, labels_d = get_labels(folds_df[0].Label.unique())

    num_class = len(labels)

    if classifier_name in ['cnn', 'softmax']:
        batch_size = 256
        # 10 epochs for total dataset (train split is .8 * .9 of records).
        num_iters = 0.1 * (total_records * .8 * .9) // batch_size
        optim = 'Adam'
        if classifier_name == 'cnn':
            lr = 1e-3
            reg = 0
            device = [0, 1]  # multi-GPU device ids
        elif classifier_name == 'softmax':
            lr = 1e-3
            reg = 0
            device = 'cuda:0'
        classifier_args = {'classifier_name': classifier_name,
                           'optim': optim,
                           'lr': lr,
                           'reg': reg,
                           'batch_size': batch_size,
                           'input_dim': input_dim,
                           'num_class': num_class,
                           'num_iters': num_iters,
                           'device': device,
                           'balance': balance,
                           'class_weight': class_weight}
        config = '_optim_{}_lr_{}_reg_{}_bs_{}_b_{}'.format(
            optim, lr, reg, batch_size, balance)
    else:
        lr = None
        reg = None
        batch_size = None
        device = None
        classifier_args = {'classifier_name': classifier_name,
                           'balance': balance}
        config = '_b_{}'.format(balance)

    pre_fingerprint = join(
        dataroot, 'classifiers', 'kfold',
        'r_{}_c_{}_k_{}'.format(fraction, classifier_name, str(K)))

    fingerprint = pre_fingerprint + config
    print("Running experiment \n ", fingerprint)
    logdir = join(fingerprint, 'log')
    cmdir = join(fingerprint, 'cm')
    recalldir = join(fingerprint, 'recall')
    evaldir = join(fingerprint, 'eval')

    ensure_dirs(fingerprint, logdir, cmdir, recalldir, evaldir)

    confusion_matrix_sum = np.zeros((num_class, num_class), dtype=float)
    majority_confusion_matrix_sum = np.zeros((num_class, num_class),
                                             dtype=float)
    kfold_pred_time = 0
    # np.float was removed from NumPy (>=1.24); builtin float is equivalent.
    kfold_feature_importance = np.zeros(input_dim, dtype=float)
    # Folds are pre-materialized on disk, so we iterate fold indices
    # directly (an unused StratifiedKFold instance was removed here).
    for fold_index in range(K):
        print("Fold ", fold_index)
        test_df = folds_df[fold_index]
        train_df = pd.concat(
            [folds_df[i] for i in range(K) if i != fold_index], sort=False)

        X_train, y_train = df_to_array(train_df)
        y_train = encode_label(y_train, labels_d)

        runs_dir = join(logdir, 'fold_{}'.format(fold_index))
        clf, duration = train_fold(X_train, y_train, fold_index,
                                   classifier_args, runs_dir)
        if classifier_name == 'forest':
            kfold_feature_importance += clf.feature_importances_

        # Group flattened records back into flows before prediction.
        flowids_test, y_flowid_test = group_data(test_df)
        y_flowid_test = encode_label(y_flowid_test, labels_d)
        pred_any, pred_majority = predict_fold(classifier_name, clf, test_df,
                                               flowids_test, y_flowid_test)

        assert pred_any.shape == pred_majority.shape, \
            "any and majority shapes should be same {},{}".format(
                pred_any.shape, pred_majority.shape)
        acc_pred_any = metrics.balanced_accuracy_score(y_flowid_test,
                                                       pred_any)
        acc_pred_majority = metrics.balanced_accuracy_score(y_flowid_test,
                                                            pred_majority)
        print("Fold Balanced accuracy(any,majority): ({:.2f},{:.2f})".format(
            acc_pred_any, acc_pred_majority))
        kfold_pred_time += duration

        # "any" voting: a flow is flagged if any of its records is flagged.
        plot_confusion_matrix(
            join(cmdir, 'any_fold_{}.jpg'.format(fold_index)),
            y_flowid_test, pred_any, classes=labels, normalize=False,
            title='Confusion matrix, with normalization')
        plot_confusion_matrix(
            join(cmdir, 'any_norm_fold_{}.jpg'.format(fold_index)),
            y_flowid_test, pred_any, classes=labels, normalize=True,
            title='Confusion matrix, with normalization')
        cm_i = confusion_matrix(y_flowid_test, pred_any)
        print_absolute_recall(
            cm_i, labels,
            join(recalldir, 'any_fold_{}.csv'.format(fold_index)), fold_root)
        print_evaluation(
            cm_i, labels,
            join(evaldir, 'any_fold_{}.csv'.format(fold_index)))
        confusion_matrix_sum += cm_i

        # "majority" voting: a flow takes the majority label of its records.
        plot_confusion_matrix(
            join(cmdir, 'majority_fold_{}.jpg'.format(fold_index)),
            y_flowid_test, pred_majority, classes=labels, normalize=False,
            title='Confusion matrix, with normalization')
        plot_confusion_matrix(
            join(cmdir, 'majority_norm_fold_{}.jpg'.format(fold_index)),
            y_flowid_test, pred_majority, classes=labels, normalize=True,
            title='Confusion matrix, with normalization')
        majority_cm_i = confusion_matrix(y_flowid_test, pred_majority)
        print_absolute_recall(
            majority_cm_i, labels,
            join(recalldir, 'majority_fold_{}.csv'.format(fold_index)),
            fold_root)
        print_evaluation(
            majority_cm_i, labels,
            join(evaldir, 'majority_fold_{}.csv'.format(fold_index)))
        majority_confusion_matrix_sum += majority_cm_i

    if classifier_name == 'forest':
        print_feature_importance(
            kfold_feature_importance,
            join(dataroot, 'folds_fraction_{}'.format(fraction),
                 'feature_selection.csv'))

    cm = confusion_matrix_sum
    cm_majority = majority_confusion_matrix_sum
    print(dataroot, classifier_name)

    plot_confusion_matrix(join(cmdir, 'avg_any_fold.jpg'), [], [], cm=cm,
                          classes=labels,
                          title='Confusion matrix, without normalization')
    plot_confusion_matrix(join(cmdir, 'avg_any_norm_fold.jpg'), [], [],
                          cm=cm, classes=labels, normalize=True,
                          title='Confusion matrix, with normalization')

    plot_confusion_matrix(join(cmdir, 'avg_majority_fold.jpg'), [], [],
                          cm=cm_majority, classes=labels,
                          title='Confusion matrix, without normalization')
    plot_confusion_matrix(join(cmdir, 'avg_majority_norm_fold.jpg'), [], [],
                          cm=cm_majority, classes=labels, normalize=True,
                          title='Confusion matrix, with normalization')

    print_evaluation(cm, labels, join(fingerprint, 'evaluation_any.csv'))
    print_evaluation(cm_majority, labels,
                     join(fingerprint, 'evaluation_majority.csv'))

    print_absolute_recall(cm, labels, join(fingerprint, 'recall_any.csv'),
                          fold_root, kfold=True)
    print_absolute_recall(cm_majority, labels,
                          join(fingerprint, 'recall_majority.csv'),
                          fold_root, kfold=True)
예제 #5
0
if __name__ == '__main__':
    # Entry point: build a collaborative-filtering recommender from the
    # Jester ratings dump and evaluate it against one or more test sets.
    filename = 'data/jester-data-1.csv'
    items = {}
    users = {}
    matrix = []

    # parse() returns the ratings matrix plus user/item lookup tables;
    # the empty containers above are immediately overwritten.
    matrix, users, items = parse(filename)
    f = Filter(matrix, users, items)

    if len(sys.argv) == 3:
        # Usage: <script> <method> <testFile> — run one method on one file.
        method = sys.argv[1]
        testFile = sys.argv[2]
        # NOTE(review): file handle from open() is never closed.
        testData = csv.reader(open(testFile, "r"))
        results = f.execute(method, testData)
        print_evaluation(f, method, results)
    elif len(sys.argv) == 2 and sys.argv[1] == 'all':
        # Usage: <script> all — run adj_weighted_sum over the four bundled
        # test sets.
        method = "adj_weighted_sum"
        testData1 = csv.reader(open("data/TestSet01.csv", "r"))
        testData2 = csv.reader(open("data/TestSet02.csv", "r"))
        testData3 = csv.reader(open("data/TestSet03.csv", "r"))
        testData4 = csv.reader(open("data/TestSet04.csv", "r"))

        results1 = f.execute(method, testData1)
        results2 = f.execute(method, testData2)
        results3 = f.execute(method, testData3)
        results4 = f.execute(method, testData4)

        # NOTE(review): results4 is computed but no TestSet04 evaluation is
        # printed — this excerpt appears truncated.
        print_evaluation(f, "TestSet01", results1)
        print_evaluation(f, "TestSet02", results2)
        print_evaluation(f, "TestSet03", results3)
예제 #6
0
if __name__ == '__main__':
    # Entry point: evaluate collaborative-filtering prediction methods on
    # randomly generated test ratings drawn from the Jester dataset.
    #
    # Fix: the original used Python-2-only `print` statements; the
    # single-argument parenthesized form below behaves identically on
    # Python 2 and also parses on Python 3.
    if len(sys.argv) != 3:
        print('Expected input format: python EvaluateCFList.py <method> <testList>')
    else:
        filename = 'data/jester-data-1.csv'
        items = {}
        users = {}
        matrix = []

        size = int(sys.argv[2])  # number of test cases to generate

        # parse() returns the ratings matrix plus user/item lookup tables.
        matrix, users, items = parse(filename)
        testData = gen_tests(users, size)
        f = Filter(matrix, users, items)

        method = sys.argv[1]
        print("Starting predictions")
        if method == 'all':
            # Run every weighting variant, then report each evaluation.
            w_results = f.execute('weighted_sum', testData)
            a_w_results = f.execute('adj_weighted_sum', testData)
            c_w_results = f.execute('cosine_weighted_sum', testData)
            c_a_w_results = f.execute('cosine_adj_weighted_sum', testData)
            print_evaluation(f, "Weighted Sum", w_results)
            print_evaluation(f, "Adjusted Weighted Sum", a_w_results)
            print_evaluation(f, "Cosine Weighted Sum", c_w_results)
            print_evaluation(f, "Cosine Adjusted Weighted Sum", c_a_w_results)
        else:
            results = f.execute(method, testData)
            print_evaluation(f, method, results)

예제 #7
0
    def run(self):
        """Build, train, and test an LSTM sequence tagger (TensorFlow 1.x).

        Builds either a stacked ('multi') or bidirectional ('bimul') LSTM
        graph depending on ``self.type``, trains on sequences built from
        ``self.input``, evaluates on the validation split every 25 epochs,
        and finally reports accuracy on a held-out test file.

        Returns:
            The ``utils.Evaluation`` accumulated over validation checkpoints.

        Fixes vs. original: the local ``eval`` (shadowed the builtin) is
        renamed ``evaluation``; the unused ``states`` in the bidirectional
        branch is renamed ``_states``.
        """
        training_dataset, valid_dataset = data.make_sequences(
            self.input, self.char2vec, self.output_char2vec, self.seq_length)

        input_batch = training_dataset.input_batch
        target_batch = training_dataset.target_batch
        seq_lens = training_dataset.seq_lens

        hidden_size = self.modelconfig.hidden_size

        X = tf.placeholder(tf.int32, [None, self.seq_length])  # X data
        X_onehot = tf.one_hot(
            X,
            self.modelconfig.input_size)  # one hot: 1 -> 0 1 0 0 0 0 0 0 0 0

        Y = tf.placeholder(tf.int32, [None, self.seq_length])  # Y label

        # True (unpadded) length of each sequence in a batch.
        Sequences = tf.placeholder(tf.int32, [None])

        # Dropout keep probability: <1 while training, 1 at eval/test time.
        keep_prob = tf.placeholder(tf.float32)

        # NOTE(review): if self.type is neither "multi" nor "bimul",
        # `outputs` is never bound and the dense layer below raises
        # NameError — presumably callers only pass these two values.
        if self.type == "multi":
            print('\n****** MultiLayer LSTM Initialize ******')

            with tf.variable_scope('cell_def'):
                cell1 = tf.nn.rnn_cell.BasicLSTMCell(num_units=hidden_size,
                                                     state_is_tuple=True)
                cell1 = tf.nn.rnn_cell.DropoutWrapper(
                    cell1, output_keep_prob=keep_prob)
                cell2 = tf.nn.rnn_cell.BasicLSTMCell(num_units=hidden_size,
                                                     state_is_tuple=True)
                cell2 = tf.nn.rnn_cell.DropoutWrapper(
                    cell2, output_keep_prob=keep_prob)
                multi_cell = tf.nn.rnn_cell.MultiRNNCell([cell1, cell2])

            with tf.variable_scope('rnn_def'):
                outputs, _states = tf.nn.dynamic_rnn(multi_cell,
                                                     X_onehot,
                                                     dtype=tf.float32,
                                                     sequence_length=Sequences)

        elif self.type == "bimul":
            print('\n****** Bidirectional LSTM Initialize ******')

            with tf.variable_scope('cell_def'):
                forward = tf.nn.rnn_cell.BasicLSTMCell(num_units=hidden_size,
                                                       state_is_tuple=True)
                forward = tf.nn.rnn_cell.DropoutWrapper(
                    forward, output_keep_prob=keep_prob)
                backward = tf.nn.rnn_cell.BasicLSTMCell(num_units=hidden_size,
                                                        state_is_tuple=True)
                backward = tf.nn.rnn_cell.DropoutWrapper(
                    backward, output_keep_prob=keep_prob)

            with tf.variable_scope('rnn_def'):
                outputs, _states = tf.nn.bidirectional_dynamic_rnn(
                    forward,
                    backward,
                    inputs=X_onehot,
                    dtype=tf.float32,
                    sequence_length=Sequences)
                # Concatenate forward/backward outputs along the feature axis.
                outputs = tf.concat(values=outputs, axis=2)

        # Per-timestep logits over the output alphabet.
        model = tf.layers.dense(outputs,
                                self.modelconfig.output_size,
                                activation=None)
        cost = tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(logits=model,
                                                           labels=Y))
        prediction = tf.argmax(model, axis=2)

        optimizer = tf.train.AdamOptimizer(learning_rate=0.005).minimize(cost)

        sess = tf.Session()
        sess.run(tf.global_variables_initializer())

        # Evaluation accumulator; checkpoints every 25 epochs.
        evaluation = utils.Evaluation(self.type, self.modelconfig.epoch, 25)

        print('\n------------ Training ------------ ')
        last_time = time.time()
        for epoch in range(self.modelconfig.epoch):

            _, loss = sess.run(
                [optimizer, cost],
                feed_dict={
                    X: input_batch,
                    Y: target_batch,
                    Sequences: seq_lens,
                    keep_prob: 0.8
                })
            if epoch % 25 == 24:
                result = sess.run(prediction,
                                  feed_dict={
                                      X: valid_dataset.input_batch,
                                      Y: valid_dataset.target_batch,
                                      Sequences: valid_dataset.seq_lens,
                                      keep_prob: 1
                                  })
                # NOTE(review): new accuracy ops are added to the graph on
                # every checkpoint (the numpy `result` is embedded as a
                # constant), so the graph grows over training.
                accuracy = tf.reduce_mean(
                    tf.cast(tf.equal(result, tf.cast(Y, tf.int64)),
                            tf.float32))
                accuracy_ret = sess.run(
                    accuracy, feed_dict={Y: valid_dataset.target_batch})
                speed = time.time() - last_time
                print('Epoch:', '%04d  ' % (epoch + 1), 'accuracy =',
                      '{:.6f}  '.format(accuracy_ret), 'cost =',
                      '{:.6f}'.format(loss), 'speed =', '{:.2f}'.format(speed),
                      'sec')
                last_time = time.time()

                avg_p, avg_r, avg_f = utils.print_evaluation(
                    valid_dataset.target_batch, result,
                    self.output_char2vec.char_dict)
                evaluation.set(epoch, accuracy_ret, loss, speed, avg_p, avg_r,
                               avg_f)
                print('')

        print('\n------------ Testing ------------ ')
        test_sentences = data.read_data("data/test/BHXX0035.txt", 30)
        test_dataset, _ = data.make_sequences(test_sentences,
                                              self.char2vec,
                                              self.output_char2vec,
                                              self.seq_length,
                                              make_valid=False)

        result = sess.run(prediction,
                          feed_dict={
                              X: test_dataset.input_batch,
                              Y: test_dataset.target_batch,
                              Sequences: test_dataset.seq_lens,
                              keep_prob: 1
                          })

        accuracy = tf.reduce_mean(
            tf.cast(tf.equal(result, tf.cast(Y, tf.int64)), tf.float32))
        accuracy_ret = sess.run(accuracy,
                                feed_dict={Y: test_dataset.target_batch})

        print('Accuracy =', '{:.6f}'.format(accuracy_ret))

        # Show the first two target/prediction sentence pairs for eyeballing.
        for index, predict_sequence in enumerate(result):
            target_output, prediction_output = data.compare_sentence(
                self.output_char2vec, test_dataset.target_batch[index],
                test_dataset.input_source[index], predict_sequence)
            if index < 2:
                print("target sentence:    ", target_output[1])
                print("prediction sentence:", prediction_output[1])
        return evaluation
예제 #8
0
if __name__ == '__main__':
    # Entry point: evaluate collaborative-filtering prediction methods on
    # randomly generated test ratings drawn from the Jester dataset.
    #
    # Fix: the original used Python-2-only `print` statements; the
    # single-argument parenthesized form below behaves identically on
    # Python 2 and also parses on Python 3.
    if len(sys.argv) != 3:
        print('Expected input format: python EvaluateCFList.py <method> <testList>')
    else:
        filename = 'data/jester-data-1.csv'
        items = {}
        users = {}
        matrix = []

        size = int(sys.argv[2])  # number of test cases to generate

        # parse() returns the ratings matrix plus user/item lookup tables.
        matrix, users, items = parse(filename)
        testData = gen_tests(users, size)
        f = Filter(matrix, users, items)

        method = sys.argv[1]
        print("Starting predictions")
        if method == 'all':
            # Run every weighting variant, then report each evaluation.
            w_results = f.execute('weighted_sum', testData)
            a_w_results = f.execute('adj_weighted_sum', testData)
            c_w_results = f.execute('cosine_weighted_sum', testData)
            c_a_w_results = f.execute('cosine_adj_weighted_sum', testData)
            print_evaluation(f, "Weighted Sum", w_results)
            print_evaluation(f, "Adjusted Weighted Sum", a_w_results)
            print_evaluation(f, "Cosine Weighted Sum", c_w_results)
            print_evaluation(f, "Cosine Adjusted Weighted Sum", c_a_w_results)
        else:
            results = f.execute(method, testData)
            print_evaluation(f, method, results)