def knn_experiment():
    """Run the k-nearest-neighbour experiment over several values of k.

    For each neighbour count in (1, 15, 31, 45, 61) a ``knn.MultikNN``
    learner is trained on the 'train' split, used to predict the 'test'
    split and scored with ``evaluate.per_topic_results``.  The per-topic
    results are pretty-printed as they are produced and the collected
    results for all models are written to
    'results/knn_final_results.txt'.

    Relies on the module-level ``topics`` sequence.  Returns nothing.
    """
    K = len(topics)
    manager = data.create_data_manager()
    ordered_topics = manager.order_topics(topics)

    print('loading train data . . .')
    X_train, Y_train = manager.load_data('train')
    Y_train_slice = manager.slice_Y(Y_train, ordered_topics)

    print('loading test data . . .')
    X_test, Y_test = manager.load_data('test')
    Y_gold = manager.slice_Y(Y_test, ordered_topics)

    final_results = {}
    print('interating over models . . .')
    for n_neighbours in (1, 15, 31, 45, 61):
        model = '%d-nearest neighbours' % n_neighbours
        print('now using model %s . . .' % model)

        # K is the number of topics, n_neighbours the number of neighbours.
        learner = knn.MultikNN(K, n_neighbours)
        learner.train(X_train, Y_train_slice)
        Y_pred = learner.batch_predict_classes(X_test)

        per_topic = evaluate.per_topic_results(Y_pred, Y_gold)
        # Pair every topic with its result, in ordered_topics order.
        results_dict = dict(zip(ordered_topics, per_topic))
        pprint(results_dict)
        final_results[model] = results_dict

    print('saving final results . . .')
    with open('results/knn_final_results.txt', 'w') as out:
        pprint(final_results, stream=out)
def distance_matrix(self):
    """Compute, save and return the test-by-train distance matrix.

    Loads the 'train' and 'test' splits through a fresh data manager and
    fills an array of shape (number of test rows, number of train rows)
    where entry [i, j] is ``self._distance(X_test[i], X_train[j])``.
    The array is written to 'distance_matrix.npy' before being returned.
    """
    print('creating distance_matrix . . .')
    manager = data.create_data_manager()
    X_train, _ = manager.load_data('train')
    X_test, _ = manager.load_data('test')

    n_test = X_test.shape[0]
    n_train = X_train.shape[0]
    distances = np.empty((n_test, n_train))

    for i in xrange(n_test):
        # Progress output; same text as printing 'row' followed by the index.
        print('row %s' % i)
        query = X_test[i]
        row_values = []
        for j in xrange(n_train):
            row_values.append(self._distance(query, X_train[j]))
        distances[i] = np.array(row_values)

    np.save('distance_matrix.npy', distances)
    return distances
def test2():
    """Train a 30-neighbour MultikNN and return its test classification errors.

    The learner is trained on the 'train' split, restricted to the
    module-level ``topics``, and the value returned is whatever
    ``learner.classification_errors`` yields for the 'test' split.
    """
    print('loading data . . .')
    manager = data.create_data_manager()
    X_train, Y_train = manager.load_data('train')
    train_labels = manager.slice_Y(Y_train, topics)

    print('training knn . . .')
    n_topics = len(topics)
    learner = knn.MultikNN(n_topics, 30)
    learner.train(X_train, train_labels)

    print('testing knn . . .')
    X_test, Y_test = manager.load_data('test')
    test_labels = manager.slice_Y(Y_test, topics)
    return learner.classification_errors(X_test, test_labels)
def test():
    """Test run of a degree-3 polynomial-kernel SVM with C=1.

    Trains ``svm2.MultiSVM`` on the 'train' split for the module-level
    ``topics`` (per the original note: the ten main topics in reuters),
    then prints precision, recall and F1 first for the training data and
    then for the test data.  Returns nothing.
    """

    def report(Y_pred, Y_true):
        # Score one set of predictions and print the three metrics.
        print('evaluating . . .')
        precision, recall = evaluate.precision_recall(Y_pred, Y_true)
        f1 = evaluate.f_score(precision, recall)
        print('Precision: %.3f' % precision)
        print('Recall: %.3f' % recall)
        print('F1: %.3f' % f1)

    manager = data.create_data_manager()

    print('loading train data . . .')
    X_train, Y_train = manager.load_data('train')
    Y_train_slice = manager.slice_Y(Y_train, topics)

    print('training svm . . .')
    n_topics = len(topics)
    learner = svm2.MultiSVM(n_topics, 1.0, 'poly', 3)
    learner.train(X_train, Y_train_slice)

    print('loading test data . . .')
    X_test, Y_test = manager.load_data('test')
    Y_gold = manager.slice_Y(Y_test, topics)

    print('predicting train . . .')
    report(learner.batch_predict_classes(X_train), Y_train_slice)

    print('predicting test . . .')
    report(learner.batch_predict_classes(X_test), Y_gold)
def svm_experiment(C, balance=''):
    """Run the SVM experiment for every kernel listed in ``svm_kernels``.

    C       -- value passed as the second argument to ``svm.MultiSVM``.
    balance -- forwarded to ``learner.train``; use '' (the default) if
               the training data should not be balanced.

    Each (model, kernel) pair is trained on the 'train' split with
    ``max_per_class=3000``, used to predict the 'test' split and scored
    with ``evaluate.per_topic_results``.  Per-topic results are
    pretty-printed per model and the collected results are written to
    'results/svm_final_results_C_<C>_<balance>.txt'.  Returns nothing.
    """
    K = len(topics)
    manager = data.create_data_manager()
    ordered_topics = manager.order_topics(topics)

    print('loading train data . . .')
    X_train, Y_train = manager.load_data('train')
    Y_train_slice = manager.slice_Y(Y_train, ordered_topics)

    print('loading test data . . .')
    X_test, Y_test = manager.load_data('test')
    Y_gold = manager.slice_Y(Y_test, ordered_topics)

    final_results = {}
    print('interating over models . . .')
    for model, kernel in svm_kernels:
        print('now using model %s . . .' % model)
        learner = svm.MultiSVM(K, C, kernel)
        learner.train(
            X_train,
            Y_train_slice,
            ordered_topics,
            balance=balance,  # '' means: do not balance
            max_per_class=3000,
        )
        Y_pred = learner.batch_predict_classes(X_test)

        per_topic = evaluate.per_topic_results(Y_pred, Y_gold)
        # Pair every topic with its result, in ordered_topics order.
        results_dict = dict(zip(ordered_topics, per_topic))
        pprint(results_dict)
        final_results[model] = results_dict

    print('saving final results . . .')
    out_path = 'results/svm_final_results_C_%.1f_%s.txt' % (C,balance)
    with open(out_path, 'w') as out:
        pprint(final_results, stream=out)
def __init__(self, XY=None):
    """Create the data manager and set the data this object works on.

    XY -- optional pair that unpacks into (X, Y).  When omitted (None),
          the 'train' split is loaded through the data manager instead.

    Sets ``self.dm``, ``self.X`` and ``self.Y``.
    """
    self.dm = data.create_data_manager()
    # Identity test on purpose: ``XY == None`` compares elementwise when
    # XY is a numpy array, which makes the ``if`` ambiguous or wrong.
    if XY is None:
        self.X, self.Y = self.dm.load_data('train')
    else:
        self.X, self.Y = XY