# Load the stylometric feature vectors of three authors from the data
# warehouse, label them 0..2, and walk through a 3-fold split of the samples.
import numpy as np
# NOTE(review): sklearn.cross_validation was deprecated in scikit-learn 0.18
# and removed in 0.20. Migrating to sklearn.model_selection requires the newer
# KFold(n_splits=...).split(X) API, so the import is left unchanged here.
from sklearn.cross_validation import KFold

from data_analysis import data_warehouse

# Author ids to fetch; the position in this tuple becomes the class label
# (author 1 -> 0, author 2 -> 1, author 3 -> 2).
AUTHOR_IDS = (1, 2, 3)

author_list = []
feature_list = []
for label, author_id in enumerate(AUTHOR_IDS):
    author_features = data_warehouse.get_stylometric_features_by_author_id(author_id)
    feature_list.extend(author_features)
    # One label per fetched feature row, so both lists stay aligned.
    author_list.extend([label] * len(author_features))

X = np.array(feature_list)
y = np.array(author_list)

# NOTE(review): samples are appended author by author and KFold is created
# without shuffling - confirm that contiguous, author-ordered folds are intended.
kf = KFold(len(feature_list), n_folds=3)

# Sanity output: number of samples, number of labels, number of folds.
print(len(feature_list))
print(len(author_list))
print(len(kf))

for train_index, test_index in kf:
    # A single pre-formatted string prints identically under Python 2 and 3;
    # the former parenthesised print statement emitted a tuple repr on Python 2.
    print("TRAIN: %s TEST: %s" % (train_index, test_index))
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
# Fetch stylometric feature vectors for a range of authors, hold out 10% of
# the samples, and look up the nearest training neighbours of every held-out
# sample with the project's set-based KNN helper.
import time

import numpy as np
# NOTE(review): sklearn.cross_validation was deprecated in scikit-learn 0.18
# and removed in 0.20; train_test_split now lives in sklearn.model_selection.
from sklearn import cross_validation
from sklearn.metrics import classification_report, accuracy_score

from data_analysis import data_warehouse
from data_analysis import calculate_K_nearest_neighbors_classifier_for_sets as KNN

start_time = time.time()

# Inclusive range of author ids to load; the author id doubles as class label.
FIRST_AUTHOR_ID = 1
LAST_AUTHOR_ID = 19
# Number of neighbours requested from the KNN helper.
NEIGHBOR_COUNT = 5

author_list = []
feature_list = []
for author_id in range(FIRST_AUTHOR_ID, LAST_AUTHOR_ID + 1):
    author_features = data_warehouse.get_stylometric_features_by_author_id(author_id)
    feature_list.extend(author_features)
    # One label per fetched feature row, so both lists stay aligned.
    author_list.extend([author_id] * len(author_features))

# Fixed random_state keeps the 90/10 split reproducible between runs.
X_train, X_test, y_train, y_test = cross_validation.train_test_split(
    feature_list, author_list, test_size=0.1, random_state=1)

# Pair every feature vector with its label. list() is required because zip()
# only returns a list on Python 2; on Python 3 np.array would wrap the iterator.
train = np.array(list(zip(X_train, y_train)))
test = np.array(list(zip(X_test, y_test)))

predictions = []

print('Finished getting data from the database')

for idx in range(len(X_test)):
    # Message text (including the double space) matches what the original
    # Python 2 print statement produced.
    print('Classifying test instance number  ' + str(idx) + ':')
    neighbors = KNN.get_set_neighbor(training_set=train,
                                     test_instance=test[idx][0],
                                     k=NEIGHBOR_COUNT)
# Load the stylometric feature vectors of three authors from the data
# warehouse, label them 0..2, and walk through a 3-fold split of the samples.
import numpy as np
# NOTE(review): sklearn.cross_validation was deprecated in scikit-learn 0.18
# and removed in 0.20. Migrating to sklearn.model_selection requires the newer
# KFold(n_splits=...).split(X) API, so the import is left unchanged here.
from sklearn.cross_validation import KFold

from data_analysis import data_warehouse

# Author ids to fetch; the position in this tuple becomes the class label
# (author 1 -> 0, author 2 -> 1, author 3 -> 2).
AUTHOR_IDS = (1, 2, 3)

author_list = []
feature_list = []
for label, author_id in enumerate(AUTHOR_IDS):
    author_features = data_warehouse.get_stylometric_features_by_author_id(author_id)
    feature_list.extend(author_features)
    # One label per fetched feature row, so both lists stay aligned.
    author_list.extend([label] * len(author_features))

X = np.array(feature_list)
y = np.array(author_list)

# NOTE(review): samples are appended author by author and KFold is created
# without shuffling - confirm that contiguous, author-ordered folds are intended.
kf = KFold(len(feature_list), n_folds=3)

# Sanity output: number of samples, number of labels, number of folds.
print(len(feature_list))
print(len(author_list))
print(len(kf))

for train_index, test_index in kf:
    # A single pre-formatted string prints identically under Python 2 and 3;
    # a multi-argument print(...) is a tuple-printing statement on Python 2.
    print("TRAIN: %s TEST: %s" % (train_index, test_index))
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]