def model_selection_without_normalization(distance_funcs, Xtrain, ytrain, Xval, yval):
    """Select the KNN model with the best validation F1 score.

    Args:
        distance_funcs: dict mapping a name to a distance function.
        Xtrain: List[List[int]] training features.
        ytrain: List[int] training labels.
        Xval: List[List[int]] validation features.
        yval: List[int] validation labels.

    Returns:
        (best_model, best_k, best_func): the best KNN instance, its k,
        and the NAME of its distance function.
    """
    allknn = []
    performance = []
    function = []
    for key, value in distance_funcs.items():
        # Odd k only, capped below 30 and by the training-set size.
        for k in range(1, min(30, len(Xtrain)), 2):
            knn = KNN(k, value)
            knn.train(Xtrain, ytrain)
            answer = knn.predict(Xval)
            score = f1_score(yval, answer)
            allknn.append(knn)
            function.append(key)
            performance.append(score)
    # Compute the winning index once (the original ran argmax twice).
    best_idx = int(np.argmax(np.array(performance)))
    result = allknn[best_idx]
    best_function = function[best_idx]
    return result, result.k, best_function
def model_selection_without_normalization(distance_funcs, Xtrain, ytrain, Xval, yval):
    """Grid-search KNN over distance functions and odd k values.

    Args:
        distance_funcs: dict mapping a name to a distance function.
        Xtrain: List[List[int]] training features.
        ytrain: List[int] training labels.
        Xval: List[List[int]] validation features.
        yval: List[int] validation labels.

    Returns:
        (best_model, best_k, best_function): best by validation F1; the
        first candidate wins on exact ties (strict '<' comparison).
    """
    best_model = None
    best_k = 0
    best_f1_score = -1
    best_function = ""
    best_choices = []
    # Cap k below both 31 and the training-set size.
    highest_range = len(Xtrain) - 1
    if highest_range > 31:
        highest_range = 31
    for name, distance_func in distance_funcs.items():
        for k in range(1, highest_range, 2):
            model = KNN(k, distance_function=distance_func)
            model.train(Xtrain, ytrain)
            # Only the validation F1 drives selection; the original also
            # predicted over the full training set each iteration and
            # discarded the result — that dead computation is removed.
            valid_f1_score = f1_score(yval, model.predict(Xval))
            if best_f1_score < valid_f1_score:
                best_f1_score = valid_f1_score
                best_k = k
                best_model = model
                best_function = name
                # Keep the history of improvements for debugging.
                best_choices.append([best_k, best_function, best_f1_score])
    print("best_k:", best_k, "best_function:", best_function, "best_f1_score:", best_f1_score)
    return best_model, best_k, best_function
def model_selection_without_normalization(distance_funcs, Xtrain, ytrain, Xval, yval):
    """Select the best KNN (k, distance function) by validation F1.

    Args:
        distance_funcs: dict mapping a name to a distance function.
        Xtrain, ytrain: training features/labels (converted to ndarrays).
        Xval, yval: validation features/labels (converted to ndarrays).

    Returns:
        (best_model, best_k, best_func): best KNN instance, its k, and
        the NAME of the chosen distance function.
    """
    Xtrain = np.array(Xtrain, dtype=float)
    ytrain = np.array(ytrain, dtype=int)
    Xval = np.array(Xval, dtype=float)
    yval = np.array(yval, dtype=int)
    # Size the score table by the actual number of distance functions
    # (the original hard-coded 4 columns and would IndexError on more).
    f1 = np.zeros((30, max(4, len(distance_funcs))))
    upper_k = 30
    if len(Xtrain) < 30:
        upper_k = len(Xtrain)
    # Initialize the winners so the function cannot raise NameError when
    # every candidate scores 0 (the original only bound these on a strict
    # improvement over m = 0).
    best_model = None
    best_k = -1
    best_func = None
    m = -1.0
    for k in range(1, upper_k, 2):  # odd k only
        c = 0
        for j in distance_funcs:
            inst = KNN(k, distance_funcs[j])
            inst.train(Xtrain, ytrain)
            pred_val = inst.predict(Xval)
            f1[k][c] = f1_score(yval, pred_val)
            if f1[k][c] > m:
                best_k = k
                best_func = j
                m = f1[k][c]
                best_model = inst
            c = c + 1
    print(f1)
    print(best_model, best_k, best_func)
    # The original ended with an unreachable `raise NotImplementedError`
    # after this return; it has been removed.
    return best_model, best_k, best_func
def model_selection_without_normalization(distance_funcs, Xtrain, ytrain, Xval, yval):
    """Pick the KNN configuration with the highest validation F1 score.

    Args:
        distance_funcs: dict mapping a name to a distance function.
        Xtrain: List[List[int]] training features.
        ytrain: List[int] training labels.
        Xval: List[List[int]] validation features.
        yval: List[int] validation labels.

    Returns:
        (best_model, best_k, best_func): the winning KNN instance, its k,
        and the name of its distance function. Ties keep the earliest
        candidate (smaller k first, then list order of functions).
    """
    # Fixed evaluation order over the supported distance-function names.
    candidate_names = [
        'euclidean',
        'gaussian',
        'inner_prod',
        'cosine_dist'
    ]
    top_score = 0
    top_model = None
    top_k = -1
    top_name = "*"
    for candidate_k in range(1, 30, 2):
        # k must stay below the number of training samples.
        if candidate_k >= len(Xtrain):
            continue
        for fname in candidate_names:
            classifier = KNN(candidate_k, distance_funcs[fname])
            classifier.train(Xtrain, ytrain)
            score = f1_score(yval, classifier.predict(Xval))
            if score > top_score:
                top_score = score
                top_model = classifier
                top_k = candidate_k
                top_name = fname
    return top_model, top_k, top_name
def model_selection_with_transformation(distance_funcs, scaling_classes, Xtrain, ytrain, Xval, yval):
    """Grid-search KNN over scalers, distance functions and odd k values.

    Args:
        distance_funcs: dict mapping a name to a distance function.
        scaling_classes: dict mapping a name to a scaler class.
        Xtrain: List[List[int]] training features.
        ytrain: List[int] training labels.
        Xval: List[List[int]] validation features.
        yval: List[int] validation labels.

    Returns:
        (best_model, best_k, best_func, best_scaler): winner by
        validation F1, with the function/scaler returned by NAME.
    """
    best_f1_score, best_k = 0, -1
    # Initialize the winners so we never hit NameError when no candidate
    # beats an F1 of 0 (or the loops never execute); the original left
    # model1/func1/scaler1 unbound in that case.
    model1, func1, scaler1 = None, None, None
    for scaling_name, scaling_class in scaling_classes.items():
        for name, func in distance_funcs.items():
            # Fresh scaler per pair; both splits go through the same object.
            scaler = scaling_class()
            train_features_scaled = scaler(Xtrain)
            valid_features_scaled = scaler(Xval)
            k_lim = len(Xtrain) - 1
            for k in range(1, min(31, k_lim), 2):
                model = KNN(k=k, distance_function=func)
                model.train(train_features_scaled, ytrain)
                valid_f1_score = f1_score(yval, model.predict(valid_features_scaled))
                if valid_f1_score > best_f1_score:
                    best_f1_score, best_k = valid_f1_score, k
                    model1 = model
                    func1 = name
                    scaler1 = scaling_name
    return model1, best_k, func1, scaler1
def model_selection_without_normalization(distance_funcs, Xtrain, ytrain, Xval, yval):
    """Select the best KNN configuration and retrain it on train + val.

    Args:
        distance_funcs: dict mapping a name to a distance function.
        Xtrain: List[List[int]] training features.
        ytrain: List[int] training labels.
        Xval: List[List[int]] validation features.
        yval: List[int] validation labels.

    Returns:
        (best_model, best_k, best_distance_func_name): the selected
        configuration retrained on the union of train and validation data.
    """
    best_f1_score = -1
    best_k = 0
    # The original initialized this to an empty dict — the wrong sentinel
    # type for a function NAME; None is the honest "not found yet" value.
    best_distance_func_name = None
    for name, dist_func in distance_funcs.items():
        for k in range(1, 31, 2):  # odd k only
            if len(Xtrain) < k:
                break  # k cannot exceed the number of training points
            model = KNN(k=k, distance_function=dist_func)
            model.train(Xtrain, ytrain)
            # Only the validation F1 drives selection; the original also
            # scored the training set every iteration and discarded the
            # result — that dead computation is removed.
            valid_f1_score = f1_score(yval, model.predict(Xval))
            if valid_f1_score > best_f1_score:
                best_f1_score = valid_f1_score
                best_k = k
                best_distance_func_name = name
    # Retrain the winning configuration on train + validation data.
    best_model = KNN(
        k=best_k,
        distance_function=distance_funcs.get(best_distance_func_name))
    best_model.train(np.concatenate((Xtrain, Xval), axis=0),
                     np.concatenate((ytrain, yval), axis=0))
    return best_model, best_k, best_distance_func_name
def test_inner_product_knn(self):
    """k=1 KNN with the inner-product distance: neighbor and response."""
    classifier = KNN(1, inner_product_distance)
    samples = [[1, 1], [1, 2], [2, 2], [9, 9], [8, 8], [8, 9]]
    labels = [0, 0, 0, 1, 1, 1]
    classifier.train(samples, labels)

    # Query near the label-0 cluster.
    query_low = [0, 0]
    nearest = classifier.get_neighbors(query_low)
    self.assertEqual(0, classifier.get_response(nearest))
    numpy.testing.assert_array_equal(numpy.array([[1, 1, 0]]), nearest)

    # Query near the label-1 cluster; NOTE the expected neighbor is still
    # [1, 1, 0] here — a property of this distance function as exercised
    # by the original test.
    query_high = [10, 10]
    nearest = classifier.get_neighbors(query_high)
    numpy.testing.assert_array_equal(numpy.array([[1, 1, 0]]), nearest)
    self.assertEqual(1, classifier.get_response(nearest))
def test_knn(self):
    """End-to-end KNN check on the cancer dataset.

    Splits the data 400/60/109 into train/validation/test, selects k by
    validation F1 (only k=1 is tried here), then retrains on
    train + validation and reports the test F1 score.
    """
    features, labels = generate_data_cancer()
    train_features, train_labels = features[:400], labels[:400]
    valid_features, valid_labels = features[400:460], labels[400:460]
    test_features, test_labels = features[460:], labels[460:]
    # Sanity-check the fixed split sizes (400 + 60 + 109 rows total).
    assert len(train_features) == len(train_labels) == 400
    assert len(valid_features) == len(valid_labels) == 60
    assert len(test_features) == len(test_labels) == 109

    distance_funcs = {
        # 'euclidean': euclidean_distance,
        # 'gaussian': gaussian_kernel_distance,
        'inner_prod': inner_product_distance,
    }

    for name, func in distance_funcs.items():
        best_f1_score, best_k = -1, 0
        for k in [1]:  # only k=1 is evaluated in this test
            model = KNN(k=k, distance_function=func)
            model.train(train_features, train_labels)
            # print(train_labels)
            # print(model.predict(train_features))
            train_f1_score = f1_score(train_labels, model.predict(train_features))
            valid_f1_score = f1_score(valid_labels, model.predict(valid_features))
            print(f'[part 2.1] {name}\tk: {k:d}\t'
                  f'train: {train_f1_score:.5f}\t'
                  f'valid: {valid_f1_score:.5f}')
            if valid_f1_score > best_f1_score:
                best_f1_score, best_k = valid_f1_score, k

        # Retrain the selected k on train + validation before scoring test.
        model = KNN(k=best_k, distance_function=func)
        model.train(train_features + valid_features, train_labels + valid_labels)
        test_f1_score = f1_score(test_labels, model.predict(test_features))
        print()
        print(f'[part 2.1] {name}\tbest_k: {best_k:d}\t'
              f'test f1 score: {test_f1_score:.5f}')
        print()
def model_selection_without_normalization(distance_funcs, Xtrain, ytrain, Xval, yval):
    """Select the best KNN configuration with deterministic tie-breaking.

    Args:
        distance_funcs: dictionary of distance functions.
        Xtrain: List[List[int]] train set.
        ytrain: List[int] train labels.
        Xval: List[List[int]] validation set.
        yval: List[int] validation labels.

    Returns:
        best_model: the selected KNN instance.
        best_k: the k chosen for best_model.
        best_func: the NAME of the chosen distance function.
    """
    # k is capped by the number of training points and by 30.
    upper_bound = len(Xtrain)
    if upper_bound > 30:
        upper_bound = 30
    max_f1 = []
    for key, distance_func in distance_funcs.items():
        # Best (score, k, model) per distance function; the strict '>'
        # keeps the SMALLEST k when scores tie within one function.
        max_score = -1
        min_k = 0
        best_model = []
        for k in range(1, upper_bound, 2):  # odd k only
            knn = KNN(k, distance_func)
            knn.train(Xtrain, ytrain)
            pred_labels = knn.predict(Xval)
            curr_f1 = f1_score(yval, pred_labels)
            if curr_f1 > max_score:
                max_score = curr_f1
                min_k = k
                best_model = knn
        max_f1.append((max_score, key, min_k, best_model))
    # Highest F1 first; cross-function ties are then resolved by the
    # fixed priority order below.
    max_f1.sort(reverse=True)
    # filter ties
    # NOTE(review): assumes filter_ties keeps the entries tied on the
    # leading score element — confirm against its definition.
    majority = filter_ties(max_f1)
    SORT_ORDER = {
        "euclidean": 0,
        "gaussian": 1,
        "inner_prod": 2,
        "cosine_dist": 3
    }
    # break ties
    majority.sort(key=lambda val: SORT_ORDER[val[1]])
    return majority[0][3], majority[0][2], majority[0][1]
def model_selection_with_transformation(distance_funcs, scaling_classes, Xtrain, ytrain, Xval, yval):
    """Grid-search KNN over k, distance function, and feature scaler.

    Args:
        distance_funcs: dict mapping a name to a distance function.
        scaling_classes: dict mapping a name to a scaler class.
        Xtrain, ytrain: training features/labels (converted to ndarrays).
        Xval, yval: validation features/labels (converted to ndarrays).

    Returns:
        (best_model, best_k, best_func, best_scaler): winner by
        validation F1, function and scaler returned by NAME.
    """
    Xtrain = np.array(Xtrain, dtype=float)
    ytrain = np.array(ytrain, dtype=int)
    Xval = np.array(Xval, dtype=float)
    yval = np.array(yval, dtype=int)
    # Size the score table by the actual grid (original hard-coded 4x2).
    f1 = np.zeros((30, max(4, len(distance_funcs)), max(2, len(scaling_classes))))
    upper_k = 30
    if len(Xtrain) < 30:
        upper_k = len(Xtrain)
    # Pre-bind the winners so no NameError occurs when all scores are 0.
    best_model, best_k, best_func, best_scaler = None, -1, None, None
    m = -1.0
    for k in range(1, upper_k, 2):  # odd k only
        for c, j in enumerate(distance_funcs):
            for c1, i in enumerate(scaling_classes):
                # Instantiate the scaler from the scaling_classes argument
                # (the original hard-coded MinMaxScaler/NormalizationScaler
                # and left c1 unbound for any other key).
                scaler = scaling_classes[i]()
                # Always scale the ORIGINAL features: the original rebound
                # Xtrain/Xval inside this loop, so a second scaler could
                # receive already-scaled data.
                X_t = scaler(Xtrain)
                X_v = scaler(Xval)
                # Fresh model per combination: the original reused one KNN
                # across scalers, so the selected best_model could be
                # silently retrained by a later iteration.
                inst = KNN(k, distance_funcs[j])
                inst.train(X_t, ytrain)
                pred_val = inst.predict(X_v)
                f1[k][c][c1] = f1_score(yval, pred_val)
                if f1[k][c][c1] > m:
                    best_model = inst
                    best_k = k
                    best_func = j
                    best_scaler = i
                    m = f1[k][c][c1]
    print(best_model, best_k, best_func, best_scaler)
    print(f1)
    # The unreachable `raise NotImplementedError` after this return in the
    # original has been removed.
    return best_model, best_k, best_func, best_scaler
def model_selection_with_transformation(distance_funcs, scaling_classes, Xtrain, ytrain, Xval, yval):
    """Pick the KNN configuration (k, distance, scaler) with the best
    validation F1.

    Args:
        distance_funcs: dict mapping a name to a distance function.
        scaling_classes: dict mapping a name to a scaler class.
        Xtrain: List[List[int]] training features.
        ytrain: List[int] training labels.
        Xval: List[List[int]] validation features.
        yval: List[int] validation labels.

    Returns:
        (best_model, best_k, best_func, best_scalar): winner by
        validation F1; ties keep the earliest candidate in scan order.
    """
    # Fixed scan order over supported names.
    func_names = ['euclidean', 'gaussian', 'inner_prod', 'cosine_dist']
    scaler_names = ['min_max_scale', 'normalize']
    top_score = 0
    top_model = None
    top_k = -1
    top_func = "*"
    top_scaler = "+"
    for candidate_k in range(1, 30, 2):
        for fname in func_names:
            for sname in scaler_names:
                # k must stay below the number of training samples.
                if candidate_k >= len(Xtrain):
                    continue
                # Fresh scaler per combination; both splits share it.
                transform = scaling_classes[sname]()
                scaled_train = transform(Xtrain)
                scaled_valid = transform(Xval)
                classifier = KNN(candidate_k, distance_funcs[fname])
                classifier.train(scaled_train, ytrain)
                score = f1_score(yval, classifier.predict(scaled_valid))
                if score > top_score:
                    top_score = score
                    top_model = classifier
                    top_k = candidate_k
                    top_func = fname
                    top_scaler = sname
    print(top_model, top_k, top_func, top_scaler)
    return top_model, top_k, top_func, top_scaler
def model_selection_with_transformation(distance_funcs, scaling_classes, Xtrain, ytrain, Xval, yval):
    """Grid-search KNN over scalers, distance functions and odd k values.

    Args:
        distance_funcs: dict mapping a name to a distance function.
        scaling_classes: dict mapping a name to a scaler class.
        Xtrain: List[List[int]] training features.
        ytrain: List[int] training labels.
        Xval: List[List[int]] validation features.
        yval: List[int] validation labels.

    Returns:
        (best_model, best_k, best_function, best_scaler): winner by
        validation F1 (function and scaler by NAME); the first candidate
        wins on exact ties.
    """
    best_model = None
    best_scaler = ""
    best_k = 0
    best_f1_score = 0
    best_function = ""
    # Cap k below both 31 and the training-set size.
    highest_range = len(Xtrain) - 1
    if highest_range > 31:
        highest_range = 31
    for scaler_name, scaling_class in scaling_classes.items():
        for name, distance_func in distance_funcs.items():
            # Scale once per (scaler, function) pair; k doesn't affect it.
            scaler = scaling_class()
            Xtrain_scaled = scaler(Xtrain)
            Xval_scaled = scaler(Xval)
            for k in range(1, highest_range, 2):
                model = KNN(k, distance_function=distance_func)
                model.train(Xtrain_scaled, ytrain)
                # Only the validation F1 drives selection; the original
                # also scored the training set every iteration and threw
                # the result away — that dead computation is removed,
                # along with the never-read best_choice list.
                valid_f1_score = f1_score(yval, model.predict(Xval_scaled))
                if best_f1_score < valid_f1_score:
                    best_f1_score = valid_f1_score
                    best_k = k
                    best_function = name
                    best_scaler = scaler_name
                    best_model = model
    return best_model, best_k, best_function, best_scaler
def model_selection_without_normalization(distance_funcs, Xtrain, ytrain, Xval, yval):
    """Select the best KNN (k, distance function) by validation F1.

    Args:
        distance_funcs: dict mapping a name to a distance function.
        Xtrain: List[List[int]] training features.
        ytrain: List[int] training labels.
        Xval: List[List[int]] validation features.
        yval: List[int] validation labels.

    Returns:
        (best_model, best_k, best_func_name). Returns (None, 0, None) for
        degenerate inputs instead of raising.
    """
    best_f1_score, best_k = -1, 0
    # Pre-bind the winners: the original raised NameError on return when
    # the loops never ran (empty distance_funcs, or len(Xtrain) <= 2
    # making the k range empty).
    model1, func1 = None, None
    for name, func in distance_funcs.items():
        # k is capped by 30 and by the training-set size.
        k_lim = len(Xtrain) - 1
        for k in range(1, min(31, k_lim), 2):  # odd k only
            model = KNN(k=k, distance_function=func)
            model.train(Xtrain, ytrain)
            valid_f1_score = f1_score(yval, model.predict(Xval))
            if valid_f1_score > best_f1_score:
                best_f1_score, best_k = valid_f1_score, k
                model1 = model
                func1 = name
    return model1, best_k, func1
def setUp(self):
    # Fresh 1-nearest-neighbour classifier with the Euclidean distance
    # for every test case.
    self.knn = KNN(1, euclidean_distance)
def model_selection_with_transformation(distance_funcs, scaling_classes, Xtrain, ytrain, Xval, yval):
    """Select the best KNN configuration with feature scaling.

    Args:
        distance_funcs: dictionary of distance functions.
        scaling_classes: dictionary of scaler classes.
        Xtrain: List[List[int]] train set.
        ytrain: List[int] train labels.
        Xval: List[List[int]] validation set.
        yval: List[int] validation labels.

    Returns:
        best_model: the selected KNN instance.
        best_k: the k chosen for best_model.
        best_func: chosen distance function (NAME once any candidate wins;
            the initial default is the function OBJECT — see note below).
        best_scaler: chosen scaler (same name-vs-object caveat).
    """
    #raise NotImplementedError
    best_model = None
    best_k = 0
    # NOTE(review): these defaults are the function/class OBJECTS, but on
    # any improvement below they are replaced by NAME strings — the return
    # type is inconsistent in the no-winner case; confirm intended.
    best_func = distance_funcs['euclidean']
    best_scaler = scaling_classes['min_max_scale']
    max_score = 0
    # Upper bound for k: 30, or len(Xtrain) - 1 for small training sets.
    n = 30
    if len(Xtrain) < 30:
        n = len(Xtrain) - 1
    for sc in scaling_classes:
        # One scaler instance per scaling class; train and validation
        # features are transformed by the same object.
        scaler = scaling_classes[sc]()
        scaled_train = scaler(Xtrain)
        scaled_val = scaler(Xval)
        for func in distance_funcs:
            for k in range(1, n, 2):  # odd k only
                model = KNN(k, distance_funcs[func])
                model.train(scaled_train, ytrain)
                predicted = model.predict(scaled_val)
                temp_f1 = f1_score(yval, predicted)
                #valid_f1_score = f1_score(yval, scaled_val)
                """
                print('[part 1.2] {name}\t{scaling_name}\tk: {k:d}\t'.format(name = func, scaling_name = sc, k = k) +
                      'train: {train_f1_score:.5f}\t'.format(train_f1_score = temp_f1) +
                      'valid: {valid_f1_score:.5f}'.format(valid_f1_score = valid_f1_score))
                print()
                """
                # NOTE(review): this prints the VALIDATION F1 under the
                # 'train' label — presumably to match a required output
                # format; confirm against the assignment spec.
                print('[part 1.2] {name}\t{scaling_name}\tk: {k:d}\t'.format(name = func, scaling_name = sc, k = k) +
                      'train: {train_f1_score:.5f}\t'.format(train_f1_score = temp_f1))
                print()
                if temp_f1 > max_score:
                    max_score = temp_f1
                    best_model = model
                    best_k = k
                    best_func = func
                    best_scaler = sc
                # On an exact tie, prefer the smaller k; note this compares
                # k across different functions and scalers too.
                if temp_f1 == max_score:
                    if k < best_k:
                        max_score = temp_f1
                        best_model = model
                        best_k = k
                        best_func = func
                        best_scaler = sc
    print('[part 1.2] {name}\t{scaling_name}\t'.format(name = best_func, scaling_name = best_scaler) +
          'best_k: {best_k:d}\ttest: {test_f1_score:.5f}'.format(best_k = best_k, test_f1_score = max_score))
    print()
    """
    #Dont change any print statement
    print('[part 1.2] {name}\t{scaling_name}\tk: {k:d}\t'.format(name=name, scaling_name=scaling_name, k=k) +
          'train: {train_f1_score:.5f}\t'.format(train_f1_score=train_f1_score) +
          'valid: {valid_f1_score:.5f}'.format(valid_f1_score=valid_f1_score))
    print()
    print('[part 1.2] {name}\t{scaling_name}\t'.format(name=name, scaling_name=scaling_name) +
          'best_k: {best_k:d}\ttest: {test_f1_score:.5f}'.format(best_k=best_k, test_f1_score=test_f1_score))
    print()
    """
    return best_model, best_k, best_func, best_scaler
def model_selection_without_normalization(distance_funcs, Xtrain, ytrain, Xval, yval):
    """Select the best KNN configuration without feature scaling.

    Args:
        distance_funcs: dictionary of distance functions.
        Xtrain: List[List[int]] train set.
        ytrain: List[int] train labels.
        Xval: List[List[int]] validation set.
        yval: List[int] validation labels.

    Returns:
        best_model: the selected KNN instance.
        best_k: the k chosen for best_model.
        best_func: chosen distance function (NAME once any candidate wins;
            the initial default is the function OBJECT — see note below).
    """
    #raise NotImplementedError
    best_model = None
    best_k = 0
    # NOTE(review): this default is the function OBJECT, but on any
    # improvement below it is replaced by the NAME string — the return
    # type is inconsistent in the no-winner case; confirm intended.
    best_func = distance_funcs['euclidean']
    max_score = 0
    # Upper bound for k: 30, or len(Xtrain) - 1 for small training sets.
    n = 30
    if len(Xtrain) < 30:
        n = len(Xtrain) - 1
    for func in distance_funcs:
        for k in range(1, n, 2):  # odd k only
            model = KNN(k, distance_funcs[func])
            model.train(Xtrain, ytrain)
            predicted = model.predict(Xval)
            temp_f1 = f1_score(yval, predicted)
            # NOTE(review): prints the VALIDATION F1 under the 'train'
            # label — presumably to match a required output format;
            # confirm against the assignment spec.
            print('[part 1.1] {name}\tk: {k:d}\t'.format(name = func, k = k) +
                  'train: {train_f1_score:.5f}\t'.format(train_f1_score = temp_f1))
            print()
            if temp_f1 > max_score:
                max_score = temp_f1
                best_model = model
                best_k = k
                best_func = func
            # On an exact tie, prefer the smaller k; this comparison also
            # applies across different distance functions.
            if temp_f1 == max_score:
                if k < best_k:
                    max_score = temp_f1
                    best_model = model
                    best_k = k
                    best_func = func
    print('[part 1.1] {name}\tbest_k: {best_k:d}\t'.format(name = best_func, best_k = best_k) +
          'test f1 score: {test_f1_score:.5f}'.format(test_f1_score = max_score))
    print()
    """
    #Dont change any print statement
    print('[part 1.1] {name}\tk: {k:d}\t'.format(name=name, k=k) +
          'train: {train_f1_score:.5f}\t'.format(train_f1_score=train_f1_score) +
          'valid: {valid_f1_score:.5f}'.format(valid_f1_score=valid_f1_score))
    print()
    print('[part 1.1] {name}\tbest_k: {best_k:d}\t'.format(name=name, best_k=best_k) +
          'test f1 score: {test_f1_score:.5f}'.format(test_f1_score=test_f1_score))
    print()
    """
    return best_model, best_k, best_func
def test_most_freq():
    """Smoke-check KNN.most_frequent against the module-level test_array."""
    classifier = KNN(5, 'euclidean_distance')
    result = classifier.most_frequent(test_array)
    print(result)
def model_selection_with_transformation(distance_funcs, scaling_classes, Xtrain, ytrain, Xval, yval):
    """Select the best KNN configuration with feature scaling and
    deterministic tie-breaking.

    Args:
        distance_funcs: dictionary of distance functions.
        scaling_classes: dictionary of scaler classes.
        Xtrain: List[List[int]] train set.
        ytrain: List[int] train labels.
        Xval: List[List[int]] validation set.
        yval: List[int] validation labels.

    Returns:
        best_model: the selected KNN instance.
        best_k: the k chosen for best_model.
        best_func: NAME of the chosen distance function.
        best_scaler: NAME of the chosen scaler.
    """
    # k is capped by the number of training points and by 30.
    upper_bound = len(Xtrain)
    if upper_bound > 30:
        upper_bound = 30
    max_f1 = []
    for scale_class_name, scaling_class in scaling_classes.items():
        scale_obj = scaling_class()
        if scale_class_name == 'normalize':
            # 'normalize' is applied on the TRANSPOSED matrix and then
            # transposed back — presumably to scale along the other axis
            # than the scaler's default; confirm against transpose_list
            # and the scaler's definition.
            trans_Xtrain = transpose_list(scale_obj(transpose_list(Xtrain)))
            trans_Xval = transpose_list(scale_obj(transpose_list(Xval)))
        else:
            trans_Xtrain = scale_obj(Xtrain)
            trans_Xval = scale_obj(Xval)
        for dist_func_name, distance_func in distance_funcs.items():
            # Best (score, k, model) per (scaler, function) pair; strict
            # '>' keeps the SMALLEST k on equal scores within the pair.
            max_score = -1
            min_k = 0
            best_model = []
            for k in range(1, upper_bound, 2):  # odd k only
                knn = KNN(k, distance_func)
                knn.train(trans_Xtrain, ytrain)
                pred_labels = knn.predict(trans_Xval)
                curr_f1 = f1_score(yval, pred_labels)
                if curr_f1 > max_score:
                    max_score = curr_f1
                    min_k = k
                    best_model = knn
            max_f1.append((max_score, scale_class_name, dist_func_name, min_k, best_model))
    # Highest F1 first; remaining ties are broken by scaler priority,
    # then by distance-function priority.
    max_f1.sort(reverse=True)
    # filter ties
    # NOTE(review): assumes filter_ties keeps the entries tied on the
    # leading element — confirm against its definition.
    majority = filter_ties(max_f1)
    # break ties
    SORT_ORDER_SCALAR = {"min_max_scale": 0, "normalize": 1}
    majority.sort(key=lambda val: SORT_ORDER_SCALAR[val[1]])
    majority = filter_ties(majority)
    SORT_ORDER = {
        "euclidean": 0,
        "gaussian": 1,
        "inner_prod": 2,
        "cosine_dist": 3
    }
    majority.sort(key=lambda val: SORT_ORDER[val[2]])
    return majority[0][4], majority[0][3], majority[0][2], majority[0][1]