def model_selection_without_normalization(distance_funcs, Xtrain, ytrain, Xval, yval):
    # distance_funcs: dictionary of distance functions
    # Xtrain: List[List[int]] train set; ytrain: List[int] train labels
    # Xval: List[List[int]] validation set; yval: List[int] validation labels
    # return best_model: an instance of KNN
    # return best_k / best_func: the best k and distance function chosen for best_model
    allknn = []
    performance = []
    function = []
    for key, value in distance_funcs.items():
        for k in range(1, min(30, len(Xtrain)), 2):
            knn = KNN(k, value)
            knn.train(Xtrain, ytrain)
            answer = knn.predict(Xval)
            score = f1_score(yval, answer)
            allknn.append(knn)
            function.append(key)
            performance.append(score)
    best_idx = int(np.argmax(np.array(performance)))
    result = allknn[best_idx]
    best_function = function[best_idx]
    return result, result.k, best_function
def model_selection_without_normalization(distance_funcs, Xtrain, ytrain, Xval, yval):
    Xtrain = np.array(Xtrain, dtype=float)
    ytrain = np.array(ytrain, dtype=int)
    Xval = np.array(Xval, dtype=float)
    yval = np.array(yval, dtype=int)
    f1 = np.zeros((30, len(distance_funcs)))
    upper_k = min(30, len(Xtrain))
    best_model, best_k, best_func = None, -1, None
    m = 0
    for k in range(1, upper_k, 2):
        c = 0
        for j in distance_funcs:
            inst = KNN(k, distance_funcs[j])
            inst.train(Xtrain, ytrain)
            pred_val = inst.predict(Xval)
            f1[k][c] = f1_score(yval, pred_val)
            if f1[k][c] > m:
                best_k = k
                best_func = j
                m = f1[k][c]
                best_model = inst
            c = c + 1
    return best_model, best_k, best_func
def model_selection_without_normalization(distance_funcs, Xtrain, ytrain, Xval, yval):
    # distance_funcs: dictionary of distance functions
    # Xtrain: List[List[int]] train set; ytrain: List[int] train labels
    # Xval: List[List[int]] validation set; yval: List[int] validation labels
    # return best_model: an instance of KNN
    # return best_k / best_func: the best k and distance function chosen for best_model
    distance_funcs_list = ['euclidean', 'gaussian', 'inner_prod', 'cosine_dist']
    best_f1 = 0
    best_model = None
    best_k = -1
    best_func = "*"
    for k in range(1, 30, 2):
        if k < len(Xtrain):
            for func_string in distance_funcs_list:
                knn = KNN(k, distance_funcs[func_string])
                knn.train(Xtrain, ytrain)
                predicted_vals = knn.predict(Xval)
                curr_f1 = f1_score(yval, predicted_vals)
                if curr_f1 > best_f1:
                    best_f1 = curr_f1
                    best_model = knn
                    best_k = k
                    best_func = func_string
    return best_model, best_k, best_func
def model_selection_without_normalization(distance_funcs, Xtrain, ytrain, Xval, yval):
    func_names = list(distance_funcs.keys())
    func_names.reverse()
    best_model = KNN(k=np.inf, distance_function=None)
    best_name = func_names[0]
    best_valid_f1_score = -np.inf
    for k in range(1, min(30, len(Xtrain) - 1), 2):
        for name, distance_func in distance_funcs.items():
            model = KNN(k=k, distance_function=distance_func)
            model.train(Xtrain, ytrain)
            train_f1_score = f1_score(ytrain, model.predict(Xtrain))
            valid_f1_score = f1_score(yval, model.predict(Xval))
            # On ties, prefer the function that appears earlier in distance_funcs
            # (func_names is reversed, so a larger index means an earlier entry).
            if valid_f1_score > best_valid_f1_score or \
                    (valid_f1_score == best_valid_f1_score and
                     func_names.index(name) > func_names.index(best_name)):
                best_model = model
                best_name = name
                best_valid_f1_score = valid_f1_score
    return best_model, best_model.k, best_name
def model_selection_with_transformation(distance_funcs, scaling_classes, Xtrain, ytrain, Xval, yval):
    # distance_funcs: dictionary of distance functions
    # scaling_classes: dictionary of scaler classes
    # Xtrain: List[List[int]] train set; ytrain: List[int] train labels
    # Xval: List[List[int]] validation set; yval: List[int] validation labels
    # return best_model: an instance of KNN
    # return best_k / best_func / best_scaler: the best k, distance function, and scaler chosen
    best_f1_score, best_k = 0, -1
    best_model, best_func, best_scaler = None, None, None
    for scaling_name, scaling_class in scaling_classes.items():
        for name, func in distance_funcs.items():
            scaler = scaling_class()
            train_features_scaled = scaler(Xtrain)
            valid_features_scaled = scaler(Xval)
            k_lim = len(Xtrain) - 1
            for k in range(1, min(31, k_lim), 2):
                model = KNN(k=k, distance_function=func)
                model.train(train_features_scaled, ytrain)
                valid_f1_score = f1_score(yval, model.predict(valid_features_scaled))
                if valid_f1_score > best_f1_score:
                    best_f1_score, best_k = valid_f1_score, k
                    best_model, best_func, best_scaler = model, name, scaling_name
    return best_model, best_k, best_func, best_scaler
def model_selection_without_normalization(distance_funcs, Xtrain, ytrain, Xval, yval):
    # distance_funcs: dictionary of distance functions
    # Xtrain: List[List[int]] train set; ytrain: List[int] train labels
    # Xval: List[List[int]] validation set; yval: List[int] validation labels
    # return best_model: an instance of KNN
    # return best_k / best_func: the best k and distance function chosen for best_model
    best_model = None
    best_k = 0
    best_f1_score = -1
    best_function = ""
    best_choices = []
    highest_range = min(len(Xtrain) - 1, 31)
    for name, distance_func in distance_funcs.items():
        for k in range(1, highest_range, 2):
            model = KNN(k, distance_function=distance_func)
            model.train(Xtrain, ytrain)
            train_f1_score = f1_score(ytrain, model.predict(Xtrain))
            valid_f1_score = f1_score(yval, model.predict(Xval))
            if best_f1_score < valid_f1_score:
                best_f1_score = valid_f1_score
                best_k = k
                best_model = model
                best_function = name
                best_choices.append([best_k, best_function, best_f1_score])
    print("best_k:", best_k, "best_function:", best_function, "best_f1_score:", best_f1_score)
    return best_model, best_k, best_function
def model_selection_without_normalization(distance_funcs, Xtrain, ytrain, Xval, yval):
    # distance_funcs: dictionary of distance functions
    # Xtrain: List[List[int]] train set; ytrain: List[int] train labels
    # Xval: List[List[int]] validation set; yval: List[int] validation labels
    # return best_model: an instance of KNN
    # return best_k / best_func: the best k and distance function chosen for best_model
    best_k = -1
    best_score_train = 0
    best_score_val = -1
    best_distance = ""
    best_model = None
    if len(Xtrain) <= 30:
        K = len(Xtrain) - 1
    else:
        K = 30
    for key, val in distance_funcs.items():
        k = 1
        while k <= K:
            kNN = KNN(k, val)
            kNN.train(Xtrain, ytrain)
            yval_pred = kNN.predict(Xval)
            valid_f1_score = f1_score(yval, yval_pred)
            ytrain_pred = kNN.predict(Xtrain)
            train_f1_score = f1_score(ytrain, ytrain_pred)
            if best_score_val < valid_f1_score:
                best_k = k
                best_score_val = valid_f1_score
                best_score_train = train_f1_score
                best_distance = key
                best_model = kNN
            k = k + 2
    return best_model, best_k, best_distance
def model_selection_with_transformation(distance_funcs, scaling_classes, Xtrain, ytrain, Xval, yval):
    # Don't change any print statement
    best_f1_score, best_k = 0, -1
    best_function = None
    best_scaler = None
    for scaling_name, scaling_class in scaling_classes.items():
        for name, func in distance_funcs.items():
            scaler = scaling_class()
            train_features_scaled = scaler(Xtrain)
            valid_features_scaled = scaler(Xval)
            for k in range(1, 30, 2):
                if len(Xtrain) < k:
                    break
                model = KNN(k=k, distance_function=func)
                model.train(train_features_scaled, ytrain)
                train_f1_score = f1_score(ytrain, model.predict(train_features_scaled))
                # Validate on the scaled validation features (the original passed the
                # unscaled Xval here, which mismatches the scaled training data).
                valid_f1_score = f1_score(yval, model.predict(valid_features_scaled))
                print('[part 2.2] {name}\t{scaling_name}\tk: {k:d}\t'.format(
                          name=name, scaling_name=scaling_name, k=k) +
                      'train: {train_f1_score:.5f}\t'.format(train_f1_score=train_f1_score) +
                      'valid: {valid_f1_score:.5f}'.format(valid_f1_score=valid_f1_score))
                if valid_f1_score > best_f1_score:
                    best_f1_score, best_k = valid_f1_score, k
                    best_function = name
                    best_scaler = scaling_name
    # Refit on train + validation with a fresh scaler, since the training set changes.
    scaler = scaling_classes.get(best_scaler)()
    combined_features_scaled = scaler(np.concatenate((Xtrain, Xval), axis=0))
    model = KNN(k=best_k, distance_function=distance_funcs[best_function])
    model.train(combined_features_scaled, np.concatenate((ytrain, yval), axis=0))
    return model, best_k, best_function, best_scaler
def model_selection_with_transformation(distance_funcs, scaling_classes, Xtrain, ytrain, Xval, yval):
    # Pre-compute the scaled train/validation features for each scaler.
    s_dict = {}
    for s_name, s_func in scaling_classes.items():
        scaling = s_func()
        s_dict[s_name] = [scaling(Xtrain), scaling(Xval)]
    func_names = list(distance_funcs.keys())
    func_names.reverse()
    scaling_names = list(scaling_classes.keys())
    scaling_names.reverse()
    best_model = KNN(k=np.inf, distance_function=None, scaling_class=None)
    best_name = func_names[0]
    best_scaling_name = scaling_names[0]
    best_valid_f1_score = -np.inf
    for k in range(1, min(30, len(Xtrain) - 1), 2):
        for name, distance_func in distance_funcs.items():
            for s_name in scaling_names:
                model = KNN(k=k, distance_function=distance_func,
                            scaling_class=scaling_classes[s_name])
                model.train(s_dict[s_name][0], ytrain)
                valid_f1_score = f1_score(yval, model.predict(s_dict[s_name][1]))
                # On ties, prefer the earlier scaler, then the earlier distance function
                # (the name lists are reversed, so a larger index means an earlier entry).
                if valid_f1_score > best_valid_f1_score or \
                        (valid_f1_score == best_valid_f1_score and
                         scaling_names.index(s_name) > scaling_names.index(best_scaling_name)) or \
                        (valid_f1_score == best_valid_f1_score and
                         scaling_names.index(s_name) == scaling_names.index(best_scaling_name) and
                         func_names.index(name) > func_names.index(best_name)):
                    best_model = model
                    best_name = name
                    best_scaling_name = s_name
                    best_valid_f1_score = valid_f1_score
    return best_model, best_model.k, best_name, best_scaling_name
def model_selection_without_normalization(distance_funcs, Xtrain, ytrain, Xval, yval):
    # distance_funcs: dictionary of distance functions
    # Xtrain: List[List[int]] train set; ytrain: List[int] train labels
    # Xval: List[List[int]] validation set; yval: List[int] validation labels
    # return best_model: an instance of KNN
    # return best_k / best_func: the best k and distance function chosen for best_model
    model = KNN(1, distance_funcs['euclidean'])
    optf1 = 0
    bestk = -1
    bestfunc = ''
    maxk = 29
    if len(Xtrain) < maxk:
        maxk = len(Xtrain) - 1
    for key_func in distance_funcs:
        k = 1
        while k <= maxk:
            model.train(Xtrain, ytrain)
            model.k = k
            model.distance_function = distance_funcs[key_func]
            ypre = model.predict(Xval)
            get_f1 = f1_score(yval, ypre)
            print('[part 1.1] {name}\tk: {k:d}\t'.format(name=key_func, k=k) +
                  'valid: {valid_f1_score:.5f}'.format(valid_f1_score=get_f1))
            print()
            if get_f1 > optf1:
                bestk = k
                bestfunc = key_func
                optf1 = get_f1
            k += 2
    print("bestk: ", bestk, "bestfunc: ", bestfunc)  # report the chosen function, not the last key
    model.k = bestk
    model.distance_function = distance_funcs[bestfunc]
    return model, bestk, bestfunc
def model_selection_without_normalization(distance_funcs, Xtrain, ytrain, Xval, yval):
    best_f1_score = -1
    best_k = 0
    best_distance_func_name = None
    for name, dist_func in distance_funcs.items():
        for k in range(1, 31, 2):
            if len(Xtrain) < k:
                break
            model = KNN(k=k, distance_function=dist_func)
            model.train(Xtrain, ytrain)
            train_f1_score = f1_score(ytrain, model.predict(Xtrain))
            valid_f1_score = f1_score(yval, model.predict(Xval))
            if valid_f1_score > best_f1_score:
                best_f1_score = valid_f1_score
                best_k = k
                best_distance_func_name = name
    # Refit the selected configuration on train + validation data.
    best_model = KNN(k=best_k,
                     distance_function=distance_funcs.get(best_distance_func_name))
    best_model.train(np.concatenate((Xtrain, Xval), axis=0),
                     np.concatenate((ytrain, yval), axis=0))
    return best_model, best_k, best_distance_func_name
def model_selection_without_normalization(distance_funcs, Xtrain, ytrain, Xval, yval):
    # distance_funcs: dictionary of distance functions
    # Xtrain: List[List[int]] train set; ytrain: List[int] train labels
    # Xval: List[List[int]] validation set; yval: List[int] validation labels
    # return best_model: an instance of KNN
    # return best_k / best_func: the best k and distance function chosen for best_model
    upper_bound = min(len(Xtrain), 30)
    max_f1 = []
    for key, distance_func in distance_funcs.items():
        max_score = -1
        min_k = 0
        best_model = None
        for k in range(1, upper_bound, 2):
            knn = KNN(k, distance_func)
            knn.train(Xtrain, ytrain)
            pred_labels = knn.predict(Xval)
            curr_f1 = f1_score(yval, pred_labels)
            if curr_f1 > max_score:
                max_score = curr_f1
                min_k = k
                best_model = knn
        max_f1.append((max_score, key, min_k, best_model))
    # Sort by score (descending), keep the tied best entries, then break ties
    # by the preferred order of distance functions.
    max_f1.sort(reverse=True, key=lambda val: val[0])
    majority = filter_ties(max_f1)
    SORT_ORDER = {"euclidean": 0, "gaussian": 1, "inner_prod": 2, "cosine_dist": 3}
    majority.sort(key=lambda val: SORT_ORDER[val[1]])
    return majority[0][3], majority[0][2], majority[0][1]
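# Note: filter_ties is referenced above but not defined in this file. A minimal
# sketch (an assumption, not the original helper): keep every entry whose leading
# score ties with the best score of a descending-sorted list.
def filter_ties(scored):
    # `scored` is assumed sorted by score (element 0) in descending order.
    if not scored:
        return []
    best_score = scored[0][0]
    return [entry for entry in scored if entry[0] == best_score]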
def model_selection_with_transformation(distance_funcs, scaling_classes, Xtrain, ytrain, Xval, yval):
    Xtrain = np.array(Xtrain, dtype=float)
    ytrain = np.array(ytrain, dtype=int)
    Xval = np.array(Xval, dtype=float)
    yval = np.array(yval, dtype=int)
    f1 = np.zeros((30, len(distance_funcs), len(scaling_classes)))
    upper_k = min(30, len(Xtrain))
    best_model, best_k, best_func, best_scaler = None, -1, None, None
    m = 0
    for k in range(1, upper_k, 2):
        c = 0
        for j in distance_funcs:
            c1 = 0
            for i in scaling_classes:
                # Always scale from the original (unscaled) features so the two
                # scalers are not applied on top of each other.
                scale = scaling_classes[i]()
                X_t = scale(Xtrain)
                X_v = scale(Xval)
                inst = KNN(k, distance_funcs[j])
                inst.train(X_t, ytrain)
                pred_val = inst.predict(X_v)
                f1[k][c][c1] = f1_score(yval, pred_val)
                if f1[k][c][c1] > m:
                    best_model = inst
                    best_k = k
                    best_func = j
                    best_scaler = i
                    m = f1[k][c][c1]
                c1 += 1
            c += 1
    return best_model, best_k, best_func, best_scaler
def model_selection_without_normalization(distance_funcs, Xtrain, ytrain, Xval, yval):
    best_f1_score, best_k = -1, 0
    best_function = None
    for name, func in distance_funcs.items():
        for k in range(1, 30, 2):
            if len(Xtrain) < k:
                break
            model = KNN(k=k, distance_function=func)
            model.train(Xtrain, ytrain)
            train_f1_score = f1_score(ytrain, model.predict(Xtrain))
            valid_f1_score = f1_score(yval, model.predict(Xval))
            print('[part 1.1] {name}\tk: {k:d}\t'.format(name=name, k=k) +
                  'train: {train_f1_score:.5f}\t'.format(train_f1_score=train_f1_score) +
                  'valid: {valid_f1_score:.5f}'.format(valid_f1_score=valid_f1_score))
            print()
            if valid_f1_score > best_f1_score:
                best_f1_score, best_k = valid_f1_score, k
                best_function = name
    # Refit the best configuration (not the last loop's function) on train + validation.
    model = KNN(k=best_k, distance_function=distance_funcs[best_function])
    model.train(np.concatenate((Xtrain, Xval), axis=0),
                np.concatenate((ytrain, yval), axis=0))
    return model, best_k, best_function
def test_knn2():
    from hw1_knn import KNN
    from utils import euclidean_distance
    result = []
    x = np.random.normal(size=(100, 2)).tolist()
    x = set([tuple(_) for _ in x])
    x = list([list(_) for _ in x])
    y = np.random.randint(low=0, high=5, size=(50)).flatten().tolist()
    x_test = x[50:]
    x = x[:50]
    for k in [1]:
        model = KNN(k=k, distance_function=euclidean_distance)
        model.train(x, y)
        result.append('[TEST KNN2],' + weights_to_string(model.predict(x_test), is_int=True))
    return result
def model_selection_with_transformation(distance_funcs, scaling_classes, Xtrain, ytrain, Xval, yval):
    # distance_funcs: dictionary of distance functions
    # scaling_classes: dictionary of scaler classes
    # Xtrain: List[List[int]] train set; ytrain: List[int] train labels
    # Xval: List[List[int]] validation set; yval: List[int] validation labels
    # return best_model: an instance of KNN
    # return best_k / best_func / best_scaler: the best k, distance function, and scaler chosen
    distance_funcs_list = ['euclidean', 'gaussian', 'inner_prod', 'cosine_dist']
    scaler_funcs_list = ['min_max_scale', 'normalize']
    best_f1 = 0
    best_model = None
    best_k = -1
    best_func = "*"
    best_scaler = "+"
    for k in range(1, 30, 2):
        for func_string in distance_funcs_list:
            for scaling_string in scaler_funcs_list:
                if k < len(Xtrain):
                    scaler_object = scaling_classes[scaling_string]()
                    scaled_Xtrain = scaler_object(Xtrain)
                    scaled_Xval = scaler_object(Xval)
                    knn = KNN(k, distance_funcs[func_string])
                    knn.train(scaled_Xtrain, ytrain)
                    predicted_vals = knn.predict(scaled_Xval)
                    curr_f1 = f1_score(yval, predicted_vals)
                    if curr_f1 > best_f1:
                        best_f1 = curr_f1
                        best_model = knn
                        best_k = k
                        best_func = func_string
                        best_scaler = scaling_string
    return best_model, best_k, best_func, best_scaler
def model_selection_with_transformation(distance_funcs, scaling_classes, Xtrain, ytrain, Xval, yval):
    # distance_funcs: dictionary of distance functions
    # scaling_classes: dictionary of scaler classes
    # Xtrain: List[List[int]] train set; ytrain: List[int] train labels
    # Xval: List[List[int]] validation set; yval: List[int] validation labels
    # return best_model: an instance of KNN
    # return best_k / best_func / best_scaler: the best k, distance function, and scaler chosen
    best_model = None
    best_scaler = ""
    best_k = 0
    best_f1_score = 0
    best_function = ""
    highest_range = min(len(Xtrain) - 1, 31)
    for scaler_name, scaling_class in scaling_classes.items():
        for name, distance_func in distance_funcs.items():
            scaler = scaling_class()
            Xtrain_scaled = scaler(Xtrain)
            Xval_scaled = scaler(Xval)
            for k in range(1, highest_range, 2):
                model = KNN(k, distance_function=distance_func)
                model.train(Xtrain_scaled, ytrain)
                train_f1_score = f1_score(ytrain, model.predict(Xtrain_scaled))
                valid_f1_score = f1_score(yval, model.predict(Xval_scaled))
                if best_f1_score < valid_f1_score:
                    best_f1_score = valid_f1_score
                    best_k = k
                    best_function = name
                    best_scaler = scaler_name
                    best_model = model
    return best_model, best_k, best_function, best_scaler
def model_selection_without_normalization(distance_funcs, Xtrain, ytrain, Xval, yval):
    # distance_funcs: dictionary of distance functions
    # Xtrain: List[List[int]] train set; ytrain: List[int] train labels
    # Xval: List[List[int]] validation set; yval: List[int] validation labels
    # return best_model: an instance of KNN
    # return best_k / best_func: the best k and distance function chosen for best_model
    best_f1_score, best_k = -1, 0
    best_model, best_func = None, None
    for name, func in distance_funcs.items():
        k_lim = len(Xtrain) - 1
        for k in range(1, min(31, k_lim), 2):
            model = KNN(k=k, distance_function=func)
            model.train(Xtrain, ytrain)
            valid_f1_score = f1_score(yval, model.predict(Xval))
            if valid_f1_score > best_f1_score:
                best_f1_score, best_k = valid_f1_score, k
                best_model, best_func = model, name
    return best_model, best_k, best_func
def model_selection_without_normalization(distance_funcs, Xtrain, ytrain, Xval, yval):
    # distance_funcs: dictionary of distance functions
    # Xtrain: List[List[int]] train set; ytrain: List[int] train labels
    # Xval: List[List[int]] validation set; yval: List[int] validation labels
    # return best_model: an instance of KNN
    # return best_k / best_func: the best k and distance function chosen for best_model
    best_model = None
    best_k = 0
    best_func = 'euclidean'
    max_score = 0
    n = 30
    if len(Xtrain) < 30:
        n = len(Xtrain) - 1
    for func in distance_funcs:
        for k in range(1, n, 2):
            model = KNN(k, distance_funcs[func])
            model.train(Xtrain, ytrain)
            predicted = model.predict(Xval)
            temp_f1 = f1_score(yval, predicted)
            print('[part 1.1] {name}\tk: {k:d}\t'.format(name=func, k=k) +
                  'valid: {valid_f1_score:.5f}'.format(valid_f1_score=temp_f1))
            print()
            if temp_f1 > max_score:
                max_score = temp_f1
                best_model = model
                best_k = k
                best_func = func
            elif temp_f1 == max_score and k < best_k:
                # Break ties in favor of the smaller k.
                best_model = model
                best_k = k
                best_func = func
    print('[part 1.1] {name}\tbest_k: {best_k:d}\t'.format(name=best_func, best_k=best_k) +
          'valid f1 score: {valid_f1_score:.5f}'.format(valid_f1_score=max_score))
    print()
    return best_model, best_k, best_func
def model_selection_with_transformation(distance_funcs, scaling_classes, Xtrain, ytrain, Xval, yval):
    # distance_funcs: dictionary of distance functions
    # scaling_classes: dictionary of scaler classes
    # Xtrain: List[List[int]] train set; ytrain: List[int] train labels
    # Xval: List[List[int]] validation set; yval: List[int] validation labels
    # return best_model: an instance of KNN
    # return best_k / best_func / best_scaler: the best k, distance function, and scaler chosen
    best_model = None
    best_k = 0
    best_func = 'euclidean'
    best_scaler = 'min_max_scale'
    max_score = 0
    n = 30
    if len(Xtrain) < 30:
        n = len(Xtrain) - 1
    for sc in scaling_classes:
        scaler = scaling_classes[sc]()
        scaled_train = scaler(Xtrain)
        scaled_val = scaler(Xval)
        for func in distance_funcs:
            for k in range(1, n, 2):
                model = KNN(k, distance_funcs[func])
                model.train(scaled_train, ytrain)
                predicted = model.predict(scaled_val)
                temp_f1 = f1_score(yval, predicted)
                print('[part 1.2] {name}\t{scaling_name}\tk: {k:d}\t'.format(
                          name=func, scaling_name=sc, k=k) +
                      'valid: {valid_f1_score:.5f}'.format(valid_f1_score=temp_f1))
                print()
                if temp_f1 > max_score:
                    max_score = temp_f1
                    best_model = model
                    best_k = k
                    best_func = func
                    best_scaler = sc
                elif temp_f1 == max_score and k < best_k:
                    # Break ties in favor of the smaller k.
                    best_model = model
                    best_k = k
                    best_func = func
                    best_scaler = sc
    print('[part 1.2] {name}\t{scaling_name}\t'.format(name=best_func, scaling_name=best_scaler) +
          'best_k: {best_k:d}\tvalid: {valid_f1_score:.5f}'.format(
              best_k=best_k, valid_f1_score=max_score))
    print()
    return best_model, best_k, best_func, best_scaler
def model_selection_with_transformation(distance_funcs, scaling_classes, Xtrain, ytrain, Xval, yval):
    # distance_funcs: dictionary of distance functions
    # scaling_classes: dictionary of scaler classes
    # Xtrain: List[List[int]] train set; ytrain: List[int] train labels
    # Xval: List[List[int]] validation set; yval: List[int] validation labels
    # return best_model: an instance of KNN
    # return best_k / best_func / best_scaler: the best k, distance function, and scaler chosen
    upper_bound = min(len(Xtrain), 30)
    max_f1 = []
    for scale_class_name, scaling_class in scaling_classes.items():
        scale_obj = scaling_class()
        if scale_class_name == 'normalize':
            # This scaler operates per row, so transpose before and after to scale per column.
            trans_Xtrain = transpose_list(scale_obj(transpose_list(Xtrain)))
            trans_Xval = transpose_list(scale_obj(transpose_list(Xval)))
        else:
            trans_Xtrain = scale_obj(Xtrain)
            trans_Xval = scale_obj(Xval)
        for dist_func_name, distance_func in distance_funcs.items():
            max_score = -1
            min_k = 0
            best_model = None
            for k in range(1, upper_bound, 2):
                knn = KNN(k, distance_func)
                knn.train(trans_Xtrain, ytrain)
                pred_labels = knn.predict(trans_Xval)
                curr_f1 = f1_score(yval, pred_labels)
                if curr_f1 > max_score:
                    max_score = curr_f1
                    min_k = k
                    best_model = knn
            max_f1.append((max_score, scale_class_name, dist_func_name, min_k, best_model))
    # Keep the entries tied with the best score, then break ties by the preferred
    # scaler order and finally by the preferred distance-function order.
    max_f1.sort(reverse=True, key=lambda val: val[0])
    majority = filter_ties(max_f1)
    SORT_ORDER_SCALER = {"min_max_scale": 0, "normalize": 1}
    majority.sort(key=lambda val: SORT_ORDER_SCALER[val[1]])
    majority = filter_ties(majority)
    SORT_ORDER = {"euclidean": 0, "gaussian": 1, "inner_prod": 2, "cosine_dist": 3}
    majority.sort(key=lambda val: SORT_ORDER[val[2]])
    return majority[0][4], majority[0][3], majority[0][2], majority[0][1]
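# Note: transpose_list is referenced above but not defined in this file. A minimal
# sketch (an assumption, not the original helper): transpose a 2-D list so rows
# become columns, letting a per-row scaler operate per column.
def transpose_list(matrix):
    return [list(col) for col in zip(*matrix)]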
def model_selection_with_transformation(distance_funcs, scaling_classes, Xtrain, ytrain, Xval, yval):
    # distance_funcs: dictionary of distance functions
    # scaling_classes: dictionary of scaler classes
    # Xtrain: List[List[int]] train set; ytrain: List[int] train labels
    # Xval: List[List[int]] validation set; yval: List[int] validation labels
    # return best_model: an instance of KNN
    # return best_k / best_func / best_scaler: the best k, distance function, and scaler chosen
    best_model = None
    best_f1_score = None
    best_k = None
    current_k = 1
    best_method = None
    best_scaling_method = None
    # Preference order used to break ties.
    order1 = ['euclidean', 'gaussian', 'inner_prod', 'cosine_dist']
    order2 = ['min_max_scale', 'normalize']
    max_k = len(ytrain) - 1 if len(ytrain) < 30 else 30
    # Loop over the scaling methods.
    for scaling_method in scaling_classes.keys():
        scaling_class = scaling_classes[scaling_method]
        scaler = scaling_class()
        # Scale the features; the labels are already 0/1 and need no scaling.
        scaled_Xtrain = scaler(Xtrain)
        scaled_Xval = scaler(Xval)
        # Start with k = 1 and increase by 2 until the cap is reached.
        while current_k < max_k:
            for method in distance_funcs.keys():
                distance_func = distance_funcs[method]
                kNNClassifier = KNN(current_k, distance_func)
                kNNClassifier.train(scaled_Xtrain, ytrain)
                # The best configuration is the one with the highest validation F1 score.
                kNNF1Score = f1_score(yval, kNNClassifier.predict(scaled_Xval))
                if best_f1_score is None or best_f1_score < kNNF1Score:
                    best_f1_score = kNNF1Score
                    best_k = current_k
                    best_model = kNNClassifier
                    best_method = method
                    best_scaling_method = scaling_method
                elif best_f1_score == kNNF1Score:
                    # Break ties by the preferred scaler order, then by the preferred
                    # distance-function order.
                    if order2.index(scaling_method) < order2.index(best_scaling_method) or \
                            (order2.index(scaling_method) == order2.index(best_scaling_method) and
                             order1.index(method) < order1.index(best_method)):
                        best_k = current_k
                        best_model = kNNClassifier
                        best_method = method
                        best_scaling_method = scaling_method
            current_k += 2
        # Reset the counter for the next scaling method.
        current_k = 1
    return best_model, best_k, best_method, best_scaling_method
def model_selection_without_normalization(distance_funcs, Xtrain, ytrain, Xval, yval):
    # distance_funcs: dictionary of distance functions
    # Xtrain: List[List[int]] train set; ytrain: List[int] train labels
    # Xval: List[List[int]] validation set; yval: List[int] validation labels
    # return best_model: an instance of KNN
    # return best_k / best_func: the best k and distance function chosen for best_model
    best_model = None
    best_k = None
    best_f1_score = None
    current_k = 1
    best_method = None
    # Preference order of distance functions, used to break ties.
    order = ["euclidean", "gaussian", "inner_prod", "cosine_dist"]
    max_k = len(ytrain) - 1 if len(ytrain) < 30 else 30
    # Start with k = 1 and increase by 2 until the cap is reached.
    while current_k < max_k:
        for method in distance_funcs.keys():
            distance_func = distance_funcs[method]
            kNNClassifier = KNN(current_k, distance_func)
            kNNClassifier.train(Xtrain, ytrain)
            # The best configuration is the one with the highest validation F1 score.
            kNNF1Score = f1_score(yval, kNNClassifier.predict(Xval))
            if best_f1_score is None or best_f1_score < kNNF1Score:
                best_f1_score = kNNF1Score
                best_k = current_k
                best_model = kNNClassifier
                best_method = method
            elif best_f1_score == kNNF1Score and order.index(method) < order.index(best_method):
                # Break ties by the preferred order of distance functions.
                best_k = current_k
                best_model = kNNClassifier
                best_method = method
        current_k += 2
    return best_model, best_k, best_method
def model_selection_with_transformation(distance_funcs, scaling_classes, Xtrain, ytrain, Xval, yval):
    # distance_funcs: dictionary of distance functions
    # scaling_classes: dictionary of scaler classes
    # Xtrain: List[List[int]] train set; ytrain: List[int] train labels
    # Xval: List[List[int]] validation set; yval: List[int] validation labels
    # return best_model: an instance of KNN
    # return best_k / best_func / best_scaler: the best k, distance function, and scaler chosen
    model = KNN(1, distance_funcs['euclidean'])  # initialize
    bestk = 1
    bestfunc = 'euclidean'
    bestscaler = 'min_max_scale'
    optf1 = 0
    kmax = 29
    if len(Xtrain) < kmax:
        kmax = len(Xtrain) - 1
    for scaling_name in scaling_classes:
        scaling = scaling_classes[scaling_name]()
        New_Xtrain = scaling(Xtrain)
        New_Xval = scaling(Xval)
        for key_func in distance_funcs:
            k = 1
            while k < kmax:
                model.k = k
                model.distance_function = distance_funcs[key_func]
                model.train(New_Xtrain, ytrain)
                ypreval = model.predict(New_Xval)
                get_f1 = f1_score(yval, ypreval)
                if get_f1 > optf1:
                    bestk = k
                    bestfunc = key_func
                    bestscaler = scaling_name
                    optf1 = get_f1
                print('[part 1.2] {name}\t{scaling_name}\tk: {k:d}\t'.format(
                          name=key_func, scaling_name=scaling_name, k=k) +
                      'valid: {valid_f1_score:.5f}'.format(valid_f1_score=get_f1))
                print()
                k += 2
    # Configure the returned model with the best choices (not the last loop values).
    model.k = bestk
    model.distance_function = distance_funcs[bestfunc]
    model.scale = scaling_classes[bestscaler]
    print("bestk: ", bestk, "bestfunc: ", bestfunc, "bestscale: ", bestscaler)
    return model, bestk, bestfunc, bestscaler
def model_selection_with_transformation(distance_funcs, scaling_classes, Xtrain, ytrain, Xval, yval):
    # distance_funcs: dictionary of distance functions
    # scaling_classes: dictionary of scaler classes
    # Xtrain: List[List[int]] train set; ytrain: List[int] train labels
    # Xval: List[List[int]] validation set; yval: List[int] validation labels
    # return best_model: an instance of KNN
    # return best_k / best_func / best_scaler: the best k, distance function, and scaler chosen
    best_k = -1
    best_score_train = 0
    best_score_val = -1
    best_distance = ""
    scaling_instances = []
    scaling_class_name = []
    best_model = None
    if len(Xtrain) <= 30:
        K = len(Xtrain) - 1
    else:
        K = 30
    for key, val in scaling_classes.items():
        scaling_instances.append(val())
        scaling_class_name.append(key)
    scaling_name = scaling_class_name[0]
    for i in range(len(scaling_instances)):
        Xtrain_n = scaling_instances[i](Xtrain)
        Xval_n = scaling_instances[i](Xval)
        for key, val in distance_funcs.items():
            k = 1
            while k <= K:
                kNN = KNN(k, val)
                kNN.train(Xtrain_n, ytrain)
                yval_pred = kNN.predict(Xval_n)
                valid_f1_score = f1_score(yval, yval_pred)
                ytrain_pred = kNN.predict(Xtrain_n)
                train_f1_score = f1_score(ytrain, ytrain_pred)
                if best_score_val < valid_f1_score:
                    best_k = k
                    best_score_val = valid_f1_score
                    best_score_train = train_f1_score
                    best_distance = key
                    scaling_name = scaling_class_name[i]
                    best_model = kNN
                k += 2
    return best_model, best_k, best_distance, scaling_name
point1 = [1, 2, 3]
point2 = [3, 5, 7]
print(cosine_sim_distance(point1, point2))

"""
model_selection_without_normalization(distance_funcs, minmax_test, labels, minmax_test2, labels)

knn_dataset = [[0, 0], [4, 4], [2, 2], [3, 3], [1, 1], [5, 5]]
knn_labels = [1, 0, 1, 1, 0, 0]
knn_test = KNN(2, distance_funcs['euclidean'])
knn_test.train(knn_dataset, knn_labels)
print("KNN mapping:")
print(knn_test.mapping)

pred_test = [[3, 3]]
print(knn_test.predict(pred_test))
print()
print("k nearest neighbors:")
print(knn_test.get_k_neighbors(pred_test[0]))
print()
for d in knn_dataset:
    print(euclidean_distance(pred_test[0], d))
"""
class TestKNN(TestCase):
    def setUp(self):
        self.knn = KNN(1, euclidean_distance)

    def test_train(self):
        features = [[1, 1], [1, 2], [2, 2], [9, 9], [8, 8], [8, 9]]
        values = [0, 0, 0, 1, 1, 1]
        self.knn.train(features, values)
        point1 = [0, 0]
        neighbor = self.knn.get_neighbors(point1)
        self.assertEqual(0, self.knn.get_response(neighbor))
        numpy.testing.assert_array_equal(numpy.array([[1, 1, 0]]), neighbor)
        point2 = [10, 10]
        neighbor = self.knn.get_neighbors(point2)
        numpy.testing.assert_array_equal(numpy.array([[9, 9, 1]]), neighbor)
        self.assertEqual(1, self.knn.get_response(neighbor))

    @skip("clarify inner product")
    def test_inner_product_knn(self):
        knn = KNN(1, inner_product_distance)
        features = [[1, 1], [1, 2], [2, 2], [9, 9], [8, 8], [8, 9]]
        values = [0, 0, 0, 1, 1, 1]
        knn.train(features, values)
        point1 = [0, 0]
        neighbor = knn.get_neighbors(point1)
        self.assertEqual(0, knn.get_response(neighbor))
        numpy.testing.assert_array_equal(numpy.array([[1, 1, 0]]), neighbor)
        point2 = [10, 10]
        neighbor = knn.get_neighbors(point2)
        numpy.testing.assert_array_equal(numpy.array([[1, 1, 0]]), neighbor)
        self.assertEqual(1, knn.get_response(neighbor))

    def test_predict(self):
        features = [[1, 1], [1, 2], [2, 2], [9, 9], [8, 8], [8, 9]]
        values = [0, 0, 0, 1, 1, 1]
        self.knn.train(features, values)
        points = [[0, 0], [10, 10]]
        self.assertListEqual([0, 1], self.knn.predict(points))

    def test_knn(self):
        features, labels = generate_data_cancer()
        train_features, train_labels = features[:400], labels[:400]
        valid_features, valid_labels = features[400:460], labels[400:460]
        test_features, test_labels = features[460:], labels[460:]
        assert len(train_features) == len(train_labels) == 400
        assert len(valid_features) == len(valid_labels) == 60
        assert len(test_features) == len(test_labels) == 109
        distance_funcs = {
            # 'euclidean': euclidean_distance,
            # 'gaussian': gaussian_kernel_distance,
            'inner_prod': inner_product_distance,
        }
        for name, func in distance_funcs.items():
            best_f1_score, best_k = -1, 0
            for k in [1]:
                model = KNN(k=k, distance_function=func)
                model.train(train_features, train_labels)
                train_f1_score = f1_score(train_labels, model.predict(train_features))
                valid_f1_score = f1_score(valid_labels, model.predict(valid_features))
                print(f'[part 2.1] {name}\tk: {k:d}\t'
                      f'train: {train_f1_score:.5f}\t'
                      f'valid: {valid_f1_score:.5f}')
                if valid_f1_score > best_f1_score:
                    best_f1_score, best_k = valid_f1_score, k
            model = KNN(k=best_k, distance_function=func)
            model.train(train_features + valid_features, train_labels + valid_labels)
            test_f1_score = f1_score(test_labels, model.predict(test_features))
            print()
            print(f'[part 2.1] {name}\tbest_k: {best_k:d}\t'
                  f'test f1 score: {test_f1_score:.5f}')
            print()

    def test_normalization(self):
        scaling_functions = {
            'min_max_scale': MinMaxScaler,
            'normalize': NormalizationScaler,
        }
        distance_funcs = {
            'euclidean': euclidean_distance,
            'gaussian': gaussian_kernel_distance,
            'inner_prod': inner_product_distance,
        }
        features, labels = generate_data_cancer()
        train_features, train_labels = features[:400], labels[:400]
        valid_features, valid_labels = features[400:460], labels[400:460]
        test_features, test_labels = features[460:], labels[460:]
        assert len(train_features) == len(train_labels) == 400
        assert len(valid_features) == len(valid_labels) == 60
        assert len(test_features) == len(test_labels) == 109
        for scaling_name, scaling_class in scaling_functions.items():
            for name, func in distance_funcs.items():
                scaler = scaling_class()
                train_features_scaled = scaler(train_features)
                valid_features_scaled = scaler(valid_features)
                best_f1_score, best_k = 0, -1
                for k in [1, 3, 10, 20, 50]:
                    model = KNN(k=k, distance_function=func)
                    model.train(train_features_scaled, train_labels)
                    train_f1_score = f1_score(
                        train_labels, model.predict(train_features_scaled))
                    valid_f1_score = f1_score(
                        valid_labels, model.predict(valid_features_scaled))
                    print('[part 2.2] {name}\t{scaling_name}\tk: {k:d}\t'.format(
                              name=name, scaling_name=scaling_name, k=k) +
                          'train: {train_f1_score:.5f}\t'.format(train_f1_score=train_f1_score) +
                          'valid: {valid_f1_score:.5f}'.format(valid_f1_score=valid_f1_score))
                    if valid_f1_score > best_f1_score:
                        best_f1_score, best_k = valid_f1_score, k
                # Use a fresh scaler here, since the training set changes.
                scaler = scaling_class()
                combined_features_scaled = scaler(train_features + valid_features)
                test_features_scaled = scaler(test_features)
                model = KNN(k=best_k, distance_function=func)
                model.train(combined_features_scaled, train_labels + valid_labels)
                test_f1_score = f1_score(test_labels, model.predict(test_features_scaled))
                print()
                print('[part 2.2] {name}\t{scaling_name}\t'.format(
                          name=name, scaling_name=scaling_name) +
                      'best_k: {best_k:d}\ttest: {test_f1_score:.5f}'.format(
                          best_k=best_k, test_f1_score=test_f1_score))
                print()