def model_selection_with_transformation(distance_funcs, scaling_classes, Xtrain, ytrain, Xval, yval):
    """Select the best (scaler, distance function, k) combination for KNN.

    Each scaler class in ``scaling_classes`` is instantiated and applied to
    the training features and then to the validation features. For every
    distance function and every odd ``k`` in ``1, 3, ...`` below
    ``len(Xtrain)``, a KNN model trained on the scaled training set is scored
    on the scaled validation set with ``f1_score``. The highest score wins;
    on an exactly equal score the candidate with the smaller ``k`` wins.

    Args:
        distance_funcs: dict mapping a distance-function name to the function.
        scaling_classes: dict mapping a scaler name to a scaler class. The
            class is called with no arguments, and the resulting instance is
            called with a feature list to obtain the scaled features.
        Xtrain: List[List[int]], training features.
        ytrain: List[int], training labels.
        Xval: List[List[int]], validation features.
        yval: List[int], validation labels.

    Returns:
        A tuple ``(best_model, best_k, best_func, best_scale)``:
        ``best_model`` is a KNN instance built with the winning ``k`` and
        distance function, trained on the training set as scaled by the
        winning scaler, with the winning scaler class stored on its
        ``scaler`` attribute; ``best_k`` is the winning ``k``; ``best_func``
        is the name of the winning distance function; ``best_scale`` is the
        name of the winning scaler.

    Raises:
        ValueError: if no candidate could be selected (fewer than two
            training points, an empty ``distance_funcs`` or
            ``scaling_classes``, or no comparable validation score).
    """
    # Odd k only, bounded above by n - 1 where n is the number of training
    # points (the upper bound follows the grading instructions).
    kvals = np.arange(1, len(Xtrain), 2)

    best_scale = None
    best_distance = None
    best_func = ""
    best_k = 0
    best_scaled_train = None
    # Start below the lowest score we expect so that a validation F1 of 0.0
    # still selects the first candidate instead of leaving no winner.
    # Assumes f1_score never returns a negative value.
    max_score = -1.0

    # The training F1 is not computed in this function; the report lines
    # below print this placeholder value.
    train_f1_score = 1.0
    valid_f1_score = 0.0

    for scaling_name, new_scaler in scaling_classes.items():
        print(scaling_name, new_scaler)
        scaler = new_scaler()
        # The scaler is applied to the training data first, then to the
        # validation data, in that order.
        scaled_Xtrain = scaler(Xtrain)
        scaled_Xval = scaler(Xval)

        for name, f in distance_funcs.items():
            # Train once per (scaler, distance function); afterwards only
            # the neighbour count changes between evaluations.
            model = None
            for k in kvals:
                if model is None:
                    model = KNN(k, f)
                    model.train(scaled_Xtrain, ytrain)
                else:
                    model.k = k

                valid_f1_score = f1_score(yval, model.predict(scaled_Xval))

                is_better = valid_f1_score > max_score
                is_tie_with_smaller_k = valid_f1_score == max_score and k < best_k
                if is_better or is_tie_with_smaller_k:
                    max_score = valid_f1_score
                    best_distance = f
                    best_k = k
                    best_func = name
                    best_scale = scaling_name
                    # Remember the features the winner was actually scored
                    # with, so the returned model can be trained on them.
                    best_scaled_train = scaled_Xtrain

    if best_distance is None:
        raise ValueError(
            "model selection needs at least two training points, one "
            "distance function and one scaler"
        )

    best_model = KNN(best_k, best_distance)
    best_model.scaler = scaling_classes[best_scale]
    # Train on the scaled features: the configuration was selected on scaled
    # data, so a model fitted on raw features would not match it.
    best_model.train(best_scaled_train, ytrain)

    # Dont change any print statement
    # NOTE: name, scaling_name, k and valid_f1_score hold the values of the
    # last evaluated candidate here, not necessarily those of the winner.
    print('[part 1.1] {name}\tk: {k:d}\t'.format(name=name, k=k) +
          'train:{train_f1_score:.5f}\t'.format(
              train_f1_score=train_f1_score) +
          'valid: {valid_f1_score:.5f}'.format(valid_f1_score=valid_f1_score))
    print('[part 1.2] {name}\t{scaling_name}\tk: {k:d}\t'.format(
        name=name, scaling_name=scaling_name, k=k) +
          'train: {train_f1_score:.5f}\t'.format(
              train_f1_score=train_f1_score) +
          'valid: {valid_f1_score:.5f}'.format(valid_f1_score=valid_f1_score))
    print()
    print('[part 1.2] {name}\t{scaling_name}\t'.format(
        name=name, scaling_name=scaling_name) +
          'best_k: {best_k:d}\t'.format(best_k=best_k))
    print()
    return best_model, best_k, best_func, best_scale
def model_selection_without_normalization(distance_funcs, Xtrain, ytrain, Xval, yval):
    """Select the best (distance function, k) combination for KNN.

    For every distance function and every odd ``k`` in ``1, 3, ...`` below
    ``len(Xtrain)``, a KNN model trained on ``Xtrain`` is scored on the
    validation set with ``f1_score``. The highest score wins; on an exactly
    equal score the candidate with the smaller ``k`` wins.

    Args:
        distance_funcs: dict mapping a distance-function name to the function.
        Xtrain: List[List[int]], training features.
        ytrain: List[int], training labels.
        Xval: List[List[int]], validation features.
        yval: List[int], validation labels.

    Returns:
        A tuple ``(best_model, best_k, best_function)``: ``best_model`` is a
        KNN instance built with the winning ``k`` and distance function and
        trained on ``Xtrain``; ``best_k`` is the winning ``k``;
        ``best_function`` is the name of the winning distance function.

    Raises:
        ValueError: if no candidate could be selected (fewer than two
            training points, an empty ``distance_funcs``, or no comparable
            validation score).
    """
    # Odd k only, bounded above by n - 1 where n is the number of training
    # points.
    kvals = np.arange(1, len(Xtrain), 2)

    best_distance = None
    best_function = ""
    best_k = 0
    # Start below the lowest score we expect so that a validation F1 of 0.0
    # still selects the first candidate instead of returning a model with
    # k == 0 and no distance function.
    # Assumes f1_score never returns a negative value.
    max_score = -1.0

    # The training F1 is not computed in this function; the report lines
    # below print this placeholder value.
    train_f1_score = 1.0
    valid_f1_score = 0.0

    for name, f in distance_funcs.items():
        # Train once per distance function; afterwards only the neighbour
        # count changes between evaluations.
        model = None
        for k in kvals:
            if model is None:
                model = KNN(k, f)
                model.train(Xtrain, ytrain)
            else:
                model.k = k

            valid_f1_score = f1_score(yval, model.predict(Xval))

            if valid_f1_score > max_score:
                max_score = valid_f1_score
                print("new valid score: ", valid_f1_score)
                best_distance = f
                best_function = name
                best_k = k
                print('**NEW BEST MODEL**')
            elif valid_f1_score == max_score and k < best_k:
                # Same score reached with fewer neighbours: prefer smaller k.
                print("new valid score: ", valid_f1_score)
                best_distance = f
                best_function = name
                best_k = k

    if best_distance is None:
        raise ValueError(
            "model selection needs at least two training points and one "
            "distance function"
        )

    best_model = KNN(best_k, best_distance)
    best_model.train(Xtrain, ytrain)

    # Dont change any print statement
    # NOTE: name, k and valid_f1_score hold the values of the last evaluated
    # candidate here, not necessarily those of the winner.
    print('[part 1.1] {name}\tk: {k:d}\t'.format(name=name, k=k) +
          'train: {train_f1_score:.5f}\t'.format(
              train_f1_score=train_f1_score) +
          'valid: {valid_f1_score:.5f}'.format(valid_f1_score=valid_f1_score))
    print('[part 1.1] {name}\tk: {k:d}\t'.format(name=name, k=k) +
          'train: {train_f1_score:.5f}\t'.format(
              train_f1_score=train_f1_score) +
          'valid: {valid_f1_score:.5f}'.format(valid_f1_score=valid_f1_score))
    print()
    print('[part 1.1] {name}\tbest_k: {best_k:d}\t'.format(name=name, best_k=best_k))
    return best_model, best_k, best_function