예제 #1
0
def model_selection_with_transformation(distance_funcs, scaling_classes,
                                        Xtrain, ytrain, Xval, yval):
    """Pick the best scaler, distance function and k for a KNN classifier.

    Every (scaler, distance function, k) combination is scored with
    ``f1_score`` on the scaled validation set. The combination with the
    highest score wins; on an exact tie the candidate with the smaller k
    replaces the current best, otherwise the earlier candidate is kept.

    Args:
        distance_funcs: dict mapping a name to a distance function.
        scaling_classes: dict mapping a name to a scaler class. Each class is
            instantiated with no arguments and the instance is called on a
            feature list to obtain the scaled features.
        Xtrain: List[List[int]] training features.
        ytrain: List[int] training labels.
        Xval: List[List[int]] validation features.
        yval: List[int] validation labels.

    Returns:
        Tuple ``(best_model, best_k, best_func, best_scale)``:
        ``best_model`` is a ``KNN`` built with the winning k and distance
        function and trained on the training set scaled by the winning
        scaler (its ``scaler`` attribute holds the winning scaler *class*);
        ``best_k`` is the chosen k; ``best_func`` and ``best_scale`` are the
        dictionary keys of the chosen distance function and scaler.

    Raises:
        ValueError: if no candidate could be evaluated, i.e. either
            dictionary is empty or ``Xtrain`` has fewer than two points.
    """
    # Odd k values from 1 up to (but excluding) the number of training
    # points. The n - 1 bound is applied regardless of dataset size
    # (modified due to grading instructions).
    kvals = np.arange(1, len(Xtrain), 2)

    # The training F1 score is never computed in this function; this
    # constant only feeds the progress lines printed below.
    train_f1_score = 1.0
    valid_f1_score = 0.0

    best_k = None  # None means "no candidate selected yet"
    best_func = ""
    best_scale = None
    best_distance = None
    best_scaled_Xtrain = None
    max_score = 0.0

    for scaling_name, new_scaler in scaling_classes.items():
        print(scaling_name, new_scaler)
        scaler = new_scaler()
        # The same scaler instance transforms both splits, training data
        # first. NOTE(review): presumably so a stateful scaler is fitted on
        # the training data -- confirm against the scaler classes.
        scaled_Xtrain = scaler(Xtrain)
        scaled_Xval = scaler(Xval)
        for name, f in distance_funcs.items():
            model = None
            for k in kvals:
                if model is None:
                    # Train once per (scaler, distance function); later k
                    # values only update the model's ``k`` attribute.
                    model = KNN(k, f)
                    model.train(scaled_Xtrain, ytrain)
                else:
                    model.k = k
                valid_f1_score = f1_score(yval, model.predict(scaled_Xval))

                # The very first candidate is always accepted so that a
                # model is selected even when every score is 0.0.
                is_better = best_k is None or valid_f1_score > max_score
                is_tie_with_smaller_k = (best_k is not None and
                                         valid_f1_score == max_score and
                                         k < best_k)
                if is_better or is_tie_with_smaller_k:
                    max_score = valid_f1_score
                    best_distance = f
                    best_k = k
                    best_func = name
                    best_scale = scaling_name
                    best_scaled_Xtrain = scaled_Xtrain

    if best_k is None:
        raise ValueError(
            'no candidate model could be evaluated: distance_funcs and '
            'scaling_classes must be non-empty and Xtrain must contain at '
            'least two points')

    best_model = KNN(best_k, best_distance)
    best_model.scaler = scaling_classes[best_scale]
    # Train on the same scaled features the winning candidate was scored
    # with, so the returned model matches the one that was selected.
    best_model.train(best_scaled_Xtrain, ytrain)

    # Dont change any print statement
    # The two progress lines below describe the last candidate evaluated.

    print('[part 1.1] {name}\tk: {k:d}\t'.format(name=name, k=k) +
          'train:{train_f1_score:.5f}\t'.format(
              train_f1_score=train_f1_score) +
          'valid: {valid_f1_score:.5f}'.format(valid_f1_score=valid_f1_score))

    print('[part 1.2] {name}\t{scaling_name}\tk: {k:d}\t'.format(
        name=name, scaling_name=scaling_name, k=k) +
          'train: {train_f1_score:.5f}\t'.format(
              train_f1_score=train_f1_score) +
          'valid: {valid_f1_score:.5f}'.format(valid_f1_score=valid_f1_score))

    print()
    # The summary line reports the selected distance function and scaler.
    print('[part 1.2] {name}\t{scaling_name}\t'.format(
        name=best_func, scaling_name=best_scale) +
          'best_k: {best_k:d}\t'.format(best_k=best_k))
    print()
    return best_model, best_k, best_func, best_scale
예제 #2
0
def model_selection_without_normalization(distance_funcs, Xtrain, ytrain, Xval,
                                          yval):
    """Pick the best distance function and k for a KNN classifier.

    Every (distance function, k) combination is scored with ``f1_score`` on
    the validation set. The combination with the highest score wins; on an
    exact tie the candidate with the smaller k replaces the current best,
    otherwise the earlier candidate is kept.

    Args:
        distance_funcs: dict mapping a name to a distance function.
        Xtrain: List[List[int]] training features.
        ytrain: List[int] training labels.
        Xval: List[List[int]] validation features.
        yval: List[int] validation labels.

    Returns:
        Tuple ``(best_model, best_k, best_function)``: ``best_model`` is a
        ``KNN`` built with the winning k and distance function and trained
        on ``Xtrain``; ``best_k`` is the chosen k; ``best_function`` is the
        dictionary key of the chosen distance function.

    Raises:
        ValueError: if no candidate could be evaluated, i.e.
            ``distance_funcs`` is empty or ``Xtrain`` has fewer than two
            points.
    """
    # Odd k values from 1 up to (but excluding) the number of training points.
    kvals = np.arange(1, len(Xtrain), 2)

    # The training F1 score is never computed in this function; this
    # constant only feeds the progress lines printed below.
    train_f1_score = 1.0
    valid_f1_score = 0.0

    best_k = None  # None means "no candidate selected yet"
    best_function = ""
    best_distance = None
    max_score = 0.0

    for name, f in distance_funcs.items():
        model = None
        for k in kvals:
            if model is None:
                # Train once per distance function; later k values only
                # update the model's ``k`` attribute.
                model = KNN(k, f)
                model.train(Xtrain, ytrain)
            else:
                model.k = k
            valid_f1_score = f1_score(yval, model.predict(Xval))

            # The very first candidate is always accepted so that a model
            # is selected even when every score is 0.0.
            is_better = best_k is None or valid_f1_score > max_score
            is_tie_with_smaller_k = (best_k is not None and
                                     valid_f1_score == max_score and
                                     k < best_k)
            if is_better or is_tie_with_smaller_k:
                max_score = valid_f1_score
                print("new valid score: ", valid_f1_score)
                best_distance = f
                best_function = name
                best_k = k
                if is_better:
                    print('**NEW BEST MODEL**')

    if best_k is None:
        raise ValueError(
            'no candidate model could be evaluated: distance_funcs must be '
            'non-empty and Xtrain must contain at least two points')

    best_model = KNN(best_k, best_distance)
    best_model.train(Xtrain, ytrain)

    # Dont change any print statement
    # The progress line describes the last candidate evaluated.
    # NOTE(review): it is emitted twice, as in the existing output; confirm
    # whether the duplicate is intentional.
    print('[part 1.1] {name}\tk: {k:d}\t'.format(name=name, k=k) +
          'train: {train_f1_score:.5f}\t'.format(
              train_f1_score=train_f1_score) +
          'valid: {valid_f1_score:.5f}'.format(valid_f1_score=valid_f1_score))
    print('[part 1.1] {name}\tk: {k:d}\t'.format(name=name, k=k) +
          'train: {train_f1_score:.5f}\t'.format(
              train_f1_score=train_f1_score) +
          'valid: {valid_f1_score:.5f}'.format(valid_f1_score=valid_f1_score))

    print()
    # The summary line reports the selected distance function.
    print('[part 1.1] {name}\tbest_k: {best_k:d}\t'.format(name=best_function,
                                                           best_k=best_k))
    return best_model, best_k, best_function