예제 #1
0
def rhc(problem, iterations, random_seed, graph_file, graph_title):
    """Run random hill climbing once per iteration budget and report the best run.

    Every value in ``iterations`` is used as ``max_iters`` for a fresh call to
    ``mlrose_hiive.random_hill_climb``. The fitness, the wall-clock duration
    and the module-level ``eval_count`` reading of each run are recorded, the
    fitness curve is handed to ``generate_graph``, and a summary of the
    best-scoring run is printed.

    Args:
        problem: Problem object forwarded unchanged to mlrose_hiive.
        iterations: Re-iterable collection of ``max_iters`` values; also the
            x axis of the plot. An empty collection makes ``max`` raise
            ValueError.
        random_seed: Forwarded as ``random_state`` to every run.
        graph_file: Prefix of the file-name argument given to ``generate_graph``.
        graph_title: Prefix of the title argument given to ``generate_graph``.
    """
    global eval_count
    scores, durations, evaluations = [], [], []
    for max_iters in iterations:
        # Zero the module-level counter so the value read back below belongs
        # to this run only (presumably the fitness function increments it —
        # confirm against the code that defines eval_count).
        eval_count = 0
        started = datetime.datetime.now()
        _, run_fitness, _ = mlrose_hiive.random_hill_climb(
            problem, max_iters=max_iters, random_state=random_seed)
        elapsed = datetime.datetime.now() - started
        scores.append(run_fitness)
        durations.append(elapsed.total_seconds())
        evaluations.append(eval_count)

    plt.plot(iterations, scores, label="Fitness score")
    plt.legend(loc="best")
    plt.grid()
    generate_graph(graph_file + "rhc",
                   graph_title + "Random Hill Climbing",
                   "Iterations",
                   "Fitness")

    # Report the first run that reached the highest fitness.
    top_score = max(scores)
    top_index = scores.index(top_score)
    print('Best score achieved: ', top_score)
    print('Time taken to achieve that: ', durations[top_index])
    print('Function evaluations taken to achieve that: ', evaluations[top_index])
예제 #2
0
def ga(problem, iterations, random_seed, graph_file, graph_title):
    """Sweep the genetic algorithm's mutation probability and plot the results.

    For each mutation probability in a fixed list,
    ``mlrose_hiive.genetic_alg`` is run once per entry of ``iterations``
    (used as ``max_iters``). Three figures are handed to ``generate_graph``:
    fitness against iterations (one line per probability), best score against
    probability, and a scatter of best score against the duration of the run
    that produced it. A summary is printed at the end.

    Args:
        problem: Problem object forwarded unchanged to mlrose_hiive.
        iterations: Re-iterable collection of ``max_iters`` values (it is
            walked once per probability and reused as a plot axis). An empty
            collection makes ``max`` raise ValueError.
        random_seed: Forwarded as ``random_state`` to every run.
        graph_file: Prefix of the file-name argument given to ``generate_graph``.
        graph_title: Prefix of the title argument given to ``generate_graph``.
    """
    global eval_count
    mutation_prob = [0.1, 0.2, 0.3, 0.4, 0.5]
    best_score = []
    time_taken = []
    fn_evals_taken = []
    for m in mutation_prob:
        fitness = []
        fit_time = []
        fn_evals = []
        for i in iterations:
            # Reset the module-level counter so the reading taken after the
            # run covers this run only.
            eval_count = 0
            start = datetime.datetime.now()
            _, best_fitness, _ = mlrose_hiive.genetic_alg(
                problem, mutation_prob=m, max_iters=i, random_state=random_seed)
            finish = datetime.datetime.now()
            fitness.append(best_fitness)
            fit_time.append((finish - start).total_seconds())
            fn_evals.append(eval_count)

        # Keep the best score for this mutation probability together with the
        # time and evaluation count of the first run that reached it.
        top = max(fitness)
        index = fitness.index(top)
        best_score.append(top)
        time_taken.append(fit_time[index])
        fn_evals_taken.append(fn_evals[index])
        plt.plot(iterations, fitness, label="Mutation = " + str(m))

    plt.legend(loc="best", title='Mutation Probability')
    plt.grid()
    generate_graph(graph_file + "ga", graph_title + "Genetic Algorithm", "Iterations", "Fitness")

    # Best score reached for each mutation probability.
    plt.plot(mutation_prob, best_score)
    plt.grid()
    generate_graph(graph_file + "ga_mut", graph_title + "Genetic Algorithm",
                   "Mutation Probability", "Best Score Achieved")

    # Time/score trade-off, each point labelled with its mutation probability.
    plt.scatter(time_taken, best_score)
    for prob, seconds, score in zip(mutation_prob, time_taken, best_score):
        # The label is passed positionally: matplotlib renamed this parameter
        # from ``s`` to ``text``, so the keyword form is version-dependent.
        plt.annotate(str(prob), xy=(seconds, score))
    plt.legend(loc='best', title='Mutation Probability')
    plt.grid()
    generate_graph(graph_file + "ga_scatter", graph_title + "Genetic Algorithm",
                   "Time Taken", "Best Score achieved")

    print('Mutation prob: ', mutation_prob)
    print('Best scores reached: ', best_score)
    print('Time taken to do that: ', time_taken)
    print('Function evaluations taken: ', fn_evals_taken)
예제 #3
0
def mimic(problem, iterations, random_seed, graph_file, graph_title):
    """Sweep MIMIC's ``keep_pct`` setting and plot the results.

    For each ``keep_pct`` in a fixed list, ``mlrose_hiive.mimic`` is run once
    per entry of ``iterations`` (used as ``max_iters``). Three figures are
    handed to ``generate_graph``: fitness against iterations (one line per
    ``keep_pct``), best score against ``keep_pct``, and a scatter of best
    score against the duration of the run that produced it. A summary is
    printed at the end.

    Args:
        problem: Problem object forwarded unchanged to mlrose_hiive.
        iterations: Re-iterable collection of ``max_iters`` values (it is
            walked once per ``keep_pct`` and reused as a plot axis). An empty
            collection makes ``max`` raise ValueError.
        random_seed: Forwarded as ``random_state`` to every run.
        graph_file: Prefix of the file-name argument given to ``generate_graph``.
        graph_title: Prefix of the title argument given to ``generate_graph``.
    """
    global eval_count
    keep_pct = [0.1, 0.25, 0.50]
    best_score = []
    time_taken = []
    fn_evals_taken = []
    for k in keep_pct:
        fitness = []
        fit_time = []
        fn_evals = []
        for i in iterations:
            # Reset the module-level counter so the reading taken after the
            # run covers this run only.
            eval_count = 0
            start = datetime.datetime.now()
            _, best_fitness, _ = mlrose_hiive.mimic(
                problem, keep_pct=k, max_iters=i, random_state=random_seed)
            finish = datetime.datetime.now()
            fitness.append(best_fitness)
            fit_time.append((finish - start).total_seconds())
            fn_evals.append(eval_count)

        # Keep the best score for this keep_pct together with the time and
        # evaluation count of the first run that reached it.
        top = max(fitness)
        index = fitness.index(top)
        best_score.append(top)
        time_taken.append(fit_time[index])
        fn_evals_taken.append(fn_evals[index])
        plt.plot(iterations, fitness, label="keep_pct = " + str(k))

    plt.legend(loc="best", title='Proportion of samples kept')
    plt.grid()
    generate_graph(graph_file + "mimic", graph_title + "MIMIC: ", "Iterations", "Fitness")

    # Best score reached for each keep_pct.
    plt.plot(keep_pct, best_score)
    plt.grid()
    generate_graph(graph_file + "mimic_pct", graph_title + "MIMIC",
                   "Proportion of samples kept", "Best Score Achieved")

    # Time/score trade-off, each point labelled with its keep_pct.
    plt.scatter(time_taken, best_score)
    for pct, seconds, score in zip(keep_pct, time_taken, best_score):
        # The label is passed positionally: matplotlib renamed this parameter
        # from ``s`` to ``text``, so the keyword form is version-dependent.
        plt.annotate(str(pct), xy=(seconds, score))
    plt.legend(loc='best', title='Proportion of samples kept')
    plt.grid()
    generate_graph(graph_file + "mimic_scatter", graph_title + "MIMIC",
                   "Time Taken", "Best Score achieved")

    print('Proportion of samples kept: ', keep_pct)
    print('Best scores reached: ', best_score)
    print('Time taken to do that: ', time_taken)
    print('Function evaluations taken: ', fn_evals_taken)
def one_max():
    """Draw the three One Max comparison bar charts from hard-coded results.

    One bar per algorithm (RHC, SA, GA, MIMIC) is drawn for best score,
    running time and function evaluations; each chart is handed to
    ``generate_graph`` with its own file name and title.
    """
    algorithms = ['RHC', 'SA', 'GA', 'MIMIC']
    colors = ['coral', 'orange', 'mediumseagreen', 'cornflowerblue']
    x = np.arange(4)

    # (bar heights, file name, title, y-axis label) for each chart, in order.
    charts = (
        ([46, 44, 50, 50],
         "one_max_score", "One Max - Best Scores", "Best Score Achieved"),
        ([0.00773, 0.006309, 0.554985, 19.869137],
         "one_max_time", "One Max - Running Time", "Time taken to achieve that"),
        ([88, 214, 6039, 3221],
         "one_max_evals", "One Max - Function evaluations", "Function evaluations taken"),
    )
    for heights, file_name, title, y_label in charts:
        plt.bar(x, height=heights, color=colors)
        plt.xticks(x, algorithms)
        generate_graph(file_name, title, "Algorithms", y_label)
def ks():
    """Draw the three Knapsack comparison bar charts from hard-coded results.

    One bar per algorithm (RHC, SA, GA, MIMIC) is drawn for best score,
    running time and function evaluations; each chart is handed to
    ``generate_graph`` with its own file name and title.
    """
    algorithms = ['RHC', 'SA', 'GA', 'MIMIC']
    colors = ['coral', 'orange', 'mediumseagreen', 'cornflowerblue']
    x = np.arange(4)

    # (bar heights, file name, title, y-axis label) for each chart, in order.
    charts = (
        ([41, 45, 50, 50],
         "ks_score", "Knapsack - Best Scores", "Best Score Achieved"),
        ([0.002853, 0.007676, 0.287459, 0.608017],
         "ks_time", "Knapsack - Running Time", "Time taken to achieve that"),
        ([18, 28, 2615, 2413],
         "ks_evals", "Knapsack - Function evaluations", "Function evaluations taken"),
    )
    for heights, file_name, title, y_label in charts:
        plt.bar(x, height=heights, color=colors)
        plt.xticks(x, algorithms)
        generate_graph(file_name, title, "Algorithms", y_label)
def cp():
    """Draw the three Continuous Peaks comparison bar charts from hard-coded results.

    One bar per algorithm (RHC, SA, GA, MIMIC) is drawn for best score,
    running time and function evaluations; each chart is handed to
    ``generate_graph`` with its own file name and title.

    Returns:
        The constant tuple ``(False, None, None, None)``.
    """
    algorithms = ['RHC', 'SA', 'GA', 'MIMIC']
    colors = ['coral', 'orange', 'mediumseagreen', 'cornflowerblue']
    x = np.arange(4)

    # (bar heights, file name, title, y-axis label) for each chart, in order.
    charts = (
        ([56, 84, 94, 85],
         "cp_score", "Continuous Peaks - Best Scores", "Best Score Achieved"),
        ([0.002085, 0.048171, 1.746986, 43.326225],
         "cp_time", "Continuous Peaks - Running Time", "Time taken to achieve that"),
        ([13, 819, 12880, 6846],
         "cp_evals", "Continuous Peaks - Function evaluations", "Function evaluations taken"),
    )
    for heights, file_name, title, y_label in charts:
        plt.bar(x, height=heights, color=colors)
        plt.xticks(x, algorithms)
        generate_graph(file_name, title, "Algorithms", y_label)

    # NOTE(review): this return value has nothing to do with the plotting
    # above and looks like a leftover from another function — confirm before
    # relying on it.
    return False, None, None, None


def get_manhattan_heuristic(node, goal, grid_width=8):
    """Return the Manhattan distance between two cells of a row-major grid.

    Cell identifiers are flat indices where ``index = row * grid_width +
    column``; they may be ints or anything ``int()`` accepts (for example
    numeric strings such as ``'61'``).

    Args:
        node: Flat index of the current cell.
        goal: Flat index of the target cell.
        grid_width: Number of columns per grid row. Defaults to 8, the width
            that was previously hard-coded.

    Returns:
        The sum of the absolute row difference and the absolute column
        difference between the two cells.

    Raises:
        ValueError: If ``node`` or ``goal`` cannot be converted by ``int()``.
        ZeroDivisionError: If ``grid_width`` is 0.
    """
    row, col = divmod(int(node), grid_width)
    goal_row, goal_col = divmod(int(goal), grid_width)
    return abs(row - goal_row) + abs(col - goal_col)


if __name__ == '__main__':
    # The value returned by generate_graph() is the graph argument of both
    # searches below; both run between the same pair of node labels.
    graph_neighbours = generate_graph()
    start_node, goal_node = '0', '61'

    print("============ UCS Search ================")
    path_ucs, explored_ucs = uniform_cost_search(graph_neighbours,
                                                 start_node,
                                                 goal_node)
    print("Path UCS:", path_ucs)
    # print("Explored Nodes UCS: ", explored_ucs)
    # Number of explored nodes, followed by a blank separator line.
    print(len(explored_ucs), end="\n\n")

    print("============ AStar Search ================")
    path_astar, explored_astar = astar_search(graph_neighbours,
                                              start_node,
                                              goal_node)
    print("Path_astar:", path_astar)
    print("Explored Nodes A Star: ", explored_astar)
    # Number of explored nodes, followed by a blank separator line.
    print(len(explored_astar), end="\n\n")
예제 #8
0
def sa(problem, iterations, random_seed, graph_file, graph_title):
    """Sweep simulated annealing's arithmetic decay rate and plot the results.

    For each decay value in a fixed list an ``mlrose_hiive.ArithDecay``
    schedule (initial temperature 1.0) is built and
    ``mlrose_hiive.simulated_annealing`` is run once per entry of
    ``iterations`` (used as ``max_iters``). Four figures are handed to
    ``generate_graph``: fitness against iterations (one line per decay), best
    score against decay, time-to-best against decay, and a scatter of best
    score against time. A summary is printed at the end.

    Args:
        problem: Problem object forwarded unchanged to mlrose_hiive.
        iterations: Re-iterable collection of ``max_iters`` values (it is
            walked once per decay and reused as a plot axis). An empty
            collection makes ``max`` raise ValueError.
        random_seed: Forwarded as ``random_state`` to every run.
        graph_file: Prefix of the file-name argument given to ``generate_graph``.
        graph_title: Prefix of the title argument given to ``generate_graph``.
    """
    global eval_count
    decays = [0.001, 0.002, 0.003, 0.004, 0.005]
    best_score = []
    time_taken = []
    fn_evals_taken = []
    for decay in decays:
        schedule = mlrose_hiive.ArithDecay(init_temp=1.0, decay=decay)
        fitness = []
        fit_time = []
        fn_evals = []
        for i in iterations:
            # Reset the module-level counter so the reading taken after the
            # run covers this run only.
            eval_count = 0
            start = datetime.datetime.now()
            _, best_fitness, _ = mlrose_hiive.simulated_annealing(
                problem, schedule=schedule, max_iters=i, random_state=random_seed)
            finish = datetime.datetime.now()
            fn_evals.append(eval_count)
            fitness.append(best_fitness)
            fit_time.append((finish - start).total_seconds())

        # Keep the best score for this decay together with the time and
        # evaluation count of the first run that reached it.
        top = max(fitness)
        index = fitness.index(top)
        best_score.append(top)
        time_taken.append(fit_time[index])
        fn_evals_taken.append(fn_evals[index])
        plt.plot(iterations, fitness, label="Cooling = " + str(decay))

    plt.legend(loc="best")
    plt.grid()
    generate_graph(graph_file + "sa_iter", graph_title + "Simulated Annealing", "Iterations", "Fitness")

    # Best score reached for each decay value.
    plt.plot(decays, best_score)
    plt.grid()
    generate_graph(graph_file + "sa_decays", graph_title + "Simulated Annealing",
                   "Cooling Component", "Best Score Achieved")

    # Duration of the best run for each decay value.
    plt.plot(decays, time_taken)
    plt.grid()
    generate_graph(graph_file + "sa_decay_time", graph_title + "Simulated Annealing",
                   "Cooling Component", "Time taken to achieve that")

    # Time/score trade-off, each point labelled with its decay value.
    plt.scatter(time_taken, best_score)
    for rate, seconds, score in zip(decays, time_taken, best_score):
        # The label is passed positionally: matplotlib renamed this parameter
        # from ``s`` to ``text``, so the keyword form is version-dependent.
        plt.annotate(str(rate), xy=(seconds, score))
    plt.legend(loc='best', title='Cooling Component')
    plt.grid()
    generate_graph(graph_file + "sa_scatter", graph_title + "Simulated Annealing",
                   "Time Taken", "Best Score achieved")

    print('decays: ', decays)
    print('Best scores reached: ', best_score)
    print('Time taken to do that: ', time_taken)
    print('Function evaluations taken: ', fn_evals_taken)
def pulsar_dataset():
    """Tune the hidden-layer size of an MLP on ``datasets/HTRU_2.csv``.

    Steps, in order:
      1. Load the CSV, drop rows with missing values, and hold out 20% of the
         rows as a final test set (last column is the target).
      2. Hand a size-1 MLP to ``util.plot_lc_nn`` as the untuned baseline.
      3. Split the remaining data 80/20 again and record train/validation
         accuracy for hidden-layer sizes 1, 3, 5, 7 and 10; print and plot them.
      4. Hand a size-3 MLP to ``util.plot_lc_nn``, then fit a size-3 MLP on
         the full tuning data and print its accuracy on the held-out test set.
    """
    random_seed = 7
    df = pd.read_csv('datasets/HTRU_2.csv').dropna()
    print('data size***********', df.shape)

    # Hold out a test set; X / y are what the tuning steps work on.
    features, target = df.iloc[:, :-1], df.iloc[:, -1]
    X, X_test, y, y_test = train_test_split(
        features, target, train_size=0.8, random_state=random_seed)

    # Baseline learning curve. hidden_layer_sizes is given as a plain int
    # throughout (the original spelling ``(1)`` is an int, not a tuple).
    train_sizes = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
    baseline = MLPClassifier(hidden_layer_sizes=1, random_state=random_seed)
    util.plot_lc_nn(mlp_model=baseline, X=X, y=y,
                    train_sizes=train_sizes, graph_name='nn/nn_htru_')

    # Hidden-layer size sweep on a train/validation split of the tuning data.
    X_train, X_val_test, y_train, y_val_test = train_test_split(
        X, y, train_size=0.8, random_state=random_seed)
    hidden_layer_sizes = [1, 3, 5, 7, 10]
    train_score, test_score = [], []
    for size in hidden_layer_sizes:
        candidate = MLPClassifier(hidden_layer_sizes=size,
                                  random_state=random_seed)
        candidate.fit(X=X_train, y=y_train)
        train_score.append(
            accuracy_score(y_train, candidate.predict(X_train)))
        test_score.append(
            accuracy_score(y_val_test, candidate.predict(X_val_test)))

    print('Hidden layers**************')
    print(pd.DataFrame({
        'Hidden layer sizes': hidden_layer_sizes,
        'train score': train_score,
        'validation score': test_score
    }))

    # Accuracy against hidden-layer size.
    for scores, colour, caption in ((train_score, "r", "Training score"),
                                    (test_score, "g", "Validation score")):
        plt.plot(hidden_layer_sizes, scores, 'o-', color=colour, label=caption)
    plt.legend(loc="best")
    util.generate_graph("nn/nn_htru_layers", "Hidden layer sizes Vs Accuracy",
                        "Hidden layer sizes", "Accuracy Score")

    # Learning curve for the chosen size (3).
    tuned = MLPClassifier(hidden_layer_sizes=3, random_state=random_seed)
    util.plot_lc_nn(mlp_model=tuned, X=X, y=y,
                    train_sizes=train_sizes, graph_name='nn/nn_htru_tuned_')

    # Final accuracy of a size-3 model on the held-out test set.
    final_model = MLPClassifier(hidden_layer_sizes=3, random_state=random_seed)
    final_model.fit(X, y)
    final_accuracy = accuracy_score(y_test, final_model.predict(X_test))
    print(
        "MLPClassifier - HTRU_2 Dataset - Final Accuracy score on the test set: ",
        final_accuracy)
def wine_dataset():
    """Tune ``n_neighbors`` of a KNN classifier on ``datasets/winequality-white.csv``.

    Steps, in order:
      1. Load the semicolon-separated CSV, drop rows with missing values, and
         hold out 20% of the rows as a final test set (last column is the
         target).
      2. Hand a 3-neighbour model to ``util.plot_learning_curve`` as the
         untuned baseline.
      3. Split the remaining data 80/20 again and record train/test accuracy
         for k = 1..30; print and plot them.
      4. Hand a 14-neighbour model to ``util.plot_learning_curve``, then fit a
         14-neighbour model on the full tuning data and print its accuracy on
         the held-out test set.
    """
    random_seed = 7
    df = pd.read_csv('datasets/winequality-white.csv', sep=';').dropna()
    print('data size***********', df.shape)

    # Hold out a test set; X / y are what the tuning steps work on.
    features, target = df.iloc[:, :-1], df.iloc[:, -1]
    X, X_test, y, y_test = train_test_split(
        features, target, train_size=0.8, random_state=random_seed)

    # Baseline learning curve with k = 3.
    train_sizes = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
    util.plot_learning_curve(estimator=KNeighborsClassifier(n_neighbors=3),
                             title='Learning Curve - KNN',
                             X=X,
                             y=y,
                             cv=3,
                             train_sizes=train_sizes,
                             graph_name='knn/knn_wine_')

    # Sweep k on a train/validation split of the tuning data.
    X_train, X_val_test, y_train, y_val_test = train_test_split(
        X, y, train_size=0.8, random_state=random_seed)
    k_neighbours = range(1, 31)
    train_score, test_score = [], []
    for k in k_neighbours:
        candidate = KNeighborsClassifier(n_neighbors=k)
        candidate.fit(X=X_train, y=y_train)
        train_score.append(
            accuracy_score(y_train, candidate.predict(X_train)))
        test_score.append(
            accuracy_score(y_val_test, candidate.predict(X_val_test)))

    print('K neighbours**************')
    print(pd.DataFrame({
        'No. neighbours': k_neighbours,
        'train score': train_score,
        'test score': test_score
    }))

    # Accuracy against k.
    for scores, colour, caption in ((train_score, "r", "Training score"),
                                    (test_score, "g", "Test score")):
        plt.plot(k_neighbours, scores, 'o-', color=colour, label=caption)
    plt.legend(loc="best")
    util.generate_graph("knn/knn_wine_nei", "K neighbours Vs Accuracy",
                        "K neighbours", "Accuracy Score")

    # k = 14 was picked as the point where train and validation scores meet.
    # Learning curve for the tuned model.
    train_sizes = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
    util.plot_learning_curve(estimator=KNeighborsClassifier(n_neighbors=14),
                             title='Learning Curve - KNN',
                             X=X,
                             y=y,
                             cv=3,
                             train_sizes=train_sizes,
                             graph_name='knn/knn_wine_tuned_')

    # Final accuracy of a k = 14 model on the held-out test set.
    final_model = KNeighborsClassifier(n_neighbors=14)
    final_model.fit(X, y)
    final_accuracy = accuracy_score(y_test, final_model.predict(X_test))
    print("KNeighborsClassifier - Wine Dataset - Final Accuracy score on the test set: ", final_accuracy)
    # NOTE(review): the statements from here to the end of the function read
    # names that are never assigned in the visible code (X_train_scaled,
    # X_test_scaled, y_train_hot, y_test_hot) and call rhc/sa/ga with four
    # positional arguments and use their return values. This looks like a
    # fragment of a separate neural-network comparison script — confirm where
    # it belongs before running it.
    iterations = range(1, 1001, 1)
    nn_rhc_fitness = rhc(X_train_scaled, X_test_scaled, y_train_hot, y_test_hot)
    nn_sa_fitness = sa(X_train_scaled, X_test_scaled, y_train_hot, y_test_hot)
    nn_ga_fitness = ga(X_train_scaled, X_test_scaled, y_train_hot, y_test_hot)

    # The results are expected to expose ``.shape`` (presumably arrays with
    # one entry per iteration, since they are plotted against ``iterations``
    # below — TODO confirm).
    print('nn_rhc_fitness.shape: ', nn_rhc_fitness.shape)
    print('nn_sa_fitness.shape: ', nn_sa_fitness.shape)
    print('nn_ga_fitness.shape: ', nn_ga_fitness.shape)

    # Plot the fitness vs iterations for each algorithm
    plt.plot(iterations, nn_rhc_fitness, label="RHC")
    plt.plot(iterations, nn_sa_fitness, label="SA")
    plt.plot(iterations, nn_ga_fitness, label="GA")
    plt.legend(loc="best")
    plt.grid()
    generate_graph("nn_fitness", "Neural Network - RHC, SA, GA", "Iterations", "Fitness")

    # Algorithm comparison, using hard-coded accuracy and fit-time figures.
    # test_accuracy and fit_times are assigned here but not used by any
    # statement visible in this block.
    algorithms = ['RHC', 'SA', 'GA']
    train_accuracy = [0.22077590607452782, 0.21311893823379274, 0.5063808065339459]
    test_accuracy = [0.20204081632653062, 0.19795918367346937, 0.5387755102040817]
    fit_times = [11.030777, 12.34732, 1274.722984]
    x = np.arange(3)
    colors = ['coral', 'orange', 'mediumseagreen']

    # Train accuracy score: one bar per algorithm.
    plt.bar(x, height= train_accuracy, color=colors)
    plt.xticks(x, algorithms)
    generate_graph("nn_train_score", "Neural Network - Train Accuracy Score", "Algorithms", "Accuracy score")

    # Test accuracy score
def pulsar_dataset():
    """Compare SVM kernels on ``datasets/HTRU_2.csv`` and evaluate the choice.

    Steps, in order:
      1. Load the CSV, drop rows with missing values, and hold out 20% of the
         rows as a final test set (last column is the target).
      2. Hand a linear-kernel SVC to ``util.plot_learning_curve``.
      3. Split the remaining data 80/20 again and record train/test accuracy
         for the ``linear`` and ``rbf`` kernels; print and plot them.
      4. Hand an rbf-kernel SVC to ``util.plot_learning_curve``, then fit an
         rbf-kernel SVC on the full tuning data and print its accuracy on the
         held-out test set.
    """
    random_seed = 7
    df = pd.read_csv('datasets/HTRU_2.csv')
    df = df.dropna()
    print('data size***********', df.shape)

    # Hold out a test set; X / y are what the tuning steps work on.
    data_X = df.iloc[:, :-1]
    data_y = df.iloc[:, -1]
    X, X_test, y, y_test = train_test_split(data_X,
                                            data_y,
                                            train_size=0.8,
                                            random_state=random_seed)

    # Learning curve for the linear kernel. The title names the SVM model
    # being plotted (it previously said "Decision Trees").
    svm_model = svm.SVC(kernel='linear', random_state=random_seed)
    train_sizes = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
    util.plot_learning_curve(estimator=svm_model,
                             title='Learning Curve - SVM',
                             X=X,
                             y=y,
                             cv=3,
                             train_sizes=train_sizes,
                             graph_name='svm/svm_htru_linear_')

    # Compare kernels on a train/validation split of the tuning data.
    X_train, X_val_test, y_train, y_val_test = \
        train_test_split(X, y, train_size=0.8, random_state=random_seed)
    kernels = ['linear', 'rbf']
    train_score = []
    test_score = []
    for kernel in kernels:
        svm_model = svm.SVC(kernel=kernel, random_state=random_seed)
        svm_model.fit(X=X_train, y=y_train)
        train_score.append(
            accuracy_score(y_train, svm_model.predict(X_train)))
        test_score.append(
            accuracy_score(y_val_test, svm_model.predict(X_val_test)))

    df_kernels = pd.DataFrame({
        'SVM kernel': kernels,
        'train score': train_score,
        'test score': test_score
    })
    print('SVM Kernels**************')
    print(df_kernels)

    # Accuracy per kernel.
    plt.plot(kernels, train_score, 'o-', color="r", label="Training score")
    plt.plot(kernels, test_score, 'o-', color="g", label="Test score")
    plt.legend(loc="best")
    util.generate_graph("svm/svm_htru_kernels", "SVM Kernels Vs Accuracy",
                        "SVM Kernels", "Accuracy Score")

    # Accuracy is more or less the same for both kernels, but the author
    # measured a much shorter fit time for rbf (0.2s) than for linear (8s),
    # so rbf is the kernel carried forward.
    svm_model = svm.SVC(kernel='rbf', random_state=random_seed)
    util.plot_learning_curve(estimator=svm_model,
                             title='Learning Curve - SVM',
                             X=X,
                             y=y,
                             cv=3,
                             train_sizes=train_sizes,
                             graph_name='svm/svm_htru_rbf_')

    # Final accuracy of an rbf-kernel model on the held-out test set.
    svm_model = svm.SVC(kernel='rbf', random_state=random_seed)
    svm_model.fit(X, y)
    y_predict = svm_model.predict(X_test)
    final_accuracy = accuracy_score(y_test, y_predict)
    print("SVC - HTRU_2 Dataset - Final Accuracy score on the test set: ",
          final_accuracy)
def pulsar_dataset():
    """Tune an AdaBoost ensemble of decision trees on ``datasets/HTRU_2.csv``.

    Steps, in order:
      1. Load the CSV, drop rows with missing values, and hold out 20% of the
         rows as a final test set (last column is the target).
      2. Hand an untuned 10-estimator ensemble to ``util.plot_learning_curve``.
      3. Split the remaining data 80/20 again and record train/validation
         accuracy for several ensemble sizes; print and plot them.
      4. With 10 estimators, repeat the sweep over base-tree ``max_depth``
         1..30; print and plot.
      5. Hand a depth-1, 10-estimator ensemble to
         ``util.plot_learning_curve``, then fit the same configuration on the
         full tuning data and print its accuracy on the held-out test set.

    Every ensemble is built by one local factory so that all of them share
    the same seed and construction style.
    """
    random_seed = 7

    def make_model(n_estimators, max_depth=None):
        # The base tree is passed positionally because its keyword name
        # differs between scikit-learn releases (base_estimator / estimator).
        # random_state is set on the ensemble as well as on the tree: the
        # ensemble's own random_state drives the seeds given to its base
        # learners, so leaving it unset makes runs non-repeatable.
        return AdaBoostClassifier(
            DecisionTreeClassifier(max_depth=max_depth,
                                   random_state=random_seed),
            n_estimators=n_estimators,
            random_state=random_seed)

    def fit_and_score(model):
        # Fit on the tuning-train split; return (train, validation) accuracy.
        model.fit(X=X_train, y=y_train)
        train_accuracy = accuracy_score(y_train, model.predict(X_train))
        validation_accuracy = accuracy_score(y_val_test,
                                             model.predict(X_val_test))
        return train_accuracy, validation_accuracy

    df = pd.read_csv('datasets/HTRU_2.csv')
    df = df.dropna()
    print('data size***********', df.shape)

    # Hold out a test set; X / y are what the tuning steps work on.
    data_X = df.iloc[:, :-1]
    data_y = df.iloc[:, -1]
    X, X_test, y, y_test = train_test_split(data_X,
                                            data_y,
                                            train_size=0.8,
                                            random_state=random_seed)

    # Learning curve before tuning.
    train_sizes = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
    util.plot_learning_curve(estimator=make_model(n_estimators=10),
                             title='Learning Curve - Ada Boost Classifier',
                             X=X,
                             y=y,
                             cv=3,
                             train_sizes=train_sizes,
                             graph_name='boost/boost_htru_')

    # Train/validation split used by both sweeps below.
    X_train, X_val_test, y_train, y_val_test = \
        train_test_split(X, y, train_size=0.8, random_state=random_seed)

    # Sweep 1: number of estimators.
    no_estimators = [10, 100, 150, 200]
    train_score = []
    test_score = []
    for count in no_estimators:
        train_accuracy, validation_accuracy = fit_and_score(
            make_model(n_estimators=count))
        train_score.append(train_accuracy)
        test_score.append(validation_accuracy)

    df_estimators = pd.DataFrame({
        'No Estimators': no_estimators,
        'train score': train_score,
        'validation score': test_score
    })
    print('No Estimators**************')
    print(df_estimators)

    # Accuracy against number of estimators.
    plt.plot(no_estimators,
             train_score,
             'o-',
             color="r",
             label="Training score")
    plt.plot(no_estimators,
             test_score,
             'o-',
             color="g",
             label="Validation score")
    plt.legend(loc="best")
    util.generate_graph("boost/boost_htru_estimators",
                        "No of Estimators Vs Accuracy", "No Estimators",
                        "Accuracy Score")

    # Sweep 2: base-tree depth, with the ensemble size fixed at 10.
    max_depths = range(1, 31)
    train_score = []
    test_score = []
    for max_depth in max_depths:
        train_accuracy, validation_accuracy = fit_and_score(
            make_model(n_estimators=10, max_depth=max_depth))
        train_score.append(train_accuracy)
        test_score.append(validation_accuracy)

    df_depth = pd.DataFrame({
        'max_depths': max_depths,
        'train score': train_score,
        'validation score': test_score
    })
    print('Max depth**************')
    print(df_depth)

    # Accuracy against base-tree depth.
    plt.plot(max_depths, train_score, 'o-', color="r", label="Training score")
    plt.plot(max_depths, test_score, 'o-', color="g", label="Validation score")
    plt.legend(loc="best")
    util.generate_graph("boost/boost_htru_depth", "Max Depth Vs Accuracy",
                        "Max depth", "Accuracy Score")

    # The author observed test 0.976955 / train 0.978086 at max_depth = 1 and
    # little change at larger depths, so the simplest tree is used to limit
    # overfitting. Learning curve for that tuned configuration:
    util.plot_learning_curve(estimator=make_model(n_estimators=10,
                                                  max_depth=1),
                             title='Learning Curve - Ada Boost Classifier',
                             X=X,
                             y=y,
                             cv=3,
                             train_sizes=train_sizes,
                             graph_name='boost/boost_htru_pruned_')

    # Final accuracy of the depth-1 configuration on the held-out test set.
    boost_model = make_model(n_estimators=10, max_depth=1)
    boost_model.fit(X, y)
    y_predict = boost_model.predict(X_test)
    final_accuracy = accuracy_score(y_test, y_predict)
    print(
        "AdaBoostClassifier - HTRU_2 Dataset - Final Accuracy score on the test set: ",
        final_accuracy)
def pulsar_dataset():
    """Tune ``max_depth`` of a decision tree on ``datasets/HTRU_2.csv``.

    Steps, in order:
      1. Load the CSV, drop rows with missing values, and hold out 20% of the
         rows as a final test set (last column is the target).
      2. Hand an unconstrained tree to ``util.plot_learning_curve``.
      3. Split the remaining data 80/20 again and record train/validation
         accuracy for ``max_depth`` 1..32; print and plot them.
      4. Hand a depth-1 tree to ``util.plot_learning_curve``, then fit a
         depth-1 tree on the full tuning data and print its accuracy on the
         held-out test set.
    """
    random_seed = 7
    df = pd.read_csv('datasets/HTRU_2.csv')
    df = df.dropna()
    print('data size***********', df.shape)

    # Hold out a test set; X / y are what the tuning steps work on.
    data_X = df.iloc[:, :-1]
    data_y = df.iloc[:, -1]
    X, X_test, y, y_test = train_test_split(data_X,
                                            data_y,
                                            train_size=0.8,
                                            random_state=random_seed)

    # Learning curve before pruning.
    DT_model = tree.DecisionTreeClassifier(random_state=random_seed)
    train_sizes = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
    util.plot_learning_curve(estimator=DT_model,
                             title='Learning Curve - Decision Trees',
                             X=X,
                             y=y,
                             cv=3,
                             train_sizes=train_sizes,
                             graph_name='dt/dt_htru_')

    # Depth sweep on a train/validation split of the tuning data.
    X_train, X_val_test, y_train, y_val_test = \
        train_test_split(X, y, train_size=0.8, random_state=random_seed)
    # Depths are plain ints: max_depth is documented as an integer and float
    # values (as np.linspace would produce) are rejected by scikit-learn's
    # parameter validation.
    max_depths = list(range(1, 33))
    train_score = []
    test_score = []
    for max_depth in max_depths:
        DT_model = tree.DecisionTreeClassifier(max_depth=max_depth,
                                               random_state=random_seed)
        DT_model.fit(X=X_train, y=y_train)
        train_score.append(
            accuracy_score(y_train, DT_model.predict(X_train)))
        test_score.append(
            accuracy_score(y_val_test, DT_model.predict(X_val_test)))

    df_depth = pd.DataFrame({
        'max depth': max_depths,
        'train score': train_score,
        'validation score': test_score
    })
    print('max depth**************')
    print(df_depth)

    # Accuracy against tree depth.
    plt.plot(max_depths, train_score, 'o-', color="r", label="Training score")
    plt.plot(max_depths, test_score, 'o-', color="g", label="Validation score")
    plt.legend(loc="best")
    util.generate_graph("dt/dt_htru_max_depths",
                        "Decision Tree Depths Vs Accuracy", "Max Tree Depth",
                        "Accuracy Score")

    # max_depth = 1 is the chosen setting. Learning curve after pruning:
    DT_model = tree.DecisionTreeClassifier(max_depth=1,
                                           random_state=random_seed)
    util.plot_learning_curve(estimator=DT_model,
                             title='Learning Curve - Decision Trees',
                             X=X,
                             y=y,
                             cv=3,
                             train_sizes=train_sizes,
                             graph_name='dt/dt_htru_pruned_')

    # Final accuracy of a max_depth = 1 tree on the held-out test set.
    DT_model = tree.DecisionTreeClassifier(max_depth=1,
                                           random_state=random_seed)
    DT_model.fit(X, y)
    y_predict = DT_model.predict(X_test)
    final_accuracy = accuracy_score(y_test, y_predict)
    print(
        "DecisionTreeClassifier - HTRU_2 Dataset - Final Accuracy score on the test set: ",
        final_accuracy)