Example #1
def make_model(size_ts=10000, size_ls=250, data_set=1, graph=False, depth=None, random_state=0):
    
    if data_set == 1:
        [X_train, y_train, X_test, y_test] = make_data1(size_ts,size_ls,0,False,random_state=random_state)
    else:
        [X_train, y_train, X_test, y_test] = make_data2(size_ts,size_ls,0,False,random_state=random_state)

    clf = DecisionTreeClassifier(max_depth=depth)
    clf = clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    
    if graph:
        if depth is None:
            plot_boundary(fname="figures/data"+str(data_set)+"_depthNone",fitted_estimator=clf,
                          X=X_test,y=y_pred,title="data set "+str(data_set)+" with depth = None")
            
            # Export the fitted tree with graphviz and save it to disk.
            tree.plot_tree(clf)
            dot_data = tree.export_graphviz(clf, out_file=None)
            dot_graph = graphviz.Source(dot_data)  # renamed to avoid shadowing the 'graph' flag
            dot_graph.render("figures/tree_data"+str(data_set)+"_depthNone")
        
        else:
            plot_boundary(fname="figures/data"+str(data_set)+"_depth"+str(depth),fitted_estimator=clf,
                          X=X_test,y=y_pred,title="data set "+str(data_set)+" with depth = "+str(depth))
                   
    return accuracy_score(y_test, y_pred)
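A minimal standalone sketch of the same depth comparison (make_data1/make_data2 and plot_boundary are project-specific helpers not shown here, so sklearn's make_moons is assumed as a stand-in data generator):

from sklearn.datasets import make_moons
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# Synthetic two-class data set: 250 training samples, the remaining 10000 for testing.
X, y = make_moons(n_samples=10250, noise=0.3, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=250, random_state=0)

for depth in [1, 2, 4, 8, None]:
    clf = DecisionTreeClassifier(max_depth=depth, random_state=0).fit(X_train, y_train)
    print("depth =", depth, "accuracy =", accuracy_score(y_test, clf.predict(X_test)))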
Example #2
def get_sets(nb_samples, nb_training_set, seed, which):
    """
    Return the training and testing sets for a given number of samples, a proportion of training
    set, a seed and for a dataset
    
    Arguments:
        nb_sample: the number of samples in the dataset
        nb_training_set: size of the training set
        seed: the seed used to make some random operations
        which: which dataset should be used
        
    Return:
        The result of the function train_test_split on the part X and y of the dataset, the proportion
        of the training set and learning set and on the seed.
    """
    if which == 1:
        dataset = make_data1(nb_samples, random_state=seed)
    else:
        dataset = make_data2(nb_samples, random_state=seed)

    proportion_training_set = nb_training_set / nb_samples

    return train_test_split(dataset[0],
                            dataset[1],
                            train_size=proportion_training_set,
                            test_size=1 - proportion_training_set,
                            random_state=seed)
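Note that train_test_split returns the splits in the order (X_train, X_test, y_train, y_test), so a call to get_sets would typically be unpacked as in this usage sketch (assuming make_data1 is importable in the project):

# 1000 samples from data set 1, 250 of them used for training, seed 0.
X_train, X_test, y_train, y_test = get_sets(1000, 250, seed=0, which=1)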
Example #3
def test_set2_accuracy(max_depth):
    """Compute the mean and std of the test accuracy on the second data set over five seeds."""
    test = []
    for i in range(5):
        X, y = make_data2(2000, i + 1)
        tr, te = score(X, y, max_depth)
        test.append(te)

    test = np.asarray(test)
    my_mean = np.mean(test)
    my_std = np.std(test)
    return my_mean, my_std
Example #4
def tenfold(nb_sub, nb_neighbors, nb_samples, which):
    """
    This function will implementent the K-fold cros validation startegy and plot the different
    accuracies in fonction of the number of neighbors
    
    Argument:
        nb_sub: the number of sub-division of the samples in order to make the K-fold strategy
        nb_neighbors: the maximal number of neighbors
        nb_samples: the number of samples in the dataset
        which: which dataset should be used
        
    Return:
        /
    """
    results = []
    neighbors_toplot = []
    optimal_nb_neighbors = -1
    max_score = -1

    if which == 1:
        dataset = make_data1(nb_samples, nb_sub)
    else:
        dataset = make_data2(nb_samples, nb_sub)

    # Ten-fold cross validation strategy
    for neighbors in range(1, nb_neighbors + 1):
        knn = KNeighborsClassifier(n_neighbors=neighbors)
        scores = cross_val_score(knn,
                                 dataset[0],
                                 dataset[1],
                                 cv=nb_sub,
                                 scoring='accuracy')
        mean_score = scores.mean()
        results.append(mean_score)
        neighbors_toplot.append(neighbors)

        # Determination of the optimal number of neighbours
        if mean_score > max_score:
            max_score = mean_score
            optimal_nb_neighbors = neighbors


    print("The optimal number of neighbours is: " + str(optimal_nb_neighbors) + \
            " with an accuracy of %0.4f" %max_score)

    plt.plot(neighbors_toplot, results)
    plt.xlabel('Number of neighbours')
    plt.ylabel('Accuracy')
    file_name = "Tenfold_cross_ds=" + str(which)
    plt.savefig("%s.pdf" % file_name)
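The same selection of the number of neighbours can be reproduced standalone with scikit-learn alone; in this sketch make_classification stands in for the project's make_data helpers (an assumption, since those are not shown here):

from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier

X, y = make_classification(n_samples=2000, n_features=2, n_informative=2,
                           n_redundant=0, random_state=0)

# Mean 10-fold cross-validation accuracy for each candidate number of neighbours.
scores_by_k = {
    k: cross_val_score(KNeighborsClassifier(n_neighbors=k), X, y,
                       cv=10, scoring='accuracy').mean()
    for k in range(1, 26)
}
best_k = max(scores_by_k, key=scores_by_k.get)
print("best k: %d with an accuracy of %0.4f" % (best_k, scores_by_k[best_k]))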
Example #5
def make_model(size_ts=10000, size_ls=250, data_set=1, graph=False, n_neigh=1, cv=False):
    
    if data_set == 1:
        [X_train, y_train, X_test, y_test] = make_data1(size_ts,size_ls,0,None)
    else:
        [X_train, y_train, X_test, y_test] = make_data2(size_ts,size_ls,0,None)

    clf = KNeighborsClassifier(n_neighbors=n_neigh)
    clf = clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
                            
    
    if graph:
        plot_boundary(fname="figures/data_" + str(data_set) + "_neighbors" + str(n_neigh), fitted_estimator=clf,
                      X=X_test, y=y_pred, title="data set " + str(data_set) + " with neighbors = " + str(n_neigh))
        
    return accuracy_score(y_test, y_pred)
Example #6
        

      
    # Second data set
    print("\n\nSecond data set :\n")
   
    #Q1
    for n in n_neighbors:
        score = make_model(n_neigh = n, data_set = 2, graph = True)
        print("Accuracy for n_neighbors " + str(n) + " : " + str(score))

    
    #Q2
    #Cross validation 
    for i in range(1, iteration):
        [X_train, y_train, X_test, y_test] = make_data2(10000, 250, 0, False, None)
        best, acc_rate = cross_validation_function(5, X_train, y_train, 200)
        best_arr.append(best)
        acc_rate_arr.append(acc_rate)
    
    counts = np.bincount(best_arr)
    print(np.argmax(counts))
    mean = np.mean(best_arr)
    
    #Q3          
    LS_size = [50, 200, 250, 500]
    
    for size in LS_size:
        res = []
        x = list(range(1, size))
        [X_train, y_train, X_test, y_test] = make_data2(500, size, 0, None)
Example #7
        """Return probability estimates for the test data X.

        Parameters
        ----------
        X : array-like of shape = [n_samples, n_features]
            The input samples.

        Returns
        -------
        p : array of shape = [n_samples, n_classes]
            The class probabilities of the input samples. Classes are ordered
            by lexicographic order.
        """

        # Stack the per-class probabilities column-wise so the result has
        # shape [n_samples, n_classes], as documented above.
        return np.column_stack((self.ppredict_y0, self.ppredict_y1))


if __name__ == "__main__":

    X, y = make_data1(2000, 1)
    X2, y2 = make_data2(2000, 1)
    clf = GaussianNaiveBayes()
    clf = clf.fit(X[:150], y[:150])
    p = clf.predict(X[-1850:])

    clf2 = GaussianNaiveBayes()
    clf2 = clf2.fit(X2[:150], y2[:150])
    p2 = clf2.predict(X2[-1850:])

    print(accuracy_score(y[-1850:], p))
    print(accuracy_score(y2[-1850:], p2))
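As a quick sanity check of the custom classifier above, the same split-and-score pattern can be run against scikit-learn's built-in GaussianNB; make_data1/make_data2 are project helpers, so make_classification is assumed here as a stand-in:

from sklearn.datasets import make_classification
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB

X, y = make_classification(n_samples=2000, n_features=2, n_informative=2,
                           n_redundant=0, random_state=1)
clf = GaussianNB().fit(X[:150], y[:150])              # train on the first 150 samples
print(accuracy_score(y[150:], clf.predict(X[150:])))  # evaluate on the remaining 1850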