Example #1
from sklearn.model_selection import train_test_split

from data import make_data1, make_data2


def get_sets(nb_samples, nb_training_set, seed, which):
    """
    Return the training and testing sets for a given number of samples,
    training-set size, random seed and dataset.

    Arguments:
        nb_samples: the number of samples in the dataset
        nb_training_set: the number of samples in the training set
        seed: the seed used for the random operations
        which: which dataset generator should be used (1 or 2)

    Return:
        The result of train_test_split applied to the X and y parts of the
        dataset, with the corresponding training and testing proportions and
        the seed.
    """
    if which == 1:
        dataset = make_data1(nb_samples, random_state=seed)
    else:
        dataset = make_data2(nb_samples, random_state=seed)

    proportion_training_set = nb_training_set / nb_samples

    return train_test_split(dataset[0],
                            dataset[1],
                            train_size=proportion_training_set,
                            test_size=1 - proportion_training_set,
                            random_state=seed)
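# A quick usage sketch (the sample counts and seed are illustrative, not from
# the original script): 1500 samples, 300 of them for training, dataset 1.
X_train, X_test, y_train, y_test = get_sets(1500, 300, 42, which=1)
print(X_train.shape, X_test.shape)  # e.g. (300, 2) and (1200, 2) for 2-D data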
Example #2
import graphviz

from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

from data import make_data1, make_data2
from plot import plot_boundary


def make_model(size_ts=10000, size_ls=250, data_set=1, graph=False, depth=None, random_state=0):
    """Fit a decision tree of the given maximal depth on one of the two
    datasets and return its test-set accuracy, optionally plotting the
    decision boundary and the fitted tree."""
    if data_set == 1:
        [X_train, y_train, X_test, y_test] = make_data1(size_ts, size_ls, 0, False, random_state=random_state)
    else:
        [X_train, y_train, X_test, y_test] = make_data2(size_ts, size_ls, 0, False, random_state=random_state)

    clf = DecisionTreeClassifier(max_depth=depth)
    clf = clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    if graph:
        if depth is None:
            plot_boundary(fname="figures/data"+str(data_set)+"_depthNone", fitted_estimator=clf,
                          X=X_test, y=y_pred, title="data set "+str(data_set)+" with depth = None")

            tree.plot_tree(clf)
            dot_data = tree.export_graphviz(clf, out_file=None)
            # Use a separate name so the boolean `graph` argument is not shadowed.
            dot_graph = graphviz.Source(dot_data)
            dot_graph.render("figures/tree_data"+str(data_set)+"_depthNone")
        
        else:
            plot_boundary(fname="figures/data"+str(data_set)+"_depth"+str(depth),fitted_estimator=clf,
                          X=X_test,y=y_pred,title="data set "+str(data_set)+" with depth = "+str(depth))
                   
    return accuracy_score(y_test, y_pred)
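# A short usage sketch (the depth list is illustrative): sweep tree depths on
# the first dataset and print the resulting test accuracies.
for depth in [1, 2, 4, 8, None]:
    acc = make_model(data_set=1, depth=depth, graph=False)
    print("depth =", str(depth), "-> accuracy =", str(acc))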
Example #3
import numpy as np

from data import make_data1


def test_set1_accuracy(max_depth):
    """Compute the mean and the standard deviation of the test accuracy on
    the first dataset, over five random generations."""
    test = []
    for i in range(5):
        X, y = make_data1(2000, i + 1)
        tr, te = score(X, y, max_depth)
        test.append(te)

    test = np.asarray(test)
    my_mean = np.mean(test)
    my_std = np.std(test)
    return my_mean, my_std
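# The score helper is not shown in this example; a minimal sketch of what it
# plausibly does, assuming it splits the data, fits a depth-limited tree and
# returns the pair (train accuracy, test accuracy). The 80/20 split is an
# assumption, not the original code.
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score


def score(X, y, max_depth):
    X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2)
    clf = DecisionTreeClassifier(max_depth=max_depth).fit(X_tr, y_tr)
    return (accuracy_score(y_tr, clf.predict(X_tr)),
            accuracy_score(y_te, clf.predict(X_te)))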
Example #4
import matplotlib.pyplot as plt

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score

from data import make_data1, make_data2


def tenfold(nb_sub, nb_neighbors, nb_samples, which):
    """
    Implement the K-fold cross-validation strategy and plot the different
    accuracies as a function of the number of neighbors.

    Arguments:
        nb_sub: the number of subdivisions of the samples used for the K-fold strategy
        nb_neighbors: the maximal number of neighbors
        nb_samples: the number of samples in the dataset
        which: which dataset should be used

    Return:
        /
    """
    results = []
    neighbors_toplot = []
    optimal_nb_neighbors = -1
    max_score = -1
    neighbors = 1

    if which == 1:
        dataset = make_data1(nb_samples, nb_sub)
    else:
        dataset = make_data2(nb_samples, nb_sub)

    # Ten-fold cross validation strategy
    while neighbors <= nb_neighbors:
        knn = KNeighborsClassifier(n_neighbors=neighbors)
        scores = cross_val_score(knn,
                                 dataset[0],
                                 dataset[1],
                                 cv=nb_sub,
                                 scoring='accuracy')
        mean_score = scores.mean()
        results.append(mean_score)
        neighbors_toplot.append(neighbors)

        # Determination of the optimal number of neighbours
        if mean_score > max_score:
            max_score = mean_score
            optimal_nb_neighbors = neighbors

        neighbors += 1

    print("The optimal number of neighbours is: " + str(optimal_nb_neighbors) + \
            " with an accuracy of %0.4f" %max_score)

    plt.plot(neighbors_toplot, results)
    plt.xlabel('Number of neighbours')
    plt.ylabel('Accuracy')
    file_name = "Tenfold_cross_ds=" + str(which)
    plt.savefig("%s.pdf" % file_name)
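# A usage sketch (parameter values are illustrative): ten folds, neighbour
# counts 1..50, 500 samples, first dataset.
tenfold(nb_sub=10, nb_neighbors=50, nb_samples=500, which=1)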
Example #5
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

from data import make_data1, make_data2
from plot import plot_boundary


def make_model(size_ts=10000, size_ls=250, data_set=1, graph=False, n_neigh=1, cv=False):
    """Fit a k-nearest-neighbors classifier on one of the two datasets and
    return its test-set accuracy, optionally plotting the decision boundary."""
    if data_set == 1:
        [X_train, y_train, X_test, y_test] = make_data1(size_ts, size_ls, 0, None)
    else:
        [X_train, y_train, X_test, y_test] = make_data2(size_ts, size_ls, 0, None)

    clf = KNeighborsClassifier(n_neighbors=n_neigh)
    clf = clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    if graph:
        plot_boundary(fname="figures/data_" + str(data_set) + "_neighbors" + str(n_neigh), fitted_estimator=clf,
                      X=X_test, y=y_pred, title="data set " + str(data_set) + " with neighbors = " + str(n_neigh))
        
    return accuracy_score(y_test, y_pred)
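# A short usage sketch (the neighbour counts are illustrative): sweep k on the
# second dataset and print the resulting test accuracies.
for n_neigh in [1, 5, 25, 125]:
    acc = make_model(data_set=2, n_neigh=n_neigh)
    print("n_neighbors =", str(n_neigh), "-> accuracy =", str(acc))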
Example #6
    # First data set
    print("First data set:\n")

    # Q1
    # n_neighbors is the list of neighbour counts to test, assumed to be
    # defined earlier in the script.
    for n in n_neighbors:
        score = make_model(n_neigh=n, data_set=1, graph=True)
        print("Accuracy for n_neighbors " + str(n) + " : " + str(score))
    
    
    # Q3: accuracy as a function of the number of neighbors, for several
    # learning-set sizes.
    LS_size = [50, 200, 250, 500]

    for size in LS_size:
        res = []
        x = list(range(1, size))
        [X_train, y_train, X_test, y_test] = make_data1(500, size, 0, None)

        for n in x:
            clf = KNeighborsClassifier(n_neighbors=n)
            clf = clf.fit(X_train, y_train)
            y_pred = clf.predict(X_test)
            res.append(accuracy_score(y_test, y_pred))

        plt.figure()
        plt.plot(x, res)
        plt.xlabel('Neighbors')
        plt.ylabel('Accuracy')
        plt.savefig('data_1_' + str(size))
Example #7
#! /usr/bin/env python
# -*- coding: utf-8 -*-
"""
ELEN0062 - Introduction to machine learning
Project 1 - Classification algorithms
"""

import numpy as np

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from data import make_data1, make_data2
from plot import plot_boundary

X, y = make_data1(2000)
# data_train  = dataset[0:149]
# data_test = dataset[150:]

x_data_train, x_data_test, y_data_train, y_data_test = train_test_split(
    X, y, test_size=150 / 2000, random_state=None)
# data_test, data_hold = train_test_split(data_test_hold, test_size=0.33, random_state=21)


def tree(max_depth_input=None, fname=""):
    # The tail of this function was truncated with the middle of the file;
    # it is completed minimally here: fit on the split above and return the
    # test accuracy.
    model = DecisionTreeClassifier(criterion='gini',
                                   splitter='best',
                                   max_depth=max_depth_input,
                                   min_samples_split=2,
                                   min_samples_leaf=1,
                                   min_weight_fraction_leaf=0.0)
    model.fit(x_data_train, y_data_train)
    return accuracy_score(y_data_test, model.predict(x_data_test))


class residual_fitting:
    # Only add_attributes survives from the original class; it is rebuilt
    # around the surviving test-set lines. fit and predict are sketched at
    # the end of this example.
    def add_attributes(self, X_train, X_test):
        # Augment both sets with the quadratic terms X1*X1, X2*X2 and X1*X2.
        modified_X_train = np.column_stack(
            (X_train[:, 0], X_train[:, 1], X_train[:, 0] ** 2,
             X_train[:, 1] ** 2, X_train[:, 0] * X_train[:, 1]))
        modified_X_test = np.column_stack(
            (X_test[:, 0], X_test[:, 1], X_test[:, 0] ** 2,
             X_test[:, 1] ** 2, X_test[:, 0] * X_test[:, 1]))
        return modified_X_train, modified_X_test


if __name__ == "__main__":

    size_ts = 10000
    size_ls = 250

    [X_train, y_train, X_test, y_test] = make_data1(size_ts,
                                                    size_ls,
                                                    0,
                                                    random_state=0)
    X_train = np.array(X_train)
    y_train = np.array(y_train)
    X_test = np.array(X_test)
    y_test = np.array(y_test)

    clf = residual_fitting()

    [X_train_add, X_test_add] = clf.add_attributes(X_train, X_test)

    # clf.fit(X=X_train, y=y_train)
    clf.fit(X=X_train_add, y=y_train)

    # clf.predict(X_test)
    clf.predict(X_test_add)
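# The original fit and predict of residual_fitting were lost with the middle
# of this file. Below is a minimal sketch of one plausible residual-fitting
# implementation: the intercept is the mean output, each following weight is
# the least-squares fit of the current residual on a single attribute, and
# predictions threshold the linear combination at 0.5. The w_ attribute, the
# update rule and the threshold are all assumptions, not the original code.
class residual_fitting:
    # add_attributes is the method shown earlier in this example.

    def fit(self, X, y):
        # Intercept first, then one weight per attribute, fitted in order on
        # the residual left by the previously fitted terms.
        self.w_ = np.zeros(X.shape[1] + 1)
        self.w_[0] = np.mean(y)
        residual = y - self.w_[0]
        for k in range(X.shape[1]):
            a = X[:, k]
            self.w_[k + 1] = np.dot(residual, a) / np.dot(a, a)
            residual = residual - self.w_[k + 1] * a
        return self

    def predict(self, X):
        # Binary labels from thresholding the fitted linear combination.
        return (self.w_[0] + X.dot(self.w_[1:]) >= 0.5).astype(int)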