def get_sets(nb_samples, nb_training_set, seed, which):
    """
    Generate a dataset and split it into training and testing sets.

    Arguments:
        nb_samples: the number of samples requested from the generator
        nb_training_set: the number of samples to put in the training set
        seed: the seed passed as random_state to both the generator
              and the split
        which: 1 selects make_data1, any other value selects make_data2

    Return:
        The result of train_test_split applied to elements 0 and 1 of
        the generated dataset, with the training proportion
        nb_training_set / nb_samples, its complement as the testing
        proportion, and the seed.
    """
    generator = make_data1 if which == 1 else make_data2
    dataset = generator(nb_samples, random_state=seed)

    train_fraction = nb_training_set / nb_samples
    return train_test_split(
        dataset[0],
        dataset[1],
        train_size=train_fraction,
        test_size=1 - train_fraction,
        random_state=seed,
    )
def make_model(size_ts=10000, size_ls=250, data_set=1, graph=False,
               depth=None, random_state=0):
    """
    Fit a decision tree on one of the two datasets and score it.

    Arguments:
        size_ts: first argument forwarded to the data generator
                 (presumably the test-set size; confirm in data module)
        size_ls: second argument forwarded to the data generator
                 (presumably the learning-set size; confirm in data module)
        data_set: 1 selects make_data1, any other value selects make_data2
        graph: when true, save a decision-boundary plot under "figures/";
               if depth is also None, the fitted tree is plotted and
               rendered with graphviz as well
        depth: max_depth given to DecisionTreeClassifier (None = unlimited)
        random_state: random_state forwarded to the data generator

    Return:
        The accuracy_score of the tree's predictions on the test part
        of the generated data.
    """
    make_data = make_data1 if data_set == 1 else make_data2
    X_train, y_train, X_test, y_test = make_data(
        size_ts, size_ls, 0, False, random_state=random_state)

    clf = DecisionTreeClassifier(max_depth=depth)
    clf = clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    if graph:
        if depth is None:
            plot_boundary(fname=f"figures/data{data_set}_depthNone",
                          fitted_estimator=clf, X=X_test, y=y_pred,
                          title=f"data set {data_set} with depth = None")
            tree.plot_tree(clf)
            dot_data = tree.export_graphviz(clf, out_file=None)
            # Use a separate name so the boolean `graph` parameter is not
            # overwritten by the graphviz object.
            tree_graph = graphviz.Source(dot_data)
            tree_graph.render(f"figures/tree_data{data_set}_depthNone")
        else:
            plot_boundary(fname=f"figures/data{data_set}_depth{depth}",
                          fitted_estimator=clf, X=X_test, y=y_pred,
                          title=f"data set {data_set} with depth = {depth}")

    return accuracy_score(y_test, y_pred)
def test_set1_accuracy(max_depth):
    """
    Compute the accuracy of our model with the first data set.

    Five datasets are generated with make_data1(2000, k) for k = 1..5;
    each is passed to score() together with max_depth, and the second
    value returned by score() is collected.

    Return:
        The mean and the standard deviation of the five collected values.
    """
    collected = []
    for trial in range(1, 6):
        X, y = make_data1(2000, trial)
        # score() returns a pair; only its second element is kept.
        _, test_score = score(X, y, max_depth)
        collected.append(test_score)

    collected = np.asarray(collected)
    return np.mean(collected), np.std(collected)
def tenfold(nb_sub, nb_neighbors, nb_samples, which):
    """
    Apply the K-fold cross-validation strategy to a k-nearest-neighbours
    classifier and plot the mean accuracy as a function of the number
    of neighbours.

    Arguments:
        nb_sub: the number of folds used by the cross validation (it is
                also passed as second argument to the data generator)
        nb_neighbors: the maximal number of neighbours to try (every
                      value from 1 up to this one is evaluated)
        nb_samples: the number of samples in the dataset
        which: 1 selects make_data1, any other value selects make_data2

    Return:
        /  (the best setting is printed and the curve is saved to
        "Tenfold_cross_ds=<which>.pdf")
    """
    if which == 1:
        dataset = make_data1(nb_samples, nb_sub)
    else:
        dataset = make_data2(nb_samples, nb_sub)

    tested_neighbors = []
    mean_accuracies = []
    best_neighbors = -1
    best_accuracy = -1

    # Cross-validate one classifier per candidate number of neighbours.
    k = 1
    while k <= nb_neighbors:
        classifier = KNeighborsClassifier(n_neighbors=k)
        fold_scores = cross_val_score(classifier, dataset[0], dataset[1],
                                      cv=nb_sub, scoring='accuracy')
        accuracy = fold_scores.mean()

        tested_neighbors.append(k)
        mean_accuracies.append(accuracy)

        # Strict comparison: on a tie the smallest number of neighbours wins.
        if accuracy > best_accuracy:
            best_accuracy, best_neighbors = accuracy, k
        k += 1

    message = ("The optimal number of neighbours is: " + str(best_neighbors)
               + " with an accuracy of %0.4f" % best_accuracy)
    print(message)

    plt.plot(tested_neighbors, mean_accuracies)
    plt.xlabel('Number of neighbours')
    plt.ylabel('Accuracy')
    file_name = "Tenfold_cross_ds=" + str(which)
    plt.savefig("%s.pdf" % file_name)
def make_model(size_ts=10000, size_ls=250, data_set=1, graph=False,
               n_neigh=1, cv=False):
    """
    Fit a k-nearest-neighbours classifier on one of the two datasets
    and score it.

    Arguments:
        size_ts: first argument forwarded to the data generator
                 (presumably the test-set size; confirm in data module)
        size_ls: second argument forwarded to the data generator
                 (presumably the learning-set size; confirm in data module)
        data_set: 1 selects make_data1, any other value selects make_data2
        graph: when true, save a decision-boundary plot under "figures/"
        n_neigh: the number of neighbours used by the classifier
        cv: unused in this function; kept so existing callers still work

    Return:
        The accuracy_score of the classifier's predictions on the test
        part of the generated data.
    """
    make_data = make_data1 if data_set == 1 else make_data2
    X_train, y_train, X_test, y_test = make_data(size_ts, size_ls, 0, None)

    clf = KNeighborsClassifier(n_neighbors=n_neigh)
    clf = clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    if graph:
        # Label the plot with this call's own n_neigh rather than a
        # module-level loop variable that may not exist or may differ.
        plot_boundary(fname=f"figures/data_{data_set}_neighbors{n_neigh}",
                      fitted_estimator=clf, X=X_test, y=y_pred,
                      title=f"data set {data_set} with neighbors = {n_neigh}")

    return accuracy_score(y_test, y_pred)
# First data set
print("First data set :\n")

# Q1: fit one model per candidate number of neighbours, saving the
# boundary plot and reporting the accuracy returned by make_model.
for n in n_neighbors:
    score = make_model(n_neigh=n, data_set=1, graph=True)
    print("Accuracy for n_neighbors " + str(n) + " : " + str(score))

# Q3: for each value in LS_size, plot accuracy against the number of
# neighbours (1 .. size - 1) and save one figure per value.
LS_size = [50, 200, 250, 500]
for size in LS_size:
    res = []
    x = list(range(1, size))
    X_train, y_train, X_test, y_test = make_data1(500, size, 0, None)

    for n in x:
        clf = KNeighborsClassifier(n_neighbors=n).fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        res.append(accuracy_score(y_test, y_pred))

    plt.figure()
    plt.plot(x, res)
    plt.xlabel('Neighbors')
    plt.ylabel('Accuracy')
    plt.savefig('data_1_' + str(size))
ELEN0062 - Introduction to machine learning Project 1 - Classification algorithms """ #! /usr/bin/env python # -*- coding: utf-8 -*- import numpy as np from sklearn.tree import DecisionTreeClassifier from sklearn.model_selection import train_test_split from sklearn.metrics import accuracy_score from data import make_data1, make_data2 from plot import plot_boundary X, y = make_data1(2000) # data_train = dataset[0:149] # data_test = dataset[150:] x_data_train, x_data_test, y_data_train, y_data_test = train_test_split( X, y, test_size=150 / 2000, random_state=None) # data_test, data_hold = train_test_split(data_test_hold, test_size=0.33, random_state=21) def tree(max_depth_input=None, fname=""): model = DecisionTreeClassifier(criterion='gini', splitter='best', max_depth=max_depth_input, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0,
X1X2_X_test.append(X_test[i, 0] * X_test[i, 1]) modified_X_test = np.column_stack( (X_test[:, 0], X_test[:, 1], X1X1_X_test, X2X2_X_test, X1X2_X_test)) return modified_X_train, modified_X_test if __name__ == "__main__": size_ts = 10000 size_ls = 250 [X_train, y_train, X_test, y_test] = make_data1(size_ts, size_ls, 0, random_state=0) X_train = np.array(X_train) y_train = np.array(y_train) X_test = np.array(X_test) y_test = np.array(y_test) clf = residual_fitting() [X_train_add, X_test_add] = clf.add_attributes(X_train, X_test) # clf.fit(X=X_train, y=y_train) clf.fit(X=X_train_add, y=y_train) # clf.predict(X_test) clf.predict(X_test_add)