def make_model(size_ts=10000, size_ls=250, data_set=1, graph=False,
               depth=None, random_state=0):
    """Fit a decision tree on one of the two data sets and return its accuracy.

    Arguments:
        size_ts, size_ls: forwarded unchanged as the first two positional
            arguments of the data generator (presumably the test-set and
            learning-set sizes -- confirm against make_data1/make_data2).
        data_set: 1 selects make_data1; any other value selects make_data2.
        graph: when true, save the decision boundary under "figures/"; for
            an unbounded tree (depth is None) the tree itself is also
            plotted and rendered through Graphviz.
        depth: value passed as max_depth to DecisionTreeClassifier.
        random_state: seed forwarded to the data generator.

    Return:
        The value of accuracy_score(y_test, y_pred) for the fitted tree.
    """
    make_data = make_data1 if data_set == 1 else make_data2
    X_train, y_train, X_test, y_test = make_data(
        size_ts, size_ls, 0, False, random_state=random_state)

    clf = DecisionTreeClassifier(max_depth=depth)
    clf = clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    if graph:
        # str(None) == "None", so a single call produces exactly the same
        # file name and title as the two separate branches used to.
        plot_boundary(fname="figures/data" + str(data_set) + "_depth" + str(depth),
                      fitted_estimator=clf, X=X_test, y=y_pred,
                      title="data set " + str(data_set) + " with depth = " + str(depth))
        if depth is None:
            tree.plot_tree(clf)
            dot_data = tree.export_graphviz(clf, out_file=None)
            # Use a dedicated name instead of rebinding the `graph` parameter.
            tree_source = graphviz.Source(dot_data)
            tree_source.render("figures/tree_data" + str(data_set) + "_depthNone")

    return accuracy_score(y_test, y_pred)
def get_sets(nb_samples, nb_training_set, seed, which):
    """
    Generate a dataset and split it into a training and a testing set.

    Arguments:
        nb_samples: the number of samples requested from the data generator
        nb_training_set: size of the training set, as a number of samples
        seed: the seed used both to generate the data and to shuffle the split
        which: which dataset should be used (1 selects make_data1, any other
               value selects make_data2)

    Return:
        The result of the function train_test_split applied to the X and y
        parts of the dataset (its first two elements), with
        nb_training_set samples in the training set and
        nb_samples - nb_training_set samples in the testing set.
    """
    make_data = make_data1 if which == 1 else make_data2
    dataset = make_data(nb_samples, random_state=seed)

    # Sizes are passed as absolute counts instead of the float proportion
    # nb_training_set / nb_samples: scikit-learn floors
    # proportion * n_samples, so float rounding could silently drop a
    # sample (29 / 100 -> 0.29 * 100 = 28.999... -> 28 training samples).
    # NOTE(review): assumes the generator returns nb_samples samples and that
    # both sizes are integers -- confirm against the callers.
    return train_test_split(dataset[0], dataset[1],
                            train_size=nb_training_set,
                            test_size=nb_samples - nb_training_set,
                            random_state=seed)
def test_set2_accuracy(max_depth):
    """Return the mean and standard deviation of the score on the second data set.

    Five data sets are drawn with make_data2(2000, k) for k = 1..5. For each
    one, score(X, y, max_depth) is evaluated and the second of the two values
    it returns is kept (the first one is discarded).
    """
    def _second_score(k):
        # One draw of the data followed by one evaluation.
        X, y = make_data2(2000, k)
        _first, second = score(X, y, max_depth)
        return second

    collected = np.asarray([_second_score(k) for k in range(1, 6)])
    return np.mean(collected), np.std(collected)
def tenfold(nb_sub, nb_neighbors, nb_samples, which):
    """
    Apply the K-fold cross-validation strategy to a k-nearest-neighbours
    classifier for every number of neighbours from 1 to nb_neighbors, print
    the best one and save a plot of the mean accuracy as a function of the
    number of neighbours.

    Argument:
        nb_sub: the number of sub-divisions (folds) of the samples used by
                the K-fold strategy; it is also forwarded as the second
                positional argument of the data generator
        nb_neighbors: the maximal number of neighbours
        nb_samples: the number of samples in the dataset
        which: which dataset should be used (1 selects make_data1, any other
               value selects make_data2)

    Return:
        /
    """
    generate = make_data1 if which == 1 else make_data2
    dataset = generate(nb_samples, nb_sub)

    tried_k = []
    mean_accuracies = []
    best_k = -1
    best_accuracy = -1

    # Cross-validate every candidate number of neighbours in increasing order.
    for k in range(1, nb_neighbors + 1):
        knn = KNeighborsClassifier(n_neighbors=k)
        fold_scores = cross_val_score(knn, dataset[0], dataset[1],
                                      cv=nb_sub, scoring='accuracy')
        accuracy = fold_scores.mean()
        mean_accuracies.append(accuracy)
        tried_k.append(k)

        # Strict comparison: on a tie the smallest number of neighbours wins.
        if accuracy > best_accuracy:
            best_accuracy, best_k = accuracy, k

    print("The optimal number of neighbours is: " + str(best_k) + \
          " with an accuracy of %0.4f" % best_accuracy)

    plt.plot(tried_k, mean_accuracies)
    plt.xlabel('Number of neighbours')
    plt.ylabel('Accuracy')
    file_name = "Tenfold_cross_ds=" + str(which)
    plt.savefig("%s.pdf" % file_name)
def make_model(size_ts=10000, size_ls=250, data_set=1, graph=False,
               n_neigh=1, cv=False):
    """Fit a k-nearest-neighbours classifier and return its accuracy.

    Arguments:
        size_ts, size_ls: forwarded unchanged as the first two positional
            arguments of the data generator (presumably the test-set and
            learning-set sizes -- confirm against make_data1/make_data2).
        data_set: 1 selects make_data1; any other value selects make_data2.
        graph: when true, save the decision boundary under "figures/".
        n_neigh: value passed as n_neighbors to KNeighborsClassifier.
        cv: accepted for compatibility with existing callers; not used here.

    Return:
        The value of accuracy_score(y_test, y_pred) for the fitted model.
    """
    make_data = make_data1 if data_set == 1 else make_data2
    X_train, y_train, X_test, y_test = make_data(size_ts, size_ls, 0, None)

    clf = KNeighborsClassifier(n_neighbors=n_neigh)
    clf = clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    if graph:
        # Label the figure with this call's own n_neigh. The previous code
        # read a name `n` that is neither a parameter nor a local, so it only
        # worked when the caller happened to define a global loop variable
        # `n`, and raised NameError otherwise.
        plot_boundary(fname="figures/data_" + str(data_set) + "_neighbors" + str(n_neigh),
                      fitted_estimator=clf, X=X_test, y=y_pred,
                      title="data set " + str(data_set) + " with neighbors = " + str(n_neigh))

    return accuracy_score(y_test, y_pred)
# Second data set print("\n\nSecond data set :\n") #Q1 for n in n_neighbors : score = make_model(n_neigh = n, data_set = 2, graph = True) print("Accuracy for n_neighbors " + str(n) + " : " + str(score)) #Q2 #Cross validation for i in list(range(1, iteration)): [X_train, y_train, X_test, y_test] = make_data2(10000, 250, 0, False ,None) best, acc_rate = cross_validation_function(5, X_train, y_train, 200) best_arr.append(best) acc_rate_arr.append(acc_rate) counts = np.bincount(best_arr) print(np.argmax(counts)) mean = np.mean(best_arr) #Q3 LS_size = [50, 200, 250, 500] for size in LS_size: res = [] x = list(range(1, size)) [X_train, y_train, X_test, y_test] = make_data2(500, size, 0, None)
"""Return probability estimates for the test data X. Parameters ---------- X : array-like of shape = [n_samples, n_features] The input samples. Returns ------- p : array of shape = [n_samples, n_classes] The class probabilities of the input samples. Classes are ordered by lexicographic order. """ return np.array([self.ppredict_y0, self.ppredict_y1]) if __name__ == "__main__": X, y = make_data1(2000, 1) X2, y2 = make_data2(2000, 1) clf = GaussianNaiveBayes() clf = clf.fit(X[:150], y[:150]) p = clf.predict(X[-1850:]) clf2 = GaussianNaiveBayes() clf2 = clf2.fit(X2[:150], y2[:150]) p2 = clf2.predict(X2[-1850:]) print(accuracy_score(y[-1850:], p)) print(accuracy_score(y2[-1850:], p2))