def load_iris_dataset(self):
    # Loads the Iris dataset from scikit-learn, keeps feature columns 2 and 3,
    # splits it 70/30 and stores the standardized partitions on
    # self.x_train / self.y_train / self.x_test / self.y_test.
    # The classes are already converted to integer labels where
    # 0=Iris-Setosa, 1=Iris-Versicolor, 2=Iris-Virginica.
    iris = datasets.load_iris()
    x = iris.data[:, [2, 3]]
    y = iris.target
    print('Class labels:', np.unique(y))

    # Plot the raw (unscaled) data and save it to file
    Plotter.plot_iris_data_set(
        x,
        FilesystemUtils.get_test_resources_plot_file_name(
            'ScikitLearn-Iris-Training-Set.png'))

    # Split into 70% training and 30% test data; stratify keeps the class
    # proportions of y in both partitions.
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.3, random_state=1, stratify=y)
    print('Labels counts in y:', np.bincount(y))
    print('Labels counts in y_train:', np.bincount(y_train))
    print('Labels counts in y_test:', np.bincount(y_test))

    # Standardize features. The scaler is fitted on the training data ONLY and
    # the same mean/scale is then applied to the test data: re-fitting on the
    # test set would put the two partitions on different scales and leak
    # test-set statistics into the evaluation.
    sc = StandardScaler()
    sc.fit(x_train)
    x_train_std = sc.transform(x_train)
    x_test_std = sc.transform(x_test)

    self.x_train = x_train_std
    self.y_train = y_train
    self.x_test = x_test_std
    self.y_test = y_test
def test_logistic_regresssion(self):
    # Trains LogisticRegressionBGD on self.x / self.y and saves its learning
    # curve and decision boundary plots.
    logistic_regression = LogisticRegressionBGD(learning_rate=0.05,
                                                num_epochs=1000)
    logistic_regression.fit(self.x, self.y)

    # plot learning curve
    curve = {
        'cost_length': len(logistic_regression.cost),
        'cost': np.log10(logistic_regression.cost),
        'marker': 'o',
        'x_label': 'Epochs',
        # The plotted series is log10 of the per-epoch cost, so the axis is
        # labelled accordingly (it is not a count of weight updates).
        'y_label': 'log(Cost)',
        'title': 'Logistic regression - Learning rate 0.05'
    }
    Plotter.plot_learning_curve(
        curve,
        FilesystemUtils.get_test_resources_plot_file_name(
            'logistic_regression/LogisticRegressionBGD-Learning-Curve.png'))

    # plot decision boundary
    # NOTE(review): these axis labels should name the two features actually
    # stored in self.x by the fixture (and their units) - confirm.
    diagram_options = {
        'x_label': 'sepal length [cm]',
        'y_label': 'petal length [cm]',
        'legend': 'upper left'
    }
    Plotter.plot_decision_boundary(
        self.x,
        self.y,
        classifier=logistic_regression,
        diagram_options=diagram_options,
        image_file_path=FilesystemUtils.get_test_resources_plot_file_name(
            'logistic_regression/LogisticRegressionBGD-Decision-Boundary.png'))
def test_perceptron(self):
    # Fits a Perceptron on self.x / self.y, then writes its learning curve
    # and decision boundary plots to the test resources folder.
    model = Perceptron(learning_rate=0.1, num_epochs=10)
    model.fit(self.x, self.y)

    # Learning curve: one point per entry of model.cost
    learning_curve = dict(
        cost_length=len(model.cost),
        cost=model.cost,
        marker='o',
        x_label='Epochs',
        y_label='Number of updates',
        title='Perceptron - Learning rate 0.1',
    )
    learning_curve_file = FilesystemUtils.get_test_resources_plot_file_name(
        'perceptron/Perceptron-Learning-Curve.png')
    Plotter.plot_learning_curve(learning_curve, learning_curve_file)

    # Decision boundary of the trained model over the same data
    boundary_options = dict()
    boundary_options['x_label'] = 'sepal length [cm]'
    boundary_options['y_label'] = 'petal length [cm]'
    boundary_options['legend'] = 'upper left'
    boundary_file = FilesystemUtils.get_test_resources_plot_file_name(
        'perceptron/Perceptron-Decision-Boundary.png')
    Plotter.plot_decision_boundary(
        self.x,
        self.y,
        classifier=model,
        diagram_options=boundary_options,
        image_file_path=boundary_file)
def setUp(self):
    # Reads the Iris data file through IrisDataReader, keeps the returned
    # features / labels on self.x / self.y and saves a plot of the features.
    data_file = FilesystemUtils.get_resources_data_file_name('iris/iris.data')
    reader = IrisDataReader(data_file)
    features, labels = reader.get_data()
    self.x = features
    self.y = labels

    # Save a picture of the training set
    training_set_plot = FilesystemUtils.get_test_resources_plot_file_name(
        'adaline/Adaline-Training-Set.png')
    Plotter.plot_iris_data_set(self.x, training_set_plot)
def load_svm_nonlinear_data_set(self):
    # Builds a 200-sample, 2-feature XOR-style data set that is not linearly
    # separable, plots it, and stores it on self.x_train / self.y_train.
    np.random.seed(1)  # fixed seed so the generated points are reproducible
    samples = np.random.randn(200, 2)

    # A sample is labelled 1 when exactly one of its two features is
    # positive, otherwise -1.
    first_is_positive = samples[:, 0] > 0
    second_is_positive = samples[:, 1] > 0
    exactly_one_positive = np.logical_xor(first_is_positive, second_is_positive)
    labels = np.where(exactly_one_positive, 1, -1)

    # Save a picture of the generated data
    plot_file = FilesystemUtils.get_test_resources_plot_file_name(
        'svm/SVM-ScikitLearn-NonLinear-Training-Set.png')
    Plotter.plot_svm_nonlinear_data_set(samples, labels, plot_file)

    self.x_train = samples
    self.y_train = labels
def test_scikit_learn_svm_nonlinear(self):
    # Fits an RBF-kernel SVC on self.x_train / self.y_train and saves its
    # decision boundary plot.
    # gamma (0.1 here) sets the width of the Gaussian kernel: larger values
    # narrow each training sample's region of influence, which produces a
    # tighter and bumpier decision boundary.
    classifier = SVC(kernel='rbf', random_state=1, gamma=0.10, C=10.0)
    classifier.fit(self.x_train, self.y_train)

    plot_options = dict(x_label='feature 1', y_label='feature 2', legend='best')
    boundary_file = FilesystemUtils.get_test_resources_plot_file_name(
        'svm/SVM-ScikitLearn-NonLinear-Decision-Boundary.png')
    Plotter.plot_decision_boundary(
        self.x_train,
        self.y_train,
        classifier,
        plot_options,
        image_file_path=boundary_file)
def predict_and_evaluate(self, perceptron: Perceptron, image_file_path: str = None):
    # Evaluates an already fitted classifier on self.x_test / self.y_test and
    # plots its decision boundary over the combined train + test data.
    #
    # perceptron:      fitted classifier exposing predict() and score()
    # image_file_path: forwarded unchanged to Plotter.plot_decision_boundary

    # Run predictions and count the number of misclassified examples
    y_pred = perceptron.predict(self.x_test)
    print('Misclassified samples: %d' % (self.y_test != y_pred).sum())

    # Evaluate model accuracy
    # Each classifier in scikit-learn has a score method, which computes a
    # classifier's prediction accuracy by combining the predict call with the
    # accuracy_score call
    print('Accuracy: %.2f' % perceptron.score(self.x_test, self.y_test))

    # Stack training rows first and test rows after them
    x_combined_std = np.vstack((self.x_train, self.x_test))
    y_combined = np.hstack((self.y_train, self.y_test))

    # Positions of the test rows inside the stacked arrays. Derived from the
    # actual partition sizes instead of the hard-coded range(105, 150), which
    # was only valid for a 105/45 split (it yields the same range there).
    # NOTE(review): presumably Plotter treats 'draw_test_samples' as row
    # indices into the combined arrays - confirm.
    num_train = len(self.x_train)
    test_sample_indices = range(num_train, num_train + len(self.x_test))

    # Show decision boundary
    diagram_options = {
        'x_label': 'petal length [standardized]',
        'y_label': 'petal width [standardized]',
        'legend': 'upper left',
        'draw_test_samples': test_sample_indices
    }
    Plotter.plot_decision_boundary(x_combined_std,
                                   y_combined,
                                   perceptron,
                                   diagram_options,
                                   image_file_path=image_file_path)
def setUp(self):
    # Loads the scikit-learn Iris data, keeps feature columns 2 and 3 of the
    # samples labelled 0 or 1, standardizes them and stores the result on
    # self.x / self.y; a plot of the standardized features is saved to file.
    iris = datasets.load_iris()
    features = iris.data[:, [2, 3]]
    labels = iris.target

    # Boolean mask selecting only the samples of classes 0 and 1
    is_class_0_or_1 = (labels == 0) | (labels == 1)
    features_01 = features[is_class_0_or_1]
    labels_01 = labels[is_class_0_or_1]

    # Standardize the selected features
    scaler = StandardScaler()
    scaler.fit(features_01)
    self.x = scaler.transform(features_01)
    self.y = labels_01
    print('Class labels:', np.unique(self.y))

    # Save a picture of the training set
    training_set_plot = FilesystemUtils.get_test_resources_plot_file_name(
        'logistic_regression/LogisticRegressionBGD-Training-Set.png')
    Plotter.plot_iris_data_set(self.x, training_set_plot)
def test_adaline(self):
    # Trains two AdalineBGD models on self.x / self.y with different learning
    # rates, then saves both learning curves side by side and one decision
    # boundary plot per model.

    # first model: larger learning rate
    adaline1 = AdalineBGD(learning_rate=0.01, num_epochs=30)
    adaline1.fit(self.x, self.y)

    # second model: smaller learning rate
    adaline2 = AdalineBGD(learning_rate=0.0001, num_epochs=30)
    adaline2.fit(self.x, self.y)

    # plot multiple learning curves for both adaline trained models
    curves = [{
        'cost_length': len(adaline1.cost),
        'cost': np.log10(adaline1.cost),
        'marker': 'o',
        'x_label': 'Epochs',
        'y_label': 'log(Sum-squared-error)',
        # Title now matches the learning rate adaline1 is trained with
        # (it previously read 0.1).
        'title': 'Adaline - Learning rate 0.01'
    }, {
        'cost_length': len(adaline2.cost),
        'cost': np.log10(adaline2.cost),
        'marker': 'o',
        'x_label': 'Epochs',
        'y_label': 'log(Sum-squared-error)',
        'title': 'Adaline - Learning rate 0.0001'
    }]
    Plotter.plot_multiple_learning_curves(
        curves,
        image_file_path=FilesystemUtils.get_test_resources_plot_file_name(
            'adaline/AdalineBGD-Learning-Curves.png'))

    # plot one decision boundary per model: adaline1 is saved as the
    # "Divergent" plot and adaline2 as the "Convergent" plot
    boundary_plots = (
        (adaline1, 'adaline/AdalineBGD-Decision-Boundary-Divergent.png'),
        (adaline2, 'adaline/AdalineBGD-Decision-Boundary-Convergent.png'),
    )
    for model, plot_name in boundary_plots:
        Plotter.plot_decision_boundary(
            self.x,
            self.y,
            classifier=model,
            diagram_options={
                'x_label': 'sepal length [cm]',
                'y_label': 'petal length [cm]',
                'legend': 'upper left'
            },
            image_file_path=FilesystemUtils.get_test_resources_plot_file_name(
                plot_name))
def test_adaline_with_stochastic_update(self):
    # Standardizes the first two feature columns of self.x, trains AdalineSGD
    # on them, saves the training-set, learning-curve and decision-boundary
    # plots, and finally feeds one sample through partial_fit.

    # standardize features (zero mean, unit standard deviation per column);
    # np.copy returns a plain ndarray, so self.x itself is left untouched
    x_std: np.ndarray = np.copy(self.x)
    x_std[:, 0] = (self.x[:, 0] - self.x[:, 0].mean()) / self.x[:, 0].std()
    x_std[:, 1] = (self.x[:, 1] - self.x[:, 1].mean()) / self.x[:, 1].std()

    # plot the standardized data and save it to file
    Plotter.plot_iris_data_set(
        x_std,
        FilesystemUtils.get_test_resources_plot_file_name(
            'adaline/AdalineSGD-Standardized-Training-Set.png'))

    # train adaline on standardized features with a small number of epochs
    adaline = AdalineSGD(learning_rate=0.01, num_epochs=15)
    adaline.fit(x_std, self.y)

    # plot learning curve
    # NOTE(review): the cost is plotted as-is (no log10 applied here), yet
    # the y label says 'log(...)' - confirm which of the two is intended.
    curve = {
        'cost_length': len(adaline.cost),
        'cost': adaline.cost,
        'marker': 'o',
        'x_label': 'Epochs',
        'y_label': 'log(Sum-squared-error)',
        'title': 'Adaline - Learning rate 0.01'
    }
    Plotter.plot_learning_curve(
        curve,
        FilesystemUtils.get_test_resources_plot_file_name(
            'adaline/AdalineSGD-Learning-Curve-Standardized-Features.png'))

    # plot decision boundary; the axes show the standardized features, so
    # they are labelled as such rather than in centimetres
    Plotter.plot_decision_boundary(
        x_std,
        self.y,
        classifier=adaline,
        diagram_options={
            'x_label': 'sepal length [standardized]',
            'y_label': 'petal length [standardized]',
            'legend': 'upper left'
        },
        image_file_path=FilesystemUtils.get_test_resources_plot_file_name(
            'adaline/AdalineSGD-Decision-Boundary-Standardized-Features.png'))

    # update the already trained model with a single sample
    adaline.partial_fit(x_std[0, :], self.y[0])