def test_model_selection_nb_best_b(self):
    """model_selection_nb returns the expected best smoothing parameter b."""
    data = test_data['model_selection_NB']
    # Only best_b is under test here; other return values are ignored.
    _, _, best_b, _ = model_selection_nb(
        data['Xtrain'], data['Xval'], data['ytrain'], data['yval'],
        data['a_values'], data['b_values'])
    # assertEquals is a deprecated alias (removed in Python 3.12); use assertEqual.
    self.assertEqual(best_b, data['best_b'])
def test_model_selection_nb_errors(self):
    """Validation-error matrix from model_selection_nb matches the reference."""
    data = test_data['model_selection_NB']
    _, _, _, errors = model_selection_nb(
        data['Xtrain'], data['Xval'], data['ytrain'], data['yval'],
        data['a_values'], data['b_values'])
    # Largest absolute deviation from the reference matrix should be ~0.
    max_diff = np.max(np.abs(data['errors'] - errors))
    self.assertAlmostEqual(max_diff, 0, 8)
def test_model_selection_nb_best_a(self):
    """Best parameter a is a scalar equal to the reference value."""
    data = TEST_DATA['model_selection_NB']
    _, best_a, _, _ = model_selection_nb(
        data['Xtrain'], data['Xval'], data['ytrain'], data['yval'],
        data['a_values'], data['b_values'])
    self.assertEqual(np.size(best_a), 1)
    self.assertEqual(best_a, data['best_a'])
def test_model_selection_nb_errors(self):
    """Error matrix has the expected shape and values."""
    data = TEST_DATA['model_selection_NB']
    _, _, _, errors = model_selection_nb(
        data['Xtrain'], data['Xval'], data['ytrain'], data['yval'],
        data['a_values'], data['b_values'])
    # One entry per (a, b) pair in the reference grid.
    self.assertEqual(np.shape(errors), (3, 3))
    np.testing.assert_almost_equal(errors, data['errors'])
def test_model_selection_nb_best_error(self):
    """Best validation error is a scalar close to the reference value."""
    data = TEST_DATA['model_selection_NB']
    error_best, _, _, _ = model_selection_nb(
        data['Xtrain'], data['Xval'], data['ytrain'], data['yval'],
        data['a_values'], data['b_values'])
    self.assertEqual(np.size(error_best), 1)
    self.assertAlmostEqual(error_best, data['error_best'])
def test_model_selection_nb_errors(self):
    """Error matrix from model_selection_nb matches the stored reference grid."""
    # Hoist the repeated TEST_DATA['model_selection_NB'] lookup into one name.
    params = TEST_DATA['model_selection_NB']
    _, _, _, errors = model_selection_nb(
        params['Xtrain'], params['Xval'], params['ytrain'], params['yval'],
        params['a_values'], params['b_values'])
    self.assertEqual(np.shape(errors), (3, 3))
    np.testing.assert_almost_equal(errors, params['errors'])
def test_model_selection_nb_best_b(self):
    """Best parameter b is a scalar equal to the stored reference."""
    # Hoist the repeated TEST_DATA['model_selection_NB'] lookup into one name.
    params = TEST_DATA['model_selection_NB']
    _, _, best_b, _ = model_selection_nb(
        params['Xtrain'], params['Xval'], params['ytrain'], params['yval'],
        params['a_values'], params['b_values'])
    self.assertEqual(np.size(best_b), 1)
    self.assertEqual(best_b, params['best_b'])
def test_model_selection_nb_best_error(self):
    """Best validation error is a scalar close to the stored reference."""
    # Hoist the repeated TEST_DATA['model_selection_NB'] lookup into one name.
    params = TEST_DATA['model_selection_NB']
    error_best, _, _, _ = model_selection_nb(
        params['Xtrain'], params['Xval'], params['ytrain'], params['yval'],
        params['a_values'], params['b_values'])
    self.assertEqual(np.size(error_best), 1)
    self.assertAlmostEqual(error_best, params['error_best'])
def run_training():
    """End-to-end demo: hyper-parameter selection for KNN and NB, then comparison.

    Relies on project helpers (load_data, model_selection_knn,
    model_selection_nb, plotting utilities, ...). User-facing Polish strings
    are preserved verbatim.
    """
    data = load_data()

    # --- KNN: select the number of neighbours k ---
    k_values = range(1, 201, 2)
    print('\n------------- Selekcja liczby sasiadow dla modelu dla KNN -------------')
    print('-------------------- Wartosci k: 1, 3, ..., 200 -----------------------')
    print('--------------------- To moze potrwac ok. 1 min ------------------------')
    error_best, best_k, errors = model_selection_knn(
        data['Xval'], data['Xtrain'], data['yval'], data['ytrain'], k_values)
    print('Najlepsze k: {num1} i najlepszy blad: {num2:.4f}'.format(
        num1=best_k, num2=error_best))
    print('\n--- Wcisnij klawisz, aby kontynuowac ---')
    classification_KNN_vs_no_neighbours(k_values, errors)

    # --- NB: grid-search the smoothing parameters a and b ---
    a_values = [1, 3, 10, 30, 100, 300, 1000]
    b_values = [1, 3, 10, 30, 100, 300, 1000]
    print('\n----------------- Selekcja parametrow a i b dla NB --------------------')
    print('--------- Wartosci a i b: 1, 3, 10, 30, 100, 300, 1000 -----------------')
    print('--------------------- To moze potrwac ok. 1 min ------------------------')
    error_best, best_a, best_b, errors = model_selection_nb(
        data['Xtrain'], data['Xval'], data['ytrain'], data['yval'],
        a_values, b_values)
    print('Najlepsze a: {}, b: {} i najlepszy blad: {:.4f}'.format(
        best_a, best_b, error_best))
    print('\n--- Wcisnij klawisz, aby kontynuowac ---')
    plot_a_b_errors(errors, a_values, b_values)

    # --- Word clouds: most probable words per class under the NB model ---
    p_x_y = estimate_p_x_y_nb(data['Xtrain'], data['ytrain'], best_a, best_b)
    classes_no = p_x_y.shape[0]
    print('\n------Wizualizacja najbardziej popularnych slow dla poszczegolnych klas------')
    print('--Sa to slowa o najwyzszym prawdopodobienstwie w danej klasie dla modelu NB--')
    try:
        groupnames = data['groupnames']
        words = {}
        for cls in range(classes_no):
            # Indices of the 50 highest-probability words for this class.
            top = np.argsort(p_x_y[cls, :])[::-1][:50]
            words[groupnames[cls]] = dict(
                zip(data['wordlist'][top], p_x_y[cls, top]))
        word_clouds(words.values(), words.keys())
    except Exception:
        # Best-effort visualization; the wordcloud library may be unavailable.
        print('---Wystapil problem z biblioteka wordcloud--- ')
    print('\n--- Wcisnij klawisz, aby kontynuowac ---')

    # --- Final comparison of test-set errors: KNN vs NB ---
    print('\n----------------Porownanie bledow dla KNN i NB---------------------')
    dist = hamming_distance(data['Xtest'], data['Xtrain'])
    y_sorted = sort_train_labels_knn(dist, data['ytrain'])
    p_y_x = p_y_x_knn(y_sorted, best_k)
    error_KNN = classification_error(p_y_x, data['ytest'])
    p_y = estimate_a_priori_nb(data['ytrain'])
    p_y_x = p_y_x_nb(p_y, p_x_y, data['Xtest'])
    error_NB = classification_error(p_y_x, data['ytest'])
    plot_error_NB_KNN(error_NB, error_KNN)
    print('\n--- Wcisnij klawisz, aby kontynuowac ---')
def run_training():
    """End-to-end demo: hyper-parameter selection for KNN and NB, then comparison.

    Relies on project helpers (load_data, model_selection_knn,
    model_selection_nb, plotting utilities, ...). User-facing strings are
    preserved verbatim.
    """
    data = load_data()

    # --- KNN: select the number of neighbours k ---
    k_values = range(1, 201, 2)
    print('\n------------- Model selection for KNN -------------')
    print('-------------------- Values k: 1, 3, ..., 200 -----------------------')
    print('--------------------- Calculation may take up to 1 min ------------------------')
    error_best, best_k, errors = model_selection_knn(
        data['Xval'], data['Xtrain'], data['yval'], data['ytrain'], k_values)
    print('The best k: {num1} and the best error: {num2:.4f}'.format(
        num1=best_k, num2=error_best))
    print('\n--- Press any key to continue ---')
    classification_KNN_vs_no_neighbours(k_values, errors)

    # --- NB: grid-search the smoothing parameters a and b ---
    a_values = [1, 3, 10, 30, 100, 300, 1000]
    b_values = [1, 3, 10, 30, 100, 300, 1000]
    print('\n----------------- Model selection for a and b --------------------')
    print('--------- Values a and b: 1, 3, 10, 30, 100, 300, 1000 -----------------')
    print('--------------------- Calculation may take up to 1 min ------------------------')
    error_best, best_a, best_b, errors = model_selection_nb(
        data['Xtrain'], data['Xval'], data['ytrain'], data['yval'],
        a_values, b_values)
    print('The best a: {}, b: {} and the best error: {:.4f}'.format(
        best_a, best_b, error_best))
    print('\n--- Press any key to continue ---')
    plot_a_b_errors(errors, a_values, b_values)

    # --- Word clouds: most probable words per class under the NB model ---
    p_x_y = estimate_p_x_y_nb(data['Xtrain'], data['ytrain'], best_a, best_b)
    classes_no = p_x_y.shape[0]
    print('\n------ Visualization of most popular words for each class ------')
    print('-- These are words that are most probable for each class and NB model --')
    try:
        groupnames = data['groupnames']
        words = {}
        for cls in range(classes_no):
            # Indices of the 50 highest-probability words for this class.
            top = np.argsort(p_x_y[cls, :])[::-1][:50]
            words[groupnames[cls]] = dict(
                zip(data['wordlist'][top], p_x_y[cls, top]))
        word_clouds(words.values(), words.keys())
    except Exception:
        # Best-effort visualization; the wordcloud library may be unavailable.
        print('--- A problem with wordcloud library --- ')
    print('\n--- Press any key to continue ---')

    # --- Final comparison of test-set errors: KNN vs NB ---
    print('\n---------------- Comparison of KNN and NB errors ---------------------')
    dist = hamming_distance(data['Xtest'], data['Xtrain'])
    y_sorted = sort_train_labels_knn(dist, data['ytrain'])
    p_y_x = p_y_x_knn(y_sorted, best_k)
    error_KNN = classification_error(p_y_x, data['ytest'])
    p_y = estimate_a_priori_nb(data['ytrain'])
    p_y_x = p_y_x_nb(p_y, p_x_y, data['Xtest'])
    error_NB = classification_error(p_y_x, data['ytest'])
    plot_error_NB_KNN(error_NB, error_KNN)
    print('\n--- Press any key to continue ---')