def test_02_for_dataset_02(self):
    """Check that ``for_dataset`` yields a classifier with the expected initial weights.

    A classifier is built from ``self.small_dataset_train_2``; its
    ``weights`` attribute must equal a dict mapping each of the four
    expected keys to 0.
    """
    zero_weights = dict.fromkeys(['highly', 'boring', 'green', 'eggs'], 0)
    built = PerceptronClassifier.for_dataset(self.small_dataset_train_2)

    # Report a missing classifier explicitly instead of letting the
    # attribute access below raise on None.
    if built is None:
        self.fail(
            msg='Constructing classifier for dataset failed: for_dataset returned None'
        )

    self.assertEqual(built.weights, zero_weights)
def nltk_movie_review_accuracy(num_iterations, feature_counts=(100, 1000, 10000)):
    """Select the best feature-set size on the dev data and score it on the test data.

    For every value in ``feature_counts`` a new ``PerceptronClassifier`` is
    built from the training documents, trained, and evaluated on both the
    training and the development set.  The configuration with the highest
    development accuracy is kept (the earliest one wins ties), its top
    features for both classes are printed, and it is evaluated once on the
    test documents.

    Args:
        num_iterations: Passed unchanged as the third argument of
            ``classifier.train``.
        feature_counts: Iterable of ``num_features`` values to try.  The
            default reproduces the sizes that used to be hard-coded.

    Returns:
        Tuple ``(best_development_accuracy, testing_accuracy)``.

    Raises:
        ValueError: If ``feature_counts`` yields no values.
    """
    (training_documents, dev_documents, test_documents) = load_reviews()

    best_development_accuracy = 0.0
    best_num_features = 0
    best_classifier = None
    best_feature_set = None

    # Test different numbers of features.
    for n in feature_counts:
        print("Training with %d features..." % n)

        # The training set decides which features exist ...
        training_set = Dataset.from_document_collection(
            training_documents, num_features=n)
        # ... and the development set is built over that same feature set.
        development_set = Dataset.from_document_collection(
            dev_documents, feature_set=training_set.feature_set)

        classifier = PerceptronClassifier.for_dataset(training_set)
        classifier.train(training_set, development_set, num_iterations)

        # Accuracies of the classifier with n features.
        train_accuracy = classifier.test_accuracy(training_set)
        development_accuracy = classifier.test_accuracy(development_set)

        # Always accept the first candidate: otherwise a run in which no
        # configuration scores above 0.0 would leave best_classifier as None
        # and crash further down.  Later candidates must be strictly better.
        if (best_classifier is None
                or development_accuracy > best_development_accuracy):
            best_development_accuracy = development_accuracy
            best_num_features = n
            best_classifier = classifier.copy()
            best_feature_set = training_set.feature_set
            print("Best classifier with %d features: \t Train Accuracy: %.4f \t Dev Accuracy: %.4f"
                  % (n, train_accuracy, best_development_accuracy))

    if best_classifier is None:
        # Only reachable when feature_counts was empty.
        raise ValueError("feature_counts must contain at least one value")

    print("Best number of features: %d " % best_num_features)
    print("Top features for positive class:")
    print(best_classifier.features_for_class(True))
    print("Top features for negative class:")
    print(best_classifier.features_for_class(False))

    # Compute test score for best setting.
    testing_set = Dataset.from_document_collection(
        test_documents, feature_set=best_feature_set)
    testing_accuracy = best_classifier.test_accuracy(testing_set)
    print("Test score for best setting: %.4f" % testing_accuracy)

    return best_development_accuracy, testing_accuracy