LogisticClassifier, SVMClassifier, PerceptronClassifier, RandomForestTextClassifier
from nlpdatahandlers import ImdbDataHandler

import sys

# Dataset location used when no path is supplied on the command line.
IMDB_DATA_DEFAULT = '../deep-text/datasets/aclImdb/aclImdb'

if __name__ == '__main__':
    # Script entry point: load the IMDB train/test splits, run the SGD text
    # classifier's grid search and print the resulting test error.
    #
    # Usage: python <script> [path-to-aclImdb]
    # A missing or empty first argument falls back to IMDB_DATA_DEFAULT.
    if len(sys.argv) > 1 and sys.argv[1]:
        source = sys.argv[1]
    else:
        source = IMDB_DATA_DEFAULT

    # Single-argument print(...) behaves the same under Python 2 and Python 3.
    print("Loading data from original source")
    imdb = ImdbDataHandler(source=source)
    (train_reviews, train_labels) = imdb.get_data(
        type=ImdbDataHandler.DATA_TRAIN, shuffle=True)
    (test_reviews, test_labels) = imdb.get_data(
        type=ImdbDataHandler.DATA_TEST, shuffle=True)

    # Simple bag of words with SGD
    sgd = SGDTextClassifier(train_reviews, train_labels,
                            test_texts=test_reviews, test_labels=test_labels,
                            compute_features=True)
    # NOTE(review): presumably a cross-validated hyper-parameter search using
    # 4 parallel jobs -- confirm against SGDTextClassifier.grid_search_cv.
    sgd.grid_search_cv(verbose=5, n_jobs=4)

    test_error = sgd.get_test_error()
    print("Test error in held out set: " + str(test_error))
logger.info(LOGGER_PREFIX % msg) IMDB_DATA = './datasets/aclImdb/aclImdb' CHARACTERS_PER_WORD = 15 WORDS_PER_DOCUMENT = 300 PREPEND = False if __name__ == '__main__': log('Initializing CharMapper') cm = CharMapper() log('Load data from original source') imdb = ImdbDataHandler(source=IMDB_DATA) (train_reviews, train_labels) = imdb.get_data(type=ImdbDataHandler.DATA_TRAIN) (test_reviews, test_labels) = imdb.get_data(type=ImdbDataHandler.DATA_TEST) log('Converting to character level representations') train_global_wvs_reviews = imdb.to_char_level_idx( train_reviews, char_container=cm, chars_per_word=CHARACTERS_PER_WORD, words_per_document=WORDS_PER_DOCUMENT, prepend=PREPEND) test_global_wvs_reviews = imdb.to_char_level_idx( test_reviews, char_container=cm,
from classic.classifiers import (
    TextClassifier, NaiveBayesClassifier, SGDTextClassifier,
    LogisticClassifier, SVMClassifier, PerceptronClassifier,
    RandomForestTextClassifier)
from nlpdatahandlers import ImdbDataHandler

import sys

# Location of the IMDB dataset read by this script.
IMDB_DATA_DEFAULT = '../deep-text/datasets/aclImdb/aclImdb'

if __name__ == '__main__':
    # Script entry point: load the IMDB train/test splits, train the Naive
    # Bayes classifier and print the errors it reports on both splits.
    # Single-argument print(...) behaves the same under Python 2 and Python 3.
    print("Loading data from original source")
    imdb = ImdbDataHandler(source=IMDB_DATA_DEFAULT)
    (train_reviews, train_labels) = imdb.get_data(
        type=ImdbDataHandler.DATA_TRAIN, shuffle=True)
    (test_reviews, test_labels) = imdb.get_data(
        type=ImdbDataHandler.DATA_TEST, shuffle=True)

    print("Naive Bayes")
    nb = NaiveBayesClassifier()
    nb.set_training_data(train_reviews, train_labels)
    nb.set_test_data(test_reviews, test_labels)
    # NOTE(review): presumably selects bag-of-n-grams features before
    # training -- confirm against the classifier implementation.
    nb.set_bag_of_ngrams()

    nb.train()
    train_error = nb.get_training_error()
    test_error = nb.get_test_error()
    print("Training error: " + str(train_error))
    print("Test error: " + str(test_error))

    print("SGD Classifier")