# NOTE(review): this chunk begins mid-import — the `from ... import` header that
# names these classifiers is outside the visible span; the bare name tuple below
# is the orphaned continuation of that import line.
LogisticClassifier, SVMClassifier, PerceptronClassifier, RandomForestTextClassifier
from nlpdatahandlers import ImdbDataHandler
import sys

# Default location of the aclImdb dataset (relative to a sibling checkout).
IMDB_DATA_DEFAULT = '../deep-text/datasets/aclImdb/aclImdb'

if __name__ == '__main__':
    # Optional CLI override of the dataset path; falls back to the default.
    if len(sys.argv) > 1 and sys.argv[1] != "":
        source = sys.argv[1]
    else:
        source = IMDB_DATA_DEFAULT

    print "Loading data from original source"
    imdb = ImdbDataHandler(source=source)
    (train_reviews, train_labels) = imdb.get_data(type=ImdbDataHandler.DATA_TRAIN, shuffle=True)
    (test_reviews, test_labels) = imdb.get_data(type=ImdbDataHandler.DATA_TEST, shuffle=True)

    # Simple bag of words with SGD; grid-search hyperparameters, then report
    # the error on the held-out test split.
    sgd = SGDTextClassifier(train_reviews, train_labels,
                            test_texts=test_reviews, test_labels=test_labels,
                            compute_features=True)
    sgd.grid_search_cv(verbose=5, n_jobs=4)
    test_error = sgd.get_test_error()
    print "Test error in held out set: " + str(test_error)
    print "=" * 20

    # Now with bigrams too
    # NOTE(review): the chunk is truncated here — the call below is cut off
    # mid-argument-list in the original text.
    sgd = SGDTextClassifier(train_reviews, train_labels, ngram_range=(1,2),
                            test_texts=test_reviews, test_labels=test_labels,
# NOTE(review): this chunk begins mid-definition — the `def log(...)` header is
# outside the visible span; this first statement is the tail of its body.
logger.info(LOGGER_PREFIX % msg)

# Paths and shaping constants for character-level IMDB preprocessing.
IMDB_DATA = './datasets/aclImdb/aclImdb'
CHARACTERS_PER_WORD = 15    # characters kept per word
WORDS_PER_DOCUMENT = 300    # words kept per document
PREPEND = False             # presumably controls pad/truncate side — TODO confirm in handler

if __name__ == '__main__':
    log('Initializing CharMapper')
    cm = CharMapper()

    log('Load data from original source')
    imdb = ImdbDataHandler(source=IMDB_DATA)
    (train_reviews, train_labels) = imdb.get_data(type=ImdbDataHandler.DATA_TRAIN)
    (test_reviews, test_labels) = imdb.get_data(type=ImdbDataHandler.DATA_TEST)

    log('Converting to character level representations')
    train_global_wvs_reviews = imdb.to_char_level_idx(
        train_reviews,
        char_container=cm,
        chars_per_word=CHARACTERS_PER_WORD,
        words_per_document=WORDS_PER_DOCUMENT,
        prepend=PREPEND)
    # NOTE(review): the chunk is truncated here — the call below is cut off
    # mid-argument-list in the original text.
    test_global_wvs_reviews = imdb.to_char_level_idx(
        test_reviews,
        char_container=cm,
from cervantes.box import WordVectorBox
from cervantes.language import OneLevelEmbedding
from cervantes.nn.models import RNNClassifier

# Yelp "funny" splits — only used by the commented-out alternative below.
YELP_FUNNY_TRAIN = '../yelp-dataset/TrainSet_funny_75064'
YELP_FUNNY_DEV = '../yelp-dataset/DevSet_funny_75064'
YELP_FUNNY_TEST = '../yelp-dataset/TestSet_funny_75064'

IMDB_DATA = '../deep-text/datasets/aclImdb/aclImdb'
WV_FILE = '../deep-text/embeddings/wv/glove.42B.300d.120000.txt'

if __name__ == '__main__':
    print "Getting data in format texts / labels"
    imdb = ImdbDataHandler(source=IMDB_DATA)
    (train_reviews, train_labels) = imdb.get_data(type=ImdbDataHandler.DATA_TRAIN)
    (test_reviews, test_labels) = imdb.get_data(type=ImdbDataHandler.DATA_TEST)

    # Subsample for a quick experiment: 5000 train / 1000 test examples.
    train_reviews = train_reviews[:5000]
    test_reviews = test_reviews[:1000]
    train_labels = list(train_labels)[:5000]
    test_labels = list(test_labels)[:1000]

    #yelp = YelpDataHandler()
    #(train_reviews, train_labels, test_reviews, test_labels) = \
    #    yelp.get_data(YELP_FUNNY_TRAIN, YELP_FUNNY_DEV, YELP_FUNNY_TEST)

    print "Building language embeddings. This requires parsing text so it might " \
          "be pretty slow "

    # NOTE(review): the chunk is truncated here, mid-comment.
    # Compute text embeddings, containing the processed text tokens together with a vector-to-index
    # translation object (the vector box), should be pickled in order to be efficiently used with
if __name__ == '__main__':
    # Build an embedding box from the IMDB-specific GloVe vectors.
    log('Building word vectors from {}'.format(IMDB_WV_FILE))
    gb = GloVeBox(IMDB_WV_FILE)
    gb.build(zero_token=True, normalize_variance=False, normalize_norm=True)

    # And one from the general-purpose (global) GloVe vectors.
    log('Building global word vectors from {}'.format(GLOBAL_WV_FILE))
    global_gb = GloVeBox(GLOBAL_WV_FILE)
    global_gb.build(zero_token=True, normalize_variance=False, normalize_norm=True)

    log('writing GloVeBox pickle...')
    # FIX: the original passed open(...) directly to pickle.dump, leaking both
    # file handles; with-blocks guarantee the pickles are flushed and closed.
    with open(IMDB_WV_FILE.replace('.txt', '-glovebox.pkl'), 'wb') as pkl_file:
        pickle.dump(gb, pkl_file, pickle.HIGHEST_PROTOCOL)
    with open(GLOBAL_WV_FILE.replace('.txt', '-glovebox.pkl'), 'wb') as pkl_file:
        pickle.dump(global_gb, pkl_file, pickle.HIGHEST_PROTOCOL)

    log('Load data from original source')
    imdb = ImdbDataHandler(source=IMDB_DATA)
    (train_reviews, train_labels) = imdb.get_data(type=ImdbDataHandler.DATA_TRAIN)
    (test_reviews, test_labels) = imdb.get_data(type=ImdbDataHandler.DATA_TEST)

    # Index each review as padded sentences, once per vector box.
    log('Converting to sentences: global word vectors')
    train_global_wvs_reviews = imdb.to_sentence_level_idx(train_reviews, SENTENCES_PER_PARAGRAPH,
                                                          WORDS_PER_SENTENCE, global_gb)
    test_global_wvs_reviews = imdb.to_sentence_level_idx(test_reviews, SENTENCES_PER_PARAGRAPH,
                                                         WORDS_PER_SENTENCE, global_gb)

    log('Converting to sentences: only imdb word vectors')
    train_imdb_wvs_reviews = imdb.to_sentence_level_idx(train_reviews, SENTENCES_PER_PARAGRAPH,
                                                        WORDS_PER_SENTENCE, gb)
    test_imdb_wvs_reviews = imdb.to_sentence_level_idx(test_reviews, SENTENCES_PER_PARAGRAPH,
                                                       WORDS_PER_SENTENCE, gb)
LogisticClassifier, SVMClassifier, PerceptronClassifier, RandomForestTextClassifier from nlpdatahandlers import ImdbDataHandler import sys IMDB_DATA_DEFAULT = '../deep-text/datasets/aclImdb/aclImdb' if __name__ == '__main__': if len(sys.argv) > 1 and sys.argv[1] != "": source = sys.argv[1] else: source = IMDB_DATA_DEFAULT print "Loading data from original source" imdb = ImdbDataHandler(source=source) (train_reviews, train_labels) = imdb.get_data(type=ImdbDataHandler.DATA_TRAIN, shuffle=True) (test_reviews, test_labels) = imdb.get_data(type=ImdbDataHandler.DATA_TEST, shuffle=True) # Simple bag of words with SGD sgd = SGDTextClassifier(train_reviews, train_labels, test_texts=test_reviews, test_labels=test_labels, compute_features=True) sgd.grid_search_cv(verbose=5, n_jobs=4) test_error = sgd.get_test_error() print "Test error in held out set: " + str(test_error)
def log(msg, logger=logger):
    """Emit *msg* through the module logger with the standard prefix."""
    logger.info(LOGGER_PREFIX % msg)

# Dataset path and fixed document shape for character-level indexing.
IMDB_DATA = './datasets/aclImdb/aclImdb'
CHARACTERS_PER_WORD = 15
WORDS_PER_DOCUMENT = 300
PREPEND = False

if __name__ == '__main__':
    log('Initializing CharMapper')
    cm = CharMapper()

    log('Load data from original source')
    imdb = ImdbDataHandler(source=IMDB_DATA)
    train_reviews, train_labels = imdb.get_data(type=ImdbDataHandler.DATA_TRAIN)
    test_reviews, test_labels = imdb.get_data(type=ImdbDataHandler.DATA_TEST)

    log('Converting to character level representations')
    # Both splits are converted with identical parameters, so share one closure.
    to_char_idx = lambda texts: imdb.to_char_level_idx(
        texts,
        char_container=cm,
        chars_per_word=CHARACTERS_PER_WORD,
        words_per_document=WORDS_PER_DOCUMENT,
        prepend=PREPEND)
    train_global_wvs_reviews = to_char_idx(train_reviews)
    test_global_wvs_reviews = to_char_idx(test_reviews)
# Paths for the dataset and the two word-vector files; documents are clipped
# or padded to a fixed number of words.
IMDB_DATA = './datasets/aclImdb/aclImdb'
IMDB_WV_FILE = './embeddings/wv/IMDB-GloVe-300dim.txt'
GLOBAL_WV_FILE = './embeddings/wv/glove.42B.300d.120000.txt'
WORDS_PER_TEXT = 300

if __name__ == '__main__':
    log('Building global word vectors from {}'.format(GLOBAL_WV_FILE))
    global_gb = GloVeBox(GLOBAL_WV_FILE)
    global_gb.build(zero_token=True, normalize_variance=False, normalize_norm=True)

    log('writing GloVeBox pickle...')
    # FIX: the original passed open(...) directly to pickle.dump, leaking the
    # file handle; the with-block guarantees the pickle is flushed and closed.
    with open(GLOBAL_WV_FILE.replace('.txt', '-glovebox.pkl'), 'wb') as pkl_file:
        pickle.dump(global_gb, pkl_file, pickle.HIGHEST_PROTOCOL)

    log('Load data from original source')
    imdb = ImdbDataHandler(source=IMDB_DATA)
    (train_reviews, train_labels) = imdb.get_data(type=ImdbDataHandler.DATA_TRAIN)
    (test_reviews, test_labels) = imdb.get_data(type=ImdbDataHandler.DATA_TEST)

    log('Converting to global word vectors - train')
    reviews_wvs_train = imdb.to_word_level_idx(train_reviews, global_gb, WORDS_PER_TEXT)

    # -- training data save
    np.save('IMDB_train_fulltext_glove_X.npy', reviews_wvs_train)
    np.save('IMDB_train_fulltext_glove_y.npy', train_labels)
    # Release the train matrix before building the test one to cap peak memory.
    del reviews_wvs_train

    log('Converting to global word vectors - test')
    reviews_wvs_test = imdb.to_word_level_idx(test_reviews, global_gb, WORDS_PER_TEXT)

    # -- testing data save
    np.save('IMDB_test_fulltext_glove_X.npy', reviews_wvs_test)
from cervantes.box import WordVectorBox
from cervantes.language import OneLevelEmbedding
from cervantes.nn.models import RNNClassifier

# Yelp "funny" splits — only used by the commented-out alternative below.
YELP_FUNNY_TRAIN = '../yelp-dataset/TrainSet_funny_75064'
YELP_FUNNY_DEV = '../yelp-dataset/DevSet_funny_75064'
YELP_FUNNY_TEST = '../yelp-dataset/TestSet_funny_75064'

IMDB_DATA = '../deep-text/datasets/aclImdb/aclImdb'
WV_FILE = '../deep-text/embeddings/wv/glove.42B.300d.120000.txt'

if __name__ == '__main__':
    print "Getting data in format texts / labels"
    imdb = ImdbDataHandler(source=IMDB_DATA)
    (train_reviews, train_labels) = imdb.get_data(type=ImdbDataHandler.DATA_TRAIN)
    (test_reviews, test_labels) = imdb.get_data(type=ImdbDataHandler.DATA_TEST)

    # Subsample for a quick experiment: 5000 train / 1000 test examples.
    train_reviews = train_reviews[:5000]
    test_reviews = test_reviews[:1000]
    train_labels = list(train_labels)[:5000]
    test_labels = list(test_labels)[:1000]

    #yelp = YelpDataHandler()
    #(train_reviews, train_labels, test_reviews, test_labels) = \
    #    yelp.get_data(YELP_FUNNY_TRAIN, YELP_FUNNY_DEV, YELP_FUNNY_TEST)

    print "Building language embeddings. This requires parsing text so it might " \
          "be pretty slow "

    # NOTE(review): the chunk is truncated here, mid-comment.
    # Compute text embeddings, containing the processed text tokens together with a vector-to-index
if __name__ == "__main__":
    # Build an embedding box from the IMDB-specific GloVe vectors.
    log("Building word vectors from {}".format(IMDB_WV_FILE))
    gb = GloVeBox(IMDB_WV_FILE)
    gb.build(zero_token=True, normalize_variance=False, normalize_norm=True)

    # And one from the general-purpose (global) GloVe vectors.
    log("Building global word vectors from {}".format(GLOBAL_WV_FILE))
    global_gb = GloVeBox(GLOBAL_WV_FILE)
    global_gb.build(zero_token=True, normalize_variance=False, normalize_norm=True)

    log("writing GloVeBox pickle...")
    # FIX: the original passed open(...) directly to pickle.dump, leaking both
    # file handles; with-blocks guarantee the pickles are flushed and closed.
    with open(IMDB_WV_FILE.replace(".txt", "-glovebox.pkl"), "wb") as pkl_file:
        pickle.dump(gb, pkl_file, pickle.HIGHEST_PROTOCOL)
    with open(GLOBAL_WV_FILE.replace(".txt", "-glovebox.pkl"), "wb") as pkl_file:
        pickle.dump(global_gb, pkl_file, pickle.HIGHEST_PROTOCOL)

    log("Load data from original source")
    imdb = ImdbDataHandler(source=IMDB_DATA)
    (train_reviews, train_labels) = imdb.get_data(type=ImdbDataHandler.DATA_TRAIN)
    (test_reviews, test_labels) = imdb.get_data(type=ImdbDataHandler.DATA_TEST)

    # Index each review as padded sentences, once per vector box.
    log("Converting to sentences: global word vectors")
    train_global_wvs_reviews = imdb.to_sentence_level_idx(
        train_reviews, SENTENCES_PER_PARAGRAPH, WORDS_PER_SENTENCE, global_gb
    )
    test_global_wvs_reviews = imdb.to_sentence_level_idx(
        test_reviews, SENTENCES_PER_PARAGRAPH, WORDS_PER_SENTENCE, global_gb
    )

    log("Converting to sentences: only imdb word vectors")
    train_imdb_wvs_reviews = imdb.to_sentence_level_idx(train_reviews, SENTENCES_PER_PARAGRAPH,
                                                        WORDS_PER_SENTENCE, gb)
    test_imdb_wvs_reviews = imdb.to_sentence_level_idx(test_reviews, SENTENCES_PER_PARAGRAPH,
                                                       WORDS_PER_SENTENCE, gb)