if __name__ == '__main__':
    # Script entry point: build two GloVeBox objects (one per word-vector
    # file), pickle them, load the IMDB train/test reviews and convert the
    # reviews to sentence vectors once per GloVeBox.

    # Both boxes are built with the same options.
    log('Building word vectors from {}'.format(IMDB_WV_FILE))
    gb = GloVeBox(IMDB_WV_FILE)
    gb.build(zero_token=True, normalize_variance=False, normalize_norm=True)

    log('Building global word vectors from {}'.format(GLOBAL_WV_FILE))
    global_gb = GloVeBox(GLOBAL_WV_FILE)
    global_gb.build(zero_token=True, normalize_variance=False, normalize_norm=True)

    # Pickle each box to a path derived from its source file name. The files
    # are opened via ``with`` so the handles are flushed and closed
    # deterministically instead of being left to the garbage collector.
    # NOTE(review): if a path contains no '.txt', str.replace returns it
    # unchanged and the dump would overwrite the word-vector file itself.
    log('writing GloVeBox pickle...')
    with open(IMDB_WV_FILE.replace('.txt', '-glovebox.pkl'), 'wb') as pkl_file:
        pickle.dump(gb, pkl_file, pickle.HIGHEST_PROTOCOL)
    with open(GLOBAL_WV_FILE.replace('.txt', '-glovebox.pkl'), 'wb') as pkl_file:
        pickle.dump(global_gb, pkl_file, pickle.HIGHEST_PROTOCOL)

    log('Load data from original source')
    imdb = ImdbDataHandler(source=IMDB_DATA)
    (train_reviews, train_labels) = imdb.get_data(type=ImdbDataHandler.DATA_TRAIN)
    (test_reviews, test_labels) = imdb.get_data(type=ImdbDataHandler.DATA_TEST)

    # Same reviews, converted once with each GloVeBox.
    log('Converting to sentences: global word vectors')
    train_global_wvs_reviews = imdb.to_sentence_vectors(
        train_reviews, SENTENCES_PER_PARAGRAPH, WORDS_PER_SENTENCE, global_gb)
    test_global_wvs_reviews = imdb.to_sentence_vectors(
        test_reviews, SENTENCES_PER_PARAGRAPH, WORDS_PER_SENTENCE, global_gb)

    log('Converting to sentences: only imdb word vectors')
    train_imdb_wvs_reviews = imdb.to_sentence_vectors(
        train_reviews, SENTENCES_PER_PARAGRAPH, WORDS_PER_SENTENCE, gb)
    test_imdb_wvs_reviews = imdb.to_sentence_vectors(
        test_reviews, SENTENCES_PER_PARAGRAPH, WORDS_PER_SENTENCE, gb)
# Baseline script: loads the IMDB train/test reviews and runs grid-search
# cross-validation for several classifiers from classic.classifiers.
from classic.classifiers import (TextClassifier, NaiveBayesClassifier, SGDTextClassifier,
                                 LogisticClassifier, SVMClassifier, PerceptronClassifier,
                                 RandomForestTextClassifier)
from datahandlers import ImdbDataHandler

IMDB_DATA = './datasets/aclImdb/aclImdb'

if __name__ == '__main__':

    # Single-argument print call: prints the same text on Python 2 and 3.
    print("Loading data from original source")
    imdb = ImdbDataHandler(source=IMDB_DATA)
    (train_reviews, train_labels) = imdb.get_data(type=ImdbDataHandler.DATA_TRAIN)
    (test_reviews, test_labels) = imdb.get_data(type=ImdbDataHandler.DATA_TEST)

    # TODO: Shuffle data

    # Simple bag of words with SGD
    sgd = SGDTextClassifier(train_reviews, train_labels,
                            test_texts=test_reviews, test_labels=test_labels,
                            compute_features=True)
    sgd.grid_search_cv(verbose=0, n_jobs=4)

    # Simple bag of words with NB
    nb = NaiveBayesClassifier(train_reviews, train_labels,
                              test_texts=test_reviews, test_labels=test_labels)
    nb.set_bag_of_ngrams()  # Also can compute bag of words manually
    nb.grid_search_cv(n_jobs=4)

    # Now with bigrams too (ngram_range=(1, 2)); rebinds ``sgd``.
    sgd = SGDTextClassifier(train_reviews, train_labels, ngram_range=(1, 2),
                            test_texts=test_reviews, test_labels=test_labels,
                            compute_features=True)
    sgd.grid_search_cv(n_jobs=4, verbose=1)