logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def log(msg, logger=logger):
    '''Log *msg* through the module logger, formatted via LOGGER_PREFIX.'''
    logger.info(LOGGER_PREFIX % msg)


# -- data locations and corpus-shaping parameters
IMDB_DATA = './datasets/aclImdb/aclImdb'
IMDB_WV_FILE = './data/wv/IMDB-GloVe-300dim.txt'
GLOBAL_WV_FILE = './data/wv/glove.42B.300d.120000.txt'
WORDS_PER_SENTENCE = 50
SENTENCES_PER_PARAGRAPH = 50
PREPEND = False

if __name__ == '__main__':
    # Build the IMDB-specific and the global (42B-token) GloVe word-vector boxes.
    log('Building word vectors from {}'.format(IMDB_WV_FILE))
    gb = GloVeBox(IMDB_WV_FILE)
    gb.build(zero_token=True, normalize_variance=False, normalize_norm=True)

    log('Building global word vectors from {}'.format(GLOBAL_WV_FILE))
    global_gb = GloVeBox(GLOBAL_WV_FILE)
    global_gb.build(zero_token=True, normalize_variance=False, normalize_norm=True)

    log('writing GloVeBox pickle...')
    # Use context managers so the pickle files are flushed and closed even on
    # error (the previous open() calls leaked the file handles).
    with open(IMDB_WV_FILE.replace('.txt', '-glovebox.pkl'), 'wb') as pkl:
        pickle.dump(gb, pkl, pickle.HIGHEST_PROTOCOL)
    with open(GLOBAL_WV_FILE.replace('.txt', '-glovebox.pkl'), 'wb') as pkl:
        pickle.dump(global_gb, pkl, pickle.HIGHEST_PROTOCOL)

    log('Load data from original source')
    imdb = ImdbDataHandler(source=IMDB_DATA)
    (train_reviews, train_labels) = imdb.get_data(type=ImdbDataHandler.DATA_TRAIN)
    (test_reviews, test_labels) = imdb.get_data(type=ImdbDataHandler.DATA_TEST)
def parse_tokens(txt):
    '''
    Takes a text and returns a list of tokens.

    The input is decoded as ASCII with undecodable bytes dropped, run through
    the module-level spaCy pipeline ``nlp``, and newline tokens are filtered
    out of the result.
    '''
    doc = nlp(u'' + txt.decode('ascii', errors='ignore'))
    return [tx for tx in (t.text for t in doc) if tx != '\n']


if __name__ == '__main__':
    log('Checking data integrity...')
    data_integrity()

    # Build the corpus-specific and the global GloVe word-vector boxes.
    log('Building word vectors from {}'.format(WV_FILE))
    gb = GloVeBox(WV_FILE)
    gb.build(zero_token=True, normalize_variance=False, normalize_norm=True)

    log('Building global word vectors from {}'.format(GLOBAL_WV_FILE))
    global_gb = GloVeBox(GLOBAL_WV_FILE)
    global_gb.build(zero_token=True, normalize_variance=False, normalize_norm=True)

    log('writing GloVeBox pickle...')
    # Context managers guarantee the pickle files are closed (the previous
    # pickle.dump(..., open(...)) calls leaked the handles).
    with open(WV_FILE.replace('.txt', '-glovebox.pkl'), 'wb') as pkl:
        pickle.dump(gb, pkl, pickle.HIGHEST_PROTOCOL)
    with open(GLOBAL_WV_FILE.replace('.txt', '-glovebox.pkl'), 'wb') as pkl:
        pickle.dump(global_gb, pkl, pickle.HIGHEST_PROTOCOL)

    log('Getting training examples')
    train_neg = get_data(positive=False)
    train_pos = get_data()
# -- parameters to tune and set
WORDS_PER_SENTENCE = 20
SENTENCES_PER_PARAGRAPH = 20
WV_FILE = './data/wv/glove.42B.300d.120000.txt'

log('Importing spaCy...')
from spacy.en import English
log('Initializing spaCy...')
nlp = English()

if __name__ == '__main__':
    log('Building word vectors from {}'.format(WV_FILE))
    gb = GloVeBox(WV_FILE)
    gb.build(zero_token=True).index()

    log('writing GloVeBox pickle...')
    # Context manager guarantees the pickle file is flushed and closed.
    with open(WV_FILE.replace('.txt', '-glovebox.pkl'), 'wb') as pkl:
        pickle.dump(gb, pkl, pickle.HIGHEST_PROTOCOL)

    log('Loading train and test pickles...')
    # Pickles are binary data: open in 'rb' (required on Python 3, and the
    # mode the pickle docs mandate); also avoid shadowing the builtin `file`.
    with open(TRAIN_FILE, 'rb') as fh:
        [train_reviews, train_labels] = pickle.load(fh)
    with open(DEV_FILE, 'rb') as fh:
        [dev_reviews, dev_labels] = pickle.load(fh)
    with open(TEST_FILE, 'rb') as fh:
        [test_reviews, test_labels] = pickle.load(fh)

    # Merge train and dev