# -- Build the "global" (non-Yelp) GloVe word-vector box and persist both the
#    Yelp-specific and global GloVeBoxes as pickles for later reuse.
#    NOTE(review): `yelp_gb`, `log`, `GloVeBox`, `YelpDataHandler` and the
#    *_FILE / *_PER_* constants are defined earlier in the file (outside this
#    chunk) — TODO confirm against the full script.
log('Building global word vectors from {}'.format(GLOBAL_WV_FILE))
global_gb = GloVeBox(GLOBAL_WV_FILE)
global_gb.build(zero_token=True, normalize_variance=False, normalize_norm=True)

log('writing GloVeBox pickle...')
# Use context managers so the pickle file handles are always closed; the
# original passed bare open(...) into pickle.dump and leaked the handles.
with open(YELP_WV_FILE.replace('.txt', '-glovebox.pkl'), 'wb') as fout:
    pickle.dump(yelp_gb, fout, pickle.HIGHEST_PROTOCOL)
with open(GLOBAL_WV_FILE.replace('.txt', '-glovebox.pkl'), 'wb') as fout:
    pickle.dump(global_gb, fout, pickle.HIGHEST_PROTOCOL)

yelp = YelpDataHandler()

##################################
### YELP USEFUL
##################################
log('Creating "useful" reviews sentence-datasets')
(train_reviews, train_labels, test_reviews, test_labels) = \
    yelp.get_data(YELP_USEFUL_TRAIN, YELP_USEFUL_DEV, YELP_USEFUL_TEST)

# Convert each review into padded sentence/word index matrices, once per
# embedding vocabulary (global GloVe vs. Yelp-trained GloVe).
log('Converting to sentences: global word vectors')
train_global_wvs_reviews = yelp.to_sentence_level_idx(train_reviews, SENTENCES_PER_PARAGRAPH,
                                                      WORDS_PER_SENTENCE, global_gb)
test_global_wvs_reviews = yelp.to_sentence_level_idx(test_reviews, SENTENCES_PER_PARAGRAPH,
                                                     WORDS_PER_SENTENCE, global_gb)

log('Converting to sentences: yelp word vectors')
train_yelp_wvs_reviews = yelp.to_sentence_level_idx(train_reviews, SENTENCES_PER_PARAGRAPH,
                                                    WORDS_PER_SENTENCE, yelp_gb)
test_yelp_wvs_reviews = yelp.to_sentence_level_idx(test_reviews, SENTENCES_PER_PARAGRAPH,
                                                   WORDS_PER_SENTENCE, yelp_gb)

# -- training data save
np.save('Yelp_useful_sentences_train_yelp_glove_X.npy', train_yelp_wvs_reviews)
# NOTE(review): this chunk is collapsed onto one physical line and mixes two
# distinct pieces:
#   (1) the TAIL of a char-level conversion function — it converts
#       `test_reviews` via `yelp.to_char_level_idx(...)` and ends at
#       `return train_reviews, test_reviews`; its `def` line is outside this
#       view (presumably `get_yelp_char`, which the script code below calls —
#       TODO confirm against the full file), and
#   (2) top-level script code that loads the "useful" train/dev/test review
#       splits, runs them through `get_yelp_char`, and saves the resulting
#       character-index arrays and label arrays as .npy files.
# Left byte-identical: the enclosing function boundary is not visible here,
# so reflowing this line safely is not possible from this chunk alone.
log(' --> Training Data Complete') log(' --> Starting Testing Data...') test_reviews = yelp.to_char_level_idx(test_reviews, char_container=cm, chars_per_word=CHARACTERS_PER_WORD, words_per_document=WORDS_PER_DOCUMENT, prepend=PREPEND) log(' --> Testing Data Complete') return train_reviews, test_reviews ################################## ### YELP USEFUL ################################## log('Creating "useful" reviews sentence-datasets') (train_reviews, train_labels, test_reviews, test_labels) = \ yelp.get_data(YELP_USEFUL_TRAIN, YELP_USEFUL_DEV, YELP_USEFUL_TEST) train_reviews, test_reviews = get_yelp_char(train_reviews, test_reviews) # -- training data save np.save('Yelp_useful_sentences_train_char_X.npy', train_reviews) np.save('Yelp_useful_sentences_train_char_y.npy', train_labels) # -- testing data save np.save('Yelp_useful_sentences_test_char_X.npy', test_reviews) np.save('Yelp_useful_sentences_test_char_y.npy', test_labels) ################################## ### YELP FUNNY
# NOTE(review): this chunk is collapsed onto one physical line. It contains:
#   - Keras imports plus path constants for the Yelp "funny" train/dev/test
#     splits and a GloVe embedding file, and
#   - the start of an `if __name__ == '__main__':` driver that loads the
#     "funny" splits, then builds (or loads from "YelpChar.pkl") a cached
#     character-level OneLevelEmbedding over the training reviews.
# The driver uses Python 2 `print` statements and is TRUNCATED — it ends
# mid-comment ("Create a recurrent neural network model and train it, ...")
# and the model-building code continues beyond this view, so the line is left
# byte-identical rather than reflowed/modernized from an incomplete fragment.
# Also note the inline `#` comments embedded in this collapsed line would,
# as written, comment out everything after them — evidence this is a
# formatting-mangled paste, not runnable as-is.
from keras.models import Sequential from keras.layers import Embedding from keras.layers.core import Dense, Activation, Dropout, Reshape, Flatten YELP_FUNNY_TRAIN = '../yelp-dataset/TrainSet_funny_75064' YELP_FUNNY_DEV = '../yelp-dataset/DevSet_funny_75064' YELP_FUNNY_TEST = '../yelp-dataset/TestSet_funny_75064' WV_FILE = '../deep-text/embeddings/wv/glove.42B.300d.120000.txt' if __name__ == '__main__': print "Getting data in format texts / labels" yelp = YelpDataHandler() (train_reviews, train_labels, test_reviews, test_labels) = \ yelp.get_data(YELP_FUNNY_TRAIN, YELP_FUNNY_DEV, YELP_FUNNY_TEST) print "Building character embedding" EMBEDDING_FILE = "YelpChar.pkl" if not os.path.isfile(EMBEDDING_FILE): cbox = EnglishCharBox(vector_dim=300) # Build the language embedding with the given vector box and 300 words per text lembedding = OneLevelEmbedding(cbox, type=OneLevelEmbedding.CHAR_EMBEDDING, size=5000) lembedding.compute(train_reviews) lembedding.save(EMBEDDING_FILE) else: lembedding = OneLevelEmbedding.load(EMBEDDING_FILE) # Create a recurrent neural network model and train it, the data from the computed