Пример #1
0
    # Build word-vector lookup ("GloVeBox") for the global GloVe file, with a
    # zero/padding token and norm (not variance) normalization.
    log('Building global word vectors from {}'.format(GLOBAL_WV_FILE))
    global_gb = GloVeBox(GLOBAL_WV_FILE)
    global_gb.build(zero_token=True, normalize_variance=False, normalize_norm=True)

    log('writing GloVeBox pickle...')
    # FIX: the original passed open(...) straight into pickle.dump and never
    # closed the handles; use `with` so each file is flushed and closed even
    # if pickling raises.
    with open(YELP_WV_FILE.replace('.txt', '-glovebox.pkl'), 'wb') as f:
        pickle.dump(yelp_gb, f, pickle.HIGHEST_PROTOCOL)
    with open(GLOBAL_WV_FILE.replace('.txt', '-glovebox.pkl'), 'wb') as f:
        pickle.dump(global_gb, f, pickle.HIGHEST_PROTOCOL)

    yelp = YelpDataHandler()

    ##################################
    ### YELP USEFUL
    ##################################
    log('Creating "useful" reviews sentence-datasets')
    (train_reviews, train_labels, test_reviews, test_labels) = \
        yelp.get_data(YELP_USEFUL_TRAIN, YELP_USEFUL_DEV, YELP_USEFUL_TEST)

    # Convert raw reviews to padded sentence-level word-index tensors
    # (SENTENCES_PER_PARAGRAPH x WORDS_PER_SENTENCE), once per vector box.
    log('Converting to sentences: global word vectors')
    train_global_wvs_reviews = yelp.to_sentence_level_idx(train_reviews, SENTENCES_PER_PARAGRAPH,
                                                    WORDS_PER_SENTENCE, global_gb)
    test_global_wvs_reviews = yelp.to_sentence_level_idx(test_reviews, SENTENCES_PER_PARAGRAPH,
                                                   WORDS_PER_SENTENCE, global_gb)

    log('Converting to sentences: yelp word vectors')
    train_yelp_wvs_reviews = yelp.to_sentence_level_idx(train_reviews, SENTENCES_PER_PARAGRAPH,
                                                    WORDS_PER_SENTENCE, yelp_gb)
    test_yelp_wvs_reviews = yelp.to_sentence_level_idx(test_reviews, SENTENCES_PER_PARAGRAPH,
                                                   WORDS_PER_SENTENCE, yelp_gb)

    # -- training data save
    np.save('Yelp_useful_sentences_train_yelp_glove_X.npy', train_yelp_wvs_reviews)
Пример #2
0
        log('    --> Training Data Complete')
        log('    --> Starting Testing Data...')
        # Convert the raw test reviews to character-level index matrices
        # using the same char container `cm` and the same
        # CHARACTERS_PER_WORD / WORDS_PER_DOCUMENT / PREPEND settings as
        # the training pass above, so both splits share one encoding.
        test_reviews = yelp.to_char_level_idx(test_reviews, 
            char_container=cm,
            chars_per_word=CHARACTERS_PER_WORD,
            words_per_document=WORDS_PER_DOCUMENT,
            prepend=PREPEND)
        log('    --> Testing Data Complete')
        return train_reviews, test_reviews

    ##################################
    ### YELP USEFUL
    ##################################
    log('Creating "useful" reviews sentence-datasets')
    # get_data takes the pre-split train/dev/test paths and returns
    # (train_texts, train_labels, test_texts, test_labels).
    (train_reviews, train_labels, test_reviews, test_labels) = \
        yelp.get_data(YELP_USEFUL_TRAIN, YELP_USEFUL_DEV, YELP_USEFUL_TEST)

    # Character-encode both splits with the helper defined above.
    train_reviews, test_reviews = get_yelp_char(train_reviews, test_reviews)


    # -- training data save
    
    np.save('Yelp_useful_sentences_train_char_X.npy', train_reviews)
    np.save('Yelp_useful_sentences_train_char_y.npy', train_labels)

    # -- testing data save
    np.save('Yelp_useful_sentences_test_char_X.npy', test_reviews)
    np.save('Yelp_useful_sentences_test_char_y.npy', test_labels)

    ##################################
    ### YELP FUNNY
Пример #3
0
from keras.models import Sequential
from keras.layers import Embedding
from keras.layers.core import Dense, Activation, Dropout, Reshape, Flatten

# Paths to the pre-split Yelp "funny"-vote datasets (train/dev/test).
YELP_FUNNY_TRAIN = '../yelp-dataset/TrainSet_funny_75064'
YELP_FUNNY_DEV = '../yelp-dataset/DevSet_funny_75064'
YELP_FUNNY_TEST = '../yelp-dataset/TestSet_funny_75064'

# Pre-trained GloVe word vectors: 42B-token corpus, 300-d, 120k-word vocab.
WV_FILE = '../deep-text/embeddings/wv/glove.42B.300d.120000.txt'

if __name__ == '__main__':

    # Load the funny-vote splits as raw texts + labels.
    print "Getting data in format texts / labels"
    yelp = YelpDataHandler()
    (train_reviews, train_labels, test_reviews, test_labels) = \
        yelp.get_data(YELP_FUNNY_TRAIN, YELP_FUNNY_DEV, YELP_FUNNY_TEST)

    print "Building character embedding"
    # Cache the computed character embedding on disk so repeated runs skip
    # the expensive compute step.
    EMBEDDING_FILE = "YelpChar.pkl"
    if not os.path.isfile(EMBEDDING_FILE):

        # 300-dimensional vector box over English characters.
        cbox = EnglishCharBox(vector_dim=300)

        # One-level character embedding over the char box; size=5000 here is
        # the embedding's fixed size parameter (NOTE(review): earlier comment
        # said "300 words per text" — that did not match the code).
        lembedding = OneLevelEmbedding(cbox, type=OneLevelEmbedding.CHAR_EMBEDDING, size=5000)
        lembedding.compute(train_reviews)
        lembedding.save(EMBEDDING_FILE)
    else:
        lembedding = OneLevelEmbedding.load(EMBEDDING_FILE)

    # Create a recurrent neural network model and train it, the data from the computed