Example #1

# NOTE: the first import line is truncated in the source; the module name
# "textclassifiers" is an assumption.
from textclassifiers import (SGDTextClassifier, LogisticClassifier, SVMClassifier,
                             PerceptronClassifier, RandomForestTextClassifier)
from nlpdatahandlers import ImdbDataHandler

import sys

IMDB_DATA_DEFAULT = '../deep-text/datasets/aclImdb/aclImdb'

if __name__ == '__main__':

    if len(sys.argv) > 1 and sys.argv[1] != "":
        source = sys.argv[1]
    else:
        source = IMDB_DATA_DEFAULT

    print "Loading data from original source"
    imdb = ImdbDataHandler(source=source)
    (train_reviews, train_labels) = imdb.get_data(type=ImdbDataHandler.DATA_TRAIN, shuffle=True)
    (test_reviews, test_labels) = imdb.get_data(type=ImdbDataHandler.DATA_TEST, shuffle=True)

    # Simple bag of words with SGD
    sgd = SGDTextClassifier(train_reviews, train_labels,
                            test_texts=test_reviews, test_labels=test_labels,
                            compute_features=True)
    sgd.grid_search_cv(verbose=5, n_jobs=4)
    test_error = sgd.get_test_error()
    print "Test error in held out set: " + str(test_error)
    print "=" * 20

    # Now with bigrams too
    sgd = SGDTextClassifier(train_reviews, train_labels, ngram_range=(1, 2),
                            test_texts=test_reviews, test_labels=test_labels,
                            compute_features=True)
    # (the original snippet is cut off above; the rest mirrors the unigram run)
    sgd.grid_search_cv(verbose=5, n_jobs=4)
    test_error = sgd.get_test_error()
    print "Test error in held out set: " + str(test_error)
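
For readers without the wrapper library, a roughly equivalent bag-of-words + SGD pipeline can be sketched in plain scikit-learn (a minimal sketch, assuming scikit-learn is installed; the wrapper above presumably does something similar internally):

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import make_pipeline

# Unigrams + bigrams, matching ngram_range=(1, 2) above.
pipeline = make_pipeline(CountVectorizer(ngram_range=(1, 2)), SGDClassifier())
pipeline.fit(train_reviews, train_labels)
test_error = 1.0 - pipeline.score(test_reviews, test_labels)
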
Example #2

import logging

from nlpdatahandlers import ImdbDataHandler

# NOTE: this snippet starts mid-file; the logger setup, the LOGGER_PREFIX value
# and the CharMapper import path are assumptions reconstructed from Example #6.
from cervantes.language import CharMapper

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
LOGGER_PREFIX = '%s'

def log(msg, logger=logger):
    logger.info(LOGGER_PREFIX % msg)


IMDB_DATA = './datasets/aclImdb/aclImdb'

CHARACTERS_PER_WORD = 15
WORDS_PER_DOCUMENT = 300
PREPEND = False

if __name__ == '__main__':

    log('Initializing CharMapper')
    cm = CharMapper()

    log('Load data from original source')
    imdb = ImdbDataHandler(source=IMDB_DATA)
    (train_reviews,
     train_labels) = imdb.get_data(type=ImdbDataHandler.DATA_TRAIN)
    (test_reviews, test_labels) = imdb.get_data(type=ImdbDataHandler.DATA_TEST)

    log('Converting to character level representations')
    train_global_wvs_reviews = imdb.to_char_level_idx(
        train_reviews,
        char_container=cm,
        chars_per_word=CHARACTERS_PER_WORD,
        words_per_document=WORDS_PER_DOCUMENT,
        prepend=PREPEND)

    test_global_wvs_reviews = imdb.to_char_level_idx(
        test_reviews,
        char_container=cm,
        chars_per_word=CHARACTERS_PER_WORD,
        words_per_document=WORDS_PER_DOCUMENT,
        prepend=PREPEND)
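
Assuming to_char_level_idx pads/truncates to the fixed chars_per_word and words_per_document limits (so the result is rectangular), the converted data can be persisted with numpy, mirroring the np.save pattern of Example #7 below; the file names here are illustrative:

import numpy as np

np.save('IMDB_train_charlevel_X.npy', np.array(train_global_wvs_reviews))
np.save('IMDB_train_charlevel_y.npy', train_labels)
np.save('IMDB_test_charlevel_X.npy', np.array(test_global_wvs_reviews))
np.save('IMDB_test_charlevel_y.npy', test_labels)
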
Example #3

from cervantes.box import WordVectorBox
from cervantes.language import OneLevelEmbedding
from cervantes.nn.models import RNNClassifier
from nlpdatahandlers import ImdbDataHandler

YELP_FUNNY_TRAIN = '../yelp-dataset/TrainSet_funny_75064'
YELP_FUNNY_DEV = '../yelp-dataset/DevSet_funny_75064'
YELP_FUNNY_TEST = '../yelp-dataset/TestSet_funny_75064'

IMDB_DATA = '../deep-text/datasets/aclImdb/aclImdb'
WV_FILE = '../deep-text/embeddings/wv/glove.42B.300d.120000.txt'

if __name__ == '__main__':

    print "Getting data in format texts / labels"

    imdb = ImdbDataHandler(source=IMDB_DATA)
    (train_reviews, train_labels) = imdb.get_data(type=ImdbDataHandler.DATA_TRAIN)
    (test_reviews, test_labels) = imdb.get_data(type=ImdbDataHandler.DATA_TEST)
    train_reviews = train_reviews[:5000]
    test_reviews = test_reviews[:1000]
    train_labels = list(train_labels)[:5000]
    test_labels = list(test_labels)[:1000]

    #yelp = YelpDataHandler()
    #(train_reviews, train_labels, test_reviews, test_labels) = \
    #    yelp.get_data(YELP_FUNNY_TRAIN, YELP_FUNNY_DEV, YELP_FUNNY_TEST)

    print "Building language embeddings. This requires parsing text so it might " \
          "be pretty slow "
    # Compute text embeddings, containing the processed text tokens together with a vector-to-index
    # translation object (the vector box), should be pickled in order to be efficiently used with
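
As the comment says, parsing and indexing the text is the slow part, so the built objects are pickled once and later runs simply reload them. A minimal sketch of that round-trip (the file name and the embedding variable are illustrative):

import pickle

# Save once, right after the expensive build step.
with open('embeddings.pkl', 'wb') as f:
    pickle.dump(embedding, f, pickle.HIGHEST_PROTOCOL)

# Later runs skip the parsing and just reload the object.
with open('embeddings.pkl', 'rb') as f:
    embedding = pickle.load(f)
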
Example #4

import pickle

from nlpdatahandlers import ImdbDataHandler

# NOTE: this snippet starts at the __main__ guard. GloVeBox, the log() helper,
# the IMDB_DATA / IMDB_WV_FILE / GLOBAL_WV_FILE paths and the
# SENTENCES_PER_PARAGRAPH / WORDS_PER_SENTENCE limits live in the truncated
# part of the file (see Examples #6 and #7 for the same setup).

if __name__ == '__main__':

    log('Building word vectors from {}'.format(IMDB_WV_FILE))
    gb = GloVeBox(IMDB_WV_FILE)
    gb.build(zero_token=True, normalize_variance=False, normalize_norm=True)

    log('Building global word vectors from {}'.format(GLOBAL_WV_FILE))
    global_gb = GloVeBox(GLOBAL_WV_FILE)
    global_gb.build(zero_token=True, normalize_variance=False, normalize_norm=True)

    log('writing GloVeBox pickle...')
    pickle.dump(gb, open(IMDB_WV_FILE.replace('.txt', '-glovebox.pkl'), 'wb'), pickle.HIGHEST_PROTOCOL)
    pickle.dump(global_gb, open(GLOBAL_WV_FILE.replace('.txt', '-glovebox.pkl'), 'wb'), pickle.HIGHEST_PROTOCOL)

    log('Load data from original source')
    imdb = ImdbDataHandler(source=IMDB_DATA)
    (train_reviews, train_labels) = imdb.get_data(type=ImdbDataHandler.DATA_TRAIN)
    (test_reviews, test_labels) = imdb.get_data(type=ImdbDataHandler.DATA_TEST)

    log('Converting to sentences: global word vectors')
    train_global_wvs_reviews = imdb.to_sentence_level_idx(
        train_reviews, SENTENCES_PER_PARAGRAPH, WORDS_PER_SENTENCE, global_gb)
    test_global_wvs_reviews = imdb.to_sentence_level_idx(
        test_reviews, SENTENCES_PER_PARAGRAPH, WORDS_PER_SENTENCE, global_gb)

    log('Converting to sentences: only imdb word vectors')
    train_imdb_wvs_reviews = imdb.to_sentence_level_idx(
        train_reviews, SENTENCES_PER_PARAGRAPH, WORDS_PER_SENTENCE, gb)
    test_imdb_wvs_reviews = imdb.to_sentence_level_idx(
        test_reviews, SENTENCES_PER_PARAGRAPH, WORDS_PER_SENTENCE, gb)
Example #5

# NOTE: as in Example #1, the first import is truncated; the module name
# "textclassifiers" is an assumption.
from textclassifiers import (SGDTextClassifier, LogisticClassifier, SVMClassifier,
                             PerceptronClassifier, RandomForestTextClassifier)
from nlpdatahandlers import ImdbDataHandler

import sys

IMDB_DATA_DEFAULT = '../deep-text/datasets/aclImdb/aclImdb'

if __name__ == '__main__':

    if len(sys.argv) > 1 and sys.argv[1] != "":
        source = sys.argv[1]
    else:
        source = IMDB_DATA_DEFAULT

    print "Loading data from original source"
    imdb = ImdbDataHandler(source=source)
    (train_reviews,
     train_labels) = imdb.get_data(type=ImdbDataHandler.DATA_TRAIN,
                                   shuffle=True)
    (test_reviews, test_labels) = imdb.get_data(type=ImdbDataHandler.DATA_TEST,
                                                shuffle=True)

    # Simple bag of words with SGD
    sgd = SGDTextClassifier(train_reviews,
                            train_labels,
                            test_texts=test_reviews,
                            test_labels=test_labels,
                            compute_features=True)
    sgd.grid_search_cv(verbose=5, n_jobs=4)
    test_error = sgd.get_test_error()
    print "Test error in held out set: " + str(test_error)
Example #6

import logging

from nlpdatahandlers import ImdbDataHandler

# NOTE: the imports and logger setup are reconstructed; the CharMapper import
# path and the LOGGER_PREFIX value are assumptions.
from cervantes.language import CharMapper

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
LOGGER_PREFIX = '%s'

def log(msg, logger=logger):
    logger.info(LOGGER_PREFIX % msg)

IMDB_DATA = './datasets/aclImdb/aclImdb'

CHARACTERS_PER_WORD = 15
WORDS_PER_DOCUMENT = 300
PREPEND = False

if __name__ == '__main__':

    log('Initializing CharMapper')
    cm = CharMapper()

    log('Load data from original source')
    imdb = ImdbDataHandler(source=IMDB_DATA)
    (train_reviews, train_labels) = imdb.get_data(type=ImdbDataHandler.DATA_TRAIN)
    (test_reviews, test_labels) = imdb.get_data(type=ImdbDataHandler.DATA_TEST)

    log('Converting to character level representations')
    train_global_wvs_reviews = imdb.to_char_level_idx(
        train_reviews,
        char_container=cm,
        chars_per_word=CHARACTERS_PER_WORD,
        words_per_document=WORDS_PER_DOCUMENT,
        prepend=PREPEND)

    test_global_wvs_reviews = imdb.to_char_level_idx(
        test_reviews,
        char_container=cm,
        chars_per_word=CHARACTERS_PER_WORD,
        words_per_document=WORDS_PER_DOCUMENT,
        prepend=PREPEND)
Example #7

import logging
import pickle

import numpy as np

from nlpdatahandlers import ImdbDataHandler

# NOTE: imports reconstructed; the GloVeBox import path is an assumption
# (cervantes.box provides the vector boxes).
from cervantes.box import GloVeBox

# Minimal log() helper assumed; the original file defines it as in Example #6.
logging.basicConfig(level=logging.INFO)
log = logging.getLogger(__name__).info

IMDB_DATA = './datasets/aclImdb/aclImdb'
IMDB_WV_FILE = './embeddings/wv/IMDB-GloVe-300dim.txt'
GLOBAL_WV_FILE = './embeddings/wv/glove.42B.300d.120000.txt'
WORDS_PER_TEXT = 300

if __name__ == '__main__':

    log('Building global word vectors from {}'.format(GLOBAL_WV_FILE))
    global_gb = GloVeBox(GLOBAL_WV_FILE)
    global_gb.build(zero_token=True, normalize_variance=False, normalize_norm=True)

    log('writing GloVeBox pickle...')
    pickle.dump(global_gb, open(GLOBAL_WV_FILE.replace('.txt', '-glovebox.pkl'), 'wb'), pickle.HIGHEST_PROTOCOL)

    log('Load data from original source')
    imdb = ImdbDataHandler(source=IMDB_DATA)
    (train_reviews, train_labels) = imdb.get_data(type=ImdbDataHandler.DATA_TRAIN)
    (test_reviews, test_labels) = imdb.get_data(type=ImdbDataHandler.DATA_TEST)

    log('Converting to global word vectors - train')
    reviews_wvs_train = imdb.to_word_level_idx(train_reviews, global_gb, WORDS_PER_TEXT)
    # -- training data save
    np.save('IMDB_train_fulltext_glove_X.npy', reviews_wvs_train)
    np.save('IMDB_train_fulltext_glove_y.npy', train_labels)

    del reviews_wvs_train

    log('Converting to global word vectors - test')
    reviews_wvs_test = imdb.to_word_level_idx(test_reviews, global_gb, WORDS_PER_TEXT)
    # -- testing data save (the labels line below mirrors the train block; the
    #    original snippet is cut off at this point)
    np.save('IMDB_test_fulltext_glove_X.npy', reviews_wvs_test)
    np.save('IMDB_test_fulltext_glove_y.npy', test_labels)
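
A separate training script can then load the saved arrays directly (the test-labels file assumes the reconstructed save above):

import numpy as np

X_train = np.load('IMDB_train_fulltext_glove_X.npy')
y_train = np.load('IMDB_train_fulltext_glove_y.npy')
X_test = np.load('IMDB_test_fulltext_glove_X.npy')
y_test = np.load('IMDB_test_fulltext_glove_y.npy')
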
Example #8

from cervantes.box import WordVectorBox
from cervantes.language import OneLevelEmbedding
from cervantes.nn.models import RNNClassifier
from nlpdatahandlers import ImdbDataHandler

YELP_FUNNY_TRAIN = '../yelp-dataset/TrainSet_funny_75064'
YELP_FUNNY_DEV = '../yelp-dataset/DevSet_funny_75064'
YELP_FUNNY_TEST = '../yelp-dataset/TestSet_funny_75064'

IMDB_DATA = '../deep-text/datasets/aclImdb/aclImdb'
WV_FILE = '../deep-text/embeddings/wv/glove.42B.300d.120000.txt'

if __name__ == '__main__':

    print "Getting data in format texts / labels"

    imdb = ImdbDataHandler(source=IMDB_DATA)
    (train_reviews,
     train_labels) = imdb.get_data(type=ImdbDataHandler.DATA_TRAIN)
    (test_reviews, test_labels) = imdb.get_data(type=ImdbDataHandler.DATA_TEST)
    train_reviews = train_reviews[:5000]
    test_reviews = test_reviews[:1000]
    train_labels = list(train_labels)[:5000]
    test_labels = list(test_labels)[:1000]

    #yelp = YelpDataHandler()
    #(train_reviews, train_labels, test_reviews, test_labels) = \
    #    yelp.get_data(YELP_FUNNY_TRAIN, YELP_FUNNY_DEV, YELP_FUNNY_TEST)

    print "Building language embeddings. This requires parsing text so it might " \
          "be pretty slow "
    # Compute text embeddings, containing the processed text tokens together with a vector-to-index
if __name__ == "__main__":

    log("Building word vectors from {}".format(IMDB_WV_FILE))
    gb = GloVeBox(IMDB_WV_FILE)
    gb.build(zero_token=True, normalize_variance=False, normalize_norm=True)

    log("Building global word vectors from {}".format(GLOBAL_WV_FILE))
    global_gb = GloVeBox(GLOBAL_WV_FILE)
    global_gb.build(zero_token=True, normalize_variance=False, normalize_norm=True)

    log("writing GloVeBox pickle...")
    pickle.dump(gb, open(IMDB_WV_FILE.replace(".txt", "-glovebox.pkl"), "wb"), pickle.HIGHEST_PROTOCOL)
    pickle.dump(global_gb, open(GLOBAL_WV_FILE.replace(".txt", "-glovebox.pkl"), "wb"), pickle.HIGHEST_PROTOCOL)

    log("Load data from original source")
    imdb = ImdbDataHandler(source=IMDB_DATA)
    (train_reviews, train_labels) = imdb.get_data(type=ImdbDataHandler.DATA_TRAIN)
    (test_reviews, test_labels) = imdb.get_data(type=ImdbDataHandler.DATA_TEST)

    log("Converting to sentences: global word vectors")
    train_global_wvs_reviews = imdb.to_sentence_level_idx(
        train_reviews, SENTENCES_PER_PARAGRAPH, WORDS_PER_SENTENCE, global_gb
    )
    test_global_wvs_reviews = imdb.to_sentence_level_idx(
        test_reviews, SENTENCES_PER_PARAGRAPH, WORDS_PER_SENTENCE, global_gb
    )

    log("Converting to sentences: only imdb word vectors")
    train_imdb_wvs_reviews = imdb.to_sentence_level_idx(train_reviews, SENTENCES_PER_PARAGRAPH, WORDS_PER_SENTENCE, gb)
    test_imdb_wvs_reviews = imdb.to_sentence_level_idx(test_reviews, SENTENCES_PER_PARAGRAPH, WORDS_PER_SENTENCE, gb)