Python ImdbDataHandler 예제들

프로그래밍 언어: Python

네임스페이스/패키지 이름: datahandlers

클래스/타입: ImdbDataHandler

hotexamples.com에서의 예제들: 2

Python ImdbDataHandler - 2개의 예제가 발견되었습니다. 이것들은 오픈소스 프로젝트에서 추출된 Python의 datahandlers.ImdbDataHandler에 대한 실세계 최고 등급의 예제들입니다. 예제들을 평가하여 예제의 품질 향상에 도움을 줄 수 있습니다.

자주 사용되는 메소드들

보기 숨기기

get_data(1)

to_sentence_vectors(1)

예제 #1

파일 보기

파일: prepare_imdb_new.py 프로젝트: alfredolainez/deep-text-classification

if __name__ == '__main__':

    log('Building word vectors from {}'.format(IMDB_WV_FILE))
    gb = GloVeBox(IMDB_WV_FILE)
    gb.build(zero_token=True, normalize_variance=False, normalize_norm=True)

    log('Building global word vectors from {}'.format(GLOBAL_WV_FILE))
    global_gb = GloVeBox(GLOBAL_WV_FILE)
    global_gb.build(zero_token=True, normalize_variance=False, normalize_norm=True)

    log('writing GloVeBox pickle...')
    pickle.dump(gb, open(IMDB_WV_FILE.replace('.txt', '-glovebox.pkl'), 'wb'), pickle.HIGHEST_PROTOCOL)
    pickle.dump(global_gb, open(GLOBAL_WV_FILE.replace('.txt', '-glovebox.pkl'), 'wb'), pickle.HIGHEST_PROTOCOL)

    log('Load data from original source')
    imdb = ImdbDataHandler(source=IMDB_DATA)
    (train_reviews, train_labels) = imdb.get_data(type=ImdbDataHandler.DATA_TRAIN)
    (test_reviews, test_labels) = imdb.get_data(type=ImdbDataHandler.DATA_TEST)

    log('Converting to sentences: global word vectors')
    train_global_wvs_reviews = imdb.to_sentence_vectors(train_reviews, SENTENCES_PER_PARAGRAPH,
                                                    WORDS_PER_SENTENCE, global_gb)
    test_global_wvs_reviews = imdb.to_sentence_vectors(test_reviews, SENTENCES_PER_PARAGRAPH,
                                                   WORDS_PER_SENTENCE, global_gb)

    log('Converting to sentences: only imdb word vectors')
    train_imdb_wvs_reviews = imdb.to_sentence_vectors(train_reviews, SENTENCES_PER_PARAGRAPH,
                                                    WORDS_PER_SENTENCE, gb)
    test_imdb_wvs_reviews = imdb.to_sentence_vectors(test_reviews, SENTENCES_PER_PARAGRAPH,
                                                   WORDS_PER_SENTENCE, gb)

예제 #2

파일 보기

파일: classic_classification.py 프로젝트: alfredolainez/deep-text-classification

from classic.classifiers import TextClassifier, NaiveBayesClassifier, SGDTextClassifier, \
    LogisticClassifier, SVMClassifier, PerceptronClassifier, RandomForestTextClassifier
from datahandlers import ImdbDataHandler

IMDB_DATA = './datasets/aclImdb/aclImdb'

if __name__ == '__main__':

    print "Loading data from original source"
    imdb = ImdbDataHandler(source=IMDB_DATA)
    (train_reviews, train_labels) = imdb.get_data(type=ImdbDataHandler.DATA_TRAIN)
    (test_reviews, test_labels) = imdb.get_data(type=ImdbDataHandler.DATA_TEST)
    # TODO: Shuffle data

    # Simple bag of words with SGD
    sgd = SGDTextClassifier(train_reviews, train_labels,
                            test_texts=test_reviews, test_labels=test_labels,
                            compute_features=True)
    sgd.grid_search_cv(verbose=0, n_jobs=4)

    # Simple bag of words with NB
    nb = NaiveBayesClassifier(train_reviews, train_labels,
                              test_texts=test_reviews, test_labels=test_labels)
    nb.set_bag_of_ngrams() # Also can compute bag of words manually
    nb.grid_search_cv(n_jobs=4)

    # Now shit with bigrams too
    sgd = SGDTextClassifier(train_reviews, train_labels, ngram_range=(1,2),
                            test_texts=test_reviews, test_labels=test_labels,
                            compute_features=True)
    sgd.grid_search_cv(n_jobs=4, verbose=1)