test_text.append(txt)

    # Report how many files are held out for testing
    print len(test_files)

    total_train_files = []
    TOTAL = INCREMENT
    UPPER_LIMIT = 500

    while len(total_train_files) < UPPER_LIMIT:
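        # Grow the training set by INCREMENT files each pass and track test-set perplexity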
        total_train_files = train_files[:TOTAL]
        data_set_corpus = PlaintextCorpusReader(sys.argv[1], total_train_files)
        # Lidstone-smoothed estimator (gamma = 0.2) for unseen n-grams
        estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
        # Train a trigram model over the current training slice
        lm = NgramModel(3, data_set_corpus.words(), estimator)
        #lm = NgramModel(2, data_set_corpus.words(), estimator)

        # Perplexity of each held-out test string under the current model
        P = []
        for s in test_text:
            s_tokens = nltk.word_tokenize(s)
            if SENTENCE:
                # Only score sentences longer than 10 tokens
                if len(s_tokens) > 10:
                    p = lm.perplexity(s_tokens)
                    P.append(p)
            else:
                p = lm.perplexity(s_tokens)
                P.append(p)
        TOTAL += INCREMENT

        print "%d %f" % (len(total_train_files), sum(P) / len(P))
Example #2
    print "Negative unigram model complete."
    neg_bigram_lm = NgramModel(2, neg_movie_reviews.words(), estimator)
    print "Negative bigram model complete."
    #neg_trigram_lm = NgramModel(3, neg_movie_reviews.words(), estimator)

    # Read in the tweets and score each against both sentiment models
    tweets = []
    tokenizer = utils.Tokenizer()

    neg_review_higher = 0
    pos_review_higher = 0
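    # Compare each tweet's perplexity under the positive and negative models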
    with open(sys.argv[2], 'r') as tweets_file:
        tweets.extend(tweets_file.readlines())
        for tweet in tweets:
            tokens = tokenizer.tokenize(tweet)
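            # Perplexity under each sentiment model; lower perplexity means a better fit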
            pu = pos_unigram_lm.perplexity(tokens)
            nu = neg_unigram_lm.perplexity(tokens)

            pb = pos_bigram_lm.perplexity(tokens)
            nb = neg_bigram_lm.perplexity(tokens)

            #pt = pos_trigram_lm.perplexity(tokens)
            #nt = neg_trigram_lm.perplexity(tokens)

            #print pu, nu, pb, nb, pt, nt
            #print pu, nu

            line = ""
            if pu > nu:
                pos_review_higher += 1
                line += "9002:1"