Example #1
File: tests.py Project: stasi009/MyKaggle
def test_load_review_words():
    dal = ReviewsDAL()
    r_stream = dal.load_reviews_words("unlabeled")

    for index in xrange(10):
        review = next(r_stream)
        print "*************** {} ***************".format(index+1)
        print "sentiment: {}".format(review.sent.sentiment)
        print "words: {}".format(review.sent.words)

    dal.close()
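The examples in this listing only show the consumer side of ReviewsDAL. For orientation, here is a minimal sketch of what a generator-backed load_reviews_words could look like on top of pymongo; the class layout, field names, and the "popcorn" database name are assumptions for illustration, not the actual MyKaggle implementation.

from collections import namedtuple
from pymongo import MongoClient

Sentence = namedtuple("Sentence", ["raw", "words", "sentiment"])
Review = namedtuple("Review", ["id", "sent"])

class ReviewsDAL(object):
    def __init__(self, host="localhost", port=27017):
        self._client = MongoClient(host, port)
        self._db = self._client["popcorn"]  # hypothetical database name

    def load_reviews_words(self, colname):
        # yield one review at a time so callers can stream instead of
        # loading the whole collection into memory
        for d in self._db[colname].find():
            sent = Sentence(d.get("raw"), d.get("words"), d.get("sentiment"))
            yield Review(d["_id"], sent)

    def close(self):
        self._client.close()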
Example #3
    def stream(self):
        dal = ReviewsDAL()
        review_stream = dal.load_words(self._colname)
        for index, r in enumerate(review_stream):

            self.append_sentiment_words(r.sent.words)
            yield r.sent.words

            if index % 300 == 0:
                print "{} examples loaded from mongodb[{}]".format(index + 1, self._colname)

        dal.close()
Example #4
def words_stream():
    word_coder = WordCoder()

    dal = ReviewsDAL()
    review_stream = dal.load_words()
    for index, r in enumerate(review_stream):
        yield word_coder.code(r.sent.words)

        if index % 300 == 0:
            print "{} examples loaded from mongodb".format(index + 1)

    dal.close()
Example #5
    def words_stream(self):
        self._metas = []

        dal = ReviewsDAL()
        review_stream = dal.load_words(self._colname)
        for index, r in enumerate(review_stream):
            self._metas.append((r.id, r.sent.sentiment))
            yield r.sent.words

            if index % 300 == 0:
                print "{} examples loaded from mongodb[{}]".format(index + 1, self._colname)

        dal.close()
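A word stream like the one above is typically fed straight into gensim. Below is a minimal sketch of building a dictionary and a bag-of-words corpus from it, assuming a hypothetical WordsStream wrapper that exposes the words_stream() method shown above; the file names are also illustrative.

from gensim import corpora

stream = WordsStream("train")  # hypothetical wrapper around the method above
dictionary = corpora.Dictionary(stream.words_stream())
dictionary.filter_extremes(no_below=5, no_above=0.5)  # drop very rare / very common words

# second pass over the data: calling the generator method again restarts the stream
bow_corpus = [dictionary.doc2bow(words) for words in stream.words_stream()]
corpora.MmCorpus.serialize("reviews_bow.mm", bow_corpus)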
Example #6
def read_save_mongodb(filename, labeled, colname, buffersize=300):
    # stream reviews from the file and insert them into MongoDB in batches of `buffersize`
    r_stream = reviews_stream(filename, labeled)
    dal = ReviewsDAL()

    buffer = []
    for index, review in enumerate(r_stream):
        if index % buffersize == 0:
            dal.insert_many(colname, buffer)
            del buffer[:]  # clear
            print "{} reviews saved into mongo[{}]".format(index, colname)

        buffer.append(review)

    dal.insert_many(colname, buffer)
    dal.close()

    print "----------- DONE -----------"
    print "totally {} reviews inserted into mongodb[{}]".format(index + 1, colname)
Example #7
def read_save_mongodb(buffersize=300):
    r_stream = review_stream()
    dal = ReviewsDAL()

    buffer = []
    for index, review in enumerate(r_stream):
        if index % buffersize == 0:
            dal.insert_many(buffer)
            del buffer[:] # clear
            print "{} reviews saved into mongodb".format(index)

        buffer.append(review)

    dal.insert_many(buffer)
    dal.close()

    print "----------- DONE -----------"
    print "totally {} reviews inserted into mongodb".format(index+1)
Example #8
def print_topics(sentence):
    coded_words = wordcoder.code(sentence.words)
    bow = dictionary.doc2bow(coded_words)

    topic_distribution = lda_model[bow]
    topic_distribution.sort(key=lambda t: t[1], reverse=True)

    tags = None
    for index, (topic_id, topic_percentage) in enumerate(topic_distribution):
        mt = MixTopic(topic_mapping[topic_id])
        mt.weight(topic_percentage)

        if tags is None:
            tags = mt
        else:
            tags.add(mt)

    tags.normalize()
    print tags

if __name__ == "__main__":
    dal = ReviewsDAL()
    review_stream = dal.sampling(10)

    for index, review in enumerate(review_stream):
        print "*********** [{}] ***********".format(index+1)

        for sentence in sent_tokenizer.tokenize(review.sent.raw):
            print_topics(sentence)

    dal.close()
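The snippet above assumes a sentence tokenizer, a dictionary, an LDA model, and the project-specific wordcoder, MixTopic, and topic_mapping objects are already in scope. A sketch of how the generic pieces could be loaded (paths are illustrative, not taken from the project):

import nltk.data
from gensim import corpora, models

sent_tokenizer = nltk.data.load("tokenizers/punkt/english.pickle")
dictionary = corpora.Dictionary.load("reviews.dict")   # hypothetical path
lda_model = models.LdaModel.load("reviews_lda.model")  # hypothetical path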