def check_examples(): dal = ReviewsDAL() r_stream = dal.load_words("train") for index in xrange(10): print "******* {} *******".format(index+1) r = next(r_stream) print r.sent.words
def test_code_words(): dal = ReviewsDAL() wordcoder = WordCoder() for index,r in enumerate(dal.sampling(5)): print "****************** [{}]".format(index+1) print r.sent.raw print "\n" print wordcoder.code(r.sent.words)
def test_load_review_words():
    # NOTE(review): an identical function with the same name is defined again
    # later in this file; at import time that later definition shadows this one.
    # Streams 10 reviews from the "unlabeled" collection and prints each
    # review's sentiment and word list.
    dal = ReviewsDAL()
    r_stream = dal.load_reviews_words("unlabeled")
    for index in xrange(10):
        review = next(r_stream)
        print "*************** {} ***************".format(index+1)
        print "sentiment: {}".format(review.sent.sentiment)
        print "words: {}".format(review.sent.words)
    dal.close()
def test_load_review_words(): dal = ReviewsDAL() r_stream = dal.load_reviews_words("unlabeled") for index in xrange(10): review = next(r_stream) print "*************** {} ***************".format(index + 1) print "sentiment: {}".format(review.sent.sentiment) print "words: {}".format(review.sent.words) dal.close()
def stream(self): dal = ReviewsDAL() review_stream = dal.load_words(self._colname) for index, r in enumerate(review_stream): self.append_sentiment_words(r.sent.words) yield r.sent.words if index % 300 == 0: print "{} examples loaded from mongodb[{}]".format(index + 1, self._colname) dal.close()
def words_stream(): word_coder = WordCoder() dal = ReviewsDAL() review_stream = dal.load_words() for index, r in enumerate(review_stream): yield word_coder.code(r.sent.words) if index % 300 == 0: print "{} examples loaded from mongodb".format(index + 1) dal.close()
def words_stream(self): self._metas = [] dal = ReviewsDAL() review_stream = dal.load_words(self._colname) for index, r in enumerate(review_stream): self._metas.append((r.id, r.sent.sentiment)) yield r.sent.words if index % 300 == 0: print "{} examples loaded from mongodb[{}]".format(index + 1, self._colname) dal.close()
def split_train_validation(): """ load samples from 'train' collection, draw some samples out, to use as validation set remove them from training set and insert those samples into 'validation' collection """ random.seed(999) valid_ratio = 0.3 dal = ReviewsDAL() train_ids = list(dal.load_ids("train")) total_train = len(train_ids) print "originally, there are {} reviews in train set".format(total_train) valid_ids = random.sample(train_ids,int(total_train * valid_ratio)) print "randomly draw {} samples to use as validation".format(len(valid_ids)) train_collect = dal._db['train'] valid_collect = dal._db['validate'] for index,valid_id in enumerate(valid_ids): # load from train collection cursor = train_collect.find({'_id':valid_id}) review_dict = next(cursor) # insert into validation collection valid_collect.insert_one(review_dict) # remove from train collection result = train_collect.delete_one({'_id':valid_id}) assert result.deleted_count == 1 # if index % 100 == 0: print "{} reviews transferred from train to validation".format(index+1) print "*** totally {} reviews transferred from train to validation ***".format(index+1) print "now, train set has {} reviews".format(train_collect.count({})) print "now, validation set has {} reviews".format(valid_collect.count({}))
def read_save_mongodb(filename,labeled,colname,buffersize=300): r_stream = reviews_stream(filename,labeled) dal = ReviewsDAL() buffer = [] for index,review in enumerate(r_stream): if index % buffersize == 0: dal.insert_many(colname,buffer) del buffer[:] # clear print "{} reviews saved into mongo[{}]".format(index,colname) buffer.append(review) dal.insert_many(colname,buffer) dal.close() print "----------- DONE -----------" print "totally {} reviews inserted into mongodb[{}]".format(index+1,colname)
def read_save_mongodb(buffersize=300): r_stream = review_stream() dal = ReviewsDAL() buffer = [] for index,review in enumerate(r_stream): if index % buffersize == 0: dal.insert_many(buffer) del buffer[:] # clear print "{} reviews saved into mongodb".format(index) buffer.append(review) dal.insert_many(buffer) dal.close() print "----------- DONE -----------" print "totally {} reviews inserted into mongodb".format(index+1)
# NOTE(review): this chunk begins mid-function — the statements below are the
# tail of a helper (apparently ``print_topics(sentence)``, called from the
# __main__ block) whose ``def`` header lies outside the visible chunk; the
# indentation is a best-effort reconstruction — confirm against the full file.
    # encode the sentence's words, map them into the LDA topic space,
    # and print a normalized MixTopic tag summary for the sentence
    coded_words = wordcoder.code(sentence.words)
    bow = dictionary.doc2bow(coded_words)
    topic_distribution = lda_model[bow]
    # sort topics by probability, most likely first
    topic_distribution.sort(key=lambda t: t[1], reverse=True)
    tags = None
    for index, (topic_id, topic_percentage) in enumerate(topic_distribution):
        mt = MixTopic(topic_mapping[topic_id])
        mt.weight(topic_percentage)
        if tags is None:
            tags = mt
        else:
            tags.add(mt)
    # NOTE(review): tags is None (AttributeError below) if the distribution
    # is empty — presumably lda_model always returns at least one topic; verify
    tags.normalize()
    print tags

if __name__ == "__main__":
    # sample 10 reviews and print per-sentence topic tags for each
    dal = ReviewsDAL()
    review_stream = dal.sampling(10)
    for index, review in enumerate(review_stream):
        print "*********** [{}] ***********".format(index + 1)
        for sentence in sent_tokenizer.tokenize(review.sent.raw):
            print_topics(sentence)
    dal.close()
def test_load_ids():
    """Load the ids of the 'train' collection (smoke test, result unused)."""
    dal = ReviewsDAL()
    train_ids = dal.load_ids("train")