def generateTfidf(dataset_train_filename):
    """Yield ``(stars, Counter)`` pairs built from TF-IDF transformed examples.

    Each ``(tokens, stars)`` pair produced by
    ``generateYelpExample(dataset_train_filename)`` is converted to a
    bag-of-words with ``dictionary_train.doc2bow`` and then looked up in
    ``tfidf_model``. Examples whose transformed document is empty are
    skipped.

    Args:
        dataset_train_filename: Passed unchanged to ``generateYelpExample``.

    Yields:
        Tuples ``(stars, features)`` where ``features`` is a ``Counter``
        built from the transformed document.
    """
    for tokens, stars in generateYelpExample(dataset_train_filename):
        bag_of_words = dictionary_train.doc2bow(tokens)
        # presumably a sequence of (term_id, weight) pairs -- confirm
        # against the type of tfidf_model.
        weighted_terms = tfidf_model[bag_of_words]
        if len(weighted_terms) == 0:
            # Nothing survived the transformation; emit no example.
            continue
        features = Counter(dict(weighted_terms))
        yield stars, features
def generateWordFreq(dataset_train_filename):
    """Yield ``(stars, Counter)`` pairs of normalised bag-of-words values.

    Each ``(tokens, stars)`` pair produced by
    ``generateYelpExample(dataset_train_filename)`` is converted to a
    bag-of-words with ``dictionary_train.doc2bow``; every value is then
    divided by the sum of all values in that document, so the values of
    each yielded ``Counter`` sum to 1 (up to float rounding).

    Args:
        dataset_train_filename: Passed unchanged to ``generateYelpExample``.

    Yields:
        Tuples ``(stars, frequencies)`` where ``frequencies`` is a
        ``Counter`` of float values.
    """
    for tokens, stars in generateYelpExample(dataset_train_filename):
        raw_counts = Counter(dict(dictionary_train.doc2bow(tokens)))
        # NOTE(review): the original one-line layout does not show whether
        # empty documents were skipped or yielded as empty Counters; this
        # assumes they are skipped, as generateTfidf does -- confirm.
        if not raw_counts:
            continue
        # float() keeps the division fractional even for integer counts.
        total = float(sum(raw_counts.values()))
        frequencies = Counter(
            {term: count / total for term, count in raw_counts.items()}
        )
        yield stars, frequencies