def get_review_tfidf_dict(self, a_document_name):
    """Compute TF-IDF weights for every term of every review in a file.

    Each line of ``a_document_name`` is parsed with ``Hw1.read_line``; the
    parsed record is indexed with ``'review_id'`` and ``'text'``.  Term
    counts per review come from ``Hw1.unigram_count`` (converted with
    ``dict()`` and treated as ``term -> count``).

    The weight of a term in a review is::

        (1 + log10(tf)) * (ln(N) - ln(df))

    where ``tf`` is the term's count in the review, ``N`` the number of
    distinct review ids and ``df`` the number of reviews containing the
    term.  Note that the TF part uses base 10 and the IDF part the natural
    logarithm; this is kept as-is so the scores do not change.

    Args:
        a_document_name: path of the review file, one record per line.

    Returns:
        A ``(tfidf_dict, review_id_mapper)`` tuple.  ``tfidf_dict`` maps
        review id -> {term -> weight}; ``review_id_mapper`` maps review id
        -> the parsed record returned by ``Hw1.read_line``.

    Side effects:
        Prints the review count and up to eleven sample entries to stdout.
    """
    hw1 = Hw1()
    review_id_content_dict = {}
    review_id_mapper = {}

    # The context manager guarantees the file is closed even if parsing
    # one of the lines raises.
    with open(a_document_name, 'r') as reviews_file:
        for raw_line in reviews_file:
            review = hw1.read_line(raw_line)
            review_id = review['review_id']
            review_id_mapper[review_id] = review
            review_id_content_dict[review_id] = dict(
                hw1.unigram_count(review['text']))

    # Document frequency: number of reviews in which each term occurs.
    # Counting straight from the per-review term dicts is a single pass and
    # guarantees every term scored below has df >= 1 (no missing keys, no
    # log(0)), without stemming every review a second time.
    term_review_freq_dict = {}
    for term_counts in review_id_content_dict.values():
        for word in term_counts:
            term_review_freq_dict[word] = term_review_freq_dict.get(word, 0) + 1

    total_reviews = len(review_id_content_dict)
    print('Total' + str(total_reviews))

    # log(N) is loop-invariant; guard the empty-file case because log(0)
    # raises and the value is never used when there are no reviews.
    log_total_reviews = math.log(total_reviews) if total_reviews else 0.0

    tfidf_dict = {}
    for review_id, term_counts in review_id_content_dict.items():
        weights = {}
        for word, count in term_counts.items():
            weights[word] = (1 + math.log10(count)) * (
                log_total_reviews - math.log(term_review_freq_dict[word]))
        tfidf_dict[review_id] = weights

    # Debug output: show at most the first eleven entries.
    for shown, review_id in enumerate(tfidf_dict, 1):
        print(str(review_id) + ':' + str(tfidf_dict[review_id]))
        if shown > 10:
            break

    return tfidf_dict, review_id_mapper
def get_review_tfidf_dict(self, a_document_name):
    """Compute TF-IDF weights for every term of every review in a file.

    Each line of ``a_document_name`` is parsed with ``Hw1.read_line``; the
    parsed record is indexed with ``'review_id'`` and ``'text'``.  Term
    counts per review come from ``Hw1.unigram_count`` (converted with
    ``dict()`` and treated as ``term -> count``).

    The weight of a term in a review is::

        (1 + log10(tf)) * (ln(N) - ln(df))

    where ``tf`` is the term's count in the review, ``N`` the number of
    distinct review ids and ``df`` the number of reviews containing the
    term.  Note that the TF part uses base 10 and the IDF part the natural
    logarithm; this is kept as-is so the scores do not change.

    Args:
        a_document_name: path of the review file, one record per line.

    Returns:
        A ``(tfidf_dict, review_id_mapper)`` tuple.  ``tfidf_dict`` maps
        review id -> {term -> weight}; ``review_id_mapper`` maps review id
        -> the parsed record returned by ``Hw1.read_line``.

    Side effects:
        Prints the review count and up to eleven sample entries to stdout.
    """
    hw1 = Hw1()
    review_id_content_dict = {}
    review_id_mapper = {}

    # The context manager guarantees the file is closed even if parsing
    # one of the lines raises.
    with open(a_document_name, 'r') as reviews_file:
        for raw_line in reviews_file:
            review = hw1.read_line(raw_line)
            review_id = review['review_id']
            review_id_mapper[review_id] = review
            review_id_content_dict[review_id] = dict(
                hw1.unigram_count(review['text']))

    # Document frequency: number of reviews in which each term occurs.
    # Counting straight from the per-review term dicts is a single pass and
    # guarantees every term scored below has df >= 1 (no missing keys, no
    # log(0)), without stemming every review a second time.
    term_review_freq_dict = {}
    for term_counts in review_id_content_dict.values():
        for word in term_counts:
            term_review_freq_dict[word] = term_review_freq_dict.get(word, 0) + 1

    total_reviews = len(review_id_content_dict)
    print('Total' + str(total_reviews))

    # log(N) is loop-invariant; guard the empty-file case because log(0)
    # raises and the value is never used when there are no reviews.
    log_total_reviews = math.log(total_reviews) if total_reviews else 0.0

    tfidf_dict = {}
    for review_id, term_counts in review_id_content_dict.items():
        weights = {}
        for word, count in term_counts.items():
            weights[word] = (1 + math.log10(count)) * (
                log_total_reviews - math.log(term_review_freq_dict[word]))
        tfidf_dict[review_id] = weights

    # Debug output: show at most the first eleven entries.
    for shown, review_id in enumerate(tfidf_dict, 1):
        print(str(review_id) + ':' + str(tfidf_dict[review_id]))
        if shown > 10:
            break

    return tfidf_dict, review_id_mapper
# task-4: print the "text" field of the first review after tokenizing,
# passing it through Hw1.stopword and then Hw1.stemming.
from hw1 import Hw1

infilename = 'review_KcSJUq1kwO8awZRMS6Q49g'

# Only the first line is needed; the context manager closes the handle as
# soon as it has been read instead of leaking it until interpreter exit.
with open(infilename, 'r') as f:
    first_line = f.readline()

tokens = Hw1.tokenize(Hw1.read_line(first_line)['text'])
stop_removed = Hw1.stopword(tokens)
stemmed = Hw1.stemming(stop_removed)

print('task-4.py // print out the "text" part of the first review after stemming')
print(stemmed)
# task-1: print the "text" field of the 300th line (review) of the file.
from hw1 import Hw1

infilename = 'review_KcSJUq1kwO8awZRMS6Q49g'
target_review = 300  # 1-based line number of the review to print

# The context manager closes the file once the wanted line has been read.
with open(infilename, 'r') as f:
    # Discard the lines that precede the target one.
    for _ in range(target_review - 1):
        f.readline()
    target_line = f.readline()

print('task-1.py // print out the "text" part of the 300th review')
print(Hw1.read_line(target_line)['text'])
# task-1: print the "text" field of the 300th line (review) of the file.
from hw1 import Hw1

infilename = 'review_KcSJUq1kwO8awZRMS6Q49g'
target_review = 300  # 1-based line number of the review to print

# The context manager closes the file once the wanted line has been read.
with open(infilename, 'r') as f:
    # Discard the lines that precede the target one.
    for _ in range(target_review - 1):
        f.readline()
    target_line = f.readline()

print('task-1.py // print out the "text" part of the 300th review')
print(Hw1.read_line(target_line)['text'])
# Print the original "text" field and its tokenization for the first two
# lines (reviews) of the file.
from hw1 import Hw1

infilename = 'review_KcSJUq1kwO8awZRMS6Q49g'

# The context manager closes the file when the loop finishes or raises.
with open(infilename, 'r') as f:
    for _ in range(2):
        line = Hw1.read_line(f.readline())['text']
        print("Original: %s" % line)
        print(Hw1.tokenize(line))
# task-4: print the "text" field of the first review after tokenizing,
# passing it through Hw1.stopword and then Hw1.stemming.
from hw1 import Hw1

infilename = 'review_KcSJUq1kwO8awZRMS6Q49g'

# Only the first line is needed; the context manager closes the handle as
# soon as it has been read instead of leaking it until interpreter exit.
with open(infilename, 'r') as f:
    first_line = f.readline()

tokens = Hw1.tokenize(Hw1.read_line(first_line)['text'])
stop_removed = Hw1.stopword(tokens)
stemmed = Hw1.stemming(stop_removed)

print('task-4.py // print out the "text" part of the first review after stemming')
print(stemmed)
# task-2: print the tokenized "text" field of the first three lines
# (reviews) of the file.
from hw1 import Hw1

infilename = 'review_KcSJUq1kwO8awZRMS6Q49g'

# The file is opened before the banner is printed (as before) and is closed
# by the context manager when the loop finishes or raises.
with open(infilename, 'r') as f:
    print('task-2.py // print out the tokenized "text" part of the first 3 reviews')
    for _ in range(3):
        tokens = Hw1.tokenize(Hw1.read_line(f.readline())['text'])
        print(tokens)
# task-6: print whatever Hw1.bigram_count returns for the review file
# (per the banner below: the top twenty bigrams, without stemming).
from hw1 import Hw1

reviews_path = 'review_KcSJUq1kwO8awZRMS6Q49g'

# The Hw1 instance is created before the banner is printed, so a failure in
# its constructor produces no partial output.
analyzer = Hw1()

print('task-6.py // find the top twenty bigrams in all reviews without stemming')
bigram_report = analyzer.bigram_count(reviews_path)
print(bigram_report)
# Print the result of Hw1.stopword applied to the tokenized "text" field of
# the first line (review) of the file.
from hw1 import Hw1

infilename = 'review_KcSJUq1kwO8awZRMS6Q49g'

# Only the first line is read (the former counter loop ran exactly once);
# the context manager closes the handle right after the read.
with open(infilename, 'r') as f:
    first_line = f.readline()

words = Hw1.tokenize(Hw1.read_line(first_line)['text'])
print(Hw1.stopword(words))