Пример #1
0
    def get_review_tfidf_dict(self, a_document_name):
        """Compute per-review TF-IDF weights for the reviews in a file.

        Every line of ``a_document_name`` is parsed with ``Hw1.read_line``.
        The parsed record is indexed by its ``'review_id'`` and its
        ``'text'`` is converted to a dict via ``Hw1.unigram_count``
        (presumably (term, count) pairs -- confirm against Hw1).

        The weight of a term in a review is
        ``(1 + log10(tf)) * (ln(N) - ln(df))`` where ``tf`` is the value
        stored for the term in that review, ``N`` is the number of distinct
        review ids and ``df`` is the number of reviews containing the term.

        As a debugging aid the review total and the scores of up to eleven
        reviews are printed.

        Args:
            a_document_name: path of the review file, one record per line.

        Returns:
            A ``(tfidf_dict, review_id_mapper)`` tuple. ``tfidf_dict`` maps
            review id -> {term: weight}; ``review_id_mapper`` maps review
            id -> the parsed record.
        """
        hw1 = Hw1()
        review_id_content_dict = {}
        review_id_mapper = {}

        # The context manager guarantees the file is closed even if parsing
        # a line raises.
        with open(a_document_name, 'r') as review_file:
            for raw_line in review_file:
                record = hw1.read_line(raw_line)
                review_id = record['review_id']
                review_id_mapper[review_id] = record
                review_id_content_dict[review_id] = dict(
                    hw1.unigram_count(record['text']))

        # Number of reviews in which each term occurs. Counting straight
        # from the per-review term dicts is linear in the total number of
        # entries and guarantees that every term scored below has an entry
        # (a vocabulary built by a separate stemming pass was not guaranteed
        # to contain every counted term).
        term_review_freq_dict = {}
        for term_counts in review_id_content_dict.values():
            for word in term_counts:
                term_review_freq_dict[word] = (
                    term_review_freq_dict.get(word, 0) + 1)

        total_reviews = len(review_id_content_dict)
        print('Total' + str(total_reviews))

        # The inverse document frequency depends only on the term, so it is
        # computed once per term. With no reviews this dict is empty and
        # math.log(0) is never evaluated.
        # NOTE(review): tf uses log10 while idf uses the natural log; kept
        # as-is so existing scores do not change.
        idf_by_word = {
            word: math.log(total_reviews) - math.log(review_freq)
            for word, review_freq in term_review_freq_dict.items()
        }

        tfidf_dict = {}
        for review_id, term_counts in review_id_content_dict.items():
            tfidf_dict[review_id] = {
                word: (1 + math.log10(count)) * idf_by_word[word]
                for word, count in term_counts.items()
            }

        # Debug preview: print at most the first eleven reviews.
        for shown, key in enumerate(tfidf_dict):
            if shown > 10:
                break
            print(str(key) + ':' + str(tfidf_dict[key]))

        return tfidf_dict, review_id_mapper
Пример #2
0
    def get_review_tfidf_dict(self, a_document_name):
        """Compute per-review TF-IDF weights for the reviews in a file.

        Every line of ``a_document_name`` is parsed with ``Hw1.read_line``.
        The parsed record is indexed by its ``'review_id'`` and its
        ``'text'`` is converted to a dict via ``Hw1.unigram_count``
        (presumably (term, count) pairs -- confirm against Hw1).

        The weight of a term in a review is
        ``(1 + log10(tf)) * (ln(N) - ln(df))`` where ``tf`` is the value
        stored for the term in that review, ``N`` is the number of distinct
        review ids and ``df`` is the number of reviews containing the term.

        As a debugging aid the review total and the scores of up to eleven
        reviews are printed.

        Args:
            a_document_name: path of the review file, one record per line.

        Returns:
            A ``(tfidf_dict, review_id_mapper)`` tuple. ``tfidf_dict`` maps
            review id -> {term: weight}; ``review_id_mapper`` maps review
            id -> the parsed record.
        """
        hw1 = Hw1()
        review_id_content_dict = {}
        review_id_mapper = {}

        # The context manager guarantees the file is closed even if parsing
        # a line raises.
        with open(a_document_name, 'r') as review_file:
            for raw_line in review_file:
                record = hw1.read_line(raw_line)
                review_id = record['review_id']
                review_id_mapper[review_id] = record
                review_id_content_dict[review_id] = dict(
                    hw1.unigram_count(record['text']))

        # Number of reviews in which each term occurs. Counting straight
        # from the per-review term dicts is linear in the total number of
        # entries and guarantees that every term scored below has an entry
        # (a vocabulary built by a separate stemming pass was not guaranteed
        # to contain every counted term).
        term_review_freq_dict = {}
        for term_counts in review_id_content_dict.values():
            for word in term_counts:
                term_review_freq_dict[word] = (
                    term_review_freq_dict.get(word, 0) + 1)

        total_reviews = len(review_id_content_dict)
        print('Total' + str(total_reviews))

        # The inverse document frequency depends only on the term, so it is
        # computed once per term. With no reviews this dict is empty and
        # math.log(0) is never evaluated.
        # NOTE(review): tf uses log10 while idf uses the natural log; kept
        # as-is so existing scores do not change.
        idf_by_word = {
            word: math.log(total_reviews) - math.log(review_freq)
            for word, review_freq in term_review_freq_dict.items()
        }

        tfidf_dict = {}
        for review_id, term_counts in review_id_content_dict.items():
            tfidf_dict[review_id] = {
                word: (1 + math.log10(count)) * idf_by_word[word]
                for word, count in term_counts.items()
            }

        # Debug preview: print at most the first eleven reviews.
        for shown, key in enumerate(tfidf_dict):
            if shown > 10:
                break
            print(str(key) + ':' + str(tfidf_dict[key]))

        return tfidf_dict, review_id_mapper
Пример #3
0
from hw1 import Hw1

# Script: print the stemmed tokens of the first review's "text" field.
infilename = 'review_KcSJUq1kwO8awZRMS6Q49g'

# Only the first line is needed; the context manager closes the file as
# soon as it has been read.
with open(infilename, 'r') as f:
    first_review = Hw1.read_line(f.readline())

# Pipeline: tokenize -> Hw1.stopword -> Hw1.stemming.
tokens = Hw1.tokenize(first_review['text'])
stop_removed = Hw1.stopword(tokens)
stemmed = Hw1.stemming(stop_removed)

# Single-argument print(...) emits the same output under Python 2 and 3.
print('task-4.py // print out the "text" part of the first review after stemming')
print(stemmed)
Пример #4
0
from hw1 import Hw1

# Script: print the "text" field of the 300th review in the file.
infilename = 'review_KcSJUq1kwO8awZRMS6Q49g'

# The context manager closes the file once the 300th line has been printed.
with open(infilename, 'r') as f:
    # Discard the first 299 lines so the next read returns line 300.
    for _ in range(299):
        f.readline()

    # Single-argument print(...) emits the same output under Python 2 and 3.
    print('task-1.py // print out the "text" part of the 300th review')
    print(Hw1.read_line(f.readline())['text'])
Пример #5
0
from hw1 import Hw1

# Script: print the "text" field of the 300th review in the file.
infilename = 'review_KcSJUq1kwO8awZRMS6Q49g'

# The context manager closes the file once the 300th line has been printed.
with open(infilename, 'r') as f:
    # Discard the first 299 lines so the next read returns line 300.
    for _ in range(299):
        f.readline()

    # Single-argument print(...) emits the same output under Python 2 and 3.
    print('task-1.py // print out the "text" part of the 300th review')
    print(Hw1.read_line(f.readline())['text'])
Пример #6
0
from hw1 import Hw1

# Script: show the original and tokenized "text" of the first two reviews.
infilename = 'review_KcSJUq1kwO8awZRMS6Q49g'

# The context manager closes the file once both reviews have been printed.
with open(infilename, 'r') as f:
    # The original counter ran for line_num = 1 and 2, i.e. two reviews.
    for _ in range(2):
        line = Hw1.read_line(f.readline())['text']
        # Single-argument print(...) emits the same output under
        # Python 2 and 3.
        print("Original: %s" % line)
        print(Hw1.tokenize(line))

Пример #7
0
from hw1 import Hw1

# Script: print the stemmed tokens of the first review's "text" field.
infilename = 'review_KcSJUq1kwO8awZRMS6Q49g'

# Only the first line is needed; the context manager closes the file as
# soon as it has been read.
with open(infilename, 'r') as f:
    first_review = Hw1.read_line(f.readline())

# Pipeline: tokenize -> Hw1.stopword -> Hw1.stemming.
tokens = Hw1.tokenize(first_review['text'])
stop_removed = Hw1.stopword(tokens)
stemmed = Hw1.stemming(stop_removed)

# Single-argument print(...) emits the same output under Python 2 and 3.
print('task-4.py // print out the "text" part of the first review after stemming')
print(stemmed)
Пример #8
0
from hw1 import Hw1

# Script: print the tokenized "text" field of the first three reviews.
infilename = 'review_KcSJUq1kwO8awZRMS6Q49g'

# The context manager closes the file once the three reviews are printed.
with open(infilename, 'r') as f:
    # Single-argument print(...) emits the same output under Python 2 and 3.
    print('task-2.py // print out the tokenized "text" part of the first 3 reviews')

    for _ in range(3):
        tokens = Hw1.tokenize(Hw1.read_line(f.readline())['text'])
        print(tokens)
Пример #9
0
from hw1 import Hw1

# Script: print whatever Hw1.bigram_count returns for the review file.
infilename = 'review_KcSJUq1kwO8awZRMS6Q49g'

# The Hw1 instance is created before the banner is printed, as before.
hw = Hw1()
banner = 'task-6.py // find the top twenty bigrams in all reviews without stemming'
print(banner)

bigram_result = hw.bigram_count(infilename)
print(bigram_result)
Пример #10
0
from hw1 import Hw1

# Script: print the first review's tokens after Hw1.stopword filtering.
infilename = 'review_KcSJUq1kwO8awZRMS6Q49g'

# The original counter loop executed exactly once, so only the first line
# is read; the context manager closes the file right after.
with open(infilename, 'r') as f:
    line = Hw1.read_line(f.readline())['text']

words = Hw1.tokenize(line)
# Single-argument print(...) emits the same output under Python 2 and 3.
print(Hw1.stopword(words))