Пример #1
0
    def __init__(self, a_document_name):
        """Build per-review TF-IDF vectors from a file of JSON review lines.

        a_document_name: path to a file with one JSON-encoded review per
        line (parsed by TextProcess.read_line).

        Populates:
          self.term_counts -- document frequency per stemmed term
          self.reviews     -- review_id -> raw review text (utf-8 bytes)
          self.idf         -- term -> inverse document frequency
          self.tfidf_dict  -- review_id -> tf-idf vector, ordered by
                              self.term_counts iteration order
        """
        # term counts: number of documents that contain a given term (key)
        self.term_counts = dict()
        self.reviews = dict()

        # Parse each line exactly once (the original called read_line three
        # times per line) and close the file deterministically.
        with open(a_document_name) as review_file:
            for review in review_file:
                parsed = TextProcess.read_line(review)
                review_id = parsed['review_id'].encode('utf8')
                self.reviews[review_id] = parsed['text'].encode('utf8')
                tokens = TextProcess.stemming(
                    TextProcess.stopword(TextProcess.tokenize(parsed)))
                # Count each term at most once per document.
                for token in set(tokens):
                    if token in self.term_counts:
                        self.term_counts[token] += 1
                    else:
                        self.term_counts[token] = 1

        self.idf = dict()
        num_reviews = len(self.reviews)
        for term, term_count in self.term_counts.iteritems():
            # float() avoids Python 2 integer (floor) division, which
            # silently truncated the N/df ratio in the original.
            self.idf[term] = log(num_reviews / float(term_count))

        self.tfidf_dict = dict()
        for review_id, review in self.reviews.iteritems():
            tokens = TextProcess.stemming(
                TextProcess.stopword(re.findall(r'\w+', review.lower())))
            tf = collections.Counter(tokens)
            review_tfidfs = list()
            # Iterate keys only -- the document-frequency value is unused here.
            for term in self.term_counts:
                if term in tf:
                    # Sublinear tf scaling: 1 + log(tf).
                    review_tfidfs.append((1 + log(tf[term])) * self.idf[term])
                else:
                    review_tfidfs.append(0)
            self.tfidf_dict[review_id] = review_tfidfs
Пример #2
0
def text_preproc(text):
    """Run the standard TextProcess cleaning pipeline over *text*.

    Steps, in order: collapse whitespace, lowercase, strip HTML, URLs,
    numbers, punctuation and stopwords, then drop empty lines.
    """
    pipeline = (
        TextProcess.shrink_whitespace,
        TextProcess.tolower,
        TextProcess.remove_html,
        TextProcess.remove_url,
        TextProcess.remove_number,
        TextProcess.remove_punctuation,
        TextProcess.remove_stopword,
        TextProcess.shrink_empty_line,
    )
    for step in pipeline:
        text = step(text)
    return text
Пример #3
0
def text_preproc(text):
    """Apply the TextProcess cleanup steps to *text* and return the result.

    Order: shrinkWhitespace, toLower, removeHTML, removeURL, removeNumber,
    removePunctuation, removeStopword, shrinkEmptyLine.
    """
    cleaned = TextProcess.shrinkWhitespace(text)
    cleaned = TextProcess.removeHTML(TextProcess.toLower(cleaned))
    cleaned = TextProcess.removeNumber(TextProcess.removeURL(cleaned))
    cleaned = TextProcess.removeStopword(TextProcess.removePunctuation(cleaned))
    return TextProcess.shrinkEmptyLine(cleaned)
Пример #4
0
def getRestaurants(inFileName='restaurants.json', limit=14303):
    """Load up to *limit* restaurant records from a JSON-lines file.

    inFileName: path to a file with one JSON object per line; default
                keeps the original hard-coded 'restaurants.json'.
    limit: number of lines to read; default keeps the original
           hard-coded 14303 record count.
    Returns a list of records parsed by TextProcess.read_line.
    """
    restaurants = []
    # Context manager fixes the leaked file handle of the original;
    # readline() past EOF returns '' (same behavior as before).
    with open(inFileName, 'r') as f:
        for _ in range(limit):
            restaurants.append(TextProcess.read_line(f.readline()))
    return restaurants
Пример #5
0
 def calc_tweet_scores(self):
         '''Assign a relevancy score to every collected tweet and keep the
         top results per tracked word.

         NOTE(review): the weights this docstring originally advertised
         (20% favourites / 30% retweets / 30% followers / 20% total
         tweets) do NOT match the code below: the actual multipliers are
         0.25, 0.025 and 0.05, and the statuses_count term is commented
         out -- confirm the intended weighting.

         Side effects:
           - dumps self.tweets to test.json and each word's tweet list
             to test2.json (debug output, overwritten each time)
           - appends per-tweet score dicts to self.scores, which is
             cleared after each word is processed
           - fills self.sortedTweets[word] with up to 100 tweets, sorted
             by score and then re-sorted by geo coordinate [1]
         '''
         #score = 0
         # Debug dump of the raw tweet map for offline inspection.
         with open('test.json','w') as f1:
                 json.dump(self.tweets,f1)
         for word in self.tweets:
                 #print word
                 sortedList =[]
                 count = 0
                 totalscore = 0
                 s = self.tweets[word]
                 # Debug dump of the current word's tweets (overwritten on
                 # every loop iteration -- only the last word survives).
                 with open('test2.json','w') as f1:
                         json.dump(s,f1)
                 for t in self.tweets[word]:
                         # adds up to 100%
                         #print t
                         #print stuff
                         #for t in stuff:    
                         #print "t is:"
                         #print t['favorite_count']
                         print count
                         #t = s[count]
                         # Only score tweets whose text actually contains
                         # the tracked word.
                         if word in t['text']:
                                 #print word
                                 #print unicode(t['text'])
                                 score = 0
                                 # log2(x+1) dampens large counts; the +1
                                 # guards against log(0).
                                 score += math.log(t['favorite_count']+1,2) * 0.25
                                 score += math.log(t['retweet_count']+1,2) * 0.025
                                 score += math.log(t['user']['followers_count']+1,2) * 0.05
                                 #score += math.log(t['user']['statuses_count']+1,2) * 0.05
                                 totalscore += score
                                 #stemming the texts
                                 tokens = TextProcess.tokenize(t['text'])
                                 #list_of_stem_words = TextProcess.stemming(tokens)
                                 text = ' '.join(tokens).strip()
                                 # NOTE(review): assumes t['geo'] is a dict
                                 # with 'coordinates' -- raises if geo is
                                 # None (common in Twitter data); confirm.
                                 self.scores.append({ 'id': t['id'], 'text':unicode(text,errors='ignore'), 'score' : score, 'geo':t['geo']['coordinates']})
                                 #print self.scores
                         count+=1
                 # totalscore is a sum of non-negative terms, so this guard
                 # is always true and the top-100 selection always runs.
                 if (totalscore >=0):
                         sortedList = sorted(self.scores, key = lambda k: k['score'], reverse=True)[0:100]
                         # Secondary sort by geo[1] -- presumably longitude
                         # or latitude; confirm which against the producer.
                         sortedList2 = sorted(sortedList, key = lambda k: k['geo'][1], reverse=True)
                         if word not in self.sortedTweets:
                                 self.sortedTweets[word] = sortedList2
                         else:
                                 self.sortedTweets[word] += sortedList2
                         self.scores = []
Пример #6
0
    def __init__(self, a_document_name):
        """Build per-review TF-IDF vectors from a file of JSON review lines.

        a_document_name: path to a file with one JSON-encoded review per
        line (parsed by TextProcess.read_line).

        Populates:
          self.term_counts -- document frequency per stemmed term
          self.reviews     -- review_id -> raw review text (utf-8 bytes)
          self.idf         -- term -> inverse document frequency
          self.tfidf_dict  -- review_id -> tf-idf vector, ordered by
                              self.term_counts iteration order
        """
        # term counts: number of documents that contain a given term (key)
        self.term_counts = dict()
        self.reviews = dict()

        # Parse each line exactly once (the original called read_line
        # three times per line) and close the file deterministically.
        with open(a_document_name) as review_file:
            for review in review_file:
                parsed = TextProcess.read_line(review)
                review_id = parsed['review_id'].encode('utf8')
                self.reviews[review_id] = parsed['text'].encode('utf8')
                tokens = TextProcess.stemming(
                    TextProcess.stopword(TextProcess.tokenize(parsed)))
                # Count each term at most once per document.
                for token in set(tokens):
                    if token in self.term_counts:
                        self.term_counts[token] += 1
                    else:
                        self.term_counts[token] = 1

        self.idf = dict()
        num_reviews = len(self.reviews)
        for term, term_count in self.term_counts.iteritems():
            # float() avoids Python 2 integer (floor) division, which
            # silently truncated the N/df ratio in the original.
            self.idf[term] = log(num_reviews / float(term_count))

        self.tfidf_dict = dict()
        for review_id, review in self.reviews.iteritems():
            tokens = TextProcess.stemming(
                TextProcess.stopword(re.findall(r'\w+', review.lower())))
            tf = collections.Counter(tokens)
            review_tfidfs = list()
            # Iterate keys only -- the document-frequency value is unused.
            for term in self.term_counts:
                if term in tf:
                    # Sublinear tf scaling: 1 + log(tf).
                    review_tfidfs.append((1 + log(tf[term])) * self.idf[term])
                else:
                    review_tfidfs.append(0)
            self.tfidf_dict[review_id] = review_tfidfs
Пример #7
0
print "start training"
classifier = NaiveBayesClassifier.train(train_features)

reviewList = []
confList = []
reviewContent = {}
print "starting sentiment"

cutoff = 0
for line in open('../../data/yelp_academic_dataset_review.json', 'r'):

    #cutoff+=1
    #if cutoff > 10:
    #    break

    review_json = TextProcess.read_line(line)
    review_id = review_json['review_id']
    chopPoint = review_json['text'][::-1].find(' ', 400)
    if chopPoint > 400:
        reviewContent[review_id] = review_json['text'][::-1][:chopPoint][::-1]
        posScore = classifier.posScore(word_features(TextProcess.tokenize(review_json, chopPoint)))
        negScore = classifier.negScore(word_features(TextProcess.tokenize(review_json, chopPoint)))
    else:
        reviewContent[review_id] = review_json['text']
        posScore = classifier.posScore(word_features(TextProcess.tokenize(review_json, len(review_json['text']))))
        negScore = classifier.negScore(word_features(TextProcess.tokenize(review_json, len(review_json['text']))))

    reviewList.append((review_id,posScore))
    confList.append((review_id, abs(posScore-negScore)))

print "done with sentiment"
Пример #8
0
from __future__ import division

__author__ = "jtgoen"

import json
from utility import TextProcess


# Lookup tables built from the Yelp academic dataset dumps.
business_dict = dict()
review_dict = dict()
# NOTE(review): dict(dict()) is just an empty dict -- the inner dict() is
# evaluated and discarded; per-user dicts are assigned individually below.
user_dict = dict(dict())

# Index every business record by business_id (one JSON object per line).
for line in open("yelp_academic_dataset_business.json", "r"):
    business_json = TextProcess.read_line(line)
    business_dict[business_json["business_id"]] = business_json

# Index every review record by review_id.
for line in open("yelp_academic_dataset_review.json", "r"):
    review_json = TextProcess.read_line(line)
    review_dict[review_json["review_id"]] = review_json

# Precomputed per-review sentiment stars and confidence scores
# (produced elsewhere -- presumably by the sentiment script; confirm).
sentiment_dict = json.loads(open("reviewSentimentStars.json").read())
confidence_dict = json.loads(open("reviewConfidence.json").read())

for review in review_dict:
    if review_dict[review]["user_id"] not in user_dict:
        user_id = review_dict[review]["user_id"]
        user_dict[user_id] = dict(
            value=abs(review_dict[review]["stars"] - sentiment_dict[review_dict[review]["review_id"]]), reviews=1.0
        )
    else:
        user_dict[review_dict[review]["user_id"]]["value"] += abs(