Exemplo n.º 1
0
 def processReviewXls(self, sheet, row):
     """Build a Review object from a single row of an xls sheet.

     Column 0 is stored as ``reviewId`` and column 1 as ``review``, both
     exactly as returned by ``sheet.cell_value``. Columns 2 through 8 are
     each passed through ``self.XlsCheckValue`` and stored as the rating
     attributes Food, Drinks, Ambiance, Service, Location, Deals and Price,
     in that order. No other column of the row is read.

     NOTE(review): XlsCheckValue is defined outside this block; presumably
     it normalises a raw cell value -- confirm against its definition.

     :param sheet: object exposing ``cell_value(row, col)``.
     :param row: index of the row to read.
     :return: the populated Review instance.
     """
     parsed = Review()

     # The first two columns are copied verbatim.
     parsed.reviewId = sheet.cell_value(row, 0)
     parsed.review = sheet.cell_value(row, 1)

     # The rating columns start at column 2, one attribute per column.
     rating_fields = ("Food", "Drinks", "Ambiance", "Service",
                      "Location", "Deals", "Price")
     for col_idx, field in enumerate(rating_fields, start=2):
         setattr(parsed, field,
                 self.XlsCheckValue(sheet.cell_value(row, col_idx)))

     return parsed
Exemplo n.º 2
0
    def stemmingStopWRemoval(self, review, vocab):
        ''' Pre-process one raw review and record it on this object.

        Steps:
        1. Copy review["Ratings"] and review["ReviewID"] into a new Review.
        2. Split review["Content"] into sentences, then each sentence into
           word tokens.
        3. Skip tokens made up only of digits and/or punctuation, and tokens
           whose lowercase form is in self.stopWords.
        4. Stem every remaining token; append the stem to vocab and to the
           word list of the current sentence.
        5. Wrap each non-empty word list in a Sentence and attach it to the
           Review. The Review is appended to self.allReviews only when it
           ends up with at least one sentence.

        Nothing is returned; the results are the appends to vocab and
        self.allReviews.
        '''
        parsed = Review()
        # Carry the ratings over one entry at a time.
        for kind, score in review["Ratings"].items():
            parsed.ratings[kind] = score
        parsed.reviewId = review["ReviewID"]

        stemmer = PorterStemmer()
        #TODO: Append title too!
        sentences = nltk.sent_tokenize(review["Content"])
        punctuation = set(string.punctuation)

        for sentence in sentences:
            kept = []
            for token in nltk.word_tokenize(sentence):
                # A token is worth keeping only if at least one character is
                # neither a digit nor a punctuation mark.
                informative = any(
                    not ch.isdigit() and ch not in punctuation
                    for ch in token)
                if not informative:
                    continue
                lowered = token.lower()
                if lowered in self.stopWords:
                    continue
                stem = stemmer.stem(lowered)
                vocab.append(stem)
                kept.append(stem)
            if kept:
                parsed.sentences.append(Sentence(kept))

        if parsed.sentences:
            self.allReviews.append(parsed)
Exemplo n.º 3
0
 def processReviewXls(self, sheet, row):
     """Read one xls row into a new Review and return it.

     Only columns 0 through 8 of the row are read:
       0 -> reviewId (raw cell value)
       1 -> review   (raw cell value)
       2..8 -> Food, Drinks, Ambiance, Service, Location, Deals, Price,
               each passed through ``self.XlsCheckValue`` first.

     NOTE(review): what XlsCheckValue does to a cell is not visible in this
     block -- verify against its definition.

     :param sheet: object exposing ``cell_value(row, col)``.
     :param row: index of the row to read.
     :return: the populated Review instance.
     """

     def checked(col):
         # Fetch the cell in this row and run it through the value check.
         return self.XlsCheckValue(sheet.cell_value(row, col))

     record = Review()
     record.reviewId = sheet.cell_value(row, 0)
     record.review = sheet.cell_value(row, 1)
     record.Food = checked(2)
     record.Drinks = checked(3)
     record.Ambiance = checked(4)
     record.Service = checked(5)
     record.Location = checked(6)
     record.Deals = checked(7)
     record.Price = checked(8)
     return record