Пример #1
0
    def problem1(self):
        # Collects one data object per (paragraph, rating) pair of every
        # document in train_docs, splits the collected objects into random
        # folds, and prepares (bag of words, rating) pairs for training and
        # testing. The feature-building, training and RMSE steps at the end
        # are placeholders only.
        #
        # NOTE(review): train_docs and data_list are never assigned inside
        # this method, so they must already exist in an enclosing scope;
        # otherwise the first use raises NameError - confirm where they live.

        # Build one data object per paragraph of every document.
        for doc in train_docs:
            for (par, rating) in doc.get_par_rating_tuples():
            #Protecting against the chance of a failed parse
                if (par is not None and rating is not None):
                    data_list.append(data(par.lower(), rating, doc.filename, doc.author))
                else:
                    # Also reached when only rating is None, although the
                    # message below mentions par alone.
                    print "Found bad review by -> " + doc.author + " (this comes from: par = None)"

        # Cross validation setup: nominal number of items per fold.
        fold_size = math.floor(len(data_list) / DEFAULT_NUM_FOLDS)

        # NOTE(review): the literal holds exactly four lists, while the fold
        # index below is drawn from range(DEFAULT_NUM_FOLDS); the two only
        # line up when that constant is 4 - confirm.
        folds = [[], [], [], []]
        temp_data_list = copy.deepcopy(data_list)
    
        # Move randomly chosen items into randomly chosen folds until the
        # copy is empty. A draw that picks a fold already holding more than
        # fold_size + 1 items changes nothing, and the loop simply draws again.
        while len(temp_data_list) != 0:
            ndx = random.randrange(0, len(temp_data_list))
            fold = random.randrange(0, DEFAULT_NUM_FOLDS)
            
            if len(folds[fold]) <= fold_size + 1:
                folds[fold].append(temp_data_list[ndx])
                del(temp_data_list[ndx])

        # Not used anywhere in the visible part of this method.
        rmses = []
        # For each fold: that fold is the test set, all other folds together
        # form the training set.
        for fold_num in range(DEFAULT_NUM_FOLDS):
            test_data = folds[fold_num]
            train_data = []
            for ndx in range(len(folds)): 
                if ndx != fold_num:
                    train_data.extend(folds[ndx])

        # NOTE(review): everything from here on sits at function level, so it
        # runs once after the loop above has finished and only sees the
        # train_data / test_data of the final iteration. If per-fold
        # processing was intended, this code belongs inside the loop - confirm.
        # Begin Filtering
        bag_words_train = []
        
        # extend() with a one-element list: adds a single
        # (bag of words, rating) tuple per training datum.
        for train_datum in train_data:
            bag_words_train.extend([(train_datum.get_bag_of_words(), train_datum.rating)])
            
        bag_words_test = []

        # Same pairing for the test data.
        for test_datum in test_data:
            bag_words_test.extend([(test_datum.get_bag_of_words(), test_datum.rating)])
            
        # Result of buildSenti(); not consumed in the visible part of this
        # method.
        sent = buildSenti()

        

        # Placeholder: build feature set (not implemented here)

        # Placeholder: train (not implemented here)


        # Placeholder: RMSE (not implemented here)
        """
Пример #2
0
   def __init__(self, corpus):
      """Create the per-instance state and build the classifier.

      corpus -- object providing buildSentClassifier(); it receives this
                instance's langFeatures and isValid callables together with
                the literal 1000.

      As a side effect, show_most_informative_features(5) is called on the
      newly built classifier.
      """
      # Three independent, initially empty FreqDist instances.
      self.tots, self.pos, self.negs = FreqDist(), FreqDist(), FreqDist()

      # Assigned before the classifier is built; langFeatures / isValid
      # presumably read these (their bodies are not visible here).
      self.senti, self.ml = buildSenti(), 2

      classifier = corpus.buildSentClassifier(self.langFeatures, 1000, self.isValid)
      self.classifier = classifier

      # Debugging aid: calling .tabulate() on self.tots, self.pos and
      # self.negs at this point prints the three distributions.
      classifier.show_most_informative_features(5)
Пример #3
0
 def __init__(self, corpus):
    """Store the lexicon and length threshold, then build the classifier.

    corpus -- object providing buildParaClassifier(); it receives this
              instance's langFeatures and isValid callables together with
              the literal 100.
    """
    # Assigned before the classifier is built; langFeatures / isValid
    # presumably read these (their bodies are not visible here).
    self.senti, self.ml = buildSenti(), 2

    build = corpus.buildParaClassifier
    self.classifier = build(self.langFeatures, 100, self.isValid)
Пример #4
0
 def __init__(self, corpus):
    """Store the length threshold and lexicon, then build the classifier.

    corpus -- object providing buildRevClassifier(); it receives this
              instance's langFeatures and isValid callables together with
              the literal 100000.

    As a side effect, show_most_informative_features(5) is called on the
    newly built classifier.
    """
    # Assigned before the classifier is built; langFeatures / isValid
    # presumably read these (their bodies are not visible here).
    self.ml, self.senti = 2, buildSenti()

    classifier = corpus.buildRevClassifier(self.langFeatures, 100000, self.isValid)
    self.classifier = classifier
    classifier.show_most_informative_features(5)
Пример #5
0
    def complete(self, data_list):
        """Evaluate the seed-word / lexicon rating heuristic on random folds.

        A deep copy of data_list is shuffled and dealt into DEFAULT_NUM_FOLDS
        folds whose sizes differ by at most one. Nothing is trained: for each
        datum, every word from get_bag_of_words() that is a seed word or is
        present in the lexicon returned by buildSenti() contributes one
        score; the scores are averaged, rounded to one decimal and compared
        with datum.rating. The RMSE of every fold and the average over all
        scored folds are printed.

        data_list -- list of objects exposing get_bag_of_words() and a
                     rating attribute. The caller's list and its items are
                     not modified here, because only the deep copy is
                     shuffled and scored.

        Returns None; all results go to standard output.
        """
        # Built once instead of once per datum; sets make each membership
        # test O(1) instead of a linear scan.
        good_seed = frozenset([
            'excellent', 'amazing', 'best', 'delicious', 'tradition',
            'fastest', 'clean', 'favorite', 'taste', 'worth', 'nice',
            'friendly', 'positive', 'quality', 'great', 'prompt',
        ])
        bad_seed = frozenset([
            'horrible', 'terrible', 'metro', 'alright', 'cannot', 'mediocre',
            'bad', 'wrong', 'messing', 'long', 'took', 'unfortunately',
            'obvious', 'drops', 'incorrect',
        ])

        # Shuffle-and-deal partition. The number of folds comes from
        # DEFAULT_NUM_FOLDS rather than a literal four-element list, so the
        # two can no longer disagree, and no random draw is ever wasted on a
        # fold that is already full.
        temp_data_list = list(copy.deepcopy(data_list))
        random.shuffle(temp_data_list)
        folds = [temp_data_list[start::DEFAULT_NUM_FOLDS]
                 for start in range(DEFAULT_NUM_FOLDS)]

        # Lexicon: word -> indexable entry; only entry[0] and entry[1] are
        # read below.
        sent = buildSenti()

        rmses = []
        for fold in folds:
            guesses = []
            actuals = []
            for datum in fold:
                bag_of_words = datum.get_bag_of_words()
                actual_rating = datum.rating

                senti_word = []
                for word in bag_of_words:
                    # Seed words take precedence over the lexicon.
                    if word in good_seed:
                        senti_word.append(6)
                    elif word in bad_seed:
                        senti_word.append(-1)
                    elif word in sent:
                        sentiment = sent[word]
                        # Rescale whichever of the two components dominates.
                        if sentiment[1] > sentiment[0]:
                            senti_word.append(round((sentiment[1] * 0.9) + 4.6))
                        else:
                            senti_word.append(round((sentiment[0] * 5) + 1.5))

                # A datum without any scored word yields no prediction.
                if senti_word:
                    # float() forces true division: under Python 2 an
                    # all-integer score list (seed words only) would be
                    # floor-divided, e.g. [6, -1] -> 2 instead of 2.5.
                    mean_score = float(sum(senti_word)) / len(senti_word)
                    guesses.append(round(mean_score, 1))
                    actuals.append(actual_rating)

            if not guesses:
                # Nothing to score in this fold; do not hand rmse() two
                # empty lists.
                print("For this fold: no predictions, skipped")
                continue

            temp_rm = rmse(guesses, actuals)
            print("For this fold: %f" % (temp_rm))
            rmses.append(temp_rm)

        if rmses:
            print("Average RMSE: %f" % (float(sum(rmses)) / len(rmses)))
        else:
            print("Average RMSE: not available (no fold produced predictions)")
Пример #6
0
from buildSenti import buildSenti

# Lexicon consulted by langFeatures(); built once, when this module is imported.
senti = buildSenti()
# isValid() only accepts tokens strictly longer than this many characters.
ml = 2

def isValid(w):
   """Tell whether the string w qualifies as a token.

   True when w is longer than the module-level ml and w.isalpha() holds,
   False otherwise.
   """
   return len(w) > ml and w.isalpha()

def langFeatures(word):
   """Build the feature dict for a single word.

   The word is looked up in the module-level senti lexicon, falling back to
   (0, 0) when it is absent. The result is {'sent': value}, where value is
   the entry's first component minus its second.
   """
   scores = senti.get(word, (0, 0))
   return {'sent': scores[0] - scores[1]}

def getClassifier(corpus):
   """Return whatever corpus.buildWordClassifier() produces.

   The corpus is handed this module's langFeatures and isValid functions
   together with the literal 100; what that number controls is decided by
   the corpus implementation, which is not part of this module.
   """
   build = corpus.buildWordClassifier
   return build(langFeatures, 100, isValid)

def test(rev, classifier):
   predictions = []
   for section in rev:
      predictions.append(4)

   return predictions