def problem1(self):
    """Build paragraph-level data objects from the training documents and
    set up 4-fold cross validation over them.

    NOTE(review): ``train_docs`` and ``data_list`` are not defined locally --
    presumably module-level globals; verify against the rest of the file.
    The trailing triple-quote opens a commented-out region: the feature
    building / training / RMSE steps were left unimplemented here.
    """
    # Create a data object for each (paragraph, rating) pair in every document.
    for doc in train_docs:
        for (par, rating) in doc.get_par_rating_tuples():
            # Protect against a failed parse: skip pairs with a missing half.
            if (par is not None and rating is not None):
                data_list.append(data(par.lower(), rating, doc.filename, doc.author))
            else:
                print "Found bad review by -> " + doc.author + " (this comes from: par = None)"
    # 4-fold cross validation: target number of data per fold.
    fold_size = math.floor(len(data_list) / DEFAULT_NUM_FOLDS)
    folds = [[], [], [], []]
    # Deep copy so the original data_list survives the destructive split below.
    temp_data_list = copy.deepcopy(data_list)
    # Randomly divide into 4 roughly equal folds: a datum is consumed only
    # when the randomly chosen fold still has room (keeps folds balanced).
    while len(temp_data_list) != 0:
        ndx = random.randrange(0, len(temp_data_list))
        fold = random.randrange(0, DEFAULT_NUM_FOLDS)
        if len(folds[fold]) <= fold_size + 1:
            folds[fold].append(temp_data_list[ndx])
            del(temp_data_list[ndx])
    rmses = []
    # Big loop: each fold in turn is the test set; the other folds train.
    for fold_num in range(DEFAULT_NUM_FOLDS):
        test_data = folds[fold_num]
        train_data = []
        for ndx in range(len(folds)):
            if ndx != fold_num:
                train_data.extend(folds[ndx])
        # Begin filtering: pair each datum's bag of words with its rating.
        bag_words_train = []
        for train_datum in train_data:
            bag_words_train.extend([(train_datum.get_bag_of_words(), train_datum.rating)])
        bag_words_test = []
        for test_datum in test_data:
            bag_words_test.extend([(test_datum.get_bag_of_words(), test_datum.rating)])
        # Get sentiment scores for words.
        sent = buildSenti()
        # build feature set
        # train
        # RMSE
        """
def __init__(self, corpus):
    """Train a sentence-level classifier over *corpus* and print its
    five most informative features."""
    # Frequency tallies kept as instance state (available for debugging).
    self.pos = FreqDist()
    self.negs = FreqDist()
    self.tots = FreqDist()
    # Sentiment lexicon plus the minimum word length used by isValid;
    # both must exist before the classifier is built, since building it
    # invokes self.langFeatures / self.isValid.
    self.senti = buildSenti()
    self.ml = 2
    self.classifier = corpus.buildSentClassifier(self.langFeatures, 1000, self.isValid)
    self.classifier.show_most_informative_features(5)
def __init__(self, corpus):
    """Train a paragraph-level classifier over *corpus*."""
    self.ml = 2                # minimum word length accepted by isValid
    self.senti = buildSenti()  # sentiment lexicon consulted by langFeatures
    # Both attributes above must be set first: building the classifier
    # calls back into self.langFeatures / self.isValid.
    self.classifier = corpus.buildParaClassifier(self.langFeatures, 100, self.isValid)
def __init__(self, corpus):
    """Train a review-level classifier over *corpus* and print its
    five most informative features."""
    self.senti = buildSenti()  # sentiment lexicon consulted by langFeatures
    self.ml = 2                # minimum word length accepted by isValid
    # Attributes above must exist before building: the classifier build
    # invokes self.langFeatures / self.isValid.
    self.classifier = corpus.buildRevClassifier(self.langFeatures, 100000, self.isValid)
    self.classifier.show_most_informative_features(5)
def complete(self, data_list): #4 fold cross validation fold_size = math.floor(len(data_list) / DEFAULT_NUM_FOLDS) folds = [[], [], [], []] temp_data_list = copy.deepcopy(data_list) #divide into 4 folds while len(temp_data_list) != 0: ndx = random.randrange(0, len(temp_data_list)) fold = random.randrange(0, DEFAULT_NUM_FOLDS) if len(folds[fold]) <= fold_size + 1: folds[fold].append(temp_data_list[ndx]) del(temp_data_list[ndx]) #get sentiment for words sent = buildSenti() rmses = [] #for each fold, get bag of words for fold in folds: guesses = [] actuals = [] for datum in fold: #get words bag_of_words = datum.get_bag_of_words() #get actual rating actual_rating = datum.rating good_seed = ['excellent', 'amazing', 'best', 'delicious', 'tradition', 'fastest', 'clean', 'favorite', 'taste', 'worth', 'nice', 'friendly', 'positive', 'quality', 'great', 'prompt', 'amazing'] bad_seed = ['horrible', 'terrible', 'metro', 'alright', 'cannot', 'mediocre', 'bad', 'wrong', 'messing', 'long', 'took', 'unfortunately', 'obvious', 'drops', 'incorrect'] senti_word = [] for word in bag_of_words: if word in good_seed: senti_word.append(6) elif word in bad_seed: senti_word.append(-1) elif word in sent: sentiment = sent[word] #augment sent value if sentiment[1] > sentiment[0]: actual_sent = round((sentiment[1] * 0.9) + 4.6) senti_word.append(actual_sent) else: actual_sent = round((sentiment[0] * 5 ) + 1.5) senti_word.append(actual_sent) if (len(senti_word) > 0): prediction = round(sum(senti_word) / len(senti_word), 1) # print "\nHERE: " # print "Prediction: " # print prediction # print "Actual: " # print actual_rating guesses.append(prediction) actuals.append(actual_rating) temp_rm = rmse(guesses, actuals) print "For this fold: %f" % (temp_rm) rmses.append(temp_rm) print "Average RMSE: %f" % (sum(rmses) /len(rmses))
from buildSenti import buildSenti

# Minimum length a word must exceed to count as a feature word.
ml = 2
# Sentiment lexicon: word -> 2-tuple of scores (built once at import time).
senti = buildSenti()


def isValid(w):
    """Return True for purely alphabetic words longer than ``ml``."""
    return len(w) > ml and w.isalpha()


def langFeatures(word):
    """Single-entry feature dict: difference of the word's two lexicon
    scores (words absent from the lexicon score 0)."""
    scores = senti.get(word, (0, 0))
    return {'sent': scores[0] - scores[1]}


def getClassifier(corpus):
    """Build a word-level classifier over *corpus* using the feature
    extractor and word filter defined above."""
    return corpus.buildWordClassifier(langFeatures, 100, isValid)


def test(rev, classifier):
    """Baseline predictor: guess a rating of 4 for every section of the
    review (*classifier* is accepted but unused)."""
    return [4 for _ in rev]