def ImpactTraining(docPath, lexPath, lexiconID): """ Final score of the review is calculated as follows: (Score1*Multiplier1 + Score2*Multiplier2 ... ScoreN*MultiplierN) * BaseMultiplier = ReviewScore Ignoring BaseMultiplier for this training, assuming it has minimal impact (TODO: test this impact) ScoreK*MultiplierK/ReviewScore * 100 = PercentImpact (impact %age of word K on the final score) TotalAdjustment = Expected Score - FinalScore AdjustmentK = PercentImpact of TotalAdjustment (total adjustment needed for word K) Adjustment on word K for this review = AdjustmentK/MultiplierK Take the mean of all adjustments, and applying to the final lexicon, to get the new lexicon Repeat process until there is performance improvement. """ oldAccuracy = 0 oldAngel = None se = PerformanceTest(lexPath, docPath) while True: adjustments = defaultdict(list) newAngel = Angel(se.lexicon, smallReviews=True) expectedSentiment, predictedOverall = [], [] se.ResetIterator() while True: try: sentences, expectedLabel, notCount, docId = se.NextElement() expectedSentiment.append(expectedLabel) predictedScore = newAngel.PredictReviewScore(sentences, expectedLabel) predictedLabel = Sentiment.GetSentimentClass(predictedScore) predictedOverall.append(predictedLabel) if oldAngel is not None: oldPredictedLabel = Sentiment.GetSentimentClass(oldAngel.PredictReviewScore(sentences, expectedLabel)) if oldPredictedLabel != predictedLabel: oldAngel.DumpDetails(sentences, expectedLabel) newAngel.DumpDetails(sentences, expectedLabel) totalImpact, impactTable = newAngel.GetImpact(sentences) if totalImpact == 0: continue totalAdjustment = expectedLabel*10 - predictedScore for word, (wordScore, multiplier) in impactTable.iteritems(): if multiplier == 0: continue wordAdjustment = ((wordScore/totalImpact) * totalAdjustment) / multiplier if wordAdjustment != 0: adjustments[word].append(wordAdjustment) except StopIteration: break newAccuracy = util.accuracy(predictedOverall, expectedSentiment) print "Accuracy:", 
oldAccuracy, "--->", newAccuracy if newAccuracy <= oldAccuracy: break for word in adjustments: se.lexicon[word] = str(float(se.lexicon[word]) + numpy.mean(adjustments[word])) oldAngel = newAngel oldAccuracy = newAccuracy filename = "../files/lexicons/" + lexiconID + ".csv" handle = open(filename, 'wb') wr = csv.writer(handle) for key, value in sorted(oldAngel.lexicon.items()): row = [key, value] wr.writerow(row) handle.close()
def __init__(self, lexPath, docPath):
    """
    Load the lexicon from CSV and prime the document iterator.

    The document type is inferred from the path's extension: paths
    ending in "xml" are XML, everything else is treated as JSON.
    """
    self.lexicon = util.LoadLexiconFromCSV(lexPath)
    self.docPath = docPath
    self.docType = "xml" if docPath.endswith("xml") else "json"
    self.iterator = self.GetIter()
def predict(self, filePath):  # PREDICT
    """
    Classify the pre-parsed Yelp reviews stored alongside *filePath*.

    Reads YelpParsedReviews.json from filePath's directory, predicts a
    sentiment label for every entry that has sentences, stores it under
    the entry's "Label" key, and returns the whole data structure.
    """
    lexicon = util.LoadLexiconFromCSV(
        "../files/lexicons/SentiWordNet_Lexicon_concise.csv")
    angel = Angel(lexicon, True)
    parsedReviewsPath = os.path.join(os.path.dirname(filePath),
                                     "YelpParsedReviews.json")
    # 'fh' rather than 'file' so the builtin is not shadowed.
    with open(parsedReviewsPath, 'r') as fh:
        classificationData = json.loads(fh.read())
    model = classificationData["ClassificationModel"]
    # Entries are keyed by the 1-based index as a string: "1".."N".
    for k in range(len(model)):
        current = model[str(k + 1)]
        notCount = current["NotCount"]
        if "Sentences" not in current:
            continue
        # Normalize a lone sentence into a one-element list.
        if not isinstance(current["Sentences"], list):
            current["Sentences"] = [current["Sentences"]]
        sentences = current["Sentences"]
        # NOTE(review): notCount is passed where other call sites pass the
        # expected label -- confirm PredictReviewScore's second argument.
        current["Label"] = Sentiment.GetSentimentClass(
            angel.PredictReviewScore(sentences, notCount), 1)
        angel.DumpDetails(sentences, current["Label"])
    return classificationData
def PerformTest(self):
    """ This method loads the test data file, and tests how good the prediction is. It also prints the precision, recall and F1 scores. """
    angel = Angel(self.lexicon, True)
    # Reviews scoring above 7 or below -7 get their details dumped.
    angel.SetDumpParameters(7, -7)
    # Thresholds at which each best metric was observed.
    posx, negx, neutx, accx, = 0, 0, 0, 0
    maxnegf1 = maxneutf1 = maxposf1 = maxacc = 0
    # NOTE(review): range(1, 0, -1) tries only threshold == 1 -- confirm
    # whether a wider threshold sweep was intended here.
    for threshold in range(1, 0, -1):
        predictedOverall = []
        expectedSentiment = []
        demons = TotPos = TotNeg = TotNeut = 0
        while True:
            try:
                sentences, label, notCount, docId = self.NextElement()
                if not sentences:
                    continue
                if label == 'NULL':
                    break
                label = int(label)
                expectedSentiment.append(label)
                predicted = angel.PredictReviewScore(sentences, label)
                predictedOverall.append(Sentiment.GetSentimentClass(predicted, threshold))
                # Per-class totals for this pass.
                if label == Sentiment.POSITIVE:
                    TotPos += 1
                elif label == Sentiment.NEGATIVE:
                    TotNeg += 1
                else:
                    TotNeut += 1
                # "Demons": reviews whose extreme score triggered a dump.
                if angel.DumpRequested(predicted, label):
                    print "ID", docId, "\n"
                    demons += 1
            except StopIteration:
                break
        print "Demons:", demons
        pos_prec = util.precision_with_class(predictedOverall, expectedSentiment, 1)
        neg_prec = util.precision_with_class(predictedOverall, expectedSentiment, -1)
        neut_prec = util.precision_with_class(predictedOverall, expectedSentiment, 0)
        pos_rec = util.recall_with_class(predictedOverall, expectedSentiment, 1)
        neg_rec = util.recall_with_class(predictedOverall, expectedSentiment, -1)
        neut_rec = util.recall_with_class(predictedOverall, expectedSentiment, 0)
        pos_f1 = util.f1_with_class(predictedOverall, expectedSentiment, 1)
        neg_f1 = util.f1_with_class(predictedOverall, expectedSentiment, -1)
        neut_f1 = util.f1_with_class(predictedOverall, expectedSentiment, 0)
        accuracy = util.accuracy(predictedOverall, expectedSentiment)
        # Report precision / recall / F1 per class at this threshold.
        print "Current Positive stats (", threshold, "): ", "\t", '{:.2%}'.format(pos_prec), \
            "\t", '{:.2%}'.format(pos_rec), "\t", '{:.2%}'.format(pos_f1)
        print "Current Negative stats (", threshold, "): ", "\t", '{:.2%}'.format(neg_prec), "\t", \
            '{:.2%}'.format(neg_rec), "\t", '{:.2%}'.format(neg_f1)
        print "Current Neutral stats (", threshold, "): ", "\t", '{:.2%}'.format(neut_prec), "\t", \
            '{:.2%}'.format(neut_rec), "\t", '{:.2%}'.format(neut_f1)
        cprint("Current Accuracy ( " + str(threshold) + " ):\t\t\t" + '{:.2%}'.format(accuracy), 'red')
        # Track the best value of each metric and the threshold it occurred at.
        if pos_f1 > maxposf1:
            maxposf1 = pos_f1
            posx = threshold
        if neg_f1 > maxnegf1:
            maxnegf1 = neg_f1
            negx = threshold
        if neut_f1 > maxneutf1:
            maxneutf1 = neut_f1
            neutx = threshold
        if accuracy > maxacc:
            maxacc = accuracy
            accx = threshold
    print "Maximum Positive F1: ", '{:.2%}'.format(maxposf1), "at", posx
    print "Maximum Negative F1: ", '{:.2%}'.format(maxnegf1), "at", negx
    print "Maximum Neutral F1: ", '{:.2%}'.format(maxneutf1), "at", neutx
    cprint("Maximum Accuracy: " + '{:.2%}'.format(maxacc) + " at " + str(accx), 'red')
def ImpactTraining(docPath, lexPath, lexiconID): """ Final score of the review is calculated as follows: (Score1*Multiplier1 + Score2*Multiplier2 ... ScoreN*MultiplierN) * BaseMultiplier = ReviewScore Ignoring BaseMultiplier for this training, assuming it has minimal impact (TODO: test this impact) ScoreK*MultiplierK/ReviewScore * 100 = PercentImpact (impact %age of word K on the final score) TotalAdjustment = Expected Score - FinalScore AdjustmentK = PercentImpact of TotalAdjustment (total adjustment needed for word K) Adjustment on word K for this review = AdjustmentK/MultiplierK Take the mean of all adjustments, and applying to the final lexicon, to get the new lexicon Repeat process until there is performance improvement. """ oldAccuracy = 0 oldAngel = None se = PerformanceTest(lexPath, docPath) while True: adjustments = defaultdict(list) newAngel = Angel(se.lexicon, smallReviews=True) expectedSentiment, predictedOverall = [], [] se.ResetIterator() while True: try: sentences, expectedLabel, notCount, docId = se.NextElement() expectedSentiment.append(expectedLabel) predictedScore = newAngel.PredictReviewScore( sentences, expectedLabel) predictedLabel = Sentiment.GetSentimentClass(predictedScore) predictedOverall.append(predictedLabel) if oldAngel is not None: oldPredictedLabel = Sentiment.GetSentimentClass( oldAngel.PredictReviewScore(sentences, expectedLabel)) if oldPredictedLabel != predictedLabel: oldAngel.DumpDetails(sentences, expectedLabel) newAngel.DumpDetails(sentences, expectedLabel) totalImpact, impactTable = newAngel.GetImpact(sentences) if totalImpact == 0: continue totalAdjustment = expectedLabel * 10 - predictedScore for word, (wordScore, multiplier) in impactTable.iteritems(): if multiplier == 0: continue wordAdjustment = ((wordScore / totalImpact) * totalAdjustment) / multiplier if wordAdjustment != 0: adjustments[word].append(wordAdjustment) except StopIteration: break newAccuracy = util.accuracy(predictedOverall, expectedSentiment) print 
"Accuracy:", oldAccuracy, "--->", newAccuracy if newAccuracy <= oldAccuracy: break for word in adjustments: se.lexicon[word] = str( float(se.lexicon[word]) + numpy.mean(adjustments[word])) oldAngel = newAngel oldAccuracy = newAccuracy filename = "../files/lexicons/" + lexiconID + ".csv" handle = open(filename, 'wb') wr = csv.writer(handle) for key, value in sorted(oldAngel.lexicon.items()): row = [key, value] wr.writerow(row) handle.close()