# NOTE(review): this excerpt had lost its line breaks, so everything after
# the first "#" was being parsed as a comment. The statements are restored
# one per line below. The first two statements are the tail of a method whose
# header is outside this excerpt, and `classify` is presumably a method of the
# same class -- re-indent them under that class when merging.
classifier_file.close()

# setup word stemmer
self.stemmer = SnowballStemmer("english")


def classify(self, tree):
    """Score a parsed sentence against every known category.

    The leaves of ``tree`` are stemmed and lower-cased. For each
    ``(category, classifier)`` pair in ``self.classifiers`` a boolean
    feature dict is built (one entry per word in
    ``self.word_features[category]``, true when that word is among the
    stemmed leaves) and passed to ``classifier.prob_classify``.

    Args:
        tree: object exposing ``leaves()`` that yields the sentence's words.

    Returns:
        dict mapping each category to the probability its classifier
        assigns to that same category.
    """
    # A set gives O(1) membership tests; the feature values are identical to
    # what a list lookup would produce.
    words = {self.stemmer.stem(word).lower() for word in tree.leaves()}

    categories = {}
    for category, classifier in self.classifiers.items():
        features = {
            feature: (feature in words)
            for feature in self.word_features[category]
        }
        categories[category] = classifier.prob_classify(features).prob(category)
    return categories


classifier = Classifier()
predictor = SentimentPredictor()

# Documents whose "analyzed" field differs from this value are (re)processed.
ANALYZER_VERSION = 3

# iterate over confessions and predict categories and sentiment
confessions = db.parses.find(
    {"analyzed": {"$ne": ANALYZER_VERSION}},
    limit=500,
)
# Minimum category probability for a category to be kept for a sentence.
threshold = 0.2
for confession in confessions:
    for tree_id, raw_tree in enumerate(confession["trees"]):
        # Sentences without a parse are stored as the literal string "None".
        if raw_tree == "None":
            continue

        # get sentence categories
        tree = Tree.fromstring(raw_tree)
        categories = [
            (category, prob)
            for (category, prob) in classifier.classify(tree).items()
            if prob > threshold
        ]
        # NOTE(review): the loop body appears to continue past this excerpt.
def score_accuracy(data):
    """Return the fraction of clearly-polar items whose predicted sign is right.

    Items whose ``gold_sentiment`` has absolute value <= 0.5 are skipped.
    For the rest, a prediction counts as accurate when ``sentiment`` is
    strictly negative for a negative gold value, or strictly positive for a
    positive gold value. A predicted sentiment of exactly 0 is always counted
    as inaccurate.

    Args:
        data: iterable of dicts with ``"gold_sentiment"`` and ``"sentiment"``
            numeric entries.

    Returns:
        float in [0, 1]; 0.0 when no item was scored (empty input or every
        gold value within the skipped band), instead of raising
        ZeroDivisionError.
    """
    accurate = 0
    inaccurate = 0
    for datum in data:
        gold = datum["gold_sentiment"]
        if abs(gold) <= 0.5:
            continue

        s = datum["sentiment"]
        correct = (s < 0) if gold < 0 else (s > 0)
        if correct:
            accurate += 1
        else:
            inaccurate += 1

    total = accurate + inaccurate
    if total == 0:
        # Nothing was scored; avoid dividing by zero.
        return 0.0
    return accurate * 1.0 / total


# predict sentence sentiments
predictor = SentimentPredictor()
for (parse_id, tree_id), sentiments in sentences.items():
    confession = db.parses.find_one({"_id": ObjectId(parse_id)})
    tree = confession["trees"][tree_id]
    # Gold label is the mean of the collected sentiment values.
    sentiment = sum(sentiments) * 1.0 / len(sentiments)
    predictor.add_tree({"raw_tree": tree, "gold_sentiment": sentiment})
predictor.run()
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(score_accuracy(predictor.trees))

# predict confession sentiments
predictor = SentimentPredictor()
for parse_id, sentiments in confessions.items():
    confession = db.parses.find_one({"_id": ObjectId(parse_id)})
    # NOTE(review): the loop body appears to continue past this excerpt.