def handleTree(self, tree1, tree2):
    """Walk a gold and a predicted tree in lockstep, recording
    production-level confusion statistics and, when enabled,
    per-rule coarse-polarity accuracy counts.

    @param tree1 {PTB tree} Gold parse tree with sentiment labels
    @param tree2 {PTB tree} Predicted parse tree with sentiment labels
    @raises ValueError if the two trees yield different node counts
    """
    # Collapse fine-grained sentiment labels to their coarse polarity.
    coarse_map = {ma_util.VERY_NEG: ma_util.NEG,
                  ma_util.SLIGHTLY_NEG: ma_util.NEG,
                  ma_util.VERY_POS: ma_util.POS,
                  ma_util.SLIGHTLY_POS: ma_util.POS}
    paired = itertools.izip_longest(ma_util.walkTree(tree1),
                                    ma_util.walkTree(tree2),
                                    fillvalue=F)
    for (goldNode, predNode) in paired:
        # The fill value F shows up only when one tree ran out early.
        if goldNode == F or predNode == F:
            raise ValueError('Tree length not equal or other breakage')
        goldProd = self.getProduction(goldNode)
        predProd = self.getProduction(predNode)
        goldLabel = int(goldNode.node)
        predLabel = int(predNode.node)
        if goldProd:
            self.count(goldProd, predProd)
            self.gold.append(goldProd)
            self.predicted.append(predProd)
        # Without the "interesting" bookkeeping there is nothing more
        # to do for any node, so bail out entirely.
        if not self.interesting:
            return
        ruleLabel = self.getInterestingLabel(goldNode)
        if ruleLabel and goldLabel in coarse_map:
            if ruleLabel not in self.totalI:
                self.totalI[ruleLabel] = 0
            self.totalI[ruleLabel] += 1
            # Correct iff both labels map to the same coarse polarity.
            if (predLabel in coarse_map
                    and coarse_map[goldLabel] == coarse_map[predLabel]):
                if ruleLabel not in self.correctI:
                    self.correctI[ruleLabel] = 0
                self.correctI[ruleLabel] += 1
def extract_POS(self, goldSentence, tagged=None):
    """Build a POS-tag histogram for every span of a sentence tree.

    @param goldSentence {PTB tree} Parse tree with sentiment annotation
    @param tagged {list|None} (token, POS) pairs; when omitted they are
        looked up in the POS cache keyed by the pretty-printed sentence
    @returns {list} One {POS tag: count} dict per tree node
    @raises ValueError if the sentence is absent from the cache
    """
    if tagged is None:
        tagged = self.posCache[goldSentence.pprint().encode('utf-8')]
    if tagged is None:
        #tagged = self.get_pos_tags_for_sentences([goldSentence])[0]
        raise ValueError("Should have seen sentence in cache: %s" % goldSentence)
    leaves = goldSentence.leaves()
    # Best-effort: a mismatch is logged but processing continues with
    # the pairs that do line up (izip truncates to the shorter input).
    if len(leaves) != len(tagged):
        logger.error("leaves do not correspond to tagged!")
        logger.error("leaves: %s, tagged: %s", leaves, tagged)
    # TODO: identical leaf tokens share one dict slot, so a repeated
    # word gets whichever POS tag came last -- good enough for now.
    leafDict = {}
    for (leaf, taggedPair) in itertools.izip(leaves, tagged):
        leafDict[leaf] = taggedPair[1]
    items = []
    all_pos_tags = set()
    for goldNode in ma_util.walkTree(goldSentence):
        histogram = {}
        for subTreeLeaf in goldNode.leaves():
            tag = leafDict[subTreeLeaf]
            histogram[tag] = histogram.get(tag, 0) + 1
            all_pos_tags.add(tag)
        items.append(histogram)
    return items
def extract_phrase_predictor_sentiment(self, goldSentence, returnSpans=False):
    """Extracts features from PhrasePredictor.

    Three per-span features are produced:
    - ScoreSum (discretized) - sum over learned word weights
    - RegressionScore - predicted Amazon Review Star Rating
    - Token count
    @param goldSentence {PTB tree} Parse tree with sentiment annotation
    @param returnSpans {bool} Also return the raw spans when True
    @returns {tuple} 3-tuple (or 4-tuple with spans) of feature lists
    """
    scoreFeatures = []
    regressionFeatures = []
    tokenCounts = []
    spanList = []
    for node in ma_util.walkTree(goldSentence):
        span = self.pp.getSpan(node)
        prediction = self.pp.main(span, True)
        sentiString = self.pp.score_sum_to_sentiment(prediction[0])
        # NOTE: feeding the raw sum score (prediction[0]) instead of
        # the discretized label performs about the same (difference is
        # likely noise) and spreads feature importance more evenly.
        scoreFeatures.append(ma_util.strSen(sentiString))
        regressionFeatures.append(prediction[1])
        # Token count comes from the tree itself, not prediction[2].
        tokenCounts.append(len(node.leaves()))
        spanList.append(span)
    if returnSpans:
        return (scoreFeatures, regressionFeatures, tokenCounts, spanList)
    else:
        return (scoreFeatures, regressionFeatures, tokenCounts)
def extract_xgrams_from_tree(self, tree):
    """Vectorize every span of *tree* (one document).

    Each tree node's leaves are turned into x-grams and converted to a
    feature vector; the vectors are stacked into one 2-D ndarray with
    one row per node.

    NOTE(review): an older comment claimed a horizontally stacked
    scipy.sparse.csr_matrix is returned, but the code builds a dense
    numpy array -- confirm downstream consumers expect ndarray.

    @param tree {PTB tree} Parse tree for a single document
    @returns {np.ndarray|None} (num_nodes, dim) array, or None when the
        tree yields no nodes (preserves the original behavior)
    """
    # Collect all row vectors first and stack exactly once at the end.
    # The previous version called np.concatenate inside the loop, which
    # copies the full accumulated array every iteration -- O(n^2).
    rows = [self.convert_document_to_vector(self.handleGrams(subTree.leaves()))
            for subTree in ma_util.walkTree(tree)]
    if not rows:
        return None
    return np.asarray(rows)
def extract_gold_sentiment(self, goldSentence, extractLength=False):
    """Extracts gold label.

    @param goldSentence {PTB tree} Parse tree with sentiment annotation
    @param extractLength {bool} When True, pair each label with its
        span's token count
    @returns {list} List of labels, or of (label, length) tuples
    """
    data = []
    for node in ma_util.walkTree(goldSentence):
        # Map the raw node label to the configured granularity.
        label = ma_util.sen(node.node, self.granularity)
        item = (label, len(node.leaves())) if extractLength else label
        data.append(item)
    return data
def extract_sentiWS(self, goldSentence):
    """Extracts features from SentiWS.

    For each span in the goldSentence tree, the positive and negative
    weights (if any) are added in separate features.
    @param goldSentence {PTB tree} Parse tree with sentiment annotation
    @returns {tuple({list}, {list})} Positive and negative features per span
    """
    positives = []
    negatives = []
    for node in ma_util.walkTree(goldSentence):
        # getSentiWSScore yields (negative, positive) -- note the order.
        negScore, posScore = self.getSentiWSScore(node.leaves())
        positives.append(posScore)
        negatives.append(negScore)
    return (positives, negatives)