def gold_analysis(self):
    print 'Triggers:',
    freq = {}
    for trig in self.train_triggers + self.test_triggers + self.val_triggers:
        if trig.type not in freq:
            freq[trig.type] = 1
        else:
            freq[trig.type] += 1
    print freq
    print 'Total:', sum(freq.values())

    d = {'starts_with_aux': [], 'ends_with_aux': []}

    all_pos = set()
    for s in self.sentences:
        for p in s.pos:
            all_pos.add(p)

    ant_start_pos = set()
    for trig in self.train_triggers:
        if wc.is_aux_lemma(trig.gold_ant.sub_sentdict.words[0]):
            d['starts_with_aux'].append(trig.gold_ant)
        ant_start_pos.add(trig.gold_ant.sub_sentdict.pos[0])

    ant_end_pos = set()
    for trig in self.train_triggers:
        if wc.is_aux_lemma(trig.gold_ant.sub_sentdict.words[-1]):
            d['ends_with_aux'].append(trig.gold_ant)
        ant_end_pos.add(trig.gold_ant.sub_sentdict.pos[-1])

    print 'Ants never start with these tags: ', all_pos - ant_start_pos
    print 'Percent of ants that start with auxs: ', len(d['starts_with_aux']) / float(len(self.train_triggers))
    print 'Ants never END with these tags: ', all_pos - ant_end_pos
    print 'Percent of ants that END with auxs: ', len(d['ends_with_aux']) / float(len(self.train_triggers))
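# A minimal sketch (not part of the original module): the trigger-type
# frequency bookkeeping in gold_analysis() can be written with
# collections.Counter. `triggers` stands in for the combined
# train/test/val trigger lists and `.type` is the attribute used above.
from collections import Counter

def trigger_type_counts(triggers):
    """Return a Counter mapping each trigger type to its frequency."""
    return Counter(trig.type for trig in triggers)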
def get_head(self, idx=False, idx_in_subsentdict=False):
    """
    Return the head of the antecedent: the first non-auxiliary verb,
    either as a word (default) or as an index.
    @rtype: str or int
    """
    for i in range(len(self.sub_sentdict)):
        if wc.is_verb(self.sub_sentdict.pos[i]) and not wc.is_aux_lemma(self.sub_sentdict.lemmas[i]):
            if idx:
                if idx_in_subsentdict:
                    return i
                else:
                    return self.start + i
            else:
                return self.sub_sentdict.words[i]

    # Fallback: no non-auxiliary verb was found, so default to the first word.
    if idx and not idx_in_subsentdict:
        return self.start
    elif idx_in_subsentdict:
        return 0
    else:
        try:
            return self.sub_sentdict.words[0]
        except IndexError:
            return ''
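# A minimal usage sketch (assumes an Antecedent-like object as defined above;
# the helper name `describe_head` is hypothetical and not part of the module).
def describe_head(ant):
    head_word = ant.get_head()                                    # surface form of the head verb
    sent_idx = ant.get_head(idx=True)                             # index into the full sentence
    local_idx = ant.get_head(idx=True, idx_in_subsentdict=True)   # index within the antecedent span
    return head_word, sent_idx, local_idx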
def hardt_features(ant, trig, sentences, pos_tags):
    """
    This exists to add features that are somewhat based on what Hardt did in 1997.
    @type ant: vpe_objects.Antecedent
    @type trig: vpe_objects.Auxiliary
    @type sentences: vpe_objects.AllSentences
    """
    v = []
    sent_tree = sentences.get_sentence_tree(ant.sentnum)
    ant_sent = sentences.get_sentence(ant.sentnum)
    trig_sent = sentences.get_sentence(trig.sentnum)

    vp = sentences.nearest_vp(trig)
    vp_head = vp.get_head()
    vp_head_idx = vp.get_head(idx=True)
    ant_head = ant.get_head()
    ant_head_idx = ant.get_head(idx=True)

    v.append(1.0 if ant == vp else 0.0)
    v.append(1.0 if ant_head == vp_head else 0.0)
    v.append(1.0 if vp.start <= ant_head_idx <= vp.end else 0.0)
    v.append(1.0 if ant.start <= vp_head_idx <= ant.end else 0.0)
    v.append(ant.sentnum - vp.sentnum)
    v.append(ant.start - vp.start)
    v.append(ant.end - vp.end)

    # be-do form
    try:
        v.append(1.0 if wc.is_be(ant_sent.lemmas[ant.start - 1])
                 or wc.is_be(ant_sent.lemmas[ant.start]) else 0.0)
        v.append(1.0 if trig.type == 'do' and v[-1] == 1.0 else 0.0)
    except IndexError:
        v += [0.0, 0.0]

    # Quotation features
    quote_start_trig, quote_end_trig = None, None
    for i, w in enumerate(trig_sent.lemmas):
        if w == "\"":
            if quote_start_trig is None:
                quote_start_trig = i
            else:
                quote_end_trig = i
                break

    trig_in_quotes = False
    if quote_start_trig is not None and quote_end_trig is not None:
        trig_in_quotes = quote_start_trig <= trig.wordnum <= quote_end_trig
        v.append(1.0 if trig_in_quotes else 0.0)
    else:
        v.append(0.0)

    quote_start_ant, quote_end_ant = None, None
    for i, w in enumerate(ant_sent.lemmas):
        if w == "\"":
            if quote_start_ant is None:
                quote_start_ant = i
            else:
                quote_end_ant = i
                break

    ant_in_quotes = False
    if quote_start_ant is not None and quote_end_ant is not None:
        ant_in_quotes = quote_start_ant <= ant.start <= quote_end_ant \
                        and quote_start_ant <= ant.end <= quote_end_ant
        v.append(1.0 if quote_start_ant <= ant.start <= quote_end_ant else 0.0)
        v.append(1.0 if quote_start_ant <= ant.end <= quote_end_ant else 0.0)
    else:
        v += [0.0, 0.0]

    v.append(1.0 if trig_in_quotes and ant_in_quotes else 0.0)

    # Nielsen features
    v.append(1.0 if wc.is_aux_lemma(ant.sub_sentdict.lemmas[0]) else 0.0)
    v.append(1.0 if wc.is_aux_lemma(
        ant.sub_sentdict.lemmas[ant.get_head(idx=True, idx_in_subsentdict=True)]) else 0.0)

    for tag in pos_tags:
        v.append(1.0 if tag == ant.sub_sentdict.pos[0] else 0.0)   # Sparse encoding of the POS tag of the first word in ant
        v.append(1.0 if tag == ant.sub_sentdict.pos[-1] else 0.0)  # Sparse encoding of the POS tag of the last word in ant
        v.append(float(ant.sub_sentdict.pos.count(tag)) / len(ant.sub_sentdict))  # Frequency of the given POS tag in ant

    for fun in [wc.is_adverb, wc.is_verb, wc.is_adverb, wc.is_noun,
                wc.is_preposition, wc.is_punctuation, wc.is_predicative]:
        v.append(1.0 if fun(ant.sub_sentdict.pos[0]) else 0.0)   # Sparse encoding of the word class of the first word in ant
        v.append(1.0 if fun(ant.sub_sentdict.pos[-1]) else 0.0)  # Sparse encoding of the word class of the last word in ant
        v.append(float(len(filter(fun, ant.sub_sentdict.pos))) / len(ant.sub_sentdict))  # Frequency of the given word class in ant

    sent_phrases = get_phrases(sent_tree)
    ant_phrases = lowest_common_subtree_phrases(sent_tree, ant.get_words())

    v.append(float(len(ant_phrases)) / len(sent_phrases))
    for phrase in ['NP', 'VP', 'S', 'SINV', 'ADVP', 'ADJP', 'PP']:
        # Proportion of phrases of the given type in the antecedent subtree and in the full sentence.
        v.append(len(filter(lambda s: s.startswith(phrase), ant_phrases)) / float(len(ant_phrases)))
        v.append(len(filter(lambda s: s.startswith(phrase), sent_phrases)) / float(len(sent_phrases)))

    continuation_words = ['than', 'as', 'so']
    if ant.sentnum == trig.sentnum:
        v.append(1.0)
        for word in continuation_words:
            v.append(1.0 if word in ant_sent.words[ant.end:trig.wordnum] else 0.0)
    else:
        v.append(0.0)
        for _ in continuation_words:
            v.append(0.0)

    try:
        v.append(1.0 if ant_sent.words[ant.start - 1] == trig.word else 0.0)
        v.append(1.0 if ant_sent.lemmas[ant.start - 1] == trig.lemma else 0.0)
        v.append(1.0 if ant_sent.lemmas[ant.start - 1] == trig.type else 0.0)
        v.append(1.0 if ant_sent.pos[ant.start - 1] == trig.pos else 0.0)
    except IndexError:
        v += [0.0, 0.0, 0.0, 0.0]

    # Theoretical linguistics features
    if ant.sentnum == trig.sentnum:
        word_positions = getwordtreepositions(sent_tree)
        v.append(1.0)
        v.append(1.0 if wc.ccommands(ant.start, trig.wordnum, sent_tree, word_positions) else 0.0)
        v.append(1.0 if wc.ccommands(trig.wordnum, ant.start, sent_tree, word_positions) else 0.0)
        v.append(1.0 if wc.ccommands(ant.end, trig.wordnum, sent_tree, word_positions) else 0.0)
        v.append(1.0 if wc.ccommands(trig.wordnum, ant.end, sent_tree, word_positions) else 0.0)

        # Check if a word in the antecedent c-commands the trigger and vice versa.
        ant_word_ccommands, trig_ccommands = False, False
        for idx in range(ant.start, ant.end + 1):
            if wc.ccommands(idx, trig.wordnum, sent_tree, word_positions):
                ant_word_ccommands = True
            if wc.ccommands(trig.wordnum, idx, sent_tree, word_positions):
                trig_ccommands = True
            if ant_word_ccommands and trig_ccommands:  # minor speed boost
                break
        # Append each flag exactly once so the feature vector has a fixed length.
        v.append(1.0 if ant_word_ccommands else 0.0)
        v.append(1.0 if trig_ccommands else 0.0)
    else:
        v += [0.0 for _ in range(7)]

    return v
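# A minimal glue sketch (hypothetical, not part of the original module): building
# a design matrix from hardt_features() for the candidate antecedents of a single
# trigger. Assumes numpy is available; `candidates`, `trig`, `sentences`, and
# `pos_tags` are the same kinds of objects passed to hardt_features() above.
import numpy as np

def candidate_feature_matrix(candidates, trig, sentences, pos_tags):
    rows = [hardt_features(ant, trig, sentences, pos_tags) for ant in candidates]
    # Every row has the same length because the c-command features above are
    # appended a fixed number of times per candidate.
    return np.array(rows, dtype=float)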