def antecedent_description(trig_sentdict, ant_sentdict, ant, POS_TAGS):
    """Build the feature list that describes a candidate antecedent.

    The returned list contains, in order:

    1. ``truth`` of whether the antecedent subtree has as many leaves as the
       antecedent has words.
    2. The number of words in the antecedent.
    3. ``truth`` of ``NT.dominates`` applied to the antecedent subtree's root,
       the antecedent subtree and the subtree of the antecedent's trigger.
    4. One count per distinct tag in ``POS_TAGS`` (in dictionary iteration
       order), counting the tags found in the antecedent context's ``'pos'``
       list between the head index and ``len(words)``.
    5. ``truth`` of whether the token at the head index is an auxiliary, a
       verb, or an adjective according to ``DV``.

    Args:
        trig_sentdict: Not used by this function; kept for the call signature.
        ant_sentdict: Sentence dictionary of the antecedent; its ``'pos'``
            entry is indexed with the head index.
        ant: Antecedent object providing ``get_words``, ``get_subtree``,
            ``get_trigger`` and ``get_context``.
        POS_TAGS: Iterable of the POS tags to count.

    Returns:
        A list of feature values.

    Raises:
        KeyError: If a tag in the examined slice is not in ``POS_TAGS``.
    """
    words = ant.get_words()
    ant_tree = ant.get_subtree()

    # Features 1-3: leaf/word agreement, word count, dominance over trigger.
    features = [
        truth(len(ant_tree.leaves()) == len(words)),
        len(words),
        truth(NT.dominates(ant_tree.root(), ant_tree,
                           ant.get_trigger().get_subtree())),
    ]

    # Feature 4: POS tag histogram, every known tag starting at zero.
    tag_counts = dict.fromkeys(POS_TAGS, 0)
    head = get_antecedent_head_index(ant.get_context(), ant)
    # NOTE(review): the slice ends at len(words), not head + len(words);
    # confirm this is the intended window.
    for pos_tag in ant.get_context()['pos'][head:len(words)]:
        tag_counts[pos_tag] += 1
    features.extend(tag_counts.values())

    # Feature 5: category of the token at the antecedent head index.
    features.append(truth(DV.isauxiliary(ant_sentdict, head)))
    head_pos = ant_sentdict['pos'][head]
    features.append(truth(DV.isverb(head_pos)))
    features.append(truth(DV.isadj(head_pos)))

    return features
def antecedent_description(trig_sentdict, ant_sentdict, ant, POS_TAGS):
    """Describe a candidate antecedent as a flat list of features.

    Feature layout of the result:

    * whether the antecedent subtree's leaf count equals its word count
      (passed through ``truth``),
    * the antecedent word count,
    * ``truth`` of ``NT.dominates`` for the antecedent subtree (with its root)
      and the subtree of the antecedent's trigger,
    * one occurrence count for each distinct tag of ``POS_TAGS``, taken over
      ``ant.get_context()['pos'][head:len(words)]``,
    * three ``truth`` flags telling whether the token at the head index is an
      auxiliary, a verb, or an adjective (as decided by ``DV``).

    Args:
        trig_sentdict: Unused here; present to keep the signature stable.
        ant_sentdict: Antecedent sentence dictionary (its ``'pos'`` list is
            read at the head index).
        ant: The antecedent object.
        POS_TAGS: Iterable of POS tags that get a count slot.

    Returns:
        The list of feature values.

    Raises:
        KeyError: When the inspected slice holds a tag missing from
            ``POS_TAGS``.
    """
    tokens = ant.get_words()
    tree = ant.get_subtree()
    description = []

    # Feature 1: does the subtree span exactly the antecedent's words?
    leaves_match_words = len(tree.leaves()) == len(tokens)
    description.append(truth(leaves_match_words))

    # Feature 2: antecedent length in words.
    description.append(len(tokens))

    # Feature 3: dominance relation between antecedent and trigger subtrees.
    trigger_tree = ant.get_trigger().get_subtree()
    description.append(truth(NT.dominates(tree.root(), tree, trigger_tree)))

    # Feature 4: per-tag counts; each known tag starts with a zero count.
    counts = {label: 0 for label in POS_TAGS}
    head_idx = get_antecedent_head_index(ant.get_context(), ant)
    # NOTE(review): the upper bound is len(tokens) rather than
    # head_idx + len(tokens); verify against the callers' expectations.
    tag_window = ant.get_context()['pos'][head_idx:len(tokens)]
    for label in tag_window:
        counts[label] += 1
    description.extend(counts[label] for label in counts)

    # Feature 5: is the token at the head index an auxiliary, verb, adjective?
    description.append(truth(DV.isauxiliary(ant_sentdict, head_idx)))
    description.append(truth(DV.isverb(ant_sentdict['pos'][head_idx])))
    description.append(truth(DV.isadj(ant_sentdict['pos'][head_idx])))
    return description
def alignment_comparison(trig_sentdict, ant_sentdict, ant, trigger,
                         word2vec_dict):
    """Build the feature list comparing an antecedent with its trigger.

    The features are appended in this order:

    1. ``truth`` of whether any auxiliary lemma of the antecedent sentence
       also appears among the trigger sentence's lemmas.
    2. Three ``truth`` flags (head index greater than / equal to / less than
       the trigger index) when both are in the same sentence, else three
       zeros.
    3. For each of ``'words'``, ``'lemmas'`` and ``'pos'``: the number of
       distinct items shared by both contexts, followed by twice that number
       divided by the summed lengths of both lists.
    4. The sentence-number difference, followed by a distance value.
    5. The angle between the averaged NP vectors (90.0 when either NP
       location is ``(-1, -1)`` or the angle computation raises
       ``ValueError``) and ``truth`` of that angle being 0.0.
    6. The antecedent NP vector and the trigger NP vector, each replaced by
       ``WORD2VEC_LENGTH`` zeros when empty.
    7. The angle between the averaged vectors of the text after the
       antecedent head and after the trigger (up to the first punctuation
       lemma), with the same 90.0 fallback, and ``truth`` of it being 0.0.
    8. ``truth`` of the sentential-complement phrase check.
    9. One 0/1 flag per entry of ``ALL_PHRASES`` and the sum of those flags.

    Args:
        trig_sentdict: Sentence dictionary of the trigger; read via its
            ``'words'`` and ``'lemmas'`` entries.
        ant_sentdict: Sentence dictionary of the antecedent; read via its
            ``'words'``, ``'lemmas'`` and ``'tree'`` entries.
        ant: The antecedent object.
        trigger: The trigger object.
        word2vec_dict: Passed through to ``get_average_np_vec``.

    Returns:
        A list of feature values.

    Raises:
        ZeroDivisionError: If, for one of the compared keys, both context
            lists are empty.
    """
    ant_ctx = ant.get_context()
    trig_ctx = trigger.get_context()
    head = get_antecedent_head_index(ant_sentdict, ant)
    features = []

    # Feature 1: an auxiliary lemma shared between the two sentences.
    aux_lemmas = [
        ant_sentdict['lemmas'][pos]
        for pos in range(len(ant_sentdict['words']))
        if DV.isauxiliary(ant_sentdict, pos)
    ]
    features.append(
        truth(any(lemma in trig_sentdict['lemmas'] for lemma in aux_lemmas)))

    # Feature 2: relative order of antecedent head and trigger, which is only
    # meaningful when both are in the same sentence.
    if ant.get_sentnum() == trigger.get_sentnum():
        features += [
            truth(head > trigger.get_idx()),
            truth(head == trigger.get_idx()),
            truth(head < trigger.get_idx()),
        ]
    else:
        features.extend([0, 0, 0])

    # Features 3, 4, 5: overlap of words, lemmas and POS tags.
    for key in ('words', 'lemmas', 'pos'):
        ant_items = ant_ctx[key]
        trig_items = trig_ctx[key]
        combined_size = len(ant_items) + len(trig_items)
        shared = len(set(ant_items) & set(trig_items))
        features.append(shared)
        features.append(2.0 * float(shared) / float(combined_size))

    # Feature 6: sentence offset and a distance value.
    features.append(ant.get_sentnum() - trigger.get_sentnum())
    if ant.get_sentnum() == trigger.get_sentnum():
        features.append(head - trigger.get_idx())
    else:
        # NOTE(review): the loop only runs when the antecedent's sentence
        # number exceeds the trigger's; otherwise the distance is just the
        # head index. Confirm that this is intended.
        offset = head
        sentence = trigger.get_sentnum()
        while sentence < ant.get_sentnum():
            offset += len(trig_sentdict['words'])
            sentence += 1
        features.append(offset)

    # Feature 7: averaged NP vectors of both sides. (-1, -1) marks a missing
    # NP location.
    ant_np_span = ant.get_context()['np']
    ant_has_np = ant_np_span != (-1, -1)
    if ant_has_np:
        ant_np_vec = get_average_np_vec(word2vec_dict, ant_sentdict,
                                        ant_np_span[0], ant_np_span[1])
    else:
        ant_np_vec = []

    trig_np_span = trigger.get_context()['np']
    trig_has_np = trig_np_span != (-1, -1)
    if trig_has_np:
        trig_np_vec = get_average_np_vec(word2vec_dict, trig_sentdict,
                                         trig_np_span[0], trig_np_span[1])
    else:
        trig_np_vec = []

    # Angle between the two NP vectors; 90.0 is the fallback value.
    np_angle = 90.0
    if ant_has_np and trig_has_np:
        ant_norm = vector_length(ant_np_vec)
        trig_norm = vector_length(trig_np_vec)
        try:
            np_angle = angle_btwn_vectors(ant_np_vec, trig_np_vec,
                                          v1_length=ant_norm,
                                          v2_length=trig_norm)
        except ValueError:
            np_angle = 90.0
    features.append(np_angle)
    features.append(truth(np_angle == 0.0))

    # The NP vectors themselves, zero-padded when empty.
    features += ant_np_vec if ant_np_vec else [0] * WORD2VEC_LENGTH
    features += trig_np_vec if trig_np_vec else [0] * WORD2VEC_LENGTH

    # Now for what comes after the head. Only the angle between the two
    # averaged vectors is added, not the vectors themselves.
    head = get_antecedent_head_index(ant_sentdict, ant)
    ant_tail_vec = get_average_np_vec(word2vec_dict, ant_sentdict, head,
                                      len(ant_sentdict['words']))

    # The trigger side stops at the first punctuation lemma found at or after
    # the trigger, or at the end of the sentence.
    trig_word_count = len(trig_sentdict['words'])
    clause_end = next(
        (pos for pos in range(trigger.get_idx(), trig_word_count)
         if DV.ispunctuation(trig_sentdict['lemmas'][pos])),
        trig_word_count)
    trig_tail_vec = get_average_np_vec(word2vec_dict, trig_sentdict,
                                       trigger.get_idx(), clause_end)

    tail_angle = 90.0
    if ant_tail_vec and trig_tail_vec:
        try:
            tail_angle = angle_btwn_vectors(ant_tail_vec, trig_tail_vec)
        except ValueError:
            tail_angle = 90.0
    features.append(tail_angle)
    features.append(truth(tail_angle == 0.0))

    # Sentential complement check.
    sentence_tree = NT.maketree(ant_sentdict['tree'][0])
    if NT.dominates(sentence_tree, ant.get_subtree(), trigger.get_subtree()):
        has_complement = NT.has_phrases_between_trees(
            ant.get_subtree(), trigger.get_subtree(),
            NIELSON_SENTENIAL_COMPLEMENT_PHRASES)
    else:
        has_complement = False
    features.append(truth(has_complement))

    # One flag per phrase type found between antecedent and trigger; all
    # zeros when they are in different sentences.
    if ant.get_sentnum() == trigger.get_sentnum():
        phrase_flags = [
            1 if NT.has_phrases_between_trees(ant.get_subtree(),
                                              trigger.get_subtree(),
                                              [phrase]) else 0
            for phrase in ALL_PHRASES
        ]
    else:
        phrase_flags = [0 for _ in ALL_PHRASES]
    features += phrase_flags
    features.append(sum(phrase_flags))

    return features
def alignment_comparison(trig_sentdict, ant_sentdict, ant, trigger,
                         word2vec_dict):
    """Compare an antecedent with its trigger and return the features.

    Feature order in the returned list:

    1. ``truth`` of whether an auxiliary lemma from the antecedent sentence
       occurs in the trigger sentence's lemmas.
    2. Three ``truth`` flags for the head index being after, at, or before
       the trigger index (same sentence only; otherwise ``0, 0, 0``).
    3. For ``'words'``, ``'lemmas'`` and ``'pos'``: the count of distinct
       shared items between the two contexts, then ``2 * count`` divided by
       the combined length of both lists.
    4. Sentence-number difference, then a distance value.
    5. Angle between the averaged NP vectors, falling back to 90.0 when an NP
       location equals ``(-1, -1)`` or ``ValueError`` is raised, plus
       ``truth`` of the angle equalling 0.0.
    6. Antecedent NP vector, then trigger NP vector; an empty vector is
       replaced with ``WORD2VEC_LENGTH`` zeros.
    7. Angle between the averaged vectors of what follows the antecedent head
       and what follows the trigger (cut at the first punctuation lemma),
       with the same fallback, plus ``truth`` of the angle equalling 0.0.
    8. ``truth`` of the sentential-complement phrase check.
    9. A 0/1 flag for every entry of ``ALL_PHRASES``, then their sum.

    Args:
        trig_sentdict: Trigger sentence dictionary (``'words'`` and
            ``'lemmas'`` are read).
        ant_sentdict: Antecedent sentence dictionary (``'words'``,
            ``'lemmas'`` and ``'tree'`` are read).
        ant: The antecedent object.
        trigger: The trigger object.
        word2vec_dict: Forwarded to ``get_average_np_vec``.

    Returns:
        A list of feature values.

    Raises:
        ZeroDivisionError: If both context lists are empty for one of the
            compared keys.
    """
    ant_ctx = ant.get_context()
    trig_ctx = trigger.get_context()
    head = get_antecedent_head_index(ant_sentdict, ant)
    features = []

    # Feature 1: an auxiliary lemma shared between the two sentences.
    aux_lemmas = [
        ant_sentdict['lemmas'][pos]
        for pos in range(len(ant_sentdict['words']))
        if DV.isauxiliary(ant_sentdict, pos)
    ]
    features.append(
        truth(any(lemma in trig_sentdict['lemmas'] for lemma in aux_lemmas)))

    # Feature 2: relative order of antecedent head and trigger, which is only
    # meaningful when both are in the same sentence.
    if ant.get_sentnum() == trigger.get_sentnum():
        features += [
            truth(head > trigger.get_idx()),
            truth(head == trigger.get_idx()),
            truth(head < trigger.get_idx()),
        ]
    else:
        features.extend([0, 0, 0])

    # Features 3, 4, 5: overlap of words, lemmas and POS tags.
    for key in ('words', 'lemmas', 'pos'):
        ant_items = ant_ctx[key]
        trig_items = trig_ctx[key]
        combined_size = len(ant_items) + len(trig_items)
        shared = len(set(ant_items) & set(trig_items))
        features.append(shared)
        features.append(2.0 * float(shared) / float(combined_size))

    # Feature 6: sentence offset and a distance value.
    features.append(ant.get_sentnum() - trigger.get_sentnum())
    if ant.get_sentnum() == trigger.get_sentnum():
        features.append(head - trigger.get_idx())
    else:
        # NOTE(review): the loop only runs when the antecedent's sentence
        # number exceeds the trigger's; otherwise the distance is just the
        # head index. Confirm that this is intended.
        offset = head
        sentence = trigger.get_sentnum()
        while sentence < ant.get_sentnum():
            offset += len(trig_sentdict['words'])
            sentence += 1
        features.append(offset)

    # Feature 7: averaged NP vectors of both sides. (-1, -1) marks a missing
    # NP location.
    ant_np_span = ant.get_context()['np']
    ant_has_np = ant_np_span != (-1, -1)
    if ant_has_np:
        ant_np_vec = get_average_np_vec(word2vec_dict, ant_sentdict,
                                        ant_np_span[0], ant_np_span[1])
    else:
        ant_np_vec = []

    trig_np_span = trigger.get_context()['np']
    trig_has_np = trig_np_span != (-1, -1)
    if trig_has_np:
        trig_np_vec = get_average_np_vec(word2vec_dict, trig_sentdict,
                                         trig_np_span[0], trig_np_span[1])
    else:
        trig_np_vec = []

    # Angle between the two NP vectors; 90.0 is the fallback value.
    np_angle = 90.0
    if ant_has_np and trig_has_np:
        ant_norm = vector_length(ant_np_vec)
        trig_norm = vector_length(trig_np_vec)
        try:
            np_angle = angle_btwn_vectors(ant_np_vec, trig_np_vec,
                                          v1_length=ant_norm,
                                          v2_length=trig_norm)
        except ValueError:
            np_angle = 90.0
    features.append(np_angle)
    features.append(truth(np_angle == 0.0))

    # The NP vectors themselves, zero-padded when empty.
    features += ant_np_vec if ant_np_vec else [0] * WORD2VEC_LENGTH
    features += trig_np_vec if trig_np_vec else [0] * WORD2VEC_LENGTH

    # Now for what comes after the head. Only the angle between the two
    # averaged vectors is added, not the vectors themselves.
    head = get_antecedent_head_index(ant_sentdict, ant)
    ant_tail_vec = get_average_np_vec(word2vec_dict, ant_sentdict, head,
                                      len(ant_sentdict['words']))

    # The trigger side stops at the first punctuation lemma found at or after
    # the trigger, or at the end of the sentence.
    trig_word_count = len(trig_sentdict['words'])
    clause_end = next(
        (pos for pos in range(trigger.get_idx(), trig_word_count)
         if DV.ispunctuation(trig_sentdict['lemmas'][pos])),
        trig_word_count)
    trig_tail_vec = get_average_np_vec(word2vec_dict, trig_sentdict,
                                       trigger.get_idx(), clause_end)

    tail_angle = 90.0
    if ant_tail_vec and trig_tail_vec:
        try:
            tail_angle = angle_btwn_vectors(ant_tail_vec, trig_tail_vec)
        except ValueError:
            tail_angle = 90.0
    features.append(tail_angle)
    features.append(truth(tail_angle == 0.0))

    # Sentential complement check.
    sentence_tree = NT.maketree(ant_sentdict['tree'][0])
    if NT.dominates(sentence_tree, ant.get_subtree(), trigger.get_subtree()):
        has_complement = NT.has_phrases_between_trees(
            ant.get_subtree(), trigger.get_subtree(),
            NIELSON_SENTENIAL_COMPLEMENT_PHRASES)
    else:
        has_complement = False
    features.append(truth(has_complement))

    # One flag per phrase type found between antecedent and trigger; all
    # zeros when they are in different sentences.
    if ant.get_sentnum() == trigger.get_sentnum():
        phrase_flags = [
            1 if NT.has_phrases_between_trees(ant.get_subtree(),
                                              trigger.get_subtree(),
                                              [phrase]) else 0
            for phrase in ALL_PHRASES
        ]
    else:
        phrase_flags = [0 for _ in ALL_PHRASES]
    features += phrase_flags
    features.append(sum(phrase_flags))

    return features