예제 #1
0
def antecedent_description(trig_sentdict, ant_sentdict, ant, POS_TAGS):
    """Build the feature list that describes one antecedent candidate.

    Args:
        trig_sentdict: Not read by this function.
        ant_sentdict: Sentence dictionary; its 'pos' list is indexed at the
            antecedent head position and the dictionary itself is handed to
            DV.isauxiliary.
        ant: Antecedent object; get_words(), get_subtree(), get_trigger()
            and get_context() are called on it.
        POS_TAGS: Iterable of tags; one count feature is emitted per
            distinct tag.

    Returns:
        A list holding, in order: truth() of "subtree leaf count equals the
        number of antecedent words", the number of antecedent words, truth()
        of the NT.dominates check against the trigger subtree, one count per
        tag, and truth() of the auxiliary / verb / adjective checks at the
        head index.
    """
    words = ant.get_words()
    ant_tree = ant.get_subtree()

    features = [
        # Feature 1: subtree has exactly as many leaves as antecedent words.
        truth(len(ant_tree.leaves()) == len(words)),
        # Feature 2: antecedent length in words.
        len(words),
        # Feature 3: NT.dominates check between the antecedent subtree and
        # the subtree of the antecedent's trigger.
        truth(NT.dominates(ant_tree.root(), ant_tree,
                           ant.get_trigger().get_subtree())),
    ]

    # Feature 4: one counter per tag in POS_TAGS, all starting at zero.
    tag_counts = {tag: 0 for tag in POS_TAGS}

    head = get_antecedent_head_index(ant.get_context(), ant)
    # NOTE(review): the slice stops at len(words), not head + len(words), so
    # it is empty whenever head >= len(words) -- confirm this is intended.
    for seen_tag in ant.get_context()['pos'][head:len(words)]:
        # A tag that is absent from POS_TAGS raises KeyError here.
        tag_counts[seen_tag] += 1

    features.extend(tag_counts.values())

    # Feature 5: is the token at the head index an auxiliary, verb, adj.
    features.append(truth(DV.isauxiliary(ant_sentdict, head)))
    head_pos = ant_sentdict['pos'][head]
    features.append(truth(DV.isverb(head_pos)))
    features.append(truth(DV.isadj(head_pos)))

    return features
예제 #2
0
def antecedent_description(trig_sentdict, ant_sentdict, ant, POS_TAGS):
    """Build the feature list that describes one antecedent candidate.

    Args:
        trig_sentdict: Not read by this function.
        ant_sentdict: Sentence dictionary; its 'pos' list is indexed at the
            antecedent head position and the dictionary itself is handed to
            DV.isauxiliary.
        ant: Antecedent object; get_words(), get_subtree(), get_trigger()
            and get_context() are called on it.
        POS_TAGS: Iterable of tags; one count feature is emitted per
            distinct tag.

    Returns:
        A list holding, in order: truth() of "subtree leaf count equals the
        number of antecedent words", the number of antecedent words, truth()
        of the NT.dominates check against the trigger subtree, one count per
        tag, and truth() of the auxiliary / verb / adjective checks at the
        head index.
    """
    words = ant.get_words()
    ant_tree = ant.get_subtree()

    features = [
        # Feature 1: subtree has exactly as many leaves as antecedent words.
        truth(len(ant_tree.leaves()) == len(words)),
        # Feature 2: antecedent length in words.
        len(words),
        # Feature 3: NT.dominates check between the antecedent subtree and
        # the subtree of the antecedent's trigger.
        truth(NT.dominates(ant_tree.root(), ant_tree,
                           ant.get_trigger().get_subtree())),
    ]

    # Feature 4: one counter per tag in POS_TAGS, all starting at zero.
    tag_counts = {tag: 0 for tag in POS_TAGS}

    head = get_antecedent_head_index(ant.get_context(), ant)
    # NOTE(review): the slice stops at len(words), not head + len(words), so
    # it is empty whenever head >= len(words) -- confirm this is intended.
    for seen_tag in ant.get_context()['pos'][head:len(words)]:
        # A tag that is absent from POS_TAGS raises KeyError here.
        tag_counts[seen_tag] += 1

    features.extend(tag_counts.values())

    # Feature 5: is the token at the head index an auxiliary, verb, adj.
    features.append(truth(DV.isauxiliary(ant_sentdict, head)))
    head_pos = ant_sentdict['pos'][head]
    features.append(truth(DV.isverb(head_pos)))
    features.append(truth(DV.isadj(head_pos)))

    return features
예제 #3
0
def nielson_features(trig_sentdict, ant_sentdict, ant, trigger):
    """Return the (currently empty) feature list for this feature group.

    The NT.dominates check between the antecedent and trigger subtrees is
    evaluated, but its result is not recorded in the returned list.

    Args:
        trig_sentdict: Not read by this function.
        ant_sentdict: Not read by this function.
        ant: Antecedent object; get_subtree() is called on it.
        trigger: Trigger object; get_subtree() is called on it.

    Returns:
        A new empty list.
    """
    ant_root = ant.get_subtree().root()
    ant_tree = ant.get_subtree()
    trig_tree = trigger.get_subtree()

    dominated = NT.dominates(ant_root, ant_tree, trig_tree)
    if dominated:
        # Bare `print`: emits a blank line under Python 2 and is a no-op
        # expression under Python 3.
        print

    return []
예제 #4
0
def nielson_features(trig_sentdict, ant_sentdict, ant, trigger):
    """Return the (currently empty) feature list for this feature group.

    The NT.dominates check between the antecedent and trigger subtrees is
    evaluated, but its result is not recorded in the returned list.

    Args:
        trig_sentdict: Not read by this function.
        ant_sentdict: Not read by this function.
        ant: Antecedent object; get_subtree() is called on it.
        trigger: Trigger object; get_subtree() is called on it.

    Returns:
        A new empty list.
    """
    ant_root = ant.get_subtree().root()
    ant_tree = ant.get_subtree()
    trig_tree = trigger.get_subtree()

    dominated = NT.dominates(ant_root, ant_tree, trig_tree)
    if dominated:
        # Bare `print`: emits a blank line under Python 2 and is a no-op
        # expression under Python 3.
        print

    return []
예제 #5
0
def alignment_comparison(trig_sentdict, ant_sentdict, ant, trigger,
                         word2vec_dict):
    """Build the features that compare an antecedent candidate to a trigger.

    Args:
        trig_sentdict: Sentence dictionary of the trigger; the 'words' and
            'lemmas' lists are read and the dictionary is passed to
            get_average_np_vec.
        ant_sentdict: Sentence dictionary of the antecedent; 'words',
            'lemmas' and 'tree' are read and the dictionary is passed to
            DV.isauxiliary, get_antecedent_head_index and get_average_np_vec.
        ant: Antecedent object; get_context(), get_sentnum() and
            get_subtree() are called on it.
        trigger: Trigger object; get_context(), get_sentnum(), get_idx() and
            get_subtree() are called on it.
        word2vec_dict: Passed through to get_average_np_vec.

    Returns:
        A list holding, in order:
          1. whether any antecedent-sentence auxiliary lemma occurs among the
             trigger-sentence lemmas;
          2. three flags for head index >, ==, < trigger index (all 0 when
             the two are in different sentences);
          3. for each of 'words', 'lemmas', 'pos' of the two contexts: the
             number of shared distinct items and 2 * shared / total length;
          4. the sentence-number difference and a word distance;
          5. the angle between the averaged NP vectors and an "angle is 0"
             flag, followed by both NP vectors (zero-padded when missing);
          6. the angle between the averaged post-head and post-trigger
             vectors and its "angle is 0" flag;
          7. the sentential-complement flag;
          8. one 0/1 entry per phrase in ALL_PHRASES and their sum.
    """
    vector = []

    ant_context_sentdict = ant.get_context()
    trig_context_sentdict = trigger.get_context()

    # Looked up once and reused everywhere below.
    ant_head_idx = get_antecedent_head_index(ant_sentdict, ant)
    trig_idx = trigger.get_idx()
    ant_sentnum = ant.get_sentnum()
    trig_sentnum = trigger.get_sentnum()
    same_sentence = ant_sentnum == trig_sentnum
    ant_subtree = ant.get_subtree()
    trig_subtree = trigger.get_subtree()

    # Feature 1: does an auxiliary lemma of the antecedent sentence also
    # appear among the trigger sentence's lemmas?
    ant_auxs = [
        ant_sentdict['lemmas'][i]
        for i in range(len(ant_sentdict['words']))
        if DV.isauxiliary(ant_sentdict, i)
    ]
    trig_lemmas = trig_sentdict['lemmas']
    vector.append(truth(any(aux in trig_lemmas for aux in ant_auxs)))

    # Feature 2: relative order of head and trigger within one sentence.
    if same_sentence:
        vector.append(truth(ant_head_idx > trig_idx))
        vector.append(truth(ant_head_idx == trig_idx))
        vector.append(truth(ant_head_idx < trig_idx))
    else:
        vector += [0, 0, 0]

    # Features 3,4,5: overlap between the two contexts.
    for k in ('words', 'lemmas', 'pos'):
        ant_items = ant_context_sentdict[k]
        trig_items = trig_context_sentdict[k]
        total = len(ant_items) + len(trig_items)
        common = len(set(ant_items).intersection(trig_items))
        vector.append(common)
        # Two empty contexts share nothing; report 0.0 instead of dividing
        # by zero.
        vector.append(2.0 * common / total if total else 0.0)

    # Feature 6 - number of words between trigger and antecedent.
    vector.append(ant_sentnum - trig_sentnum)
    if same_sentence:
        vector.append(ant_head_idx - trig_idx)
    else:
        # One trigger-sentence length is added per sentence by which the
        # antecedent's sentence number exceeds the trigger's.
        # NOTE(review): when the antecedent sentence comes first nothing is
        # added and the value is just ant_head_idx -- confirm intended.
        sentences_ahead = max(0, ant_sentnum - trig_sentnum)
        vector.append(
            ant_head_idx + sentences_ahead * len(trig_sentdict['words']))

    # Feature 7.
    # Average the word vectors of the antecedent NP and of the trigger NP;
    # a location of (-1, -1) means there is no NP to average.
    ant_np_location = ant_context_sentdict['np']
    has_ant_np = ant_np_location != (-1, -1)
    ant_np_word2vec = []
    if has_ant_np:
        ant_np_word2vec = get_average_np_vec(word2vec_dict, ant_sentdict,
                                             ant_np_location[0],
                                             ant_np_location[1])

    trig_np_location = trig_context_sentdict['np']
    has_trig_np = trig_np_location != (-1, -1)
    trig_np_word2vec = []
    if has_trig_np:
        trig_np_word2vec = get_average_np_vec(word2vec_dict, trig_sentdict,
                                              trig_np_location[0],
                                              trig_np_location[1])

    # Angle between the two NP vectors; 90.0 is the fallback when either NP
    # is missing or the angle computation raises ValueError.
    angle = 90.0
    if has_ant_np and has_trig_np:
        ant_length = vector_length(ant_np_word2vec)
        trig_length = vector_length(trig_np_word2vec)
        try:
            angle = angle_btwn_vectors(ant_np_word2vec,
                                       trig_np_word2vec,
                                       v1_length=ant_length,
                                       v2_length=trig_length)
        except ValueError:
            angle = 90.0

    vector.append(angle)
    vector.append(truth(angle == 0.0))

    # The NP vectors themselves; a missing one is replaced by zeros so the
    # feature list keeps a fixed length.
    vector += ant_np_word2vec if ant_np_word2vec else [0] * WORD2VEC_LENGTH
    vector += trig_np_word2vec if trig_np_word2vec else [0] * WORD2VEC_LENGTH

    # Now for what comes after the head. Only the angle between the two
    # averaged vectors is used; the vectors themselves are not appended.
    ant_post_head_w2vec = get_average_np_vec(word2vec_dict, ant_sentdict,
                                             ant_head_idx,
                                             len(ant_sentdict['words']))

    # The trigger span stops at the first punctuation lemma at or after the
    # trigger, or at the end of the sentence when there is none.
    trig_word_count = len(trig_sentdict['words'])
    stop_idx = next(
        (i for i in range(trig_idx, trig_word_count)
         if DV.ispunctuation(trig_sentdict['lemmas'][i])),
        trig_word_count)

    post_trig_w2vec = get_average_np_vec(word2vec_dict, trig_sentdict,
                                         trig_idx, stop_idx)

    post_angle = 90.0
    if ant_post_head_w2vec and post_trig_w2vec:
        try:
            post_angle = angle_btwn_vectors(ant_post_head_w2vec,
                                            post_trig_w2vec)
        except ValueError:
            post_angle = 90.0

    vector.append(post_angle)
    vector.append(truth(post_angle == 0.0))

    # Sentential complement check.
    tree = NT.maketree(ant_sentdict['tree'][0])
    if NT.dominates(tree, ant_subtree, trig_subtree):
        vector.append(
            truth(
                NT.has_phrases_between_trees(
                    ant_subtree, trig_subtree,
                    NIELSON_SENTENIAL_COMPLEMENT_PHRASES)))
    else:
        vector.append(truth(False))

    # One 0/1 entry per phrase type found between the antecedent and the
    # trigger (all zeros across sentences), followed by their sum.
    if same_sentence:
        phrases_between = [
            1 if NT.has_phrases_between_trees(ant_subtree, trig_subtree,
                                              [phrase]) else 0
            for phrase in ALL_PHRASES
        ]
    else:
        phrases_between = [0 for _ in ALL_PHRASES]

    vector += phrases_between
    vector.append(sum(phrases_between))

    return vector
예제 #6
0
def alignment_comparison(trig_sentdict, ant_sentdict, ant, trigger,
                         word2vec_dict):
    """Build the features that compare an antecedent candidate to a trigger.

    Args:
        trig_sentdict: Sentence dictionary of the trigger; the 'words' and
            'lemmas' lists are read and the dictionary is passed to
            get_average_np_vec.
        ant_sentdict: Sentence dictionary of the antecedent; 'words',
            'lemmas' and 'tree' are read and the dictionary is passed to
            DV.isauxiliary, get_antecedent_head_index and get_average_np_vec.
        ant: Antecedent object; get_context(), get_sentnum() and
            get_subtree() are called on it.
        trigger: Trigger object; get_context(), get_sentnum(), get_idx() and
            get_subtree() are called on it.
        word2vec_dict: Passed through to get_average_np_vec.

    Returns:
        A list holding, in order:
          1. whether any antecedent-sentence auxiliary lemma occurs among the
             trigger-sentence lemmas;
          2. three flags for head index >, ==, < trigger index (all 0 when
             the two are in different sentences);
          3. for each of 'words', 'lemmas', 'pos' of the two contexts: the
             number of shared distinct items and 2 * shared / total length;
          4. the sentence-number difference and a word distance;
          5. the angle between the averaged NP vectors and an "angle is 0"
             flag, followed by both NP vectors (zero-padded when missing);
          6. the angle between the averaged post-head and post-trigger
             vectors and its "angle is 0" flag;
          7. the sentential-complement flag;
          8. one 0/1 entry per phrase in ALL_PHRASES and their sum.
    """
    vector = []

    ant_context_sentdict = ant.get_context()
    trig_context_sentdict = trigger.get_context()

    # Looked up once and reused everywhere below.
    ant_head_idx = get_antecedent_head_index(ant_sentdict, ant)
    trig_idx = trigger.get_idx()
    ant_sentnum = ant.get_sentnum()
    trig_sentnum = trigger.get_sentnum()
    same_sentence = ant_sentnum == trig_sentnum
    ant_subtree = ant.get_subtree()
    trig_subtree = trigger.get_subtree()

    # Feature 1: does an auxiliary lemma of the antecedent sentence also
    # appear among the trigger sentence's lemmas?
    ant_auxs = [
        ant_sentdict['lemmas'][i]
        for i in range(len(ant_sentdict['words']))
        if DV.isauxiliary(ant_sentdict, i)
    ]
    trig_lemmas = trig_sentdict['lemmas']
    vector.append(truth(any(aux in trig_lemmas for aux in ant_auxs)))

    # Feature 2: relative order of head and trigger within one sentence.
    if same_sentence:
        vector.append(truth(ant_head_idx > trig_idx))
        vector.append(truth(ant_head_idx == trig_idx))
        vector.append(truth(ant_head_idx < trig_idx))
    else:
        vector += [0, 0, 0]

    # Features 3,4,5: overlap between the two contexts.
    for k in ('words', 'lemmas', 'pos'):
        ant_items = ant_context_sentdict[k]
        trig_items = trig_context_sentdict[k]
        total = len(ant_items) + len(trig_items)
        common = len(set(ant_items).intersection(trig_items))
        vector.append(common)
        # Two empty contexts share nothing; report 0.0 instead of dividing
        # by zero.
        vector.append(2.0 * common / total if total else 0.0)

    # Feature 6 - number of words between trigger and antecedent.
    vector.append(ant_sentnum - trig_sentnum)
    if same_sentence:
        vector.append(ant_head_idx - trig_idx)
    else:
        # One trigger-sentence length is added per sentence by which the
        # antecedent's sentence number exceeds the trigger's.
        # NOTE(review): when the antecedent sentence comes first nothing is
        # added and the value is just ant_head_idx -- confirm intended.
        sentences_ahead = max(0, ant_sentnum - trig_sentnum)
        vector.append(
            ant_head_idx + sentences_ahead * len(trig_sentdict['words']))

    # Feature 7.
    # Average the word vectors of the antecedent NP and of the trigger NP;
    # a location of (-1, -1) means there is no NP to average.
    ant_np_location = ant_context_sentdict['np']
    has_ant_np = ant_np_location != (-1, -1)
    ant_np_word2vec = []
    if has_ant_np:
        ant_np_word2vec = get_average_np_vec(word2vec_dict, ant_sentdict,
                                             ant_np_location[0],
                                             ant_np_location[1])

    trig_np_location = trig_context_sentdict['np']
    has_trig_np = trig_np_location != (-1, -1)
    trig_np_word2vec = []
    if has_trig_np:
        trig_np_word2vec = get_average_np_vec(word2vec_dict, trig_sentdict,
                                              trig_np_location[0],
                                              trig_np_location[1])

    # Angle between the two NP vectors; 90.0 is the fallback when either NP
    # is missing or the angle computation raises ValueError.
    angle = 90.0
    if has_ant_np and has_trig_np:
        ant_length = vector_length(ant_np_word2vec)
        trig_length = vector_length(trig_np_word2vec)
        try:
            angle = angle_btwn_vectors(ant_np_word2vec,
                                       trig_np_word2vec,
                                       v1_length=ant_length,
                                       v2_length=trig_length)
        except ValueError:
            angle = 90.0

    vector.append(angle)
    vector.append(truth(angle == 0.0))

    # The NP vectors themselves; a missing one is replaced by zeros so the
    # feature list keeps a fixed length.
    vector += ant_np_word2vec if ant_np_word2vec else [0] * WORD2VEC_LENGTH
    vector += trig_np_word2vec if trig_np_word2vec else [0] * WORD2VEC_LENGTH

    # Now for what comes after the head. Only the angle between the two
    # averaged vectors is used; the vectors themselves are not appended.
    ant_post_head_w2vec = get_average_np_vec(word2vec_dict, ant_sentdict,
                                             ant_head_idx,
                                             len(ant_sentdict['words']))

    # The trigger span stops at the first punctuation lemma at or after the
    # trigger, or at the end of the sentence when there is none.
    trig_word_count = len(trig_sentdict['words'])
    stop_idx = next(
        (i for i in range(trig_idx, trig_word_count)
         if DV.ispunctuation(trig_sentdict['lemmas'][i])),
        trig_word_count)

    post_trig_w2vec = get_average_np_vec(word2vec_dict, trig_sentdict,
                                         trig_idx, stop_idx)

    post_angle = 90.0
    if ant_post_head_w2vec and post_trig_w2vec:
        try:
            post_angle = angle_btwn_vectors(ant_post_head_w2vec,
                                            post_trig_w2vec)
        except ValueError:
            post_angle = 90.0

    vector.append(post_angle)
    vector.append(truth(post_angle == 0.0))

    # Sentential complement check.
    tree = NT.maketree(ant_sentdict['tree'][0])
    if NT.dominates(tree, ant_subtree, trig_subtree):
        vector.append(
            truth(
                NT.has_phrases_between_trees(
                    ant_subtree, trig_subtree,
                    NIELSON_SENTENIAL_COMPLEMENT_PHRASES)))
    else:
        vector.append(truth(False))

    # One 0/1 entry per phrase type found between the antecedent and the
    # trigger (all zeros across sentences), followed by their sum.
    if same_sentence:
        phrases_between = [
            1 if NT.has_phrases_between_trees(ant_subtree, trig_subtree,
                                              [phrase]) else 0
            for phrase in ALL_PHRASES
        ]
    else:
        phrases_between = [0 for _ in ALL_PHRASES]

    vector += phrases_between
    vector.append(sum(phrases_between))

    return vector