Example #1
    def set_possible_ants(self, trigger, pos_tests):
        for sentnum in range(max(0, trigger.sentnum - SENTENCE_SEARCH_DISTANCE), trigger.sentnum + 1):
            functions = [f for f in pos_tests if hasattr(f, '__call__')]

            for i in range(len(self.sentences[sentnum])):
                tag = self.sentences[sentnum].pos[i]

                # TODO: ADDED SECOND CLAUSE TO THIS IF TO LOWER NUMBER OF CANDIDATES GENERATED
                if True in (f(tag) for f in functions):  # and not wc.is_aux_lemma(self.sentences[sentnum].lemmas[i]):
                    phrase = nt.get_nearest_phrase(nt.maketree(self.sentences[sentnum]['tree'][0]), i, pos_tests)
                    phrase_length = nt.get_phrase_length(phrase)

                    # if phrase_length <= 2:
                    #     print phrase

                    for j in range(i, min(i + phrase_length + 1, len(self.sentences[sentnum]))):
                        if not ant_after_trigger(sentnum, i, j, trigger):
                            bad = False
                            for pos_check in [wc.is_preposition, wc.is_punctuation, wc.is_determiner]:
                                if pos_check(self.sentences[sentnum].pos[j - 1]):
                                    bad = True

                            if not bad:
                                ant = self.idxs_to_ant(sentnum, i, j, trigger)
                                if len(ant.sub_sentdict) > 0:
                                    trigger.add_possible_ant(ant)
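
Every example on this page feeds the bracketed parse string stored in sentdict['tree'][0] (or self.sentences[sentnum]['tree'][0]) to maketree before doing any tree work. As a point of reference, here is a minimal sketch of what nt.maketree is assumed to do, namely wrap nltk.Tree.fromstring around a PTB-style bracket string; the toy parse and the helper below are illustrative, not the project's actual implementation.

import nltk

def maketree(tree_text):
    # Assumption: tree_text is a PTB-style bracketed parse produced by the
    # preprocessing pipeline, e.g. "(S (NP (PRP I)) (VP (VBP do)))".
    return nltk.Tree.fromstring(tree_text)

# Hypothetical usage on a toy parse string:
tree = maketree("(S (NP (PRP I)) (VP (VBP do)))")
print(tree.leaves())  # ['I', 'do']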
Example #2
def myfeaturesvector(sentdict, idx, features):
    vector = []

    tree = NT.maketree(sentdict["tree"][0])
    subtrees = NT.getsmallestsubtrees(tree)
    subtree_positions = NT.get_smallest_subtree_positions(tree, subtree_list=subtrees)
    aux = sentdict["lemmas"][idx]

    if "my_features" in features:
        vector.append(truth(DV.auxccommandsverb(sentdict, idx, tree, subtree_positions)))
        vector.append(truth(DV.auxccommandsverbthatcomesafter(sentdict, idx, tree, subtree_positions)))
        vector.append(truth(DV.auxisccommandedbyverb(sentdict, idx, tree, subtree_positions)))
        vector.append(truth(DV.auxislocallyccommandedbyverb(sentdict, idx, tree, subtree_positions)))
        vector.append(truth(DV.auxlocallyccommandsverb(sentdict, idx, tree, subtree_positions)))
        vector.append(truth(DV.isccommandedbycontinuationword(sentdict, idx, tree, subtree_positions)))
        vector.append(truth(DV.nexttopunct(sentdict, idx, tree, subtree_positions)))
        vector.append(truth(DV.isfollowedbypunct(sentdict, idx, end=["."])))
        vector.append(truth(DV.previouswordisasorsoorthan(sentdict["words"], idx)))
        vector.append(truth(DV.thesamecheck(sentdict["words"], idx)))
        vector.append(truth(DV.toprecedesaux(sentdict, idx)))
        vector.append(truth(DV.verbfollowsaux(sentdict, idx)))

        # TODO: added this new feature!
        vector.append(truth(DV.nextwordistoo(sentdict, idx)))

    if "my_rules" in features:
        vector.append(truth(aux in DV.MODALS and DV.modalcheck(sentdict, idx, tree, subtree_positions)))
        vector.append(truth(aux in DV.BE and DV.becheck(sentdict, idx, tree, subtree_positions)))
        vector.append(truth(aux in DV.HAVE and DV.havecheck(sentdict, idx, tree, subtree_positions)))
        vector.append(truth(aux in DV.DO and DV.docheck(sentdict, idx, tree, subtree_positions)))
        vector.append(truth(aux in DV.TO and DV.tocheck(sentdict, idx, tree, subtree_positions)))
        vector.append(truth(aux in DV.SO and DV.socheck(sentdict, idx, tree, subtree_positions)))

    # This adds a new layer of features by combining all of the ones I had.
    if "square_rules" in features:
        size = len(vector)
        for i in range(0, size):
            for j in range(0, size):
                if i != j:
                    vector.append(truth(untruth(vector[i]) and untruth(vector[j])))

    if "combine_aux_type" in features:
        bools = [aux in DV.MODALS, aux in DV.BE, aux in DV.HAVE, aux in DV.DO, aux in DV.TO, aux in DV.SO]
        vec = [v for v in vector]
        for v in vec:
            for b in bools:
                vector.append(truth(untruth(v) and b))

    return vector
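
myfeaturesvector leans on two small helpers, truth and untruth, that are not shown on this page. A plausible reading, based only on how they are called, is that truth maps a boolean feature to the 0/1 value stored in the vector and untruth maps a stored value back to a boolean; the sketch below encodes that assumption and is not the project's actual code.

def truth(condition):
    # Store boolean features as 0/1 so the vector stays numeric.
    return 1 if condition else 0

def untruth(value):
    # Recover the boolean so stored features can be combined.
    return value == 1

# e.g. the pairwise combination done in the "square_rules" block:
vector = [truth(True), truth(False)]
vector.append(truth(untruth(vector[0]) and untruth(vector[1])))  # appends 0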
Example #3
def testmyrules(classifier, section_start, section_end):
    gs_vector = classifier.getgsdata(section_start, section_end)

    aux_start = classifier.section_split[section_start]
    aux_end = classifier.section_split[section_end]

    my_rules_return_vector = []
    count = 0
    for sentdict in classifier.each_sentence.sentences:
        for i in range(0, len(sentdict['lemmas'])):
            word = sentdict['lemmas'][i]
            if isauxiliary(sentdict, i):
                count += 1
                if aux_start < count <= aux_end:
                    tree = NT.maketree(sentdict['tree'][0])
                    subtree_positions = NT.get_smallest_subtree_positions(tree)
                    # TODO: I modified these b/c they were incorrectly written.
                    if word in MODALS:
                        my_rules_return_vector.append(truth(modalcheck(sentdict, i, tree, subtree_positions)))
                    elif word in BE:
                        my_rules_return_vector.append(truth(becheck(sentdict, i, tree, subtree_positions)))
                    elif word in HAVE:
                        my_rules_return_vector.append(truth(havecheck(sentdict, i, tree, subtree_positions)))
                    elif word in DO:
                        my_rules_return_vector.append(truth(docheck(sentdict, i, tree, subtree_positions)))
                    elif word in TO:
                        my_rules_return_vector.append(truth(tocheck(sentdict, i, tree, subtree_positions)))
                    elif word in SO:
                        my_rules_return_vector.append(truth(socheck(sentdict, i, tree, subtree_positions)))

    classifier.compare(gs_vector,
                       my_rules_return_vector,
                       section_start - 1,
                       verbose=False)
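
classifier.compare is not shown here, but judging from the call it scores the rule-based predictions against the gold-standard vector returned by getgsdata. A rough, self-contained sketch of that kind of comparison (the metric choice and output format are assumptions) might look like this:

def compare_vectors(gold, predicted):
    # Count agreement on the positive class only.
    tp = sum(1 for g, p in zip(gold, predicted) if g == 1 and p == 1)
    fp = sum(1 for g, p in zip(gold, predicted) if g == 0 and p == 1)
    fn = sum(1 for g, p in zip(gold, predicted) if g == 1 and p == 0)
    precision = tp / float(tp + fp) if tp + fp else 0.0
    recall = tp / float(tp + fn) if tp + fn else 0.0
    return precision, recall

print(compare_vectors([1, 0, 1, 1], [1, 1, 0, 1]))  # (0.666..., 0.666...)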
Example #4
    def get_nltk_tree(self):
        return nt.maketree(self.tree_text[0])
Example #5
    def get_sentence_tree(self, i):
        return nt.maketree(self.sentences[i]['tree'][0])
Example #6
def alignment_comparison(trig_sentdict, ant_sentdict, ant, trigger,
                         word2vec_dict):
    vector = []

    ant_context_sentdict = ant.get_context()
    trig_context_sentdict = trigger.get_context()

    ant_head_idx = get_antecedent_head_index(ant_sentdict, ant)

    # Feature 1.
    ant_auxs = []
    for i in range(0, len(ant_sentdict['words'])):
        if DV.isauxiliary(ant_sentdict, i):
            ant_auxs.append(ant_sentdict['lemmas'][i])

    found = False
    for aux in ant_auxs:
        if aux in trig_sentdict['lemmas']:
            vector.append(truth(True))
            found = True
            break

    if not found:
        vector.append(truth(False))

    # Feature 2.
    if ant.get_sentnum() == trigger.get_sentnum():
        vector.append(truth(ant_head_idx > trigger.get_idx()))
        vector.append(truth(ant_head_idx == trigger.get_idx()))
        vector.append(truth(ant_head_idx < trigger.get_idx()))
    else:
        vector += [0, 0, 0]

    # Features 3,4,5.
    for k in ['words', 'lemmas', 'pos']:
        total = len(ant_context_sentdict[k]) + len(trig_context_sentdict[k])
        common = len(
            set(ant_context_sentdict[k]).intersection(
                trig_context_sentdict[k]))
        vector.append(common)
        vector.append((2.0 * float(common)) / float(total))

    # Feature 6 - number of words between trigger and antecedent.
    vector.append(ant.get_sentnum() - trigger.get_sentnum())
    if ant.get_sentnum() == trigger.get_sentnum():
        vector.append(ant_head_idx - trigger.get_idx())
    else:
        crt_sentnum = trigger.get_sentnum()
        distance = ant_head_idx
        while crt_sentnum < ant.get_sentnum():
            distance += len(trig_sentdict['words'])
            crt_sentnum += 1
        vector.append(distance)

    # Feature 7.
    # First we get the vecs from the Ant NP and average them.
    blank_np = False

    ant_np_word2vec = []
    ant_np_location = ant.get_context()['np']

    if ant_np_location != (-1, -1):
        ant_np_word2vec = get_average_np_vec(word2vec_dict, ant_sentdict,
                                             ant_np_location[0],
                                             ant_np_location[1])
    else:
        blank_np = True

    # Next we do the same for the Trigger NP.
    trig_np_word2vec = []
    trig_np_location = trigger.get_context()['np']

    if trig_np_location != (-1, -1):
        trig_np_word2vec = get_average_np_vec(word2vec_dict, trig_sentdict,
                                              trig_np_location[0],
                                              trig_np_location[1])
    else:
        blank_np = True

    # Adding the angle of the vector between the trigger NP and antecedent NP.
    if not blank_np:
        ant_length = vector_length(ant_np_word2vec)
        trig_length = vector_length(trig_np_word2vec)
        try:
            angle = angle_btwn_vectors(ant_np_word2vec,
                                       trig_np_word2vec,
                                       v1_length=ant_length,
                                       v2_length=trig_length)
        except ValueError:
            angle = 90.0

        vector.append(angle)
        vector.append(truth(angle == 0.0))
    else:
        vector.append(90.0)
        vector.append(truth(90.0 == 0.0))

    if not ant_np_word2vec:
        vector += [0 for _ in range(0, WORD2VEC_LENGTH)]
    else:
        vector += ant_np_word2vec
    if not trig_np_word2vec:
        vector += [0 for _ in range(0, WORD2VEC_LENGTH)]
    else:
        vector += trig_np_word2vec

    # Now for what comes after the head.
    ant_head_idx = get_antecedent_head_index(ant_sentdict, ant)
    ant_post_head_w2vec = get_average_np_vec(word2vec_dict, ant_sentdict,
                                             ant_head_idx,
                                             len(ant_sentdict['words']))

    # if not ant_post_head_w2vec: vector += [0 for i in range(0,WORD2VEC_LENGTH)]
    # else: vector += ant_post_head_w2vec

    stop_idx = len(trig_sentdict['words'])
    for i in range(trigger.get_idx(), len(trig_sentdict['words'])):
        if DV.ispunctuation(trig_sentdict['lemmas'][i]):
            stop_idx = i
            break

    post_trig_w2vec = get_average_np_vec(word2vec_dict, trig_sentdict,
                                         trigger.get_idx(), stop_idx)

    # if not post_trig_w2vec: vector += [0 for i in range(0,WORD2VEC_LENGTH)]
    # else: vector += post_trig_w2vec

    if ant_post_head_w2vec and post_trig_w2vec:
        try:
            post_angle = angle_btwn_vectors(ant_post_head_w2vec,
                                            post_trig_w2vec)
        except ValueError:
            post_angle = 90.0
        vector.append(post_angle)
        vector.append(truth(post_angle == 0.0))
    else:
        vector.append(90.0)
        vector.append(truth(90.0 == 0.0))

    # Sentential complement check.
    tree = NT.maketree(ant_sentdict['tree'][0])
    if NT.dominates(tree, ant.get_subtree(), trigger.get_subtree()):
        vector.append(
            truth(
                NT.has_phrases_between_trees(
                    ant.get_subtree(), trigger.get_subtree(),
                    NIELSON_SENTENIAL_COMPLEMENT_PHRASES)))
    else:
        vector.append(truth(False))

    # Features to account for the number of each phrase type between the antecedent and trigger.
    phrases_between = [0 for _ in ALL_PHRASES]

    if ant.get_sentnum() == trigger.get_sentnum():
        for i in range(0, len(phrases_between)):
            if NT.has_phrases_between_trees(ant.get_subtree(),
                                            trigger.get_subtree(),
                                            [ALL_PHRASES[i]]):
                phrases_between[i] += 1

    vector += phrases_between
    vector.append(sum(phrases_between))

    return vector
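
The word2vec features in this example depend on vector_length and angle_btwn_vectors, whose implementations are not shown. A standard cosine-based version consistent with how they are called here (optional precomputed lengths, ValueError on a zero-length vector, angle in degrees so 90.0 serves as the "no information" fallback) could look like the sketch below; treat it as an assumption, not the project's code.

import math

def vector_length(v):
    return math.sqrt(sum(x * x for x in v))

def angle_btwn_vectors(v1, v2, v1_length=None, v2_length=None):
    l1 = v1_length if v1_length is not None else vector_length(v1)
    l2 = v2_length if v2_length is not None else vector_length(v2)
    if l1 == 0.0 or l2 == 0.0:
        raise ValueError("zero-length vector")
    cosine = sum(a * b for a, b in zip(v1, v2)) / (l1 * l2)
    cosine = max(-1.0, min(1.0, cosine))  # guard against floating-point drift
    return math.degrees(math.acos(cosine))

print(angle_btwn_vectors([1.0, 0.0], [0.0, 1.0]))  # 90.0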