예제 #1
0
def nearest_trig_np(trig, sentences, all_tags, get_words=False):
    """
    Pick the noun phrase whose last word lies closest to the trigger.

    The trigger is located as the first ``(pos, word)`` tuple of the sentence
    tree equal to ``(trig.pos, trig.word)``.  Every 'NP' / 'NP-PRD' phrase is
    ranked by the absolute distance between that index and the index of the
    phrase's last word.  When no phrase is closer than 99 positions, the
    whole sentence tree is used instead.

    @type trig: vpe_objects.Auxiliary
    @type sentences: vpe_objects.AllSentences
    @param all_tags: forwarded unchanged to encode_pos_tag_frequencies.
    @param get_words: when true, return ``leaves()`` of the chosen phrase
        (or ``[]`` if it has no such method) instead of the tag encoding.
    @return: the result of encode_pos_tag_frequencies over the labels of the
        smallest subtrees of the chosen phrase, or a list of leaves.
    """
    t = sentences.get_sentence_tree(trig.sentnum)
    tree_tuples = nt.pos_word_tuples(t)
    all_nps = nt.find_subtree_phrases(t, ['NP', 'NP-PRD'])

    # NOTE(review): index() returns the first match, so a repeated
    # (pos, word) pair resolves to its earliest occurrence.
    trig_tup_idx = tree_tuples.index((trig.pos, trig.word))

    closest_np_value = 99
    closest_np = None
    for NP in all_nps:
        last_np_word_idx = tree_tuples.index(nt.pos_word_tuples(NP)[-1])
        distance = abs(trig_tup_idx - last_np_word_idx)
        if distance < closest_np_value:
            # Store the absolute distance: a signed (possibly negative) value
            # would make every later comparison fail and freeze the search on
            # the first noun phrase found after the trigger.
            closest_np_value = distance
            closest_np = NP
    if closest_np is None:
        closest_np = t

    if get_words:
        try:
            return closest_np.leaves()
        except AttributeError:
            return []

    np_pos = [
        subtree.label() for subtree in nt.getsmallestsubtrees(closest_np)
    ]

    return encode_pos_tag_frequencies(np_pos, all_tags)
예제 #2
0
파일: detectVPE.py 프로젝트: kiankd/vpe
def nexttopunct(sentdict, auxidx, t, word_positions_in_tree):
    """
    Tell whether the auxiliary is directly followed by punctuation.

    Returns True when the tag at ``auxidx + 1`` is a period, comma, dash or
    colon, or when the lemma at ``auxidx + 1`` is 'not' and the tag at
    ``auxidx + 2`` is one of those.  Returns False otherwise, including when
    either lookup runs past the end of the sentence.
    """
    # NOTE(review): the local structure is built but its result is not used
    # by the active code; the calls are kept so any exception they raise
    # still propagates as before.
    aux_subtree = t[word_positions_in_tree[auxidx - 1]]
    local_tree = nltktree.generate_local_structure_from_subtree(t, aux_subtree)
    nltktree.getsmallestsubtrees(local_tree)

    def _is_boundary(tag):
        return isperiod(tag) or iscomma(tag) or isdashorcolon(tag)

    # Case 1: punctuation immediately after the auxiliary.
    try:
        if _is_boundary(sentdict['pos'][auxidx + 1]):
            return True
    except IndexError:
        return False

    # Case 2: "<aux> not" immediately followed by punctuation.
    try:
        tag_after_next = sentdict['pos'][auxidx + 2]
        if sentdict['lemmas'][auxidx + 1] == 'not' and _is_boundary(tag_after_next):
            return True
    except IndexError:
        return False

    return False
예제 #3
0
def nearest_trig_np(trig, sentences, all_tags, get_words=False):
    """
    Pick the noun phrase whose last word lies closest to the trigger.

    The trigger position is the first index in the sentence's
    ``(pos, word)`` tuples that equals ``(trig.pos, trig.word)``.  Each
    'NP' / 'NP-PRD' phrase is scored by the absolute distance between that
    position and the position of the phrase's last word; the smallest score
    below 99 wins.  If nothing qualifies, the full sentence tree is used.

    @type trig: vpe_objects.Auxiliary
    @type sentences: vpe_objects.AllSentences
    @param all_tags: forwarded unchanged to encode_pos_tag_frequencies.
    @param get_words: when true, return ``leaves()`` of the chosen phrase
        (or ``[]`` if it has no such method) instead of the tag encoding.
    @return: the result of encode_pos_tag_frequencies over the labels of the
        smallest subtrees of the chosen phrase, or a list of leaves.
    """
    t = sentences.get_sentence_tree(trig.sentnum)
    tree_tuples = nt.pos_word_tuples(t)
    all_nps = nt.find_subtree_phrases(t, ['NP','NP-PRD'])

    trig_tup = (trig.pos, trig.word)
    trig_tup_idx = tree_tuples.index(trig_tup)

    closest_np_value = 99
    closest_np = None
    for NP in all_nps:
        last_np_word_idx = tree_tuples.index(nt.pos_word_tuples(NP)[-1])
        distance = abs(trig_tup_idx - last_np_word_idx)
        if distance < closest_np_value:
            # The running minimum must be the absolute distance.  Keeping the
            # signed difference let it go negative for a phrase located after
            # the trigger, after which no other phrase could ever be chosen.
            closest_np_value = distance
            closest_np = NP
    if closest_np is None:
        closest_np = t

    if get_words:
        try:
            return closest_np.leaves()
        except AttributeError:
            return []

    np_pos = [subtree.label() for subtree in nt.getsmallestsubtrees(closest_np)]

    return encode_pos_tag_frequencies(np_pos, all_tags)
예제 #4
0
def nearest_ant_np(ant, sentences, all_tags, get_words=False):
    """
    Pick the noun phrase whose last word lies closest to the antecedent.

    The antecedent is represented by the ``(pos, word)`` pair in the middle
    of ``ant.sub_sentdict``.  When the antecedent and its trigger are in
    different sentences, the reference index is ``len(tree_tuples)`` (one
    past the last word); otherwise it is the first index of that pair in
    the sentence tree.  Each 'NP' / 'NP-PRD' phrase is scored by the
    absolute distance from the reference index to the phrase's last word;
    the smallest score below 99 wins.

    @type ant: vpe_objects.Antecedent
    @type sentences: vpe_objects.AllSentences
    @param all_tags: forwarded unchanged to encode_pos_tag_frequencies.
    @param get_words: when true, return ``leaves()`` of the chosen phrase,
        or ``[]`` when that call raises AttributeError (e.g. no phrase was
        chosen).
    @return: the result of encode_pos_tag_frequencies over the labels of the
        smallest subtrees of the chosen phrase (an empty label list is used
        when collecting them raises AttributeError), or a list of leaves.
    """
    t = sentences.get_sentence_tree(ant.sentnum)
    tree_tuples = nt.pos_word_tuples(t)
    all_nps = nt.find_subtree_phrases(t, ['NP','NP-PRD'])

    # Floor division keeps the index an int under true division as well.
    middle = len(ant.sub_sentdict) // 2
    ant_tup = (ant.sub_sentdict.pos[middle], ant.sub_sentdict.words[middle])
    if ant.sentnum != ant.trigger.sentnum:
        ant_tup_idx = len(tree_tuples)
    else:
        ant_tup_idx = tree_tuples.index(ant_tup)

    closest_np_value = 99
    closest_np = None
    for NP in all_nps:
        last_np_word_idx = tree_tuples.index(nt.pos_word_tuples(NP)[-1])
        distance = abs(ant_tup_idx - last_np_word_idx)
        if distance < closest_np_value:
            # Store the absolute distance; a signed value can go negative
            # and would then block every later, closer candidate.
            closest_np_value = distance
            closest_np = NP

    if get_words:
        try:
            return closest_np.leaves()
        except AttributeError:
            return []

    try:
        np_pos = [subtree.label() for subtree in nt.getsmallestsubtrees(closest_np)]
    except AttributeError:
        np_pos = []

    return encode_pos_tag_frequencies(np_pos, all_tags)
예제 #5
0
def do_rule(sentdict, aux, tree, word_positions_in_tree):
    """
    Evaluate the hand-written rule set for a 'do' auxiliary.

    Returns True as soon as one of the positive rules fires and False when a
    negative rule fires or nothing matches.  Lookups past the end of the
    sentence (IndexError) are treated as "rule does not apply".

    Rules, in order:
      1. next lemma is 'that'                             -> True
      2. aux is locally c-commanded by a verb             -> False
      3. 'to' precedes the aux                            -> False
      4. aux followed by punctuation, with no other verb in the local
         structure for which nt.ccommands(local, verb, aux) holds -> True
      5. same as 4 for "<aux> not <punctuation>"          -> True
      6. aux is c-commanded by a continuation word        -> True
      7. a verb follows the aux                           -> False
      8. next tag is a preposition and the word is not 'done' -> True
    """
    position = aux.wordnum

    def _is_boundary(tag):
        return is_period(tag) or is_comma(tag) or is_dash_or_colon(tag)

    try:
        if sentdict.lemmas[position + 1] == 'that':
            return True
    except IndexError:
        pass

    if aux_locally_ccommanded_by_verb(sentdict, aux, tree, word_positions_in_tree):
        return False
    if to_precedes_aux(sentdict, aux):
        return False

    aux_subtree = tree[word_positions_in_tree[position - 1]]
    local_tree = nt.generate_local_structure_from_subtree(tree, aux_subtree)
    local_leaves = nt.getsmallestsubtrees(local_tree)

    # "do" right before punctuation.
    try:
        if _is_boundary(sentdict.pos[position + 1]):
            dominated = any(
                is_verb(node.label())
                and node != aux_subtree
                and nt.ccommands(local_tree, node, aux_subtree)
                for node in local_leaves
            )
            if not dominated:
                return True
    except IndexError:
        pass

    # "do not" / "don't" right before punctuation.
    try:
        tag_after_next = sentdict.pos[position + 2]
        if sentdict.lemmas[position + 1] == 'not' and _is_boundary(tag_after_next):
            dominated = any(
                is_verb(node.label())
                and nt.ccommands(local_tree, node, aux_subtree)
                for node in local_leaves
            )
            if not dominated:
                return True
    except IndexError:
        pass

    if is_ccommanded_by_continuation_word(sentdict, aux, tree, word_positions_in_tree):
        return True

    if verb_follows_aux(sentdict, aux):
        return False

    try:
        followed_by_prep = is_preposition(sentdict.pos[position + 1])
        if followed_by_prep and sentdict.words[position] != 'done':
            return True
    except IndexError:
        pass

    return False
예제 #6
0
파일: detectVPE.py 프로젝트: kiankd/vpe
def auxccommandsverb(sentdict, auxidx, t, word_positions_in_tree):
    """
    Report whether nltktree.ccommands(t, aux, verb) holds for any verb.

    Walks the smallest subtrees of ``t``; for each one whose label is a verb
    tag, asks nltktree.ccommands with the auxiliary's subtree as the second
    argument and the verb as the third.  An IndexError raised for a
    candidate is ignored and the search continues.
    """
    for candidate in nltktree.getsmallestsubtrees(t):
        if not isverb(candidate.label()):
            continue
        try:
            holds = nltktree.ccommands(
                t, t[word_positions_in_tree[auxidx - 1]], candidate)
        except IndexError:
            continue
        if holds:
            return True
    return False
예제 #7
0
def aux_ccommanded_by_verb(sentdict, aux, tree, word_positions_in_tree):
    """
    Report whether nt.ccommands(tree, verb, aux) holds for any verb.

    Walks the smallest subtrees of ``tree``; for each one whose label is a
    verb tag, asks nt.ccommands with the verb as the second argument and the
    auxiliary's subtree as the third.  An IndexError raised for a candidate
    is ignored and the search continues.
    """
    for candidate in nt.getsmallestsubtrees(tree):
        if not is_verb(candidate.label()):
            continue
        try:
            holds = nt.ccommands(
                tree, candidate, tree[word_positions_in_tree[aux.wordnum - 1]])
        except IndexError:
            continue
        if holds:
            return True
    return False
예제 #8
0
파일: detectVPE.py 프로젝트: kiankd/vpe
def auxlocallyccommandsverb(sentdict, auxidx, t, word_positions_in_tree):
    """
    Check for a one-directional ccommands relation inside the local structure.

    Builds the local structure around the auxiliary's subtree and returns
    True if some verb in it satisfies nltktree.ccommands(local, aux, verb)
    while nltktree.ccommands(local, verb, aux) is false.  Any IndexError
    raised along the way yields False.
    """
    try:
        aux_node = t[word_positions_in_tree[auxidx - 1]]
        local_tree = nltktree.generate_local_structure_from_subtree(t, aux_node)

        for leaf in nltktree.getsmallestsubtrees(local_tree):
            if not isverb(leaf.label()):
                continue
            if not nltktree.ccommands(local_tree, aux_node, leaf):
                continue
            if not nltktree.ccommands(local_tree, leaf, aux_node):
                return True
    except IndexError:
        pass
    return False
예제 #9
0
def aux_locally_ccommands_verb(sentdict, aux, tree, word_positions_in_tree):
    """
    Check for a one-directional ccommands relation inside the local structure.

    Builds the local structure around the auxiliary's subtree and returns
    True if some verb in it satisfies nt.ccommands(local, aux, verb) while
    nt.ccommands(local, verb, aux) is false.  Any IndexError raised along
    the way yields False.
    """
    try:
        aux_node = tree[word_positions_in_tree[aux.wordnum - 1]]
        local_tree = nt.generate_local_structure_from_subtree(tree, aux_node)

        for leaf in nt.getsmallestsubtrees(local_tree):
            if not is_verb(leaf.label()):
                continue
            if not nt.ccommands(local_tree, aux_node, leaf):
                continue
            if not nt.ccommands(local_tree, leaf, aux_node):
                return True
    except IndexError:
        pass
    return False
예제 #10
0
def myfeaturesvector(sentdict, idx, features):
    """
    Build the hand-crafted feature vector for the auxiliary at ``idx``.

    ``features`` is tested with ``in`` for four group names, processed in
    this order:

      * "my_features"      - thirteen individual DV checks.
      * "my_rules"         - one value per auxiliary class (modal, be, have,
                             do, to, so): membership AND the class's check.
      * "square_rules"     - conjunction of every ordered pair of distinct
                             values already in the vector.
      * "combine_aux_type" - conjunction of every value already in the
                             vector with each auxiliary-class membership.

    Every value is passed through ``truth`` before being stored and read
    back through ``untruth`` when combined.
    """
    tree = NT.maketree(sentdict["tree"][0])
    leaves = NT.getsmallestsubtrees(tree)
    positions = NT.get_smallest_subtree_positions(tree, subtree_list=leaves)
    aux = sentdict["lemmas"][idx]

    vector = []

    if "my_features" in features:
        # Checks sharing the (sentdict, idx, tree, positions) signature.
        tree_checks = (
            DV.auxccommandsverb,
            DV.auxccommandsverbthatcomesafter,
            DV.auxisccommandedbyverb,
            DV.auxislocallyccommandedbyverb,
            DV.auxlocallyccommandsverb,
            DV.isccommandedbycontinuationword,
            DV.nexttopunct,
        )
        for check in tree_checks:
            vector.append(truth(check(sentdict, idx, tree, positions)))

        vector.append(truth(DV.isfollowedbypunct(sentdict, idx, end=["."])))
        vector.append(truth(DV.previouswordisasorsoorthan(sentdict["words"], idx)))
        vector.append(truth(DV.thesamecheck(sentdict["words"], idx)))

        # Checks that only need the sentence and the index.
        for check in (DV.toprecedesaux, DV.verbfollowsaux, DV.nextwordistoo):
            vector.append(truth(check(sentdict, idx)))

    if "my_rules" in features:
        rule_table = (
            (DV.MODALS, DV.modalcheck),
            (DV.BE, DV.becheck),
            (DV.HAVE, DV.havecheck),
            (DV.DO, DV.docheck),
            (DV.TO, DV.tocheck),
            (DV.SO, DV.socheck),
        )
        for members, rule in rule_table:
            # The rule only runs when the auxiliary belongs to its class.
            vector.append(
                truth(aux in members and rule(sentdict, idx, tree, positions)))

    if "square_rules" in features:
        # Pairwise layer over the values collected so far.  The list is
        # built completely before it is appended.
        size = len(vector)
        vector.extend([
            truth(untruth(vector[i]) and untruth(vector[j]))
            for i in range(size)
            for j in range(size)
            if i != j
        ])

    if "combine_aux_type" in features:
        bools = [
            aux in members
            for members in (DV.MODALS, DV.BE, DV.HAVE, DV.DO, DV.TO, DV.SO)
        ]
        # The comprehension is fully evaluated over the current vector
        # before any of its results are appended.
        vector.extend([truth(untruth(v) and b) for v in vector for b in bools])

    return vector
예제 #11
0
def nearest_ant_np(ant, sentences, all_tags, get_words=False):
    """
    Pick the noun phrase whose last word lies closest to the antecedent.

    The antecedent is represented by the ``(pos, word)`` pair in the middle
    of ``ant.sub_sentdict``.  If the antecedent's sentence differs from its
    trigger's sentence, distances are measured from ``len(tree_tuples)``
    (one past the last word); otherwise from the first index of that pair
    in the sentence tree.  The 'NP' / 'NP-PRD' phrase with the smallest
    absolute distance below 99 is selected.

    @type ant: vpe_objects.Antecedent
    @type sentences: vpe_objects.AllSentences
    @param all_tags: forwarded unchanged to encode_pos_tag_frequencies.
    @param get_words: when true, return ``leaves()`` of the chosen phrase,
        or ``[]`` when that call raises AttributeError (e.g. no phrase was
        chosen).
    @return: the result of encode_pos_tag_frequencies over the labels of the
        smallest subtrees of the chosen phrase (an empty label list is used
        when collecting them raises AttributeError), or a list of leaves.
    """
    t = sentences.get_sentence_tree(ant.sentnum)
    tree_tuples = nt.pos_word_tuples(t)
    all_nps = nt.find_subtree_phrases(t, ['NP', 'NP-PRD'])

    # Floor division: a plain "/" produces a float index under true division.
    middle = len(ant.sub_sentdict) // 2
    ant_tup = (ant.sub_sentdict.pos[middle],
               ant.sub_sentdict.words[middle])
    if ant.sentnum != ant.trigger.sentnum:
        ant_tup_idx = len(tree_tuples)
    else:
        ant_tup_idx = tree_tuples.index(ant_tup)

    closest_np_value = 99
    closest_np = None
    for NP in all_nps:
        last_np_word_idx = tree_tuples.index(nt.pos_word_tuples(NP)[-1])
        distance = abs(ant_tup_idx - last_np_word_idx)
        if distance < closest_np_value:
            # Remember the absolute distance.  The signed difference could
            # become negative and would then reject all later candidates.
            closest_np_value = distance
            closest_np = NP

    if get_words:
        try:
            return closest_np.leaves()
        except AttributeError:
            return []

    try:
        np_pos = [
            subtree.label() for subtree in nt.getsmallestsubtrees(closest_np)
        ]
    except AttributeError:
        np_pos = []

    return encode_pos_tag_frequencies(np_pos, all_tags)
예제 #12
0
def myfeaturesvector(sentdict, idx, features):
    """
    Build the hand-crafted feature vector for the auxiliary at ``idx``.

    ``features`` is tested with ``in`` for four group names, processed in
    this order:

      * 'my_features'      - thirteen individual DV checks.
      * 'my_rules'         - one value per auxiliary class (modal, be, have,
                             do, to, so): membership AND the class's check.
      * 'square_rules'     - conjunction of every ordered pair of distinct
                             values already in the vector.
      * 'combine_aux_type' - conjunction of every value already in the
                             vector with each auxiliary-class membership.

    Every value is passed through ``truth`` before being stored and read
    back through ``untruth`` when combined.
    """
    tree = NT.maketree(sentdict['tree'][0])
    leaves = NT.getsmallestsubtrees(tree)
    positions = NT.get_smallest_subtree_positions(tree, subtree_list=leaves)
    aux = sentdict['lemmas'][idx]

    vector = []

    if 'my_features' in features:
        # Checks sharing the (sentdict, idx, tree, positions) signature.
        structural_checks = (
            DV.auxccommandsverb,
            DV.auxccommandsverbthatcomesafter,
            DV.auxisccommandedbyverb,
            DV.auxislocallyccommandedbyverb,
            DV.auxlocallyccommandsverb,
            DV.isccommandedbycontinuationword,
            DV.nexttopunct,
        )
        for check in structural_checks:
            vector.append(truth(check(sentdict, idx, tree, positions)))

        vector.append(truth(DV.isfollowedbypunct(sentdict, idx, end=['.'])))
        vector.append(
            truth(DV.previouswordisasorsoorthan(sentdict['words'], idx)))
        vector.append(truth(DV.thesamecheck(sentdict['words'], idx)))

        # Checks that only need the sentence and the index.
        for check in (DV.toprecedesaux, DV.verbfollowsaux, DV.nextwordistoo):
            vector.append(truth(check(sentdict, idx)))

    if 'my_rules' in features:
        per_class_rules = (
            (DV.MODALS, DV.modalcheck),
            (DV.BE, DV.becheck),
            (DV.HAVE, DV.havecheck),
            (DV.DO, DV.docheck),
            (DV.TO, DV.tocheck),
            (DV.SO, DV.socheck),
        )
        for members, rule in per_class_rules:
            # The rule only runs when the auxiliary belongs to its class.
            fired = aux in members and rule(sentdict, idx, tree, positions)
            vector.append(truth(fired))

    if 'square_rules' in features:
        # Pairwise layer over the values collected so far.  The list is
        # built completely before it is appended.
        size = len(vector)
        pairwise = [
            truth(untruth(vector[i]) and untruth(vector[j]))
            for i in range(size)
            for j in range(size)
            if i != j
        ]
        vector.extend(pairwise)

    if 'combine_aux_type' in features:
        bools = [
            aux in members
            for members in (DV.MODALS, DV.BE, DV.HAVE, DV.DO, DV.TO, DV.SO)
        ]
        # Evaluated in full over the current vector before being appended.
        combined = [truth(untruth(v) and b) for v in vector for b in bools]
        vector.extend(combined)

    return vector
예제 #13
0
파일: detectVPE.py 프로젝트: kiankd/vpe
def docheck(sentdict, auxidx, t, word_positions_in_tree, verbose=False):
    """
    Evaluate the hand-written rule set for a 'do' auxiliary at ``auxidx``.

    Returns True as soon as a positive rule fires, False when a negative
    rule fires or nothing matches.  Lookups past the end of the sentence
    (IndexError) are treated as "rule does not apply".

    Rules, in order:
      1. next lemma is 'that'                             -> True
      2. aux is locally c-commanded by a verb             -> False
      3. 'to' precedes the aux                            -> False
      4. aux followed by punctuation, with no other verb in the local
         structure for which nltktree.ccommands(local, verb, aux) holds
                                                          -> True
      5. same as 4 for "<aux> not <punctuation>"          -> True
      6. aux is c-commanded by a continuation word        -> True
      7. a verb follows the aux                           -> False
      8. next tag is a preposition and the word is not 'done' -> True

    The exclusion of 'do so' / 'do the same' / 'do not do' patterns that
    used to precede rule 1 is disabled and not applied here.

    ``verbose`` is accepted but not used by this function.
    """
    try:
        if sentdict['lemmas'][auxidx+1] == 'that':
            return True
    except IndexError:
        pass

    if not auxislocallyccommandedbyverb(sentdict, auxidx, t, word_positions_in_tree):

        if toprecedesaux(sentdict, auxidx):
            return False

        localt = nltktree.generate_local_structure_from_subtree(t, t[word_positions_in_tree[auxidx-1]])
        local_word_subtrees = nltktree.getsmallestsubtrees(localt)

        # Do at the end of sentence.
        # Guarded like the "don't" case below: when the auxiliary is the
        # last token there is no following tag, and the rule simply does
        # not apply instead of raising IndexError.
        try:
            checkpuncttag = sentdict['pos'][auxidx+1]
            if isperiod(checkpuncttag) or iscomma(checkpuncttag) or isdashorcolon(checkpuncttag):
                endbool = True

                for subtree in local_word_subtrees:
                    if isverb(subtree.label()) and subtree != t[word_positions_in_tree[auxidx-1]]:
                        if nltktree.ccommands(localt, subtree, t[word_positions_in_tree[auxidx-1]]):
                            endbool = False
                            break

                if endbool:
                    return endbool
        except IndexError:
            pass

        # Don't at the end of sentence.
        try:
            checkpuncttag = sentdict['pos'][auxidx+2]
            if sentdict['lemmas'][auxidx+1] == 'not' and (isperiod(checkpuncttag) or iscomma(checkpuncttag) or isdashorcolon(checkpuncttag)):
                endbool = True

                for subtree in local_word_subtrees:
                    if isverb(subtree.label()):
                        if nltktree.ccommands(localt, subtree, t[word_positions_in_tree[auxidx-1]]):
                            endbool = False
                            break

                if endbool:
                    return endbool
        except IndexError:
            pass

        # NOTE: a "single verb/noun/preposition in the local structure"
        # heuristic was tried here and disabled (original author's note:
        # small increase in recall, decrease in precision).

        if isccommandedbycontinuationword(sentdict, auxidx, t, word_positions_in_tree):
            return True

        if verbfollowsaux(sentdict, auxidx):
            return False

        # Same end-of-sentence guard as above for the preposition rule.
        try:
            if isprep(sentdict['pos'][auxidx+1]) and sentdict['words'][auxidx] != 'done':
                return True
        except IndexError:
            pass

    return False