예제 #1
0
def process_2_1(sentence, tagged_corpus):
    """Yield alternative parses in which one word is re-tagged as the verb.

    Nothing is yielded unless ``sentence["pdt"]`` is ``None``.  In that case
    every word whose entry in ``NLP.wordtags`` lists ``"VERB"`` within its
    first three positions becomes a candidate.  Each candidate (except one at
    index 0, or one with no noun in front of it) is re-tagged in a deep copy
    of the corpus, the copy is parsed with ``extract_2`` and the result is
    yielded when ``utils.verify`` accepts it.

    Args:
        sentence: Mapping with a ``"pdt"`` entry; only that entry is read.
        tagged_corpus: Sequence of ``(word, tag)`` pairs.  It is never
            modified; every alternative works on its own deep copy.

    Yields:
        Results of ``extract_2`` for which ``utils.verify`` is truthy.
    """
    # adding another part for the first time
    # if possible, integrate this into the main loop
    if sentence["pdt"] is not None:  # a predicate exists, nothing to re-cast
        return

    possibles = []
    for word_index, word in enumerate(tagged_corpus):
        if word[1] == "NNS":
            # An NNS word is looked up in its singular form; trd_p remembers
            # that the verb reading must later be tagged "VBZ".
            # NOTE(review): NLP.inflect is presumably the `inflect` engine,
            # whose singular_noun() returns False for words it does not
            # consider plural -- such a value just fails the lookup below.
            word_processed = (NLP.inflect.singular_noun(word[0]), "NN")
            obj = EC.Container(word=word,
                               word_processed=word_processed,
                               trd_p=True,
                               index=word_index)
        else:
            word_processed = word
            obj = EC.Container(word=word,
                               word_processed=word_processed,
                               index=word_index)

        lookup = word_processed[0]
        if lookup not in NLP.wordtags:
            continue
        try:
            verb_rank = NLP.wordtags[lookup].index("VERB")
        except ValueError:
            # "VERB" is not among this word's tags: it is simply not a
            # candidate, rather than a reason to abort the whole generator.
            continue
        if verb_rank < 3:
            possibles.append(obj)

    for possibility in possibles:
        if possibility.index == 0:
            # Checked before copying: a verb in first position is never used.
            continue
        # NOTE(review): relies on EC.Container exposing `trd_p` only when it
        # was passed to the constructor -- confirm against EC.Container.
        tag = "VBZ" if hasattr(possibility, "trd_p") else "VB"
        tagged_alternative = copy.deepcopy(tagged_corpus)
        tagged_alternative[possibility.index] = (possibility.word[0], tag)
        if not utils.deep_in(tagged_alternative[:possibility.index],
                             "n",
                             key=NLP.Converter.penn_to_wn):
            # No noun-like word precedes the candidate verb.
            print(tagged_alternative)
            continue
        new_sentence = extract_2(tagged_alternative)
        if utils.verify(new_sentence):
            yield new_sentence
예제 #2
0
def extract_7(tagged_corpus, special=False):
    """Fill a ``EC.SentenceResult`` from a tagged sentence (tier 3.7).

    The corpus is scanned once, left to right.  Words seen before the
    predicate feed the ``sbj_*`` slots, words after it feed the ``obj_*``
    slots.  The checks inside the loop are deliberately independent ``if``
    statements: one word may update several slots, and later checks read the
    state written by earlier ones, so their order matters.

    Slots written here:
        * ``sbj`` / ``pdt`` / ``obj`` -- the head words.
        * ``*_toi`` -- "to" was seen for that side.
        * ``*_dt`` -- the article ("a", "an", "the").
        * ``*_att`` -- list of ``[adverbs, head]`` groups.
        * ``*_adt`` -- nouns displaced by a later noun on the same side.
        * ``*_tpa`` -- adverbs seen after "to" on that side.
        * ``pdt_ptc`` -- RP-tagged words and some adverbs.
        * ``obj_cmp``, ``pss_vce``, ``svb_dis``, ``spn_dis``, ``opn_dis``,
          ``err`` -- flags.

    Args:
        tagged_corpus: Iterable of ``(word, tag)`` pairs.  Tags are passed to
            ``NLP.Converter.penn_to_wn`` (presumably Penn Treebank tags).
        special: When true, an object-form pronoun ("him", "her", "them",
            "me") in subject position is converted with
            ``NLP.Converter.to_sbj`` and accepted as the subject instead of
            being flagged through ``spn_dis`` / ``err``.

    Returns:
        The populated ``EC.SentenceResult``.  When an article follows
        modifiers on its side, or a predicate is found before any subject,
        the plain dict ``{"sbj": None, "pdt": None}`` is returned instead.
    """
    global TIER
    TIER = 3.7
    sentence = EC.SentenceResult()
    mark = False  # set once the word "by" has been seen
    for word in tagged_corpus:
        lowered = word[0].lower()
        if lowered == "by":
            mark = True
        if lowered == "to":
            if sentence["sbj"] and sentence["pdt"] and not sentence["obj"]:
                sentence["obj_toi"] = True
            elif not sentence["sbj"]:
                sentence["sbj_toi"] = True
        if lowered in ["a", "the", "an"]:
            if sentence["pdt"] and not sentence["obj_toi"]:
                if sentence["obj_adt"] or sentence["obj_att"]:
                    print("determiner too late error")
                    return {"sbj": None, "pdt": None}
                sentence["obj_dt"] = word
            elif not sentence["sbj_toi"]:
                if sentence["sbj_adt"] or sentence["sbj_att"]:
                    print("determiner too late error (subject)")
                    return {
                        "sbj": None,
                        "pdt": None
                    }  # so that the error will be noted and passed over
                sentence["sbj_dt"] = word

        # Looked up once per word; every check below reuses the result.
        tag = word[1]
        wn_tag = NLP.Converter.penn_to_wn(tag)

        if wn_tag == "r":
            if sentence["pdt"]:
                # NOTE(review): "obj_dtr" is read here while the article is
                # stored under "obj_dt" -- confirm the key is intentional.
                if not sentence["obj_dtr"] and not mark and not sentence[
                        "obj_adt"] and not sentence["obj_att"] and not sentence[
                            "obj"]:  # the same thing
                    sentence["pdt_ptc"].append(word)
                elif not sentence["obj_toi"]:
                    groups = sentence["obj_att"]
                    if groups and groups[-1][1] is None:
                        # Last group still has no head: stack the adverb.
                        groups[-1][0].append(word)
                    else:
                        groups.append([[word], None])
                else:
                    if sentence["obj"]:
                        sentence["obj_tpa"].append(word)
            else:
                if not sentence["pss_vce"] and sentence["sbj"]:
                    sentence["pdt_ptc"].append(word)
                if not sentence["sbj_toi"]:
                    # No predicate yet in this branch, so the adverb belongs
                    # to the subject's attributes.
                    groups = sentence["sbj_att"]
                    if groups and groups[-1][1] is None:
                        groups[-1][0].append(word)
                    else:
                        groups.append([[word], None])
                else:
                    if sentence["sbj"]:
                        sentence["sbj_tpa"].append(word)

        if wn_tag == "a" or tag == "PRP$" or tag == "CD":
            if sentence["pdt"] and not sentence["obj_toi"]:
                groups = sentence["obj_att"]
                if groups and groups[-1][1] is None:
                    # Give the pending adverb group its head.
                    groups[-1][1] = word
                else:
                    groups.append([[], word])
            elif not sentence["sbj_toi"] and not sentence["pdt"]:
                groups = sentence["sbj_att"]
                if groups and groups[-1][1] is None:
                    groups[-1][1] = word
                else:
                    groups.append([[], word])

        if sentence["obj_cmp"] and (wn_tag == "a" or wn_tag == "r"):
            # After a form of "be" the adjective/adverb is the complement
            # itself, so it is taken back out of the attribute list.
            sentence["obj"] = word
            res = utils.deep_in(sentence["obj_att"], word)
            if res:
                sentence["obj_att"].remove(res)

        if wn_tag == "n" or \
                tag == "PRP" or lowered == "it" or \
                tag == "VBG" or \
                NLP.Determiner.is_gerund(word[0]):
            if wn_tag == "n":
                if not sentence["pdt"] and not sentence["sbj_toi"]:
                    if sentence["sbj"]:
                        sentence["sbj_adt"].append(sentence["sbj"])
                    sentence["sbj"] = word
                elif not sentence["obj_toi"]:
                    if sentence["obj"] and not sentence["obj_cmp"]:
                        sentence["obj_adt"].append(sentence["obj"])
                    sentence["obj"] = word
            else:
                if sentence["pdt"] and not sentence["obj_toi"]:
                    sentence["obj"] = word
                    # The word is lowercased before the comparison, so the
                    # first-person pronoun has to be listed as "i".
                    if tag == "PRP" and lowered in [
                            "he", "she", "they", "i"
                    ]:
                        sentence["opn_dis"] = True
                        sentence["err"] = True
                elif not sentence["sbj_toi"]:
                    if tag == "PRP" and lowered in [
                            "him", "her", "them", "me"
                    ]:
                        if not special:
                            sentence["spn_dis"] = True
                            sentence["err"] = True
                        else:
                            sentence["sbj"] = (NLP.Converter.to_sbj(word[0]),
                                               tag)
                    else:
                        sentence["sbj"] = word

        if wn_tag == "v" and tag != "VBG":
            if sentence["obj_cmp"] and tag == "VBN":
                # "be" + past participle: the participle becomes the
                # predicate and the result is marked as passive voice.
                sentence["obj_cmp"] = False
                sentence["pss_vce"] = True
                sentence["pdt"] = word
            if sentence["sbj_toi"] and not sentence["sbj"]:
                sentence["sbj"] = word
            elif not sentence["pdt"]:
                sentence["pdt"] = word
                if not sentence["sbj"]:
                    print("subjectless predicate error")
                    print("tgd", tagged_corpus)
                    print("stc", sentence)
                    return {"sbj": None, "pdt": None}
                if sentence["sbj"][1] in ["NNS", "NNPS"] and tag == "VBZ":
                    sentence["svb_dis"] = True
                    sentence["err"] = True
                if (sentence["sbj"][1] in ["NN", "NNP"]
                        or sentence["sbj"][0].lower()
                        in ["he", "she", "it"]) and tag == "VB":
                    sentence["svb_dis"] = True
                    sentence["err"] = True
                if lowered in [
                        "am", "is", "are", "was", "were", "be", "been"
                ]:
                    sentence["obj_cmp"] = True
            elif sentence["obj_toi"] and not sentence["obj"]:
                sentence["obj"] = word

        if tag == "RP":
            if not sentence["obj_cmp"]:
                sentence["pdt_ptc"].append(word)

    # placing apposition detection outside the main loop will also be clearer and more efficient, and do it before the swap
    # A corpus without any subject leaves "sbj" unset, so it is checked
    # before being subscripted (the object branch below does the same).
    if sentence["sbj"] and sentence["sbj"][
            1] == "NNP":  # appositions only apply to proper nouns currently, because most are named entities
        for adt in sentence["sbj_adt"][::-1]:
            if adt[1] == "NNP":
                sentence["sbj_adt"].remove(adt)
                sentence["sbj"] = (adt[0] + " " + sentence["sbj"][0], "NNP")
            else:
                break

    if sentence["obj"]:
        if sentence["obj"][
                1] == "NNP":  # appositions only apply to proper nouns currently, because most are named entities
            for adt in sentence["obj_adt"][::-1]:
                if adt[1] == "NNP":
                    sentence["obj_adt"].remove(adt)
                    sentence["obj"] = (adt[0] + " " + sentence["obj"][0],
                                       "NNP")
                else:
                    break

    # remove those adjective phrases in attributes that have no body, just adverbs
    # Iterate over a snapshot: removing from the list being iterated would
    # skip the element that follows each removal.
    for char in list(sentence["sbj_att"]):
        if char[1] is None:
            sentence["sbj_att"].remove(char)
    for char in list(sentence["obj_att"]):
        if char[1] is None:
            sentence["obj_att"].remove(char)

    # will be more readable to leave this switch outside the main loop
    if sentence["pss_vce"]:
        sentence.swap()
        if not sentence["sbj"]:
            sentence["sbj"] = EC.Pointer("&any")

    return sentence
예제 #3
0
def extract_3(tagged_corpus):
    """Fill a ``EC.SentenceResult`` from a tagged sentence (tier 3.3).

    The corpus is scanned once, left to right.  Words seen before the
    predicate feed the ``sbj_*`` slots, words after it feed the ``obj_*``
    slots.  The checks inside the loop are independent ``if`` statements:
    one word may update several slots and later checks read what earlier
    ones wrote, so their order matters.

    Args:
        tagged_corpus: Iterable of ``(word, tag)`` pairs.  Tags are passed to
            ``NLP.Converter.penn_to_wn`` (presumably Penn Treebank tags).

    Returns:
        The populated ``EC.SentenceResult`` (swapped through
        ``sentence.swap()`` when passive voice was detected).  When an
        article follows modifiers on its side, or a predicate is found
        before any subject, the plain dict ``{"sbj": None, "pdt": None}``
        is returned instead.
    """
    global TIER
    TIER = 3.3
    sentence = EC.SentenceResult()
    mark = False  # set once the word "by" has been seen
    for word in tagged_corpus:
        lowered = word[0].lower()
        if lowered == "by":
            mark = True
        if lowered == "to":
            if sentence["sbj"] and sentence["pdt"] and not sentence["obj"]:
                sentence["obj_toi"] = True
            elif not sentence["sbj"]:
                sentence["sbj_toi"] = True
        if lowered in ["a", "the", "an"]:
            if sentence["pdt"] and not sentence["obj_toi"]:
                if sentence["obj_adt"] or sentence["obj_att"]:
                    return {"sbj": None, "pdt": None}
                sentence["obj_dt"] = word
            elif not sentence["sbj_toi"]:
                if sentence["sbj_adt"] or sentence["sbj_att"]:
                    return {
                        "sbj": None,
                        "pdt": None
                    }  # so that the error will be noted and passed over
                sentence["sbj_dt"] = word

        # Looked up once per word; every check below reuses the result.
        tag = word[1]
        wn_tag = NLP.Converter.penn_to_wn(tag)

        if wn_tag == "r":
            if sentence["pdt"]:
                # NOTE(review): "obj_dtr" is read here while the article is
                # stored under "obj_dt" -- confirm the key is intentional.
                if not sentence["obj_dtr"] and not mark:  # the same thing
                    sentence["pdt_ptc"].append(word)
                elif not sentence["obj_toi"]:
                    groups = sentence["obj_att"]
                    if groups and groups[-1][1] is None:
                        # Last group still has no head: stack the adverb.
                        groups[-1][0].append(word)
                    else:
                        groups.append([[word], None])
                else:
                    if sentence["obj"]:
                        sentence["obj_tpa"].append(word)
            else:
                if not sentence["sbj_toi"]:
                    groups = sentence["sbj_att"]
                    if groups and groups[-1][1] is None:
                        groups[-1][0].append(word)
                    else:
                        groups.append([[word], None])
                else:
                    if sentence["sbj"]:
                        sentence["sbj_tpa"].append(word)

        if wn_tag == "a" or tag == "PRP$" or tag == "CD":
            if sentence["pdt"] and not sentence["obj_toi"]:
                groups = sentence["obj_att"]
                if groups and groups[-1][1] is None:
                    # Give the pending adverb group its head.
                    groups[-1][1] = word
                else:
                    groups.append([[], word])
            elif not sentence["sbj_toi"]:
                groups = sentence["sbj_att"]
                if groups and groups[-1][1] is None:
                    groups[-1][1] = word
                else:
                    groups.append([[], word])

        if sentence["obj_cmp"] and wn_tag == "a":
            # After a form of "be" the adjective is the complement itself,
            # so it is taken back out of the attribute list.
            sentence["obj"] = word
            res = utils.deep_in(sentence["obj_att"], word)
            if res:
                sentence["obj_att"].remove(res)

        if wn_tag == "n" or \
                tag == "PRP" or lowered == "it" or \
                tag == "VBG" or \
                NLP.Determiner.is_gerund(word[0]):
            if wn_tag == "n":
                if not sentence["pdt"] and not sentence["sbj_toi"]:
                    if sentence["sbj"]:
                        sentence["sbj_adt"].append(sentence["sbj"])
                    sentence["sbj"] = word
                elif not sentence["obj_toi"]:
                    if sentence["obj"] and not sentence["obj_cmp"]:
                        sentence["obj_adt"].append(sentence["obj"])
                    sentence["obj"] = word
            else:
                if sentence["pdt"] and not sentence["obj_toi"]:
                    sentence["obj"] = word
                    # The word is lowercased before the comparison, so the
                    # first-person pronoun has to be listed as "i".
                    if tag == "PRP" and lowered in [
                            "he", "she", "they", "i"
                    ]:
                        sentence["opn_dis"] = True
                        sentence["err"] = True
                elif not sentence["sbj_toi"]:
                    if tag == "PRP" and lowered in [
                            "him", "her", "them", "me"
                    ]:
                        sentence["spn_dis"] = True
                        sentence["err"] = True
                    # The pronoun is kept as subject even when flagged.
                    sentence["sbj"] = word

        if wn_tag == "v" and tag != "VBG":
            if sentence["obj_cmp"] and tag == "VBN":
                # "be" + past participle: the participle becomes the
                # predicate and the result is marked as passive voice.
                sentence["obj_cmp"] = False
                sentence["pss_vce"] = True
                sentence["pdt"] = word
            if sentence["sbj_toi"] and not sentence["sbj"]:
                sentence["sbj"] = word
            elif not sentence["pdt"]:
                sentence["pdt"] = word
                if not sentence["sbj"]:
                    return {"sbj": None, "pdt": None}
                if sentence["sbj"][1] in ["NNS", "NNPS"] and tag == "VBZ":
                    sentence["svb_dis"] = True
                    sentence["err"] = True
                if (sentence["sbj"][1] in ["NN", "NNP"]
                        or sentence["sbj"][0].lower()
                        in ["he", "she", "it"]) and tag == "VB":
                    sentence["svb_dis"] = True
                    sentence["err"] = True
                if lowered in [
                        "am", "is", "are", "was", "were", "be", "been"
                ]:
                    sentence["obj_cmp"] = True
            elif sentence["obj_toi"] and not sentence["obj"]:
                sentence["obj"] = word

        if tag == "RP":
            if not sentence["obj_cmp"]:
                sentence["pdt_ptc"].append(word)

    # will be more readable to leave this switch outside the main loop
    if sentence["pss_vce"]:
        sentence.swap()
        if not sentence["sbj"]:
            sentence["sbj"] = EC.Pointer("&any")

    return sentence
예제 #4
0
def extract_1(tagged_corpus):
    """Build the tier 3.1 parse dict from ``(word, tag)`` pairs.

    One left-to-right pass.  Before a predicate is found, words go to the
    ``sbj_*`` entries; afterwards to the ``obj_*`` entries.  Each check in
    the loop is a separate ``if`` on purpose: a single token can touch
    several entries, and later checks read what earlier ones just wrote.

    Returns the filled dict, or ``{"sbj": None, "pdt": None}`` when an
    article ("a"/"the") follows modifiers on its side or a predicate turns
    up before any subject.
    """
    global TIER
    TIER = 3.1

    def rejected():
        # Stub handed back so the caller can note the failure and move on.
        return {"sbj": None, "pdt": None}

    def add_adverb(groups, token):
        # Stack onto the last group while it still lacks a head word,
        # otherwise open a new head-less group.
        if groups and groups[-1][1] is None:
            groups[-1][0].append(token)
        else:
            groups.append([[token], None])

    def add_head(groups, token):
        # Complete the last group if it is waiting for a head word,
        # otherwise open a new group with no adverbs.
        if groups and groups[-1][1] is None:
            groups[-1][1] = token
        else:
            groups.append([[], token])

    state = {
        "sbj_toi": False,
        "sbj_dt": None,
        "sbj_att": [],
        "sbj_adt": [],
        "sbj": None,
        "sbj_tpa": [],
        "pdt": None,
        "obj_toi": False,
        "obj_cmp": False,
        "obj_dt": None,
        "obj_att": [],
        "obj_adt": [],
        "obj": None,
        "obj_tpa": [],
        "pss_vce": False,  # passive voice
        "svb_dis": False,  # subject-verb disagreement
        "spn_dis": False,  # subject pronoun nominative disagreement
        "opn_dis": False,  # nominative pronoun found in the object slot
        "err": False
    }

    for token in tagged_corpus:
        surface = token[0].lower()

        # "to": opens the subject side if there is no subject yet, else the
        # object side once subject and predicate exist and no object does.
        if surface == "to":
            if not state["sbj"]:
                state["sbj_toi"] = True
            elif state["pdt"] and not state["obj"]:
                state["obj_toi"] = True

        # Articles must come before any modifier of their side.
        if surface in ("a", "the"):
            if state["pdt"] and not state["obj_toi"]:
                if state["obj_adt"] or state["obj_att"]:
                    return rejected()
                state["obj_dt"] = token
            elif not state["sbj_toi"]:
                if state["sbj_adt"] or state["sbj_att"]:
                    return rejected()
                state["sbj_dt"] = token

        tag = token[1]
        wn_pos = NLP.Converter.penn_to_wn(tag)

        # Adverbs.
        if wn_pos == "r":
            if state["pdt"]:
                if not state["obj_toi"]:
                    add_adverb(state["obj_att"], token)
                elif state["obj"]:
                    state["obj_tpa"].append(token)
            elif not state["sbj_toi"]:
                add_adverb(state["sbj_att"], token)
            elif state["sbj"]:
                state["sbj_tpa"].append(token)

        # Adjectives, possessive pronouns and cardinal numbers.
        if wn_pos == "a" or tag in ("PRP$", "CD"):
            if state["pdt"] and not state["obj_toi"]:
                add_head(state["obj_att"], token)
            elif not state["sbj_toi"]:
                add_head(state["sbj_att"], token)

        # After a form of "be" the adjective is the complement itself.
        if state["obj_cmp"] and wn_pos == "a":
            state["obj"] = token
            found = utils.deep_in(state["obj_att"], token)
            if found:
                state["obj_att"].remove(found)

        # Nouns, personal pronouns, "it" and gerunds.
        is_noun = wn_pos == "n"
        if (is_noun or tag == "PRP" or surface == "it" or tag == "VBG"
                or NLP.Determiner.is_gerund(token[0])):
            if is_noun:
                if not (state["pdt"] or state["sbj_toi"]):
                    if state["sbj"]:
                        state["sbj_adt"].append(state["sbj"])
                    state["sbj"] = token
                elif not state["obj_toi"]:
                    if state["obj"] and not state["obj_cmp"]:
                        state["obj_adt"].append(state["obj"])
                    state["obj"] = token
            elif state["pdt"] and not state["obj_toi"]:
                state["obj"] = token
                if tag == "PRP" and surface in ("he", "she", "they"):
                    state["opn_dis"] = True
                    state["err"] = True
            elif not state["sbj_toi"]:
                if tag == "PRP" and surface in ("him", "her", "them"):
                    state["spn_dis"] = True
                    state["err"] = True
                state["sbj"] = token

        # Verbs (gerunds were handled above).
        if wn_pos == "v" and tag != "VBG":
            if state["obj_cmp"] and tag == "VBN":
                state["obj_cmp"] = False
                state["pss_vce"] = True
                state["pdt"] = token
            if state["sbj_toi"] and not state["sbj"]:
                state["sbj"] = token
            elif not state["pdt"]:
                state["pdt"] = token
                subject = state["sbj"]
                if not subject:
                    return rejected()
                if subject[1] in ("NNS", "NNPS") and tag == "VBZ":
                    state["svb_dis"] = True
                    state["err"] = True
                if surface in ("am", "is", "are", "was", "were", "be",
                               "been"):
                    state["obj_cmp"] = True
            elif state["obj_toi"] and not state["obj"]:
                state["obj"] = token

    return state
예제 #5
0
def extract_1(tagged_corpus):
    """Build the tier 2.1 parse dict from ``(word, tag)`` pairs.

    The corpus is scanned once, left to right.  Words seen before the
    predicate feed the ``sbj_*`` entries, words after it feed the ``obj_*``
    entries.  The checks inside the loop are independent ``if`` statements:
    one word may update several entries and later checks read what earlier
    ones wrote, so their order matters.

    Args:
        tagged_corpus: Iterable of ``(word, tag)`` pairs.  Tags are passed to
            ``NLP.Converter.penn_to_wn`` (presumably Penn Treebank tags).

    Returns:
        A dict with the keys ``sbj``, ``pdt``, ``obj`` plus, for each of the
        subject and object sides, ``*_toi`` (flag set on "to"), ``*_dt``
        (the DT-tagged word), ``*_att`` (list of ``[adverbs, head]``
        groups), ``*_adt`` (nouns displaced by a later noun) and ``*_tpa``
        (adverbs seen after "to"), and the ``obj_cmp`` flag.
    """
    global TIER
    TIER = 2.1
    # from now on, these will be using a more concise method
    sentence = {
        "sbj_toi": False,
        "sbj_dt": None,
        "sbj_att": [],
        "sbj_adt": [],
        "sbj": None,
        "sbj_tpa": [],
        "pdt": None,
        "obj_toi": False,
        "obj_cmp": False,  # is complement?
        "obj_dt": None,
        "obj_att": [],
        "obj_adt": [],
        "obj": None,
        "obj_tpa": []
    }
    for word in tagged_corpus:
        if word[0].lower() == "to":
            if sentence["sbj"] and sentence["pdt"] and not sentence["obj"]:
                sentence["obj_toi"] = True
            elif not sentence["sbj"]:
                sentence["sbj_toi"] = True
        if word[1] == "DT":
            if sentence["pdt"] and not sentence["obj_toi"]:
                sentence["obj_dt"] = word
            elif not sentence["sbj_toi"]:
                sentence["sbj_dt"] = word
        if NLP.Converter.penn_to_wn(word[1]) == "r":
            if sentence["pdt"]:
                if not sentence["obj_toi"]:
                    if sentence["obj_att"]:
                        # Take the last group from the list stored in the
                        # result dict; no local named `obj_att` exists.
                        closest = sentence["obj_att"][-1]
                        if closest[1] is None:
                            closest[0].append(word)
                        else:
                            sentence["obj_att"].append([[word], None])
                    else:
                        sentence["obj_att"].append([[word], None])
                else:
                    if sentence["obj"]:
                        sentence["obj_tpa"].append(word)
            else:
                if not sentence["sbj_toi"]:
                    if sentence["sbj_att"]:
                        # Same as above for the subject side (`sbj_att` is
                        # only a key of the dict, not a local variable).
                        closest = sentence["sbj_att"][-1]
                        if closest[1] is None:
                            closest[0].append(word)
                        else:
                            sentence["sbj_att"].append([[word], None])
                    else:
                        sentence["sbj_att"].append([[word], None])
                else:
                    if sentence["sbj"]:
                        sentence["sbj_tpa"].append(word)
        if NLP.Converter.penn_to_wn(word[1]) == "a" or word[1] == "PRP$":
            if sentence["pdt"] and not sentence["obj_toi"]:
                if sentence["obj_att"]:
                    closest = sentence["obj_att"][-1]
                    if closest[1] is None:
                        closest[1] = word
                    else:
                        sentence["obj_att"].append([[], word])
                else:
                    sentence["obj_att"].append([[], word])
            elif not sentence["sbj_toi"]:
                if sentence["sbj_att"]:
                    closest = sentence["sbj_att"][-1]
                    if closest[1] is None:
                        closest[1] = word
                    else:
                        sentence["sbj_att"].append([[], word])
                else:
                    sentence["sbj_att"].append([[], word])
        if sentence["obj_cmp"] and NLP.Converter.penn_to_wn(word[1]) == "a":
            # if sentence["obj_cmp"] is not False, which is its default value
            # then the predicate has to have been found already
            # making checking for sentence["pdt"] unnecessary.
            sentence["obj"] = word  # this could work ...
            res = utils.deep_in(sentence["obj_att"], word)
            if res:
                sentence["obj_att"].remove(res)
        if NLP.Converter.penn_to_wn(word[1]) == "n" or \
                word[1] == "PRP" or word[0].lower() == "it" or \
                word[1] == "VBG" or \
                NLP.Determiner.is_gerund(word[0]):
            # subjects have to appear before objects
            if NLP.Converter.penn_to_wn(word[1]) == "n":
                if not sentence["pdt"] and not sentence["sbj_toi"]:
                    if sentence["sbj"]:
                        # The earlier noun is kept in sbj_adt.
                        sentence["sbj_adt"].append(sentence["sbj"])
                    sentence["sbj"] = word
                elif not sentence["obj_toi"]:
                    if sentence["obj"] and not sentence["obj_cmp"]:
                        sentence["obj_adt"].append(sentence["obj"])
                    sentence["obj"] = word
            else:
                if sentence["pdt"] and not sentence["obj_toi"]:
                    sentence["obj"] = word
                elif not sentence["sbj_toi"]:
                    sentence["sbj"] = word
        if NLP.Converter.penn_to_wn(word[1]) == "v" and \
                word[1] != "VBG":
            if sentence["sbj_toi"] and not sentence["sbj"]:
                sentence["sbj"] = word
            elif not sentence["pdt"]:
                sentence["pdt"] = word  # the all important line
                # this is what tier 2.1 is all about
                if word[0].lower() in [
                        "am", "is", "are", "was", "were", "be", "been"
                ]:
                    sentence["obj_cmp"] = True
            elif sentence["obj_toi"] and not sentence["obj"]:
                sentence["obj"] = word
    return sentence  # much more concise!
예제 #6
0
def extract_2(tagged_corpus):
    """Build the tier 2.2 parse dict from ``(word, tag)`` pairs.

    One left-to-right pass.  Before a predicate is found, words go to the
    ``sbj_*`` entries; afterwards to the ``obj_*`` entries.  Each check in
    the loop is a separate ``if`` on purpose: a single token can touch
    several entries, and later checks read what earlier ones just wrote.

    Returns the filled dict, or ``{"sbj": None, "pdt": None}`` when an
    article ("a"/"the") follows modifiers on its side or a predicate turns
    up before any subject.
    """
    global TIER
    TIER = 2.2

    def rejected():
        # so that the error will be noted and passed over
        return {"sbj": None, "pdt": None}

    def add_adverb(groups, token):
        # Stack onto the last group while it still lacks a head word,
        # otherwise open a new head-less group.
        if groups and groups[-1][1] is None:
            groups[-1][0].append(token)
        else:
            groups.append([[token], None])

    def add_head(groups, token):
        # Complete the last group if it is waiting for a head word,
        # otherwise open a new group with no adverbs.
        if groups and groups[-1][1] is None:
            groups[-1][1] = token
        else:
            groups.append([[], token])

    state = {
        "sbj_toi": False,
        "sbj_dt": None,
        "sbj_att": [],
        "sbj_adt": [],
        "sbj": None,
        "sbj_tpa": [],
        "pdt": None,
        "obj_toi": False,
        "obj_cmp": False,
        "obj_dt": None,
        "obj_att": [],
        "obj_adt": [],
        "obj": None,
        "obj_tpa": [],
        "disagree": False
    }

    for token in tagged_corpus:
        surface = token[0].lower()

        # "to": opens the subject side if there is no subject yet, else the
        # object side once subject and predicate exist and no object does.
        if surface == "to":
            if not state["sbj"]:
                state["sbj_toi"] = True
            elif state["pdt"] and not state["obj"]:
                state["obj_toi"] = True

        # Articles must come before any modifier of their side.
        if surface in ("a", "the"):
            if state["pdt"] and not state["obj_toi"]:
                if state["obj_adt"] or state["obj_att"]:
                    return rejected()
                state["obj_dt"] = token
            elif not state["sbj_toi"]:
                if state["sbj_adt"] or state["sbj_att"]:
                    return rejected()
                state["sbj_dt"] = token

        tag = token[1]
        wn_pos = NLP.Converter.penn_to_wn(tag)

        # Adverbs.
        if wn_pos == "r":
            if state["pdt"]:
                if not state["obj_toi"]:
                    add_adverb(state["obj_att"], token)
                elif state["obj"]:
                    state["obj_tpa"].append(token)
            elif not state["sbj_toi"]:
                add_adverb(state["sbj_att"], token)
            elif state["sbj"]:
                state["sbj_tpa"].append(token)

        # Adjectives and possessive pronouns.
        if wn_pos == "a" or tag == "PRP$":
            if state["pdt"] and not state["obj_toi"]:
                add_head(state["obj_att"], token)
            elif not state["sbj_toi"]:
                add_head(state["sbj_att"], token)

        # After a form of "be" the adjective is the complement itself.
        if state["obj_cmp"] and wn_pos == "a":
            state["obj"] = token
            found = utils.deep_in(state["obj_att"], token)
            if found:
                state["obj_att"].remove(found)

        # Nouns, personal pronouns, "it" and gerunds.
        is_noun = wn_pos == "n"
        if (is_noun or tag == "PRP" or surface == "it" or tag == "VBG"
                or NLP.Determiner.is_gerund(token[0])):
            if is_noun:
                if not (state["pdt"] or state["sbj_toi"]):
                    if state["sbj"]:
                        state["sbj_adt"].append(state["sbj"])
                    state["sbj"] = token
                elif not state["obj_toi"]:
                    if state["obj"] and not state["obj_cmp"]:
                        state["obj_adt"].append(state["obj"])
                    state["obj"] = token
            elif state["pdt"] and not state["obj_toi"]:
                state["obj"] = token
            elif not state["sbj_toi"]:
                state["sbj"] = token

        # Verbs (gerunds were handled above).
        if wn_pos == "v" and tag != "VBG":
            if state["sbj_toi"] and not state["sbj"]:
                state["sbj"] = token
            elif not state["pdt"]:
                state["pdt"] = token
                subject = state["sbj"]
                if not subject:
                    return rejected()
                # will first do some basic determination based on plurality tags
                if subject[1] in ("NNS", "NNPS") and tag == "VBZ":
                    state["disagree"] = True
                if surface in ("am", "is", "are", "was", "were", "be",
                               "been"):
                    state["obj_cmp"] = True
            elif state["obj_toi"] and not state["obj"]:
                state["obj"] = token

    return state