示例#1
0
def process(f, np, head, text, heads2qp, stanford_deps):
    """Tally syntactic-context counters for one noun phrase.

    Every dependency in ``stanford_deps`` whose dependent or governor span
    equals the head span of ``np`` bumps the matching counter on
    ``heads2qp[head]``.  Afterwards ``text`` is checked for a leading
    definite / indefinite determiner.

    Args:
        f: document handle forwarded to ``reconcile.getPOS``.
        np: indexable by ``"HEAD_START"`` and ``"HEAD_END"``.
        head: key into ``heads2qp``; also concatenated with determiners, so
            it must be a string.
        text: NP text, compared case-sensitively against lower-case
            determiners.
        heads2qp: mapping from head to the counter object that is mutated.
        stanford_deps: iterable of records indexable by ``"RELATION"``,
            ``"DEP_SPAN"``, ``"GOV_SPAN"`` and ``"GOV"``.
    """
    dep_key = "{0},{1}".format(np["HEAD_START"], np["HEAD_END"])

    # Only the "nn" branch reads the POS annotations, so they are loaded on
    # first use instead of once per call.
    pos_tags = None

    for dep in stanford_deps:
        relation = dep["RELATION"]

        if relation in ("nsubj", "nsubjpass"):
            #we have a subj
            if dep_key == dep["DEP_SPAN"]:
                heads2qp[head].subj += 1

        elif relation == "dobj":
            #direct obj
            if dep_key == dep["DEP_SPAN"]:
                heads2qp[head].dobj += 1

        elif relation == "iobj":
            #indirect object
            if dep_key == dep["DEP_SPAN"]:
                heads2qp[head].iobj += 1

        elif relation == "appos":
            #apposition
            if dep_key == dep["DEP_SPAN"]:
                heads2qp[head].appos_dep += 1
            elif dep_key == dep["GOV_SPAN"]:
                heads2qp[head].appos_gov += 1

        elif relation == "agent":
            if dep_key == dep["DEP_SPAN"]:
                heads2qp[head].agent += 1
                heads2qp[head].agent_verbs.append(dep["GOV"])

        elif relation == "amod":
            if dep_key == dep["GOV_SPAN"]:
                heads2qp[head].adj_mod += 1

        elif relation == "advmod":
            if dep_key == dep["GOV_SPAN"]:
                heads2qp[head].adv_mod += 1

        elif relation == "nn":
            if dep_key == dep["GOV_SPAN"]:
                heads2qp[head].nn_mod += 1

                span = dep["DEP_SPAN"].split(",")
                dep_start = int(span[0])
                dep_end = int(span[1])

                if pos_tags is None:
                    pos_tags = reconcile.getPOS(f)
                tag = pos_tags.getAnnotBySpan(dep_start, dep_end)
                if tag is not None:
                    #proper vs. common noun modifier
                    if tag["TAG"] in ("NNP", "NNPS"):
                        heads2qp[head].prp_mod += 1
                    elif tag["TAG"] in ("NN", "NNS"):
                        heads2qp[head].nom_mod += 1

        elif relation == "num":
            if dep_key == dep["GOV_SPAN"]:
                heads2qp[head].num_mod += 1

        elif relation == "poss":
            if dep_key == dep["GOV_SPAN"]:
                #the possessed
                heads2qp[head].poss_mod += 1
            elif dep_key == dep["DEP_SPAN"]:
                #the possessor
                heads2qp[head].is_poss += 1

        #NOTE prep_ can appear more than once for a single noun, which throws
        #off percentages
        # Compared with ==: `x in ("prep_of")` is a substring test on a plain
        # string and would also accept "prep", "of" or "".
        elif relation == "prep_of":
            if dep_key == dep["GOV_SPAN"]:
                heads2qp[head].prep_mod += 1

        elif relation == "rcmod":
            if dep_key == dep["GOV_SPAN"]:
                heads2qp[head].rc_mod += 1

    #NOTE: modification levels should be low
    determiners = ("the ", "that ", "this ", "those ", "these ")
    if any(text == det + head for det in determiners):
        heads2qp[head].bare_definite += 1

    if text.startswith("the "):
        heads2qp[head].definite += 1
    # "an " is an indefinite article too.
    if text.startswith(("a ", "an ")):
        heads2qp[head].indefinite += 1
示例#2
0
    def __str__(self):
        """Return ``self.text`` followed by the bracketed, comma-separated
        entries of ``self.gold_buddies``."""
        buddies = ", ".join(self.gold_buddies)
        return "{0} : [{1}]".format(self.text, buddies)


if __name__ == "__main__":
    # NOTE(review): the usage line names two arguments, but only
    # sys.argv[1] is required and it is used below both as the gold document
    # and as the list of files -- confirm the intended command line.
    if len(sys.argv) < 2:
        print("Usage: %s <gold-document> <unannotated-documents-list>" %
              (sys.argv[0]))
        sys.exit(1)

    #read in gold chains
    gold_chains = reconcile.getGoldChains(sys.argv[1])
    gold_chain_text = defaultdict(list)

    # Not read by the visible code.
    # NOTE(review): presumably kept for optional per-mention TAGS/TOKENS
    # properties -- drop the loads if that feature is gone for good.
    pos_tags = reconcile.getPOS(sys.argv[1])
    tokens = reconcile.getTokens(sys.argv[1])

    # Keep one mention per distinct printed form within each chain.  A set
    # of the forms seen so far replaces rebuilding the list of printed forms
    # for every mention.
    for key in gold_chains.keys():
        seen_forms = set()
        for mention in gold_chains[key]:
            form = mention.pprint()
            if form not in seen_forms:
                seen_forms.add(form)
                gold_chain_text[key].append(mention)

    #let's look at some noun phrases for each domain.
    # Lines starting with "#" in the file list are skipped.
    with open(sys.argv[1], 'r') as fileList:
        files = [x for x in fileList if not x.startswith("#")]

    for f in files:
        f = f.strip()
        print("Working on file: {0}".format(f))
        allNPs = {}
        nps = reconcile.getNPs(f)
        pos = reconcile.getPOS(f)
        stanford_nes = reconcile.getNEs(f)
        sundance_nes = reconcile.getSundanceNEs(f)

        for np in nps:
            key = "{0},{1}".format(np.getStart(), np.getEnd())
            text = utils.textClean(np.getText().replace("\n", " ")).lower()
            tokens = text.split()
            # Noun phrases whose cleaned text is a single token are ignored.
            if len(tokens) == 1:
                continue

            # First time this byte span is seen: register the NP.
            if key not in allNPs:
                allNPs[key] = NP(np.getText().replace("\n", " "))
                allNPs[key].start = np.getStart()
                allNPs[key].end = np.getEnd()
示例#4
0
def process_syntax(f, np, head, text, head2qp, stanford_deps):
    """Tally syntactic-context counters for one noun phrase.

    Every dependency in ``stanford_deps`` whose dependent or governor span
    equals the head span of ``np`` bumps the matching counter on
    ``head2qp[head]``.  Subject, direct object, indirect object, possessed
    (``poss_mod``) and ``prep_of`` are counted at most once per call.
    Afterwards ``text`` is checked for a leading definite / indefinite
    determiner.

    Args:
        f: document handle forwarded to ``reconcile.getPOS``.
        np: indexable by ``"HEAD_START"`` and ``"HEAD_END"``.
        head: key into ``head2qp``; also concatenated with determiners, so
            it must be a string.
        text: NP text, compared case-sensitively against lower-case
            determiners.
        head2qp: mapping from head to the counter object that is mutated.
        stanford_deps: iterable of records indexable by ``"RELATION"``,
            ``"DEP_SPAN"``, ``"GOV_SPAN"`` and ``"GOV"``.
    """
    #TODO check for cases where dep_key isn't found might be an error in
    #finding the head bytespan
    dep_key = "{0},{1}".format(np["HEAD_START"], np["HEAD_END"])

    # Only the "nn" branch reads the POS annotations, so they are loaded on
    # first use instead of once per call.
    pos_tags = None

    seen_subj = False
    seen_dobj = False
    seen_iobj = False
    seen_appos_dep = False
    # NOTE(review): this flag starts out True, so the apposition test in the
    # indefinite branch at the end of the function always passes.  Kept
    # as-is to preserve the current counts -- confirm whether ind_no_app was
    # meant to count indefinites *without* an apposition.
    seen_appos_gov = True
    has_premod = False
    seen_poss = False
    seen_prep_of = False

    for dep in stanford_deps:
        relation = dep["RELATION"]

        if relation in ("nsubj", "nsubjpass"):
            #we have a subj
            if dep_key == dep["DEP_SPAN"] and not seen_subj:
                head2qp[head].subj += 1
                seen_subj = True

        elif relation == "dobj":
            #direct obj
            if dep_key == dep["DEP_SPAN"] and not seen_dobj:
                head2qp[head].dobj += 1
                seen_dobj = True

        elif relation == "iobj":
            #indirect object
            if dep_key == dep["DEP_SPAN"] and not seen_iobj:
                head2qp[head].iobj += 1
                seen_iobj = True

        elif relation == "appos":
            #apposition
            if dep_key == dep["DEP_SPAN"]:
                head2qp[head].appos_dep += 1
                seen_appos_dep = True
            elif dep_key == dep["GOV_SPAN"]:
                head2qp[head].appos_gov += 1
                seen_appos_gov = True

        elif relation == "agent":
            if dep_key == dep["DEP_SPAN"]:
                head2qp[head].agent += 1
                head2qp[head].agent_verbs.append(dep["GOV"])

        # amod / advmod do not set has_premod.
        elif relation == "amod":
            if dep_key == dep["GOV_SPAN"]:
                head2qp[head].adj_mod += 1

        elif relation == "advmod":
            if dep_key == dep["GOV_SPAN"]:
                head2qp[head].adv_mod += 1

        elif relation == "nn":
            if dep_key == dep["GOV_SPAN"]:
                head2qp[head].nn_mod += 1
                has_premod = True

                span = dep["DEP_SPAN"].split(",")
                dep_start = int(span[0])
                dep_end = int(span[1])

                if pos_tags is None:
                    pos_tags = reconcile.getPOS(f)
                tag = pos_tags.getAnnotBySpan(dep_start, dep_end)
                if tag is not None:
                    #proper vs. common noun modifier
                    if tag["TAG"] in ("NNP", "NNPS"):
                        head2qp[head].prp_mod += 1
                    elif tag["TAG"] in ("NN", "NNS"):
                        head2qp[head].nom_mod += 1

        elif relation == "num":
            if dep_key == dep["GOV_SPAN"]:
                head2qp[head].num_mod += 1
                has_premod = True

        elif relation == "poss":
            if dep_key == dep["GOV_SPAN"] and not seen_poss:
                #the possessed
                head2qp[head].poss_mod += 1
                seen_poss = True
                has_premod = True
            elif dep_key == dep["DEP_SPAN"]:
                #the possessor
                head2qp[head].is_poss += 1

        #NOTE prep_ can appear more than once for a single noun, which throws
        #off percentages
        # Compared with ==: `x in ("prep_of")` is a substring test on a plain
        # string and would also accept "prep", "of" or "".
        elif relation == "prep_of":
            if not seen_prep_of and dep_key == dep["GOV_SPAN"]:
                head2qp[head].prep_mod += 1
                seen_prep_of = True

        elif relation == "rcmod":
            if dep_key == dep["GOV_SPAN"]:
                head2qp[head].rc_mod += 1

    if has_premod:
        head2qp[head].one_premod += 1

    #NOTE: modification levels should be low
    determiners = ("the ", "that ", "this ", "those ", "these ")
    if any(text == det + head for det in determiners):
        head2qp[head].bare_definite += 1

    if text.startswith("the "):
        head2qp[head].definite += 1
    if text.startswith(("a ", "an ")):
        head2qp[head].indefinite += 1
        if seen_appos_dep or seen_appos_gov:
            head2qp[head].ind_no_app += 1
示例#5
0
def process(f, head2qp, annotated_file):
    """Collect per-head statistics for the noun phrases of document ``f``.

    For every qualifying noun phrase the head is registered in ``head2qp``
    (a ``QuasiPronoun`` is created on first sight), its count and document
    bookkeeping are updated, and ``process_syntax`` is run on it.

    Args:
        f: document handle passed to the ``reconcile`` loaders.
        head2qp: mapping from head string to ``QuasiPronoun``; mutated.
        annotated_file: when true, noun phrases come from
            ``reconcile.getNPs`` and gold processing runs if ``USE_GOLD`` is
            set; otherwise they come from ``reconcile.getStanfordNPs``.

    The module-level switch ``PRONOUNS`` selects whether pronouns or
    nominals are collected.
    """
    stanford_deps = reconcile.getStanfordDep(f)
    pos = reconcile.getPOS(f)

    if annotated_file:
        for np in reconcile.getNPs(f):
            if PRONOUNS:
                if not qp_utils.isPronoun(np):
                    continue
                head = np.getText().lower()
                text = np.getText()
            else:
                if not qp_utils.isNominal(np, pos):
                    continue
                text = utils.textClean(np.getText())
                np_tags = pos.getSubset(np.getStart(), np.getEnd())
                head = utils.textClean(
                    qp_utils.getHead2(text.lower(), np_tags))

            #bookkeeping
            # Direct membership test instead of scanning a list of keys.
            # The call order of the two branches is kept as found.
            if head not in head2qp:
                head2qp[head] = QuasiPronoun(head)
                head2qp[head].updateCount(True)
                head2qp[head].updateDocs(f, True)
            else:
                head2qp[head].updateDocs(f, True)
                head2qp[head].updateCount(True)

            if USE_GOLD:
                # NOTE(review): the gold chains are reloaded for every noun
                # phrase; hoist the load if process_gold does not modify
                # them.
                gold_chains = reconcile.getGoldChains(f)
                process_gold(f, np, head, text, head2qp, gold_chains)
            process_syntax(f, np, head, text, head2qp, stanford_deps)
    else:
        for np in reconcile.getStanfordNPs(f):
            if PRONOUNS:
                if not np["is_pronoun"]:
                    continue
                head = np.getText().lower()
                text = np.getText()
            else:
                #skip some problems with the parser or numbers
                parsed_head = np["HEAD"]
                if parsed_head.startswith("$") or parsed_head.endswith(
                        "%") or parsed_head == ".":
                    continue

                if not np["is_nominal"]:
                    continue
                text = utils.textClean(np.getText())
                head = np["HEAD"].lower()

            #bookkeeping
            if head not in head2qp:
                head2qp[head] = QuasiPronoun(head)
            head2qp[head].updateDocs(f, False)
            head2qp[head].updateCount(False)
            process_syntax(f, np, head, text, head2qp, stanford_deps)
示例#6
0
    # The first command-line argument names a file listing one document per
    # line.
    # NOTE(review): the handle is not closed anywhere in the visible code --
    # consider a `with` block.
    fileList = open(sys.argv[1], 'r')
    # Annotation sets created up front; nothing in the visible part of the
    # loop adds to them.
    anaphoric_nominals = annotation_set.AnnotationSet("anaphoric_noms")
    existential_nominals = annotation_set.AnnotationSet("exist_noms")

    for f in fileList:
        f = f.strip()
        # Lines starting with "#" are skipped.
        if f.startswith("#"):
            continue

        print("Working on %s" % f)

        gold_annots = reconcile.parseGoldAnnots(f)
        # NOTE(review): gold_chains is not used in the visible part of the
        # loop.
        gold_chains = reconcile.getGoldChains(f)

        response_nps = reconcile.getNPs_annots(f)
        pos = reconcile.getPOS(f, True)
        reconcile.addSundanceProps(f, response_nps)
        # The loop below reads a "MATCHED" attribute from each gold
        # annotation; presumably match_nps is what sets it -- TODO confirm.
        utils.match_nps(gold_annots, response_nps)

        for g in gold_annots:
            # Only gold annotations whose GOLD_TYPE is "NOM" are processed.
            if g.getATTR("GOLD_TYPE") != "NOM":
                continue

            # Copy the properties of every response NP whose ID equals this
            # annotation's MATCHED attribute.
            for r in response_nps:
                if g.getATTR("MATCHED") == r.getID():
                    g.addProps(r.getProps())

            # For subjects (by either attribute), store the result of
            # reconcile.getSubjVerb under the S_VERB property.
            if g.getATTR("GRAMMAR") == "SUBJECT" or g.getATTR(
                    "SUN_ROLE") == "SUBJ":
                g.setProp("S_VERB", reconcile.getSubjVerb(g, pos))
def _record_verb_stats(mention, pos, text_lower):
    """Append the lower-cased verb of a SUBJ/DOBJ mention to the verb tables.

    Nothing is recorded when the verb lookup returns ``None``.
    """
    role = mention.getATTR("ROLE")
    if role == "SUBJ":
        verb = reconcile.getSubjVerb(mention, pos)
        if verb is not None:
            subj_verbs[text_lower].append(verb.lower())
    elif role == "DOBJ":
        verb = reconcile.getObjVerb(mention, pos)
        if verb is not None:
            obj_verbs[text_lower].append(verb.lower())


def gold_annotations(f):
    """Process the file with gold annotations.

    Walks every gold coreference chain of document ``f`` and updates the
    module-level statistics tables for nominal mentions.  The first mention
    of a chain is recorded as a base antecedent (only when it is nominal and
    not a gold singleton); every later nominal mention is recorded as an
    anaphor, together with its sentence and tile distance to the preceding
    mention and the type of that preceding mention.  Finally the per-document
    word counts are added to ``total_counts`` and ``total_counts_heads``.

    Args:
        f: document handle passed to ``Document`` and the ``reconcile``
            loaders.
    """

    global virtual_pronouns, total_counts, virtual_pronoun_heads, \
    nominal_base_antecedent, distance_from_antecedent

    doc = Document(f)
    gold_chains = reconcile.getGoldChains(f)

    #named entities from reconcile.getNEs, handed to
    #add_reconcile_semantic_class together with the gold chains
    nes = reconcile.getNEs(f, True)
    add_reconcile_semantic_class(gold_chains, nes)

    #adding in Reconcile pos too.
    pos = reconcile.getPOS(f, True)

    #getting the docs nps
    reconcile_nps = reconcile.getNPs_annots(f)

    #getting sundance nps
    sundance_nps = reconcile.getSundanceNPs(f)
    add_sundance_nps(gold_chains, sundance_nps)

    original_text_heads = {}  # just getting the heads
    original_text = defaultdict(list)  # for getting total doc counts later.
    nominal2chains = defaultdict(
        list)  # the chains that a given nominal appears.

    for chain in gold_chains.keys():
        base_antecedent = True
        prev_annot = None
        antecedents = 0
        for mention in gold_chains[chain]:

            #if the first antecedent in a chain, do not list it as anaphoric.
            if base_antecedent:
                if mention.getATTR("is_nominal") and not \
                        mention.getATTR("GOLD_SINGLETON"):
                    text = mention.getText()
                    text_lower = mention.getATTR("TEXT_CLEAN").lower()
                    docs_appeared[text_lower].append(f)

                    nominal_base_antecedent[text_lower] = \
                        nominal_base_antecedent.get(text_lower, 0) + 1

                    original_text[text_lower].append(text)

                    #take note that this chain contained this nominal
                    nominal2chains[text_lower].append(chain)

                    #take note of the gold semantic class
                    gold_semantic_class[text_lower].append(
                        mention.getATTR("GOLD_SEMANTIC"))

                    #reconcile's semantic class
                    reconcile_semantic_class[text_lower].append(
                        mention.getATTR("NE_CLASS"))

                    #sundance's semantic class
                    sun_semantic_class[text_lower].append(
                        mention.getATTR("SUN_SEMANTIC"))

                    number_gold_antecedents[text_lower].append(antecedents)

                    #get verb stats
                    _record_verb_stats(mention, pos, text_lower)

                base_antecedent = False
                prev_annot = mention
                antecedents += 1
                continue

            if mention.getATTR("is_nominal"):
                text = mention.getText()
                text_lower = mention.getATTR("TEXT_CLEAN").lower()
                head_lower = mention.getATTR("HEAD_TEXT").lower()

                original_text[text_lower].append(text)
                virtual_pronouns[text_lower] = \
                    virtual_pronouns.get(text_lower, 0) + 1

                virtual_pronoun_heads[head_lower] = \
                    virtual_pronoun_heads.get(head_lower, 0) + 1

                #the semantic class Reconcile puts this in.
                reconcile_semantic_class[text_lower].append(
                    mention.getATTR("NE_CLASS"))

                #register this doc as containing this np.
                docs_appeared[text_lower].append(f)

                #take note that this chain contained this nominal
                nominal2chains[text_lower].append(chain)

                #take note of the gold semantic class
                gold_semantic_class[text_lower].append(
                    mention.getATTR("GOLD_SEMANTIC"))

                #the number of possible correct antecedents for this anaphor
                number_gold_antecedents[text_lower].append(antecedents)

                #sundance's semantic class
                sun_semantic_class[text_lower].append(
                    mention.getATTR("SUN_SEMANTIC"))

                # verb statistics; the helper skips a missing verb, which the
                # unguarded verb.lower() used here before did not.
                _record_verb_stats(mention, pos, text_lower)

                #get the sentence distance from these two mentions.
                mention_sent = reconcile.getAnnotSentence(f, mention)
                prev_sent = reconcile.getAnnotSentence(f, prev_annot)

                if mention_sent > -1 and prev_sent > -1:
                    distance_from_antecedent[text_lower].append(
                        mention_sent - prev_sent)

                #get the TextTiling segment distance for the two mentions
                mention_seg = doc.getAnnotTile(mention)
                prev_seg = doc.getAnnotTile(prev_annot)
                if mention_seg > -1 and prev_seg > -1:
                    focus_distance[text_lower].append(mention_seg - prev_seg)

                #getting the distribution of closest antecedent types for a
                #given nominal
                if prev_annot.getATTR("is_nominal"):
                    antecedent_type = "nominal"
                elif prev_annot.getATTR("is_pronoun"):
                    antecedent_type = "pronoun"
                else:
                    antecedent_type = "proper"
                nominals2type[text_lower][antecedent_type] = \
                    nominals2type[text_lower].get(antecedent_type, 0) + 1
            prev_annot = mention
            antecedents += 1

    #update the total counts.
    for key, surface_forms in original_text.items():
        for text in set(surface_forms):
            total_counts[key] = total_counts.get(key,
                                                 0) + doc.getWordCounts(text)

    #the head counts
    for key in virtual_pronoun_heads:
        total_counts_heads[key] = total_counts_heads.get(key, 0) + \
            doc.getWordCounts(key)