Example #1
def processACE(f, np, heads2qp):
    ace_annots = reconcile.parseGoldAnnots(f)
    stanford_deps = reconcile.getStanfordDep(f)
    gold_chains = reconcile.getGoldChains(f)
    ace_np = ace_annots.getAnnotBySpan(np.getStart(), np.getEnd())

    if ace_np["is_nominal"]:
        head = utils.textClean(ace_np["HEAD"].strip().lower())
        text = utils.textClean(np.getText())

        #bookkeeping
        if head not in list(heads2qp.keys()):
            heads2qp[head] = QuasiPronoun(head)
        else:
            heads2qp[head].updateDocs(f)
            heads2qp[head].updateCount()

        if ace_np["GOLD_SINGLETON"]:
            heads2qp[head].singleton += 1
        else:
            #does it start the chain?
            for gc in list(gold_chains.keys()):
                if gold_chains[gc][0] == np:
                    heads2qp[head].starts_chain += 1
                    break

        process(f, np, head, text, heads2qp, stanford_deps)
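
Examples #1, #3 and #9 all accumulate statistics in a QuasiPronoun bookkeeping object that is not shown on this page. A minimal sketch of what it might look like, inferred only from the attribute and method names used in the examples (the project's real class may differ):

class QuasiPronoun:
    """Hypothetical per-head bookkeeping record (an assumption, not the
    project's actual class)."""

    def __init__(self, head):
        self.head = head
        self.count = 1          # total occurrences of this head
        self.docs = []          # documents the head appeared in
        self.singleton = 0      # times it was a gold singleton
        self.starts_chain = 0   # times it started a gold chain
        self.faux_ba = 0        # singletons introduced by "a"/"an"

    def updateDocs(self, doc, annotated=True):
        # the boolean flag mirrors the calls in Example #9
        if doc not in self.docs:
            self.docs.append(doc)

    def updateCount(self, annotated=True):
        self.count += 1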
Example #2
    def OnFileOpen(self, e):
        """ File|Open event - Open dialog box. """
        dlg = wx.FileDialog(self, "Open", self.dirName, self.fileName,
                            "Text Files (*.txt)|*.txt|All Files|*.*",
                            wx.FD_OPEN)
        if dlg.ShowModal() == wx.ID_OK:
            self.fileName = dlg.GetFilename()
            self.dirName = dlg.GetDirectory()
            with open(os.path.join(self.dirName, self.fileName), 'r') as f:
                self.fullText = f.read()
            self.text_box_left.SetValue(self.fullText)
        dlg.Destroy()

        self.gold_chains = reconcile.getGoldChains(self.dirName)
Example #3
def processACE(f, head2qp):
    global USE_GOLD
    ace_annots = reconcile.parseGoldAnnots(f)
    nps = reconcile.getNPs(f)
    stanford_deps = reconcile.getStanfordDep(f)
    gold_chains = reconcile.getGoldChains(f)
    for np in nps:
        ace_np = ace_annots.getAnnotBySpan(np.getStart(), np.getEnd())
        head = None
        text = None
        if PRONOUNS:
            if qp_utils.isPronoun(np):
                head = ace_np["HEAD"].lower()
                text = np.getText()
            else:
                continue
        else:
            if ace_np["is_nominal"]:
                head = utils.textClean(ace_np["HEAD"].strip().lower())
                text = utils.textClean(np.getText())
            else:
                continue

        #bookkeeping
        if head not in list(head2qp.keys()):
            head2qp[head] = QuasiPronoun(head)
        else:
            head2qp[head].updateDocs(f)
            head2qp[head].updateCount()

        if ace_np["GOLD_SINGLETON"]:
            head2qp[head].singleton += 1
            if (text.startswith("a ") or text.startswith("an ")):
                head2qp[head].faux_ba += 1
        else:
            #does it start the chain?
            if USE_GOLD:
                process_gold(f, np, head, text, head2qp, gold_chains)
        process_syntax(f, np, head, text, head2qp, stanford_deps)
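
Example #3 only defines the per-file pass; the driver is not shown. A hypothetical driver (not part of the original project) that walks a file list and accumulates head2qp, following the file-list conventions of Examples #5 and #6, where lines starting with "#" are comments:

import sys

if __name__ == "__main__":
    head2qp = {}
    with open(sys.argv[1], 'r') as filelist:
        for line in filelist:
            f = line.strip()
            if not f or f.startswith("#"):
                continue
            processACE(f, head2qp)

    # dump the accumulated stats; .count and .singleton follow the
    # QuasiPronoun sketch given after Example #1
    for head, qp in sorted(head2qp.items()):
        print("%s\t%d\t%d" % (head, qp.count, qp.singleton))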
Example #4
        self.gold_buddies = []
        self.buddies = []

    def __str__(self):
        s = "{0} : [{1}]".format(self.text, ", ".join(self.gold_buddies))
        return s


if __name__ == "__main__":
    if len(sys.argv) < 2:
        print("Usage: %s <gold-document> <unannotated-documents-list>" %
              (sys.argv[0]))
        sys.exit(1)

    #read in gold chains
    gold_chains = reconcile.getGoldChains(sys.argv[1])
    gold_chain_text = defaultdict(list)

    pos_tags = reconcile.getPOS(sys.argv[1])
    tokens = reconcile.getTokens(sys.argv[1])

    for key in list(gold_chains.keys()):
        for mention in gold_chains[key]:
            if mention.pprint() not in [
                    x.pprint() for x in gold_chain_text[key]
            ]:
                #np_pos = [x.getATTR("TAG") for x in \
                #        pos_tags.getOverlapping(mention)]
                #np_tok = [x.getText() for x in \
                #        tokens.getOverlapping(mention)]
                #mention.setProp("TAGS", np_pos)
Example #5
    i = 0
    prog = ProgressBar(len(files))
    correct_qps = {}
    incorrect_qps = {}
    for f in files:
        if f.startswith("#"):
            continue
        f = f.strip()
        prog.update_time(i)
        sys.stderr.write("\r%s" % (str(prog)))
        sys.stderr.flush()
        i += 1

        nps = reconcile.getNPs(f)
        gold_chains = reconcile.getGoldChains(f)
        try:
            all_pairs = reconcile.getFauxPairs(f, PREDICTIONS)
        except Exception:
            continue

        response_pairs = []
        for pair in all_pairs:
            if pair[0] is None or pair[1] is None:
                continue
            response_pairs.append(pair)

        labeled_annots = reconcile.labelCorrectPairs(gold_chains,
                                                     response_pairs)
        for pair in labeled_annots:
            if ACE:
Example #6
                      dest="duncan",
                      type="string",
                      default="")
    parser.add_option("-v",
                      help="Verbose. Be it.",
                      action="store_true",
                      dest="verbose",
                      default=False)
    (options, args) = parser.parse_args()

    if len(sys.argv) < 3:
        parser.print_help()
        sys.exit(1)

    if options.directory is not None:
        gold_chains = reconcile.getGoldChains(options.directory)
        duncan_pairs = duncan.getDuncanPairs(options.directory)
        accuracy = score.accuracy(gold_chains, duncan_pairs)
        print("A: %d/%d = %0.2f" % (accuracy[0], accuracy[1], accuracy[2]))
    elif options.filelist is not None:
        filelist = open(options.filelist, 'r')
        total = [0, 0]
        h_stats_correct = {}
        h_stats_total = {}
        for f in filelist:
            f = f.strip()
            if f.startswith("#"):
                continue
            gold_chains = reconcile.getGoldChains(f)
            duncan_pairs = duncan.getDuncanPairs(f)
            accuracy = score.accuracy(gold_chains, duncan_pairs)
Example #7
                antecedent.getATTR("semantic"), antecedent.getATTR("gender"),
                antecedent.getATTR("number"))
            attrs2 = "sem=%s, gen=%s, num=%s" % (anaphor.getATTR("semantic"),
                                                 anaphor.getATTR("gender"),
                                                 anaphor.getATTR("number"))
            if byte not in tmp:
                tmp.append(byte)
                print("%s [%s] <- %s [%s] (H:%s)" %
                      (antecedent.pprint(), attrs1, anaphor.pprint(), attrs2,
                       h))

        if options.vverbose:
            print("===================")

    if options.evaluate:
        GoldChains = reconcile.getGoldChains(options.directory)
        s = score.accuracy(GoldChains, all_pairs)
        print("Document Score:")
        print("  Accuracy: %0.2f with %d Correct, %d Incorrect" %
              (s[0], s[1], s[2]))

    if options.stats:
        GoldChains = reconcile.getGoldChains(options.directory)
        s = score.accuracy(GoldChains, all_pairs)
        print("%d %d %d" % (s[1], s[2], s[3]))

    if options.write:
        heurs = defaultdict(list)
        UniquePairs = []
        for i in range(0, len(all_pairs)):
            curr = all_pairs[i]
Example #8
#
import sys
import pydot

from pyconcile import reconcile

if __name__ == "__main__":
    if len(sys.argv) < 2:
        print("Usage: %s <response-file>" % (sys.argv[0]))
        sys.exit(1)

    dataDir = sys.argv[1][:sys.argv[1].find("/")]
    responseFile = sys.argv[1][sys.argv[1].find("/"):]
    clusterer = "SingleLink"
    sentences = reconcile.getSentences(dataDir)
    gold_chains = reconcile.getGoldChains(dataDir)

    #get reconcile's edges
    response_chains = reconcile.getResponseChains(
        dataDir, responseFile + "/" + clusterer)

    response_pairs = reconcile.getResponsePairs(dataDir, responseFile, 0.5)
    response_pairs = reconcile.labelCorrectPairs(gold_chains, response_pairs)

    #pydot graph
    graph = pydot.Dot("reconcile_clusters", graph_type='digraph')

    #add in all the NP
    #NOTE: as long as we are working with gold mentions, the response and gold
    #will match. otherwise, will need to switch over to gold nps to see proper
    #'misses'
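
Example #8 is cut off just before nodes and edges are added to the graph. A sketch of how that step might look, using the standard pydot API: one node per gold mention and one edge per labeled response pair, colored by correctness. The (antecedent, anaphor, correct) unpacking is an assumption about what labelCorrectPairs returns, and pprint() follows its usage in Example #4.

import pydot


def add_mentions_and_pairs(graph, gold_chains, labeled_pairs):
    # one node per gold mention
    for chain_id in gold_chains:
        for mention in gold_chains[chain_id]:
            graph.add_node(pydot.Node(mention.pprint()))

    # one edge per labeled response pair; green = correct, red = incorrect
    for antecedent, anaphor, correct in labeled_pairs:
        color = "green" if correct else "red"
        graph.add_edge(pydot.Edge(antecedent.pprint(), anaphor.pprint(),
                                  color=color))


# usage in the context of Example #8:
#   add_mentions_and_pairs(graph, gold_chains, response_pairs)
#   graph.write_png("reconcile_clusters.png")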
Example #9
def process(f, head2qp, annotated_file):
    stanford_deps = reconcile.getStanfordDep(f)
    pos = reconcile.getPOS(f)
    if annotated_file:
        nps = reconcile.getNPs(f)
        for np in nps:
            head = None
            text = None
            if PRONOUNS:
                if qp_utils.isPronoun(np):
                    head = np.getText().lower()
                    text = np.getText()
                else:
                    continue
            else:
                if qp_utils.isNominal(np, pos):
                    text = utils.textClean(np.getText())
                    np_tags = pos.getSubset(np.getStart(), np.getEnd())
                    head = utils.textClean(
                        qp_utils.getHead2(text.lower(), np_tags))
                else:
                    continue

            #bookkeeping
            if head not in list(head2qp.keys()):
                head2qp[head] = QuasiPronoun(head)
                head2qp[head].updateCount(True)
                head2qp[head].updateDocs(f, True)
            else:
                head2qp[head].updateDocs(f, True)
                head2qp[head].updateCount(True)

            if USE_GOLD:
                gold_chains = reconcile.getGoldChains(f)
                process_gold(f, np, head, text, head2qp, gold_chains)
            process_syntax(f, np, head, text, head2qp, stanford_deps)
    else:
        stanford_nps = reconcile.getStanfordNPs(f)
        for np in stanford_nps:
            if PRONOUNS:
                if np["is_pronoun"]:
                    head = np.getText().lower()
                    text = np.getText()
                else:
                    continue
            else:
                #skip some problems with the parser or numbers
                if np["HEAD"].startswith("$") or np["HEAD"].endswith(
                        "%") or np["HEAD"] == ".":
                    continue

                if np["is_nominal"]:
                    text = utils.textClean(np.getText())
                    head = np["HEAD"].lower()
                else:
                    continue

            #bookkeeping
            if head not in list(head2qp.keys()):
                head2qp[head] = QuasiPronoun(head)
                head2qp[head].updateDocs(f, False)
                head2qp[head].updateCount(False)
            else:
                head2qp[head].updateDocs(f, False)
                head2qp[head].updateCount(False)
            process_syntax(f, np, head, text, head2qp, stanford_deps)
Example #10
def gold_annotations(f):
    """process the file with gold annotations"""

    global virtual_pronouns, total_counts, virtual_pronoun_heads, \
    nominal_base_antecedent, distance_from_antecedent

    doc = Document(f)
    gold_chains = reconcile.getGoldChains(f)

    #adding in Sundance NEs.
    nes = reconcile.getNEs(f, True)
    add_reconcile_semantic_class(gold_chains, nes)

    #adding in Reconcile pos too.
    pos = reconcile.getPOS(f, True)

    #getting the doc's NPs
    reconcile_nps = reconcile.getNPs_annots(f)

    #getting sundance nps
    sundance_nps = reconcile.getSundanceNPs(f)
    add_sundance_nps(gold_chains, sundance_nps)

    original_text_heads = {}  # just getting the heads
    original_text = defaultdict(list)  # for getting total doc counts later.
    nominal2chains = defaultdict(list)  # chains in which a given nominal appears

    for chain in list(gold_chains.keys()):
        base_antecedent = True
        prev_annot = None
        antecedents = 0
        for mention in gold_chains[chain]:

            #if the first antecedent in a chain, do not list it as anaphoric.
            if base_antecedent:
                if mention.getATTR("is_nominal") and not \
                mention.getATTR("GOLD_SINGLETON"):
                    text = mention.getText()
                    text_lower = mention.getATTR("TEXT_CLEAN").lower()
                    docs_appeared[text_lower].append(f)

                    nominal_base_antecedent[text_lower] = \
                    nominal_base_antecedent.get(text_lower, 0) + 1

                    original_text[text_lower].append(text)

                    #take note that this chain contained this nominal
                    nominal2chains[text_lower].append(chain)

                    #take note of the gold semantic class
                    gold_semantic_class[text_lower].append(
                        mention.getATTR("GOLD_SEMANTIC"))

                    #reconcile's semantic class
                    reconcile_semantic_class[text_lower].append(
                        mention.getATTR("NE_CLASS"))

                    #sundance's semantic class
                    sun_semantic_class[text_lower].append(
                        mention.getATTR("SUN_SEMANTIC"))

                    number_gold_antecedents[text_lower].append(antecedents)

                    #get verb stats
                    if mention.getATTR("ROLE") == "SUBJ":
                        verb = reconcile.getSubjVerb(mention, pos)
                        if verb is not None:
                            subj_verbs[text_lower].append(verb.lower())
                    elif mention.getATTR("ROLE") == "DOBJ":
                        verb = reconcile.getObjVerb(mention, pos)
                        if verb is not None:
                            obj_verbs[text_lower].append(verb.lower())

                base_antecedent = False
                prev_annot = mention
                antecedents += 1
                continue

            if mention.getATTR("is_nominal"):
                text = mention.getText()
                text_lower = mention.getATTR("TEXT_CLEAN").lower()
                head_text = mention.getATTR("HEAD_TEXT")

                original_text[text_lower].append(text)
                virtual_pronouns[text_lower] = \
                virtual_pronouns.get(text_lower, 0) + 1

                virtual_pronoun_heads[head_text.lower()] = \
                virtual_pronoun_heads.get(head_text.lower(), 0) + 1

                #the semantic class Reconcile puts this in.
                reconcile_semantic_class[text_lower].append(
                    mention.getATTR("NE_CLASS"))

                #register this doc as containing this np.
                docs_appeared[text_lower].append(f)

                #take note that this chain contained this nominal
                nominal2chains[text_lower].append(chain)

                #take note of the gold semantic class
                gold_semantic_class[text_lower].append(
                    mention.getATTR("GOLD_SEMANTIC"))

                #the number of possible correct antecedents for this anaphor
                number_gold_antecedents[text_lower].append(antecedents)

                #sundance's semantic class
                sun_semantic_class[text_lower].append(
                    mention.getATTR("SUN_SEMANTIC"))

                # subject verb statistics
                if mention.getATTR("ROLE") == "SUBJ":
                    verb = reconcile.getSubjVerb(mention, pos)
                    subj_verbs[text_lower].append(verb.lower())
                elif mention.getATTR("ROLE") == "DOBJ":
                    verb = reconcile.getObjVerb(mention, pos)
                    obj_verbs[text_lower].append(verb.lower())

                #get the sentence distance from these two mentions.
                mention_sent = reconcile.getAnnotSentence(f, mention)
                prev_sent = reconcile.getAnnotSentence(f, prev_annot)

                if mention_sent > -1 and prev_sent > -1:
                    distance_from_antecedent[text_lower].append(mention_sent - \
                            prev_sent)

                #get the TextTiling segment distance for the two mentions
                mention_seg = doc.getAnnotTile(mention)
                prev_seg = doc.getAnnotTile(prev_annot)
                if mention_seg > -1 and prev_seg > -1:
                    focus_distance[text_lower].append(mention_seg - \
                            prev_seg)

                #getting the distribution of closest antecedent types for a
                #given nominal
                if prev_annot.getATTR("is_nominal"):
                    nominals2type[text_lower]["nominal"] = \
                    nominals2type[text_lower].get("nominal",0) + 1
                elif prev_annot.getATTR("is_pronoun"):
                    nominals2type[text_lower]["pronoun"] = \
                    nominals2type[text_lower].get("pronoun",0) + 1
                else:
                    nominals2type[text_lower]["proper"] = \
                    nominals2type[text_lower].get("proper",0) + 1
            prev_annot = mention
            antecedents += 1

    #for key in nominal2chains.keys():
    #    print "%d : %s (doc: %s)" % (len(list(set(nominal2chains[key]))), key,
    #            doc)

    #update the total counts.
    for key in list(original_text.keys()):
        for text in list(set(original_text[key])):
            total_counts[key] = total_counts.get(key,
                                                 0) + doc.getWordCounts(text)

    #the head counts
    for key in list(virtual_pronoun_heads.keys()):
        total_counts_heads[key] = total_counts_heads.get(key, 0) + \
        doc.getWordCounts(key)
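
Finally, distilled from the examples above, a minimal standalone sketch of reconcile.getGoldChains itself. Throughout this page its return value is treated as a dict mapping chain ids to ordered lists of mention annotations, so a bare-bones dump looks like this (the getText() accessor follows the other examples):

import sys

from pyconcile import reconcile

if __name__ == "__main__":
    if len(sys.argv) < 2:
        print("Usage: %s <annotated-document>" % (sys.argv[0]))
        sys.exit(1)

    # getGoldChains returns {chain_id: [mention, ...]} as used above
    gold_chains = reconcile.getGoldChains(sys.argv[1])
    for chain_id in gold_chains:
        mentions = gold_chains[chain_id]
        print("chain %s: %d mentions" % (chain_id, len(mentions)))
        for mention in mentions:
            print("  %s" % (mention.getText()))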