def processACE(f, np, heads2qp):
    """Update quasi-pronoun bookkeeping for one NP of an ACE document.

    Looks up the ACE gold annotation that spans `np`; if it is nominal,
    records its head in `heads2qp` (mapping head string -> QuasiPronoun)
    and tallies singleton vs. chain-starting occurrences.

    f         -- document path/name passed through to reconcile loaders
    np        -- the noun-phrase annotation being processed
    heads2qp  -- dict of head string -> QuasiPronoun, mutated in place
    """
    ace_annots = reconcile.parseGoldAnnots(f)
    stanford_deps = reconcile.getStanfordDep(f)
    gold_chains = reconcile.getGoldChains(f)
    # find the gold annotation with exactly this NP's byte span
    ace_np = ace_annots.getAnnotBySpan(np.getStart(), np.getEnd())
    if ace_np["is_nominal"]:
        head = utils.textClean(ace_np["HEAD"].strip().lower())
        text = utils.textClean(np.getText())
        #bookkeeping
        if head not in list(heads2qp.keys()):
            heads2qp[head] = QuasiPronoun(head)
        else:
            heads2qp[head].updateDocs(f)
            heads2qp[head].updateCount()
        if ace_np["GOLD_SINGLETON"]:
            # NOTE(review): spelled 'singelton' here but 'singleton' in the
            # sibling processACE(f, head2qp) — confirm against QuasiPronoun
            heads2qp[head].singelton += 1
        else:
            #does it start the chain?
            for gc in list(gold_chains.keys()):
                if gold_chains[gc][0] == np:
                    heads2qp[head].starts_chain += 1
                    break
        process(f, np, head, text, heads2qp, stanford_deps)
def isProper(annot):
    """Heuristically decide whether `annot` names a proper entity.

    Checks, in order: the cached PROPER_NOUNS word list, the annotation's
    PROPER_NAME/PROPER_NOUN attributes, honorific prefixes, and common
    corporate suffixes.  Returns True/False.
    """
    global PROPER_NOUNS
    if len(PROPER_NOUNS) < 1:
        # lazily load the proper-noun lexicon on first call
        read_in_propers()
    text = utils.textClean(annot.getText()).lower().strip()
    if text in PROPER_NOUNS:
        return True
    if annot["PROPER_NAME"] == "true" or annot["PROPER_NOUN"] == "true":
        return True
    if text.startswith(("mr.", "ms.", "mrs.")):
        return True
    # corporate/company suffixes count as proper names
    return text.endswith(("corp.", "co.", "ltd.", "inc.", "ag", "plc"))
def getHeadSpan(annot, head):
    """Locate `head` inside the annotation text and return its absolute span.

    Returns (start, end) offsets shifted by the annotation's own start
    offset, or None when the head cannot be found.

    FIX: the head string is now passed through re.escape before being
    embedded in the pattern; previously metacharacters in the head
    (e.g. '$', '+', '.') could raise re.error or silently mis-match.
    """
    head = head.replace("(", "").replace(")", "")
    match = re.compile(r'\b({0})\b'.format(re.escape(head)),
                       flags=re.IGNORECASE).search(
                           utils.textClean(annot.getText()))
    if match:
        # shift match offsets into document coordinates
        return (match.start(1) + annot.getStart(),
                match.end(1) + annot.getStart())
    return None
def isPronoun(annot):
    """Return True when the annotation's cleaned text is a pronoun or a
    pronoun-like deictic word ('here', 'there', 'then', 'those')."""
    cleaned = utils.textClean(annot.getText()).lower().strip()
    return cleaned in data.ALL_PRONOUNS or cleaned in ("here", "there",
                                                       "then", "those")
def getHeadIndex(annot, head):
    """Return the 0-based token index of `head` within the annotation text.

    Tokens are compared after lowercasing/cleaning the annotation text;
    `head` is compared as given (callers pass it already lowercased).
    Returns -1 when the head token is not present.

    IDIOM: replaced the manual counter loop with enumerate().
    """
    annot_text = utils.textClean(annot.getText()).lower().strip()
    for i, tok in enumerate(annot_text.split()):
        if tok == head:
            return i
    return -1
def getHeadSpan(annot, head):
    """Locate `head` inside the annotation text and return its relative span.

    Returns (start, end-1) offsets relative to the cleaned annotation text,
    or None when the text contains parentheses (known problem case) or the
    head cannot be found.

    FIX: the head string is now re.escape()d before pattern construction;
    previously regex metacharacters in the head could raise re.error or
    silently mis-match.
    """
    #NOTE: texts with parenths have problems
    if annot.getText().find("(") > -1 or annot.getText().find(")") > -1:
        return None
    match = re.compile(r'\b({0})\b'.format(re.escape(head)),
                       flags=re.IGNORECASE).search(
                           utils.textClean(annot.getText()))
    if match:
        # end is inclusive here (end-1), unlike the absolute-span variant
        return (match.start(1), match.end(1) - 1)
    return None
def addDefinite(self, orig):
    """Tally the determiner form of `orig`: definite ('the ...'),
    indefinite ('a ...'/'an ...'), or bare (text equals the head alone).

    Increments the matching bucket in self.definite; NPs that are neither
    determined nor bare are not counted.
    """
    cleaned = utils.textClean(orig.getText())
    if cleaned.startswith("the "):
        bucket = "definite"
    elif cleaned.startswith(("a ", "an ")):
        bucket = "indefinite"
    elif cleaned == self.head:
        bucket = "bare"
    else:
        bucket = None
    if bucket is not None:
        self.definite[bucket] = self.definite.get(bucket, 0) + 1
def processACE(f, head2qp):
    """Collect quasi-pronoun statistics over every NP of one ACE document.

    For each NP: in PRONOUNS mode only pronouns are kept, otherwise only
    nominals; the head string is tallied in `head2qp` (head -> QuasiPronoun),
    singletons and bare-indefinite singletons are counted, and non-singletons
    are handed to process_gold (when USE_GOLD) and process_syntax.

    f       -- document path/name for the reconcile loaders
    head2qp -- dict of head string -> QuasiPronoun, mutated in place
    """
    global USE_GOLD
    ace_annots = reconcile.parseGoldAnnots(f)
    nps = reconcile.getNPs(f)
    stanford_deps = reconcile.getStanfordDep(f)
    gold_chains = reconcile.getGoldChains(f)
    for np in nps:
        # gold annotation with exactly this NP's span
        ace_np = ace_annots.getAnnotBySpan(np.getStart(), np.getEnd())
        head = None
        text = None
        if PRONOUNS:
            if qp_utils.isPronoun(np):
                head = ace_np["HEAD"].lower()
                text = np.getText()
            else:
                continue
        else:
            if ace_np["is_nominal"]:
                head = utils.textClean(ace_np["HEAD"].strip().lower())
                text = utils.textClean(np.getText())
            else:
                continue
        #bookkeeping
        if head not in list(head2qp.keys()):
            head2qp[head] = QuasiPronoun(head)
        else:
            head2qp[head].updateDocs(f)
            head2qp[head].updateCount()
        if ace_np["GOLD_SINGLETON"]:
            head2qp[head].singleton += 1
            # bare indefinite singletons are tracked separately
            if (text.startswith("a ") or text.startswith("an ")):
                head2qp[head].faux_ba += 1
        else:
            #does it start the chain?
            if USE_GOLD:
                process_gold(f, np, head, text, head2qp, gold_chains)
        # NOTE(review): placement reconstructed to match the sibling
        # process() routine — syntax stats gathered for every kept NP
        process_syntax(f, np, head, text, head2qp, stanford_deps)
def collectACEFPs(ace_annots, this_files_common_nouns):
    """Append ACE gold NPs that behave like 'faux pronouns'.

    Keeps non-singleton nominal gold NPs whose full text is just a
    determiner/demonstrative plus the head ('the X', 'that X', 'this X',
    'these X', 'those X').

    ace_annots              -- iterable of gold annotations (may contain None)
    this_files_common_nouns -- output list, mutated in place

    FIX: the failure branch previously formatted an undefined name `np`
    (NameError); it now reports on the missing gold annotation itself.
    """
    for gold_np in ace_annots:
        if gold_np is not None:
            #if not gold_np["GOLD_SINGLETON"] and np["GRAMMAR"] == "SUBJECT":
            if not gold_np["GOLD_SINGLETON"] and gold_np["is_nominal"]:
                gold_text = utils.textClean(gold_np.getText()).lower().strip()
                gold_head = gold_np["HEAD"].lower().strip()
                #definites + demonstratives
                if gold_text in ("the " + gold_head, "that " + gold_head,
                                 "this " + gold_head, "these " + gold_head,
                                 "those " + gold_head):
                    this_files_common_nouns.append(gold_np)
        else:
            print("couldn't find {0} in the gold".format(gold_np))
def isProper(annot, pos):
    """Heuristically decide whether `annot` is a proper name, using word
    lists, annotation attributes, and finally the POS tag of the NP head.

    annot -- the noun-phrase annotation
    pos   -- POS annotation set covering the document

    FIX: both getHeadSpan attempts can return None (e.g. head absent from
    the cleaned text); the original then crashed with TypeError on
    `head_span[0]`.  We now return False in that case.
    """
    global PROPER_NOUNS
    if len(PROPER_NOUNS) < 1:
        read_in_propers()
    global COMMON_NOUNS
    if len(COMMON_NOUNS) < 1:
        read_in_commons()
    text = utils.textClean(annot.getText()).lower()
    # word-list shortcuts first
    if text in PROPER_NOUNS:
        return True
    if text in COMMON_NOUNS:
        return False
    if isPronoun(annot):
        return False
    if annot["DATE"] != "NONE":
        return False
    # URLs are never proper names
    if text.find("http://") > -1 or text.find("www.") > -1:
        return False
    if annot["PROPER_NAME"] == "true" or annot["PROPER_NOUN"] == "true":
        return True
    # fall back to the POS tag of the head noun
    tags = pos.getSubset(annot.getStart(), annot.getEnd())
    head = getHead2(text, tags)
    if isNumber(head):
        return False
    if head.endswith("%") or head.startswith("$"):
        return False
    if head in ("million", "billion", "cents", "dollars"):
        return False
    head = head.replace("\"", "")
    head_span = getHeadSpan(annot, head)
    if head_span is None:
        head_span = getHeadSpan(annot, head.replace("]", ""))
    if head_span is None:
        # head not locatable in the text: cannot confirm proper-hood
        return False
    pos_tag = pos.getAnnotBySpan(head_span[0], head_span[1])
    if pos_tag is None:
        return False
    if pos_tag["TAG"].startswith("NNP"):
        return True
    return False
def isNominal(annot):
    """
    return True if the annotation is a nominal
    false otherwise
    """
    global COMMON_NOUNS
    if len(COMMON_NOUNS) < 1:
        # lazily load the common-noun lexicon on first call
        read_in_commons()
    cleaned = utils.textClean(annot.getText()).lower()
    if cleaned in COMMON_NOUNS:
        return True
    if cleaned.endswith("%"):
        return False
    # nominal = not proper, not a pronoun, not a date
    return (not isProper(annot)
            and cleaned not in data.ALL_PRONOUNS
            and annot["DATE"] == "NONE")
total_scores = {"vps_guessed": 0, "vps_correct": 0} RESPONSE_TYPE = "" for f in files: f = f.strip() print("Working on file: {0}".format(f)) gold_chains = reconcile.getGoldChains(f) pos = reconcile.getPOS(f) pairs = [] if "-hobbs" in sys.argv: RESPONSE_TYPE = "Hobbs" #read in the hobbs annotations hobbs_pairs = reconcile.getProResPairs(f, "hobbs") for pair in hobbs_pairs: tags = pos.getSubset(pair[1].getStart(), pair[1].getEnd()) text = utils.textClean(pair[1].getText()).lower() ana_head = qp_utils.getHead2(text, tags) print("{0:40} => {1}".format(text, ana_head)) if ana_head in QPs: pairs.append(pair) #if pair[1].getText().lower() not in data.ALL_PRONOUNS: # pairs.append(pair) elif "-rec" in sys.argv: #TODO if we choose this route then there needs to be some mods #since each vp can be resolved multiple times. # 1. only count the closest antecedent? # 2. don't count string matches? # 3. look at what is in the "pro_antes" property (that gives us the # Cogniac decision.
with open(sys.argv[1], 'r') as fileList: files.extend( [x for x in fileList.readlines() if not x.startswith("#")]) for f in files: f = f.strip() print("Working on file: {0}".format(f)) allNPs = {} nps = reconcile.getNPs(f) pos = reconcile.getPOS(f) stanford_nes = reconcile.getNEs(f) sundance_nes = reconcile.getSundanceNEs(f) for np in nps: key = "{0},{1}".format(np.getStart(), np.getEnd()) text = utils.textClean(np.getText().replace("\n", " ")).lower() tokens = text.split() if len(tokens) == 1: continue if key not in list(allNPs.keys()): allNPs[key] = NP(np.getText().replace("\n", " ")) allNPs[key].start = np.getStart() allNPs[key].end = np.getEnd() head = getHead(text) head_span = getHeadSpan(np, head) if head_span is not None: allNPs[key].addHead(head_span) #check to see if the head is contain in a proper name
#set up for all commons #ACE_HEADS.append(line) for f in files: f = f.strip() print("Working on file: {0}".format(f)) this_files_common_nouns = [] if ACE: tokens = reconcile.getTokens(f) pos = reconcile.getPOS(f) ace_annots = reconcile.parseGoldAnnots(f) this_files_common_nouns_orig = [] collectACEFPs(ace_annots, this_files_common_nouns_orig) #remove post modded commons for fp in this_files_common_nouns_orig: if not checkForModification(fp, tokens, pos): this_files_common_nouns.append(fp) else: gold_nps = reconcile.getNPs(f) collectFPs(gold_nps, this_files_common_nouns) #output common nouns to file i = 0 with open(f + "/annotations/faux_pronouns", 'w') as outFile: for annot in this_files_common_nouns: outFile.write("{0}\t{1},{2}\t{3}\t\n".format( i, annot.getStart(), annot.getEnd(), utils.textClean(annot.getText().lower()))) i += 1
def add_stats(pronoun_class, doc, anaphor, text): if text in list(pronoun_class.keys()): pronoun_class[text].updateCount() else: pronoun_class[text] = Pronoun(text) #find the closest antecedent antecedent = doc.closest_antecedent(anaphor) if antecedent is not None: #print anaphor.ppprint(), #print antecedent.ppprint() #uniqueness -- what is the rate at which a pronoun is coreferent #with "new" words? I once called this generality -- captured with #antecedent pronoun_class[text].addAntecedent( utils.textClean(antecedent.getText()).lower()) #string matches -- how often does this pronoun resolve to #instances of itself? ant_text = utils.textClean(antecedent.getText()).lower() if ant_text == text \ or (ant_text in ("he", "him") and text in ("he", "him")) \ or (ant_text in ("they", "them") and text in ("they", "them")): pronoun_class[text].string_matches += 1 #find the distance of the closest antecedent # 1. in word wd = doc.word_distance(antecedent, anaphor) pronoun_class[text].word_distance(antecedent, wd) # 2. in sentences sd = doc.sentence_distance(antecedent, anaphor) pronoun_class[text].sent_distance(antecedent, sd) #ant_pdtb = doc.getContainedPDTB(antecedent) #ana_pdtb = doc.getContainedPDTB(anaphor) # 3. pdtb parse distance ? what discourse parse values are useful? #for pdtb1 in ant_pdtb: # for pdtb2 in ana_pdtb: # if pdtb1 == pdtb2: # # a. if the anaphor and antecedent are in the same argument of a # # discourse relation? # pronoun_class[text].pdtb["SAME_ARG"] = pronoun_class[text].pdtb["SAME_ARG"] + 1 # # if (pdtb1.getATTR("TYPE") == pdtb2.getATTR("TYPE")) and (pdtb1.getATTR("SID") == pdtb2.getATTR("SID")): # # b. if the anaphor and antecedent are in different arguments of the # # same discourse relation # pronoun_class[text].pdtb["DIFF_ARG"] = pronoun_class[text].pdtb["DIFF_ARG"] + 1 #else: ## c. 
if the anaphor and antecedent are not in the same discourse ## relation at all # pronoun_class[text].pdtb["NONE"] = pronoun_class[text].pdtb["NONE"] + 1 #how often is this pronoun coreferent with a nominal? if specificity_utils.isNominal(antecedent): pronoun_class[text].nominal_antecedent += 1 #how often is this pronoun coreferent with a proper name? antecedent_np = doc.nps.getAnnotBySpan(antecedent.getStart(), antecedent.getEnd()) if antecedent_np.getATTR("contains_pn") is not None: if antecedent_np.getATTR("contains_pn") == antecedent.getText(): pronoun_class[text].proper_antecedent += 1 elif specificity_utils.isProper(antecedent_np): pronoun_class[text].proper_antecedent += 1 #how often are antecedents of this pronoun in the subj or dobj #position of a verb? if antecedent_np["GRAMMAR"] == "SUBJECT": pronoun_class[text].subj += 1 elif antecedent_np["GRAMMAR"] == "OBJECT": pronoun_class[text].dobj += 1 else: pronoun_class[text].starts_chain()
total_scores = {"vps_guessed" : 0, "vps_correct" : 0 } RESPONSE_TYPE = "" for f in files: f=f.strip() print("Working on file: {0}".format(f)) gold_chains = reconcile.getGoldChains(f) pairs = [] if "-hobbs" in sys.argv: RESPONSE_TYPE = "Hobbs" #read in the hobbs annotations hobbs_pairs = reconcile.getProResPairs(f, "hobbs") for pair in hobbs_pairs: ana_head = specificity_utils.getHead(utils.textClean(pair[1].getText())).lower() if ana_head in VPs: pairs.append(pair) elif "-rec" in sys.argv: #TODO if we choose this route then there needs to be some mods #since each vp can be resolved multiple times. # 1. only count the closest antecedent? # 2. don't count string matches? # 3. look at what is in the "pro_antes" property (that gives us the # Cogniac decision. # 4. take the average accuracy for each noun. RESPONSE_TYPE = "Reconcile" #predictions = "features.goldnps/predictions.DecisionTree.muc6_DecisionTree_goldnps" predictions = "features.goldnps-vps/predictions.DecisionTree.muc6_DecisionTree_goldnps-vps" pairs = reconcile.getResponsePairs(f, predictions) for pair in reconcile_pairs:
def add_stats(text, head, anaphor, doc, nouns, head2text): #catches a problem with the following report if head == 'the': head = text.split()[-1] if head.endswith("%"): return #skip percents if head[-1].isdigit(): return #skip numbers if utils.isConj(head): return #just skip these guys too if head == "himself": return #NOTE for some reason, the filter doesn't #catch this, must be happening after head #noun is created. if head == "themselves": return if head == "head": return if head == "where": return if head == "there": return if head == "here": return anaphor_np = doc.nps.getAnnotBySpan(anaphor.getStart(), anaphor.getEnd()) #update the head2text dict if text not in head2text[head]: head2text[head].append(text) #make sure the head nouns are reasonable #print "{0} => {1}".format(text, head) #then look for thangs if text not in list(nouns.keys()): nouns[text] = VirtualPronoun(text) nouns[text].updateDocs(doc.getName()) else: nouns[text].updateCount() nouns[text].updateDocs(doc.getName()) if anaphor_np["GRAMMAR"] == "SUBJECT": nouns[text].subj += 1 elif anaphor_np["GRAMMAR"] == "OBJECT": nouns[text].dobj += 1 #begin modifier code definite = "the {0}".format(head) indefinite1 = "a {0}".format(head) indefinite2 = "an {0}".format(head) #pos = reconcile.getPOS(doc.getName()) #head_index = specificity_utils.getHeadIndex(anaphor_np, head) #np_pos = pos.getSubset(anaphor.getStart(), anaphor.getEnd()) #np_words = text.split() if text.startswith(definite): nouns[text].bare_definite += 1 #elif text.startswith(indefinite1) or text.startswith(indefinite2): #nouns[text].indefinite += 1 #else: ##NOTE: just checking to see if there is some kind of modification now #if len(np_pos) == len(np_words): ##sys.stderr.write("Mismatch tag and word length: {0} => {1}\n".format(np_pos.getList(), np_words)) #for i in range(0, head_index): #if np_pos[i]["TAG"] == "DT": #continue #elif np_pos[i]["TAG"] == "JJ": ##print "Adjective: {0}".format(np_words[i]) 
#nouns[text].adjective_modifiers.append(np_words[i]) #elif np_pos[i]["TAG"].startswith("N"): ##print "Noun: {0} {1}".format(np_words[i], np_pos[i]["TAG"]) #if np_pos[i]["TAG"].startswith("NNP"): #nouns[text].proper_modifiers.append(np_words[i]) #else: #nouns[text].common_modifiers.append(np_words[i]) #else: ##print "?: {0}".format(np_words[i]) #nouns[text].other_modifiers.append(np_words[i]) #if text.startswith("the "): #get parts of speech for the np: #else: ##not definite, but still modified #if len(np_pos) == len(np_words): ##sys.stderr.write("Mismatch tag and word length: {0} => {1}\n".format(np_pos.getList(), np_words)) #continue #for i in range(0, head_index): #if np_pos[i]["TAG"] == "DT": #continue #elif np_pos[i]["TAG"] == "JJ": ##print "Adjective: {0}".format(np_words[i]) #nouns[text].adjective_modifiers.append(np_words[i]) #elif np_pos[i]["TAG"].startswith("N"): ##print "Noun: {0} {1}".format(np_words[i], np_pos[i]["TAG"]) #if np_pos[i]["TAG"].startswith("NNP"): #nouns[text].proper_modifiers.append(np_words[i]) #else: #nouns[text].common_modifiers.append(np_words[i]) #else: ##print "?: {0}".format(np_words[i]) #nouns[text].other_modifiers.append(np_words[i]) #capture post modifiers #if text.find(head + " of ") > -1: #of_start = text.find(head + " of ") #of_object = text[len(head) + of_start + 3:] #nouns[text].of_attachments.append(of_object.strip()) #if text.find(head + " on ") > -1: #of_start = text.find(head + " on ") #of_object = text[len(head) + of_start + 3:] #nouns[text].on_attachments.append(of_object.strip()) #if text.find(head + " that ") > -1: #that_start = text.find(head + " that ") #that_clause = text[len(head) + that_start+5:] #nouns[text].that_attachments.append(that_clause.strip()) #if text.find(head + " with ") > -1: #that_start = text.find(head + " with ") #that_clause = text[len(head) + that_start+5:] #nouns[text].with_attachments.append(that_clause.strip()) #if text.find(head + " by ") > -1: #by_start = text.find(head + " by ") 
#by_object = text[len(head) + by_start+3:] #nouns[text].by_attachments.append(by_object.strip()) #if text.find(head + " which ") > -1: #which_start = text.find(head + " which ") #which_clause = text[len(head) + which_start+6:] #nouns[text].which_attachments.append(which_clause.strip()) #if len(np_pos) >= head_index+2 and len(np_words) >= head_index+2: #if np_pos[head_index+1]["TAG"] == "VBD": #nouns[text].verbed.append(np_words[head_index+1]) #if np_pos[head_index+1]["TAG"] == "VBG": #nouns[text].verbing.append(np_words[head_index+1]) #end modifier code #find which chain the anaphor is from and add the chain statistics anaphor_chain = None for chain in list(doc.gold_chains.keys()): for mention in doc.gold_chains[chain]: if anaphor == mention: anaphor_chain = chain break chain_name = "{0}:{1}".format(doc.getName(), anaphor_chain) if chain_name not in nouns[text].chains: nouns[text].chains.append(chain_name) if anaphor_chain is not None: chain_length = len(doc.gold_chains[anaphor_chain]) nouns[text].chain_size[doc.getName()] = chain_length #coverage #chain_start = doc.gold_chains[chain][0].getStart() #chain_end = doc.gold_chains[chain][-1].getEnd() #chain_size = chain_end - chain_start #chain_coverage = float(chain_size) / len(doc.text) # number of sentences touched / number of sentences covered_sentences = 0 for sent in doc.sentences: for mention in doc.gold_chains[anaphor_chain]: if sent.contains(mention): covered_sentences += 1 break chain_coverage = float(covered_sentences) / len(doc.sentences) nouns[text].chain_coverage[doc.getName()] = chain_coverage for chain in list(doc.gold_chains.keys()): if chain == anaphor_chain: continue if len(doc.gold_chains[chain]) > chain_length: break else: nouns[text].largest_chain += 1 common_only = True for mention in doc.gold_chains[anaphor_chain]: if mention == anaphor: continue mention_head = getHead(utils.textClean(mention.getText())) if mention_head not in nouns[text].all_entities: 
nouns[text].all_entities.append(mention_head) #does this chain contain proper names? mention_np = doc.nps.getAnnotBySpan(mention.getStart(), mention.getEnd()) if specificity_utils.isProper(mention_np): common_only = False if chain_name not in list(nouns[text].nom_chain_only.keys()): nouns[text].nom_chain_only[chain_name] = common_only else: sys.stderr.write("Anaphor chain not found?\n") antecedent = doc.closest_antecedent(anaphor) if antecedent is not None: #record stats sd = doc.sentence_distance(antecedent, anaphor) nouns[text].sentence_distance(sd) nouns[text].most_recent_antecedents.append( antecedent.getText().lower()) antecedent_np = doc.nps.getAnnotBySpan(antecedent.getStart(), antecedent.getEnd()) if antecedent_np["GRAMMAR"] == "SUBJECT": nouns[text].subj_ante += 1 elif antecedent_np["GRAMMAR"] == "OBJECT": nouns[text].dobj_ante += 1 if antecedent.getText().lower() == anaphor.getText().lower(): nouns[text].string_matches += 1 if specificity_utils.isProper(antecedent_np): nouns[text].prp_ante += 1 elif specificity_utils.isNominal(antecedent_np): nouns[text].nom_ante += 1 elif specificity_utils.isPronoun(antecedent_np): nouns[text].pro_ante += 1 else: #this guy starts the chain nouns[text].starts_chain += 1
#sys.stdout.flush() #sys.stdout.write("\r") #prog = ProgressBar(len(files)) i = 0 for f in files: #prog.update_time(i) #sys.stdout.write("\r%s" % (str(prog))) #sys.stdout.flush() i += 1 #read in the nps nps = reconcile.getNPs(f) sentences = reconcile.getSentences(f) #see which nps correspond to these heads for np in nps: np_text = utils.textClean(np.getText()) np_head = specificity_utils.getHead(np_text) if np_head in heads: #print "{0:35} => {1}".format(np_text, np_head) head2nouns[np_head].addDoc(f) head2nouns[np_head].addText(np_text) head2nouns[np_head].count += 1 head2nouns[np_head].addDefinite(np) if np["GRAMMAR"] == "SUBJECT": head2nouns[np_head].subj += 1 elif np["GRAMMAR"] == "OBJECT": head2nouns[np_head].dobj += 1 np_sentence = getSentence(np, sentences)
def word_distance(self, ant, wd):
    """Record word distance `wd`, keyed by the antecedent's cleaned,
    lowercased text."""
    key = utils.textClean(ant.getText().lower()).strip()
    self.word_distances[key].append(wd)
sys.stderr.flush() j += 1 this_files_faux_pronouns = [] tokens = reconcile.getTokens(f) pos = reconcile.getPOS(f) #read in all possible quasi pronouns if ACE: ace_annots = reconcile.parseGoldAnnots(f) faux_pronouns = collectACEFPs(ace_annots) else: nps = reconcile.getNPs(f) faux_pronouns = collectFPs(nps, pos) for fp in faux_pronouns: text = utils.textClean(fp.getText()).lower().strip() if ACE: head = fp["HEAD"].lower().strip() else: np_tags = pos.getSubset(fp.getStart(), fp.getEnd()) head = qp_utils.getHead2(text, np_tags).lower().strip() #NOTE select what types of modification to allow on QPs here. #select only qps and bare definites #if (head in qps) and bareDefinite(text, head): # this_files_faux_pronouns.append(fp) #all QPs regardless of modification if head in qps: this_files_faux_pronouns.append(fp)
sys.exit(0)

# Build the list of documents from the file given on the command line,
# skipping commented-out entries.
files = []
with open(sys.argv[1], 'r') as inFile:
    files.extend([x for x in inFile.readlines() if not x.startswith("#")])

# Collect the distinct head nouns of all nominal/pronominal/date NPs.
heads = []
for f in files:
    if f.startswith("#"):
        continue
    f = f.strip()
    #NOTE will need to get all NPs eventually, not just gold
    nps = reconcile.getNPs(f)
    for np in nps:
        if specificity_utils.isNominal(np) or \
                specificity_utils.isPronoun(np) or \
                np["DATE"] != "NONE":
            head = specificity_utils.getHead(
                utils.textClean(np.getText()).lower())
            # skip conjoined heads
            if head.find(" and ") > -1:
                continue
            if head not in heads:
                heads.append(head)

# Write the collected heads, one per line, with the source list as a header.
with open("nouns.txt", 'w') as outFile:
    outFile.write("#" + sys.argv[1] + "\n")
    for head in heads:
        outFile.write("{0}\n".format(head))
# Creation Date : 08-27-2013 # Last Modified : Wed 28 Aug 2013 03:08:42 PM MDT # Created By : Nathan Gilbert # import sys import en from pyconcile import reconcile from pyconcile import utils import specificity_utils if __name__ == "__main__": if len(sys.argv) < 2: print("Usage: %s <dir>" % (sys.argv[0])) sys.exit(1) #read in the nps nps = reconcile.getNPs(sys.argv[1]) for np in nps: if specificity_utils.isProper(np) or specificity_utils.isPronoun(np): continue head = specificity_utils.getHead(utils.textClean(np.getText())) print("{0} => {1}".format(np.pprint(), head)) print(en.noun.senses(head)) print(en.noun.hypernyms(head, sense=0)) print("=" * 30) #read in named entities and/or read the #fire up wordnet
head2correct = {} #heads that have a correct antecedent. head2wrong = {} #heads that have an incorrect antecedent. appears in a #chain with nothing else it is coreferent with head2none = {} #heads that were not given an antecedent. --focus on #this one first head2counts = {} outputfile = "/features.goldnps/predictions.StanfordSieve.stanfordsieve/SingleLink" for f in files: #gather all the chains that were generated by the system. gold_chains = reconcile.getGoldChains(f) response_chains = reconcile.getResponseChains(f, outputfile) nps = reconcile.getNPs(f) for np in nps: head = specificity_utils.getHead(utils.textClean( np.getText())).lower() if head in heads: #this is the number of times that a NP appeared in a doc head2counts[head] = head2counts.get(head, 0) + 1 #print "{0} : {1}".format(np.pprint(), head) #for chain in response_chains: # if len(response_chains[chain]) > 1: # for mention in response_chains[chain]: # print mention.pprint() # print #find all the gold vps that were not assigned any cluster. for chain in list(response_chains.keys()): if len(response_chains[chain]) == 1: mention = response_chains[chain][0]
def sent_distance(self, ant, sd):
    """Record sentence distance `sd`, keyed by the antecedent's cleaned,
    lowercased text."""
    key = utils.textClean(ant.getText().lower()).strip()
    self.sent_distances[key].append(sd)
for f in files: if f.startswith("#"): continue #prog.update_time(i) #sys.stdout.write("\r%s" % (str(prog))) #sys.stdout.flush() i += 1 f = f.strip() doc = Document(f) gold_nps = reconcile.getNPs(f) gold_chains = reconcile.getGoldChains(f) doc.addGoldChains(gold_chains) for np in gold_nps: text = utils.textClean(np.getText().lower()).strip() if TRUE_PRONOUNS: if text in TRUE: add_stats(text, np, doc, nouns, head2text) else: if specificity_utils.isNominal(np): #head = getHead(text) #if head.endswith("%"): continue #skip percents #if head[-1].isdigit(): continue #skip numbers #if utils.isConj(head): continue #just skip these guys too add_stats(text, np, doc, nouns, head2text) #sys.stdout.write("\r \r\n") #sorted_nouns = sorted(nouns, key=operator.attrgetter('count'), reverse=True) #sorted_nouns = sorted(nouns.values(), key=operator.attrgetter('count'), reverse=True) #print sorted_nouns
#in case there were no predictions in a file try: all_pairs = reconcile.getFauxPairs(f, predictions) except: continue response_pairs = [] for pair in all_pairs: if pair[0] is None or pair[1] is None: continue response_pairs.append(pair) for pair in response_pairs: ana_head = specificity_utils.getHead( utils.textClean(pair[1].getText())).lower().strip() #skip real pronouns if ana_head not in data.ALL_PRONOUNS: pairs.append(pair) if ana_head not in list(heads2nouns.keys()): heads2nouns[ana_head] = Noun(ana_head) else: heads2nouns[ana_head].updateCount() labeled_annots = reconcile.labelCorrectPairs(gold_chains, pairs) for pair in labeled_annots: total_scores["vps_guessed"] += 1 ana_head = specificity_utils.getHead( utils.textClean(pair[1].getText())).lower().strip() if pair[2]: #correct pair
break outdir = "{0}/{1}/{2}".format(f, features_dir, predictions_dir) if not os.path.exists(outdir): os.makedirs(outdir) #output in some format easy for counting the accuracy. with open(outdir + "/faux.predictions", 'w') as outfile: outfile.write("#" + f + "\n") for res in resolutions: antecedent = res[0] anaphor = res[1] sem = res[2] outfile.write("{0},{1}\t{2},{3}\t{4} <- {5}\tSEM:{6}\n".format( antecedent.getStart(), antecedent.getEnd(), anaphor.getStart(), anaphor.getEnd(), utils.textClean(antecedent.getText()), utils.textClean(anaphor.getText()), sem)) #output in some format easy for counting the accuracy. with open(outdir + "/pronoun.predictions", 'w') as outfile: outfile.write("#" + f + "\n") for res in pronoun_resolutions: antecedent = res[0] anaphor = res[1] outfile.write("{0},{1}\t{2},{3}\t{4} <- {5}\n".format( antecedent.getStart(), antecedent.getEnd(), anaphor.getStart(), anaphor.getEnd(), utils.textClean(antecedent.getText()), utils.textClean(anaphor.getText())))
def process(f, head2qp, annotated_file):
    """Collect quasi-pronoun statistics for one document.

    Two modes: when `annotated_file` is true, NPs come from the gold
    (reconcile) annotations; otherwise from the Stanford parser output.
    In PRONOUNS mode only pronouns are kept, otherwise only nominals.
    Each kept NP's head is tallied in `head2qp` (head -> QuasiPronoun)
    and passed to process_syntax (and, for gold + USE_GOLD, process_gold).

    f              -- document path/name for the reconcile loaders
    head2qp        -- dict of head string -> QuasiPronoun, mutated in place
    annotated_file -- True for gold annotations, False for Stanford NPs
    """
    stanford_deps = reconcile.getStanfordDep(f)
    pos = reconcile.getPOS(f)
    if annotated_file:
        nps = reconcile.getNPs(f)
        for np in nps:
            head = None
            text = None
            if PRONOUNS:
                if qp_utils.isPronoun(np):
                    head = np.getText().lower()
                    text = np.getText()
                else:
                    continue
            else:
                if qp_utils.isNominal(np, pos):
                    text = utils.textClean(np.getText())
                    # head is found from the NP's own POS tags
                    np_tags = pos.getSubset(np.getStart(), np.getEnd())
                    head = utils.textClean(
                        qp_utils.getHead2(text.lower(), np_tags))
                else:
                    continue
            #bookkeeping
            if head not in list(head2qp.keys()):
                head2qp[head] = QuasiPronoun(head)
                head2qp[head].updateCount(True)
                head2qp[head].updateDocs(f, True)
            else:
                head2qp[head].updateDocs(f, True)
                head2qp[head].updateCount(True)
            if USE_GOLD:
                gold_chains = reconcile.getGoldChains(f)
                process_gold(f, np, head, text, head2qp, gold_chains)
            process_syntax(f, np, head, text, head2qp, stanford_deps)
    else:
        stanford_nps = reconcile.getStanfordNPs(f)
        for np in stanford_nps:
            if PRONOUNS:
                if np["is_pronoun"]:
                    head = np.getText().lower()
                    text = np.getText()
                else:
                    continue
            else:
                #skip some problems with the parser or numbers
                if np["HEAD"].startswith("$") or np["HEAD"].endswith(
                        "%") or np["HEAD"] == ".":
                    continue
                if np["is_nominal"]:
                    text = utils.textClean(np.getText())
                    head = np["HEAD"].lower()
                else:
                    continue
            #bookkeeping
            # second flag False marks these counts as parser-derived
            if head not in list(head2qp.keys()):
                head2qp[head] = QuasiPronoun(head)
                head2qp[head].updateDocs(f, False)
                head2qp[head].updateCount(False)
            else:
                head2qp[head].updateDocs(f, False)
                head2qp[head].updateCount(False)
            process_syntax(f, np, head, text, head2qp, stanford_deps)
heads2nps = defaultdict(list) i = 0 for f in files: prog.update_time(i) i += 1 sys.stderr.write("\r%s" % (str(prog))) sys.stderr.flush() f = f.strip() nps = reconcile.getNPs(f) for np in nps: if specificity_utils.isNominal(np) or \ specificity_utils.isPronoun(np) or \ np["DATE"] != "NONE": head = specificity_utils.getHead( utils.textClean(np.getText()).lower()) if head.find(" and ") > -1: continue if head in heads: heads2nps[head].append(utils.textClean(np.getText())) sys.stderr.write("\r \r\n") for head in list(heads2nps.keys()): counts = {} definite = False for np in heads2nps[head]: if np == head: continue if np == "the " + head or np == "a " + head or np == "that " + head: definite = True continue
ana_head = specificity_utils.getHead(pair[1].getText()).lower() if ana_head in FAUX_PRONOUNS: if ana_head not in list(tracked_nouns.keys()): tracked_nouns[ana_head] = Noun(ana_head) tracked_pairs.append(pair) #label the correct or incorrect pairs labeled_faux_pairs = reconcile.labelCorrectPairs(gold_chains, tracked_pairs) for lpair in labeled_faux_pairs: ana_head = specificity_utils.getHead(lpair[1].getText()).lower() key = "{0}:{1}:{2}".format(f, lpair[1].getStart(), lpair[1].getEnd()) tracked_nouns[ana_head].instances[key] = utils.textClean(lpair[1].getText()) tracked_nouns[ana_head].antecedents[key] = utils.textClean(lpair[0].getText()) tracked_nouns[ana_head].labels[key] = lpair[2] #this is an incorrect antecedent if not lpair[2]: closest_true_antecedent = closest_antecedent(gold_chains, lpair[1]) #deals with sentence distance resp_ant_sent = getAnnotSentenceNum(sentences, lpair[1]) true_ant_sent = getAnnotSentenceNum(sentences, closest_true_antecedent) ana_sent = getAnnotSentenceNum(sentences, lpair[1]) if closest_true_antecedent is not None: true_dist = ana_sent - true_ant_sent