Example #1
def instantiation_factors(doc, schema):
    ""
    # extract caps from schema
    events, chains = schema["raw_schema"]

    schema_caps = []
    for slots, args in chains:
        slots = sorted(slots)
        for i, slot in enumerate(slots):
            for tlot in slots[i:]:
                schema_caps.append((slot, tlot))

    # extract caps from doc
    doc_caps = [cap for cap, t in doc["freq_joint"] if cap in schema_caps]

    CAP = "CAP"
    counts = ft.histo(ft.indexBy(CAP, [{CAP: x} for x in doc_caps]))

    return counts
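The nested loop above pairs each sorted slot with every slot at or after it, so a slot can also be paired with itself (the inner loop starts at slots[i:], not slots[i+1:]). A standalone sketch of just that pairing step, using only the standard library and hypothetical slot names:

from itertools import combinations_with_replacement

# The pair-building loop in instantiation_factors is equivalent to taking
# combinations with replacement of size 2 over the sorted slot list.
slots = sorted(["patient", "agent", "location"])  # hypothetical slot names
schema_caps = list(combinations_with_replacement(slots, 2))
# -> [('agent', 'agent'), ('agent', 'location'), ('agent', 'patient'),
#     ('location', 'location'), ('location', 'patient'), ('patient', 'patient')]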
Example #2
            docs = ft.tag(
                docs, "tokenized_%s" % tag,
                lambda N: [toker.tokenize(n) if n else "*NONE*" for n in N],
                [tag])

        docs = ft.tag(docs, "entities", lambda P, L, O: P + L + O,
                      ["tokenized_%s" % tag for tag in metatags])
        for doc in docs:
            if not doc["entities"]:
                print "No entities for this document."
                continue

            doc = CNLP.prep_tokens(doc)
            entities = NNP_entities(doc)
            entities = [{"entity": e} for e in entities]
            entcounts = ft.histo(ft.indexBy("entity", entities))

            entcounts = sorted([(-c, e) for e, c in entcounts.items()])
            entcounts = entcounts[:n]  # or: entcounts[:int(avg_doc_entlen)]
            entities = [e for c, e in entcounts]

            print doc["doc-id"]
            pprint(entities)
            pprint(doc["entities"])

            eq_RE = entity_set_eq(entities, doc["entities"])
            eq_ER = entity_set_eq(doc["entities"], entities)

            FP = len(entities) - eq_RE
            FN = len(doc["entities"]) - eq_ER
            quasiTP = entity_intersect(entities, doc["entities"])
Example #3
from pprint import pprint
from copy import deepcopy
import ft

# load_csv test

data = ft.load_csv(open("tests/mid_e_verb_morph.csv"))

# PIPE test
print("ft.summary(data): ")
pprint(ft.summary(data))
print("*"*50)
dex = ft.indexBy("suf", data, pipe=lambda s: s[0] if s else s)
pprint(dex)
print("^^^Indexed by first letter of suf^^^")
print("Counts")
pprint(ft.histo(dex))


# Singleton test
foo = ft.singletons("i", range(1,10))
foodex = ft.indexBy("i", foo)
print("Singletons")
pprint(foodex)

# Merge test
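# Keep an untouched copy of the rows before merging (presumably for comparison).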
premerged_data = deepcopy(data)
merged_data = ft.merge(data, differs=["tns","prs","num","mud"])
print()
print("Merge result:")
pprint(merged_data)
print()
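For reference, the pipe keyword used in the indexBy test above keys the index by a transformed field value (here, the first letter of suf). A toy stand-in using only the standard library, under the assumption that ft.histo over an ft.indexBy dex reports the size of each group; the rows below are hypothetical:

from collections import Counter

# Toy rows standing in for the CSV data.
rows = [{"suf": "en"}, {"suf": "es"}, {"suf": "ath"}]
first_letter_counts = Counter((r["suf"][0] if r["suf"] else r["suf"]) for r in rows)
# -> Counter({'e': 2, 'a': 1}), mirroring ft.histo(dex) for these rows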
Example #4
from reuters_reader import rcv1
import sys
import ft
from pprint import pprint

docs = []
doc_counts = 0
docs_no_topic = 0
for doc in rcv1.reader(sys.argv[1]):
    doc_counts += 1
    if "bip:topics:1.0" in doc:
        docs.append(doc)
    else:
        docs_no_topic += 1

print "Total docs:", doc_counts
print "Docs w/o topic:", docs_no_topic

print "Counting topics..."
pprint(ft.histo(ft.multidex(docs, "bip:topics:1.0")))
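As a cross-check on the topic tally above, the same counts can be sketched with the standard library, given the docs list built in this script and assuming each document's "bip:topics:1.0" field holds a list of topic codes and that ft.histo(ft.multidex(...)) produces one count per code:

from collections import Counter

# Stand-in for ft.histo(ft.multidex(docs, "bip:topics:1.0")) under the
# assumptions stated above.
topic_counts = Counter(t for doc in docs for t in doc["bip:topics:1.0"])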
Example #5
import os
import nltk.corpus
import ft
import matplotlib.pyplot as plt
import numpy as np

europarl_path = "../../nltk_data/corpora/europarl_raw/"
path, langs, _ = next(os.walk(europarl_path))  # one subdirectory per language

lang_word_counts = {}
rank_counts = {}
for lang in langs:
    reader = nltk.corpus.EuroparlCorpusReader(europarl_path+lang, ".*")
    lang_word_counts[lang] = ft.histo(
                                ft.indexBy("word", 
                                    ft.singletons("word",
                                        [w for w in reader.words()])))
    rank_counts[lang] = sorted(lang_word_counts[lang].items(), key = lambda x: -x[1])

for lang in rank_counts:
    plt.loglog(range(1, len(rank_counts[lang])+1), 
             [c for w,c in rank_counts[lang]], label = lang)
plt.legend()
plt.show()
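The script imports numpy but never uses it. A small, hypothetical extension that would put it to use: estimate each language's Zipf exponent with a least-squares line fit in log-log space over the rank_counts built above (a sketch, not part of the original script):

import numpy as np

def zipf_slope(ranked):
    # ranked is a list of (word, count) pairs sorted by descending count,
    # as in rank_counts[lang] above.
    ranks = np.arange(1, len(ranked) + 1)
    counts = np.array([c for _, c in ranked], dtype=float)
    slope, _ = np.polyfit(np.log(ranks), np.log(counts), 1)
    return slope  # close to -1 for Zipf-like distributions

# e.g. {lang: zipf_slope(rc) for lang, rc in rank_counts.items()}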




Example #6
def condense_c(data):
    "Turns data into a structure ready for approximate_p"
    data_table = [{"pair": d} for d in data] #creates a free table
    datadex = ft.indexBy("pair", data_table) #creates a dex
    return ft.histo(datadex) #converts the dex to counts
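Assuming ft.histo over an ft.indexBy dex maps each distinct value to its frequency, the counts condense_c produces can be sketched with collections.Counter:

from collections import Counter

def condense_c_counter(data):
    # Stand-in for condense_c under the assumption above: one count per
    # distinct pair in data.
    return dict(Counter(data))

# condense_c_counter([("a", "b"), ("a", "b"), ("c", "d")])
# -> {("a", "b"): 2, ("c", "d"): 1}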
Example #7
def coref_type(coref_bundle, sentences, pref_types = False):
    """
    Determines the argument filler type as per Chambers and Jurafsky 2009, with
    some variants.

    (0) If possible, it prefers spec_types. It looks for these, left of the 
        head nouns in each DP. If 
    (1) Next, it tries the named entity recognizer for unambiguous results.
    (2) Next, it uses the single most frequent non-pronominal lemma.
    (3) Next, it uses the pronouns to make a last ditch choice.
    (4) In a last ditch effort, it chooses the shortest head noun in the 
        coreference chain.
    (5) At this point, it concludes there's no valid type.

    Args:
        coref = a flattened coreference chain
        sentences = the CoreNLP output "sentences"
        pref_types = default False, 
                     if specified, a tuple
                     (target_words, whole_dps)
                     where
                        target_words = a list of preferred head nouns
                        whole_dps = the rest of each dp outside its coref
    """

    max_counts = lambda D: max_set(D.items(), lambda x: x[1])
    # pre-requirement
    coref, c_raw = coref_bundle
    try:
        cnlps = get_cnlp(coref, sentences)
    except Exception:
        return TYPE_ERROR
    #print "\n\n"
    #pprint(coref) 
    
    #(0) Apply the pre-counted list of HNs. If one of the most common head 
    #       nouns appears left of the head, it is the type.
    if pref_types:
        #Get the proper nouns left of the head
        dp_wholes = coref_kinda_flat(c_raw)
        
        head_spaces = get_head_space(coref, sentences, dp_wholes)
        head_squished = [w for s,i,w in sum(head_spaces,[])]
        #head_squished = [w for w in head_squished if pref_types(w["Lemma"])]
        head_squished = [w for w in head_squished if pref_types(w)]
        
        lemmas = ft.histo(ft.indexBy("Lemma", head_squished))
        hell_yea = max_counts(lemmas)

        if len(hell_yea) == 1:
            return hell_yea[0][0].lower()

    #(1-X) preparation 
    # we gotta dig deeper for these

    #get corenlp output for each word, and get the parts out we want
    NEcounts = ft.histo(ft.indexBy("NamedEntityTag", cnlps))
    Lcounts_all = ft.histo(ft.indexBy("Lemma", cnlps))
    Lcounts = dict([l for l in Lcounts_all.items() if l[0] not in pn_lemmas])

    #get the max_set of Named Entity counts and Lemma counts
    NE_crap = {"O", "MISC"}
    NE_crapless = dict([l for l in NEcounts.items() if l[0] not in NE_crap])
    NE_max = max_counts(NE_crapless)
    L_max = max_counts(Lcounts)
    
    #head noun counting
    # (this is done as a weird side effect--a slightly cleaned up version is
    #   applied as the first step here.)
    temp = singlitate(L_max)
    HN_counter[temp] = HN_counter.get(temp, 0) + 1

    #pprint(NE_max)
    #pprint(Lcounts)
    #print "\n"

    #Data extraction is finally done.


    #(1) If we have a solid NE instance, return that.
    NE_max = singlitate(NE_max)
    NE_max = NE_max if NE_max not in NE_crap else False
    if NE_max: return NE_max
    
    
    #(2) Is there a single most frequent, non-pronominal lemma? Return that.
    L_max = singlitate(L_max)
    if L_max: return L_max.lower()

    #(3) We're really hurting now. Try to build a type based on the pronouns.
    L_max_pn = set([pn for pn, c in max_counts(Lcounts_all)])
    #L_max_pn = singlitate(L_max_pn)

    #THIS NEEDS SOME TWEAKING
    for pn_tag in pronoun_groups:
        if L_max_pn & pronoun_groups[pn_tag]: return pn_tag

    #EMERGENCY ATTEMPT: note that returning here short-circuits steps (4)
    # and (5) below.
    return "THINGY"

    #(4) Real desperate here.
    # take the shortest head, all lowercase:
    if L_max_pn:
        worst = min(L_max_pn, key=len).lower()
        print("Selecting, possibly, the worst possible choice")
        print(L_max_pn)
        print(worst)
        print("\n")
        return worst


    #(5) Uh-oh
    print("WARNING: No type for this poor fellow:")
    pprint(coref)
    pprint(NEcounts)
    pprint(NE_max)
    pprint(Lcounts)
    pprint(L_max)
    print("oh, and")
    pprint(L_max_pn)
    print("\n")
    return "NO-VALID-TYPE"