def instantiation_factors(doc, schema):
    """Count how often each of a schema's slot pairs (caps) is
    instantiated in a document."""
    # extract caps (slot pairs) from the schema
    events, chains = schema["raw_schema"]
    schema_caps = []
    for slots, args in chains:
        slots = sorted(slots)
        for i, slot in enumerate(slots):
            for tlot in slots[i:]:
                schema_caps.append((slot, tlot))
    # extract caps from the doc, keeping only those the schema licenses
    doc_caps = [cap for cap, t in doc["freq_joint"] if cap in schema_caps]
    CAP = "CAP"
    counts = ft.histo(ft.indexBy(CAP, [{CAP: x} for x in doc_caps]))
    return counts
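# A minimal sketch of the indexBy/histo counting idiom the function ends
# with, assuming ft.indexBy(key, rows) groups single-field rows by their
# key value and ft.histo maps each group to its size (both inferred from
# their use in this repo). The tuples below are hypothetical caps.
#
#   rows = [{"CAP": ("arrest", "charge")},
#           {"CAP": ("arrest", "charge")},
#           {"CAP": ("arrest", "convict")}]
#   ft.histo(ft.indexBy("CAP", rows))
#   # assumed result: {("arrest", "charge"): 2, ("arrest", "convict"): 1}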
for tag in metatags:
    docs = ft.tag(docs, "tokenized_%s" % tag,
                  lambda N: [toker.tokenize(n) if n else "*NONE*" for n in N],
                  [tag])
docs = ft.tag(docs, "entities",
              lambda P, L, O: P + L + O,
              ["tokenized_%s" % tag for tag in metatags])
for doc in docs:
    if not doc["entities"]:
        print "No entities for this document."
        continue
    doc = CNLP.prep_tokens(doc)
    # extract candidate entities and keep the n most frequent
    entities = NNP_entities(doc)
    entities = [{"entity": e} for e in entities]
    entcounts = ft.histo(ft.indexBy("entity", entities))
    entcounts = sorted([(-c, e) for e, c in entcounts.items()])
    entcounts = entcounts[:n]  # int(avg_doc_entlen)
    entities = [e for c, e in entcounts]
    print doc["doc-id"]
    pprint(entities)
    pprint(doc["entities"])
    # compare recovered entities against the document's gold entity set
    eq_RE = entity_set_eq(entities, doc["entities"])
    eq_ER = entity_set_eq(doc["entities"], entities)
    FP = len(entities) - eq_RE
    FN = len(doc["entities"]) - eq_ER
    quasiTP = entity_intersect(entities, doc["entities"])
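# Hedged sketch of the ft.tag idiom used above, as inferred from these two
# calls: ft.tag(docs, name, fn, cols) presumably adds a column `name` to
# each row, computed by applying fn to that row's values for `cols`. The
# row data below is hypothetical.
#
#   rows = [{"persons": ["Alice"], "locations": ["Paris"]}]
#   rows = ft.tag(rows, "mentions", lambda P, L: P + L,
#                 ["persons", "locations"])
#   # assumed: rows[0]["mentions"] == ["Alice", "Paris"]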
import ft
from pprint import pprint
from copy import deepcopy

# load_csv test
data = ft.load_csv(open("tests/mid_e_verb_morph.csv"))

# PIPE test
print("ft.summary(data): ")
pprint(ft.summary(data))
print("*" * 50)
dex = ft.indexBy("suf", data, pipe=lambda s: s[0] if s else s)
pprint(dex)
print("^^^Indexed by first letter of suf^^^")
print("Counts")
pprint(ft.histo(dex))

# Singleton test
foo = ft.singletons("i", range(1, 10))
foodex = ft.indexBy("i", foo)
print("Singletons")
pprint(foodex)

# Merge test
premerged_data = deepcopy(data)
merged_data = ft.merge(data, differs=["tns", "prs", "num", "mud"])
print()
print("Merge result:")
pprint(merged_data)
print()
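# Hedged restatement of two features exercised above, drawn only from the
# test's own output labels: pipe presumably preprocesses the key value
# before indexing (here, taking the first letter of "suf"), and merge
# presumably collapses rows that agree on every field outside `differs`.
pipe_demo = ft.indexBy("suf",
                       [{"suf": "en"}, {"suf": "es"}],
                       pipe=lambda s: s[0] if s else s)
print("Pipe demo (both rows assumed to land under 'e'):")
pprint(pipe_demo)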
from reuters_reader import rcv1
import sys
import ft
from pprint import pprint

# Count RCV1 docs with and without topic codes, then histogram the topics.
docs = []
doc_counts = 0
docs_no_topic = 0
for doc in rcv1.reader(sys.argv[1]):
    doc_counts += 1
    if "bip:topics:1.0" in doc:
        docs.append(doc)
    else:
        docs_no_topic += 1

print "Total docs:", doc_counts
print "Docs w/o topic:", docs_no_topic
print "Counting topics..."
pprint(ft.histo(ft.multidex(docs, "bip:topics:1.0")))
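# Hedged sketch of the multidex semantics relied on above (inferred, not
# confirmed): unlike indexBy, which files a row under its single key value,
# multidex presumably files a row under each element of a list-valued
# field, so histo then yields per-topic document counts. Toy data below.
#
#   toy = [{"bip:topics:1.0": ["C15", "M11"]},
#          {"bip:topics:1.0": ["C15"]}]
#   ft.histo(ft.multidex(toy, "bip:topics:1.0"))
#   # assumed: {"C15": 2, "M11": 1}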
import os
import nltk.corpus
import ft
import matplotlib.pyplot as plt
import numpy as np

europarl_path = "../../nltk_data/corpora/europarl_raw/"
path, langs, crap = os.walk(europarl_path).next()

# count word frequencies per language, then rank them by frequency
lang_word_counts = {}
rank_counts = {}
for lang in langs:
    reader = nltk.corpus.EuroparlCorpusReader(europarl_path + lang, ".*")
    lang_word_counts[lang] = ft.histo(
        ft.indexBy("word", ft.singletons("word", [w for w in reader.words()])))
    rank_counts[lang] = sorted(lang_word_counts[lang].items(),
                               key=lambda x: -x[1])

# Zipf plot: log rank vs. log frequency for each language
for lang in rank_counts:
    plt.loglog(range(1, len(rank_counts[lang]) + 1),
               [c for w, c in rank_counts[lang]],
               label=lang)
plt.legend()
plt.show()
def condense_c(data):
    "Turns data into a structure ready for approximate_p."
    data_table = [{"pair": d} for d in data]  # creates a free table
    datadex = ft.indexBy("pair", data_table)  # creates a dex
    return ft.histo(datadex)                  # converts the dex to counts
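# Usage sketch (hypothetical data; assumes, per the tests in this repo,
# that histo over an indexBy dex yields a {key: count} mapping):
#
#   condense_c([("a", "b"), ("a", "b"), ("b", "c")])
#   # assumed: {("a", "b"): 2, ("b", "c"): 1}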
def coref_type(coref_bundle, sentences, pref_types=False):
    """
    Determines the argument filler type as per Chambers and Jurafsky 2009,
    with some variants.

    (0) If possible, it prefers pref_types, looking for these left of the
        head noun in each DP.
    (1) Next, it tries the named entity recognizer for unambiguous results.
    (2) Next, it uses the single most frequent non-pronominal lemma.
    (3) Next, it uses the pronouns to make a choice.
    (4) In a last-ditch effort, it chooses the shortest head noun in the
        coreference chain.
    (5) At this point, it concludes there's no valid type.

    Args:
        coref_bundle = (coref, c_raw): a flattened coreference chain and
            its raw counterpart
        sentences = the CoreNLP output "sentences"
        pref_types = default False; if specified, a predicate over words
            that selects preferred head nouns
    """
    max_counts = lambda D: max_set(D.items(), lambda x: x[1])

    # pre-requirement
    coref, c_raw = coref_bundle
    try:
        cnlps = get_cnlp(coref, sentences)
    except Exception:
        return TYPE_ERROR

    # (0) Apply the pre-counted list of head nouns. If one of the most
    # common head nouns appears left of the head, it is the type.
    if pref_types:
        # get the words left of each head and keep the preferred ones
        dp_wholes = coref_kinda_flat(c_raw)
        head_spaces = get_head_space(coref, sentences, dp_wholes)
        head_squished = [w for s, i, w in sum(head_spaces, [])]
        head_squished = [w for w in head_squished if pref_types(w)]
        lemmas = ft.histo(ft.indexBy("Lemma", head_squished))
        hell_yea = max_counts(lemmas)
        if len(hell_yea) == 1:
            return hell_yea[0][0].lower()

    # (1-4) preparation: we have to dig deeper for these. Get the CoreNLP
    # output for each word and pull out the parts we want.
    NEcounts = ft.histo(ft.indexBy("NamedEntityTag", cnlps))
    Lcounts_all = ft.histo(ft.indexBy("Lemma", cnlps))
    Lcounts = dict([l for l in Lcounts_all.items() if l[0] not in pn_lemmas])

    # get the max_set of named entity counts and lemma counts
    NE_crap = {"O", "MISC"}
    NE_crapless = dict([l for l in NEcounts.items() if l[0] not in NE_crap])
    NE_max = max_counts(NE_crapless)
    L_max = max_counts(Lcounts)

    # head noun counting
    # (this is done as a weird side effect--a slightly cleaned up version is
    # applied as the first step here)
    temp = singlitate(L_max)
    if temp in HN_counter:
        HN_counter[temp] += 1
    else:
        HN_counter[temp] = 1

    # Data extraction is finally done.
    # (1) If we have a solid NE instance, return that.
    NE_max = singlitate(NE_max)
    NE_max = NE_max if NE_max not in NE_crap else False
    if NE_max:
        return NE_max

    # (2) Is there a single most frequent, non-pronominal lemma? Return that.
    L_max = singlitate(L_max)
    if L_max:
        return L_max.lower()

    # (3) We're really hurting now. Try to build a type from the pronouns.
    L_max_pn = set([pn for pn, c in max_counts(Lcounts_all)])
    for pn_tag in pronoun_groups:
        if L_max_pn & pronoun_groups[pn_tag]:
            return pn_tag

    # (4) Real desperate here: take the shortest head, all lowercase.
    if L_max_pn:
        worst = min(L_max_pn, key=lambda s: len(s)).lower()
        print "Selecting, possibly, the worst possible choice"
        print L_max_pn
        print worst
        print "\n"
        return worst

    # (5) Uh-oh.
    print "WARNING: No type for this poor fellow:"
    pprint(coref)
    pprint(NEcounts)
    pprint(NE_max)
    pprint(Lcounts)
    pprint(L_max)
    print "oh, and"
    pprint(L_max_pn)
    print "\n"
    return "NO-VALID-TYPE"
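# For readers tracing the tie-breaking above, a hedged sketch of the two
# helpers this function leans on, as inferred from their use here (their
# real definitions live elsewhere in this codebase and may differ):
#
#   def max_set(items, key):
#       "All entries tied for the maximum key value (assumed)."
#       best = max(key(x) for x in items) if items else None
#       return [x for x in items if key(x) == best]
#
#   def singlitate(pairs):
#       "The lone winner's key, or False if the max is not unique (assumed)."
#       return pairs[0][0] if len(pairs) == 1 else False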