def rels2rdf(ns, verbose=False):
    """
    Convert the reldicts extracted from the IEER corpus into an RDF graph.

    Builds the schema triples for the 'in' relation between Organizations
    and Locations, then adds one RDF statement per extracted relation.
    """
    graph = ConjunctiveGraph()
    # Namespace prefix bindings for readable serializations.
    bindings = [
        ('nltk', BASE),
        ('org', "http://nltk.org/terms/org#"),
        ('loc', "http://nltk.org/terms/loc#"),
        ('pred', "http://nltk.org/terms/pred#"),
        ('class', "http://nltk.org/terms/class#"),
    ]
    for prefix, uri in bindings:
        graph.bind(prefix, uri)
    in_uri = sym2uri(ns, 'pred', 'in')
    loc_uri = sym2uri(ns, 'class', 'Location')
    org_uri = sym2uri(ns, 'class', 'Organization')
    # Schema: 'in' is a property from Organization to Location.
    schema = [
        (in_uri, RDFNS.type, RDFSNS.Property),
        (loc_uri, RDFNS.type, RDFSNS.Class),
        (org_uri, RDFNS.type, RDFSNS.Class),
        (in_uri, RDFSNS.domain, org_uri),
        (in_uri, RDFSNS.range, loc_uri),
    ]
    for triple in schema:
        graph.add(triple)
    from nltk.corpus import ieer
    IN = re.compile(r'.*\bin\b(?!\b.+ing\b)')
    for fileid in ieer.fileids():
        for doc in ieer.parsed_docs(fileid):
            for reldict in extract_rels('ORG', 'LOC', doc, corpus='ieer', pattern=IN):
                graph.add(make_rdf(ns, reldict, relsym='in'))
                for triple in make_rdfs(ns, reldict):
                    graph.add(triple)
    return graph
def rels2rdf(ns, verbose=False):
    """
    Convert the reldicts derived from the IEER corpus in an RDF Graph.

    :param ns: namespace handle passed to ``sym2uri`` when minting URIs
    :param verbose: unused; kept for interface compatibility
    :return: a ``ConjunctiveGraph`` with the 'in' schema plus one triple set
        per extracted ORG-in-LOC relation
    """
    graph = ConjunctiveGraph()
    graph.bind('nltk', BASE)
    graph.bind('org', "http://nltk.org/terms/org#")
    graph.bind('loc', "http://nltk.org/terms/loc#")
    graph.bind('pred', "http://nltk.org/terms/pred#")
    graph.bind('class', "http://nltk.org/terms/class#")
    in_uri = sym2uri(ns, 'pred', 'in')
    loc_uri = sym2uri(ns, 'class', 'Location')
    org_uri = sym2uri(ns, 'class', 'Organization')
    graph.add((in_uri, RDFNS.type, RDFSNS.Property))
    graph.add((loc_uri, RDFNS.type, RDFSNS.Class))
    graph.add((org_uri, RDFNS.type, RDFSNS.Class))
    graph.add((in_uri, RDFSNS.domain, org_uri))
    graph.add((in_uri, RDFSNS.range, loc_uri))
    from nltk.corpus import ieer
    IN = re.compile(r'.*\bin\b(?!\b.+ing\b)')
    # `ieer.items` and `relextract(...)` are pre-NLTK-3 names; the current
    # API is `fileids()` and `extract_rels(..., corpus='ieer')` (as used by
    # the sibling version of this function in this file).
    for item in ieer.fileids():
        for doc in ieer.parsed_docs(item):
            for reldict in extract_rels('ORG', 'LOC', doc, corpus='ieer', pattern=IN):
                graph.add(make_rdf(ns, reldict, relsym='in'))
                for triple in make_rdfs(ns, reldict):
                    graph.add(triple)
    return graph
def in_demo(trace=0, sql=True):
    """
    Select pairs of organizations and locations whose mentions occur with
    an intervening occurrence of the preposition "in".

    If the sql parameter is set to True, then the entity pairs are loaded
    into an in-memory database, and subsequently pulled out using an SQL
    "SELECT" query.
    """
    from nltk.corpus import ieer
    if sql:
        try:
            import sqlite3
            connection = sqlite3.connect(":memory:")
            connection.text_factory = sqlite3.OptimizedUnicode
            cur = connection.cursor()
            cur.execute("""create table Locations (OrgName text, LocationName text, DocID text)""")
        except ImportError:
            import warnings
            warnings.warn("Cannot import sqlite; sql flag will be ignored.")
    IN = re.compile(r'.*\bin\b(?!\b.+ing)')
    # Python 2 `print` statements converted to the print() function.
    print()
    print("IEER: in(ORG, LOC) -- just the clauses:")
    print("=" * 45)
    for file in ieer.fileids():
        for doc in ieer.parsed_docs(file):
            if trace:
                print(doc.docno)
                print("=" * 15)
            for rel in extract_rels('ORG', 'LOC', doc, corpus='ieer', pattern=IN):
                print(show_clause(rel, relsym='IN'))
                if sql:
                    # NameError is raised here if `cur` was never created
                    # because the sqlite3 import failed above.
                    try:
                        rtuple = (rel['subjtext'], rel['objtext'], doc.docno)
                        cur.execute("""insert into Locations values (?, ?, ?)""", rtuple)
                        connection.commit()
                    except NameError:
                        pass
    if sql:
        try:
            cur.execute("""select OrgName from Locations where LocationName = 'Atlanta'""")
            print()
            print("Extract data from SQL table: ORGs in Atlanta")
            print("-" * 15)
            for row in cur:
                print(row)
        except NameError:
            pass
def rels2rdf(ns, verbose=False):
    """
    Convert the reldicts derived from the IEER corpus into an RDF graph.

    Declares the 'in' predicate (domain Organization, range Location) and
    then asserts one statement per ORG-in-LOC relation found in the corpus.
    """
    g = ConjunctiveGraph()
    add = g.add
    g.bind("nltk", BASE)
    g.bind("org", "http://nltk.org/terms/org#")
    g.bind("loc", "http://nltk.org/terms/loc#")
    g.bind("pred", "http://nltk.org/terms/pred#")
    g.bind("class", "http://nltk.org/terms/class#")
    pred_in = sym2uri(ns, "pred", "in")
    cls_loc = sym2uri(ns, "class", "Location")
    cls_org = sym2uri(ns, "class", "Organization")
    # Schema triples describing the 'in' relation.
    add((pred_in, RDFNS.type, RDFSNS.Property))
    add((cls_loc, RDFNS.type, RDFSNS.Class))
    add((cls_org, RDFNS.type, RDFSNS.Class))
    add((pred_in, RDFSNS.domain, cls_org))
    add((pred_in, RDFSNS.range, cls_loc))
    from nltk.corpus import ieer
    IN = re.compile(r".*\bin\b(?!\b.+ing\b)")
    for fname in ieer.fileids():
        for doc in ieer.parsed_docs(fname):
            rels = extract_rels("ORG", "LOC", doc, corpus="ieer", pattern=IN)
            for reldict in rels:
                add(make_rdf(ns, reldict, relsym="in"))
                for triple in make_rdfs(ns, reldict):
                    add(triple)
    return g
def parse_doc(filename, index):
    """
    Re-segment IEER document ``index`` of ``filename`` into sentences and
    return per-sentence lists of CoNLL-style (word, iob-tag) pairs.

    :param filename: IEER fileid, e.g. 'APW_19980314'
    :param index: document position within that file
    :return: list of sentences, each a list of (word, tag) pairs
    """
    docs = ieer.parsed_docs(filename)
    dt = docs[index].text
    words = dt.leaves()
    tags = tree2conll_without_postags(dt)
    rr = nltk.sent_tokenize(' '.join(words))
    # Manual fixes for known sentence mis-splits: merge the spilled
    # fragment into the previous sentence, then delete it by position.
    # `del rr[i]` replaces the old `rr.remove(rr[i])`, which removed the
    # first *equal* element and could delete the wrong sentence when the
    # document contains duplicate sentence strings.
    if filename == 'NYT_19980315' and index == 11:
        rr[8] = rr[8] + rr[9]
        del rr[9]
    if filename == 'NYT_19980407':
        if index == 4:
            rr[19] = rr[19] + rr[20]
            del rr[20]
        if index == 13:
            rr[9] = rr[9] + rr[10]
            del rr[10]
    L = get_breaks(words, rr)
    # Append the final boundary; otherwise the last sentence of the
    # document would be dropped.
    L.append(len(tags))
    tags = [tags[L[i]:L[i + 1]] for i in range(len(L) - 1)]
    return tags
def in_demo(trace=0, sql=True):
    """
    Select pairs of organizations and locations whose mentions occur with
    an intervening occurrence of the preposition "in".

    If the sql parameter is set to True, then the entity pairs are loaded
    into an in-memory database, and subsequently pulled out using an SQL
    "SELECT" query.
    """
    from nltk.corpus import ieer

    if sql:
        try:
            import sqlite3

            connection = sqlite3.connect(":memory:")
            connection.text_factory = sqlite3.OptimizedUnicode
            cur = connection.cursor()
            cur.execute(
                """create table Locations (OrgName text, LocationName text, DocID text)"""
            )
        except ImportError:
            import warnings

            warnings.warn("Cannot import sqlite; sql flag will be ignored.")

    IN = re.compile(r'.*\bin\b(?!\b.+ing)')
    print()
    print("IEER: in(ORG, LOC) -- just the clauses:")
    print("=" * 45)
    for fileid in ieer.fileids():
        for doc in ieer.parsed_docs(fileid):
            if trace:
                print(doc.docno)
                print("=" * 15)
            for reldict in extract_rels('ORG', 'LOC', doc, corpus='ieer', pattern=IN):
                print(clause(reldict, relsym='IN'))
                if sql:
                    # If the sqlite3 import failed, `cur` never came into
                    # existence and the NameError is silently swallowed.
                    try:
                        values = (reldict['subjtext'], reldict['objtext'], doc.docno)
                        cur.execute("""insert into Locations values (?, ?, ?)""", values)
                        connection.commit()
                    except NameError:
                        pass
    if sql:
        try:
            cur.execute("""select OrgName from Locations where LocationName = 'Atlanta'""")
            print()
            print("Extract data from SQL table: ORGs in Atlanta")
            print("-" * 15)
            for row in cur:
                print(row)
        except NameError:
            pass
def ieer_headlines():
    """Print the docno and headline tree of the first 20 IEER documents."""
    from nltk.corpus import ieer
    print("IEER: First 20 Headlines")
    print("=" * 45)
    headlines = []
    for fileid in ieer.fileids():
        for doc in ieer.parsed_docs(fileid):
            headlines.append((doc.docno, doc.headline))
    for docno, headline in headlines[:20]:
        print()
        print("%s:\n%s" % (docno, headline))
def ieer_headlines():
    """
    Print the headlines of the first 20 IEER documents, each labelled with
    its own document number.
    """
    from nltk.corpus import ieer
    from nltk.tree import Tree
    print("IEER: First 20 Headlines")
    print("=" * 45)
    # Pair each headline with its own docno inside the comprehension.
    # The old code collected only headlines and then read `doc.docno`
    # after the loop, so every headline was printed with the *last*
    # document's number. (Python 2 prints also converted to print().)
    trees = [(doc.docno, doc.headline) for file in ieer.fileids() for doc in ieer.parsed_docs(file)]
    for tree in trees[:20]:
        print()
        print("%s:\n%s" % tree)
def ieer_headlines():
    """Show document numbers and headline trees for the first 20 IEER docs."""
    from nltk.corpus import ieer
    from nltk.tree import Tree

    print("IEER: First 20 Headlines")
    print("=" * 45)
    pairs = [
        (doc.docno, doc.headline)
        for fileid in ieer.fileids()
        for doc in ieer.parsed_docs(fileid)
    ]
    for docno, headline in pairs[:20]:
        print()
        print("%s:\n%s" % (docno, headline))
def roles_demo(trace=0):
    """
    Print raw rtuples for PER-ORG "has_role" relations in the IEER corpus.

    With trace enabled, each document number is printed and left/right
    context is included in the rtuples.
    """
    from nltk.corpus import ieer
    # Verbose pattern matching a role word (optionally embedded in other
    # text) or an "X, of (the) Y" construction between the entities.
    roles = r"""
    (.*(                   # assorted roles
    analyst|
    chair(wo)?man|
    commissioner|
    counsel|
    director|
    economist|
    editor|
    executive|
    foreman|
    governor|
    head|
    lawyer|
    leader|
    librarian).*)|
    manager|
    partner|
    president|
    producer|
    professor|
    researcher|
    spokes(wo)?man|
    writer|
    ,\sof\sthe?\s*         # "X, of (the) Y"
    """
    ROLES = re.compile(roles, re.VERBOSE)
    print()
    print("IEER: has_role(PER, ORG) -- raw rtuples:")
    print("=" * 45)
    for fileid in ieer.fileids():
        for doc in ieer.parsed_docs(fileid):
            # Contexts are only shown when tracing.
            show_context = bool(trace)
            if trace:
                print(doc.docno)
                print("=" * 15)
            for reldict in extract_rels("PER", "ORG", doc, corpus="ieer", pattern=ROLES):
                print(rtuple(reldict, lcon=show_context, rcon=show_context))
def in_demo(trace=0):
    """
    Print clauses for ORG-LOC entity pairs whose filler text contains the
    preposition "in" (excluding e.g. "in building" via the lookahead).
    """
    from nltk.corpus import ieer
    IN = re.compile(r'.*\bin\b(?!\b.+ing\b)')
    # Python 2 prints converted; `ieer.files()` and `relextract(...)` are
    # pre-NLTK-3 names for `fileids()` and `extract_rels(..., corpus='ieer')`.
    print()
    print("in(ORG, LOC) -- just the clauses:")
    print("=" * 45)
    for file in ieer.fileids():
        for doc in ieer.parsed_docs(file):
            if trace:
                print(doc.docno)
                print("=" * 15)
            for rel in extract_rels('ORG', 'LOC', doc, corpus='ieer', pattern=IN):
                print(show_clause(rel, relsym='IN'))
def write_conll(filename):
    """
    Append every document of an IEER file to a CoNLL-style output file,
    one token-tab-tag pair per line, documents separated by -DOCSTART-.

    For example: filename = 'APW_19980314'
    """
    docs = ieer.parsed_docs(filename)
    conllfile = CONLL_WRITE + filename
    # 'a+' keeps any existing content and appends the new documents.
    with open(conllfile, 'a+') as nfd:
        for index in range(len(docs)):
            nfd.write('\n')
            nfd.write('-DOCSTART- -X- -X- O')
            nfd.write('\n')
            for sent in parse_doc(filename, index):
                nfd.write('\n')
                for iobword in sent:
                    nfd.write(iobword[0] + '\t' + iobword[1] + '\n')
def roles_demo(trace=0):
    """
    Print raw rtuples for PER-ORG "has_role" relations in the IEER corpus;
    with trace, also print each document number and include contexts.
    """
    from nltk.corpus import ieer
    # Raw string: the pattern contains `\s`, which is an invalid escape
    # sequence in a plain string literal (SyntaxWarning since Python 3.6,
    # an error in newer versions). The sibling roles_demo already uses r"".
    roles = r"""
    (.*(                   # assorted roles
    analyst|
    chair(wo)?man|
    commissioner|
    counsel|
    director|
    economist|
    editor|
    executive|
    foreman|
    governor|
    head|
    lawyer|
    leader|
    librarian).*)|
    manager|
    partner|
    president|
    producer|
    professor|
    researcher|
    spokes(wo)?man|
    writer|
    ,\sof\sthe?\s*         # "X, of (the) Y"
    """
    ROLES = re.compile(roles, re.VERBOSE)
    print()
    print("IEER: has_role(PER, ORG) -- raw rtuples:")
    print("=" * 45)
    for file in ieer.fileids():
        for doc in ieer.parsed_docs(file):
            lcon = rcon = False
            if trace:
                print(doc.docno)
                print("=" * 15)
                lcon = rcon = True
            for rel in extract_rels('PER', 'ORG', doc, corpus='ieer', pattern=ROLES):
                print(rtuple(rel, lcon=lcon, rcon=rcon))
def in_demo(trace=0, sql=True):
    """
    Select ORG-LOC pairs mediated by the preposition "in"; optionally load
    them into an in-memory SQLite table and query ORGs located in Atlanta.
    """
    from nltk.corpus import ieer
    if sql:
        import sqlite3
        connection = sqlite3.connect(":memory:")
        connection.text_factory = sqlite3.OptimizedUnicode
        cur = connection.cursor()
        cur.execute("""create table Locations (OrgName text, LocationName text, DocID text)""")
    IN = re.compile(r'.*\bin\b(?!\b.+ing\b)')
    # Python 2 prints converted; `ieer.files()` is the pre-NLTK-3 spelling
    # of `fileids()`.
    print()
    print("IEER: in(ORG, LOC) -- just the clauses:")
    print("=" * 45)
    for file in ieer.fileids():
        for doc in ieer.parsed_docs(file):
            if trace:
                print(doc.docno)
                print("=" * 15)
            for rel in extract_rels('ORG', 'LOC', doc, pattern=IN):
                print(show_clause(rel, relsym='IN'))
                if sql:
                    rtuple = (rel['subjtext'], rel['objtext'], doc.docno)
                    cur.execute("""insert into Locations values (?, ?, ?)""", rtuple)
                    connection.commit()
    if sql:
        cur.execute("""select OrgName from Locations where LocationName = 'Atlanta'""")
        print()
        print("Extract data from SQL table: ORGs in Atlanta")
        print("-" * 15)
        for row in cur:
            print(row)
#semcor print "semcor" for line in semcor.sents()[0:100]: s = "" for word in line: s = s + " " + word print s + "\n" for word in line: meanings = net.synsets(word) if len(meanings) > 0: print meanings[0].definition() elif num == "3": docs = ieer.parsed_docs('APW_19980424') tree = docs[1].text from nltk.sem import relextract pairs = relextract.tree2semi_rel(tree) for s, tree in pairs[18:22]: print('("...%s", %s)' % (" ".join(s[-5:]), tree)) reldicts = relextract.semi_rel2reldict(pairs) for k, v in sorted(reldicts[0].items()): print(k, '=>', v) # The function relextract() allows us to filter the reldicts # according to the classes of the subject and object named entities. # In addition, we can specify that the filler text has to match a given regular expression, # as illustrated in the next example. Here, we are looking for pairs of entities in the IN
from nltk.corpus import ieer
import nltk


def tree_search(tree, filt):
    """Yield every node of ``tree`` (depth-first) for which ``filt`` is true.

    Subtrees that do not match are recursed into; matching nodes are not.
    """
    for node in tree:
        if filt(node):
            yield node
        elif isinstance(node, nltk.tree.Tree):
            for sub in tree_search(node, filt):
                yield sub


def is_person(t):
    """Return True if ``t`` is a Tree whose label is PERSON."""
    if not isinstance(t, nltk.tree.Tree):
        return False
    return t.label() == "PERSON"


docs = ieer.parsed_docs('NYT_19980315')
for doc in docs:
    for elem in tree_search(doc.text, is_person):
        # Python 2 `print` statement converted to the print() function,
        # matching the py3-era nltk API (t.label()) used above.
        print(' '.join(elem.leaves()))
from nltk.corpus import conll2000, conll2002

# CoNLL 2000 / 2002 chunking corpora: raw sentences and chunk trees.
print(conll2000.sents())  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
for tree in conll2000.chunked_sents()[:2]:
    print(tree)  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
print(conll2002.sents())  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
for tree in conll2002.chunked_sents()[:2]:
    print(tree)  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE

# SEMCOR: words, chunks, and tagged variants.
from nltk.corpus import semcor

print(semcor.words())
print(semcor.chunks())
print(semcor.sents())  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
print(semcor.chunk_sents())  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
list(map(str, semcor.tagged_chunks(tag='both')[:3]))
[[str(c) for c in s] for s in semcor.tagged_sents(tag='both')[:2]]

# IEER: parsed documents and their metadata fields.
from nltk.corpus import ieer

ieer.fileids()  # doctest: +NORMALIZE_WHITESPACE
docs = ieer.parsed_docs('APW_19980314')
print(docs[0])
for field in ('docno', 'doctype', 'date_time', 'headline'):
    print(getattr(docs[0], field))
print(docs[0].text)  # doctest: +ELLIPSIS
from nltk.corpus import ieer
from nltk.sem import relextract
import re

docs = ieer.parsed_docs('NYT_19980315')
tree = docs[1].text
# print(tree)

# tree2semi_rel() converts the already-NER-chunked tree into a list of
# two-member lists: a string of plain tokens followed by a named-entity
# subtree. E.g. "about first-level questions, said Ms." followed by
# (PERSON Cohn) — root PERSON, child Cohn.
pairs = relextract.tree2semi_rel(tree)
# for s, tree in pairs[:3]:
#     print('*' * 20)
#     print(' '.join(s))
#     print(tree)

# semi_rel2reldict() slides over three pairs at a time:
# string1 -> left context, tree1 -> subject, string2 -> filler,
# tree2 -> object, string3 -> right context.
reldicts = relextract.semi_rel2reldict(pairs)
for r in reldicts:
    print('=' * 20)
    print(r['subjtext'])
    print(r['filler'])
    print(r['objtext'])

# Matches any text followed by "in", as long as "in" is not followed by a
# word ending in "ing".
IN = re.compile(r'.*\bin\b(?!\b.+ing\b)')
def ieer_chunked_sents(tag=nltk.tag.pos_tag):
    """Generate one CoNLL-style chunk tree per parsed IEER document."""
    for document in ieer.parsed_docs():
        yield conlltags2tree(ieertree2conlltags(document.text, tag))
# "Who" questions ask about people.
if tag == "WP":
    desired_subjclass = "PERSON"
if desired_subjclass != "PERSON":
    print("Sorry, I don't know how to answer that kind of question. ")
elif desired_objclass != "ORGANIZATION":
    print("Sorry, I don't know how to look for that answer. ")
else:
    evidence = []
    print("Searching for your answer... ")
    # Scan every IEER document for PER-ORG role relations whose object
    # matches the organization the user asked about.
    for fileid in ieer.fileids():
        for doc in ieer.parsed_docs(fileid):
            for rel in relextract.extract_rels('PER', 'ORG', doc, corpus='ieer', pattern=ROLES):
                if rel['objclass'] == "ORGANIZATION" and rel['objtext'] == desired_objtext:
                    if rel['subjclass'] == "PERSON":
                        desired_subjtext = rel['subjtext']
                        evidence.append(rel)
                        # print(relextract.rtuple(rel))
    if desired_subjtext != "":
        print("\nI've got the answer! The person you're looking for is " + desired_subjtext + ". ")
        print("\nHere's some supporting evidence:")
def main():
    """
    Interactive demo: (1) show WordNet senses for a user-supplied word,
    (2) gloss the first SemCor sentences, or (3) run IEER relation
    extraction on one document.
    """
    # The original mixed Python 2 (`print "..."`, raw_input) and Python 3
    # (print(...)) syntax and ran on neither; converted uniformly to py3.
    print("user input(1) or semcor(2)?")
    num = input()
    if num == "1":
        # user input
        print("enter word")
        word = input()
        for meaning in net.synsets(word):
            # print("Sense: " + re.findall("'.*'", str(meaning))[0])
            print("Sense: " + str(meaning))
            print(meaning.definition() + "\n")
            hypernyms = meaning.hypernyms()
            if len(hypernyms) > 0:
                print("\nHypernyms:")
                for meaning2 in hypernyms:
                    print(re.findall("'.*'", str(meaning2))[0])
            hyponyms = meaning.hyponyms()
            if len(hyponyms) > 0:
                print("\nHyponyms:")
                for meaning2 in hyponyms:
                    print(re.findall("'.*'", str(meaning2))[0])
            # print("\nHypernym Tree:")
            # print(gethypernymtree(meaning))
            print("\n")
        # dog = wn.synset('dog.n.01')
        # hypo = lambda s: s.hyponyms()
        # hyper = lambda s: s.hypernyms()
        # list(dog.closure(hyper, depth=1)) == dog.hypernyms()  # True
    elif num == "2":
        # semcor
        print("semcor")
        for line in semcor.sents()[0:100]:
            s = ""
            for word in line:
                s = s + " " + word
            print(s + "\n")
            for word in line:
                meanings = net.synsets(word)
                if len(meanings) > 0:
                    print(meanings[0].definition())
    elif num == "3":
        docs = ieer.parsed_docs('APW_19980424')
        tree = docs[1].text
        from nltk.sem import relextract
        pairs = relextract.tree2semi_rel(tree)
        for s, tree in pairs[18:22]:
            print('("...%s", %s)' % (" ".join(s[-5:]), tree))
        reldicts = relextract.semi_rel2reldict(pairs)
        for k, v in sorted(reldicts[0].items()):
            print(k, '=>', v)
# ORG-LOC pairs whose filler contains "in" (not followed by an "-ing" word).
IN = re.compile(r'(\s?[a-z].*)?\bin\b(?!\b.+ing\b)')
for fileid in ieer.fileids():
    # Python 2 `print fileid` converted to the print() function used by
    # the rest of this script.
    print(fileid)
    for doc in ieer.parsed_docs(fileid):
        for rel in relextract.extract_rels('ORG', 'LOC', doc, corpus='ieer', pattern=IN):
            print(relextract.rtuple(rel))  # doctest: +ELLIPSIS

# Raw string: the pattern contains `\s`, an invalid escape in a plain
# string literal (SyntaxWarning since 3.6, an error in newer Pythons).
roles = r"(.*(analyst|commissioner|professor|researcher|(spokes|chair)(wo)?m(e|a)n|writer|secretary|manager|commander|boss|founder)\s(of|in|for) (the)?)"
ROLES = re.compile(roles, re.VERBOSE)
for fileid in ieer.fileids():
    for doc in ieer.parsed_docs(fileid):
        for rel in relextract.extract_rels('PER', 'ORG', doc, corpus='ieer', pattern=ROLES):
            print(relextract.rtuple(rel))  # doctest: +ELLIPSIS
#!/usr/bin/python
# -*- coding: utf-8 -*-

# NAMED ENTITIES
from nltk.corpus import ieer

docs = ieer.parsed_docs('NYT_19980315')
tree = docs[1].text
print(tree)  # doctest: +ELLIPSIS

from nltk.corpus import conll2002

for doc in conll2002.chunked_sents('ned.train')[27]:
    print(doc)

from nltk.sem import relextract

# Split the NE-chunked tree into (token-string, entity-subtree) pairs.
pairs = relextract.tree2semi_rel(tree)
for s, tree in pairs[18:22]:
    print('("...%s", %s)' % (" ".join(s[-5:]), tree))

# Collapse windows of three pairs into reldicts with subject, filler,
# object and surrounding contexts.
reldicts = relextract.semi_rel2reldict(pairs)
for k, v in sorted(reldicts[0].items()):
    print(k, '=>', v)  # doctest: +ELLIPSIS

for r in reldicts[18:20]:
    print('=' * 20)
    print(r['subjtext'])
    print(r['filler'])
    print(r['objtext'])

import re

# "in" not followed by a word ending in "ing".
IN = re.compile(r'.*\bin\b(?!\b.+ing\b)')