import re

import nltk
from nltk.tokenize import word_tokenize
from nltk.sem.relextract import extract_rels, rtuple


def relationBetweenEntities(sentences):
    tokenized_sentences = [word_tokenize(sentence) for sentence in sentences]
    tagged_sentences = [nltk.tag.pos_tag(sentence) for sentence in tokenized_sentences]
    OF = re.compile(r'.*\bof\b.*')
    IN = re.compile(r'.*\bin\b(?!\b.+ing)')
    print('PERSON-ORGANISATION Relationships:')
    for sent in tagged_sentences:
        sent = nltk.chunk.ne_chunk(sent)  # ne_chunk expects one POS-tagged sentence
        rels = extract_rels('PER', 'ORG', sent, corpus='ace', pattern=IN, window=10)
        for rel in rels:
            print(rtuple(rel))
    print('PERSON-GPE Relationships:')
    for sent in tagged_sentences:
        sent = nltk.chunk.ne_chunk(sent)  # ne_chunk expects one POS-tagged sentence
        rels = extract_rels('PER', 'GPE', sent, corpus='ace', pattern=OF, window=10)
        for rel in rels:
            print(rtuple(rel))
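# A minimal usage sketch for relationBetweenEntities; the sentences are
# invented examples, and the standard NLTK models ('punkt',
# 'averaged_perceptron_tagger', 'maxent_ne_chunker', 'words') must be
# downloaded first. Which pairs print depends on the labels the NE chunker
# assigns.
relationBetweenEntities([
    'Steve Jobs worked in Apple.',
    'Ada Lovelace was born in London.',
])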
def rels2rdf(ns, verbose=False):
    """
    Convert the reldicts derived from the IEER corpus into an RDF graph.
    """
    graph = ConjunctiveGraph()
    graph.bind('nltk', BASE)
    graph.bind('org', "http://nltk.org/terms/org#")
    graph.bind('loc', "http://nltk.org/terms/loc#")
    graph.bind('pred', "http://nltk.org/terms/pred#")
    graph.bind('class', "http://nltk.org/terms/class#")
    in_uri = sym2uri(ns, 'pred', 'in')
    loc_uri = sym2uri(ns, 'class', 'Location')
    org_uri = sym2uri(ns, 'class', 'Organization')
    graph.add((in_uri, RDFNS.type, RDFSNS.Property))
    graph.add((loc_uri, RDFNS.type, RDFSNS.Class))
    graph.add((org_uri, RDFNS.type, RDFSNS.Class))
    graph.add((in_uri, RDFSNS.domain, org_uri))
    graph.add((in_uri, RDFSNS.range, loc_uri))
    from nltk.corpus import ieer
    IN = re.compile(r'.*\bin\b(?!\b.+ing\b)')
    for item in ieer.fileids():
        for doc in ieer.parsed_docs(item):
            for reldict in extract_rels('ORG', 'LOC', doc, corpus='ieer', pattern=IN):
                graph.add(make_rdf(ns, reldict, relsym='in'))
                for triple in make_rdfs(ns, reldict):
                    graph.add(triple)
    return graph
import re

import nltk
from nltk.sem import relextract


def findrelations(text):
    # re.VERBOSE ignores unescaped whitespace in the pattern, so spaces inside
    # the role names are written as \s.
    roles = r"""
    (.*(
    computer\sscientist|
    led\s|
    adjunct\sprofessor).*)|
    co-founder|
    chairman|
    parents|
    ,\sof\sthe?\s*  # "X, of (the) Y"
    """
    ROLES = re.compile(roles, re.VERBOSE)
    sentences = nltk.sent_tokenize(text)
    tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
    tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences]
    chunked_sentences = nltk.ne_chunk_sents(tagged_sentences)
    for doc in chunked_sentences:
        print(doc)
        for rel in relextract.extract_rels('PER', 'ORG', doc, corpus='ace', pattern=ROLES):
            # each rel is a reldict; rtuple renders it as a readable tuple string
            print(relextract.rtuple(rel))
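# A hedged usage sketch for findrelations; the sentence is an invented example
# chosen so that the ",\sof\sthe?" clause in ROLES has a chance to fire.
findrelations('John McCarthy, of the Stanford Artificial Intelligence '
              'Laboratory, was a computer scientist.')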
import re

from nltk.sem import relextract


def _nltk_extract(self, subj, obj):
    """Use NLTK's built-in relationship extractor to get subj and obj
    named entity relationships and context."""
    re_location = re.compile(".*")  # '.*' accepts any filler text between the entities
    result = []
    for sent in self._sents:
        extraction = relextract.extract_rels(subj, obj, sent, pattern=re_location)
        if extraction:
            result.append(extraction)
    return result
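# Hedged usage sketch: _nltk_extract is written as a method, so it assumes a
# host object whose _sents attribute holds POS-tagged, NE-chunked sentence
# trees. The Extractor class below is hypothetical, built only to show the call.
import nltk

class Extractor:
    def __init__(self, sentences):
        self._sents = [nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(s)))
                       for s in sentences]

Extractor._nltk_extract = _nltk_extract  # attach the method defined above

# The '.*' pattern accepts any filler, so every PER..GPE co-occurrence within
# extract_rels' default 10-token window is reported.
print(Extractor(['Barack Obama lived in Chicago.'])._nltk_extract('PER', 'GPE'))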
# 'pairs' comes from relextract.tree2semi_rel(...), as in the surrounding
# examples.
reldicts = relextract.semi_rel2reldict(pairs)
for k, v in sorted(reldicts[0].items()):
    print(k, '=>', v)

# extract_rels() allows us to filter the reldicts according to the classes of
# the subject and object named entities. In addition, we can specify that the
# filler text has to match a given regular expression, as illustrated in the
# next example. Here, we are looking for pairs of entities in the IN
# relation, where IN has signature <ORG, LOC>.
IN = re.compile(r'(\s?[a-z].*)?\bin\b(?!\b.+ing\b)')
for fileid in ieer.fileids():
    print(fileid)
    for doc in ieer.parsed_docs(fileid):
        for rel in relextract.extract_rels('ORG', 'LOC', doc, corpus='ieer', pattern=IN):
            print(relextract.rtuple(rel))  # doctest: +ELLIPSIS

roles = r"(.*(analyst|commissioner|professor|researcher|(spokes|chair)(wo)?m(e|a)n|writer|secretary|manager|commander|boss|founder)\s(of|in|for)\s(the)?)"
ROLES = re.compile(roles, re.VERBOSE)
for fileid in ieer.fileids():
    for doc in ieer.parsed_docs(fileid):
        for rel in relextract.extract_rels('PER', 'ORG', doc, corpus='ieer', pattern=ROLES):
            print(relextract.rtuple(rel))  # doctest: +ELLIPSIS
    exit()
except UnicodeDecodeError:
    print("Retrieved book not in proper encoding. Exiting program.\n")
    exit()

print("Book retrieved - URL: https://www.gutenberg.org/ebooks/" + str(book_num))

reg_ex = re.compile(r'.*')
s = book_data
print("\nTokenizing book data")
s = word_tokenize(s)
text_with_tags = nltk.pos_tag(s)
text_chunk = nltk.ne_chunk(text_with_tags)
person_location_pairs = {}
print("Searching Name - Location interactions")
for rel in relextract.extract_rels('PER', 'GPE', text_chunk, pattern=reg_ex):
    relation = nltk.sem.rtuple(rel)
    person_location = get_person_location(relation)
    if person_location in person_location_pairs:
        person_location_pairs[person_location] += 1
    else:
        person_location_pairs[person_location] = 1
if len(person_location_pairs) == 0:
    print("No interaction found in this book.")
    exit()
print("\n----------------------------------------------------")
print("Interaction frequencies (Descending order)")
print("(Person - Location : Count)")
print("----------------------------------------------------")
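# get_person_location is not defined in this excerpt; the sketch below is an
# assumption about its behaviour, inferred from the rtuple strings it receives.
def get_person_location(relation):
    # rtuple yields strings like
    # "[PERSON: 'Emma/NNP'] 'went/VBD to/TO' [GPE: 'London/NNP']";
    # pull out the two entity texts and drop the POS tags.
    person, location = re.findall(r"\[[A-Z]+: '([^']*)'\]", relation)

    def strip_tags(entity):
        return ' '.join(tok.split('/')[0] for tok in entity.split())

    return strip_tags(person) + ' - ' + strip_tags(location)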
for k, v in sorted(reldicts[0].items()):
    print(k, '=>', v)  # doctest: +ELLIPSIS

for r in reldicts[18:20]:
    print('=' * 20)
    print(r['subjtext'])
    print(r['filler'])
    print(r['objtext'])

import re
IN = re.compile(r'.*\bin\b(?!\b.+ing\b)')
for fileid in ieer.fileids():
    for doc in ieer.parsed_docs(fileid):
        for rel in relextract.extract_rels('ORG', 'LOC', doc, corpus='ieer', pattern=IN):
            print(relextract.rtuple(rel))  # doctest: +ELLIPSIS

roles = r"""
(.*(
analyst|
chair(wo)?man|
commissioner|
counsel|
director|
economist|
editor|
executive|
foreman|
governor|
head|
lawyer|
leader|
librarian).*)|
manager|
partner|
president|
producer|
professor|
researcher|
spokes(wo)?man|
writer|
,\sof\sthe?\s*  # "X, of (the) Y"
"""
ROLES = re.compile(roles, re.VERBOSE)
for fileid in ieer.fileids():
    for doc in ieer.parsed_docs(fileid):
        for rel in relextract.extract_rels('PER', 'ORG', doc, corpus='ieer', pattern=ROLES):
            print(relextract.rtuple(rel))  # doctest: +ELLIPSIS
import re

from nltk.corpus import ieer, semcor
from nltk.corpus import wordnet as net


def main():
    print("user input(1), semcor(2) or IEER relations(3)?")
    num = input()
    if num == "1":  # user input
        print("enter word")
        word = input()
        for meaning in net.synsets(word):
            # print("Sense: " + re.findall("'.*'", str(meaning))[0])
            print("Sense: " + str(meaning))
            print(meaning.definition() + "\n")
            hypernyms = meaning.hypernyms()
            if len(hypernyms) > 0:
                print("\nHypernyms:")
                for meaning2 in hypernyms:
                    print(re.findall("'.*'", str(meaning2))[0])
            hyponyms = meaning.hyponyms()
            if len(hyponyms) > 0:
                print("\nHyponyms:")
                for meaning2 in hyponyms:
                    print(re.findall("'.*'", str(meaning2))[0])
            # print("\nHypernym Tree:")
            # print(gethypernymtree(meaning))
            print("\n")
        # e.g. dog = net.synset('dog.n.01')
        #      hyper = lambda s: s.hypernyms()
        #      list(dog.closure(hyper, depth=1)) == dog.hypernyms()  # True
    elif num == "2":  # semcor
        print("semcor")
        for line in semcor.sents()[0:100]:
            s = ""
            for word in line:
                s = s + " " + word
            print(s + "\n")
            for word in line:
                meanings = net.synsets(word)
                if len(meanings) > 0:
                    print(meanings[0].definition())
    elif num == "3":  # IEER relation extraction
        docs = ieer.parsed_docs('APW_19980424')
        tree = docs[1].text
        from nltk.sem import relextract
        pairs = relextract.tree2semi_rel(tree)
        for s, tree in pairs[18:22]:
            print('("...%s", %s)' % (" ".join(s[-5:]), tree))
        reldicts = relextract.semi_rel2reldict(pairs)
        for k, v in sorted(reldicts[0].items()):
            print(k, '=>', v)
        # extract_rels() filters the reldicts according to the classes of the
        # subject and object named entities; we can also require the filler
        # text to match a given regular expression, as illustrated next.
        # Here we look for pairs of entities in the IN relation, where IN has
        # signature <ORG, LOC>.
        IN = re.compile(r'(\s?[a-z].*)?\bin\b(?!\b.+ing\b)')
        for fileid in ieer.fileids():
            print(fileid)
            for doc in ieer.parsed_docs(fileid):
                for rel in relextract.extract_rels('ORG', 'LOC', doc, corpus='ieer', pattern=IN):
                    print(relextract.rtuple(rel))  # doctest: +ELLIPSIS
        roles = r"(.*(analyst|commissioner|professor|researcher|(spokes|chair)(wo)?m(e|a)n|writer|secretary|manager|commander|boss|founder)\s(of|in|for)\s(the)?)"
        ROLES = re.compile(roles, re.VERBOSE)
        for fileid in ieer.fileids():
            for doc in ieer.parsed_docs(fileid):
                for rel in relextract.extract_rels('PER', 'ORG', doc, corpus='ieer', pattern=ROLES):
                    print(relextract.rtuple(rel))  # doctest: +ELLIPSIS
import re
from nltk.corpus import ieer
from nltk.sem import relextract

# Setup for this excerpt (document id as in the earlier snippet).
docs = ieer.parsed_docs('APW_19980424')
tree = docs[1].text

# tree2semi_rel converts the chunked document (NER is already done) into a
# list of two-member lists, each containing a string followed by a tree (the
# named entity).
# E.g.: "about first-level questions, said Ms." => string
#       (PERSON Cohn) => named entity: a tree whose root is PERSON and whose
#       child node is Cohn
pairs = relextract.tree2semi_rel(tree)
"""
for s, tree in pairs[:3]:
    print('*' * 20)
    print(' '.join(s))
    print(tree)
"""

# semi_rel2reldict processes three of the above pairs at a time into a
# dictionary, e.g. (string1, tree1) (string2, tree2) (string3, tree3):
# string1 is stored as the left context, tree1 is the subject, string2 is the
# filler, tree2 is the object, and string3 is stored as the right context.
reldicts = relextract.semi_rel2reldict(pairs)
for r in reldicts:
    print('=' * 20)
    print(r['subjtext'])  # the subject text
    print(r['filler'])    # the filler between subject and object
    print(r['objtext'])   # the object text

# Matches any number of characters followed by the word "in", as long as "in"
# is not followed by a word ending in "ing"
IN = re.compile(r'.*\bin\b(?!\b.+ing\b)')

# Find relationships of the form entity1 IN entity2, where entity1 is an
# ORGANIZATION and entity2 is a LOCATION
print('\nRelation of type ORGANIZATION in LOCATION: \n')
for relation in relextract.extract_rels('ORG', 'LOC', docs[1], corpus='ieer', pattern=IN):
    print(relextract.rtuple(relation))
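# Quick standalone check of the IN pattern's negative lookahead, using two
# invented strings (reuses the IN regex compiled just above):
print(bool(IN.match('based in ')))              # True: plain locative "in"
print(bool(IN.match('is engaged in leasing')))  # False: "in" precedes an "-ing" word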
import re

import nltk
from nltk import ne_chunk, pos_tag, word_tokenize
from nltk.sem.relextract import extract_rels, rtuple

text = BookText(url1)  # BookText and url1 are defined elsewhere in this project

# Filler must contain one of these location-style words, as a whole word.
BELONG = re.compile(r'.*\b(in|from|belonged|lived)\b.*')
sentences = nltk.sent_tokenize(text)
tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences]
for sent in tagged_sentences:
    sent = ne_chunk(sent)
    rels = extract_rels('PER', 'GPE', sent, corpus='ace', pattern=BELONG, window=10)
    for rel in rels:
        print(rtuple(rel))

# Sample output:
# [PER: 'elizabeth/NNP'] 'lived/VBN in/IN' [GPE: 'london/NNP']
# [PER: 'jane/NNP'] 'lived/VBN near/IN' [GPE: 'neitherfield/NNP']
# [PER: 'bingley/NNP'] 'is/VBZ from/IN' [GPE: 'scotland/NNP']
# [PER: 'elizabeth/NNP'] 'belonged/VBD to/IN' [GPE: 'london/NNP']
# [PER: 'jane/NNP'] 'was/VBD now/RB in/IN' [GPE: 'brighton/NNP']

# Filler must contain a kinship word, as a whole word.
RELATIONS = re.compile(r'.*\b(mother|father|sister|brother|aunt|uncle)\b.*')
for sent in tagged_sentences:
    sent = ne_chunk(sent)
    rels = extract_rels('PER', 'PER', sent, corpus='ace', pattern=RELATIONS, window=10)
    for rel in rels:
        print(rtuple(rel))