import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.sem.relextract import extract_rels, rtuple


def relationBetweenEntities(sentences):
    tokenized_sentences = [word_tokenize(sentence) for sentence in sentences]
    tagged_sentences = [nltk.tag.pos_tag(sentence) for sentence in tokenized_sentences]
    OF = re.compile(r'.*\bof\b.*')
    IN = re.compile(r'.*\bin\b(?!\b.+ing)')

    print('PERSON-ORGANISATION Relationships:')
    for i, sent in enumerate(tagged_sentences):
        sent = nltk.chunk.ne_chunk(sent)  # ne_chunk expects a single POS-tagged sentence
        rels = extract_rels('PER', 'ORG', sent, corpus='ace', pattern=IN, window=10)
        for rel in rels:
            print(rtuple(rel))

    print('PERSON-GPE Relationships:')
    for i, sent in enumerate(tagged_sentences):
        sent = nltk.chunk.ne_chunk(sent)  # ne_chunk expects a single POS-tagged sentence
        rels = extract_rels('PER', 'GPE', sent, corpus='ace', pattern=OF, window=10)
        for rel in rels:
            print(rtuple(rel))
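# A minimal usage sketch, not from the original source: the two sentences below
# are invented examples, and the call assumes the 'punkt',
# 'averaged_perceptron_tagger', 'maxent_ne_chunker', and 'words' NLTK resources
# have already been downloaded. Whether a relation is actually printed depends
# on how the NE chunker labels each entity.
if __name__ == '__main__':
    sample_sentences = [
        'John Smith works in Google.',          # hypothetical PER-ORG example
        'Mary Jones is the mayor of Boston.',   # hypothetical PER-GPE example
    ]
    relationBetweenEntities(sample_sentences)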
from nltk.sem import relextract


def extractRel(reldicts, subjclass, objclass, window, pattern):
    # Keep only reldicts whose subject and object classes match, whose filler
    # is at most `window` tokens long, and whose filler matches `pattern`.
    relfilter = lambda x: (x['subjclass'] == subjclass
                           and len(x['filler'].split()) <= window
                           and pattern.match(x['filler'])
                           and x['objclass'] == objclass)
    for rel in filter(relfilter, reldicts):
        print(relextract.rtuple(rel))
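# A sketch of how extractRel might be driven, assumed rather than taken from
# the original source: build reldicts from one IEER document and filter for
# ORGANIZATION-in-LOCATION pairs. Requires the 'ieer' NLTK corpus; the fileid
# 'NYT_19980315' is one of the standard IEER files.
import re
from nltk.corpus import ieer

docs = ieer.parsed_docs('NYT_19980315')
pairs = relextract.tree2semi_rel(docs[0].text)
reldicts = relextract.semi_rel2reldict(pairs)
IN = re.compile(r'.*\bin\b(?!\b.+ing\b)')
extractRel(reldicts, 'ORGANIZATION', 'LOCATION', 10, IN)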
reldicts = relextract.semi_rel2reldict(pairs)
for k, v in sorted(reldicts[0].items()):
    print(k, '=>', v)

# The function extract_rels() allows us to filter the reldicts according to
# the classes of the subject and object named entities. In addition, we can
# specify that the filler text has to match a given regular expression, as
# illustrated in the next example. Here, we are looking for pairs of entities
# in the IN relation, where IN has signature <ORG, LOC>.
IN = re.compile(r'(\s?[a-z].*)?\bin\b(?!\b.+ing\b)')
for fileid in ieer.fileids():
    print(fileid)
    for doc in ieer.parsed_docs(fileid):
        for rel in relextract.extract_rels('ORG', 'LOC', doc, corpus='ieer', pattern=IN):
            print(relextract.rtuple(rel))  # doctest: +ELLIPSIS

roles = r"(.*(analyst|commissioner|professor|researcher|(spokes|chair)(wo)?m(e|a)n|writer|secretary|manager|commander|boss|founder)\s(of|in|for) (the)?)"
ROLES = re.compile(roles, re.VERBOSE)
for fileid in ieer.fileids():
    for doc in ieer.parsed_docs(fileid):
        for rel in relextract.extract_rels('PER', 'ORG', doc, corpus='ieer', pattern=ROLES):
            print(relextract.rtuple(rel))  # doctest: +ELLIPSIS
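# For reference, a hedged note on what the key/value loop above prints: each
# reldict produced by semi_rel2reldict() carries ten keys -- 'lcon',
# 'subjclass', 'subjtext', 'subjsym', 'filler', 'untagged_filler', 'objclass',
# 'objtext', 'objsym', and 'rcon'. A compact alternative to dumping the raw
# dictionary is relextract.clause(), which renders a reldict as a
# Prolog-style clause built from the subject and object symbols.
for rel in reldicts[:5]:
    print(relextract.clause(rel, relsym='IN'))  # e.g. IN('symbol1', 'symbol2')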
import re
from nltk.corpus import wordnet as net
from nltk.corpus import semcor, ieer
from nltk.sem import relextract


def main():
    print("user input(1) or semcor(2)?")
    num = input()
    if num == "1":  # user input
        print("enter word")
        word = input()
        for meaning in net.synsets(word):
            # print("Sense: " + re.findall("'.*'", str(meaning))[0])
            print("Sense: " + str(meaning))
            print(meaning.definition() + "\n")
            hypernyms = meaning.hypernyms()
            if len(hypernyms) > 0:
                print("\nHypernyms:")
                for meaning2 in hypernyms:
                    print(re.findall("'.*'", str(meaning2))[0])
            hyponyms = meaning.hyponyms()
            if len(hyponyms) > 0:
                print("\nHyponyms:")
                for meaning2 in hyponyms:
                    print(re.findall("'.*'", str(meaning2))[0])
            # print("\nHypernym Tree:")
            # print(gethypernymtree(meaning))
            print("\n")
        # dog = net.synset('dog.n.01')
        # hyper = lambda s: s.hypernyms()
        # list(dog.closure(hyper, depth=1)) == dog.hypernyms()  # True
    elif num == "2":  # semcor
        print("semcor")
        for line in semcor.sents()[0:100]:
            s = ""
            for word in line:
                s = s + " " + word
            print(s + "\n")
            for word in line:
                meanings = net.synsets(word)
                if len(meanings) > 0:
                    print(meanings[0].definition())
    elif num == "3":
        docs = ieer.parsed_docs('APW_19980424')
        tree = docs[1].text
        pairs = relextract.tree2semi_rel(tree)
        for s, tree in pairs[18:22]:
            print('("...%s", %s)' % (" ".join(s[-5:]), tree))
        reldicts = relextract.semi_rel2reldict(pairs)
        for k, v in sorted(reldicts[0].items()):
            print(k, '=>', v)
        # extract_rels() allows us to filter the reldicts according to the
        # classes of the subject and object named entities. In addition, we
        # can specify that the filler text has to match a given regular
        # expression. Here, we look for pairs of entities in the IN relation,
        # where IN has signature <ORG, LOC>.
        IN = re.compile(r'(\s?[a-z].*)?\bin\b(?!\b.+ing\b)')
        for fileid in ieer.fileids():
            print(fileid)
            for doc in ieer.parsed_docs(fileid):
                for rel in relextract.extract_rels('ORG', 'LOC', doc, corpus='ieer', pattern=IN):
                    print(relextract.rtuple(rel))  # doctest: +ELLIPSIS
        roles = r"(.*(analyst|commissioner|professor|researcher|(spokes|chair)(wo)?m(e|a)n|writer|secretary|manager|commander|boss|founder)\s(of|in|for) (the)?)"
        ROLES = re.compile(roles, re.VERBOSE)
        for fileid in ieer.fileids():
            for doc in ieer.parsed_docs(fileid):
                for rel in relextract.extract_rels('PER', 'ORG', doc, corpus='ieer', pattern=ROLES):
                    print(relextract.rtuple(rel))  # doctest: +ELLIPSIS
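# A hedged runner sketch, not in the original: main() needs several NLTK
# corpora, and the download calls below are one reasonable way to fetch them
# on first use (nltk.download is a no-op for resources already present).
if __name__ == '__main__':
    import nltk
    for resource in ('wordnet', 'semcor', 'ieer'):
        nltk.download(resource, quiet=True)
    main()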
# `docs` and `tree` are assumed to come from an earlier step, e.g.
# docs = ieer.parsed_docs(fileid); tree = docs[1].text
#
# tree2semi_rel() splits the document into (string, named-entity) pairs.
# Eg: "about first-level questions, said Ms." => string
#     (PERSON Cohn) => named entity, a tree whose root is PERSON and whose child node is Cohn
pairs = relextract.tree2semi_rel(tree)
"""
for s, tree in pairs[:3]:
    print('*' * 20)
    print(' '.join(s))
    print(tree)
"""

# Processes three of the above pairs at a time into a dictionary.
# Eg: (string1, tree1) (string2, tree2) (string3, tree3)
# string1 is stored as the left context.
# tree1 is the subject, string2 is the filler, and tree2 is the object.
# string3 is stored as the right context.
reldicts = relextract.semi_rel2reldict(pairs)
for r in reldicts:
    print('=' * 20)
    print(r['subjtext'])  # the subject text
    print(r['filler'])    # the filler text between subject and object
    print(r['objtext'])   # the object text

# Matches any number of characters followed by the word "in",
# as long as "in" is not followed by a word ending in "ing".
IN = re.compile(r'.*\bin\b(?!\b.+ing\b)')

# Finds relationships of the form entity1 IN entity2,
# where entity1 is an ORGANIZATION and entity2 is a LOCATION.
print('\nRelation of type ORGANIZATION in LOCATION: \n')
for relation in relextract.extract_rels('ORG', 'LOC', docs[1], corpus='ieer', pattern=IN):
    print(relextract.rtuple(relation))
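# A small follow-on sketch, assumed rather than from the original: rtuple()
# also accepts lcon/rcon flags that include the left and right context around
# the relation, which helps when judging whether a matched filler really
# expresses the IN relation.
for relation in relextract.extract_rels('ORG', 'LOC', docs[1], corpus='ieer', pattern=IN):
    print(relextract.rtuple(relation, lcon=True, rcon=True))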
import re
import nltk
from nltk.sem import relextract


def createDoc(text):
    # Create a DOCUMENT tree by combining all the chunked sentences.
    chunkedSents = []
    for sent in nltk.sent_tokenize(text):
        chunkedSents += [chunk for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sent)))]
    docTree = nltk.Tree('DOCUMENT', chunkedSents)
    pairs = relextract.tree2semi_rel(docTree)
    for sent, tree in pairs:
        print('("...%s", %s)' % (' '.join([word for word, tag in sent][-5:]), tree))

    reldicts = relextract.semi_rel2reldict(pairs)
    for r in reldicts:
        print('=' * 30)
        print(r['subjclass'], ':', r['subjtext'])
        print(r['filler'])
        print(r['objclass'], ':', r['objtext'])

    # Match pattern in filler
    roles = r"""
    (.*(
    analyst|
    chair(wo)?man|
    commissioner|
    counsel|
    director|
    economist|
    editor|
    executive|
    foreman|
    governor|
    head|
    lawyer|
    leader|
    librarian).*)|
    manager|
    partner|
    president|
    producer|
    professor|
    researcher|
    spokes(wo)?man|
    writer|
    ,\sof\sthe?\s*  # "X, of (the) Y"
    """
    ROLES = re.compile(roles, re.VERBOSE)
    IN = re.compile(r'.*\bin\b(?!\b.+ing\b)')

    pattern = ROLES            # or IN
    subjclass = 'PERSON'       # or 'ORGANIZATION'
    objclass = 'ORGANIZATION'  # or 'GPE'
    window = 10
    relfilter = lambda x: (x['subjclass'] == subjclass
                           and len(x['filler'].split()) <= window
                           and pattern.match(x['filler'])
                           and x['objclass'] == objclass)
    for rel in filter(relfilter, reldicts):
        print(relextract.rtuple(rel))
    return reldicts


def improve(reldicts):
    # Report how many sentences each filler spans; a filler crossing a
    # sentence boundary usually indicates a spurious subject-object pairing.
    for dicts in reldicts:
        print(len(nltk.sent_tokenize(dicts['filler'])))
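# A hypothetical usage sketch; the sample text is invented and not from the
# original source. Assumes the 'punkt', 'averaged_perceptron_tagger',
# 'maxent_ne_chunker', and 'words' NLTK models are available.
text = ('Paul Allen, of the Microsoft Corporation, spoke in Seattle. '
        'He is a founder of the company.')
reldicts = createDoc(text)
improve(reldicts)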
import re
import nltk
from nltk import ne_chunk, pos_tag, word_tokenize
from nltk.sem.relextract import extract_rels, rtuple

text = BookText(url1)  # BookText and url1 are defined elsewhere in the original project

# Note the grouping: without the parentheses, the \b anchors would bind only
# to the first and last alternatives.
BELONG = re.compile(r'.*\b(in|from|belonged|lived)\b.*')

sentences = nltk.sent_tokenize(text)
tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences]

for i, sent in enumerate(tagged_sentences):
    sent = ne_chunk(sent)
    rels = extract_rels('PER', 'GPE', sent, corpus='ace', pattern=BELONG, window=10)
    for rel in rels:
        print(rtuple(rel))
# Sample output:
# [PER: 'elizabeth/NNP'] 'lived/VBN in/IN' [GPE: 'london/NNP']
# [PER: 'jane/NNP'] 'lived/VBN near/IN' [GPE: 'neitherfield/NNP']
# [PER: 'bingley/NNP'] 'is/VBZ from/IN' [GPE: 'scotland/NNP']
# [PER: 'elizabeth/NNP'] 'belonged/VBD to/IN' [GPE: 'london/NNP']
# [PER: 'jane/NNP'] 'was/VBD now/RB in/IN' [GPE: 'brighton/NNP']

RELATIONS = re.compile(r'.*\b(mother|father|sister|brother|aunt|uncle)\b.*')
for i, sent in enumerate(tagged_sentences):
    sent = ne_chunk(sent)
    rels = extract_rels('PER', 'PER', sent, corpus='ace', pattern=RELATIONS, window=10)
    for rel in rels:
        print(rtuple(rel))
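# A hedged refactor sketch, not from the original: the two passes above differ
# only in the entity classes and the filler pattern, so they can share one
# helper. find_relations is a hypothetical name introduced here.
def find_relations(tagged_sentences, subjclass, objclass, pattern, window=10):
    for sent in tagged_sentences:
        chunked = ne_chunk(sent)
        for rel in extract_rels(subjclass, objclass, chunked, corpus='ace',
                                pattern=pattern, window=window):
            print(rtuple(rel))

find_relations(tagged_sentences, 'PER', 'GPE', BELONG)
find_relations(tagged_sentences, 'PER', 'PER', RELATIONS)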