def relationBetweenEntities(sentences):
    """Tokenize, POS-tag and NE-chunk each sentence, then print ACE-style
    PERSON-ORGANISATION and PERSON-GPE relation tuples found in them."""
    token_lists = [word_tokenize(sentence) for sentence in sentences]
    tagged_sentences = [nltk.tag.pos_tag(tokens) for tokens in token_lists]

    # Filler patterns: "of" anywhere; "in" not followed by an -ing word.
    OF = re.compile(r'.*\bof\b.*')
    IN = re.compile(r'.*\bin\b(?!\b.+ing)')

    print('PERSON-ORGANISATION Relationships:')
    for tagged in tagged_sentences:
        # ne_chunk works on a single tagged sentence at a time
        chunked = nltk.chunk.ne_chunk(tagged)
        for rel in extract_rels('PER', 'ORG', chunked,
                                corpus='ace', pattern=IN, window=10):
            print(rtuple(rel))

    print('PERSON-GPE Relationships:')
    for tagged in tagged_sentences:
        chunked = nltk.chunk.ne_chunk(tagged)
        for rel in extract_rels('PER', 'GPE', chunked,
                                corpus='ace', pattern=OF, window=10):
            print(rtuple(rel))
# Пример #2 (Example #2)
# 0
def extractRel(reldicts, subjclass, objclass, window, pattern):
    """Print the rtuple of every relation dict that links *subjclass* to
    *objclass* with a filler of at most *window* words matching *pattern*.

    Parameters:
        reldicts:  iterable of relation dicts (as produced by
                   nltk.sem.relextract.semi_rel2reldict) with at least the
                   keys 'subjclass', 'objclass' and 'filler'.
        subjclass: required value of rel['subjclass'] (e.g. 'PERSON').
        objclass:  required value of rel['objclass'] (e.g. 'ORGANIZATION').
        window:    maximum number of whitespace-separated filler words.
        pattern:   compiled regex the filler must match (from the start).

    Returns None; output goes to stdout.
    """
    # Original used an assigned lambda with mixed tab/space indentation;
    # a named predicate is clearer and PEP 8-compliant (E731).
    def _matches(rel):
        return (rel['subjclass'] == subjclass
                and len(rel['filler'].split()) <= window
                and pattern.match(rel['filler']) is not None
                and rel['objclass'] == objclass)

    # filter() is lazy; no need to materialize it in a list first.
    for rel in filter(_matches, reldicts):
        print(relextract.rtuple(rel))
# Пример #3 (Example #3)
# 0
    # NOTE(review): fragment of a larger function (its `def` is outside this
    # view) and Python 2 (`print fileid` below); `pairs`, `relextract`, `re`
    # and `ieer` are bound earlier in the original file.
    # Turn the (context, NE-tree) pairs into relation dicts and dump the
    # key/value pairs of the first one.
    reldicts = relextract.semi_rel2reldict(pairs)
    for k, v in sorted(reldicts[0].items()):
        print(k, '=>', v)

#	The function relextract() allows us to filter the reldicts
#	according to the classes of the subject and object named entities.
#	In addition, we can specify that the filler text has to match a given regular expression,
#	 as illustrated in the next example. Here, we are looking for pairs of entities in the IN
#	relation, where IN has signature <ORG, LOC>.
    # "in" not followed by a word ending in -ing (avoids e.g. "in winning").
    IN = re.compile(r'(\s?[a-z].*)?\bin\b(?!\b.+ing\b)')
    for fileid in ieer.fileids():
        print fileid
        for doc in ieer.parsed_docs(fileid):
            for rel in relextract.extract_rels('ORG',
                                               'LOC',
                                               doc,
                                               corpus='ieer',
                                               pattern=IN):
                print(relextract.rtuple(rel))  # doctest: +ELLIPSIS

    # NOTE(review): ROLES is compiled with re.VERBOSE, so the unescaped
    # literal spaces in this pattern (e.g. after "(of|in|for)") are ignored
    # by the regex engine rather than matched — confirm that is intended.
    roles = "(.*(analyst|commissioner|professor|researcher|(spokes|chair)(wo)?m(e|a)n|writer|secretary|manager|commander|boss|founder)\s(of|in|for) (the)?)"

    ROLES = re.compile(roles, re.VERBOSE)
    for fileid in ieer.fileids():
        for doc in ieer.parsed_docs(fileid):
            for rel in relextract.extract_rels('PER',
                                               'ORG',
                                               doc,
                                               corpus='ieer',
                                               pattern=ROLES):
                print(relextract.rtuple(rel))  # doctest: +ELLIPSIS
# Пример #4 (Example #4)
# 0
def main():
    """Python 2 interactive demo: option (1) prints WordNet senses,
    hypernyms and hyponyms of a typed word; (2) browses the first 100
    SemCor sentences with first-sense definitions; (3) runs IEER relation
    extraction (ORG-in-LOC and PER-role-ORG).
    `net`, `semcor`, `ieer` and `re` are imported elsewhere in the file.
    """
    print "user input(1) or semcor(2)?"

    num = raw_input()

    if num == "1":
        #input
        print "enter word"
        word = raw_input()
        # For each WordNet synset of the word, show its definition plus any
        # hypernyms ("is-a" parents) and hyponyms ("is-a" children).
        for meaning in (net.synsets(word)):
            #print "Sense: " + re.findall("'.*'", str(meaning))[0]
            print "Sense: " + str(meaning)
            print meaning.definition() + "\n"
            hypernyms = (meaning.hypernyms())
            if len(hypernyms) > 0:
                print "\nHypernyms:"
                for meaning2 in hypernyms:
                    # re.findall pulls the quoted synset name out of the repr
                    print re.findall("'.*'", str(meaning2))[0]

            hyponyms = (meaning.hyponyms())
            if len(hyponyms) > 0:
                print "\nHyponyms:"
                for meaning2 in hyponyms:
                    print re.findall("'.*'", str(meaning2))[0]

    #		print "\nHypernym Tree:"
    #		print (gethypernymtree(meaning))
            print "\n"

    #		dog = wn.synset('dog.n.01')
    #		hypo = lambda s: s.hyponyms()
    #	 	hyper = lambda s: s.hypernyms()
    #list(dog.closure(s.hypernyms(), depth=1)) == dog.hypernyms()
    #True
    #>>> list(dog.closure(hyper, depth=1)) == dog.hypernyms()

    elif (num == "2"):
        #semcor
        print "semcor"

        # Rebuild each SemCor sentence as a string, then print the first
        # WordNet definition of every word that has one.
        for line in semcor.sents()[0:100]:
            s = ""
            for word in line:
                s = s + " " + word
            print s + "\n"

            for word in line:
                meanings = net.synsets(word)
                if len(meanings) > 0:
                    print meanings[0].definition()
    elif num == "3":

        docs = ieer.parsed_docs('APW_19980424')
        tree = docs[1].text

        from nltk.sem import relextract
        # Split the chunked doc tree into (context string, NE subtree) pairs
        # and show a small sample, then dump the first relation dict.
        pairs = relextract.tree2semi_rel(tree)
        for s, tree in pairs[18:22]:
            print('("...%s", %s)' % (" ".join(s[-5:]), tree))

        reldicts = relextract.semi_rel2reldict(pairs)
        for k, v in sorted(reldicts[0].items()):
            print(k, '=>', v)

    #	The function relextract() allows us to filter the reldicts
    #	according to the classes of the subject and object named entities.
    #	In addition, we can specify that the filler text has to match a given regular expression,
    #	 as illustrated in the next example. Here, we are looking for pairs of entities in the IN
    #	relation, where IN has signature <ORG, LOC>.
        IN = re.compile(r'(\s?[a-z].*)?\bin\b(?!\b.+ing\b)')
        for fileid in ieer.fileids():
            print fileid
            for doc in ieer.parsed_docs(fileid):
                for rel in relextract.extract_rels('ORG',
                                                   'LOC',
                                                   doc,
                                                   corpus='ieer',
                                                   pattern=IN):
                    print(relextract.rtuple(rel))  # doctest: +ELLIPSIS

        # NOTE(review): with re.VERBOSE the unescaped spaces inside `roles`
        # (e.g. after "(of|in|for)") are ignored by the regex engine —
        # confirm the spacing is intentional.
        roles = "(.*(analyst|commissioner|professor|researcher|(spokes|chair)(wo)?m(e|a)n|writer|secretary|manager|commander|boss|founder)\s(of|in|for) (the)?)"

        ROLES = re.compile(roles, re.VERBOSE)
        for fileid in ieer.fileids():
            for doc in ieer.parsed_docs(fileid):
                for rel in relextract.extract_rels('PER',
                                                   'ORG',
                                                   doc,
                                                   corpus='ieer',
                                                   pattern=ROLES):
                    print(relextract.rtuple(rel))  # doctest: +ELLIPSIS
# Пример #5 (Example #5)
# 0
# Eg: "about first-level questions, said Ms."   => string
#     (PERSON Cohn)                             => named entity, which is a tree. Root is PERSON, and child node is Cohn
# NOTE(review): `tree`, `docs`, `relextract` and `re` come from earlier in
# the original file/session this snippet was pasted from.
pairs = relextract.tree2semi_rel(tree)
""" for s, tree in pairs[:3]:
    print('*'*20)
    print(' '.join(s))
    print(tree) """

# Processes three of the above pairs at a time into a dictionary. Eg: (string1, tree1)    (string2, tree2)    (string3, tree3)
# string1 is stored as left context.
# tree1 is the subject, string2 is the filler, and tree2 is the object.
# string3 is stored as right context.
reldicts = relextract.semi_rel2reldict(pairs)
for r in reldicts:
    print('=' * 20)
    print(r['subjtext'])  # Print the subject text
    print(r['filler'])  # Print the filler information
    print(r['objtext'])  # Print the object text

# Matches any number of characters followed by the word "in" as long as "in" is not followed by a word ending in "ing"
IN = re.compile(r'.*\bin\b(?!\b.+ing\b)')

# Finds relationships of the form entity1 IN entity2, where entity1 is ORGANIZATION and entity2 is LOCATION
print('\nRelation of type ORGANIZATION in LOCATION: \n')
for relation in relextract.extract_rels('ORG',
                                        'LOC',
                                        docs[1],
                                        corpus='ieer',
                                        pattern=IN):
    print(relextract.rtuple(relation))
# Пример #6 (Example #6)
# 0
def createDoc(text):#To create a DOCUMENT by combining all the chunked sentences.
	"""Chunk every sentence of *text* into one DOCUMENT tree, then print the
	semi-relation pairs, the relation dicts, and PERSON-ORGANIZATION rtuples
	whose filler matches ROLES. Python 2 snippet; prints to stdout, returns
	nothing. `nltk`, `relextract` and `re` are imported elsewhere.
	"""
	chunkedSents=list()
	for sent in nltk.sent_tokenize(text):
		chunkedSents+=[chunk for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sent)))]
	docTree=nltk.Tree('DOCUMENT',chunkedSents)
	pairs=relextract.tree2semi_rel(docTree)

	# Show each (context words, NE subtree) pair, last 5 context words only.
	for sent,tree in pairs:
		print '("...%s", %s)' % (' '.join([word for word,tag in sent][-5:]),tree) # To print
	
	reldicts = relextract.semi_rel2reldict(pairs)

	# NOTE(review): the first print below indents with tab+8 spaces while the
	# others use two tabs — Python 2 tolerates this (tab stop 8) but Python 3
	# rejects it; normalize before porting.
	for r in reldicts:
	        print '='*30
		print(r['subjclass'],':', r['subjtext'])
		print (r['filler'])
		print (r['objclass'],':', r['objtext'])

	# Match pattern in filler
	roles = """
			(.*(
			analyst|
			chair(wo)?man|
			commissioner|
			counsel|
			director|
			economist|
			editor|
			executive|
			foreman|
			governor|
			head|
			lawyer|
			leader|
			librarian).*)|
			manager|
			partner|
			president|
			producer|
			professor|
			researcher|
			spokes(wo)?man|
			writer|
			,\sof\sthe?\s* 
			"""# "X, of (the) Y"
	ROLES = re.compile(roles, re.VERBOSE)
	IN = re.compile(r'.*\bin\b(?!\b.+ing\b)')
	pattern=ROLES
	subjclass='PERSON'#'ORGANIZATION'
	objclass='ORGANIZATION'#'GPE'
	window=10
	# Keep reldicts whose classes match and whose filler is short enough
	# and matches `pattern`.
	relfilter = lambda x: (x['subjclass'] == subjclass and
                           len(x['filler'].split()) <= window and
                           pattern.match(x['filler']) and
                           x['objclass'] == objclass)

	for rel in list(filter(relfilter, reldicts)):
		print(relextract.rtuple(rel))
	
	def improve(reldicts):
		# Exploratory helper: count sentences inside each filler.
		for dicts in reldicts:
			print len(nltk.sent_tokenize(dicts['filler']))
	improve(reldicts)

	#print pairs[0]
	#print pairs[1]
	#print pairs[2]
	#for sent,tree in pairs[0]:
	#	print sent,tree 
		#print('("...%s", %s)' % (" ".join(sent[0][-5:]),tree))
	#tree=nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize( nltk.sent_tokenize(text)[24] )))
	#l=[chunk for chunk in tree]
	#print "%s"%docTree
	# NOTE(review): the triple-quote below opens a string that never closes in
	# this snippet — the paste appears truncated here.
	"""	
# Пример #7 (Example #7)
# 0
from nltk import ne_chunk, pos_tag, word_tokenize
from nltk.sem.relextract import extract_rels, rtuple

# Extract PERSON-GPE "belonging/location" relations from the book text.
# (`BookText`, `url1`, `nltk` and `re` are defined earlier in the file.)
text = BookText(url1)

# Filler must contain one of these words as a whole word.
# BUG FIX: the original r'.*\bin|from|belonged|lived\b.*' had an alternation-
# precedence error — the | spanned the whole pattern, so a bare "from"
# anywhere matched and "\bin" matched the prefix of words like "inside".
# Grouping the alternatives and bounding both sides restores the intent.
BELONG = re.compile(r'.*\b(in|from|belonged|lived)\b.*')

sentences = nltk.sent_tokenize(text)
tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences]

for i, sent in enumerate(tagged_sentences):
    sent = ne_chunk(sent)
    rels = extract_rels('PER', 'GPE', sent, corpus='ace', pattern=BELONG, window=10)
    for rel in rels:
        print(rtuple(rel))

# Sample output:
# [PER: 'elizabeth/NNP'] 'lived/VBN in/IN' [GPE: 'london/NNP']
# [PER: 'jane/NNP'] 'lived/VBN near/IN' [GPE: 'neitherfield/NNP']
# [PER: 'bingley/NNP'] 'is/VBZ from/IN' [GPE: 'scotland/NNP']
# [PER: 'elizabeth/NNP'] 'belonged/VBD to/IN' [GPE: 'london/NNP']
# [PER: 'jane/NNP'] 'was/VBD now/RB in/IN' [GPE: 'brighton/NNP']

# Filler must contain a family-relation word as a whole word.
# BUG FIX: the original r'.*\mother|father|...' used the invalid escape \m
# (clearly meant \b) and lacked grouping, so the alternation spanned the
# whole pattern.
RELATIONS = re.compile(r'.*\b(mother|father|sister|brother|aunt|uncle)\b.*')

for i, sent in enumerate(tagged_sentences):
    sent = ne_chunk(sent)
    # BUG FIX: the original passed pattern=BELONG (copy-paste from the
    # previous snippet), leaving RELATIONS unused; a PER-PER family-relation
    # search must use RELATIONS.
    rels = extract_rels('PER', 'PER', sent, corpus='ace', pattern=RELATIONS, window=10)
    for rel in rels:
        print(rtuple(rel))