import re

import nltk
from nltk import word_tokenize
from nltk.sem.relextract import extract_rels, rtuple


def relationBetweenEntities(sentences):
    tokenized_sentences = [word_tokenize(sentence) for sentence in sentences]
    tagged_sentences = [
        nltk.tag.pos_tag(sentence) for sentence in tokenized_sentences
    ]
    OF = re.compile(r'.*\bof\b.*')
    IN = re.compile(r'.*\bin\b(?!\b.+ing)')
    print('PERSON-ORGANISATION Relationships:')
    for sent in tagged_sentences:
        sent = nltk.chunk.ne_chunk(sent)  # ne_chunk expects one tagged sentence
        rels = extract_rels('PER',
                            'ORG',
                            sent,
                            corpus='ace',
                            pattern=IN,
                            window=10)
        for rel in rels:
            print(rtuple(rel))

    print('PERSON-GPE Relationships:')
    for sent in tagged_sentences:
        sent = nltk.chunk.ne_chunk(sent)  # ne_chunk expects one tagged sentence
        rels = extract_rels('PER',
                            'GPE',
                            sent,
                            corpus='ace',
                            pattern=OF,
                            window=10)
        for rel in rels:
            print(rtuple(rel))
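# Usage sketch (not from the original source): the function expects raw
# sentence strings, e.g. from nltk.sent_tokenize. Assumes the punkt,
# averaged_perceptron_tagger, maxent_ne_chunker and words NLTK data
# packages have been downloaded.
if __name__ == '__main__':
    sample = "Mark Pedersen of IBM spoke in New York."
    relationBetweenEntities(nltk.sent_tokenize(sample))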
Example #2
import re

from rdflib import ConjunctiveGraph

# BASE, RDFNS, RDFSNS, sym2uri, make_rdf and make_rdfs are helpers defined
# elsewhere in the original module.


def rels2rdf(ns, verbose=False):
    """
    Convert the reldicts derived from the IEER corpus into an RDF graph.
    """
    graph = ConjunctiveGraph()
    graph.bind('nltk', BASE)
    graph.bind('org', "http://nltk.org/terms/org#")
    graph.bind('loc', "http://nltk.org/terms/loc#")
    graph.bind('pred', "http://nltk.org/terms/pred#")
    graph.bind('class', "http://nltk.org/terms/class#")
    in_uri = sym2uri(ns, 'pred', 'in')
    loc_uri = sym2uri(ns, 'class', 'Location')
    org_uri = sym2uri(ns, 'class', 'Organization')
    graph.add((in_uri, RDFNS.type, RDFSNS.Property))
    graph.add((loc_uri, RDFNS.type, RDFSNS.Class))
    graph.add((org_uri, RDFNS.type, RDFSNS.Class))
    graph.add((in_uri, RDFSNS.domain, org_uri))
    graph.add((in_uri, RDFSNS.range, loc_uri))
    from nltk.corpus import ieer
    IN = re.compile(r'.*\bin\b(?!\b.+ing\b)')
    for item in ieer.fileids():
        for doc in ieer.parsed_docs(item):
            for reldict in extract_rels('ORG', 'LOC', doc, corpus='ieer', pattern=IN):
                graph.add(make_rdf(ns, reldict, relsym='in'))
                for triple in make_rdfs(ns, reldict):
                    graph.add(triple)
    return graph
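# Usage sketch (an assumption; `ns` is whatever namespace string sym2uri
# expects, shown here with a hypothetical value). The returned rdflib graph
# can be serialized directly, e.g. to Turtle:
#   graph = rels2rdf('http://example.org/')
#   print(graph.serialize(format='turtle'))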
Example #3
import re

import nltk
from nltk.sem import relextract


def findrelations(text):
    roles = """
    (.*(
    computer scientist|
    led |
    adjunct professor).*)|
    co-founder|
    chairman|
    parents|
    ,\sof\sthe?\s*  # "X, of (the) Y"
    """
    ROLES = re.compile(roles, re.VERBOSE)

    sentences = nltk.sent_tokenize(text)
    tokenized_sentences = [
        nltk.word_tokenize(sentence) for sentence in sentences
    ]
    tagged_sentences = [
        nltk.pos_tag(sentence) for sentence in tokenized_sentences
    ]
    chunked_sentences = nltk.ne_chunk_sents(tagged_sentences)

    for doc in chunked_sentences:
        print(doc)
        for rel in relextract.extract_rels('PER',
                                           'ORG',
                                           doc,
                                           corpus='ace',
                                           pattern=ROLES):
            # each result is a "reldict"; rtuple() renders it as a readable tuple
            print(relextract.rtuple(rel))
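# Usage sketch (illustrative; the sentence is made up, not from the original):
if __name__ == '__main__':
    findrelations("John McCarthy, a computer scientist, "
                  "was an adjunct professor at Stanford University.")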
Example #4
    def _nltk_extract(self, subj, obj):
        """Use NLTK's built-in relationship extractor to get subj and obj
        named entity relationships and context."""
        # ".*" matches any filler text, so every subj/obj co-occurrence is kept
        re_location = re.compile(".*")
        result = []
        for sent in self._sents:
            extraction = relextract.extract_rels(subj, obj, sent, pattern=re_location)

            if extraction:
                result.append(extraction)

        return result
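    # Note (an assumption, inferred from extract_rels' default corpus='ace'):
    # each element of self._sents must already be a POS-tagged, NE-chunked
    # tree, e.g. prepared along these lines:
    #   self._sents = [nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(s)))
    #                  for s in nltk.sent_tokenize(raw_text)]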
Example #5
reldicts = relextract.semi_rel2reldict(pairs)
for k, v in sorted(reldicts[0].items()):
    print(k, '=>', v)

# The function extract_rels() allows us to filter the reldicts
# according to the classes of the subject and object named entities.
# In addition, we can specify that the filler text has to match a given
# regular expression, as illustrated in the next example. Here, we are
# looking for pairs of entities in the IN relation, where IN has
# signature <ORG, LOC>.
IN = re.compile(r'(\s?[a-z].*)?\bin\b(?!\b.+ing\b)')
for fileid in ieer.fileids():
    print(fileid)
    for doc in ieer.parsed_docs(fileid):
        for rel in relextract.extract_rels('ORG',
                                           'LOC',
                                           doc,
                                           corpus='ieer',
                                           pattern=IN):
            print(relextract.rtuple(rel))  # doctest: +ELLIPSIS

# single-line pattern with significant literal spaces, so no re.VERBOSE here
roles = r"(.*(analyst|commissioner|professor|researcher|(spokes|chair)(wo)?m(e|a)n|writer|secretary|manager|commander|boss|founder)\s(of|in|for) (the)?)"

ROLES = re.compile(roles)
for fileid in ieer.fileids():
    for doc in ieer.parsed_docs(fileid):
        for rel in relextract.extract_rels('PER',
                                           'ORG',
                                           doc,
                                           corpus='ieer',
                                           pattern=ROLES):
            print(relextract.rtuple(rel))  # doctest: +ELLIPSIS
Example #6
        exit()
except UnicodeDecodeError:
    print("Retrieved book not in proper encoding. Exiting program.\n")
    exit()
print("Book retrieved - URL: https://www.gutenberg.org/ebooks/" + str(
    book_num))
reg_ex = re.compile(r'.*')  # match any filler text between the two entities
s = book_data
print("\nTokenizing book data")
s = word_tokenize(s)
text_with_tags = nltk.pos_tag(s)
text_chunk = nltk.ne_chunk(text_with_tags)
person_location_pairs = {}
print("Searching Name - Location interactions")
for rel in relextract.extract_rels('PER',
                                   'GPE',
                                   text_chunk,
                                   pattern=reg_ex):
    relation = nltk.sem.rtuple(rel)
    person_location = get_person_location(relation)
    if person_location in person_location_pairs:
        person_location_pairs[person_location] += 1
    else:
        person_location_pairs[person_location] = 1
if len(person_location_pairs) == 0:
    print("No interaction found in this book.")
    exit()
print("\n----------------------------------------------------")
print("Interaction frequencies (Descending order)")
print("(Person - Location : Count)")
print("----------------------------------------------------")
Example #7
for k, v in sorted(reldicts[0].items()):
    print(k, '=>', v)  # doctest: +ELLIPSIS

for r in reldicts[18:20]:
    print('=' * 20)
    print(r['subjtext'])
    print(r['filler'])
    print(r['objtext'])

import re
IN = re.compile(r'.*\bin\b(?!\b.+ing\b)')
for fileid in ieer.fileids():
    for doc in ieer.parsed_docs(fileid):
        for rel in relextract.extract_rels('ORG',
                                           'LOC',
                                           doc,
                                           corpus='ieer',
                                           pattern=IN):
            print(relextract.rtuple(rel))  # doctest: +ELLIPSIS

roles = """
(.*(
analyst|
chair(wo)?man|
commissioner|
counsel|
director|
economist|
editor|
executive|
foreman|
Example #8
import re

from nltk.corpus import ieer, semcor
from nltk.corpus import wordnet as net  # assumed: `net` is the WordNet reader


def main():
    print("user input(1), semcor(2) or relation extraction(3)?")

    num = input()

    if num == "1":
        #input
        print "enter word"
        word = raw_input()
        for meaning in (net.synsets(word)):
            #print "Sense: " + re.findall("'.*'", str(meaning))[0]
            print "Sense: " + str(meaning)
            print meaning.definition() + "\n"
            hypernyms = (meaning.hypernyms())
            if len(hypernyms) > 0:
                print "\nHypernyms:"
                for meaning2 in hypernyms:
                    print re.findall("'.*'", str(meaning2))[0]

            hyponyms = (meaning.hyponyms())
            if len(hyponyms) > 0:
                print "\nHyponyms:"
                for meaning2 in hyponyms:
                    print re.findall("'.*'", str(meaning2))[0]

    #		print "\nHypernym Tree:"
    #		print (gethypernymtree(meaning))
            print "\n"

    # WordNet closure scratch notes (from the NLTK WordNet howto):
    #   dog = wn.synset('dog.n.01')
    #   hypo = lambda s: s.hyponyms()
    #   hyper = lambda s: s.hypernyms()
    #   list(dog.closure(hyper, depth=1)) == dog.hypernyms()  # True

    elif (num == "2"):
        #semcor
        print "semcor"

        for line in semcor.sents()[0:100]:
            s = ""
            for word in line:
                s = s + " " + word
            print s + "\n"

            for word in line:
                meanings = net.synsets(word)
                if len(meanings) > 0:
                    print meanings[0].definition()
    elif num == "3":

        docs = ieer.parsed_docs('APW_19980424')
        tree = docs[1].text

        from nltk.sem import relextract
        pairs = relextract.tree2semi_rel(tree)
        for s, tree in pairs[18:22]:
            print('("...%s", %s)' % (" ".join(s[-5:]), tree))

        reldicts = relextract.semi_rel2reldict(pairs)
        for k, v in sorted(reldicts[0].items()):
            print(k, '=>', v)

        # The function extract_rels() allows us to filter the reldicts
        # according to the classes of the subject and object named entities.
        # In addition, we can specify that the filler text has to match a
        # given regular expression, as illustrated in the next example.
        # Here, we are looking for pairs of entities in the IN relation,
        # where IN has signature <ORG, LOC>.
        IN = re.compile(r'(\s?[a-z].*)?\bin\b(?!\b.+ing\b)')
        for fileid in ieer.fileids():
            print(fileid)
            for doc in ieer.parsed_docs(fileid):
                for rel in relextract.extract_rels('ORG',
                                                   'LOC',
                                                   doc,
                                                   corpus='ieer',
                                                   pattern=IN):
                    print(relextract.rtuple(rel))  # doctest: +ELLIPSIS

        roles = "(.*(analyst|commissioner|professor|researcher|(spokes|chair)(wo)?m(e|a)n|writer|secretary|manager|commander|boss|founder)\s(of|in|for) (the)?)"

        ROLES = re.compile(roles, re.VERBOSE)
        for fileid in ieer.fileids():
            for doc in ieer.parsed_docs(fileid):
                for rel in relextract.extract_rels('PER',
                                                   'ORG',
                                                   doc,
                                                   corpus='ieer',
                                                   pattern=ROLES):
                    print(relextract.rtuple(rel))  # doctest: +ELLIPSIS
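# Entry-point guard (not in the original snippet) so the script can be run
# directly:
if __name__ == "__main__":
    main()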
Example #9
import re

from nltk.corpus import ieer
from nltk.sem import relextract

# Setup assumed from the NLTK relextract howto that this snippet follows:
docs = ieer.parsed_docs('NYT_19980315')
tree = docs[1].text

# Converts a chunk (NER already done) into a list of two-member lists, each
# containing a string followed by a tree (the named entity).
# E.g.: "about first-level questions, said Ms."   => string
#       (PERSON Cohn)                             => named entity tree; the root
#                                                    is PERSON, the child is Cohn
pairs = relextract.tree2semi_rel(tree)
""" for s, tree in pairs[:3]:
    print('*'*20)
    print(' '.join(s))
    print(tree) """

# Processes three of the above pairs at a time into a dictionary. Eg: (string1, tree1)    (string2, tree2)    (string3, tree3)
# string1 is stored as left context.
# tree1 is the subject, string2 is the filler, and tree2 is the object.
# string3 is stored as right context.
reldicts = relextract.semi_rel2reldict(pairs)
for r in reldicts:
    print('=' * 20)
    print(r['subjtext'])  # Print the subject text
    print(r['filler'])  # Print the filler information
    print(r['objtext'])  # Print the object text

# Matches any number of characters followed by the word "in" as long as "in" is not followed by a word ending in "ing"
IN = re.compile(r'.*\bin\b(?!\b.+ing\b)')
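# For example (illustrative): "based in Dallas" matches, while
# "succeeded in winning" does not, since "winning" ends in -ing.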

# Finds relationships of the form entity1 IN entity2, where entity1 is ORGANIZATION and entity2 is LOCATION
print('\nRelation of type ORGANIZATION in LOCATION: \n')
for relation in relextract.extract_rels('ORG',
                                        'LOC',
                                        docs[1],
                                        corpus='ieer',
                                        pattern=IN):
    print(relextract.rtuple(relation))
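# Output lines have this shape (illustrative, from the NLTK howto output):
#   [ORG: 'WHYY'] 'in' [LOC: 'Philadelphia']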
Example #10
import re

import nltk
from nltk import ne_chunk, pos_tag, word_tokenize
from nltk.sem.relextract import extract_rels, rtuple

text = BookText(url1)  # BookText and url1 are defined elsewhere in the source

# group the alternatives so that \b applies to each verb, not only the ends
BELONG = re.compile(r'.*\b(in|from|belonged|lived)\b.*')

sentences = nltk.sent_tokenize(text)
tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences]

for sent in tagged_sentences:
    sent = ne_chunk(sent)
    rels = extract_rels('PER', 'GPE', sent, corpus='ace', pattern=BELONG, window=10)
    for rel in rels:
        print(rtuple(rel))

# Sample output:
# [PER: 'elizabeth/NNP'] 'lived/VBN in/IN' [GPE: 'london/NNP']
# [PER: 'jane/NNP'] 'lived/VBN near/IN' [GPE: 'neitherfield/NNP']
# [PER: 'bingley/NNP'] 'is/VBZ from/IN' [GPE: 'scotland/NNP']
# [PER: 'elizabeth/NNP'] 'belonged/VBD to/IN' [GPE: 'london/NNP']
# [PER: 'jane/NNP'] 'was/VBD now/RB in/IN' [GPE: 'brighton/NNP']

RELATIONS = re.compile(r'.*\b(mother|father|sister|brother|aunt|uncle)\b.*')

for sent in tagged_sentences:
    sent = ne_chunk(sent)
    # use the RELATIONS pattern here (the original mistakenly reused BELONG)
    rels = extract_rels('PER', 'PER', sent, corpus='ace', pattern=RELATIONS, window=10)
    for rel in rels:
        print(rtuple(rel))