Example #1
def rels2rdf(ns, verbose=False):
    """
    Convert the reldicts derived from the IEER corpus into an RDF graph.
    """
    graph = ConjunctiveGraph()
    graph.bind('nltk',BASE)
    graph.bind('org', "http://nltk.org/terms/org#")
    graph.bind('loc', "http://nltk.org/terms/loc#")
    graph.bind('pred', "http://nltk.org/terms/pred#")
    graph.bind('class', "http://nltk.org/terms/class#")
    in_uri = sym2uri(ns, 'pred', 'in')
    loc_uri = sym2uri(ns, 'class', 'Location')
    org_uri = sym2uri(ns, 'class', 'Organization')
    graph.add((in_uri, RDFNS.type, RDFSNS.Property))
    graph.add((loc_uri, RDFNS.type, RDFSNS.Class))
    graph.add((org_uri, RDFNS.type, RDFSNS.Class))
    graph.add((in_uri, RDFSNS.domain, org_uri))
    graph.add((in_uri, RDFSNS.range, loc_uri))
    from nltk.corpus import ieer
    IN = re.compile(r'.*\bin\b(?!\b.+ing\b)')
    for item in ieer.fileids():
        for doc in ieer.parsed_docs(item):
            for reldict in extract_rels('ORG', 'LOC', doc, corpus='ieer', pattern=IN):
                graph.add(make_rdf(ns, reldict, relsym='in'))
                for triple in make_rdfs(ns, reldict):
                    graph.add(triple)
    return graph
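A hypothetical usage sketch (not from the project above): assuming the helpers referenced in rels2rdf (BASE, sym2uri, make_rdf, make_rdfs, RDFNS, RDFSNS, extract_rels) are defined in the enclosing module and rdflib is installed, the returned ConjunctiveGraph can be serialized with the standard rdflib API; the ns argument is assumed here to be a plain base-namespace string.
# Hypothetical usage: ns is assumed to be a base namespace string; serialize()
# is standard rdflib Graph API.
graph = rels2rdf("http://nltk.org/terms/")
print(graph.serialize(format="turtle"))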
Example #2
def rels2rdf(ns, verbose=False):
    """
    Convert the reldicts derived from the IEER corpus into an RDF graph.
    """
    graph = ConjunctiveGraph()
    graph.bind('nltk',BASE)
    graph.bind('org', "http://nltk.org/terms/org#")
    graph.bind('loc', "http://nltk.org/terms/loc#")
    graph.bind('pred', "http://nltk.org/terms/pred#")
    graph.bind('class', "http://nltk.org/terms/class#")
    in_uri = sym2uri(ns, 'pred', 'in')
    loc_uri = sym2uri(ns, 'class', 'Location')
    org_uri = sym2uri(ns, 'class', 'Organization')
    graph.add((in_uri, RDFNS.type, RDFSNS.Property))
    graph.add((loc_uri, RDFNS.type, RDFSNS.Class))
    graph.add((org_uri, RDFNS.type, RDFSNS.Class))
    graph.add((in_uri, RDFSNS.domain, org_uri))
    graph.add((in_uri, RDFSNS.range, loc_uri))
    from nltk.corpus import ieer
    IN = re.compile(r'.*\bin\b(?!\b.+ing\b)')
    for item in ieer.items:
        for doc in ieer.parsed_docs(item):
            for reldict in relextract('ORG', 'LOC', doc, pattern=IN):
                graph.add(make_rdf(ns, reldict, relsym='in'))
                for triple in make_rdfs(ns, reldict):
                    graph.add(triple)
    return graph
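Example #3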
def in_demo(trace=0, sql=True):
    """
    Select pairs of organizations and locations whose mentions occur with an
    intervening occurrence of the preposition "in".

    If the sql parameter is set to True, then the entity pairs are loaded into
    an in-memory database, and subsequently pulled out using an SQL "SELECT"
    query.
    """
    from nltk.corpus import ieer
    if sql:
        try:
            import sqlite3
            connection = sqlite3.connect(":memory:")
            connection.text_factory = sqlite3.OptimizedUnicode
            cur = connection.cursor()
            cur.execute("""create table Locations
            (OrgName text, LocationName text, DocID text)""")
        except ImportError:
            import warnings
            warnings.warn("Cannot import sqlite; sql flag will be ignored.")

    IN = re.compile(r'.*\bin\b(?!\b.+ing)')

    print
    print "IEER: in(ORG, LOC) -- just the clauses:"
    print "=" * 45

    for file in ieer.fileids():
        for doc in ieer.parsed_docs(file):
            if trace:
                print doc.docno
                print "=" * 15
            for rel in extract_rels('ORG',
                                    'LOC',
                                    doc,
                                    corpus='ieer',
                                    pattern=IN):
                print show_clause(rel, relsym='IN')
                if sql:
                    try:
                        rtuple = (rel['subjtext'], rel['objtext'], doc.docno)
                        cur.execute(
                            """insert into Locations 
                                    values (?, ?, ?)""", rtuple)
                        connection.commit()
                    except NameError:
                        pass

    if sql:
        try:
            cur.execute("""select OrgName from Locations
                        where LocationName = 'Atlanta'""")
            print
            print "Extract data from SQL table: ORGs in Atlanta"
            print "-" * 15
            for row in cur:
                print row
        except NameError:
            pass
Example #4
def rels2rdf(ns, verbose=False):
    """
    Convert the reldicts derived from the IEER corpus into an RDF graph.
    """
    graph = ConjunctiveGraph()
    graph.bind("nltk", BASE)
    graph.bind("org", "http://nltk.org/terms/org#")
    graph.bind("loc", "http://nltk.org/terms/loc#")
    graph.bind("pred", "http://nltk.org/terms/pred#")
    graph.bind("class", "http://nltk.org/terms/class#")
    in_uri = sym2uri(ns, "pred", "in")
    loc_uri = sym2uri(ns, "class", "Location")
    org_uri = sym2uri(ns, "class", "Organization")
    graph.add((in_uri, RDFNS.type, RDFSNS.Property))
    graph.add((loc_uri, RDFNS.type, RDFSNS.Class))
    graph.add((org_uri, RDFNS.type, RDFSNS.Class))
    graph.add((in_uri, RDFSNS.domain, org_uri))
    graph.add((in_uri, RDFSNS.range, loc_uri))
    from nltk.corpus import ieer

    IN = re.compile(r".*\bin\b(?!\b.+ing\b)")
    for item in ieer.fileids():
        for doc in ieer.parsed_docs(item):
            for reldict in extract_rels("ORG", "LOC", doc, corpus="ieer", pattern=IN):
                graph.add(make_rdf(ns, reldict, relsym="in"))
                for triple in make_rdfs(ns, reldict):
                    graph.add(triple)
    return graph
Example #5
def parse_doc(filename, index):

    docs = ieer.parsed_docs(filename)
    dt = docs[index].text
    words = dt.leaves()
    tags = tree2conll_without_postags(dt)

    rr = nltk.sent_tokenize(' '.join(words))
    # small fixes:
    if filename == 'NYT_19980315' and index == 11:
        rr[8] = rr[8] + rr[9]
        rr.remove(rr[9])
    if filename == 'NYT_19980407':
        if index == 4:
            rr[19] = rr[19] + rr[20]
            rr.remove(rr[20])
        if index == 13:
            rr[9] = rr[9] + rr[10]
            rr.remove(rr[10])

    L = get_breaks(words, rr)
    L.append(len(tags))  # otherwise you miss the last sentence of the document
    tags = [tags[L[i]:L[i + 1]] for i in range(len(L) - 1)]
    return tags
Example #6
def in_demo(trace=0, sql=True):
    """
    Select pairs of organizations and locations whose mentions occur with an
    intervening occurrence of the preposition "in".

    If the sql parameter is set to True, then the entity pairs are loaded into
    an in-memory database, and subsequently pulled out using an SQL "SELECT"
    query.
    """
    from nltk.corpus import ieer

    if sql:
        try:
            import sqlite3

            connection = sqlite3.connect(":memory:")
            connection.text_factory = sqlite3.OptimizedUnicode
            cur = connection.cursor()
            cur.execute("""create table Locations
            (OrgName text, LocationName text, DocID text)""")
        except ImportError:
            import warnings

            warnings.warn("Cannot import sqlite; sql flag will be ignored.")

    IN = re.compile(r'.*\bin\b(?!\b.+ing)')

    print()
    print("IEER: in(ORG, LOC) -- just the clauses:")
    print("=" * 45)

    for file in ieer.fileids():
        for doc in ieer.parsed_docs(file):
            if trace:
                print(doc.docno)
                print("=" * 15)
            for rel in extract_rels('ORG', 'LOC', doc, corpus='ieer', pattern=IN):
                print(clause(rel, relsym='IN'))
                if sql:
                    try:
                        rtuple = (rel['subjtext'], rel['objtext'], doc.docno)
                        cur.execute("""insert into Locations
                                    values (?, ?, ?)""", rtuple)
                        connection.commit()
                    except NameError:
                        pass

    if sql:
        try:
            cur.execute("""select OrgName from Locations
                        where LocationName = 'Atlanta'""")
            print()
            print("Extract data from SQL table: ORGs in Atlanta")
            print("-" * 15)
            for row in cur:
                print(row)
        except NameError:
            pass
Example #7
def ieer_headlines():
    from nltk.corpus import ieer

    print("IEER: First 20 Headlines")
    print("=" * 45)

    trees = [(doc.docno, doc.headline) for file in ieer.fileids() for doc in ieer.parsed_docs(file)]
    for tree in trees[:20]:
        print()
        print("%s:\n%s" % tree)
Example #8
def ieer_headlines():
    
    from nltk.corpus import ieer
    from nltk.tree import Tree
    
    print "IEER: First 20 Headlines"
    print "=" * 45
    
    # Pair each headline with its docno inside the comprehension, so the print
    # below refers to the matching document (doc would otherwise be a leftover loop variable).
    trees = [(doc.docno, doc.headline) for file in ieer.fileids() for doc in ieer.parsed_docs(file)]
    for docno, headline in trees[:20]:
        print
        print "%s:\n%s" % (docno, headline)
Example #9
def ieer_headlines():

    from nltk.corpus import ieer
    from nltk.tree import Tree
    
    print("IEER: First 20 Headlines")
    print("=" * 45)  
    
    trees = [(doc.docno, doc.headline) for file in ieer.fileids() for doc in ieer.parsed_docs(file)]
    for tree in trees[:20]:
        print()
        print("%s:\n%s" % tree)
Example #10
def roles_demo(trace=0):
    from nltk.corpus import ieer

    roles = r"""
    (.*(                   # assorted roles
    analyst|
    chair(wo)?man|
    commissioner|
    counsel|
    director|
    economist|
    editor|
    executive|
    foreman|
    governor|
    head|
    lawyer|
    leader|
    librarian).*)|
    manager|
    partner|
    president|
    producer|
    professor|
    researcher|
    spokes(wo)?man|
    writer|
    ,\sof\sthe?\s*  # "X, of (the) Y"
    """
    ROLES = re.compile(roles, re.VERBOSE)

    print()
    print("IEER: has_role(PER, ORG) -- raw rtuples:")
    print("=" * 45)

    for file in ieer.fileids():
        for doc in ieer.parsed_docs(file):
            lcon = rcon = False
            if trace:
                print(doc.docno)
                print("=" * 15)
                lcon = rcon = True
            for rel in extract_rels("PER",
                                    "ORG",
                                    doc,
                                    corpus="ieer",
                                    pattern=ROLES):
                print(rtuple(rel, lcon=lcon, rcon=rcon))
Example #11
def in_demo(trace=0):
 
    from nltk.corpus import ieer

    IN = re.compile(r'.*\bin\b(?!\b.+ing\b)')
    
    print
    print "in(ORG, LOC) -- just the clauses:"
    print "=" * 45

    for file in ieer.files():
        for doc in ieer.parsed_docs(file):
            if trace:
                print doc.docno
                print "=" * 15
            for rel in relextract('ORG', 'LOC', doc, pattern=IN):
                print show_clause(rel, relsym='IN')
Example #12
def write_conll(filename):
    """ For example: filename = 'APW_19980314'

    """
    docs = ieer.parsed_docs(filename)
    numdocs = len(docs)

    conllfile = CONLL_WRITE + filename
    with open(conllfile, 'a+') as nfd:
        for index in range(numdocs):
            nfd.write('\n')
            nfd.write('-DOCSTART- -X- -X- O')
            nfd.write('\n')
            sentences = parse_doc(filename, index)
            for sent in sentences:
                nfd.write('\n')
                for iobword in sent:
                    nfd.write(iobword[0] + '\t' + iobword[1] + '\n')
Example #13
def roles_demo(trace=0):
    from nltk.corpus import ieer

    roles = r"""
    (.*(                   # assorted roles
    analyst|
    chair(wo)?man|
    commissioner|
    counsel|
    director|
    economist|       
    editor|
    executive|
    foreman|
    governor|
    head|
    lawyer|
    leader|
    librarian).*)|
    manager|
    partner|
    president|
    producer|
    professor|
    researcher|
    spokes(wo)?man|
    writer|
    ,\sof\sthe?\s*  # "X, of (the) Y"
    """
    ROLES = re.compile(roles, re.VERBOSE)

    print()
    print("IEER: has_role(PER, ORG) -- raw rtuples:")
    print("=" * 45)

    for file in ieer.fileids():
        for doc in ieer.parsed_docs(file):
            lcon = rcon = False
            if trace:
                print(doc.docno)
                print("=" * 15)
                lcon = rcon = True
            for rel in extract_rels('PER', 'ORG', doc, corpus='ieer', pattern=ROLES):
                print(rtuple(rel, lcon=lcon, rcon=rcon))
Example #14
def in_demo(trace=0, sql=True):

    from nltk.corpus import ieer
    if sql:
        import sqlite3
        connection = sqlite3.connect(":memory:")
        connection.text_factory = sqlite3.OptimizedUnicode
        cur = connection.cursor()
        cur.execute("""create table Locations
        (OrgName text, LocationName text, DocID text)""")

    IN = re.compile(r'.*\bin\b(?!\b.+ing\b)')

    print
    print "IEER: in(ORG, LOC) -- just the clauses:"
    print "=" * 45

    for file in ieer.files():
        for doc in ieer.parsed_docs(file):
            if trace:
                print doc.docno
                print "=" * 15
            for rel in extract_rels('ORG', 'LOC', doc, pattern=IN):
                print show_clause(rel, relsym='IN')
                if sql:
                    rtuple = (rel['subjtext'], rel['objtext'], doc.docno)
                    cur.execute(
                        """insert into Locations 
                                values (?, ?, ?)""", rtuple)
                    connection.commit()

    if sql:
        cur.execute("""select OrgName from Locations
                    where LocationName = 'Atlanta'""")
        print
        print "Extract data from SQL table: ORGs in Atlanta"
        print "-" * 15
        for row in cur:
            print row
Example #15
    #semcor
    print "semcor"

    for line in semcor.sents()[0:100]:
        s = ""
        for word in line:
            s = s + " " + word
        print s + "\n"

        for word in line:
            meanings = net.synsets(word)
            if len(meanings) > 0:
                print meanings[0].definition()
elif num == "3":

    docs = ieer.parsed_docs('APW_19980424')
    tree = docs[1].text

    from nltk.sem import relextract
    pairs = relextract.tree2semi_rel(tree)
    for s, tree in pairs[18:22]:
        print('("...%s", %s)' % (" ".join(s[-5:]), tree))

    reldicts = relextract.semi_rel2reldict(pairs)
    for k, v in sorted(reldicts[0].items()):
        print(k, '=>', v)

#	The function relextract() allows us to filter the reldicts
#	according to the classes of the subject and object named entities.
#	In addition, we can specify that the filler text has to match a given regular expression,
#	 as illustrated in the next example. Here, we are looking for pairs of entities in the IN
#	relation, where IN has signature <ORG, LOC>.
Example #16
File: names.py Project: Glank/rdp
from nltk.corpus import ieer
import nltk

def tree_search(tree, filt):
    for node in tree:
        if filt(node):
            yield node
        elif isinstance(node, nltk.tree.Tree):
            for sub in tree_search(node, filt):
                yield sub

def is_person(t):
    if not isinstance(t, nltk.tree.Tree):
        return False
    return t.label()=="PERSON"

docs = ieer.parsed_docs('NYT_19980315')
for doc in docs:
    for elem in tree_search(doc.text, is_person):
        print ' '.join(elem.leaves())
Example #17
from nltk.corpus import conll2000, conll2002
print(conll2000.sents()) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
for tree in conll2000.chunked_sents()[:2]:
    print(tree) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
print(conll2002.sents()) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
for tree in conll2002.chunked_sents()[:2]:
    print(tree) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE


# SEMCOR
    
from nltk.corpus import semcor
print(semcor.words())
print(semcor.chunks())
print(semcor.sents()) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
print(semcor.chunk_sents()) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
list(map(str, semcor.tagged_chunks(tag='both')[:3]))
[[str(c) for c in s] for s in semcor.tagged_sents(tag='both')[:2]]    

# IEER

from nltk.corpus import ieer
ieer.fileids() # doctest: +NORMALIZE_WHITESPACE
docs = ieer.parsed_docs('APW_19980314')
print(docs[0])
print(docs[0].docno)
print(docs[0].doctype)
print(docs[0].date_time)
print(docs[0].headline)
print(docs[0].text) # doctest: +ELLIPSIS
Example #18
from nltk.corpus import ieer
from nltk.sem import relextract
import re

docs = ieer.parsed_docs('NYT_19980315')
tree = docs[1].text

#print(tree)

# Converts the NE-chunked document into a list of two-member lists, each containing a string followed by a tree (the named entity).
# E.g.: "about first-level questions, said Ms."   => string
#       (PERSON Cohn)                             => named entity, a tree whose root is PERSON and whose child node is Cohn
pairs = relextract.tree2semi_rel(tree)
""" for s, tree in pairs[:3]:
    print('*'*20)
    print(' '.join(s))
    print(tree) """

# Processes three of the above pairs at a time into a dictionary. Eg: (string1, tree1)    (string2, tree2)    (string3, tree3)
# string1 is stored as left context.
# tree1 is the subject, string2 is the filler, and tree2 is the object.
# string3 is stored as right context.
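# (The exact keys depend on the NLTK version, but a reldict typically carries
# 'lcon', 'subjclass', 'subjtext', 'subjsym', 'filler', 'objclass', 'objtext',
# 'objsym' and 'rcon'.)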
reldicts = relextract.semi_rel2reldict(pairs)
for r in reldicts:
    print('=' * 20)
    print(r['subjtext'])  # Print the subject text
    print(r['filler'])  # Print the filler information
    print(r['objtext'])  # Print the object text

# Matches any number of characters followed by the word "in" as long as "in" is not followed by a word ending in "ing"
IN = re.compile(r'.*\bin\b(?!\b.+ing\b)')
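The snippet ends after compiling IN; a minimal, hedged continuation, using the extract_rels and rtuple functions from nltk.sem.relextract on the document already loaded above, could look like this:
# Sketch: apply the IN filter to the NYT_19980315 document loaded above and
# print the resulting relation tuples.
for rel in relextract.extract_rels('ORG', 'LOC', docs[1], corpus='ieer', pattern=IN):
    print(relextract.rtuple(rel))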
Example #19
def ieer_chunked_sents(tag=nltk.tag.pos_tag):
	for doc in ieer.parsed_docs():
		tagged = ieertree2conlltags(doc.text, tag)
		yield conlltags2tree(tagged)
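A small usage sketch, assuming the ieertree2conlltags helper used above is importable from the surrounding project:
# Hypothetical usage: the generator yields one chunked Tree per IEER document.
first_tree = next(ieer_chunked_sents())
print(first_tree)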
Example #20
        if tag == "WP":
            desired_subjclass = "PERSON"

if desired_subjclass != "PERSON":
    print "Sorry, I don't know how to answer that kind of question. "

elif desired_objclass != "ORGANIZATION":
    print "Sorry, I don't know how to look for that answer. "

else:
    evidence = []

    print "Searching for your answer... "

    for fileid in ieer.fileids():
        for doc in ieer.parsed_docs(fileid):
            for rel in relextract.extract_rels('PER',
                                               'ORG',
                                               doc,
                                               corpus='ieer',
                                               pattern=ROLES):
                if rel['objclass'] == "ORGANIZATION" and rel[
                        'objtext'] == desired_objtext:
                    if rel['subjclass'] == "PERSON":
                        desired_subjtext = rel['subjtext']
                        evidence.append(rel)
                #print(relextract.rtuple(rel))

    if desired_subjtext != "":
        print "\nI've got the answer! The person you're looking for is " + desired_subjtext + ". "
        print "\nHere's some supporting evidence:"
Example #21
def main():
    print "user input(1) or semcor(2)?"

    num = raw_input()

    if num == "1":
        #input
        print "enter word"
        word = raw_input()
        for meaning in (net.synsets(word)):
            #print "Sense: " + re.findall("'.*'", str(meaning))[0]
            print "Sense: " + str(meaning)
            print meaning.definition() + "\n"
            hypernyms = (meaning.hypernyms())
            if len(hypernyms) > 0:
                print "\nHypernyms:"
                for meaning2 in hypernyms:
                    print re.findall("'.*'", str(meaning2))[0]

            hyponyms = (meaning.hyponyms())
            if len(hyponyms) > 0:
                print "\nHyponyms:"
                for meaning2 in hyponyms:
                    print re.findall("'.*'", str(meaning2))[0]

    #		print "\nHypernym Tree:"
    #		print (gethypernymtree(meaning))
            print "\n"

    #		dog = wn.synset('dog.n.01')
    #		hypo = lambda s: s.hyponyms()
    #	 	hyper = lambda s: s.hypernyms()
    #list(dog.closure(s.hypernyms(), depth=1)) == dog.hypernyms()
    #True
    #>>> list(dog.closure(hyper, depth=1)) == dog.hypernyms()

    elif (num == "2"):
        #semcor
        print "semcor"

        for line in semcor.sents()[0:100]:
            s = ""
            for word in line:
                s = s + " " + word
            print s + "\n"

            for word in line:
                meanings = net.synsets(word)
                if len(meanings) > 0:
                    print meanings[0].definition()
    elif num == "3":

        docs = ieer.parsed_docs('APW_19980424')
        tree = docs[1].text

        from nltk.sem import relextract
        pairs = relextract.tree2semi_rel(tree)
        for s, tree in pairs[18:22]:
            print('("...%s", %s)' % (" ".join(s[-5:]), tree))

        reldicts = relextract.semi_rel2reldict(pairs)
        for k, v in sorted(reldicts[0].items()):
            print(k, '=>', v)

    #	The function relextract() allows us to filter the reldicts
    #	according to the classes of the subject and object named entities.
    #	In addition, we can specify that the filler text has to match a given regular expression,
    #	 as illustrated in the next example. Here, we are looking for pairs of entities in the IN
    #	relation, where IN has signature <ORG, LOC>.
        IN = re.compile(r'(\s?[a-z].*)?\bin\b(?!\b.+ing\b)')
        for fileid in ieer.fileids():
            print fileid
            for doc in ieer.parsed_docs(fileid):
                for rel in relextract.extract_rels('ORG',
                                                   'LOC',
                                                   doc,
                                                   corpus='ieer',
                                                   pattern=IN):
                    print(relextract.rtuple(rel))  # doctest: +ELLIPSIS

        roles = r"(.*(analyst|commissioner|professor|researcher|(spokes|chair)(wo)?m(e|a)n|writer|secretary|manager|commander|boss|founder)\s(of|in|for) (the)?)"

        ROLES = re.compile(roles, re.VERBOSE)
        for fileid in ieer.fileids():
            for doc in ieer.parsed_docs(fileid):
                for rel in relextract.extract_rels('PER',
                                                   'ORG',
                                                   doc,
                                                   corpus='ieer',
                                                   pattern=ROLES):
                    print(relextract.rtuple(rel))  # doctest: +ELLIPSIS
Example #22
#!/usr/bin/python
# -*- coding: utf-8 -*-

# NAMED ENTITIES

from nltk.corpus import ieer
docs = ieer.parsed_docs('NYT_19980315')
tree = docs[1].text
print(tree)  # doctest: +ELLIPSIS

from nltk.corpus import conll2002
for doc in conll2002.chunked_sents('ned.train')[27]:
    print(doc)

from nltk.sem import relextract
pairs = relextract.tree2semi_rel(tree)
for s, tree in pairs[18:22]:
    print('("...%s", %s)' % (" ".join(s[-5:]), tree))

reldicts = relextract.semi_rel2reldict(pairs)
for k, v in sorted(reldicts[0].items()):
    print(k, '=>', v)  # doctest: +ELLIPSIS

for r in reldicts[18:20]:
    print('=' * 20)
    print(r['subjtext'])
    print(r['filler'])
    print(r['objtext'])

import re
IN = re.compile(r'.*\bin\b(?!\b.+ing\b)')
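The example stops after compiling IN; a hedged continuation in the spirit of the earlier reldict loop would filter the corpus with extract_rels and print the raw rtuples:
# Sketch: ORG-LOC pairs whose filler matches IN, printed as relation tuples.
for fileid in ieer.fileids():
    for doc in ieer.parsed_docs(fileid):
        for rel in relextract.extract_rels('ORG', 'LOC', doc, corpus='ieer', pattern=IN):
            print(relextract.rtuple(rel))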