예제 #1
0
def create_verbmet_objects():
    """Build VerbMetClass objects by joining the VerbMetData CSV with PropBank.

    Loads non-metaphor ('N') rows from the CSV into the module-level ``sents``
    mapping (fileid -> sentnum -> sentence text), then pairs each PropBank
    instance with its annotated sentence.

    Returns:
        list: one VerbMetClass per matched instance whose roleset ends in a
        digit and whose sentence has a truthy annotation entry.
    """
    verbmet_objects = []
    pb_instances = propbank.instances()
    with open('VerbMetData - Master - VerbMetData Master.csv') as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=',')
        for line_count, row in enumerate(csv_reader):
            # Skip the header row; keep only rows flagged 'N'.
            if line_count != 0 and row[2] == 'N':
                sents[row[1]][int(row[3])] = row[4]
    for instance in pb_instances:
        # Direct lookup replaces the original scan over every doc and every
        # sentence: an instance matches at most one (fileid, sentnum) pair.
        doc_sents = sents.get(instance.fileid)
        if doc_sents is None or instance.sentnum not in doc_sents:
            continue
        if instance.roleset[-1:].isnumeric():
            text = doc_sents[instance.sentnum]
            if annotations[text]:
                verb_string = annotations[text][0]
                label = annotations[text][1]
                mismatch = annotations[text][2]
                v_object = VerbMetClass(verb_string, text, instance, label, mismatch)
                verbmet_objects.append(v_object)
    return verbmet_objects
예제 #2
0
 def __init__(self, filename="data.txt", mode=0, predicate=True, arguments=True):
     """Set up caller-supplied options and empty working state.

     Args:
         filename: name of the data file to work with.
         mode: processing mode flag.
         predicate: whether predicate handling is enabled.
         arguments: whether argument handling is enabled.
     """
     # Options provided by the caller.
     self.filename = filename
     self.mode = mode
     self.predicate = predicate
     self.arguments = arguments
     # Working state, populated as processing proceeds.
     self.instances = propbank.instances()
     self.raw_sentence = []
     self.predicates = []
     self.arguments_list = []
     self.new_sentence = []
     self.x = 0
예제 #3
0
def create_sentences():
    """Populate ``sentence_trees`` by joining the VerbMetData CSV with PropBank.

    Loads non-metaphor ('N') rows from the CSV into the module-level ``sents``
    mapping (fileid -> sentnum -> sentence text), then maps each annotated
    sentence string to its PropBank instance in ``sentence_trees``.
    """
    pb_instances = propbank.instances()
    with open('VerbMetData - Master - VerbMetData Master.csv') as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=',')
        for line_count, row in enumerate(csv_reader):
            # Skip the header row; keep only rows flagged 'N'.
            if line_count != 0 and row[2] == 'N':
                sents[row[1]][int(row[3])] = row[4]
    for instance in pb_instances:
        # Direct lookup instead of scanning every doc and sentence: an
        # instance matches at most one (fileid, sentnum) pair.
        doc_sents = sents.get(instance.fileid)
        if doc_sents is not None and instance.sentnum in doc_sents:
            sentence_trees[doc_sents[instance.sentnum]] = instance
def createCSV(inst_num):
    """Locate the predicate word index for PropBank instance ``inst_num``.

    The predicate pointer may expose ``wordnum`` directly, or be a split
    pointer nested one or two levels inside ``pieces``; fall through until a
    word number is found.

    NOTE(review): ``t`` and ``infl`` are unused in the visible portion of this
    snippet and there is no return; the original appears truncated, so both
    are kept to avoid breaking the missing tail.
    """
    pb_instances = propbank.instances()
    inst = pb_instances[inst_num]
    t = inst.tree
    infl = inst.inflection
    try:
        index_pred = inst.predicate.wordnum
    except Exception:  # was Python 2 'except Exception, e'; 'e' was unused
        try:
            index_pred = inst.predicate.pieces[0].wordnum
        except Exception:
            index_pred = inst.predicate.pieces[0].pieces[0].wordnum
예제 #5
0
 def findexamples(self):
     """Collect every <example> element from the rolesets of all PropBank
     instances into ``self.allexamples``.

     ``self.testing`` gates behaviour: above 5 it prints per-instance debug
     info; above 2 it stops after roughly ten instances.
     """
     self.allexamples = []
     processed = 0
     for instance in pb.instances():
         if self.testing > 5:
             print(instance.roleset)
             print(instance.arguments)
         try:
             roleset = pb.roleset(instance.roleset)
             examples = roleset.findall('example')
             for e in examples:
                 self.allexamples.append(e)
         except Exception:
             # Some instance rolesets have no frame entry; this is a
             # deliberate best-effort collection, so skip them.
             pass
         processed += 1
         if self.testing > 2 and processed > 10:
             break
     print("Number of examples: ", len(self.allexamples))
예제 #6
0
	def __init__(self):
		"""Load spaCy, the question templates, and the POS/dependency tag
		lists used for question generation, plus a verb -> PropBank index map."""
		self.nlp = spacy.load('en')
		self.question_dict = {}
		self.datapath = 'q_temps.tsv'
		# Load question templates from tsv.
		self.df = pd.read_csv(self.datapath, sep='\t')
		self.df.columns = ['Q', 'POS', 'TAG']
		self.aux_list = ['aux', 'auxpass', 'ROOT', 'advcl', 'ccomp', 'conj']
		self.verb_list = ['VBG', 'VBN', 'VBD', 'VBP', 'VBZ', 'VB']
		self.subj_list = ['nsubj', 'nsubjpass', 'expl', 'csubj', 'csubjpass']
		self.obj_list = ['obj', 'dobj', 'pobj']
		self.conj_list = ['CC', 'IN']
		self.indefinites = ['a', 'an']
		self.time_list = ['DATE', 'TIME', 'WRB']
		self.aux_dict = {
			'VBD': ['did', 'do', 'VBD', 'O', '', 'aux'],
			'VBP': ['do', 'do', 'VB', 'O', '', 'aux'],
			'VBZ': ['does', 'do', 'VBZ', 'O', '', 'aux'],
		}
		# First/second-person pronoun swaps. The original literal repeated
		# 'mine', 'yours' and 'ours' with identical values; the duplicates
		# were removed (the resulting dict is unchanged).
		self.pro_dict = {
			'mine': 'yours', 'yours': 'mine', 'ours': 'yours',
			'we': 'you', 'We': 'you', 'my': 'your', 'our': 'your',
			'your': 'my', 'myself': 'yourself', 'ourself': 'yourself',
			'yourself': 'myself', 'me': 'you', 'i': 'you',
		}
		self.pro_dict2 = {
			'she': 'her', 'he': 'him', 'they': 'them',
			'her': 'her', 'him': 'him', 'them': 'them',
		}
		self.personal_nouns = ['PERSON', 'ORG']
		self.prep_list = ['at', 'for', 'to', 'on', 'by', 'with', 'in', 'from',
			'into', 'about', 'behind', 'against', 'of', 'as']
		self.pb_inst = propbank.instances()
		# Map each verb key (roleset minus its 3-char '.NN' suffix) to a
		# position in the PropBank instance list; later instances with the
		# same key overwrite earlier ones.
		self.verb_dict = {self.pb_inst[i].roleset[:-3]: i
			for i, item in enumerate(self.pb_inst)}
# -*- coding: utf-8 -*-

import nltk
import pickle
#nltk.download('propbank')
#nltk.download('treebank')
from tqdm import tqdm
from nltk.corpus import treebank
from nltk.corpus import propbank

# Tiny example: PropBank
# NOTE: REPL-style walkthrough — the bare expressions below evaluate and
# discard their results; they only display output in an interactive session.
pb_instances = propbank.instances()
len(pb_instances)  # 112917
inst = pb_instances[1]
inst.fileid, inst.sentnum, inst.wordnum
print(propbank.instances()[1])
infl = inst.inflection
infl.form, infl.tense, infl.aspect, infl.person, infl.voice

# Tiny example: TreeBank
len(treebank.fileids())  # 199
len(treebank.parsed_sents())  # 3914
print(treebank.words('wsj_0001.mrg')[:])

# Compile all propbank metadata of verbs
pb_instances = propbank.instances()
index = [(inst.fileid, inst.sentnum, inst.wordnum, inst.inflection.tense)
         for inst in tqdm(pb_instances)]

ann = []
# NOTE(review): the body of this loop is missing from this chunk — it was
# presumably cut off; confirm against the full source before running.
for fileid, sentnum, wordnum, tense in tqdm(index):
예제 #8
0
from nltk.corpus.reader.propbank import PropbankSplitTreePointer


def is_raising(pb_instance):
    """Return True when any argument of *pb_instance* is annotated with a
    split (discontinuous) tree pointer, False otherwise."""
    return any(
        isinstance(argloc, PropbankSplitTreePointer)
        for argloc, _argid in pb_instance.arguments
    )


if __name__ == "__main__":
    from nltk.corpus import propbank
    pb_instances = propbank.instances()[:9353]
    inst = pb_instances[3]
    t = inst.tree
    pred = inst.predicate.select(t)
예제 #9
0
__author__ = 'juliewe'

from nltk.corpus import propbank
from xml.etree import ElementTree

# Index of the PropBank instance to inspect.
n = 103

if __name__ == '__main__':
    # Show one instance, then the role inventory and examples of 'send.01'.
    # Python 2 print statements converted to print() calls; the
    # space-separated output is unchanged.
    pb_instances = propbank.instances()
    instance = pb_instances[n]
    print(instance.roleset, instance.predicate, instance.arguments)

    send_01 = propbank.roleset('send.01')
    for role in send_01.findall("roles/role"):
        print(role.attrib['n'], role.attrib['descr'])

    examples = send_01.findall('example')
    print(len(examples))
    for e in examples:
        print(ElementTree.tostring(e).decode('utf8').strip())
예제 #10
0
File: propbank.py  Project: WhiteAu/SRL-PSL
def get_instances(baseforms=None, instances_limit=None):
    """Fetch PropBank instances, optionally truncated and filtered.

    Args:
        baseforms: optional collection of verb baseforms to keep.
        instances_limit: optional maximum number of instances to return.

    Returns:
        list of PropBank instances.
    """
    # 'is not None' so an explicit limit of 0 (or an empty baseforms
    # collection) is honoured instead of being ignored by truthiness.
    if instances_limit is not None:
        instances = propbank.instances()[:instances_limit]
    else:
        instances = propbank.instances()
    if baseforms is not None:
        instances = [i for i in instances if i.baseform in baseforms]
    return instances
import nltk
from nltk.corpus import propbank as pb
import re
import os
from collections import deque
from xml.etree import ElementTree
os.chdir(os.getcwd())  # NOTE(review): no-op — changes to the directory we are already in
from cpyDatumTron import atum, datum, katum, Of, Intersect, Union
# Tracks rolesets already exported, so each is written at most once.
done = {}


def fileGenerator(rolesetString):
    """Append the first <example> element of a roleset to propbank-examples.xml.

    Writes nothing when the roleset has no <example> child.

    Fixes in this revision: the original leaked the file handle whenever its
    condition was false, and its ``tostring(...) != None`` guard could never
    be false (``tostring`` would instead raise on a missing element) — the
    correct check is whether ``find('example')`` returned an element.
    """
    example = pb.roleset(rolesetString).find('example')
    if example is not None:
        with open("propbank-examples.xml", "a") as file:
            file.write(ElementTree.tostring(example).decode('utf8').strip())
            file.write("\n")


# Export one example per roleset, skipping the catch-all '.XX' senses and
# any roleset that has already been written.
for instance in pb.instances():
    sense = instance.roleset.split('.')[1]
    if sense == 'XX':
        continue
    if instance.roleset in done:
        continue
    done[instance.roleset] = True
    fileGenerator(instance.roleset)
예제 #12
0
def get_instances(baseforms=None, instances_limit=None):
    """Fetch PropBank instances, optionally truncated and filtered.

    Args:
        baseforms: optional collection of verb baseforms to keep.
        instances_limit: optional maximum number of instances to return.

    Returns:
        list of PropBank instances.
    """
    # 'is not None' so an explicit limit of 0 (or an empty baseforms
    # collection) is honoured instead of being ignored by truthiness.
    if instances_limit is not None:
        instances = propbank.instances()[:instances_limit]
    else:
        instances = propbank.instances()
    if baseforms is not None:
        instances = [i for i in instances if i.baseform in baseforms]
    return instances
예제 #13
0
#Part One
#Imports
print("Importing NLTK, Treebank, and Propbank...")
import nltk
import os
import pickle
from nltk.corpus import treebank
from nltk.corpus import propbank

#Detailing the Propbank
print("Creating Propbank Objects...")
pb_instances = propbank.instances()
pb_verbs = propbank.verbs()
print()
print("Number of items defined in the propbank: ", len(pb_instances))
print("Number of verbs defined in the propbank: ", len(pb_verbs))
print()
#Argument Structure of 'agree' and 'fall'
#'agree'
agreeCount = 0
print("Argument Structure of 'agree': ")
# Fix: the original queried 'join.01' here while labelling the output
# (and everything below) as 'agree'.
for role in propbank.roleset('agree.01').findall('roles/role'):
    print(role.attrib['n'], role.attrib['descr'])
print()
print("Treebank Sentences with 'agree': ")
for (n, i) in enumerate(pb_instances[:9353]):
    if i.roleset == 'agree.01' and len(i.tree.leaves()) <= 30:
        agreeCount += 1  # fix: the counter was never incremented, so every sentence printed as #1
        print("Sentence ", n, ":", ' '.join(i.tree.leaves()))  # This prints ALL of the sentences
        print("Arguments of Sentence #", agreeCount)
        for (argloc, argid) in i.arguments:
            print(argid)
예제 #14
0
##sentences with their doc name, instance number, etc.


##Stats: 
#9000 "instances" (individual verbs annotated with their arguments)
#186 total documents represented among all the instances
#3469 total sentences


import nltk
import csv
from nltk.corpus import treebank,propbank
from collections import defaultdict

# Grabbing the first 9000 propbank instances (these 9000 instances map to 186 documents total)
instances = propbank.instances()[:9000]
# Extracting the set of documents represented by these 9000 instances.
docs = {instance.fileid for instance in instances}
# A dictionary that maps WSJ documents (keys) to all propbank instances
# contained in each doc (values). Built in a single pass instead of the
# original rescan of all instances per document (O(n) vs O(docs * n)).
instancedict = defaultdict(list)
for instance in instances:
    instancedict[instance.fileid].append(instance)

# A dictionary of WSJ docs mapped to every sentence in that doc associated
# with a PB instance; each sentence number maps to the list of its PB
# instances. Again a single pass over the instances.
sentdict = defaultdict(lambda: defaultdict(list))
for instance in instances:
    sentdict[instance.fileid][instance.sentnum].append(instance)