def create_verbmet_objects():
    """Build a VerbMetClass object for every annotated sentence that has a
    matching PropBank instance with a numeric roleset.

    First populates the module-level ``sents`` mapping (doc -> sentence
    number -> sentence text) from the master CSV, then pairs each PropBank
    instance with its annotated sentence via ``annotations``.
    """
    result = []
    pb_instances = propbank.instances()

    # Populate sents from the CSV; row 0 is the header and is skipped.
    with open('VerbMetData - Master - VerbMetData Master.csv') as csv_file:
        for row_num, row in enumerate(csv.reader(csv_file, delimiter=',')):
            if row_num != 0 and row[2] == 'N':
                sents[row[1]][int(row[3])] = row[4]

    for instance in pb_instances:
        # Dict membership replaces the original scan over every doc/sentence
        # pair; at most one (fileid, sentnum) entry can match.
        if instance.fileid not in sents:
            continue
        if instance.sentnum not in sents[instance.fileid]:
            continue
        if not instance.roleset[-1:].isnumeric():
            continue
        # unpacking instance starts here -------------------
        text = sents[instance.fileid][instance.sentnum]
        if annotations[text]:
            verb_string = annotations[text][0]
            label = annotations[text][1]
            mismatch = annotations[text][2]
            result.append(VerbMetClass(verb_string, text, instance, label, mismatch))
    return result
def __init__(self, filename="data.txt", mode=0, predicate=True, arguments=True):
    """Set up configuration and working state, and cache PropBank instances.

    Args:
        filename: name of the data file to work with (default ``"data.txt"``).
        mode: integer mode flag (semantics not visible in this chunk).
        predicate: flag controlling predicate handling — presumably whether
            predicates are included; confirm against the methods that read it.
        arguments: flag controlling argument handling — see note above.
    """
    # Caller-supplied configuration.
    self.filename = filename
    self.mode = mode
    self.predicate = predicate
    self.arguments = arguments
    # Full PropBank instance list, loaded once up front.
    self.instances = propbank.instances()
    # Working buffers, filled during later processing.
    self.raw_sentence = []
    self.predicates = []
    self.arguments_list = []
    self.new_sentence = []
    self.x = 0
def create_sentences():
    """Fill the module-level ``sentence_trees`` mapping: sentence text -> its
    PropBank instance.

    Populates ``sents`` (doc -> sentence number -> sentence text) from the
    master CSV first.  When several instances share one sentence, the last
    instance seen wins, matching the original behaviour.
    """
    pb_instances = propbank.instances()

    # Load the annotated sentences; row 0 is the CSV header.
    with open('VerbMetData - Master - VerbMetData Master.csv') as csv_file:
        for idx, row in enumerate(csv.reader(csv_file, delimiter=',')):
            if idx != 0 and row[2] == 'N':
                sents[row[1]][int(row[3])] = row[4]

    # Direct lookups replace the original doc/sentence double loop: at most
    # one (fileid, sentnum) key can match each instance.
    for instance in pb_instances:
        doc_sents = sents.get(instance.fileid)
        if doc_sents is not None and instance.sentnum in doc_sents:
            sentence_trees[doc_sents[instance.sentnum]] = instance
def createCSV(inst_num):
    """Resolve the predicate's word index for PropBank instance *inst_num*.

    The predicate pointer comes in three shapes: a plain word pointer
    (has ``wordnum``), a split pointer (``pieces[0].wordnum``), or a doubly
    nested split pointer; fall back through each representation in turn.

    NOTE(review): despite the name, the visible portion writes no CSV —
    it only locates the predicate word number.  Confirm against the rest
    of the original file.

    BUG FIX: the original used Python-2-only ``except Exception, e`` syntax
    (a SyntaxError under Python 3) and never used ``e``; the handlers are
    narrowed to ``AttributeError``, which is what a missing ``wordnum``
    attribute raises.
    """
    pb_instances = propbank.instances()
    inst = pb_instances[inst_num]
    t = inst.tree
    infl = inst.inflection
    # for (argloc, argid) in inst.arguments:
    #     print('%-10s %s' % (argid, argloc.select(t).pprint(500)[:50]))
    try:
        # Simple predicate: the pointer carries the word number directly.
        index_pred = inst.predicate.wordnum
    except AttributeError:
        try:
            # Split predicate: take the first piece's word number.
            index_pred = inst.predicate.pieces[0].wordnum
        except AttributeError:
            # Doubly nested split predicate.
            index_pred = inst.predicate.pieces[0].pieces[0].wordnum
def findexamples(self): self.allexamples=[] processed=0 for instance in pb.instances(): if self.testing>5: print instance.roleset print instance.arguments try: roleset=pb.roleset(instance.roleset) examples=roleset.findall('example') for e in examples: #print ElementTree.tostring(e).decode('utf8').strip() self.allexamples.append(e) except: pass processed+=1 if self.testing > 2 and processed>10: break print "Number of examples: ",len(self.allexamples)
def __init__(self):
    """Load the spaCy model, the question templates, and the linguistic
    lookup tables used for question generation.

    BUG FIX: the original ``pro_dict`` literal repeated the keys
    ``'mine'``, ``'yours'`` and ``'ours'`` (later entries silently win in a
    dict literal); the duplicates carried identical values, so removing
    them leaves the resulting mapping unchanged.
    """
    self.nlp = spacy.load('en')
    self.question_dict = {}
    self.datapath = 'q_temps.tsv'
    # Load question templates from tsv: one row per template.
    self.df = pd.read_csv(self.datapath, sep='\t')
    self.df.columns = ['Q', 'POS', 'TAG']
    # Dependency labels treated as auxiliaries / clause heads.
    self.aux_list = ['aux', 'auxpass', 'ROOT', 'advcl', 'ccomp', 'conj']
    # Verb POS tags.
    self.verb_list = ['VBG', 'VBN', 'VBD', 'VBP', 'VBZ', 'VB']
    # Subject / object dependency labels.
    self.subj_list = ['nsubj', 'nsubjpass', 'expl', 'csubj', 'csubjpass']
    self.obj_list = ['obj', 'dobj', 'pobj']
    self.conj_list = ['CC', 'IN']
    self.indefinites = ['a', 'an']
    # NER / tag labels that signal temporal expressions.
    self.time_list = ['DATE', 'TIME', 'WRB']
    # Tag -> auxiliary insertion recipe (do-support for each finite tag).
    self.aux_dict = {'VBD': ['did', 'do', 'VBD', 'O', '', 'aux'],
                     'VBP': ['do', 'do', 'VB', 'O', '', 'aux'],
                     'VBZ': ['does', 'do', 'VBZ', 'O', '', 'aux']}
    # First/second-person pronoun flips (speaker <-> addressee).
    self.pro_dict = {'mine': 'yours', 'yours': 'mine', 'ours': 'yours',
                     'we': 'you', 'We': 'you', 'my': 'your', 'our': 'your',
                     'your': 'my', 'myself': 'yourself',
                     'ourself': 'yourself', 'yourself': 'myself',
                     'me': 'you', 'i': 'you'}
    # Third-person subject -> object case forms.
    self.pro_dict2 = {'she': 'her', 'he': 'him', 'they': 'them',
                      'her': 'her', 'him': 'him', 'them': 'them'}
    self.personal_nouns = ['PERSON', 'ORG']
    self.prep_list = ['at', 'for', 'to', 'on', 'by', 'with', 'in', 'from',
                      'into', 'about', 'behind', 'against', 'of', 'as']
    self.pb_inst = propbank.instances()
    # Verb base form (roleset id minus the '.NN' suffix) -> position in the
    # PropBank instance list.
    self.verb_dict = {inst.roleset[:-3]: i
                      for i, inst in enumerate(self.pb_inst)}
# -*- coding: utf-8 -*- import nltk import pickle #nltk.download('propbank') #nltk.download('treebank') from tqdm import tqdm from nltk.corpus import treebank from nltk.corpus import propbank # Tiny example: PropBank pb_instances = propbank.instances() len(pb_instances) # 112917 inst = pb_instances[1] inst.fileid, inst.sentnum, inst.wordnum print(propbank.instances()[1]) infl = inst.inflection infl.form, infl.tense, infl.aspect, infl.person, infl.voice # Tiny example: TreeBank len(treebank.fileids()) # 199 len(treebank.parsed_sents()) # 3914 print(treebank.words('wsj_0001.mrg')[:]) # Compile all propbank metadata of verbs pb_instances = propbank.instances() index = [(inst.fileid, inst.sentnum, inst.wordnum, inst.inflection.tense) for inst in tqdm(pb_instances)] ann = [] for fileid, sentnum, wordnum, tense in tqdm(index):
from nltk.corpus.reader.propbank import PropbankSplitTreePointer


def is_raising(pb_instance):
    """Return True when any argument of *pb_instance* is located by a
    split tree pointer (a discontinuous constituent)."""
    return any(isinstance(arg[0], PropbankSplitTreePointer)
               for arg in pb_instance.arguments)


if __name__ == "__main__":
    from nltk.corpus import propbank

    # Quick manual check on an early PropBank instance.
    pb_instances = propbank.instances()[:9353]
    inst = pb_instances[3]
    t = inst.tree
    pred = inst.predicate.select(t)
# Exploration script (Python 2 -- uses print statements): inspect one
# PropBank instance and dump the roles and examples of the 'send.01' roleset.
__author__ = 'juliewe'

from nltk.corpus import propbank
from xml.etree import ElementTree

# Index of the PropBank instance to inspect.
n=103

if __name__=='__main__':
    pb_instances=propbank.instances()
    #print(pb_instances)
    instance=pb_instances[n]
    # Roleset id, predicate pointer, and argument list of the instance.
    print instance.roleset,instance.predicate,instance.arguments
    send_01=propbank.roleset('send.01')
    # Numbered roles (ARG0, ARG1, ...) with their human-readable descriptions.
    for role in send_01.findall("roles/role"):
        print role.attrib['n'],role.attrib['descr']
    #print (ElementTree.tostring(send_01.find('example')).decode('utf8').strip())
    examples=send_01.findall('example')
    print len(examples)
    for e in examples:
        # Dump each <example> element as raw XML text.
        print ElementTree.tostring(e).decode('utf8').strip()
def get_instances(baseforms=None, instances_limit=None):
    """Return PropBank instances, optionally truncated and filtered.

    Args:
        baseforms: optional iterable of verb base forms; when given, only
            instances whose ``baseform`` is among them are kept.
        instances_limit: when truthy, only the first ``instances_limit``
            instances are considered (falsy, including 0, means no limit —
            preserved from the original).

    Returns:
        List of PropBank instances.
    """
    instances = propbank.instances()[:instances_limit] if instances_limit else propbank.instances()
    if baseforms:
        # PERF: set membership is O(1) per instance vs O(len(baseforms))
        # for the original list scan.
        wanted = set(baseforms)
        instances = [i for i in instances if i.baseform in wanted]
    return instances
import nltk
from nltk.corpus import propbank as pb
import re
import os
from collections import deque
from xml.etree import ElementTree

os.chdir(os.getcwd())  # no-op; kept from the original (likely a placeholder)
from cpyDatumTron import atum, datum, katum, Of, Intersect, Union

# Rolesets whose first <example> has already been written out.
done = {}


def fileGenerator(rolesetString):
    """Append the first <example> element of *rolesetString*'s roleset to
    'propbank-examples.xml', followed by a newline.

    BUG FIX: the original tested ``ElementTree.tostring(...) != None`` —
    ``tostring`` never returns None, and it raises TypeError when
    ``find('example')`` returns None.  Check the ``find`` result itself and
    close the file via a context manager.
    """
    example = pb.roleset(rolesetString).find('example')
    if example is None:
        return
    with open("propbank-examples.xml", "a") as out:
        out.write(ElementTree.tostring(example).decode('utf8').strip())
        out.write("\n")


# Dump one example per distinct roleset, skipping unresolved ('.XX') ones.
for instance in pb.instances():
    if instance.roleset.split('.')[1] != 'XX':
        if not done.get(instance.roleset, False):
            done[instance.roleset] = True
            fileGenerator(instance.roleset)
def get_instances(baseforms=None, instances_limit=None):
    """Fetch PropBank instances; optionally cap the count and filter by
    base form.

    Args:
        baseforms: optional collection of verb base forms to keep.
        instances_limit: when truthy, consider only the first that many
            instances; falsy means no cap.

    Returns:
        List of PropBank instances.
    """
    selected = propbank.instances()
    if instances_limit:
        selected = selected[:instances_limit]
    if baseforms:
        selected = [inst for inst in selected if inst.baseform in baseforms]
    return selected
#Part One
#Imports
print("Importing NLTK, Treebank, and Propbank...")
import nltk
import os
import pickle
from nltk.corpus import treebank
from nltk.corpus import propbank

#Detailing the Propbank
print("Creating Propbank Objects...")
pb_instances = propbank.instances()
pb_verbs = propbank.verbs()
print()
print("Number of items defined in the propbank: ", len(pb_instances))
print("Number of verbs defined in the propbank: ", len(pb_verbs))
print()

#Argument Structure of 'agree' and 'fall'
#'agree'
agreeCount = 0
print("Argument Structure of 'agree': ")
# BUG FIX: the original queried propbank.roleset('join.01') while every
# surrounding message describes 'agree'.
for role in propbank.roleset('agree.01').findall('roles/role'):
    print(role.attrib['n'], role.attrib['descr'])
print()
print("Treebank Sentences with 'agree': ")
for (n, i) in enumerate(pb_instances[:9353]):
    if i.roleset == 'agree.01' and len(i.tree.leaves()) <= 30:
        #This prints ALL of the sentences
        print("Sentence ", n, ":", ' '.join(i.tree.leaves()))
        # BUG FIX: agreeCount was never incremented, so the header always
        # read "Sentence # 1"; count each match.
        agreeCount += 1
        print("Arguments of Sentence #", agreeCount)
        for (argloc, argid) in i.arguments:
            print(argid)
##sentences with their doc name, instance number, etc.
##Stats:
#9000 "instances" (individual verbs annotated with their arguments)
#186 total documents represented among all the instances
#3469 total sentences
import nltk
import csv
from nltk.corpus import treebank, propbank
from collections import defaultdict

# Grabbing the first 9000 propbank instances (these 9000 instances map to
# 186 documents total)
instances = propbank.instances()[:9000]

# The set of documents represented by these 9000 instances.
docs = {instance.fileid for instance in instances}

# WSJ document (key) -> all propbank instances contained in that doc (value).
# PERF FIX: built in one pass over `instances` instead of re-scanning all
# 9000 instances once per document (O(docs x instances) originally); the
# per-doc lists keep the same contents and order.
instancedict = defaultdict(list)
for instance in instances:
    instancedict[instance.fileid].append(instance)

# WSJ doc -> sentence number -> list of that sentence's PB instances.
# Same single-pass grouping; results match the original nested scans.
sentdict = defaultdict(lambda: defaultdict(list))
for instance in instances:
    sentdict[instance.fileid][instance.sentnum].append(instance)