def read_corpus_inputs(args):
    """Read and filter the part of the corpus we want features for"""
    reader = educe.stac.Reader(args.corpus)
    anno_files = reader.filter(reader.files(),
                               mk_is_interesting(args, args.single))
    corpus = reader.slurp(anno_files, verbose=True)
    if not args.ignore_cdus:
        strip_cdus(corpus, mode=args.strip_mode)
    postags = postag.read_tags(corpus, args.corpus)
    parses = corenlp.read_results(corpus, args.corpus)
    _fuse_corpus(corpus, postags)
    for lex in LEXICONS:
        lex.read(args.resources)
    pdtb_lex = read_pdtb_lexicon(args)
    inq_lex = {}  # _read_inquirer_lexicon(args)
    verbnet_entries = [VerbNetEntry(x, frozenset(vnet.lemmas(x)))
                       for x in VERBNET_CLASSES]
    return FeatureInput(corpus=corpus,
                        postags=postags,
                        parses=parses,
                        lexicons=LEXICONS,
                        pdtb_lex=pdtb_lex,
                        verbnet_entries=verbnet_entries,
                        inquirer_lex=inq_lex)

def in_verblist(lem):
    """Return True if the given lemma is found in the VerbNet verb list"""
    return lem in verbnet.lemmas()

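# Hedged aside (not from the source above): verbnet.lemmas() returns a plain
# list, so in_verblist() rescans it on every call. A minimal sketch of a
# faster variant, assuming the check runs in a loop; the name
# in_verblist_fast is hypothetical:
from nltk.corpus import verbnet

VERBNET_LEMMAS = frozenset(verbnet.lemmas())  # built once, O(1) lookups


def in_verblist_fast(lem):
    """Set-based membership test over the cached VerbNet lemma list."""
    return lem in VERBNET_LEMMAS
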
def read_corpus_inputs(args):
    """Read and filter the part of the corpus we want features for"""
    reader = educe.stac.Reader(args.corpus)
    anno_files = reader.filter(reader.files(),
                               mk_is_interesting(args, args.single))
    corpus = reader.slurp(anno_files, verbose=True)
    if not args.ignore_cdus:
        strip_cdus(corpus)
    postags = postag.read_tags(corpus, args.corpus)
    parses = corenlp.read_results(corpus, args.corpus)
    _fuse_corpus(corpus, postags)
    for lex in LEXICONS:
        lex.read(args.resources)
    pdtb_lex = read_pdtb_lexicon(args)
    inq_lex = {}  # _read_inquirer_lexicon(args)
    verbnet_entries = [VerbNetEntry(x, frozenset(vnet.lemmas(x)))
                       for x in VERBNET_CLASSES]
    return FeatureInput(corpus=corpus,
                        postags=postags,
                        parses=parses,
                        lexicons=LEXICONS,
                        pdtb_lex=pdtb_lex,
                        verbnet_entries=verbnet_entries,
                        inquirer_lex=inq_lex)

def filterOut(self, L):
    L = list(set(L))
    lemma = WordNetLemmatizer()
    # take the verb lemmas of the words in L
    L = [lemma.lemmatize(i, pos='v') for i in L]
    # keep the words that appear in VerbNet's verb list and are not stopwords
    verbs = set(verbnet.lemmas())  # hoisted so the list isn't rebuilt per word
    L = [i for i in L if i in verbs and i not in stopWords]
    L = list(set(L))
    return L

def test_corpus_verbnet_method_returns_correct_result(self):
    self.assertEqual(verbnet.classids('accept'),
                     ['approve-77', 'characterize-29.2-1-1', 'obtain-13.5.2'])
    self.assertEqual(verbnet.longid('77'), 'approve-77')
    self.assertEqual(verbnet.lemmas()[0:10],
                     ['December', 'FedEx', 'UPS', 'abandon', 'abase',
                      'abash', 'abate', 'abbreviate', 'abduct', 'abet'])

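# Hedged aside (not part of the original test): the literal lists above are
# pinned to one VerbNet release, and slicing verbnet.lemmas()[0:10] bakes in
# corpus ordering. A sketch of a version-tolerant check, reusing only values
# the original test already asserts; the test name is hypothetical:
def test_verbnet_contains_expected_entries(self):
    self.assertIn('approve-77', verbnet.classids('accept'))
    self.assertIn('abandon', verbnet.lemmas())
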
def verbs():
    """Return a list of verbs from verbnet that can be used to define a set of activities"""
    try_import('nltk')
    import nltk
    nltkdir = (remkdir(os.path.join(os.environ['VIPY_CACHE'], 'nltk'))
               if 'VIPY_CACHE' in os.environ else tempfile.gettempdir())
    os.environ['NLTK_DATA'] = nltkdir
    print('[vipy.annotation.verbs]: Downloading verbnet to "%s"' % nltkdir)
    nltk.download('verbnet', nltkdir)  # download into the directory NLTK_DATA points at
    from nltk.corpus import verbnet
    return verbnet.lemmas()

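# Hedged aside (not from the source above): nltk.download() typically skips
# re-fetching an installed corpus but may still contact the server for its
# index, so probing locally first avoids the round-trip. A minimal sketch
# using nltk.data.find; the helper name ensure_verbnet is hypothetical:
import nltk


def ensure_verbnet(download_dir):
    try:
        nltk.data.find('corpora/verbnet')  # raises LookupError if absent
    except LookupError:
        nltk.download('verbnet', download_dir)
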
def filterOutVerbs(self, A):
    '''
    Return the words that:
    1 - appear in VerbNet's verb list
    2 - are not stopwords
    '''
    L = A[:]
    lemma = WordNetLemmatizer()
    L = [lemma.lemmatize(i, pos='v') for i in L]
    verbs = set(verbnet.lemmas())  # hoisted so the list isn't rebuilt per word
    L = [i for i in L if i in verbs and i not in stopWords]
    L = list(set(L))
    return L

def get_all_infinitives():
    import docassemble_pattern.en as pattern
    import nltk
    from nltk.corpus import wordnet as wn
    try:
        wn.all_synsets  # touching the lazy loader raises LookupError if the corpus is missing
    except LookupError:
        print("Downloading wordnet...")
        nltk.download("wordnet")
    from nltk.corpus import verbnet
    try:
        verbnet.lemmas
    except LookupError:
        print("Downloading verbnet...")
        nltk.download("verbnet")

    # from pattern - 8.5K verbs
    all_pattern_verbs = []
    """
    The original pattern package has bugs in its verbs.infinitives class.
    For this reason, the annotation pipeline didn't use its verb seed.
    Here we install pattern from the docassemble_pattern project, where the
    bug is fixed. If you want consistency with the original QANom dataset,
    comment the next line back out; if you only care about better coverage,
    leave it as-is:
    """
    all_pattern_verbs = list(pattern.verbs.infinitives)

    # from wordnet - 8.7K verbs
    all_wn_verbs = sorted(set(l.name()
                              for v_syn in wn.all_synsets(pos="v")
                              for l in v_syn.lemmas()
                              if "_" not in l.name()))
    # from verbnet - 3.6K verbs
    all_verbnet_verbs = list(verbnet.lemmas())
    # together - 10.6K verbs
    infinitives = sorted(set(all_wn_verbs + all_pattern_verbs + all_verbnet_verbs))
    return infinitives

def get_all(self):
    '''
    Return the intent, action_on and attributes of the sentence
    '''
    chunkGramForNounVerb = r'''NV: {<VB.?>+<.*>*<NN.?>+ | <NN.?>+<.*>*<VB.?>+}   # chunking noun-verbs
                               }<[^(VB|NN)].*|VBZ|>{   # chinking all except POS starting with VB or NN, and VBZ'''
    chunkGramForOnlyVerb = r'''VR: {<VB.?>}
                               }<VBP|VBZ>+{'''
    tree = self.get_POS_tree(chunkGramForNounVerb)
    obj = []
    series = []
    L = []
    for subtree in tree.subtrees(filter=lambda t: t.label() == 'NV'):
        if len(subtree.leaves()) > 1:
            series = subtree.leaves()
        obj = obj + [s for s in subtree.leaves()]
    # works below if the tree produced some noun-verb pairs as subtrees
    if len(series) and (self.intent is False or self.action_upon is False):
        verbs = [w for w, pos in series if re.match(r'VB.?', pos) is not None]
        nouns = [w for w, pos in series if re.match(r'NN.?', pos) is not None]
        if len(verbs):
            L = self.filterOutVerbs(verbs)
            # discard the unwanted verbs
            L = [i for i in L if i not in discardedVerbs]
            if len(L):
                self.intent = L
        if len(nouns):
            lemma = WordNetLemmatizer()
            # keep only nouns that do not also act as a verb
            verbnet_lemmas = set(verbnet.lemmas())
            L = [i for i in nouns
                 if i not in stopWords
                 and lemma.lemmatize(i, pos='v') not in verbnet_lemmas]
            if len(L):
                self.action_upon = L
    # works below if intent or action_upon has not been found yet
    if len(obj) and (self.intent is False or self.action_upon is False):
        verbs = [w for w, pos in obj if re.match(r'VB.?', pos) is not None]
        nouns = [w for w, pos in obj if re.match(r'NN.?', pos) is not None]
        if len(verbs) and self.intent is False:
            L = self.filterOutVerbs(verbs)
            # again, remove the unwanted verbs
            L = [i for i in L if i not in discardedVerbs]
            if len(L):
                self.intent = L
        if len(nouns) and self.intent is False:
            # filter out nouns which are really verbs
            L = self.filterOutVerbs(nouns)
            if len(L):
                # record nouns that are verbs and now serve as intent
                for i in L:
                    self.discardedNouns.append(i)
                self.intent = L
        if len(nouns) and self.action_upon is False:
            self.action_upon = nouns
    # if no intent has been found yet, fall back to verb-only chunks
    if self.intent is False:
        tree = self.get_POS_tree(chunkGramForOnlyVerb)
        verbs = []
        for subtree in tree.subtrees(filter=lambda t: t.label() == 'VR'):
            verbs = verbs + [s for s in subtree.leaves()]
        if len(verbs):
            L = [w for w, pos in verbs]
            L = self.filterOutVerbs(L)
            if len(L):
                self.intent = L
    if self.action_upon is not False:
        # discard nouns that are already present as intent
        self.action_upon = [i for i in self.action_upon
                            if i not in self.discardedNouns]
    L = self.get_attributes()
    if len(L):
        self.attributes = L
    if self.intent is False or len(self.intent) == 0:
        self.intent = ['No Intent Found']
    if self.action_upon is False:
        self.action_upon = ['No Action Found']
    if self.attributes is False:
        self.attributes = ['No Attribute(s) Found']
    return {'intent': ' , '.join(self.intent),
            'action_on': ' , '.join(self.action_upon),
            'attributes': ' , '.join(self.attributes)}

all_pattern_verbs = []
"""
The original pattern package has bugs in its verbs.infinitives class.
For this reason, the annotation pipeline didn't use its verb seed.
Here we install pattern from git, where the bug is fixed.
If you want consistency with the original QANom dataset, leave the next
line commented out. If you only care about better coverage, uncomment it:
"""
# all_pattern_verbs = list(pattern.verbs.infinitives)

# from wordnet - 8.7K verbs
all_wn_verbs = sorted(set(l.name()
                          for v_syn in wn.all_synsets(pos="v")
                          for l in v_syn.lemmas()
                          if "_" not in l.name()))
# from verbnet - 3.6K verbs
all_verbnet_verbs = list(verbnet.lemmas())
# together - 10.6K verbs
infinitives = sorted(set(all_wn_verbs + all_pattern_verbs + all_verbnet_verbs))


def as_gerund(verb):
    try:
        # pattern's lazy-loaders have a bug (first call can fail), so retry once
        v_prog = pattern.conjugate(verb, aspect=pattern.PROGRESSIVE)
    except Exception:
        v_prog = pattern.conjugate(verb, aspect=pattern.PROGRESSIVE)
    return v_prog


class SuffixBasedNominalizationCandidates:
    def __init__(self, verb_seed=infinitives):

generalThing = datum.thing
verbnetRoot = generalThing.get("verbnet")
wordnetRoot = generalThing.find("wordnet")
class_ = verbnetRoot.get("class")
verbclassID = verbnetRoot.get("verb class id")
verbroot = verbnetRoot.get("verbroot")
example = verbnetRoot.get("example")
semantics = verbnetRoot.get("semantics")
syntax = verbnetRoot.get("syntax")
verbclass_ = verbnetRoot.get("verb class")
description = verbnetRoot.get("description")
semanticsArguments = verbnetRoot.get("semantics argument")
syntaxArguments = verbnetRoot.get("syntax argument")
syntaxFramesKatum = verbnetRoot.get("syntactic argument")
semanticsFramesKatum = verbnetRoot.get("semantics predicate")
predicateValue = verbnetRoot.get("predicate value")
themroles = verbnetRoot.get("thematic role")
roleType = verbnetRoot.get("role")

listOfAllLemmas = vn.lemmas()
uniqueClassIDs = []
for lemma in listOfAllLemmas:
    uniqueClassIDs.extend(vn.classids(lemma))
uniqueClassIDs = list(set(uniqueClassIDs))
processClassID(uniqueClassIDs)
for v in vn.lemmas():
    verbRootInstance = verbroot.get(v)
    for verbclass in vn.classids(v):
        verbRootInstance._is(classToKatumDict[verbclass], False)
generalThing.save('wordnet-verbnet.datum')

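# Hedged aside (not from the source above): if only the set of class IDs is
# needed, NLTK's reader can list them directly; classids() with no arguments
# should return every VerbNet class ID, which would make the per-lemma
# accumulation loop above unnecessary:
from nltk.corpus import verbnet as vn

uniqueClassIDs = vn.classids()  # all class and subclass IDs in one call
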
    if loops == 0:
        sent = sent.replace("loops", "")
    elif loops == 1:
        sent = sent.replace("loops", "loops-1")
    sent = sent.replace("object", "relation")
    return " ".join(sent.split())


grammar = {
    # should be pluralizable
    # should be able to say "<relation-singular> of the guy"
    "relation-singular": relations,
    "relation": ["relation-singular", "relation-plural"],
    # should be able to say "they <verb-plural> me"
    "verb-plural": vn.lemmas("admire-31.2") + vn.lemmas("amuse-31.1"),
    "verb": ["verb-singular", "verb-plural"],
    "time": time,
    "beginning": ["time the", "the"],
    "loops": ["", "relation of the loops"],
    "loops-1": ["relation of the loops"],
}
grammar["relation-plural"] = [pluralize(relation)
                              for relation in grammar["relation-singular"]]
# pluralize() on a base verb yields its third-person-singular form
# ("admire" -> "admires"), which is exactly the singular verb form needed here
grammar["verb-singular"] = [pluralize(verb) for verb in grammar["verb-plural"]]


def generate(tpl):
    toks = []

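# Hedged aside (not from the source above): vn.lemmas("admire-31.2") pulls
# the member verbs of a single VerbNet class, so "verb-plural" is seeded
# with psych verbs; the class should contain 'admire' itself:
assert 'admire' in vn.lemmas('admire-31.2')
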
maxCount = 0
avg = total / l
# print bagOfWords["elizabeth"]
# print avg
allverbs = []
# Creating training set
fr = open(extfile, 'r')
for line in fr:
    token = line.strip("\n")
    extList[token] = avg
    words = verbnet.classids(token)
    for w in words:
        finalWord = w.decode("UTF-8", "ignore")
        allverbs += verbnet.lemmas(finalWord)
for v in allverbs:
    extList[v] = avg / 2
# print len(extList)
allverbs = []
fr = open(intfile, 'r')
for line in fr:
    token = line.strip("\n")
    intList[token] = avg
    words = verbnet.classids(token)
    for w in words:
        finalWord = w.decode("UTF-8", "ignore")
        allverbs += verbnet.lemmas(finalWord)

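# Hedged aside (not from the source above): the nested loops expand each seed
# token to every lemma sharing one of its VerbNet classes. The same expansion
# as a helper built only on the real classids()/lemmas() API; the name
# class_siblings is hypothetical:
from nltk.corpus import verbnet


def class_siblings(token):
    """Return lemmas sharing at least one VerbNet class with token."""
    siblings = []
    for classid in verbnet.classids(token):
        siblings += verbnet.lemmas(classid)
    return siblings
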
#!/usr/bin/python
# -*- coding: utf-8 -*-
from nltk.corpus import verbnet

verbnet.lemmas()[20:25]
verbnet.classids()[:5]
verbnet.classids('accept')
verbnet.vnclass('remove-10.1')  # doctest: +ELLIPSIS
verbnet.vnclass('10.1')  # doctest: +ELLIPSIS

vn_31_2 = verbnet.vnclass('admire-31.2')
for themrole in vn_31_2.findall('THEMROLES/THEMROLE'):
    print(themrole.attrib['type'])
    for selrestr in themrole.findall('SELRESTRS/SELRESTR'):
        print('[%(Value)s%(type)s]' % selrestr.attrib)
    print()

print(verbnet.pprint('57'))

print(twitter_samples.tokenized('tweets.20150430-223406.json'))

# rte
# nltk.download('rte')
print(rte.fileids())  # doctest: +ELLIPSIS
rtepairs = rte.pairs(['rte2_test.xml', 'rte3_test.xml'])
print(rtepairs)  # doctest: +ELLIPSIS
print(rtepairs[5])
print(rtepairs[5].text)  # doctest: +NORMALIZE_WHITESPACE
print(rtepairs[5].hyp)
print(rtepairs[5].value)
xmltree = rte.xml('rte3_dev.xml')
print(xmltree)  # doctest: +SKIP
print(xmltree[7].findtext('t'))  # doctest: +NORMALIZE_WHITESPACE

# verbnet
# nltk.download('verbnet')
print(verbnet.lemmas()[20:25])
print(verbnet.classids()[:5])
print(verbnet.classids('accept'))
print(verbnet.vnclass('remove-10.1'))  # doctest: +ELLIPSIS
print(verbnet.vnclass('10.1'))  # doctest: +ELLIPSIS
vn_31_2 = verbnet.vnclass('admire-31.2')
for themrole in vn_31_2.findall('THEMROLES/THEMROLE'):
    print(themrole.attrib['type'])
    for selrestr in themrole.findall('SELRESTRS/SELRESTR'):
        print('[%(Value)s%(type)s]' % selrestr.attrib)
    print()
print(verbnet.pprint('57'))

# nps_chat
# nltk.download('nps_chat')
print(nltk.corpus.nps_chat.words())
print(nltk.corpus.nps_chat.tagged_words())