def initialize_lookup(self):
    if self.initialized:
        return
    try:
        fn.frames_by_lemma("dog")
    except LookupError:
        nltk_download(self.config, "framenet_v17")
    self.initialized = True
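# A minimal standalone sketch of the same probe-and-download pattern used
# above, assuming only nltk itself; "dog" is an arbitrary probe lemma and
# "framenet_v17" is the standard downloader id for the FrameNet 1.7 data.
import nltk
from nltk.corpus import framenet as fn

def ensure_framenet_data():
    try:
        fn.frames_by_lemma("dog")
    except LookupError:
        nltk.download("framenet_v17")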
def get_lemma_frames(lemm):
    print(lemm)
    fs = fn.frames_by_lemma(lemm)
    for f in fs:
        # Lexical-unit names look like "dog.n"; compare on the bare lemma part.
        lunits = [lunit.split('.')[0] for lunit in f['lexUnit']]
        print(lemm in lunits)
    return fs  # avoid a second, redundant lookup
def get_frames_by_trans(self, trans):
    if nltk:
        lemma = trans
        frames = fn.frames_by_lemma(lemma)
        if len(frames) == 0:
            # Retry with a case-insensitive pattern.
            lemma = r'(?i)' + str(trans)
            frames = fn.frames_by_lemma(lemma)
        frames = [i.name for i in frames]
        return frames
    else:
        print('please install nltk FrameNet first. '
              'refer: http://www.nltk.org/howto/framenet.html')
        return False
def framenet_frames_all(self, tweet):
    """
    Count the total number of invoked frames in FrameNet for every token
    and every possible tag according to Wiktionary; looks up frames and
    counts them.

    >>> f = Featurizer()
    >>> f.init_wiktionary("wiktionary/en.tags.li")
    >>> f.framenet_frames_all("the little guy")
    >>> f.printFeatures()
    """
    from nltk.corpus import framenet as fn
    if not self.wiktionary:
        print("call init_wiktionary before using framenet", file=sys.stderr)
        raise RuntimeError("init_wiktionary needed before calling this method")
    words = tweet
    if self.lowercase:
        words = words.lower()
    if self.remove_stopwords:
        words = self.DELIM.join(
            w for w in words.split(self.DELIM) if w not in ENGLISH_STOP_WORDS)
    for token in words.split(self.DELIM):
        for tag in self.wiktionary[token]:
            token_tag = token + "." + tag[0].lower()  # initial letter of the UPOS tag
            print(token_tag)
            frames = fn.frames_by_lemma(r'(?i)\b{}\b'.format(token_tag))
            for frame in frames:
                f = self.PREFIX_FRAMENET + frame['name']
                self.d[f] = self.d.get(f, 0) + 1
def parGetFrame(lemma):
    # Query FrameNet for the given lemma rather than a hard-coded one.
    frame = fn.frames_by_lemma(lemma)
    if frame != []:
        f = frame.pop()
        return 'asdf'     # placeholder return values from the stub
    else:
        return 'asdfasd'  # placeholder
def lookup_with_POS(self, candidate):
    word, word_pos = candidate
    if word_pos in self.pos_tag_mapping:
        word += "." + self.pos_tag_mapping[word_pos]
    frames = fn.frames_by_lemma(word)
    if not frames:
        return None
    return self.disambiguate(frames, candidate, override=defs.disam.first)
def find_frames(word):
    # Return the names of all frames whose lexical units match `word`.
    frames_list = []
    fn_results = fn.frames_by_lemma(word)
    for item in fn_results:
        frames_list.append(item.name)
    return frames_list
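# Usage sketch for find_frames above (assumes the FrameNet data is
# installed). frames_by_lemma treats its argument as a regex over
# lexical-unit names, so an unanchored 'run' also matches lemmas that
# merely contain it; \b anchors narrow the match.
print(find_frames(r'(?i)\brun\b'))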
def word_similarity(word):
    # True if any frame evoked by `word` is one of a fixed set of frame IDs.
    match_found = False
    fn_results = fn.frames_by_lemma(word)
    if fn_results:
        for item in fn_results:
            if item.ID in (54, 61, 57, 690, 7):
                match_found = True
    return match_found
def verb_to_frames(verb):
    # Map each WordNet verb synset of `verb` to the FrameNet frame IDs
    # evoked by its lemmas.
    syn_frame_dict = {}
    for syn in wn.synsets(verb, wn.VERB):
        frames = []
        for lem in syn.lemma_names():
            # Just collect frame IDs, matched case-insensitively.
            fids = [frame.ID for frame in fn.frames_by_lemma(r'(?i)' + lem)]
            frames.extend(fids)
        syn_frame_dict[syn] = frames
    return syn_frame_dict
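# A small driver for verb_to_frames above; exact frame IDs depend on the
# FrameNet release, so treat the output as illustrative only.
from nltk.corpus import wordnet as wn
from nltk.corpus import framenet as fn

for syn, fids in verb_to_frames('travel').items():
    print(syn.name(), sorted(set(fids)))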
def invoke_frame(token: str):
    word = token.lower()
    # Collect lexical units whose lemma part equals the token.
    lu_list = [(i.name, i.definition) for i in fn.lus()]
    lu_temp = set(i for i in lu_list if word == i[0].split('.')[0])
    frames = []
    for lu, def_ in lu_temp:
        fr = fn.frames_by_lemma(r'(?i)' + lu)
        if fr and fr[0] not in frames:
            frames.append(fr[0])
    return frames
def lookup_(self, candidate):
    # http://www.nltk.org/howto/framenet.html
    word = candidate
    # In FrameNet, POS disambiguation is done via the lookup itself.
    if self.disambiguation == defs.disam.pos:
        frames = self.lookup_with_POS(candidate)
    else:
        frames = fn.frames_by_lemma(word)
        if not frames:
            return None
        frames = self.disambiguate(frames, candidate)
    if not frames:
        return None
    activations = {x.name: 1 for x in frames}
    if self.do_spread_activation:
        parent_activations = self.spread_activation(frames, self.spread_steps, 1)
        activations = {**activations, **parent_activations}
    return activations
def find_frames(lemma, pos_tag):
    # Map the POS tag to FrameNet's coarse tag and build the lookup key.
    simple_tag = SIMPLE_TAGS.get(pos_tag)
    key = lemma + '.' + simple_tag if simple_tag else lemma
    if key in FRAME_CACHE:
        return FRAME_CACHE[key]
    frames = []
    try:
        frames = fn.frames_by_lemma(key)
        if frames:
            FRAME_CACHE[key] = frames  # cache the whole result list
    except Exception:
        pass
    return frames
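# A lighter-weight alternative to the manual FRAME_CACHE above: memoize the
# expensive lookup itself. A sketch; lru_cache only requires the key to be
# hashable, which lemma strings are.
from functools import lru_cache
from nltk.corpus import framenet as fn

@lru_cache(maxsize=None)
def cached_frames_by_lemma(key):
    return fn.frames_by_lemma(key)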
def demo():
    from pprint import pprint
    from nltk.corpus import framenet as fn

    # It is not necessary to explicitly build the indexes by calling
    # buildindexes(). We do this here just for demo purposes. If the
    # indexes are not built explicitly, they will be built as needed.
    print('Building the indexes...')
    fn.buildindexes()

    # Get some statistics about the corpus.
    print('Number of Frames:', len(fn.frames()))
    print('Number of Lexical Units:', len(fn.lexical_units()))
    print('Number of annotated documents:', len(fn.documents()))
    print()

    # Frames
    print('getting frames whose name matches the (case insensitive) regex: "(?i)medical"')
    medframes = fn.frames(r'(?i)medical')
    print('Found {0} Frames whose name matches "(?i)medical":'.format(len(medframes)))
    print([(f.name, f.ID) for f in medframes])

    # Store the first frame in the list of frames.
    tmp_id = medframes[0].ID
    m_frame = fn.frame(tmp_id)  # reads all info for the frame

    # Get the frame relations.
    print('\nNumber of frame relations for the "{0}" ({1}) frame:'.format(
        m_frame.name, m_frame.ID), len(m_frame.frameRelation))
    for fr in m_frame.frameRelation:
        print('  ', fr.type + ":", fr.relatedFrame)

    # Get the names of the Frame Elements.
    print('\nNumber of Frame Elements in the "{0}" frame:'.format(m_frame.name),
          len(m_frame.FE))
    print('  ', [x.name for x in m_frame.FE])

    # Get the names of the "Core" Frame Elements.
    print('\nThe "core" Frame Elements in the "{0}" frame:'.format(m_frame.name))
    print('  ', [x.name for x in m_frame.FE if x.coreType == "Core"])

    # Get all of the Lexical Units that are incorporated in the
    # 'Ailment' FE of the 'Medical_conditions' frame (id=239).
    print('\nAll Lexical Units that are incorporated in the "Ailment" FE:')
    m_frame = fn.frame(239)
    ailment_lus = [x for x in m_frame.lexUnit if x.incorporatedFE == 'Ailment']
    print([x.name for x in ailment_lus])

    # Get all of the Lexical Units for the frame.
    print('\nNumber of Lexical Units in the "{0}" frame:'.format(m_frame.name),
          len(m_frame.lexUnit))
    print('  ', [x.name for x in m_frame.lexUnit[:5]], '...')

    # Get basic info on the second LU in the frame.
    tmp_id = m_frame.lexUnit[1].ID  # grab the id of the second LU
    luinfo = fn.lu_basic(tmp_id)    # get basic info on the LU
    print('\nInformation on the LU: {0}'.format(luinfo.name))
    pprint(luinfo)

    # Get a list of all of the corpora used for fulltext annotation.
    print('\nNames of all of the corpora used for fulltext annotation:')
    allcorpora = set(x.corpname for x in fn.documents())
    pprint(list(allcorpora))

    # Get the names of the annotated documents in the first corpus.
    firstcorp = list(allcorpora)[0]
    firstcorp_docs = fn.documents(firstcorp)
    print('\nNames of the annotated documents in the "{0}" corpus:'.format(firstcorp))
    pprint([x.filename for x in firstcorp_docs])

    # Search for frames containing LUs whose name attribute matches a
    # regexp pattern.
    #
    # Note: if you were going to be doing a lot of this type of
    # searching, you'd want to build an index that maps from
    # lemmas to frames because each time frames_by_lemma() is
    # called, it has to search through ALL of the frame XML files
    # in the db.
    print('\nSearching for all Frames that have a lemma that matches the regexp: "^run.v$":')
    pprint(fn.frames_by_lemma(r'^run.v$'))
def frame_idname_list(lemma):
    # Return ((ID, name), ...) pairs for every frame matching `lemma`.
    frames = fn.frames_by_lemma(lemma)
    return tuple((f["ID"], f["name"]) for f in frames)
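# Usage sketch for frame_idname_list above; a case-insensitive, word-bounded
# pattern keeps the regex from matching longer lemmas.
print(frame_idname_list(r'(?i)\bmedical\b'))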
def travel_verb_instances(data):
    travel = data["Travel verbs"].str.cat(sep=',')
    travel_verbs = convert_to_base_verb(travel.split(","))
    travel_verb_dict = dict(Counter(travel_verbs))

    verbs, counts = [], []
    is_verbs_in_vn, is_verbs_in_fn = [], []
    is_verbs_in_both, is_verbs_not_in_both = [], []
    fn_frames, is_synonyms_available, synonyms = [], [], []
    is_any_syns_in_net, syns_in_vn, syns_in_fn = [], [], []

    for verb in travel_verb_dict:
        verbs.append(verb)
        counts.append(travel_verb_dict[verb])
        verb = verb.lower()

        fn_frame = []
        syn_list = []
        syn_in_vn, syn_not_in_vn = [], []
        syn_in_fn, syn_not_in_fn = [], []
        is_any_syn_in_net = 0
        is_synonyms = 0

        is_in_vn = verbs_in_verbnet(verb)
        is_in_fn = verbs_in_framenet(verb)
        is_not_in_both = 1 if not is_in_vn and not is_in_fn else 0
        is_in_both = 1 if is_in_vn and is_in_fn else 0

        if is_in_fn:
            # Collect the names of all frames evoked by this verb.
            for item in fn.frames_by_lemma(verb):
                fn_frame.append(item.name)

        if is_not_in_both:
            # Fall back to synonyms when the verb is in neither resource.
            syn_list = find_synonyms(verb)
            is_synonyms = 1 if len(syn_list) > 0 else 0
            for syn in syn_list:
                is_syn_in_vn = verbs_in_verbnet(syn)
                is_syn_in_fn = verbs_in_framenet(syn)
                (syn_in_vn if is_syn_in_vn else syn_not_in_vn).append(syn)
                (syn_in_fn if is_syn_in_fn else syn_not_in_fn).append(syn)
                if is_syn_in_vn or is_syn_in_fn:
                    is_any_syn_in_net += 1

        is_verbs_in_vn.append(is_in_vn)
        is_verbs_in_fn.append(is_in_fn)
        is_verbs_in_both.append(is_in_both)
        is_verbs_not_in_both.append(is_not_in_both)
        fn_frames.append(",".join(fn_frame))
        is_synonyms_available.append(is_synonyms)
        synonyms.append(",".join(syn_list))
        is_any_syns_in_net.append(is_any_syn_in_net)
        syns_in_vn.append(",".join(syn_in_vn))
        syns_in_fn.append(",".join(syn_in_fn))

    df = pd.DataFrame(np.column_stack([
        verbs, counts, is_verbs_in_vn, is_verbs_in_fn, is_verbs_in_both,
        is_verbs_not_in_both, fn_frames, is_synonyms_available, synonyms,
        is_any_syns_in_net, syns_in_vn, syns_in_fn
    ]), columns=[
        'Verb', 'Count', 'Is in VerbNet', 'Is in FrameNet', 'Is in both',
        'Is not in both', 'FN Frames', 'Is Synonyms', 'Synonyms',
        'Is any syn in Net', 'Synonyms in VN', 'Synonyms in FN'
    ])
    return df
def verbs_in_framenet(verb):
    # 1 if FrameNet has at least one frame for this lemma, else 0.
    fn_results = fn.frames_by_lemma(verb)
    return 1 if fn_results else 0
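# Quick check of the indicator above (assumes the corpus data is installed):
# prints 1 for lemmas FrameNet covers and 0 otherwise.
for v in ('run', 'xyzzy'):
    print(v, verbs_in_framenet(v))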
max_count = 0
max_synset = 0
for i, s in enumerate(synsets):
    # Pick the synset whose lemmas have the highest total corpus count.
    freq = sum(lemma.count() for lemma in s.lemmas())
    if freq > max_count:
        max_count = freq
        max_synset = i
synset_dict[word] = [max_synset]

if first_fn_match and use_framenet:
    for word in synset_dict.keys():
        frames = fn.frames_by_lemma(word)
        if len(frames) > 0:
            # Keep only the first matching frame's name.
            frame_dict[word] = [frames[0].name]

###############################
wordcount_filename = 'WordNet-InfoContent-3.0/ic-brown-add1.dat'
with open(wordcount_filename) as f:
    lines = f.readlines()
word_id_dict = {}
for line in lines[1:]:
def nvConflict(path, content):
    all_words = []
    model_2 = Word2Vec([content], size=100, window=5, min_count=1, workers=4)
    # Read every file from the full-text annotation folder (or other
    # annotation data folder) as a tree structure using xml.etree.ElementTree.
    for filename in os.listdir(path):
        if not filename.endswith('.xml'):
            continue
        fullname = os.path.join(path, filename)
        tree = ET.parse(fullname).getroot()
        tree = ET.fromstring(tostring(tree).lower())
        full_sentence = []
        new_list = []
        final_result_list = []
        # Read each sentence of each annotation file.
        for sentence in tree:
            each_sentence = [filename, "This following is a new sentence: ",
                             sentence[0].text]
            each_nvpair = []
            result_list = []
            for annot in sentence.iter():
                # annot.attrib, e.g. <annotationSet ... luName="people.n"
                # frameID="304" frameName="People" status="MANUAL" ID="6558815">
                for x, y in annot.attrib.items():
                    # Noun tags; see the Penn Treebank tag set for details:
                    # http://www.surdeanu.info/mihai/teaching/ista555-fall13/readings/PennTreebankConstituents.html
                    if y in ('nn', 'nns', 'nnp', 'nnps'):
                        print('--------------')
                        print('This sentence has the following noun:')
                        start = int(annot.attrib.get('start'))
                        end = int(annot.attrib.get('end')) + 1
                        clause_1 = sentence[0].text[start:end]
                        print(clause_1)
                        each_sentence.append(clause_1)
                        each_sentence.append('This is the type of noun:')
                        each_sentence.append(y)
                        print(y)
                        fm1 = fn.frames_by_lemma(clause_1)
                        print(fm1)
                        each_sentence.append('This lemma evoked the following frame:')
                        each_sentence.append(fm1)
                        new_list.append(clause_1)
                        each_nvpair.append(clause_1)
                        all_words.append(clause_1)
                        result_list.append(clause_1)
                    # Verb tags.
                    elif y in ('vvd', 'vb', 'vbd', 'vbg', 'vbn', 'vbp', 'vbz'):
                        print('--------------')
                        print(sentence[0].text)
                        print('This sentence has the following verb:')
                        start = int(annot.attrib.get('start'))
                        print(start)
                        end = int(annot.attrib.get('end')) + 1
                        print(sentence[0].text[start:end])
                        clause_2 = sentence[0].text[start:end]
                        each_sentence.append(clause_2)
                        each_sentence.append('This is the type of verb:')
                        each_sentence.append(y)
                        print(y)
                        each_sentence.append('This lemma evoked the following frame:')
                        fm2 = fn.frames_by_lemma(clause_2)
                        print(fm2)
                        each_sentence.append(fm2)
                        new_list.append(clause_2)
                        each_nvpair.append(clause_2)
                        all_words.append(clause_2)
                        result_list.append(clause_2)
            full_sentence.append(each_sentence)
            full_sentence.append("\n")
            try:
                sim = []
                avg = 0
                n_pairs = 0
                for i in range(len(result_list)):
                    for j in range(i + 1, len(result_list)):
                        print(result_list[i])
                        print(result_list[j])
                        score = model_2.similarity(result_list[i], result_list[j])
                        print(score)
                        sim.extend([result_list[i], result_list[j], score])
                        avg += score
                        n_pairs += 1
                # Average over the number of pairs actually compared.
                avg = avg / n_pairs
                print(avg)
                result_list.append(sim)
                result_list.append(sentence[0].text)
                result_list.append(avg)
                final_result_list.append(result_list)
                final_result_list.append("\n")
            except Exception:
                pass
        with open('/Users/mac/Desktop/final_testing_3', 'w') as file_handler:
            try:
                for item in final_result_list:
                    file_handler.write("{}\n".format(item))
            except Exception:
                pass
# May erratically truncate extremely long sentence strings!
try:
    sentences = str(st.tag_text(sentence)).replace("</sentence>\n", "</sentence>|")
    for sentence in sentences.split("|"):
        if sentence.strip() == '':
            continue
        for tree in fromstring(sentence):
            lemma = tree.items()[2][1].lower()
            # Lemmas to skip (use re.match)
            if lemma == "that" or lemma == "this":
                continue
            try:
                # Query FrameNet -- frame names
                frames = fn.frames_by_lemma(lemma)
                framenames = ""
                for frame in frames:
                    # Cutoff point
                    if len(frames) > numframes:
                        continue
                    framenames = "".join([framenames, "|", frame.name])
                if framenames != "":
                    print("".join([field[0], "|", field[1], "|FRM_01|", lemma, framenames]))
                # Core Frame Elements
                for frame in frames:
                    if len(frames) > numframes:
                        continue
                    ID = frame.ID
                    framecores = ""
                    cores = [(fename, fe.ID) for fename, fe in fn.frame(ID).FE.items()
                             if fe.coreType == 'Core']
                    for core in cores:
                        framecores = "".join([framecores, "|", core[0]])
word1 = "melt" word2 = "oxidize" input = word1 vn_results = vn.classids(lemma=input) if not vn_results: print(input + ' not in verbnet.') else: print('verbnet:') for ele in vn_results: print(ele) print("") fn_results = fn.frames_by_lemma(input) if not fn_results: print(input + ' not in framenet.') else: print('framenet:') for ele in fn_results: print(ele) print("") pb_results = [] try: pb_results = pb.rolesets(input) except ValueError: print(input + ' not in propbank.')
def nvConflict(path):
    # Read every file from the full-text annotation folder (or other
    # annotation data folder) as a tree structure using xml.etree.ElementTree.
    for filename in os.listdir(path):
        if not filename.endswith('.xml'):
            continue
        fullname = os.path.join(path, filename)
        tree = ET.parse(fullname).getroot()
        tree = ET.fromstring(tostring(tree).lower())
        full_sentence = []
        # Read each sentence of each annotation file.
        for sentence in tree:
            each_sentence = [filename, "This following is a new sentence: ",
                             sentence[0].text]
            for annot in sentence.iter():
                # annot.attrib, e.g. <annotationSet ... luName="people.n"
                # frameID="304" frameName="People" status="MANUAL" ID="6558815">
                for x, y in annot.attrib.items():
                    # Noun tags; see the Penn Treebank tag set for details:
                    # http://www.surdeanu.info/mihai/teaching/ista555-fall13/readings/PennTreebankConstituents.html
                    if y in ('nn', 'nns', 'nnp', 'nnps'):
                        print('--------------')
                        print('This sentence has the following noun:')
                        start = int(annot.attrib.get('start'))
                        end = int(annot.attrib.get('end')) + 1
                        clause_1 = sentence[0].text[start:end]
                        print(clause_1)
                        each_sentence.append(clause_1)
                        each_sentence.append('This is the type of noun:')
                        each_sentence.append(y)
                        print(y)
                        fm1 = fn.frames_by_lemma(clause_1)
                        print(fm1)
                        each_sentence.append('This lemma evoked the following frame:')
                        each_sentence.append(fm1)
                    # Verb tags.
                    elif y in ('vvd', 'vb', 'vbd', 'vbg', 'vbn', 'vbp', 'vbz'):
                        print('--------------')
                        print(sentence[0].text)
                        print('This sentence has the following verb:')
                        start = int(annot.attrib.get('start'))
                        print(start)
                        end = int(annot.attrib.get('end')) + 1
                        print(sentence[0].text[start:end])
                        clause_2 = sentence[0].text[start:end]
                        each_sentence.append(clause_2)
                        each_sentence.append('This is the type of verb:')
                        each_sentence.append(y)
                        print(y)
                        each_sentence.append('This lemma evoked the following frame:')
                        fm2 = fn.frames_by_lemma(clause_2)
                        print(fm2)
                        each_sentence.append(fm2)
            full_sentence.append(each_sentence)
            full_sentence.append("\n")
        print(full_sentence)
        # Write one output file per input file, and close it properly.
        with open('/Users/mac/Desktop/find_nv_conflict/' + filename, "w") as out:
            out.write(str(full_sentence))
def getFrameLemma(self, lemma):
    # Look the lemma up as a verb ("<lemma>.v") and return the first frame, if any.
    frames = fn.frames_by_lemma(lemma + '.v')
    if len(frames) > 0:
        return frames[0]
    return None
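# frames_by_lemma interprets its argument as a regex, so the '.' in
# lemma + '.v' above matches any character. A stricter verb-only variant,
# as a sketch:
import re
from nltk.corpus import framenet as fn

def get_verb_frame(lemma):
    frames = fn.frames_by_lemma(re.escape(lemma) + r'\.v')
    return frames[0] if frames else None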
def lookup(self, word):
    frames = fn.frames_by_lemma(word)
    return [f['name'] for f in frames]
f = fn.frames(r'(?i)perception')
len(fn.frames())
f = fn.frame(66)
f.ID
f.definition
set(f.lexUnit.keys())
[x.name for x in f.FE]
f.frameRelations
fn.frames_by_lemma(r'(?i)a little')
fn.lu(256).name
fn.lu(256).definition
fn.lu(256).frame
fn.lu(256).lexeme
docs = fn.documents()
len(docs)
docs[0].keys()
docs[0].filename
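# The session above follows the NLTK FrameNet HOWTO. Anchoring matters when
# calling frames_by_lemma, since the pattern is searched within every
# lexical-unit name (which is also why the demo earlier recommends building
# your own lemma-to-frame index for bulk lookups):
fn.frames_by_lemma(r'^run.v$')       # exact LU name
fn.frames_by_lemma(r'(?i)a little')  # case-insensitive substring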
def process(text='', lang='en', coreferences=False, constituents=False,
            dependencies=False, expressions=False, **kwargs) -> OrderedDict:
    # Build NLP-JSON.
    j: OrderedDict = get_base()
    j['meta']['DC.language'] = lang
    d: OrderedDict = get_base_document(1)
    j['documents'].append(d)
    d['meta']['DC.source'] = 'NLTK {}'.format(nltk_version)
    d['text'] = text

    # Collect parsers.
    lemmatizer = get_lemmatizer()
    stemmer = get_stemmer()

    # Tokenization and POS tagging.
    words = []
    for sent in segment(text):
        for token in sent:
            words.append(token.value)

    # Create the token list.
    t_id = 1
    for word, xpos in pos_tag(words):
        wordnet_pos = get_wordnet_pos(xpos)
        lemma = lemmatizer(word, pos=wordnet_pos)
        # Start the token.
        t = {'id': t_id, 'text': word, 'stem': stemmer(word)}
        d['tokenList'].append(t)
        t_id += 1

        # WordNet senses.
        try:
            synsets = wordnet.synsets(lemma, pos=wordnet_pos)
            senses = {}
            for s in synsets:
                hyponyms = [y for x in s.hyponyms() for y in x.lemma_names()]
                hypernyms = [y for x in s.hypernyms() for y in x.lemma_names()]
                synonyms = s.lemma_names()[1:]
                examples = s.examples()
                sense = {'wordnetId': s.name(), 'definition': s.definition()}
                if synonyms:
                    sense['synonyms'] = synonyms
                if hypernyms:
                    sense['hypernyms'] = hypernyms
                if hyponyms:
                    sense['hyponyms'] = hyponyms
                if examples:
                    sense['examples'] = examples
                antonyms = [a.name() for l in s.lemmas() for a in l.antonyms()]
                if antonyms:
                    sense['antonyms'] = antonyms
                senses[sense['wordnetId']] = sense
            if senses:
                t['synsets'] = senses
        except Exception:
            pass

        # VerbNet classes and frames.
        try:
            verbs = dict((class_id, {'classId': class_id,
                                     'frames': vn.frames(class_id)})
                         for class_id in vn.classids(word))
            if verbs:
                t['verbFrames'] = verbs
        except Exception:
            pass

        # FrameNet frames, with the lexical units that evoke each frame.
        try:
            frame_net = {}
            frames = invoke_frame(word)
            if frames is not None:
                for fr in frames:
                    lu_temp = []
                    for lu in fn.lus(r'(?i)' + word.lower()):
                        fr_ = fn.frames_by_lemma(r'(?i)' + lu.name)
                        if len(fr_) and fr_[0] == fr:
                            lu_temp.append({'name': lu.name,
                                            'definition': lu.definition,
                                            'pos': lu.name.split('.')[1]})
                    frame_net[fr.ID] = {
                        'name': fr.name,
                        'frameId': fr.ID,
                        'definition': fr.definition,
                        'lu': lu_temp,
                    }
            if frame_net:
                t['frames'] = frame_net
        except Exception:
            pass

    return remove_empty_fields(j)
def getFrame(lex):
    frames = fn.frames_by_lemma(lex)
    for frame in frames:
        print(frame.name + " " + str(frame.ID))
import nltk
from pprint import pprint
from nltk.tokenize import word_tokenize
from nltk.corpus import framenet as fn

# Renamed from `list` to avoid shadowing the built-in.
tokens = word_tokenize('add boiling water into the cup.')
postagged = nltk.pos_tag(tokens)

print('frames:')
for x in tokens:
    print("('" + x + "','" + str(fn.frames_by_lemma(x)) + "')")

print('postags:')
for tagged in postagged:
    print(tagged)
print(nltk.pos_tag(tokens))
print(postagged[1])
print(tokens)
sent = input("Enter the sentence: ") no_punct = "" for char in sent: if char not in punct: no_punct = no_punct + char list = word_tokenize(no_punct.lower()) i = 0 postagged = nltk.pos_tag(list) print('words:' + str(list)) for x in list: print('frame' + '(' + x + ',' + str(fn.frames_by_lemma(x)) + ')') for i in range(len(postagged)): print('nltk_pos' + str(postagged[i])) with open('morph.xml', 'rt') as f: tree = ElementTree.parse(f) with open('types.xml', 'rt') as f2: tree2 = ElementTree.parse(f2) for node in tree.iter('entry'): for x in list: name = node.attrib.get('word') pos = node.attrib.get('pos')
from nltk.stem.wordnet import WordNetLemmatizer
lmtzr = WordNetLemmatizer()
lmtzr.lemmatize('humidity')

from nltk.stem.lancaster import LancasterStemmer
st = LancasterStemmer()
st.stem('luminous')

lemma('humidity')

frames = fn.frames_by_lemma(r'skin')
for f in frames:
    print('%s - %s\n' % (f.name, f.definition))

fn.lexical_units(r'')
fn.frames_by_lemma(r'(?i)a little')

for f in ('reflect', 'bank'):
    taxonomy.append(f, type='angle')
for f in ('bank', 'financial-institution'):