def wnToBn(wn_labels, wordNet_to_babelNet, wordnetCompression):
    """
    Retrieve the list of labels containing BabelNet synsets
    :param wn_labels: list of labels with WordNet synsets
    :param wordNet_to_babelNet: dictionary containing the mapping wn_synset => bn_synset
    :param wordnetCompression: boolean flag; if False, labels are sensekeys and must first be
                               converted to wn_synsets before the wn_synset => bn_synset lookup
    :return bn_label_synsets: list of labels with BabelNet synsets
    """
    bn_label_synsets = []
    for label_sequence in wn_labels:
        temp = []
        for label in label_sequence:
            if wordnetCompression:
                if 'wn:' in label:
                    temp.append(wordNet_to_babelNet[label])
                else:
                    temp.append(label)
            else:
                if re.search(r'(%[1-9])', label):
                    pos = wn.lemma_from_key(label).synset().pos()
                    offset = wn.lemma_from_key(label).synset().offset()
                    synset_id = "wn:" + str(offset).zfill(8) + pos
                    temp.append(wordNet_to_babelNet[synset_id])
                else:
                    temp.append(label)
        bn_label_synsets.append(temp)
    return bn_label_synsets

def build_vocab_synset_SemCor():
    target_file = open(
        "../../WSD_Evaluation_Framework/Training_Corpora/SemCor/semcor.gold.key.txt", "r")
    # Create a vocab wrapper and add some special tokens.
    vocab = Vocabulary()
    # iterate through all definitions in the SemCor
    for line in target_file:
        # synset and literal definition from the WN
        key = line.replace('\n', '').split(' ')[-1]
        synset = wn.lemma_from_key(key).synset()
        # convert '.' to '__' for hashing
        synset = synset.name().replace('.', '__')
        vocab.add_word(synset)

    # add SemEval synsets
    semeval_file = open(
        "../../WSD_Evaluation_Framework/Evaluation_Datasets/semeval2007/semeval2007.gold.key.txt", "r")
    for line in semeval_file:
        key = line.replace('\n', '').split(' ')[-1]
        synset = wn.lemma_from_key(key).synset()
        synset = synset.name().replace('.', '__')
        vocab.add_word(synset)

    print("Total vocabulary size: {} {}".format(vocab.idx, len(vocab)))
    return vocab

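# Illustration (not from the original project) of the '.' -> '__' hashing used in
# build_vocab_synset_SemCor above; the sense key and its lemma come from the
# pythainlp doctest further down in this file.
from nltk.corpus import wordnet as wn

token = wn.lemma_from_key('exercise%2:41:00::').synset().name().replace('.', '__')
print(token)  # practice__v__01
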
def add_word(vocab, word, production_rule, keys=None, aliases=None):
    """ Get the synset for a word given its lemma key """
    if word is None or len(word.strip()) == 0:
        raise ConfigException("word can't be blank")
    word = word.strip().lower()

    import re
    from nltk.corpus import wordnet as wn
    from word import BaseWord

    synsets = []
    if keys is not None:
        # Create a list of synsets for this word, if any lemma keys were provided.
        if isinstance(keys, (list, tuple)):
            for k in keys:
                k = k.strip()
                s = wn.lemma_from_key(k).synset()
                if s is None:
                    raise ConfigException('lemma_from_key("%s") returned None' % k)
                elif s not in synsets:
                    synsets.append(s)
        else:
            s = wn.lemma_from_key(keys.strip()).synset()
            if s is None:
                raise ConfigException('lemma_from_key("%s") returned None' % keys.strip())
            elif s not in synsets:
                synsets.append(s)

    alias_list = []
    try:
        # Make sure that the word itself is in its own alias list,
        # assuming it's a valid regular expression.
        re.compile(word)
        alias_list.append(word)
    except re.error:
        pass

    # Create a list of aliases if applicable
    if aliases is not None:
        if isinstance(aliases, (list, tuple)):
            for a in aliases:
                a = a.strip()
                if len(a) > 0 and a not in alias_list:
                    alias_list.append(a)
        else:
            a = aliases.strip()
            if len(a) > 0 and a not in alias_list:
                alias_list.append(a)

    # Finally create a BaseWord object based on all of this
    baseword = BaseWord(word, production_rule, synsets, alias_list)
    vocab.append(baseword)

def patched_lemma_from_key(key, wordnet=wordnet):
    try:
        lemma = wordnet.lemma_from_key(key)
    except WordNetError as e:
        if key in patching_data:
            lemma = wordnet.lemma(patching_data[key])
        elif '%3' in key:
            lemma = wordnet.lemma_from_key(key.replace('%3', '%5'))
        else:
            raise e
    return lemma

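# Hedged usage sketch for patched_lemma_from_key above. It assumes the surrounding
# module already defines `wordnet` (the NLTK reader) and the `patching_data` dict,
# as the signature implies. The '%3' -> '%5' retry reflects that adjective-satellite
# senses use ss_type 5 in WordNet sense keys, while some corpora tag them with the
# plain-adjective ss_type 3. The sample key below resolves without any patching.
lemma = patched_lemma_from_key('exercise%2:41:00::')
print(lemma.synset())  # Synset('practice.v.01')
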
def parseXsl(self, fname):
    f = xlrd.open_workbook(fname)
    f_sheet = f.sheet_by_index(0)
    translator = {}
    for row in range(1, f_sheet.nrows):
        verb_obj = f_sheet.cell(row, 0)
        noun_obj = f_sheet.cell(row, 3)
        v = verb_obj.value
        n = noun_obj.value
        translator[wn.lemma_from_key(v).synset()] = wn.lemma_from_key(n).synset()
    return translator

def make_wn2_wn3():
    key_map = {}
    id_map = {}
    curr_path = os.getcwd() + "/"
    _rel_path = curr_path.split('/data/')[0]
    print(_rel_path)
    _path = _rel_path + '/data/wn2.1_3.0'
    print("PTH", _path)
    mono_files = glob.glob(os.path.join(_path, '*.mono'))
    poly_files = glob.glob(os.path.join(_path, '*.poly'))

    for file_name in mono_files:
        with open(file_name) as fp:
            for each in fp.readlines():
                wn2_sense_key, wn2_synset_offs, \
                    wn3_sense_key, wn3_synset_offs = each.strip().split(' ')
                ref_syn = wn.lemma_from_key(wn3_sense_key).synset()
                wn2_word = ref_syn.name().split('.')[0]
                wn2_pos = ref_syn.pos()
                key_map[wn2_sense_key] = [wn3_sense_key]
                id_map[(wn2_word, wn2_pos, 1)] = (wn3_sense_key, wn3_synset_offs, 1)

    for file_name in poly_files:
        with open(file_name) as fp:
            for each in fp.readlines():
                all_keys = each.strip().split(' ')
                wn2_sense_key, wn2_synset_offs, wn2_sense_number = all_keys[1].split(';')
                wn3_vals = [tuple(each.split(';')) for each in all_keys[2:]]
                key_vals = []
                id_vals = []
                for wn3_sense_key, wn3_synset_offs, wn3_sense_number in wn3_vals:
                    if wn3_sense_key:
                        id_vals.append((wn3_sense_key, wn3_synset_offs, wn3_sense_number))
                        key_vals.append(wn3_sense_key)
                if key_vals:
                    ref_syn = wn.lemma_from_key(key_vals[0]).synset()
                    wn2_word = ref_syn.name().split('.')[0]
                    wn2_pos = ref_syn.pos()
                    key_map[wn2_sense_key] = key_vals
                    id_map[(wn2_word, wn2_pos, wn2_sense_number)] = id_vals

    with open(_rel_path + "/data/wn3_id_map", "wb") as file_handle:
        pickle.dump(id_map, file_handle)
    with open(_rel_path + "/data/wn3_key_map", "wb") as file_handle:
        pickle.dump(key_map, file_handle)
    return id_map, key_map

def lemmaToWNsynset(sentences, labels, isTrain):
    """
    Transform every lemma associated with a sensekey into a WordNet synset,
    and record the mapping lemma => wn_synset in a dictionary that is saved as a .txt file.
    :param sentences: list of sentences
    :param labels: list of labels composed of lemmas and sensekeys
    :param isTrain: boolean variable used to distinguish between training and dev set operations
    :return updated_labels: list of labels with lemmas and wn_synsets
    """
    lemma_to_wn = {}  # Dictionary containing the mapping lemma => wordnet synset
    updated_labels = []
    # For every sentence of the dataset
    for i in range(len(sentences)):
        temp = []
        current_label_sequence = labels[i]
        current_sentence = sentences[i]
        # For every token in the current sentence
        for j in range(len(current_sentence)):
            lemma = current_sentence[j]
            label = current_label_sequence[j]
            # Check if the label is a sensekey
            if re.search(r'(%[1-9])', label):
                # From the sensekey extract the synset
                pos = wn.lemma_from_key(label).synset().pos()
                offset = wn.lemma_from_key(label).synset().offset()
                wn_synset = "wn:" + str(offset).zfill(8) + pos
                # Add the pair (lemma, wordnet_synset) to the dictionary
                if lemma not in lemma_to_wn:
                    lemma_to_wn[lemma] = [wn_synset]
                else:
                    if wn_synset not in lemma_to_wn[lemma]:
                        lemma_to_wn[lemma].append(wn_synset)
                lemma = wn_synset
            temp.append(lemma)
        updated_labels.append(temp)

    # If we worked on the training set, save the dictionary to a file
    if isTrain:
        if not os.path.exists("../../resource/Mapping_Files/lemma_to_wn.txt"):
            with open("../../resource/Mapping_Files/lemma_to_wn.txt", 'w') as file:
                for elem in lemma_to_wn:
                    line = elem + " " + " ".join(lemma_to_wn[elem])
                    file.write(line + "\n")
    return updated_labels

def extractLabelIdentifier(sentences, ids, lemmas_mapping, vocab_identifier, wordnetCompression):
    """
    Every lemma to disambiguate is associated with a label which is mapped to an integer.
    This mapping is contained in the vocab_identifier variable.
    Lemmas to disambiguate are first searched in the "lemmas_mapping" dictionary. If they are not
    present there, the MFS (most frequent sense) back-off is applied and their label is recovered
    from the WordNet interface.
    :param sentences: list of tokenized sentences
    :param ids: list of identifiers for lemmas to predict
    :param lemmas_mapping: lemma => label mapping
    :param vocab_identifier: label => integer mapping
    :param wordnetCompression: boolean variable used to check if a model with the wordnet synset compression method has been used
    :return identifiers_list: list of lists of integers and sensekeys; each inner list contains all the sensekey identifiers associated with a lemma
    """
    identifiers_list = []
    for i in range(len(sentences)):
        sentence = sentences[i]
        id_sequence = ids[i]
        sentence_ids = []
        for j in range(len(sentence)):
            word = sentence[j]
            id = id_sequence[j]
            word_ids = []
            # Check if the current word was met during training
            if not id == '0':
                if word in lemmas_mapping:
                    # If it is, extract the sensekeys associated with the lemma
                    sensekeys = lemmas_mapping[word]
                    # Then look up the identifier of each sensekey
                    for sensekey in sensekeys:
                        word_ids.append(vocab_identifier[sensekey])
                else:
                    # Take the most frequent sense from wordnet
                    mfs = str(wn.synsets(word)[0])[8:-2]
                    # Retrieve the corresponding sensekey
                    sensekey = wn.synset(mfs).lemmas()[0].key()
                    if wordnetCompression:
                        # Transform the sensekey into a wordnet synset
                        pos = wn.lemma_from_key(sensekey).synset().pos()
                        offset = wn.lemma_from_key(sensekey).synset().offset()
                        wn_synset = "wn:" + str(offset).zfill(8) + pos
                        word_ids.append(wn_synset)
                    else:
                        word_ids.append(sensekey)
            if word_ids:
                sentence_ids.append(word_ids)
        identifiers_list.append(sentence_ids)
    return identifiers_list

def create_lookup_entry(lemma, dante_answer, answer_id):
    """ Returns False if the WordNet lookup failed """
    try:
        if lemma == "U":
            wn_definition = "NO DEFINITION FOUND - CHECK CONTEXT"
        else:
            wn_definition = wn.lemma_from_key(lemma).synset.definition
        write_string = str("For answer: " + answer_id +
                           "\nAre these definitions of " + dante_answer["headword"] +
                           " the same?\n" + wn_definition +
                           "\n AND\n" + dante_answer["meaning"] + "\n")
        yes_or_no = None
        while yes_or_no != "y" and yes_or_no != "n":
            stdout.write("\r%s" % write_string)
            yes_or_no = raw_input("y or n?: ")
            stdout.flush()
            sleep(1)

        if yes_or_no == "y":
            DanteAPI.write_to_lookup(lemma, dante_answer["meaning"], True)
        elif yes_or_no == "n":
            DanteAPI.write_to_lookup(lemma, dante_answer["meaning"], False)
        else:
            raise ValueError("Wrong input squeezed through")
        stdout.write("\n")
        return True
    except WordNetError as err:
        synsets = wn.synsets(dante_answer["headword"])
        print "head word " + dante_answer["headword"] + " and lemma " + lemma + " caused error: " + err.message
        print "Synsets: " + str(synsets)
        print "FIX IT BEFORE NEXT RUN"
        return False

def _disambiguation():
    ev = [
        'noun.phenomenon', 'noun.act', 'noun.event', 'noun.attribute',
        'adj.all', 'adv.all'
    ]
    true_splits = []
    with cd(path.join('lib', 'ims_0.9.2.1')):
        pattern = '<x(.+?)</x>'
        with open('out_.txt', 'r', encoding='utf-8') as f, open("test_.txt", 'r', encoding='utf-8') as f1:
            for line, line1 in zip(f, f1):
                matches = re.finditer(pattern, line)
                lexnames = []
                for m in matches:
                    key = re.search('(?<=\s)([^ ]+?)(?=\|)', m[0]).group(0)  # for ' natural%3:00:03::|'
                    try:
                        lexname = wn.lemma_from_key(key).synset().lexname()
                        lexnames.append(lexname)
                    except WordNetError:
                        print(key)
                print(lexnames)
                print(line1)
                if set(lexnames).intersection(set(ev)):
                    true_splits.append(line1.strip())
    print('Disambiguation: ', true_splits)
    return true_splits

def convert_goldkey2domain(input_path: str, output_path: str, bn2domain: Dict[str, str], is_bn: bool = True) -> None:
    """
    This method is used to convert a goldkey map.
    :param input_path: path of the goldkey file to convert
    :param output_path: path where the converted file is written
    :param bn2domain: a BabelNet-to-domain map
    :param is_bn: if True, it converts to BabelNet format; if False, to coarse-grained format
    :return: None
    """
    wn2bn = read_map(config.BABELNET2WORDNET_TR, reverse=True)
    with open(str(output_path), mode="w") as out_file:
        with open(str(input_path), mode="r") as in_file:
            for line in in_file:
                line = line.strip().split()
                syn = wn.lemma_from_key(line[1]).synset()
                syn = "wn:" + str(syn.offset()).zfill(8) + syn.pos()
                syn = _get_frequent_sense(syn, wn2bn, bn2domain, is_bn)
                out_file.write(line[0] + " " + syn + "\n")

def create_labels_words(path=path_dataset + "semcor+omsti.gold.key.txt"):
    ###############################################################################
    # This function, given the gold file, creates a dictionary of labels
    # (BabelNet id, WordNet domain, lexname) for each ambiguous word.
    #
    # Input:
    #    path: path of the gold file
    #
    # Output:
    #    dict_sensekey: dictionary of labels
    ###############################################################################
    sense_keys = [sensekey.split() for sensekey in utils.load_txt(path)]
    dict_sensekey = {}
    for list_info in sense_keys:
        # take the synset from the sense key
        synset = wn.lemma_from_key(list_info[1]).synset()
        # take the wordnet id from the sense key
        wn_id = "wn:" + str(synset.offset()).zfill(8) + synset.pos()
        bn_id = Wordnet_match[wn_id]
        try:
            dict_sensekey[list_info[0]] = [
                bn_id, Wndomain_match[bn_id], lexname_match[bn_id]
            ]
        # add the factotum label to all the words which don't have a wndomain label
        except KeyError:
            dict_sensekey[list_info[0]] = [
                bn_id, "factotum", lexname_match[bn_id]
            ]
    return dict_sensekey

def parse_corpus(path_name='Training_Corpora/SemCor/'):
    """This function parses the training data."""
    keys_path = glob.glob(os.path.join(path_name, '*gold.key.txt'))[0]
    sentences_path = glob.glob(os.path.join(path_name, '*data.xml'))[0]

    keys = dict()
    with open(keys_path, 'r') as f:
        for line in f:
            line = line.strip().split(' ')
            id_ = line[0]
            synset_keys = line[1:]
            synsets = [wn.lemma_from_key(k).synset() for k in synset_keys]
            keys[id_] = synsets

    with open(sentences_path, 'r') as f:
        tree = etree.parse(f)

    training = []
    for sentence in tree.xpath('//sentence'):
        sent_id = sentence.attrib['id']
        tags = []
        words = []
        for chunck in sentence[:]:
            type_ = chunck.tag
            if type_ == 'instance':
                tags.append(keys[chunck.attrib['id']])
            if chunck.attrib['pos'] != '.' and chunck.attrib['lemma'] not in STOPWORD:
                words.append(chunck.attrib['lemma'])
        training.append((words, list(set(itertools.chain.from_iterable(tags)))))
    return training

def lemma_from_key(key):
    """
    This function returns the lemma object given the lemma key.
    This is similar to :func:`lemma` but it is supplied with the key
    of the lemma instead of the name.

    .. note::
        Supports only English language (*eng*).

    :param str key: key of the lemma object
    :return: lemma object with the given key
    :rtype: :class:`Lemma`

    :Example:

        >>> from pythainlp.corpus.wordnet import lemma, lemma_from_key
        >>>
        >>> practice = lemma('practice.v.01.exercise')
        >>> practice.key()
        'exercise%2:41:00::'
        >>> lemma_from_key(practice.key())
        Lemma('practice.v.01.exercise')
    """
    return wordnet.lemma_from_key(key)

def calc_fiwn_counts():
    en2fi = {}
    for (fi_synset_key, fi_lemma_str, en_synset_key, en_lemma_str, rel, extra) in get_transl_iter():
        if rel != "synonym":
            continue
        fi_lemma = get_lemma(fiwn, fi_synset_key, fi_lemma_str)
        assert fi_lemma is not None
        en_lemma = get_lemma(wordnet, en_synset_key, en_lemma_str)
        assert en_lemma is not None
        en2fi.setdefault(en_lemma.key(), []).append(fi_lemma.key())

    divisors = set()
    counts = {}
    for en, fis in en2fi.items():
        for fi in fis:
            counts.setdefault(fi, 0.0)
            try:
                en_lemma = wordnet.lemma_from_key(en)
            except WordNetError:
                # The following lemmas are not in the PWN sense index for some reason:
                # ['earth%1:17:02::', 'ddc%1:06:01::', 'kb%1:23:01::', 'sun%1:17:02::',
                #  'moon%1:17:03::', 'earth%1:15:01::', 'ddi%1:06:01::', 'kb%1:23:03::']
                pass
            else:
                div = len(fis)
                divisors.add(div)
                counts[fi] += en_lemma.count() / div

    mult = reduce(lcm, divisors)
    for lemma, cnt in counts.items():
        counts[lemma] = int((cnt * mult) + 0.5)
    return counts

def get_synset_from_sense_key(sense_key):
    try:
        lemma = wn.lemma_from_key(sense_key)
    except:
        # print("no synset found for: " + sense_key)
        return "NA"
    return str(lemma.synset())

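# Hedged example for get_synset_from_sense_key above, using the key from the
# pythainlp doctest earlier in this file; any key the lookup cannot resolve falls
# into the bare except and yields "NA".
print(get_synset_from_sense_key('exercise%2:41:00::'))  # Synset('practice.v.01')
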
def wsd_input_format(wsd_src_data, eval_result):
    '''
    test_data[0]
    {'target_word': u'art#n',
     'target_sense': None,
     'id': 'senseval2.d000.s000.t000',
     'context': ['the', '<target>', 'of', 'change_ringing', 'be', 'peculiar', 'to', 'the',
                 'english', ',', 'and', ',', 'like', 'most', 'english', 'peculiarity', ',',
                 'unintelligible', 'to', 'the', 'rest', 'of', 'the', 'world', '.'],
     'poss': ['DET', 'NOUN', 'ADP', 'NOUN', 'VERB', 'ADJ', 'PRT', 'DET', 'NOUN', '.', 'CONJ',
              '.', 'ADP', 'ADJ', 'ADJ', 'NOUN', '.', 'ADJ', 'PRT', 'DET', 'NOUN', 'ADP',
              'DET', 'NOUN', '.']}
    '''
    wsd_input = []
    senses_input = []
    for i in range(len(eval_result)):
        block = i / 32
        src_word1, src_word2 = backward_step1_in[2 * block], backward_step1_in[2 * block + 1]
        tgt_sent = wsd_src_data[i].decode().encode('utf-8') + ' ' + eval_result[i]
        tgt_word = src_word1
        synset = wn.lemma_from_key(tgt_word).synset()
        s = synset.name()
        target_word = '#'.join(s.split('.')[:2])
        context = tgt_sent.split(' ')
        for j in range(len(context)):
            if context[j] == tgt_word:
                context[j] = '<target>'
        poss_list = ['.' for _ in range(len(context))]
        tmp_dict = {
            'target_word': target_word,
            'target_sense': None,
            'id': None,
            'context': context,
            'poss': poss_list
        }
        wsd_input.append(tmp_dict)
        senses_input.append((src_word1, src_word2))
    return wsd_input, senses_input

def wordnet_process(filename):
    '''
    {'target_word': u'picture#n', 'target_sense': None, 'id': None,
     'context': [u'<unk>', u'is', u'the', '<target>', u'of', u'the', u'<unk>', u'and', u'the', u'<unk>'],
     'poss': ['.', '.', '.', '.', '.', '.', '.', '.', '.', '.']}
    '''
    dict_list = []
    with open(filename) as f1:
        for i, line in enumerate(f1):
            wordlist = line.strip().split()
            dict = {}
            target_id = -1
            target = ''
            for id, word in enumerate(wordlist):
                if '%' in word:
                    name = wn.lemma_from_key(word).synset().name().encode('utf-8').split('.')
                    target_id = id
                    # target = name[0]+'#'+name[1]
                    target = word[:word.index('%')] + '#' + name[1]
                    break
            dict['target_word'] = target
            dict['target_sense'] = None
            dict['id'] = None
            dict['context'] = []
            for id, word in enumerate(wordlist):
                if id == target_id:
                    dict['context'].append('<target>')
                else:
                    dict['context'].append(word)
            dict['poss'] = ['.' for i in range(0, len(wordlist))]
            dict_list.append(dict)
    with open(filename + '.pickle', 'wb') as fp:
        pickle.dump(dict_list, fp)

def sk_to_ss():
    """Update sensekey in tag file to synsetID (offset-pos)"""
    all_sk = set()
    print("Reading tag file ...")
    with open(SEMCOR_TAG, 'r') as semcor_tag:
        lines = [x.split() for x in semcor_tag.readlines()]
        for line in lines:
            sk = line[3]
            scloc = sk.find(';')
            if scloc > -1:
                sk = sk[:scloc]  # only consider the first sensekey
            all_sk.add(sk)
    print(len(all_sk))

    print("Loading WordNet ...")
    from nltk.corpus import wordnet as wn
    all_sk_notfound = set()
    with open(SS_SK_MAP, 'w') as mapfile:
        for sk in all_sk:
            try:
                if sk not in all_sk_notfound:
                    ss = wn.lemma_from_key(sk).synset()
                    sid = '%s-%s' % (ss.offset(), ss.pos())
                    mapfile.write('%s\t%s\n' % (sk, sid))
            except nltk.corpus.reader.wordnet.WordNetError:
                all_sk_notfound.add(sk)
            except ValueError:
                print("Invalid sk: %s" % (sk,))
                all_sk_notfound.add('[INVALID]\t' + sk)

    with open(SK_NOTFOUND, 'w') as notfoundfile:
        for sk in all_sk_notfound:
            notfoundfile.write(sk)
            notfoundfile.write('\n')
    print("Map file has been created")

def sc2ss(sensekey):
    '''Look up a synset given the information from SemCor'''
    ### Assuming it is the same WN version (e.g. 3.0)
    try:
        return wn.lemma_from_key(sensekey).synset()
    except:
        pass

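# Hedged check for sc2ss above; the sense key and the expected synset are the ones
# used in the semcor_to_offset/semcor_to_synset docstrings further down. A key that
# WordNet cannot resolve falls through the bare except and returns None.
print(sc2ss('live%2:42:06::'))  # Synset('live.v.02')
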
def evaluate(instance, sentence, golds):
    preds = predict(instance, sentence)
    res = []
    for p, g in zip(preds, golds):
        g = [wordnet.lemma_from_key(i).synset() for i in g[1]]
        res.append(p in g)
    return res

def lemmas(self):
    '''Return lemmas tied to the synsets'''
    if self.__BNLANG:
        return [s.getLemmas(self.__BNLANG) for s in self.synsets()]
    else:
        from nltk.corpus import wordnet as wn
        print(self.nearest)
        return [wn.lemma_from_key(k) for k, d in self.nearest]

def lemma_to_synset_key(keyin, keyout):
    for line in keyin:
        inst_id, lemma_ids = line.split(" ", 1)
        keyout.write(inst_id)
        for lemma_id in lemma_ids.split():
            keyout.write(" " + wordnet.ss2of(wordnet.lemma_from_key(lemma_id).synset()))
        keyout.write("\n")

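# Hedged round-trip example for lemma_to_synset_key above, run on in-memory files.
# The instance id is made up; the sense key and its 02614387-v offset are taken
# from the semcor_to_offset docstring below. Assumes `wordnet` here is the NLTK
# reader used by the function.
import io
from nltk.corpus import wordnet

src = io.StringIO("d000.s000.t000 live%2:42:06::\n")
dst = io.StringIO()
lemma_to_synset_key(src, dst)
print(dst.getvalue())  # d000.s000.t000 02614387-v
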
def get_tokens(self, in_file):
    root = ET.parse(in_file).getroot()
    for i, s in enumerate(root.findall('document/paragraph/sentence')):
        for t in s:
            synset_labels, lexname_labels = [], []
            if 'wn30_key' in t.attrib:
                sensekey = t.attrib['wn30_key']
                try:
                    synset = wn.lemma_from_key(sensekey).synset()
                except Exception as e:
                    sensekey = sensekey.replace('%3', '%5')  # a fix for unprocessable satellites
                    synset = wn.lemma_from_key(sensekey).synset()  # now, we should be able to find the modified sensekey in WN
                synset_labels.append(synset.name())
                lexname_labels.append(synset.lexname())
            yield synset_labels, lexname_labels, t.attrib['surface_form']

def create_dictionary(dataset_name, gold2dic, train=False):
    words = []
    wordnet = []
    # get an iterable
    context = etree.iterparse(dataset_name)
    sentence = []
    sentenceNet = []
    deletedSentences = 0
    dictionary = {}
    for event, elem in iter(context):
        if elem.tag == "sentence":
            # if(int(elem.attrib['id'])%10 == 0)
            print('\t' + elem.attrib['id'])
            if len(sentence) < 1:
                deletedSentences += 1
            else:
                words.append(' '.join(sentence))
                wordnet.append(' '.join(sentenceNet))
            sentence = []
            sentenceNet = []
        elif elem.tag == "wf" or elem.tag == "instance":
            lemma = elem.attrib["lemma"].lower()
            sentence.append(lemma)
            if elem.tag == "instance":
                dataset_id = elem.attrib["id"]
                synset = wn.lemma_from_key(gold2dic[dataset_id]).synset()
                synset_id = "wn:" + str(synset.offset()).zfill(8) + synset.pos()
                sentenceNet.append(synset_id)
                if lemma not in dictionary:
                    dictionary[lemma] = [synset_id]
                elif synset_id not in dictionary[lemma]:
                    dictionary[lemma].append(synset_id)
            else:
                sentenceNet.append(lemma)
        elem.clear()

    if train:
        save(dictionary, '../resources/' + 'synsetsdic')
        flag = 'train'
    else:
        flag = 'dev'

    save(words, '../resources/' + 'words_' + flag)
    save(wordnet, '../resources/' + 'wordnet_' + flag)
    print('\nSentences removed:', deletedSentences)
    return words, wordnet

def build_vocab_decoder_SemCor(threshold):
    # Create a vocab wrapper and add some special tokens.
    counter = Counter()
    target_file = open(
        "../../WSD_Evaluation_Framework/Training_Corpora/SemCor/semcor.gold.key.txt", "r")
    # iterate through all definitions in the SemCor
    for line in target_file:
        # synset and literal definition from the WN
        key = line.replace('\n', '').split(' ')[-1]
        synset = wn.lemma_from_key(key).synset()
        definition = synset.definition()
        def_tokens = nltk.tokenize.word_tokenize(definition)
        counter.update(def_tokens)

    # add SemEval synsets
    semeval_file = open(
        "../../WSD_Evaluation_Framework/Evaluation_Datasets/semeval2007/semeval2007.gold.key.txt", "r")
    for line in semeval_file:
        key = line.replace('\n', '').split(' ')[-1]
        synset = wn.lemma_from_key(key).synset()
        definition = synset.definition()
        def_tokens = nltk.tokenize.word_tokenize(definition)
        counter.update(def_tokens)

    # If the word frequency is less than 'threshold', then the word is discarded.
    words = [word for word, cnt in counter.items() if cnt >= threshold]

    # Create a vocab wrapper and add some special tokens.
    vocab = Vocabulary()
    vocab.add_word('<pad>')
    vocab.add_word('<start>')
    vocab.add_word('<end>')
    vocab.add_word('<unk>')

    # Add the words to the vocabulary.
    for i, word in enumerate(words):
        vocab.add_word(word)
    print("Total vocabulary size: {}".format(vocab.idx))
    return vocab

def return_synset_id(key):
    '''
    Return the WordNet synset_id for the sense_key of a given input word.
    :param key: the input sense_key
    :return: the synset_id of the sense_key
    '''
    synset = wn.lemma_from_key(key).synset()
    synset_id = "wn:" + str(synset.offset()).zfill(8) + synset.pos()
    return synset_id

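# Quick hedged illustration of the "wn:<offset><pos>" identifier produced by
# return_synset_id above; the offset and POS for this key are grounded in the
# semcor_to_offset docstring below.
print(return_synset_id('live%2:42:06::'))  # wn:02614387v
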
def wn_id_from_sense_key(sense_key):
    """
    Returns a WordNet ID built out of offset and POS from a given WordNet sense key.
    :param sense_key: WordNet sense key
    :return: WordNet ID corresponding to the given sense key
    """
    synset = wn.lemma_from_key(sense_key).synset()
    return wn_id_from_synset(synset)

def semcor_to_offset(sensekey):
    """
    Converts a SemCor sensekey to a synset offset ID.

    >>> print(semcor_to_offset('live%2:42:06::'))
    02614387-v
    """
    synset = wn.lemma_from_key(sensekey).synset()
    offset = '%08d-%s' % (synset.offset(), synset.pos())
    return offset

def readAnswers(ifile="corpora/answers+misc/tasks/english-all-words/key"):
    answers = {}
    pattern = re.compile(r"(?P<id>d\S*)\s+(?P<sense>\S+:\S+)")
    for i in pattern.finditer(open(ifile).read()):
        try:
            answers[i.group("id")] = wn.lemma_from_key(i.group("sense"))
        except:
            pass
    return answers

def sk2lemma(self, sk, use_ws=False):
    try:
        lemma_name = wn.lemma_from_key(sk).name()
    except:
        lemma_name = sk.split('%')[0]
    if use_ws:
        lemma_name = lemma_name.replace('_', ' ')
    return lemma_name

def get_rel_lemmas(lemma_str):
    ret_lemmas = []
    syn = wn.lemma_from_key(lemma_str).synset
    ret_lemmas = [i.name.lower() for i in syn.lemmas]
    hyns = set(get_all_hypernyms(syn, 3))
    for h in hyns:
        ret_lemmas += [i.name.lower() for i in h.lemmas]
        ret_lemmas += [i.lower() for i in lemmatize(nltk.word_tokenize(h.definition))]
    return ret_lemmas

def get_sense(word):
    try:
        synset = wn.lemma_from_key(word).synset().lemma_names()
    except Exception:
        return word
    for w in synset:
        if w != word:
            return w
    return word

def semcor_to_synset(sensekey):
    """
    Look up a synset given the information from the SemCor sensekey format.
    (Thanks to @FBond, see http://moin.delph-in.net/SemCor)

    >>> ss = semcor_to_synset('live%2:42:06::')
    >>> print('%08d-%s' % (ss.offset(), ss.pos()))
    02614387-v
    >>> print(ss, ss.definition())
    Synset('live.v.02') lead a certain kind of life; live in a certain style
    """
    return wn.lemma_from_key(sensekey).synset()

def get_mfs(self, lemma, sensekeys=None):
    senseid = None
    try:
        if(not sensekeys):
            sensekeys = self.get_sense_keys(lemma)
        sense_freqs = {}
        for i in sensekeys:
            lemma_obj = wn.lemma_from_key(i)
            sense_freqs[i] = lemma_obj.count()
        senseid = max(sense_freqs.iteritems(), key=operator.itemgetter(1))[0]
    except(ValueError):
        print "Error: No senses for %s" % (lemma)
    return senseid

def semcor2synset(self, sense): return wn.lemma_from_key(sense).synset()
def lemma_from_key(key): return wordnet.lemma_from_key(key)
from nltk.corpus import wordnet as wn
from nltk.corpus.reader.wordnet import WordNetError

with open("EnglishAW.test.key") as answer_file:
    answers = answer_file.read().split('\n')[:-1]

for answer in answers:
    answer = answer.split()
    word_id = answer[1]
    lemmas = answer[2:]
    for lemma in lemmas:
        try:
            if lemma != "U":
                synset = wn.lemma_from_key(lemma).synset()
        except WordNetError:
            print("word id: {} lemma: {}".format(word_id, lemma))

def semcor_to_offset(sensekey):
    synset = wn.lemma_from_key(sensekey).synset()
    offset = '%08d-%s' % (synset.offset(), synset.pos())
    return offset

def slesk(self, wordforms):
    sensepreds = {}
    sw_hash = {i: 1 for i in stopwords.words('english')}
    open_wfs = [i for i in wordforms if i.tag != 'punc' and not ('cmd' in i.attrib and i.attrib['cmd'] == "ignore")] # remove stop words and punctuation first
    # open_wfs = [i for i in wordforms if i.tag != 'punc'] # remove only punctuation

    for wf in wordforms:
        if('cmd' in wf.attrib and wf.attrib['cmd'] == 'done'):
            lemma = wf.attrib['lemma'] # may be need to automatically lemmatize here
            sensekeys = self.get_sense_keys(lemma)
            if(len(sensekeys) == 0):
                print "Error: No senses for %s" % lemma
                continue
            synsets = {k: wn.lemma_from_key(k).synset for k in sensekeys}
            idfs = text_utils.get_idfs()

            # get the window now
            window = 2
            idx = [i for i, x in enumerate(open_wfs) if x == wf][0]
            lbound = idx-window if idx-window > 0 else 0
            ubound = idx+window if idx+window < len(open_wfs) else len(open_wfs)-1
            # all_context = set(text_utils.lemmatize(([i.text.lower() for i in open_wfs[lbound:(ubound+1)] if ('cmd' not in i.attrib or (i.attrib['cmd'] != "ignore" and i.attrib['id'] != wf.attrib['id']))]))) # this one keeps stopwords in window count
            all_context = set(text_utils.lemmatize(([i.text.lower() for i in open_wfs[lbound:(ubound+1)] if (i.attrib['id'] != wf.attrib['id'])]))) # this one keeps stopwords in window count
            jc_th = 0.1
            context = [i for i in all_context if text_utils.compute_jc_sim(i, lemma) > jc_th] # lexical chain selection algorithm

            outstr = "-------------------"
            outstr += "\ncontext: " + str(context)
            # best = self.get_mfs(lemma)
            max = 0
            cands = []
            for k in synsets.keys():
                synset = synsets[k]
                wntext = text_utils.lemmatize(nltk.word_tokenize(synset.definition))
                for ex in synset.examples:
                    wntext += text_utils.lemmatize(nltk.word_tokenize(ex))
                # wntext += text_utils.lemmatize(text_utils.get_rel_lemmas(k)) # related lemmas from hypernyms etc.
                wntext = [i.lower() for i in wntext]
                lenlog = math.log(len(wntext))
                normalizer = 1 / lenlog if lenlog > 0 else 0
                outstr += "\n" + k + ":" + str(wntext)
                wn_hash = {i: 1 for i in wntext}
                matches = {}
                score = 0
                for i in context:
                    if(i in sw_hash):
                        continue
                    if i in wntext:
                        score += 1
                        # if i in idfs:
                        #     score += idfs[i]
                        # else:
                        #     score += 3
                        matches[i] = 1 #idfs[i]
                outstr += "\nScore: %s:%f" % (matches, score)
                # score = score * normalizer
                outstr += "\nNorm score: %s:%f" % (matches, score)
                if score > max:
                    cands = [k]
                    max = score
                elif score == max:
                    cands.append(k)

            if(len(cands) > 1):
                best = self.get_mfs(lemma, cands)
            else:
                best = cands[0]

            mfs_id = self.get_mfs(lemma)
            true_id = lemma + "%" + wf.attrib['lexsn']
            if mfs_id == true_id and best != mfs_id:
                print "stat:leskbad"
                print outstr
                print "MFS: %s, LESK: %s, CORRECT: %s" % (self.get_mfs(lemma), best, wf.attrib['lexsn'])
            elif mfs_id != true_id and best == true_id:
                print "stat:leskgood"
                print outstr
                print "MFS: %s, LESK: %s, CORRECT: %s" % (self.get_mfs(lemma), best, wf.attrib['lexsn'])
            elif max == 0:
                print "stat:nolesk"
            else:
                print "stat:lesksame"

            if(best):
                m = re.match("^" + lemma + "\%(\d+:\d+:\d+:(.*))", best)
                sensepreds[wf.attrib['id']] = m.group(1)
            else:
                sensepreds[wf.attrib['id']] = "gibberish"
    return sensepreds

def semcor_to_synset(sensekey):
    return wn.lemma_from_key(sensekey).synset()

def getSSFromLemma(l): return str(wordnet.lemma_from_key(l).synset().name())