def load(path, model_class, suffix=''):
    """Load a saved model of ``model_class`` from disk.

    Reads the JSON config at ``path + '.config'``, rebuilds the
    ``Vocabulary`` objects that were serialized inside it, instantiates
    the model from the config, and restores its weights from
    ``path + '.state_dict' + suffix`` (mapped to CPU).

    Args:
        path: file-path prefix shared by the '.config' and '.state_dict' files.
        model_class: class whose constructor accepts the config dict.
        suffix: optional suffix appended to the state-dict filename.

    Returns:
        The ``model_class`` instance with its state dict loaded.
    """
    # `open` is the builtin alias of the old `io.open` in Python 3.
    with open(path + '.config', 'r', encoding='utf8') as f:
        config = json.load(f)

    # Vocabularies were serialized as their raw __dict__; rebuild real
    # Vocabulary instances around those dicts before constructing the model.
    word_voca = Vocabulary()
    word_voca.__dict__ = config['word_voca']
    config['word_voca'] = word_voca

    entity_voca = Vocabulary()
    entity_voca.__dict__ = config['entity_voca']
    config['entity_voca'] = entity_voca

    # snd_word_voca is optional — only some model configs contain it.
    if 'snd_word_voca' in config:
        snd_word_voca = Vocabulary()
        snd_word_voca.__dict__ = config['snd_word_voca']
        config['snd_word_voca'] = snd_word_voca

    model = model_class(config)
    # map_location forces CPU so checkpoints saved on GPU still load.
    model.load_state_dict(
        torch.load(path + '.state_dict' + suffix,
                   map_location=torch.device('cpu')))
    return model
def load_voca_embs(voca_path, embs_path):
    """Load a vocabulary and its embedding matrix, keeping them aligned.

    If the embedding matrix has exactly one row fewer than the vocabulary,
    an extra row (the mean of all embeddings, serving as the unknown-word
    vector) is appended. Any other size mismatch is an error.

    Args:
        voca_path: path understood by ``Vocabulary.load``.
        embs_path: path to a ``.npy`` file with one row per vocabulary item.

    Returns:
        (voca, embs) tuple with ``embs.shape[0] == voca.size()``.

    Raises:
        Exception: if the row count can't be reconciled with the vocabulary.
    """
    voca = Vocabulary.load(voca_path)
    embs = np.load(embs_path)

    # check if sizes are matched
    if embs.shape[0] == voca.size() - 1:
        # One row short: synthesize the missing <unk> row as the mean vector.
        print(embs.shape)
        unk_emb = np.mean(embs, axis=0, keepdims=True)
        embs = np.append(embs, unk_emb, axis=0)
    elif embs.shape[0] != voca.size():
        print(embs.shape, voca.size())
        # Fixed typo in the original message ("differnt number of items ").
        raise Exception("embeddings and vocabulary have different number of items")

    return voca, embs
def read_ent2id(ent_dic_path):
    """Load the entity dictionary at ``ent_dic_path`` and return its word -> id map."""
    print('load ent dic from', ent_dic_path)
    return Vocabulary.load(ent_dic_path).word2id
import sys
from nel.vocabulary import Vocabulary
import nel.utils as utils
import numpy as np

if __name__ == "__main__":
    # Script: shrink the full word-embedding table down to the words that
    # occur in a given core vocabulary, then save the reduced table + dict.
    core_voca_path = sys.argv[1]
    word_embs_dir = sys.argv[2]

    print('load core voca from', core_voca_path)
    core_voca = Vocabulary.load(core_voca_path)

    print('load full voca and embs')
    full_voca, full_embs = utils.load_voca_embs(
        word_embs_dir + '/all_dict.word',
        word_embs_dir + '/all_word_embeddings.npy')

    print('select word ids')
    # Ids of core words that exist in the full vocabulary (-1 means absent).
    selected = [wid
                for wid in (full_voca.word2id.get(w, -1)
                            for w in core_voca.id2word)
                if wid >= 0]

    print('save...')
    np.save(word_embs_dir + '/word_embeddings', full_embs[selected, :])
    with open(word_embs_dir + '/dict.word', 'w', encoding='utf8') as f:
        f.writelines(full_voca.id2word[i] + '\t1000\n' for i in selected)