def load_model():
    # load latest model checkpoint from the current directory
    cp_list = [item for item in os.listdir('.') if item.endswith('.ep0')]
    if len(cp_list) == 0:
        print('no saved model, initializing...')
        return Kor2Vec(embed_size=128)
    cp_list.sort()
    cp_path = cp_list[-1]
    print('Loading model {}...'.format(cp_path))
    return Kor2Vec.load(cp_path)
def _process_text_data(path='data/processed/ratings_train.txt'):
    '''
    Embed text into sequences using the pretrained kor2vec model.
    Text sequences will be saved to data/processed/text_squence_.npy and
    text labels to data/processed/text_label_.npy.
    These files are later used to build the embedding weights passed to the
    embedding layer of the Keras model.
    :param path: path to the text (and label) file
    :return: sequences and corresponding labels
    ### this might take time
    '''
    texts = []
    labels = []
    k2v = Kor2Vec.load('source/models/k2v.model')
    with open(path, 'r', encoding='utf-8') as f:
        for line in tqdm(f):
            line_ = line.replace('\n', '')
            text_id, text, text_label = line_.split('\t')
            labels.append(text_label)
            texts.append(text)
    text_seqs = k2v.embedding(texts, seq_len=1, numpy=True)
    label_seqs = np.array(labels, dtype=np.int64)
    np.save('data/processed/text_label_.npy', label_seqs, allow_pickle=False)
    np.save('data/processed/text_squence_.npy', text_seqs.astype(np.float32), allow_pickle=False)
    return text_seqs, label_seqs
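# Hedged usage sketch (not part of the original code): reload the arrays saved by
# _process_text_data() and inspect their shapes before wiring them into the Keras
# model mentioned in the docstring. File names follow the code above; the exact
# array shapes depend on the kor2vec model and are an assumption.
import numpy as np

seqs = np.load('data/processed/text_squence_.npy')    # roughly (num_texts, seq_len, embed_size)
labels = np.load('data/processed/text_label_.npy')    # (num_texts,)
print(seqs.shape, labels.shape)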
def loader():
    word_emb = Kor2Vec.load('kor2vec03010231.checkpoint.ep0')
    ld = LOADER(word_emb)
    for batch in ld.train:
        print(batch[0], batch[1])
        break
    for batch in ld.train:
        print(batch[0], batch[1])
        break
def kor2vec_main():
    # define model and bind to NSML
    model = Kor2Vec()

    # train the model
    model.train(corpus_path=args.corpus_path if not args.sampled else None,
                model_path=args.output_path,
                sample_path=args.corpus_path if args.sampled else None,
                window_size=args.window_size,
                negative_sample_count=args.window_size - 1,
                positive_sample_count=args.window_size - 1,
                sample_output_path=args.sample_output_path,
                batch_size=args.batch_size,
                epochs=args.epochs,
                pre_sequence=args.pre_seq)
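# Hedged sketch of the argument parser kor2vec_main() expects. The attribute names
# (corpus_path, sampled, output_path, window_size, sample_output_path, batch_size,
# epochs, pre_seq) come from the call above; the flag spellings and defaults below
# are assumptions, not the project's actual CLI.
import argparse

def get_kor2vec_args():
    parser = argparse.ArgumentParser(description='Train a Kor2Vec model')
    parser.add_argument('--corpus_path', type=str, required=True)
    parser.add_argument('--output_path', type=str, required=True)
    parser.add_argument('--sample_output_path', type=str, default=None)
    parser.add_argument('--sampled', action='store_true')
    parser.add_argument('--pre_seq', action='store_true')
    parser.add_argument('--window_size', type=int, default=5)
    parser.add_argument('--batch_size', type=int, default=256)
    parser.add_argument('--epochs', type=int, default=10)
    return parser.parse_args()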
def inference(input_path, model_name, output_path):
    # load kor2vec model
    word_emb = Kor2Vec.load('kor2vec03010231.checkpoint.ep0')
    loader = LOADER(word_emb)

    # load checkpoint model
    checkpoint_model = GRU_ATT()
    model = GRU_ATT_WRAP(checkpoint_model)
    checkpoint = torch.load(model_name)
    checkpoint_model.load_state_dict(checkpoint["model_state_dict"])
    epoch = checkpoint["epoch"]
    loss = checkpoint["loss"]
    print("Checkpoint => Epoch: " + str(epoch) + " Loss: " + str(round(loss, 6)))

    # load data
    with open(input_path, 'rt', encoding='utf8') as f:
        data = f.readlines()
    data = [item.strip().split('\t') for item in data]
    sample = np.array(data)[:, -1:]
    # embed each sample into a torch.Tensor
    sampleT = [[loader.tensorize.embedding(item[0])] for item in sample]
    # if a sequence is too long, trim it; otherwise, zero-pad it
    sampleT = [(loader.trim(item[0])) if item[0].size(0) > MAX_SEQ_LEN
               else (loader.zero_pad(item[0])) for item in sampleT]
    test = DataLoader(sampleT, batch_size=BATCH_SIZE)

    # test
    model.model.eval()
    pred = []
    for batch in test:
        out = model.model(batch)
        pred.append(out.max(dim=1)[1])

    # label names: 정의 (definition), 과정 (process), 성질 (property), 예 (example), 흥미유발 (interest arousal)
    relevant_cols = ['정의', '과정', '성질', '예', '흥미유발']
    label_map = dict(zip(range(5), relevant_cols))
    knowledge = []
    index = 0
    for i in range(len(pred)):
        for j in range(len(pred[i])):
            knowledge.append([])
            knowledge[index].append(label_map[int(pred[i][j])])
            index += 1

    # save
    knowledge_list = np.hstack((sample, knowledge))
    write(output_path, knowledge_list)
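# Minimal sketch of the write() helper called above, which is not shown in this
# snippet. It is assumed to dump each (text, predicted label) row as a
# tab-separated line; the real helper may differ.
def write(output_path, rows):
    with open(output_path, 'wt', encoding='utf8') as f:
        for row in rows:
            f.write('\t'.join(str(col) for col in row) + '\n')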
def build_using_kor2vec(model_path):
    from kor2vec import Kor2Vec
    import torch.nn as nn

    # load kor2vec model
    lm = Kor2Vec.load(model_path)
    emb_lst = []
    with open(FILE_NAME, 'rt', encoding='utf8') as f:
        for line in tqdm(f, total=NUM_WORD, desc='build using kor2vec'):
            emb_lst.append(lm.embedding(line.strip()).detach().squeeze(1).numpy())

    # save results
    now = datetime.datetime.now()
    np.savetxt('word_emb_{:02d}{:02d}{:02d}{:02d}.txt'.format(now.month, now.day, now.hour, now.minute),
               np.array(emb_lst).squeeze(1),
               fmt="%.5f")
def train_kor2vec(input_path, output_path='models/k2v.model', embedding_size=300, batch_size=128):
    '''
    :param input_path: text corpus
    :param output_path: file path to save the model
    :param embedding_size: size of the embedding table
    :param batch_size: batch size
    :return: the trained Kor2Vec model (also exported to output_path)
    '''
    k2v = Kor2Vec(embed_size=embedding_size)
    k2v.train_(input_path, batch_size=batch_size)  # takes some time
    k2v.save(path=output_path)  # saving embedding
    print('===== trained kor2vec model =====')
    print('===== saved as {} ====='.format(output_path))
    return k2v
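# Hedged usage example for train_kor2vec(); the corpus and model paths mirror the
# ones used elsewhere in this repo and are otherwise illustrative. Kor2Vec is
# assumed to be imported as in the surrounding code.
k2v = train_kor2vec('data/processed/all_text.txt',
                    output_path='source/models/k2v.model',
                    embedding_size=300,
                    batch_size=128)
reloaded = Kor2Vec.load('source/models/k2v.model')  # reload the exported model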
def main():
    # load kor2vec model
    word_emb = Kor2Vec.load('kor2vec03010231.checkpoint.ep0')
    rnn_clf = GRU_ATT()
    model = GRU_ATT_WRAP(rnn_clf)
    loader = LOADER(word_emb)
    early_stopping = EarlyStopping(verbose=True, delta=DELTA)

    for epoch in range(NUM_EPOCH):
        train_epoch(model, loader.test_raw, loader.train, loader.val, early_stopping)

    acc, loss_ = test(model, loader.test_raw, loader.test,
                      print_confusion_matrix=True, print_test_set=True)
    print('test - accuracy : {:.4f} | loss : {:.6f}'.format(acc, loss_))
# 5 epochs wiki, 10 epochs ours
files = [
    "./model.kor2vec.ep2",
    "./model.kor2vec.ep2.with_reviews",
    "./model.kor2vec.ep0",
    "./model.kor2vec.ep0.with_reviews",
    "./model.kor2vec.ep1",
    "./model.kor2vec.ep1.with_reviews",
    "./model.kor2vec.only_with_reviews",
    "./model.kor2vec.only_with_reviews_100"
]

"""
for file in files:
    kor2vec = Kor2Vec.load(file)
    score_sim = cal_sim_avg(kor2vec)
    score_dif = cal_diff_count_avg(kor2vec)
    print(f'score for {file}: sim={score_sim}, dif={score_dif}')
"""

kor2vec = Kor2Vec.load("./model.kor2vec.ep0.with_reviews")
get_vec(kor2vec, "first", "0withR")
kor2vec = Kor2Vec.load("./model.kor2vec.ep0.with_reviews")
get_vec(kor2vec, "average", "0withR")
kor2vec = Kor2Vec.load("./model.kor2vec.ep1")
get_vec(kor2vec, "first", "1")
kor2vec = Kor2Vec.load("./model.kor2vec.ep1")
get_vec(kor2vec, "average", "1")

"""
print('cos', cos_sim(kor2vec.embedding('맵다', numpy=True).squeeze(),
                     kor2vec.embedding('느끼하다', numpy=True).squeeze()))
a = kor2vec.embedding("웨이팅", numpy=True)
st = time.time()
b = kor2vec.embedding("매운", numpy=True)
print(norm(a[0]))
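# Minimal sketch of the cos_sim() helper assumed by the commented-out block above:
# cosine similarity between two embedding vectors via numpy. The real helper in
# this project may be defined differently.
import numpy as np
from numpy.linalg import norm

def cos_sim(a, b):
    return float(np.dot(a, b) / (norm(a) * norm(b)))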
def setUp(self):
    self.kor2vec = Kor2Vec()
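# Hedged example test (not part of the original suite): embedding a short Korean
# word with numpy=True is used elsewhere in this repo, so the result is expected
# to be array-like; the exact shape is an assumption and is not asserted here.
def test_embedding_is_array_like(self):
    vec = self.kor2vec.embedding('안녕', numpy=True)
    self.assertTrue(hasattr(vec, 'shape'))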
        new_words = processor.nouns(sentence)
        new_embedding_data.extend(new_words)
    return k2v


if __name__ == '__main__':
    args = get_arguments()

    ### test korean-to-vec model
    k2v = None

    #### train embedding model
    input_path = 'data/processed/all_text.txt'
    train_embedding_model = args.train_embedding_model
    if not train_embedding_model:
        try:
            k2v = Kor2Vec.load('source/models/k2v.model')
        except FileNotFoundError as e:
            raise FileNotFoundError(
                'Pretrained embedding model not found; check that the file exists, or train a new one.'
            ) from e
    else:
        assert input_path is not None, 'To train the embedding model, we need an input corpus'
        k2v = train_kor2vec(input_path,
                            output_path='source/models/k2v.model',
                            embedding_size=512,
                            batch_size=128)

    #### preprocess document text data
    folder = [
        'data/kaist_text_corpus/utility/health/',
        'data/kaist_text_corpus/literature/autobiography/',
def setKor2Vec(self, kor2vecFileName):
    self.kor2vec = Kor2Vec.load(kor2vecFileName)