Example #1
import os

import numpy as np
from kor2vec import Kor2Vec
from tqdm import tqdm


def load_model():
    # Load the most recently saved model; fall back to a fresh one.
    cp_list = [item for item in os.listdir('.') if item.endswith('.ep0')]
    if len(cp_list) == 0:
        print('no saved model, initializing...')
        return Kor2Vec(embed_size=128)
    cp_list.sort()  # timestamped checkpoint names sort chronologically
    cp_path = cp_list[-1]
    print('Loading model {}...'.format(cp_path))
    return Kor2Vec.load(cp_path)
def _process_text_data(path='data/processed/ratings_train.txt'):
    '''
    Embed text into sequences using the pretrained kor2vec model.
    Sequences are saved to data/processed/text_squence_.npy and labels
    to data/processed/text_label_.npy; these files are later used to
    extract embedding weights for the embedding layer of a Keras model.
    :param path: path to the tab-separated text (and label) file
    :return: sequences and their corresponding labels
    ### this might take time
    '''
    texts = []
    labels = []
    k2v = Kor2Vec.load('source/models/k2v.model')
    with open(path, 'r', encoding='utf-8') as f:
        for line in tqdm(f):
            line_ = line.replace('\n', '')
            text_id, text, text_label = line_.split('\t')
            labels.append(text_label)
            texts.append(text)
    text_seqs = k2v.embedding(texts, seq_len=1, numpy=True)
    label_seqs = np.array(labels, dtype=np.int64)  # np.int was removed in NumPy 1.24
    np.save('data/processed/text_label_.npy',
            label_seqs,
            allow_pickle=False)
    np.save('data/processed/text_squence_.npy',
            text_seqs.astype(np.float32),
            allow_pickle=False)
    return text_seqs, label_seqs
def loader():
    word_emb = Kor2Vec.load('kor2vec03010231.checkpoint.ep0')
    ld = LOADER(word_emb)  # LOADER is this project's data-loading wrapper
    for batch in ld.train:  # peek at the first training batch
        print(batch[0], batch[1])
        break
    for batch in ld.train:  # iterate again; a shuffling loader yields a different first batch
        print(batch[0], batch[1])
        break
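The docstring of _process_text_data() says the saved .npy files are later fed to the embedding layer of a Keras model. Below is a minimal sketch of that downstream step, assuming the arrays have the usual (N, seq_len, embed_size) shape produced by k2v.embedding(..., numpy=True) and the binary labels of the ratings corpus; the architecture is illustrative, not part of this example.

import numpy as np
from tensorflow import keras

seqs = np.load('data/processed/text_squence_.npy')   # assumed shape (N, seq_len, embed_size)
labels = np.load('data/processed/text_label_.npy')   # assumed shape (N,), values 0/1

model = keras.Sequential([
    keras.Input(shape=seqs.shape[1:]),
    keras.layers.GlobalAveragePooling1D(),           # pool over the sequence axis
    keras.layers.Dense(1, activation='sigmoid'),     # binary sentiment head
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(seqs, labels, batch_size=128, epochs=1)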
Example #4
def kor2vec_main():
    # Define Model and bind to NSML
    model = Kor2Vec()

    # training model
    model.train(corpus_path=args.corpus_path if not args.sampled else None,
                model_path=args.output_path,
                sample_path=args.corpus_path if args.sampled else None,
                window_size=args.window_size,
                negative_sample_count=args.window_size - 1,
                positive_sample_count=args.window_size - 1,
                sample_output_path=args.sample_output_path,
                batch_size=args.batch_size,
                epochs=args.epochs,
                pre_sequence=args.pre_seq)
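kor2vec_main() reads everything from a global args object; the sketch below shows an argparse definition it implies. The attribute names mirror the ones used above, but the defaults and help texts are assumptions.

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--corpus_path', type=str, required=True,
                    help='raw corpus, or a pre-sampled file when --sampled is set')
parser.add_argument('--output_path', type=str, default='model.kor2vec')
parser.add_argument('--sampled', action='store_true')
parser.add_argument('--sample_output_path', type=str, default=None)
parser.add_argument('--window_size', type=int, default=5)
parser.add_argument('--batch_size', type=int, default=128)
parser.add_argument('--epochs', type=int, default=10)
parser.add_argument('--pre_seq', action='store_true')
args = parser.parse_args()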
Example #5
def inference(input_path, model_name, output_path):
    # kor2vec model load
    word_emb = Kor2Vec.load('kor2vec03010231.checkpoint.ep0')
    loader = LOADER(word_emb)

    # Checkpoint model load
    checkpoint_model = GRU_ATT()
    model = GRU_ATT_WRAP(checkpoint_model)

    checkpoint = torch.load(model_name)
    checkpoint_model.load_state_dict(checkpoint["model_state_dict"])
    epoch = checkpoint["epoch"]
    loss = checkpoint["loss"]
    print("Checkpoint => Epoch: {}  Loss: {:.6f}".format(epoch, loss))

    # data load
    with open(input_path, 'rt', encoding='utf8') as f:
        data = f.readlines()
    data = [item.strip().split('\t') for item in data]
    sample = np.array(data)[:, -1:]  # keep the last column (the text)
    sampleT = [[loader.tensorize.embedding(item[0])] for item in sample]  # embed each text into a torch tensor
    sampleT = [loader.trim(item[0]) if item[0].size(0) > MAX_SEQ_LEN else loader.zero_pad(item[0])
               for item in sampleT]  # trim over-long sequences, zero-pad the rest
    test = DataLoader(sampleT, batch_size=BATCH_SIZE)

    # Test (no gradients needed at inference time)
    model.model.eval()
    pred = []
    with torch.no_grad():
        for batch in test:
            out = model.model(batch)
            pred.append(out.max(dim=1)[1])  # argmax over the class dimension

    # label names: definition, process, property, example, interest-arousal
    relevant_cols = ['정의', '과정', '성질', '예', '흥미유발']
    label_map = dict(zip(range(5), relevant_cols))
    knowledge = []
    for batch_pred in pred:
        for p in batch_pred:
            knowledge.append([label_map[int(p)]])

    # Save
    knowledge_list = np.hstack((sample, knowledge))
    write(output_path, knowledge_list)
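write() is not defined in this snippet; below is a minimal stand-in consistent with how it is called above. The tab-separated output format is an assumption about the missing helper.

def write(output_path, rows):
    # write each [text, label] row as one tab-separated line (assumed format)
    with open(output_path, 'wt', encoding='utf8') as f:
        for row in rows:
            f.write('\t'.join(map(str, row)) + '\n')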
Example #6
def build_using_kor2vec(model_path):
    from kor2vec import Kor2Vec

    # load the trained kor2vec model
    lm = Kor2Vec.load(model_path)
    emb_lst = []
    with open(FILE_NAME, 'rt', encoding='utf8') as f:
        for line in tqdm(f, total=NUM_WORD, desc='build using kor2vec'):
            emb_lst.append(
                lm.embedding(line.strip()).detach().squeeze(1).numpy())

    # save results
    now = datetime.datetime.now()
    np.savetxt('word_emb_{:02d}{:02d}{:02d}{:02d}.txt'.format(
        now.month, now.day, now.hour, now.minute),
               np.array(emb_lst).squeeze(1),
               fmt="%.5f")
Example #7
def train_kor2vec(input_path,
                  output_path='models/k2v.model',
                  embedding_size=300,
                  batch_size=128):
    '''
    :param input_path: path to the text corpus
    :param output_path: file path to save the model
    :param embedding_size: dimensionality of the embedding vectors
    :param batch_size: batch size
    :return: the trained Kor2Vec model (also exported to output_path)
    '''

    k2v = Kor2Vec(embed_size=embedding_size)
    k2v.train_(input_path, batch_size=batch_size)  # takes some time
    k2v.save(path=output_path)  # saving embedding
    print('===== trained kor2vec model =====')
    print('===== exported to {} ====='.format(output_path))
    return k2v
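Example #11 below calls this function with a 512-dimensional embedding; the same invocation shown standalone:

k2v = train_kor2vec('data/processed/all_text.txt',
                    output_path='source/models/k2v.model',
                    embedding_size=512,
                    batch_size=128)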
Example #8
def main():

    # kor2vec model load
    word_emb = Kor2Vec.load('kor2vec03010231.checkpoint.ep0')
    rnn_clf = GRU_ATT()
    model = GRU_ATT_WRAP(rnn_clf)
    loader = LOADER(word_emb)

    early_stopping = EarlyStopping(verbose=True, delta=DELTA)

    for epoch in range(NUM_EPOCH):
        train_epoch(model, loader.test_raw, loader.train, loader.val,
                    early_stopping)

    acc, loss_ = test(model,
                      loader.test_raw,
                      loader.test,
                      print_confusion_matrix=True,
                      print_test_set=True)
    print('test - accuracy : {:.4f} | loss : {:.6f}'.format(acc, loss_))
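Note that the loop above creates an EarlyStopping object but never breaks out of the epoch loop itself. If train_epoch() does not stop training internally, the usual pattern looks like the sketch below (the early_stop flag is an assumption about this project's EarlyStopping class, borrowed from the common PyTorch recipe):

for epoch in range(NUM_EPOCH):
    train_epoch(model, loader.test_raw, loader.train, loader.val,
                early_stopping)
    if early_stopping.early_stop:  # flag name is an assumption
        print('early stopping at epoch {}'.format(epoch))
        break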
Example #9
# 5 epochs wiki, 10 epochs ours
files = [
    "./model.kor2vec.ep2", "./model.kor2vec.ep2.with_reviews",
    "./model.kor2vec.ep0", "./model.kor2vec.ep0.with_reviews",
    "./model.kor2vec.ep1", "./model.kor2vec.ep1.with_reviews",
    "./model.kor2vec.only_with_reviews",
    "./model.kor2vec.only_with_reviews_100"
]
"""
for file in files:
    kor2vec = Kor2Vec.load(file)
    score_sim = cal_sim_avg(kor2vec)
    score_dif = cal_diff_count_avg(kor2vec)
    print(f'{file} score: sim={score_sim}, dif={score_dif}')
"""
kor2vec = Kor2Vec.load("./model.kor2vec.ep0.with_reviews")
get_vec(kor2vec, "first", "0withR")
kor2vec = Kor2Vec.load("./model.kor2vec.ep0.with_reviews")
get_vec(kor2vec, "average", "0withR")
kor2vec = Kor2Vec.load("./model.kor2vec.ep1")
get_vec(kor2vec, "first", "1")
kor2vec = Kor2Vec.load("./model.kor2vec.ep1")
get_vec(kor2vec, "average", "1")
"""
    print('cos',cos_sim(kor2vec.embedding('맵다',numpy=True).squeeze(), \
                                       kor2vec.embedding('느끼하다',numpy=True).squeeze()))

a=kor2vec.embedding("웨이팅",numpy=True)
st = time.time()
b =kor2vec.embedding("매운", numpy=True)
print(norm(a[0]))
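cos_sim, norm, and time are not imported in this snippet; below are minimal imports and a stand-in consistent with how they are used above.

import time

import numpy as np
from numpy.linalg import norm  # used by print(norm(a[0])) above


def cos_sim(a, b):
    # cosine similarity between two 1-D vectors
    return float(np.dot(a, b) / (norm(a) * norm(b)))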
Example #10
def setUp(self):
    self.kor2vec = Kor2Vec()
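This fragment belongs inside a unittest.TestCase; below is a self-contained sketch of the surrounding class (the test method is hypothetical):

import unittest

from kor2vec import Kor2Vec


class Kor2VecTest(unittest.TestCase):
    def setUp(self):
        self.kor2vec = Kor2Vec()

    def test_embedding_returns_array(self):  # hypothetical test
        vec = self.kor2vec.embedding('안녕', numpy=True)
        self.assertIsNotNone(vec)


if __name__ == '__main__':
    unittest.main()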
Example #11
            new_words = processor.nouns(sentence)
            new_embedding_data.extend(new_words)

    return k2v


if __name__ == '__main__':
    args = get_arguments()
    ### test the Korean-to-vector model
    k2v = None
    #### train the embedding model
    input_path = 'data/processed/all_text.txt'
    train_embedding_model = args.train_embedding_model
    if not train_embedding_model:
        try:
            k2v = Kor2Vec.load('source/models/k2v.model')
        except FileNotFoundError as e:
            # raising a plain string is a TypeError; raise a proper exception
            raise FileNotFoundError(
                'Pretrained embedding model not found; check that the file exists, or train a new one.'
            ) from e
    else:
        assert input_path is not None, 'To train the embedding model, we need an input corpus'
        k2v = train_kor2vec(input_path,  # train_kor2vec returns the trained model (Example #7)
                            output_path='source/models/k2v.model',
                            embedding_size=512,
                            batch_size=128)

    #### preprocessing document text data
    folder = [
        'data/kaist_text_corpus/utility/health/',
        'data/kaist_text_corpus/literature/autobiography/',
Example #12
def setKor2Vec(self, kor2vecFileName):
    self.kor2vec = Kor2Vec.load(kor2vecFileName)
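As with Example #10, this is a method of a larger class; below is a self-contained sketch with a hypothetical owner class.

from kor2vec import Kor2Vec


class SentenceClassifier:  # hypothetical owner class for the method above
    def setKor2Vec(self, kor2vecFileName):
        self.kor2vec = Kor2Vec.load(kor2vecFileName)


clf = SentenceClassifier()
clf.setKor2Vec('source/models/k2v.model')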