-
Notifications
You must be signed in to change notification settings - Fork 0
/
word_embedding_test.py
35 lines (29 loc) · 933 Bytes
/
word_embedding_test.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Train a Korean word2vec model on the konlpy ``kobill`` corpus.

Reads every document in the bundled kobill corpus, POS-tags it with the
konlpy ``Twitter`` tagger, trains a gensim word2vec model on the tagged
tokens, saves the model to ``ko_word2vec_e.model``, and prints the terms
most similar to '서울대학교'.
"""
import sys
import time

# Python 2 only: force the process default encoding to utf-8 so the
# Korean corpus text round-trips through implicit str<->unicode
# conversions.  Python 3 removed both the `reload` builtin and
# `sys.setdefaultencoding`, and defaults to utf-8 anyway, so the hack
# must be skipped there (the original unguarded calls crash on py3).
if sys.version_info[0] < 3:
    reload(sys)  # noqa: F821 -- `reload` is a builtin on Python 2
    sys.setdefaultencoding('utf-8')

print ("load")
#load from kobill
from konlpy.corpus import kobill
# One raw-text string per document in the corpus.
docs_ko = [kobill.open(fid).read() for fid in kobill.fileids()]

print ("tokenize")
#tokenize
from konlpy.tag import Twitter
t = Twitter()
print ("tokenize1")
# Tag a document and render each (token, POS) pair as 'token/POS' so
# that word2vec treats e.g. 노력/Noun as a single vocabulary item;
# stem/norm collapse inflected and noisy surface forms.
pos = lambda d: ['/'.join(p) for p in t.pos(d, stem=True, norm=True)]
print ('tokenize2')
texts_ko = [pos(doc) for doc in docs_ko]

print ("train")
#train
from gensim.models import word2vec

now_time = time.time()
wv_model_ko = word2vec.Word2Vec(
    texts_ko, workers=16, negative=10, window=7, size=300)
# Pre-normalize vectors for similarity queries; replace=True discards
# the raw vectors to save memory (the model cannot be trained further).
wv_model_ko.init_sims(replace=True)
wv_model_ko.save('ko_word2vec_e.model')
print ("training time "+str(time.time()-now_time)+"sec")

# Query: tokens of '서울대학교' -> most similar vocabulary entries.
print (wv_model_ko.most_similar(pos('서울대학교')))