Example #1
    'project_subject_subcategories': str,
    'project_title': str,
    'project_essay_1': str,
    'project_essay_2': str,
    'project_essay_3': str,
    'project_essay_4': str,
    'project_resource_summary': str,
    'teacher_number_of_previously_posted_projects': int,
    'project_is_approved': np.uint8,
}
# Read data and store in DataFrame.
train_data = pd.read_csv(train_file_path,
                         sep=',',
                         dtype=dtype,
                         low_memory=True).sample(10000)
essay1 = train_data['project_essay_1']
ids = train_data['id']

ess1_list = []
for index, row in train_data.iterrows():
    ess1_list.append(
        LabeledSentence(row['project_essay_1'].split(" "), [row['id']]))
# size is the dimensionality of the document vectors; window is the maximum distance between the current and predicted word
model = models.Doc2Vec(size=100, window=200, min_count=3, workers=1)
model.build_vocab(ess1_list)
model.train(ess1_list, total_examples=model.corpus_count, epochs=10)
model.save("ess1_model.doc2vec")
# model_loaded = models.Doc2Vec.load('ess1_model.doc2vec')
# print "the first vector is: "
# print model.docvecs[0]
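A short usage sketch (editor's addition, not part of the original snippet; it assumes the gensim 3.x API used above, where document vectors live in model.docvecs):

# Reload the saved model and query it by the project ids used as tags.
model_loaded = models.Doc2Vec.load("ess1_model.doc2vec")
first_id = ids.iloc[0]                                    # one of the tags seen in training
print(model_loaded.docvecs[first_id])                     # trained vector for that essay
new_vec = model_loaded.infer_vector("students need new science books".split(" "))
print(model_loaded.docvecs.most_similar([new_vec], topn=5))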
Example #2
        self.labels = labels

    def __iter__(self):
        for i, words in enumerate(self.words_list):
            yield models.doc2vec.LabeledSentence(words,
                                                 ['%s' % self.labels[i]])


# Attach a label to each document
morph_list, docs = set_folder_morph(corpus + folder)
sentences = LabeledListSentence(morph_list, docs)

# doc2vec training settings
# alpha: learning rate / min_count: ignore words that occur fewer than X times
# size: vector dimensionality / iter: number of iterations / workers: number of parallel workers
model = models.Doc2Vec(alpha=0.025, min_count=5, size=100, iter=20, workers=4)

# Preparation before doc2vec training (build the vocabulary)
model.build_vocab(sentences)

# You can also force word vectors trained on Wikipedia into the model and use them
# model.intersect_word2vec_format('./data/wiki/wiki2vec.bin', binary=True)

print("training...")
# Run training
model.train(sentences, total_examples=model.corpus_count, epochs=model.iter)

if not os.path.isdir("./model"):
    os.mkdir("./model")
if not os.path.isdir("./model/doc2vec"):
    os.mkdir("./model/doc2vec")
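A small continuation sketch (editor's addition): the nested isdir/mkdir checks above can be collapsed with os.makedirs, and the trained model would then be saved into that directory; the file name below is illustrative.

os.makedirs("./model/doc2vec", exist_ok=True)        # equivalent to the two isdir/mkdir checks
model.save("./model/doc2vec/doc2vec.model")          # illustrative file name
# model = models.Doc2Vec.load("./model/doc2vec/doc2vec.model")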
Example #3
pretrained_emb = "toy_data/sg.word2vec.300d"  # set to None to train without pretrained embeddings

#input corpus
train_corpus = "toy_data/train_docs.txt"

#output model
saved_path = "toy_data/model.bin"

#enable logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)

#train doc2vec model
docs = g.doc2vec.TaggedLineDocument(train_corpus)
model = g.Doc2Vec(docs,
                  size=vector_size,
                  window=window_size,
                  min_count=min_count,
                  sample=sampling_threshold,
                  workers=worker_count,
                  hs=0,
                  dm=dm,
                  negative=negative_size,
                  dbow_words=1,
                  dm_concat=1,
                  pretrained_emb=pretrained_emb,
                  iter=train_epoch)

#save model
model.save(saved_path)
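For reference (editor's addition): gensim's TaggedLineDocument treats each line of train_docs.txt as one whitespace-tokenized document and tags it with its line number, so document vectors can be looked up by position after training.

# toy_data/train_docs.txt is expected to hold one pre-tokenized document per line, e.g.
#   the quick brown fox
#   a slow green turtle
for tagged in g.doc2vec.TaggedLineDocument(train_corpus):
    print(tagged.tags, tagged.words[:5])             # tags == [line_number]
    break
# model.docvecs[0] is then the vector of the first line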
Example #4
    data = pd.read_csv('./labeledTrainData.tsv', sep='\t')
    stop_words = [" "]
    sentences = []
    for index in data.index:
        id = data.loc[index]['id']
        sentiment = data.loc[index]['sentiment']
        review = data.loc[index]['review']
        # Strip HTML tags
        review = re.sub(r'<.*?>', '', review)
        review_list = [
            w.strip('.,?!\\"\'') for w in review.split(' ')
            if w not in stop_words
        ]
        sentences.append(models.doc2vec.TaggedDocument(review_list, [id]))
    # Convert the reviews into document vectors
    doc2vec = models.Doc2Vec(sentences)

    # Hold out 30% as test data
    total = len(data.index)
    testIndexs = random.sample(range(total), int(total * 0.3))

    trainData = []
    trainLabel = []
    testData = []
    testLabel = []
    for index in range(total):
        id = data.loc[index]['id']
        if index in testIndexs:
            testData.append(doc2vec.docvecs[id])
            testLabel.append(data.loc[index]['sentiment'])
        else:
Example #5
        for sent in self.lists:
            self.j =self.j +1
            yield gensim.models.doc2vec.LabeledSentence(words= sent, tags=["sent_"+ str(self.j)])

path = "/home/raksha/FIRE-2016/CHIS_testSet/final_fire_test_data_xls/skincancer.xlsx"    
book = xlrd.open_workbook(path)
first_sheet = book.sheet_by_index(0)
print(first_sheet.nrows)
for i in range(1,first_sheet.nrows):
     #print first_sheet.row_values(i)
     cell = first_sheet.cell(i,0)
     sent= cell.value.split()
     sentences.append(sent)

it = LabeledLineSentence(sentences)    # contains all the appended documents for this category
doc2vecmodel = models.Doc2Vec(it,size = 200, window = 5, min_count = 0, dm = 0)

index2wordcollection = doc2vecmodel.index2word
env = lmdb.open('wikipedia-pubmed-and-PMC-w2v')
txn = env.begin(buffers=True)
wordvector=[]

for i in range(len(doc2vecmodel.syn0)):
    #pdb.set_trace()
    #if index2wordcollection[i].startswith("SENT_"):
    #    continue
    word = index2wordcollection[i]
   
    try:
        word = index2wordcollection[i]
        text = word.encode('UTF-8')
Example #6
    for idx, (doc, name) in enumerate(zip(docs, corpus)):
        sys.stdout.write('\r前処理中 {}/{}'.format(idx, len(corpus)))
        yield doc_to_sentence(doc, name)

corpus = corpus_files()
sentences = corpus_to_sentences(corpus)

if not os.path.exists(MODEL_DIR):
    os.mkdir(MODEL_DIR)

if isfile(PRE_TRAIN_MODEL_PATH):
    print('訓練済みモデルを使用します')
    model = models.Doc2Vec.load(PRE_TRAIN_MODEL_PATH)
else:
    model = models.Doc2Vec(dm=0, size=300, window=15, alpha=.025,
                           min_alpha=.025, min_count=1, sample=1e-6
                          )
    model.build_vocab(sentences)
    print('\n訓練開始')
    for epoch in range(20):
        print('Epoch: {}'.format(epoch + 1))
        model.train(sentences, total_examples=model.corpus_count, epochs=1)
        model.alpha -= (0.025 - 0.0001) / 19
        model.min_alpha = model.alpha
    model.save(PRE_TRAIN_MODEL_PATH)

predict_file = './text/livedoor-homme/livedoor-homme-5625149.txt'
print('類似度検索対象 : ' + predict_file)
predict_text = read_document(predict_file)
predict_results = model.docvecs.most_similar([model.infer_vector(split_into_words(predict_text))], topn=5)
for result in predict_results:
Example #7
        # From each line of the report, extract only verbs, adjectives, and nouns (excluding numeric nouns)
        if len(chunks) > 3 and (
                chunks[3].startswith('動詞') or chunks[3].startswith('形容詞') or
            (chunks[3].startswith('名詞') and not chunks[3].startswith('名詞-数'))):
            words.append(chunks[0])
    sentences.append(TaggedDocument(words=words, tags=doc.tags))

# Build the training model
#   size: dimensionality of the resulting vectors
#   alpha: learning rate
#   sample: frequency threshold for ignoring (downsampling) frequent words
#   min_count: minimum number of occurrences for a word to be used in training
#   workers: number of training threads
model = models.Doc2Vec(vector_size=400,
                       alpha=0.0015,
                       sample=1e-4,
                       min_count=3,
                       workers=4)

# Register the words with Doc2Vec
model.build_vocab(sentences)

# Set the number of samples and the threshold used to evaluate training
# Note: Python 3's round() uses round-half-to-even ("banker's rounding"), so both 1.5 and 2.5 round to 2
sample_num = int(round(len(sentences) * SAMPLE_PROPORTION, 0))
passing_thres = int(round(sample_num * PASSING_PRECISION, 0))
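# (Editor's illustration of the rounding note above) round-half-to-even means:
#   round(1.5) == 2, round(2.5) == 2, round(0.5) == 0
# If conventional half-up rounding is needed, the decimal module can be used instead:
#   from decimal import Decimal, ROUND_HALF_UP
#   int(Decimal("2.5").quantize(Decimal("1"), rounding=ROUND_HALF_UP))  # -> 3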

print('sample_num:' + str(sample_num))
print('passing_thres:' + str(passing_thres))

endFlg = False
Example #8
def train_doc_model(corpus, file):
    print("Training Doc2Vec model")
    model = models.Doc2Vec(corpus, size=100)
    model.save(file)
Example #9
    'it', 'doesn', 'an', 'as', 'itself', 'at', 'have', 'in', 'any', 'if',
    'again', 'no', 'when', 'same', 'how', 'other', 'which', 'yo', 'shan',
    'needn', 'haven', 'after', 'most', 'such', 'why', 'a', 'off', 'i', 'm',
    'yours', 'so', 'y', 'the', 'having', 'once'
]
sentences = []
for i in range(len(your_list)):
    words = your_list[i][6].lower().split()
    words = [word for word in words if word not in stops]
    sentence = models.doc2vec.LabeledSentence(
        words=words, tags=["SENT_" + str(your_list[i][0])])
    sentences.append(sentence)

model = models.Doc2Vec(size=300,
                       window=20,
                       alpha=.025,
                       min_alpha=.025,
                       min_count=2,
                       workers=10)
model.build_vocab(sentences)

for epoch in range(12):
    model.train(sentences,
                total_examples=model.corpus_count,
                epochs=model.iter)
    model.alpha -= 0.0018  # decrease the learning rate
    model.min_alpha = model.alpha  # fix the learning rate, no decay

print('DONE WITH TRAINING')
model.save('questiondupemodelA')
search_phrase = [
    'what', 'code', 'analysis', 'tools', 'do', 'you', 'use', 'on', 'your',
Example #10
        docs.append(l[:-1].split(' '))

sentences = []

for i, title in enumerate(titles):
    sentences.append(models.doc2vec.LabeledSentence(docs[i], [title]))

#print(sentences)

#model = models.Doc2Vec(dm=1, dm_mean=1, size=100, window=10, negative=5, alpha=.025, min_alpha=.025, min_count=0)
#model.build_vocab(sentences)
#for epoch in range(100):
#    model.train(sentences)
#    model.alpha -= 0.002  # decrease the learning rate`
#    model.min_alpha = model.alpha  # fix the learning rate, no decay

model = models.Doc2Vec(sentences,
                       dm=1,
                       dm_mean=1,
                       size=100,
                       window=2,
                       negative=5,
                       min_count=0)

model.save("my_model.doc2vec")
model_loaded = models.Doc2Vec.load('my_model.doc2vec')

#print(model.docvecs.most_similar(["SENT_1"]))
#print(model_loaded.docvecs.most_similar(["SENT_2"]))
print(model_loaded.docvecs.most_similar(["言語"]))
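A short query sketch (editor's addition; it assumes a gensim version that provides infer_vector): besides looking up an existing tag such as "言語", a new tokenized document can be embedded and compared against the trained tags.

new_doc = ["自然", "言語", "処理"]                        # illustrative tokens
new_vec = model_loaded.infer_vector(new_doc)
print(model_loaded.docvecs.most_similar([new_vec], topn=3))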
Example #11
    
    item_index = np.arange(0,len(images_df_up_using)*4)
    item_index = item_index%len(images_df_up_using)

    input_x = images_df_using.to_numpy()

    total_input = drop_none(total_input)
    # drop zero-length lines
    use_input = [(line,i_idx) for line,i_idx in zip(total_input,item_index) if len(line)!=0] 

    input_d2v_x = [line for line,idx in use_input]
    item_index = [idx for line,idx in use_input]

    using_att_idx=drop_none(images_df_using.to_numpy())   
   
    model = models.Doc2Vec(alpha=.025, min_alpha=.025, min_count=1)
    sentences = LabeledLineSentenceByAttribute(input_d2v_x,item_index,suffle=True)

    model.build_vocab(sentences)
    model = models.Doc2Vec(
        documents=sentences, 
        min_count=1, size=50,
        window=1,
        iter=30,
        workers=10,
        #callbacks=[callback()]
        )  

    model.save("doc2vec_using_itemsplitidx_ustyle9")
    #ver2 : extend each item to att 
    model_loaded = models.Doc2Vec.load('doc2vec_using_itemsplitidx_ustyle9')
Example #12
#_*_coding:utf-8_*_

from gensim import models,corpora
import jieba
import codecs
import logging
from langconv import *
#enable logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)


zhwiki = '/Users/yangyang/Desktop/NLP/data/zhwiki-latest-pages-articles.xml.bz2'
wiki = corpora.WikiCorpus(zhwiki,lemmatize=False,dictionary={})

'''
gensim LabeledSentence: train the (tokenized) text together with its tag to obtain document vectors
'''
class TaggedWikiDocument(object):
    def __init__(self, wiki):
        self.wiki = wiki
        self.wiki.metadata = True
    def __iter__(self):
        for content, (page_id, title) in self.wiki.get_texts():
            yield models.doc2vec.LabeledSentence(words=[w for c in content for w in jieba.cut(Converter('zh-hans').convert(c))], tags=[title])


documents = TaggedWikiDocument(wiki)
model = models.Doc2Vec(documents, dm=0, window=8, dbow_words=1, size=192, min_count=19, iter=5, workers=6)
model.save('./data/zhiwiki_news.doc2vec')
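A lookup sketch (editor's addition): TaggedWikiDocument tags every article with its title, so the saved model can be queried by title; the title below is a placeholder.

model_loaded = models.Doc2Vec.load('./data/zhiwiki_news.doc2vec')
print(model_loaded.docvecs.most_similar(['数学'], topn=10))   # '数学' is a placeholder title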
Example #13
        # print word.encode('utf-8')
        if (word not in stopwords
                and flag[0] in [u'n', u'f', u'a', u'z']):  # drop stop words and unwanted parts of speech, e.g. words that are not nouns/verbs
            result += word.encode('utf-8')  # +"/"+str(w.flag)+" "  # drop stop words
    return result


input = []
labels = []
uid = 1
for sentence in sentences:
    sentence = delNOTNeedWords(sentence, stopwords)
    input.append(jieba.lcut(sentence))
    labels.append('SENT_%s' % uid)
    uid = uid + 1

documents = [LabeledSentence(words=w, tags=[tag]) for w, tag in zip(input, labels)]

# bigram_transformer = models.Phrases(input)
model = models.Doc2Vec(documents,
                       size=feature_size,
                       window=content_window,
                       min_count=freq_min_count,
                       negative=negative,
                       iter=iter,
                       workers=multiprocessing.cpu_count())
# print model.index2word
model.save(save_filename)
f = model.most_similar([u'奥迪'])
for k in f:
    print(k[0], k[1])
Example #14

LabeledSentence = gensim.models.doc2vec.LabeledSentence
doc2vec_dir ="Data/doc2vec/not_trump"

token_type = "zub_"
#sentences = []
#with open(doc2vec_dir+token_type+"doc2vec_train_corpus.txt",'r')as corpfile:
#    sentences=[sent.split() for sent in corpfile.readlines()]


with open(doc2vec_dir+token_type+"id_text_dic.json",'r')as corpfile:
    sent_dic = json.load(corpfile)
sentences = [LabeledSentence(v.split(),[str(k)]) for k,v in sent_dic.items()]
#sentences = models.doc2vec.TaggedLineDocument(doc2vec_dir+token_type+"doc2vec_train_corpus.txt")#yelp_data_small(words="sent_doc2vec", labels="label_doc2vec")
model_zub = models.Doc2Vec(sentences, size=dims, window=8, min_count=0, workers=4)
dims = str(dims)
model_zub.save(doc2vec_dir+token_type+"rumorEval_doc2vec"+dims+".model")
model_zub.init_sims(replace=True)
model_zub.save(doc2vec_dir+token_type+"rumorEval_doc2vec_set"+dims+".model")

    

dims =int(dims)
token_type = "twit_"
sentences = []
with open(doc2vec_dir+token_type+"id_text_dic.json",'r')as corpfile:
    sent_dic = json.load(corpfile)
sentences = [LabeledSentence(v.split(),[str(k)]) for k,v in sent_dic.items()]
#sentences = models.doc2vec.TaggedLineDocument(doc2vec_dir+token_type+"doc2vec_train_corpus.txt")#yelp_data_small(words="sent_doc2vec", labels="label_doc2vec")
model_twit = models.Doc2Vec(sentences, size=dims, window=8, min_count=0, workers=4)
Example #15
        for train in trains:
            if train["html_url"].split("/")[-1] == comment["issue_url"].split(
                    "/")[-1]:
                train["body"] = f"{train['body']} {comment['body']}"

# add label
terms = [
    TaggedDocument(f"{train['title']} {train['body']}".split(), [str(i)])
    for i, train in enumerate(trains)
]

# model train
model = models.Doc2Vec(terms,
                       dm=0,
                       vector_size=100,
                       window=2,
                       min_count=0,
                       workers=4,
                       epochs=20)
# model.save('doc2vec_model')
model = Doc2Vec.load("doc2vec_model")

# output results
results = model.docvecs.most_similar(len(trains) - 1)

suggestions = []
for result in results:
    index = int(result[0])
    suggestion = {}
    suggestion["html_url"] = trains[index]["html_url"]
    suggestion["title"] = trains[index]["title"]
Example #16
            wakati_words.append(node.surface)
        elif hinshi in ["動詞", "形容詞"]:
            wakati_words.append(node.feature.split(",")[6])
        node = node.next
    return wakati_words


# Convert each work in the book list into the TaggedDocument format Doc2Vec can read and append it to a list --- (*5)
documents = []
# Loop over the book list
for auther, book in book_list():
    # Get the text of the work
    words = read_book(auther, book)
    # Tokenize the text of the work
    wakati_words = split_words(words)
    # Create the TaggedDocument: document = tokenized work, tag = author:title
    document = TaggedDocument(wakati_words,
                              [auther["name"] + ":" + book["name"]])
    documents.append(document)

# Build the Doc2Vec training model from the list of TaggedDocuments --- (*6)
model = models.Doc2Vec(documents,
                       dm=0,
                       vector_size=300,
                       window=15,
                       min_count=1)

# Save the trained Doc2Vec model
model.save('aozora.model')

print("モデル作成完了")
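A retrieval sketch (editor's addition): the tags built at (*5) have the form author:title, so similar works can be looked up by such a tag; the tag below is a placeholder.

model_loaded = models.Doc2Vec.load('aozora.model')
for tag, score in model_loaded.docvecs.most_similar(['芥川龍之介:羅生門'], topn=5):
    print(tag, score)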
Example #17
    def get_doc2vec_model(self, build_model):

        self.tagged_docs_pos_train, self.tagged_docs_neg_train = self.data_transformation.transform_sents(
            self.train_x, self.train_y, True)

        x = self.test_x if not self.is_real_test else self.test_x.values()
        self.tagged_docs_pos_test, self.tagged_docs_neg_test = self.data_transformation.transform_sents(
            x, self.test_y, False)

        if build_model or not os.path.exists(model_location +
                                             self.doc2vec_model_name):

            self.doc2vec_model = models.Doc2Vec(min_count=1,
                                                window=10,
                                                size=400,
                                                sample=1e-4,
                                                negative=5,
                                                workers=7)

            tagged_docs_train = self.tagged_docs_pos_train + self.tagged_docs_neg_train
            tagged_docs_test = self.tagged_docs_pos_test + self.tagged_docs_neg_test
            tagged_docs = tagged_docs_train + tagged_docs_test

            self.doc2vec_model.build_vocab(tagged_docs)

            shuffled = list(tagged_docs)
            random.shuffle(shuffled)
            self.doc2vec_model.train(
                shuffled,
                total_examples=self.doc2vec_model.corpus_count,
                epochs=self.iter)

            self.doc2vec_model.save(model_location + self.doc2vec_model_name)

        else:
            self.doc2vec_model = models.Doc2Vec.load(model_location +
                                                     self.doc2vec_model_name)

        train_arrays, train_labels = self.data_transformation.create_classifier_arrays(
            self.doc2vec_model, True, len(self.tagged_docs_pos_train),
            len(self.tagged_docs_neg_train))

        test_arrays, test_labels = self.data_transformation.create_classifier_arrays(
            self.doc2vec_model, False, len(self.tagged_docs_pos_test),
            len(self.tagged_docs_neg_test))

        clf = LogisticRegression(penalty='l2')
        clf = SVC()
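        # (Editor's note) the LogisticRegression created above is immediately replaced by
        # this SVC, so only the SVC is actually fitted and used for the predictions below.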
        clf.fit(train_arrays, train_labels)

        # C_range = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
        # gamma_range = [0.01, 0.02, 0.03, 0.04, 0.05, 0.10, 0.2, 0.3, 0.4, 0.5]
        # param_grid = dict(gamma=gamma_range, C=C_range)
        # cv = StratifiedShuffleSplit(n_splits=3, test_size=0.2, random_state=42)
        # clf = RandomizedSearchCV(SVC(), param_distributions=param_grid, cv=cv, n_iter=5)
        # clf.fit(train_arrays, train_labels)
        #
        # print(clf.best_params_)
        # print(clf.best_estimator_)
        # print(clf.best_score_)

        logging.info("Finished training classifier.")

        # Approach when not training doc2vec on test reviews
        # tvecs = []
        #
        # for i in range(len(self.test_x)):
        #     tdt = TaggedDocument(self.remove_stopwords(self.test_x[i]), ["test_" + str(i)])
        #     tvecs.append(self.doc2vec_model.infer_vector(tdt.words, steps=200))
        #
        # logging.info("Created TaggedDocuments for Training data.")
        # print(classifier.score(test_arrays, test_labels))

        if self.is_real_test:
            file_ids = self.test_x.keys()
            pred = clf.predict(test_arrays)
            self.data_transformation.write_to_file(dict(zip(file_ids, pred)),
                                                   "doc2vec")
        else:
            v = Visualization(test_labels, clf.predict(test_arrays),
                              "doc2vec - Logistik Regression")
            v.generate()
Example #18
def train_doc_model_manual(corpus, file):
    print("Training Doc2Vec model")
    model = models.Doc2Vec(dm=1, iter=5, alpha=0.1, min_alpha=0.025, size=100)
    model.build_vocab(corpus)
    model.train(corpus, total_examples=model.corpus_count, epochs=model.iter)
    model.save(file)
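A hypothetical usage sketch (editor's addition; the corpus and file name are made up, and model.iter implies the gensim 3.x API):

from gensim.models.doc2vec import TaggedDocument

toy_corpus = [
    TaggedDocument(words=text.split(), tags=[str(i)])
    for i, text in enumerate(["first toy document", "second toy document"])
]
train_doc_model_manual(toy_corpus, "toy_doc2vec.model")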
Example #19
for r in result:
    all_text.append(r[1].strip().split(" "))
for o in original:
    all_text.append(o[1].strip().split(" "))

count = 0

doc = []

sentences = []
for i in range(len(all_text)):
    string = "DOC_" + str(i)
    sentence = models.doc2vec.LabeledSentence(all_text[i], labels=[string])
    sentences.append(sentence)

d2v = models.Doc2Vec(sentences, size=100, window=5, min_count=0, dm=1)

#Doc2Vec train
for j in range(5):
    d2v.train(sentences)
features = []

for ii, term in enumerate(sentences):
    feature = []
    string = "DOC_" + str(ii)
    for term in d2v[string]:
        feature.append(term)
    features.append(feature)

candidate_text = features[:1000]
original_text = features[1000:]
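# (Editor's note) d2v[string] only resolves document labels on very old gensim releases;
# on gensim 1.0 and newer the document vector is read from d2v.docvecs["DOC_0"] instead,
# and d2v.train(...) additionally requires total_examples and epochs.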
Example #20
File: doc2vec.py  Project: matulma4/esc
from gensim import models
from gensim.models.doc2vec import TaggedLineDocument
from os import path
import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Train doc2vec.")
    parser.add_argument("fname", help="filename", type=str)
    parser.add_argument("model", help="modelname", type=str)
    args = parser.parse_args()
    sentences = TaggedLineDocument(args.fname)
    outname = args.model + ".doc2vec"
    if path.isfile(outname):
        model = models.Doc2Vec.load(outname)
    else:
        model = models.Doc2Vec(size=100, window=5, min_count=5, workers=4)
        model.build_vocab(sentences)
        model.train(sentences)
        model.save(outname)
    print("")
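A hypothetical invocation (editor's addition), based on the argparse definition above:

#   python doc2vec.py corpus.txt mymodel
# loads mymodel.doc2vec if it already exists, otherwise trains on corpus.txt and saves it.
# Note that model.train(sentences) without total_examples/epochs only works on older gensim.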
Example #21
#sentence3 = models.doc2vec.LabeledSentence(
#    words=[u'魚', u'泳ぐ', u'海'], tags=["SENT_3"])
#sentences = [sentence, sentence1, sentence2, sentence3]
#print sentences


class LabeledLineSentence(object):
    def __init__(self, filename):
        self.filename = filename

    def __iter__(self):
        for uid, line in enumerate(open(self.filename)):
            yield LabeledSentence(words=line.split(), labels=['SENT_%s' % uid])


model = models.Doc2Vec(alpha=.025, min_alpha=.025, min_count=1)
model.build_vocab(sentences)

for epoch in range(10):
    model.train(sentences)
    model.alpha -= 0.002  # decrease the learning rate
    model.min_alpha = model.alpha  # fix the learning rate, no decay

model.save(my_model)
#model_loaded = models.Doc2Vec.load(my_model)

# Show documents similar to a given document
#print ("SENT_0")
#print (model.docvecs.most_similar(["SENT_0"]) )
#print ("SENT_3")
#print (model.docvecs.most_similar(["SENT_3"]) )
Example #22
def calc_similarity(folder, doc1, doc2):

    # If the folder does not exist, print an error and exit
    if not os.path.isdir("%s/%s" % (folder, doc1)):
        print("Not exist " + doc1)
        quit()
    if not os.path.isdir("%s/%s" % (folder, doc2)):
        print("Not exist " + doc2)
        quit()

    # Define required variables
    directory = os.getcwd() + "/"
    # model_path = "model/doc2vec/"

    # Read the given folders under tourism
    spot = os.listdir("%s/%s" % (folder, doc1))
    spot = [sp for sp in spot if not sp == ".DS_Store"]
    tourist_spot = os.listdir("%s/%s" % (folder, doc2))
    tourist_spot = [tsp for tsp in tourist_spot if not tsp == ".DS_Store"]

    # Create a new tmp folder (reset it if it already exists)
    if os.path.isdir("%s/tmp" % folder):
        shutil.rmtree("%s/tmp" % folder)
        os.mkdir("%s/tmp" % folder)
    else:
        os.mkdir("%s/tmp" % folder)

    # Read the text files inside the subfolders of the given folders
    spot_list = []
    tourist_spot_list = []

    for sp in spot:
        tmp = os.listdir("%s/%s/%s" % (folder, doc1, sp))
        tmp = [fn for fn in tmp if fn[-4:] == ".txt"]
        tmp = [fn for fn in tmp if not fn[0] == "."]
        spot_list.append(tmp)

    for tsp in tourist_spot:
        tmp = os.listdir("%s/%s/%s" % (folder, doc2, tsp))
        tmp = [fn for fn in tmp if fn[-4:] == ".txt"]
        tmp = [fn for fn in tmp if not fn[0] == "."]
        tourist_spot_list.append(tmp)

    print("copying...")

    # Copy the text files inside the subfolders of the given folders
    # Also keep the first 100 characters of each file's contents
    # file_list = []
    text_list = []
    for sp, sp_l in zip(spot, spot_list):
        end = len(sp_l)
        flag = 0
        for tsp, tsp_l in zip(tourist_spot, tourist_spot_list):
            os.mkdir("%s/tmp/%s_%s" % (folder, sp, tsp))
            for s in sp_l:
                shutil.copy("%s/%s/%s/%s" % (folder, doc1, sp, s),
                            "%s/tmp/%s_%s/%s" % (folder, sp, tsp, s))
                if flag in range(0, end):
                    f = open("%s/%s/%s/%s" % (folder, doc1, sp, s),
                             encoding="utf-8")
                    text = f.read()
                    text_list.append(text[:100])
                    f.close()
                    flag += 1
            for t in tsp_l:
                shutil.copy("%s/%s/%s/%s" % (folder, doc2, tsp, t),
                            "%s/tmp/%s_%s/%s" % (folder, sp, tsp, t))

    print("training...")

    morph_list, docs = [], []
    sentences = []
    model = []

    # Attach labels and define the doc2vec models
    # alpha: learning rate / min_count: ignore words that occur fewer than X times
    # size: vector dimensionality / iter: number of iterations / workers: number of parallel workers
    # dm: 1 uses dmpv (PV-DM), otherwise DBoW is used
    # window: how many words before and after the target word Doc2Vec uses as input
    for sp in spot:
        for tsp in tourist_spot:
            ml, dc = set_folder_morph("%s/tmp/%s_%s" % (folder, sp, tsp))
            morph_list.append(ml)
            docs.append(dc)
            sentences.append(LabeledListSentence(ml, dc))
            model.append(
                models.Doc2Vec(alpha=0.025,
                               dm=1,
                               window=10,
                               min_count=0,
                               size=50,
                               iter=100,
                               workers=4))

    # Prepare the doc2vec models and run training
    index = 0
    for i in range(len(spot)):
        for j in range(len(tourist_spot)):
            model[index].build_vocab(sentences[index])
            model[index].train(sentences[index],
                               total_examples=model[index].corpus_count,
                               epochs=model[index].iter)
            index += 1

    if not os.path.isdir("./model"):
        os.mkdir("./model")
    if not os.path.isdir("./model/similarity"):
        os.mkdir("./model/similarity")

    # Save and reload the doc2vec models
    index = 0
    for sp in spot:
        for tsp in tourist_spot:
            model[index].save("./model/similarity/%s_%s.model" % (sp, tsp))
            model[index] = models.Doc2Vec.load(
                "./model/similarity/%s_%s.model" % (sp, tsp))
            index += 1

    print("calculating...")

    # Create a zero matrix to store the calculation results

    spot_len = 0
    file_name = []
    label = []

    index = 0
    for sp_l in spot_list:
        spot_len += len(sp_l)
        for fn in sp_l:
            file_name.append(fn)
            label.append(spot[index])
        index += 1

    DOC_SIM = np.zeros((spot_len, len(tourist_spot)))

    # Compute the similarities
    index = 0
    start = 0
    end = 0
    for i, sp in enumerate(spot):
        start = end
        end += len(spot_list[i])
        for j, tsp in enumerate(tourist_spot):
            os.chdir(directory + "%s/tmp/%s_%s" % (folder, sp, tsp))
            flag = start
            for f1 in spot_list[i]:
                for f2 in tourist_spot_list[j]:
                    DOC_SIM[flag, j] += model[index].docvecs.similarity(
                        d1=f1, d2=f2) / len(tourist_spot_list[j])
                flag += 1
            index += 1

    os.chdir(directory)

    if not os.path.isdir("./similarity"):
        os.mkdir("./similarity")

    # Prepare to output the calculation results
    DOC_SIM_DF = pd.DataFrame(np.c_[file_name, text_list, DOC_SIM])
    DOC_SIM_DF.index = label
    DOC_SIM_DF.columns = ["file_name", "text"] + tourist_spot

    for tsp in tourist_spot:
        DOC_SIM_DF[[tsp]] = DOC_SIM_DF[[tsp]].astype(float)

    # Output the similarity between the document sets
    # Values close to 1 mean similar; values close to 0 mean dissimilar
    DOC_SIM_DF.to_excel("./similarity/%s_%s.xlsx" % (doc1, doc2),
                        encoding="shift-jis")

    print("Done.")
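A hypothetical call sketch (editor's addition; the folder names are placeholders):

# calc_similarity("tourism", "spot_reviews", "tourist_spots")
# -> writes ./similarity/spot_reviews_tourist_spots.xlsx, where values near 1 mean the
#    document sets are similar and values near 0 mean they are not.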
Example #23
 def similarity(self, label, items):
     if not self.model:
         self.model = models.Doc2Vec(self._gen_docs(self.docs),
                                     min_count=20,
                                     workers=4)
     return self.model.most_similar(label, topn=items)
Example #24
        # From each line of the report, extract only verbs, adjectives, and nouns (excluding numeric nouns)
        if len(chunks) > 3 and (
                chunks[3].startswith('動詞') or chunks[3].startswith('形容詞') or
            (chunks[3].startswith('名詞') and not chunks[3].startswith('名詞-数'))):
            words.append(chunks[0])
    sentences.append(TaggedDocument(words=words, tags=doc.tags))

# Build the training model
#   size: dimensionality of the resulting vectors
#   alpha: learning rate
#   sample: frequency threshold for ignoring (downsampling) frequent words
#   min_count: minimum number of occurrences for a word to be used in training
#   workers: number of training threads
model = models.Doc2Vec(size=400,
                       alpha=0.0015,
                       sample=1e-4,
                       min_count=1,
                       workers=4)

# Register the words with Doc2Vec
model.build_vocab(sentences)

# Set the number of samples and the threshold used to evaluate training
# Note: Python 3's round() uses round-half-to-even ("banker's rounding"), so both 1.5 and 2.5 round to 2
sample_num = int(round(len(sentences) * SAMPLE_PROPORTION, 0))
passing_thres = int(round(sample_num * PASSING_PRECISION, 0))

for x in range(TRAIN_MAX):
    print(x)
    # Run training
    model.train(sentences,
Example #25
def main():
    parser = argparse.ArgumentParser(description='Doc2vec driver.')
    parser.add_argument('mode', choices=['train', 'retrieve', 'infer'],
                        help='Training, retrieve trained embeddings, or inference mode')
    parser.add_argument('name', type=str, help='Model name')
    parser.add_argument('output_path', type=str, help='Output path')
    parser.add_argument('word_embeddings_path', type=str,
                        help='Pre-trained word embeddings path')
    parser.add_argument('tokenized_path', type=str,
                        help='Directory with tokenized plain text documents')
    args = parser.parse_args()


    if None in [args.mode, args.name, args.output_path]:
        exit('Arguments mode, name, output_path are required')
    if args.mode == 'train' and (args.word_embeddings_path is None or args.tokenized_path is None):
        exit('word_embeddings_path and tokenized_path arguments are required if mode is set to train')
    if args.mode == 'infer' and args.tokenized_path is None:
        exit('tokenized_path argument is required if mode is set to infer')
    if args.mode == 'train':
        print("Entering training mode")
        pretrained_emb = args.word_embeddings_path
        tokenized_path = args.tokenized_path
        texts = []
        for filename in os.listdir(tokenized_path):
            with codecs.open(os.path.join(tokenized_path, filename), 'r', 'utf-8') as f:
                doc_tokens = []
                for line in f.readlines():
                    if len(line) > 0:
                        doc_tokens += line.split()
                texts.append(doc_tokens)
                #texts.append([line.split() for line in f.readlines()])

        print(len(texts))
        #doc2vec parameters
        vector_size = 300
        window_size = 15
        min_count = 1
        sampling_threshold = 1e-5
        negative_size = 5
        train_epoch = 5
        dm = 0 #0 = dbow; 1 = dmpv
        worker_count = 6 #number of parallel processes
        saved_path = os.path.join(args.output_path, 'models', args.name + '.bin')

        #enable logging
        logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
        mkdir_p(os.path.join(args.output_path, 'models'))
        print("created models directory")
        docs = [g.doc2vec.TaggedDocument(doc, [i]) for i, doc in enumerate(texts)]
        print("Let's start training")
        model = g.Doc2Vec(docs, size=vector_size, window=window_size, min_count=min_count, sample=sampling_threshold,
                          workers=worker_count, hs=0, dm=dm, negative=negative_size, dbow_words=1, dm_concat=1,
                          pretrained_emb=pretrained_emb, iter=train_epoch)
        print("Trained doc2vec")
        model.save(saved_path)
    elif args.mode == 'retrieve':
        saved_path = os.path.join(args.output_path, 'models', args.name + '.bin')
        model = g.Doc2Vec.load(saved_path)
        vectors = []
        for i in range(len(model.docvecs)):
            vectors.append(model.docvecs[i])
        vectors = np.array(vectors)
        np.save(os.path.join(args.output_path, args.name + '_vectors'), vectors)
            # print model.docvecs[i]
    else: # infer
        # inference hyper - parameters
        start_alpha = 0.01
        infer_epoch = 1000

        # load model
        m = g.Doc2Vec.load(os.path.join(args.output_path, 'models', args.name + '.bin'))
        tokenized_path = args.tokenized_path
        texts = []
        for filename in os.listdir(tokenized_path):
            with codecs.open(os.path.join(tokenized_path, filename), 'r', 'utf-8') as f:
                texts.append([line.split() for line in f.readlines()])
        test_docs = texts
        # infer test vectors
        for d in test_docs:
            print(' '.join([str(x) for x in m.infer_vector(d, alpha=start_alpha, steps=infer_epoch)]) + '\n')
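Hypothetical command lines (editor's addition; the script name is a placeholder) matching the positional arguments defined above:

#   python doc2vec_driver.py train    mymodel ./out ./emb/sg.word2vec.300d ./tokenized
#   python doc2vec_driver.py retrieve mymodel ./out ./emb/sg.word2vec.300d ./tokenized
#   python doc2vec_driver.py infer    mymodel ./out ./emb/sg.word2vec.300d ./tokenized
# All four paths are positional, so they must be supplied even in retrieve/infer mode;
# retrieve writes NAME_vectors.npy under output_path, and infer prints one vector per document.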
Example #26
File: d2vg.py  Project: matulma4/esc
          (len(doc_list), len(train_docs), len(test_docs)))

    name = sys.argv[1]
    dist_mem = int(sys.argv[2])
    hier_soft = int(sys.argv[3])
    neg = int(sys.argv[4])
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    fname = str(name) + ".doc2vec"
    if path.isfile(fname):
        model = models.Doc2Vec.load(fname)
    else:
        model = models.Doc2Vec(size=100,
                               window=5,
                               min_count=5,
                               workers=4,
                               negative=neg,
                               hs=hier_soft,
                               dm=dist_mem)
        model.save(fname)

    model.build_vocab(alldocs)
    model.train(alldocs)
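    # (Editor's note) the save in the else-branch above stores an untrained model; after
    # build_vocab/train the trained model is never written back, so model.save(fname) would
    # be needed again here. model.train(alldocs) without total_examples/epochs also only
    # works on older gensim releases.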

    doc_id = 24  # np.random.randint(model.docvecs.count)  # pick random doc, re-run cell for more examples
    # model = np.random.choice(model)  # and a random model
    sims = model.docvecs.most_similar(
        doc_id, topn=model.docvecs.count)  # get *all* similar documents
    f = open(name + '.out', 'w')
    f.write(u'TARGET (%d): <%s>\n' % (doc_id, ' '.join(alldocs[doc_id].words)))
    f.write(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)
Example #27
    originals.append(cols[0])
    sentences.append(
        models.doc2vec.TaggedDocument(gensim.utils.simple_preprocess(
            mecab.parse(cols[0]).strip(), min_len=1),
                                      tags=["SENT_" + str(j)]))
    j += 1

stop_words = []
if args.stop_words:
    for line in open(args.stop_words, "r", encoding="utf-8"):
        stop_words.append(line.strip())

vectorizer = TfidfVectorizer(token_pattern="(?u)\\b\\w+\\b",
                             stop_words=stop_words)

model = models.Doc2Vec(vector_size=400, window=5, min_count=5, epochs=100)

model.build_vocab(sentences)
"""
print('\n訓練開始')
for epoch in range(51):
    print('Epoch: {}'.format(epoch + 1))
    model.train(sentences, epochs=model.epochs, total_examples=model.corpus_count)
    if epoch%5==0:
        model_str="jamQ_model400_doc2vec_"+str(epoch)
        model.save(model_str)
"""
model_str = "jamQ_model400_doc2vec_50"
model = models.Doc2Vec.load(model_str)

doc_vecs = []