Example #1
def wordVector(doc):
    sentences = doc2vec.TaggedLineDocument(doc)
    model = doc2vec.Doc2Vec(vector_size=100, window=300, min_count=5, workers=10)  # `size` in gensim < 4
    model.build_vocab(sentences)
    model.train(sentences, total_examples=model.corpus_count, epochs=model.epochs)

    return model
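
Example #1 expects `doc` to be the path to a file holding one pre-tokenized document per line, which is what TaggedLineDocument reads; each line gets its line number as its tag. A minimal end-to-end sketch, with an illustrative file name and toy corpus, written against gensim >= 4:

from gensim.models import doc2vec

# Hypothetical corpus file: one whitespace-tokenized document per line.
with open("corpus.txt", "w", encoding="utf-8") as f:
    f.write("the cat sat on the mat\n")
    f.write("dogs chase cats in the yard\n")

sentences = doc2vec.TaggedLineDocument("corpus.txt")   # tags are the line numbers 0, 1, ...
model = doc2vec.Doc2Vec(sentences, vector_size=50, window=5, min_count=1, epochs=20)

print(model.dv[0])                                     # vector of the first line (model.docvecs in gensim < 4)
print(model.infer_vector("a dog on a mat".split()))    # vector for unseen, tokenized text
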
Example #2
def train_doc2vec(model_dir, train_file, train_params):
    import errno
    import logging
    import multiprocessing
    import os

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)

    documents = doc2vec.TaggedLineDocument(train_file)

    size = train_params["size"]
    window = train_params["window"]
    min_count = train_params["min_count"]
    workers = multiprocessing.cpu_count() // 2
    epochs = train_params["epochs"]
    alpha = train_params["alpha"]
    min_alpha = train_params["min_alpha"]

    model = doc2vec.Doc2Vec(documents,
                            vector_size=size,
                            window=window,
                            min_count=min_count,
                            workers=workers,
                            epochs=epochs,
                            alpha=alpha,
                            min_alpha=min_alpha)
    if not os.path.exists(os.path.dirname(model_dir)):
        try:
            os.makedirs(os.path.dirname(model_dir))
        except OSError as exc:
            if exc.errno != errno.EEXIST:
                raise
    model.save(model_dir)
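
Example #2 only trains and saves the model; a minimal sketch of loading it back and querying it afterwards (the path is a placeholder for whatever was passed as model_dir, and gensim >= 4 naming is assumed):

from gensim.models import doc2vec

model = doc2vec.Doc2Vec.load("models/doc2vec.model")            # hypothetical model_dir value
vector = model.infer_vector("some new tokenized text".split())  # embed unseen text
print(model.dv.most_similar([vector], topn=5))                  # closest training lines (model.docvecs in gensim < 4)
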
def neighborhood_embedding(args):
    inputDir = args.preprocessed_input
    # outputFile = args.output
    iterations = args.iter
    dimensions = args.d
    window = args.windowSize
    dm = 1 if args.model == 'dm' else 0
    indexToName = generateWalkFile(inputDir, args.walkLength)
    # print(indexToName)
    sentences = doc.TaggedLineDocument(inputDir+'.walk')

    with open('log', 'a+') as f:
        results = []
        # for epochs in range(10, 110, 10):
            # print('epochs', epochs)
        model = doc.Doc2Vec(sentences, vector_size=dimensions, epochs=iterations, dm=dm, window=window)
        vectors = model.docvecs
        embeddings = [[] for _ in range(len(vectors))]
        for i in range(len(vectors)):
            embeddings[int(indexToName[i])] = vectors[i]
        
        from preprocess import evaluate
        res = evaluate(args.input, embeddings)
        results.append(str(res))
        print(res)

        f.write(inputDir + ',n,' + ','.join(results) + '\n')
Example #4
def writeDoc2vecSimMatrix(outfile, allTweets, results, create):
    if create:
        outfile1 = os.path.dirname(outfile) + "/Doc2vecModelTokens.txt"
        pos_tweets = tagger.runtagger_parse(
            allTweets)  #tokenizer and POS-tagger
        tokens = makeDoc2vecFile(pos_tweets, outfile1, False)
        sentence = doc2vec.TaggedLineDocument(
            outfile1)  #Imports in doc2vec format
        model = doc2vec.Doc2Vec(sentence,
                                size=100,
                                window=300,
                                min_count=10,
                                workers=4)  #makes doc2vec model
        model_name = os.path.dirname(outfile) + "/Doc2vecModel.txt"
        model.save(model_name)
    else:
        model_name = os.path.dirname(outfile) + "/Doc2vecModel.txt"
        model = doc2vec.Doc2Vec.load(model_name)
    for i in range(0, len(allTweets)):
        x = []
        for result in results:
            k = allTweets.index(result)
            x.append(str(model.docvecs.similarity(i, k)))
        with open(outfile, "a+") as f:
            writeFile = csv.writer(f)
            writeFile.writerow(x)
Example #5
def structural_embedding(input_dir,
                         iterations=20,
                         dimensions=128,
                         windowSize=2,
                         dm=1,
                         walkLength=64):
    # index_to_name = generate_walk_file(input_dir, walkLength, 0.5)
    walk_dir_path = input_dir.replace('sub_graphs', 'walks')
    walk_dir_path = (os.path.join(walk_dir_path, 'walk_file.walk'))

    sentences = doc.TaggedLineDocument(walk_dir_path)

    model = doc.Doc2Vec(vector_size=dimensions,
                        epochs=iterations,
                        dm=dm,
                        window=windowSize,
                        min_count=1)
    model.build_vocab(sentences)
    model.train(sentences,
                total_examples=model.corpus_count,
                epochs=model.epochs)

    return list(model.docvecs.vectors_docs)
Example #6
def main():
    logging.basicConfig(format='[%(levelname)s] : %(message)s',
                        level=logging.INFO)
    sentences = doc2vec.TaggedLineDocument('output/fb_article_seg.txt')
    model = doc2vec.Doc2Vec(sentences, size=100, window=3)
    #model.train(sentences)
    model.save('output/doc2vec.model')
    def make_model(self):
        documents = doc2vec.TaggedLineDocument(self.text_path)
        model = doc2vec.Doc2Vec(documents,
                                size=self.dimension,
                                dm=0,
                                min_count=1,
                                iter=10,
                                window=2)
        model.save(self.text_model_path)
def kmeans_doc2vec(file_name):
    sentences = doc2vec.TaggedLineDocument(file_name)
    model = doc2vec.Doc2Vec(sentences,  # training corpus
                            size=40,  # dimensionality of the feature vectors
                            window=3,  # maximum distance between the current and predicted word within a sentence
                            )
    model.save_word2vec_format("doc2vec_result.txt")
    model.train(sentences, total_examples=model.corpus_count, epochs=model.iter)
    num_clusters = 10
    km = KMeans(n_clusters=num_clusters, init='k-means++', max_iter=300, n_init=1, verbose=False, random_state=0)
    result_doc2vec = list(km.fit_predict(model.docvecs.vectors_docs))
    return result_doc2vec
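
A hedged usage sketch for kmeans_doc2vec, assuming a file named docs.txt with one tokenized document per line and that KMeans was imported from sklearn.cluster:

from collections import Counter

labels = kmeans_doc2vec("docs.txt")
print(Counter(labels))   # number of documents assigned to each of the 10 clusters
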
Example #9
def generate_doc2vec_model(target_game_name):
    print("Training Start")
    # Load the card text
    card_text = doc2vec.TaggedLineDocument(target_game_name + ".txt")
    # Train
    model = doc2vec.Doc2Vec(card_text, size=300, window=8, min_count=1,
                            workers=4, iter=400, dbow_words=1, negative=5)

    # Save the model
    model.save(target_game_name + ".model")
    print("Training Finish")
    return model
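
A possible follow-up once the model from Example #9 exists, purely as an assumption about how it might be used (the game name and document index are placeholders):

model = generate_doc2vec_model("some_card_game")    # trains on some_card_game.txt, saves some_card_game.model
print(model.docvecs.most_similar(0, topn=5))        # cards whose text reads most like the first card
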
Example #10
def gen_d2v_corpus(lines, savemodel, istran=False):

    total_examples = len(lines)
    with open("../data/ques2_result.txt", "wb") as fw:
        for line in lines:
            txt = " ".join(jieba.lcut(line)) + "\n"
            txt = txt.encode('utf-8')
            fw.write(txt)

    sents = doc2vec.TaggedLineDocument("../data/ques2_result.txt")

    model = None
    if os.path.exists(savemodel):
        print('loading model', savemodel, time.asctime())
        model = doc2vec.Doc2Vec.load(savemodel)
        print('loaded model', savemodel, time.asctime())
        if istran:
            count = 0
            while (True):
                count += 1
                epochs = 20
                model.train(sents,
                            total_examples=total_examples,
                            epochs=epochs)
                if count % 10:
                    model.save(savemodel + "." + str(count))
                    model.save(savemodel)
                print('trained ', count * epochs)
    else:
        print('train new model')
        model = doc2vec.Doc2Vec(sents,
                                size=300,
                                window=12,
                                min_count=2,
                                workers=4,
                                dm=0)

        print('train', time.asctime())
        model.train(sents, total_examples=total_examples, epochs=200)
        print('train', time.asctime())
        model.save(savemodel)

    save_path = '../data/query.doc2vec.txt'
    write_to_file(save_path, "".encode('utf-8'), mode='wb+')
    for i in range(100):
        vs = model.docvecs.most_similar(i)
        for v in vs[:10]:
            result_indx = v[0]
            distance = v[1]
            txt = '{} {} {} {} {} {}\n'.format(i, lines[i], "->", result_indx,
                                               lines[result_indx], distance)
            write_to_file(save_path, txt.encode('utf-8'))
        write_to_file(save_path, "\n".encode('utf-8'))
Example #11
def neighborhood_embedding(input_dir,
                           iterations=20,
                           dimensions=128,
                           windowSize=2,
                           dm=1,
                           walkLength=64):
    index_to_name = generate_walk_file(input_dir, walkLength)
    walk_dir_path = input_dir.replace("\\sub_graphs\\", "\\walks\\")
    sentences = doc.TaggedLineDocument(walk_dir_path + '.walk')
    model = doc.Doc2Vec(sentences,
                        size=dimensions,
                        iter=iterations,
                        dm=dm,
                        window=windowSize)
    return list(model.docvecs.vectors_docs), index_to_name
Example #12
def structural_embedding(args):
    inputDir = args.input
    outputFile = args.output
    iterations = args.iter
    dimensions = args.d
    window = args.windowSize
    dm = 1 if args.model == 'dm' else 0
    indexToName = generateWalkFile(inputDir, args.walkLength, args.p)
    sentences = doc.TaggedLineDocument(inputDir + '.walk')

    model = doc.Doc2Vec(sentences,
                        size=dimensions,
                        iter=iterations,
                        dm=dm,
                        window=window)

    saveVectors(list(model.docvecs), outputFile, indexToName)
def get_testset_update_trainning(para=True):
    doc_test_labels_preprocess = 'test_labels_level1_with_label_preprocess.txt'
    if para == True:
        with open(path_main + doc_test_labels, 'r', encoding='utf-8') as f:
            contents = f.readlines()
            with open(path_main + doc_test_labels_preprocess,
                      'w',
                      encoding='utf-8') as outf:
                for line in contents:
                    if len(line.strip()) < 5:
                        continue
                    else:
                        outf.write(line)
    else:
        pass

    sentences = d2v.TaggedLineDocument(path_main + doc_test_labels_preprocess)
    return sentences
Example #14
def neighborhood_embedding(args):
    inputDir = args.input
    outputFile = args.output
    iterations = args.iter
    dimensions = args.d
    window = args.windowSize
    dm = 1 if args.model == 'dm' else 0
    indexToName = generateWalkFile(inputDir, args.walkLength)
    sentences = doc.TaggedLineDocument(inputDir + '.walk')

    model = doc.Doc2Vec(sentences,
                        vector_size=dimensions,
                        epochs=iterations,
                        dm=dm,
                        window=window)
    print(
        "Neighborhood output generated at the end of neighborhood.py -- example embedding of the first subgraph:"
    )
    print(model.docvecs[0])
Example #15
def build_d2v_model(file_location, model_name, do_train="True"):

    documents = doc2vec.TaggedLineDocument(file_location)

    if (do_train == "True"):

        model = doc2vec.Doc2Vec(documents,
                                size=200,
                                window=5,
                                min_count=3,
                                workers=8,
                                iter=20)
        model.save(model_name)

    else:
        model = doc2vec.Doc2Vec.load(model_name)
        model.init_sims(replace=True)

    return model
    def run(self, conf_data):

        try:
            # init parms for doc2vec node
            self._init_node_parm(conf_data['node_id'])
            self.cls_pool = conf_data['cls_pool']

            # get prev node for load data
            data_node_name = self._get_backward_node_with_type(
                conf_data['node_id'], 'preprocess')
            train_data_set = self.cls_pool[data_node_name[0]]

            # load model for train
            update_flag = False
            model = doc2vec.Doc2Vec(size=self.vector_size,
                                    window=self.window_size)
            if (os.path.exists(''.join([self.md_store_path,
                                        '/model.bin'])) == True):
                model = doc2vec.Doc2Vec.load(''.join(
                    [self.md_store_path, '/model.bin']))
                update_flag = True

            # train vocab and model
            while (train_data_set.has_next()):
                train_data = doc2vec.TaggedLineDocument(
                    train_data_set.train_file_path())
                for x in range(0, self.iter_size):
                    if (update_flag == False):
                        model.build_vocab(train_data, update=False)
                        update_flag = True
                    else:
                        model.build_vocab(train_data, update=True)
                    model.train(train_data)
                train_data_set.next()

            os.makedirs(self.md_store_path, exist_ok=True)
            model.save(''.join([self.md_store_path, '/model.bin']))
            return len(model.raw_vocab)
        except Exception as e:
            logging.info("[Doc2Vector Train Process] : {0}".format(e))
            raise Exception(e)
Example #17
    def graph_structural_embedding(self, graphs, **kwargs):
        dirName = 'data/output/sub2vec_output/'
        if not os.path.isdir(dirName):
            os.makedirs(dirName)
        file_name = os.path.join(dirName, 'random_walk_file.walk')
        indexToName = structural.generateWalkFile(graphs, file_name,
                                                  kwargs['walkLength'],
                                                  kwargs['alpha'],
                                                  kwargs['randomWalkCount'])
        sentences = doc.TaggedLineDocument(file_name)
        print('build model')
        model = doc.Doc2Vec(sentences,
                            vector_size=kwargs['dimensions'],
                            epochs=kwargs['iterations'],
                            dm=kwargs['dm'],
                            window=kwargs['window'])

        # outputfile = os.path.join(dirName, 'vectors.vec')
        # print('save vectores')
        # structural.saveVectors(model.docvecs, outputfile, indexToName)
        return model.docvecs
Example #18
def structural_embedding(args):

    inputDir = args.input
    print(inputDir)
    outputFile = args.output
    iterations = args.iter
    dimensions = args.d
    window = args.windowSize
    dm = 1 if args.model == 'dm' else 0
    indexToName = generateWalkFile(inputDir, args.walkLength,
                                   args.p)  # just makes walks
    sentences = doc.TaggedLineDocument(inputDir + '.walk')

    model = doc.Doc2Vec(sentences,
                        vector_size=dimensions,
                        epochs=iterations,
                        dm=dm,
                        window=window)
    print("Total vects ",
          len(list(model.docvecs.vectors_docs)))  #model.docvecs
    saveVectors(list(model.docvecs.vectors_docs), outputFile, indexToName)
def train_doc2vec():
    conf = Config()
    sentences = doc2vec.TaggedLineDocument(conf.train_path)
    # build vocab
    model = doc2vec.Doc2Vec(min_count=conf.word_min_count,
                            vector_size=conf.vector_size,
                            alpha=conf.learning_rate,
                            negative=conf.negative_size,
                            epochs=conf.train_epoch,
                            window=conf.window_size,
                            min_alpha=conf.learning_rate,
                            seed=1234,
                            workers=conf.worker_count)

    model.build_vocab(sentences)

    # Train document vectors
    model.train(sentences, epochs=model.epochs,
                total_examples=model.corpus_count)

    # To save
    if not os.path.isdir(conf.model_path):
        os.mkdir(conf.model_path)
    model.save(conf.modelfile)
Example #20
    def calculateEmbeddings(self):
        '''Calculates a gensim doc2vec model and returns the resulting embedding vectors.'''
        if self.walks is None:
            print("Generate the walks first!")
            return None  # the original bare `exit` was a no-op
        else:
            # save walks in a file
            walkFile = open(self.output + '.walk', 'w')
            for walk in self.walks:
                walkFile.write(arr2str(walk) + "\n")
            walkFile.close()
            sentences = doc.TaggedLineDocument(self.output + '.walk')
            if self.model == 'dm':
                self.model = 1
            else:
                self.model = 0
            model = doc.Doc2Vec(sentences,
                                vector_size=128,
                                epochs=100,
                                dm=self.model,
                                window=1)
            print("Total vects ",
                  len(list(model.docvecs.vectors_docs)))  # model.docvecs
            return model.docvecs.vectors_docs
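
Example #20 writes its walks with an arr2str helper that is not shown; a minimal sketch, under the assumption that it simply joins the node IDs of one walk into the whitespace-separated line format TaggedLineDocument expects:

def arr2str(walk):
    # Hypothetical helper: [1, 4, 2] -> "1 4 2", one random walk per output line.
    return " ".join(str(node) for node in walk)
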
Example #21
    test_docvec_file = "20news-test-7532.svm-doc2vec.txt"
    all_count = 18791
    train_count = 11285
    test_count = 7506
else:
    all_words_file = "reuters-all-8025.gibbslda-bow.txt"
    train_label_file = "reuters-train-5770.slda-label.txt"
    train_docvec_file = "reuters-train-5770.svm-doc2vec.txt"
    test_label_file = "reuters-test-2255.slda-label.txt"
    test_docvec_file = "reuters-test-2255.svm-doc2vec.txt"
    all_count = 8025
    train_count = 5770
    test_count = 2255

dim = 400
corpus = doc2vec.TaggedLineDocument(all_words_file)
model = doc2vec.Doc2Vec(corpus,size=dim, window=8, min_count=5, workers=4)
TRAIN_DOC2VEC = open(train_docvec_file, "w")
TRAIN_LABEL = open(train_label_file)

#pdb.set_trace()

for d in xrange(1, train_count + 1):
    doc_vec = model.docvecs[d]
    label_line = TRAIN_LABEL.readline().strip()
    label = int(label_line)

    TRAIN_DOC2VEC.write( "%d" %(label+1) )

    for k in xrange(dim):
        TRAIN_DOC2VEC.write( " %d:%.3f" %( k + 1, doc_vec[k] ) )
Example #22
cores = multiprocessing.cpu_count()

vector_size = 300
window_size = 15
word_min_count = 2
sampling_threshold = 1e-5
negative_size = 5
train_epoch = 100
dm = 1  #0 = dbow; 1 = dmpv
worker_count = cores  #number of parallel processes

inputfile = r"D:\user\Desktop\project\wiki_pos_tokenizer_without_taginfo.txt"
modelfile = r"D:\user\Desktop\project\wiki_pos_tokenizer_without_taginfo.doc2vec.model"
word2vec_file = modelfile + ".word2vec_format"

sentences = doc2vec.TaggedLineDocument(inputfile)

#build vocab
model = doc2vec.Doc2Vec(min_count=word_min_count,
                        vector_size=vector_size,
                        window=window_size,
                        sample=sampling_threshold,
                        negative=negative_size,
                        dm=dm,
                        epochs=train_epoch,
                        alpha=0.025,
                        min_alpha=0.025,
                        seed=1234,
                        workers=worker_count)
model.build_vocab(sentences)

# Train document vectors
model.train(sentences, epochs=model.epochs, total_examples=model.corpus_count)

# To save
Example #23
File: d2v.py  Project: yitang/comment_toxic
#encoding=utf-8

import logging
import sys
import multiprocessing
import numpy as np

from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
from gensim.models import doc2vec

from config import *

if __name__ == '__main__':

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)


r = np.random.randint(100000,999999,size = (1,))
print (r[0])
sents = doc2vec.TaggedLineDocument("./all.txt")
print (sents)
model = doc2vec.Doc2Vec(sents, size = embedding_dims, window = 9, min_count=1, iter=45, hs=0, negative=11, seed=r[0],)
model.wv.save_word2vec_format("w2v.txt", binary=False)
model.save("d2v.model")

Example #24
    # print(label_dict)
    # label_=(sorted(label_dict.items(),key=lambda x:x[1]))
    # print(label_)

    y = zeros((12901, len(label_names)))
    f9 = codecs.open(filename, 'r', encoding="utf-8")
    i = 0
    for li in f9.readlines():
        li = li.split()
        for j in li:
            y[i, label_dict[j]] = 1
        i += 1
    return y


sentences = doc2vec.TaggedLineDocument("result.txt")

model = doc2vec.Doc2Vec(sentences,
                        size=280,
                        window=5,
                        min_count=1,
                        workers=8,
                        iter=168)
# model.build_vocab(sentences)
# model.train(sentences) is not needed here: passing `sentences` to the constructor above already builds the vocab and trains

filename = "label_level01.txt"
corpus = model.docvecs
y = y_label(filename)

vector = []
Example #25
    return np.concatenate((u, v, u - v))  #np.concatenate((u, v, u-v)) #u-v ???


def randvec(w, n=50, lower=-0.5, upper=0.5):
    """Returns a random vector of length `n`. `w` is ignored."""
    return np.array([random.uniform(lower, upper) for i in range(n)])


with open("parsed_data.txt", 'w') as c:
    count = 0
    for doc in vocab:
        c.write(doc + "\n")
        doc2vecContent[doc] = count
        count += 1

sentences = doc2vec.TaggedLineDocument("parsed_data.txt")
model = Doc2Vec(sentences, size=100, window=8, min_count=5, workers=4, iter=20)
model.save("model_name")


def get_vec_for_sentence(sentence):
    if sentence not in doc2vecContent:
        return "Error, sentence not found"
    return (model.docvecs[doc2vecContent[sentence]])


def build_dataset(dataset, vector_func, vector_combo_func=vec_concatenate):
    """
    Parameters
    ----------    
    dataset
Example #26
import logging
import sys
import multiprocessing
import numpy as np

from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
from gensim.models import doc2vec

embedding_dims = 128
if __name__ == '__main__':

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)

r = np.random.randint(100000, 999999, size=(1, ))
print(r[0])
sents = doc2vec.TaggedLineDocument("./fc.dat")
print(sents)
model = doc2vec.Doc2Vec(sents,
                        size=embedding_dims,
                        window=9,
                        min_count=1,
                        iter=45,
                        hs=0,
                        negative=11,
                        seed=r[0])
model.wv.save_word2vec_format("w2v.txt", binary=False)
# model.save("d2v.model")
Example #27
    df_file_records, nw_file_records = \
        read_file_info_records(train_ere_dir, train_entity_info_dir, train_relation_info_dir, train_event_info_dir,
                               train_em_args_dir)
    test_df_file_records = \
        read_file_info_records(test_df_ere_dir, test_df_entity_info_dir, test_df_relation_info_dir,
                               test_df_event_info_dir, test_df_em_args_dir, False)
    test_nw_file_records = \
        read_file_info_records(test_nw_ere_dir, test_nw_entity_info_dir, test_nw_relation_info_dir,
                               test_nw_event_info_dir, test_nw_em_args_dir, False)
    file_records = df_file_records + nw_file_records + test_df_file_records + test_nw_file_records
    contexts = get_contexts(file_records)

    # print 'Read external data...'
    imdb_texts, label = read_imdb_data(imdb_dir)

    print 'Write doctext...'
    texts = get_doc2vec_dataform(contexts + imdb_texts)
    # texts = get_doc2vec_dataform(contexts)
    write_doc2vec_input(texts, doctext_path)

    print 'Doc2vec...'
    docslist = doc2vec.TaggedLineDocument(doctext_path)
    model = Doc2Vec(docslist,
                    workers=multiprocessing.cpu_count(),
                    min_count=1,
                    size=200)
    model.save(docmodel_path)
    model = Doc2Vec.load(docmodel_path)
    doc2vec_model = model.docvecs
    print doc2vec_model[0]
Example #28
    md = doc2vec.Doc2Vec(
        dm=0,  # PV-DBOW / default 1
        dbow_words=1,  # w2v simultaneous with DBOW d2v / default 0
        window=8,  # distance between the predicted word and context words
        vector_size=100,  # vector size
        alpha=0.025,  # learning-rate
        seed=1234,
        min_count=-1,  # ignore words with total frequency lower than this
        min_alpha=0.025,  # min learning-rate
        workers=cores,  # multi cpu
        hs=1,  # hierarchical softmax / default 0
        negative=10,  # negative sampling / default 5
    )

    # Extract vectors from the text in the file and measure similarity between words
    sentences = doc2vec.TaggedLineDocument("news.json")

    md.build_vocab(sentences)
    print(str(md))

    start = time.time()

    # Improve the similarity model by training on the words
    md.train(sentences, epochs=md.epochs, total_examples=md.corpus_count)
    # Loop for repeated retraining
    """for epoch in range(10):
        md.train(sentences, total_examples=md.corpus_count, epochs=md.epochs)
        md.alpha -= 0.002 # decrease the learning rate
        md.min_alpha = md.alpha # fix the learning rate, no decay"""
    end = time.time()
    print("During Time: {}".format(end - start))
Example #29
# coding=utf-8
from gensim.models import doc2vec
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
num = '1'
documents = doc2vec.TaggedLineDocument('document5.txt')
model = doc2vec.Doc2Vec(documents,
                        size=500,
                        window=1,
                        min_count=500,
                        workers=4)
model.save('./document' + num + '.bin')
Example #30
    def structural_embedding(self, inputFile, outputFile):
        indexToName = self.generateWalkFile(inputFile, args.walkLength)
        sentences = doc.TaggedLineDocument(inputFile+'.walk')
        self.model = doc.Doc2Vec(sentences, size=dimensions, iter=iterations, window=window)

        saveVectors(list(self.model.docvecs), outputFile, indexToName)
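
Several of the sub2vec-style examples (#12, #18, #30) call a saveVectors helper that is never defined here; a minimal sketch, assuming it writes one graph name followed by its embedding per line:

def saveVectors(vectors, output_file, index_to_name):
    # Hypothetical helper: map each positional document index back to its graph name
    # and write "name v1 v2 ..." per line.
    with open(output_file, "w") as f:
        for i, vec in enumerate(vectors):
            f.write(str(index_to_name[i]) + " " + " ".join(str(x) for x in vec) + "\n")
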