def build_doc2vec_model(dataset, vec_length, save_folder, name="Movie"):
    """
    build doc2vec model
    """
    #use helperfunction write_all_in_txt, which creates an txt file with required format for doc2vec model
    if name == "Movie":
        txt_file = make_doc2vec_inputfile(
            dataset=dataset, save_file="data/doc_2_vec/movie_d2v_input.txt")
    elif name == "Financial":
        txt_file = make_doc2vec_inputfile(
            dataset=dataset,
            save_file="data/doc_2_vec/financial_d2v_input.txt")
    else:
        raise ValueError("Unsupported dataset name: " + name)

    doc = open(txt_file, "r", encoding="utf-8")
    documents = TaggedLineDocument(doc)

    model = gensim.models.Doc2Vec(documents,
                                  dm=0,
                                  dbow_words=0,
                                  size=vec_length,
                                  window=10,
                                  hs=0,
                                  negative=5,
                                  sample=1e-4,
                                  iter=20,
                                  min_count=10,
                                  workers=4,
                                  alpha=0.1)
    doc.close()
    model.save(fname_or_handle=save_folder)
    return model
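
# Hedged usage sketch (not part of the original example): `movie_reviews` is a
# placeholder for whatever dataset make_doc2vec_inputfile expects, and the save
# path is likewise illustrative.
d2v = build_doc2vec_model(dataset=movie_reviews,
                          vec_length=100,
                          save_folder="models/movie_d2v.model",
                          name="Movie")
print(d2v.infer_vector(["great", "acting", "weak", "plot"]))
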
def stem(CONFIGURATION, sents):

    with open(CONFIGURATION.rundir + "w2v_training_material.csv",
              mode="w+",
              encoding="UTF-8") as f:
        for sent in sents:
            tmp = list()
            for expression in sent:
                if 'http://' not in expression:
                    expression_new = list()
                    for word in expression.split(' '):
                        expression_new.append(
                            ps.stem(re.sub('[^A-z0-9<>]', '', word.lower())))
                    expression = expression_new
                else:
                    expression = [re.sub('[\r\n]', '', expression)]
                if not " ".join(expression) == '' and len(
                        " ".join(expression)) > 1:
                    tmp = tmp + expression
            if len(tmp) > 1:
                for x in tmp:
                    f.write(str(x) + " ")
                f.write("\n")
        #f.write("<> <>\n")

    from gensim.test.utils import datapath
    from gensim.models.doc2vec import TaggedLineDocument
    for document in TaggedLineDocument(
            datapath(CONFIGURATION.rundir + "w2v_training_material.csv")):
        yield document
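
# Hedged consumer sketch (not part of the original): feed the TaggedLineDocument
# entries yielded by stem() into a Doc2Vec model. CONFIGURATION and `sents` are
# assumed to come from the caller; keyword names follow gensim 4
# (older releases spell them size= / iter=).
from gensim.models.doc2vec import Doc2Vec

docs = list(stem(CONFIGURATION, sents))
d2v = Doc2Vec(vector_size=100, window=5, min_count=2, workers=4, epochs=10)
d2v.build_vocab(docs)
d2v.train(docs, total_examples=len(docs), epochs=d2v.epochs)
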
Example #3
    def crawl_page(thread_name, page_url):
        if page_url not in Spider.crawled:
            print(thread_name + ' now crawling ' + page_url)
            print('Queue ' + str(len(Spider.queue)) + ' | Crawled  ' +
                  str(len(Spider.crawled)))
            Spider.add_links_to_queue(Spider.gather_links(page_url))
            Spider.queue.remove(page_url)

            # Building the Doc2Vec model: urlopen() returns bytes, so decode
            # before writing the page text to a temp file
            f = urlopen(page_url)
            html = f.read().decode('utf-8', errors='ignore')
            with open('temp.txt', 'w', encoding='utf-8') as f:
                f.write(html)
            html = TaggedLineDocument('temp.txt')
            model = Doc2Vec(html, size=100, window=8, min_count=5, workers=4)
            model.train(html, total_examples=model.corpus_count, epochs=5)
            #print model.docvecs[0]

            #saving data for building and testing the svm
            # if len(Spider.data_train)<50:
            #    Spider.data_train.add(model.docvecs[0])
            #else:
            #   Spider.data_test.add(model.docvecs[0])

            #set_to_file(Spider.data_train,'data_train.txt')
            #set_to_file(Spider.data_test,'data_test.txt')

            Spider.crawled.add(page_url)
            Spider.update_files()
def train(args):
    documents = TaggedLineDocument(LYRIC)
    return Doc2Vec(documents,
                   size=args.size,
                   window=args.window,
                   min_count=args.min_count,
                   workers=args.workers,
                   dm=args.dm)
Example #5
def create_doc2vec_model_v2(train_d2v_data_file_path, vector_size, lexicon,
                            files_prefix, model_path):
    print('Training d2v model')
    docs = TaggedLineDocument(train_d2v_data_file_path)
    model = Doc2Vec(docs, size=vector_size, window=10, min_count=1, workers=8)
    model.save(model_path)

    return model
Example #6
def pre_train():
    if os.path.exists('data/dataset/data.txt'):
        # documents = TaggedLineDocument('data/dataset/data.txt')
        print("data.txt already exists")
    else:
        data_txt()
    documents = TaggedLineDocument('data/dataset/data.txt')
    return documents
Example #7
def check_for_doc2vecmodel(doc2vec_fname, docs_fname, corpus, dictionary):
    try:
        doc2vec = models.Doc2Vec.load(doc2vec_fname)
    except IOError:
        print('Training Doc2Vec model, this may take a long time')
        documents = TaggedLineDocument(docs_fname)
        doc2vec = models.doc2vec.Doc2Vec(documents=documents, workers=4)
        doc2vec.save(doc2vec_fname)
    return doc2vec
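
# Hedged usage sketch (paths are hypothetical): the function loads a cached model
# when one exists, otherwise it trains from the line-per-document text file.
# `corpus` and `dictionary` are never touched inside the body above, so simple
# placeholders suffice for illustration.
d2v = check_for_doc2vecmodel("models/news.d2v", "data/news_lines.txt",
                             corpus=None, dictionary=None)
vec = d2v.infer_vector("an unseen sentence to embed".split())
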
Example #8
def dataset():
    df = open('data/blog_notopic.txt','r',encoding='utf-8')
    blogs=TaggedLineDocument(df)
    
    # count the number of x_train samples
    '''count=-1
    for count, line in enumerate(open('data/blog_notopic.txt', 'r',encoding='utf-8')):
        pass
    count += 1'''
    return blogs
Example #9
    def train(self, source_corpus_path, update=False):
        """
        Train an uninitialized model using a corpus.
        Each line in the corpus should be the words of a sentence separated by spaces.

        :param source_corpus_path: Path to corpus.
        :param update: Update vocab.
        :return: Nothing.
        """
        documents = TaggedLineDocument(source_corpus_path)
        self.model.build_vocab(documents, update=update)
        self.model.train(documents, total_examples=self.model.corpus_count, epochs=self.model.iter)
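
# Sketch of the surrounding wrapper class this method assumes (names here are
# hypothetical): self.model must be an uninitialized gensim Doc2Vec. The method
# reads self.model.iter, the pre-4.0 name of the epochs attribute, so a
# gensim 3.x install is assumed.
from gensim.models.doc2vec import Doc2Vec, TaggedLineDocument

class Doc2VecTrainer:
    def __init__(self, vector_size=100, min_count=2, workers=4):
        self.model = Doc2Vec(vector_size=vector_size, min_count=min_count,
                             workers=workers)
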
def prepare_training_data(sentences, CONFIGURATION):

    #sentences = stem(CONFIGURATION, sentences)
    ctr = 0
    with open(CONFIGURATION.rundir + "w2v_training_material.csv",
              mode="w+",
              encoding="UTF-8") as f:
        for sent in sentences:
            tmp = list()
            for expression in sent:
                if 'http://' not in expression:
                    expression_new = list()
                    for word in expression.split(' '):
                        word = (ps.stem(re.sub('[^A-z0-9<>]', '',
                                               word.lower())))
                        #if len(word)>2:
                        #    words = #[word[i:i+3] for i in range(len(word)-3+1)]
                        #else:
                        words = [word]
                        expression_new = expression_new + words
                    expression = expression_new
                else:
                    expression = [re.sub('[\r\n]', '', expression)]
                #if not " ".join(expression) == '' and len(" ".join(expression))>1:
                tmp = tmp + expression
            if len(tmp) > 0:
                for x in tmp:
                    f.write(str(x) + " ")
                f.write("\n")
                ctr += 1
        #f.write("<> <>\n")

    from gensim.test.utils import datapath
    from gensim.models.doc2vec import TaggedLineDocument
    sentences = [
        document for document in TaggedLineDocument(
            datapath(CONFIGURATION.rundir + "w2v_training_material.csv"))
    ]

    #x = tuplize(sentences, CONFIGURATION)

    #x = eliminate_rare_and_frequent_terms(x)

    #documents = list()
    #for index, row in x.iterrows():
    #    documents.append([str(row[0])] + [str(row[1])])

    #documents = literalize(documents)

    documents = sentences

    return documents
Example #11
def trainDoc2Vector(sentence_count, vector_dimension):
    # train and save the model
    sentences = TaggedLineDocument('sources/splited_words.txt')
    model = Doc2Vec(sentences, size=vector_dimension, window=8, min_count=2, workers=multiprocessing.cpu_count())
    model.train(sentences, total_examples=sentence_count, epochs=model.iter)
    model.save('result/doc2vec.model')
    # save vectors
    out = open('result/doc2vec.vector', mode='w+', encoding='utf-8')
    for index in range(0, sentence_count, 1):
        docvec = model.docvecs[index]
        out.write(' '.join(str(f) for f in docvec) + "\n")

    out.close()
def build_doc2vec_model():
    # Creating labeled sentences from training data
    sentences = TaggedLineDocument('bulk-total.txt')
    model = Doc2Vec(alpha=0.1,
                    size=30,
                    window=10,
                    min_count=5,
                    dm=0,
                    dbow_words=1,
                    iter=10)
    model.build_vocab(sentences)
    model.train(sentences, total_examples=model.corpus_count, epochs=10)  # corpus has 81863 lines
    model.save('../models/clpsych-30dim-large.d2v')
    def process(self):

        log.info("Commencing execution")

        tagged_docs = TaggedLineDocument(self.labeled_articles_file_path)

        log.info("Training Doc2Vec model")
        doc2vec_model = doc2vec_helper.init_model(tagged_docs)
        doc2vec_model.save(self.doc2vec_model_file_path)
        log.info("Learnt vocab from training set and saved doc2vec model")

        x_train = list()
        with open(self.labeled_articles_file_path) as training_set:
            for line in training_set:
                x_train.append(doc2vec_model.infer_vector(line.split()))

        y_train = [0] * self.samples_per_class_train
        y_train.extend([1] * self.samples_per_class_train)

        x_test = list()
        with open(self.articles_source_file_path) as test_set:
            for line in test_set:
                x_test.append(doc2vec_model.infer_vector(line.split()))

        y_true = [1] * self.samples_per_class_test
        y_true.extend([0] * self.samples_per_class_test)

        ml_model_logreg = scikit_ml_helper.train_logistic_reg_classifier(x_train, y_train)
        scikit_ml_helper.persist_model_to_disk(ml_model_logreg, self.ml_model_file_path)
        y_pred = ml_model_logreg.predict(x_test)
        log.info("Logistic Regression")
        log.info("Precision: " + str(metrics.precision_score(y_pred=y_pred, y_true=y_true)))
        log.info("Recall: " + str(metrics.recall_score(y_pred=y_pred, y_true=y_true)))
        log.info("Accuracy: " + str(metrics.accuracy_score(y_pred=y_pred, y_true=y_true)))

        ml_model_svm = scikit_ml_helper.train_svm_classifier(x_train, y_train)
        y_pred = ml_model_svm.predict(x_test)
        log.info("SVM")
        log.info("Precision: " + str(metrics.precision_score(y_pred=y_pred, y_true=y_true)))
        log.info("Recall: " + str(metrics.recall_score(y_pred=y_pred, y_true=y_true)))
        log.info("Accuracy: " + str(metrics.accuracy_score(y_pred=y_pred, y_true=y_true)))

        ml_model_nb = scikit_ml_helper.train_gnb_classifier(x_train, y_train)
        y_pred = ml_model_nb.predict(x_test)
        log.info("Naive Bayes")
        log.info("Precision: " + str(metrics.precision_score(y_pred=y_pred, y_true=y_true)))
        log.info("Recall: " + str(metrics.recall_score(y_pred=y_pred, y_true=y_true)))
        log.info("Accuracy: " + str(metrics.accuracy_score(y_pred=y_pred, y_true=y_true)))

        log.info("Completed execution")
Example #14
def calculate_and_save_word2vec_dict(words_list, files):
    dataset_file = os.path.join(word2vec_taget_dir, 'dataset.txt')
    FileProcessor(dataset_file).file_write(
        'utf8', u''.join([words + u'\n' for words in words_list][:-1]))
    doces = TaggedLineDocument(dataset_file)

    doc2Vec_model = doc2vec.Doc2Vec(doces, size=200, window=10, workers=4)
    doc2Vec_model.train(doces,
                        total_examples=doc2Vec_model.corpus_count,
                        epochs=200)
    doc2Vec_model.save(os.path.join(word2vec_taget_dir, 'doc2vec_model.txt'))
    FileProcessor(os.path.join(word2vec_taget_dir, 'tagged_map.txt'))\
        .file_write('utf8', u''.join([u'{0} {1} \n'.format(index, value.decode('utf8')) for index, value in enumerate(files)]))

    return doc2Vec_model
Example #15
def train():
    tagged = TaggedLineDocument(filetgge)
    model = Word2Vec(alpha=0.025,
                     min_alpha=0.025,
                     size=50,
                     window=5,
                     min_count=5,
                     workers=8)
    model.build_vocab(tagged)
    for i in range(10):
        model.train(tagged)
        model.alpha -= 0.0002  # decrease the learning rate
        model.min_alpha = model.alpha  # fix the learning rate, no decay

    # model.save_word2vec_format(filetgge+'.model')
    model.save(filetgge + '.model')
Example #16
def main():
    model_file = datahub.get_full_path("doc2vec.model")
    if not Path(model_file).exists():
        tokenized_file = datahub.get_full_path("articles1_token.txt")
        if not Path(tokenized_file).exists():
            print("data is not yet tokenized and saved. Doing now.")
            print("loading data")
            data_list = datahub.load_data(datahub.get_full_path("articles1.csv"))
            print("tokenizing data")
            tokenized_content_list = datahub.tokenize_content(data_list)
            print("saving tokenizing data in txt file")
            datahub.save_tagged_data(tokenized_content_list, tokenized_file)

        vector_size = 50
        epochs = 100
        print("training model")
        trained_model = v_model.train_model(TaggedLineDocument(tokenized_file), vector_size, epochs)
        print("saving model")
        v_model.save_model(trained_model, model_file)
        print("finish")
    else:
        print("{} already exists. Exiting.".format(model_file))
Example #17
def get_tagged_sentences(filepath):
    sentences = TaggedLineDocument(filepath)
    return sentences
Example #18
        chunksize = int(math.ceil(n / float(procs)))
        with gzip.open(base_output_path + 'docs_songs.txt.gz',
                       'w') as fout, gzip.open(
                           base_output_path + 'indices.txt.gz',
                           'w') as indices:
            for userid, doc in tq(pool.imap_unordered(get_songs,
                                                      files,
                                                      chunksize=100),
                                  total=n):
                fout.write(doc + '\n')
                indices.write(userid + '\n')

    with timed('Loading docs'):
        #documents = TaggedLineDocument(base_output_path+'docs_artist_blocks.txt.gz')
        #documents = [doc for doc in tq(TaggedLineDocument(base_output_path+'docs_songs.txt.gz'))]
        documents = TaggedLineDocument(base_output_path + 'docs_songs.txt.gz')

    with timed('Running model'):
        model = Doc2Vec(documents,
                        size=dim,
                        window=win,
                        min_count=min_count,
                        workers=procs)

    with timed('Saving results'):
        # from sklearn.preprocessing import Normalizer
        # nrm = Normalizer('l2')
        # normed = nrm.fit_transform(model.docvecs.doctag_syn0)
        # words_normed = nrm.fit_transform(model.syn0)

        # np.save(output_path+'/doc_features_normed-{}-{}-{}.npy'.format(dim,win,min_count),normed)
Example #19
    user_dict[line.split()[0]] = ''
u_f.close()

for line in f_b_t.readlines():
    if utils.key_in_dic(line.replace('\n', '').split('\t')[0], user_dict):
        p_f.write(line)
    else:
        pass
f_b_t.close()
p_f.close()

#
from gensim.models.doc2vec import TaggedLineDocument, Doc2Vec
user_tranj_vec = '../data/user_tranj_vec.txt'

documents = TaggedLineDocument(doc_file)
model = Doc2Vec(documents,
                size=128,
                negative=10,
                window=8,
                hs=0,
                min_count=0,
                workers=15,
                iter=30)

user_id_list = []
u_f = open(user_file)
for line in u_f:
    user_id_list.append(line.split('\n')[0])
u_f.close()
Example #20
import logging
import os.path
import sys
import multiprocessing

from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedLineDocument

if __name__ == '__main__':
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)

    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info("running %s" % ' '.join(sys.argv))

    # check and process input arguments
    if len(sys.argv) < 4:
        print globals()['__doc__'] % locals()
        sys.exit(1)
    inp, outp1, outp2 = sys.argv[1:4]

    model = Doc2Vec(TaggedLineDocument(inp),
                    size=200,
                    window=5,
                    min_count=5,
                    workers=multiprocessing.cpu_count())

    # trim unneeded model memory = use(much) less RAM
    #model.init_sims(replace=True)
    model.save(outp1)  #save dov2vec
    model.save_word2vec_format(outp2, binary=False)  #save word2vec
Example #21
# docLabels = ["input.txt"]
# data = []
# for doc in docLabels:
#     data.append(open(doc, 'r'))
#     print(data)
#
# it = LabeledLineSentence(data, [1])
# # print(it)
import gensim
from gensim.models.doc2vec import TaggedLineDocument

questions_path = "train_questionsq.txt"
answers_path = "train_answersq.txt"

questions = open(questions_path, 'r')
answers = open(answers_path, 'r')

doc = TaggedLineDocument("input.txt")

model = gensim.models.Doc2Vec(size=100,
                              window=10,
                              min_count=1,
                              workers=11,
                              alpha=0.025,
                              min_alpha=0.025)  # use fixed learning rate

model.build_vocab(doc)

model.iter = 300

model.train(doc, total_examples=model.corpus_count, epochs=model.iter)
#
# for epoch in range(10):
Example #22
if not os.path.exists(base_output_path+'docs_artist_blocks.txt.gz'):
    with gzip.open(base_output_path+'docs_artist_blocks.txt.gz','w') as fout, gzip.open(base_output_path+'indices.txt.gz','w') as indices:
        files = sorted(glob.glob(scrobble_path+'*.txt'))
        for fi in tq(files):
            artists = [line.split('\t')[1] for line in open(fi)]
            last = None
            blocks = []
            for a in tq(artists):
                if a != last:
                    blocks.append(a)
                last = a
            doc = ' '.join(blocks)
            fout.write(doc+'\n')
            userid = fi[fi.rfind('\\')+1:-4]
            indices.write(userid+'\n')
documents = [doc for doc in tq(TaggedLineDocument(base_output_path+'docs_artist_blocks.txt.gz'))]



%time model = Doc2Vec(documents, size=dim, window=win, min_count=min_count,workers=workers)

dpath = 'P:/Projects/BigMusic/jared.data/d2v/artist_dict.pkl'
if not os.path.exists(dpath):
    artist_dict = {}
    for line in tq(open('P:/Projects/BigMusic/jared.rawdata/lastfm_itemlist.txt')):
        line = line.split('\t')
        if line[1]=='0':
            artist_dict[line[2]] = line[0]
    cPickle.dump(artist_dict,open(dpath,'wb'))
else:
    artist_dict = cPickle.load(open(dpath))
Example #23
    def train(cls):
        model = Doc2Vec(documents=TaggedLineDocument(cls.corpus_path), vector_size=300, window=5, min_count=1, workers=4)
        model.save(config.model_path.format('d2v.model'))
Example #24
import sys
from gensim.models.doc2vec import Doc2Vec, TaggedLineDocument

file = sys.argv[1]
epochs = int(sys.argv[2])
words = sys.argv[3].split(' ')

steps = 50

docs = TaggedLineDocument(file)

model = Doc2Vec(docs, min_count=1, epochs=epochs)

docs_list = list(docs)

to_docstr = lambda x: ' '.join(docs_list[x].words)

print(f'--- similar : {to_docstr(0)} ---')

for i, p in model.docvecs.most_similar(0):
    print(f'{p}, {to_docstr(i)}')

print('')
print(f'--- similar : {words} ---')

x = model.infer_vector(words, steps=steps)

for tag, p in model.docvecs.most_similar([x]):
    print(f'{p}, {to_docstr(tag)}')
Example #25
def d2ctest():
    documents = TaggedLineDocument("new_text2.txt")
    model = Doc2Vec(documents, size=10, window=2, min_count=1, workers=1)
    print(model)
Example #26
import sys

import numpy as np
from gensim.models import Word2Vec
from gensim.models.doc2vec import TaggedLineDocument
from keras.models import Sequential, Model
from keras.layers import GRU, Dropout
from keras.optimizers import Adam

data_file = sys.argv[1]
dest_file_prefix = sys.argv[2]

epoch = int(sys.argv[3])
batch = int(sys.argv[4])

wv_size = 200
wv_epoch = 2000

num_unit = 512
input_size = (10, )

docs = TaggedLineDocument(data_file)

words = [d.words for d in docs]

wv_model = Word2Vec(words, wv_size, min_count=1, iter=wv_epoch)

input_size += (wv_model.vector_size, )

word_maxlen = np.max([len(w) for w in words])


def discriminator(input_shape):
    model = Sequential()

    model.add(GRU(num_unit, input_shape=input_shape))
    model.add(Dropout(0.3))
model_file = sys.argv[1]
data_file = sys.argv[2]
gen_size = int(sys.argv[3])

questions = [q.split(' ') for q in sys.argv[4].split(';')]

steps = 50
base_word_prob = 0.7
prob_weight = 1.5
keyword_rate = 2
replace_targets = ['名詞', '形容詞']  # Japanese POS tags: noun and adjective

model = Doc2Vec.load(model_file)

docs_list = list(TaggedLineDocument(data_file))

docs_list_str = [''.join(d.words) for d in docs_list]

is_replace_target = lambda t: np.any(
    [t.part_of_speech.startswith(trg) for trg in replace_targets])


def random_choice(cd):
    probs = np.exp(np.array([p for _, p in cd]) * prob_weight)
    probs /= probs.sum()

    return np.random.choice([d for d, _ in cd], p=probs)


adjust_prob = lambda c, q: (c[0], c[1] * keyword_rate) if c[0] in q else c
Example #28
from gensim.models.doc2vec import Doc2Vec
from gensim.models.doc2vec import TaggedLineDocument
from time import localtime, strftime

# document for training
doc_path = 'dota_picks'
corpus = TaggedLineDocument(doc_path)

# Doc2Vec parameters; self explanatory
vector_size = 50
window_size = 5
min_count = 0
sampling_threshold = 1e-4
negative_size = 5
train_epoch = 100
dm = 0  #0 = dbow; 1 = dmpv
worker_count = 8  #number of parallel processes

model = Doc2Vec(size=vector_size,
                window=window_size,
                min_count=min_count,
                sample=sampling_threshold,
                workers=worker_count,
                hs=0,
                dm=dm,
                negative=negative_size,
                dbow_words=1,
                dm_concat=1)
print("Building Vocab:", strftime("%a, %d %b %Y, %H:%M:%S", localtime()))
model.build_vocab(corpus)
print("Built Vocab:", strftime("%a, %d %b %Y, %H:%M:%S", localtime()))
Example #29
#-*-coding:utf-8-*-
from __future__ import division
from gensim.models.doc2vec import Doc2Vec, TaggedLineDocument
# import numpy.linalg
import numpy as np
import math
import scipy
from PIL import Image, ImageDraw
input_file = r"H:\network_diagnosis_data\test\GTPC_TUNNEL_PATH_BROKEN.3054.txt"
sentences = TaggedLineDocument(input_file)
dim = 1000
model = Doc2Vec(alpha=0.025, min_alpha=0.025, size=dim)  # default size is 300 dimensions
model.build_vocab(sentences)

for epoch in range(10):
    model.train(sentences)
    model.alpha -= 0.002
    model.min_alpha = model.alpha
model.save(r'.\data\test_d2v')
# print model.infer_vector([u'people', u'like', u'words'])

total_num = model.docvecs.count
# print total_num
# print len( model.docvecs[0] )
para_vec = []
for i in xrange(total_num):
    if i == 0:
        para_vec = model.docvecs[i]
        continue
    para_vec = np.vstack((para_vec, model.docvecs[i]))
print para_vec
Example #30
import pandas as pd
import numpy as np
from time import time
import pickle

import gensim
from gensim import corpora, models, similarities
from gensim.models.doc2vec import TaggedDocument, TaggedLineDocument
from gensim.models import Doc2Vec
import gensim.models.doc2vec

print('loading docs...')
start_time = time()
documents = [
    doc for doc in TaggedLineDocument('volume2/processed_body_docs.txt')
]
print("--- %s seconds ---" % (time() - start_time))

#documents = []
#with open('/volume/processed_body_docs.txt') as f:
#    for line in f:
#        documents.append(TaggedLineDocument(line))

print('training doc2vec model...')
start_time = time()
model = Doc2Vec(documents,
                vector_size=200,
                window=5,
                min_count=5,
                workers=14,
                epochs=20)