Example No. 1
        print('Convert text documents to vectors by Doc2Vec')
        print(sys.argv[0] + " -h for help")
        sys.exit()
    elif opt in ("-d", "--data_dir"):
        data_dir = arg
    elif opt in ("-model_size", "--model_size"):
        model_size = int(arg)
    elif opt in ("-epoch", "--epoch"):
        nb_epochs = int(arg)
    elif opt in ("-lb", "--label_file"):
        label_file = arg
    elif opt in ("--out_file"):
        out_filename = arg

# load documents
documents = load.get_doc(data_dir, label_file)

print('Data Loading finished')

print(len(documents), type(documents))

# build the model
# model = gensim.models.Doc2Vec(documents, dm=1, alpha=0.025, size=model_size, min_alpha=0.025, min_count=0, workers=8)
#
# # start training
# for epoch in range(nb_epochs):
#     if epoch % 5 == 0:
#         print ('Now training epoch %s'%epoch)
#     model.train(documents)
#     model.alpha -= 0.002  # decrease the learning rate
#     model.min_alpha = model.alpha  # fix the learning rate, no decay
Example No. 2
        print('Convert text documents to vectors by Doc2Vec')
        print(sys.argv[0] + " -h for help")
        sys.exit()
    elif opt in ("-d", "--data_dir"):
        data_dir = arg
    elif opt in ("-model_size", "--model_size"):
        model_size = int(arg)
    elif opt in ("-epoch", "--epoch"):
        nb_epochs = int(arg)
    elif opt in ("-lb", "--label_file"):
        label_file = arg
    elif opt in ("--out_file",):  # trailing comma keeps this a tuple test, not a substring test
        out_filename = arg
 
# load documents
documents = load.get_doc(data_dir, label_file)

print('Data Loading finished')

print(len(documents), type(documents))

# build the model
model = gensim.models.Doc2Vec(documents, dm=1, alpha=0.025, size=model_size, min_alpha=0.025, min_count=0, workers=8)

# start training
for epoch in range(nb_epochs):
    if epoch % 5 == 0:
        print('Now training epoch %s' % epoch)
    model.train(documents)
    model.alpha -= 0.002  # decrease the learning rate
    model.min_alpha = model.alpha  # fix the learning rate, no decay
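
Note: the `size` argument and the manual alpha-decay loop in Examples 1 and 2 follow the older gensim (pre-4.0) API. As a hedged sketch only (assuming gensim >= 4.0, and that `documents`, `model_size` and `nb_epochs` are the same objects parsed and loaded above), the equivalent training would look like this, with `size` renamed to `vector_size` and the learning-rate schedule handled by train() itself:

# Sketch under the assumption of gensim >= 4.0; not part of the original snippet.
model = gensim.models.Doc2Vec(
    dm=1,
    vector_size=model_size,   # `size` was renamed to `vector_size` in gensim 4.0
    alpha=0.025,
    min_alpha=0.0001,
    min_count=0,
    workers=8,
)
model.build_vocab(documents)
# train() decays alpha towards min_alpha internally; no manual loop is needed
model.train(documents, total_examples=model.corpus_count, epochs=nb_epochs)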
Example No. 3
import gensim
import load

documents = load.get_doc('docs')
print('Data Loading finished')

print(len(documents), type(documents))

# build the model
model = gensim.models.Doc2Vec(documents,
                              dm=0,
                              alpha=0.025,
                              size=20,
                              min_alpha=0.025,
                              min_count=0)
# second model using the default PV-DM architecture (dm=1); note it is not
# trained in the loop below
modeldm = gensim.models.Doc2Vec(documents,
                                alpha=0.025,
                                size=20,
                                min_alpha=0.025,
                                min_count=0)

# start training
for epoch in range(200):
    if epoch % 20 == 0:
        print('Now training epoch %s' % epoch)
    model.train(documents)
    model.alpha -= 0.002  # decrease the learning rate
    model.min_alpha = model.alpha  # fix the learning rate, no decay

# shows the similar words
print(model.most_similar('mildew'))
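
Example 3 only queries word similarities; a common follow-up is to embed an unseen document with infer_vector and compare it against the training documents. A minimal sketch, with a made-up token list for illustration:

# Usage sketch: the token list below is invented for illustration only.
new_doc = ['powdery', 'mildew', 'on', 'the', 'leaves']
vec = model.infer_vector(new_doc)
# most_similar accepts raw vectors, returning the closest training documents by tag
print(model.docvecs.most_similar([vec]))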
Example No. 4
# doc2vectest.py
#import sys
#reload(sys)
#sys.setdefaultencoding('utf8')

#import codecs
import gensim
import load

documents = load.get_doc('/Users/lipingzhang/Desktop/program/doc2vec/word_vectors_game_of_thrones-LIVE/data')
#documents = get_doc('/Users/lipingzhang/Desktop/program/doc2vec/word_vectors_game_of_thrones-LIVE/data')
print('Data Loading finished')
print(len(documents), type(documents))

# build the model
model = gensim.models.Doc2Vec(documents, dm=0, alpha=0.025, size=20, min_alpha=0.025, min_count=0)

# start training
for epoch in range(200):
    if epoch % 20 == 0:
        print('Now training epoch %s' % epoch)
    model.train(documents)
    # decrease the learning rate
    model.alpha -= 0.002
    # fix the learning rate, no decay
    model.min_alpha = model.alpha

# shows the similar words
print(model.most_similar('suppli'))

# shows the learnt embedding
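The snippet is truncated at this comment; by analogy with Example No. 5, printing the learned word vector would presumably look like the line below (a guess, since the original code is missing):

print(model['suppli'])  # hypothetical completion; not part of the original snippet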
Example No. 5
import gensim
import load

documents = load.get_doc('books')
print('Data Loading finished')

print(len(documents), type(documents))

model = gensim.models.Doc2Vec(documents,
                              dm=0,
                              alpha=0.025,
                              size=20,
                              min_alpha=0.025,
                              min_count=0)

for epoch in range(200):
    if epoch % 20 == 0:
        print('Now training epoch %s' % epoch)
    token_count = sum([len(document) for document in documents])
    # token counts belong in total_words; total_examples would expect the number of documents
    model.train(documents, total_words=token_count, epochs=model.iter)
    model.alpha -= 0.002
    model.min_alpha = model.alpha

print(model.most_similar('обман'))

print(model['обман'])

print(model.docvecs.most_similar(str('books/love.txt')))
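
None of the examples persist their models. As a closing sketch (the file name doc2vec.model is purely illustrative), gensim's own save/load round-trips the trained model and keeps the document vectors addressable by tag:

# Persistence sketch; 'doc2vec.model' is an illustrative file name.
model.save('doc2vec.model')
reloaded = gensim.models.Doc2Vec.load('doc2vec.model')
print(reloaded.docvecs['books/love.txt'])  # same tag as queried above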