def build_doc2vec_model(dataset, vec_length, save_folder, name="Movie"):
    """
    build doc2vec model
    """
    #use helperfunction write_all_in_txt, which creates an txt file with required format for doc2vec model
    if name == "Movie":
        txt_file = make_doc2vec_inputfile(
            dataset=dataset, save_file="data/doc_2_vec/movie_d2v_input.txt")
    elif name == "Financial":
        txt_file = make_doc2vec_inputfile(
            dataset=dataset,
            save_file="data/doc_2_vec/financial_d2v_input.txt")
    else:
        raise ValueError("Unsupported dataset name: " + name)

    doc = open(txt_file, "r", encoding="utf-8")
    documents = TaggedLineDocument(doc)

    model = gensim.models.Doc2Vec(documents,
                                  dm=0,
                                  dbow_words=0,
                                  size=vec_length,
                                  window=10,
                                  hs=0,
                                  negative=5,
                                  sample=1e-4,
                                  iter=20,
                                  min_count=10,
                                  workers=4,
                                  alpha=0.1)
    doc.close()
    model.save(fname_or_handle=save_folder)
    return model
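
# Hedged usage sketch (not part of the original example): `movie_reviews` is a
# placeholder for whatever dataset make_doc2vec_inputfile expects, and the save
# path is likewise illustrative.
d2v = build_doc2vec_model(dataset=movie_reviews,
                          vec_length=100,
                          save_folder="models/movie_d2v.model",
                          name="Movie")
print(d2v.infer_vector(["great", "acting", "weak", "plot"]))
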
def stem(CONFIGURATION, sents):

    with open(CONFIGURATION.rundir + "w2v_training_material.csv",
              mode="w+",
              encoding="UTF-8") as f:
        for sent in sents:
            tmp = list()
            for expression in sent:
                if 'http://' not in expression:
                    expression_new = list()
                    for word in expression.split(' '):
                        expression_new.append(
                            ps.stem(re.sub('[^A-z0-9<>]', '', word.lower())))
                    expression = expression_new
                else:
                    expression = [re.sub('[\r\n]', '', expression)]
                if not " ".join(expression) == '' and len(
                        " ".join(expression)) > 1:
                    tmp = tmp + expression
            if len(tmp) > 1:
                for x in tmp:
                    f.write(str(x) + " ")
                f.write("\n")
        #f.write("<> <>\n")

    from gensim.test.utils import datapath
    from gensim.models.doc2vec import TaggedLineDocument
    for document in TaggedLineDocument(
            datapath(CONFIGURATION.rundir + "w2v_training_material.csv")):
        yield document
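
# Hedged consumer sketch (not part of the original): feed the TaggedLineDocument
# entries yielded by stem() into a Doc2Vec model. CONFIGURATION and `sents` are
# assumed to come from the caller; keyword names follow gensim 4
# (older releases spell them size= / iter=).
from gensim.models.doc2vec import Doc2Vec

docs = list(stem(CONFIGURATION, sents))
d2v = Doc2Vec(vector_size=100, window=5, min_count=2, workers=4, epochs=10)
d2v.build_vocab(docs)
d2v.train(docs, total_examples=len(docs), epochs=d2v.epochs)
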
Example #3
    def crawl_page(thread_name, page_url):
        if page_url not in Spider.crawled:
            print(thread_name + ' now crawling ' + page_url)
            print('Queue ' + str(len(Spider.queue)) + ' | Crawled  ' +
                  str(len(Spider.crawled)))
            Spider.add_links_to_queue(Spider.gather_links(page_url))
            Spider.queue.remove(page_url)

            # Building the Doc2Vec model: urlopen() returns bytes, so decode
            # before writing the page text to a temp file
            f = urlopen(page_url)
            html = f.read().decode('utf-8', errors='ignore')
            with open('temp.txt', 'w', encoding='utf-8') as f:
                f.write(html)
            html = TaggedLineDocument('temp.txt')
            model = Doc2Vec(html, size=100, window=8, min_count=5, workers=4)
            model.train(html, total_examples=model.corpus_count, epochs=5)
            #print model.docvecs[0]

            #saving data for building and testing the svm
            # if len(Spider.data_train)<50:
            #    Spider.data_train.add(model.docvecs[0])
            #else:
            #   Spider.data_test.add(model.docvecs[0])

            #set_to_file(Spider.data_train,'data_train.txt')
            #set_to_file(Spider.data_test,'data_test.txt')

            Spider.crawled.add(page_url)
            Spider.update_files()
def train(args):
    documents = TaggedLineDocument(LYRIC)
    return Doc2Vec(documents,
                   size=args.size,
                   window=args.window,
                   min_count=args.min_count,
                   workers=args.workers,
                   dm=args.dm)
Example #5
def create_doc2vec_model_v2(train_d2v_data_file_path, vector_size, lexicon,
                            files_prefix, model_path):
    print('Training d2v model')
    docs = TaggedLineDocument(train_d2v_data_file_path)
    model = Doc2Vec(docs, size=vector_size, window=10, min_count=1, workers=8)
    model.save(model_path)

    return model
Example #6
def pre_train():
    if os.path.exists('data/dataset/data.txt'):
        # documents = TaggedLineDocument('data/dataset/data.txt')
        print("data.txt already exists")
    else:
        data_txt()
    documents = TaggedLineDocument('data/dataset/data.txt')
    return documents
Example #7
def check_for_doc2vecmodel(doc2vec_fname, docs_fname, corpus, dictionary):
    try:
        doc2vec = models.Doc2Vec.load(doc2vec_fname)
    except IOError:
        print('Training Doc2Vec model, this may take a long time')
        documents = TaggedLineDocument(docs_fname)
        doc2vec = models.doc2vec.Doc2Vec(documents=documents, workers=4)
        doc2vec.save(doc2vec_fname)
    return doc2vec
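
# Hedged usage sketch (paths are hypothetical): the function loads a cached model
# when one exists, otherwise it trains from the line-per-document text file.
# `corpus` and `dictionary` are never touched inside the body above, so simple
# placeholders suffice for illustration.
d2v = check_for_doc2vecmodel("models/news.d2v", "data/news_lines.txt",
                             corpus=None, dictionary=None)
vec = d2v.infer_vector("an unseen sentence to embed".split())
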
Example #8
def dataset():
    df = open('data/blog_notopic.txt','r',encoding='utf-8')
    blogs=TaggedLineDocument(df)
    
    # count the number of x_train samples
    '''count=-1
    for count, line in enumerate(open('data/blog_notopic.txt', 'r',encoding='utf-8')):
        pass
    count += 1'''
    return blogs
Example #9
    def train(self, source_corpus_path, update=False):
        """
        Train an uninitialized model using a corpus.
        Each line in the corpus should be the words of a sentence separated by spaces.

        :param source_corpus_path: Path to corpus.
        :param update: Update vocab.
        :return: Nothing.
        """
        documents = TaggedLineDocument(source_corpus_path)
        self.model.build_vocab(documents, update=update)
        self.model.train(documents, total_examples=self.model.corpus_count, epochs=self.model.iter)
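
# Sketch of the surrounding wrapper class this method assumes (names here are
# hypothetical): self.model must be an uninitialized gensim Doc2Vec. The method
# reads self.model.iter, the pre-4.0 name of the epochs attribute, so a
# gensim 3.x install is assumed.
from gensim.models.doc2vec import Doc2Vec, TaggedLineDocument

class Doc2VecTrainer:
    def __init__(self, vector_size=100, min_count=2, workers=4):
        self.model = Doc2Vec(vector_size=vector_size, min_count=min_count,
                             workers=workers)
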
def prepare_training_data(sentences, CONFIGURATION):

    #sentences = stem(CONFIGURATION, sentences)
    ctr = 0
    with open(CONFIGURATION.rundir + "w2v_training_material.csv",
              mode="w+",
              encoding="UTF-8") as f:
        for sent in sentences:
            tmp = list()
            for expression in sent:
                if 'http://' not in expression:
                    expression_new = list()
                    for word in expression.split(' '):
                        word = (ps.stem(re.sub('[^A-z0-9<>]', '',
                                               word.lower())))
                        #if len(word)>2:
                        #    words = #[word[i:i+3] for i in range(len(word)-3+1)]
                        #else:
                        words = [word]
                        expression_new = expression_new + words
                    expression = expression_new
                else:
                    expression = [re.sub('[\r\n]', '', expression)]
                #if not " ".join(expression) == '' and len(" ".join(expression))>1:
                tmp = tmp + expression
            if len(tmp) > 0:
                for x in tmp:
                    f.write(str(x) + " ")
                f.write("\n")
                ctr += 1
        #f.write("<> <>\n")

    from gensim.test.utils import datapath
    from gensim.models.doc2vec import TaggedLineDocument
    sentences = [
        document for document in TaggedLineDocument(
            datapath(CONFIGURATION.rundir + "w2v_training_material.csv"))
    ]

    #x = tuplize(sentences, CONFIGURATION)

    #x = eliminate_rare_and_frequent_terms(x)

    #documents = list()
    #for index, row in x.iterrows():
    #    documents.append([str(row[0])] + [str(row[1])])

    #documents = literalize(documents)

    documents = sentences

    return documents
Example #11
def trainDoc2Vector(sentence_count, vector_dimension):
    # train and save the model
    sentences = TaggedLineDocument('sources/splited_words.txt')
    model = Doc2Vec(sentences, size=vector_dimension, window=8, min_count=2, workers=multiprocessing.cpu_count())
    model.train(sentences, total_examples=sentence_count, epochs=model.iter)
    model.save('result/doc2vec.model')
    # save vectors
    out = open('result/doc2vec.vector', mode='w+', encoding='utf-8')
    for index in range(0, sentence_count, 1):
        docvec = model.docvecs[index]
        out.write(' '.join(str(f) for f in docvec) + "\n")

    out.close()
def build_doc2vec_model():
    # Creating labeled sentences from training data
    sentences = TaggedLineDocument('bulk-total.txt')
    model = Doc2Vec(alpha=0.1,
                    size=30,
                    window=10,
                    min_count=5,
                    dm=0,
                    dbow_words=1,
                    iter=10)
    model.build_vocab(sentences)
    model.train(sentences, total_examples=model.corpus_count, epochs=10)  # corpus has 81863 lines
    model.save('../models/clpsych-30dim-large.d2v')
    def process(self):

        log.info("Commencing execution")

        tagged_docs = TaggedLineDocument(self.labeled_articles_file_path)

        log.info("Training Doc2Vec model")
        doc2vec_model = doc2vec_helper.init_model(tagged_docs)
        doc2vec_model.save(self.doc2vec_model_file_path)
        log.info("Learnt vocab from training set and saved doc2vec model")

        x_train = list()
        with open(self.labeled_articles_file_path) as training_set:
            for line in training_set:
                x_train.append(doc2vec_model.infer_vector(line.split()))

        y_train = [0] * self.samples_per_class_train
        y_train.extend([1] * self.samples_per_class_train)

        x_test = list()
        with open(self.articles_source_file_path) as test_set:
            for line in test_set:
                x_test.append(doc2vec_model.infer_vector(line.split()))

        y_true = [1] * self.samples_per_class_test
        y_true.extend([0] * self.samples_per_class_test)

        ml_model_logreg = scikit_ml_helper.train_logistic_reg_classifier(x_train, y_train)
        scikit_ml_helper.persist_model_to_disk(ml_model_logreg, self.ml_model_file_path)
        y_pred = ml_model_logreg.predict(x_test)
        log.info("Logistic Regression")
        log.info("Precision: " + str(metrics.precision_score(y_pred=y_pred, y_true=y_true)))
        log.info("Recall: " + str(metrics.recall_score(y_pred=y_pred, y_true=y_true)))
        log.info("Accuracy: " + str(metrics.accuracy_score(y_pred=y_pred, y_true=y_true)))

        ml_model_svm = scikit_ml_helper.train_svm_classifier(x_train, y_train)
        y_pred = ml_model_svm.predict(x_test)
        log.info("SVM")
        log.info("Precision: " + str(metrics.precision_score(y_pred=y_pred, y_true=y_true)))
        log.info("Recall: " + str(metrics.recall_score(y_pred=y_pred, y_true=y_true)))
        log.info("Accuracy: " + str(metrics.accuracy_score(y_pred=y_pred, y_true=y_true)))

        ml_model_nb = scikit_ml_helper.train_gnb_classifier(x_train, y_train)
        y_pred = ml_model_nb.predict(x_test)
        log.info("Naive Bayes")
        log.info("Precision: " + str(metrics.precision_score(y_pred=y_pred, y_true=y_true)))
        log.info("Recall: " + str(metrics.recall_score(y_pred=y_pred, y_true=y_true)))
        log.info("Accuracy: " + str(metrics.accuracy_score(y_pred=y_pred, y_true=y_true)))

        log.info("Completed execution")
Example #14
def calculate_and_save_word2vec_dict(words_list, files):
    dataset_file = os.path.join(word2vec_taget_dir, 'dataset.txt')
    FileProcessor(dataset_file).file_write(
        'utf8', u''.join([words + u'\n' for words in words_list][:-1]))
    doces = TaggedLineDocument(dataset_file)

    doc2Vec_model = doc2vec.Doc2Vec(doces, size=200, window=10, workers=4)
    doc2Vec_model.train(doces,
                        total_examples=doc2Vec_model.corpus_count,
                        epochs=200)
    doc2Vec_model.save(os.path.join(word2vec_taget_dir, 'doc2vec_model.txt'))
    FileProcessor(os.path.join(word2vec_taget_dir, 'tagged_map.txt'))\
        .file_write('utf8', u''.join([u'{0} {1} \n'.format(index, value.decode('utf8')) for index, value in enumerate(files)]))

    return doc2Vec_model
Example #15
def train():
    tagged = TaggedLineDocument(filetgge)
    model = Word2Vec(alpha=0.025,
                     min_alpha=0.025,
                     size=50,
                     window=5,
                     min_count=5,
                     workers=8)
    model.build_vocab(tagged)
    for i in range(10):
        model.train(tagged)
        model.alpha -= 0.0002  # decrease the learning rate
        model.min_alpha = model.alpha  # fix the learning rate, no decay

    # model.save_word2vec_format(filetgge+'.model')
    model.save(filetgge + '.model')
Example #16
def main():
    model_file = datahub.get_full_path("doc2vec.model")
    if not Path(model_file).exists():
        tokenized_file = datahub.get_full_path("articles1_token.txt")
        if not Path(tokenized_file).exists():
            print("data is not yet tokenized and saved. Doing now.")
            print("loading data")
            data_list = datahub.load_data(datahub.get_full_path("articles1.csv"))
            print("tokenizing data")
            tokenized_content_list = datahub.tokenize_content(data_list)
            print("saving tokenizing data in txt file")
            datahub.save_tagged_data(tokenized_content_list, tokenized_file)

        vector_size = 50
        epochs = 100
        print("training model")
        trained_model = v_model.train_model(TaggedLineDocument(tokenized_file), vector_size, epochs)
        print("saving model")
        v_model.save_model(trained_model, model_file)
        print("finish")
    else:
        print("{} already exists. Exiting.".format(model_file))
Example #17
def get_tagged_sentences(filepath):
    sentences = TaggedLineDocument(filepath)
    return sentences
Example #18
        chunksize = int(math.ceil(n / float(procs)))
        with gzip.open(base_output_path + 'docs_songs.txt.gz',
                       'w') as fout, gzip.open(
                           base_output_path + 'indices.txt.gz',
                           'w') as indices:
            for userid, doc in tq(pool.imap_unordered(get_songs,
                                                      files,
                                                      chunksize=100),
                                  total=n):
                fout.write(doc + '\n')
                indices.write(userid + '\n')

    with timed('Loading docs'):
        #documents = TaggedLineDocument(base_output_path+'docs_artist_blocks.txt.gz')
        #documents = [doc for doc in tq(TaggedLineDocument(base_output_path+'docs_songs.txt.gz'))]
        documents = TaggedLineDocument(base_output_path + 'docs_songs.txt.gz')

    with timed('Running model'):
        model = Doc2Vec(documents,
                        size=dim,
                        window=win,
                        min_count=min_count,
                        workers=procs)

    with timed('Saving results'):
        # from sklearn.preprocessing import Normalizer
        # nrm = Normalizer('l2')
        # normed = nrm.fit_transform(model.docvecs.doctag_syn0)
        # words_normed = nrm.fit_transform(model.syn0)

        # np.save(output_path+'/doc_features_normed-{}-{}-{}.npy'.format(dim,win,min_count),normed)
Example #19
    user_dict[line.split()[0]] = ''
u_f.close()

for line in f_b_t.readlines():
    if utils.key_in_dic(line.replace('\n', '').split('\t')[0], user_dict):
        p_f.write(line)
    else:
        pass
f_b_t.close()
p_f.close()

#
from gensim.models.doc2vec import TaggedLineDocument, Doc2Vec
user_tranj_vec = '../data/user_tranj_vec.txt'

documents = TaggedLineDocument(doc_file)
model = Doc2Vec(documents,
                size=128,
                negative=10,
                window=8,
                hs=0,
                min_count=0,
                workers=15,
                iter=30)

user_id_list = []
u_f = open(user_file)
for line in u_f:
    user_id_list.append(line.split('\n')[0])
u_f.close()
Example #20
import logging
import os.path
import sys
import multiprocessing

from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedLineDocument

if __name__ == '__main__':
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)

    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info("running %s" % ' '.join(sys.argv))

    # check and process input arguments
    if len(sys.argv) < 4:
        print globals()['__doc__'] % locals()
        sys.exit(1)
    inp, outp1, outp2 = sys.argv[1:4]

    model = Doc2Vec(TaggedLineDocument(inp),
                    size=200,
                    window=5,
                    min_count=5,
                    workers=multiprocessing.cpu_count())

    # trim unneeded model memory = use(much) less RAM
    #model.init_sims(replace=True)
    model.save(outp1)  #save dov2vec
    model.save_word2vec_format(outp2, binary=False)  #save word2vec
Example #21
# docLabels = ["input.txt"]
# data = []
# for doc in docLabels:
#     data.append(open(doc, 'r'))
#     print(data)
#
# it = LabeledLineSentence(data, [1])
# # print(it)
import gensim
from gensim.models.doc2vec import TaggedLineDocument

questions_path = "train_questionsq.txt"
answers_path = "train_answersq.txt"

questions = open(questions_path, 'r')
answers = open(answers_path, 'r')

doc = TaggedLineDocument("input.txt")

model = gensim.models.Doc2Vec(size=100,
                              window=10,
                              min_count=1,
                              workers=11,
                              alpha=0.025,
                              min_alpha=0.025)  # use fixed learning rate

model.build_vocab(doc)

model.iter = 300

model.train(doc, total_examples=model.corpus_count, epochs=model.iter)
#
# for epoch in range(10):
Example #22
if not os.path.exists(base_output_path+'docs_artist_blocks.txt.gz'):
    with gzip.open(base_output_path+'docs_artist_blocks.txt.gz','w') as fout, gzip.open(base_output_path+'indices.txt.gz','w') as indices:
        files = sorted(glob.glob(scrobble_path+'*.txt'))
        for fi in tq(files):
            artists = [line.split('\t')[1] for line in open(fi)]
            last = None
            blocks = []
            for a in tq(artists):
                if a != last:
                    blocks.append(a)
                last = a
            doc = ' '.join(blocks)
            fout.write(doc+'\n')
            userid = fi[fi.rfind('\\')+1:-4]
            indices.write(userid+'\n')
documents = [doc for doc in tq(TaggedLineDocument(base_output_path+'docs_artist_blocks.txt.gz'))]



%time model = Doc2Vec(documents, size=dim, window=win, min_count=min_count,workers=workers)

dpath = 'P:/Projects/BigMusic/jared.data/d2v/artist_dict.pkl'
if not os.path.exists(dpath):
    artist_dict = {}
    for line in tq(open('P:/Projects/BigMusic/jared.rawdata/lastfm_itemlist.txt')):
        line = line.split('\t')
        if line[1]=='0':
            artist_dict[line[2]] = line[0]
    cPickle.dump(artist_dict,open(dpath,'wb'))
else:
    artist_dict = cPickle.load(open(dpath))
Example #23
    def train(cls):
        model = Doc2Vec(documents=TaggedLineDocument(cls.corpus_path), vector_size=300, window=5, min_count=1, workers=4)
        model.save(config.model_path.format('d2v.model'))
Example #24
import sys
from gensim.models.doc2vec import Doc2Vec, TaggedLineDocument

file = sys.argv[1]
epochs = int(sys.argv[2])
words = sys.argv[3].split(' ')

steps = 50

docs = TaggedLineDocument(file)

model = Doc2Vec(docs, min_count=1, epochs=epochs)

docs_list = list(docs)

to_docstr = lambda x: ' '.join(docs_list[x].words)

print(f'--- similar : {to_docstr(0)} ---')

for i, p in model.docvecs.most_similar(0):
    print(f'{p}, {to_docstr(i)}')

print('')
print(f'--- similar : {words} ---')

x = model.infer_vector(words, steps=steps)

for tag, p in model.docvecs.most_similar([x]):
    print(f'{p}, {to_docstr(tag)}')
Example #25
def d2ctest():
    documents = TaggedLineDocument("new_text2.txt")
    model = Doc2Vec(documents, size=10, window=2, min_count=1, workers=1)
    print(model)
Example #26
import sys

import numpy as np
from gensim.models import Word2Vec
from gensim.models.doc2vec import TaggedLineDocument
from keras.models import Sequential, Model
from keras.layers import GRU, Dropout
from keras.optimizers import Adam

data_file = sys.argv[1]
dest_file_prefix = sys.argv[2]

epoch = int(sys.argv[3])
batch = int(sys.argv[4])

wv_size = 200
wv_epoch = 2000

num_unit = 512
input_size = (10, )

docs = TaggedLineDocument(data_file)

words = [d.words for d in docs]

wv_model = Word2Vec(words, wv_size, min_count=1, iter=wv_epoch)

input_size += (wv_model.vector_size, )

word_maxlen = np.max([len(w) for w in words])


def discriminator(input_shape):
    model = Sequential()

    model.add(GRU(num_unit, input_shape=input_shape))
    model.add(Dropout(0.3))
model_file = sys.argv[1]
data_file = sys.argv[2]
gen_size = int(sys.argv[3])

questions = [q.split(' ') for q in sys.argv[4].split(';')]

steps = 50
base_word_prob = 0.7
prob_weight = 1.5
keyword_rate = 2
replace_targets = ['名詞', '形容詞']  # Japanese POS tags: noun and adjective

model = Doc2Vec.load(model_file)

docs_list = list(TaggedLineDocument(data_file))

docs_list_str = [''.join(d.words) for d in docs_list]

is_replace_target = lambda t: np.any(
    [t.part_of_speech.startswith(trg) for trg in replace_targets])


def random_choice(cd):
    probs = np.exp(np.array([p for _, p in cd]) * prob_weight)
    probs /= probs.sum()

    return np.random.choice([d for d, _ in cd], p=probs)


adjust_prob = lambda c, q: (c[0], c[1] * keyword_rate) if c[0] in q else c
Example #28
from gensim.models.doc2vec import Doc2Vec
from gensim.models.doc2vec import TaggedLineDocument
from time import localtime, strftime

# document for training
doc_path = 'dota_picks'
corpus = TaggedLineDocument(doc_path)

# Doc2Vec parameters; self explanatory
vector_size = 50
window_size = 5
min_count = 0
sampling_threshold = 1e-4
negative_size = 5
train_epoch = 100
dm = 0  #0 = dbow; 1 = dmpv
worker_count = 8  #number of parallel processes

model = Doc2Vec(size=vector_size,
                window=window_size,
                min_count=min_count,
                sample=sampling_threshold,
                workers=worker_count,
                hs=0,
                dm=dm,
                negative=negative_size,
                dbow_words=1,
                dm_concat=1)
print("Building Vocab:", strftime("%a, %d %b %Y, %H:%M:%S", localtime()))
model.build_vocab(corpus)
print("Built Vocab:", strftime("%a, %d %b %Y, %H:%M:%S", localtime()))
Example #29
#-*-coding:utf-8-*-
from __future__ import division
from gensim.models.doc2vec import Doc2Vec, TaggedLineDocument
# import numpy.linalg
import numpy as np
import math
import scipy
from PIL import Image, ImageDraw
input_file = r"H:\network_diagnosis_data\test\GTPC_TUNNEL_PATH_BROKEN.3054.txt"
sentences = TaggedLineDocument(input_file)
dim = 1000
model = Doc2Vec(alpha=0.025, min_alpha=0.025, size=dim)  # default size is 300 dimensions
model.build_vocab(sentences)

for epoch in range(10):
    model.train(sentences)
    model.alpha -= 0.002
    model.min_alpha = model.alpha
model.save(r'.\data\test_d2v')
# print model.infer_vector([u'people', u'like', u'words'])

total_num = model.docvecs.count
# print total_num
# print len( model.docvecs[0] )
para_vec = []
for i in xrange(total_num):
    if i == 0:
        para_vec = model.docvecs[i]
        continue
    para_vec = np.vstack((para_vec, model.docvecs[i]))
print para_vec
Example #30
import pandas as pd
import numpy as np
from time import time
import pickle

import gensim
from gensim import corpora, models, similarities
from gensim.models.doc2vec import TaggedDocument, TaggedLineDocument
from gensim.models import Doc2Vec
import gensim.models.doc2vec

print('loading docs...')
start_time = time()
documents = [
    doc for doc in TaggedLineDocument('volume2/processed_body_docs.txt')
]
print("--- %s seconds ---" % (time() - start_time))

#documents = []
#with open('/volume/processed_body_docs.txt') as f:
#    for line in f:
#        documents.append(TaggedLineDocument(line))

print('training doc2vec model...')
start_time = time()
model = Doc2Vec(documents,
                vector_size=200,
                window=5,
                min_count=5,
                workers=14,
                epochs=20)