def test_dmc_neg(self):
    """Test DM/concatenate doc2vec training."""
    model = doc2vec.Doc2Vec(
        list_corpus, dm=1, dm_concat=1, size=24, window=4,
        hs=0, negative=10, alpha=0.05, min_count=2, iter=20,
    )
    self.model_sanity(model)
def test_deterministic_hs(self):
    """Test doc2vec results identical with identical RNG seed."""
    # hs
    model = doc2vec.Doc2Vec(DocsLeeCorpus(), seed=42, workers=1)
    model2 = doc2vec.Doc2Vec(DocsLeeCorpus(), seed=42, workers=1)
    self.models_equal(model, model2)
# if not os.path.exists(model_path):
# sentencess = LabeledLineSentence
# train_list = get_namelist(train_path)
# sentences = d2v.TaggedLineDocument(path_main + doc_labeled)
sentences = list(read_corpus(output))
sentences_test = list(read_corpus(output_test))
sentences.extend(sentences_test)
print(sentences_test)
# testset_tranning = get_testset_update_trainning(True)
# sentences.extend(testset_tranning)
# print(testset_tranning)
model = d2v.Doc2Vec(min_count=1, alpha=0.02, min_alpha=0.015, window=100,
                    size=150, sample=1e-4, negative=6, workers=8)
model.build_vocab(sentences)
model.train(sentences, total_examples=model.corpus_count, epochs=120)  # start_alpha=0.02, end_alpha=0.015
"""
documents (iterable of iterables) – The documents iterable can be simply a list of
TaggedDocument elements, but for larger corpora, consider an iterable that streams the
documents directly from disk/network. If you don’t supply documents, the model is left
uninitialized – use if you plan to initialize it in some other way.
"""
# model = d2v.Doc2Vec(sentences, min_count=2, alpha=0.02, min_alpha=0.015, window=60,
#                     size=180, sample=1e-4, negative=5, iter=5, workers=8)
model.save(model_path)
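# A minimal sketch (not part of the snippet above) of the streaming pattern recommended by the
# quoted gensim docs: instead of materialising every TaggedDocument in a list, yield them lazily
# from disk so the model can re-iterate over the corpus without holding it in memory. The file
# name 'corpus.txt' and the one-document-per-line, whitespace-tokenised format are assumptions
# made purely for illustration.
from gensim.models import doc2vec


class StreamingCorpus:
    """Re-iterable corpus that streams TaggedDocument objects from a text file."""

    def __init__(self, path):
        self.path = path

    def __iter__(self):
        with open(self.path, encoding='utf-8') as handle:
            for i, line in enumerate(handle):
                yield doc2vec.TaggedDocument(words=line.split(), tags=[i])


# Usage sketch: pass the streaming corpus where a list of TaggedDocument would otherwise go.
# model = doc2vec.Doc2Vec(StreamingCorpus('corpus.txt'), size=150, min_count=1, workers=8)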
    'size': 100,
    'dm_concat': 1,
    'dm': 1,
    'min_count': 2
}
# Embed into a 100-dimensional space; dm=1 (use PV-DM); dm_concat=1 (when building the hidden
# layer, concatenate the word vectors with the paragraph vector -- the paper reports that
# concatenation works better than averaging); min_count=2 (keep only words that appear at least twice).

# The documents have to be turned into namedtuples (with 'words' and 'tags') before doc2vec can train on them.
economic_tagged_document = namedtuple('economic_tagged_document', ['words', 'tags'])

# Build the namedtuples
economic_tagged_tr_document = [
    economic_tagged_document(words, [tags])
    for tags, words in enumerate(economic_stop)
]

# Create the doc2vec object and set its hyperparameters
economic_doc_vectorizer = doc2vec.Doc2Vec(**config)

# Build the vocabulary
economic_doc_vectorizer.build_vocab(economic_tagged_tr_document)

# 100 epochs
for epoch in range(100):
    print(epoch)
    # Train on the training documents
    economic_doc_vectorizer.train(economic_tagged_tr_document)
    # Learning rate decay
    economic_doc_vectorizer.alpha -= 0.002
    # Fix the minimum learning rate at the current value
    economic_doc_vectorizer.min_alpha = economic_doc_vectorizer.alpha

oshumed_tagged_document = namedtuple('oshumed_tagged_document', ['words', 'tags'])
oshumed_tagged_tr_document = [
def test_persistence(self):
    """Test storing/loading the entire model."""
    tmpf = get_tmpfile('gensim_doc2vec.tst')
    model = doc2vec.Doc2Vec(DocsLeeCorpus(), min_count=1)
    model.save(tmpf)
    self.models_equal(model, doc2vec.Doc2Vec.load(tmpf))
num_features = 100
model_name = "sec_filings_model_{}.d2v".format(num_features)

# prepare training data
sentences = filings_iterator(
    tokenizer=nltk.data.load('tokenizers/punkt/english.pickle'),
    N=10000,
    useDB=True,
    year_start=year_start,
    year_end=year_end)

model = doc2vec.Doc2Vec(
    sentences,
    size=num_features,
    min_count=1,
    seed=5,
    window=20,
    sample=1e-3,
    # hashfxn=analyze.hash32,
    workers=4)

# model.init_sims(replace=True)
# model.save(model_name)

# ==============================================================
# retrieve features for each document and store them in a file
# ==============================================================
sentences = filings_iterator(
    tokenizer=nltk.data.load('tokenizers/punkt/english.pickle'),
    N=26761,
    useDB=True,
def test_dbow_neg(self):
    """Test DBOW doc2vec training."""
    model = doc2vec.Doc2Vec(list_corpus, dm=0, hs=0, negative=10, min_count=2, epochs=20)
    self.model_sanity(model)
for d in rawDoc.keys():
    docs.append(analyzedDocument(rawDoc[d], [d]))

alpha_val = 0.025
min_alpha_val = 1e-4
passes = 40
alpha_delta = (alpha_val - min_alpha_val) / (passes - 1)

# model = doc2vec.Doc2Vec(vector_size=FLAGS.embedding_size, dm=1,
#                         sample=1e-3, negative=5,
#                         window=2, min_count=0,
#                         workers=4, epochs=10)
model = doc2vec.Doc2Vec(vector_size=FLAGS.embedding_size,
                        window=FLAGS.window_size,
                        min_count=2,
                        workers=4,
                        epochs=10)
model.build_vocab(docs)  # Build the vocabulary

for epoch in range(passes):
    random.shuffle(docs)
    model.alpha, model.min_alpha = alpha_val, alpha_val
    model.train(docs, total_examples=len(docs), epochs=model.iter)
    alpha_val -= alpha_delta

# Save the model
model.save(os.path.join(ROOT_DIR, FLAGS.dataset, 'doc2vecFile'))
#!/usr/bin/env python
# -*- coding:utf-8 -*-
from gensim.models import doc2vec
import numpy as np

sentences = doc2vec.TaggedLineDocument("result.txt")
model = doc2vec.Doc2Vec(sentences, size=20, window=3, min_count=3, workers=4, iter=60)
# model.build_vocab(sentences)
model.train(sentences, total_examples=model.corpus_count, epochs=model.iter)
print('#########', model.vector_size)
corpus = model.docvecs
np.save("d2v.corpus.arff", corpus)
questions = processed_questions
del processed_questions

######################################### Create Doc2vec #########################################
from gensim.models import doc2vec

# Hyperparameters for the doc2vec model
num_workers = multiprocessing.cpu_count()
context_size = 5
num_features = 50
min_word_Count = 1

model = doc2vec.Doc2Vec(questions,
                        size=num_features,
                        window=context_size,
                        min_count=min_word_Count,
                        workers=num_workers)

############################# create ques Vector ###################################
# questions_vec = np.array((len(questions), 50))
questions_vec = []  # storing all vectors of questions
for i in range(len(questions)):
    vec = model.docvecs[i]
    questions_vec.append(vec)

############################# create labels ###################################
question_type = ["who", "what", "when", "affirmation", "unknown"]
def Updates():
    try:
        print("updating Doc2Vec")
        print(updating)
        a = stem.snowball.ArabicStemmer()
        stopwords_list = stopwords.words('arabic')
        df = pd.read_csv('textc-Copy1.csv', encoding='utf-8')
        df["contenu"].fillna("محتوى فارغ", inplace=True)
        df["article"].fillna("محتوى فارغ", inplace=True)
        y = df['ToF']
        df = df.drop('ToF', axis=1)

        text = []
        for i in range(df.shape[0]):
            x = nltk.tokenize.wordpunct_tokenize(df.contenu[i])
            text1 = [a.stem(word) for word in x]
            text.append(text1)
        titre = [
            a.stem(word) for word in df.article if word not in stopwords_list
        ]

        # doc2vec
        docs = []
        analyzedDocument = namedtuple('AnalyzedDocument', 'words tags')
        for i, te in enumerate(text):
            tags = [i]
            docs.append(analyzedDocument(te, tags))
        model = doc2vec.Doc2Vec(docs,
                                vector_size=300,
                                non_negative=True,
                                window=8,
                                min_count=1,
                                workers=4,
                                dm=1)

        from gensim.test.utils import get_tmpfile
        fname = get_tmpfile("doc2vec.model")
        model.save(fname)
        model = doc2vec.Doc2Vec.load(fname)

        print("updating fastext")

        class MyItera(object):
            def __iter__(self):
                for line in Corpus.article:
                    filtered_sentence = []
                    for w in tokenize(line):
                        if w not in stop_words:
                            filtered_sentence.append(w)
                    yield filtered_sentence

        class MyIter(object):
            def __iter__(self):
                for line in Corpus.contenu:
                    filtered_sentence = []
                    for w in tokenize(line):
                        if w not in stop_words:
                            filtered_sentence.append(w)
                    yield filtered_sentence

        model = FastText(size=150, window=3, min_count=1)
        model.build_vocab(sentences=MyIter())
        total_examples = model.corpus_count
        model.train(sentences=MyIter(), total_examples=total_examples, epochs=5)
    except:
        Update()
def prepare_doc2vec(sentences, embedding_size: int, model_path: str):
    documents = [doc2vec.TaggedDocument(doc, [i]) for i, doc in enumerate(sentences)]
    model = doc2vec.Doc2Vec(documents, vector_size=embedding_size, min_count=1, workers=4)
    model.save(model_path)
def apply_pipeline(train_from_scratch=True,
                   avoid_inference=False,
                   shuffle_corpus=True,
                   include_genres=False,
                   include_categories=True,
                   include_app_ids=True,
                   verbose=False):
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

    game_names, game_tags = load_game_names(include_genres, include_categories)

    steam_tokens = load_tokens()

    documents = list(read_corpus(steam_tokens, game_tags, include_app_ids))

    if shuffle_corpus:
        # « Only if the training data has some existing clumping – like all the examples with certain words/topics are
        # stuck together at the top or bottom of the ordering – is native ordering likely to cause training problems.
        # And in that case, a single shuffle, before any training, should be enough to remove the clumping. »
        # Reference: https://stackoverflow.com/a/48080869
        random.shuffle(documents)

    if train_from_scratch:
        print('Creating a new Doc2Vec model from scratch.')
        model = doc2vec.Doc2Vec(documents,
                                vector_size=100,
                                window=5,
                                min_count=5,
                                epochs=20,
                                workers=multiprocessing.cpu_count())

        # NB: Do not follow the piece of advice given in https://rare-technologies.com/doc2vec-tutorial/
        # « I have obtained better results by iterating over the data several times and either:
        # 1. randomizing the order of input sentences, or
        # 2. manually controlling the learning rate over the course of several iterations. »
        # Indeed, in my experience, this leads to buggy results. Moreover, this approach is not recommended according to
        # https://stackoverflow.com/a/48080869

        model.save(get_doc_model_file_name())
    else:
        print('Loading previous Doc2Vec model.')
        model = doc2vec.Doc2Vec.load(get_doc_model_file_name())

    # Test doc2vec
    if verbose:
        try:
            # Spelunky + (Slay the Spire) - (Dream Quest)
            check_analogy(model, pos=['239350', '646570'], neg=['557410'])
        except TypeError:
            pass

        try:
            # Half-Life + (Witcher 2) - (Witcher)
            check_analogy(model, pos=['70', '20920'], neg=['20900'])
        except TypeError:
            pass

        query_app_ids = ['620', '364470', '504230', '583950', '646570', '863550', '794600']

        for query_app_id in query_app_ids:
            print('Query appID: {} ({})'.format(query_app_id, game_names[query_app_id]))
            compute_similarity_using_doc2vec_model(query_app_id, steam_tokens, model,
                                                   avoid_inference=avoid_inference,
                                                   num_items_displayed=10)

        # Check the relevance of the corresponding word2vec
        for query_word in ['anime', 'fun', 'violent']:
            compute_similarity_using_word2vec_model(query_word, steam_tokens, model)

        entity = get_doc_model_entity(model)
        tag_entity = set(tag for tag in entity if 'appID_' not in tag)

        print(tag_entity)

        query_tags = ['In-App Purchases', 'Free to Play', 'Violent', 'Early Access']

        for query_tag in tag_entity.intersection(query_tags):
            for query_app_id in query_app_ids:
                try:
                    sim = model.docvecs.similarity(get_tag_prefix() + query_app_id, query_tag)
                    print('Similarity = {:.0%} for tag {} vs. appID {} ({})'.format(
                        sim, query_tag, query_app_id, game_names[query_app_id]))
                except KeyError:
                    pass

        num_items_displayed = 3
        for query_tag in tag_entity:
            print('\nTag: {}'.format(query_tag))
            similarity_scores_as_tuples = model.docvecs.most_similar(positive=query_tag,
                                                                     topn=num_items_displayed)
            similarity_scores = reformat_similarity_scores_for_doc2vec(similarity_scores_as_tuples)
            print_most_similar_sentences(similarity_scores, num_items_displayed=num_items_displayed)

    # Top 100
    query_app_ids = load_benchmarked_app_ids(append_hard_coded_app_ids=True)

    num_neighbors = 10
    only_print_banners = True
    use_cosine_similarity = True

    label_database = np.array(model.docvecs.vectors_docs)
    doc_tags = list(model.docvecs.doctags.keys())

    init_indices = np.array(range(len(doc_tags)))
    bool_indices_to_remove = list(map(lambda x: not x.startswith(get_tag_prefix()), doc_tags))
    indices_to_remove = init_indices[bool_indices_to_remove]
    label_database = np.delete(label_database, indices_to_remove, axis=0)

    app_ids = [int(doc_tag[len(get_tag_prefix()):]) for doc_tag in doc_tags
               if doc_tag.startswith(get_tag_prefix())]

    knn = prepare_knn_search(label_database, use_cosine_similarity=use_cosine_similarity)

    query_des = None

    for query_app_id in query_app_ids:
        if avoid_inference:
            inferred_vector = label_database[app_ids.index(query_app_id)]
        else:
            # From query appID to query feature vector
            query = steam_tokens[str(query_app_id)]
            # Caveat: « Subsequent calls to this function may infer different representations for the same document. »
            # Reference: https://radimrehurek.com/gensim/models/doc2vec.html#gensim.models.doc2vec.Doc2Vec.infer_vector
            inferred_vector = model.infer_vector(query)

        if query_des is None:
            query_des = inferred_vector
        else:
            query_des = np.vstack((query_des, inferred_vector))

    # Matching of feature vectors
    matches = perform_knn_search_with_vectors_as_input(query_des, knn, num_neighbors)

    # From feature matches to appID matches
    matches_as_app_ids = transform_matches_to_app_ids(matches, app_ids)

    print_ranking(query_app_ids,
                  matches_as_app_ids,
                  num_elements_displayed=num_neighbors,
                  only_print_banners=only_print_banners)

    return
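# A small sketch (not part of the pipeline above) illustrating the infer_vector caveat quoted in
# the comments: inference is itself a stochastic optimisation, so repeated calls can return
# slightly different vectors for the same document. Raising the number of inference epochs
# (called 'steps' in older gensim releases) and/or averaging several runs is a common way to get
# a more stable vector. The helper name, token source and epoch counts below are illustrative
# assumptions, not part of the original code.
import numpy as np


def stable_infer(model, tokens, n_runs=5, epochs=50):
    """Average several inference runs to reduce run-to-run variance."""
    vectors = [model.infer_vector(tokens, epochs=epochs) for _ in range(n_runs)]
    return np.mean(vectors, axis=0)


# Usage sketch:
# inferred_vector = stable_infer(model, steam_tokens[str(query_app_id)])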
#%%
from gensim.models import doc2vec
import pandas as pd

#%%
all_data = pd.read_csv('all_data.csv')

#%%
# Build the corpus
documents = []
for i, sentence in enumerate(all_data['clean_data']):
    documents.append(doc2vec.TaggedDocument(sentence, [i]))

#%%
# Build the model
model = doc2vec.Doc2Vec(documents, min_count=1, window=15, size=100,
                        sample=1e-3, negative=5, workers=4)
model.train(documents, total_examples=model.corpus_count, epochs=10)
model.save('doc2vec.model')
# print(len(model.docvecs))
def train_doc2vec():
    doc2vec_model = doc2vec.Doc2Vec(size=100, window=8, min_count=5, workers=8,
                                    iter=30, alpha=1e-2, min_alpha=1e-2)
    doc2vec_model.build_vocab(iter_docs_queries())
    doc2vec_model.train(iter_docs_queries(),
                        total_examples=doc2vec_model.corpus_count,
                        epochs=doc2vec_model.iter)
    doc2vec_model.save('doc2vec_weigths')
def main():
    # Set up logging configuration
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

    model = doc2vec.Doc2Vec(hashfxn=myhash)
    # Load the trained model
    model = doc2vec.Doc2Vec.load("../../classifier/Doc2VectforNLPTraining")

    word_vectors = model.syn0
    num_clusters = int(word_vectors.shape[0] / 5)
    # print("number of clusters: {}".format(num_clusters))

    print("Clustering...")
    startTime = time.time()
    cluster_index = cfun.kmeans(num_clusters, word_vectors)
    endTime = time.time()
    print("Time taken for clustering: {} minutes".format((endTime - startTime) / 60))

    # Save clusters
    clusterf = open("../../classifier/doc2vec/clusterIndex.pickle", "wb")
    pickle.dump(cluster_index, clusterf)

    # Create a word/index dictionary, mapping each vocabulary word to a cluster number.
    # zip(): make an iterator that aggregates elements from each of the iterables.
    index_word_map = dict(zip(model.index2word, cluster_index))

    train = pd.read_csv("../../data/labeledTrainData.tsv", header=0, delimiter="\t", quoting=3)
    test = pd.read_csv("../../data/testData.tsv", header=0, delimiter="\t", quoting=3)

    # Create feature vectors for training data
    trainingDataFV = np.zeros((train["review"].size, num_clusters), dtype=np.float)
    # Create feature vectors for test data
    testDataFV = np.zeros((test["review"].size, num_clusters), dtype=np.float)

    # Populate feature vectors after cleaning the data
    print("Processing training data...")
    counter = 0
    cleaned_training_data = preProc.clean_data(train)
    for review in cleaned_training_data:
        trainingDataFV[counter] = cfun.create_bag_of_centroids(review, num_clusters, index_word_map)
        counter += 1

    print("Processing test data...")
    counter = 0
    cleaned_test_data = preProc.clean_data(test)
    for review in cleaned_test_data:
        testDataFV[counter] = cfun.create_bag_of_centroids(review, num_clusters, index_word_map)
        counter += 1

    n_estimators = 100
    result = cfun.rfClassifer(n_estimators, trainingDataFV, train["sentiment"], testDataFV)

    output = pd.DataFrame(data={"id": test["id"], "sentiment": result})
    output.to_csv("Doc2Vec_Clustering.csv", index=False, quoting=3)
import os

from cleaner import clean
from gensim.models import doc2vec


def load_docs(filepath, clean_text=True):
    ret = []
    for f_name in os.listdir(filepath):
        if clean_text:
            ret.append(
                clean(
                    open(filepath + "/" + f_name, 'rb').read().decode('UTF-8')))
            continue
        ret.append(open(filepath + "/" + f_name, 'rb').read().decode('UTF-8'))
    return ret


def format_loaded_input_docs(docs):
    ret = []
    for id_, doc in enumerate(docs):
        ret.append(doc2vec.TaggedDocument(words=doc.split(), tags=[id_]))
    return ret


if __name__ == '__main__':
    docs = load_docs("samples/")
    formatted_docs = format_loaded_input_docs(docs)
    model = doc2vec.Doc2Vec(formatted_docs, size=5, window=10, workers=2)
    for e in model.docvecs:
        print e
def structural_embedding(self, inputFile, outputFile):
    indexToName = self.generateWalkFile(inputFile, args.walkLength)
    sentences = doc.TaggedLineDocument(inputFile + '.walk')
    self.model = doc.Doc2Vec(sentences, size=dimensions, iter=iterations, window=window)
    saveVectors(list(self.docvecs), outputFile, indexToName)
# Tokenize and lemmatize
def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result


def myhash(obj):
    return hash(obj) % (2 ** 32)


model = doc2vec.Doc2Vec(hashfxn=myhash)
# Load the model we trained earlier
model = doc2vec.Doc2Vec.load("../classifier/doc2vec/Doc2VecTaggedDocs")

sentence1 = preprocess("Why do rockets look white?")
sentence2 = preprocess("Why are rockets and boosters painted white?")

inferred_embedding_2 = numpy.array(
    model.infer_vector(doc_words=sentence2, steps=30, alpha=0.025))
inferred_embedding_1 = numpy.array(
    model.infer_vector(doc_words=sentence1, steps=30, alpha=0.025))

print(
    numpy.dot(inferred_embedding_1, inferred_embedding_2) /
    (numpy.linalg.norm(inferred_embedding_1, ord=2) *
def preprocess(path, readMePath, test_id, train_id, with_args_num=True, with_pac_depth=True):
    data = pd.read_csv(path)
    columns = ['Method', 'Exception', 'ProjectId', 'ProjectName', 'MethodTop', 'MethodBottom',
               'ClassTop', 'ClassBottom', 'PacTop', 'PacBottom', 'Doc', 'Throw', 'Catch']
    data.reindex(columns=columns)
    data.drop_duplicates(subset=columns[:-2], keep=False, inplace=True)
    data = data[data.Exception.str.contains('java')]
    data.index = range(len(data))

    exs = list(data['Exception'].drop_duplicates())
    exs_to_id = {}
    for e in exs:
        exs_to_id[e] = len(exs_to_id)

    project_id_ = data['ProjectId']
    # data.drop(columns='ProjectId', inplace=True)
    # columns.remove('ProjectId')

    if with_args_num:
        columns.insert(2, 'ArgsNum')
        data = data.reindex(columns=columns, fill_value=0)
        data['ArgsNum'] = [int(s.count('@')) for s in data['Method']]
    if with_pac_depth:
        columns.insert(2, 'PacDepth')
        data = data.reindex(columns=columns, fill_value=0)
        data['PacDepth'] = [int(s.count('.')) for s in data['Method']]

    columns.insert(2, 'ExceptionType')
    data = data.reindex(columns=columns, fill_value=0)
    data['ExceptionType'] = [exs_to_id[e] for e in data['Exception']]

    zxqcommentPath = "comments.csv"
    zxqcomments = pd.read_csv(zxqcommentPath, header=None, error_bad_lines=False, encoding="gbk")
    zxqmethod = zxqcomments[0]
    zxqdict = {zxqcomments[0][0]: zxqcomments[2][0]}
    for i in range(len(zxqmethod)):
        zxqdict[zxqcomments[0][i]] = zxqcomments[2][i]

    doc = []
    zxqindex = 0
    for s in data['Doc']:
        tmp = []
        tmp.extend([p for p in s.split('|')])
        zxqmethod = data['Method'][zxqindex]
        zxqmethod = zxqmethod.split('$')[1].split('@')[0]
        zxqcom = zxqdict.get(zxqmethod)
        zxqcom = str(zxqcom)
        if zxqcom == 'nan':
            zxqcom = ""
        tmp.extend([p for p in zxqcom.split()])
        zxqindex = zxqindex + 1
        doc.append(tmp)

    doc_size = 128
    res = doc2vec_abstract(readMePath, size=doc_size)
    abstract = np.zeros(shape=(data.shape[0], doc_size))
    j = 0
    for i in project_id_:
        abstract[j] = res["project"]
        j += 1

    d = get_refer('depend_all.csv', 'depend')
    refers = np.zeros(shape=(data.shape[0], d["project"].shape[0]))
    j = 0
    for i in project_id_:
        refers[j] = d["project"]
        j += 1

    # exceptions = np.array(data.iloc[:, 1], dtype=np.str)
    data.index = range(len(data))
    train_id = data[data['ProjectId'].isin(train_id)].index.tolist()
    test_id = data[data['ProjectId'].isin(test_id)].index.tolist()
    data.drop(columns='ProjectId', inplace=True)
    data.drop(columns='ProjectName', inplace=True)
    columns.remove('ProjectId')
    columns.remove('ProjectName')

    features = np.array(data.iloc[:, 2:-3], dtype=np.float)
    labels = np.array([0 if int(x) == 1 else 1 for x in data['Throw']], dtype=np.int)

    _doc = doc2vec_generate(doc)
    vector_size = 128
    d2v = doc2vec.Doc2Vec(documents=_doc, min_count=1, vector_size=vector_size)
    # from gensim.test.utils import get_tmpfile
    # fname = get_tmpfile("doc2vec_sentences_model")
    # d2v.save(fname)
    # d2v = Doc2Vec.load(fname)
    sentences = np.array([list(d2v.infer_vector(n)) for n in doc], dtype=np.float)
    # sentences = np.zeros(shape=(data.shape[0], vector_size))
    # j = 0
    # for i in project_id_:
    #     sentences[j] = sen[j]
    #     j += 1

    # Split into train and test sets
    method = data['Method']
    exception = data['Exception']
    sentences_train, sentences_test = sentences[train_id], sentences[test_id]
    features_train, features_test = features[train_id], features[test_id]
    refers_train, refers_test = refers[train_id], refers[test_id]
    abstract_train, abstract_test = abstract[train_id], abstract[test_id]
    labels_train, labels_test = labels[train_id], labels[test_id]

    # return doc, features, labels, refers, abstract
    return method[test_id], exception[test_id], sentences_train, sentences_test, \
        features_train, features_test, refers_train, refers_test, \
        abstract_train, abstract_test, labels_train, labels_test
def test_dbow_neg_fromfile(self):
    """Test DBOW doc2vec training."""
    with temporary_file(get_tmpfile('gensim_word2vec.tst')) as corpus_file:
        save_lee_corpus_as_line_sentence(corpus_file)
        model = doc2vec.Doc2Vec(list_corpus, dm=0, hs=0, negative=10, min_count=2, epochs=20)
        self.model_sanity(model)
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)

# Set values for various parameters
num_features = 500    # Word vector dimensionality
min_word_count = 100  # Minimum word count
num_workers = 4       # Number of threads to run in parallel
context = 15          # Context window size
downsampling = 1e-3   # Downsample setting for frequent words

# Initialize and train the model (this will take some time)
from gensim.models import doc2vec
print "Training model..."

model = doc2vec.Doc2Vec(workers=num_workers,
                        size=num_features, min_count=min_word_count,
                        window=context, sample=downsampling)
# alpha=0.025, min_alpha=0.025

model.build_vocab(sentences)
for epoch in range(3):
    model.train(sentences)
    model.alpha -= 0.002           # decrease the learning rate
    model.min_alpha = model.alpha  # fix the learning rate, no decay

model.docvecs.most_similar("ANSWER")

# If you don't plan to train the model any further, calling
# init_sims will make the model much more memory-efficient.
model.init_sims(replace=True)

# It can be helpful to create a meaningful model name and
# save the model for later use. You can load it later using Word2Vec.load()
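# A brief sketch of the save/load step described in the closing comments above. The model name
# below is an illustrative assumption built from the parameters defined earlier; note that a
# Doc2Vec model is reloaded with Doc2Vec.load.
model_name = "{}features_{}minwords_{}context.d2v".format(num_features, min_word_count, context)
model.save(model_name)

# Later, reload it for inference or similarity queries:
# model = doc2vec.Doc2Vec.load(model_name)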
    for i, v in enumerate(reviews):
        label = '%s_%s' % (label_type, i)
        labelized.append(LabeledSentence(v, [label]))
    return labelized


allXs = x_train + x_test
x_train = labelizeReviews(x_train, 'Train')
x_test = labelizeReviews(x_test, 'Test')
allXs = labelizeReviews(allXs, 'All')

for i in range(10):
    print x_train[i]
    print x_test[i]

# Instantiate Doc2Vec model and build vocab
model = doc2vec.Doc2Vec(min_count=1, window=10, size=100, sample=1e-4, negative=5, workers=7)
model.build_vocab(allXs)
# model = doc2vec.Doc2Vec.load('doc2vec_model')

# Pass through the data set multiple times, shuffling the training reviews each time to improve accuracy
for epoch in range(20):
    model.train(utils.shuffle(x_train))

model.save('Model_after_train')
model = doc2vec.Doc2Vec.load('Model_after_train')
# print model.docvecs['All_0']


# Get training set vectors from our models
def getVecs(model, corpus, size, vecs_type):
    vecs = np.zeros((len(corpus), size))
    for i in range(0, len(corpus)):
def grid_search_doc2vec(X_texto, X_materia, y_train):
    lr_acc = []
    sizes = [500, 1000, 1500]  # size of the vector
    counts = [1, 5]            # min_count for words to be included in vocab
    dms = [0]                  # dm=1: 'distributed memory' (PV-DM) is used; otherwise, distributed bag of words (PV-DBOW)
    epoch = 10

    docs_token = [doc.split() for doc in X_texto]
    documents = [
        doc2vec.TaggedDocument(doc, [i]) for i, doc in enumerate(docs_token)
    ]

    for size in sizes:
        for count in counts:
            for dm in dms:
                # define the model, build its vocab and train it
                model = doc2vec.Doc2Vec(vector_size=size,
                                        min_count=count,
                                        epochs=epoch,
                                        workers=cores - 1,
                                        dm=dm)
                model.build_vocab(documents)
                model.train(documents,
                            total_examples=model.corpus_count,
                            epochs=model.epochs)

                # get vector representation of docs
                X_vector = []
                for doc in docs_token:
                    vector = model.infer_vector(doc)
                    X_vector.append(vector)
                X = pd.concat(
                    [X_materia.reset_index(drop=True), pd.DataFrame(X_vector)],
                    axis=1)

                # oversampling
                sm = SMOTE(sampling_strategy=0.4)
                X_train_sm, y_train_sm = sm.fit_resample(X, y_train)

                # classifier: CV score
                lrg = LogisticRegression(solver="lbfgs", class_weight='balanced')
                scores = cross_val_score(lrg,
                                         X=X_train_sm,
                                         y=y_train_sm,
                                         cv=5,
                                         n_jobs=-1,
                                         verbose=1,
                                         scoring='recall')
                lr_acc.append((scores.mean(), size, count, dm))

    results = pd.DataFrame(lr_acc)
    parameters = {
        'size': results.sort_values(0).iloc[-1, :][1],
        'count': results.sort_values(0).iloc[-1, :][2],
        'dm': results.sort_values(0).iloc[-1, :][3]
    }
    return results, parameters
print(len(w2v_train_doc2))
print('now train word2vec')
import time
a = time.time()


class sentences_generator():
    def __init__(self, doc):
        self.doc = doc

    def __iter__(self):
        for i, line in enumerate(self.doc):
            sentence = TaggededDocument(line.split(), tags=[i])
            yield sentence


sents = sentences_generator(w2v_train_doc2)
model = doc2vec.Doc2Vec(sents, dm=1, vector_size=300, window=13, min_count=1,
                        hs=1, workers=12, epochs=30)
b = time.time()
model.save("../data/pv.txt")
print(b - a)

import operator


def get_doc2vec_score(data, tf):
    DS = []
    tag_num = 0
    hit_num = 0
    for row in tqdm(range(data.shape[0])):
        d2v_score = {}
        doc_vec = model[train_idx[row]]
        doc_vec /= np.sqrt(sum(doc_vec ** 2))
        for w in tf[row]:
            try:
from gensim.models import doc2vec
from collections import namedtuple

# Load data
doc1 = ["This is a sentence", "This is another sentence"]

# Transform data (you can add more data preprocessing steps)
docs = []
analyzedDocument = namedtuple('AnalyzedDocument', 'words tags')
for i, text in enumerate(doc1):
    words = text.lower().split()
    print words
    tags = ['test']
    # tags = ['hello' + str(i)]
    print tags
    docs.append(analyzedDocument(words, tags))

# Train model (set min_count = 1, if you want the model to work with the provided example data set)
model = doc2vec.Doc2Vec(docs, size=100, window=300, min_count=1, workers=4)

# Get the vectors
print model.docvecs['test']
# print model.docvecs['hello0']
print model.docvecs[0]
print model.docvecs[1]
# Accessing by tag and by index gives the same result.
# Do the tags have to be unique?
'''
sentences = doc2vec.TaggedLineDocument(file_path)
model = doc2vec.Doc2Vec(sentences, size=100, window=300, min_count=10, workers=4)
docvec = model.docvecs[99]
'''
def test_persistence(self):
    """Test storing/loading the entire model."""
    model = doc2vec.Doc2Vec(DocsLeeCorpus(), min_count=1)
    model.save(testfile())
    self.models_equal(model, doc2vec.Doc2Vec.load(testfile()))
cores = multiprocessing.cpu_count()
vector_size = 300
window_size = 15
word_min_count = 2
sampling_threashold = 1e-5
negative_size = 5
train_epoch = 100
dm = 1
worker_count = cores

train_data = read_data('data/ratings_train.txt')
train_docs = [(tokenize(row[1]), row[2]) for row in train_data[1:]]

TaggedDocument = namedtuple('TaggedDocument', 'words tags')
tagged_train_docs = [TaggedDocument(d, [c]) for d, c in train_docs]

doc_vectorizer = doc2vec.Doc2Vec(size=300, alpha=0.025, min_alpha=0.025, seed=1234)
doc_vectorizer.build_vocab(tagged_train_docs)

for epoch in range(10):
    doc_vectorizer.train(tagged_train_docs,
                         total_examples=doc_vectorizer.corpus_count,
                         epochs=doc_vectorizer.iter)
    doc_vectorizer.alpha -= 0.002
    doc_vectorizer.min_alpha = doc_vectorizer.alpha

doc_vectorizer.save('model/doc2vec.model')

pprint(doc_vectorizer.most_similar('공포/Noun'))
pprint(doc_vectorizer.similarity('공포/Noun', 'ㅋㅋ/KoreanParticle'))
if len(tag) == 0:
    sys.exit(1)


class MyTagDocument(object):
    def __init__(self, filename):
        self.filename = filename

    def __iter__(self):
        for line in open(self.filename):
            items = line.split('\t', 1)
            yield doc2vec.TaggedDocument(to_unicode(items[1]).split(), [items[0]])


documents = MyTagDocument('./data/articles.txt.' + tag)
model = doc2vec.Doc2Vec(documents, size=100, window=8, min_count=5, workers=8)
model.save('doc2vec')

output = open('output_doc2vec.txt.' + tag, 'w')
count = model.docvecs.count
idx = 0
while idx < count:
    tag = model.docvecs.index_to_doctag(idx)
    idx = idx + 1
    res = []
    for f in model.docvecs[tag]:
        res.append(str(f))
    output.write(tag + "\tdoc2vec\t" + " ".join(res) + "\n")
output.close()
def test_dms_hs(self):
    """Test DM/sum doc2vec training."""
    model = doc2vec.Doc2Vec(
        list_corpus, dm=1, dm_mean=0, size=24, window=4,
        hs=1, negative=0, alpha=0.05, min_count=2, iter=20,
    )
    self.model_sanity(model)