def setUp(self):
    filename = datapath("alldata-id-10.txt")
    train_docs = read_sentiment_docs(filename)
    self.train_docs = train_docs
    self.source_doc_vec_file = datapath("small_tag_doc_5_iter50")
    self.target_doc_vec_file = datapath("large_tag_doc_10_iter50")
    self.source_doc_vec = Doc2Vec.load(self.source_doc_vec_file)
    self.target_doc_vec = Doc2Vec.load(self.target_doc_vec_file)
def do_command(args):
    # Load data
    data = load_data(args.input)
    # ids, documents = zip(*data)
    data = [(id, tokenize(doc)) for id, doc in data]
    ids = [id for id, _ in data]
    if not os.path.exists(args.modelfile):
        model = embed_documents(data)
        # Save model
        model.save(args.modelfile)
    else:
        model = Doc2Vec.load(args.modelfile)
        # map(model.infer_tokens, tokenized)
        print("Loaded model.")
    # Do k-nearest neighbors search.
    writer = csv.writer(args.output, delimiter='\t')
    writer.writerow(["id1", "id2", "score"])
    count = int(args.count) if args.count > 0 else len(model.docvecs)
    vectors = np.array([model.docvecs[i] for i in range(count)])
    del model  # clear up memory
    for i, j, score in find_nearest_neighbors(vectors):
        id1, id2 = ids[i], ids[j]
        writer.writerow([id1, id2, score])
def __init__(self, sentences, name, dataset_name, epochs=1, dimension=50, modelfile=None):
    self.inner_model = None

    # parameters
    self.dataset = dataset_name
    self.sentences = sentences
    self.name = name
    self.epochs = epochs
    self.dimension = dimension

    # data file path
    models_folder = os.path.join(*[os.path.dirname(os.path.dirname(os.path.abspath(__file__))), 'data', 'models'])
    if modelfile is not None:
        filename = modelfile
    else:
        filename = "DOC2VEC_%s_%s_%s_%s" % (self.dataset, self.name, self.epochs, self.dimension)
    self.filepath = os.path.join(models_folder, filename)
    model_exists = os.path.isfile(self.filepath)

    # train initial model
    if model_exists:
        logging.info("found data file %s" % (self.filepath, ))
        self.inner_model = Doc2Vec.load(self.filepath)
    else:
        self.inner_model = Doc2Vec(sentences, size=self.dimension)
        print self.inner_model.vocab.keys()
        self.inner_model.save(fname=self.filepath)
def __init__(self, size=300, window=8, min_count=2, workers=8, path_to_model=None, stream_train=False):
    '''
    Initializes the Doc2Vec_Wrapper class.

    Args:
        size (int): Specifies the size of the feature-vector. Defaults to 300
        window (int): Specifies the size of the context window from which the feature vector is learned
        min_count (int): Specifies the minimum number of instances of each word that is saved in the model
        workers (int): number of parallel processes
        path_to_model (str): Specifies model on disk
        stream_train (bool): If true, update word vectors with new sentences. If false, just get doc vecs
    '''
    self.stream_train = stream_train
    self.is_trained = False
    self.model = None

    ## if a path is passed, try to load from disk. Otherwise, retrain anyway
    if path_to_model:
        try:
            self.is_trained = True
            self.model = Doc2Vec.load(path_to_model)
        except:
            pass

    ## params for Doc2Vec
    self.size = size                ## size of the vector
    self.window = window            ## size of the context window
    self.min_count = min_count      ## minimum count of vocab to store in binary tree
    self.workers = workers          ## number of parallel processes == number of cores on the computer
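# A minimal standalone sketch of the same load-if-present pattern used in the wrapper
# above; the model path 'models/example.d2v' is an assumption for illustration only.
from gensim.models.doc2vec import Doc2Vec

def try_load_doc2vec(path_to_model):
    # Return (model, is_trained); mirrors the wrapper's behaviour of falling back
    # to an untrained state when the saved model cannot be loaded.
    try:
        return Doc2Vec.load(path_to_model), True
    except (IOError, OSError):
        return None, False

model, is_trained = try_load_doc2vec('models/example.d2v')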
def test_category():
    from gensim.models.doc2vec import Doc2Vec
    from sematch.utility import FileIO
    from sematch.semantic.relatedness import ConceptRelatedness
    model_category = Doc2Vec.load(FileIO.filename('models/category/cat2vec'))
    cat2vec_rel = ConceptRelatedness(model_category)
    print(cat2vec_rel.word_similarity('happy', 'sad'))
def load_external(self, model_file_name):
    """
    Load a doc2vec model from the file specified.

    :param model_file_name: name of the model file
    :return:
    """
    self.model = Doc2Vec.load(model_file_name)
def varify():
    from gensim.models.doc2vec import Doc2Vec
    model = Doc2Vec.load('data/doc2vec.d2v')
    documents = pickle.load(open('data/fedcorpus.pick', 'r'))
    for i in xrange(3):
        inferred_docvec = model.infer_vector(documents[i].words)
        print documents[i].tags
        print('%s:\n %s' % (model, model.docvecs.most_similar([inferred_docvec], topn=3)))
def main():
    """
    1. Divide total dataset into several data bins by randomly extracting data entries with given ratio.
    2. Run cross-validation for given numbers of iterations in either SMOTE or non-SMOTE mode.
    3. Report and present statistical evaluations for each data bin.
    """
    stats_Fscores_ns, stats_recalls_ns, stats_precisions_ns = list(), list(), list()  # ns for non-SMOTE
    stats_Fscores_ws, stats_recalls_ws, stats_precisions_ws = list(), list(), list()  # ws for with SMOTE
    data_pos, data_neg = load_data("../data/")
    data_pos, data_neg = data_filter(data_pos), data_filter(data_neg)

    print "Loading Doc2Vec model ..."
    model_doc2vec = Doc2Vec.load_word2vec_format('GoogleNews-vectors-negative300.bin.gz', binary=True)  # load Doc2Vec model
    print "Doc2Vec model loading done!"

    models = {"SVC": sklearn.svm.SVC(),
              "Logit": sklearn.linear_model.LogisticRegression(),
              "DT": sklearn.tree.DecisionTreeClassifier(),
              "NBayes": sklearn.naive_bayes.GaussianNB(),
              "NNeighbors": sklearn.neighbors.nearest_centroid.NearestCentroid()}
    model_chosen = "NBayes"
    print "Classifier Type:", model_chosen

    for binIndex in range(NUM_OF_BINS):
        print "Experiment on DataSet#", str(binIndex)
        random.shuffle(data_pos)
        random.shuffle(data_neg)
        size_pos_bin, size_neg_bin = int(len(data_pos)*SAMPLE_SIZE_RATIO), int(len(data_neg)*SAMPLE_SIZE_RATIO)
        data_pos_bin, data_neg_bin = data_pos[:size_pos_bin], data_neg[:size_neg_bin]  # dataset bin
        sFscores_iter_ns, sRecalls_iter_ns, sPrecisions_iter_ns = list(), list(), list()
        sFscores_iter_ws, sRecalls_iter_ws, sPrecisions_iter_ws = list(), list(), list()
        for iteration in range(NUM_OF_ITERATION):
            random.seed(iteration)
            random.shuffle(data_pos_bin)
            random.shuffle(data_neg_bin)
            data_pos_vec, data_neg_vec = feature_extraction_Doc2Vec(data_pos_bin, data_neg_bin, model_doc2vec)  # convert to doc vectors

            print "non-SMOTE experiment"
            accuracys, precisions, recalls, Fscores = cross_validationS(
                data_pos_vec, data_neg_vec, models[model_chosen],
                num_cross=NUM_OF_CROSSFOLD, smote_flag=False)  # cross validation
            sFscores_iter_ns.extend(Fscores)
            sRecalls_iter_ns.extend(recalls)
            sPrecisions_iter_ns.extend(precisions)

            print "with SMOTE experiment"
            accuracys, precisions, recalls, Fscores = cross_validationS(
                data_pos_vec, data_neg_vec, models[model_chosen],
                num_cross=NUM_OF_CROSSFOLD, smote_flag=True)  # cross validation
            sFscores_iter_ws.extend(Fscores)
            sRecalls_iter_ws.extend(recalls)
            sPrecisions_iter_ws.extend(precisions)

        stats_Fscores_ns.append(sFscores_iter_ns)
        stats_precisions_ns.append(sPrecisions_iter_ns)
        stats_recalls_ns.append(sRecalls_iter_ns)
        stats_Fscores_ws.append(sFscores_iter_ws)
        stats_precisions_ws.append(sPrecisions_iter_ws)
        stats_recalls_ws.append(sRecalls_iter_ws)

    print "All Experiments Done!"
    save_stats(stats_Fscores_ns, stats_recalls_ns, stats_precisions_ns,
               stats_Fscores_ws, stats_recalls_ws, stats_precisions_ws, model_name=model_chosen)
    print "Statistics ready!"
def get_model(model_num, model_names):
    if model_num < 10:
        model = Word2Vec.load(model_path + model_names)
    elif model_num < 99:
        model = Doc2Vec.load(model_path + model_names)
    else:
        model = Word2Vec.load_word2vec_format(model_path + model_names, binary=True)  # C binary format
    return model
def create_and_train_models_d2vec(tag, cores=6):
    """
    Build vocabulary and train models
    :param tag: small or big
    :param cores: number of cores
    :return: the current models
    """
    simple_models = get_models_d2vec(cores)
    model_files = get_models_filename_d2vec(tag)
    if all([os.path.exists(file) for file in model_files]):
        print('Models exist, loading...')
        for i, fname in enumerate(model_files):
            simple_models[i] = Doc2Vec.load(fname)
        models_by_name = OrderedDict((str(model), model) for model in simple_models)
        return models_by_name
    else:
        print('Building models...')
        voc_model = build_vocab_d2vec(tag, cores)

        # Share vocabulary between models
        for model in simple_models:
            model.reset_from(voc_model)

        models_by_name = OrderedDict((str(model), model) for model in simple_models)
        print('Training models...')
        print("START %s" % datetime.datetime.now())
        best_error = defaultdict(lambda: 1.0)  # to selectively-print only best errors achieved
        alpha, min_alpha, passes = (0.025, 0.001, 20)
        alpha_delta = (alpha - min_alpha) / passes

        file = x_train_str.format(tag)
        x_train = pd.read_hdf(file)
        train_list = x_train.tolist()
        for epoch in range(passes):
            shuffle(train_list)  # shuffling gets best results
            for name, train_model in models_by_name.items():
                # train
                duration = 'na'
                train_model.alpha, train_model.min_alpha = alpha, alpha
                with elapsed_timer() as elapsed:
                    train_model.train(CorpusStream(train_list, 'train'),
                                      total_examples=train_model.corpus_count,
                                      epochs=train_model.iter)
                    duration = '%.1f' % elapsed()
            print('completed pass %i at alpha %f' % (epoch + 1, alpha))
            alpha -= alpha_delta
        print("END %s" % str(datetime.datetime.now()))

        for name, model in models_by_name.items():
            name = name.replace('/', '').replace(',', '_')
            model.save('models/{0}_{1}.m'.format(name, tag))
        return models_by_name
def get_WordVector_matrix(label):
    model = Doc2Vec.load('./WordVector_model.d2v')
    size = len(label)
    vectors = np.zeros((size, depth))
    for i in range(size):
        try:
            doc_vector = model.docvecs[str(i)]
            vectors[i] = (doc_vector[0])
        except KeyError:
            print str(i) + ' occurs KeyError'
            pass
    return map(list, vectors)
def build_model(x_train, x_test, iteration=5, save=True):
    if save:
        big_list = x_train + x_test
        model = Doc2Vec(min_count=2, window=10, size=100, sample=1e-4, negative=5, workers=8)
        model.build_vocab(big_list)
        for i in range(iteration):
            model.train(big_list)
        print 'saving model to file.....'
        model.save('./sentim.d2v')
    else:
        print 'loading model from file.....'
        model = Doc2Vec.load('./sentim.d2v')
    return model
def test_models(FULL_SIM, models_files):
    test_papers = pd.read_csv(TEST_FILEPATH)
    # NOTE: Only needed for testing with AII:
    keywords_docsrels = populate_iks_dict()
    authorities = initialize_authorities()
    for mod_f in models_files:
        print('Testing ' + mod_f)
        model = Doc2Vec.load(mod_f)
        print('Model loaded.')
        test_model(FULL_SIM, model, test_papers, keywords_docsrels, authorities)
def get_vec(vector_file, id_file, w_file):
    p2v = Doc2Vec.load(vector_file)
    fout = open(w_file, "w")
    index = 0
    with open(id_file) as f:
        for line in f:
            index += 1
            if index % 1000 == 0:
                logging("%d cases" % index)
            line = line.strip()
            vec = p2v.docvecs[line]
            line_w = line + "\t" + "\t".join([str(x) for x in vec]) + "\t" + "\n"
            fout.write(line_w)
    fout.close()
def datacluster(data):
    infered_vectors_list = []
    print "load model..."
    model_dm = Doc2Vec.load(model_path)
    print "load train vectors..."
    for text, label in data:
        vector = model_dm.infer_vector(text)
        infered_vectors_list.append(vector)
    '''
    print "Check the optimized parameter..."
    Nc = range(1, 50)
    pca_data = [PCA(n_components=i).fit(infered_vectors_list).transform(infered_vectors_list) for i in Nc]
    kmeans = cluster.KMeans(init='k-means++', n_clusters=20, max_iter=300)
    score = [kmeans.fit(pca_data[i]).score(pca_data[i]) for i in range(len(pca_data))]
    print score
    plt.plot(Nc, score)
    plt.xlabel('PCA components')
    plt.ylabel('Score')
    plt.title('Elbow Curve')
    plt.show()
    '''
    print "PCA decomposition..."
    pca = PCA(n_components=10).fit(infered_vectors_list)
    pca_data = pca.transform(infered_vectors_list)
    print "train K-Means model..."
    kmean_model = cluster.KMeans(init='k-means++', n_clusters=16, max_iter=300)
    kmean_model.fit(pca_data)
    # get the classified index
    result = kmean_model.fit_predict(pca_data)
    print "Predicting result:", result
    # save the cluster result
    joblib.dump(kmean_model, cluster_path)
    # load the cluster result
    # new_km = joblib.load(cluster_path)
    numSamples = len(pca_data)
    print numSamples
    centroids = kmean_model.labels_
    # print centroids, type(centroids)    # show the cluster centroids
    # print kmean_model.inertia_          # show the clustering quality
    '''
    marker = ['o', '.', ',', 'x', '*', 'd', 's', 'p']
    color = ['r', 'g', 'b', 'c', 'm', 'k', 'y', 'w']
    for i in xrange(numSamples):
        plt.scatter(pca_data[i][0], pca_data[i][1],
                    marker=marker[centroids[i]], color=color[centroids[i]])
    plt.show()
    '''
    return centroids
def main():
    # load data set
    training_reviews = load_dataset(TRAIN_FILE)
    testing_reviews = load_dataset(TEST_FILE)

    # load doc2vec model
    doc2vec_model = Doc2Vec.load(DOC2VEC_MODEL)

    cate_index = get_all_categories(training_reviews)
    cates = dict2list(cate_index)
    n_cates = len(cates)

    train_X = get_X(training_reviews, doc2vec_model)
    test_X = get_X(testing_reviews, doc2vec_model)
    train_labels = get_labels(training_reviews, cate_index)
    test_labels = get_labels(testing_reviews, cate_index)

    labelwise_acc = []
    labelwise_output = []
    for cate in range(n_cates):
        # train a binary model
        train_Y = get_Y(train_labels, cate)
        prob = svm_problem(train_Y, train_X)
        param = svm_parameter("-s 0 -t 2 -b 1")
        m = svm_train(prob, param)
        # test
        test_Y = get_Y(test_labels, cate)
        p_label, p_acc, p_val = svm_predict(test_Y, test_X, m, '-b 1')
        labelwise_acc.append(p_acc)
        labelwise_output.append(p_label)

    # evaluation
    p, r, f = microF1(labelwise_output, test_labels)

    # output
    out_dir = "../data/use_doc2vec/"
    out_file = out_dir + "laptop.txt"
    labelwise_acc = [(cates[i], labelwise_acc[i][0]) for i in range(n_cates)]
    labelwise_acc = sorted(labelwise_acc, key=lambda x: x[1])
    with open(out_file, 'w') as out:
        out.write("Precision:\t{}\nRecall:\t{}\nF1:\t{}\n".format(p, r, f))
        print("{}\n{}\n{}".format(p, r, f))
        for cate_i in range(n_cates):
            out.write("{}:\t{}\n".format(labelwise_acc[cate_i][0], labelwise_acc[cate_i][1]))
def get_distances_subset(n_closest, category_hash_with_doc_ids, csv_path):
    # example
    # category_hash_with_doc_ids = {"cat1": ["us-1", "us-2"], "cat2": ["us-3"]}

    # loop over subjects and average docvecs belonging to subject.
    # place in dictionary
    model = Doc2Vec.load('../doc2vec_model')
    cpc_vectors = get_category_vectors_subset(model, category_hash_with_doc_ids)
    distance_mat = get_distance_mat(cpc_vectors)

    to_csv = []
    for subj_id in list(category_hash_with_doc_ids.keys()):
        relateds = get_n_closest(distance_mat, subj_id, n=n_closest)
        for related_id, dist in relateds.iteritems():
            weight = round(1./dist)
            # weight = round((1-dist) * 10)
            row = (subj_id, related_id, weight, subj_id, related_id)
            to_csv.append(row)

    edges = pd.DataFrame(to_csv, columns=['source', 'target', 'weight', 'source_name', 'target_name'])
    edges.to_csv(csv_path, index=False)
def test():
    global english_punctuation, model_path
    new_model = Doc2Vec.load(model_path)
    # sentence = "reserve setup_data: [mem 0x000000008f889018-0x000000008f8bc057] usable"
    # sentence = "efi: mem14: type=2, attr=0xf, range=[0x000000008fa17000-0x000000008fb19000) (1MB)"
    # sentence = "pci 0000:07:08.2: [8086:208d] type 00 class 0x088000"
    # sentence = "i40e 0000:b0:00.2: irq 41 for MSI/MSI-X"
    sentence = "ata8: SATA link up 6.0 Gbps (SStatus 133 SControl 300)"
    # tokenize
    test_tokenized = [word.lower() for word in word_tokenize(sentence)]
    # remove stopwords
    english_stopwords = stopwords.words('english')
    test_stopwords = [word for word in test_tokenized if not word in english_stopwords]
    # remove punctuation
    test_punctuation = [word for word in test_stopwords if not word in english_punctuations]
    # stem words
    # st = PorterStemmer()
    # test_stemmed = [st.stem(word) for word in test_punctuation]
    test_text = test_punctuation
    print "===>Testing sentence:", test_text
    inferred_vector_dm = new_model.infer_vector(test_text)
    sims = new_model.docvecs.most_similar(positive=[inferred_vector_dm])
    return sims
def __init__(self, model_name=None, corpus=None, stop_words=False, filename=None, **kwargs):
    """
    model_name: name of the model which has been trained and saved
    corpus: dictionary with 'question' and 'answer', where corpus['question'] is a list of TaggedDocuments
    filename: name of file containing the questions dataset
    """
    if corpus:
        self.corpus = corpus
    else:
        self.corpus = {}
        self.corpus['question'] = list(self.read_corpus(filename['question'], stop_words=stop_words))
        self.corpus['answer'] = list(self.read_corpus(filename['answer'], stop_words=stop_words))
    if model_name:
        self.model = Doc2Vec.load(model_name)
    else:
        size = kwargs.get('size', 50)
        min_count = kwargs.get('min_count', 5)
        alpha = kwargs.get('alpha', 0.025)
        min_alpha = kwargs.get('min_alpha', 0.025)
        iters = kwargs.get('iters', 10)
        self.train(size=size, min_count=min_count, alpha=alpha, min_alpha=min_alpha, iters=iters)
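# A minimal, self-contained sketch of preparing the kind of TaggedDocument corpus the
# initializer above expects for corpus['question']; the sample sentences, tags, and the
# smaller min_count are assumptions for illustration only.
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

questions = ["how do I reset my password", "where can I find the manual"]
tagged_questions = [
    TaggedDocument(words=q.lower().split(), tags=[str(i)])
    for i, q in enumerate(questions)
]
# Train directly with gensim, roughly matching the defaults read from kwargs above.
model = Doc2Vec(tagged_questions, vector_size=50, min_count=1,
                alpha=0.025, min_alpha=0.025, epochs=10)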
    n closest subject_ids
    """
    s = distance_mat.loc[subject_id]
    closest = s.sort_values()[1:1+n]
    return closest


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="produce similarity matrix")
    parser.add_argument('dbname', help="Database name")
    parser.add_argument('path_to_model', help="Model to test")
    parser.add_argument('n_closest', help="How many closest subjects to look into")
    args = parser.parse_args()

    model = Doc2Vec.load(args.path_to_model)
    subject_hash = get_subject_hash(args.dbname)
    subject_ids = list(subject_hash.keys())

    # loop over subjects and average docvecs belonging to subject.
    # place in dictionary
    subject_vectors = get_subject_vectors(subject_ids)
    distance_mat = get_distance_mat(subject_vectors)

    to_csv = []
    for subj_id in subject_ids:
        relateds = get_n_closest(distance_mat, subj_id, n=int(args.n_closest))
        for related_id, dist in relateds.iteritems():
            weight = round(1./dist)
            # weight = round((1-dist) * 10)
def binarizator(x, coeff):
    if x > coeff:
        return 1
    else:
        return 0


def intersection(lst1, lst2):
    return list(set(lst1) & set(lst2))


data_rout = r"./data/lingvo_test"
models_rout = r"./models"

# load models:
d2v_model = Doc2Vec.load(
    os.path.join(models_rout, 'bss_doc2vec_model_20200611_draft'))
print("d2v_model load Done")

keras.losses.contrastive_loss = contrastive_loss
lstm_model = load_model(
    os.path.join(models_rout, 'siamese_model_d2v_nn_2020_0612.h5'))
print("lstm_model load Done")

with open(os.path.join(models_rout, "tokenizator_model.pickle"), "br") as f:
    lingv_model = pickle.load(f)

tk_appl = TokenizerApply(Loader(lingv_model))

tx1 = "сдавать ндс"
tx2 = "сдавать ндфл"
# tx1 = 'срок камеральной проверки по ндс заявленной к вычету'
import feature_extractor
from gensim.models.doc2vec import Doc2Vec
import parser
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
import pdb
import pickle

model = Doc2Vec.load_word2vec_format('data/GoogleNews-vectors-negative300.bin', binary=True)
print "MODEL LOADED"

f = open('stopwords.txt')
stoplist = set(line.split('\n')[0] for line in f)


def filter_essay(essay):
    stop_removed = filter(lambda x: x not in stoplist, essay.split())
    all_filtered = filter(lambda x: x in model.vocab, stop_removed)
    return all_filtered


def filter_essays(essays):
    return [filter_essay(essay) for essay in essays]


def calc_similarity(i1, i2):
    return model.n_similarity(i1, i2)


def classify(k, instance, training_data, training_scores):
    similarity = np.array([calc_similarity(instance, x) for x in training_data])
def compute_vector(articles):
    model = Doc2Vec.load('./Model/Doc2Vec_Model')
    for article in articles:
        print(model.infer_vector(tag_article(article).words))
def get_vector(path, label):
    model = Doc2Vec.load(path)
    RES = []
    for i in range(len(label)):
        RES.append(model.docvecs['g_' + str(i)])
    return np.array(RES)
    return docs


doc = generate_docs1()
print(accept[838:])

doclist = []
for i in range(doc_num):
    doclist.append(TaggedDocument(doc[i], ['상고 도로교통법_' + str(i)]))

doc_vectorizer = Doc2Vec(
    dm=0,             # PV-DBOW / default 1
    dbow_words=1,     # w2v simultaneous with DBOW d2v / default 0
    vector_size=300,  # vector size
    window=8,         # distance between the predicted word and context words
    alpha=0.025,      # learning-rate
    seed=1234,
    min_count=20,     # ignore with freq lower
    min_alpha=0.025,  # min learning-rate
    workers=4,        # multi cpu
    hs=1,             # hierarchical softmax / default 0
    negative=10,      # negative sampling / default 5
)

start = time.time()
doc_vectorizer.build_vocab(doclist)
for epoch in range(doc_vectorizer.iter):
    doc_vectorizer.train(doclist, total_examples=doc_vectorizer.corpus_count, epochs=doc_vectorizer.iter)
    doc_vectorizer.alpha -= 0.002                    # decrease the learning rate
    doc_vectorizer.min_alpha = doc_vectorizer.alpha  # fix the learning rate, no decay
from gensim.models.doc2vec import Doc2Vec
from gensim.models.doc2vec import LabeledSentence
from regression import BaseBowRegressor
import nltk

reviews_texts, _, _, _, _ = BaseBowRegressor.get_reviews_data(range(1, 70))
sentences = []
print "Tokenizing sentences..."
for i, review in enumerate(reviews_texts):
    tokens = nltk.word_tokenize(review)
    tokens = [token.lower() for token in tokens]
    sentences.append(LabeledSentence(words=tokens, labels=["REVIEW_" + str(i)]))

print "Doc2Vec"
model = Doc2Vec(sentences, size=100, window=8, min_count=5, workers=4)
import sys
import pandas as pd
import sanalytics.algorithms.utils as sau
from time import time
import numpy as np
from gensim.models.doc2vec import Doc2Vec

## Read threshold
arg = sys.argv[1].split("|")
t = float(arg[0])
name = arg[1]
fold = int(arg[1].split("_")[-2])

## Import Data
X_train = pd.read_parquet("datasets/rq3_data/sec1.0_train.parquet")
X_val = X_train[X_train.fold == fold]

## Import D2V
d2v = Doc2Vec.load("datasets/kfold_d2v/{}.model".format(name))

## In pos set
def pos_set(str):
    if "|" in str: return False
    if "sse" in str: return True
    if "set1" in str: return True
    if "set2" in str: return True

## Predict functions
def predict(post, thresh, d2v):
    vec = d2v.infer_vector("{} {} {}".format(post.title, post.question, post.answers).split())
    sims = d2v.docvecs.most_similar([vec], topn=1000)
    return min(len([i for i in sims if pos_set(i[0]) and i[1] > thresh]), 1)

## Columns
def load(self, model_dir):
    model_path = self._get_model_path(model_dir)
    self.model = Doc2Vec.load(model_path)
def FeaturiseData(featureType, trainDF, testDF, unsupervisedDF=None, pickleObject=False, reduceDims=500, verbose=False):
    # convert to test and train,
    # this is needed here so as not to put training data into
    # the featurisation functions, since in reality they would be built
    # using only training data

    if featureType == 'tfidf':
        # now we need to convert it into features so the data can be
        # put into a machine learning model
        vectorizer = TfidfVectorizer(stop_words='english',
                                     analyzer='word',
                                     min_df=0.02,
                                     max_df=0.98,
                                     use_idf=False,
                                     norm=None)

        featureDataTrainTemp = vectorizer.fit_transform(trainDF['stemmed_text'])
        featureDataTrain = featureDataTrainTemp.todense()
        # featureDataTrain = (featureDataTrain - dataMean)/dataSD
        labelsTrain = np.array(trainDF['labels'])
        words = vectorizer.get_feature_names()

        # remember ONLY TRANSFORM, don't fit!!!
        featureDataTestTemp = vectorizer.transform(testDF['stemmed_text'])
        featureDataTest = featureDataTestTemp.todense()
        # featureDataTest = (featureDataTest - dataMean)/dataSD
        labelsTest = np.array(testDF['labels'])

        if verbose:
            startTime = datetime.now()
            print("reducing dims...")

        # 100 dims chosen arbitrarily...
        # featureData, _, _ = scipy.sparse.linalg.svds(featureDataTrainTemp, k=100)
        # svdObj = TruncatedSVD(n_components=reduceDims, n_iter=7, random_state=42)
        #
        # featureDataTrain = svdObj.fit_transform(featureDataTrainTemp)
        #
        # # ONLY transform!!! do not fit
        # featureDataTest = svdObj.transform(featureDataTestTemp)

        if verbose:
            tookThisLong = datetime.now() - startTime
            print("SVD took %s " % str(tookThisLong))
            print("number of words = ", len(words))

    elif featureType == "gensim":
        # gensimDF = pd.concat([trainDF, unsupervisedDF])
        # not sure if order is important so shuffle anyway, can't hurt...
        # gensimDF = gensimDF.sample(frac=1)

        # convert the stemmed words into a format gensim can deal with
        documentsGensim = [
            TaggedDocument(doc, [i])
            for i, doc in enumerate(unsupervisedDF['stop_words_removed_list'])
        ]

        # build doc2vec model - this could do with some experimentation...
        modelGensim = Doc2Vec(documentsGensim,
                              vector_size=reduceDims,
                              window=4,
                              min_count=3,
                              workers=6)

        # now use the model to infer vectors
        docVecList = []
        labels = []
        for index, row in trainDF.iterrows():
            docVecList.append(modelGensim.infer_vector(row['stop_words_removed_list']))
            labels.append(row['labels'])

        featureDataTrain = np.array(docVecList)
        labelsTrain = np.array(labels)

        docVecList = []
        labels = []
        for index, row in testDF.iterrows():
            docVecList.append(modelGensim.infer_vector(row['stop_words_removed_list']))
            labels.append(row['labels'])

        featureDataTest = np.array(docVecList)
        labelsTest = np.array(labels)
        # print("labelsTest.shape = ", labelsTest.shape)

    if pickleObject:
        # pickle data
        dirname = os.path.dirname(__file__)

        # pickle train data
        fileNameFeatureDataTrain = '../Data_Featurised/train_data_%s.pkl' % featureType
        fileNameFullFeatureDataTrain = os.path.join(dirname, fileNameFeatureDataTrain)
        file = open(fileNameFullFeatureDataTrain, 'wb')
        pickle.dump(featureDataTrain, file)
        file.close()

        # pickle test data
        fileNameFeatureDataTest = '../Data_Featurised/test_data_%s.pkl' % featureType
        fileNameFullFeatureDataTest = os.path.join(dirname, fileNameFeatureDataTest)
        file = open(fileNameFullFeatureDataTest, 'wb')
        pickle.dump(featureDataTest, file)
        file.close()

        # pickle train labels
        fileNameFeatureDataLabelTrain = '../Data_Featurised/train_label_data_%s.pkl' % featureType
        fileNameFullFeatureDataTrainLabel = os.path.join(dirname, fileNameFeatureDataLabelTrain)
        file = open(fileNameFullFeatureDataTrainLabel, 'wb')
        pickle.dump(labelsTrain, file)
        file.close()

        # pickle test labels
        fileNameFeatureDataLabelTest = '../Data_Featurised/test_label_data_%s.pkl' % featureType
        fileNameFullFeatureDataTestLabel = os.path.join(dirname, fileNameFeatureDataLabelTest)
        file = open(fileNameFullFeatureDataTestLabel, 'wb')
        pickle.dump(labelsTest, file)
        file.close()

        if featureType == 'tfidf':
            # pickle tfidf vectorizer and truncated SVD
            fileNameTfidfObj = '../Feature_Models/tfidf_vect.pkl'
            fileNameFullTfidfObj = os.path.join(dirname, fileNameTfidfObj)
            file = open(fileNameFullTfidfObj, 'wb')
            pickle.dump(vectorizer, file)
            file.close()

            # # pickle truncated SVD
            # fileNameSvdObj = '../Feature_Models/svd_obj.pkl'
            # fileNameFullSvdObj = os.path.join(dirname, fileNameSvdObj)
            # file = open(fileNameFullSvdObj, 'wb')
            # pickle.dump(svdObj, file)
            # file.close()
        elif featureType == 'gensim':
            fileNameGensimObj = '../Feature_Models/gensim_obj.pkl'
            fileNameFullGensimObj = os.path.join(dirname, fileNameGensimObj)
            modelGensim.save(fileNameFullGensimObj)

    return featureDataTrain, featureDataTest, labelsTrain, labelsTest
print(documents[7150])  # test

# remove words that appear only once
from collections import defaultdict
frequency = defaultdict(int)
for text in documents:
    for token in text:
        frequency[token] += 1

texts = [[token for token in text if frequency[token] > 1] for text in documents]

# train the model
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(documents)]
# print(documents)
model = Doc2Vec(documents, vector_size=35, window=2, min_count=2, workers=5)
model.train(documents, total_examples=model.corpus_count, epochs=150)

test_text = [
    'Silicon', 'bValley', 'Girlb', 'Apple', 'iPad', 'GB', 'yes', 'good',
    'enough', 'to', 'make', 'me', 'put', 'down', 'my', 'Kindle', 'Pampers',
    'Cruisers', 'Diapers', 'For', 'A', 'Snug', 'No', 'Gap', 'Fit', 'Nikon',
    'Coolpix', 'Digital', 'Camera', 'Not', 'Exactly', 'an', 'SLR', 'but',
    'Gets', 'the', 'Job', 'Done', 'Pampers', 'Swaddlers', 'for', 'Newborns',
    'Confusing', 'for', 'the', 'First', 'Time', 'Mom', 'TiVo', 'Humax',
    'DVDRRW', 'Series', 'Digital', 'Recorder', 'How', 'Do', 'I', 'Love',
    'Thee', 'MAC', 'Shadestick', 'for', 'Eyes', 'Easy', 'as', 'Pumpkin',
    'Pie', 'Postrio', 'Restaurant', 'San', 'Francisco', 'Still', 'Glamorous',
    'Sonic', 'Rio', 'Sport', 'S', 'MP', 'Player', 'Hours', 'of', 'skipfree',
    'music', 'for', 'Runners', 'Peter', 'Jacksons', 'Return', 'of', 'the',
    'King', 'Tears', 'and', 'Triumph', 'Propel', 'Fitness', 'Water', 'Atkins',
    'Friendlier', 'replacement', 'for', 'Gatorade', 'MAC', 'Paints', 'More',
    u'ỦƯỨỪỬỮỰỲỴỶỸÝ'
regex = re.compile("[^{0}a-zA-Z0-9 ]".format(vn_accents))
document = regex.sub(" ", document)

# remove duplicated spaces
document = re.sub(' +', ' ', document)

# remove leading and trailing spaces
document = document.strip()

# lowering
document = document.lower()

with open("vocabulary.json", "r") as fr:
    vocab = json.load(fr)

vectorizer = TfidfVectorizer(vocabulary=vocab, max_features=10000)
tfidfvec = vectorizer.fit_transform([document])

docvec_model = Doc2Vec.load("doc2vec.model")
docvec = docvec_model.infer_vector(document.split())

tfidfvec = tfidfvec.toarray()
tfidfvec.shape = (1, 10000)
docvec.shape = (1, 60)

prefix = "./pretrained/multiview/"
list_files = glob.glob("{}*".format(prefix))
# model_path = max(list_files, key=os.path.getctime)
fnames_acc = []
for file in list_files:
    fname_acc = re.findall(r"([\d\.]+)\.hdf5", file)
    fnames_acc.append(float(fname_acc[0]))
fnames_acc.sort()
max_acc = "{:.4f}".format(fnames_acc[-1])
    F1 = 2 * Recall * Precision / (Recall + Precision)
    result = {'params': {'window': window, 'min_count': min_count, 'vector_size': vector_size,
                         'alpha': alpha, 'min_alpha': min_alpha, 'epochs': epochs},
              'score': {'Accuracy': Accuracy, 'Precision': Precision, 'Recall': Recall, 'F1': F1}}
    return result


if __name__ == '__main__':
    warnings.filterwarnings('ignore', category=FutureWarning)

    dataset_dir = './static/processed/v3/'
    norm_train = list(read_train_dataset(dataset_dir + 'norm-train.jsonl'))
    anom_train = list(read_train_dataset(dataset_dir + 'anom-train.jsonl'))

    model = Doc2Vec(norm_train + anom_train, dm=1, window=2, min_count=1,
                    vector_size=300, alpha=0.08, min_alpha=0.01, epochs=600, workers=6)

    norm_train_vecs = [model.docvecs['norm' + str(i)] for i in range(len(norm_train))]
    anom_train_vecs = [model.docvecs['anom' + str(i)] for i in range(len(anom_train))]

    x_train = norm_train_vecs + anom_train_vecs
    y_train = ['norm'] * len(norm_train_vecs) + ['anom'] * len(anom_train_vecs)

    clf = RandomForestClassifier(random_state=0, n_estimators=50, max_depth=23, max_features=100, n_jobs=6)
    clf.fit(x_train, y_train)

    # testing
    norm_test_vecs = []
    norms = list(read_test_dataset(dataset_dir + 'norm-test.jsonl'))
    for norm in norms:
        norm_test_vecs.append(model.infer_vector(norm['words']))
def train_doc2vec(data_frame, patent_ids, classif_level, classif_type):
    root_location = fh.get_root_location("data/lstm_outcome/")
    doc2vec_model_save_location = fh.join_paths(root_location, "doc2vec_model/")
    preprocessed_location = fh.join_paths(root_location, "preprocessed_data/separated_datasets/")
    training_preprocessed_files_prefix = fh.join_paths(preprocessed_location, "training_docs_data_preprocessed/")
    validation_preprocessed_files_prefix = fh.join_paths(preprocessed_location, "validation_docs_data_preprocessed/")
    test_preprocessed_files_prefix = fh.join_paths(preprocessed_location, "test_docs_data_preprocessed/")
    vocab_path = fh.join_paths(doc2vec_model_save_location, "vocab_model")

    training_docs_iterator = create_tuple_array(data_frame, patent_ids, text_batch_size=10000)

    #####
    tagged_data = training_docs_iterator
    cores = multiprocessing.cpu_count()
    model_dbow = Doc2Vec(dm=1, vector_size=200, window=2, negative=10, sample=1e-8, hs=0, min_count=50,
                         alpha=0.25, min_alpha=0.05, dbow_words=0, seed=1234, concat=0, workers=cores)
    model_dbow.build_vocab([x for x in tqdm(tagged_data)])

    for epoch in range(30):
        # model_dbow.train(utils_shuffle_rows([x for x in tqdm(tagged_data)]), total_examples=len(tagged_data), epochs=1)
        model_dbow.train(utils_shuffle_rows([x for x in tqdm(tagged_data)]), total_examples=len(tagged_data), epochs=1)
        model_dbow.alpha -= 0.002
        model_dbow.min_alpha = model_dbow.alpha

    date = datetime.datetime.now().isoformat()
    model_dbow.save(fh.link_paths(vocab_path, 'doc2vec_vocab_30_epochs'))
    #####

    params = wmh.get_parameters_lstm_doc2vec()
    GLOBAL_VARS.DOC2VEC_MODEL_NAME, placeholder_model_name, doc2vec_model = wmh.get_lstm_doc2vec(params, classif_level, classif_type)

    # yields a list of sentences id, text as a tuple or (id, tuple)
    # training_docs_iterator = lrh.BatchWrapper(training_preprocessed_files_prefix, text_batch_size=10000, level=classif_level,
    #                                           level_type=classif_type)
    doc2vec_model.build_vocab(documents=training_docs_iterator, progress_per=params[13])
    doc2vec_model.save(fh.link_paths(vocab_path, "doc2vec_vocab"))

    DOC2VEC_ALPHA_DECREASE = wmh.set_alpha_parameters_lstm_doc2vec(doc2vec_model)
    start_epoch = 1

    # for epoch in range(1, params[11] + 1):
    #     GLOBAL_VARS.MODEL_NAME = placeholder_model_name.format(epoch)
    #     doc2vec_folder_path = fh.join_paths(doc2vec_model_save_location, GLOBAL_VARS.MODEL_NAME)
    #     if fh.ensure_exists_path_location(fh.link_paths(doc2vec_folder_path, "doc2vec_model")):
    #         start_epoch = epoch

    # if start_epoch > 1:
    #     GLOBAL_VARS.MODEL_NAME = placeholder_model_name.format(start_epoch)
    #     doc2vec_folder_path = fh.join_paths(doc2vec_model_save_location, GLOBAL_VARS.MODEL_NAME)
    #     # if a model of that epoch already exists, we load it and proceed to the next epoch
    #     doc2vec_model = Doc2Vec.load(fh.link_paths(doc2vec_folder_path, "doc2vec_model"))
    #     start_epoch += 1

    ## The Actual Training
    for epoch in range(start_epoch, params[11] + 1):
        print("### epoch " + str(epoch) + " ###")
        # set new filename/path to include the epoch
        GLOBAL_VARS.MODEL_NAME = placeholder_model_name.format(epoch)
        doc2vec_folder_path = fh.join_paths(doc2vec_model_save_location, GLOBAL_VARS.MODEL_NAME)

        # train the doc2vec model
        # training_docs_iterator = lrh.BatchWrapper(training_preprocessed_files_prefix, text_batch_size=10000, level=classif_level,
        #                                           level_type=classif_type)
        # yields a list of sentences id, text as a tuple or (id, tuple)
        doc2vec_model.train(documents=training_docs_iterator, total_examples=len(training_docs_iterator),
                            report_delay=params[12], epochs=params[10])
        doc2vec_model.alpha -= DOC2VEC_ALPHA_DECREASE   # decrease the learning rate
        doc2vec_model.min_alpha = doc2vec_model.alpha   # fix the learning rate, no decay
        doc2vec_model.save(fh.link_paths(doc2vec_folder_path, "doc2vec_model"))

        if epoch != params[11]:
            print("still training, epochs missing: " + str(epoch))
            sys.exit(1)
def __init__(self):
    self.model = Doc2Vec.load('./model.d2v')
    self.st = LancasterStemmer()
def get_doc2vec_model(model_path):
    return Doc2Vec.load(model_path)
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

doc = ["test 1"]
tokenized_doc = ['ok']
tokenized_doc
print(doc)

#%%
tagged_data = [TaggedDocument(d, [i]) for i, d in enumerate(tokenized_doc)]
tagged_data
model = Doc2Vec(tagged_data, vector_size=20, window=2, min_count=1, workers=4, epochs=100)
model.save("test_doc2vec.model")
model = Doc2Vec.load("test_doc2vec.model")
model.wv.vocab

#%% Question no. 4
import re
import os

unsup_sentences = []
for dirname in [
import json
import time
import random
import string
import numpy as np
from gensim.models.doc2vec import Doc2Vec
import anyconfig

config = anyconfig.load(open("config.yaml", 'rb'))
model = Doc2Vec.load('doc2vec/model')

keys = list(config["label"]["id2value"].keys())[1:]
dic_label = {}
for i, key in enumerate(keys):
    dic_label[key] = i + 1
print(dic_label)


def aug(shapes):
    add_shapes = []
    rm_shapes = []
    for shape in shapes:
        points = shape['points']
        points = sorted(points)
        if points[0][1] > points[1][1] and points[2][1] > points[3][1]:
            points = [points[1], points[3], points[2], points[0]]
        elif points[0][1] > points[1][1] and points[2][1] < points[3][1]:
            points = [points[1], points[2], points[3], points[0]]
        elif points[0][1] < points[1][1] and points[2][1] > points[3][1]:
            points = [points[0], points[3], points[2], points[1]]
def main():
    stats_Fscore, stats_recall, stats_precision = list(), list(), list()
    data_pos, data_neg = load_data("../data/")
    data_pos, data_neg = data_filter(data_pos), data_filter(data_neg)
    model = Doc2Vec.load_word2vec_format('GoogleNews-vectors-negative300.bin.gz', binary=True)
    print "Model loading done!"

    for test_mode in range(2):
        if test_mode == 0:
            print "non-SMOTE"
        else:
            print "SMOTE"
        sFscores, sRecalls, sPrecisions = list(), list(), list()
        for iteration in range(NUM_OF_ITERATION):  # start iteration
            random.seed(iteration)
            random.shuffle(data_pos)
            random.shuffle(data_neg)
            data_pos_vec, data_neg_vec = feature_extraction_Doc2Vec(data_pos, data_neg, model)  # convert to Word Vectors
            print len(data_pos_vec), len(data_neg_vec)
            models = {"SVC": sklearn.svm.SVC(),
                      "Logit": sklearn.linear_model.LogisticRegression(),
                      "DT": sklearn.tree.DecisionTreeClassifier(),
                      "NBayes": sklearn.naive_bayes.GaussianNB(),
                      "NNeighbors": sklearn.neighbors.nearest_centroid.NearestCentroid()}
            model_chosen = "SVC"
            accuracys, precisions, recalls, Fscores = cross_validationS(
                data_pos_vec, data_neg_vec, models[model_chosen],
                num_cross=NUM_OF_CROSSFOLD, smote_flag=test_mode)  # cross validation
            sFscores.extend(Fscores)
            sRecalls.extend(recalls)
            sPrecisions.extend(precisions)
        stats_Fscore.append(sFscores)
        stats_recall.append(sRecalls)
        stats_precision.append(sPrecisions)

    plt.figure()
    colors = ["red", "blue"]
    modes = ["no-SMOTE", "SMOTE"]
    for i in range(len(stats_Fscore)):  # plot statistical summary
        plt.plot(stats_Fscore[i], marker='o', color=colors[i], label=modes[i]+"_Fscore")
        # plt.plot(stats_precision[i], marker='+', color=colors[i], label=modes[i]+"_precision")
        # plt.plot(stats_recall[i], marker='*', color=colors[i], label=modes[i]+"_recall")
    plt.ylim([0, 1.0])
    plt.legend(loc=4, borderaxespad=0.5)
    plt.ylabel("Scores")
    plt.xlabel("Data Sequence")
    plt.savefig("../results/"+model_chosen+"-ValidationStats.png")

    savefile_name = "../results/" + model_chosen + "-ValidationStats.txt"
    fp = open(savefile_name, 'w')
    print "******** Evaluation **********\n"
    fp.write("******** Evaluation **********\n")
    for test_mode in range(2):  # print statistical evaluations
        stats_precision[test_mode].sort()
        stats_recall[test_mode].sort()
        stats_Fscore[test_mode].sort()
        p_median = stats_precision[test_mode][len(stats_precision)/2]
        r_median = stats_recall[test_mode][len(stats_recall)/2]
        f_median = stats_Fscore[test_mode][len(stats_Fscore)/2]
        iqr_p = stats_precision[test_mode][int(len(stats_precision)*0.75)] - stats_precision[test_mode][int(len(stats_precision)*0.25)]
        iqr_r = stats_recall[test_mode][int(len(stats_recall)*0.75)] - stats_recall[test_mode][int(len(stats_recall)*0.25)]
        iqr_f = stats_Fscore[test_mode][int(len(stats_Fscore)*0.75)] - stats_Fscore[test_mode][int(len(stats_Fscore)*0.25)]
        print modes[test_mode]
        fp.write(modes[test_mode]+'\n')
        print "\t p_median \t r_median \t f_median"
        fp.write("\t p_median \t r_median \t f_median \n")
        print "\t%.5f \t%.5f \t%.5f" % (p_median, r_median, f_median)
        fp.write("\t%.5f \t%.5f \t%.5f \n" % (p_median, r_median, f_median))
        print "\t iqr_p \t iqr_r \t iqr_f"
        fp.write("\t iqr_p \t iqr_r \t iqr_f \n")
        print "\t%.5f \t%.5f \t%.5f" % (iqr_p, iqr_r, iqr_f)
        fp.write("\t%.5f \t%.5f \t%.5f \n" % (iqr_p, iqr_r, iqr_f))
        print '\n'
def compute_similarities_text_word2vec(train=False, learn_idx=None):
    QUERY = "SELECT * FROM lyrics LIMIT 5000"
    cur.execute(QUERY)
    print("let's go!")
    i = 0
    id_to_sentence = dict()
    for msd_track_id, mxm_track_id, word, count, is_test in cur.fetchall():
        id_to_sentence[msd_track_id] = id_to_sentence.get(msd_track_id, "") + (word + " ") * count

    from nltk.corpus import stopwords
    # download('stopwords')
    stop_words = set(stopwords.words('english'))
    for k, v in id_to_sentence.items():
        id_to_sentence[k] = ' '.join([word for word in v.split() if word not in stop_words])
        id_to_sentence[k] = "Blank" if id_to_sentence[k] == "" else id_to_sentence[k]

    track_ids, sentences = zip(*id_to_sentence.items())

    from gensim.models import Word2Vec
    from gensim.models.doc2vec import Doc2Vec, TaggedDocument
    from gensim.test.utils import common_texts, get_tmpfile

    ############### Word2Vec #######################
    # path = get_tmpfile("word2vec.model")
    # model = Word2Vec(common_texts, size=100, window=5, min_count=1, workers=4)
    # model = Word2Vec.load("word2vec.model")
    # model.save("word2vec.model")

    ############### Doc2Vec ########################
    from nltk.tokenize import word_tokenize

    def learn_model(learn_idx):
        # from nltk import download
        # download("punkt")
        documents = [
            TaggedDocument(words=word_tokenize(doc), tags=[i])
            for i, doc in enumerate(sentences)
        ]
        max_epochs = 10000
        vec_size = 300
        alpha = 0.040

        model = Doc2Vec(size=vec_size,
                        alpha=alpha,
                        min_alpha=0.00025,
                        min_count=1,
                        dm=0)
        # model = Doc2Vec(vector_size=vec_size, dm=0)
        model.build_vocab(documents)

        for epoch in range(max_epochs):
            print("iteration {0}".format(epoch))
            model.train(documents, total_examples=model.corpus_count, epochs=model.iter)
            # model.alpha -= 0.0001
            # model.min_alpha = model.alpha

        model.save("doc2vec_bow{}.model".format(learn_idx))

    if train:
        if learn_idx != None:
            learn_model(learn_idx)
    else:
        model = Doc2Vec.load("doc2vec_bow{}.model".format(learn_idx))

    # test_doc = word_tokenize(sentences[0])
    # v1 = model.infer_vector(test_doc)
    # print("V1_infer:", v1)

    def print_top_n(x, n):
        title, artist = msd_titles[track_ids[x]]
        print('Examining --{} by {}--'.format(title, artist))
        print(' '.join(sorted(id_to_sentence[track_ids[x]].split())))
        print("------")
        test_doc = word_tokenize(id_to_sentence[track_ids[x]])
        v1 = model.infer_vector(test_doc)
        # print("V1_infer:", v1)
        similar_doc = model.docvecs.most_similar([v1], topn=n)
        print("len(similar_doc):", len(similar_doc))
        i = 1
        for idx, similarity_degree in similar_doc:
            idx = int(idx)
            title, artist = msd_titles[track_ids[idx]]
            print()
            print("{}. {} by {} with similarity degree of {}".format(i, title, artist, similarity_degree))
            print(' '.join(sorted(id_to_sentence[track_ids[idx]].split())))
            print()
            i += 1

    print_top_n(0, 5)
    return torch.tensor(result)  # convert the texts to vectors in batches and return them


if __name__ == '__main__':
    datapath = 'C:/Users/13170/Desktop/dataone/'
    read = []
    for count in range(1, 24825):
        txtopen = open(datapath + str(count) + '.txt')
        txtpre = txtopen.read()
        txtpre = Preprocessing(txtpre)
        read.append(txtpre)
        txtopen.close()
    documents = [TaggedDocument(read[i], [i]) for i in range(len(read))]
    model_dbow = Doc2Vec(documents=documents, dm=0, vector_size=500, workers=cpu_count(), alpha=0.025, min_alpha=0.025)
    model_dm = Doc2Vec(documents=documents, dm=1, vector_size=500, workers=cpu_count(), alpha=0.025, min_alpha=0.025)
    for epoch in range(10):
        begin = time.time()
        model_dbow.train(documents, total_examples=model_dbow.corpus_count, epochs=10)
        model_dbow.alpha -= 0.002
        model_dbow.min_alpha = model_dbow.alpha
        print("epoch:{e} running time: {n}".format(e=str(epoch), n=str(time.time() - begin)))
    for epoch in range(10):
        begin = time.time()
        model_dm.train(documents, total_examples=model_dm.corpus_count, epochs=10)
        model_dm.alpha -= 0.002
        model_dm.min_alpha = model_dm.alpha
        print("epoch:{e} running time: {n}".format(e=str(epoch), n=str(time.time() - begin)))
def run_clustering_experiments(guide: Guide,
                               nice_dir: str,
                               vector_dir: str,
                               overwrite: bool = False,
                               vector_type=TFIDF_VECTORS,
                               cluster_type=TOPIC_CLUSTER,
                               max_samples=None,
                               num_runs=20,
                               vec_path: str = None):
    """
    Run the semi-supervised clustering experiments. This consists of:

    * shuffling the document set for a number of independent runs,
    * building a TF-IDF, GloVe, or word2vec/doc2vec document matrix,
    * seeding clusters with a varying number of supervised samples, and
    * scoring the resulting KMeans clusterings with the adjusted Rand index.
    """
    # A pseudorandom number generator is created,
    # then seeded, to ensure that the results are
    # replicable from run to run.
    r = random.Random(20)
    sorted_docs = sorted(guide.docs, key=lambda x: x.id)
    r.shuffle(sorted_docs)

    # Set the maximum number of samples to a default for
    # topics (20) or categories (50) if left unspecified
    if max_samples is None:
        max_samples = 20 if cluster_type == TOPIC_CLUSTER else 50

    # Load the spacy language model if we plan to
    # use the GloVe vectors.
    spacy_model = None
    if vector_type == GLOVE_VECTORS:
        spacy_model = spacy.load(
            '/home2/rgeorgi/python3/lib/python3.4/site-packages/en_core_web_lg/en_core_web_lg-2.0.0/'
        )

    w2v_model = None
    if vector_type == WORD2VEC_VECTORS:
        if os.path.splitext(vec_path)[1] in ['.bin', '.gz']:
            w2v_model = gensim.models.KeyedVectors.load_word2vec_format(vec_path, binary=True)
        else:
            w2v_model = Doc2Vec.load(vec_path)
            # w2v_model = gensim.models.Word2Vec.load(vec_path)

    # -------------------------------------------
    # Outer loop
    #
    # To make sure that the clustering results are not a fluke of picking a given couple documents
    # to seed the clusters, a number of runs are performed in which the order of the document set
    # is varied, so that different example docs will be chosen
    # -------------------------------------------
    for run_num in range(1, num_runs):

        ordered_docs = []
        true_labels = []

        # sorted_docs = sorted_docs[window_size:] + sorted_docs[:window_size]
        r.shuffle(sorted_docs)

        # Make sure that the order of documents
        # and their labels is static for evaluation.
        for doc in sorted_docs:
            ordered_docs.append(doc)
            if cluster_type == CATEGORY_CLUSTER:
                true_labels.append(doc.category.category_id)
            else:
                true_labels.append(doc.topic.id)

        if vector_type == TFIDF_VECTORS:
            matrix = tfidf_matrix(ordered_docs, nice_dir)
        elif vector_type == GLOVE_VECTORS:
            matrix = glove_matrix(ordered_docs, nice_dir, vector_dir, spacy_model, overwrite=overwrite)
        elif vector_type == WORD2VEC_VECTORS:
            matrix = gensim_matrix(ordered_docs, nice_dir, vector_dir, w2v_model, overwrite=overwrite)

        # -------------------------------------------
        # Iterate over a different number of supervised
        # samples
        # -------------------------------------------
        for supervised_samples in range(0, max_samples + 1):

            # -------------------------------------------
            # Build the initial clusters
            # -------------------------------------------
            inits = init_cluster_dict(matrix.shape[1])
            samples_per_cluster = defaultdict(int)

            # Now, let's pick out some supervised
            # samples.
            for i, doc in enumerate(ordered_docs):
                topic_id = doc.topic.id
                category_id = doc.category.category_id
                label_key = category_id if cluster_type == CATEGORY_CLUSTER else topic_id

                if samples_per_cluster[label_key] <= supervised_samples:
                    v = matrix[i].toarray()[0, :] if not isinstance(matrix[i], np.ndarray) else matrix[i]
                    inits[label_key] += v
                    samples_per_cluster[label_key] += 1

            for key in inits:
                if samples_per_cluster[key] > 1:
                    inits[key] /= samples_per_cluster[key]

            # -------------------------------------------
            # Now, do the clustering.
            # -------------------------------------------
            # If no samples are used, seed the clusters randomly.
            # Otherwise, use the generated init vectors.
            if supervised_samples == 0:
                init = 'random'
            else:
                init = np.array([v for v in inits.values()])

            # Set the number of clusters based on the number of clusters
            # defined in the guide
            num_clusters = len(g.categories) if cluster_type == CATEGORY_CLUSTER else len(g.topics)

            k = KMeans(
                n_clusters=num_clusters,
                random_state=5,
                init=init,
                n_init=1,
            )
            k.fit(matrix)

            rand_index = adjusted_rand_score(true_labels, k.labels_)

            # Finally, print out a CSV row for each iteration.
            csv = '{},{},{}'.format(run_num, supervised_samples, rand_index)
            print(csv)
# In[92]:

reviewTrain = []
for i in text:
    reviewTrain.append(preprocess(i))

del text

# In[93]:

res = []
for i in reviewTrain:
    res.append(i)

documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(res)]
model = Doc2Vec(documents)

# In[94]:

reviewVec = np.array([])
for i in res:
    reviewVec = np.append(reviewVec, model.infer_vector(i))
reviewVec = reviewVec.reshape(len(res), int(reviewVec.shape[0] / len(res)))

# In[95]:

lr = LogisticRegression(multi_class='multinomial', solver='newton-cg')
# # X_train, X_test, y_train, y_test = train_test_split(reviewVec[:trainSize, :], trainLabel,
models['EXT'] = CustomUnpickler(open('BigFiveModels/CON_model.pkl', 'rb')).load()
models['AGR'] = CustomUnpickler(open('BigFiveModels/AGR_model.pkl', 'rb')).load()
models['NEU'] = CustomUnpickler(open('BigFiveModels/NEU_model.pkl', 'rb')).load()

# Load the Clap Prediction Model with the 'simple' Lasso Regression model - CAUTION - It is not very accurate!
with open("./ClapPredictionModels/clap_prediction_model_lasso.pkl", 'rb') as file:
    clap_prediction_model_lasso = pickle.load(file)

column_for_regression = ["sentence_count", "title_word_count", "average_word_count_per_sentence",
                         "text_word_count", "vocab_count_excl_commonwords", "imgs_per_1000words",
                         "FS_GradeScore", "vids_per_1000words", "polarity", "subjectivity"]

# Load the pre-trained Doc2Vec Model trained on 200 sample Medium Data Science articles with 300 vec dimensions
Doc2VecModel = Doc2Vec.load("./ClapPredictionModels/Doc2Vec.model")

# Load the average document vector for the 37 out of the 200 reference articles that have > 5k Claps
VH_Vec = load('./ClapPredictionModels/VH_Claps_Vector.npy')
H_Vec = load('./ClapPredictionModels/H_Claps_Vector.npy')
M_Vec = load('./ClapPredictionModels/M_Claps_Vector.npy')
L_Vec = load('./ClapPredictionModels/L_Claps_Vector.npy')
VL_Vec = load('./ClapPredictionModels/VL_Claps_Vector.npy')


def get_html(url):
    user_agent_list = [
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.1.1 Safari/605.1.15',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:77.0) Gecko/20100101 Firefox/77.0',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:77.0) Gecko/20100101 Firefox/77.0',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36',
        rw_df['Madein_city'][idx - kag_len],
        rw_df['Variety'][idx - kag_len],
        str(idx)
    ]))

max_epochs = 50
vec_size = 100  # Previous setup - 25
alpha = 0.025
window_size = 5
num_workers = 4
minimun_count = 1  # Previous setup - 2

model = Doc2Vec(
    vector_size=vec_size,
    window=window_size,
    alpha=alpha,
    min_alpha=0.00025,
    min_count=minimun_count,
    dm=1,  # PV-DM
    workers=num_workers,
    epochs=max_epochs)

model.build_vocab(tagged_data)

for epoch in range(max_epochs):
    start = timer()
    print('iteration {0}'.format(epoch))
    model.train(tagged_data,
                total_examples=model.corpus_count,
                epochs=model.epochs)
    # decrease the learning rate
    model.alpha -= 0.0002
import hug
from gensim.models.doc2vec import Doc2Vec
from gensim.utils import simple_preprocess
import re

model = Doc2Vec.load('models/wiki-latest')


@hug.get('/topicos', examples='frase=Vingadores são um grupo de super-heróis')
@hug.local()
def topicos(frase: str):
    """Reports the topics of an arbitrary sentence"""
    tokens = simple_preprocess(frase)
    inferred_vector = model.infer_vector(tokens)
    similars = model.docvecs.most_similar([inferred_vector], topn=10)
    return {
        'topicos': similars
    }


@hug.get(examples="expressao=homem está para rei como mulher está para")
def analogia(expressao: str):
    """Computes an analogy between terms"""
    entry = '{0}'.format(expressao)
    math_symbol = "\+"
    analogy_symbol = "está para"

    # Case 1: user wants to do word math: word1 - word2 + word3
    positive = []
from gensim.models.doc2vec import Doc2Vec
import os

from .heroku import *  # noqa

ALLOWED_HOSTS = ['0.0.0.0', '127.0.0.1:8000']

SECRET_KEY = 'f=fqwc&$zt_6rf8y45j1l7w!^e*%a_c)4sf+v*_uf%hwf5_*16'

# MODEL_FILE is the full path of the neural net model to be used.
# *Make sure the test file is not .gitignored*; it is needed for CI.
# However, production-quality models are too big for GitHub, so they should be
# .gitignored.
# MODEL_FILE defaults to the test model used for CI; because it is checked into
# the repo it should be present and is therefore a sensible default for local
# development. If you want to have a production-like environment, and to use
# a model that represents the entire database, get it separately; put it in
# hamlet/model/hamlet.model; and add DJANGO_USE_LIVE_MODEL=True to your .env.
modelpath = os.environ.get('DJANGO_MODEL_path', '')

if modelpath:
    MODEL_FILE = os.path.join(PROJECT_DIR, modelpath)
else:
    MODEL_FILE = os.path.join(PROJECT_DIR, 'testmodels', 'testmodel.model')

NEURAL_NET = Doc2Vec.load(MODEL_FILE)

# The string "PASSED" will pass any captcha.
# Don't use this in production!
# http://django-simple-captcha.readthedocs.io/en/latest/advanced.html#captcha-test-mode
CAPTCHA_TEST_MODE = True
#!/usr/bin/python
# -*- coding: UTF-8 -*-

from gensim.models.doc2vec import Doc2Vec
import pykeyvi

docvecs_process_input_keyvi_index_file = "docvecs_urlid_url.kv"
output_data_path = "/raid/ankit/doc2vec/out_s_p_1M"
doc2vec_trained_model = 'pages_with_spaces.doc2vec'
_alpha, _min_alpha, _passes = (0.020, 0.001, 20)

print "Loading keyvi dictionaries ..."
keyvi_dict = pykeyvi.Dictionary("{}/{}".format(output_data_path, docvecs_process_input_keyvi_index_file))
print "Finished Loading key-vi Dictionary."

print "Loading Doc2Vec Model ... "
model = Doc2Vec.load("{}/{}".format(output_data_path, doc2vec_trained_model))
print "Model Loaded Successfully!"


def get_similar_urls(sample_query, nearest_num):
    tokens = sample_query.lower().split()
    dv = model.infer_vector(tokens, alpha=_alpha, min_alpha=_min_alpha, steps=_passes)  # note: may want to use many more steps than default
    sims = model.docvecs.most_similar(positive=[dv], topn=nearest_num)
    for url_id, distance in sims:
        url = ""
        for m in keyvi_dict.Get(str(url_id)):
            url = m.GetValueAsString()
        print "{}\t{}\t{}".format(url_id, url, distance)


def main():
    print "\nSimilar URLS for Queries - Doc2Vec Retrieval Interface [All URL's]"
def similarity_scores(df, meth):
    from collections import defaultdict
    from gensim import corpora
    from gensim.models.doc2vec import Doc2Vec, TaggedDocument
    from nltk.tokenize import word_tokenize
    from nltk.corpus import stopwords

    textsList = df[['description', 'title']].values.T.tolist()
    textsList_flat = [item for sublist in textsList for item in sublist]
    documents = textsList_flat

    if meth == 'd2v':
        # Doc2Vec preprocessing
        tagged_data = [
            TaggedDocument(words=word_tokenize(_d.lower()), tags=[str(i)])
            for i, _d in enumerate(documents)
        ]  # this is sufficient for a word-order-conserving model where we will retain punctuation

        # Doc2Vec
        # run the model
        max_epochs = 10
        vec_size = 25
        alpha = 0.03
        model = Doc2Vec(size=vec_size,
                        alpha=alpha,
                        min_alpha=0.00025,
                        min_count=2,
                        dm=1)
        model.build_vocab(tagged_data)
        for epoch in range(max_epochs):
            print('iteration {0}'.format(epoch))
            model.train(tagged_data,
                        total_examples=model.corpus_count,
                        epochs=model.iter)
            # decrease the learning rate
            model.alpha -= 0.00025
            # fix the learning rate, no decay
            model.min_alpha = model.alpha
        model.save("d2v_mixed7.model")
        print("Model Saved")

        # compute cosine similarity
        cossims = []
        for i in range(len(df)):
            cossimil = model.docvecs.similarity(i, (len(df) + i))
            cossims.append(cossimil)

    elif meth == 'lda' or meth == 'lsi':
        # BOW model preprocessing
        # Split the document into tokens
        from gensim.utils import simple_preprocess

        def sent_to_words(sentences):
            for sentence in sentences:
                yield (simple_preprocess(str(sentence), deacc=True))

        texts = list(sent_to_words(documents))

        # Remove common words and words that are only one character.
        stoplist = set('for a of the and to in'.split())
        # stoplist = set(stopwords.words('english'))
        texts = [[
            token for token in doc
            if (len(token) > 1) and (token not in stoplist)
        ] for doc in texts]

        # Lemmatize the documents.
        from nltk.stem.wordnet import WordNetLemmatizer
        lemmatizer = WordNetLemmatizer()
        texts = [[lemmatizer.lemmatize(token) for token in doc] for doc in texts]

        # Lemmatized reduction with spaCy package, to keep only certain word-classes (noun, adjective, verb, adverb), i.e.
        # remove prepositions etc.
        # function from https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/
        def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
            texts_red = []
            for sentence in texts:
                doc = nlp(" ".join(sentence))
                texts_red.append([
                    token.lemma_ for token in doc if token.pos_ in allowed_postags
                ])
            return texts_red

        import spacy
        nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
        texts = lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

        # Filter carefully to remove rarest words (occurring in less than 15 documents), or common lemmas (more than 60% of the documents)
        dictionary = corpora.Dictionary(texts)
        dictionary.filter_extremes(no_below=15, no_above=0.6)

        # Construct the final corpus, bag-of-words representation of documents
        corpus = [dictionary.doc2bow(text) for text in texts]

        # Run the model
        if meth == 'lda':
            # LDA
            from gensim import models
            lda_model = models.LdaModel(corpus=corpus,
                                        id2word=dictionary,
                                        alpha='auto',
                                        eta='auto',
                                        iterations=10,
                                        passes=2,
                                        num_topics=100,
                                        eval_every=None,
                                        decay=0.8,
                                        offset=1)
            corpus_lda = lda_model[corpus]

            # compute cosine similarity
            from gensim.matutils import cossim
            cossims = []
            for i in range(len(df)):
                cossimil = cossim(corpus_lda[i], corpus_lda[len(df) + i])
                cossims.append(cossimil)
        else:
            # LSI (with TFIDF)
            from gensim import models
            tfidf = models.TfidfModel(
                corpus,
                smartirs='npc',  # probabilistic idf
                slope=0.2
            )  # lower slope means longer documents are favoured more (usually an effective choice for TFIDF)
            corpus_tfidf = tfidf[corpus]
            lsi_model = models.LsiModel(corpus_tfidf,
                                        id2word=dictionary,
                                        num_topics=300,
                                        power_iters=2)
            corpus_lsi = lsi_model[corpus_tfidf]

            # compute cosine similarity
            from gensim.matutils import cossim
            cossims = []
            for i in range(len(df)):
                cossimil = cossim(corpus_lsi[i], corpus_lsi[len(df) + i])
                cossims.append(cossimil)
    else:
        print("Please provide a valid method ('lda', 'lsi', 'd2v')")

    df_sims = df.assign(sim=cossims)
    return df_sims
def load(cls, model_file='synset2vec'):
    model = Doc2Vec.load(model_file)
    return cls(model)
def load_model(model_path):
    # NB: abspath('__file__') with the literal string resolves relative to the
    # current working directory, so model_path is taken relative to the CWD.
    model = Doc2Vec.load(os.path.join(
        os.path.dirname(os.path.abspath('__file__')), model_path))
    return model
        vec.append(float(el))
    return vec


# load w2v model and w2v vectors
model_path = 'araneum_none_fasttextcbow_300_5_2018/araneum_none_fasttextcbow_300_5_2018.model'
model_w2v = Word2Vec.load(model_path)
broken_vectors = pd.read_csv('model_vec_w2v.csv').text_vec
model_vectors_w2v = broken_vectors.apply(make_list)

# load d2v model and d2v vectors
broken_vectors_d2v = pd.read_csv('model_vectors_d2v.csv').text_vec
model_vectors_d2v = broken_vectors_d2v.apply(make_list)
model_d2v = Doc2Vec.load('doc2vec.model')

# open texts
texts = list(pd.read_csv('corpus.csv').text)

# open inverted index
inv_ind = json.load(open('invind.json'))


@app.route('/', methods=['GET'])
def index():
    if request.args:
        query = request.args.get('query')
        search_method = request.args.get('search_method')
        search_result = search(query, search_method)
        return render_template('result.html',
# -*- coding: utf-8 -*-
"""
Created on Tue Apr 30 20:54:58 2019

@author: Athan
"""
from gensim.models.doc2vec import Doc2Vec
from nltk.tokenize import word_tokenize

model = Doc2Vec.load("d2v.model")

# to find the vector of a document which is not in the training data
test_data = word_tokenize("I love chatbots".lower())
v1 = model.infer_vector(test_data)
print("V1_infer", v1)
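# Follow-up sketch (assuming the pre-4.0 gensim docvecs API used above): the
# inferred vector can be used to rank the training documents by similarity.
similar_docs = model.docvecs.most_similar([v1], topn=5)
print(similar_docs)  # list of (tag, cosine similarity) pairs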
    model_DM.train(training_doc)
    model_DBOW.train(training_doc)

    # Save the trained models:
    fout = 'DM.d2v'
    model_DM.save(most_recent + fout)
    model_DM.init_sims(replace=True)

    fout = 'DBOW.d2v'
    model_DBOW.init_sims(replace=True)
    model_DBOW.save(most_recent + fout)
else:
    # Load the Doc2Vec models from disk:
    fout = 'DM.d2v'
    model_DM = Doc2Vec.load(most_recent + fout)

    fout = 'DBOW.d2v'
    model_DBOW = Doc2Vec.load(most_recent + fout)

# train the two different methods of the Doc2Vec algorithm:
# NB DBOW is more similar to the recommended skip-gram of
# Word2Vec by the original paper's authors.
print('nonmatch', model_DM.doesnt_match("delay government flooding lightning".split()))
print('nonmatch', model_DM.doesnt_match("euref voteout remain lightning".split()))
print('euref sim by word', model_DM.similar_by_word('euref'))
print('flood ', model_DM.similar_by_word('flood'))
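# Added sketch (not part of the original pipeline): one common way to pair the
# DM and DBOW models is to concatenate their vectors for the same text, which
# often works better than either model alone. Assumes both models support
# inference on the same tokenisation.
import numpy as np

def combined_vector(tokens):
    # infer a vector from each model and concatenate them
    return np.concatenate([model_DM.infer_vector(tokens),
                           model_DBOW.infer_vector(tokens)])

vec = combined_vector("flooding reported after heavy rain".split())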
        'job': str,
        'description': str,
        'others': str
    },
    error_bad_lines=False)

job_description_df.describe()
job = job_description_df['job']
jobno = job_description_df['jobno']
job_in_now = job.tolist()
jobno_now = jobno.tolist()
yy = job_in_now[9210]
print(len(job_in_now))
print(yy)

# Load the trained model wiki_seg_job_0728_true_model.txt, then put the text to
# query into test_data_1 to find similar documents.
model = Doc2Vec.load(
    '/Users/liouscott/Documents/scott/104_competition/model/wiki_seg_job_0728_true_model.txt'
)
test_data_1 = '工讀生'  # "part-time worker"
test_cut_raw_1 = []
item2 = pseg.cut(test_data_1)
for k in list(item2):
    test_cut_raw_1.append(k.word)
inferred_vector = model.infer_vector(test_cut_raw_1)
sims = model.docvecs.most_similar([inferred_vector], topn=20)
sims_two = np.dot(model.docvecs[6682], model.docvecs[50021])
print(sims)  # sims is a list of tuples: (index_of_document, similarity)
print(sims_two)
print(content[9210])
print(len(description_in))
print(len(model.docvecs))
print(model.docvecs)
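# Added note (sketch): np.dot on raw docvecs gives an unnormalised dot product,
# not a cosine similarity. For a score in [-1, 1] comparable to most_similar
# output, use the docvecs similarity method (pre-4.0 gensim API) instead:
cos_sim = model.docvecs.similarity(6682, 50021)
# equivalently: np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))
print(cos_sim)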
if deps_model_file != "":
    has_deps_embeddings = True
    logging.info("Loading dependency embeddings from %s" % deps_model_file)
    deps_model = Embeddings.load(deps_model_file + ".npy",
                                 deps_model_file + ".vocab")
    logging.info("Deps Model loaded!")
    # deps_vocabulary = deps_model._vocab
    # deps_embeddings = deps_model._vecs

# Load the embedding model here
is_doc2vec_model = False
# load either a doc2vec or a word2vec model
if doc2vec_model_file != '':
    model = Doc2Vec.load(doc2vec_model_file)
    is_doc2vec_model = True
else:
    if word2vec_load_bin:
        # use this for the Google News binary vectors
        model = Word2Vec.load_word2vec_format(word2vec_model_file, binary=True)
    else:
        model = Word2Vec.load(word2vec_model_file)

use_id_for_vector = use_id_for_vector and is_doc2vec_model

word2vec_num_features = len(model.syn0[0])
logging.info("Embeddings feature vectors length: %s" % word2vec_num_features)
logging.info("Model syn0 len=%d" % len(model.syn0))

# define classes
class_mapping = dict([(val, idx) for idx, val in enumerate(valid_senses)])
def nlp():
    tagged_data = []
    stemmer = SnowballStemmer("hungarian")
    # language code returned for a Hungarian reference sentence
    hu = detect("nagyon szertém ha működnél köszi puszi")
    # assert gensim.models.doc2vec.FAST_VERSION > -1

    conn = sqlite3.connect(r'C:\Users\Domos\Documents\andris disszertacio\url.db')
    curr = conn.cursor()

    def clean(urlrow):
        # strip escape sequences and punctuation left over from the scrape
        for old, new in [('\\xa0', ' '), ('\xa0', ' '), ('\\n', ''), ('\\r', ''),
                         ('\\', ''), ('\\xadt', ' '), ('\\t', ''), ('(', ''),
                         (')', ''), ("'", ''), (',', ''), ('[', ''), (']', '')]:
            urlrow = urlrow.replace(old, new)
        return urlrow

    # Collect one large Hungarian-only document per year (2014-2018).
    yearly_texts = []
    for year in ('2014', '2015', '2016', '2017', '2018'):
        curr.execute(
            """SELECT DISTINCT paragaph FROM 'psArticle_tb' where time like "%{}%";""".format(year))
        ps = ""
        for row in curr.fetchall():
            urlrow = str(row)
            if urlrow == "":
                continue
            urlrow = clean(urlrow)
            try:
                detect(urlrow) == hu
            except:
                continue
            else:
                if detect(urlrow) == hu:
                    ps += urlrow
        yearly_texts.append(ps)
        print("ps{} done".format(year[2:]))

    a = yearly_texts  # [ps14, ps15, ps16, ps17, ps18]
    for j, _k in enumerate(a):
        words = []
        w = word_tokenize(_k.lower())
        for word in w:
            words.append(stemmer.stem(word))
        tags = [str(j)]
        tagged_data += [TaggedDocument(words, tags)]
    print(tagged_data)

    max_epochs = 2
    vec_size = 300
    alpha = 0.025
    model = Doc2Vec(size=vec_size,
                    alpha=alpha,
                    min_alpha=0.00025,
                    min_count=1,
                    dm=1)
    model.build_vocab(tagged_data)
    for epoch in range(max_epochs):
        print('iteration {0}'.format(epoch))
        model.train(tagged_data,
                    total_examples=model.corpus_count,
                    epochs=model.iter)
        model.alpha -= 0.0002
        model.min_alpha = model.alpha
    model.save("ps.model")
    print("Model Saved")

    model = Doc2Vec.load("ps.model")
    # to find the vector of a document which is not in the training data
    print(model.wv.most_similar("migráns"))
    print(model.docvecs.most_similar([1]))
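    # Added sketch: with one tagged document per year ('0'..'4' for 2014-2018),
    # the pre-4.0 docvecs API can compare the years directly, e.g. how close
    # 2014's language is to 2018's.
    print(model.docvecs.similarity('0', '4'))       # cosine similarity between two years
    print(model.docvecs.most_similar('0', topn=4))  # rank the other years against 2014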
from pyspark import SparkContext
from pyspark.sql import SQLContext
from gensim.models.doc2vec import Doc2Vec

sc = SparkContext()
sqlContext = SQLContext(sc)

# the model is a large object, so we cache it once on each worker node
gmod_broadcast = sc.broadcast(Doc2Vec.load("/root/doc2vec/doc2vec_model/hn"))

df = sqlContext.read.load("hdfs:///hndata/parquet_typed", format="parquet")

ids = df.where("score IS NOT NULL") \
        .where("type='story'") \
        .where("title IS NOT NULL") \
        .map(lambda row: row.id)

def mergeVec(id):
    # look up the precomputed document vector for this story's title
    gmod = gmod_broadcast.value
    vec = gmod.docvecs["TITLE_%d" % id]
    return (id, vec)

docvecs = ids.map(mergeVec)
docvecs.saveAsPickleFile("hdfs:///hndata/docvecs_glove_pickle")
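# Added sketch: the saved RDD of (id, vector) pairs can be read back later with
# the matching SparkContext API in a downstream job.
docvecs_back = sc.pickleFile("hdfs:///hndata/docvecs_glove_pickle")
print(docvecs_back.take(1))  # e.g. [(id, array([...]))]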
def load_doc2vec_model():
    # assumes DIR_PATH is set in the environment
    fname = get_tmpfile(os.environ["DIR_PATH"] + "/data/doc2vec/v2/doc2vec_articles_181030")
    model = Doc2Vec.load(fname)
    return model
def mkExistingTrainedModel(path):
    return Doc2Vec.load(path)
tagged_data = [
    TaggedDocument(words=word_tokenize(sentence.lower()),
                   tags=[sentence.lower()])
    for sentence in df['DESC']
]
# tagged_data contains TaggedDocuments whose words are the tokens and whose tag
# is the cleaned sentence itself, so that we can identify it later
df['TAGGED_DATA'] = tagged_data

max_epochs = 100
vec_size = 100
alpha = 0.025
model = Doc2Vec(vector_size=vec_size,
                alpha=alpha,
                min_alpha=0.00025,
                min_count=1,
                dm=1)
model.build_vocab(tagged_data)

for epoch in range(max_epochs):
    model.train(tagged_data,
                total_examples=model.corpus_count,
                epochs=model.iter)
    # decrease the learning rate
    model.alpha -= 0.0002
    # fix the learning rate, no decay
    model.min_alpha = model.alpha

# to get a sentence vector, use the cleaned form of the sentence as an index
# into model.docvecs: for df['DESC'][0], use model.docvecs[df['DESC'][0]]
# or model[df['DESC'][0]]
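# Added sketch: because each document is tagged with its own cleaned sentence,
# semantically similar descriptions can also be found directly by tag
# (pre-4.0 gensim docvecs API assumed).
query_tag = df['DESC'][0].lower()
print(model.docvecs[query_tag])                       # stored vector for that description
print(model.docvecs.most_similar(query_tag, topn=5))  # nearest other descriptions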