def setUp(self):
        filename = datapath("alldata-id-10.txt")
        train_docs = read_sentiment_docs(filename)
        self.train_docs = train_docs
        self.source_doc_vec_file = datapath("small_tag_doc_5_iter50")
        self.target_doc_vec_file = datapath("large_tag_doc_10_iter50")

        self.source_doc_vec = Doc2Vec.load(self.source_doc_vec_file)
        self.target_doc_vec = Doc2Vec.load(self.target_doc_vec_file)
Example #2
    def __init__(self, size=300, window=8, min_count=2, workers=8, path_to_model=None, stream_train=False):

        '''
        Initializes the Doc2Vec_Wrapper class. 

        Args:
            size (int): Specifies the size of the feature-vector. Defaults to 300
            window (int): Specifies the size of the context window from which the feature vector is learned
            min_count (int): Specifies the minimum number of instances of each word that is saved in the model
            workers (int): number of parallel processes
            path_to_model (str): Specifies model on disk 
            stream_train (bool): If true, update word vectors with new sentences. If false, just get doc vecs
        '''

        self.stream_train=stream_train

        self.is_trained = False
        self.model = None

        ## if a path is passed, try to load from disk; otherwise (or if loading fails) retrain
        if path_to_model:
            try:
                self.model = Doc2Vec.load(path_to_model)
                self.is_trained = True
            except Exception:
                pass  # loading failed; leave is_trained False so the model is retrained

        ## params for Doc2Vec 
        self.size = size ## size of the vector
        self.window = window ## size of the context window
        self.min_count = min_count ## minimum count of vocab to store in binary tree
        self.workers = workers ## number of parallel processes == number of cores on the computer
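# A minimal, self-contained sketch of the load-or-retrain pattern used above.
# 'my_model.d2v' is a hypothetical path: if loading fails, is_trained stays
# False and a fresh model would be trained elsewhere, mirroring __init__.
from gensim.models.doc2vec import Doc2Vec

model, is_trained = None, False
try:
    model = Doc2Vec.load('my_model.d2v')
    is_trained = True
except Exception:
    pass  # no usable saved model; fall back to training from scratch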
 def load_external(self, model_file_name):
     """
     load a doc2vec model from the file specified
     :param model_file_name: name of the model file
     :return:
     """
     self.model = Doc2Vec.load(model_file_name)
Example #4
def test_category():
    from gensim.models.doc2vec import Doc2Vec
    from sematch.utility import FileIO
    from sematch.semantic.relatedness import ConceptRelatedness
    model_category = Doc2Vec.load(FileIO.filename('models/category/cat2vec'))
    cat2vec_rel = ConceptRelatedness(model_category)
    print(cat2vec_rel.word_similarity('happy','sad'))
Example #5
    def __init__(self, sentences, name, dataset_name, epochs=1, dimension=50, modelfile=None):
        self.inner_model = None

        # parameters
        self.dataset = dataset_name
        self.sentences = sentences
        self.name = name
        self.epochs = epochs
        self.dimension = dimension

        # data file path
        models_folder = os.path.join(*[os.path.dirname(os.path.dirname(os.path.abspath(__file__))), 'data', 'models'])
        if modelfile is not None:
            filename = modelfile
        else:
            filename = "DOC2VEC_%s_%s_%s_%s" % (self.dataset, self.name, self.epochs, self.dimension)
        self.filepath = os.path.join(models_folder, filename)
        model_exists = os.path.isfile(self.filepath)

        # train initial model
        if model_exists:
            logging.info("found data file %s" % (self.filepath, ))
            self.inner_model = Doc2Vec.load(self.filepath)
        else:
            self.inner_model = Doc2Vec(sentences, size=self.dimension)
            print self.inner_model.vocab.keys()
            self.inner_model.save(fname=self.filepath)
Example #6
def do_command(args):
    # Load data
    data = load_data(args.input)
    #ids, documents = zip(*data)
    data = [(id, tokenize(doc)) for id, doc in data]
    ids = [id for id, _ in data]

    if not os.path.exists(args.modelfile):
        model = embed_documents(data)
        # Save model
        model.save(args.modelfile)
    else:
        model = Doc2Vec.load(args.modelfile)
        #map(model.infer_tokens, tokenized)
    print("Loaded model.")
    # Do k-nearest neighbors search.

    writer = csv.writer(args.output, delimiter='\t')
    writer.writerow(["id1", "id2", "score"])
    count = int(args.count) if args.count > 0 else len(model.docvecs)
    vectors = np.array([model.docvecs[i] for i in range(count)])
    del model # clear up memory

    for i, j, score in find_nearest_neighbors(vectors):
        id1, id2 = ids[i], ids[j]
        writer.writerow([id1, id2, score])
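# The find_nearest_neighbors helper used above is not shown; a possible
# implementation (an assumption, using scikit-learn) that yields
# (i, j, similarity) triples for each vector's single nearest neighbour:
from sklearn.neighbors import NearestNeighbors

def find_nearest_neighbors(vectors):
    nn = NearestNeighbors(n_neighbors=2, metric='cosine').fit(vectors)
    distances, indices = nn.kneighbors(vectors)
    for i, (dist_row, idx_row) in enumerate(zip(distances, indices)):
        j = int(idx_row[1])                    # idx_row[0] is the vector itself
        yield i, j, 1.0 - float(dist_row[1])   # convert cosine distance to similarity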
Example #7
def varify():
    from gensim.models.doc2vec import Doc2Vec
    model = Doc2Vec.load('data/doc2vec.d2v')
    documents = pickle.load(open('data/fedcorpus.pick', 'r'))
    for i in xrange(3):
        inferred_docvec = model.infer_vector(documents[i].words)
        print documents[i].tags
        print('%s:\n %s' % (model, model.docvecs.most_similar([inferred_docvec], topn=3)))
def get_model(model_num, model_names):
    
    
    if model_num < 10:
        model = Word2Vec.load(model_path + model_names)
    elif model_num < 99:
        model = Doc2Vec.load(model_path + model_names)
    else:
        model = Word2Vec.load_word2vec_format(model_path + model_names, binary=True)  # C text format
    return model
Example #9
def create_and_train_models_d2vec(tag, cores=6):
    """
    Build vocabulary and train models
    :param tag: small or big 
    :param cores: number of cores
    :return: the current models
    """
    simple_models = get_models_d2vec(cores)
    model_files = get_models_filename_d2vec(tag)
    if all([os.path.exists(file) for file in model_files]):
        print('Models exist, loading...')
        for i, fname in enumerate(model_files):
            simple_models[i] = Doc2Vec.load(fname)
        models_by_name = OrderedDict((str(model), model) for model in simple_models)
        return models_by_name
    else:
        print('Building models...')
        voc_model = build_vocab_d2vec(tag, cores)
        # Share vocabulary between models
        for model in simple_models:
            model.reset_from(voc_model)

        models_by_name = OrderedDict((str(model), model) for model in simple_models)
        print('Training models...')
        print("START %s" % datetime.datetime.now())
        best_error = defaultdict(lambda: 1.0)  # to selectively-print only best errors achieved

        alpha, min_alpha, passes = (0.025, 0.001, 20)
        alpha_delta = (alpha - min_alpha) / passes
        file = x_train_str.format(tag)
        x_train = pd.read_hdf(file)
        train_list = x_train.tolist()

        for epoch in range(passes):
            shuffle(train_list)  # shuffling gets best results

            for name, train_model in models_by_name.items():
                # train
                duration = 'na'
                train_model.alpha, train_model.min_alpha = alpha, alpha
                with elapsed_timer() as elapsed:
                    train_model.train(CorpusStream(train_list, 'train'), total_examples=train_model.corpus_count,
                                      epochs=train_model.iter)
                    duration = '%.1f' % elapsed()

            print('completed pass %i at alpha %f' % (epoch + 1, alpha))
            alpha -= alpha_delta

        print("END %s" % str(datetime.datetime.now()))
        for name, model in models_by_name.items():
            name = name.replace('/', '').replace(',', '_')
            model.save('models/{0}_{1}.m'.format(name, tag))

    return models_by_name
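# A plausible sketch of the get_models_d2vec helper used above (an assumption;
# the real helper is not shown): one PV-DBOW and one PV-DM variant sharing the
# worker count, with parameter names following the older gensim (<4.0) API that
# the snippet itself relies on (size/iter/reset_from).
from gensim.models.doc2vec import Doc2Vec

def get_models_d2vec(cores):
    return [
        Doc2Vec(dm=0, size=100, negative=5, min_count=2, iter=10, workers=cores),  # PV-DBOW
        Doc2Vec(dm=1, dm_mean=1, size=100, window=10, negative=5, min_count=2,
                iter=10, workers=cores),                                           # PV-DM (mean)
    ]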
Example #10
def get_WordVector_matrix(label):
    model = Doc2Vec.load('./WordVector_model.d2v')
    size = len(label)
    vectors = np.zeros((size,depth))
    for i in range(size):
        try:
            doc_vector = model.docvecs[str(i)]
            vectors[i]=(doc_vector[0])
        except KeyError:
            print str(i) + ' occurs KeyError'
            pass
    return map(list,vectors)
Example #11
def test_models( FULL_SIM, models_files ):
    test_papers = pd.read_csv( TEST_FILEPATH )

    # NOTE: Only need for testing with AII:
    keywords_docsrels = populate_iks_dict()
    authorities = initialize_authorities()

    for mod_f in models_files:
        print( 'Testing '+ mod_f )
        model = Doc2Vec.load( mod_f )
        print( 'Model loaded.' )

        test_model( FULL_SIM, model, test_papers, keywords_docsrels, authorities )
Example #12
def build_model(x_train, x_test, iteration=5, save=True):
    if save:
        big_list = x_train + x_test
        model = Doc2Vec(min_count=2, window=10, size=100, sample=1e-4, negative=5, workers=8)
        model.build_vocab(big_list)
        for i in range(iteration):
            model.train(big_list)
        print 'saving model to file.....'
        model.save('./sentim.d2v')
    else:
        print 'loading model from file.....'
        model = Doc2Vec.load('./sentim.d2v')
    return model
Example #13
def get_vec(vector_file, id_file, w_file):
    p2v = Doc2Vec.load(vector_file)
    fout = open(w_file, "w")
    index = 0
    with open(id_file) as f:
        for line in f:
            index += 1
            if index % 1000 == 0:
                logging.info("%d cases" % index)
            line = line.strip()
            vec = p2v.docvecs[line]
            line_w = line + "\t" + "\t".join([str(x) for x in vec]) + "\t" + "\n"
            fout.write(line_w)
    fout.close()
Example #14
def datacluster(data):
	infered_vectors_list = []
	print "load model..."
	model_dm = Doc2Vec.load(model_path)
	print "load train vectors..."
	for text, label in data:
		vector = model_dm.infer_vector(text)
		infered_vectors_list.append(vector)
	'''
	print "Check the optimized parameter..."
	Nc = range(1, 50)
	pca_data = [PCA(n_components = i).fit(infered_vectors_list).transform(infered_vectors_list) for i in Nc]
	kmeans = cluster.KMeans(init='k-means++',n_clusters=20,max_iter=300)
	score = [kmeans.fit(pca_data[i]).score(pca_data[i]) for i in range(len(pca_data))]
	print score
	plt.plot(Nc,score)
	plt.xlabel('PCA components')
	plt.ylabel('Score')
	plt.title('Elbow Curve')
	plt.show()
	'''

	print "PCA decomposition..."
	pca = PCA(n_components = 10).fit(infered_vectors_list)
	pca_data = pca.transform(infered_vectors_list)
	print "train K-Means model..."
	kmean_model = cluster.KMeans(init='k-means++',n_clusters=16,max_iter=300)
	kmean_model.fit(pca_data)
	#get the classified index
	result = kmean_model.fit_predict(pca_data)
	print "Predicting result:", result
	#save the cluster result
	joblib.dump(kmean_model, cluster_path)
	#load the cluster result
#	new_km = joblib.load(cluster_path)
	numSamples = len(pca_data) 
	print numSamples
	centroids = kmean_model.labels_
	
	#print centroids, type(centroids)  # show the cluster centers
	#print kmean_model.inertia_  # show the clustering quality (inertia)
	'''	
	marker = ['o', '.', ',', 'x', '*', 'd', 's', 'p']
	color = ['r', 'g', 'b', 'c', 'm', 'k', 'y', 'w']
	for i in xrange(numSamples):
		plt.scatter(pca_data[i][0], pca_data[i][1], \
				marker=marker[centroids[i]], color=color[centroids[i]])
	plt.show()
	'''
	return centroids
Example #15
def main():
    #load data set
    training_reviews = load_dataset(TRAIN_FILE)
    testing_reviews = load_dataset(TEST_FILE)

    #load doc2vec model
    doc2vec_model = Doc2Vec.load(DOC2VEC_MODEL)

    cate_index = get_all_categories(training_reviews)
    cates = dict2list(cate_index)
    n_cates = len(cates)

    train_X = get_X(training_reviews, doc2vec_model)
    test_X = get_X(testing_reviews, doc2vec_model)

    train_labels = get_labels(training_reviews, cate_index)
    test_labels = get_labels(testing_reviews, cate_index)

    labelwise_acc = []
    labelwise_output = []

    for cate in range(n_cates):
        # train a binary model
        train_Y = get_Y(train_labels, cate)
        prob = svm_problem(train_Y, train_X)
        param = svm_parameter("-s 0 -t 2 -b 1")
        m = svm_train(prob, param)

        # test
        test_Y = get_Y(test_labels, cate)
        p_label, p_acc, p_val = svm_predict(test_Y, test_X, m, '-b 1')

        labelwise_acc.append(p_acc)
        labelwise_output.append(p_label)

    # evaluation
    p, r, f = microF1(labelwise_output, test_labels)

    # output
    out_dir = "../data/use_doc2vec/"
    out_file = out_dir + "laptop.txt"
    labelwise_acc = [(cates[i], labelwise_acc[i][0]) for i in range(n_cates)]
    labelwise_acc = sorted(labelwise_acc, key=lambda x:x[1])
    with open(out_file, 'w') as out:
        out.write("Precision:\t{}\nRecall:\t{}\nF1:\t{}\n".format(p, r, f))
        print("{}\n{}\n{}".format(p, r, f))
        for cate_i in range(n_cates):
            out.write("{}:\t{}\n".format(labelwise_acc[cate_i][0], labelwise_acc[cate_i][1]))
def get_distances_subset(n_closest, category_hash_with_doc_ids, csv_path):
    # example
    # category_hash_with_doc_ids = {"cat1":["us-1", "us-2"], "cat2": ["us-3"]}
    # loop over subjects and average docvecs belonging to subject.
    # place in dictionary
    model = Doc2Vec.load('../doc2vec_model')
    cpc_vectors  = get_category_vectors_subset(model, category_hash_with_doc_ids)
    distance_mat = get_distance_mat(cpc_vectors)

    to_csv = []
    for subj_id in list(category_hash_with_doc_ids.keys()):
        relateds = get_n_closest(distance_mat, subj_id, n=n_closest)
        for related_id, dist in relateds.iteritems():
            weight = round(1./dist)
            #weight = round((1-dist) * 10)
            row = (subj_id, related_id, weight, subj_id, related_id)
            to_csv.append(row)

    edges = pd.DataFrame(to_csv, columns=['source', 'target', 'weight', 'source_name', 'target_name'])
    edges.to_csv(csv_path, index=False)
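# One way the get_distance_mat helper used above might be written (an assumption;
# the original is not shown): pairwise cosine distances between the averaged
# category vectors, as a labelled DataFrame so that .loc[subject_id] works later.
import numpy as np
import pandas as pd
from scipy.spatial.distance import cdist

def get_distance_mat(category_vectors):
    keys = list(category_vectors.keys())
    mat = np.vstack([category_vectors[k] for k in keys])
    return pd.DataFrame(cdist(mat, mat, metric='cosine'), index=keys, columns=keys)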
Example #17
def test():
	global english_punctuations, model_path
	new_model = Doc2Vec.load(model_path)
#	sentence = "reserve setup_data: [mem 0x000000008f889018-0x000000008f8bc057] usable"
#	sentence = "efi: mem14: type=2, attr=0xf, range=[0x000000008fa17000-0x000000008fb19000) (1MB)"
#	sentence = "pci 0000:07:08.2: [8086:208d] type 00 class 0x088000"
#	sentence = "i40e 0000:b0:00.2: irq 41 for MSI/MSI-X"
	sentence = "ata8: SATA link up 6.0 Gbps (SStatus 133 SControl 300)"
	#tokenize
	test_tokenized = [word.lower() for word in word_tokenize(sentence)]
	#remove stopwords
	english_stopwords = stopwords.words('english')
	test_stopwords = [word for word in test_tokenized if not word in english_stopwords]
	#remove punctuation
	test_punctuation = [word for word in test_stopwords if not word in english_punctuations]
	#stem words
	#st = PorterStemmer()   
	#test_stemmed = [st.stem(word) for word in test_punctuation]
	test_text = test_punctuation
	print "===>Testing sentence:", test_text
	inferred_vector_dm = new_model.infer_vector(test_text)
	sims = new_model.docvecs.most_similar(positive=[inferred_vector_dm])
	return sims
Example #18
	def __init__(self, model_name=None, corpus=None, stop_words=False, filename=None, **kwargs):
		"""
		model_name: name of the model which has been trained and saved
		corpus: dictionary with 'question' and 'answer', where corpus['question'] is a list of TaggedDocuments
		filename: name of file containing the questions dataset
		"""
		if corpus:
			self.corpus = corpus
		else:
			self.corpus = {}
			self.corpus['question'] = list(self.read_corpus(filename['question'], stop_words=stop_words))
			self.corpus['answer'] = list(self.read_corpus(filename['answer'], stop_words=stop_words))

		if model_name:
			self.model = Doc2Vec.load(model_name)

		else:
			size = kwargs.get('size', 50)
			min_count  = kwargs.get('min_count', 5)
			alpha = kwargs.get('alpha', 0.025)
			min_alpha = kwargs.get('min_alpha', 0.025)
			iters = kwargs.get('iters', 10)

			self.train(size=size, min_count=min_count, alpha=alpha, min_alpha=min_alpha, iters=iters)
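# A plausible sketch of the read_corpus helper referenced in the docstring above
# (an assumption; in the original it is a method on the class): one TaggedDocument
# per line of the file, optionally dropping NLTK English stop words.
from gensim.models.doc2vec import TaggedDocument
from gensim.utils import simple_preprocess
from nltk.corpus import stopwords

def read_corpus(filename, stop_words=False):
    drop = set(stopwords.words('english')) if stop_words else set()
    with open(filename, encoding='utf-8') as fin:
        for i, line in enumerate(fin):
            tokens = [t for t in simple_preprocess(line) if t not in drop]
            yield TaggedDocument(words=tokens, tags=[i])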
Example #19
from flask import render_template
from flask import redirect
from flask import send_from_directory
from flaskexample import app
from flask import request
#some functions defined under doc2vec.py file
from flaskexample import doc2vec
import gensim
import os.path
import pandas as pd
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

mypath = os.path.abspath(os.path.dirname("data_clean.csv"))
path = os.path.join(mypath, "flaskexample/doc2vec_model")
model = Doc2Vec.load(path)


@app.route('/')
@app.route('/index')
def index():
    return render_template("index.html")


@app.route('/search_story')
def search_story():
    return render_template("search_story.html")


@app.route('/channel')
def channel():
    return redirect("https://www.youtube.com/channel/UCWENB1OaGA9402PKzEVl0ow")
Example #20
filename = "IR_training_dump.txt"


class LabeledLineSentence(object):
    def __init__(self, filename):
        self.filename = filename

    def __iter__(self):
        for line in open(self.filename, "r"):
            string = line.strip().split('\t')
            yield LabeledSentence(words=string[1].split(), tags=[string[0]])


#it = LabeledLineSentence(filename)

fname = 'my_model2HUGE.doc2vec'
model = Doc2Vec.load(fname)  # you can continue training with the loaded model!
cores = multiprocessing.cpu_count()

#model =Doc2Vec(dm=1, dm_mean=1, size=200, window=8, min_count=19, iter =10, workers=cores)
#model.build_vocab(it)

#model.train(it, total_examples=model.corpus_count, epochs=model.iter)#

#model.save('my_model2HUGE.doc2vec')

answer = model.docvecs.most_similar(positive=["775DE74B"], topn=30)
print(len(answer))
for i in answer:
    print(i[0])
def _ds2v_vector(requests, model_dir):
    model = Doc2Vec.load(model_dir)
    vectors = list(map(lambda x: model.infer_vector(x), requests))
    return vectors
Example #22
def sen2VecAvg(algo=5):
    print("Using Avg of sentence vectors")
    model = Doc2Vec.load('my_model_sens.doc2vec')
    postrfiles = glob.glob("../asgn2data/aclImdb/train/pos/*.txt")
    negtrfiles = glob.glob("../asgn2data/aclImdb/train/neg/*.txt")
    postsfiles = glob.glob("../asgn2data/aclImdb/test/pos/*.txt")
    negtsfiles = glob.glob("../asgn2data/aclImdb/test/neg/*.txt")

    x = np.zeros((25000, 100))
    xt = np.zeros((25000, 100))
    y = np.zeros(25000)
    yt = np.zeros(25000)

    i = 0
    for f in postrfiles:
        with open(f, 'r') as curfile:
            data = curfile.read().decode("utf-8")
            data = sent_normalize_text(data)
            sens = nltk.sent_tokenize(data)
            for j in range(len(sens)):
                x[i] += model[f + 'SENT_{}'.format(j)]
            x[i] = x[i] / len(sens)
            y[i] = 10
            i += 1

    for f in negtrfiles:
        with open(f, 'r') as curfile:
            data = curfile.read().decode("utf-8")
            data = sent_normalize_text(data)
            sens = nltk.sent_tokenize(data)
            for j in range(len(sens)):
                x[i] += model[f + 'SENT_{}'.format(j)]
            x[i] = x[i] / len(sens)
            y[i] = 0
            i += 1

    i = 0
    for f in postsfiles:
        with open(f, 'r') as curfile:
            data = curfile.read().decode("utf-8")
            data = sent_normalize_text(data)
            sens = nltk.sent_tokenize(data)
            for j in range(len(sens)):
                xt[i] += model[f + 'SENT_{}'.format(j)]
            xt[i] = xt[i] / len(sens)
            yt[i] = 10
            i += 1

    for f in negtsfiles:
        with open(f, 'r') as curfile:
            data = curfile.read().decode("utf-8")
            data = sent_normalize_text(data)
            sens = nltk.sent_tokenize(data)
            for j in range(len(sens)):
                xt[i] += model[f + 'SENT_{}'.format(j)]
            xt[i] = xt[i] / len(sens)
            yt[i] = 0
            i += 1

    combined = list(zip(x, y))
    random.shuffle(combined)
    x[:], y[:] = zip(*combined)

    return x, xt, y, yt
Example #23
def loadModel(filename="vec.model"):
    return Doc2Vec.load(filename)
Example #24
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(token_list)]

cores = multiprocessing.cpu_count()

model = Doc2Vec(dm=0,
                dbow_words=1,
                size=300,
                window=10,
                min_count=2,
                iter=10000,
                workers=cores)

fname = get_tmpfile("my_doc2vec_model")

try:
    model = Doc2Vec.load(fname)
except:
    model.build_vocab(documents)

    print("inicio do treino")
    model.train(documents,
                total_examples=model.corpus_count,
                epochs=model.iter)
    print("fim do treino")
    model.delete_temporary_training_data(keep_doctags_vectors=True,
                                         keep_inference=True)
    model.save(fname)

phrase = "Ciro"
tokens = nltk.word_tokenize(phrase)
Example #25
rects1 = plt.barh(index, compareMeanDf['Best Group'], bar_width, alpha=opacity, color='b', label='Best Group')
rects2 = plt.barh(index + bar_width, compareMeanDf['Worst Group'], bar_width, alpha=opacity, color='g', label='Worst Group')

plt.ylabel('Course Materials', fontsize=20)
plt.xlabel('Average number of activities', fontsize=20)
plt.title('')
plt.yticks(index + bar_width, compareMeanDf.Material, fontsize=18)
plt.xticks(fontsize=20)
plt.legend(fontsize=25)

#--------------------------------------------------------------
#======== Code analysis -------------------------
#------------------------------------------
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.metrics.pairwise import cosine_similarity
model= Doc2Vec.load(basePath + "ca116_2vecSize50.model")
a = model.docvecs[0]
a = np.array(list(a) + list(a))
taskList = dataUpload['task'].unique()
transitionDataMatrixWeeks[10].index[1]

def similarityBetweenTwoStudent(studentId1, studentId2, doc2vecModel, taskList):   
    vectorStudent1 = []
    vectorStudent2 = []
    for t in taskList:
        key1 = studentId1+'*'+t
        key2 = studentId2+'*'+t
        if (key1 in doc2vecModel.docvecs.index2entity) and (key2 in doc2vecModel.docvecs.index2entity):
            vectorStudent1 = vectorStudent1 + list(doc2vecModel.docvecs[key1])
            vectorStudent2 = vectorStudent2 + list(doc2vecModel.docvecs[key2])
    if len(vectorStudent1) > 0 and len(vectorStudent2) > 0:
Example #26
from gensim.utils import tokenize
from gensim import utils


class MyIter(object):
    path = ""

    def __init__(self, fp):
        self.path = fp

    def __iter__(self):
        # path = datapath(self.path)
        with utils.open(self.path, 'r', encoding='utf-8') as fin:
            for line in fin:
                yield list(tokenize(line))


dataset_path = r"data\dataset_lower_clean_stem_sentence.csv"
model_path = r"model\doc2vec100.bin"
corpus = MyIter(dataset_path)
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(corpus)]
d2v_model = Doc2Vec(vector_size=100, window=2, min_count=1, workers=4)
d2v_model.build_vocab(documents)
d2v_model.train(documents,
                total_words=d2v_model.corpus_count,
                epochs=d2v_model.epochs)
d2v_model.save(model_path)

model_path = r"model\doc2vec100.bin"
d2v_model = Doc2Vec.load(model_path)
print(d2v_model.wv.most_similar(["naskah", "dinas"]))
Example #27
def readModel(datapath, modelName):
    global model
    print('Loading model from', datapath + '/model/' + modelName)
    model = Doc2Vec.load(datapath + '/model/' + modelName)
Example #28
import hug
from gensim.models.doc2vec import Doc2Vec
from gensim.utils import simple_preprocess
import re

model = Doc2Vec.load('models/wiki-latest')


@hug.get('/topicos', examples='frase=Vingadores são um grupo de super-heróis')
@hug.local()
def topicos(frase: str):
    """Informa os topicos de uma frase qualquer"""
    tokens = simple_preprocess(frase)
    inferred_vector = model.infer_vector(tokens)
    similars = model.docvecs.most_similar([inferred_vector], topn=10)

    return {
        'topicos': similars
    }


@hug.get(examples="expressao=homem está para rei como mulher está para")
def analogia(expressao: str):
    """Calcula uma analogia entre termos"""

    entry = '{0}'.format(expressao)
    math_symbol = "\+"
    analogy_symbol = "está para"

    # Case 1: user wants to do word math: word1 - word2 + word3
    positive = []
Example #29
    words_list = sentence.split(' ')
    array = np.array([w2vModel[word] for word in words_list if word in w2vModel])
    df_SentenceVec = pd.Series(array.mean(axis=0))

    return df_SentenceVec

    

def train_D2V(d2vCorpus, embedSize=200, epoch_num=1):
    model_dm = Doc2Vec(d2vCorpus, min_count=1, window=3, size=embedSize, sample=1e-3, negative=5, workers=4)
    model_dm.train(d2vCorpus, total_examples=model_dm.corpus_count, epochs=epoch_num)
    model_dm.save("doc2vec.model")

    return model_dm
    
model_dm = Doc2Vec.load("doc2vec.model")


### Global vs. local variables | references and copies ###########################################
# Note: Python only distinguishes global from local variables inside functions
# and classes; it does not make the distinction inside if or for blocks.

## Global and local variables
b = [1,2,3]

def func():   # uses the global b for the computation, but does not change the outer b
    a = b + [4,5,6]
    return a  # result: a=[1,2,3,4,5,6], b=[1,2,3]

def func(b):   # just parameter passing; does not change the outer b
    b = [0]
    a = b + [4,5,6]
Example #30
    def train(self, pairs, labels, verbose=False, cache=None):
        """Train word2vec embeddings."""

        self.doc2vec = Doc2Vec.load(self.model_cache)
        super().train(pairs, labels, verbose, cache)
Example #31
    def load(self, cache):
        """Load trained model."""

        self.doc2vec = Doc2Vec.load(self.model_cache)
        super().load(cache)
 def get_model(cls):
     """Get the model object for this instance."""
     modelfile = glob.glob('/opt/ml/model/*.pkl')[0]
     return Doc2Vec.load(modelfile)#default model name of export.pkl
Example #33
df = df[df.Solution.notnull()]
df = df[df.Neutrality.notnull()]
df = df[df.Localization.notnull()]

df_x = df.loc[:, ['Comments']]

headers.remove('Comments')
headers = ["Mitigation"]

df_y = df.loc[:, headers]
df_y.head()
df_y[df_y != 0] = 1
df_y = df_y.round(0).astype(int)
df_y['new'] = 1 - df_y
#load model
model = Doc2Vec.load(os.path.join("trained", "comments2vec.d2v"))

comments = []
for index, row in df.iterrows():
    line = row["Comments"]
    line = re.sub("[^a-zA-Z?!]", " ", line)
    words = [
        w.lower().decode('utf-8') for w in line.strip().split() if len(w) >= 3
    ]
    comments.append(words)
x_train = []
for comment in comments:
    feature_vec = model.infer_vector(comment)
    #feature_vec = np.append(feature_vec,len(comment))
    x_train.append(feature_vec)
import os
import random
import logging
import pandas as pd
import numpy as np
import pickle as pk

from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk import tokenize
from zipfile import ZipFile

#setting the working directory
os.chdir('/home/jcai/geometry_of_law/')
#loading the model
model = Doc2Vec.load(
    '/home/jcai/geometry_of_law/doc2vec_v50k_d200_shuffled_opinion/ALL_opinion.d2v'
)

#calculate and export similarity to "regulation", "Privacy", "Labor"
list_of_issues = [
    "criminal-appeals", "civil-rights", "first-admendment", "due-process",
    "privacy", "labor", "regulation"
]

list_of_names = [
    "criminal appeals", "civil rights", "first admendment", "due process",
    "privacy", "labor", "regulation"
]

issue_dict = dict(zip(list_of_issues, list_of_names))
Example #35
def pre_train_embedding(embed_type, pre_embed_model, train, test):
    # the 'x' column holds the tokenized word lists
    # the 'y' column holds the target labels

    # corpus for CountVectorizer / tf-idf: words joined with spaces
    train_corpus1 = []
    # corpus for word2vec / doc2vec: words kept as comma-split lists
    train_corpus2 = []

    for words in train['x']:
        sentence1 = ""
        sentence2 = []

        for word in words.split(","):
            sentence1 += word + " "
            sentence2.append(word)

        sentence1 = sentence1[:len(sentence1) - 1]

        train_corpus1.append(sentence1)
        train_corpus2.append(sentence2)

    # corpus for CountVectorizer / tf-idf: words joined with spaces
    test_corpus1 = []
    # corpus for word2vec / doc2vec: words kept as comma-split lists
    test_corpus2 = []

    for words in test['x']:
        sentence1 = ""
        sentence2 = []

        for word in words.split(","):
            sentence1 += word + " "
            sentence2.append(word)

        sentence1 = sentence1[:len(sentence1) - 1]

        test_corpus1.append(sentence1)
        test_corpus2.append(sentence2)

    if embed_type == "CounterVector":

        start = time.time()

        count_vectorizer = load(
            open(
                "C:/Users/battl/PycharmProjects/cse_project/project list/Machine Learning Classification Model Visualization Web Service/embedding_model/"
                + pre_embed_model, "rb"))

        count_train_vectors = count_vectorizer.transform(train_corpus1)
        count_test_vectors = count_vectorizer.transform(test_corpus1)

        sparse_count_train_x = csr_matrix(count_train_vectors)
        sparse_count_test_x = csr_matrix(count_test_vectors)

        end = time.time()

        print('pre-train CounterVectorizer embedding time: {}'.format(end -
                                                                      start))

        return sparse_count_train_x, sparse_count_test_x, train[
            'y'].values, test['y'].values

    elif embed_type == "TF-IDF":

        start = time.time()

        tfidf_vectorizer = load(
            open(
                "C:/Users/battl/PycharmProjects/cse_project/project list/Machine Learning Classification Model Visualization Web Service/embedding_model/"
                + pre_embed_model, "rb"))

        tf_train_vectors = tfidf_vectorizer.transform(train_corpus1)
        tf_test_vectors = tfidf_vectorizer.transform(test_corpus1)

        sparse_tf_train_x = csr_matrix(tf_train_vectors)
        sparse_tf_test_x = csr_matrix(tf_test_vectors)

        end = time.time()

        print('pre-train TfidfVectorizer embedding time: {}'.format(end - start))

        return sparse_tf_train_x, sparse_tf_test_x, train['y'].values, test[
            'y'].values

    elif embed_type == "Doc2Vec":

        start = time.time()

        from collections import namedtuple
        TaggedDocument = namedtuple('TaggedDocument', 'words tags')

        doc2vec_train_tag = [
            TaggedDocument(doc, tag)
            for doc, tag in zip(train_corpus2, train['y'].values)
        ]
        doc2vec_test_tag = [
            TaggedDocument(doc, tag)
            for doc, tag in zip(test_corpus2, test['y'].values)
        ]

        from gensim.models.doc2vec import Doc2Vec

        doc_vectorizer = Doc2Vec.load(
            'C:/Users/battl/PycharmProjects/cse_project/project list/Machine Learning Classification Model Visualization Web Service/embedding_model/'
            + pre_embed_model)

        for epoch in range(10):
            doc_vectorizer.train(doc2vec_train_tag,
                                 total_examples=doc_vectorizer.corpus_count,
                                 epochs=10)
            doc_vectorizer.alpha -= 0.002  # decrease the learning rate
            doc_vectorizer.min_alpha = doc_vectorizer.alpha  # fix the learning rate, no decay

        doc_train_vectors = [
            doc_vectorizer.infer_vector(doc.words) for doc in doc2vec_train_tag
        ]
        doc_train_tags = [doc.tags for doc in doc2vec_train_tag]

        doc_test_vectors = [
            doc_vectorizer.infer_vector(doc.words) for doc in doc2vec_test_tag
        ]
        doc_test_tags = [doc.tags for doc in doc2vec_test_tag]

        import numpy as np

        doc_train_vectors_np = np.array(doc_train_vectors)
        doc_train_tags_np = np.array(doc_train_tags)

        doc_test_vectors_np = np.array(doc_test_vectors)
        doc_test_tags_np = np.array(doc_test_tags)

        sparse_doc_train_x = csr_matrix(doc_train_vectors_np)
        sparse_doc_test_x = csr_matrix(doc_test_vectors_np)

        end = time.time()

        print('pre-train Doc2Vec embedding time: {}'.format(end - start))

        return sparse_doc_train_x, sparse_doc_test_x, doc_train_tags_np, doc_test_tags_np

    elif embed_type == "user_defined_embedding":
        pass
Example #36
 def load_model(self):
     model = Doc2Vec.load(self.model_loc)
     self.model = model
Example #37
def most_similar(new_text):
    plt.style.use('ggplot')
    #Load the trained model
    model = Doc2Vec.load('doc2vec_abstracts')

    #Load the awards data
    awds = pd.read_csv('NSF CHE 2015.csv', encoding='latin-1')
    awds['StartDate'] = pd.to_datetime(
        awds['StartDate']).apply(lambda x: x.year)
    awds['EndDate'] = pd.to_datetime(awds['EndDate'])
    awds['AwardedAmountToDate'] = [
        x.replace('$', '') for x in awds['AwardedAmountToDate']
    ]
    awds['AwardedAmountToDate'] = [
        x.replace(',', '') for x in awds['AwardedAmountToDate']
    ]
    awds['AwardedAmountToDate'] = pd.to_numeric(awds['AwardedAmountToDate'])

    #Load the papers data sheet
    papers = pd.read_csv('che_paper_data.csv')
    papers['year'] = pd.to_datetime(papers['year'])
    papers['citations per year'] = papers['citations'].divide([
        ((datetime.datetime.today() - x).days) / 365.2422
        for x in papers['year']
    ])
    papers['year'] = papers['year'].apply(lambda x: x.year)

    #Here we build up and instantiate the stop words and lemmatizer
    stop = set(stopwords.words('english'))
    exclude = set(string.punctuation)
    lemma = WordNetLemmatizer()
    boiler_plate = "This award reflects NSF's statutory mission and has been deemed worthy of support through evaluation using the Foundation's intellectual merit and broader impacts review criteria"

    #The function below cleans and tokenizes the input text
    def word_mod(doc):
        doc = re.sub('<.*?>', ' ', doc)
        doc = re.sub(boiler_plate, '', doc)
        punct_free = ''.join(ch for ch in doc if ch not in exclude)
        words = punct_free.lower().split()
        stop_free = " ".join([i for i in words if i not in stop])
        lemm = " ".join(lemma.lemmatize(word) for word in stop_free.split())
        word_list = lemm.split()
        # only take words which are greater than 2 characters
        cleaned = [word for word in word_list if len(word) > 2]
        return cleaned

    #Here the cleaned-up text is fed to the model. The model returns the similarity of this text to all awards
    #We print out the two most similar award numbers
    new_text_clean = model.infer_vector(word_mod(new_text))
    sims = model.docvecs.most_similar([new_text_clean],
                                      topn=len(model.docvecs))
    sim1 = sims[0]
    sim2 = sims[1]
    print(
        'The most similar award numbers are {0} and {1}, with similarity scores of {2} and {3}.'
        .format(sim1[0], sim2[0], round(sim1[1], 3), round(sim2[1], 3)))

    #Here we examine the awards with similarity score greater than 0.5. It matches
    #with other awards made, the amount of the award, and the publication data
    #from each award.

    sims = [sims[i][0] for i in range(len(sims)) if sims[i][1] > 0.5]
    sim_awards = awds[awds['AwardNumber'].isin(sims)].copy()
    sim_papers = papers[papers['award number'].isin(sims)].copy()

    #Here plots for different data and metrics are generated.
    fig1 = plt.figure()
    sim_awards.groupby('StartDate')['AwardNumber'].count().plot.bar(rot=0)
    plt.title('Awards per Year Similar to Text')
    plt.ylabel('Number of Awards')
    plt.xlabel('Year of Award')
    plt.show()

    fig2 = plt.figure()
    sim_awards.groupby('StartDate')['AwardedAmountToDate'].sum().plot.bar(
        rot=0)
    plt.title('Total Awarded Dollars per Year for Awards Similar to Text')
    plt.ylabel('Total Dollars Awarded')
    plt.xlabel('Year of Award')
    plt.show()

    fig3 = plt.figure()
    sim_papers.groupby('year')['title'].count().plot.bar(rot=0)
    plt.title('Number of Publications Each Year from Awards Similar to Text')
    plt.ylabel('Number of Publications')
    plt.xlabel('Year of Publication')
    plt.show()

    fig4 = plt.figure()
    sim_papers.boxplot(column=['citations per year'], by='year')
    plt.title(
        'Citations per Year For \n Publications from Awards Similar to Text')
    plt.suptitle("")
    plt.ylabel('Citations per Year')
    plt.xlabel('Year of Publication')
    plt.show()
Example #38
import itertools

import numpy as np
import torch
from torch.utils.data import Dataset
import os
import json
from gensim.models.doc2vec import Doc2Vec

from gensim.test.utils import common_texts

from gensim.models.doc2vec import Doc2Vec, TaggedDocument

documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(common_texts)]
model = Doc2Vec.load('./model')
# model = Doc2Vec(documents, vector_size=100, window=2, min_count=1, workers=4)
# from gensim.test.utils import get_tmpfile
#
#
# fname = get_tmpfile("my_doc2vec_model")
#
# model.save(fname)
#
# model = Doc2Vec.load(fname)  # you can continue training with the loaded model!
# model.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)
# vector = model.infer_vector(["0.11", "0.31"])
# print(vector)


class IntegerSortDataset(Dataset):
    def __init__(self,
Example #39
        placeholder_model_name = 'doc2vec_size_{}_w_{}_type_{}_concat_{}_mean_{}_trainwords_{}_hs_{}_neg_{}_vocabsize_{}_model_{}'.format(
            DOC2VEC_SIZE, DOC2VEC_WINDOW,
            'dm' if DOC2VEC_TYPE == 1 else 'pv-dbow', DOC2VEC_CONCAT,
            DOC2VEC_MEAN, DOC2VEC_TRAIN_WORDS, DOC2VEC_HIERARCHICAL_SAMPLE,
            DOC2VEC_NEGATIVE_SAMPLE_SIZE, str(DOC2VEC_MAX_VOCAB_SIZE),
            str(part_level) + '_' + part_name)
        GLOBAL_VARS.DOC2VEC_MODEL_NAME = placeholder_model_name
        placeholder_model_name = os.path.join(placeholder_model_name,
                                              "epoch_{}")
        epoch = DOC2VEC_EPOCH
        GLOBAL_VARS.MODEL_NAME = placeholder_model_name.format(epoch)

        info("Loading Doc2vec model: {}".format(GLOBAL_VARS.MODEL_NAME))
        doc2vec_model = Doc2Vec.load(os.path.join(doc2vec_model_save_location,
                                                  GLOBAL_VARS.MODEL_NAME,
                                                  MODEL_PREFIX),
                                     mmap=DOC2VEC_MMAP)
        info("Loading Validation Dict")
        validation_dict = dict(
            pickle.load(
                gzip.open(
                    os.path.join(doc2vec_model_save_location,
                                 GLOBAL_VARS.MODEL_NAME,
                                 VALIDATION_DICT + GZIP_EXTENSION))))
        info("Loading Test Dict")
        test_dict = dict(
            pickle.load(
                gzip.open(
                    os.path.join(doc2vec_model_save_location,
                                 GLOBAL_VARS.MODEL_NAME,
                                 TEST_DICT + GZIP_EXTENSION))))
Example #40
    out_df = pd.DataFrame(static_dict, columns=static_dict.keys())
    out_df.to_csv('/users/votta/code/penn_apps/similarity.csv')
    out_df.to_parquet('/users/votta/code/penn_apps/similarity.parquet.gzip')


if __name__ == '__main__':
    """the rest of the model building happens here"""

    # scrape_all()
    df = pd.read_pickle("./test.pkl")
    docs = generate_tagged_docs(df)

    # model = Doc2Vec(docs, vector_size=30, window=5, min_count=2, workers=4)
    # model.save(D2V_MODEL_NAME)

    model = Doc2Vec.load(D2V_MODEL_NAME)
    # print_random_similarity(df, docs, model)

    generate_lookup_table(df, model)

    # Experiment with SEC data

    # # Initialize a downloader instance.
    # # If no argument is passed to the constructor, the package
    # # will attempt to locate the user's downloads folder.
    # dl = Downloader('/users/votta/code/penn_apps')
    # # Get all 8-K filings for Apple (ticker: AAPL)
    # dl.get_8k_filings("AAPL")
    # # Get all 8-K filings for Apple, including filing amends (8-K/A)
    # dl.get_8k_filings("AAPL", include_amends=True)
    # # Get all 8-K filings for Apple before March 25, 2017
def _d2v_vector(request, model_dir):
    model = Doc2Vec.load(model_dir)
    vector = model.infer_vector(request)
    return vector
Example #42
from pyspark import SparkContext
from pyspark.sql import SQLContext

from gensim.models.doc2vec import Doc2Vec

sc = SparkContext()
sqlContext = SQLContext(sc)

# this is a large object we cache it on each worker node
gmod_broadcast = sc.broadcast( Doc2Vec.load("/root/doc2vec/doc2vec_model/hn") ) 

df = sqlContext.read.load("hdfs:///hndata/parquet_typed", format="parquet")

ids = df.where("score IS NOT NULL") \
         .where("type='story'") \
         .where("title IS NOT NULL") \
         .map(lambda row: row.id)

def mergeVec(id):
    gmod = gmod_broadcast.value 
    vec = gmod.docvecs["TITLE_%d" % id]
    return (id, vec)
    
docvecs = ids.map(mergeVec) 
docvecs.saveAsPickleFile("hdfs:///hndata/docvecs_glove_pickle")
Example #43

if __name__ == '__main__':
    global model

    #----------- Parsing Arguments ---------------
    p = argparse.ArgumentParser()
    p.add_argument("--model", help="Path to the trained model")
    p.add_argument("--binary", help="Specifies the loaded model is binary")
    p.add_argument("--host", help="Host name (default: localhost)")
    p.add_argument("--port", help="Port (default: 5000)")
    p.add_argument("--path", help="Path (default: /word2vec)")
    args = p.parse_args()

    model_path = args.model if args.model else "./model.bin.gz"
    binary = True if args.binary else False
    host = args.host if args.host else "localhost"
    path = args.path if args.path else "/word2vec"
    port = int(args.port) if args.port else 5000
    if not args.model:
        print "Usage: word2vec-apy.py --model path/to/the/model [--host host --port 1234]"
    model = w.load(model_path)

    api.add_resource(N_Similarity, path + '/n_similarity')
    api.add_resource(Similarity, path + '/similarity')
    api.add_resource(MostSimilar, path + '/most_similar')
    api.add_resource(Model, path + '/model')
    api.add_resource(Infer, path + '/infer')
    api.add_resource(ModelWordSet, '/word2vec/model_word_set')
    app.run(host=host, port=port)
#!/usr/bin/python
# -*- coding: UTF-8 -*-
from gensim.models.doc2vec import Doc2Vec
import pykeyvi

docvecs_process_input_keyvi_index_file = "docvecs_urlid_url.kv"
output_data_path = "/raid/ankit/doc2vec/out_s_p_1M"
doc2vec_trained_model = 'pages_with_spaces.doc2vec'
_alpha, _min_alpha, _passes = (0.020, 0.001, 20)

print "Loading keyvi dictionaries ..."
keyvi_dict=pykeyvi.Dictionary("{}/{}".format(output_data_path, docvecs_process_input_keyvi_index_file))
print "Finished Loading key-vi Dictionary."

print "Loading Doc2Vec Model ... "
model = Doc2Vec.load("{}/{}".format(output_data_path, doc2vec_trained_model))
print "Model Loaded Successfully!"


def get_similar_urls(sample_query, nearest_num):
    tokens = sample_query.lower().split()
    dv = model.infer_vector(tokens, alpha=_alpha, min_alpha=_min_alpha, steps=_passes)     # note: may want to use many more steps than default
    sims = model.docvecs.most_similar(positive=[dv],  topn=nearest_num)
    for url_id, distance in sims:
        url = ""
        for m in keyvi_dict.Get(str(url_id)):
            url = m.GetValueAsString()
        print "{}\t{}\t{}".format(url_id, url, distance)

def main():
    print "\nSimilar URLS for Queries - Doc2Vec Retrieval Interface [All URL's]"
Example #45
    text = text.split()

    return text


r_data_loaded = random.sample(data_loaded, len(data_loaded))
r_samples = r_data_loaded[:100000]
df = pd.DataFrame(r_samples)
train_df = df
# Prepare embedding
vocabulary = dict()
inverse_vocabulary = [
    '<unk>'
]  # '<unk>' will never be used, it is only a placeholder for the [0, 0, ....0] embedding
word2vec_cfg = Doc2Vec.load(CFG_EMBEDDING_FILE)
word2vec = KeyedVectors.load(SOURCE_CODE_EMBEDDING_FILE)
# gensim.models.Word2Vec.load_word2vec_format('/data5/momo-projects/user_interest_classification/code/word2vec/vectors_groups_1105.bin', binary=True, unicode_errors='ignore')
code_clones_cols = ['code_clone1', 'code_clone2']

# Iterate over the questions only of both training and test datasets
for dataset in [train_df]:
    for index, row in dataset.iterrows():

        # Iterate through the text of both questions of the row
        for code_clone in code_clones_cols:

            q2n = []  # q2n -> question numbers representation
            for word in source_code_to_tokens(row[code_clone]):

                # Check for unwanted words
Example #46
 def load(cls, model_file='synset2vec'):
     model = Doc2Vec.load(model_file)
     return cls(model)
f = open('pckl_df_problemen_16.pkl', 'rb')
problemen = pickle.load(f)
f.close()

print('Loading LSTVRZ from pickle file . . .')
f = open('pckl_LSTVRZ_16.pkl', 'rb')
lstVRZ = pickle.load(f)
f.close()

print('Loading LSTVRZID from pickle file . . .')
f = open('pckl_LSTVRZID_16.pkl', 'rb')
lstVRZID = pickle.load(f)
f.close()

print('Loading the KOD model . . . ')
modelKOD = Doc2Vec.load("KOD DOC2VEC PROBLEMS 6 MAAND_16.model")

print('Loading the VRZ model . . . ')
modelVRZ = Doc2Vec.load("VRZ DOC2VEC PROBLEMS 6 MAAND_16.model")

# We now have the basis for the analysis.
# Here we assume that the incidents and problems that
# were supplied relate to the half-year period.
# The data sets now have the following shape:
#   incidents
#   1. Incident number
#   2. Short description (Details)
#   3. Request
#   4. LSTVRZ
#   5. VRZ
#   6. LSTKOD
Example #48
TEST_INPUT_DATA = 'test_input.npy'
DATA_CONFIGS = 'data_configs.json'
SEQ_CONFIGS = 'seq_configs_bt.json'

# Train label save file name
TRAIN_LABEL_DATA = 'train_label.npy'
TRAIN_LABEL_SMALL = 'train_label_small.npy'
TEST_LABEL_DATA = 'test_label.npy'
TEST_LABEL_SMALL = 'test_label_small.npy'

# pre-trained model load
d2v_model_name = './model_save/embedding_model/Doc2vec_new.model'
w2v_model_name = './model_save/embedding_model/Word2vec1.model'
pre_trained_name = './model_save/embedding_model/trained_word2vec1.model'

doc_vectorizer = Doc2Vec.load(d2v_model_name)
word_vectorizer = Word2Vec.load(w2v_model_name)
pre_trained_w2v = Word2Vec.load(pre_trained_name)

train_X = np.load(open(DATA_IN_PATH + TRAIN_INPUT_DATA, 'rb'))
test_X = np.load(open(DATA_IN_PATH + TEST_INPUT_DATA, 'rb'))

if label_size == 'big':
    train_Y = np.load(open(DATA_IN_PATH + TRAIN_LABEL_DATA, 'rb'))
    train_YS = tf.one_hot(train_Y, 43)
    test_Y = np.load(open(DATA_IN_PATH + TEST_LABEL_DATA, 'rb'))
    test_YS = tf.one_hot(test_Y, 43)
else:
    train_Y = np.load(open(DATA_IN_PATH + TRAIN_LABEL_SMALL, 'rb'))
    train_YS = tf.one_hot(train_Y, 455)
    test_Y = np.load(open(DATA_IN_PATH + TEST_LABEL_SMALL, 'rb'))
def get_n_closest(distance_mat, subject_id, n):
    """
    Return the n closest subject_ids
    """
    s = distance_mat.loc[subject_id]
    closest = s.sort_values()[1:1+n]
    return closest


if __name__ == '__main__':

    parser = argparse.ArgumentParser(description="produce similarity matrix")
    parser.add_argument('dbname', help="Database name")
    parser.add_argument('path_to_model', help="Model to test")
    parser.add_argument('n_closest', help="How many closest subjects to look into")
    args = parser.parse_args()

    model = Doc2Vec.load(args.path_to_model)

    subject_hash = get_subject_hash(args.dbname)
    subject_ids = list(subject_hash.keys())

    # loop over subjects and average docvecs belonging to subject.
    # place in dictionary
    subject_vectors = get_subject_vectors(subject_ids)
    distance_mat = get_distance_mat(subject_vectors)

    to_csv = []
    for subj_id in subject_ids:
        relateds = get_n_closest(distance_mat, subj_id, n=int(args.n_closest))
        for related_id, dist in relateds.iteritems():
            weight = round(1./dist)
            #weight = round((1-dist) * 10)
Example #50
            model_DM.train(training_doc)
            model_DBOW.train(training_doc)

        # Save the trained models:
        fout = 'DM.d2v'
        model_DM.save(most_recent + fout)
        model_DM.init_sims(replace=True)

        fout = 'DBOW.d2v'
        model_DBOW.init_sims(replace=True)
        model_DBOW.save(most_recent + fout)

    else:
        # Load Doc2Vec model from disk:
        fout = 'DM.d2v'
        model_DM = Doc2Vec.load(most_recent + fout)

        fout = 'DBOW.d2v'
        model_DBOW = Doc2Vec.load(most_recent + fout)


    # train the two different methods of the Doc2Vec algorithm:
    # NB DBOW is more similar to the recommended skip-gram of 
    # Word2Vec by the original paper's authors.  
 

    print('nonmatch', model_DM.doesnt_match("delay government flooding lightning".split()))

    print('nonmatch', model_DM.doesnt_match("euref voteout remain lightning".split()))
    print('euref sim by word', model_DM.similar_by_word('euref'))
    print('flood ', model_DM.similar_by_word('flood'))
    if deps_model_file != "":
        has_deps_embeddings = True
        logging.info("Loading dependency embeddings from %s" % deps_model_file)
        deps_model = Embeddings.load(deps_model_file+".npy", deps_model_file+".vocab")
        logging.info("Deps Model loaded!")

        #deps_vocabulary = deps_model._vocab
        #deps_embeddings = deps_model._vecs


    # Load Models here
    is_doc2vec_model = False
    # load word2vec word2vec_model
    if doc2vec_model_file != '':
        model = Doc2Vec.load(doc2vec_model_file)
        is_doc2vec_model = True
    else:
        if word2vec_load_bin:
            model = Word2Vec.load_word2vec_format(word2vec_model_file, binary=True)  # use this for google vectors
        else:
            model = Word2Vec.load(word2vec_model_file)

    use_id_for_vector = use_id_for_vector and is_doc2vec_model

    word2vec_num_features = len(model.syn0[0])
    logging.info("Embeddings feature vectors length:%s" % word2vec_num_features)
    logging.info("Model syn0 len=%d" % (len(model.syn0)))

    # define classes
    class_mapping = dict([(val, idx) for idx, val in enumerate(valid_senses)])
Example #52
        elif i >= 500: total_count_total+=1
posInfo.sort()
total_pos_tags = list(set([pos for sent in posInfo for pos in sent]))
print [pos+"."+str(i) for i,pos in enumerate(total_pos_tags)]
pos_tag_vector = []
for pos in total_pos_tags:
    pos_tag_vector.append([1 if p.count(pos)>0 else 0 for p in posInfo])
for i,pos in enumerate(total_pos_tags):
    dtm = base.cbind(dtm,pos=pos_tag_vector[i])
print vb_count,vb_count_total, total_count, total_count_total
# dtm_syntax = base.cbind(dtm,class_label=problem_class_labels)
# waikatoWriteArff(base.data_frame(dtm_syntax),file="problem_syntax.arff",class_col="class_label")

print "doc2vec"
doc2vecVectors=[]
doc2vecModel = Doc2Vec.load("/home/kh562/Corpora/MODELS/acl_sent_doc2vec.model")
for s in problem_strings+non_problem_strings:
    doc2vecVectors.append(doc2vecModel.infer_vector(s.split()))
for i in range(0,len(doc2vecVectors[0])):
    dtm = base.cbind(dtm,doc2vec=list(float(docVec[i]) for docVec in doc2vecVectors))
# dtm_doc2vec = base.cbind(dtm,class_label=problem_class_labels)
# waikatoWriteArff(base.data_frame(dtm_doc2vec),file="problem_doc2vec.arff",class_col="class_label")

print "word2vec"
word2vec_model = Word2Vec.load("/home/kh562/Corpora/MODELS/fuse_word2vec.model")
word2vec_vector = []
for [head,pos] in problem_heads+non_problem_heads:
    try:
        word2vec_vector.append(word2vec_model[head])
    except:
        word2vec_vector.append(np.array([0]*100,dtype=np.float32))
Example #53
# model.build_vocab(tuples_list)
#
# for epoch in range(20):
#     print('iteration {0}'.format(epoch))
#     model.train(tuples_list,
#                 total_examples=model.corpus_count,
#                 epochs=model.iter)
#     # decrease the learning rate
#     model.alpha -= 0.002
#     # fix the learning rate, no decay
#     model.min_alpha = model.alpha
#
# model.save("d2v.model")
# print("Model Saved")

model = Doc2Vec.load("d2v.model")

# Get sentences embeddings of train and test data

docs_sentence_embeddings = np.zeros((len(docs_list), 50))
for i in range(len(docs_sentence_embeddings)):
    docs_sentence_embeddings[i] = model.docvecs[str(i)]
print(docs_sentence_embeddings.shape)
#
x_train, x_test, y_train, y_test = train_test_split(docs_sentence_embeddings,
                                                    labels,
                                                    test_size=0.20,
                                                    random_state=1)
#
# Train model
Example #54
from gensim.models.doc2vec import Doc2Vec
from scipy.spatial import distance


def calculate_cosine_similarity(u, v):
    return 1 - distance.cosine(u, v)


def predict(model, item1, item2, threshold=0.5):
    vec1 = model.docvecs[item1]
    vec2 = model.docvecs[item2]
    return int(calculate_cosine_similarity(vec1, vec2) > threshold)


vector_size = '100-better-data-window-9'

model = Doc2Vec.load(f'models/doc2vec-{vector_size}.model')

threshold = 0.7

df_test = pd.read_csv('data/valid.csv')
df_test['temp_ltable_id'] = 'A_' + df_test['ltable_id'].astype(str)
df_test['temp_rtable_id'] = 'B_' + df_test['rtable_id'].astype(str)

df_test['label'] = df_test.apply(lambda row: predict(model, row[
    'temp_ltable_id'], row['temp_rtable_id'], threshold),
                                 axis=1)

df_test = df_test[['ltable_id', 'rtable_id', 'label']]
df_test.to_csv(f'results/submission-{vector_size}.csv', index=False)
Example #55
 def __init__(self):
     self.model=Doc2Vec.load('./model.d2v')
     self.st = LancasterStemmer()
Example #56
import os
import pandas as pd
import numpy as np
import sanalytics.algorithms.utils as sau
from gensim.models.doc2vec import Doc2Vec
from progressbar import progressbar
import re
import random
from glob import glob

print("READING D2V")

## Read D2V Model
d2v = Doc2Vec.load("datasets/rq3_d2v/sec1.0R100.model")
## Read D2V Model All
d2vall = Doc2Vec.load("datasets/rq3_d2v/sec1.0R100_all.model")

print("LOADED D2V")

## Read files
while True:
    files = list(os.walk("datasets/rq3_dataR100"))[0][2]
    filename = random.sample(files,1)[0]

    if filename in set([".".join(i.split(".")[:-1]) for i in list(os.walk("datasets/rq3_vecdata_newR100"))[0][2]]):
        continue

    print("start {}".format(filename))
    X = pd.read_parquet("datasets/rq3_dataR100/{}".format(filename))
    if "all" not in filename:
        X["d2v"] = [d2v.infer_vector("{} {} {}".format(i.title, i.question, i.answers).split()) for i in progressbar(X.itertuples())]
    if "all" in filename:
Example #57
import sys
import jieba
import gensim
import pandas as pd
from gensim.models.doc2vec import Doc2Vec
from Segmentation import *

def similarity(a_vect, b_vect):
    # compute the cosine of the angle between the two vectors
    dot_val = 0.0
    a_norm = 0.0
    b_norm = 0.0
    cos = None
    for a, b in zip(a_vect, b_vect):
        dot_val += a*b
        a_norm += a**2
        b_norm += b**2
    if a_norm == 0.0 or b_norm == 0.0:
        cos = -1
    else:
        cos = dot_val / ((a_norm*b_norm)**0.5)

    return cos
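# An equivalent, more compact cosine similarity using numpy (an alternative to the
# explicit loop above, keeping the same "-1 on zero-norm input" convention):
import numpy as np

def similarity_np(a_vect, b_vect):
    a = np.asarray(a_vect, dtype=float)
    b = np.asarray(b_vect, dtype=float)
    denom = np.linalg.norm(a) * np.linalg.norm(b)
    return -1 if denom == 0.0 else float(np.dot(a, b) / denom)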

model = Doc2Vec.load(sys.path[0]+'/model/modeltest',mmap='r')
# infer vectors for the example texts
#model.random.seed(0)
Vector1 = model.infer_vector(['新华社','报道','出现','偏差'],steps=500,alpha=0.025)
Vector2 = model.infer_vector(['新华社','的', '报道','出现','错误'],steps=500,alpha=0.025)
Vector3 = model.infer_vector(['今天','的','天气','非常','好'],steps=500,alpha=0.025)
print(similarity(Vector1,Vector2))
print(similarity(Vector1,Vector3))
def get_distances_subset(n_closest, category_hash_with_doc_ids, csv_path):
    # example
    # category_hash_with_doc_ids = {"cat1":["us-1", "us-2"], "cat2": ["us-3"]}
    # loop over subjects and average docvecs belonging to subject.
    # place in dictionary
    model = Doc2Vec.load('../doc2vec_model')
    cpc_vectors  = get_category_vectors_subset(model, category_hash_with_doc_ids)
    distance_mat = get_distance_mat(cpc_vectors)

    to_csv = []
    for subj_id in list(category_hash_with_doc_ids.keys()):
        relateds = get_n_closest(distance_mat, subj_id, n=n_closest)
        for related_id, dist in relateds.iteritems():
            weight = round(1./dist)
            #weight = round((1-dist) * 10)
            row = (subj_id, related_id, weight, subj_id, related_id)
            to_csv.append(row)

    edges = pd.DataFrame(to_csv, columns=['source', 'target', 'weight', 'source_name', 'target_name'])
    edges.to_csv(csv_path, index=False)

if __name__ == '__main__':
    print "main"
    model = Doc2Vec.load('../doc2vec_model')
    # db = MongoClient()
    # get_distances(db, model, int(sys.argv[1]))
    get_distances_subset(5, data, '../static/subject_distances1.csv')


Example #59
def mkExistingTrainedModel(path):
    return Doc2Vec.load(path)
Example #60
    def kaifang(self):
        sen = self.get_input()
        vec = self.sen2vec(self.model, sen)
        siml = self.sim(vec, self.num, self.ids)
        print(siml)
        siml_2 = []
        for sim in siml:
            siml_2.append(str(100/(100+sim[0]))+'\n'+self.ana2print(self.ids, sim[1]+1))
        return siml_2


if __name__ == '__main__':
    out_1 = open('output_1.txt', 'w', encoding='utf-8')
    out_2 = open('output_2.txt', 'w', encoding='utf-8')
    # load the trained doc2vec model
    mod = Doc2Vec.load('model_4.0.1.md')

    # read the input file and find the closest prescriptions
    # Disease(input file, doc2vec model, number of results to return)
    # returns a list of [id, similarity, description]
    zd = Disease('input.txt', mod, 5)
    for line in zd.output():
        print(line)
        out_1.write('Score: ' + str(line[1]) + '\n')
        out_1.write(line[2]+'\n')
        out_1.write('\n')

    # classify the input: NaiveBayesPredict(input file, pre-trained Naive Bayes model, i.e. the probability matrix)
    # returns the predicted class id (currently between 2 and 23)
    nbp = NaiveBayesPredict('input.txt', 'result.model')
    classify = int(nbp.predict())