#!/usr/bin/python3
# coding: utf-8
import os

from gensim.models.keyedvectors import KeyedVectors

# gensim does not expand '~' in paths, so expand the home directory explicitly
base = os.path.expanduser('~/datasets/WordVec/GoogleNews/GoogleNews-vectors-negative300')
model = KeyedVectors.load_word2vec_format(base + '.bin', binary=True)
model.save_word2vec_format(base, binary=False)
Example #2
    def ensemble_embedding(self, word_embedding, context_embedding):
        """Replace current syn0 with the sum of context and word embeddings.

        Parameters
        ----------
        word_embedding : str
            Path to word embeddings in GloVe format.
        context_embedding : str
            Path to context embeddings in word2vec_format.

        Returns
        -------
        numpy.ndarray
            Matrix with new embeddings.

        """
        glove2word2vec(context_embedding, context_embedding + '.w2vformat')
        w_emb = KeyedVectors.load_word2vec_format('%s.w2vformat' % word_embedding)
        c_emb = KeyedVectors.load_word2vec_format('%s.w2vformat' % context_embedding)
        # compare vocab words using keys of dict vocab
        assert set(w_emb.vocab) == set(c_emb.vocab), 'Vocabularies are not the same for both embeddings'

        # sort context embedding to have words in same order as word embedding
        prev_c_emb = copy.deepcopy(c_emb.syn0)
        for word_id, word in enumerate(w_emb.index2word):
            c_emb.syn0[word_id] = prev_c_emb[c_emb.vocab[word].index]
        # add vectors of the two embeddings
        new_emb = w_emb.syn0 + c_emb.syn0
        self.syn0 = new_emb
        return new_emb
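A standalone sketch of the same idea, for illustration only: it assumes both files are already in word2vec text format, share a vocabulary, and that the older (pre-4.0) gensim attributes syn0, vocab and index2word used above are available; the paths are placeholders.

import numpy as np
from gensim.models.keyedvectors import KeyedVectors

def sum_word_and_context_embeddings(word_path, context_path):
    # Load both embedding files (word2vec text format assumed).
    w_emb = KeyedVectors.load_word2vec_format(word_path)
    c_emb = KeyedVectors.load_word2vec_format(context_path)
    assert set(w_emb.vocab) == set(c_emb.vocab), 'vocabularies differ'
    # Re-order the context matrix so row i corresponds to w_emb.index2word[i],
    # then add the word and context matrices element-wise.
    reordered = np.vstack([c_emb.syn0[c_emb.vocab[word].index] for word in w_emb.index2word])
    return w_emb.syn0 + reordered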
Example #3
 def test_type_conversion(self):
     path = datapath('high_precision.kv.txt')
     binary_path = datapath('high_precision.kv.bin')
     model1 = KeyedVectors.load_word2vec_format(path, datatype=np.float16)
     model1.save_word2vec_format(binary_path, binary=True)
     model2 = KeyedVectors.load_word2vec_format(binary_path, datatype=np.float64, binary=True)
     self.assertAlmostEqual(model1["horse.n.01"][0], np.float16(model2["horse.n.01"][0]))
     self.assertEqual(model1["horse.n.01"][0].dtype, np.float16)
     self.assertEqual(model2["horse.n.01"][0].dtype, np.float64)
Example #4
 def load_w2v_model(self):
     # Load pre-trained word2vec model
     model_loc = os.path.join(os.getcwd(),get_str('devise', 'w2v_model_name'))
     word_vectors = KeyedVectors.load_word2vec_format(model_loc, binary=True)
     # Get dimensions of word vector
     word_dim = word_vectors['the'].shape[0]
     return word_vectors, word_dim
def load_GNews_model():
    """
    Convenience function for loading the pre-trained Google News word2vec model vectors 
    published with the original work. For more information see: 
    https://code.google.com/archive/p/word2vec/ 
    """
    model = KeyedVectors.load_word2vec_format('rdata/GoogleNews-vectors-negative300.bin', binary=True) 
    return model
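A minimal usage sketch, assuming the GoogleNews .bin file actually exists at the path above; the query words are placeholders.

model = load_GNews_model()
print(model.most_similar('king', topn=5))   # nearest neighbours of a single word
print(model.similarity('coffee', 'tea'))    # cosine similarity between two words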
Example #6
 def create_and_load_dic(self):
     model = KeyedVectors.load_word2vec_format(self.files_path + '.bin', binary=True)
     kmeans = cluster.KMeans(n_clusters=self.num_clusters)
     kmeans.fit(model.wv.vectors)
     self.w2v_dic = dict(zip(model.wv.index2word, zip(model.wv.vectors, kmeans.labels_)))
     with open(self.files_path + '.pkl', 'wb') as output:
         pickle.dump(self.w2v_dic, output)
Example #7
 def __init__(self, config=None):
     super().__init__()
     self.embedding_path = config.get("embedding_path")
     self.embedding_type = config.get("embedding_type")
     if self.embedding_path is None or self.embedding_path == "":
         raise ValueError("Embedding_path is expected.")
     is_binary = self.embedding_type == "bin"
     from gensim.models.keyedvectors import KeyedVectors
     self.embedding = KeyedVectors.load_word2vec_format(self.embedding_path, binary=is_binary)
Example #8
 def load_kv(filename=None, path=None, limit=None):
     if path is not None:
         return KeyedVectors.load_word2vec_format(
             path, binary=True, limit=limit)
     elif filename is not None:
         for dir_path in ASSET_SEARCH_DIRS:
             try:
                 path = os.path.join(dir_path, filename)
                 return KeyedVectors.load_word2vec_format(
                     path, binary=True, limit=limit)
             except FileNotFoundError:
                 continue
         raise FileNotFoundError(
             "Please make sure that 'filename' specifies the word vector "
             "binary name in one of the default search paths, or that "
             "'path' specifies the file path of the binary.")
     else:
         raise TypeError(
             "load_kv() requires either 'filename' or 'path' to be set.")
Example #9
    def load_word2vec_model_from_path(self):
        """
        Load Word2Vec model

        Returns:
            the Word2Vec model
        """
        word_embeddings_model = KeyedVectors.load_word2vec_format(
            self.word2vec_model_path, binary=True)
        if not word_embeddings_model:
            return None
        return word_embeddings_model
Example #10
 def _load_word2vec(path, limit=500000):
     """
     Init word2vec model
     :param path: path to the model
     :param limit: maximum number of word vectors to load (optional)
     :return: word2vec model
     """
     print('Loading the semantic word model...')
     w2v = KeyedVectors.load_word2vec_format(path, binary=True,
                                             unicode_errors='ignore', limit=limit)
     w2v.init_sims(replace=True)
     print('Loading finished')
     return w2v
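A short usage sketch; the path and query word are placeholders, and the helper is assumed to be callable as shown. Note that init_sims(replace=True) L2-normalizes the vectors in place, which saves memory for similarity queries but discards the raw vector magnitudes.

w2v = _load_word2vec('/path/to/model.bin', limit=200000)
print(w2v.most_similar('example', topn=3))  # placeholder query word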
Example #11
from gensim.models.keyedvectors import KeyedVectors
import logging
from scipy import stats
import numpy as np
from sklearn import metrics

file = input('The vector file:')
model = KeyedVectors.load_word2vec_format(file, binary=False)

# verbs: print the nearest neighbours of a few example words
for word in ['击败', '引用', '研究', '形成', '增加']:
    print('%s:' % word)
    print(model.most_similar(word))
    print('\n')
import numpy as np
import pickle
import gensim
import time
from gensim.models.keyedvectors import KeyedVectors

t0 = time.time()
path = "glove.6B.50d.txt.w2v"
glove = KeyedVectors.load_word2vec_format(path, binary=False)
t1 = time.time()
print("loaded word vectors in ", t1 - t0)


class Word_Association:
    """
    Capable of playing a simple word association game using word embeddings
    The computer starts by giving a random word,
    then the user gives a related word.
    This continues until the user gives a word that isn't related or has already been used,
    or until the computer can't come up with a related word that hasn't been used.
    """
    def __init__(self, seed=None, level=1):
        """
        Creates a game session.

        Parameters
        ----------
        seed: str
            the word on which the game will start (random if None)
        level: int
            the difficulty level (1 is recommended, higher is harder)
        """
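The snippet is cut off here. A hedged sketch, not the original author's code, of how a single turn of such a game might be validated using the glove vectors loaded above; the 0.4 relatedness threshold is an illustrative assumption.

def is_valid_reply(prev_word, reply, used_words, threshold=0.4):
    # Reject words that were already played or that the model does not know.
    if reply in used_words or reply not in glove:
        return False
    # Accept the reply only if it is sufficiently related to the previous word.
    return glove.similarity(prev_word, reply) >= threshold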
Example #13
def get_w2v():
    model = KeyedVectors.load_word2vec_format('word2vec.bin', binary=True)
    return model
Example #14
            y_true = sample['labels'].numpy()
            if task not in recalls:
                recalls[task] = 0.
            recalls[task] += get_recall(y_true, y)
            if task not in counts:
                counts[task] = 0
            counts[task] += 1
    recalls = {task: recall / counts[task] for task, recall in recalls.items()}
    if save_argmax:
        return argmaxes
    else:
        return recalls


print('Loading word vectors...')
we = KeyedVectors.load_word2vec_format(args.we_path, binary=True)
testset = CrossTask(
    data_path=args.data_path,
    features_path=args.features_path,
    features_path_3D=args.features_path_3D,
    we=we,
    feature_framerate=args.feature_framerate,
    feature_framerate_3D=args.feature_framerate_3D,
    we_dim=args.we_dim,
    max_words=args.max_words,
)
testloader = DataLoader(
    testset,
    batch_size=1,
    num_workers=args.num_thread_reader,
    shuffle=False,
Example #15
from gensim.models import Word2Vec
from gensim.models.keyedvectors import KeyedVectors
from flask import Flask
import urllib
import numpy as np
import json
from elasticsearch import Elasticsearch

app = Flask(__name__)
#b = Word2Vec.load('tmp/brown.bin')
#g = Word2Vec.load('tmp/GoogleNews-vectors-negative300.bin')
g = KeyedVectors.load_word2vec_format('tmp/GoogleNews-vectors-negative300.bin',
                                      binary=True)
log = []


@app.route("/")
def hello():
    return "OK"


def ConvertVectorSetToVecAverageBased(vectorSet, ignore=[]):
    if len(ignore) == 0:
        return np.mean(vectorSet, axis=0)
    else:
        return np.dot(np.transpose(vectorSet), ignore) / sum(ignore)


def phrase_similarity(_phrase_1, _phrase_2):
    phrase_1 = _phrase_1.split(" ")
    phrase_2 = _phrase_2.split(" ")
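    # [Hedged continuation -- not part of the original snippet] The example is
    # truncated here; one plausible way to finish it is to average each phrase's
    # in-vocabulary vectors and compare the two averages by cosine similarity.
    vectors_1 = [g[w] for w in phrase_1 if w in g]
    vectors_2 = [g[w] for w in phrase_2 if w in g]
    if not vectors_1 or not vectors_2:
        return 0.0
    v1 = ConvertVectorSetToVecAverageBased(np.array(vectors_1))
    v2 = ConvertVectorSetToVecAverageBased(np.array(vectors_2))
    return float(np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2)))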
Example #16
def main(schema_path, word_embeddings_path, output_path,
         header=True, zero_vector=False):
    """The logic of the script."""
    if header:
        print "Skipping header row."
    else:
        print "No header row."

    # Initialize the gensim model.
    print "Loading word vectors. This may take a moment."
    start = time.time()
    gensim_model = KeyedVectors.load_word2vec_format(word_embeddings_path, binary=False)
    print "Model loaded in %0.3f seconds." % (time.time() - start)
    print "Word vectors loaded."
    try:
        _ = gensim_model["unk"]
    except KeyError:
        print "unk not in model"

    # We'll probably have to look up the same table and field names many times;
    # may as well cache them.
    cached_vectors = {}

    # Read in the schema CSV.
    all_vectors = []
    with open(schema_path, 'r') as csvfile:
        reader = csv.reader(csvfile)
        for row in reader:
            if header:
                header = False
                continue
            if not len(row) == NUM_FIELDS:
                print "Expected %d fields, but found %d; skipping row." % (NUM_FIELDS, len(row))
                print "\t" + ", ".join(row)
                continue
            # Get word vectors for table names and field names
            from_table_name = row[0].strip()
            from_field_name = row[1].strip()
            to_table_name = row[2].strip()
            to_field_name = row[3].strip()
            vectors = []
            for name in [from_table_name, to_table_name,
                         from_field_name, to_field_name]:
                if name not in cached_vectors:
                    cached_vectors[name] = get_word_vector(name, gensim_model)
                current_vector = cached_vectors[name]
                vectors.append(current_vector)

            # Concatenate the four vectors
            entire_vector = np.concatenate(vectors)
            all_vectors.append(entire_vector)
    if zero_vector:
        print "Appending zero vector"
        zeros = np.zeros_like(all_vectors[0])
        all_vectors.append(zeros)

    matrix = np.stack(all_vectors)

    print "schema_map shape: %s" % str(matrix.shape)

    # Save matrix
    np.save(output_path, matrix)
Example #17
from gensim.models.keyedvectors import KeyedVectors
from gensim.test.utils import datapath
from pathlib import Path

model = KeyedVectors.load_word2vec_format(Path.cwd() / 'model_test.txt',
                                          binary=False)

print('女生')
print(model.most_similar(positive='女生'))
print('----------------')
print('中国')
print(model.most_similar(positive='中国'))
# project = "db4o"
# cs_packages = ["Db4objects.","Db4oUnit"]
# java_packages = ["db4o."]

usage_type = "method"

with open(URL,"r") as f:
	data = f.readlines()

keys = list()
for line in data:
	line = line.strip()
	splits = line.split("-")
	keys.append(splits[0])

cs_vectors = KeyedVectors.load_word2vec_format("./bi2vec_vectors/cs_vectors_global_local.txt",binary=False)
java_vectors = KeyedVectors.load_word2vec_format("./bi2vec_vectors/java_vectors_global_local.txt",binary=False)

for key in keys:
	try:
		vector = java_vectors[key]
		k_nearest= cs_vectors.similar_by_vector(vector, topn=50)
		relevant_k = list()

		for k in k_nearest:
			if check_if_token_is_method_signature(k[0]) == True:

				# if check_package_include(java_packages,k[0]) == True:
				relevant_k.append(k[0])

		if len(relevant_k) != 0:
Example #19
                                                    labels,
                                                    test_size=0.3,
                                                    random_state=42)
classes = sorted(list(set(y_train)))
print(classes)
print('train', len(x_train))
print('test', len(x_test))
del labels
del features

neigh = KNeighborsClassifier(n_neighbors=50)
neigh.fit(x_train, y_train)

print('loading model')
# load google pretrained word2vec model
w2v_model = KeyedVectors.load_word2vec_format(
    './word2vec_model/GoogleNews-vectors-negative300.bin', binary=True)

from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords
import nltk.data
import operator, functools

# Load the punkt tokenizer
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
# English stop words
stops = set(stopwords.words("english"))


def get_deep_features(text):
    # Remove HTML
Example #20
def exampleTest():
	#set up corpus from classifier data file
	known=set()
	corpus=pd.read_csv(os.path.join("data","classifier_data.csv"))
	corpus=corpus.fillna('') # replace all NaN cells from the spreadsheet with empty strings
	for i in range(len(corpus)):
		questions=corpus.iloc[i]['question'].split('\r\n')
		answer=corpus.iloc[i]['text']
		for question in questions:
			tokens=tokenize(question)
			known.update(tokens)
	answer_tokens=tokenize(answer)
	known.update(answer_tokens)
	# corpus_model=Word2Vec([known], min_count=1)
	# print(str(corpus_model.most_similar(positive=['navy'],topn=5)))
	# word_vectors = corpus_model.wv
	# print(str(word_vectors.most_similar(positive=['navy'],topn=5)))


	google_model=KeyedVectors.load_word2vec_format(os.path.join('..','GoogleNews-vectors-negative300.bin'), binary=True)
	allWords=google_model.index2word
	modelData={w: google_model[w] for w in allWords}
	origModel=DictVectorModel(modelData)

	
	#google_model=KeyedVectors.load_word2vec_format(os.path.join('..','GoogleNews-vectors-negative300.bin'), binary=True)
	vocab_obj = google_model.vocab["word"]
	print("count of word in keyed: " + str(vocab_obj.count))
	print("index of word in keyed: " + str(vocab_obj.index))
	allWords=google_model.index2word
	print("First word: " + str(allWords[1]))
	#totalCount = len(allWords)*(len(allWords)-1)/2.0
	totalCount = len(allWords)
	print("total words from totalCount: " + str(totalCount))
	#print(str(allWords))



	#create a pandas DataFrame with columns labeled
	# list_ = []

	# limit_array_length = 0
	# for w in google_model.index2word:
	# 	if limit_array_length <= 1000:
	# 		list_.append(w)
	# 		limit_array_length += 1
	# 	#print("w in google model " + str(w))
	# frame = pd.DataFrame(np.reshape(np.array(list_),-1), columns=["Word"])
	# frame = frame.drop_duplicates(['Word'],keep="first")
	# frame.to_csv("../first_words.csv", index=None)


	limit_array_length = 0

	freqs={w: zipfWordFrequency(google_model.vocab[w],totalCount) for w in allWords}
	modelData={w: google_model[w] for w in allWords}
	origModel=DictVectorModel(modelData)
	model=origModel
	thesaurus={}
	#print(model._modelDict['computer'])
	#v1=model._modelDict['the'] #for later comparison with reduced dict

	# Config Values
	maxDim = 4
	''' FREQUENCY for word counts with Google news vector(3 million words)
	calculated using FrequencyMetrics.py
	25000 words: 2.6820216255e-06
	50000 words: 1.34101081275e-06
	100000 words: 6.70505406374e-07
	150000 words: 4.4700360425e-07
	200000 words: 3.35252703187e-07
	250000 words: 2.6820216255e-07
	500000 words: 1.34101081275e-07
	750000 words: 8.94007208499e-08
	1000000 words: 6.70505406374e-08
	1250000 words: 5.364043251e-08
	1500000 words: 4.4700360425e-08
	1750000 words: 3.831459465e-08
	2000000 words: 3.35252703187e-08
	2250000 words: 2.98002402833e-08
	2500000 words: 2.6820216255e-08
	2750000 words: 2.43820147773e-08
	3000000 words: 2.23501802125e-08
	'''
	#250000 words by freq thresh
	freqThresh = 2.6820216255e-07
	#just below 300000 words by rel thresh
	knownThresh = 1 #currently using cosine, avg relevance with dot is ??? 2.27510558315
	#clumpThresh = 0.99 #not yet implemented

	print("Initialized")
	start=time.time()
	filterReducer = VectorModelReducer(model, freqDict=freqs, knownWords=known)
	print("Filtering...")

	filteredModel = filterReducer.filterModel(freqThresh, knownThresh)
	#reduceDimModel = filterReducer.reduceDimModel(filterModel)
	end=time.time()
	elapsed=end-start
	print("Time to filter is "+str(elapsed))
	#print(filteredModel._modelDict['computer'])

	#v2=filteredModel._modelDict['the']
	#norm = float([w**2 for w in v1])**0.5 + float([w**2 for w in v2])**0.5
	#similarity = 1 - scipy.spatial.distance.cosine(v1, v2)
	#print("Similarity is "+str(similarity))

	if not os.path.exists('vector_models'):
		os.mkdir('vector_models')
	model_file='vector_models'+os.sep+'model_'+str(freqThresh)+'_'+str(knownThresh)+'.pkl'

		
	start=time.time()
	print("Dumping...")
	pickle_dump(filteredModel._modelDict, model_file)
	end=time.time()
	elapsed=end-start
	print("Time to dump is " + str(elapsed))
	print("Finished filtering, length of new model: " + str(filteredModel.getLength()))

	#with open(model_file, 'wb') as pickle_file:
	#	pickle.dump(filteredModel._modelDict, pickle_file)
	#Code for all combinations of freqThresh and knownThresh
	# for freq_thresh in freqThresh:
	#     for known_thresh in knownThresh:
	#         filteredModel = filterReducer.filterModel(freqThresh, knownThresh)
	#         model = filteredModel
	#         model_file='vector_models'+os.sep+'model_'+str(freq_thresh)+'_'+str(known_thresh)+'.json'
	#         with open(model_file, 'w') as json_file:
	#             json.dump(model, json_file)
	#clumpReducer = VectorModelReducer(model)
	#clumpedModel, thesaurus = clumpReducer.clumpModel(clumpThresh)
	#model = clumpedModel
	return model
Example #21
# Train the Doc2Vec model
model.train(train_data, total_examples=model.corpus_count, epochs=model.epochs)

#%%

# Google word2vec

import gensim
import gensim.downloader as api
path = api.load("word2vec-google-news-300", return_path=True)
print(path)

from gensim.models.keyedvectors import KeyedVectors
gensim_model = KeyedVectors.load_word2vec_format(path,
                                                 binary=True,
                                                 limit=300000)

gensim_model['regression']

gensim_model.most_similar(positive=['statistics', 'diagnostics', 'outlier'])

gensim_model.most_similar(positive=['Tea', 'United_States'],
                          negative=['England'])

gensim_model.most_similar(positive=['statistics', 'diagnostics', 'outlier'])

#%%

# Wikipedia2vec - Studio Ousia
Example #22
testlabel = []
labelfile = open('testlabel.txt')
for l in labelfile:
    line = l.strip('\n')
    if line in emo:
        testlabel.append(emo.index(line))
    else:
        testlabel.append(7)
labelfile.close()
'''
traintexts = [[word for word in document.split() if word not in stop_words]
              for document in traindata]
testtexts = [[word for word in document.split() if word not in stop_words]
             for document in testdata]

word_vectors = KeyedVectors.load_word2vec_format(
    'zhwiki_2017_03.sg_50d.word2vec', binary=False)

gensim_dict = Dictionary()
gensim_dict.doc2bow(word_vectors.vocab.keys(), allow_update=True)
w2indx = {v: k + 1 for k, v in gensim_dict.items()}  # word index, numbered starting from 1
w2vec = {word: word_vectors[word] for word in w2indx.keys()}
trainseq = text_to_index_array(w2indx, traintexts)
testseq = text_to_index_array(w2indx, testtexts)

traindata = pad_sequences(trainseq, maxlen=MAX_SEQUENCE_LENGTH)
testdata = pad_sequences(testseq, maxlen=MAX_SEQUENCE_LENGTH)
word_index = w2indx
print('Found %s unique tokens.' % len(word_index))
labels = to_categorical(np.asarray(label))
#testlabels = to_categorical(np.asarray(testlabel))
indices = np.arange(traindata.shape[0])
Example #23
def test5():
    from gensim.models import KeyedVectors
    word_vectors = KeyedVectors.load_word2vec_format(file_to_path,
                                                     binary=False)
Example #24
        mean2 = mean2 + s(y, A)  # distance of all y from A in set 1 vs. set 2
    mean2 = mean2 / float(len(Y))

    sd = []
    for w in X + Y:
        sd.append(s(w, A))
    return (mean1 - mean2) / np.std(sd)


global wv
global wv2

base = 'vecs/pk_vectors.bin'
countries = glob.glob("vecs/*")

wv = KeyedVectors.load_word2vec_format(base, binary=True)

print "PK VS THE WORLD"

for country in countries:

    print "~~~~~~~~~~~~~~ " + country.split("/")[1] + " ~~~~~~~~~~~~~~"

    wv2 = KeyedVectors.load_word2vec_format(country, binary=True)

    woman = [
        "she", "woman", "female", "her", "hers", "girl", "daughter", "mother",
        "sister", "aunt"
    ]
    man = [
        "he", "man", "male", "his", "him", "boy", "son", "brother", "father",
with open('Final_Labeled_DataCollection-NTI_Tweets.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    for tweet in Tweets:
        if len(analyze(tweet[2])) >= 4 and tweet[0] != '-1':
            writer.writerow(tweet)
'''

with open('Final_Labeled_DataCollection-NTI_Tweets.csv', 'r', newline='', encoding='utf-8') as f:
    reader = csv.reader(f)
    NTI_Tweets_1 = []
    NTI_Whole_Tweets = []
    for row in reader:
        NTI_Whole_Tweets.append(row)
        NTI_Tweets_1.append(row[2])

Google_Word2Vec = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True, encoding='latin-1')

vectorizer = CountVectorizer(min_df=1, stop_words='english', ngram_range=(1, 1), analyzer=u'word')
analyze = vectorizer.build_analyzer()
miss_words = []
Total_Tweet_Vec_List = []
for t in NTI_Tweets_1:
    Name = analyze(t)
    Tweet_Vec_List = []
    for word in Name:
        try:
            wordVec = Google_Word2Vec[word]
            Tweet_Vec_List.append(word)
        except KeyError:
            miss_words.append(word)
    Total_Tweet_Vec_List.append(Tweet_Vec_List)
Example #26
def downloadGlove(file = "C:/glove/glove.6B.50d.txt.w2v"):
	gant = KeyedVectors.load_word2vec_format(file, binary=False)
	return gant
Example #27
Copyright (C) - All Rights Reserved
"""

import os

from gensim.models.keyedvectors import KeyedVectors

from examples.example_corpus import corpus
from nlpkit.nlp_feature_extraction import WordEmbedsDocVectorizer

# Replace with the path of a pre-trained w2v model
WORD_EMBEDDINGS_FILEPATH = os.path.abspath(
    os.path.join(os.path.dirname(__file__), '..', '..', 'fake-news', 'data',
                 'resources', 'word-embeddings', 'glove.6B',
                 'glove.6B.300d.word2vec.txt'))

if __name__ == "__main__":
    word2vec = KeyedVectors.load_word2vec_format(WORD_EMBEDDINGS_FILEPATH,
                                                 binary=False)

    w2v_vectorizer = WordEmbedsDocVectorizer(word2vec, tfidf_weights=True)
    X = w2v_vectorizer.fit_transform(corpus)

    print(X)

    # Example usage in a pipeline
    # pipeline = Pipeline([
    #     ('vec', WordEmbedsDocVectorizer(word2vec, tfidf_weights=True)),
    #     ('clf', SVC(kernel='linear', C=1, probability=True))
    # ])
Example #28
import codecs
from sklearn.manifold import TSNE
from gensim.models.keyedvectors import KeyedVectors
import numpy as np
import csv
from sklearn.metrics.pairwise import cosine_similarity
from util import vector_averaging
from util import vector_averaging_with_tfidf
from util import process_source_code
from util import word2weight
import sys

DIMENSION = 25
csv.field_size_limit(sys.maxsize)
PROJECT = "cordova"
cs_vectors = KeyedVectors.load_word2vec_format("./bi2vec_vectors/cs_vectors_10_25_include_functions.txt",binary=False)
java_vectors = KeyedVectors.load_word2vec_format("./bi2vec_vectors/java_vectors_10_25_include_functions.txt",binary=False)

with open("./sentences/sentences_cs_10.txt","r") as cs_f:
	cs_data = cs_f.readlines()
with open("./sentences/sentences_java_10.txt","r") as java_f:
	java_data = java_f.readlines()

cs_sentences = [x for x in cs_data]
java_sentences = [x for x in java_data]

cs_word2weight = word2weight(cs_sentences)
java_word2weight = word2weight(java_sentences)
# print cs_word2weight
# Predicting part ----------------------------------------------
# print(cosine_similarity(cs_vectors["while"].reshape(1,-1),java_vectors["class"].reshape(1,-1))
Example #29
from argparse import ArgumentParser
from gensim.models.keyedvectors import KeyedVectors
import torch
from tqdm import tqdm
if __name__ == '__main__':
    parser = ArgumentParser(description='Convert binary word2vec to txt')
    parser.add_argument('input')
    parser.add_argument('output')
    args = parser.parse_args()
    model = KeyedVectors.load_word2vec_format(args.input, binary=True)
    model.save_word2vec_format(args.output, binary=False)
Example #30
p1 = int(len(data)*(1-VALIDATION_SPLIT-TEST_SPLIT))
p2 = int(len(data)*(1-TEST_SPLIT))
x_train = data[:p1]
y_train = labels[:p1]
x_val = data[p1:p2]
y_val = labels[p1:p2]
x_test = data[p2:]
y_test = labels[p2:]
print ('train docs: '+str(len(x_train)))
print ('val docs: '+str(len(x_val)))
print ('test docs: '+str(len(x_test)))

###################################################################
print ('(4) load word2vec as embedding...')
from gensim.models.keyedvectors import KeyedVectors
w2v_model0 = KeyedVectors.load_word2vec_format('uniqueWords.vector', encoding='utf-8')
# w2v_model1 = KeyedVectors.load_word2vec_format('medCorpus_meanVec.zh.vector', encoding='utf-8')
embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
not_in_model = 0
in_model = 0
#the origin word vector
for word, i in word_index.items():
    if word in w2v_model0:
        in_model += 1
        vec0 = w2v_model0[word].tolist()
        # vec1 = w2v_model1[word].tolist()
        embedding_matrix[i] = np.asarray(vec0, dtype='float32')
    else:
        not_in_model += 1

print (str(not_in_model)+' words not in w2v model')
1. Uses the most recent publication of each researcher as input to generate user profiles.
2. The pretrained word2vec model window_5.model.bin and candidate_paper.csv are available via a Google Drive link;
   download the files and change the paths in this script so that it runs successfully.
3. Results are saved in rank_result_rm/rank_result_mr_own_corpus.csv.
"""

import sys
from gensim.models.keyedvectors import KeyedVectors
import numpy as np
import pandas as pd
from datetime import datetime

# load pre-train model on my own corpus
model = '/Users/sherry/Downloads/window_5/window_5.model.bin'
w2v_model = KeyedVectors.load_word2vec_format(model, binary=True)

# read all candidate papers info, contain two columns: paper ID and paper content
candidate_paper_df = pd.read_csv(
    '/Users/sherry/Downloads/candidate_papers.csv')


# define DocSim class to calculate document similarities
class DocSim(object):
    def __init__(self, w2v_model, stopwords=[]):
        self.w2v_model = w2v_model
        self.stopwords = stopwords

    def vectorize(self, doc):
        """Identify the vector values for each word in the given document"""
        doc = str(doc)
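        # [Hedged continuation -- not part of the original snippet] The method is
        # truncated here; a common way to finish it is to average the vectors of
        # the in-vocabulary, non-stopword tokens into a single document vector.
        words = [w for w in doc.lower().split() if w not in self.stopwords]
        vectors = [self.w2v_model[w] for w in words if w in self.w2v_model]
        if not vectors:
            return np.zeros(self.w2v_model.vector_size)
        return np.mean(vectors, axis=0)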
import math
from gensim.models.keyedvectors import KeyedVectors
import pkuseg

model = KeyedVectors.load_word2vec_format(
    'GoogleNews-vectors-negative300.bin', binary=True, limit=300000)

seg = pkuseg.pkuseg()
sentence_1 = ''
sentence_2 = ''

text1 = seg.cut(sentence_1)  # segment sentence 1 into words
text2 = seg.cut(sentence_2)


stopwords = [line.strip() for line in open('Englishstopwords.txt', encoding='UTF-8').readlines()]
clean1= list()
clean2= list()
for word in sentence_1.split(" "):
    if word not in stopwords:
        clean1.append(word)


for word in sentence_2.split(" "):
    if word not in stopwords:
        clean2.append(word)

vec1=list()
vec2=list()
sum=0
sq1=0
Example #33
def load_gensim_model(path_to_model):
    model =  KeyedVectors.load_word2vec_format(path_to_model, binary=True)
    return model
Example #34
from gensim.models.keyedvectors import KeyedVectors
from konlpy.tag import Twitter
import numpy as np

pos_vectors = KeyedVectors.load_word2vec_format('pos.vec', binary=False)
pos_vectors.most_similar("('남자','Noun')")
twitter = Twitter()
word = "대통령이"
pos_list = twitter.pos(word, norm=True)
word_vector = np.sum([pos_vectors.word_vec(str(pos).replace(" ", "")) for pos in pos_list], axis=0)
Example #35
import csv
from sklearn.metrics.pairwise import cosine_similarity
from util import vector_averaging
from util import vector_averaging_with_tfidf
from util import process_source_code
from util import process_diff_srcml
from util import process_diff_srcml2
from util import word2weight
from util import process_expression
from util import mean_average_precision
from util import average_precision
from util import precision_at_k
import sys

DIMENSION = 20
cs_vectors = KeyedVectors.load_word2vec_format("./bi2vec_vectors/cs_vectors_11_20.txt",binary=False)
java_vectors = KeyedVectors.load_word2vec_format("./bi2vec_vectors/java_vectors_11_20.txt",binary=False)

with open("./sentences/sentences_cs_11.txt","r") as cs_f:
	cs_data = cs_f.readlines()
with open("./sentences/sentences_java_11.txt","r") as java_f:
	java_data = java_f.readlines()

cs_sentences = [x for x in cs_data]
java_sentences = [x for x in java_data]

cs_word2weight = word2weight(cs_sentences)
java_word2weight = word2weight(java_sentences)


with codecs.open("./evaluation_data/keywords.csv","r") as f_csv:
    )  #list9+list10+list11+list12 , list99+list1010+list1111+list1212
#test_data, test_label = preprocessing(list1+list2+list3+list4), enumerate_list(list111+list222+list333+list444) #list1+list2+list3+list4 , list11+list22+list33+list44
#note: full test dataset is too huge-> memory error, so use part of it
test_data_list = pd.read_csv(
    os.getcwd() + '\\data\\EI-reg-En-part-test.csv')['Tweet'].tolist()
test_label_list = pd.read_csv(
    os.getcwd() +
    '\\data\\EI-reg-En-part-test.csv')['Affect Dimension'].tolist()
test_data, test_label = preprocessing(test_data_list), test_label_list

print("Train shape:", len(train_data), len(train_label))
print("Validation shape:", len(dev_data), len(dev_label))
print("Test shape:", len(test_data), len(test_label))

# Loading all models
glove_model = KeyedVectors.load_word2vec_format(
    'word2vec.twitter.27B.100d.txt', binary=False)  # load Glove model
w2v_model = Word2Vec.load('w2v_model.bin')  # load word2vec model
e2v_model = gsm.KeyedVectors.load_word2vec_format(
    'emoji2vec.bin', binary=True)  # load emoji2vec model
print("All Models Loaded!")

# word embedding data with glove pretrained model and real word2vec/w2v
input_data = np.concatenate((train_data, dev_data, test_data))
max_sequence_length = max([len(x) for x in input_data
                           ])  # find the length of the longest tweet
print("Max tweet length:", max_sequence_length)
print("input_data shape:", len(input_data))


# Find embedding for corpus
def embedding(data, max_len):
		# if "System." in split[0] or "antlr" in split[0].lower():
			cs_signature_tokens.append(split[0])

print "cs tokens : " + str(len(cs_signature_tokens))

for java_emb in java_embeddings:
	split = java_emb.split(" ")
	if func(split[0]) == True:

		if check_package_include(java_packages,split[0]) == True:
		# if "java." in split[0] or "antlr" in split[0].lower():
			java_signature_tokens.append(split[0])

print "java tokens : " + str(len(java_signature_tokens))
print "Loading word embedding..........."
cs_vectors = KeyedVectors.load_word2vec_format("./bi2vec_vectors/cs_vectors_new_window3.txt",binary=False)
java_vectors = KeyedVectors.load_word2vec_format("./bi2vec_vectors/java_vectors_new_window3.txt",binary=False)

print "Finish loading.............."
# print cs_vectors.similar_by_vector(java_vectors["java.util.concurrent.locks.Lock.lock()"], topn=30)
# print cs_vectors.similar_by_vector(java_vectors["package"], topn=30)


def check_if_relevant_k_contains_exact_name(method_source, relevant_k):
	
	
	check = False
	
	for k in relevant_k:
	
		split = k.split(".")
    print indexi
    start = time.time()

    num_features = 200  # Word vector dimensionality
    min_word_count = 20  # Minimum word count
    num_workers = 40  # Number of threads to run in parallel
    context = 10  # Context window size
    downsampling = 1e-3  # Downsample setting for frequent words

    model_name = str(num_features) + "features_" + str(min_word_count) + "minwords_" + str(
        context) + "context_len2alldata"
    # Load the trained Word2Vec model.
    model_name = 'wordvectors_reuters_nonpoly.txt'
    #model = Word2Vec.load(model_name)#.syn0
        # Get wordvectors for all words in vocabulary.
    model = KeyedVectors.load_word2vec_format(model_name, binary=False)
    word_vectors = model.syn0
    all = pd.read_pickle('all.pkl')
    start1 = time.time()
    start = time.time()
    # Set number of clusters.
    num_clusters = 60
    idx, idx_proba = cluster_GMM(num_clusters, word_vectors)
    idx_proba[idx_proba < 0.2] = 0
    n_clusteri = num_clusters
    f = open(filename, 'a')
    print "number of k clusters ", str(n_clusteri)
    f.write("number of k clusters " + str(n_clusteri) + "\n")
    start2 = time.time()
    f.write("time taken in clustering " + str(start2 - start1) + "\n")
    # Uncomment the lines below to load saved cluster assignments and the probability of cluster assignments.
Example #39
        h1 = F.dropout(h1,0.3)
        h2 = F.dropout(h2, 0.3)
        h1 = F.relu(h1)
        h2 = F.relu(h2)
        h = F.concat([h1, h2])
        out = self.hy(h)
        #out = self.bn3(out)
        #out = F.dropout(out,0.3)
        out = F.tanh(out)
        out = F.normalize(out)

        return F.mean_squared_error(out,t)


if __name__ == '__main__':
    dic = KeyedVectors.load_word2vec_format("trainer/glove.6B.100d.bin")
    original_word = 'does'
    glove_vec = dic.get_vector(original_word).reshape(1, 100)


    first_net = First_Network(100,30,len(original_word))
    first_net.cleargrads()
    optimizer1 = optimizers.Adam()
    optimizer1.setup(first_net)
    second_net = Second_Network(27,30, 50)
    second_net.cleargrads()
    second_net.reset()
    optimizer2 = optimizers.Adam()
    optimizer2.setup(second_net)
    second_net2 = Second_Network(27, 30, 50)
    second_net2.cleargrads()
Example #40
odd = '/home/dpappas/'
###########################################################
w2v_bin_path = '/home/dpappas/bioasq_all/pubmed2018_w2v_30D.bin'
idf_pickle_path = '/home/dpappas/bioasq_all/idf.pkl'
###########################################################
avgdl, mean, deviation = 21.688767020746013, 0.7375801616020308, 1.3411418040865049
print(avgdl, mean, deviation)
###########################################################
k_for_maxpool = 5
k_sent_maxpool = 5
embedding_dim = 30  #200
###########################################################
print('loading idfs')
idf, max_idf = load_idfs(idf_pickle_path)
print('loading w2v')
wv = KeyedVectors.load_word2vec_format(w2v_bin_path, binary=True)
wv = dict([(word, wv[word]) for word in wv.vocab.keys()])
###########################################################
my_seed = 1
random.seed(my_seed)
torch.manual_seed(my_seed)
###########################################################
print('Compiling model...')
model = Sent_Posit_Drmm_Modeler(embedding_dim=embedding_dim,
                                k_for_maxpool=k_for_maxpool)
if (use_cuda):
    model = model.cuda()

###########################################################
resume_from = '/home/dpappas/bioasq_w2vjpdrmm_demo_run_0/best_dev_checkpoint.pth.tar'
load_model_from_checkpoint(resume_from)
Example #41
File: L2.py  Project: lanweicai/NLP
if __name__ == '__main__':
    """
    @description: 训练词向量,保存、加载模型,并计算词的相似度
    word2vec保存和加载模型方法(save方式保存的模型可继续训练,save_word2vec_format速度快内存小):
    1.model.save('w2v.model')->model = Word2Vec.load('w2v.model')
    2.model.wv.save_word2vec_format('w2v.bin')
    ->KeyedVectors.load_word2vec_format('w2v.bin')
    """
    train_segx_path = '../datasets/train_set.seg_x.txt'
    train_segy_path = '../datasets/train_set.seg_y.txt'
    test_segx_path = '../datasets/test_set.seg_x.txt'
    sentences_path = '../datasets/sentences.txt'
    w2v_bin_path = 'w2v.bin'
    ft_bin_path = 'ft.bin'
    voacb_path = '../datasets/voacb.txt'
    lines = read_data(train_segx_path)
    lines += read_data(train_segy_path)
    lines += read_data(test_segx_path)
    save_data(lines, sentences_path)
    word2vec_build(sentences_path, w2v_bin_path)
    fasttext_build(sentences_path, ft_bin_path)

    w2v_model = KeyedVectors.load_word2vec_format(w2v_bin_path, binary=True)
    model_test(w2v_model, '汽车', '车')

    ft_model = KeyedVectors.load_word2vec_format(ft_bin_path, binary=True)
    model_test(ft_model, '汽车', '车')

    mt = embedding_matrix(w2v_model, voacb_path)
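For contrast with the save_word2vec_format route used above, a minimal sketch of the first save/load option mentioned in the docstring; the file names and the extra corpus are placeholders.

from gensim.models import Word2Vec

model = Word2Vec.load('w2v.model')          # a model saved with model.save() ...
more_sentences = [['example', 'sentence']]  # placeholder corpus
model.train(more_sentences, total_examples=len(more_sentences), epochs=model.epochs)  # ... can be trained further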
Example #42
	def load_w2v_data(self, binary_file_name):
		self.w2v_model = KeyedVectors.load_word2vec_format(
			os.path.join(DATA_PATH, binary_file_name), binary=True)
Example #43
validation = pd.read_csv("preprocess/validation_char.csv")
validation["content"] = validation.apply(lambda x: eval(x[1]), axis=1)

model_dir = "model_capsule_char/"
maxlen = 1000
max_features = 20000
batch_size = 128
epochs = 1
tokenizer = text.Tokenizer(num_words=None)
tokenizer.fit_on_texts(data["content"].values)
with open('tokenizer_char.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

word_index = tokenizer.word_index
w2_model = KeyedVectors.load_word2vec_format("word2vec/chars.vector",
                                             binary=True,
                                             encoding='utf8',
                                             unicode_errors='ignore')
embeddings_index = {}
embeddings_matrix = np.zeros((len(word_index) + 1, w2_model.vector_size))
word2idx = {"_PAD": 0}
vocab_list = [(k, w2_model.wv[k]) for k, v in w2_model.wv.vocab.items()]

for word, i in word_index.items():
    if word in w2_model:
        embedding_vector = w2_model[word]
    else:
        embedding_vector = None
    if embedding_vector is not None:
        embeddings_matrix[i] = embedding_vector

column_list = [
Example #44
    def add_embedding(self, embedding_file):

        self.embedding_files.append(KeyedVectors.load_word2vec_format(embedding_file, binary=self.binary))
#import gensim

import tensorflow as tf
from keras.backend.tensorflow_backend import set_session
config = tf.ConfigProto()
config.gpu_options.per_process_gpu_memory_fraction = 0.7
set_session(tf.Session(config=config))

# basic model for tag recommendation
# Embedding layer -> BiLSTM -> Dense with softmax

# word model
#embeddings_file_bin = '../glove/vectors.bin'
#word_model = KeyedVectors.load_word2vec_format('../glove/vectors.txt', binary=False, unicode_errors='ignore')
word_model = KeyedVectors.load_word2vec_format('word2vec/vec_Body_Title.bin',
                                               binary=True,
                                               unicode_errors='ignore')
#meta_model = KeyedVectors.load_word2vec_format('metapath2vec/code_metapath2vec/stack_new_1000', binary=True, unicode_errors='ignore')
user_id = pickle.load(open("user.p", 'rb'))
user_tag = pickle.load(open("user_tags.p", 'rb'))
user_num = pickle.load(open("user_num.p", 'rb'))

count = len(user_tag)

meta_model = {}
openfile = open("graph_train.emd", 'r')
for line in openfile:
    arr = line.split()
    meta_model[arr[0]] = arr[1:]

#print meta_model['0']
Example #46
import csv
from sklearn.metrics.pairwise import cosine_similarity
from util import vector_averaging
from util import vector_averaging_with_tfidf
from util import process_source_code
from util import process_diff_srcml
from util import process_diff_srcml2
from util import word2weight
from util import process_expression
from util import mean_average_precision
from util import average_precision
from util import precision_at_k
import sys

DIMENSION = 20
cs_vectors = KeyedVectors.load_word2vec_format(
    "./bi2vec_vectors/cs_vectors_11_20.txt", binary=False)
java_vectors = KeyedVectors.load_word2vec_format(
    "./bi2vec_vectors/java_vectors_11_20.txt", binary=False)

with open("./sentences/sentences_cs_11.txt", "r") as cs_f:
    cs_data = cs_f.readlines()
with open("./sentences/sentences_java_11.txt", "r") as java_f:
    java_data = java_f.readlines()

cs_sentences = [x for x in cs_data]
java_sentences = [x for x in java_data]

cs_word2weight = word2weight(cs_sentences)
java_word2weight = word2weight(java_sentences)

with codecs.open("./evaluation_data/keywords.csv", "r") as f_csv:
Example #47
 def setUp(self):
     self.vectors = EuclideanKeyedVectors.load_word2vec_format(
         datapath('euclidean_vectors.bin'), binary=True, datatype=np.float64)
Example #48
File: embprox.py  Project: GateNLP/emina
 def load_embeddings(self, embeddingsfilename):
     self.emb = KeyedVectors.load_word2vec_format(embeddingsfilename, binary=False)
Example #49
 def load_model(self, datatype):
     path = datapath('high_precision.kv.txt')
     kv = KeyedVectors.load_word2vec_format(path, binary=False,
                                            datatype=datatype)
     return kv