#!/usr/bin/python3
# coding: utf-8

from gensim.models.keyedvectors import KeyedVectors

model = KeyedVectors.load_word2vec_format('~/datasets/WordVec/GoogleNews/GoogleNews-vectors-negative300.bin', binary=True)
model.save_word2vec_format('~/datasets/WordVec/GoogleNews/GoogleNews-vectors-negative300', binary=False)
def ensemble_embedding(self, word_embedding, context_embedding):
    """Replace current syn0 with the sum of context and word embeddings.

    Parameters
    ----------
    word_embedding : str
        Path to word embeddings in GloVe format.
    context_embedding : str
        Path to context embeddings in word2vec_format.

    Returns
    -------
    numpy.ndarray
        Matrix with new embeddings.

    """
    glove2word2vec(context_embedding, context_embedding + '.w2vformat')
    w_emb = KeyedVectors.load_word2vec_format('%s.w2vformat' % word_embedding)
    c_emb = KeyedVectors.load_word2vec_format('%s.w2vformat' % context_embedding)
    # compare vocab words using keys of dict vocab
    assert set(w_emb.vocab) == set(c_emb.vocab), 'Vocabs are not same for both embeddings'

    # sort context embedding to have words in same order as word embedding
    prev_c_emb = copy.deepcopy(c_emb.syn0)
    for word_id, word in enumerate(w_emb.index2word):
        c_emb.syn0[word_id] = prev_c_emb[c_emb.vocab[word].index]
    # add vectors of the two embeddings
    new_emb = w_emb.syn0 + c_emb.syn0
    self.syn0 = new_emb
    return new_emb
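# Usage sketch for ensemble_embedding() above (the method appears to come from a
# WordRank-style wrapper). Note that it loads '<word_embedding>.w2vformat' directly,
# so that file is assumed to have been converted by the caller already; `wr_model`
# and both file paths below are placeholder names, not part of the original source.
combined = wr_model.ensemble_embedding('wordrank.words', 'wordrank.contexts')
print(combined.shape)  # one row per vocabulary word, one column per vector dimension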
def test_type_conversion(self):
    path = datapath('high_precision.kv.txt')
    binary_path = datapath('high_precision.kv.bin')
    model1 = KeyedVectors.load_word2vec_format(path, datatype=np.float16)
    model1.save_word2vec_format(binary_path, binary=True)
    model2 = KeyedVectors.load_word2vec_format(binary_path, datatype=np.float64, binary=True)
    self.assertAlmostEqual(model1["horse.n.01"][0], np.float16(model2["horse.n.01"][0]))
    self.assertEqual(model1["horse.n.01"][0].dtype, np.float16)
    self.assertEqual(model2["horse.n.01"][0].dtype, np.float64)
def load_w2v_model(self):
    # Load pre-trained word2vec model
    model_loc = os.path.join(os.getcwd(), get_str('devise', 'w2v_model_name'))
    word_vectors = KeyedVectors.load_word2vec_format(model_loc, binary=True)
    # Get dimensions of word vector
    word_dim = word_vectors['the'].shape[0]
    return word_vectors, word_dim
def load_GNews_model():
    """
    Convenience function for loading the pre-trained Google News word2vec
    model vectors published with the original work.

    For more information see: https://code.google.com/archive/p/word2vec/
    """
    model = KeyedVectors.load_word2vec_format('rdata/GoogleNews-vectors-negative300.bin', binary=True)
    return model
def create_and_load_dic(self):
    model = KeyedVectors.load_word2vec_format(self.files_path + '.bin', binary=True)
    kmeans = cluster.KMeans(n_clusters=self.num_clusters)
    kmeans.fit(model.wv.vectors)
    self.w2v_dic = dict(zip(model.wv.index2word, zip(model.wv.vectors, kmeans.labels_)))
    output = open(self.files_path + '.pkl', 'wb')
    pickle.dump(self.w2v_dic, output)
    output.close()
def __init__(self, config=None):
    super().__init__()
    self.embedding_path = config.get("embedding_path")
    self.embedding_type = config.get("embedding_type")
    if self.embedding_path is None or self.embedding_path == "":
        raise ValueError("Embedding_path is expected.")
    is_binary = True if self.embedding_type == "bin" else False
    from gensim.models.keyedvectors import KeyedVectors
    self.embedding = KeyedVectors.load_word2vec_format(self.embedding_path, binary=is_binary)
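# Usage sketch for the config-driven loader above; the class name EmbeddingLoader
# and both config values are illustrative assumptions, not from the original module.
loader = EmbeddingLoader(config={
    "embedding_path": "GoogleNews-vectors-negative300.bin",  # example path
    "embedding_type": "bin",  # anything other than "bin" is loaded as text format
})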
def load_kv(filename=None, path=None, limit=None):
    if path is not None:
        return KeyedVectors.load_word2vec_format(
            path, binary=True, limit=limit)
    elif filename is not None:
        for dir_path in ASSET_SEARCH_DIRS:
            try:
                path = os.path.join(dir_path, filename)
                return KeyedVectors.load_word2vec_format(
                    path, binary=True, limit=limit)
            except FileNotFoundError:
                continue
        raise FileNotFoundError(
            "Please make sure that 'filename' specifies the word vector "
            "binary name in the default search paths, or that 'path' "
            "specifies the file path of the binary.")
    else:
        raise TypeError(
            "load_kv() requires either 'filename' or 'path' to be set.")
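# Usage sketch for load_kv(), assuming ASSET_SEARCH_DIRS already lists the
# directories to probe; both file names below are illustrative, not part of the
# original module.
vectors = load_kv(filename="GoogleNews-vectors-negative300.bin", limit=200000)
# or bypass the search path entirely with an absolute location:
vectors = load_kv(path="/data/embeddings/custom-vectors.bin")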
def load_word2vec_model_from_path(self):
    """
    Load Word2Vec model

    Returns: the Word2Vec model
    """
    word_embeddings_model = KeyedVectors.load_word2vec_format(
        self.word2vec_model_path, binary=True)
    if not word_embeddings_model:
        return None
    return word_embeddings_model
def _load_word2vec(path, limit=500000):
    """
    Init word2vec model
    :param path: path to the model
    :param limit: optional
    :return: word2vec model
    """
    print('Loading the semantic word model...')
    w2v = KeyedVectors.load_word2vec_format(path, binary=True, unicode_errors='ignore', limit=limit)
    w2v.init_sims(replace=True)
    print('Loading finished')
    return w2v
from gensim.models.keyedvectors import KeyedVectors
import logging
from scipy import stats
import numpy as np
from sklearn import metrics

file = input('The vector file:')
model = KeyedVectors.load_word2vec_format(file, binary=False)

# verbs
similar = model.most_similar('击败')
print('击败:')
print(similar)
print('\n')

similar = model.most_similar('引用')
print('引用:')
print(similar)
print('\n')

similar = model.most_similar('研究')
print('研究:')
print(similar)
print('\n')

similar = model.most_similar('形成')
print('形成:')
print(similar)
print('\n')

similar = model.most_similar('增加')
print('增加:')
print(similar)
print('\n')
import numpy as np
import pickle
import gensim
import time
from gensim.models.keyedvectors import KeyedVectors

t0 = time.time()
path = "glove.6B.50d.txt.w2v"
glove = KeyedVectors.load_word2vec_format(path, binary=False)
t1 = time.time()
print("loaded word vectors in ", t1 - t0)


class Word_Association:
    """
    Capable of playing a simple word association game using word embeddings.

    The computer starts by giving a random word, then the user gives a related
    word. This continues until the user gives a word that isn't related or has
    been used already, or the computer can't come up with a related word that
    hasn't been used.
    """

    def __init__(self, seed=None, level=1):
        """
        Creates a game session.

        Parameters
        ----------
        seed: str
            the word on which the game will start (random if None)
        level: int
            the difficulty level (1 is recommended, higher is harder)
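# A minimal sketch of one computer turn for the game described in the class
# docstring above, using the `glove` vectors loaded earlier. The helper name
# pick_reply and the 0.5 relatedness threshold are illustrative assumptions,
# not taken from the original class.
def pick_reply(previous_word, used_words, threshold=0.5):
    """Return a related, unused word, or None if nothing clears the threshold."""
    for candidate, score in glove.most_similar(previous_word, topn=20):
        if score >= threshold and candidate.lower() not in used_words:
            return candidate
    return None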
def get_w2v():
    model = KeyedVectors.load_word2vec_format('word2vec.bin', binary=True)
    return model
        y_true = sample['labels'].numpy()
        if task not in recalls:
            recalls[task] = 0.
        recalls[task] += get_recall(y_true, y)
        if task not in counts:
            counts[task] = 0
        counts[task] += 1
    recalls = {task: recall / counts[task] for task, recall in recalls.items()}
    if save_argmax:
        return argmaxes
    else:
        return recalls


print('Loading word vectors...')
we = KeyedVectors.load_word2vec_format(args.we_path, binary=1)
testset = CrossTask(
    data_path=args.data_path,
    features_path=args.features_path,
    features_path_3D=args.features_path_3D,
    we=we,
    feature_framerate=args.feature_framerate,
    feature_framerate_3D=args.feature_framerate_3D,
    we_dim=args.we_dim,
    max_words=args.max_words,
)
testloader = DataLoader(
    testset,
    batch_size=1,
    num_workers=args.num_thread_reader,
    shuffle=False,
from gensim.models import Word2Vec
from gensim.models.keyedvectors import KeyedVectors
from flask import Flask
import urllib
import numpy as np
import json
from elasticsearch import Elasticsearch

app = Flask(__name__)

# b = Word2Vec.load('tmp/brown.bin')
# g = Word2Vec.load('tmp/GoogleNews-vectors-negative300.bin')
g = KeyedVectors.load_word2vec_format('tmp/GoogleNews-vectors-negative300.bin', binary=True)
log = []


@app.route("/")
def hello():
    return "OK"


def ConvertVectorSetToVecAverageBased(vectorSet, ignore=[]):
    if len(ignore) == 0:
        return np.mean(vectorSet, axis=0)
    else:
        return np.dot(np.transpose(vectorSet), ignore) / sum(ignore)


def phrase_similarity(_phrase_1, _phrase_2):
    phrase_1 = _phrase_1.split(" ")
    phrase_2 = _phrase_2.split(" ")
def main(schema_path, word_embeddings_path, output_path, header=True, zero_vector=False):
    """The logic of the script."""
    if header:
        print "Skipping header row."
    else:
        print "No header row."

    # Initialize the gensim model.
    print "Loading word vectors. This may take a moment."
    start = time.time()
    gensim_model = KeyedVectors.load_word2vec_format(word_embeddings_path, binary=False)
    print "Model loaded in %0.3f seconds." % (time.time() - start)
    print "Word vectors loaded."
    try:
        _ = gensim_model["unk"]
    except:
        print "unk not in model"

    # We'll probably have to look up the same table and field names many times;
    # may as well cache them.
    cached_vectors = {}

    # Read in the schema CSV.
    all_vectors = []
    with open(schema_path, 'r') as csvfile:
        reader = csv.reader(csvfile)
        for row in reader:
            if header:
                header = False
                continue
            if not len(row) == NUM_FIELDS:
                print "Expected %d fields, but found %d; skipping row." % (NUM_FIELDS, len(row))
                print "\t" + ", ".join(row)
                continue
            # Get word vectors for table names and field names
            from_table_name = row[0].strip()
            from_field_name = row[1].strip()
            to_table_name = row[2].strip()
            to_field_name = row[3].strip()
            vectors = []
            for name in [from_table_name, to_table_name, from_field_name, to_field_name]:
                if name not in cached_vectors:
                    cached_vectors[name] = get_word_vector(name, gensim_model)
                current_vector = cached_vectors[name]
                vectors.append(current_vector)
            # Concatenate the four vectors
            entire_vector = np.concatenate(vectors)
            all_vectors.append(entire_vector)

    if zero_vector:
        print "Appending zero vector"
        zeros = np.zeros_like(all_vectors[0])
        all_vectors.append(zeros)

    matrix = np.stack(all_vectors)
    print "schema_map shape: %s" % str(matrix.shape)
    # Save matrix
    np.save(output_path, matrix)
from gensim.models.keyedvectors import KeyedVectors
from gensim.test.utils import datapath
from pathlib import Path

model = KeyedVectors.load_word2vec_format(Path.cwd() / 'model_test.txt', binary=False)
print('女生')
print(model.most_similar(positive='女生'))
print('----------------')
print('中国')
print(model.most_similar(positive='中国'))
# project = "db4o"
# cs_packages = ["Db4objects.", "Db4oUnit"]
# java_packages = ["db4o."]

usage_type = "method"

with open(URL, "r") as f:
    data = f.readlines()

keys = list()
for line in data:
    line = line.strip()
    splits = line.split("-")
    keys.append(splits[0])

cs_vectors = KeyedVectors.load_word2vec_format("./bi2vec_vectors/cs_vectors_global_local.txt", binary=False)
java_vectors = KeyedVectors.load_word2vec_format("./bi2vec_vectors/java_vectors_global_local.txt", binary=False)

for key in keys:
    try:
        vector = java_vectors[key]
        k_nearest = cs_vectors.similar_by_vector(vector, topn=50)
        relevant_k = list()
        for k in k_nearest:
            if check_if_token_is_method_signature(k[0]) == True:
                # if check_package_include(java_packages, k[0]) == True:
                relevant_k.append(k[0])
        if len(relevant_k) != 0:
    labels, test_size=0.3, random_state=42)
classes = sorted(list(set(y_train)))
print(classes)
print('train', len(x_train))
print('test', len(x_test))

del labels
del features

neigh = KNeighborsClassifier(n_neighbors=50)
neigh.fit(x_train, y_train)

print('loading model')
# load google pretrained word2vec model
w2v_model = KeyedVectors.load_word2vec_format(
    './word2vec_model/GoogleNews-vectors-negative300.bin', binary=True)

from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords
import nltk.data
import operator, functools

# Load the punkt tokenizer
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
# English stop words
stops = set(stopwords.words("english"))


def get_deep_features(text):
    # Remove HTML
def exampleTest():
    # set up corpus from classifier data file
    known = set()
    corpus = pd.read_csv(os.path.join("data", "classifier_data.csv"))
    corpus = corpus.fillna('')  # eliminates all NaN boxes from spreadsheet
    for i in range(len(corpus)):
        questions = corpus.iloc[i]['question'].split('\r\n')
        answer = corpus.iloc[i]['text']
        for question in questions:
            tokens = tokenize(question)
            known.update(tokens)
        answer_tokens = tokenize(answer)
        known.update(answer_tokens)

    # corpus_model = Word2Vec([known], min_count=1)
    # print(str(corpus_model.most_similar(positive=['navy'], topn=5)))
    # word_vectors = corpus_model.wv
    # print(str(word_vectors.most_similar(positive=['navy'], topn=5)))

    google_model = KeyedVectors.load_word2vec_format(os.path.join('..', 'GoogleNews-vectors-negative300.bin'), binary=True)
    allWords = google_model.index2word
    modelData = {w: google_model[w] for w in allWords}
    origModel = DictVectorModel(modelData)
    # google_model = KeyedVectors.load_word2vec_format(os.path.join('..', 'GoogleNews-vectors-negative300.bin'), binary=True)

    vocab_obj = google_model.vocab["word"]
    print("count of word in keyed: " + str(vocab_obj.count))
    print("index of word in keyed: " + str(vocab_obj.index))
    allWords = google_model.index2word
    print("First word: " + str(allWords[1]))
    # totalCount = len(allWords)*(len(allWords)-1)/2.0
    totalCount = len(allWords)
    print("total words from totalCount: " + str(totalCount))
    # print(str(allWords))

    # create a pandas DataFrame with columns labeled
    # list_ = []
    # limit_array_length = 0
    # for w in google_model.index2word:
    #     if limit_array_length <= 1000:
    #         list_.append(w)
    #         limit_array_length += 1
    #     # print("w in google model " + str(w))
    # frame = pd.DataFrame(np.reshape(np.array(list_), -1), columns=["Word"])
    # frame = frame.drop_duplicates(['Word'], keep="first")
    # frame.to_csv("../first_words.csv", index=None)

    limit_array_length = 0
    freqs = {w: zipfWordFrequency(google_model.vocab[w], totalCount) for w in allWords}
    modelData = {w: google_model[w] for w in allWords}
    origModel = DictVectorModel(modelData)
    model = origModel
    thesaurus = {}
    # print(model._modelDict['computer'])
    # v1 = model._modelDict['the']  # for later comparison with reduced dict

    # Config Values
    maxDim = 4
    '''
    FREQUENCY for word counts with Google news vector (3 million words)
    calculated using FrequencyMetrics.py
    25000 words:   2.6820216255e-06
    50000 words:   1.34101081275e-06
    100000 words:  6.70505406374e-07
    150000 words:  4.4700360425e-07
    200000 words:  3.35252703187e-07
    250000 words:  2.6820216255e-07
    500000 words:  1.34101081275e-07
    750000 words:  8.94007208499e-08
    1000000 words: 6.70505406374e-08
    1250000 words: 5.364043251e-08
    1500000 words: 4.4700360425e-08
    1750000 words: 3.831459465e-08
    2000000 words: 3.35252703187e-08
    2250000 words: 2.98002402833e-08
    2500000 words: 2.6820216255e-08
    2750000 words: 2.43820147773e-08
    3000000 words: 2.23501802125e-08
    '''
    # 250000 words by freq thresh
    freqThresh = 2.6820216255e-07
    # just below 300000 words by rel thresh
    knownThresh = 1  # currently using cosine, avg relevance with dot is ??? 2.27510558315
    # clumpThresh = 0.99  # not yet implemented

    print("Initialized")
    start = time.time()
    filterReducer = VectorModelReducer(model, freqDict=freqs, knownWords=known)
    print("Filtering...")
    filteredModel = filterReducer.filterModel(freqThresh, knownThresh)
    # reduceDimModel = filterReducer.reduceDimModel(filterModel)
    end = time.time()
    elapsed = end - start
    print("Time to filter is " + str(elapsed))
    # print(filteredModel._modelDict['computer'])
    # v2 = filteredModel._modelDict['the']
    # norm = float([w**2 for w in v1])**0.5 + float([w**2 for w in v2])**0.5
    # similarity = 1 - scipy.spatial.distance.cosine(v1, v2)
    # print("Similarity is " + str(similarity))

    if not os.path.exists('vector_models'):
        os.mkdir('vector_models')
    model_file = 'vector_models' + os.sep + 'model_' + str(freqThresh) + '_' + str(knownThresh) + '.pkl'
    start = time.time()
    print("Dumping...")
    pickle_dump(filteredModel._modelDict, model_file)
    end = time.time()
    elapsed = end - start
    print("Time to dump is " + str(elapsed))
    print("Finished filtering, length of new model: " + str(filteredModel.getLength()))
    # with open(model_file, 'wb') as pickle_file:
    #     pickle.dump(filteredModel._modelDict, pickle_file)

    # Code for all combinations of freqThresh and knownThresh
    # for freq_thresh in freqThresh:
    #     for known_thresh in knownThresh:
    #         filteredModel = filterReducer.filterModel(freqThresh, knownThresh)
    #         model = filteredModel
    #         model_file = 'vector_models' + os.sep + 'model_' + str(freq_thresh) + '_' + str(known_thresh) + '.json'
    #         with open(model_file, 'w') as json_file:
    #             json.dump(model, json_file)

    # clumpReducer = VectorModelReducer(model)
    # clumpedModel, thesaurus = clumpReducer.clumpModel(clumpThresh)
    # model = clumpedModel
    return model
# Train the Doc2Vec model
model.train(train_data, total_examples=model.corpus_count, epochs=model.epochs)

#%%
# Google word2vec
import gensim
import gensim.downloader as api

path = api.load("word2vec-google-news-300", return_path=True)
print(path)

from gensim.models.keyedvectors import KeyedVectors
gensim_model = KeyedVectors.load_word2vec_format(path, binary=True, limit=300000)

gensim_model['regression']
gensim_model.most_similar(positive=['statistics', 'diagnostics', 'outlier'])
gensim_model.most_similar(positive=['Tea', 'United_States'], negative=['England'])
gensim_model.most_similar(positive=['statistics', 'diagnostics', 'outlier'])

#%%
# Wikipedia2vec - Studio Ousia
testlabel = []
labelfile = open('testlabel.txt')
for l in labelfile:
    line = l.strip('\n')
    if line in emo:
        testlabel.append(emo.index(line))
    else:
        testlabel.append(7)
labelfile.close()
'''

traintexts = [[word for word in document.split() if word not in stop_words]
              for document in traindata]
testtexts = [[word for word in document.split() if word not in stop_words]
             for document in testdata]

word_vectors = KeyedVectors.load_word2vec_format(
    'zhwiki_2017_03.sg_50d.word2vec', binary=False)
gensim_dict = Dictionary()
gensim_dict.doc2bow(word_vectors.vocab.keys(), allow_update=True)
w2indx = {v: k + 1 for k, v in gensim_dict.items()}  # word index, numbered starting from 1
w2vec = {word: word_vectors[word] for word in w2indx.keys()}

trainseq = text_to_index_array(w2indx, traintexts)
testseq = text_to_index_array(w2indx, testtexts)
traindata = pad_sequences(trainseq, maxlen=MAX_SEQUENCE_LENGTH)
testdata = pad_sequences(testseq, maxlen=MAX_SEQUENCE_LENGTH)
word_index = w2indx
print('Found %s unique tokens.' % len(word_index))

labels = to_categorical(np.asarray(label))
# testlabels = to_categorical(np.asarray(testlabel))
indices = np.arange(traindata.shape[0])
def test5():
    from gensim.models import KeyedVectors
    word_vectors = KeyedVectors.load_word2vec_format(file_to_path, binary=False)
        mean2 = mean2 + s(y, A)  # distance of all y from A in set 1 vs. set 2
    mean2 = mean2 / float(len(Y))
    sd = []
    for w in X + Y:
        sd.append(s(w, A))
    return (mean1 - mean2) / np.std(sd)


global wv
global wv2

base = 'vecs/pk_vectors.bin'
countries = glob.glob("vecs/*")
wv = KeyedVectors.load_word2vec_format(base, binary=True)

print "PK VS THE WORLD"
for country in countries:
    print "~~~~~~~~~~~~~~ " + country.split("/")[1] + " ~~~~~~~~~~~~~~"
    wv2 = KeyedVectors.load_word2vec_format(country, binary=True)

    woman = [
        "she", "woman", "female", "her", "hers", "girl", "daughter", "mother",
        "sister", "aunt"
    ]
    man = [
        "he", "man", "male", "his", "him", "boy", "son", "brother", "father",
with open('Final_Labeled_DataCollection-NTI_Tweets.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    for tweet in Tweets:
        if len(analyze(tweet[2])) >= 4 and tweet[0] != '-1':
            writer.writerow(tweet)
'''''

with open('Final_Labeled_DataCollection-NTI_Tweets.csv', 'r', newline='', encoding='utf-8') as f:
    reader = csv.reader(f)
    NTI_Tweets_1 = []
    NTI_Whole_Tweets = []
    for row in reader:
        NTI_Whole_Tweets.append(row)
        NTI_Tweets_1.append(row[2])

Google_Word2Vec = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True, encoding='latin-1')

vectorizer = CountVectorizer(min_df=1, stop_words='english', ngram_range=(1, 1), analyzer=u'word')
analyze = vectorizer.build_analyzer()

miss_words = []
Total_Tweet_Vec_List = []
for t in NTI_Tweets_1:
    Name = analyze(t)
    Tweet_Vec_List = []
    for word in Name:
        try:
            wordVec = Google_Word2Vec[word]
            Tweet_Vec_List.append(word)
        except KeyError:
            miss_words.append(word)
    Total_Tweet_Vec_List.append(Tweet_Vec_List)
def downloadGlove(file="C:/glove/glove.6B.50d.txt.w2v"):
    gant = KeyedVectors.load_word2vec_format(file, binary=False)
    return gant
Copyright (C) - All Rights Reserved
"""
import os

from gensim.models.keyedvectors import KeyedVectors

from examples.example_corpus import corpus
from nlpkit.nlp_feature_extraction import WordEmbedsDocVectorizer

# Replace with the path of a pre-trained w2v model
WORD_EMBEDDINGS_FILEPATH = os.path.abspath(
    os.path.join(os.path.dirname(__file__), '..', '..', 'fake-news', 'data', 'resources',
                 'word-embeddings', 'glove.6B', 'glove.6B.300d.word2vec.txt'))

if __name__ == "__main__":
    word2vec = KeyedVectors.load_word2vec_format(WORD_EMBEDDINGS_FILEPATH, binary=False)
    w2v_vectorizer = WordEmbedsDocVectorizer(word2vec, tfidf_weights=True)
    X = w2v_vectorizer.fit_transform(corpus)
    print(X)

    # Example usage in a pipeline
    # pipeline = Pipeline(
    #     ('vec', WordEmbedsDocVectorizer(word2vec, tfidf_weights=True)),
    #     ('clf', SVC(kernel='linear', C=1, probability=True))
    # ])
import codecs

from sklearn.manifold import TSNE
from gensim.models.keyedvectors import KeyedVectors
import numpy as np
import csv
from sklearn.metrics.pairwise import cosine_similarity
from util import vector_averaging
from util import vector_averaging_with_tfidf
from util import process_source_code
from util import word2weight
import sys

DIMENSION = 25

csv.field_size_limit(sys.maxsize)

PROJECT = "cordova"

cs_vectors = KeyedVectors.load_word2vec_format("./bi2vec_vectors/cs_vectors_10_25_include_functions.txt", binary=False)
java_vectors = KeyedVectors.load_word2vec_format("./bi2vec_vectors/java_vectors_10_25_include_functions.txt", binary=False)

with open("./sentences/sentences_cs_10.txt", "r") as cs_f:
    cs_data = cs_f.readlines()

with open("./sentences/sentences_java_10.txt", "r") as java_f:
    java_data = java_f.readlines()

cs_sentences = [x for x in cs_data]
java_sentences = [x for x in java_data]

cs_word2weight = word2weight(cs_sentences)
java_word2weight = word2weight(java_sentences)
# print cs_word2weight

# Predicting part ----------------------------------------------
# print(cosine_similarity(cs_vectors["while"].reshape(1,-1), java_vectors["class"].reshape(1,-1))
from argparse import ArgumentParser

from gensim.models.keyedvectors import KeyedVectors
import torch
from tqdm import tqdm

if __name__ == '__main__':
    parser = ArgumentParser(description='Convert binary word2vec to txt')
    parser.add_argument('input')
    parser.add_argument('output')
    args = parser.parse_args()

    model = KeyedVectors.load_word2vec_format(args.input, binary=True)
    model.save_word2vec_format(args.output, binary=False)
p1 = int(len(data) * (1 - VALIDATION_SPLIT - TEST_SPLIT))
p2 = int(len(data) * (1 - TEST_SPLIT))
x_train = data[:p1]
y_train = labels[:p1]
x_val = data[p1:p2]
y_val = labels[p1:p2]
x_test = data[p2:]
y_test = labels[p2:]
print('train docs: ' + str(len(x_train)))
print('val docs: ' + str(len(x_val)))
print('test docs: ' + str(len(x_test)))

###################################################################

print('(4) load word2vec as embedding...')
from gensim.models.keyedvectors import KeyedVectors
w2v_model0 = KeyedVectors.load_word2vec_format('uniqueWords.vector', encoding='utf-8')
# w2v_model1 = KeyedVectors.load_word2vec_format('medCorpus_meanVec.zh.vector', encoding='utf-8')
embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
not_in_model = 0
in_model = 0
# the origin word vector
for word, i in word_index.items():
    if word in w2v_model0:
        in_model += 1
        vec0 = w2v_model0[word].tolist()
        # vec1 = w2v_model1[word].tolist()
        embedding_matrix[i] = np.asarray(vec0, dtype='float32')
    else:
        not_in_model += 1
print(str(not_in_model) + ' words not in w2v model')
1. Uses the most recent publications of researchers as input to generate user profiles.
2. The pretrained word2vec model window_5.model.bin and candidate_paper.csv are available
   via a Google Drive link; download the files and change the paths in this script so it
   runs successfully.
3. Results are saved in rank_result_rm/rank_result_mr_own_corpus.csv.
"""
import sys

from gensim.models.keyedvectors import KeyedVectors
import numpy as np
import pandas as pd
from datetime import datetime

# load the pretrained model built on my own corpus
model = '/Users/sherry/Downloads/window_5/window_5.model.bin'
w2v_model = KeyedVectors.load_word2vec_format(model, binary=True)

# read all candidate papers info, contains two columns: paper ID and paper content
candidate_paper_df = pd.read_csv(
    '/Users/sherry/Downloads/candidate_papers.csv')


# define DocSim class to calculate document similarities
class DocSim(object):
    def __init__(self, w2v_model, stopwords=[]):
        self.w2v_model = w2v_model
        self.stopwords = stopwords

    def vectorize(self, doc):
        """Identify the vector values for each word in the given document"""
        doc = str(doc)
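# The DocSim class above is cut off after vectorize() starts. Below is a minimal
# sketch of the usual approach (average the in-vocabulary word vectors, then
# compare documents by cosine similarity), under the assumption that it mirrors
# the original implementation; the helper names are illustrative.
import numpy as np


def average_vector(doc, w2v_model, stopwords=()):
    """Average the vectors of in-vocabulary, non-stopword tokens in `doc`."""
    words = [w for w in str(doc).lower().split() if w not in stopwords]
    vectors = [w2v_model[w] for w in words if w in w2v_model]
    if not vectors:
        return np.zeros(w2v_model.vector_size)
    return np.mean(vectors, axis=0)


def cosine(a, b):
    """Cosine similarity between two document vectors (0.0 if either is zero)."""
    denom = np.linalg.norm(a) * np.linalg.norm(b)
    return float(np.dot(a, b) / denom) if denom else 0.0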
import math

from gensim.models.keyedvectors import KeyedVectors
import pkuseg

model = KeyedVectors.load_word2vec_format(
    'GoogleNews-vectors-negative300.bin', binary=True, limit=300000)

seg = pkuseg.pkuseg()
sentence_1 = ''
sentence_2 = ''
text1 = seg.cut(sentence_1)  # segment the sentence into words
text2 = seg.cut(sentence_2)

stopwords = [line.strip() for line in open('Englishstopwords.txt', encoding='UTF-8').readlines()]
clean1 = list()
clean2 = list()
for word in sentence_1.split(" "):
    if word not in stopwords:
        clean1.append(word)
for word in sentence_2.split(" "):
    if word not in stopwords:
        clean2.append(word)

vec1 = list()
vec2 = list()
sum = 0
sq1 = 0
def load_gensim_model(path_to_model):
    model = KeyedVectors.load_word2vec_format(path_to_model, binary=True)
    return model
from gensim.models.keyedvectors import KeyedVectors
from konlpy.tag import Twitter
import numpy as np

pos_vectors = KeyedVectors.load_word2vec_format('pos.vec', binary=False)
pos_vectors.most_similar("('남자','Noun')")

twitter = Twitter()
word = "대통령이"
pos_list = twitter.pos(word, norm=True)
word_vector = np.sum([pos_vectors.word_vec(str(pos).replace(" ", "")) for pos in pos_list], axis=0)
import csv
from sklearn.metrics.pairwise import cosine_similarity
from util import vector_averaging
from util import vector_averaging_with_tfidf
from util import process_source_code
from util import process_diff_srcml
from util import process_diff_srcml2
from util import word2weight
from util import process_expression
from util import mean_average_precision
from util import average_precision
from util import precision_at_k
import sys

DIMENSION = 20

cs_vectors = KeyedVectors.load_word2vec_format("./bi2vec_vectors/cs_vectors_11_20.txt", binary=False)
java_vectors = KeyedVectors.load_word2vec_format("./bi2vec_vectors/java_vectors_11_20.txt", binary=False)

with open("./sentences/sentences_cs_11.txt", "r") as cs_f:
    cs_data = cs_f.readlines()

with open("./sentences/sentences_java_11.txt", "r") as java_f:
    java_data = java_f.readlines()

cs_sentences = [x for x in cs_data]
java_sentences = [x for x in java_data]

cs_word2weight = word2weight(cs_sentences)
java_word2weight = word2weight(java_sentences)

with codecs.open("./evaluation_data/keywords.csv", "r") as f_csv:
)
# list9+list10+list11+list12 , list99+list1010+list1111+list1212
# test_data, test_label = preprocessing(list1+list2+list3+list4), enumerate_list(list111+list222+list333+list444)
# list1+list2+list3+list4 , list11+list22+list33+list44
# note: full test dataset is too huge -> memory error, so use part of it
test_data_list = pd.read_csv(
    os.getcwd() + '\\data\\EI-reg-En-part-test.csv')['Tweet'].tolist()
test_label_list = pd.read_csv(
    os.getcwd() + '\\data\\EI-reg-En-part-test.csv')['Affect Dimension'].tolist()
test_data, test_label = preprocessing(test_data_list), test_label_list

print("Train shape:", len(train_data), len(train_label))
print("Validation shape:", len(dev_data), len(dev_label))
print("Test shape:", len(test_data), len(test_label))

# Loading all models
glove_model = KeyedVectors.load_word2vec_format(
    'word2vec.twitter.27B.100d.txt', binary=False)  # load Glove model
w2v_model = Word2Vec.load('w2v_model.bin')  # load word2vec model
e2v_model = gsm.KeyedVectors.load_word2vec_format(
    'emoji2vec.bin', binary=True)  # load emoji2vec model
print("All Models Loaded!")

# word embedding data with glove pretrained model and real word2vec/w2v
input_data = np.concatenate((train_data, dev_data, test_data))
max_sequence_length = max([len(x) for x in input_data])  # find the length of the longest tweet
print("Max twitter length:", max_sequence_length)
print("input_data shape:", len(input_data))


# Find embedding for corpus
def embedding(data, max_len):
# if "System." in split[0] or "antlr" in split[0].lower(): cs_signature_tokens.append(split[0]) print "cs tokens : " + str(len(cs_signature_tokens)) for java_emb in java_embeddings: split = java_emb.split(" ") if func(split[0]) == True: if check_package_include(java_packages,split[0]) == True: # if "java." in split[0] or "antlr" in split[0].lower(): java_signature_tokens.append(split[0]) print "java tokens : " + str(len(java_signature_tokens)) print "Loading word embedding..........." cs_vectors = KeyedVectors.load_word2vec_format("./bi2vec_vectors/cs_vectors_new_window3.txt",binary=False) java_vectors = KeyedVectors.load_word2vec_format("./bi2vec_vectors/java_vectors_new_window3.txt",binary=False) print "Finish loading.............." # print cs_vectors.similar_by_vector(java_vectors["java.util.concurrent.locks.Lock.lock()"], topn=30) # print cs_vectors.similar_by_vector(java_vectors["package"], topn=30) def check_if_relevant_k_contains_exact_name(method_source, relevant_k): check = False for k in relevant_k: split = k.split(".")
print indexi
start = time.time()

num_features = 200    # Word vector dimensionality
min_word_count = 20   # Minimum word count
num_workers = 40      # Number of threads to run in parallel
context = 10          # Context window size
downsampling = 1e-3   # Downsample setting for frequent words

model_name = str(num_features) + "features_" + str(min_word_count) + "minwords_" + str(context) + "context_len2alldata"
# Load the trained Word2Vec model.
model_name = 'wordvectors_reuters_nonpoly.txt'
# model = Word2Vec.load(model_name)  # .syn0
# Get wordvectors for all words in vocabulary.
model = KeyedVectors.load_word2vec_format(model_name, binary=False)
word_vectors = model.syn0

all = pd.read_pickle('all.pkl')

start1 = time.time()
start = time.time()
# Set number of clusters.
num_clusters = 60
idx, idx_proba = cluster_GMM(num_clusters, word_vectors)
idx_proba[idx_proba < 0.2] = 0
n_clusteri = num_clusters
f = open(filename, 'a')
print "number of k clusters ", str(n_clusteri)
f.write("number of k clusters " + str(n_clusteri) + "\n")
start2 = time.time()
f.write("time taken in clustering " + str(start2 - start1) + "\n")

# Uncomment below lines for loading saved cluster assignments and probability of cluster assignments.
        h1 = F.dropout(h1, 0.3)
        h2 = F.dropout(h2, 0.3)
        h1 = F.relu(h1)
        h2 = F.relu(h2)
        h = F.concat([h1, h2])
        out = self.hy(h)
        # out = self.bn3(out)
        # out = F.dropout(out, 0.3)
        out = F.tanh(out)
        out = F.normalize(out)
        return F.mean_squared_error(out, t)


if __name__ == '__main__':
    dic = KeyedVectors.load_word2vec_format("trainer/glove.6B.100d.bin")
    original_word = 'does'
    glove_vec = dic.get_vector(original_word).reshape(1, 100)

    first_net = First_Network(100, 30, len(original_word))
    first_net.cleargrads()
    optimizer1 = optimizers.Adam()
    optimizer1.setup(first_net)

    second_net = Second_Network(27, 30, 50)
    second_net.cleargrads()
    second_net.reset()
    optimizer2 = optimizers.Adam()
    optimizer2.setup(second_net)

    second_net2 = Second_Network(27, 30, 50)
    second_net2.cleargrads()
odd = '/home/dpappas/'
###########################################################
w2v_bin_path = '/home/dpappas/bioasq_all/pubmed2018_w2v_30D.bin'
idf_pickle_path = '/home/dpappas/bioasq_all/idf.pkl'
###########################################################
avgdl, mean, deviation = 21.688767020746013, 0.7375801616020308, 1.3411418040865049
print(avgdl, mean, deviation)
###########################################################
k_for_maxpool = 5
k_sent_maxpool = 5
embedding_dim = 30  # 200
###########################################################
print('loading idfs')
idf, max_idf = load_idfs(idf_pickle_path)
print('loading w2v')
wv = KeyedVectors.load_word2vec_format(w2v_bin_path, binary=True)
wv = dict([(word, wv[word]) for word in wv.vocab.keys()])
###########################################################
my_seed = 1
random.seed(my_seed)
torch.manual_seed(my_seed)
###########################################################
print('Compiling model...')
model = Sent_Posit_Drmm_Modeler(embedding_dim=embedding_dim, k_for_maxpool=k_for_maxpool)
if (use_cuda):
    model = model.cuda()
###########################################################
resume_from = '/home/dpappas/bioasq_w2vjpdrmm_demo_run_0/best_dev_checkpoint.pth.tar'
load_model_from_checkpoint(resume_from)
if __name__ == '__main__':
    """
    @description: Train word vectors, save and load the models, and compute word similarity.
    Two ways to save and load a word2vec model (a model saved with save() can be trained
    further; save_word2vec_format() is faster and uses less memory):
        1. model.save('w2v.model') -> model = Word2Vec.load('w2v.model')
        2. model.wv.save_word2vec_format('w2v.bin') -> KeyedVectors.load_word2vec_format('w2v.bin')
    """
    train_segx_path = '../datasets/train_set.seg_x.txt'
    train_segy_path = '../datasets/train_set.seg_y.txt'
    test_segx_path = '../datasets/test_set.seg_x.txt'
    sentences_path = '../datasets/sentences.txt'
    w2v_bin_path = 'w2v.bin'
    ft_bin_path = 'ft.bin'
    voacb_path = '../datasets/voacb.txt'

    lines = read_data(train_segx_path)
    lines += read_data(train_segy_path)
    lines += read_data(test_segx_path)
    save_data(lines, sentences_path)

    word2vec_build(sentences_path, w2v_bin_path)
    fasttext_build(sentences_path, ft_bin_path)

    w2v_model = KeyedVectors.load_word2vec_format(w2v_bin_path, binary=True)
    model_test(w2v_model, '汽车', '车')
    ft_model = KeyedVectors.load_word2vec_format(ft_bin_path, binary=True)
    model_test(ft_model, '汽车', '车')

    mt = embedding_matrix(w2v_model, voacb_path)
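# A small illustration of the two save/load routes named in the docstring above,
# using a toy corpus; the file names 'toy.model' and 'toy.bin' are placeholders.
from gensim.models import Word2Vec
from gensim.models.keyedvectors import KeyedVectors

toy_model = Word2Vec([['汽车', '车', '司机'], ['道路', '汽车']], min_count=1)
# Route 1: full model, can continue training after loading.
toy_model.save('toy.model')
reloaded = Word2Vec.load('toy.model')
# Route 2: vectors only, smaller and faster to load, but no further training.
toy_model.wv.save_word2vec_format('toy.bin', binary=True)
vectors = KeyedVectors.load_word2vec_format('toy.bin', binary=True)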
def load_w2v_data(self, binary_file_name):
    self.w2v_model = KeyedVectors.load_word2vec_format(
        os.path.join(DATA_PATH, binary_file_name), binary=True)
validation = pd.read_csv("preprocess/validation_char.csv")
validation["content"] = validation.apply(lambda x: eval(x[1]), axis=1)

model_dir = "model_capsule_char/"
maxlen = 1000
max_features = 20000
batch_size = 128
epochs = 1

tokenizer = text.Tokenizer(num_words=None)
tokenizer.fit_on_texts(data["content"].values)
with open('tokenizer_char.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

word_index = tokenizer.word_index

w2_model = KeyedVectors.load_word2vec_format("word2vec/chars.vector", binary=True, encoding='utf8', unicode_errors='ignore')
embeddings_index = {}
embeddings_matrix = np.zeros((len(word_index) + 1, w2_model.vector_size))
word2idx = {"_PAD": 0}
vocab_list = [(k, w2_model.wv[k]) for k, v in w2_model.wv.vocab.items()]
for word, i in word_index.items():
    if word in w2_model:
        embedding_vector = w2_model[word]
    else:
        embedding_vector = None
    if embedding_vector is not None:
        embeddings_matrix[i] = embedding_vector

column_list = [
def add_embedding(self, embedding_file):
    self.embedding_files.append(KeyedVectors.load_word2vec_format(embedding_file, binary=self.binary))
# import gensim
import tensorflow as tf
from keras.backend.tensorflow_backend import set_session

config = tf.ConfigProto()
config.gpu_options.per_process_gpu_memory_fraction = 0.7
set_session(tf.Session(config=config))

# basic model for tag recommendation
# Embedding layer -> BiLSTM -> Dense with softmax

# word model
# embeddings_file_bin = '../glove/vectors.bin'
# word_model = KeyedVectors.load_word2vec_format('../glove/vectors.txt', binary=False, unicode_errors='ignore')
word_model = KeyedVectors.load_word2vec_format('word2vec/vec_Body_Title.bin', binary=True, unicode_errors='ignore')
# meta_model = KeyedVectors.load_word2vec_format('metapath2vec/code_metapath2vec/stack_new_1000', binary=True, unicode_errors='ignore')

user_id = pickle.load(open("user.p", 'rb'))
user_tag = pickle.load(open("user_tags.p", 'rb'))
user_num = pickle.load(open("user_num.p", 'rb'))
count = len(user_tag)

meta_model = {}
openfile = open("graph_train.emd", 'r')
for line in openfile:
    arr = line.split()
    meta_model[arr[0]] = arr[1:]
# print meta_model['0']
def setUp(self):
    self.vectors = EuclideanKeyedVectors.load_word2vec_format(
        datapath('euclidean_vectors.bin'), binary=True, datatype=np.float64)
def load_embeddings(self, embeddingsfilename):
    self.emb = KeyedVectors.load_word2vec_format(embeddingsfilename, binary=False)
def load_model(self, datatype):
    path = datapath('high_precision.kv.txt')
    kv = KeyedVectors.load_word2vec_format(path, binary=False, datatype=datatype)
    return kv