def test_embedding(): from gensim.models import KeyedVectors from sematch.utility import FileIO from sematch.semantic.relatedness import WordRelatedness model_wiki = KeyedVectors.load_word2vec_format(FileIO.filename('models/w2v-model-enwiki_w2vformat'), binary=True) model_news = KeyedVectors.load_word2vec_format(FileIO.filename('models/googlenews.bin'), binary=True) rel = WordRelatedness(model_news) print(rel.word_similarity('happy','sad'))
def setUp(self): self.source_word_vec_file = datapath("EN.1-10.cbow1_wind5_hs0_neg10_size300_smpl1e-05.txt") self.target_word_vec_file = datapath("IT.1-10.cbow1_wind5_hs0_neg10_size300_smpl1e-05.txt") self.word_pairs = [("one", "uno"), ("two", "due"), ("three", "tre"), ("four", "quattro"), ("five", "cinque"), ("seven", "sette"), ("eight", "otto"), ("dog", "cane"), ("pig", "maiale"), ("fish", "cavallo"), ("birds", "uccelli"), ("apple", "mela"), ("orange", "arancione"), ("grape", "acino"), ("banana", "banana") ] self.test_word_pairs = [("ten", "dieci"), ("cat", "gatto")] self.source_word_vec = KeyedVectors.load_word2vec_format(self.source_word_vec_file, binary=False) self.target_word_vec = KeyedVectors.load_word2vec_format(self.target_word_vec_file, binary=False)
def __init__(self): print("Loading in word vectors...") self.word_vectors = KeyedVectors.load_word2vec_format( '../large_files/GoogleNews-vectors-negative300.bin', binary=True ) print("Finished loading in word vectors")
def load(self, *args, **kwargs) -> KeyedVectors:
    """
    Load embeddings from the given file.

    Args:
        *args: arguments
        **kwargs: arguments

    Returns:
        KeyedVectors with the loaded embeddings
    """
    if not (self.load_path and self.load_path.is_file()):
        log.error('No pretrained GloVe model provided or provided load_path "{}" is incorrect.'
                  .format(self.load_path))
        sys.exit(1)
    # Check that a header with "n_words emb_dim" is present
    with open(self.load_path, encoding='utf8') as f:
        header = f.readline()
        if len(header.split()) != 2:
            raise RuntimeError('The GloVe file must start with a "number_of_words embeddings_dim" line! '
                               'For example "40000 100" for a 40000-word vocabulary and 100-dimensional '
                               'embeddings.')
    log.info("[loading embeddings from `{}`]".format(self.load_path))
    model_file = str(self.load_path)
    model = KeyedVectors.load_word2vec_format(model_file)
    return model
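# For reference, a GloVe file converted to word2vec text format is expected to start with a
# "number_of_words embeddings_dim" header, followed by one "word v1 v2 ..." line per word.
# A minimal illustrative sample (the values below are made up):
#
#   40000 100
#   the 0.418 0.249 -0.412 ...
#   of 0.708 0.570 -0.471 ...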
def testConversion(self):
    word2vec2tensor(word2vec_model_path=self.datapath, tensor_filename=self.output_folder)

    with smart_open(self.metadata_file, 'rb') as f:
        metadata = f.readlines()

    with smart_open(self.tensor_file, 'rb') as f:
        vectors = f.readlines()

    # check if the number of words and vector size in the tensor file line up with word2vec
    with smart_open(self.datapath, 'rb') as f:
        first_line = f.readline().strip()
        number_words, vector_size = map(int, first_line.split(b' '))
        self.assertTrue(len(metadata) == len(vectors) == number_words,
                        ('Metadata file %s and tensor file %s imply different number of rows.'
                         % (self.metadata_file, self.tensor_file)))

    # grab metadata and vectors from the written files
    metadata = [word.strip() for word in metadata]
    vectors = [vector.replace(b'\t', b' ') for vector in vectors]

    # load the original KeyedVectors model
    orig_model = KeyedVectors.load_word2vec_format(self.datapath, binary=False)

    # check that the KV model and the tensor files have the same values key-wise
    for word, vector in zip(metadata, vectors):
        word_string = word.decode("utf8")
        vector_string = vector.decode("utf8")
        vector_array = np.array(list(map(float, vector_string.split())))
        np.testing.assert_almost_equal(orig_model[word_string], vector_array, decimal=5)
def get_model(): """ Download model :return: `gensim` model """ return KeyedVectors.load_word2vec_format(_download(), binary=True)
def testAnnoyIndexingOfKeyedVectors(self): from gensim.similarities.index import AnnoyIndexer keyVectors_file = datapath('lee_fasttext.vec') model = KeyedVectors.load_word2vec_format(keyVectors_file) index = AnnoyIndexer(model, 10) self.assertEqual(index.num_trees, 10) self.assertVectorIsSimilarToItself(model, index) self.assertApproxNeighborsMatchExact(model, model, index)
def load_embeddings(self, file_path):
    # Embeddings must be in fastText format, either binary (.bin) or plain text
    print('Loading embeddings...')
    if file_path.endswith('.bin'):
        from gensim.models.wrappers import FastText
        embeddings = FastText.load_fasttext_format(file_path)
    else:
        from gensim.models import KeyedVectors
        embeddings = KeyedVectors.load_word2vec_format(file_path)
    return embeddings
def load_word2vec_embeddings(filepath, tokenizer, max_features, embedding_size): model = KeyedVectors.load_word2vec_format(filepath, binary=True) emb_mean, emb_std = model.wv.syn0.mean(), model.wv.syn0.std() word_index = tokenizer.word_index nb_words = min(max_features, len(word_index)) embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embedding_size)) for word, i in word_index.items(): if i >= max_features: continue try: embedding_vector = model[word] embedding_matrix[i] = embedding_vector except KeyError: continue return embedding_matrix
def load(cls, np2vec_model_file, binary=False, word_ngrams=0): """ Load the np2vec model. Args: np2vec_model_file (str): the file containing the np2vec model to load binary (bool): boolean indicating whether the np2vec model to load is in binary format word_ngrams (int {1,0}): If 1, np2vec model to load uses word vectors with subword ( ngrams) information. Returns: np2vec model to load """ if word_ngrams == 0: return KeyedVectors.load_word2vec_format( np2vec_model_file, binary=binary) elif word_ngrams == 1: return FastText.load(np2vec_model_file) else: logger.error('invalid value for \'word_ngrams\'')
def wv(w1, w2, t):
    # lazily load the word-vector model...
    global wvmodel
    if wvmodel is None:
        print(' *', 'loading wordvector model (', modelFile, ')...')
        wvmodel = KeyedVectors.load_word2vec_format(modelFile, binary=False)
        wvmodel.init_sims(replace=True)  # no more updates, prune memory
    try:
        #
        # since we've got wordnet synset objects (like cat.n.01), we
        # must turn this back into a regular word ('cat') because the
        # word vector GloVe models are plain words with spaces turned
        # into hyphens on phrases (e.g. climate-change, black-and-white)
        #
        wv_w1, wv_w2 = _mk_wv_word(w1), _mk_wv_word(w2)
        distance = wvmodel.similarity(wv_w1, wv_w2)
        return distance if abs(distance) >= t else 0
    except Exception:
        return 0
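# The helper _mk_wv_word used above is not shown in this snippet. A minimal sketch of what such a
# conversion could look like, assuming the synsets are passed as their name strings (e.g. 'cat.n.01';
# for nltk Synset objects, call .name() first) and GloVe uses hyphenated phrase tokens:

def _mk_wv_word(synset_name):
    """Hypothetical helper: turn a synset name like 'black_and_white.a.01' into 'black-and-white'."""
    lemma = synset_name.split('.')[0]        # drop the '.pos.nn' suffix
    return lemma.replace('_', '-').lower()   # underscores -> hyphens, lowercase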
def load_embeddings(pytorch_embedding, word2idx, filename, embedding_size): print("Copying pretrained word embeddings from ", filename, flush=True) en_model = KeyedVectors.load_word2vec_format(filename) """ Fetching all of the words in the vocabulary. """ pretrained_words = set() for word in en_model.vocab: pretrained_words.add(word) arr = [0] * len(word2idx) for word in word2idx: index = word2idx[word] if word in pretrained_words: arr[index] = en_model[word] else: arr[index] = np.random.uniform(-1.0, 1.0, embedding_size) """ Creating a numpy dictionary for the index -> embedding mapping """ arr = np.array(arr) """ Add the word embeddings to the empty PyTorch Embedding object """ pytorch_embedding.weight.data.copy_(torch.from_numpy(arr)) return pytorch_embedding
def fit(self, X, y=None):
    dw_params = self.get_params()
    print(dw_params)
    if False:  # exists(self.output_file):
        model = KeyedVectors.load_word2vec_format(self.output_file)
    else:
        model = run_gensim(dw_params)
    nb_vecs = len(model.wv.vocab)
    # Map nodes to their features (note: assumes nodes are labeled as integers 1:N)
    features_matrix = np.asarray([model[str(node)] for node in range(nb_vecs)])
    #features_matrix = np.random.randn((4,2))
    if self.normalize:
        norms = np.linalg.norm(features_matrix, axis=1)
        if self.verbose:
            print(norms)
            print(norms.shape)
        assert norms.shape[0] == features_matrix.shape[0]
        for i in range(features_matrix.shape[0]):
            features_matrix[i, :] /= norms[i]
        norms = np.linalg.norm(features_matrix, axis=1)
        if self.verbose:
            print(norms)
    if self.verbose:
        print('features_matrix.shape = %s' % str(features_matrix.shape))
    self.dw_params_ = dw_params
    self.gs_model_ = model
    self.features_matrix_ = features_matrix
    print('fit', self.features_matrix_.shape)
    return self
def eval_blogcat(embeddings_file, labels_matrix=None, G=None, verbose=1, normalize=1, training_percents=[0.1, 0.6, 0.9]): # 0. Files #embeddings_file = "/mnt/raid1/deepwalk/blogcatalog.vec" if labels_matrix is None and G is None: G, labels_matrix = load_blogcat() # 1. Load Embeddings model = KeyedVectors.load_word2vec_format(embeddings_file, binary=False) labels = np.argwhere(labels_matrix) label_cnts = pd.Series(labels[:,1]).value_counts() if verbose > 1: print('\nLabel counts:') print(label_cnts) # delete the least frequent labels, which causes balancing problems labels_matrix = labels_matrix[:, :-2] # Map nodes to their features (note: assumes nodes are labeled as integers 1:N) features_matrix = np.asarray([model[str(node)] for node in range(len(G))]) if normalize: norms = np.linalg.norm(features_matrix, axis=1) if verbose: print norms print norms.shape assert norms.shape[0] == features_matrix.shape[0] for i in range(features_matrix.shape[0]): features_matrix[i,:] /= norms[i] norms = np.linalg.norm(features_matrix, axis=1) if verbose: print norms if verbose: print('-'*100) print(embeddings_file) print('features_matrix.shape = %s' % str(features_matrix.shape)) print('labels_matrix.shape = %s' % str(labels_matrix.shape)) # 2. Shuffle, to create train/test groups shuffles = [] number_shuffles = 1 for x in range(number_shuffles): # if we just have one group, make the split the same every time if number_shuffles == 1: shuffles.append(skshuffle(features_matrix, labels_matrix, random_state=123)) else: shuffles.append(skshuffle(features_matrix, labels_matrix)) # 3. to score each train/test group all_results = defaultdict(list) # uncomment for all training percents #training_percents = np.asarray(range(1,10))*.1 for train_percent in training_percents: # print('-'*100) # print('pct_train: %.2f' % train_percent) for shuf in shuffles: X, y = shuf training_size = int(train_percent * X.shape[0]) X_train = X[:training_size, :] y_train = y[:training_size] X_test = X[training_size:, :] y_test = y[training_size:] clf = TopKRanker(LogisticRegression()) clf.fit(X_train, y_train) # find out how many labels should be predicted #top_k_list = [len(l) for l in y_test] top_k_list = np.array(np.sum(y_test, axis=1).flatten()[0])[0].astype(np.int32) preds = clf.predict(X_test, top_k_list) if y_test.shape[1] != preds.shape[1]: raise Exception("imbalance of class dims") #continue results = OrderedDict() averages = ["micro", "macro", "samples", "weighted"] for average in averages: results[average] = f1_score(y_test, preds, average=average) all_results[train_percent].append(results) #break if verbose: print '-------------------' for train_percent in sorted(all_results.keys()): print 'Train percent:', train_percent for x in all_results[train_percent]: print x print '-------------------' return all_results
print("creating word sequences...") ws, ys = [], [] fin = open(INPUT_FILE, "rb") for line in fin: label, sent = line.strip().split("\t") ys.append(int(label)) words = [x.lower() for x in nltk.word_tokenize(sent)] wids = [word2index[word] for word in words] ws.append(wids) fin.close() W = pad_sequences(ws, maxlen=maxlen) Y = np_utils.to_categorical(ys) # load GloVe vectors print("loading word2vec vectors...") word2vec = KeyedVectors.load_word2vec_format(WORD2VEC_MODEL, binary=True) print("transferring embeddings...") X = np.zeros((W.shape[0], EMBED_SIZE)) for i in range(W.shape[0]): E = np.zeros((EMBED_SIZE, maxlen)) words = [index2word[wid] for wid in W[i].tolist()] for j in range(maxlen): try: E[:, j] = word2vec[words[j]] except KeyError: pass X[i, :] = np.sum(E, axis=1) Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, test_size=0.3, random_state=42)
def get_word_embeddings(): word_embeddings = KeyedVectors.load_word2vec_format("wiki-news-300d-1M-subword.vec") word_embeddings.init_sims(replace=True) return word_embeddings
""" Main function. """ from util import is_word, load_model_embedding, info_from_line, log, get_tensor, repack_tensors from data import InputData from nn import CNN import time from gensim.models import KeyedVectors from functools import partial import tensorflow from tensorflow.python import debug def filter_1(string): return is_word(string) MODEL = KeyedVectors.load_word2vec_format("../data/GoogleNews-vectors-negative300.bin", binary=True) def readFile(path): load_model = partial(load_model_embedding, model=MODEL) inp = InputData(path, info_from_line, load_model, [filter_1]) return inp def print_info_map(map_name, map_data, top_k, file): with open(file, 'w') as f: for k, v in sorted(map_data.items(), key=lambda x: x[1], reverse=True)[0:top_k]: f.write("'%s'\t%d\n" % (k, v)) def create_file_info(path, prefix="", save_file_pref="", data_f=None, model_f=None): inp = readFile(path) all_words = {}
def get_model():
    '''Download and load the `gensim` model.

    :return: the loaded model
    '''
    return KeyedVectors.load_word2vec_format(download(), binary=False)
def _get_embedding(self, embedding_path): model = KeyedVectors.load_word2vec_format(embedding_path) vocab = model.vocab vocab_len = len(vocab) return np.array([model.word_vec(k) for k in vocab.keys()])
nargs="+", help='location of json file with definitions.') parser.add_argument('--save', type=str, required=True, nargs="+", help='where to save files') parser.add_argument("--w2v", type=str, required=True, help="location of binary w2v file") args = parser.parse_args() if len(args.defs) != len(args.save): parser.error("Number of defs files must match number of save locations") word_vectors = KeyedVectors.load_word2vec_format(args.w2v, binary=True) for i in range(len(args.defs)): vectors = [] with open(args.defs[i], "r") as infile: definitions = json.load(infile) for elem in definitions: if elem[0][0] in word_vectors: vectors.append(word_vectors[elem[0][0]]) else: vectors.append(np.zeros(word_vectors.vector_size)) vectors = np.array(vectors) np.save(args.save[i], vectors)
from gensim.models import KeyedVectors
from mysite.settings import BASE_DIR
from pathlib import Path
import random, re
import CaboCha
import sys

sys.path.append(str(Path(BASE_DIR).joinpath('handaioh_NLP/utils/').resolve()))
# from Spotlight_return import Spotlight_return

data_path = str(
    Path(BASE_DIR).joinpath(
        'handaioh_NLP/utils/data/word2vec.300d.ja.txt').resolve())
model = KeyedVectors.load_word2vec_format(data_path)

# Based on the IREX named-entity classes
repl_align = {
    'ORGANIZATION': 'どこの機関',
    'PERSON': '誰',
    'LOCATION': 'どこの場所',
    'DATE': 'いつ',
    'TIME': 'いつ',
    'MONEY': 'いくら',
    'PERCENT': 'どのくらい',
    'ARTIFACT': '何',
    'O': '何',
}


class Chunk:
    def __init__(self):
        self.words = []
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec

# input file (GloVe format)
glove_file = r'E:/Study/Codings/python_work/nlp_pro1/word2vec_model/glove.6B/glove.6B.300d.txt'
# output file (word2vec format)
tmp_file = r'E:/Study/Codings/python_work/nlp_pro1/word2vec_model/glove_vec/glove.6B.300d.txt'

# command-line equivalent:
# python -m gensim.scripts.glove2word2vec --input <glove_file> --output <w2v_file>

# run the conversion
glove2word2vec(glove_file, tmp_file)

# load the converted file
model = KeyedVectors.load_word2vec_format(tmp_file)
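# Once loaded, the converted vectors can be queried like any word2vec KeyedVectors.
# A brief sanity check (the query words are only examples and assume they appear in the glove.6B vocabulary):
print(model.most_similar('computer', topn=5))  # nearest neighbours of "computer"
print(model.similarity('king', 'queen'))       # cosine similarity between two words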
def __loadModel(self): print("Loading model '{0}'...".format(self.__modelPath)) self.__model = KeyedVectors.load_word2vec_format(self.__modelPath, binary=True) print("Loaded!")
def w2v_export(embedding_file):
    try:
        model = KeyedVectors.load(embedding_file)
    except Exception:
        model = KeyedVectors.load_word2vec_format(embedding_file)
def normalize_word(embedding_file):
    # normalize the word vectors
    try:
        model = KeyedVectors.load(embedding_file)
    except Exception:
        model = KeyedVectors.load_word2vec_format(embedding_file)
from gensim.models import KeyedVectors import time # Get the time at the beginning of the load start_time = time.time() print(time.ctime(start_time)) # Load the model file loaded_model = KeyedVectors.load_word2vec_format( '~/Documents/glove_word2vec/word2vec.840B.300d.txt') # Get the time at the end of the load and calculate how long it took end_time = time.time() print(time.ctime(end_time)) elapsed_time = end_time - start_time print('Loaded model file in ' + str(elapsed_time / 60.0) + ' minutes') # Get most similar words to "day" word1 = loaded_model.get_vector('day') # print(word1) print(loaded_model.most_similar(positive=['day']))
from gensim.models import KeyedVectors
en_vectors = KeyedVectors.load_word2vec_format('data/wiki-news-300d-1M.vec', binary=False)

from gensim.models import Word2Vec
vi_vectors = Word2Vec.load('data/vi.bin').wv

# # Note: for GloVe models, convert them to word2vec format first
# # For example:
# from gensim.scripts.glove2word2vec import glove2word2vec
# glove2word2vec('data/glove.6B.50d.txt', 'data/en.vec')

en_vectors.vocab
en_vectors["cat"]

print("vector size: ", en_vectors.vector_size)
print("vocab size: ", len(en_vectors.vocab))
print("vector size: ", vi_vectors.vector_size)
print("vocab size: ", len(vi_vectors.vocab))

en_vectors.most_similar("cat")
vi_vectors.most_similar("mèo")

sim_words = en_vectors.most_similar(positive=['queen', 'man'], negative=['king'])
print('Queen is a: ', sim_words[0][0])

sim_words = en_vectors.most_similar(negative=['king'], positive=['kings', 'queen'])
from tensorflow.python.keras.saving import load_model
from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.layers import Dense, GRU, Embedding, LSTM, Bidirectional
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences
from tensorflow.python.keras.optimizers import RMSprop
from tensorflow.python.keras.optimizers import Adam
# split into training and test samples
from sklearn.model_selection import train_test_split
from tensorflow.python.keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard, ReduceLROnPlateau

# ignore warnings
warnings.filterwarnings("ignore")

# test loading the pre-trained model
cn_model = KeyedVectors.load_word2vec_format('sgns.zhihu.bigram', binary=False, unicode_errors="ignore")
# print(cn_model.similarity('橘子', '橙子'))
# print(cn_model.most_similar(positive=['大学'], topn=10))

# our dataset has 4,000 reviews
# only use the top 50,000 Chinese words for now (testing); in production all of them can be used
num_words = 50000
# word-vector dimensionality -- set to match the dimensions in sgns.zhihu.bigram
embedding_dim = 300
# maximum input length -- the number of tokens kept for each processed review
max_tokens = 236
# checkpoint file for storing the weights
path_checkpoint = 'checkpoint.h5'

# build the model
model = Sequential()
def do_keras_textcnn_w2v(text,stars,trainable): #转换成词袋序列 max_document_length=200 embedding_dims = 300 #获取已经训练好的词向量 model = KeyedVectors.load_word2vec_format(word2vec_file, binary=True) print model['word'].shape #设置分词最大个数 即词袋的单词个数 tokenizer = Tokenizer(num_words=max_features,lower=True) tokenizer.fit_on_texts(text) sequences = tokenizer.texts_to_sequences(text) x=pad_sequences(sequences, maxlen=max_document_length) #我们可以使用从scikit-learn LabelEncoder类。 # 这个类通过 fit() 函数获取整个数据集模型所需的编码,然后使用transform()函数应用编码来创建一个新的输出变量。 encoder=LabelEncoder() encoder.fit(stars) encoded_y = encoder.transform(stars) #labels = to_categorical(np.asarray(labels))也可以进行数据处理 #获取word到对应数字编号的映射关系 word_index = tokenizer.word_index print('Found %s unique tokens.' % len(word_index)) #获取词向量的映射矩阵 embedding_matrix = np.zeros((max_features + 1, embedding_dims)) for word, i in word_index.items(): #编号大于max_features的忽略 该字典是按照字典顺序 所以对应的id不一定是顺序的 if i > max_features: continue try: embedding_matrix[i] = model[word].reshape(embedding_dims) except: print "%s not found!" % (word) #构造神经网络 def baseline_model(): #CNN参数 #filters个数通常与文本长度相当 便于提取特征 filters = max_document_length # Inputs input = Input(shape=[max_document_length]) # 词向量层,本文使用了预训练word2vec词向量,把trainable设为False x = Embedding(max_features + 1, embedding_dims, weights=[embedding_matrix], trainable=trainable)(input) # conv layers convs = [] for filter_size in [3,4,5]: l_conv = Conv1D(filters=filters, kernel_size=filter_size, activation='relu')(x) l_pool = MaxPooling1D()(l_conv) l_pool = Flatten()(l_pool) convs.append(l_pool) merge = concatenate(convs, axis=1) out = Dropout(0.2)(merge) output = Dense(32, activation='relu')(out) output = Dense(units=2, activation='softmax')(output) #输出层 model = Model([input], output) model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy']) #可视化 plot_model(model, to_file='yelp-cnn-model-textcnn.png',show_shapes=True) model.summary() return model #在 scikit-learn 中使用 Keras 的模型,我们必须使用 KerasClassifier 进行包装。这个类起到创建并返回我们的神经网络模型的作用。 # 它需要传入调用 fit()所需要的参数,比如迭代次数和批处理大小。 # 最新接口指定训练的次数为epochs clf = KerasClassifier(build_fn=baseline_model, epochs=10, batch_size=50, verbose=1) #使用5折交叉验证 scores = cross_val_score(clf, x, encoded_y, cv=5, scoring='f1_micro') # print scores print("f1_micro: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
def load_model():
    # load the pre-trained word vectors (here: GloVe 6B 100d converted to word2vec format)
    global model
    filename = 'glove.6B.100d.txt.word2vec'
    model = KeyedVectors.load_word2vec_format(filename, binary=False)
import os import json from gensim.models import KeyedVectors print ' *', 'loading wv model' modelFile = os.environ['HOME'] + "/models/" + "glove.6B.300d_word2vec.txt" model = KeyedVectors.load_word2vec_format(modelFile, binary=False) print ' *', 'model ready' w1 = 'nostalgia' w2 = 'memory' print ' *', w1, w2, 'similarity:', model.similarity(w1, w2) for w in ['nostalgia', 'blurred', 'figurative_art', 'erotic', 'voyeurism']: if w in model: print ' *', w, model.most_similar(positive=[w]) words = set(["contemporary conceptualism", "appropriation", "contemporary participation", "colombian", "color photography", "american", "figurative art", "language", "abstract versus figurative art", "consumerism", "art that plays with scale", "architecture in art", "korean", "assemblage", "calarts", "collage", "1980s", "biomorphic", "collective history", "found objects", "grotesque", "cut/ripped", "decay", "united states", "flatness", "group of objects", "china", "chinese", "graffiti", "street art", "graffiti/street art", "color theory", "abstract sculpture", "art in art", "film/video", "singaporean", "cinematic", "brazil", "abstract", "brazilian", "'85 new wave", "city scenes", "drawing", "cultural commentary", "endurance art", "feminism", "bedrooms and bathrooms", "canadian", "columns and totems", "architecture's effects", "close-up", "1918 - 1939", "documentary photography", "black-and-white photography", "italian", "monochromatic", "gender", "globalization", "outdoor art", "mixed-media", "mexican", "mexico", "1990s", "ceramic", "animals", "artists' books", "1970s", "contemporary fact versus fiction", "art and technology", "installation art", "erased and obscured", "erotic", "contemporary grotesque", "etching/engraving", "abstract painting", "photoconceptualism", "bright/vivid", "abstract photography", "dark", "focus on materials", "contemporary traces of memory", "miniature and small-scale paintings", "conceptual", "photography", "japanese", "japan", "dutch", "contemporary vintage photography", "comic", "calligraphic", "belgium", "belgian", "contemporary surrealistic", "animation", "1960s", "collecting and modes of display", "cityscapes", "chance", "spain", "spanish", "black and white", "americana", "indian", "contemporary graphic realism", "conflict", "malaysian", "caricature / parody", "cross-cultural dialogue", "neo-conceptualism", "advertising and brands", "vietnamese", "australia and new zealand", "figurative painting", "central america", "el salvador", "food", "german-american", "germany", "puerto rican", "allover composition", "southern cone", "isolation", "sexual identity", "argentinean", "antiquity as subject", "contemporary archaeological", "human figure", "nude", "contemporary pop", "british", "indonesian", "anthropomorphism", "celebrity", "pakistani", "digital culture", "political", "violence", "social action", "contemporary diy", "narrative", "design", "architecture", "hard-edged", "minimalism", "flora", "chicano art", "crime", "color gradient", "contemporary color fields", "childhood", "suburbia", "blurred", "mexican american", "artist as ethnographer", "venezuelan", "humor", "figurative sculpture", "allegory", "focus on the social margins", "neo-concretism", "cuban", "myth/religion", "immersive", "modern", "pakistani-american", "angular", "costa rican", "abstract landscape", "body art", "performance art", "abject art", "light and space movement", "line, form and color", "classical mythology", "sculpture", "work on paper", "argentinian", "peruvian", "individual portrait", "automatism", 
"cuba", "engagement with mass media", "cubism", "emerging art"]) results = {} unmatched = [] for w in words: x = w.replace(' ', '-') if x in model:
def setUp(self): self.model = KeyedVectors.load_word2vec_format( rocanr.app.config['VECTOR_FILE'], binary=False) rocanr.app.testing = True self.app = rocanr.app.test_client()
weibo_neg = pd.read_table(path.join(path.dirname(__file__), '..', 'data', 'weibo_neg.txt'),
                          header=None, sep='\n', encoding='utf8')
weibo_neg['label'] = 0
all_ = all_.append(weibo_neg, ignore_index=True)
wb_len = len(all_) - jd_len
print('len(all_) = ' + str(len(all_)))

stop_words = load_stop_words()
all_['words'] = all_[0].apply(lambda s: extract_cn_jd(s).split(' '))  # tokenize with jieba
print(all_['words'])

w2v_model = KeyedVectors.load_word2vec_format(path.join(
    path.dirname(__file__), '..', 'data', 'w2v_onlycn_100_c_2.bin'),
    binary=True, unicode_errors='ignore')

word2vec_dim = 100
maxlen = 100   # truncate each review to this many words
min_count = 5  # drop words that occur fewer times than this -- the simplest form of dimensionality reduction

content = []
for i in all_['words']:
    content.extend(i)
# e.g. '收到', '少', '一本', '钱', '算啦', '这本', '宝宝', ... -- all tokens concatenated into one list

# build the dictionary index (made from status_big_seg.txt, covering the full vocabulary): {word: integer index}
dict_index = pd.Series(content).value_counts()    # index is the word, value is its count
dict_index = dict_index[dict_index >= min_count]  # drop words that occur fewer than 5 times
dict_index[:] = range(1, len(dict_index) + 1)     # renumber the values from 1 to 13212
from gensim.models import KeyedVectors def cos_sim(word,define_words,model_novice,model_expert): c_n = {} c_e = {} for i in define_words: c_n[i] = model_novice.wv.similarity(word, i) c_e[i] = model_expert.wv.similarity(word, i) #print(word + "(" + model.wv.similarity(word, i)+ ")"+ i) return c_e,c_n model_novice = KeyedVectors.load_word2vec_format("very_novice_epoch4.bin", binary=True) model_expert = KeyedVectors.load_word2vec_format("expert_epoch4.bin", binary=True) word = "badminton" word1 = "dance" word2 = "shooter" word3 = "psychotherapy" define_words = ["baddy", "ace", "alley", "backcourt", "baseline", "carry", "court", "deception", "doubles", "dribble", "drive", "drop",
print(' word freq index ...') num_freq_words = 0 model = None file_path = "" f_words = [] with open("generated/" + args.language + "/word_wiki_freq.txt") as f: for line in f: parts = unquote(line.strip()).split('\t') w = parts[0] if not is_stop_word_or_number(w) and parts[1].isdigit(): freq_words[w] = int(parts[1]) num_freq_words += 1 if args.word_vecs == "w2v": model = KeyedVectors.load_word2vec_format('data/basic_data/wordEmbeddings/w2v/GoogleGoogleNews-vectors-negative300.bin', binary=True) elif args.word_vecs == "fasttext": model = KeyedVectors.load_word2vec_format("data/basic_data/wordEmbeddings/fasttext/cc."+args.language+".300.vec", binary=False) elif args.word_vecs == "muse": file_path = 'data/basic_data/wordEmbeddings/muse/wiki.multi.' + args.language + '.vec' if not path.exists(file_path): url = "https://dl.fbaipublicfiles.com/arrival/vectors/wiki.multi." + args.language + ".vec" urllib.request.urlretrieve(url, file_path) model = KeyedVectors.load_word2vec_format("data/basic_data/wordEmbeddings/muse/wiki.multi."+args.language+".vec", binary=False) common_w2v_freq_words = [ word for word in model.vocab if word in freq_words ] print("common_w2v_freq_words : ", len(common_w2v_freq_words)) we_word2id = {} we_id2word = {} if path.exists("generated/" + args.language + "/we_word_id.p"):
cv_file = inDir + "/CVSchema/Prav_CVindices_5folds.csv" CV_Schema = pd.read_csv(cv_file) train_df = pd.merge(train_df, CV_Schema, how='left', on=['id', 'qid1', 'qid2']) act = 'relu' re_weight = True # whether to re-weight classes to fit the 17.5% share in test set STAMP = 'dn51_question_pairs_weights.h5' ######################################## ## index word vectors ######################################## print('Indexing word vectors') word2vec = KeyedVectors.load_word2vec_format(EMBEDDING_FILE, \ binary=True) print('Found %s word vectors of word2vec' % len(word2vec.vocab)) ######################################## ## process texts in datasets ######################################## print('Processing text dataset') # The function "text_to_wordlist" is from # https://www.kaggle.com/currie32/quora-question-pairs/the-importance-of-cleaning-text def text_to_wordlist(text, remove_stopwords=False, stem_words=False): # Clean the text, with the option to remove stopwords and to stem words. # Convert words to lower case and split them text = text.lower().split()
def calcfeatures(stancesFile, bodiesFile): path = os.path.abspath("") #gensim.models.KeyedVectors.load_word2vec_format #wmd_model = Word2Vec.load_word2vec_format('/data/w2v_googlenews/GoogleNews-vectors-negative300.bin.gz', binary=True) wmd_model = KeyedVectors.load_word2vec_format(path+'/data/GoogleNews-vectors-negative300.bin', binary=True) wmd_model.init_sims(replace=True) tknzr = TweetTokenizer() count = 0 features = [] classes = [] #N = getDocCount(path+'/data/training/train_bodies.csv') keys = {'agree': 0, 'disagree': 1, 'discuss': 2, 'unrelated': 3} bodies = loadBodies(bodiesFile) bigram_vectorizer = CountVectorizer(tokenizer=tknzr.tokenize, ngram_range=(1, 2), binary=False, lowercase=True, stop_words='english', min_df=1) vectorizer = TfidfVectorizer(tokenizer=tknzr.tokenize, ngram_range=(1, 1), binary=False, lowercase=True, stop_words='english', min_df=1) tfidfMat = vectorizer.fit_transform(list(bodies.values())) tfidfMat = vectorizer.transform(list(bodies.values())) tfidfMat = tfidfMat.toarray() vocab = vectorizer.get_feature_names() k = list(bodies.keys()) bodiesTokens = loadBodiesTokens(bodiesFile) with open(stancesFile, 'r', encoding='UTF-8') as csvDataFile1: csvReader1 = csv.reader(csvDataFile1) first = 1 for row in csvReader1: f = [] if first == 1: first = 0 else: print(count) count = count + 1 #class classes.append(keys[row[2]]) #canberra distance f.append(feat.canberraDist(row[0],bodies[row[1]], bigram_vectorizer)) #polarity scores neg, neu, pos = feat.polarityScores(row[0], bodies[row[1]]) f.append(neg) f.append(neu) f.append(pos) tokens1 = tknzr.tokenize(row[0]) tokens1=[token.lower() for token in tokens1 if (token.isalpha() and token not in stop_words)] tokens2 = bodiesTokens[row[1]] #word movers distance f.append(feat.wmd(tokens1, tokens2,wmd_model)) #common words common = (set(tokens1) & set(tokens2)) f.append(feat.overlap(common)) #tfidf f.append(feat.tfidf(tfidfMat, common,vocab,k.index(row[1]))) #negations f.append(feat.negWords(tokens1,tokens2)) #add all features features.append(f) return np.array(features), np.array(classes)
for language in ['fr', 'en']:
    print("loading resources...")
    start = time.time()
    URIs = config['URI_' + language]
    stopwords = utils.load_stopwords(
        path_to_resources + URIs['stopwords']
    )
    filler_words = utils.load_filler_words(
        path_to_resources + URIs['filler_words']
    )
    word_vectors = KeyedVectors.load_word2vec_format(
        path_to_resources + URIs['word_vectors'],
        binary=True
    )
    language_model = LanguageModel(
        path_to_resources + URIs['language_model']
    )
    pos_tagger = StanfordPOSTagger(
        model_filename=path_to_resources + URIs['pos_tagger_model'],
        path_to_jar=path_to_resources + URIs['pos_tagger_jar']
    )
    print("time_cost = %.2fs" % (time.time() - start))
    resources[language] = {
        'stopwords': stopwords,
return def main(args): ''' Pipeline for representational learning for all nodes in a graph. ''' nx_G = read_graph(args) G = node2vec.Graph(nx_G, args.directed, args.p, args.q) G.preprocess_transition_probs() walks = G.simulate_walks(args.num_walks, args.walk_length) learn_embeddings(walks) if __name__ == "__main__": args = parse_args() args.input = 'user_edges' args.output = 'user_vec' args.walk_length = 5 args.num_walks = 10 # args.weighted = True # args.directed = True args.dimensions = 64 args.window_size = 2 args.p = 2 args.q = 2 main(args) model = KeyedVectors.load_word2vec_format('user_vec') print(model.wv.most_similar('4'))
from gensim.models import Word2Vec, KeyedVectors

def loadVectors(location, model='gensim', binary=True):
    if model == 'gensim':
        return Word2Vec.load(location)
    elif model == 'w2v':
        return KeyedVectors.load_word2vec_format(location, binary=binary)
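# Usage sketch for loadVectors; the file paths below are hypothetical placeholders:
gensim_model = loadVectors('my_corpus.model', model='gensim')  # native gensim model saved with model.save(...)
keyed_vectors = loadVectors('GoogleNews-vectors-negative300.bin', model='w2v', binary=True)  # word2vec-format file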
#!/usr/bin/python # -*- coding: utf-8 -*- from gensim.models import KeyedVectors model = KeyedVectors.load_word2vec_format('/home/tj/big_data/data/talk/2j3s.vec', binary=False) model.save_word2vec_format('/home/tj/big_data/data/talk/2j3s.vec.bin', binary=True)
# Download the WordSimilarity-353 Test Collection evaluation data, and compute the Spearman
# correlation coefficient between the similarity ranking computed from the word vectors and
# the ranking of human similarity judgments.
import pandas as pd
import numpy as np
from gensim.models import KeyedVectors
from tqdm import tqdm


def cosSim(v1, v2):
    return np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))


def calcCosSim(row):
    global model
    w1v = model[row['Word 1']]
    w2v = model[row['Word 2']]
    return cosSim(w1v, w2v)


tqdm.pandas()
model = KeyedVectors.load_word2vec_format('./GoogleNews-vectors-negative300.bin', binary=True)
df = pd.read_csv('./wordsim353/combined.csv')
df['cosSim'] = df.progress_apply(calcCosSim, axis=1)
print(df[['Human (mean)', 'cosSim']].corr(method='spearman'))
For more information on this file, see
https://docs.djangoproject.com/en/1.10/topics/settings/

For the full list of settings and their values, see
https://docs.djangoproject.com/en/1.10/ref/settings/
"""
import os
from gensim.models import KeyedVectors

# Build paths inside the project like this: os.path.join(BASE_DIR, ...)
BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
settings_dir = os.path.dirname(__file__)
PROJECT_ROOT = os.path.abspath(os.path.dirname(settings_dir))

MODEL_PATH = os.path.join(PROJECT_ROOT, 'apollo/w2v/GoogleNews-vectors-negative300.bin.gz')
MODEL = KeyedVectors.load_word2vec_format(MODEL_PATH, unicode_errors='replace',
                                          binary=True, limit=10000)

# Quick-start development settings - unsuitable for production
# See https://docs.djangoproject.com/en/1.10/howto/deployment/checklist/

# SECURITY WARNING: keep the secret key used in production secret!
SECRET_KEY = ')p1#0dnupk$xc59wdfl^%!7)4myi--la+xd4=$krk&a55$%0rz'

# SECURITY WARNING: don't run with debug turned on in production!
DEBUG = True

ALLOWED_HOSTS = []

# Application definition
def setUp(self): self.vectors = EuclideanKeyedVectors.load_word2vec_format( datapath('euclidean_vectors.bin'), binary=True, datatype=np.float64)
def load_google_vec(): from gensim.models import KeyedVectors return KeyedVectors.load_word2vec_format('~/nlp/w2v/GoogleNews-vectors-negative300.bin.gz', binary=True)
def categorizer():
    # driver function: returns the model output mapped onto the input corpora as a dict object
    stats = open('stats.txt', 'w', encoding='utf-8')

    st = time.time()
    wordmodelfile = "C:/Users/anush/Desktop/Venter_CMS-master/Venter/ML_model/Civis/MAX.bin"
    wordmodel = KeyedVectors.load_word2vec_format(wordmodelfile, binary=True, limit=200000)

    keywords = {
        'test_data': ['bedbugs', 'cctv', 'pipeline', 'Open spaces', 'gutter', 'garbage', 'rats',
                      'mice', 'robbery', 'theft', 'passage', 'galli', 'lane', 'light',
                      'bathrooms not clean', 'toilets not clean', 'playarea', 'mosquito',
                      'fogging', 'water'],
    }

    #wordmodelfile = os.path.join(BASE_DIR, 'Venter/ML_model/Civis/MAX.bin')
    wordmodel = KeyedVectors.load_word2vec_format(wordmodelfile, binary=True, limit=200000)
    et = time.time()
    s = 'Word embedding loaded in %f secs.' % (et-st)
    print(s)
    stats.write(s + '\n')

    # filepaths
    #responsePath = os.path('./comments/')
    responsePath = ('./comments/')
    responseDomains = os.listdir('./comments/')
    #responseDomains.sort()

    # dictionary for populating the json output
    results = {}

    for responseDomain in zip(responseDomains):
        # instantiating the key for the domain
        responseDomain = str(responseDomain)
        domain = responseDomain[2:-7]
        responseDomain = responseDomain[2:-3]
        #domain = responseDomain[:-4]
        print("ResponseDomain is: ", responseDomain)
        print("Domain is: ", domain)
        results[domain] = {}
        print('Categorizing %s domain...' % domain)

        temp = open(os.path.join(responsePath, responseDomain), 'r', encoding='utf-8-sig')
        responses = temp.readlines()

        rows = 0
        for response in responses:
            response = list(filter(None, response.lower().split('.')))
            num = 0
            if '\n' in response:
                num += 1
            rows += (len(response) - num)

        categories = keywords[domain]
        columns = len(categories)
        #categories = category

        # saving the scores in a similarity matrix
        # initializing the matrix with -1 to catch dump/false entries
        st = time.time()
        similarity_matrix = [[-1 for c in range(columns)] for r in range(rows)]
        et = time.time()
        s = 'Similarity matrix initialized in %f secs.' % (et-st)
        print(s)
        stats.write(s + '\n')

        row = 0
        st = time.time()
        for response in responses:
            response = list(filter(None, response.lower().split('.')))
            print("Row: ", row)
            for single_response in response:
                print("Current sentence is: ", single_response)
                if len(single_response) == 1:
                    continue
                #print(single_response)
                if single_response == '\n':
                    continue
                else:
                    column = 0
                    for category in categories:
                        print("Current category is: ", category)
                        similarity_matrix[row][column] = wmd_similarity(single_response, category, wordmodel)
                        column += 1
                row += 1
        et = time.time()
        s = 'Similarity matrix populated in %f secs.' % (et-st)
        print(s)
        stats.write(s + '\n')

        print('Initializing json output...')
        for catName in categories:
            results[domain][catName] = []

        print('Populating category files...')
        for score_row, response in zip(similarity_matrix, responses):
            #max_sim_index = len(categories)-1
            response = list(filter(None, response.lower().split('.')))
            for single_response in response:
                if single_response != '\n':
                    print("Current score row: \n", np.array(score_row))
                    min_sim_index = len(categories) - 1
                    #if np.array(score_row).sum() > 0:
                    min_sim_index = np.array(score_row).argmin()
                    temp = {}
                    temp['response'] = single_response
                    temp['score'] = float((np.array(score_row).min()))
                    # else:
                    #temp = response
                    results[domain][categories[min_sim_index]].append(temp)
        print('Completed.\n')

    # sorting domain-wise categorised responses based on scores
    for domain in results:
        for category in results[domain]:
            temp = results[domain][category]
            if len(temp) == 0 or category == 'Novel':
                continue
            #print(temp)
            results[domain][category] = sorted(temp, key=lambda k: k['score'], reverse=True)
            #newlist = sorted(list_to_be_sorted, key=lambda k: k['name']) --> to sort a list of dictionaries
    print('***********************************************************')

    with open('out_new_2.json', 'w') as temp:
        json.dump(results, temp)
    return results
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models import KeyedVectors
import numpy as np
from sklearn.decomposition import PCA
from matplotlib import pyplot
import pandas as pd

#%%
glove_path = 'F:/year 3/zsl/class_embedding/GloVe/glove.6B.300d.txt'
w2v_path = 'F:/year 3/zsl/class_embedding/GloVe/glove.6B.300d.txt.word2vec'
glove2word2vec(glove_path, w2v_path)

#%%
model = KeyedVectors.load_word2vec_format(w2v_path, binary=False)
results = model.most_similar(positive=['woman', 'king'], negative=['man'], topn=3)
print(results)

words = list(model.vocab)
print(len(words))

#%%
X = model[model.vocab]  # 400000 x 300
print(X.shape)
def load_word2vec_dataset():
    words = [
        "airplane", "alarm clock", "ant", "ape", "apple", "metal",  # armour
        "axe", "banana", "bat", "bear", "bee", "beetle", "bell", "bench",
        "bicycle", "blimp", "bread", "butterfly", "cabin", "camel", "candle",
        "cannon", "car", "castle", "cat", "chair", "chicken", "church",
        "couch", "cow", "crab", "crocodile", "cup", "deer", "dog", "dolphin",
        "door", "duck", "elephant", "eyeglasses", "fan", "fish", "flower",
        "frog", "geyser", "giraffe", "guitar", "hamburger", "hammer", "harp",
        "hat", "hedgehog", "helicopter", "hermit crab", "horse",
        "hot air balloon", "hot dog", "hour glass", "jack o lantern",
        "jelly fish", "kangaroo", "knife", "lion", "lizard", "lobster",
        "motorcycle", "mouse", "mushroom", "owl", "parrot", "pear", "penguin",
        "piano", "pickup truck", "pig", "pineapple", "pistol", "pizza",
        "pretzel", "Rabbit", "raccoon", "racket", "ray", "rhinoceros",
        "rifle", "rocket", "sail boat", "saw", "saxophone", "scissors",
        "scorpion", "seagull", "seal", "sea turtle", "shark", "sheep",
        "shoe", "skyscraper", "snail", "snake", "songbird", "spider",
        "spoon", "squirrel", "starfish", "strawberry", "swan", "sword",
        "table", "tank", "teapot", "teddy bear", "tiger", "tree", "trumpet",
        "turtle", "umbrella", "violin", "volcano", "wading bird",
        "wheel chair", "windmill", "window", "wine bottle", "zebra",
    ]

    model = KeyedVectors.load_word2vec_format(
        'dataset/GoogleNews-vectors-negative300.bin', binary=True)
    wv_embeddings = np.zeros((125, 300))
    #print(model['cars'])
    #print type(model['cars'])

    # multi-word class names are embedded as the mean of their token vectors
    for i in range(125):
        if i == 1:
            wv_embeddings[i, :] = (model['alarm'] + model['clock']) / 2
        elif i == 6:
            wv_embeddings[i, :] = model['metal']
        elif i == 53:
            wv_embeddings[i, :] = (model['hermit'] + model['crab']) / 2
        elif i == 55:
            wv_embeddings[i, :] = (model['hot'] + model['air'] + model['balloon']) / 3
        elif i == 56:
            wv_embeddings[i, :] = (model['hot'] + model['dog']) / 2
        elif i == 57:
            wv_embeddings[i, :] = (model['hour'] + model['glass']) / 2
        elif i == 58:
            wv_embeddings[i, :] = (model['jack'] + model['lantern']) / 2
        elif i == 59:
            wv_embeddings[i, :] = (model['jelly'] + model['fish']) / 2
        elif i == 73:
            wv_embeddings[i, :] = (model['pickup'] + model['truck']) / 2
        elif i == 86:
            wv_embeddings[i, :] = (model['sail'] + model['boat']) / 2
        elif i == 93:
            wv_embeddings[i, :] = (model['sea'] + model['turtle']) / 2
        elif i == 111:
            wv_embeddings[i, :] = (model['teddy'] + model['bear']) / 2
        elif i == 119:
            wv_embeddings[i, :] = (model['wading'] + model['bird']) / 2
        elif i == 120:
            wv_embeddings[i, :] = (model['wheel'] + model['chair']) / 2
        elif i == 123:
            wv_embeddings[i, :] = (model['wine'] + model['bottle']) / 2
        else:
            print(i)
            wv_embeddings[i, :] = model[words[i]]

    scipy.io.savemat('dataset/wv_embeddings.mat', {'features': wv_embeddings})  # saving
    return words
vectors_to_compare.append(vector) """ if len(target_words) > 1: if "_".join(target_words) in model: target_vector = model["_".join(target_words)] else: target_vectors = [model[word] for word in target_words if word in model] # and word not in stops)] if target_vectors: target_vector = np.mean(target_vectors, axis=0) else: return 0.0 else: try: target_vector = model[target_words[0]] except: return 0.0 """ return 1 - cosine(vectors_to_compare[0], vectors_to_compare[1]) if __name__ == "__main__": embeddings_path = 'GoogleNews-vectors-negative300.bin' #embeddings_path = "numberbatch-en-17.06.txt" model = KeyedVectors.load_word2vec_format(embeddings_path, binary=False) with open('stopwords.txt', 'r') as f: stops = set(line.strip() for line in f.readlines()) stops = stops.union(string.punctuation) #print(cos_similarity(["make", "you", "sneeze"], ["separate"], model, stops))
def main(): parser = ArgumentParser("scoring", formatter_class=ArgumentDefaultsHelpFormatter, conflict_handler='resolve') parser.add_argument("--emb", required=True, help='Embeddings file') parser.add_argument("--format", default='mat', help='mat or edgelist') parser.add_argument( "--network", required=True, help= 'A .mat file containing the adjacency matrix and node labels of the input network.' ) parser.add_argument( "--adj-matrix-name", default='network', help='Variable name of the adjacency matrix inside the .mat file.') parser.add_argument( "--label-matrix-name", default='group', help='Variable name of the labels matrix inside the .mat file.') parser.add_argument("--num-shuffles", default=2, type=int, help='Number of shuffles.') parser.add_argument( "--all", default=False, action='store_true', help= 'The embeddings are evaluated on all training percents from 10 to 90 when this flag is set to true. ' 'By default, only training percents of 10, 50 and 90 are used.') args = parser.parse_args() # 0. Files embeddings_file = args.emb matfile = args.network # 1. Load Embeddings model = KeyedVectors.load_word2vec_format(embeddings_file, binary=False) # 2. Load labels mat = loadmat(matfile) A = mat[args.adj_matrix_name] graph = sparse2graph(A) labels_matrix = mat[args.label_matrix_name] labels_count = labels_matrix.shape[1] mlb = MultiLabelBinarizer(range(labels_count)) # Map nodes to their features (note: assumes nodes are labeled as integers 1:N) features_matrix = numpy.asarray( [model[str(node)] for node in range(len(graph))]) # 2. Shuffle, to create train/test groups shuffles = [] for x in range(args.num_shuffles): shuffles.append(skshuffle(features_matrix, labels_matrix)) # 3. to score each train/test group all_results = defaultdict(list) if args.all: training_percents = numpy.asarray(range(1, 10)) * .1 else: training_percents = [0.1, 0.5, 0.9] for train_percent in training_percents: for shuf in shuffles: X, y = shuf training_size = int(train_percent * X.shape[0]) X_train = X[:training_size, :] y_train_ = y[:training_size] y_train = [[] for x in range(y_train_.shape[0])] cy = y_train_.tocoo() for i, j in zip(cy.row, cy.col): y_train[i].append(j) assert sum(len(l) for l in y_train) == y_train_.nnz X_test = X[training_size:, :] y_test_ = y[training_size:] y_test = [[] for _ in range(y_test_.shape[0])] cy = y_test_.tocoo() for i, j in zip(cy.row, cy.col): y_test[i].append(j) clf = TopKRanker(LogisticRegression()) clf.fit(X_train, y_train_) # find out how many labels should be predicted top_k_list = [len(l) for l in y_test] preds = clf.predict(X_test, top_k_list) results = {} averages = ["micro", "macro"] for average in averages: results[average] = f1_score(mlb.fit_transform(y_test), mlb.fit_transform(preds), average=average) all_results[train_percent].append(results) print('Results, using embeddings of dimensionality', X.shape[1]) print('-------------------') for train_percent in sorted(all_results.keys()): print('Train percent:', train_percent) for index, result in enumerate(all_results[train_percent]): print('Shuffle #%d: ' % (index + 1), result) avg_score = defaultdict(float) for score_dict in all_results[train_percent]: for metric, score in iteritems(score_dict): avg_score[metric] += score for metric in avg_score: avg_score[metric] /= len(all_results[train_percent]) print('Average score:', dict(avg_score)) print('-------------------')
def pega_dados(vecfile, target, ant, syn): import csv from gensim.models import KeyedVectors cosine_ant = [] cosine_syn = [] subcos_ant = [] subcos_syn = [] mod = KeyedVectors.load_word2vec_format("/home/bthalenberg/ic/novos novos/"+vecfile, binary=False) i = 0 while i != len(target): #getting cosine similary between target and antonym try: cos = mod.similarity(target[i], ant[i]) except KeyError: cos = None cosine_ant.append(cos) #getting cosine similary between target and synonym try: cos_s = mod.similarity(target[i], syn[i]) except KeyError: cos_s = None cosine_syn.append(cos_s) #subtracting the antonym cosine similarity from the synonym similarity for syn input try: subcos_syn.append(cos_s - cos) except TypeError: subcos_syn.append(None) #negating subtracted values for ant input try: subcos_ant.append(-(cos_s - cos)) except TypeError: subcos_ant.append(None) i += 1 dirname = vecfile[:-4] with open(dirname+"/db_ant.csv", "w", encoding="utf-8") as f: writer = csv.writer(f) i = 0 while i != len(target): writer.writerow([target[i], ant[i], cosine_ant[i]]) i += 1 with open(dirname+"/db_syn.csv", "w", encoding="utf-8") as f: writer = csv.writer(f) i = 0 while i != len(target): writer.writerow([target[i], syn[i], cosine_syn[i]]) i += 1 with open(dirname+"/db_sub_ant.csv", "w", encoding="utf-8") as f: writer = csv.writer(f) i = 0 while i != len(target): writer.writerow([target[i], ant[i], subcos_ant[i]]) i += 1 with open(dirname+"/db_sub_syn.csv", "w", encoding="utf-8") as f: writer = csv.writer(f) i = 0 while i != len(target): writer.writerow([target[i], syn[i], subcos_syn[i]]) i += 1
word2int_vietnamese.get(word, word2int_vietnamese["<unk>"]) for word in words ] # Add int_seq to seq_list seq_list.append(int_seq) return seq_list # Define the max length of english and vietnamese english_max_len = 50 vietnamese_max_len = 50 # load model word_embed_english_w2v = KeyedVectors.load_word2vec_format( args["word_emb_src"], binary=True, unicode_errors='ignore') # Sort the int2word int2word_sorted = sorted(int2word_english.items()) # Get the list of word embedding corresponding to int value in ascending order word_emb_list = list() embedding_size = len(word_embed_english_w2v['the']) for int_val, word in int2word_sorted: # Add Glove embedding if it exists if (word in word_embed_english_w2v): word_emb_list.append(word_embed_english_w2v[word]) # Otherwise, the value of word embedding is 0 else: word_emb_list.append(np.zeros([embedding_size], dtype=np.float32))
def main(args): # Build the correct ensembly information print("Building Embeddings") if args.ensembler == "Infuser": if not (args.skipensembler): ensemble_infuser_method_StrucDiff2vec() output = args.output + "Infuser" + str(args.dimensions) + ".emb" ## Decide ensemble method to choose between multiple vocabs or infuser embeddings_model = KeyedVectors.load_word2vec_format(output) elif args.ensembler == "strucdiff2vec": if not (args.skipensembler): ensemble_method_StrucDiff2vec() args.dimensions = int(args.dimensions / 2) output1 = args.output + "struc2vec" + str(args.dimensions) + ".emb" ## Decide ensemble method to choose between multiple vocabs or infuser embeddings_model_struc = KeyedVectors.load_word2vec_format(output1) output2 = args.output + "diff2vec" + str(args.dimensions) + ".emb" ## Decide ensemble method to choose between multiple vocabs or infuser embeddings_model_diff = KeyedVectors.load_word2vec_format(output2) embeddings_model = [embeddings_model_struc, embeddings_model_diff] args.dimensions = int(args.dimensions * 2) elif args.ensembler == "Skip ensembler": output = "Embeddings/file.emb" ## Decide ensemble method to choose between multiple vocabs or infuser embeddings_model = KeyedVectors.load_word2vec_format(output) else: if args.ensembler == "node2vec": if not (args.skipensembler): no_ensemble_method_node2vec() output = args.output + "node2vec" + str(args.dimensions) + ".emb" ## Decide ensemble method to choose between multiple vocabs or infuser embeddings_model = KeyedVectors.load_word2vec_format(output) elif args.ensembler == "struc2vec": if not (args.skipensembler): no_ensemble_method_struc2vec() output = args.output + "struc2vec" + str(args.dimensions) + ".emb" ## Decide ensemble method to choose between multiple vocabs or infuser embeddings_model = KeyedVectors.load_word2vec_format(output) elif args.ensembler == "diff2vec": if not (args.skipensembler): no_ensemble_method_diff2vec() output = args.output + "diff2vec" + str(args.dimensions) + ".emb" ## Decide ensemble method to choose between multiple vocabs or infuser embeddings_model = KeyedVectors.load_word2vec_format(output) else: print("No ensembly method chosen try again") return print("Loaded Embeddings") ## Transform the dataset to be trained in the model. dataset_pos, dataset_neg = dataset_transformer(embeddings_model, args.datasetinput, args.dimensions, args.usenegativesample, args.buildnegativesample) ## Transform the testset to be trained in the model. testset_pos, testset_neg = dataset_transformer(embeddings_model, args.testsetinput, args.dimensions, args.usenegativesample, args.buildnegativesample) ## Transform the validset to be trained in the model. validset_pos, validset_neg = dataset_transformer(embeddings_model, args.validsetinput, args.dimensions, args.usenegativesample, args.buildnegativesample) ## Preprocess data dataset, targets = dataset_preprocess_hadamard(dataset_pos, dataset_neg) testset, testtargets = dataset_preprocess_hadamard(testset_pos, testset_neg) validset, validtargets = dataset_preprocess_hadamard( validset_pos, validset_neg) print("Building model") ## build mode with tensorflow print("Training Model") ## Run the training on the dataset. model_accuracy = train([dataset, targets], [testset, testtargets], [validset, validtargets], args) return model_accuracy
# Making an empty column in our test data for predicted labels. test['Predicted Label'] = '' print("Unique words in Training Data: {}".format(train_unique_words)) print("Unique words in Test Data: {}".format(test_unique_words)) preprocess() trained.head() test.head() !wget https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz word2vec = KeyedVectors.load_word2vec_format("GoogleNews-vectors-negative300.bin.gz", binary=True) """# **Repeating Part 1 with Word2Vec**""" def extract_features(sentence): words = [word for word in sentence.split() if word in word2vec.vocab] if words == []: return [] else: return np.mean(word2vec[words],axis=0) train_embeddings = [] for sentence in trained['Tweet']: words = extract_features(sentence) if words == []:
num_dense = np.random.randint(100, 150) rate_drop_lstm = 0.15 + np.random.rand() * 0.25 rate_drop_dense = 0.15 + np.random.rand() * 0.25 act = 'relu' re_weight = True # whether to re-weight classes to fit the 17.5% share in test set STAMP = 'lstm_%d_%d_%.2f_%.2f'%(num_lstm, num_dense, rate_drop_lstm, \ rate_drop_dense) ######################################## ## index word vectors ######################################## print('Indexing word vectors') word2vec = KeyedVectors.load_word2vec_format(EMBEDDING_FILE, \ binary=True) print('Found %s word vectors of word2vec' % len(word2vec.vocab)) ######################################## ## process texts in datasets ######################################## print('Processing text dataset') # The function "text_to_wordlist" is from # https://www.kaggle.com/currie32/quora-question-pairs/the-importance-of-cleaning-text def text_to_wordlist(text, remove_stopwords=False, stem_words=False): # Clean the text, with the option to remove stopwords and to stem words. # Convert words to lower case and split them text = text.lower().split() # Optionally, remove stop words if remove_stopwords:
"United States", "United States Minor Outlying Islands", "Uruguay", "Uzbekistan", "Vanuatu", "Venezuela", "Viet Nam", "Virgin Islands, British", "Virgin Islands, U.s.", "Wallis And Futuna", "Western Sahara", "Yemen", "Zambia", "Zimbabwe"] model = KeyedVectors.load_word2vec_format('section10/matrix_word2vec.txt', binary=True) country_to_id = defaultdict(int) matrix = np.empty([0,300], dtype=np.float) cnt = 0 for c in country: try: matrix = np.vstack([matrix, model[c]]) country_to_id[c] = cnt cnt += 1 except: pass io.savemat("section10/country_matrix", {"matrix":matrix}) with open("section10/country_to_id", "wb") as f: pickle.dump(country_to_id, f)
continue X = np.array(X)/np.linalg.norm(X) Y = np.array(Y)/np.linalg.norm(Y) o = np.dot(X, Y.T)/np.linalg.norm(X)/np.linalg.norm(Y) scores.append(o) scores = np.asarray(scores) return np.mean(scores), 1.96*np.std(scores)/float(len(scores)), np.std(scores) if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument('ground_truth', help="ground truth text file, one example per line") parser.add_argument('predicted', help="predicted text file, one example per line") parser.add_argument('embeddings', help="embeddings bin file") args = parser.parse_args() print("loading embeddings file...") w2v = KeyedVectors.load_word2vec_format(args.embeddings, binary=True) r = average(args.ground_truth, args.predicted, w2v) print("Embedding Average Score: %f +/- %f ( %f )" %(r[0], r[1], r[2])) r = greedy_match(args.ground_truth, args.predicted, w2v) print("Greedy Matching Score: %f +/- %f ( %f )" %(r[0], r[1], r[2])) r = extrema_score(args.ground_truth, args.predicted, w2v) print("Extrema Score: %f +/- %f ( %f )" %(r[0], r[1], r[2]))
raise FileExistsError() if not os.path.exists(PATH_ENTITY_VECTOR): raise FileExistsError() ## initialize tokenizer funtion ## tokenizer_obj = MecabWrapper(dictType='neologd') get_token = partial(__func_japanese_tokenizer, tokenizer_obj=tokenizer_obj, pos_condition=POS_CONDITION, is_surface=False) ## load word embedding ## try: embedding_model = KeyedVectors.load_word2vec_format( PATH_ENTITY_VECTOR, **{ 'binary': True, 'unicode_errors': 'ignore' }) except: embedding_model = Word2Vec.load_word2vec_format( PATH_ENTITY_VECTOR, **{ 'binary': True, 'unicode_errors': 'ignore' }) ## make training data ## with open(PATH_TRAINING_TEXT, 'r') as f: seq_wikipedia_training_text = json.loads(f.read()) seq_training_input_text_obj = [] for i, wikipedia_article_obj in enumerate(seq_wikipedia_training_text):
from future.utils import iteritems from builtins import range # Note: you may need to update your version of future # sudo pip install -U future from gensim.models import KeyedVectors # warning: takes quite awhile # https://code.google.com/archive/p/word2vec/ # direct link: https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit?usp=sharing # 3 million words and phrases # D = 300 word_vectors = KeyedVectors.load_word2vec_format( '../large_files/GoogleNews-vectors-negative300.bin', binary=True ) # convenience # result looks like: # [('athens', 0.6001024842262268), # ('albert', 0.5729557275772095), # ('holmes', 0.569324254989624), # ('donnie', 0.5690680742263794), # ('italy', 0.5673537254333496), # ('toni', 0.5666348338127136), # ('spain', 0.5661854147911072), # ('jh', 0.5661597847938538), # ('pablo', 0.5631559491157532), # ('malta', 0.5620371103286743)]
def load_w2v_model(file_name: str) -> KeyedVectors:
    print("loading w2v_model...")
    return KeyedVectors.load_word2vec_format(file_name, binary=True, encoding='utf-8')