import os

from gensim.models import KeyedVectors
from gensim.models.wrappers import FastText


def save_fasttext(vocab):
    """Trim the full pre-trained fastText .vec file down to the given vocabulary."""
    model = FastText.load_word2vec_format('../../corpora/wiki.en.vec')
    # Build a fresh KeyedVectors holding only the words we actually need
    kmodel = KeyedVectors(300)
    loss = 0
    for word in vocab:
        try:
            vec = model[word]
        except KeyError:  # word not covered by the pre-trained vectors
            loss += 1
            continue
        kmodel.add(word, vec, replace=True)  # renamed add_vector() in gensim >= 4
    print('words missing from pre-trained vectors:', loss)
    kmodel.save('../../corpora/fasttext.wv')
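
# A minimal round-trip sketch: load the trimmed KeyedVectors back and query
# them. The probe word 'computer' is an illustrative assumption, not taken
# from the snippet above.
wv = KeyedVectors.load('../../corpora/fasttext.wv')
print(wv['computer'].shape)               # -> (300,)
print(wv.most_similar('computer', topn=5))
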
import os

from gensim.models.wrappers import FastText


def build_vocab(filenames, vocabfile):
    """Write unique words from a set of files to a new file."""
    if os.path.isfile(vocabfile):
        print('Loading existing vocabulary from', vocabfile)
        return
    vocab = set()
    for filename in filenames:
        if filename.endswith('.vec'):
            # .vec files are word2vec text format: take the model's vocabulary
            model = FastText.load_word2vec_format(filename)
            vocab |= set(model.vocab.keys())
        else:
            # plain-text corpus: lowercase and split on whitespace
            with open(filename, 'r', encoding='utf-8') as f:
                for line in f:
                    tokens = line.rstrip('\n').lower().split()
                    vocab |= set(tokens)
    with open(vocabfile, 'w', encoding='utf-8') as f:
        for token in vocab:
            f.write(token + '\n')
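
# An illustrative pipeline chaining the two helpers above; the corpus and
# vocab paths are assumptions. Collect the vocabulary, read it back, then
# trim the pre-trained vectors down to it with save_fasttext().
build_vocab(['../../data/train.txt', '../../data/dev.txt'], '../../data/vocab.txt')
with open('../../data/vocab.txt', 'r', encoding='utf-8') as f:
    vocab = [line.rstrip('\n') for line in f]
save_fasttext(vocab)
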
import os

import torch
from gensim.models import KeyedVectors
from gensim.models.wrappers import FastText


def load_word_vectors(embeddings_path):
    # Fast path: a cached tensor + vocab pair from a previous run
    if os.path.isfile(embeddings_path + '.pth') and \
            os.path.isfile(embeddings_path + '.vocab'):
        print('==> File found, loading to memory')
        vectors = torch.load(embeddings_path + '.pth')
        vocab = Vocab(filename=embeddings_path + '.vocab')
        return vocab, vectors
    # Otherwise load the embeddings with gensim, preferring a native .model
    # file over the word2vec-format .vec file
    if os.path.isfile(embeddings_path + '.model'):
        model = KeyedVectors.load(embeddings_path + '.model')
    elif os.path.isfile(embeddings_path + '.vec'):
        model = FastText.load_word2vec_format(embeddings_path + '.vec')
    else:
        raise FileNotFoundError('no embeddings found at ' + embeddings_path)
    list_of_tokens = model.vocab.keys()
    vectors = torch.zeros(len(list_of_tokens), model.vector_size)
    with open(embeddings_path + '.vocab', 'w', encoding='utf-8') as f:
        for token in list_of_tokens:
            f.write(token + '\n')
    # Vocab comes from the surrounding project; it maps tokens to row indices
    vocab = Vocab(filename=embeddings_path + '.vocab')
    for index, word in enumerate(list_of_tokens):
        vectors[index, :] = torch.from_numpy(model[word])
    return vocab, vectors
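
# A small consumption sketch, assuming the vectors feed a PyTorch model; the
# base path is illustrative. Rows of `vectors` follow the token order written
# to the .vocab file, so the tensor can seed an embedding layer directly.
import torch.nn as nn

vocab, vectors = load_word_vectors('../../corpora/fasttext')
embedding_layer = nn.Embedding.from_pretrained(vectors, freeze=True)
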
import nltk
import numpy as np
from gensim.models.wrappers import FastText
from nltk.corpus import stopwords
from scipy.spatial.distance import (braycurtis, canberra, cityblock, cosine,
                                    euclidean, jaccard, minkowski)
from scipy.stats import kurtosis, skew

# stop-word list used by sent2vec() below
stop_words = set(stopwords.words('english'))

# df_feat is an existing feature DataFrame; question1_vectors and
# question2_vectors are per-question embedding matrices computed earlier.
# Pairwise distance features between the two question vectors
# (nan_to_num guards against questions that produced all-NaN vectors).
df_feat['glove_cosine_distance'] = [cosine(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors), np.nan_to_num(question2_vectors))]
df_feat['glove_cityblock_distance'] = [cityblock(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors), np.nan_to_num(question2_vectors))]
df_feat['glove_jaccard_distance'] = [jaccard(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors), np.nan_to_num(question2_vectors))]
df_feat['glove_canberra_distance'] = [canberra(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors), np.nan_to_num(question2_vectors))]
df_feat['glove_euclidean_distance'] = [euclidean(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors), np.nan_to_num(question2_vectors))]
df_feat['glove_minkowski_distance'] = [minkowski(x, y, 3) for (x, y) in zip(np.nan_to_num(question1_vectors), np.nan_to_num(question2_vectors))]
df_feat['glove_braycurtis_distance'] = [braycurtis(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors), np.nan_to_num(question2_vectors))]

# Per-vector distribution features
df_feat['glove_skew_q1vec'] = [skew(x) for x in np.nan_to_num(question1_vectors)]
df_feat['glove_skew_q2vec'] = [skew(x) for x in np.nan_to_num(question2_vectors)]
df_feat['glove_kur_q1vec'] = [kurtosis(x) for x in np.nan_to_num(question1_vectors)]
df_feat['glove_kur_q2vec'] = [kurtosis(x) for x in np.nan_to_num(question2_vectors)]

# fasttext w2v distance
model = FastText.load_word2vec_format('../../corpora/wiki.en.vec')


def sent2vec(s):
    """L2-normalised sum of fastText vectors for the content words in s."""
    words = str(s).lower()
    words = nltk.word_tokenize(words)
    words = [w for w in words if w not in stop_words]
    words = [w for w in words if w.isalpha()]
    M = []
    for w in words:
        try:
            M.append(model[w])
        except KeyError:  # skip out-of-vocabulary words
            continue
    M = np.array(M)
    v = M.sum(axis=0)
    return v / np.sqrt((v ** 2).sum())
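
# For context: a hedged sketch of how question1_vectors / question2_vectors
# are typically built with sent2vec() before computing distance features like
# those above. The DataFrame name `df` and its 'question1'/'question2'
# columns are assumptions, not taken from this snippet.
question1_vectors = np.array([sent2vec(q) for q in df['question1']])
question2_vectors = np.array([sent2vec(q) for q in df['question2']])
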
from gensim.models.wrappers import FastText

# load_word2vec_format is a classmethod that returns a new model, so call it
# on the class rather than on a throwaway instance (in recent gensim the same
# loader lives on KeyedVectors.load_word2vec_format)
model = FastText.load_word2vec_format('/home/ltp/WorkShop/fastText/model/wiki.zh.vec')
import os

from gensim.models import KeyedVectors
from gensim.models.wrappers import FastText


def load_embeddings(embeddings_path):
    # Prefer a gensim-native .model file; fall back to word2vec-format .vec
    if os.path.isfile(embeddings_path + '.model'):
        model = KeyedVectors.load(embeddings_path + '.model')
    elif os.path.isfile(embeddings_path + '.vec'):
        model = FastText.load_word2vec_format(embeddings_path + '.vec')
    else:
        raise FileNotFoundError('no embeddings found at ' + embeddings_path)
    return model
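
# Illustrative call, assuming embeddings were saved under this base path
# (so '../../corpora/wiki.en.vec' exists); the probe word is also an
# assumption.
model = load_embeddings('../../corpora/wiki.en')
print(model.most_similar('king', topn=3))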