def build_glove(word2vec, target_files, output_path):
    # Collect the vectors of all words that occur in the target files and save
    # them as a smaller word2vec-format model.
    word2vec1 = KeyedVectors(vector_size=300)
    print(word2vec1.vectors.shape, (len(word2vec1.vocab), word2vec1.vector_size))

    buf1 = []         # buffered words
    buf2 = []         # buffered vectors
    contains = set()  # words already added

    def add_buffer(w, f):
        nonlocal buf1, buf2
        if w not in contains:
            buf1.append(w)
            buf2.append(f)
            contains.add(w)

    def clear_buffer():
        nonlocal buf1, buf2
        buf1 = []
        buf2 = []

    for f in target_files:
        for i, s in enumerate(load_json(f), 1):
            sentence = s['description']
            for w in tokenize(sentence):
                w = w.lower()
                if w in word2vec:
                    add_buffer(w, word2vec[w])
            # Flush the buffer every 10 sentences.
            if i % 10 == 0 and len(buf1) > 0:
                word2vec1.add(buf1, buf2, replace=False)
                clear_buffer()

    if len(buf1) > 0:
        word2vec1.add(buf1, buf2, replace=False)

    print(word2vec1.vectors.shape, (len(word2vec1.vocab), word2vec1.vector_size))
    KeyedVectors.save_word2vec_format(word2vec1, output_path, binary=True)
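# Hedged verification sketch (not part of the original code): reload the binary that
# build_glove wrote and inspect it. 'glove_subset.bin' and the query word 'cat' are
# placeholders; only the standard gensim 3.x KeyedVectors API is assumed.
from gensim.models import KeyedVectors

reduced = KeyedVectors.load_word2vec_format('glove_subset.bin', binary=True)
print(len(reduced.vocab), reduced.vector_size)      # vocabulary size and dimensionality
if 'cat' in reduced:
    print(reduced.most_similar('cat', topn=3))      # nearest neighbours in the reduced space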
def new_w2v():
    pkl_file = open("%s/data.para" % TPS, 'rb')
    vocab_u, vocab_i = load_vocabulary(pkl_file)
    # print(vocab_u)
    print(len(vocab_u))
    print(len(vocab_i))
    print(vocab_u['love'])

    # Union of the user and item vocabularies.
    all_words = set()
    all_words = all_words.union(set(vocab_u.keys()))
    print(len(all_words))
    all_words = all_words.union(set(vocab_i.keys()))
    print(len(all_words))
    length = len(all_words)

    w2v_model = KeyedVectors.load_word2vec_format('E:/embedding/GoogleNews-vectors-negative300.bin', binary=True)
    word_list = list(all_words)
    embeds_list = []
    miss = set()
    for w in word_list:
        if w in w2v_model:
            # in_set.add(w)
            embeds = w2v_model[w]
        else:
            # Words missing from the pretrained model get a random vector.
            miss.add(w)
            embeds = np.random.uniform(-0.25, 0.25, 300)
        embeds_list.append(embeds)
    print("miss:", len(miss) / len(all_words))

    new_w2v = KeyedVectors(300)
    new_w2v.add(word_list, embeds_list)
    new_w2v.save_word2vec_format("%s/google.w2v.bin" % TPS, binary=True)
def make_bert_sentence_file(filename, bert_sent_model, labels, vec_size=300):
    # Get all the sentence embeddings for the labels and write them to a file.
    embeddings = get_sentence_bert(bert_sent_model, labels)
    kv = KeyedVectors(vector_size=vec_size)
    vec_id_list = range(0, len(labels))
    kv.add(vec_id_list, embeddings)
    kv.save_word2vec_format(filename, binary=False)
    return
def make_word2vec_file(filename, model, labels):
    # Get mean word2vec vector for all labels and write them to a file.
    kv = KeyedVectors(vector_size=model.wv.vector_size)
    vec_id_list = range(0, len(labels))
    vectors = []
    for label in labels:
        vec = get_mean_vector(model, label)
        vectors.append(vec)
    kv.add(vec_id_list, vectors)
    kv.save_word2vec_format(filename, binary=False)
    return
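# make_word2vec_file relies on a get_mean_vector helper that is not shown here. The
# version below is only an assumed sketch of its shape (not the original implementation):
# average the word vectors of the tokens in a label, skipping out-of-vocabulary words.
import numpy as np

def get_mean_vector(model, label):
    # Tokenize the label naively on whitespace; keep only in-vocabulary words.
    words = [w for w in label.lower().split() if w in model.wv]
    if not words:
        # No known word: fall back to a zero vector of the right size.
        return np.zeros(model.wv.vector_size)
    return np.mean([model.wv[w] for w in words], axis=0)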
def reduce_word2vec_vocab(input_path, output_path, vocab):
    """
    Downsamples the vocabulary in word2vec embeddings to reduce storage overhead.

    Given the input path of the embeddings and the vocabulary needed, create a
    new word2vec model removing words not in the vocabulary. Save the resulting
    model to output_path.
    """
    input_model = KeyedVectors.load_word2vec_format(input_path, binary=True)
    output_model = KeyedVectors(100)  # note: assumes 100-dimensional input vectors
    for word in vocab:
        if word in input_model.vocab:
            output_model.add([word], [input_model[word]])
    output_model.save_word2vec_format(output_path, binary=True)
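# Possible call site for reduce_word2vec_vocab (illustrative only): the paths and the
# word set are placeholders, and the input embeddings are assumed to be 100-dimensional
# because the function above creates the output model with KeyedVectors(100).
needed_words = {'apple', 'banana', 'cherry'}
reduce_word2vec_vocab(
    input_path='embeddings_full.bin',    # large pretrained binary word2vec file
    output_path='embeddings_small.bin',  # reduced binary written here
    vocab=needed_words,
)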
def main(kv_filepath, vocab_filepath, output_filepath):
    model = KeyedVectors.load_word2vec_format(kv_filepath, binary=True)
    vocab = Vocab(vocab_filepath)
    short_kv = KeyedVectors(vector_size=len(model['hello']))
    for word in vocab.word2int.keys():
        try:
            short_kv.add(word, model[word])
        except KeyError:
            continue
    short_kv.save_word2vec_format(
        os.path.join(output_filepath, 'short-vectors.bin'))
def save(self, filename="gensim_KeyedVectors.txt"):
    '''
    Saves the model to the specified filename as a gensim KeyedVectors in
    the text format so you can load it separately.
    '''
    # Creates an empty KeyedVectors with our embedding size
    kv = KeyedVectors(vector_size=self.hidden_layer_size)
    vectors = []
    words = []
    # Get the list of words/vectors in a consistent order
    for index, word in enumerate(self.index_to_word):
        vectors.append(self.W[index].copy())
        words.append(word)
    # Fills the KV object with our data in the right order
    kv.add(words, vectors)
    kv.save_word2vec_format(filename, binary=False)
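# Sketch of reloading the file written by save() above with plain gensim; the filename
# matches the method's default argument, everything else is standard KeyedVectors API.
from gensim.models import KeyedVectors

kv = KeyedVectors.load_word2vec_format("gensim_KeyedVectors.txt", binary=False)
print(kv.most_similar(kv.index2word[0], topn=5))  # sanity-check neighbours of the first word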
def readOneUsernameTextFile(filename, saveToFile_vectors, saveToFile_model, subreddit, username, model, count):
    update = False
    c = 0
    for batch in usernameSentenceIterator(filename):
        # a batch is a list of sentences
        # a sentence is a list of words.
        update = True
        if c == 0 and count == 0:
            update = False  # don't update the first model
        c += 1
        model.build_vocab(batch, update=update)
        model.train(batch, total_examples=model.corpus_count, epochs=100)
    # save the vectors for ease of use
    KeyedVectors.save_word2vec_format(model.wv, saveToFile_vectors, binary=False)
    # save the model information
    model.save(saveToFile_model)
    return model
def compute_doc_vecs(experiment, data_dir='./data', workers=None, override=False,
                     dense_vector_size=300, sparse_vector_size=500000, gpu=None):
    """
    Examples:

    python cli.py compute_doc_vecs wikisource --override=1 --gpu 0
    python cli.py compute_doc_vecs ocb --override=1 --gpu 1

    :param data_dir: Path to data (for input and output)
    :param experiment: Experiment name (ocb or wikisource)
    :param workers: Number of workers
    :param override: Override existing output
    :param dense_vector_size: Size of dense document vectors (avg word2vec, graph embeddings, ...)
    :param sparse_vector_size: Size of sparse document vectors (TF-IDF)
    :param gpu: Use CUDA device for Transformer models
    :return:
    """
    env = get_env()
    data_dir = Path(data_dir)

    logger.info(f'Experiment: {experiment}')

    exp = Experiment(name=experiment, env=env, data_dir=data_dir)
    exp.load_data()
    exp.filter_docs()

    models_dir = exp.models_dir
    common_kwargs = exp.get_common_kwargs()

    if not workers:
        workers = env['workers']

    logger.info(f'Using {workers} workers')

    if gpu:
        logger.info(f'Using CUDA device={gpu}')
        os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu)

    # TF-IDF
    out_fp = models_dir / 'tfidf.pickle'
    if override or not os.path.exists(out_fp):
        rs = TfIdfRecSys(vector_size=sparse_vector_size, **common_kwargs)
        rs.train(exp.texts)
        rs.save_to_disk(out_fp, override=override)

    # Doc2Vec
    out_fp = models_dir / 'doc2vec.w2v.txt'
    if override or not os.path.exists(out_fp):
        rs = Doc2VecRecSys(**common_kwargs, vector_size=dense_vector_size)
        rs.train(exp.texts)
        rs.save_word2vec_format(out_fp, override=override)

    out_fp = models_dir / 'doc2vec_512.w2v.txt'
    if override or not os.path.exists(out_fp):
        rs = Doc2VecRecSys(**common_kwargs, vector_size=dense_vector_size)
        rs.train(exp.get_limited_texts(512))
        rs.save_word2vec_format(out_fp, override=override)

    out_fp = models_dir / 'doc2vec_4096.w2v.txt'
    if override or not os.path.exists(out_fp):
        rs = Doc2VecRecSys(**common_kwargs, vector_size=dense_vector_size)
        rs.train(exp.get_limited_texts(4096))
        rs.save_word2vec_format(out_fp, override=override)

    # Avg GloVe
    out_fp = models_dir / 'avg_glove.w2v.txt'
    if override or not os.path.exists(out_fp):
        rs = WeightedAvgWordVectorsRecSys(w2v_model=exp.get_w2v_model('glove'), **common_kwargs)
        rs.train(exp.texts)
        rs.save_word2vec_format(out_fp, override=override)

    # With custom GloVe embeddings
    out_fp = models_dir / 'avg_glove_custom.w2v.txt'
    if override or not os.path.exists(out_fp):
        rs = WeightedAvgWordVectorsRecSys(w2v_model=exp.get_w2v_model('glove_custom'), **common_kwargs)
        rs.train(exp.texts)
        rs.save_word2vec_format(out_fp, override=override)

    out_fp = models_dir / 'avg_fasttext.w2v.txt'
    if override or not os.path.exists(out_fp):
        rs = WeightedAvgWordVectorsRecSys(w2v_model=exp.get_w2v_model('fasttext'), **common_kwargs)
        rs.train(exp.texts)
        rs.save_word2vec_format(out_fp, override=override)

    out_fp = models_dir / 'avg_fasttext_custom.w2v.txt'
    if override or not os.path.exists(out_fp):
        rs = WeightedAvgWordVectorsRecSys(w2v_model=exp.get_w2v_model('fasttext_custom'), **common_kwargs)
        rs.train(exp.texts)
        rs.save_word2vec_format(out_fp, override=override)

    out_fp = models_dir / 'avg_fasttext_custom_512.w2v.txt'
    if override or not os.path.exists(out_fp):
        rs = WeightedAvgWordVectorsRecSys(w2v_model=exp.get_w2v_model('fasttext_custom'), **common_kwargs)
        rs.train(exp.get_limited_texts(512))
        rs.save_word2vec_format(out_fp, override=override)

    out_fp = models_dir / 'avg_fasttext_custom_4096.w2v.txt'
    if override or not os.path.exists(out_fp):
        rs = WeightedAvgWordVectorsRecSys(w2v_model=exp.get_w2v_model('fasttext_custom'), **common_kwargs)
        rs.train(exp.get_limited_texts(4096))
        rs.save_word2vec_format(out_fp, override=override)

    # Transformers
    # BERT standard pooled
    out_fp = models_dir / 'bert-base-cased.w2v.txt'
    if override or not os.path.exists(out_fp):
        rs = TransformerRecSys(model_name_or_path=env['bert_dir'] + '/bert-base-cased', **common_kwargs)
        rs.train(exp.texts)
        rs.save_word2vec_format(models_dir / 'bert-base-cased.w2v.txt', override=override)

    # All "MEAN" transformers
    for tf_name in ['bert-base-cased', 'bert-large-cased', 'roberta-base', 'roberta-large', 'legal-bert']:
        out_fp = models_dir / f'{tf_name}_mean.w2v.txt'
        if override or not os.path.exists(out_fp):
            rs = TransformerRecSys(model_name_or_path=env['bert_dir'] + '/' + tf_name,
                                   pooling_strategy='reduce_mean',
                                   **common_kwargs)
            rs.train(exp.texts)
            rs.save_word2vec_format(out_fp, override=override)

    # Longformer
    if transformers.__version__ == '2.0.0':
        from longformer.longformer import Longformer
        from transformers import RobertaTokenizer

        out_fp = models_dir / 'longformer-base-4096-mean.w2v.txt'
        if override or not os.path.exists(out_fp):
            lf_lm = Longformer.from_pretrained(env['bert_dir'] + '/longformer-base-4096')
            lf_tokenizer = RobertaTokenizer.from_pretrained(env['bert_dir'] + '/roberta-base')
            lf_tokenizer.max_len = lf_lm.config.max_position_embeddings

            rs = TransformerRecSys(language_model=lf_lm,
                                   tokenizer=lf_tokenizer,
                                   max_length=4096,
                                   pooling_strategy='reduce_mean',
                                   **common_kwargs)
            rs.train(exp.texts)
            rs.save_word2vec_format(out_fp, override=override)

        out_fp = models_dir / 'longformer-large-4096-mean.w2v.txt'
        if override or not os.path.exists(out_fp):
            lf_lm = Longformer.from_pretrained(env['bert_dir'] + '/longformer-large-4096')
            lf_tokenizer = RobertaTokenizer.from_pretrained(env['bert_dir'] + '/roberta-large')
            lf_tokenizer.max_len = lf_lm.config.max_position_embeddings

            rs = TransformerRecSys(language_model=lf_lm,
                                   tokenizer=lf_tokenizer,
                                   max_length=4096,
                                   pooling_strategy='reduce_mean',
                                   **common_kwargs)
            rs.train(exp.texts)
            rs.save_word2vec_format(out_fp, override=override)
    else:
        # Wait for https://github.com/allenai/longformer/pull/14
        logger.warning('Cannot run LongFormer with transformers!=2.0.0')

    # Sentence transformer
    if LooseVersion(transformers.__version__) >= LooseVersion('2.8.0'):
        # See https://github.com/UKPLab/sentence-transformers/blob/master/requirements.txt#L1
        st_models = [
            'bert-base-nli-mean-tokens',
            'bert-large-nli-mean-tokens',
            'roberta-base-nli-mean-tokens',
            'roberta-large-nli-mean-tokens',
            'bert-base-nli-stsb-mean-tokens',
            'bert-large-nli-stsb-mean-tokens',
            'roberta-base-nli-stsb-mean-tokens',
            'roberta-large-nli-stsb-mean-tokens',
        ]
        st_dir = env['datasets_dir'] + '/sentence_transformers/'

        for st_model_name in st_models:
            out_fp = models_dir / f's{st_model_name}.w2v.txt'
            if override or not os.path.exists(out_fp):
                rs = SentenceTransformerRecSys(model_name_or_path=st_dir + st_model_name, **common_kwargs)
                rs.train(exp.texts)
                rs.save_word2vec_format(out_fp, override=override)
                # break
    else:
        logger.warning('Cannot run sentence-transformers with transformers==%s' % transformers.__version__)

    # Citation
    # DeepWalk
    out_fp = models_dir / 'deepwalk.pickle'
    if override or not os.path.exists(out_fp):
        rs = GraphEmbeddingRecSys(include_seeds=exp.get_included_seeds(),
                                  graph_model_cls='karateclub.DeepWalk',
                                  graph_model_kwargs=dict(dimensions=dense_vector_size, workers=workers),
                                  **common_kwargs)
        rs.train(exp.cits)
        rs.save_to_disk(out_fp, override=override)

    # Diff2Vec
    """
    out_fp = models_dir / 'diff2vec.pickle'
    if override or not os.path.exists(out_fp):
        diff2vec = GraphEmbeddingRecSys(
            include_seeds=exp.get_included_seeds(),
            graph_model_cls='karateclub.Diff2Vec',
            graph_model_kwargs=dict(dimensions=dense_vector_size, workers=workers),
            **common_kwargs
        )
        diff2vec.train(exp.cits)
        diff2vec.save_to_disk(out_fp, override=override)
    """

    # Walklets
    out_fp = models_dir / 'walklets.pickle'
    if override or not os.path.exists(out_fp):
        walklets_window_size = 5  # or 3
        walklets_dim = int(dense_vector_size / walklets_window_size)  # must be int

        rs = GraphEmbeddingRecSys(include_seeds=exp.get_included_seeds(),
                                  graph_model_cls='karateclub.Walklets',
                                  graph_model_kwargs=dict(dimensions=walklets_dim,
                                                          window_size=walklets_window_size,
                                                          workers=workers),
                                  **common_kwargs)
        rs.train(exp.cits)
        rs.save_to_disk(out_fp, override=override)

    # Node2Vec
    out_fp = models_dir / 'node2vec.pickle'
    if override or not os.path.exists(out_fp):
        rs = GraphEmbeddingRecSys(include_seeds=exp.get_included_seeds(),
                                  graph_model_cls='node2vec.Node2Vec',
                                  graph_model_kwargs=dict(dimensions=dense_vector_size, workers=workers),
                                  **common_kwargs)
        rs.train(exp.cits)
        rs.save_to_disk(out_fp, override=override)

    # NodeSketch
    """
    out_fp = models_dir / 'nodesketch.pickle'
    if override or not os.path.exists(out_fp):
        nodesketch = GraphEmbeddingRecSys(
            include_seeds=exp.get_included_seeds(),
            graph_model_cls='karateclub.NodeSketch',
            graph_model_kwargs=dict(dimensions=dense_vector_size),
            **common_kwargs
        )
        nodesketch.train(exp.cits)
        nodesketch.save_to_disk(out_fp, override=override)
    """

    # BoostNE
    out_fp = models_dir / 'boostne.pickle'
    if override or not os.path.exists(out_fp):
        boostne_iters = 9  # 14
        boostne_dim = 30  # 20

        assert boostne_dim * (boostne_iters + 1) == dense_vector_size

        boostne = GraphEmbeddingRecSys(
            include_seeds=exp.get_included_seeds(),
            # vector_size=dense_vector_size,
            graph_model_cls='karateclub.BoostNE',
            graph_model_kwargs=dict(
                dimensions=boostne_dim,  # 8
                order=2,  # 2
                iterations=boostne_iters,  # 16
                alpha=0.01,
            ),
            # Take only embedding from last boosting
            # node_embedding_slice=slice(dense_vector_size * boostne_iters, dense_vector_size * (boostne_iters + 1)),
            **common_kwargs)
        boostne.train(exp.cits)
        boostne.save_to_disk(out_fp, override=override)

    # Poincare
    from gensim.models.poincare import PoincareModel

    out_fp = models_dir / 'poincare.w2v.txt'
    if override or not os.path.exists(out_fp):
        poincare_model = PoincareModel(
            exp.cits,
            size=300,
            alpha=0.1,
            negative=10,
            workers=1,
            epsilon=1e-05,
            regularization_coeff=1.0,
            burn_in=10,
            burn_in_alpha=0.01,
            init_range=(-0.001, 0.001),
        )
        poincare_model.train(epochs=50)

        # init empty model
        poincare = KeyedVectors(vector_size=poincare_model.kv.vector_size)

        # ignore items not part of gold standard
        for doc_id in list(poincare_model.kv.vocab.keys()):
            if doc_id in exp.get_included_seeds():
                poincare.add(doc_id, poincare_model.kv.get_vector(doc_id))

        poincare.save_word2vec_format(out_fp)

    logger.info('Done')
def build_avg_word_vectors(hf_dataset, w2v_path, output_path, override=False):
    """
    Run with:

    $ ./data_cli.py build_avg_word_vectors paperswithcode_aspects ./output/fasttext.w2v.txt ./output/pwc_doc_id2avg_fasttext.w2v.txt

    :param hf_dataset:
    :param w2v_path:
    :param output_path:
    :param override:
    :return:
    """
    stop_words = 'english'
    count_vector_size = 100000

    if os.path.exists(output_path):
        if override:
            logger.debug(f'Override {output_path}')
            os.remove(output_path)
        else:
            logger.info(f'Stop. Output file exists already (override disabled): {output_path}')
            return

    w2v_model = KeyedVectors.load_word2vec_format(w2v_path)
    doc_model = KeyedVectors(vector_size=w2v_model.vector_size)

    count_vec = CountVectorizer(stop_words=stop_words,
                                analyzer='word',
                                lowercase=True,
                                ngram_range=(1, 1),
                                max_features=count_vector_size)

    docs_ds = load_dataset(get_local_hf_dataset_path(hf_dataset),
                           name='docs',
                           cache_dir='./data/nlp_cache',
                           split='docs')
    logger.info(f'Documents loaded: {len(docs_ds):,}')

    # Extract plain text
    texts = []
    doc_id2idx = {}
    idx2doc_id = {}

    for idx, doc in enumerate(docs_ds):
        # Extract plain text
        texts.append(doc['title'] + ': ' + doc['abstract'])
        doc_id2idx[doc['paper_id']] = idx
        idx2doc_id[idx] = doc['paper_id']

    # Transforms the data into a bag of words
    count_train = count_vec.fit(texts)
    idx2bow = count_vec.transform(texts)

    vidx2word = {v: k for k, v in count_train.vocabulary_.items()}
    assert len(vidx2word) == len(count_train.vocabulary_)

    logger.info(f'Vocab size: {len(count_train.vocabulary_)}')

    for idx, text in enumerate(tqdm(texts, total=len(texts), desc='Converting docs to vectors')):
        bow = idx2bow[idx].A[0]
        vectors = []
        weights = []

        for _idx, count in enumerate(bow):
            if count > 0:
                word = vidx2word[_idx]
                try:
                    v = w2v_model.get_vector(word)
                    vectors.append(v)
                    weights.append(count)
                except KeyError:
                    # unknown word
                    pass

        # Check if at least one document term exists as word vector
        if vectors and weights:
            # Weight avg
            doc = np.average(np.array(vectors), axis=0, weights=np.array(weights))

            # Add to model with doc_id
            doc_model.add([str(idx2doc_id[idx])], [doc])
        else:
            logger.debug(f'Cannot add document {idx2doc_id[idx]} due to missing word vectors')

    # Save to disk
    doc_model.save_word2vec_format(output_path)
    logger.info(f'Saved to: {output_path}')
def build_specter_vectors(hf_dataset: str, specter_path: str, output_path: str, cuda_device: int = -1,
                          batch_size: int = 32, vector_size: int = 768, override=False):
    """
    Run with:

    $ ./data_cli.py build_specter_vectors paperswithcode_aspects ./specter_archive ./output/pwc_doc_id2specter.w2v.txt --cuda_device=5

    Download specter:

    $ wget https://ai2-s2-research-public.s3-us-west-2.amazonaws.com/specter/archive.tar.gz
    $ tar -xzvf archive.tar.gz

    :param vector_size:
    :param output_path: ./output
    :param override:
    :param cuda_device:
    :param batch_size:
    :param hf_dataset:
    :param specter_path: Path to specter
    :return:
    """
    from specter.predict_command import predictor_from_archive
    from allennlp.models import load_archive

    # load to register
    from specter.model import Model
    from specter.data import DataReader, DataReaderFromPickled
    from specter.predictor import SpecterPredictor

    if Model and DataReader and SpecterPredictor:
        pass

    if os.path.exists(output_path) and not override:
        logger.error(f'Output file exists already: {output_path}')
        return

    # Dataset
    docs_ds = load_dataset(get_local_hf_dataset_path(hf_dataset),
                           name='docs',
                           cache_dir='./data/nlp_cache',
                           split='docs')
    logger.info(f'Documents loaded: {len(docs_ds):,}')

    papers_to_embed = [doc for doc in docs_ds]

    # Specter settings
    archive_path = os.path.join(specter_path, 'model.tar.gz')
    metadata_path = os.path.join(specter_path, 'metadata_sample.json')
    included_text_fields = 'abstract title'
    vocab_dir = os.path.join(specter_path, 'data/vocab/')

    cuda_device = int(cuda_device)

    overrides = f"{{'model':{{'predict_mode':'true','include_venue':'false'}},'dataset_reader':{{'type':'specter_data_reader','predict_mode':'true','paper_features_path':'{metadata_path}','included_text_fields': '{included_text_fields}'}},'vocabulary':{{'directory_path':'{vocab_dir}'}}}}"

    logger.info(f'SPECTER overrides: {overrides}')

    archive = load_archive(archive_path, cuda_device=cuda_device, overrides=overrides)
    predictor = predictor_from_archive(archive, predictor_name='specter_predictor', paper_features_path=metadata_path)

    # Batches
    def chunks(lst, chunk_size):
        """Splits a longer list to respect batch size"""
        for i in range(0, len(lst), chunk_size):
            yield lst[i:i + chunk_size]

    batches_count = int(len(papers_to_embed) / batch_size)
    batch_embed_papers = []

    # 30min on GPU
    for batch in tqdm(chunks(papers_to_embed, batch_size), total=batches_count):
        batch_out = predictor.predict_batch_json(batch)
        batch_embed_papers += batch_out

    # To keyed vectors
    doc_model = KeyedVectors(vector_size=vector_size)

    for embed_paper in tqdm(batch_embed_papers):
        doc_model.add([embed_paper['paper_id']], [embed_paper['embedding']])

    # Save to disk
    doc_model.save_word2vec_format(output_path)
    logger.info('Done')
def build_transformers_vectors(hf_dataset: str, model_name_or_path: str, output_path: str, pooling: str,
                               batch_size: int = 16, override: bool = False):
    """
    $ ./data_cli.py build_transformers_vectors paperswithcode_aspects scibert-scivocab-uncased ./output/scibert-cls --pooling=cls --batch_size=16

    :param hf_dataset:
    :param model_name_or_path:
    :param output_path:
    :param pooling:
    :param override:
    :return:
    """
    env = get_env()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    pooling_strategies = ['cls', 'mean']

    if os.path.exists(output_path) and not override:
        logger.error(f'Output file exists already: {output_path}')
        sys.exit(1)

    if pooling not in pooling_strategies:
        raise ValueError(f'Invalid pooling: {pooling}')

    # Model path from env
    if not os.path.exists(model_name_or_path) and os.path.exists(os.path.join(env['bert_dir'], model_name_or_path)):
        model_name_or_path = os.path.join(env['bert_dir'], model_name_or_path)

    # Dataset
    docs_ds = load_dataset(get_local_hf_dataset_path(hf_dataset),
                           name='docs',
                           cache_dir='./data/nlp_cache',
                           split='docs')
    logger.info(f'Documents loaded: {len(docs_ds):,}')

    # Model
    model = AutoModel.from_pretrained(model_name_or_path)
    model = model.to(device)

    # Tokenize docs
    tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
    texts = [doc['title'] + ': ' + doc['abstract'] for doc in docs_ds]
    inputs = tokenizer(texts,
                       add_special_tokens=True,
                       return_tensors='pt',
                       padding=True,
                       max_length=model.config.max_position_embeddings,
                       truncation=True,
                       return_token_type_ids=False,
                       return_attention_mask=True)

    ds = TensorDataset(inputs['input_ids'], inputs['attention_mask'])
    dl = DataLoader(ds, shuffle=False, batch_size=batch_size)

    # Vectors
    doc_model = KeyedVectors(vector_size=model.config.hidden_size)

    with torch.no_grad():
        for batch_idx, batch_data in enumerate(tqdm(dl, desc='Inference')):
            batch_data = tuple(t.to(device) for t in batch_data)
            outputs = model(*batch_data, return_dict=True)

            if pooling == 'cls':
                batch_embeddings = outputs['pooler_output'].detach().cpu().numpy()
            elif pooling == 'mean':
                batch_embeddings = np.mean(outputs['last_hidden_state'].detach().cpu().numpy(), axis=1)
            else:
                raise NotImplementedError()

            batch_ids = docs_ds[batch_idx * batch_size:batch_idx * batch_size + batch_size]['paper_id']
            doc_model.add(batch_ids, batch_embeddings)

    # Save to disk
    doc_model.save_word2vec_format(output_path)
    logger.info('Done')
import json

import numpy as np
from gensim.models import KeyedVectors as KV
from gensim.models import Word2Vec
from tqdm import tqdm

with open('index_title.json') as f:
    i_t = json.loads(f.read())

f = open('nv77k', 'r')
num_of_nodes, dim = [int(x) for x in f.readline().split()]

nv = KV(vector_size=dim)
for line in tqdm(f.readlines()):
    splits = line.split()
    nv[i_t[splits[0]]] = np.array([float(x) for x in splits[1:]])

nv.save_word2vec_format('nv77k.emb')
def save(self, kvs: KeyedVectors):
    filepath = path.join(self._folder_path, kvs.name)
    kvs.save_word2vec_format(filepath, binary=True)
def convert_word2vec_from_bin_to_txt(file_path, save_path):
    model = KeyedVectors.load_word2vec_format(file_path, binary=True)
    KeyedVectors.save_word2vec_format(model, save_path)
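# Illustrative call for the converter above; the paths are placeholders for an
# existing binary model and the desired text-format output.
convert_word2vec_from_bin_to_txt(
    'GoogleNews-vectors-negative300.bin',
    'GoogleNews-vectors-negative300.txt',
)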