def load(self, embedding_fname, embedding_url=None, *args, **kwargs):
    """Initialize the embedding model from a file.

    Args:
        embedding_fname: path to the pretrained fastText model file.
        embedding_url: URL to download the model from if the file is missing.

    Returns:
        Nothing
    """
    if not embedding_fname:
        raise RuntimeError('No pretrained fasttext intent_model provided')
    fasttext_model_file = embedding_fname

    if not Path(fasttext_model_file).is_file():
        emb_path = embedding_url
        if not emb_path:
            raise RuntimeError('No pretrained fasttext intent_model provided')
        embedding_fname = Path(fasttext_model_file).name
        try:
            download_path = './'
            download_untar(embedding_url, download_path)
        except Exception as e:
            raise RuntimeError(
                'Looks like the `EMBEDDINGS_URL` variable is set incorrectly', e)
    self.model = FastText.load_fasttext_format(fasttext_model_file)
    return
def load_fasttext_format(cls, *args, **kwargs):
    """Load a :class:`~gensim.models.fasttext.FastText` model from a format
    compatible with the original fasttext implementation.

    Parameters
    ----------
    fname : str
        Path to the file.

    """
    return Ft_Wrapper.load_fasttext_format(*args, **kwargs)
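A minimal usage sketch for the wrapper above (the 'wiki.en' path/prefix is a placeholder, not from the original code): because fastText composes vectors from character n-grams, even misspelled or out-of-vocabulary words usually get a vector.

from gensim.models.wrappers.fasttext import FastText

model = FastText.load_fasttext_format('wiki.en')  # placeholder path/prefix
print(model['night'].shape)    # in-vocabulary word
print(model['nightts'].shape)  # misspelled/OOV word, built from char n-grams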
def make_w2v(vocab):
    d = {}
    if opt.lang == 'en_w2v':
        model = KeyedVectors.load_word2vec_format(
            '../../../GoogleNews-vectors-negative300.bin', binary=True)
    if opt.lang == 'en_fast':
        model = KeyedVectors.load_word2vec_format(
            '../../../wiki-news-300d-1M.vec')
    if opt.lang == 'es':
        model = FastText.load_fasttext_format('../../../cc.es.300.bin')
    if opt.lang == 'fr':
        model = FastText.load_fasttext_format('../../../cc.fr.300.bin')
    # start at index 4: the lower indices are assumed to be special tokens
    for i in range(4, vocab.size()):
        word = vocab.idxToLabel[i]
        if word in model:
            d[i] = model[word]
    return d
def make_embedding_matrix(train_captions):
    tokenizer.fit_on_texts(train_captions)
    model = FastText.load_fasttext_format(cfg.fasttext)
    # --------- build the embedding matrix ---------
    vocab_size = len(tokenizer.word_index)
    embedding_matrix = np.random.random((vocab_size, 256))
    for word, i in tokenizer.word_index.items():  # indices start from 1
        try:
            embedding_vector = model[word]
        except KeyError:
            # word occurred fewer times than min_count
            # print(word, 'not found')
            continue
        if embedding_vector is not None:
            embedding_matrix[i - 1] = embedding_vector
    return embedding_matrix
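A minimal sketch of how the returned matrix could initialize a Keras Embedding layer; the layer arguments and `trainable=False` are assumptions, not from the original code. Note that make_embedding_matrix stores word i at row i - 1, so token ids need the same shift before lookup.

import tensorflow as tf

embedding_matrix = make_embedding_matrix(train_captions)
vocab_size, dim = embedding_matrix.shape

embedding_layer = tf.keras.layers.Embedding(
    input_dim=vocab_size,
    output_dim=dim,
    weights=[embedding_matrix],  # initialize with the fastText-based matrix
    trainable=False,             # keep the pretrained vectors fixed (assumption)
)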
def embedding_weights_load(words_map, embedding_weights_path):
    pre_trained_embedding = None
    try:
        model = FastText.load_fasttext_format(embedding_weights_path)
        pre_trained_embedding = "bin"
    except Exception:
        print("fastText binary file (.bin) is not found!")
        if os.path.exists("./Word_embedding/wiki.en.vec"):
            print("Using wikipedia(en) pre-trained word vectors.")
        else:
            print("Downloading wikipedia(en) pre-trained word vectors.")
            chakin.download(number=2, save_dir="./Word_embedding")

        print("Loading vectors...")
        if os.path.exists("./Word_embedding_model.pkl"):
            with open("./Word_embedding_model.pkl", mode="rb") as f:
                model = pickle.load(f)
        else:
            model = KeyedVectors.load_word2vec_format(
                './Word_embedding/wiki.en.vec')
            with open("Word_embedding_model.pkl", mode="wb") as f:
                pickle.dump(model, f)
        pre_trained_embedding = "txt"

    vocab_size = len(words_map)
    word_dimension = model['a'].shape[0]
    w = np.zeros((vocab_size, word_dimension), dtype=np.float32)

    for k, v in words_map.items():
        word = k
        word_number = v
        try:
            w[word_number][:] = model[word]
        except KeyError:
            # out-of-vocabulary word: fall back to a seeded or random vector
            if pre_trained_embedding == "bin":
                w[word_number][:] = model.seeded_vector(word)
            else:
                np.random.seed(word_number)
                w[word_number][:] = np.random.uniform(-0.25, 0.25,
                                                      word_dimension)
    return w
def embedding_weights_load(words_map, embeddingWeights_path):
    pre_trained_embedding = None
    try:
        # if a fastText binary (.bin) exists, load it
        model = FastText.load_fasttext_format(embeddingWeights_path)
        pre_trained_embedding = "bin"
    except Exception:
        # otherwise fall back to the Wikipedia word vectors
        print("fastText binary file (.bin) is not found!")
        if os.path.exists("./Word_embedding/wiki.en.vec"):
            print("Using wikipedia(en) pre-trained word vectors.")
        else:
            print("Downloading wikipedia(en) pre-trained word vectors.")
            chakin.download(number=2, save_dir="./Word_embedding")

        print("Loading vectors...")
        model = KeyedVectors.load_word2vec_format(
            './Word_embedding/wiki.en.vec')
        pre_trained_embedding = "txt"

    vocab_size = len(words_map)
    word_dimension = model['a'].shape[0]  # embedding dimensionality
    W = np.zeros((vocab_size, word_dimension), dtype=np.float32)  # matrix holding the embeddings

    for k, v in words_map.items():  # k is the word, v is the word ID
        word = k
        word_number = v
        # if a word is missing from the model, its embedding becomes a random vector
        try:
            W[word_number][:] = model[word]
        except KeyError:
            if pre_trained_embedding == "bin":
                W[word_number][:] = model.seeded_vector(word)
            else:
                np.random.seed(word_number)
                W[word_number][:] = np.random.uniform(-0.25, 0.25,
                                                      word_dimension)
    return W
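A short usage sketch for the loader above; the vocabulary and model path here are made up for illustration. The returned matrix has one row per word ID.

words_map = {"the": 0, "cat": 1, "sat": 2}               # hypothetical vocabulary
W = embedding_weights_load(words_map, "./wiki.en.bin")   # hypothetical .bin path
print(W.shape)  # (3, word_dimension)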
def load_fasttext_format(cls, *args, **kwargs):
    return Ft_Wrapper.load_fasttext_format(*args, **kwargs)
import gensim
from gensim.models.wrappers.fasttext import FastText

from file_path_manager import FilePathManager

if __name__ == '__main__':
    model = FastText.load_fasttext_format(FilePathManager.resolve("data/wiki.en"))
    model.save(FilePathManager.resolve("data/fasttext.model"))
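After the conversion above, the model would presumably be reloaded with gensim's native loader rather than load_fasttext_format; a sketch, assuming the save succeeded and the same paths:

from gensim.models.wrappers.fasttext import FastText
from file_path_manager import FilePathManager

model = FastText.load(FilePathManager.resolve("data/fasttext.model"))
print(model.wv.most_similar("king", topn=3))  # illustrative query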
start = time.time()
clf = lgb.LGBMClassifier(objective="multiclass")
clf.fit(plain_fasttext, train["class"])
Y_true, Y_pred = test["class"], clf.predict(plain_fasttext_test)
print("Report")
print(classification_report(Y_true, Y_pred, digits=6))
print("Accuracy: ", clf.score(plain_fasttext_test, test["class"]))
print("Time taken:", time.time() - start, "\n")

# In[ ]:

## SCDV based fasttext
from gensim.models.wrappers.fasttext import FastText

fasttext_model_200 = FastText.load_fasttext_format(
    '../japanese-dataset/livedoor-news-corpus/for-fasttext/fasttext_model_200dim'
)

# In[ ]:

# Get word vectors for all words in vocabulary.
word_vectors = fasttext_model_200.wv.syn0

# Set number of clusters.
num_clusters = 60

# Uncomment below line for creating new clusters.
idx, idx_proba = cluster_GMM(num_clusters, word_vectors)

# Uncomment below lines for loading saved cluster assignments and probability of cluster assignments.
# idx_name = "gmm_latestclusmodel_len2alldata.pkl"
# idx_proba_name = "gmm_prob_latestclusmodel_len2alldata.pkl"
def get_fasttext():
    global _fasttext
    if _fasttext is None:
        log.debug("Loading fasttext model..")
        _fasttext = FastText.load_fasttext_format(FASTTEXT_PATH)
    return _fasttext
def load(self, fname):
    self.fasttext = FastText.load_fasttext_format(fname)
# In[1]:

from gensim.models import KeyedVectors
from gensim.models.wrappers.fasttext import FastText

# In[2]:

## load models
word2vec = KeyedVectors.load(
    "../japanese-dataset/livedoor-news-corpus/model/vector-response-test/word2vec200.model"
)
word2vec_weighted = KeyedVectors.load(
    "../japanese-dataset/livedoor-news-corpus/model/vector-response-test/word2vec_weighted.model"
)
fasttext = FastText.load_fasttext_format(
    "../japanese-dataset/livedoor-news-corpus/model/vector-response-test/fasttext_model_200dim"
)
fasttext_weighted = KeyedVectors.load(
    "../japanese-dataset/livedoor-news-corpus/model/vector-response-test/fasttext_weighted.model"
)
poincare_vec = KeyedVectors.load(
    "../japanese-dataset/livedoor-news-corpus/model/vector-response-test/poincare_vec.model"
)
poincare_vec_weighted = KeyedVectors.load(
    "../japanese-dataset/livedoor-news-corpus/model/vector-response-test/poincare_vec_weighted.model"
)

# In[3]:

len(word2vec.most_similar("独身"))
            ] and not c[1] in ["非自立", "代名詞"]:
                words.append(cols[0])
    return words


questions_src = []
questions = []
answers = []
for line in open(args.input, "r", encoding="utf-8", errors="ignore"):
    cols = line.strip().split('\t')
    # print(cols[0])
    questions_src.append(cols[0])
    questions.append(wakati(cols[0]))
    answers.append(cols[1])

model = FastText.load_fasttext_format(args.model)


def part_minus(v):
    # split positive and negative components into separate halves of one vector
    tmp_v = np.zeros(DIM * 2)
    for i in range(DIM):
        if v[i] >= 0:
            tmp_v[i] = v[i]
        else:
            tmp_v[DIM + i] = -v[i]  # negated negative part goes into the second half
    return tmp_v


questions_vec = []
tf_vecs = []
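A tiny worked example of part_minus (DIM = 3 is chosen only for illustration; the script defines DIM elsewhere): positive components stay in the first half, negated negative components fill the second half.

import numpy as np

DIM = 3  # illustration only
print(part_minus(np.array([0.5, -0.2, 0.1])))
# -> [0.5 0.  0.1 0.  0.2 0. ]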