def test_filter(self): embs = load_from_dir("tests/data/embeddings/text/plain_with_file_header") path_vocab = "./tests/data/vocabs/plain" vocab = Vocabulary() vocab.load(path_vocab) embs.filter_by_vocab(["the", "apple"]) embs.filter_by_vocab([])
def test_filter(self):
    embs = load_from_dir(path.join('tests', 'data', 'embeddings', 'text', 'plain_with_file_header'))
    path_vocab = path.join('.', 'tests', 'data', 'vocabs', 'plain')
    vocab = Vocabulary()
    vocab.load(path_vocab)
    embs.filter_by_vocab(["the", "apple"])
    embs.filter_by_vocab([])
def load_hdf5(self, path):
    """loads embeddings from hdf5 format"""
    file_in = tables.open_file(os.path.join(path, 'vectors.h5p'), 'r')
    self.matrix = file_in.root.vectors.read()
    self.vocabulary = Vocabulary()
    self.vocabulary.load(path)
    # self.name += os.path.basename(os.path.normpath(path))
    file_in.close()
def test_save_and_load(self):
    vocab = Vocabulary()
    vocab.load(path_vocab)
    cnt_1 = vocab.cnt_words
    vocab.save_to_dir("/tmp/vecto/vocab/save1")
    vocab.load("/tmp/vecto/vocab/save1")
    assert cnt_1 == vocab.cnt_words
def load_npy(self, path):
    """loads embeddings from numpy format"""
    self.matrix = np.load(os.path.join(path, "vectors.npy"))
    # self.load_with_alpha(0.6)
    self.vocabulary = Vocabulary_simple()
    self.vocabulary.load(path)
    self.name += os.path.basename(os.path.normpath(path))
def load_from_text(self, path):
    i = 0
    # self.name += "_" + os.path.basename(os.path.normpath(path))
    self.vocabulary = Vocabulary()
    rows = []
    header = False
    vec_size = -1
    with detect_archive_format_and_open(path) as file_in:
        for line_number, line in enumerate(file_in):
            tokens = line.split()
            if i == 0 and len(tokens) == 2:
                header = True
                cnt_words = int(tokens[0])
                vec_size = int(tokens[1])
                continue
            str_vec = tokens[1:]
            if vec_size == -1:
                vec_size = len(str_vec)
            if vec_size != len(str_vec):
                warning_message = "input error in line {}, expected tokens: {}, read tokens: {}, line: {}".format(
                    line_number, vec_size, len(str_vec), line)
                warnings.warn(warning_message)
                continue
            # word = tokens[0].decode('ascii', errors="ignore")
            # word = tokens[0].decode('UTF-8', errors="ignore")
            word = tokens[0]
            # register the word only after the vector length check passes,
            # so the vocabulary stays aligned with the matrix rows
            self.vocabulary.dic_words_ids[word] = i
            self.vocabulary.lst_words.append(word)
            row = np.zeros(len(str_vec), dtype=np.float32)
            for j in range(len(str_vec)):
                row[j] = float(str_vec[j])
            rows.append(row)
            i += 1
    # if header:
    #     assert cnt_words == len(rows)
    self.matrix = np.vstack(rows)
    if header:
        assert vec_size == self.matrix.shape[1]
    self.vocabulary.lst_frequencies = np.zeros(len(self.vocabulary.lst_words), dtype=np.int32)
    self.name = os.path.basename(os.path.dirname(os.path.normpath(path)))
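# Illustration only (not part of the library): a minimal sketch of the plain-text
# layout that load_from_text() above is written to accept -- an optional
# "word_count dimensions" header line, then one "word v1 v2 ... vn" line per
# embedding. The file name and numbers below are made up for the example.
#
#   vectors.txt
#   2 3
#   the 0.12 -0.05 0.33
#   apple 0.81 0.27 -0.44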
def load_with_alpha(self, path, power=0.6):
    f = tables.open_file(os.path.join(path, 'vectors.h5p'), 'r')
    # left = np.nan_to_num(f.root.vectors.read())
    left = f.root.vectors.read()
    sigma = f.root.sigma.read()
    logger.info("loaded left singular vectors and sigma")
    sigma = np.power(sigma, power)
    self.matrix = np.dot(left, np.diag(sigma))
    logger.info("computed the product")
    self.metadata["pow_sigma"] = power
    self.metadata["size_dimensions"] = int(self.matrix.shape[1])
    f.close()
    self.vocabulary = Vocabulary_simple()
    self.vocabulary.load(path)
    self.name += os.path.basename(os.path.normpath(path)) + "_a" + str(power)
def load_from_file(self, filename):
    self.vocabulary = Vocabulary()
    f = open(filename, "rb")
    header = f.readline().split()
    cnt_rows = int(header[0])
    size_row = int(header[1])
    # self.name += "_{}".format(size_row)
    self.matrix = np.zeros((cnt_rows, size_row), dtype=np.float32)
    # logger.debug("cnt rows = {}, size row = {}".format(cnt_rows, size_row))
    for i in range(cnt_rows):
        word = ModelW2V._load_word(f).decode('UTF-8', errors="ignore").strip()
        self.vocabulary.dic_words_ids[word] = i
        self.vocabulary.lst_words.append(word)
        s_row = f.read(size_row * 4)
        # np.frombuffer replaces the deprecated np.fromstring for binary input
        row = np.frombuffer(s_row, dtype=np.float32)
        # row = row / np.linalg.norm(row)
        self.matrix[i] = row
    f.close()
def test_text_to_ids(self):
    v = Vocabulary()
    v.load(path_vocab)
    doc = load_file_as_ids(path_text_file, v)
    assert doc.shape == (TEST_TEXT_LEN, )
    assert np.allclose(doc[:10], [-1, 40, -1, -1, -1, -1, -1, -1, 57, -1])
def load_from_dir(path):
    """Automatically detects embeddings format and loads

    Args:
        path: directory where embeddings are stored

    Returns:
        Instance of appropriate Model-based class
    """
    # if os.path.isfile(os.path.join(path, "cooccurrence_csr.h5p")):
    #     logger.info("detected as sparse explicit in hdf5")
    #     result = ModelSparse()
    #     result.load_from_hdf5(path)
    #     result.load_metadata(path)
    #     return result
    # if os.path.isfile(os.path.join(path, "bigrams.data.bin")):
    #     logger.info("detected as sparse in vecto legacy format")
    #     result = ModelSparse()
    #     result.load(path)
    #     result.load_metadata(path)
    #     return result
    # if os.path.isfile(os.path.join(path, "vectors.bin")):
    #     logger.info("this is w2v original binary format")
    #     result = ModelW2V()
    #     result.load_from_dir(path)
    #     result.load_metadata(path)
    #     return result
    # if os.path.isfile(os.path.join(path, "sgns.words.npy")):
    #     result = ModelLevy()
    #     logger.info("this is Levy")
    #     result.load_from_dir(path)
    #     result.load_metadata(path)
    #     return result
    # if os.path.isfile(os.path.join(path, "vectors.npy")):
    #     result = ModelNumbered()
    #     logger.info("detected as dense")
    #     result.load_npy(path)
    #     result.load_metadata(path)
    #     return result
    if os.path.isfile(os.path.join(path, "vectors.h5p")):
        result = vecto.embeddings.dense.WordEmbeddingsDense()
        logger.info("detected as vecto format")
        result.load_hdf5(path)
        result.load_metadata(path)
        return result
    result = vecto.embeddings.dense.WordEmbeddingsDense()
    files = os.listdir(path)
    for f in files:
        if f.endswith(".gz") or f.endswith(".bz") or f.endswith(".txt") or f.endswith(".vec"):
            logger.info(path + ": detected VSM in plain text format")
            result.load_from_text(os.path.join(path, f))
            result.load_metadata(path)
            return result
        if f.endswith(".npy"):
            logger.info("detected VSM in numpy format")
            result.matrix = np.load(os.path.join(path, f))
            result.vocabulary = Vocabulary()
            result.vocabulary.load(path)
            result.load_metadata(path)
            return result
    # if any(file.endswith('bin') for file in os.listdir(path)):
    #     result = ModelW2V()
    #     logger.info("detected VSM in the w2v original binary format")
    #     result.load_from_dir(path)
    #     result.load_metadata(path)
    #     return result
    # if f.startswith("words") and f.endswith(".npy") \
    #         and os.path.isfile(os.path.join(path, f.replace(".npy", ".vocab"))):
    #     result = Model_Fun()
    #     result = ModelLevy()
    #     logger.info("detected VSM in npy and vocab in plain text file format")
    #     result.load_from_dir(path, f[:-4])
    #     result.load_metadata(path)
    #     return result
    raise RuntimeError("Cannot detect the format of this VSM")
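# Usage sketch, not part of the module: how load_from_dir() above is typically
# called. The import path and the directory (taken from the tests in this repo)
# are assumptions; adjust them to your layout.
from vecto.embeddings import load_from_dir  # assumed import path

embeddings = load_from_dir("tests/data/embeddings/text/plain_with_file_header")
print(embeddings.matrix.shape)                            # (n_words, n_dims)
print(embeddings.get_most_similar_words("apple", cnt=5))  # [[word, similarity], ...]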
def train(args):
    time_start = timer()
    if args.subword == 'none':
        current_utils = utils.word
    else:
        current_utils = utils.subword
    current_utils.args = args

    if args.gpu >= 0:
        chainer.cuda.get_device_from_id(args.gpu).use()
        cuda.check_cuda_available()

    if args.path_vocab == '':
        vocab = create_from_dir(args.path_corpus, language=args.language)
    else:
        vocab = Vocabulary()
        vocab.load(args.path_vocab)
        logger.info("loaded vocabulary")

    if args.context_representation != 'word':
        # for deps or ner context representation, we need a new context vocab for NS or HSM loss function.
        vocab_context = create_from_annotated_dir(args.path_corpus, representation=args.context_representation)
    else:
        vocab_context = vocab

    vocab_ngram_tokens = None
    if args.subword != 'none':
        if args.path_vocab_ngram_tokens == '':
            vocab_ngram_tokens = create_ngram_tokens_from_dir(args.path_corpus, args.min_gram, args.max_gram)
        else:
            vocab_ngram_tokens = Vocabulary()
            vocab_ngram_tokens.load(args.path_vocab_ngram_tokens)

    if args.path_word2chars == '':
        word2chars = None
    else:
        word2chars = get_word2chars(args.path_word2chars)

    loss_func = get_loss_func(args, vocab_context)
    model = get_model(args, loss_func, vocab, vocab_ngram_tokens, current_utils)
    if args.gpu >= 0:
        model.to_gpu()
        logger.debug("model sent to gpu")

    optimizer = chainer.optimizers.Adam()
    optimizer.setup(model)

    if os.path.isfile(args.path_corpus):
        # todo for file corpus
        pass
    else:
        if args.subword == 'none':
            train_iter = current_utils.DirWindowIterator(
                path=args.path_corpus, vocab=vocab, window_size=args.window,
                batch_size=args.batchsize, language=args.language)
        else:
            train_iter = current_utils.DirWindowIterator(
                path=args.path_corpus, vocab=vocab, vocab_ngram_tokens=vocab_ngram_tokens,
                word2chars=word2chars, window_size=args.window,
                batch_size=args.batchsize, language=args.language)
    updater = training.StandardUpdater(train_iter, optimizer, converter=current_utils.convert, device=args.gpu)
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.path_out)

    if os.path.isfile(args.path_corpus):
        # todo for file corpus
        # trainer.extend(extensions.Evaluator(val_iter, model, converter=convert, device=args.gpu))
        # trainer.extend(extensions.PrintReport(['epoch', 'main/loss', 'validation/main/loss', 'elapsed_time']))
        pass
    else:
        trainer.extend(extensions.PrintReport(['epoch', 'main/loss', 'elapsed_time']))
    trainer.extend(extensions.ProgressBar())
    trainer.extend(extensions.LogReport())
    trainer.run()

    model = create_model(args, model, vocab)
    time_end = timer()
    model.metadata["execution_time"] = time_end - time_start
    return model
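# Hedged sketch, not part of the training code: the attributes that train(args)
# above actually reads, gathered into a plain namespace. Every concrete value and
# path below is a made-up placeholder; defaults in the real CLI may differ.
from argparse import Namespace

example_args = Namespace(
    subword='none',                 # 'none' -> utils.word, anything else -> utils.subword
    gpu=-1,                         # negative value keeps training on the CPU
    path_vocab='',                  # empty string triggers create_from_dir on the corpus
    path_corpus='./corpus_dir',     # placeholder directory with training text
    language='english',             # placeholder language tag
    context_representation='word',  # non-'word' values build a separate context vocabulary
    path_vocab_ngram_tokens='',     # only used when subword != 'none'
    min_gram=3, max_gram=6,
    path_word2chars='',
    window=5, batchsize=1000,
    epoch=5, path_out='./out')
# model = train(example_args)  # would run the chainer training loop defined above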
def test_tokens_to_ids(self):
    vocab = Vocabulary()
    vocab.load(path_vocab)
    tokens = ["the", "apple"]
    ids = vocab.tokens_to_ids(tokens)
    print("ids:", ids)
def test_misc(self):
    vocab = Vocabulary()
    vocab.load(path_vocab)
    vocab.get_word_by_id(1)
    vocab.get_frequency("the")
    vocab.get_frequency("apple")
    vocab.lst_frequencies = []
    vocab.get_frequency("apple")
def test_filter(self):
    vocab = Vocabulary()
    vocab.load(path_vocab)
    vocab.filter_by_wordlist(["the"])
class WordEmbeddingsDense(WordEmbeddings):
    """Stores dense embeddings."""

    def cmp_vectors(self, vec1, vec2):
        cos = normed(vec1) @ normed(vec2)
        if math.isnan(cos):
            return 0
        return (cos + 1) / 2

    def cmp_rows(self, id1, id2):
        vec1 = self.matrix[id1]
        vec2 = self.matrix[id2]
        return self.cmp_vectors(vec1, vec2)

    def cmp_words(self, word1, word2):
        id1 = self.vocabulary.get_id(word1)
        id2 = self.vocabulary.get_id(word2)
        if (id1 < 0) or (id2 < 0):
            return 0
        return self.cmp_rows(id1, id2)

    def save_matr_to_hdf5(self, path):
        file_out = tables.open_file(os.path.join(path, 'vectors.h5p'), 'w')
        atom = tables.Atom.from_dtype(self.matrix.dtype)
        ds = file_out.create_carray(file_out.root, 'vectors', atom, self.matrix.shape)
        ds[:] = self.matrix
        ds.flush()
        file_out.close()

    def load_hdf5(self, path):
        """loads embeddings from hdf5 format"""
        file_in = tables.open_file(os.path.join(path, 'vectors.h5p'), 'r')
        self.matrix = file_in.root.vectors.read()
        self.vocabulary = Vocabulary()
        self.vocabulary.load(path)
        # self.name += os.path.basename(os.path.normpath(path))
        file_in.close()

    def load_npy(self, path):
        """loads embeddings from numpy format"""
        self.matrix = np.load(os.path.join(path, "vectors.npy"))
        # self.load_with_alpha(0.6)
        self.vocabulary = Vocabulary_simple()
        self.vocabulary.load(path)
        self.name += os.path.basename(os.path.normpath(path))

    def save_to_dir(self, path):
        os.makedirs(path, exist_ok=True)
        self.vocabulary.save_to_dir(path)
        # self.matrix.tofile(os.path.join(path, "vectors.bin"))
        # np.save(os.path.join(path, "vectors.npy"), self.matrix)
        self.save_matr_to_hdf5(path)
        save_json(self.metadata, os.path.join(path, "metadata.json"))

    def save_to_dir_plain_txt(self, path):
        os.makedirs(path, exist_ok=True)
        with open(os.path.join(path, 'vectors.txt'), 'w') as output:
            for i, w in enumerate(self.vocabulary.lst_words):
                if len(w.strip()) == 0:
                    continue
                output.write(w + ' ')
                for j in range(self.matrix[i].shape[0]):
                    output.write(str(self.matrix[i][j]))
                    output.write(' ')
                output.write("\n")

    def load_with_alpha(self, path, power=0.6):
        f = tables.open_file(os.path.join(path, 'vectors.h5p'), 'r')
        # left = np.nan_to_num(f.root.vectors.read())
        left = f.root.vectors.read()
        sigma = f.root.sigma.read()
        logger.info("loaded left singular vectors and sigma")
        sigma = np.power(sigma, power)
        self.matrix = np.dot(left, np.diag(sigma))
        logger.info("computed the product")
        self.metadata["pow_sigma"] = power
        self.metadata["size_dimensions"] = int(self.matrix.shape[1])
        f.close()
        self.vocabulary = Vocabulary_simple()
        self.vocabulary.load(path)
        self.name += os.path.basename(os.path.normpath(path)) + "_a" + str(power)

    def normalize(self):
        nrm = np.linalg.norm(self.matrix, axis=1)
        nrm[nrm == 0] = 1
        self.matrix /= nrm[:, np.newaxis]
        self._normalized_matrix = self.matrix
        self.metadata["normalized"] = True
        self.normalized = True

    def cache_normalized_copy(self):
        if hasattr(self, 'normalized') and self.normalized:
            self._normalized_matrix = self.matrix
        else:
            self._normalized_matrix = self.matrix.copy()
            self._normalized_matrix /= np.linalg.norm(self._normalized_matrix, axis=1)[:, None]

    def load_from_text(self, path):
        i = 0
        # self.name += "_" + os.path.basename(os.path.normpath(path))
        self.vocabulary = Vocabulary()
        rows = []
        header = False
        vec_size = -1
        with detect_archive_format_and_open(path) as file_in:
            for line_number, line in enumerate(file_in):
                tokens = line.split()
                if i == 0 and len(tokens) == 2:
                    header = True
                    cnt_words = int(tokens[0])
                    vec_size = int(tokens[1])
                    continue
                str_vec = tokens[1:]
                if vec_size == -1:
                    vec_size = len(str_vec)
                if vec_size != len(str_vec):
                    warning_message = "input error in line {}, expected tokens: {}, read tokens: {}, line: {}".format(
                        line_number, vec_size, len(str_vec), line)
                    warnings.warn(warning_message)
                    continue
                # word = tokens[0].decode('ascii', errors="ignore")
                # word = tokens[0].decode('UTF-8', errors="ignore")
                word = tokens[0]
                # register the word only after the vector length check passes,
                # so the vocabulary stays aligned with the matrix rows
                self.vocabulary.dic_words_ids[word] = i
                self.vocabulary.lst_words.append(word)
                row = np.zeros(len(str_vec), dtype=np.float32)
                for j in range(len(str_vec)):
                    row[j] = float(str_vec[j])
                rows.append(row)
                i += 1
        # if header:
        #     assert cnt_words == len(rows)
        self.matrix = np.vstack(rows)
        if header:
            assert vec_size == self.matrix.shape[1]
        self.vocabulary.lst_frequencies = np.zeros(len(self.vocabulary.lst_words), dtype=np.int32)
        self.name = os.path.basename(os.path.dirname(os.path.normpath(path)))

    def _populate_from_source_and_wordlist(self, source, wordlist):
        self.metadata["class"] = "embeddings"
        self.metadata["source"] = source.metadata
        self.vocabulary = source.vocabulary.filter_by_wordlist(wordlist)
        self.metadata["vocabulary"] = self.vocabulary.metadata
        lst_new_vectors = []
        for w in self.vocabulary.lst_words:
            lst_new_vectors.append(source.get_vector(w))
        self.matrix = np.array(lst_new_vectors, dtype=np.float32)

    def filter_by_vocab(self, words):
        """reduces the embeddings to the provided list of words

        Args:
            words: set or list of words to keep

        Returns:
            Instance of Dense class
        """
        if len(words) == 0:
            return self
        new_embds = WordEmbeddingsDense()
        new_embds._populate_from_source_and_wordlist(self, words)
        return new_embds

    def get_x_label(self, i):
        return i

    def viz_wordlist(self, wl, colored=False, show_legend=False):
        colors = brewer2mpl.get_map('Set2', 'qualitative', 8).mpl_colors
        cnt = 0
        for i in wl:
            row = self.get_vector(i)
            row = normed(row)
            if colored:
                plt.bar(range(0, len(row)), row, color=colors[cnt],
                        linewidth=0, alpha=0.6, label=i)
            else:
                plt.bar(range(0, len(row)), row, color="black",
                        linewidth=0, alpha=1 / len(wl), label=i)
            cnt += 1
        if show_legend:
            plt.legend()

    def get_most_similar_vectors(self, u, cnt=10):
        scores = np.zeros(self.matrix.shape[0], dtype=np.float32)
        if hasattr(self, "_normalized_matrix"):
            scores = normed(u) @ self._normalized_matrix.T
            scores = (scores + 1) / 2
        else:
            str_warn = "\n\tthis method executes slowly if embeddings are not normalized."
            str_warn += "\n\tuse the normalize() method to normalize your embeddings"
            str_warn += "\n\tif you need your embeddings to stay unnormalized, you can use the .cache_normalized_copy() method to cache a normalized copy"
            str_warn += "\n\tplease note that the latter will consume additional memory\n"
            warnings.warn(str_warn, RuntimeWarning)
            for i in range(self.matrix.shape[0]):
                scores[i] = self.cmp_vectors(u, self.matrix[i])
        ids = np.argsort(scores)[::-1]
        ids = ids[:cnt]
        return zip(ids, scores[ids])

    def get_most_similar_words(self, w, cnt=10):
        """returns list of words sorted by cosine proximity to a target word

        Args:
            w: target word
            cnt: how many similar words are needed

        Returns:
            list of words and corresponding similarities
        """
        if isinstance(w, str):
            vec = self.matrix[self.vocabulary.get_id(w)]
        else:
            vec = w
        rows = self.get_most_similar_vectors(vec, cnt)
        results = []
        for i in rows:
            results.append([self.vocabulary.get_word_by_id(i[0]), i[1]])
        return results

    def get_vector(self, w):
        i = self.vocabulary.get_id(w)
        if i < 0:
            raise RuntimeError('word does not exist', w)
        row = self.matrix[i]
        return row

    def has_word(self, w):
        i = self.vocabulary.get_id(w)
        if i < 0:
            return False
        return True
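# Usage sketch, not part of the library: exercising a few WordEmbeddingsDense
# methods defined above. The import path and directory are assumptions; calling
# normalize() first lets get_most_similar_words() take the fast vectorized path.
from vecto.embeddings import load_from_dir  # assumed import path

embs = load_from_dir("tests/data/embeddings/text/plain_with_file_header")
embs.normalize()
if embs.has_word("apple"):
    print(embs.get_most_similar_words("apple", cnt=3))
small = embs.filter_by_vocab(["the", "apple"])  # returns a reduced copy
small.save_to_dir("/tmp/vecto/filtered")        # writes vectors.h5p, vocabulary and metadata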
def test_load_from_dir(self):
    vocab = Vocabulary()
    vocab.load(path_vocab)
    print("the:", vocab.get_id("the"))
    vocab.load(path_vocab_one)
    print("the:", vocab.get_id("the"))
def test_text_to_ids(self):
    v = Vocabulary()
    v.load(path_vocab)
    doc = load_path_as_ids(path_text_file, v)
    # assert doc.shape == (TEST_TEXT_LEN,)
    assert np.allclose(doc[:10], [0, 40, 0, 0, 0, 1, 0, 0, 0, 0])