def _load_all_vectors_from_disk(self, mapfile_path: Path):
    """Reads all vectors from disk.

    Parameters
    ----------
    mapfile_path : Path
        Path where to read the vectors from

    """
    path = str(mapfile_path.absolute())

    self.wv.vectors = np_memmap(
        f"{path}_wv.vectors", dtype=REAL, mode="r",
        shape=self.wv_mapfile_shapes["vectors"],
    )
    if self.is_ft:
        # FastText models carry two extra matrices: vocab and ngram vectors
        self.wv.vectors_vocab = np_memmap(
            f"{path}_vocab.vectors", dtype=REAL, mode="r",
            shape=self.wv_mapfile_shapes["vectors_vocab"],
        )
        self.wv.vectors_ngrams = np_memmap(
            f"{path}_ngrams.vectors", dtype=REAL, mode="r",
            shape=self.wv_mapfile_shapes["vectors_ngrams"],
        )
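These helpers lean on a handful of module-level names. A minimal sketch of the setup they assume, following the gensim convention that np_memmap aliases numpy.memmap and REAL is the single-precision dtype; the exact import list is an assumption, not taken from this file:

import logging
from pathlib import Path

# Assumed aliases, gensim-style: all vector matrices are float32, and
# memmaps are created through numpy.memmap.
from numpy import float32 as REAL
from numpy import memmap as np_memmap

logger = logging.getLogger(__name__)
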
def _move_ndarray_to_disk(self, vector: ndarray, mapfile_path: str, name: str = "") -> ndarray:
    """Moves a numpy ndarray to disk via memmap.

    Parameters
    ----------
    vector : ndarray
        The vector to write to disk
    mapfile_path : str
        Path where to write the vector to
    name : str
        Suffix which is appended to the path to distinguish multiple files

    Returns
    -------
    ndarray
        Read-only ndarray to be used in further computations

    """
    shape = vector.shape
    path = Path(f"{mapfile_path}_{name}.vectors")

    if not path.exists():
        logger.info(f"writing {name} to {path}")
        memvecs = np_memmap(path, dtype=REAL, mode="w+", shape=shape)
        memvecs[:] = vector[:]
        del memvecs, vector  # deleting the memmap flushes it to disk
    else:
        # If multiple instances of this class exist, all can access the same files
        logger.info(f"loading pre-existing {name} from {path}")

    readonly_memvecs = np_memmap(path, dtype=REAL, mode="r", shape=shape)
    return readonly_memvecs
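The write-then-reopen pattern above is worth seeing in isolation. A minimal standalone sketch using numpy directly (the path is illustrative): the writable memmap is deleted to flush its buffer, and the array is then reopened read-only so later code cannot mutate it.

import numpy as np

vec = np.random.rand(4, 3).astype(np.float32)
path = "/tmp/demo_wv.vectors"  # illustrative path

# First run: persist the array through a writable memmap, then delete
# the memmap object so the buffer is flushed to disk.
mm = np.memmap(path, dtype=np.float32, mode="w+", shape=vec.shape)
mm[:] = vec[:]
del mm

# Reopen read-only; the data now lives on disk rather than in RAM.
ro = np.memmap(path, dtype=np.float32, mode="r", shape=vec.shape)
assert np.allclose(ro, vec)
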
def reset_weights(self, model):
    length = max(len(self.doctags), self.count)
    if self.mapfile_path:
        self.doctag_syn0 = np_memmap(
            self.mapfile_path + '.doctag_syn0', dtype=REAL, mode='w+',
            shape=(length, model.vector_size))
        self.doctag_syn0_lockf = np_memmap(
            self.mapfile_path + '.doctag_syn0_lockf', dtype=REAL, mode='w+',
            shape=(length,))
        self.doctag_syn0_lockf.fill(1.0)
    else:
        self.doctag_syn0 = empty((length, model.vector_size), dtype=REAL)
        self.doctag_syn0_lockf = ones((length,), dtype=REAL)  # zeros suppress learning

    for i in range(length):
        # construct deterministic seed from index AND model seed
        seed = "%d %s" % (model.seed, self.index_to_doctag(i))
        self.doctag_syn0[i] = model.seeded_vector(seed)
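The seed string makes initialization reproducible: the same (model seed, doctag) pair always yields the same starting vector. A hedged sketch of what a seeded_vector of this shape computes; gensim's word2vec defines one along these lines, and the signature here is an approximation:

import numpy as np

def seeded_vector(seed_string, vector_size, hashfxn=hash):
    # One small "random" vector, fully determined by its seed string.
    # Note: Python's built-in hash is randomized per process unless
    # PYTHONHASHSEED is set, so a stable hashfxn is needed for true
    # cross-run determinism.
    once = np.random.RandomState(hashfxn(seed_string) & 0xFFFFFFFF)
    return (once.rand(vector_size) - 0.5) / vector_size
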
def init_sims(self, replace=False):
    """Precompute L2-normalized vectors.

    If `replace` is set, forget the original vectors and only keep the
    normalized ones, which saves lots of memory!

    Note that you **cannot continue training** after doing a replace. The model
    becomes effectively read-only: you can call `most_similar`, `similarity`
    etc., but not `train`.

    """
    if getattr(self, 'doctag_syn0norm', None) is None or replace:
        logger.info("precomputing L2-norms of doc weight vectors")
        if replace:
            # normalize in place, row by row, destroying the raw vectors
            for i in range(self.doctag_syn0.shape[0]):
                self.doctag_syn0[i, :] /= sqrt((self.doctag_syn0[i, :] ** 2).sum(-1))
            self.doctag_syn0norm = self.doctag_syn0
        else:
            if self.mapfile_path:
                self.doctag_syn0norm = np_memmap(
                    self.mapfile_path + '.doctag_syn0norm', dtype=REAL, mode='w+',
                    shape=self.doctag_syn0.shape)
            else:
                self.doctag_syn0norm = empty(self.doctag_syn0.shape, dtype=REAL)
            # divide each row by its L2 norm, writing into doctag_syn0norm
            np_divide(
                self.doctag_syn0,
                sqrt((self.doctag_syn0 ** 2).sum(-1))[..., newaxis],
                self.doctag_syn0norm)
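The else branch performs the same row-wise normalization as the replace loop, just vectorized: every row is divided by its L2 norm, with the result written into the preallocated output array. A worked numpy sketch, independent of the class:

import numpy as np

m = np.array([[3.0, 4.0], [1.0, 0.0]], dtype=np.float32)
norms = np.sqrt((m ** 2).sum(-1))[..., np.newaxis]  # shape (2, 1)
out = np.empty_like(m)
np.divide(m, norms, out)  # writes the normalized rows into `out`
# out[0] is [0.6, 0.8]; every row now has unit length.
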
def _load_all_vectors_from_disk(self, mapfile_path: Path):
    """Reads all vectors from disk."""
    path = str(mapfile_path.absolute())
    self.vectors = np_memmap(
        f"{path}.vectors", dtype=REAL, mode="r+", shape=self.mapfile_shape)

def update_vectors(self, sv: SentenceVectors, total_sentences: int):
    """Given existing sentence vectors, append new ones."""
    logger.info(f"appending sentence vectors for {total_sentences} sentences")
    sentences_before = len(sv.vectors)
    sentences_after = len(sv.vectors) + total_sentences

    if sv.mapfile_path:
        sv.vectors = np_memmap(
            str(sv.mapfile_path) + ".vectors", dtype=REAL, mode="r+",
            shape=(sentences_after, sv.vector_size),
        )
        for i in range(sentences_before, sentences_after):
            sv.vectors[i] = full(shape=sv.vector_size, fill_value=EPS, dtype=REAL)
    else:
        newvectors = empty((total_sentences, sv.vector_size), dtype=REAL)
        for i in range(total_sentences):
            newvectors[i] = full(shape=sv.vector_size, fill_value=EPS, dtype=REAL)
        sv.vectors = vstack([sv.vectors, newvectors])

    sv.vectors_norm = None
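The memmap branch works because numpy extends the backing file when an existing mapfile is reopened in "r+" mode with a shape larger than the file: the old rows survive and the new rows become addressable. A minimal sketch of that behavior (the path is illustrative):

import numpy as np

path = "/tmp/demo_grow.vectors"  # illustrative path
old = np.memmap(path, dtype=np.float32, mode="w+", shape=(2, 3))
old[:] = 1.0
del old  # flush the two existing rows to disk

# Reopening with a larger shape grows the file; existing data is kept.
grown = np.memmap(path, dtype=np.float32, mode="r+", shape=(5, 3))
assert np.allclose(grown[:2], 1.0)  # old rows preserved
grown[2:] = 0.0                     # new rows writable in place
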
def reset_doc_weights(self, docvecs):
    length = max(len(docvecs.doctags), docvecs.count)
    if docvecs.mapfile_path:
        docvecs.vectors_docs = np_memmap(
            docvecs.mapfile_path + '.vectors_docs', dtype=REAL, mode='w+',
            shape=(length, docvecs.vector_size)
        )
        self.vectors_docs_lockf = np_memmap(
            docvecs.mapfile_path + '.vectors_docs_lockf', dtype=REAL, mode='w+',
            shape=(length,)
        )
        self.vectors_docs_lockf.fill(1.0)
    else:
        docvecs.vectors_docs = empty((length, docvecs.vector_size), dtype=REAL)
        self.vectors_docs_lockf = ones((length,), dtype=REAL)  # zeros suppress learning

    for i in range(length):
        # construct deterministic seed from index AND model seed
        seed = "%d %s" % (
            self.seed,
            Doc2VecKeyedVectors._index_to_doctag(i, docvecs.offset2doctag, docvecs.max_rawint))
        docvecs.vectors_docs[i] = self.seeded_vector(seed, docvecs.vector_size)

def reset_vectors(self, sv: SentenceVectors, total_sentences: int):
    """Initialize all sentence vectors to zero and overwrite existing files."""
    logger.info(f"initializing sentence vectors for {total_sentences} sentences")
    if sv.mapfile_path:
        sv.vectors = np_memmap(
            str(sv.mapfile_path) + '.vectors', dtype=REAL, mode='w+',
            shape=(total_sentences, sv.vector_size))
    else:
        sv.vectors = empty((total_sentences, sv.vector_size), dtype=REAL)

    for i in range(total_sentences):
        sv.vectors[i] = zeros(sv.vector_size, dtype=REAL)
    sv.vectors_norm = None

def init_sims(self, replace: bool = False):
    """Precompute L2-normalized vectors.

    Parameters
    ----------
    replace : bool, optional
        If True, forget the original vectors and only keep the normalized
        ones, which saves lots of memory!

    """
    if getattr(self, 'vectors_norm', None) is None or replace:
        logger.info("precomputing L2-norms of sentence vectors")
        if not replace and self.mapfile_path is not None:
            self.vectors_norm = np_memmap(
                self.mapfile_path + '.vectors_norm', dtype=REAL, mode='w+',
                shape=self.vectors.shape)
        self.vectors_norm = _l2_norm(self.vectors, replace=replace)
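_l2_norm is assumed here rather than defined; gensim's matutils carries a helper of this shape. A hedged sketch: with replace=True the matrix is normalized in place, otherwise a normalized copy is returned.

from numpy import newaxis, sqrt

def _l2_norm(m, replace=False):
    # Per-row L2 norms, kept as a column so broadcasting divides row-wise.
    dist = sqrt((m ** 2).sum(-1))[..., newaxis]
    if replace:
        m /= dist  # normalize in place, destroying the raw vectors
        return m
    return (m / dist).astype(m.dtype)  # normalized copy; original untouched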