Example #1
    def _load_all_vectors_from_disk(self, mapfile_path: Path):
        """ Reads all vectors from disk 

        Parameters
        ----------
        mapfile_path : Path
            Path where to read the vectors from

        """
        path = str(mapfile_path.absolute())

        self.wv.vectors = np_memmap(
            f"{path}_wv.vectors",
            dtype=REAL,
            mode="r",
            shape=self.wv_mapfile_shapes["vectors"],
        )
        if self.is_ft:
            self.wv.vectors_vocab = np_memmap(
                f"{path}_vocab.vectors",
                dtype=REAL,
                mode="r",
                shape=self.wv_mapfile_shapes["vectors_vocab"],
            )
            self.wv.vectors_ngrams = np_memmap(
                f"{path}_ngrams.vectors",
                dtype=REAL,
                mode="r",
                shape=self.wv_mapfile_shapes["vectors_ngrams"],
            )
Example #2
    def _move_ndarray_to_disk(self,
                              vector: ndarray,
                              mapfile_path: str,
                              name: str = "") -> ndarray:
        """ Moves a numpy ndarray to disk via memmap

        Parameters
        ----------
        vector : ndarray
            The vector to write to disk
        mapfile_path : str
            Path where to write the vector to
        name : str
            Suffix which is appended to the path to distinguish multiple files

        Returns
        -------
        ndarray
            readonly ndarray to be used in further computations

        """
        shape = vector.shape
        path = Path(f"{mapfile_path}_{name}.vectors")

        if not path.exists():
            logger.info(f"writing {name} to {path}")
            memvecs = np_memmap(path, dtype=REAL, mode="w+", shape=shape)
            memvecs[:] = vector[:]
            del memvecs, vector
        else:
            # If multiple instances of this class exist, all can access the same files
            logger.info(f"loading pre-existing {name} from {path}")

        readonly_memvecs = np_memmap(path, dtype=REAL, mode="r", shape=shape)
        return readonly_memvecs
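The write-then-reopen pattern above makes the returned array effectively read-only for downstream code: the writable "w+" view is discarded after the copy, and the file is reopened with mode="r", so later attempts to mutate it fail. A rough stand-alone illustration of that behaviour (the file name is made up):

    import numpy as np

    arr = np.arange(12, dtype=np.float32).reshape(3, 4)

    # phase 1: write once, flush, then drop the writable handle
    writable = np.memmap("demo_name.vectors", dtype=np.float32, mode="w+", shape=arr.shape)
    writable[:] = arr
    writable.flush()
    del writable, arr

    # phase 2: reopen read-only; reads work, writes raise
    readonly = np.memmap("demo_name.vectors", dtype=np.float32, mode="r", shape=(3, 4))
    print(readonly.sum())           # 66.0
    try:
        readonly[0, 0] = 1.0
    except ValueError as err:
        print(err)                  # assignment destination is read-only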
Example #3
    def reset_weights(self, model):
        length = max(len(self.doctags), self.count)
        if self.mapfile_path:
            self.doctag_syn0 = np_memmap(self.mapfile_path + '.doctag_syn0', dtype=REAL, mode='w+', shape=(length, model.vector_size))
            self.doctag_syn0_lockf = np_memmap(self.mapfile_path + '.doctag_syn0_lockf', dtype=REAL, mode='w+', shape=(length,))
            self.doctag_syn0_lockf.fill(1.0)
        else:
            self.doctag_syn0 = empty((length, model.vector_size), dtype=REAL)
            self.doctag_syn0_lockf = ones((length,), dtype=REAL)  # zeros suppress learning

        for i in range(length):
            # construct deterministic seed from index AND model seed
            seed = "%d %s" % (model.seed, self.index_to_doctag(i))
            self.doctag_syn0[i] = model.seeded_vector(seed)
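The seed string in the loop above mixes the model seed with the doctag, so every row gets a reproducible random initialisation regardless of when or where reset_weights runs. A rough sketch of that idea using a hash-seeded per-row RNG (the seeded_row helper below is illustrative, not gensim's seeded_vector):

    import hashlib
    import numpy as np

    def seeded_row(seed_string: str, size: int) -> np.ndarray:
        # derive a 32-bit seed from the string, then draw a small random vector
        digest = hashlib.sha256(seed_string.encode("utf-8")).hexdigest()
        rng = np.random.default_rng(int(digest, 16) % (2 ** 32))
        return (rng.random(size, dtype=np.float32) - 0.5) / size

    model_seed, vector_size = 1, 4
    row_a = seeded_row(f"{model_seed} doc_0", vector_size)
    row_b = seeded_row(f"{model_seed} doc_0", vector_size)
    assert np.array_equal(row_a, row_b)     # same seed string -> same vector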
Example #4
    def init_sims(self, replace=False):
        """
        Precompute L2-normalized vectors.

        If `replace` is set, forget the original vectors and only keep the normalized
        ones = saves lots of memory!

        Note that you **cannot continue training** after doing a replace. The model becomes
        effectively read-only = you can call `most_similar`, `similarity` etc., but not `train`.

        """
        if getattr(self, 'doctag_syn0norm', None) is None or replace:
            logger.info("precomputing L2-norms of doc weight vectors")
            if replace:
                for i in range(self.doctag_syn0.shape[0]):
                    self.doctag_syn0[i, :] /= sqrt((self.doctag_syn0[i, :] ** 2).sum(-1))
                self.doctag_syn0norm = self.doctag_syn0
            else:
                if self.mapfile_path:
                    self.doctag_syn0norm = np_memmap(
                        self.mapfile_path+'.doctag_syn0norm', dtype=REAL,
                        mode='w+', shape=self.doctag_syn0.shape)
                else:
                    self.doctag_syn0norm = empty(self.doctag_syn0.shape, dtype=REAL)
                np_divide(self.doctag_syn0, sqrt((self.doctag_syn0 ** 2).sum(-1))[..., newaxis], self.doctag_syn0norm)
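Both branches compute the same thing: each row divided by its L2 norm. The non-replace branch writes the result into a separately allocated target (a memmap when mapfile_path is set), using [..., newaxis] to broadcast the per-row norms over the columns. A minimal plain-NumPy equivalent of that np_divide call:

    import numpy as np

    vecs = np.random.rand(5, 8).astype(np.float32)

    # row-wise L2 norms, reshaped to (5, 1) so they broadcast over the columns
    norms = np.sqrt((vecs ** 2).sum(-1))[..., np.newaxis]

    out = np.empty_like(vecs)       # could equally be a memmap opened with mode="w+"
    np.divide(vecs, norms, out)     # same call shape as np_divide(...) above

    assert np.allclose(np.linalg.norm(out, axis=1), 1.0, atol=1e-5)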
Example #5
    def _load_all_vectors_from_disk(self, mapfile_path: Path):
        """ Reads all vectors from disk """
        path = str(mapfile_path.absolute())
        self.vectors = np_memmap(f"{path}.vectors",
                                 dtype=REAL,
                                 mode="r+",
                                 shape=self.mapfile_shape)
Example #6
    def update_vectors(self, sv: SentenceVectors, total_sentences: int):
        """Given existing sentence vectors, append new ones"""
        logger.info(
            f"appending sentence vectors for {total_sentences} sentences")
        sentences_before = len(sv.vectors)
        sentences_after = len(sv.vectors) + total_sentences

        if sv.mapfile_path:
            sv.vectors = np_memmap(
                str(sv.mapfile_path) + ".vectors",
                dtype=REAL,
                mode="r+",
                shape=(sentences_after, sv.vector_size),
            )
            for i in range(sentences_before, sentences_after):
                sv.vectors[i] = full(shape=sv.vector_size,
                                     fill_value=EPS,
                                     dtype=REAL)
        else:
            newvectors = empty((total_sentences, sv.vector_size), dtype=REAL)
            for i in range(total_sentences):
                newvectors[i] = full(shape=sv.vector_size,
                                     fill_value=EPS,
                                     dtype=REAL)
            sv.vectors = vstack([sv.vectors, newvectors])
        sv.vectors_norm = None
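The append branch works because NumPy extends the underlying file when an existing mapfile is reopened with a writable mode and a larger shape; the old rows keep their data and only the new tail has to be initialised. A small sketch of that growth step (file name and sizes are made up, EPS stands in for the constant used above):

    import numpy as np

    EPS = 1e-8                       # stand-in for the EPS constant used above

    # existing mapfile holding 3 vectors of size 4
    old = np.memmap("demo.vectors", dtype=np.float32, mode="w+", shape=(3, 4))
    old[:] = 1.0
    old.flush()
    del old

    # reopen with a larger shape; the file is extended, old rows are preserved
    grown = np.memmap("demo.vectors", dtype=np.float32, mode="r+", shape=(5, 4))
    for i in range(3, 5):            # initialise only the appended rows
        grown[i] = np.full(4, EPS, dtype=np.float32)

    assert grown[:3].sum() == 12.0   # original data untouched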
Example #7
    def reset_doc_weights(self, docvecs):
        length = max(len(docvecs.doctags), docvecs.count)
        if docvecs.mapfile_path:
            docvecs.vectors_docs = np_memmap(
                docvecs.mapfile_path + '.vectors_docs', dtype=REAL, mode='w+', shape=(length, docvecs.vector_size)
            )
            self.vectors_docs_lockf = np_memmap(
                docvecs.mapfile_path + '.vectors_docs_lockf', dtype=REAL, mode='w+', shape=(length,)
            )
            self.vectors_docs_lockf.fill(1.0)
        else:
            docvecs.vectors_docs = empty((length, docvecs.vector_size), dtype=REAL)
            self.vectors_docs_lockf = ones((length,), dtype=REAL)  # zeros suppress learning

        for i in range(length):
            # construct deterministic seed from index AND model seed
            seed = "%d %s" % (
                self.seed, Doc2VecKeyedVectors._index_to_doctag(i, docvecs.offset2doctag, docvecs.max_rawint))
            docvecs.vectors_docs[i] = self.seeded_vector(seed, docvecs.vector_size)
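The *_lockf array created above acts as a per-row learning gate during training: each row's update is scaled by its lockf entry, so 1.0 leaves the vector trainable and 0.0 freezes it (hence the "zeros suppress learning" comment). A toy illustration of that gating with a simplified update rule, not gensim's actual training step:

    import numpy as np

    vectors = np.zeros((3, 4), dtype=np.float32)
    lockf = np.ones(3, dtype=np.float32)
    lockf[1] = 0.0                   # freeze row 1; zeros suppress learning

    gradient = np.ones((3, 4), dtype=np.float32)
    alpha = 0.1
    vectors += alpha * lockf[:, np.newaxis] * gradient

    print(vectors[0])                # updated by the full step
    print(vectors[1])                # still all zeros: its lockf entry is 0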
Example #8
    def reset_vectors(self, sv: SentenceVectors, total_sentences: int):
        """Initialize all sentence vectors to zero and overwrite existing files"""
        logger.info(
            f"initializing sentence vectors for {total_sentences} sentences")
        if sv.mapfile_path:
            sv.vectors = np_memmap(str(sv.mapfile_path) + '.vectors',
                                   dtype=REAL,
                                   mode='w+',
                                   shape=(total_sentences, sv.vector_size))
        else:
            sv.vectors = empty((total_sentences, sv.vector_size), dtype=REAL)

        for i in range(total_sentences):
            sv.vectors[i] = zeros(sv.vector_size, dtype=REAL)
        sv.vectors_norm = None

    def init_sims(self, replace: bool = False):
        """Precompute L2-normalized vectors.

        Parameters
        ----------
        replace : bool, optional
            If True - forget the original vectors and only keep the normalized ones = saves lots of memory!
        """
        if getattr(self, 'vectors_norm', None) is None or replace:
            logger.info("precomputing L2-norms of sentence vectors")
            if not replace and self.mapfile_path is not None:
                self.vectors_norm = np_memmap(
                    self.mapfile_path + '.vectors_norm', dtype=REAL,
                    mode='w+', shape=self.vectors.shape)

            self.vectors_norm = _l2_norm(self.vectors, replace=replace)