Example #1
def _write_words_binary(b_words: Iterable[bytes], file: BinaryIO):
    """
    Helper method to write an iterable of bytes and their lengths.
    """
    for word in b_words:
        _write_binary(file, "<I", len(word))
        file.write(word)
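For reference, a minimal reader counterpart to the layout above can be sketched as follows. This is an illustration, not part of ffp; it only assumes what the writer emits: a little-endian u32 byte length followed by the UTF-8 bytes of each word.

import struct
from typing import BinaryIO, List


def _read_words_binary(file: BinaryIO, n_words: int) -> List[str]:
    # Hypothetical inverse of _write_words_binary: read n_words entries,
    # each a "<I" length prefix followed by that many UTF-8 bytes.
    words = []
    for _ in range(n_words):
        (length, ) = struct.unpack("<I", file.read(struct.calcsize("<I")))
        words.append(file.read(length).decode("utf-8"))
    return words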
Example #2
def write_chunk(self, file: BinaryIO):
    _write_binary(file, "<I", int(self.chunk_identifier()))
    padding = _pad_float32(file.tell())
    chunk_len = struct.calcsize("QI") + padding + struct.calcsize(
        f"<{self.size}f")
    _write_binary(file, f"<QQI{padding}x", chunk_len, self.size,
                  int(TypeId.f32))
    _serialize_array_as_le(file, self)
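_pad_float32 itself is not shown in these examples. A plausible sketch, assuming it returns the number of padding bytes needed so that the float32 data that follows starts at a 4-byte-aligned offset:

import struct


def _pad_float32(pos: int) -> int:
    # Assumed behaviour: padding bytes required to align the next write
    # to the size of a float32 (4 bytes).
    float_size = struct.calcsize("<f")
    return (float_size - pos % float_size) % float_size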
Example #3
def write_chunk(self, file: BinaryIO):
    _write_binary(file, "<I", int(self.chunk_identifier()))
    b_word_len_sum = sum(len(bytes(word, "utf-8")) for word in self.words)
    n_words_size = struct.calcsize("<Q")
    word_lens_size = len(self.words) * struct.calcsize("<I")
    chunk_length = n_words_size + word_lens_size + b_word_len_sum
    _write_binary(file, "<QQ", chunk_length, len(self.words))
    _write_words_binary((bytes(word, "utf-8") for word in self.words),
                        file)
Example #4
File: fasttext.py Project: finalfusion/ffp
def write_fasttext(file: Union[str, bytes, int, PathLike], embeds: Embeddings):
    """
    Write embeddings in fastText format.

    fastText requires Metadata with all expected keys for fastText configs:
        * dims: int (inferred from model)
        * window_size: int (default -1)
        * min_count: int (default -1)
        * ns: int (default -1)
        * word_ngrams: int (default 1)
        * loss: one of ``['HierarchicalSoftmax', 'NegativeSampling', 'Softmax']`` (default Softmax)
        * model: one of ``['CBOW', 'SkipGram', 'Supervised']`` (default SkipGram)
        * buckets: int (inferred from model)
        * min_n: int (inferred from model)
        * max_n: int (inferred from model)
        * lr_update_rate: int (default -1)
        * sampling_threshold: float (default -1)

    ``dims``, ``buckets``, ``min_n`` and ``max_n`` are inferred from the model. Unspecified
    numerical fields default to ``-1``. ``loss`` defaults to ``Softmax`` and ``model`` to
    ``SkipGram``. Unknown values for ``loss`` and ``model`` are overwritten with these defaults,
    since a model with other values would not be readable by fastText.

    Some information from the original fastText models is lost, e.g.:
        * word frequencies
        * n_tokens

    Embeddings are un-normalized before serialization: if norms are present, each embedding is
    scaled by its associated norm. This restores the original state of the embedding matrix;
    the precomputation and l2-normalization of word embeddings is undone.

    Only embeddings with a FastTextVocab or SimpleVocab can be serialized to this format.

    Parameters
    ----------
    file : str, bytes, int, PathLike
        Output file
    embeds : Embeddings
        Embeddings to write
    """
    with open(file, 'wb') as outf:
        if not isinstance(embeds.vocab, (FastTextVocab, SimpleVocab)):
            raise ValueError(
                f'Expected FastTextVocab or SimpleVocab, not: {type(embeds.vocab).__name__}'
            )
        _write_binary(outf, "<ii", _FT_MAGIC, 12)
        _write_ft_cfg(outf, embeds)
        _write_ft_vocab(outf, embeds.vocab)
        _write_binary(outf, "<?QQ", 0, *embeds.storage.shape)
        if isinstance(embeds.vocab, SimpleVocab):
            _write_ft_storage_simple(outf, embeds)
        else:
            _write_ft_storage_subwords(outf, embeds)
        _serialize_array_as_le(outf, embeds.storage)
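A minimal usage sketch, assuming embeds is an Embeddings value backed by a FastTextVocab or SimpleVocab; the loader call is only an assumption for illustration, not an API shown in these examples.

# Hypothetical usage; only write_fasttext is taken from the example above.
embeds = load_embeddings("embeddings.fifu")  # assumed loader returning Embeddings
write_fasttext("embeddings.bin", embeds)     # serialize in fastText binary format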
Example #5
def write_fasttext(file: Union[str, bytes, int, PathLike], embeds: Embeddings):
    """
    Write embeddings in fastText format.

    Only embeddings with a fastText vocabulary can be written to fastText format.

    fastText models require values for all config keys; some of these can be inferred from
    finalfusion models, while the others are assigned default values:

        * dims: inferred from model
        * window_size: 0
        * min_count: 0
        * ns: 0
        * word_ngrams: 1
        * loss: HierarchicalSoftmax
        * model: CBOW
        * buckets: inferred from model
        * min_n: inferred from model
        * max_n: inferred from model
        * lr_update_rate: 0
        * sampling_threshold: 0

    Some information from the original fastText models is lost, e.g.:
        * word frequencies
        * n_tokens

    Embeddings are un-normalized before serialization: if norms are present, each embedding is
    scaled by its associated norm. This restores the original state of the embedding matrix;
    the precomputation and l2-normalization of word embeddings is undone.

    Parameters
    ----------
    file : str, bytes, int, PathLike
        Output file
    embeds : Embeddings
        Embeddings to write
    """
    with open(file, 'wb') as outf:
        vocab = embeds.vocab
        if not isinstance(vocab, FastTextVocab):
            raise ValueError(
                f'Expected FastTextVocab, not: {type(embeds.vocab).__name__}')
        _write_binary(outf, "<ii", _FT_MAGIC, 12)
        _write_ft_cfg(outf, embeds.dims, vocab.subword_indexer.n_buckets,
                      vocab.min_n, vocab.max_n)
        _write_ft_vocab(outf, embeds.vocab)
        _write_binary(outf, "<?QQ", 0, *embeds.storage.shape)
        if isinstance(embeds.vocab, SimpleVocab):
            _write_ft_storage_simple(outf, embeds)
        else:
            _write_ft_storage_subwords(outf, embeds)
        _serialize_array_as_le(outf, embeds.storage)
Example #6
File: fasttext.py Project: finalfusion/ffp
def _write_ft_cfg(file: BinaryIO, embeds: Embeddings):
    """
    Helper method to write fastText config.

    * dims: taken from embeds
    * window_size: -1 if unspecified
    * min_count:  -1 if unspecified
    * ns:  -1 if unspecified
    * word_ngrams:  1
    * loss: one of `['HierarchicalSoftmax', 'NegativeSampling', 'Softmax']`, defaults to 'Softmax'
    * model: one of `['CBOW', 'SkipGram', 'Supervised']`, defaults to SkipGram
    * buckets: taken from embeds, 0 if SimpleVocab
    * min_n: taken from embeds, 0 if SimpleVocab
    * max_n: taken from embeds, 0 if SimpleVocab
    * lr_update_rate: -1 if unspecified
    * sampling_threshold: -1 if unspecified

    loss and model values are overwritten by the default if they are not listed above.
    """
    # declare some dummy values that we can't get from embeds
    meta = {
        'window_size': -1,
        'epoch': -1,
        'min_count': -1,
        'ns': -1,
        'word_ngrams': 1,
        'loss': 'Softmax',
        # fastText uses an integral enum with values 1, 2, 3, so we can't map
        # unknown models to a placeholder such as 0.
        'model': 'SkipGram',
        'lr_update_rate': -1,
        'sampling_threshold': -1
    }  # type: Dict[str, Any]
    if embeds.metadata is not None:
        meta.update(embeds.metadata)
    meta['dims'] = embeds.storage.shape[1]
    if isinstance(embeds.vocab, FastTextVocab):
        meta['min_n'] = embeds.vocab.min_n
        meta['max_n'] = embeds.vocab.max_n
        meta['buckets'] = embeds.vocab.subword_indexer.n_buckets
    else:
        meta['min_n'] = 0
        meta['max_n'] = 0
        meta['buckets'] = 0
    cfg = [meta[k] for k in _FT_REQUIRED_CFG_KEYS]
    # see explanation above why we need to select some known value
    losses = {'HierarchicalSoftmax': 1, 'NegativeSampling': 2, 'Softmax': 3}
    cfg[6] = losses.get(cfg[6], 3)
    models = {'CBOW': 1, 'SkipGram': 2, 'Supervised': 3}
    cfg[7] = models.get(cfg[7], 2)
    _write_binary(file, "<12id", *cfg)
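Since the config is written with a single "<12id" pack (twelve little-endian ints followed by one double for sampling_threshold), reading it back can be sketched as below; the field order is assumed to follow _FT_REQUIRED_CFG_KEYS.

import struct
from typing import BinaryIO


def _read_ft_cfg(file: BinaryIO):
    # Sketch: unpack the "<12id" block written by _write_ft_cfg above.
    fmt = "<12id"
    values = struct.unpack(fmt, file.read(struct.calcsize(fmt)))
    *int_fields, sampling_threshold = values
    return int_fields, sampling_threshold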
Example #7
def _write_bucket_vocab(file: BinaryIO, vocab: Union[FastTextVocab,
                                                     FinalfusionBucketVocab]):
    min_n_max_n_size = struct.calcsize("<II")
    buckets_size = struct.calcsize("<I")
    chunk_length = _calculate_binary_list_size(vocab.words)
    chunk_length += min_n_max_n_size
    chunk_length += buckets_size

    chunk_id = vocab.chunk_identifier()
    if chunk_id == ChunkIdentifier.FastTextSubwordVocab:
        buckets = vocab.subword_indexer.upper_bound
    else:
        buckets = cast(FinalfusionHashIndexer,
                       vocab.subword_indexer).buckets_exp

    chunk_header = (int(chunk_id), chunk_length, len(vocab.words), vocab.min_n,
                    vocab.max_n, buckets)
    _write_binary(file, "<IQQIII", *chunk_header)
    _write_words_binary((bytes(word, "utf-8") for word in vocab.words), file)
Example #8
File: fasttext.py Project: finalfusion/ffp
def _write_ft_vocab(outf: BinaryIO, vocab: Vocab):
    """
    Helper method to write a vocab to fastText.
    """
    # assumes that vocab_size == word_size if n_labels == 0
    _write_binary(outf, "<iii", len(vocab), len(vocab), 0)
    # n_tokens is discarded and serialized as 0; no pruned vocab exists, also serialized as 0
    _write_binary(outf, "<qq", 0, 0)
    for word in vocab:
        outf.write(word.encode("utf-8"))
        outf.write(b'\x00')
        # we don't store frequency, also set to 0
        _write_binary(outf, "<q", 0)
        # all entries are words, i.e. entry type 0
        _write_binary(outf, "b", 0)
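Each entry written above is a NUL-terminated UTF-8 string followed by an i64 frequency and a one-byte entry type. A hypothetical reader mirroring that layout (illustration only, not the ffp API):

import struct
from typing import BinaryIO, List


def _read_ft_vocab(file: BinaryIO) -> List[str]:
    # Header: vocab size, word size, number of labels; then n_tokens and
    # the pruned-vocab marker, both written as 0 above.
    vocab_size, _word_size, _n_labels = struct.unpack("<iii", file.read(12))
    _n_tokens, _pruned = struct.unpack("<qq", file.read(16))
    words = []
    for _ in range(vocab_size):
        word = bytearray()
        byte = file.read(1)
        while byte != b"\x00":
            word.extend(byte)
            byte = file.read(1)
        file.read(struct.calcsize("<qb"))  # skip frequency and entry type
        words.append(word.decode("utf-8"))
    return words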
Example #9
def _write_ft_cfg(file: BinaryIO, dims: int, n_buckets: int, min_n: int,
                  max_n: int):
    """
    Helper method to write fastText config.

    The following values are used:

    * dims: passed as arg
    * window_size: 0
    * min_count:  0
    * ns:  0
    * word_ngrams:  1
    * loss: HierarchicalSoftmax
    * model: CBOW
    * buckets: passed as arg
    * min_n: passed as arg
    * max_n: passed as arg
    * lr_update_rate: 0
    * sampling_threshold: 0
    """
    # declare some dummy values that we can't get from embeds
    cfg = [
        dims,  # dims
        0,  # window_size
        0,  # epoch
        0,  # mincount
        0,  # ns
        1,  # word_ngrams
        1,  # loss, defaults to hierarchical_softmax
        1,  # model, defaults to CBOW
        n_buckets,  # buckets
        min_n,  # min_n
        max_n,  # max_n
        0,  # lr_update_rate
        0,  # sampling_threshold
    ]
    _write_binary(file, "<12id", *cfg)
Example #10
def write_chunk(self, file: BinaryIO):
    _write_binary(file, "<I", int(self.chunk_identifier()))
    padding = _pad_float32(file.tell())
    chunk_len = struct.calcsize("<IIIIIQII") + padding
    proj = self._quantizer.projection is not None
    if proj:
        chunk_len += struct.calcsize(
            f"<{pow(self._quantizer.reconstructed_len, 2)}f")
    chunk_len += struct.calcsize(f"<{self._quantizer.subquantizers.size}f")
    norms = self._norms is not None
    if self._norms is not None:
        chunk_len += struct.calcsize(f"<{self._norms.size}f")
    chunk_len += self._quantized_embeddings.size
    chunk_header = (chunk_len, proj, norms, self.quantized_len,
                    self.shape[1], self.quantizer.n_centroids,
                    self.shape[0], int(TypeId.u8), int(TypeId.f32))
    _write_binary(file, "<QIIIIIQII", *chunk_header)
    file.write(struct.pack(f"{padding}x"))
    if proj:
        _serialize_array_as_le(file, self.quantizer.projection)
    _serialize_array_as_le(file, self.quantizer.subquantizers)
    if norms:
        _serialize_array_as_le(file, self._norms)
    self._quantized_embeddings.tofile(file)
Example #11
def write_chunk(self, file: BinaryIO):
    _write_binary(file, "<I", int(self.chunk_identifier()))
    padding = _pad_float32(file.tell())
    chunk_len = struct.calcsize("<QII") + padding + struct.calcsize(
        f'<{self.size}f')
    # pylint: disable=unpacking-non-sequence
    rows, cols = self.shape
    _write_binary(file, "<QQII", chunk_len, rows, cols, int(TypeId.f32))
    _write_binary(file, f"{padding}x")
    _serialize_array_as_le(file, self)
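A sketch of reading that chunk body back, after the u32 chunk identifier: the header is chunk length, rows, cols and the type id, then padding up to a float32 boundary, then rows * cols little-endian float32 values. The function name and numpy usage here are illustrative only.

import struct

import numpy as np


def read_ndarray_chunk_body(file):
    # Sketch mirroring write_chunk above; assumes the chunk identifier has
    # already been consumed.
    _chunk_len, rows, cols, _type_id = struct.unpack(
        "<QQII", file.read(struct.calcsize("<QQII")))
    file.read((4 - file.tell() % 4) % 4)  # skip float32 alignment padding
    data = np.frombuffer(file.read(4 * rows * cols), dtype="<f4")
    return data.reshape(rows, cols)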
Example #12
def write_chunk(self, file) -> None:
    chunk_length = _calculate_binary_list_size(self.words)
    chunk_length += _calculate_binary_list_size(
        self.subword_indexer.ngrams)
    min_n_max_n_size = struct.calcsize("<II")
    chunk_length += min_n_max_n_size
    chunk_header = (int(self.chunk_identifier()), chunk_length,
                    len(self.words), len(self.subword_indexer.ngrams),
                    self.min_n, self.max_n)
    _write_binary(file, "<IQQQII", *chunk_header)
    _write_words_binary((bytes(word, "utf-8") for word in self.words),
                        file)
    for ngram in self.subword_indexer.ngrams:
        b_ngram = ngram.encode("utf-8")
        _write_binary(file, "<I", len(b_ngram))
        file.write(b_ngram)
        _write_binary(file, "<Q", self.subword_indexer.ngram_index[ngram])
Example #13
def write_chunk(self, file: BinaryIO):
    b_data = bytes(toml.dumps(self), "utf-8")
    _write_binary(file, "<IQ", int(self.chunk_identifier()), len(b_data))
    file.write(b_data)
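The metadata chunk above is a u32 chunk identifier, a u64 byte length, and the TOML-encoded metadata as UTF-8 bytes. A read-back sketch (illustration only, not the ffp API):

import struct

import toml


def read_metadata_chunk(file):
    # Sketch: read the header written above, then decode the TOML payload.
    _chunk_id, data_len = struct.unpack("<IQ",
                                        file.read(struct.calcsize("<IQ")))
    return toml.loads(file.read(data_len).decode("utf-8"))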