def _read_ft_subwordvocab(file: BinaryIO, buckets: int, min_n: int,
                          max_n: int, vocab_size: int,
                          lossy: bool) -> FastTextVocab:
    """
    Helper method to build a FastTextVocab from a fastText file.
    """
    words = [_read_binary_word(file, lossy) for _ in range(vocab_size)]
    indexer = FastTextIndexer(buckets, min_n, max_n)
    return FastTextVocab(words, indexer)

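
# ``_read_binary_word`` is used above but not shown in this section. The
# sketch below is an assumption, not the library's implementation: fastText
# stores each vocab word as UTF-8 bytes terminated by a 0x00 byte, and the
# ``lossy`` flag presumably switches the decoder to replacement mode instead
# of raising on invalid UTF-8.
def _read_binary_word_sketch(file: BinaryIO, lossy: bool = False) -> str:
    word = bytearray()
    while True:
        byte = file.read(1)
        if byte == b'\x00':
            # the null byte terminates the word
            break
        if byte == b'':
            raise EOFError("Unexpected end of file while reading a word")
        word.extend(byte)
    return word.decode("utf-8", errors="replace" if lossy else "strict")
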
def test_fasttext_constructor():
    v = FastTextVocab([str(i) for i in range(10)])
    assert [v[str(i)] for i in range(10)] == [i for i in range(10)]
    with pytest.raises(AssertionError):
        v = FastTextVocab(["a"] * 2)
    with pytest.raises(AssertionError):
        _ = FastTextVocab(v.words, FinalfusionHashIndexer(21))
    assert len(v) == 10
    # the default FastTextIndexer hashes ngrams into 2,000,000 buckets
    assert v.upper_bound == len(v) + 2_000_000
    assert v == v
    assert v in v
    assert v != SimpleVocab(v.words)
    assert v != FastTextVocab(v.words, FastTextIndexer(20))
    assert repr(v) == "FastTextVocab(\n" \
                      f"\tindexer={repr(v.subword_indexer)}\n" \
                      "\twords=[...]\n" \
                      "\tword_index={{...}})"

def load_finalfusion(file: Union[str, bytes, int, PathLike],
                     mmap: bool = False) -> Embeddings:
    """
    Read embeddings from a file in finalfusion format.

    Parameters
    ----------
    file : str, bytes, int, PathLike
        Path to a file with embeddings in finalfusion format.
    mmap : bool
        Toggles memory mapping the storage buffer.

    Returns
    -------
    embeddings : Embeddings
        The embeddings from the input file.
    """
    with open(file, 'rb') as inf:
        _ = Header.read_chunk(inf)
        chunk_id, _ = _read_required_chunk_header(inf)
        norms = None
        metadata = None
        # The metadata chunk is optional and precedes the vocab chunk.
        if chunk_id == ChunkIdentifier.Metadata:
            metadata = Metadata.read_chunk(inf)
            chunk_id, _ = _read_required_chunk_header(inf)
        if chunk_id == ChunkIdentifier.SimpleVocab:
            vocab = SimpleVocab.read_chunk(inf)  # type: Vocab
        elif chunk_id == ChunkIdentifier.BucketSubwordVocab:
            vocab = FinalfusionBucketVocab.read_chunk(inf)
        elif chunk_id == ChunkIdentifier.FastTextSubwordVocab:
            vocab = FastTextVocab.read_chunk(inf)
        elif chunk_id == ChunkIdentifier.ExplicitSubwordVocab:
            vocab = ExplicitVocab.read_chunk(inf)
        else:
            raise FinalfusionFormatError(
                f'Expected vocab chunk, not {str(chunk_id)}')
        chunk_id, _ = _read_required_chunk_header(inf)
        if chunk_id == ChunkIdentifier.NdArray:
            storage = NdArray.load(inf, mmap)  # type: Storage
        elif chunk_id == ChunkIdentifier.QuantizedArray:
            storage = QuantizedArray.load(inf, mmap)
        else:
            raise FinalfusionFormatError(
                f'Expected storage chunk, not {str(chunk_id)}')
        # The norms chunk is optional and trails the storage chunk.
        maybe_chunk_id = _read_chunk_header(inf)
        if maybe_chunk_id is not None:
            if maybe_chunk_id[0] == ChunkIdentifier.NdNorms:
                norms = Norms.read_chunk(inf)
            else:
                raise FinalfusionFormatError(
                    f'Expected norms chunk, not {str(maybe_chunk_id[0])}')
        return Embeddings(storage, vocab, norms, metadata, inf.name)

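
# A minimal usage sketch for load_finalfusion; the path and query word are
# hypothetical, and lookup via ``embeds["word"]`` assumes Embeddings
# implements __getitem__:
def _example_load_finalfusion(path='vectors.fifu'):
    # mmap=True memory-maps the storage instead of reading it into RAM,
    # which helps with large embedding matrices at the cost of slower
    # random access.
    embeds = load_finalfusion(path, mmap=True)
    return embeds['berlin']
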
def _precompute_word_vecs(vocab: Union[SimpleVocab, FastTextVocab],
                          matrix: np.ndarray):
    """
    Helper method to precompute word vectors.

    Averages each word's distinct representation with its corresponding
    ngram embeddings.
    """
    for i, word in enumerate(vocab):
        indices = [i]
        # Subword vocabs average the word vector with its ngram vectors;
        # a SimpleVocab only has the word vector itself.
        if isinstance(vocab, FastTextVocab):
            subword_indices = cast(
                List[int], vocab.subword_indices(word, with_ngrams=False))
            indices += subword_indices
        matrix[i] = matrix[indices].mean(0, keepdims=False)

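
# A toy demonstration (hypothetical values) of the fancy-indexing mean used
# by _precompute_word_vecs: matrix[indices] gathers the word row together
# with its ngram rows, and .mean(0) collapses them into a single vector that
# overwrites the word row.
def test_precompute_mean_sketch():
    matrix = np.array([[1., 1.], [3., 3.], [5., 5.]])
    matrix[0] = matrix[[0, 1, 2]].mean(0, keepdims=False)
    assert np.allclose(matrix[0], [3., 3.])
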
def test_explicit_constructor():
    i = ExplicitIndexer([str(i) for i in range(10)])
    v = ExplicitVocab([str(i) for i in range(10, 100)], indexer=i)
    assert [v[str(i)] for i in range(10, 100)] == [i for i in range(90)]
    with pytest.raises(AssertionError):
        _ = ExplicitVocab(v.words, FinalfusionHashIndexer(21))
    assert len(v) == 90
    # upper_bound adds the indexer's 10 explicit ngrams
    assert v.upper_bound == len(v) + 10
    assert v == v
    assert v in v
    assert v != SimpleVocab(v.words)
    assert v != FastTextVocab(v.words, FastTextIndexer(20))
    assert repr(v) == "ExplicitVocab(\n" \
                      f"\tindexer={repr(v.subword_indexer)}\n" \
                      "\twords=[...]\n" \
                      "\tword_index={{...}})"

def _read_ft_vocab(file: BinaryIO, buckets: int, min_n: int, max_n: int,
                   lossy: bool) -> Union[FastTextVocab, SimpleVocab]:
    """
    Helper method to read a vocab from a fastText file.

    Returns a FastTextVocab if min_n is nonzero, otherwise a SimpleVocab.
    """
    # discard n_words
    vocab_size, _n_words, n_labels = _read_required_binary(file, "<iii")
    if n_labels:
        raise NotImplementedError(
            "fastText prediction models are not supported")
    # discard n_tokens
    _read_required_binary(file, "<q")
    # fastText writes -1 here for unpruned vocabs
    prune_idx_size = _read_required_binary(file, "<q")[0]
    if prune_idx_size >= 0:
        raise NotImplementedError("Pruned vocabs are not supported")
    if min_n:
        return _read_ft_subwordvocab(file, buckets, min_n, max_n, vocab_size,
                                     lossy)
    return SimpleVocab(
        [_read_binary_word(file, lossy) for _ in range(vocab_size)])

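
# ``_read_required_binary`` is also not shown in this section. A plausible
# sketch, assuming it wraps struct.unpack and fails loudly on truncated
# input (the exception type is an assumption):
import struct


def _read_required_binary_sketch(file: BinaryIO, fmt: str) -> tuple:
    size = struct.calcsize(fmt)
    data = file.read(size)
    if len(data) != size:
        raise EOFError(f"Unexpected end of file, expected {size} bytes")
    return struct.unpack(fmt, data)
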
def test_fasttext_vocab_roundtrip(tmp_path):
    filename = tmp_path / "write_ft_vocab.fifu"
    v = FastTextVocab([str(i) for i in range(10)])
    v.write(filename)
    v2 = load_vocab(filename)
    assert v == v2