def test_explicit_subword_indices():
    """Check subword index lookup on an indexer built from a fixed n-gram list."""
    known_ngrams = [
        "<Test>",
        "<Test",
        "<Tes",
        "<Te",
        "Test>",
        "Test",
        "Tes",
        "est>",
    ]
    indexer = ExplicitIndexer(known_ngrams)

    # With brackets and n-grams requested, "Test" is expected to yield every
    # listed n-gram paired with its position in the list, in list order.
    expected_pairs = [(ngram, pos) for pos, ngram in enumerate(known_ngrams)]
    found_pairs = indexer.subword_indices("Test",
                                          bracket=True,
                                          with_ngrams=True)
    assert found_pairs == expected_pairs

    # Inputs without any known n-gram produce no indices.
    assert indexer.subword_indices("") == []
    assert indexer.subword_indices("oov") == []

    # An n-gram that was never registered is not contained and cannot be
    # looked up by key.
    assert "st>" not in indexer
    with pytest.raises(KeyError):
        _ = indexer["st>"]
def test_explicit_vocab_roundtrip(tmp_path):
    """Write an ExplicitVocab to a file and check the reloaded vocab is equal."""
    target = tmp_path / "write_explicit_vocab.fifu"

    ngrams = [str(n) for n in range(10)]
    words = [str(n) for n in range(10, 100)]
    written = ExplicitVocab(words, indexer=ExplicitIndexer(ngrams))

    written.write(target)
    restored = load_vocab(target)

    assert written == restored
def read_chunk(file: BinaryIO) -> 'ExplicitVocab':
    """
    Read an ExplicitVocab from a binary file.

    The header holds four values: the number of words, the number of
    n-grams, and the minimum and maximum n-gram length. The words follow,
    then the n-grams together with their indices.

    :param file: binary file positioned at the start of the chunk's header.
    :return: the vocabulary built from the words and the explicit indexer.
    """
    # NOTE(review): "<QQII" looks like a struct format string (little-endian,
    # two 64-bit and two 32-bit unsigned ints) -- confirm against
    # _read_required_binary.
    header = _read_required_binary(file, "<QQII")
    n_words, n_ngrams, min_n, max_n = header

    words = _read_items(file, n_words)
    ngrams, ngram_to_index = _read_items_with_indices(file, n_ngrams)

    return ExplicitVocab(
        words, ExplicitIndexer(ngrams, min_n, max_n, ngram_to_index))
def test_explicit_with_ngram_index():
    """Build an ExplicitIndexer from n-grams plus a caller-supplied index."""
    digits = [str(n) for n in range(10)]
    supplied_index = {ngram: pos for pos, ngram in enumerate(digits)}

    indexer = ExplicitIndexer(digits, ngram_index=supplied_index)

    # The indexer exposes exactly what it was given.
    assert indexer.ngrams == digits
    assert indexer.ngram_index == supplied_index

    # Key lookup, positional access and calling agree on the first n-gram.
    assert indexer["0"] == 0
    assert indexer.ngrams[0] == "0"
    assert indexer("0") == 0
def test_explicit():
    """Exercise the basic ExplicitIndexer API: attributes, repr, lookup, `in`."""
    digits = [str(n) for n in range(10)]
    indexer = ExplicitIndexer(digits)

    # Without a supplied index, each n-gram maps to its list position.
    assert indexer.ngrams == digits
    assert indexer.ngram_index == {
        ngram: pos
        for pos, ngram in enumerate(digits)
    }
    assert repr(indexer) == ("ExplicitIndexer(min_n=3, max_n=6, "
                             "n_ngrams=10, n_indices=10)")

    # Lookup by key, by position and by call.
    assert indexer["0"] == 0
    assert indexer.ngrams[0] == "0"
    assert indexer("0") == 0
    assert indexer("") is None

    # Containment accepts another indexer, a list of n-grams or one n-gram.
    first_five = [str(n) for n in range(5)]
    assert ExplicitIndexer(first_five) in indexer
    assert first_five in indexer
    assert "0" in indexer
    assert "01" not in indexer
    assert 0 not in indexer
def test_explicit_constructor():
    """Construct an ExplicitVocab and check lookup, size, equality and repr."""
    indexer = ExplicitIndexer([str(n) for n in range(10)])
    vocab = ExplicitVocab([str(n) for n in range(10, 100)], indexer=indexer)

    # The 90 words "10".."99" are indexed 0..89 in insertion order.
    word_indices = [vocab[str(n)] for n in range(10, 100)]
    assert word_indices == list(range(90))

    # A non-explicit indexer is rejected by the constructor.
    with pytest.raises(AssertionError):
        _ = ExplicitVocab(vocab.words, FinalfusionHashIndexer(21))

    assert len(vocab) == 90
    # The upper bound is the word count plus the 10 n-gram indices.
    assert vocab.upper_bound == len(vocab) + 10

    # Equality and containment, including against other vocab types.
    assert vocab == vocab
    assert vocab in vocab
    assert vocab != SimpleVocab(vocab.words)
    assert vocab != FastTextVocab(vocab.words, FastTextIndexer(20))

    # The last two pieces are plain (non-f) strings, so the doubled braces
    # are literal characters in the expected text.
    assert repr(vocab) == ("ExplicitVocab(\n"
                           f"\tindexer={vocab.subword_indexer!r}\n"
                           "\twords=[...]\n"
                           "\tword_index={{...}})")
def _bucket_to_explicit(vocab: Union[FinalfusionBucketVocab, FastTextVocab]
                        ) -> 'ExplicitVocab':
    """
    Convert a bucket vocabulary into an ExplicitVocab.

    Every n-gram produced by ``vocab.subwords`` for the vocabulary's words is
    collected once, in first-seen order. Each index returned by
    ``vocab.subword_indexer`` is renumbered to a dense index assigned in order
    of first appearance, so n-grams that share an original index also share
    the new one.

    :param vocab: vocabulary whose words, subwords and subword indexer are
        read.
    :return: an ExplicitVocab over the same words, using an ExplicitIndexer
        built from the collected n-grams and ``vocab``'s ``min_n``/``max_n``.
    """
    seen_ngrams = []
    ngram_to_new_idx = dict()
    old_to_new_idx = dict()  # type: Dict[int, int]

    for word in vocab.words:
        for ngram in vocab.subwords(word):
            # An n-gram that was already mapped keeps its existing entry.
            if ngram in ngram_to_new_idx:
                continue
            seen_ngrams.append(ngram)
            old_idx = vocab.subword_indexer(ngram)
            # The first time an original index is seen it gets the next free
            # dense index; later n-grams with that index reuse it.
            new_idx = old_to_new_idx.setdefault(old_idx, len(old_to_new_idx))
            ngram_to_new_idx[ngram] = new_idx

    explicit_indexer = ExplicitIndexer(seen_ngrams, vocab.min_n, vocab.max_n,
                                       ngram_to_new_idx)
    return ExplicitVocab(vocab.words, explicit_indexer)
def test_explicit_assertions():
    """Check that inconsistent ExplicitIndexer arguments raise AssertionError."""
    # Each entry is (ngrams, keyword arguments) for one rejected construction.
    rejected = [
        # the same n-gram listed twice
        (["a"] * 2, {}),
        # index key that is not among the n-grams
        (["a"], {"ngram_index": {"b": 0}}),
        # index value 1 for the only n-gram
        (["a"], {"ngram_index": {"a": 1}}),
        # index with an extra key beyond the n-grams
        (["a"], {"ngram_index": {"a": 0, "b": 1}}),
        # index values 0 and 2 for two n-grams
        (["a", "b"], {"ngram_index": {"a": 0, "b": 2}}),
        # same arguments as the third case; the original test repeats it
        (["a"], {"ngram_index": {"a": 1}}),
    ]
    for ngrams, kwargs in rejected:
        with pytest.raises(AssertionError):
            ExplicitIndexer(ngrams, **kwargs)