def test_vectors_deduplicate():
    """Exercise ``Vocab.deduplicate_vectors`` on a table with repeated rows.

    Five keys are mapped to five rows, of which only three are distinct
    (``a1``/``a2`` and ``c1``/``c2`` carry identical data). The test checks
    the key-to-row mapping before and after deduplication, and that a second
    deduplication call leaves the serialized vocab unchanged.
    """
    keys = ["a1", "b1", "c1", "a2", "c2"]
    rows = [[1, 1], [2, 2], [3, 4], [1, 1], [3, 4]]
    v = Vectors(data=OPS.asarray(rows, dtype="f"), keys=keys)
    vocab = Vocab()
    vocab.vectors = v

    def row_for(key):
        # ``vocab.vectors`` is re-read on every call so that lookups made
        # after deduplication see the current table.
        return vocab.vectors.key2row[v.strings[key]]

    # duplicate vectors do not use the same keys
    assert row_for("a1") != row_for("a2")
    assert row_for("c1") != row_for("c2")

    vocab.deduplicate_vectors()

    # there are three unique vectors
    assert vocab.vectors.shape[0] == 3
    # the uniqued data is the same as the deduplicated data
    assert_equal(
        numpy.unique(OPS.to_numpy(vocab.vectors.data), axis=0),
        OPS.to_numpy(vocab.vectors.data),
    )
    # duplicate vectors use the same keys now
    assert row_for("a1") == row_for("a2")
    assert row_for("c1") == row_for("c2")

    # deduplicating again makes no changes
    bytes_before = vocab.to_bytes()
    vocab.deduplicate_vectors()
    assert bytes_before == vocab.to_bytes()
def test_lookups_to_from_bytes_via_vocab():
    """Check that a lookup table survives a ``Vocab`` bytes round trip.

    A table is added to one vocab's lookups, the vocab is serialized with
    ``to_bytes`` and loaded into a fresh ``Vocab`` with ``from_bytes``; the
    restored table must hold the same entries and re-serialize to the same
    bytes.
    """
    table_name = "test"
    entries = {"foo": "bar", "hello": "world"}

    source_vocab = Vocab()
    source_vocab.lookups.add_table(table_name, entries)
    assert table_name in source_vocab.lookups

    serialized = source_vocab.to_bytes()
    restored_vocab = Vocab()
    restored_vocab.from_bytes(serialized)

    assert len(restored_vocab.lookups) == len(source_vocab.lookups)
    assert table_name in restored_vocab.lookups

    restored_table = restored_vocab.lookups.get_table(table_name)
    assert len(restored_table) == 2
    assert restored_table["hello"] == "world"
    assert restored_vocab.to_bytes() == serialized
def test_serialize_vocab_roundtrip_bytes(strings1, strings2):
    """Round-trip two vocabs through bytes and inspect the string store.

    Args:
        strings1: Strings used to build the first vocab.
        strings2: Strings used to build the second vocab.
    """
    first = Vocab(strings=strings1)
    second = Vocab(strings=strings2)
    first_bytes = first.to_bytes()
    second_bytes = second.to_bytes()

    # The serialized forms must match exactly when the inputs match, and
    # differ exactly when the inputs differ.
    assert (first_bytes == second_bytes) == (strings1 == strings2)

    # Loading a vocab's own bytes back into itself must be stable.
    first = first.from_bytes(first_bytes)
    assert first.to_bytes() == first_bytes

    # Loading into a fresh vocab must reproduce the same bytes.
    restored = Vocab().from_bytes(first_bytes)
    assert restored.to_bytes() == first_bytes

    expected_strings = sorted(strings1 + ["_SP"])
    assert len(restored.strings) == len(strings1) + 1  # adds _SP
    assert sorted(restored.strings) == expected_strings
def test_serialize_vocab_roundtrip_bytes(strings1, strings2):
    """Round-trip two vocabs through bytes and inspect the lexemes.

    Args:
        strings1: Strings used to build the first vocab.
        strings2: Strings used to build the second vocab.
    """
    first = Vocab(strings=strings1)
    second = Vocab(strings=strings2)
    first_bytes = first.to_bytes()
    second_bytes = second.to_bytes()

    # The serialized forms must match exactly when the inputs match, and
    # differ exactly when the inputs differ.
    assert (first_bytes == second_bytes) == (strings1 == strings2)

    # Loading a vocab's own bytes back into itself must be stable.
    first = first.from_bytes(first_bytes)
    assert first.to_bytes() == first_bytes

    # Loading into a fresh vocab must reproduce the same bytes.
    restored = Vocab().from_bytes(first_bytes)
    assert restored.to_bytes() == first_bytes

    assert len(restored) == len(strings1)
    restored_texts = sorted(lex.text for lex in restored)
    assert restored_texts == sorted(strings1)
def test_serialize_vocab_lex_attrs_bytes(strings, lex_attr):
    """Check that a lexeme's ``norm_`` is carried over by a bytes round trip.

    Args:
        strings: Strings used to build the source vocab; the first one is the
            lexeme whose attribute is modified.
        lex_attr: Value assigned to that lexeme's ``norm_``.
    """
    source = Vocab(strings=strings)
    target = Vocab()
    word = strings[0]

    source[word].norm_ = lex_attr
    assert source[word].norm_ == lex_attr
    # The untouched vocab must not already hold the custom value.
    assert target[word].norm_ != lex_attr

    target = target.from_bytes(source.to_bytes())
    assert target[word].norm_ == lex_attr
def test_pickle_vocab(strings, lex_attr):
    """Check that a vocab with floret-mode vectors survives pickling.

    Args:
        strings: Strings used to build the vocab; the first one is the lexeme
            whose ``norm_`` is modified before pickling.
        lex_attr: Value assigned to that lexeme's ``norm_``.
    """
    vocab = Vocab(strings=strings)
    xp = get_current_ops().xp
    vocab.vectors = Vectors(data=xp.zeros((10, 10)), mode="floret", hash_count=1)
    vocab[strings[0]].norm_ = lex_attr

    # The pickle payload is produced right here, so loading it is safe.
    restored = pickle.loads(pickle.dumps(vocab))

    assert vocab.to_bytes() == restored.to_bytes()
    assert restored.vectors.mode == "floret"
def test_deserialize_vocab_seen_entries(strings, lex_attr):
    """Reload a vocab from its own bytes and check the string store size.

    Args:
        strings: Strings used to build the vocab.
        lex_attr: Unused in the body; supplied by the test parametrization.
    """
    # Reported in #2153
    vocab = Vocab(strings=strings)
    own_bytes = vocab.to_bytes()
    vocab.from_bytes(own_bytes)
    expected_size = len(strings) + 1  # adds _SP
    assert len(vocab.strings) == expected_size
def test_serialize_vocab(en_vocab, text):
    """Serialize a vocab without lookups and restore it into a new ``Vocab``.

    Args:
        en_vocab: Vocab fixture to which ``text`` is added.
        text: String added to the vocab's string store before serializing.
    """
    skipped = ["lookups"]
    key = en_vocab.strings.add(text)
    payload = en_vocab.to_bytes(exclude=skipped)

    restored = Vocab().from_bytes(payload)

    # The hash returned by ``add`` must resolve to the text after the reload.
    assert restored.strings[key] == text
    assert restored.to_bytes(exclude=["lookups"]) == payload
def test_pickle_vocab(strings, lex_attr):
    """Check that pickling and unpickling a vocab preserves its bytes form.

    Args:
        strings: Strings used to build the vocab; the first one is the lexeme
            whose ``norm_`` is modified before pickling.
        lex_attr: Value assigned to that lexeme's ``norm_``.
    """
    vocab = Vocab(strings=strings)
    vocab[strings[0]].norm_ = lex_attr

    # The pickle payload is produced right here, so loading it is safe.
    restored = pickle.loads(pickle.dumps(vocab))

    assert vocab.to_bytes() == restored.to_bytes()
def test_deserialize_vocab_seen_entries(strings, lex_attr):
    """Reload a vocab from its own bytes and check its length is unchanged.

    Args:
        strings: Strings used to build the vocab.
        lex_attr: Unused in the body; supplied by the test parametrization.
    """
    # Reported in #2153
    vocab = Vocab(strings=strings)
    size_before = len(vocab)
    own_bytes = vocab.to_bytes()
    vocab.from_bytes(own_bytes)
    assert len(vocab) == size_before