def test_vectors_deduplicate():
    """Deduplicating vectors collapses identical rows and remaps their keys."""
    rows = OPS.asarray([[1, 1], [2, 2], [3, 4], [1, 1], [3, 4]], dtype="f")
    vec = Vectors(data=rows, keys=["a1", "b1", "c1", "a2", "c2"])
    vocab = Vocab()
    vocab.vectors = vec

    def row_of(key):
        # Row index currently assigned to the given string key.
        return vocab.vectors.key2row[vec.strings[key]]

    # Before deduplication, identical vectors still occupy distinct rows.
    assert row_of("a1") != row_of("a2")
    assert row_of("c1") != row_of("c2")
    vocab.deduplicate_vectors()
    # Only the three unique vectors remain.
    assert vocab.vectors.shape[0] == 3
    # The stored table is exactly its own set of unique rows.
    assert_equal(
        numpy.unique(OPS.to_numpy(vocab.vectors.data), axis=0),
        OPS.to_numpy(vocab.vectors.data),
    )
    # Identical vectors now share a single row.
    assert row_of("a1") == row_of("a2")
    assert row_of("c1") == row_of("c2")
    # A second deduplication is a no-op.
    serialized_before = vocab.to_bytes()
    vocab.deduplicate_vectors()
    assert serialized_before == vocab.to_bytes()
def test_pickle_vocab(strings, lex_attr):
    """A vocab carrying floret-mode vectors survives a pickle round-trip intact."""
    vocab = Vocab(strings=strings)
    ops = get_current_ops()
    vocab.vectors = Vectors(data=ops.xp.zeros((10, 10)), mode="floret", hash_count=1)
    vocab[strings[0]].norm_ = lex_attr
    # Round-trip through pickle and compare serialized forms.
    restored = pickle.loads(pickle.dumps(vocab))
    assert vocab.to_bytes() == restored.to_bytes()
    # The vectors mode must survive unpickling as well.
    assert restored.vectors.mode == "floret"