import pytest

from spacy.ml.models.tok2vec import build_Tok2Vec_model
from spacy.ml.models.tok2vec import MultiHashEmbed, MaxoutWindowEncoder
from spacy.tokens import Doc
from spacy.vocab import Vocab

from ..util import get_batch  # shared test helper (path assumed)


@pytest.mark.parametrize(
    # Illustrative size combinations; any (batch_size, width, embed_size) works.
    "batch_size,width,embed_size",
    [[1, 128, 2000], [2, 128, 2000], [3, 8, 63]],
)
def test_tok2vec_batch_sizes(batch_size, width, embed_size):
    batch = get_batch(batch_size)
    tok2vec = build_Tok2Vec_model(
        MultiHashEmbed(
            width=width,
            rows=[embed_size] * 4,
            include_static_vectors=False,
            attrs=["NORM", "PREFIX", "SUFFIX", "SHAPE"],
        ),
        MaxoutWindowEncoder(width=width, depth=4, window_size=1, maxout_pieces=3),
    )
    tok2vec.initialize()
    vectors, backprop = tok2vec.begin_update(batch)
    # One output array per Doc, each shaped (n_tokens, width).
    assert len(vectors) == len(batch)
    for doc_vec, doc in zip(vectors, batch):
        assert doc_vec.shape == (len(doc), width)
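

# get_batch comes from the test suite's shared util module. For reference,
# a minimal sketch of its assumed behavior: build a list of Docs of
# increasing length, with numeric strings as words so that tokens are
# distinct across the batch and easy to track.
#
# def get_batch(batch_size):
#     vocab = Vocab()
#     docs = []
#     start = 0
#     for size in range(1, batch_size + 1):
#         numbers = [str(i) for i in range(start, start + size)]
#         docs.append(Doc(vocab, words=numbers))
#         start += size
#     return docs

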
def test_empty_doc():
    width = 128
    embed_size = 2000
    vocab = Vocab()
    doc = Doc(vocab, words=[])
    tok2vec = build_Tok2Vec_model(
        MultiHashEmbed(
            width=width,
            rows=[embed_size] * 4,
            include_static_vectors=False,
            attrs=["NORM", "PREFIX", "SUFFIX", "SHAPE"],
        ),
        MaxoutWindowEncoder(width=width, depth=4, window_size=1, maxout_pieces=3),
    )
    tok2vec.initialize()
    vectors, backprop = tok2vec.begin_update([doc])
    # An empty Doc should still produce one output array, shaped (0, width).
    assert len(vectors) == 1
    assert vectors[0].shape == (0, width)
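

# A small companion check, not part of the original tests: the backprop
# callback returned by begin_update() should accept per-Doc gradient arrays
# shaped like the forward outputs and run without error. Reusing the outputs
# themselves as dummy gradients keeps the shapes consistent.
def test_tok2vec_backprop_runs():
    width = 8
    embed_size = 100
    batch = get_batch(3)
    tok2vec = build_Tok2Vec_model(
        MultiHashEmbed(
            width=width,
            rows=[embed_size] * 4,
            include_static_vectors=False,
            attrs=["NORM", "PREFIX", "SUFFIX", "SHAPE"],
        ),
        MaxoutWindowEncoder(width=width, depth=4, window_size=1, maxout_pieces=3),
    )
    tok2vec.initialize()
    vectors, backprop = tok2vec.begin_update(batch)
    backprop(vectors)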