def test_load_embedding_has_all_words(self, instances, embedding_type):
    single_instance = instances["single_instance"]
    MAX_NUM_WORDS = 100
    vocab = Vocab(
        instances=single_instance,
        max_num_tokens=MAX_NUM_WORDS,
        embedding_type=embedding_type,
    )
    vocab.build_vocab()
    embedding = vocab.load_embedding()
    # The embedding matrix must have one row per vocabulary entry
    assert embedding.size(0) == vocab.get_vocab_len()
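# A hedged companion check (a sketch, not from the original suite): it assumes
# `load_embedding()` returns a 2-D tensor of shape (vocab_len, embedding_dim),
# which the size(0) assertion above already relies on.
def test_load_embedding_row_dimension_sketch(self, instances, embedding_type):
    single_instance = instances["single_instance"]
    vocab = Vocab(
        instances=single_instance,
        max_num_tokens=100,
        embedding_type=embedding_type,
    )
    vocab.build_vocab()
    embedding = vocab.load_embedding()
    # Every vocabulary entry should map to a fixed-size dense vector
    assert embedding.dim() == 2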
def setup_lstm2seqdecoder(request):
    HIDDEN_DIM = 1024
    NUM_LAYERS = request.param[0]
    BIDIRECTIONAL = request.param[1]
    TEACHER_FORCING_RATIO = request.param[3]
    MAX_LENGTH = 5
    lines = []
    words = []
    texts = ["First sentence", "second sentence", "Third long sentence here"]
    for text in texts:
        line = Line(text=text)
        word = Line(text=text.split()[0])
        lines.append(line)
        words.append(word)

    # Build the vocabulary from words, not characters: iterating over a bare
    # string yields single characters, so each sentence must be split first.
    flat_texts = [[word for sentence in texts for word in sentence.split()]]
    vocab = Vocab(flat_texts)
    vocab.build_vocab()

    num_direction = 2 if BIDIRECTIONAL else 1
    h0 = torch.ones(NUM_LAYERS, len(texts), num_direction * HIDDEN_DIM) * 0.1
    c0 = torch.ones(NUM_LAYERS, len(texts), num_direction * HIDDEN_DIM) * 0.2
    embedder = WordEmbedder(embedding_type="glove_6B_50")
    # Fake encoder outputs are only needed when an attention module is used
    encoder_outputs = (
        torch.ones(len(texts), 5, num_direction * HIDDEN_DIM) * 0.5
        if request.param[2]
        else None
    )
    decoder = Lstm2SeqDecoder(
        embedder=embedder,
        vocab=vocab,
        max_length=MAX_LENGTH,
        attn_module=request.param[2],
        dropout_value=0.0,
        hidden_dim=HIDDEN_DIM,
        bidirectional=BIDIRECTIONAL,
        rnn_bias=False,
        num_layers=NUM_LAYERS,
    )
    return (
        decoder,
        {
            "HIDDEN_DIM": HIDDEN_DIM,
            "NUM_LAYERS": NUM_LAYERS,
            "MAX_LENGTH": MAX_LENGTH,
            "TEACHER_FORCING_RATIO": TEACHER_FORCING_RATIO,
            "LINES": lines,
            "WORDS": words,
            "VOCAB_SIZE": vocab.get_vocab_len(),
            "BIDIRECTIONAL": BIDIRECTIONAL,
        },
        encoder_outputs,
        (h0, c0),
    )
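# A minimal consumer sketch for the fixture above. It only checks that the
# initial hidden/cell states agree with the declared dimensions, and it
# assumes `setup_lstm2seqdecoder` is registered with
# @pytest.fixture(params=...) elsewhere in this module.
def test_setup_lstm2seqdecoder_state_shapes_sketch(setup_lstm2seqdecoder):
    decoder, options, encoder_outputs, (h0, c0) = setup_lstm2seqdecoder
    num_directions = 2 if options["BIDIRECTIONAL"] else 1
    expected = (
        options["NUM_LAYERS"],
        len(options["LINES"]),
        num_directions * options["HIDDEN_DIM"],
    )
    assert h0.size() == expected
    assert c0.size() == expected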
def test_vocab_length_min_freq_1_max_words_1(self, instances):
    single_instance = instances["single_instance"]
    MAX_NUM_WORDS = 1
    MIN_FREQ = 1
    vocab_builder = Vocab(
        instances=single_instance,
        min_count=MIN_FREQ,
        max_num_tokens=MAX_NUM_WORDS,
    )
    vocab_builder.build_vocab()
    len_vocab = vocab_builder.get_vocab_len()
    assert len_vocab == 1 + len(vocab_builder.special_vocab)
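# A complementary sketch for the pruning path (an assumption, not asserted
# anywhere above: special tokens always survive pruning). With min_count set
# above every token's frequency, only the special vocabulary should remain.
def test_vocab_length_high_min_freq_sketch(self, instances):
    single_instance = instances["single_instance"]
    vocab_builder = Vocab(
        instances=single_instance, min_count=10_000, max_num_tokens=100
    )
    vocab_builder.build_vocab()
    assert vocab_builder.get_vocab_len() == len(vocab_builder.special_vocab)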