def question_1g_sanity_check(): """ Sanity check for to input tensor char() function. """ print("-" * 80) print("Running Sanity Check for Question 1g: Creating Input Tensor") print("-" * 80) vocab = VocabEntry() print("Running test on a list of sentences") sentences = [['Human:', 'What', 'do', 'we', 'want?'], ['Computer:', 'Natural', 'language', 'processing!'], ['Human:', 'When', 'do', 'we', 'want', 'it?'], ['Computer:', 'When', 'do', 'we', 'want', 'what?']] device = torch.device("cpu") max_sentence_length = max([len(sent) for sent in sentences]) max_word_length = 21 input_tensor = vocab.to_input_tensor_char(sentences, device) print(input_tensor) batch_size = len(sentences) correct_shape = [max_sentence_length, batch_size, max_word_length] actual_shape = list(input_tensor.size()) assert actual_shape == correct_shape, "Input Tensor Creation is incorrect: it should be \n{} but is:{}".format( correct_shape, input_tensor.size()) print("Sanity Check Passed for Question 1g: Creating Input Tensor!") print("-" * 80)
def question_1e_sanity_check(): """ Sanity check for to_input_tensor_char() function. """ print("-" * 80) print("Running Sanity Check for Question 1e: To Input Tensor Char") print("-" * 80) vocabEntry = VocabEntry() print("Running test on a list of sentences") sentences = [['Human', ':', 'What', 'do', 'we', 'want', '?'], ['Computer', ':', 'Natural', 'language', 'processing', '!'], ['Human', ':', 'When', 'do', 'we', 'want', 'it', '?'], ['Computer', ':', 'When', 'do', 'we', 'want', 'what', '?']] sentence_length = 8 BATCH_SIZE = 4 word_length = 12 output = vocabEntry.to_input_tensor_char(sentences, 'cpu') output_expected_size = [sentence_length, BATCH_SIZE, word_length] assert list( output.size() ) == output_expected_size, "output shape is incorrect: it should be:\n {} but is:\n{}".format( output_expected_size, list(output.size())) print("Sanity Check Passed for Question 1e: To Input Tensor Char!") print("-" * 80)
def question_1g_sanity_check(model): """ Sanity check for pad_sents_char() function. """ print("-" * 80) print("Running Sanity Check for Question 1g: Padding") print("-" * 80) vocab = VocabEntry() print("Running test on a list of sentences") sentences = [['Human:', 'What', 'do', 'we', 'want?'], ['Computer:', 'Natural', 'language', 'processing!'], ['Human:', 'When', 'do', 'we', 'want', 'it?'], ['Computer:', 'When', 'do', 'we', 'want', 'what?']] word_ids = vocab.words2charindices(sentences) #padded_sentences = pad_sents_char(word_ids, 0) padded_sentences = vocab.to_input_tensor_char(sentences, model.device) gold_padded_sentences = torch.load( './sanity_check_en_es_data/gold_padded_sentences.pkl') a = torch.rand(6, 4, 21) print(a.size()) print(padded_sentences.size()) assert padded_sentences.size() == a.size( ), "to_input_tensor size incorrect! is incorrect: it should be:\n {} but is:\n{}".format( a.size(), padded_sentences.size()) print("Sanity Check Passed for Question 1g: Padding!") print("-" * 80)
def question_1e_sanity_check(): """Sanity check for to_input_tensor_char() function.""" print("-" * 80) print("Running Sanity Check for Question 1e: To Input Tensor Char") print("-" * 80) vocabEntry = VocabEntry() print("Running test on a list of sentences") sentences = [ ["Human", ":", "What", "do", "we", "want", "?"], ["Computer", ":", "Natural", "language", "processing", "!"], ["Human", ":", "When", "do", "we", "want", "it", "?"], ["Computer", ":", "When", "do", "we", "want", "what", "?"], ] sentence_length = 8 BATCH_SIZE = 4 word_length = 12 output = vocabEntry.to_input_tensor_char(sentences, "cpu") output_expected_size = [sentence_length, BATCH_SIZE, word_length] assert ( list(output.size()) == output_expected_size ), "output shape is incorrect: it should be:\n {} but is:\n{}".format( output_expected_size, list(output.size())) print("Sanity Check Passed for Question 1e: To Input Tensor Char!") print("-" * 80)
def question_1a_sanity_check():
    """ Sanity check for words2charindices() function.
    """
    print("-" * 80)
    print("Running Sanity Check for Question 1a: words2charindices()")
    print("-" * 80)
    vocab = VocabEntry()
    print('Running test on small list of sentences')
    sentences = [["a", "b", "c?"], ["~d~", "c", "b", "a"]]
    small_ind = vocab.words2charindices(sentences)
    small_ind_gold = [[[1, 30, 2], [1, 31, 2], [1, 32, 70, 2]],
                      [[1, 85, 33, 85, 2], [1, 32, 2], [1, 31, 2], [1, 30, 2]]]
    assert small_ind == small_ind_gold, \
        "small test resulted in indices list {:}, expected {:}".format(small_ind, small_ind_gold)
    print('Running test on large list of sentences')
    tgt_sents = [
        ['<s>', "Let's", 'start', 'by', 'thinking', 'about', 'the', 'member',
         'countries', 'of', 'the', 'OECD,', 'or', 'the', 'Organization', 'of',
         'Economic', 'Cooperation', 'and', 'Development.', '</s>'],
        ['<s>', 'In', 'the', 'case', 'of', 'gun', 'control,', 'we', 'really',
         'underestimated', 'our', 'opponents.', '</s>'],
        ['<s>', 'Let', 'me', 'share', 'with', 'those', 'of', 'you', 'here',
         'in', 'the', 'first', 'row.', '</s>'],
        ['<s>', 'It', 'suggests', 'that', 'we', 'care', 'about', 'the',
         'fight,', 'about', 'the', 'challenge.', '</s>'],
        ['<s>', 'A', 'lot', 'of', 'numbers', 'there.', 'A', 'lot', 'of',
         'numbers.', '</s>']]
    tgt_ind = vocab.words2charindices(tgt_sents)
    tgt_ind_gold = pickle.load(open('./sanity_check_en_es_data/1a_tgt.pkl', 'rb'))
    assert tgt_ind == tgt_ind_gold, \
        "target vocab test resulted in indices list {:}, expected {:}".format(tgt_ind, tgt_ind_gold)
    print("All Sanity Checks Passed for Question 1a: words2charindices()!")
    print("-" * 80)

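# A minimal sketch of words2charindices(), consistent with the gold indices
# above (e.g. "a" -> [1, 30, 2]): each word becomes its character ids wrapped
# in start-of-word (1) and end-of-word (2) markers. char2id, char_unk,
# start_of_word and end_of_word are assumed VocabEntry attributes.
def words2charindices_sketch(self, sents):
    return [[[self.start_of_word]
             + [self.char2id.get(c, self.char_unk) for c in word]
             + [self.end_of_word]
             for word in sent]
            for sent in sents]
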
def question_1g_sanity_check(): """ Sanity check for to_input_tensor_char() function. """ print("-" * 80) print("Running Sanity Check for Question 1g: to_input_tensor_char") print("-" * 80) vocab = VocabEntry() print("Running test on a list of sentences") sentences = [['Human:', 'What', 'do', 'we', 'want?'], ['Computer:', 'Natural', 'language', 'processing!'], ['Human:', 'When', 'do', 'we', 'want', 'it?'], ['Computer:', 'When', 'do', 'we', 'want', 'what?']] X = vocab.to_input_tensor_char(sentences, "cpu") # 6 is the max_sentence_length # 4 is batch size # 21 is max_word_length assert X.shape == ( 6, 4, 21), f"Size is incorrect: it should be (6, 4, 21) but it is {X.shape}" print("Sanity Check Passed for Question 1g: to_input_tensor_char!") print("-" * 80)
def question_1b_sanity_check():
    """ Sanity check for pad_sents_char() function.
    """
    print("-" * 80)
    print("Running Sanity Check for Question 1b: Padding")
    print("-" * 80)
    vocab = VocabEntry()
    print("Running test on a list of sentences")
    sentences = [['Human:', 'What', 'do', 'we', 'want?'],
                 ['Computer:', 'Natural', 'language', 'processing!'],
                 ['Human:', 'When', 'do', 'we', 'want', 'it?'],
                 ['Computer:', 'When', 'do', 'we', 'want', 'what?']]
    word_ids = vocab.words2charindices(sentences)
    padded_sentences = pad_sents_char(word_ids, 0)
    gold_padded_sentences = torch.load(
        './sanity_check_en_es_data/gold_padded_sentences.pkl')
    assert padded_sentences == gold_padded_sentences, \
        "Sentence padding is incorrect: it should be:\n {} but is:\n{}".format(
            gold_padded_sentences, padded_sentences)
    print(len(padded_sentences),
          torch.transpose(torch.tensor(padded_sentences, dtype=torch.int), 0, 1).shape)
    print("Sanity Check Passed for Question 1b: Padding!")
    print("-" * 80)

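# A minimal sketch of pad_sents_char() under the assumptions the checks in
# this file encode: a fixed max word length of 21, truncation of longer
# words, and all-pad words appended until every sentence reaches the batch's
# maximum sentence length.
def pad_sents_char_sketch(sents, char_pad_token, max_word_length=21):
    max_sent_len = max(len(sent) for sent in sents)
    pad_word = [char_pad_token] * max_word_length
    padded = []
    for sent in sents:
        # Truncate to max_word_length, then right-pad; the negative repeat
        # count for long words simply yields an empty padding list.
        padded_sent = [word[:max_word_length]
                       + [char_pad_token] * (max_word_length - len(word))
                       for word in sent]
        padded_sent += [pad_word] * (max_sent_len - len(sent))
        padded.append(padded_sent)
    return padded
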
def question_1g_test():
    """ Custom simple test for to_input_tensor_char() function.
    """
    print("-" * 80)
    print("Running Sanity Check for Question 1g: Padding")
    print("-" * 80)
    vocab = VocabEntry()
    print("Running test on a list of sentences")
    sentences = [['Human:', 'What', 'do', 'we', 'want?'],
                 ['Computer:', 'Natural', 'language', 'processing!'],
                 ['Human:', 'When', 'do', 'we', 'want', 'it?'],
                 ['Computer:', 'When', 'do', 'we', 'want', 'what?']]
    gold_shape = torch.Size([6, 4, 21])  # (max sentence length, batch size, max word length)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    input_tensor = vocab.to_input_tensor_char(sentences, device)
    # print("We get torch tensor:\n", input_tensor)
    assert input_tensor.shape == gold_shape, \
        "Output tensor shape is incorrect: it should be:\n {} but is:\n{}".format(
            gold_shape, input_tensor.shape)
    print("Sanity Check Passed for Question 1g: Padding!")
    print("-" * 80)

def load_word2vec(fpath: str, vocab: VocabEntry, device: torch.device) -> torch.Tensor:
    """Load pretrained embedding vectors for words in vocab.

    :param fpath: word2vec file (from fastText) path, which already contains the </s> token.
    :param vocab: constructed vocabulary
    :param device: device to place the resulting tensor on
    :return word2vec (vocab_size, embed_size): tensor of word2vec embeddings
    """
    print("loading pretrained word2vec from %s......" % fpath)
    model = KeyedVectors.load_word2vec_format(fpath, limit=int(1e5))
    words = vocab.get_words()
    word2vec = []
    for w in tqdm(words, desc='loading'):
        try:
            word2vec.append(model[w].astype(np.float64))
        except KeyError:
            if w == vocab.get_pad_info(0):
                # initialize pad token with zero vector
                word2vec.append(np.zeros(model.vector_size, dtype=np.float64))
            else:
                uniform_init = 0.1
                word2vec.append(
                    np.random.uniform(low=-uniform_init, high=uniform_init,
                                      size=model.vector_size).astype(np.float64))
    word2vec = np.stack(word2vec, axis=0)
    word2vec = torch.from_numpy(word2vec).to(torch.float).to(device)
    assert word2vec.size(0) == len(vocab), \
        "tensor size wrong, first dimension should be equal to vocab size"
    return word2vec

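# Hypothetical usage of load_word2vec(); the fastText file name below is a
# placeholder, not a path taken from the source.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
vocab = VocabEntry()
pretrained = load_word2vec('cc.en.300.vec', vocab, device)
# Wrap the tensor in an embedding layer that stays trainable.
embedding = torch.nn.Embedding.from_pretrained(pretrained, freeze=False)
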
def question_1h_sanity_check(model):
    """ Sanity check for the highway network.
    """
    print("-" * 80)
    print("Running Sanity Check for Question 1h: Highway")
    print("-" * 80)
    vocab = VocabEntry()
    print("Running test on a list of sentences")
    sentences = [['Human:', 'What', 'do', 'we', 'want?'],
                 ['Computer:', 'Natural', 'language', 'processing!'],
                 ['Human:', 'When', 'do', 'we', 'want', 'it?'],
                 ['Computer:', 'When', 'do', 'we', 'want', 'what?']]
    word_ids = vocab.words2charindices(sentences)
    # padded_sentences = pad_sents_char(word_ids, 0)
    padded_sentences = vocab.to_input_tensor_char(sentences, model.device)
    gold_padded_sentences = torch.load(
        './sanity_check_en_es_data/gold_padded_sentences.pkl')
    # Test with batch size 1
    x = torch.rand(1, 1, 21)
    hw = Highway(21, 21, 21, 0.5)
    hw.forward(x)
    # Test with batch size 4
    a = torch.rand(6, 4, 21)  # was undefined in the original; shape matches the 1g check
    print(a.size())
    print(padded_sentences.size())
    # assert padded_sentences.size() == a.size(), \
    #     "to_input_tensor_char size is incorrect: it should be:\n {} but is:\n{}".format(
    #         a.size(), padded_sentences.size())
    print("Sanity Check Passed for Question 1h: Highway!")
    print("-" * 80)

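# A minimal highway layer sketch in the usual formulation (Srivastava et al.,
# 2015). The single feature size and dropout rate here are assumptions and
# need not match the Highway(21, 21, 21, 0.5) signature used in the check above.
import torch
import torch.nn as nn

class HighwaySketch(nn.Module):
    def __init__(self, size, dropout_rate=0.5):
        super().__init__()
        self.proj = nn.Linear(size, size)
        self.gate = nn.Linear(size, size)
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, x):
        x_proj = torch.relu(self.proj(x))
        x_gate = torch.sigmoid(self.gate(x))
        # The gate interpolates between the transformed and the raw input.
        return self.dropout(x_gate * x_proj + (1.0 - x_gate) * x)
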
def question_1c_sanity_check(): """ Sanity check for to_input_tensor_char() function. """ print("-" * 80) print("Running Sanity Check for Question 1c: To input tensor") print("-" * 80) vocab = VocabEntry() print("Running test on a list of sentences") sentences = [['Human:', 'What', 'do', 'we', 'want?'], ['Computer:', 'Natural', 'language', 'processing!'], ['Human:', 'When', 'do', 'we', 'want', 'it?'], ['Computer:', 'When', 'do', 'we', 'want', 'what?']] word_ids = vocab.words2charindices(sentences) padded_sentences = pad_sents_char(word_ids, 0) gold_padded_sentences = torch.load('./sanity_check_en_es_data/gold_padded_sentences.pkl') assert padded_sentences == gold_padded_sentences, "Sentence padding is incorrect: it should be:\n {} but is:\n{}".format( gold_padded_sentences, padded_sentences) batch_size = len(gold_padded_sentences) max_sentence_length = len(gold_padded_sentences[0]) max_word_length = len(gold_padded_sentences[0][0]) padded_sentences_tensor = vocab.to_input_tensor_char(sentences, device=torch.device('cpu')) assert (padded_sentences_tensor.size() == (max_sentence_length, batch_size, max_word_length)) print("Sanity Check Passed for Question 1c: To input tensor") print("-" * 80)
def test_file1_method1():
    batch_size = 2
    max_sent_len = 3
    max_word_length = 21
    sentence = [['ciao', 'come', 'staiii'], ['sto', 'bene']]
    v = VocabEntry()
    tens = v.to_input_tensor_char(sentence, torch.device('cpu'))
    assert tens.shape[0] == max_sent_len
    assert tens.shape[1] == batch_size
    assert tens.shape[2] == max_word_length

def get_code_change_tensors(self, code_vocab: VocabEntry, action_vocab: VocabEntry,
                            device: torch.device):
    code_tensor_a = code_vocab.to_input_tensor(self.old_code_tokens, device)
    code_tensor_b = code_vocab.to_input_tensor(self.new_code_tokens, device)
    edit_tensor = action_vocab.to_input_tensor(self.edit_actions, device)
    return code_tensor_a, code_tensor_b, edit_tensor

def question_1g_sanity_check(): """ Sanity check for to_input() function. """ print ("-"*80) print("Running Sanity Check for Question 1g: Reshape") print ("-"*80) vocab = VocabEntry() print("Running test on a list of sentences") sentences = [['Human:', 'What', 'do', 'we', 'want?'], ['Computer:', 'Natural', 'language', 'processing!'], ['Human:', 'When', 'do', 'we', 'want', 'it?'], ['Computer:', 'When', 'do', 'we', 'want', 'what?']] sent_tensor = vocab.to_input_tensor_char(sentences, "cpu") print("Sanity Check Passed for Question 1g: Reshape!") print("-"*80)
def question_1c_sanity_check(): """ Sanity check for to_input_tensor_char() function. """ print("-" * 80) print("Running Sanity Check for Question 1c") print("-" * 80) vocab = VocabEntry() print("Running test on a list of sentences") sentences = [['Human:', 'What', 'do', 'we', 'want?'], ['Computer:', 'Natural', 'language', 'processing!'], ['Human:', 'When', 'do', 'we', 'want', 'it?'], ['Computer:', 'When', 'do', 'we', 'want', 'what?']] sent_padded = vocab.to_input_tensor_char(sentences, torch.device)
def question_1g_sanity_check(): """ Sanity check for to_input_tensor_char() function. """ print ("-"*80) print("Running Sanity Check for Question 1g: Padding") print ("-"*80) vocab = VocabEntry() print("Running test on a list of sentences") sentences = [['Human:', 'What', 'do', 'we', 'want?'], ['Computer:', 'Natural', 'language', 'processing!'], ['Human:', 'When', 'do', 'we', 'want', 'it?'], ['Computer:', 'When', 'do', 'we', 'want', 'what?']] input_tensor = vocab.to_input_tensor_char(sentences, torch.device('cuda', 0)) # print(input_tensor.shape) assert input_tensor.shape == (6, 4, 21) print("Sanity Check Passed for Question 1g: Padding!") print("-"*80) pass
def question_1c_sanity_check(): print("-" * 80) print("Running Sanity Check for Question 1c: Input tensor") print("-" * 80) vocab = VocabEntry() print("Running test on a list of sentences") sentences = [['Human:', 'What', 'do', 'we', 'want?'], ['Computer:', 'Natural', 'language', 'processing!'], ['Human:', 'When', 'do', 'we', 'want', 'it?'], ['Computer:', 'When', 'do', 'we', 'want', 'what?']] word_ids = vocab.words2charindices(sentences) padded_sentences = pad_sents_char(word_ids, 0) o_tnsr = vocab.to_input_tensor_char(sentences, "cpu") print(o_tnsr.shape)
def question_1g_sanity_check(): """ Sanity check for pad_sents_char() function. """ print("-" * 80) print("Running Sanity Check for Question 1g: question_1g_sanity_check") print("-" * 80) vocab = VocabEntry() print("Running test on a list of sentences") sentences = [['Human:', 'What', 'do', 'we', 'want?'], ['Computer:', 'Natural', 'language', 'processing!'], ['Human:', 'When', 'do', 'we', 'want', 'it?'], ['Computer:', 'When', 'do', 'we', 'want', 'what?']] t = vocab.to_input_tensor_char(sentences, torch.device('cpu', 0)) print("Sanity Check Passed for Question 1g:shape=" + str(t.shape)) print("-" * 80)
def test_question_1f_sanity_check(self):
    """ Sanity check for pad_sents_char() function.
    """
    vocab = VocabEntry()
    print("Running test on a list of sentences")
    sentences = [['Human:', 'What', 'do', 'we', 'want?'],
                 ['Computer:', 'Natural', 'language', 'processing!'],
                 ['Human:', 'When', 'do', 'we', 'want', 'it?'],
                 ['Computer:', 'When', 'do', 'we', 'want', 'what?']]
    word_ids = vocab.words2charindices(sentences)
    padded_sentences = pad_sents_char(word_ids, 0)
    gold_padded_sentences = torch.load('./sanity_check_en_es_data/gold_padded_sentences.pkl')
    assert len(gold_padded_sentences) == len(padded_sentences)
    for expected, got in zip(gold_padded_sentences, padded_sentences):
        if got != expected:
            raise AssertionError('got {}: expected: {}'.format(got, expected))
    assert padded_sentences == gold_padded_sentences, \
        "Sentence padding is incorrect: it should be:\n {} but is:\n{}".format(
            gold_padded_sentences, padded_sentences)

def sentence_ids_to_multi_ones_hot_vector(y: List[str],
                                          dictionary: VocabEntry) -> np.ndarray:
    total_length = len(dictionary)
    ones_hot = np.zeros(total_length, dtype=np.int64)
    hot_indices = dictionary.words2indices(y)
    ones_hot[hot_indices] = 1
    # ignore the first four special tokens: '<pad>', '<s>', '</s>', '<unk>'
    return ones_hot[4:]

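# Hypothetical usage: after slicing off the first four special tokens,
# position i of the returned vector corresponds to vocabulary id i + 4.
vocab = VocabEntry()
multi_hot = sentence_ids_to_multi_ones_hot_vector(['hello', 'world'], vocab)
assert multi_hot.shape == (len(vocab) - 4,)
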
def question_1g_sanity_check(): """ Sanity check for to_input_tensor_char() function. """ print ("-"*80) print("Running Sanity Check for Question 1g: Building the input tensor") print ("-"*80) vocab = VocabEntry() print("Running test on a list of sentences") sentences = [['Human:', 'What', 'do', 'we', 'want?'], ['Computer:', 'Natural', 'language', 'processing!'], ['Human:', 'When', 'do', 'we', 'want', 'it?'], ['Computer:', 'When', 'do', 'we', 'want', 'what?']] device = torch.device('cpu') padded_tensor = vocab.to_input_tensor_char(sentences, device) gold_padded_sentences = torch.load('./sanity_check_en_es_data/gold_padded_sentences.pkl') gold_padded_tensor = torch.tensor(gold_padded_sentences, device = device).permute(1, 0, 2) assert padded_tensor.size() == gold_padded_tensor.size(), "Sentence padding is incorrect: it should be:\n {} but is:\n{}".format(gold_padded_sentences, padded_sentences) print("Sanity Check Passed for Question 1g: Building the input tensor!") print("-"*80)
def get_train_and_dev(self, train_file_path, grammar_file, primitive_types):
    src_freq = 3
    code_freq = 3
    grammar = ASDLGrammar.grammar_from_text(open(grammar_file).read(), primitive_types)
    transition_system = TransitionSystem(grammar)
    train_examples = self.preprocess_dataset(train_file_path, transition_system)
    full_train_examples = train_examples[:]
    np.random.shuffle(train_examples)
    dev_examples = train_examples[:200]
    train_examples = train_examples[200:]
    src_vocab = VocabEntry.from_corpus([e.sentence for e in train_examples],
                                       size=5000, freq_cutoff=src_freq)
    primitive_tokens = [
        map(lambda a: a.action.token,
            filter(lambda a: isinstance(a.action, GenTokenAction), e.tgt_actions))
        for e in train_examples
    ]
    primitive_vocab = VocabEntry.from_corpus(primitive_tokens, size=5000,
                                             freq_cutoff=code_freq)
    # generate vocabulary for the code tokens!
    code_tokens = [
        transition_system.tokenize_code(e.code, mode='decoder')
        for e in train_examples
    ]
    code_vocab = VocabEntry.from_corpus(code_tokens, size=5000, freq_cutoff=code_freq)
    vocab = Vocab(source=src_vocab, primitive=primitive_vocab, code=code_vocab)
    return train_examples, dev_examples, vocab

def question_1f_sanity_check(): """ Sanity check for pad_sents_char() function. """ print ("-"*80) print("Running Sanity Check for Question 1f: Padding") print ("-"*80) vocab = VocabEntry() print("Running test on a list of sentences") sentences = [['Human:', 'What', 'do', 'we', 'want?'], ['Computer:', 'Natural', 'language', 'processing!'], ['Human:', 'When', 'do', 'we', 'want', 'it?'], ['Computer:', 'When', 'do', 'we', 'want', 'what?']] word_ids = vocab.words2charindices(sentences) padded_sentences = pad_sents_char(word_ids, 0) gold_padded_sentences = torch.load('./sanity_check_en_es_data/gold_padded_sentences.pkl') assert padded_sentences == gold_padded_sentences, "Sentence padding is incorrect: it should be:\n {} but is:\n{}".format(gold_padded_sentences, padded_sentences) test_list = [[[4]*33]] padded_sent = pad_sents_char(test_list, 0) assert len(padded_sent[0][0]) == 21 print("Sanity Check Passed for Question 1f: Padding!") print("-"*80)
def _from_json(self):
    vocab_file = "qna_data/{}_vocab.json".format(self.method)
    dataset_file = "qna_data/{}_dataset.json".format(self.method)
    self.vocab = VocabEntry.from_json(vocab_file)
    dataset_json = read_json_data(dataset_file)
    for key in self.train_keys:
        setattr(self, key, dataset_json[key])
    self._to_numpy()

def load_vocab(self):
    # Load the vocabulary, or create it if it does not exist yet.
    if self.args.vocab is not None:
        if not os.path.isfile(self.args.vocab):
            print('create new vocab and save to %s' % self.args.vocab)
            corpus = []
            for story in self.trn[0]:
                for sent in story:
                    corpus.append(sent)
            if self.args.rebuild_vocab:
                self.vocab = VocabEntry.from_corpus(
                    corpus, 50000,
                    remove_singleton=not self.args.include_singleton)
            else:
                self.vocab = VocabEntry.from_dict(self.w2id)
            torch.save(self.vocab, self.args.vocab)
        else:
            self.vocab = torch.load(self.args.vocab)
    else:
        print('vocab file is required')
        exit(0)

def question_1f_sanity_check(): print("running sanity check for 1f:cnn") VocabEntry = VocabEntry() sentences = [['Human', ':', 'What', 'do', 'we', 'want', '?'], ['Computer', ':', 'Natural', 'language', 'processing', '!'], ['Human', ':', 'When', 'do', 'we', 'want', 'it', '?'], ['Computer', ':', 'When', 'do', 'we', 'want', 'what', '?']] sentence_length = 8 BATCH_SIZE = 4 word_length = 12 x_reshape = vocabEntry.to_input_tensor_char(sentences, 'cpu') cnn = CNN(k=5, f=2, emb_size=word_length, m_word=sentence_length) conv_out = self.cnn(x_reshape) print(conv_out.size())
def question_1i_sanity_check():
    """ Sanity check for nmt_model.py: basic shape check.
    """
    print("-" * 80)
    print("Running Sanity Check for Question 1i: NMT")
    print("-" * 80)
    src_vocab_entry = VocabEntry()
    tgt_vocab_entry = VocabEntry()
    dummy_vocab = Vocab(src_vocab_entry, tgt_vocab_entry)
    word_embed_size = 5
    hidden_size = 10
    nmt = NMT(word_embed_size, hidden_size, dummy_vocab)
    # NMT.forward expects tokenized sentences (lists of words), as every
    # other check in this file uses.
    source = [["Hello", "my", "friend"], ["How", "are", "you"]]
    target = [["Bonjour", "mon", "ami"], ["Comment", "vas", "tu"]]
    output = nmt.forward(source, target)
    print(output)
    # output_expected_size = [sentence_length, BATCH_SIZE, EMBED_SIZE]
    # assert list(output.size()) == output_expected_size, \
    #     "output shape is incorrect: it should be:\n {} but is:\n{}".format(
    #         output_expected_size, list(output.size()))
    print("Sanity Check Passed for Question 1i: NMT!")
    print("-" * 80)

def train_forward(self, char_sequence, dec_hidden=None):
    """ Forward computation during training.

    @param char_sequence (Tensor): tensor of integers, shape (length, batch_size).
        Note that "length" here and in forward() need not be the same.
    @param dec_hidden (tuple(Tensor, Tensor)): initial internal state of the LSTM,
        obtained from the output of the word-level decoder. A tuple of two tensors
        of shape (1, batch_size, hidden_size).
    @returns The cross-entropy loss (Tensor), computed as the *sum* of
        cross-entropy losses of all the words in the batch.
    """
    ### YOUR CODE HERE for part 2b
    ### TODO - Implement training forward pass.
    ###
    ### Hint: - Make sure padding characters do not contribute to the
    ###         cross-entropy loss. Check vocab.py to find the padding
    ###         token's index.
    ###       - char_sequence corresponds to the sequence x_1 ... x_{n+1}
    ###         (e.g., <START>,m,u,s,i,c,<END>). Read the handout about how
    ###         to construct the input and target sequences of CharDecoderLSTM.
    ###       - Carefully read the documentation for nn.CrossEntropyLoss and
    ###         our handout to see what this criterion has already included:
    ###         https://pytorch.org/docs/stable/nn.html#crossentropyloss

    # char_sequence: [length, b] => drop the end token => X_input: [length-1, b]
    X_input = char_sequence[:-1]
    # char_sequence: [length, b] => drop the start token => X_target: [length-1, b]
    X_target = char_sequence[1:]
    # X_input: [length-1, b], dec_hidden = (h_n, c_n): ([1, b, h], [1, b, h])
    # ==> s_t: [length-1, b, self.vocab_size], dec_hidden: ([1, b, h], [1, b, h])
    s_t, dec_hidden = self.forward(X_input, dec_hidden)
    # Look up the char pad index (0) so padding positions are ignored.
    vocab_entry = VocabEntry()
    idx_char_pad = vocab_entry.char_pad
    # Instantiate CrossEntropyLoss, which combines log-softmax and NLL loss.
    compute_loss = nn.CrossEntropyLoss(ignore_index=idx_char_pad, reduction='sum')
    # Reshape s_t for compute_loss: [length-1, b, vocab_size] => [N, C],
    # where N = (length-1)*b is the number of characters in the batch.
    s_t = s_t.reshape(s_t.shape[0] * s_t.shape[1], -1)
    # Reshape X_target: [length-1, b] => [N]
    X_target = X_target.reshape(-1)
    # Cross-entropy over all non-pad character positions, summed.
    loss_char_dec = compute_loss(s_t, X_target)
    return loss_char_dec

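# Hypothetical smoke test for train_forward(); the CharDecoder constructor
# arguments follow the assignment's (hidden_size, char_embedding_size,
# target_vocab) convention and are assumptions here.
decoder = CharDecoder(hidden_size=64, char_embedding_size=50,
                      target_vocab=VocabEntry())
length, batch_size = 6, 4
char_seq = torch.randint(1, 30, (length, batch_size))  # random non-pad char ids
loss = decoder.train_forward(char_seq)
loss.backward()  # the summed loss is differentiable
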
def question_1g_sanity_check(): """ Sanity check for to_input_tensor_char() function :return: """ print("-" * 80) print("Running Sanity Check for Question 1g") print("-" * 80) vocab = VocabEntry() print("Running test on a list of sentences") sentences = [['Human:', 'What', 'do', 'we', 'want?'], ['Computer:', 'Natural', 'language', 'processing!'], ['Human:', 'When', 'do', 'we', 'want', 'it?'], ['Computer:', 'When', 'do', 'we', 'want', 'what?']] word_ids = vocab.words2charindices(sentences) padded_sentences = pad_sents_char(word_ids, 0) a = np.asarray(padded_sentences).transpose((1, 0, 2)) a = torch.Tensor(a) b = vocab.to_input_tensor_char(sentences, device="cpu") assert a.equal(b), "Wrong!" print("Sanity Check Passed for Question 1g") print("-" * 80)
def test_question_1e_sanity_check(self):
    """ Sanity check for words2charindices() function.
    """
    vocab = VocabEntry()
    sentences = [["a", "b", "c?"], ["~d~", "c", "b", "a"]]
    small_ind = vocab.words2charindices(sentences)
    small_ind_gold = [[[1, 30, 2], [1, 31, 2], [1, 32, 70, 2]],
                      [[1, 85, 33, 85, 2], [1, 32, 2], [1, 31, 2], [1, 30, 2]]]
    assert small_ind == small_ind_gold, \
        "small test resulted in indices list {:}, expected {:}".format(small_ind, small_ind_gold)

    # print('Running test on single sentence')
    # sentence = ["right", "arcs", "only"]
    # single_ind = vocab.words2charindices(sentence)
    # single_ind_gold = [[[1, 47, 2], [1, 38, 2], [1, 36, 2], [1, 37, 2], [1, 49, 2]],
    #                    [[1, 30, 2], [1, 47, 2], [1, 32, 2], [1, 48, 2]],
    #                    [[1, 44, 2], [1, 43, 2], [1, 41, 2], [1, 54, 2]]]
    # assert single_ind == single_ind_gold, \
    #     "single sentence test resulted in indices list {:}, expected {:}".format(
    #         single_ind, single_ind_gold)

    print('Running test on large list of sentences')
    tgt_sents = [
        ['<s>', "Let's", 'start', 'by', 'thinking', 'about', 'the', 'member',
         'countries', 'of', 'the', 'OECD,', 'or', 'the', 'Organization', 'of',
         'Economic', 'Cooperation', 'and', 'Development.', '</s>'],
        ['<s>', 'In', 'the', 'case', 'of', 'gun', 'control,', 'we', 'really',
         'underestimated', 'our', 'opponents.', '</s>'],
        ['<s>', 'Let', 'me', 'share', 'with', 'those', 'of', 'you', 'here',
         'in', 'the', 'first', 'row.', '</s>'],
        ['<s>', 'It', 'suggests', 'that', 'we', 'care', 'about', 'the',
         'fight,', 'about', 'the', 'challenge.', '</s>'],
        ['<s>', 'A', 'lot', 'of', 'numbers', 'there.', 'A', 'lot', 'of',
         'numbers.', '</s>']]
    tgt_ind = vocab.words2charindices(tgt_sents)
    tgt_ind_gold = pickle.load(open('./sanity_check_en_es_data/1e_tgt.pkl', 'rb'))
    assert tgt_ind == tgt_ind_gold, \
        "target vocab test resulted in indices list {:}, expected {:}".format(tgt_ind, tgt_ind_gold)
