def base_plus_copy_indices(words, dynamic_vocabs, base_vocab, volatile=False):
    """Compute base + copy indices.

    Args:
        words (list[list[unicode]])
        dynamic_vocabs (list[HardCopyDynamicVocab])
        base_vocab (HardCopyVocab)
        volatile (bool)

    Returns:
        MultiVocabIndices
    """
    unk = base_vocab.UNK
    copy_seqs = []
    for seq, dyna_vocab in izip(words, dynamic_vocabs):
        word_to_copy = dyna_vocab.word_to_copy_token
        normal_copy_seq = []
        for w in seq:
            normal_copy_seq.append(word_to_copy.get(w, unk))
        copy_seqs.append(normal_copy_seq)

    # each SeqBatch.values has shape (batch_size, seq_length)
    base_indices = SequenceBatch.from_sequences(words, base_vocab, volatile=volatile)
    copy_indices = SequenceBatch.from_sequences(copy_seqs, base_vocab, volatile=volatile)

    assert_tensor_equal(base_indices.mask, copy_indices.mask)

    # has shape (batch_size, seq_length, 2)
    concat_values = torch.stack([base_indices.values, copy_indices.values], 2)

    return MultiVocabIndices(concat_values, base_indices.mask)
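# Hedged sketch (not part of the original module): the two-channel index
# layout that base_plus_copy_indices builds. The toy tensors below stand in
# for base_indices.values and copy_indices.values; stacking along dim 2
# yields shape (batch_size, seq_length, 2), one index per vocabulary.
import torch

base_values = torch.LongTensor([[4, 7, 2], [9, 0, 0]])     # base-vocab ids
copy_values = torch.LongTensor([[51, 1, 52], [53, 0, 0]])  # copy-token ids (1 = UNK here)
stacked = torch.stack([base_values, copy_values], 2)
print(stacked.size())  # torch.Size([2, 3, 2])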
def preprocess(self, source_words, insert_words, insert_exact_words,
               delete_words, delete_exact_words, edit_embed):
    """Preprocess a batch of edit examples into an EncoderInput.

    Args:
        source_words (list[list[unicode]]): a batch of source sequences
        insert_words (list[list[unicode]]): a batch of insert words
        insert_exact_words (list[list[unicode]]): a batch of insert words, used without noise
        delete_words (list[list[unicode]]): a batch of delete words
        delete_exact_words (list[list[unicode]]): a batch of delete words, used without noise
        edit_embed (np.ndarray | None): of shape (batch_size, edit_dim), or None

    Returns:
        EncoderInput
    """
    return EncoderInput(
        SequenceBatch.from_sequences(source_words, self.word_vocab),
        SequenceBatch.from_sequences(insert_words, self.word_vocab, min_seq_length=1),
        SequenceBatch.from_sequences(insert_exact_words, self.word_vocab, min_seq_length=1),
        SequenceBatch.from_sequences(delete_words, self.word_vocab, min_seq_length=1),
        SequenceBatch.from_sequences(delete_exact_words, self.word_vocab, min_seq_length=1),
        edit_embed)
def __init__(self, target_words, word_vocab, keep_rate):
    """Create decoder training input, applying word dropout to the input tokens.

    Args:
        target_words (list[list[unicode]])
        word_vocab (WordVocab)
        keep_rate (float): probability of keeping each input token
    """
    input_words = [[word_vocab.START] + tokens for tokens in target_words]  # prepend <start> token
    target_words_shifted = [tokens + [word_vocab.STOP] for tokens in target_words]  # append <stop> token
    input_words = SequenceBatch.from_sequences(input_words, word_vocab)
    self.input_words = self._drop_seq_batch(input_words, word_vocab, keep_rate)
    self.target_words = SequenceBatch.from_sequences(target_words_shifted, word_vocab)
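# Hedged sketch (an assumption, not the original _drop_seq_batch): one
# plausible word-dropout implementation, replacing each token with <unk>
# with probability (1 - keep_rate) so the decoder cannot rely on perfect
# previous-token information during training.
import random

def drop_tokens(sequences, unk_token, keep_rate):
    # sequences: list[list[unicode]]; returns new lists with dropped tokens
    return [[tok if random.random() < keep_rate else unk_token for tok in seq]
            for seq in sequences]

dropped = drop_tokens([[u'the', u'cat', u'sat']], u'<unk>', keep_rate=0.8)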
def __init__(self, target_words, word_vocab):
    """Create TrainDecoderInput.

    Args:
        target_words (list[list[unicode]])
        word_vocab (WordVocab)
    """
    input_words = [[word_vocab.START] + tokens for tokens in target_words]  # prepend <start> token
    target_words_shifted = [tokens + [word_vocab.STOP] for tokens in target_words]  # append <stop> token
    self.input_words = SequenceBatch.from_sequences(input_words, word_vocab)
    self.target_words = SequenceBatch.from_sequences(target_words_shifted, word_vocab)
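# Hedged usage sketch: the START/STOP shift on a toy batch ('<start>' and
# '<stop>' are illustrative stand-ins for word_vocab.START / word_vocab.STOP).
# The decoder reads input_words and is trained to predict target_words_shifted
# at the corresponding time steps.
target_words = [[u'a', u'b']]
input_words = [[u'<start>'] + tokens for tokens in target_words]          # [[u'<start>', u'a', u'b']]
target_words_shifted = [tokens + [u'<stop>'] for tokens in target_words]  # [[u'a', u'b', u'<stop>']]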
def _get_neighbor_indices(self, dom_elements, is_neighbor):
    """Compute neighbor indices.

    Args:
        dom_elements (list[list[DOMElement]]): batches of DOM elements,
            which may include PAD elements
        is_neighbor (Callable: DOMElement x DOMElement --> bool): True if
            two DOM elements are neighbors of each other, otherwise False

    Returns:
        SequenceBatch: of shape (total_dom_elems, max_neighbors)
    """
    dom_element_ids = [id(e) for e in flatten(dom_elements)]
    vocab = SuperSimpleVocab(dom_element_ids)

    neighbors_batch = []
    for dom_batch in dom_elements:
        for dom_elem in dom_batch:
            # Optimization: PAD elements never have neighbors
            if isinstance(dom_elem, DOMElementPAD):
                neighbors = []
            else:
                neighbors = [id(neighbor) for neighbor in dom_batch
                             if is_neighbor(dom_elem, neighbor)]
            neighbors_batch.append(neighbors)

    neighbor_indices = SequenceBatch.from_sequences(neighbors_batch, vocab, min_seq_length=1)
    return neighbor_indices
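# Hedged sketch (hypothetical predicate, not from the original code): one way
# a caller might define is_neighbor, using made-up top/left pixel attributes
# on the DOM elements purely for illustration.
def is_neighbor(elem_a, elem_b):
    # Distinct elements whose top edges are within 10 pixels of each other.
    return elem_a is not elem_b and abs(elem_a.top - elem_b.top) < 10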
def test_embed(self):
    sequences = [
        [],
        [1, 2, 3],
        [3, 3],
        [2]
    ]
    vocab = SimpleVocab([0, 1, 2, 3, 4])
    indices = SequenceBatch.from_sequences(sequences, vocab)

    embeds = GPUVariable(torch.FloatTensor([
        [0, 0],
        [2, 2],    # 1
        [3, 4],    # 2
        [-10, 1],  # 3
        [11, -1]   # 4
    ]))

    embedded = SequenceBatch.embed(indices, embeds)
    correct = np.array([
        [[0, 0], [0, 0], [0, 0]],
        [[2, 2], [3, 4], [-10, 1]],
        [[-10, 1], [-10, 1], [0, 0]],
        [[3, 4], [0, 0], [0, 0]]
    ], dtype=np.float32)
    assert_tensor_equal(embedded.values, correct)
def forward(self, utterances):
    """Embeds a batch of utterances.

    Args:
        utterances (list[list[unicode]]): list[unicode] is a list of tokens
            forming a sentence. list[list[unicode]] is a batch of sentences.

    Returns:
        Variable[FloatTensor]: batch x lstm_dim (concatenated first and
            last hidden states)
    """
    # Truncate to max_words tokens, append EOS, then look up indices
    utterances = [utterance[:self._max_words] + [EOS] for utterance in utterances]
    token_indices = SequenceBatch.from_sequences(utterances, self._token_embedder.vocab)

    # batch x seq_len x token_embed_dim
    token_embeds = self._token_embedder.embed_seq_batch(token_indices)

    bi_hidden_states = self._bilstm(token_embeds.split())
    final_states = torch.cat(bi_hidden_states.final_states, 1)
    hidden_states = SequenceBatch.cat(bi_hidden_states.combined_states)
    return self._attention(hidden_states, final_states).context
def forward(self, utterance):
    """Embeds a batch of utterances.

    Args:
        utterance (list[list[unicode]]): list[unicode] is a list of tokens
            forming a sentence. list[list[unicode]] is a batch of sentences.

    Returns:
        Variable[FloatTensor]: batch x lstm_dim (concatenated first and
            last hidden states)
        list[SequenceBatchElement]: list of length batch, where each
            element's values is seq_len x embed_dim and mask is seq_len,
            representing the hidden states of each token.
    """
    # Make keys hashable
    utterance = [tuple(utt) for utt in utterance]
    uncached_utterances = self._cache.uncached_keys(utterance)

    # Embed and cache the uncached utterances
    if len(uncached_utterances) > 0:
        token_indices = SequenceBatch.from_sequences(
            uncached_utterances, self._token_embedder.vocab)

        # batch x seq_len x token_embed_dim
        token_embeds = self._token_embedder.embed_seq_batch(token_indices)

        bi_hidden_states = self._bilstm(token_embeds.split())
        final_states = torch.cat(bi_hidden_states.final_states, 1)

        # Store the combined states in batch-major order for caching.
        combined_states = bi_hidden_states.combined_states
        # batch x seq_len x embed_dim
        combined_values = torch.stack([state.values for state in combined_states], 1)
        # batch x seq_len
        combined_masks = torch.stack([state.mask for state in combined_states], 1)
        assert len(combined_values) == len(combined_masks)

        combined_states_by_batch = [
            SequenceBatchElement(value, mask)
            for value, mask in zip(combined_values, combined_masks)]
        assert len(final_states) == len(combined_states_by_batch)

        self._cache.cache(
            list(uncached_utterances),
            list(zip(final_states, combined_states_by_batch)))

    final_states, combined_states = zip(*self._cache.get(utterance))
    return torch.stack(final_states, 0), combined_states
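# Hedged sketch (an assumption, not the original cache class): the minimal
# contract the method above relies on -- uncached_keys, cache, and get --
# backed by a plain dict keyed on the hashable utterance tuples.
class SimpleCache(object):
    def __init__(self):
        self._store = {}

    def uncached_keys(self, keys):
        # Keys not yet cached, preserving input order.
        return [k for k in keys if k not in self._store]

    def cache(self, keys, values):
        self._store.update(zip(keys, values))

    def get(self, keys):
        return [self._store[k] for k in keys]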
def embed_tokens(token_embedder, max_words, node_texts):
    """Embed a batch of token sequences.

    Args:
        token_embedder (TokenEmbedder)
        max_words (int)
        node_texts (list[list[str]]): a batch of token sequences

    Returns:
        SequenceBatch: of shape (batch_size, max_words, token_embed_dim)
    """
    # Truncate to max_words - 1 tokens, append EOS, then look up indices
    texts = [text[:max_words - 1] + [EOS] for text in node_texts]
    token_indices = SequenceBatch.from_sequences(
        texts, token_embedder.vocab, min_seq_length=max_words)

    # batch x seq_len x token_embed_dim
    token_embeds = token_embedder.embed_seq_batch(token_indices)
    return token_embeds
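# Hedged usage sketch: the truncation above keeps at most max_words - 1
# tokens and reserves the last slot for EOS ('<eos>' is an illustrative
# stand-in), so no sequence exceeds max_words before padding.
max_words = 4
tokens = [u'a', u'b', u'c', u'd', u'e']
truncated = tokens[:max_words - 1] + [u'<eos>']  # [u'a', u'b', u'c', u'<eos>']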
def input_embeds_list(self):
    sequences = [
        [1, 2, 3],
        [8, 4, 2, 1, 1],
        [],
    ]
    # token 1 maps to embedding [1], 2 maps to [2], and so on...
    vocab = SimpleVocab([1, 2, 3, 4, 5, 6, 7, 8])
    array = np.expand_dims(np.array([1, 2, 3, 4, 5, 6, 7, 8], dtype=np.float32), 1)
    token_embedder = TokenEmbedder(Bunch(vocab=vocab, array=array))
    seq_embeds = token_embedder.embed_seq_batch(SequenceBatch.from_sequences(sequences, vocab))
    return seq_embeds.split()
def test_from_sequences(self, sequences, vocab):
    seq_batch = SequenceBatch.from_sequences(sequences, vocab)
    assert_tensor_equal(seq_batch.values, np.array([
        [1, 2, 2, 3],
        [3, 0, 0, 0],
        [0, 0, 0, 0],
    ], dtype=np.int32))
    assert_tensor_equal(seq_batch.mask, np.array([
        [1, 1, 1, 1],
        [1, 0, 0, 0],
        [0, 0, 0, 0],
    ], dtype=np.float32))
def embed(self, sequences):
    for seq in sequences:
        if len(seq) == 0:
            raise ValueError("Cannot embed empty sequence.")

    token_indices = SequenceBatch.from_sequences(sequences, self.vocab, min_seq_length=1)
    # SequenceBatch of shape (batch_size, max_seq_length, word_dim)
    token_embeds = self.token_embedder.embed_seq_batch(token_indices)

    if self.pool == 'sum':
        pooled_token_embeds = SequenceBatch.reduce_sum(token_embeds)   # (batch_size, word_dim)
    elif self.pool == 'mean':
        pooled_token_embeds = SequenceBatch.reduce_mean(token_embeds)  # (batch_size, word_dim)
    elif self.pool == 'max':
        pooled_token_embeds = SequenceBatch.reduce_max(token_embeds)   # (batch_size, word_dim)
    else:
        raise ValueError(self.pool)

    seq_embeds = self.transform(pooled_token_embeds)  # (batch_size, embed_dim)
    assert seq_embeds.size()[1] == self.embed_dim
    return seq_embeds
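# Hedged sketch (an assumption about SequenceBatch internals): masked mean
# pooling, the semantics reduce_mean needs so that padding positions do not
# dilute the average over real tokens.
import torch

values = torch.FloatTensor([[[1., 1.], [3., 3.], [0., 0.]]])  # batch x seq x dim
mask = torch.FloatTensor([[1., 1., 0.]])                      # batch x seq
summed = (values * mask.unsqueeze(2)).sum(1)                  # batch x dim
mean = summed / mask.sum(1, keepdim=True)                     # [[2., 2.]]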
def forward(self, utterances):
    """Embeds a batch of utterances.

    Args:
        utterances (list[list[str]]): list[str] is a list of tokens forming
            a sentence. list[list[str]] is a batch of sentences.

    Returns:
        Tensor: batch x word_embed_dim (average of word vectors)
    """
    # Truncate to max_words tokens, append EOS, then look up indices
    utterances = [utterance[:self._max_words] + [EOS] for utterance in utterances]
    token_indices = SequenceBatch.from_sequences(utterances, self._token_embedder.vocab)

    # batch x seq_len x token_embed_dim
    token_embeds = self._token_embedder.embed_seq_batch(token_indices)

    # batch x token_embed_dim
    averaged = SequenceBatch.reduce_mean(token_embeds)
    return averaged
def base_plus_copy_indices(words, dynamic_vocabs, base_vocab, volatile=False):
    """Compute base + copy indices.

    Args:
        words (list[list[unicode]])
        dynamic_vocabs (list[HardCopyDynamicVocab])
        base_vocab (HardCopyVocab)
        volatile (bool)

    Returns:
        MultiVocabIndices
    """
    unk = base_vocab.UNK
    copy_seqs = []
    for seq, dyna_vocab in izip(words, dynamic_vocabs):
        word_to_copy = dyna_vocab.word_to_copy_token
        normal_copy_seq = []
        try:
            for w in seq:
                normal_copy_seq.append(word_to_copy.get(w, unk))
        except Exception as e:
            # A nested sequence slipped through (unhashable lookup key);
            # flatten it and restart the lookup from scratch.
            print(e)
            normal_copy_seq = []
            for w in flatten(seq):
                normal_copy_seq.append(word_to_copy.get(w, unk))
        copy_seqs.append(normal_copy_seq)

    # each SeqBatch.values has shape (batch_size, seq_length)
    base_indices = SequenceBatch.from_sequences(words, base_vocab, volatile=volatile)
    copy_indices = SequenceBatch.from_sequences(copy_seqs, base_vocab, volatile=volatile)

    try:
        assert_tensor_equal(base_indices.mask, copy_indices.mask)
    except Exception:
        # The two batches were padded to different lengths; right-pad the
        # shorter one with zeros so values and masks line up.
        diff = abs(base_indices.mask.size(1) - copy_indices.mask.size(1))

        def pad_right(seq_batch):
            values, mask = seq_batch.values, seq_batch.mask
            value_pad = torch.zeros((values.size(0), diff),
                                    dtype=values.dtype, device=values.device)
            mask_pad = torch.zeros((mask.size(0), diff),
                                   dtype=mask.dtype, device=mask.device)
            return SequenceBatch(torch.cat((values, value_pad), dim=1),
                                 torch.cat((mask, mask_pad), dim=1))

        if base_indices.mask.size(1) < copy_indices.mask.size(1):
            base_indices = pad_right(base_indices)
        else:
            copy_indices = pad_right(copy_indices)

    # has shape (batch_size, seq_length, 2)
    concat_values = torch.stack([base_indices.values, copy_indices.values], 2)

    return MultiVocabIndices(concat_values, base_indices.mask)
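# Hedged sketch: the right-padding repair above, shown on toy masks.
import torch

mask_a = torch.FloatTensor([[1, 1, 1]])     # padded to length 3
mask_b = torch.FloatTensor([[1, 1, 1, 1]])  # padded to length 4
diff = mask_b.size(1) - mask_a.size(1)
pad = torch.zeros((mask_a.size(0), diff), dtype=mask_a.dtype)
mask_a = torch.cat((mask_a, pad), dim=1)    # now length 4; new positions are masked out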
def preprocess(self, examples):
    """Convert a batch of token sequences into a SequenceBatch over self.word_vocab."""
    return SequenceBatch.from_sequences(examples, self.word_vocab)
def embed(self, sequences):
    # (batch_size, max_seq_length)
    sequence_indices = SequenceBatch.from_sequences(sequences, self.vocab, min_seq_length=1)
    # (batch_size, max_seq_length, word_dim)
    sequence_embeds = self.token_embedder.embed_seq_batch(sequence_indices)
    sequence_embeds_list = sequence_embeds.split()
    hidden_states_list = self.source_encoder(sequence_embeds_list)
    return hidden_states_list[-1].values  # (batch_size, embed_dim)
def test_min_seq_length(self, vocab):
    seq_batch = SequenceBatch.from_sequences([[], [], []], vocab, min_seq_length=2)
    assert_tensor_equal(seq_batch.values, np.zeros((3, 2)))
    assert_tensor_equal(seq_batch.mask, np.zeros((3, 2)))
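# Hedged note: min_seq_length guards against zero-width values/mask tensors
# when every sequence in the batch is empty, which would break downstream
# tensor ops. A sketch of the padded width it guarantees:
batch = [[], []]
min_seq_length = 2
max_len = max([min_seq_length] + [len(seq) for seq in batch])  # 2, never 0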