def forward(self, x):
    x = self.embed(x)
    if self.cove:
        # Encode with CoVe: both MT-LSTM layers plus residual GloVe embeddings.
        outputs_both_layer_cove_with_glove = MTLSTM(
            n_vocab=None, vectors=None, layer0=True, residual_embeddings=True)
        outputs_both_layer_cove_with_glove.cuda()
        x = outputs_both_layer_cove_with_glove(x, [x.shape[1]] * x.shape[0])
    # Convolution over each filter size, followed by max-over-time pooling.
    x = x.unsqueeze(1)
    x = [F.relu(conv(x)).squeeze(3) for conv in self.convs1]
    x = [F.max_pool1d(i, i.size(2)).squeeze(2) for i in x]
    x = torch.cat(x, 1)
    x = self.dropout(x)
    output = self.fully_connected(x)
    return output
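# For context, the forward() above assumes a module set up roughly as follows.
# This is a minimal sketch: the vocabulary size, filter sizes, filter count,
# dropout rate, and class count are illustrative placeholders, not the
# hyperparameters used in the original experiments.
import torch
import torch.nn as nn
import torch.nn.functional as F


class TextCNN(nn.Module):
    def __init__(self, vocab_size=10000, embed_dim=300, num_classes=2,
                 num_filters=100, filter_sizes=(3, 4, 5), use_cove=False):
        super().__init__()
        self.cove = use_cove
        self.embed = nn.Embedding(vocab_size, embed_dim)
        # With CoVe (layer0=True, residual_embeddings=True) each of the two
        # bidirectional MT-LSTM layers contributes 600 dimensions on top of the
        # residual embeddings, so the convolutions see embed_dim + 1200 channels.
        conv_in_dim = embed_dim + 1200 if use_cove else embed_dim
        self.convs1 = nn.ModuleList(
            [nn.Conv2d(1, num_filters, (k, conv_in_dim)) for k in filter_sizes])
        self.dropout = nn.Dropout(0.5)
        self.fully_connected = nn.Linear(num_filters * len(filter_sizes), num_classes)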
def compute_torch_values(inputs, embeddings):
    model = MTLSTM(n_vocab=embeddings.shape[0],
                   vectors=torch.from_numpy(embeddings.astype(np.float32)))
    model.cuda(0)
    model_inputs = Variable(torch.from_numpy(inputs.astype(np.int64)))
    # Every sequence in the batch is assumed to be padded to the same length.
    lengths = torch.from_numpy(
        np.ones((inputs.shape[0],), dtype=np.int64) * inputs.shape[1])
    cove_outputs = model(model_inputs.cuda(), lengths=lengths.cuda())
    torch_output = cove_outputs.data.cpu().numpy()
    print("Torch output shape", torch_output.shape)
    return torch_output
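# A minimal sketch of how compute_torch_values might be exercised; the vocabulary
# size, sequence length, and random embedding matrix below are placeholders rather
# than values taken from the original experiments. A CUDA device is required, since
# compute_torch_values moves the model and inputs to GPU.
import numpy as np

vocab_size, embed_dim, batch_size, seq_len = 1000, 300, 4, 12
fake_embeddings = np.random.randn(vocab_size, embed_dim).astype(np.float32)
fake_token_ids = np.random.randint(0, vocab_size, size=(batch_size, seq_len))

cove_vectors = compute_torch_values(fake_token_ids, fake_embeddings)
# With the default MTLSTM settings this should be (batch_size, seq_len, 600),
# i.e. only the second (last) bidirectional MT-LSTM layer is returned.
print(cove_vectors.shape)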
class tmcove(Model):
    def load(self, vectors):
        self.vectors = vectors
        self.model = MTLSTM(n_vocab=len(vectors.keys()), vectors=vectors)
        self.model.cuda()

    def train(self, X, Y):
        pass

    def predict(self, X):
        X, Y = self.input_function(X, [])
        return [[get_word2vec(token, self.vectors) for token in tokens]
                for tokens in X]
print('Generating train, dev, test splits')
train, dev, test = datasets.IWSLT.splits(root=args.data, exts=['.en', '.de'],
                                          fields=[inputs, inputs])
train_iter, dev_iter, test_iter = data.Iterator.splits(
    (train, dev, test), batch_size=100,
    device=torch.device(args.device) if args.device >= 0 else None)

print('Building vocabulary')
inputs.build_vocab(train, dev, test)
inputs.vocab.load_vectors(vectors=GloVe(name='840B', dim=300, cache=args.embeddings))

outputs_last_layer_cove = MTLSTM(n_vocab=len(inputs.vocab),
                                 vectors=inputs.vocab.vectors,
                                 model_cache=args.embeddings)
outputs_both_layer_cove = MTLSTM(n_vocab=len(inputs.vocab),
                                 vectors=inputs.vocab.vectors,
                                 layer0=True,
                                 model_cache=args.embeddings)
outputs_both_layer_cove_with_glove = MTLSTM(n_vocab=len(inputs.vocab),
                                            vectors=inputs.vocab.vectors,
                                            layer0=True,
                                            residual_embeddings=True,
                                            model_cache=args.embeddings)
if args.device >= 0:
    outputs_last_layer_cove.cuda()
    outputs_both_layer_cove.cuda()
    outputs_both_layer_cove_with_glove.cuda()

train_iter.init_epoch()
print('Generating CoVe')
for batch_idx, batch in enumerate(train_iter):
    if batch_idx > 0:
        break
    last_layer_cove = outputs_last_layer_cove(*batch.src)
    print(last_layer_cove.size())
    first_then_last_layer_cove = outputs_both_layer_cove(*batch.src)
    print(first_then_last_layer_cove.size())
    glove_then_first_then_last_layer_cove = outputs_both_layer_cove_with_glove(*batch.src)
    print(glove_then_first_then_last_layer_cove.size())
    # The last MT-LSTM layer should match the last 600 dimensions of the two-layer output.
    assert torch.allclose(last_layer_cove, first_then_last_layer_cove[:, :, -600:])
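    # A complementary sanity check, based on the concatenation order implied by the
    # variable name (GloVe embeddings first, then both MT-LSTM layers); this extra
    # assertion is an assumption about that ordering, not part of the original snippet.
    assert torch.allclose(first_then_last_layer_cove,
                          glove_then_first_then_last_layer_cove[:, :, 300:])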
import torch
from torchtext import data
from torchtext import datasets

from cove import MTLSTM

inputs = data.Field(lower=True, include_lengths=True, batch_first=True)
answers = data.Field(sequential=False)

print('Generating train, dev, test splits')
train, dev, test = datasets.SNLI.splits(inputs, answers)

print('Building vocabulary')
inputs.build_vocab(train, dev, test)
inputs.vocab.load_vectors(wv_type='glove.840B', wv_dim=300)
answers.build_vocab(train)

model = MTLSTM(n_vocab=len(inputs.vocab), vectors=inputs.vocab.vectors)
model.cuda(0)

train_iter, dev_iter, test_iter = data.BucketIterator.splits(
    (train, dev, test), batch_size=100, device=0)

train_iter.init_epoch()
print('Generating CoVe')
for batch_idx, batch in enumerate(train_iter):
    model.train()
    cove_premise = model(*batch.premise)
    cove_hypothesis = model(*batch.hypothesis)
def forward(self,  # type: ignore
            tokens: Dict[str, torch.LongTensor],
            label: torch.LongTensor = None) -> Dict[str, torch.Tensor]:
    # pylint: disable=arguments-differ
    """
    Parameters
    ----------
    tokens : Dict[str, torch.LongTensor], required
        The output of ``TextField.as_array()``.
    label : torch.LongTensor, optional (default = None)
        A variable representing the label for each instance in the batch.

    Returns
    -------
    An output dictionary consisting of:
    class_probabilities : torch.FloatTensor
        A tensor of shape ``(batch_size, num_classes)`` representing a
        distribution over the label classes for each instance.
    loss : torch.FloatTensor, optional
        A scalar loss to be optimised.
    """
    text_mask = util.get_text_field_mask(tokens).float()
    # Pop elmo tokens, since elmo embedder should not be present.
    elmo_tokens = tokens.pop("elmo", None)
    if tokens:
        embedded_text = self._text_field_embedder(tokens)
    else:
        # Only using "elmo" for input.
        embedded_text = None

    # Add the "elmo" key back to "tokens" if not None, since the tests and the
    # subsequent training epochs rely on "tokens" not being modified during forward().
    if elmo_tokens is not None:
        tokens["elmo"] = elmo_tokens

    # Create ELMo embeddings if applicable.
    if self._elmo:
        if elmo_tokens is not None:
            elmo_representations = self._elmo(elmo_tokens)["elmo_representations"]
            # Pop from the end is more performant with list.
            if self._use_integrator_output_elmo:
                integrator_output_elmo = elmo_representations.pop()
            if self._use_input_elmo:
                input_elmo = elmo_representations.pop()
            assert not elmo_representations
        else:
            raise ConfigurationError(
                "Model was built to use Elmo, but input text is not tokenized for Elmo.")

    if self._use_input_elmo:
        if embedded_text is not None:
            embedded_text = torch.cat([embedded_text, input_elmo], dim=-1)
        else:
            embedded_text = input_elmo

    # The `cnn` / `lstm` flags select which pretrained MT encoder supplies the
    # contextual embeddings. When using embeddings from the MT-CNN encoder, the
    # hardcoded vocabulary sizes can be initialised appropriately.
    if cnn:
        embedded_text_cnn = embedded_text
        enc = Encoder(7855, 300, 600, 5, 3, 0.25, 'cuda')
        dec = Decoder(5893, 300, 600, 5, 3, 0.25, 1, 'cuda')
        cnn_model = Seq2Seq(enc, dec).cuda()
        cnn_model.load_state_dict(torch.load('../cnn_lstm_model.pt'))
        cnn_model.eval()
        v1, v2 = cnn_model.encoder(embedded_text[:, :, :256])
        v3 = torch.cat((v1, v2), 2)
        embedded_text = torch.cat((embedded_text_cnn, v3), 2)
    # When using embeddings from the MT-LSTM encoder (load either the saved model
    # from the paper or the reproduced model).
    elif lstm:
        outputs_both_layer_cove_with_glove = MTLSTM(n_vocab=None,
                                                    vectors=None,
                                                    layer0=True,
                                                    residual_embeddings=True)
        outputs_both_layer_cove_with_glove.cuda()
        embedded_text = outputs_both_layer_cove_with_glove(
            embedded_text, [embedded_text.shape[1]] * embedded_text.shape[0])

    dropped_embedded_text = self._embedding_dropout(embedded_text)
    pre_encoded_text = self._pre_encode_feedforward(dropped_embedded_text)
    encoded_tokens = self._encoder(pre_encoded_text, text_mask)

    # Compute biattention. This is a special case since the inputs are the same.
    attention_logits = encoded_tokens.bmm(encoded_tokens.permute(0, 2, 1).contiguous())
    attention_weights = util.masked_softmax(attention_logits, text_mask)
    encoded_text = util.weighted_sum(encoded_tokens, attention_weights)

    # Build the input to the integrator.
    integrator_input = torch.cat([encoded_tokens,
                                  encoded_tokens - encoded_text,
                                  encoded_tokens * encoded_text], 2)
    integrated_encodings = self._integrator(integrator_input, text_mask)

    # Concatenate ELMo representations to integrated_encodings if specified.
    if self._use_integrator_output_elmo:
        integrated_encodings = torch.cat([integrated_encodings,
                                          integrator_output_elmo], dim=-1)

    # Simple pooling layers.
    max_masked_integrated_encodings = util.replace_masked_values(
        integrated_encodings, text_mask.unsqueeze(2), -1e7)
    max_pool = torch.max(max_masked_integrated_encodings, 1)[0]
    min_masked_integrated_encodings = util.replace_masked_values(
        integrated_encodings, text_mask.unsqueeze(2), +1e7)
    min_pool = torch.min(min_masked_integrated_encodings, 1)[0]
    mean_pool = torch.sum(integrated_encodings, 1) / torch.sum(text_mask, 1, keepdim=True)

    # Self-attentive pooling layer.
    # Run through linear projection. Shape: (batch_size, sequence_length, 1).
    # Then remove the last dimension to get the proper attention shape (batch_size, sequence_length).
    self_attentive_logits = self._self_attentive_pooling_projection(
        integrated_encodings).squeeze(2)
    self_weights = util.masked_softmax(self_attentive_logits, text_mask)
    self_attentive_pool = util.weighted_sum(integrated_encodings, self_weights)

    pooled_representations = torch.cat([max_pool, min_pool, mean_pool, self_attentive_pool], 1)
    pooled_representations_dropped = self._integrator_dropout(pooled_representations)

    logits = self._output_layer(pooled_representations_dropped)
    class_probabilities = F.softmax(logits, dim=-1)

    output_dict = {'logits': logits, 'class_probabilities': class_probabilities}
    if label is not None:
        loss = self.loss(logits, label)
        for metric in self.metrics.values():
            metric(logits, label)
        output_dict["loss"] = loss

    return output_dict