def hidden_depth(self, value):
    """Setter for hidden_depth: the number of hidden layers.

    Args:
        value(int): the depth to store; zero is allowed.

    Raises:
        InvalidArgumentError: if value is not an int or is negative.
    """
    if not isinstance(value, int):
        raise InvalidArgumentError("hidden_depth should be an integer.")
    if value < 0:
        # The check accepts 0, so the real constraint is non-negativity;
        # the previous message incorrectly claimed "positive".
        raise InvalidArgumentError("hidden_depth should be non-negative.")
    self._hidden_depth = value
def flow_depth(self, value):
    """Validate and store the number of normalizing-flow layers.

    Raises:
        InvalidArgumentError: if value is not an int or is below 1.
    """
    if not isinstance(value, int):
        raise InvalidArgumentError("flow_depth should be an integer.")
    if value < 1:
        raise InvalidArgumentError(
            "flow_depth should be strictly positive.")
    self._flow_depth = value
def use_prior(self, val):
    """Setter for the prior/posterior generation switch.

    Accepts a real bool, or the ints 0/1 which are coerced to bool;
    anything else raises InvalidArgumentError.
    """
    # Real booleans are stored as-is (note: bool is a subclass of int,
    # so this case must be tested first).
    if isinstance(val, bool):
        self._use_prior = val
        return
    if not isinstance(val, int):
        raise InvalidArgumentError("use_prior should be a boolean switch.")
    if val != 0 and val != 1:
        raise InvalidArgumentError("Only 0 or 1 can be interpreted as a boolean.")
    self._use_prior = bool(val)
def __init__(self, device, seq_len, kl_step, word_p, word_p_enc, parameter_p, encoder_p, drop_type, min_rate, unk_index, css, sparse, N, rnn_type, tie_in_out, beta, lamb, mmd, ann_mode, rate_mode, posterior, hinge_weight, k, ann_word, word_step, v_dim, x_dim, h_dim, z_dim, s_dim, l_dim, h_dim_enc, l_dim_enc, lagrangian, constraint, max_mmd, max_elbo, alpha): super(BowmanDecoder, self).__init__(device, seq_len, word_p, word_p_enc, parameter_p, encoder_p, drop_type, min_rate, unk_index, css, N, rnn_type, kl_step, beta, lamb, mmd, ann_mode, rate_mode, posterior, hinge_weight, ann_word, word_step, v_dim, x_dim, h_dim, s_dim, z_dim, l_dim, h_dim_enc, l_dim_enc, lagrangian, constraint, max_mmd, max_elbo, alpha) # The inference model, that is, the model that encodes data into an approximation of latent distribution self.encoder = BowmanEncoder(x_dim, h_dim_enc, z_dim, l_dim_enc, self.encoder_p, self.var_mask, self.posterior, k) # The generative model, that is, the model that decodes from the latent space to data, or generates novel data. self.emb = nn.Embedding(v_dim, x_dim, sparse=bool(sparse)) self.ztohidden = nn.Linear(z_dim, h_dim * l_dim) self.decoder = nn.GRU(x_dim, h_dim, l_dim, batch_first=True) self.linear = nn.Linear(h_dim, v_dim) # Tying weights in the input and output layer might increase performance (Inan et al., 2016) if tie_in_out: if self.h_dim != self.x_dim: raise InvalidArgumentError( "h_dim should match x_dim when tying weights.") self.linear.weight = self.emb.weight
def _vmf_sample_z(self, location, kappa, shape, det): """Reparameterized sample from a vMF distribution with location and concentration kappa.""" if location is None and kappa is None and shape is not None: if det: raise InvalidArgumentError("Cannot deterministically sample from the Uniform on a Hypersphere.") else: return HypersphericalUniform(self.z_dim - 1, device=self.device).sample(shape[:-1]) elif location is not None and kappa is not None: if det: return location if self.training: return VonMisesFisher(location, kappa).rsample() else: return VonMisesFisher(location, kappa).sample() else: raise InvalidArgumentError("Either provide location and kappa or neither with a shape.")
def _unpack_data(self, data, N): """Unpacks the input data to the forward pass and supplies missing data. Args: data(list of torch.Tensor): data provided to forward pass. We assume the following ordering [input, length(optional), mask(optional), reversed(optional), reversed_length(optional)] N(int): the number of data tensors to return. Can be 1-4. Returns: x_in(torch.Tensor): batch of input sequences. x_len(torch.Tensor): lengths of input sequences or None. x_mask(torch.Tensor): mask over the padding of input sequences that are not of max length or None. x_reverse(torch.Tensor): batch of reversed input sequences or None. """ # Checks and padding of data, so we have N tensors or None to process if not isinstance(data[0], torch.Tensor): raise InvalidArgumentError( "Data should contain a torch Tensor with data at the first position." ) if N < 1 or N > 4: raise InvalidArgumentError("N should be between 1 and 4.") data = (data + [ None, ] * N)[:N] for d in data: if not isinstance(d, torch.Tensor) and d is not None: raise InvalidArgumentError( "Data should contain only torch Tensors or None.") # If no mask is given, we create an empty mask as placeholder. if N > 2 and data[2] is None: data[2] = torch.ones(data[0].shape).to(self.device) if data[1] is not None: warn( "Data length is given without mask. Assuming all sentences are of the same length. Sentences shorter than {} words will not be masked." .format(self.seq_len)) # When the reversed data is not given, we assume no padding and reverse the sequence ourselves if N > 3 and data[3] is None: warn( "Reversed data not provided. We assume no padding and reverse the data cheaply." ) indices = torch.arange(data[0].shape[1] - 1, -1, -1) data[3] = data[0].index_select(1, indices) return data
def linear(self, val):
    """Setter for the output projection layer.

    Stores the layer and, when weight tying is enabled, shares the
    embedding weights with it (requires h_dim == x_dim).
    """
    self._linear = val
    if not self.tie_in_out:
        return
    if self.h_dim != self.x_dim:
        raise InvalidArgumentError(
            "h_dim should match x_dim when tying weights.")
    # Share one weight matrix between input embedding and output layer.
    self._linear.weight = self.emb.weight
def _gaussian_kl_divergence(self, mu_1, var_1, mu_2, var_2, mask, dim): """Computes the batch KL-divergence between two Gaussian distributions with diagonal covariance.""" if mu_2 is None and var_2 is None: return 0.5 * torch.sum((-torch.log(var_1) + var_1 + mu_1 ** 2 - 1) * mask.unsqueeze(dim), dim=dim) elif mu_2 is not None and var_2 is not None: return 0.5 * torch.sum((torch.log(var_2) - torch.log(var_1) + var_1 / var_2 + (mu_2 - mu_1) ** 2 / var_2 - 1) * mask.unsqueeze(dim), dim=dim) else: raise InvalidArgumentError("Either provide mu_2 and var_2 or neither.")
def _vmf_log_likelihood(self, sample, location=None, kappa=None):
    """Get the log likelihood of a sample under the vMF distribution with location and kappa.

    Without parameters, scores against the uniform distribution on the
    hypersphere instead.
    """
    have_location = location is not None
    have_kappa = kappa is not None
    if not have_location and not have_kappa:
        return HypersphericalUniform(self.z_dim - 1, device=self.device).log_prob(sample)
    if have_location and have_kappa:
        return VonMisesFisher(location, kappa).log_prob(sample)
    raise InvalidArgumentError("Provide either location and kappa or neither.")
def scale(self, val):
    # Setter for the KL scaling factor used during annealing.
    # Accepts a plain float or a (cuda) FloatTensor; plain floats are
    # promoted to a tensor with the same device/dtype as self.beta.
    if not isinstance(val, float) and not isinstance(val, torch.FloatTensor) and not isinstance(val, torch.cuda.FloatTensor):
        raise InvalidArgumentError("scale should be a float.")
    if isinstance(val, float):
        val = self.beta.new_tensor(val)
    # Only a training-mode model may move the scale; in eval mode the
    # setter is a no-op for in-range values.
    if self.training:
        if val <= self.beta and val >= 0.:
            # Within [0, beta]: accept the value as-is.
            self._scale = val
        elif val > self.beta and self._scale < self.beta:
            # Annealing overshot beta: clamp to beta exactly once.
            self._scale = torch.tensor(self.beta.item(), device=self.device)
        elif val < 0.:
            raise InvalidArgumentError("scale should be positive.")
        if val < 0.0001:
            # Floor the scale at 1e-4 so the KL term never vanishes.
            # NOTE(review): in the original single-line source the nesting of
            # this clamp is ambiguous; it is placed inside the training branch
            # here — confirm against the original formatting.
            self._scale = torch.tensor(0.0001, device=self.device)
def min_rate(self, val):
    """Setter for the minimum rate; negative values are clipped to zero.

    Accepts a float (converted to a tensor on self.device) or a (cuda)
    FloatTensor; anything else raises InvalidArgumentError.
    """
    if isinstance(val, float):
        val = torch.tensor(val, device=self.device)
    if not isinstance(val, (torch.FloatTensor, torch.cuda.FloatTensor)):
        raise InvalidArgumentError("min_rate should be a float or FloatTensor.")
    if val > 0:
        self._min_rate = val
        return
    # Non-positive rates collapse to a zero tensor on the target device.
    self._min_rate = torch.tensor(0., device=self.device)
def constraint(self, vals):
    """Setter for the list of Lagrangian constraint names.

    A single string is wrapped into a one-element list. Every entry must
    be one of 'mdr' or 'mmd'.
    """
    if isinstance(vals, str):
        vals = [vals]
    elif type(vals) != list:
        raise InvalidArgumentError('constraint should be a list or str')
    for name in vals:
        if name not in ('mdr', 'mmd'):
            raise UnknownArgumentError(
                'constraint {} unknown. Please choose [mdr, mmd].'.format(name))
    self._constraint = vals
def word_p(self, val):
    """Setter for the word dropout keep-probability.

    Values are clamped into (min_word_p, 1]: anything above 1 becomes 1,
    anything at or below min_word_p becomes min_word_p.
    """
    if isinstance(val, float):
        val = self.beta.new_tensor(val)
    if not isinstance(val, (torch.FloatTensor, torch.cuda.FloatTensor)):
        raise InvalidArgumentError("word_p should be a float or FloatTensor, not {}.".format(type(val)))
    if val > 1.:
        self._word_p = self.min_word_p.new_tensor(1.)
    elif val <= self.min_word_p:
        self._word_p = self.min_word_p.new_tensor(self.min_word_p.item())
    else:
        self._word_p = val
def _gaussian_sample_z(self, mu, var, shape, det): """Sample from a Gaussian distribution with mean mu and variance var.""" if mu is None and var is None and shape is not None: if det: return torch.zeros(shape, device=self.device) else: return self.error.sample(shape) elif mu is not None and var is not None: if det: return mu else: return mu + torch.sqrt(var) * self.error.sample(var.shape) else: raise InvalidArgumentError("Provide either mu and var or neither with a shape.")
def __init__(self, device, seq_len, word_p, word_p_enc, parameter_p, encoder_p, drop_type, min_rate, unk_index, css, N, rnn_type, kl_step, beta, lamb, mmd, ann_mode, rate_mode, posterior, hinge_weight, ann_word, word_step, v_dim, x_dim, h_dim, s_dim, z_dim, l_dim, h_dim_enc, l_dim_enc, lagrangian, constraint, max_mmd, max_elbo, alpha): super(GenerativeDecoder, self).__init__(device, seq_len, word_p, parameter_p, drop_type, unk_index, css, N, rnn_type, v_dim, x_dim, h_dim, s_dim, l_dim) # Recurrent dropout was never implemented for the VAE's because it doesn't work well if self.drop_type == "recurrent": raise InvalidArgumentError( "Recurrent dropout not implemented for this model. Please choose ['varied', 'shared']" ) # LSTM's are not supported because GRU's work equally well (with less parameters) if self.rnn_type == "LSTM": raise InvalidArgumentError( "LSTM not implemented for this model. Please choose ['GRU']") # Choose between the vMF-autoencoder and Gauss-autoencoder self.posterior = posterior # Encoder architecture settings self.encoder_p = torch.tensor(encoder_p, device=self.device, dtype=torch.float) self.h_dim_enc = h_dim_enc self.l_dim_enc = l_dim_enc self.word_p_enc = word_p_enc # Optimization hyperparameters self.min_rate = torch.tensor( min_rate, device=self.device, dtype=torch.float) # minimum Rate of hinge/FB self.beta = torch.tensor(beta, device=self.device, dtype=torch.float) # beta value of beta-VAE self.alpha = torch.tensor(alpha, device=self.device, dtype=torch.float) # alpha value of InfoVAE self.lamb = torch.tensor(lamb, device=self.device, dtype=torch.float) # lambda value of InfoVAE self.kl_step = torch.tensor( kl_step, device=self.device, dtype=torch.float) # Step size of KL annealing self.hinge_weight = torch.tensor( hinge_weight, device=self.device, dtype=torch.float) # Weight of hinge loss # Step size of word dropout annealing self.word_step = torch.tensor(word_step, device=self.device, dtype=torch.float) self.max_mmd = 
torch.tensor(max_mmd, device=self.device, dtype=torch.float) # Maximum MMD self.max_elbo = torch.tensor(max_elbo, device=self.device, dtype=torch.float) # Maximum ELBO # Optimization modes self.mmd = mmd # When true, we add the maximum mean discrepancy to the loss, and optimize the InfoVAE self.ann_mode = ann_mode # The mode of annealing self.rate_mode = rate_mode # How to force the VAE to encode a minimum rate self.ann_word = ann_word # Whether to anneal word dropout # The weight of the constraint in the Lagrangian dual function # Hardcoded start at 1.01 self.lagrangian = lagrangian self.constraint = constraint self.lag_weight = Parameter(torch.tensor([1.01] * len(self.constraint))) if self.ann_word: self.word_p = 1. self.z_dim = z_dim # We start the scale factor at zero, to be incremented linearly with kl_step every forward pass if self.ann_mode == "linear": self.scale = torch.tensor(self.kl_step.item() * self.beta.item(), device=self.device) # Or we start the scale at 10%, to be increased or decreased in 10% increments based on a desired rate elif self.ann_mode == "sfb": self.scale = torch.tensor(0.1 * self.beta.item(), device=self.device) # This switch should be manually managed from training/testing scripts to select generating from prior/posterior self.use_prior = False # N(0, I) error distribution to sample from latent spaces with reparameterized gradient self.error = Normal(torch.tensor(0., device=device), torch.tensor(1., device=device))
def seq_len(self, val):
    """Setter for the maximum sequence length; rejects non-integer values."""
    if isinstance(val, int):
        self._seq_len = val
    else:
        raise InvalidArgumentError("seq_len should be an integer.")