class LinearVarianceUnif(ModuleWrapper):
    def __init__(self, in_features, out_features, bias=True):
        super(LinearVarianceUnif, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.W = Parameter(torch.Tensor(out_features, in_features))
        if bias:
            self.bias = Parameter(torch.Tensor(1, out_features))
        else:
            self.register_parameter('bias', None)
        self.reset_parameters()

    def reset_parameters(self):
        stdv = 1. / math.sqrt(self.W.size(1))
        self.W.data.uniform_(-stdv, stdv)
        if self.bias is not None:
            self.bias.data.zero_()

    def forward(self, x):
        if self.training:
            eps = Variable(self.W.data.new(self.W.size()).uniform_() - 0.5)
        else:
            eps = 0.0
        output = F.linear(x, self.W * eps)
        if self.bias is not None:
            output = output + self.bias
        return output

    def __repr__(self):
        return self.__class__.__name__ + '(' \
            + 'in_features=' + str(self.in_features) \
            + ', out_features=' + str(self.out_features) \
            + ', bias=' + str(self.bias is not None) + ')'
def __init__(self, in_channels, out_channels, kernel_size, alpha_shape, stride=1,
             padding=0, dilation=1, prior='loguni', bias=True):
    super(ConvVDO, self).__init__()
    self.in_channels = in_channels
    self.out_channels = out_channels
    self.kernel_size = (kernel_size, kernel_size)
    self.stride = stride
    self.padding = padding
    self.dilation = dilation
    self.alpha_shape = alpha_shape
    self.groups = 1
    self.weight = Parameter(torch.Tensor(
        out_channels, in_channels, *self.kernel_size))
    if bias:
        self.bias = Parameter(torch.Tensor(1, out_channels, 1, 1))
    else:
        self.register_parameter('bias', None)
    self.op_bias = lambda input, kernel: F.conv2d(input, kernel, self.bias, self.stride,
                                                  self.padding, self.dilation, self.groups)
    self.op_nobias = lambda input, kernel: F.conv2d(input, kernel, None, self.stride,
                                                    self.padding, self.dilation, self.groups)
    self.log_alpha = Parameter(torch.Tensor(*alpha_shape))
    self.reset_parameters()
    self.zero_mean = False
    self.permute_sigma = False
    self.prior = prior
    if prior == 'loguni':
        self.kl_fun = metrics.kl_loguni
    else:
        self.kl_fun = metrics.kl_ard
class LinearVDO(ModuleWrapper):
    def __init__(self, in_features, out_features, prior='loguni', alpha_shape=(1, 1), bias=True):
        super(LinearVDO, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.alpha_shape = alpha_shape
        self.W = Parameter(torch.Tensor(out_features, in_features))
        self.log_alpha = Parameter(torch.Tensor(*alpha_shape))
        if bias:
            self.bias = Parameter(torch.Tensor(1, out_features))
        else:
            self.register_parameter('bias', None)
        self.reset_parameters()
        self.zero_mean = False
        self.permute_sigma = False
        self.prior = prior
        if prior == 'loguni':
            self.kl_fun = metrics.kl_loguni
        else:
            self.kl_fun = metrics.kl_ard

    def reset_parameters(self):
        stdv = 1. / math.sqrt(self.W.size(1))
        self.W.data.uniform_(-stdv, stdv)
        self.log_alpha.data.fill_(-5.0)
        if self.bias is not None:
            self.bias.data.zero_()

    def forward(self, x):
        if self.zero_mean:
            lrt_mean = 0.0
        else:
            lrt_mean = F.linear(x, self.W)
        if self.bias is not None:
            lrt_mean = lrt_mean + self.bias
        sigma2 = Variable.exp(self.log_alpha) * self.W * self.W
        if self.permute_sigma:
            sigma2 = sigma2.view(-1)[torch.randperm(self.in_features * self.out_features).cuda()].view(
                self.out_features, self.in_features)
        lrt_std = Variable.sqrt(1e-16 + F.linear(x * x, sigma2))
        if self.training:
            eps = Variable(lrt_std.data.new(lrt_std.size()).normal_())
        else:
            eps = 0.0
        return lrt_mean + lrt_std * eps

    def kl_reg(self):
        return self.W.nelement() * self.kl_fun(self.log_alpha) / self.log_alpha.nelement()

    def __repr__(self):
        return self.__class__.__name__ + '(' \
            + 'in_features=' + str(self.in_features) \
            + ', out_features=' + str(self.out_features) \
            + ', alpha_shape=' + str(self.alpha_shape) \
            + ', prior=' + self.prior \
            + ', bias=' + str(self.bias is not None) + ')'
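# Usage sketch (illustrative, not from the original repository): during training a
# LinearVDO layer samples its pre-activations with the local reparameterization
# trick and exposes its KL penalty through kl_reg(). A typical stochastic
# variational objective sums the KL terms of all such layers; model, x, y, beta
# and num_data below are hypothetical placeholders.
def sgvlb_loss(model, x, y, beta=1.0, num_data=60000):
    logits = model(x)
    # Sum the KL contribution of every variational-dropout layer in the model.
    kl = sum(m.kl_reg() for m in model.modules() if hasattr(m, 'kl_reg'))
    return F.cross_entropy(logits, y) + beta * kl / num_data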
class RepNormal(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.mu = Parameter(FloatTensor([0.0]))
        self.log_variance = Parameter(FloatTensor([0.0]))

    def __call__(self):
        z = Variable(torch.randn(1))
        return self.mu + self.log_variance.exp() * z

    def _repr_pretty_(self, p, cycle):
        p.text("mu = {}".format(self.mu))
        p.text("std = {}".format(self.log_variance.exp()))
def __init__(self, X, y, kernel, Xu, likelihood, mean_function=None,
             latent_shape=None, num_data=None, whiten=False, jitter=1e-6,
             name="SVGP"):
    super(VariationalSparseGP, self).__init__(X, y, kernel, mean_function, jitter, name)
    self.likelihood = likelihood

    self.num_data = num_data if num_data is not None else self.X.shape[0]
    self.whiten = whiten

    self.Xu = Parameter(Xu)

    y_batch_shape = self.y.shape[:-1] if self.y is not None else torch.Size([])
    self.latent_shape = latent_shape if latent_shape is not None else y_batch_shape

    M = self.Xu.shape[0]
    u_loc_shape = self.latent_shape + (M,)
    u_loc = self.Xu.new_zeros(u_loc_shape)
    self.u_loc = Parameter(u_loc)

    u_scale_tril_shape = self.latent_shape + (M, M)
    Id = torch.eye(M, out=self.Xu.new_empty(M, M))
    u_scale_tril = Id.expand(u_scale_tril_shape)
    self.u_scale_tril = Parameter(u_scale_tril)
    self.set_constraint("u_scale_tril", constraints.lower_cholesky)

    self._sample_latent = True
class LinearVariance(ModuleWrapper):
    def __init__(self, in_features, out_features, bias=True):
        super(LinearVariance, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.sigma = Parameter(torch.Tensor(out_features, in_features))
        if bias:
            self.bias = Parameter(torch.Tensor(1, out_features))
        else:
            self.register_parameter('bias', None)
        self.reset_parameters()

    def reset_parameters(self):
        stdv = 1. / math.sqrt(self.sigma.size(1))
        self.sigma.data.uniform_(-stdv, stdv)
        if self.bias is not None:
            self.bias.data.zero_()

    def forward(self, x):
        lrt_mean = self.bias
        lrt_std = torch.sqrt_(1e-16 + F.linear(x * x, self.sigma * self.sigma))
        if self.training:
            eps = Variable(lrt_std.data.new(lrt_std.size()).normal_())
        else:
            eps = 0.0
        return lrt_mean + eps * lrt_std
class LinearAttention(Attention):
    """
    This ``Attention`` module performs a dot product between a vector of weights and some
    combination of the two input vectors, followed by an (optional) activation function.  The
    combination used is configurable.

    If the two vectors are ``x`` and ``y``, we allow the following kinds of combinations: ``x``,
    ``y``, ``x*y``, ``x+y``, ``x-y``, ``x/y``, where each of those binary operations is performed
    elementwise.  You can list as many combinations as you want, comma separated.  For example, you
    might give ``x,y,x*y`` as the ``combination`` parameter to this class.  The computed similarity
    function would then be ``w^T [x; y; x*y] + b``, where ``w`` is a vector of weights, ``b`` is a
    bias parameter, and ``[;]`` is vector concatenation.

    Note that if you want a bilinear similarity function with a diagonal weight matrix W, where the
    similarity function is computed as `x * w * y + b` (with `w` the diagonal of `W`), you can
    accomplish that with this class by using "x*y" for `combination`.

    Parameters
    ----------
    tensor_1_dim : ``int``
        The dimension of the first tensor, ``x``, described above.  This is ``x.size()[-1]`` - the
        length of the vector that will go into the similarity computation.  We need this so we can
        build weight vectors correctly.
    tensor_2_dim : ``int``
        The dimension of the second tensor, ``y``, described above.  This is ``y.size()[-1]`` - the
        length of the vector that will go into the similarity computation.  We need this so we can
        build weight vectors correctly.
    combination : ``str``, optional (default="x,y")
        Described above.
    activation : ``Activation``, optional (default=linear (i.e. no activation))
        An activation function applied after the ``w^T * [x;y] + b`` calculation.  Default is no
        activation.
    """
    def __init__(self,
                 tensor_1_dim: int,
                 tensor_2_dim: int,
                 combination: str = 'x,y',
                 activation: Activation = None,
                 normalize: bool = True) -> None:
        super().__init__(normalize)
        self._combination = combination
        combined_dim = util.get_combined_dim(combination, [tensor_1_dim, tensor_2_dim])
        self._weight_vector = Parameter(torch.Tensor(combined_dim))
        self._bias = Parameter(torch.Tensor(1))
        self._activation = activation or Activation.by_name('linear')()
        self.reset_parameters()

    def reset_parameters(self):
        std = math.sqrt(6 / (self._weight_vector.size(0) + 1))
        self._weight_vector.data.uniform_(-std, std)
        self._bias.data.fill_(0)

    @overrides
    def _forward_internal(self, vector: torch.Tensor, matrix: torch.Tensor) -> torch.Tensor:
        combined_tensors = util.combine_tensors_and_multiply(self._combination,
                                                             [vector.unsqueeze(1), matrix],
                                                             self._weight_vector)
        return self._activation(combined_tensors.squeeze(1) + self._bias)
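# Usage sketch (illustrative): with normalize=True the module returns a softmax
# over the rows of `matrix`, i.e. a tensor of shape (batch_size, num_rows).
# The dimensions below are arbitrary; the 'x*y' term requires the two inputs to
# share their last dimension.
attention = LinearAttention(tensor_1_dim=7, tensor_2_dim=7, combination='x,y,x*y')
vector = torch.rand(2, 7)            # (batch_size, tensor_1_dim)
matrix = torch.rand(2, 5, 7)         # (batch_size, num_rows, tensor_2_dim)
weights = attention(vector, matrix)  # (batch_size, num_rows), each row sums to 1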
def _make_params(self):
    w = getattr(self.module, self.name)

    height = w.data.shape[0]
    width = w.view(height, -1).data.shape[1]

    u = Parameter(w.data.new(height).normal_(0, 1), requires_grad=False)
    v = Parameter(w.data.new(width).normal_(0, 1), requires_grad=False)
    u.data = l2normalize(u.data)
    v.data = l2normalize(v.data)
    w_bar = Parameter(w.data)

    del self.module._parameters[self.name]

    self.module.register_parameter(self.name + "_u", u)
    self.module.register_parameter(self.name + "_v", v)
    self.module.register_parameter(self.name + "_bar", w_bar)
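# Sketch of how these parameters are typically consumed in this style of
# spectral-norm wrapper (assumed companion code, not shown in the snippet
# above; self.power_iterations and l2normalize are assumed attributes/helpers
# of the same wrapper): one power-iteration step refines u and v, estimates the
# top singular value sigma of w_bar, and sets the wrapped module's weight to
# w_bar / sigma before the forward pass.
def _update_u_v(self):
    u = getattr(self.module, self.name + "_u")
    v = getattr(self.module, self.name + "_v")
    w = getattr(self.module, self.name + "_bar")

    height = w.data.shape[0]
    w_mat = w.view(height, -1)
    for _ in range(self.power_iterations):
        v.data = l2normalize(torch.mv(torch.t(w_mat.data), u.data))
        u.data = l2normalize(torch.mv(w_mat.data, v.data))

    sigma = u.dot(w_mat.mv(v))
    setattr(self.module, self.name, w / sigma.expand_as(w))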
def __init__(self, in_features, out_features, prior='loguni', alpha_shape=(1, 1), bias=True):
    super(LinearVDO, self).__init__()
    self.in_features = in_features
    self.out_features = out_features
    self.alpha_shape = alpha_shape
    self.W = Parameter(torch.Tensor(out_features, in_features))
    self.log_alpha = Parameter(torch.Tensor(*alpha_shape))
    if bias:
        self.bias = Parameter(torch.Tensor(1, out_features))
    else:
        self.register_parameter('bias', None)
    self.reset_parameters()
    self.zero_mean = False
    self.permute_sigma = False
    self.prior = prior
    if prior == 'loguni':
        self.kl_fun = metrics.kl_loguni
    else:
        self.kl_fun = metrics.kl_ard
def __init__(self, in_features, out_features, bias=True):
    super(LinearVariance, self).__init__()
    self.in_features = in_features
    self.out_features = out_features
    self.sigma = Parameter(torch.Tensor(out_features, in_features))
    if bias:
        self.bias = Parameter(torch.Tensor(1, out_features))
    else:
        self.register_parameter('bias', None)
    self.reset_parameters()
def __init__(self, in_features, out_features, bias=True):
    super(LinearVarianceBe, self).__init__()
    self.in_features = in_features
    self.out_features = out_features
    self.probs = torch.ones([out_features, in_features]).cuda() * 0.5
    self.W = Parameter(torch.Tensor(out_features, in_features))
    if bias:
        self.bias = Parameter(torch.Tensor(1, out_features))
    else:
        self.register_parameter('bias', None)
    self.reset_parameters()
def __init__(self,
             tensor_1_dim: int,
             tensor_2_dim: int,
             combination: str = 'x,y',
             activation: Activation = None) -> None:
    super().__init__()
    self._combination = combination
    combined_dim = util.get_combined_dim(combination, [tensor_1_dim, tensor_2_dim])
    self._weight_vector = Parameter(torch.Tensor(combined_dim))
    self._bias = Parameter(torch.Tensor(1))
    self._activation = activation or Activation.by_name('linear')()
    self.reset_parameters()
class ConvVarianceUnif(ModuleWrapper):
    def __init__(self, in_channels, out_channels, kernel_size, stride=1,
                 padding=0, dilation=1, bias=True):
        super(ConvVarianceUnif, self).__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.kernel_size = (kernel_size, kernel_size)
        self.stride = stride
        self.padding = padding
        self.dilation = dilation
        self.groups = 1
        self.W = Parameter(torch.Tensor(out_channels, in_channels, *self.kernel_size))
        if bias:
            self.bias = Parameter(torch.Tensor(1, out_channels, 1, 1))
        else:
            self.register_parameter('bias', None)
        self.op_bias = lambda input, kernel: F.conv2d(input, kernel, self.bias, self.stride,
                                                      self.padding, self.dilation, self.groups)
        self.op_nobias = lambda input, kernel: F.conv2d(input, kernel, None, self.stride,
                                                        self.padding, self.dilation, self.groups)
        self.reset_parameters()

    def reset_parameters(self):
        n = self.in_channels
        for k in self.kernel_size:
            n *= k
        stdv = 1. / math.sqrt(n)
        self.W.data.uniform_(-stdv, stdv)
        if self.bias is not None:
            self.bias.data.uniform_(-stdv, stdv)

    def forward(self, x):
        if self.training:
            eps = Variable(torch.rand(self.W.size()) - 0.5)
        else:
            eps = 0.0
        output = self.op_nobias(x, self.W * eps)
        if self.bias is not None:
            output = output + self.bias
        return output

    def __repr__(self):
        s = ('{name}({in_channels}, {out_channels}, kernel_size={kernel_size}'
             ', stride={stride}')
        s += ', padding={padding}'
        s += ', dilation={dilation}'
        if self.bias is None:
            s += ', bias=False'
        s += ')'
        return s.format(name=self.__class__.__name__, **self.__dict__)
def __init__(self, base_model, name="GPLVM"):
    super(GPLVM, self).__init__(name)
    if base_model.X.dim() != 2:
        raise ValueError("GPLVM model only works with 2D latent X, but got "
                         "X.dim() = {}.".format(base_model.X.dim()))
    self.base_model = base_model
    self.y = self.base_model.y

    self.X_loc = Parameter(self.base_model.X)

    C = self.X_loc.shape[1]
    X_scale_tril_shape = self.X_loc.shape + (C,)
    Id = torch.eye(C, out=self.X_loc.new_empty(C, C))
    X_scale_tril = Id.expand(X_scale_tril_shape)
    self.X_scale_tril = Parameter(X_scale_tril)
    self.set_constraint("X_scale_tril", constraints.lower_cholesky)

    self._call_base_model_guide = True
def __init__(self, in_channels, out_channels, kernel_size, stride=1,
             padding=0, dilation=1, bias=True):
    super(ConvVarianceUnif, self).__init__()
    self.in_channels = in_channels
    self.out_channels = out_channels
    self.kernel_size = (kernel_size, kernel_size)
    self.stride = stride
    self.padding = padding
    self.dilation = dilation
    self.groups = 1
    self.W = Parameter(torch.Tensor(out_channels, in_channels, *self.kernel_size))
    if bias:
        self.bias = Parameter(torch.Tensor(1, out_channels, 1, 1))
    else:
        self.register_parameter('bias', None)
    self.op_bias = lambda input, kernel: F.conv2d(input, kernel, self.bias, self.stride,
                                                  self.padding, self.dilation, self.groups)
    self.op_nobias = lambda input, kernel: F.conv2d(input, kernel, None, self.stride,
                                                    self.padding, self.dilation, self.groups)
    self.reset_parameters()
def __init__(self,
             input_size: int,
             hidden_size: int,
             num_layers: int = 1,
             recurrent_dropout_probability: float = 0) -> None:
    super(AlternatingHighwayLSTM, self).__init__()
    self.input_size = input_size
    self.hidden_size = hidden_size
    self.num_layers = num_layers
    self.recurrent_dropout_probability = recurrent_dropout_probability
    self.training = True

    # Input dimensions consider the fact that we do
    # all of the LSTM projections (and highway parts)
    # in a single matrix multiplication.
    input_projection_size = 6 * hidden_size
    state_projection_size = 5 * hidden_size
    bias_size = 5 * hidden_size

    # Here we are creating a single weight and bias with the
    # parameters for all layers unfolded into it. This is necessary
    # because unpacking and re-packing the weights inside the
    # kernel would be slow, as it would happen every time it is called.
    total_weight_size = 0
    total_bias_size = 0
    for layer in range(num_layers):
        layer_input_size = input_size if layer == 0 else hidden_size

        input_weights = input_projection_size * layer_input_size
        state_weights = state_projection_size * hidden_size
        total_weight_size += input_weights + state_weights
        total_bias_size += bias_size

    self.weight = Parameter(torch.FloatTensor(total_weight_size))
    self.bias = Parameter(torch.FloatTensor(total_bias_size))
    self.reset_parameters()
class LinearMatrixAttention(MatrixAttention):
    """
    This ``MatrixAttention`` takes two matrices as input and returns a matrix of attentions
    by performing a dot product between a vector of weights and some combination of the two
    input matrices, followed by an (optional) activation function.  The combination used is
    configurable.

    If the two vectors are ``x`` and ``y``, we allow the following kinds of combinations: ``x``,
    ``y``, ``x*y``, ``x+y``, ``x-y``, ``x/y``, where each of those binary operations is performed
    elementwise.  You can list as many combinations as you want, comma separated.  For example, you
    might give ``x,y,x*y`` as the ``combination`` parameter to this class.  The computed similarity
    function would then be ``w^T [x; y; x*y] + b``, where ``w`` is a vector of weights, ``b`` is a
    bias parameter, and ``[;]`` is vector concatenation.

    Note that if you want a bilinear similarity function with a diagonal weight matrix W, where the
    similarity function is computed as `x * w * y + b` (with `w` the diagonal of `W`), you can
    accomplish that with this class by using "x*y" for `combination`.

    Parameters
    ----------
    tensor_1_dim : ``int``
        The dimension of the first tensor, ``x``, described above.  This is ``x.size()[-1]`` - the
        length of the vector that will go into the similarity computation.  We need this so we can
        build weight vectors correctly.
    tensor_2_dim : ``int``
        The dimension of the second tensor, ``y``, described above.  This is ``y.size()[-1]`` - the
        length of the vector that will go into the similarity computation.  We need this so we can
        build weight vectors correctly.
    combination : ``str``, optional (default="x,y")
        Described above.
    activation : ``Activation``, optional (default=linear (i.e. no activation))
        An activation function applied after the ``w^T * [x;y] + b`` calculation.  Default is no
        activation.
    """

    def __init__(self,
                 tensor_1_dim: int,
                 tensor_2_dim: int,
                 combination: str = 'x,y',
                 activation: Activation = None) -> None:
        super().__init__()
        self._combination = combination
        combined_dim = util.get_combined_dim(combination, [tensor_1_dim, tensor_2_dim])
        self._weight_vector = Parameter(torch.Tensor(combined_dim))
        self._bias = Parameter(torch.Tensor(1))
        self._activation = activation or Activation.by_name('linear')()
        self.reset_parameters()

    def reset_parameters(self):
        std = math.sqrt(6 / (self._weight_vector.size(0) + 1))
        self._weight_vector.data.uniform_(-std, std)
        self._bias.data.fill_(0)

    @overrides
    def forward(self,  # pylint: disable=arguments-differ
                matrix_1: torch.Tensor,
                matrix_2: torch.Tensor) -> torch.Tensor:
        # TODO(mattg): Remove the need for this tiling.
        # https://github.com/allenai/allennlp/pull/1235#issuecomment-391540133
        tiled_matrix_1 = matrix_1.unsqueeze(2).expand(matrix_1.size()[0],
                                                      matrix_1.size()[1],
                                                      matrix_2.size()[1],
                                                      matrix_1.size()[2])
        tiled_matrix_2 = matrix_2.unsqueeze(1).expand(matrix_2.size()[0],
                                                      matrix_1.size()[1],
                                                      matrix_2.size()[1],
                                                      matrix_2.size()[2])

        combined_tensors = util.combine_tensors(self._combination, [tiled_matrix_1, tiled_matrix_2])
        dot_product = torch.matmul(combined_tensors, self._weight_vector)
        return self._activation(dot_product + self._bias)
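# Usage sketch (illustrative): the module scores every pair of rows from the two
# input matrices, giving a (batch_size, num_rows_1, num_rows_2) tensor of
# similarities. Dimensions below are arbitrary; the 'x*y' term requires matching
# last dimensions.
matrix_attention = LinearMatrixAttention(tensor_1_dim=4, tensor_2_dim=4, combination='x,y,x*y')
matrix_1 = torch.rand(2, 6, 4)   # (batch_size, num_rows_1, tensor_1_dim)
matrix_2 = torch.rand(2, 3, 4)   # (batch_size, num_rows_2, tensor_2_dim)
similarities = matrix_attention(matrix_1, matrix_2)  # (batch_size, 6, 3)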
class LSTMcell_untied(torch.nn.Module):
    def __init__(self, *, inputSize, hiddenSize, train=True,
                 dr=0.5, drMethod='gal+sem', gpu=0):
        super(LSTMcell_untied, self).__init__()
        self.inputSize = inputSize
        self.hiddenSize = hiddenSize
        self.dr = dr

        self.w_xi = Parameter(torch.Tensor(hiddenSize, inputSize))
        self.w_xf = Parameter(torch.Tensor(hiddenSize, inputSize))
        self.w_xo = Parameter(torch.Tensor(hiddenSize, inputSize))
        self.w_xc = Parameter(torch.Tensor(hiddenSize, inputSize))

        self.w_hi = Parameter(torch.Tensor(hiddenSize, hiddenSize))
        self.w_hf = Parameter(torch.Tensor(hiddenSize, hiddenSize))
        self.w_ho = Parameter(torch.Tensor(hiddenSize, hiddenSize))
        self.w_hc = Parameter(torch.Tensor(hiddenSize, hiddenSize))

        self.b_i = Parameter(torch.Tensor(hiddenSize))
        self.b_f = Parameter(torch.Tensor(hiddenSize))
        self.b_o = Parameter(torch.Tensor(hiddenSize))
        self.b_c = Parameter(torch.Tensor(hiddenSize))

        self.drMethod = drMethod.split('+')
        self.gpu = gpu
        self.train = train
        if gpu >= 0:
            self = self.cuda(gpu)
            self.is_cuda = True
        else:
            self.is_cuda = False
        self.reset_parameters()

    def reset_parameters(self):
        std = 1.0 / math.sqrt(self.hiddenSize)
        for w in self.parameters():
            w.data.uniform_(-std, std)

    def init_mask(self, x, h, c):
        self.maskX_i = createMask(x, self.dr)
        self.maskX_f = createMask(x, self.dr)
        self.maskX_c = createMask(x, self.dr)
        self.maskX_o = createMask(x, self.dr)

        self.maskH_i = createMask(h, self.dr)
        self.maskH_f = createMask(h, self.dr)
        self.maskH_c = createMask(h, self.dr)
        self.maskH_o = createMask(h, self.dr)

        self.maskC = createMask(c, self.dr)

        self.maskW_xi = createMask(self.w_xi, self.dr)
        self.maskW_xf = createMask(self.w_xf, self.dr)
        self.maskW_xc = createMask(self.w_xc, self.dr)
        self.maskW_xo = createMask(self.w_xo, self.dr)
        self.maskW_hi = createMask(self.w_hi, self.dr)
        self.maskW_hf = createMask(self.w_hf, self.dr)
        self.maskW_hc = createMask(self.w_hc, self.dr)
        self.maskW_ho = createMask(self.w_ho, self.dr)

    def forward(self, x, hidden):
        h0, c0 = hidden
        doDrop = self.training and self.dr > 0.0

        if doDrop:
            self.init_mask(x, h0, c0)

        if doDrop and 'drH' in self.drMethod:
            h0_i = h0.mul(self.maskH_i)
            h0_f = h0.mul(self.maskH_f)
            h0_c = h0.mul(self.maskH_c)
            h0_o = h0.mul(self.maskH_o)
        else:
            h0_i = h0
            h0_f = h0
            h0_c = h0
            h0_o = h0

        if doDrop and 'drX' in self.drMethod:
            x_i = x.mul(self.maskX_i)
            x_f = x.mul(self.maskX_f)
            x_c = x.mul(self.maskX_c)
            x_o = x.mul(self.maskX_o)
        else:
            x_i = x
            x_f = x
            x_c = x
            x_o = x

        if doDrop and 'drW' in self.drMethod:
            w_xi = self.w_xi.mul(self.maskW_xi)
            w_xf = self.w_xf.mul(self.maskW_xf)
            w_xc = self.w_xc.mul(self.maskW_xc)
            w_xo = self.w_xo.mul(self.maskW_xo)

            w_hi = self.w_hi.mul(self.maskW_hi)
            w_hf = self.w_hf.mul(self.maskW_hf)
            w_hc = self.w_hc.mul(self.maskW_hc)
            w_ho = self.w_ho.mul(self.maskW_ho)
        else:
            w_xi = self.w_xi
            w_xf = self.w_xf
            w_xc = self.w_xc
            w_xo = self.w_xo

            w_hi = self.w_hi
            w_hf = self.w_hf
            w_hc = self.w_hc
            w_ho = self.w_ho

        gate_i = F.linear(x_i, w_xi) + F.linear(h0_i, w_hi) + self.b_i
        gate_f = F.linear(x_f, w_xf) + F.linear(h0_f, w_hf) + self.b_f
        gate_c = F.linear(x_c, w_xc) + F.linear(h0_c, w_hc) + self.b_c
        gate_o = F.linear(x_o, w_xo) + F.linear(h0_o, w_ho) + self.b_o

        gate_i = F.sigmoid(gate_i)
        gate_f = F.sigmoid(gate_f)
        gate_c = F.tanh(gate_c)
        gate_o = F.sigmoid(gate_o)

        if doDrop and 'drC' in self.drMethod:
            gate_c = gate_c.mul(self.maskC)

        c1 = (gate_f * c0) + (gate_i * gate_c)
        h1 = gate_o * F.tanh(c1)

        return h1, c1
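# Usage sketch (illustrative): the cell consumes one time step at a time, so a
# sequence is processed by looping over time and carrying (h, c). dr=0.0 is used
# here so no dropout masks (and hence no createMask call) are needed; sizes are
# arbitrary.
cell = LSTMcell_untied(inputSize=8, hiddenSize=16, dr=0.0, gpu=-1)
x_seq = torch.randn(20, 4, 8)    # (time, batch, inputSize)
h = torch.zeros(4, 16)
c = torch.zeros(4, 16)
for t in range(x_seq.size(0)):
    h, c = cell(x_seq[t], (h, c))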
game = 'ipd'
param = 'flat'

if game == 'ipd':
    game = IteratedPrisonerDilemna()
elif game == 'mp':
    game = IteratedMatchingPennies()
else:
    raise ValueError()

n_players = game.n_players
dim_strategy = game.dim_strategy

if param == 'flat':
    strategies = [
        Parameter(torch.empty(dim_strategy))
        for _ in range(n_players)
    ]
    for s in strategies:
        s.data.uniform_(0., 1.)
elif param == 'sigmoid':
    strategies = [
        Parameter(torch.empty(dim_strategy, 2))
        for _ in range(n_players)
    ]
    for s in strategies:
        s.data.normal_(0., 1.)

strat_rec = [[] for _ in range(n_players)]
vs_rec = [[] for _ in range(n_players)]


def objective_fn(strategies):
def z(self, value):
    self._z = Parameter(torch.as_tensor(value))
class BBBLinearFactorial(nn.Module):
    """
    Describes a Linear fully connected Bayesian layer with
    a distribution over each of the weights and biases
    in the layer.
    """

    def __init__(self, in_features, out_features, p_logvar_init=-3, p_pi=1.0, q_logvar_init=-5):
        # p_logvar_init, p_pi can be either:
        #   (list/tuple): prior model is a mixture of Gaussians, components=len(p_pi)=len(p_logvar_init)
        #   float: Gaussian distribution
        # q_logvar_init: float, the approximate posterior is currently always a factorized Gaussian
        super(BBBLinearFactorial, self).__init__()

        self.in_features = in_features
        self.out_features = out_features
        self.p_logvar_init = p_logvar_init
        self.q_logvar_init = q_logvar_init

        # Approximate posterior weights...
        self.qw_mean = Parameter(torch.Tensor(out_features, in_features))
        self.qw_logvar = Parameter(torch.Tensor(out_features, in_features))

        # optionally add bias
        # self.qb_mean = Parameter(torch.Tensor(out_features))
        # self.qb_logvar = Parameter(torch.Tensor(out_features))

        # ...and output...
        self.fc_qw_mean = Parameter(torch.Tensor(out_features, in_features))
        self.fc_qw_std = Parameter(torch.Tensor(out_features, in_features))

        # ...as normal distributions
        self.qw = Normal(mu=self.qw_mean, logvar=self.qw_logvar)
        # self.qb = Normal(mu=self.qb_mean, logvar=self.qb_logvar)
        self.fc_qw = Normalout(mu=self.fc_qw_mean, std=self.fc_qw_std)

        # initialise
        self.log_alpha = Parameter(torch.Tensor(1, 1))

        # prior model
        self.pw = distribution_selector(mu=0.0, logvar=p_logvar_init, pi=p_pi)
        # self.pb = distribution_selector(mu=0.0, logvar=p_logvar_init, pi=p_pi)

        # initialize all parameters
        self.reset_parameters()

    def reset_parameters(self):
        # initialize (trainable) approximate posterior parameters
        stdv = 10. / math.sqrt(self.in_features)
        self.qw_mean.data.uniform_(-stdv, stdv)
        self.qw_logvar.data.uniform_(-stdv, stdv).add_(self.q_logvar_init)
        # self.qb_mean.data.uniform_(-stdv, stdv)
        # self.qb_logvar.data.uniform_(-stdv, stdv).add_(self.q_logvar_init)
        self.fc_qw_mean.data.uniform_(-stdv, stdv)
        self.fc_qw_std.data.uniform_(-stdv, stdv).add_(self.q_logvar_init)
        self.log_alpha.data.uniform_(-stdv, stdv)

    def forward(self, input):
        raise NotImplementedError()

    def fcprobforward(self, input):
        """
        Probabilistic forwarding method.
        :param input: data tensor
        :return: output, kl-divergence
        """
        fc_qw_mean = F.linear(input=input, weight=self.qw_mean)
        fc_qw_si = torch.sqrt(1e-8 + F.linear(input=input.pow(2),
                                              weight=torch.exp(self.log_alpha) * self.qw_mean.pow(2)))

        if cuda:
            fc_qw_mean.cuda()
            fc_qw_si.cuda()

        # sample from output
        if cuda:
            output = fc_qw_mean + fc_qw_si * (torch.randn(fc_qw_mean.size())).cuda()
        else:
            output = fc_qw_mean + fc_qw_si * (torch.randn(fc_qw_mean.size()))

        if cuda:
            output.cuda()

        w_sample = self.fc_qw.sample()

        # KL divergence
        qw_logpdf = self.fc_qw.logpdf(w_sample)
        kl = torch.sum(qw_logpdf - self.pw.logpdf(w_sample))

        return output, kl

    def __repr__(self):
        return self.__class__.__name__ + ' (' \
            + str(self.in_features) + ' -> ' \
            + str(self.out_features) + ')'
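# Usage sketch (illustrative): fcprobforward() returns both the sampled layer
# output and that layer's KL divergence, so a Bayes-by-Backprop style objective
# adds the accumulated KL (scaled, e.g. by the number of minibatches) to the
# data term. x, y and num_batches are hypothetical placeholders.
layer = BBBLinearFactorial(in_features=784, out_features=10)
logits, kl = layer.fcprobforward(x.view(x.size(0), -1))
loss = F.cross_entropy(logits, y) + kl / num_batches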
class VariationalSparseGP(GPModel):
    r"""
    Variational Sparse Gaussian Process model.

    In :class:`.VariationalGP` model, when the number of input data :math:`X` is large,
    the covariance matrix :math:`k(X, X)` will require a lot of computational steps to
    compute its inverse (for log likelihood and for prediction). This model introduces
    an additional inducing-input parameter :math:`X_u` to solve that problem.

    Given inputs :math:`X`, their noisy observations :math:`y`, and the inducing-input
    parameters :math:`X_u`, the model takes the form:

    .. math::
        [f, u] &\sim \mathcal{GP}(0, k([X, X_u], [X, X_u])),\\
        y & \sim p(y) = p(y \mid f) p(f),

    where :math:`p(y \mid f)` is the likelihood.

    We will use a variational approach in this model by approximating :math:`q(f,u)`
    to the posterior :math:`p(f,u \mid y)`. Precisely, :math:`q(f) = p(f\mid u)q(u)`,
    where :math:`q(u)` is a multivariate normal distribution with two parameters
    ``u_loc`` and ``u_scale_tril``, which will be learned during a variational
    inference process.

    .. note:: This model can be learned using MCMC method as in reference [2]. See also
        :class:`.GPModel`.

    .. note:: This model has :math:`\mathcal{O}(NM^2)` complexity for training,
        :math:`\mathcal{O}(M^3)` complexity for testing. Here, :math:`N` is the number
        of train inputs, :math:`M` is the number of inducing inputs. Size of variational
        parameters is :math:`\mathcal{O}(M^2)`.

    References:

    [1] `Scalable variational Gaussian process classification`,
    James Hensman, Alexander G. de G. Matthews, Zoubin Ghahramani

    [2] `MCMC for Variationally Sparse Gaussian Processes`,
    James Hensman, Alexander G. de G. Matthews, Maurizio Filippone, Zoubin Ghahramani

    :param torch.Tensor X: An input data for training. Its first dimension is the number
        of data points.
    :param torch.Tensor y: An output data for training. Its last dimension is the number
        of data points.
    :param ~pyro.contrib.gp.kernels.kernel.Kernel kernel: A Pyro kernel object, which is
        the covariance function :math:`k`.
    :param torch.Tensor Xu: Initial values for inducing points, which are parameters of
        our model.
    :param ~pyro.contrib.gp.likelihoods.likelihood.Likelihood likelihood: A likelihood
        object.
    :param callable mean_function: An optional mean function :math:`m` of this Gaussian
        process. By default, we use zero mean.
    :param torch.Size latent_shape: Shape for latent processes (`batch_shape` of
        :math:`q(u)`). By default, it equals to output batch shape ``y.shape[:-1]``.
        For the multi-class classification problems, ``latent_shape[-1]`` should
        correspond to the number of classes.
    :param int num_data: The size of full training dataset. It is useful for training
        this model with mini-batch.
    :param bool whiten: A flag to tell if variational parameters ``u_loc`` and
        ``u_scale_tril`` are transformed by the inverse of ``Luu``, where ``Luu`` is the
        lower triangular decomposition of :math:`kernel(X_u, X_u)`. Enabling this flag
        will help optimization.
    :param float jitter: A small positive term which is added into the diagonal part of
        a covariance matrix to help stabilize its Cholesky decomposition.
    :param str name: Name of this model.
""" def __init__(self, X, y, kernel, Xu, likelihood, mean_function=None, latent_shape=None, num_data=None, whiten=False, jitter=1e-6, name="SVGP"): super(VariationalSparseGP, self).__init__(X, y, kernel, mean_function, jitter, name) self.likelihood = likelihood self.num_data = num_data if num_data is not None else self.X.shape[0] self.whiten = whiten self.Xu = Parameter(Xu) y_batch_shape = self.y.shape[:-1] if self.y is not None else torch.Size([]) self.latent_shape = latent_shape if latent_shape is not None else y_batch_shape M = self.Xu.shape[0] u_loc_shape = self.latent_shape + (M,) u_loc = self.Xu.new_zeros(u_loc_shape) self.u_loc = Parameter(u_loc) u_scale_tril_shape = self.latent_shape + (M, M) Id = torch.eye(M, out=self.Xu.new_empty(M, M)) u_scale_tril = Id.expand(u_scale_tril_shape) self.u_scale_tril = Parameter(u_scale_tril) self.set_constraint("u_scale_tril", constraints.lower_cholesky) self._sample_latent = True def model(self): self.set_mode("model") Xu = self.get_param("Xu") u_loc = self.get_param("u_loc") u_scale_tril = self.get_param("u_scale_tril") M = Xu.shape[0] Kuu = self.kernel(Xu) + torch.eye(M, out=Xu.new_empty(M, M)) * self.jitter Luu = Kuu.potrf(upper=False) zero_loc = Xu.new_zeros(u_loc.shape) u_name = param_with_module_name(self.name, "u") if self.whiten: Id = torch.eye(M, out=Xu.new_empty(M, M)) pyro.sample(u_name, dist.MultivariateNormal(zero_loc, scale_tril=Id) .independent(zero_loc.dim() - 1)) else: pyro.sample(u_name, dist.MultivariateNormal(zero_loc, scale_tril=Luu) .independent(zero_loc.dim() - 1)) f_loc, f_var = conditional(self.X, Xu, self.kernel, u_loc, u_scale_tril, Luu, full_cov=False, whiten=self.whiten, jitter=self.jitter) f_loc = f_loc + self.mean_function(self.X) if self.y is None: return f_loc, f_var else: with poutine.scale(None, self.num_data / self.X.shape[0]): return self.likelihood(f_loc, f_var, self.y) def guide(self): self.set_mode("guide") Xu = self.get_param("Xu") u_loc = self.get_param("u_loc") u_scale_tril = self.get_param("u_scale_tril") if self._sample_latent: u_name = param_with_module_name(self.name, "u") pyro.sample(u_name, dist.MultivariateNormal(u_loc, scale_tril=u_scale_tril) .independent(u_loc.dim()-1)) return Xu, u_loc, u_scale_tril def forward(self, Xnew, full_cov=False): r""" Computes the mean and covariance matrix (or variance) of Gaussian Process posterior on a test input data :math:`X_{new}`: .. math:: p(f^* \mid X_{new}, X, y, k, X_u, u_{loc}, u_{scale\_tril}) = \mathcal{N}(loc, cov). .. note:: Variational parameters ``u_loc``, ``u_scale_tril``, the inducing-point parameter ``Xu``, together with kernel's parameters have been learned from a training procedure (MCMC or SVI). :param torch.Tensor Xnew: A input data for testing. Note that ``Xnew.shape[1:]`` must be the same as ``self.X.shape[1:]``. :param bool full_cov: A flag to decide if we want to predict full covariance matrix or just variance. :returns: loc and covariance matrix (or variance) of :math:`p(f^*(X_{new}))` :rtype: tuple(torch.Tensor, torch.Tensor) """ self._check_Xnew_shape(Xnew) # avoid sampling the unnecessary latent u self._sample_latent = False Xu, u_loc, u_scale_tril = self.guide() self._sample_latent = True loc, cov = conditional(Xnew, Xu, self.kernel, u_loc, u_scale_tril, full_cov=full_cov, whiten=self.whiten, jitter=self.jitter) return loc + self.mean_function(Xnew), cov
class cola_gnn(nn.Module): def __init__(self, args, data): super().__init__() self.x_h = 1 self.f_h = data.m self.m = data.m self.d = data.d self.w = args.window self.h = args.horizon self.adj = data.adj self.o_adj = data.orig_adj if args.cuda: self.adj = sparse_mx_to_torch_sparse_tensor( normalize_adj2(data.orig_adj.cpu().numpy())).to_dense().cuda() else: self.adj = sparse_mx_to_torch_sparse_tensor( normalize_adj2(data.orig_adj.cpu().numpy())).to_dense() self.dropout = args.dropout self.n_hidden = args.n_hidden half_hid = int(self.n_hidden / 2) self.V = Parameter(torch.Tensor(half_hid)) self.bv = Parameter(torch.Tensor(1)) self.W1 = Parameter(torch.Tensor(half_hid, self.n_hidden)) self.b1 = Parameter(torch.Tensor(half_hid)) self.W2 = Parameter(torch.Tensor(half_hid, self.n_hidden)) self.act = F.elu self.Wb = Parameter(torch.Tensor(self.m, self.m)) self.wb = Parameter(torch.Tensor(1)) self.k = args.k self.conv = nn.Conv1d(1, self.k, self.w) self.conv_long = nn.Conv1d(1, self.k, self.w - self.k, dilation=2) self.n_spatial = args.hidsp #self.h ####### check equal to k self.conv1 = GraphConvLayer(self.k * 3, self.n_hidden) # self.k self.conv2 = GraphConvLayer(self.n_hidden, self.n_spatial) if args.rnn_model == 'LSTM': self.rnn = nn.LSTM(input_size=self.x_h, hidden_size=self.n_hidden, num_layers=args.n_layer, dropout=args.dropout, batch_first=True, bidirectional=args.bi) elif args.rnn_model == 'GRU': self.rnn = nn.GRU(input_size=self.x_h, hidden_size=self.n_hidden, num_layers=args.n_layer, dropout=args.dropout, batch_first=True, bidirectional=args.bi) elif args.rnn_model == 'RNN': self.rnn = nn.RNN(input_size=self.x_h, hidden_size=self.n_hidden, num_layers=args.n_layer, dropout=args.dropout, batch_first=True, bidirectional=args.bi) else: raise LookupError(' only support LSTM, GRU and RNN') hidden_size = (int(args.bi) + 1) * self.n_hidden # self.n_hidden = hidden_size BIDIRECTIONAL BUG self.out = nn.Linear(hidden_size + self.n_spatial, 1) self.residual_window = 0 self.ratio = 1.0 if (self.residual_window > 0): self.residual_window = min(self.residual_window, args.window) self.residual = nn.Linear(self.residual_window, 1) self.init_weights() def init_weights(self): for p in self.parameters(): if p.data.ndimension() >= 2: nn.init.xavier_uniform_(p.data) # best else: stdv = 1. 
/ math.sqrt(p.size(0)) p.data.uniform_(-stdv, stdv) def forward(self, x, feat=None): ''' Args: x: (batch, time_step, m) feat: [batch, window, dim, m] Returns: (batch, m) ''' b, w, m = x.size() orig_x = x x = x.permute(0, 2, 1).contiguous().view(-1, x.size(1), 1) r_out, hc = self.rnn(x, None) last_hid = r_out[:, -1, :] last_hid = last_hid.view(-1, self.m, self.n_hidden) out_temporal = last_hid # [b, m, 20] # print(last_hid.shape,'====') hid_rpt_m = last_hid.repeat(1, self.m, 1).view( b, self.m, self.m, self.n_hidden) # b,m,m,w continuous m hid_rpt_w = last_hid.repeat(1, 1, self.m).view( b, self.m, self.m, self.n_hidden) # b,m,m,w continuous w one window data a_mx = self.act( hid_rpt_m @ self.W1.t() + hid_rpt_w @ self.W2.t() + self.b1) @ self.V + self.bv # row, all states influence one state before_norm = a_mx.cpu().detach().numpy() ## save a_mx = F.normalize(a_mx, p=2, dim=1, eps=1e-12, out=None) after_norm = a_mx.cpu().detach().numpy() ## save r_l = [] r_long_l = [] h_mids = orig_x for i in range(self.m): h_tmp = h_mids[:, :, i:i + 1].permute(0, 2, 1).contiguous() r = self.conv(h_tmp) # [32, 10/k, 1] r_long = self.conv_long(h_tmp) r_l.append(r) r_long_l.append(r_long) r_l = torch.stack(r_l, dim=1) r_long_l = torch.stack(r_long_l, dim=1) r_l = torch.cat((r_l, r_long_l), -1) r_l = r_l.view(r_l.size(0), r_l.size(1), -1) r_l = torch.relu(r_l) adjs = self.adj.repeat(b, 1) adjs = adjs.view(b, self.m, self.m) c = torch.sigmoid(a_mx @ self.Wb + self.wb) a_mx = adjs * c + a_mx * (1 - c) after_norm2 = a_mx.cpu().detach().numpy() ## save adj = a_mx x = r_l x = F.relu(self.conv1(x, adj)) x = F.dropout(x, self.dropout, training=self.training) out_spatial = F.relu(self.conv2(x, adj)) out = torch.cat((out_spatial, out_temporal), dim=-1) out = self.out(out) out = torch.squeeze(out) if (self.residual_window > 0): z = orig_x[:, -self.residual_window:, :] #Step backward # [batch, res_window, m] z = z.permute(0, 2, 1).contiguous().view(-1, self.residual_window) #[batch*m, res_window] z = self.residual(z) #[batch*m, 1] z = z.view(-1, self.m) #[batch, m] out = out * self.ratio + z #[batch, m] return out, None
class Preprocessor(Module): def __init__( self, normalization_parameters: Dict[str, NormalizationParameters], use_gpu: bool, typed_output: bool = False, ) -> None: super(Preprocessor, self).__init__() self.normalization_parameters = normalization_parameters self.sorted_features, self.sorted_feature_boundaries = ( self._sort_features_by_normalization()) self.typed_output = typed_output cuda_available = torch.cuda.is_available() logger.info("CUDA availability: {}".format(cuda_available)) if use_gpu and cuda_available: logger.info("Using GPU: GPU requested and available.") self.use_gpu = True self.dtype = torch.cuda.FloatTensor else: logger.info("NOT Using GPU: GPU not requested or not available.") self.use_gpu = False self.dtype = torch.FloatTensor # NOTE: Because of the way we call AppendNet to squash ONNX to a C2 net, # We need to make tensors for every numeric literal self.zero_tensor = Parameter(torch.tensor([0.0]).type(self.dtype), requires_grad=False) self.one_tensor = Parameter(torch.tensor([1.0]).type(self.dtype), requires_grad=False) self.one_half_tensor = Parameter(torch.tensor([0.5]).type(self.dtype), requires_grad=False) self.one_hundredth_tensor = Parameter(torch.tensor([0.01]).type( self.dtype), requires_grad=False) self.negative_one_tensor = Parameter(torch.tensor([-1.0 ]).type(self.dtype), requires_grad=False) self.missing_tensor = Parameter(torch.tensor([MISSING_VALUE ]).type(self.dtype), requires_grad=False) self.min_tensor = Parameter(torch.tensor([-1e20]).type(self.dtype), requires_grad=False) self.max_tensor = Parameter(torch.tensor([1e20]).type(self.dtype), requires_grad=False) self.epsilon_tensor = Parameter(torch.tensor([EPS]).type(self.dtype), requires_grad=False) feature_starts = self._get_type_boundaries() for i, feature_type in enumerate(FEATURE_TYPES): begin_index = feature_starts[i] if (i + 1) == len(FEATURE_TYPES): end_index = len(self.normalization_parameters) else: end_index = feature_starts[i + 1] if begin_index == end_index: continue # No features of this type if feature_type == ENUM: # Process one-at-a-time for j in range(begin_index, end_index): norm_params = self.normalization_parameters[ self.sorted_features[j]] func = getattr(self, "_create_parameters_" + feature_type) func(j, norm_params) else: norm_params = [] for f in self.sorted_features[begin_index:end_index]: norm_params.append(self.normalization_parameters[f]) func = getattr(self, "_create_parameters_" + feature_type) func(begin_index, norm_params) def input_prototype(self): return rlt.FeatureVector( float_features=torch.randn(1, len(self.normalization_parameters))) def forward(self, input) -> torch.FloatTensor: """ Preprocess the input matrix :param input tensor """ if isinstance(input, np.ndarray): input = torch.from_numpy(input).type(self.dtype) if isinstance(input, rlt.FeatureVector): input = input.float_features.type(self.dtype) # ONNX doesn't support != yet not_missing_input = (self.one_tensor.float() - (input == self.missing_tensor).float()) feature_starts = self._get_type_boundaries() outputs = [] for i, feature_type in enumerate(FEATURE_TYPES): begin_index = feature_starts[i] if (i + 1) == len(FEATURE_TYPES): end_index = len(self.normalization_parameters) else: end_index = feature_starts[i + 1] if begin_index == end_index: continue # No features of this type if feature_type == ENUM: # Process one-at-a-time for j in range(begin_index, end_index): norm_params = self.normalization_parameters[ self.sorted_features[j]] new_output = self._preprocess_feature_single_column( j, input[:, j:j + 
1], norm_params) new_output *= not_missing_input[:, j:j + 1] self._check_preprocessing_output(new_output, [norm_params]) outputs.append(new_output) else: norm_params = [] for f in self.sorted_features[begin_index:end_index]: norm_params.append(self.normalization_parameters[f]) new_output = self._preprocess_feature_multi_column( begin_index, input[:, begin_index:end_index], norm_params) new_output *= not_missing_input[:, begin_index:end_index] self._check_preprocessing_output(new_output, norm_params) outputs.append(new_output) def wrap(output): if self.typed_output: return rlt.FeatureVector(float_features=output) else: return output if len(outputs) == 1: return wrap( torch.clamp(outputs[0], MIN_FEATURE_VALUE, MAX_FEATURE_VALUE)) return wrap( torch.clamp(torch.cat(outputs, dim=1), MIN_FEATURE_VALUE, MAX_FEATURE_VALUE)) def _preprocess_feature_single_column( self, begin_index: int, input: torch.Tensor, norm_params: NormalizationParameters, ) -> torch.Tensor: if isinstance(input, np.ndarray): input = torch.from_numpy(input).type(self.dtype) feature_type = norm_params.feature_type func = getattr(self, "_preprocess_" + feature_type) return func(begin_index, input, norm_params) def _preprocess_feature_multi_column( self, begin_index: int, input: torch.Tensor, norm_params: List[NormalizationParameters], ) -> torch.Tensor: if isinstance(input, np.ndarray): input = torch.from_numpy(input).type(self.dtype) feature_type = norm_params[0].feature_type func = getattr(self, "_preprocess_" + feature_type) return func(begin_index, input, norm_params) def _create_parameters_BINARY(self, begin_index: int, norm_params: List[NormalizationParameters]): pass def _preprocess_BINARY( self, begin_index: int, input: torch.Tensor, norm_params: List[NormalizationParameters], ) -> torch.Tensor: # ONNX doesn't support != yet return self.one_tensor - (input == self.zero_tensor).float() def _create_parameters_PROBABILITY( self, begin_index: int, norm_params: List[NormalizationParameters]): pass def _preprocess_PROBABILITY( self, begin_index: int, input: torch.Tensor, norm_params: List[NormalizationParameters], ) -> torch.Tensor: clamped_input = torch.clamp(input, 0.01, 0.99) return self.negative_one_tensor * (( (self.one_tensor / clamped_input) - self.one_tensor).log()) def _create_parameters_CONTINUOUS_ACTION( self, begin_index: int, norm_params: List[NormalizationParameters]): self._create_parameter( begin_index, "min_serving_value", torch.Tensor([p.min_value for p in norm_params]).type(self.dtype), ) self._create_parameter( begin_index, "min_training_value", torch.ones(len(norm_params)).type(self.dtype) * -1 + EPS, ) self._create_parameter( begin_index, "scaling_factor", (torch.ones(len(norm_params)).type(self.dtype) - EPS) * 2 / torch.tensor([p.max_value - p.min_value for p in norm_params]).type(self.dtype), ) def _preprocess_CONTINUOUS_ACTION( self, begin_index: int, input: torch.Tensor, norm_params: List[NormalizationParameters], ) -> torch.Tensor: min_serving_value = self._fetch_parameter(begin_index, "min_serving_value") min_training_value = self._fetch_parameter(begin_index, "min_training_value") scaling_factor = self._fetch_parameter(begin_index, "scaling_factor") continuous_action = ( input - min_serving_value) * scaling_factor + min_training_value return torch.clamp(continuous_action, -1 + EPS, 1 - EPS) def _create_parameters_CONTINUOUS( self, begin_index: int, norm_params: List[NormalizationParameters]): self._create_parameter( begin_index, "means", torch.Tensor([p.mean for p in norm_params]).type(self.dtype), 
) self._create_parameter( begin_index, "stddevs", torch.Tensor([p.stddev for p in norm_params]).type(self.dtype), ) def _preprocess_CONTINUOUS( self, begin_index: int, input: torch.Tensor, norm_params: List[NormalizationParameters], ) -> torch.Tensor: means = self._fetch_parameter(begin_index, "means") stddevs = self._fetch_parameter(begin_index, "stddevs") continuous_output = (input - means) / stddevs return torch.clamp(continuous_output, MIN_FEATURE_VALUE, MAX_FEATURE_VALUE) def _create_parameters_BOXCOX(self, begin_index: int, norm_params: List[NormalizationParameters]): self._create_parameter( begin_index, "shifts", torch.Tensor([p.boxcox_shift for p in norm_params]).type(self.dtype), ) for p in norm_params: assert (abs(p.boxcox_lambda) > 1e-6), "Invalid value for boxcox lambda: " + str( p.boxcox_lambda) self._create_parameter( begin_index, "lambdas", torch.Tensor([p.boxcox_lambda for p in norm_params]).type(self.dtype), ) self._create_parameters_CONTINUOUS(begin_index, norm_params) def _preprocess_BOXCOX( self, begin_index: int, input: torch.Tensor, norm_params: List[NormalizationParameters], ) -> torch.Tensor: shifts = self._fetch_parameter(begin_index, "shifts") lambdas = self._fetch_parameter(begin_index, "lambdas") boxcox_output = ( # We can replace this with a normal pow() call after D8528654 lands self._manual_broadcast_matrix_scalar( torch.clamp( input + shifts, 1e-6 ), # Clamp is necessary to prevent MISSING_VALUE from going to NaN lambdas, torch.pow, ) - self.one_tensor) / lambdas return self._preprocess_CONTINUOUS(begin_index, boxcox_output, norm_params) def _create_parameters_QUANTILE( self, begin_index: int, norm_params: List[NormalizationParameters]): F = len(norm_params) num_quantiles = torch.tensor([[ float(len(p.quantiles)) - 1 for p in norm_params ]]).type(self.dtype) self._create_parameter(begin_index, "num_quantiles", num_quantiles) max_num_quantile_boundaries = int( torch.max(torch.tensor([len(p.quantiles) for p in norm_params]))) B = max_num_quantile_boundaries # The quantile boundaries is a FxB matrix where B is the max # of boundaries # We take advantage of the fact that if the value is >= the max # quantile boundary it automatically gets a 1.0 to repeat the max quantile # so that we guarantee a square matrix. 
# We project the quantiles boundaries to 3d and create a 1xFxB tensor quantile_boundaries = torch.zeros( [1, len(norm_params), max_num_quantile_boundaries]).type(self.dtype) max_quantile_boundaries = torch.zeros([1, len(norm_params) ]).type(self.dtype) min_quantile_boundaries = torch.zeros([1, len(norm_params) ]).type(self.dtype) for i, p in enumerate(norm_params): quantile_boundaries[0, i, :] = p.quantiles[-1] quantile_boundaries[0, i, 0:len(p.quantiles)] = torch.tensor( p.quantiles).type(self.dtype) max_quantile_boundaries[0, i] = max(p.quantiles) min_quantile_boundaries[0, i] = min(p.quantiles) quantile_boundaries = quantile_boundaries.type(self.dtype) max_quantile_boundaries = max_quantile_boundaries.type(self.dtype) min_quantile_boundaries = min_quantile_boundaries.type(self.dtype) self._create_parameter(begin_index, "quantile_boundaries", quantile_boundaries) self._create_parameter(begin_index, "max_quantile_boundaries", max_quantile_boundaries) self._create_parameter(begin_index, "min_quantile_boundaries", min_quantile_boundaries) self._create_parameter( begin_index, "quantile_boundary_mask", torch.ones([1, F, B]).type(self.dtype), ) def _preprocess_QUANTILE( self, begin_index: int, input: torch.Tensor, norm_params: List[NormalizationParameters], ) -> torch.Tensor: """ Replace the value with it's percentile in the range [0,1]. This preprocesses several features in a single step by putting the quantile boundaries in the third dimension and broadcasting. The input is a JxF matrix where J is the batch size and F is the # of features. """ # The number of quantiles is a 1xF matrix num_quantiles = self._fetch_parameter(begin_index, "num_quantiles") quantile_boundaries = self._fetch_parameter(begin_index, "quantile_boundaries") max_quantile_boundaries = self._fetch_parameter( begin_index, "max_quantile_boundaries") min_quantile_boundaries = self._fetch_parameter( begin_index, "min_quantile_boundaries") # Add a third dimension and repeat to create a JxFxB matrix, where the # inputs are repeated B times in the third dimension. We need to # do this because we can't broadcast both operands in different # dimensions in the same operation. # repeat doesn't work yet, so * by a mask mask = self._fetch_parameter(begin_index, "quantile_boundary_mask") expanded_inputs = input.unsqueeze(2) * mask input_greater_than_or_equal_to = (expanded_inputs >= quantile_boundaries).float() input_less_than = (expanded_inputs < quantile_boundaries).float() set_to_max = (input >= max_quantile_boundaries).float() set_to_min = (input <= min_quantile_boundaries).float() min_or_max = (set_to_min + set_to_max).float() interpolate = (min_or_max < self.one_hundredth_tensor).float() interpolate_left, _ = torch.max( (input_greater_than_or_equal_to * quantile_boundaries) + (input_less_than * self.min_tensor), dim=2, ) interpolate_right, _ = torch.min( (input_less_than * quantile_boundaries) + (input_greater_than_or_equal_to * self.max_tensor), dim=2, ) # This assumes that we need to interpolate and computes the value. # If we don't need to interpolate, this will be some bogus value, but it # will be multiplied by 0 so no big deal. 
left_start = torch.sum(input_greater_than_or_equal_to, dim=2) - self.one_tensor interpolated_values = (( left_start + ((input - interpolate_left) / ( (interpolate_right + self.epsilon_tensor) - interpolate_left ) # Add a small amount to interpolate_right to avoid div-0 )) / num_quantiles).float() return set_to_max + (interpolate * interpolated_values).float() def _create_parameters_ENUM(self, begin_index: int, norm_params: NormalizationParameters): self._create_parameter( begin_index, "enum_values", torch.Tensor(norm_params.possible_values).unsqueeze(0).type( self.dtype), ) def _preprocess_ENUM( self, begin_index: int, input: torch.Tensor, norm_params: NormalizationParameters, ) -> torch.Tensor: enum_values = self._fetch_parameter(begin_index, "enum_values") return (input == enum_values).float() def _sort_features_by_normalization(self): """ Helper function to return a sorted list from a normalization map. Also returns the starting index for each feature type""" # Sort features by feature type sorted_features = [] feature_starts = [] assert isinstance(list(self.normalization_parameters.keys())[0], int), "Normalization Parameters need to be int" for feature_type in FEATURE_TYPES: feature_starts.append(len(sorted_features)) for feature in sorted(self.normalization_parameters.keys()): norm = self.normalization_parameters[feature] if norm.feature_type == feature_type: sorted_features.append(feature) return sorted_features, feature_starts def _get_type_boundaries(self) -> List[int]: feature_starts = [] on_feature_type = -1 for i, feature in enumerate(self.sorted_features): feature_type = self.normalization_parameters[feature].feature_type feature_type_index = FEATURE_TYPES.index(feature_type) assert (feature_type_index >= on_feature_type ), "Features are not sorted by feature type!" while feature_type_index > on_feature_type: feature_starts.append(i) on_feature_type += 1 while on_feature_type < len(FEATURE_TYPES): feature_starts.append(len(self.sorted_features)) on_feature_type += 1 return feature_starts def _create_parameter(self, begin_index: int, name: str, t: torch.Tensor) -> Parameter: p = Parameter(t, requires_grad=False) setattr(self, "_auto_parameter_" + str(begin_index) + "_" + name, p) return p def _fetch_parameter(self, begin_index: int, name: str) -> Parameter: return getattr(self, "_auto_parameter_" + str(begin_index) + "_" + name) def _manual_broadcast_matrix_scalar(self, t1: torch.Tensor, s1: torch.Tensor, fn) -> torch.Tensor: # Some ONNX ops don't support broadcasting so we need to do some matrix magic return fn(t1, (t1 * self.zero_tensor) + s1).float() def _manual_broadcast_column_vec_row_vec(self, t1: torch.Tensor, t2: torch.Tensor, fn) -> torch.Tensor: # Some ONNX ops don't support broadcasting so we need to do some matrix magic t2_ones = t2 / t2 t1_mask = t1.mm(t2_ones) return fn(t1_mask, t2).float() def _check_preprocessing_output(self, batch, norm_params): """ Check that preprocessed features fall within range of valid output. 
:param batch: torch tensor :param norm_params: list of normalization parameters """ feature_type = norm_params[0].feature_type min_value, max_value = batch.min(), batch.max() if feature_type == "CONTINUOUS": # Continuous features may be in range (-inf, inf) pass elif float(max_value) > MAX_FEATURE_VALUE: raise Exception( "A {} feature type has max value {} which is > than accepted post pre-processing max of {}" .format(feature_type, max_value, MAX_FEATURE_VALUE)) elif float(min_value) < MIN_FEATURE_VALUE: raise Exception( "A {} feature type has min value {} which is < accepted post pre-processing min of {}" .format(feature_type, min_value, MIN_FEATURE_VALUE))
def __init__( self, normalization_parameters: Dict[str, NormalizationParameters], use_gpu: bool, typed_output: bool = False, ) -> None: super(Preprocessor, self).__init__() self.normalization_parameters = normalization_parameters self.sorted_features, self.sorted_feature_boundaries = ( self._sort_features_by_normalization()) self.typed_output = typed_output cuda_available = torch.cuda.is_available() logger.info("CUDA availability: {}".format(cuda_available)) if use_gpu and cuda_available: logger.info("Using GPU: GPU requested and available.") self.use_gpu = True self.dtype = torch.cuda.FloatTensor else: logger.info("NOT Using GPU: GPU not requested or not available.") self.use_gpu = False self.dtype = torch.FloatTensor # NOTE: Because of the way we call AppendNet to squash ONNX to a C2 net, # We need to make tensors for every numeric literal self.zero_tensor = Parameter(torch.tensor([0.0]).type(self.dtype), requires_grad=False) self.one_tensor = Parameter(torch.tensor([1.0]).type(self.dtype), requires_grad=False) self.one_half_tensor = Parameter(torch.tensor([0.5]).type(self.dtype), requires_grad=False) self.one_hundredth_tensor = Parameter(torch.tensor([0.01]).type( self.dtype), requires_grad=False) self.negative_one_tensor = Parameter(torch.tensor([-1.0 ]).type(self.dtype), requires_grad=False) self.missing_tensor = Parameter(torch.tensor([MISSING_VALUE ]).type(self.dtype), requires_grad=False) self.min_tensor = Parameter(torch.tensor([-1e20]).type(self.dtype), requires_grad=False) self.max_tensor = Parameter(torch.tensor([1e20]).type(self.dtype), requires_grad=False) self.epsilon_tensor = Parameter(torch.tensor([EPS]).type(self.dtype), requires_grad=False) feature_starts = self._get_type_boundaries() for i, feature_type in enumerate(FEATURE_TYPES): begin_index = feature_starts[i] if (i + 1) == len(FEATURE_TYPES): end_index = len(self.normalization_parameters) else: end_index = feature_starts[i + 1] if begin_index == end_index: continue # No features of this type if feature_type == ENUM: # Process one-at-a-time for j in range(begin_index, end_index): norm_params = self.normalization_parameters[ self.sorted_features[j]] func = getattr(self, "_create_parameters_" + feature_type) func(j, norm_params) else: norm_params = [] for f in self.sorted_features[begin_index:end_index]: norm_params.append(self.normalization_parameters[f]) func = getattr(self, "_create_parameters_" + feature_type) func(begin_index, norm_params)
class TopkHierarchicalMultiheadAttention(nn.Module): """Multi-headed attention. See "Attention Is All You Need" for more details. """ def __init__(self, embed_dim, num_heads, kdim=None, vdim=None, dropout=0., bias=True, add_bias_kv=False, add_zero_attn=False): super().__init__() self.embed_dim = embed_dim self.kdim = kdim if kdim is not None else embed_dim self.vdim = vdim if vdim is not None else embed_dim self.qkv_same_dim = self.kdim == embed_dim and self.vdim == embed_dim self.num_heads = num_heads self.dropout = dropout self.head_dim = embed_dim // num_heads assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads" self.scaling = self.head_dim**-0.5 if self.qkv_same_dim: self.in_proj_weight = Parameter( torch.Tensor(3 * embed_dim, embed_dim)) else: self.k_proj_weight = Parameter(torch.Tensor(embed_dim, self.kdim)) self.v_proj_weight = Parameter(torch.Tensor(embed_dim, self.vdim)) self.q_proj_weight = Parameter(torch.Tensor(embed_dim, embed_dim)) if bias: self.in_proj_bias = Parameter(torch.Tensor(3 * embed_dim)) else: self.register_parameter('in_proj_bias', None) self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias) if add_bias_kv: self.bias_k = Parameter(torch.Tensor(1, 1, embed_dim)) self.bias_v = Parameter(torch.Tensor(1, 1, embed_dim)) else: self.bias_k = self.bias_v = None self.add_zero_attn = add_zero_attn self.reset_parameters() self.onnx_trace = False def prepare_for_onnx_export_(self): self.onnx_trace = True def reset_parameters(self): if self.qkv_same_dim: nn.init.xavier_uniform_(self.in_proj_weight) else: nn.init.xavier_uniform_(self.k_proj_weight) nn.init.xavier_uniform_(self.v_proj_weight) nn.init.xavier_uniform_(self.q_proj_weight) nn.init.xavier_uniform_(self.out_proj.weight) if self.in_proj_bias is not None: nn.init.constant_(self.in_proj_bias, 0.) nn.init.constant_(self.out_proj.bias, 0.) if self.bias_k is not None: nn.init.xavier_normal_(self.bias_k) if self.bias_v is not None: nn.init.xavier_normal_(self.bias_v) def forward(self, query, key, value, hierarchical_attn, key_padding_mask=None, incremental_state=None, need_weights=True, static_kv=False, attn_mask=None): """Input shape: Time x Batch x Channel Self-attention can be implemented by passing in the same arguments for query, key and value. Timesteps can be masked by supplying a T x T mask in the `attn_mask` argument. Padding elements can be excluded from the key by passing a binary ByteTensor (`key_padding_mask`) with shape: batch x src_len, where padding elements are indicated by 1s. 
""" qkv_same = query.data_ptr() == key.data_ptr() == value.data_ptr() kv_same = key.data_ptr() == value.data_ptr() tgt_len, bsz, embed_dim = query.size() assert embed_dim == self.embed_dim assert list(query.size()) == [tgt_len, bsz, embed_dim] if incremental_state is not None: saved_state = self._get_input_buffer(incremental_state) if 'prev_key' in saved_state: # previous time steps are cached - no need to recompute # key and value if they are static if static_kv: assert kv_same and not qkv_same key = value = None else: saved_state = None if qkv_same: # self-attention q, k, v = self.in_proj_qkv(query) elif kv_same: # encoder-decoder attention q = self.in_proj_q(query) if key is None: assert value is None k = v = None else: k = self.in_proj_k(key) v = self.in_proj_v(key) else: q = self.in_proj_q(query) k = self.in_proj_k(key) v = self.in_proj_v(value) q *= self.scaling if self.bias_k is not None: assert self.bias_v is not None k = torch.cat([k, self.bias_k.repeat(1, bsz, 1)]) v = torch.cat([v, self.bias_v.repeat(1, bsz, 1)]) if attn_mask is not None: attn_mask = torch.cat( [attn_mask, attn_mask.new_zeros(attn_mask.size(0), 1)], dim=1) if key_padding_mask is not None: key_padding_mask = torch.cat([ key_padding_mask, key_padding_mask.new_zeros(key_padding_mask.size(0), 1) ], dim=1) q = q.contiguous().view(tgt_len, bsz * self.num_heads, self.head_dim).transpose(0, 1) if k is not None: k = k.contiguous().view(-1, bsz * self.num_heads, self.head_dim).transpose(0, 1) if v is not None: v = v.contiguous().view(-1, bsz * self.num_heads, self.head_dim).transpose(0, 1) if saved_state is not None: # saved states are stored with shape (bsz, num_heads, seq_len, head_dim) if 'prev_key' in saved_state: prev_key = saved_state['prev_key'].view( bsz * self.num_heads, -1, self.head_dim) if static_kv: k = prev_key else: k = torch.cat((prev_key, k), dim=1) if 'prev_value' in saved_state: prev_value = saved_state['prev_value'].view( bsz * self.num_heads, -1, self.head_dim) if static_kv: v = prev_value else: v = torch.cat((prev_value, v), dim=1) saved_state['prev_key'] = k.view(bsz, self.num_heads, -1, self.head_dim) saved_state['prev_value'] = v.view(bsz, self.num_heads, -1, self.head_dim) self._set_input_buffer(incremental_state, saved_state) src_len = k.size(1) # This is part of a workaround to get around fork/join parallelism # not supporting Optional types. 
if key_padding_mask is not None and key_padding_mask.shape == torch.Size( []): key_padding_mask = None if key_padding_mask is not None: assert key_padding_mask.size(0) == bsz assert key_padding_mask.size(1) == src_len if self.add_zero_attn: src_len += 1 k = torch.cat([k, k.new_zeros((k.size(0), 1) + k.size()[2:])], dim=1) v = torch.cat([v, v.new_zeros((v.size(0), 1) + v.size()[2:])], dim=1) if attn_mask is not None: attn_mask = torch.cat( [attn_mask, attn_mask.new_zeros(attn_mask.size(0), 1)], dim=1) if key_padding_mask is not None: key_padding_mask = torch.cat([ key_padding_mask, torch.zeros(key_padding_mask.size(0), 1).type_as(key_padding_mask) ], dim=1) attn_weights = torch.bmm(q, k.transpose(1, 2)) assert list( attn_weights.size()) == [bsz * self.num_heads, tgt_len, src_len] if attn_mask is not None: attn_mask = attn_mask.unsqueeze(0) if self.onnx_trace: attn_mask = attn_mask.repeat(attn_weights.size(0), 1, 1) attn_weights += attn_mask if key_padding_mask is not None: # don't attend to padding symbols attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) if self.onnx_trace: attn_weights = torch.where( key_padding_mask.unsqueeze(1).unsqueeze(2), torch.Tensor([-2**32 + 1]), attn_weights.float()).type_as(attn_weights) else: attn_weights = attn_weights.float().masked_fill( key_padding_mask.unsqueeze(1).unsqueeze(2), -2**32 + 1, ).type_as(attn_weights) # FP16 support: cast to float and back attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) if hierarchical_attn is not None: attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) attn_weights = attn_weights * hierarchical_attn.unsqueeze(1) attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) attn_weights = utils.softmax( attn_weights, dim=-1, onnx_trace=self.onnx_trace, ).type_as(attn_weights) attn_weights = F.dropout( attn_weights, p=self.dropout, training=self.training) # (bsz * self.num_heads, tgt_len, src_len) # select topk topk_value, topk_indice = torch.kthvalue(attn_weights, attn_weights.size(-1) - 10, dim=-1) topk_mask = torch.ge( attn_weights, topk_value.unsqueeze(-1).repeat(1, 1, attn_weights.size(-1))) attn = torch.bmm(attn_weights * topk_mask.float(), v) # attn = torch.bmm(attn_weights, v) assert list( attn.size()) == [bsz * self.num_heads, tgt_len, self.head_dim] if (self.onnx_trace and attn.size(1) == 1): # when ONNX tracing a single decoder step (sequence length == 1) # the transpose is a no-op copy before view, thus unnecessary attn = attn.contiguous().view(tgt_len, bsz, embed_dim) else: attn = attn.transpose(0, 1).contiguous().view(tgt_len, bsz, embed_dim) attn = self.out_proj(attn) if need_weights: # average attention weights over heads attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) attn_weights = attn_weights.sum(dim=1) / self.num_heads else: attn_weights = None return attn, attn_weights def in_proj_qkv(self, query): return self._in_proj(query).chunk(3, dim=-1) def in_proj_q(self, query): if self.qkv_same_dim: return self._in_proj(query, end=self.embed_dim) else: bias = self.in_proj_bias if bias is not None: bias = bias[:self.embed_dim] return F.linear(query, self.q_proj_weight, bias) def in_proj_k(self, key): if self.qkv_same_dim: return self._in_proj(key, start=self.embed_dim, end=2 * self.embed_dim) else: weight = self.k_proj_weight bias = self.in_proj_bias if bias is not None: bias = bias[self.embed_dim:2 * self.embed_dim] return F.linear(key, weight, bias) def in_proj_v(self, value): if self.qkv_same_dim: return 
self._in_proj(value, start=2 * self.embed_dim) else: weight = self.v_proj_weight bias = self.in_proj_bias if bias is not None: bias = bias[2 * self.embed_dim:] return F.linear(value, weight, bias) def _in_proj(self, input, start=0, end=None): weight = self.in_proj_weight bias = self.in_proj_bias weight = weight[start:end, :] if bias is not None: bias = bias[start:end] return F.linear(input, weight, bias) def reorder_incremental_state(self, incremental_state, new_order): """Reorder buffered internal state (for incremental generation).""" input_buffer = self._get_input_buffer(incremental_state) if input_buffer is not None: for k in input_buffer.keys(): input_buffer[k] = input_buffer[k].index_select(0, new_order) self._set_input_buffer(incremental_state, input_buffer) def _get_input_buffer(self, incremental_state): return utils.get_incremental_state( self, incremental_state, 'attn_state', ) or {} def _set_input_buffer(self, incremental_state, buffer): utils.set_incremental_state( self, incremental_state, 'attn_state', buffer, )
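# Hypothetical smoke test for TopkHierarchicalMultiheadAttention above; it assumes the
# fairseq-style `utils` this module imports is available. hierarchical_attn=None exercises
# the plain top-k path, and the source length is kept above 10 because forward() calls
# torch.kthvalue(attn_weights, src_len - 10).
import torch

attn = TopkHierarchicalMultiheadAttention(embed_dim=512, num_heads=8, dropout=0.1)
x = torch.randn(16, 2, 512)  # (time, batch, channel), per the docstring

out, avg_weights = attn(query=x, key=x, value=x, hierarchical_attn=None)
print(out.shape)          # torch.Size([16, 2, 512])
print(avg_weights.shape)  # torch.Size([2, 16, 16]) -- averaged over heads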
class MyGRU(nn.Module): def __init__(self, input_size, hidden_size, layers=1, bidirectional=False, initpara=True, attn_decode=False, post_size=None): super(MyGRU, self).__init__() self.input_size, self.hidden_size, self.layers, self.bidirectional = \ input_size, hidden_size, layers, bidirectional self.GRU = GRU(input_size, hidden_size, layers, bidirectional=bidirectional) self.initpara = initpara if initpara: if bidirectional: self.h_init = Parameter( torch.Tensor(2 * layers, 1, hidden_size)) else: self.h_init = Parameter(torch.Tensor(layers, 1, hidden_size)) self.reset_parameters() if attn_decode: self.attn_query = nn.Linear(hidden_size, post_size) def reset_parameters(self): if self.initpara: stdv = 1.0 / math.sqrt(self.hidden_size) self.h_init.data.uniform_(-stdv, stdv) def getInitialParameter(self, batch_size): return self.h_init.repeat(1, batch_size, 1) def forward(self, incoming, length, h_init=None, need_h=False, attn_decode=False, post=None, post_length=None): if not attn_decode: sen_sorted, length_sorted, memo = sortSequence(incoming, length) left_batch_size = sen_sorted.shape[-2] sen_packed = pack_padded_sequence(sen_sorted, length_sorted) if h_init is None: h_init = self.getInitialParameter(left_batch_size) else: h_shape = h_init.size() h_init = sortSequenceByMemo(h_init, memo) h_init = h_init.reshape(h_shape) if h_init.dim() < 3: h_init = torch.unsqueeze(h_init, 0) h, h_n = self.GRU(sen_packed, h_init) h_n = h_n.transpose(0, 1).reshape(left_batch_size, -1) h_n = revertSequence(h_n, memo) if need_h: h = pad_packed_sequence(h)[0] h = revertSequence(h, memo, True) return h, h_n else: return h_n else: batch_size = incoming.shape[1] seqlen = incoming.shape[0] if h_init is None: h_init = self.getInitialParameter(batch_size) else: h_init = torch.unsqueeze(h_init, 0) h_now = h_init[0] hs = [] attn_weights = [] for i in range(seqlen): query = self.attn_query(h_now) attn_weight = maskedSoftmax( (query.unsqueeze(0) * post).sum(-1), post_length) context = (attn_weight.unsqueeze(-1) * post).sum(0) h_now = self.cell_forward( torch.cat([incoming[i], context], dim=-1), h_now) * Tensor( (length > np.ones(batch_size) * i).astype(float)).unsqueeze(-1) hs.append(h_now) attn_weights.append(attn_weight) return torch.stack(hs), h_now def cell_forward(self, incoming, h): return F_GRUCell(incoming, h, self.GRU.weight_ih_l0, self.GRU.weight_hh_l0, self.GRU.bias_ih_l0, self.GRU.bias_hh_l0)
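# Hypothetical usage of MyGRU's default (non-attention) path. It assumes the helper
# functions this module relies on (sortSequence, revertSequence, sortSequenceByMemo, ...)
# are importable from the surrounding codebase; shapes follow the packed-sequence convention.
import numpy as np
import torch

rnn = MyGRU(input_size=300, hidden_size=128)
seq = torch.randn(15, 4, 300)            # (seq_len, batch, input_size)
lengths = np.array([15, 12, 9, 5])       # valid lengths per batch element

h, h_n = rnn(seq, lengths, need_h=True)  # h: padded per-step outputs, h_n: final hidden states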
def __init__( self, in_channels: Union[int, Tuple[int, int]], out_channels: Optional[int], in_edge_channels: int = None, aggr: str = "add", skip_linear: str = False, directed_msg: bool = True, heads: int = 1, attention: bool = False, attention_type: str = "additive", l2_normalize: bool = False, bias: bool = True, **kwargs, ): kwargs.setdefault('aggr', aggr) super().__init__(node_dim=0, **kwargs) self.in_channels = in_channels self.out_channels = out_channels self.in_edge_channels = in_edge_channels self.aggr = aggr self.skip_linear = skip_linear self.directed_msg = directed_msg self.heads = heads self.attention = attention self.attention_type = attention_type self.normalize_l2 = l2_normalize if isinstance(in_channels, int): in_channels = (in_channels, in_channels) if self.directed_msg: self.lin_msg = Linear(in_channels[0], out_channels * self.heads, bias=bias) else: self.lin_msg = Linear(in_channels[0], out_channels * self.heads, bias=bias) self.lin_msg_i = Linear(in_channels[0], out_channels * self.heads, bias=bias) if self.skip_linear or self.in_channels != self.out_channels: self.lin_self = Linear(in_channels[1], out_channels, bias=bias) else: self.lin_self = torch.nn.Identity() if self.in_edge_channels is not None: self.lin_edge = Linear(in_edge_channels, out_channels * self.heads, bias=bias) # TODO: A general torch_geometric.nn.AttentionLayer if self.attention: if self.attention_type == 'additive': self.att_msg = Parameter( torch.Tensor(1, self.heads, self.out_channels)) elif self.attention_type == 'dot_product': self.scaler = torch.sqrt( torch.tensor(out_channels, dtype=torch.float)) else: raise ValueError( f"Attention type '{self.attention_type}' not supported") self.reset_parameters()
class MultiheadAttention(nn.Module): """Multi-headed attention. See "Attention Is All You Need" for more details. """ def __init__( self, embed_dim, num_heads, kdim=None, vdim=None, dropout=0.0, bias=True, add_bias_kv=False, add_zero_attn=False, self_attention=False, encoder_decoder_attention=False, q_noise=0.0, qn_block_size=8, ): super().__init__() self.embed_dim = embed_dim self.kdim = kdim if kdim is not None else embed_dim self.vdim = vdim if vdim is not None else embed_dim self.qkv_same_dim = self.kdim == embed_dim and self.vdim == embed_dim self.num_heads = num_heads self.dropout_module = FairseqDropout( dropout, module_name=self.__class__.__name__) self.head_dim = embed_dim // num_heads assert (self.head_dim * num_heads == self.embed_dim ), "embed_dim must be divisible by num_heads" self.scaling = self.head_dim**-0.5 self.self_attention = self_attention self.encoder_decoder_attention = encoder_decoder_attention assert not self.self_attention or self.qkv_same_dim, ( "Self-attention requires query, key and " "value to be of the same size") self.k_proj = quant_noise(nn.Linear(self.kdim, embed_dim, bias=bias), q_noise, qn_block_size) self.v_proj = quant_noise(nn.Linear(self.vdim, embed_dim, bias=bias), q_noise, qn_block_size) self.q_proj = quant_noise(nn.Linear(embed_dim, embed_dim, bias=bias), q_noise, qn_block_size) self.out_proj = quant_noise(nn.Linear(embed_dim, embed_dim, bias=bias), q_noise, qn_block_size) if add_bias_kv: self.bias_k = Parameter(torch.Tensor(1, 1, embed_dim)) self.bias_v = Parameter(torch.Tensor(1, 1, embed_dim)) else: self.bias_k = self.bias_v = None self.add_zero_attn = add_zero_attn self.reset_parameters() self.onnx_trace = False def prepare_for_onnx_export_(self): self.onnx_trace = True def reset_parameters(self): if self.qkv_same_dim: # Empirically observed the convergence to be much better with # the scaled initialization nn.init.xavier_uniform_(self.k_proj.weight, gain=1 / math.sqrt(2)) nn.init.xavier_uniform_(self.v_proj.weight, gain=1 / math.sqrt(2)) nn.init.xavier_uniform_(self.q_proj.weight, gain=1 / math.sqrt(2)) else: nn.init.xavier_uniform_(self.k_proj.weight) nn.init.xavier_uniform_(self.v_proj.weight) nn.init.xavier_uniform_(self.q_proj.weight) nn.init.xavier_uniform_(self.out_proj.weight) if self.out_proj.bias is not None: nn.init.constant_(self.out_proj.bias, 0.0) if self.bias_k is not None: nn.init.xavier_normal_(self.bias_k) if self.bias_v is not None: nn.init.xavier_normal_(self.bias_v) def forward( self, query, key: Optional[Tensor], value: Optional[Tensor], key_padding_mask: Optional[Tensor] = None, incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None, need_weights: bool = True, static_kv: bool = False, attn_mask: Optional[Tensor] = None, before_softmax: bool = False, need_head_weights: bool = False, ) -> Tuple[Tensor, Optional[Tensor]]: """Input shape: Time x Batch x Channel Args: key_padding_mask (ByteTensor, optional): mask to exclude keys that are pads, of shape `(batch, src_len)`, where padding elements are indicated by 1s. need_weights (bool, optional): return the attention weights, averaged over heads (default: False). attn_mask (ByteTensor, optional): typically used to implement causal attention, where the mask prevents the attention from looking forward in time (default: None). before_softmax (bool, optional): return the raw attention weights and values before the attention softmax. need_head_weights (bool, optional): return the attention weights for each head. Implies *need_weights*. 
Default: return the average attention weights over all heads. """ if need_head_weights: need_weights = True is_tpu = query.device.type == "xla" tgt_len, bsz, embed_dim = query.size() assert embed_dim == self.embed_dim assert list(query.size()) == [tgt_len, bsz, embed_dim] if (not self.onnx_trace and not is_tpu # don't use PyTorch version on TPUs and incremental_state is None and not static_kv # A workaround for quantization to work. Otherwise JIT compilation # treats bias in linear module as method. and not torch.jit.is_scripting()): assert key is not None and value is not None return F.multi_head_attention_forward( query, key, value, self.embed_dim, self.num_heads, torch.empty([0]), torch.cat( (self.q_proj.bias, self.k_proj.bias, self.v_proj.bias)), self.bias_k, self.bias_v, self.add_zero_attn, self.dropout_module.p, self.out_proj.weight, self.out_proj.bias, self.training or self.dropout_module.apply_during_inference, key_padding_mask, need_weights, attn_mask, use_separate_proj_weight=True, q_proj_weight=self.q_proj.weight, k_proj_weight=self.k_proj.weight, v_proj_weight=self.v_proj.weight, ) if incremental_state is not None: saved_state = self._get_input_buffer(incremental_state) if saved_state is not None and "prev_key" in saved_state: # previous time steps are cached - no need to recompute # key and value if they are static if static_kv: assert self.encoder_decoder_attention and not self.self_attention key = value = None else: saved_state = None if self.self_attention: q = self.q_proj(query) k = self.k_proj(query) v = self.v_proj(query) elif self.encoder_decoder_attention: # encoder-decoder attention q = self.q_proj(query) if key is None: assert value is None k = v = None else: k = self.k_proj(key) v = self.v_proj(key) else: assert key is not None and value is not None q = self.q_proj(query) k = self.k_proj(key) v = self.v_proj(value) q *= self.scaling if self.bias_k is not None: assert self.bias_v is not None k = torch.cat([k, self.bias_k.repeat(1, bsz, 1)]) v = torch.cat([v, self.bias_v.repeat(1, bsz, 1)]) if attn_mask is not None: attn_mask = torch.cat( [attn_mask, attn_mask.new_zeros(attn_mask.size(0), 1)], dim=1) if key_padding_mask is not None: key_padding_mask = torch.cat( [ key_padding_mask, key_padding_mask.new_zeros(key_padding_mask.size(0), 1), ], dim=1, ) q = (q.contiguous().view(tgt_len, bsz * self.num_heads, self.head_dim).transpose(0, 1)) if k is not None: k = (k.contiguous().view(-1, bsz * self.num_heads, self.head_dim).transpose(0, 1)) if v is not None: v = (v.contiguous().view(-1, bsz * self.num_heads, self.head_dim).transpose(0, 1)) if saved_state is not None: # saved states are stored with shape (bsz, num_heads, seq_len, head_dim) if "prev_key" in saved_state: _prev_key = saved_state["prev_key"] assert _prev_key is not None prev_key = _prev_key.view(bsz * self.num_heads, -1, self.head_dim) if static_kv: k = prev_key else: assert k is not None k = torch.cat([prev_key, k], dim=1) if "prev_value" in saved_state: _prev_value = saved_state["prev_value"] assert _prev_value is not None prev_value = _prev_value.view(bsz * self.num_heads, -1, self.head_dim) if static_kv: v = prev_value else: assert v is not None v = torch.cat([prev_value, v], dim=1) prev_key_padding_mask: Optional[Tensor] = None if "prev_key_padding_mask" in saved_state: prev_key_padding_mask = saved_state["prev_key_padding_mask"] assert k is not None and v is not None key_padding_mask = MultiheadAttention._append_prev_key_padding_mask( key_padding_mask=key_padding_mask, 
prev_key_padding_mask=prev_key_padding_mask, batch_size=bsz, src_len=k.size(1), static_kv=static_kv, ) saved_state["prev_key"] = k.view(bsz, self.num_heads, -1, self.head_dim) saved_state["prev_value"] = v.view(bsz, self.num_heads, -1, self.head_dim) saved_state["prev_key_padding_mask"] = key_padding_mask # In this branch incremental_state is never None assert incremental_state is not None incremental_state = self._set_input_buffer(incremental_state, saved_state) assert k is not None src_len = k.size(1) # This is part of a workaround to get around fork/join parallelism # not supporting Optional types. if key_padding_mask is not None and key_padding_mask.dim() == 0: key_padding_mask = None if key_padding_mask is not None: assert key_padding_mask.size(0) == bsz assert key_padding_mask.size(1) == src_len if self.add_zero_attn: assert v is not None src_len += 1 k = torch.cat([k, k.new_zeros((k.size(0), 1) + k.size()[2:])], dim=1) v = torch.cat([v, v.new_zeros((v.size(0), 1) + v.size()[2:])], dim=1) if attn_mask is not None: attn_mask = torch.cat( [attn_mask, attn_mask.new_zeros(attn_mask.size(0), 1)], dim=1) if key_padding_mask is not None: key_padding_mask = torch.cat( [ key_padding_mask, torch.zeros(key_padding_mask.size(0), 1).type_as(key_padding_mask), ], dim=1, ) attn_weights = torch.bmm(q, k.transpose(1, 2)) attn_weights = self.apply_sparse_mask(attn_weights, tgt_len, src_len, bsz) assert list( attn_weights.size()) == [bsz * self.num_heads, tgt_len, src_len] if attn_mask is not None: attn_mask = attn_mask.unsqueeze(0) if self.onnx_trace: attn_mask = attn_mask.repeat(attn_weights.size(0), 1, 1) attn_weights += attn_mask if key_padding_mask is not None: # don't attend to padding symbols attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) if not is_tpu: attn_weights = attn_weights.masked_fill( key_padding_mask.unsqueeze(1).unsqueeze(2).to(torch.bool), float("-inf"), ) else: attn_weights = attn_weights.transpose(0, 2) attn_weights = attn_weights.masked_fill( key_padding_mask, float("-inf")) attn_weights = attn_weights.transpose(0, 2) attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) if before_softmax: return attn_weights, v attn_weights_float = utils.softmax(attn_weights, dim=-1, onnx_trace=self.onnx_trace) attn_weights = attn_weights_float.type_as(attn_weights) attn_probs = self.dropout_module(attn_weights) assert v is not None attn = torch.bmm(attn_probs, v) assert list( attn.size()) == [bsz * self.num_heads, tgt_len, self.head_dim] if self.onnx_trace and attn.size(1) == 1: # when ONNX tracing a single decoder step (sequence length == 1) # the transpose is a no-op copy before view, thus unnecessary attn = attn.contiguous().view(tgt_len, bsz, embed_dim) else: attn = attn.transpose(0, 1).contiguous().view(tgt_len, bsz, embed_dim) attn = self.out_proj(attn) attn_weights: Optional[Tensor] = None if need_weights: attn_weights = attn_weights_float.view(bsz, self.num_heads, tgt_len, src_len).transpose(1, 0) if not need_head_weights: # average attention weights over heads attn_weights = attn_weights.mean(dim=0) return attn, attn_weights @staticmethod def _append_prev_key_padding_mask( key_padding_mask: Optional[Tensor], prev_key_padding_mask: Optional[Tensor], batch_size: int, src_len: int, static_kv: bool, ) -> Optional[Tensor]: # saved key padding masks have shape (bsz, seq_len) if prev_key_padding_mask is not None and static_kv: new_key_padding_mask = prev_key_padding_mask elif prev_key_padding_mask is not None and key_padding_mask is not None: 
new_key_padding_mask = torch.cat( [prev_key_padding_mask.float(), key_padding_mask.float()], dim=1) # During incremental decoding, as the padding token enters and # leaves the frame, there will be a time when prev or current # is None elif prev_key_padding_mask is not None: filler = torch.zeros( (batch_size, src_len - prev_key_padding_mask.size(1)), device=prev_key_padding_mask.device, ) new_key_padding_mask = torch.cat( [prev_key_padding_mask.float(), filler.float()], dim=1) elif key_padding_mask is not None: filler = torch.zeros( (batch_size, src_len - key_padding_mask.size(1)), device=key_padding_mask.device, ) new_key_padding_mask = torch.cat( [filler.float(), key_padding_mask.float()], dim=1) else: new_key_padding_mask = prev_key_padding_mask return new_key_padding_mask @torch.jit.export def reorder_incremental_state( self, incremental_state: Dict[str, Dict[str, Optional[Tensor]]], new_order: Tensor, ): """Reorder buffered internal state (for incremental generation).""" input_buffer = self._get_input_buffer(incremental_state) if input_buffer is not None: for k in input_buffer.keys(): input_buffer_k = input_buffer[k] if input_buffer_k is not None: if self.encoder_decoder_attention and input_buffer_k.size( 0) == new_order.size(0): break input_buffer[k] = input_buffer_k.index_select(0, new_order) incremental_state = self._set_input_buffer(incremental_state, input_buffer) return incremental_state def _get_input_buffer( self, incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] ) -> Dict[str, Optional[Tensor]]: result = self.get_incremental_state(incremental_state, "attn_state") if result is not None: return result else: empty_result: Dict[str, Optional[Tensor]] = {} return empty_result def _set_input_buffer( self, incremental_state: Dict[str, Dict[str, Optional[Tensor]]], buffer: Dict[str, Optional[Tensor]], ): return self.set_incremental_state(incremental_state, "attn_state", buffer) def apply_sparse_mask(self, attn_weights, tgt_len: int, src_len: int, bsz: int): return attn_weights def upgrade_state_dict_named(self, state_dict, name): prefix = name + "." if name != "" else "" items_to_add = {} keys_to_remove = [] for k in state_dict.keys(): if k.endswith(prefix + "in_proj_weight"): # in_proj_weight used to be q + k + v with same dimensions dim = int(state_dict[k].shape[0] / 3) items_to_add[prefix + "q_proj.weight"] = state_dict[k][:dim] items_to_add[prefix + "k_proj.weight"] = state_dict[k][dim:2 * dim] items_to_add[prefix + "v_proj.weight"] = state_dict[k][2 * dim:] keys_to_remove.append(k) k_bias = prefix + "in_proj_bias" if k_bias in state_dict.keys(): dim = int(state_dict[k].shape[0] / 3) items_to_add[prefix + "q_proj.bias"] = state_dict[k_bias][:dim] items_to_add[prefix + "k_proj.bias"] = state_dict[k_bias][dim:2 * dim] items_to_add[prefix + "v_proj.bias"] = state_dict[k_bias][2 * dim:] keys_to_remove.append(prefix + "in_proj_bias") for k in keys_to_remove: del state_dict[k] for key, value in items_to_add.items(): state_dict[key] = value
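# Hypothetical smoke test for the fairseq-style MultiheadAttention above; it assumes the
# fairseq helpers imported by this module (FairseqDropout, quant_noise, utils) are available.
# On CPU, outside scripting and without incremental state, forward() takes the fast
# F.multi_head_attention_forward branch.
import torch

mha = MultiheadAttention(embed_dim=256, num_heads=4, dropout=0.1, self_attention=True)
x = torch.randn(10, 3, 256)  # (time, batch, channel)

out, weights = mha(query=x, key=x, value=x, need_weights=True)
print(out.shape)      # torch.Size([10, 3, 256])
print(weights.shape)  # torch.Size([3, 10, 10]) -- attention averaged over heads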
class TopKPooling(torch.nn.Module): r""":math:`\mathrm{top}_k` pooling operator from the `"Graph U-Nets" <https://arxiv.org/abs/1905.05178>`_, `"Towards Sparse Hierarchical Graph Classifiers" <https://arxiv.org/abs/1811.01287>`_ and `"Understanding Attention and Generalization in Graph Neural Networks" <https://arxiv.org/abs/1905.02850>`_ papers if min_score :math:`\tilde{\alpha}` is None: .. math:: \mathbf{y} &= \frac{\mathbf{X}\mathbf{p}}{\| \mathbf{p} \|} \mathbf{i} &= \mathrm{top}_k(\mathbf{y}) \mathbf{X}^{\prime} &= (\mathbf{X} \odot \mathrm{tanh}(\mathbf{y}))_{\mathbf{i}} \mathbf{A}^{\prime} &= \mathbf{A}_{\mathbf{i},\mathbf{i}} if min_score :math:`\tilde{\alpha}` is a value in [0, 1]: .. math:: \mathbf{y} &= \mathrm{softmax}(\mathbf{X}\mathbf{p}) \mathbf{i} &= \mathbf{y}_i > \tilde{\alpha} \mathbf{X}^{\prime} &= (\mathbf{X} \odot \mathbf{y})_{\mathbf{i}} \mathbf{A}^{\prime} &= \mathbf{A}_{\mathbf{i},\mathbf{i}}, where nodes are dropped based on a learnable projection score :math:`\mathbf{p}`. Args: in_channels (int): Size of each input sample. ratio (float): Graph pooling ratio, which is used to compute :math:`k = \lceil \mathrm{ratio} \cdot N \rceil`. This value is ignored if min_score is not None. (default: :obj:`0.5`) min_score (float, optional): Minimal node score :math:`\tilde{\alpha}` which is used to compute indices of pooled nodes :math:`\mathbf{i} = \mathbf{y}_i > \tilde{\alpha}`. When this value is not :obj:`None`, the :obj:`ratio` argument is ignored. (default: :obj:`None`) multiplier (float, optional): Coefficient by which features gets multiplied after pooling. This can be useful for large graphs and when :obj:`min_score` is used. (default: :obj:`1`) nonlinearity (torch.nn.functional, optional): The nonlinearity to use. (default: :obj:`torch.tanh`) """ def __init__(self, in_channels, ratio=0.5, min_score=None, multiplier=1, nonlinearity=torch.tanh): super(TopKPooling, self).__init__() self.in_channels = in_channels self.ratio = ratio self.min_score = min_score self.multiplier = multiplier self.nonlinearity = nonlinearity self.weight = Parameter(torch.Tensor(1, in_channels)) self.reset_parameters() def reset_parameters(self): size = self.in_channels uniform(size, self.weight) def forward(self, x, edge_index, edge_attr=None, batch=None, attn=None): """""" if batch is None: batch = edge_index.new_zeros(x.size(0)) attn = x if attn is None else attn attn = attn.unsqueeze(-1) if attn.dim() == 1 else attn score = (attn * self.weight).sum(dim=-1) ##### zero mean for each instance #########3 score = score.view(batch.max() + 1, -1) score = score - score.mean(1,keepdim=True) # score = score.view(-1) if self.min_score is None: score = self.nonlinearity(score / self.weight.norm(p=2, dim=-1)) else: score = softmax(score, batch) perm = topk(score, self.ratio, batch, self.min_score) x = x[perm] * score[perm].view(-1, 1) x = self.multiplier * x if self.multiplier != 1 else x batch = batch[perm] edge_index, edge_attr = filter_adj(edge_index, edge_attr, perm, num_nodes=score.size(0)) # we changed the last returm term --- score, which are the scores for all the nodes return x, edge_index, edge_attr, batch, perm, score.view(batch.max()+1,-1) def __repr__(self): return '{}({}, {}={}, multiplier={})'.format( self.__class__.__name__, self.in_channels, 'ratio' if self.min_score is None else 'min_score', self.ratio if self.min_score is None else self.min_score, self.multiplier)
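# Standalone illustration (plain torch, toy sizes) of the projection score from the
# TopKPooling docstring: y = Xp / ||p||, i = top_k(y), X' = (X * tanh(y))_i. It does not
# use this class's modified topk/filter_adj helpers.
import torch

N, C, k = 6, 4, 3
X = torch.randn(N, C)            # node features
p = torch.randn(C)               # learnable projection vector

y = (X @ p) / p.norm(p=2)        # scalar score per node
perm = y.topk(k).indices         # indices of the k highest-scoring nodes
X_pooled = X[perm] * torch.tanh(y[perm]).unsqueeze(-1)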
class PositionWiseFeedForward(nn.Module): """Two-layer Feed-forward neural network""" def __init__(self, model_size, inner_size, dropout=0., variational=False, activation='relu', glu=False, weight_drop=0.0): super().__init__() self.model_size = model_size self.inner_size = inner_size self.dropout = dropout self.bias = True self.variational = variational self.activation = activation self.glu = glu self.weight_drop = weight_drop self.autograd = False if self.activation == 'relu': if self.glu: self.act = nn.ReLU(inplace=True) else: self.act = ReLUDropout(p=self.dropout, variational=self.variational, batch_first=False) elif self.activation == 'gelu': self.act = nn.GELU() elif self.activation == 'agelu': self.act = AGELU() elif self.activation in ['silu', 'swish']: self.act = SiLU() elif self.activation in ['sigmoid']: if self.glu: self.act = nn.functional.glu else: print( "Sigmoid activation function is recommended to be used with -glu" ) raise NotImplementedError self.in_proj_weight = Parameter( torch.Tensor(inner_size * (2 if glu else 1), model_size)) self.out_proj_weight = Parameter(torch.Tensor(model_size, inner_size)) self.in_proj_bias = Parameter( torch.Tensor(inner_size * (2 if glu else 1))) self.out_proj_bias = Parameter(torch.Tensor(model_size)) self.reset_parameters() self.optimized = 2 self.fused = False # At the moment fused mlp is supported for RELU, SiLU, Swish, GELU and AGELU (approximated GELU) if not self.glu and \ self.activation in ['relu', 'silu', 'swish', 'gelu', 'agelu'] and not self.variational: if self.activation == 'relu': from onmt.modules.mlp.mlp import mlp_relu_function if mlp_relu_function is not None: self.fused_function = mlp_relu_function self.fused = True elif self.activation in ['silu', 'swish']: from onmt.modules.mlp.mlp import mlp_silu_function if mlp_silu_function is not None: self.fused_function = mlp_silu_function self.fused = True elif self.activation == 'gelu': from onmt.modules.mlp.mlp import mlp_gelu_function if mlp_gelu_function is not None: self.fused_function = mlp_gelu_function self.fused = True elif self.activation == 'agelu': from onmt.modules.mlp.mlp import mlp_agelu_function if mlp_agelu_function is not None: self.fused_function = mlp_agelu_function self.fused = True def reset_parameters(self, init='normal'): if init == 'normal': std_ = math.sqrt(2.0 / (self.model_size + self.inner_size)) nn.init.normal_(self.in_proj_weight, 0.0, std_) nn.init.normal_(self.out_proj_weight, 0.0, std_) else: std_ = math.sqrt(6.0 / (self.model_size + self.inner_size)) nn.init.uniform_(self.in_proj_weight, -std_, std_) nn.init.uniform_(self.out_proj_weight, -std_, std_) nn.init.constant_(self.in_proj_bias, 0.0) nn.init.constant_(self.out_proj_bias, 0.0) def convert_autograd(self): if self.autograd: return with torch.no_grad(): self.autograd = True self.linear_in = torch.nn.Linear(self.model_size, self.inner_size) self.linear_out = torch.nn.Linear(self.inner_size, self.model_size) self.linear_in.weight.copy_(self.in_proj_weight) self.linear_in.bias.copy_(self.in_proj_bias) self.linear_out.weight.copy_(self.out_proj_weight) self.linear_out.bias.copy_(self.out_proj_bias) del self.in_proj_weight del self.in_proj_bias del self.out_proj_weight del self.out_proj_bias def forward(self, input, *args): if self.fused and input.is_cuda: # if autocast is enabled: manually cast the function args into half manually # for some reason custom_fwd(...) 
doesn't work with autocast(enabled=False): weights = [ self.in_proj_weight.half(), self.out_proj_weight.half() ] biases = [self.in_proj_bias.half(), self.out_proj_bias.half()] seq_len, bsz, hidden_size = input.size(0), input.size( 1), input.size(2) dropout = self.dropout if self.training else 0.0 hidden = self.fused_function( dropout, input.half().view(seq_len * bsz, -1), *weights, *biases).type_as(input) hidden = hidden.view(seq_len, bsz, hidden_size) # verification code (only with dropout = 0.0) # with torch.no_grad(): # hidden_ = F.linear(self.act(F.linear(input, self.in_proj_weight, self.in_proj_bias)), # self.out_proj_weight, self.out_proj_bias).type_as(hidden) # # comp = torch.allclose(hidden, hidden_, rtol=1e-03, atol=1e-04) # print(comp) else: if self.autograd: hidden = self.linear_in(input) else: hidden = F.linear(input, self.in_proj_weight, self.in_proj_bias) if self.glu and self.activation != 'sigmoid': hidden, gate = hidden.chunk(2, dim=-1) hidden = self.act(hidden) * gate else: # GLU function hidden = self.act(hidden) if not (not self.glu and self.activation == 'relu'): if self.variational: hidden = variational_dropout( hidden, p=self.dropout, training=self.training, inplace=self.activation in ['silu', 'relu', 'swish', 'gelu']) else: hidden = F.dropout(hidden, p=self.dropout, training=self.training, inplace=self.activation in ['silu', 'relu', 'swish', 'gelu']) if self.autograd: hidden = self.linear_out(hidden) else: hidden = F.linear(hidden, self.out_proj_weight, self.out_proj_bias) return hidden
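# Hypothetical CPU smoke test for PositionWiseFeedForward above; it assumes the onmt
# helpers imported lazily in __init__ (ReLUDropout, the fused MLP kernels, ...) are
# importable. On CPU the fused branch is skipped and the plain F.linear path runs.
import torch

ffn = PositionWiseFeedForward(model_size=512, inner_size=2048, dropout=0.1, activation='relu')
x = torch.randn(20, 8, 512)  # (seq_len, batch, model_size)

y = ffn(x)
print(y.shape)               # torch.Size([20, 8, 512])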
class Preprocessor(Module): def __init__( self, normalization_parameters: Dict[str, NormalizationParameters], use_gpu: bool, typed_output: bool = False, ) -> None: super(Preprocessor, self).__init__() self.normalization_parameters = normalization_parameters self.sorted_features, self.sorted_feature_boundaries = ( self._sort_features_by_normalization() ) self.typed_output = typed_output cuda_available = torch.cuda.is_available() logger.info("CUDA availability: {}".format(cuda_available)) if use_gpu and cuda_available: logger.info("Using GPU: GPU requested and available.") self.use_gpu = True self.dtype = torch.cuda.FloatTensor else: logger.info("NOT Using GPU: GPU not requested or not available.") self.use_gpu = False self.dtype = torch.FloatTensor # NOTE: Because of the way we call AppendNet to squash ONNX to a C2 net, # We need to make tensors for every numeric literal self.zero_tensor = Parameter( torch.tensor([0.0]).type(self.dtype), requires_grad=False ) self.one_tensor = Parameter( torch.tensor([1.0]).type(self.dtype), requires_grad=False ) self.one_half_tensor = Parameter( torch.tensor([0.5]).type(self.dtype), requires_grad=False ) self.one_hundredth_tensor = Parameter( torch.tensor([0.01]).type(self.dtype), requires_grad=False ) self.negative_one_tensor = Parameter( torch.tensor([-1.0]).type(self.dtype), requires_grad=False ) self.missing_tensor = Parameter( torch.tensor([MISSING_VALUE]).type(self.dtype), requires_grad=False ) self.min_tensor = Parameter( torch.tensor([-1e20]).type(self.dtype), requires_grad=False ) self.max_tensor = Parameter( torch.tensor([1e20]).type(self.dtype), requires_grad=False ) self.epsilon_tensor = Parameter( torch.tensor([EPS]).type(self.dtype), requires_grad=False ) feature_starts = self._get_type_boundaries() for i, feature_type in enumerate(FEATURE_TYPES): begin_index = feature_starts[i] if (i + 1) == len(FEATURE_TYPES): end_index = len(self.normalization_parameters) else: end_index = feature_starts[i + 1] if begin_index == end_index: continue # No features of this type if feature_type == ENUM: # Process one-at-a-time for j in range(begin_index, end_index): norm_params = self.normalization_parameters[self.sorted_features[j]] func = getattr(self, "_create_parameters_" + feature_type) func(j, norm_params) else: norm_params = [] for f in self.sorted_features[begin_index:end_index]: norm_params.append(self.normalization_parameters[f]) func = getattr(self, "_create_parameters_" + feature_type) func(begin_index, norm_params) def input_prototype(self): return rlt.FeatureVector( float_features=torch.randn(1, len(self.normalization_parameters)) ) def forward(self, input) -> torch.FloatTensor: """ Preprocess the input matrix :param input tensor """ if isinstance(input, np.ndarray): input = torch.from_numpy(input).type(self.dtype) if isinstance(input, rlt.FeatureVector): input = input.float_features.type(self.dtype) # ONNX doesn't support != yet not_missing_input = ( self.one_tensor.float() - (input == self.missing_tensor).float() ) feature_starts = self._get_type_boundaries() outputs = [] for i, feature_type in enumerate(FEATURE_TYPES): begin_index = feature_starts[i] if (i + 1) == len(FEATURE_TYPES): end_index = len(self.normalization_parameters) else: end_index = feature_starts[i + 1] if begin_index == end_index: continue # No features of this type if feature_type == ENUM: # Process one-at-a-time for j in range(begin_index, end_index): norm_params = self.normalization_parameters[self.sorted_features[j]] new_output = self._preprocess_feature_single_column( j, 
input[:, j : j + 1], norm_params ) new_output *= not_missing_input[:, j : j + 1] self._check_preprocessing_output(new_output, [norm_params]) outputs.append(new_output) else: norm_params = [] for f in self.sorted_features[begin_index:end_index]: norm_params.append(self.normalization_parameters[f]) new_output = self._preprocess_feature_multi_column( begin_index, input[:, begin_index:end_index], norm_params ) new_output *= not_missing_input[:, begin_index:end_index] self._check_preprocessing_output(new_output, norm_params) outputs.append(new_output) def wrap(output): if self.typed_output: return rlt.FeatureVector(float_features=output) else: return output if len(outputs) == 1: return wrap(torch.clamp(outputs[0], MIN_FEATURE_VALUE, MAX_FEATURE_VALUE)) return wrap( torch.clamp(torch.cat(outputs, dim=1), MIN_FEATURE_VALUE, MAX_FEATURE_VALUE) ) def _preprocess_feature_single_column( self, begin_index: int, input: torch.Tensor, norm_params: NormalizationParameters, ) -> torch.Tensor: if isinstance(input, np.ndarray): input = torch.from_numpy(input).type(self.dtype) feature_type = norm_params.feature_type func = getattr(self, "_preprocess_" + feature_type) return func(begin_index, input, norm_params) def _preprocess_feature_multi_column( self, begin_index: int, input: torch.Tensor, norm_params: List[NormalizationParameters], ) -> torch.Tensor: if isinstance(input, np.ndarray): input = torch.from_numpy(input).type(self.dtype) feature_type = norm_params[0].feature_type func = getattr(self, "_preprocess_" + feature_type) return func(begin_index, input, norm_params) def _create_parameters_BINARY( self, begin_index: int, norm_params: List[NormalizationParameters] ): pass def _preprocess_BINARY( self, begin_index: int, input: torch.Tensor, norm_params: List[NormalizationParameters], ) -> torch.Tensor: # ONNX doesn't support != yet return self.one_tensor - (input == self.zero_tensor).float() def _create_parameters_PROBABILITY( self, begin_index: int, norm_params: List[NormalizationParameters] ): pass def _preprocess_PROBABILITY( self, begin_index: int, input: torch.Tensor, norm_params: List[NormalizationParameters], ) -> torch.Tensor: clamped_input = torch.clamp(input, 0.01, 0.99) return self.negative_one_tensor * ( ((self.one_tensor / clamped_input) - self.one_tensor).log() ) def _create_parameters_CONTINUOUS_ACTION( self, begin_index: int, norm_params: List[NormalizationParameters] ): self._create_parameter( begin_index, "min_serving_value", torch.Tensor([p.min_value for p in norm_params]).type(self.dtype), ) self._create_parameter( begin_index, "min_training_value", torch.ones(len(norm_params)).type(self.dtype) * -1 + EPS, ) self._create_parameter( begin_index, "scaling_factor", (torch.ones(len(norm_params)).type(self.dtype) - EPS) * 2 / torch.tensor([p.max_value - p.min_value for p in norm_params]).type( self.dtype ), ) def _preprocess_CONTINUOUS_ACTION( self, begin_index: int, input: torch.Tensor, norm_params: List[NormalizationParameters], ) -> torch.Tensor: min_serving_value = self._fetch_parameter(begin_index, "min_serving_value") min_training_value = self._fetch_parameter(begin_index, "min_training_value") scaling_factor = self._fetch_parameter(begin_index, "scaling_factor") continuous_action = ( input - min_serving_value ) * scaling_factor + min_training_value return torch.clamp(continuous_action, -1 + EPS, 1 - EPS) def _create_parameters_CONTINUOUS( self, begin_index: int, norm_params: List[NormalizationParameters] ): self._create_parameter( begin_index, "means", torch.Tensor([p.mean for p in 
norm_params]).type(self.dtype), ) self._create_parameter( begin_index, "stddevs", torch.Tensor([p.stddev for p in norm_params]).type(self.dtype), ) def _preprocess_CONTINUOUS( self, begin_index: int, input: torch.Tensor, norm_params: List[NormalizationParameters], ) -> torch.Tensor: means = self._fetch_parameter(begin_index, "means") stddevs = self._fetch_parameter(begin_index, "stddevs") continuous_output = (input - means) / stddevs return torch.clamp(continuous_output, MIN_FEATURE_VALUE, MAX_FEATURE_VALUE) def _create_parameters_BOXCOX( self, begin_index: int, norm_params: List[NormalizationParameters] ): self._create_parameter( begin_index, "shifts", torch.Tensor([p.boxcox_shift for p in norm_params]).type(self.dtype), ) for p in norm_params: assert ( abs(p.boxcox_lambda) > 1e-6 ), "Invalid value for boxcox lambda: " + str(p.boxcox_lambda) self._create_parameter( begin_index, "lambdas", torch.Tensor([p.boxcox_lambda for p in norm_params]).type(self.dtype), ) self._create_parameters_CONTINUOUS(begin_index, norm_params) def _preprocess_BOXCOX( self, begin_index: int, input: torch.Tensor, norm_params: List[NormalizationParameters], ) -> torch.Tensor: shifts = self._fetch_parameter(begin_index, "shifts") lambdas = self._fetch_parameter(begin_index, "lambdas") boxcox_output = ( # We can replace this with a normal pow() call after D8528654 lands self._manual_broadcast_matrix_scalar( torch.clamp( input + shifts, 1e-6 ), # Clamp is necessary to prevent MISSING_VALUE from going to NaN lambdas, torch.pow, ) - self.one_tensor ) / lambdas return self._preprocess_CONTINUOUS(begin_index, boxcox_output, norm_params) def _create_parameters_QUANTILE( self, begin_index: int, norm_params: List[NormalizationParameters] ): F = len(norm_params) num_quantiles = torch.tensor( [[float(len(p.quantiles)) - 1 for p in norm_params]] ).type(self.dtype) self._create_parameter(begin_index, "num_quantiles", num_quantiles) max_num_quantile_boundaries = int( torch.max(torch.tensor([len(p.quantiles) for p in norm_params])) ) B = max_num_quantile_boundaries # The quantile boundaries is a FxB matrix where B is the max # of boundaries # We take advantage of the fact that if the value is >= the max # quantile boundary it automatically gets a 1.0 to repeat the max quantile # so that we guarantee a square matrix. 
# We project the quantiles boundaries to 3d and create a 1xFxB tensor quantile_boundaries = torch.zeros( [1, len(norm_params), max_num_quantile_boundaries] ).type(self.dtype) max_quantile_boundaries = torch.zeros([1, len(norm_params)]).type(self.dtype) min_quantile_boundaries = torch.zeros([1, len(norm_params)]).type(self.dtype) for i, p in enumerate(norm_params): quantile_boundaries[0, i, :] = p.quantiles[-1] quantile_boundaries[0, i, 0 : len(p.quantiles)] = torch.tensor( p.quantiles ).type(self.dtype) max_quantile_boundaries[0, i] = max(p.quantiles) min_quantile_boundaries[0, i] = min(p.quantiles) quantile_boundaries = quantile_boundaries.type(self.dtype) max_quantile_boundaries = max_quantile_boundaries.type(self.dtype) min_quantile_boundaries = min_quantile_boundaries.type(self.dtype) self._create_parameter(begin_index, "quantile_boundaries", quantile_boundaries) self._create_parameter( begin_index, "max_quantile_boundaries", max_quantile_boundaries ) self._create_parameter( begin_index, "min_quantile_boundaries", min_quantile_boundaries ) self._create_parameter( begin_index, "quantile_boundary_mask", torch.ones([1, F, B]).type(self.dtype), ) def _preprocess_QUANTILE( self, begin_index: int, input: torch.Tensor, norm_params: List[NormalizationParameters], ) -> torch.Tensor: """ Replace the value with it's percentile in the range [0,1]. This preprocesses several features in a single step by putting the quantile boundaries in the third dimension and broadcasting. The input is a JxF matrix where J is the batch size and F is the # of features. """ # The number of quantiles is a 1xF matrix num_quantiles = self._fetch_parameter(begin_index, "num_quantiles") quantile_boundaries = self._fetch_parameter(begin_index, "quantile_boundaries") max_quantile_boundaries = self._fetch_parameter( begin_index, "max_quantile_boundaries" ) min_quantile_boundaries = self._fetch_parameter( begin_index, "min_quantile_boundaries" ) # Add a third dimension and repeat to create a JxFxB matrix, where the # inputs are repeated B times in the third dimension. We need to # do this because we can't broadcast both operands in different # dimensions in the same operation. # repeat doesn't work yet, so * by a mask mask = self._fetch_parameter(begin_index, "quantile_boundary_mask") expanded_inputs = input.unsqueeze(2) * mask input_greater_than_or_equal_to = ( expanded_inputs >= quantile_boundaries ).float() input_less_than = (expanded_inputs < quantile_boundaries).float() set_to_max = (input >= max_quantile_boundaries).float() set_to_min = (input <= min_quantile_boundaries).float() min_or_max = (set_to_min + set_to_max).float() interpolate = (min_or_max < self.one_hundredth_tensor).float() interpolate_left, _ = torch.max( (input_greater_than_or_equal_to * quantile_boundaries) + (input_less_than * self.min_tensor), dim=2, ) interpolate_right, _ = torch.min( (input_less_than * quantile_boundaries) + (input_greater_than_or_equal_to * self.max_tensor), dim=2, ) # This assumes that we need to interpolate and computes the value. # If we don't need to interpolate, this will be some bogus value, but it # will be multiplied by 0 so no big deal. 
left_start = torch.sum(input_greater_than_or_equal_to, dim=2) - self.one_tensor interpolated_values = ( ( left_start + ( (input - interpolate_left) / ( (interpolate_right + self.epsilon_tensor) - interpolate_left ) # Add a small amount to interpolate_right to avoid div-0 ) ) / num_quantiles ).float() return set_to_max + (interpolate * interpolated_values).float() def _create_parameters_ENUM( self, begin_index: int, norm_params: NormalizationParameters ): self._create_parameter( begin_index, "enum_values", torch.Tensor(norm_params.possible_values).unsqueeze(0).type(self.dtype), ) def _preprocess_ENUM( self, begin_index: int, input: torch.Tensor, norm_params: NormalizationParameters, ) -> torch.Tensor: enum_values = self._fetch_parameter(begin_index, "enum_values") return (input == enum_values).float() def _sort_features_by_normalization(self): """ Helper function to return a sorted list from a normalization map. Also returns the starting index for each feature type""" # Sort features by feature type sorted_features = [] feature_starts = [] assert isinstance( list(self.normalization_parameters.keys())[0], int ), "Normalization Parameters need to be int" for feature_type in FEATURE_TYPES: feature_starts.append(len(sorted_features)) for feature in sorted(self.normalization_parameters.keys()): norm = self.normalization_parameters[feature] if norm.feature_type == feature_type: sorted_features.append(feature) return sorted_features, feature_starts def _get_type_boundaries(self) -> List[int]: feature_starts = [] on_feature_type = -1 for i, feature in enumerate(self.sorted_features): feature_type = self.normalization_parameters[feature].feature_type feature_type_index = FEATURE_TYPES.index(feature_type) assert ( feature_type_index >= on_feature_type ), "Features are not sorted by feature type!" while feature_type_index > on_feature_type: feature_starts.append(i) on_feature_type += 1 while on_feature_type < len(FEATURE_TYPES): feature_starts.append(len(self.sorted_features)) on_feature_type += 1 return feature_starts def _create_parameter( self, begin_index: int, name: str, t: torch.Tensor ) -> Parameter: p = Parameter(t, requires_grad=False) setattr(self, "_auto_parameter_" + str(begin_index) + "_" + name, p) return p def _fetch_parameter(self, begin_index: int, name: str) -> Parameter: return getattr(self, "_auto_parameter_" + str(begin_index) + "_" + name) def _manual_broadcast_matrix_scalar( self, t1: torch.Tensor, s1: torch.Tensor, fn ) -> torch.Tensor: # Some ONNX ops don't support broadcasting so we need to do some matrix magic return fn(t1, (t1 * self.zero_tensor) + s1).float() def _manual_broadcast_column_vec_row_vec( self, t1: torch.Tensor, t2: torch.Tensor, fn ) -> torch.Tensor: # Some ONNX ops don't support broadcasting so we need to do some matrix magic t2_ones = t2 / t2 t1_mask = t1.mm(t2_ones) return fn(t1_mask, t2).float() def _check_preprocessing_output(self, batch, norm_params): """ Check that preprocessed features fall within range of valid output. 
:param batch: torch tensor :param norm_params: list of normalization parameters """ feature_type = norm_params[0].feature_type min_value, max_value = batch.min(), batch.max() if feature_type == "CONTINUOUS": # Continuous features may be in range (-inf, inf) pass elif bool(max_value > MAX_FEATURE_VALUE): raise Exception( "A {} feature type has max value {} which is > than accepted post pre-processing max of {}".format( feature_type, max_value, MAX_FEATURE_VALUE ) ) elif bool(min_value < MIN_FEATURE_VALUE): raise Exception( "A {} feature type has min value {} which is < accepted post pre-processing min of {}".format( feature_type, min_value, MIN_FEATURE_VALUE ) )
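# Standalone sketch of the interpolation performed by _preprocess_QUANTILE above: a raw
# value is mapped to its percentile between the two surrounding quantile boundaries.
# Toy, hypothetical numbers; this is not the Preprocessor's API.
import torch

quantiles = torch.tensor([0.0, 1.0, 4.0, 9.0])  # boundaries for one feature
x = torch.tensor(2.5)                           # raw feature value

ge = x >= quantiles                             # boundaries the value has reached
k = int(ge.sum()) - 1                           # index of the boundary just below x
left, right = quantiles[k], quantiles[k + 1]
num_quantiles = len(quantiles) - 1
percentile = (k + (x - left) / (right - left)) / num_quantiles
# percentile == 0.5: x sits halfway between the 2nd and 3rd boundaries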
class SelfAttnRNN(nn.Module): def __init__(self, args, data): super().__init__() self.n_input = 1 self.m = data.m self.w = args.window self.hid = args.n_hidden self.rnn_cell = nn.RNNCell(input_size=self.n_input, hidden_size=self.hid) self.V = Parameter(torch.Tensor(self.hid, 1)) self.Wx = Parameter(torch.Tensor(self.hid, self.n_input)) self.Wtlt = Parameter(torch.Tensor(self.hid, self.hid)) self.Wh = Parameter(torch.Tensor(self.hid, self.hid)) self.init_weights() self.out = nn.Linear(self.hid, 1) def init_weights(self): for p in self.parameters(): if p.data.ndimension() >= 2: nn.init.xavier_uniform_( p.data) # xavier_normal xavier_uniform_ else: # nn.init.zeros_(p.data) stdv = 1. / math.sqrt(p.size(0)) p.data.uniform_(-stdv, stdv) def forward(self, x): ''' Args: x: (batch, time_step, m) Returns: (batch, m) ''' b, w, m = x.size() x = x.permute(0, 2, 1).contiguous().view( x.size(0) * x.size(2), x.size(1), self.n_input) # x, 20, 1 Htlt = [] H = [] device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") for step in range(self.w): # forloop each history step x_tp1 = x[:, step, :] # [x, 1] if step == 0: hx = torch.zeros(b * m, self.hid).to(device) H.append(hx) h_tlt = torch.zeros(b * m, self.hid).to(device) else: h_tlt = Htlt[-1] h_his = torch.stack(H, dim=1) if step > 0: x_tp1_rp = x_tp1.repeat(1, step + 1).view(b * m, step + 1, -1) h_tlt_rp = h_tlt.repeat(1, step + 1).view(b * m, step + 1, -1) else: x_tp1_rp = x_tp1 h_tlt_rp = h_tlt q1 = x_tp1_rp @ self.Wx.t() # [x, 20] q2 = h_tlt_rp @ self.Wtlt.t() # [x, 20] q3 = h_his @ self.Wh.t() # [x, 20] a = torch.tanh(q1 + q2 + q3) @ self.V # [x, 1] a = torch.softmax(a, dim=-1) h_tlt_t = h_his * a h_tlt_t = torch.sum(h_tlt_t, dim=1) Htlt.append(h_tlt_t) hx = self.rnn_cell(x_tp1, h_tlt_t) # [x, 20] H.append(hx) h = H[-1] out = self.out(h) out = out.view(b, m) return out, None
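# Hypothetical smoke test for SelfAttnRNN above, using SimpleNamespace stand-ins for the
# args/data objects it expects (window and n_hidden on args, m on data are assumptions
# about the minimal attributes used here).
import torch
from types import SimpleNamespace

args = SimpleNamespace(window=20, n_hidden=32)
data = SimpleNamespace(m=5)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = SelfAttnRNN(args, data).to(device)
x = torch.randn(8, 20, 5, device=device)  # (batch, time_step, m)

out, _ = model(x)
print(out.shape)                          # torch.Size([8, 5])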
class GPLVM(Parameterized): """ Gaussian Process Latent Variable Model (GPLVM) model. GPLVM is a Gaussian Process model with its train input data is a latent variable. This model is useful for dimensional reduction of high dimensional data. Assume the mapping from low dimensional latent variable to is a Gaussian Process instance. Then the high dimensional data will play the role of train output ``y`` and our target is to learn latent inputs which best explain ``y``. For the purpose of dimensional reduction, latent inputs should have lower dimensions than ``y``. We follows reference [1] to put a unit Gaussian prior to the input and approximate its posterior by a multivariate normal distribution with two variational parameters: ``X_loc`` and ``X_scale_tril``. For example, we can do dimensional reduction on Iris dataset as follows: >>> # With y as the 2D Iris data of shape 150x4 and we want to reduce its dimension >>> # to a tensor X of shape 150x2, we will use GPLVM. .. doctest:: :hide: >>> # Simulating iris data. >>> y = torch.stack([dist.Normal(4.8, 0.1).sample((150,)), ... dist.Normal(3.2, 0.3).sample((150,)), ... dist.Normal(1.5, 0.4).sample((150,)), ... dist.Exponential(0.5).sample((150,))]) >>> # First, define the initial values for X_loc parameter: >>> X_loc = torch.zeros(150, 2) >>> # Then, define a Gaussian Process model with input X_loc and output y: >>> kernel = gp.kernels.RBF(input_dim=2, lengthscale=torch.ones(2)) >>> Xu = torch.zeros(20, 2) # initial inducing inputs of sparse model >>> gpmodel = gp.models.SparseGPRegression(X_loc, y, kernel, Xu) >>> # Finally, wrap gpmodel by GPLVM, optimize, and get the "learned" mean of X: >>> gplvm = gp.models.GPLVM(gpmodel) >>> gplvm.optimize() # doctest: +SKIP >>> X = gplvm.get_param("X_loc") Reference: [1] Bayesian Gaussian Process Latent Variable Model Michalis K. Titsias, Neil D. Lawrence :param ~pyro.contrib.gp.models.model.GPModel base_model: A Pyro Gaussian Process model object. Note that ``base_model.X`` will be the initial value for the variational parameter ``X_loc``. :param str name: Name of this model. 
""" def __init__(self, base_model, name="GPLVM"): super(GPLVM, self).__init__(name) if base_model.X.dim() != 2: raise ValueError("GPLVM model only works with 2D latent X, but got " "X.dim() = {}.".format(base_model.X.dim())) self.base_model = base_model self.y = self.base_model.y self.X_loc = Parameter(self.base_model.X) C = self.X_loc.shape[1] X_scale_tril_shape = self.X_loc.shape + (C,) Id = torch.eye(C, out=self.X_loc.new_empty(C, C)) X_scale_tril = Id.expand(X_scale_tril_shape) self.X_scale_tril = Parameter(X_scale_tril) self.set_constraint("X_scale_tril", constraints.lower_cholesky) self._call_base_model_guide = True def model(self): self.set_mode("model", recursive=False) # sample X from unit multivariate normal distribution zero_loc = self.X_loc.new_zeros(self.X_loc.shape) C = self.X_loc.shape[1] Id = torch.eye(C, out=self.X_loc.new_empty(C, C)) X_name = param_with_module_name(self.name, "X") X = pyro.sample(X_name, dist.MultivariateNormal(zero_loc, scale_tril=Id) .independent(zero_loc.dim()-1)) self.base_model.set_data(X, self.y) self.base_model.model() def guide(self): self.set_mode("guide", recursive=False) # sample X from variational multivariate normal distribution X_loc = self.get_param("X_loc") X_scale_tril = self.get_param("X_scale_tril") X_name = param_with_module_name(self.name, "X") X = pyro.sample(X_name, dist.MultivariateNormal(X_loc, scale_tril=X_scale_tril) .independent(X_loc.dim()-1)) self.base_model.set_data(X, self.y) if self._call_base_model_guide: self.base_model.guide() def forward(self, **kwargs): """ Forward method has the same signal as its ``base_model``. Note that the train input data of ``base_model`` is sampled from GPLVM. """ # avoid calling base_model's guide two times self._call_base_model_guide = False self.guide() self._call_base_model_guide = True return self.base_model(**kwargs) def optimize(self, optimizer=optim.Adam({}), num_steps=1000): """ A convenient method to optimize parameters for GPLVM model using :class:`~pyro.infer.svi.SVI`. :param ~optim.PyroOptim optimizer: A Pyro optimizer. :param int num_steps: Number of steps to run SVI. :returns: a list of losses during the training procedure :rtype: list """ if not isinstance(optimizer, optim.PyroOptim): raise ValueError("Optimizer should be an instance of " "pyro.optim.PyroOptim class.") svi = infer.SVI(self.model, self.guide, optimizer, loss=infer.Trace_ELBO()) losses = [] for i in range(num_steps): losses.append(svi.step()) return losses
def __init__(self, args, data): super().__init__() self.x_h = 1 self.f_h = data.m self.m = data.m self.d = data.d self.w = args.window self.h = args.horizon self.adj = data.adj self.o_adj = data.orig_adj if args.cuda: self.adj = sparse_mx_to_torch_sparse_tensor( normalize_adj2(data.orig_adj.cpu().numpy())).to_dense().cuda() else: self.adj = sparse_mx_to_torch_sparse_tensor( normalize_adj2(data.orig_adj.cpu().numpy())).to_dense() self.dropout = args.dropout self.n_hidden = args.n_hidden half_hid = int(self.n_hidden / 2) self.V = Parameter(torch.Tensor(half_hid)) self.bv = Parameter(torch.Tensor(1)) self.W1 = Parameter(torch.Tensor(half_hid, self.n_hidden)) self.b1 = Parameter(torch.Tensor(half_hid)) self.W2 = Parameter(torch.Tensor(half_hid, self.n_hidden)) self.act = F.elu self.Wb = Parameter(torch.Tensor(self.m, self.m)) self.wb = Parameter(torch.Tensor(1)) self.k = args.k self.conv = nn.Conv1d(1, self.k, self.w) self.conv_long = nn.Conv1d(1, self.k, self.w - self.k, dilation=2) self.n_spatial = args.hidsp #self.h ####### check equal to k self.conv1 = GraphConvLayer(self.k * 3, self.n_hidden) # self.k self.conv2 = GraphConvLayer(self.n_hidden, self.n_spatial) if args.rnn_model == 'LSTM': self.rnn = nn.LSTM(input_size=self.x_h, hidden_size=self.n_hidden, num_layers=args.n_layer, dropout=args.dropout, batch_first=True, bidirectional=args.bi) elif args.rnn_model == 'GRU': self.rnn = nn.GRU(input_size=self.x_h, hidden_size=self.n_hidden, num_layers=args.n_layer, dropout=args.dropout, batch_first=True, bidirectional=args.bi) elif args.rnn_model == 'RNN': self.rnn = nn.RNN(input_size=self.x_h, hidden_size=self.n_hidden, num_layers=args.n_layer, dropout=args.dropout, batch_first=True, bidirectional=args.bi) else: raise LookupError(' only support LSTM, GRU and RNN') hidden_size = (int(args.bi) + 1) * self.n_hidden # self.n_hidden = hidden_size BIDIRECTIONAL BUG self.out = nn.Linear(hidden_size + self.n_spatial, 1) self.residual_window = 0 self.ratio = 1.0 if (self.residual_window > 0): self.residual_window = min(self.residual_window, args.window) self.residual = nn.Linear(self.residual_window, 1) self.init_weights()
class ConvVDO(ModuleWrapper): def __init__(self, in_channels, out_channels, kernel_size, alpha_shape, stride=1, padding=0, dilation=1, prior='loguni', bias=True): super(ConvVDO, self).__init__() self.in_channels = in_channels self.out_channels = out_channels self.kernel_size = (kernel_size, kernel_size) self.stride = stride self.padding = padding self.dilation = dilation self.alpha_shape = alpha_shape self.groups = 1 self.weight = Parameter(torch.Tensor( out_channels, in_channels, *self.kernel_size)) if bias: self.bias = Parameter(torch.Tensor(1, out_channels, 1, 1)) else: self.register_parameter('bias', None) self.op_bias = lambda input, kernel: F.conv2d(input, kernel, self.bias, self.stride, self.padding, self.dilation, self.groups) self.op_nobias = lambda input, kernel: F.conv2d(input, kernel, None, self.stride, self.padding, self.dilation, self.groups) self.log_alpha = Parameter(torch.Tensor(*alpha_shape)) self.reset_parameters() self.zero_mean = False self.permute_sigma = False self.prior = prior if prior == 'loguni': self.kl_fun = metrics.kl_loguni else: self.kl_fun = metrics.kl_ard def reset_parameters(self): n = self.in_channels for k in self.kernel_size: n *= k stdv = 1. / math.sqrt(n) self.weight.data.uniform_(-stdv, stdv) if self.bias is not None: self.bias.data.uniform_(-stdv, stdv) self.log_alpha.data.fill_(-5.0) def forward(self, x): if self.zero_mean: lrt_mean = self.op_bias(x, 0.0 * self.weight) else: lrt_mean = self.op_bias(x, self.weight) sigma2 = Variable.exp(self.log_alpha) * self.weight * self.weight if self.permute_sigma: sigma2 = sigma2.view(-1)[torch.randperm(self.weight.nelement()).cuda()].view(self.weight.shape) lrt_std = Variable.sqrt(1e-16 + self.op_nobias(x * x, sigma2)) if self.training: eps = Variable(lrt_std.data.new(lrt_std.size()).normal_()) else: eps = 0.0 return lrt_mean + lrt_std * eps def kl_reg(self): return self.weight.nelement() / self.log_alpha.nelement() * metrics.kl_loguni(self.log_alpha) def __repr__(self): s = ('{name}({in_channels}, {out_channels}, kernel_size={kernel_size}' ', stride={stride}') s += ', padding={padding}' s += ', alpha_shape=' + str(self.alpha_shape) s += ', prior=' + self.prior s += ', dilation={dilation}' if self.bias is None: s += ', bias=False' s += ')' return s.format(name=self.__class__.__name__, **self.__dict__)
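# Minimal ELBO-style training sketch for the variational dropout layers above. The
# small network, the input shape, and the beta coefficient are assumptions for this
# example; the per-layer kl_reg() terms come from ConvVDO / LinearVDO as defined in
# this file. If the ModuleWrapper base class already aggregates KL terms, that
# mechanism should be preferred over the manual sum below.
import torch
import torch.nn as nn
import torch.nn.functional as F

class SmallVDONet(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv = ConvVDO(1, 16, 3, alpha_shape=(1, 1), padding=1)
        self.fc = LinearVDO(16 * 28 * 28, 10)

    def forward(self, x):
        x = F.relu(self.conv(x))
        return self.fc(x.flatten(1))

def elbo_loss(model, x, target, beta=1e-3):
    # cross-entropy data term plus the summed KL regularizers of every VDO layer
    logits = model(x)
    kl = sum(m.kl_reg() for m in model.modules() if hasattr(m, 'kl_reg'))
    return F.cross_entropy(logits, target) + beta * kl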
def __init__(self, in_features: int, out_features: int): super(ArcMarginProduct, self).__init__() self.weight = Parameter(torch.FloatTensor(out_features, in_features), requires_grad=True) self.reset_parameters()
def _build_model(self):
    self.linear = nn.utils.weight_norm(
        nn.Linear(self.input_dim, self.out_dim))
    # give the bias a concrete shape; the original passed the torch.Tensor class itself
    self.bias = Parameter(torch.Tensor(self.out_dim))
    self.register_buffer('running_mean', torch.zeros(self.out_dim))
    self.reset_parameter()
def beta(self, value): self._beta = Parameter(free_form(torch.as_tensor(value)))
def __init__(self, model_size, inner_size, dropout=0., variational=False, activation='relu', glu=False, weight_drop=0.0): super().__init__() self.model_size = model_size self.inner_size = inner_size self.dropout = dropout self.bias = True self.variational = variational self.activation = activation self.glu = glu self.weight_drop = weight_drop self.autograd = False if self.activation == 'relu': if self.glu: self.act = nn.ReLU(inplace=True) else: self.act = ReLUDropout(p=self.dropout, variational=self.variational, batch_first=False) elif self.activation == 'gelu': self.act = nn.GELU() elif self.activation == 'agelu': self.act = AGELU() elif self.activation in ['silu', 'swish']: self.act = SiLU() elif self.activation in ['sigmoid']: if self.glu: self.act = nn.functional.glu else: print( "Sigmoid activation function is recommended to be used with -glu" ) raise NotImplementedError self.in_proj_weight = Parameter( torch.Tensor(inner_size * (2 if glu else 1), model_size)) self.out_proj_weight = Parameter(torch.Tensor(model_size, inner_size)) self.in_proj_bias = Parameter( torch.Tensor(inner_size * (2 if glu else 1))) self.out_proj_bias = Parameter(torch.Tensor(model_size)) self.reset_parameters() self.optimized = 2 self.fused = False # At the moment fused mlp is supported for RELU, SiLU, Swish, GELU and AGELU (approximated GELU) if not self.glu and \ self.activation in ['relu', 'silu', 'swish', 'gelu', 'agelu'] and not self.variational: if self.activation == 'relu': from onmt.modules.mlp.mlp import mlp_relu_function if mlp_relu_function is not None: self.fused_function = mlp_relu_function self.fused = True elif self.activation in ['silu', 'swish']: from onmt.modules.mlp.mlp import mlp_silu_function if mlp_silu_function is not None: self.fused_function = mlp_silu_function self.fused = True elif self.activation == 'gelu': from onmt.modules.mlp.mlp import mlp_gelu_function if mlp_gelu_function is not None: self.fused_function = mlp_gelu_function self.fused = True elif self.activation == 'agelu': from onmt.modules.mlp.mlp import mlp_agelu_function if mlp_agelu_function is not None: self.fused_function = mlp_agelu_function self.fused = True
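# Hypothetical sketch of the unfused feed-forward path implied by the parameters
# above, for the plain ReLU/GELU configurations; the repository's actual forward
# (including the fused-MLP and variational-dropout branches, and the F.glu path
# used with the sigmoid activation) is not shown in this fragment.
import torch
import torch.nn.functional as F

def _unfused_ffn(self, x):
    # the first projection produces 2 * inner_size units when GLU gating is enabled
    hidden = F.linear(x, self.in_proj_weight, self.in_proj_bias)
    if self.glu:
        hidden, gate = hidden.chunk(2, dim=-1)
        hidden = self.act(hidden) * torch.sigmoid(gate)  # gated activation
    else:
        hidden = self.act(hidden)  # ReLUDropout already folds dropout in for 'relu'
    return F.linear(hidden, self.out_proj_weight, self.out_proj_bias)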
def __init__(self, basic_conv1=Conv2d_Hori_Veri_Cross, basic_conv2=Conv2d_Diag_Cross, theta=0.8): super(DC_CDN, self).__init__() self.conv1 = nn.Sequential( basic_conv1(3, 64, kernel_size=3, stride=1, padding=1, bias=False, theta=theta), nn.BatchNorm2d(64), nn.ReLU(), ) self.Block1 = nn.Sequential( basic_conv1(64, 128, kernel_size=3, stride=1, padding=1, bias=False, theta=theta), nn.BatchNorm2d(128), nn.ReLU(), basic_conv1(128, 196, kernel_size=3, stride=1, padding=1, bias=False, theta=theta), nn.BatchNorm2d(196), nn.ReLU(), basic_conv1(196, 128, kernel_size=3, stride=1, padding=1, bias=False, theta=theta), nn.BatchNorm2d(128), nn.ReLU(), nn.MaxPool2d(kernel_size=2, stride=2, padding=0), ) self.Block2 = nn.Sequential( basic_conv1(128, 128, kernel_size=3, stride=1, padding=1, bias=False, theta=theta), nn.BatchNorm2d(128), nn.ReLU(), basic_conv1(128, 196, kernel_size=3, stride=1, padding=1, bias=False, theta=theta), nn.BatchNorm2d(196), nn.ReLU(), basic_conv1(196, 128, kernel_size=3, stride=1, padding=1, bias=False, theta=theta), nn.BatchNorm2d(128), nn.ReLU(), nn.MaxPool2d(kernel_size=2, stride=2, padding=0), ) self.Block3 = nn.Sequential( basic_conv1(128, 128, kernel_size=3, stride=1, padding=1, bias=False, theta=theta), nn.BatchNorm2d(128), nn.ReLU(), basic_conv1(128, 196, kernel_size=3, stride=1, padding=1, bias=False, theta=theta), nn.BatchNorm2d(196), nn.ReLU(), basic_conv1(196, 128, kernel_size=3, stride=1, padding=1, bias=False, theta=theta), nn.BatchNorm2d(128), nn.ReLU(), nn.MaxPool2d(kernel_size=2, stride=2, padding=0), ) self.lastconv1 = nn.Sequential( basic_conv1(128 * 3, 128, kernel_size=3, stride=1, padding=1, bias=False, theta=theta), nn.BatchNorm2d(128), nn.ReLU(), ) self.lastconv2 = nn.Sequential( basic_conv1(128, 64, kernel_size=3, stride=1, padding=1, bias=False, theta=theta), nn.BatchNorm2d(64), nn.ReLU(), ) self.lastconv3 = nn.Sequential( #basic_conv1(64, 1, kernel_size=3, stride=1, padding=1, bias=False, theta= theta), nn.Conv2d(128, 1, kernel_size=1, stride=1, padding=0, bias=False), nn.ReLU(), ) # 2nd stream self.conv1_2 = nn.Sequential( basic_conv2(3, 64, kernel_size=3, stride=1, padding=1, bias=False, theta=theta), nn.BatchNorm2d(64), nn.ReLU(), ) self.Block1_2 = nn.Sequential( basic_conv2(64, 128, kernel_size=3, stride=1, padding=1, bias=False, theta=theta), nn.BatchNorm2d(128), nn.ReLU(), basic_conv2(128, 196, kernel_size=3, stride=1, padding=1, bias=False, theta=theta), nn.BatchNorm2d(196), nn.ReLU(), basic_conv2(196, 128, kernel_size=3, stride=1, padding=1, bias=False, theta=theta), nn.BatchNorm2d(128), nn.ReLU(), nn.MaxPool2d(kernel_size=2, stride=2, padding=0), ) self.Block2_2 = nn.Sequential( basic_conv2(128, 128, kernel_size=3, stride=1, padding=1, bias=False, theta=theta), nn.BatchNorm2d(128), nn.ReLU(), basic_conv2(128, 196, kernel_size=3, stride=1, padding=1, bias=False, theta=theta), nn.BatchNorm2d(196), nn.ReLU(), basic_conv2(196, 128, kernel_size=3, stride=1, padding=1, bias=False, theta=theta), nn.BatchNorm2d(128), nn.ReLU(), nn.MaxPool2d(kernel_size=2, stride=2, padding=0), ) self.Block3_2 = nn.Sequential( basic_conv2(128, 128, kernel_size=3, stride=1, padding=1, bias=False, theta=theta), nn.BatchNorm2d(128), nn.ReLU(), basic_conv2(128, 196, kernel_size=3, stride=1, padding=1, bias=False, theta=theta), nn.BatchNorm2d(196), nn.ReLU(), basic_conv2(196, 128, kernel_size=3, stride=1, padding=1, bias=False, theta=theta), nn.BatchNorm2d(128), nn.ReLU(), nn.MaxPool2d(kernel_size=2, stride=2, padding=0), ) self.lastconv1_2 = nn.Sequential( basic_conv2(128 * 
3, 128, kernel_size=3, stride=1, padding=1, bias=False, theta=theta), nn.BatchNorm2d(128), nn.ReLU(), ) self.lastconv2_2 = nn.Sequential( basic_conv2(128, 64, kernel_size=3, stride=1, padding=1, bias=False, theta=theta), nn.BatchNorm2d(64), nn.ReLU(), ) #self.lastconv3_2 = nn.Sequential( # basic_conv2(64, 1, kernel_size=3, stride=1, padding=1, bias=False, theta= theta), # #nn.Conv2d(64, 1, kernel_size=1, stride=1, padding=0, bias=False), # nn.ReLU(), #) #self.HP_branch1 = Parameter(torch.ones([3,1])) self.HP_branch1 = Parameter(torch.zeros([3, 1])) #self.HP_branch2 = Parameter(torch.ones([3,1])) self.HP_branch2 = Parameter(torch.zeros([3, 1])) self.downsample32x32 = nn.Upsample(size=(32, 32), mode='bilinear')
def __init__(self, feat_dim, num_class, margin=0.35, scale=32): super(AM_Softmax, self).__init__() self.weight = Parameter(torch.Tensor(feat_dim, num_class)) self.weight.data.uniform_(-1, 1).renorm_(2, 1, 1e-5).mul_(1e5) self.margin = margin self.scale = scale
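# Hedged sketch of the forward pass that this parameterization typically implies
# (additive-margin softmax): cosine similarities between L2-normalized features and
# class weight columns, with the margin subtracted only at the ground-truth class
# and the result scaled before cross-entropy. The repository's own forward is not
# shown in this fragment; the function name below is illustrative.
import torch
import torch.nn.functional as F

def am_softmax_logits(self, feats, labels):
    w = F.normalize(self.weight, dim=0)      # (feat_dim, num_class), unit-norm columns
    x = F.normalize(feats, dim=1)            # (batch, feat_dim), unit-norm rows
    cos_theta = x @ w                        # (batch, num_class)
    one_hot = F.one_hot(labels, cos_theta.size(1)).to(cos_theta.dtype)
    logits = self.scale * (cos_theta - self.margin * one_hot)
    return logits                            # feed into F.cross_entropy(logits, labels)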
class MultiheadAttention(nn.Module): """Multi-headed attention. See "Attention Is All You Need" for more details. """ def __init__(self, embed_dim, num_heads, attn_dropout=0., bias=True, add_bias_kv=False, add_zero_attn=False): super().__init__() self.embed_dim = embed_dim self.num_heads = num_heads self.attn_dropout = attn_dropout self.head_dim = embed_dim // num_heads assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads" self.scaling = self.head_dim ** -0.5 self.in_proj_weight = Parameter(torch.Tensor(3 * embed_dim, embed_dim)) self.register_parameter('in_proj_bias', None) if bias: self.in_proj_bias = Parameter(torch.Tensor(3 * embed_dim)) self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias) if add_bias_kv: self.bias_k = Parameter(torch.Tensor(1, 1, embed_dim)) self.bias_v = Parameter(torch.Tensor(1, 1, embed_dim)) else: self.bias_k = self.bias_v = None self.add_zero_attn = add_zero_attn self.reset_parameters() def reset_parameters(self): nn.init.xavier_uniform_(self.in_proj_weight) nn.init.xavier_uniform_(self.out_proj.weight) if self.in_proj_bias is not None: nn.init.constant_(self.in_proj_bias, 0.) nn.init.constant_(self.out_proj.bias, 0.) if self.bias_k is not None: nn.init.xavier_normal_(self.bias_k) if self.bias_v is not None: nn.init.xavier_normal_(self.bias_v) def forward(self, query, key, value, attn_mask=None): """Input shape: Time x Batch x Channel Self-attention can be implemented by passing in the same arguments for query, key and value. Timesteps can be masked by supplying a T x T mask in the `attn_mask` argument. Padding elements can be excluded from the key by passing a binary ByteTensor (`key_padding_mask`) with shape: batch x src_len, where padding elements are indicated by 1s. """ import pdb qkv_same = query.data_ptr() == key.data_ptr() == value.data_ptr() kv_same = key.data_ptr() == value.data_ptr() tgt_len, bsz, embed_dim = query.size() assert embed_dim == self.embed_dim assert list(query.size()) == [tgt_len, bsz, embed_dim] assert key.size() == value.size() aved_state = None if qkv_same: # self-attention q, k, v = self.in_proj_qkv(query) elif kv_same: # encoder-decoder attention q = self.in_proj_q(query) if key is None: assert value is None k = v = None else: k, v = self.in_proj_kv(key) else: q = self.in_proj_q(query) k = self.in_proj_k(key) v = self.in_proj_v(value) q *= self.scaling if self.bias_k is not None: assert self.bias_v is not None k = torch.cat([k, self.bias_k.repeat(1, bsz, 1)]) v = torch.cat([v, self.bias_v.repeat(1, bsz, 1)]) if attn_mask is not None: attn_mask = torch.cat([attn_mask, attn_mask.new_zeros(attn_mask.size(0), 1)], dim=1) #q 50 24 30 q = q.contiguous().view(tgt_len, bsz * self.num_heads, self.head_dim).transpose(0, 1) #q 120 50 6 if k is not None: k = k.contiguous().view(-1, bsz * self.num_heads, self.head_dim).transpose(0, 1) if v is not None: v = v.contiguous().view(-1, bsz * self.num_heads, self.head_dim).transpose(0, 1) src_len = k.size(1) if self.add_zero_attn: src_len += 1 k = torch.cat([k, k.new_zeros((k.size(0), 1) + k.size()[2:])], dim=1) v = torch.cat([v, v.new_zeros((v.size(0), 1) + v.size()[2:])], dim=1) if attn_mask is not None: attn_mask = torch.cat([attn_mask, attn_mask.new_zeros(attn_mask.size(0), 1)], dim=1) attn_weights = torch.bmm(q, k.transpose(1, 2)) assert list(attn_weights.size()) == [bsz * self.num_heads, tgt_len, src_len] if attn_mask is not None: try: attn_weights += attn_mask.unsqueeze(0) except: print(attn_weights.shape) print(attn_mask.unsqueeze(0).shape) assert 
False attn_weights = F.softmax(attn_weights.float(), dim=-1).type_as(attn_weights) # attn_weights = F.relu(attn_weights) # attn_weights = attn_weights / torch.max(attn_weights) attn_weights = F.dropout(attn_weights, p=self.attn_dropout, training=self.training) attn = torch.bmm(attn_weights, v) assert list(attn.size()) == [bsz * self.num_heads, tgt_len, self.head_dim] attn = attn.transpose(0, 1).contiguous().view(tgt_len, bsz, embed_dim) attn = self.out_proj(attn) # average attention weights over heads attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) attn_weights = attn_weights.sum(dim=1) / self.num_heads return attn, attn_weights def in_proj_qkv(self, query): return self._in_proj(query).chunk(3, dim=-1) def in_proj_kv(self, key): return self._in_proj(key, start=self.embed_dim).chunk(2, dim=-1) def in_proj_q(self, query, **kwargs): return self._in_proj(query, end=self.embed_dim, **kwargs) def in_proj_k(self, key): return self._in_proj(key, start=self.embed_dim, end=2 * self.embed_dim) def in_proj_v(self, value): return self._in_proj(value, start=2 * self.embed_dim) def _in_proj(self, input, start=0, end=None, **kwargs): weight = kwargs.get('weight', self.in_proj_weight) bias = kwargs.get('bias', self.in_proj_bias) weight = weight[start:end, :] if bias is not None: bias = bias[start:end] return F.linear(input, weight, bias)
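# Quick shape check for the MultiheadAttention module above; tensors follow its
# documented Time x Batch x Channel convention. The sizes are arbitrary examples.
import torch

mha = MultiheadAttention(embed_dim=32, num_heads=4, attn_dropout=0.1)
x = torch.randn(50, 8, 32)            # (time, batch, embed_dim)
out, weights = mha(x, x, x)           # self-attention: same tensor for query, key, value
assert out.shape == (50, 8, 32)
assert weights.shape == (8, 50, 50)   # head-averaged attention weights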
def test_parameter_sharing(self): param = Parameter(torch.arange(1., 26).view(5, 5)) self._test_autograd_sharing(param, is_parameter=True)
def __init__(self, *, inputSize, hiddenSize, train=True, dr=0.5,
             drMethod='gal+sem', gpu=0):
    super(LSTMcell_untied, self).__init__()
    self.inputSize = inputSize
    # store the hidden size itself; the original assigned inputSize here
    self.hiddenSize = hiddenSize
    self.dr = dr

    self.w_xi = Parameter(torch.Tensor(hiddenSize, inputSize))
    self.w_xf = Parameter(torch.Tensor(hiddenSize, inputSize))
    self.w_xo = Parameter(torch.Tensor(hiddenSize, inputSize))
    self.w_xc = Parameter(torch.Tensor(hiddenSize, inputSize))

    self.w_hi = Parameter(torch.Tensor(hiddenSize, hiddenSize))
    self.w_hf = Parameter(torch.Tensor(hiddenSize, hiddenSize))
    self.w_ho = Parameter(torch.Tensor(hiddenSize, hiddenSize))
    self.w_hc = Parameter(torch.Tensor(hiddenSize, hiddenSize))

    self.b_i = Parameter(torch.Tensor(hiddenSize))
    self.b_f = Parameter(torch.Tensor(hiddenSize))
    self.b_o = Parameter(torch.Tensor(hiddenSize))
    self.b_c = Parameter(torch.Tensor(hiddenSize))

    self.drMethod = drMethod.split('+')
    self.gpu = gpu
    self.train = train  # note: this assignment shadows nn.Module.train()
    if gpu >= 0:
        self = self.cuda(gpu)
        self.is_cuda = True
    else:
        self.is_cuda = False
    self.reset_parameters()
def test_cuda_parameter_sharing(self): param = Parameter(torch.arange(1., 26, device='cuda').view(5, 5)) self._test_autograd_sharing(param, mp.get_context('spawn'), is_parameter=True)
def __init__(self): super().__init__() self.mu = Parameter(FloatTensor([0.0])) self.log_variance = Parameter(FloatTensor([0.0]))
def __init__(self, num_layers=50, drop_ratio=0.4, mode='ir_se', stn_mode='resnet'): super(ResnetFaceSTN, self).__init__() assert num_layers in [50, 100, 152] assert mode in ['ir', 'ir_se'] assert stn_mode in ['resnet', 'cbp'] blocks = get_blocks(num_layers) if mode == 'ir': unit_module = bottleneck_IR elif mode == 'ir_se': unit_module = bottleneck_IR_SE if stn_mode == 'cbp': self.localization = Sequential( Conv2d(3, 16, kernel_size=7, padding=1, stride=2, bias=False), BatchNorm2d(16), PReLU(16), Conv2d(16, 32, kernel_size=5, padding=1, stride=2, bias=False), BatchNorm2d(32), PReLU(32), Conv2d(32, 32, kernel_size=5, padding=1, stride=2, bias=False), BatchNorm2d(32), PReLU(32), Conv2d(32, 64, kernel_size=5, padding=1, stride=2, bias=False), BatchNorm2d(64), PReLU(64), Conv2d(64, 64, kernel_size=5, padding=1, stride=1, bias=False), BatchNorm2d(64), PReLU(64) ) self.fc_loc = Sequential( Flatten(), Linear(64 * 4 * 4, 6) ) elif stn_mode == 'resnet': self.localization = Sequential( bottleneck_IR(3, 16, 2), bottleneck_IR(16, 32, 2), bottleneck_IR(32, 32, 2), bottleneck_IR(32, 64, 2), bottleneck_IR(64, 64, 1), torch.nn.AdaptiveAvgPool2d(1) ) self.fc_loc = Sequential( Flatten(), Linear(64 * 1 * 1, 6) ) self.input_layer = Sequential(Conv2d(3, 64, (3, 3), 1, 1, bias=False), BatchNorm2d(64), PReLU(64)) self.output_layer = Sequential(BatchNorm2d(512), Dropout(drop_ratio), Flatten(), Linear(512 * 7 * 7, 512), BatchNorm1d(512)) modules = [] for block in blocks: for bottleneck in block: modules.append( unit_module(bottleneck.in_channel, bottleneck.depth, bottleneck.stride)) self.body = Sequential(*modules) self.fc_loc[1].weight.data.zero_() # WARNING remember to change the bias according to input size # NOTE for img size 128 -> 112 self.fc_loc[1].bias.data.copy_(torch.tensor([1, 0, 0, 0, 1, 0], dtype=torch.float32)) self.warp_param_adder = Parameter(torch.ones(1, 1))
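# Illustration of why fc_loc's bias is initialized to [1, 0, 0, 0, 1, 0]: reshaped
# into a 2x3 affine matrix it is the identity transform, so at initialization the
# spatial transformer leaves the input unchanged. The forward pass of ResnetFaceSTN
# is not shown in this fragment; the snippet below is the standard STN sampling step.
import torch
import torch.nn.functional as F

theta = torch.tensor([1., 0., 0., 0., 1., 0.]).view(1, 2, 3)  # identity affine params
img = torch.randn(1, 3, 112, 112)
grid = F.affine_grid(theta, size=(1, 3, 112, 112), align_corners=False)
warped = F.grid_sample(img, grid, align_corners=False)
assert torch.allclose(warped, img, atol=1e-4)                 # identity warp is a no-op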
def _create_parameter(self, begin_index: int, name: str, t: torch.Tensor) -> Parameter: p = Parameter(t, requires_grad=False) setattr(self, "_auto_parameter_" + str(begin_index) + "_" + name, p) return p
def __init__(self, in_channels, out_channels, kernel_size, stride, padding, dilation, output_padding, groups, p_logvar_init=-3, p_pi=1.0, q_logvar_init=-5): super(_ConvNd, self).__init__() if in_channels % groups != 0: raise ValueError('in_channels must be divisible by groups') if out_channels % groups != 0: raise ValueError('out_channels must be divisible by groups') self.in_channels = in_channels self.out_channels = out_channels self.kernel_size = kernel_size self.stride = stride self.padding = padding self.dilation = dilation self.output_padding = output_padding self.groups = groups # initialize log variance of p and q self.p_logvar_init = p_logvar_init self.q_logvar_init = q_logvar_init # approximate posterior weights... self.qw_mean = Parameter( torch.Tensor(out_channels, in_channels // groups, *kernel_size)) self.qw_logvar = Parameter( torch.Tensor(out_channels, in_channels // groups, *kernel_size)) # optionally add bias # self.qb_mean = Parameter(torch.Tensor(out_channels)) # self.qb_logvar = Parameter(torch.Tensor(out_channels)) # ...and output... self.conv_qw_mean = Parameter( torch.Tensor(out_channels, in_channels // groups, *kernel_size)) self.conv_qw_std = Parameter( torch.Tensor(out_channels, in_channels // groups, *kernel_size)) # ...as normal distributions self.qw = Normal(mu=self.qw_mean, logvar=self.qw_logvar) # self.qb = Normal(mu=self.qb_mean, logvar=self.qb_logvar) self.conv_qw = Normalout(mu=self.conv_qw_mean, si=self.conv_qw_std) # initialise self.log_alpha = Parameter(torch.Tensor(1, 1)) # prior model # (does not have any trainable parameters so we use fixed normal or fixed mixture normal distributions) self.pw = distribution_selector(mu=0.0, logvar=p_logvar_init, pi=p_pi) # self.pb = distribution_selector(mu=0.0, logvar=p_logvar_init, pi=p_pi) # initialize all parameters self.reset_parameters()
def __init__( self, normalization_parameters: Dict[str, NormalizationParameters], use_gpu: bool, typed_output: bool = False, ) -> None: super(Preprocessor, self).__init__() self.normalization_parameters = normalization_parameters self.sorted_features, self.sorted_feature_boundaries = ( self._sort_features_by_normalization() ) self.typed_output = typed_output cuda_available = torch.cuda.is_available() logger.info("CUDA availability: {}".format(cuda_available)) if use_gpu and cuda_available: logger.info("Using GPU: GPU requested and available.") self.use_gpu = True self.dtype = torch.cuda.FloatTensor else: logger.info("NOT Using GPU: GPU not requested or not available.") self.use_gpu = False self.dtype = torch.FloatTensor # NOTE: Because of the way we call AppendNet to squash ONNX to a C2 net, # We need to make tensors for every numeric literal self.zero_tensor = Parameter( torch.tensor([0.0]).type(self.dtype), requires_grad=False ) self.one_tensor = Parameter( torch.tensor([1.0]).type(self.dtype), requires_grad=False ) self.one_half_tensor = Parameter( torch.tensor([0.5]).type(self.dtype), requires_grad=False ) self.one_hundredth_tensor = Parameter( torch.tensor([0.01]).type(self.dtype), requires_grad=False ) self.negative_one_tensor = Parameter( torch.tensor([-1.0]).type(self.dtype), requires_grad=False ) self.missing_tensor = Parameter( torch.tensor([MISSING_VALUE]).type(self.dtype), requires_grad=False ) self.min_tensor = Parameter( torch.tensor([-1e20]).type(self.dtype), requires_grad=False ) self.max_tensor = Parameter( torch.tensor([1e20]).type(self.dtype), requires_grad=False ) self.epsilon_tensor = Parameter( torch.tensor([EPS]).type(self.dtype), requires_grad=False ) feature_starts = self._get_type_boundaries() for i, feature_type in enumerate(FEATURE_TYPES): begin_index = feature_starts[i] if (i + 1) == len(FEATURE_TYPES): end_index = len(self.normalization_parameters) else: end_index = feature_starts[i + 1] if begin_index == end_index: continue # No features of this type if feature_type == ENUM: # Process one-at-a-time for j in range(begin_index, end_index): norm_params = self.normalization_parameters[self.sorted_features[j]] func = getattr(self, "_create_parameters_" + feature_type) func(j, norm_params) else: norm_params = [] for f in self.sorted_features[begin_index:end_index]: norm_params.append(self.normalization_parameters[f]) func = getattr(self, "_create_parameters_" + feature_type) func(begin_index, norm_params)
class VariationalSparseGP(GPModel):
    r"""
    Variational Sparse Gaussian Process model.

    In the :class:`.VariationalGP` model, when the number of input data points
    :math:`X` is large, computing the inverse of the covariance matrix
    :math:`k(X, X)` (needed for the log likelihood and for prediction) becomes
    expensive. This model introduces an additional inducing-input parameter
    :math:`X_u` to solve that problem. Given inputs :math:`X`, their noisy
    observations :math:`y`, and the inducing-input parameters :math:`X_u`, the
    model takes the form:

    .. math::
        [f, u] &\sim \mathcal{GP}(0, k([X, X_u], [X, X_u])),\\
        y & \sim p(y) = p(y \mid f) p(f),

    where :math:`p(y \mid f)` is the likelihood.

    We use a variational approach in this model, approximating the posterior
    :math:`p(f, u \mid y)` by :math:`q(f) = p(f \mid u) q(u)`, where
    :math:`q(u)` is a multivariate normal distribution with two parameters
    ``u_loc`` and ``u_scale_tril``, which are learned during variational
    inference.

    .. note:: This model can also be learned with MCMC, as in reference [2].
        See also :class:`.GPModel`.

    .. note:: This model has :math:`\mathcal{O}(NM^2)` complexity for training
        and :math:`\mathcal{O}(M^3)` complexity for testing. Here, :math:`N` is
        the number of train inputs and :math:`M` is the number of inducing
        inputs. The size of the variational parameters is
        :math:`\mathcal{O}(M^2)`.

    References:

    [1] `Scalable variational Gaussian process classification`,
    James Hensman, Alexander G. de G. Matthews, Zoubin Ghahramani

    [2] `MCMC for Variationally Sparse Gaussian Processes`,
    James Hensman, Alexander G. de G. Matthews, Maurizio Filippone, Zoubin Ghahramani

    :param torch.Tensor X: Input data for training. Its first dimension is the
        number of data points.
    :param torch.Tensor y: Output data for training. Its last dimension is the
        number of data points.
    :param ~pyro.contrib.gp.kernels.kernel.Kernel kernel: A Pyro kernel object,
        which is the covariance function :math:`k`.
    :param torch.Tensor Xu: Initial values for the inducing points, which are
        parameters of our model.
    :param ~pyro.contrib.gp.likelihoods.likelihood.Likelihood likelihood: A
        likelihood object.
    :param callable mean_function: An optional mean function :math:`m` of this
        Gaussian process. By default, we use zero mean.
    :param torch.Size latent_shape: Shape for latent processes (``batch_shape``
        of :math:`q(u)`). By default, it equals the output batch shape
        ``y.shape[:-1]``. For multi-class classification problems,
        ``latent_shape[-1]`` should correspond to the number of classes.
    :param int num_data: The size of the full training dataset. It is useful
        for training this model with mini-batches.
    :param bool whiten: A flag to tell if variational parameters ``u_loc`` and
        ``u_scale_tril`` are transformed by the inverse of ``Luu``, where
        ``Luu`` is the lower triangular decomposition of
        :math:`kernel(X_u, X_u)`. Enabling this flag will help optimization.
    :param float jitter: A small positive term which is added to the diagonal
        part of a covariance matrix to help stabilize its Cholesky
        decomposition.
""" def __init__(self, X, y, kernel, Xu, likelihood, mean_function=None, latent_shape=None, num_data=None, whiten=False, jitter=1e-6): super().__init__(X, y, kernel, mean_function, jitter) self.likelihood = likelihood self.Xu = Parameter(Xu) y_batch_shape = self.y.shape[:-1] if self.y is not None else torch.Size( []) self.latent_shape = latent_shape if latent_shape is not None else y_batch_shape M = self.Xu.size(0) u_loc = self.Xu.new_zeros(self.latent_shape + (M, )) self.u_loc = Parameter(u_loc) identity = eye_like(self.Xu, M) u_scale_tril = identity.repeat(self.latent_shape + (1, 1)) self.u_scale_tril = PyroParam(u_scale_tril, constraints.lower_cholesky) self.num_data = num_data if num_data is not None else self.X.size(0) self.whiten = whiten self._sample_latent = True @pyro_method def model(self): self.set_mode("model") M = self.Xu.size(0) Kuu = self.kernel(self.Xu).contiguous() Kuu.view(-1)[::M + 1] += self.jitter # add jitter to the diagonal Luu = Kuu.cholesky() zero_loc = self.Xu.new_zeros(self.u_loc.shape) if self.whiten: identity = eye_like(self.Xu, M) pyro.sample( self._pyro_get_fullname("u"), dist.MultivariateNormal( zero_loc, scale_tril=identity).to_event(zero_loc.dim() - 1)) else: pyro.sample( self._pyro_get_fullname("u"), dist.MultivariateNormal( zero_loc, scale_tril=Luu).to_event(zero_loc.dim() - 1)) f_loc, f_var = conditional(self.X, self.Xu, self.kernel, self.u_loc, self.u_scale_tril, Luu, full_cov=False, whiten=self.whiten, jitter=self.jitter) f_loc = f_loc + self.mean_function(self.X) if self.y is None: return f_loc, f_var else: # we would like to load likelihood's parameters outside poutine.scale context self.likelihood._load_pyro_samples() with poutine.scale(scale=self.num_data / self.X.size(0)): return self.likelihood(f_loc, f_var, self.y) @pyro_method def guide(self): self.set_mode("guide") self._load_pyro_samples() pyro.sample( self._pyro_get_fullname("u"), dist.MultivariateNormal( self.u_loc, scale_tril=self.u_scale_tril).to_event(self.u_loc.dim() - 1)) def forward(self, Xnew, full_cov=False): r""" Computes the mean and covariance matrix (or variance) of Gaussian Process posterior on a test input data :math:`X_{new}`: .. math:: p(f^* \mid X_{new}, X, y, k, X_u, u_{loc}, u_{scale\_tril}) = \mathcal{N}(loc, cov). .. note:: Variational parameters ``u_loc``, ``u_scale_tril``, the inducing-point parameter ``Xu``, together with kernel's parameters have been learned from a training procedure (MCMC or SVI). :param torch.Tensor Xnew: A input data for testing. Note that ``Xnew.shape[1:]`` must be the same as ``self.X.shape[1:]``. :param bool full_cov: A flag to decide if we want to predict full covariance matrix or just variance. :returns: loc and covariance matrix (or variance) of :math:`p(f^*(X_{new}))` :rtype: tuple(torch.Tensor, torch.Tensor) """ self._check_Xnew_shape(Xnew) self.set_mode("guide") loc, cov = conditional(Xnew, self.Xu, self.kernel, self.u_loc, self.u_scale_tril, full_cov=full_cov, whiten=self.whiten, jitter=self.jitter) return loc + self.mean_function(Xnew), cov
class ChebConvAtt(MessagePassing): r"""The chebyshev spectral graph convolutional operator with attention from the `Attention Based Spatial-Temporal Graph Convolutional Networks for Traffic Flow Forecasting." <https://ojs.aaai.org/index.php/AAAI/article/view/3881>`_ paper :math:`\mathbf{\hat{L}}` denotes the scaled and normalized Laplacian :math:`\frac{2\mathbf{L}}{\lambda_{\max}} - \mathbf{I}`. Args: in_channels (int): Size of each input sample. out_channels (int): Size of each output sample. K (int): Chebyshev filter size :math:`K`. normalization (str, optional): The normalization scheme for the graph Laplacian (default: :obj:`"sym"`): 1. :obj:`None`: No normalization :math:`\mathbf{L} = \mathbf{D} - \mathbf{A}` 2. :obj:`"sym"`: Symmetric normalization :math:`\mathbf{L} = \mathbf{I} - \mathbf{D}^{-1/2} \mathbf{A} \mathbf{D}^{-1/2}` 3. :obj:`"rw"`: Random-walk normalization :math:`\mathbf{L} = \mathbf{I} - \mathbf{D}^{-1} \mathbf{A}` You need to pass :obj:`lambda_max` to the :meth:`forward` method of this operator in case the normalization is non-symmetric. :obj:`\lambda_max` should be a :class:`torch.Tensor` of size :obj:`[num_graphs]` in a mini-batch scenario and a scalar/zero-dimensional tensor when operating on single graphs. You can pre-compute :obj:`lambda_max` via the :class:`torch_geometric.transforms.LaplacianLambdaMax` transform. bias (bool, optional): If set to :obj:`False`, the layer will not learn an additive bias. (default: :obj:`True`) **kwargs (optional): Additional arguments of :class:`torch_geometric.nn.conv.MessagePassing`. """ def __init__(self, in_channels, out_channels, K, normalization=None, bias=True, **kwargs): kwargs.setdefault('aggr', 'add') super(ChebConvAtt, self).__init__(**kwargs) assert K > 0 assert normalization in [None, 'sym', 'rw'], 'Invalid normalization' self.in_channels = in_channels self.out_channels = out_channels self.normalization = normalization self.weight = Parameter(torch.Tensor(K, in_channels, out_channels)) if bias: self.bias = Parameter(torch.Tensor(out_channels)) else: self.register_parameter('bias', None) self.reset_parameters() def reset_parameters(self): glorot(self.weight) zeros(self.bias) def __norm__(self, edge_index, num_nodes: Optional[int], edge_weight: OptTensor, normalization: Optional[str], lambda_max, dtype: Optional[int] = None, batch: OptTensor = None): edge_index, edge_weight = remove_self_loops(edge_index, edge_weight) edge_index, edge_weight = get_laplacian(edge_index, edge_weight, normalization, dtype, num_nodes) if batch is not None and lambda_max.numel() > 1: lambda_max = lambda_max[batch[edge_index[0]]] edge_weight = (2.0 * edge_weight) / lambda_max edge_weight.masked_fill_(edge_weight == float('inf'), 0) edge_index, edge_weight = add_self_loops(edge_index, edge_weight, fill_value=-1., num_nodes=num_nodes) assert edge_weight is not None return edge_index, edge_weight def forward(self, x, edge_index, spatial_attention, edge_weight: OptTensor = None, batch: OptTensor = None, lambda_max: OptTensor = None): """ Making a forward pass of the ChebConvAtt layer. B is the batch size. N_nodes is the number of nodes in the graph. F_in is the dimension of input features (in_channels). F_out is the dimension of input features (out_channels). Arg types: * x (PyTorch Float Tensor) - Node features for T time periods, with shape (B, N_nodes, F_in). * edge_index (Tensor array) - Edge indices. * spatial_attention (PyTorch Float Tensor) - Spatial attention weights, with shape (B, N_nodes, N_nodes). 
* edge_weight (PyTorch Float Tensor, optional) - Edge weights corresponding to edge indices. * batch (PyTorch Tensor, optional) - Batch labels for each edge. * lambda_max (optional, but mandatory if normalization is None) - Largest eigenvalue of Laplacian. Return types: * output (PyTorch Float Tensor) - Hidden state tensor for all nodes, with shape (B, N_nodes, F_out). """ if self.normalization != 'sym' and lambda_max is None: raise ValueError('You need to pass `lambda_max` to `forward() in`' 'case the normalization is non-symmetric.') if lambda_max is None: lambda_max = torch.tensor(2.0, dtype=x.dtype, device=x.device) if not isinstance(lambda_max, torch.Tensor): lambda_max = torch.tensor(lambda_max, dtype=x.dtype, device=x.device) assert lambda_max is not None edge_index, norm = self.__norm__(edge_index, x.size(self.node_dim), edge_weight, self.normalization, lambda_max, dtype=x.dtype, batch=batch) row, col = edge_index Att_norm = norm * spatial_attention[:, row, col] num_nodes = x.size(self.node_dim) TAx_0 = torch.matmul( (torch.eye(num_nodes) * spatial_attention).permute(0, 2, 1), x) out = torch.matmul(TAx_0, self.weight[0]) # L_tilde = torch.sparse_coo_tensor(edge_index,norm,(num_nodes,num_nodes)).to_dense() # propagate_type: (x: Tensor, norm: Tensor) edge_index_transpose = edge_index[[ 1, 0 ]] # transpose according to the paper if self.weight.size(0) > 1: TAx_1 = self.propagate(edge_index_transpose, x=TAx_0, norm=Att_norm, size=None) out = out + torch.matmul(TAx_1, self.weight[1]) for k in range(2, self.weight.size(0)): TAx_2 = self.propagate(edge_index_transpose, x=TAx_1, norm=norm, size=None) TAx_2 = 2. * TAx_2 - TAx_0 out = out + torch.matmul(TAx_2, self.weight[k]) TAx_0, TAx_1 = TAx_1, TAx_2 if self.bias is not None: out += self.bias return out def message(self, x_j, norm): if norm.dim() == 1: return norm.view(-1, 1) * x_j else: d1, d2 = norm.shape return norm.view(d1, d2, 1) * x_j def __repr__(self): return '{}({}, {}, K={}, normalization={})'.format( self.__class__.__name__, self.in_channels, self.out_channels, self.weight.size(0), self.normalization)
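# Shape-check sketch for ChebConvAtt: a small random graph, a batch of 4 node-feature
# tensors, and symmetric Laplacian normalization so that lambda_max is not required.
# All sizes are arbitrary examples.
import torch

conv = ChebConvAtt(in_channels=8, out_channels=16, K=3, normalization='sym')
x = torch.randn(4, 10, 8)                                  # (batch, nodes, features)
edge_index = torch.randint(0, 10, (2, 40))                 # random edges over 10 nodes
spatial_attention = torch.softmax(torch.randn(4, 10, 10), dim=-1)
out = conv(x, edge_index, spatial_attention)
assert out.shape == (4, 10, 16)                            # (batch, nodes, out_channels)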
class AlternatingHighwayLSTM(torch.nn.Module): """ A stacked LSTM with LSTM layers which alternate between going forwards over the sequence and going backwards, with highway connections between each of the alternating layers. This implementation is based on the description in `Deep Semantic Role Labelling - What works and what's next <https://homes.cs.washington.edu/~luheng/files/acl2017_hllz.pdf>`_ . Parameters ---------- input_size : int, required The dimension of the inputs to the LSTM. hidden_size : int, required The dimension of the outputs of the LSTM. num_layers : int, required The number of stacked LSTMs to use. recurrent_dropout_probability: float, optional (default = 0.0) The dropout probability to be used in a dropout scheme as stated in `A Theoretically Grounded Application of Dropout in Recurrent Neural Networks <https://arxiv.org/abs/1512.05287>`_ . Returns ------- output : PackedSequence The outputs of the interleaved LSTMs per timestep. A tensor of shape (batch_size, max_timesteps, hidden_size) where for a given batch element, all outputs past the sequence length for that batch are zero tensors. """ def __init__(self, input_size: int, hidden_size: int, num_layers: int = 1, recurrent_dropout_probability: float = 0) -> None: super(AlternatingHighwayLSTM, self).__init__() self.input_size = input_size self.hidden_size = hidden_size self.num_layers = num_layers self.recurrent_dropout_probability = recurrent_dropout_probability self.training = True # Input dimensions consider the fact that we do # all of the LSTM projections (and highway parts) # in a single matrix multiplication. input_projection_size = 6 * hidden_size state_projection_size = 5 * hidden_size bias_size = 5 * hidden_size # Here we are creating a single weight and bias with the # parameters for all layers unfolded into it. This is necessary # because unpacking and re-packing the weights inside the # kernel would be slow, as it would happen every time it is called. total_weight_size = 0 total_bias_size = 0 for layer in range(num_layers): layer_input_size = input_size if layer == 0 else hidden_size input_weights = input_projection_size * layer_input_size state_weights = state_projection_size * hidden_size total_weight_size += input_weights + state_weights total_bias_size += bias_size self.weight = Parameter(torch.FloatTensor(total_weight_size)) self.bias = Parameter(torch.FloatTensor(total_bias_size)) self.reset_parameters() def reset_parameters(self) -> None: self.bias.data.zero_() weight_index = 0 bias_index = 0 for i in range(self.num_layers): input_size = self.input_size if i == 0 else self.hidden_size # Create a tensor of the right size and initialize it. init_tensor = self.weight.new_zeros(input_size, self.hidden_size * 6) block_orthogonal(init_tensor, [input_size, self.hidden_size]) # Copy it into the flat weight. self.weight.data[weight_index: weight_index + init_tensor.nelement()]\ .view_as(init_tensor).copy_(init_tensor) weight_index += init_tensor.nelement() # Same for the recurrent connection weight. init_tensor = self.weight.new_zeros(self.hidden_size, self.hidden_size * 5) block_orthogonal(init_tensor, [self.hidden_size, self.hidden_size]) self.weight.data[weight_index: weight_index + init_tensor.nelement()]\ .view_as(init_tensor).copy_(init_tensor) weight_index += init_tensor.nelement() # Set the forget bias to 1. 
self.bias.data[bias_index + self.hidden_size:bias_index + 2 * self.hidden_size].fill_(1) bias_index += 5 * self.hidden_size def forward(self, inputs: PackedSequence, # pylint: disable=arguments-differ # pylint: disable=unused-argument initial_state: torch.Tensor = None)-> Tuple[PackedSequence, torch.Tensor]: """ Parameters ---------- inputs : ``PackedSequence``, required. A batch first ``PackedSequence`` to run the stacked LSTM over. initial_state : Tuple[torch.Tensor, torch.Tensor], optional, (default = None) Currently, this is ignored. Returns ------- output_sequence : ``PackedSequence`` The encoded sequence of shape (batch_size, sequence_length, hidden_size) final_states: ``torch.Tensor`` The per-layer final (state, memory) states of the LSTM, each with shape (num_layers, batch_size, hidden_size). """ inputs, lengths = pad_packed_sequence(inputs, batch_first=True) # Kernel takes sequence length first tensors. inputs = inputs.transpose(0, 1) sequence_length, batch_size, _ = inputs.size() accumulator_shape = [self.num_layers, sequence_length + 1, batch_size, self.hidden_size] state_accumulator = inputs.new_zeros(*accumulator_shape) memory_accumulator = inputs.new_zeros(*accumulator_shape) dropout_weights = inputs.new_ones(self.num_layers, batch_size, self.hidden_size) if self.training: # Normalize by 1 - dropout_prob to preserve the output statistics of the layer. dropout_weights.bernoulli_(1 - self.recurrent_dropout_probability)\ .div_((1 - self.recurrent_dropout_probability)) gates = inputs.new_tensor((self.num_layers, sequence_length, batch_size, 6 * self.hidden_size)) lengths_variable = torch.LongTensor(lengths) implementation = _AlternatingHighwayLSTMFunction(self.input_size, self.hidden_size, num_layers=self.num_layers, train=self.training) output, _ = implementation(inputs, self.weight, self.bias, state_accumulator, memory_accumulator, dropout_weights, lengths_variable, gates) # TODO(Mark): Also return the state here by using index_select with the lengths so we can use # it as a Seq2VecEncoder. output = output.transpose(0, 1) output = pack_padded_sequence(output, lengths, batch_first=True) return output, None
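# Shape-oriented usage sketch for AlternatingHighwayLSTM. Its forward pass dispatches
# to a custom CUDA kernel (_AlternatingHighwayLSTMFunction), so this example assumes a
# GPU build of that kernel is available; the sizes and lengths are arbitrary examples.
import torch
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

lstm = AlternatingHighwayLSTM(input_size=16, hidden_size=32, num_layers=3).cuda()
inputs = torch.randn(4, 10, 16).cuda()                     # (batch, time, features)
lengths = [10, 9, 7, 5]                                    # sorted, descending
packed = pack_padded_sequence(inputs, lengths, batch_first=True)
output, _ = lstm(packed)
unpacked, _ = pad_packed_sequence(output, batch_first=True)
assert unpacked.shape == (4, 10, 32)                       # (batch, time, hidden_size)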