def __first_forward(self, x):
    assert self.training
    self._forwardfunc = self.__normal_forward
    # Data-dependent initialization: run one forward pass, measure the output variance,
    # rescale alpha accordingly, then recompute the output with the calibrated scale.
    kernel = self.constval * self.alpha * F.layer_norm(self.weight, self.norm_shape,
                                                       weight=None, bias=None, eps=self.gamma)
    output = F.conv2d(x, kernel, bias=self.bias, padding=self.padding, stride=self.stride)
    outputvar = output.var()
    nn.init.constant_(self.alpha, torch.sqrt(2 / outputvar))
    kernel = self.constval * self.alpha * F.layer_norm(self.weight, self.norm_shape,
                                                       weight=None, bias=None, eps=self.gamma)
    output = F.conv2d(x, kernel, bias=self.bias, padding=self.padding, stride=self.stride)
    if self.activation is not None:
        output = self.activation(output)
    return output
def forward(self, word_input, look_ahead_mask, target_padding_mask, *args):
    """
    :param word_input: (batch, target_seq_len, d_model)
    :param look_ahead_mask: (batch, target_seq_len, )
    :param target_padding_mask: (batch, target_seq_len, )
    :param args:
    :return: output of shape (batch, target_seq_len, d_model) and the self-attention weights
    """
    # (batch, target_seq_len, d_model)
    if target_padding_mask is None and look_ahead_mask is None:
        combined_mask = None
    elif target_padding_mask is not None and look_ahead_mask is not None:
        combined_mask = torch.max(target_padding_mask, look_ahead_mask)
    else:
        combined_mask = target_padding_mask if target_padding_mask is not None else look_ahead_mask

    attn1, attn_weights_block1 = self.mha(word_input, word_input, word_input, mask=combined_mask)
    attn1 = self.dropout1(attn1)
    out1 = F.layer_norm(word_input + attn1, normalized_shape=[attn1.size(-1)])

    ffn_output = self.ffn(out1)
    ffn_output = self.dropout2(ffn_output)
    output = F.layer_norm(out1 + ffn_output, normalized_shape=[ffn_output.size(-1)])
    return output, attn_weights_block1
def prep_vis_pe(bbox_preds, cls_probs):
    """
    Args:
        bbox_preds: raw pre-processed bbox predictions from detector, shape = (batch, detections, 6)
        cls_probs: raw pre-processed class probabilities from detector, shape = (batch, detections, num_classes + 1)

    Returns:
        vis_pe: visual positional embedding (normalized bbox + normalized area + box score),
            shape = (batch, detections, num_classes + 6 + 1)
    """
    batch_size = bbox_preds.shape[0]
    num_detections = bbox_preds.shape[1]
    num_classes = cls_probs.shape[2] - 1

    max_x1s, _ = torch.max(bbox_preds[:, :, 0], dim=1)
    max_x2s, _ = torch.max(bbox_preds[:, :, 2], dim=1)
    max_y1s, _ = torch.max(bbox_preds[:, :, 1], dim=1)
    max_y2s, _ = torch.max(bbox_preds[:, :, 3], dim=1)
    w_ests = torch.max(max_x1s, max_x2s) * 1. + 1e-5
    h_ests = torch.max(max_y1s, max_y2s) * 1. + 1e-5

    bbox_preds[:, :, [0, 2]] = torch.div(bbox_preds[:, :, [0, 2]], w_ests.unsqueeze(1).unsqueeze(2))
    bbox_preds[:, :, [1, 3]] = torch.div(bbox_preds[:, :, [1, 3]], h_ests.unsqueeze(1).unsqueeze(2))
    rel_area = (bbox_preds[:, :, 3] - bbox_preds[:, :, 1]) * (bbox_preds[:, :, 2] - bbox_preds[:, :, 0])
    rel_area.clamp_(0)

    vis_pe = torch.cat((bbox_preds[:, :, :4],
                        rel_area.view(batch_size, num_detections, 1),
                        bbox_preds[:, :, 5:]), dim=-1)
    vis_pe = torch.cat((F.layer_norm(vis_pe, [6]),
                        F.layer_norm(cls_probs, [num_classes + 1])), dim=-1)
    return vis_pe
def get_img_tensors(preds, fc_layer, fc_dim, num_classes, max_detections=100):
    """
    Args:
        preds: predictions from a detectron2 detector, a list of instances
        fc_layer: 0-indexed layer to pull features from (in prior literature FC6 = 0 (first FC layer)
            and FC7 = 1 (second FC layer))
        fc_dim: the dimensionality of the flattened vector (usually 2048)
        num_classes: number of object classes; `probs` has num_classes + 1 columns (including background)
        max_detections: detection limit that both outputs are end-row padded to

    Returns:
        box_features: tensor of box features from the FC layer output, shape = (number of regions, fc_dim)
        vis_pe: visual positional embedding (bbox + area + box score);
            both outputs are end-row padded to the max detection limit
    """
    h, w = preds['instances'].image_size
    fields = preds['instances'].get_fields()
    fc_box_features = fields['fc_box_features'][:, fc_layer * fc_dim:(fc_layer + 1) * fc_dim]
    probs = fields['probs']
    boxes = fields['pred_boxes'].clone()
    num_detections = fc_box_features.shape[0]

    boxes.scale(scale_x=1 / w, scale_y=1 / h)
    areas = boxes.area().unsqueeze(dim=1)
    scores = fields['scores'].unsqueeze(dim=1)
    bbox_areas = torch.cat([boxes.tensor, areas, scores], dim=1)

    # 4 coordinates + 1 bbox area + 1 score; +1 for the background class
    vis_pe = torch.cat((F.layer_norm(bbox_areas, [6]), F.layer_norm(probs, [num_classes + 1])), dim=-1)

    box_features = F.pad(fc_box_features, [0, 0, 0, max_detections - num_detections])
    vis_pe = F.pad(vis_pe, [0, 0, 0, max_detections - num_detections])
    return box_features, vis_pe
def forward(self, x):
    # x = torch.from_numpy(x)
    x = F.layer_norm(x, x.size())
    x = F.leaky_relu(self.fc1(x))
    x = F.layer_norm(x, x.size())
    x = self.fc2(x)
    return x
def get_rcnn(self, img_path):
    if os.path.exists(img_path) and os.path.exists(img_path.replace('.npy', '_cls_prob.npy')):
        img = torch.from_numpy(np.load(img_path))
        img_id = img_path.split('/')[-1].split('.')[0]
        cls_label = torch.from_numpy(np.load(img_path.replace('.npy', '_cls_prob.npy')))
        with h5py.File(self.region_bbox_file, 'r') as region_bbox_f:
            vis_pe = torch.from_numpy(region_bbox_f[img_id][:])

        # lazy normalization of the coordinates...
        w_est = torch.max(vis_pe[:, [0, 2]]) * 1. + 1e-5
        h_est = torch.max(vis_pe[:, [1, 3]]) * 1. + 1e-5
        vis_pe[:, [0, 2]] /= w_est
        vis_pe[:, [1, 3]] /= h_est
        rel_area = (vis_pe[:, 3] - vis_pe[:, 1]) * (vis_pe[:, 2] - vis_pe[:, 0])
        rel_area.clamp_(0)

        vis_pe = torch.cat((vis_pe[:, :4], rel_area.view(-1, 1), vis_pe[:, 5:]), -1)  # confidence score
        normalized_coord = F.normalize(vis_pe.data[:, :5] - 0.5, dim=-1)
        vis_pe = torch.cat((F.layer_norm(vis_pe, [6]),
                            F.layer_norm(cls_label, [1601])), dim=-1)  # 1601 hard-coded...
    else:
        img = torch.randn(100, 2048)
        vis_pe = torch.randn(100, 1601 + 6)
    return img, vis_pe
def forward(self, x):
    y, _ = self.multi_head_attention(x, x, x)
    x = F.layer_norm(x + y, (self.embed_dim, ))
    y = self.feed_forward(x)
    y = self.activation(y)
    return F.layer_norm(x + y, (self.embed_dim, ))
def forward(self, t, adj):
    # adj = dataset.skeleton_
    if self.sequential:  # sequential architecture
        for i in range(self.num_layers):
            t = rearrange(fn.relu(self.spatial_layers[i](t, adj)), 'b n c -> n b c')
            t = rearrange(
                fn.relu(fn.layer_norm(self.temporal_layers[i](t), t.shape[1:]) + t),
                'n b c -> b n c')
    else:  # parallel architecture
        s = t
        t = get_synergy(t)
        # t = self.temporal_in(t.permute(1, 0))
        t = torch.unsqueeze(t.permute(1, 0), 1)
        t = self.temporal_in(rearrange(t, 'b n c -> n b c'))
        # print(t.shape)
        for i in range(self.num_layers):
            s = fn.relu(self.spatial_layers[i](s, adj))
            t = fn.relu(fn.layer_norm(self.temporal_layers[i](t), t.shape[1:]) + t)
        if self.trainable_factor:
            factor = fn.sigmoid(self.spatial_factor).to("cuda")
            # t = factor * rearrange(s, 'b n c -> b (n c)') + (1. - factor) * t
            t = factor * s + (1. - factor) * rearrange(t, 'n b c -> b n c')
        else:
            t = (s + rearrange(t, 'n b c -> b n c')) * 0.5

    # t = self.bottle_neck(t)
    t = rearrange(self.bottle_neck(t), 'b n c -> b (n c)')
    t = self.final_layer(t)
    # return fn.sigmoid(t)
    return t  # dimension (b, n, oc)
def forward(self, enc_demo_seq, enc_demo_key_seq, e_l, e_r, start_ind, end_ind, inputs, timestep=None):
    """Performs multi-layered, multi-headed attention."""
    if self._hp.forced_attention:
        return batchwise_index(enc_demo_seq, timestep[:, 0].long()), None

    # Get (initial) attention key
    if self._hp.one_hot_attn_time_cond and timestep is not None:
        one_hot_timestep = make_one_hot(timestep.long(), self._hp.max_seq_len).float()
    else:
        one_hot_timestep = timestep
    args = [one_hot_timestep] if self._hp.timestep_cond_attention else []
    query = self.query_net(e_l, e_r, *args)

    # Attend
    s_ind, e_ind = (torch.floor(start_ind), torch.ceil(end_ind)) if self._hp.mask_inf_attention \
        else (inputs.start_ind, inputs.end_ind)

    norm_shape_k = query.shape[1:]
    norm_shape_v = enc_demo_seq.shape[2:]
    raw_attn_output, att_weights = None, None
    for attention, predictor in zip(self.attention_layers, self.predictor_layers):
        raw_attn_output, att_weights = attention(query, enc_demo_key_seq, enc_demo_seq, s_ind, e_ind,
                                                 forced_attention_step=timestep if self._hp.forced_attention else None)
        x = F.layer_norm(raw_attn_output, norm_shape_v)
        query = F.layer_norm(predictor(x) + query, norm_shape_k)  # skip connections around attention and predictor

    return apply_linear(self.out, raw_attn_output, dim=1), att_weights  # non-normalized output of the final attention layer
def forward(self, tgt_seq, encoder_outputs, V, return_attns=False):
    dec_init_state = encoder_outputs['dec_init_state']
    encoder_states = encoder_outputs['encoder_states']
    keys = encoder_outputs['encoder_states']
    src_mask = encoder_outputs['src_mask']

    tgt_embed = self.tgt_embed_layer(tgt_seq)
    tgt_embed = self.dropout(tgt_embed)

    dec_states = None
    for l, rnn in enumerate(self.layer_stack):
        if l == 0:
            dec_states, _ = rnn(tgt_embed, dec_init_state)
            dec_states = F.layer_norm(dec_states, (self.d_model, ))
            context_txt, txt_attention = self.attention_txt(dec_states, keys, encoder_states, src_mask)
            context_img, img_attention = self.attention_img(dec_states, V, V)
        else:
            prev_states = dec_states
            dec_input = torch.cat([prev_states, context_txt, context_img], 2)
            dec_states, last_hidden = rnn(dec_input, dec_init_state)
            dec_states = self.dropout(dec_states)
            if l >= 2:
                dec_states = self.residual_scaler * (dec_states + prev_states)
            dec_states = F.layer_norm(dec_states, (self.d_model, ))

    return dec_states, last_hidden.squeeze(0), (txt_attention, img_attention)
def forward(self, x):
    tensor_1, tensor_2 = torch.split(x, int(x.size()[1] / 2), dim=1)
    act_1 = tanh(tensor_1)
    act_2 = sigmoid(tensor_2)
    norm_1 = layer_norm(act_1, act_1.size()[1:])
    norm_2 = layer_norm(act_2, act_2.size()[1:])
    return torch.mul(norm_1, norm_2)
def forward(self, X: torch.FloatTensor, idx: torch.LongTensor) -> torch.FloatTensor:
    """
    Making a forward pass of layer normalization.

    Arg types:
        * **X** (PyTorch Float Tensor) - Input tensor, with shape (batch_size, feature_dim, num_nodes, seq_len).
        * **idx** (PyTorch Long Tensor) - Input indices.

    Return types:
        * **X** (PyTorch Float Tensor) - Output tensor, with shape (batch_size, feature_dim, num_nodes, seq_len).
    """
    if self._elementwise_affine:
        return F.layer_norm(
            X,
            tuple(X.shape[1:]),
            self._weight[:, idx, :],
            self._bias[:, idx, :],
            self._eps,
        )
    else:
        return F.layer_norm(X, tuple(X.shape[1:]), self._weight, self._bias, self._eps)
def LayerNormLSTMScript(training: bool, zoneout_prob: float, input, h0, c0, kernel,
                        recurrent_kernel, bias, gamma, gamma_h, beta_h, zoneout_mask):
    time_steps = input.shape[0]
    batch_size = input.shape[1]
    hidden_size = recurrent_kernel.shape[0]

    h = [h0]
    c = [c0]
    Wx = F.layer_norm(input @ kernel, (hidden_size * 4, ), weight=gamma[0])
    for t in range(time_steps):
        v = F.layer_norm(h[t] @ recurrent_kernel, (hidden_size * 4, ), weight=gamma[1]) + Wx[t] + bias
        i, g, f, o = torch.chunk(v, 4, 1)
        i = torch.sigmoid(i)
        g = torch.tanh(g)
        f = torch.sigmoid(f)
        o = torch.sigmoid(o)
        c.append(f * c[t] + i * g)
        h.append(o * torch.tanh(
            F.layer_norm(c[-1], (hidden_size, ), weight=gamma_h, bias=beta_h)))
        if zoneout_prob:
            if training:
                h[-1] = (h[-1] - h[-2]) * zoneout_mask[t] + h[-2]
            else:
                h[-1] = zoneout_prob * h[-2] + (1 - zoneout_prob) * h[-1]
    h = torch.stack(h)
    c = torch.stack(c)
    return h, c
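# Hedged usage sketch (not part of the original snippet): the shapes below are assumptions
# inferred from the matrix products and chunk sizes in LayerNormLSTMScript above, assuming
# `import torch` and `import torch.nn.functional as F` as in the surrounding snippets.
# input: (time_steps, batch, input_size), kernel: (input_size, 4*hidden),
# recurrent_kernel: (hidden, 4*hidden), bias: (4*hidden,), gamma: (2, 4*hidden),
# gamma_h / beta_h: (hidden,), h0 / c0: (batch, hidden), zoneout_mask: (time_steps, batch, hidden).
def _demo_layer_norm_lstm_script():
    T, B, I, H = 5, 3, 8, 16
    h, c = LayerNormLSTMScript(
        True, 0.1,
        torch.randn(T, B, I), torch.zeros(B, H), torch.zeros(B, H),
        torch.randn(I, 4 * H), torch.randn(H, 4 * H), torch.zeros(4 * H),
        torch.ones(2, 4 * H), torch.ones(H), torch.zeros(H),
        torch.bernoulli(torch.full((T, B, H), 0.9)),
    )
    # h and c each come back as (time_steps + 1, batch, hidden): the initial state plus one entry per step.
    return h, c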
def get_rcnn(self, path):
    img_id = path.split('/')[-1].split('.')[0]
    with h5py.File(self.region_det_file_prefix + '_feat' + img_id[-3:] + '.h5', 'r') as region_feat_f, \
            h5py.File(self.region_det_file_prefix + '_cls' + img_id[-3:] + '.h5', 'r') as region_cls_f, \
            h5py.File(self.region_bbox_file, 'r') as region_bbox_f:
        img = torch.from_numpy(region_feat_f[img_id][:]).float()
        cls_label = torch.from_numpy(region_cls_f[img_id][:]).float()
        vis_pe = torch.from_numpy(region_bbox_f[img_id][:])

    # lazy normalization of the coordinates...
    w_est = torch.max(vis_pe[:, [0, 2]]) * 1. + 1e-5
    h_est = torch.max(vis_pe[:, [1, 3]]) * 1. + 1e-5
    vis_pe[:, [0, 2]] /= w_est
    vis_pe[:, [1, 3]] /= h_est
    rel_area = (vis_pe[:, 3] - vis_pe[:, 1]) * (vis_pe[:, 2] - vis_pe[:, 0])
    rel_area.clamp_(0)

    vis_pe = torch.cat((vis_pe[:, :4], rel_area.view(-1, 1), vis_pe[:, 5:]), -1)  # confidence score
    normalized_coord = F.normalize(vis_pe.data[:, :5] - 0.5, dim=-1)
    vis_pe = torch.cat((F.layer_norm(vis_pe, [6]),
                        F.layer_norm(cls_label, [1601])), dim=-1)  # 1601 hard-coded...
    return img, vis_pe
def forward(self, x):
    x = self.conv_features(x)  # [b, d, h, w]
    b, d, h, w = x.shape
    if self.use_transformer:
        # add positional encodings
        y = torch.stack([
            torch.cat([torch.arange(h).unsqueeze(1)] * w, dim=1),
            torch.cat([torch.arange(w).unsqueeze(0)] * h, dim=0),
        ])  # [2, h, w]
        y = y.view([2, h * w]).transpose(1, 0)  # [h*w, 2]
        y = y.type(torch.float32).to(flair.device)
        y = self.position_features(y).transpose(1, 0).view([d, h, w])  # [h*w, d] => [d, h, w]
        y = y.unsqueeze(dim=0)  # [1, d, h, w]
        x = x + y  # [b, d, h, w] + [1, d, h, w] => [b, d, h, w]

        # reshape the pixels into a sequence
        x = x.view([b, d, h * w])  # [b, d, h*w]

        # layer norm after convolution and positional encodings
        x = F.layer_norm(x.permute([0, 2, 1]), (d, )).permute([0, 2, 1])

        # add <cls> token
        x = torch.cat([x, torch.stack([self.cls_token] * b)], dim=2)  # [b, d, h*w+1]

        # transformer requires input in the shape [h*w+1, b, d]
        x = (
            x.view([b * d, h * w + 1]).transpose(1, 0).view([h * w + 1, b, d])
        )  # [b, d, h*w+1] => [b*d, h*w+1] => [h*w+1, b*d] => [h*w+1, b, d]
        x = self.transformer(x)  # [h*w+1, b, d]

        # the output is the embedding of the <cls> token
        x = x[-1, :, :]  # [b, d]
    else:
        x = x.view([-1, self._feat_dim])
        x = F.layer_norm(x, (self._feat_dim, ))
    return x
def LayerNormGRUScript(training: bool, zoneout_prob: float, input, h0, kernel,
                       recurrent_kernel, bias, recurrent_bias, gamma, zoneout_mask):
    time_steps = input.shape[0]
    batch_size = input.shape[1]
    hidden_size = recurrent_kernel.shape[0]

    h = [h0]
    Wx = F.layer_norm(input @ kernel, (hidden_size * 3, ), weight=gamma[0]) + bias
    for t in range(time_steps):
        Rh = F.layer_norm(h[t] @ recurrent_kernel, (hidden_size * 3, ), weight=gamma[1]) + recurrent_bias
        vx = torch.chunk(Wx[t], 3, 1)
        vh = torch.chunk(Rh, 3, 1)

        z = torch.sigmoid(vx[0] + vh[0])
        r = torch.sigmoid(vx[1] + vh[1])
        g = torch.tanh(vx[2] + r * vh[2])

        h.append(z * h[t] + (1 - z) * g)
        if zoneout_prob:
            if training:
                h[-1] = (h[-1] - h[-2]) * zoneout_mask[t] + h[-2]
            else:
                h[-1] = zoneout_prob * h[-2] + (1 - zoneout_prob) * h[-1]
    h = torch.stack(h)
    return h
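# Hedged usage sketch (not part of the original snippet): assumed shapes, inferred from the
# matrix products and chunk sizes in LayerNormGRUScript above.
# input: (time_steps, batch, input_size), kernel: (input_size, 3*hidden),
# recurrent_kernel: (hidden, 3*hidden), bias / recurrent_bias: (3*hidden,),
# gamma: (2, 3*hidden), h0: (batch, hidden), zoneout_mask: (time_steps, batch, hidden).
def _demo_layer_norm_gru_script():
    T, B, I, H = 5, 3, 8, 16
    h = LayerNormGRUScript(
        True, 0.1,
        torch.randn(T, B, I), torch.zeros(B, H),
        torch.randn(I, 3 * H), torch.randn(H, 3 * H),
        torch.zeros(3 * H), torch.zeros(3 * H), torch.ones(2, 3 * H),
        torch.bernoulli(torch.full((T, B, H), 0.9)),
    )
    return h  # (time_steps + 1, batch, hidden)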
def encode(self, src_seq, src_mask=None):
    src_embed = self.src_embed_layer(src_seq)
    src_embed = self.dropout(src_embed)

    enc_states = src_embed
    for l, rnn in enumerate(self.encoder_rnns):
        prev_states = enc_states
        if src_mask is not None:
            prev_states = pack_padded_sequence(prev_states, lengths=src_mask.sum(1), batch_first=True)
        enc_states, (enc_last_hidden, _) = rnn(prev_states)
        if src_mask is not None:
            enc_states, _ = pad_packed_sequence(enc_states, batch_first=True)
        enc_states = self.dropout(enc_states)
        if l >= 2:
            enc_states = self.residual_scaler * (enc_states + prev_states)
        if self.layer_norm:
            enc_states = F.layer_norm(enc_states, (self._hidden_size * 2, ))

    enc_states = self.project_nn(enc_states)
    if self.layer_norm:
        enc_states = F.layer_norm(enc_states, (self._hidden_size, ))

    encoder_outputs = {
        "encoder_states": enc_states,
        "keys": enc_states,
        "src_mask": src_mask
    }
    return encoder_outputs
def forward(self, src_seq):
    src_embed = self.src_embed_layer(src_seq)
    src_embed = self.dropout(src_embed)

    src_mask = torch.where(src_seq != 0, torch.tensor(1).cuda(), torch.tensor(0).cuda())

    enc_states = src_embed
    for l, rnn in enumerate(self.layer_stack):
        prev_states = enc_states
        prev_states = pack_padded_sequence(prev_states, lengths=src_mask.sum(1),
                                           batch_first=True, enforce_sorted=False)
        enc_states = rnn(prev_states)
        enc_states, _ = pad_packed_sequence(enc_states, batch_first=True)
        enc_states = self.dropout(enc_states)
        if l >= 2:
            prev_states, _ = pad_packed_sequence(prev_states, batch_first=True)
            enc_states = self.residual_scaler * (enc_states + prev_states)
        enc_states = F.layer_norm(enc_states, (self.d_model * 2, ))

    enc_states = self.project_nn(enc_states)
    enc_states = F.layer_norm(enc_states, (self.d_model, ))

    encoder_outputs = {
        "encoder_states": enc_states,
        "keys": enc_states,
        "src_mask": src_mask,
        "dec_init_state": None
    }
    return encoder_outputs
def forward(self, x_t, h_t, c_t, m_t):
    x_concat = self.conv_x(x_t)
    x_concat = f.layer_norm(x_concat, x_concat.size()[1:])
    h_concat = self.conv_h(h_t)
    h_concat = f.layer_norm(h_concat, h_concat.size()[1:])
    m_concat = self.conv_m(m_t)
    m_concat = f.layer_norm(m_concat, m_concat.size()[1:])

    i_x, f_x, g_x, i_x_prime, f_x_prime, g_x_prime, o_x = torch.split(x_concat, self.num_hidden, dim=1)
    i_h, f_h, g_h, o_h = torch.split(h_concat, self.num_hidden, dim=1)
    i_m, f_m, g_m = torch.split(m_concat, self.num_hidden, dim=1)

    i_t = torch.sigmoid(i_x + i_h)
    f_t = torch.sigmoid(f_x + f_h + self._forget_bias)
    g_t = torch.tanh(g_x + g_h)
    c_new = f_t * c_t + i_t * g_t

    i_t_prime = torch.sigmoid(i_x_prime + i_m)
    f_t_prime = torch.sigmoid(f_x_prime + f_m + self._forget_bias)
    g_t_prime = torch.tanh(g_x_prime + g_m)
    m_new = f_t_prime * m_t + i_t_prime * g_t_prime

    mem = torch.cat((c_new, m_new), 1)
    o_t = torch.sigmoid(o_x + o_h + self.conv_o(mem))
    h_new = o_t * torch.tanh(self.conv_last(mem))
    return h_new, c_new, m_new
def forward(self, x):
    x = (x - torch.mean(x)) / torch.var(x)

    x = F.pad(x, (1, 1, 1, 1))
    x = F.relu(self.conv1(x))
    x = F.pad(x, (1, 1, 1, 1))
    x = F.relu(self.conv2(x))
    x = F.layer_norm(x, x.shape)
    x = F.max_pool2d(x, 2, 2)
    x = F.dropout2d(x, 0.3)

    x = F.pad(x, (1, 1, 1, 1))
    x = F.relu(self.conv3(x))
    x = F.pad(x, (1, 1, 1, 1))
    x = F.relu(self.conv4(x))
    x = F.layer_norm(x, x.shape)
    x = F.max_pool2d(x, 2, 2)
    x = F.dropout2d(x, 0.3)

    x = F.pad(x, (1, 1, 1, 1))
    x = F.relu(self.conv5(x))
    x = F.pad(x, (1, 1, 1, 1))
    x = F.relu(self.conv6(x))
    x = F.layer_norm(x, x.shape)
    x = F.max_pool2d(x, 2, 2)
    x = F.dropout2d(x, 0.3)

    x = x.view(-1, 3 * 3 * 256)
    x = self.fc1(x)
    x = F.dropout(x, 0.3)
    x = self.fc2(x)
    x = F.relu(x)
    x = self.fc3(x)
    return F.log_softmax(x, dim=1)
def forward(self, x):
    att = self.attention(x, x, x)
    interm = x + F.layer_norm(att, normalized_shape=att.size()[1:])
    ff = self.feed_forward(interm.float())
    transformed_skip = interm + F.layer_norm(ff, normalized_shape=ff.size()[1:]).double()
    return transformed_skip
def forward(self, inputs):
    qkv = self.produce_qkv(inputs)
    queries, keys, values = qkv.split(self.in_channels, -1)
    attention = self.attention(queries, keys, values)
    outputs = F.layer_norm(attention + inputs, (self.in_channels,))
    outputs = F.layer_norm(self.linear(outputs) + outputs, (self.in_channels,))
    return outputs
def forward(self, x, z, src_mask, trg_mask):
    att_out, _ = self.self_masked_attention(x, x, x, trg_mask)
    middle_out_1 = F.layer_norm(self.dropout(att_out) + x, x.shape)

    att_out_2, _ = self.encoder_attention(middle_out_1, z, z, src_mask)
    middle_out_2 = F.layer_norm(self.dropout(att_out_2) + middle_out_1, middle_out_1.shape)

    fc_out = self.fc(middle_out_2)
    out = F.layer_norm(middle_out_2 + self.dropout(fc_out), middle_out_2.shape)
    return out
def forward(self, x):
    # print(x.size())
    # print('x', x)
    x = F.layer_norm(x, x.size())
    x = F.leaky_relu(self.fc1(x))
    x = F.layer_norm(x, x.size())
    x = torch.sigmoid(self.fc2(x))
    return x
def forward(self, x):
    normalized_shape = x.size()[1:]
    if self.affine:
        return F.layer_norm(x, normalized_shape,
                            self.weight.expand(normalized_shape),
                            self.bias.expand(normalized_shape))
    else:
        return F.layer_norm(x, normalized_shape)
def forward(self, input, idx):
    if self.elementwise_affine:
        return F.layer_norm(input, tuple(input.shape[1:]),
                            self.weight[:, idx, :], self.bias[:, idx, :], self.eps)
    else:
        return F.layer_norm(input, tuple(input.shape[1:]),
                            self.weight, self.bias, self.eps)
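# Hedged note (an assumption, not stated in the snippet above): self.weight and self.bias are
# presumably stored for the full node set (e.g. shape (feature_dim, max_num_nodes, seq_len)),
# so that indexing with idx -- the nodes present in the current batch -- yields affine parameters
# that match input.shape[1:]. A hypothetical call, with `norm` an instance of this module:
#     out = norm(input, idx=torch.arange(input.shape[2]))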
def forward(self, x):
    x = torch.tanh(F.layer_norm(self.conv1(x), (20, 24, 24)))
    x = F.max_pool2d(x, 2, 2)
    x = torch.tanh(F.layer_norm(self.conv2(x), (50, 8, 8)))
    x = F.max_pool2d(x, 2, 2)
    x = x.view(-1, 4 * 4 * 50)
    x = torch.tanh(F.layer_norm(self.fc1(x), (500, )))
    x = F.layer_norm(self.fc2(x), (10, ))
    return F.log_softmax(x, dim=1)
def forward(self, y_s, y_t):
    # standardize student and teacher logits per sample (zero mean, unit variance, no affine
    # parameters, eps=1e-7) before the temperature-scaled KL, then rescale by the multiplier
    y_s = F.layer_norm(y_s, torch.Size((self.n_cls, )), None, None, 1e-7) * self.multiplier
    y_t = F.layer_norm(y_t, torch.Size((self.n_cls, )), None, None, 1e-7) * self.multiplier
    p_s = F.log_softmax(y_s / self.T, dim=1)
    p_t = F.softmax(y_t / self.T, dim=1)
    loss = F.kl_div(p_s, p_t, size_average=False) * (self.T ** 2) / y_s.shape[0]
    return loss
def forward(self, inp, attn_out=None):
    assert inp.size(1) == self.d_model, "Feature dimension does not match!"
    if self.pre_lnorm:
        inp = F.layer_norm(inp.transpose(1, 2), (self.d_model,)).transpose(1, 2)

    relu_out1 = self.drop1(F.relu(self.ff1_net(inp)))
    out2 = self.drop2(self.ff2_net(relu_out1))
    output = out2 + inp

    if not self.pre_lnorm:
        output = F.layer_norm(output.transpose(1, 2), (self.d_model,)).transpose(1, 2)
    return output
def ATTNCell(self, A, hidden):
    attn1 = self.mattn(hidden, hidden, hidden)
    attn1 = F.layer_norm(attn1 + hidden, (self.hidden_size, ))

    attn2 = self.mattn(attn1, hidden, hidden)
    attn2 = F.dropout(attn2, self.dropout, training=self.training)
    attn2 = F.layer_norm(attn1 + attn2, (self.hidden_size, ))

    x = self.feedforward(attn2)
    x = F.layer_norm(attn2 + x, (self.hidden_size, ))
    return x