def forward(self, x, z_input):
    """Cross-attend ``z_input`` over ``x`` and refine the result with an MLP.

    Args:
        x: Context; after ``layer_norm_x`` it is passed to ``self.attention``
            as both the second and third argument.
        z_input: Query input; it is normalized by ``layer_norm_1`` and also
            added back to the output as a residual.

    Returns:
        Output of ``linear3`` plus the untouched ``z_input``.
    """
    context = self.layer_norm_x(x)
    query = self.layer_norm_1(z_input)

    # The attention module returns a pair; only its first element is used.
    attended, _ = self.attention(query, context, context)

    hidden = self.linear1(self.dropout(attended))
    hidden = self.linear2(self.layer_norm_2(hidden))
    hidden = self.dropout(F.gelu(hidden))
    return self.linear3(hidden) + z_input
def forward(self, x):
    """Run the contracting/expanding pyramid and collect one output per level.

    The contracting path halves the length with ``avg_pool1d`` on every level
    after the first and stores a skip tensor per level. The expanding path
    consumes the skips from coarsest to finest, splitting off an output head
    on every level except the last and doubling the length with
    ``F.interpolate`` between levels.

    Args:
        x: Input tensor; it is reflect-padded by one element on each side of
            its last dimension (assumes a (batch, channels, length) layout
            -- TODO confirm).

    Returns:
        List with ``self.n_layers`` tensors, coarsest level first.
    """
    depth = self.n_layers

    # ---- contracting path -------------------------------------------------
    contracted = []
    state = x
    for level in range(depth):
        if level >= 1:
            state = F.avg_pool1d(state, 2, 2)
        feat = F.pad(state, [1, 1], 'reflect')
        feat = self.contract[2 * level](feat)
        feat = self.contract[2 * level + 1](F.gelu(feat))
        contracted.append(feat)
        activated = F.gelu(feat)
        # The first level replaces the running state; later levels accumulate.
        state = activated if level == 0 else state + activated

    # ---- expanding path ---------------------------------------------------
    contracted.reverse()  # coarsest skip first, matching the loop below
    outputs = []
    for level in range(depth):
        feat = F.pad(state, [1, 1], 'reflect')
        feat = F.gelu(self.expand[2 * level](feat))
        feat = self.expand[2 * level + 1](feat + contracted[level])
        if level == depth - 1:
            # Finest level: the whole tensor is the output.
            outputs.append(feat)
            continue
        head_width = self.c_k + self.c_out * 2**(depth - level - 1)
        feat, head = feat.split([self.c_h, head_width], dim=1)
        outputs.append(head)
        state = F.interpolate(state + F.gelu(feat), scale_factor=2)
    return outputs
def forward(self,
            dense_x: torch.Tensor,
            dense_edge_index: torch.Tensor,
            batch: torch.Tensor,
            return_edge: bool = False):
    """Encode a batch of dense graphs by message passing in both directions.

    Args:
        dense_x: Node features, ``[batch_size, graph_size, embed_dim]``.
        dense_edge_index: Per-graph edge endpoints,
            ``[batch_size, graph_size, 2]``, indexed locally per graph.
        batch: Passed through to ``norm_out``, ``pooling_func`` and
            ``edge_extractor``.
        return_edge: When true, also compute per-edge embeddings.

    Returns:
        ``(tour_embeddings, dense_edge_embeddings)``; the second item is
        ``None`` unless ``return_edge`` is set, in which case it has shape
        ``[batch_size, graph_size, -1]``.
    """
    assert dense_x.dim() == dense_edge_index.dim()
    assert dense_x.size(0) == dense_edge_index.size(0)  # batch_size
    assert dense_x.size(1) == dense_edge_index.size(1)  # graph_size
    assert dense_x.size(2) == self.embed_dim
    assert dense_edge_index.size(2) == 2, f"{dense_edge_index.size()}"

    batch_size, graph_size = dense_x.size(0), dense_x.size(1)

    # Merge the batch and node dimensions into one flat node list.
    node_x = dense_x.flatten(0, 1)

    # Shift each graph's local node ids so they address the flat node list.
    offsets = torch.arange(batch_size, device=dense_x.device) * graph_size
    shifted = dense_edge_index + offsets.reshape(batch_size, 1, 1)
    edge_index = shifted.flatten(0, 1).t()  # [2, batch_size * graph_size]
    reversed_edge_index = edge_index.flip(0)

    forward_x = node_x
    backward_x = node_x
    for _ in range(self.num_layers):
        forward_x = checkpoint(self.gnn_layer, forward_x, edge_index)
        backward_x = checkpoint(self.reversed_gnn_layer, backward_x,
                                reversed_edge_index)

    x = self.norm_out(F.gelu(forward_x + backward_x), batch)
    tour_embeddings = self.pooling_func(x, batch)

    if not return_edge:
        return tour_embeddings, None

    edge_embeddings = self.edge_extractor(node_x=node_x,
                                          solution_x=x,
                                          edge_index=edge_index,
                                          batch=batch)
    assert edge_embeddings.dim() == 2
    assert edge_embeddings.size(0) == batch_size * graph_size
    return tour_embeddings, edge_embeddings.reshape(batch_size, graph_size, -1)
def forward(self, encoder_outputs, durations, frames_positions, input_lengths):
    """
    Gaussian upsampling

    PARAMS
    ------
    encoder_outputs: Encoder outputs [B, N, H]
    durations: phoneme durations [B, N]
    frames_positions: Transformer-styled frames_positions [B, T, pos_embed]
    input_lengths: lengths used to build the text mask, or None to skip
        masking

    RETURNS
    -------
    encoder_upsampling_outputs: upsampled encoder outputs concatenated with
        frames_positions on the last dimension [B, T, H + pos_embed]
    """
    B = encoder_outputs.size(0)
    N = encoder_outputs.size(1)
    total_decoder_steps = frames_positions.size(1)

    # Centre of every token: cumulative duration minus half its own duration.
    c = torch.cumsum(durations, dim=1, dtype=torch.float) - 0.5 * durations
    c = c.unsqueeze(2)  # [B, N, 1]

    # Build the frame index grid on the inputs' device instead of forcing
    # CUDA, so the module also runs on CPU and on non-default GPUs.
    t = torch.arange(total_decoder_steps,
                     dtype=torch.float,
                     device=encoder_outputs.device).expand(
                         B, N, total_decoder_steps)  # [B, N, T]

    # calculate range parameters using the duration convs and the recurrent net
    self.range_parameter_layer.flatten_parameters()
    processed_durations = durations.float().unsqueeze(1)  # [B, 1, N]
    for duration_conv in self.duration_convs:
        processed_durations = F.dropout(
            F.gelu(duration_conv(processed_durations)), 0.5, self.training)
    range_parameters, _ = self.range_parameter_layer(
        torch.cat((encoder_outputs, processed_durations.transpose(1, 2)),
                  dim=2))
    var = F.softplus(self.range_dense(range_parameters))

    # Log-density of a Gaussian centred at c with variance var, per frame.
    w_t = -0.5 * (np.log(2.0 * np.pi) + torch.log(var) +
                  torch.pow(t - c, 2) / var)

    if input_lengths is not None:
        input_masks = ~get_mask_from_lengths(input_lengths, N)  # [B, N]
        # Out-of-place fill keeps the operation visible to autograd
        # (the previous `.data.masked_fill_` bypassed it).
        w_t = w_t.masked_fill(input_masks.unsqueeze(2), self.mask_score)

    # Normalise over the token axis so every frame's weights sum to one.
    w_t = F.softmax(w_t, dim=1)

    encoder_upsampling_outputs = torch.bmm(
        w_t.transpose(1, 2), encoder_outputs)  # [B, T, encoder_hidden_size]
    return torch.cat((encoder_upsampling_outputs, frames_positions), dim=2)
def forward(self, inputs, enc_outputs, lookahead_mask, padding_mask):
    """Decoder layer: self-attention, encoder attention, then feed-forward.

    Args:
        inputs: Decoder-side input, used as query, key and value of the
            first attention.
        enc_outputs: Encoder output, used as key and value of the second
            attention.
        lookahead_mask: Mask handed to the first attention.
        padding_mask: Mask handed to the second attention.

    Returns:
        Output of ``norm_3`` applied to the feed-forward result.
    """
    self_attended, _ = self.multi_head_attention_1(inputs, inputs, inputs,
                                                   lookahead_mask)
    hidden = self.norm_1(self.dropout_1(self_attended) + inputs)

    cross_attended, _ = self.multi_head_attention_2(hidden, enc_outputs,
                                                    enc_outputs, padding_mask)
    context = self.norm_2(self.dropout_2(cross_attended) + hidden)

    feed_forward = self.dense_2(F.gelu(self.dense_1(context)))
    # NOTE(review): unlike the two attention sub-layers, no residual is added
    # before norm_3 -- confirm whether that is intentional.
    return self.norm_3(self.dropout_3(feed_forward))
def forward(self, src: torch.FloatTensor,
            src_mask: torch.FloatTensor) -> torch.FloatTensor:
    """Pre-norm encoder layer: self-attention and feed-forward, both residual.

    Args:
        src: Layer input.
        src_mask: Mask forwarded to ``self.self_attn``.

    Returns:
        ``src`` after both residual sub-layers.
    """
    # Attention sub-layer: normalise, attend, add back.
    attended = self.self_attn(self.layer_norm1(src), src_mask)
    src = src + self.dropout1(attended)

    # Feed-forward sub-layer: normalise, expand, activate, project, add back.
    hidden = self.intermediate_linear1(self.layer_norm2(src))
    hidden = self.intermediate_linear2(F.gelu(hidden))
    return src + self.dropout(hidden)
def forward(self, src_seq, src_mask, return_attns=False):
    """Run the encoder stack over an already-embedded sequence.

    Args:
        src_seq: Input sequence; GELU is applied to it before the first layer.
        src_mask: Passed to every layer as ``slf_attn_mask``.
        return_attns: When true, also return each layer's attention output.

    Returns:
        ``(enc_output,)`` as a one-element tuple, or
        ``(enc_output, attention_list)`` when ``return_attns`` is set.
    """
    attention_maps = []

    hidden = F.gelu(src_seq)
    for layer in self.layer_stack:
        hidden, layer_attention = layer(hidden, slf_attn_mask=src_mask)
        if return_attns:
            attention_maps.append(layer_attention)
    hidden = self.layer_norm(hidden)

    if not return_attns:
        # A one-element tuple keeps the return type uniform for callers.
        return (hidden,)
    return hidden, attention_maps
def forward(self, input_tensor, seed, random=True):
    """Chunked feed-forward: process the sequence piece by piece.

    Args:
        input_tensor: ``[batch, length, d_model]``.
        seed: Base seed; in training mode chunk ``i`` is dropped out with
            seed ``seed + i``.
        random: Not read by this method.

    Returns:
        ``[batch, length, d_model]`` tensor assembled from the processed
        chunks.
    """
    # Split along the length axis: each piece is [batch, length // chunk, d_model].
    pieces = torch.chunk(input_tensor, chunks=self.chunk, dim=1)

    # [batch, length // chunk, d_ff]
    hidden = [F.gelu(self.linear1(piece)) for piece in pieces]

    if self.training:
        # torch.chunk never yields more than `self.chunk` pieces, so the
        # enumerate index covers the same range as the chunk counter.
        hidden = [
            deterministic_dropout(piece, seed + offset, dropout=self.dropout)
            for offset, piece in enumerate(hidden)
        ]

    # Project each piece back and stitch the sequence together again.
    projected = [self.linear2(piece) for piece in hidden]
    return torch.cat(projected, dim=1)
def forward(self, lv, ls):
    """Normalise, activate and convolve lattice values (norm -> GELU -> conv).

    Ordering follows the pre-activation scheme of
    https://arxiv.org/pdf/1603.05027.pdf. The normalisation module is
    created lazily on the first call from ``lv.shape[1]``.

    Args:
        lv: Lattice values.
        ls: Lattice structure; kept in sync through ``set_values``.

    Returns:
        ``(lv_1, ls_1)`` as produced by ``self.conv``, with ``ls_1`` updated
        to hold ``lv_1``.
    """
    ls.set_values(lv)

    if self.norm is None:
        self.norm = GroupNormLatticeModule(lv.shape[1])

    values, ls = self.norm(lv, ls)
    values = F.gelu(values)
    if self.with_dropout:
        values = self.drop(values)
    ls.set_values(values)

    conv_values, conv_lattice = self.conv(values, ls)
    conv_lattice.set_values(conv_values)
    return conv_values, conv_lattice
def forward(self, G, out_key_one, out_key_two):
    """Embed every node type, run the graph layers and export two node types.

    Args:
        G: Heterogeneous graph exposing ``ntypes`` and
            ``nodes[ntype].data['id']``.
        out_key_one: First node type to export.
        out_key_two: Second node type to export.

    Returns:
        Two dicts, one per requested node type, mapping each node's ``'id'``
        value to its row of ``self.out`` output.
    """
    # Initial per-type features: embedding lookup, type-specific adapter, GELU.
    h = {
        ntype: F.gelu(self.adapt_ws[self.node_dict[ntype]](
            self.node_emb[ntype](G.nodes[ntype].data['id'])))
        for ntype in G.ntypes
    }

    for layer_idx in range(self.n_layers):
        h = self.gcs[layer_idx](G, h)

    h1_out = self.out(h[out_key_one])
    h2_out = self.out(h[out_key_two])

    def rows_by_node_id(ntype, projected):
        # Key every output row by the id stored at the same position.
        node_ids = G.nodes[ntype].data['id']
        return {
            node_ids[row].item(): projected[row]
            for row in range(projected.shape[0])
        }

    return (rows_by_node_id(out_key_one, h1_out),
            rows_by_node_id(out_key_two, h2_out))
def forward(self, input, indices=None):
    """
    Apply a linear map whose weight is adapted by factors selected through
    ``indices``.

    The base weight (after weight dropout) is optionally scaled by a
    multiplicative low-rank term, passed through ``self.mfw_activation``,
    and then shifted by an additive low-rank term built from ``self.r`` and
    ``self.s``.

    :param input: T x B x H
    :param indices: 1-D index tensor holding exactly one index. Despite the
        ``None`` default this argument is required.
    :return: ``F.linear(input, weight.t(), self.bias)`` with the adapted weight
    :raises ValueError: if ``indices`` is None
    :raises NotImplementedError: if ``indices`` is not a 1-D single-element
        tensor, or ``self.mfw_activation`` is not one of "none", "gelu",
        "silu"
    """
    if indices is None:
        raise ValueError(
            "indices must be provided (a 1-D tensor holding a single index)")

    weight_ = F.dropout(self.weight, p=self.weight_drop, training=self.training)

    # Check the rank first so a 0-dim tensor is rejected cleanly instead of
    # failing inside size(0).
    if indices.dim() != 1 or indices.size(0) != 1:
        raise NotImplementedError(
            "only a 1-D indices tensor with a single element is supported; "
            "got indices of size {} for input of size {}".format(
                tuple(indices.size()), tuple(input.size())))

    r = torch.index_select(self.r, 0, indices).squeeze(0)
    s = torch.index_select(self.s, 0, indices).squeeze(0)

    if self.use_multiplicative:
        rm = torch.index_select(self.rm, 0, indices).squeeze(0)
        sm = torch.index_select(self.sm, 0, indices).squeeze(0)
        # Sum of per-rank outer products, applied element-wise to the weight.
        weight_ = weight_ * torch.sum(
            torch.bmm(rm.unsqueeze(-1), sm.unsqueeze(1)), dim=0)

    if self.mfw_activation == "gelu":
        weight_ = F.gelu(weight_)
    elif self.mfw_activation == "silu":
        weight_ = F.silu(weight_)
    elif self.mfw_activation != "none":
        raise NotImplementedError

    # Additive term: sum of per-rank outer products of r and s.
    weight_mask = torch.sum(torch.bmm(r.unsqueeze(-1), s.unsqueeze(1)), dim=0)
    weight_ = weight_ + weight_mask

    return F.linear(input, weight_.t(), self.bias)
def forward(self, x):
    """Three conv/batch-norm stages plus a shortcut branch.

    Activations per stage are GELU, tanh, and (after adding the shortcut)
    ReLU.

    Args:
        x: Block input; also fed to ``self.shortcut``.

    Returns:
        ReLU of the third stage's output summed with the shortcut.
    """
    hidden = F.gelu(self.bn1(self.conv1(x)))
    hidden = F.tanh(self.bn2(self.conv2(hidden)))
    hidden = self.bn3(self.conv3(hidden))

    # In-place add of the shortcut branch, then the final non-linearity.
    hidden += self.shortcut(x)
    return F.relu(hidden)
def forward(self, embedded, mask):
    """Post-norm encoder layer: self-attention then position-wise MLP.

    Args:
        embedded: ``[batch size, seq len, emb dim]``.
        mask: Passed as the fourth argument of ``self.self_attn``.

    Returns:
        Tensor of the same layout as ``embedded``.
    """
    # Attention sub-layer with residual, then layer norm.
    attended = self.self_attn(embedded, embedded, embedded, mask)
    embedded = self.layer_norm_1(embedded + self.dropout(attended))

    # Feed-forward sub-layer: the same dropout module is applied both after
    # the activation and after the projection.
    hidden = self.dropout(F.gelu(self.fc_1(embedded)))
    projected = self.dropout(self.fc_2(hidden))
    return self.layer_norm_2(embedded + projected)
def update(self, aggr_out, node_inp, node_type):
    '''
        Step 3: Target-specific Aggregation
        x = W[node_type] * gelu(Agg(x)) + x

        For each node type, the activated aggregate is passed through that
        type's linear map and blended with the node's input through a
        sigmoid gate on the learnable ``self.skip`` entry for the type.
        Rows whose type never occurs stay zero.
    '''
    activated = F.gelu(aggr_out)
    res = torch.zeros(activated.size(0), self.out_dim, device=node_inp.device)

    for type_id in range(self.num_types):
        selected = (node_type == int(type_id)).reshape(-1)
        if not selected.any():
            continue
        # Skip connection with a learnable weight: gate in (0, 1).
        gate = torch.sigmoid(self.skip[type_id])
        transformed = self.a_linears[type_id](activated[selected])
        res[selected] = transformed * gate + node_inp[selected] * (1 - gate)

    return self.drop(res)
def forward(self, x, hidden):
    """Convolutional front-end, dense projection, recurrent stack, classifier.

    Args:
        x: Input to ``self.conv1``; the convolutional output is read as
            (batch, channel, features, time).
        hidden: Recurrent state, handed to ``self.gru_layers`` together with
            the features as one tuple.

    Returns:
        ``(out, hidden)``: classifier output and the state returned by the
        recurrent stack.
    """
    feats = self.conv_layers(self.conv1(x))

    # Merge channel and feature axes, then put time before features:
    # (batch, features*channel, time) -> (batch, time, features*channel)
    dims = feats.size()
    feats = feats.view(dims[0], dims[1] * dims[2], dims[3]).transpose(1, 2)

    feats = self.dropout(F.gelu(self.fully_connected(feats)))

    # The recurrent stack takes a single (features, state) tuple.
    out, hidden = self.gru_layers((feats, hidden))
    return self.classifier(out), hidden
def forward(self, self_attn, label_attn):
    """Fuse two attention outputs with a learned, normalised gate.

    Args:
        self_attn: First representation, ``[batch, label, hidden]``.
        label_attn: Second representation, ``[batch, label, hidden]``.

    Returns:
        Output of ``self.out_linear`` with its last dimension squeezed.
    """
    gate_self = torch.sigmoid(self.linear_weight1(self_attn))
    gate_label = torch.sigmoid(self.linear_weight2(label_attn))

    # Normalise so that the two gates sum to one.
    gate_self = gate_self / (gate_self + gate_label)
    gate_label = 1 - gate_self

    fused = torch.cat((gate_self * self_attn, gate_label * label_attn), dim=-1)
    fused = self.ln(self.dropout(self.fusion_linear(fused)))
    return self.out_linear(F.gelu(fused)).squeeze(-1)
def transform(self, hidden_states):
    """Apply the prediction-head transform: dense, GELU, layer norm.

    Parameters are stored on ``self`` under their dotted checkpoint names
    with every ``.`` replaced by ``__``.

    Args:
        hidden_states: Input to the dense projection.

    Returns:
        The layer-normalised, activated projection.
    """

    def stored(dotted_name):
        # Map a dotted parameter name to the attribute that holds it.
        return getattr(self, dotted_name.replace('.', '__'))

    dense_weight = stored('cls.predictions.transform.dense.weight')
    dense_bias = stored('cls.predictions.transform.dense.bias')
    hidden_states = F.gelu(linear(hidden_states, dense_weight, dense_bias))

    norm_weight = stored('cls.predictions.transform.LayerNorm.weight')
    norm_bias = stored('cls.predictions.transform.LayerNorm.bias')
    return layer_norm(norm_weight, norm_bias, hidden_states, 1e-12)
def forward(self, src, src_mask=None, src_key_padding_mask=None):
    r"""Pass the input through the encoder layer.

    Args:
        src: the sequence to the encoder layer (required).
        src_mask: the mask for the src sequence (optional); not read by this
            method.
        src_key_padding_mask: the mask for the src keys per batch (optional).

    Returns:
        The transformed ``src`` followed by the queries, keys, slot vectors,
        slot assignment scores and slot attention scores reported by
        ``self.self_attn``.
    """
    (scaled_values, queries, keys, slot_vectors, slot_assignment_scores,
     slot_attention_scores) = self.self_attn(
         src, src, src, key_padding_mask=src_key_padding_mask)

    # Attention sub-layer: residual then norm.
    src = self.norm1(src + self.dropout1(scaled_values))

    # Feed-forward sub-layer: residual then norm.
    feed_forward = self.linear2(self.dropout(F.gelu(self.linear1(src))))
    src = self.norm2(src + self.dropout2(feed_forward))

    return (src, queries, keys, slot_vectors, slot_assignment_scores,
            slot_attention_scores)
def forward(self, lv, ls):
    """Normalise, activate and linearly project lattice values.

    Ordering follows the pre-activation scheme (norm -> activation -> layer)
    of https://arxiv.org/pdf/1603.05027.pdf. The normalisation module and
    the linear layer are created lazily on the first call, sized from
    ``lv.shape[1]``.

    Args:
        lv: Lattice values; ``lv.shape[1]`` is the input feature count.
        ls: Lattice structure; kept in sync through ``set_values``.

    Returns:
        ``(lv, ls)`` with ``lv`` projected to ``self.out_channels`` features.
    """
    ls.set_values(lv)

    if self.norm is None:
        in_channels = lv.shape[1]
        self.norm = GroupNormLatticeModule(in_channels)
        # Place the layer on the input's device rather than a hard-coded
        # "cuda", so CPU runs and non-default GPUs work as well.
        self.linear = torch.nn.Linear(in_channels,
                                      self.out_channels,
                                      bias=self.use_bias).to(lv.device)
        with torch.no_grad():
            torch.nn.init.kaiming_normal_(self.linear.weight,
                                          mode='fan_in',
                                          nonlinearity='relu')
        # NOTE(review): parameters created here do not exist before the
        # first forward pass -- confirm the optimizer is built after it.

    lv, ls = self.norm(lv, ls)
    lv = F.gelu(lv)
    ls.set_values(lv)

    lv = self.linear(lv)
    ls.set_values(lv)
    return lv, ls
def forward(self, x, mask):
    """Classify from the BiLSTM's final state over BERT's last hidden layer.

    Args:
        x: 2-D input of shape ``[N, S]`` handed to ``self.BERT``.
        mask: Attention mask for ``self.BERT``; also passed to
            ``self.BiLSTM``.

    Returns:
        ``(binary_prob, classes_prob)``: sigmoid of ``linear2`` and the raw
        output of ``linear3``, both computed from the same intermediate
        features.
    """
    batch_size, seq_len = x.size()

    # BERT must return exactly three items; only the first is used here.
    last_hidden, _, _ = self.BERT(x, attention_mask=mask)
    _, final_state = self.BiLSTM(self.dropout(last_hidden), mask)

    # The reshaped mask is not read afterwards; the view is retained because
    # it fails when the mask does not hold N * S elements.
    mask = mask.view(batch_size, seq_len, 1)

    features = F.gelu(self.linear1(final_state))
    binary_prob = T.sigmoid(self.linear2(features))
    classes_prob = self.linear3(features)
    return binary_prob, classes_prob
def test_gelu_activation(N=50):
    """Check the GELU activation against ``torch.nn.functional.gelu``.

    Each trial draws a random single-row stochastic matrix, compares the
    exact GELU with the PyTorch result, and compares the exact GELU with the
    approximate variant.

    Args:
        N: Number of trials; ``None`` runs forever.
    """
    from numpy_ml.neural_nets.activations import GELU

    def gold(z):
        # Reference implementation from PyTorch.
        return F.gelu(torch.FloatTensor(z)).numpy()

    limit = np.inf if N is None else N
    trial = 0
    while trial < limit:
        n_dims = np.random.randint(1, 100)
        z = random_stochastic_matrix(1, n_dims)
        # The drawn flag is not used below; the draw is kept so the sequence
        # of random numbers is unchanged.
        approx = np.random.choice([True, False])

        mine = GELU(approximate=False)
        mine_approx = GELU(approximate=True)

        np.testing.assert_allclose(mine.fn(z), gold(z), rtol=1e-3)
        assert_almost_equal(mine.fn(z), mine_approx.fn(z))
        print("PASSED")
        trial += 1
def forward(self, lv, ls, concat_connection=None):
    """Normalise, activate and coarsen lattice values (norm -> GELU -> coarse).

    The normalisation module is created lazily on the first call from
    ``lv.shape[1]``.

    Args:
        lv: Lattice values.
        ls: Lattice structure; kept in sync through ``set_values``.
        concat_connection: Optional tensor concatenated to the coarsened
            values along dimension 1.

    Returns:
        ``(lv_1, ls_1)``: the coarsened (and optionally concatenated) values
        and the lattice returned by ``self.coarse``, updated to hold them.
    """
    ls.set_values(lv)

    if self.norm is None:
        self.norm = GroupNormLatticeModule(lv.shape[1])

    values, ls = self.norm(lv, ls)
    values = F.gelu(values)
    ls.set_values(values)

    coarse_values, coarse_lattice = self.coarse(values, ls)
    coarse_lattice.set_values(coarse_values)

    if concat_connection is not None:
        coarse_values = torch.cat((coarse_values, concat_connection), 1)
        coarse_lattice.set_values(coarse_values)

    return coarse_values, coarse_lattice
def forward_ref(self, input, mask):
    """Reference MLP forward pass with an externally supplied dropout mask.

    Every layer is a linear map; all but the last are followed by GELU and
    by dropout whose keep/drop pattern is read from consecutive slices of
    ``mask`` and rescaled by ``1 / (1 - self.dropout)``.

    Args:
        input: Input to the first linear layer.
        mask: Flat tensor of dropout multipliers, consumed layer by layer.

    Returns:
        Output of the last linear layer.
    """
    offset = 0
    output = input
    last_layer = self.num_layers - 1

    for layer in range(self.num_layers):
        output = F.linear(output, self.weights[layer], self.biases[layer])
        count = output.numel()
        scale = 1 / (1 - self.dropout)
        if layer < last_layer:
            # This layer's slice of the mask, shaped like its activations.
            keep = mask[offset:offset + count].view(output.size(0), -1)
            output = F.gelu(output) * keep * scale
        offset += count

    return output
def forward(self, x, mask):
    """Fuse BERT's last six layers, encode with a BiLSTM and attend over time.

    Args:
        x: 2-D input of shape ``[N, S]`` handed to ``self.BERT``.
        mask: ``[N, S]`` mask; positions equal to 0 receive ``self.neg_inf``
            as an additive attention bias, all others ``self.zeros``.

    Returns:
        ``(binary_prob, classes_prob)``: sigmoid of ``linear2`` and the raw
        output of ``linear3``, both computed from the attended context.
    """
    N, S = x.size()
    state_width = 2 * self.hidden_size

    # BERT must return exactly three items; only the per-layer states are used.
    _, _, all_hidden_states = self.BERT(x, attention_mask=mask)

    # Lay the last six layers side by side and regroup them per token.
    stacked = T.cat(all_hidden_states[-6:], dim=-1).view(N * S, 6, 768)
    _, fused = self.layer_fusion(stacked)
    fused = self.dropout(fused.view(N * S, 768).view(N, S, 768))

    encoder_states, _ = self.BiLSTM(fused, mask)

    # Additive attention bias derived from the mask.
    additive_mask = T.where(mask.view(N, S, 1) == 0.0, self.neg_inf,
                            self.zeros).view(N, S)

    # Attention mechanism: score every time step, bias, normalise.
    flat_states = encoder_states.view(N * S, state_width)
    scores = self.linear_attn_2(T.tanh(self.linear_attn_1(flat_states)))
    weights = F.softmax(scores.view(N, S) + additive_mask, dim=-1)

    # Weighted sum of the encoder states over the sequence axis.
    context_vector = T.sum(
        weights.view(N, S, 1) * flat_states.view(N, S, state_width), dim=1)

    intermediate = F.gelu(self.linear1(context_vector))
    binary_prob = T.sigmoid(self.linear2(intermediate))
    classes_prob = self.linear3(intermediate)
    return binary_prob, classes_prob
def forward(self, xyz1, xyz2, points1, points2):
    """
    Propagate features from the sampled points back to the dense points.

    Input:
        xyz1: input points position data, unpacked here as [B, N, C]
        xyz2: sampled input points position data, unpacked here as [B, S, C]
        points1: input points data, [B, D, N], or None
        points2: input points data, [B, D, S]
    Return:
        new_points: upsampled points data, [B, D', N]
    """
    sampled_feats = points2.permute(0, 2, 1)  # [B, S, D]

    B, N, C = xyz1.shape
    _, S, _ = xyz2.shape

    if S == 1:
        # A single sampled point: broadcast its features to every point.
        interpolated_points = sampled_feats.repeat(1, N, 1)
    else:
        # Inverse-distance weighting over the three nearest sampled points.
        dists, idx = square_distance(xyz1, xyz2).sort(dim=-1)
        dists, idx = dists[:, :, :3], idx[:, :, :3]  # [B, N, 3]

        dist_recip = 1.0 / (dists + 1e-8)
        weight = dist_recip / torch.sum(dist_recip, dim=2, keepdim=True)
        neighbours = index_points(sampled_feats, idx)
        interpolated_points = torch.sum(neighbours * weight.view(B, N, 3, 1),
                                        dim=2)

    if points1 is None:
        new_points = interpolated_points
    else:
        dense_feats = points1.permute(0, 2, 1)
        new_points = torch.cat([dense_feats, interpolated_points], dim=-1)

    # Back to channels-first for the conv/batch-norm stack.
    new_points = new_points.permute(0, 2, 1)
    for i, conv in enumerate(self.mlp_convs):
        new_points = F.gelu(self.mlp_bns[i](conv(new_points)))
    return new_points
def forward(self, batch):
    """
    Classify from the first hidden state of BERT joined with meta features.

    :param batch: indexable batch whose first three items are the input ids,
        the attention mask and the meta features
    :return: one-element tuple holding the pre-softmax logits
    """
    b_input_ids, b_input_mask, b_meta_features = batch[0], batch[1], batch[2]

    bert_outputs = self.bert(input_ids=b_input_ids,
                             attention_mask=b_input_mask)

    # Hidden state at the first sequence position of BERT's first output.
    first_state = bert_outputs[0][:, 0]

    features = torch.cat([first_state, b_meta_features], dim=-1)
    features = F.gelu(self.hidden(self.dropout(features)))

    logits = self.classifier(features)
    return (logits, )
def forward(self, x):
    """Classify a point cloud with four conv/pool stages and an MLP head.

    Each stage applies a graph convolution followed by GELU and then pools
    to a quarter, eighth, sixteenth and thirty-second of the original point
    count. The pooled features pass through two more convolutions, are
    reduced by max and average pooling, and feed a three-layer MLP.

    Args:
        x: Point tensor of shape ``(bsize, feats, num_pts)``.

    Returns:
        Output of ``self.linear3``.
    """
    bsize, feats, num_pts = x.size()

    stages = (
        (self.conv1, self.pool1, 4),
        (self.conv2, self.pool2, 8),
        (self.conv3, self.pool3, 16),
        (self.conv4, self.pool4, 32),
    )
    # The first convolution receives the points themselves as features.
    feature = x
    for conv, pool, divisor in stages:
        feature = F.gelu(conv(x, feature))
        x, feature = pool(x, feature, num_pts // divisor)

    feature = F.gelu(self.conv6(x, feature))
    x = F.gelu(self.conv5(feature))

    # Global descriptor: max-pooled and average-pooled features side by side.
    pooled_max = F.adaptive_max_pool1d(x, 1).view(bsize, -1)
    pooled_avg = F.adaptive_avg_pool1d(x, 1).view(bsize, -1)
    x = torch.cat((pooled_max, pooled_avg), 1)

    x = self.dp1(F.gelu(self.bn6(self.linear1(x))))
    x = self.dp2(F.gelu(self.bn7(self.linear2(x))))
    return self.linear3(x)
def forward(self, x, seed):
    """Chunked feed-forward with seeded, reproducible dropout.

    The sequence is folded into the batch dimension in ``self.chunk``
    pieces, transformed, and unfolded again. In training mode the dropout
    mask is drawn from a generator seeded with ``seed``, so the same seed
    reproduces the same mask.

    Args:
        x: ``[batch, length, d_model]``.
        seed: Integer seed for the dropout generator.

    Returns:
        ``[batch, length, d_model]`` tensor.
    """
    # [batch, length, d_model] -> [batch * chunk, length // chunk, d_model]
    x = x.reshape(-1, x.size(1) // self.chunk, x.size(2))

    # [batch * chunk, length // chunk, d_ff]
    output = F.gelu(self.linear1(x))

    if self.training:
        # Build the generator from the tensor's device object:
        # `get_device()` yields -1 for CPU tensors, which is not a valid
        # generator device.
        generator = torch.Generator(device=output.device)
        generator.manual_seed(seed)
        keep_prob = 1 - self.dropout
        dropout_mask = torch.bernoulli(output, p=keep_prob,
                                       generator=generator)
        # Inverted dropout: rescale the kept activations.
        output = dropout_mask * output / keep_prob

    # [batch * chunk, length // chunk, d_model]
    output = self.linear2(output)

    # [batch, length, d_model]
    return output.reshape(-1, output.size(1) * self.chunk, output.size(2))
def forward(self, x):
    """Encode a token sequence with stacked self-attention and decode an output.

    Args:
        x: Token tensor laid out as [batch, seq]; entries equal to 0 are
            masked out as keys in every attention layer.

    Returns:
        Output of ``self.output_layer`` applied to the mean-pooled encoding.
    """
    # Zero out the attention of empty sequence elements.
    x_key_padding_mask = (x == 0).clone().detach()

    # Embed in [seq, batch] layout and add positional information.
    x = self.positionalEncoder(self.embedding(x.transpose(1, 0).int()))

    for i, attention in enumerate(self.self_attn_layers):
        # Only the first item returned by the attention layer is kept.
        x = attention(x, x, x, key_padding_mask=x_key_padding_mask)[0]
        x = self.encoder_linear[i](x)

    x = x.mean(dim=0)  # mean aggregation over the sequence axis

    for i, layer in enumerate(self.decoder_layers):
        x = self.decoder_dropouts[i](F.gelu(layer(x)))

    return self.output_layer(x)
def forward(self, state, ir_state, action):
    """Evaluate both Q-value heads for a state/action pair.

    Args:
        state: State tensor.
        ir_state: Second state tensor, concatenated after ``state``.
        action: Action tensor, concatenated last.

    Returns:
        ``(q1, q2)``: the outputs of the two heads.
    """
    sa = torch.cat([state, ir_state, action], 1)

    # First head: f1 -> f2 -> f3 (GELU), f4 (SELU), f5.
    q1 = sa
    for layer in (self.f1, self.f2, self.f3):
        q1 = F.gelu(layer(q1))
    q1 = self.f5(F.selu(self.f4(q1)))

    # Second head: l1 -> l2 -> l3 (GELU), then the same f4 and f5 as the
    # first head.
    # NOTE(review): sharing f4/f5 between the heads looks like a copy-paste
    # slip (l4/l5 may have been intended) -- confirm before changing.
    q2 = sa
    for layer in (self.l1, self.l2, self.l3):
        q2 = F.gelu(layer(q2))
    q2 = self.f5(F.selu(self.f4(q2)))

    return q1, q2