def forward(self, x, init_states=None): """Assumes x is of shape (batch, sequence, feature)""" seq_sz, bs, _ = x.size() hidden_seq = [] if init_states is None: h_t, c_t = ( flow.zeros((bs, self.hidden_size)).to("cuda"), flow.zeros((bs, self.hidden_size)).to("cuda"), ) else: h_t, c_t = init_states HS = self.hidden_size for t in range(seq_sz): x_t = x[t, :, :].reshape(x.shape[1], x.shape[2]) # batch the computations into a single matrix multiplication # NOTE(Xu Zhiqiu): flow does not support view now, use reshape instead gates = flow.matmul(x_t, self.W) + flow.matmul(h_t, self.U) + self.bias i_t, f_t, g_t, o_t = ( flow.sigmoid(gates[:, :HS]), flow.sigmoid(gates[:, HS:HS * 2]), flow.tanh(gates[:, HS * 2:HS * 3]), flow.sigmoid(gates[:, HS * 3:]), ) c_t = f_t * c_t + i_t * g_t h_t = o_t * flow.tanh(c_t) hidden_seq.append(h_t.unsqueeze(0)) hidden_seq = flow.cat(hidden_seq, dim=0) return hidden_seq, (h_t, c_t)
def forward(self, x, init_states=None):
    seq_sz, bs, _ = x.size()
    hidden_seq = []
    if init_states is None:
        h_t, c_t = (
            flow.zeros((bs, self.hidden_size)).to("cuda"),
            flow.zeros((bs, self.hidden_size)).to("cuda"),
        )
    else:
        h_t, c_t = init_states
    HS = self.hidden_size
    for t in range(seq_sz):
        x_t = x[t, :, :]
        x_t = x_t.reshape(x.shape[1], x.shape[2])
        gates = flow.matmul(x_t, self.W) + flow.matmul(h_t, self.U) + self.bias
        i_t, f_t, g_t, o_t = (
            flow.sigmoid(gates[:, :HS]),
            flow.sigmoid(gates[:, HS:HS * 2]),
            flow.tanh(gates[:, HS * 2:HS * 3]),
            flow.sigmoid(gates[:, HS * 3:]),
        )
        c_t = f_t * c_t + i_t * g_t
        h_t = o_t * flow.tanh(c_t)
        hidden_seq.append(h_t.unsqueeze(0))
    hidden_seq = flow.cat(hidden_seq, dim=0)
    return hidden_seq, (h_t, c_t)
def forward(self, x, init_states=None): """Assumes x is of shape (batch, sequence, feature)""" bs, seq_sz, _ = x.size() hidden_seq = [] if init_states is None: h_t, c_t = ( flow.zeros((bs, self.hidden_size)).to(x.device), flow.zeros((bs, self.hidden_size)).to(x.device), ) else: h_t, c_t = init_states HS = self.hidden_size for t in range(seq_sz): x_t = x[:, t, :].reshape(x.shape[0], x.shape[2]) gates = flow.matmul(x_t, self.W) + flow.matmul(h_t, self.U) + self.bias i_t, f_t, g_t, o_t = ( flow.sigmoid(gates[:, :HS]), flow.sigmoid(gates[:, HS : HS * 2]), flow.tanh(gates[:, HS * 2 : HS * 3]), flow.sigmoid(gates[:, HS * 3 :]), ) c_t = f_t * c_t + i_t * g_t h_t = o_t * flow.tanh(c_t) hidden_seq.append(h_t.unsqueeze(1)) hidden_seq = flow.cat(hidden_seq, dim=1) return hidden_seq, (h_t, c_t)
def forward(self, x, c):
    # broadcast the condition vector c to the spatial size of each feature
    # map, concatenating it along the channel axis before every block
    c = c.view(c.size(0), c.size(1), 1, 1)
    c1 = c.repeat(1, 1, x.size(2), x.size(3))
    x = flow.cat([x, c1], dim=1)
    x = self.d1(x)
    c2 = c.repeat(1, 1, x.size(2), x.size(3))
    x = flow.cat([x, c2], dim=1)
    x = self.d2(x)
    c3 = c.repeat(1, 1, x.size(2), x.size(3))
    x = flow.cat([x, c3], dim=1)
    x = self.d3(x)
    c4 = c.repeat(1, 1, x.size(2), x.size(3))
    x = flow.cat([x, c4], dim=1)
    x = self.d4(x)
    c5 = c.repeat(1, 1, x.size(2), x.size(3))
    x = flow.cat([x, c5], dim=1)
    x = self.conv(x)
    x = self.pool(x)
    x = flow.squeeze(x)
    x = flow.tanh(x)
    return x
def _test_body_tanh_v2(test_case, input_arr):
    x = flow.Tensor(input_arr)
    y = flow.tanh(x)
    z = np.tanh(input_arr)
    test_case.assertTrue(np.allclose(y.numpy(), z, rtol=1e-4, atol=1e-4))
def gelu(x):
    """
    Implementation of the GELU activation function currently in Google BERT
    repo (identical to OpenAI GPT). Also see the Gaussian Error Linear Units
    paper: https://arxiv.org/abs/1606.08415
    """
    return 0.5 * x * (
        1.0 + flow.tanh(math.sqrt(2.0 / math.pi) * (x + 0.044715 * flow.pow(x, 3.0)))
    )
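# A quick numerical sanity check (not from the original source) for the tanh
# approximation above, done in NumPy/SciPy so it needs no framework state;
# the tolerance and the test range are assumptions.
import math
import numpy as np
from scipy.special import erf  # exact GELU uses the Gaussian CDF

xs = np.linspace(-4.0, 4.0, 101)
approx = 0.5 * xs * (1.0 + np.tanh(math.sqrt(2.0 / math.pi) * (xs + 0.044715 * xs ** 3)))
exact = 0.5 * xs * (1.0 + erf(xs / math.sqrt(2.0)))
assert np.allclose(approx, exact, atol=1e-3)  # agree to ~1e-3 on this range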
def forward(self, x, hidden=None):
    batch_size, seq_len, _ = x.size()
    H_S = self.hidden_size
    hidden_seq = []
    if hidden is None:
        h_t = flow.zeros((batch_size, self.hidden_size))
    else:
        h_t = hidden
    for t in range(seq_len):
        x_t = x[:, t, :]
        gates_1 = flow.matmul(x_t, self.inp_W) + self.inp_b
        gates_2 = flow.matmul(h_t, self.hid_W) + self.hid_b
        r_gate = flow.sigmoid(gates_1[:, :H_S] + gates_2[:, :H_S])
        z_gate = flow.sigmoid(gates_1[:, H_S:H_S * 2] + gates_2[:, H_S:H_S * 2])
        h_t_ = flow.tanh(
            gates_1[:, H_S * 2:H_S * 3] + r_gate * gates_2[:, H_S * 2:H_S * 3]
        )
        h_t = (1 - z_gate) * h_t_ + z_gate * h_t
        hidden_seq.append(h_t.unsqueeze(1))
    hidden_seq = flow.cat(hidden_seq, dim=1)
    return hidden_seq, h_t
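# Shape walk-through (not from the original source) for one step of the GRU
# cell above; the concrete sizes are illustrative assumptions. The three gate
# slices are reset (r), update (z), and candidate (n), each (batch, hidden).
import oneflow as flow

batch, input_size, hidden_size = 4, 10, 20
x_t = flow.randn(batch, input_size)
h_t = flow.zeros(batch, hidden_size)
inp_W = flow.randn(input_size, 3 * hidden_size)   # assumed shape of self.inp_W
inp_b = flow.zeros(3 * hidden_size)
hid_W = flow.randn(hidden_size, 3 * hidden_size)  # assumed shape of self.hid_W
hid_b = flow.zeros(3 * hidden_size)

gates_1 = flow.matmul(x_t, inp_W) + inp_b         # (4, 60)
gates_2 = flow.matmul(h_t, hid_W) + hid_b         # (4, 60)
r = flow.sigmoid(gates_1[:, :hidden_size] + gates_2[:, :hidden_size])
z = flow.sigmoid(gates_1[:, hidden_size:2 * hidden_size] + gates_2[:, hidden_size:2 * hidden_size])
n = flow.tanh(gates_1[:, 2 * hidden_size:] + r * gates_2[:, 2 * hidden_size:])
h_next = (1 - z) * n + z * h_t                    # (4, 20)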
def _tanh(self):
    return flow.tanh(self)
def forward(self, input, h_0=None):
    if self.batch_first == False:
        input = self.permute_tensor(input)
    D = 2 if self.bidirectional else 1
    num_layers = self.num_layers
    batch_size, seq_len, _ = input.size()
    if h_0 is None:
        real_hidden_size = self.proj_size if self.proj_size > 0 else self.hidden_size
        h_t = flow.zeros(
            (D * num_layers, batch_size, real_hidden_size),
            dtype=input.dtype,
            device=input.device,
        )
        c_t = flow.zeros(
            (D * num_layers, batch_size, self.hidden_size),
            dtype=input.dtype,
            device=input.device,
        )
        h_0 = (h_t, c_t)
    else:
        h_t, c_t = h_0
    if self.bidirectional:
        if h_0 is None:
            h_t_f = h_t[:num_layers, :, :]
            h_t_b = h_t[num_layers:, :, :]
            c_t_f = c_t[:num_layers, :, :]
            c_t_b = c_t[num_layers:, :, :]
        else:
            h_t_f = flow.cat(
                [h_t[l, :, :].unsqueeze(0) for l in range(h_t.size(0)) if l % 2 == 0],
                dim=0,
            )
            h_t_b = flow.cat(
                [h_t[l, :, :].unsqueeze(0) for l in range(h_t.size(0)) if l % 2 != 0],
                dim=0,
            )
            c_t_f = flow.cat(
                [c_t[l, :, :].unsqueeze(0) for l in range(c_t.size(0)) if l % 2 == 0],
                dim=0,
            )
            c_t_b = flow.cat(
                [c_t[l, :, :].unsqueeze(0) for l in range(c_t.size(0)) if l % 2 != 0],
                dim=0,
            )
    else:
        h_t_f = h_t
        c_t_f = c_t
    layer_hidden = []
    layer_cell = []
    for layer in range(self.num_layers):
        hidden_seq_f = []
        if self.bidirectional:
            hidden_seq_b = []
        hid_t_f = h_t_f[layer, :, :]
        h_c_t_f = c_t_f[layer, :, :]
        if self.bidirectional:
            hid_t_b = h_t_b[layer, :, :]
            h_c_t_b = c_t_b[layer, :, :]
        for t in range(seq_len):
            if layer == 0:
                x_t_f = input[:, t, :]
                if self.bidirectional:
                    x_t_b = input[:, seq_len - 1 - t, :]
            else:
                x_t_f = hidden_seq[:, t, :]
                if self.bidirectional:
                    x_t_b = hidden_seq[:, seq_len - 1 - t, :]
            # TODO: Modify after adding the stride attribute
            # gi_f = flow.matmul(
            #     x_t_f,
            #     getattr(self, "weight_ih_l{}{}".format(layer, "")).permute(1, 0),
            # )
            # gh_f = flow.matmul(
            #     hid_t_f,
            #     getattr(self, "weight_hh_l{}{}".format(layer, "")).permute(1, 0),
            # )
            gi_f = flow.matmul(
                x_t_f,
                getattr(self, "weight_ih_l{}{}".format(layer, "")),
            )
            gh_f = flow.matmul(
                hid_t_f,
                getattr(self, "weight_hh_l{}{}".format(layer, "")),
            )
            if self.bias:
                gi_f += getattr(self, "bias_ih_l{}{}".format(layer, ""))
                gh_f += getattr(self, "bias_hh_l{}{}".format(layer, ""))
            gates_f = gi_f + gh_f
            ingate_f, forgetgate_f, cellgate_f, outgate_f = gates_f.chunk(4, dim=1)
            ingate_f = flow.sigmoid(ingate_f)
            forgetgate_f = flow.sigmoid(forgetgate_f)
            cellgate_f = flow.tanh(cellgate_f)
            outgate_f = flow.sigmoid(outgate_f)
            h_c_t_f = (forgetgate_f * h_c_t_f) + (ingate_f * cellgate_f)
            hid_t_f = outgate_f * flow.tanh(h_c_t_f)
            if self.proj_size > 0:
                # TODO: Modify after adding the stride attribute
                # hid_t_f = flow.matmul(
                #     hid_t_f,
                #     getattr(self, "weight_hr_l{}{}".format(layer, "")).permute(1, 0),
                # )
                hid_t_f = flow.matmul(
                    hid_t_f, getattr(self, "weight_hr_l{}{}".format(layer, ""))
                )
            hidden_seq_f.append(hid_t_f.unsqueeze(1))
            if self.bidirectional:
                # TODO: Modify after adding the stride attribute
                # gi_b = flow.matmul(
                #     x_t_b,
                #     getattr(self, "weight_ih_l{}{}".format(layer, "_reverse")).permute(1, 0),
                # )
                # gh_b = flow.matmul(
                #     hid_t_b,
                #     getattr(self, "weight_hh_l{}{}".format(layer, "_reverse")).permute(1, 0),
                # )
                gi_b = flow.matmul(
                    x_t_b,
                    getattr(self, "weight_ih_l{}{}".format(layer, "_reverse")),
                )
                gh_b = flow.matmul(
                    hid_t_b,
                    getattr(self, "weight_hh_l{}{}".format(layer, "_reverse")),
                )
                if self.bias:
                    gi_b += getattr(self, "bias_ih_l{}{}".format(layer, "_reverse"))
                    gh_b += getattr(self, "bias_hh_l{}{}".format(layer, "_reverse"))
                gates_b = gi_b + gh_b
                ingate_b, forgetgate_b, cellgate_b, outgate_b = gates_b.chunk(4, dim=1)
                ingate_b = flow.sigmoid(ingate_b)
                forgetgate_b = flow.sigmoid(forgetgate_b)
                cellgate_b = flow.tanh(cellgate_b)
                outgate_b = flow.sigmoid(outgate_b)
                h_c_t_b = (forgetgate_b * h_c_t_b) + (ingate_b * cellgate_b)
                hid_t_b = outgate_b * flow.tanh(h_c_t_b)
                if self.proj_size > 0:
                    # TODO: Modify after adding the stride attribute
                    # hid_t_b = flow.matmul(
                    #     hid_t_b,
                    #     getattr(self, "weight_hr_l{}{}".format(layer, "_reverse")).permute(1, 0),
                    # )
                    hid_t_b = flow.matmul(
                        hid_t_b,
                        getattr(self, "weight_hr_l{}{}".format(layer, "_reverse")),
                    )
                hidden_seq_b.insert(0, hid_t_b.unsqueeze(1))
        hidden_seq_f = flow.cat(hidden_seq_f, dim=1)
        if self.bidirectional:
            hidden_seq_b = flow.cat(hidden_seq_b, dim=1)
        if self.dropout != 0 and layer != self.num_layers - 1:
            hidden_seq_f = self.drop(hidden_seq_f)
            if self.bidirectional:
                hidden_seq_b = self.drop(hidden_seq_b)
        if self.bidirectional:
            hidden_seq = flow.cat([hidden_seq_f, hidden_seq_b], dim=2)
        else:
            hidden_seq = hidden_seq_f
        if self.bidirectional:
            h_t = flow.cat([hid_t_f.unsqueeze(0), hid_t_b.unsqueeze(0)], dim=0)
            c_t = flow.cat([h_c_t_f.unsqueeze(0), h_c_t_b.unsqueeze(0)], dim=0)
        else:
            h_t = hid_t_f.unsqueeze(0)
            c_t = h_c_t_f.unsqueeze(0)
        layer_hidden.append(h_t)
        layer_cell.append(c_t)
    h_t = flow.cat(layer_hidden, dim=0)
    c_t = flow.cat(layer_cell, dim=0)
    if self.batch_first == False:
        hidden_seq = self.permute_tensor(hidden_seq)
    return hidden_seq, (h_t, c_t)
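# Usage sketch (not from the original source) for the LSTM module whose
# forward is shown above. It assumes the module exposes the familiar
# torch-style constructor, as oneflow.nn.LSTM does.
import oneflow as flow
import oneflow.nn as nn

lstm = nn.LSTM(input_size=32, hidden_size=64, num_layers=2,
               batch_first=True, bidirectional=True)
x = flow.randn(8, 16, 32)       # (batch, seq, feature)
out, (h_n, c_n) = lstm(x)       # out: (8, 16, 128); h_n, c_n: (4, 8, 64)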
def forward(self, input, h_0=None):
    if self.batch_first == False:
        input = self.permute_tensor(input)
    D = 2 if self.bidirectional else 1
    num_layers = self.num_layers
    batch_size, seq_len, _ = input.size()
    if h_0 is None:
        h_t = flow.zeros(
            (D * num_layers, batch_size, self.hidden_size),
            dtype=input.dtype,
            device=input.device,
        )
    else:
        h_t = h_0
    if self.bidirectional:
        if h_0 is None:
            h_t_f = h_t[:num_layers, :, :]
            h_t_b = h_t[num_layers:, :, :]
        else:
            h_t_f = flow.cat(
                [h_t[l, :, :].unsqueeze(0) for l in range(h_t.size(0)) if l % 2 == 0],
                dim=0,
            )
            h_t_b = flow.cat(
                [h_t[l, :, :].unsqueeze(0) for l in range(h_t.size(0)) if l % 2 != 0],
                dim=0,
            )
    else:
        h_t_f = h_t
    layer_hidden = []
    for layer in range(self.num_layers):
        hidden_seq_f = []
        if self.bidirectional:
            hidden_seq_b = []
        hid_t_f = h_t_f[layer, :, :]
        if self.bidirectional:
            hid_t_b = h_t_b[layer, :, :]
        for t in range(seq_len):
            if layer == 0:
                x_t_f = input[:, t, :]
                if self.bidirectional:
                    x_t_b = input[:, seq_len - 1 - t, :]
            else:
                x_t_f = hidden_seq[:, t, :]
                if self.bidirectional:
                    x_t_b = hidden_seq[:, seq_len - 1 - t, :]
            # TODO: Modify after adding the stride attribute
            # gi_f = flow.matmul(
            #     x_t_f,
            #     getattr(self, "weight_ih_l{}{}".format(layer, "")).permute(1, 0),
            # )
            # gh_f = flow.matmul(
            #     hid_t_f,
            #     getattr(self, "weight_hh_l{}{}".format(layer, "")).permute(1, 0),
            # )
            gi_f = flow.matmul(
                x_t_f,
                getattr(self, "weight_ih_l{}{}".format(layer, "")),
            )
            gh_f = flow.matmul(
                hid_t_f,
                getattr(self, "weight_hh_l{}{}".format(layer, "")),
            )
            if self.bias:
                gi_f += getattr(self, "bias_ih_l{}{}".format(layer, ""))
                gh_f += getattr(self, "bias_hh_l{}{}".format(layer, ""))
            i_r_f, i_i_f, i_n_f = gi_f.chunk(3, dim=1)
            h_r_f, h_i_f, h_n_f = gh_f.chunk(3, dim=1)
            resetgate_f = flow.sigmoid(i_r_f + h_r_f)
            inputgate_f = flow.sigmoid(i_i_f + h_i_f)
            newgate_f = flow.tanh(i_n_f + resetgate_f * h_n_f)
            hid_t_f = newgate_f + inputgate_f * (hid_t_f - newgate_f)
            hidden_seq_f.append(hid_t_f.unsqueeze(1))
            if self.bidirectional:
                # TODO: Modify after adding the stride attribute
                # gi_b = flow.matmul(
                #     x_t_b,
                #     getattr(self, "weight_ih_l{}{}".format(layer, "_reverse")).permute(1, 0),
                # )
                # gh_b = flow.matmul(
                #     hid_t_b,
                #     getattr(self, "weight_hh_l{}{}".format(layer, "_reverse")).permute(1, 0),
                # )
                gi_b = flow.matmul(
                    x_t_b,
                    getattr(self, "weight_ih_l{}{}".format(layer, "_reverse")),
                )
                gh_b = flow.matmul(
                    hid_t_b,
                    getattr(self, "weight_hh_l{}{}".format(layer, "_reverse")),
                )
                if self.bias:
                    gi_b += getattr(self, "bias_ih_l{}{}".format(layer, "_reverse"))
                    gh_b += getattr(self, "bias_hh_l{}{}".format(layer, "_reverse"))
                i_r_b, i_i_b, i_n_b = gi_b.chunk(3, dim=1)
                h_r_b, h_i_b, h_n_b = gh_b.chunk(3, dim=1)
                resetgate_b = flow.sigmoid(i_r_b + h_r_b)
                inputgate_b = flow.sigmoid(i_i_b + h_i_b)
                newgate_b = flow.tanh(i_n_b + resetgate_b * h_n_b)
                hid_t_b = newgate_b + inputgate_b * (hid_t_b - newgate_b)
                hidden_seq_b.insert(0, hid_t_b.unsqueeze(1))
        hidden_seq_f = flow.cat(hidden_seq_f, dim=1)
        if self.bidirectional:
            hidden_seq_b = flow.cat(hidden_seq_b, dim=1)
        if self.dropout != 0 and layer != self.num_layers - 1:
            hidden_seq_f = self.drop(hidden_seq_f)
            if self.bidirectional:
                hidden_seq_b = self.drop(hidden_seq_b)
        if self.bidirectional:
            hidden_seq = flow.cat([hidden_seq_f, hidden_seq_b], dim=2)
        else:
            hidden_seq = hidden_seq_f
        if self.bidirectional:
            h_t = flow.cat([hid_t_f.unsqueeze(0), hid_t_b.unsqueeze(0)], dim=0)
        else:
            h_t = hid_t_f.unsqueeze(0)
        layer_hidden.append(h_t)
    h_t = flow.cat(layer_hidden, dim=0)
    if self.batch_first == False:
        hidden_seq = self.permute_tensor(hidden_seq)
    return hidden_seq, h_t
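# Analogous usage sketch (not from the original source) for the GRU module,
# assuming an interface that mirrors oneflow.nn.GRU:
import oneflow as flow
import oneflow.nn as nn

gru = nn.GRU(input_size=32, hidden_size=64, num_layers=1, batch_first=True)
x = flow.randn(8, 16, 32)       # (batch, seq, feature)
out, h_n = gru(x)               # out: (8, 16, 64); h_n: (1, 8, 64)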
import logging

import oneflow as flow
import oneflow.nn as nn
import oneflow.nn.functional as F

logger = logging.getLogger(__name__)

_ACTIVATION = {
    "relu": F.relu,
    "gelu": F.gelu,
    "glu": F.glu,
    "tanh": lambda x: flow.tanh(x),
    "swish": lambda x: x * flow.sigmoid(x),
}


class PositionwiseFeedForward(nn.Module):
    """Positionwise feed forward"""

    def __init__(self, d_model, d_ff, dropout, activation="relu"):
        super(PositionwiseFeedForward, self).__init__()
        self.activation = activation
        assert activation in ["relu", "gelu", "glu", "tanh", "swish"]
        self.w_1 = nn.Linear(d_model, d_ff * 2 if activation == "glu" else d_ff)
        self.w_2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # w_1 -> activation -> dropout -> w_2 (standard positionwise FFN body;
        # inferred, as the original snippet was cut off at this point)
        return self.w_2(self.dropout(_ACTIVATION[self.activation](self.w_1(x))))
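# Usage sketch (not from the original source) for the feed-forward block
# above; the dimensions are illustrative assumptions.
ffn = PositionwiseFeedForward(d_model=256, d_ff=1024, dropout=0.1, activation="gelu")
x = flow.randn(8, 50, 256)      # (batch, time, d_model)
y = ffn(x)                      # (8, 50, 256)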