def self_attention(self, h, adj, step):
    attention_layer_index = 0 if self.attention_tying else step
    mask = np.sum(adj, axis=1)
    mask[mask == 0] = -10000
    # [mb, atoms, ch] -> [mb, ch, atoms]
    mb, atoms, ch = h.shape
    h = functions.transpose(h, axes=(0, 2, 1))
    h = self.linear_transform_layer[attention_layer_index](h)
    # [mb, 1, atoms]
    f_1 = self.conv1d_layer_1[attention_layer_index](h)
    # [mb, 1, atoms] -> [mb, atoms, 1]
    f_1 = functions.transpose(f_1, axes=(0, 2, 1))
    # [mb, atoms, 1] -> [mb, atoms, atoms]
    f_1 = functions.tile(f_1, reps=(1, 1, atoms))
    # [mb, 1, atoms]
    f_2 = self.conv1d_layer_2[attention_layer_index](h)
    # [mb, 1, atoms] -> [mb, atoms, atoms]
    f_2 = functions.tile(f_2, reps=(1, atoms, 1))
    logits = f_1 + f_2
    # logits *= mask
    # [mb, atoms, atoms]
    coefs = functions.softmax(functions.leaky_relu(logits))
    coefs = functions.transpose(coefs, axes=(0, 2, 1))
    # [mb, ch, atoms] -> [mb, atoms, ch]
    h = functions.transpose(h, axes=(0, 2, 1))
    h = functions.dropout(
        h, ratio=self.dropout_rate) if self.dropout_rate != 0.0 else h
    # [mb, atoms, atoms] * [mb, atoms, ch]
    vals = functions.matmul(coefs, h)
    h = functions.elu(vals)
    return h
def masked_self_attention(self, input, adj, step):
    adj = np.sum(adj, axis=1)
    # [mb, atoms, ch]
    mb, atoms, ch = input.shape
    attention_layer_index = 0 if self.attention_tying else step
    # [mb, atoms, hidden_dim]
    h = functions.reshape(input, shape=(mb * atoms, ch))
    h = self.linear_transform_layer[attention_layer_index](h)
    h = functions.reshape(h, shape=(mb, atoms, -1))
    # [mb, atoms, atoms, 2 * hidden_dim]
    a_input = functions.concat(
        [functions.tile(h, reps=(1, 1, atoms)).reshape(mb, atoms * atoms, -1),
         functions.tile(h, reps=(1, atoms, 1))],
        axis=-1).reshape(mb, atoms, atoms, 2 * self.hidden_dim)
    a_input = functions.reshape(a_input, shape=(mb * atoms * atoms, 2 * self.hidden_dim))
    # [mb * atoms * atoms, 2 * hidden_dim] => [mb * atoms * atoms, 1] => [mb, atoms, atoms]
    e = functions.leaky_relu(
        functions.reshape(
            functions.squeeze(
                self.neural_network_layer[attention_layer_index](a_input), axis=-1),
            shape=(mb, atoms, atoms)))
    # [mb, atoms, atoms]
    zero_vec = -9e15 * self.xp.ones_like(e, dtype=self.xp.float32)
    # [mb, atoms, atoms]
    attention = functions.where(adj > 0, e, zero_vec)
    # [mb, atoms, atoms]
    attention = functions.softmax(attention, axis=2)
    # [mb, atoms, atoms] * [mb, atoms, hidden_dim] => [mb, atoms, hidden_dim]
    h_prime = functions.matmul(attention, h)
    h_prime = functions.elu(h_prime)
    return h_prime
def _compare(self, s, o):
    """
    Receive: batch
    Return: self.n_rel-d vector
    """
    # Store batch size
    batch_size = len(s)
    # Repeat each subject and object vector slice-size times
    # W
    tensor_t = F.tile(self.wc, (batch_size, 1, 1, 1))
    # Calculate each term
    # - sWo
    so_elp = s * o  # element-wise product of s and o
    so_elp_t = F.reshape(F.tile(so_elp, (1, self.k)),
                         (batch_size, self.k, 1, self.d))
    sWo = F.sum(tensor_t * so_elp_t, (2, 3))
    affine = self._affinec(s, o, mp=self.mp)
    preact = sWo + affine
    compared = F.tanh(preact)
    return compared
def _node(self, left, right, ope):
    """
    Receive left and right vectors
    Return vectors of their destinations
    """
    batch_size = len(left)
    left_t = F.reshape(F.tile(left, (1, self.d)),
                       (batch_size * self.d, self.d))
    right_t = F.reshape(F.tile(right, (1, self.d)),
                        (batch_size * self.d, self.d))
    # W
    S_t = F.reshape(self.S[ope], (batch_size * self.d, self.d, self.p))
    T_t = F.reshape(self.T[ope], (batch_size * self.d, self.p, self.d))
    # Calculate each term
    # - sWo
    sS = F.batch_matmul(left_t, S_t, transa=True)
    To = F.batch_matmul(T_t, right_t)
    # Activation
    rntn = F.reshape(F.batch_matmul(sS, To), (batch_size, self.d))
    rnn = self._affine(left, right, ope)
    composed = F.tanh(rntn + rnn)
    return composed
def _node(self, left, right):
    """
    Receive left and right vectors
    Return vectors of their destinations
    """
    # Get batch size
    batch_size = len(left)
    # Get vectors of subjects and objects
    s_vecs = left
    o_vecs = right
    # Concatenation of subject and object
    so_vecs = F.concat((s_vecs, o_vecs), axis=1)
    # W
    tensor_t = F.tile(self.w, (batch_size, 1, 1, 1))
    # Calculate each term
    # - sWo
    so_elp = s_vecs * o_vecs  # element-wise product of s and o
    so_elp_t = F.reshape(F.tile(so_elp, (1, self.d)),
                         (batch_size, self.d, 1, self.d))
    sWo = F.sum(tensor_t * so_elp_t, (2, 3))
    # Sum up terms
    rntn = sWo
    rnn = self.V(so_vecs)
    composed = F.leaky_relu(rntn, slope=0.01) + F.leaky_relu(rnn, slope=0.01)
    return composed
def _compare(self, s, o):
    """
    Receive: batch
    Return: self.n_rel-d vector
    """
    # Store batch size
    batch_size = len(s)
    # Get vectors of subjects and objects
    s_vecs = s
    o_vecs = o
    # Repeat each subject and object vector slice-size times
    # - Concatenation of subject and object
    so_vecs = F.concat((s_vecs, o_vecs), axis=1)
    # W
    tensor_t = F.tile(self.wc, (batch_size, 1, 1, 1))
    # Calculate each term
    # - sWo
    so_elp = s_vecs * o_vecs  # element-wise product of s and o
    so_elp_t = F.reshape(F.tile(so_elp, (1, self.k)),
                         (batch_size, self.k, 1, self.d))
    sWo = F.sum(tensor_t * so_elp_t, (2, 3))
    # Activation
    rntn = sWo
    rnn = self.Vc(so_vecs)
    compared = F.leaky_relu(rntn, slope=0.01) + F.leaky_relu(rnn, slope=0.01)
    return compared
def _compare(self, s, o):
    """
    Receive: batch
    Return: self.n_rel-d vector
    """
    # Store batch size
    batch_size = len(s)
    # W
    tensor_t = F.tile(self.Wc, (batch_size, 1, 1, 1))
    # Calculate each term
    # - sWo
    s_vecs_ = F.reshape(s, (batch_size, 1, self.d))
    s_vecs__ = F.broadcast_to(s_vecs_, (batch_size, self.d, self.d))
    o_vecs_ = F.reshape(o, (batch_size, self.d, 1))
    o_vecs__ = F.broadcast_to(o_vecs_, (batch_size, self.d, self.d))
    so_elp = s_vecs__ * o_vecs__  # element-wise product of s and o
    so_elp_t = F.reshape(F.tile(so_elp, (1, 1, self.k)),
                         (batch_size, self.k, self.d, self.d))
    sWo = F.sum(tensor_t * so_elp_t, (2, 3))
    affine = self._affinec(s, o, mp=self.mp)
    preact = sWo + affine
    compared = F.tanh(preact)
    return compared
def __call__(self, atoms_1, g_1, atoms_2, g_2):
    """
    :param atoms_1: atomic representation of molecule 1, with shape of (mb, N_1, hidden_dim)
    :param g_1: molecular representation of molecule 1, with shape of (mb, out_dim)
    :param atoms_2: atomic representation of molecule 2, with shape of (mb, N_2, hidden_dim)
    :param g_2: molecular representation of molecule 2, with shape of (mb, out_dim)
    :return:
    """
    # energy: (mb, N_2, N_1)
    energy = self.compute_attention(query=atoms_2, key=atoms_1)
    # energy_1: (mb, N_1)
    energy_1 = functions.mean(energy, axis=1)
    # attn_1: (mb, N_1)
    attn_1 = functions.softmax(energy_1, axis=1)
    # attn_1: (mb, N_1, out_dim)
    attn_1 = functions.tile(functions.expand_dims(attn_1, axis=-1),
                            reps=(1, 1, self.out_dim))
    # energy_2: (mb, N_2)
    energy_2 = functions.mean(energy, axis=2)
    # attn_2: (mb, N_2)
    attn_2 = functions.softmax(energy_2, axis=1)
    # attn_2: (mb, N_2, out_dim)
    attn_2 = functions.tile(functions.expand_dims(attn_2, axis=-1),
                            reps=(1, 1, self.out_dim))
    # compact_1: (mb, out_dim)
    compact_1 = functions.sum(attn_1 * self.j_layer(atoms_1), axis=1)
    # compact_2: (mb, out_dim)
    compact_2 = functions.sum(attn_2 * self.j_layer(atoms_2), axis=1)
    return compact_1, compact_2
def mp_matching_func_pairwise(v1, v2, w):
    """
    Implementation of m = f_m(v_1, v_2, W).
    m_k = cosine(W_k \odot v_1, W_k \odot v_2)
    :param v1: (mb, N_1, hidden_dim)
    :param v2: (mb, N_2, hidden_dim)
    :param w: (head, hidden_dim)
    :return: sim: (mb, N_1, N_2, head)
    """
    mb, N_1, _ = v1.shape
    N_2 = v2.shape[1]
    # w: (head, hidden_dim) -> (1, head, hidden_dim) -> (1, head, 1, hidden_dim)
    w = F.expand_dims(F.expand_dims(w, axis=0), axis=2)
    # v1: (mb, head, N_1, hidden_dim)
    v1 = F.tile(w, reps=(mb, 1, N_1, 1)) * F.stack([v1] * self.head, axis=1)
    # v2: (mb, head, N_2, hidden_dim)
    v2 = F.tile(w, reps=(mb, 1, N_2, 1)) * F.stack([v2] * self.head, axis=1)
    # v1: (mb, head, N_1, hidden_dim), normalized on hidden_dim
    v1_normed = F.normalize(v1, axis=3)
    # v2: (mb, head, N_2, hidden_dim), normalized on hidden_dim
    v2_normed = F.normalize(v2, axis=3)
    # sim: (mb, head, N_1, N_2)
    sim = F.matmul(v1_normed, F.transpose(v2_normed, axes=(0, 1, 3, 2)))
    # sim: (mb, N_1, N_2, head)
    sim = F.transpose(sim, axes=(0, 2, 3, 1))
    return sim
def mp_matching_func(v1, v2, w):
    """
    Implementation of m = f_m(v_1, v_2, W).
    m_k = cosine(W_k \odot v_1, W_k \odot v_2)
    Similar to multi-head attention mechanism
    :param v1: (mb, N_1, hidden_dim)
    :param v2: (mb, N_1, hidden_dim) or (mb, hidden_size)
    :param w: (head, hidden_dim)
    :return: m: (mb, N_1, head)
    """
    mb, N_1, _ = v1.shape
    # w: (hidden_dim, head)
    w = F.transpose(w, axes=(1, 0))
    # w: (1, 1, hidden_dim, head)
    w = F.expand_dims(F.expand_dims(w, axis=0), axis=0)
    # v1: (mb, N_1, hidden_dim, head)
    v1 = F.tile(w, reps=(mb, N_1, 1, 1)) * F.stack([v1] * self.head, axis=3)
    if len(v2.shape) == 3:
        v2 = F.tile(w, reps=(mb, N_1, 1, 1)) * F.stack([v2] * self.head, axis=3)
    else:
        # v2: (mb, hidden_dim) -> (mb, N_1, hidden_dim) -> (mb, N_1, hidden_dim, head)
        v2 = F.tile(w, reps=(mb, N_1, 1, 1)) * \
            F.stack([F.stack([v2] * N_1, axis=1)] * self.head, axis=3)
    # v1/v2: (mb, N_1, hidden_dim, head)
    v1_normed = F.normalize(v1, axis=2)
    v2_normed = F.normalize(v2, axis=2)
    # (mb, N_1, head, head)
    sim = F.matmul(F.transpose(v1_normed, axes=(0, 1, 3, 2)), v2_normed)
    # sim: (mb, N_1, head, head) -> (mb, N_1, head)
    sim = sim[:, :, :, 0]
    return sim
def __call__(self, atoms_1, g_1, atoms_2, g_2):
    """
    :param atoms_1: atomic representation of molecule 1, with shape of (mb, N_1, hidden_dim)
    :param g_1: molecular representation of molecule 1, with shape of (mb, out_dim)
    :param atoms_2: atomic representation of molecule 2, with shape of (mb, N_2, hidden_dim)
    :param g_2: molecular representation of molecule 2, with shape of (mb, out_dim)
    :return:
    """
    # compute attention based on molecular representation of entity 2
    attn_1 = self.compute_attention(query=g_2, key=atoms_1, focus=1)
    # attn_1: (mb, N_1, out_dim)
    attn_1 = F.tile(attn_1, reps=(1, 1, self.out_dim))
    # (mb, N_1, out_dim) * (mb, N_1, out_dim) -> (mb, N_1, out_dim)
    z_1 = attn_1 * self.j_layer(atoms_1)
    # compact_1: (mb, out_dim)
    compact_1 = F.sum(z_1, axis=1)
    # compute attention based on attended molecular representation of entity 1
    # attn_2: (mb, N_2, 1)
    attn_2 = self.compute_attention(query=g_1, key=atoms_2, focus=2)
    # attn_2: (mb, N_2, out_dim)
    attn_2 = F.tile(attn_2, reps=(1, 1, self.out_dim))
    # z_2: (mb, N_2, out_dim) * (mb, N_2, out_dim) -> (mb, N_2, out_dim)
    z_2 = attn_2 * self.j_layer(atoms_2)
    # compact_2: (mb, out_dim)
    compact_2 = F.sum(z_2, axis=1)
    return compact_1, compact_2
def _compare(self, s, o):
    """
    Receive: batch
    Return: self.n_rel-d vector
    """
    # Get batch size
    batch_size = len(s)
    # - Concatenation of subject and object
    so_vecs = F.concat((s, o), axis=1)
    s_t = F.reshape(F.tile(s, (1, self.k)), (batch_size * self.k, self.d))
    o_t = F.reshape(F.tile(o, (1, self.k)), (batch_size * self.k, self.d))
    # W
    S_t = F.reshape(F.tile(self.Sc, (batch_size, 1)),
                    (batch_size * self.k, self.d, self.q))
    T_t = F.reshape(F.tile(self.Tc, (batch_size, 1)),
                    (batch_size * self.k, self.q, self.d))
    # Calculate each term
    # - sWo
    sS = F.batch_matmul(s_t, S_t, transa=True)
    To = F.batch_matmul(T_t, o_t)
    # Activation
    rntn = F.reshape(F.batch_matmul(sS, To), (batch_size, self.k))
    rnn = self.Vc(so_vecs)
    compared = F.tanh(rntn + rnn)
    return compared
def __call__(self, x, z):
    """
    Args:
        x (~chainer.Variable): Batch of input vectors.
        z (~chainer.Variable): Batch of context vectors.

    Returns:
        ~chainer.Variable: Output of the context layer.
    """
    if self.has_uninitialized_params:
        with cuda.get_device(self._device_id):
            self._initialize_params(x.size // x.shape[0])

    batch_size = x.shape[0]

    # compute adaptive filter
    W = self.predictor(z)

    # reshape linear W to the correct size
    W = F.reshape(W, [batch_size] + self.shape)

    # add constant W if defined
    if self.constantW:
        W += F.tile(self.C, (batch_size, 1, 1))

    # multiply weights with inputs in batch mode
    y = F.squeeze(F.batch_matmul(W, x), 2)

    # add bias
    y += F.tile(self.b, tuple([batch_size, 1]))

    return y
def interaction_layer(x, y):
    x_len = x.shape[1]
    y_len = y.shape[1]
    x_aug = F.tile(F.expand_dims(x, 2), (1, 1, y_len, 1))
    y_aug = F.tile(F.expand_dims(y, 1), (1, x_len, 1, 1))
    h_logits = x_aug * y_aug
    return h_logits
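# A minimal usage sketch for interaction_layer above (not from the original source;
# the toy shapes and the broadcast_to comparison are assumptions). It checks that the
# expand_dims + tile pattern yields the (batch, x_len, y_len, dim) tensor of pairwise
# element-wise products, and that F.broadcast_to gives the same result without
# materializing the tiled copies.
def _interaction_layer_sketch():
    import numpy as np
    import chainer.functions as F

    x = np.random.rand(2, 3, 5).astype(np.float32)  # (batch, x_len, dim)
    y = np.random.rand(2, 4, 5).astype(np.float32)  # (batch, y_len, dim)

    h = interaction_layer(x, y)
    assert h.shape == (2, 3, 4, 5)

    # Broadcasting-based equivalent of the tiled products.
    h_b = F.broadcast_to(F.expand_dims(x, 2), (2, 3, 4, 5)) * \
        F.broadcast_to(F.expand_dims(y, 1), (2, 3, 4, 5))
    np.testing.assert_allclose(h.data, h_b.data, rtol=1e-6)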
def compute_attention(self, query, key):
    """
    :param query: with shape of (mb, N_1, hidden_dim)
    :param key: with shape of (mb, N_2, hidden_dim)
    :return: attn: attention weights (mb, N_1, N_2)
    """
    energy_layer = self.energy_layer
    mb, N_1, hidden_dim = query.shape
    N_2 = key.shape[1]
    # query: (mb, N_1, 1, hidden_dim)
    query = functions.expand_dims(query, axis=2)
    # query: (mb, N_1, N_2, hidden_dim)
    query = functions.tile(query, reps=(1, 1, N_2, 1))
    # query: (mb * N_1 * N_2, hidden_dim)
    query = functions.reshape(query, (mb * N_1 * N_2, hidden_dim))
    # key: (mb, 1, N_2, hidden_dim)
    key = functions.expand_dims(key, axis=1)
    # key: (mb, N_1, N_2, hidden_dim)
    key = functions.tile(key, reps=(1, N_1, 1, 1))
    # key: (mb * N_1 * N_2, hidden_dim)
    key = functions.reshape(key, (mb * N_1 * N_2, hidden_dim))
    # energy: (mb * N_1 * N_2, 1)
    energy = self.activation(energy_layer(key, query))
    energy = functions.reshape(energy, (mb, N_1, N_2))
    return energy
def planar_flows(self, z):
    self.z_trans = []
    self.z_trans.append(z)
    self.phi = []

    for i in range(self.num_trans):
        flow_w_name = 'flow_w_' + str(i)
        flow_b_name = 'flow_b_' + str(i)
        flow_u_name = 'flow_u_' + str(i)

        h = self[flow_w_name](z)
        h = F.sum(h, axis=1)
        h = self[flow_b_name](h)
        h = F.tanh(h)
        h_tanh = h

        dim_latent = z.shape[1]
        h = F.transpose(F.tile(h, (dim_latent, 1)))
        h = self[flow_u_name](h)

        z += h
        self.z_trans.append(z)

        # Calculate and store the phi term
        h_tanh_derivative = 1 - (h_tanh * h_tanh)
        h_tanh_derivative = F.transpose(F.tile(h_tanh_derivative, (dim_latent, 1)))

        phi = self[flow_w_name](h_tanh_derivative)  # Equation (11)
        self.phi.append(phi)

    return z
def __call__(self, q, k, v, attn_mask=None):
    d_k, d_v = self.d_k, self.d_v
    n_head = self.n_heads
    batch_size, len_q, d_model = q.shape
    batch_size, len_k, d_model = k.shape
    batch_size, len_v, d_model = v.shape

    residual = q

    # treat as an (n_head)-size batch: (heads, batch x number_words, d_model)
    q_s = F.tile(q, reps=(n_head, 1, 1)).reshape(n_head, -1, d_model)  # n_head x (batch_size*len_q) x d_model
    k_s = F.tile(k, reps=(n_head, 1, 1)).reshape(n_head, -1, d_model)  # n_head x (batch_size*len_k) x d_model
    v_s = F.tile(v, reps=(n_head, 1, 1)).reshape(n_head, -1, d_model)  # n_head x (batch_size*len_v) x d_model

    # (n_head) batch matrix multiply of ((batch * len_q) x d_model) x (d_model x d_k) = (batch * len_q) x d_k
    # treat the result as an (n_head * mb_size)-size batch
    q_s = F.matmul(q_s, self.w_qs).reshape(-1, len_q, d_k)  # (n_head*mb_size) x len_q x d_k
    k_s = F.matmul(k_s, self.w_ks).reshape(-1, len_k, d_k)  # (n_head*mb_size) x len_k x d_k
    v_s = F.matmul(v_s, self.w_vs).reshape(-1, len_v, d_v)  # (n_head*mb_size) x len_v x d_v

    # outputs size = (n_head*mb_size) x len_q x d_v, attns size = (n_head*mb_size) x len_q x len_k
    if attn_mask is not None:
        attn_mask = F.tile(attn_mask, reps=(n_head, 1, 1))
    outputs, attns = self.attention(q_s, k_s, v_s, attn_mask=attn_mask)

    # (n_head*batch) x len_q x d_v -> batch_size x len_q x (n_head*d_v)
    outputs = F.concat(F.split_axis(outputs, n_head, axis=0), axis=2)
    outputs = F.reshape(outputs, shape=(batch_size * len_q, n_head * d_v))

    # project back to residual size
    outputs = self.proj(outputs)
    outputs = F.dropout(outputs, self.dropout_ratio)
    outputs = F.reshape(outputs, shape=(batch_size, len_q, d_model))
    return self.layer_norm(outputs + residual)
def calc_pair_score(self, span_reps_pad, relative_position_info, batchsize, max_n_spans):
    #(batchsize, max_n_spans, max_n_spans * span_representation)
    span_reps_pad_ntimes = chaFunc.tile(span_reps_pad, (1, 1, max_n_spans))

    #(batchsize, max_n_spans, max_n_spans, span_representation)
    span_reps_matrix = chaFunc.reshape(
        span_reps_pad_ntimes,
        (batchsize, max_n_spans, max_n_spans, span_reps_pad.shape[-1]))

    #(batchsize, max_n_spans, max_n_spans, span_representation)
    span_reps_matrix_t = chaFunc.transpose(span_reps_matrix, axes=(0, 2, 1, 3))

    #(batchsize, max_n_spans, max_n_spans, pair_representation)
    pair_reps = chaFunc.concat([
        span_reps_matrix,
        span_reps_matrix_t,
        span_reps_matrix * span_reps_matrix_t,
        relative_position_info
    ], axis=-1)

    #########################
    #### add root object ####
    #########################

    #(batchsize, max_n_spans, span_rep_size)
    root_matrix = chaFunc.tile(self.root_embedding, (batchsize, self.max_n_spans, 1))

    #(batchsize, max_n_spans, pair_rep_size)
    pair_reps_with_root = chaFunc.concat([
        span_reps_pad,
        root_matrix,
        span_reps_pad * root_matrix,
        self.xp.zeros(
            (batchsize, self.max_n_spans,
             self.relative_position_info_size)).astype(self.xp.float32)
    ], axis=-1)

    #(batchsize, max_n_spans, max_n_spans+1, pair_rep_size)
    pair_reps = chaFunc.concat([
        pair_reps,
        chaFunc.reshape(
            pair_reps_with_root,
            (batchsize, self.max_n_spans, 1, self.span_pair_size))
    ], axis=2)

    #(batchsize * max_n_spans * (max_n_spans+1), pair_rep_size)
    pair_reps = chaFunc.reshape(
        pair_reps,
        (batchsize * max_n_spans * (max_n_spans + 1), self.span_pair_size))

    # calculate relation score for each pair
    pair_scores = self.RelationLayer(pair_reps)

    #(batchsize, max_n_spans, max_n_spans+1)
    pair_scores = chaFunc.reshape(
        pair_scores, (batchsize, max_n_spans, max_n_spans + 1))

    return pair_scores
def __call__(self, atoms_1, g_1, atoms_2, g_2):
    """
    :param atoms_1: atomic representation of molecule 1, with shape of (mb, N_1, hidden_dim)
    :param g_1: molecular representation of molecule 1, with shape of (mb, out_dim)
    :param atoms_2: atomic representation of molecule 2, with shape of (mb, N_2, hidden_dim)
    :param g_2: molecular representation of molecule 2, with shape of (mb, out_dim)
    :return:
    """
    # initial_g_2: (mb, hidden_dim)
    initial_g_2 = F.mean(atoms_2, axis=1)
    # attn_1: (mb, N_1, 1), doc_1: (mb, N_1, out_dim)
    attn_1, doc_1 = self.compute_attention(query=initial_g_2, key=atoms_1, focus=1)
    # attn_1: (mb, N_1, out_dim)
    attn_1 = F.tile(attn_1, reps=(1, 1, self.out_dim))
    # compact_1: (mb, out_dim)
    compact_1 = F.sum(attn_1 * doc_1, axis=1)
    # initial_g_1: (mb, hidden_dim)
    initial_g_1 = F.mean(atoms_1, axis=1)
    # attn_2: (mb, N_2, 1), doc_2: (mb, N_2, out_dim)
    attn_2, doc_2 = self.compute_attention(query=initial_g_1, key=atoms_2, focus=2)
    # attn_2: (mb, N_2, out_dim)
    attn_2 = F.tile(attn_2, reps=(1, 1, self.out_dim))
    # compact_2: (mb, out_dim)
    compact_2 = F.sum(attn_2 * doc_2, axis=1)
    return compact_1, compact_2
def __call__(self, X, A):
    # A is the adjacency matrix
    N = X.shape[0]
    outputs = []
    for head in range(self.attn_heads):
        kernel_fc = self.kernels[head]  # W in the paper (F x F')
        attn_fc = self.attn_kernels[head]  # attention kernel a in the paper (2F' x 1)

        # Compute inputs to attention network
        linear_transfer_X = kernel_fc(X)  # (N x F')

        # Compute feature combinations
        repeated = F.tile(linear_transfer_X, (1, N))
        repeated = F.reshape(repeated, (N * N, self.F_))  # after tile: N x (F' x N), then N^2 x F'
        tiled = F.tile(linear_transfer_X, (N, 1))  # (N^2 x F')
        combinations = F.concat([repeated, tiled], axis=1)
        # (N^2 x 2F'): all N x N combinations, including self to self

        # Attention head
        dense = F.squeeze(attn_fc(combinations))  # a(Wh_i, Wh_j) in the paper (N^2 x 1), squeezed
        dense = dense.reshape(N, N)
        dense = F.leaky_relu(dense)

        # Mask values before activation (Vaswani et al., 2017)
        comparison = (A == 0)  # True where there is no edge
        mask = F.where(comparison, np.ones_like(A) * -10e9, np.zeros_like(A))
        # if A == 0: -10e9, else: 0
        masked = dense + mask  # push non-neighbor elements to -10e9, shape = N x N

        # Feed masked values to softmax
        softmax_val = F.softmax(masked, axis=1)  # paper eqn. 3 alpha; non-neighbor values go to almost 0
        dropout_val = F.dropout(softmax_val, ratio=self.attn_dropout)  # shape = N x N

        # Linear combination with neighbors' features
        node_features = F.matmul(dropout_val, linear_transfer_X)  # (N x N) x (N x F') = N x F'

        if self.attn_heads_reduction == 'concat' and self.activation is not None:
            # In case of 'concat', we compute the activation here (Eq 5)
            node_features = self.activation(node_features)  # shape = N x F'

        # Add output of attention head to final output
        outputs.append(node_features)

    # Reduce the attention head outputs according to the reduction method
    if self.attn_heads_reduction == 'concat':
        output = F.concat(outputs, axis=1)  # shape = N x KF', where K is the head count
    else:
        output = F.mean(F.stack(outputs), axis=0)  # shape = N x F'
        if self.activation is not None:
            output = self.activation(output)
    return output
def __call__(self, edge, h):
    num_atom = edge.shape[1]
    h1 = F.tile(F.expand_dims(h, 1), (1, num_atom, 1, 1))
    h2 = F.tile(F.expand_dims(h, 2), (1, 1, num_atom, 1))
    concat = F.concat([h1, h2, edge], axis=3)
    add = zero_plus(self.W2(zero_plus(self.W1(concat))))
    return edge + self.bn(add)
def negative_log_likelihood(self, x, y):
    pi, mu, log_var = self.get_gaussian_params(x)

    # Likelihood over the different Gaussians
    y = F.tile(y[:, None, :], (1, self.gaussian_mixtures, 1))
    pi = F.tile(F.expand_dims(pi, 2), (1, 1, self.input_dim))

    prob = F.sum(pi * self.normal_prob(y, mu, log_var), axis=1)
    negative_log_likelihood = -F.log(prob)
    return F.mean(negative_log_likelihood)
def st_graph_output(self, f_A, f_G):
    # f_A shape = (N, D), f_G shape = (N, 4)
    assert f_A.shape[0] == f_G.shape[0]
    if self.add_self:
        assert f_A.shape[1] == self.out_size
    N = f_G.shape[0]
    assert N % self.frame_node_num == 0
    T = N // self.frame_node_num
    geo_dim = f_G.shape[1]
    f_A_orig = f_A
    f_G = F.reshape(f_G, (T, self.frame_node_num, geo_dim))
    f_A = F.reshape(f_A, (T, self.frame_node_num, f_A.shape[-1]))
    assert f_A_orig.ndim == 2, f_A_orig.ndim
    f_R = []
    for nr in range(self.num_relations):
        # in the shape comments below, F denotes self.frame_node_num
        # (the tile reps must be the node count, not the functions module)
        f_G_1 = F.tile(f_G, (1, 1, self.frame_node_num))  # shape = (T, F, 4 * F)
        f_G_1 = F.reshape(
            f_G_1, (T, self.frame_node_num ** 2, geo_dim))  # after tile: (T, F, 4 x F), then (T, F^2, 4)
        f_G_2 = F.tile(f_G, (1, self.frame_node_num, 1))  # shape = (T, F*F, 4)
        encoded_offset = self.encode_box_offset(
            f_G_1.reshape(-1, geo_dim), f_G_2.reshape(-1, geo_dim))  # shape = (TxFxF, 4)
        # paper formula (5), shape = (T, F, F)
        w_G = F.relu(
            getattr(self, self.W_G_lst[nr])(
                self.position_encoding(encoded_offset, self.d_g)))  # (TxFxF, 1)
        w_G = F.reshape(w_G, shape=(T, self.frame_node_num, self.frame_node_num))  # shape = (T, F, F)
        # paper formula (4)
        w_K_result = getattr(self, self.W_K_lst[nr])(f_A_orig).reshape(
            T, self.frame_node_num, self.d_k)  # shape = (T, F, d_k)
        w_Q_transpose_result = F.transpose(
            getattr(self, self.W_Q_lst[nr])(f_A_orig).reshape(
                T, self.frame_node_num, self.d_k),
            axes=(0, 2, 1))  # shape = (T, d_k, F)
        w_A = F.matmul(w_K_result, w_Q_transpose_result)  # shape = (T, F, F)
        w_A = w_A + F.log(w_G)
        # paper formula (3), shape = (T, F, F)
        w = F.softmax(w_A, axis=2)
        # The original paper's formula (3) is a weighted softmax; chainer has no weighted
        # softmax, so the geometric weight enters via log(w_G) added to w_A above.
        # w = w_G * F.exp(w_A) / F.sum(w_G * F.exp(w_A), axis=1)
        # paper formula (2), weighted sum = matmul: (T, F, F) x (T, F, out_size//nr) = (T, F, out_size//nr)
        f_R_nr = F.matmul(
            w, getattr(self, self.W_V_lst[nr])(f_A_orig).reshape(
                T, self.frame_node_num, self.w_v_outsize))
        f_R.append(f_R_nr)
    if self.add_self:
        return f_A + F.concat(f_R, axis=2).reshape(N, self.out_size)
    return F.concat(f_R, axis=2).reshape(N, self.out_size)
def _get_neg_g(self, r_ids, s_re_r, s_im_r, o_re_r, o_im_r,
               cs_re_r, cs_im_r, co_re_r, co_im_r, csco):
    # W
    w_re = F.reshape(self.wr_re[r_ids], (self.s_batch * self.k, 1, self.d))
    w_im = F.reshape(self.wr_im[r_ids], (self.s_batch * self.k, 1, self.d))
    # V
    V = self.Vr[r_ids]
    V_t = F.tile(V, (self.n_nsamp, 1, 1))
    # b
    b = self.br[r_ids]
    b_t = F.tile(b, (self.n_nsamp, 1))
    # u
    u = self.ur[r_ids]
    u_t = F.tile(u, (self.n_nsamp, 1))

    # Stack vectors
    w_rrii = F.stack((w_re, w_re, w_im, w_im), axis=0)
    s_riri = F.stack((s_re_r, s_im_r, s_re_r, s_im_r), axis=0)
    o_riir = F.stack((o_re_r, o_im_r, o_im_r, o_re_r), axis=0)
    cs_riri = F.stack((cs_re_r, cs_im_r, cs_re_r, cs_im_r), axis=0)
    co_riir = F.stack((co_re_r, co_im_r, co_im_r, co_re_r), axis=0)

    # calculate each term
    # - sWo
    sW = s_riri * w_rrii
    Wo = w_rrii * o_riir
    sW_t = F.tile(sW, (1, self.n_co, 1, 1))
    Wo_t = F.tile(Wo, (1, self.n_cs, 1, 1))
    csWo__ = F.sum(cs_riri * Wo_t, axis=(2, 3))
    csWo_ = csWo__[0] + csWo__[1] + csWo__[2] - csWo__[3]
    csWo = F.reshape(csWo_, (self.s_batch * self.n_cs, self.k))
    sWco__ = F.sum(sW_t * co_riir, axis=(2, 3))
    sWco_ = sWco__[0] + sWco__[1] + sWco__[2] - sWco__[3]
    sWco = F.reshape(sWco_, (self.s_batch * self.n_co, self.k))
    sWo = F.concat((csWo, sWco), axis=0)
    # - Vso
    Vso_ = F.matmul(V_t, F.expand_dims(csco, axis=1), transb=True)
    Vso = F.reshape(Vso_, (self.s_batch * self.n_nsamp, self.k))

    # sum up terms
    if self.mp == 1:
        preact = sWo + Vso + b_t
    elif self.mp == 0:
        preact = sWo + b_t
    activated = F.tanh(preact)
    g_score_ = F.sum(u_t * activated, axis=1)
    g_score = F.reshape(g_score_, (self.s_batch * self.n_nsamp, 1))
    return g_score
def shift(self, x, gamma, beta):
    if gamma is None:
        return x
    batchsize = x.shape[0]
    if gamma.ndim == 1 and beta.ndim == 1:
        x = x * F.tile(gamma[None, :, None, None],
                       (batchsize, 1, self.bottom_width, self.bottom_width)) + \
            F.tile(beta[None, :, None, None],
                   (batchsize, 1, self.bottom_width, self.bottom_width))
    elif gamma.ndim == 2 and beta.ndim == 2:
        x = x * F.tile(gamma[:, :, None, None],
                       (1, self.bottom_width, self.bottom_width)) + \
            F.tile(beta[:, :, None, None],
                   (1, self.bottom_width, self.bottom_width))
    return x
def categorical_kl(params0, params1):
    params0 = params0[0]
    params1 = params1[0]
    assert params0.shape == params1.shape
    a0 = params0 - F.tile(F.max(params0, axis=1, keepdims=True), (1, 4))
    a1 = params1 - F.tile(F.max(params1, axis=1, keepdims=True), (1, 4))
    ea0 = F.exp(a0)
    ea1 = F.exp(a1)
    z0 = F.tile(F.sum(ea0, axis=1, keepdims=True), (1, 4))
    z1 = F.tile(F.sum(ea1, axis=1, keepdims=True), (1, 4))
    p0 = ea0 / z0
    return F.sum(p0 * (a0 - F.log(z0) - a1 + F.log(z1)), axis=1)
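# Hedged alternative sketch (not part of the original snippet): the hard-coded (1, 4)
# reps above tie categorical_kl to exactly four categories. Assuming the inputs are
# already (mb, n_categories) arrays, F.broadcast_to over the keepdims reductions
# computes the same KL for any category count.
def categorical_kl_any_width(params0, params1):
    import chainer.functions as F
    assert params0.shape == params1.shape
    shape = params0.shape
    a0 = params0 - F.broadcast_to(F.max(params0, axis=1, keepdims=True), shape)
    a1 = params1 - F.broadcast_to(F.max(params1, axis=1, keepdims=True), shape)
    ea0 = F.exp(a0)
    ea1 = F.exp(a1)
    z0 = F.broadcast_to(F.sum(ea0, axis=1, keepdims=True), shape)
    z1 = F.broadcast_to(F.sum(ea1, axis=1, keepdims=True), shape)
    p0 = ea0 / z0
    return F.sum(p0 * (a0 - F.log(z0) - a1 + F.log(z1)), axis=1)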
def get_g(self, r_ids, s_ids, o_ids):
    s_batch = len(r_ids)
    # Get embeddings
    s_re = self.embed_re(s_ids)
    s_re_r = F.reshape(F.tile(s_re, (1, self.k)), (s_batch * self.k, 1, self.d))
    o_re = self.embed_re(o_ids)
    o_re_r = F.reshape(F.tile(o_re, (1, self.k)), (s_batch * self.k, 1, self.d))
    s_im = self.embed_im(s_ids)
    s_im_r = F.reshape(F.tile(s_im, (1, self.k)), (s_batch * self.k, 1, self.d))
    o_im = self.embed_im(o_ids)
    o_im_r = F.reshape(F.tile(o_im, (1, self.k)), (s_batch * self.k, 1, self.d))
    so = F.concat((s_re, s_im, o_re, o_im), axis=1)
    # W
    w_re = F.reshape(self.wr_re[r_ids], (s_batch * self.k, 1, self.d))
    w_im = F.reshape(self.wr_im[r_ids], (s_batch * self.k, 1, self.d))
    # V
    V = self.Vr[r_ids]
    # b
    b = self.br[r_ids]
    # u
    u = self.ur[r_ids]

    # calculate each term
    # - sWo
    w_rrii = F.stack((w_re, w_re, w_im, w_im), axis=0)
    s_riri = F.stack((s_re_r, s_im_r, s_re_r, s_im_r), axis=0)
    o_riir = F.stack((o_re_r, o_im_r, o_im_r, o_re_r), axis=0)
    sWo__ = F.sum(w_rrii * s_riri * o_riir, axis=(2, 3))
    sWo_ = sWo__[0] + sWo__[1] + sWo__[2] - sWo__[3]
    sWo = F.reshape(sWo_, (s_batch, self.k))
    # - Vso
    Vso_ = F.matmul(V, F.expand_dims(so, axis=1), transb=True)
    Vso = F.reshape(Vso_, (s_batch, self.k))

    # sum up terms
    if self.mp == 1:
        preact = sWo + Vso + b
    elif self.mp == 0:
        preact = sWo + b
    activated = F.tanh(preact)
    g_score_ = F.sum(u * activated, axis=1)
    g_score = F.reshape(g_score_, (s_batch, 1))
    return g_score
def __call__(self, x, test=True):
    n, c, h, w = x.shape
    assert (c == self.nc)
    if n != self.prev_batch:
        self.bn = L.BatchNormalization(n * c, dtype=self.dtype)
        self.bn.to_gpu(self._device_id)
        self.bn.gamma = F.tile(self.gamma, n)
        self.bn.beta = F.tile(self.beta, n)
        self.prev_batch = n
    x = F.reshape(x, (1, n * c, h, w))
    return F.reshape(self.bn(x), (n, c, h, w))
def __call__(self, x, c):
    mu = F.average(x, axis=0).reshape(1, x.shape[1], x.shape[2], x.shape[3])
    sigma = F.average((x - F.tile(mu, (x.shape[0], 1, 1, 1))) ** 2, axis=0)
    x_hat = (x - F.tile(mu, (x.shape[0], 1, 1, 1))) / \
        F.sqrt(F.tile(sigma + self.eps, (x.shape[0], 1, 1, 1)))

    h = F.relu(self.c0(c))
    w = self.cw(h)
    b = self.cb(h)
    #ones = chainer.as_variable(xp.ones_like(w, dtype=xp.float32))
    h = w * x_hat + b

    return h
def get_all_prob_or_log_prob(self, is_log=False):
    segments = self.parametric_segments
    num_of_action_types = len(segments)
    action_types_logits = self.beta * self.logits[:, :num_of_action_types]
    if is_log:
        action_types_probs = F.log_softmax(action_types_logits)
    else:
        action_types_probs = F.softmax(action_types_logits)

    # action_types_probs = F.softmax(action_types_logits)
    # if is_log:
    #     print("LOG")
    #     print(action_types_probs)
    # action_types_probs = action_types_probs.data * np.power(self.segments_sizes, 1/4)
    # action_types_probs = action_types_probs / np.expand_dims(np.sum(action_types_probs, axis=1), axis=1)
    # action_types_probs = chainer.Variable(action_types_probs.astype(np.float32))
    # if is_log:
    #     action_types_probs = F.log(action_types_probs)
    #     print(action_types_probs)

    result = []
    logits_offset = num_of_action_types
    for i in range(num_of_action_types):
        action_type_prob = action_types_probs[:, i:i + 1]
        if not segments[i]:  # if no parameters for this action type
            result.append(action_type_prob)
        else:
            segments_factor = 1
            for sub_seg_size in segments[i]:
                segments_factor *= sub_seg_size
                if is_log:
                    sub_seg_probs = F.log_softmax(
                        self.beta * self.logits[:, logits_offset:logits_offset + sub_seg_size])
                else:
                    sub_seg_probs = F.softmax(
                        self.beta * self.logits[:, logits_offset:logits_offset + sub_seg_size])
                if is_log:
                    action_type_prob = F.repeat(
                        action_type_prob, sub_seg_size, axis=1) + F.tile(
                        sub_seg_probs, segments_factor // sub_seg_size)
                else:
                    action_type_prob = F.repeat(
                        action_type_prob, sub_seg_size, axis=1) * F.tile(
                        sub_seg_probs, segments_factor // sub_seg_size)
                logits_offset += sub_seg_size
            result.append(action_type_prob)
    res = F.concat(tuple(result))
    return res
def __call__(self, x):
    # x.shape == (batchsize, 3, 128, 64)
    batchsize = x.shape[0]
    h = F.elu(self.bn1(self.conv1_1(x)))
    h = F.elu(self.bn2(self.conv1_2(h)))
    h = F.max_pooling_2d(h, 3, 2, cover_all=False)

    h = self.conv2_1(h)
    h = self.conv2_3(h)
    h = self.conv3_1(h)
    h = self.conv3_3(h)
    h = self.conv4_1(h)
    h = self.conv4_3(h)

    h = h.reshape(batchsize, -1)
    h = F.dropout(h, ratio=0.6)
    h = F.elu(self.fc1_bn(self.fc1(h)))

    # Features in rows, normalize axis 1.
    weights = self.mean_vectors
    features = self.ball(h)
    features = F.normalize(features, eps=1e-8)
    scale = F.softplus(self.scale)
    normalized_weight = F.normalize(weights, axis=0, eps=1e-8)
    logits = F.tile(scale[None, ], (batchsize, 1)) * \
        F.matmul(features, normalized_weight)
    return logits
def compute_dists(self, obs, feats=None):
    if feats is None:
        feats = super().compute_features(obs)
    means = self.l_act(feats)
    # for this policy, the variance is independent of the state
    log_stds = F.tile(self.log_std.reshape((1, -1)), (len(feats), 1))
    return Gaussian(means=means, log_stds=log_stds)
def pose_estimate(self, imgs):
    batch_size = imgs.shape[0]
    imgs = Variable(self.resnet_v2.xp.array(imgs, 'f'))
    img_feat = self.resnet_v2(imgs).reshape(batch_size, -1)

    theta_prev = F.tile(
        Variable(self.encoder_fc3_model.xp.array(mean, 'f')),
        (batch_size, 1))
    num_cam = 3
    num_theta = 72
    for i in range(self.num_stage):
        state = F.concat([img_feat, theta_prev], axis=1)
        delta_theta = self.encoder_fc3_model(state)
        theta_here = theta_prev + delta_theta
        # cam = N x 3, pose = N x num_theta, shape = N x 10
        cams = theta_here[:, :num_cam]
        poses = theta_here[:, num_cam:(num_cam + num_theta)]
        shapes = theta_here[:, (num_cam + num_theta):]

        verts, Js, Rs, A = self.smpl(shapes, poses)

        # Project to 2D!
        # pred_kp = batch_orth_proj_idrot(
        #     Js, cams, name='proj_2d_stage%d' % i)
        theta_prev = theta_here
    return verts, Js, Rs, A, cams, poses, shapes
def look_at(vertices, eye, at=None, up=None):
    """
    "Look at" transformation of vertices.
    """
    assert (vertices.ndim == 3)

    xp = chainer.cuda.get_array_module(vertices)
    batch_size = vertices.shape[0]
    if at is None:
        at = xp.array([0, 0, 0], 'float32')
    if up is None:
        up = xp.array([0, 1, 0], 'float32')
    if isinstance(eye, list) or isinstance(eye, tuple):
        eye = xp.array(eye, 'float32')
    if eye.ndim == 1:
        eye = cf.tile(eye[None, :], (batch_size, 1))
    if at.ndim == 1:
        at = cf.tile(at[None, :], (batch_size, 1))
    if up.ndim == 1:
        up = cf.tile(up[None, :], (batch_size, 1))

    # create new axes
    z_axis = cf.normalize(at - eye)
    x_axis = cf.normalize(neural_renderer.cross(up, z_axis))
    y_axis = cf.normalize(neural_renderer.cross(z_axis, x_axis))

    # create rotation matrix: [bs, 3, 3]
    r = cf.concat(
        (x_axis[:, None, :], y_axis[:, None, :], z_axis[:, None, :]), axis=1)
    if r.shape[0] != vertices.shape[0]:
        r = cf.broadcast_to(r, (vertices.shape[0], 3, 3))

    # apply
    # [bs, nv, 3] -> [bs, nv, 3] -> [bs, nv, 3]
    if vertices.shape != eye.shape:
        eye = cf.broadcast_to(eye[:, None, :], vertices.shape)
    vertices = vertices - eye
    vertices = cf.matmul(vertices, r, transb=True)

    return vertices
def batch_rodrigues(theta):
    """
    Theta is N x 3
    """
    batch_size = theta.shape[0]
    xp = theta.xp

    angle = F.expand_dims(F.sqrt(F.batch_l2_norm_squared(theta + 1e-8)), -1)
    r = F.expand_dims(theta / F.tile(angle, 3), -1)

    angle = F.expand_dims(angle, -1)
    cos = F.cos(angle)
    sin = F.sin(angle)
    cos = F.tile(cos, (3, 3))
    sin = F.tile(sin, (3, 3))

    outer = F.matmul(r, r, transb=True)

    eyes = F.tile(F.expand_dims(
        Variable(xp.array(xp.eye(3), 'f')), 0), (batch_size, 1, 1))
    R = cos * eyes + (1 - cos) * outer + sin * batch_skew(r, batch_size)
    return R
def __call__(self, imgs, questions):
    feat = self.feat_extractor(imgs)

    # Append relative coordinates to each location in the feature maps.
    n, c, h, w = feat.shape
    spatial_area = h * w

    xp = self.xp
    coords_h = xp.linspace(-1, 1, h, dtype=feat.dtype)
    coords_w = xp.linspace(-1, 1, w, dtype=feat.dtype)
    coords_hh, coords_ww = xp.meshgrid(coords_h, coords_w)
    coords_hh = coords_hh[None]
    coords_ww = coords_ww[None]
    coords = xp.concatenate((coords_hh, coords_ww), axis=0)
    coords = coords.reshape(2, -1)
    coords = coords[None]  # (1, 2, spatial_area)
    coords = xp.repeat(coords, n, axis=0)
    # Coordinates may be cached here but the performance gain is not
    # significant so it is skipped in favor of readability.

    feat = feat.reshape(n, c, spatial_area)
    h = F.concat((feat, coords), axis=1)  # (n, c + 2, spatial_area)

    # Create coordinate pairs (differentiable meshgrid).
    h_hh = F.expand_dims(h, 2)
    h_ww = F.expand_dims(h, 3)
    h_hh = F.repeat(h_hh, spatial_area, axis=2)
    h_ww = F.repeat(h_ww, spatial_area, axis=3)
    h = F.concat((h_hh, h_ww), axis=1)

    # Append questions to each coordinate pair.
    questions = questions.astype(imgs.dtype)
    questions = questions[:, :, None, None]
    questions = F.tile(questions, (1, 1, spatial_area, spatial_area))
    h = F.concat((h, questions), axis=1)
    # (n, (c + 2) * 2 + questions_length, spatial_area, spatial_area)

    # g.
    h = F.transpose(h, (0, 2, 3, 1))
    h = F.reshape(h, (n * spatial_area * spatial_area, -1))
    h = self.g(h)

    h = F.reshape(h, (n, spatial_area * spatial_area, -1))
    h = F.sum(h, axis=1)

    h = self.f(h)

    # Logits.
    h = self.fc(h)
    return h
def test_x_not_ndarray_or_variable(self):
    with self.assertRaises(TypeError):
        functions.tile((self.x, self.x), 2)
def f(x):
    y = functions.tile(x, self.reps)
    return y * y
def test_value_error(self):
    x = numpy.random.uniform(-1, 1, (2,)).astype(numpy.float32)
    with self.assertRaises(ValueError):
        functions.tile(x, self.reps)
def test_reps_not_int(self):
    with self.assertRaises(TypeError):
        functions.tile(self.x, 'a')
def f(x):
    return functions.tile(x, self.reps)
def check_forward(self, x_data):
    y = functions.tile(x_data, self.reps)
    y_expected = numpy.tile(self.x, self.reps)
    self.assertEqual(y.dtype, y_expected.dtype)
    testing.assert_allclose(
        y.data, y_expected, **self.check_forward_options)
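# A self-contained forward check in the same spirit as check_forward above (an
# illustrative sketch, not part of the original test class): chainer.functions.tile
# follows numpy.tile semantics for both scalar and tuple reps.
def _tile_matches_numpy():
    import numpy
    from chainer import functions

    x = numpy.arange(6, dtype=numpy.float32).reshape(2, 3)
    for reps in (2, (1, 2), (2, 3)):
        y = functions.tile(x, reps)
        numpy.testing.assert_allclose(y.data, numpy.tile(x, reps))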
def __call__(self, beta, theta, get_skin=False, with_a=False):
    batch_size = beta.shape[0]

    # 1. Add shape blend shapes
    # (N x 10) x (10 x 6890*3) = N x 6890 x 3
    self.beta_shapedirs = F.matmul(beta, self.shapedirs)
    v_shaped = F.reshape(
        F.matmul(beta, self.shapedirs),
        [-1, self.size[0], self.size[1]]) + \
        F.repeat(self.v_template[None, ], batch_size, axis=0)
    self.v_shaped = v_shaped

    # 2. Infer shape-dependent joint locations.
    Jx = F.matmul(v_shaped[:, :, 0], self.J_regressor)
    Jy = F.matmul(v_shaped[:, :, 1], self.J_regressor)
    Jz = F.matmul(v_shaped[:, :, 2], self.J_regressor)
    J = F.stack([Jx, Jy, Jz], axis=2)
    self.J = J

    # 3. Add pose blend shapes
    # N x 24 x 3 x 3
    Rs = F.reshape(
        batch_rodrigues(F.reshape(theta, [-1, 3])), [-1, 24, 3, 3])
    self.Rs = Rs
    # Ignore global rotation.
    pose_feature = F.reshape(
        Rs[:, 1:, :, :] -
        F.repeat(F.repeat(Variable(self.xp.array(self.xp.eye(3), 'f'))[None, ],
                          23, axis=0)[None, ], batch_size, axis=0),
        [-1, 207])
    self.pose_feature = pose_feature

    # (N x 207) x (207 x 20670) -> N x 6890 x 3
    v_posed = F.reshape(
        F.matmul(pose_feature, self.posedirs),
        [-1, self.size[0], self.size[1]]) + v_shaped

    # 4. Get the global joint location
    self.J_transformed, A = batch_global_rigid_transformation(
        Rs, J, self.parents)

    # 5. Do skinning:
    # W is N x 6890 x 24
    W = F.reshape(
        F.tile(self.weights, (batch_size, 1)), [batch_size, -1, 24])

    # (N x 6890 x 24) x (N x 24 x 16)
    T = F.reshape(
        F.matmul(W, F.reshape(A, [batch_size, 24, 16])),
        [batch_size, -1, 4, 4])
    v_posed_homo = F.concat(
        [v_posed, self.xp.ones([batch_size, v_posed.shape[1], 1], 'f')], 2)
    v_homo = F.matmul(T, F.expand_dims(v_posed_homo, -1))

    verts = v_homo[:, :, :3, 0]

    # Get cocoplus or lsp joints:
    joint_x = F.matmul(verts[:, :, 0], self.joint_regressor)
    joint_y = F.matmul(verts[:, :, 1], self.joint_regressor)
    joint_z = F.matmul(verts[:, :, 2], self.joint_regressor)
    joints = F.stack([joint_x, joint_y, joint_z], axis=2)

    return verts, joints, Rs, A
def batch_global_rigid_transformation(Rs, Js, parent, rotate_base=False):
    """
    Computes absolute joint locations given pose.

    rotate_base: if True, rotates the global rotation by 90 deg in the x axis.
                 if False, this is the original SMPL coordinate.

    Args:
        Rs: N x 24 x 3 x 3, rotation matrices of K joints
        Js: N x 24 x 3, joint locations before posing
        parent: 24, holding the parent id for each index

    Returns:
        new_J: `Tensor`: N x 24 x 3, location of absolute joints
        A: `Tensor`: N x 24 x 4 x 4, relative joint transformations for LBS.
    """
    xp = Rs.xp
    N = Rs.shape[0]
    if rotate_base:
        print('Flipping the SMPL coordinate frame!!!!')
        rot_x = Variable(
            xp.array([[1, 0, 0], [0, -1, 0], [0, 0, -1]], dtype=Rs.dtype))
        rot_x = F.reshape(F.tile(rot_x, [N, 1]), [N, 3, 3])
        root_rotation = F.matmul(Rs[:, 0, :, :], rot_x)
    else:
        root_rotation = Rs[:, 0, :, :]

    # Now Js is N x 24 x 3 x 1
    Js = F.expand_dims(Js, -1)

    def make_A(R, t, name=None):
        # R is N x 3 x 3, t is N x 3 x 1
        R_homo = F.pad(R, [[0, 0], [0, 1], [0, 0]], 'constant')
        t_homo = F.concat([t, xp.ones([N, 1, 1], 'f')], 1)
        return F.concat([R_homo, t_homo], 2)

    A0 = make_A(root_rotation, Js[:, 0])
    results = [A0]
    for i in range(1, parent.shape[0]):
        j_here = Js[:, i] - Js[:, parent[i]]
        A_here = make_A(Rs[:, i], j_here)
        res_here = F.matmul(results[parent[i]], A_here)
        results.append(res_here)

    # N x 24 x 4 x 4
    results = F.stack(results, axis=1)

    new_J = results[:, :, :3, 3]

    # --- Compute relative A: skinning is based on
    # how much the bone moved (not the final location of the bone),
    # i.e. (final_bone - init_bone)
    # ---
    Js_w0 = F.concat([Js, xp.zeros([N, 24, 1, 1], 'f')], 2)
    init_bone = F.matmul(results, Js_w0)
    # Append empty 4 x 3:
    init_bone = F.pad(init_bone, [[0, 0], [0, 0], [0, 0], [3, 0]], 'constant')
    A = results - init_bone
    return new_J, results
def test_type_error(self):
    x = numpy.random.uniform(-1, 1, (2,)).astype(numpy.float32)
    with self.assertRaises(TypeError):
        functions.tile(x, 'a')
def forward(self, inputs, devices):
    x, = inputs
    y = functions.tile(x, self.reps)
    return y,
def __call__(self, x):
    return functions.tile(x, self.reps)