def split_heads(self, x, k=False):
    new_x_shape = x.shape[:-1] + (self.n_head, x.shape[-1] // self.n_head)
    x = x.reshape(*new_x_shape)  # in Tensorflow implem: fct split_states
    if k:
        return F.transpose(x, (0, 2, 3, 1))
    else:
        return F.transpose(x, (0, 2, 1, 3))
def reorg(input, stride=2):
    batch_size, input_channel, input_height, input_width = input.data.shape
    output_height, output_width, output_channel = int(input_height / stride), int(input_width / stride), input_channel * stride * stride
    output = F.transpose(F.reshape(input, (batch_size, input_channel, output_height, stride, output_width, stride)), (0, 1, 2, 4, 3, 5))
    output = F.transpose(F.reshape(output, (batch_size, input_channel, output_height, output_width, -1)), (0, 4, 1, 2, 3))
    output = F.reshape(output, (batch_size, output_channel, output_height, output_width))
    return output
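reorg above is a space-to-depth shuffle (YOLOv2-style passthrough); a minimal shape check with hypothetical toy dimensions, assuming reorg is in scope:

import numpy as np
import chainer

# (1, 4, 6, 6) input with stride 2: channels grow by stride**2,
# spatial dims shrink by stride
x = chainer.Variable(np.arange(1 * 4 * 6 * 6, dtype=np.float32).reshape(1, 4, 6, 6))
y = reorg(x, stride=2)
print(y.shape)  # (1, 16, 3, 3)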
def _predict_depth_chainer_backend(self, bgr, depth_bgr=None):
    bgr_data = np.array([bgr], dtype=np.float32)
    depth_bgr_data = np.array([depth_bgr], dtype=np.float32)
    if self.gpu != -1:
        bgr_data = cuda.to_gpu(bgr_data, device=self.gpu)
        depth_bgr_data = cuda.to_gpu(depth_bgr_data, device=self.gpu)
    if LooseVersion(chainer.__version__) < LooseVersion('2.0.0'):
        bgr = chainer.Variable(bgr_data, volatile=True)
        depth_bgr = chainer.Variable(depth_bgr_data, volatile=True)
        self.model(bgr, depth_bgr)
    else:
        with chainer.using_config('train', False):
            with chainer.no_backprop_mode():
                bgr = chainer.Variable(bgr_data)
                depth_bgr = chainer.Variable(depth_bgr_data)
                self.model(bgr, depth_bgr)

    proba_img = F.softmax(self.model.mask_score)
    label_pred = F.argmax(self.model.mask_score, axis=1)
    depth_pred = F.sigmoid(self.model.depth_score)
    proba_img = F.transpose(proba_img, (0, 2, 3, 1))
    max_proba_img = F.max(proba_img, axis=-1)

    # squeeze batch axis, gpu -> cpu
    proba_img = cuda.to_cpu(proba_img.data)[0]
    max_proba_img = cuda.to_cpu(max_proba_img.data)[0]
    label_pred = cuda.to_cpu(label_pred.data)[0]
    depth_pred = cuda.to_cpu(depth_pred.data)[0]

    # uncertain because the probability is low
    label_pred[max_proba_img < self.proba_threshold] = self.bg_label

    # get depth image
    depth_pred = depth_pred[0, :, :]
    depth_pred *= (self.model.max_depth - self.model.min_depth)
    depth_pred += self.model.min_depth

    return label_pred, proba_img, depth_pred
def block_embed(embed, x, dropout=0.):
    """Embedding function followed by convolution

    Args:
        embed (callable): A :func:`~chainer.functions.embed_id` function
            or :class:`~chainer.links.EmbedID` link.
        x (:class:`~chainer.Variable` or :class:`numpy.ndarray` or \
        :class:`cupy.ndarray`): Input variable, which
            is a :math:`(B, L)`-shaped int array. Its first dimension
            :math:`(B)` is assumed to be the *minibatch dimension*.
            The second dimension :math:`(L)` is the length of padded
            sentences.
        dropout (float): Dropout ratio.

    Returns:
        ~chainer.Variable: Output variable. A float array with shape
        of :math:`(B, N, L, 1)`. :math:`(N)` is the number of dimensions
        of word embedding.

    """
    e = embed(x)
    e = F.dropout(e, ratio=dropout)
    e = F.transpose(e, (0, 2, 1))
    e = e[:, :, :, None]
    return e
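A hedged usage sketch for block_embed with assumed toy sizes (not from the original source): an EmbedID over a (B, L) int batch yields a (B, N, L, 1) array ready for 2-D convolution.

import numpy as np
import chainer.links as L

embed = L.EmbedID(100, 50)  # vocab 100, embedding dim N=50 (hypothetical)
x = np.random.randint(0, 100, (8, 20)).astype(np.int32)  # B=8, padded length L=20
e = block_embed(embed, x)
print(e.shape)  # (8, 50, 20, 1)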
def _log_prob_words(self, context, temperature=1.0):
    """ This calculates a softmax over the vocabulary as a function
    of the dot product of context and word.
    """
    dot = F.matmul(context, F.transpose(self.vocab.W))
    prob = F.softmax(dot / temperature)
    return F.log(prob)
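Since _log_prob_words composes F.log with F.softmax, a side note: Chainer's F.log_softmax computes the same quantity more stably. A self-contained check on toy logits:

import numpy as np
import chainer.functions as F

dot = np.random.randn(3, 10).astype(np.float32)
a = F.log(F.softmax(dot / 1.0))   # the pattern used above
b = F.log_softmax(dot / 1.0)      # numerically safer equivalent
print(np.allclose(a.data, b.data, atol=1e-5))  # True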
def __call__(self, h_list):
    h_list = [functions.expand_dims(h, axis=-2) for h in h_list]
    concat_h = functions.concat(h_list, axis=-2)
    mb, atoms, n_layers, hidden_dim = concat_h.shape
    # concat_h: (n_layers, mb, atoms, hidden_dim)
    concat_h = functions.transpose(concat_h, axes=(2, 0, 1, 3))
    seq_h = functions.reshape(concat_h,
                              shape=(n_layers, mb * atoms, hidden_dim))
    seq_h_list = list(seq_h)
    _, seq_out_list = self.bigru_layer(None, seq_h_list)
    # [n_layers, mb * atoms, hidden_dim]
    seq_out_arr = functions.concat(
        [functions.expand_dims(seq, axis=0) for seq in seq_out_list], axis=0)
    # [mb * atoms, hidden_dim]
    seq_out_forward = seq_out_arr[-1, :, :hidden_dim]
    # [mb * atoms, hidden_dim]
    seq_out_backward = seq_out_arr[0, :, hidden_dim:]
    # [mb * atoms, 2 * hidden_dim]
    seq_out_arr = functions.concat([seq_out_forward, seq_out_backward],
                                   axis=-1)
    # [mb * atoms, 2 * hidden_dim] -> [mb, atoms, 2 * hidden_dim]
    seq_out_arr = functions.reshape(seq_out_arr,
                                    shape=(mb, atoms, 2 * hidden_dim))
    # [mb, atoms, 2 * hidden_dim]
    h = seq_out_arr
    h = self.out_layer(h)
    return h
def translate(self, hxs, max_length):
    """Generate target sentences given hidden states of source sentences.

    Args:
        hxs: Hidden states for source sequences.

    Returns:
        ys: Generated sequences.

    """
    batch_size, _, _ = hxs.shape
    compute_context = self.attention(hxs)
    c = Variable(self.xp.zeros((batch_size, self.n_units), 'f'))
    h = F.broadcast_to(self.bos_state, (batch_size, self.n_units))
    # first character's embedding
    previous_embedding = self.embed_y(
        Variable(self.xp.full((batch_size, ), EOS, 'i')))

    results = []
    for _ in range(max_length):
        context = compute_context(h)
        concatenated = F.concat((previous_embedding, context))
        c, h = self.lstm(c, h, concatenated)
        concatenated = F.concat((concatenated, h))

        logit = self.w(self.maxout(concatenated))
        y = F.reshape(F.argmax(logit, axis=1), (batch_size, ))
        results.append(y)
        previous_embedding = self.embed_y(y)

    results = F.separate(F.transpose(F.vstack(results)), axis=0)
    ys = [get_subsequence_before_eos(result.data) for result in results]
    return ys
def compute_features(self, obs):
    obs = F.cast(obs, np.float32)
    obs = F.transpose(obs, (0, 3, 1, 2))
    h1 = F.relu(self.conv1(obs))
    h2 = F.relu(self.conv2(h1))
    h3 = F.relu(self.fc(h2))
    return h3
def forward(self, x, q, is_linear=False):
    # Random noise for learning time invariance
    # (xp is needed below regardless of train mode, so fetch it first)
    xp = chainer.cuda.get_array_module(x)
    if chainer.configuration.config.train:
        z = xp.zeros((1, x.shape[1]), dtype=numpy.float32)
        i = 0
        while i < x.shape[0]:
            if numpy.random.rand(1)[0] < 0.1:
                x = xp.vstack((x[:i], z, x[i:]))
                i += 1
            i += 1

    max_knowledge, D = self.temporal_a.shape
    if len(x) > max_knowledge:
        x = x[len(x) - max_knowledge:]
    j = max_knowledge - len(x)

    if self.pe:
        a = xp.arange(1, 0, -1 / D)
        b = xp.arange(-1, 1, 2 / D)
        M = a * F.matmul(x[:, :self.V], self.embedid_a) + b * F.matmul(x[:, self.V:], self.embedid_a) + self.temporal_a[j:]
        C = a * F.matmul(x[:, :self.V], self.embedid_c) + b * F.matmul(x[:, self.V:], self.embedid_c) + self.temporal_c[j:]
    else:
        M = F.matmul(x[:, :self.V], self.embedid_a) + self.temporal_a[j:]
        C = F.matmul(x[:, :self.V], self.embedid_c) + self.temporal_c[j:]

    U = F.matmul(q.reshape(1, -1), self.embedid_b)
    for l in range(self.layer):
        P = F.transpose(F.matmul(M, U[0]))
        if not is_linear:
            P = F.softmax(P)
        O = F.matmul(P, C)
        if l == self.layer - 1:
            U = U + O
        else:
            U = self.H(U) + O
    return self.W(U)  # (1, D)
def __call__(self, x):
    """Compute localization, objectness, and classification from a batch of images.

    This method computes three variables, :obj:`locs`, :obj:`objs`,
    and :obj:`confs`.
    :meth:`self._decode` converts these variables to bounding box
    coordinates and confidence scores. These variables are also used
    in training YOLOv2.

    Args:
        x (chainer.Variable): A variable holding a batch of images.

    Returns:
        tuple of chainer.Variable:
        This method returns three variables, :obj:`locs`,
        :obj:`objs`, and :obj:`confs`.

        * **locs**: A variable of float arrays of shape \
            :math:`(B, K, 4)`, \
            where :math:`B` is the number of samples in the batch and \
            :math:`K` is the number of default bounding boxes.
        * **objs**: A variable of float arrays of shape \
            :math:`(B, K)`.
        * **confs**: A variable of float arrays of shape \
            :math:`(B, K, n\_fg\_class)`.
    """
    h = self.subnet(self.extractor(x))
    h = F.transpose(h, (0, 2, 3, 1))
    h = F.reshape(h, (h.shape[0], -1, 4 + 1 + self.n_fg_class))
    locs = h[:, :, :4]
    objs = h[:, :, 4]
    confs = h[:, :, 5:]
    return locs, objs, confs
def __call__(self, x, **kwargs):
    # shapes:
    #   self.gammas.W: (n_classes, n_ch)
    #   weights:       (n_batch, x, y, n_classes)
    weights, = argument.parse_kwargs(kwargs, ('weights', None))
    _weights = F.reshape(
        weights,
        (weights.shape[0] * weights.shape[1] * weights.shape[2],
         weights.shape[3]))
    gamma_c = F.reshape(
        F.matmul(_weights, self.gammas.W),
        (weights.shape[0], weights.shape[1], weights.shape[2],
         self.gammas.W.shape[1]))
    gamma_c = F.transpose(gamma_c, (0, 3, 1, 2))
    beta_c = F.reshape(
        F.matmul(_weights, self.betas.W),
        (weights.shape[0], weights.shape[1], weights.shape[2],
         self.gammas.W.shape[1]))
    beta_c = F.transpose(beta_c, (0, 3, 1, 2))
    return super(SpatialCategoricalConditionalBatchNormalization,
                 self).__call__(x, gamma_c, beta_c, **kwargs)
def __call__(self, x):
    x = F.transpose(x, axes=(0, 3, 1, 2))
    h = F.relu(self.bn1(self.conv1(x)))
    h = F.relu(self.bn2(self.conv2(h)))
    h = F.relu(self.bn3(self.l1(h)))
    h = F.relu(self.bn4(self.l2(h)))
    return chainerrl.action_value.DiscreteActionValue(self.l3(h))
def train():
    # model
    model = Mynet(train=True)

    if GPU >= 0:
        chainer.cuda.get_device(GPU).use()
        model.to_gpu()

    opt = chainer.optimizers.MomentumSGD(0.01, momentum=0.9)
    opt.setup(model)
    #opt.add_hook(chainer.optimizer.WeightDecay(0.0005))

    xs, ts, paths = data_load('../Dataset/train/images/', hf=True, vf=True)

    # training
    mb = 4
    mbi = 0
    train_ind = np.arange(len(xs))
    np.random.seed(0)
    np.random.shuffle(train_ind)

    for i in range(500):
        if mbi + mb > len(xs):
            mb_ind = train_ind[mbi:]
            np.random.shuffle(train_ind)
            mb_ind = np.hstack((mb_ind, train_ind[:(mb - (len(xs) - mbi))]))
            mbi = mb - (len(xs) - mbi)
        else:
            mb_ind = train_ind[mbi:mbi + mb]
            mbi += mb

        x = xs[mb_ind]
        t = ts[mb_ind]

        if GPU >= 0:
            x = chainer.cuda.to_gpu(x)
            t = chainer.cuda.to_gpu(t)
        #else:
        #    x = chainer.Variable(x)
        #    t = chainer.Variable(t)

        y = model(x)

        accu = F.accuracy(y, t[..., 0])
        y = F.transpose(y, axes=(0, 2, 3, 1))
        loss = F.sigmoid_cross_entropy(y, t)

        model.cleargrads()
        loss.backward()
        opt.update()

        loss = loss.data
        accu = accu.data
        if GPU >= 0:
            loss = chainer.cuda.to_cpu(loss)
            accu = chainer.cuda.to_cpu(accu)

        print("iter >>", i + 1, ', loss >>', loss.item(), ', accuracy >>', accu)

    chainer.serializers.save_npz('cnn.npz', model)
def __call__(self, x):
    """Calculate minibatch discrimination using broadcasting.

    Parameters
    ---------------
    x: Variable
        input vector, shape is (N, num_units)
    """
    batch_size = x.shape[0]
    xp = x.xp
    x = F.reshape(x, (batch_size, -1))
    activation = F.reshape(self.t(x), (-1, self.b, self.c))

    m = F.reshape(activation, (-1, self.b, self.c))
    m = F.expand_dims(m, 3)
    m_T = F.transpose(m, (3, 1, 2, 0))
    m, m_T = F.broadcast(m, m_T)
    l1_norm = F.sum(F.absolute(m - m_T), axis=2)

    # mask to zero out each sample's L1 norm against itself
    eraser = F.expand_dims(xp.eye(batch_size, dtype="f"), 1)
    eraser = F.broadcast_to(eraser, (batch_size, self.b, batch_size))

    o_X = F.sum(F.exp(-(l1_norm + 1e6 * eraser)), axis=2)

    # concatenate along channels or units
    return F.concat((x, o_X), axis=1)
def __call__(self, x, mask):
    #h = self.c(x) - self.b
    self.m.W.data = self.xp.array(self.maskW)  # mask windows are set to 1
    h = self.c(x * mask)  # (B, C, H, W)
    B, C, H, W = h.shape
    b = F.transpose(F.broadcast_to(self.c.b, (B, H, W, C)), (0, 3, 1, 2))
    h = h - b
    mask_sums = self.m(mask)
    mask_new = (self.xp.sign(mask_sums.data - 0.5) + 1.0) * 0.5
    mask_new_b = mask_new.astype("bool")

    mask_sums = F.where(
        mask_new_b, mask_sums,
        0.01 * Variable(self.xp.ones(mask_sums.shape).astype("f")))
    h = h / mask_sums + b

    mask_new = Variable(mask_new)
    h = F.where(mask_new_b, h,
                Variable(self.xp.zeros(h.shape).astype("f")))

    #elif self.sample == "up":
    #    h = F.unpooling_2d(x, 2, 2, 0, cover_all=False)
    #    h = self.c(h)
    #else:
    #    print("unknown sample method %s" % self.sample)
    if self.bn:
        h = self.batchnorm(h)
    if self.noise:
        h = add_noise(h)
    if self.dropout:
        h = F.dropout(h)
    if self.activation is not None:
        h = self.activation(h)
    return h, mask_new
def _segment(self, bgr):
    bgr_data = np.array([bgr], dtype=np.float32)
    if self.gpu != -1:
        bgr_data = cuda.to_gpu(bgr_data, device=self.gpu)
    if LooseVersion(chainer.__version__) < LooseVersion('2.0.0'):
        bgr = chainer.Variable(bgr_data, volatile=True)
        self.model(bgr, None)
    else:
        with chainer.using_config('train', False):
            with chainer.no_backprop_mode():
                bgr = chainer.Variable(bgr_data)
                self.model(bgr, None)

    # Get proba_img, pred_label
    proba_img = F.softmax(self.model.score)
    proba_img = F.transpose(proba_img, (0, 2, 3, 1))
    max_proba_img = F.max(proba_img, axis=-1)
    pred_label = F.argmax(self.model.score, axis=1)

    # squeeze batch axis, gpu -> cpu
    max_proba_img = cuda.to_cpu(max_proba_img.data)[0]
    pred_label = cuda.to_cpu(pred_label.data)[0]

    # uncertain because the probability is low
    pred_label[max_proba_img < self.proba_threshold] = self.bg_label
    return pred_label
def __call__(self, h, adj):
    # --- Message part ---
    mb, atom, ch = h.shape
    out_ch = ch
    m = functions.reshape(self.graph_linear(h),
                          (mb, atom, out_ch, self.num_edge_type))
    # m: (minibatch, atom, ch, edge_type)
    # Transpose
    m = functions.transpose(m, (0, 3, 1, 2))
    # m: (minibatch, edge_type, atom, ch)

    adj = functions.reshape(adj, (mb * self.num_edge_type, atom, atom))
    # (minibatch * edge_type, atom, out_ch)
    m = functions.reshape(m, (mb * self.num_edge_type, atom, out_ch))

    m = chainer_chemistry.functions.matmul(adj, m)

    # (minibatch * edge_type, atom, out_ch)
    m = functions.reshape(m, (mb, self.num_edge_type, atom, out_ch))
    m = functions.sum(m, axis=1)
    # (minibatch, atom, out_ch)

    # --- Update part ---
    # Contraction
    h = functions.reshape(h, (mb * atom, ch))
    # Contraction
    m = functions.reshape(m, (mb * atom, ch))

    out_h = self.update_layer(functions.concat((h, m), axis=1))
    # Expansion
    out_h = functions.reshape(out_h, (mb, atom, ch))
    return out_h
def __call__(self, xs):
    # self.reset_state()
    h0_f, ys_f = [], []
    if len(xs.shape) == 2:
        h0 = self.embed(xs)
    elif len(xs.shape) == 3:
        xs = F.transpose(xs, axes=(1, 0, 2))
        h0 = F.concat((self.embed(xs[0]), self.f_embed(xs[1])), axis=1)
    # print(h0.shape)  # (batchsize, seqsize*2, embedsize)
    for x in h0:
        h0_f.append(x)
    # hy, cy, ys = self.bilstm(self.hx, self.cx, h0_f)
    hy, ys = self.bigru(self.hx, h0_f)
    # Variable.to_gpu() works in place and returns None,
    # so transfer first, then assign.
    hy.to_gpu()
    self.hx = hy
    for ys_s in ys:
        ys_f.append(ys_s.data)
    ys_f = self.xp.array(ys_f, dtype=self.xp.float32)
    h1 = self.l2(ys_f)
    predict = self.l3(h1)
    return predict
def forward(self, xs):  # xs shape = (batch, T, F, D)
    '''
    :param xs: appearance features of all boxes across all frames
    :return: node outputs (and connection outputs in all_edge mode)
    '''
    batch = xs.shape[0]
    T = xs.shape[1]
    # first frame node_id ==> other frame node_id in same corresponding box
    node_out_dict = self.node_recurrent_forward(xs)
    # shape = F, B, T, mid_size
    node_out = F.stack([
        node_out_ for _, node_out_ in sorted(node_out_dict.items(),
                                             key=lambda e: int(e[0]))
    ])
    node_out = F.transpose(node_out, (1, 2, 0, 3))  # shape = (B, T, F, D)
    assert self.frame_node_num == node_out.shape[2], node_out.shape[2]
    assert self.mid_size == node_out.shape[-1]
    assert T == node_out.shape[1]
    node_out = F.reshape(node_out, shape=(-1, self.mid_size))
    node_out = self.node_classify_fc(F.relu(node_out))  # shape = (N, out_size)
    node_out = F.reshape(node_out,
                         shape=(batch, T, self.frame_node_num, self.out_size))
    if self.spatial_edge_mode == SpatialEdgeMode.all_edge:
        conn_out_dict = self.conn_recurrent_forward(node_out_dict)
        return node_out, conn_out_dict
    return node_out
def predict(self, xs):  # all shape is (B, T, F, D)
    if not isinstance(xs, chainer.Variable):
        xs = chainer.Variable(xs)
    with chainer.no_backprop_mode():
        if self.spatial_edge_mode == SpatialEdgeMode.all_edge:
            node_out, conn_out_dict = self.forward(xs)  # node_out is B, T, F, class_num
            node_out = chainer.cuda.to_cpu(node_out.data)  # B, T, F, class_num
            node_out = np.bitwise_or.reduce(node_out, axis=2)  # B, T, class_num

            temp_conn_output = []
            for conn_out in conn_out_dict.values():  # each is B, T, D
                temp_conn_output.append(conn_out)
            # F, B, T, D
            temp_conn_output = F.transpose(F.stack(temp_conn_output),
                                           (1, 2, 0, 3))  # B, T, conn_F, D
            temp_conn_output = chainer.cuda.to_cpu(temp_conn_output.data)  # B, T, conn_F, D
            temp_conn_output = np.bitwise_or.reduce(temp_conn_output, axis=2)  # B, T, D

            pred = node_out | temp_conn_output  # B, class_num
            pred = (pred > 0).astype(np.int32)
        else:
            node_out = self.forward(xs)  # node_out is B, T, F, class_num
            node_out = chainer.cuda.to_cpu(node_out.data)  # B, T, F, class_num
            node_out = np.bitwise_or.reduce(node_out, axis=2)  # B, T, class_num
            pred = (node_out > 0).astype(np.int32)
    # returns batch x out_size: prediction for the last time-step frame
    # of the second axis of the input xs
    return pred
def sentence_block_embed(embed, x):
    batch, length = x.shape
    e = embed(x.reshape((batch * length, )))
    # (batch * length, units)
    e = F.transpose(F.stack(F.split_axis(e, batch, axis=0), axis=0), (0, 2, 1))
    # (batch, units, length)
    return e
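A hedged shape check for sentence_block_embed under assumed toy sizes:

import numpy as np
import chainer.links as L

embed = L.EmbedID(1000, 16)  # hypothetical vocab and unit sizes
x = np.random.randint(0, 1000, (3, 7)).astype(np.int32)
e = sentence_block_embed(embed, x)
print(e.shape)  # (3, 16, 7): (batch, units, length)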
def forward(self, ws, ss, ps):
    batchsize, length = ws.shape
    xp = chainer.cuda.get_array_module(ws[0])
    ws = self.emb_word(ws)  # (batch, length, word_dim)
    ss = F.reshape(self.emb_suf(ss), (batchsize, length, -1))
    ps = F.reshape(self.emb_prf(ps), (batchsize, length, -1))
    hs = F.transpose(F.concat([ws, ss, ps], 2), (1, 0, 2))
    hs = F.dropout(hs, self.dropout_ratio, train=self.train)
    hs = F.split_axis(hs, length, 0)
    hs_f = []
    hs_b = []
    self._init_state()
    for h_in_f, h_in_b in zip(hs, reversed(hs)):
        h_f = self.lstm_f2(self.lstm_f1(F.reshape(h_in_f, (-1, self.in_dim))))
        hs_f.append(h_f)
        h_b = self.lstm_b2(self.lstm_b1(F.reshape(h_in_b, (-1, self.in_dim))))
        hs_b.append(h_b)
    hs = zip(hs_f, reversed(hs_b))

    cat_ys = [self.linear_cat2(F.dropout(
        F.elu(self.linear_cat1(h)), 0.5, train=self.train)) for h in hs]

    dep_ys = [self.biaffine(
        F.elu(F.dropout(self.linear_dep(h), 0.32, train=self.train)),
        F.elu(F.dropout(self.linear_head(h), 0.32, train=self.train)))
        for h in hs]
    return cat_ys, dep_ys
def __call__(self, query, key, value, mask=None):
    """Perform attention on the value array, using the query and key
    parameters for calculating the attention mask.

    :param query: matrix of shape (batch_size, num_timesteps, transformer_size)
        that is used for attention mask calculation
    :param key: matrix of shape (batch_size, num_timesteps, transformer_size)
        that is used for attention mask calculation
    :param value: matrix of shape (batch_size, num_timesteps, transformer_size)
        that is used for attention calculation
    :param mask: mask that can be used to mask out parts of the feature maps
        and avoid attending to those parts
    :return: the attended feature map `value`.
    """
    if mask is not None:
        mask = mask[:, self.xp.newaxis, ...]

    batch_size = len(query)
    query, key, value = [
        self.project(linear, x, batch_size)
        for linear, x in zip(self.linears, (query, key, value))
    ]

    x, self.attention = self.attention_implementation(
        query, key, value, mask=mask, dropout_ratio=self.dropout_ratio)

    x = F.transpose(x, (0, 2, 1, 3))
    x = F.reshape(x, (batch_size, -1, self.num_heads * self.key_dimensionality))

    return self.linears[-1](x, n_batch_axes=2)
def callAndAtt(self, xs):  # xs is a list of lists of word ids
    xs_f = self.makeEmbedBatch(xs)
    xs_b = self.makeEmbedBatch(xs, True)

    self.enc_f.reset_state()
    self.enc_b.reset_state()
    ys_f = self.enc_f(xs_f)
    ys_b = self.enc_b(xs_b)
    ys_bi = [
        F.concat((y_f, y_b[::-1]), axis=1)
        for y_f, y_b in zip(ys_f, ys_b)
    ]
    y_att = [
        self.att_w2(np.ones((y_bi.data.shape[0], 1), dtype=xp.float32)) *
        F.tanh(self.att_w1(y_bi)) for y_bi in ys_bi
    ]
    y_att = [
        F.softmax(F.reshape(F.sum(y_ce, axis=1), (1, y_ce.data.shape[0])))
        for y_ce in y_att
    ]
    y_conc = [
        F.transpose(
            F.concat([y_ce for ri in range(2 * self.out_size)], axis=0))
        for y_ce in y_att
    ]
    h = F.concat([
        F.reshape(F.sum(y_ce * y_bi, axis=0), (1, 2 * self.out_size))
        for y_ce, y_bi in zip(y_conc, ys_bi)
    ], axis=0)
    y = self.clssi(h)
    return y, y_att
def translate(self, hxs, max_length=100):
    batch_size, _, _ = hxs.shape
    compute_context = self.attention(hxs)
    c = Variable(self.xp.zeros((batch_size, self.n_units), 'f'))
    h = Variable(self.xp.zeros((batch_size, self.n_units), 'f'))
    ys = self.xp.full(batch_size, tokens['<SOS>'], np.int32)
    results = []
    for _ in range(max_length):
        eys = self.embed_y(ys)
        context = compute_context(h)
        concatenated = F.concat([eys, context])
        c, h = self.lstm(c, h, concatenated)
        concatenated = F.concat([concatenated, h])

        logit = self.w(self.maxout(concatenated))
        y = F.reshape(F.argmax(logit, axis=1), (batch_size, ))
        results.append(y)

    results = F.separate(F.transpose(F.vstack(results)), axis=0)

    outs = []
    for y in results:
        inds = np.argwhere(y == tokens['<EOS>'])
        if len(inds) > 0:
            y = y[:inds[0, 0]]
        outs.append(y)
    return outs
def generate(self, articles, rule_flag_list, limit_s=7, limit_w=50):
    # Encode every article (batched)
    hs, cs, enc_ys = self.encode(articles)

    # Swap the first and second axes so the batch axis comes first
    hs = F.transpose(hs, (1, 0, 2))
    cs = F.transpose(cs, (1, 0, 2))

    # Decode one sample at a time (not batched)
    ys = []
    for h, c, e, r in zip(hs, cs, enc_ys, rule_flag_list):
        h = F.transpose(F.reshape(h, (1, *h.shape)), (1, 0, 2))
        c = F.transpose(F.reshape(c, (1, *c.shape)), (1, 0, 2))
        ys.append(self._generate(h, c, e, r, limit_s, limit_w))
    return ys
def check_forward(self, x_data):
    axes = self.axes
    x = chainer.Variable(x_data)
    y = functions.transpose(x, axes)
    self.assertEqual(y.data.dtype, self.dtype)
    self.assertTrue(
        (self.x.transpose(axes) == backend.CpuDevice().send(y.data)).all())
def __call__(self, h, adj, **kwargs):
    """
    Args:
        h: (batchsize, num_nodes, in_channels)
        adj: (batchsize, num_edge_type, num_nodes, num_nodes)

    Returns: (batchsize, num_nodes, ch)
    """
    mb, node, ch = h.shape

    # --- self connection, apply linear function ---
    hs = self.graph_linear_self(h)
    # --- relational feature, from neighbor connection ---
    # Expected number of neighbors of a vertex.
    # Since we divide by it, if it is 0 we arbitrarily set it to 1.
    m = self.graph_linear_edge(h)
    m = functions.reshape(m, (mb, node, self.out_channels, self.n_edge_types))
    m = functions.transpose(m, (0, 3, 1, 2))
    # m: (batchsize, edge_type, node, ch)
    # hr: (batchsize, edge_type, node, ch)
    hr = functions.matmul(adj, m)
    # hr: (batchsize, node, ch)
    hr = functions.sum(hr, axis=1)
    return hs + hr
def __call__(self, xs, ys, train):
    decoder_logits = self.predictor.predict(xs, ys, train)
    labels = F.flatten(F.transpose(ys))
    loss = F.softmax_cross_entropy(decoder_logits, labels)
    accuracy = F.accuracy(decoder_logits, labels)
    reporter.report({"loss": loss, "accuracy": accuracy}, self)
    return loss
def pixel_shuffler(out_ch, x, r=2):
    b, c, w, h = x.shape
    # note: the channel count divides by r * r; the original `r * 2`
    # is only correct for r == 2
    x = F.reshape(x, (b, r, r, int(out_ch / (r * r)), w, h))
    x = F.transpose(x, (0, 3, 4, 1, 5, 2))
    out_map = F.reshape(x, (b, int(out_ch / (r * r)), w * r, h * r))
    return out_map
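A quick shape check for pixel_shuffler (toy values, assuming the function above is in scope): sub-pixel convolution trades channels for spatial resolution.

import numpy as np
import chainer

x = chainer.Variable(np.random.rand(1, 16, 4, 4).astype(np.float32))
y = pixel_shuffler(16, x, r=2)
print(y.shape)  # (1, 4, 8, 8): 16 channels become 4, spatial dims double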
def __call__(self, hx, cx, xs, enc_hs):
    xs_embed = [self.embed(x) for x in xs]
    hy, cy, ys = self.Nlstm(hx, cx, xs_embed)

    ys_pad = F.pad_sequence(ys, length=None, padding=0.0)
    enc_hs = F.pad_sequence(enc_hs, length=None, padding=0.0)

    mask = self.xp.all(enc_hs.data == 0, axis=2, keepdims=True)
    mask_num = self.xp.full(mask.shape, -1024.0, dtype=self.xp.float32)

    alignment = []
    decode = []
    ys_pad = F.transpose(ys_pad, (1, 0, 2))
    for y in ys_pad:
        y = F.reshape(y, (*y.shape, 1))
        score = F.matmul(enc_hs, y)
        score = F.where(mask, mask_num, score)
        align = F.softmax(score, axis=1)
        context_vector = F.matmul(enc_hs, align, True, False)
        t = self.W_c(
            F.dropout(F.concat((y, context_vector), axis=1), self.dropout))
        ys_proj = self.proj(F.dropout(t, self.dropout))
        alignment.append(F.reshape(align, (len(xs), -1)))
        decode.append(ys_proj)

    decode = F.stack(decode, axis=1)
    alignment = F.stack(alignment, axis=1)
    return hy, cy, decode, alignment.data
def loss(self, padded_input_batch_data, target_signal_batch_data):
    batchsize = padded_input_batch_data.shape[0]
    width = target_signal_batch_data.shape[1]

    raw_output = self.forward_one_step(padded_input_batch_data, softmax=False)

    # remove padding
    cut = padded_input_batch_data.shape[3] - width
    if cut > 0:
        raw_output = CausalSlice1d(cut)(raw_output)

    # (batchsize * time_step,) <- (batchsize, time_step)
    target_signal_batch_data = target_signal_batch_data.reshape((-1, ))

    # (batchsize * time_step, channels) <- (batchsize, channels, 1, time_step)
    raw_output = F.transpose(raw_output, (0, 3, 2, 1))
    raw_output = F.reshape(raw_output, (batchsize * width, -1))

    target_id_batch = Variable(target_signal_batch_data)
    if self.gpu_enabled:
        target_id_batch.to_gpu()

    loss = F.sum(F.softmax_cross_entropy(raw_output, target_id_batch))
    return loss
def angular_mc_loss(f, f_p, alpha=45, in_degree=True):
    '''
    Args:
        f (chainer.Variable or xp.ndarray): Anchor vectors.
            Each vector in f must be L2-normalized.
        f_p (chainer.Variable or xp.ndarray): Positive vectors.
            Each vector in f_p must be L2-normalized.
    '''
    xp = cuda.get_array_module(f)

    if in_degree:
        alpha = np.deg2rad(alpha)
    sq_tan_alpha = np.tan(alpha) ** 2
    n_pairs = len(f)

    # first and second term of f_{a,p,n}
    term1 = 4 * sq_tan_alpha * matmul(f + f_p, transpose(f_p))
    term2 = 2 * (1 + sq_tan_alpha) * F.sum(f * f_p, axis=1, keepdims=True)
    # term2 = 2 * (1 + sq_tan_alpha) * F.batch_matmul(f, f_p, transa=True).reshape(n_pairs, 1)

    f_apn = term1 - F.broadcast_to(term2, (n_pairs, n_pairs))
    # multiply zero to diagonal components of f_apn
    mask = xp.ones_like(f_apn.data) - xp.eye(n_pairs, dtype=f.dtype)
    f_apn = f_apn * mask

    return F.average(F.logsumexp(f_apn, axis=1))
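A hedged usage sketch for angular_mc_loss, assuming matmul and transpose are module-level aliases for chainer.functions.matmul/transpose and cuda is chainer.cuda:

import numpy as np

rng = np.random.RandomState(0)
f = rng.rand(8, 32).astype(np.float32)
f_p = rng.rand(8, 32).astype(np.float32)
f /= np.linalg.norm(f, axis=1, keepdims=True)      # L2-normalize, as required
f_p /= np.linalg.norm(f_p, axis=1, keepdims=True)
loss = angular_mc_loss(f, f_p, alpha=45)
print(float(loss.data))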
def __call__(self, h, adj):
    """
    Args:
        h: (batchsize, num_nodes, in_channels)
        adj: (batchsize, num_edge_type, num_nodes, num_nodes)

    Returns: (batchsize, num_nodes, ch)
    """
    mb, node, ch = h.shape

    # --- self connection, apply linear function ---
    hs = self.graph_linear_self(h)
    # --- relational feature, from neighbor connection ---
    # Expected number of neighbors of a vertex.
    # Since we divide by it, if it is 0 we arbitrarily set it to 1.
    m = self.graph_linear_edge(h)
    m = functions.reshape(
        m, (mb, node, self.out_channels, self.num_edge_type))
    m = functions.transpose(m, (0, 3, 1, 2))
    # m: (batchsize, edge_type, node, ch)
    # hr: (batchsize, edge_type, node, ch)
    hr = functions.matmul(adj, m)
    # hr: (batchsize, node, ch)
    hr = functions.sum(hr, axis=1)
    return hs + hr
def prelu(self, inp, parameter):
    x = F.reshape(inp, (inp.shape[0], 1, inp.shape[1]))
    zeros = self.xp.zeros_like(x.data)
    c = F.transpose(F.concat((x, zeros), axis=1), (0, 2, 1))
    return F.max(c, axis=2) + \
        F.broadcast_to(parameter, inp.shape) * F.min(c, axis=2)
def __call__(self, hs, ys):
    '''CTC forward

    :param hs:
    :param ys:
    :return:
    '''
    self.loss = None
    ilens = [x.shape[0] for x in hs]
    olens = [x.shape[0] for x in ys]

    # zero padding for hs
    y_hat = linear_tensor(
        self.ctc_lo,
        F.dropout(F.pad_sequence(hs), ratio=self.dropout_rate))
    y_hat = F.transpose(y_hat, (1, 0, 2))  # batch x frames x hdim

    # get length info
    logging.info(self.__class__.__name__ + ' input lengths: ' + str(ilens))
    logging.info(self.__class__.__name__ + ' output lengths: ' + str(olens))

    # get ctc loss
    self.loss = warp_ctc(y_hat, ilens, [cuda.to_cpu(l.data) for l in ys])[0]
    logging.info('ctc loss: ' + str(self.loss.data))

    return self.loss
def _calc_distmat(self, h):
    bs = h.shape[0]
    h_l2_2 = F.sum(h ** 2, axis=1)
    H = F.broadcast_to(h_l2_2, (bs, bs))
    H_t = F.transpose(H)
    XX = F.linear(h, h)
    return (H_t - 2 * XX + H)
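_calc_distmat relies on the identity ||h_i - h_j||^2 = ||h_i||^2 - 2 h_i.h_j + ||h_j||^2; a standalone check of the same pattern against a NumPy reference (toy data):

import numpy as np
import chainer.functions as F

h = np.random.rand(5, 3).astype(np.float32)
sq = F.sum(h ** 2, axis=1)
H = F.broadcast_to(sq, (5, 5))              # H[i, j] = ||h_j||^2
D = F.transpose(H) - 2 * F.linear(h, h) + H  # F.linear(h, h) = h @ h.T
ref = ((h[:, None, :] - h[None, :, :]) ** 2).sum(axis=2)
print(np.allclose(D.data, ref, atol=1e-5))  # True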
def __call__(self, xs):
    """Compute loc and conf from feature maps

    This method computes :obj:`mb_locs` and :obj:`mb_confs`
    from given feature maps.

    Args:
        xs (iterable of chainer.Variable): An iterable of feature maps.
            The number of feature maps must be same as the number of
            :obj:`aspect_ratios`.

    Returns:
        tuple of chainer.Variable:
        This method returns two :obj:`chainer.Variable`: :obj:`mb_locs` and
        :obj:`mb_confs`.

        * **mb_locs**: A variable of float arrays of shape \
            :math:`(B, K, 4)`, \
            where :math:`B` is the number of samples in the batch and \
            :math:`K` is the number of default bounding boxes.
        * **mb_confs**: A variable of float arrays of shape \
            :math:`(B, K, n\_fg\_class + 1)`.
    """
    mb_locs = []
    mb_confs = []
    for i, x in enumerate(xs):
        mb_loc = self.loc[i](x)
        mb_loc = F.transpose(mb_loc, (0, 2, 3, 1))
        mb_loc = F.reshape(mb_loc, (mb_loc.shape[0], -1, 4))
        mb_locs.append(mb_loc)

        mb_conf = self.conf[i](x)
        mb_conf = F.transpose(mb_conf, (0, 2, 3, 1))
        mb_conf = F.reshape(mb_conf, (mb_conf.shape[0], -1, self.n_class))
        mb_confs.append(mb_conf)

    mb_locs = F.concat(mb_locs, axis=1)
    mb_confs = F.concat(mb_confs, axis=1)

    return mb_locs, mb_confs
def __call__(self, h, adj):
    # type: (chainer.Variable, chainer.Variable) -> chainer.Variable
    mb, node, ch = h.shape
    if ch != self.out_channels:
        raise ValueError('out_channels must be equal to dimension '
                         'of feature vector associated to each atom, '
                         '{}, but it was set to {}'.format(
                             ch, self.out_channels))

    # adj: (mb, edge_type, node, node)
    edge_type = adj.shape[1]
    adj_in = adj
    adj_out = functions.transpose(adj, axes=(0, 1, 3, 2))

    # expand edge vector to matrix
    adj_in = functions.reshape(adj_in, (-1, edge_type))
    # adj_in: (mb*node*node, edge_type)
    adj_in = self.nn_layer_in(adj_in)
    # adj_in: (mb*node*node, out_ch*out_ch)
    adj_in = functions.reshape(adj_in, (mb, node, node, ch, ch))
    adj_in = functions.reshape(
        functions.transpose(adj_in, axes=(0, 1, 3, 2, 4)),
        (mb, node * ch, node * ch))

    adj_out = functions.reshape(adj_out, (-1, edge_type))
    # adj_out: (mb*node*node, edge_type)
    adj_out = self.nn_layer_out(adj_out)
    # adj_out: (mb*node*node, out_ch*out_ch)
    adj_out = functions.reshape(adj_out, (mb, node, node, ch, ch))
    adj_out = functions.reshape(
        functions.transpose(adj_out, axes=(0, 1, 3, 2, 4)),
        (mb, node * ch, node * ch))

    # calculate message
    h = functions.reshape(h, (mb, node * ch, 1))
    message_in = chainer_chemistry.functions.matmul(adj_in, h)
    # message_in: (mb, node*ch, 1)
    message_in = functions.reshape(message_in, (mb, node, ch))
    # message_in: (mb, node, out_ch)
    message_out = chainer_chemistry.functions.matmul(adj_out, h)
    # message_out: (mb, node*ch, 1)
    message_out = functions.reshape(message_out, (mb, node, ch))
    message = functions.concat([message_in, message_out], axis=2)
    return message  # message: (mb, node, out_ch * 2)
def check_backward(self, x_data, y_grad):
    x = chainer.Variable(x_data)
    y = functions.transpose(x, self.axes)
    y.grad = y_grad
    y.backward()

    func = y.creator
    f = lambda: func.forward((x.data.copy(),))
    gx, = gradient_check.numerical_grad(f, (x.data,), (y.grad,), eps=1e-5)

    gradient_check.assert_allclose(gx, x.grad, rtol=1e-5)
def __call__(self, imgs, questions):
    feat = self.feat_extractor(imgs)

    # Append relative coordinates to each location in the feature maps.
    n, c, h, w = feat.shape
    spatial_area = h * w

    xp = self.xp
    coords_h = xp.linspace(-1, 1, h, dtype=feat.dtype)
    coords_w = xp.linspace(-1, 1, w, dtype=feat.dtype)
    coords_hh, coords_ww = xp.meshgrid(coords_h, coords_w)
    coords_hh = coords_hh[None]
    coords_ww = coords_ww[None]
    coords = xp.concatenate((coords_hh, coords_ww), axis=0)
    coords = coords.reshape(2, -1)
    coords = coords[None]  # (1, 2, spatial_area)
    coords = xp.repeat(coords, n, axis=0)

    # Coordinates may be cached here but the performance gain is not
    # significant so it is skipped in favor of readability.

    feat = feat.reshape(n, c, spatial_area)
    h = F.concat((feat, coords), axis=1)  # (n, c + 2, spatial_area)

    # Create coordinate pairs (differentiable meshgrid).
    h_hh = F.expand_dims(h, 2)
    h_ww = F.expand_dims(h, 3)
    h_hh = F.repeat(h_hh, spatial_area, axis=2)
    h_ww = F.repeat(h_ww, spatial_area, axis=3)
    h = F.concat((h_hh, h_ww), axis=1)

    # Append questions to each coordinate pair.
    questions = questions.astype(imgs.dtype)
    questions = questions[:, :, None, None]
    questions = F.tile(questions, (1, 1, spatial_area, spatial_area))
    h = F.concat((h, questions), axis=1)
    # (n, (c + 2) * 2 + questions_length, spatial_area, spatial_area)

    # g.
    h = F.transpose(h, (0, 2, 3, 1))
    h = F.reshape(h, (n * spatial_area * spatial_area, -1))
    h = self.g(h)

    h = F.reshape(h, (n, spatial_area * spatial_area, -1))
    h = F.sum(h, axis=1)

    h = self.f(h)

    # Logits.
    h = self.fc(h)

    return h
def __call__(self, x):
    """Applies the linear layer.

    Args:
        x (~chainer.Variable): Batch of input vectors.

    Returns:
        ~chainer.Variable: Output of the linear layer.

    """
    if self.has_uninitialized_params:
        self._initialize_params(x.shape[1])
    # return linear.linear(x, self.W, self.b)
    batch_size = x.data.shape[1]
    return F.transpose(F.reshape(batch_matmul(x, self.W),
                                 (self.out_size, batch_size)))
def predict(self, input_x):
    output = self.predictor(input_x)
    batch_size, input_channel, input_h, input_w = input_x.shape
    batch_size, _, grid_h, grid_w = output.shape
    x, y, w, h, conf, prob = F.split_axis(
        F.reshape(output, (batch_size, self.predictor.n_boxes,
                           self.predictor.n_classes + 5, grid_h, grid_w)),
        (1, 2, 3, 4, 5), axis=2)
    x = F.sigmoid(x)        # activation for x
    y = F.sigmoid(y)        # activation for y
    conf = F.sigmoid(conf)  # activation for conf
    prob = F.transpose(prob, (0, 2, 1, 3, 4))
    prob = F.softmax(prob)  # activation for probability
    prob = F.transpose(prob, (0, 2, 1, 3, 4))

    # convert x, y, w, h into absolute coordinates
    x_shift = Variable(np.broadcast_to(
        np.arange(grid_w, dtype=np.float32), x.shape))
    y_shift = Variable(np.broadcast_to(
        np.arange(grid_h, dtype=np.float32).reshape(grid_h, 1), y.shape))
    w_anchor = Variable(np.broadcast_to(np.reshape(
        np.array(self.anchors, dtype=np.float32)[:, 0],
        (self.predictor.n_boxes, 1, 1, 1)), w.shape))
    h_anchor = Variable(np.broadcast_to(np.reshape(
        np.array(self.anchors, dtype=np.float32)[:, 1],
        (self.predictor.n_boxes, 1, 1, 1)), h.shape))
    #x_shift.to_gpu(), y_shift.to_gpu(), w_anchor.to_gpu(), h_anchor.to_gpu()
    box_x = (x + x_shift) / grid_w
    box_y = (y + y_shift) / grid_h
    box_w = F.exp(w) * w_anchor / grid_w
    box_h = F.exp(h) * h_anchor / grid_h

    return box_x, box_y, box_w, box_h, conf, prob
def __call__(self, h):
    if len(h.shape) != 4:
        return 0

    # (b, c, h, w) -> (b, h, w, c) -> (b, h*w, c)
    h = F.transpose(h, (0, 2, 3, 1))
    shape = h.shape
    b, n, c = shape[0], shape[1] * shape[2], shape[3]
    h = F.reshape(h, (b, n, c))

    s = 0
    xp = cuda.get_array_module(h.data)
    I_ = xp.identity(n)
    I_ = Variable(to_device(I_, device))
    for h_ in h:
        s += F.sum(F.square(F.linear(h_, h_) - I_))

    l = s / (b * n * c)
    return l
def __call__(self, y):
    bs = y.data.shape[0]
    d = np.prod(y.data.shape[1])
    if len(y.shape) > 2:
        s = np.prod(y.data.shape[2:])
        y = F.reshape(y, (bs, d, s))
        y = F.transpose(y, (0, 2, 1))

        y_normalized = F.softmax(y, use_cudnn=False)
        y_log_softmax = F.log_softmax(y, use_cudnn=False)
        self.loss = - F.sum(y_normalized * y_log_softmax) / bs / s
    else:
        y_normalized = F.softmax(y)
        y_log_softmax = F.log_softmax(y)
        self.loss = - F.sum(y_normalized * y_log_softmax) / bs / d
    return self.loss
def transpose_for_scores(input_tensor, batch_size,
                         num_attention_heads, seq_length, width):
    """
    output_tensor = F.stack(
        F.split_axis(input_tensor, num_attention_heads, axis=1), axis=1)
    # batch_size * seq_length, num_attention_heads, width
    output_tensor = F.stack(
        F.split_axis(output_tensor, seq_length, axis=0), axis=2)
    # batch_size, num_attention_heads, seq_length, width
    """
    output_tensor = F.reshape(
        input_tensor, (batch_size, seq_length, num_attention_heads, width))
    output_tensor = F.transpose(output_tensor, [0, 2, 1, 3])
    return output_tensor
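A hedged shape check for transpose_for_scores with assumed toy dimensions (2 sequences of length 5, 8 heads of width 4):

import numpy as np

t = np.random.rand(2, 5, 32).astype(np.float32)
out = transpose_for_scores(t, batch_size=2, num_attention_heads=8,
                           seq_length=5, width=4)
print(out.shape)  # (2, 8, 5, 4)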
def __call__(self, orig_img):
    orig_input_height, orig_input_width, _ = orig_img.shape
    #img = cv2.resize(orig_img, (640, 640))
    img = reshape_to_yolo_size(orig_img)
    input_height, input_width, _ = img.shape
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    img = np.asarray(img, dtype=np.float32) / 255.0
    img = img.transpose(2, 0, 1)

    # forward
    x_data = img[np.newaxis, :, :, :]
    x = Variable(x_data)
    x, y, w, h, conf, prob = self.model.predict(x)

    # parse results
    _, _, _, grid_h, grid_w = x.shape
    x = F.reshape(x, (self.n_boxes, grid_h, grid_w)).data
    y = F.reshape(y, (self.n_boxes, grid_h, grid_w)).data
    w = F.reshape(w, (self.n_boxes, grid_h, grid_w)).data
    h = F.reshape(h, (self.n_boxes, grid_h, grid_w)).data
    conf = F.reshape(conf, (self.n_boxes, grid_h, grid_w)).data
    prob = F.transpose(
        F.reshape(prob, (self.n_boxes, self.n_classes, grid_h, grid_w)),
        (1, 0, 2, 3)).data
    detected_indices = (conf * prob).max(axis=0) > self.detection_thresh

    results = []
    for i in range(detected_indices.sum()):
        results.append({
            "class_id": prob.transpose(1, 2, 3, 0)[detected_indices][i].argmax(),
            "label": self.labels[prob.transpose(1, 2, 3, 0)[detected_indices][i].argmax()],
            "probs": prob.transpose(1, 2, 3, 0)[detected_indices][i],
            "conf": conf[detected_indices][i],
            "objectness": conf[detected_indices][i] * prob.transpose(1, 2, 3, 0)[detected_indices][i].max(),
            "box": Box(
                x[detected_indices][i] * orig_input_width,
                y[detected_indices][i] * orig_input_height,
                w[detected_indices][i] * orig_input_width,
                h[detected_indices][i] * orig_input_height).crop_region(
                    orig_input_height, orig_input_width)
        })

    # nms
    nms_results = nms(results, self.iou_thresh)
    return nms_results
def __call__(self, x):
    xp = chainer.cuda.get_array_module(x.data)
    batchsize = x.shape[0]
    if self.train_weights == False and self.initial_T is not None:
        self.T.W.data = self.initial_T

    M = F.reshape(self.T(x), (-1, self.num_kernels, self.ndim_kernel))
    M = F.expand_dims(M, 3)
    M_T = F.transpose(M, (3, 1, 2, 0))
    M, M_T = F.broadcast(M, M_T)

    norm = F.sum(abs(M - M_T), axis=2)
    eraser = F.broadcast_to(
        xp.eye(batchsize, dtype=x.dtype).reshape((batchsize, 1, batchsize)),
        norm.shape)
    c_b = F.exp(-(norm + 1e6 * eraser))
    o_b = F.sum(c_b, axis=2)

    if self.train_weights == False:
        self.initial_T = self.T.W.data

    return F.concat((x, o_b), axis=1)
def choose_var_of_type(spec, context, scope, type_def):
    compatible_scope = [var for var in scope if var.type_def.can_be(type_def)]
    scope = list(scope)
    var_ndxs = [i for i in range(len(scope))
                if scope[i].type_def.can_be(type_def)]
    var_embeddings = [scope[i].vec for i in var_ndxs]
    var_lprobs = [F.matmul(vec, F.transpose(context['state']))
                  for vec in var_embeddings]
    normalizer = Variable(np.array([[0]], dtype=np.float32))
    for vlp in var_lprobs:
        normalizer = normalizer + F.exp(vlp)
    normalizer = F.log(normalizer)
    var_lprobs = [vlp - normalizer for vlp in var_lprobs]
    vlp_data = np.array([vlp.data for vlp in var_lprobs])[:, 0, 0]
    ps = np.exp(vlp_data)
    ps /= np.sum(ps)
    ndx = np.random.choice(range(len(ps)), p=ps)
    lp = var_lprobs[ndx]
    var = scope[var_ndxs[ndx]]
    context['lp'] += lp[:, 0]
    return var, context
def mk_expression_of_type(spec, context, scope, type_def):
    if context['depth'] > context['max_depth']:
        return Expression('depth_exceeded', type_def=type_def), context, False
    rule_ndxs = [i for i in range(len(spec['rules']))
                 if spec['rules'][i].can_make(scope, type_def)]
    vrule_ndxs = Variable(np.array(rule_ndxs, dtype=np.int32))
    rule_embeddings = context['model'].rule_embeddings(vrule_ndxs)
    rule_lprobs = F.matmul(rule_embeddings, F.transpose(context['state']))
    normalizer = context['model'].normalize(rule_lprobs)
    rule_lprobs = rule_lprobs - F.BroadcastTo((len(rule_ndxs), 1))(normalizer)
    rps = np.exp(rule_lprobs.data)[:, 0]
    rps /= np.sum(rps)
    ndx = np.random.choice(range(len(rps)), p=rps)
    lp = rule_lprobs[ndx, :]
    rule_ndx = rule_ndxs[ndx]
    rule_embedding = rule_embeddings[[ndx], :]
    rule = spec['rules'][rule_ndx]
    context['lp'] += lp
    context['state'] = (context['model'].state2state(context['state']) +
                        context['model'].choice2state(rule_embedding))
    context['state'] = F.tanh(context['state'])
    return rule.make_expr(spec, context, scope, type_def)
def __call__(self, input_ids, input_mask, token_type_ids):
    final_hidden = self.bert.get_sequence_output(
        input_ids, input_mask, token_type_ids)
    batch_size = final_hidden.shape[0]
    seq_length = final_hidden.shape[1]
    hidden_size = final_hidden.shape[2]

    final_hidden_matrix = F.reshape(
        final_hidden, [batch_size * seq_length, hidden_size])

    logits = self.output(final_hidden_matrix)

    logits = F.reshape(logits, [batch_size, seq_length, 2])
    logits = logits - (1 - input_mask[:, :, None]) * 1000.  # ignore pads
    logits = F.transpose(logits, [2, 0, 1])

    unstacked_logits = F.separate(logits, axis=0)

    (start_logits, end_logits) = (unstacked_logits[0], unstacked_logits[1])
    return (start_logits, end_logits)
def __call__(self, h):
    # type: (chainer.Variable) -> chainer.Variable
    xp = cuda.get_array_module(h)
    mb, node, ch = h.shape  # type: int, int, int
    if self.q_star is None:
        self.q_star = [
            xp.zeros((1, self.in_channels * 2)).astype('f')
            for _ in range(mb)
        ]
    self.hx, self.cx, q = self.lstm_layer(self.hx, self.cx, self.q_star)
    # self.hx: (mb, mb, ch)
    # self.cx: (mb, mb, ch)
    # q: List[(1, ch) * mb]
    q = functions.stack(q)  # q: (mb, 1, ch)
    q_ = functions.transpose(q, axes=(0, 2, 1))  # q_: (mb, ch, 1)
    e = functions.matmul(h, q_)  # e: (mb, node, 1)
    a = functions.softmax(e)  # a: (mb, node, 1)
    a = functions.broadcast_to(a, h.shape)  # a: (mb, node, ch)
    r = functions.sum((a * h), axis=1, keepdims=True)  # r: (mb, 1, ch)
    q_star_ = functions.concat((q, r), axis=2)  # q_star_: (mb, 1, ch*2)
    self.q_star = functions.separate(q_star_)
    return functions.reshape(q_star_, (mb, ch * 2))
def __call__(self, batch):
    word_ids, (char_ids, char_boundaries) = batch
    batch_size = word_ids.data.shape[0]

    # word lookup table
    word_embs = self.word_emb(word_ids)  # batch x len x dim

    if self.use_char:
        # character lookup table
        char_embs = self.char_emb(char_ids)  # total_len x dim
        char_embs_reshape = F.reshape(
            char_embs, (1, 1, -1, self.char_emb_dim))  # 1 x 1 x total_len x dim

        # convolution
        char_emb_conv = self.char_conv(char_embs_reshape)  # 1 x dim x total_len x 1
        char_emb_conv_reshape = F.reshape(
            char_emb_conv, (self.char_hidden_dim, -1))  # dim x total_len

        # max
        embs = []
        for i, char_emb_conv_word in enumerate(
                F.split_axis(char_emb_conv_reshape, char_boundaries, axis=1)):
            if i % 2 == 1:
                # not pad
                embs.append(F.max(char_emb_conv_word, axis=1))
        char_emb_conv = F.reshape(
            F.concat(embs, axis=0), (batch_size, -1, self.char_hidden_dim))

        # concatenate
        word_embs = F.concat([word_embs, char_emb_conv], axis=2)  # batch x len x dim

    word_embs_reshape = F.reshape(
        word_embs, (batch_size, 1, -1, self.word_dim))

    h = self.word_conv(word_embs_reshape)  # batch x dim x len x 1
    #h_transpose = F.swapaxes(h, 1, 2)  # TODO: maybe inefficient
    h_transpose = F.transpose(h, (0, 2, 1, 3))  # TODO: maybe inefficient
    h_reshape = F.reshape(h_transpose, (-1, self.word_hidden_dim))
    y = self.linear(F.relu(h_reshape))

    return y
def __call__(self, h, adj):
    xp = self.xp
    # (minibatch, atom, channel)
    mb, atom, ch = h.shape
    # (minibatch, atom, EDGE_TYPE * heads * out_dim)
    h = self.message_layer(h)
    # (minibatch, atom, EDGE_TYPE, heads, out_dim)
    h = functions.reshape(h, (mb, atom, self.n_edge_types, self.n_heads,
                              self.out_channels))
    # concat all pairs of atom
    # (minibatch, 1, atom, EDGE_TYPE, heads, out_dim)
    h_i = functions.reshape(h, (mb, 1, atom, self.n_edge_types,
                                self.n_heads, self.out_channels))
    # (minibatch, atom, atom, EDGE_TYPE, heads, out_dim)
    h_i = functions.broadcast_to(h_i, (mb, atom, atom, self.n_edge_types,
                                       self.n_heads, self.out_channels))
    # (minibatch, atom, 1, EDGE_TYPE, heads, out_dim)
    h_j = functions.reshape(h, (mb, atom, 1, self.n_edge_types,
                                self.n_heads, self.out_channels))
    # (minibatch, atom, atom, EDGE_TYPE, heads, out_dim)
    h_j = functions.broadcast_to(h_j, (mb, atom, atom, self.n_edge_types,
                                       self.n_heads, self.out_channels))

    # (minibatch, atom, atom, EDGE_TYPE, heads, out_dim * 2)
    e = functions.concat([h_i, h_j], axis=5)
    # (minibatch, EDGE_TYPE, heads, atom, atom, out_dim * 2)
    e = functions.transpose(e, (0, 3, 4, 1, 2, 5))
    # (minibatch * EDGE_TYPE * heads, atom * atom, out_dim * 2)
    e = functions.reshape(e, (mb * self.n_edge_types * self.n_heads,
                              atom * atom, self.out_channels * 2))
    # (minibatch * EDGE_TYPE * heads, atom * atom, 1)
    e = self.attention_layer(e)

    # (minibatch, EDGE_TYPE, heads, atom, atom)
    e = functions.reshape(e, (mb, self.n_edge_types, self.n_heads, atom, atom))
    e = functions.leaky_relu(e, self.negative_slope)

    # (minibatch, EDGE_TYPE, atom, atom)
    if isinstance(adj, chainer.Variable):
        cond = adj.array.astype(xp.bool)
    else:
        cond = adj.astype(xp.bool)
    # (minibatch, EDGE_TYPE, 1, atom, atom)
    cond = xp.reshape(cond, (mb, self.n_edge_types, 1, atom, atom))
    # (minibatch, EDGE_TYPE, heads, atom, atom)
    cond = xp.broadcast_to(cond, e.array.shape)
    # TODO(mottodora): find better way to ignore non connected
    e = functions.where(cond, e,
                        xp.broadcast_to(xp.array(-10000),
                                        e.array.shape).astype(xp.float32))

    # In Relational Graph Attention Networks eq.(7)
    # ARGAT: take the softmax over the logits across node neighborhoods
    # irrespective of relation
    if self.softmax_mode == 'across':
        # (minibatch, heads, atom, EDGE_TYPE, atom)
        e = functions.transpose(e, (0, 2, 3, 1, 4))
        # (minibatch, heads, atom, EDGE_TYPE * atom)
        e = functions.reshape(e, (mb, self.n_heads, atom,
                                  self.n_edge_types * atom))
        # (minibatch, heads, atom, EDGE_TYPE * atom)
        alpha = functions.softmax(e, axis=3)
        if self.dropout_ratio >= 0:
            alpha = functions.dropout(alpha, ratio=self.dropout_ratio)
        # (minibatch, heads, atom, EDGE_TYPE, atom)
        alpha = functions.reshape(alpha, (mb, self.n_heads, atom,
                                          self.n_edge_types, atom))
        # (minibatch, EDGE_TYPE, heads, atom, atom)
        alpha = functions.transpose(alpha, (0, 3, 1, 2, 4))
    # In Relational Graph Attention Networks eq.(6)
    # WIRGAT: take the softmax over the logits independently for each
    # relation
    elif self.softmax_mode == 'within':
        alpha = functions.softmax(e, axis=4)
        if self.dropout_ratio >= 0:
            alpha = functions.dropout(alpha, ratio=self.dropout_ratio)
    else:
        raise ValueError("{} is invalid. Please use 'across' or 'within'"
                         .format(self.softmax_mode))

    # before: (minibatch, atom, EDGE_TYPE, heads, out_dim)
    # after: (minibatch, EDGE_TYPE, heads, atom, out_dim)
    h = functions.transpose(h, (0, 2, 3, 1, 4))
    # (minibatch, EDGE_TYPE, heads, atom, out_dim)
    h_new = functions.matmul(alpha, h)
    # (minibatch, heads, atom, out_dim)
    h_new = functions.sum(h_new, axis=1)
    if self.concat_heads:
        # (heads, minibatch, atom, out_dim)
        h_new = functions.transpose(h_new, (1, 0, 2, 3))
        # (minibatch, atom, heads * out_dim)
        h_new = functions.concat(h_new, axis=2)
    else:
        # (minibatch, atom, out_dim)
        h_new = functions.mean(h_new, axis=1)
    return h_new
def merge_heads(self, x):
    x = F.transpose(x, (0, 2, 1, 3))
    new_x_shape = x.shape[:-2] + (x.shape[-2] * x.shape[-1], )
    return x.reshape(*new_x_shape)
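merge_heads is the inverse of split_heads with k=False; a hedged round-trip sketch of the same reshape/transpose pattern, with n_head assumed to be 4:

import numpy as np
import chainer.functions as F

n_head = 4
x = np.random.rand(2, 5, n_head * 8).astype(np.float32)
h = F.reshape(x, x.shape[:-1] + (n_head, x.shape[-1] // n_head))
h = F.transpose(h, (0, 2, 1, 3))  # split_heads, k=False
h = F.transpose(h, (0, 2, 1, 3))  # merge_heads undoes the transpose...
h = F.reshape(h, h.shape[:-2] + (h.shape[-2] * h.shape[-1],))  # ...and the reshape
print(np.allclose(h.data, x))  # True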
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import numpy as np
from chainer import Variable
from chainer import functions as F

x_data = np.array([5], dtype=np.float32)
x = Variable(x_data)
y = F.relu(x)

z = Variable(np.array([[10, 20], [30, 40]], dtype=np.float32))
zz = F.transpose(z)
print(zz.data)

x_data = np.array([3, 4, 5], dtype=np.float32)
x = Variable(x_data)
y = F.sum(F.exp(x) + F.sin(x))
y.backward()
print(x.grad)