def channel_normalize(x, test=False):
    """Standardize ``x`` along axis 1 at every remaining position.

    The shape of ``x.data`` is unpacked into four values, so ``x`` must be
    4-dimensional.  The mean and the biased variance are taken over axis 1.

    Args:
        x: Variable with a 4-element ``.data.shape``.
        test: Accepted but not used by this function.

    Returns:
        ``(x - mean) / (var + 1e-5) ** 0.5`` with the shape of ``x``.
    """
    n, n_ch, height, width = x.data.shape
    reduced_shape = (n, 1, height, width)

    def _tile(stat):
        # Put the reduced axis back and repeat the statistic once per
        # channel (F.concat is called with its default axis).
        return F.concat(n_ch * [F.reshape(stat, reduced_shape)])

    centered = x - _tile(F.sum(x, axis=1) / n_ch)
    variance = _tile(F.sum(centered ** 2, axis=1) / n_ch)
    return centered / (variance + 1e-5) ** 0.5
def forward(self, data):
    """Compute pairwise parent scores and per-item root scores.

    Args:
        data: Sequence of indexable items; only element ``[0]`` of each
            item is used (as the id fed to the three embeddings).

    Returns:
        Tuple ``(parent_scores, root_scores)`` reshaped to
        ``(len(data), len(data))`` and ``(1, len(data))`` respectively.
    """
    self.reset_state()
    n_items = len(data)
    ids = [XP.iarray([item[0]]) for item in data]

    # All embeddings are computed before any encoder runs.
    parent_emb = [self.p_embed(i) for i in ids]
    child_emb = [self.c_embed(i) for i in ids]
    root_emb = [self.r_embed(i) for i in ids]

    parent_states = self.p_encode(parent_emb)
    child_states = self.c_encode(child_emb)
    root_states = self.r_encode(root_emb)

    stacked_shape = (1, n_items, self.hidden_size)
    P = functions.reshape(functions.concat(parent_states, 0), stacked_shape)
    C = functions.reshape(functions.concat(child_states, 0), stacked_shape)
    R = functions.concat(root_states, 0)

    # C . P^T gives one score per (child, parent) pair.
    pair_scores = functions.batch_matmul(C, P, transb=True)
    parent_scores = functions.reshape(pair_scores, (n_items, n_items))
    root_scores = functions.reshape(self.r_scorer(R), (1, n_items))
    return parent_scores, root_scores
def reorg(input, stride=2):
    """Move ``stride x stride`` spatial blocks into the channel axis.

    Args:
        input: Variable whose ``.data`` is 4-dimensional (its shape is
            unpacked into batch, channel, height and width).
        stride: Block edge length.  Height and width are divided by it and
            truncated with ``int``.

    Returns:
        Variable of shape
        ``(batch, channel * stride * stride, height / stride, width / stride)``.
    """
    batch_size, in_ch, in_h, in_w = input.data.shape
    out_h = int(in_h / stride)
    out_w = int(in_w / stride)
    out_ch = in_ch * stride * stride

    # Split both spatial axes into (cell, offset) and group the offsets.
    cells = F.reshape(input, (batch_size, in_ch, out_h, stride, out_w, stride))
    cells = F.transpose(cells, (0, 1, 2, 4, 3, 5))

    # Flatten the offsets into one axis and move it ahead of the channels.
    cells = F.reshape(cells, (batch_size, in_ch, out_h, out_w, -1))
    cells = F.transpose(cells, (0, 4, 1, 2, 3))

    return F.reshape(cells, (batch_size, out_ch, out_h, out_w))
def __call__(self, h, dist):
    """Weight atom features by a filter generated from pairwise distances.

    Args:
        h (numpy.ndarray): axis 0 is the minibatch index, axis 1 the atom
            index and axis 2 the feature dimension.
        dist (numpy.ndarray): axis 0 is the minibatch index, axes 1 and 2
            hold the distances between atoms.

    Returns:
        The product of ``h`` (broadcast over a new axis 2) and the
        generated filter, summed over axis 1.

    Raises:
        ValueError: If ``h.shape[2]`` differs from ``self.hidden_dim``.
    """
    mb, atom, ch = h.shape
    if ch != self.hidden_dim:
        raise ValueError('h.shape[2] {} and hidden_dim {} must be same!'
                         .format(ch, self.hidden_dim))

    pair_shape = (mb, atom, atom)
    centers = self.xp.arange(self.num_rbf).astype('f')
    centers = centers * self.radius_resolution

    # Expand every distance onto the grid of radial-basis centers.
    rbf = functions.reshape(dist, pair_shape + (1,))
    rbf = functions.broadcast_to(rbf, pair_shape + (self.num_rbf,))
    rbf = functions.exp(- self.gamma * (rbf - centers) ** 2)

    # Two dense + softplus stages applied to each pair independently.
    filt = functions.reshape(rbf, (-1, self.num_rbf))
    filt = functions.softplus(self.dense1(filt))
    filt = functions.softplus(self.dense2(filt))
    filt = functions.reshape(filt, pair_shape + (self.hidden_dim,))

    feats = functions.reshape(h, (mb, atom, 1, self.hidden_dim))
    feats = functions.broadcast_to(feats, pair_shape + (self.hidden_dim,))
    return functions.sum(feats * filt, axis=1)
def __call__(self, z, test=False, rectifier='clipped_relu'):
    """Decode ``z`` and squash the result with the chosen rectifier.

    Args:
        z: Input batch.
        test: Forwarded to the batch-normalization links in
            ``'convolution'`` mode.
        rectifier: ``'clipped_relu'`` (clipped at 1.0) or ``'sigmoid'``.

    Returns:
        The rectified decoder output.  When ``self.mode`` is neither
        ``'convolution'`` nor ``'linear'`` the rectifier is applied to
        ``z`` directly.

    Raises:
        NameError: If ``rectifier`` is not one of the supported names.
            It is raised only after the decoder layers have run.
    """
    batch = z
    if self.mode == 'convolution':
        batch = F.relu(self.bn6(self.lin(z), test=test))
        n_pics = batch.data.shape[0]
        start_array_shape = (n_pics,) + calc_fc_size(self.img_height,
                                                     self.img_width)
        batch = F.reshape(batch, start_array_shape)
        stages = (
            (self.deconv5, self.bn5),
            (self.deconv4, self.bn4),
            (self.deconv3, self.bn3),
            (self.deconv2, self.bn2),
        )
        for deconv, bn in stages:
            batch = F.relu(bn(deconv(batch), test=test))
        # The last deconvolution has no batch norm and no ReLU.
        batch = self.deconv1(batch)
    elif self.mode == 'linear':
        n_layers = len(self.decode_layers)
        # linear_0 .. linear_<n_layers>, i.e. n_layers + 1 links in total.
        for idx in range(n_layers + 1):
            batch = F.relu(getattr(self, 'linear_%i' % idx)(batch))
        batch = F.reshape(
            batch,
            (-1, self.img_height, self.img_width, self.color_channels))

    if rectifier == 'clipped_relu':
        return F.clipped_relu(batch, z=1.0)
    if rectifier == 'sigmoid':
        return F.sigmoid(batch)
    raise NameError(
        "Unsupported rectifier type: %s, must be either 'sigmoid' or 'clipped_relu'."
        % rectifier)
def __call__(self, annotion_list, back_word_list, p):
    """Return attention-weighted sums of the two encoder state lists.

    :param annotion_list: first list of encoder states
    :param back_word_list: second list of encoder states, same length
    :param p: hidden value used to score every position
    :return: tuple ``(annotion_value, back_word_value)``
    """
    batch_size = p.data.shape[0]
    out_shape = (batch_size, self.hidden_size)

    # Pass 1: unnormalized weight per position and their running total.
    exponential_list = []
    sum_exponential = XP.fzeros((batch_size, 1))
    for annotion, back_word in zip(annotion_list, back_word_list):
        hidden = functions.tanh(
            self.annotion_weight(annotion)
            + self.back_weight(back_word)
            + self.pw(p))
        exponential = functions.exp(self.weight_exponential(hidden))
        exponential_list.append(exponential)
        sum_exponential += exponential

    # Both accumulators start from the same zero object, as before.
    ZEROS = XP.fzeros(out_shape)
    annotion_value = ZEROS
    back_word_value = ZEROS

    # Pass 2: normalize each weight and accumulate the weighted states.
    triples = zip(annotion_list, back_word_list, exponential_list)
    for annotion, back_word, exponential in triples:
        exponential /= sum_exponential
        annotion_value += functions.reshape(
            functions.batch_matmul(annotion, exponential), out_shape)
        back_word_value += functions.reshape(
            functions.batch_matmul(back_word, exponential), out_shape)
    return annotion_value, back_word_value
def __call__(self, x, context):
    """Compute the loss between context embeddings and target ids.

    Args:
        x: Target ids; broadcast to the first two axes of the embedded
            context.
        context: Ids passed to ``self.embed``.  The embedded result must
            have at least three axes.

    Returns:
        Whatever ``self.loss_func`` returns for the flattened
        embeddings and targets.
    """
    # Use this link's own embedding rather than a module-level ``model``
    # global, so the method works on whichever instance it is called on.
    e = self.embed(context)
    shape = e.data.shape
    n_pairs = shape[0] * shape[1]
    x = F.broadcast_to(x, (shape[0], shape[1]))
    e = F.reshape(e, (n_pairs, shape[2]))
    x = F.reshape(x, (n_pairs,))
    return self.loss_func(e, x)
def __call__(self, x, im_info):
    """Run the RPN and the detection head on ``x``.

    Side effects: stores the class probabilities in ``self.scores`` and
    the clipped boxes in ``self.pred_boxes``.

    Args:
        x: Input Variable; ``x.data.shape[0]`` is used as the batch size.
        im_info: Indexed as ``im_info[0][2]`` (divisor applied to the
            proposal coordinates) and ``im_info[0][:2]`` (clip bounds).

    Returns:
        ``None`` when ``self.train`` is true, otherwise
        ``(self.scores, self.pred_boxes)``.
    """
    feat = self.trunk(x)
    n = x.data.shape[0]
    cls_score = self.rpn_cls_score(feat)
    c, hh, ww = cls_score.data.shape[1:]
    bbox_pred = self.rpn_bbox_pred(feat)

    # RoI Proposal: softmax over a 2-way axis, then restore the layout.
    cls_prob = F.softmax(F.reshape(cls_score, (n, 2, -1)))
    cls_prob = F.reshape(cls_prob, (n, c, hh, ww))
    rois = self.proposal_layer(cls_prob, bbox_pred, im_info, self.train)
    boxes = rois[:, 1:5] / im_info[0][2]
    rois = chainer.Variable(rois, volatile=not self.train)

    # RCNN
    pool5 = F.roi_pooling_2d(self.trunk.relu5_3_out, rois, 7, 7, 0.0625)
    fc7 = F.relu(self.fc7(F.relu(self.fc6(pool5))))
    self.scores = F.softmax(self.cls_score(fc7))

    box_deltas = self.bbox_pred(fc7).data
    self.pred_boxes = clip_boxes(
        bbox_transform_inv(boxes, box_deltas), im_info[0][:2])

    if self.train:
        # No training loss is implemented here.
        return None
    return self.scores, self.pred_boxes
def compute_vecs(self, word_ids, word_boundaries, phrase_num, char_vecs=None):
    """Embed, convolve and max-pool the tokens into one vector per phrase.

    Args:
        word_ids: Ids passed to ``my_variable`` and then to ``self.emb``.
        word_boundaries: Split points passed to ``F.split_axis``; only
            the odd-indexed segments are pooled.
        phrase_num: Number of rows of the result.
        char_vecs: Optional extra features concatenated to the word
            embeddings along axis 1 when ``self.word_level_flag`` is set.

    Returns:
        Variable reshaped to ``(phrase_num, self.hidden_dim)``.
    """
    word_ids = my_variable(word_ids, volatile=not self.train)
    word_embs = self.emb(word_ids)  # total_len x dim

    if self.word_level_flag and char_vecs is not None:
        word_embs = F.concat([word_embs, char_vecs], axis=1)
        dim = self.emb_dim + self.add_dim
    else:
        dim = self.emb_dim
    conv_input = F.reshape(word_embs, (1, 1, -1, dim))  # 1 x 1 x total_len x dim

    # convolution
    conv_out = self.conv(conv_input)  # 1 x dim x total_len x 1
    conv_out = F.reshape(conv_out, (self.hidden_dim, -1))

    # max over every second segment (indices 1, 3, 5, ...)
    segments = F.split_axis(conv_out, word_boundaries, axis=1)
    pooled = [F.max(segment, axis=1) for segment in segments[1::2]]

    return F.reshape(F.concat(pooled, axis=0), (phrase_num, self.hidden_dim))
def create_encoder_states_matrix(self, hs):
    """Stack encoder states and project them with ``phi2_linear`` + tanh.

    Args:
        hs: Non-empty list of Variables, each of shape
            ``(batch_size, dim)`` (taken from ``hs[0]``).

    Returns:
        Variable of shape ``(batch_size, len(hs), dim)``.
    """
    batch_size, dim = hs[0].data.shape

    # (batch_size, input_length, dim)
    stacked = F.concat([F.expand_dims(h, 1) for h in hs], axis=1)

    # The linear layer works on 2-D input, so flatten the first two axes.
    projected = self.decoder.phi2_linear(F.reshape(stacked, (-1, dim)))

    # Back to (batch_size, input_length, dim).
    return F.tanh(F.reshape(projected, (batch_size, -1, dim)))
def step(self, x, rnn_states, encoder_states, train):
    """Advance the decoder by one token and attend over the encoder.

    Args:
        x: Current input ids for ``self.word_emb``.
        rnn_states: One state per layer in ``self.rnns``; a hidden state
            when ``self.gru`` is set, otherwise a ``(c, h)`` pair.
        encoder_states: Variable of shape
            ``(batch_size, input_length, hidden_dim)``.
        train: Forwarded to ``F.dropout``.

    Returns:
        Tuple ``(y, normalized_weights, new_states)``.
    """
    new_states = []
    layer_input = self.word_emb(x)
    last_layer = len(self.rnns) - 1
    for idx, (rnn, state) in enumerate(zip(self.rnns, rnn_states)):
        if self.gru:
            state = rnn(state, layer_input)
            layer_input = state
        else:
            cell, hidden = state
            state = rnn(cell, hidden, layer_input)
            layer_input = state[1]
        new_states.append(state)
        # Dropout only between layers, never after the top one.
        if idx < last_layer and self.dropout_ratio > 0:
            layer_input = F.dropout(layer_input, self.dropout_ratio, train)
    h_in = layer_input

    batch_size, input_length, hidden_dim = encoder_states.data.shape
    query = F.tanh(self.phi1_linear(h_in))  # (batch_size, hidden_dim)

    # (batch, input_length)
    scores = F.reshape(F.batch_matmul(encoder_states, query),
                       (batch_size, input_length))
    normalized_weights = F.softmax(scores)

    # (batch, hidden_dim)
    context = F.reshape(
        F.batch_matmul(encoder_states, normalized_weights, transa=True),
        (batch_size, hidden_dim))

    # (batch, hidden_dim * 2)
    joined = F.concat([context, h_in], axis=1)
    y = self.softmax_linear(F.relu(joined))  # Is ReLU here really necessary?
    return y, normalized_weights, new_states
def train(self, x_real):
    """Run one discriminator update followed by one generator update.

    Args:
        x_real: Batch of real samples; ``x_real.shape[0]`` is the
            requested batch size.
    """
    def _reconstructions(requested):
        # TODO: change when using CNN reconstructor
        flat = self.generate_x_recon(requested)
        count = flat.shape[0]
        return F.reshape(flat, (count, 1, 28, 28)), count

    def _clear_all_grads():
        self.discriminator.cleargrads()
        self.generator.cleargrads()

    # Train discriminator
    x_recon, bs = _reconstructions(x_real.shape[0])
    d_x = self.discriminator(x_real)
    z = self.generate_random(bs, self.dim_rand)
    d_x_gen = self.discriminator(self.generator(x_recon, z, test=False))
    loss = self.gan_loss(d_x_gen, d_x)
    _clear_all_grads()
    loss.backward()
    self.optimizer_dis.update()

    # Train generator
    x_recon, bs = _reconstructions(bs)
    z = self.generate_random(bs, self.dim_rand)
    d_x_gen = self.discriminator(self.generator(x_recon, z, test=False))
    loss = self.gan_loss(d_x_gen)
    _clear_all_grads()
    loss.backward()
    self.optimizer_gen.update()
def __call__(self, fs, bs, h):
    '''
    Compute attention.

    :param fs: list of hidden vectors recorded by the forward encoder
    :param bs: list of hidden vectors recorded by the backward encoder
    :param h: hidden vector output by the decoder
    :return: weighted average of the forward encoder vectors and
        weighted average of the backward encoder vectors
    '''
    batch_size = h.data.shape[0]  # minibatch size
    out_shape = (batch_size, self.hidden_size)

    # Unnormalized weight of every position and their running total.
    ws = []
    sum_w = Variable(xp.zeros((batch_size, 1), dtype='float32'))
    for f, b in zip(fs, bs):
        # Score from forward state, backward state and decoder state.
        score = F.tanh(self.fh(f) + self.bh(b) + self.hh(h))
        # Exponentiate now; dividing by the total below makes it a softmax.
        w = F.exp(self.hw(score))
        ws.append(w)
        sum_w += w

    # Accumulators for the two weighted averages.
    att_f = Variable(xp.zeros(out_shape, dtype='float32'))
    att_b = Variable(xp.zeros(out_shape, dtype='float32'))
    for f, b, w in zip(fs, bs, ws):
        w /= sum_w  # normalize so that the weights sum to 1
        # Add weight * encoder hidden vector to the outputs.
        att_f += F.reshape(F.batch_matmul(f, w), out_shape)
        att_b += F.reshape(F.batch_matmul(b, w), out_shape)
    return att_f, att_b
def tv_norm(self, x):
    """Return the total-variation style penalty of ``x``.

    ``x`` is reshaped to ``(3, 1, in_size, in_size)`` and passed through
    ``self.tvh`` and ``self.tvw``; the squared responses are summed and
    raised to ``self.args.beta / 2``.
    """
    side = self.args.in_size
    planes = F.reshape(x, (3, 1, side, side))
    diffh = self.tvh(planes)
    diffw = self.tvw(planes)
    energy = F.sum(diffh ** 2) + F.sum(diffw ** 2)
    return energy ** (self.args.beta / 2.)
def cosine_similarity(x, y, eps=1e-6):
    """Batched cosine similarity between every row of ``x`` and of ``y``.

    Args:
        x: 3-D Variable of shape ``(n1, n2, n3)``.
        y: 3-D Variable whose second axis has length ``m2``.
        eps: Added to the product of squared norms before the square
            root is taken.

    Returns:
        Variable of shape ``(n1, n2, m2)``.
    """
    n1, n2, _ = x.data.shape
    m2 = y.data.shape[1]
    if len(y.data.shape) != 3:
        # Reproduce the failure of a 3-way unpack on a non-3-D shape.
        _, _, _ = y.data.shape
    out_shape = (n1, n2, m2)

    dots = F.batch_matmul(x, y, transb=True)
    x_sq = F.broadcast_to(
        F.reshape(F.sum(x * x, axis=2), (n1, n2, 1)), out_shape)
    y_sq = F.broadcast_to(
        F.reshape(F.sum(y * y, axis=2), (n1, 1, m2)), out_shape)

    # exp(log(v) / 2) is the square root of v.
    norms = F.exp(F.log(x_sq * y_sq + eps) / 2)
    return dots / norms
def forward(self, x):
    """Apply every layer in ``self.layers`` (each followed by ReLU) per atom.

    Args:
        x: 3-D input of shape ``(n_batch, n_atom, n_channel)``.

    Returns:
        Output reshaped to ``(n_batch, n_atom, self.n_output_channel)``.
    """
    n_batch, n_atom, n_channel = x.shape
    # Merge batch and atom axes so each layer sees 2-D input.
    flat = functions.reshape(x, (n_batch * n_atom, n_channel))
    for layer in self.layers:
        flat = functions.relu(layer(flat))
    return functions.reshape(flat, (n_batch, n_atom, self.n_output_channel))
def __call__(self, x, y):
    """Evaluate a mixture of Gaussian-shaped components at ``y``.

    ``x`` is mapped through ``self.l1_`` and a sigmoid; the result feeds
    ``self.coef_`` (softmax mixing coefficients), ``self.mean_`` and
    ``self.logvar_``.

    Returns:
        The per-sample sum over axis 1 of the weighted component values.
    """
    h = F.sigmoid(self.l1_(x))
    coef = F.softmax(self.coef_(h))
    mean = F.reshape(self.mean_(h), (-1, self.NUM_MIXTURE, self.OUT_DIM))
    logvar = self.logvar_(h)

    # Broadcast the target against every mixture component.
    mean, y = F.broadcast(mean, F.reshape(y, (-1, 1, self.OUT_DIM)))
    sq_dist = F.sum((y - mean) ** 2, axis=2)

    weighted = coef * F.exp(-0.5 * sq_dist * F.exp(-logvar))
    normalizer = (2 * np.pi * F.exp(logvar)) ** (0.5 * self.OUT_DIM)
    return F.sum(weighted / normalizer, axis=1)
def __call__(self, x, context):
    """Compute and report the loss between context embeddings and ``x``.

    Args:
        x: 1-D target ids; a new axis is inserted after axis 0 and the
            result is broadcast to the first two axes of the embedding.
        context: Ids passed to ``self.embed``.

    Returns:
        The value of ``self.loss_func``; it is also reported as ``'loss'``.
    """
    e = self.embed(context)
    dims = e.data.shape
    lead = (dims[0], dims[1])
    n_pairs = dims[0] * dims[1]

    targets = F.reshape(F.broadcast_to(x[:, None], lead), (n_pairs,))
    vectors = F.reshape(e, (n_pairs, dims[2]))

    loss = self.loss_func(vectors, targets)
    reporter.report({'loss': loss}, self)
    return loss
def __call__(self, x, contexts):
    """Compute and report the loss between context embeddings and ``x``.

    Args:
        x: 1-D target ids, one per row of ``contexts``.
        contexts: Ids passed to ``self.embed``; the embedded result must
            be 3-D.

    Returns:
        The value of ``self.loss_func``; it is also reported as ``'loss'``.
    """
    embedded = self.embed(contexts)
    batch_size, n_context, n_units = embedded.shape
    n_pairs = batch_size * n_context

    # Repeat every target once per context position, then flatten both.
    targets = F.broadcast_to(x[:, None], (batch_size, n_context))
    vectors = F.reshape(embedded, (n_pairs, n_units))
    targets = F.reshape(targets, (n_pairs,))

    loss = self.loss_func(vectors, targets)
    reporter.report({'loss': loss}, self)
    return loss
def __call__(self, h, adj):
    # type: (chainer.Variable, chainer.Variable) -> chainer.Variable
    """Run the message layer, then the update layer on every node.

    Args:
        h: 3-D node features of shape ``(mb, node, ch)``.
        adj: Adjacency input, ``(mb, edge_type, node, node)``.

    Returns:
        Variable of shape ``(mb, node, self.hidden_dim)``.
    """
    mb, node, _ = h.shape
    # messages: (mb, node, hidden_dim*2)
    messages = self.message_layer(h, adj)
    flat = functions.reshape(messages, (mb * node, self.hidden_dim * 2))
    # updated: (mb*node, hidden_dim)
    updated = self.update_layer(flat)
    return functions.reshape(updated, (mb, node, self.hidden_dim))
def __call__(self, x, context):
    """Compute and report the loss between context embeddings and ``x``.

    Args:
        x: 1-D target ids.
        context: Ids passed to ``self.embed``.

    Returns:
        The value of ``self.loss_func``; it is also reported as ``'loss'``.
    """
    e = self.embed(context)
    d0, d1, d2 = e.shape[0], e.shape[1], e.shape[2]
    flat_len = d0 * d1

    x = F.broadcast_to(x[:, None], (d0, d1))
    x = F.reshape(x, (flat_len,))
    e = F.reshape(e, (flat_len, d2))

    loss = self.loss_func(e, x)
    # shouldn't we divide loss by batch size?
    reporter.report({'loss': loss}, self)
    return loss
def __call__(self, z, test=False):
    """Generate flattened images from the latent batch ``z``.

    Args:
        z: Latent Variable; ``z.data.shape[0]`` is the number of samples.
        test: Forwarded to every batch-normalization link.

    Returns:
        Variable of shape ``(n_samples, 3 * 96 * 96)``.
    """
    n_samples = z.data.shape[0]

    # The former ``lf = self.l0z(z).creator`` debug line is gone: its
    # result was never used and it ran an extra forward pass through l0z.
    h = F.relu(self.bn0l(self.l0z(z), test=test))
    h = F.reshape(h, (n_samples, 512, 6, 6))
    h = F.relu(self.bn1(self.dc1(h), test=test))
    h = F.relu(self.bn2(self.dc2(h), test=test))
    h = F.relu(self.bn3(self.dc3(h), test=test))

    # dc4 output holds 3 * 96 * 96 values per sample; the final reshape
    # below relies on that.
    x = self.dc4(h)
    return F.reshape(x, (n_samples, 3 * 96 * 96))
def __call__(self, z, h_feat, test=False):
    """Fuse ``z`` with pooled features and project to a 128x7x7 map.

    Args:
        z: Input whose first axis is the batch size.
        h_feat: Feature map that is average-pooled with a 7x7 window and
            reshaped to ``(bs, 128)``.
        test: Passed positionally to ``self.bn``.

    Returns:
        ``self.act`` applied to the output reshaped to ``(bs, 128, 7, 7)``.
    """
    bs = z.shape[0]
    pooled = F.reshape(F.average_pooling_2d(h_feat, (7, 7)), (bs, 128))
    fused = self.linear(F.concat((z, pooled)))
    fused = self.bn(fused, test)
    return self.act(F.reshape(fused, (bs, 128, 7, 7)))
def forward(self, x_data, y_data, train=True):
    """Return the mean squared error between the prediction and ``y_data``.

    Args:
        x_data: Input array; ``len(x_data)`` is the batch size.
        y_data: Target array.
        train: Enables dropout; its negation is used as ``volatile``.
    """
    batchsize = len(x_data)
    x = chainer.Variable(x_data, volatile=not train)
    t = chainer.Variable(y_data, volatile=not train)

    h = F.reshape(x, (batchsize, self.channel, -1))
    in_ch = self.channel
    stages = ((self.conv1, 10), (self.conv2, 10),
              (self.conv3, 100), (self.conv4, 100))
    for conv, out_ch in stages:
        # Add a trailing axis of length 1 for the convolution, then drop
        # it again before the tanh.
        h = conv(F.reshape(h, (batchsize, in_ch, -1, 1)))
        h = F.tanh(F.reshape(h, (batchsize, out_ch, -1)))
        in_ch = out_ch

    h = F.dropout(F.tanh(self.fc5(h)), train=train)
    y = self.fc6(h)
    return F.mean_squared_error(y, t)
def forward(self, x_data, y_data, train=True):
    """Return ``(softmax cross entropy, accuracy)`` for one batch.

    Args:
        x_data: Input array; ``len(x_data)`` is the batch size.
        y_data: Label array; flattened to ``(len(y_data),)``.
        train: Enables dropout; its negation is used as ``volatile``.
    """
    batchsize = len(x_data)
    x = chainer.Variable(x_data, volatile=not train)
    t = chainer.Variable(y_data.reshape(len(y_data),), volatile=not train)

    h = F.reshape(x, (batchsize, self.channel, -1))
    in_ch = self.channel
    stages = ((self.conv1, 10), (self.conv2, 10),
              (self.conv3, 100), (self.conv4, 100))
    for conv, out_ch in stages:
        # Add a trailing axis of length 1 for the convolution, then drop
        # it again before the tanh.
        h = conv(F.reshape(h, (batchsize, in_ch, -1, 1)))
        h = F.tanh(F.reshape(h, (batchsize, out_ch, -1)))
        in_ch = out_ch

    h = F.dropout(F.tanh(self.fc5(h)), train=train)
    y = self.fc6(h)
    return F.softmax_cross_entropy(y, t), F.accuracy(y, t)
def predict(self, x_data, train=False):
    """Return softmax class probabilities for ``x_data``.

    Args:
        x_data: Input array; ``len(x_data)`` is the batch size.
        train: Forwarded to ``F.dropout``.
    """
    batchsize = len(x_data)
    x = chainer.Variable(x_data, volatile=True)

    h = F.reshape(x, (batchsize, self.channel, -1))
    in_ch = self.channel
    stages = ((self.conv1, 10), (self.conv2, 10),
              (self.conv3, 100), (self.conv4, 100))
    for conv, out_ch in stages:
        # Add a trailing axis of length 1 for the convolution, then drop
        # it again before the tanh.
        h = conv(F.reshape(h, (batchsize, in_ch, -1, 1)))
        h = F.tanh(F.reshape(h, (batchsize, out_ch, -1)))
        in_ch = out_ch

    h = F.dropout(F.tanh(self.fc5(h)), train=train)
    return F.softmax(self.fc6(h))
def forward(self, x):
    """Transform pair features layer by layer, then read out over axis 2.

    Args:
        x: 3-D input of shape ``(n_batch, n_pair, n_feature)``; the total
            size must match ``n_batch * n_atom * n_atom * n_feature``.

    Returns:
        ``self.readout`` applied over axis 2 of the
        ``(n_batch, n_atom, n_atom, n_channel)`` activations.
    """
    n_batch, _, n_feature = x.shape
    n_atom = self.n_atom
    pairs = functions.reshape(x, (n_batch * (n_atom * n_atom), n_feature))
    for layer in self.linearLayer:
        pairs = functions.relu(layer(pairs))
    pairs = functions.reshape(
        pairs, (n_batch, n_atom, n_atom, self.n_channel))
    return self.readout(pairs, axis=2)
def __call__(self, x, z, test=False):
    """Standardize the (optionally projected) input and combine it with ``z``.

    Side effects: stores the broadcast batch mean in ``self.mu`` and the
    broadcast batch standard deviation in ``self.vr``.

    Args:
        x: Input batch; used as is when ``self.nolin`` is set, otherwise
            passed through ``self.lin``.
        z: Second argument of ``self.comb``.
        test: Accepted but not used by this method.

    Returns:
        ``self.comb(standardized, z)``.
    """
    h = x if self.nolin else self.lin(x)
    n_rows, n_cols = h.data.shape[0], h.data.shape[1]

    mean = F.sum(h, axis=0) / n_rows
    self.mu = F.broadcast(F.reshape(mean, (1, n_cols)), h)[0]

    centered = h - self.mu
    std = (F.sum(centered * centered, axis=0) / n_rows) ** 0.5
    self.vr = F.broadcast(F.reshape(std, (1, n_cols)), h)[0]

    bnh = centered / (self.vr + 1e-7)
    return self.comb(bnh, z)
def __call__(self, x, eta, test=False):
    """Project, standardize, add Gaussian noise, then scale and shift.

    Side effects: stores the broadcast batch mean in ``self.mu`` and the
    broadcast batch standard deviation in ``self.vr``.

    Args:
        x: Input batch for ``self.lin``.
        eta: Multiplier of the ``randn`` noise added after standardizing.
        test: Accepted but not used by this method.

    Returns:
        Tuple ``(z, out)`` where ``z`` is the noisy standardized
        activation and ``out`` is ``gamma * (z + beta)``, passed through
        ``self.act`` when it is not ``None``.
    """
    h = self.lin(x)
    n_rows, n_cols = h.data.shape[0], h.data.shape[1]

    mean = F.sum(h, axis=0) / n_rows
    self.mu = F.broadcast(F.reshape(mean, (1, n_cols)), h)[0]

    centered = h - self.mu
    std = (F.sum(centered * centered, axis=0) / n_rows) ** 0.5
    self.vr = F.broadcast(F.reshape(std, (1, n_cols)), h)[0]

    bnh = centered / (self.vr + 1e-7)
    z = bnh + xp.random.randn(x.data.shape[0], self.n_out) * eta

    gamma = F.broadcast(self.gamma.W, z)[0]
    beta = F.broadcast(self.beta.W, z)[0]
    out = gamma * (z + beta)
    if self.act is not None:
        out = self.act(out)
    return z, out
def __call__(self, x, h_feat, test=False):
    """Score ``x`` together with the auxiliary feature map ``h_feat``.

    Args:
        x: Input batch; ``x.shape[0]`` is the batch size.
        h_feat: Feature map, average-pooled with a 7x7 window and
            reshaped to ``(bs, 128)``.
        test: Passed positionally to both conv units.

    Returns:
        Output of ``self.linear`` on the concatenated pooled features.
    """
    bs = x.shape[0]
    h = self.convunit0(x, test)
    h = self.convunit1(h, test)

    h_feat = F.average_pooling_2d(h_feat, (7, 7))
    h_feat = F.reshape(h_feat, (bs, 128))

    h = F.average_pooling_2d(h, (7, 7))
    # Flatten the pooled conv features themselves.  This line used to
    # reshape ``h_feat``, which threw the conv path away and concatenated
    # ``h_feat`` with itself.
    h = F.reshape(h, (bs, 128))

    h = F.concat((h, h_feat))
    h = self.linear(h)
    # h = F.sigmoid(h)
    return h
def get_x(self, x, y):
    """Flatten axes 1-3 of ``x`` and return the data as a CPU array.

    Args:
        x: Input with at least four axes.
        y: Accepted but not used by this method.
    """
    n_samples = x.shape[0]
    n_features = x.shape[1] * x.shape[2] * x.shape[3]
    flat = F.reshape(x, (n_samples, n_features))
    return chainer.cuda.to_cpu(flat.data)
def forward(self, x, p):
    """Reshape ``x`` three ways.

    Returns:
        Tuple of ``x`` reshaped to ``(24, 12)``, ``(6, -1, 12)`` and
        ``(-1, p)``.
    """
    target_shapes = ((24, 12), (6, -1, 12), (-1, p))
    y1, y2, y3 = [F.reshape(x, shape) for shape in target_shapes]
    return (y1, y2, y3)
def __call__(self, x, **kwargs):
    """__call__(self, x, finetune=False)

    Invokes the forward propagation of BatchNormalization.

    When the batch has more than one sample, ``x`` is given a new leading
    axis and ``gamma``, ``beta``, the stored mean and the stored variance
    are each stacked ``batch_size`` times before normalization; the result
    is reshaped back to the original shape of ``x``.

    In training mode (``chainer.config.train`` is true) the input is
    normalized with ``BatchNormalizationFunction``.  Otherwise the mean and
    variance are computed from the (reshaped) input itself and passed to
    ``fixed_batch_normalization``.

    .. warning::

       ``test`` argument is not supported anymore since v2.
       Instead, use ``chainer.using_config('train', False)``.
       See :func:`chainer.using_config`.

    Args:
        x (Variable): Input variable.
        finetune (bool): If it is in the training mode and ``finetune`` is
            ``True``, ``self.N`` is incremented and ``1 - 1 / self.N`` is
            used as the decay instead of ``self.decay``.

    Returns:
        The normalized input, with the same shape as ``x``.
    """
    # check argument
    argument.check_unexpected_kwargs(
        kwargs, test='test argument is not supported anymore. '
        'Use chainer.using_config')
    finetune, = argument.parse_kwargs(kwargs, ('finetune', False))

    original_shape = x.shape
    batch_size = original_shape[0]

    # reshape input x if batchsize > 1
    if batch_size > 1:
        reshaped_x = functions.expand_dims(x, axis=0)
    else:
        reshaped_x = x

    # Use the learned scale when the link has one, otherwise all ones.
    if hasattr(self, 'gamma'):
        gamma = self.gamma
        if self.norm_grad:
            # gamma.add_batch(batch_size)
            # NOTE(review): n_batch is presumably read elsewhere to
            # normalize the gradient - confirm against its consumer.
            gamma.n_batch = batch_size
    else:
        with cuda.get_device_from_id(self._device_id):
            gamma = variable.Variable(self.xp.ones(
                self.avg_mean.shape, dtype=x.dtype))

    # Use the learned shift when the link has one, otherwise all zeros.
    if hasattr(self, 'beta'):
        beta = self.beta
        if self.norm_grad:
            # beta.add_batch(batch_size)
            beta.n_batch = batch_size
    else:
        with cuda.get_device_from_id(self._device_id):
            beta = variable.Variable(self.xp.zeros(
                self.avg_mean.shape, dtype=x.dtype))

    # align shapes if x was reshaped
    if batch_size > 1:
        mean = self.xp.stack((self.avg_mean,) * batch_size)
        var = self.xp.stack((self.avg_var,) * batch_size)
        gamma = functions.stack((gamma,) * batch_size)
        beta = functions.stack((beta,) * batch_size)
    else:
        mean = self.xp.asarray(self.avg_mean)
        var = self.xp.asarray(self.avg_var)

    if configuration.config.train:
        if finetune:
            self.N += 1
            decay = 1. - 1. / self.N
        else:
            decay = self.decay

        func = batch_normalization.BatchNormalizationFunction(
            self.eps, mean, var, decay)
        ret = func(reshaped_x, gamma, beta)
    else:
        # Statistics are taken from the current input (over axis 0 and
        # every axis past those covered by gamma), not from avg_mean /
        # avg_var; the values assigned to mean / var above are overwritten.
        head_ndim = gamma.ndim + 1
        axis = (0,) + tuple(range(head_ndim, reshaped_x.ndim))
        mean = reshaped_x.data.mean(axis=axis)
        var = reshaped_x.data.var(axis=axis)
        ret = functions.fixed_batch_normalization(
            reshaped_x, gamma, beta, mean, var, self.eps)

    # ret is normalized input x
    if batch_size > 1:
        ret = functions.reshape(ret, original_shape)
    return ret
def translate(self, xs):
    """Translate a batch of source sequences with a width-3 beam search.

    The encoder states are rearranged into the layout the decoder is fed
    with, then ``self.n_maxlen`` decoding steps are run.  At every step
    each beam entry is expanded with every target-vocabulary id and the
    three best-scoring candidates per sentence are kept.  Decoding does
    not stop early at EOS; the output is truncated at the first EOS
    afterwards.

    Args:
        xs: List of source id arrays, one per sentence.

    Returns:
        List with one list of target ids per sentence (best beam only),
        cut before the first ``self.v_eos_dst``.
    """
    EOS_DST = self.v_eos_dst
    batch = len(xs)
    beam_with = 3

    with chainer.no_backprop_mode(), chainer.using_config('train', False):
        exs = sequence_embed(self.embed_x, xs)
        hx, cx, xs_outputs = self.encoder(None, None, exs)
        # Regroup the encoder states into n_layers rows of width
        # n_units * 2 per sentence.
        state_shape = (batch, self.n_layers, self.n_units * 2)
        hx = F.transpose(
            F.reshape(F.transpose(hx, (1, 0, 2)), state_shape), (1, 0, 2))
        cx = F.transpose(
            F.reshape(F.transpose(cx, (1, 0, 2)), state_shape), (1, 0, 2))

        # One beam entry = (scores, id histories, h, c, last ids), each
        # holding one element per sentence.
        vs = [xp.array([EOS_DST]) for _ in xs_outputs]
        kds = [[] for _ in xs_outputs]
        rs = [0.0 for _ in xs_outputs]
        beam_data = [(rs, kds, hx, cx, vs)]

        for _ in range(self.n_maxlen):
            to_beam = [[] for _ in range(batch)]
            for rs, kds, nhx, ncx, vs in beam_data:
                if isinstance(nhx, list):
                    # States regrouped per sentence on the previous step
                    # have to be stacked back into one batch.
                    nhx = F.stack(nhx, axis=1)
                    ncx = F.stack(ncx, axis=1)
                vs = [xp.array(v) for v in vs]
                evs = sequence_embed(self.embed_y, vs)
                thx, tcx, ys = self.decoder(nhx, ncx, evs)
                wy = F.log_softmax(self.W(F.concat(ys, axis=0))).data
                thx = F.separate(thx, axis=1)
                tcx = F.separate(tcx, axis=1)
                for i in range(batch):
                    ahx, acx = thx[i], tcx[i]
                    for t, nr in enumerate(wy[i]):
                        to_beam[i].append(
                            (rs[i] + nr, kds[i] + [t], ahx, acx, [t]))

            # Keep the best candidates of every sentence, best first.
            to_beam = [sorted(cands, reverse=True)[:beam_with]
                       for cands in to_beam]
            # Transpose back to one tuple of per-sentence lists per rank.
            beam_data = [
                tuple([[to_beam[i][k][s] for i in range(batch)]
                       for s in range(5)])
                for k in range(beam_with)
            ]

        # Id histories of the top-ranked beam entry.
        result = beam_data[0][1]

    # Remove EOS tags
    outs = []
    for y in result:
        if EOS_DST in y:
            y = y[:y.index(EOS_DST)]
        outs.append(y)
    print(xs, outs)
    return outs
def calc_loss(self, predictions, labels):
    """Softmax cross entropy over all leading axes of ``predictions``.

    ``predictions`` is flattened to two axes (keeping the last one) and
    ``labels`` is flattened with ``ravel`` before the loss is computed.
    """
    n_classes = predictions.shape[-1]
    flat_labels = labels.ravel()
    flat_predictions = F.reshape(predictions, (-1, n_classes))
    return F.softmax_cross_entropy(flat_predictions, flat_labels)
def train(source_bpe, target_bpe, source_glove, target_glove, chunk_length,
          batch_size, warmup_steps, save_decimation, num_steps, gpu_id, out,
          log_level):
    """Train the Transformer and checkpoint it into ``out``.

    Logging goes to stdout and to ``<out>/training.log``.  Training state
    is loaded from ``out`` before the loop and saved every
    ``save_decimation`` iterations as well as on exit (including on
    error).  The learning rate is recomputed every step from
    ``state.step`` and ``warmup_steps``.

    Args:
        source_bpe, target_bpe: Inputs forwarded to ``make_dataset``.
        source_glove, target_glove: Inputs forwarded to ``make_vocab``.
        chunk_length: Forwarded to ``make_dataset``.
        batch_size: Batch size of the iterator.
        warmup_steps: Warm-up length of the learning-rate schedule.
        save_decimation: Checkpoint interval in iterations.
        num_steps: Maximum number of iterations of this call.
        gpu_id: Device id; ``None`` is treated as ``-1``.
        out: Output directory, created when missing.
        log_level: Name of a ``logging`` level attribute.
    """
    if not os.path.exists(out):
        os.makedirs(out)

    level = getattr(logging, log_level)
    handlers = (
        logging.StreamHandler(sys.stdout),
        logging.FileHandler(filename=os.path.join(out, 'training.log'),
                            mode='a'),
    )
    for handler in handlers:
        handler.setLevel(level)
        handler.setFormatter(logging.Formatter('%(message)s'))
    for handler in handlers:
        logger.addHandler(handler)
    logger.setLevel(level)

    if gpu_id is None:
        gpu_id = -1
    device_name = f'@cupy:{gpu_id}' if gpu_id >= 0 else '@intel64'

    with chainer.using_device(device_name):
        source_vocab = make_vocab(source_glove)
        target_vocab = make_vocab(target_glove)
        output_model_dim = target_vocab.embedding_size
        dataset = make_dataset(source_bpe, target_bpe, source_vocab,
                               target_vocab, chunk_length)
        iterator = MultithreadIterator(dataset, batch_size)
        state = TrainingState()
        model = Transformer(source_vocab, target_vocab)
        model.to_gpu(gpu_id)
        optimizer = Adam(beta1=0.99, beta2=0.98, eps=1e-9).setup(model)
        load_training(out, model, optimizer, state)

        try:
            for n, batch in enumerate(iterator):
                if n >= num_steps:
                    break
                if (n + 1) % save_decimation == 0:
                    save_training(out, model, optimizer, state)

                model.cleargrads()
                gc.collect()

                source, target = stack_nested(batch)
                for part in (source.token_ids, source.masks,
                             target.token_ids, target.masks):
                    part.to_gpu(gpu_id)

                output_probs = model.train_forward(
                    source.token_ids,
                    target.token_ids,
                    input_masks=source.masks,
                    output_masks=target.masks)

                # Flatten (batch, length) into one axis for the loss.
                flat_probs = F.reshape(
                    output_probs,
                    (output_probs.shape[0] * output_probs.shape[1],
                     output_probs.shape[2]))
                flat_targets = F.reshape(
                    target.token_ids,
                    (target.token_ids.shape[0] * target.token_ids.shape[1], ))
                unnormalized_loss = F.softmax_cross_entropy(
                    flat_probs, flat_targets, reduce='no')

                # 1.0 where the target mask is false, 0.0 where it is true.
                loss_mask = xp.reshape(
                    xp.logical_not(target.masks.array).astype(xp.float32),
                    (target.masks.shape[0] * target.masks.shape[1], ))
                loss = F.sum(unnormalized_loss * loss_mask) / F.sum(loss_mask)
                loss.backward()

                decayed = state.step**-0.5
                warming = state.step * (warmup_steps**-1.5)
                learning_rate = (output_model_dim**-0.5) * min(decayed, warming)
                optimizer.alpha = learning_rate
                optimizer.update()

                logger.info(
                    f'time = {int(time.time())} | step = {state.step} | loss = {float(loss.array)} | lr = {learning_rate}'
                )
                state.step += 1
        finally:
            save_training(out, model, optimizer, state)
def __call__(self, x, get_feature=False, scaled=False, resize=False):
    """Run the Inception network.

    Input dims are (batch_size, 3, 299, 299) once the optional resize has
    been applied.

    Args:
        x: Image batch.  It is never modified in place.
        get_feature: When true, return the 2048-dimensional pooled
            features instead of class probabilities.
        scaled: When true, ``x`` is first mapped with ``(x + 1) * 127.5``.
        resize: When true, ``x`` is first resized to 299 x 299.

    Returns:
        The pooled features of shape ``(batch_size, 2048)`` when
        ``get_feature`` is true, otherwise the softmax of ``self.logit``.
    """
    if resize:
        x = F.resize_images(x, (299, 299))
    if scaled:
        x = (x + 1) * 127.5

    # Out-of-place on purpose: the former ``x -= 128.0; x *= 0.0078125``
    # overwrote the caller's buffer whenever a raw array was passed in.
    x = (x - 128.0) * 0.0078125

    # Stem.  Expected shapes (excluding the batch axis) are noted per step.
    h = F.relu(self.bn_conv(self.conv(x)))        # (32, 149, 149)
    h = F.relu(self.bn_conv_1(self.conv_1(h)))    # (32, 147, 147)
    h = F.relu(self.bn_conv_2(self.conv_2(h)))    # (64, 147, 147)
    h = F.max_pooling_2d(h, 3, stride=2, pad=0)   # (64, 73, 73)
    h = F.relu(self.bn_conv_3(self.conv_3(h)))    # (80, 73, 73)
    h = F.relu(self.bn_conv_4(self.conv_4(h)))    # (192, 71, 71)
    h = F.max_pooling_2d(h, 3, stride=2, pad=0)   # (192, 35, 35)

    # Inception blocks, applied in order.
    h = self.mixed(h)      # (256, 35, 35)
    h = self.mixed_1(h)    # (288, 35, 35)
    h = self.mixed_2(h)    # (288, 35, 35)
    h = self.mixed_3(h)    # (768, 17, 17)
    h = self.mixed_4(h)    # (768, 17, 17)
    h = self.mixed_5(h)    # (768, 17, 17)
    h = self.mixed_6(h)    # (768, 17, 17)
    h = self.mixed_7(h)    # (768, 17, 17)
    h = self.mixed_8(h)    # (1280, 8, 8)
    h = self.mixed_9(h)    # (2048, 8, 8)
    h = self.mixed_10(h)   # (2048, 8, 8)

    h = F.average_pooling_2d(h, 8, 1)             # (2048, 1, 1)
    h = F.reshape(h, (-1, 2048))

    if get_feature:
        return h
    return F.softmax(self.logit(h))               # (1008,)
def forward(self, hs, ys):
    '''Decoder forward

    Runs the attention decoder over the target sequences with teacher
    forcing and stores the resulting loss in ``self.loss``.

    :param hs: iterable of encoder outputs; ``shape[0]`` of each element
        is logged as its input length
    :param ys: iterable of target id sequences; each one is prefixed with
        ``self.sos`` for the input and suffixed with ``self.eos`` for the
        output
    :return: the loss (also available as ``self.loss``)
    '''
    self.loss = None
    # prepare input and output word sequences with sos/eos IDs
    eos = self.xp.array([self.eos], 'i')
    sos = self.xp.array([self.sos], 'i')
    ys_in = [F.concat([sos, y], axis=0) for y in ys]
    ys_out = [F.concat([y, eos], axis=0) for y in ys]

    # padding for ys: inputs are padded with eos, outputs with -1
    # pys: utt x olen
    pad_ys_in = F.pad_sequence(ys_in, padding=self.eos)
    pad_ys_out = F.pad_sequence(ys_out, padding=-1)

    # get dim, length info
    batch = pad_ys_out.shape[0]
    olength = pad_ys_out.shape[1]
    logging.info(self.__class__.__name__ + ' input lengths:  ' + str(self.xp.array([h.shape[0] for h in hs])))
    logging.info(self.__class__.__name__ + ' output lengths: ' + str(self.xp.array([y.shape[0] for y in ys_out])))

    # initialization
    c_list = [None]  # list of cell state of each layer
    z_list = [None]  # list of hidden state of each layer
    for l in six.moves.range(1, self.dlayers):
        c_list.append(None)
        z_list.append(None)
    att_w = None
    z_all = []
    self.att.reset()  # reset pre-computation of h

    # pre-computation of embedding
    eys = self.embed(pad_ys_in)
    eys = F.separate(eys, axis=1)

    # loop for an output sequence
    for i in six.moves.range(olength):
        # Attention is driven by the first layer's hidden state and the
        # previous attention weights.
        att_c, att_w = self.att(hs, z_list[0], att_w)
        # EDIT(hamaji): No scheduled sampling.  The branch that fed the
        # model's own previous prediction back in (taken with probability
        # self.sampling_probability) was removed; the ground-truth
        # embedding is always used.
        ey = F.hstack((eys[i], att_c))  # utt x (zdim + hdim)
        # EDIT(hamaji): Unrolled, etc.  The generic loop over
        # self['lstm%d' % l] was replaced by explicit lstm0 / lstm1
        # calls, so at most two decoder layers are run here.
        c_list_new = []
        z_list_new = []
        c_new, z_new = self.lstm0(c_list[0], z_list[0], ey)
        c_list_new.append(c_new)
        z_list_new.append(z_new)
        if self.dlayers > 1:
            c_new, z_new = self.lstm1(c_list[1], z_list[1], z_list_new[-1])
            c_list_new.append(c_new)
            z_list_new.append(z_new)
        c_list = c_list_new
        z_list = z_list_new
        z_all.append(z_list[-1])

    # Stack the top-layer states over time and flatten (batch, time).
    z_all = F.reshape(F.stack(z_all, axis=1), (batch * olength, self.dunits))
    # compute loss
    y_all = self.output(z_all)
    # EDIT(hamaji): `np.flatten` implemented by ourselves.
    # self.loss = F.softmax_cross_entropy(y_all, F.flatten(pad_ys_out))
    self.loss = F.softmax_cross_entropy(y_all, _flatten(pad_ys_out))
    # -1: eos, which is removed in the loss computation
    # EDIT(hamaji): `np.mean` implemented by a naive loop.
    # self.loss *= (np.mean([len(x) for x in ys_in]) - 1)
    self.loss *= _mean(ys_in) - 1
    # EDIT(hamaji): No need to compute accuracy.
    # acc = F.accuracy(y_all, F.flatten(pad_ys_out), ignore_label=-1)
    # logging.info('att loss:' + str(self.loss.data))

    # EDIT(hamaji): Skip verbose logging.  The debug dump of predicted
    # vs. ground-truth character sequences was removed.

    # EDIT(hamaji): Skip `labeldist` thing.  The label-smoothing
    # regularizer based on self.labeldist / self.lsm_weight was removed.

    # EDIT(hamaji): Return loss only.
    # return self.loss, acc
    return self.loss
def act(self, state, reward, is_state_terminal):
    """Record a transition, update the model if a rollout ended, and act.

    Args:
        state: Current observation; passed through ``self.phi`` before being
            wrapped in a ``chainer.Variable`` (only when not terminal).
        reward: Reward received for the previous action. Clipped to
            ``[-1, 1]`` when ``self.clip_reward`` is set.
        is_state_terminal: Whether ``state`` ends the episode.

    Returns:
        ``pout.action_indices[0]`` for a non-terminal state, or ``None`` for
        a terminal state (after resetting the model state).
    """
    if self.clip_reward:
        reward = np.clip(reward, -1, 1)

    if not is_state_terminal:
        statevar = chainer.Variable(np.expand_dims(self.phi(state), 0))

    # The reward belongs to the action taken at the previous step.
    self.past_rewards[self.t - 1] = reward

    # Update when the episode ended with pending steps, or when t_max steps
    # have been accumulated since the last update.
    if (is_state_terminal and self.t_start < self.t) \
            or self.t - self.t_start == self.t_max:

        assert self.t_start < self.t

        # Bootstrap the return from the value estimate unless terminal.
        if is_state_terminal:
            R = 0
        else:
            _, vout = self.model.pi_and_v(statevar, keep_same_state=True)
            R = float(vout.data)

        pi_loss = 0
        v_loss = 0
        # Walk the stored steps backwards, accumulating the discounted return.
        for i in reversed(range(self.t_start, self.t)):
            R *= self.gamma
            R += self.past_rewards[i]
            v = self.past_values[i]
            if self.process_idx == 0:
                logger.debug('s:%s v:%s R:%s',
                             self.past_states[i].data.sum(), v.data, R)
            advantage = R - v
            # Accumulate gradients of policy
            log_prob = self.past_action_log_prob[i]
            entropy = self.past_action_entropy[i]

            # Log probability is increased proportionally to advantage
            # (the advantage is converted to a float, so no gradient flows
            # through it into the value function here).
            pi_loss -= log_prob * float(advantage.data)
            # Entropy is maximized
            pi_loss -= self.beta * entropy
            # Accumulate gradients of value function
            v_loss += (v - R)**2 / 2

        if self.pi_loss_coef != 1.0:
            pi_loss *= self.pi_loss_coef

        if self.v_loss_coef != 1.0:
            v_loss *= self.v_loss_coef

        # Normalize the loss of sequences truncated by terminal states
        if self.keep_loss_scale_same and \
                self.t - self.t_start < self.t_max:
            # NOTE(review): with integer operands this is floor division
            # under Python 2 -- confirm the intended interpreter.
            factor = self.t_max / (self.t - self.t_start)
            pi_loss *= factor
            v_loss *= factor

        if self.process_idx == 0:
            logger.debug('pi_loss:%s v_loss:%s', pi_loss.data, v_loss.data)

        total_loss = pi_loss + F.reshape(v_loss, pi_loss.data.shape)

        # Compute gradients using thread-specific model
        self.model.zerograds()
        total_loss.backward()
        # Copy the gradients to the globally shared model
        self.shared_model.zerograds()
        copy_param.copy_grad(
            target_link=self.shared_model, source_link=self.model)
        # Update the globally shared model
        if self.process_idx == 0:
            norm = self.optimizer.compute_grads_norm()
            logger.debug('grad norm:%s', norm)
        self.optimizer.update()
        if self.process_idx == 0:
            logger.debug('update')

        self.sync_parameters()
        self.model.unchain_backward()

        # Start a fresh rollout buffer.
        self.past_action_log_prob = {}
        self.past_action_entropy = {}
        self.past_states = {}
        self.past_rewards = {}
        self.past_values = {}

        self.t_start = self.t

    if not is_state_terminal:
        self.past_states[self.t] = statevar
        pout, vout = self.model.pi_and_v(statevar)
        self.past_action_log_prob[self.t] = pout.sampled_actions_log_probs
        self.past_action_entropy[self.t] = pout.entropy
        self.past_values[self.t] = vout
        self.t += 1
        if self.process_idx == 0:
            logger.debug('t:%s entropy:%s, probs:%s',
                         self.t, pout.entropy.data, pout.probs.data)
        return pout.action_indices[0]
    else:
        self.model.reset_state()
        return None
def _flatten(xs):
    """Return ``xs`` reshaped to a single axis holding all ``xs.size`` elements."""
    flat_shape = (xs.size,)
    return F.reshape(xs, flat_shape)
def __call__(self, x):
    """Reshape ``x`` to ``self.shape`` while keeping its leading axis.

    The size of the first axis of ``x`` is preserved and ``self.shape`` is
    appended to it to form the target shape.
    """
    target_shape = (x.shape[0],) + self.shape
    return F.reshape(x, target_shape)
def _elementwise_softmax_cross_entropy(x, t):
    """Compute an unreduced softmax cross entropy shaped like ``t``.

    ``x`` carries the class scores on its last axis; all of its leading axes
    must match the shape of ``t`` exactly.
    """
    label_shape = t.shape
    assert x.shape[:-1] == label_shape
    n_class = x.shape[-1]
    scores = F.reshape(x, (-1, n_class))
    labels = F.flatten(t)
    losses = F.softmax_cross_entropy(scores, labels, reduce='no')
    return F.reshape(losses, label_shape)
def draw(self, x):
    """Return, per entry of axis 0, the axis-1 index with the largest UCB.

    The first two axes of ``x`` are merged before calling
    ``self.regressor.ucb`` and restored on its output.
    """
    n_rows, n_cols, n_feat = x.shape[0], x.shape[1], x.shape[2]
    flat = F.reshape(x, (n_rows * n_cols, n_feat))
    scores = F.reshape(self.regressor.ucb(flat), (n_rows, n_cols))
    return F.argmax(scores, axis=1)
def __call__(self, ids, ts_link, ts_type, ts_link_type):
    """Encode the essays in ``ids`` and score links between their spans.

    Args:
        ids: essay ids
        ts_link: gold links (forwarded to ``self.decoder_net``)
        ts_type: gold ac types (not read in this method)
        ts_link_type: gold link types (not read in this method)

    Return:
        (masked_pair_scores, ac_types, link_types), where
        ``masked_pair_scores`` has shape
        (batchsize * max_n_spans, max_n_spans + 1).
    """
    # ---- load data ----
    xs, x_spans, shell_spans, x_position_info = self.load_data(ids)

    n_items = len(xs)
    assert n_items == len(x_spans)
    assert n_items == len(shell_spans)
    first_span = x_spans[0][0]
    first_shell = shell_spans[0][0]
    assert first_span[1] >= first_span[0]
    assert first_shell[1] >= first_shell[0]
    assert len(x_position_info[0][0]) == 3

    # ---- load embeddings ----
    xs_embed = (self.load_elmo(ids, xs)
                if self.use_elmo
                else self.sequence_embed(self.Embed_x, xs, False))

    x_section = self.get_section(xs, x_spans)
    position_info = self.get_position_info(x_position_info)
    relative_position_info = self.get_relative_position_info(
        x_position_info)

    # ---- encoder ----
    span_reps = self.hierarchical_encode(
        ids, xs, xs_embed, position_info,
        x_spans, shell_spans, x_position_info)

    # ---- decoder ----
    decoded = self.decoder_net(
        span_reps, x_spans, x_section,
        position_info, relative_position_info, ts_link)
    pair_scores, ac_types, link_types, span_reps_pad = decoded

    # Optionally replace the learned scores with the heuristic baseline.
    if self.baseline_heuristic:
        pair_scores = self.majority_voting_to_links(position_info)

    masked = self.mask_link_scores(
        pair_scores, x_spans, self.batchsize, self.max_n_spans,
        mask_type="minus_inf")

    # (batchsize*max_n_spans, max_n_spans+1)
    flat_shape = (self.batchsize * self.max_n_spans, self.max_n_spans + 1)
    masked = chaFunc.reshape(masked, flat_shape)

    return masked, ac_types, link_types
def encoder(self, x, batchsize, train=True):
    """Encode a batch through four residual conv stages into flat features.

    ``x`` is reshaped to (batchsize, 84, 84, 3) and moved to channel-first
    layout. Each stage ``k`` (1..4) applies a shortcut branch
    (``l_conv{k}_r`` / ``l_norm{k}_r``) and a three-layer main branch
    (``l_conv{k}_1..3`` / ``l_norm{k}_1..3``), adds them, then applies ReLU,
    2x2 max pooling and dropout. A final 6x6 average pooling is flattened to
    (batchsize, -1).

    Args:
        x: Input batch, reshapeable to (batchsize, 84, 84, 3).
        batchsize: Number of samples in ``x``.
        train: Value used for chainer's ``train`` config during the pass.
    """
    # Dropout ratio applied after the pooling of stages 1..4.
    stage_dropout = (0.3, 0.2, 0.2, 0.2)

    with chainer.using_config('train', train):
        h = F.transpose(F.reshape(x, (batchsize, 84, 84, 3)), [0, 3, 1, 2])

        for stage, ratio in enumerate(stage_dropout, start=1):
            # Shortcut (residual) branch.
            shortcut = getattr(self.chain, 'l_conv%d_r' % stage)(h)
            shortcut = getattr(self.chain, 'l_norm%d_r' % stage)(shortcut)

            # Main branch: conv/norm/relu twice, then conv/norm.
            branch = h
            for step in (1, 2):
                branch = getattr(
                    self.chain, 'l_conv%d_%d' % (stage, step))(branch)
                branch = getattr(
                    self.chain, 'l_norm%d_%d' % (stage, step))(branch)
                branch = F.relu(branch)
            branch = getattr(self.chain, 'l_conv%d_3' % stage)(branch)
            branch = getattr(self.chain, 'l_norm%d_3' % stage)(branch)

            merged = F.relu(branch + shortcut)
            h = F.max_pooling_2d(merged, 2)
            h = F.dropout(h, ratio=ratio)

        pooled = F.average_pooling_2d(h, 6)
        return F.reshape(pooled, (batchsize, -1))
def generate_image(img_orig, img_style, width, nw, nh, max_iter, lr, img_gen=None):
    """Optimise an image to match content and style feature statistics.

    Args:
        img_orig: Content image array fed to ``nn.forward``.
        img_style: Style image array fed to ``nn.forward``.
        width: Side length of the randomly initialised image, whose shape is
            (1, 3, width, width).
        nw, nh: Forwarded unchanged to ``save_image``.
        max_iter: Number of optimisation iterations.
        lr: ``alpha`` of the Adam optimizer.
        img_gen: Optional initial image. When ``None`` it is drawn uniformly
            from [-20, 20). The array is updated in place.

    Relies on the module-level names ``nn``, ``args``, ``xp``, ``W``,
    ``get_matrix``, ``Clip`` and ``save_image``.
    """
    mid_orig = nn.forward(Variable(img_orig, volatile=True))
    style_mats = [
        get_matrix(y) for y in nn.forward(Variable(img_style, volatile=True))
    ]

    if img_gen is None:
        if args.gpu >= 0:
            img_gen = xp.random.uniform(
                -20, 20, (1, 3, width, width), dtype=np.float32)
        else:
            img_gen = np.random.uniform(
                -20, 20, (1, 3, width, width)).astype(np.float32)

    x = Variable(img_gen)
    xg = xp.zeros_like(x.data)
    optimizer = optimizers.Adam(alpha=lr)
    # The optimizer updates img_gen directly, using xg as its gradient buffer.
    optimizer.setup((img_gen, xg))

    for i in range(max_iter):
        x = Variable(img_gen)
        y = nn.forward(x)

        optimizer.zero_grads()
        L = Variable(xp.zeros((), dtype=np.float32))
        for l in range(len(y)):
            ch = y[l].data.shape[1]
            wd = y[l].data.shape[2]
            # Gram matrix of the layer's feature maps.
            gogh_y = F.reshape(y[l], (ch, wd**2))
            gogh_matrix = F.matmul(
                gogh_y, gogh_y, transb=True) / np.float32(ch * wd**2)

            # Content loss.
            L1 = np.float32(args.lam) * np.float32(
                nn.alpha[l]) * F.mean_squared_error(
                    y[l], Variable(mid_orig[l].data))
            # Style loss.
            L2 = np.float32(nn.beta[l]) * F.mean_squared_error(
                gogh_matrix,
                Variable(style_mats[l].data)) / np.float32(len(y))
            L += L1 + L2

            if i % 100 == 0:
                # Single pre-formatted string: prints the same text under
                # both Python 2 and Python 3.
                print('%s %s %s %s' % (i, l, L1.data, L2.data))

        L.backward()
        xg += x.grad
        optimizer.update()

        # Clamp pixel values to [-120, 136], updating img_gen in place.
        tmp_shape = img_gen.shape
        if args.gpu >= 0:
            img_gen += Clip().forward(img_gen).reshape(tmp_shape) - img_gen
        else:
            # np.clip keeps the float dtype; np.vectorize would infer its
            # output dtype from the first element and is much slower.
            img_gen += np.clip(img_gen, -120, 136).reshape(tmp_shape) - img_gen

        if i % 3000 == 0:
            # NOTE(review): ``W`` is not a parameter of this function; it is
            # presumably a module-level global -- confirm.
            save_image(img_gen, W, nw, nh, i)
def __call__(self, x, rel_y, neighbor_entities, neighbor_dict, assign, entities, relations, RC, EC, t, assignEtoN):
    """Aggregate neighbour representations per entity with attention weights.

    For ``self.layer == 0`` the work is delegated to ``self.easy_case``.
    Otherwise, for each entity the neighbours listed in ``assignEtoN`` are
    transformed by a direction-specific link (``self.forwardH`` when the
    relation is keyed as (entity, neighbour), ``self.forwardT`` otherwise),
    weighted by ``exp`` of an attention score and normalised by the sum of
    the weights.

    Args:
        x: Neighbour representations, split along axis 0 into
            ``len(neighbor_dict)`` pieces.
        rel_y: Relation representations, split along axis 0 into ``len(RC)``
            pieces.
        neighbor_entities, assign: Only forwarded to ``self.easy_case``.
        neighbor_dict: Maps a neighbour key to its index in ``x``.
        entities: Entities to compute outputs for.
        relations: Maps an (a, b) entity pair to a relation index.
        RC: Only its length is used (number of pieces of ``rel_y``).
        EC: Not read in this method.
        t: Entity representations, split along axis 0 into ``len(entities)``.
        assignEtoN: For entity ``i``, the iterable of its neighbour keys.

    Returns:
        The per-entity results concatenated along axis 0.
    """
    if self.layer == 0:
        return self.easy_case(x, neighbor_entities, neighbor_dict, assign, entities, relations)

    #print 'entities', len(entities)
    # A single piece is wrapped by hand instead of calling F.split_axis.
    # NOTE(review): presumably because split_axis does not return a
    # sequence for one section in the targeted chainer version -- confirm.
    if len(neighbor_dict) == 1:
        x = [x]
    else:
        x = F.split_axis(x, len(neighbor_dict), axis=0)
    if len(entities) == 1:
        t = [t]
    else:
        t = F.split_axis(t, len(entities), axis=0)
    rel_y = F.split_axis(rel_y, len(RC), axis=0)

    result = []
    for i, e in enumerate(entities):
        rt = t[i]
        tmpXList = []      # transformed neighbour vectors (later weighted)
        tmpValList = []    # exp(attention) weights, one per neighbour
        tmpListV1 = []     # inputs whose relation is keyed (e, k)
        tmpListV2 = []     # inputs whose relation is keyed (k, e)
        tmpList2 = []      # attention inputs, one per neighbour
        tmpVFlag = []      # +1 / -1 direction flag, in neighbour order
        for k in assignEtoN[i]:
            v = neighbor_dict[k]
            rx = x[v]
            # Even r: relation stored as (e, k); odd r: stored as (k, e).
            if (e, k) in relations:
                r = relations[(e, k)] * 2
            else:
                r = relations[(k, e)] * 2 + 1
            r_rep = rel_y[r // 2]

            #calc the attention value
            tmp2 = F.concat((rx, rt), axis=1)
            tmp2 = F.concat((tmp2, r_rep), axis=1)
            tmpList2.append(tmp2)

            # Stack neighbour and relation rows, pad one zero column, and
            # shape as a (1, 1, 2, -1) input.
            tmp = F.pad(F.concat((rx, r_rep), axis=0), ((0, 0), (0, 1)), 'constant')
            tmp = F.reshape(tmp, (1, 1, 2, -1))
            if r % 2 == 0:
                tmpListV1.append(tmp)
                tmpVFlag.append(1)
            else:
                tmpListV2.append(tmp)
                tmpVFlag.append(-1)

        #print len(tmpListV1), len(tmpListV2), len(tmpList2)
        oV1 = []
        oV2 = []
        oAtt = []
        # Run each direction's inputs through its link as one batch.
        if (len(tmpListV1) > 0):
            inputV1 = F.concat(tmpListV1, axis=0)
            #print inputV1.shape
            outputV1 = getattr(self, self.forwardH[0][0])(inputV1)
            #print outputV1.shape
            oV1 = F.split_axis(outputV1, len(tmpListV1), axis=0)
        if (len(tmpListV2) > 0):
            inputV2 = F.concat(tmpListV2, axis=0)
            outputV2 = getattr(self, self.forwardT[0][0])(inputV2)
            oV2 = F.split_axis(outputV2, len(tmpListV2), axis=0)
        inputAtt = F.concat(tmpList2, axis=0)
        #print inputAtt.shape
        outputAtt = getattr(self, self.AttL[0][0])(inputAtt)
        #print outputAtt.shape
        oAtt = F.split_axis(outputAtt, len(tmpList2), axis=0)

        # Re-interleave both directions' outputs in neighbour order.
        cnt1 = 0
        cnt2 = 0
        for a, flag in enumerate(tmpVFlag):
            tmpAtt = oAtt[a]
            # Repeat the attention score to a fixed width of 200.
            tmpAtt = F.repeat(tmpAtt, 200)
            tmpAtt = F.reshape(tmpAtt, [-1, 200])
            tmpValList.append(F.exp(tmpAtt))
            if flag == 1:
                tmprx = oV1[cnt1]
                cnt1 += 1
                tmpXList.append(tmprx)
            elif flag == -1:
                tmprx = oV2[cnt2]
                cnt2 += 1
                tmpXList.append(tmprx)
        #print len(tmpXList), len(tmpValList)
        for a, val in enumerate(tmpValList):
            #print tmpXList[a].data
            #print val.data
            tmpXList[a] = tmpXList[a] * val
        # Weighted sum divided by the sum of the weights.
        result.append(sum(tmpXList) / (sum(tmpValList)))
    #print result[0].shape
    #(1,1,1,200) should be (1,200)
    result = F.concat(result, axis=0)
    #print len(entities)
    #print result.shape
    return result
def __call__(self, xs1, xs2, wordcnt, wgt_wordcnt, x1s_len, x2s_len):
    """Score a pair of sequences from attention-weighted conv features.

    Both inputs are encoded with ``self.seq_encode``. An attention matrix
    ``1 / (1 + euclidean distance)`` between the positions of the two
    sequences re-weights the first-layer convolution features. Cosine
    similarities of the pooled blocks are concatenated with the extra
    features and passed through ``self.l1``.

    Args:
        xs1, xs2: The two input sequences (batches) to compare.
        wordcnt, wgt_wordcnt, x1s_len, x2s_len: Extra features concatenated
            with the similarity scores along axis 1.

    Returns:
        ``fc`` when ``self.train`` is true, otherwise ``(fc, sim_scores)``.

    NOTE(review): the reshapes hard-code a feature size of 50, and the
    two-layer branch ends with ``exit(1)`` (marked as not developed).
    """
    sum_embed_xs1, xs1_conv1, xs1_conv1_swap = self.seq_encode(xs1)
    sum_embed_xs2, xs2_conv1, xs2_conv1_swap = self.seq_encode(xs2)
    batchsize, dim, seq_length1, depth = xs1_conv1.shape
    batchsize, dim, seq_length2, depth = xs2_conv1.shape

    # Build A (the attention matrix)
    # xs1_conv1_stack = F.reshape(F.tile(F.reshape(xs1_conv1_swap, (batchsize, seq_length1, dim)), (seq_length2,1)), (batchsize, seq_length1, seq_length2, 50))
    # xs2_conv1_stack = F.reshape(F.tile(F.reshape(xs2_conv1_swap, (batchsize, seq_length2, dim)), (1,seq_length1)), (batchsize, seq_length1, seq_length2, 50))
    # A_pooling = F.reshape(F.batch_l2_norm_squared(F.reshape(xs1_conv1_stack - xs2_conv1_stack,
    #     (batchsize*(seq_length1)*(seq_length2), 50))), (batchsize, seq_length1, seq_length2))
    # A_pooling = F.transpose(1 / (1 + F.sqrt(A_pooling+1e-20)),axes=(0,2,1))
    ### A_pooling.shape = (batchsize, seqlen1, seqlen2)
    # A_pooling.shape = (batchsize, seqlen2, seqlen1)
    x1s = F.squeeze(xs1_conv1_swap, axis=1)
    x2s = F.squeeze(xs2_conv1_swap, axis=1)
    # Squared distance via |a|^2 - 2 a.b + |b|^2.
    x1s_x2s = F.batch_matmul(x1s, x2s, transb=True)
    x1s_squared = F.tile(F.expand_dims(F.sum(F.square(x1s), axis=2), axis=2), reps=(1, 1, x2s.shape[1]))
    x2s_squared = F.tile(F.expand_dims(F.sum(F.square(x2s), axis=2), axis=1), reps=(1, x1s.shape[1], 1))
    inside_root = x1s_squared + (-2 * x1s_x2s) + x2s_squared
    # Clamp from below so the square root never sees a non-positive value.
    epsilon = Variable(self.xp.full((batchsize, seq_length1, seq_length2), sys.float_info.epsilon, dtype=np.float32))
    inside_root = F.maximum(inside_root, epsilon)
    denominator = 1.0 + F.sqrt(inside_root)
    A_pooling = F.transpose(1.0 / denominator,axes=(0,2,1))
    # A_pooling = 1.0 / denominator
    col_wise_sum = F.sum(A_pooling,axis=1)  # col-wise sum (batchsize, seqlen1)
    row_wise_sum = F.sum(A_pooling,axis=2)  # row-wise sum (batchsize, seqlen2)

    # # developing
    # # needs verification
    xs1_conv1_aten = F.swapaxes(F.reshape(F.scale(F.reshape(xs1_conv1_swap, (batchsize, seq_length1, dim)) ,F.reshape(col_wise_sum,(batchsize, seq_length1, 1)), axis=0),(batchsize, 1, seq_length1, 50)), 1, 3)
    xs2_conv1_aten = F.swapaxes(F.reshape(F.scale(F.reshape(xs2_conv1_swap, (batchsize, seq_length2, dim)) ,F.reshape(row_wise_sum,(batchsize, seq_length2, 1)), axis=0),(batchsize, 1, seq_length2, 50)), 1, 3)

    # all_average_pooling with attention weight (for 1 layer)
    # xs1_all_avg_b1 = F.average_pooling_2d(xs1_conv1, ksize=(xs1_conv1.shape[2], 1), use_cudnn=False) # not attention
    # xs2_all_avg_b1 = F.average_pooling_2d(xs2_conv1, ksize=(xs2_conv1.shape[2], 1), use_cudnn=False) # not attention
    xs1_all_avg_b1 = F.average_pooling_2d(xs1_conv1_aten, ksize=(xs1_conv1_aten.shape[2], 1), use_cudnn=False)  # with attention
    xs2_all_avg_b1 = F.average_pooling_2d(xs2_conv1_aten, ksize=(xs2_conv1_aten.shape[2], 1), use_cudnn=False)  # with attention

    if self.n_layer == 1:
        x1_vecs = (sum_embed_xs1,xs1_all_avg_b1)
        x2_vecs = (sum_embed_xs2,xs2_all_avg_b1)
    else:
        # average_pooling with window(for 2 layer)
        # ??
        xs1_avg = F.average_pooling_2d(xs1_conv1_swap, ksize=(4, 1), stride=1, use_cudnn=False)
        xs2_avg = F.average_pooling_2d(xs2_conv1_swap, ksize=(4, 1), stride=1, use_cudnn=False)
        # ??
        assert xs1_avg.shape[2] == seq_length1-3  # after average pooling the sequence length must be back to the original
        assert xs2_avg.shape[2] == seq_length2-3  # after average pooling the sequence length must be back to the original
        # wide_convolution(for 2 layer)
        xs1_conv2 = F.tanh(self.conv2(xs1_avg))
        xs2_conv2 = F.tanh(self.conv2(xs2_avg))
        # all_average_pooling with attention (for 2 layer)
        # attention not just yet
        xs1_all_avg_b2 = F.average_pooling_2d(xs1_conv2, ksize=(xs1_conv2.shape[2], 1), use_cudnn=False)
        xs2_all_avg_b2 = F.average_pooling_2d(xs2_conv2, ksize=(xs2_conv2.shape[2], 1), use_cudnn=False)
        x1_vecs = (sum_embed_xs1, xs1_all_avg_b1, xs1_all_avg_b2)
        x2_vecs = (sum_embed_xs2, xs2_all_avg_b1, xs2_all_avg_b2)
        # not developed: the process is terminated here
        exit(1)

    # similarity score for block 2 and 3 (block 1 is embedding layer)
    sim_scores = [F.squeeze(cos_sim(v1, v2), axis=2) for v1, v2 in zip(x1_vecs, x2_vecs)]
    # sim_scores[0/1/(2)].shape = (batchsize, 1)
    feature_vec = F.concat(sim_scores + [wordcnt, wgt_wordcnt, x1s_len, x2s_len], axis=1)
    fc = F.squeeze(self.l1(feature_vec), axis=1)
    if self.train:
        return fc
    else:
        return fc, sim_scores
def get_loss(self, roi_feature, gt_roi_label):
    """Compute a sampled multi-label loss and accuracy for RoI features.

    The RoI features are average-pooled, classified with ``self.fc`` and
    ``self.score``, and compared against the labels on a subset of
    (row, label) positions: every ground-truth positive plus up to
    ``neg_pos_ratio`` times as many negatives. False positives of the
    current prediction are preferred; the remainder is drawn at random from
    the other ground-truth negatives.

    Args:
        roi_feature: Variable of shape
            (batch, frame_box, channel, roi_height, roi_width).
        gt_roi_label: Label array whose last axis indexes the labels; it is
            reshaped to (batch * frame_box, n_label).

    Returns:
        ``(loss, accuracy)``; both are also reported through
        ``chainer.reporter``.
    """
    neg_pos_ratio = 3
    with chainer.cuda.get_device_from_array(roi_feature.data):
        batch, frame_box, channel, roi_height, roi_width = roi_feature.shape
        roi_feature = F.reshape(
            roi_feature,
            shape=(batch * frame_box, channel, roi_height, roi_width))
        # B * F, 2048, 7, 7 -> B * F, 2048
        roi_feature = F.average_pooling_2d(roi_feature, ksize=7, stride=1)
        roi_feature = roi_feature.reshape(batch * frame_box, 2048)
        predict_score = F.relu(self.fc(roi_feature))
        predict_score = self.score(predict_score)
        gt_roi_label = gt_roi_label.reshape(-1, gt_roi_label.shape[-1])
        assert predict_score.shape == gt_roi_label.shape, \
            "{0} != {1} (pred!=gt)".format(predict_score.shape,
                                           gt_roi_label.shape)

        # union of prediction positive and ground truth positive
        union_gt = set()
        cpu_gt_roi_label = chainer.cuda.to_cpu(gt_roi_label)
        gt_pos_index = np.nonzero(cpu_gt_roi_label)
        # Predicted positives must come from the class scores, not from the
        # pooled features: the indices below address predict_score and
        # gt_roi_label, which share one shape.
        cpu_pred_score = (
            chainer.cuda.to_cpu(predict_score.data) > 0).astype(np.int32)
        pred_pos_index = np.nonzero(cpu_pred_score)

        # Use at least one "positive" so negatives are still sampled.
        len_gt_pos = len(gt_pos_index[0]) if len(gt_pos_index[0]) > 0 else 1
        neg_pick_count = neg_pos_ratio * len_gt_pos
        gt_pos_index_set = set(zip(*gt_pos_index))
        pred_pos_index_set = set(zip(*pred_pos_index))
        union_gt.update(gt_pos_index_set)
        union_gt.update(pred_pos_index_set)

        false_positive_index = np.asarray(
            list(pred_pos_index_set - gt_pos_index_set))  # shape = n x 2
        gt_pos_index_lst = list(gt_pos_index_set)
        if neg_pick_count <= len(false_positive_index):
            # Enough false positives: sample all negatives from them.
            choice_fp = np.random.choice(
                np.arange(len(false_positive_index)),
                size=neg_pick_count, replace=False)
            gt_pos_index_lst.extend(
                list(map(tuple, false_positive_index[choice_fp].tolist())))
        else:
            # Take every false positive, then fill up with random negatives.
            gt_pos_index_lst.extend(
                list(map(tuple, false_positive_index.tolist())))
            rest_pick_count = neg_pick_count - len(false_positive_index)
            gt_neg_index = np.where(cpu_gt_roi_label == 0)
            gt_neg_index_set = set(zip(*gt_neg_index))
            # remove already picked
            gt_neg_index_set = gt_neg_index_set - set(gt_pos_index_lst)
            gt_neg_index_array = np.asarray(list(gt_neg_index_set))
            rest_pick_count = min(len(gt_neg_index_array), rest_pick_count)
            choice_rest = np.random.choice(
                np.arange(len(gt_neg_index_array)),
                size=rest_pick_count, replace=False)
            gt_pos_index_lst.extend(
                list(map(tuple, gt_neg_index_array[choice_rest].tolist())))

        # Convert the list of (row, col) pairs to (rows, cols).
        pick_index = list(zip(*gt_pos_index_lst))
        if len(union_gt) == 0:
            accuracy_pick_index = np.where(cpu_gt_roi_label)
        else:
            accuracy_pick_index = list(zip(*union_gt))
        accuracy = F.binary_accuracy(
            predict_score[list(accuracy_pick_index[0]),
                          list(accuracy_pick_index[1])],
            gt_roi_label[list(accuracy_pick_index[0]),
                         list(accuracy_pick_index[1])])
        # sigmoid cross entropy supports multi-label targets
        loss = F.sigmoid_cross_entropy(
            predict_score[list(pick_index[0]), list(pick_index[1])],
            gt_roi_label[list(pick_index[0]), list(pick_index[1])])

        chainer.reporter.report({'loss': loss, "accuracy": accuracy}, self)
    return loss, accuracy
def compute_accuracy(self, y, t):
    """Return ``F.accuracy(y, t)``, folding nested labels first if enabled.

    With ``self.nested_label`` set, ``t`` of shape (b, c, h, w, d) is
    reshaped to (b, h * c, w, d) and ``y`` to (b, 2, h * c, w, d) before the
    accuracy is computed.
    """
    if not self.nested_label:
        return F.accuracy(y, t)

    b, c, h, w, d = t.shape
    merged = h * c
    y = F.reshape(y, (b, 2, merged, w, d))
    t = F.reshape(t, (b, merged, w, d))
    return F.accuracy(y, t)
def reshape():
    """Build a reshape case: a (1, 8, 8, 8) input reshaped to (1, 1, 64, 8).

    Returns a pair of dicts: the inputs keyed by ``'input'`` and the
    outputs keyed by ``'out'``.
    """
    source = rand((1, 8, 8, 8))
    reshaped = F.reshape(source, (1, 1, 64, 8))
    inputs = {'input': source}
    outputs = {'out': reshaped}
    return inputs, outputs
def __call__(self, tokenIdsList_merged, tokenIdsList_merged_b, argsort, argsort_reverse, pList):
    """Embed n-gram token sequences with an LSTM and sum them per word.

    Args:
        tokenIdsList_merged: 2-D array of token ids (rows are sequences,
            columns are time steps) for the forward direction.
        tokenIdsList_merged_b: Same for the backward direction, or ``None``.
            Must not be ``None`` when ``'bilstm'`` is in ``self.subword``.
        argsort: Row order applied before running the LSTM.
        argsort_reverse: Indices restoring the original row order.
        pList: ``pList[i]`` is the number of leading rows still active at
            time step ``i``; a value of 0 stops the loop.

    Returns:
        The output vectors, with every ``self.n_ngram`` consecutive rows
        summed into one.

    With ``'sum'`` in ``self.subword`` the (dropped-out) hidden states of
    all time steps are summed; otherwise only the final hidden state is
    used. With ``'bilstm'`` the backward direction is processed with
    ``self.rnn_b`` / ``self.mid_b`` and added to the forward output.
    """
    # Fancy indexing yields a copy, so the in-place shift below does not
    # modify the caller's arrays.
    tokenIdsList_ordered = tokenIdsList_merged[argsort]
    tokenIdsList_ordered += 2
    if tokenIdsList_merged_b is not None:
        tokenIdsList_ordered_b = tokenIdsList_merged_b[argsort]
        tokenIdsList_ordered_b += 2

    self.reset_state()
    use_sum = 'sum' in self.subword
    use_bilstm = 'bilstm' in self.subword

    def run_direction(rnn, lstm, token_ids):
        # Feed token_ids step by step through rnn; when summing, return the
        # sum over time of lstm.h (after dropout), else None.
        summed = None
        for i in range(token_ids.shape[1]):
            n_active = pList[i]
            if n_active == 0:
                break
            if i == 0:
                rnn(token_ids[:, i])
                if use_sum:
                    summed = F.dropout(lstm.h, self.dropout)
                continue
            rnn(token_ids[0:n_active, i])
            if not use_sum:
                continue
            step_h = F.dropout(lstm.h, self.dropout)
            if n_active < step_h.shape[0]:
                # Only the active rows receive this step's hidden state.
                head = step_h[0:n_active, :] + summed[0:n_active, :]
                summed = F.concat((head, summed[n_active:, :]), axis=0)
            else:
                summed = step_h + summed
        return summed

    y = run_direction(self.rnn, self.mid, tokenIdsList_ordered)
    if use_bilstm:
        # The backward pass must read the backward LSTM's state (mid_b);
        # reading self.mid here would re-sum the forward LSTM's last state.
        y_b = run_direction(self.rnn_b, self.mid_b, tokenIdsList_ordered_b)

    if not use_sum:
        # pure lstm, without sum/avg over all timestep
        y = F.dropout(self.mid.h, self.dropout)
        if use_bilstm:
            y_b = F.dropout(self.mid_b.h, self.dropout)

    y = self.out(y)
    if use_bilstm:
        y_b = self.out_b(y_b)
        y = y + y_b

    # Restore the caller's row order, then sum each group of n_ngram rows.
    e = y[argsort_reverse]
    e = F.reshape(e, (int(e.shape[0] / self.n_ngram),
                      self.n_ngram, e.shape[1]))
    e = F.sum(e, axis=1)
    return e
def encode_decode_train(self, in_word_list, out_word_list, train=True):
    """Encode a source sequence, decode the target, and accumulate the loss.

    Args:
        in_word_list: Source token ids passed to ``self.encode_list``.
        out_word_list: Target token ids; ``GO_ID`` is prepended and
            ``EOS_ID`` appended before decoding.
        train: Forwarded to the encoder, decoder and word selection. When
            false, the decoder input variables are created inside
            ``chainer.no_backprop_mode()``.

    Returns:
        ``self.loss``: the sum of the softmax cross entropy of every decoded
        step against the next target token. It is also reported via
        ``report``.
    """
    xp = cuda.cupy if self.gpuid >= 0 else np
    self.reset_state()
    # Add GO_ID, EOS_ID to decoder input
    decoder_word_list = [GO_ID] + out_word_list + [EOS_ID]
    # encode list of words/tokens
    enc_states = self.encode_list(in_word_list, train=train)
    # initialize decoder LSTM to final encoder state
    self.set_decoder_state()
    # decode and compute loss
    if not train:
        with chainer.no_backprop_mode():
            # convert list of tokens into chainer variable list
            var_dec = (Variable(
                xp.asarray(decoder_word_list, dtype=np.int32).reshape(
                    (-1, 1))))
            # Initialise first decoded word to GOID
            pred_word = Variable(xp.asarray([GO_ID], dtype=np.int32))
    else:
        # convert list of tokens into chainer variable list
        var_dec = (Variable(
            xp.asarray(decoder_word_list, dtype=np.int32).reshape(
                (-1, 1))))
        # Initialise first decoded word to GOID
        pred_word = Variable(xp.asarray([GO_ID], dtype=np.int32))

    # compute loss
    self.loss = 0
    # decode tokens
    # The first element (GO_ID) is skipped: each step is scored against
    # the token that follows.
    for next_word_var in var_dec[1:]:
        self.decode(pred_word, train=train)
        if self.attn == NO_ATTN:
            predicted_out = self.out(self[self.lstm_dec[-1]].h)
            # with chainer.using_config('train', train):
            #     predicted_out = self.out(F.dropout(self[self.lstm_dec[-1]].h, dropout_ratio))
        else:
            # __QUESTION Add attention
            # pass
            # Scores from matmul of encoder states with the decoder state,
            # softmax-normalised into a context vector.
            score_t = F.reshape(
                F.matmul(enc_states, self[self.lstm_dec[-1]].h.T), (1, -1))
            context_t = F.matmul(F.softmax(score_t), enc_states)
            att_out = self['Glob_att'](F.concat(
                (self[self.lstm_dec[-1]].h, context_t)))
            h_t_tilde = F.tanh(att_out)
            predicted_out = self.out(h_t_tilde)
        # compute loss
        prob = F.softmax(predicted_out)

        pred_word = self.select_word(prob, train=train, sample=False)
        '''
        ___QUESTION-1-DESCRIBE-E-START___
        Explain what loss is computed with an example
        What does this value mean?
        # =============================
        [Piazza] Explain what softmax cross entropy is (in relation to the task of MT). You may find it useful to include a toy problem but it is not needed.
        ----------
        1. cross entropy loss. -\sum_{c=1}^My_{o,c}\log(p_{o,c})
        2. measure the divergence between the predicted words and next label words with a probability between 0 and 1.
        # =============================
        '''
        self.loss += F.softmax_cross_entropy(predicted_out, next_word_var)
        '''___QUESTION-1-DESCRIBE-E-END___'''

    report({"loss": self.loss}, self)

    return self.loss
def __call__(self, img, threshold=0.2):
    """Run segmentation and box detection on one image.

    Args:
        img: Image array of shape (height, width, channel); it is converted
            from BGR to RGB and scaled to [0, 1] before the forward pass.
        threshold: Minimum ``conf * prob`` for a box to be kept. Also stored
            in ``self.detection_thresh``.

    Returns:
        ``(pred_fcn, nms_results)``: the per-pixel argmax of the first
        model output, and the detections left after ``nms``. Each detection
        is a dict with the keys ``label``, ``probs``, ``conf``,
        ``objectness`` and ``box``.
    """
    self.detection_thresh = threshold
    orig_input_height, orig_input_width, _ = img.shape
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    img = np.asarray(img, dtype=np.float32) / 255.0
    img = img.transpose(2, 0, 1)

    # forward
    x_data = img[np.newaxis, :, :, :]
    x = Variable(x_data)
    if self.gpu >= 0:
        x.to_gpu()
    pred_fcn, pred_yolo = self.model.predict(x, both=True)
    pred_fcn = pred_fcn[0].data.argmax(axis=0)
    if self.gpu >= 0:
        pred_fcn = cuda.to_cpu(pred_fcn)
    x, y, w, h, conf, prob = pred_yolo

    # parse results
    _, _, _, grid_h, grid_w = x.shape
    box_shape = (self.n_boxes, grid_h, grid_w)
    x = F.reshape(x, box_shape).data
    y = F.reshape(y, box_shape).data
    w = F.reshape(w, box_shape).data
    h = F.reshape(h, box_shape).data
    conf = F.reshape(conf, box_shape).data
    # prob: (n_classes, n_boxes, grid_h, grid_w)
    prob = F.transpose(
        F.reshape(prob,
                  (self.n_boxes, self.n_classes_yolo, grid_h, grid_w)),
        (1, 0, 2, 3)).data
    # A box is detected when its best class score exceeds the threshold.
    detected_indices = (conf * prob).max(axis=0) > self.detection_thresh

    if self.gpu >= 0:
        x = cuda.to_cpu(x)
        y = cuda.to_cpu(y)
        w = cuda.to_cpu(w)
        h = cuda.to_cpu(h)
        conf = cuda.to_cpu(conf)
        prob = cuda.to_cpu(prob)
        detected_indices = cuda.to_cpu(detected_indices)

    # Select the detected entries once instead of once per detection.
    det_prob = prob.transpose(1, 2, 3, 0)[detected_indices]
    det_conf = conf[detected_indices]
    det_x = x[detected_indices]
    det_y = y[detected_indices]
    det_w = w[detected_indices]
    det_h = h[detected_indices]

    results = []
    for i in range(len(det_conf)):
        results.append({
            "label": self.labels[det_prob[i].argmax()],
            "probs": det_prob[i],
            "conf": det_conf[i],
            "objectness": det_conf[i] * det_prob[i].max(),
            # Box values are scaled by the original image size.
            "box": Box(det_x[i] * orig_input_width,
                       det_y[i] * orig_input_height,
                       det_w[i] * orig_input_width,
                       det_h[i] * orig_input_height).crop_region(
                           orig_input_height, orig_input_width)
        })

    # nms
    nms_results = nms(results, self.iou_thresh)
    return pred_fcn, nms_results
def _global_average_pooling_2d(x):
    """Average every channel of ``x`` over its full spatial extent.

    ``x`` has shape (n, channel, rows, cols); the result is (n, channel).
    """
    n, channel, rows, cols = x.data.shape
    pooled = F.average_pooling_2d(x, (rows, cols), stride=1)
    return F.reshape(pooled, (n, channel))
def __call__(self, x):
    """Flatten ``x`` per sample and apply ``fc1`` -> ``activ`` -> ``fc2``."""
    flat = F.reshape(x, shape=(x.shape[0], -1))
    hidden = self.activ(self.fc1(flat))
    return self.fc2(hidden)
def __call__(self, enc_hs, dec_z, att_prev, scaling=2.0):
    '''AttLoc forward

    :param enc_hs: sequence of encoder outputs, one per utterance; padded
        with ``F.pad_sequence`` to utt x frame x hdim
    :param dec_z: decoder hidden state, reshaped to (batch, dunits); zeros
        are used when it is None
    :param att_prev: previous attention weights (utt x frame); a uniform
        distribution over each utterance's frames is used when it is None
    :param scaling: factor applied to the energies before the softmax
    :return: tuple (c, w) of the context vector (utt x hdim) and the
        attention weights (utt x frame)
    '''
    batch = len(enc_hs)
    # pre-compute all h outside the decoder loop
    # (cached on self until pre_compute_enc_h is cleared again)
    if self.pre_compute_enc_h is None:
        self.enc_h = F.pad_sequence(enc_hs)  # utt x frame x hdim
        self.h_length = self.enc_h.shape[1]
        # utt x frame x att_dim
        self.pre_compute_enc_h = linear_tensor(self.mlp_enc, self.enc_h)

    if dec_z is None:
        dec_z = chainer.Variable(
            self.xp.zeros((batch, self.dunits), dtype=np.float32))
    else:
        dec_z = F.reshape(dec_z, (batch, self.dunits))

    # initialize attention weight with uniform dist.
    if att_prev is None:
        att_prev = [
            self.xp.full(hh.shape[0], 1.0 / hh.shape[0], dtype=np.float32)
            for hh in enc_hs
        ]
        att_prev = [chainer.Variable(att) for att in att_prev]
        att_prev = F.pad_sequence(att_prev)

    # TODO(watanabe) use <chainer variable>.reshape(), instead of F.reshape()
    # att_prev: utt x frame -> utt x 1 x 1 x frame -> utt x att_conv_chans x 1 x frame
    att_conv = self.loc_conv(
        F.reshape(att_prev, (batch, 1, 1, self.h_length)))
    # att_conv: utt x att_conv_chans x 1 x frame -> utt x frame x att_conv_chans
    att_conv = F.swapaxes(F.squeeze(att_conv, axis=2), 1, 2)
    # att_conv: utt x frame x att_conv_chans -> utt x frame x att_dim
    att_conv = linear_tensor(self.mlp_att, att_conv)

    # dec_z_tiled: utt x frame x att_dim
    dec_z_tiled = F.broadcast_to(F.expand_dims(self.mlp_dec(dec_z), 1),
                                 self.pre_compute_enc_h.shape)

    # dot with gvec
    # utt x frame x att_dim -> utt x frame
    # TODO(watanabe) use batch_matmul
    e = F.squeeze(linear_tensor(
        self.gvec,
        F.tanh(att_conv + self.pre_compute_enc_h + dec_z_tiled)), axis=2)
    # Applying a minus-large-number filter to make a probability value zero for a padded area
    # simply degrades the performance, and I gave up this implementation
    # Apply a scaling to make an attention sharp
    w = F.softmax(scaling * e)

    # weighted sum over frames
    # utt x hdim
    c = F.sum(self.enc_h * F.broadcast_to(F.expand_dims(w, 2),
                                          self.enc_h.shape), axis=1)

    return c, w
def __call__(self, hs, ys):
    '''Decoder forward

    :param Variable hs: encoder outputs, one per utterance, passed to the
        attention module
    :param Variable ys: target label sequences, one per utterance
    :return: tuple (loss, acc); the loss is also stored in ``self.loss``
    '''
    self.loss = None
    # prepare input and output word sequences with sos/eos IDs
    eos = self.xp.array([self.eos], 'i')
    sos = self.xp.array([self.sos], 'i')
    ys_in = [F.concat([sos, y], axis=0) for y in ys]
    ys_out = [F.concat([y, eos], axis=0) for y in ys]

    # padding for ys with -1
    # pys: utt x olen
    # (decoder inputs are padded with eos, targets with -1)
    pad_ys_in = F.pad_sequence(ys_in, padding=self.eos)
    pad_ys_out = F.pad_sequence(ys_out, padding=-1)

    # get dim, length info
    batch = pad_ys_out.shape[0]
    olength = pad_ys_out.shape[1]
    logging.info(self.__class__.__name__ + ' input lengths: ' +
                 str(self.xp.array([h.shape[0] for h in hs])))
    logging.info(self.__class__.__name__ + ' output lengths: ' +
                 str(self.xp.array([y.shape[0] for y in ys_out])))

    # initialization
    c_list = [None]  # list of cell state of each layer
    z_list = [None]  # list of hidden state of each layer
    for l in six.moves.range(1, self.dlayers):
        c_list.append(None)
        z_list.append(None)
    att_w = None
    z_all = []
    self.att.reset()  # reset pre-computation of h

    # pre-computation of embedding
    eys = self.embed(pad_ys_in)  # utt x olen x zdim
    eys = F.separate(eys, axis=1)

    # loop for an output sequence
    for i in six.moves.range(olength):
        # attention is conditioned on the first layer's hidden state
        att_c, att_w = self.att(hs, z_list[0], att_w)
        ey = F.hstack((eys[i], att_c))  # utt x (zdim + hdim)
        c_list[0], z_list[0] = self.lstm0(c_list[0], z_list[0], ey)
        for l in six.moves.range(1, self.dlayers):
            c_list[l], z_list[l] = self['lstm%d' % l](c_list[l], z_list[l],
                                                      z_list[l - 1])
        z_all.append(z_list[-1])

    z_all = F.reshape(F.stack(z_all, axis=1),
                      (batch * olength, self.dunits))
    # compute loss
    y_all = self.output(z_all)
    self.loss = F.softmax_cross_entropy(y_all, F.flatten(pad_ys_out))
    # -1: eos, which is removed in the loss computation
    self.loss *= (np.mean([len(x) for x in ys_in]) - 1)
    acc = F.accuracy(y_all, F.flatten(pad_ys_out), ignore_label=-1)
    logging.info('att loss:' + str(self.loss.data))

    # show predicted character sequence for debug
    if self.verbose > 0 and self.char_list is not None:
        y_hat = F.reshape(y_all, (batch, olength, -1))
        y_true = pad_ys_out
        for (i, y_hat_), y_true_ in zip(enumerate(y_hat.data), y_true.data):
            if i == MAX_DECODER_OUTPUT:
                break
            # positions padded with -1 are excluded
            idx_hat = self.xp.argmax(y_hat_[y_true_ != -1], axis=1)
            idx_true = y_true_[y_true_ != -1]
            seq_hat = [self.char_list[int(idx)] for idx in idx_hat]
            seq_true = [self.char_list[int(idx)] for idx in idx_true]
            seq_hat = "".join(seq_hat).replace('<space>', ' ')
            seq_true = "".join(seq_true).replace('<space>', ' ')
            logging.info("groundtruth[%d]: " % i + seq_true)
            logging.info("prediction [%d]: " % i + seq_hat)

    # optional regularisation term against the label distribution,
    # mixed into the loss with weight lsm_weight
    if self.labeldist is not None:
        if self.vlabeldist is None:
            self.vlabeldist = chainer.Variable(
                self.xp.asarray(self.labeldist))
        loss_reg = -F.sum(
            F.scale(F.log_softmax(y_all), self.vlabeldist,
                    axis=1)) / len(ys_in)
        self.loss = (
            1. - self.lsm_weight) * self.loss + self.lsm_weight * loss_reg

    return self.loss, acc
def forward(self, x, **kwargs):
    """forward(self, x, finetune=False)

    Invokes the forward propagation of InstanceNormalization.

    The input of shape (b, ch, ...) is reshaped to (1, b * ch, ...) so that
    ``functions.batch_normalization`` sees every (sample, channel) pair as
    its own channel. Scale, shift and running statistics are tiled ``b``
    times to match, and the result is reshaped back to ``x.shape``.

    Args:
        x (~chainer.Variable): Input variable.
        finetune (bool): If it is in the training mode and ``finetune`` is
            ``True``, InstanceNormalization runs in fine-tuning mode; it
            accumulates the input array to compute population statistics
            for normalization, and normalizes the input using batch
            statistics.

    Returns:
        The normalized variable, with the same shape as ``x``.
    """
    finetune, = argument.parse_kwargs(
        kwargs, ('finetune', False),
        test='test argument is not supported anymore. '
             'Use chainer.using_config')

    # Lazily create the parameters from the first input's shape.
    if self.avg_mean is None:
        param_shape = tuple(
            [d for i, d in enumerate(x.shape) if i not in self.axis])
        self._initialize_params(param_shape)

    # Fall back to an identity scale / zero shift when not learned.
    gamma = self.gamma
    if gamma is None:
        with chainer.using_device(self.device):
            gamma = self.xp.ones(self.avg_mean.shape,
                                 dtype=self._highprec_dtype)

    beta = self.beta
    if beta is None:
        with chainer.using_device(self.device):
            beta = self.xp.zeros(self.avg_mean.shape,
                                 dtype=self._highprec_dtype)

    # reshape
    b, ch = x.shape[:2]
    reshaped = functions.reshape(x, (
        1,
        b * ch,
    ) + x.shape[2:])
    gamma = self.xp.tile(gamma, (b, ))
    beta = self.xp.tile(beta, (b, ))
    avg_mean = self.xp.tile(self.avg_mean, (b, ))
    avg_var = self.xp.tile(self.avg_var, (b, ))

    if finetune:
        self.N += 1
        decay = 1. - 1. / self.N
    else:
        decay = self.decay

    if chainer.config.in_recomputing:
        # Do not update statistics when extra forward computation is
        # called.
        if finetune:
            self.N -= 1  # Revert the count
        avg_mean = None
        avg_var = None

    ret = functions.batch_normalization(reshaped, gamma, beta, eps=self.eps,
                                        running_mean=avg_mean,
                                        running_var=avg_var, decay=decay,
                                        axis=self.axis)

    # reshape back
    # NOTE(review): when avg_mean/avg_var were set to None above (the
    # recomputing case), the stored statistics are reset to None here --
    # confirm this is intended.
    self.avg_mean, self.avg_var = None, None
    if avg_mean is not None:
        # average the tiled running statistics over the batch copies
        self.avg_mean = avg_mean.reshape(b, ch).mean(axis=0)
    if avg_var is not None:
        self.avg_var = avg_var.reshape(b, ch).mean(axis=0)
    ret = functions.reshape(ret, x.shape)
    return ret
def _extract_gates(x):
    """Split axis 1 of ``x`` into four interleaved gate tensors.

    Axis 1 is viewed as (units, 4), so gate ``k`` receives the elements at
    positions ``k, k + 4, k + 8, ...`` of the original axis.
    """
    n_units = x.shape[1] // 4
    grouped_shape = (len(x), n_units, 4) + x.shape[2:]
    gates = F.separate(F.reshape(x, grouped_shape), axis=2)
    return gates[0], gates[1], gates[2], gates[3]