def render(gfunc, stepsize=0.1, momentum=0.9, maxstep=24000):
    """Yield image batches produced while walking noise vectors through latent space.

    A "current" noise batch (``bnoi``) is moved with momentum toward a
    "target" noise batch (``anoi``).  Whenever any sample gets closer than
    0.5 to its target, a fresh target batch is drawn.

    Parameters
    ----------
    gfunc : callable
        Generator, called as ``gfunc(noise=..., cond=...)``.
    stepsize : float
        Scale applied to the unit direction toward the target at each step.
    momentum : float
        Fraction of the previous step carried over into the current one.
    maxstep : int
        Number of frames to generate.

    Yields
    ------
    The result of ``align_images`` on the uint8 generator output
    (presumably a 5x6 tiled grid of 32x32x3 images -- TODO confirm against
    ``align_images``).
    """
    K = 10     # number of condition classes (one-hot depth)
    num = 30   # samples per batch; must match the 5 * 6 layout passed to align_images
    bbox = config.data.bbox
    # Condition labels 0..K-1, each repeated so that `num` samples are covered.
    cond = nd.one_hot(nd.repeat(nd.arange(K, ctx=ctx), (num-1)//K+1)[:num], K).reshape((num, K, 1, 1))
    anoi = nd.random.normal(shape=(num, 100, 1, 1), ctx=ctx)
    bnoi = nd.random.normal(shape=(num, 100, 1, 1), ctx=ctx)
    slast = 0.
    for _ in range(maxstep):
        snoi = anoi - bnoi
        sdist = snoi.norm(axis=1, keepdims=True)
        if sdist.min().asscalar() < .5:
            # A sample reached its target: draw new targets.  The shape reuses
            # `num` instead of repeating the literal 30 so the two stay in sync.
            anoi = nd.random.normal(shape=(num, 100, 1, 1), ctx=ctx)
        # Unit direction toward the (previous) target, then a momentum update.
        snoi /= sdist
        slast = stepsize*snoi + momentum*slast
        bnoi += slast
        gen = gfunc(noise=bnoi, cond=cond)
        # Map generator output from the [bbox[0], bbox[1]] range to 0..255 bytes.
        indat = ((gen - bbox[0]) * 255/(bbox[1]-bbox[0])).asnumpy().clip(0, 255).astype(np.uint8)
        indat = align_images(indat, 5, 6, 32, 32, 3)
        yield indat
numerator += nd.sum(predictions == label) # Total number of checks denominator += data.shape[0] # Returning the accuracy of the net, or the probability of getting label right using our net. return (numerator / denominator).asscalar() epochs = 10 learning_rate = .001 for e in range(epochs): cumulative_loss = 0 for i, (data, label) in enumerate(train_data): data = data.as_in_context(model_ctx).reshape((-1, 784)) label = label.as_in_context(model_ctx) label_one_hot = nd.one_hot(label, 10) with autograd.record(): output = net(data) loss = cross_entropy(output, label_one_hot) loss.backward() SGD(params, learning_rate) cumulative_loss += nd.sum(loss).asscalar() test_accuracy = evaluate_accuracy(test_data, net) train_accuracy = evaluate_accuracy(train_data, net) print("Epoch %s. Loss: %s, Train_acc %s, Test_acc %s" % (e, cumulative_loss / num_examples, train_accuracy, test_accuracy)) # The predictor. Returns prediction when we use our net. def model_predict(net, data):
def to_onehot(X, size):
    """Return a list with one one-hot array of depth ``size`` per row of ``X.T``."""
    encoded = []
    for column in X.T:
        encoded.append(nd.one_hot(column, size))
    return encoded
def forward(self,inputs, state): X = nd.one_hot(inputs.T, self.vocab_size) Y, state = self.rnn(X, state) out
# Initialize weights and biases for each class
W = nd.random_normal(shape=(d_inputs, k_outputs), ctx=cntx)
W0 = nd.random_normal(shape=k_outputs, ctx=cntx)
prams = [W, W0]

# Track the gradients of the parameters
for parameter in prams:
    parameter.attach_grad()

# Execute training loop using SGD
for E in range(epochs):
    total_loss = 0
    for i, (xtrain, ytrain) in enumerate(train_data):
        # Flatten each sample to the width W expects (was the literal 784).
        xtrain = xtrain.as_in_context(cntx).reshape((-1, d_inputs))
        ytrain = ytrain.as_in_context(cntx)
        # One-hot depth must equal the number of output columns of W
        # (was the literal 5, which only works when k_outputs == 5).
        ylabel_flag = nd.one_hot(ytrain, k_outputs)
        with autograd.record():
            y_out = aux.nnet(xtrain, W, W0)
            loss = aux.cross_ent(y_out, ylabel_flag)
        loss.backward()
        # NOTE(review): the forward pass reads W / W0 while the SGD result is
        # rebound to `prams`; this assumes aux.SGD updates in place -- confirm.
        prams = aux.SGD(prams, learn_rate)
        total_loss += nd.sum(loss).asscalar()

    # Evaluate model on training data
    train_accuracy = aux.compute_accuracy(train_data, aux.nnet, prams, cntx)
    # Evaluate model on testing data
    test_accuracy = aux.compute_accuracy(test_data, aux.nnet, prams, cntx)
    print("Epoch %s. Loss: %s, Train Accuracy: %s, Test Accuracy: %s" % (E, total_loss / m_cases, train_accuracy, test_accuracy))
random.shuffle(example_indices) #每个输入是时间步长度的歌词段,是将前后有序的所有歌词分段,赋予序号后,随机打乱顺序 def _data(pos): return corpus_indices[pos:pos + num_steps] for i in range(epoch_size): i = i * batch_size batch_indices = example_indices[i:i + batch_size] #每批所含歌词段序号,是随机乱序 X = nd.array([_data(j * num_steps) for j in batch_indices], ctx=ctx) #采每个歌词段的歌词字典索引,用于转为one-hot向量 Y = nd.array([_data(j * num_steps + 1) for j in batch_indices], ctx=ctx) #对应歌词下一个字序列 yield X, Y ''' nd.one_hot(nd.array([0, 2]), vocab_size) [[1. 0. 0. ... 0. 0. 0.] [0. 0. 1. ... 0. 0. 0.]] <NDArray 2x1027 @cpu(0)> ''' def to_onehot(X, size): """Represent inputs with one-hot encoding.""" return [nd.one_hot(x, size) for x in X.T] ''' 裁剪梯度
def forward(self,inputs, state):
    """One-hot encode ``inputs``, run the recurrent layer, project with the dense layer.

    Returns the dense-layer output together with the updated recurrent state.
    """
    # The transpose puts the second axis of `inputs` first before encoding.
    one_hot_steps = nd.one_hot(inputs.T, self.vocab_size)
    hidden_seq, state = self.rnn(one_hot_steps, state)
    # Merge every leading axis so the dense layer sees a 2-D
    # (rows, hidden_seq.shape[-1]) input; the original note describes the rows
    # as num_steps * batch_size.
    flattened = hidden_seq.reshape((-1, hidden_seq.shape[-1]))
    return self.dense(flattened), state
def forward_single_out(self, data, cond=None, logged=False):
    """Run a forward pass and keep only one output entry per row.

    ``cond`` selects the entry along axis 1 to keep; when omitted, the
    arg-max entry of each row is used.  All other entries are zeroed.
    """
    run = self.forward_logged if logged else self
    out = run(data)
    if cond is None:
        cond = nd.argmax(out, axis=1)
    # One-hot mask with the same width as the output's axis 1.
    selector = nd.one_hot(cond, out.shape[1])
    return selector * out
def forward(self, cls_targets, ctr_targets, box_targets, mask_targets, matches,
            cls_preds, ctr_preds, box_preds, mask_preds, maskcoe_preds):
    """Compute loss in entire batch across devices.

    Every argument is normalised with ``_as_list`` and the entries are
    processed in lockstep (one entry per device, per the original comment).
    For each entry four terms are computed:

    * classification: focal-style loss on sigmoid scores, normalised by the
      number of positive targets (``cls_target > 0``, at least 1);
    * centerness: sigmoid binary cross-entropy restricted to positives;
    * box: ``-log(IoU)`` restricted to positives;
    * mask: ``self.SBCELoss`` on box-cropped masks, weighted by
      ``gt_weidth * gt_height / area``.

    Returns
    -------
    tuple of lists
        ``(sum_losses, cls_losses, ctr_losses, box_losses, mask_losses)``,
        one element per input entry.  When ``self._return_iou`` is set, each
        element of ``box_losses`` is a ``(loss, ious)`` tuple.
    """
    # Box coordinates are divided by this before building the crop masks
    # (presumably the image-to-mask stride -- TODO confirm).
    scale = 4
    # require results across different devices at this time
    cls_targets, ctr_targets, box_targets, mask_targets, matches, cls_preds, ctr_preds, box_preds, mask_preds, maskcoe_preds = \
        [_as_list(x) for x in (cls_targets, ctr_targets, box_targets, mask_targets, matches,
                               cls_preds, ctr_preds, box_preds, mask_preds, maskcoe_preds)]

    cls_losses = []
    ctr_losses = []
    box_losses = []
    mask_losses = []
    sum_losses = []
    for clst, ctrt, boxt, maskt, matche, clsp, ctrp, boxp, maskp, maskcoep in zip(
            *[
                cls_targets, ctr_targets, box_targets, mask_targets, matches,
                cls_preds, ctr_preds, box_preds, mask_preds, maskcoe_preds
            ]):

        pos_gt_mask = clst > 0
        # cls loss
        if not self._from_logits:
            clsp = nd.sigmoid(clsp)
        one_hot = nd.one_hot(clst, self._num_class)
        # Drop column 0 of the one-hot encoding (class id 0 gets no column).
        one_hot = nd.slice_axis(one_hot, begin=1, end=None, axis=-1)
        pt = nd.where(one_hot, clsp, 1 - clsp)
        t = nd.ones_like(one_hot)
        alpha = nd.where(one_hot, self._alpha * t, (1 - self._alpha) * t)
        cls_loss = -alpha * ((1 - pt)**self._gamma) * nd.log(nd.minimum(pt + self._eps, 1))
        cls_loss = nd.sum(cls_loss) / nd.maximum(nd.sum(pos_gt_mask), 1)
        cls_losses.append(cls_loss)

        # ctr loss
        ctrp = nd.squeeze(ctrp, axis=-1)
        pos_pred_mask = ctrp >= 0
        # Overflow-safe form of sigmoid cross-entropy on raw scores.
        ctr_loss = (ctrp * pos_pred_mask - ctrp * ctrt +
                    nd.log(1 + nd.exp(-nd.abs(ctrp)))) * pos_gt_mask
        ctr_loss = nd.sum(ctr_loss) / nd.maximum(nd.sum(pos_gt_mask), 1)
        ctr_losses.append(ctr_loss)

        # box loss // iou loss
        px1, py1, px2, py2 = nd.split(boxp, num_outputs=4, axis=-1, squeeze_axis=True)
        gx1, gy1, gx2, gy2 = nd.split(boxt, num_outputs=4, axis=-1, squeeze_axis=True)
        apd = nd.abs(px2 - px1 + 1) * nd.abs(py2 - py1 + 1)
        agt = nd.abs(gx2 - gx1 + 1) * nd.abs(gy2 - gy1 + 1)
        iw = nd.maximum(nd.minimum(px2, gx2) - nd.maximum(px1, gx1) + 1., 0.)
        ih = nd.maximum(nd.minimum(py2, gy2) - nd.maximum(py1, gy1) + 1., 0.)
        ain = iw * ih + 1.
        union = apd + agt - ain + 1
        ious = nd.maximum(ain / union, 0.)
        fg_mask = nd.where(clst > 0, nd.ones_like(clst), nd.zeros_like(clst))
        box_loss = -nd.log(nd.minimum(ious + self._eps, 1.)) * fg_mask
        box_loss = nd.sum(box_loss) / nd.maximum(nd.sum(fg_mask), 1)
        # `box_loss` stays a plain array so the weighted sum below is valid;
        # only the reported list entry carries the IoUs.
        box_losses.append((box_loss, ious) if self._return_iou else box_loss)

        # mask loss
        rank = (-matche).argsort(axis=-1)
        # NOTE(review): the split count 2 is hard-coded -- presumably the
        # number of samples per device; confirm against the data loader.
        rank = nd.split(rank, 2, axis=0, squeeze_axis=True)
        matche = nd.split(matche, 2, axis=0, squeeze_axis=True)
        maskp = nd.split(maskp, 2, axis=0, squeeze_axis=True)
        maskt = nd.split(maskt, 2, axis=0, squeeze_axis=True)
        boxt = nd.split(boxt, 2, axis=0, squeeze_axis=True)
        maskcoep = nd.split(maskcoep, 2, axis=0, squeeze_axis=True)
        agt = nd.split(agt, 2, axis=0, squeeze_axis=True)
        mask_loss = []
        for ranki, matchei, maskpi, maskti, boxti, maskcoepi, agti in zip(
                rank, matche, maskp, maskt, boxt, maskcoep, agt):
            # First 200 positions in descending order of match id.
            idx = nd.slice(ranki, 0, 200)
            pos_mask = nd.take(matchei >= 0, idx)
            pos_box = nd.take(boxti, idx)
            area = nd.take(agti, idx)
            weight = (self.gt_weidth * self.gt_height / (area + self._eps)) * pos_mask
            mask_idx = nd.take(matchei, idx)
            maskti = nd.take(maskti, mask_idx)
            # Combine the shared mask maps with the per-position coefficients.
            maskpi = nd.dot(nd.take(maskcoepi, idx), maskpi)
            maskpi = nd.sigmoid(maskpi)
            with autograd.pause():
                # Binary box-shaped crop mask; 186 is hard-coded (presumably
                # the mask height/width -- TODO confirm).
                _h = nd.arange(186, ctx=maskpi.context)
                _w = nd.arange(186, ctx=maskpi.context)
                _h = nd.tile(_h, reps=(pos_box.shape[0], 1))
                _w = nd.tile(_w, reps=(pos_box.shape[0], 1))
                x1, y1, x2, y2 = nd.split(nd.round(pos_box / scale), num_outputs=4, axis=-1)
                _w = (_w >= x1) * (_w <= x2)
                _h = (_h >= y1) * (_h <= y2)
                _mask = nd.batch_dot(_h.expand_dims(axis=-1), _w.expand_dims(axis=-1),
                                     transpose_b=True)
            maskpi = maskpi * _mask
            mask_loss.append(
                nd.sum(self.SBCELoss(maskpi, maskti) * weight) / nd.sum(pos_mask + self._eps))

        mask_loss = nd.mean(nd.concat(*mask_loss, dim=0))
        mask_losses.append(mask_loss)
        sum_losses.append(self._cls_lambd * cls_losses[-1] +
                          self._ctr_lambd * ctr_losses[-1] +
                          self._box_lambd * box_loss +
                          self._mask_lambd * mask_losses[-1])

    return sum_losses, cls_losses, ctr_losses, box_losses, mask_losses
def forward(self, inputs, target, next_word_history, cache_history, begin_state=None): # pylint: disable=arguments-differ
    """Defines the forward computation for cache cell.

    Arguments can be either :py:class:`NDArray` or :py:class:`Symbol`.

    Parameters
    ----------
    inputs: NDArray
        The input data
    target: NDArray
        The label
    next_word_history: NDArray
        The next word in memory
    cache_history: NDArray
        The hidden state in cache history

    Returns
    --------
    out: NDArray
        The linear interpolation of the cache language model
        with the regular word-level language model
    next_word_history: NDArray
        The next words to be kept in the memory for look up
        (size is equal to the window size)
    cache_history: NDArray
        The hidden states to be kept in the memory for look up
        (size is equal to the window size)
    """
    # Call forward() of the wrapped language model's parent class.
    parent_forward = super(self.lm_model.__class__, self.lm_model).forward
    output, hidden, encoder_hs, _ = parent_forward(inputs, begin_state)
    encoder_h = encoder_hs[-1].reshape(-3, -2)
    output = output.reshape(-1, self._vocab_size)

    # One-hot rows for the incoming targets, appended to the running history.
    fresh_words = nd.concat(*[nd.one_hot(t[0], self._vocab_size, on_value=1, off_value=0)
                              for t in target], dim=0)
    if next_word_history is None:
        start_idx = 0
        next_word_history = fresh_words
    else:
        start_idx = len(next_word_history)
        next_word_history = nd.concat(next_word_history, fresh_words, dim=0)

    if cache_history is None:
        cache_history = encoder_h
    else:
        cache_history = nd.concat(cache_history, encoder_h, dim=0)

    window = self._window
    out = None
    for idx, vocab_L in enumerate(nd.softmax(output)):
        pos = start_idx + idx
        joint_p = vocab_L
        if pos > window:
            # Attend over the `window` entries preceding this position.
            valid_next_word = next_word_history[pos - window:pos]
            valid_cache_history = cache_history[pos - window:pos]
            logits = nd.dot(valid_cache_history, encoder_h[idx])
            cache_attn = nd.softmax(self._theta * logits).reshape(-1, 1)
            cache_dist = (cache_attn.broadcast_to(valid_next_word.shape)
                          * valid_next_word).sum(axis=0)
            joint_p = self._lambdas * cache_dist + (1 - self._lambdas) * vocab_L

        picked = joint_p[target[idx]]
        if out is None:
            out = picked
        else:
            out = nd.concat(out, picked, dim=0)

    # Keep only the most recent `window` entries for the next call.
    next_word_history = next_word_history[-window:]
    cache_history = cache_history[-window:]
    return out, next_word_history, cache_history, hidden
def train():
    """Train the variational parameters and return the learned means.

    Each weight has a mean ``mu`` and a parameter ``rho``; the standard
    deviation is ``softplus(rho)``.  Per batch, weights are sampled as
    ``mu + sigma * epsilon``, the combined loss is back-propagated and SGD
    is applied to all ``mu`` and ``rho``.

    Returns
    -------
    list
        The final ``mus`` converted to nested Python lists.
    """
    # 1. Init params
    weight_scale = .1
    rho_offset = -3

    # initialize variational parameters; mean and variance for each weight
    mus = []
    rhos = []
    for shape in layer_param_shapes:
        mu = nd.random_normal(shape=shape, ctx=ctx, scale=weight_scale)
        rho = rho_offset + nd.zeros(shape=shape, ctx=ctx)
        mus.append(mu)
        rhos.append(rho)

    variational_params = mus + rhos
    for param in variational_params:
        param.attach_grad()

    # 2. Functions for main training loop
    def sample_epsilons(param_shapes):
        """Draw one standard-normal array per parameter shape."""
        return [
            nd.random_normal(shape=shape, loc=0., scale=1.0, ctx=ctx)
            for shape in param_shapes
        ]

    def softplus(x):
        """Map ``x`` to a positive value via ``log(1 + exp(x))``."""
        return nd.log(1. + nd.exp(x))

    def transform_rhos(rhos):
        """Convert every rho into a standard deviation."""
        return [softplus(rho) for rho in rhos]

    def transform_gaussian_samples(mus, sigmas, epsilons):
        """Return ``mu + sigma * epsilon`` for each parameter."""
        return [mu + sigma * eps for mu, sigma, eps in zip(mus, sigmas, epsilons)]

    # 3. Complete training loop
    epochs = config['epochs']
    learning_rate = config['learning_rate']
    smoothing_constant = .01
    train_acc = []
    test_acc = []
    # None until the first batch has been seen, so an empty loader no longer
    # leaves the name unbound at the print below.
    moving_loss = None

    for e in range(epochs):
        for i, (data, label) in enumerate(train_data):
            # NOTE(review): the shape print and the 5-batch cap look like
            # debugging leftovers; kept so behaviour is unchanged -- confirm.
            print(data.shape, label.shape)
            if i == 5:
                break
            data = data.as_in_context(ctx).reshape((-1, 784))
            label = label.as_in_context(ctx)
            label_one_hot = nd.one_hot(label, 10)

            with autograd.record():
                # sample epsilons from standard normal
                epsilons = sample_epsilons(layer_param_shapes)
                # compute softplus for variance
                sigmas = transform_rhos(rhos)
                # obtain a sample from q(w|theta) by transforming the epsilons
                layer_params = transform_gaussian_samples(mus, sigmas, epsilons)
                # forward-propagate the batch
                output = net(data, layer_params)
                # calculate the loss
                loss = combined_loss(output, label_one_hot, layer_params,
                                     mus, sigmas, gaussian_prior,
                                     log_softmax_likelihood)

            # backpropagate for gradient calculation
            loss.backward()

            # apply stochastic gradient descent to variational parameters
            SGD(variational_params, learning_rate)

            # calculate moving loss for monitoring convergence
            curr_loss = nd.mean(loss).asscalar()
            if moving_loss is None:
                moving_loss = curr_loss
            else:
                moving_loss = ((1 - smoothing_constant) * moving_loss +
                               smoothing_constant * curr_loss)

        test_accuracy = evaluate_accuracy(test_data, net, mus)
        train_accuracy = evaluate_accuracy(train_data, net, mus)
        # np.asscalar was removed from NumPy; asarray(...).item() yields the
        # same Python scalar and also accepts plain floats.
        train_acc.append(np.asarray(train_accuracy).item())
        test_acc.append(np.asarray(test_accuracy).item())
        print("Epoch %s. Loss: %s, Train_acc %s, Test_acc %s" %
              (e, moving_loss, train_accuracy, test_accuracy))

    return [mu.asnumpy().tolist() for mu in mus]
def to_onehot(X, size): # 本函数已保存在d2lzh包中方便以后使用 # 5 x 2 # return [nd.one_hot(x, size) for x in X.T]
from mxnet import autograd, nd
from mxnet.gluon import loss as gloss
import time

# Load the lyrics corpus.  Per the original notes:
#   corpus_indices  the corpus expressed as indices
#   char_to_idx     character -> index mapping
#   idx_to_char     index -> character mapping
#   vocab_size      vocabulary size (number of distinct characters)
(corpus_indices, char_to_idx, idx_to_char, vocab_size) = d2l.load_data_jay_lyrics()

# Original note: vocab_size is 1027, so the result below is 2 x 1027;
# row 0 has its 1 at position 0 and row 1 has its 1 at position 2.
tmp = nd.one_hot(nd.array([0, 2]), vocab_size)
print(tmp)

# Original note: to_onehot is kept in the d2lzh package for later use and is
# left disabled here (it would return 5 arrays, each with 2 rows):
# def to_onehot(X, size):
#     return [nd.one_hot(x, size) for x in X.T]

# 2 x 5
X = nd.arange(10).reshape((2, 5))
# 2 x 1027
# 2 x 1027
def calculation(self, input_str, en_dict, ko_dict, ko_rev_dict, ctx):
    """Run inference: encode ``input_str`` and greedily decode an output sequence.

    Parameters
    ----------
    input_str : str
        Source sentence; it is stripped and tokenised with ``mecab.morphs``.
    en_dict : dict
        Vocabulary handed to ``encoding_and_padding`` for the source side.
    ko_dict : dict
        Target vocabulary; only the ``'START'`` entry is read here.
    ko_rev_dict : mapping
        Target index -> token lookup.
    ctx : context
        Device on which arrays are created.

    Returns
    -------
    tuple
        ``(decoded, att_weights)``: ``decoded`` is the space-joined list of
        generated tokens (an empty string when the first generated token is
        ``'END'`` or ``'__PAD__'``); ``att_weights`` is the per-step attention
        weights concatenated along axis 0, or ``None`` when ``self.attention``
        is disabled.
    """
    # Wrap the tokenised sentence in START / END markers.
    input_str = [
        [
            'START',
        ] + mecab.morphs(input_str.strip()) + [
            'END',
        ],
    ]
    X = encoding_and_padding(input_str, en_dict, max_seq=self.max_seq_length)
    # string to embed
    inputs = F.array(X, ctx=ctx)
    inputs = F.cast(inputs, dtype='float32')
    # Position of the end token in each row (arg-max over an equality mask).
    in_sent_last_idx = F.argmax(F.where(inputs == self.end_idx, F.ones_like(inputs),
                                        F.zeros_like(inputs)),
                                axis=1)

    # encoder GRU
    embeddinged_in = F.cast(self.embedding(inputs), dtype='float32')
    next_h = F.random.normal(0, 1, (1, self.n_hidden), ctx=ctx)
    for j in range(self.in_seq_len):
        p_outputs = F.slice_axis(embeddinged_in, axis=1, begin=j, end=j + 1)
        p_outputs = F.reshape(p_outputs, (-1, self.embed_dim))
        enout, (next_h, ) = self.encoder(p_outputs, [
            next_h,
        ])
        if j == 0:
            enouts = enout
            next_hs = next_h
        else:
            enouts = F.concat(enouts, enout, dim=1)
            next_hs = F.concat(next_hs, next_h, dim=1)

    # masking with 0 using length
    enouts = F.reshape(enouts, (-1, self.in_seq_len, self.n_hidden))
    enouts = F.transpose(enouts, (1, 0, 2))
    enouts = F.SequenceMask(enouts, sequence_length=in_sent_last_idx + 1,
                            use_sequence_length=True)
    enouts = F.transpose(enouts, (1, 0, 2))

    next_hs = F.reshape(next_hs, (-1, self.n_hidden))
    # take() only supports axis 0, hence the flattening above:
    # N, 30, 300 -> N * 30, 300 , N = (0,1,2,3,4,5...)
    next_hs = next_hs.take(in_sent_last_idx)

    # Embed 'START' as the decoder's initial input.
    Y_init = F.array([
        [
            ko_dict['START'],
        ],
    ], ctx=ctx)
    Y_init = F.cast(self.embedding(Y_init), dtype='float32')
    deout = Y_init[:, 0, :]

    # Both results are initialised up front so the return below is always
    # defined: previously an immediate END/PAD left `ret_seq` unbound and a
    # model without attention left `att_weights` unbound.
    ret_seq = []
    att_weights = None

    # Iterate for at most the output sequence length.
    for i in range(self.out_seq_len):
        if self.attention:
            deout, att_weight = self.apply_attention(
                F=F, inputs=deout, hidden=next_hs, encoder_outputs=enouts)
            if att_weights is None:
                att_weights = att_weight
            else:
                att_weights = F.concat(att_weights, att_weight, dim=0)
        deout, (next_hs, ) = self.decoder(deout, [
            next_hs,
        ])
        # Add an axis for batchnorm, then restore the original rank.
        deout = F.expand_dims(deout, axis=1)
        deout = self.batchnorm(deout)
        # reduce dim
        deout = deout[:, 0, :]
        # Scores for the next token in the sequence.
        deout_sm = self.dense(deout)
        deout = F.one_hot(F.argmax(F.softmax(deout_sm, axis=1), axis=1),
                          depth=self.vocab_size)
        # Convert the prediction into decoder input (embed it, match dims).
        deout = F.argmax(deout, axis=1)
        deout = F.expand_dims(deout, axis=0)
        deout = F.cast(self.embedding(deout)[:, 0, :], dtype='float32')
        gen_char = ko_rev_dict[F.argmax(deout_sm, axis=1).asnumpy()[0].astype('int')]
        if gen_char == '__PAD__' or gen_char == 'END':
            break
        ret_seq.append(gen_char)
    return (" ".join(ret_seq), att_weights)
def train_and_valid(en_bert, mt_model, en_vocab, ch_vocab, train_dataiter,
                    dev_dataiter, trainer, en_finetune_trainer, epochs,
                    loss_func, ctx, lr, batch_size, params_save_step,
                    params_save_path_root, eval_step, log_step, check_step,
                    label_smooth, logger, num_train_examples, warmup_ratio):
    """Train the translation model, with periodic logging, evaluation and checkpoints.

    The learning rate of ``trainer`` rises linearly during the warm-up steps
    and then decays linearly; ``en_finetune_trainer`` keeps whatever rate it
    was created with.  Every ``log_step`` / ``check_step`` / ``eval_step`` /
    ``params_save_step`` global steps (never at step 0) the function logs
    accuracy, logs a decoded sample, runs ``eval`` on the dev set, or saves
    both models' parameters under ``params_save_path_root``.

    ``label_smooth`` selects smoothed targets (eps = 0.1); otherwise plain
    one-hot targets are passed to ``loss_func``.
    """
    batches = len(train_dataiter)

    num_train_steps = int(num_train_examples / batch_size * epochs)
    num_warmup_steps = int(num_train_steps * warmup_ratio)
    # At least 1 so the decay below never divides by zero.
    num_decay_steps = max(num_train_steps - num_warmup_steps, 1)
    global_step = 0
    dev_bleu_score = 0

    for epoch in range(epochs):
        for trans, aim, label, trans_valid_len, aim_valid_len in train_dataiter:
            if global_step < num_warmup_steps:
                new_lr = lr * global_step / num_warmup_steps
            else:
                non_warmup_steps = global_step - num_warmup_steps
                offset = non_warmup_steps / num_decay_steps
                # num_train_steps is a truncated estimate, so the loop can run
                # past it; clamp so the rate never turns negative.
                new_lr = max(lr - offset * lr, 0.0)
            trainer.set_learning_rate(new_lr)

            trans = trans.as_in_context(ctx)
            aim = aim.as_in_context(ctx)
            label = label.as_in_context(ctx)
            trans_valid_len = trans_valid_len.as_in_context(ctx)
            trans_token_type = nd.zeros_like(trans, ctx=ctx)

            aim_mask = nd.not_equal(aim, ch_vocab(ch_vocab.padding_token))

            # Targets are always built; previously `one_hot_label` was only
            # assigned when label_smooth was truthy and was unbound otherwise.
            num_class = len(ch_vocab.idx_to_token)
            one_hot = nd.one_hot(label, num_class)
            if label_smooth:
                eps = 0.1
                one_hot_label = one_hot * (1 - eps) + (
                    1 - one_hot) * eps / num_class
            else:
                one_hot_label = one_hot

            with autograd.record():
                en_bert_outputs = en_bert(trans, trans_token_type, trans_valid_len)
                mt_outputs = mt_model(en_bert_outputs, trans, aim)
                loss_mean = loss_func(mt_outputs, one_hot_label, aim_mask)

            loss_mean.backward()
            loss_scalar = loss_mean.asscalar()

            trainer.step(1)
            en_finetune_trainer.step(1)

            if global_step and global_step % log_step == 0:
                predicts = nd.argmax(nd.softmax(mt_outputs, axis=-1), axis=-1)
                correct = nd.equal(label, predicts)
                # Accuracy over positions where aim_mask is non-zero only.
                accuracy = (nd.sum(correct * aim_mask) / nd.sum(aim_mask)).asscalar()
                logger.info(
                    "epoch:{}, batch:{}/{}, bleu:{}, acc:{}, loss:{}, (lr:{}s)"
                    .format(epoch, global_step % batches, batches,
                            dev_bleu_score, accuracy, loss_scalar,
                            trainer.learning_rate))

            if global_step and global_step % check_step == 0:
                # Log the first sample of the batch as source / target / prediction.
                predicts = nd.argmax(nd.softmax(mt_outputs, axis=-1), axis=-1)
                refer_sample = trans.asnumpy().tolist()
                label_sample = label.asnumpy().tolist()
                pred_sample = predicts.asnumpy().tolist()
                logger.info("train sample:")
                logger.info("refer  :{}".format(" ".join([
                    en_vocab.idx_to_token[int(idx)] for idx in refer_sample[0]
                ])).replace(en_vocab.padding_token, ""))
                logger.info("target :{}".format(" ".join([
                    ch_vocab.idx_to_token[int(idx)] for idx in label_sample[0]
                ])).replace(EOS, "[EOS]").replace(ch_vocab.padding_token, ""))
                logger.info("predict:{}".format(" ".join([
                    ch_vocab.idx_to_token[int(idx)] for idx in pred_sample[0]
                ])).replace(EOS, "[EOS]"))

            if global_step and global_step % eval_step == 0:
                dev_bleu_score = eval(en_bert, mt_model, en_vocab, ch_vocab,
                                      dev_dataiter, logger, ctx=ctx)

            if global_step and global_step % params_save_step == 0:
                if not os.path.exists(params_save_path_root):
                    os.makedirs(params_save_path_root)
                model_params_file = params_save_path_root + \
                    "en_bert.ft_step_{}.params".format(global_step)
                en_bert.save_parameters(model_params_file)
                logger.info("{} Save Completed.".format(model_params_file))

                model_params_file = params_save_path_root + \
                    "mt_step_{}.params".format(global_step)
                mt_model.save_parameters(model_params_file)
                logger.info("{} Save Completed.".format(model_params_file))
            global_step += 1
import d2lzh as d2l
from mxnet import autograd, nd
from mxnet.gluon import loss as gloss
import math, time, numpy as np

# Load the data.  Per the original notes:
#   corpus_indices  indices of the corpus characters (about 10k characters)
#   char_to_idx     character -> index
#   idx_to_char     index -> character
#   vocab_size      number of distinct characters
(corpus_indices, char_to_idx, idx_to_char, vocab_size) = d2l.load_data_jay_lyrics()

# One-hot vectors: each row holds a single 1, here at positions 1 and 2.
print(nd.one_hot(nd.array([1, 2]), vocab_size))


def to_onehot(X, size):
    """Return one one-hot array of depth ``size`` per row of ``X.T``."""
    # Original note: columns of X are features, rows are samples.
    return [nd.one_hot(x, size) for x in X.T]


# Test
X = nd.arange(10).reshape((2, 5))  # 2: batch_size, 5: num_steps
# Becomes num_steps arrays of shape (batch_size, vocab_size).
inputs = to_onehot(X, vocab_size)
np.set_printoptions(edgeitems=6)  # number of edge items printed (default is 3)
print(len(inputs), inputs[0])  # original note: length 5, each 2 x 1027

# ----------------------- TODO: initialise model parameters -----------------------
num_inputs, num_hiddens, num_outputs = vocab_size, 256, vocab_size
ctx = d2l.try_gpu()
print('use ', ctx)
def fuzzy_one_hot(arr, size):
    """Return a noisy one-hot encoding of ``arr`` flattened to one dimension.

    Positions that would be 1 receive a uniform sample from [0.7, 1.2);
    every other position receives a uniform sample from [0.0, 0.3).
    """
    flat = arr.reshape((-1, ))
    noise_shape = (flat.shape[0], size)
    hot_mask = nd.one_hot(flat, size)
    # Draw order (high range first, then low range) matches the original call.
    hot_values = nd.uniform(low=0.7, high=1.2, shape=noise_shape, ctx=flat.context)
    cold_values = nd.uniform(low=0.0, high=0.3, shape=noise_shape, ctx=flat.context)
    return nd.where(hot_mask, hot_values, cold_values)
def forward(self, inputs, target, next_word_history, cache_history, begin_state=None): # pylint: disable=arguments-differ
    """Defines the forward computation for cache cell.

    Arguments can be either :py:class:`NDArray` or :py:class:`Symbol`.

    Parameters
    ----------
    inputs: NDArray
        The input data
    target: NDArray
        The label
    next_word_history: NDArray
        The next word in memory
    cache_history: NDArray
        The hidden state in cache history

    Returns
    --------
    out: NDArray
        The linear interpolation of the cache language model
        with the regular word-level language model
    next_word_history: NDArray
        The next words to be kept in the memory for look up
        (size is equal to the window size)
    cache_history: NDArray
        The hidden states to be kept in the memory for look up
        (size is equal to the window size)
    """
    # Call forward() of the wrapped language model's parent class.
    parent_forward = super(self.lm_model.__class__, self.lm_model).forward
    output, hidden, encoder_hs, _ = parent_forward(inputs, begin_state)
    encoder_h = encoder_hs[-1].reshape(-3, -2)
    output = output.reshape(-1, self._vocab_size)

    # One-hot rows for the incoming targets, appended to the running history.
    fresh_words = nd.concat(*[nd.one_hot(t[0], self._vocab_size, on_value=1, off_value=0)
                              for t in target], dim=0)
    if next_word_history is None:
        start_idx = 0
        next_word_history = fresh_words
    else:
        start_idx = len(next_word_history)
        next_word_history = nd.concat(next_word_history, fresh_words, dim=0)

    if cache_history is None:
        cache_history = encoder_h
    else:
        cache_history = nd.concat(cache_history, encoder_h, dim=0)

    window = self._window
    out = None
    for idx, vocab_L in enumerate(nd.softmax(output)):
        pos = start_idx + idx
        joint_p = vocab_L
        if pos > window:
            # Attend over the `window` entries preceding this position.
            valid_next_word = next_word_history[pos - window:pos]
            valid_cache_history = cache_history[pos - window:pos]
            logits = nd.dot(valid_cache_history, encoder_h[idx])
            cache_attn = nd.softmax(self._theta * logits).reshape(-1, 1)
            cache_dist = (cache_attn.broadcast_to(valid_next_word.shape)
                          * valid_next_word).sum(axis=0)
            joint_p = self._lambdas * cache_dist + (1 - self._lambdas) * vocab_L

        picked = joint_p[target[idx]]
        if out is None:
            out = picked
        else:
            out = nd.concat(out, picked, dim=0)

    # Keep only the most recent `window` entries for the next call.
    next_word_history = next_word_history[-window:]
    cache_history = cache_history[-window:]
    return out, next_word_history, cache_history, hidden
def forward(self, inputs, state): X = nd.one_hot(inputs.T, self.V)
def forward(self, img, xs, anchors, offsets, gt_boxes, gt_ids, gt_mixratio=None):
    """Generating training targets that do not require network predictions.

    Shape examples below come from the original author's notes for a
    416x416 input with 13/26/52 feature maps; they are illustrative only.

    Parameters
    ----------
    img : mxnet.nd.NDArray
        Original image tensor; only ``shape[2]`` (height) and ``shape[3]``
        (width) are read.  Example: ``(1, 3, 416, 416)``.
    xs : list of mxnet.nd.NDArray
        List of feature maps; only ``shape[2]`` / ``shape[3]`` are read.
        Example: ``(1, 1, 13, 13)``, ``(1, 1, 26, 26)``, ``(1, 1, 52, 52)``.
    anchors : list of mxnet.nd.NDArray
        YOLO3 anchors, one array per feature map.
        Example: three arrays of shape ``(1, 1, 3, 2)``.
    offsets : list of mxnet.nd.NDArray
        Pre-generated x and y offsets for YOLO3 (relative to the top-left
        corner of each grid cell).  Example: ``(1, 169, 1, 2)``,
        ``(1, 676, 1, 2)``, ``(1, 2704, 1, 2)``.
    gt_boxes : mxnet.nd.NDArray
        Ground-truth boxes, ``(B, M, 4)``.
    gt_ids : mxnet.nd.NDArray
        Ground-truth IDs.
    gt_mixratio : mxnet.nd.NDArray, optional
        Mixup ratio from 0 to 1.

    Returns
    -------
    (tuple of) mxnet.nd.NDArray
        objectness: 0 for negative, 1 for positive, -1 for ignore.
        center_targets: regression target for center x and y.
        scale_targets: regression target for scale x and y.
        weights: element-wise gradient weights for center_targets and scale_targets.
        class_targets: a one-hot vector for classification.

    """
    assert isinstance(anchors, (list, tuple))
    all_anchors = nd.concat(*[a.reshape(-1, 2) for a in anchors], dim=0)
    assert isinstance(offsets, (list, tuple))
    all_offsets = nd.concat(*[o.reshape(-1, 2) for o in offsets], dim=0)
    # Cumulative counts per layer, e.g. anchors [3, 6, 9] and
    # offsets [169, 845, 3549] in the 416x416 example.
    num_anchors = np.cumsum([a.size // 2 for a in anchors])
    num_offsets = np.cumsum([o.size // 2 for o in offsets])
    _offsets = [0] + num_offsets.tolist()  # start index of each layer's cells
    assert isinstance(xs, (list, tuple))
    # One anchor group and one offset group per feature map.
    assert len(xs) == len(anchors) == len(offsets)

    # orig image size
    orig_height = img.shape[2]
    orig_width = img.shape[3]
    # Targets are built without recording gradients.
    with autograd.pause():
        # outputs
        # Only the shape of this product matters: (B, cells, anchors, 2),
        # e.g. (1, 3549, 9, 2) -- every grid cell paired with every anchor.
        shape_like = all_anchors.reshape((1, -1, 2)) * all_offsets.reshape(
            (-1, 1, 2)).expand_dims(0).repeat(repeats=gt_ids.shape[0], axis=0)
        # Everything starts at zero.
        center_targets = nd.zeros_like(shape_like)
        scale_targets = nd.zeros_like(center_targets)
        weights = nd.zeros_like(center_targets)
        objectness = nd.zeros_like(weights.split(axis=-1, num_outputs=2)[0])
        # one_hot is used only to obtain a (B, cells, anchors, num_class)
        # array; its contents are overwritten on the next line.
        class_targets = nd.one_hot(objectness.squeeze(axis=-1), depth=self._num_class)
        class_targets[:] = -1  # prefill -1 for ignores

        # for each ground-truth, find the best matching anchor within the particular grid
        # for instance, center of object 1 reside in grid (3, 4) in (16, 16) feature map
        # then only the anchor in (3, 4) is going to be matched
        #
        # Boxes and anchors are both centred on the origin so the IoU compares
        # width/height only.  Per the original notes gtx/gty/gtw/gth are
        # (B, M, 1) and the IoU result is (B, num_anchors, M).
        gtx, gty, gtw, gth = self.bbox2center(gt_boxes)
        shift_gt_boxes = nd.concat(-0.5 * gtw, -0.5 * gth, 0.5 * gtw, 0.5 * gth, dim=-1)
        anchor_boxes = nd.concat(0 * all_anchors, all_anchors, dim=-1)  # zero center anchors
        shift_anchor_boxes = self.bbox2corner(anchor_boxes)  # back to corner format
        ious = nd.contrib.box_iou(shift_anchor_boxes, shift_gt_boxes).transpose((1, 0, 2))
        # real value is required to process, convert to Numpy
        # For each ground truth: index (over all anchors of all layers) of the
        # anchor with the highest IoU.
        matches = ious.argmax(axis=1).asnumpy()  # (B, M)
        # 1 where all four coordinates are non-negative, 0 otherwise.
        valid_gts = (gt_boxes >= 0).asnumpy().prod(axis=-1)  # (B, M)
        np_gtx, np_gty, np_gtw, np_gth = [x.asnumpy() for x in [gtx, gty, gtw, gth]]
        np_anchors = all_anchors.asnumpy()
        np_gt_ids = gt_ids.asnumpy()
        np_gt_mixratios = gt_mixratio.asnumpy() if gt_mixratio is not None else None
        # TODO(zhreshold): the number of valid gt is not a big number, therefore for loop
        # should not be a problem right now. Switch to better solution is needed.
        for b in range(matches.shape[0]):  # batch
            for m in range(matches.shape[1]):  # ground truths
                if valid_gts[b, m] < 1:
                    # `break` (not `continue`): the rest of this sample's boxes
                    # are skipped too -- presumably invalid boxes are padding
                    # at the end; confirm against the batchify function.
                    break
                match = int(matches[b, m])  # best-matching anchor index
                # First layer whose cumulative anchor count exceeds the index,
                # i.e. the layer that owns the matched anchor.
                nlayer = np.nonzero(num_anchors > match)[0][0]
                height = xs[nlayer].shape[2]
                width = xs[nlayer].shape[3]
                gtx, gty, gtw, gth = (np_gtx[b, m, 0], np_gty[b, m, 0],
                                      np_gtw[b, m, 0], np_gth[b, m, 0])
                # compute the location of the gt centers:
                # gtx / orig_width * width is the centre in feature-map units;
                # truncating gives the column of the grid cell holding it.
                loc_x = int(gtx / orig_width * width)
                loc_y = int(gty / orig_height * height)
                # write back to targets
                # Flat cell index: start of this layer + row * width + column.
                index = _offsets[nlayer] + loc_y * width + loc_x
                # Fractional offset of the centre inside its cell.
                center_targets[b, index, match, 0] = gtx / orig_width * width - loc_x  # tx
                center_targets[b, index, match, 1] = gty / orig_height * height - loc_y  # ty
                # Log ratio of box size to anchor size (size floored at 1).
                scale_targets[b, index, match, 0] = np.log(max(gtw, 1) / np_anchors[match, 0])
                scale_targets[b, index, match, 1] = np.log(max(gth, 1) / np_anchors[match, 1])
                # 2 - (box area / image area): smaller boxes get larger weights.
                weights[b, index, match, :] = 2.0 - gtw * gth / orig_width / orig_height
                objectness[b, index, match, 0] = (
                    np_gt_mixratios[b, m, 0] if np_gt_mixratios is not None else 1)
                # Replace the -1 "ignore" prefill with a one-hot row.
                class_targets[b, index, match, :] = 0
                class_targets[b, index, match, int(np_gt_ids[b, m, 0])] = 1
        # since some stages won't see partial anchors, so we have to slice the correct targets
        objectness = self._slice(objectness, num_anchors, num_offsets)
        center_targets = self._slice(center_targets, num_anchors, num_offsets)
        scale_targets = self._slice(scale_targets, num_anchors, num_offsets)
        weights = self._slice(weights, num_anchors, num_offsets)
        class_targets = self._slice(class_targets, num_anchors, num_offsets)
    # Original note: each output ends up as (B, 10647, 1 or 2) in the 416x416
    # example, where 10647 = 13*13*3 + 26*26*3 + 52*52*3.
    return objectness, center_targets, scale_targets, weights, class_targets
def forward(self, inputs, state):
    """One-hot encode the inputs, run the RNN and project with the dense layer.

    NOTE(review): the visible snippet stops after the dense projection and
    has no return statement, so as written the call evaluates to None.
    Confirm against the full source whether ``return output, state`` is
    missing.
    """
    # Transpose before encoding, then one-hot over the vocabulary size.
    encoded = nd.one_hot(inputs.T, self.vocab_size)
    rnn_out, state = self.rnn(encoded, state)
    # Collapse every leading axis so the dense layer sees 2-D input.
    flattened = rnn_out.reshape((-1, rnn_out.shape[-1]))
    output = self.Dense(flattened)
def forward(self, inputs, state, *args):
    """Run one forward pass: one-hot encode, RNN, dense projection.

    Returns the dense output (one row per flattened RNN output row) together
    with the updated RNN state. Extra positional arguments are accepted and
    ignored.
    """
    # Transpose before encoding; the one-hot depth is self.data_size.
    encoded = nd.one_hot(inputs.T, self.data_size)
    rnn_out, new_state = self.rnn(encoded, state)
    # Merge all leading axes so the dense layer receives a 2-D array.
    feature_dim = rnn_out.shape[-1]
    flattened = rnn_out.reshape((-1, feature_dim))
    return self.dense(flattened), new_state
def to_onehot(X, size):
    """Return a list holding the one-hot encoding of each row of ``X.T``."""
    encoded = []
    for column in X.T:
        encoded.append(nd.one_hot(column, size))
    return encoded
# Training script for CapsNet on MNIST; everything runs on the CPU context.
ctx = mx.cpu()
train_data, test_data = load_data_mnist(batch_size=batch_size, resize=28)
#print(train_data.shape)
net = CapsNet(batch_size=batch_size, ctx=ctx)
print(net)
trainer = Trainer(net.collect_params(), 'adam', {'learning_rate': 0.01})
for epoch in range(epochs):
    # Accumulators are reset at the start of every epoch.
    # NOTE(review): train_loss0 / train_acc0 are not used in the visible
    # part of the script - confirm whether later code reads them.
    train_loss0 = 0.
    train_acc0 = 0.
    train_loss = 0.
    train_acc = 0.
    for i, batch in enumerate(train_data):
        data, label = batch
        # One-hot encode the labels over 10 classes for the loss call.
        one_hot_label = nd.one_hot(label, 10)
        label = label.as_in_context(ctx)
        one_hot_label = one_hot_label.as_in_context(ctx)
        data = data.as_in_context(ctx)
        with autograd.record():
            output = net(data)
            L = CapsuleMarginLoss(output, one_hot_label, lambda_value)
        L.backward()
        # The batch size is passed to the trainer step.
        trainer.step(data.shape[0])
        # Number of batches processed so far in this epoch.
        n = i + 1
        # Accumulate the per-batch mean loss and mean accuracy.
        train_loss += nd.mean(L).asscalar()
        train_acc += nd.mean(nd.argmax(output, axis=1) == label).asscalar()
def forward(self, img, xs, anchors, offsets, gt_boxes, gt_ids, gt_mixratio=None):
    """Build YOLO3 training targets from ground truth alone.

    Parameters
    ----------
    img : mxnet.nd.NDArray
        Original image tensor; axes 2 and 3 are read as height and width.
    xs : list of mxnet.nd.NDArray
        Feature maps, one per output layer.
    anchors : list of mxnet.nd.NDArray
        YOLO3 anchors, one entry per output layer.
    offsets : list of mxnet.nd.NDArray
        Pre-generated x and y offsets, one entry per output layer.
    gt_boxes : mxnet.nd.NDArray
        Ground-truth boxes.
    gt_ids : mxnet.nd.NDArray
        Ground-truth class IDs.
    gt_mixratio : mxnet.nd.NDArray, optional
        Mixup ratio; when given it is written as the objectness target.

    Returns
    -------
    tuple of mxnet.nd.NDArray
        objectness, center_targets, scale_targets, weights, class_targets,
        each passed through ``self._slice``.

    """
    assert isinstance(anchors, (list, tuple))
    flat_anchors = nd.concat(*[a.reshape(-1, 2) for a in anchors], dim=0)
    assert isinstance(offsets, (list, tuple))
    flat_offsets = nd.concat(*[o.reshape(-1, 2) for o in offsets], dim=0)
    # Cumulative counts give the exclusive end index of each layer.
    anchor_ends = np.cumsum([a.size // 2 for a in anchors])
    offset_ends = np.cumsum([o.size // 2 for o in offsets])
    layer_starts = [0] + offset_ends.tolist()
    assert isinstance(xs, (list, tuple))
    assert len(xs) == len(anchors) == len(offsets)

    # Original image size.
    img_h = img.shape[2]
    img_w = img.shape[3]

    with autograd.pause():
        # Template whose shape (batch, cells, anchors, 2) the targets copy.
        batched_offsets = flat_offsets.reshape((-1, 1, 2)).expand_dims(0).repeat(
            repeats=gt_ids.shape[0], axis=0)
        template = flat_anchors.reshape((1, -1, 2)) * batched_offsets
        center_targets = nd.zeros_like(template)
        scale_targets = nd.zeros_like(center_targets)
        weights = nd.zeros_like(center_targets)
        objectness = nd.zeros_like(weights.split(axis=-1, num_outputs=2)[0])
        class_targets = nd.one_hot(objectness.squeeze(axis=-1), depth=self._num_class)
        class_targets[:] = -1  # every entry starts out as "ignore"

        # Anchor matching compares sizes only: both the ground truth and the
        # anchors are turned into boxes centred on the origin.
        ctr_x, ctr_y, box_w, box_h = self.bbox2center(gt_boxes)
        centred_gt = nd.concat(-0.5 * box_w, -0.5 * box_h, 0.5 * box_w, 0.5 * box_h, dim=-1)
        centred_anchors = self.bbox2corner(nd.concat(0 * flat_anchors, flat_anchors, dim=-1))
        ious = nd.contrib.box_iou(centred_anchors, centred_gt).transpose((1, 0, 2))

        # Concrete values are needed for indexing below, so move to NumPy.
        matches = ious.argmax(axis=1).asnumpy()  # (B, M)
        valid_gts = (gt_boxes >= 0).asnumpy().prod(axis=-1)  # (B, M)
        np_cx, np_cy, np_w, np_h = [t.asnumpy() for t in (ctr_x, ctr_y, box_w, box_h)]
        np_anchors = flat_anchors.asnumpy()
        np_gt_ids = gt_ids.asnumpy()
        np_gt_mixratios = None if gt_mixratio is None else gt_mixratio.asnumpy()

        num_images, num_boxes = matches.shape[0], matches.shape[1]
        for b in range(num_images):
            for m in range(num_boxes):
                if valid_gts[b, m] < 1:
                    # First invalid box ends this image's ground truth.
                    break
                anchor_idx = int(matches[b, m])
                # First layer whose cumulative anchor count exceeds the index.
                layer = np.nonzero(anchor_ends > anchor_idx)[0][0]
                feat_h = xs[layer].shape[2]
                feat_w = xs[layer].shape[3]
                cx = np_cx[b, m, 0]
                cy = np_cy[b, m, 0]
                bw = np_w[b, m, 0]
                bh = np_h[b, m, 0]

                # Centre position expressed in feature-map cells.
                fx = cx / img_w * feat_w
                fy = cy / img_h * feat_h
                col = int(fx)
                row = int(fy)
                cell = layer_starts[layer] + row * feat_w + col

                # Write the targets for the matched (cell, anchor) pair.
                center_targets[b, cell, anchor_idx, 0] = fx - col  # tx
                center_targets[b, cell, anchor_idx, 1] = fy - row  # ty
                scale_targets[b, cell, anchor_idx, 0] = np.log(
                    max(bw, 1) / np_anchors[anchor_idx, 0])
                scale_targets[b, cell, anchor_idx, 1] = np.log(
                    max(bh, 1) / np_anchors[anchor_idx, 1])
                weights[b, cell, anchor_idx, :] = 2.0 - bw * bh / img_w / img_h
                if np_gt_mixratios is not None:
                    obj_value = np_gt_mixratios[b, m, 0]
                else:
                    obj_value = 1
                objectness[b, cell, anchor_idx, 0] = obj_value
                class_targets[b, cell, anchor_idx, :] = 0
                class_targets[b, cell, anchor_idx, int(np_gt_ids[b, m, 0])] = 1

        # Each layer only sees part of the anchors, so slice every target.
        sliced = tuple(
            self._slice(target, anchor_ends, offset_ends)
            for target in (objectness, center_targets, scale_targets,
                           weights, class_targets))
    return sliced
def to_onehot(X, size):
    """One-hot encode ``X`` column by column (one column, one sample)."""
    columns = X.T
    return [nd.one_hot(col, size) for col in columns]
def forward(self, img, xs, anchors, offsets, gt_boxes, gt_ids, gt_mixratio=None):
    """Generating training targets that do not require network predictions.

    Parameters
    ----------
    img : mxnet.nd.NDArray
        Original image tensor.
    xs : list of mxnet.nd.NDArray
        List of feature maps.
    anchors : mxnet.nd.NDArray
        YOLO3 anchors.
    offsets : mxnet.nd.NDArray
        Pre-generated x and y offsets for YOLO3.
    gt_boxes : mxnet.nd.NDArray
        Ground-truth boxes.
    gt_ids : mxnet.nd.NDArray
        Ground-truth IDs.
    gt_mixratio : mxnet.nd.NDArray, optional
        Mixup ratio from 0 to 1.

    Returns
    -------
    (tuple of) mxnet.nd.NDArray
        objectness: 0 for negative, 1 for positive, -1 for ignore.
        center_targets: regression target for center x and y.
        scale_targets: regression target for scale x and y.
        weights: element-wise gradient weights for center_targets and scale_targets.
        class_targets: a one-hot vector for classification.

    """
    assert isinstance(anchors, (list, tuple))
    # `anchors` is an outer list holding one group of anchors per output layer.
    # Example for a 416x416 input: all_anchors has shape (9, 2).
    all_anchors = nd.concat(*[a.reshape(-1, 2) for a in anchors], dim=0)
    assert isinstance(offsets, (list, tuple))
    # `offsets` holds the grid offsets of every output layer.
    # Example for a 416x416 input: all_offsets has shape (3549, 2),
    # where 3549 = 169 (13*13) + 676 (26*26) + 2704 (52*52).
    all_offsets = nd.concat(*[o.reshape(-1, 2) for o in offsets], dim=0)
    # Example for a 416x416 input: num_anchors is [3, 6, 9].
    num_anchors = np.cumsum([a.size // 2 for a in anchors])
    # Example for a 416x416 input: num_offsets is [169, 845, 3549].
    num_offsets = np.cumsum([o.size // 2 for o in offsets])
    _offsets = [0] + num_offsets.tolist()
    assert isinstance(xs, (list, tuple))
    assert len(xs) == len(anchors) == len(offsets)

    # orig image size
    # Height and width of the training image.
    orig_height = img.shape[2]
    orig_width = img.shape[3]
    with autograd.pause():
        # outputs
        # shape_like: (N, 3549, 9, 2) in the example above; several targets
        # share this shape.
        shape_like = all_anchors.reshape((1, -1, 2)) * all_offsets.reshape(
            (-1, 1, 2)).expand_dims(0).repeat(repeats=gt_ids.shape[0], axis=0)
        # The buffers below hold the encoded ground truth that is returned.
        # center_targets: cx, cy, shape (N, 3549, 9, 2)
        center_targets = nd.zeros_like(shape_like)
        # scale_targets: w, h, shape (N, 3549, 9, 2)
        scale_targets = nd.zeros_like(center_targets)
        # weights: shape (N, 3549, 9, 2); TODO: document the meaning.
        weights = nd.zeros_like(center_targets)
        # objectness: confidence target, shape (N, 3549, 9, 1)
        objectness = nd.zeros_like(
            weights.split(axis=-1, num_outputs=2)[0])
        # class_targets: class label as a one-hot vector, shape
        # (N, 3549, 9, self._num_class); every entry starts at -1 (ignore).
        class_targets = nd.one_hot(objectness.squeeze(axis=-1),
                                   depth=self._num_class)
        class_targets[:] = -1  # prefill -1 for ignores

        # for each ground-truth, find the best matching anchor within the particular grid
        # for instance, center of object 1 reside in grid (3, 4) in (16, 16) feature map
        # then only the anchor in (3, 4) is going to be matched
        # Find the best matching anchor.
        # The IoU matching only compares box sizes, so the boxes are first
        # converted from corner format to center format.
        gtx, gty, gtw, gth = self.bbox2center(gt_boxes)
        # Boxes centred at (0, 0) with the same size as each ground-truth
        # box, expressed in corner format again.
        shift_gt_boxes = nd.concat(-0.5 * gtw, -0.5 * gth, 0.5 * gtw,
                                   0.5 * gth, dim=-1)
        # Prepend (0, 0) to each preset anchor, e.g. (0, 0, 116, 90): a
        # center-format box with the anchor's size.
        anchor_boxes = nd.concat(0 * all_anchors, all_anchors,
                                 dim=-1)  # zero center anchors
        # Convert the anchors to corner format to match the ground truth.
        shift_anchor_boxes = self.bbox2corner(anchor_boxes)
        # IoU between every anchor and every ground-truth box.
        ious = nd.contrib.box_iou(shift_anchor_boxes,
                                  shift_gt_boxes).transpose((1, 0, 2))
        # real value is required to process, convert to Numpy
        # For each ground-truth box, the anchor with the largest IoU.
        matches = ious.argmax(axis=1).asnumpy()  # (B, M)
        # valid_gts is a mask of usable boxes: a box with any negative
        # coordinate (presumably the -1 padding added by the dataloader to
        # equalise batch shapes - confirm) gets 0, all others get 1.
        valid_gts = (gt_boxes >= 0).asnumpy().prod(axis=-1)  # (B, M)
        np_gtx, np_gty, np_gtw, np_gth = [
            x.asnumpy() for x in [gtx, gty, gtw, gth]
        ]
        np_anchors = all_anchors.asnumpy()
        np_gt_ids = gt_ids.asnumpy()
        np_gt_mixratios = gt_mixratio.asnumpy(
        ) if gt_mixratio is not None else None
        # TODO(zhreshold): the number of valid gt is not a big number, therefore for loop
        # should not be a problem right now. Switch to better solution is needed.
        # Outer loop: images in the batch; inner loop: boxes of one image.
        for b in range(matches.shape[0]):
            for m in range(matches.shape[1]):
                # Padding is assumed to be appended after the real boxes, so
                # the first invalid box ends this image - TODO confirm.
                if valid_gts[b, m] < 1:
                    break
                # Index of the best matching anchor for box m of image b.
                # NOTE(review): the original note says anchors are ordered
                # from large to small - confirm against the caller.
                match = int(matches[b, m])
                # Layer that owns the matched anchor.
                nlayer = np.nonzero(num_anchors > match)[0][0]
                # xs is the list of feature maps; the selected one supplies
                # the grid height and width.
                height = xs[nlayer].shape[2]
                width = xs[nlayer].shape[3]
                # (cx, cy, w, h) of the current box in original-image
                # coordinates.
                gtx, gty, gtw, gth = (np_gtx[b, m, 0], np_gty[b, m, 0],
                                      np_gtw[b, m, 0], np_gth[b, m, 0])
                # compute the location of the gt centers
                # Map cx, cy onto the feature map of the matched layer.
                loc_x = int(gtx / orig_width * width)
                loc_y = int(gty / orig_height * height)
                # write back to targets
                # Flat index of the cell that contains the box centre.
                index = _offsets[nlayer] + loc_y * width + loc_x
                # Targets are laid out as (B, cell, anchor, x), where x
                # depends on the target kind (2 for the centre coordinates).
                # Centre targets: fractional offset inside the cell.
                center_targets[b, index, match,
                               0] = gtx / orig_width * width - loc_x  # tx
                center_targets[
                    b, index, match,
                    1] = gty / orig_height * height - loc_y  # ty
                # Width and height targets.
                scale_targets[b, index, match, 0] = np.log(
                    max(gtw, 1) / np_anchors[match, 0])
                scale_targets[b, index, match, 1] = np.log(
                    max(gth, 1) / np_anchors[match, 1])
                # Weight that shrinks as the box area grows; presumably used
                # to reduce the influence of box size on the loss (YOLOv1
                # predicted sqrt(w) for the same purpose) - confirm.
                weights[
                    b, index,
                    match, :] = 2.0 - gtw * gth / orig_width / orig_height
                # The objectness target is 1 unless a mixup ratio is given.
                # Without mixup the matched anchor gets 1 and every other
                # entry keeps its initial 0.
                objectness[b, index, match,
                           0] = (np_gt_mixratios[b, m, 0]
                                 if np_gt_mixratios is not None else 1)
                class_targets[b, index, match, :] = 0
                class_targets[b, index, match,
                              int(np_gt_ids[b, m, 0])] = 1
        # since some stages won't see partial anchors, so we have to slice the correct targets
        # Final slicing of every target; the original note expects a
        # (B, cell * anchor, x) layout as the result.
        # TODO: the need for _slice is not fully understood yet.
        objectness = self._slice(objectness, num_anchors, num_offsets)
        center_targets = self._slice(center_targets, num_anchors,
                                     num_offsets)
        scale_targets = self._slice(scale_targets, num_anchors,
                                    num_offsets)
        weights = self._slice(weights, num_anchors, num_offsets)
        class_targets = self._slice(class_targets, num_anchors,
                                    num_offsets)
    return objectness, center_targets, scale_targets, weights, class_targets
def forward(self, img, xs, anchors, offsets, gt_boxes, gt_ids, gt_mixratio=None):
    """Generating training targets that do not require network predictions.

    Parameters
    ----------
    img : mxnet.nd.NDArray
        Original image tensor; axes 2 and 3 are read as height and width.
    xs : list of mxnet.nd.NDArray
        List of feature maps, one per output layer.
    anchors : list or tuple of mxnet.nd.NDArray
        YOLO3 anchors, one entry per output layer.
    offsets : list or tuple of mxnet.nd.NDArray
        Pre-generated x and y offsets for YOLO3, one entry per output layer.
    gt_boxes : mxnet.nd.NDArray
        Ground-truth boxes.
    gt_ids : mxnet.nd.NDArray
        Ground-truth IDs.
    gt_mixratio : mxnet.nd.NDArray, optional
        Mixup ratio from 0 to 1.

    Returns
    -------
    (tuple of) mxnet.nd.NDArray
        objectness: 0 for negative, 1 for positive, -1 for ignore.
        center_targets: regression target for center x and y.
        scale_targets: regression target for scale x and y.
        weights: element-wise gradient weights for center_targets and scale_targets.
        class_targets: a one-hot vector for classification.

    """
    assert isinstance(anchors, (list, tuple))
    all_anchors = nd.concat(*[a.reshape(-1, 2) for a in anchors], dim=0)
    assert isinstance(offsets, (list, tuple))
    all_offsets = nd.concat(*[o.reshape(-1, 2) for o in offsets], dim=0)
    # Cumulative counts: entry k is the exclusive end index of layer k.
    num_anchors = np.cumsum([a.size // 2 for a in anchors])
    num_offsets = np.cumsum([o.size // 2 for o in offsets])
    _offsets = [0] + num_offsets.tolist()
    assert isinstance(xs, (list, tuple))
    assert len(xs) == len(anchors) == len(offsets)

    # orig image size
    orig_height = img.shape[2]
    orig_width = img.shape[3]
    with autograd.pause():
        # outputs
        shape_like = all_anchors.reshape((1, -1, 2)) * all_offsets.reshape(
            (-1, 1, 2)).expand_dims(0).repeat(repeats=gt_ids.shape[0], axis=0)
        center_targets = nd.zeros_like(shape_like)
        scale_targets = nd.zeros_like(center_targets)
        weights = nd.zeros_like(center_targets)
        objectness = nd.zeros_like(weights.split(axis=-1, num_outputs=2)[0])
        class_targets = nd.one_hot(objectness.squeeze(axis=-1), depth=self._num_class)
        class_targets[:] = -1  # prefill -1 for ignores

        # for each ground-truth, find the best matching anchor within the particular grid
        # for instance, center of object 1 reside in grid (3, 4) in (16, 16) feature map
        # then only the anchor in (3, 4) is going to be matched
        gtx, gty, gtw, gth = self.bbox2center(gt_boxes)
        shift_gt_boxes = nd.concat(-0.5 * gtw, -0.5 * gth, 0.5 * gtw, 0.5 * gth, dim=-1)
        anchor_boxes = nd.concat(0 * all_anchors, all_anchors, dim=-1)  # zero center anchors
        shift_anchor_boxes = self.bbox2corner(anchor_boxes)
        ious = nd.contrib.box_iou(shift_anchor_boxes, shift_gt_boxes).transpose((1, 0, 2))
        # real value is required to process, convert to Numpy
        matches = ious.argmax(axis=1).asnumpy()  # (B, M)
        valid_gts = (gt_boxes >= 0).asnumpy().prod(axis=-1)  # (B, M)
        np_gtx, np_gty, np_gtw, np_gth = [x.asnumpy() for x in [gtx, gty, gtw, gth]]
        np_anchors = all_anchors.asnumpy()
        np_gt_ids = gt_ids.asnumpy()
        np_gt_mixratios = gt_mixratio.asnumpy() if gt_mixratio is not None else None
        # TODO(zhreshold): the number of valid gt is not a big number, therefore for loop
        # should not be a problem right now. Switch to better solution is needed.
        for b in range(matches.shape[0]):
            for m in range(matches.shape[1]):
                if valid_gts[b, m] < 1:
                    # first invalid box ends the ground truth of this image
                    break
                match = int(matches[b, m])
                # first layer whose cumulative anchor count exceeds the index
                nlayer = np.nonzero(num_anchors > match)[0][0]
                height = xs[nlayer].shape[2]
                width = xs[nlayer].shape[3]
                gtx, gty, gtw, gth = (np_gtx[b, m, 0], np_gty[b, m, 0],
                                      np_gtw[b, m, 0], np_gth[b, m, 0])
                # compute the location of the gt centers
                loc_x = int(gtx / orig_width * width)
                loc_y = int(gty / orig_height * height)
                # write back to targets
                index = _offsets[nlayer] + loc_y * width + loc_x
                center_targets[b, index, match, 0] = gtx / orig_width * width - loc_x  # tx
                center_targets[b, index, match, 1] = gty / orig_height * height - loc_y  # ty
                # Clamp the size to at least 1 before taking the log: a
                # degenerate box (zero width or height) passes the >= 0
                # validity check above and would otherwise give a -inf target.
                scale_targets[b, index, match, 0] = np.log(
                    max(gtw, 1) / np_anchors[match, 0])
                scale_targets[b, index, match, 1] = np.log(
                    max(gth, 1) / np_anchors[match, 1])
                weights[b, index, match, :] = 2.0 - gtw * gth / orig_width / orig_height
                objectness[b, index, match, 0] = (
                    np_gt_mixratios[b, m, 0] if np_gt_mixratios is not None else 1)
                class_targets[b, index, match, :] = 0
                class_targets[b, index, match, int(np_gt_ids[b, m, 0])] = 1
        # since some stages won't see partial anchors, so we have to slice the correct targets
        objectness = self._slice(objectness, num_anchors, num_offsets)
        center_targets = self._slice(center_targets, num_anchors, num_offsets)
        scale_targets = self._slice(scale_targets, num_anchors, num_offsets)
        weights = self._slice(weights, num_anchors, num_offsets)
        class_targets = self._slice(class_targets, num_anchors, num_offsets)
    return objectness, center_targets, scale_targets, weights, class_targets
numerator += nd.sum(predictions == label) denominator += data.shape[0] return (numerator / denominator).asscalar() # Defining some variables for training the model epochs = 5 learning_rate = .01 smoothing_constant = .01 # Traning loop for e in range(epochs): for i, (data, label) in enumerate(train_data): data = data.as_in_context(ctx) label = label.as_in_context(ctx) label_one_hot = nd.one_hot(label, num_outputs) with autograd.record(): output = net(data) loss = softmax_cross_entropy(output, label_one_hot) loss.backward() SGD(params, learning_rate) # Keeping moving average fo the loss curr_loss = nd.mean(loss).asscalar() moving_loss = (curr_loss if ((i == 0) and (e == 0)) else (1 - smoothing_constant) * moving_loss + (smoothing_constant) * curr_loss) test_accuracy = evaluate_accuracy(test_data, net) train_accuracy = evaluate_accuracy(train_data, net) print("Epoch %s. Loss: %s, Train_acc %s, Test_acc %s" % (e, moving_loss, train_accuracy, test_accuracy))
def forward(self, inputs, state):
    """One-hot encode the transposed inputs over the vocabulary size.

    NOTE(review): the visible snippet ends after this step - nothing is
    returned and ``state`` is not used. Confirm against the full source.
    """
    encoded = nd.one_hot(inputs.T, self.vocab_size)
def train(self, input_word_idx, input_len, input_seg, target_word_idx, target_len, target_seg,
          pm_error_idx, pm_add_idx, pm_remove_idx, inputs_text, targets_text, devices,
          batch_size, trainer):
    """Run one optimisation step over all device shards.

    For every shard the input is encoded and decoded, and the decoder
    predictions are compared with the target words shifted by one position
    (the "review" loss). When ``self.config['use_encoder_constraint']`` is
    set, three class-balanced auxiliary losses (error / add / remove) are
    computed from the encoder output and added to the objective.

    Parameters
    ----------
    input_word_idx, input_len, input_seg : sequence
        Per-device encoder inputs, indexed by device position.
    target_word_idx, target_len : sequence
        Per-device decoder targets and their lengths.
    pm_error_idx, pm_add_idx, pm_remove_idx : sequence
        Per-device binary labels for the three auxiliary heads.
    devices : sequence
        Only its length is used: it sets the number of shards.
    trainer
        Object whose ``step`` applies the update.
    target_seg, inputs_text, targets_text, batch_size
        Accepted for interface compatibility; not used by this method.

    Returns
    -------
    tuple
        ``(loss_review, loss_pm)``: per-device mean losses summed over the
        devices. ``loss_pm`` is None when the encoder constraint is off.
    """
    num_device = len(devices)
    use_constraint = self.config['use_encoder_constraint']

    seq_encoding = [None] * num_device
    cls_encoding = [None] * num_device
    decoder_state = [None] * num_device
    target_word_emb = [None] * num_device
    predict_word_emb = [None] * num_device
    predict_word_logit = [None] * num_device
    target_word_logit = [None] * num_device
    loss = [None] * num_device
    loss_review = [None] * num_device
    loss_pm = [None] * num_device

    for i in range(num_device):
        with autograd.record():
            seq_encoding[i], cls_encoding[i] = self.encoder(
                input_word_idx[i], input_seg[i], input_len[i])
            # NOTE(review): the original indentation was lost; this assumes
            # the auxiliary losses are only computed when the constraint flag
            # is set (they only enter the objective in that case) - confirm.
            if use_constraint:
                max_input_len = int(max(input_len[i].asnumpy()))

                _predict_pm_error_logit = nd.softmax(self.fc_pm_error(seq_encoding[i]))
                _predict_pm_add_logit = nd.softmax(self.fc_pm_add(seq_encoding[i]))
                _predict_pm_remove_logit = nd.softmax(self.fc_pm_remove(seq_encoding[i]))

                _target_pm_error_logit = nd.one_hot(
                    pm_error_idx[i], 2).reshape_like(_predict_pm_error_logit)
                _target_pm_add_logit = nd.one_hot(
                    pm_add_idx[i], 2).reshape_like(_predict_pm_add_logit)
                _target_pm_remove_logit = nd.one_hot(
                    pm_remove_idx[i], 2).reshape_like(_predict_pm_remove_logit)

                # The masks are detached so no gradient flows through them.
                pm_error_balance_mask = self.balance_class(
                    _predict_pm_error_logit[:, :max_input_len],
                    pm_error_idx[i][:, :max_input_len]).detach()
                pm_add_balance_mask = self.balance_class(
                    _predict_pm_add_logit[:, :max_input_len],
                    pm_add_idx[i][:, :max_input_len]).detach()
                pm_remove_balance_mask = self.balance_class(
                    _predict_pm_remove_logit[:, :max_input_len],
                    pm_remove_idx[i][:, :max_input_len]).detach()

                loss_pm_error = self.ce(
                    _predict_pm_error_logit[:, :max_input_len],
                    _target_pm_error_logit[:, :max_input_len]) * pm_error_balance_mask
                # Each head is scored on its own prediction and target. The
                # previous code reused the error head's logits for the add
                # and remove losses, so those two heads were never trained.
                loss_pm_add = self.ce(
                    _predict_pm_add_logit[:, :max_input_len],
                    _target_pm_add_logit[:, :max_input_len]) * pm_add_balance_mask
                loss_pm_remove = self.ce(
                    _predict_pm_remove_logit[:, :max_input_len],
                    _target_pm_remove_logit[:, :max_input_len]) * pm_remove_balance_mask
                loss_pm[i] = (loss_pm_error + loss_pm_add + loss_pm_remove) / 3

    for i in range(num_device):
        with autograd.record():
            decoder_state[i] = self.decoder.init_state_from_encoder(
                seq_encoding[i], input_len[i])
            target_word_emb[i] = self.emb_tgt(target_word_idx[i])
            predict_word_emb[i], _, _ = self.decoder.decode_seq(
                target_word_emb[i], decoder_state[i])
            predict_word_logit[i] = nd.softmax(self.fc_proj(predict_word_emb[i]))
            target_word_logit[i] = nd.one_hot(target_word_idx[i], len(self.vocab_tgt))
            max_target_len = int(max(target_len[i].asnumpy()))
            # The prediction at position t is scored against the target
            # word at position t + 1.
            loss_review[i] = self.ce(
                predict_word_logit[i][:, :max_target_len - 1],
                target_word_logit[i][:, 1:max_target_len])
            if use_constraint:
                loss[i] = loss_review[i].mean() + loss_pm[i].mean()
            else:
                loss[i] = loss_review[i].mean()

    for _loss in loss:
        _loss.backward()
    nd.waitall()
    # Each loss is already a mean, so the step is taken with a batch size
    # of 1 instead of ``batch_size``.
    trainer.step(1, ignore_stale_grad=True)

    review_total = sum(_loss.mean().asnumpy() for _loss in loss_review)
    if not use_constraint:
        return review_total, None
    pm_total = sum(_loss.mean().asnumpy() for _loss in loss_pm)
    return review_total, pm_total
def get_inputs(data, vocab_size):
    """Return the one-hot encoding of every row of ``data.T`` as a list."""
    one_hot_steps = []
    for step in data.T:
        one_hot_steps.append(nd.one_hot(step, vocab_size))
    return one_hot_steps
def to_onehot(X, size):
    """One-hot encode each row of the transposed input.

    Per the original note, columns of X are features and rows are samples.
    """
    transposed = X.T
    result = []
    for row in transposed:
        result.append(nd.one_hot(row, size))
    return result
def fuzzy_one_hot(arr, size):
    """Return a noisy one-hot encoding of the flattened ``arr``.

    Positions where the one-hot encoding is set receive a uniform sample
    from [0.7, 1.2); all other positions receive one from [0.0, 0.3).
    """
    flat = arr.reshape((-1, ))
    hot_mask = nd.one_hot(flat, size)
    sample_shape = (flat.shape[0], size)
    # Draw the "hot" samples first, then the "cold" ones.
    hot_values = nd.uniform(low=0.7, high=1.2, shape=sample_shape, ctx=flat.context)
    cold_values = nd.uniform(low=0.0, high=0.3, shape=sample_shape, ctx=flat.context)
    return nd.where(hot_mask, hot_values, cold_values)
def forward(self, inputs, state):
    """One-hot encode the inputs, run the RNN and apply the dense layer.

    Returns the dense output and the updated RNN state.
    """
    encoded = nd.one_hot(inputs.T, self.vocab_size)
    hidden, next_state = self.rnn(encoded, state)
    # Flatten every leading axis so the dense layer gets a 2-D array.
    hidden_2d = hidden.reshape((-1, hidden.shape[-1]))
    output = self.dense(hidden_2d)
    return output, next_state
def test_one_hot():
    """Check nd.one_hot with an index as large as VLARGE_X - 1."""
    # default dtype of ndarray is float32 which cannot index elements over 2^32,
    # so the indices are created as int64
    a = nd.array([1, (VLARGE_X - 1)], dtype=np.int64)
    b = nd.one_hot(a, VLARGE_X)
    # The comparisons used to be bare expressions, so nothing was verified.
    assert b[0][1] == 1
    assert b[1][-1] == 1