def test_gradient_with_attention():
    """Numerically check gradients of a DRAW network that uses selective attention."""
    ctx = mx.cpu()
    input_shape = (2, 2)
    input_dim = 4
    num_steps = 10
    latent_dim = 2
    batch_size = 3
    num_recurrent_units = 3

    # Assemble the attention-based read/write heads and the DRAW core.
    read_nn = SelectiveAttentionRead(2, input_shape, batch_size)
    write_nn = SelectiveAttentionWrite(2, input_shape, batch_size)
    draw_nn = DRAW(read_nn, write_nn, num_steps, batch_size,
                   num_recurrent_units, input_dim, latent_dim)

    model_params = draw_nn.collect_params()
    # Wide uniform init keeps the weights away from near-zero values.
    model_params.initialize(init=mx.init.Uniform(1.0), ctx=ctx)

    # loss function
    loss_fn = DRAWLoss(
        SigmoidBinaryCrossEntropyLoss(from_sigmoid=False, batch_axis=0),
        input_dim, latent_dim)

    def fwd(x):
        y, qs = draw_nn(x)
        return nd.sum(loss_fn(x, qs, y))

    batch_x = mx.nd.random_uniform(shape=(batch_size, input_dim))

    # TODO: investigate why this fails for the first parameters if fwd is not
    # called once before check_gradient is called
    fwd(batch_x)
    for p in model_params.values():
        assert check_gradient(fwd, [batch_x], p)
def __init__(self, mlp_arc_size):
    """Set up the arc scorer.

    :param mlp_arc_size: width of the arc MLP representation; the bilinear
        weight has one extra input column (presumably for a bias input —
        TODO confirm against the forward pass).
    """
    super().__init__()
    # Per-arc sigmoid cross-entropy with batch_axis=-1.
    self.binary_ce_loss = SigmoidBinaryCrossEntropyLoss(batch_axis=-1)
    # Zero-initialized bilinear arc weight of shape (mlp_arc_size, mlp_arc_size + 1).
    self.arc_W = parameter_init(self, 'arc_W', (mlp_arc_size, mlp_arc_size + 1),
                                init=mx.init.Zero())
    self.mlp_arc_size = mlp_arc_size
def test_gradient():
    """Numerical gradient check for a tiny ConvDRAW network on CPU."""
    ctx = mx.cpu()
    num_latent_maps = 1
    input_shape = (1, 2, 2)
    input_dim = 4
    batch_size = 2

    # Encoder / decoder: single 1x1 conv layers keep the parameter count tiny.
    enc_nn = nn.HybridSequential()
    enc_nn.add(nn.Conv2D(channels=2, kernel_size=(1, 1), activation='relu',
                         bias_initializer=mx.init.Uniform(1.0)))
    dec_nn = nn.HybridSequential()
    dec_nn.add(nn.Conv2DTranspose(channels=1, kernel_size=(1, 1),
                                  bias_initializer=mx.init.Uniform(1.0)))
    conv_draw_nn = ConvDRAW(enc_nn, dec_nn, num_steps=2, batch_size=batch_size,
                            input_shape=input_shape,
                            num_latent_maps=num_latent_maps,
                            encoder_output_shape=(2, 2, 2),
                            rnn_hidden_channels=1, kernel_size=(1, 1), ctx=ctx)

    model_params = conv_draw_nn.collect_params()
    mx.random.seed(np.random.randint(1000000))
    # don't initialize to small weights
    model_params.initialize(init=mx.init.Uniform(1.0), ctx=ctx)

    # loss function
    loss_fn = ConvDRAWLoss(
        SigmoidBinaryCrossEntropyLoss(from_sigmoid=False, batch_axis=0),
        input_dim, (1, 2, 2))

    def fwd(x):
        y, q, p = conv_draw_nn(x)
        return nd.sum(loss_fn(x, q, p, y))

    batch_x = mx.nd.random_uniform(shape=(batch_size, *input_shape))

    # the following check fails for the first parameter if fwd is not called
    # at least once before it.
    fwd(batch_x)
    for p in model_params.values():
        assert check_gradient(fwd, [batch_x], p)
def train(model, features, X, X_train, y_train, epochs):
    """Train `model` one sample at a time with sigmoid cross-entropy.

    :param model: gluon network; `model(X)[x]` yields the prediction for node x
    :param features: callable returning the feature embedding for X
    :param X: full input passed to both model and features
    :param X_train: training node indices (flattened before iteration)
    :param y_train: labels, indexed in the same order as X_train
    :param epochs: number of passes over X_train
    :return: list of feature snapshots — one before training plus one per epoch
    """
    cross_entropy = SigmoidBinaryCrossEntropyLoss(from_sigmoid=True)
    trainer = Trainer(model.collect_params(), 'sgd', {
        'learning_rate': 0.001,
        'momentum': 1
    })
    feature_representations = [features(X).asnumpy()]
    # Log roughly 10 times per run; max(1, ...) fixes the ZeroDivisionError
    # the original code hit in `e % (epochs // 10)` whenever epochs < 10.
    log_every = max(1, epochs // 10)
    for e in range(1, epochs + 1):
        cum_loss = 0
        cum_preds = []
        for i, x in enumerate(X_train.flatten()):
            # NOTE(review): `array` is presumably mxnet's nd.array imported at
            # module level — confirm; a sibling variant of this function uses
            # nd.array here.
            y = array(y_train)[i]
            with autograd.record():
                preds = model(X)[x]
                loss = cross_entropy(preds, y)
            loss.backward()
            trainer.step(1)  # per-sample SGD update
            cum_loss += loss.asscalar()
            cum_preds += [preds.asscalar()]
        feature_representations.append(features(X).asnumpy())
        if (e % log_every) == 0:
            logger.debug(f"Epoch {e}/{epochs} -- Loss: {cum_loss: f}")
            logger.debug(cum_preds)
    return feature_representations
def train(model, features, X, X_train, y_train, epochs):
    """Train `model` sample-by-sample, redrawing the embedding after each epoch.

    :param model: gluon network; `model(X)[x]` yields the prediction for node x
    :param features: callable returning the feature embedding for X
    :param X: full input passed to both model and features
    :param X_train: iterable of training node indices
    :param y_train: labels, indexed in the same order as X_train
    :param epochs: number of passes over X_train
    :return: list of feature snapshots — one before training plus one per epoch
    """
    cross_entropy = SigmoidBinaryCrossEntropyLoss(from_sigmoid=True)
    trainer = Trainer(model.collect_params(), 'sgd',
                      {'learning_rate': 0.001, 'momentum': 1})
    feature_representations = [features(X).asnumpy()]
    # Print stats roughly 10 times per run; max(1, ...) fixes the
    # ZeroDivisionError the original code hit in `e % (epochs // 10)`
    # whenever epochs < 10.
    log_every = max(1, epochs // 10)
    plt.figure()
    for e in range(1, epochs + 1):
        cum_loss = 0
        cum_preds = []
        for i, x in enumerate(X_train):
            y = nd.array(y_train)[i]
            with autograd.record():
                preds = model(X)[x]
                loss = cross_entropy(preds, y)
            loss.backward()
            trainer.step(1)  # per-sample SGD update
            cum_loss += loss.asscalar()
            cum_preds += [preds.asscalar()]
        # Live visualization of the current embedding.
        plt.cla()
        plt.title('epochs' + str(e))
        showData(features(X).asnumpy(), zkc.network)
        plt.pause(0.001)
        feature_representations.append(features(X).asnumpy())
        if (e % log_every) == 0:
            print(f"Epoch {e}/{epochs} -- Loss: {cum_loss: .4f}")
            print(cum_preds)
    plt.show()
    return feature_representations
def test_gradient():
    """Numerical gradient check for a small dense VAE on CPU."""
    ctx = mx.cpu()
    latent_dim = 2
    input_dim = 4
    batch_size = 2

    # Encoder outputs latent_dim * 2 units (presumably mean and log-variance).
    enc_nn = nn.HybridSequential()
    enc_nn.add(nn.Dense(units=3, activation='relu'))
    enc_nn.add(nn.Dense(units=latent_dim * 2))
    dec_nn = nn.HybridSequential()
    dec_nn.add(nn.Dense(units=3, activation='relu'))
    dec_nn.add(nn.Dense(units=input_dim))
    vae_nn = VAE(enc_nn, dec_nn, batch_size, latent_dim)

    model_params = vae_nn.collect_params()
    # don't initialize to small weights
    model_params.initialize(init=mx.init.Uniform(1.0), ctx=ctx)

    # loss function
    loss_fn = VAELoss(
        SigmoidBinaryCrossEntropyLoss(from_sigmoid=False, batch_axis=0),
        input_dim, latent_dim)

    def fwd(x):
        y, q = vae_nn(x)
        return nd.sum(loss_fn(x, q, y))

    batch_x = mx.nd.random_uniform(shape=(batch_size, input_dim))

    # the following check fails for the first parameter if fwd is not called
    # at least once before it.
    fwd(batch_x)
    for p in model_params.values():
        assert check_gradient(fwd, [batch_x], p)
def __init__(self, weight=100, batch_axis=0, **kwargs):
    """
    :param weight: for l1 loss
    :param batch_axis: axis that represents the batch dimension
    :param kwargs: forwarded to the parent loss constructor
    """
    super(GeneratorCriterion, self).__init__(weight, batch_axis, **kwargs)
    # Adversarial term; from_sigmoid=True means discriminator outputs are
    # already probabilities, not logits.
    self.bce_loss = SigmoidBinaryCrossEntropyLoss(from_sigmoid=True,
                                                  batch_axis=batch_axis)
    # NOTE(review): batch_axis is hard-coded to 0 here while bce_loss uses the
    # parameter — confirm this asymmetry is intentional.
    self.l1_loss = L1Loss(weight=weight, batch_axis=0)
def __init__(self, vocab, mlp_arc_size, mlp_rel_size):
    """Build the biaffine scoring weights and losses for arcs and relations.

    :param vocab: vocabulary object; vocab.rel_size gives the number of
        relation labels
    :param mlp_arc_size: width of the arc MLP representation
    :param mlp_rel_size: width of the relation MLP representation
    """
    super(BiAffine, self).__init__()
    self._vocab = vocab
    # Arc presence is scored with a per-pair sigmoid cross-entropy.
    self.binary_ce_loss = SigmoidBinaryCrossEntropyLoss(batch_axis=-1)
    # The "+ 1" columns presumably leave room for a bias input appended to
    # the MLP output — TODO confirm against the forward pass.
    self.rel_W = parameter_init(self, 'rel_W',
                                (vocab.rel_size * (mlp_rel_size + 1), mlp_rel_size + 1),
                                init=mx.init.Zero())
    self.arc_W = parameter_init(self, 'arc_W', (mlp_arc_size, mlp_arc_size + 1),
                                init=mx.init.Zero())
    # Relation labels are a multi-class decision; scores lie along axis 0.
    self.softmax_loss = SoftmaxCrossEntropyLoss(axis=0, batch_axis=-1)
    self.mlp_arc_size = mlp_arc_size
    self.mlp_rel_size = mlp_rel_size
def __init__(self, affinity=True, affinity_weight=0.2, ignore_label=-1,
             sub_sample=True, height=None, width=None, affinity_size=36,
             l2loss=True, **kwargs):
    """
    Initialization. Sub-sample is adopted based on memory considerations.

    :param affinity: whether to adopt affinity loss besides the standard
        cross-entropy loss
    :param affinity_weight: affinity loss coefficient
    :param ignore_label: ignored label when compute loss
    :param sub_sample: whether to down-sample label
    :param height: sub-sample height
    :param width: sub-sample width
    :param affinity_size: sub-sample size
    :param l2loss: Set true to use mean square error for affinity loss,
        binary cross-entropy loss otherwise. The label and prediction
        should have the same size.
    """
    super(PixelAffinityLoss, self).__init__(ignore_label=ignore_label, **kwargs)
    self.affinity = affinity
    self.weight = affinity_weight
    self.sub_sample = sub_sample
    # Fall back to the square affinity_size when an explicit edge is missing.
    self.height = height if height else affinity_size
    self.width = width if width else affinity_size
    # Import the chosen affinity criterion lazily, only when needed.
    if l2loss:
        from mxnet.gluon.loss import L2Loss as _AffinityLoss
        self.affinity_loss = _AffinityLoss()
    else:
        from mxnet.gluon.loss import SigmoidBinaryCrossEntropyLoss as _AffinityLoss
        self.affinity_loss = _AffinityLoss(from_sigmoid=True)
def run_training(net, trainer, train_dataloader, val_dataloader, options):
    """Train `net`, validate after every epoch, and early-stop on validation F1.

    :param net: gluon network producing per-class sigmoid probabilities
    :param trainer: gluon Trainer wrapping net's parameters
    :param train_dataloader: yields (rec_id, (data, original_length), label)
    :param val_dataloader: same batch format, used for model selection
    :param options: namespace with epochs, log_interval, early_stop, save_prefix
    :return: filename of the best saved model ('' if no epoch ever improved)
    """
    stop_early = 0
    best_metric = 0
    best_model_name = ''
    loss_fn = SigmoidBinaryCrossEntropyLoss(from_sigmoid=True)
    # BUG FIX: this loop previously read the global `args.epochs` even though
    # every other setting here comes from the `options` parameter; use
    # options.epochs for consistency.
    for epoch in range(options.epochs):
        start_epoch_time = time.time()
        epoch_L = 0.0
        epoch_sent_num = 0
        epoch_wc = 0
        # Log interval training stats
        start_log_interval_time = time.time()
        log_interval_wc = 0
        log_interval_sent_num = 0
        log_interval_L = 0.0
        for i, (rec_id, (data, original_length), label) in enumerate(train_dataloader):
            data = data.as_in_context(context)
            label = label.as_in_context(context).astype(np.float32)
            original_length = original_length.as_in_context(context).astype(
                np.float32)
            wc = original_length.sum().asscalar()
            log_interval_wc += wc
            epoch_wc += wc
            # data.shape[1] is treated as the number of sentences in the batch
            # (time-major layout — TODO confirm against the dataloader).
            log_interval_sent_num += data.shape[1]
            epoch_sent_num += data.shape[1]
            with autograd.record():
                output = net(data)
                loss = loss_fn(output, label).mean()
            loss.backward()
            trainer.step(1)
            log_interval_L += loss.asscalar()
            epoch_L += loss.asscalar()
            if (i + 1) % options.log_interval == 0:
                print(
                    '[Epoch %d Batch %d/%d] avg loss %g, throughput %gK wps' %
                    (epoch, i + 1, len(train_dataloader),
                     log_interval_L / log_interval_sent_num, log_interval_wc /
                     1000 / (time.time() - start_log_interval_time)))
                # Clear log interval training stats
                start_log_interval_time = time.time()
                log_interval_wc = 0
                log_interval_sent_num = 0
                log_interval_L = 0
        end_epoch_time = time.time()
        _, train_acc, train_em, train_f1, _ = run_evaluate(
            net, train_dataloader, options)
        valid_avg_L, valid_acc, valid_em, valid_f1, _ = run_evaluate(
            net, val_dataloader, options)
        print(
            '[Epoch %d] '
            'train acc %.4f, train EM %.4f, train F1 %.4f, train avg loss %g, '
            'valid acc %.4f, valid EM %.4f, valid F1 %.4f, valid avg loss %g, '
            'throughput %gK wps' %
            (epoch, train_acc, train_em, train_f1, epoch_L / epoch_sent_num,
             valid_acc, valid_em, valid_f1, valid_avg_L,
             epoch_wc / 1000 / (end_epoch_time - start_epoch_time)))
        if valid_f1 < best_metric:
            print('No Improvement.')
            stop_early += 1
            if options.early_stop and stop_early == 5:
                print('No improvement for 5 times. Stop training. '
                      'Best valid F1 found: %.4f' % best_metric)
                break
        else:
            # Reset stop_early if the validation loss finds a new low value
            print('Observed Improvement.')
            stop_early = 0
            best_model_name = options.save_prefix + '_{:04d}.params'.format(
                epoch)
            net.save_parameters(best_model_name)
            best_metric = valid_f1
    print('Stop training. Best valid F1: %.4f, best model: %s' %
          (best_metric, best_model_name))
    return best_model_name
def run_evaluate(net, dataloader, options, return_predictions=False):
    """Evaluate network on the specified dataset.

    :param net: gluon network producing per-class sigmoid probabilities
    :param dataloader: yields (rec_id, (data, original_length), label)
    :param options: namespace with threshold, log_interval, sentiments
    :param return_predictions: when True, also collect per-record predictions
    :return: (avg loss, per-class accuracy, exact-match rate, mean micro-F1,
        list of prediction dicts — empty unless return_predictions)
    """
    total_L = 0.0
    total_sample_num = 0
    total_correct_classes = 0
    exact_match = 0
    prediction_results = []
    # One micro-F1 metric per emotion/sentiment class.
    f1s = [mx.metric.F1(average='micro') for i in range(options.sentiments)]
    start_log_interval_time = time.time()
    loss_fn = SigmoidBinaryCrossEntropyLoss(from_sigmoid=True)
    print('Begin Testing...')
    for i, (rec_id, (data, original_length), label) in enumerate(dataloader):
        data = data.as_in_context(context)
        original_length = original_length.as_in_context(context).astype(
            np.float32)
        label = label.as_in_context(context).astype(np.float32)
        output = net(data)
        L = loss_fn(output, label)
        total_L += L.sum().asscalar()
        total_sample_num += label.shape[0]
        total_class_num = label.shape[1]
        # Binarize: a class counts as predicted when its probability exceeds
        # the configured threshold.
        pred = output > options.threshold
        total_correct_classes += (pred == label).sum().asscalar()
        # Exact match requires every class of a sample to be correct.
        exact_match += int(
            ((pred == label).sum(axis=1) == total_class_num).sum().asscalar())
        for j, f1 in enumerate(f1s):
            # F1.update expects two-column (neg, pos) scores per sample;
            # reshape(0, 1) keeps the batch dimension (mxnet legacy reshape).
            emotion_pred = pred[:, j].reshape(0, 1)
            emotion_pred_neg = (1 - pred[:, j]).reshape(0, 1)
            pred_for_emotion = [
                mx.nd.concat(*[emotion_pred_neg, emotion_pred], dim=1)
            ]
            label_for_emotion = [label[:, j]]
            f1.update(label_for_emotion, pred_for_emotion)
        if return_predictions:
            for ri, pr in zip(rec_id, pred):
                item = {
                    'ri': ri.asscalar(),
                    'happiness': pr[0].asscalar(),
                    'sadness': pr[1].asscalar(),
                    'anger': pr[2].asscalar(),
                    'fear': pr[3].asscalar(),
                    'surprise': pr[4].asscalar()
                }
                prediction_results.append(item)
        # BUG FIX: previously read the global `args.log_interval` although the
        # interval is available on the `options` parameter like everything else.
        if (i + 1) % options.log_interval == 0:
            print('[Batch {}/{}] elapsed {:.2f} s'.format(
                i + 1, len(dataloader),
                time.time() - start_log_interval_time))
            start_log_interval_time = time.time()
    avg_L = total_L / float(total_sample_num)
    # we need to divide by number of classes,
    acc = total_correct_classes / float(total_sample_num) / float(
        total_class_num)
    em = exact_match / float(total_sample_num)
    f1_avg = mx.nd.array([f1.get()[1] for f1 in f1s]).mean().asscalar()
    return avg_L, acc, em, f1_avg, prediction_results
def hybrid_forward(self, F, output, *args, **kwargs):
    """Sigmoid binary cross-entropy between `output` and the first extra arg.

    `args` carries (label, <ignored>); the second element is discarded.
    from_sigmoid defaults to False, so `output` is expected to be raw logits.
    """
    label, _ = args
    # NOTE(review): instantiating the loss block on every forward call works
    # but is wasteful; it would normally be created once in __init__ —
    # confirm before moving it (requires the enclosing class).
    loss = SigmoidBinaryCrossEntropyLoss()
    return loss(output, label)
def train_model_for_ml(self):
    """
    Train the model (multi-label).
    """
    base_net = self.get_base_net()  # base network
    train_data, len_td = self.get_train_data(self.batch_size)  # training data, fetched per batch
    val_data, len_vd = self.get_val_data(self.batch_size)  # validation data, fetched per batch
    trainer = Trainer(base_net.collect_params(), 'rmsprop',
                      {'learning_rate': 1e-4})
    loss_func = SigmoidBinaryCrossEntropyLoss()

    lr_steps = [10, 20, 30, np.inf]  # gradually lower the learning rate
    lr_factor = 0.75
    lr_counter = 0

    n_batch = int(len_td / self.batch_size)
    self.print_info('训练 - 样本数:{}, 批次样本: {}, 批次数: {}'.format(
        len_td, self.batch_size, n_batch))
    for epoch in range(self.epochs):
        if epoch == lr_steps[lr_counter]:  # gradually lower the learning rate
            trainer.set_learning_rate(trainer.learning_rate * lr_factor)
            lr_counter += 1
        e_loss, e_r, e_p, e_f1 = 0, 0, 0, 0  # epoch accumulators
        for i, batch in enumerate(train_data):
            data, labels = batch[0], batch[1].astype('float32')
            data = split_and_load(data, ctx_list=self.ctx, batch_axis=0,
                                  even_split=False)
            labels = split_and_load(labels, ctx_list=self.ctx, batch_axis=0,
                                    even_split=False)
            with autograd.record():  # record for gradient computation
                outputs = [base_net(X) for X in data]
                bc_loss = [
                    loss_func(yhat, y) for yhat, y in zip(outputs, labels)
                ]
            for l in bc_loss:
                l.backward()
            trainer.step(self.batch_size)
            batch_loss = sum([l.mean().asscalar() for l in bc_loss]) / len(
                bc_loss)  # loss of this batch
            e_loss += batch_loss
            br, bp, bf1 = self.get_batch_rpf(outputs, labels)
            e_r += br
            e_p += bp
            e_f1 += bf1
            self.print_info(
                'batch: {}, loss: {:.5f}, recall: {:.2f}, precision: {:.2f}, f1: {:.2f}'
                .format(i, batch_loss, br, bp, bf1))
        n_batch = i + 1  # number of batches actually seen
        e_loss /= n_batch
        e_r /= n_batch
        e_p /= n_batch
        e_f1 /= n_batch
        self.print_info(
            'epoch: {}, loss: {:.5f}, recall: {:.2f}, precision: {:.2f}, f1: {:.2f}'
            .format(epoch, e_loss, e_r, e_p, e_f1))
        e_r, e_p, e_f1 = self.val_net(base_net, val_data, len_vd)
        self.save_net_and_params(base_net, epoch, e_f1, name='multilabel')  # save the network
def train_model():
    """Train the base net with a joint BCE + triplet objective."""
    epochs = 5
    configs = get_configs()
    is_gpu = configs['is_gpu']
    batch_size = configs['batch_size']
    ctx = get_context(is_gpu)
    print("gpu: {}, batch_size: {}".format(is_gpu, batch_size))
    base_net = get_base_net(ctx=ctx)
    trainer = Trainer(base_net.collect_params(), 'rmsprop',
                      {'learning_rate': 1e-3})
    bc_loss = SigmoidBinaryCrossEntropyLoss()
    triplet_loss = TripletLoss(margin=0)
    train_data = get_train_data(batch_size=batch_size)  # train data
    triplet_train_data = get_triplet_train_data(batch_size=batch_size)  # triplet training data
    for epoch in range(epochs):
        train_loss = 0  # training loss
        total_right, total_all = 0, 0
        for i, (batch, tp_batch) in enumerate(zip(train_data,
                                                  triplet_train_data)):
            data, labels = batch[0], batch[1].astype('float32')
            tp_data, tp_labels = tp_batch[0], tp_batch[1].astype('float32')
            # print(data.shape, labels.shape)
            # print(tp_data.shape, tp_labels.shape)
            data = data.as_in_context(context=ctx)
            labels = labels.as_in_context(context=ctx)
            tp_data = tp_data.as_in_context(context=ctx)
            tp_labels = tp_labels.as_in_context(context=ctx)
            # Bring the triplet axis to the front so index 0/1/2 selects
            # anchor/positive/negative below.
            tp_data = mx.nd.transpose(tp_data, (1, 0, 2, 3, 4))
            tp_labels = mx.nd.transpose(tp_labels, (1, 0, 2))
            # print(tp_data.shape, tp_labels.shape)
            anc_ins, pos_ins, neg_ins = tp_data[0, :], tp_data[1, :], tp_data[2, :]
            # print(anc_ins.shape, pos_ins.shape, neg_ins.shape)
            with autograd.record():
                outputs = base_net(data)
                v_bc_loss = bc_loss(outputs, labels)  # cross entropy
                inter1 = base_net(anc_ins)
                inter2 = base_net(pos_ins)
                inter3 = base_net(neg_ins)
                v_triplet_loss = triplet_loss(inter1, inter2, inter3)
            # Backprop both losses in one pass.
            autograd.backward([v_bc_loss, v_triplet_loss])
            trainer.step(batch_size)
            print('bc: {}, triplet: {}'.format(np.sum(v_bc_loss.asnumpy()),
                                               np.sum(v_triplet_loss.asnumpy())))
            train_loss += v_bc_loss.mean().asscalar()
            acc, nr, na = get_batch_acc(outputs, labels)
            total_right += nr
            total_all += na
            if i != 0:  # batch 0 doesn't have train_loss.
                print('batch: %s, loss: %s, acc: %s' % (i, train_loss / i, acc))
            else:
                print('batch: %s' % i)
        train_loss /= len(train_data)
        print('epoch: %s, loss: %s, acc: %s' %
              (epoch, train_loss, total_right / total_all))
self.linear1 = nn.Dense(in_units=confidence_C,units=(confidence_C+K_way)//2,\ use_bias=True,activation='relu') self.linear2 = nn.Dense(units=K_way) def forward(self, x): x = self.linear1(x) x = self.linear2(x) return x # x shape is N*K_way,to pred top_k is the output_label.loss is SoftmaxwithCrossentropy if __name__ == '__main__': from mxnet.gluon.loss import SigmoidBinaryCrossEntropyLoss from mxnet import nd, autograd model = Decision_thresh(thresh_size=4) model.initialize(init=mx.init.Xavier()) x = nd.array([[0.1, 0.7, 0.9, 0.4], [0.8, 0.5, 0.8, 0.1]]) label = nd.array([[0, 1, 1, 0], [1, 0, 0, 0]]) loss_criterion = SigmoidBinaryCrossEntropyLoss() with autograd.record(): y_pred = model(x) loss = loss_criterion(y_pred, label) print("loss", nd.sum(loss).asscalar()) loss.backward() print(model.thresh.grad()) # to test the Decision_topk model to predict the top_k is groud truth model2 = Decision_topk(confidence_C=63, K_way=4) mdoel2.initialize(init=mx.init.Xavier()) #x = nd.
# build the network enc_nn = nn.HybridSequential() enc_nn.add(nn.Dense(units=args.num_encoder_units, activation='relu')) enc_nn.add(nn.Dense(units=args.latent_dim * 2)) dec_nn = nn.HybridSequential() dec_nn.add(nn.Dense(units=args.num_decoder_units, activation='relu')) dec_nn.add(nn.Dense(units=input_dim)) vae_nn = VAE(enc_nn, dec_nn, args.batch_size, args.latent_dim) model_params = vae_nn.collect_params() model_params.initialize(ctx=ctx) # loss function loss_fn = VAELoss( SigmoidBinaryCrossEntropyLoss(from_sigmoid=False, batch_axis=0), input_dim, args.latent_dim) # optimizer trainer = Trainer(params=model_params, optimizer='adam', optimizer_params={'learning_rate': args.learning_rate}) # forward function for training def forward_fn(batch): x = batch.data[0].as_in_context(ctx) y, q = vae_nn(x) loss = loss_fn(x, q, y) return loss # train