norm = nd.array(norm, ctx=ctx)
g.ndata['norm'] = norm

#########################################################################
# Define your own model here
#########################################################################
class Model(gluon.Block):
    pass


model = Model()
model.initialize(ctx=ctx)
trainer = gluon.Trainer(model.collect_params(), 'adam', {
    'learning_rate': 0.01,
    'wd': 5e-4
})
loss_fcn = gluon.loss.SoftmaxCELoss()

feat = feat.as_in_context(ctx)
label = label.as_in_context(ctx)
for epoch in range(200):
    with autograd.record():
        logits = model(g, feat)
        loss = loss_fcn(logits[train_mask], label[train_mask]).sum() / n_train_samples
    loss.backward()
    trainer.step(batch_size=1)
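#########################################################################
# Illustration only: a minimal two-layer graph-convolution Block that
# would satisfy the "define your own model" placeholder above. The
# GraphConv import and the hidden size are assumptions, not part of the
# original snippet; any Block whose forward accepts (g, feat) and
# returns per-node logits fits the training loop.
#########################################################################
from dgl.nn.mxnet import GraphConv  # assumed import


class ExampleGCN(gluon.Block):
    def __init__(self, in_feats, n_hidden, n_classes, **kwargs):
        super(ExampleGCN, self).__init__(**kwargs)
        # first layer maps input features to a hidden representation
        self.conv1 = GraphConv(in_feats, n_hidden, activation=nd.relu)
        # second layer produces per-node class logits
        self.conv2 = GraphConv(n_hidden, n_classes)

    def forward(self, g, feat):
        h = self.conv1(g, feat)
        return self.conv2(g, h)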
        super(Policy, self).__init__(**kwargs)
        with self.name_scope():
            self.dense = nn.Dense(16, in_units=4, activation='relu')
            self.action_pred = nn.Dense(2, in_units=16)
            self.value_pred = nn.Dense(1, in_units=16)

    def forward(self, x):
        x = self.dense(x)
        probs = self.action_pred(x)
        values = self.value_pred(x)
        return F.softmax(probs), values


net = Policy()
net.collect_params().initialize(mx.init.Uniform(0.02))
trainer = gluon.Trainer(net.collect_params(), 'adam', {'learning_rate': 3e-2})
loss = gluon.loss.L1Loss()

running_reward = 10
for epoch in count(1):
    state = env.reset()
    rewards = []
    values = []
    heads = []
    actions = []
    with autograd.record():
        # Sample a sequence of actions
        for t in range(10000):
            state = mx.nd.array(np.expand_dims(state, 0))
            prob, value = net(state)
            action, logp = mx.nd.sample_multinomial(prob, get_prob=True)
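            # Hedged continuation sketch (not part of the original excerpt):
            # step the environment with the sampled action and record the
            # reward, value estimate and log-probability so that discounted
            # returns and the actor/critic losses can be formed once the
            # episode ends. The classic 4-tuple gym step API is assumed.
            state, reward, done, _ = env.step(action.asscalar())
            rewards.append(reward)
            values.append(value)
            heads.append(logp)
            actions.append(action.asscalar())
            if done:
                break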
def train(metric): """Training function.""" if not only_inference: logging.info('Now we are doing BERT classification training on %s!', ctx) all_model_params = model.collect_params() optimizer_params = {'learning_rate': lr, 'epsilon': epsilon, 'wd': 0.01} try: trainer = gluon.Trainer(all_model_params, args.optimizer, optimizer_params, update_on_kvstore=False) except ValueError as e: print(e) warnings.warn( 'AdamW optimizer is not found. Please consider upgrading to ' 'mxnet>=1.5.0. Now the original Adam optimizer is used instead.') trainer = gluon.Trainer(all_model_params, 'adam', optimizer_params, update_on_kvstore=False) if args.dtype == 'float16': amp.init_trainer(trainer) step_size = batch_size * accumulate if accumulate else batch_size num_train_steps = int(num_train_examples / step_size * args.epochs) warmup_ratio = args.warmup_ratio num_warmup_steps = int(num_train_steps * warmup_ratio) step_num = 0 # Do not apply weight decay on LayerNorm and bias terms for _, v in model.collect_params('.*beta|.*gamma|.*bias').items(): v.wd_mult = 0.0 # Collect differentiable parameters params = [p for p in all_model_params.values() if p.grad_req != 'null'] # Set grad_req if gradient accumulation is required if accumulate and accumulate > 1: for p in params: p.grad_req = 'add' # track best eval score metric_history = [] tic = time.time() for epoch_id in range(args.epochs): if not only_inference: metric.reset() step_loss = 0 tic = time.time() all_model_params.zero_grad() for batch_id, seqs in enumerate(train_data): # learning rate schedule if step_num < num_warmup_steps: new_lr = lr * step_num / num_warmup_steps else: non_warmup_steps = step_num - num_warmup_steps offset = non_warmup_steps / (num_train_steps - num_warmup_steps) new_lr = lr - offset * lr trainer.set_learning_rate(new_lr) # forward and backward with mx.autograd.record(): input_ids, valid_length, type_ids, label = seqs out = model( input_ids.as_in_context(ctx), type_ids.as_in_context(ctx), valid_length.astype('float32').as_in_context(ctx)) ls = loss_function(out, label.as_in_context(ctx)).mean() if args.dtype == 'float16': with amp.scale_loss(ls, trainer) as scaled_loss: mx.autograd.backward(scaled_loss) else: ls.backward() # update if not accumulate or (batch_id + 1) % accumulate == 0: trainer.allreduce_grads() nlp.utils.clip_grad_global_norm(params, 1) trainer.update(accumulate if accumulate else 1) step_num += 1 if accumulate and accumulate > 1: # set grad to zero for gradient accumulation all_model_params.zero_grad() step_loss += ls.asscalar() metric.update([label], [out]) if (batch_id + 1) % (args.log_interval) == 0: log_train(batch_id, len(train_data), metric, step_loss, args.log_interval, epoch_id, trainer.learning_rate) step_loss = 0 mx.nd.waitall() # inference on dev data for segment, dev_data in dev_data_list: metric_nm, metric_val = evaluate(dev_data, metric, segment) metric_history.append((epoch_id, metric_nm, metric_val)) if not only_inference: # save params ckpt_name = 'model_bert_{0}_{1}.params'.format(task_name, epoch_id) params_saved = os.path.join(output_dir, ckpt_name) nlp.utils.save_parameters(model, params_saved) logging.info('params saved in: %s', params_saved) toc = time.time() logging.info('Time cost=%.2fs', toc - tic) tic = toc if not only_inference: # we choose the best model based on metric[0], # assuming higher score stands for better model quality metric_history.sort(key=lambda x: x[2][0], reverse=True) epoch_id, metric_nm, metric_val = metric_history[0] ckpt_name = 
'model_bert_{0}_{1}.params'.format(task_name, epoch_id) params_saved = os.path.join(output_dir, ckpt_name) nlp.utils.load_parameters(model, params_saved) metric_str = 'Best model at epoch {}. Validation metrics:'.format( epoch_id) metric_str += ','.join([i + ':%.4f' for i in metric_nm]) logging.info(metric_str, *metric_val) # inference on test data for segment, test_data in test_data_list: test(test_data, segment)
def train(net, train_data, val_data, eval_metric, ctx, args): """Training pipeline""" kv = mx.kvstore.create(args.kv_store) net.collect_params().setattr('grad_req', 'null') net.collect_train_params().setattr('grad_req', 'write') if args.horovod: hvd.broadcast_parameters(net.collect_params(), root_rank=0) trainer = hvd.DistributedTrainer( net.collect_train_params( ), # fix batchnorm, fix first stage, etc... 'sgd', { 'learning_rate': args.lr, 'wd': args.wd, 'momentum': args.momentum }) else: trainer = gluon.Trainer( net.collect_train_params( ), # fix batchnorm, fix first stage, etc... 'sgd', { 'learning_rate': args.lr, 'wd': args.wd, 'momentum': args.momentum }, update_on_kvstore=(False if args.amp else None), kvstore=kv) if args.amp: amp.init_trainer(trainer) # lr decay policy lr_decay = float(args.lr_decay) lr_steps = sorted( [float(ls) for ls in args.lr_decay_epoch.split(',') if ls.strip()]) lr_warmup = float(args.lr_warmup) # avoid int division # TODO(zhreshold) losses? rpn_cls_loss = mx.gluon.loss.SigmoidBinaryCrossEntropyLoss( from_sigmoid=False) rpn_box_loss = mx.gluon.loss.HuberLoss(rho=1 / 9.) # == smoothl1 rcnn_cls_loss = mx.gluon.loss.SoftmaxCrossEntropyLoss() rcnn_box_loss = mx.gluon.loss.HuberLoss() # == smoothl1 metrics = [ mx.metric.Loss('RPN_Conf'), mx.metric.Loss('RPN_SmoothL1'), mx.metric.Loss('RCNN_CrossEntropy'), mx.metric.Loss('RCNN_SmoothL1'), ] rpn_acc_metric = RPNAccMetric() rpn_bbox_metric = RPNL1LossMetric() rcnn_acc_metric = RCNNAccMetric() rcnn_bbox_metric = RCNNL1LossMetric() metrics2 = [ rpn_acc_metric, rpn_bbox_metric, rcnn_acc_metric, rcnn_bbox_metric ] # set up logger logging.basicConfig() logger = logging.getLogger() logger.setLevel(logging.INFO) log_file_path = args.save_prefix + '_train.log' log_dir = os.path.dirname(log_file_path) if log_dir and not os.path.exists(log_dir): os.makedirs(log_dir) fh = logging.FileHandler(log_file_path) logger.addHandler(fh) logger.info(args) if args.verbose: logger.info('Trainable parameters:') logger.info(net.collect_train_params().keys()) logger.info('Start training from [Epoch {}]'.format(args.start_epoch)) best_map = [0] for epoch in range(args.start_epoch, args.epochs): mix_ratio = 1.0 if not args.disable_hybridization: net.hybridize(static_alloc=args.static_alloc) rcnn_task = ForwardBackwardTask(net, trainer, rpn_cls_loss, rpn_box_loss, rcnn_cls_loss, rcnn_box_loss, mix_ratio=1.0) executor = Parallel(1 if args.horovod else args.executor_threads, rcnn_task) if args.mixup: # TODO(zhreshold) only support evenly mixup now, target generator needs to be modified otherwise train_data._dataset._data.set_mixup(np.random.uniform, 0.5, 0.5) mix_ratio = 0.5 if epoch >= args.epochs - args.no_mixup_epochs: train_data._dataset._data.set_mixup(None) mix_ratio = 1.0 while lr_steps and epoch >= lr_steps[0]: new_lr = trainer.learning_rate * lr_decay lr_steps.pop(0) trainer.set_learning_rate(new_lr) logger.info("[Epoch {}] Set learning rate to {}".format( epoch, new_lr)) for metric in metrics: metric.reset() tic = time.time() btic = time.time() base_lr = trainer.learning_rate rcnn_task.mix_ratio = mix_ratio for i, batch in enumerate(train_data): if epoch == 0 and i <= lr_warmup: # adjust based on real percentage new_lr = base_lr * get_lr_at_iter(i / lr_warmup) if new_lr != trainer.learning_rate: if i % args.log_interval == 0: logger.info( '[Epoch 0 Iteration {}] Set learning rate to {}'. 
format(i, new_lr)) trainer.set_learning_rate(new_lr) batch = split_and_load(batch, ctx_list=ctx) batch_size = len(batch[0]) metric_losses = [[] for _ in metrics] add_losses = [[] for _ in metrics2] for data in zip(*batch): executor.put(data) for j in range(len(ctx)): result = executor.get() if (not args.horovod) or hvd.rank() == 0: for k in range(len(metric_losses)): metric_losses[k].append(result[k]) for k in range(len(add_losses)): add_losses[k].append(result[len(metric_losses) + k]) for metric, record in zip(metrics, metric_losses): metric.update(0, record) for metric, records in zip(metrics2, add_losses): for pred in records: metric.update(pred[0], pred[1]) trainer.step(batch_size) # update metrics if (not args.horovod or hvd.rank() == 0) and args.log_interval \ and not (i + 1) % args.log_interval: msg = ','.join([ '{}={:.3f}'.format(*metric.get()) for metric in metrics + metrics2 ]) logger.info( '[Epoch {}][Batch {}], Speed: {:.3f} samples/sec, {}'. format( epoch, i, args.log_interval * args.batch_size / (time.time() - btic), msg)) btic = time.time() if (not args.horovod) or hvd.rank() == 0: msg = ','.join( ['{}={:.3f}'.format(*metric.get()) for metric in metrics]) logger.info('[Epoch {}] Training cost: {:.3f}, {}'.format( epoch, (time.time() - tic), msg)) if not (epoch + 1) % args.val_interval: # consider reduce the frequency of validation to save time map_name, mean_ap = validate(net, val_data, ctx, eval_metric, args) val_msg = '\n'.join( ['{}={}'.format(k, v) for k, v in zip(map_name, mean_ap)]) logger.info('[Epoch {}] Validation: \n{}'.format( epoch, val_msg)) current_map = float(mean_ap[-1]) else: current_map = 0. save_params(net, logger, best_map, current_map, epoch, args.save_interval, args.save_prefix) executor.__del__()
def train_ResNeXt(net, lr, input_shape, batch_size, train_path, test_path, epoch, ctx): train_data, val_data = prepare_data(train_path, test_path, input_shape, batch_size) lr_sched = mx.lr_scheduler.FactorScheduler(step=1000, factor=0.94, base_lr=1) optim = mx.optimizer.SGD(learning_rate=lr, momentum=0.9, wd=1e-3, lr_scheduler=lr_sched) trainer = gluon.Trainer(net.collect_params(), optim) loss_fn = gluon.loss.SoftmaxCrossEntropyLoss() train_acc_meter = mx.metric.Accuracy() train_loss_meter = mx.metric.CrossEntropy() hybridized = False with mxboard.SummaryWriter(logdir="./resnext_logs", flush_secs=30) as sw: for ep in range(1, epoch + 1): #train_data.reset() #val_data.reset() print("Current Learning Rate {}".format(trainer.learning_rate)) epoch_start = timeit.default_timer() train_acc_meter.reset() train_loss_meter.reset() for it, (data, label) in enumerate(train_data): data = data.as_in_context(ctx) label = label.as_in_context(ctx) with autograd.record(): output = net(data) loss_val = loss_fn(output, label) loss_val.backward() trainer.step(data.shape[0]) train_acc_meter.update(preds=[output], labels=[label]) train_loss_meter.update(labels=[label], preds=[nd.softmax(output, axis=1)]) if it % 10 == 0: print( "Epoch {}, batch {}, train loss {:.4f}, train acc {:.4f}" .format(ep, it, train_loss_meter.get()[1], train_acc_meter.get()[1])) epoch_stop = timeit.default_timer() val_loss, val_acc = evaluate(val_data, net, ctx) print( "Epoch {}, Training time {}, validation loss {:.5f}, validation acc {:.5f}" .format(ep, epoch_stop - epoch_start, val_loss, val_acc)) sw.add_scalar(tag="train_loss", value=train_loss_meter.get()[1], global_step=ep) sw.add_scalar(tag="train_acc", value=train_acc_meter.get()[1], global_step=ep) sw.add_scalar(tag="val_acc", value=val_acc, global_step=ep) sw.add_scalar(tag="val_loss", value=val_loss, global_step=ep) sw.add_scalar(tag="learning_rate", value=trainer.learning_rate, global_step=ep) if not hybridized: sw.add_graph(net) hybridized = True if ep % 1 == 0: net.export("resnext_models/resnext", ep) return net
                step_epoch=lr_decay_epoch,
                step_factor=lr_decay,
                power=2)
])
optimizer_params['lr_scheduler'] = lr_scheduler

if opt.partial_bn:
    train_patterns = None
    if 'inceptionv3' in opt.model:
        train_patterns = '.*weight|.*bias|inception30_batchnorm0_gamma|inception30_batchnorm0_beta|inception30_batchnorm0_running_mean|inception30_batchnorm0_running_var'
    else:
        logger.info(
            'Current model does not support partial batch normalization.')
    trainer = gluon.Trainer(net.collect_params(train_patterns),
                            optimizer,
                            optimizer_params,
                            update_on_kvstore=False)
elif opt.partial_bn == False and opt.use_train_patterns == True:
    logger.info('========\n %s' % net.collect_params())
    trainer = gluon.Trainer(net.collect_params(opt.train_patterns),
                            optimizer,
                            optimizer_params,
                            update_on_kvstore=False)
    logger.info('trainer.patterns: %s.' % opt.train_patterns)
    logger.info('========\n %s' % net.collect_params(opt.train_patterns))
elif opt.use_lateral and not opt.freeze_lateral:
    print("============== use_lateral")
    lst = list(net.collect_params().values()) + list(
        net1.collect_params().values())
    trainer = gluon.Trainer(lst, optimizer,
        if not autograd.is_training():
            x = nd.sigmoid(x)
        return x


net = LogisticRegression(in_features, out_features)
net.collect_params().initialize()

# %%
# Loss function: Binary Cross Entropy
loss_fn = gluon.loss.SigmoidBinaryCrossEntropyLoss()

# %%
# Optimizer: Stochastic Gradient Descent
optimizer = mx.optimizer.SGD(learning_rate=LR, wd=0.0, momentum=0.0)
trainer = gluon.Trainer(net.collect_params(), optimizer)

# %%
# Training loop
for epoch in range(EPOCHS):
    with autograd.record(train_mode=True):
        # Compute f(x) = Wx
        y_pred = net(X_train)
        # Compute loss
        loss = loss_fn(y_pred, y_train)
    # Compute dL/dW
    loss.backward()
    # Show intermediate values to screen
    if epoch % 10 == 0:
        log_info(net, loss)
    # Update weights, normalization of grads happens here, not in the loss function
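    # Hedged completion of the update step above (assuming full-batch training
    # on X_train): Trainer.step rescales the gradients by 1/batch_size, so the
    # batch size is passed as the normalization factor.
    trainer.step(batch_size=X_train.shape[0])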
print(X, y)
"""
"""
3.3.3 Define the model
"""
from mxnet.gluon import nn

net = nn.Sequential()
net.add(nn.Dense(1))
"""
3.3.4 Initialize the model parameters
"""
from mxnet import init

net.initialize(init.Normal(sigma=0.01))
"""
3.3.5 Define the loss function
"""
from mxnet.gluon import loss as gloss

loss = gloss.L2Loss()
"""
3.3.6 Define the optimization algorithm
"""
from mxnet import gluon

trainer = gluon.Trainer(net.collect_params(), "sgd", {"learning_rate": 0.03})
"""
3.3.7 Train the model
"""
num_epochs = 10
for epoch in range(1, num_epochs + 1):
    for X, y in data_iter:
        with autograd.record():
            l = loss(net(X), y)
        l.backward()
        trainer.step(batch_size)
    l = loss(net(features), labels)
    print("epoch %d, loss:%f" % (epoch, l.mean().asnumpy()))
def main(args): # load and preprocess dataset data = load_data(args) features = mx.nd.array(data.features) labels = mx.nd.array(data.labels) train_mask = mx.nd.array(data.train_mask) val_mask = mx.nd.array(data.val_mask) test_mask = mx.nd.array(data.test_mask) in_feats = features.shape[1] n_classes = data.num_labels n_edges = data.graph.number_of_edges() print("""----Data statistics------' #Edges %d #Classes %d #Train samples %d #Val samples %d #Test samples %d""" % (n_edges, n_classes, train_mask.sum().asscalar(), val_mask.sum().asscalar(), test_mask.sum().asscalar())) if args.gpu < 0: cuda = False ctx = mx.cpu(0) else: cuda = True ctx = mx.gpu(args.gpu) features = features.as_in_context(ctx) labels = labels.as_in_context(ctx) train_mask = train_mask.as_in_context(ctx) val_mask = val_mask.as_in_context(ctx) test_mask = test_mask.as_in_context(ctx) # create GCN model g = DGLGraph(data.graph) if args.self_loop: g.add_edges(g.nodes(), g.nodes()) # normalization degs = g.in_degrees().astype('float32') norm = mx.nd.power(degs, -0.5) if cuda: norm = norm.as_in_context(ctx) g.ndata['norm'] = mx.nd.expand_dims(norm, 1) model = GCN(g, in_feats, args.n_hidden, n_classes, args.n_layers, mx.nd.relu, args.dropout) model.initialize(ctx=ctx) n_train_samples = train_mask.sum().asscalar() loss_fcn = gluon.loss.SoftmaxCELoss() # use optimizer print(model.collect_params()) trainer = gluon.Trainer(model.collect_params(), 'adam', {'learning_rate': args.lr, 'wd': args.weight_decay}) # initialize graph dur = [] for epoch in range(args.n_epochs): if epoch >= 3: t0 = time.time() # forward with mx.autograd.record(): pred = model(features) loss = loss_fcn(pred, labels, mx.nd.expand_dims(train_mask, 1)) loss = loss.sum() / n_train_samples loss.backward() trainer.step(batch_size=1) if epoch >= 3: loss.asscalar() dur.append(time.time() - t0) acc = evaluate(model, features, labels, val_mask) print("Epoch {:05d} | Time(s) {:.4f} | Loss {:.4f} | Accuracy {:.4f} | " "ETputs(KTEPS) {:.2f}". format( epoch, np.mean(dur), loss.asscalar(), acc, n_edges / np.mean(dur) / 1000)) # test set accuracy acc = evaluate(model, features, labels, test_mask) print("Test accuracy {:.2%}".format(acc))
def batchnormalization(): """ 批量归一化利用小批量上的均值和标准差,不断调整神经网络的中间输出,从而使整个神经网络在各层的中间输出的数值更稳定 :return: """ def batch_norm(X, gamma, beta, moving_mean, moving_var, eps, momentum): # 通过autograd来判断当前模式是训练模式还是预测模式 if not autograd.is_training(): # 如果是在预测模式下,直接使用传入的移动平均所得的均值和方差 X_hat = (X - moving_mean) / nd.sqrt(moving_var + eps) else: assert len(X.shape) in (2, 4) if len(X.shape) == 2: # 使用全连接层的情况,计算特征维上的均值和方差 mean = X.mean(axis=0) var = ((X - mean)**2).mean(axis=0) else: # 使用二维卷积层的情况,计算通道维上(axis=1)的均值和方差。这里我们需要保持 # X的形状以便后面可以做广播运算 mean = X.mean(axis=(0, 2, 3), keepdims=True) var = ((X - mean)**2).mean(axis=(0, 2, 3), keepdims=True) # 训练模式下用当前的均值和方差做标准化 X_hat = (X - mean) / nd.sqrt(var + eps) # 更新移动平均的均值和方差 moving_mean = momentum * moving_mean + (1.0 - momentum) * mean moving_var = momentum * moving_var + (1.0 - momentum) * var Y = gamma * X_hat + beta # 拉伸和偏移 return Y, moving_mean, moving_var class BatchNorm(nn.Block): def __init__(self, num_features, num_dims, **kwargs): super(BatchNorm, self).__init__(**kwargs) if num_dims == 2: shape = (1, num_features) else: shape = (1, num_features, 1, 1) # 参与求梯度和迭代的拉伸和偏移参数,分别初始化成1和0 self.gamma = self.params.get('gamma', shape=shape, init=init.One()) self.beta = self.params.get('beta', shape=shape, init=init.Zero()) # 不参与求梯度和迭代的变量,全在内存上初始化成0 self.moving_mean = nd.zeros(shape) self.moving_var = nd.zeros(shape) def forward(self, X): # 如果X不在内存上,将moving_mean和moving_var复制到X所在显存上 if self.moving_mean.context != X.context: self.moving_mean = self.moving_mean.copyto(X.context) self.moving_var = self.moving_var.copyto(X.context) # 保存更新过的moving_mean和moving_var Y, self.moving_mean, self.moving_var = batch_norm( X, self.gamma.data(), self.beta.data(), self.moving_mean, self.moving_var, eps=1e-5, momentum=0.9) return Y net = nn.Sequential() net.add(nn.Conv2D(6, kernel_size=5), BatchNorm(6, num_dims=4), nn.Activation('sigmoid'), nn.MaxPool2D(pool_size=2, strides=2), nn.Conv2D(16, kernel_size=5), BatchNorm(16, num_dims=4), nn.Activation('sigmoid'), nn.MaxPool2D(pool_size=2, strides=2), nn.Dense(120), BatchNorm(120, num_dims=2), nn.Activation('sigmoid'), nn.Dense(84), BatchNorm(84, num_dims=2), nn.Activation('sigmoid'), nn.Dense(10)) lr, num_epochs, batch_size, ctx = 1.0, 5, 256, d2l.try_gpu() net.initialize(ctx=ctx, init=init.Xavier()) trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': lr}) train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size) d2l.train_ch5(net, train_iter, test_iter, batch_size, trainer, ctx, num_epochs) net[1].gamma.data().reshape((-1, )), net[1].beta.data().reshape((-1, ))
def resnet(): """ 残差块通过跨层的数据通道从而能够训练出有效的深度神经网络。 :return: """ class Residual(nn.Block): # 本类已保存在d2lzh包中方便以后使用 def __init__(self, num_channels, use_1x1conv=False, strides=1, **kwargs): super(Residual, self).__init__(**kwargs) self.conv1 = nn.Conv2D(num_channels, kernel_size=3, padding=1, strides=strides) self.conv2 = nn.Conv2D(num_channels, kernel_size=3, padding=1) if use_1x1conv: self.conv3 = nn.Conv2D(num_channels, kernel_size=1, strides=strides) else: self.conv3 = None self.bn1 = nn.BatchNorm() self.bn2 = nn.BatchNorm() def forward(self, X): Y = nd.relu(self.bn1(self.conv1(X))) Y = self.bn2(self.conv2(Y)) if self.conv3: X = self.conv3(X) return nd.relu(Y + X) def resnet_block(num_channels, num_residuals, first_block=False): blk = nn.Sequential() for i in range(num_residuals): if i == 0 and not first_block: blk.add(Residual(num_channels, use_1x1conv=True, strides=2)) else: blk.add(Residual(num_channels)) return blk net = nn.Sequential() net.add(nn.Conv2D(64, kernel_size=7, strides=2, padding=3), nn.BatchNorm(), nn.Activation('relu'), nn.MaxPool2D(pool_size=3, strides=2, padding=1)) net.add(resnet_block(64, 2, first_block=True), resnet_block(128, 2), resnet_block(256, 2), resnet_block(512, 2)) net.add(nn.GlobalAvgPool2D(), nn.Dense(10)) X = nd.random.uniform(shape=(1, 1, 224, 224)) net.initialize() for layer in net: X = layer(X) print(layer.name, 'output shape:\t', X.shape) lr, num_epochs, batch_size, ctx = 0.05, 5, 256, d2l.try_gpu() net.initialize(force_reinit=True, ctx=ctx, init=init.Xavier()) trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': lr}) train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size, resize=96) d2l.train_ch5(net, train_iter, test_iter, batch_size, trainer, ctx, num_epochs)
def googlenet(): """ GoogLeNet吸收了NiN中网络串联网络的思想,并在此基础上做了很大改进。在随后的几年里,研究人员对GoogLeNet进行了数次改进,本节将介绍这个模型系列的第一个版本。 :return: """ class Inception(nn.Block): # c1 - c4为每条线路里的层的输出通道数 def __init__(self, c1, c2, c3, c4, **kwargs): super(Inception, self).__init__(**kwargs) # 线路1,单1 x 1卷积层 self.p1_1 = nn.Conv2D(c1, kernel_size=1, activation='relu') # 线路2,1 x 1卷积层后接3 x 3卷积层 self.p2_1 = nn.Conv2D(c2[0], kernel_size=1, activation='relu') self.p2_2 = nn.Conv2D(c2[1], kernel_size=3, padding=1, activation='relu') # 线路3,1 x 1卷积层后接5 x 5卷积层 self.p3_1 = nn.Conv2D(c3[0], kernel_size=1, activation='relu') self.p3_2 = nn.Conv2D(c3[1], kernel_size=5, padding=2, activation='relu') # 线路4,3 x 3最大池化层后接1 x 1卷积层 self.p4_1 = nn.MaxPool2D(pool_size=3, strides=1, padding=1) self.p4_2 = nn.Conv2D(c4, kernel_size=1, activation='relu') def forward(self, x): p1 = self.p1_1(x) p2 = self.p2_2(self.p2_1(x)) p3 = self.p3_2(self.p3_1(x)) p4 = self.p4_2(self.p4_1(x)) return nd.concat(p1, p2, p3, p4, dim=1) # 在通道维上连结输出 b1 = nn.Sequential() b1.add( nn.Conv2D(64, kernel_size=7, strides=2, padding=3, activation='relu'), nn.MaxPool2D(pool_size=3, strides=2, padding=1)) b2 = nn.Sequential() b2.add(nn.Conv2D(64, kernel_size=1, activation='relu'), nn.Conv2D(192, kernel_size=3, padding=1, activation='relu'), nn.MaxPool2D(pool_size=3, strides=2, padding=1)) b3 = nn.Sequential() b3.add(Inception(64, (96, 128), (16, 32), 32), Inception(128, (128, 192), (32, 96), 64), nn.MaxPool2D(pool_size=3, strides=2, padding=1)) b4 = nn.Sequential() b4.add(Inception(192, (96, 208), (16, 48), 64), Inception(160, (112, 224), (24, 64), 64), Inception(128, (128, 256), (24, 64), 64), Inception(112, (144, 288), (32, 64), 64), Inception(256, (160, 320), (32, 128), 128), nn.MaxPool2D(pool_size=3, strides=2, padding=1)) b5 = nn.Sequential() b5.add(Inception(256, (160, 320), (32, 128), 128), Inception(384, (192, 384), (48, 128), 128), nn.GlobalAvgPool2D()) net = nn.Sequential() net.add(b1, b2, b3, b4, b5, nn.Dense(10)) X = nd.random.uniform(shape=(1, 1, 96, 96)) net.initialize() for layer in net: X = layer(X) print(layer.name, 'output shape:\t', X.shape) lr, num_epochs, batch_size, ctx = 0.1, 5, 128, d2l.try_gpu() net.initialize(force_reinit=True, ctx=ctx, init=init.Xavier()) trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': lr}) train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size, resize=96) d2l.train_ch5(net, train_iter, test_iter, batch_size, trainer, ctx, num_epochs)
def alexnet(): """ alexnet深度学习 :return: """ def load_data_fashion_mnist(batch_size, resize=None, root=os.path.join('~', '.mxnet', 'datasets', 'fashion-mnist')): root = os.path.expanduser(root) # 展开用户路径'~' transformer = [] if resize: transformer += [gdata.vision.transforms.Resize(resize)] transformer += [gdata.vision.transforms.ToTensor()] transformer = gdata.vision.transforms.Compose(transformer) mnist_train = gdata.vision.FashionMNIST(root=root, train=True) mnist_test = gdata.vision.FashionMNIST(root=root, train=False) num_workers = 0 if sys.platform.startswith('win32') else 4 train_iter = gdata.DataLoader(mnist_train.transform_first(transformer), batch_size, shuffle=True, num_workers=num_workers) test_iter = gdata.DataLoader(mnist_test.transform_first(transformer), batch_size, shuffle=False, num_workers=num_workers) return train_iter, test_iter net = nn.Sequential() # 使用较大的11 x 11窗口来捕获物体。同时使用步幅4来较大幅度减小输出高和宽。这里使用的输出通 # 道数比LeNet中的也要大很多 net.add( nn.Conv2D(96, kernel_size=11, strides=4, activation='relu'), nn.MaxPool2D(pool_size=3, strides=2), # 减小卷积窗口,使用填充为2来使得输入与输出的高和宽一致,且增大输出通道数 nn.Conv2D(256, kernel_size=5, padding=2, activation='relu'), nn.MaxPool2D(pool_size=3, strides=2), # 连续3个卷积层,且使用更小的卷积窗口。除了最后的卷积层外,进一步增大了输出通道数。 # 前两个卷积层后不使用池化层来减小输入的高和宽 nn.Conv2D(384, kernel_size=3, padding=1, activation='relu'), nn.Conv2D(384, kernel_size=3, padding=1, activation='relu'), nn.Conv2D(256, kernel_size=3, padding=1, activation='relu'), nn.MaxPool2D(pool_size=3, strides=2), # 这里全连接层的输出个数比LeNet中的大数倍。使用丢弃层来缓解过拟合 nn.Dense(4096, activation="relu"), nn.Dropout(0.5), nn.Dense(4096, activation="relu"), nn.Dropout(0.5), # 输出层。由于这里使用Fashion-MNIST,所以用类别数为10,而非论文中的1000 nn.Dense(10)) X = nd.random.uniform(shape=(1, 1, 224, 224)) net.initialize() for layer in net: X = layer(X) print(layer.name, 'output shape:\t', X.shape) batch_size = 128 # 如出现“out of memory”的报错信息,可减小batch_size或resize train_iter, test_iter = load_data_fashion_mnist(batch_size, resize=224) lr, num_epochs, ctx = 0.01, 5, d2l.try_gpu() net.initialize(force_reinit=True, ctx=ctx, init=init.Xavier()) trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': lr}) d2l.train_ch5(net, train_iter, test_iter, batch_size, trainer, ctx, num_epochs)
import numpy as np  # assumed import; the excerpt uses np without importing it
import mxnet as mx  # assumed import; the excerpt uses mx without importing it
from mxnet import autograd
from mxnet import gluon

# Generate synthetic data.
X = np.random.randn(10000, 2)
Y = 2 * X[:, 0] - 3.4 * X[:, 1] + 4.2 + .01 * np.random.normal(size=10000)

net = gluon.nn.Sequential()
# The output dimension is 1.
net.add(gluon.nn.Dense(1))
net.collect_params().initialize()
loss = gluon.loss.L2Loss()

# Initialize the learning rate as 0.1.
trainer = gluon.Trainer(net.collect_params(), 'sgd',
                        optimizer_params={'learning_rate': 0.1})
net.collect_params().initialize(mx.init.Xavier(magnitude=2.24),
                                force_reinit=True)

train_data = mx.io.NDArrayIter(X, Y, batch_size=10, shuffle=True)
for epoch in range(5):
    train_data.reset()
    for i, batch in enumerate(train_data):
        data = batch.data[0]
        label = batch.label[0].reshape((-1, 1))
        with autograd.record():
            output = net(data)
            mse = loss(output, label)
        mse.backward()
        trainer.step(data.shape[0])
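# Hedged follow-up (not part of the original snippet): after training, the
# fitted coefficients can be read back from the Dense layer and compared with
# the generating values 2, -3.4 and 4.2 used above.
dense = net[0]
print(dense.weight.data())  # expected to be close to [[2, -3.4]]
print(dense.bias.data())    # expected to be close to [4.2]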
def train(net, train_iter, valid_iter, num_epochs, lr, wd, ctx, lr_period,
          lr_decay):
    trainer = gluon.Trainer(net.collect_params(), 'sgd',
                            {'learning_rate': lr, 'momentum': 0.9, 'wd': wd})
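    # Hedged sketch of how such a train() body typically continues (the
    # original excerpt stops right after building the Trainer). The loss
    # choice, the step decay every lr_period epochs, and the loop structure
    # below are assumptions, not the original implementation.
    loss = gluon.loss.SoftmaxCrossEntropyLoss()
    for epoch in range(num_epochs):
        # decay the learning rate by lr_decay every lr_period epochs
        if epoch > 0 and epoch % lr_period == 0:
            trainer.set_learning_rate(trainer.learning_rate * lr_decay)
        for X, y in train_iter:
            X, y = X.as_in_context(ctx), y.as_in_context(ctx)
            with autograd.record():
                l = loss(net(X), y).sum()
            l.backward()
            trainer.step(X.shape[0])
        # a validation pass over valid_iter would typically be logged here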
def fit( self, audio_path_label_pairs, model_dir_path, batch_size=64, epochs=20, test_size=0.2, random_state=42, input_shape=(1, 96, 1366), nb_classes=10, learning_rate=0.001, checkpoint_interval=10, ): config_file_path = Cifar10AudioClassifier.get_config_file_path(model_dir_path) self.input_shape = input_shape self.nb_classes = nb_classes self.config = dict() self.config["input_shape"] = input_shape self.config["nb_classes"] = nb_classes np.save(config_file_path, self.config) self.model = self.create_model(self.nb_classes) X, Y = self.unzip(audio_path_label_pairs) Xtrain, Xtest, Ytrain, Ytest = train_test_split( X, Y, test_size=test_size, random_state=random_state ) train_gen = self.generate_batch(Xtrain, Ytrain, batch_size, shuffled=True) train_num_batches = len(Xtrain) // batch_size self.model.collect_params().initialize( mx.init.Xavier(magnitude=2.24), ctx=self.model_ctx ) self.model.hybridize() trainer = gluon.Trainer( self.model.collect_params(), optimizer="adam", optimizer_params={"learning_rate": learning_rate}, ) softmax_loss = gluon.loss.SoftmaxCrossEntropyLoss() history = dict() loss_train = [] loss_test = [] acc_train = [] acc_test = [] for e in range(epochs): loss_avg = 0.0 accuracy = mx.metric.Accuracy() for batch_index, (data, label) in enumerate(train_gen): data = data.as_in_context(self.model_ctx) label = label.as_in_context(self.model_ctx) with autograd.record(): output = self.model(data) prediction = nd.argmax(output, axis=1) accuracy.update(preds=prediction, labels=label) loss = softmax_loss(output, label) loss.backward() trainer.step(data.shape[0]) loss_avg = loss_avg * batch_index / (batch_index + 1) + nd.mean( loss ).asscalar() / (batch_index + 1) print( "Epoch %s / %s, Batch %s / %s. Loss: %s, Accuracy: %s" % ( e + 1, epochs, batch_index + 1, train_num_batches, loss_avg, accuracy.get()[1], ) ) if batch_index + 1 == train_num_batches: break train_acc = accuracy.get()[1] acc_train.append(train_acc) loss_train.append(loss_avg) test_acc, test_avg_loss = self._evaluate_accuracy( Xtest, Ytest, batch_size=batch_size ) acc_test.append(test_acc) loss_test.append(test_avg_loss) print( "Epoch %s / %s. Loss: %s. Accuracy: %s. Test Accuracy: %s." % (e + 1, epochs, loss_avg, train_acc, test_acc) ) if e % checkpoint_interval == 0: self.checkpoint(model_dir_path) self.checkpoint(model_dir_path) history["loss_train"] = loss_train history["loss_test"] = loss_test history["acc_train"] = acc_train history["acc_test"] = acc_test np.save( model_dir_path + "/" + Cifar10AudioClassifier.model_name + "-history.npy", history, ) return history
def graphsage_cv_train(g, ctx, args, n_classes, train_nid, test_nid, n_test_samples, distributed): n0_feats = g.nodes[0].data['features'] num_nodes = g.number_of_nodes() in_feats = n0_feats.shape[1] g_ctx = n0_feats.context norm = mx.nd.expand_dims(1. / g.in_degrees().astype('float32'), 1) g.set_n_repr({'norm': norm.as_in_context(g_ctx)}) degs = g.in_degrees().astype('float32').asnumpy() degs[degs > args.num_neighbors] = args.num_neighbors g.set_n_repr( {'subg_norm': mx.nd.expand_dims(mx.nd.array(1. / degs, ctx=g_ctx), 1)}) n_layers = args.n_layers g.update_all( fn.copy_src(src='features', out='m'), fn.sum(msg='m', out='preprocess'), lambda node: {'preprocess': node.data['preprocess'] * node.data['norm']}) for i in range(n_layers): g.init_ndata('h_{}'.format(i), (num_nodes, args.n_hidden), 'float32') g.init_ndata('agg_h_{}'.format(i), (num_nodes, args.n_hidden), 'float32') model = GraphSAGETrain(in_feats, args.n_hidden, n_classes, n_layers, args.dropout, prefix='GraphSAGE') model.initialize(ctx=ctx) loss_fcn = gluon.loss.SoftmaxCELoss() infer_model = GraphSAGEInfer(in_feats, args.n_hidden, n_classes, n_layers, prefix='GraphSAGE') infer_model.initialize(ctx=ctx) # use optimizer print(model.collect_params()) kv_type = 'dist_sync' if distributed else 'local' trainer = gluon.Trainer(model.collect_params(), 'adam', { 'learning_rate': args.lr, 'wd': args.weight_decay }, kvstore=mx.kv.create(kv_type)) # initialize graph dur = [] adj = g.adjacency_matrix(transpose=False).as_in_context(g_ctx) for epoch in range(args.n_epochs): start = time.time() if distributed: msg_head = "Worker {:d}, epoch {:d}".format(g.worker_id, epoch) else: msg_head = "epoch {:d}".format(epoch) for nf in dgl.contrib.sampling.NeighborSampler(g, args.batch_size, args.num_neighbors, neighbor_type='in', shuffle=True, num_workers=32, num_hops=n_layers, add_self_loop=True, seed_nodes=train_nid): for i in range(n_layers): agg_history_str = 'agg_h_{}'.format(i) dests = nf.layer_parent_nid(i + 1).as_in_context(g_ctx) # TODO we could use DGLGraph.pull to implement this, but the current # implementation of pull is very slow. Let's manually do it for now. agg = mx.nd.dot(mx.nd.take(adj, dests), g.nodes[:].data['h_{}'.format(i)]) g.set_n_repr({agg_history_str: agg}, dests) node_embed_names = [['preprocess', 'features', 'h_0']] for i in range(1, n_layers): node_embed_names.append([ 'h_{}'.format(i), 'agg_h_{}'.format(i - 1), 'subg_norm', 'norm' ]) node_embed_names.append( ['agg_h_{}'.format(n_layers - 1), 'subg_norm', 'norm']) nf.copy_from_parent(node_embed_names=node_embed_names, ctx=ctx) # forward with mx.autograd.record(): pred = model(nf) batch_nids = nf.layer_parent_nid(-1) batch_labels = g.nodes[batch_nids].data[ 'labels'].as_in_context(ctx) loss = loss_fcn(pred, batch_labels) if distributed: loss = loss.sum() / (len(batch_nids) * g.num_workers) else: loss = loss.sum() / (len(batch_nids)) loss.backward() trainer.step(batch_size=1) node_embed_names = [['h_{}'.format(i)] for i in range(n_layers)] node_embed_names.append([]) nf.copy_to_parent(node_embed_names=node_embed_names) mx.nd.waitall() print(msg_head + ': training takes ' + str(time.time() - start)) infer_params = infer_model.collect_params() for key in infer_params: idx = trainer._param2idx[key] trainer._kvstore.pull(idx, out=infer_params[key].data()) num_acc = 0. 
num_tests = 0 if not distributed or g.worker_id == 0: for nf in dgl.contrib.sampling.NeighborSampler( g, args.test_batch_size, g.number_of_nodes(), neighbor_type='in', num_hops=n_layers, seed_nodes=test_nid, add_self_loop=True): node_embed_names = [['preprocess', 'features']] for i in range(n_layers): node_embed_names.append(['norm', 'subg_norm']) nf.copy_from_parent(node_embed_names=node_embed_names, ctx=ctx) pred = infer_model(nf) batch_nids = nf.layer_parent_nid(-1) batch_labels = g.nodes[batch_nids].data[ 'labels'].as_in_context(ctx) num_acc += (pred.argmax( axis=1) == batch_labels).sum().asscalar() num_tests += nf.layer_size(-1) if distributed: g._sync_barrier() print(msg_head + ": Test Accuracy {:.4f}".format(num_acc / num_tests)) break elif distributed: g._sync_barrier()
edge_size = 256
sizes = [[0.2, 0.272], [0.37, 0.447], [0.54, 0.619], [0.71, 0.79],
         [0.88, 0.961]]
ratios = [[1, 2, 0.5]] * 5
num_anchors = len(sizes[0]) + len(ratios[0]) - 1
ctx = mx.gpu(0)

train_iter, val_iter = load_data_pikachu(batch_size, edge_size)
train_iter.reshape(label_shape=(3, 5))

net = TinySSD(num_classes=1)
net.initialize(init=init.Xavier(), ctx=ctx)

class_loss = gloss.SoftmaxCrossEntropyLoss()
bbox_loss = gloss.L1Loss()
trainer = gluon.Trainer(net.collect_params(), "sgd", {
    "learning_rate": lr,
    "wd": wd
})

for epoch in range(20):
    acc, mae = 0, 0
    train_iter.reset()
    start = time.time()
    for i, batch in enumerate(train_iter):
        X = batch.data[0].as_in_context(ctx)
        Y = batch.label[0].as_in_context(ctx)
        with autograd.record():
            anchors, class_preds, bbox_preds = net(X)
            bbox_labels, bbox_masks, class_labels = contrib.nd.MultiBoxTarget(
                anchors, Y, class_preds.transpose((0, 2, 1)))
            l = calc_loss(class_preds, class_labels, bbox_preds, bbox_labels,
                          bbox_masks)
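        # Hedged continuation sketch (the excerpt stops after calc_loss):
        # backpropagate the combined loss, step the optimizer scaled by the
        # batch size, and accumulate the running class/bbox statistics.
        # cls_eval and bbox_eval are assumed helper functions in the style of
        # the d2l TinySSD example; they are not defined in this excerpt.
        l.backward()
        trainer.step(batch_size)
        acc += cls_eval(class_preds, class_labels)
        mae += bbox_eval(bbox_preds, bbox_labels, bbox_masks)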
def check_unroll(cell_type, num_states, layout): batch_size = 20 input_size = 50 hidden_size = 30 seq_len = 10 if layout == 'TNC': rnn_data = mx.nd.normal(loc=0, scale=1, shape=(seq_len, batch_size, input_size)) elif layout == 'NTC': rnn_data = mx.nd.normal(loc=0, scale=1, shape=(batch_size, seq_len, input_size)) else: print("Wrong layout") return valid_length = mx.nd.round( mx.nd.random.uniform(low=1, high=10, shape=(batch_size))) state_shape = (batch_size, hidden_size) states = [ mx.nd.normal(loc=0, scale=1, shape=state_shape) for i in range(num_states) ] cell = cell_type(hidden_size, prefix='rnn_') cell.initialize(ctx=default_context()) if layout == 'TNC': cell(rnn_data[0], states) else: cell(rnn_data[:, 0, :], states) params1 = cell.collect_params() orig_params1 = copy.deepcopy(params1) trainer = gluon.Trainer(params1, 'sgd', {'learning_rate': 0.03}) with mx.autograd.record(): res1, states1 = cell.unroll(seq_len, rnn_data, states, valid_length=valid_length, layout=layout, merge_outputs=True) res1.backward() trainer.step(batch_size) configs = [ lambda layer: None, lambda layer: layer.hybridize(), lambda layer: layer.hybridize({'inline_limit': 0}), lambda layer: layer.hybridize({'static_alloc': True}), lambda layer: layer.hybridize({ 'static_alloc': True, 'static_shape': True }) ] # We can't pass None to a hybrid block, but it accepts an empty list. # so we use an empty list to represent valid_length if it's None. if valid_length is None: valid_length = [] for config in configs: layer = TestRNNLayer(cell_type, hidden_size, layout) layer.initialize(ctx=default_context()) config(layer) res2, states2 = layer(rnn_data, states, valid_length) params2 = layer.collect_params() for key, val in orig_params1.items(): params2[key].set_data(copy.deepcopy(val.data())) trainer = gluon.Trainer(params2, 'sgd', {'learning_rate': 0.03}) with mx.autograd.record(): res2, states2 = layer(rnn_data, states, valid_length) assert_almost_equal(res1, res2, rtol=0.001, atol=0.0001) assert len(states1) == len(states2) for i in range(len(states1)): assert_almost_equal(states1[i], states2[i], rtol=0.001, atol=0.0001) res2.backward() trainer.step(batch_size) for key, val in params1.items(): weight1 = val.data() weight2 = params2[key].data() assert_almost_equal(weight1, weight2, rtol=0.001, atol=0.0001)
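def test_unroll_all_cells():
    # Hedged usage sketch (assumed; this wrapper is not part of the excerpt):
    # exercise check_unroll for each Gluon RNN cell type and both supported
    # layouts. LSTMCell carries two states (h and c); RNNCell and GRUCell
    # carry one.
    for cell_type, num_states in [(gluon.rnn.RNNCell, 1),
                                  (gluon.rnn.GRUCell, 1),
                                  (gluon.rnn.LSTMCell, 2)]:
        for layout in ['TNC', 'NTC']:
            check_unroll(cell_type, num_states, layout)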
train_data = get_dataloader(net, dataset, 512, 16, 0)

#############################################################################################
# Try to use a GPU for training
try:
    a = mx.nd.zeros((1, ), ctx=mx.gpu(0))
    ctx = [mx.gpu(0)]
except:
    ctx = [mx.cpu()]

#############################################################################################
# Start training (finetuning)
net.collect_params().reset_ctx(ctx)
trainer = gluon.Trainer(net.collect_params(), 'sgd', {
    'learning_rate': 0.001,
    'wd': 0.0005,
    'momentum': 0.9
})

mbox_loss = gcv.loss.SSDMultiBoxLoss()
ce_metric = mx.metric.Loss('CrossEntropy')
smoothl1_metric = mx.metric.Loss('SmoothL1')

for epoch in range(0, 2):
    ce_metric.reset()
    smoothl1_metric.reset()
    tic = time.time()
    btic = time.time()
    net.hybridize(static_alloc=True, static_shape=True)
    for i, batch in enumerate(train_data):
        batch_size = batch[0].shape[0]
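        # Hedged continuation sketch, following the usual GluonCV SSD
        # finetuning recipe (the excerpt stops after reading batch_size):
        # split data and targets across the context list, forward under
        # autograd, apply SSDMultiBoxLoss, then step once because the loss is
        # already normalized.
        data = gluon.utils.split_and_load(batch[0], ctx_list=ctx)
        cls_targets = gluon.utils.split_and_load(batch[1], ctx_list=ctx)
        box_targets = gluon.utils.split_and_load(batch[2], ctx_list=ctx)
        with autograd.record():
            cls_preds = []
            box_preds = []
            for x in data:
                cls_pred, box_pred, _ = net(x)
                cls_preds.append(cls_pred)
                box_preds.append(box_pred)
            sum_loss, cls_loss, box_loss = mbox_loss(
                cls_preds, box_preds, cls_targets, box_targets)
            autograd.backward(sum_loss)
        trainer.step(1)
        ce_metric.update(0, [l * batch_size for l in cls_loss])
        smoothl1_metric.update(0, [l * batch_size for l in box_loss])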
num_convs_in_dense_blocks = [4, 4, 4, 4]

for i, num_convs in enumerate(num_convs_in_dense_blocks):
    densenet.add(denseblock(num_convs, growth_rate))
    num_channels += num_convs * growth_rate
    if i != len(num_convs_in_dense_blocks) - 1:
        num_channels //= 2
        densenet.add(transition_block(num_channels))

densenet.add(nn.BatchNorm(), nn.Activation('relu'), nn.GlobalAvgPool2D(),
             nn.Dense(10))

'''
X = nd.random.uniform(shape=(100, 1, 28, 28))
densenet.initialize()
for blk in densenet:
    X = blk(X)
    print(blk.name, 'output shape:\t', X.shape)
exit()
'''

lr = 0.05
num_epochs = int(sys.argv[1])
densenet.initialize(force_reinit=True, init=init.Xavier(), ctx=ctx)
trainer = gluon.Trainer(densenet.collect_params(), 'sgd',
                        {'learning_rate': lr})

test_acc_list = do_train(net=densenet,
                         train_iter=train_data_batched,
                         test_iter=test_data_batched,
                         batch_size=batch_size,
                         trainer=trainer,
                         num_epochs=num_epochs,
                         ctx=ctx)

pkl_file = os.path.basename(__file__).split('.')[0] + '.pkl'
with open(pkl_file, 'wb') as pkl_f:
    pickle.dump(test_acc_list, pkl_f)
def fit(self, data_train, data_eva, meta, model_dir_path, epochs=10, learning_rate=0.01): config = dict() config['input_mode_answer'] = self.input_mode_answer config['input_mode_question'] = self.input_mode_question config['nb_classes'] = self.nb_classes config['meta'] = meta self.meta = meta np.save(self.get_config_file_path(model_dir_path), config) loss = gluon.loss.SoftmaxCrossEntropyLoss() self.model = Net1(self.nb_classes) self.model.collect_params().initialize(init=mx.init.Xavier(), ctx=self.model_ctx) trainer = gluon.Trainer(self.model.collect_params(), 'sgd', {'learning_rate': learning_rate}) history = dict() history['train_acc'] = list() history['val_acc'] = list() moving_loss = 0. best_eva = 0 for e in range(epochs): data_train.reset() for i, batch in enumerate(data_train): batch_size = batch.data[0].shape[0] data1 = batch.data[0].as_in_context(self.model_ctx) data2 = batch.data[1].as_in_context(self.model_ctx) data = [data1, data2] label = batch.label[0].as_in_context(self.model_ctx) with autograd.record(): output = self.model(data) cross_entropy = loss(output, label) cross_entropy.backward() trainer.step(batch_size) if i == 0: moving_loss = np.mean(cross_entropy.asnumpy()[0]) else: moving_loss = .99 * moving_loss + .01 * np.mean( cross_entropy.asnumpy()[0]) if i % 200 == 0: logging.debug("Epoch %s, batch %s. Moving avg of loss: %s", e, i, moving_loss) eva_accuracy = self.evaluate_accuracy(data_iterator=data_eva) train_accuracy = self.evaluate_accuracy(data_iterator=data_train) history['train_acc'].append(train_accuracy) history['val_acc'].append(eva_accuracy) print("Epoch %s. Loss: %s, Train_acc %s, Eval_acc %s" % (e, moving_loss, train_accuracy, eva_accuracy)) if eva_accuracy > best_eva: best_eva = eva_accuracy logging.info('Best validation acc found. Checkpointing...') self.checkpoint(model_dir_path) if e % 5 == 0: self.save_history(history, model_dir_path) self.save_history(history, model_dir_path) return history
def train(self): self.net.collect_params().reset_ctx(self.ctx) trainer = gluon.Trainer( params=self.net.collect_params(), optimizer='sgd', optimizer_params={ 'learning_rate': self.lr, 'wd': self.wd, 'momentum': self.momentum }, update_on_kvstore=(False if self.use_amp else None)) if self.use_amp: amp.init_trainer(trainer) lr_decay = self.lr_decay lr_steps = sorted( [float(ls) for ls in self.lr_decay_epoch.split(',') if ls.strip()]) cls_criterion = FocalLoss(num_class=80) box_criterion = HuberLoss(rho=0.11) cls_metric = mx.metric.Loss('FocalLoss') box_metric = mx.metric.Loss('SmoothL1') logging.info('Start training from scratch...') for epoch in range(self.epoch): while lr_steps and epoch > lr_steps[0]: new_lr = trainer.learning_rate * lr_decay lr_steps.pop(0) trainer.set_learning_rate(new_lr) logging.info("Epoch {} Set learning rate to {}".format( epoch, new_lr)) cls_metric.reset() box_metric.reset() tic = time.time() btic = time.time() # reset cause save params may change self.net.collect_params().reset_ctx(self.ctx) self.net.hybridize(static_alloc=True, static_shape=True) for i, batch in enumerate(self.train_data): data, box_targets, cls_targets = batch with autograd.record(): cls_preds = [] box_preds = [] for x in data: cls_pred, box_pred, _ = self.net(x) cls_preds.append(cls_pred) box_preds.append(box_pred) cls_loss = [ cls_criterion(cls_pred, cls_target) for cls_pred, cls_target in zip( cls_preds, cls_targets) ] box_loss = [ box_criterion(box_pred, box_target) for box_pred, box_target in zip( box_preds, box_targets) ] sum_loss = [(cl + bl) for cl, bl in zip(cls_loss, box_loss)] if self.use_amp: with amp.scale_loss(sum_loss, trainer) as scaled_loss: autograd.backward(scaled_loss) else: autograd.backward(sum_loss) # since we have already normalized the loss, we don't want to normalize # by batch-size anymore trainer.step(1) cls_metric.update(0, [l * self.batch_size for l in cls_loss]) box_metric.update(0, [l * self.batch_size for l in box_loss]) if i > 0 and i % 50 == 0: name1, loss1 = cls_metric.get() name2, loss2 = box_metric.get() logging.info('Epoch {} Batch {} Speed: {:.3f} samples/s, {}={:.5f}, {}={:.5f}'.\ format(epoch, i, self.batch_size/(time.time()-btic), name1, loss1, name2, loss2)) btic = time.time() logging.info('[Epoch {}] Starting Validation.'.format(epoch)) map_name, mean_ap = self.validation() val_msg = '\n'.join( ['{}={}'.format(k, v) for k, v in zip(map_name, mean_ap)]) logging.info('[Epoch {}] Validation: \n{}'.format(epoch, val_msg)) self.save_params(epoch)
def train(ctx): if isinstance(ctx, mx.Context): ctx = [ctx] if opt.resume_params is '': net.initialize(mx.init.MSRAPrelu(), ctx=ctx) if opt.no_wd: for k, v in net.collect_params('.*beta|.*gamma|.*bias').items(): v.wd_mult = 0.0 trainer = gluon.Trainer(net.collect_params(), optimizer, optimizer_params) if opt.resume_states is not '': trainer.load_states(opt.resume_states) if opt.label_smoothing or opt.mixup: sparse_label_loss = False else: sparse_label_loss = True if distillation: L = gcv.loss.DistillationSoftmaxCrossEntropyLoss( temperature=opt.temperature, hard_weight=opt.hard_weight, sparse_label=sparse_label_loss) else: L = gluon.loss.SoftmaxCrossEntropyLoss( sparse_label=sparse_label_loss) best_val_score = 1 for epoch in range(opt.resume_epoch, opt.num_epochs): tic = time.time() if opt.use_rec: train_data.reset() train_metric.reset() btic = time.time() for i, batch in enumerate(train_data): data, label = batch_fn(batch, ctx) if opt.mixup: lam = np.random.beta(opt.mixup_alpha, opt.mixup_alpha) if epoch >= opt.num_epochs - opt.mixup_off_epoch: lam = 1 data = [lam * X + (1 - lam) * X[::-1] for X in data] if opt.label_smoothing: eta = 0.1 else: eta = 0.0 label = mixup_transform(label, classes, lam, eta) elif opt.label_smoothing: hard_label = label label = smooth(label, classes) if distillation: teacher_prob = [nd.softmax(teacher(X.astype(opt.dtype, copy=False)) / opt.temperature) \ for X in data] with ag.record(): outputs = [ net(X.astype(opt.dtype, copy=False)) for X in data ] if distillation: loss = [ L(yhat.astype('float32', copy=False), y.astype('float32', copy=False), p.astype('float32', copy=False)) for yhat, y, p in zip(outputs, label, teacher_prob) ] else: loss = [ L(yhat, y.astype(opt.dtype, copy=False)) for yhat, y in zip(outputs, label) ] for l in loss: l.backward() trainer.step(batch_size) if opt.mixup: output_softmax = [nd.SoftmaxActivation(out.astype('float32', copy=False)) \ for out in outputs] train_metric.update(label, output_softmax) else: if opt.label_smoothing: train_metric.update(hard_label, outputs) else: train_metric.update(label, outputs) if opt.log_interval and not (i + 1) % opt.log_interval: train_metric_name, train_metric_score = train_metric.get() logger.info( 'Epoch[%d] Batch [%d]\tSpeed: %f samples/sec\t%s=%f\tlr=%f' % (epoch, i, batch_size * opt.log_interval / (time.time() - btic), train_metric_name, train_metric_score, trainer.learning_rate)) btic = time.time() train_metric_name, train_metric_score = train_metric.get() throughput = int(batch_size * i / (time.time() - tic)) err_top1_val, err_top5_val = test(ctx, val_data) logger.info('[Epoch %d] training: %s=%f' % (epoch, train_metric_name, train_metric_score)) logger.info('[Epoch %d] speed: %d samples/sec\ttime cost: %f' % (epoch, throughput, time.time() - tic)) logger.info('[Epoch %d] validation: err-top1=%f err-top5=%f' % (epoch, err_top1_val, err_top5_val)) if err_top1_val < best_val_score: best_val_score = err_top1_val net.save_parameters( '%s/%.4f-imagenet-%s-%d-best.params' % (save_dir, best_val_score, model_name, epoch)) trainer.save_states( '%s/%.4f-imagenet-%s-%d-best.states' % (save_dir, best_val_score, model_name, epoch)) if save_frequency and save_dir and (epoch + 1) % save_frequency == 0: net.save_parameters('%s/imagenet-%s-%d.params' % (save_dir, model_name, epoch)) trainer.save_states('%s/imagenet-%s-%d.states' % (save_dir, model_name, epoch)) if save_frequency and save_dir: net.save_parameters('%s/imagenet-%s-%d.params' % (save_dir, model_name, opt.num_epochs - 1)) 
trainer.save_states('%s/imagenet-%s-%d.states' % (save_dir, model_name, opt.num_epochs - 1))
H = { "epoch": [], "train_loss": [], "train_acc": [], "val_loss": [], "val_acc": [], "chrono": [] } scheduler = mx.lr_scheduler.FactorScheduler(base_lr=1e-3, factor=0.7, step=10 * len(train_data)) trainer = gluon.Trainer(net.collect_params(), "sgd", { "lr_scheduler": scheduler, "momentum": sgd_momentum, "wd": sgd_wd }) train(0, transfer_epochs, H) # %% # -- Finetune last N blocks of the network pretrained_features = pretrained_net.features # Allow update of weights for the last N blocks for param in pretrained_features[24:].collect_params().values(): param.grad_req = 'write' # DEBUG for index, param in enumerate(net.collect_params().values()):
def main(net, batch_size, epochs, opt, ctx): train_data, val_data = get_data_iters(batch_size) if opt.hybridize: net.hybridize() trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': opt.lr, 'wd': opt.wd, 'momentum': opt.momentum}) #trainer = gluon.Trainer(net.collect_params(), 'adam', {'learning_rate': opt.lr, 'wd': opt.wd}) criterion1 = [] for _ in range(8): criterion1.append(gluon.loss.SoftmaxCrossEntropyLoss()) criterion2 = [] if opt.triplet: for _ in range(3): criterion2.append(TripletLoss()) lr = opt.lr minlr = lr*0.01 dlr = (lr-minlr)/(epochs[0]-1) prev_time = datetime.datetime.now() for epoch in range(epochs[-1]): _loss = 0. if epoch<epochs[0]: lr = minlr + dlr*epoch else: if epoch in epochs[1:]: lr = lr * opt.lr_decay trainer.set_learning_rate(lr) for data, label in train_data: data_list = gluon.utils.split_and_load(data, ctx) label_list = gluon.utils.split_and_load(label, ctx) with autograd.record(): losses = [] for i in range(opt.num_gpus): outputs, features = net(data_list[i]) temp_loss = [] num = len(outputs) for j in range(len(outputs)): temp_loss.append(criterion1[j](outputs[j], label_list[i])) if opt.triplet: num += len(features) for j in range(len(features)): temp_loss.append(criterion2[j](features[j], label_list[i])) loss = sum(temp_loss) / num losses.append(loss) for l in losses: l.backward() trainer.step(batch_size) _loss_list = [l.mean().asscalar() for l in losses] _loss += sum(_loss_list) / len(_loss_list) cur_time = datetime.datetime.now() h, remainder = divmod((cur_time - prev_time).seconds, 3600) m, s = divmod(remainder, 60) time_str = "Time %02d:%02d:%02d" % (h, m, s) __loss = _loss/len(train_data) if val_data is not None: val_loss, val_accuray = validate(val_data, net, criterion1, criterion2, ctx) epoch_str = ("Epoch %d. Train loss: %f, Val loss %f, Val accuray %f, " % (epoch, __loss , val_loss, val_accuray)) else: epoch_str = ("Epoch %d. Train loss: %f, " % (epoch, __loss)) prev_time = cur_time print(epoch_str + time_str + ', lr ' + str(trainer.learning_rate)) if not os.path.exists("params"): os.mkdir("params") net.save_parameters("params/resnet50.params")
def train(): epochs = 100 lr = 0.1 lamda = 0.1 lr_steps = [40, 70, np.inf] wd = 5e-4 momentum = 0.9 batch_size = 256 plot_period = 5 ctx = [mx.gpu(i) for i in range(2)] train_set = MNIST(train=True, transform=transform_train) train_data = gluon.data.DataLoader(train_set, batch_size, True, num_workers=4, last_batch='discard') val_set = MNIST(train=False, transform=transform_val) val_data = gluon.data.DataLoader(val_set, batch_size, shuffle=False, num_workers=4) net = MnistNet(embedding_size=2, weight_norm=True) net.initialize(init=mx.init.MSRAPrelu(), ctx=ctx) # net.load_parameters("./pretrained_mnist.params", ctx=ctx) net.hybridize() loss = RingLoss(lamda) loss.initialize(ctx=ctx) loss.hybridize() train_params = net.collect_params() train_params.update(loss.params) trainer = gluon.Trainer(train_params, 'sgd', {'learning_rate': lr, 'momentum': momentum, 'wd': wd}) lr_counter = 0 metric = mtc.Accuracy() num_batch = len(train_data) for epoch in range(epochs): if epoch == lr_steps[lr_counter]: trainer.set_learning_rate(trainer.learning_rate * 0.1) lr_counter += 1 if (epoch % plot_period) == 0: plot = True else: plot = False train_loss = 0 metric.reset() tic = time.time() ebs = [] lbs = [] print("Radius", loss.R.data(ctx=mx.gpu(0)).asscalar()) for batch in train_data: data = gluon.utils.split_and_load(batch[0], ctx_list=ctx, batch_axis=0, even_split=False) labels = gluon.utils.split_and_load(batch[1], ctx_list=ctx, batch_axis=0, even_split=False) with ag.record(): ots = [net(X) for X in data] embedds = [ot[0] for ot in ots] outputs = [ot[1] for ot in ots] losses = [loss(yhat, y, emb) for yhat, y, emb in zip(outputs, labels, embedds)] for l in losses: ag.backward(l) if plot: for es, ls in zip(embedds, labels): assert len(es) == len(ls) for idx in range(len(es)): ebs.append(es[idx].asnumpy()) lbs.append(ls[idx].asscalar()) trainer.step(batch_size) metric.update(labels, outputs) train_loss += sum([l.mean().asscalar() for l in losses]) / len(losses) _, train_acc = metric.get() train_loss /= num_batch val_acc, val_loss, val_ebs, val_lbs = validate(net, val_data, ctx, loss, plot) if plot: ebs = np.vstack(ebs) lbs = np.hstack(lbs) plot_result(ebs, lbs, os.path.join("./resources", "ringloss-train-epoch{}.png".format(epoch))) plot_result(val_ebs, val_lbs, os.path.join("./resources", "ringloss-val-epoch{}.png".format(epoch))) toc = time.time() print('[epoch % 3d] train accuracy: %.6f, train loss: %.6f | ' 'val accuracy: %.6f, val loss: %.6f, time: %.6f' % (epoch, train_acc, train_loss, val_acc, val_loss, toc - tic))
def train_model(model, train_data_loader, val_data_loader, embedding, ctx, args): """ Train model and validate/save every epoch. """ logger.info(vars(args)) # Initialization model.hybridize() model.collect_params().initialize(mx.init.Normal(0.01), ctx=ctx) model.word_emb.weight.set_data(embedding.idx_to_vec) # Fix word embedding if args.fix_embedding: model.word_emb.weight.grad_req = 'null' loss_func = gluon.loss.SoftmaxCrossEntropyLoss() trainer = gluon.Trainer(model.collect_params(), args.optimizer, { 'learning_rate': args.lr, 'wd': args.weight_decay, 'clip_gradient': 5 }) checkpoints_dir = os.path.join(args.output_dir, 'checkpoints') if not os.path.exists(checkpoints_dir): os.makedirs(checkpoints_dir) best_val_acc = 0. for epoch_id in range(args.epochs): avg_loss = 0. avg_acc = 0. for batch_id, example in enumerate(train_data_loader): s1, s2, label = example s1 = s1.as_in_context(ctx) s2 = s2.as_in_context(ctx) label = label.as_in_context(ctx) with autograd.record(): output = model(s1, s2) loss = loss_func(output, label).mean() loss.backward() trainer.step(1) avg_loss += loss.sum().asscalar() pred = output.argmax(axis=1) acc = (pred == label.astype(np.float32)).mean() avg_acc += acc.asscalar() if (batch_id + 1) % args.print_interval == 0: avg_loss /= args.print_interval avg_acc /= args.print_interval logger.info( '[Epoch {} Batch {}/{}] loss={:.4f}, acc={:.4f}'.format( epoch_id, batch_id + 1, len(train_data_loader), avg_loss, avg_acc)) avg_loss = 0. avg_acc = 0. # Validation val_loss, val_acc = test_model(model, val_data_loader, loss_func, ctx) if val_acc > best_val_acc: best_val_acc = val_acc checkpoint_path = os.path.join(args.output_dir, 'checkpoints', 'valid_best.params') model.save_parameters(checkpoint_path) logger.info( '[Epoch {}] valid loss={:.4f}, valid acc={:.4f}, best valid acc={:.4f}' .format(epoch_id, val_loss, val_acc, best_val_acc)) # Save checkpoint of last epoch checkpoint_path = os.path.join(args.output_dir, 'checkpoints', 'last.params') model.save_parameters(checkpoint_path)
        # chain all blocks together
        self.net = nn.HybridSequential()
        self.net.add(b1, b2, b3, b4, b5, b6)

    def forward(self, x):
        out = x
        for i, b in enumerate(self.net):
            out = b(out)
            if self.verbose:
                print('Block %d output: %s' % (i + 1, out.shape))
        return out


################################################################
# train
train_data, test_data = utils.load_data_fashion_mnist(batch_size=64,
                                                      resize=96)
ctx = utils.try_gpu()
net = ResNet(10)
net.initialize(ctx=ctx, init=init.Xavier())

############### graph ###############
import gluoncv
gluoncv.utils.viz.plot_network(net, save_prefix=False)
#####################################

loss = gluon.loss.SoftmaxCrossEntropyLoss()
trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': 0.05})
utils.train(train_data, test_data, net, loss, trainer, ctx, num_epochs=1)
def __init__(self, options, logger): # configuration setting self.opt = options self.logger = logger self.log_path = os.path.join(self.opt.log_dir, self.opt.model_zoo) # checking height and width are multiples of 32 assert self.opt.height % 32 == 0, "'height' must be a multiple of 32" assert self.opt.width % 32 == 0, "'width' must be a multiple of 32" self.num_scales = len(self.opt.scales) self.num_input_frames = len(self.opt.frame_ids) assert self.opt.frame_ids[0] == 0, "frame_ids must start with 0" self.use_pose_net = not (self.opt.use_stereo and self.opt.frame_ids == [0]) if self.opt.use_stereo: self.opt.frame_ids.append("s") ######################### dataloader ######################### datasets_dict = { "kitti": KITTIRAWDataset, "kitti_odom": KITTIOdomDataset } self.dataset = datasets_dict[self.opt.dataset] fpath = os.path.join(os.path.expanduser("~"), ".mxnet/datasets/kitti", "splits", self.opt.split, "{}_files.txt") train_filenames = readlines(fpath.format("train")) val_filenames = readlines(fpath.format("val")) img_ext = '.png' if self.opt.png else '.jpg' train_dataset = self.dataset(self.opt.data_path, train_filenames, self.opt.height, self.opt.width, self.opt.frame_ids, num_scales=4, is_train=True, img_ext=img_ext) self.train_loader = gluon.data.DataLoader( train_dataset, batch_size=self.opt.batch_size, shuffle=True, batchify_fn=dict_batchify_fn, num_workers=self.opt.num_workers, pin_memory=True, last_batch='discard') val_dataset = self.dataset(self.opt.data_path, val_filenames, self.opt.height, self.opt.width, self.opt.frame_ids, num_scales=4, is_train=False, img_ext=img_ext) self.val_loader = gluon.data.DataLoader( val_dataset, batch_size=self.opt.batch_size, shuffle=False, batchify_fn=dict_batchify_fn, num_workers=self.opt.num_workers, pin_memory=True, last_batch='discard') ################### model initialization ################### # create depth network if self.opt.model_zoo is not None: self.model = get_model(self.opt.model_zoo, pretrained_base=self.opt.pretrained_base, scales=self.opt.scales, ctx=self.opt.ctx) else: assert "Must choose a model from model_zoo, " \ "please provide depth the model_zoo using --model_zoo" self.logger.info(self.model) # resume checkpoint if needed if self.opt.resume_depth is not None: if os.path.isfile(self.opt.resume_depth): logger.info('Resume depth model: %s' % self.opt.resume_depth) self.model.load_parameters(self.opt.resume_depth, ctx=self.opt.ctx) else: raise RuntimeError("=> no checkpoint found at '{}'".format( self.opt.resume_depth)) if self.use_pose_net: # create pose network if self.opt.model_zoo_pose is not None: self.posenet = get_model( self.opt.model_zoo_pose, pretrained_base=self.opt.pretrained_base, num_input_images=2, num_input_features=1, num_frames_to_predict_for=2, ctx=self.opt.ctx) else: assert "Must choose a model from model_zoo, " \ "please provide the pose model_zoo_pose using --model_zoo_pose" self.logger.info(self.posenet) # resume checkpoint if needed if self.opt.resume_pose is not None: if os.path.isfile(self.opt.resume_pose): logger.info('Resume pose model: %s' % self.opt.resume_pose) self.model.load_parameters(self.opt.resume_pose, ctx=self.opt.ctx) else: raise RuntimeError("=> no checkpoint found at '{}'".format( self.opt.resume_pose)) if self.opt.hybridize: self.model.hybridize() self.posenet.hybridize() ################### optimization setting ################### self.lr_scheduler_depth = LRSequential([ LRScheduler('step', base_lr=self.opt.learning_rate, nepochs=self.opt.num_epochs - self.opt.warmup_epochs, 
iters_per_epoch=len(self.train_loader), step_epoch=[ self.opt.scheduler_step_size - self.opt.warmup_epochs ]) ]) optimizer_params_depth = { 'lr_scheduler': self.lr_scheduler_depth, 'learning_rate': self.opt.learning_rate } self.depth_optimizer = gluon.Trainer(self.model.collect_params(), 'adam', optimizer_params_depth) if self.use_pose_net: self.lr_scheduler_pose = LRSequential([ LRScheduler( 'step', base_lr=self.opt.learning_rate, nepochs=self.opt.num_epochs - self.opt.warmup_epochs, iters_per_epoch=len(self.train_loader), step_epoch=[ self.opt.scheduler_step_size - self.opt.warmup_epochs ]) ]) optimizer_params_pose = { 'lr_scheduler': self.lr_scheduler_pose, 'learning_rate': self.opt.learning_rate } self.pose_optimizer = gluon.Trainer(self.posenet.collect_params(), 'adam', optimizer_params_pose) print("Training model named:\n ", self.opt.model_zoo) print("Models are saved to:\n ", self.opt.log_dir) print("Training is using:\n ", "CPU" if self.opt.ctx[0] is mx.cpu() else "GPU") ################### loss function ################### if not self.opt.no_ssim: self.ssim = SSIM() self.backproject_depth = {} self.project_3d = {} for scale in self.opt.scales: h = self.opt.height // (2**scale) w = self.opt.width // (2**scale) self.backproject_depth[scale] = BackprojectDepth( self.opt.batch_size, h, w, ctx=self.opt.ctx[0]) self.project_3d[scale] = Project3D(self.opt.batch_size, h, w) ################### metrics ################### self.depth_metric_names = [ "de/abs_rel", "de/sq_rel", "de/rms", "de/log_rms", "da/a1", "da/a2", "da/a3" ] print("Using split:\n ", self.opt.split) print( "There are {:d} training items and {:d} validation items\n".format( len(train_dataset), len(val_dataset))) self.save_opts() # for save best model self.best_delta1 = 0 self.best_model = self.model if self.use_pose_net: self.best_posenet = self.posenet