def training_full(network, batch_size, epochs, loss_fn, optimizer, data, wo, ho):
    """
    Train ``network`` on the training split and evaluate it after every epoch.

    :param network: network passed to (and returned by) ``training_step``
    :param batch_size: number of samples per batch; a trailing partial batch is dropped
    :param epochs: number of passes over the training split
    :param loss_fn: loss function forwarded to ``training_step`` / ``validate``
    :param optimizer: optimizer forwarded to ``training_step``
    :param data: 4-tuple ``(train_features, train_labels, val_features, val_labels)``
    :param wo: forwarded unchanged to ``training_step`` / ``validate``
    :param ho: forwarded unchanged to ``training_step`` / ``validate``
    :return: ``(network, final_loss, train_accuracy, validation_loss, val_accuracy)``;
        ``final_loss`` is the mean loss of the last processed batch (``0`` if no
        batch ran) and ``validation_loss`` is ``None`` if no epoch ran
    """
    train_features, train_labels, val_features, val_labels = data
    num_batches = train_features.shape[0] // batch_size
    train_acc = metric.Accuracy()
    val_acc = metric.Accuracy()

    # Defaults keep the return statement valid when ``epochs`` is 0 or the
    # training split is smaller than one batch (previously UnboundLocalError).
    final_loss = 0
    validation_loss = None

    for _ in trange(epochs, leave=True):
        for i in range(num_batches):
            window = slice(i * batch_size, (i + 1) * batch_size)
            batch_X = mx.nd.array(train_features[window],
                                  ctx=context).astype('float32')
            batch_Y = mx.nd.array(train_labels[window],
                                  ctx=context).astype('long')

            # flag=0: the call returns the (updated) network and the batch loss.
            network, loss = training_step(batch_X, batch_Y, optimizer, loss_fn,
                                          network, wo, ho, batch_size, 0)
            # Same value the original produced via "final_loss = 0; final_loss += ...".
            final_loss = loss.mean().asscalar()

            # flag=1: the call returns the pair that is fed to the accuracy metric.
            l, o = training_step(batch_X, batch_Y, optimizer, loss_fn, network,
                                 wo, ho, batch_size, 1)
            train_acc.update(l, o)

        validation_loss = validate(network, data, wo, ho, loss_fn)
        l1, o1 = validate(network, data, wo, ho, loss_fn, flag=1)
        val_acc.update(l1, o1)

    return (network, final_loss, train_acc.get()[1], validation_loss,
            val_acc.get()[1])
def _get_mxnet_metrics(train_config):
    """
    Build the list of evaluation metrics for the MXNet module API.

    The value and policy metrics are always present; WDL and plys-to-end
    metrics are appended when the matching ``train_config`` switch is set.

    :param train_config: object exposing ``use_wdl`` and ``use_plys_to_end``
    :return: list of metric objects
    """
    metrics_mxnet = [
        metric.MSE(name='value_loss',
                   output_names=['value_output'],
                   label_names=['value_label']),
        metric.CrossEntropy(name='policy_loss',
                            output_names=['policy_output'],
                            label_names=['policy_label']),
        metric.create(acc_sign,
                      name='value_acc_sign',
                      output_names=['value_output'],
                      label_names=['value_label']),
        metric.Accuracy(axis=1,
                        name='policy_acc',
                        output_names=['policy_output'],
                        label_names=['policy_label']),
    ]

    if train_config.use_wdl:
        metrics_mxnet += [
            metric.CrossEntropy(name='wdl_loss',
                                output_names=['wdl_output'],
                                label_names=['wdl_label']),
            metric.Accuracy(axis=1,
                            name='wdl_acc',
                            output_names=['wdl_output'],
                            label_names=['wdl_label']),
        ]

    if train_config.use_plys_to_end:
        metrics_mxnet += [
            metric.MSE(name='plys_to_end_loss',
                       output_names=['plys_to_end_output'],
                       label_names=['plys_to_end_label']),
        ]

    return metrics_mxnet
def _get_gluon_metrics(train_config):
    """
    Build the name -> metric mapping used by the Gluon training loop.

    :param train_config: object exposing ``sparse_policy_label``
    :return: dict with the keys ``value_loss``, ``value_acc_sign``,
        ``policy_loss`` and ``policy_acc``
    """
    metrics_gluon = {
        'value_loss': metric.MSE(name='value_loss',
                                 output_names=['value_output']),
        'value_acc_sign': metric.create(acc_sign,
                                        name='value_acc_sign',
                                        output_names=['value_output'],
                                        label_names=['value_label']),
    }

    if train_config.sparse_policy_label:
        # the default cross entropy only supports sparse labels
        # NOTE: the original statement ended with a trailing comma, which
        # stored a 1-tuple here instead of the metric object.
        metrics_gluon['policy_loss'] = metric.CrossEntropy(
            name='policy_loss',
            output_names=['policy_output'],
            label_names=['policy_label'])
        metrics_gluon['policy_acc'] = metric.Accuracy(
            axis=1,
            name='policy_acc',
            output_names=['policy_output'],
            label_names=['policy_label'])
    else:
        metrics_gluon['policy_loss'] = metric.create(
            cross_entropy,
            name='policy_loss',
            output_names=['policy_output'],
            label_names=['policy_label'])
        metrics_gluon['policy_acc'] = metric.create(
            acc_distribution,
            name='policy_acc',
            output_names=['policy_output'],
            label_names=['policy_label'])

    return metrics_gluon
def validate(network, validation_dataloader):
    """
    Compute the accuracy of ``network`` over all batches of a data loader.

    :param network: initialized gluon network to evaluate
    :type network: gluon.Block
    :param validation_dataloader: loader yielding ``(data, label)`` batches
    :type validation_dataloader: gluon.data.DataLoader

    :return: validation accuracy
    :rtype: float
    """
    acc_metric = metric.Accuracy()

    # Feed every batch through the network and accumulate the hits.
    for batch_data, batch_label in validation_dataloader:
        acc_metric.update(batch_label, network(batch_data))

    _, accuracy_value = acc_metric.get()
    return accuracy_value
def validate(net, val_data, ctx, loss, plot=False):
    """
    Evaluate ``net`` on ``val_data`` across the contexts in ``ctx``.

    ``net`` is expected to return an indexable result whose element 0 is used
    as the embedding and element 1 as the prediction.

    :param net: network to evaluate
    :param val_data: iterable of batches; ``batch[0]`` is data, ``batch[1]`` labels
    :param ctx: list of contexts used by ``split_and_load``
    :param loss: callable ``loss(prediction, label)``
    :param plot: when true, also collect per-sample embeddings and labels
    :return: ``(accuracy, mean_loss, embeddings, labels)``; the last two are
        stacked numpy arrays when ``plot`` is true and empty lists otherwise
    """
    acc_metric = mtc.Accuracy()
    loss_total = 0
    ebs, lbs = [], []

    for batch in val_data:
        data_parts = gluon.utils.split_and_load(batch[0],
                                                ctx_list=ctx,
                                                batch_axis=0,
                                                even_split=False)
        label_parts = gluon.utils.split_and_load(batch[1],
                                                 ctx_list=ctx,
                                                 batch_axis=0,
                                                 even_split=False)

        results = [net(part) for part in data_parts]
        embeddings = [res[0] for res in results]
        predictions = [res[1] for res in results]
        part_losses = [
            loss(pred, lab) for pred, lab in zip(predictions, label_parts)
        ]

        acc_metric.update(label_parts, predictions)
        loss_total += sum(pl.mean().asscalar()
                          for pl in part_losses) / len(part_losses)

        if not plot:
            continue
        for emb_part, lab_part in zip(embeddings, label_parts):
            assert len(emb_part) == len(lab_part)
            for k in range(len(emb_part)):
                ebs.append(emb_part[k].asnumpy())
                lbs.append(lab_part[k].asscalar())

    if plot:
        ebs = np.vstack(ebs)
        lbs = np.hstack(lbs)

    val_acc = acc_metric.get()[1]
    return val_acc, loss_total / len(val_data), ebs, lbs
def train(network, training_dataloader, batch_size, epochs):
    """
    Train ``network`` with Adam (learning rate 0.002) and report progress.

    The loss is computed with the module-level ``loss_fn``.

    :param network: gluon network to train
    :param training_dataloader: loader yielding ``(data, label)`` batches
    :param batch_size: value passed to ``trainer.step``
    :param epochs: number of passes over the loader
    :return: ``(network, accuracy)`` where accuracy is accumulated over all epochs
    """
    # Metric accumulator (never reset, so it spans every epoch).
    accuracy = metric.Accuracy()

    # Optimizer and learning rate.
    trainer = gluon.Trainer(network.collect_params(), 'adam',
                            {'learning_rate': 0.002})

    for epoch in range(epochs):
        running_loss = 0
        tic = time()
        for data, label in training_dataloader:
            # Forward pass under autograd, then back-propagate and update.
            with autograd.record():
                output = network(data)
                loss = loss_fn(output, label)
            loss.backward()
            trainer.step(batch_size)

            running_loss += loss.asnumpy().mean()
            accuracy.update(label, output)

        # Report epoch index, mean loss and running accuracy.
        mean_loss = running_loss / len(training_dataloader)
        print("Epoch(%d) Loss:%.3f Acc:%.3f " %
              (epoch, mean_loss, accuracy.get()[1]))

    return network, accuracy.get()[1]
def test(valid_iter, net, ctx):
    """
    Accumulate the accuracy of ``net`` over ``valid_iter`` on context ``ctx``.

    :param valid_iter: iterable yielding ``(X, y)`` batches
    :param net: network to evaluate
    :param ctx: context the batches are moved to
    :return: the ``(name, value)`` pair returned by the accuracy metric
    """
    accuracy = metric.Accuracy()
    for features, targets in valid_iter:
        features = features.as_in_context(ctx)
        # The model outputs float32 data, so the labels are cast to match.
        targets = targets.as_in_context(ctx).astype('float32')
        accuracy.update(targets, net(features))
    return accuracy.get()
def evaluate_accuracy(self, data_iterator, net):
    '''Return the accuracy of ``net`` over every batch of ``data_iterator``.

    Data and labels are moved to ``self.ctx`` and cast to ``self.precision``;
    the predicted class is the argmax of the network output along axis 1.
    '''
    accuracy = metric.Accuracy()
    for batch_data, batch_label in data_iterator:
        batch_data = batch_data.as_in_context(self.ctx).astype(self.precision)
        batch_label = batch_label.as_in_context(self.ctx).astype(
            self.precision)
        predicted = nd.argmax(net(batch_data), axis=1)
        accuracy.update(preds=predicted, labels=batch_label)
    _, value = accuracy.get()
    return value
def evaluate_mxnet(model, test_data, loss_fn, device):
    """
    Evaluate ``model`` on ``test_data``.

    :param model: callable network
    :param test_data: sized iterable yielding ``(X, Y)`` batches
    :param loss_fn: callable ``loss_fn(pred, Y)``
    :param device: context the batches are copied to
    :return: ``(accuracy, mean_batch_loss)``
    """
    accuracy = metric.Accuracy()
    loss_sum = 0.0
    for features, targets in test_data:
        features = features.copyto(device)
        targets = targets.copyto(device)
        pred = model(features)
        loss_sum += loss_fn(pred, targets).mean().asscalar()
        accuracy.update(preds=pred, labels=targets)
    mean_loss = loss_sum / len(test_data)
    return accuracy.get()[1], mean_loss
def train_model_mxnet(model, train_data, test_data, device, epochs=40, vis_mod=10):
    """
    Train ``model`` with SGD and softmax cross-entropy, evaluating every epoch.

    :param model: gluon network to train
    :param train_data: sized iterable yielding ``(X, Y)`` training batches
    :param test_data: sized iterable passed to ``evaluate_mxnet``
    :param device: context the batches are copied to
    :param epochs: number of passes over ``train_data``
    :param vis_mod: progress is printed every ``vis_mod`` epochs
    :return: dict with per-epoch lists under the keys ``accuracy``, ``loss``,
        ``test_accuracy`` and ``test_loss``
    """
    loss_fn = gluon.loss.SoftmaxCrossEntropyLoss()
    sgd_options = {
        'learning_rate': 1e-2,
        'momentum': 0.5,
        'clip_gradient': 5.0
    }
    optimizer = gluon.Trainer(model.collect_params(), 'sgd', sgd_options)

    history = {
        'accuracy': [],
        'loss': [],
        'test_accuracy': [],
        'test_loss': []
    }

    start = time.time()
    for epoch in range(epochs):
        acc = metric.Accuracy()  # fresh accumulator for every epoch
        total_loss = 0.0
        for X, Y in train_data:
            X = X.copyto(device)
            Y = Y.copyto(device)
            with autograd.record():
                pred = model(X)
                loss = loss_fn(pred, Y)
            loss.backward()
            optimizer.step(batch_size=X.shape[0])
            total_loss += loss.mean().asscalar()
            acc.update(preds=pred, labels=Y)

        history['loss'].append(total_loss / len(train_data))
        history['accuracy'].append(acc.get()[1])

        _test_acc, _test_loss = evaluate_mxnet(model, test_data, loss_fn,
                                               device)
        history['test_accuracy'].append(_test_acc)
        history['test_loss'].append(_test_loss)

        if (epoch + 1) % vis_mod != 0:
            continue
        print(f'[Epoch {epoch + 1:3d}] train_acc: {100 * acc.get()[1]:4.2f}% - '
              f'train_loss: {total_loss / len(train_data):6.3f}')
        print(f'[Epoch {epoch + 1:3d}] val_acc: {100 * _test_acc:5.2f}% - '
              f'val_loss: {_test_loss:6.3f}')

    print(f'Model runtime: {time.time() - start:6.3f}s')
    return history
def train(train_iter):
    """
    Train a DSOD detector and save its weights to ``args.params``.

    Relies on the module-level names ``ctx``, ``args``, ``train_data`` and
    ``batch_size``.

    :param train_iter: NOTE(review): not used by the body, which iterates the
        module-level ``train_data`` instead - confirm which one is intended
    """
    net = nn.HybridSequential()
    with net.name_scope():
        net.add(model.DSOD(32, 6, 32, 1, 1))  # 64 6 48 1 1
    net.initialize()

    cls_loss = FocalLoss()  # hard neg mining vs FocalLoss()
    l1_loss = gluon.loss.L1Loss()

    net.collect_params().reset_ctx(ctx)
    trainer = gluon.Trainer(net.collect_params(), 'sgd', {
        'learning_rate': 0.1,
        'wd': 5e-4
    })

    cls_metric = metric.Accuracy()
    box_metric = metric.MAE()

    filename = args.params
    if args.retrain:
        print('load last time weighting')
        # Load onto the context the network was just moved to; the original
        # always loaded onto mx.gpu(), regardless of ``ctx``.
        net.load_params(filename, ctx=ctx)

    for epoch in range(args.epoch):
        train_data.reset()
        cls_metric.reset()
        box_metric.reset()
        tic = time.time()
        for i, batch in enumerate(train_data):
            x = batch.data[0].as_in_context(ctx)
            y = batch.label[0].as_in_context(ctx)
            with mx.autograd.record():
                anchors, class_preds, box_preds = net(x)
                box_target, box_mask, cls_target = training_targets(
                    anchors, class_preds, y)
                loss1 = cls_loss(class_preds, cls_target)
                loss2 = l1_loss(box_preds, box_target, box_mask)
                # The box regression term is weighted 5x against the class term.
                loss = loss1 + 5 * loss2
            loss.backward()
            trainer.step(batch_size)
            cls_metric.update([cls_target],
                              [class_preds.transpose((0, 2, 1))])
            box_metric.update([box_target], [box_preds * box_mask])

        print('Epoch %2d, train %s %.2f, %s %.5f, time %.1f sec' %
              (epoch, *cls_metric.get(), *box_metric.get(),
               time.time() - tic))
        # Checkpoint after every epoch (overwrites the same file).
        net.save_params(filename)
def eval_model(features, labels):
    """
    Compute mean loss and accuracy over all full batches of ``features``.

    Uses the module-level ``batch_size``, ``ctx``, ``net`` and ``loss``.
    Samples beyond the last full batch are ignored.

    :param features: array sliced along axis 0, one batch at a time
    :param labels: array sliced the same way as ``features``
    :return: ``(mean_loss, accuracy)``
    """
    loss_total = 0
    loss_count = 0
    acc_metric = metric.Accuracy()

    n_batches = features.shape[0] // batch_size
    for b in range(n_batches):
        start, stop = b * batch_size, (b + 1) * batch_size
        X = features[start:stop].as_in_context(ctx).T
        y = labels[start:stop].as_in_context(ctx).T

        output = net(X)
        batch_loss = loss(output, y)
        loss_total += batch_loss.sum().asscalar()
        loss_count += batch_loss.size

        predicted = nd.argmax(output, axis=1)
        acc_metric.update(preds=predicted, labels=y)

    return loss_total / loss_count, acc_metric.get()[1]
def train(network, training_dataloader, batch_size, epochs):
    """
    Should take an initialized network and train that network using data from the data loader.

    Training uses softmax cross-entropy and Adam (learning rate 0.002). After
    the last epoch the parameters are written to ``trained_net.params``.

    :param network: initialized gluon network to be trained
    :type network: gluon.Block
    :param training_dataloader: the training DataLoader provides batches for data for every iteration
    :type training_dataloader: gluon.data.DataLoader
    :param batch_size: batch size for the DataLoader.
    :type batch_size: int
    :param epochs: number of epochs to train the DataLoader
    :type epochs: int

    :return: tuple of trained network and the final training accuracy
    :rtype: (gluon.Block, float)
    """
    loss_fn = gluon.loss.SoftmaxCrossEntropyLoss()
    train_accuracy = metric.Accuracy()
    # Optimise the parameters of the network that was passed in; the original
    # referenced an unrelated name ``net`` here.
    trainer = gluon.Trainer(network.collect_params(), 'adam',
                            {'learning_rate': 0.002})

    for epoch in range(epochs):
        train_loss = 0.
        tic = time()
        for data, label in training_dataloader:
            with autograd.record():
                output = network(data)
                loss = loss_fn(output, label)
            loss.backward()
            trainer.step(batch_size)

            train_loss += loss.mean().asscalar()
            train_accuracy.update(label, output)

        # NOTE(review): the "Perf" figure divides the number of batches (not
        # images) by the elapsed time, despite the "img/sec" label.
        print("Epoch(%d) Loss:%.3f Acc:%.3f Perf: %.1f img/sec" %
              (epoch, train_loss / len(training_dataloader),
               train_accuracy.get()[1],
               len(training_dataloader) / (time() - tic)))

    network.save_parameters("trained_net.params")
    training_accuracy = train_accuracy.get()[1]
    return network, training_accuracy
def train(network, training_dataloader, batch_size, epochs):
    """
    Train an initialized network on the batches of a data loader.

    Uses softmax cross-entropy as the loss and Adam with learning rate 0.002.

    :param network: initialized gluon network to be trained
    :type network: gluon.Block
    :param training_dataloader: loader yielding ``(data, label)`` batches
    :type training_dataloader: gluon.data.DataLoader
    :param batch_size: batch size handed to ``trainer.step``
    :type batch_size: int
    :param epochs: number of passes over the loader
    :type epochs: int

    :return: tuple of trained network and the final training accuracy
    :rtype: (gluon.Block, float)
    """
    # Ingredients: loss function, metric accumulator, Adam trainer.
    criterion = gluon.loss.SoftmaxCrossEntropyLoss()
    accuracy = metric.Accuracy()
    adam_options = {'learning_rate': 0.002}
    trainer = gluon.Trainer(network.collect_params(), 'adam', adam_options)

    # Training loop.
    for _ in range(epochs):
        for batch_data, batch_label in training_dataloader:
            with autograd.record():
                prediction = network(batch_data)
                batch_loss = criterion(prediction, batch_label)
            batch_loss.backward()
            trainer.step(batch_size)
            accuracy.update(batch_label, prediction)

    _, training_accuracy = accuracy.get()
    print(training_accuracy)
    return network, training_accuracy
def train(ctx, loss, trainer, datasetName, modelName, net, train_iter, valid_iter, num_epochs, n_retrain_epoch=0):
    '''
    Train ``net``, track train/validation error and keep the best checkpoints.

    ``n_retrain_epoch`` is the epoch number training resumes from; it only
    offsets the epoch index used in log lines and checkpoint file names.

    Returns the ``TrainingHistory`` holding the per-epoch error curves.
    '''
    train_metric = metric.Accuracy()
    train_history = TrainingHistory(['training-error', 'validation-error'])
    best_val_score = 0
    modelDir, resultDir = get_result_dirs(datasetName)

    for epoch in range(num_epochs):
        epoch_no = n_retrain_epoch + epoch
        train_l_batch = 0.0
        start = time.time()  # start timing
        train_metric.reset()

        for X, y in train_iter:
            X = X.as_in_context(ctx)
            # The model outputs float32 data, so cast the labels to match.
            y = y.as_in_context(ctx).astype('float32')
            with autograd.record():  # record gradient information
                outputs = net(X)  # model output
                l = loss(outputs, y).mean()  # mean loss of the batch
            l.backward()  # back-propagation
            trainer.step(1)
            train_l_batch += l.asscalar()  # sum of the per-batch mean losses
            train_metric.update(y, outputs)  # training accuracy

        train_acc = train_metric.get()[1]
        time_s = "time {:.2f} sec".format(time.time() - start)  # stop timing

        # Mean loss and accuracy on the validation set.
        valid_loss = evaluate_loss(valid_iter, net, ctx, loss)
        val_acc = test(valid_iter, net, ctx)[1]

        epoch_s = (
            "epoch {:d}, train loss {:.5f}, valid loss {:.5f}, train acc {:.5f}, valid acc {:.5f}, "
            .format(epoch_no, train_l_batch, valid_loss, train_acc, val_acc))
        print(epoch_s + time_s)

        # Update the error curves and redraw the plot.
        train_history.update([1 - train_acc, 1 - val_acc])
        train_history.plot(save_path=f'{resultDir}/{modelName}_history.png')

        # Keep a checkpoint whenever validation accuracy improves.
        if val_acc > best_val_score:
            best_val_score = val_acc
            net.save_parameters('{}/{:.4f}-{}-{:d}-best.params'.format(
                modelDir, best_val_score, modelName, epoch_no))

    return train_history
def fit(self, train_gen, test_gen, epochs, print_every, loss_with_softmax, optimizer, batch_size=128):
    """
    Train ``self.layer`` and record loss/accuracy every ``print_every`` iterations.

    :param train_gen: iterable yielding ``(data, label)`` training batches
    :param test_gen: iterable passed to ``self.evaluate_accuracy``
    :param epochs: number of passes over ``train_gen``
    :param print_every: metrics are computed and printed when the batch index
        is a multiple of this value
    :param loss_with_softmax: callable ``loss(logits, label)``
    :param optimizer: optimizer handed to ``gluon.Trainer``
    :param batch_size: value passed to ``trainer.step`` to normalise the
        gradient; defaults to 128, the value that was previously hard-coded
    """
    trainer = gluon.Trainer(params=self.collect_params(), optimizer=optimizer)

    # Initialize some objects for the metrics
    acc = metric.Accuracy()
    train_acc_records = []
    test_acc_records = []
    loss_records = []

    for e in range(epochs):
        for i, (data, label) in enumerate(train_gen):
            data = data.as_in_context(self.ctx).astype(self.precision)
            label = label.as_in_context(self.ctx).astype(np.float32)

            with autograd.record():
                label_linear = self.layer(data)
                # Improve accuracy, as suggested in nVIDIA's SDK.
                label_linear = label_linear.astype(np.float32)
                loss = loss_with_softmax(label_linear, label)
            loss.backward()
            trainer.step(batch_size=batch_size)

            # Print the metrics every several iterations.
            if i % print_every != 0:
                continue

            # Metrics for train (current batch only, hence the reset) & test data.
            label_pred = nd.argmax(nd.softmax(label_linear), axis=1)
            acc.reset()
            acc.update(preds=label_pred, labels=label)
            train_acc = acc.get()[1]
            test_acc = self.evaluate_accuracy(test_gen, self.layer)
            train_acc_records.append(train_acc)
            test_acc_records.append(test_acc)

            curr_loss = nd.mean(loss).asscalar()
            loss_records.append(curr_loss)
            print(
                "epoch=%2s, iter=%5d, loss=%10f, train acc=%10f, test_acc=%10f"
                % (e, i, curr_loss, train_acc, test_acc))

    # Visualize the metrics collected during training.
    self.viz_training(train_acc_records, test_acc_records, loss_records)
def calculate_accuracy(network, dataloader):
    """
    Run ``network`` over ``dataloader`` and accumulate an accuracy metric.

    :param network: network to be tested
    :type network: mx.gluon.Block
    :param dataloader: dataloader for test data
    :type dataloader: mx.gluon.data.DataLoader

    :return: the updated metric object (not the scalar value)
    :rtype: mx.metric.EvalMetric
    """
    acc_metric = metric.Accuracy()
    progress = tqdm(dataloader)
    for batch, targets in progress:
        acc_metric.update(labels=targets, preds=network(batch))
    return acc_metric
def train_cls_network(inference, train_loader, trainer, cur_epoch, ctx, criterion, log_iter=100):
    """
    Run one training epoch of a classification network.

    :param inference: callable network, applied to each per-context data slice
    :param train_loader: resettable iterator whose batches expose ``data`` and ``label``
    :param trainer: object whose ``step(batch_size)`` applies the update
    :param cur_epoch: epoch number, used for log output only
    :param ctx: context list forwarded to ``split_and_load``
    :param criterion: callable ``criterion(prediction, label)``
    :param log_iter: progress is printed every ``log_iter`` batches (batch 0 excluded)
    :return: accuracy accumulated over the epoch
    """
    metric_acc = metric.Accuracy()
    metric_loss = metric.Loss()

    train_loader.reset()
    epoch_start_time = timeit.default_timer()

    for cur_batch, batch in enumerate(train_loader):
        batch_start_time = timeit.default_timer()
        batch_size = batch.data[0].shape[0]
        data = gluon.utils.split_and_load(batch.data[0], ctx)
        label = gluon.utils.split_and_load(batch.label[0], ctx)

        with autograd.record(train_mode=True):
            losses = []
            for x, y in zip(data, label):
                y_hat = inference(x)
                loss = criterion(y_hat, y)
                losses.append(loss)
                metric_loss.update(None, preds=[loss])
                metric_acc.update(preds=[y_hat], labels=[y])
        for loss in losses:
            loss.backward()
        trainer.step(batch_size)

        if cur_batch % log_iter == 0 and cur_batch > 0:
            batch_elapsed_time = timeit.default_timer() - batch_start_time
            # Throughput is samples divided by seconds; the original printed
            # the inverse (seconds per sample) under the "samples/s" label.
            if batch_elapsed_time > 0:
                speed = batch_size / batch_elapsed_time
            else:
                speed = float('inf')
            print('Epoch [%d-%d]: Speed: %.2f samples/s \t Accuracy: %.2f \t Loss: %.4f'
                  % (cur_epoch, cur_batch, speed,
                     100 * metric_acc.get()[1], metric_loss.get()[1]))

    epoch_elapsed_time = timeit.default_timer() - epoch_start_time
    logging.info('Epoch [%d]: Accuracy: %.2f' %
                 (cur_epoch, 100 * metric_acc.get()[1]))
    logging.info('Epoch [%d]: Loss: %.2f' %
                 (cur_epoch, metric_loss.get()[1]))
    logging.info('Epoch [%d]: Elapsed time: %s' %
                 (cur_epoch, str(timedelta(seconds=epoch_elapsed_time))))
    return metric_acc.get()[1]
def train(net, loss_fn, train_data, epochs, batch_size):
    """
    Train an initialized network with plain SGD (learning rate 0.1).

    :param net: initialized gluon network to be trained
    :type net: gluon.Block
    :param loss_fn: the loss function
    :type loss_fn: gluon.Block
    :param train_data: loader yielding ``(data, label)`` batches
    :type train_data: gluon.data.DataLoader
    :param epochs: number of passes over the loader
    :type epochs: int
    :param batch_size: batch size handed to ``trainer.step``
    :type batch_size: int

    :return: tuple of trained network and the final training accuracy
    :rtype: (gluon.Block, float)
    """
    accuracy = metric.Accuracy()
    sgd_options = {'learning_rate': 0.1}
    trainer = gluon.Trainer(net.collect_params(), 'sgd', sgd_options)

    for epoch in range(epochs):
        print(f'epoch: {epoch}')
        for batch, target in train_data:
            with autograd.record():
                prediction = net(batch)
                batch_loss = loss_fn(prediction, target)
            batch_loss.backward()
            trainer.step(batch_size)
            accuracy.update(target, prediction)
        # Running accuracy (accumulated since the first epoch).
        print(accuracy.get()[1])

    final_accuracy = accuracy.get()[1]
    return (net, final_accuracy)
def train_fun():
    """
    Train a two-class ``ToySSD`` detector for 30 epochs on GPU 0.

    Relies on the module-level names ``data_shape``, ``batch_size``,
    ``get_iterators``, ``ToySSD``, ``FocalLoss``, ``SmoothL1Loss`` and
    ``training_targets``. Prints classification accuracy and box MAE per epoch.
    """
    import time
    from mxnet import autograd

    cls_metric = metric.Accuracy()
    box_metric = metric.MAE()
    ctx = gpu(0)

    train_data, test_data, class_names, num_class = get_iterators(
        data_shape, batch_size)
    train_data.reshape(label_shape=(3, 5))
    train_data = test_data.sync_label_shape(train_data)

    # The original first built and initialised ``ToySSD(num_class)`` and then
    # immediately replaced it with this network; only this one was ever used.
    net = ToySSD(num_classes=2, verbose=False)
    net.initialize(init.Xavier(magnitude=2), ctx=ctx)
    trainer = gluon.Trainer(net.collect_params(), 'sgd', {
        'learning_rate': 0.1,
        'wd': 5e-4
    })

    cls_loss = FocalLoss()
    box_loss = SmoothL1Loss()

    for epoch in range(30):
        # Reset the data iterator and the metrics for the new epoch.
        train_data.reset()
        cls_metric.reset()
        box_metric.reset()
        tic = time.time()
        for i, batch in enumerate(train_data):
            x = batch.data[0].as_in_context(ctx)
            y = batch.label[0].as_in_context(ctx)
            with autograd.record():
                anchors, class_preds, box_preds = net(x)
                box_target, box_mask, cls_target = training_targets(
                    anchors, class_preds, y)
                loss1 = cls_loss(class_preds, cls_target)
                loss2 = box_loss(box_preds, box_target, box_mask)
                loss = loss1 + loss2
            loss.backward()
            trainer.step(batch_size)
            cls_metric.update([cls_target],
                              [class_preds.transpose((0, 2, 1))])
            box_metric.update([box_target], [box_preds * box_mask])

        print('epoch %2d, train %s %.2f, %s %.5f, time %.1f sec' %
              (epoch, *cls_metric.get(), *box_metric.get(),
               time.time() - tic))
def evaluate(data_loader, data_len, model, loss, ctx):
    """
    Evaluate ``model`` and return ``(accuracy, total_loss / data_len)``.

    The predicted class is the argmax of the model output along axis 1.
    """
    loss_sum = 0.0
    accuracy = metric.Accuracy()

    for batch, target in data_loader:
        batch = batch.as_in_context(ctx)
        target = target.as_in_context(ctx)

        # train_mode=False keeps the forward pass in inference mode.
        with autograd.record(train_mode=False):
            output = model(batch)
            batch_losses = loss(output, target)

        loss_sum += nd.sum(batch_losses).asscalar()
        accuracy.update(preds=nd.argmax(output, axis=1), labels=target)

    _, acc_value = accuracy.get()
    return acc_value, loss_sum / data_len
def validate(net, val_data, ctx):
    """
    Evaluate ``net`` on ``val_data`` across the contexts in ``ctx``.

    :param net: network to evaluate
    :param val_data: sized iterable of batches; ``batch[0]`` is data, ``batch[1]`` labels
    :param ctx: list of contexts used by ``split_and_load``
    :return: ``(accuracy, mean softmax cross-entropy loss per batch)``
    """
    acc_metric = mtc.Accuracy()
    criterion = gluon.loss.SoftmaxCrossEntropyLoss()
    loss_total = 0

    for batch in val_data:
        data_parts = gluon.utils.split_and_load(batch[0],
                                                ctx_list=ctx,
                                                batch_axis=0,
                                                even_split=False)
        label_parts = gluon.utils.split_and_load(batch[1],
                                                 ctx_list=ctx,
                                                 batch_axis=0,
                                                 even_split=False)
        predictions = [net(part) for part in data_parts]
        part_losses = [
            criterion(pred, lab) for pred, lab in zip(predictions, label_parts)
        ]
        acc_metric.update(label_parts, predictions)
        # Average the per-context mean losses for this batch.
        loss_total += sum(pl.mean().asscalar()
                          for pl in part_losses) / len(part_losses)

    val_acc = acc_metric.get()[1]
    return val_acc, loss_total / len(val_data)
def evaluate(network, dataloader):
    """
    Compute the accuracy of ``network`` over every batch of ``dataloader``.

    :param network: initialized gluon network to evaluate
    :type network: gluon.Block
    :param dataloader: loader yielding ``(data, label)`` validation batches
    :type dataloader: gluon.data.DataLoader

    :return: validation accuracy
    :rtype: float
    """
    acc_metric = metric.Accuracy()
    for batch, target in dataloader:
        prediction = network(batch)
        acc_metric.update(target, prediction)
    _, val_acc = acc_metric.get()
    return val_acc
def validate(network, validation_dataloader):
    """
    Compute and print the accuracy of ``network`` on the validation set.

    :param network: initialized gluon network to evaluate
    :type network: gluon.Block
    :param validation_dataloader: loader yielding ``(data, label)`` batches
    :type validation_dataloader: gluon.data.DataLoader

    :return: validation accuracy
    :rtype: float
    """
    acc_metric = metric.Accuracy()
    for batch, target in validation_dataloader:
        acc_metric.update(target, network(batch))

    print("Validation Acc: %.3f " % (acc_metric.get()[1]))
    return acc_metric.get()[1]
def eval_model(features, labels, net, batch_size):
    """
    Evaluate ``net`` over all full batches of ``features``.

    Uses the module-level ``loss`` and ``evaluate`` callables. Samples beyond
    the last full batch are ignored.

    :param features: array sliced along axis 0, one batch at a time
    :param labels: array sliced the same way as ``features``
    :param net: callable network
    :param batch_size: number of samples per batch
    :return: ``(mean_loss, accuracy, evaluate(all_predictions, all_labels))``
    """
    l_sum = 0
    l_n = 0
    accuracy = metric.Accuracy()
    batch_count = features.shape[0] // batch_size
    preds_all = None
    labels_all = None

    for i in range(batch_count):
        window = slice(i * batch_size, (i + 1) * batch_size)
        X = features[window].as_in_context(
            features.context).T  # batch_size * embed_size
        y = labels[window].as_in_context(labels.context).T  # batch_size * 1

        output = net(X)
        l = loss(output, y)
        l_sum += l.sum().asscalar()
        l_n += l.size

        preds = nd.argmax(output, axis=1)
        accuracy.update(preds=preds, labels=y)

        # Seed the accumulators with the first batch and append afterwards.
        # The original concatenated unconditionally after seeding, so the
        # first batch was counted twice in the arrays passed to ``evaluate``.
        if preds_all is None:
            preds_all = preds
            labels_all = y
        else:
            preds_all = nd.concat(preds_all, preds, dim=0)
            labels_all = nd.concat(labels_all, y, dim=0)

    return l_sum / l_n, accuracy.get()[1], evaluate(preds_all, labels_all)
box_loss = SmoothL1Loss() print(box_loss) train_data.reshape(label_shape=(3, 5)) train_data = test_data.sync_label_shape(train_data) net = ToySSD(num_class) net.initialize(init.Xavier(magnitude=2), ctx=ctx) trainer = gluon.Trainer(net.collect_params(), 'sgd', { 'learning_rate': 0.1, 'wd': 5e-4 }) ctx = utils.try_gpu() cls_metric = metric.Accuracy() box_metric = metric.MAE() for epoch in range(30): # reset data iterators and metrics train_data.reset() cls_metric.reset() box_metric.reset() tic = time.time() for i, batch in enumerate(train_data): x = batch.data[0].as_in_context(ctx) y = batch.label[0].as_in_context(ctx) with autograd.record(): anchors, class_preds, box_preds = net(x) box_target, box_mask, cls_target = training_targets( anchors, class_preds, y) # losses
def update_network(queue, nn_update_idx, symbol_filename, params_filename, convert_to_onnx, main_config, train_config: TrainConfig, model_contender_dir):
    """
    Creates a new NN checkpoint in the model contender directory after training using the game files
    stored in the training directory, i.e. updates the neural network with the newly acquired games.

    :param queue: Queue object used to return items
    :param nn_update_idx: Defines how many updates of the nn has already been done. This index should be
        incremented after every update.
    :param symbol_filename: Architecture definition file
    :param params_filename: Weight file which will be loaded before training
    :param convert_to_onnx: Boolean indicating if the network shall be exported to ONNX to allow TensorRT inference
    :param main_config: Dict of the main_config (imported from main_config.py)
    :param train_config: Train configuration object (imported from train_config.py); several of its fields
        (nb_parts, total_it, export_weights) are overwritten here
    :param model_contender_dir: String of the contender directory path
    :return: None; ``k_steps_final`` is handed back through ``queue``
    """
    # set the context on CPU, switch to GPU if there is one available (strongly recommended for training)
    ctx = mx.gpu(
        train_config.device_id) if train_config.context == "gpu" else mx.cpu()

    train_config.nb_parts = len(
        glob.glob(main_config["planes_train_dir"] + '**/*.zip'))
    logging.info("number parts for training: %d" % train_config.nb_parts)
    train_objects = TrainObjects()

    if train_config.nb_parts <= 0:
        raise Exception(
            'No .zip files for training available. Check the path in main_config["planes_train_dir"]:'
            ' %s' % main_config["planes_train_dir"])

    _, x_val, y_val_value, y_val_policy, _, _ = load_pgn_dataset(
        dataset_type="val",
        part_id=0,
        normalize=train_config.normalize,
        verbose=False,
        q_value_ratio=train_config.q_value_ratio)
    y_val_policy = prepare_policy(y_val_policy,
                                  train_config.select_policy_from_plane,
                                  train_config.sparse_policy_label,
                                  train_config.is_policy_from_plane_data)
    val_dataset = gluon.data.ArrayDataset(nd.array(x_val),
                                          nd.array(y_val_value),
                                          nd.array(y_val_policy))
    val_data = gluon.data.DataLoader(val_dataset,
                                     train_config.batch_size,
                                     shuffle=False,
                                     num_workers=train_config.cpu_count)

    symbol = mx.sym.load(symbol_filename)

    # calculate how many iterations per epoch exist
    nb_it_per_epoch = (len(x_val) *
                       train_config.nb_parts) // train_config.batch_size
    # one iteration is defined by passing 1 batch and doing backprop
    train_config.total_it = int(nb_it_per_epoch *
                                train_config.nb_training_epochs)

    train_objects.lr_schedule = CosineAnnealingSchedule(
        train_config.min_lr, train_config.max_lr,
        max(train_config.total_it * .7, 1))
    train_objects.lr_schedule = LinearWarmUp(
        train_objects.lr_schedule,
        start_lr=train_config.min_lr,
        length=max(train_config.total_it * .25, 1))
    train_objects.momentum_schedule = MomentumSchedule(
        train_objects.lr_schedule, train_config.min_lr, train_config.max_lr,
        train_config.min_momentum, train_config.max_momentum)

    input_shape = x_val[0].shape
    inputs = mx.sym.var('data', dtype='float32')
    value_out = symbol.get_internals()[main_config['value_output'] +
                                       '_output']
    policy_out = symbol.get_internals()[main_config['policy_output'] +
                                        '_output']
    sym = mx.symbol.Group([value_out, policy_out])
    net = mx.gluon.SymbolBlock(sym, inputs)
    net.collect_params().load(params_filename, ctx)

    metrics_gluon = {
        'value_loss':
        metric.MSE(name='value_loss', output_names=['value_output']),
        'value_acc_sign':
        metric.create(acc_sign,
                      name='value_acc_sign',
                      output_names=['value_output'],
                      label_names=['value_label']),
    }

    if train_config.sparse_policy_label:
        print("train with sparse labels")
        # the default cross entropy only supports sparse labels
        # NOTE: the original statement ended with a trailing comma, which
        # stored a 1-tuple here instead of the metric object.
        metrics_gluon['policy_loss'] = metric.CrossEntropy(
            name='policy_loss',
            output_names=['policy_output'],
            label_names=['policy_label'])
        metrics_gluon['policy_acc'] = metric.Accuracy(
            axis=1,
            name='policy_acc',
            output_names=['policy_output'],
            label_names=['policy_label'])
    else:
        metrics_gluon['policy_loss'] = metric.create(
            cross_entropy,
            name='policy_loss',
            output_names=['policy_output'],
            label_names=['policy_label'])
        metrics_gluon['policy_acc'] = metric.create(
            acc_distribution,
            name='policy_acc',
            output_names=['policy_output'],
            label_names=['policy_label'])

    train_objects.metrics = metrics_gluon

    train_config.export_weights = False  # don't save intermediate weights
    train_agent = TrainerAgent(net,
                               val_data,
                               train_config,
                               train_objects,
                               use_rtpt=False)

    # iteration counter used for the momentum and learning rate schedule
    cur_it = train_config.k_steps_initial * train_config.batch_steps
    (k_steps_final, val_value_loss_final, val_policy_loss_final,
     val_value_acc_sign_final,
     val_policy_acc_final), _ = train_agent.train(cur_it)

    prefix = "%smodel-%.5f-%.5f-%.3f-%.3f" % (
        model_contender_dir, val_value_loss_final, val_policy_loss_final,
        val_value_acc_sign_final, val_policy_acc_final)

    sym_file = prefix + "-symbol.json"
    params_file = prefix + "-" + "%04d.params" % nn_update_idx
    # the export function saves both the architecture and the weights
    net.export(prefix, epoch=nn_update_idx)
    print()
    logging.info("Saved checkpoint to %s-%04d.params", prefix, nn_update_idx)

    if convert_to_onnx:
        convert_mxnet_model_to_onnx(sym_file, params_file,
                                    ["value_out_output", "policy_out_output"],
                                    input_shape, [1, 8, 16], False)

    logging.info("k_steps_final %d" % k_steps_final)
    queue.put(k_steps_final)
train=True).transform_first(mnist_transformer), batch_size=batch_size, shuffle=False) mnist_net = MNistHybrid(no_class=no_class) # if we set verbose - the printed numbers will be a lot finer mnist_net.collect_params().initialize(init=mx.init.Xavier(), force_reinit=True, verbose=False, ctx=ctx) trainer = mx.gluon.Trainer(params=mnist_net.collect_params(), optimizer="sgd", optimizer_params={"learning_rate": lr}) loss_fun = mx.gluon.loss.SoftmaxCrossEntropyLoss() train_accuracy = metric.Accuracy() test_accuracy = metric.Accuracy() ninv_train = 1 / len(train_data) ninv_test = 1 / len(test_data) # reshaped as batch, no_channel, w, h sample_1 = mx.gluon.data.vision.MNIST(train=True)[0][0].reshape((1, 1, 28, 28)) plt.imshow(sample_1.asnumpy().reshape((28, 28)), cmap='gray') plt.show() if do_train: for a_epoch in range(epochs): train_loss, f_train_acc, f_val_acc = .0, .0, .0 tic = time() for a_batch in train_data:
def run_training(alpha, queue):
    """
    Train one network whose depth and width are scaled by ``alpha``.

    Depth is ``base_depth * alpha`` and channel count ``base_channels * beta``
    with ``beta = sqrt(2 / alpha)``. Relies on the module-level names ``tc``,
    ``to``, ``ctx``, ``cur_it``, ``main_config``, ``base_depth`` and
    ``base_channels``.

    :param alpha: depth scaling factor
    :param queue: receives a dict describing the run and its best validation metrics
    """
    _, x_val, yv_val, yp_val, plys_to_end, _ = load_pgn_dataset(
        dataset_type='val', part_id=0, verbose=True, normalize=tc.normalize)

    if tc.discount != 1:
        yv_val *= tc.discount**plys_to_end

    # Sparse policy label: index of the best move, optionally remapped to
    # the flattened plane representation.
    if tc.select_policy_from_plane:
        policy_label = np.array(FLAT_PLANE_IDX)[yp_val.argmax(axis=1)]
    else:
        policy_label = yp_val.argmax(axis=1)
    val_labels = {'value_label': yv_val, 'policy_label': policy_label}
    val_iter = mx.io.NDArrayIter({'data': x_val}, val_labels, tc.batch_size)

    tc.nb_parts = len(glob.glob(main_config['planes_train_dir'] + '**/*'))

    # calculate how many iterations per epoch exist
    nb_it_per_epoch = (len(x_val) * tc.nb_parts) // tc.batch_size
    # one iteration is defined by passing 1 batch and doing backprop
    tc.total_it = int(nb_it_per_epoch * tc.nb_training_epochs)

    ### Define a Learning Rate schedule
    one_cycle = OneCycleSchedule(start_lr=tc.max_lr / 8,
                                 max_lr=tc.max_lr,
                                 cycle_length=tc.total_it * .3,
                                 cooldown_length=tc.total_it * .6,
                                 finish_lr=tc.min_lr)
    to.lr_schedule = one_cycle
    to.lr_schedule = LinearWarmUp(to.lr_schedule,
                                  start_lr=tc.min_lr,
                                  length=tc.total_it / 30)

    ### Momentum schedule
    to.momentum_schedule = MomentumSchedule(to.lr_schedule, tc.min_lr,
                                            tc.max_lr, tc.min_momentum,
                                            tc.max_momentum)
    plot_schedule(to.momentum_schedule,
                  iterations=tc.total_it,
                  ylabel='Momentum')

    input_shape = x_val[0].shape

    beta = np.sqrt(2 / alpha)
    print("alpha:", alpha)
    print("beta:", beta)

    depth = int(round(base_depth * alpha))
    channels = int(round(base_channels * beta))
    kernels = [3] * depth
    se_types = [None] * len(kernels)
    channels_reduced = int(round(channels / 4))

    symbol = rise_mobile_v3_symbol(
        channels=channels,
        channels_operating_init=channels_reduced,
        act_type='relu',
        channels_value_head=8,
        value_fc_size=256,
        channels_policy_head=NB_POLICY_MAP_CHANNELS,
        grad_scale_value=tc.val_loss_factor,
        grad_scale_policy=tc.policy_loss_factor,
        dropout_rate=tc.dropout_rate,
        select_policy_from_plane=True,
        kernels=kernels,
        se_types=se_types)

    # create a trainable module on compute context
    model = mx.mod.Module(symbol=symbol,
                          context=ctx,
                          label_names=['value_label', 'policy_label'])
    data_shape = (tc.batch_size, input_shape[0], input_shape[1],
                  input_shape[2])
    model.bind(for_training=True,
               data_shapes=[('data', data_shape)],
               label_shapes=val_iter.provide_label)
    model.init_params(
        mx.initializer.Xavier(rnd_type='uniform',
                              factor_type='avg',
                              magnitude=2.24))

    to.metrics = [
        metric.MSE(name='value_loss',
                   output_names=['value_output'],
                   label_names=['value_label']),
        metric.CrossEntropy(name='policy_loss',
                            output_names=['policy_output'],
                            label_names=['policy_label']),
        metric.create(acc_sign,
                      name='value_acc_sign',
                      output_names=['value_output'],
                      label_names=['value_label']),
        metric.Accuracy(axis=1,
                        name='policy_acc',
                        output_names=['policy_output'],
                        label_names=['policy_label']),
    ]

    train_agent = TrainerAgentMXNET(model,
                                    symbol,
                                    val_iter,
                                    tc,
                                    to,
                                    use_rtpt=True)
    print("model.score(val_iter, to.metrics:",
          model.score(val_iter, to.metrics))

    # Start the training process
    _, (k_steps_best, val_metric_values_best) = train_agent.train(cur_it)

    new_row = {
        'alpha': alpha,
        'beta': beta,
        'depth': depth,
        'channels': channels,
        'k_steps_best': k_steps_best,
        'val_loss': val_metric_values_best['loss'],
        'val_value_loss': val_metric_values_best['value_loss'],
        'val_policy_loss': val_metric_values_best['policy_loss'],
        'val_policy_acc': val_metric_values_best['policy_acc'],
        'val_value_acc': val_metric_values_best['value_acc_sign']
    }
    queue.put(new_row)
    print(new_row)
def __init__(self, args):
    """
    Set up logging, data loaders, model and (in training mode) the optimizer.

    :param args: parsed options; the attributes read here are ``experiment_dir``,
        ``only_test``, ``use_tensorboard``, ``resume``, ``checkpoint_epoch``,
        ``epoch_test`` and ``epochs`` (``args`` is also forwarded to the helpers)
    """
    super(Trainer, self).__init__()
    self.args = args
    self.experiment_dir = args.experiment_dir
    if not osp.exists(self.experiment_dir):
        os.makedirs(self.experiment_dir)
        print("The experiment dir has been created:{}".format(
            self.experiment_dir))

    self.trainer_log = TrainerLog(args=args, append=True)
    self.ctx = set_ctx(args=args)
    self.check_point = CheckPoint(args=args,
                                  trainer_log=self.trainer_log,
                                  ctx=self.ctx)
    self.train_loader, self.test_loader = dataloader(args=args)

    # Training-only attributes get defaults up front so they always exist,
    # including in test-only mode where the original never assigned them.
    self.lr_scheduler = None
    self.optimizer = None
    self.trainer = None
    self.tb_writer = None
    self.loss_functions = None
    self.current_epoch = None
    self.model = None

    if self.train_loader is not None:
        self.train_samples_num = len(self.train_loader._dataset)
        print("train dataset samples: {}".format(self.train_samples_num))
    self.test_samples_num = len(self.test_loader._dataset)
    print("test dataset samples: {}".format(self.test_samples_num))

    self.resume_epoch = 0
    if args.only_test is False:
        if args.use_tensorboard is True:
            # Imported lazily so tensorboardX is only needed when requested.
            from tensorboardX import SummaryWriter
            self.tb_writer = SummaryWriter(
                log_dir=osp.join(args.experiment_dir, 'tensorboard'))
        else:
            self.tb_writer = None

        if args.resume is True:
            self.checkpoint_epoch = args.checkpoint_epoch
            self.model = get_networks(args=args, ctx=self.ctx)
            self.resume_epoch = self.check_point.load_checkpoint_parameters(
                epoch=self.checkpoint_epoch, model=self.model)
        else:
            self.model = get_networks(args=args, ctx=self.ctx)
            self.model.classifier.initialize(ctx=self.ctx)

        self.lr_scheduler = get_lr_scheduler(args=args,
                                             train_loader=self.train_loader)
        self.optimizer, self.trainer = set_optimizer(
            model=self.model, lr_scheduler=self.lr_scheduler, args=args)
        self.loss_functions = set_loss(args=args, tb_writer=self.tb_writer)
        self.current_epoch = None
    elif args.only_test is True:
        self.checkpoint_epoch = args.checkpoint_epoch
        self.model = get_networks(args=args, ctx=self.ctx)
        self.epoch_test = args.epoch_test
        _ = self.check_point.load_checkpoint_parameters(
            epoch=self.checkpoint_epoch,
            model=self.model,
            epoch_test=self.epoch_test)

    if self.lr_scheduler is not None:
        self.trainer_log.print_use_lr_scheduler()
    if self.optimizer is not None and self.trainer is not None:
        self.trainer_log.print_use_optimizer()
    if self.model is not None:
        self.trainer_log.print_use_network()

    self.test_accuracy_metric = metric.Accuracy()
    self.epochs = args.epochs
    self.train_total = 0
    self.best_accuracy = None
    self.current_accuracy = None