def predict(task):
    """Score every test image of ``task`` and write a submission CSV.

    A ``model_name`` network is built, its output layer is replaced by a
    dense layer with ``task_num_class`` units, and the parameters saved for
    this task are loaded onto ``mx.gpu(1)``. Every row of the question file
    whose second column equals ``task`` is scored and written as one
    ``path,task,p0;p1;...`` line.

    :param task: task name; selects the question rows and is part of the
        parameter-file and submission-file names
    """
    net = gluon.model_zoo.vision.get_model(model_name)
    with net.name_scope():
        net.output = nn.Dense(task_num_class)
    net.load_params('../../data/%s_%s.params' % (task, model_name), ctx=mx.gpu(1))
    logging.info('Training Finished. Starting Prediction.\n')

    # The context manager closes the submission file even when decoding or
    # inference raises part-way through the loop.
    with open('../../data/submission/%s_%s.csv' % (task, model_name), 'w') as f_out:
        with open('../../data/z_rank/Tests/question.csv', 'r') as f_in:
            lines = f_in.readlines()
        tokens = [l.rstrip().split(',') for l in lines]
        task_tokens = [t for t in tokens if t[1] == task]
        n = len(task_tokens)

        # Rows are expected to have exactly three comma-separated columns.
        for cnt, (path, row_task, _) in enumerate(task_tokens, start=1):
            img_path = os.path.join('../../data/z_rank', path)
            with open(img_path, 'rb') as f:
                img = image.imdecode(f.read())
            data = transform_predict(img)
            out = net(data.as_in_context(mx.gpu(1)))
            # Softmax per row, then average over the leading axis
            # (presumably one row per crop from transform_predict - TODO confirm).
            out = nd.SoftmaxActivation(out).mean(axis=0)

            pred_out = ';'.join(["%.8f" % (o) for o in out.asnumpy().tolist()])
            line_out = ','.join([path, row_task, pred_out])
            f_out.write(line_out + '\n')
            progressbar(cnt, n)
def predict(task):
    """Score every test image of ``task`` with the module-level ``net``.

    Rows of ``data/rank/Tests/question.csv`` whose second column equals
    ``task`` are scored on ``mx.gpu(0)``; one ``path,task,p0;p1;...`` line
    per image is written to ``submission/<task>.csv``.

    :param task: task name used to filter question rows and name the output
    """
    logging.info('Training Finished. Starting Prediction.\n')

    # Test results are written to this file; the context manager makes sure
    # it is closed even if an image fails to decode or inference raises.
    with open('submission/%s.csv'%(task), 'w') as f_out:
        # Load the test-set images and write the network output to the file.
        with open('data/rank/Tests/question.csv', 'r') as f_in:
            lines = f_in.readlines()
        tokens = [l.rstrip().split(',') for l in lines]
        task_tokens = [t for t in tokens if t[1] == task]
        n = len(task_tokens)

        # Rows are expected to have exactly three comma-separated columns.
        for cnt, (path, row_task, _) in enumerate(task_tokens, start=1):
            img_path = os.path.join('data/rank', path)
            with open(img_path, 'rb') as f:
                img = image.imdecode(f.read())
            data = transform_predict(img)
            out = net(data.as_in_context(mx.gpu(0)))
            # Softmax per row, then average over the leading axis.
            out = nd.SoftmaxActivation(out).mean(axis=0)

            pred_out = ';'.join(["%.8f"%(o) for o in out.asnumpy().tolist()])
            line_out = ','.join([path, row_task, pred_out])
            f_out.write(line_out + '\n')
            progressbar(cnt, n)
def validate(net, val_data, ctx):
    """Evaluate ``net`` on ``val_data``.

    For every batch the data and labels are split across ``ctx``, accuracy
    and softmax cross-entropy loss are accumulated, and the per-sample
    ``(softmax row, label)`` pairs are collected. The pairs are saved to
    ``task + model_name + '.npy'`` (module-level names) and that file is
    handed to ``cal_mAP``.

    :param net: network called on each data shard
    :param val_data: iterable of batches; ``batch[0]`` is data, ``batch[1]``
        is labels; must support ``len()``
    :param ctx: list of contexts passed to ``split_and_load``
    :return: tuple ``(mAP, accuracy, mean loss per batch)``
    """
    metric = mx.metric.Accuracy()
    L = gluon.loss.SoftmaxCrossEntropyLoss()
    val_loss = 0
    all_softmax_output = []
    mAP_name = task+model_name+'.npy'

    for batch in val_data:
        data = gluon.utils.split_and_load(batch[0], ctx_list=ctx, batch_axis=0, even_split=False)
        label = gluon.utils.split_and_load(batch[1], ctx_list=ctx, batch_axis=0, even_split=False)
        outputs = [net(X) for X in data]
        metric.update(label, outputs)

        loss = [L(yhat, y) for yhat, y in zip(outputs, label)]
        # Average of the per-shard mean losses for this batch.
        val_loss += sum([l.mean().asscalar() for l in loss]) / len(loss)

        # softmax_output and label are lists (one entry per context) whose
        # elements are MXNet NDArrays, not numpy arrays. Every shard is
        # recorded; previously only index 0 was, which dropped all samples
        # placed on the other contexts.
        softmax_output = [nd.SoftmaxActivation(output) for output in outputs]
        for probs, y in zip(softmax_output, label):
            all_softmax_output += zip(probs.asnumpy(), y.asnumpy())

    # Written once after all batches are collected; the file content is the
    # same as rewriting it after every batch.
    np.save(mAP_name, all_softmax_output)
    this_AP = cal_mAP(mAP_name)
    _, val_acc = metric.get()
    return ((this_AP, val_acc, val_loss / len(val_data)))
def predict_cropped_images(self, dataset_path, model_path, task, gpus, network='densenet201', loss_type='sfe'):
    """Score the images listed in ``self.task_tokens`` and write a CSV.

    ``self.task_tokens`` must already be populated; its entries start with
    ``(img_path, task)``. Results go to ``<task>.csv`` under
    ``self.output_submission_path`` as ``img_path,task,p0;p1;...`` lines.

    :param dataset_path: root directory the image paths are relative to
    :param model_path: parameter file loaded into the network
    :param task: task name; every token must carry this task
    :param gpus: not read by this method
    :param network: name passed to ``get_symbol``
    :param loss_type: not read by this method
    :raises AssertionError: if a token belongs to a different task
    """
    results_path = self.output_submission_path.joinpath('%s.csv'%(task))
    # The context manager closes the output even if model loading, decoding
    # or inference raises.
    with results_path.open('w+') as f_out:
        ctx = self.get_ctx()[0]
        net = get_symbol(network, task_class_num_list[task], ctx)
        net.load_params(model_path, ctx=ctx)
        logging.info("load model from %s" % model_path)

        for index, task_token in enumerate(self.task_tokens):
            img_path, raw_task = task_token[:2]
            assert raw_task == task, "task not match"
            with Path(dataset_path, img_path).open('rb') as f:
                raw_img = f.read()
            img = image.imdecode(raw_img)
            data = utils.transform_cropped_img(img)
            out = net(data.as_in_context(ctx))
            # Softmax per row, then average over the leading axis.
            out = nd.SoftmaxActivation(out).mean(axis=0)
            pred_out = ';'.join(["%.8f"%(o) for o in out.asnumpy().tolist()])
            line_out = ','.join([img_path, task, pred_out])
            f_out.write(line_out + '\n')
            utils.progressbar(index, len(self.task_tokens))
    logging.info("end predicting for %s, results saved at %s" % (task, results_path))
def predict(task):
    """Score every test image of ``task`` with multi-scale test-time augmentation.

    For each matching row of ``data2/week-rank/Tests/question.csv`` the
    image is transformed once per entry of the module-level ``input_scale``,
    the softmax outputs are averaged over the leading axis and then over the
    scales, and a ``path,task,p0;p1;...`` line is written to
    ``submission/<task>.csv``.

    :param task: task name; filters the rows and indexes ``task_list`` for
        the number of output values
    """
    logging.info('Training Finished. Starting Prediction.\n')

    # The context manager closes the submission file even if an image fails
    # to decode or inference raises.
    with open('submission/%s.csv' % (task), 'w') as f_out:
        with open('data2/week-rank/Tests/question.csv', 'r') as f_in:
            lines = f_in.readlines()
        tokens = [l.rstrip().split(',') for l in lines]
        task_tokens = [t for t in tokens if t[1] == task]

        # Rows are expected to have exactly three comma-separated columns.
        for path, row_task, _ in task_tokens:
            img_path = os.path.join('data2/week-rank', path)
            with open(img_path, 'rb') as f:
                img = image.imdecode(f.read())

            out_all = np.zeros([task_list[row_task], ])
            # Test-time augmentation: one forward pass per input scale.
            for scale in input_scale:
                data = transform_predict(img, scale)
                with ag.predict_mode():
                    # Original author's note: ten random crops are taken, so
                    # this is the result for ten images.
                    out = net(data.as_in_context(mx.gpu(0)))
                    # Softmax, then average over those results.
                    out = nd.SoftmaxActivation(out).mean(axis=0)
                    out_all += out.asnumpy()
            out = out_all / len(input_scale)

            pred_out = ';'.join(["%.8f" % (o) for o in out.tolist()])
            line_out = ','.join([path, row_task, pred_out])
            f_out.write(line_out + '\n')
def predict(task, saved_path):
    """Load a saved model and write predictions for every test image of ``task``.

    The network comes from ``build_model()`` and runs on the first entry of
    the module-level ``num_gpus`` (CPU when that list is empty). One
    ``path,task,p0;p1;...`` line per image goes to ``submission/<task>.csv``.

    :param task: task name used to filter question rows and name the output
    :param saved_path: parameter file loaded into the predictor network
    """
    logging.info('Training Finished. Starting Prediction.\n')
    rank_root = '/data/fashion/data/attribute/datasets_david/rank'

    # The context manager closes the submission file even if model loading,
    # decoding or inference raises.
    with open('submission/%s.csv' % (task), 'w+') as f_out:
        with open(rank_root + '/Tests/question.csv', 'r') as f_in:
            lines = f_in.readlines()
        tokens = [l.rstrip().split(',') for l in lines]
        task_tokens = [t for t in tokens if t[1] == task]
        n = len(task_tokens)

        predictor_net = build_model()
        predictor_ctx = mx.gpu(num_gpus[0]) if len(num_gpus) > 0 else mx.cpu()
        predictor_net.load_params(saved_path, ctx=predictor_ctx)
        logging.info("load model from %s" % saved_path)

        # Rows are expected to have exactly three comma-separated columns.
        for cnt, (path, row_task, _) in enumerate(task_tokens, start=1):
            img_path = os.path.join(rank_root, path)
            with open(img_path, 'rb') as f:
                img = image.imdecode(f.read())
            data = transform_predict(img)
            out = predictor_net(data.as_in_context(predictor_ctx))
            # Softmax per row, then average over the leading axis.
            out = nd.SoftmaxActivation(out).mean(axis=0)

            pred_out = ';'.join(["%.8f" % (o) for o in out.asnumpy().tolist()])
            line_out = ','.join([path, row_task, pred_out])
            f_out.write(line_out + '\n')
            progressbar(cnt, n)
def predict(x, net, ctx): anchors, cls_preds, box_preds = net(x.as_in_context(ctx)) cls_probs = nd.SoftmaxActivation( cls_preds.transpose((0, 2, 1)), mode='channel') return MultiBoxDetection(cls_probs, box_preds, anchors, force_suppress=True, clip=False) '''
def detect_image(img_file):
    """Run the pretrained ``ToySSD`` detector on one image and plot the result.

    The image is fitted to ``data_shape`` x ``data_shape``, mean-subtracted
    and passed through a freshly built ``ToySSD`` loaded from
    ``ssd_pretrained.params`` on CPU. Detections scoring at least 0.69 are
    drawn on the image, which is saved to ``result.png`` and shown.

    :param img_file: path of the image to analyse
    :return: ``None``; when ``img_file`` does not exist a message is printed
        and nothing else happens
    """
    if not os.path.exists(img_file):
        print('can not find image: ', img_file)
        # Stop here: previously execution fell through to Image.open, which
        # then failed on the very file just reported as missing.
        return
    img = Image.open(img_file)
    img = ImageOps.fit(img, [data_shape, data_shape], Image.ANTIALIAS)
    print(img)
    origin_img = np.array(img)
    img = origin_img - np.array([123, 117, 104])
    # organize as [batch-channel-height-width]
    img = np.transpose(img, (2, 0, 1))
    img = img[np.newaxis, :]
    # convert to ndarray
    img = nd.array(img)
    print('input image shape: ', img.shape)

    net = ToySSD(num_class)
    ctx = mx.cpu()
    net.initialize(mx.init.Xavier(magnitude=2), ctx=ctx)
    net.collect_params().reset_ctx(ctx)
    params = 'ssd_pretrained.params'
    net.load_params(params, ctx=ctx)

    anchors, cls_preds, box_preds = net(img.as_in_context(ctx))
    print('anchors', anchors)
    print('class predictions', cls_preds)
    print('box delta predictions', box_preds)

    # convert predictions to probabilities using softmax
    cls_probs = nd.SoftmaxActivation(nd.transpose(cls_preds, (0, 2, 1)), mode='channel')
    # apply shifts to anchors boxes, non-maximum-suppression, etc...
    output = MultiBoxDetection(*[cls_probs, box_preds, anchors], force_suppress=True, clip=False)
    output = output.asnumpy()
    print(output)
    print(output.shape)

    pens = dict()
    plt.imshow(origin_img)
    thresh = 0.69
    # (width, height, width, height): scales the box coordinates to pixels.
    scales = [origin_img.shape[1], origin_img.shape[0]] * 2
    for det in output[0]:
        cid = int(det[0])
        if cid < 0:
            continue
        score = det[1]
        if score < thresh:
            continue
        if cid not in pens:
            # One random colour per class id, reused for all its boxes.
            pens[cid] = (random.random(), random.random(), random.random())
        xmin, ymin, xmax, ymax = [int(p * s) for p, s in zip(det[2:6].tolist(), scales)]
        rect = plt.Rectangle((xmin, ymin), xmax - xmin, ymax - ymin, fill=False,
                             edgecolor=pens[cid], linewidth=3)
        plt.gca().add_patch(rect)
        text = class_names[cid]
        plt.gca().text(xmin, ymin - 2, '{:s} {:.3f}'.format(text, score),
                       bbox=dict(facecolor=pens[cid], alpha=0.5),
                       fontsize=12, color='white')
    plt.axis('off')
    plt.savefig('result.png', dpi=100)
    plt.show()
def predict_bounding_boxes(net, image, bb):
    '''
    Run the SSD network on one dataset sample and return predicted and
    labelled boxes, both converted from corner form to (x, y, w, h).

    Parameters
    ----------
    net: SSD
        The trained SSD network.

    image: np.array
        An image from the dataset (documented upstream as a grayscale image
        of handwriting passages).

    bb: [(x1, y1, x2, y2)]
        The bounding box annotation that accompanies the image.

    Returns
    -------
    predicted_bb: [(x, y, w, h)]
        Boxes kept after non-maximum suppression.

    actual_bb: [(x, y, w, h)]
        The labelled boxes.
    '''
    device = ctx[0]
    image, bb = transform(image, bb)
    # Add a leading batch axis of size one to both inputs.
    image = image.as_in_context(device).expand_dims(axis=0)
    bb = bb.as_in_context(device).expand_dims(axis=0)

    anchors, cls_scores, box_offsets = net(image)
    # The targets are not used below; the call is kept as in the original.
    _box_target, _box_mask, _cls_target = net.training_targets(
        anchors, cls_scores, bb)

    probabilities = nd.SoftmaxActivation(
        nd.transpose(cls_scores, (0, 2, 1)), mode='channel')
    detections = MultiBoxDetection(
        probabilities, box_offsets, anchors, force_suppress=True, clip=False)
    detections = box_nms(detections,
                         overlap_thresh=overlap_thres,
                         valid_thresh=min_c,
                         topk=topk)
    detections = detections.asnumpy()

    # Keep rows of the single batch entry whose first column is not -1,
    # then drop the first two columns so only the coordinates remain.
    kept = detections[0, detections[0, :, 0] != -1]
    predicted_bb = kept[:, 2:]
    # Corner form -> width / height.
    predicted_bb[:, 2] -= predicted_bb[:, 0]
    predicted_bb[:, 3] -= predicted_bb[:, 1]

    labeled_bb = bb[:, :, 1:].asnumpy()
    labeled_bb[:, :, 2] -= labeled_bb[:, :, 0]
    labeled_bb[:, :, 3] -= labeled_bb[:, :, 1]
    return predicted_bb, labeled_bb[0]
def inference(x, epochs= 295):
    """Load the ``ToySSD`` checkpoint for ``epochs`` and run detection on ``x``.

    The seconds elapsed between just before the parameter load and just
    after the ``MultiBoxDetection`` call are printed.

    :param x: input batch; moved to ``mx.cpu(1)``
    :param epochs: number used to build the checkpoint file name
    :return: output of ``MultiBoxDetection``
    """
    device = mx.cpu(1)
    detector = ToySSD(1)
    started = time.time()
    detector.load_params('models/ssd_%d.params' % epochs, device)

    anchor_boxes, class_scores, box_offsets = detector(x.as_in_context(device))
    # Swap the last two axes so the softmax runs in 'channel' mode.
    class_probabilities = nd.SoftmaxActivation(
        nd.transpose(class_scores, (0, 2, 1)), mode='channel')
    detections = MultiBoxDetection(
        class_probabilities,
        box_offsets,
        anchor_boxes,
        force_suppress=False,
        clip=False,
        nms_threshold=0.001,
    )
    finished = time.time()
    print(finished - started)
    return detections
def forward(img_path, net):
    """Read an image with OpenCV, run ``net`` on it and return both.

    :param img_path: path handed to ``cv2.imread``
    :param net: network returning ``(anchors, class predictions, box predictions)``
    :return: ``(original image, MultiBoxDetection output)``
    """
    device = mx.gpu(1)
    original = cv2.imread(img_path)
    network_input = preprocess(original).as_in_context(device)

    anchor_boxes, class_scores, box_offsets = net(network_input)
    # Swap the last two axes so the softmax runs in 'channel' mode.
    class_probabilities = nd.SoftmaxActivation(
        nd.transpose(class_scores, (0, 2, 1)), mode='channel')
    detections = MultiBoxDetection(
        class_probabilities,
        box_offsets,
        anchor_boxes,
        force_suppress=True,
        clip=True,
        nms_threshold=0.01,
    )
    return original, detections
def inference(x, epochs=295):
    """Load the ``ToySSD`` checkpoint for ``epochs`` and run detection on ``x``.

    :param x: input batch; moved to ``mx.cpu(1)``
    :param epochs: number used to build the checkpoint file name
    :return: output of ``MultiBoxDetection``
    """
    device = mx.cpu(1)
    detector = ToySSD(1)
    detector.load_params('models/ssd_%d.params' % epochs, device)
    print("load sucecuss")

    anchor_boxes, class_scores, box_offsets = detector(x.as_in_context(device))
    # Swap the last two axes so the softmax runs in 'channel' mode.
    class_probabilities = nd.SoftmaxActivation(
        nd.transpose(class_scores, (0, 2, 1)), mode='channel')
    return MultiBoxDetection(
        class_probabilities,
        box_offsets,
        anchor_boxes,
        force_suppress=True,
        clip=False,
    )
def evaluate_metrics(metrics, data_iterator, net, nb_batches=None, ctx=mx.gpu(), sparse_policy_label=False,
                     apply_select_policy_from_plane=True):
    """
    Runs inference of the network on a data_iterator object and evaluates the given metrics.
    The metric results are returned as a dictionary object.

    :param metrics: Mapping of mxnet metrics with the keys
     'value_loss', 'policy_loss', 'value_acc_sign' and 'policy_acc'
    :param data_iterator: Iterable yielding (data, value_label, policy_label) batches
    :param net: Gluon network handle returning [value_out, policy_out]
    :param nb_batches: Number of batches to evaluate (early stopping).
     If None or 0, all batches of the data_iterator are evaluated
    :param ctx: MXNET data context
    :param sparse_policy_label: If true, policy_label is passed to the accuracy metric as is;
     otherwise its argmax along axis 1 is used
    :param apply_select_policy_from_plane: If true, the policy output is indexed with FLAT_PLANE_IDX
     before the metrics are updated
    :return: Dictionary with the combined "loss" (0.01 * value loss + 0.99 * policy loss) and one entry
     per metric, keyed by the name the metric reports
    """
    reset_metrics(metrics)
    for i, (data, value_label, policy_label) in enumerate(data_iterator):
        data = data.as_in_context(ctx)
        value_label = value_label.as_in_context(ctx)
        policy_label = policy_label.as_in_context(ctx)
        [value_out, policy_out] = net(data)
        # Block until the forward pass has produced its result.
        value_out[0][0].wait_to_read()
        if apply_select_policy_from_plane:
            policy_out = policy_out[:, FLAT_PLANE_IDX]
        # update the metrics
        metrics["value_loss"].update(preds=value_out, labels=value_label)
        metrics["policy_loss"].update(preds=nd.SoftmaxActivation(policy_out), labels=policy_label)
        metrics["value_acc_sign"].update(preds=value_out, labels=value_label)
        metrics["policy_acc"].update(preds=nd.argmax(policy_out, axis=1),
                                     labels=policy_label if sparse_policy_label else nd.argmax(policy_label, axis=1))
        # stop after evaluating nb_batches batches (only recommended for the train set evaluation).
        # i is zero-based, so batch number i + 1 has just been processed; comparing i itself
        # evaluated one batch more than requested.
        if nb_batches and i + 1 >= nb_batches:
            break

    metric_values = {
        "loss": 0.01 * metrics["value_loss"].get()[1] + 0.99 * metrics["policy_loss"].get()[1]
    }
    for metric in metrics.values():
        # Query each metric once instead of once for the name and once for the value.
        name, value = metric.get()
        metric_values[name] = value
    return metric_values
def classify_hotdog(net, fname):
    """Classify the image at ``fname`` and describe the top prediction.

    The transformed image is also displayed with ``plt.imshow``.

    :param net: network called on the single-image batch
    :param fname: path of the image file
    :return: string ``'With prob=<p>, <label>'`` for the most likely class,
        with the label taken from ``train_imgs.synsets``
    """
    with open(fname, 'rb') as image_file:
        raw_bytes = image_file.read()
    decoded = image.imdecode(raw_bytes)
    sample, _ = transform(decoded, -1, test_augs)

    # Show the transformed image, rescaled by 1/255 for display.
    plt.imshow(sample.transpose((1, 2, 0)).asnumpy() / 255)

    single_batch = sample.expand_dims(axis=0).as_in_context(ctx[0])
    probabilities = nd.SoftmaxActivation(net(single_batch))
    best_class = int(nd.argmax(probabilities, axis=1).asscalar())
    best_probability = probabilities[0][best_class].asscalar()
    class_names = train_imgs.synsets
    return 'With prob=%f, %s' % (best_probability, class_names[best_class])
def predict_cv(net, ctx, fname, label):
    """Classify an image read with OpenCV and describe the top prediction.

    The image is resized to ``image_size`` x ``image_size``, transformed,
    displayed with ``plt.imshow`` and scored. The probability and class
    index are printed before the summary string is returned.

    :param net: network called on the single-image batch
    :param ctx: context the batch is moved to
    :param fname: path handed to ``cv2.imread``
    :param label: mapping from the class index as a string to a class name
    :return: summary string with the confidence and the class name
    """
    resized = cv2.resize(cv2.imread(fname), (image_size, image_size))
    sample, _ = transform(nd.array(resized), -1)

    # Show the transformed image, rescaled by 1/255 for display.
    plt.imshow(sample.transpose((1, 2, 0)).asnumpy() / 255)

    single_batch = sample.expand_dims(axis=0).as_in_context(ctx)
    probabilities = nd.SoftmaxActivation(net(single_batch))
    best_class = int(nd.argmax(probabilities, axis=1).asscalar())
    best_probability = probabilities[0][best_class].asscalar()
    print(best_probability, best_class)
    return '置信度=%f, 类别 %s' % (best_probability, label[str(best_class)])
def score_image(image_base64_string):
    """Score a base64-encoded image and return a percentage for class 1.

    The decoded bytes are written to ``target_image.jpg`` in the current
    directory, read back, transformed with ``test_augs`` and passed through
    the module-level ``net``.

    :param image_base64_string: base64 text of the image file contents
    :return: softmax probability of class index 1 for the single image,
        multiplied by 100 and truncated to ``int``
    """
    # The with-block closes the file; no explicit close() is needed inside it.
    with open('target_image.jpg', 'wb') as f:
        f.write(base64.b64decode(image_base64_string))

    with open('target_image.jpg', 'rb') as f:
        img = image.imdecode(f.read())
    data, _ = transform(img, -1, test_augs)
    # A transpose/asnumpy expression whose result was thrown away used to sit
    # here; it had no effect on the score and has been dropped.
    data = data.expand_dims(axis=0)
    net.forward(batch([data]), is_train=False)
    out = net.get_outputs()[0]
    out = nd.SoftmaxActivation(out)
    return int(out[0][1].asscalar() * 100)
def classify(fname):
    """Classify the image at ``fname`` and return its class name.

    The label names come from the ``synsets`` of an ``ImageFolderDataset``
    built over the ``train`` directory; the network is built with
    ``get_net`` and loaded from ``model.params`` on every call.

    :param fname: path of the image file
    :return: the ``synsets`` entry of the highest-scoring class
    """
    label_source = vision.ImageFolderDataset('train', flag=1, transform=transform_train)

    with open(fname, 'rb') as image_file:
        decoded = image.imdecode(image_file.read())
    # Scale to [0, 1] floats and resize to 32 x 32.
    resized = image.imresize(decoded.astype('float32') / 255, 32, 32)
    # Move the last axis to the front and add a leading batch axis.
    single_batch = nd.transpose(resized, (2, 0, 1)).expand_dims(axis=0)

    classifier = get_net(mx.cpu(0))
    classifier.load_params('model.params', mx.cpu(0))
    scores = classifier(single_batch.as_in_context(mx.cpu(0)))
    probabilities = nd.SoftmaxActivation(scores)
    best_class = int(nd.argmax(probabilities, axis=1).asscalar())
    return label_source.synsets[best_class]
def evaluate_metrics(metrics, data_iterator, net, nb_batches=None, ctx=mx.gpu()):
    """
    Runs inference of the network on a data_iterator object and evaluates the given metrics.
    The metric results are returned as a dictionary object.

    :param metrics: Mapping of mxnet metrics with the keys
     'value_loss', 'policy_loss', 'value_acc_sign' and 'policy_acc'
    :param data_iterator: Iterable yielding (data, value_label, policy_label) batches
    :param net: Gluon network handle returning [value_out, policy_out]
    :param nb_batches: Number of batches to evaluate (early stopping).
     If set to None all batches of the data_iterator will be evaluated
    :param ctx: MXNET data context
    :return: Dictionary with the combined "loss" (0.01 * value loss + 0.99 * policy loss) and one entry
     per metric, keyed by the name the metric reports
    """
    reset_metrics(metrics)
    for i, (data, value_label, policy_label) in enumerate(data_iterator):
        data = data.as_in_context(ctx)
        value_label = value_label.as_in_context(ctx)
        policy_label = policy_label.as_in_context(ctx)
        [value_out, policy_out] = net(data)
        # update the metrics
        metrics["value_loss"].update(preds=value_out, labels=value_label)
        metrics["policy_loss"].update(preds=nd.SoftmaxActivation(policy_out), labels=policy_label)
        metrics["value_acc_sign"].update(preds=value_out, labels=value_label)
        metrics["policy_acc"].update(preds=nd.argmax(policy_out, axis=1), labels=policy_label)
        # stop after evaluating nb_batches batches (only recommended for the train set evaluation).
        # i is zero-based, so batch number i + 1 has just been processed; comparing i itself
        # evaluated one batch more than requested.
        if nb_batches is not None and i + 1 >= nb_batches:
            break

    metric_values = {}
    metric_values["loss"] = 0.01 * metrics["value_loss"].get()[1] + 0.99 * metrics["policy_loss"].get()[1]
    for metric in metrics.values():
        # Query each metric once instead of once for the name and once for the value.
        name, value = metric.get()
        metric_values[name] = value
    return metric_values
def default_train_fn(epoch, num_epochs, net, batch, batch_size, criterion, trainer, batch_fn, ctx,
                     mixup=False, label_smoothing=False, distillation=False,
                     mixup_alpha=0.2, mixup_off_epoch=0, classes=1000,
                     dtype='float32', metric=None, teacher_prob=None):
    """Run one training step on ``batch`` and optionally update ``metric``.

    :param epoch: current epoch index
    :param num_epochs: total number of epochs; with ``mixup_off_epoch`` it
        decides when the mixing weight is pinned to 1
    :param net: network being trained
    :param batch: raw batch handed to ``batch_fn``
    :param batch_size: value passed to ``trainer.step``
    :param criterion: loss callable; receives a third argument (teacher
        output) when ``distillation`` is true
    :param trainer: object whose ``step`` applies the update
    :param batch_fn: callable ``(batch, ctx) -> (data list, label list)``
    :param ctx: contexts passed through to ``batch_fn``
    :param mixup: blend each data shard with its reversed self
    :param label_smoothing: smooth the labels (and, with mixup, use eta 0.1)
    :param distillation: use ``teacher_prob(data)`` as extra loss input
    :param mixup_alpha: both parameters of the Beta draw for the mix weight
    :param mixup_off_epoch: number of final epochs with the mix weight at 1
    :param classes: number of classes for the label transforms
    :param dtype: dtype the inputs and labels are cast to
    :param metric: metric to update; skipped when falsy
    :param teacher_prob: callable producing teacher outputs per data shard
    :return: ``metric`` when it is truthy, otherwise ``None``
    """
    data, label = batch_fn(batch, ctx)

    if mixup:
        # The Beta sample is always drawn, even when it is overridden below.
        mix_weight = np.random.beta(mixup_alpha, mixup_alpha)
        if epoch >= num_epochs - mixup_off_epoch:
            mix_weight = 1
        data = [mix_weight * shard + (1 - mix_weight) * shard[::-1] for shard in data]
        smoothing = 0.1 if label_smoothing else 0.0
        label = mixup_transform(label, classes, mix_weight, smoothing)
    elif label_smoothing:
        # Keep the untouched labels for the metric update further down.
        hard_label = label
        label = smooth(label, classes)

    with mx.autograd.record():
        outputs = [net(shard.astype(dtype, copy=False)) for shard in data]
        if distillation:
            per_shard_loss = [
                criterion(prediction.astype('float', copy=False),
                          target.astype('float', copy=False),
                          teacher.astype('float', copy=False))
                for prediction, target, teacher in zip(outputs, label, teacher_prob(data))
            ]
        else:
            per_shard_loss = [
                criterion(prediction, target.astype(dtype, copy=False))
                for prediction, target in zip(outputs, label)
            ]
    for shard_loss in per_shard_loss:
        shard_loss.backward()
    trainer.step(batch_size, ignore_stale_grad=True)

    if not metric:
        return

    if mixup:
        # With mixed labels the metric is fed softmax outputs.
        softmaxed = [nd.SoftmaxActivation(scores.astype('float32', copy=False))
                     for scores in outputs]
        metric.update(label, softmaxed)
    elif label_smoothing:
        metric.update(hard_label, outputs)
    else:
        metric.update(label, outputs)
    return metric
def predict_mxnet(net, ctx, fname, label):
    '''
    Predict the class of an image using mxnet.

    :param net: trained model
    :param ctx: data context
    :param fname: path of the image
    :param label: label dictionary, keyed by the class index as a string
    :return: string with the predicted class and its probability
    '''
    with open(fname, 'rb') as image_file:
        decoded = image.imdecode(image_file.read())
    # Force the image to image_size x image_size before the test transform.
    resized = image.ForceResizeAug((image_size, image_size))(decoded)
    sample, _ = transform_test(resized, -1)

    single_batch = sample.expand_dims(axis=0).as_in_context(ctx)
    probabilities = nd.SoftmaxActivation(net(single_batch))
    best_class = int(nd.argmax(probabilities, axis=1).asscalar())
    best_probability = probabilities[0][best_class].asscalar()
    return '置信度=%f, 类别 %s' % (best_probability, label[str(best_class)])
def predict(net, ctx, input_dir, threshold=0.9):
    '''
    Use a parameter-loaded CNN to predict the class probabilities of every
    image under ``input_dir`` and delete images that exceed the threshold.

    ``input_dir`` is expected to contain one sub-directory per movie, each
    holding image files. Images that cannot be read or decoded are reported
    and skipped. For every other image the averaged probabilities are
    written with ``writeResult``; the image is removed first when the
    probability at index 2 or index 3 is above ``threshold``.

    :param net: param loaded CNN net
    :param ctx: computing device
    :param input_dir: input image directory
    :param threshold: probability threshold
    :return None:
    '''
    movie_list = os.listdir(input_dir)  # movie list
    movie_list.sort()
    for movie in movie_list:
        image_list = os.listdir(os.path.join(input_dir, movie))  # image list
        for _image in image_list:
            image_file = os.path.join(input_dir, movie, _image)
            try:
                # try to read and decode
                with open(image_file, 'rb') as f:
                    img = image.imdecode(f.read())
            except Exception as e:
                # The message previously ended in a bare '%', so formatting it
                # raised ValueError and aborted the run instead of skipping.
                print('Fail to read image %s in movie %s' % (_image, movie))
                print('And the error is ', e)
                continue
            # predict
            data = transformPredict(img)
            data = data.as_in_context(ctx)
            out = net(data)
            out = nd.SoftmaxActivation(out).mean(axis=0)  # softmax process
            out = out.asnumpy().tolist()  # array to list
            # judge and delete
            if (out[2] > threshold) or (out[3] > threshold):
                os.remove(image_file)
                # you can just write the result into file without doing anything.
            out = [str(number) for number in out]
            string = '%s:%s' % (image_file, ','.join(out))
            writeResult(string + '\n')
            # you can also move these images to another directory
        print('Movie %s finished.' % movie)
def train(ctx):
    """Train the module-level ``net`` according to ``opt``.

    Per epoch: iterate ``train_data``, optionally apply mixup or label
    smoothing, back-propagate, step the trainer and update ``train_metric``.
    After each epoch ``test`` is run on ``val_data``; the best top-1 error
    and periodic checkpoints are saved under ``save_dir``.

    :param ctx: a single ``mx.Context`` or a list of contexts
    """
    if isinstance(ctx, mx.Context):
        ctx = [ctx]
    # Compare by value: `is ''` tests object identity, which is not a
    # reliable way to detect an empty string.
    if opt.resume_params == '':
        net.initialize(mx.init.MSRAPrelu(), ctx=ctx)

    if opt.no_wd:
        # No weight decay on the parameters matched by this pattern.
        for _, v in net.collect_params('.*beta|.*gamma|.*bias').items():
            v.wd_mult = 0.0

    trainer = gluon.Trainer(net.collect_params(), optimizer, optimizer_params)
    if opt.resume_states != '':
        trainer.load_states(opt.resume_states)

    if opt.label_smoothing or opt.mixup:
        L = gluon.loss.SoftmaxCrossEntropyLoss(sparse_label=False)
    else:
        L = gluon.loss.SoftmaxCrossEntropyLoss()

    best_val_score = 1
    first_fwd = True

    for epoch in range(opt.resume_epoch, opt.num_epochs):
        tic = time.time()
        if opt.use_rec:
            train_data.reset()
        train_metric.reset()
        btic = time.time()

        for i, batch in enumerate(train_data):
            data, label = batch_fn(batch, ctx)

            if opt.mixup:
                lam = np.random.beta(opt.mixup_alpha, opt.mixup_alpha)
                if epoch >= opt.num_epochs - opt.mixup_off_epoch:
                    lam = 1
                data = [lam * X + (1 - lam) * X[::-1] for X in data]

                if opt.label_smoothing:
                    eta = 0.1
                else:
                    eta = 0.0
                label = mixup_transform(label, classes, lam, eta)

            elif opt.label_smoothing:
                # Keep the untouched labels for the metric update below.
                hard_label = label
                label = smooth(label, classes)

            with ag.record():
                outputs = [
                    net(X.astype(opt.dtype, copy=False)) for X in data
                ]
                loss = [
                    L(yhat, y.astype(opt.dtype, copy=False))
                    for yhat, y in zip(outputs, label)
                ]
            for l in loss:
                l.backward()

            if epoch == 0 and first_fwd and opt.mode == 'hybrid':
                # Export and print the network summary once, after the
                # first forward pass.
                net.export("/tmp/net")
                mx.visualization.print_summary(
                    mx.symbol.load('/tmp/net-symbol.json'),
                    shape={
                        'data': (batch_size, 3, opt.input_size, opt.input_size)
                    })
                first_fwd = False

            trainer._optimizer.lr_scheduler.update(i, epoch)
            lr_scheduler.update(i, epoch)
            trainer.step(batch_size)

            if opt.mixup:
                # With mixed labels the metric is fed softmax outputs.
                output_softmax = [nd.SoftmaxActivation(out.astype('float32', copy=False)) \
                                  for out in outputs]
                train_metric.update(label, output_softmax)
            else:
                if opt.label_smoothing:
                    train_metric.update(hard_label, outputs)
                else:
                    train_metric.update(label, outputs)

            if opt.log_interval and not (i + 1) % opt.log_interval:
                train_metric_name, train_metric_score = train_metric.get()
                logger.info(
                    'Epoch[%d] Batch [%d]\tSpeed: %f samples/sec\t%s=%f\tlr=%f'
                    % (epoch, i, batch_size * opt.log_interval /
                       (time.time() - btic), train_metric_name,
                       train_metric_score, trainer.learning_rate))
                btic = time.time()

        train_metric_name, train_metric_score = train_metric.get()
        throughput = int(batch_size * i / (time.time() - tic))

        err_top1_val, err_top5_val = test(ctx, val_data)

        logger.info('[Epoch %d] training: %s=%f' %
                    (epoch, train_metric_name, train_metric_score))
        logger.info('[Epoch %d] speed: %d samples/sec\ttime cost: %f' %
                    (epoch, throughput, time.time() - tic))
        logger.info('[Epoch %d] validation: err-top1=%f err-top5=%f' %
                    (epoch, err_top1_val, err_top5_val))

        if err_top1_val < best_val_score:
            best_val_score = err_top1_val
            net.save_parameters(
                '%s/%.4f-imagenet-%s-%d-best.params' %
                (save_dir, best_val_score, model_name, epoch))
            trainer.save_states(
                '%s/%.4f-imagenet-%s-%d-best.states' %
                (save_dir, best_val_score, model_name, epoch))

        if save_frequency and save_dir and (epoch + 1) % save_frequency == 0:
            net.save_parameters('%s/imagenet-%s-%d.params' %
                                (save_dir, model_name, epoch))
            trainer.save_states('%s/imagenet-%s-%d.states' %
                                (save_dir, model_name, epoch))

    if save_frequency and save_dir:
        net.save_parameters('%s/imagenet-%s-%d.params' %
                            (save_dir, model_name, opt.num_epochs - 1))
        trainer.save_states('%s/imagenet-%s-%d.states' %
                            (save_dir, model_name, opt.num_epochs - 1))
def train(epochs, ctx):
    """Train the module-level ``net`` on CIFAR-10 with mixup.

    Every batch is blended with its reversed self using a Beta-distributed
    weight (pinned to 1 for the last 20 epochs). After each epoch the
    validation set is evaluated with ``test``, the training history plot is
    refreshed and the best and periodic checkpoints are saved.

    :param epochs: number of epochs to run
    :param ctx: a single ``mx.Context`` or a list of contexts
    """
    if isinstance(ctx, mx.Context):
        ctx = [ctx]
    net.initialize(mx.init.Xavier(), ctx=ctx)

    train_data = gluon.data.DataLoader(
        gluon.data.vision.CIFAR10(train=True).transform_first(transform_train),
        batch_size=batch_size,
        shuffle=True,
        last_batch='discard',
        num_workers=num_workers)

    val_data = gluon.data.DataLoader(
        gluon.data.vision.CIFAR10(train=False).transform_first(transform_test),
        batch_size=batch_size,
        shuffle=False,
        num_workers=num_workers)

    trainer = gluon.Trainer(net.collect_params(), optimizer, {
        'learning_rate': opt.lr,
        'wd': opt.wd,
        'momentum': opt.momentum
    })
    train_metric = mx.metric.RMSE()
    loss_fn = gluon.loss.SoftmaxCrossEntropyLoss(sparse_label=False)
    train_history = TrainingHistory(['training-error', 'validation-error'])

    lr_decay_count = 0
    best_val_score = 0

    for epoch in range(epochs):
        tic = time.time()
        train_metric.reset()
        train_loss = 0
        num_batch = len(train_data)
        alpha = 1

        if epoch == lr_decay_epoch[lr_decay_count]:
            trainer.set_learning_rate(trainer.learning_rate * lr_decay)
            lr_decay_count += 1

        for i, batch in enumerate(train_data):
            lam = np.random.beta(alpha, alpha)
            if epoch >= epochs - 20:
                lam = 1

            data_1 = gluon.utils.split_and_load(batch[0], ctx_list=ctx, batch_axis=0)
            label_1 = gluon.utils.split_and_load(batch[1], ctx_list=ctx, batch_axis=0)

            # Mix every shard with its reversed self; labels get the same weights.
            data = [lam * X + (1 - lam) * X[::-1] for X in data_1]
            label = []
            for Y in label_1:
                y1 = label_transform(Y, classes)
                y2 = label_transform(Y[::-1], classes)
                label.append(lam * y1 + (1 - lam) * y2)

            with ag.record():
                output = [net(X) for X in data]
                loss = [loss_fn(yhat, y) for yhat, y in zip(output, label)]
            for l in loss:
                l.backward()
            trainer.step(batch_size)
            train_loss += sum([l.sum().asscalar() for l in loss])

            output_softmax = [nd.SoftmaxActivation(out) for out in output]
            train_metric.update(label, output_softmax)

        train_loss /= batch_size * num_batch
        name, acc = train_metric.get()
        # One validation pass per epoch: the weights do not change between
        # here and the log line below, so the second pass was redundant.
        name, val_acc = test(ctx, val_data)
        train_history.update([acc, 1 - val_acc])
        train_history.plot(save_path='%s/%s_history.png' % (plot_name, model_name))

        if val_acc > best_val_score:
            best_val_score = val_acc
            net.save_parameters('%s/%.4f-cifar-%s-%d-best.params' %
                                (save_dir, best_val_score, model_name, epoch))

        logging.info('[Epoch %d] train=%f val=%f loss=%f time: %f' %
                     (epoch, acc, val_acc, train_loss, time.time() - tic))

        if save_period and save_dir and (epoch + 1) % save_period == 0:
            net.save_parameters('%s/cifar10-%s-%d.params' %
                                (save_dir, model_name, epoch))

    if save_period and save_dir:
        net.save_parameters('%s/cifar10-%s-%d.params' %
                            (save_dir, model_name, epochs - 1))
return image image = cv2.imread('img/pikachu.jpg') x = preprocess(image) print('x', x.shape) # if pre-trained model is provided, we can load it # net.load_params('ssd_%d.params' % epochs, ctx) anchors, cls_preds, box_preds = net(x.as_in_context(ctx)) print('anchors', anchors) print('class predictions', cls_preds) print('box delta predictions', box_preds) from mxnet.contrib.ndarray import MultiBoxDetection # convert predictions to probabilities using softmax cls_probs = nd.SoftmaxActivation(nd.transpose(cls_preds, (0, 2, 1)), mode='channel') # apply shifts to anchors boxes, non-maximum-suppression, etc... output = MultiBoxDetection(*[cls_probs, box_preds, anchors], force_suppress=True, clip=False) print(output) def display(img, out, thresh=0.5): import random import matplotlib as mpl mpl.rcParams['figure.figsize'] = (10,10) pens = dict() plt.clf() plt.imshow(img) for det in out: cid = int(det[0]) if cid < 0:
def train(ctx):
    """Train the module-level ``net`` according to ``opt``, optionally distilling.

    When ``opt.summary`` is set, the network summary is printed for one
    zero-filled input and the process exits. Otherwise each epoch iterates
    ``train_data`` (with optional mixup, label smoothing and teacher
    outputs), validates with ``test`` and saves the best and periodic
    checkpoints under ``save_dir``.

    :param ctx: a single ``mx.Context`` or a list of contexts
    """
    devices = [ctx] if isinstance(ctx, mx.Context) else ctx

    if opt.resume_params == '':
        net.initialize(mx.init.MSRAPrelu(), ctx=devices)

    if opt.summary:
        probe = mx.nd.zeros((1, 3, opt.input_size, opt.input_size), ctx=devices[0])
        summary(net, probe)
        sys.exit()

    if opt.no_wd:
        # No weight decay on the parameters matched by this pattern.
        for param in net.collect_params('.*beta|.*gamma|.*bias').values():
            param.wd_mult = 0.0

    trainer = gluon.Trainer(net.collect_params(), optimizer, optimizer_params)
    if opt.resume_states != '':
        trainer.load_states(opt.resume_states)

    # Dense labels are needed as soon as they are smoothed or mixed.
    sparse_label_loss = not (opt.label_smoothing or opt.mixup)
    if distillation:
        loss_fn = gcv.loss.DistillationSoftmaxCrossEntropyLoss(
            temperature=opt.temperature,
            hard_weight=opt.hard_weight,
            sparse_label=sparse_label_loss)
    else:
        loss_fn = gluon.loss.SoftmaxCrossEntropyLoss(sparse_label=sparse_label_loss)

    best_val_score = 1

    for epoch in range(opt.resume_epoch, opt.num_epochs):
        epoch_start = time.time()
        if opt.use_rec:
            train_data.reset()
        train_metric.reset()
        interval_start = time.time()

        for i, batch in enumerate(train_data):
            data, label = batch_fn(batch, devices)

            if opt.mixup:
                mix_weight = np.random.beta(opt.mixup_alpha, opt.mixup_alpha)
                if epoch >= opt.num_epochs - opt.mixup_off_epoch:
                    mix_weight = 1
                data = [mix_weight*shard + (1-mix_weight)*shard[::-1] for shard in data]
                smoothing = 0.1 if opt.label_smoothing else 0.0
                label = mixup_transform(label, classes, mix_weight, smoothing)
            elif opt.label_smoothing:
                # Keep the untouched labels for the metric update below.
                hard_label = label
                label = smooth(label, classes)

            if distillation:
                # Teacher outputs are computed outside the recorded scope.
                teacher_prob = [
                    nd.softmax(teacher(shard.astype(opt.dtype, copy=False)) / opt.temperature)
                    for shard in data
                ]

            with ag.record():
                outputs = [net(shard.astype(opt.dtype, copy=False)) for shard in data]
                if distillation:
                    losses = [
                        loss_fn(prediction.astype('float32', copy=False),
                                target.astype('float32', copy=False),
                                soft_target.astype('float32', copy=False))
                        for prediction, target, soft_target in zip(outputs, label, teacher_prob)
                    ]
                else:
                    losses = [
                        loss_fn(prediction, target.astype(opt.dtype, copy=False))
                        for prediction, target in zip(outputs, label)
                    ]
            for shard_loss in losses:
                shard_loss.backward()
            trainer.step(batch_size)

            if opt.mixup:
                # With mixed labels the metric is fed softmax outputs.
                softmaxed = [nd.SoftmaxActivation(scores.astype('float32', copy=False))
                             for scores in outputs]
                train_metric.update(label, softmaxed)
            elif opt.label_smoothing:
                train_metric.update(hard_label, outputs)
            else:
                train_metric.update(label, outputs)

            if opt.log_interval and not (i+1)%opt.log_interval:
                train_metric_name, train_metric_score = train_metric.get()
                logger.info('Epoch[%d] Batch [%d]\tSpeed: %f samples/sec\t%s=%f\tlr=%f'%(
                    epoch, i, batch_size*opt.log_interval/(time.time()-interval_start),
                    train_metric_name, train_metric_score, trainer.learning_rate))
                interval_start = time.time()

        train_metric_name, train_metric_score = train_metric.get()
        throughput = int(batch_size * i /(time.time() - epoch_start))

        err_top1_val, err_top5_val = test(devices, val_data)

        logger.info('[Epoch %d] training: %s=%f'%(epoch, train_metric_name, train_metric_score))
        logger.info('[Epoch %d] speed: %d samples/sec\ttime cost: %f'%(
            epoch, throughput, time.time()-epoch_start))
        logger.info('[Epoch %d] validation: err-top1=%f err-top5=%f'%(
            epoch, err_top1_val, err_top5_val))

        if err_top1_val < best_val_score:
            best_val_score = err_top1_val
            best_tag = (save_dir, best_val_score, model_name, epoch)
            net.save_parameters('%s/%.4f-imagenet-%s-%d-best.params'%best_tag)
            trainer.save_states('%s/%.4f-imagenet-%s-%d-best.states'%best_tag)

        if save_frequency and save_dir and (epoch + 1) % save_frequency == 0:
            periodic_tag = (save_dir, model_name, epoch)
            net.save_parameters('%s/imagenet-%s-%d.params'%periodic_tag)
            trainer.save_states('%s/imagenet-%s-%d.states'%periodic_tag)

    if save_frequency and save_dir:
        final_tag = (save_dir, model_name, opt.num_epochs-1)
        net.save_parameters('%s/imagenet-%s-%d.params'%final_tag)
        trainer.save_states('%s/imagenet-%s-%d.states'%final_tag)
def run_epoch(e, network, dataloader, trainer, print_name, is_train, update_metric):
    '''
    Run one pass over ``dataloader`` with the SSD network, optionally
    updating its weights, and return the averaged loss.

    Uses the module-level ``ctx``, ``cls_loss``, ``box_loss``,
    ``cls_metric``, ``box_metric``, ``send_image_every_n`` and
    ``generate_output_image``.

    Parameters
    ----------
    e: int
        The epoch number

    network: nn.Gluon.HybridSequential
        The SSD network

    dataloader: gluon.data.DataLoader
        The train or testing dataloader that is wrapped around the iam_dataset

    trainer:
        Object whose ``step`` is called with the number of samples in the
        batch; only used when ``is_train`` is true.

    print_name: Str
        Name to print for associating with the data. usually this will be "train" and "test"

    is_train: bool
        Whether the network should be updated. Also passed to
        ``autograd.record`` as ``train_mode``.

    update_metric: bool
        Whether ``cls_metric`` and ``box_metric`` are updated for each batch.

    Returns
    -------
    total_loss:
        Sum over devices of the accumulated per-batch mean loss, each divided
        by ``len(dataloader) * number_of_devices``.
    '''
    per_device_loss = [0 for _ in ctx]

    for batch_idx, (X, Y) in enumerate(dataloader):
        X = gluon.utils.split_and_load(X, ctx)
        Y = gluon.utils.split_and_load(Y, ctx)

        losses = []
        with autograd.record(train_mode=is_train):
            for x, y in zip(X, Y):
                default_anchors, class_predictions, box_predictions = network(x)
                box_target, box_mask, cls_target = network.training_targets(
                    default_anchors, class_predictions, y)
                # Classification loss plus box-regression loss for this device.
                classification_term = cls_loss(class_predictions, cls_target)
                box_term = box_loss(box_predictions, box_target, box_mask)
                losses.append(classification_term + box_term)

        if is_train:
            for device_loss in losses:
                device_loss.backward()
            # Normalise the update by the total number of samples in the batch.
            trainer.step(sum(part.shape[0] for part in X))

        for device_idx, device_loss in enumerate(losses):
            per_device_loss[device_idx] += device_loss.mean().asscalar()

        # The names below still hold the values from the last device slice.
        if update_metric:
            cls_metric.update([cls_target], [nd.transpose(class_predictions, (0, 2, 1))])
            box_metric.update([box_target], [box_predictions * box_mask])

        if batch_idx == 0 and e % send_image_every_n == 0 and e > 0:
            cls_probs = nd.SoftmaxActivation(
                nd.transpose(class_predictions, (0, 2, 1)), mode='channel')
            output_image, number_of_bbs = generate_output_image(
                box_predictions, default_anchors, cls_probs,
                box_target, box_mask, cls_target, x, y)
            print("Number of predicted {} BBs = {}".format(print_name, number_of_bbs))

    return sum(accumulated / (len(dataloader) * len(per_device_loss))
               for accumulated in per_device_loss)
def _train_loop(self, train_data, val_data):
    """Train ``self.net`` from the current epoch up to ``cfg.train.epochs``.

    Parameters
    ----------
    train_data :
        Iterable of training batches understood by ``self.batch_fn``. Its
        ``reset()`` is called each epoch when ``cfg.train.use_rec`` is set.
    val_data :
        Validation data forwarded to ``self._evaluate``.

    Returns
    -------
    dict
        ``{'train_acc': <last training metric value>,
        'valid_acc': self._best_acc, 'time': self._time_elapsed}``.

    Notes
    -----
    The loop stops early once ``self._best_acc`` reaches 1.0. A checkpoint is
    saved to ``best_checkpoint.pkl`` under ``self._logdir`` whenever the
    top-1 validation value improves.
    """
    if self._cfg.train.no_wd:
        # Parameters whose names match beta/gamma/bias get no weight decay.
        for param in self.net.collect_params('.*beta|.*gamma|.*bias').values():
            param.wd_mult = 0.0

    # Label smoothing and mixup both produce dense (non-index) labels.
    sparse_label_loss = not (self._cfg.train.label_smoothing or self._cfg.train.mixup)

    if self.distillation:
        L = loss.DistillationSoftmaxCrossEntropyLoss(
            temperature=self._cfg.train.temperature,
            hard_weight=self._cfg.train.hard_weight,
            sparse_label=sparse_label_loss)
    else:
        L = gluon.loss.SoftmaxCrossEntropyLoss(sparse_label=sparse_label_loss)

    if self._cfg.train.mixup:
        train_metric = mx.metric.RMSE()
    else:
        train_metric = mx.metric.Accuracy()

    if self._cfg.train.mode == 'hybrid':
        self.net.hybridize(static_alloc=True, static_shape=True)
        if self.distillation:
            self.teacher.hybridize(static_alloc=True, static_shape=True)

    self._logger.info('Start training from [Epoch %d]',
                      max(self._cfg.train.start_epoch, self.epoch))
    for self.epoch in range(max(self._cfg.train.start_epoch, self.epoch),
                            self._cfg.train.epochs):
        epoch = self.epoch
        if self._best_acc >= 1.0:
            self._logger.info(
                '[Epoch {}] Early stopping as acc is reaching 1.0'.format(epoch))
            break
        mx.nd.waitall()
        tic = time.time()
        btic = time.time()
        if self._cfg.train.use_rec:
            train_data.reset()
        train_metric.reset()
        # Number of batches processed this epoch; stays 0 for an empty iterator.
        num_batches = 0

        for i, batch in enumerate(train_data):
            data, label = self.batch_fn(batch, self.ctx)

            if self._cfg.train.mixup:
                lam = np.random.beta(self._cfg.train.mixup_alpha,
                                     self._cfg.train.mixup_alpha)
                # Mixup is switched off for the last `mixup_off_epoch` epochs.
                if epoch >= self._cfg.train.epochs - self._cfg.train.mixup_off_epoch:
                    lam = 1
                # Blend every sample with the sample at the mirrored batch position.
                data = [lam * X + (1 - lam) * X[::-1] for X in data]

                if self._cfg.train.label_smoothing:
                    eta = 0.1
                else:
                    eta = 0.0
                # Use the estimator's class count, as the `smooth` call below does.
                label = mixup_transform(label, self.num_class, lam, eta)

            elif self._cfg.train.label_smoothing:
                # Keep the original labels for the training metric.
                hard_label = label
                label = smooth(label, self.num_class)

            if self.distillation:
                teacher_prob = [
                    nd.softmax(self.teacher(X.astype(self._cfg.train.dtype, copy=False))
                               / self._cfg.train.temperature)
                    for X in data
                ]

            with ag.record():
                outputs = [
                    self.net(X.astype(self._cfg.train.dtype, copy=False))
                    for X in data
                ]
                if self.distillation:
                    losses = [L(yhat.astype('float32', copy=False),
                                y.astype('float32', copy=False),
                                p.astype('float32', copy=False))
                              for yhat, y, p in zip(outputs, label, teacher_prob)]
                else:
                    losses = [
                        L(yhat, y.astype(self._cfg.train.dtype, copy=False))
                        for yhat, y in zip(outputs, label)
                    ]
            for l in losses:
                l.backward()
            self.trainer.step(self.batch_size)

            if self._cfg.train.mixup:
                output_softmax = [nd.SoftmaxActivation(out.astype('float32', copy=False))
                                  for out in outputs]
                train_metric.update(label, output_softmax)
            else:
                if self._cfg.train.label_smoothing:
                    train_metric.update(hard_label, outputs)
                else:
                    train_metric.update(label, outputs)

            num_batches = i + 1

            if self._cfg.train.log_interval and not (i + 1) % self._cfg.train.log_interval:
                train_metric_name, train_metric_score = train_metric.get()
                self._logger.info(
                    'Epoch[%d] Batch [%d]\tSpeed: %f samples/sec\t%s=%f\tlr=%f',
                    epoch, i,
                    self._cfg.train.batch_size * self._cfg.train.log_interval /
                    (time.time() - btic),
                    train_metric_name, train_metric_score,
                    self.trainer.learning_rate)
                btic = time.time()

        train_metric_name, train_metric_score = train_metric.get()
        # Count batches, not the last batch index, so the final batch is included.
        throughput = int(self.batch_size * num_batches / (time.time() - tic))

        top1_val, top5_val = self._evaluate(val_data)

        self._logger.info('[Epoch %d] training: %s=%f', epoch,
                          train_metric_name, train_metric_score)
        self._logger.info(
            '[Epoch %d] speed: %d samples/sec\ttime cost: %f', epoch,
            throughput, time.time() - tic)
        self._logger.info('[Epoch %d] validation: top1=%f top5=%f', epoch,
                          top1_val, top5_val)

        if top1_val > self._best_acc:
            cp_name = os.path.join(self._logdir, 'best_checkpoint.pkl')
            self._logger.info(
                '[Epoch %d] Current best top-1: %f vs previous %f, saved to %s',
                self.epoch, top1_val, self._best_acc, cp_name)
            self.save(cp_name)
            self._best_acc = top1_val
        if self._reporter:
            self._reporter(epoch=epoch, acc_reward=top1_val)
        # `btic` is reset at every progress log, so this adds the time since
        # the last progress log (or since the epoch started if none fired).
        # NOTE(review): placed at epoch level; confirm against the original layout.
        self._time_elapsed += time.time() - btic

    return {
        'train_acc': train_metric_score,
        'valid_acc': self._best_acc,
        'time': self._time_elapsed
    }
def train(epochs, ctx):
    """Train the module-level ``net`` on CIFAR-10 or CIFAR-100.

    Relies on names defined elsewhere in this file: ``opt``, ``net``,
    ``optimizer``, ``optimizer_params``, ``batch_size``, ``num_workers``,
    ``transform_train``, ``transform_test``, ``classes``, ``lr_decay``,
    ``lr_decay_epoch``, ``save_dir``, ``plot_name``, ``model_name``,
    ``test``, ``smooth`` and ``mixup_transform``.

    Parameters
    ----------
    epochs : int
        Number of epochs to run.
    ctx : mx.Context or list of mx.Context
        Device(s) to train on. A single context is wrapped into a list.

    Raises
    ------
    ValueError
        If ``opt.dataset`` is neither ``'cifar10'`` nor ``'cifar100'``.

    Notes
    -----
    After training, the best validation accuracy is appended to a
    ``<dataset>_<hostname>_GPU_<gpus>_best_Acc.log`` file in the working
    directory and printed. When ``opt.summary`` is set the function prints
    the network summary and exits the process.
    """
    if isinstance(ctx, mx.Context):
        ctx = [ctx]
    net.initialize(mx.init.Xavier(), ctx=ctx)

    if opt.summary:
        summary(net, mx.nd.zeros((1, 3, 32, 32), ctx=ctx[0]))
        sys.exit()

    if opt.dataset == 'cifar10':
        dataset_cls = gluon.data.vision.CIFAR10
    elif opt.dataset == 'cifar100':
        dataset_cls = gluon.data.vision.CIFAR100
    else:
        raise ValueError('Unknown Dataset')

    train_data = gluon.data.DataLoader(
        dataset_cls(train=True).transform_first(transform_train),
        batch_size=batch_size, shuffle=True, last_batch='discard',
        num_workers=num_workers)
    val_data = gluon.data.DataLoader(
        dataset_cls(train=False).transform_first(transform_test),
        batch_size=batch_size, shuffle=False, num_workers=num_workers)

    if opt.no_wd and opt.cosine:
        # Parameters whose names match beta/gamma/bias get no weight decay.
        for param in net.collect_params('.*beta|.*gamma|.*bias').values():
            param.wd_mult = 0.0

    trainer = gluon.Trainer(net.collect_params(), optimizer, optimizer_params)

    # Label smoothing and mixup both produce dense (non-index) labels.
    sparse_label_loss = not (opt.label_smoothing or opt.mixup)

    train_metric = mx.metric.RMSE()
    loss_fn = gluon.loss.SoftmaxCrossEntropyLoss(sparse_label=sparse_label_loss)
    train_history = TrainingHistory(['training-error', 'validation-error'])

    lr_decay_count = 0
    best_val_score = 0

    for epoch in range(epochs):
        tic = time.time()
        train_metric.reset()
        train_loss = 0
        num_batch = len(train_data)

        if not opt.cosine:
            # Step decay: scale the learning rate at the configured epochs.
            if epoch == lr_decay_epoch[lr_decay_count]:
                trainer.set_learning_rate(trainer.learning_rate * lr_decay)
                lr_decay_count += 1

        for batch in train_data:
            data_1 = gluon.utils.split_and_load(batch[0], ctx_list=ctx, batch_axis=0)
            label_1 = gluon.utils.split_and_load(batch[1], ctx_list=ctx, batch_axis=0)

            # Start from the untouched batch so `data` and `label` are always
            # bound; previously they were only set when mixup was enabled
            # (`label` also under label smoothing), so the forward pass
            # raised NameError otherwise.
            data, label = data_1, label_1

            if opt.mixup:
                lam = np.random.beta(opt.mixup_alpha, opt.mixup_alpha)
                # Mixup is switched off for the last `mixup_off_epoch` epochs.
                if epoch >= epochs - opt.mixup_off_epoch:
                    lam = 1
                # Blend every sample with the sample at the mirrored batch position.
                data = [lam * X + (1 - lam) * X[::-1] for X in data_1]

                if opt.label_smoothing:
                    eta = 0.1
                else:
                    eta = 0.0
                label = mixup_transform(label_1, classes, lam, eta)

            elif opt.label_smoothing:
                # Keep the original labels for the training metric.
                hard_label = label_1
                label = smooth(label_1, classes)

            with ag.record():
                output = [net(X) for X in data]
                loss = [loss_fn(yhat, y) for yhat, y in zip(output, label)]
            for l in loss:
                l.backward()
            trainer.step(batch_size)
            train_loss += sum([l.sum().asscalar() for l in loss])

            if opt.mixup:
                output_softmax = [nd.SoftmaxActivation(out) for out in output]
                train_metric.update(label, output_softmax)
            else:
                if opt.label_smoothing:
                    train_metric.update(hard_label, output)
                else:
                    train_metric.update(label, output)

        train_loss /= batch_size * num_batch
        name, acc = train_metric.get()
        # One validation pass per epoch; its result feeds the history plot,
        # the best-model check and the log line.
        name, val_acc = test(ctx, val_data)
        train_history.update([acc, 1 - val_acc])
        train_history.plot(save_path='%s/%s_history.png' % (plot_name, model_name))

        if val_acc > best_val_score:
            best_val_score = val_acc
            net.save_parameters('%s/%.4f-%s-best.params' % (save_dir, best_val_score, model_name))

        logging.info('[Epoch %d] train=%f val=%f loss=%f lr: %f time: %f' %
                     (epoch, acc, val_acc, train_loss, trainer.learning_rate, time.time() - tic))

    host_name = socket.gethostname()
    with open(opt.dataset + '_' + host_name + '_GPU_' + opt.gpus + '_best_Acc.log', 'a') as f:
        f.write('best Acc: {:.4f}\n'.format(best_val_score))
    print("best_val_score: ", best_val_score)
def train(epochs, ctx):
    """Train the module-level ``net`` on CIFAR-10 and report progress via signals.

    Relies on names defined elsewhere in this file: ``config``, ``net``,
    ``optimizer``, ``lr_sch``, ``batch_size``, ``num_workers``, ``imgsize``,
    ``classes``, ``stat_name``, ``plot_name``, ``save_dir``, ``save_period``,
    ``model_name``, ``sw``, ``csv_writer``, ``csv_file``, ``sig_state``,
    ``sig_pgbar``, ``sig_table``, ``check_flag``, ``label_transform``,
    ``transform_train``, ``transform_test`` and ``test``.

    Parameters
    ----------
    epochs : int
        Number of epochs to run.
    ctx : mx.Context or list of mx.Context
        Device(s) to train on. A single context is wrapped into a list.

    Returns
    -------
    None
        Returns early (without the final parameter save) when the second
        element of ``check_flag()`` is truthy while the first one is set.
    """
    if isinstance(ctx, mx.Context):
        ctx = [ctx]

    if config.train_cfg.param_init:
        init_func = getattr(mx.init, config.train_cfg.init)
        net.initialize(init_func(), ctx=ctx, force_reinit=True)
    else:
        net.load_parameters(config.train_cfg.param_file, ctx=ctx)

    # NOTE(review): summary is placed at function level (runs for both
    # branches above); confirm against the original layout.
    summary(net, stat_name, nd.uniform(
        shape=(1, 3, imgsize, imgsize), ctx=ctx[0]))
    net.hybridize()

    root = config.dir_cfg.dataset
    train_data = gluon.data.DataLoader(
        gluon.data.vision.CIFAR10(
            root=root, train=True).transform_first(transform_train),
        batch_size=batch_size, shuffle=True, last_batch='discard',
        num_workers=num_workers)
    val_data = gluon.data.DataLoader(
        gluon.data.vision.CIFAR10(
            root=root, train=False).transform_first(transform_test),
        batch_size=batch_size, shuffle=False, num_workers=num_workers)

    trainer_arg = {'learning_rate': config.lr_cfg.lr,
                   'wd': config.lr_cfg.wd,
                   'lr_scheduler': lr_sch}
    # NOTE: eval() executes whatever expression the configuration holds; this
    # is only safe while the configuration comes from a trusted source.
    extra_arg = eval(config.lr_cfg.extra_arg)
    trainer_arg.update(extra_arg)
    trainer = gluon.Trainer(net.collect_params(), optimizer, trainer_arg)
    if config.train_cfg.amp:
        amp.init_trainer(trainer)

    metric = mx.metric.Accuracy()
    train_metric = mx.metric.RMSE()
    loss_fn = gluon.loss.SoftmaxCrossEntropyLoss(
        sparse_label=False if config.data_cfg.mixup else True)
    train_history = TrainingHistory(['training-error', 'validation-error'])
    loss_history = TrainingHistory(['training-loss', 'validation-loss'])

    iteration = 0
    best_val_score = 0

    sig_state.emit(1)
    sig_pgbar.emit(0)

    for epoch in range(epochs):
        tic = time.time()
        train_metric.reset()
        metric.reset()
        train_loss = 0
        num_batch = len(train_data)
        alpha = 1

        for i, batch in enumerate(train_data):
            # Profiling and graph export happen once, on the second iteration.
            if epoch == 0 and iteration == 1 and config.save_cfg.profiler:
                profiler.set_state('run')
                is_profiler_run = True
            if epoch == 0 and iteration == 1 and config.save_cfg.tensorboard:
                sw.add_graph(net)

            lam = np.random.beta(alpha, alpha)
            # Mixup is switched off for the last 20 epochs or when disabled.
            if epoch >= epochs - 20 or not config.data_cfg.mixup:
                lam = 1

            data_1 = gluon.utils.split_and_load(
                batch[0], ctx_list=ctx, batch_axis=0)
            label_1 = gluon.utils.split_and_load(
                batch[1], ctx_list=ctx, batch_axis=0)

            if not config.data_cfg.mixup:
                data = data_1
                label = label_1
            else:
                # Blend every sample (and its label) with the one at the
                # mirrored batch position.
                data = [lam*X + (1-lam)*X[::-1] for X in data_1]
                label = []
                for Y in label_1:
                    y1 = label_transform(Y, classes)
                    y2 = label_transform(Y[::-1], classes)
                    label.append(lam*y1 + (1-lam)*y2)

            with ag.record():
                output = [net(X) for X in data]
                loss = [loss_fn(yhat, y) for yhat, y in zip(output, label)]
            if config.train_cfg.amp:
                with ag.record():
                    with amp.scale_loss(loss, trainer) as scaled_loss:
                        ag.backward(scaled_loss)
            else:
                for l in loss:
                    l.backward()
            trainer.step(batch_size)
            train_loss += sum([l.sum().asscalar() for l in loss])

            output_softmax = [nd.SoftmaxActivation(out) for out in output]
            train_metric.update(label, output_softmax)
            metric.update(label_1, output_softmax)
            name, acc = train_metric.get()

            if config.save_cfg.tensorboard:
                sw.add_scalar(tag='lr', value=trainer.learning_rate,
                              global_step=iteration)
            if epoch == 0 and iteration == 1 and config.save_cfg.profiler:
                nd.waitall()
                profiler.set_state('stop')
                profiler.dump()
            iteration += 1
            sig_pgbar.emit(iteration)

            # Pause/stop handling driven by the flags from check_flag().
            # NOTE(review): placed inside the batch loop; confirm against the
            # original layout.
            if check_flag()[0]:
                sig_state.emit(2)
                while(check_flag()[0] or check_flag()[1]):
                    if check_flag()[1]:
                        print('stop')
                        return
                    else:
                        time.sleep(5)
                        print('pausing')

        epoch_time = time.time() - tic
        train_loss /= batch_size * num_batch
        name, acc = train_metric.get()
        _, train_acc = metric.get()
        # One validation pass per epoch provides both accuracy and loss; the
        # parameters do not change between here and the logging below.
        name, val_acc, val_loss = test(ctx, val_data)

        train_history.update([1-train_acc, 1-val_acc])
        plt.cla()
        train_history.plot(save_path='%s/%s_history.png' %
                           (plot_name, model_name))

        if val_acc > best_val_score:
            best_val_score = val_acc
            net.save_parameters('%s/%.4f-cifar-%s-%d-best.params' %
                                (save_dir, best_val_score, model_name, epoch))

        current_lr = trainer.learning_rate
        logging.info('[Epoch %d] loss=%f train_acc=%f train_RMSE=%f\n val_acc=%f val_loss=%f lr=%f time: %f' %
                     (epoch, train_loss, train_acc, acc, val_acc, val_loss, current_lr, epoch_time))
        loss_history.update([train_loss, val_loss])
        plt.cla()
        loss_history.plot(save_path='%s/%s_loss.png' %
                          (plot_name, model_name), y_lim=(0, 2), legend_loc='best')

        if config.save_cfg.tensorboard:
            sw._add_scalars(tag='Acc',
                            scalar_dict={'train_acc': train_acc, 'test_acc': val_acc},
                            global_step=epoch)
            sw._add_scalars(tag='Loss',
                            scalar_dict={'train_loss': train_loss, 'test_loss': val_loss},
                            global_step=epoch)

        sig_table.emit([epoch, train_loss, train_acc,
                        val_loss, val_acc, current_lr, epoch_time])
        csv_writer.writerow([epoch, train_loss, train_acc,
                             val_loss, val_acc, current_lr, epoch_time])
        csv_file.flush()

        if save_period and save_dir and (epoch + 1) % save_period == 0:
            net.save_parameters('%s/cifar10-%s-%d.params' %
                                (save_dir, model_name, epoch))

    # Final snapshot, named after the last epoch.
    if save_period and save_dir:
        net.save_parameters('%s/cifar10-%s-%d.params' %
                            (save_dir, model_name, epochs-1))
def train_epoch(pool=None, pool_lock=None, shared_finished_flag=None, use_pool=False):
    """Run the training batches of one epoch, stopping after ``num_batches``.

    Relies on names from the surrounding scope: ``train_data``,
    ``num_batches``, ``batch_fn``, ``ctx``, ``opt``, ``epoch``, ``classes``,
    ``distillation``, ``teacher``, ``net``, ``model_name``, ``L``,
    ``trainer``, ``batch_size``, ``train_metric``, ``logger``, ``smooth``
    and ``mixup_transform``.

    Parameters
    ----------
    pool :
        Container of candidate dicts, used only when ``use_pool`` is true and
        the model is ``'ShuffleNas'``. It must support ``len()`` and
        ``pop()``; each candidate provides the keys ``'block'``,
        ``'channel'``, ``'block_list'``, ``'channel_list'``, ``'flops'`` and
        ``'model_size'``.
    pool_lock :
        Context manager held while popping from ``pool``.
    shared_finished_flag :
        Object whose ``value`` attribute is set to ``True`` once batch index
        ``num_batches`` is reached (only when ``use_pool`` is true).
    use_pool : bool
        Take architecture candidates from ``pool`` instead of sampling them
        from the network.
    """
    btic = time.time()
    for i, batch in enumerate(train_data):
        if i == num_batches:
            if use_pool:
                shared_finished_flag.value = True
            return
        data, label = batch_fn(batch, ctx)

        if opt.mixup:
            lam = np.random.beta(opt.mixup_alpha, opt.mixup_alpha)
            # Mixup is switched off for the last `mixup_off_epoch` epochs.
            if epoch >= opt.num_epochs - opt.mixup_off_epoch:
                lam = 1
            # Blend every sample with the sample at the mirrored batch position.
            data = [lam * X + (1 - lam) * X[::-1] for X in data]

            if opt.label_smoothing:
                eta = 0.1
            else:
                eta = 0.0
            label = mixup_transform(label, classes, lam, eta)

        elif opt.label_smoothing:
            # Keep the original labels for the training metric.
            hard_label = label
            label = smooth(label, classes)

        if distillation:
            teacher_prob = [nd.softmax(teacher(X.astype(opt.dtype, copy=False)) / opt.temperature)
                            for X in data]

        with ag.record():
            if model_name == 'ShuffleNas' and use_pool:
                # Wait until a candidate is available in the pool.
                cand = None
                while cand is None:
                    if len(pool) > 0:
                        with pool_lock:
                            cand = pool.pop()
                            # Guard against log_interval == 0, as the progress
                            # log at the end of the loop body already does.
                            if opt.log_interval and i % opt.log_interval == 0:
                                logger.debug('[Trainer] ' + '-' * 40)
                                logger.debug("[Trainer] Time: {}".format(time.time()))
                                logger.debug("[Trainer] Block choice: {}".format(cand['block_list']))
                                logger.debug("[Trainer] Channel choice: {}".format(cand['channel_list']))
                                logger.debug("[Trainer] Flop: {}M, param: {}M".format(
                                    cand['flops'], cand['model_size']))
                    else:
                        time.sleep(1)

                full_channel_masks = [cand['channel'].as_in_context(ctx_i) for ctx_i in ctx]
                outputs = [net(X.astype(opt.dtype, copy=False), cand['block'], channel_mask)
                           for X, channel_mask in zip(data, full_channel_masks)]
            elif model_name == 'ShuffleNas':
                block_choices = net.random_block_choices(select_predefined_block=False,
                                                         dtype=opt.dtype)
                if opt.cs_warm_up:
                    full_channel_mask, channel_choices = net.random_channel_mask(
                        select_all_channels=opt.use_all_channels,
                        epoch_after_cs=epoch - opt.epoch_start_cs,
                        dtype=opt.dtype,
                        ignore_first_two_cs=opt.ignore_first_two_cs)
                else:
                    full_channel_mask, channel_choices = net.random_channel_mask(
                        select_all_channels=opt.use_all_channels,
                        dtype=opt.dtype,
                        ignore_first_two_cs=opt.ignore_first_two_cs)

                full_channel_masks = [full_channel_mask.as_in_context(ctx_i) for ctx_i in ctx]
                outputs = [net(X.astype(opt.dtype, copy=False), block_choices, channel_mask)
                           for X, channel_mask in zip(data, full_channel_masks)]
            else:
                outputs = [net(X.astype(opt.dtype, copy=False)) for X in data]

            if distillation:
                loss = [L(yhat.astype('float32', copy=False),
                          y.astype('float32', copy=False),
                          p.astype('float32', copy=False))
                        for yhat, y, p in zip(outputs, label, teacher_prob)]
            else:
                loss = [L(yhat, y.astype(opt.dtype, copy=False))
                        for yhat, y in zip(outputs, label)]
        for l in loss:
            l.backward()
        trainer.step(batch_size, ignore_stale_grad=True)

        if opt.mixup:
            output_softmax = [nd.SoftmaxActivation(out.astype('float32', copy=False))
                              for out in outputs]
            train_metric.update(label, output_softmax)
        else:
            if opt.label_smoothing:
                train_metric.update(hard_label, outputs)
            else:
                train_metric.update(label, outputs)

        if opt.log_interval and not (i + 1) % opt.log_interval:
            train_metric_name, train_metric_score = train_metric.get()
            logger.info('Epoch[%d] Batch [%d]\tSpeed: %f samples/sec\t%s=%f\tlr=%f' % (
                epoch, i, batch_size * opt.log_interval / (time.time() - btic),
                train_metric_name, train_metric_score, trainer.learning_rate))
            btic = time.time()