def _get_lr_scheduler(opt):
    if 'lr_factor' not in opt or opt.lr_factor >= 1:
        return opt.lr, None
    global lr_steps, batch_size
    lr, lr_factor = opt.lr, opt.lr_factor
    start_epoch = opt.start_epoch
    num_examples = get_num_examples(opt.dataset)
    its_per_epoch = math.ceil(num_examples / batch_size)
    # move forward to start epoch
    for s in lr_steps:
        if start_epoch >= s:
            lr *= lr_factor
    if lr != opt.lr:
        logger.info('Adjust learning rate to %e for epoch %d', lr, start_epoch)
    steps = [
        its_per_epoch * (epoch - start_epoch)
        for epoch in lr_steps
        if epoch - start_epoch > 0
    ]
    if steps:
        return lr, lr_scheduler.MultiFactorScheduler(step=steps, factor=lr_factor)
    else:
        return lr, None
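# --- Illustration (not part of the original code) ---
# A minimal sketch of the MultiFactorScheduler semantics the helper above
# relies on, assuming a standard MXNet install. The scheduler is queried with
# a monotonically non-decreasing global update count; the lr is multiplied by
# `factor` each time an entry in `step` is passed. Older MXNet versions lack
# the base_lr constructor argument, hence the attribute-assignment pattern
# used throughout these snippets. Numbers below are illustrative only.
from mxnet import lr_scheduler

sched = lr_scheduler.MultiFactorScheduler(step=[100, 200], factor=0.5)
sched.base_lr = 0.1
for num_update in (1, 100, 101, 201):
    print(num_update, sched(num_update))
# -> lr stays 0.1 up to update 100, drops to 0.05 after it, 0.025 after 200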
def learning_rate_scheduler(self):
    # %f (not %d): base_lr is a float, and %d would truncate it to 0
    print("learning_rate \t base_lr:%f, scheduler: MultiFactorScheduler"
          % self._base_lr)
    lr_sch = lr_scheduler.MultiFactorScheduler(
        step=[int(self._num_epochs * 0.5), int(self._num_epochs * 0.75)],
        factor=0.1)
    lr_sch.base_lr = self._base_lr
    return lr_sch
def init_trainer_0(neural_network, number_of_batches):
    # convert epoch-based schedule steps into global iteration counts
    steps_iterations = [s * number_of_batches for s in SCHEDULER_STEPS]
    schedule = lr_scheduler.MultiFactorScheduler(step=steps_iterations,
                                                 factor=SCHEDULER_FACTOR)
    schedule.base_lr = LEARNING_RATE
    sgd_optimizer = optimizer.SGD(learning_rate=LEARNING_RATE,
                                  momentum=MOMENTUM,
                                  lr_scheduler=schedule)
    trainer = gluon.Trainer(params=neural_network.collect_params(),
                            optimizer=sgd_optimizer)
    return trainer
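# --- Illustration (not part of the original code) ---
# init_trainer_0 relies on module-level constants; a hedged driver sketch with
# illustrative values for the names it expects (SCHEDULER_STEPS,
# SCHEDULER_FACTOR, LEARNING_RATE, MOMENTUM):
import mxnet as mx
from mxnet import gluon, optimizer, lr_scheduler

SCHEDULER_STEPS = [30, 60]   # epochs at which to decay the lr
SCHEDULER_FACTOR = 0.1
LEARNING_RATE = 0.1
MOMENTUM = 0.9

net = gluon.nn.Dense(10)
net.initialize()
# with e.g. 500 batches per epoch, the lr decays after updates 15000 and 30000
trainer = init_trainer_0(net, number_of_batches=500)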
def generate_lr_scheduler(ls_dict):
    scheduler_type = ls_dict['type']
    scheduler_param = ls_dict['lr_scheduler_config']
    factor = float(scheduler_param['factor'])
    if scheduler_type == 'Factor':
        step = int(scheduler_param['step'])
        stop_factor_lr = float(scheduler_param['stop_factor_lr'])
        return ls.FactorScheduler(step, factor, stop_factor_lr)
    elif scheduler_type == 'MultiFactor':
        steps = scheduler_param['steps']
        step_list = [int(step) for step in steps]
        return ls.MultiFactorScheduler(step=step_list, factor=factor)
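# --- Illustration (not part of the original code) ---
# Possible configs for generate_lr_scheduler, assuming `ls` is the
# mxnet.lr_scheduler module; the keys follow the function above, the values
# are hypothetical:
example_multifactor = {
    'type': 'MultiFactor',
    'lr_scheduler_config': {'factor': '0.1', 'steps': ['3000', '6000']},
}
example_factor = {
    'type': 'Factor',
    'lr_scheduler_config': {'factor': '0.9', 'step': '1000',
                            'stop_factor_lr': '1e-6'},
}
sched = generate_lr_scheduler(example_multifactor)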
num_workers = hvd.size()
rank = hvd.rank()
local_rank = hvd.local_rank()

num_classes = 1000
num_training_samples = 1281167
batch_size = args.batch_size
# iterations per epoch on each worker
epoch_size = \
    int(math.ceil(int(num_training_samples // num_workers) / batch_size))

if args.lr_mode == 'step':
    lr_decay_epoch = [int(i) for i in args.lr_decay_epoch.split(',')]
    steps = [epoch_size * x for x in lr_decay_epoch]
    lr_sched = lr_scheduler.MultiFactorScheduler(
        step=steps,
        factor=args.lr_decay,
        base_lr=(args.lr * num_workers),
        warmup_steps=(args.warmup_epochs * epoch_size),
        warmup_begin_lr=args.warmup_lr)
elif args.lr_mode == 'poly':
    lr_sched = lr_scheduler.PolyScheduler(
        args.num_epochs * epoch_size,
        base_lr=(args.lr * num_workers),
        pwr=2,
        warmup_steps=(args.warmup_epochs * epoch_size),
        warmup_begin_lr=args.warmup_lr)
elif args.lr_mode == 'cosine':
    lr_sched = lr_scheduler.CosineScheduler(
        args.num_epochs * epoch_size,
        base_lr=(args.lr * num_workers),
        warmup_steps=(args.warmup_epochs * epoch_size),
        warmup_begin_lr=args.warmup_lr)
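# --- Illustration (not part of the original code) ---
# Worked example of the step arithmetic above (values illustrative): 8
# Horovod workers, per-worker batch size 128, decay epochs "30,60,80",
# 5 warmup epochs:
import math

num_workers = 8
batch_size = 128
num_training_samples = 1281167
epoch_size = int(math.ceil(int(num_training_samples // num_workers) / batch_size))
steps = [epoch_size * e for e in (30, 60, 80)]
warmup_steps = 5 * epoch_size
print(epoch_size, steps, warmup_steps)
# -> 1252 iterations/epoch, decay at updates [37560, 75120, 100160],
#    6260 warmup updates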
def train(
    backbone,
    root_dir,
    train_index_fp,
    pretrain_model,
    optimizer,
    epochs=50,
    lr=0.001,
    wd=5e-4,
    momentum=0.9,
    batch_size=4,
    ctx=mx.cpu(),
    verbose_step=5,
    output_dir='ckpt',
):
    output_dir = os.path.join(output_dir, backbone)
    os.makedirs(output_dir, exist_ok=True)

    num_kernels = 3
    dataset = StdDataset(root_dir=root_dir,
                         train_idx_fp=train_index_fp,
                         num_kernels=num_kernels - 1)
    if not isinstance(ctx, (list, tuple)):
        ctx = [ctx]
    batch_size = batch_size * len(ctx)
    loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
    net = PSENet(base_net_name=backbone,
                 num_kernels=num_kernels,
                 ctx=ctx,
                 pretrained=True)
    # initialize params
    net.initialize(mx.init.Xavier(), ctx=ctx)
    net.collect_params("extra_.*_weight|decoder_.*_weight").initialize(
        mx.init.Xavier(), ctx=ctx, force_reinit=True)
    net.collect_params("extra_.*_bias|decoder_.*_bias").initialize(
        mx.init.Zero(), ctx=ctx, force_reinit=True)
    if pretrain_model is not None:
        net.load_parameters(pretrain_model,
                            ctx=ctx,
                            allow_missing=True,
                            ignore_extra=True)

    # pse_loss = DiceLoss(lam=0.7, num_kernels=num_kernels)
    pse_loss = DiceLoss_with_OHEM(lam=0.7, num_kernels=num_kernels, debug=False)

    # lr_scheduler = ls.PolyScheduler(
    #     max_update=icdar_loader.length * epochs // batch_size, base_lr=lr
    # )
    max_update = len(dataset) * epochs // batch_size
    lr_scheduler = ls.MultiFactorScheduler(
        base_lr=lr,
        step=[max_update // 3, max_update * 2 // 3],
        factor=0.1)
    optimizer_params = {
        'learning_rate': lr,
        'wd': wd,
        'momentum': momentum,
        'lr_scheduler': lr_scheduler,
    }
    if optimizer.lower() == 'adam':
        optimizer_params.pop('momentum')
    trainer = Trainer(net.collect_params(),
                      optimizer=optimizer,
                      optimizer_params=optimizer_params)

    summary_writer = SummaryWriter(output_dir)
    for e in range(epochs):
        cumulative_loss = 0
        num_batches = 0
        for i, item in enumerate(loader):
            item_ctxs = [split_and_load(field, ctx) for field in item]
            loss_list = []
            for im, gt_text, gt_kernels, training_masks, ori_img in zip(
                    *item_ctxs):
                gt_text = gt_text[:, ::4, ::4]
                gt_kernels = gt_kernels[:, :, ::4, ::4]
                training_masks = training_masks[:, ::4, ::4]
                with autograd.record():
                    # index 0 is the prediction for the complete text region
                    kernels_pred = net(im)
                    loss = pse_loss(gt_text, gt_kernels, kernels_pred,
                                    training_masks)
                    loss_list.append(loss)

            mean_loss = []
            for loss in loss_list:
                loss.backward()
                mean_loss.append(mx.nd.mean(to_cpu(loss)).asscalar())
            mean_loss = np.mean(mean_loss)
            trainer.step(batch_size)

            if i % verbose_step == 0:
                global_steps = dataset.length * e + i * batch_size
                summary_writer.add_scalar('loss', mean_loss, global_steps)
                summary_writer.add_scalar(
                    'c_loss',
                    mx.nd.mean(to_cpu(pse_loss.C_loss)).asscalar(),
                    global_steps,
                )
                summary_writer.add_scalar(
                    'kernel_loss',
                    mx.nd.mean(to_cpu(pse_loss.kernel_loss)).asscalar(),
                    global_steps,
                )
                summary_writer.add_scalar('pixel_accuracy',
                                          pse_loss.pixel_acc, global_steps)
            if i % 1 == 0:
                logger.info(
                    "step: {}, lr: {}, "
                    "loss: {}, score_loss: {}, kernel_loss: {}, pixel_acc: {}, kernel_acc: {}"
                    .format(
                        i * batch_size,
                        trainer.learning_rate,
                        mean_loss,
                        mx.nd.mean(to_cpu(pse_loss.C_loss)).asscalar(),
                        mx.nd.mean(to_cpu(pse_loss.kernel_loss)).asscalar(),
                        pse_loss.pixel_acc,
                        pse_loss.kernel_acc,
                    ))
            cumulative_loss += mean_loss
            num_batches += 1

        summary_writer.add_scalar('mean_loss_per_epoch',
                                  cumulative_loss / num_batches, global_steps)
        logger.info("Epoch {}, mean loss: {}\n".format(
            e, cumulative_loss / num_batches))
        net.save_parameters(
            os.path.join(output_dir, model_fn_prefix(backbone, e)))
        summary_writer.add_image('complete_gt', to_cpu(gt_text[0:1, :, :]),
                                 global_steps)
        summary_writer.add_image('complete_pred',
                                 to_cpu(kernels_pred[0:1, 0, :, :]),
                                 global_steps)
        summary_writer.add_images(
            'kernels_gt',
            to_cpu(gt_kernels[0:1, :, :, :]).reshape(-1, 1, 0, 0),
            global_steps,
        )
        summary_writer.add_images(
            'kernels_pred',
            to_cpu(kernels_pred[0:1, 1:, :, :]).reshape(-1, 1, 0, 0),
            global_steps,
        )
    summary_writer.close()
number_classes = len(classes)
net = ssd.SSD(number_classes)
if os.path.exists(pretrained):
    net.load_parameters(pretrained)
    print('finetune based on ', pretrained)
net.hybridize(static_alloc=True, static_shape=True)
# net.hybridize()
net.collect_params().reset_ctx(ctx)

# lr_sch = lr_scheduler.MultiFactorScheduler(
#     step=[int(num_epochs * 0.45), int(num_epochs * 0.7)],
#     factor=0.1, base_lr=base_lr, warmup_steps=10)
# lr_sch = lr_scheduler.MultiFactorScheduler(step=[int(num_epochs * 0.7)], factor=0.1)
# lr_sch.base_lr = base_lr
# lr_sch = lr_schs.CosineScheduler(num_epochs, base_lr=base_lr, warmup=10)
lr_sch = lr_scheduler.MultiFactorScheduler(
    step=[int(num_epochs * 0.45), int(num_epochs * 0.7)], factor=0.1)
# lr_sch = lr_scheduler.MultiFactorScheduler(step=[int(num_epochs * 0.7)], factor=0.1)
lr_sch.base_lr = base_lr
trainer = Trainer(net.collect_params(),
                  optimizer="sgd",
                  optimizer_params={
                      "wd": wd,
                      "momentum": momentum
                  })
train_ssd_custom(net, train_iter, test_iter, batch_size, trainer, ctx,
                 num_epochs, lr_sch, output_prefix)
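# --- Illustration (not part of the original code) ---
# Here lr_sch is indexed by epoch (its steps are fractions of num_epochs) and
# is handed to train_ssd_custom rather than to the optimizer; presumably it is
# applied once per epoch via trainer.set_learning_rate, the same pattern the
# later snippets use explicitly. A self-contained sketch of that pattern,
# with an illustrative base lr and a dummy net:
import mxnet as mx
from mxnet import gluon, lr_scheduler

num_epochs = 10
net = gluon.nn.Dense(1)
net.initialize()
trainer = gluon.Trainer(net.collect_params(), 'sgd',
                        {'wd': 5e-4, 'momentum': 0.9})
lr_sch = lr_scheduler.MultiFactorScheduler(
    step=[int(num_epochs * 0.45), int(num_epochs * 0.7)], factor=0.1)
lr_sch.base_lr = 0.01  # illustrative

for epoch in range(num_epochs):
    trainer.set_learning_rate(lr_sch(epoch))
    print(epoch, trainer.learning_rate)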
def train_eval(opt):
    mx.random.seed(123)
    np.random.seed(123)
    os.environ['CUDA_VISIBLE_DEVICES'] = '0,1,2,3'
    gpus = [] if opt.gpus is None or opt.gpus == '' else [
        int(gpu) for gpu in opt.gpus.split(',')]
    num_gpus = len(gpus)
    batch_size = opt.batch_per_device * max(1, num_gpus)
    context = [mx.gpu(i) for i in gpus] if num_gpus > 0 else [mx.cpu()]
    steps = [int(step) for step in opt.lr_scheduler_steps.split(',')]
    vis_env = opt.dataset + opt.output
    vis = Visulizer(env=vis_env)
    vis.log(opt)

    # choose ucf101 or meitu: build net, loss criterion, train/val loaders
    if opt.dataset == 'ucf101' or opt.dataset == 'ucf':
        net = R2Plus2D(num_class=101, model_depth=opt.model_depth)
        loss_criterion = gloss.SoftmaxCrossEntropyLoss()  # loss function
        # the train and evaluation data loaders
        train_loader, val_loader = get_ucf101trainval(
            datadir='/data/jh/notebooks/hudengjun/DeepVideo/UCF-101',
            batch_size=batch_size,
            n_frame=opt.n_frame,
            crop_size=opt.crop_size,
            scale_h=opt.scale_h,
            scale_w=opt.scale_w,
            num_workers=opt.num_workers)
    elif opt.dataset == 'meitu':
        net = R2Plus2D(num_class=63,
                       model_depth=opt.model_depth,
                       final_temporal_kernel=opt.n_frame // 8)  # label set 63
        # use multiple gpus to load data
        train_loader, val_loader = get_meitu_dataloader(
            data_dir=opt.meitu_dir,
            device_id=opt.decoder_gpu,
            batch_size=batch_size,
            num_workers=opt.num_workers,
            n_frame=opt.n_frame,
            crop_size=opt.crop_size,
            scale_h=opt.scale_h,
            scale_w=opt.scale_w,
            cache_size=opt.cache_size)
    # debugging note: the NVVL loader raised an error around step 66 (see
    # data/nvvl_meitu.py); reproduce with find_nvv_error.py by copying
    # train_nvvl_r3d.py step by step
    loss_dict = {'bce': gloss.SigmoidBinaryCrossEntropyLoss,
                 'warp_nn': WarpLoss,
                 'warp_fn': WARP_funcLoss,
                 'lsep_nn': LsepLoss,
                 'lsep_fn': LSEP_funcLoss}
    if opt.loss_type == 'lsep_nnh':
        loss_criterion = LsepLossHy(batch_size=batch_size // num_gpus,
                                    num_class=opt.num_class)
        loss_criterion.hybridize()
    elif opt.loss_type == 'bce':
        loss_criterion = gloss.SigmoidBinaryCrossEntropyLoss()
        loss_criterion.hybridize()
    else:
        loss_criterion = loss_dict[opt.loss_type]()

    # net parameter initialization across the cards
    net.initialize(mx.init.Xavier(), ctx=context)
    if opt.pretrained is not None:
        if opt.pretrained.endswith('.pkl'):
            net.load_from_caffe2_pickle(opt.pretrained)
        elif opt.pretrained.endswith('.params'):
            try:
                print("load pretrained params ", opt.pretrained)
                net.load_from_sym_params(opt.pretrained, ctx=context)
            except Exception as e:
                print("load as sym params failed, reload as gluon params")
                # load params to net context
                net.load_params(opt.pretrained, ctx=context)
    net.hybridize()
    trainer = gluon.Trainer(net.collect_params(), 'sgd',
                            {'learning_rate': opt.lr,
                             'momentum': 0.9,
                             'wd': opt.wd},
                            kvstore=opt.kvstore)  # the trainer
    lr_steps = lr_schedualer.MultiFactorScheduler(steps,
                                                  opt.lr_schedualer_factor)
    lr_steps.base_lr = opt.lr
    best_eval = 0.0
    for epoch in range(opt.num_epoch):
        tic = time()
        pre_loss, cumulative_loss = 0.0, 0.0
        trainer.set_learning_rate(lr_steps(epoch))
        vis.log('Epoch %d learning rate %f' % (epoch, trainer.learning_rate))
        for i, (data, label) in enumerate(train_loader):
            try:
                data_list = gluon.utils.split_and_load(data,
                                                       ctx_list=context,
                                                       batch_axis=0)
                label_list = gluon.utils.split_and_load(label,
                                                        ctx_list=context,
                                                        batch_axis=0)
            except Exception as e:
                logging.info(e)
                continue
            Ls = []
            with autograd.record():
                for x, y in zip(data_list, label_list):
                    y_hat = net(x)
                    loss = loss_criterion(y_hat, y)
                    Ls.append(loss)
                    cumulative_loss += nd.mean(loss).asscalar()
            for L in Ls:
                L.backward()
            trainer.step(data.shape[0])
            if (i + 1) % opt.log_interval == 0:
                vis.log('[Epoch %d,Iter %d] training loss= %f' % (
                    epoch, i + 1, cumulative_loss - pre_loss))
                vis.plot('loss', cumulative_loss - pre_loss)
                pre_loss = cumulative_loss
            if opt.debug:
                if (i + 1) // opt.log_interval == 3:
                    break
        vis.log('[Epoch %d] training loss=%f' % (epoch, cumulative_loss))
        vis.log('[Epoch %d] time used: %f' % (epoch, time() - tic))
        vis.log('[Epoch %d] saving net' % epoch)
        save_path = './{0}/{1}_test-val{2}.params'.format(
            opt.output, str(opt.dataset + opt.loss_type), str(epoch))
        vis.log("save path %s" % save_path)
        net.save_parameters(save_path)

        best_iou = 0.0
        if opt.dataset == 'ucf101' or opt.dataset == 'ucf':
            acc = nd.array([0], ctx=mx.cpu())
            test_iter = 0
            for i, (data, label) in enumerate(val_loader):
                try:
                    data_list = gluon.utils.split_and_load(data,
                                                           ctx_list=context,
                                                           batch_axis=0)
                    label_list = gluon.utils.split_and_load(label,
                                                            ctx_list=context,
                                                            batch_axis=0)
                except Exception as e:
                    logging.info(e)
                    continue
                for x, y in zip(data_list, label_list):
                    y_hat = net(x)
                    test_iter += 1  # single iter
                    y_pred = y_hat.argmax(axis=1)
                    # accumulate accuracy on cpu
                    acc += (y_pred == y.astype('float32')).mean().asscalar()
                val_acc = acc.asscalar() / test_iter
                if (i + 1) % opt.log_interval == 0:
                    logging.info("[Epoch %d,Iter %d],acc=%f" %
                                 (epoch, i, val_acc))
                if opt.debug:
                    if (i + 1) // opt.log_interval == 3:
                        break
            vis.plot('acc', val_acc)
        elif opt.dataset == 'meitu':
            k = 4
            # epsilon terms avoid division by zero
            topk_inter = np.array([1e-4] * k)
            topk_union = np.array([1e-4] * k)
            for i, (data, label) in enumerate(val_loader):
                try:
                    data_list = gluon.utils.split_and_load(data,
                                                           ctx_list=context,
                                                           batch_axis=0)
                    label_list = gluon.utils.split_and_load(label,
                                                            ctx_list=context,
                                                            batch_axis=0)
                except Exception as e:
                    logging.info(e)
                    continue
                for x, y in zip(data_list, label_list):
                    y_hat = net(x)
                    pred_order = y_hat.argsort()[:, ::-1]  # descending order
                    # compute top-1..top-k label overlap
                    pred_order_np = pred_order.asnumpy()
                    y_np = y.asnumpy()
                    if opt.debug:
                        print("pred shape and target shape",
                              pred_order_np.shape, y_np.shape)
                    for pred_vec, y_vec in zip(pred_order_np, y_np):
                        label_set = set([index for index, value in
                                         enumerate(y_vec) if value > 0.1])
                        pred_topk = [set(pred_vec[0:j])
                                     for j in range(1, k + 1)]
                        topk_inter += np.array(
                            [len(p_k.intersection(label_set))
                             for p_k in pred_topk])
                        topk_union += np.array(
                            [len(p_k.union(label_set)) for p_k in pred_topk])
                if (i + 1) % opt.log_interval == 0:
                    logging.info("[Epoch %d,Iter %d],time %s,Iou %s" %
                                 (epoch, i, tmm.strftime("%Y-%D:%H-%S"),
                                  str(topk_inter / topk_union)))
                    # use j here so the loop counter i is not clobbered
                    for j in range(k):
                        vis.plot('val_iou_{0}'.format(j + 1),
                                 topk_inter[j] / topk_union[j])
                if opt.debug:
                    if (i + 1) // opt.log_interval == 2:
                        break
    vis.log("""----------------------------------------
    ----XXXX------finished------------------
    ----------------------------------------""")
def train_decision(opt):
    mx.random.seed(123)
    np.random.seed(123)
    os.environ['CUDA_VISIBLE_DEVICES'] = '0,1,2,3'
    gpus = [] if opt.gpus is None or opt.gpus == '' else [
        int(gpu) for gpu in opt.gpus.split(',')
    ]
    num_gpus = len(gpus)
    batch_size = opt.batch_per_device * max(1, num_gpus)
    context = [mx.gpu(i) for i in gpus] if num_gpus > 0 else [mx.cpu()]
    steps = [int(step) for step in opt.lr_scheduler_steps.split(',')]

    feature_net = R2Plus2D(num_class=62, model_depth=34)
    model = Decision_thresh(thresh_size=62)
    if opt.ranking_model is not None:
        feature_net.load_params(opt.ranking_model, ctx=context)
    model.initialize(init=mx.init.Xavier(), ctx=context)
    trainer = mx.gluon.Trainer(model.collect_params(), 'sgd',
                               {'learning_rate': opt.lr,
                                'momentum': 0.9,
                                'wd': opt.wd},
                               kvstore=opt.kvstore)
    train_loader, val_loader = get_simple_meitu_dataloader(
        datadir=opt.meitu_dir,
        batch_size=batch_size,
        n_frame=opt.n_frame,
        crop_size=opt.crop_size,
        scale_h=opt.scale_h,
        scale_w=opt.scale_w,
        num_workers=opt.num_workers)
    loss_criterion = gluon.loss.SigmoidBinaryCrossEntropyLoss()
    lr_steps = lr_schedualer.MultiFactorScheduler(steps,
                                                  opt.lr_scheduler_factor)
    best_eval = 0.0
    for epoch in range(opt.num_epoch):
        tic = time()
        pre_loss, cumulative_loss = 0.0, 0.0
        trainer.set_learning_rate(lr_steps(epoch))
        logging.info(
            'Epoch %d learning rate %f to make decision through threshold'
            % (epoch, trainer.learning_rate))
        for i, (data, label) in enumerate(train_loader):
            try:
                data_list = gluon.utils.split_and_load(data,
                                                       ctx_list=context,
                                                       batch_axis=0)
                label_list = gluon.utils.split_and_load(label,
                                                        ctx_list=context,
                                                        batch_axis=0)
            except Exception as e:
                logging.info(e)
                continue
            confidences = []
            for x in data_list:
                confidences.append(feature_net(x))
            with autograd.record():
                Ls = []
                for conf, y in zip(confidences, label_list):
                    decision = model(conf)
                    loss = loss_criterion(decision, y)
                    Ls.append(loss)
                    cumulative_loss += nd.mean(loss).asscalar()
            for L in Ls:
                L.backward()
            trainer.step(data.shape[0])
            if (i + 1) % opt.log_interval == 0:
                logging.info('[Epoch %d,Iter %d] training loss=%f' %
                             (epoch + 1, i + 1, cumulative_loss - pre_loss))
                pre_loss = cumulative_loss
                print(model.collect_params()['decision_thresh0_thresh'].data())
            if opt.debug:
                break
        logging.info('[Epoch %d] training loss = %f' %
                     (epoch, cumulative_loss))
        logging.info('[Epoch %d] time used:%f' % (epoch, time() - tic))
        logging.info('[Epoch %d] save net' % epoch)
        model.save_parameters('./{0}/{1}_decisionmodel_{2}.params'.format(
            opt.output, str(opt.dataset + opt.loss_type), str(epoch)))

        # begin to evaluate the model
        inter = 1e-4
        union = 1e-4
        tic = time()
        for i, (data, label) in enumerate(val_loader):
            try:
                data_list = gluon.utils.split_and_load(data,
                                                       ctx_list=context,
                                                       batch_axis=0)
                label_list = gluon.utils.split_and_load(label,
                                                        ctx_list=context,
                                                        batch_axis=0)
            except Exception as e:
                logging.info(e)
                continue
            try:
                for x, y in zip(data_list, label_list):
                    conf = feature_net(x)
                    sig_label = model(conf)
                    y_np = y.asnumpy()
                    sig_np = sig_label.asnumpy()
                    rows, indexs = np.where(sig_np > 0.5)
                    # one empty set per sample; do not use [set()] * n,
                    # which would alias a single set
                    labelset_list = []
                    for j in range(x.shape[0]):
                        labelset_list.append(set())
                    for row, index in zip(rows, indexs):
                        labelset_list[row].add(index)
                    for pred_set, gt_vec in zip(labelset_list, y_np):
                        gt_set = set([
                            index for index, value in enumerate(gt_vec)
                            if value > 0.1
                        ])
                        inter += len(pred_set.intersection(gt_set))
                        union += len(pred_set.union(gt_set))
            except Exception as e:
                print(e)
                continue
            if (i + 1) % opt.log_interval == 0:
                logging.info('[Epoch %d,Iter %d],time %s,IoU %s' %
                             (epoch, i, tmm.strftime("%Y-%D:%H-%S"),
                              str(inter / union)))
            if opt.debug:
                break
        logging.info("finish one epoch validation")
        logging.info("[Epoch %d],validation time used %d" %
                     (epoch, time() - tic))
    logging.info("finish all epoch training and test")
def train_eval(opt):
    mx.random.seed(123)
    np.random.seed(123)
    os.environ['CUDA_VISIBLE_DEVICES'] = '0,1,2,3'
    gpus = [] if opt.gpus is None or opt.gpus == '' else [
        int(gpu) for gpu in opt.gpus.split(',')]
    num_gpus = len(gpus)
    batch_size = opt.batch_per_device * max(1, num_gpus)
    # single-card training: take the first gpu; use a bare mx.cpu() as the
    # fallback so as_in_context() below always receives a single context
    context = [mx.gpu(i) for i in gpus][0] if num_gpus > 0 else mx.cpu()
    steps = [int(step) for step in opt.lr_scheduler_steps.split(',')]
    vis_env = opt.dataset + opt.output
    vis = Visulizer(env=vis_env)
    vis.log(opt)

    net = R2Plus2D_MT(num_scenes=19,
                      num_actions=44,
                      model_depth=opt.model_depth,
                      final_temporal_kernel=opt.n_frame // 8)  # label set 63
    train_loader, val_loader, sample_weight = get_meitu_multi_task_dataloader(
        data_dir=opt.meitu_dir,
        device_id=opt.decoder_gpu,
        batch_size=batch_size,
        num_workers=opt.num_workers,
        n_frame=opt.n_frame,
        crop_size=opt.crop_size,
        scale_h=opt.scale_h,
        scale_w=opt.scale_w,
        cache_size=opt.cache_size)
    action_loss = gloss.SoftmaxCrossEntropyLoss()
    # scene_loss = LsepLoss()
    loss_dict = {'bce': gloss.SigmoidBinaryCrossEntropyLoss,
                 'warp_nn': WarpLoss,
                 'warp_fn': WARP_funcLoss,
                 'lsep_nn': LsepLoss,
                 'lsep_fn': LSEP_funcLoss}
    scene_loss = loss_dict[opt.loss_type]()

    net.initialize(mx.init.Xavier(), ctx=context)
    if opt.pretrained is not None:
        if opt.pretrained.endswith('.pkl'):
            net.load_from_caffe2_pickle(opt.pretrained)
        elif opt.pretrained.endswith('.params'):
            try:
                print("load pretrained params ", opt.pretrained)
                net.load_from_sym_params(opt.pretrained, ctx=context)
            except Exception as e:
                print("load as sym params failed, reload as gluon params")
                # load params to net context
                net.load_params(opt.pretrained, ctx=context)
    # net.hybridize()
    trainer = gluon.Trainer(net.collect_params(), 'sgd',
                            {'learning_rate': opt.lr,
                             'momentum': 0.9,
                             'wd': opt.wd},
                            kvstore=opt.kvstore)  # the trainer
    lr_steps = lr_schedualer.MultiFactorScheduler(steps,
                                                  opt.lr_schedualer_factor)
    lr_steps.base_lr = opt.lr
    best_eval = 0.0
    for epoch in range(opt.num_epoch):
        tic = time()
        scene_pre_loss, scene_cumulative_loss = 0.0, 0.0
        action_pre_loss, action_cumulative_loss = 0.0, 0.0
        trainer.set_learning_rate(lr_steps(epoch))
        vis.log('Epoch %d learning rate %f' % (epoch, trainer.learning_rate))
        # single card: no split_and_load needed
        for i, (data, scene_label, action_label) in enumerate(train_loader):
            with autograd.record():
                data = data.as_in_context(context)
                scene_label = scene_label.as_in_context(context)
                action_label = action_label.as_in_context(context)
                pred_scene, pred_action = net(data)
                loss_scene = scene_loss(pred_scene, scene_label)
                loss_action = action_loss(pred_action, action_label)
                loss = loss_scene + opt.action_rate * loss_action.mean()
            scene_cumulative_loss += nd.mean(loss_scene).asscalar()
            action_cumulative_loss += nd.mean(loss_action).asscalar()
            loss.backward()
            trainer.step(data.shape[0])
            if (i + 1) % opt.log_interval == 0:
                vis.log('[Epoch %d,Iter %d] scene loss= %f' % (
                    epoch, i + 1, scene_cumulative_loss - scene_pre_loss))
                vis.plot('scene_loss', scene_cumulative_loss - scene_pre_loss)
                scene_pre_loss = scene_cumulative_loss
                vis.log('[Epoch %d,Iter %d] action loss= %f' % (
                    epoch, i + 1, action_cumulative_loss - action_pre_loss))
                vis.plot('action_loss',
                         action_cumulative_loss - action_pre_loss)
                action_pre_loss = action_cumulative_loss
            if opt.debug:
                if (i + 1) // opt.log_interval == 3:
                    break
        vis.log('[Epoch %d] scene loss=%f,action loss=%f' % (
            epoch, scene_cumulative_loss, action_cumulative_loss))
        vis.log('[Epoch %d] time used: %f' % (epoch, time() - tic))
        vis.log('[Epoch %d] saving net' % epoch)
        save_path = './{0}/{1}_test-val{2}.params'.format(
            opt.output, str(opt.dataset + 'multi'), str(epoch))
        vis.log("save path %s" % save_path)
        net.save_parameters(save_path)

        label_inter = 1e-4
        label_union = 1e-4
        acc = nd.array([0], ctx=mx.cpu())
        val_iter = 0
        for i, (data, scene_label, action_label) in enumerate(val_loader):
            data = data.as_in_context(context)
            action_label = action_label.as_in_context(context)
            scene_pred, action_pred = net(data)
            scene_order = scene_pred.argsort()[:, ::-1]
            scene_order_np = scene_order.asnumpy()
            scene_label_np = scene_label.asnumpy()
            for scene_pred_v, scene_label_v in zip(scene_order_np,
                                                   scene_label_np):
                label_set = set([index for index, value in
                                 enumerate(scene_label_v) if value > 0.1])
                pred_top1 = set([scene_pred_v[0]])
                label_inter += len(pred_top1.intersection(label_set))
                label_union += len(pred_top1.union(label_set))
            action_pred = action_pred.argmax(axis=1)
            acc += (action_pred == action_label.astype('float32')).mean().asscalar()
            val_iter += 1
            if (i + 1) % opt.log_interval == 0:
                vis.log("[Epoch %d,Iter %d],action_acc= %f" % (
                    epoch, i, acc.asscalar() / val_iter))
                vis.log("[Epoch %d,Iter %d],scene_top1=%f" % (
                    epoch, i, label_inter / label_union))
            if opt.debug:
                if (i + 1) // opt.log_interval == 2:
                    break
    vis.log("""----------------------------------------
    ----XXXX------finished------------------
    ----------------------------------------""")