def test_optimizer(optimizer, optimizer_params):
    """Smoke-test an optimizer from the registry with a short update loop.

    :param optimizer: Optimizer name understood by ``opt.create``.
    :param optimizer_params: Keyword arguments forwarded to ``opt.create``.
    """
    # Weights
    index = 0
    weight = nd.zeros(shape=(8,))
    # Optimizer from registry
    optimizer = opt.create(optimizer, **optimizer_params)
    state = optimizer.create_state(index, weight)
    # Run a few updates
    for i in range(1, 13):
        grad = nd.random_normal(shape=(8,))
        if isinstance(optimizer, SockeyeOptimizer):
            batch_state = BatchState(metric_val=random())
            optimizer.pre_update_batch(batch_state)
        optimizer.update(index, weight, grad, state)
        # Checkpoint every third update.
        if i % 3 == 0:
            if isinstance(optimizer, SockeyeOptimizer):
                # BUG FIX: inside this branch i % 3 == 0, so the original
                # expression (i % 3 + 1) was always the constant 1; i // 3
                # makes the checkpoint counter actually advance (1, 2, 3, 4).
                checkpoint_state = CheckpointState(checkpoint=(i // 3),
                                                   metric_val=random())
                optimizer.pre_update_checkpoint(checkpoint_state)
def test_optimizer(optimizer, optimizer_params):
    """Exercise a registry-created optimizer with a dozen synthetic updates."""
    index = 0
    weight = nd.zeros(shape=(8,))
    optimizer = opt.create(optimizer, **optimizer_params)
    state = optimizer.create_state(index, weight)
    for step in range(1, 13):
        grad = nd.random_normal(shape=(8,))
        # Sockeye optimizers want per-batch metric state before each update.
        if isinstance(optimizer, SockeyeOptimizer):
            optimizer.pre_update_batch(BatchState(metric_val=random()))
        optimizer.update(index, weight, grad, state)
        # Simulate a checkpoint every third step.
        if step % 3 == 0 and isinstance(optimizer, SockeyeOptimizer):
            optimizer.pre_update_checkpoint(
                CheckpointState(checkpoint=(step % 3 + 1),
                                metric_val=random()))
def network_backprop_setup(self, grad_req, arg_names, arg_shapes, eval_metric):
    """Allocate gradient buffers, build the optimizer/updater and the metric.

    :param grad_req: gradient request mode; 'null' skips gradient allocation.
    :param arg_names: argument names of the bound symbol.
    :param arg_shapes: shapes matching ``arg_names``.
    :param eval_metric: metric name or spec understood by ``metric.create``.
    :return: the created evaluation metric.
    """
    if grad_req != 'null':
        self.grad_params = {}
        # Inputs, labels and fixed weights never receive gradients.
        # IDIOM FIX: str.endswith accepts a tuple of suffixes — one call
        # replaces the original chain of eight `or`-ed endswith() calls.
        no_grad_suffixes = ('data', 'mean_face', 'cls_label', 'proj_weight',
                            'proj_label', 'ground_truth', 'ellipse_label',
                            'bbox_weight')
        for name, shape in zip(arg_names, arg_shapes):
            if not name.endswith(no_grad_suffixes):
                self.grad_params[name] = mx.nd.zeros(shape, self.ctx)
    # setting the required optimizer
    self.optimizer = opt.create(self.optimizer, rescale_grad=1.0, **(self.kwargs))
    self.updater = get_updater(self.optimizer)
    eval_metric = metric.create(eval_metric)
    return eval_metric
def init_optimizer(self, kvstore='local', optimizer='sgd',
                   optimizer_params=(('learning_rate', 0.01),), force_init=False):
    """Install and initialize optimizers.

    Parameters
    ----------
    kvstore : str or KVStore
        Default `'local'`.
    optimizer : str or Optimizer
        Default `'sgd'`
    optimizer_params : dict
        Default `(('learning_rate', 0.01),)`. The default value is not a
        dictionary, just to avoid pylint warning of dangerous default values.
    force_init : bool
        Default `False`, indicating whether we should force re-initializing
        the optimizer in the case an optimizer is already installed.
    """
    assert self.binded and self.params_initialized
    if self.optimizer_initialized and not force_init:
        self.logger.warning('optimizer already initialized, ignoring...')
        return

    (kvstore, update_on_kvstore) = \
        _create_kvstore(kvstore, len(self._context), self._arg_params)

    # Gradients are rescaled by the effective global batch size.
    batch_size = self._exec_group.batch_size
    if kvstore and 'dist' in kvstore.type and '_sync' in kvstore.type:
        batch_size *= kvstore.num_workers
    rescale_grad = 1.0 / batch_size

    if isinstance(optimizer, str):
        # Map parameter indices to names so per-parameter options (lr_mult,
        # wd_mult, ...) can be resolved by the optimizer.
        name_by_index = {}
        if update_on_kvstore:
            name_by_index.update(enumerate(self._exec_group.param_names))
        else:
            n_devices = len(self._context)
            for dev in range(n_devices):
                for pos, pname in enumerate(self._exec_group.param_names):
                    name_by_index[pos * n_devices + dev] = pname
        optimizer_params = dict(optimizer_params)
        if 'rescale_grad' not in optimizer_params:
            optimizer_params['rescale_grad'] = rescale_grad
        optimizer = opt.create(optimizer,
                               sym=self.symbol, param_idx2name=name_by_index,
                               **optimizer_params)
    else:
        assert isinstance(optimizer, opt.Optimizer)
        if optimizer.rescale_grad != rescale_grad:
            #pylint: disable=no-member
            warnings.warn(
                "Optimizer created manually outside Module but rescale_grad " +
                "is not normalized to 1.0/batch_size/num_workers (%s vs. %s). " % (
                    optimizer.rescale_grad, rescale_grad) +
                "Is this intended?", stacklevel=2)

    self._optimizer = optimizer
    self._kvstore = kvstore
    self._update_on_kvstore = update_on_kvstore
    self._updater = None

    if kvstore:
        # copy initialized local parameters to kvstore
        _initialize_kvstore(kvstore=kvstore,
                            param_arrays=self._exec_group.param_arrays,
                            arg_params=self._arg_params,
                            param_names=self._param_names,
                            update_on_kvstore=update_on_kvstore)
    if update_on_kvstore:
        kvstore.set_optimizer(self._optimizer)
    else:
        self._updater = opt.get_updater(optimizer)

    self.optimizer_initialized = True

    if self._preload_opt_states is not None:
        self.load_optimizer_states(self._preload_opt_states)
        self._preload_opt_states = None
def fit(self, train_data, eval_data=None, eval_metric='acc', grad_req='write',
        epoch_end_callback=None, batch_end_callback=None, kvstore='local',
        logger=None):
    """Train the bound symbol on `train_data`, optionally evaluating on
    `eval_data` after each epoch.

    :param train_data: iterator providing data/label batches.
    :param eval_data: optional validation iterator.
    :param eval_metric: metric name or spec for ``metric.create``.
    :param grad_req: gradient request mode ('write'/'null'/...).
    :param epoch_end_callback: optional ``f(epoch, symbol, arg_params, aux_params)``.
    :param batch_end_callback: optional ``f(BatchEndParam)``.
    """
    if logger is None:
        logger = logging
    logging.info('Start training with %s', str(self.ctx))
    arg_shapes, out_shapes, aux_shapes = self.symbol.infer_shape(
        data=train_data.provide_data[0][1])
    arg_names = self.symbol.list_arguments()
    if grad_req != 'null':
        self.grad_params = {}
        for name, shape in zip(arg_names, arg_shapes):
            if not (name.endswith('data') or name.endswith('label')):
                self.grad_params[name] = mx.nd.zeros(shape, self.ctx)
    else:
        self.grad_params = None
    aux_names = self.symbol.list_auxiliary_states()
    self.aux_params = {k: nd.zeros(s) for k, s in zip(aux_names, aux_shapes)}
    data_name = train_data.data_name
    label_name = train_data.label_name
    input_names = [data_name, label_name]
    self.optimizer = opt.create(self.optimizer,
                                rescale_grad=(1.0 / train_data.get_batch_size()),
                                **(self.kwargs))
    self.updater = get_updater(self.optimizer)
    eval_metric = metric.create(eval_metric)
    # begin training
    for epoch in range(self.begin_epoch, self.num_epoch):
        nbatch = 0
        train_data.reset()
        eval_metric.reset()
        for data in train_data:
            nbatch += 1
            label_shape = data[label_name].shape
            self.arg_params[data_name] = mx.nd.array(data[data_name], self.ctx)
            self.arg_params[label_name] = mx.nd.array(
                data[label_name].reshape(label_shape[0],
                                         label_shape[1] * label_shape[2]), self.ctx)
            output_names = self.symbol.list_outputs()
            self.exector = self.symbol.bind(self.ctx, self.arg_params,
                                            args_grad=self.grad_params,
                                            grad_req=grad_req,
                                            aux_states=self.aux_params)
            assert len(self.symbol.list_arguments()) == len(self.exector.grad_arrays)
            # BUG FIX: `if nd` triggered NDArray truth-value evaluation; the
            # intent (as in the sibling fit() in this file) is a presence
            # check, so test `is not None`.
            update_dict = {name: grad for name, grad in
                           zip(self.symbol.list_arguments(), self.exector.grad_arrays)
                           if grad is not None}
            output_dict = {}
            output_buff = {}
            for key, arr in zip(self.symbol.list_outputs(), self.exector.outputs):
                output_dict[key] = arr
                output_buff[key] = mx.nd.empty(arr.shape, ctx=mx.cpu())
            self.exector.forward(is_train=True)
            for key in output_dict:
                output_dict[key].copyto(output_buff[key])
            self.exector.backward()
            for key, arr in update_dict.items():
                if key != "bigscore_weight":
                    self.updater(key, arr, self.arg_params[key])
            pred_shape = self.exector.outputs[0].shape
            label = mx.nd.array(data[label_name].reshape(
                label_shape[0], label_shape[1] * label_shape[2]))
            pred = mx.nd.array(output_buff["softmax_output"].asnumpy().reshape(
                pred_shape[0], pred_shape[1], pred_shape[2] * pred_shape[3]))
            eval_metric.update([label], [pred])
            self.exector.outputs[0].wait_to_read()
            # BUG FIX: batch_end_callback defaults to None but was called
            # unconditionally; guard it like epoch_end_callback below.
            if batch_end_callback is not None:
                batch_end_params = BatchEndParam(epoch=epoch, nbatch=nbatch,
                                                 eval_metric=eval_metric)
                batch_end_callback(batch_end_params)
        if epoch_end_callback is not None:
            epoch_end_callback(epoch, self.symbol, self.arg_params, self.aux_params)
        name, value = eval_metric.get()
        logger.info(" --->Epoch[%d] Train-%s=%f", epoch, name, value)
        # evaluation
        if eval_data:
            logger.info(" in eval process...")
            nbatch = 0
            eval_data.reset()
            eval_metric.reset()
            for data in eval_data:
                nbatch += 1
                label_shape = data[label_name].shape
                self.arg_params[data_name] = mx.nd.array(data[data_name], self.ctx)
                self.arg_params[label_name] = mx.nd.array(
                    data[label_name].reshape(label_shape[0],
                                             label_shape[1] * label_shape[2]), self.ctx)
                exector = self.symbol.bind(self.ctx, self.arg_params,
                                           args_grad=self.grad_params,
                                           grad_req=grad_req,
                                           aux_states=self.aux_params)
                cpu_output_array = mx.nd.zeros(exector.outputs[0].shape)
                exector.forward(is_train=False)
                exector.outputs[0].copyto(cpu_output_array)
                pred_shape = cpu_output_array.shape
                label = mx.nd.array(data[label_name].reshape(
                    label_shape[0], label_shape[1] * label_shape[2]))
                pred = mx.nd.array(cpu_output_array.asnumpy().reshape(
                    pred_shape[0], pred_shape[1], pred_shape[2] * pred_shape[3]))
                eval_metric.update([label], [pred])
                exector.outputs[0].wait_to_read()
            name, value = eval_metric.get()
            logger.info('batch[%d] Validation-%s=%f', nbatch, name, value)
def fit(self, train_data, eval_data=None, eval_metric='acc', grad_req='write',
        epoch_end_callback=None, batch_end_callback=None, kvstore='local',
        logger=None):
    """Train the detection network; optionally run detection/depth evaluation
    after each epoch.

    NOTE(review): reconstructed from a whitespace-mangled source; statement
    grouping follows the apparent original structure — verify against history.

    :param train_data: iterator yielding ``(batch, _)`` pairs.
    :param eval_data: optional iterator yielding ``(batch, filenames)`` pairs.
    """
    global outimgiter
    if logger is None:
        logger = logging
    logging.info('Start training with %s', str(self.ctx))
    logging.info(str(self.kwargs))
    batch_size = train_data.provide_data[0][1][0]
    arg_shapes, out_shapes, aux_shapes = self.symbol.infer_shape( \
        data=tuple(train_data.provide_data[0][1]), label_det=(batch_size, 200, 6))
    arg_names = self.symbol.list_arguments()
    out_names = self.symbol.list_outputs()
    aux_names = self.symbol.list_auxiliary_states()
    if grad_req != 'null':
        self.grad_params = {}
        for name, shape in zip(arg_names, arg_shapes):
            if not (name.endswith('data') or name.endswith('label')):
                self.grad_params[name] = mx.nd.zeros(shape, self.ctx)
    else:
        self.grad_params = None
    self.aux_params = {k: mx.nd.zeros(s, self.ctx)
                       for k, s in zip(aux_names, aux_shapes)}
    data_name = train_data.provide_data[0][0]
    label_name_det = train_data.provide_label[0][0]
    label_name_seg = train_data.provide_label[1][0]
    input_names = [data_name, label_name_det, label_name_seg]
    print(train_data.provide_label)
    # BUG FIX: indexing os.environ raises KeyError when the variable is not
    # set; .get() prints None instead and keeps training alive.
    print(os.environ.get("MXNET_CUDNN_AUTOTUNE_DEFAULT"))
    self.optimizer = opt.create(self.optimizer,
                                rescale_grad=(1.0 / train_data.batch_size),
                                **(self.kwargs))
    self.updater = get_updater(self.optimizer)
    eval_metric = CustomAccuracyMetric()  # metric.create(eval_metric)
    multibox_metric = MultiBoxMetric()
    eval_metrics = metric.CompositeEvalMetric()
    eval_metrics.add(multibox_metric)
    # begin training
    for epoch in range(self.begin_epoch, self.num_epoch):
        nbatch = 0
        train_data.reset()
        eval_metrics.reset()
        logger.info('learning rate: ' + str(self.optimizer.learning_rate))
        for data, _ in train_data:
            if self.evaluation_only:
                break
            nbatch += 1
            label_shape_det = data.label[0].shape
            label_shape_seg = data.label[1].shape
            self.arg_params[data_name] = mx.nd.array(data.data[0], self.ctx)
            self.arg_params[label_name_det] = mx.nd.array(data.label[0], self.ctx)
            self.arg_params[label_name_seg] = mx.nd.array(data.label[1], self.ctx)
            output_names = self.symbol.list_outputs()
            self.executor = self.symbol.bind(self.ctx, self.arg_params,
                                             args_grad=self.grad_params,
                                             grad_req=grad_req,
                                             aux_states=self.aux_params)
            assert len(self.symbol.list_arguments()) == len(self.executor.grad_arrays)
            update_dict = {name: g for name, g in
                           zip(self.symbol.list_arguments(), self.executor.grad_arrays)
                           if g is not None}
            output_dict = {}
            output_buff = {}
            for key, arr in zip(self.symbol.list_outputs(), self.executor.outputs):
                output_dict[key] = arr
                output_buff[key] = mx.nd.empty(arr.shape, ctx=mx.cpu())

            def stat_helper(name, array):
                """wrapper for executor monitor callback (debug only)"""
                import ctypes
                from mxnet.ndarray import NDArray
                from mxnet.base import NDArrayHandle, py_str
                array = ctypes.cast(array, NDArrayHandle)
                if 0:
                    array = NDArray(array, writable=False).asnumpy()
                    print(name, array.shape, np.mean(array), np.std(array),
                          ('%.1fms' % (float(time.time() - stat_helper.start_time) * 1000)))
                else:
                    array = NDArray(array, writable=False)
                    array.wait_to_read()
                    elapsed = float(time.time() - stat_helper.start_time) * 1000.
                    if elapsed > 5:
                        print(name, array.shape, ('%.1fms' % (elapsed,)))
                stat_helper.start_time = time.time()
            stat_helper.start_time = float(time.time())
            # self.executor.set_monitor_callback(stat_helper)

            tic = time.time()
            self.executor.forward(is_train=True)
            for key in output_dict:
                output_dict[key].copyto(output_buff[key])
            self.executor.backward()
            for key, arr in update_dict.items():
                if key != "bigscore_weight":
                    self.updater(key, arr, self.arg_params[key])
            for output in self.executor.outputs:
                output.wait_to_read()
            if TIMING:
                print("%.0fms" % ((time.time() - tic) * 1000.,))
            output_dict = dict(zip(output_names, self.executor.outputs))
            pred_det_shape = output_dict["det_out_output"].shape
            label_det = mx.nd.array(data.label[0].reshape(
                (label_shape_det[0], label_shape_det[1] * label_shape_det[2])))
            pred_det = mx.nd.array(output_buff["det_out_output"].reshape(
                (pred_det_shape[0], pred_det_shape[1], pred_det_shape[2])))
            if DEBUG:
                print(data.label[0].asnumpy()[0, :2, :])
            if TIMING:
                print("%.0fms" % ((time.time() - tic) * 1000.,))
            eval_metrics.get_metric(0).update(
                [mx.nd.zeros(output_buff["cls_prob_output"].shape),
                 mx.nd.zeros(output_buff["loc_loss_output"].shape), label_det],
                [output_buff["cls_prob_output"], output_buff["loc_loss_output"],
                 output_buff["cls_label_output"]])
            self.executor.outputs[0].wait_to_read()
            # BUG FIX: batch_end_callback defaults to None but was invoked
            # unconditionally; guard like epoch_end_callback below.
            if batch_end_callback is not None:
                batch_end_params = BatchEndParam(epoch=epoch, nbatch=nbatch,
                                                 eval_metric=eval_metrics)
                batch_end_callback(batch_end_params)
            if TIMING:
                print("%.0fms" % ((time.time() - tic) * 1000.,))
        ##### save snapshot
        if (not self.evaluation_only) and (epoch_end_callback is not None):
            epoch_end_callback(epoch, self.symbol, self.arg_params, self.aux_params)
        names, values = eval_metrics.get()
        for name, value in zip(names, values):
            logger.info(" --->Epoch[%d] Train-%s=%f", epoch, name, value)
        # evaluation
        if eval_data:
            logger.info(" in eval process...")
            nbatch = 0
            depth_metric = DistanceAccuracyMetric(class_names=self.class_names)
            eval_data.reset()
            eval_metrics.reset()
            self.valid_metric.reset()
            depth_metric.reset()
            timing_results = []
            for data, fnames in eval_data:
                nbatch += 1
                label_shape_det = data.label[0].shape
                self.arg_params[data_name] = mx.nd.array(data.data[0], self.ctx)
                self.arg_params[label_name_det] = mx.nd.array(data.label[0], self.ctx)
                self.executor = self.symbol.bind(self.ctx, self.arg_params,
                                                 args_grad=self.grad_params,
                                                 grad_req=grad_req,
                                                 aux_states=self.aux_params)
                output_names = self.symbol.list_outputs()
                output_dict = dict(zip(output_names, self.executor.outputs))
                ############## forward
                tic = time.time()
                self.executor.forward(is_train=True)
                timing_results.append((time.time() - tic) * 1000.)
                pred_det_shape = output_dict["det_out_output"].shape
                label_det = mx.nd.array(data.label[0].reshape(
                    (label_shape_det[0], label_shape_det[1] * label_shape_det[2])))
                pred_det = mx.nd.array(output_dict["det_out_output"].reshape(
                    (pred_det_shape[0], pred_det_shape[1], pred_det_shape[2])))
                #### remove invalid boxes
                out_dets = output_dict["det_out_output"].asnumpy()
                assert len(out_dets.shape) == 3
                pred_det = np.zeros((batch_size, 200, 7), np.float32) - 1.
                for idx, out_det in enumerate(out_dets):
                    assert len(out_det.shape) == 2
                    out_det = np.expand_dims(out_det, axis=0)
                    indices = np.where(out_det[:, :, 0] >= 0)  # labeled as negative
                    out_det = np.expand_dims(out_det[indices[0], indices[1], :], axis=0)
                    indices = np.where(out_det[:, :, 1] > .25)  # higher confidence
                    out_det = np.expand_dims(out_det[indices[0], indices[1], :], axis=0)
                    pred_det[idx, :out_det.shape[1], :] = out_det
                    del out_det
                pred_det = mx.nd.array(pred_det)
                ##### display results
                if False:  # self.evaluation_only:
                    for imgidx in range(eval_data.batch_size):
                        img = np.squeeze(data.data[0].asnumpy()[imgidx, :, :, :])
                        det = pred_det.asnumpy()[imgidx, :, :]
                        ### ground-truth
                        gt = label_det.asnumpy()[imgidx, :].reshape((-1, 6))
                        # display result
                        display_img = display_results(img, det, gt, self.class_names)
                        res_fname = fnames[imgidx].replace("SegmentationClass", "Results").replace("labelIds", "results")
                        if cv2.imwrite(res_fname, display_img):
                            print(res_fname, 'saved.')
                        [exit(0) if (cv2.waitKey() & 0xff) == 27 else None]
                        outimgiter += 1
                if self.evaluation_only:
                    continue
                eval_metrics.get_metric(0).update(
                    None,
                    [output_dict["cls_prob_output"], output_dict["loc_loss_output"],
                     output_dict["cls_label_output"]])
                self.valid_metric.update(
                    [mx.nd.slice_axis(data.label[0], axis=2, begin=0, end=5)],
                    [mx.nd.slice_axis(pred_det, axis=2, begin=0, end=6)])
                disparities = []
                for imgidx in range(batch_size):
                    dispname = fnames[imgidx].replace("SegmentationClass", "Disparity").replace("gtFine_labelTrainIds", "disparity")
                    disparities.append(cv2.imread(dispname, -1))
                    # BUG FIX: assert the image just loaded — the original
                    # checked disparities[0] on every iteration, so a missing
                    # file after the first went undetected.
                    assert disparities[-1] is not None, dispname + " not found."
                depth_metric.update(mx.nd.array(disparities), [pred_det])
                det_metric = self.valid_metric
                det_names, det_values = det_metric.get()
                depth_names, depth_values = depth_metric.get()
                print("\r %d/%d speed=%.1fms %.1f%% %s=%.1f %s=%.1f" % \
                      (nbatch * eval_data.batch_size, eval_data.num_samples,
                       math.fsum(timing_results) / float(nbatch),
                       float(nbatch * eval_data.batch_size) * 100. / float(eval_data.num_samples),
                       det_names[-1], det_values[-1] * 100.,
                       depth_names[-1], depth_values[-1] * 100.,), end='\r')
            names, values = eval_metrics.get()
            for name, value in zip(names, values):
                logger.info(' epoch[%d] Validation-%s=%f', epoch, name, value)
            logger.info('----------------------------------------------')
            print(' & '.join(names))
            print(' & '.join(map(lambda v: '%.1f' % (v * 100.,), values)))
            logger.info('----------------------------------------------')
            names, values = self.valid_metric.get()
            for name, value in zip(names, values):
                logger.info(' epoch[%d] Validation-%s=%f', epoch, name, value)
            logger.info('----------------------------------------------')
            print(' & '.join(names))
            print(' & '.join(map(lambda v: '%.1f' % (v * 100.,), values)))
            logger.info('----------------------------------------------')
            names, values = depth_metric.get()
            for name, value in zip(names, values):
                logger.info(' epoch[%d] Validation-%s=%f', epoch, name, value)
            logger.info('----------------------------------------------')
            print(' & '.join(names))
            print(' & '.join(map(lambda v: '%.1f' % (v * 100.,), values)))
            logger.info('----------------------------------------------')
            if self.evaluation_only:
                exit(0)  ## for debugging only
def fit(self, train_data, eval_data=None, eval_metric='acc', grad_req='write',
        epoch_end_callback=None, batch_end_callback=None, kvstore='local',
        logger=None):
    """Train the bound symbol on `train_data`; optionally evaluate on
    `eval_data` after each epoch.

    :param train_data: iterator providing data/label batches.
    :param eval_data: optional validation iterator.
    :param eval_metric: metric name or spec for ``metric.create``.
    :param grad_req: gradient request mode ('write'/'null'/...).
    :param epoch_end_callback: optional ``f(epoch, symbol, arg_params, aux_params)``.
    :param batch_end_callback: optional ``f(BatchEndParam)``.
    """
    if logger is None:
        logger = logging
    logging.info('Start training with %s', str(self.ctx))
    arg_shapes, out_shapes, aux_shapes = self.symbol.infer_shape(
        data=train_data.provide_data[0][1])
    arg_names = self.symbol.list_arguments()
    if grad_req != 'null':
        self.grad_params = {}
        for name, shape in zip(arg_names, arg_shapes):
            if not (name.endswith('data') or name.endswith('label')):
                self.grad_params[name] = mx.nd.zeros(shape, self.ctx)
    else:
        self.grad_params = None
    aux_names = self.symbol.list_auxiliary_states()
    self.aux_params = {
        k: nd.zeros(s)
        for k, s in zip(aux_names, aux_shapes)
    }
    data_name = train_data.data_name
    label_name = train_data.label_name
    input_names = [data_name, label_name]
    self.optimizer = opt.create(
        self.optimizer,
        rescale_grad=(1.0 / train_data.get_batch_size()),
        **(self.kwargs))
    self.updater = get_updater(self.optimizer)
    eval_metric = metric.create(eval_metric)
    # begin training
    for epoch in range(self.begin_epoch, self.num_epoch):
        nbatch = 0
        train_data.reset()
        eval_metric.reset()
        for data in train_data:
            nbatch += 1
            label_shape = data[label_name].shape
            self.arg_params[data_name] = mx.nd.array(
                data[data_name], self.ctx)
            self.arg_params[label_name] = mx.nd.array(
                data[label_name].reshape(label_shape[0],
                                         label_shape[1] * label_shape[2]),
                self.ctx)
            output_names = self.symbol.list_outputs()
            self.exector = self.symbol.bind(self.ctx,
                                            self.arg_params,
                                            args_grad=self.grad_params,
                                            grad_req=grad_req,
                                            aux_states=self.aux_params)
            assert len(self.symbol.list_arguments()) == len(
                self.exector.grad_arrays)
            update_dict = {name: grad for name, grad in
                           zip(self.symbol.list_arguments(),
                               self.exector.grad_arrays)
                           if grad is not None}
            output_dict = {}
            output_buff = {}
            for key, arr in zip(self.symbol.list_outputs(),
                                self.exector.outputs):
                output_dict[key] = arr
                output_buff[key] = mx.nd.empty(arr.shape, ctx=mx.cpu())
            self.exector.forward(is_train=True)
            for key in output_dict:
                output_dict[key].copyto(output_buff[key])
            self.exector.backward()
            for key, arr in update_dict.items():
                if key != "bigscore_weight":
                    self.updater(key, arr, self.arg_params[key])
            pred_shape = self.exector.outputs[0].shape
            label = mx.nd.array(data[label_name].reshape(
                label_shape[0], label_shape[1] * label_shape[2]))
            pred = mx.nd.array(output_buff["softmax_output"].asnumpy().reshape(
                pred_shape[0], pred_shape[1], pred_shape[2] * pred_shape[3]))
            eval_metric.update([label], [pred])
            self.exector.outputs[0].wait_to_read()
            # BUG FIX: batch_end_callback defaults to None but was called
            # unconditionally; guard it like epoch_end_callback below.
            if batch_end_callback is not None:
                batch_end_params = BatchEndParam(epoch=epoch,
                                                 nbatch=nbatch,
                                                 eval_metric=eval_metric)
                batch_end_callback(batch_end_params)
        if epoch_end_callback is not None:
            epoch_end_callback(epoch, self.symbol, self.arg_params,
                               self.aux_params)
        name, value = eval_metric.get()
        logger.info(" --->Epoch[%d] Train-%s=%f", epoch, name, value)
        # evaluation
        if eval_data:
            logger.info(" in eval process...")
            nbatch = 0
            eval_data.reset()
            eval_metric.reset()
            for data in eval_data:
                nbatch += 1
                label_shape = data[label_name].shape
                self.arg_params[data_name] = mx.nd.array(
                    data[data_name], self.ctx)
                self.arg_params[label_name] = mx.nd.array(
                    data[label_name].reshape(label_shape[0],
                                             label_shape[1] * label_shape[2]),
                    self.ctx)
                exector = self.symbol.bind(self.ctx,
                                           self.arg_params,
                                           args_grad=self.grad_params,
                                           grad_req=grad_req,
                                           aux_states=self.aux_params)
                cpu_output_array = mx.nd.zeros(exector.outputs[0].shape)
                exector.forward(is_train=False)
                exector.outputs[0].copyto(cpu_output_array)
                pred_shape = cpu_output_array.shape
                label = mx.nd.array(data[label_name].reshape(
                    label_shape[0], label_shape[1] * label_shape[2]))
                pred = mx.nd.array(cpu_output_array.asnumpy().reshape(
                    pred_shape[0], pred_shape[1],
                    pred_shape[2] * pred_shape[3]))
                eval_metric.update([label], [pred])
                exector.outputs[0].wait_to_read()
            name, value = eval_metric.get()
            logger.info('batch[%d] Validation-%s=%f', nbatch, name, value)
def fit(self, X, marks, e_marks=None, y=None, eval_data=None, eval_metric='acc',
        epoch_end_callback=None, batch_end_callback=None, time_step_callback=None,
        kvstore='local', logger=None, work_load_list=None, monitor=None,
        eval_batch_end_callback=None):
    """Overwrite.

    Prepare iterators, metric, kvstore and optimizer, then delegate the
    actual training loop to ``_train_rnn``.

    :raises TypeError: if ``self.optimizer`` is neither an optimizer name
        (str) nor an ``opt.Optimizer`` instance.
    """
    data = self._init_iter(X, y, is_train=True)
    eval_data = self._init_eval_iter(eval_data)
    if self.sym_gen:
        self.symbol = self.sym_gen(
            data.default_bucket_key)  # pylint: disable=no-member
        self._check_arguments()
    self.kwargs["sym"] = self.symbol
    param_dict = dict(data.provide_data + data.provide_label)
    arg_names, param_names, aux_names = self._init_params(param_dict)
    # setup metric
    if not isinstance(eval_metric, metric.EvalMetric):
        eval_metric = metric.create(eval_metric)
    # create kvstore
    (kvstore, update_on_kvstore) = _create_kvstore(
        kvstore, len(self.ctx), self.arg_params)
    param_idx2name = {}
    if update_on_kvstore:
        param_idx2name.update(enumerate(param_names))
    else:
        for i, n in enumerate(param_names):
            for k in range(len(self.ctx)):
                param_idx2name[i * len(self.ctx) + k] = n
    self.kwargs["param_idx2name"] = param_idx2name
    # init optmizer
    if isinstance(self.optimizer, str):
        batch_size = data.batch_size
        if kvstore and kvstore.type == 'dist_sync':
            batch_size *= kvstore.num_workers
        optimizer = opt.create(self.optimizer,
                               rescale_grad=(1.0 / batch_size),
                               **(self.kwargs))
    elif isinstance(self.optimizer, opt.Optimizer):
        optimizer = self.optimizer
    else:
        # BUG FIX: previously this fell through with `optimizer` unbound,
        # producing a confusing NameError inside _train_rnn; fail fast.
        raise TypeError("optimizer must be an optimizer name (str) or an "
                        "opt.Optimizer instance, got %r" % (self.optimizer,))
    # do training
    _train_rnn(self.symbol, self.ctx, marks, arg_names, param_names, aux_names,
               self.arg_params, self.aux_params,
               begin_epoch=self.begin_epoch, end_epoch=self.num_epoch,
               epoch_size=self.epoch_size, optimizer=optimizer,
               train_data=data, eval_data=eval_data, eval_metric=eval_metric,
               epoch_end_callback=epoch_end_callback,
               batch_end_callback=batch_end_callback,
               time_step_callback=time_step_callback,
               kvstore=kvstore, update_on_kvstore=update_on_kvstore,
               logger=logger, work_load_list=work_load_list, monitor=monitor,
               eval_batch_end_callback=eval_batch_end_callback,
               sym_gen=self.sym_gen, e_marks=e_marks)
def fddb_finetune_fold(fold_index):
    """Fine-tune the FDDB ellipse-regression head for one cross-validation fold.

    ``fold_index`` (one of "01".."10") names the held-out validation fold;
    the other nine folds form the training set.  Relies on module-level state
    (``feature_fold``, ``label_fold``, ``weight_fold``, ``num_feature_fold``,
    ``batchsize``, ``ctx``, ``retrain``, ...).

    NOTE(review): ported from Python-2 syntax (print statements, xrange) to
    Python 3 for consistency with the rest of this file.
    """
    target_index = ["01", "02", "03", "04", "05", "06", "07", "08", "09", "10"]
    # Count samples per split.
    num_train_feature = 0
    num_valid_feature = 0
    for index in target_index:
        if index != fold_index:
            num_train_feature += num_feature_fold[index]
        else:
            num_valid_feature += num_feature_fold[index]
    # np.float was only an alias for the builtin float (removed in NumPy
    # 1.24); `float` keeps the identical float64 dtype.
    train_feature = np.zeros((num_train_feature, feature_len), dtype=float)
    train_label = np.zeros((num_train_feature, label_len), dtype=float)
    train_weight = np.zeros((num_train_feature, label_len), dtype=float)
    train_feature_index = 0
    valid_feature = np.zeros((num_valid_feature, feature_len), dtype=float)
    valid_label = np.zeros((num_valid_feature, label_len), dtype=float)
    valid_weight = np.zeros((num_valid_feature, label_len), dtype=float)
    valid_feature_index = 0
    # Scatter the cached per-fold features into the two splits.
    for index in target_index:
        for i in range(num_feature_fold[index]):
            if index != fold_index:
                train_feature[train_feature_index] = feature_fold[index][i]
                train_label[train_feature_index] = label_fold[index][i]
                train_weight[train_feature_index] = weight_fold[index][i]
                train_feature_index += 1
            else:
                valid_feature[valid_feature_index] = feature_fold[index][i]
                valid_label[valid_feature_index] = label_fold[index][i]
                valid_weight[valid_feature_index] = weight_fold[index][i]
                valid_feature_index += 1
    if retrain:
        # Fresh head, randomly initialized (uniform, scaled by fan-in).
        symbol_finetune = fddb_symbol_finetune.get_vgg16_finetune()
        args = {}
        auxs = {}
        arg_names = symbol_finetune.list_arguments()
        aux_names = symbol_finetune.list_auxiliary_states()
        arg_shapes, _, aux_shapes = symbol_finetune.infer_shape(
            data=(batchsize, feature_len))
        for name, shape in zip(arg_names, arg_shapes):
            if len(shape) < 1:
                continue
            fan_in, fan_out = np.prod(shape[1:]), shape[0]
            factor = fan_in
            scale = np.sqrt(2.34 / factor)
            tempt = np.random.uniform(-scale, scale, size=shape)
            args[name] = mx.nd.array(tempt, ctx)
        for name, shape in zip(aux_names, aux_shapes):
            if len(shape) < 1:
                continue
            fan_in, fan_out = np.prod(shape[1:]), shape[0]
            factor = fan_in
            scale = np.sqrt(2.34 / factor)
            tempt = np.random.uniform(-scale, scale, size=shape)
            auxs[name] = mx.nd.array(tempt, ctx)
    else:
        # Resume from checkpoint, copying every array onto the training context.
        symbol_finetune = fddb_symbol_finetune.get_vgg16_finetune()
        _, args, auxs = mx.model.load_checkpoint(rpn_prefix, load_epoch)
        for k, v in args.items():
            if v.context != ctx:
                args[k] = mx.nd.zeros(v.shape, ctx)
                v.copyto(args[k])
        for k, v in auxs.items():
            if v.context != ctx:
                auxs[k] = mx.nd.zeros(v.shape, ctx)
                v.copyto(auxs[k])
    arg_names = symbol_finetune.list_arguments()
    arg_shapes, _, aux_shapes = symbol_finetune.infer_shape(
        data=(batchsize, feature_len))
    grad_params = {}
    for name, shape in zip(arg_names, arg_shapes):
        if not (name.endswith('ell_label') or name.endswith('bbox_weight')
                or name.endswith('data')):
            grad_params[name] = mx.nd.zeros(shape, ctx)
    # BUG FIX: floor division — plain '/' yields a float under Python 3,
    # which breaks range(num_train_batch) below (int division under py2 too).
    num_train_batch = num_train_feature // batchsize
    lr = 0.03
    lr_decay = 0.33
    epoch_end_callback = mx.callback.do_checkpoint(finetune_prefix + "-" + fold_index)
    for j in range(start_epoch, end_epoch):
        bbox_predict_loss = np.array([.0, .0, .0])
        # Decay the learning rate every 50 epochs (and once on resume).
        if j % 50 == 0 or j == start_epoch:
            lr *= lr_decay
            optimizer = opt.create('sgd', rescale_grad=1.0 / batchsize,
                                   learning_rate=lr, momentum=0.9, wd=0.00001)
            updater = get_updater(optimizer)
        for i in range(num_train_batch):
            feature_b = train_feature[i * batchsize:(i + 1) * batchsize, :]
            label_b = train_label[i * batchsize:(i + 1) * batchsize, :]
            weight_b = train_weight[i * batchsize:(i + 1) * batchsize, :]
            args["data"] = mx.nd.array(feature_b, ctx)
            args["ell_label"] = mx.nd.array(label_b, ctx)
            args["bbox_weight"] = mx.nd.array(weight_b, ctx)
            executor = symbol_finetune.bind(ctx, args, args_grad=grad_params,
                                            grad_req='write', aux_states=auxs)
            assert len(symbol_finetune.list_arguments()) == len(executor.grad_arrays)
            # BUG FIX: test `is not None` — NDArray truthiness is not a
            # presence check.
            update_dict = {
                name: grad
                for name, grad in zip(symbol_finetune.list_arguments(),
                                      executor.grad_arrays) if grad is not None
            }
            output_dict = {}
            output_buff = {}
            for key, arr in zip(symbol_finetune.list_outputs(), executor.outputs):
                output_dict[key] = arr
                output_buff[key] = mx.nd.zeros(arr.shape, ctx=mx.cpu())
            executor.forward(is_train=True)
            for key in output_dict:
                output_dict[key].copyto(output_buff[key])
            executor.backward()
            for key, arr in update_dict.items():
                updater(key, arr, args[key])
            executor.outputs[0].wait_to_read()
            face_pred = output_buff["ellipse_predict_loss_output"].asnumpy()
            bbox_predict_b = bbox_predict_metric(label_b, face_pred, weight_b)
            bbox_predict_loss += bbox_predict_b
            if i % 10 == 0:
                print("Training-fold[" +
                      fold_index +
                      "]-epoch[%d/%d]-batch[%d/%d]: lr:%f\tbbox_regress:%f\tbbox_angle:%f\tiou_regress:%f" %
                      (j, end_epoch, i, num_train_batch, lr,
                       bbox_predict_b[0], bbox_predict_b[1], bbox_predict_b[2]))
        print("ALL Training: bbox_regress:%f\tbbox_angle:%f\tiou_regress:%f" %
              (bbox_predict_loss[0] / float(num_train_batch),
               bbox_predict_loss[1] / float(num_train_batch),
               bbox_predict_loss[2] / float(num_train_batch)))
        if j % 25 == 0:
            print("Saving the model:", j)
            epoch_end_callback(j, symbol_finetune, args, auxs)
        # Validation on the held-out fold (single full-batch forward pass).
        args["data"] = mx.nd.array(valid_feature, ctx)
        args["ell_label"] = mx.nd.array(valid_label, ctx)
        args["bbox_weight"] = mx.nd.array(
            np.ones((valid_feature.shape[0], label_len), dtype=float), ctx)
        executor = symbol_finetune.bind(ctx, args, args_grad=None,
                                        grad_req='null', aux_states=auxs)
        output_dict = {}
        output_buff = {}
        for key, arr in zip(symbol_finetune.list_outputs(), executor.outputs):
            output_dict[key] = arr
            output_buff[key] = mx.nd.zeros(arr.shape, ctx=mx.cpu())
        executor.forward(is_train=True)
        for key in output_dict:
            output_dict[key].copyto(output_buff[key])
        executor.outputs[0].wait_to_read()
        face_pred = output_buff["ellipse_predict_loss_output"].asnumpy()
        print(valid_label[0])
        print(face_pred[0])
        bbox_predict_b = bbox_predict_metric(valid_label, face_pred, valid_weight)
        print("ALL Validation: bbox_regress:%f\tbbox_angle:%f\tiou_regress:%f" %
              (bbox_predict_b[0], bbox_predict_b[1], bbox_predict_b[2]))
def run(mxIter): model_prefix = '/data2/obj_detect/imagenet_models/resnet/resnet-101' load_epoch = 0 #model_prefix = './stage1_models/tiny_face-06440' #load_epoch = 42 #model_prefix = './tiny_face-06440' #load_epoch = 140 head = '%(asctime)-15s %(message)s' logging.basicConfig(level=logging.DEBUG, format=head) input_shapes = get_input_shapes(mxIter.batch_size) optimizer = 'sgd' optimizer_params = { 'learning_rate': 0.0001, 'momentum' : 0.90, 'wd' : 0.0001} optimizer = opt.create(optimizer, rescale_grad=1.0 / mxIter.batch_size, **optimizer_params) updater = get_updater(optimizer) net = get_symbol_focal_loss() arg_params, aux_params = load_params_checkpoint(model_prefix, load_epoch) arg_names = net.list_arguments() param_names = [x for x in arg_names if x not in input_shapes] initializer = mx.init.Xavier(rnd_type='gaussian', factor_type="in", magnitude=2) delete_params_by_shape(net, arg_params, aux_params, input_shapes, initializer) exec_ = net.simple_bind(ctx=mx.gpu(2), **input_shapes) copy_params(arg_params, aux_params, exec_) param_arrays = [[exec_.arg_arrays[i]] for i,name in enumerate(arg_names) if name in param_names] grad_arrays = [[exec_.grad_arrays[i]] for i,name in enumerate(arg_names) if name in param_names] #monitor = mx.monitor.Monitor(interval=1, pattern='.*backward.*') #monitor.install(exec_) batch_size = mxIter.batch_size for epoch in range(load_epoch+1, 200): num_batch = 0 metric = 0 num_inst = 0 num_reg_inst = 0 reg_metric = 0 for batch in mxIter: load_data(batch, exec_) #monitor.tic() exec_.forward(is_train=True) outputs = [output.asnumpy() for output in exec_._get_outputs()] exec_.backward() #monitor.toc_print() _update_params(param_arrays, grad_arrays, updater, 1, param_names=param_names) num_batch += 1 # metric metric += np.sum(outputs[0]) reg_metric += np.sum(outputs[1]) print 'batch -> {}'.format(num_batch) print 'focal_loss -> {}'.format(metric / num_batch) print 'l1_loss -> {}'.format(reg_metric / num_batch) if num_batch % 1000 == 0: 
save_arg_params = {} for param_name in param_names: save_arg_params[param_name] = exec_.arg_dict[param_name] save_aux_params = exec_.aux_dict save_checkpoint('./tiny_face', num_batch, epoch, net, save_arg_params, save_aux_params) mxIter.reset() save_arg_params = {} for param_name in param_names: save_arg_params[param_name] = exec_.arg_dict[param_name] save_aux_params = exec_.aux_dict save_checkpoint('./tiny_face', num_batch, epoch, net, save_arg_params, save_aux_params)
def fit(self,
        train_data,
        eval_data=None,
        eval_metric='acc',
        grad_req='write',
        logger=None,
        softmax_metric=None,
        regression_metric=None,
        epoch_end_callback=None):
    """Train the RPN-style face symbol with a manually bound executor.

    For each epoch, first runs an optional validation pass over ``eval_data``,
    then a training pass over ``train_data``; logs per-batch classification
    accuracy, keypoint-projection regression loss and bounding-box/ellipse
    regression loss to both the console and 'log_rpn.txt'.

    Parameters
    ----------
    train_data : iterator of dict
        Yields batches keyed by the ``*_name`` attributes it exposes
        (``data_name``, ``cls_label_name``, ...); also provides
        ``mean_face``, ``class_names`` and ``AllImg`` metadata.
    eval_data : iterator of dict, optional
        Validation batches in the same format; evaluated at the start of
        every epoch when given.
    eval_metric : str
        Name passed to ``metric.create()``.
        # NOTE(review): the created metric is reset but never updated below,
        # so the end-of-epoch Train-cls log reflects an empty metric -- confirm.
    grad_req : str
        Gradient request for ``bind()``; 'null' disables gradient buffers.
    logger : logging-like, optional
        Defaults to the ``logging`` module.
    softmax_metric : callable, optional
        ``(label, pred, num_classes) -> per-class count matrix``.
        # NOTE(review): the per-batch log lines reference ``tempt``, which is
        # only assigned when softmax_metric is truthy -- passing None would
        # raise NameError at the first print. TODO confirm callers supply it.
    regression_metric : callable, optional
        ``(label, pred, weight) -> scalar`` loss for keypoint projection.
    epoch_end_callback : callable, optional
        Invoked every 1000 training batches as
        ``(epoch * 100000 + nbatch, symbol, arg_params, aux_params)``.
    """
    # NOTE(review): the log file is not closed on exceptions (no try/finally).
    f = open("log_rpn.txt", 'w')
    if logger is None:
        logger = logging
    logging.info('Start training with %s', str(self.ctx))
    f.write('Start training with %s\n' % str(self.ctx))
    # Fixed reference shapes used only to infer parameter shapes.
    arg_shapes, out_shapes, aux_shapes = self.symbol.infer_shape(
        data=(1, 3, 128, 128),
        mean_face=(10, 3),
        ground_truth=(10, 2),
        bbox_label=(10, 5))
    arg_names = self.symbol.list_arguments()
    if grad_req != 'null':
        # Allocate gradient buffers for trainable weights only; inputs,
        # labels and weighting masks are excluded by name suffix.
        self.grad_params = {}
        for name, shape in zip(arg_names, arg_shapes):
            if not (name.endswith('data') or name.endswith("mean_face")
                    or name.endswith('cls_label')
                    or name.endswith('proj_weight')
                    or name.endswith('proj_label')
                    or name.endswith('ground_truth')
                    or name.endswith('bbox_label')
                    or name.endswith("bbox_weight")):
                self.grad_params[name] = mx.nd.zeros(shape, self.ctx)
    else:
        self.grad_params = None
    aux_names = self.symbol.list_auxiliary_states()
    self.aux_params = {
        k: mx.nd.zeros(s, self.ctx)
        for k, s in zip(aux_names, aux_shapes)
    }
    # Batch-dict key names as exposed by the data iterator.
    data_name = train_data.data_name
    cls_label_name = train_data.cls_label_name
    proj_label_name = train_data.proj_label_name
    proj_weight_name = train_data.proj_weight_name
    ground_truth_name = train_data.ground_truth_name
    bbox_label_name = train_data.bbox_label_name
    bbox_weight_name = train_data.bbox_weight_name
    # Optimizer from the registry; gradients are not batch-rescaled here.
    self.optimizer = opt.create(self.optimizer,
                                rescale_grad=1.0,
                                **(self.kwargs))
    self.updater = get_updater(self.optimizer)
    eval_metric = metric.create(eval_metric)
    for epoch in range(self.begin_epoch, self.num_epoch):
        # ---------- validation pass (before training, each epoch) ----------
        if eval_data:
            logger.info(" in eval process...")
            f.write(" in eval process...")
            nbatch = 0
            # 11 classes x (total / correct / ...) counts -- shape fixed by
            # softmax_metric's contract; presumably rows are classes.
            softmax_proj = np.zeros((11, 3))
            proj_regression_loss = .0
            bbox_predict_loss = np.array([.0, .0])
            eval_data.reset()
            for data in eval_data:
                nbatch += 1
                print "Eval batch:", nbatch
                softmax_shape = data[cls_label_name].shape
                # Load the batch into arg_params, then (re)bind the executor
                # so shapes can vary per image.
                self.arg_params[data_name] = mx.nd.array(
                    data[data_name], self.ctx)
                self.arg_params[cls_label_name] = mx.nd.array(
                    data[cls_label_name].reshape(
                        (softmax_shape[0],
                         softmax_shape[1] * softmax_shape[2])), self.ctx)
                self.arg_params[proj_label_name] = mx.nd.array(
                    data[proj_label_name], self.ctx)
                self.arg_params[proj_weight_name] = mx.nd.array(
                    data[proj_weight_name], self.ctx)
                self.arg_params[ground_truth_name] = mx.nd.array(
                    data[ground_truth_name], self.ctx)
                self.arg_params[bbox_label_name] = mx.nd.array(
                    data[bbox_label_name], self.ctx)
                self.arg_params[bbox_weight_name] = mx.nd.array(
                    data[bbox_weight_name], self.ctx)
                self.arg_params["mean_face"] = mx.nd.array(
                    train_data.mean_face, self.ctx)
                executor = self.symbol.bind(self.ctx,
                                            self.arg_params,
                                            args_grad=self.grad_params,
                                            grad_req=grad_req,
                                            aux_states=self.aux_params)
                # Host-side buffers for the five outputs.
                softmax_output_array = mx.nd.zeros(
                    executor.outputs[0].shape)
                proj_regression_output_array = mx.nd.zeros(
                    executor.outputs[1].shape)
                bbox_predict_output_array = mx.nd.zeros(
                    executor.outputs[2].shape)
                ell_label = mx.nd.zeros(executor.outputs[3].shape)
                bbox_predict = mx.nd.zeros(executor.outputs[4].shape)
                executor.forward(is_train=True)
                executor.outputs[0].copyto(softmax_output_array)
                executor.outputs[1].copyto(proj_regression_output_array)
                executor.outputs[2].copyto(bbox_predict_output_array)
                executor.outputs[3].copyto(ell_label)
                executor.outputs[4].copyto(bbox_predict)
                # 255 marks "ignore" pixels; keep only labelled positions.
                softmax_shape = softmax_output_array.shape
                index_label = np.nonzero(data[cls_label_name].reshape(
                    softmax_shape[0],
                    softmax_shape[2] * softmax_shape[3]) - 255)
                label = mx.nd.array(data[cls_label_name].reshape(
                    softmax_shape[0],
                    softmax_shape[2] * softmax_shape[3])[:, index_label[1]])
                pred = mx.nd.array((softmax_output_array.asnumpy().reshape(
                    softmax_shape[0], softmax_shape[1],
                    softmax_shape[2] * softmax_shape[3]))[..., index_label[1]])
                if softmax_metric:
                    tempt = softmax_metric(label, pred, 11)
                    softmax_proj += tempt
                # Keypoint-projection regression, evaluated only where the
                # weight mask is non-zero.
                proj_label = data[proj_label_name]
                proj_weight = data[proj_weight_name]
                proj_pred = proj_regression_output_array.asnumpy().reshape(
                    data[proj_weight_name].shape)
                index_nonzero = np.nonzero(data[proj_weight_name])
                proj_regress_tmp = regression_metric(
                    proj_label[index_nonzero],
                    proj_pred[index_nonzero],
                    proj_weight[index_nonzero])
                proj_regression_loss += proj_regress_tmp
                bbox_pred = bbox_predict_output_array.asnumpy()
                bbox_predict_tmp = bbox_predict_metric(
                    ell_label.asnumpy(), bbox_pred)
                bbox_predict_loss += bbox_predict_tmp
                print "Validation-epoch[%d]-batch[%d]: acc:%f\tproj_regress:%f\tbbox_regress:%f\tbbox_angle:%f" % \
                    (epoch, nbatch, get_accuracy(tempt, self.bgfg), proj_regress_tmp, bbox_predict_tmp[0], bbox_predict_tmp[1])
                f.write(
                    "Validation-epoch[%d]-batch[%d]: acc:%f\tproj_regress:%f\tbbox_regress:%f\tbbox_angle:%f\n"
                    % (epoch, nbatch, get_accuracy(
                        tempt, self.bgfg), proj_regress_tmp,
                       bbox_predict_tmp[0], bbox_predict_tmp[1]))
                img_info = eval_data.AllImg[nbatch - 1]
                print "%s\twidth: %d height: %d num_face: %d" % \
                    (img_info.filename, img_info.width, img_info.height, img_info.num_faces)
                f.write("%s\twidth: %d height: %d num_face: %d\n" %
                        (img_info.filename, img_info.width, img_info.height,
                         img_info.num_faces))
                executor.outputs[0].wait_to_read()
                executor.outputs[1].wait_to_read()
                executor.outputs[2].wait_to_read()
                executor.outputs[3].wait_to_read()
            # Epoch-level validation summary.
            print_accuracy(softmax_proj, f, train_data.class_names, self.bgfg)
            logger.info("ALL Validation accuracy: %f",
                        get_accuracy(softmax_proj, self.bgfg))
            logger.info('Validation projection regression: %f',
                        proj_regression_loss / nbatch)
            logger.info('Validation bbox predict: %f %f',
                        bbox_predict_loss[0] / nbatch,
                        bbox_predict_loss[1] / nbatch)
            f.write("ALL Validation accuracy: %f\n" %
                    get_accuracy(softmax_proj, self.bgfg))
            f.write("Validation projection regression: %f\n" %
                    (proj_regression_loss / nbatch))
            f.write("Validation bbox predict: %f %f\n" %
                    (bbox_predict_loss[0] / nbatch,
                     bbox_predict_loss[1] / nbatch))
        # ---------- training pass ----------
        nbatch = 0
        train_data.reset()
        eval_metric.reset()
        # *_t accumulators are per-epoch totals; *_b are rolling 50-batch sums.
        proj_regress_loss_t = .0
        proj_regress_loss_b = .0
        softmax_count = np.zeros((11, 3))
        softmax_batch = np.zeros((11, 3))
        bbox_predict_loss_t = np.array([.0, .0])
        bbox_predict_loss_b = np.array([.0, .0])
        for data in train_data:
            nbatch += 1
            softmax_shape = data[cls_label_name].shape
            self.arg_params[data_name] = mx.nd.array(
                data[data_name], self.ctx)
            self.arg_params[cls_label_name] = mx.nd.array(
                data[cls_label_name].reshape(
                    (softmax_shape[0],
                     softmax_shape[1] * softmax_shape[2])), self.ctx)
            self.arg_params[proj_label_name] = mx.nd.array(
                data[proj_label_name], self.ctx)
            self.arg_params[proj_weight_name] = mx.nd.array(
                data[proj_weight_name], self.ctx)
            self.arg_params[ground_truth_name] = mx.nd.array(
                data[ground_truth_name], self.ctx)
            self.arg_params[bbox_label_name] = mx.nd.array(
                data[bbox_label_name], self.ctx)
            self.arg_params[bbox_weight_name] = mx.nd.array(
                data[bbox_weight_name], self.ctx)
            self.arg_params["mean_face"] = mx.nd.array(
                train_data.mean_face, self.ctx)
            # Re-bind per batch (input shapes may differ between images).
            self.executor = self.symbol.bind(self.ctx,
                                             self.arg_params,
                                             args_grad=self.grad_params,
                                             grad_req=grad_req,
                                             aux_states=self.aux_params)
            assert len(self.symbol.list_arguments()) == len(
                self.executor.grad_arrays)
            # Only arguments that received a gradient buffer get updated.
            update_dict = {
                name: nd
                for name, nd in zip(self.symbol.list_arguments(),
                                    self.executor.grad_arrays) if nd
            }
            # Copy all outputs to CPU so metrics work on host memory.
            output_dict = {}
            output_buff = {}
            for key, arr in zip(self.symbol.list_outputs(),
                                self.executor.outputs):
                output_dict[key] = arr
                output_buff[key] = mx.nd.empty(arr.shape, ctx=mx.cpu())
            self.executor.forward(is_train=True)
            for key in output_dict:
                output_dict[key].copyto(output_buff[key])
            self.executor.backward()
            '''
            for i in xrange(0, 49):
                if self.executor.grad_arrays[i] != None:
                    print i, arg_names[i], self.executor.grad_arrays[i].asnumpy()[0]
            '''
            # Apply SGD updates; 'upsample_proposal_weight' stays frozen.
            for key, arr in update_dict.items():
                if key != 'upsample_proposal_weight':
                    self.updater(key, arr, self.arg_params[key])
            '''
            if key == 'config_fc1_weight':
                print 'config_fc1_weight'
                print 'param:', self.arg_params[key].asnumpy()
                print 'grad:', self.executor.grad_arrays[39].asnumpy()
            if key == 'refine_proj_param_weight':
                print 'refine_proj_param_weight'
                print 'param:', self.arg_params[key].asnumpy()
                print 'grad:', self.executor.grad_arrays[47].asnumpy()
            '''
            # 255 marks "ignore" pixels; keep only labelled positions.
            pred_shape = self.executor.outputs[0].shape
            index_label = np.nonzero(data[cls_label_name].reshape(
                softmax_shape[0],
                softmax_shape[1] * softmax_shape[2]) - 255)
            label = mx.nd.array(data[cls_label_name].reshape(
                softmax_shape[0],
                softmax_shape[1] * softmax_shape[2])[:, index_label[1]])
            pred = mx.nd.array(
                (output_buff["proposal_cls_loss_output"].asnumpy().reshape(
                    pred_shape[0], pred_shape[1],
                    pred_shape[2] * pred_shape[3]))[..., index_label[1]])
            if softmax_metric:
                tempt = softmax_metric(label, pred, 11)
                softmax_count += tempt
                softmax_batch += tempt
            # for q in range(0, 50):
            #     print label.asnumpy()[0, q], ':', pred.asnumpy()[0, 0, q], pred.asnumpy()[0, 1, q]
            proj_label = data[proj_label_name]
            proj_weight = data[proj_weight_name]
            proj_pred = output_buff["proj_regression_loss_output"].asnumpy()\
                .reshape(data[proj_weight_name].shape)
            index_nonzero = np.nonzero(data[proj_weight_name])
            proj_regress_tmp = regression_metric(
                proj_label[index_nonzero],
                proj_pred[index_nonzero],
                proj_weight[index_nonzero])
            proj_regress_loss_t += proj_regress_tmp
            proj_regress_loss_b += proj_regress_tmp
            ell_label = output_buff["ell_label_output"].asnumpy()
            bbox_pred = output_buff["ellipse_predict_loss_output"].asnumpy(
            )
            bbox_predict_tmp = bbox_predict_metric(ell_label, bbox_pred)
            bbox_predict_loss_t += bbox_predict_tmp
            bbox_predict_loss_b += bbox_predict_tmp
            self.executor.outputs[0].wait_to_read()
            self.executor.outputs[1].wait_to_read()
            self.executor.outputs[2].wait_to_read()
            self.executor.outputs[3].wait_to_read()
            print "Training-epoch[%d]-batch[%d]: acc:%f\tproj_regress:%f\tbbox_regress:%f\tbbox_angle:%f" % \
                (epoch, nbatch, get_accuracy(tempt, self.bgfg), proj_regress_tmp, bbox_predict_tmp[0], bbox_predict_tmp[1])
            f.write(
                "Training-epoch[%d]-batch[%d]: acc:%f\tproj_regress:%f\tbbox_regress:%f\tbbox_angle:%f\n"
                % (epoch, nbatch, get_accuracy(
                    tempt, self.bgfg), proj_regress_tmp,
                   bbox_predict_tmp[0], bbox_predict_tmp[1]))
            img_info = train_data.AllImg[nbatch - 1]
            print "%s\twidth: %d height: %d num_face: %d" % \
                (img_info.filename, img_info.width, img_info.height, img_info.num_faces)
            f.write("%s\twidth: %d height: %d num_face: %d\n" % \
                    (img_info.filename, img_info.width, img_info.height,
                     img_info.num_faces))
            # Rolling 50-batch summary, then reset the *_b accumulators.
            if nbatch % 50 == 0:
                print_accuracy(softmax_batch, f, train_data.class_names,
                               self.bgfg)
                softmax_batch = np.zeros((11, 3))
                print "Keypoints projection regression smoothl1 loss:\t", proj_regress_loss_b / 50
                f.write(
                    "Keypoints projection regression smoothl1 loss:\t%f\n"
                    % (proj_regress_loss_b / 50))
                print "Bounding box regression:\t", bbox_predict_loss_b / 50
                f.write("Bounding box regression: %f %f\n" %
                        (bbox_predict_loss_b[0] / 50,
                         bbox_predict_loss_b[1] / 50))
                #print "Keypoints offset regression smoothl1 loss:\t", offset_regress_loss_b / 50
                #f.write("Keypoints offset regression smoothl1 loss:\t%f\n" % (offset_regress_loss_b / 50))
                #print "Keypoints visibility accuracy:\t", float(softmax_vis_batch[2]) / float(softmax_vis_batch[0])
                #f.write("Keypoints visibility accuracy:\t%f\n" %
                #        (float(softmax_vis_batch[2]) / float(softmax_vis_batch[0])))
                softmax_vis_batch = np.zeros(3)
                proj_regress_loss_b = .0
                offset_regress_loss_b = .0
                bbox_predict_loss_b = np.array([.0, .0])
            # Mid-epoch checkpoint every 1000 batches; the synthetic epoch id
            # encodes both the epoch and the batch index.
            if nbatch % 1000 == 0:
                if epoch_end_callback != None:
                    epoch_end_callback(epoch * 100000 + nbatch, self.symbol,
                                       self.arg_params, self.aux_params)
        # End-of-epoch training summary.
        name, value = eval_metric.get()
        print_accuracy(softmax_count, f, train_data.class_names, self.bgfg)
        logger.info("--->Epoch[%d] Train-cls-%s=%f", epoch, name, value)
        logger.info("--->Epoch[%d] Train-proj-reg-smoothl1=%f", epoch,
                    proj_regress_loss_t / nbatch)
        logger.info("--->Epoch[%d] Train-bbox-reg-smoothl1=%f, %f", epoch,
                    bbox_predict_loss_t[0] / nbatch,
                    bbox_predict_loss_t[1] / nbatch)
        #logger.info("--->Epoch[%d] Train-offset-reg-smoothl1=%f", epoch, offset_regress_loss_t / nbatch)
        #logger.info("--->Epoch[%d] Train-vis-acc=%f", epoch, float(softmax_vis_count[2]) / float(softmax_vis_count[0]))
        f.write("--->Epoch[%d] Train-cls-%s=%f\n" % (epoch, name, value))
        f.write("--->Epoch[%d] Train-proj-reg-smoothl1=%f\n" %
                (epoch, proj_regress_loss_t / nbatch))
        f.write("--->Epoch[%d] Train-bbox-reg-smoothl1=%f, %f" %
                (epoch, bbox_predict_loss_t[0] / nbatch,
                 bbox_predict_loss_t[1] / nbatch))
        #f.write("--->Epoch[%d] Train-offset-reg-smoothl1=%f\n" % (epoch, offset_regress_loss_t / nbatch))
        #f.write("--->Epoch[%d] Train-vis-acc=%f" % (epoch, float(softmax_vis_count[2]) / float(softmax_vis_count[0])))
    f.close()
def fit(self,
        train_data,
        eval_data=None,
        eval_metric='acc',
        period=('train', 'val'),
        to_eval_train=True,
        grad_req='write',
        epoch_end_callback=None,
        batch_end_callback=None,
        kvstore='local',
        logger=None):
    """Train the FCN symbol with a manually bound executor.

    Parameters
    ----------
    train_data : DataIter
        Training iterator; its provide_data/provide_label fix the bound shapes.
    eval_data : DataIter, optional
        Validation iterator, used when 'val' is in `period`.
    eval_metric : str
        'acc' (mx.metric) or 'meanIOU' (custom MeanIoU).
    period : sequence of str
        Phases to run each epoch ('train' and/or 'val').
        FIX: default was a mutable list; a tuple avoids the shared mutable
        default pitfall while remaining backward compatible for `in` tests.
    to_eval_train : bool
        Whether to evaluate the metric on training batches.
    grad_req : str
        Gradient request passed to bind(); 'null' disables gradients.
    epoch_end_callback : callable, optional
        Invoked as (epoch, symbol, arg_params, aux_params) after each epoch.
    batch_end_callback : callable, optional
        Invoked with a BatchEndParam after each batch.
        FIX: now skipped when None (the original crashed if omitted).
    kvstore : str
        Accepted for interface compatibility; not used by this trainer.
    logger : logging-like, optional
        Defaults to the logging module.
    """
    if logger is None:
        logger = logging
    logging.info('Start training with %s', str(self.ctx))

    # region 1. Prepare parameters, including the input and label arrays.
    arg_names = self.symbol.list_arguments()
    arg_shapes, out_shapes, aux_shapes = self.symbol.infer_shape(
        data=train_data.provide_data[0][1])
    data_name = train_data.provide_data[0][0]
    label_name = train_data.provide_label[0][0]
    # batch_size x channel x h x w input; batch_size x h*w label.
    self.arg_params[data_name] = mx.nd.empty(train_data.provide_data[0][1],
                                             self.ctx)
    self.arg_params[label_name] = mx.nd.empty(train_data.provide_label[0][1],
                                              self.ctx)
    aux_names = self.symbol.list_auxiliary_states()
    # NOTE(review): aux states are allocated on the default context, not
    # self.ctx -- confirm this is intentional.
    self.aux_params = {
        k: mx.nd.zeros(s)
        for k, s in zip(aux_names, aux_shapes)
    }
    # endregion

    # region 2. Gradient buffers (data/label inputs get no gradient).
    if grad_req != 'null':
        self.grad_params = {}
        for name, shape in zip(arg_names, arg_shapes):
            if not (name.endswith('data') or name.endswith('label')):
                self.grad_params[name] = mx.nd.zeros(shape, self.ctx)
    else:
        self.grad_params = None
    # endregion

    # region 3. Bind the executor and mirror its outputs into CPU buffers.
    self.executor = self.symbol.bind(self.ctx,
                                     self.arg_params,
                                     args_grad=self.grad_params,
                                     grad_req=grad_req,
                                     aux_states=self.aux_params)
    assert len(self.symbol.list_arguments()) == len(
        self.executor.grad_arrays)
    output_dict = {}
    output_buff = {}
    for key, arr in zip(self.symbol.list_outputs(), self.executor.outputs):
        output_dict[key] = arr
        output_buff[key] = mx.nd.empty(arr.shape, ctx=mx.cpu())
    # endregion

    # region 4. Optimizer/updater and the set of parameters to update.
    self.optimizer = opt.create(self.optimizer,
                                rescale_grad=1.0 / train_data.batch_size,
                                **self.kwargs)
    self.updater = get_updater(self.optimizer)
    update_dict = {
        name: nd
        for name, nd in zip(self.symbol.list_arguments(),
                            self.executor.grad_arrays) if nd is not None
    }
    # endregion

    # region 5. Evaluation metric.
    if eval_metric == 'acc':
        eval_metric = metric.create(eval_metric)
    elif eval_metric == 'meanIOU':
        eval_metric = MeanIoU(c=1, )
    # endregion

    for epoch in range(self.begin_epoch, self.num_epoch):
        # ---------- training phase ----------
        if 'train' in period:
            logger.info(" in train process...")
            all_start = time.time()
            nbatch = 0
            train_data.reset()
            eval_metric.reset()
            for data in train_data:
                nbatch += 1
                # Copy the batch into the bound arrays in place (shapes fixed).
                self.arg_params[data_name][:] = data.data[0]
                self.arg_params[label_name][:] = data.label[0]
                self.executor.forward(is_train=True)
                self.executor.backward()
                for key, arr in update_dict.items():
                    # 'bigscore_weight' (the upsampling kernel) stays frozen.
                    if key != "bigscore_weight":
                        self.updater(key, arr, self.arg_params[key])
                if to_eval_train:
                    # Pull outputs to CPU and score this batch.
                    for key in output_dict:
                        output_dict[key].copyto(output_buff[key])
                    pred_shape = output_buff['softmax_output'].shape
                    pred = output_buff['softmax_output'].reshape(
                        (pred_shape[0], pred_shape[1],
                         pred_shape[2] * pred_shape[3]))
                    label = data.label[0]
                    eval_metric.update([label], [pred])
                # FIX: guard the callback -- it defaults to None and the
                # original called it unconditionally.
                if batch_end_callback is not None:
                    batch_end_params = BatchEndParam(
                        epoch=epoch,
                        nbatch=nbatch,
                        eval_metric=eval_metric if to_eval_train else None,
                    )
                    batch_end_callback(batch_end_params)
            if epoch_end_callback is not None:
                epoch_end_callback(epoch, self.symbol, self.arg_params,
                                   self.aux_params)
            if to_eval_train:
                name, value = eval_metric.get()
                logger.info(" --->Epoch[%d] Train-%s=%f", epoch, name, value)
            logger.info('train time per epoch: %f s' %
                        (time.time() - all_start))

        # ---------- validation phase ----------
        if 'val' in period and eval_data:
            logger.info(" in eval process...")
            nbatch = 0
            eval_data.reset()
            eval_metric.reset()
            for data in eval_data:
                nbatch += 1
                self.arg_params[data_name][:] = data.data[0]
                self.arg_params[label_name][:] = data.label[0]
                self.executor.forward(is_train=False)
                pred_shape = self.executor.outputs[0].shape
                cpu_output_array = mx.nd.empty(pred_shape)
                self.executor.outputs[0].copyto(cpu_output_array)
                label = data.label[0]
                pred = cpu_output_array.reshape(
                    (pred_shape[0], pred_shape[1],
                     pred_shape[2] * pred_shape[3]))
                eval_metric.update([label], [pred])
                if batch_end_callback is not None:
                    batch_end_params = BatchEndParam(
                        epoch=epoch,
                        nbatch=nbatch,
                        eval_metric=None,
                    )
                    batch_end_callback(batch_end_params)
            name, value = eval_metric.get()
            logger.info('Epoch[%d] Validation-%s=%f', epoch, name, value)
def fit(self, X, y=None, eval_data=None, eval_metric='acc',
        epoch_end_callback=None, batch_end_callback=None, kvstore='local',
        logger=None, work_load_list=None, monitor=None,
        eval_batch_end_callback=None):
    """Fit the model.

    Parameters
    ----------
    X : DataIter, or numpy.ndarray/NDArray
        Training data. If X is an DataIter, the name or, if not available,
        position, of its outputs should match the corresponding variable
        names defined in the symbolic graph.
    y : numpy.ndarray/NDArray, optional
        Training set label. If X is numpy.ndarray/NDArray, y is required to
        be set. While y can be 1D or 2D (with 2nd dimension as 1), its 1st
        dimension must be the same as X, i.e. the number of data points and
        labels should be equal.
    eval_data : DataIter or numpy.ndarray/list/NDArray pair
        If eval_data is numpy.ndarray/list/NDArray pair, it should be
        (valid_data, valid_label).
    eval_metric : metric.EvalMetric or str or callable
        The evaluation metric, name of evaluation metric. Or a customize
        evaluation function that returns the statistics based on minibatch.
    epoch_end_callback : callable(epoch, symbol, arg_params, aux_states)
        A callback that is invoked at end of each epoch. This can be used
        to checkpoint model each epoch.
    batch_end_callback: callable(epoch)
        A callback that is invoked at end of each batch for print purpose.
    kvstore: KVStore or str, optional
        The KVStore or a string kvstore type: 'local', 'dist_sync',
        'dist_async'. In default uses 'local', often no need to change for
        single machiine.
    logger : logging logger, optional
        When not specified, default logger will be used.
    work_load_list : float or int, optional
        The list of work load for different devices, in the same order as
        ctx.

    Raises
    ------
    TypeError
        If ``self.optimizer`` is neither a name string nor an
        ``opt.Optimizer`` instance.

    Note
    ----
    KVStore behavior
    - 'local', multi-devices on a single machine, will automatically
      choose best type.
    - 'dist_sync', multi-machines with BSP
    - 'dist_async', multi-machines with partical asynchronous
    """
    data = self._init_iter(X, y, is_train=True)
    eval_data = self._init_eval_iter(eval_data)

    if self.sym_gen:
        # Bucketing: regenerate the symbol for the default bucket.
        self.symbol = self.sym_gen(data.default_bucket_key)  # pylint: disable=no-member
        self._check_arguments()
    self.kwargs["sym"] = self.symbol

    arg_names, param_names, aux_names = \
        self._init_params(dict(data.provide_data + data.provide_label))

    # Map each flattened (param, device) slot index back to the parameter
    # name so the optimizer can apply per-parameter settings.
    param_idx2name = {}
    for i, n in enumerate(param_names):
        for k in range(len(self.ctx)):
            param_idx2name[i * len(self.ctx) + k] = n
    self.kwargs["param_idx2name"] = param_idx2name

    # setup metric
    if not isinstance(eval_metric, metric.EvalMetric):
        eval_metric = metric.create(eval_metric)

    # create kvstore
    (kvstore, update_on_kvstore) = _create_kvstore(
        kvstore, len(self.ctx), self.arg_params)

    # init optimizer; gradients are rescaled by the global batch size
    # (workers included under dist_sync).
    if isinstance(self.optimizer, str):
        batch_size = data.batch_size
        if kvstore and kvstore.type == 'dist_sync':
            batch_size *= kvstore.num_workers
        optimizer = opt.create(self.optimizer,
                               rescale_grad=(1.0 / batch_size),
                               **(self.kwargs))
    elif isinstance(self.optimizer, opt.Optimizer):
        optimizer = self.optimizer
    else:
        # FIX: the original silently fell through here, leaving `optimizer`
        # unbound and raising a confusing NameError below.
        raise TypeError("optimizer must be a str or an opt.Optimizer, "
                        "got %s" % type(self.optimizer))

    # do training
    _train_multi_device(self.symbol, self.ctx, arg_names, param_names,
                        aux_names, self.arg_params, self.aux_params,
                        begin_epoch=self.begin_epoch,
                        end_epoch=self.num_epoch,
                        epoch_size=self.epoch_size,
                        optimizer=optimizer,
                        train_data=data, eval_data=eval_data,
                        eval_metric=eval_metric,
                        epoch_end_callback=epoch_end_callback,
                        batch_end_callback=batch_end_callback,
                        kvstore=kvstore,
                        update_on_kvstore=update_on_kvstore,
                        logger=logger, work_load_list=work_load_list,
                        monitor=monitor,
                        eval_batch_end_callback=eval_batch_end_callback,
                        sym_gen=self.sym_gen)
def fit(self, X, marks, e_marks=None, y=None, eval_data=None,
        eval_metric='acc', epoch_end_callback=None, batch_end_callback=None,
        time_step_callback=None, kvstore='local', logger=None,
        work_load_list=None, monitor=None, eval_batch_end_callback=None):
    """Overwrite of the base ``fit`` that trains through ``_train_rnn``.

    Same contract as the parent ``fit``, plus:

    Parameters
    ----------
    marks : sequence
        Per-time-step marks forwarded to ``_train_rnn``.
    e_marks : sequence, optional
        Evaluation-time marks forwarded to ``_train_rnn``.
    time_step_callback : callable, optional
        Invoked by ``_train_rnn`` at each time step.

    Raises
    ------
    TypeError
        If ``self.optimizer`` is neither a name string nor an
        ``opt.Optimizer`` instance.
    """
    data = self._init_iter(X, y, is_train=True)
    eval_data = self._init_eval_iter(eval_data)

    if self.sym_gen:
        # Bucketing: regenerate the symbol for the default bucket.
        self.symbol = self.sym_gen(data.default_bucket_key)  # pylint: disable=no-member
        self._check_arguments()
    self.kwargs["sym"] = self.symbol

    param_dict = dict(data.provide_data + data.provide_label)
    arg_names, param_names, aux_names = self._init_params(param_dict)

    # setup metric
    if not isinstance(eval_metric, metric.EvalMetric):
        eval_metric = metric.create(eval_metric)

    # create kvstore
    (kvstore, update_on_kvstore) = _create_kvstore(kvstore, len(self.ctx),
                                                   self.arg_params)

    # Map each parameter slot index back to its name: one entry per param
    # when the kvstore applies updates, one per (param, device) otherwise.
    param_idx2name = {}
    if update_on_kvstore:
        param_idx2name.update(enumerate(param_names))
    else:
        for i, n in enumerate(param_names):
            for k in range(len(self.ctx)):
                param_idx2name[i * len(self.ctx) + k] = n
    self.kwargs["param_idx2name"] = param_idx2name

    # init optimizer; gradients are rescaled by the global batch size
    # (workers included under dist_sync).
    if isinstance(self.optimizer, str):
        batch_size = data.batch_size
        if kvstore and kvstore.type == 'dist_sync':
            batch_size *= kvstore.num_workers
        optimizer = opt.create(self.optimizer,
                               rescale_grad=(1.0 / batch_size),
                               **(self.kwargs))
    elif isinstance(self.optimizer, opt.Optimizer):
        optimizer = self.optimizer
    else:
        # FIX: the original silently fell through here, leaving `optimizer`
        # unbound and raising a confusing NameError below.
        raise TypeError("optimizer must be a str or an opt.Optimizer, "
                        "got %s" % type(self.optimizer))

    # do training
    _train_rnn(self.symbol, self.ctx, marks, arg_names, param_names,
               aux_names, self.arg_params, self.aux_params,
               begin_epoch=self.begin_epoch, end_epoch=self.num_epoch,
               epoch_size=self.epoch_size,
               optimizer=optimizer,
               train_data=data, eval_data=eval_data,
               eval_metric=eval_metric,
               epoch_end_callback=epoch_end_callback,
               batch_end_callback=batch_end_callback,
               time_step_callback=time_step_callback,
               kvstore=kvstore, update_on_kvstore=update_on_kvstore,
               logger=logger, work_load_list=work_load_list, monitor=monitor,
               eval_batch_end_callback=eval_batch_end_callback,
               sym_gen=self.sym_gen, e_marks=e_marks)
def train_net(args): ctx = [] cvd = os.environ['CUDA_VISIBLE_DEVICES'].strip() if len(cvd) > 0: for i in range(len(cvd.split(','))): ctx.append(mx.gpu(i)) if len(ctx) == 0: ctx = [mx.cpu()] print('use cpu') else: print('gpu num:', len(ctx)) curTime = time.strftime("%Y%m%d%H%M%S", time.localtime()) prefix = os.path.join( args.models_root, '%s-%s-%s-%s' % (curTime, args.network, args.loss, args.dataset), 'model') prefix_dir = os.path.dirname(prefix) print('prefix', prefix) if not os.path.exists(prefix_dir): os.makedirs(prefix_dir) args.ctx_num = len(ctx) args.batch_size = args.per_batch_size * args.ctx_num args.image_channel = config.image_shape[2] config.batch_size = args.batch_size config.per_batch_size = args.per_batch_size config.no_wd = args.no_wd config.last_gamma = args.last_gamma if (args.freeze_block == 1): config.bn_mom = 1.0 print('bbbbbbbbbbbbbbbbbn', config.bn_mom) data_dir = config.dataset_path path_imgrec = None path_imglist = None image_size = config.image_shape[0:2] assert len(image_size) == 2 #assert image_size[0]==image_size[1] print('image_size', image_size) print('num_classes', config.num_classes) path_imgrec = os.path.join(data_dir, "train.rec") print('Called with argument:', args, config) data_shape = (args.image_channel, image_size[0], image_size[1]) mean = None begin_epoch = 0 if len(args.pretrained) == 0: arg_params = None aux_params = None sym = get_symbol(args) else: print('loading', args.pretrained, args.pretrained_epoch) _, arg_params, aux_params = mx.model.load_checkpoint( args.pretrained, args.pretrained_epoch) #for item in arg_params: # print(item) #print(arg_params) #exit() sym = get_symbol(args) if args.model_visual: mx.viz.plot_network(sym, title='model', save_format='pdf', shape={ 'data': (64, 3, 224, 224), 'label': (64, ) }).view() exit(0) if config.count_flops: all_layers = sym.get_internals() pre_fix = '' if (config.emb_size == 2048): pre_fix = '2048_' _sym = all_layers[pre_fix + 'fc1_output'] FLOPs = 
flops_counter.count_flops(_sym, data=(1, 3, image_size[0], image_size[1])) _str = flops_counter.flops_str(FLOPs) print('Network FLOPs: %s' % _str) #label_name = 'softmax_label' #label_shape = (args.batch_size,) emb_symbol = sym.get_internals()[pre_fix + 'fc1_output'] fixed_param_names = [] if (args.freeze_block == 1): fixed_param_names = emb_symbol.list_arguments() elif (args.freeze_block == 2): emb_symbol = sym.get_internals()[pre_fix + 'bn1_output'] fixed_param_names = emb_symbol.list_arguments() print(fixed_param_names) #fixed_aux = emb_symbol.list_auxiliary_states() #fixed_param_names.extend(fixed_aux) #print('ffffffffffffffixed params : ', fixed_param_names) model = mx.mod.Module(context=ctx, symbol=sym, fixed_param_names=fixed_param_names) val_dataiter = None if config.loss_name.find('fusion') >= 0: from pair_fusion_class_image_iter import FaceImageIter triplet_params = [ config.triplet_bag_size, config.triplet_alpha, config.triplet_max_ap ] train_dataiter = FaceImageIter( batch_size=args.batch_size, data_shape=data_shape, path_imgrec=path_imgrec, shuffle=True, rand_mirror=config.data_rand_mirror, mean=mean, cutoff=config.data_cutoff, ctx_num=args.ctx_num, images_per_identity=config.images_per_identity, triplet_params=triplet_params, mx_model=model, fairface_mode=config.fairface_mode, ) _metric = LossValueMetric() eval_metrics = [mx.metric.create(_metric)] elif config.loss_name.find('triplet') >= 0: #from fair_face_triplet_iter import FaceImageIter from triplet_image_iter import FaceImageIter if (config.loss_name == 'triplet'): dis_type = 'e' elif (config.loss_name == 'atriplet'): dis_type = 'c' triplet_params = [ config.triplet_bag_size, config.triplet_alpha, config.triplet_max_ap ] train_dataiter = FaceImageIter( batch_size=args.batch_size, data_shape=data_shape, path_imgrec=path_imgrec, shuffle=True, rand_mirror=config.data_rand_mirror, mean=mean, cutoff=config.data_cutoff, ctx_num=args.ctx_num, images_per_identity=config.images_per_identity, 
triplet_params=triplet_params,
mx_model=model,
fairface_mode=config.fairface_mode,
dis_type=dis_type,
    )
    # Triplet-style losses: track only the loss value.
    _metric = LossValueMetric()
    eval_metrics = [mx.metric.create(_metric)]
elif config.loss_name.find('softmax') >= 0:
    # Softmax-family losses: gluon Dataset/DataLoader wrapped back into a
    # classic DataIter so it can feed the symbolic Module below.
    from image_iter_gluon import FaceImageDataset
    train_dataset = FaceImageDataset(
        batch_size=args.batch_size,
        data_shape=data_shape,
        path_imgrec=path_imgrec,
        shuffle=True,
        rand_mirror=config.data_rand_mirror,
        mean=mean,
        cutoff=config.data_cutoff,
        color_jittering=config.data_color,
        images_filter=config.data_images_filter,
        selected_attributes=args.selected_attributes,
        label_name=['softmax_label'])
    train_data = mx.gluon.data.DataLoader(train_dataset,
                                          args.batch_size,
                                          shuffle=True,
                                          last_batch="rollover",
                                          num_workers=args.num_workers)
    train_dataiter = mx.contrib.io.DataLoaderIter(train_data)
    metric1 = AccMetric()
    eval_metrics = [mx.metric.create(metric1)]
    if config.ce_loss:
        metric2 = LossValueMetric()
        eval_metrics.append(mx.metric.create(metric2))
else:
    # Fallback: classic record-file iterator.
    from image_iter import FaceImageIter
    train_dataiter = FaceImageIter(
        batch_size=args.batch_size,
        data_shape=data_shape,
        path_imgrec=path_imgrec,
        shuffle=True,
        rand_mirror=config.data_rand_mirror,
        mean=mean,
        cutoff=config.data_cutoff,
        color_jittering=config.data_color,
        images_filter=config.data_images_filter,
    )
    metric1 = AccMetric()
    eval_metrics = [mx.metric.create(metric1)]
    if config.loss_name == 'final_softmax':
        # NOTE(review): this REPLACES the accuracy metric built just above
        # rather than appending to it — confirm that is intended.
        _metric = LossValueMetric()
        eval_metrics = [mx.metric.create(_metric)]
    if config.ce_loss:
        metric2 = LossValueMetric()
        eval_metrics.append(mx.metric.create(metric2))

initializer = mx.init.Xavier(rnd_type='gaussian', factor_type="out",
                             magnitude=2)  #resnet style
#initializer = mx.init.Xavier(rnd_type='uniform', factor_type="in", magnitude=2)
# Gradients are averaged over the device count; under fp16 the loss-scale
# factor is also divided out and gradient clipping is enabled.
_rescale = 1.0 / args.ctx_num
clip_gradient = None
if config.fp_16:
    _rescale /= config.scale16
    clip_gradient = config.gradThres
#opt = optimizer.SGD(learning_rate=args.lr, momentum=args.mom, wd=args.wd, rescale_grad=_rescale)#, multi_precision=config.fp_16)
opt = optimizer.create(args.opt,
                       learning_rate=args.lr,
                       momentum=config.mom,
                       wd=config.wd,
                       rescale_grad=_rescale,
                       multi_precision=config.fp_16,
                       clip_gradient=clip_gradient)
_cb = mx.callback.Speedometer(args.batch_size, args.frequent)

# cos learning rate scheduler: total_batches is only defined when --cos_lr
# is set; _batch_callback relies on that flag before reading it.
if args.cos_lr:
    num_batches = config.num_training_samples // args.batch_size
    total_batches = default.end_epoch * num_batches

# Load every verification .bin set named in config.val_targets that exists
# on disk; names and data are kept in parallel lists.
ver_list = []
ver_name_list = []
for name in config.val_targets:
    path = os.path.join(data_dir, name + ".bin")
    if os.path.exists(path):
        data_set = verification.load_bin(path, image_size)
        ver_list.append(data_set)
        ver_name_list.append(name)
        print('ver', name)

def ver_test(nbatch):
    # Run verification on every loaded set and return the list of
    # flip-augmented accuracies (acc2), one entry per set.
    results = []
    label_shape = None
    if (config.net_output == 'ECCV'):
        label_shape = (args.batch_size, 2)
    for i in range(len(ver_list)):
        acc1, std1, acc2, std2, xnorm, embeddings_list = verification.test(
            ver_list[i], model, args.batch_size, 10, None, label_shape)
        print('[%s][%d]XNorm: %f' % (ver_name_list[i], nbatch, xnorm))
        #print('[%s][%d]Accuracy: %1.5f+-%1.5f' % (ver_name_list[i], nbatch, acc1, std1))
        print('[%s][%d]Accuracy-Flip: %1.5f+-%1.5f' %
              (ver_name_list[i], nbatch, acc2, std2))
        results.append(acc2)
    return results

highest_acc = [0.0, 0.0]  #lfw and target
# highest_acc.append(0.0)
# One-element lists so the nested callback can mutate them (closure state).
global_step = [0]
save_step = [0]
highestStep = [0]
lr_steps = [int(x) for x in args.lr_steps.split(',')]
print('lr_steps', lr_steps)

def _batch_callback(param):
    # Per-batch callback: LR scheduling (warmup / cosine / step decay),
    # periodic verification, and checkpointing of the best model so far.
    #global global_step
    global_step[0] += 1
    mbatch = global_step[0]
    # Warmup phase pins the LR to a tiny constant until warmupSteps pass.
    if config.useWarmup and (mbatch < config.warmupSteps):
        #opt.lr = args.lr * mbatch / config.warmupSteps
        opt.lr = 1.0e-8
        #print("warmup lr: ", opt.lr)
    if (not config.useWarmup) or (config.useWarmup and
                                  (mbatch >= config.warmupSteps)):
        # Schedule steps are counted from the end of warmup.
        targetSteps = mbatch
        if config.useWarmup:
            if mbatch == config.warmupSteps:
                opt.lr = args.lr
            targetSteps -= config.warmupSteps
        if args.cos_lr:
            # Cosine annealing from args.lr down to 0 over total_batches.
            opt.lr = 0.5 * args.lr * (
                1 + np.cos(np.pi * (targetSteps / total_batches)))
            if (targetSteps % 500) == 0:
                print('cos lr change to', opt.lr)
        else:
            # Step decay: x0.1 exactly when a milestone step is reached.
            for step in lr_steps:
                if targetSteps == step:
                    opt.lr *= 0.1
                    print('lr change to', opt.lr)
                    break
    _cb(param)
    if mbatch % 1000 == 0:
        print('lr-batch-epoch:', opt.lr, param.nbatch, param.epoch)
    if mbatch >= 0 and mbatch % args.verbose == 0:
        acc_list = ver_test(mbatch)
        save_step[0] += 1
        msave = save_step[0]
        do_save = False
        is_highest = False
        if len(acc_list) > 0:
            # "Highest" tracks the last target's accuracy first, then the
            # summed score across all targets as a tie-breaker.
            score = sum(acc_list)
            if acc_list[-1] >= highest_acc[-1]:
                if acc_list[-1] > highest_acc[-1]:
                    is_highest = True
                else:
                    if score >= highest_acc[0]:
                        is_highest = True
                        highest_acc[0] = score
                highest_acc[-1] = acc_list[-1]
                highestStep[0] = save_step[0]
        if is_highest:
            do_save = True
        # args.ckpt policy: 0 = never save, 2 = always save,
        # 3 = always overwrite checkpoint slot 1.
        if args.ckpt == 0:
            do_save = False
        elif args.ckpt == 2:
            do_save = True
        elif args.ckpt == 3:
            msave = 1
        if do_save:
            print('saving', msave)
            arg, aux = model.get_params()
            if config.ckpt_embedding:
                # Save only the embedding sub-graph, dropping the fc7
                # classification head and its weights.
                all_layers = model.symbol.get_internals()
                _sym = all_layers['fc1_output']
                _arg = {}
                for k in arg:
                    if not k.startswith('fc7'):
                        _arg[k] = arg[k]
                mx.model.save_checkpoint(prefix, msave, _sym, _arg, aux)
            else:
                mx.model.save_checkpoint(prefix, msave, model.symbol, arg,
                                         aux)
        print('[%d]Accuracy-Highest: %1.5f, mbatch: %d' %
              (mbatch, highest_acc[-1], highestStep[0]))
    if config.max_steps > 0 and mbatch > config.max_steps:
        sys.exit(0)

epoch_cb = None
if config.loss_name.find('triplet') < 0:
    train_dataiter = mx.io.PrefetchingIter(
        train_dataiter)  #triplet loss unavailable

######
if (config.net_output == 'ECCV'):
    class_metric = AccMetric(acc_name='class_acc', label_index=1,
                             pred_index=4)
    eval_metrics.append(mx.metric.create(class_metric))
# NOTE(review): the stray 'eval_metrics,' below forms a tuple with
# model.fit's return value that is immediately discarded — it looks like
# leftover editing debris; confirm and remove if so.
eval_metrics, model.fit(
    train_dataiter,
    begin_epoch=begin_epoch,
    num_epoch=999999,
    eval_data=val_dataiter,
    eval_metric=eval_metrics,
    kvstore=args.kvstore,
    optimizer=opt,
    #optimizer_params = optimizer_params,
    initializer=initializer,
    arg_params=arg_params,
    aux_params=aux_params,
    allow_missing=True,
    batch_end_callback=_batch_callback,
    epoch_end_callback=epoch_cb)
def init_optimizer(self, kvstore='local', optimizer='sgd',
                   optimizer_params=(('learning_rate', 0.01),),
                   force_init=False):
    """Install and initialize optimizers.

    Parameters
    ----------
    kvstore : str or KVStore
        Default `'local'`.
    optimizer : str or Optimizer
        Default `'sgd'`. Either a registered optimizer name or an
        already-constructed Optimizer instance.
    optimizer_params : dict
        Default `(('learning_rate', 0.01),)`. A tuple of pairs rather than a
        dict only to sidestep the mutable-default-argument pitfall.
    force_init : bool
        Default `False`. When `True`, re-initialize even if an optimizer is
        already installed.
    """
    assert self.binded and self.params_initialized

    # Nothing to do when an optimizer is already installed, unless forced.
    if self.optimizer_initialized and not force_init:
        self.logger.warning('optimizer already initialized, ignoring...')
        return

    (kvstore, update_on_kvstore) = _create_kvstore(
        kvstore, len(self._context), self._arg_params)

    # Under synchronous distributed training the effective batch spans all
    # workers, so the gradient rescale factor accounts for them too.
    batch_size = self._exec_group.batch_size
    if kvstore and 'dist' in kvstore.type and '_sync' in kvstore.type:
        batch_size *= kvstore.num_workers
    rescale_grad = 1.0 / batch_size

    if isinstance(optimizer, str):
        # Build the slot-index -> parameter-name map the optimizer needs.
        names = self._exec_group.param_names
        idx2name = {}
        if update_on_kvstore:
            idx2name = dict(enumerate(names))
        else:
            # Without kvstore updates each context gets its own slot range.
            n_ctx = len(self._context)
            for k in range(n_ctx):
                for i, n in enumerate(names):
                    idx2name[i * n_ctx + k] = n
        optimizer_params = dict(optimizer_params)
        optimizer_params.setdefault('rescale_grad', rescale_grad)
        optimizer = opt.create(optimizer, sym=self.symbol,
                               param_idx2name=idx2name, **optimizer_params)
    else:
        assert isinstance(optimizer, opt.Optimizer)
        # A hand-built optimizer may carry a rescale_grad that does not
        # match this module's batch size — warn, but respect it.
        if optimizer.rescale_grad != rescale_grad:
            #pylint: disable=no-member
            warnings.warn(
                "Optimizer created manually outside Module but rescale_grad " +
                "is not normalized to 1.0/batch_size/num_workers (%s vs. %s). " % (
                    optimizer.rescale_grad, rescale_grad) +
                "Is this intended?", stacklevel=2)

    self._optimizer = optimizer
    self._kvstore = kvstore
    self._update_on_kvstore = update_on_kvstore
    self._updater = None

    if kvstore:
        # Push the initialized local parameters into the kvstore.
        _initialize_kvstore(kvstore=kvstore,
                            param_arrays=self._exec_group.param_arrays,
                            arg_params=self._arg_params,
                            param_names=self._param_names,
                            update_on_kvstore=update_on_kvstore)
    if update_on_kvstore:
        kvstore.set_optimizer(self._optimizer)
    else:
        # Local updates: gradients are applied per-device via this updater.
        self._updater = opt.get_updater(optimizer)

    self.optimizer_initialized = True

    # Apply any optimizer states that were preloaded before initialization.
    if self._preload_opt_states is not None:
        self.load_optimizer_states(self._preload_opt_states)
        self._preload_opt_states = None
# Gradient buffers for every learnable argument; pure inputs ('...data')
# and labels ('...label') get no gradient storage.
grad_params = {}
for name, shape in zip(arg_names, arg_shapes):
    if not (name.endswith('data') or name.endswith('label')):
        grad_params[name] = mx.nd.zeros(shape, ctx)
# prepare aux_params (auxiliary states, e.g. batch-norm moving statistics)
aux_names = network.list_auxiliary_states()
aux_params = {
    k: mx.nd.zeros(s, ctx)
    for k, s in zip(aux_names, aux_shapes)
}
# prepare optimizer: Adam with gradients rescaled by 1/batch_size
optimizer = opt.create('adam',
                       rescale_grad=(1.0 / dataiter.get_batch_size()),
                       **({
                           'learning_rate': 0.01
                       }))
updater = get_updater(optimizer)
# create eval_metric
eval_metric = metric.create('rmse')
data_name = dataiter.data_name
label_name = dataiter.label_name
# NOTE(review): the zero-initialized aux_params dict built above is
# immediately overwritten by network_auxs here, so it is dead — confirm
# whether the pre-allocation was meant to be used.
arg_params = network_args
aux_params = network_auxs
batch_callback = mx.callback.Speedometer(1, 10)
epoch_callback = mx.callback.do_checkpoint(save_model_prefix)