def network_backprop_setup(self, grad_req, arg_names, arg_shapes, eval_metric):
    """Prepare backprop state: gradient buffers, optimizer, updater, metric.

    Allocates a zero gradient NDArray for every trainable argument (inputs,
    labels and fixed weights are skipped), instantiates the optimizer named
    in ``self.optimizer`` and returns the created evaluation metric.
    """
    # Argument-name suffixes that never receive gradients (network inputs,
    # labels and precomputed weights).
    no_grad_suffixes = ('data', 'mean_face', 'cls_label', 'proj_weight',
                        'proj_label', 'ground_truth', 'ellipse_label',
                        'bbox_weight')
    if grad_req != 'null':
        self.grad_params = {}
        for arg_name, arg_shape in zip(arg_names, arg_shapes):
            if not arg_name.endswith(no_grad_suffixes):
                self.grad_params[arg_name] = mx.nd.zeros(arg_shape, self.ctx)
    # setting the required optimizer: replace the stored optimizer *name*
    # with a concrete optimizer instance, then derive the updater from it.
    self.optimizer = opt.create(self.optimizer, rescale_grad=1.0, **(self.kwargs))
    self.updater = get_updater(self.optimizer)
    return metric.create(eval_metric)
def init_optimizer(self, kvstore='local', optimizer='sgd',
                   optimizer_params=(('learning_rate', 0.01),), force_init=False):
    """Install and initialize optimizers.

    Parameters
    ----------
    kvstore : str or KVStore
        Default `'local'`.
    optimizer : str or Optimizer
        Default `'sgd'`
    optimizer_params : dict
        Default `(('learning_rate', 0.01),)`. The default value is not a
        dictionary, just to avoid pylint warning of dangerous default values.
    force_init : bool
        Default `False`, indicating whether we should force re-initializing
        the optimizer in the case an optimizer is already installed.
    """
    assert self.binded and self.params_initialized

    # Nothing to do when an optimizer is already installed, unless forced.
    if self.optimizer_initialized and not force_init:
        self.logger.warning('optimizer already initialized, ignoring...')
        return

    (kvstore, update_on_kvstore) = \
        _create_kvstore(kvstore, len(self._context), self._arg_params)

    batch_size = self._exec_group.batch_size
    # In synchronous distributed training the effective batch spans all
    # workers, so gradients must be rescaled accordingly.
    if kvstore and 'dist' in kvstore.type and '_sync' in kvstore.type:
        batch_size *= kvstore.num_workers
    rescale_grad = 1.0/batch_size

    if isinstance(optimizer, str):
        # Map executor parameter indices to names so the optimizer can apply
        # per-parameter settings (lr_mult / wd_mult) by name.
        idx2name = {}
        if update_on_kvstore:
            idx2name.update(enumerate(self._exec_group.param_names))
        else:
            # Without kvstore updates each device holds its own copy, so the
            # index space is replicated once per context.
            for k in range(len(self._context)):
                idx2name.update({i*len(self._context)+k: n
                                 for i, n in enumerate(self._exec_group.param_names)})
        optimizer_params = dict(optimizer_params)
        if 'rescale_grad' not in optimizer_params:
            optimizer_params['rescale_grad'] = rescale_grad
        optimizer = opt.create(optimizer,
                               sym=self.symbol, param_idx2name=idx2name,
                               **optimizer_params)
    else:
        assert isinstance(optimizer, opt.Optimizer)
        # A user-constructed optimizer may carry a rescale_grad that does not
        # match this batch/worker configuration -- warn rather than override.
        if optimizer.rescale_grad != rescale_grad:
            #pylint: disable=no-member
            warnings.warn(
                "Optimizer created manually outside Module but rescale_grad " +
                "is not normalized to 1.0/batch_size/num_workers (%s vs. %s). "%(
                    optimizer.rescale_grad, rescale_grad) +
                "Is this intended?", stacklevel=2)

    self._optimizer = optimizer
    self._kvstore = kvstore
    self._update_on_kvstore = update_on_kvstore
    self._updater = None

    if kvstore:
        # copy initialized local parameters to kvstore
        _initialize_kvstore(kvstore=kvstore,
                            param_arrays=self._exec_group.param_arrays,
                            arg_params=self._arg_params,
                            param_names=self._param_names,
                            update_on_kvstore=update_on_kvstore)
    if update_on_kvstore:
        kvstore.set_optimizer(self._optimizer)
    else:
        # Local updates: derive an updater closure from the optimizer.
        self._updater = opt.get_updater(optimizer)

    self.optimizer_initialized = True

    # Apply optimizer states that were loaded before the optimizer existed.
    if self._preload_opt_states is not None:
        self.load_optimizer_states(self._preload_opt_states)
        self._preload_opt_states = None
def init_optimizer(self, kvstore='local', optimizer='sgd',
                   optimizer_params=(('learning_rate', 0.01),), force_init=False):
    """Install and initialize optimizers.

    NOTE(review): this is a verbatim duplicate of the `init_optimizer`
    defined earlier in this file; consider removing one copy.

    Parameters
    ----------
    kvstore : str or KVStore
        Default `'local'`.
    optimizer : str or Optimizer
        Default `'sgd'`
    optimizer_params : dict
        Default `(('learning_rate', 0.01),)`. The default value is not a
        dictionary, just to avoid pylint warning of dangerous default values.
    force_init : bool
        Default `False`, indicating whether we should force re-initializing
        the optimizer in the case an optimizer is already installed.
    """
    assert self.binded and self.params_initialized

    # Nothing to do when an optimizer is already installed, unless forced.
    if self.optimizer_initialized and not force_init:
        self.logger.warning('optimizer already initialized, ignoring...')
        return

    (kvstore, update_on_kvstore) = \
        _create_kvstore(kvstore, len(self._context), self._arg_params)

    batch_size = self._exec_group.batch_size
    # In synchronous distributed training the effective batch spans all
    # workers, so gradients must be rescaled accordingly.
    if kvstore and 'dist' in kvstore.type and '_sync' in kvstore.type:
        batch_size *= kvstore.num_workers
    rescale_grad = 1.0/batch_size

    if isinstance(optimizer, str):
        # Map executor parameter indices to names so the optimizer can apply
        # per-parameter settings (lr_mult / wd_mult) by name.
        idx2name = {}
        if update_on_kvstore:
            idx2name.update(enumerate(self._exec_group.param_names))
        else:
            # Without kvstore updates each device holds its own copy, so the
            # index space is replicated once per context.
            for k in range(len(self._context)):
                idx2name.update({i*len(self._context)+k: n
                                 for i, n in enumerate(self._exec_group.param_names)})
        optimizer_params = dict(optimizer_params)
        if 'rescale_grad' not in optimizer_params:
            optimizer_params['rescale_grad'] = rescale_grad
        optimizer = opt.create(optimizer,
                               sym=self.symbol, param_idx2name=idx2name,
                               **optimizer_params)
    else:
        assert isinstance(optimizer, opt.Optimizer)
        # A user-constructed optimizer may carry a rescale_grad that does not
        # match this batch/worker configuration -- warn rather than override.
        if optimizer.rescale_grad != rescale_grad:
            #pylint: disable=no-member
            warnings.warn(
                "Optimizer created manually outside Module but rescale_grad " +
                "is not normalized to 1.0/batch_size/num_workers (%s vs. %s). "%(
                    optimizer.rescale_grad, rescale_grad) +
                "Is this intended?", stacklevel=2)

    self._optimizer = optimizer
    self._kvstore = kvstore
    self._update_on_kvstore = update_on_kvstore
    self._updater = None

    if kvstore:
        # copy initialized local parameters to kvstore
        _initialize_kvstore(kvstore=kvstore,
                            param_arrays=self._exec_group.param_arrays,
                            arg_params=self._arg_params,
                            param_names=self._param_names,
                            update_on_kvstore=update_on_kvstore)
    if update_on_kvstore:
        kvstore.set_optimizer(self._optimizer)
    else:
        # Local updates: derive an updater closure from the optimizer.
        self._updater = opt.get_updater(optimizer)

    self.optimizer_initialized = True

    # Apply optimizer states that were loaded before the optimizer existed.
    if self._preload_opt_states is not None:
        self.load_optimizer_states(self._preload_opt_states)
        self._preload_opt_states = None
def fit(self, train_data, eval_data=None, eval_metric='acc', grad_req='write',
        epoch_end_callback=None, batch_end_callback=None, kvstore='local',
        logger=None):
    """Train ``self.symbol`` on `train_data`, optionally validating on
    `eval_data`.

    Binds a fresh executor per batch, runs forward/backward, applies the
    updater to every gradient, and reports `eval_metric` per epoch.

    Parameters
    ----------
    train_data : iterable yielding dicts keyed by data/label name
    eval_data : optional validation iterable of the same form
    eval_metric : str or EvalMetric name passed to ``metric.create``
    grad_req : str, 'write'/'add'/'null' gradient request for bind
    epoch_end_callback, batch_end_callback : optional callables
    kvstore : unused here, kept for interface compatibility
    logger : logging-like object; defaults to the ``logging`` module
    """
    if logger is None:
        logger = logging
    logging.info('Start training with %s', str(self.ctx))
    arg_shapes, out_shapes, aux_shapes = self.symbol.infer_shape(
        data=train_data.provide_data[0][1])
    arg_names = self.symbol.list_arguments()
    # Gradient buffers for every learnable argument; inputs/labels excluded.
    if grad_req != 'null':
        self.grad_params = {}
        for name, shape in zip(arg_names, arg_shapes):
            if not (name.endswith('data') or name.endswith('label')):
                self.grad_params[name] = mx.nd.zeros(shape, self.ctx)
    else:
        self.grad_params = None
    aux_names = self.symbol.list_auxiliary_states()
    self.aux_params = {k: nd.zeros(s) for k, s in zip(aux_names, aux_shapes)}
    data_name = train_data.data_name
    label_name = train_data.label_name
    # Replace the stored optimizer name with an instance; gradients are
    # averaged over the batch via rescale_grad.
    self.optimizer = opt.create(self.optimizer,
                                rescale_grad=(1.0/train_data.get_batch_size()),
                                **(self.kwargs))
    self.updater = get_updater(self.optimizer)
    eval_metric = metric.create(eval_metric)
    # begin training
    for epoch in range(self.begin_epoch, self.num_epoch):
        nbatch = 0
        train_data.reset()
        eval_metric.reset()
        for data in train_data:
            nbatch += 1
            label_shape = data[label_name].shape
            self.arg_params[data_name] = mx.nd.array(data[data_name], self.ctx)
            # Flatten the spatial label map to (batch, H*W) for softmax.
            self.arg_params[label_name] = mx.nd.array(
                data[label_name].reshape(label_shape[0],
                                         label_shape[1]*label_shape[2]), self.ctx)
            self.exector = self.symbol.bind(self.ctx, self.arg_params,
                                            args_grad=self.grad_params,
                                            grad_req=grad_req,
                                            aux_states=self.aux_params)
            assert len(self.symbol.list_arguments()) == len(self.exector.grad_arrays)
            # BUGFIX: was `if nd` -- truth-testing a multi-element NDArray is
            # ambiguous and raises in MXNet; compare against None explicitly
            # (consistent with the other fit() in this file).
            update_dict = {name: nd for name, nd
                           in zip(self.symbol.list_arguments(),
                                  self.exector.grad_arrays) if nd is not None}
            output_dict = {}
            output_buff = {}
            # CPU-side buffers so the metric can read outputs off-device.
            for key, arr in zip(self.symbol.list_outputs(), self.exector.outputs):
                output_dict[key] = arr
                output_buff[key] = mx.nd.empty(arr.shape, ctx=mx.cpu())
            self.exector.forward(is_train=True)
            for key in output_dict:
                output_dict[key].copyto(output_buff[key])
            self.exector.backward()
            for key, arr in update_dict.items():
                if key != "bigscore_weight":
                    self.updater(key, arr, self.arg_params[key])
            pred_shape = self.exector.outputs[0].shape
            label = mx.nd.array(data[label_name].reshape(
                label_shape[0], label_shape[1]*label_shape[2]))
            pred = mx.nd.array(output_buff["softmax_output"].asnumpy().reshape(
                pred_shape[0], pred_shape[1], pred_shape[2]*pred_shape[3]))
            eval_metric.update([label], [pred])
            self.exector.outputs[0].wait_to_read()
            # BUGFIX: guard the callback -- it defaults to None and calling
            # None would raise TypeError.
            if batch_end_callback is not None:
                batch_end_params = BatchEndParam(epoch=epoch, nbatch=nbatch,
                                                 eval_metric=eval_metric)
                batch_end_callback(batch_end_params)
        if epoch_end_callback is not None:
            epoch_end_callback(epoch, self.symbol, self.arg_params, self.aux_params)
        name, value = eval_metric.get()
        logger.info(" --->Epoch[%d] Train-%s=%f", epoch, name, value)
        # evaluation
        if eval_data:
            logger.info(" in eval process...")
            nbatch = 0
            eval_data.reset()
            eval_metric.reset()
            for data in eval_data:
                nbatch += 1
                label_shape = data[label_name].shape
                self.arg_params[data_name] = mx.nd.array(data[data_name], self.ctx)
                self.arg_params[label_name] = mx.nd.array(
                    data[label_name].reshape(label_shape[0],
                                             label_shape[1]*label_shape[2]), self.ctx)
                exector = self.symbol.bind(self.ctx, self.arg_params,
                                           args_grad=self.grad_params,
                                           grad_req=grad_req,
                                           aux_states=self.aux_params)
                cpu_output_array = mx.nd.zeros(exector.outputs[0].shape)
                exector.forward(is_train=False)
                exector.outputs[0].copyto(cpu_output_array)
                pred_shape = cpu_output_array.shape
                label = mx.nd.array(data[label_name].reshape(
                    label_shape[0], label_shape[1]*label_shape[2]))
                pred = mx.nd.array(cpu_output_array.asnumpy().reshape(
                    pred_shape[0], pred_shape[1], pred_shape[2]*pred_shape[3]))
                eval_metric.update([label], [pred])
                exector.outputs[0].wait_to_read()
            name, value = eval_metric.get()
            logger.info('batch[%d] Validation-%s=%f', nbatch, name, value)
def fit(self, train_data, eval_data=None, eval_metric='acc', grad_req='write',
        epoch_end_callback=None, batch_end_callback=None, kvstore='local',
        logger=None):
    """Train a joint detection(+segmentation) symbol with a hand-rolled loop.

    Per batch: binds a fresh executor, runs forward/backward, applies the
    updater to every gradient, and feeds MultiBoxMetric. When `eval_data` is
    given, a validation pass additionally computes ``self.valid_metric`` and
    a disparity-based distance metric. The segmentation branch is currently
    disabled (commented out throughout).

    NOTE(review): `batch_end_callback` is invoked unconditionally although it
    defaults to None -- callers appear to always supply one; confirm.
    """
    global outimgiter  # running index used when dumping visualization images
    if logger is None:
        logger = logging
    logging.info('Start training with %s', str(self.ctx))
    logging.info(str(self.kwargs))
    # Leading dimension of the provided data shape is the batch size.
    batch_size = train_data.provide_data[0][1][0]
    # label_det is declared as up to 200 boxes of 6 values per image.
    arg_shapes, out_shapes, aux_shapes = self.symbol.infer_shape( \
        data=tuple(train_data.provide_data[0][1]), label_det=(batch_size,200,6))
    arg_names = self.symbol.list_arguments()
    out_names = self.symbol.list_outputs()
    aux_names = self.symbol.list_auxiliary_states()
    # pprint([(n,s) for n,s in zip(arg_names,arg_shapes)])
    # pprint([(n,s) for n,s in zip(out_names,out_shapes)])
    # pprint([(n,s) for n,s in zip(aux_names,aux_shapes)])
    # Gradient buffers for every learnable argument; inputs/labels excluded.
    if grad_req != 'null':
        self.grad_params = {}
        for name, shape in zip(arg_names, arg_shapes):
            if not (name.endswith('data') or name.endswith('label')):
                self.grad_params[name] = mx.nd.zeros(shape, self.ctx)
    else:
        self.grad_params = None
    self.aux_params = {k : mx.nd.zeros(s, self.ctx) for k, s in zip(aux_names, aux_shapes)}
    data_name = train_data.provide_data[0][0]
    label_name_det = train_data.provide_label[0][0]
    label_name_seg = train_data.provide_label[1][0]
    input_names = [data_name, label_name_det, label_name_seg]
    print(train_data.provide_label)
    print(os.environ["MXNET_CUDNN_AUTOTUNE_DEFAULT"])
    # Replace the stored optimizer name with an instance; gradients averaged
    # over the batch via rescale_grad.
    self.optimizer = opt.create(self.optimizer, rescale_grad=(1.0/train_data.batch_size), **(self.kwargs))
    self.updater = get_updater(self.optimizer)
    eval_metric = CustomAccuracyMetric() # metric.create(eval_metric)
    multibox_metric = MultiBoxMetric()
    eval_metrics = metric.CompositeEvalMetric()
    eval_metrics.add(multibox_metric)
    # eval_metrics.add(eval_metric)
    # begin training
    for epoch in range(self.begin_epoch, self.num_epoch):
        nbatch = 0
        train_data.reset()
        eval_metrics.reset()
        logger.info('learning rate: '+str(self.optimizer.learning_rate))
        for data,_ in train_data:
            # evaluation_only skips training and falls through to validation.
            if self.evaluation_only:
                break
            nbatch += 1
            label_shape_det = data.label[0].shape
            label_shape_seg = data.label[1].shape
            self.arg_params[data_name] = mx.nd.array(data.data[0], self.ctx)
            self.arg_params[label_name_det] = mx.nd.array(data.label[0], self.ctx)
            self.arg_params[label_name_seg] = mx.nd.array(data.label[1], self.ctx)
            output_names = self.symbol.list_outputs()
            ###################### analyze shapes ####################
            # pprint([(k,v.shape) for k,v in self.arg_params.items()])
            # NOTE(review): re-binding an executor for every batch is costly;
            # presumably kept simple intentionally -- confirm.
            self.executor = self.symbol.bind(self.ctx, self.arg_params,
                args_grad=self.grad_params,
                grad_req=grad_req,
                aux_states=self.aux_params)
            assert len(self.symbol.list_arguments()) == len(self.executor.grad_arrays)
            update_dict = {name: nd for name, nd in zip(self.symbol.list_arguments(), \
                self.executor.grad_arrays) if nd is not None}
            output_dict = {}
            output_buff = {}
            # CPU-side buffers so metrics read outputs without GPU round-trips.
            for key, arr in zip(self.symbol.list_outputs(), self.executor.outputs):
                output_dict[key] = arr
                output_buff[key] = mx.nd.empty(arr.shape, ctx=mx.cpu())
                # output_buff[key] = mx.nd.empty(arr.shape, ctx=self.ctx)
            def stat_helper(name, array):
                """wrapper for executor callback"""
                import ctypes
                from mxnet.ndarray import NDArray
                from mxnet.base import NDArrayHandle, py_str
                array = ctypes.cast(array, NDArrayHandle)
                if 0:
                    array = NDArray(array, writable=False).asnumpy()
                    print (name, array.shape, np.mean(array), np.std(array),
                        ('%.1fms' % (float(time.time()-stat_helper.start_time)*1000)))
                else:
                    array = NDArray(array, writable=False)
                    array.wait_to_read()
                    elapsed = float(time.time()-stat_helper.start_time)*1000.
                    if elapsed>5:
                        print (name, array.shape, ('%.1fms' % (elapsed,)))
                stat_helper.start_time=time.time()
            stat_helper.start_time=float(time.time())
            # self.executor.set_monitor_callback(stat_helper)
            tic = time.time()
            self.executor.forward(is_train=True)
            for key in output_dict:
                output_dict[key].copyto(output_buff[key])
            # exit(0) # for debugging forward pass only
            self.executor.backward()
            # Apply the update to every parameter that has a gradient.
            for key, arr in update_dict.items():
                if key != "bigscore_weight":
                    self.updater(key, arr, self.arg_params[key])
            for output in self.executor.outputs:
                output.wait_to_read()
            if TIMING:
                print("%.0fms" % ((time.time()-tic)*1000.,))
            output_dict = dict(zip(output_names, self.executor.outputs))
            pred_det_shape = output_dict["det_out_output"].shape
            # pred_seg_shape = output_dict["seg_out_output"].shape
            label_det = mx.nd.array(data.label[0].reshape((label_shape_det[0],
                label_shape_det[1]*label_shape_det[2])))
            # label_seg = mx.nd.array(data.label[1].reshape((label_shape_seg[0],
            #     label_shape_seg[1]*label_shape_seg[2])))
            pred_det = mx.nd.array(output_buff["det_out_output"].reshape((pred_det_shape[0],
                pred_det_shape[1], pred_det_shape[2])))
            # pred_seg = mx.nd.array(output_buff["seg_out_output"].reshape((pred_seg_shape[0],
            #     pred_seg_shape[1], pred_seg_shape[2]*pred_seg_shape[3])))
            if DEBUG:
                print(data.label[0].asnumpy()[0,:2,:])
            if TIMING:
                print("%.0fms" % ((time.time()-tic)*1000.,))
            # MultiBoxMetric consumes (cls_prob, loc_loss, cls_label); zero
            # arrays fill the label slots it does not use.
            eval_metrics.get_metric(0).update([mx.nd.zeros(output_buff["cls_prob_output"].shape),
                mx.nd.zeros(output_buff["loc_loss_output"].shape),label_det],
                [output_buff["cls_prob_output"], output_buff["loc_loss_output"],
                output_buff["cls_label_output"]])
            # eval_metrics.get_metric(1).update([label_seg.as_in_context(self.ctx)],
            #     [pred_seg.as_in_context(self.ctx)])
            self.executor.outputs[0].wait_to_read()
            ##################### display results ##############################
            # out_det = output_dict["det_out_output"].asnumpy()
            # for imgidx in range(out_det.shape[0]):
            #     img = np.squeeze(data.data[0].asnumpy()[imgidx,:,:,:])
            #     det = out_det[imgidx,:,:]
            #     gt = label_det.asnumpy()[imgidx,:].reshape((-1,6))
            #     display_results(img, det, gt, self.class_names)
            #     [exit(0) if (cv2.waitKey(1)&0xff)==27 else None]
            #     outimgiter += 1
            batch_end_params = BatchEndParam(epoch=epoch, nbatch=nbatch, eval_metric=eval_metrics)
            batch_end_callback(batch_end_params)
            if TIMING:
                print("%.0fms" % ((time.time()-tic)*1000.,))
            # exit(0) # for debugging only
        ##### save snapshot
        if (not self.evaluation_only) and (epoch_end_callback is not None):
            epoch_end_callback(epoch, self.symbol, self.arg_params, self.aux_params)
        names, values = eval_metrics.get()
        for name, value in zip(names,values):
            logger.info(" --->Epoch[%d] Train-%s=%f", epoch, name, value)
        # evaluation
        if eval_data:
            logger.info(" in eval process...")
            nbatch = 0
            depth_metric = DistanceAccuracyMetric(class_names=self.class_names)
            eval_data.reset()
            eval_metrics.reset()
            self.valid_metric.reset()
            depth_metric.reset()
            timing_results = []
            for data, fnames in eval_data:
                nbatch += 1
                label_shape_det = data.label[0].shape
                # label_shape_seg = data.label[1].shape
                self.arg_params[data_name] = mx.nd.array(data.data[0], self.ctx)
                self.arg_params[label_name_det] = mx.nd.array(data.label[0], self.ctx)
                # self.arg_params[label_name_seg] = mx.nd.array(data.label[1], self.ctx)
                self.executor = self.symbol.bind(self.ctx, self.arg_params,
                    args_grad=self.grad_params,
                    grad_req=grad_req,
                    aux_states=self.aux_params)
                output_names = self.symbol.list_outputs()
                output_dict = dict(zip(output_names, self.executor.outputs))
                # cpu_output_array = mx.nd.zeros(output_dict["seg_out_output"].shape)
                ############## monitor status
                # def stat_helper(name, array):
                #     """wrapper for executor callback"""
                #     import ctypes
                #     from mxnet.ndarray import NDArray
                #     from mxnet.base import NDArrayHandle, py_str
                #     array = ctypes.cast(array, NDArrayHandle)
                #     if 1:
                #         array = NDArray(array, writable=False).asnumpy()
                #         print (name, array.shape, np.mean(array), np.std(array),
                #             ('%.1fms' % (float(time.time()-stat_helper.start_time)*1000)))
                #     else:
                #         array = NDArray(array, writable=False)
                #         array.wait_to_read()
                #         elapsed = float(time.time()-stat_helper.start_time)*1000.
                #         if elapsed>5:
                #             print (name, array.shape, ('%.1fms' % (elapsed,)))
                #     stat_helper.start_time=time.time()
                # stat_helper.start_time=float(time.time())
                # self.executor.set_monitor_callback(stat_helper)
                ############## forward
                tic = time.time()
                # NOTE(review): validation runs forward with is_train=True --
                # confirm train-mode behavior (e.g. batch-norm) is intended.
                self.executor.forward(is_train=True)
                # output_dict["seg_out_output"].wait_to_read()
                timing_results.append((time.time()-tic)*1000.)
                # output_dict["seg_out_output"].copyto(cpu_output_array)
                # pred_shape = output_dict["seg_out_output"].shape
                # label = mx.nd.array(data.label[1].reshape((label_shape_seg[0], label_shape_seg[1]*label_shape_seg[2])))
                # output_dict["seg_out_output"].wait_to_read()
                # seg_out_output = output_dict["seg_out_output"].asnumpy()
                pred_det_shape = output_dict["det_out_output"].shape
                # pred_seg_shape = output_dict["seg_out_output"].shape
                label_det = mx.nd.array(data.label[0].reshape((label_shape_det[0],
                    label_shape_det[1]*label_shape_det[2])))
                # label_seg = mx.nd.array(data.label[1].reshape((label_shape_seg[0], label_shape_seg[1]*label_shape_seg[2])),ctx=self.ctx)
                pred_det = mx.nd.array(output_dict["det_out_output"].reshape((pred_det_shape[0],
                    pred_det_shape[1], pred_det_shape[2])))
                # pred_seg = mx.nd.array(output_dict["seg_out_output"].reshape((pred_seg_shape[0], pred_seg_shape[1], pred_seg_shape[2]*pred_seg_shape[3])),ctx=self.ctx)
                #### remove invalid boxes
                out_dets = output_dict["det_out_output"].asnumpy()
                assert len(out_dets.shape)==3
                # Rebuild detections as a fixed-size array padded with -1.
                pred_det = np.zeros((batch_size, 200, 7), np.float32)-1.
                for idx, out_det in enumerate(out_dets):
                    assert len(out_det.shape)==2
                    out_det = np.expand_dims(out_det, axis=0)
                    indices = np.where(out_det[:,:,0]>=0) # labeled as negative
                    out_det = np.expand_dims(out_det[indices[0],indices[1],:],axis=0)
                    indices = np.where(out_det[:,:,1]>.25) # higher confidence
                    out_det = np.expand_dims(out_det[indices[0],indices[1],:],axis=0)
                    pred_det[idx, :out_det.shape[1], :] = out_det
                    del out_det
                pred_det = mx.nd.array(pred_det)
                ##### display results
                if False: # self.evaluation_only:
                    # out_img = output_dict["seg_out_output"]
                    # out_img = mx.nd.split(out_img, axis=0, num_outputs=out_img.shape[0], squeeze_axis=0)
                    # if not isinstance(out_img,list):
                    #     out_img = [out_img]
                    for imgidx in range(eval_data.batch_size):
                        img = np.squeeze(data.data[0].asnumpy()[imgidx,:,:,:])
                        det = pred_det.asnumpy()[imgidx,:,:]
                        ### ground-truth
                        gt = label_det.asnumpy()[imgidx,:].reshape((-1,6))
                        # display result
                        display_img = display_results(img, det, gt, self.class_names)
                        res_fname = fnames[imgidx].replace("SegmentationClass","Results").replace("labelIds","results")
                        if cv2.imwrite(res_fname, display_img):
                            print(res_fname,'saved.')
                        [exit(0) if (cv2.waitKey()&0xff)==27 else None]
                        outimgiter += 1
                if self.evaluation_only:
                    continue
                eval_metrics.get_metric(0).update(None,
                    [output_dict["cls_prob_output"], output_dict["loc_loss_output"],
                    output_dict["cls_label_output"]])
                # eval_metrics.get_metric(1).update([label_seg], [pred_seg])
                self.valid_metric.update([mx.nd.slice_axis(data.label[0],axis=2,begin=0,end=5)], \
                    [mx.nd.slice_axis(pred_det,axis=2,begin=0,end=6)])
                # Load the disparity map matching each evaluated frame.
                disparities = []
                for imgidx in range(batch_size):
                    dispname = fnames[imgidx].replace("SegmentationClass","Disparity").replace("gtFine_labelTrainIds","disparity")
                    disparities.append(cv2.imread(dispname,-1))
                    # NOTE(review): this always checks disparities[0], not the
                    # image just appended -- looks like it should be [-1]; confirm.
                    assert disparities[0] is not None, dispname + " not found."
                depth_metric.update(mx.nd.array(disparities),[pred_det])
                det_metric = self.valid_metric
                det_names, det_values = det_metric.get()
                depth_names, depth_values = depth_metric.get()
                # Progress line, overwritten in place via carriage return.
                print("\r %d/%d speed=%.1fms %.1f%% %s=%.1f %s=%.1f" % \
                    (nbatch*eval_data.batch_size,eval_data.num_samples,
                    math.fsum(timing_results)/float(nbatch),
                    float(nbatch*eval_data.batch_size)*100./float(eval_data.num_samples),
                    det_names[-1],det_values[-1]*100.,
                    depth_names[-1],depth_values[-1]*100.,),end='\r')
            names, values = eval_metrics.get()
            for name, value in zip(names,values):
                logger.info(' epoch[%d] Validation-%s=%f', epoch, name, value)
            logger.info('----------------------------------------------')
            print(' & '.join(names))
            print(' & '.join(map(lambda v:'%.1f'%(v*100.,),values)))
            logger.info('----------------------------------------------')
            names, values = self.valid_metric.get()
            for name, value in zip(names,values):
                logger.info(' epoch[%d] Validation-%s=%f', epoch, name, value)
            logger.info('----------------------------------------------')
            print(' & '.join(names))
            print(' & '.join(map(lambda v:'%.1f'%(v*100.,),values)))
            logger.info('----------------------------------------------')
            names, values = depth_metric.get()
            for name, value in zip(names,values):
                logger.info(' epoch[%d] Validation-%s=%f', epoch, name, value)
            logger.info('----------------------------------------------')
            print(' & '.join(names))
            print(' & '.join(map(lambda v:'%.1f'%(v*100.,),values)))
            logger.info('----------------------------------------------')
        if self.evaluation_only:
            exit(0) ## for debugging only
def fit(self, train_data, eval_data=None, eval_metric='acc', grad_req='write',
        epoch_end_callback=None, batch_end_callback=None, kvstore='local',
        logger=None):
    """Train ``self.symbol`` on `train_data`, optionally validating on
    `eval_data`.

    Per batch: binds a fresh executor, runs forward/backward, applies the
    updater to each gradient, and updates `eval_metric` from the flattened
    softmax output against the flattened spatial label map.

    NOTE(review): `batch_end_callback` is invoked unconditionally although it
    defaults to None -- callers appear to always supply one; confirm.
    """
    if logger is None:
        logger = logging
    logging.info('Start training with %s', str(self.ctx))
    arg_shapes, out_shapes, aux_shapes = self.symbol.infer_shape(
        data=train_data.provide_data[0][1])
    arg_names = self.symbol.list_arguments()
    # Gradient buffers for every learnable argument; inputs/labels excluded.
    if grad_req != 'null':
        self.grad_params = {}
        for name, shape in zip(arg_names, arg_shapes):
            if not (name.endswith('data') or name.endswith('label')):
                self.grad_params[name] = mx.nd.zeros(shape, self.ctx)
    else:
        self.grad_params = None
    aux_names = self.symbol.list_auxiliary_states()
    self.aux_params = {
        k: nd.zeros(s)
        for k, s in zip(aux_names, aux_shapes)
    }
    data_name = train_data.data_name
    label_name = train_data.label_name
    input_names = [data_name, label_name]
    # Replace the stored optimizer name with an instance; gradients averaged
    # over the batch via rescale_grad.
    self.optimizer = opt.create(self.optimizer,
                                rescale_grad=(1.0 /
                                              train_data.get_batch_size()),
                                **(self.kwargs))
    self.updater = get_updater(self.optimizer)
    eval_metric = metric.create(eval_metric)
    # begin training
    for epoch in range(self.begin_epoch, self.num_epoch):
        nbatch = 0
        train_data.reset()
        eval_metric.reset()
        for data in train_data:
            nbatch += 1
            label_shape = data[label_name].shape
            self.arg_params[data_name] = mx.nd.array(
                data[data_name], self.ctx)
            # Flatten the spatial label map to (batch, H*W) for softmax.
            self.arg_params[label_name] = mx.nd.array(data[label_name].reshape(label_shape[0], \
                label_shape[1]*label_shape[2]), self.ctx)
            output_names = self.symbol.list_outputs()
            self.exector = self.symbol.bind(self.ctx, self.arg_params,
                                            args_grad=self.grad_params,
                                            grad_req=grad_req,
                                            aux_states=self.aux_params)
            assert len(self.symbol.list_arguments()) == len(
                self.exector.grad_arrays)
            update_dict = {name: nd for name, nd in zip(self.symbol.list_arguments(), \
                self.exector.grad_arrays) if nd is not None}
            output_dict = {}
            output_buff = {}
            # CPU-side buffers so the metric can read outputs off-device.
            for key, arr in zip(self.symbol.list_outputs(),
                                self.exector.outputs):
                output_dict[key] = arr
                output_buff[key] = mx.nd.empty(arr.shape, ctx=mx.cpu())
            self.exector.forward(is_train=True)
            for key in output_dict:
                output_dict[key].copyto(output_buff[key])
            self.exector.backward()
            # Apply the update to every parameter that has a gradient.
            for key, arr in update_dict.items():
                if key != "bigscore_weight":
                    self.updater(key, arr, self.arg_params[key])
            pred_shape = self.exector.outputs[0].shape
            label = mx.nd.array(data[label_name].reshape(
                label_shape[0], label_shape[1] * label_shape[2]))
            pred = mx.nd.array(output_buff["softmax_output"].asnumpy().reshape(pred_shape[0], \
                pred_shape[1], pred_shape[2]*pred_shape[3]))
            eval_metric.update([label], [pred])
            self.exector.outputs[0].wait_to_read()
            batch_end_params = BatchEndParam(epoch=epoch,
                                             nbatch=nbatch,
                                             eval_metric=eval_metric)
            batch_end_callback(batch_end_params)
        if epoch_end_callback is not None:
            epoch_end_callback(epoch, self.symbol, self.arg_params,
                               self.aux_params)
        name, value = eval_metric.get()
        logger.info(" --->Epoch[%d] Train-%s=%f", epoch, name, value)
        # evaluation
        if eval_data:
            logger.info(" in eval process...")
            nbatch = 0
            eval_data.reset()
            eval_metric.reset()
            for data in eval_data:
                nbatch += 1
                label_shape = data[label_name].shape
                self.arg_params[data_name] = mx.nd.array(
                    data[data_name], self.ctx)
                self.arg_params[label_name] = mx.nd.array(data[label_name].reshape(label_shape[0], \
                    label_shape[1]*label_shape[2]), self.ctx)
                exector = self.symbol.bind(self.ctx, self.arg_params,
                                           args_grad=self.grad_params,
                                           grad_req=grad_req,
                                           aux_states=self.aux_params)
                cpu_output_array = mx.nd.zeros(exector.outputs[0].shape)
                exector.forward(is_train=False)
                exector.outputs[0].copyto(cpu_output_array)
                pred_shape = cpu_output_array.shape
                label = mx.nd.array(data[label_name].reshape(label_shape[0], \
                    label_shape[1]*label_shape[2]))
                pred = mx.nd.array(cpu_output_array.asnumpy().reshape(pred_shape[0], \
                    pred_shape[1], pred_shape[2]*pred_shape[3]))
                eval_metric.update([label], [pred])
                exector.outputs[0].wait_to_read()
            name, value = eval_metric.get()
            logger.info('batch[%d] Validation-%s=%f', nbatch, name, value)
def _train_rnn( symbol, ctx, marks,
                arg_names, param_names, aux_names,
                arg_params, aux_params,
                begin_epoch, end_epoch, epoch_size, optimizer, kvstore,
                update_on_kvstore, train_data,
                e_marks=None,
                eval_data=None, eval_metric=None,
                epoch_end_callback=None, batch_end_callback=None,
                time_step_callback=None,
                logger=None, work_load_list=None, monitor=None,
                eval_batch_end_callback=None, sym_gen=None,
                mutable_data_shape=False, max_data_shape=None):
    """Mark should be a list of #SeriesLength, annotating if image has label by 1 , 0

    Runs the RNN training loop over `train_data`, one `_run_sax` call per
    batch of time steps; optionally evaluates on `eval_data` using `e_marks`.
    """
    # TODO marks not working if label of SAX is different in one batch
    if logger is None:
        logger = logging
    executor_manager = DataParallelExecutorManager(symbol=symbol,
                                                   sym_gen=sym_gen,
                                                   ctx=ctx,
                                                   train_data=train_data,
                                                   param_names=param_names,
                                                   arg_names=arg_names,
                                                   aux_names=aux_names,
                                                   work_load_list=work_load_list,
                                                   logger=logger,
                                                   mutable_data_shape=mutable_data_shape,
                                                   max_data_shape=max_data_shape)
    if monitor:
        executor_manager.install_monitor(monitor)
    executor_manager.set_params(arg_params, aux_params)

    #if not update_on_kvstore:
    updater = get_updater(optimizer)

    if kvstore:
        # copy initialized local parameters to kvstore
        _initialize_kvstore(kvstore=kvstore,
                            param_arrays=executor_manager.param_arrays,
                            arg_params=arg_params,
                            param_names=executor_manager.param_names,
                            update_on_kvstore=update_on_kvstore)
    if update_on_kvstore:
        kvstore.set_optimizer(optimizer)

    # Now start training
    train_data.reset()
    for epoch in range(begin_epoch, end_epoch):
        # Training phase
        tic = time.time()
        eval_metric.reset()
        nbatch = 0
        # Iterate over training data.
        # Into Epoch #########################
        # record acc
        acc_hist = []
        logger.info('Starting New Epoch...')
        while True:
            do_reset = True
            # iter on batch_size
            for data_batch_zoo in train_data:
                assert isinstance(data_batch_zoo, list), "Iter Error"
                if monitor is not None:
                    monitor.tic()
                # Start to iter on Time steps.
                # Per-batch marks may override the global marks list.
                if isinstance(marks[nbatch], list):
                    M = marks[nbatch]
                else:
                    M = marks
                executor_manager, eval_metric, acc_hist = _run_sax(
                    data_batch_zoo, M, executor_manager, eval_metric,
                    updater, ctx, kvstore, acc_hist,
                    monitor=monitor,
                    logger=logger,
                    update_on_kvstore=update_on_kvstore,
                    is_train=True,
                    callback=time_step_callback)
                nbatch += 1
                # batch callback (for print purpose)
                if batch_end_callback is not None:
                    batch_end_params = BatchEndParam(epoch=epoch,
                                                     nbatch=nbatch,
                                                     eval_metric=eval_metric,
                                                     locals=locals())
                    if isinstance(batch_end_callback, list):
                        for call in batch_end_callback:
                            call(batch_end_params)
                    else:
                        batch_end_callback(batch_end_params)
                # this epoch is done possibly earlier
                if epoch_size is not None and nbatch >= epoch_size:
                    do_reset = False
                    break
            # end on batch_size
            if do_reset is True:
                logger.debug('Epoch[%d] Resetting Data Iterator', epoch)
                train_data.reset()
                logger.debug('Epoch[%d] Resetting Eval Metric', epoch)
                eval_metric.reset()
            # this epoch is done
            if epoch_size is None or nbatch >= epoch_size:
                break

        toc = time.time()
        logger.info('Epoch[%d] Time cost=%.3f', epoch, (toc - tic))

        # Sync parameters back before checkpointing or the final epoch.
        if epoch_end_callback or epoch + 1 == end_epoch:
            executor_manager.copy_to(arg_params, aux_params)
        if epoch_end_callback is not None:
            if isinstance(epoch_end_callback, list):
                for call in epoch_end_callback:
                    call(epoch, symbol, arg_params, aux_params, acc_hist)
            else:
                epoch_end_callback(epoch, symbol, arg_params, aux_params, acc_hist)

        # evaluation
        # print 'enter evaluation'
        if eval_data:
            assert e_marks is not None, 'e marks cannot be None'
            eval_metric.reset()
            eval_data.reset()
            for b, eval_zoo in enumerate(eval_data):
                if isinstance(e_marks[b], list):
                    M = e_marks[b]
                else:
                    M = e_marks
                executor_manager, eval_metric, acc_hist = _run_sax(
                    eval_zoo, M, executor_manager, eval_metric,
                    updater, ctx, kvstore, acc_hist,
                    update_on_kvstore=update_on_kvstore,
                    is_train=False)
                # executor_manager.load_data_batch(eval_batch)
                # executor_manager.forward(is_train=False)
                # executor_manager.update_metric(eval_metric, eval_batch.label)
                if eval_batch_end_callback is not None:
                    # BUGFIX: was `nbatch=i` -- `i` is undefined in this scope
                    # (NameError); the eval batch index is `b`.
                    batch_end_params = BatchEndParam(epoch=epoch,
                                                     nbatch=b,
                                                     eval_metric=eval_metric,
                                                     locals=locals())
                    if isinstance(eval_batch_end_callback, list):
                        for call in eval_batch_end_callback:
                            call(batch_end_params)
                    else:
                        eval_batch_end_callback(batch_end_params)
            name_value = eval_metric.get_name_value()
            for name, value in name_value:
                logger.info('Epoch[%d] Validation-%s=%f', epoch, name, value)
    # end of all epochs
    return
def run(mxIter): model_prefix = '/data2/obj_detect/imagenet_models/resnet/resnet-101' load_epoch = 0 #model_prefix = './stage1_models/tiny_face-06440' #load_epoch = 42 #model_prefix = './tiny_face-06440' #load_epoch = 140 head = '%(asctime)-15s %(message)s' logging.basicConfig(level=logging.DEBUG, format=head) input_shapes = get_input_shapes(mxIter.batch_size) optimizer = 'sgd' optimizer_params = { 'learning_rate': 0.0001, 'momentum' : 0.90, 'wd' : 0.0001} optimizer = opt.create(optimizer, rescale_grad=1.0 / mxIter.batch_size, **optimizer_params) updater = get_updater(optimizer) net = get_symbol_focal_loss() arg_params, aux_params = load_params_checkpoint(model_prefix, load_epoch) arg_names = net.list_arguments() param_names = [x for x in arg_names if x not in input_shapes] initializer = mx.init.Xavier(rnd_type='gaussian', factor_type="in", magnitude=2) delete_params_by_shape(net, arg_params, aux_params, input_shapes, initializer) exec_ = net.simple_bind(ctx=mx.gpu(2), **input_shapes) copy_params(arg_params, aux_params, exec_) param_arrays = [[exec_.arg_arrays[i]] for i,name in enumerate(arg_names) if name in param_names] grad_arrays = [[exec_.grad_arrays[i]] for i,name in enumerate(arg_names) if name in param_names] #monitor = mx.monitor.Monitor(interval=1, pattern='.*backward.*') #monitor.install(exec_) batch_size = mxIter.batch_size for epoch in range(load_epoch+1, 200): num_batch = 0 metric = 0 num_inst = 0 num_reg_inst = 0 reg_metric = 0 for batch in mxIter: load_data(batch, exec_) #monitor.tic() exec_.forward(is_train=True) outputs = [output.asnumpy() for output in exec_._get_outputs()] exec_.backward() #monitor.toc_print() _update_params(param_arrays, grad_arrays, updater, 1, param_names=param_names) num_batch += 1 # metric metric += np.sum(outputs[0]) reg_metric += np.sum(outputs[1]) print 'batch -> {}'.format(num_batch) print 'focal_loss -> {}'.format(metric / num_batch) print 'l1_loss -> {}'.format(reg_metric / num_batch) if num_batch % 1000 == 0: 
save_arg_params = {} for param_name in param_names: save_arg_params[param_name] = exec_.arg_dict[param_name] save_aux_params = exec_.aux_dict save_checkpoint('./tiny_face', num_batch, epoch, net, save_arg_params, save_aux_params) mxIter.reset() save_arg_params = {} for param_name in param_names: save_arg_params[param_name] = exec_.arg_dict[param_name] save_aux_params = exec_.aux_dict save_checkpoint('./tiny_face', num_batch, epoch, net, save_arg_params, save_aux_params)
def fit(self, train_data, eval_data=None, eval_metric='acc',
        grad_req='write', logger=None, softmax_metric=None,
        regression_metric=None, epoch_end_callback=None):
    """Train the RPN-style face model with manual bind/forward/backward.

    Each epoch optionally runs validation first (when ``eval_data`` is
    given), then trains over ``train_data``.  For every batch the symbol is
    re-bound against the freshly loaded ``self.arg_params`` (inputs change
    shape per image), so no fixed-shape executor is reused.

    Parameters
    ----------
    train_data / eval_data : iterators yielding dicts keyed by the names
        exposed on the iterator (``data_name``, ``cls_label_name``, ...).
    eval_metric : str or EvalMetric
        Resolved via ``metric.create``.
    grad_req : str
        'null' disables gradient buffers entirely.
    softmax_metric, regression_metric : callables
        Project metrics; ``softmax_metric(label, pred, 11)`` returns an
        (11, 3) count table, ``regression_metric`` a scalar loss.
    epoch_end_callback : callable, optional
        Invoked every 1000 training batches with a synthetic "epoch" index
        ``epoch * 100000 + nbatch`` (used for intra-epoch checkpoints).

    Side effects: appends a detailed text log to ``log_rpn.txt``.
    """
    f = open("log_rpn.txt", 'w')
    if logger is None:
        logger = logging
    logging.info('Start training with %s', str(self.ctx))
    f.write('Start training with %s\n' % str(self.ctx))

    # Shapes are inferred from nominal input sizes; only used to size the
    # gradient and aux-state buffers below.
    arg_shapes, out_shapes, aux_shapes = self.symbol.infer_shape(
        data=(1, 3, 128, 128), mean_face=(10, 3), ground_truth=(10, 2),
        bbox_label=(10, 5))
    arg_names = self.symbol.list_arguments()
    if grad_req != 'null':
        # Allocate gradient buffers for every trainable argument; input and
        # label blobs (matched by name suffix) get no gradient.
        self.grad_params = {}
        for name, shape in zip(arg_names, arg_shapes):
            if not (name.endswith('data') or name.endswith("mean_face")
                    or name.endswith('cls_label')
                    or name.endswith('proj_weight')
                    or name.endswith('proj_label')
                    or name.endswith('ground_truth')
                    or name.endswith('bbox_label')
                    or name.endswith("bbox_weight")):
                self.grad_params[name] = mx.nd.zeros(shape, self.ctx)
    else:
        self.grad_params = None
    aux_names = self.symbol.list_auxiliary_states()
    self.aux_params = {
        k: mx.nd.zeros(s, self.ctx)
        for k, s in zip(aux_names, aux_shapes)
    }

    # Blob names are supplied by the data iterator.
    data_name = train_data.data_name
    cls_label_name = train_data.cls_label_name
    proj_label_name = train_data.proj_label_name
    proj_weight_name = train_data.proj_weight_name
    ground_truth_name = train_data.ground_truth_name
    bbox_label_name = train_data.bbox_label_name
    bbox_weight_name = train_data.bbox_weight_name

    # NOTE(review): rescale_grad is fixed at 1.0 here (not 1/batch_size as
    # in the other trainers in this file) -- presumably intentional for
    # per-image training; confirm.
    self.optimizer = opt.create(self.optimizer, rescale_grad=1.0,
                                **(self.kwargs))
    self.updater = get_updater(self.optimizer)
    eval_metric = metric.create(eval_metric)

    for epoch in range(self.begin_epoch, self.num_epoch):
        # ---------------- validation pass (runs before training) --------
        if eval_data:
            logger.info(" in eval process...")
            f.write(" in eval process...")
            nbatch = 0
            # (11, 3) per-class count table accumulated over the whole pass.
            softmax_proj = np.zeros((11, 3))
            proj_regression_loss = .0
            bbox_predict_loss = np.array([.0, .0])
            eval_data.reset()
            for data in eval_data:
                nbatch += 1
                print "Eval batch:", nbatch
                softmax_shape = data[cls_label_name].shape
                # Load this batch's blobs into arg_params, then re-bind.
                self.arg_params[data_name] = mx.nd.array(
                    data[data_name], self.ctx)
                self.arg_params[cls_label_name] = mx.nd.array(
                    data[cls_label_name].reshape(
                        (softmax_shape[0],
                         softmax_shape[1] * softmax_shape[2])), self.ctx)
                self.arg_params[proj_label_name] = mx.nd.array(
                    data[proj_label_name], self.ctx)
                self.arg_params[proj_weight_name] = mx.nd.array(
                    data[proj_weight_name], self.ctx)
                self.arg_params[ground_truth_name] = mx.nd.array(
                    data[ground_truth_name], self.ctx)
                self.arg_params[bbox_label_name] = mx.nd.array(
                    data[bbox_label_name], self.ctx)
                self.arg_params[bbox_weight_name] = mx.nd.array(
                    data[bbox_weight_name], self.ctx)
                self.arg_params["mean_face"] = mx.nd.array(
                    train_data.mean_face, self.ctx)
                executor = self.symbol.bind(self.ctx, self.arg_params,
                                            args_grad=self.grad_params,
                                            grad_req=grad_req,
                                            aux_states=self.aux_params)
                # Host-side copies of the five network outputs.
                softmax_output_array = mx.nd.zeros(
                    executor.outputs[0].shape)
                proj_regression_output_array = mx.nd.zeros(
                    executor.outputs[1].shape)
                bbox_predict_output_array = mx.nd.zeros(
                    executor.outputs[2].shape)
                ell_label = mx.nd.zeros(executor.outputs[3].shape)
                bbox_predict = mx.nd.zeros(executor.outputs[4].shape)
                # NOTE(review): is_train=True during validation -- keeps
                # loss outputs available, but BN/dropout run in train mode.
                executor.forward(is_train=True)
                executor.outputs[0].copyto(softmax_output_array)
                executor.outputs[1].copyto(proj_regression_output_array)
                executor.outputs[2].copyto(bbox_predict_output_array)
                executor.outputs[3].copyto(ell_label)
                executor.outputs[4].copyto(bbox_predict)
                softmax_shape = softmax_output_array.shape
                # Positions whose label is 255 are "ignore"; keep the rest.
                index_label = np.nonzero(data[cls_label_name].reshape(
                    softmax_shape[0],
                    softmax_shape[2] * softmax_shape[3]) - 255)
                label = mx.nd.array(data[cls_label_name].reshape(
                    softmax_shape[0],
                    softmax_shape[2] * softmax_shape[3])[:, index_label[1]])
                pred = mx.nd.array((softmax_output_array.asnumpy().reshape(
                    softmax_shape[0], softmax_shape[1],
                    softmax_shape[2] * softmax_shape[3]))[..., index_label[1]])
                if softmax_metric:
                    tempt = softmax_metric(label, pred, 11)
                    softmax_proj += tempt
                proj_label = data[proj_label_name]
                proj_weight = data[proj_weight_name]
                proj_pred = proj_regression_output_array.asnumpy().reshape(
                    data[proj_weight_name].shape)
                # Only positions with non-zero weight contribute.
                index_nonzero = np.nonzero(data[proj_weight_name])
                proj_regress_tmp = regression_metric(
                    proj_label[index_nonzero], proj_pred[index_nonzero],
                    proj_weight[index_nonzero])
                proj_regression_loss += proj_regress_tmp
                bbox_pred = bbox_predict_output_array.asnumpy()
                bbox_predict_tmp = bbox_predict_metric(
                    ell_label.asnumpy(), bbox_pred)
                bbox_predict_loss += bbox_predict_tmp
                print "Validation-epoch[%d]-batch[%d]: acc:%f\tproj_regress:%f\tbbox_regress:%f\tbbox_angle:%f" % \
                    (epoch, nbatch, get_accuracy(tempt, self.bgfg),
                     proj_regress_tmp, bbox_predict_tmp[0],
                     bbox_predict_tmp[1])
                f.write(
                    "Validation-epoch[%d]-batch[%d]: acc:%f\tproj_regress:%f\tbbox_regress:%f\tbbox_angle:%f\n"
                    % (epoch, nbatch, get_accuracy(
                        tempt, self.bgfg), proj_regress_tmp,
                       bbox_predict_tmp[0], bbox_predict_tmp[1]))
                img_info = eval_data.AllImg[nbatch - 1]
                print "%s\twidth: %d height: %d num_face: %d" % \
                    (img_info.filename, img_info.width, img_info.height,
                     img_info.num_faces)
                f.write("%s\twidth: %d height: %d num_face: %d\n" %
                        (img_info.filename, img_info.width, img_info.height,
                         img_info.num_faces))
                # Synchronize before the next batch re-binds the executor.
                executor.outputs[0].wait_to_read()
                executor.outputs[1].wait_to_read()
                executor.outputs[2].wait_to_read()
                executor.outputs[3].wait_to_read()
            print_accuracy(softmax_proj, f, train_data.class_names,
                           self.bgfg)
            logger.info("ALL Validation accuracy: %f",
                        get_accuracy(softmax_proj, self.bgfg))
            logger.info('Validation projection regression: %f',
                        proj_regression_loss / nbatch)
            logger.info('Validation bbox predict: %f %f',
                        bbox_predict_loss[0] / nbatch,
                        bbox_predict_loss[1] / nbatch)
            f.write("ALL Validation accuracy: %f\n" %
                    get_accuracy(softmax_proj, self.bgfg))
            f.write("Validation projection regression: %f\n" %
                    (proj_regression_loss / nbatch))
            f.write("Validation bbox predict: %f %f\n" %
                    (bbox_predict_loss[0] / nbatch,
                     bbox_predict_loss[1] / nbatch))

        # ---------------- training pass ---------------------------------
        nbatch = 0
        train_data.reset()
        eval_metric.reset()
        # *_t accumulators run over the epoch, *_b over a 50-batch window.
        proj_regress_loss_t = .0
        proj_regress_loss_b = .0
        softmax_count = np.zeros((11, 3))
        softmax_batch = np.zeros((11, 3))
        bbox_predict_loss_t = np.array([.0, .0])
        bbox_predict_loss_b = np.array([.0, .0])
        for data in train_data:
            nbatch += 1
            softmax_shape = data[cls_label_name].shape
            self.arg_params[data_name] = mx.nd.array(
                data[data_name], self.ctx)
            self.arg_params[cls_label_name] = mx.nd.array(
                data[cls_label_name].reshape(
                    (softmax_shape[0],
                     softmax_shape[1] * softmax_shape[2])), self.ctx)
            self.arg_params[proj_label_name] = mx.nd.array(
                data[proj_label_name], self.ctx)
            self.arg_params[proj_weight_name] = mx.nd.array(
                data[proj_weight_name], self.ctx)
            self.arg_params[ground_truth_name] = mx.nd.array(
                data[ground_truth_name], self.ctx)
            self.arg_params[bbox_label_name] = mx.nd.array(
                data[bbox_label_name], self.ctx)
            self.arg_params[bbox_weight_name] = mx.nd.array(
                data[bbox_weight_name], self.ctx)
            self.arg_params["mean_face"] = mx.nd.array(
                train_data.mean_face, self.ctx)
            self.executor = self.symbol.bind(self.ctx, self.arg_params,
                                             args_grad=self.grad_params,
                                             grad_req=grad_req,
                                             aux_states=self.aux_params)
            assert len(self.symbol.list_arguments()) == len(
                self.executor.grad_arrays)
            # Arguments that actually have a gradient buffer.
            update_dict = {
                name: nd
                for name, nd in zip(self.symbol.list_arguments(),
                                    self.executor.grad_arrays) if nd
            }
            output_dict = {}
            output_buff = {}
            for key, arr in zip(self.symbol.list_outputs(),
                                self.executor.outputs):
                output_dict[key] = arr
                output_buff[key] = mx.nd.empty(arr.shape, ctx=mx.cpu())
            self.executor.forward(is_train=True)
            # Copy outputs to CPU before backward mutates device state.
            for key in output_dict:
                output_dict[key].copyto(output_buff[key])
            self.executor.backward()
            '''
            for i in xrange(0, 49):
                if self.executor.grad_arrays[i] != None:
                    print i, arg_names[i], self.executor.grad_arrays[i].asnumpy()[0]
            '''
            # SGD step; 'upsample_proposal_weight' is deliberately frozen.
            for key, arr in update_dict.items():
                if key != 'upsample_proposal_weight':
                    self.updater(key, arr, self.arg_params[key])
            '''
            if key == 'config_fc1_weight':
                print 'config_fc1_weight'
                print 'param:', self.arg_params[key].asnumpy()
                print 'grad:', self.executor.grad_arrays[39].asnumpy()
            if key == 'refine_proj_param_weight':
                print 'refine_proj_param_weight'
                print 'param:', self.arg_params[key].asnumpy()
                print 'grad:', self.executor.grad_arrays[47].asnumpy()
            '''
            pred_shape = self.executor.outputs[0].shape
            # Same ignore-255 masking as in the validation pass above.
            index_label = np.nonzero(data[cls_label_name].reshape(
                softmax_shape[0],
                softmax_shape[1] * softmax_shape[2]) - 255)
            label = mx.nd.array(data[cls_label_name].reshape(
                softmax_shape[0],
                softmax_shape[1] * softmax_shape[2])[:, index_label[1]])
            pred = mx.nd.array(
                (output_buff["proposal_cls_loss_output"].asnumpy().reshape(
                    pred_shape[0], pred_shape[1],
                    pred_shape[2] * pred_shape[3]))[..., index_label[1]])
            if softmax_metric:
                tempt = softmax_metric(label, pred, 11)
                softmax_count += tempt
                softmax_batch += tempt
            # for q in range(0, 50):
            #     print label.asnumpy()[0, q], ':', pred.asnumpy()[0, 0, q], pred.asnumpy()[0, 1, q]
            proj_label = data[proj_label_name]
            proj_weight = data[proj_weight_name]
            proj_pred = output_buff["proj_regression_loss_output"].asnumpy()\
                .reshape(data[proj_weight_name].shape)
            index_nonzero = np.nonzero(data[proj_weight_name])
            proj_regress_tmp = regression_metric(
                proj_label[index_nonzero], proj_pred[index_nonzero],
                proj_weight[index_nonzero])
            proj_regress_loss_t += proj_regress_tmp
            proj_regress_loss_b += proj_regress_tmp
            ell_label = output_buff["ell_label_output"].asnumpy()
            bbox_pred = output_buff["ellipse_predict_loss_output"].asnumpy(
            )
            bbox_predict_tmp = bbox_predict_metric(ell_label, bbox_pred)
            bbox_predict_loss_t += bbox_predict_tmp
            bbox_predict_loss_b += bbox_predict_tmp
            self.executor.outputs[0].wait_to_read()
            self.executor.outputs[1].wait_to_read()
            self.executor.outputs[2].wait_to_read()
            self.executor.outputs[3].wait_to_read()
            print "Training-epoch[%d]-batch[%d]: acc:%f\tproj_regress:%f\tbbox_regress:%f\tbbox_angle:%f" % \
                (epoch, nbatch, get_accuracy(tempt, self.bgfg),
                 proj_regress_tmp, bbox_predict_tmp[0], bbox_predict_tmp[1])
            f.write(
                "Training-epoch[%d]-batch[%d]: acc:%f\tproj_regress:%f\tbbox_regress:%f\tbbox_angle:%f\n"
                % (epoch, nbatch, get_accuracy(
                    tempt, self.bgfg), proj_regress_tmp,
                   bbox_predict_tmp[0], bbox_predict_tmp[1]))
            img_info = train_data.AllImg[nbatch - 1]
            print "%s\twidth: %d height: %d num_face: %d" % \
                (img_info.filename, img_info.width, img_info.height,
                 img_info.num_faces)
            f.write("%s\twidth: %d height: %d num_face: %d\n" % \
                    (img_info.filename, img_info.width, img_info.height,
                     img_info.num_faces))
            # Windowed (50-batch) reporting, then reset the window sums.
            if nbatch % 50 == 0:
                print_accuracy(softmax_batch, f, train_data.class_names,
                               self.bgfg)
                softmax_batch = np.zeros((11, 3))
                print "Keypoints projection regression smoothl1 loss:\t", proj_regress_loss_b / 50
                f.write(
                    "Keypoints projection regression smoothl1 loss:\t%f\n" %
                    (proj_regress_loss_b / 50))
                print "Bounding box regression:\t", bbox_predict_loss_b / 50
                f.write("Bounding box regression: %f %f\n" %
                        (bbox_predict_loss_b[0] / 50,
                         bbox_predict_loss_b[1] / 50))
                #print "Keypoints offset regression smoothl1 loss:\t", offset_regress_loss_b / 50
                #f.write("Keypoints offset regression smoothl1 loss:\t%f\n" % (offset_regress_loss_b / 50))
                #print "Keypoints visibility accuracy:\t", float(softmax_vis_batch[2]) / float(softmax_vis_batch[0])
                #f.write("Keypoints visibility accuracy:\t%f\n" %
                #        (float(softmax_vis_batch[2]) / float(softmax_vis_batch[0])))
                softmax_vis_batch = np.zeros(3)
                proj_regress_loss_b = .0
                offset_regress_loss_b = .0
                bbox_predict_loss_b = np.array([.0, .0])
            # Intra-epoch checkpoint via the callback every 1000 batches;
            # the synthetic epoch index keeps checkpoint names unique.
            if nbatch % 1000 == 0:
                if epoch_end_callback != None:
                    epoch_end_callback(epoch * 100000 + nbatch, self.symbol,
                                       self.arg_params, self.aux_params)
        # NOTE(review): eval_metric is reset but never updated in this loop,
        # so (name, value) reflects an empty metric -- confirm intent.
        name, value = eval_metric.get()
        print_accuracy(softmax_count, f, train_data.class_names, self.bgfg)
        logger.info("--->Epoch[%d] Train-cls-%s=%f", epoch, name, value)
        logger.info("--->Epoch[%d] Train-proj-reg-smoothl1=%f", epoch,
                    proj_regress_loss_t / nbatch)
        logger.info("--->Epoch[%d] Train-bbox-reg-smoothl1=%f, %f", epoch,
                    bbox_predict_loss_t[0] / nbatch,
                    bbox_predict_loss_t[1] / nbatch)
        #logger.info("--->Epoch[%d] Train-offset-reg-smoothl1=%f", epoch, offset_regress_loss_t / nbatch)
        #logger.info("--->Epoch[%d] Train-vis-acc=%f", epoch, float(softmax_vis_count[2]) / float(softmax_vis_count[0]))
        f.write("--->Epoch[%d] Train-cls-%s=%f\n" % (epoch, name, value))
        f.write("--->Epoch[%d] Train-proj-reg-smoothl1=%f\n" %
                (epoch, proj_regress_loss_t / nbatch))
        f.write("--->Epoch[%d] Train-bbox-reg-smoothl1=%f, %f" %
                (epoch, bbox_predict_loss_t[0] / nbatch,
                 bbox_predict_loss_t[1] / nbatch))
        #f.write("--->Epoch[%d] Train-offset-reg-smoothl1=%f\n" % (epoch, offset_regress_loss_t / nbatch))
        #f.write("--->Epoch[%d] Train-vis-acc=%f" % (epoch, float(softmax_vis_count[2]) / float(softmax_vis_count[0])))
    f.close()
def fit(self, train_data, eval_data=None, eval_metric='acc',
        period=None, to_eval_train=True, grad_req='write',
        epoch_end_callback=None, batch_end_callback=None, kvstore='local',
        logger=None):
    """Train an FCN-style segmentation model with a single bound executor.

    Unlike the per-batch-bind trainer elsewhere in this file, input shapes
    are fixed, so the symbol is bound once and batches are copied into the
    bound NDArrays in place.

    Parameters
    ----------
    train_data, eval_data : DataIter
        Must expose ``provide_data`` / ``provide_label`` and fixed shapes.
    eval_metric : str
        'acc' or 'meanIOU'.
    period : iterable of str, optional
        Which phases to run; defaults to ('train', 'val').
        (Fix: the original default was a shared mutable list.)
    to_eval_train : bool
        Whether to score training batches as well.
    grad_req : str
        'null' disables gradient buffers.
    epoch_end_callback, batch_end_callback : callables, optional
        Invoked per epoch / per batch when provided.
        (Fix: the original called batch_end_callback unconditionally and
        crashed with the default of None.)
    kvstore : str
        Accepted for interface compatibility; not used in this body.
    """
    # Backward-compatible replacement for the mutable default ['train', 'val'].
    if period is None:
        period = ('train', 'val')
    if logger is None:
        logger = logging
    logging.info('Start training with %s', str(self.ctx))

    # region 1. Prepare arguments: input data and label blobs.
    # Argument names of the FCN.
    arg_names = self.symbol.list_arguments()
    # Argument shapes, inferred from the iterator's fixed data shape.
    # print train_data.provide_data[0]
    arg_shapes, out_shapes, aux_shapes = self.symbol.infer_shape(
        data=train_data.provide_data[0][1])
    # arg_shapes, out_shapes, aux_shapes = self.symbol.infer_shape(data=(1, 3,
    #     train_data.resize_size[0],
    #     train_data.resize_size[1],
    #     ))
    # print train_data.provide_data[0][1]
    # quit()
    # Names of the input data and the label blob.
    data_name = train_data.provide_data[0][0]
    label_name = train_data.provide_label[0][0]
    # print data_name, label_name
    # input_names = [data_name, label_name]
    # data shape: (batch_size, channel, h, w)
    # data_shape = train_data.provide_data[0][1]
    self.arg_params[data_name] = mx.nd.empty(train_data.provide_data[0][1],
                                             self.ctx)
    # label shape: (batch_size, h*w)
    self.arg_params[label_name] = mx.nd.empty(
        train_data.provide_label[0][1], self.ctx)
    # quit()
    # Auxiliary states (e.g. BatchNorm moving stats).
    aux_names = self.symbol.list_auxiliary_states()
    self.aux_params = {
        k: mx.nd.zeros(s) for k, s in zip(aux_names, aux_shapes)
    }
    # endregion

    # region 2. Prepare gradient buffers (inputs/labels get none).
    if grad_req != 'null':
        self.grad_params = {}
        for name, shape in zip(arg_names, arg_shapes):
            if not (name.endswith('data') or name.endswith('label')):
                # print name,shape
                self.grad_params[name] = mx.nd.zeros(shape, self.ctx)
    else:
        self.grad_params = None
    # endregion
    # print self.arg_params

    # region 3. Bind model parameters and outputs once, up front.
    self.executor = self.symbol.bind(self.ctx, self.arg_params,
                                     args_grad=self.grad_params,
                                     grad_req=grad_req,
                                     aux_states=self.aux_params)
    # quit()
    assert len(self.symbol.list_arguments()) == len(
        self.executor.grad_arrays)
    # Bind output variables plus CPU-side staging buffers.
    output_dict = {}
    output_buff = {}
    for key, arr in zip(self.symbol.list_outputs(), self.executor.outputs):
        # print key, arr
        output_dict[key] = arr
        output_buff[key] = mx.nd.empty(arr.shape, ctx=mx.cpu())
    # endregion

    # region 4. Set up the optimizer and the set of updatable params.
    self.optimizer = opt.create(self.optimizer,
                                rescale_grad=1.0 / train_data.batch_size,
                                **self.kwargs)
    self.updater = get_updater(self.optimizer)
    # Parameters whose gradients need updating.
    update_dict = {
        name: nd
        for name, nd in zip(self.symbol.list_arguments(),
                            self.executor.grad_arrays) if nd is not None
    }
    # endregion

    # region 5. Set up the evaluation metric.
    if eval_metric == 'acc':
        eval_metric = metric.create(eval_metric)
    elif eval_metric == 'meanIOU':
        eval_metric = MeanIoU(c=1, )
    # endregion

    for epoch in range(self.begin_epoch, self.num_epoch):
        # region begin training
        if 'train' in period:
            logger.info(" in train process...")
            all_start = time.time()
            nbatch = 0
            train_data.reset()
            eval_metric.reset()
            for data in train_data:
                nbatch += 1
                # 1. Copy this batch into the bound input/label NDArrays.
                self.arg_params[data_name][:] = data.data[0]
                self.arg_params[label_name][:] = data.label[0]
                # 2. forward
                self.executor.forward(is_train=True)
                # 3. backward + SGD step ('bigscore_weight' stays frozen).
                self.executor.backward()
                for key, arr in update_dict.items():
                    if key != "bigscore_weight":
                        # updater(param name, gradient, weight)
                        self.updater(key, arr, self.arg_params[key])
                # 4. Score this training batch, if requested.
                if to_eval_train:
                    # Pull the outputs to the CPU staging buffers.
                    for key in output_dict:
                        output_dict[key].copyto(output_buff[key])
                    pred_shape = output_buff['softmax_output'].shape
                    pred = output_buff['softmax_output'].reshape(
                        (pred_shape[0], pred_shape[1],
                         pred_shape[2] * pred_shape[3]))
                    label = data.label[0]
                    eval_metric.update([label], [pred])
                # Guarded: the default batch_end_callback is None.
                if batch_end_callback is not None:
                    batch_end_params = BatchEndParam(
                        epoch=epoch,
                        nbatch=nbatch,
                        eval_metric=eval_metric if to_eval_train else None,
                    )
                    batch_end_callback(batch_end_params)
            if epoch_end_callback is not None:
                epoch_end_callback(epoch, self.symbol, self.arg_params,
                                   self.aux_params)
            if to_eval_train:
                name, value = eval_metric.get()
                logger.info(" --->Epoch[%d] Train-%s=%f",
                            epoch, name, value)
            logger.info('train time per epoch: %f s' %
                        (time.time() - all_start))
        # endregion

        # evaluation
        if 'val' in period and eval_data:
            logger.info(" in eval process...")
            nbatch = 0
            eval_data.reset()
            eval_metric.reset()
            for data in eval_data:
                nbatch += 1
                self.arg_params[data_name][:] = data.data[0]
                self.arg_params[label_name][:] = data.label[0]
                self.executor.forward(is_train=False)
                pred_shape = self.executor.outputs[0].shape
                cpu_output_array = mx.nd.empty(pred_shape)
                self.executor.outputs[0].copyto(cpu_output_array)
                label = data.label[0]
                pred = cpu_output_array.reshape(
                    (pred_shape[0], pred_shape[1],
                     pred_shape[2] * pred_shape[3]))
                eval_metric.update([label], [pred])
                if batch_end_callback is not None:
                    batch_end_params = BatchEndParam(
                        epoch=epoch,
                        nbatch=nbatch,
                        eval_metric=None,
                    )
                    batch_end_callback(batch_end_params)
            name, value = eval_metric.get()
            logger.info('Epoch[%d] Validation-%s=%f', epoch, name, value)
def _train_multi_device(symbol, ctx, arg_names, param_names, aux_names,
                        arg_params, aux_params,
                        begin_epoch, end_epoch, epoch_size, optimizer,
                        kvstore, update_on_kvstore,
                        train_data, eval_data=None, eval_metric=None,
                        epoch_end_callback=None, batch_end_callback=None,
                        logger=None, work_load_list=None, monitor=None,
                        eval_batch_end_callback=None, sym_gen=None):
    """Internal training function on multiple devices.

    This function will also work for single device as well.

    Parameters
    ----------
    symbol : Symbol
        The network configuration
    ctx : list of Context
        The training devices.
    arg_names: list of str
        Name of all arguments of the network.
    param_names: list of str
        Name of all trainable parameters of the network.
    aux_names: list of str
        Name of all auxiliary states of the network.
    arg_params : dict of str to NDArray
        Model parameter, dict of name to NDArray of net's weights.
    aux_params : dict of str to NDArray
        Model parameter, dict of name to NDArray of net's auxiliary states.
    begin_epoch : int
        The beginning training epoch.
    end_epoch : int
        The end training epoch.
    epoch_size : int, optional
        Number of batches in a epoch. In default, it is set to
        ceil(num_train_examples / batch_size)
    optimizer : Optimizer
        The optimization algorithm
    train_data : DataIter
        Training data iterator.
    eval_data : DataIter
        Validation data iterator.
    eval_metric : EvalMetric
        An evaluation function or a list of evaluation functions.
    epoch_end_callback : callable(epoch, symbol, arg_params, aux_states)
        A callback that is invoked at end of each epoch.
        This can be used to checkpoint model each epoch.
    batch_end_callback : callable(BatchEndParams)
        A callback that is invoked at end of each batch.
        This can be used to measure speed, get result from evaluation
        metric. etc.
    kvstore : KVStore
        The KVStore
    update_on_kvstore : bool
        whether or not perform weight updating on kvstore
    logger : logging logger
        When not specified, default logger will be used.
    work_load_list : list of float or int, optional
        The list of work load for different devices, in the same order as
        ctx
    monitor : Monitor, optional
        Monitor installed to executor, for monitoring outputs, weights, and
        gradients for debugging.
    Notes
    -----
    - This function will inplace update the NDArrays in arg_params and
      aux_states.
    """
    if logger is None:
        logger = logging
    # Executor group that shards each batch across the devices in ctx,
    # weighted by work_load_list.
    executor_manager = DataParallelExecutorManager(symbol=symbol,
                                                   sym_gen=sym_gen,
                                                   ctx=ctx,
                                                   train_data=train_data,
                                                   param_names=param_names,
                                                   arg_names=arg_names,
                                                   aux_names=aux_names,
                                                   work_load_list=work_load_list,
                                                   logger=logger)
    if monitor:
        executor_manager.install_monitor(monitor)

    executor_manager.set_params(arg_params, aux_params)

    # Local updater is only needed when the kvstore does not apply updates.
    if not update_on_kvstore:
        updater = get_updater(optimizer)

    if kvstore:
        _initialize_kvstore(kvstore=kvstore,
                            param_arrays=executor_manager.param_arrays,
                            arg_params=arg_params,
                            param_names=executor_manager.param_names,
                            update_on_kvstore=update_on_kvstore)

    if update_on_kvstore:
        logger.debug("Update on kvstore, setting optimizer")
        kvstore.set_optimizer(optimizer)

    # Now start training
    train_data.reset()
    for epoch in range(begin_epoch, end_epoch):
        # Training phase
        tic = time.time()
        eval_metric.reset()
        nbatch = 0
        # Iterate over training data.  The outer while lets one "epoch"
        # span multiple passes of the iterator when epoch_size exceeds
        # the iterator's length.
        while True:
            do_reset = True
            for data_batch in train_data:
                executor_manager.load_data_batch(data_batch)

                if monitor is not None:
                    monitor.tic()

                executor_manager.forward(is_train=True)
                executor_manager.backward()

                if update_on_kvstore:
                    _update_params_on_kvstore(executor_manager.param_arrays,
                                              executor_manager.grad_arrays,
                                              kvstore)
                else:
                    _update_params(executor_manager.param_arrays,
                                   executor_manager.grad_arrays,
                                   updater=updater,
                                   num_device=len(ctx),
                                   kvstore=kvstore)

                if monitor is not None:
                    monitor.toc_print()

                # evaluate at end, so we can lazy copy
                executor_manager.update_metric(eval_metric,
                                               data_batch.label)

                nbatch += 1
                # batch callback (for print purpose)
                if batch_end_callback != None:
                    batch_end_params = BatchEndParam(epoch=epoch,
                                                     nbatch=nbatch,
                                                     eval_metric=eval_metric,
                                                     locals=locals())
                    if isinstance(batch_end_callback, list):
                        for call in batch_end_callback:
                            call(batch_end_params)
                    else:
                        batch_end_callback(batch_end_params)

                # this epoch is done possibly earlier
                if epoch_size is not None and nbatch >= epoch_size:
                    do_reset = False
                    break

            if do_reset == True:
                logger.info('Epoch[%d] Resetting Data Iterator', epoch)
                train_data.reset()

            # this epoch is done
            if epoch_size is None or nbatch >= epoch_size:
                break

        toc = time.time()
        logger.info('Epoch[%d] Time cost=%.3f', epoch, (toc - tic))

        # Sync device weights back to arg_params/aux_params before the
        # callbacks (and always on the final epoch).
        if epoch_end_callback or epoch + 1 == end_epoch:
            executor_manager.copy_to(arg_params, aux_params)

        if epoch_end_callback != None:
            if isinstance(epoch_end_callback, list):
                for call in epoch_end_callback:
                    call(epoch, symbol, arg_params, aux_params)
            else:
                epoch_end_callback(epoch, symbol, arg_params, aux_params)

        # evaluation
        if eval_data:
            eval_metric.reset()
            eval_data.reset()
            for i, eval_batch in enumerate(eval_data):
                executor_manager.load_data_batch(eval_batch)
                executor_manager.forward(is_train=False)
                executor_manager.update_metric(eval_metric,
                                               eval_batch.label)
                if eval_batch_end_callback != None:
                    batch_end_params = BatchEndParam(epoch=epoch,
                                                     nbatch=i,
                                                     eval_metric=eval_metric,
                                                     locals=locals())
                    if isinstance(eval_batch_end_callback, list):
                        for call in eval_batch_end_callback:
                            call(batch_end_params)
                    else:
                        eval_batch_end_callback(batch_end_params)
            name_value = eval_metric.get_name_value()
            for name, value in name_value:
                logger.info('Epoch[%d] Validation-%s=%f',
                            epoch, name, value)
    # end of all epochs
    return
def fddb_finetune_fold(fold_index):
    """Finetune the ellipse-regression head on 9 of the 10 FDDB folds and
    validate on the held-out fold.

    Parameters
    ----------
    fold_index : str
        Two-digit fold id in "01".."10"; that fold becomes the validation
        set, the other nine folds form the training set.

    Side effects: trains the network, prints per-batch / per-epoch losses,
    and checkpoints the model every 25 epochs via `mx.callback.do_checkpoint`.
    Relies on module-level globals: `num_feature_fold`, `feature_fold`,
    `label_fold`, `weight_fold`, `feature_len`, `label_len`, `batchsize`,
    `ctx`, `retrain`, `rpn_prefix`, `load_epoch`, `start_epoch`, `end_epoch`,
    `finetune_prefix`, `fddb_symbol_finetune`, `bbox_predict_metric`.
    """
    # FDDB ships as ten folds; count how many samples land in train vs. valid.
    target_index = ["01", "02", "03", "04", "05", "06", "07", "08", "09", "10"]
    num_train_feature = 0
    num_valid_feature = 0
    for index in target_index:
        if index != fold_index:
            num_train_feature += num_feature_fold[index]
        else:
            num_valid_feature += num_feature_fold[index]
    # Pre-allocate dense train/valid arrays, then copy fold data in row by row.
    train_feature = np.zeros((num_train_feature, feature_len), dtype=np.float)
    train_label = np.zeros((num_train_feature, label_len), dtype=np.float)
    train_weight = np.zeros((num_train_feature, label_len), dtype=np.float)
    train_feature_index = 0
    valid_feature = np.zeros((num_valid_feature, feature_len), dtype=np.float)
    valid_label = np.zeros((num_valid_feature, label_len), dtype=np.float)
    valid_weight = np.zeros((num_valid_feature, label_len), dtype=np.float)
    valid_feature_index = 0
    for index in target_index:
        for i in xrange(num_feature_fold[index]):
            if index != fold_index:
                train_feature[train_feature_index] = feature_fold[index][i]
                train_label[train_feature_index] = label_fold[index][i]
                train_weight[train_feature_index] = weight_fold[index][i]
                train_feature_index += 1
            else:
                valid_feature[valid_feature_index] = feature_fold[index][i]
                valid_label[valid_feature_index] = label_fold[index][i]
                valid_weight[valid_feature_index] = weight_fold[index][i]
                valid_feature_index += 1
    if retrain:
        # Fresh start: build the finetune symbol and randomly initialize
        # args/auxs with a uniform fan-in scaled range (sqrt(2.34 / fan_in)).
        symbol_finetune = fddb_symbol_finetune.get_vgg16_finetune()
        args = {}
        auxs = {}
        arg_names = symbol_finetune.list_arguments()
        aux_names = symbol_finetune.list_auxiliary_states()
        arg_shapes, _, aux_shapes = symbol_finetune.infer_shape(
            data=(batchsize, feature_len))
        for name, shape in zip(arg_names, arg_shapes):
            if len(shape) < 1:
                continue  # skip scalar/shapeless arguments
            fan_in, fan_out = np.prod(shape[1:]), shape[0]
            factor = fan_in
            scale = np.sqrt(2.34 / factor)
            tempt = np.random.uniform(-scale, scale, size=shape)
            args[name] = mx.nd.array(tempt, ctx)
        for name, shape in zip(aux_names, aux_shapes):
            if len(shape) < 1:
                continue
            fan_in, fan_out = np.prod(shape[1:]), shape[0]
            factor = fan_in
            scale = np.sqrt(2.34 / factor)
            tempt = np.random.uniform(-scale, scale, size=shape)
            auxs[name] = mx.nd.array(tempt, ctx)
    else:
        # Resume: load a saved checkpoint and move any parameter that lives
        # on a different device onto `ctx`.
        symbol_finetune = fddb_symbol_finetune.get_vgg16_finetune()
        _, args, auxs = mx.model.load_checkpoint(rpn_prefix, load_epoch)
        for k, v in args.items():
            if v.context != ctx:
                args[k] = mx.nd.zeros(v.shape, ctx)
                v.copyto(args[k])
        for k, v in auxs.items():
            if v.context != ctx:
                auxs[k] = mx.nd.zeros(v.shape, ctx)
                v.copyto(auxs[k])
    arg_names = symbol_finetune.list_arguments()
    arg_shapes, _, aux_shapes = symbol_finetune.infer_shape(
        data=(batchsize, feature_len))
    # Allocate gradient buffers for every learnable argument; inputs and
    # label/weight arguments get no gradient.
    grad_params = {}
    for name, shape in zip(arg_names, arg_shapes):
        if not (name.endswith('ell_label') or
                name.endswith('bbox_weight') or
                name.endswith('data')):
            grad_params[name] = mx.nd.zeros(shape, ctx)
    # Python 2 integer division: a trailing partial batch is dropped.
    num_train_batch = num_train_feature / batchsize
    lr = 0.03
    lr_decay = 0.33
    epoch_end_callback = mx.callback.do_checkpoint(
        finetune_prefix + "-" + fold_index)
    for j in range(start_epoch, end_epoch):
        # Accumulated [bbox_regress, bbox_angle, iou_regress] loss this epoch
        # (component meaning inferred from the print labels below — confirm
        # against bbox_predict_metric).
        bbox_predict_loss = np.array([.0, .0, .0])
        # Decay the learning rate every 50 epochs (and on the first epoch,
        # so the effective starting lr is 0.03 * 0.33) and rebuild the
        # SGD optimizer/updater with the new rate.
        if j % 50 == 0 or j == start_epoch:
            lr *= lr_decay
            optimizer = opt.create('sgd', rescale_grad=1.0 / batchsize,
                                   learning_rate=lr, momentum=0.9, wd=0.00001)
            updater = get_updater(optimizer)
        for i in range(num_train_batch):
            # Slice out the current mini-batch and expose it to the network
            # through the shared `args` dict.
            feature_b = train_feature[i * batchsize:(i + 1) * batchsize, :]
            label_b = train_label[i * batchsize:(i + 1) * batchsize, :]
            weight_b = train_weight[i * batchsize:(i + 1) * batchsize, :]
            args["data"] = mx.nd.array(feature_b, ctx)
            args["ell_label"] = mx.nd.array(label_b, ctx)
            args["bbox_weight"] = mx.nd.array(weight_b, ctx)
            # NOTE(review): the symbol is re-bound every batch; MXNet would
            # allow binding once and copying data in-place, but the original
            # statement order is preserved here.
            executor = symbol_finetune.bind(ctx, args,
                                            args_grad=grad_params,
                                            grad_req='write',
                                            aux_states=auxs)
            assert len(symbol_finetune.list_arguments()) == len(
                executor.grad_arrays)
            # Map argument name -> gradient array, skipping args with no grad.
            update_dict = {
                name: nd
                for name, nd in zip(symbol_finetune.list_arguments(),
                                    executor.grad_arrays) if nd
            }
            # Mirror each output into a CPU-side buffer so it can be read
            # after the async forward completes.
            output_dict = {}
            output_buff = {}
            for key, arr in zip(symbol_finetune.list_outputs(),
                                executor.outputs):
                output_dict[key] = arr
                output_buff[key] = mx.nd.zeros(arr.shape, ctx=mx.cpu())
            executor.forward(is_train=True)
            for key in output_dict:
                output_dict[key].copyto(output_buff[key])
            executor.backward()
            # Apply the SGD update to every parameter that has a gradient.
            for key, arr in update_dict.items():
                updater(key, arr, args[key])
            # Block until the forward/backward pass has actually finished.
            executor.outputs[0].wait_to_read()
            face_pred = output_buff["ellipse_predict_loss_output"].asnumpy()
            bbox_predict_b = bbox_predict_metric(label_b, face_pred, weight_b)
            bbox_predict_loss += bbox_predict_b
            if i % 10 == 0:
                print "Training-fold[" + \
                    fold_index + \
                    "]-epoch[%d/%d]-batch[%d/%d]: lr:%f\tbbox_regress:%f\tbbox_angle:%f\tiou_regress:%f" % \
                    (j, end_epoch, i, num_train_batch, lr,
                     bbox_predict_b[0], bbox_predict_b[1], bbox_predict_b[2])
        # Per-epoch mean training loss over all batches.
        print "ALL Training: bbox_regress:%f\tbbox_angle:%f\tiou_regress:%f" % \
            (bbox_predict_loss[0] / float(num_train_batch),
             bbox_predict_loss[1] / float(num_train_batch),
             bbox_predict_loss[2] / float(num_train_batch))
        if j % 25 == 0:
            print "Saving the model:", j
            epoch_end_callback(j, symbol_finetune, args, auxs)
        # Validation on the held-out fold: run the whole valid set as a
        # single batch with uniform (all-ones) weights and no gradients.
        args["data"] = mx.nd.array(valid_feature, ctx)
        args["ell_label"] = mx.nd.array(valid_label, ctx)
        args["bbox_weight"] = mx.nd.array(
            np.ones((valid_feature.shape[0], label_len), dtype=np.float), ctx)
        executor = symbol_finetune.bind(ctx, args, args_grad=None,
                                        grad_req='null', aux_states=auxs)
        output_dict = {}
        output_buff = {}
        for key, arr in zip(symbol_finetune.list_outputs(), executor.outputs):
            output_dict[key] = arr
            output_buff[key] = mx.nd.zeros(arr.shape, ctx=mx.cpu())
        # NOTE(review): is_train=True during validation — presumably so the
        # loss output is produced; confirm this is intentional.
        executor.forward(is_train=True)
        for key in output_dict:
            output_dict[key].copyto(output_buff[key])
        executor.outputs[0].wait_to_read()
        face_pred = output_buff["ellipse_predict_loss_output"].asnumpy()
        print valid_label[0]
        print face_pred[0]
        bbox_predict_b = bbox_predict_metric(valid_label, face_pred,
                                             valid_weight)
        print "ALL Validation: bbox_regress:%f\tbbox_angle:%f\tiou_regress:%f" % \
            (bbox_predict_b[0], bbox_predict_b[1], bbox_predict_b[2])
grad_params[name] = mx.nd.zeros(shape, ctx) # prepare aux_params aux_names = network.list_auxiliary_states() aux_params = { k: mx.nd.zeros(s, ctx) for k, s in zip(aux_names, aux_shapes) } # prepare optimizer optimizer = opt.create('adam', rescale_grad=(1.0 / dataiter.get_batch_size()), **({ 'learning_rate': 0.01 })) updater = get_updater(optimizer) # create eval_metrix eval_metric = metric.create('rmse') data_name = dataiter.data_name label_name = dataiter.label_name arg_params = network_args aux_params = network_auxs batch_callback = mx.callback.Speedometer(1, 10) epoch_callback = mx.callback.do_checkpoint(save_model_prefix) # begin training for epoch in range(10000): nbatch = 0