def _get_gluon_metrics(train_config):
    metrics_gluon = {
        'value_loss': metric.MSE(name='value_loss', output_names=['value_output']),
        'value_acc_sign': metric.create(acc_sign, name='value_acc_sign',
                                        output_names=['value_output'], label_names=['value_label']),
    }
    if train_config.sparse_policy_label:
        # the default cross entropy only supports sparse labels
        metrics_gluon['policy_loss'] = metric.CrossEntropy(name='policy_loss',
                                                           output_names=['policy_output'],
                                                           label_names=['policy_label'])
        metrics_gluon['policy_acc'] = metric.Accuracy(axis=1, name='policy_acc',
                                                      output_names=['policy_output'],
                                                      label_names=['policy_label'])
    else:
        metrics_gluon['policy_loss'] = metric.create(cross_entropy, name='policy_loss',
                                                     output_names=['policy_output'],
                                                     label_names=['policy_label'])
        metrics_gluon['policy_acc'] = metric.create(acc_distribution, name='policy_acc',
                                                    output_names=['policy_output'],
                                                    label_names=['policy_label'])
    return metrics_gluon
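
# --- Illustrative aside (not part of the original sources) ---
# A minimal, self-contained sketch of the distinction handled above: the built-in
# metric.CrossEntropy expects sparse integer class labels, while a custom callable
# passed to metric.create can consume a full target distribution. The cross_entropy
# function below is a hypothetical stand-in for the project's own metric.
import mxnet as mx
import numpy as np

probs = [mx.nd.array([[0.7, 0.2, 0.1], [0.1, 0.8, 0.1]])]

sparse_ce = mx.metric.CrossEntropy()
sparse_ce.update([mx.nd.array([0, 1])], probs)  # sparse labels: class indices
print(sparse_ce.get())

def cross_entropy(label, pred):
    # label and pred arrive as numpy arrays of shape (batch, num_classes)
    return float(-(label * np.log(np.maximum(pred, 1e-12))).sum(axis=1).mean())

dist_ce = mx.metric.create(cross_entropy, name='policy_loss')
dist_ce.update([mx.nd.array([[1, 0, 0], [0, 1, 0]])], probs)  # one-hot distribution labels
print(dist_ce.get())
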
def validate_model(self, valid_iter, eval_metric):
    """
    Parameters
    ----------
    eval_metric - "accuracy", "ce" (CrossEntropy), "f1", "mae", "mse", "rmse", "top_k_accuracy".
    """
    # res = self.train_module.score(eval_data, validation_metric)
    eval_metric_fn = metric.create(eval_metric)
    end_of_batch = False
    nbatch = 0
    while not end_of_batch:
        try:
            valid_batch = valid_iter.next()
            self.train_module.forward(valid_batch)
            self.train_module.update_metric(eval_metric_fn, valid_batch.label)
            for name, value in eval_metric_fn.get_name_value():
                self.writer.add_scalar(tag="Validation " + name, value=value, global_step=nbatch)
                print("Batch[%d] Validation-%s=%.3f" % (nbatch, name, value))
        except Exception:
            # the iterator raises when it runs out of batches
            end_of_batch = True
        nbatch += 1
def _get_mxnet_metrics(train_config):
    metrics_mxnet = [
        metric.MSE(name='value_loss', output_names=['value_output'], label_names=['value_label']),
        metric.CrossEntropy(name='policy_loss', output_names=['policy_output'],
                            label_names=['policy_label']),
        metric.create(acc_sign, name='value_acc_sign', output_names=['value_output'],
                      label_names=['value_label']),
        metric.Accuracy(axis=1, name='policy_acc', output_names=['policy_output'],
                        label_names=['policy_label'])
    ]
    if train_config.use_wdl:
        metrics_mxnet.append(metric.CrossEntropy(name='wdl_loss', output_names=['wdl_output'],
                                                 label_names=['wdl_label']))
        metrics_mxnet.append(metric.Accuracy(axis=1, name='wdl_acc', output_names=['wdl_output'],
                                             label_names=['wdl_label']))
    if train_config.use_plys_to_end:
        metrics_mxnet.append(metric.MSE(name='plys_to_end_loss', output_names=['plys_to_end_output'],
                                        label_names=['plys_to_end_label']))
    return metrics_mxnet
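
# --- Illustrative aside (not part of the original sources) ---
# A minimal sketch of how a list of metrics, like the one returned above, can be folded
# into a single CompositeEvalMetric: metric.create accepts a list of metric names or
# EvalMetric objects, which is also why such a list can be handed directly to
# Module.score / Module.fit as eval_metric.
import mxnet as mx

composite = mx.metric.create(['acc', 'ce'])
labels = [mx.nd.array([0, 1])]
preds = [mx.nd.array([[0.9, 0.1], [0.2, 0.8]])]
composite.update(labels, preds)
print(composite.get_name_value())  # list of (name, value) pairs for all child metrics
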
def score(self, X, eval_metric='acc', num_batch=None, batch_end_callback=None, reset=True):
    """Run the model on X and calculate the score with eval_metric

    Parameters
    ----------
    X : mxnet.DataIter
    eval_metric : metric.metric
        The metric for calculating score
    num_batch : int or None
        The number of batches to run. Go through all batches if None.

    Returns
    -------
    s : float
        the final score
    """
    # setup metric
    if not isinstance(eval_metric, metric.EvalMetric):
        eval_metric = metric.create(eval_metric)
    X = self._init_iter(X, None, is_train=False)
    if reset:
        X.reset()
    data_shapes = X.provide_data
    data_names = [x[0] for x in data_shapes]
    self._init_predictor(data_shapes)
    data_arrays = [self._pred_exec.arg_dict[name] for name in data_names]
    for i, batch in enumerate(X):
        if num_batch is not None and i == num_batch:
            break
        _load_data(batch, data_arrays)
        self._pred_exec.forward(is_train=False)
        eval_metric.update(batch.label, self._pred_exec.outputs)
        if batch_end_callback is not None:
            batch_end_params = BatchEndParam(epoch=0, nbatch=i,
                                             eval_metric=eval_metric, locals=locals())
            if isinstance(batch_end_callback, list):
                for call in batch_end_callback:
                    call(batch_end_params)
            else:
                batch_end_callback(batch_end_params)
    return eval_metric.get()[1]
def network_backprop_setup(self, grad_req, arg_names, arg_shapes, eval_metric):
    if grad_req != 'null':
        self.grad_params = {}
        for name, shape in zip(arg_names, arg_shapes):
            if not (name.endswith('data') or name.endswith("mean_face")
                    or name.endswith('cls_label') or name.endswith('proj_weight')
                    or name.endswith('proj_label') or name.endswith('ground_truth')
                    or name.endswith('ellipse_label') or name.endswith("bbox_weight")):
                self.grad_params[name] = mx.nd.zeros(shape, self.ctx)
    # setting the required optimizer
    self.optimizer = opt.create(self.optimizer, rescale_grad=1.0, **(self.kwargs))
    self.updater = get_updater(self.optimizer)
    eval_metric = metric.create(eval_metric)
    return eval_metric
def fit(self, train_data, eval_data=None, eval_metric='acc', validate_metric=None,
        work_load_list=None, epoch_end_callback=None, batch_end_callback=None,
        fixed_param_prefix=None, initializer=None, arg_params=None, aux_params=None,
        allow_missing=False, optimizer=None, optimizer_params=None,
        begin_epoch=0, num_epoch=None, kvstore='device', teacher_modules=None):
    if type(teacher_modules) is not list:
        teacher_modules = [teacher_modules]
    self.module.bind(data_shapes=self.data_shapes, label_shapes=self.label_shapes,
                     for_training=True)
    self.module.init_params(initializer=initializer, arg_params=arg_params,
                            aux_params=aux_params, allow_missing=allow_missing)
    self.module.init_optimizer(kvstore=kvstore, optimizer=optimizer,
                               optimizer_params=optimizer_params)
    if validate_metric is None:
        validate_metric = eval_metric
    if not isinstance(eval_metric, metric.EvalMetric):
        eval_metric = metric.create(eval_metric)

    # training loop
    for epoch in range(begin_epoch, num_epoch):
        tic = time.time()
        eval_metric.reset()
        nbatch = 0
        data_iter = iter(train_data)
        end_of_batch = False
        next_data_batch = next(data_iter)
        while not end_of_batch:
            data_batch = next_data_batch
            if teacher_modules[0] is not None:
                for teacher_module in teacher_modules:
                    teacher_module.forward(data_batch=data_batch, is_train=True)
                    transfer_label = teacher_module.get_outputs()
                    data_batch.label = data_batch.label + transfer_label
            self.module.forward(data_batch, is_train=True)
            self.module.backward()
            self.module.update()
            try:
                next_data_batch = next(data_iter)
            except StopIteration:
                end_of_batch = True
            self.module.update_metric(eval_metric, data_batch.label)
            if batch_end_callback is not None:
                batch_end_params = BatchEndParam(epoch=epoch, nbatch=nbatch,
                                                 eval_metric=eval_metric, locals=locals())
                for callback in _as_list(batch_end_callback):
                    callback(batch_end_params)
            nbatch += 1

        for name, val in eval_metric.get_name_value():
            self.logger.info('Epoch[%d] Train-%s=%f', epoch, name, val)
        toc = time.time()
        self.logger.info('Epoch[%d] Time cost=%.3f', epoch, (toc - tic))

        arg_params, aux_params = self.module.get_params()
        self.module.set_params(arg_params, aux_params)

        if epoch_end_callback is not None:
            for callback in _as_list(epoch_end_callback):
                callback(epoch, self.symbol, arg_params, aux_params)
        if eval_data:
            res = self.module.score(eval_data, validate_metric,
                                    score_end_callback=None, batch_end_callback=None,
                                    reset=True, epoch=epoch)
            for name, val in res:
                self.logger.info('Epoch[%d] Validation-%s=%f', epoch, name, val)

        train_data.reset()
def run_training(alpha, queue):
    _, x_val, yv_val, yp_val, plys_to_end, _ = load_pgn_dataset(dataset_type='val', part_id=0,
                                                                verbose=True, normalize=tc.normalize)
    if tc.discount != 1:
        yv_val *= tc.discount**plys_to_end

    if tc.select_policy_from_plane:
        val_iter = mx.io.NDArrayIter({'data': x_val},
                                     {'value_label': yv_val,
                                      'policy_label': np.array(FLAT_PLANE_IDX)[yp_val.argmax(axis=1)]},
                                     tc.batch_size)
    else:
        val_iter = mx.io.NDArrayIter({'data': x_val},
                                     {'value_label': yv_val,
                                      'policy_label': yp_val.argmax(axis=1)},
                                     tc.batch_size)

    tc.nb_parts = len(glob.glob(main_config['planes_train_dir'] + '**/*'))

    # calculate how many iterations per epoch exist
    # one iteration is defined by passing 1 batch and doing backprop
    nb_it_per_epoch = (len(x_val) * tc.nb_parts) // tc.batch_size
    tc.total_it = int(nb_it_per_epoch * tc.nb_training_epochs)

    ### Define a Learning Rate schedule
    to.lr_schedule = OneCycleSchedule(start_lr=tc.max_lr / 8, max_lr=tc.max_lr,
                                      cycle_length=tc.total_it * .3,
                                      cooldown_length=tc.total_it * .6, finish_lr=tc.min_lr)
    to.lr_schedule = LinearWarmUp(to.lr_schedule, start_lr=tc.min_lr, length=tc.total_it / 30)

    ### Momentum schedule
    to.momentum_schedule = MomentumSchedule(to.lr_schedule, tc.min_lr, tc.max_lr,
                                            tc.min_momentum, tc.max_momentum)
    plot_schedule(to.momentum_schedule, iterations=tc.total_it, ylabel='Momentum')

    input_shape = x_val[0].shape
    beta = np.sqrt(2 / alpha)
    print("alpha:", alpha)
    print("beta:", beta)
    depth = int(round(base_depth * alpha))
    channels = int(round(base_channels * beta))

    kernels = [3] * depth
    se_types = [None] * len(kernels)
    channels_reduced = int(round(channels / 4))

    symbol = rise_mobile_v3_symbol(channels=channels, channels_operating_init=channels_reduced,
                                   act_type='relu', channels_value_head=8, value_fc_size=256,
                                   channels_policy_head=NB_POLICY_MAP_CHANNELS,
                                   grad_scale_value=tc.val_loss_factor,
                                   grad_scale_policy=tc.policy_loss_factor,
                                   dropout_rate=tc.dropout_rate, select_policy_from_plane=True,
                                   kernels=kernels, se_types=se_types)

    # create a trainable module on compute context
    model = mx.mod.Module(symbol=symbol, context=ctx, label_names=['value_label', 'policy_label'])
    model.bind(for_training=True,
               data_shapes=[('data', (tc.batch_size, input_shape[0], input_shape[1], input_shape[2]))],
               label_shapes=val_iter.provide_label)
    model.init_params(mx.initializer.Xavier(rnd_type='uniform', factor_type='avg', magnitude=2.24))

    metrics_mxnet = [
        metric.MSE(name='value_loss', output_names=['value_output'], label_names=['value_label']),
        metric.CrossEntropy(name='policy_loss', output_names=['policy_output'],
                            label_names=['policy_label']),
        metric.create(acc_sign, name='value_acc_sign', output_names=['value_output'],
                      label_names=['value_label']),
        metric.Accuracy(axis=1, name='policy_acc', output_names=['policy_output'],
                        label_names=['policy_label'])
    ]
    to.metrics = metrics_mxnet
    train_agent = TrainerAgentMXNET(model, symbol, val_iter, tc, to, use_rtpt=True)
    print("model.score(val_iter, to.metrics):", model.score(val_iter, to.metrics))

    # Start the training process
    _, (k_steps_best, val_metric_values_best) = train_agent.train(cur_it)

    new_row = {
        'alpha': alpha,
        'beta': beta,
        'depth': depth,
        'channels': channels,
        'k_steps_best': k_steps_best,
        'val_loss': val_metric_values_best['loss'],
        'val_value_loss': val_metric_values_best['value_loss'],
        'val_policy_loss': val_metric_values_best['policy_loss'],
        'val_policy_acc': val_metric_values_best['policy_acc'],
        'val_value_acc': val_metric_values_best['value_acc_sign'],
    }
    queue.put(new_row)
    print(new_row)
def fit(self, train_data, eval_data=None, eval_metric='acc',
        epoch_end_callback=None, batch_end_callback=None, kvstore='local',
        optimizer='sgd', optimizer_params=(('learning_rate', 0.01), ),
        eval_end_callback=None, eval_batch_end_callback=None,
        initializer=Uniform(0.01), arg_params=None, aux_params=None,
        allow_missing=False, force_rebind=False, force_init=False,
        begin_epoch=0, num_epoch=None, validation_metric=None, monitor=None):
    assert num_epoch is not None, 'please specify number of epochs'

    self.bind(data_shapes=train_data.provide_data, label_shapes=train_data.provide_label,
              for_training=True, force_rebind=force_rebind)
    if monitor is not None:
        self.install_monitor(monitor)
    self.init_params(initializer=initializer, arg_params=arg_params, aux_params=aux_params,
                     allow_missing=allow_missing, force_init=force_init)
    self.init_optimizer(kvstore=kvstore, optimizer=optimizer, optimizer_params=optimizer_params)

    if validation_metric is None:
        validation_metric = eval_metric
    if not isinstance(eval_metric, metric.EvalMetric):
        eval_metric = metric.create(eval_metric)

    ####chris_arg
    if int(os.getenv("TASK_LIMIT", 0)) != 0:
        # 0: no per-task bandwidth limit; 1: per-task limit, updated every round; 2: per-task limit, fixed
        get_task_cmd = "sh /home/ubuntu/tc.sh -l 1"
    else:
        self.logger.info("no_task_bandwidth_limit")
        get_task_cmd = "sh /home/ubuntu/tc.sh -l 0"
    os.system(get_task_cmd)
    delay_time = float(os.getenv("DELAY_TIME", 0.8))
    ps_upload_bandwidth_part1 = int(os.getenv("PS_UPLOAD_BANDWIDTH1", 2000))
    worker_upload_bandwidth_part1 = int(os.getenv("WORKER_UPLOAD_BANDWIDTH1", 2000))
    ps_upload_bandwidth_part2 = int(os.getenv("PS_UPLOAD_BANDWIDTH2", 2000))
    worker_upload_bandwidth_part2 = int(os.getenv("WORKER_UPLOAD_BANDWIDTH2", 2000))
    tc_command = "sudo tc class change dev {} parent 1: classid 1:3 htb rate {}mbit ceil {}mbit && sudo tc class change dev {} parent 1: classid 1:4 htb rate {}mbit ceil {}mbit"

    ################################################################################
    # training loop
    ################################################################################
    for epoch in range(begin_epoch, num_epoch):
        tic = time.time()
        eval_metric.reset()
        nbatch = 0
        data_iter = iter(train_data)
        end_of_batch = False
        next_data_batch = next(data_iter)
        while not end_of_batch:
            data_batch = next_data_batch
            if monitor is not None:
                monitor.tic()
            self.forward(data_batch, is_train=True)
            if int(os.getenv("TASK_LIMIT", 0)) == 1:
                ## first part bandwidth allocation
                ndarray.waitall()
                # self.logger.info("change bandwidth part1:, "+str(time.time()))
                x = str(ps_upload_bandwidth_part1)
                y = str(worker_upload_bandwidth_part1)
                cmd_up = tc_command.format("ens3", x, x, "ens3", y, y)
                cmd_down = tc_command.format("ifb0", y, y, "ifb0", x, x)
                os.system(cmd_up)
                # os.system(cmd_down)
            # self.logger.info("after forward, "+str(time.time()))
            self.backward()
            # self.logger.info("before update: "+str(time.time()))
            self.update()  # executed asynchronously
            if int(os.getenv("TASK_LIMIT", 0)) == 1:
                x = str(ps_upload_bandwidth_part2)
                y = str(worker_upload_bandwidth_part2)
                cmd_up = tc_command.format("ens3", x, x, "ens3", y, y)
                cmd_down = tc_command.format("ifb0", y, y, "ifb0", x, x)
                time.sleep(delay_time)
                ## second part bandwidth allocation
                # self.logger.info("change bandwidth part2:, "+str(time.time()))
                os.system(cmd_up)
                # os.system(cmd_down)
            try:
                # pre fetch next batch
                next_data_batch = next(data_iter)
                self.prepare(next_data_batch)
            except StopIteration:
                end_of_batch = True
            self.update_metric(eval_metric, data_batch.label)
            if monitor is not None:
                monitor.toc_print()
            if batch_end_callback is not None:
                batch_end_params = BatchEndParam(epoch=epoch, nbatch=nbatch,
                                                 eval_metric=eval_metric, locals=locals())
                for callback in _as_list(batch_end_callback):
                    callback(batch_end_params)
            nbatch += 1

        # one epoch of training is finished
        for name, val in eval_metric.get_name_value():
            self.logger.info('Epoch[%d] Train-%s=%f', epoch, name, val)
        toc = time.time()
        self.logger.info('Epoch[%d] Time cost=%.3f', epoch, (toc - tic))

        # sync aux params across devices
        arg_params, aux_params = self.get_params()
        self.set_params(arg_params, aux_params)

        if epoch_end_callback is not None:
            for callback in _as_list(epoch_end_callback):
                callback(epoch, self.symbol, arg_params, aux_params)

        #----------------------------------------
        # evaluation on validation set
        if eval_data:
            res = self.score(eval_data, validation_metric,
                             score_end_callback=eval_end_callback,
                             batch_end_callback=eval_batch_end_callback, epoch=epoch)
            #TODO: pull this into default
            for name, val in res:
                self.logger.info('Epoch[%d] Validation-%s=%f', epoch, name, val)

        # end of 1 epoch, reset the data-iter for another epoch
        train_data.reset()
def fit(self, train_data, eval_data, eval_metric='mse', grad_req='write',
        epoch_end_callback=None, batch_end_callback=None, kv_store='local', logger=None):
    if logger is None:
        logger = logging
    logging.info('Starting training with %s', str(self.ctx))
    arg_shapes, out_shapes, aux_shapes = self.symbol.infer_shape(data=train_data.provide_data[0][1])
    arg_names = self.symbol.list_arguments()
    if grad_req != 'null':
        self.grad_params = {}
        for name, shape in zip(arg_names, arg_shapes):
            if not (name.endswith('data') or name.endswith('label')):
                self.grad_params[name] = mx.nd.zeros(shape, self.ctx)

    # init the params
    # pdb.set_trace()  # leftover debug breakpoint
    self.arg_params = {k: mx.nd.zeros(s, self.ctx) for k, s in zip(arg_names, arg_shapes)}
    for k, v in self.arg_params.items():
        if not (k.endswith('data') or k.endswith('label')):
            self.initializer(k, v)

    # init the aux params
    aux_names = self.symbol.list_auxiliary_states()
    self.aux_params = {k: mx.nd.zeros(s, self.ctx) for k, s in zip(aux_names, aux_shapes)}

    data_name = train_data.data_name
    label_name = train_data.label_name
    input_names = [data_name, label_name]

    self.optimizer = mx.optimizer.create(self.optimizer,
                                         rescale_grad=(1.0 / train_data.get_batch_size()),
                                         **(self.kwargs))
    self.updater = mx.optimizer.get_updater(self.optimizer)
    eval_metric = metric.create(eval_metric)

    # begin training
    for epoch in range(self.begin_epoch, self.num_epoch):
        nbatch = 0
        train_data.reset()
        eval_metric.reset()

        # train
        for databatch in train_data:
            nbatch += 1
            for k, v in databatch.data.items():
                self.arg_params[k] = mx.nd.array(v, self.ctx)
            for k, v in databatch.label.items():
                self.arg_params[k] = mx.nd.array(v, self.ctx)
            executor = self.symbol.bind(self.ctx, self.arg_params,
                                        args_grad=self.grad_params,
                                        grad_req=grad_req, aux_states=self.aux_params)
            # print(nbatch)
            if nbatch == 1550:
                pdb.set_trace()
            update_dict = {
                name: nd for name, nd in zip(self.symbol.list_arguments(), executor.grad_arrays)
                if nd is not None
            }
            output_dict = {
                name: nd for name, nd in zip(self.symbol.list_outputs(), executor.outputs)
            }
            # pdb.set_trace()
            executor.forward(is_train=True)
            executor.backward()
            for key, arr in update_dict.items():
                self.updater(key, arr, self.arg_params[key])
            label = self.arg_params['lr_label']
            pred = output_dict['lr_output']
            eval_metric.update([label], [pred])
            executor.outputs[0].wait_to_read()
            batch_end_params = BatchEndParam(epoch=epoch, nbatch=nbatch, eval_metric=eval_metric)
            batch_end_callback(batch_end_params)

        if epoch_end_callback is not None:
            epoch_end_callback(epoch, self.symbol, self.arg_params, self.aux_params)
        # pdb.set_trace()
        name, value = eval_metric.get()
        logger.info("------------------------------>Epoch[%d] Train-%s=%f", epoch, name, value)

        # begin evaluation
        if eval_data:
            logger.info("in eval process...")
            nbatch = 0
            eval_data.reset()
            eval_metric.reset()
            for data in eval_data:
                nbatch += 1
                for k, v in data.data.items():
                    self.arg_params[k] = mx.nd.array(v, self.ctx)
                for k, v in data.label.items():
                    self.arg_params[k] = mx.nd.array(v, self.ctx)
                executor = self.symbol.bind(self.ctx, self.arg_params,
                                            args_grad=self.grad_params,
                                            grad_req=grad_req, aux_states=self.aux_params)
                output_dict = {
                    name: nd for name, nd in zip(self.symbol.list_outputs(), executor.outputs)
                }
                executor.forward(is_train=False)
                label = self.arg_params['lr_label']
                pred = output_dict['lr_output']
                eval_metric.update([label], [pred])
                name, value = eval_metric.get()
                logger.info('batch[%d] Validation-%s=%f', nbatch, name, value)
def fit( self, train_data, ogdb, eval_data=None, eval_metric='acc', epoch_end_callback=None, batch_end_callback=None, kvstore='local', optimizer='sgd', optimizer_params=( ('learning_rate', 0.01), ), #,('rescale_grad', 1.0/8.0),), #8 gpu attempt eval_end_callback=None, iter_size=1, eval_batch_end_callback=None, initializer=Uniform(0.01), arg_params=None, aux_params=None, allow_missing=False, force_rebind=False, force_init=False, begin_epoch=0, num_epoch=None, validation_metric=None, monitor=None): """Ke's revision: add iter_size. Trains the module parameters. Checkout `Module Tutorial <http://mxnet.io/tutorials/basic/module.html>`_ to see a end-to-end use-case. Parameters ---------- train_data : DataIter Train DataIter. eval_data : DataIter If not ``None``, will be used as validation set and the performance after each epoch will be evaluated. eval_metric : str or EvalMetric Defaults to 'accuracy'. The performance measure used to display during training. Other possible predefined metrics are: 'ce' (CrossEntropy), 'f1', 'mae', 'mse', 'rmse', 'top_k_accuracy'. epoch_end_callback : function or list of functions Each callback will be called with the current `epoch`, `symbol`, `arg_params` and `aux_params`. batch_end_callback : function or list of function Each callback will be called with a `BatchEndParam`. kvstore : str or KVStore Defaults to 'local'. optimizer : str or Optimizer Defaults to 'sgd'. optimizer_params : dict Defaults to ``(('learning_rate', 0.01),)``. The parameters for the optimizer constructor. The default value is not a dict, just to avoid pylint warning on dangerous default values. eval_end_callback : function or list of function These will be called at the end of each full evaluation, with the metrics over the entire evaluation set. eval_batch_end_callback : function or list of function These will be called at the end of each mini-batch during evaluation. initializer : Initializer The initializer is called to initialize the module parameters when they are not already initialized. arg_params : dict Defaults to ``None``, if not ``None``, should be existing parameters from a trained model or loaded from a checkpoint (previously saved model). In this case, the value here will be used to initialize the module parameters, unless they are already initialized by the user via a call to `init_params` or `fit`. `arg_params` has a higher priority than `initializer`. aux_params : dict Defaults to ``None``. Similar to `arg_params`, except for auxiliary states. allow_missing : bool Defaults to ``False``. Indicates whether to allow missing parameters when `arg_params` and `aux_params` are not ``None``. If this is ``True``, then the missing parameters will be initialized via the `initializer`. force_rebind : bool Defaults to ``False``. Whether to force rebinding the executors if already bound. force_init : bool Defaults to ``False``. Indicates whether to force initialization even if the parameters are already initialized. begin_epoch : int Defaults to 0. Indicates the starting epoch. Usually, if resumed from a checkpoint saved at a previous training phase at epoch N, then this value should be N+1. num_epoch : int Number of epochs for training. Examples -------- >>> # An example of using fit for training. >>> # Assume training dataIter and validation dataIter are ready >>> # Assume loading a previously checkpointed model >>> sym, arg_params, aux_params = mx.model.load_checkpoint(model_prefix, 3) >>> mod.fit(train_data=train_dataiter, eval_data=val_dataiter, optimizer='sgd', ... 
optimizer_params={'learning_rate':0.01, 'momentum': 0.9}, ... arg_params=arg_params, aux_params=aux_params, ... eval_metric='acc', num_epoch=10, begin_epoch=3) """ assert num_epoch is not None, 'please specify number of epochs' self.bind(data_shapes=train_data.provide_data, label_shapes=train_data.provide_label, for_training=True, force_rebind=force_rebind, grad_req='add') if monitor is not None: self.install_monitor(monitor) self.init_params(initializer=initializer, arg_params=arg_params, aux_params=aux_params, allow_missing=allow_missing, force_init=force_init) self.init_optimizer(kvstore=kvstore, optimizer=optimizer, optimizer_params=optimizer_params) if validation_metric is None: validation_metric = eval_metric if not isinstance(eval_metric, metric.EvalMetric): eval_metric = metric.create(eval_metric) annealing_steps = 0 # number of current annealing steps in current epoch redo_training = 0 # Flag to redo training / resample val_list = [] # list of validation results per annealing step cur_val = 0 target_prec = 50 #Note: we want to identify the best cluster of images / training sets with a low percentage ################################################################################ # training loop ################################################################################ for epoch in range(begin_epoch, num_epoch): tic = time.time() eval_metric.reset() nbatch = 0 if redo_training: annealing_steps = annealing_steps + 1 self.logger.info('Redoing training to meet criteria = %d', annealing_steps) #sroidb = train_data.roidb #passthrough test atick = time.time() iterdiff = 1.0 # Check if we've stagnated if len(val_list) > 2: itermean = (val_list[-1] + val_list[-2] + val_list[-3]) / 3 iterdiff = abs(itermean - val_list[-1]) self.logger.info('Last 3 samples have diff of: %f', iterdiff) if iterdiff < 0.01: self.logger.info( 'Reached a stagnated annealing criteria, dumping current samples' ) # Do something drastic # Lets try to instantly use the original db sroidb = ogdb # Try to read in another random subset #sroidb = sample_roidb(ogdb, 25) # Sample with removal else: # Continue as usual # Select a new random subset newroidb = sample_roidb(ogdb, 15) # Without removal, this is 10% # Append old with new sroidb = append_roidb(train_data.roidb, newroidb) # Create new training data instance by passing most of previous arguments and new random db train_data2 = AnchorLoader( train_data.feat_sym, sroidb, train_data.batch_size, train_data.shuffle, train_data.ctx, train_data.work_load_list, train_data.feat_stride, train_data.anchor_scales, train_data.anchor_ratios, train_data.aspect_grouping, nThreads=default.prefetch_thread_num) # Overwrite old train_data with the new one train_data = train_data2 data_iter = iter(train_data) atock = time.time() self.logger.info('Annealing[%d] Time cost=%.3f', annealing_steps, (atock - atick)) else: data_iter = iter(train_data) annealing_steps = 0 val_list = [] #target_prec=cur_val+5 target_prec = target_prec + 5 end_of_batch = False next_data_batch = next(data_iter) while not end_of_batch: data_batch = next_data_batch if monitor is not None: monitor.tic() # self.forward_backward(data_batch) self.forward(data_batch, is_train=True, grad_req='add') self.backward() if nbatch % iter_size == 0: # update every iter_size batches self.update() for g in self._curr_module._exec_group.grad_arrays: for g1 in g: if g1 is not None: g1[:] = 0. 
try: # pre fetch next batch next_data_batch = next(data_iter) self.prepare(next_data_batch) except StopIteration: end_of_batch = True self.update_metric(eval_metric, data_batch.label) if monitor is not None: monitor.toc_print() if batch_end_callback is not None: batch_end_params = BatchEndParam(epoch=epoch, nbatch=nbatch, eval_metric=eval_metric, locals=locals()) for callback in _as_list(batch_end_callback): callback(batch_end_params) nbatch += 1 # one epoch of training is finished for name, val in eval_metric.get_name_value(): self.logger.info('Epoch[%d] Train-%s=%f', epoch, name, val) toc = time.time() self.logger.info('Epoch[%d] Time cost=%.3f', epoch, (toc - tic)) #print('Epoch[%d] Time cost=%.3f', epoch, (toc-tic)) # sync aux params across devices arg_params, aux_params = self.get_params() self.set_params(arg_params, aux_params) if epoch_end_callback is not None: for callback in _as_list(epoch_end_callback): cur_val = callback(epoch, self.symbol, arg_params, aux_params) self.logger.info('Returned Validation=%f', val) val_list.append(val) #---------------------------------------- # evaluation on validation set if eval_data: self.logger.info('Evaluating data') res = self.score(eval_data, validation_metric, score_end_callback=eval_end_callback, batch_end_callback=eval_batch_end_callback, epoch=epoch) #TODO: pull this into default for name, val in res: self.logger.info('Epoch[%d] Validation-%s=%f', epoch, name, val) #---------- # Check epoch if it falls within the validation threshold if cur_val < target_prec: # Evaluate list of precision/validation results first #val_list print(eval_data) #else redo_training = 1 self.logger.info('Retraining data=%f', val) else: redo_training = 0 self.logger.info('Annealing steps=%f', annealing_steps) # end of 1 epoch, reset the data-iter for another epoch train_data.reset()
def fit(self, X, marks, e_marks=None, y=None, eval_data=None, eval_metric='acc',
        epoch_end_callback=None, batch_end_callback=None, time_step_callback=None,
        kvstore='local', logger=None, work_load_list=None, monitor=None,
        eval_batch_end_callback=None):
    """Overwrite"""

    data = self._init_iter(X, y, is_train=True)
    eval_data = self._init_eval_iter(eval_data)

    if self.sym_gen:
        self.symbol = self.sym_gen(data.default_bucket_key)  # pylint: disable=no-member
        self._check_arguments()
    self.kwargs["sym"] = self.symbol

    param_dict = dict(data.provide_data + data.provide_label)
    arg_names, param_names, aux_names = self._init_params(param_dict)

    # setup metric
    if not isinstance(eval_metric, metric.EvalMetric):
        eval_metric = metric.create(eval_metric)

    # create kvstore
    (kvstore, update_on_kvstore) = _create_kvstore(kvstore, len(self.ctx), self.arg_params)

    param_idx2name = {}
    if update_on_kvstore:
        param_idx2name.update(enumerate(param_names))
    else:
        for i, n in enumerate(param_names):
            for k in range(len(self.ctx)):
                param_idx2name[i * len(self.ctx) + k] = n
    self.kwargs["param_idx2name"] = param_idx2name

    # init optimizer
    if isinstance(self.optimizer, str):
        batch_size = data.batch_size
        if kvstore and kvstore.type == 'dist_sync':
            batch_size *= kvstore.num_workers
        optimizer = opt.create(self.optimizer, rescale_grad=(1.0 / batch_size), **(self.kwargs))
    elif isinstance(self.optimizer, opt.Optimizer):
        optimizer = self.optimizer

    # do training
    _train_rnn(self.symbol, self.ctx, marks, arg_names, param_names, aux_names,
               self.arg_params, self.aux_params,
               begin_epoch=self.begin_epoch, end_epoch=self.num_epoch,
               epoch_size=self.epoch_size,
               optimizer=optimizer,
               train_data=data, eval_data=eval_data,
               eval_metric=eval_metric,
               epoch_end_callback=epoch_end_callback,
               batch_end_callback=batch_end_callback,
               time_step_callback=time_step_callback,
               kvstore=kvstore, update_on_kvstore=update_on_kvstore,
               logger=logger, work_load_list=work_load_list, monitor=monitor,
               eval_batch_end_callback=eval_batch_end_callback,
               sym_gen=self.sym_gen, e_marks=e_marks)
aux_names = network.list_auxiliary_states()
aux_params = {k: mx.nd.zeros(s, ctx) for k, s in zip(aux_names, aux_shapes)}

# prepare optimizer
optimizer = opt.create('adam', rescale_grad=(1.0 / dataiter.get_batch_size()),
                       **({'learning_rate': 0.01}))
updater = get_updater(optimizer)

# create eval_metric
eval_metric = metric.create('rmse')

data_name = dataiter.data_name
label_name = dataiter.label_name

arg_params = network_args
aux_params = network_auxs

batch_callback = mx.callback.Speedometer(1, 10)
epoch_callback = mx.callback.do_checkpoint(save_model_prefix)

# begin training
for epoch in range(10000):
    nbatch = 0
    dataiter.reset()
    eval_metric.reset()
    for data in dataiter:
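
# --- Illustrative aside (not part of the original sources) ---
# A minimal sketch of the optimizer/updater pattern used in the manual training loops
# above: opt.create builds an Optimizer, and get_updater wraps it into a callable that
# updates a weight array in place given (key, gradient, weight).
import mxnet as mx
from mxnet import optimizer as opt
from mxnet.optimizer import get_updater

optimizer = opt.create('adam', learning_rate=0.01, rescale_grad=1.0)
updater = get_updater(optimizer)

weight = mx.nd.ones((3,))
grad = mx.nd.full((3,), 0.5)
updater(0, grad, weight)  # key, gradient, weight (weight is updated in place)
print(weight.asnumpy())
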
def main(): # set debug DEBUG = False # =============setting============ dataset = config.dataset.dataset batch_size = config.TRAIN.BATCH_SIZE lr = config.TRAIN.lr beta1 = config.TRAIN.beta1 sigma = 0.02 ctx = [mx.gpu(int(i)) for i in config.gpus.split(',')] assert len(ctx) == 1, 'Multi GPU not supported.' ctx = ctx[0] frequent = config.default.frequent check_point = True logger, final_output_path = create_logger(config.output_path, args.cfg) prefix = os.path.join(final_output_path, config.TRAIN.model_prefix) train_fig_path = os.path.join(final_output_path, 'train_fig') train_fig_prefix = os.path.join(train_fig_path, dataset) if not os.path.exists(train_fig_path): os.makedirs(train_fig_path) # set random seed for reproducibility mx.random.seed(config.RNG_SEED) np.random.seed(config.RNG_SEED) # ==============data============== #train_data = pix2pixIter(config, shuffle=True, ctx=ctx) train_data = DataIter(config,ctx=ctx) step = config.TRAIN.step_epoch * train_data.size / batch_size step_decay = config.TRAIN.decay_epoch * train_data.size / batch_size if config.TRAIN.end_epoch == (config.TRAIN.step_epoch + config.TRAIN.decay_epoch): lr_scheduler_g = PIX2PIXScheduler(step=int(step), step_decay=int(step_decay), base_lr=lr) lr_scheduler_d = PIX2PIXScheduler(step=int(step), step_decay=int(step_decay), base_lr=lr/2.0) else: lr_scheduler_g = None lr_scheduler_d = None label = mx.nd.zeros((batch_size,), ctx=ctx) # print config pprint.pprint(config) logger.info('system:{}'.format(os.uname())) logger.info('mxnet path:{}'.format(mx.__file__)) logger.info('rng seed:{}'.format(config.RNG_SEED)) logger.info('training config:{}\n'.format(pprint.pformat(config))) # =============Generator Module============= if batch_size == 1: if config.netG == 'autoencoder': generatorSymbol = defineG_encoder_decoder(config) elif config.netG == 'unet': generatorSymbol = defineG_unet(config) else: raise NotImplemented else: if config.netG == 'autoencoder': generatorSymbol = defineG_encoder_decoder_batch(config) elif config.netG == 'unet': generatorSymbol = defineG_unet_batch(config) else: raise NotImplemented if DEBUG: generatorGroup = generatorSymbol.get_internals() name_list = generatorGroup.list_outputs() out_name = [] for name in name_list: if 'output' in name: out_name += [generatorGroup[name]] out_group = mx.sym.Group(out_name) out_shapes = out_group.infer_shape(A=(4, 3, 256, 256)) generator = mx.mod.Module(symbol=generatorSymbol, data_names=('A', 'B',), label_names=None, context=ctx) generator.bind(data_shapes=train_data.provide_data) #draw network #network_test(generatorSymbol) # init params arg_params = {} aux_params = {} arg_names = generatorSymbol.list_arguments() aux_names = generatorSymbol.list_auxiliary_states() arg_shapes, _, aux_shapes = generatorSymbol.infer_shape(A = train_data.provide_data[0][1], B = train_data.provide_data[1][1]) if batch_size == 1: for idx, arg_name in enumerate(arg_names): if 'weight' in arg_name: arg_params[arg_name] = mx.random.normal(0.0, sigma, shape=arg_shapes[idx]) elif 'gamma' in arg_name: arg_params[arg_name] = mx.random.normal(1.0, sigma, shape=arg_shapes[idx]) elif 'bias' in arg_name: arg_params[arg_name] = mx.nd.zeros(shape=arg_shapes[idx]) elif 'beta' in arg_name: arg_params[arg_name] = mx.nd.zeros(shape=arg_shapes[idx]) else: # raise NameError('Unknown parameter name.') pass else: for idx, arg_name in enumerate(arg_names): if 'weight' in arg_name: arg_params[arg_name] = mx.random.normal(0.0, sigma, shape=arg_shapes[idx]) elif 'gamma' in arg_name: arg_params[arg_name] = 
mx.random.normal(1.0, sigma, shape=arg_shapes[idx]) elif 'bias' in arg_name: arg_params[arg_name] = mx.nd.zeros(shape=arg_shapes[idx]) elif 'beta' in arg_name: arg_params[arg_name] = mx.nd.zeros(shape=arg_shapes[idx]) else: # raise NameError('Unknown parameter name.') pass for idx, aux_name in enumerate(aux_names): if 'mean' in aux_name: aux_params[aux_name] = mx.nd.zeros(shape=aux_shapes[idx]) elif 'var' in aux_name: aux_params[aux_name] = mx.nd.ones(shape=aux_shapes[idx]) else: raise NameError('Unknown aux_name.') generator.init_params(arg_params=arg_params, aux_params=aux_params) if lr_scheduler_g is not None: generator.init_optimizer( optimizer='adam', optimizer_params={ 'learning_rate': lr, 'lr_scheduler': lr_scheduler_g, 'beta1': beta1, 'rescale_grad': 1.0/batch_size }) else: generator.init_optimizer( optimizer='adam', optimizer_params={ 'learning_rate': lr, 'beta1': beta1, 'rescale_grad': 1.0/batch_size }) mods = [generator] # =============Discriminator Module============= if batch_size == 1: if config.netD == 'basic': discriminatorSymbol = defineD_basic() elif config.netD == 'n_layers': discriminatorSymbol = defineD_n_layers(n_layers = config.n_layers) else: raise NotImplemented else: if config.netD == 'basic': discriminatorSymbol = defineD_basic_batch(batch_size=batch_size) elif config.netD == 'n_layers': discriminatorSymbol = defineD_n_layers_batch(n_layers = config.n_layers, batch_size=batch_size) else: raise NotImplemented if DEBUG: generatorGroup = discriminatorSymbol.get_internals() name_list = generatorGroup.list_outputs() out_name = [] for name in name_list: if 'output' in name: out_name += [generatorGroup[name]] out_group = mx.sym.Group(out_name) out_shapes = out_group.infer_shape(A=(1, 3, 256, 256), B=(1, 3, 256, 256)) discriminator = mx.mod.Module(symbol=discriminatorSymbol, data_names=('A', 'B',), label_names=('label',), context=ctx) discriminator.bind(data_shapes=train_data.provide_data, label_shapes=[('label', (batch_size,))], inputs_need_grad=True) # init params arg_params = {} aux_params = {} arg_names = discriminatorSymbol.list_arguments() aux_names = discriminatorSymbol.list_auxiliary_states() arg_shapes, _, aux_shapes = discriminatorSymbol.infer_shape(A=train_data.provide_data[0][1], B=train_data.provide_data[1][1], label=(batch_size,)) if batch_size == 1: for idx, arg_name in enumerate(arg_names): if 'weight' in arg_name: arg_params[arg_name] = mx.random.normal(0.0, sigma, shape=arg_shapes[idx]) elif 'gamma' in arg_name: arg_params[arg_name] = mx.random.normal(1.0, sigma, shape=arg_shapes[idx]) elif 'bias' in arg_name: arg_params[arg_name] = mx.nd.zeros(shape=arg_shapes[idx]) elif 'beta' in arg_name: arg_params[arg_name] = mx.nd.zeros(shape=arg_shapes[idx]) else: # raise NameError('Unknown parameter name.') pass else: for idx, arg_name in enumerate(arg_names): if 'weight' in arg_name: arg_params[arg_name] = mx.random.normal(0.0, sigma, shape=arg_shapes[idx]) elif 'gamma' in arg_name: arg_params[arg_name] = mx.random.normal(1.0, sigma, shape=arg_shapes[idx]) elif 'bias' in arg_name: arg_params[arg_name] = mx.nd.zeros(shape=arg_shapes[idx]) elif 'beta' in arg_name: arg_params[arg_name] = mx.nd.zeros(shape=arg_shapes[idx]) else: # raise NameError('Unknown parameter name.') pass for idx, aux_name in enumerate(aux_names): if 'mean' in aux_name: aux_params[aux_name] = mx.nd.zeros(shape=aux_shapes[idx]) elif 'var' in aux_name: aux_params[aux_name] = mx.nd.ones(shape=aux_shapes[idx]) else: raise NameError('Unknown aux_name.') 
discriminator.init_params(arg_params=arg_params, aux_params=aux_params) # gradient is scaled in LogisticRegression layer, no need to rescale gradient if lr_scheduler_d is not None: discriminator.init_optimizer( optimizer='adam', optimizer_params={ 'learning_rate': lr / 2.0, 'lr_scheduler': lr_scheduler_d, 'beta1': beta1, 'rescale_grad': 1.0 }) else: discriminator.init_optimizer( optimizer='adam', optimizer_params={ 'learning_rate': lr / 2.0, 'beta1': beta1, 'rescale_grad': 1.0 }) mods.append(discriminator) #load the trained model import symbols.loss_layer.lsoftmax save_model_prefix = '/home/zhengxiawu/project/FGIR-GAN/trained_model/Resnet_lsoftmax' tag = 0 trained_sym, trained_arg_params, trained_aux_params = \ mx.model.load_checkpoint(save_model_prefix, tag) train_model = mx.mod.Module(symbol=trained_sym,data_names=('data',),label_names=('label',), context=ctx) train_model.bind(data_shapes=[('data',(batch_size,3,256,256))], label_shapes=[('label', (batch_size,))], inputs_need_grad=True) train_model.init_params(arg_params=trained_arg_params,aux_params=trained_aux_params) # metric mG = metric.CrossEntropyMetric() mD = metric.CrossEntropyMetric() mACC = metric.AccMetric() mL1 = metric.L1LossMetric(config) mTrained = mx_metric.create(['accuracy']) t_accumulate = 0 # =============train=============== for epoch in range(config.TRAIN.end_epoch): train_data.reset() mACC.reset() mG.reset() mD.reset() mL1.reset() mTrained.reset() for t, batch in enumerate(train_data): t_start = time.time() # generator input real A, output fake B generator.forward(batch, is_train=True) outG = generator.get_outputs() #put into trained model train_model.forward(mx.io.DataBatch([outG[1]*(255.0/2.0)],batch.label),is_train=True) train_model.backward() diffT = train_model.get_input_grads() train_model.update_metric(mTrained,batch.label) generator.backward([mx.nd.array(np.ones((batch_size,)), ctx=ctx), diffT[0] * config.Trained_model_loss * (255.0/2.0)]) generator.update() # update discriminator on fake # discriminator input real A and fake B # want discriminator to predict fake (0) label[:] = 0 discriminator.forward(mx.io.DataBatch([batch.data[0], outG[1]], [label]), is_train=True) discriminator.backward() gradD = [[grad.copyto(grad.context) for grad in grads] for grads in discriminator._exec_group.grad_arrays] discriminator.update_metric(mD, [label]) discriminator.update_metric(mACC, [label]) # update discriminator on real # discriminator input real A and real B # want discriminator to predict real (1) label[:] = 1 batch.label = [label] discriminator.forward(batch, is_train=True) discriminator.backward() for gradsr, gradsf in zip(discriminator._exec_group.grad_arrays, gradD): for gradr, gradf in zip(gradsr, gradsf): # gradr = (gradr + gradf)/2 gradr += gradf discriminator.update() discriminator.update_metric(mD, [label]) discriminator.update_metric(mACC, [label]) # update generator # discriminator input real A and fake B # want discriminator to predict real (1) label[:] = 1 discriminator.forward(mx.io.DataBatch([batch.data[0], outG[1]], [label]), is_train=True) discriminator.backward() diffD = discriminator.get_input_grads() # loss does not need output gradient generator.backward([mx.nd.array(np.ones((batch_size,)), ctx=ctx), diffD[1] * config.GAN_loss]) generator.update() mG.update([label], discriminator.get_outputs()) mL1.update(None, outG) t_accumulate += time.time() - t_start t += 1 if t % frequent == 0: if config.TRAIN.batch_end_plot_figure: visualize(batch.data[0].asnumpy(), batch.data[1].asnumpy(), 
outG[1].asnumpy(), train_fig_prefix + '-train-%04d-%06d.png' % (epoch + 1, t)) #a = mTrained.get() print 'Epoch[{}] Batch[{}] Time[{:.4f}] dACC: {:.4f} gCE: {:.4f} dCE: {:.4f} gL1: {:.4f} tAcc: {:.4f}'.format(epoch, t, t_accumulate, mACC.get()[1], mG.get()[1], mD.get()[1], mL1.get()[1],mTrained.get()[1][0]) logger.info('Epoch[{}] Batch[{}] Speed[{:.4f} batch/s] dACC: {:.4f} gCE: {:.4f} dCE: {:.4f} gL1: {:.4f}\n'.format(epoch, t, frequent * batch_size / t_accumulate, mACC.get()[1], mG.get()[1], mD.get()[1], mL1.get()[1])) t_accumulate = 0 if check_point: print('Saving...') if config.TRAIN.epoch_end_plot_figure: visualize(batch.data[0].asnumpy(), batch.data[1].asnumpy(), outG[1].asnumpy(), train_fig_prefix + '-train-%04d.png' % (epoch + 1)) if (epoch + 1) % config.TRAIN.save_interval == 0: generator.save_params(prefix + '-generator-%04d.params' % (epoch + 1)) discriminator.save_params(prefix + '-discriminator-%04d.params' % (epoch + 1)) generator.save_params(prefix + '-generator-%04d.params' % config.TRAIN.end_epoch) discriminator.save_params(prefix + '-discriminator-%04d.params' % config.TRAIN.end_epoch)
def fit(self, train_data, eval_data=None, eval_metric='acc', validate_metric=None, work_load_list=None, epoch_end_callback=None, batch_end_callback=None, fixed_param_prefix=None, initializer=None, arg_params=None, aux_params=None, allow_missing=False, optimizer=None, optimizer_params=None, begin_epoch=0, num_epoch=None, kvstore='device'): self.module.bind(data_shapes=self.data_shapes, label_shapes=self.label_shapes, for_training=True) self.module.init_params(initializer=initializer, arg_params=arg_params, aux_params=aux_params, allow_missing=allow_missing) self.module.init_optimizer(kvstore=kvstore, optimizer=optimizer, optimizer_params=optimizer_params) if validate_metric is None: validate_metric = eval_metric if not isinstance(eval_metric, metric.EvalMetric): eval_metric = metric.create(eval_metric) temp_count = 0 # # test model size by saving params of model # arg_params, aux_params = self.module.get_params() # for callback in _as_list(epoch_end_callback): # callback(0, self.symbol, arg_params, aux_params) # raise NotImplementedError # training loop for epoch in range(begin_epoch, num_epoch): train_time = AverageMeter() kvstore_sync_time = AverageMeter() get_data_time = AverageMeter() iter_total_time = AverageMeter() tic = time.time() eval_metric.reset() nbatch = 0 data_iter = iter(train_data) end_of_batch = False next_data_batch = next(data_iter) while not end_of_batch: start_time = time.time() data_batch = next_data_batch self.module.forward(data_batch, is_train=True) self.module.backward() # ndarray.waitall() train_time.update(time.time() - start_time) self.module.update() # ndarray.waitall() kvstore_sync_time.update(time.time() - start_time) try: next_data_batch = next(data_iter) except StopIteration: end_of_batch = True # ndarray.waitall() get_data_time.update(time.time() - start_time) if isinstance(data_batch, list): self.module.update_metric(eval_metric, [db.label for db in data_batch], pre_sliced=True) else: self.module.update_metric(eval_metric, data_batch.label) # ndarray.waitall() iter_total_time.update(time.time() - start_time) if batch_end_callback is not None: # batch_end_params = BatchEndParam(epoch=epoch, nbatch=nbatch, # eval_metric=eval_metric, # locals=locals()) batch_end_params = BatchEndParam( epoch=epoch, nbatch=nbatch, eval_metric=eval_metric, locals=locals(), rank=kvstore.rank, total_iter=temp_count, cur_data_time=get_data_time.val, avg_data_time=get_data_time.avg, cur_batch_time=train_time.val, avg_batch_time=train_time.avg, cur_kvstore_sync_time=kvstore_sync_time.val, avg_kvstore_sync_time=kvstore_sync_time.avg, cur_iter_total_time=iter_total_time.val, avg_iter_total_time=iter_total_time.avg) for callback in _as_list(batch_end_callback): callback(batch_end_params) nbatch += 1 temp_count += 1 for name, val in eval_metric.get_name_value(): self.logger.info('Epoch[%d] Train-%s=%f', epoch, name, val) toc = time.time() self.logger.info('Epoch[%d] Time cost=%.3f', epoch, (toc - tic)) arg_params, aux_params = self.module.get_params() self.module.set_params(arg_params, aux_params) if epoch_end_callback is not None and kvstore.rank == 0: for callback in _as_list(epoch_end_callback): callback(epoch, self.symbol, arg_params, aux_params) if eval_data: if self.config.network == 'mobilenet_int8_foldbn': # for fold bn to create inference symbol total_params_path = "./model/%s-%04d.params" % ( self.config.model_prefix, epoch + 1) # total_params_path = "./model/mobilenet_flodbn_0904/mobilenet_int8_flodbn_imagenet_retrain_80_pertensor-fold-0100.params" # _, arg_params, 
aux_params = mx.model.load_checkpoint('./model/mobilenet_flodbn_0904/mobilenet_int8_flodbn_imagenet_retrain_80_pertensor-fold', 100) import os assert os.path.exists( total_params_path ), "please provide the correct total_params_path for foldbn eval" eval_sym = eval(self.config.network)( num_classes=self.config.num_classes, quant_mod=self.config.quant_mod, delay_quant=self.config.delay_quant, is_weight_perchannel=self.config.is_weight_perchannel, total_params_path=total_params_path, quantize_flag=self.config.quantize_flag) eval_module = Module( symbol=eval_sym, data_names=self.data_names, label_names=self.label_names, logger=self.logger, context=self.context, work_load_list=self.work_load_list, fixed_param_names=self.fixed_param_names) eval_module.bind(data_shapes=self.data_shapes, label_shapes=self.label_shapes, for_training=False) eval_module.init_params(initializer=initializer, arg_params=arg_params, aux_params=aux_params) res = eval_module.score(eval_data, validate_metric, score_end_callback=None, batch_end_callback=None, reset=True, epoch=epoch) else: res = self.module.score(eval_data, validate_metric, score_end_callback=None, batch_end_callback=None, reset=True, epoch=epoch) for name, val in res: self.logger.info('Epoch[%d] Validation-%s=%f', epoch, name, val) train_data.reset()
def fit(self, train_data, eval_data=None, eval_metric='acc', epoch_end_callback=None, batch_end_callback=None, kvstore='local', optimizer='sgd', optimizer_params=(('learning_rate', 0.01), ), eval_end_callback=None, eval_batch_end_callback=None, initializer=Uniform(0.01), arg_params=None, aux_params=None, allow_missing=False, force_rebind=False, force_init=False, begin_epoch=0, num_epoch=None, validation_metric=None, monitor=None): assert num_epoch is not None, 'please specify number of epochs' self.bind(data_shapes=train_data.provide_data, label_shapes=train_data.provide_label, for_training=True, force_rebind=force_rebind) if monitor is not None: self.install_monitor(monitor) self.init_params(initializer=initializer, arg_params=arg_params, aux_params=aux_params, allow_missing=allow_missing, force_init=force_init) self.init_optimizer(kvstore=kvstore, optimizer=optimizer, optimizer_params=optimizer_params) if validation_metric is None: validation_metric = eval_metric if not isinstance(eval_metric, metric.EvalMetric): eval_metric = metric.create(eval_metric) ################################################################################ # training loop ################################################################################ last_grad_debug = None for epoch in range(begin_epoch, num_epoch): tic = time.time() eval_metric.reset() nbatch = 0 data_iter = iter(train_data) end_of_batch = False next_data_batch = next(data_iter) while not end_of_batch: data_batch = next_data_batch if monitor is not None: monitor.tic() self.forward_backward(data_batch) # grad_array = [[grad.copyto(grad.context) if grad is not None else None for grad in grads] for grads in # self._curr_module._exec_group.grad_arrays] # # for exec_ in self._curr_module._exec_group.execs: # grad_dict = exec_.grad_dict # # grad_debug = dict() # for k, v in grad_dict.items(): # if v is not None: # v_np = v.asnumpy() # grad_debug[k] = (np.min(v_np), np.max(v_np)) # print 'rpn_conv_cls_weight:', grad_debug['rpn_conv_cls_weight'] # print 'rcnn_fc_cls_weight:', grad_debug['rcnn_fc_cls_weight'] self.update() try: # pre fetch next batch next_data_batch = next(data_iter) self.prepare(next_data_batch) except StopIteration: end_of_batch = True self.update_metric(eval_metric, data_batch.label) if monitor is not None: monitor.toc_print() if batch_end_callback is not None: batch_end_params = BatchEndParam(epoch=epoch, nbatch=nbatch, eval_metric=eval_metric, locals=locals()) for callback in _as_list(batch_end_callback): callback(batch_end_params) nbatch += 1 # one epoch of training is finished for name, val in eval_metric.get_name_value(): self.logger.info('Epoch[%d] Train-%s=%f', epoch, name, val) toc = time.time() self.logger.info('Epoch[%d] Time cost=%.3f', epoch, (toc - tic)) # sync aux params across devices arg_params, aux_params = self.get_params() self.set_params(arg_params, aux_params) if epoch_end_callback is not None: for callback in _as_list(epoch_end_callback): callback(epoch, self.symbol, arg_params, aux_params) #---------------------------------------- # evaluation on validation set if eval_data: res = self.score(eval_data, validation_metric, score_end_callback=eval_end_callback, batch_end_callback=eval_batch_end_callback, epoch=epoch) #TODO: pull this into default for name, val in res: self.logger.info('Epoch[%d] Validation-%s=%f', epoch, name, val) # end of 1 epoch, reset the data-iter for another epoch train_data.reset()
def fit(self, X, y=None, eval_data=None, eval_metric='acc',
        epoch_end_callback=None, batch_end_callback=None, kvstore='local',
        logger=None, work_load_list=None, monitor=None,
        eval_batch_end_callback=None):
    """Fit the model.

    Parameters
    ----------
    X : DataIter, or numpy.ndarray/NDArray
        Training data. If X is a DataIter, the name or (if the name is not available)
        the position of its outputs should match the corresponding variable
        names defined in the symbolic graph.
    y : numpy.ndarray/NDArray, optional
        Training set label.
        If X is numpy.ndarray/NDArray, y is required to be set.
        While y can be 1D or 2D (with 2nd dimension as 1), its first dimension must be
        the same as X, i.e. the number of data points and labels should be equal.
    eval_data : DataIter or numpy.ndarray/list/NDArray pair
        If eval_data is numpy.ndarray/list/NDArray pair,
        it should be (valid_data, valid_label).
    eval_metric : metric.EvalMetric or str or callable
        The evaluation metric or the name of an evaluation metric,
        or a custom evaluation function that returns statistics based on a minibatch.
    epoch_end_callback : callable(epoch, symbol, arg_params, aux_states)
        A callback that is invoked at end of each epoch.
        This can be used to checkpoint model each epoch.
    batch_end_callback: callable(epoch)
        A callback that is invoked at end of each batch for print purposes.
    kvstore: KVStore or str, optional
        The KVStore or a string kvstore type: 'local', 'dist_sync', 'dist_async'.
        Defaults to 'local', often no need to change for a single machine.
    logger : logging logger, optional
        When not specified, default logger will be used.
    work_load_list : float or int, optional
        The list of work load for different devices, in the same order as ctx.

    Note
    ----
    KVStore behavior
    - 'local', multi-devices on a single machine, will automatically choose best type.
    - 'dist_sync', multi-machines with BSP
    - 'dist_async', multi-machines with partial asynchronous
    """
    data = self._init_iter(X, y, is_train=True)
    eval_data = self._init_eval_iter(eval_data)

    if self.sym_gen:
        self.symbol = self.sym_gen(data.default_bucket_key)  # pylint: disable=no-member
        self._check_arguments()
    self.kwargs["sym"] = self.symbol

    arg_names, param_names, aux_names = \
        self._init_params(dict(data.provide_data + data.provide_label))

    param_idx2name = {}
    for i, n in enumerate(param_names):
        for k in range(len(self.ctx)):
            param_idx2name[i * len(self.ctx) + k] = n
    self.kwargs["param_idx2name"] = param_idx2name

    # setup metric
    if not isinstance(eval_metric, metric.EvalMetric):
        eval_metric = metric.create(eval_metric)

    # create kvstore
    (kvstore, update_on_kvstore) = _create_kvstore(kvstore, len(self.ctx), self.arg_params)

    # init optimizer
    if isinstance(self.optimizer, str):
        batch_size = data.batch_size
        if kvstore and kvstore.type == 'dist_sync':
            batch_size *= kvstore.num_workers
        optimizer = opt.create(self.optimizer, rescale_grad=(1.0 / batch_size), **(self.kwargs))
    elif isinstance(self.optimizer, opt.Optimizer):
        optimizer = self.optimizer

    # do training
    _train_multi_device(self.symbol, self.ctx, arg_names, param_names, aux_names,
                        self.arg_params, self.aux_params,
                        begin_epoch=self.begin_epoch, end_epoch=self.num_epoch,
                        epoch_size=self.epoch_size,
                        optimizer=optimizer,
                        train_data=data, eval_data=eval_data,
                        eval_metric=eval_metric,
                        epoch_end_callback=epoch_end_callback,
                        batch_end_callback=batch_end_callback,
                        kvstore=kvstore, update_on_kvstore=update_on_kvstore,
                        logger=logger, work_load_list=work_load_list, monitor=monitor,
                        eval_batch_end_callback=eval_batch_end_callback,
                        sym_gen=self.sym_gen)
def train_model(self, train_iter, batch_size: int, epochs: int, num_features,
                optimizer: str, learning_rate: float, momentum: float,
                score_after: float, eval_metric: str):
    """
    Parameters
    ----------
    eval_metric - "accuracy", "ce" (CrossEntropy), "f1", "mae", "mse", "rmse", "top_k_accuracy".
    """
    # Set Monitor
    monitor = mon.Monitor(interval=score_after, pattern=".*", stat_func=self.loss_fn)
    self.train_module.install_monitor(monitor)

    # Create Metric
    eval_metric_fn = metric.create(eval_metric)

    self.train_module.bind(data_shapes=io.DataDesc(name="data", shape=(batch_size, num_features)),
                           label_shapes=io.DataDesc(name="target", shape=(batch_size, 1)),
                           for_training=True)
    self.train_module.init_optimizer(optimizer=optimizer,
                                     optimizer_params={"learning_rate": learning_rate,
                                                       "momentum": momentum})
    self.train_module.init_params()

    for epoch in range(epochs):
        eval_metric_fn.reset()
        end_of_batch = False
        tic = time()
        nbatch = 0
        while not end_of_batch:
            try:
                train_batch = train_iter.next()
                monitor.tic()
                self.train_module.forward_backward(train_batch)
                self.train_module.update()
                self.train_module.update_metric(eval_metric_fn, train_batch.label)
                for name, value in eval_metric_fn.get_name_value():
                    self.writer.add_scalar(tag="Train " + name, value=value,
                                           global_step=(epoch + 1) * nbatch)
                    print("Epoch[%d] Batch[%d] Train-%s=%.3f" % (epoch, nbatch, name, value))
            except Exception:
                # the iterator raises when it runs out of batches
                end_of_batch = True
            nbatch += 1
        print("Epoch[%d] completed! Time cost=%.3f s" % (epoch, (time() - tic)))
        for grad in self.train_module.get_input_grads():
            self.writer.add_histogram(values=grad, bins=1000, global_step=epoch)
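
# --- Illustrative aside (not part of the original sources) ---
# A minimal, self-contained sketch (toy regression data, a one-layer symbol; the names
# 'data' and 'target_label' are illustrative) of how an eval_metric string such as 'mse'
# is consumed by the Module API in fit() and score(), mirroring the methods above.
import mxnet as mx
import numpy as np

x = np.random.rand(128, 10).astype('float32')
y = x.sum(axis=1, keepdims=True)
train_iter = mx.io.NDArrayIter({'data': x}, {'target_label': y}, batch_size=16)

data = mx.sym.Variable('data')
net = mx.sym.FullyConnected(data=data, num_hidden=1)
net = mx.sym.LinearRegressionOutput(net, name='target')  # label variable becomes 'target_label'

mod = mx.mod.Module(net, data_names=['data'], label_names=['target_label'], context=mx.cpu())
mod.fit(train_iter, eval_metric='mse', optimizer='sgd',
        optimizer_params={'learning_rate': 0.1}, num_epoch=2)
print(mod.score(train_iter, 'mse'))  # list of (metric name, value) pairs
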
def fit(self, train_data, eval_data=None, eval_metric='acc', period=['train', 'val'], to_eval_train=True, grad_req='write', epoch_end_callback=None, batch_end_callback=None, kvstore='local', logger=None): if logger is None: logger = logging logging.info('Start training with %s', str(self.ctx)) # region 1. 准备参数,包括输入数据和标签数据 # FCN的参数名 arg_names = self.symbol.list_arguments() # FCN的参数形状 # print train_data.provide_data[0] arg_shapes, out_shapes, aux_shapes = self.symbol.infer_shape( data=train_data.provide_data[0][1]) # arg_shapes, out_shapes, aux_shapes = self.symbol.infer_shape(data=(1, 3, # train_data.resize_size[0], # train_data.resize_size[1], # )) # print train_data.provide_data[0][1] # quit() # 输入数据和标签数据 data_name = train_data.provide_data[0][0] label_name = train_data.provide_label[0][0] # print data_name, label_name # input_names = [data_name, label_name] # batch_size, channel, h, w # data_shape = train_data.provide_data[0][1] self.arg_params[data_name] = mx.nd.empty(train_data.provide_data[0][1], self.ctx) # # batch_size, h*w self.arg_params[label_name] = mx.nd.empty( train_data.provide_label[0][1], self.ctx) # quit() # 其他参数 aux_names = self.symbol.list_auxiliary_states() self.aux_params = { k: mx.nd.zeros(s) for k, s in zip(aux_names, aux_shapes) } # endregion # region 2.准备参数的梯度 if grad_req != 'null': self.grad_params = {} for name, shape in zip(arg_names, arg_shapes): if not (name.endswith('data') or name.endswith('label')): # print name,shape self.grad_params[name] = mx.nd.zeros(shape, self.ctx) else: self.grad_params = None # endregion # print self.arg_params # region 3. 绑定模型参数 和 模型的输出 self.executor = self.symbol.bind(self.ctx, self.arg_params, args_grad=self.grad_params, grad_req=grad_req, aux_states=self.aux_params) # quit() assert len(self.symbol.list_arguments()) == len( self.executor.grad_arrays) # 绑定输出变量 output_dict = {} output_buff = {} for key, arr in zip(self.symbol.list_outputs(), self.executor.outputs): # print key, arr output_dict[key] = arr output_buff[key] = mx.nd.empty(arr.shape, ctx=mx.cpu()) # endregion # region 4. 设置优化器 self.optimizer = opt.create(self.optimizer, rescale_grad=1.0 / train_data.batch_size, **self.kwargs) self.updater = get_updater(self.optimizer) # 需要更新梯度的参数 update_dict = { name: nd for name, nd in zip(self.symbol.list_arguments(), self.executor.grad_arrays) if nd is not None } # endregion # region 5. 设置评价尺度 if eval_metric == 'acc': eval_metric = metric.create(eval_metric) elif eval_metric == 'meanIOU': eval_metric = MeanIoU(c=1, ) # endregion for epoch in range(self.begin_epoch, self.num_epoch): # region begin training if 'train' in period: logger.info(" in train process...") all_start = time.time() nbatch = 0 train_data.reset() eval_metric.reset() for data in train_data: nbatch += 1 # all_start = time.time() # region 1. 准备 batch 数据 # start = time.time() self.arg_params[data_name][:] = data.data[0] # end = time.time() # print end-start # label_shape = data.label[0].shape # print label_shape self.arg_params[label_name][:] = data.label[0] # end = time.time() # print 'prepare data and label time: %s s' % (end - start) # quit() # print self.arg_params[label_name][:] # endregion # region 2. forward # start = time.time() self.executor.forward(is_train=True) # end = time.time() # print 'forward time: %s s' % (end - start) # endregion # region 3. 
backward # start = time.time() self.executor.backward() for key, arr in update_dict.items(): if key != "bigscore_weight": # 参数名,梯度, 权重 self.updater(key, arr, self.arg_params[key]) # self.executor.outputs[0].wait_to_read() # end = time.time() # print 'backward time: %f s' % (end - start) # endregion # region 4. 测评 # start = time.time() if to_eval_train: # start = time.time() # 取得输出 for key in output_dict: # print key output_dict[key].copyto(output_buff[key]) # output_dict[key].wait_to_read() # end = time.time() # print 'output1 copy time: %s s' % (end - start) # start = time.time() pred_shape = output_buff['softmax_output'].shape # print pred_shape, label_shape # label = self.arg_params[label_name] pred = output_buff['softmax_output'].reshape( (pred_shape[0], pred_shape[1], pred_shape[2] * pred_shape[3])) # pred = pred.copyto(self.ctx) # print pred.shape label = data.label[0] # quit() # end = time.time() # print 'output copy2 time: %s s' % (end - start) # 更新评价 eval_metric.update([label], [pred]) batch_end_params = BatchEndParam( epoch=epoch, nbatch=nbatch, eval_metric=eval_metric if to_eval_train else None, ) batch_end_callback(batch_end_params) # end = time.time() # print '测评 time: %s s' % (end - start) # endregion # all_end = time.time() # print 'all time: %s s' % (all_end - all_start) # if nbatch > 1: # quit() if epoch_end_callback is not None: epoch_end_callback(epoch, self.symbol, self.arg_params, self.aux_params) # all_end = time.time() # print 'all time1: %s s' % (all_end - all_start) if to_eval_train: name, value = eval_metric.get() logger.info( " --->Epoch[%d] Train-%s=%f", epoch, name, value) logger.info('train time per epoch: %f s' % (time.time() - all_start)) # endregion # evaluation if 'val' in period and eval_data: logger.info(" in eval process...") nbatch = 0 eval_data.reset() eval_metric.reset() # all_start = time.time() for data in eval_data: nbatch += 1 # label_shape = data.label.shape self.arg_params[data_name][:] = data.data[0] self.arg_params[label_name][:] = data.label[0] self.executor.forward(is_train=False) pred_shape = self.executor.outputs[0].shape cpu_output_array = mx.nd.empty(pred_shape) self.executor.outputs[0].copyto(cpu_output_array) label = data.label[0] pred = cpu_output_array.reshape( (pred_shape[0], pred_shape[1], pred_shape[2] * pred_shape[3])) eval_metric.update([label], [pred]) batch_end_params = BatchEndParam( epoch=epoch, nbatch=nbatch, eval_metric=None, ) batch_end_callback(batch_end_params) # if nbatch>200: # quit() # quit() # self.executor.outputs[0].wait_to_read() # all_end = time.time() # print 'all time1: %s s' % (all_end - all_start) # all_start = time.time() name, value = eval_metric.get() logger.info('Epoch[%d] Validation-%s=%f', epoch, name, value)
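# The fit() above hand-rolls the Module logic with a raw executor. Below is a
# condensed sketch of that pattern (infer shapes, bind once, then
# forward/backward and apply the updater per batch) on a dummy problem; every
# name is illustrative and the data is random, so treat it as a pattern demo only.
import mxnet as mx
import numpy as np

ctx = mx.cpu()
data = mx.sym.Variable("data")
label = mx.sym.Variable("softmax_label")
net = mx.sym.SoftmaxOutput(mx.sym.FullyConnected(data, num_hidden=10), label=label)

arg_shapes, _, _ = net.infer_shape(data=(8, 20))
arg_names = net.list_arguments()
arg_params = {n: mx.nd.random.normal(0, 0.01, s, ctx=ctx) for n, s in zip(arg_names, arg_shapes)}
grad_params = {n: mx.nd.zeros(s, ctx=ctx) for n, s in zip(arg_names, arg_shapes)
               if not (n.endswith("data") or n.endswith("label"))}

executor = net.bind(ctx, arg_params, args_grad=grad_params, grad_req="write")
optimizer = mx.optimizer.create("sgd", learning_rate=0.1, rescale_grad=1.0 / 8)
updater = mx.optimizer.get_updater(optimizer)

for _ in range(10):  # a few dummy batches
    arg_params["data"][:] = np.random.rand(8, 20)
    arg_params["softmax_label"][:] = np.random.randint(0, 10, (8,))
    executor.forward(is_train=True)
    executor.backward()
    for i, (name, grad) in enumerate(grad_params.items()):
        updater(i, grad, arg_params[name])  # updater takes (index, grad, weight)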
def update_network(queue, nn_update_idx, symbol_filename, params_filename, convert_to_onnx, main_config, train_config: TrainConfig, model_contender_dir): """ Creates a new NN checkpoint in the model contender directory after training using the game files stored in the training directory :param queue: Queue object used to return items :param nn_update_idx: Defines how many updates of the nn has already been done. This index should be incremented after every update. :param symbol_filename: Architecture definition file :param params_filename: Weight file which will be loaded before training Updates the neural network with the newly acquired games from the replay memory :param convert_to_onnx: Boolean indicating if the network shall be exported to ONNX to allow TensorRT inference :param main_config: Dict of the main_config (imported from main_config.py) :param train_config: Dict of the train_config (imported from train_config.py) :param model_contender_dir: String of the contender directory path :return: k_steps_final """ # set the context on CPU, switch to GPU if there is one available (strongly recommended for training) ctx = mx.gpu( train_config.device_id) if train_config.context == "gpu" else mx.cpu() # set a specific seed value for reproducibility train_config.nb_parts = len( glob.glob(main_config["planes_train_dir"] + '**/*.zip')) logging.info("number parts for training: %d" % train_config.nb_parts) train_objects = TrainObjects() if train_config.nb_parts <= 0: raise Exception( 'No .zip files for training available. Check the path in main_config["planes_train_dir"]:' ' %s' % main_config["planes_train_dir"]) _, x_val, y_val_value, y_val_policy, _, _ = load_pgn_dataset( dataset_type="val", part_id=0, normalize=train_config.normalize, verbose=False, q_value_ratio=train_config.q_value_ratio) y_val_policy = prepare_policy(y_val_policy, train_config.select_policy_from_plane, train_config.sparse_policy_label, train_config.is_policy_from_plane_data) val_dataset = gluon.data.ArrayDataset(nd.array(x_val), nd.array(y_val_value), nd.array(y_val_policy)) val_data = gluon.data.DataLoader(val_dataset, train_config.batch_size, shuffle=False, num_workers=train_config.cpu_count) symbol = mx.sym.load(symbol_filename) # calculate how many iterations per epoch exist nb_it_per_epoch = (len(x_val) * train_config.nb_parts) // train_config.batch_size # one iteration is defined by passing 1 batch and doing backprop train_config.total_it = int(nb_it_per_epoch * train_config.nb_training_epochs) train_objects.lr_schedule = CosineAnnealingSchedule( train_config.min_lr, train_config.max_lr, max(train_config.total_it * .7, 1)) train_objects.lr_schedule = LinearWarmUp(train_objects.lr_schedule, start_lr=train_config.min_lr, length=max( train_config.total_it * .25, 1)) train_objects.momentum_schedule = MomentumSchedule( train_objects.lr_schedule, train_config.min_lr, train_config.max_lr, train_config.min_momentum, train_config.max_momentum) input_shape = x_val[0].shape inputs = mx.sym.var('data', dtype='float32') value_out = symbol.get_internals()[main_config['value_output'] + '_output'] policy_out = symbol.get_internals()[main_config['policy_output'] + '_output'] sym = mx.symbol.Group([value_out, policy_out]) net = mx.gluon.SymbolBlock(sym, inputs) net.collect_params().load(params_filename, ctx) metrics_gluon = { 'value_loss': metric.MSE(name='value_loss', output_names=['value_output']), 'value_acc_sign': metric.create(acc_sign, name='value_acc_sign', output_names=['value_output'], label_names=['value_label']), } if 
train_config.sparse_policy_label:
    print("train with sparse labels")
    # the default cross entropy only supports sparse labels
    metrics_gluon['policy_loss'] = metric.CrossEntropy(
        name='policy_loss', output_names=['policy_output'],
        label_names=['policy_label'])
    metrics_gluon['policy_acc'] = metric.Accuracy(
        axis=1, name='policy_acc', output_names=['policy_output'],
        label_names=['policy_label'])
else:
    metrics_gluon['policy_loss'] = metric.create(
        cross_entropy, name='policy_loss', output_names=['policy_output'],
        label_names=['policy_label'])
    metrics_gluon['policy_acc'] = metric.create(
        acc_distribution, name='policy_acc', output_names=['policy_output'],
        label_names=['policy_label'])

train_objects.metrics = metrics_gluon
train_config.export_weights = False  # don't save intermediate weights
train_agent = TrainerAgent(net, val_data, train_config, train_objects, use_rtpt=False)

# iteration counter used for the momentum and learning rate schedule
cur_it = train_config.k_steps_initial * train_config.batch_steps
(k_steps_final, val_value_loss_final, val_policy_loss_final, val_value_acc_sign_final,
 val_policy_acc_final), _ = train_agent.train(cur_it)

prefix = "%smodel-%.5f-%.5f-%.3f-%.3f" % (
    model_contender_dir, val_value_loss_final, val_policy_loss_final,
    val_value_acc_sign_final, val_policy_acc_final)

sym_file = prefix + "-symbol.json"
params_file = prefix + "-" + "%04d.params" % nn_update_idx
# the export function saves both the architecture and the weights
net.export(prefix, epoch=nn_update_idx)
print()
logging.info("Saved checkpoint to %s-%04d.params", prefix, nn_update_idx)

if convert_to_onnx:
    convert_mxnet_model_to_onnx(sym_file, params_file,
                                ["value_out_output", "policy_out_output"],
                                input_shape, [1, 8, 16], False)
logging.info("k_steps_final %d" % k_steps_final)
queue.put(k_steps_final)
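# update_network() above feeds validation data through a Gluon DataLoader built
# from in-memory arrays. A reduced sketch of that part, with random stand-in
# arrays instead of the real plane/value/policy tensors (all shapes invented):
from mxnet import gluon, nd
import numpy as np

x_val = np.random.rand(128, 34, 8, 8).astype(np.float32)      # board planes (dummy)
y_val_value = np.random.uniform(-1, 1, 128).astype(np.float32)
y_val_policy = np.random.rand(128, 2272).astype(np.float32)   # policy targets (dummy size)

val_dataset = gluon.data.ArrayDataset(nd.array(x_val),
                                      nd.array(y_val_value),
                                      nd.array(y_val_policy))
val_data = gluon.data.DataLoader(val_dataset, batch_size=32,
                                 shuffle=False, num_workers=0)  # 0 workers to keep the sketch portable

for x, y_value, y_policy in val_data:
    # each batch yields the arrays in the order they were passed to ArrayDataset
    assert x.shape[0] == y_value.shape[0] == y_policy.shape[0]
    break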
def fit(self, train_data, eval_data=None, eval_metric='acc', grad_req='write', logger=None, softmax_metric=None, regression_metric=None, epoch_end_callback=None): f = open("log_rpn.txt", 'w') if logger is None: logger = logging logging.info('Start training with %s', str(self.ctx)) f.write('Start training with %s\n' % str(self.ctx)) arg_shapes, out_shapes, aux_shapes = self.symbol.infer_shape( data=(1, 3, 128, 128), mean_face=(10, 3), ground_truth=(10, 2), bbox_label=(10, 5)) arg_names = self.symbol.list_arguments() if grad_req != 'null': self.grad_params = {} for name, shape in zip(arg_names, arg_shapes): if not (name.endswith('data') or name.endswith("mean_face") or name.endswith('cls_label') or name.endswith('proj_weight') or name.endswith('proj_label') or name.endswith('ground_truth') or name.endswith('bbox_label') or name.endswith("bbox_weight")): self.grad_params[name] = mx.nd.zeros(shape, self.ctx) else: self.grad_params = None aux_names = self.symbol.list_auxiliary_states() self.aux_params = { k: mx.nd.zeros(s, self.ctx) for k, s in zip(aux_names, aux_shapes) } data_name = train_data.data_name cls_label_name = train_data.cls_label_name proj_label_name = train_data.proj_label_name proj_weight_name = train_data.proj_weight_name ground_truth_name = train_data.ground_truth_name bbox_label_name = train_data.bbox_label_name bbox_weight_name = train_data.bbox_weight_name self.optimizer = opt.create(self.optimizer, rescale_grad=1.0, **(self.kwargs)) self.updater = get_updater(self.optimizer) eval_metric = metric.create(eval_metric) for epoch in range(self.begin_epoch, self.num_epoch): if eval_data: logger.info(" in eval process...") f.write(" in eval process...") nbatch = 0 softmax_proj = np.zeros((11, 3)) proj_regression_loss = .0 bbox_predict_loss = np.array([.0, .0]) eval_data.reset() for data in eval_data: nbatch += 1 print "Eval batch:", nbatch softmax_shape = data[cls_label_name].shape self.arg_params[data_name] = mx.nd.array( data[data_name], self.ctx) self.arg_params[cls_label_name] = mx.nd.array( data[cls_label_name].reshape( (softmax_shape[0], softmax_shape[1] * softmax_shape[2])), self.ctx) self.arg_params[proj_label_name] = mx.nd.array( data[proj_label_name], self.ctx) self.arg_params[proj_weight_name] = mx.nd.array( data[proj_weight_name], self.ctx) self.arg_params[ground_truth_name] = mx.nd.array( data[ground_truth_name], self.ctx) self.arg_params[bbox_label_name] = mx.nd.array( data[bbox_label_name], self.ctx) self.arg_params[bbox_weight_name] = mx.nd.array( data[bbox_weight_name], self.ctx) self.arg_params["mean_face"] = mx.nd.array( train_data.mean_face, self.ctx) executor = self.symbol.bind(self.ctx, self.arg_params, args_grad=self.grad_params, grad_req=grad_req, aux_states=self.aux_params) softmax_output_array = mx.nd.zeros( executor.outputs[0].shape) proj_regression_output_array = mx.nd.zeros( executor.outputs[1].shape) bbox_predict_output_array = mx.nd.zeros( executor.outputs[2].shape) ell_label = mx.nd.zeros(executor.outputs[3].shape) bbox_predict = mx.nd.zeros(executor.outputs[4].shape) executor.forward(is_train=True) executor.outputs[0].copyto(softmax_output_array) executor.outputs[1].copyto(proj_regression_output_array) executor.outputs[2].copyto(bbox_predict_output_array) executor.outputs[3].copyto(ell_label) executor.outputs[4].copyto(bbox_predict) softmax_shape = softmax_output_array.shape index_label = np.nonzero(data[cls_label_name].reshape( softmax_shape[0], softmax_shape[2] * softmax_shape[3]) - 255) label = mx.nd.array(data[cls_label_name].reshape( 
softmax_shape[0], softmax_shape[2] * softmax_shape[3])[:, index_label[1]]) pred = mx.nd.array((softmax_output_array.asnumpy().reshape( softmax_shape[0], softmax_shape[1], softmax_shape[2] * softmax_shape[3]))[..., index_label[1]]) if softmax_metric: tempt = softmax_metric(label, pred, 11) softmax_proj += tempt proj_label = data[proj_label_name] proj_weight = data[proj_weight_name] proj_pred = proj_regression_output_array.asnumpy().reshape( data[proj_weight_name].shape) index_nonzero = np.nonzero(data[proj_weight_name]) proj_regress_tmp = regression_metric( proj_label[index_nonzero], proj_pred[index_nonzero], proj_weight[index_nonzero]) proj_regression_loss += proj_regress_tmp bbox_pred = bbox_predict_output_array.asnumpy() bbox_predict_tmp = bbox_predict_metric( ell_label.asnumpy(), bbox_pred) bbox_predict_loss += bbox_predict_tmp print "Validation-epoch[%d]-batch[%d]: acc:%f\tproj_regress:%f\tbbox_regress:%f\tbbox_angle:%f" % \ (epoch, nbatch, get_accuracy(tempt, self.bgfg), proj_regress_tmp, bbox_predict_tmp[0], bbox_predict_tmp[1]) f.write( "Validation-epoch[%d]-batch[%d]: acc:%f\tproj_regress:%f\tbbox_regress:%f\tbbox_angle:%f\n" % (epoch, nbatch, get_accuracy( tempt, self.bgfg), proj_regress_tmp, bbox_predict_tmp[0], bbox_predict_tmp[1])) img_info = eval_data.AllImg[nbatch - 1] print "%s\twidth: %d height: %d num_face: %d" % \ (img_info.filename, img_info.width, img_info.height, img_info.num_faces) f.write("%s\twidth: %d height: %d num_face: %d\n" % (img_info.filename, img_info.width, img_info.height, img_info.num_faces)) executor.outputs[0].wait_to_read() executor.outputs[1].wait_to_read() executor.outputs[2].wait_to_read() executor.outputs[3].wait_to_read() print_accuracy(softmax_proj, f, train_data.class_names, self.bgfg) logger.info("ALL Validation accuracy: %f", get_accuracy(softmax_proj, self.bgfg)) logger.info('Validation projection regression: %f', proj_regression_loss / nbatch) logger.info('Validation bbox predict: %f %f', bbox_predict_loss[0] / nbatch, bbox_predict_loss[1] / nbatch) f.write("ALL Validation accuracy: %f\n" % get_accuracy(softmax_proj, self.bgfg)) f.write("Validation projection regression: %f\n" % (proj_regression_loss / nbatch)) f.write("Validation bbox predict: %f %f\n" % (bbox_predict_loss[0] / nbatch, bbox_predict_loss[1] / nbatch)) nbatch = 0 train_data.reset() eval_metric.reset() proj_regress_loss_t = .0 proj_regress_loss_b = .0 softmax_count = np.zeros((11, 3)) softmax_batch = np.zeros((11, 3)) bbox_predict_loss_t = np.array([.0, .0]) bbox_predict_loss_b = np.array([.0, .0]) for data in train_data: nbatch += 1 softmax_shape = data[cls_label_name].shape self.arg_params[data_name] = mx.nd.array( data[data_name], self.ctx) self.arg_params[cls_label_name] = mx.nd.array( data[cls_label_name].reshape( (softmax_shape[0], softmax_shape[1] * softmax_shape[2])), self.ctx) self.arg_params[proj_label_name] = mx.nd.array( data[proj_label_name], self.ctx) self.arg_params[proj_weight_name] = mx.nd.array( data[proj_weight_name], self.ctx) self.arg_params[ground_truth_name] = mx.nd.array( data[ground_truth_name], self.ctx) self.arg_params[bbox_label_name] = mx.nd.array( data[bbox_label_name], self.ctx) self.arg_params[bbox_weight_name] = mx.nd.array( data[bbox_weight_name], self.ctx) self.arg_params["mean_face"] = mx.nd.array( train_data.mean_face, self.ctx) self.executor = self.symbol.bind(self.ctx, self.arg_params, args_grad=self.grad_params, grad_req=grad_req, aux_states=self.aux_params) assert len(self.symbol.list_arguments()) == len( self.executor.grad_arrays) 
update_dict = { name: nd for name, nd in zip(self.symbol.list_arguments(), self.executor.grad_arrays) if nd } output_dict = {} output_buff = {} for key, arr in zip(self.symbol.list_outputs(), self.executor.outputs): output_dict[key] = arr output_buff[key] = mx.nd.empty(arr.shape, ctx=mx.cpu()) self.executor.forward(is_train=True) for key in output_dict: output_dict[key].copyto(output_buff[key]) self.executor.backward() ''' for i in xrange(0, 49): if self.executor.grad_arrays[i] != None: print i, arg_names[i], self.executor.grad_arrays[i].asnumpy()[0] ''' for key, arr in update_dict.items(): if key != 'upsample_proposal_weight': self.updater(key, arr, self.arg_params[key]) ''' if key == 'config_fc1_weight': print 'config_fc1_weight' print 'param:', self.arg_params[key].asnumpy() print 'grad:', self.executor.grad_arrays[39].asnumpy() if key == 'refine_proj_param_weight': print 'refine_proj_param_weight' print 'param:', self.arg_params[key].asnumpy() print 'grad:', self.executor.grad_arrays[47].asnumpy() ''' pred_shape = self.executor.outputs[0].shape index_label = np.nonzero(data[cls_label_name].reshape( softmax_shape[0], softmax_shape[1] * softmax_shape[2]) - 255) label = mx.nd.array(data[cls_label_name].reshape( softmax_shape[0], softmax_shape[1] * softmax_shape[2])[:, index_label[1]]) pred = mx.nd.array( (output_buff["proposal_cls_loss_output"].asnumpy().reshape( pred_shape[0], pred_shape[1], pred_shape[2] * pred_shape[3]))[..., index_label[1]]) if softmax_metric: tempt = softmax_metric(label, pred, 11) softmax_count += tempt softmax_batch += tempt # for q in range(0, 50): # print label.asnumpy()[0, q], ':', pred.asnumpy()[0, 0, q], pred.asnumpy()[0, 1, q] proj_label = data[proj_label_name] proj_weight = data[proj_weight_name] proj_pred = output_buff["proj_regression_loss_output"].asnumpy()\ .reshape(data[proj_weight_name].shape) index_nonzero = np.nonzero(data[proj_weight_name]) proj_regress_tmp = regression_metric( proj_label[index_nonzero], proj_pred[index_nonzero], proj_weight[index_nonzero]) proj_regress_loss_t += proj_regress_tmp proj_regress_loss_b += proj_regress_tmp ell_label = output_buff["ell_label_output"].asnumpy() bbox_pred = output_buff["ellipse_predict_loss_output"].asnumpy( ) bbox_predict_tmp = bbox_predict_metric(ell_label, bbox_pred) bbox_predict_loss_t += bbox_predict_tmp bbox_predict_loss_b += bbox_predict_tmp self.executor.outputs[0].wait_to_read() self.executor.outputs[1].wait_to_read() self.executor.outputs[2].wait_to_read() self.executor.outputs[3].wait_to_read() print "Training-epoch[%d]-batch[%d]: acc:%f\tproj_regress:%f\tbbox_regress:%f\tbbox_angle:%f" % \ (epoch, nbatch, get_accuracy(tempt, self.bgfg), proj_regress_tmp, bbox_predict_tmp[0], bbox_predict_tmp[1]) f.write( "Training-epoch[%d]-batch[%d]: acc:%f\tproj_regress:%f\tbbox_regress:%f\tbbox_angle:%f\n" % (epoch, nbatch, get_accuracy( tempt, self.bgfg), proj_regress_tmp, bbox_predict_tmp[0], bbox_predict_tmp[1])) img_info = train_data.AllImg[nbatch - 1] print "%s\twidth: %d height: %d num_face: %d" % \ (img_info.filename, img_info.width, img_info.height, img_info.num_faces) f.write("%s\twidth: %d height: %d num_face: %d\n" % \ (img_info.filename, img_info.width, img_info.height, img_info.num_faces)) if nbatch % 50 == 0: print_accuracy(softmax_batch, f, train_data.class_names, self.bgfg) softmax_batch = np.zeros((11, 3)) print "Keypoints projection regression smoothl1 loss:\t", proj_regress_loss_b / 50 f.write( "Keypoints projection regression smoothl1 loss:\t%f\n" % (proj_regress_loss_b / 50)) print 
"Bounding box regression:\t", bbox_predict_loss_b / 50 f.write("Bounding box regression: %f %f\n" % (bbox_predict_loss_b[0] / 50, bbox_predict_loss_b[1] / 50)) #print "Keypoints offset regression smoothl1 loss:\t", offset_regress_loss_b / 50 #f.write("Keypoints offset regression smoothl1 loss:\t%f\n" % (offset_regress_loss_b / 50)) #print "Keypoints visibility accuracy:\t", float(softmax_vis_batch[2]) / float(softmax_vis_batch[0]) #f.write("Keypoints visibility accuracy:\t%f\n" % # (float(softmax_vis_batch[2]) / float(softmax_vis_batch[0]))) softmax_vis_batch = np.zeros(3) proj_regress_loss_b = .0 offset_regress_loss_b = .0 bbox_predict_loss_b = np.array([.0, .0]) if nbatch % 1000 == 0: if epoch_end_callback != None: epoch_end_callback(epoch * 100000 + nbatch, self.symbol, self.arg_params, self.aux_params) name, value = eval_metric.get() print_accuracy(softmax_count, f, train_data.class_names, self.bgfg) logger.info("--->Epoch[%d] Train-cls-%s=%f", epoch, name, value) logger.info("--->Epoch[%d] Train-proj-reg-smoothl1=%f", epoch, proj_regress_loss_t / nbatch) logger.info("--->Epoch[%d] Train-bbox-reg-smoothl1=%f, %f", epoch, bbox_predict_loss_t[0] / nbatch, bbox_predict_loss_t[1] / nbatch) #logger.info("--->Epoch[%d] Train-offset-reg-smoothl1=%f", epoch, offset_regress_loss_t / nbatch) #logger.info("--->Epoch[%d] Train-vis-acc=%f", epoch, float(softmax_vis_count[2]) / float(softmax_vis_count[0])) f.write("--->Epoch[%d] Train-cls-%s=%f\n" % (epoch, name, value)) f.write("--->Epoch[%d] Train-proj-reg-smoothl1=%f\n" % (epoch, proj_regress_loss_t / nbatch)) f.write("--->Epoch[%d] Train-bbox-reg-smoothl1=%f, %f" % (epoch, bbox_predict_loss_t[0] / nbatch, bbox_predict_loss_t[1] / nbatch)) #f.write("--->Epoch[%d] Train-offset-reg-smoothl1=%f\n" % (epoch, offset_regress_loss_t / nbatch)) #f.write("--->Epoch[%d] Train-vis-acc=%f" % (epoch, float(softmax_vis_count[2]) / float(softmax_vis_count[0]))) f.close()
def fit(self, train_data, eval_data=None, eval_metric='acc', grad_req='write', epoch_end_callback=None, batch_end_callback=None, kvstore='local', logger=None): if logger is None: logger = logging logging.info('Start training with %s', str(self.ctx)) arg_shapes, out_shapes, aux_shapes = self.symbol.infer_shape( data=train_data.provide_data[0][1]) arg_names = self.symbol.list_arguments() if grad_req != 'null': self.grad_params = {} for name, shape in zip(arg_names, arg_shapes): if not (name.endswith('data') or name.endswith('label')): self.grad_params[name] = mx.nd.zeros(shape, self.ctx) else: self.grad_params = None aux_names = self.symbol.list_auxiliary_states() self.aux_params = { k: nd.zeros(s) for k, s in zip(aux_names, aux_shapes) } data_name = train_data.data_name label_name = train_data.label_name input_names = [data_name, label_name] self.optimizer = opt.create(self.optimizer, rescale_grad=(1.0 / train_data.get_batch_size()), **(self.kwargs)) self.updater = get_updater(self.optimizer) eval_metric = metric.create(eval_metric) # begin training for epoch in range(self.begin_epoch, self.num_epoch): nbatch = 0 train_data.reset() eval_metric.reset() for data in train_data: nbatch += 1 label_shape = data[label_name].shape self.arg_params[data_name] = mx.nd.array( data[data_name], self.ctx) self.arg_params[label_name] = mx.nd.array(data[label_name].reshape(label_shape[0], \ label_shape[1]*label_shape[2]), self.ctx) output_names = self.symbol.list_outputs() self.exector = self.symbol.bind(self.ctx, self.arg_params, args_grad=self.grad_params, grad_req=grad_req, aux_states=self.aux_params) assert len(self.symbol.list_arguments()) == len( self.exector.grad_arrays) update_dict = {name: nd for name, nd in zip(self.symbol.list_arguments(), \ self.exector.grad_arrays) if nd is not None} output_dict = {} output_buff = {} for key, arr in zip(self.symbol.list_outputs(), self.exector.outputs): output_dict[key] = arr output_buff[key] = mx.nd.empty(arr.shape, ctx=mx.cpu()) self.exector.forward(is_train=True) for key in output_dict: output_dict[key].copyto(output_buff[key]) self.exector.backward() for key, arr in update_dict.items(): if key != "bigscore_weight": self.updater(key, arr, self.arg_params[key]) pred_shape = self.exector.outputs[0].shape label = mx.nd.array(data[label_name].reshape( label_shape[0], label_shape[1] * label_shape[2])) pred = mx.nd.array(output_buff["softmax_output"].asnumpy().reshape(pred_shape[0], \ pred_shape[1], pred_shape[2]*pred_shape[3])) eval_metric.update([label], [pred]) self.exector.outputs[0].wait_to_read() batch_end_params = BatchEndParam(epoch=epoch, nbatch=nbatch, eval_metric=eval_metric) batch_end_callback(batch_end_params) if epoch_end_callback is not None: epoch_end_callback(epoch, self.symbol, self.arg_params, self.aux_params) name, value = eval_metric.get() logger.info(" --->Epoch[%d] Train-%s=%f", epoch, name, value) # evaluation if eval_data: logger.info(" in eval process...") nbatch = 0 eval_data.reset() eval_metric.reset() for data in eval_data: nbatch += 1 label_shape = data[label_name].shape self.arg_params[data_name] = mx.nd.array( data[data_name], self.ctx) self.arg_params[label_name] = mx.nd.array(data[label_name].reshape(label_shape[0], \ label_shape[1]*label_shape[2]), self.ctx) exector = self.symbol.bind(self.ctx, self.arg_params, args_grad=self.grad_params, grad_req=grad_req, aux_states=self.aux_params) cpu_output_array = mx.nd.zeros(exector.outputs[0].shape) exector.forward(is_train=False) exector.outputs[0].copyto(cpu_output_array) pred_shape = 
cpu_output_array.shape label = mx.nd.array(data[label_name].reshape(label_shape[0], \ label_shape[1]*label_shape[2])) pred = mx.nd.array(cpu_output_array.asnumpy().reshape(pred_shape[0], \ pred_shape[1], pred_shape[2]*pred_shape[3])) eval_metric.update([label], [pred]) exector.outputs[0].wait_to_read() name, value = eval_metric.get() logger.info('batch[%d] Validation-%s=%f', nbatch, name, value)
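# Both segmentation fit() loops above reshape the (N, C, H, W) softmax output to
# (N, C, H*W) before calling eval_metric.update(). A standalone sketch of that
# reshaping with mx.metric.Accuracy on dummy data (shapes invented):
import mxnet as mx
import numpy as np

n, c, h, w = 2, 21, 4, 4
softmax_output = mx.nd.array(np.random.rand(n, c, h, w))
label = mx.nd.array(np.random.randint(0, c, (n, h * w)))

pred = softmax_output.reshape((n, c, h * w))   # class axis stays at dim 1
acc = mx.metric.Accuracy(axis=1)               # tell Accuracy where the class axis is
acc.update([label], [pred])
print(acc.get())  # ('accuracy', <value>)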
def fit(self, train_data, eval_data, eval_metric='mse', grad_req='write', epoch_end_callback=None, batch_end_callback=None, kv_store='local', logger=None): if logger is None: logger = logging logging.info('Starting training with %s', str(self.ctx)) arg_shapes, out_shapes, aux_shapes = self.symbol.infer_shape(data = train_data.provide_data[0][1]) arg_names = self.symbol.list_arguments() if grad_req != 'null': self.grad_params = {} for name, shape in zip(arg_names, arg_shapes): if not (name.endswith('data') or name.endswith('label')): self.grad_params[name] = mx.nd.zeros(shape, self.ctx) #init the params pdb.set_trace() self.arg_params = {k : mx.nd.zeros(s, self.ctx) for k, s in zip(arg_names, arg_shapes)} for k, v in self.arg_params.items(): if not (k.endswith('data') or k.endswith('label')): self.initializer(k, v) #init the aux params aux_names = self.symbol.list_auxiliary_states() self.aux_params = {k : mx.nd.zeros(s, self.ctx) for k, s in zip(aux_names, aux_shapes)} data_name = train_data.data_name label_name = train_data.label_name input_names = [data_name, label_name] self.optimizer = mx.optimizer.create(self.optimizer, rescale_grad = (1.0/train_data.get_batch_size()), **(self.kwargs)) self.updater = mx.optimizer.get_updater(self.optimizer) eval_metric = metric.create(eval_metric) # begin training for epoch in range(self.begin_epoch, self.num_epoch): nbatch = 0 train_data.reset() eval_metric.reset() #train for databatch in train_data: nbatch += 1 for k, v in databatch.data.items(): self.arg_params[k] = mx.nd.array(v, self.ctx) for k, v in databatch.label.items(): self.arg_params[k] = mx.nd.array(v, self.ctx) executor = self.symbol.bind(self.ctx, self.arg_params, args_grad = self.grad_params, grad_req = grad_req, aux_states = self.aux_params) # print(nbatch) if nbatch == 1550: pdb.set_trace() update_dict = {name:nd for name, nd in zip(self.symbol.list_arguments(), executor.grad_arrays) if nd} output_dict = {name:nd for name, nd in zip(self.symbol.list_outputs(), executor.outputs)} # pdb.set_trace() executor.forward(is_train=True) executor.backward() for key, arr in update_dict.items(): self.updater(key, arr, self.arg_params[key]) label = self.arg_params['lr_label'] pred = output_dict['lr_output'] eval_metric.update([label], [pred]) executor.outputs[0].wait_to_read() batch_end_params = BatchEndParam(epoch=epoch, nbatch=nbatch, eval_metric=eval_metric) batch_end_callback(batch_end_params) if epoch_end_callback != None: epoch_end_callback(epoch, self.symbol, self.arg_params, self.aux_params) # pdb.set_trace() name, value = eval_metric.get() logger.info("------------------------------>Epoch[%d] Train-%s=%f", epoch, name, value) #begin evaluation if eval_data: logger.info( "in eval process...") nbatch = 0 eval_data.reset() eval_metric.reset() for data in eval_data: nbatch += 1 for k, v in databatch.data.items(): self.arg_params[k] = mx.nd.array(v, self.ctx) for k, v in databatch.label.items(): self.arg_params[k] = mx.nd.array(v, self.ctx) executor = self.symbol.bind(self.ctx, self.arg_params, args_grad = self.grad_params, grad_req = grad_req, aux_states = self.aux_params) output_dict = {name:nd for name, nd in zip(self.symbol.list_outputs(), executor.outputs)} executor.forward(is_train=False) label = self.arg_params['lr_label'] pred = output_dict['lr_output'] eval_metric.update([label], [pred]) name, value = eval_metric.get() logger.info('batch[%d] Validation-%s=%f', nbatch, name, value)
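# The super-resolution fit() above re-binds a new executor for every batch, which
# re-allocates memory each time. A common alternative (sketched here on a dummy
# symbol, all names invented) is to bind once with simple_bind and copy new data
# into the executor's argument arrays in place:
import mxnet as mx

data = mx.sym.Variable("data")
label = mx.sym.Variable("lr_label")
sym = mx.sym.LinearRegressionOutput(mx.sym.FullyConnected(data, num_hidden=1),
                                    label=label, name="lr")

executor = sym.simple_bind(ctx=mx.cpu(), grad_req="write",
                           data=(16, 8), lr_label=(16, 1))
for _ in range(5):
    # overwrite the bound arrays instead of creating a new executor per batch
    executor.arg_dict["data"][:] = mx.nd.random.uniform(shape=(16, 8))
    executor.arg_dict["lr_label"][:] = mx.nd.random.uniform(shape=(16, 1))
    executor.forward(is_train=True)
    executor.backward()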
def fit(self, train_data, eval_data=None, eval_metric='acc', epoch_end_callback=None, batch_end_callback=None, kvstore='local', optimizer='sgd', optimizer_params=(('learning_rate', 0.01), ), eval_end_callback=None, eval_batch_end_callback=None, initializer=Uniform(0.01), arg_params=None, aux_params=None, allow_missing=False, force_rebind=False, force_init=False, begin_epoch=0, num_epoch=None, validation_metric=None, monitor=None, sparse_row_id_fn=None, profile=False): """Trains the module parameters. Checkout `Module Tutorial <http://mxnet.io/tutorials/basic/module.html>`_ to see a end-to-end use-case. Parameters ---------- train_data : DataIter Train DataIter. eval_data : DataIter If not ``None``, will be used as validation set and the performance after each epoch will be evaluated. eval_metric : str or EvalMetric Defaults to 'accuracy'. The performance measure used to display during training. Other possible predefined metrics are: 'ce' (CrossEntropy), 'f1', 'mae', 'mse', 'rmse', 'top_k_accuracy'. epoch_end_callback : function or list of functions Each callback will be called with the current `epoch`, `symbol`, `arg_params` and `aux_params`. batch_end_callback : function or list of function Each callback will be called with a `BatchEndParam`. kvstore : str or KVStore Defaults to 'local'. optimizer : str or Optimizer Defaults to 'sgd'. optimizer_params : dict Defaults to ``(('learning_rate', 0.01),)``. The parameters for the optimizer constructor. The default value is not a dict, just to avoid pylint warning on dangerous default values. eval_end_callback : function or list of function These will be called at the end of each full evaluation, with the metrics over the entire evaluation set. eval_batch_end_callback : function or list of function These will be called at the end of each mini-batch during evaluation. initializer : Initializer The initializer is called to initialize the module parameters when they are not already initialized. arg_params : dict Defaults to ``None``, if not ``None``, should be existing parameters from a trained model or loaded from a checkpoint (previously saved model). In this case, the value here will be used to initialize the module parameters, unless they are already initialized by the user via a call to `init_params` or `fit`. `arg_params` has a higher priority than `initializer`. aux_params : dict Defaults to ``None``. Similar to `arg_params`, except for auxiliary states. allow_missing : bool Defaults to ``False``. Indicates whether to allow missing parameters when `arg_params` and `aux_params` are not ``None``. If this is ``True``, then the missing parameters will be initialized via the `initializer`. force_rebind : bool Defaults to ``False``. Whether to force rebinding the executors if already bound. force_init : bool Defaults to ``False``. Indicates whether to force initialization even if the parameters are already initialized. begin_epoch : int Defaults to 0. Indicates the starting epoch. Usually, if resumed from a checkpoint saved at a previous training phase at epoch N, then this value should be N+1. num_epoch : int Number of epochs for training. sparse_row_id_fn : A callback function The function takes `data_batch` as an input and returns a dict of str -> NDArray. The resulting dict is used for pulling row_sparse parameters from the kvstore, where the str key is the name of the param, and the value is the row id of the param to pull. 
""" assert num_epoch is not None, 'please specify number of epochs' self.bind(data_shapes=train_data.provide_data, label_shapes=train_data.provide_label, for_training=True, force_rebind=force_rebind) if monitor is not None: self.install_monitor(monitor) self.init_params(initializer=initializer, arg_params=arg_params, aux_params=aux_params, allow_missing=allow_missing, force_init=force_init) self.init_optimizer(kvstore=kvstore, optimizer=optimizer, optimizer_params=optimizer_params) if validation_metric is None: validation_metric = eval_metric if not isinstance(eval_metric, metric.EvalMetric): eval_metric = metric.create(eval_metric) ################################################################################ # training loop ################################################################################ for epoch in range(begin_epoch, num_epoch): tic = time.time() eval_metric.reset() nbatch = 0 data_iter = iter(train_data) end_of_batch = False next_data_batch = next(data_iter) while not end_of_batch: data_batch = next_data_batch if monitor is not None: monitor.tic() self.forward_backward(data_batch) self.update() if isinstance(data_batch, list): self.update_metric(eval_metric, [db.label for db in data_batch], pre_sliced=True) else: self.update_metric(eval_metric, data_batch.label) try: # pre fetch next batch next_data_batch = next(data_iter) self.prepare(next_data_batch, sparse_row_id_fn=sparse_row_id_fn) except StopIteration: end_of_batch = True if monitor is not None: monitor.toc_print() if end_of_batch: eval_name_vals = eval_metric.get_name_value() if batch_end_callback is not None: batch_end_params = BatchEndParam(epoch=epoch, nbatch=nbatch, eval_metric=eval_metric, locals=locals()) for callback in _as_list(batch_end_callback): callback(batch_end_params) nbatch += 1 if profile is True and nbatch == 10: self.logger.info("Profiling ends") import mxnet as mx mx.profiler.dump() # one epoch of training is finished for name, val in eval_name_vals: self.logger.info('Epoch[%d] Train-%s=%f', epoch, name, val) toc = time.time() self.logger.info('Epoch[%d] Time cost=%.3f', epoch, (toc - tic)) # sync aux params across devices arg_params, aux_params = self.get_params() self.set_params(arg_params, aux_params) if epoch_end_callback is not None and self._kvstore.rank == 0: for callback in _as_list(epoch_end_callback): callback(epoch, self.symbol, arg_params, aux_params) # end of 1 epoch, reset the data-iter for another epoch train_data.reset()
def fit(self): # kvstore if self.kv_store is 'local' and (self.gpus is None or len(self.gpus.split(',')) is 1): kv = None else: kv = mx.kvstore.create(self.kv_store) # setup module, including symbol, params and aux # get_model should always be called before get_data_iterator to ensure correct data loader self.get_model() # get dataloader train_data, eval_data = self.get_data_iterator() # evaluate metrics eval_metric_lst = [] if "acc" in self.eval_metric: eval_metric_lst.append(metric.create(self.eval_metric)) if "acc_ignore" in self.eval_metric and self.ignore_label is not None: eval_metric_lst.append( AccWithIgnoreMetric(self.ignore_label, name="acc_ignore")) if "IoU" in self.eval_metric and self.ignore_label is not None: eval_metric_lst.append( IoUMetric(self.ignore_label, label_num=self.label_num, name="IoU")) eval_metric_lst.append( SoftmaxLoss(self.ignore_label, label_num=self.label_num, name="SoftmaxLoss")) eval_metrics = CompositeEvalMetric(metrics=eval_metric_lst) optimizer_params = {} # optimizer # lr policy if self.lr_policy == 'step' and self.lr_factor < 1 and self.lr_factor_epoch > 0: optimizer_params['lr_scheduler'] = mx.lr_scheduler.FactorScheduler( step=max(int(self.epoch_size * self.lr_factor_epoch), 1), factor=self.lr_factor) elif self.lr_policy == 'poly': optimizer_params['lr_scheduler'] = lr_scheduler.PolyScheduler( origin_lr=self.lr, max_samples=max(int(self.epoch_size * self.num_epochs), 1), factor=self.lr_factor) else: logging.error('Unknown lr policy: %s' % self.lr_policy) optimizer_params['learning_rate'] = self.lr optimizer_params['momentum'] = self.momentum optimizer_params['wd'] = self.weight_decay optimizer_params['rescale_grad'] = 1.0 / self.batch_size optimizer_params['clip_gradient'] = 5 # directory for saving models model_path = os.path.join(self.model_dir, self.save_model_prefix) if not os.path.isdir(model_path): os.mkdir(model_path) model_full_path = os.path.join( model_path, datetime.now().strftime('%Y_%m_%d_%H:%M:%S')) if not os.path.isdir(model_full_path): os.mkdir(model_full_path) checkpoint = utils.do_checkpoint( os.path.join(model_full_path, self.save_model_prefix), self.checkpoint_interval) with open( os.path.join( model_full_path, 'train_' + datetime.now().strftime('%Y_%m_%d_%H:%M:%S') + '.cfg'), 'w') as f: self.config.write(f) utils.save_symbol( self.symbol, os.path.join(model_full_path, self.save_model_prefix)) utils.save_log(self.save_model_prefix, model_full_path) # draw network if self.draw_network is True: utils.draw_network( self.symbol, os.path.join(model_full_path, self.save_model_prefix), self.data_shape[0]) # batch_end_callback batch_end_callback = list() batch_end_callback.append(utils.Speedometer(self.batch_size, 10)) module = mx.module.Module(self.symbol, context=self.ctx, data_names=self.data_name, label_names=self.label_name) # initialize (base_module now no more do this initialization) train_data.reset() module.fit( train_data=train_data, eval_data=eval_data, eval_metric=eval_metrics, epoch_end_callback=checkpoint, batch_end_callback=batch_end_callback, kvstore=kv, optimizer=self.optimizer, optimizer_params=optimizer_params, initializer=mx.init.Xavier(factor_type="in", magnitude=2.34), arg_params=self.arg_params, aux_params=self.aux_params, allow_missing=True, begin_epoch=self.load_epoch, num_epoch=self.num_epochs, )
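# fit() above bundles several metrics into one CompositeEvalMetric. A minimal
# sketch with two built-in metrics (the custom IoU/ignore-label metrics used in
# the code are project-specific and omitted here):
import mxnet as mx

composite = mx.metric.CompositeEvalMetric()
composite.add(mx.metric.Accuracy())
composite.add(mx.metric.CrossEntropy())

labels = [mx.nd.array([0, 1, 1])]
preds = [mx.nd.array([[0.8, 0.2], [0.3, 0.7], [0.6, 0.4]])]
composite.update(labels, preds)
print(composite.get())   # (['accuracy', 'cross-entropy'], [values...])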
def fit(self, train_data, eval_data=None, eval_metric='acc', grad_req='write', epoch_end_callback=None, batch_end_callback=None, kvstore='local', logger=None): if logger is None: logger = logging logging.info('Start training with %s', str(self.ctx)) arg_shapes, out_shapes, aux_shapes = self.symbol.infer_shape(data=train_data.provide_data[0][1]) arg_names = self.symbol.list_arguments() if grad_req != 'null': self.grad_params = {} for name, shape in zip(arg_names, arg_shapes): if not (name.endswith('data') or name.endswith('label')): self.grad_params[name] = mx.nd.zeros(shape, self.ctx) else: self.grad_params = None aux_names = self.symbol.list_auxiliary_states() self.aux_params = {k : nd.zeros(s) for k, s in zip(aux_names, aux_shapes)} data_name = train_data.data_name label_name = train_data.label_name input_names = [data_name, label_name] self.optimizer = opt.create(self.optimizer, rescale_grad=(1.0/train_data.get_batch_size()), **(self.kwargs)) self.updater = get_updater(self.optimizer) eval_metric = metric.create(eval_metric) # begin training for epoch in range(self.begin_epoch, self.num_epoch): nbatch = 0 train_data.reset() eval_metric.reset() for data in train_data: nbatch += 1 label_shape = data[label_name].shape self.arg_params[data_name] = mx.nd.array(data[data_name], self.ctx) self.arg_params[label_name] = mx.nd.array(data[label_name].reshape(label_shape[0], \ label_shape[1]*label_shape[2]), self.ctx) output_names = self.symbol.list_outputs() self.exector = self.symbol.bind(self.ctx, self.arg_params, args_grad=self.grad_params, grad_req=grad_req, aux_states=self.aux_params) assert len(self.symbol.list_arguments()) == len(self.exector.grad_arrays) update_dict = {name: nd for name, nd in zip(self.symbol.list_arguments(), \ self.exector.grad_arrays) if nd} output_dict = {} output_buff = {} for key, arr in zip(self.symbol.list_outputs(), self.exector.outputs): output_dict[key] = arr output_buff[key] = mx.nd.empty(arr.shape, ctx=mx.cpu()) self.exector.forward(is_train=True) for key in output_dict: output_dict[key].copyto(output_buff[key]) self.exector.backward() for key, arr in update_dict.items(): if key != "bigscore_weight": self.updater(key, arr, self.arg_params[key]) pred_shape = self.exector.outputs[0].shape label = mx.nd.array(data[label_name].reshape(label_shape[0], label_shape[1]*label_shape[2])) pred = mx.nd.array(output_buff["softmax_output"].asnumpy().reshape(pred_shape[0], \ pred_shape[1], pred_shape[2]*pred_shape[3])) eval_metric.update([label], [pred]) self.exector.outputs[0].wait_to_read() batch_end_params = BatchEndParam(epoch=epoch, nbatch=nbatch, eval_metric=eval_metric) batch_end_callback(batch_end_params) if epoch_end_callback is not None: epoch_end_callback(epoch, self.symbol, self.arg_params, self.aux_params) name, value = eval_metric.get() logger.info(" --->Epoch[%d] Train-%s=%f", epoch, name, value) # evaluation if eval_data: logger.info(" in eval process...") nbatch = 0 eval_data.reset() eval_metric.reset() for data in eval_data: nbatch += 1 label_shape = data[label_name].shape self.arg_params[data_name] = mx.nd.array(data[data_name], self.ctx) self.arg_params[label_name] = mx.nd.array(data[label_name].reshape(label_shape[0], \ label_shape[1]*label_shape[2]), self.ctx) exector = self.symbol.bind(self.ctx, self.arg_params, args_grad=self.grad_params, grad_req=grad_req, aux_states=self.aux_params) cpu_output_array = mx.nd.zeros(exector.outputs[0].shape) exector.forward(is_train=False) exector.outputs[0].copyto(cpu_output_array) pred_shape = cpu_output_array.shape 
label = mx.nd.array(data[label_name].reshape(label_shape[0], \ label_shape[1]*label_shape[2])) pred = mx.nd.array(cpu_output_array.asnumpy().reshape(pred_shape[0], \ pred_shape[1], pred_shape[2]*pred_shape[3])) eval_metric.update([label], [pred]) exector.outputs[0].wait_to_read() name, value = eval_metric.get() logger.info('batch[%d] Validation-%s=%f', nbatch, name, value)
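# Both FCN-style fit() methods above invoke epoch_end_callback(epoch, symbol,
# arg_params, aux_params). A sketch of a compatible checkpoint callback using
# mx.model.save_checkpoint (the prefix path is a placeholder):
import mxnet as mx

def make_checkpoint_callback(prefix):
    def _callback(epoch, symbol, arg_params, aux_params):
        # writes prefix-symbol.json and prefix-%04d.params for this epoch
        mx.model.save_checkpoint(prefix, epoch + 1, symbol, arg_params, aux_params)
    return _callback

# e.g. fit(..., epoch_end_callback=make_checkpoint_callback("./checkpoints/fcn"))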
def fit( self, train_data, eval_data=None, eval_metric="acc", epoch_end_callback=None, batch_end_callback=None, kvstore="local", optimizer="sgd", optimizer_params=(("learning_rate", 0.01),), eval_end_callback=None, eval_batch_end_callback=None, initializer=Uniform(0.01), arg_params=None, aux_params=None, allow_missing=False, force_rebind=False, force_init=False, begin_epoch=0, num_epoch=None, validation_metric=None, monitor=None, prefix=None, ): """Train the module parameters. Parameters ---------- train_data : DataIter eval_data : DataIter If not `None`, will be used as validation set and evaluate the performance after each epoch. eval_metric : str or EvalMetric Default `'acc'`. The performance measure used to display during training. epoch_end_callback : function or list of function Each callback will be called with the current `epoch`, `symbol`, `arg_params` and `aux_params`. batch_end_callback : function or list of function Each callback will be called with a `BatchEndParam`. kvstore : str or KVStore Default `'local'`. optimizer : str or Optimizer Default `'sgd'` optimizer_params : dict Default `(('learning_rate', 0.01),)`. The parameters for the optimizer constructor. The default value is not a `dict`, just to avoid pylint warning on dangerous default values. eval_end_callback : function or list of function These will be called at the end of each full evaluation, with the metrics over the entire evaluation set. eval_batch_end_callback : function or list of function These will be called at the end of each minibatch during evaluation initializer : Initializer Will be called to initialize the module parameters if not already initialized. arg_params : dict Default `None`, if not `None`, should be existing parameters from a trained model or loaded from a checkpoint (previously saved model). In this case, the value here will be used to initialize the module parameters, unless they are already initialized by the user via a call to `init_params` or `fit`. `arg_params` has higher priority to `initializer`. aux_params : dict Default `None`. Similar to `arg_params`, except for auxiliary states. allow_missing : bool Default `False`. Indicate whether we allow missing parameters when `arg_params` and `aux_params` are not `None`. If this is `True`, then the missing parameters will be initialized via the `initializer`. force_rebind : bool Default `False`. Whether to force rebinding the executors if already binded. force_init : bool Default `False`. Indicate whether we should force initialization even if the parameters are already initialized. begin_epoch : int Default `0`. Indicate the starting epoch. Usually, if we are resuming from a checkpoint saved at a previous training phase at epoch N, then we should specify this value as N+1. num_epoch : int Number of epochs to run training. 
Examples -------- An example of using fit for training:: >>> #Assume training dataIter and validation dataIter are ready >>> mod.fit(train_data=train_dataiter, eval_data=val_dataiter, optimizer_params={'learning_rate':0.01, 'momentum': 0.9}, num_epoch=10) """ assert num_epoch is not None, "please specify number of epochs" self.bind( data_shapes=train_data.provide_data, label_shapes=train_data.provide_label, for_training=True, force_rebind=force_rebind, ) if monitor is not None: self.install_monitor(monitor) self.init_params( initializer=initializer, arg_params=arg_params, aux_params=aux_params, allow_missing=allow_missing, force_init=force_init, ) self.init_optimizer( kvstore=kvstore, optimizer=optimizer, optimizer_params=optimizer_params ) if validation_metric is None: validation_metric = eval_metric if not isinstance(eval_metric, metric.EvalMetric): eval_metric = metric.create(eval_metric) ################################################################################ # training loop ################################################################################ # epoch 0 if epoch_end_callback is not None: arg_params, aux_params = self.get_params() self.set_params(arg_params, aux_params) for callback in _as_list(epoch_end_callback): callback(-1, self.symbol, arg_params, aux_params) from lib.pair_matching.batch_updater_py_multi import batchUpdaterPyMulti config = self.config if config.TRAIN.TENSORBOARD_LOG: from mxboard import SummaryWriter tf_log_dir = os.path.join( os.path.dirname(prefix), "logs/{}".format(time.strftime("%Y-%m-%d-%H-%M")), ) summ_writer = SummaryWriter(logdir=tf_log_dir) interBatchUpdater = batchUpdaterPyMulti(config, 480, 640) last_lr = 0 cur_step = 0 for epoch in range(begin_epoch, num_epoch): tic = time.time() eval_metric.reset() for nbatch, data_batch in enumerate(train_data): if monitor is not None: monitor.tic() # disp weights L2 norm cur_lr = self._curr_module._optimizer._get_lr(0) if nbatch % (4000 / train_data.batch_size) == 0: all_params = self._curr_module.get_params()[0] all_param_names = all_params.keys() all_param_names = sorted(all_param_names) print_and_log(prefix, self.logger) weight_str = "" for view_name in all_param_names: weight_str += "{}: {} ".format( view_name, nd.norm(all_params[view_name]).asnumpy() ) print_and_log(weight_str, self.logger) print_and_log( "batch {}: lr: {}".format(nbatch, cur_lr), self.logger ) if config.TRAIN.TENSORBOARD_LOG: summ_writer.add_scalar( tag="learning_rate", value=cur_lr, global_step=cur_step ) if cur_lr != last_lr: print_and_log( "batch {}: lr: {}".format(nbatch, cur_lr), self.logger ) last_lr = cur_lr if config.TRAIN.TENSORBOARD_LOG: summ_writer.add_scalar( tag="learning_rate", value=cur_lr, global_step=cur_step ) train_iter_size = config.network.TRAIN_ITER_SIZE for iter_idx in range(train_iter_size): self.forward_backward(data_batch) preds = self._curr_module.get_outputs(False) self.update() if iter_idx != train_iter_size - 1: data_batch = interBatchUpdater.forward( data_batch, preds, config ) cur_step += 1 self.update_metric(eval_metric, data_batch.label) if monitor is not None: monitor.toc_print() if batch_end_callback is not None: batch_end_params = BatchEndParam( epoch=epoch, nbatch=nbatch, eval_metric=eval_metric, locals=locals(), ) for callback in _as_list(batch_end_callback): callback(batch_end_params) if config.TRAIN.TENSORBOARD_LOG: for name, val in eval_metric.get_name_value(): summ_writer.add_scalar( tag="BatchTrain-{}".format(name), value=val, global_step=cur_step, ) # one epoch of training is 
finished for name, val in eval_metric.get_name_value(): self.logger.info("Epoch[%d] Train-%s=%f", epoch, name, val) if config.TRAIN.TENSORBOARD_LOG: summ_writer.add_scalar( tag="EpochTrain-{}".format(name), value=val, global_step=epoch ) toc = time.time() self.logger.info("Epoch[%d] Time cost=%.3f", epoch, (toc - tic)) # sync aux params across devices arg_params, aux_params = self.get_params() self.set_params(arg_params, aux_params) if epoch_end_callback is not None: for callback in _as_list(epoch_end_callback): callback(epoch, self.symbol, arg_params, aux_params) # ---------------------------------------- # evaluation on validation set if eval_data: res = self.score( eval_data, validation_metric, score_end_callback=eval_end_callback, batch_end_callback=eval_batch_end_callback, epoch=epoch, ) # TODO: pull this into default for name, val in res: self.logger.info("Epoch[%d] Validation-%s=%f", epoch, name, val) # end of 1 epoch, reset the data-iter for another epoch train_data.reset()
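# The pair-matching fit() above logs learning rate and metric values to
# TensorBoard via mxboard. A reduced sketch of that logging (log directory and
# values are placeholders); it requires the separate `mxboard` package:
from mxboard import SummaryWriter

summ_writer = SummaryWriter(logdir="./logs/example-run")
for step, lr in enumerate([0.01, 0.009, 0.008]):
    summ_writer.add_scalar(tag="learning_rate", value=lr, global_step=step)
summ_writer.close()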
def fit(self, train_data, eval_data=None, eval_metric='acc', epoch_end_callback=None, batch_end_callback=None, kvstore='local', optimizer='sgd', optimizer_params=(('learning_rate', 0.01),), eval_end_callback=None, eval_batch_end_callback=None, initializer=Uniform(0.01), arg_params=None, aux_params=None, allow_missing=False, force_rebind=False, force_init=False, begin_epoch=0, num_epoch=None, validation_metric=None, monitor=None, prefix=None, state=None): """Train the module parameters. Parameters ---------- train_data : DataIter eval_data : DataIter If not `None`, will be used as validation set and evaluate the performance after each epoch. eval_metric : str or EvalMetric Default `'acc'`. The performance measure used to display during training. epoch_end_callback : function or list of function Each callback will be called with the current `epoch`, `symbol`, `arg_params` and `aux_params`. batch_end_callback : function or list of function Each callback will be called with a `BatchEndParam`. kvstore : str or KVStore Default `'local'`. optimizer : str or Optimizer Default `'sgd'` optimizer_params : dict Default `(('learning_rate', 0.01),)`. The parameters for the optimizer constructor. The default value is not a `dict`, just to avoid pylint warning on dangerous default values. eval_end_callback : function or list of function These will be called at the end of each full evaluation, with the metrics over the entire evaluation set. eval_batch_end_callback : function or list of function These will be called at the end of each minibatch during evaluation initializer : Initializer Will be called to initialize the module parameters if not already initialized. arg_params : dict Default `None`, if not `None`, should be existing parameters from a trained model or loaded from a checkpoint (previously saved model). In this case, the value here will be used to initialize the module parameters, unless they are already initialized by the user via a call to `init_params` or `fit`. `arg_params` has higher priority to `initializer`. aux_params : dict Default `None`. Similar to `arg_params`, except for auxiliary states. allow_missing : bool Default `False`. Indicate whether we allow missing parameters when `arg_params` and `aux_params` are not `None`. If this is `True`, then the missing parameters will be initialized via the `initializer`. force_rebind : bool Default `False`. Whether to force rebinding the executors if already binded. force_init : bool Default `False`. Indicate whether we should force initialization even if the parameters are already initialized. begin_epoch : int Default `0`. Indicate the starting epoch. Usually, if we are resuming from a checkpoint saved at a previous training phase at epoch N, then we should specify this value as N+1. num_epoch : int Number of epochs to run training. 
Examples -------- An example of using fit for training:: >>> #Assume training dataIter and validation dataIter are ready >>> mod.fit(train_data=train_dataiter, eval_data=val_dataiter, optimizer_params={'learning_rate':0.01, 'momentum': 0.9}, num_epoch=10) """ assert num_epoch is not None, 'please specify number of epochs' self.bind(data_shapes=train_data.provide_data, label_shapes=train_data.provide_label, for_training=True, force_rebind=force_rebind) if monitor is not None: self.install_monitor(monitor) self.init_params(initializer=initializer, arg_params=arg_params, aux_params=aux_params, allow_missing=allow_missing, force_init=force_init) self.init_optimizer(kvstore=kvstore, optimizer=optimizer, optimizer_params=optimizer_params) if state is not None: self._curr_module.load_optimizer_states(state) if validation_metric is None: validation_metric = eval_metric if not isinstance(eval_metric, metric.EvalMetric): eval_metric = metric.create(eval_metric) ################################################################################ # training loop ################################################################################ for epoch in range(begin_epoch, num_epoch): tic = time.time() eval_metric.reset() for nbatch, data_batch in enumerate(train_data): if monitor is not None: monitor.tic() self.forward_backward(data_batch) self.update() self.update_metric(eval_metric, data_batch.label) if monitor is not None: monitor.toc_print() if batch_end_callback is not None: batch_end_params = BatchEndParam(epoch=epoch, nbatch=nbatch, eval_metric=eval_metric, locals=locals()) for callback in _as_list(batch_end_callback): callback(batch_end_params) # one epoch of training is finished for name, val in eval_metric.get_name_value(): self.logger.info('Epoch[%d] Train-%s=%f', epoch, name, val) toc = time.time() self.logger.info('Epoch[%d] Time cost=%.3f', epoch, (toc-tic)) # sync aux params across devices arg_params, aux_params = self.get_params() self.set_params(arg_params, aux_params) if epoch_end_callback is not None: for callback in _as_list(epoch_end_callback): callback(epoch, self.symbol, arg_params, aux_params) #---------------------------------------- # evaluation on validation set if eval_data: res = self.score(eval_data, validation_metric, score_end_callback=eval_end_callback, batch_end_callback=eval_batch_end_callback, epoch=epoch) #TODO: pull this into default for name, val in res: self.logger.info('Epoch[%d] Validation-%s=%f', epoch, name, val) # end of 1 epoch, reset the data-iter for another epoch train_data.reset()
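# fit() above restores optimizer state via load_optimizer_states(state). A sketch
# of saving and restoring both the weights and the optimizer state so training
# can resume where it stopped; it assumes `mod` is an already bound Module with
# an initialized optimizer, and the file names are placeholders.
import mxnet as mx

def save_training_state(mod, prefix, epoch):
    # weights + symbol
    mod.save_checkpoint(prefix, epoch, save_optimizer_states=False)
    # momentum buffers and other optimizer state
    mod.save_optimizer_states("%s-%04d.states" % (prefix, epoch))

def resume_training_state(mod, prefix, epoch):
    # mod must be bound and its optimizer initialized before calling this
    sym, arg_params, aux_params = mx.model.load_checkpoint(prefix, epoch)
    mod.set_params(arg_params, aux_params)
    mod.load_optimizer_states("%s-%04d.states" % (prefix, epoch))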
path_imgrec = "./segmentation_data/%s_person_datasets/%s_%s_%d,%d.rec" % (
    data_type[0], data_type[0], data_type[1], resize_size[0], resize_size[1])
logging.info('dataset: %s' % path_imgrec)
idx2imgname = {}
with open(path_imglst, 'r') as fin:
    idx2imgname = {
        int(line.split('\t')[0]): line.split('\t')[2].strip()
        for line in fin.readlines()
    }
logging.info('number of images: %d' % len(idx2imgname))
# endregion

eval_metric = 'meanIOU'
if eval_metric == 'acc':
    eval_metric = metric.create(eval_metric)
elif eval_metric == 'meanIOU':
    # eval_metric = MeanIoU(c=15, threshold=args.threshold, num_class=21)
    eval_metric = MeanIoU(c=1, threshold=args.threshold, num_class=2)

visual = args.visual
if visual:
    result_path = './result/%s_person_datasets/result_%s_epoch%d_%s' % (
        data_type[0], pre_train_model_type, epoch, data_type[1])
    if not os.path.exists(result_path):
        os.makedirs(result_path)
    logging.info('results saved to: %s' % result_path)

# generate one random color for the mask overlay
rgb = (np.random.random((1, 3)) * 0.6 + 0.4).tolist()[0]
# print rgb
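# The evaluation snippet above uses a project-specific MeanIoU metric for binary
# person segmentation. A plain-NumPy sketch of the underlying per-image IoU for
# a single foreground class (thresholding a probability map; the usage line is
# only illustrative and its names are placeholders):
import numpy as np

def binary_iou(prob_map, gt_mask, threshold=0.5):
    pred_mask = prob_map >= threshold
    gt_mask = gt_mask.astype(bool)
    intersection = np.logical_and(pred_mask, gt_mask).sum()
    union = np.logical_or(pred_mask, gt_mask).sum()
    return intersection / float(union) if union else 1.0  # both empty counts as a perfect match

# e.g. binary_iou(softmax_output[0, 1], label_mask, threshold=args.threshold)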
def fit(self, X, marks, e_marks=None, y=None, eval_data=None, eval_metric='acc', epoch_end_callback=None, batch_end_callback=None, time_step_callback=None, kvstore='local', logger=None, work_load_list=None, monitor=None, eval_batch_end_callback=None): """Overwrite""" data = self._init_iter(X, y, is_train=True) eval_data = self._init_eval_iter(eval_data) if self.sym_gen: self.symbol = self.sym_gen(data.default_bucket_key) # pylint: disable=no-member self._check_arguments() self.kwargs["sym"] = self.symbol param_dict = dict(data.provide_data + data.provide_label) arg_names, param_names, aux_names = self._init_params(param_dict) # setup metric if not isinstance(eval_metric, metric.EvalMetric): eval_metric = metric.create(eval_metric) # create kvstore (kvstore, update_on_kvstore) = _create_kvstore(kvstore, len(self.ctx), self.arg_params) param_idx2name = {} if update_on_kvstore: param_idx2name.update(enumerate(param_names)) else: for i, n in enumerate(param_names): for k in range(len(self.ctx)): param_idx2name[i * len(self.ctx) + k] = n self.kwargs["param_idx2name"] = param_idx2name # init optmizer if isinstance(self.optimizer, str): batch_size = data.batch_size if kvstore and kvstore.type == 'dist_sync': batch_size *= kvstore.num_workers optimizer = opt.create(self.optimizer, rescale_grad=(1.0 / batch_size), **(self.kwargs)) elif isinstance(self.optimizer, opt.Optimizer): optimizer = self.optimizer # do training _train_rnn(self.symbol, self.ctx, marks, arg_names, param_names, aux_names, self.arg_params, self.aux_params, begin_epoch=self.begin_epoch, end_epoch=self.num_epoch, epoch_size=self.epoch_size, optimizer=optimizer, train_data=data, eval_data=eval_data, eval_metric=eval_metric, epoch_end_callback=epoch_end_callback, batch_end_callback=batch_end_callback, time_step_callback=time_step_callback, kvstore=kvstore, update_on_kvstore=update_on_kvstore, logger=logger, work_load_list=work_load_list, monitor=monitor, eval_batch_end_callback=eval_batch_end_callback, sym_gen=self.sym_gen, e_marks=e_marks)
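# The RNN fit() wrapper above scales rescale_grad by the number of workers when
# the kvstore type is 'dist_sync'. A standalone sketch of that adjustment (batch
# size and kvstore type are placeholders):
import mxnet as mx

batch_size = 64
kv = mx.kvstore.create("local")           # would be 'dist_sync' in a distributed job
effective_batch = batch_size
if kv.type == "dist_sync":
    effective_batch *= kv.num_workers     # gradients are aggregated over all workers

optimizer = mx.optimizer.create("sgd", learning_rate=0.01,
                                rescale_grad=1.0 / effective_batch)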
def fit(self, train_data, eval_data=None, eval_metric='acc',
        epoch_end_callback=None, batch_end_callback=None, kvstore='local',
        optimizer='sgd', optimizer_params=(('learning_rate', 0.01),),
        eval_end_callback=None, eval_batch_end_callback=None,
        initializer=Uniform(0.01), arg_params=None, aux_params=None,
        allow_missing=False, force_rebind=False, force_init=False,
        begin_epoch=0, num_epoch=None, best_model_callbacks=None,
        eval_interval=None, validation_metric=None, monitor=None):
    assert num_epoch is not None, 'please specify number of epochs'

    self.bind(data_shapes=train_data.provide_data, label_shapes=train_data.provide_label,
              for_training=True, force_rebind=force_rebind)
    if monitor is not None:
        self.install_monitor(monitor)
    self.init_params(initializer=initializer, arg_params=arg_params, aux_params=aux_params,
                     allow_missing=allow_missing, force_init=force_init)
    self.init_optimizer(kvstore=kvstore, optimizer=optimizer, optimizer_params=optimizer_params)

    if not isinstance(eval_metric, metric.EvalMetric):
        eval_metric = metric.create(eval_metric)
    if validation_metric is None:
        validation_metric = copy.deepcopy(eval_metric)
    epoch_metric = copy.deepcopy(eval_metric)

    swa_arg_params = None
    swa_aux_params = None
    swa_cnt = 0

    ################################################################################
    # training loop
    ################################################################################
    for epoch in range(begin_epoch, num_epoch):
        tic_epoch = time.time()
        eval_metric.reset()

        nbatch = 0
        end_of_batch = False
        data_iter = iter(train_data)
        next_data_batch = next(data_iter)
        name_values = []

        while not end_of_batch:
            data_batch = next_data_batch
            if monitor is not None:
                monitor.tic()

            self.forward_backward(data_batch)
            self.update()

            try:
                # pre-fetch next batch
                next_data_batch = next(data_iter)
                self.prepare(next_data_batch)
            except StopIteration:
                end_of_batch = True

            self.update_metric(eval_metric, data_batch.label)
            if end_of_batch:
                name_values = eval_metric.get_name_value()

            if monitor is not None:
                monitor.toc_print()

            nbatch += 1

            if batch_end_callback is not None:
                batch_end_params = BatchEndParam(epoch=epoch, nbatch=nbatch,
                                                 eval_metric=eval_metric, locals=locals())
                for callback in _as_list(batch_end_callback):
                    callback(batch_end_params)
                eval_metric.reset()

            # ----------------------------------------
            # evaluation on validation set
            to_go = eval_interval is not None and nbatch % eval_interval == 0
            if to_go and eval_data:
                res = self.score(eval_data, validation_metric,
                                 score_end_callback=eval_end_callback,
                                 batch_end_callback=eval_batch_end_callback, epoch=epoch)
                for name, val in res:
                    self.logger.info('Epoch[%d] Batch[%d] Validation-%s=%f',
                                     epoch, nbatch, name, val)

                if best_model_callbacks is not None:
                    for callback in _as_list(best_model_callbacks):
                        if callback.is_best(validation_metric):
                            # sync aux params across devices
                            arg_params, aux_params = self.get_params()
                            sync_made = True
                            callback.checkpoint_if_only_best(validation_metric, self.symbol,
                                                             arg_params, aux_params)
                            break

        # one epoch of training is finished
        for name, val in name_values:
            self.logger.info('Epoch[%d] Train-%s=%f', epoch + 1, name, val)

        toc_epoch = time.time()
        elapsed = (toc_epoch - tic_epoch)
        avg_speed = float(len(train_data)) / (toc_epoch - tic_epoch)
        self.logger.info('Epoch[%d] Time cost=%.3f', epoch + 1, elapsed)
        self.logger.info('Epoch[%d] Average speed=%.3f samples/sec', epoch + 1, avg_speed)

        # sync aux params across devices
        arg_params, aux_params = self.get_params()
        self.set_params(arg_params, aux_params)

        if epoch_end_callback is not None:
            for callback in _as_list(epoch_end_callback):
                callback(epoch, self.symbol, arg_params, aux_params)

        # evaluation on validation set
        if eval_data:
            res = self.score(eval_data, validation_metric,
                             score_end_callback=eval_end_callback,
                             batch_end_callback=eval_batch_end_callback, epoch=epoch + 1)
            for name, val in res:
                self.logger.info('Epoch[%d] Validation-%s=%f', epoch + 1, name, val)

            if best_model_callbacks is not None:
                for callback in _as_list(best_model_callbacks):
                    callback.checkpoint_if_only_best(validation_metric, self.symbol,
                                                     arg_params, aux_params)

        # end of epoch, reset the data-iter for another epoch
        train_data.reset()
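# NOTE: the fit() above expects best_model_callbacks objects exposing is_best() and
# checkpoint_if_only_best(); neither is defined in this document. The class below is a
# hypothetical sketch of such a callback, assuming the first value reported by the
# validation metric is the quantity to track and that larger is better.
import mxnet as mx


class BestModelCheckpoint(object):
    """Hypothetical best-model callback matching the interface used in fit() above."""

    def __init__(self, prefix, larger_is_better=True):
        self.prefix = prefix
        self.larger_is_better = larger_is_better
        self.best = None

    def is_best(self, validation_metric):
        # track the first (name, value) pair of the validation metric (assumption)
        _, value = validation_metric.get_name_value()[0]
        if self.best is None:
            return True
        return value > self.best if self.larger_is_better else value < self.best

    def checkpoint_if_only_best(self, validation_metric, symbol, arg_params, aux_params):
        _, value = validation_metric.get_name_value()[0]
        if self.is_best(validation_metric):
            self.best = value
            # save symbol + parameters in the usual MXNet checkpoint layout
            mx.model.save_checkpoint(self.prefix, 0, symbol, arg_params, aux_params)

# Usage would then be something like:
#     mod.fit(train_iter, eval_data=val_iter, eval_interval=500,
#             best_model_callbacks=[BestModelCheckpoint('./model/best')], num_epoch=10)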
def fit(self, train_data, eval_data=None, eval_metric='acc',
        epoch_end_callback=None, batch_end_callback=None, kvstore='local',
        optimizer='sgd', optimizer_params=(('learning_rate', 0.01),),
        eval_end_callback=None, eval_batch_end_callback=None,
        initializer=Uniform(0.01), arg_params=None, aux_params=None,
        allow_missing=False, force_rebind=False, force_init=False,
        begin_epoch=0, num_epoch=None, validation_metric=None, monitor=None,
        prefix=None, batches_checkpoint=None, num_batches_save_ckpt=2000):
    """Train the module parameters.

    Parameters
    ----------
    train_data : DataIter
    eval_data : DataIter
        If not `None`, will be used as a validation set and the performance
        will be evaluated after each epoch.
    eval_metric : str or EvalMetric
        Default `'acc'`. The performance measure used to display during training.
    epoch_end_callback : function or list of functions
        Each callback will be called with the current `epoch`, `symbol`,
        `arg_params` and `aux_params`.
    batch_end_callback : function or list of functions
        Each callback will be called with a `BatchEndParam`.
    kvstore : str or KVStore
        Default `'local'`.
    optimizer : str or Optimizer
        Default `'sgd'`.
    optimizer_params : dict
        Default `(('learning_rate', 0.01),)`. The parameters for the optimizer
        constructor. The default value is not a `dict`, just to avoid a pylint
        warning on dangerous default values.
    eval_end_callback : function or list of functions
        These will be called at the end of each full evaluation, with the
        metrics over the entire evaluation set.
    eval_batch_end_callback : function or list of functions
        These will be called at the end of each mini-batch during evaluation.
    initializer : Initializer
        Will be called to initialize the module parameters if they are not
        already initialized.
    arg_params : dict
        Default `None`. If not `None`, should be existing parameters from a
        trained model or loaded from a checkpoint (previously saved model). In
        this case, the value here will be used to initialize the module
        parameters, unless they are already initialized by the user via a call
        to `init_params` or `fit`. `arg_params` has higher priority than
        `initializer`.
    aux_params : dict
        Default `None`. Similar to `arg_params`, except for auxiliary states.
    allow_missing : bool
        Default `False`. Indicates whether missing parameters are allowed when
        `arg_params` and `aux_params` are not `None`. If `True`, the missing
        parameters will be initialized via the `initializer`.
    force_rebind : bool
        Default `False`. Whether to force rebinding the executors if already bound.
    force_init : bool
        Default `False`. Indicates whether to force initialization even if the
        parameters are already initialized.
    begin_epoch : int
        Default `0`. Indicates the starting epoch. Usually, if resuming from a
        checkpoint saved at a previous training phase at epoch N, this value
        should be N+1.
    num_epoch : int
        Number of epochs to run training.

    Examples
    --------
    An example of using fit for training::

        >>> # Assume training dataIter and validation dataIter are ready
        >>> mod.fit(train_data=train_dataiter, eval_data=val_dataiter,
        ...         optimizer_params={'learning_rate': 0.01, 'momentum': 0.9},
        ...         num_epoch=10)
    """
    assert num_epoch is not None, 'please specify number of epochs'

    self.bind(data_shapes=train_data.provide_data, label_shapes=train_data.provide_label,
              for_training=True, force_rebind=force_rebind)
    if monitor is not None:
        self.install_monitor(monitor)
    self.init_params(initializer=initializer, arg_params=arg_params, aux_params=aux_params,
                     allow_missing=allow_missing, force_init=force_init)
    self.init_optimizer(kvstore=kvstore, optimizer=optimizer, optimizer_params=optimizer_params)

    if validation_metric is None:
        validation_metric = eval_metric
    if not isinstance(eval_metric, metric.EvalMetric):
        eval_metric = metric.create(eval_metric)

    ################################################################################
    # training loop
    ################################################################################
    for epoch in range(begin_epoch, num_epoch):
        tic = time.time()
        eval_metric.reset()
        for nbatch, data_batch in enumerate(train_data):
            if monitor is not None:
                monitor.tic()
            self.forward_backward(data_batch)
            self.update()
            self.update_metric(eval_metric, data_batch.label)

            if monitor is not None:
                monitor.toc_print()

            if batch_end_callback is not None:
                batch_end_params = BatchEndParam(epoch=epoch, nbatch=nbatch,
                                                 eval_metric=eval_metric, locals=locals())
                for callback in _as_list(batch_end_callback):
                    callback(batch_end_params)

            if batches_checkpoint is not None and nbatch != 0 and nbatch % num_batches_save_ckpt == 0:
                # mid-epoch checkpointing reuses the epoch-end callbacks;
                # arg_params/aux_params here are the values from the last sync
                for callback in _as_list(epoch_end_callback):
                    callback(epoch, self.symbol, arg_params, aux_params)

        # one epoch of training is finished
        for name, val in eval_metric.get_name_value():
            self.logger.info('Epoch[%d] Train-%s=%f', epoch, name, val)
        toc = time.time()
        self.logger.info('Epoch[%d] Time cost=%.3f', epoch, (toc - tic))

        # sync aux params across devices
        arg_params, aux_params = self.get_params()
        self.set_params(arg_params, aux_params)

        if epoch_end_callback is not None:
            for callback in _as_list(epoch_end_callback):
                callback(epoch, self.symbol, arg_params, aux_params)

        if prefix is not None:
            self._curr_module.save_checkpoint(prefix, epoch + 1, save_optimizer_states=True)

        # ----------------------------------------
        # evaluation on validation set
        if eval_data:
            res = self.score(eval_data, validation_metric,
                             score_end_callback=eval_end_callback,
                             batch_end_callback=eval_batch_end_callback, epoch=epoch)
            # TODO: pull this into default
            for name, val in res:
                self.logger.info('Epoch[%d] Validation-%s=%f', epoch, name, val)

        # end of 1 epoch, reset the data-iter for another epoch
        train_data.reset()
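# NOTE: hypothetical usage of the fit() above; `mod`, `train_iter` and `val_iter` are assumed
# to exist, and mx.callback.do_checkpoint is used as the epoch-end callback. Because the
# mid-epoch branch only checks `batches_checkpoint is not None`, any non-None value enables
# it; whether that matches the author's intent is an assumption.
import mxnet as mx

mod.fit(train_data=train_iter,
        eval_data=val_iter,
        optimizer='sgd',
        optimizer_params={'learning_rate': 0.01, 'momentum': 0.9},
        epoch_end_callback=mx.callback.do_checkpoint('./ckpt/model'),
        prefix='./ckpt/model',        # also saves a full checkpoint (incl. optimizer state) each epoch
        batches_checkpoint=True,
        num_batches_save_ckpt=2000,   # run the epoch-end callbacks every 2000 batches
        num_epoch=10)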
def fit(self, train_data, eval_data=None, eval_metric='acc',
        epoch_end_callback=None, batch_end_callback=None, kvstore='local',
        optimizer='sgd', optimizer_params=(('learning_rate', 0.01),),
        eval_end_callback=None, iter_size=1, eval_batch_end_callback=None,
        initializer=Uniform(0.01), arg_params=None, aux_params=None,
        allow_missing=False, force_rebind=False, force_init=False,
        begin_epoch=0, num_epoch=None, validation_metric=None, monitor=None):
    """Ke's revision: add `iter_size`. Trains the module parameters.

    Check out `Module Tutorial <http://mxnet.io/tutorials/basic/module.html>`_
    to see an end-to-end use-case.

    Parameters
    ----------
    train_data : DataIter
        Train DataIter.
    eval_data : DataIter
        If not ``None``, will be used as a validation set and the performance
        after each epoch will be evaluated.
    eval_metric : str or EvalMetric
        Defaults to 'accuracy'. The performance measure used to display during
        training. Other possible predefined metrics are: 'ce' (CrossEntropy),
        'f1', 'mae', 'mse', 'rmse', 'top_k_accuracy'.
    epoch_end_callback : function or list of functions
        Each callback will be called with the current `epoch`, `symbol`,
        `arg_params` and `aux_params`.
    batch_end_callback : function or list of functions
        Each callback will be called with a `BatchEndParam`.
    kvstore : str or KVStore
        Defaults to 'local'.
    optimizer : str or Optimizer
        Defaults to 'sgd'.
    optimizer_params : dict
        Defaults to ``(('learning_rate', 0.01),)``. The parameters for the
        optimizer constructor. The default value is not a dict, just to avoid a
        pylint warning on dangerous default values.
    eval_end_callback : function or list of functions
        These will be called at the end of each full evaluation, with the
        metrics over the entire evaluation set.
    eval_batch_end_callback : function or list of functions
        These will be called at the end of each mini-batch during evaluation.
    initializer : Initializer
        The initializer is called to initialize the module parameters when they
        are not already initialized.
    arg_params : dict
        Defaults to ``None``. If not ``None``, should be existing parameters
        from a trained model or loaded from a checkpoint (previously saved
        model). In this case, the value here will be used to initialize the
        module parameters, unless they are already initialized by the user via
        a call to `init_params` or `fit`. `arg_params` has a higher priority
        than `initializer`.
    aux_params : dict
        Defaults to ``None``. Similar to `arg_params`, except for auxiliary states.
    allow_missing : bool
        Defaults to ``False``. Indicates whether to allow missing parameters
        when `arg_params` and `aux_params` are not ``None``. If this is
        ``True``, the missing parameters will be initialized via the
        `initializer`.
    force_rebind : bool
        Defaults to ``False``. Whether to force rebinding the executors if
        already bound.
    force_init : bool
        Defaults to ``False``. Indicates whether to force initialization even
        if the parameters are already initialized.
    begin_epoch : int
        Defaults to 0. Indicates the starting epoch. Usually, if resumed from a
        checkpoint saved at a previous training phase at epoch N, then this
        value should be N+1.
    num_epoch : int
        Number of epochs for training.

    Examples
    --------
    >>> # An example of using fit for training.
    >>> # Assume training dataIter and validation dataIter are ready
    >>> # Assume loading a previously checkpointed model
    >>> sym, arg_params, aux_params = mx.model.load_checkpoint(model_prefix, 3)
    >>> mod.fit(train_data=train_dataiter, eval_data=val_dataiter, optimizer='sgd',
    ...         optimizer_params={'learning_rate':0.01, 'momentum': 0.9},
    ...         arg_params=arg_params, aux_params=aux_params,
    ...         eval_metric='acc', num_epoch=10, begin_epoch=3)
    """
    assert num_epoch is not None, 'please specify number of epochs'

    self.bind(data_shapes=train_data.provide_data, label_shapes=train_data.provide_label,
              for_training=True, force_rebind=force_rebind, grad_req='add')
    if monitor is not None:
        self.install_monitor(monitor)
    self.init_params(initializer=initializer, arg_params=arg_params, aux_params=aux_params,
                     allow_missing=allow_missing, force_init=force_init)
    self.init_optimizer(kvstore=kvstore, optimizer=optimizer, optimizer_params=optimizer_params)

    if validation_metric is None:
        validation_metric = eval_metric
    if not isinstance(eval_metric, metric.EvalMetric):
        eval_metric = metric.create(eval_metric)

    ################################################################################
    # training loop
    ################################################################################
    for epoch in range(begin_epoch, num_epoch):
        tic = time.time()
        eval_metric.reset()
        nbatch = 0
        data_iter = iter(train_data)
        end_of_batch = False
        next_data_batch = next(data_iter)
        while not end_of_batch:
            data_batch = next_data_batch
            if monitor is not None:
                monitor.tic()
            # accumulate gradients instead of calling self.forward_backward(data_batch)
            self.forward(data_batch, is_train=True, grad_req='add')
            self.backward()
            if nbatch % iter_size == 0:
                # update every iter_size batches, then clear the accumulated gradients
                self.update()
                for g in self._curr_module._exec_group.grad_arrays:
                    for g1 in g:
                        if g1 is not None:
                            g1[:] = 0.
            try:
                # pre-fetch next batch
                next_data_batch = next(data_iter)
                self.prepare(next_data_batch)
            except StopIteration:
                end_of_batch = True

            self.update_metric(eval_metric, data_batch.label)

            if monitor is not None:
                monitor.toc_print()
            if batch_end_callback is not None:
                batch_end_params = BatchEndParam(epoch=epoch, nbatch=nbatch,
                                                 eval_metric=eval_metric, locals=locals())
                for callback in _as_list(batch_end_callback):
                    callback(batch_end_params)
            nbatch += 1

        # one epoch of training is finished
        for name, val in eval_metric.get_name_value():
            self.logger.info('Epoch[%d] Train-%s=%f', epoch, name, val)
        toc = time.time()
        self.logger.info('Epoch[%d] Time cost=%.3f', epoch, (toc - tic))

        # sync aux params across devices
        arg_params, aux_params = self.get_params()
        self.set_params(arg_params, aux_params)

        if epoch_end_callback is not None:
            for callback in _as_list(epoch_end_callback):
                callback(epoch, self.symbol, arg_params, aux_params)

        # ----------------------------------------
        # evaluation on validation set
        if eval_data:
            res = self.score(eval_data, validation_metric,
                             score_end_callback=eval_end_callback,
                             batch_end_callback=eval_batch_end_callback, epoch=epoch)
            # TODO: pull this into default
            for name, val in res:
                self.logger.info('Epoch[%d] Validation-%s=%f', epoch, name, val)

        # end of 1 epoch, reset the data-iter for another epoch
        train_data.reset()
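# NOTE: the iter_size logic above is plain gradient accumulation: gradients are summed over
# iter_size forward/backward passes before a single optimizer step. The snippet below is a
# small, self-contained illustration of the same idea with the Gluon API; the toy network and
# random data are illustrative only and not part of the code above.
import mxnet as mx
from mxnet import autograd, gluon, nd

net = gluon.nn.Dense(1)
net.initialize()
for param in net.collect_params().values():
    param.grad_req = 'add'           # accumulate gradients across backward() calls

trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': 0.01})
loss_fn = gluon.loss.L2Loss()
iter_size = 4                        # accumulate over 4 mini-batches of 8 samples each

data = nd.random.uniform(shape=(32, 8))
label = nd.random.uniform(shape=(32, 1))

for i in range(iter_size):
    x, y = data[i * 8:(i + 1) * 8], label[i * 8:(i + 1) * 8]
    with autograd.record():
        loss = loss_fn(net(x), y)
    loss.backward()                  # adds into the existing gradient buffers

trainer.step(batch_size=32)          # one update for the whole accumulated gradient
for param in net.collect_params().values():
    for grad in param.list_grad():
        grad[:] = 0                  # clear accumulated gradients, like g1[:] = 0. above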