def init_params(self, initializer=Uniform(0.01), arg_params=None, aux_params=None,
                allow_missing=False, force_init=False, allow_extra=False):
    """Initialize the backbone module and every per-device arcface FC module.

    NOTE(review): the caller-supplied ``arg_params``/``aux_params`` are NOT
    forwarded to any sub-module — every module is initialized from scratch
    (``arg_params=None``). Confirm this is intended before restoring from a
    checkpoint through this method.

    Parameters
    ----------
    initializer : Initializer
        Used for the backbone module only; the arcface FC modules always use
        a fixed ``mx.init.Normal(0.01)`` (see below).
    allow_missing, force_init, allow_extra : bool
        Passed through to each sub-module's ``init_params``.
    """
    if self.params_initialized and not force_init:
        return
    assert self.binded, 'call bind before initializing the parameters'
    #TODO init the same weights with all work nodes
    self._curr_module.init_params(initializer=initializer, arg_params=None,
                                  aux_params=None, allow_missing=allow_missing,
                                  force_init=force_init, allow_extra=allow_extra)
    for _module in self._arcface_modules:
        #_initializer = initializer
        # The classification FC weights deliberately ignore the caller's
        # initializer and always start from Normal(0.01).
        _initializer = mx.init.Normal(0.01)
        _module.init_params(initializer=_initializer, arg_params=None,
                            aux_params=None, allow_missing=allow_missing,
                            force_init=force_init, allow_extra=allow_extra)
    self.params_initialized = True
def init_params(self, initializer=Uniform(0.01), arg_params=None, aux_params=None,
                allow_missing=False, force_init=False):
    """Initialize the parameters and auxiliary states.

    Parameters
    ----------
    initializer : Initializer
        Called to initialize parameters if needed.
    arg_params : dict
        If not None, should be a dictionary of existing arg_params.
        Initialization will be copied from that.
    aux_params : dict
        If not None, should be a dictionary of existing aux_params.
        Initialization will be copied from that.
    allow_missing : bool
        If true, params could contain missing values, and the initializer will
        be called to fill those missing params.
    force_init : bool
        If true, will force re-initialize even if already initialized.
    """
    if self.params_initialized and not force_init:
        warnings.warn("Parameters already initialized and force_init=False. "
                      "init_params call ignored.", stacklevel=2)
        return
    assert self.binded, 'call bind before initializing the parameters'

    def _impl(name, arr, cache):
        """Internal helper for parameter initialization."""
        if cache is not None:
            if name in cache:
                cache_arr = cache[name]
                # just in case the cached array is just the target itself
                if cache_arr is not arr:
                    cache_arr.copyto(arr)
            else:
                if not allow_missing:
                    raise RuntimeError("%s is not presented" % name)
                # Fixed: identity comparison (`is not None`) instead of `!= None`,
                # and skip entirely when no initializer was supplied.
                if initializer is not None:
                    initializer(name, arr)
        else:
            # No cache supplied: always initialize from scratch.
            initializer(name, arr)

    attrs = self._symbol.attr_dict()
    for name, arr in self._arg_params.items():
        desc = InitDesc(name, attrs.get(name, None))
        _impl(desc, arr, arg_params)

    for name, arr in self._aux_params.items():
        desc = InitDesc(name, attrs.get(name, None))
        _impl(desc, arr, aux_params)

    self.params_initialized = True
    self._params_dirty = False

    # copy the initialized parameters to devices
    self._exec_group.set_params(self._arg_params, self._aux_params)
def init_params(self, initializer=Uniform(0.01), arg_params=None, aux_params=None,
                allow_missing=False, force_init=False, allow_extra=False):
    """Initialize parameters by delegating to the currently bound module.

    Parameters
    ----------
    initializer : Initializer
        Used to fill parameters not provided via ``arg_params``/``aux_params``.
    arg_params, aux_params : dict or None
        Existing parameter / auxiliary-state dictionaries to copy from.
    allow_missing, force_init, allow_extra : bool
        Forwarded to the underlying module's ``init_params``.
    """
    if self.params_initialized and not force_init:
        return
    assert self.binded, 'call bind before initializing the parameters'
    # Bug fix: `allow_extra` was accepted by this signature but silently
    # dropped when delegating; forward it so extra checkpoint entries are
    # handled as the caller requested.
    self._curr_module.init_params(initializer=initializer, arg_params=arg_params,
                                  aux_params=aux_params,
                                  allow_missing=allow_missing,
                                  force_init=force_init,
                                  allow_extra=allow_extra)
    self.params_initialized = True
def init_params(self, initializer=Uniform(0.01), arg_params=None, aux_params=None,
                allow_missing=False, force_init=False, allow_extra=False):
    """Initializes the parameters of every sub-module, then verifies that no
    parameter name is shared between layers.

    Parameters
    ----------
    initializer : Initializer
    arg_params : dict
        Default ``None``. Existing parameters. This has higher priority
        than `initializer`.
    aux_params : dict
        Default ``None``. Existing auxiliary states. This has higher priority
        than `initializer`.
    allow_missing : bool
        Allow missing values in `arg_params` and `aux_params` (if not ``None``).
        In this case, missing values will be filled with `initializer`.
    force_init : bool
        Default ``False``.
    allow_extra : boolean, optional
        Whether allow extra parameters that are not needed by symbol.
        If this is True, no error will be thrown when arg_params or aux_params
        contain extra parameters that is not needed by the executor.
    """
    if self.params_initialized and not force_init:
        return
    assert self.binded, 'call bind before initializing the parameters'

    for sub_module in self._modules:
        sub_module.init_params(initializer=initializer,
                               arg_params=arg_params,
                               aux_params=aux_params,
                               allow_missing=allow_missing,
                               force_init=force_init,
                               allow_extra=allow_extra)

    def _check_name(known_names, new_names, modules, i):
        """Record each parameter name's owning layer; fail on any duplicate."""
        for name in new_names:
            assert not name in known_names, "Duplicated parameter names: " + \
                ('name "%s" in layer %d (%s) is already ' % (name, i, type(modules[i]))) + \
                ('used in layer %d (%s).' % (known_names[name],
                                             type(modules[known_names[name]])))
            known_names[name] = i

    # Distinct local names here — the original reused `arg_params`/`aux_params`
    # and shadowed the function arguments.
    seen_arg_names = dict()
    seen_aux_names = dict()
    for layer_idx, sub_module in enumerate(self._modules):
        layer_args, layer_auxs = sub_module.get_params()
        _check_name(seen_arg_names, layer_args.keys(), self._modules, layer_idx)
        _check_name(seen_aux_names, layer_auxs.keys(), self._modules, layer_idx)

    self.params_initialized = True
def fit(self, train_data, eval_data=None, eval_metric='acc',
        epoch_end_callback=None, batch_end_callback=None, kvstore='local',
        optimizer='sgd', optimizer_params=(('learning_rate', 0.01), ),
        eval_end_callback=None, eval_batch_end_callback=None,
        initializer=Uniform(0.01), arg_params=None, aux_params=None,
        allow_missing=False, force_rebind=False, force_init=False,
        begin_epoch=0, num_epoch=None, validation_metric=None, monitor=None,
        sparse_row_id_fn=None, profile=False):
    """Bind the module with the teacher's label shapes appended to the data
    iterator's labels, then run the parent class's training loop.

    Rebinding is suppressed in the delegated call because binding has already
    been performed here with the augmented label shapes.
    """
    assert num_epoch is not None, 'please specify number of epochs'

    # Bind first so the extra teacher labels become part of the label shapes.
    self.bind(data_shapes=train_data.provide_data,
              label_shapes=train_data.provide_label + self.teacher_label_shapes,
              for_training=True, force_rebind=force_rebind)

    # Collect the delegated arguments once, then hand off to the parent fit.
    delegated = dict(force_rebind=False,
                     train_data=train_data,
                     eval_data=eval_data,
                     eval_metric=eval_metric,
                     epoch_end_callback=epoch_end_callback,
                     batch_end_callback=batch_end_callback,
                     kvstore=kvstore,
                     optimizer=optimizer,
                     optimizer_params=optimizer_params,
                     eval_end_callback=eval_end_callback,
                     eval_batch_end_callback=eval_batch_end_callback,
                     initializer=initializer,
                     arg_params=arg_params,
                     aux_params=aux_params,
                     allow_missing=allow_missing,
                     force_init=force_init,
                     begin_epoch=begin_epoch,
                     num_epoch=num_epoch,
                     validation_metric=validation_metric,
                     monitor=monitor,
                     sparse_row_id_fn=sparse_row_id_fn,
                     profile=profile)
    super().fit(**delegated)
def __init__(self, num_dim, **kwargs):
    """Symmetric 4-level dense autoencoder: num_dim -> num_dim//4 -> //16
    -> //64 -> //256 and back, with wider Uniform init on the outer (larger)
    layers and tighter init on the inner (narrower) ones."""
    super(Model, self).__init__(**kwargs)
    outer_init = Uniform(0.25)  # outer, high-dimensional layers
    inner_init = Uniform(0.1)   # inner, low-dimensional layers
    with self.name_scope():
        self.encoder1 = nn.Dense(num_dim // 4, in_units=num_dim,
                                 weight_initializer=outer_init)
        self.encoder2 = nn.Dense(num_dim // 16, in_units=num_dim // 4,
                                 weight_initializer=outer_init)
        self.encoder3 = nn.Dense(num_dim // 64, in_units=num_dim // 16,
                                 weight_initializer=inner_init)
        self.encoder4 = nn.Dense(num_dim // 256, in_units=num_dim // 64,
                                 weight_initializer=inner_init)
        self.decoder4 = nn.Dense(num_dim // 64, in_units=num_dim // 256,
                                 weight_initializer=inner_init)
        self.decoder3 = nn.Dense(num_dim // 16, in_units=num_dim // 64,
                                 weight_initializer=inner_init)
        self.decoder2 = nn.Dense(num_dim // 4, in_units=num_dim // 16,
                                 weight_initializer=outer_init)
        self.decoder1 = nn.Dense(num_dim, in_units=num_dim // 4,
                                 weight_initializer=outer_init)
    # Pair encoder i with decoder i, ordered from shallowest to deepest.
    encoders = (self.encoder1, self.encoder2, self.encoder3, self.encoder4)
    decoders = (self.decoder1, self.decoder2, self.decoder3, self.decoder4)
    self.layers = list(zip(encoders, decoders))
    for encoder, decoder in self.layers:
        self.register_child(encoder)
        self.register_child(decoder)
def build_model(A, X):
    """Build a hybridized GCN feature extractor topped with a logistic
    regression head, logging a summary after each stage is added.

    Returns the full model together with the feature sub-network.
    """
    model = HybridSequential()
    with model.name_scope():
        # Graph-convolution feature stack.
        features, out_units = build_features(A, X)
        model.add(features)
        logger.info("GCN Summary: \n{}".format(model))
        # Binary classification head on top of the GCN output.
        classifier = LogisticRegressor(out_units)
        model.add(classifier)
        logger.info("GCN + LR Summary: \n{}".format(model))
    model.hybridize()
    model.initialize(Uniform(1))
    return model, features
def build_model(A, X):
    """Build a hybridized GCN feature extractor with a logistic regression head.

    Parameters
    ----------
    A : adjacency input passed through to ``build_features``.
    X : node feature matrix passed through to ``build_features``.

    Returns
    -------
    (model, features) : the full hybridized model and the feature sub-network.
    """
    model = HybridSequential()
    # Removed dead code: `hidden_layer_specs` and the duplicated
    # `in_units = in_units = X.shape[1]` assignment were never used.
    with model.name_scope():
        features, out_units = build_features(A, X)
        model.add(features)
        classifier = LogisticRegressor(out_units)
        model.add(classifier)
    model.hybridize()
    model.initialize(Uniform(1))
    return model, features
def init_params(self, initializer=Uniform(0.01), arg_params=None, aux_params=None,
                allow_missing=False, force_init=False, allow_extra=False):
    """Initialize the backbone module's parameters.

    Parameters
    ----------
    initializer : Initializer
        Fills parameters not supplied via ``arg_params``/``aux_params``.
    arg_params, aux_params : dict or None
        Existing parameters / auxiliary states to copy from.
    allow_missing, force_init, allow_extra : bool
        Forwarded to the backbone module's ``init_params``.
    """
    assert self.binded, 'call bind before initializing the parameters'
    # backbone
    # Bug fix: init_params was previously invoked twice; the second call
    # (with arg_params=None) was redundant once the first call had set the
    # module's params_initialized flag.
    self.backbone_module.init_params(
        initializer=initializer, arg_params=arg_params,
        aux_params=aux_params, allow_missing=allow_missing,
        force_init=force_init, allow_extra=allow_extra)
    # BN-module initialization intentionally disabled (kept for reference):
    # self.bn_module.init_params(
    #     initializer=initializer, arg_params=arg_params,
    #     aux_params=aux_params, allow_missing=allow_missing,
    #     force_init=force_init, allow_extra=allow_extra)
    self.params_initialized = True
def __init__(self, symbol, ctx=None, num_epoch=None, epoch_size=None,
             optimizer='sgd', initializer=Uniform(0.01), numpy_batch_size=128,
             arg_params=None, aux_params=None, allow_extra_params=False,
             begin_epoch=0, **kwargs):
    """Store the network definition plus training configuration.

    ``symbol`` may be either a concrete ``sym.Symbol`` or a callable that
    generates one; in the latter case argument checking is deferred until a
    symbol is actually produced.
    """
    # Network definition: either a fixed symbol or a symbol generator.
    if isinstance(symbol, sym.Symbol):
        self.symbol, self.sym_gen = symbol, None
    else:
        assert callable(symbol)
        self.symbol, self.sym_gen = None, symbol

    # model parameters
    self.arg_params = arg_params
    self.aux_params = aux_params
    self.allow_extra_params = allow_extra_params
    self.argument_checked = False
    # Arguments can only be validated against a concrete symbol.
    if self.sym_gen is None:
        self._check_arguments()

    # basic configuration: normalize ctx to a list of contexts
    if ctx is None:
        ctx = [cpu()]
    elif isinstance(ctx, Context):
        ctx = [ctx]
    self.ctx = ctx

    # training parameters
    self.num_epoch = num_epoch
    self.epoch_size = epoch_size
    self.kwargs = kwargs.copy()
    self.optimizer = optimizer
    self.initializer = initializer
    self.numpy_batch_size = numpy_batch_size

    # internal helper state
    self._pred_exec = None
    self.begin_epoch = begin_epoch
def create(symbol, X, marks, e_marks=None, y=None, ctx=None,
           num_epoch=None, epoch_size=None, optimizer='sgd',
           initializer=Uniform(0.01), eval_data=None, eval_metric='acc',
           epoch_end_callback=None, batch_end_callback=None,
           time_step_callback=None, kvstore='local', logger=None,
           work_load_list=None, eval_batch_end_callback=None, **kwargs):
    """Overwrite"""
    # Construct the model from its configuration, then immediately train it
    # (functional-style convenience wrapper).
    model = Feed(symbol,
                 ctx=ctx,
                 num_epoch=num_epoch,
                 epoch_size=epoch_size,
                 optimizer=optimizer,
                 initializer=initializer,
                 **kwargs)

    fit_args = dict(eval_data=eval_data,
                    eval_metric=eval_metric,
                    epoch_end_callback=epoch_end_callback,
                    batch_end_callback=batch_end_callback,
                    kvstore=kvstore,
                    logger=logger,
                    work_load_list=work_load_list,
                    eval_batch_end_callback=eval_batch_end_callback)
    model.fit(X, y, marks, e_marks=e_marks, **fit_args)
    return model
def init_params(self, initializer=Uniform(0.01), arg_params=None, aux_params=None,
                allow_missing=False, force_init=False, allow_extra=False):
    """Initialize the backbone module normally, then seed each arcface FC
    module's weights from pre-computed pickle files on disk.

    NOTE(review): the pickle path is hard-coded to a machine-specific
    location and indexed by the module's position; this only works in that
    exact environment. Also, ``pickle.load`` on these files executes
    arbitrary code if the files are not trusted — confirm their provenance.
    """
    if self.params_initialized and not force_init:
        return
    assert self.binded, 'call bind before initializing the parameters'
    #TODO init the same weights with all work nodes
    self._curr_module.init_params(initializer=initializer, arg_params=arg_params,
                                  aux_params=aux_params, allow_missing=allow_missing,
                                  force_init=force_init, allow_extra=allow_extra)
    # print('load fcw file')
    # file_name = '/data/insightface/models/fcw_{}.pkl'.format(str(4))
    # tmp_fcw = pickle.load(open(file_name, 'rb'))
    # print('load weight file done')
    idx = 0
    for _module in self._arcface_modules:
        # _initializer = mx.init.Normal(0.01)
        # _module.init_params(initializer=_initializer, arg_params=None,
        #                     aux_params=None, allow_missing=allow_missing,
        #                     force_init=force_init, allow_extra=allow_extra)
        # Each module loads its own shard of FC weights, keyed by position.
        # tmp_fcw appears to be a (param_name, weight_array) pair — TODO confirm.
        file_name = '/data/insightface/recognition/models/fc_weights/v5/sec-fcw_1_{}.pkl'.format(str(idx))
        tmp_fcw = pickle.load(open(file_name, 'rb'))
        print ('for debug, local key is ', tmp_fcw[0])
        # local_fcw = mx.nd.array(tmp_fcw[local_key])
        local_fcw = mx.nd.array(tmp_fcw[1])
        # print('for debug, local fc weigth is: {}, shape is {}'.format(local_fcw, local_fcw.shape))
        # The loaded shard must cover exactly this context's class slice.
        assert self._ctx_num_classes == local_fcw.shape[0]
        _initializer = mx.init.Constant(local_fcw)
        idx += 1
        # Both the Constant initializer and arg_params carry the same tensor;
        # arg_params takes priority inside init_params.
        tmp_arg_params = {tmp_fcw[0]: local_fcw}
        _module.init_params(initializer=_initializer, arg_params=tmp_arg_params,
                            aux_params=None, allow_missing=allow_missing,
                            force_init=force_init, allow_extra=allow_extra)
        arg_w, aux_w = _module.get_params()
        # print('for debug, module arg p is', arg_w)
        # print('for debug, module aux p is ', aux_w)
    self.params_initialized = True
def fit(self, train_data, eval_data=None, eval_metric='acc',
        epoch_end_callback=None, batch_end_callback=None, kvstore='local',
        optimizer='sgd', optimizer_params=(('learning_rate', 0.01),),
        eval_end_callback=None, eval_batch_end_callback=None,
        initializer=Uniform(0.01), arg_params=None, aux_params=None,
        allow_missing=False, force_rebind=False, force_init=False,
        begin_epoch=0, num_epoch=None, validation_metric=None, monitor=None,
        summary_writer = None):
    """Training loop tracking two metrics — a masked softmax accuracy and an
    LMNN loss — and mirroring them to a TensorBoard-style summary writer.

    Parameters largely follow the standard module ``fit``; ``summary_writer``
    is an object with ``add_scalar(tag, value, step)`` (e.g. mxboard/tensorboardX).
    """
    assert num_epoch is not None, 'please specify number of epochs'

    self.num_batch = 0
    self.writer = summary_writer
    self.bind(data_shapes=train_data.provide_data,
              label_shapes=train_data.provide_label,
              for_training=True, force_rebind=force_rebind)
    if monitor is not None:
        self.install_monitor(monitor)
    self.init_params(initializer=initializer, arg_params=arg_params,
                     aux_params=aux_params, allow_missing=allow_missing,
                     force_init=force_init)
    self.init_optimizer(kvstore=kvstore, optimizer=optimizer,
                        optimizer_params=optimizer_params)

    # Accuracy over softmax outputs (IgnoreAccuracy presumably skips some
    # label value — TODO confirm against its definition) plus the LMNN loss.
    acc_metric = IgnoreAccuracy(output_names=['softmax_output'],
                                label_names=['softmax_label'])
    # acc_metric = metric.Accuracy(output_names=['softmax_output'], label_names=['softmax_label'])
    lmnn_metric = metric.Loss(output_names=['lmnn_output'],
                              label_names=['softmax_label'])
    if validation_metric is None:
        validation_metric = lmnn_metric

    ################################################################################
    # training loop
    ################################################################################
    for epoch in range(begin_epoch, num_epoch):
        tic = time.time()
        acc_metric.reset()
        lmnn_metric.reset()
        # eval_metric.reset()
        for nbatch, data_batch in enumerate(train_data):
            if monitor is not None:
                monitor.tic()
            self.forward_backward(data_batch)
            self.update()
            self.update_metric(acc_metric, data_batch.label)
            self.update_metric(lmnn_metric, data_batch.label)
            if monitor is not None:
                monitor.toc_print()
            if batch_end_callback is not None:
                batch_end_params = BatchEndParam(epoch=epoch, nbatch=nbatch,
                                                 eval_metric=eval_metric,
                                                 locals=locals())
                for callback in _as_list(batch_end_callback):
                    callback(batch_end_params)

        # one epoch of training is finished
        for name, val in acc_metric.get_name_value():
            self.logger.info('Epoch[%d] Accuracy Train-%s=%f', epoch, name, val)
        for name, val in lmnn_metric.get_name_value():
            self.logger.info('Epoch[%d] Lmnn Train-%s=%f', epoch, name, val)
        # NOTE(review): num_batch is incremented once per EPOCH here, so the
        # writer logs every 10th epoch despite the name — confirm intent.
        if self.num_batch % 10 == 0:
            # print acc_metric.sum_metric, acc_metric.num_inst
            self.writer.add_scalar('{}/cls_acc'.format('Train'),
                                   acc_metric.sum_metric / acc_metric.num_inst,
                                   self.num_batch)
            self.writer.add_scalar('{}/lmnn_loss'.format('Train'),
                                   lmnn_metric.sum_metric / lmnn_metric.num_inst,
                                   self.num_batch)
        self.num_batch += 1
        toc = time.time()
        self.logger.info('Epoch[%d] Time cost=%.3f', epoch, (toc-tic))

        # sync aux params across devices
        arg_params, aux_params = self.get_params()
        self.set_params(arg_params, aux_params)

        if epoch_end_callback is not None:
            for callback in _as_list(epoch_end_callback):
                callback(epoch, self.symbol, arg_params, aux_params)

        #----------------------------------------
        # evaluation on validation set
        if eval_data:
            res = self.score(eval_data, validation_metric,
                             score_end_callback=eval_end_callback,
                             batch_end_callback=eval_batch_end_callback,
                             epoch=epoch)
            #TODO: pull this into default
            for name, val in res:
                self.logger.info('Epoch[%d] Validation-%s=%f', epoch, name, val)

        # end of 1 epoch, reset the data-iter for another epoch
        train_data.reset()
def fit(self, train_data, eval_data=None, eval_metric='acc',
        epoch_end_callback=None, batch_end_callback=None, kvstore='local',
        optimizer='sgd', optimizer_params=(('learning_rate', 0.01), ),
        eval_end_callback=None, eval_batch_end_callback=None,
        initializer=Uniform(0.01), arg_params=None, aux_params=None,
        allow_missing=False, force_rebind=False, force_init=False,
        begin_epoch=0, num_epoch=None, validation_metric=None, monitor=None):
    """Training loop augmented with per-phase network bandwidth shaping.

    Around each forward/backward/update step the method reconfigures Linux
    traffic-control (``tc``) classes via ``os.system`` to give parameter-server
    and worker uploads different rates in two phases. All knobs come from
    environment variables (TASK_LIMIT, DELAY_TIME, *_BANDWIDTH1/2).

    NOTE(review): shelling out to ``sudo tc`` assumes a specific host setup
    (interfaces ens3/ifb0, passwordless sudo) — not portable.
    """
    assert num_epoch is not None, 'please specify number of epochs'
    self.bind(data_shapes=train_data.provide_data,
              label_shapes=train_data.provide_label,
              for_training=True, force_rebind=force_rebind)
    if monitor is not None:
        self.install_monitor(monitor)
    self.init_params(initializer=initializer, arg_params=arg_params,
                     aux_params=aux_params, allow_missing=allow_missing,
                     force_init=force_init)
    self.init_optimizer(kvstore=kvstore, optimizer=optimizer,
                        optimizer_params=optimizer_params)

    if validation_metric is None:
        validation_metric = eval_metric
    if not isinstance(eval_metric, metric.EvalMetric):
        eval_metric = metric.create(eval_metric)

    ####chris_arg
    # TASK_LIMIT: 0 = no per-task bandwidth limit; 1 = per-task limit updated
    # every round; 2 = per-task limit, fixed.
    if int(os.getenv("TASK_LIMIT", 0)) != 0:
        get_task_cmd = "sh /home/ubuntu/tc.sh -l 1"
    else:
        self.logger.info("no_task_bandwidth_limit")
        get_task_cmd = "sh /home/ubuntu/tc.sh -l 0"
    os.system(get_task_cmd)
    delay_time = float(os.getenv("DELAY_TIME", 0.8))
    ps_upload_bandwidth_part1 = int(os.getenv("PS_UPLOAD_BANDWIDTH1", 2000))
    worker_upload_bandwidth_part1 = int(
        os.getenv("WORKER_UPLOAD_BANDWIDTH1", 2000))
    ps_upload_bandwidth_part2 = int(os.getenv("PS_UPLOAD_BANDWIDTH2", 2000))
    worker_upload_bandwidth_part2 = int(
        os.getenv("WORKER_UPLOAD_BANDWIDTH2", 2000))
    # Template for reshaping two htb classes (PS and worker) on one device.
    tc_command = "sudo tc class change dev {} parent 1: classid 1:3 htb rate {}mbit ceil {}mbit && sudo tc class change dev {} parent 1: classid 1:4 htb rate {}mbit ceil {}mbit"

    ################################################################################
    # training loop
    ################################################################################
    for epoch in range(begin_epoch, num_epoch):
        tic = time.time()
        eval_metric.reset()
        nbatch = 0
        data_iter = iter(train_data)
        end_of_batch = False
        next_data_batch = next(data_iter)
        while not end_of_batch:
            data_batch = next_data_batch
            if monitor is not None:
                monitor.tic()
            self.forward(data_batch, is_train=True)
            if int(os.getenv("TASK_LIMIT", 0)) == 1:
                ##first part bandwidth allocation
                ndarray.waitall()
                # self.logger.info("change bandwidth part1:, "+str(time.time()))
                x = str(ps_upload_bandwidth_part1)
                y = str(worker_upload_bandwidth_part1)
                cmd_up = tc_command.format("ens3", x, x, "ens3", y, y)
                cmd_down = tc_command.format("ifb0", y, y, "ifb0", x, x)
                os.system(cmd_up)
                # os.system(cmd_down)
            # self.logger.info("after forward, "+str(time.time()))
            self.backward()
            # self.logger.info("before update: "+str(time.time()))
            self.update()  # executed asynchronously
            if int(os.getenv("TASK_LIMIT", 0)) == 1:
                x = str(ps_upload_bandwidth_part2)
                y = str(worker_upload_bandwidth_part2)
                cmd_up = tc_command.format("ens3", x, x, "ens3", y, y)
                cmd_down = tc_command.format("ifb0", y, y, "ifb0", x, x)
                time.sleep(delay_time)
                ##second part bandwidth allocation
                # self.logger.info("change bandwidth part2:, "+str(time.time()))
                os.system(cmd_up)
                # os.system(cmd_down)
            try:
                # pre fetch next batch
                next_data_batch = next(data_iter)
                self.prepare(next_data_batch)
            except StopIteration:
                end_of_batch = True
            self.update_metric(eval_metric, data_batch.label)
            if monitor is not None:
                monitor.toc_print()
            if batch_end_callback is not None:
                batch_end_params = BatchEndParam(epoch=epoch, nbatch=nbatch,
                                                 eval_metric=eval_metric,
                                                 locals=locals())
                for callback in _as_list(batch_end_callback):
                    callback(batch_end_params)
            nbatch += 1

        # one epoch of training is finished
        for name, val in eval_metric.get_name_value():
            self.logger.info('Epoch[%d] Train-%s=%f', epoch, name, val)
        toc = time.time()
        self.logger.info('Epoch[%d] Time cost=%.3f', epoch, (toc - tic))

        # sync aux params across devices
        arg_params, aux_params = self.get_params()
        self.set_params(arg_params, aux_params)

        if epoch_end_callback is not None:
            for callback in _as_list(epoch_end_callback):
                callback(epoch, self.symbol, arg_params, aux_params)

        #----------------------------------------
        # evaluation on validation set
        if eval_data:
            res = self.score(eval_data, validation_metric,
                             score_end_callback=eval_end_callback,
                             batch_end_callback=eval_batch_end_callback,
                             epoch=epoch)
            #TODO: pull this into default
            for name, val in res:
                self.logger.info('Epoch[%d] Validation-%s=%f', epoch, name, val)

        # end of 1 epoch, reset the data-iter for another epoch
        train_data.reset()
def create(symbol, X, y=None, ctx=None,
           num_epoch=None, epoch_size=None, optimizer='sgd',
           initializer=Uniform(0.01), eval_data=None, eval_metric='acc',
           epoch_end_callback=None, batch_end_callback=None, kvstore='local',
           logger=None, work_load_list=None, eval_batch_end_callback=None,
           **kwargs):
    """Functional style to create a model.

    This function will be more consistent with functional
    languages such as R, where mutation is not allowed.

    Parameters
    ----------
    symbol : Symbol
        The symbol configuration of computation network.
    X : DataIter
        Training data
    y : numpy.ndarray, optional
        If X is numpy.ndarray y is required to set
    ctx : Context or list of Context, optional
        The device context of training and prediction.
        To use multi GPU training, pass in a list of gpu contexts.
    num_epoch : int, optional
        Training parameter, number of training epochs(epochs).
    epoch_size : int, optional
        Number of batches in a epoch. In default, it is set to
        ceil(num_train_examples / batch_size)
    optimizer : str or Optimizer, optional
        Training parameter, name or optimizer object for training.
    initializer : initializer function, optional
        Training parameter, the initialization scheme used.
    eval_data : DataIter or numpy.ndarray pair
        If eval_set is numpy.ndarray pair, it should be (valid_data, valid_label)
    eval_metric : metric.EvalMetric or str or callable
        The evaluation metric, name of evaluation metric.
        Or a customize evaluation function that returns the statistics
        based on minibatch.
    epoch_end_callback : callable(epoch, symbol, arg_params, aux_states)
        A callback that is invoked at end of each epoch.
        This can be used to checkpoint model each epoch.
    batch_end_callback: callable(epoch)
        A callback that is invoked at end of each batch
        For print purpose
    kvstore: KVStore or str, optional
        The KVStore or a string kvstore type: 'local', 'dist_sync', 'dist_async'
        In default uses 'local', often no need to change for single machine.
    logger : logging logger, optional
        When not specified, default logger will be used.
    work_load_list : list of float or int, optional
        The list of work load for different devices, in the same order as ctx
    """
    # Build the model, then immediately train it with the provided data and
    # callbacks; the trained model is returned rather than mutated in place.
    model = FeedForward(symbol, ctx=ctx, num_epoch=num_epoch,
                        epoch_size=epoch_size,
                        optimizer=optimizer, initializer=initializer, **kwargs)
    model.fit(X, y, eval_data=eval_data, eval_metric=eval_metric,
              epoch_end_callback=epoch_end_callback,
              batch_end_callback=batch_end_callback,
              kvstore=kvstore,
              logger=logger, work_load_list=work_load_list,
              eval_batch_end_callback=eval_batch_end_callback)
    return model
def train(self, train_data, epochs=1, batch_size=32, validation_data=None,
          train_resize_batch_num=None):
    """Train the model and update the model parameters.

    Runs only on worker processes. ``train_data``/``validation_data`` may be
    either Ray partitions (converted to NDArrayIter) or data-creator
    callables taking ``(config, kvstore)``. Uses the Gluon imperative path
    when ``self.trainer`` is set, otherwise the symbolic ``Module.fit`` path.
    Returns a dict of final metric values plus total ``epoch_time``.
    """
    stats = dict()
    if self.is_worker:
        from zoo.orca.data.shard import RayPartition
        if isinstance(train_data, RayPartition):
            from zoo.orca.data.utils import ray_partition_get_data_label
            data, label = ray_partition_get_data_label(
                train_data.get_data(), allow_tuple=False, allow_list=False)
            train_data_iter = mx.io.NDArrayIter(data=data, label=label,
                                                batch_size=batch_size,
                                                shuffle=True)
            # Optionally cap/pad the number of batches per epoch.
            if train_resize_batch_num is not None:
                train_data_iter = mx.io.ResizeIter(train_data_iter,
                                                   train_resize_batch_num)
            if validation_data:
                data_val, label_val = ray_partition_get_data_label(
                    validation_data.get_data(), allow_tuple=False,
                    allow_list=False)
                val_data_iter = mx.io.NDArrayIter(data=data_val,
                                                  label=label_val,
                                                  batch_size=batch_size,
                                                  shuffle=True)
            else:
                val_data_iter = None
        else:  # data_creator functions; should return Iter or DataLoader
            config = self.config
            if "batch_size" not in config:
                config["batch_size"] = batch_size
            train_data_iter = train_data(config, self.kv)
            val_data_iter = validation_data(
                config, self.kv) if validation_data else None

        start_time = time.time()
        if self.trainer:  # Imperative API
            for epoch in range(epochs):
                train_data_iter.reset()
                if self.eval_metrics:
                    self.eval_metrics.reset()  # metrics will accumulate for one batch
                batch_start_time = time.time()
                epoch_start_time = time.time()
                for i, batch in enumerate(train_data_iter):
                    data = gluon.utils.split_and_load(
                        batch.data[0].astype("float32"),
                        ctx_list=[mx.cpu()], batch_axis=0)
                    label = gluon.utils.split_and_load(
                        batch.label[0].astype("float32"),
                        ctx_list=[mx.cpu()], batch_axis=0)
                    outputs = []
                    Ls = []
                    from mxnet import autograd as ag
                    with ag.record():
                        for x, y in zip(data, label):
                            z = self.model(x)  # forward
                            L = self.loss(z, y)
                            # store the loss and do backward on a batch for better speed
                            Ls.append(L)
                            outputs.append(z)
                        ag.backward(Ls)
                    self.trainer.step(batch.data[0].shape[0])
                    if self.eval_metrics:
                        self.eval_metrics.update(label, outputs)
                    if not (i + 1) % self.config["log_interval"]:
                        # This would be logged on driver for each worker process.
                        iteration_log = \
                            "Epoch[%d] Batch[%d] Speed: %f samples/sec %s=%f" \
                            % (epoch, i,
                               batch_size / (time.time() - batch_start_time),
                               "loss", Ls[0].asnumpy().mean())
                        if self.eval_metrics:
                            names, accs = self.eval_metrics.get()
                            names, accs = to_list(names), to_list(accs)
                            for name, acc in zip(names, accs):
                                iteration_log += " %s=%f" % (name, acc)
                        self.logger.info(iteration_log)
                    batch_start_time = time.time()
                # Epoch time log
                self.logger.info("[Epoch %d] time cost: %f" %
                                 (epoch, time.time() - epoch_start_time))
                # Epoch metrics log on train data
                if self.eval_metrics:
                    epoch_train_log = "[Epoch %d] training: " % epoch
                    names, accs = self.eval_metrics.get()
                    names, accs = to_list(names), to_list(accs)
                    for name, acc in zip(names, accs):
                        epoch_train_log += "%s=%f " % (name, acc)
                    self.logger.info(epoch_train_log)
                # Epoch metrics log on validation data if any:
                if val_data_iter:
                    self.val_metrics.reset()
                    val_data_iter.reset()
                    for batch in val_data_iter:
                        data = gluon.utils.split_and_load(
                            batch.data[0].astype("float32", copy=False),
                            ctx_list=[mx.cpu()], batch_axis=0)
                        label = gluon.utils.split_and_load(
                            batch.label[0].astype("float32", copy=False),
                            ctx_list=[mx.cpu()], batch_axis=0)
                        outputs = [self.model(X) for X in data]
                        self.val_metrics.update(label, outputs)
                    epoch_val_log = "[Epoch %d] validation: " % epoch
                    names, accs = self.val_metrics.get()
                    names, accs = to_list(names), to_list(accs)
                    for name, acc in zip(names, accs):
                        epoch_val_log += "%s=%f " % (name, acc)
                    self.logger.info(epoch_val_log)
                # TODO: save checkpoints
            if self.eval_metrics:
                names, accs = self.eval_metrics.get()
                names, accs = to_list(names), to_list(accs)
                for name, acc in zip(names, accs):
                    stats[name] = acc
        else:  # Symbolic API
            # TODO: seems no history (i.e. validation accuracy) returned by fit?
            if "init" not in self.config:
                from mxnet.initializer import Uniform
                self.config["init"] = Uniform(
                    0.01)  # This is the default value for MXNet
            if self.eval_metrics is None:
                self.eval_metrics = 'acc'
            self.model.fit(
                train_data=train_data_iter,
                num_epoch=epochs,
                initializer=self.config["init"],
                kvstore=self.kv,
                optimizer=self.config["optimizer"],
                optimizer_params=self.config["optimizer_params"],
                eval_data=val_data_iter,
                eval_metric=self.eval_metrics,
                validation_metric=self.val_metrics,
                batch_end_callback=mx.callback.Speedometer(
                    batch_size, self.config["log_interval"]),
                epoch_end_callback=None if "model" not in self.config
                else mx.callback.do_checkpoint(self.config["model"]))
        epoch_time = time.time() - start_time
        stats["epoch_time"] = epoch_time
        # Release Ray partition references once training is done.
        if isinstance(train_data, RayPartition):
            del train_data
        if validation_data and isinstance(validation_data, RayPartition):
            del validation_data
    return stats
def __init__(self, **kwargs):
    """Assemble the QANet architecture: character/word embeddings with a
    highway network, an embedding encoder, context-query co-attention, a
    model encoder stack, and the two span-boundary predictors. All layer
    sizes come from the module-level ``opt`` configuration.
    """
    super(QANet, self).__init__(**kwargs)
    with self.name_scope():
        self.flatten = gluon.nn.Flatten()
        self.dropout = gluon.nn.Dropout(opt.layers_dropout)
        # Character-level CNN encoder (no highway inside; highway is applied
        # separately below).
        self.char_conv = ConvolutionalEncoder(
            embed_size=opt.char_emb_dim,
            num_filters=opt.char_conv_filters,
            ngram_filter_sizes=opt.char_conv_ngrams,
            conv_layer_activation=None,
            num_highway=0)

    # Projection + highway over the concatenated word/char embeddings.
    self.highway = gluon.nn.HybridSequential()
    with self.highway.name_scope():
        self.highway.add(
            gluon.nn.Dense(units=opt.emb_encoder_conv_channels,
                           flatten=False,
                           use_bias=False,
                           weight_initializer=Xavier()))
        self.highway.add(
            Highway(input_size=opt.emb_encoder_conv_channels,
                    num_layers=opt.highway_layers,
                    activation='relu',
                    highway_bias=HighwayBias(nonlinear_transform_bias=0.0,
                                             transform_gate_bias=0.0)))

    # Word embedding table with dropout.
    self.word_emb = gluon.nn.HybridSequential()
    with self.word_emb.name_scope():
        self.word_emb.add(
            gluon.nn.Embedding(input_dim=opt.word_corpus,
                               output_dim=opt.word_emb_dim))
        self.word_emb.add(gluon.nn.Dropout(rate=opt.word_emb_dropout))

    # Character embedding table with dropout; feeds char_conv.
    self.char_emb = gluon.nn.HybridSequential()
    with self.char_emb.name_scope():
        self.char_emb.add(
            gluon.nn.Embedding(input_dim=opt.character_corpus,
                               output_dim=opt.char_emb_dim,
                               weight_initializer=Normal(sigma=0.1)))
        self.char_emb.add(gluon.nn.Dropout(rate=opt.char_emb_dropout))

    with self.name_scope():
        self.emb_encoder = Encoder(
            kernel_size=opt.emb_encoder_conv_kernerl_size,
            num_filters=opt.emb_encoder_conv_channels,
            conv_layers=opt.emb_encoder_num_conv_layers,
            num_heads=opt.emb_encoder_num_head,
            num_blocks=opt.emb_encoder_num_block)
        self.project = gluon.nn.Dense(units=opt.emb_encoder_conv_channels,
                                      flatten=False,
                                      use_bias=False,
                                      weight_initializer=Xavier())

    with self.name_scope():
        self.co_attention = CoAttention()

    with self.name_scope():
        self.model_encoder = Encoder(
            kernel_size=opt.model_encoder_conv_kernel_size,
            num_filters=opt.model_encoder_conv_channels,
            conv_layers=opt.model_encoder_conv_layers,
            num_heads=opt.model_encoder_num_head,
            num_blocks=opt.model_encoder_num_block)

    with self.name_scope():
        # Span-boundary scorers: one dense unit each for start and end
        # positions, with small uniform bias init scaled by channel width.
        self.predict_begin = gluon.nn.Dense(
            units=1, use_bias=True, flatten=False,
            weight_initializer=Xavier(rnd_type='uniform', factor_type='in',
                                      magnitude=1),
            bias_initializer=Uniform(1.0 / opt.model_encoder_conv_channels))
        self.predict_end = gluon.nn.Dense(
            units=1, use_bias=True, flatten=False,
            weight_initializer=Xavier(rnd_type='uniform', factor_type='in',
                                      magnitude=1),
            bias_initializer=Uniform(1.0 / opt.model_encoder_conv_channels))
def train(self, nb_epoch=1):
    """Train the model and update the model parameters.

    Runs only on worker processes. Uses the Gluon imperative loop when
    ``self.trainer`` is set, otherwise delegates to the symbolic
    ``Module.fit``. Returns a dict of final metric values plus total
    ``epoch_time``.
    """
    stats = dict()
    if self.is_worker:
        start_time = time.time()
        if self.trainer:  # Imperative API
            for epoch in range(nb_epoch):
                self.train_data.reset()
                if self.metrics:
                    self.metrics.reset()  # metrics will accumulate for one batch
                batch_start_time = time.time()
                epoch_start_time = time.time()
                for i, batch in enumerate(self.train_data):
                    data = gluon.utils.split_and_load(
                        batch.data[0].astype("float32"),
                        ctx_list=[mx.cpu()], batch_axis=0)
                    label = gluon.utils.split_and_load(
                        batch.label[0].astype("float32"),
                        ctx_list=[mx.cpu()], batch_axis=0)
                    outputs = []
                    Ls = []
                    from mxnet import autograd as ag
                    with ag.record():
                        for x, y in zip(data, label):
                            z = self.model(x)  # forward
                            L = self.loss(z, y)
                            # store the loss and do backward on a batch for better speed
                            Ls.append(L)
                            outputs.append(z)
                        ag.backward(Ls)
                    self.trainer.step(batch.data[0].shape[0])
                    if self.metrics:
                        self.metrics.update(label, outputs)
                    if not (i + 1) % self.config["log_interval"]:
                        # This would be logged on driver for each worker process.
                        iteration_log = \
                            "Epoch[%d] Batch[%d] Speed: %f samples/sec %s=%f" \
                            % (epoch, i,
                               self.config["batch_size"] /
                               (time.time() - batch_start_time),
                               "loss", Ls[0].asnumpy().mean())
                        if self.metrics:
                            names, accs = self.metrics.get()
                            names, accs = to_list(names), to_list(accs)
                            for name, acc in zip(names, accs):
                                iteration_log += " %s=%f" % (name, acc)
                        self.logger.info(iteration_log)
                    batch_start_time = time.time()
                # Epoch time log
                self.logger.info("[Epoch %d] time cost: %f" %
                                 (epoch, time.time() - epoch_start_time))
                # Epoch metrics log on train data
                if self.metrics:
                    epoch_train_log = "[Epoch %d] training: " % epoch
                    names, accs = self.metrics.get()
                    names, accs = to_list(names), to_list(accs)
                    for name, acc in zip(names, accs):
                        epoch_train_log += "%s=%f " % (name, acc)
                    self.logger.info(epoch_train_log)
                # Epoch metrics log on validation data if any:
                # NOTE(review): the same metrics object is reused (reset) for
                # validation, so training metrics are discarded here per epoch.
                if self.val_data:
                    self.metrics.reset()
                    self.val_data.reset()
                    for batch in self.val_data:
                        data = gluon.utils.split_and_load(
                            batch.data[0].astype("float32", copy=False),
                            ctx_list=[mx.cpu()], batch_axis=0)
                        label = gluon.utils.split_and_load(
                            batch.label[0].astype("float32", copy=False),
                            ctx_list=[mx.cpu()], batch_axis=0)
                        outputs = [self.model(X) for X in data]
                        self.metrics.update(label, outputs)
                    epoch_val_log = "[Epoch %d] validation: " % epoch
                    names, accs = self.metrics.get()
                    names, accs = to_list(names), to_list(accs)
                    for name, acc in zip(names, accs):
                        epoch_val_log += "%s=%f " % (name, acc)
                    self.logger.info(epoch_val_log)
                # TODO: save checkpoints
            if self.metrics:
                names, accs = self.metrics.get()
                names, accs = to_list(names), to_list(accs)
                for name, acc in zip(names, accs):
                    stats[name] = acc
        else:  # Symbolic API
            # TODO: seems no history (i.e. validation accuracy) returned by fit?
            if "init" not in self.config:
                from mxnet.initializer import Uniform
                self.config["init"] = Uniform(0.01)  # This is the default value for MXNet
            self.model.fit(train_data=self.train_data,
                           num_epoch=nb_epoch,
                           initializer=self.config["init"],
                           kvstore=self.kv,
                           optimizer=self.config["optimizer"],
                           optimizer_params=self.config["optimizer_params"],
                           eval_data=self.val_data,
                           # TODO: eval and validation metrics could be different
                           eval_metric=self.metrics,
                           validation_metric=self.metrics,
                           batch_end_callback=mx.callback.Speedometer(
                               self.config["batch_size"],
                               self.config["log_interval"]),
                           epoch_end_callback=None if "model" not in self.config
                           else mx.callback.do_checkpoint(self.config["model"]))
        epoch_time = time.time() - start_time
        stats["epoch_time"] = epoch_time
    return stats
def fit(self, train_data, eval_data=None, eval_metric='acc',
        epoch_end_callback=None, batch_end_callback=None, kvstore='local',
        optimizer='sgd', optimizer_params=(('learning_rate', 0.01), ),
        eval_end_callback=None,
        eval_batch_end_callback=None, initializer=Uniform(0.01),
        arg_params=None, aux_params=None, allow_missing=False,
        force_rebind=False, force_init=False, begin_epoch=0, num_epoch=None,
        validation_metric=None, monitor=None, sparse_row_id_fn=None,
        profile=False):
    """Trains the module parameters.

    Binds the module against ``train_data``'s shapes, initializes
    parameters and the optimizer, then runs the epoch loop with
    one-batch prefetching (``self.prepare`` on the next batch before the
    current one's metrics are collected).

    Parameters
    ----------
    train_data : DataIter
        Training data iterator; must expose ``provide_data`` /
        ``provide_label`` and support ``reset()``.
    eval_data : DataIter
        Accepted for API compatibility, but NOT used by this loop — no
        per-epoch evaluation is performed here.
    eval_metric : str or EvalMetric
        Metric displayed during training (default ``'acc'``).
    epoch_end_callback : function or list of functions
        Each called with ``(epoch, symbol, arg_params, aux_params)``
        at epoch end — but only on kvstore rank 0 (see below).
    batch_end_callback : function or list of function
        Each called with a `BatchEndParam` after every batch.
    kvstore : str or KVStore
        Defaults to ``'local'``.
    optimizer : str or Optimizer
        Defaults to ``'sgd'``.
    optimizer_params : dict
        Constructor kwargs for the optimizer; the default is a tuple of
        pairs only to avoid a mutable default argument.
    eval_end_callback, eval_batch_end_callback : function or list
        Accepted for API compatibility; unused in this body.
    initializer : Initializer
        Used to initialize parameters not covered by
        `arg_params`/`aux_params`.
    arg_params, aux_params : dict
        Pre-trained parameters; take priority over `initializer`.
    allow_missing : bool
        Allow `arg_params`/`aux_params` to be partial; missing entries
        fall back to `initializer`.
    force_rebind, force_init : bool
        Force re-bind of executors / re-init of parameters.
    begin_epoch, num_epoch : int
        Epoch range ``[begin_epoch, num_epoch)``; `num_epoch` is
        mandatory.
    validation_metric
        Accepted for API compatibility; only used as a fallback target
        for `eval_metric` and never evaluated here.
    monitor : Monitor
        If given, installed and ticked around every forward/backward.
    sparse_row_id_fn : callable
        Maps a data batch to the row-sparse param rows to pull; passed
        through to ``self.prepare``.
    profile : bool
        If True, dump MXNet profiler output after the 10th batch.
    """
    assert num_epoch is not None, 'please specify number of epochs'

    self.bind(data_shapes=train_data.provide_data, label_shapes=train_data.provide_label,
              for_training=True, force_rebind=force_rebind)
    if monitor is not None:
        self.install_monitor(monitor)
    self.init_params(initializer=initializer, arg_params=arg_params, aux_params=aux_params,
                     allow_missing=allow_missing, force_init=force_init)
    self.init_optimizer(kvstore=kvstore, optimizer=optimizer, optimizer_params=optimizer_params)

    if validation_metric is None:
        validation_metric = eval_metric
    if not isinstance(eval_metric, metric.EvalMetric):
        eval_metric = metric.create(eval_metric)

    ################################################################################
    # training loop
    ################################################################################
    for epoch in range(begin_epoch, num_epoch):
        tic = time.time()
        eval_metric.reset()
        nbatch = 0
        data_iter = iter(train_data)
        end_of_batch = False
        # Prefetch pattern: always hold the *next* batch so `prepare`
        # can run before the batch is consumed.
        next_data_batch = next(data_iter)
        while not end_of_batch:
            data_batch = next_data_batch
            if monitor is not None:
                monitor.tic()
            self.forward_backward(data_batch)
            self.update()

            # A list batch means pre-sliced per-device data.
            if isinstance(data_batch, list):
                self.update_metric(eval_metric, [db.label for db in data_batch], pre_sliced=True)
            else:
                self.update_metric(eval_metric, data_batch.label)

            try:
                # pre fetch next batch
                next_data_batch = next(data_iter)
                self.prepare(next_data_batch, sparse_row_id_fn=sparse_row_id_fn)
            except StopIteration:
                end_of_batch = True

            if monitor is not None:
                monitor.toc_print()

            # Snapshot the epoch's metric values on the last batch; this
            # always runs before the read below because the loop only
            # exits with end_of_batch == True.
            if end_of_batch:
                eval_name_vals = eval_metric.get_name_value()

            if batch_end_callback is not None:
                batch_end_params = BatchEndParam(epoch=epoch, nbatch=nbatch,
                                                 eval_metric=eval_metric,
                                                 locals=locals())
                for callback in _as_list(batch_end_callback):
                    callback(batch_end_params)
            nbatch += 1
            # One-shot profiling window: dump after the 10th batch only.
            if profile is True and nbatch == 10:
                self.logger.info("Profiling ends")
                import mxnet as mx
                mx.profiler.dump()

        # one epoch of training is finished
        for name, val in eval_name_vals:
            self.logger.info('Epoch[%d] Train-%s=%f', epoch, name, val)
        toc = time.time()
        self.logger.info('Epoch[%d] Time cost=%.3f', epoch, (toc - tic))

        # sync aux params across devices
        arg_params, aux_params = self.get_params()
        self.set_params(arg_params, aux_params)

        # Only rank 0 runs epoch-end callbacks (e.g. checkpointing) to
        # avoid every worker writing the same files.
        if epoch_end_callback is not None and self._kvstore.rank == 0:
            for callback in _as_list(epoch_end_callback):
                callback(epoch, self.symbol, arg_params, aux_params)

        # end of 1 epoch, reset the data-iter for another epoch
        train_data.reset()
def net_initialize(net, model_ctx,
                   initializer: (str, Initializer, dict, list) = mx.init.Xavier(),
                   select=None, logger=logging, verbose=False, force_reinit=False):
    """Initialize network parameters.

    Dispatches on the type of ``initializer``:

    * ``str`` — looked up in a fixed table (``"xavier"``, ``"uniform"``,
      ``"normal"``; the historical misspelling ``"xaiver"`` is kept for
      backward compatibility).
    * ``Initializer`` or ``None`` — applied directly via
      ``net.collect_params(select).initialize(...)``.
    * ``dict`` of ``{select_regex: initializer}`` — recurse once per entry.
    * ``list``/``tuple`` — either zipped against ``select`` (same length
      required) or treated as ``(select, initializer)`` pairs.

    Parameters
    ----------
    net
        The gluon block whose parameters are initialized.
    model_ctx: mx.cpu or mx.gpu
        Context(s) the parameters are placed on.
    initializer: str, Initializer, dict or list, tuple
        See dispatch rules above.
    select
        Regex (or list of regexes) passed to ``net.collect_params``.
    logger
        Logger used for the informational message.
    verbose : bool, default False
        Whether to verbosely print out details on initialization.
    force_reinit : bool, default False
        Whether to force re-initialization if parameter is already initialized.

    Raises
    ------
    TypeError
        If ``initializer`` is none of the supported types.
    KeyError
        If a string ``initializer`` is not one of the known names.

    Notes
    ------
    The developer who modify this document should simultaneously modify
    the related function in glue

    Examples
    --------
    >>> import mxnet as mx
    >>> from mxnet import gluon
    >>> emb = gluon.nn.Embedding(2, 3)
    >>> net_initialize(emb, mx.cpu())
    >>> emb.weight.data()
    <BLANKLINE>
    [[0.10694504 0.2034123  0.4714563 ]
     [0.7542485  0.2251432  0.7842196 ]]
    <NDArray 2x3 @cpu(0)>
    >>> emb1 = gluon.nn.Embedding(2, 3)
    >>> net_initialize(emb1, mx.cpu(), initializer=mx.init.Xavier())
    >>> emb1.weight.data()
    <BLANKLINE>
    [[ 0.09833419  0.76079047 -0.16726398]
     [ 0.27071452  0.319638   -0.25330698]]
    <NDArray 2x3 @cpu(0)>
    >>> class EmbNet(gluon.nn.HybridBlock):
    ...     def __init__(self, prefix=None, params=None):
    ...         super(EmbNet, self).__init__(prefix, params)
    ...         with self.name_scope():
    ...             self.emb = gluon.nn.Embedding(2, 3)
    ...             self.linear = gluon.nn.Dense(4)
    ...     def hybrid_forward(self, F, x, *args, **kwargs):
    ...         return self.linear(self.emb(x))
    >>> net = EmbNet()
    >>> from longling.ML.DL import BLOCK_EMBEDDING
    >>> net_initialize(net, mx.cpu(), initializer={BLOCK_EMBEDDING: "xaiver", ".*embedding": "uniform"})
    >>> net(mx.nd.array([0, 1]))
    <BLANKLINE>
    [[ 0.03268543 -0.00860071  0.04774952  0.00056277]
     [-0.00648303 -0.03121923 -0.04578817 -0.08059631]]
    <NDArray 2x4 @cpu(0)>
    >>> net1 = EmbNet()
    >>> net_initialize(net1, mx.cpu(), initializer=["xaiver", "uniform"], select=[BLOCK_EMBEDDING, ".*embedding"])
    >>> net1(mx.nd.array([0, 1]))  # doctest: +ELLIPSIS
    <BLANKLINE>
    [[-0.0896... -0.0179... -0.0156... -0.0136...]
     [ 0.0033...  0.0255...  0.0111...  0.0446...]]
    <NDArray 2x4 @cpu(0)>
    >>> net_initialize(net1, mx.cpu(), initializer=[(BLOCK_EMBEDDING, "xaiver"), (".*embedding", "uniform")],
    ...     force_reinit=True)
    >>> net1(mx.nd.array([0, 1]))  # doctest: +ELLIPSIS
    <BLANKLINE>
    [[ 0.0153...  0.0266... -0.0466...  0.0291...]
     [-0.0362...  0.0063...  0.0227... -0.0212...]]
    <NDArray 2x4 @cpu(0)>
    """
    if isinstance(initializer, str):
        # "xaiver" is the historical (misspelled) key; it is kept so
        # existing callers keep working, while the correct spelling
        # "xavier" is now accepted as well.
        initializer = {
            "xavier": Xavier(),
            "xaiver": Xavier(),
            "uniform": Uniform(),
            "normal": Normal()
        }[initializer]
    elif isinstance(initializer, dict):
        # {select_regex: initializer} — one recursive call per entry.
        for _select, _initializer in initializer.items():
            net_initialize(net, model_ctx=model_ctx, initializer=_initializer, select=_select,
                           logger=logger, verbose=verbose, force_reinit=force_reinit)
        return
    elif isinstance(initializer, (list, tuple)):
        if select is not None:
            # Parallel lists: select[i] pairs with initializer[i].
            assert len(select) == len(initializer)
            pairs = zip(select, initializer)
        else:
            # List of (select, initializer) pairs.
            pairs = initializer
        for _select, _initializer in pairs:
            net_initialize(net, model_ctx=model_ctx, initializer=_initializer, select=_select,
                           logger=logger, verbose=verbose, force_reinit=force_reinit)
        return
    elif initializer is None or isinstance(initializer, Initializer):
        pass
    else:
        raise TypeError(
            "initializer should be either str or Initializer, now is %s" % type(initializer)
        )
    logger.info("initializer: %s, select: %s, ctx: %s" % (initializer, select, model_ctx))
    net.collect_params(select).initialize(initializer, ctx=model_ctx,
                                          verbose=verbose, force_reinit=force_reinit)
def fit(self, train_data, eval_data=None, eval_metric='acc',
        epoch_end_callback=None, batch_end_callback=None, kvstore='local',
        optimizer='sgd', optimizer_params=(('learning_rate', 0.01),),
        eval_end_callback=None,
        eval_batch_end_callback=None, initializer=Uniform(0.01),
        arg_params=None, aux_params=None, allow_missing=False,
        force_rebind=False, force_init=False, begin_epoch=0, num_epoch=None,
        validation_metric=None, monitor=None, prefix=None,
        batches_checkpoint=None, num_batches_save_ckpt=2000):
    """Train the module parameters.

    Standard Module training loop plus two checkpointing extensions:
    mid-epoch checkpoints every ``num_batches_save_ckpt`` batches (when
    ``batches_checkpoint`` is set) and a per-epoch
    ``save_checkpoint(prefix, ...)`` of the current module (when
    ``prefix`` is set).

    Parameters
    ----------
    train_data : DataIter
    eval_data : DataIter
        If not `None`, used as validation set and scored after each epoch.
    eval_metric : str or EvalMetric
        Default `'acc'`. The performance measure used to display during
        training.
    epoch_end_callback : function or list of function
        Each callback will be called with the current `epoch`, `symbol`,
        `arg_params` and `aux_params`.
    batch_end_callback : function or list of function
        Each callback will be called with a `BatchEndParam`.
    kvstore : str or KVStore
        Default `'local'`.
    optimizer : str or Optimizer
        Default `'sgd'`.
    optimizer_params : dict
        Default `(('learning_rate', 0.01),)` — a tuple of pairs only to
        avoid a mutable default argument.
    eval_end_callback : function or list of function
        Called at the end of each full evaluation with the metrics over
        the entire evaluation set.
    eval_batch_end_callback : function or list of function
        Called at the end of each minibatch during evaluation.
    initializer : Initializer
        Used for parameters not already initialized.
    arg_params, aux_params : dict
        Pre-trained parameters; take priority over `initializer`.
    allow_missing : bool
        Allow missing entries in `arg_params`/`aux_params`; missing ones
        fall back to `initializer`.
    force_rebind, force_init : bool
        Force re-bind of executors / re-init of parameters.
    begin_epoch, num_epoch : int
        Epoch range ``[begin_epoch, num_epoch)``; `num_epoch` mandatory.
    validation_metric
        Metric for the validation pass; defaults to `eval_metric`.
    monitor : Monitor
        If given, installed and ticked around each forward/backward.
    prefix : str
        If not None, save a checkpoint of ``self._curr_module`` after
        every epoch (with optimizer states).
    batches_checkpoint, num_batches_save_ckpt
        If `batches_checkpoint` is not None, run the epoch-end callbacks
        every `num_batches_save_ckpt` batches inside the epoch as well.
    """
    assert num_epoch is not None, 'please specify number of epochs'

    self.bind(data_shapes=train_data.provide_data, label_shapes=train_data.provide_label,
              for_training=True, force_rebind=force_rebind)
    if monitor is not None:
        self.install_monitor(monitor)
    self.init_params(initializer=initializer, arg_params=arg_params, aux_params=aux_params,
                     allow_missing=allow_missing, force_init=force_init)
    self.init_optimizer(kvstore=kvstore, optimizer=optimizer, optimizer_params=optimizer_params)

    if validation_metric is None:
        validation_metric = eval_metric
    if not isinstance(eval_metric, metric.EvalMetric):
        eval_metric = metric.create(eval_metric)

    ################################################################################
    # training loop
    ################################################################################
    for epoch in range(begin_epoch, num_epoch):
        tic = time.time()
        eval_metric.reset()
        for nbatch, data_batch in enumerate(train_data):
            if monitor is not None:
                monitor.tic()
            self.forward_backward(data_batch)
            self.update()
            self.update_metric(eval_metric, data_batch.label)

            if monitor is not None:
                monitor.toc_print()

            if batch_end_callback is not None:
                batch_end_params = BatchEndParam(epoch=epoch, nbatch=nbatch,
                                                 eval_metric=eval_metric,
                                                 locals=locals())
                for callback in _as_list(batch_end_callback):
                    callback(batch_end_params)

            # Mid-epoch checkpoint: reuse the epoch-end callbacks.
            # NOTE(review): this path does not guard against
            # epoch_end_callback being None — if batches_checkpoint is
            # used without an epoch_end_callback, _as_list(None) yields
            # [None] and calling it would raise; confirm callers always
            # pass both together.
            if batches_checkpoint is not None and nbatch != 0 and nbatch % num_batches_save_ckpt == 0:
                for callback in _as_list(epoch_end_callback):
                    callback(epoch, self.symbol, arg_params, aux_params)

        # one epoch of training is finished
        for name, val in eval_metric.get_name_value():
            self.logger.info('Epoch[%d] Train-%s=%f', epoch, name, val)
        toc = time.time()
        self.logger.info('Epoch[%d] Time cost=%.3f', epoch, (toc-tic))

        # sync aux params across devices
        arg_params, aux_params = self.get_params()
        self.set_params(arg_params, aux_params)

        if epoch_end_callback is not None:
            for callback in _as_list(epoch_end_callback):
                callback(epoch, self.symbol, arg_params, aux_params)
        if prefix is not None:
            self._curr_module.save_checkpoint(prefix, epoch + 1, save_optimizer_states=True)

        #----------------------------------------
        # evaluation on validation set
        if eval_data:
            res = self.score(eval_data, validation_metric,
                             score_end_callback=eval_end_callback,
                             batch_end_callback=eval_batch_end_callback, epoch=epoch)
            #TODO: pull this into default
            for name, val in res:
                self.logger.info('Epoch[%d] Validation-%s=%f', epoch, name, val)

        # end of 1 epoch, reset the data-iter for another epoch
        train_data.reset()
def acc_fit(mod, update_batch_size,
            train_data, eval_data=None, eval_metric='acc',
            epoch_end_callback=None, batch_end_callback=None, kvstore='local',
            optimizer='sgd', optimizer_params=(('learning_rate', 0.01),),
            eval_end_callback=None,
            eval_batch_end_callback=None, initializer=Uniform(0.01),
            arg_params=None, aux_params=None, allow_missing=False,
            force_rebind=False, force_init=False, begin_epoch=0, num_epoch=None,
            validation_metric=None, monitor=None):
    """Train with gradient accumulation to emulate a larger batch size.

    Gradients from consecutive iterator batches are accumulated into
    auxiliary arrays (via ``acc_grad_arrays``) and the optimizer update
    is only applied once every ``update_batch_size`` samples (via
    ``set_grad_arrays`` + ``mod.update()``), so memory stays bounded by
    the iterator's batch size.

    Parameters
    ----------
    mod : mx.mod.Module
        The module to train.
    update_batch_size : int
        Effective batch size per optimizer update; should be a multiple
        of ``train_data.batch_size``.
    (remaining keyword arguments)
        Same contract as ``mod.fit``.
    """
    assert num_epoch is not None, 'please specify number of epochs'
    it_batch_size = train_data.batch_size

    mod.bind(data_shapes=train_data.provide_data, label_shapes=train_data.provide_label,
             for_training=True, force_rebind=force_rebind)
    if monitor is not None:
        mod.install_monitor(monitor)
    mod.init_params(initializer=initializer, arg_params=arg_params, aux_params=aux_params,
                    allow_missing=allow_missing, force_init=force_init)
    mod.init_optimizer(kvstore=kvstore, optimizer=optimizer,
                       optimizer_params=optimizer_params)

    if validation_metric is None:
        validation_metric = eval_metric
    if not isinstance(eval_metric, metric.EvalMetric):
        eval_metric = metric.create(eval_metric)

    ################################################################################
    # training loop
    ################################################################################
    arg_acc_grad_arrays = None  # to store auxiliary grad_arrays
    for epoch in range(begin_epoch, num_epoch):
        tic = time.time()
        eval_metric.reset()
        nbatch = 0
        data_iter = iter(train_data)
        end_of_batch = False
        next_data_batch = next(data_iter)
        while not end_of_batch:
            data_batch = next_data_batch
            if monitor is not None:
                monitor.tic()
            mod.forward_backward(data_batch)
            # Fold this batch's gradients into the running accumulator.
            arg_acc_grad_arrays = acc_grad_arrays(mod, arg_acc_grad_arrays)
            # Apply an optimizer step once enough samples accumulated.
            if nbatch * it_batch_size % update_batch_size == 0 and nbatch > 0:
                # NOTE(review): `update_batch_size / it_batch_size` is
                # true division under Python 3, so the scale passed to
                # set_grad_arrays may be a float — confirm this is the
                # intended semantics (vs. integer //).
                set_grad_arrays(
                    mod, arg_acc_grad_arrays,
                    update_batch_size / it_batch_size)  # normsize=1 by default(softmax norm)
                mod.update()
                arg_acc_grad_arrays = None
            try:
                # pre fetch next batch
                next_data_batch = next(data_iter)
                mod.prepare(next_data_batch)
            except StopIteration:
                end_of_batch = True
            mod.update_metric(eval_metric, data_batch.label)
            if monitor is not None:
                monitor.toc_print()
            if batch_end_callback is not None:
                batch_end_params = BatchEndParam(epoch=epoch, nbatch=nbatch,
                                                 eval_metric=eval_metric,
                                                 locals=locals())
                for callback in _as_list(batch_end_callback):
                    callback(batch_end_params)
            nbatch += 1
        # Flush any gradients left over from a partial accumulation window.
        if arg_acc_grad_arrays is not None:
            # left one update...
            set_grad_arrays(mod, arg_acc_grad_arrays, update_batch_size / it_batch_size)
            mod.update()
            arg_acc_grad_arrays = None
        # one epoch of training is finished
        for name, val in eval_metric.get_name_value():
            mod.logger.info('Epoch[%d] Train-%s=%f', epoch, name, val)
        toc = time.time()
        mod.logger.info('Epoch[%d] Time cost=%.3f', epoch, (toc - tic))
        # sync aux params across devices
        arg_params, aux_params = mod.get_params()
        mod.set_params(arg_params, aux_params)
        if epoch_end_callback is not None:
            for callback in _as_list(epoch_end_callback):
                callback(epoch, mod.symbol, arg_params, aux_params)
        #----------------------------------------
        # evaluation on validation set
        if eval_data:
            res = mod.score(eval_data, validation_metric,
                            score_end_callback=eval_end_callback,
                            batch_end_callback=eval_batch_end_callback, epoch=epoch)
            #TODO: pull this into default
            for name, val in res:
                mod.logger.info('Epoch[%d] Validation-%s=%f', epoch, name, val)
        # end of 1 epoch, reset the data-iter for another epoch
        train_data.reset()
def fit(
    self,
    train_data,
    eval_data=None,
    eval_metric="acc",
    epoch_end_callback=None,
    batch_end_callback=None,
    kvstore="local",
    optimizer="sgd",
    optimizer_params=(("learning_rate", 0.01),),
    eval_end_callback=None,
    eval_batch_end_callback=None,
    initializer=Uniform(0.01),
    arg_params=None,
    aux_params=None,
    allow_missing=False,
    force_rebind=False,
    force_init=False,
    begin_epoch=0,
    num_epoch=None,
    validation_metric=None,
    monitor=None,
    prefix=None,
):
    """Train the module parameters.

    Extended Module training loop: runs the epoch-end callbacks once
    before training ("epoch -1"), optionally mirrors learning rate and
    metrics to TensorBoard via mxboard (``config.TRAIN.TENSORBOARD_LOG``),
    periodically logs per-parameter L2 norms, and per batch repeats
    forward/backward ``config.network.TRAIN_ITER_SIZE`` times, refining
    the batch between iterations with ``batchUpdaterPyMulti``.

    Parameters
    ----------
    train_data : DataIter
    eval_data : DataIter
        If not `None`, used as validation set and scored after each epoch.
    eval_metric : str or EvalMetric
        Default `'acc'`.
    epoch_end_callback : function or list of function
        Each called with `epoch`, `symbol`, `arg_params`, `aux_params`;
        additionally called once with epoch ``-1`` before training.
    batch_end_callback : function or list of function
        Each called with a `BatchEndParam`.
    kvstore : str or KVStore
        Default `'local'`.
    optimizer : str or Optimizer
        Default `'sgd'`.
    optimizer_params : dict
        Default `(('learning_rate', 0.01),)` — tuple of pairs only to
        avoid a mutable default.
    eval_end_callback, eval_batch_end_callback : function or list
        Forwarded to ``self.score`` for the validation pass.
    initializer : Initializer
        Used for parameters not already initialized.
    arg_params, aux_params : dict
        Pre-trained parameters; take priority over `initializer`.
    allow_missing, force_rebind, force_init : bool
        See Module.fit.
    begin_epoch, num_epoch : int
        Epoch range; `num_epoch` mandatory.
    validation_metric
        Defaults to `eval_metric`.
    monitor : Monitor
    prefix : str
        Checkpoint/log path prefix; its dirname hosts the TensorBoard
        log directory.
    """
    assert num_epoch is not None, "please specify number of epochs"

    self.bind(
        data_shapes=train_data.provide_data,
        label_shapes=train_data.provide_label,
        for_training=True,
        force_rebind=force_rebind,
    )
    if monitor is not None:
        self.install_monitor(monitor)
    self.init_params(
        initializer=initializer,
        arg_params=arg_params,
        aux_params=aux_params,
        allow_missing=allow_missing,
        force_init=force_init,
    )
    self.init_optimizer(
        kvstore=kvstore, optimizer=optimizer, optimizer_params=optimizer_params
    )

    if validation_metric is None:
        validation_metric = eval_metric
    if not isinstance(eval_metric, metric.EvalMetric):
        eval_metric = metric.create(eval_metric)

    ################################################################################
    # training loop
    ################################################################################
    # epoch 0: fire the epoch-end callbacks once before any training
    # (reported as epoch -1), e.g. to checkpoint the initial weights.
    if epoch_end_callback is not None:
        arg_params, aux_params = self.get_params()
        self.set_params(arg_params, aux_params)
        for callback in _as_list(epoch_end_callback):
            callback(-1, self.symbol, arg_params, aux_params)

    from lib.pair_matching.batch_updater_py_multi import batchUpdaterPyMulti

    config = self.config
    if config.TRAIN.TENSORBOARD_LOG:
        from mxboard import SummaryWriter

        tf_log_dir = os.path.join(
            os.path.dirname(prefix),
            "logs/{}".format(time.strftime("%Y-%m-%d-%H-%M")),
        )
        summ_writer = SummaryWriter(logdir=tf_log_dir)

    # Batch updater refines the data batch between inner iterations
    # (480x640 — presumably the image height/width; confirm upstream).
    interBatchUpdater = batchUpdaterPyMulti(config, 480, 640)
    last_lr = 0
    cur_step = 0
    for epoch in range(begin_epoch, num_epoch):
        tic = time.time()
        eval_metric.reset()
        for nbatch, data_batch in enumerate(train_data):
            if monitor is not None:
                monitor.tic()
            # disp weights L2 norm
            cur_lr = self._curr_module._optimizer._get_lr(0)
            # Roughly every 4000 samples, dump all parameter norms.
            # NOTE(review): 4000 / batch_size is float division in
            # Python 3, so the modulo operates on a float — confirm
            # batch_size divides 4000 or that float periods are intended.
            if nbatch % (4000 / train_data.batch_size) == 0:
                all_params = self._curr_module.get_params()[0]
                all_param_names = all_params.keys()
                all_param_names = sorted(all_param_names)
                print_and_log(prefix, self.logger)
                weight_str = ""
                for view_name in all_param_names:
                    weight_str += "{}: {} ".format(
                        view_name, nd.norm(all_params[view_name]).asnumpy()
                    )
                print_and_log(weight_str, self.logger)
                print_and_log(
                    "batch {}: lr: {}".format(nbatch, cur_lr), self.logger
                )
                if config.TRAIN.TENSORBOARD_LOG:
                    summ_writer.add_scalar(
                        tag="learning_rate", value=cur_lr, global_step=cur_step
                    )
            # Log every learning-rate change as it happens.
            if cur_lr != last_lr:
                print_and_log(
                    "batch {}: lr: {}".format(nbatch, cur_lr), self.logger
                )
                last_lr = cur_lr
                if config.TRAIN.TENSORBOARD_LOG:
                    summ_writer.add_scalar(
                        tag="learning_rate", value=cur_lr, global_step=cur_step
                    )

            # Inner refinement loop: re-run forward/backward on a batch
            # updated from the previous iteration's predictions.
            train_iter_size = config.network.TRAIN_ITER_SIZE
            for iter_idx in range(train_iter_size):
                self.forward_backward(data_batch)
                preds = self._curr_module.get_outputs(False)
                self.update()
                if iter_idx != train_iter_size - 1:
                    data_batch = interBatchUpdater.forward(
                        data_batch, preds, config
                    )
            cur_step += 1
            self.update_metric(eval_metric, data_batch.label)

            if monitor is not None:
                monitor.toc_print()

            if batch_end_callback is not None:
                batch_end_params = BatchEndParam(
                    epoch=epoch,
                    nbatch=nbatch,
                    eval_metric=eval_metric,
                    locals=locals(),
                )
                for callback in _as_list(batch_end_callback):
                    callback(batch_end_params)
            if config.TRAIN.TENSORBOARD_LOG:
                for name, val in eval_metric.get_name_value():
                    summ_writer.add_scalar(
                        tag="BatchTrain-{}".format(name),
                        value=val,
                        global_step=cur_step,
                    )

        # one epoch of training is finished
        for name, val in eval_metric.get_name_value():
            self.logger.info("Epoch[%d] Train-%s=%f", epoch, name, val)
            if config.TRAIN.TENSORBOARD_LOG:
                summ_writer.add_scalar(
                    tag="EpochTrain-{}".format(name), value=val, global_step=epoch
                )
        toc = time.time()
        self.logger.info("Epoch[%d] Time cost=%.3f", epoch, (toc - tic))

        # sync aux params across devices
        arg_params, aux_params = self.get_params()
        self.set_params(arg_params, aux_params)

        if epoch_end_callback is not None:
            for callback in _as_list(epoch_end_callback):
                callback(epoch, self.symbol, arg_params, aux_params)

        # ----------------------------------------
        # evaluation on validation set
        if eval_data:
            res = self.score(
                eval_data,
                validation_metric,
                score_end_callback=eval_end_callback,
                batch_end_callback=eval_batch_end_callback,
                epoch=epoch,
            )
            # TODO: pull this into default
            for name, val in res:
                self.logger.info("Epoch[%d] Validation-%s=%f", epoch, name, val)

        # end of 1 epoch, reset the data-iter for another epoch
        train_data.reset()
def __init__(self, bert, prefix=None, params=None,
             n_rnn_layers=0, rnn_hidden_size=600, num_rnn_layers=1,
             n_dense_layers=0, units_dense=600,
             add_query=False,
             apply_coattention=False, bert_out_dim=768,
             apply_self_attention=False, self_attention_dimension=None,
             n_attention_heads=4,
             apply_transformer=False,
             qanet_style_out=False,
             bidaf_style_out=False,
             remove_special_token=False):
    """Build a BERT-based QA model with optional attention/output heads.

    Parameters
    ----------
    bert : Block
        The (pre-trained) BERT encoder used as the backbone.
    prefix, params
        Forwarded to the HybridBlock constructor.
    n_rnn_layers, rnn_hidden_size, num_rnn_layers
        Number of LSTM blocks in the default span classifier, their
        hidden size, and layers per block.
    n_dense_layers, units_dense
        Number of relu Dense layers in the default span classifier and
        their width.
    add_query, remove_special_token : bool
        Flags stored on the instance; consumed elsewhere (not used in
        this constructor).
    apply_coattention : bool
        If True, add a CoAttention block and one of the special output
        heads below.
    bert_out_dim : int
        Dimensionality of the BERT output (default 768).
    apply_self_attention : bool
        If True, add a multi-head dot-product self-attention cell.
    self_attention_dimension : int or None
        Query/key/value dimension for self-attention; defaults to
        ``bert_out_dim`` when None.
    n_attention_heads : int
        Heads for the self-attention cell.
    apply_transformer : bool
        If True, add a TransformerEncoder block.
    qanet_style_out, bidaf_style_out : bool
        With coattention, choose the QANet-style or BiDAF-style output
        head; when either is chosen the plain span classifier is
        disabled (set to None).
    """
    super(BertForQA, self).__init__(prefix=prefix, params=params)
    # Store configuration flags for use by the forward pass.
    self.add_query = add_query
    self.apply_coattention = apply_coattention
    self.apply_self_attention = apply_self_attention
    self.apply_transformer = apply_transformer
    self.qanet_style_out = qanet_style_out
    self.bidaf_style_out = bidaf_style_out
    self.remove_special_token = remove_special_token
    self.bert = bert
    if self.apply_coattention:
        with self.name_scope():
            #self.co_attention_ = CoAttention("co-attention_", bert_out_dim)
            # try multiple layers
            self.co_attention = CoAttention("co-attention", bert_out_dim)
            if self.qanet_style_out:
                # QANet-style head: projection + transformer model
                # encoder + two Dense start/end predictors.
                self.project = gluon.nn.Dense(
                    units=bert_out_dim,
                    flatten=False,
                    use_bias=False,
                    weight_initializer=Xavier(),
                    prefix='projection_'
                )
                self.dropout = gluon.nn.Dropout(0.1)
                self.model_encoder = TransformerEncoder(units=bert_out_dim)
                self.predict_begin = gluon.nn.Dense(
                    units=1, use_bias=True, flatten=False,
                    weight_initializer=Xavier(
                        rnd_type='uniform', factor_type='in', magnitude=1),
                    bias_initializer=Uniform(1.0/bert_out_dim),
                    prefix='predict_start_'
                )
                self.predict_end = gluon.nn.Dense(
                    units=1, use_bias=True, flatten=False,
                    weight_initializer=Xavier(
                        rnd_type='uniform', factor_type='in', magnitude=1),
                    bias_initializer=Uniform(1.0/bert_out_dim),
                    prefix='predict_end_'
                )
                self.flatten = gluon.nn.Flatten()
            elif self.bidaf_style_out:
                # BiDAF mode
                self.modeling_layer = rnn.LSTM(
                    hidden_size=int(bert_out_dim / 2),
                    num_layers=2,
                    dropout=0.0,
                    bidirectional=True,
                    input_size=int(bert_out_dim * 4))
                self.output_layer = BiDAFOutputLayer(
                    span_start_input_dim=int(bert_out_dim / 2),
                    nlayers=1, dropout=0.2)
                # '''
                # for the cls's encoding
                # used in version 2.0
                # NOTE(review): placement reconstructed from flattened
                # source — cls_mapping appears to belong to the BiDAF
                # branch; confirm against the original layout.
                self.cls_mapping = nn.Dense(
                    units=2,
                    flatten=False,
                    weight_initializer=Xavier(),
                    prefix='cls_mapping_'
                )
                # '''
    if self.apply_self_attention:
        if self_attention_dimension is None:
            self_attention_dimension = bert_out_dim
        with self.name_scope():
            self.multi_head_attention = MultiHeadAttentionCell(
                DotProductAttentionCell(),
                self_attention_dimension, self_attention_dimension,
                self_attention_dimension, n_attention_heads)
    if self.apply_transformer:
        with self.name_scope():
            self.transformer = TransformerEncoder(units=bert_out_dim)
    # The plain span classifier is only built when no special
    # coattention output head (QANet/BiDAF style) is in use.
    if self.apply_coattention and (self.qanet_style_out or self.bidaf_style_out):
        self.span_classifier = None
    else:
        self.span_classifier = nn.HybridSequential()
        with self.span_classifier.name_scope():
            for i in range(n_rnn_layers):
                self.span_classifier.add(rnn.LSTM(
                    hidden_size=rnn_hidden_size,
                    num_layers=num_rnn_layers,
                    dropout=0.0, bidirectional=True))
            for i in range(n_dense_layers):
                self.span_classifier.add(nn.Dense(
                    units=units_dense, flatten=False, activation='relu'))
            # Final layer emits 2 logits per token: span start and end.
            self.span_classifier.add(nn.Dense(units=2, flatten=False))
def train(self, train_data, epochs=1, batch_size=32, validation_data=None,
          train_resize_batch_num=None):
    """Train the model and update the model parameters.

    Worker-side training entry point. ``train_data`` /
    ``validation_data`` are factories called as ``fn(config, kv)`` to
    build the per-worker iterators. Two paths exist: the imperative
    (gluon Trainer) loop when ``self.trainer`` is set, otherwise the
    symbolic ``self.model.fit`` path.

    Parameters
    ----------
    train_data : callable
        ``train_data(config, kv) -> iterator`` building the training data.
    epochs : int
        Number of epochs (default 1).
    batch_size : int
        Used as the config's batch size fallback, the Trainer step size,
        and for throughput logging.
    validation_data : callable or None
        Same contract as ``train_data``; skipped when None.
    train_resize_batch_num : int or None
        If given, stored in the config under ``train_resize_batch_num``.

    Returns
    -------
    list of dict
        A single-element list with this worker's stats (metric values
        and ``epoch_time``); empty dict entries when not a worker.
    """
    stats = dict()
    if self.is_worker:
        config = self.config.copy()
        if "batch_size" not in config:
            config["batch_size"] = batch_size
        if train_resize_batch_num is not None:
            config["train_resize_batch_num"] = train_resize_batch_num
        train_data_iter = train_data(config, self.kv)
        val_data_iter = validation_data(config, self.kv) if validation_data else None

        start_time = time.time()
        if self.trainer:  # Imperative API
            def cpu_context(target_data):
                # Recursively move NDArray(s) onto the CPU context.
                if isinstance(target_data, list):
                    return [cpu_context(d) for d in target_data]
                else:
                    return target_data.as_in_context(mx.cpu())

            for epoch in range(epochs):
                # DataLoader doesn't need to be reset.
                if isinstance(train_data_iter, mx.io.DataIter):
                    train_data_iter.reset()
                if self.eval_metrics:
                    self.eval_metrics.reset()  # metrics will accumulate for one batch.
                batch_start_time = time.time()
                epoch_start_time = time.time()
                for i, batch in enumerate(train_data_iter):
                    data = cpu_context(batch.data)
                    label = cpu_context(batch.label)
                    if not isinstance(data, list):
                        data = [data]
                    if not isinstance(label, list):
                        label = [label]
                    from mxnet import autograd as ag
                    with ag.record():
                        output = self.model(*data)  # forward
                        if not isinstance(output, list):
                            output = [output]
                        Ls = self.loss(*output, *label)
                        ag.backward(Ls)
                    self.trainer.step(batch_size)
                    if self.eval_metrics:
                        self.eval_metrics.update(label, output)
                    if not (i + 1) % self.config["log_interval"]:
                        # This would be logged on driver for each worker process.
                        iteration_log = \
                            "Epoch[%d] Batch[%d]  Speed: %f samples/sec  %s=%f" \
                            % (epoch, i,
                               batch_size / (time.time() - batch_start_time),
                               "loss", Ls.asnumpy().mean())
                        if self.eval_metrics:
                            names, accs = self.eval_metrics.get()
                            names, accs = to_list(names), to_list(accs)
                            for name, acc in zip(names, accs):
                                iteration_log += " %s=%f" % (name, acc)
                        self.logger.info(iteration_log)
                        batch_start_time = time.time()
                # Epoch time log.
                self.logger.info("[Epoch %d] time cost: %f" %
                                 (epoch, time.time() - epoch_start_time))
                # Epoch metrics log on train data.
                if self.eval_metrics:
                    epoch_train_log = "[Epoch %d] training: " % epoch
                    names, accs = self.eval_metrics.get()
                    names, accs = to_list(names), to_list(accs)
                    for name, acc in zip(names, accs):
                        epoch_train_log += "%s=%f  " % (name, acc)
                    self.logger.info(epoch_train_log)
                # Epoch metrics log on validation data if any.
                if val_data_iter:
                    if isinstance(val_data_iter, mx.io.DataIter):
                        val_data_iter.reset()
                    self.val_metrics.reset()
                    for batch in val_data_iter:
                        data = cpu_context(batch.data)
                        label = cpu_context(batch.label)
                        if not isinstance(data, list):
                            data = [data]
                        if not isinstance(label, list):
                            label = [label]
                        output = self.model(*data)
                        if not isinstance(output, list):
                            output = [output]
                        self.val_metrics.update(label, output)
                    epoch_val_log = "[Epoch %d] validation: " % epoch
                    names, accs = self.val_metrics.get()
                    names, accs = to_list(names), to_list(accs)
                    for name, acc in zip(names, accs):
                        epoch_val_log += "%s=%f  " % (name, acc)
                    self.logger.info(epoch_val_log)
                # TODO: save checkpoints
            # Final training-metric values go into the returned stats.
            if self.eval_metrics:
                names, accs = self.eval_metrics.get()
                names, accs = to_list(names), to_list(accs)
                for name, acc in zip(names, accs):
                    stats[name] = acc
        else:  # Symbolic API
            # TODO: seems no history (i.e. validation accuracy) returned by fit?
            if "init" not in self.config:
                from mxnet.initializer import Uniform
                self.config["init"] = Uniform(
                    0.01)  # This is the default value for MXNet.
            if self.eval_metrics is None:
                self.eval_metrics = 'acc'  # This is the default value for MXNet.
            self.model.fit(
                train_data=train_data_iter,
                num_epoch=epochs,
                initializer=self.config["init"],
                kvstore=self.kv,
                optimizer=self.config["optimizer"],
                optimizer_params=self.config["optimizer_params"],
                eval_data=val_data_iter,
                eval_metric=self.eval_metrics,
                validation_metric=self.val_metrics,
                batch_end_callback=mx.callback.Speedometer(
                    batch_size, self.config["log_interval"]),
                epoch_end_callback=None if "model" not in self.config
                else mx.callback.do_checkpoint(self.config["model"]))
        epoch_time = time.time() - start_time
        stats["epoch_time"] = epoch_time
    return [stats]
def mlperf_fit(self, args, data_loader, epoch_size, eval_metric='acc',
               epoch_end_callback=None, batch_end_callback=None,
               kvstore='local', optimizer='sgd',
               optimizer_params=(('learning_rate', 0.01), ),
               explorer='linear', explorer_params=None,
               eval_end_callback=None, eval_batch_end_callback=None,
               initializer=Uniform(0.01), arg_params=None, aux_params=None,
               allow_missing=False, force_rebind=False, force_init=False,
               begin_epoch=0, num_epoch=None, validation_metric=None,
               monitor=None, sparse_row_id_fn=None, eval_offset=0,
               eval_period=1, accuracy_threshold=1.0):
    """MLPerf training loop with an LR/WD "explorer" schedule.

    Trains for up to ``num_epoch`` epochs, evaluating every ``eval_period``
    epochs starting at ``eval_offset``, and stops early once the
    all-reduced validation accuracy exceeds ``accuracy_threshold``.

    Parameters
    ----------
    args : namespace
        Command-line style options; ``args.kv_store`` and ``args.use_dali``
        are read here, the rest is forwarded to ``data_loader``.
    data_loader : callable
        ``data_loader(args, kvstore)`` returning a
        ``(train_data, eval_data, cval_data)`` triple of data iterators.
    epoch_size : int
        Batches per machine per epoch (used when resizing the training
        iterator under a distributed kvstore).
    explorer : str
        Name of the ``Explorer`` strategy adjusting learning-rate /
        weight-decay coefficients during training.
    explorer_params : dict
        Explorer settings; keys read here: ``explore_freq``,
        ``explore_start_epoch``, ``no_augument_epoch``, ``smooth_decay``.
    accuracy_threshold : float
        Validation accuracy at which the run stops with status 'success'.

    Returns
    -------
    int
        The epoch at which the threshold was met, or ``num_epoch`` if it
        never was (run status 'aborted').
    """
    assert num_epoch is not None, 'please specify number of epochs'
    if monitor is not None:
        self.install_monitor(monitor)
    self.init_optimizer(kvstore=kvstore, optimizer=optimizer,
                        optimizer_params=optimizer_params)
    # This mxnet can not use several optimizers without sgd series.
    explorer = Explorer.create_explorer(name=explorer,
                                        optimizer=self._optimizer,
                                        explorer_params=explorer_params)
    explorer.set_best_coeff(0)
    explorer.set_best_wd_coeff(0)
    explorer.set_best_cg(0)
    exp_freq = explorer_params['explore_freq']
    exp_start_epoch = explorer_params['explore_start_epoch']

    if validation_metric is None:
        validation_metric = eval_metric
    # Wrap the validation metric into a CompositeEvalMetric and append
    # correct/total counters so accuracy can be all-reduced exactly.
    if not isinstance(validation_metric, list):
        validation_metric = [validation_metric]
    validation_metric = mx.metric.create(validation_metric)
    if not isinstance(validation_metric, mx.metric.CompositeEvalMetric):
        vm = mx.metric.CompositeEvalMetric()
        vm.append(validation_metric)
        validation_metric = vm
    for m in [CorrectCount(), TotalCount()]:
        validation_metric.metrics.append(m)
    if not isinstance(eval_metric, mx.metric.EvalMetric):
        eval_metric = mx.metric.create(eval_metric)

    # MPI rank/size from the environment; fall back to single-process
    # defaults when absent or malformed. (Fix: was a bare `except:` which
    # would also swallow KeyboardInterrupt/SystemExit.)
    try:
        world_rank = int(os.environ['OMPI_COMM_WORLD_RANK'])
        world_size = int(os.environ['OMPI_COMM_WORLD_SIZE'])
    except (KeyError, ValueError):
        world_rank = 0
        world_size = 1

    self.prepare_states()
    mx_resnet_print(key=mlperf_constants.INIT_STOP, sync=True)
    mx_resnet_print(key=mlperf_constants.RUN_START, sync=True)

    # data iterators
    (train_data, eval_data, cval_data) = data_loader(args, kvstore)
    if 'dist' in args.kv_store and not 'async' in args.kv_store:
        logging.info('Resizing training data to %d batches per machine',
                     epoch_size)
        # Resize train iter to ensure each machine has the same number of
        # batches per epoch; otherwise dist_sync can hang at the end with
        # one machine waiting for the others.
        if not args.use_dali:
            # NOTE(review): the resized iterator is bound to `train`, not
            # `train_data`, so this resize is currently a no-op — confirm
            # whether `train_data = mx.io.ResizeIter(...)` was intended.
            train = mx.io.ResizeIter(train_data, epoch_size)

    block_epoch_start = begin_epoch
    block_epoch_count = eval_offset + 1 - (begin_epoch % eval_period)
    if block_epoch_count < 0:
        block_epoch_count += eval_period
    mx_resnet_print(key=mlperf_constants.BLOCK_START,
                    metadata={
                        'first_epoch_num': block_epoch_start + 1,
                        'epoch_count': block_epoch_count
                    })

    ############################################################################
    # training loop
    ############################################################################
    for epoch in range(begin_epoch, num_epoch):
        mx_resnet_print(key=mlperf_constants.EPOCH_START,
                        metadata={'epoch_num': epoch + 1})
        tic = time.time()
        eval_metric.reset()
        nbatch = 0
        # After `no_augument_epoch`, switch from the augmented training
        # iterator to the non-augmented (cval) iterator.
        use_normal_data_batch = epoch < explorer_params['no_augument_epoch']
        if not use_normal_data_batch:
            if world_rank == 0:
                self.logger.info('use non-augumented batch')
        end_of_batch = False
        if use_normal_data_batch:
            data_iter = iter(train_data)
            next_data_batch = next(data_iter)
        else:
            cval_iter = iter(cval_data)
            next_cval_batch = next(cval_iter)

        smooth_decay = explorer_params['smooth_decay']
        if not smooth_decay:
            # Step-wise decay: applied once per epoch here rather than
            # per iteration below.
            explorer.apply_lr_decay_epoch(epoch)
            explorer.apply_wd_decay_epoch(epoch)
        # Momentum schedule is epoch-based.
        # NOTE(review): original indentation is ambiguous — assumed to run
        # every epoch regardless of `smooth_decay`; confirm.
        explorer.set_mom(epoch)

        while not end_of_batch:
            if use_normal_data_batch:
                data_batch = next_data_batch
            else:
                cval_batch = next_cval_batch
            if monitor is not None:
                monitor.tic()
            if use_normal_data_batch:
                self.forward_backward(data_batch)
            else:
                self.forward_backward(cval_batch)

            if smooth_decay:
                # Smooth decay: applied every iteration.
                explorer.apply_lr_decay_iter()
                explorer.apply_wd_decay_iter()
            # NOTE(review): warmup/burn-in assumed per-iteration and
            # independent of `smooth_decay`; confirm against Explorer docs.
            explorer.apply_wd_warmup()
            explorer.apply_burn_in()

            # Explore new coefficients on the first batch ever and then
            # every `exp_freq` batches once `exp_start_epoch` is reached.
            use_explorer = (epoch == 0 and nbatch == 0) or \
                (epoch >= exp_start_epoch and nbatch % exp_freq == 0)
            if use_explorer:
                explorer.set_tmp_coeff(world_rank)
                explorer.set_tmp_wd_coeff(world_rank)
                explorer.set_tmp_cg(world_rank)
                explorer.set_best_coeff(0)
                explorer.set_best_wd_coeff(0)
                explorer.set_best_cg(world_rank)
            self.update()

            if use_normal_data_batch:
                if isinstance(data_batch, list):
                    self.update_metric(eval_metric,
                                       [db.label for db in data_batch],
                                       pre_sliced=True)
                else:
                    self.update_metric(eval_metric, data_batch.label)
            else:
                if isinstance(cval_batch, list):
                    self.update_metric(eval_metric,
                                       [db.label for db in cval_batch],
                                       pre_sliced=True)
                else:
                    self.update_metric(eval_metric, cval_batch.label)

            if use_normal_data_batch:
                try:
                    # pre fetch next batch
                    next_data_batch = next(data_iter)
                except StopIteration:
                    end_of_batch = True
            else:
                try:
                    # pre fetch next cval batch
                    next_cval_batch = next(cval_iter)
                except StopIteration:
                    end_of_batch = True
            if use_normal_data_batch:
                if not end_of_batch:
                    self.prepare(next_data_batch,
                                 sparse_row_id_fn=sparse_row_id_fn)
            else:
                if not end_of_batch:
                    self.prepare(next_cval_batch,
                                 sparse_row_id_fn=sparse_row_id_fn)

            if monitor is not None:
                monitor.toc_print()
            if batch_end_callback is not None:
                batch_end_params = BatchEndParam(epoch=epoch,
                                                 nbatch=nbatch,
                                                 eval_metric=eval_metric,
                                                 locals=locals())
                for callback in _as_list(batch_end_callback):
                    callback(batch_end_params)
            nbatch += 1

        mx_resnet_print(key=mlperf_constants.EPOCH_STOP,
                        metadata={"epoch_num": epoch + 1})
        # one epoch of training is finished
        toc = time.time()
        if kvstore:
            if kvstore.rank == 0:
                self.logger.info('Epoch[%d] Time cost=%.3f', epoch,
                                 (toc - tic))
        else:
            self.logger.info('Epoch[%d] Time cost=%.3f', epoch, (toc - tic))

        # sync aux params across devices
        #arg_params, aux_params = self.get_params()
        #self.set_params(arg_params, aux_params)
        if epoch_end_callback is not None:
            for callback in _as_list(epoch_end_callback):
                callback(epoch, self.symbol, arg_params, aux_params)

        #----------------------------------------
        # evaluation on validation set
        if eval_data and epoch >= eval_offset and \
                (epoch - eval_offset) % eval_period == 0:
            mx_resnet_print(key=mlperf_constants.EVAL_START,
                            metadata={'epoch_num': epoch + 1})
            res = self.score(eval_data,
                             validation_metric,
                             score_end_callback=eval_end_callback,
                             batch_end_callback=eval_batch_end_callback,
                             epoch=epoch)
            #TODO: pull this into default
            if kvstore:
                if kvstore.rank == 0:
                    for name, val in res:
                        self.logger.info('Epoch[%d] Validation-%s=%f',
                                         epoch, name, val)
            else:
                for name, val in res:
                    self.logger.info('Epoch[%d] Validation-%s=%f',
                                     epoch, name, val)
            res = dict(res)
            # Exact distributed accuracy: all-reduce correct/total counts,
            # then divide once.
            acc = [res['correct-count'], res['total-count']]
            acc = all_reduce(acc)
            acc = acc[0] / acc[1]
            mx_resnet_print(key=mlperf_constants.EVAL_STOP,
                            metadata={'epoch_num': epoch + 1})
            mx_resnet_print(key=mlperf_constants.EVAL_ACCURACY,
                            val=acc,
                            metadata={'epoch_num': epoch + 1})
            mx_resnet_print(
                key=mlperf_constants.BLOCK_STOP,
                metadata={'first_epoch_num': block_epoch_start + 1})
            if acc > accuracy_threshold:
                mx_resnet_print(key=mlperf_constants.RUN_STOP,
                                metadata={'status': 'success'})
                return epoch
            if epoch < (num_epoch - 1):
                block_epoch_start = epoch + 1
                block_epoch_count = num_epoch - epoch - 1
                if block_epoch_count > eval_period:
                    block_epoch_count = eval_period
                mx_resnet_print(key=mlperf_constants.BLOCK_START,
                                metadata={
                                    'first_epoch_num': block_epoch_start + 1,
                                    'epoch_count': block_epoch_count
                                })

        # end of 1 epoch, reset the data-iter for another epoch
        if use_normal_data_batch:
            train_data.reset()
        else:
            cval_data.reset()

    # Threshold never reached within num_epoch epochs.
    mx_resnet_print(key=mlperf_constants.RUN_STOP,
                    metadata={'status': 'aborted'})
    return num_epoch
def fit(self, train_data, eval_data=None, eval_metric='acc',
        epoch_end_callback=None, batch_end_callback=None, kvstore='local',
        optimizer='sgd', optimizer_params=(('learning_rate', 0.01), ),
        eval_end_callback=None, eval_batch_end_callback=None,
        initializer=Uniform(0.01), arg_params=None, aux_params=None,
        allow_missing=False, force_rebind=False, force_init=False,
        begin_epoch=0, num_epoch=None, best_model_callbacks=None,
        eval_interval=None, validation_metric=None, monitor=None):
    """Train the module, optionally evaluating mid-epoch and keeping the
    best checkpoint.

    Extends the standard ``fit`` contract with:

    best_model_callbacks : callbacks exposing ``is_best(metric)`` and
        ``checkpoint_if_only_best(metric, symbol, arg_params, aux_params)``;
        invoked on mid-epoch and end-of-epoch validation.
    eval_interval : int or None
        If set, run validation every ``eval_interval`` batches in addition
        to the end-of-epoch evaluation.

    Other parameters follow the usual ``BaseModule.fit`` semantics.
    (Cleanup: removed dead locals ``epoch_metric``, ``swa_*`` and
    ``sync_made`` that were assigned but never read.)
    """
    assert num_epoch is not None, 'please specify number of epochs'

    self.bind(data_shapes=train_data.provide_data,
              label_shapes=train_data.provide_label,
              for_training=True,
              force_rebind=force_rebind)
    if monitor is not None:
        self.install_monitor(monitor)
    self.init_params(initializer=initializer,
                     arg_params=arg_params,
                     aux_params=aux_params,
                     allow_missing=allow_missing,
                     force_init=force_init)
    self.init_optimizer(kvstore=kvstore,
                        optimizer=optimizer,
                        optimizer_params=optimizer_params)

    if not isinstance(eval_metric, metric.EvalMetric):
        eval_metric = metric.create(eval_metric)
    if validation_metric is None:
        # Deep copy so validation accumulation never mixes with training.
        validation_metric = copy.deepcopy(eval_metric)

    ################################################################################
    # training loop
    ################################################################################
    for epoch in range(begin_epoch, num_epoch):
        tic_epoch = time.time()
        eval_metric.reset()
        nbatch = 0
        end_of_batch = False
        data_iter = iter(train_data)
        next_data_batch = next(data_iter)
        name_values = []

        while not end_of_batch:
            data_batch = next_data_batch
            if monitor is not None:
                monitor.tic()
            self.forward_backward(data_batch)
            self.update()

            try:
                # pre fetch next batch
                next_data_batch = next(data_iter)
                self.prepare(next_data_batch)
            except StopIteration:
                end_of_batch = True

            self.update_metric(eval_metric, data_batch.label)
            if end_of_batch:
                # Snapshot training metrics before callbacks reset them.
                name_values = eval_metric.get_name_value()

            if monitor is not None:
                monitor.toc_print()

            nbatch += 1

            if batch_end_callback is not None:
                batch_end_params = BatchEndParam(epoch=epoch,
                                                 nbatch=nbatch,
                                                 eval_metric=eval_metric,
                                                 locals=locals())
                for callback in _as_list(batch_end_callback):
                    callback(batch_end_params)
                # Metric restarts after each reporting interval.
                eval_metric.reset()

            # ----------------------------------------
            # evaluation on validation set (mid-epoch, every eval_interval)
            to_go = eval_interval is not None and nbatch % eval_interval == 0
            if to_go and eval_data:
                res = self.score(eval_data,
                                 validation_metric,
                                 score_end_callback=eval_end_callback,
                                 batch_end_callback=eval_batch_end_callback,
                                 epoch=epoch)
                for name, val in res:
                    self.logger.info('Epoch[%d] Batch[%d] Validation-%s=%f',
                                     epoch, nbatch, name, val)
                if best_model_callbacks is not None:
                    for callback in _as_list(best_model_callbacks):
                        if callback.is_best(validation_metric):
                            # sync aux params across devices before saving
                            arg_params, aux_params = self.get_params()
                            callback.checkpoint_if_only_best(
                                validation_metric, self.symbol, arg_params,
                                aux_params)
                            break

        # one epoch of training is finished
        for name, val in name_values:
            self.logger.info('Epoch[%d] Train-%s=%f', epoch + 1, name, val)
        toc_epoch = time.time()
        elapsed = (toc_epoch - tic_epoch)
        avg_speed = float(len(train_data)) / elapsed
        self.logger.info('Epoch[%d] Time cost=%.3f', epoch + 1, elapsed)
        self.logger.info('Epoch[%d] Average speed=%.3f samples/sec',
                         epoch + 1, avg_speed)

        # sync aux params across devices
        arg_params, aux_params = self.get_params()
        self.set_params(arg_params, aux_params)

        if epoch_end_callback is not None:
            for callback in _as_list(epoch_end_callback):
                callback(epoch, self.symbol, arg_params, aux_params)

        # evaluation on validation set (end of epoch)
        if eval_data:
            res = self.score(eval_data,
                             validation_metric,
                             score_end_callback=eval_end_callback,
                             batch_end_callback=eval_batch_end_callback,
                             epoch=epoch + 1)
            for name, val in res:
                self.logger.info('Epoch[%d] Validation-%s=%f', epoch + 1,
                                 name, val)
            if best_model_callbacks is not None:
                for callback in _as_list(best_model_callbacks):
                    callback.checkpoint_if_only_best(validation_metric,
                                                     self.symbol, arg_params,
                                                     aux_params)

        # end of epoch, reset the data-iter for another epoch
        train_data.reset()
def fit(
        self,
        train_data,
        ogdb,
        eval_data=None,
        eval_metric='acc',
        epoch_end_callback=None,
        batch_end_callback=None,
        kvstore='local',
        optimizer='sgd',
        optimizer_params=(
            ('learning_rate', 0.01),
        ),  #,('rescale_grad', 1.0/8.0),), #8 gpu attempt
        eval_end_callback=None,
        iter_size=1,
        eval_batch_end_callback=None,
        initializer=Uniform(0.01),
        arg_params=None,
        aux_params=None,
        allow_missing=False,
        force_rebind=False,
        force_init=False,
        begin_epoch=0,
        num_epoch=None,
        validation_metric=None,
        monitor=None):
    """Ke's revision: add iter_size. Trains the module parameters.

    Variant of ``BaseModule.fit`` with two additions visible in the body:

    * **Gradient accumulation** — binds with ``grad_req='add'`` and calls
      ``self.update()`` only every ``iter_size`` batches, zeroing the
      gradient arrays afterwards.
    * **Annealed resampling** — when the value returned by
      ``epoch_end_callback`` stays below a moving ``target_prec``
      threshold, the next epoch re-trains on a training roidb grown by
      random samples from the original database ``ogdb``
      (via ``sample_roidb`` / ``append_roidb`` / ``AnchorLoader``).

    Parameters
    ----------
    train_data : AnchorLoader
        Train DataIter; its ``roidb`` and constructor attributes are
        reused when building resampled loaders.
    ogdb : list
        Original (full) roidb used as the sampling pool.
    iter_size : int
        Number of batches whose gradients are accumulated before each
        optimizer update.
    eval_metric, epoch_end_callback, batch_end_callback, kvstore,
    optimizer, optimizer_params, eval_end_callback,
    eval_batch_end_callback, initializer, arg_params, aux_params,
    allow_missing, force_rebind, force_init, begin_epoch, num_epoch,
    validation_metric, monitor :
        Standard ``BaseModule.fit`` arguments.
    """
    assert num_epoch is not None, 'please specify number of epochs'

    # grad_req='add' accumulates gradients across batches (required for
    # the iter_size accumulation below).
    self.bind(data_shapes=train_data.provide_data,
              label_shapes=train_data.provide_label,
              for_training=True,
              force_rebind=force_rebind,
              grad_req='add')
    if monitor is not None:
        self.install_monitor(monitor)
    self.init_params(initializer=initializer,
                     arg_params=arg_params,
                     aux_params=aux_params,
                     allow_missing=allow_missing,
                     force_init=force_init)
    self.init_optimizer(kvstore=kvstore,
                        optimizer=optimizer,
                        optimizer_params=optimizer_params)

    if validation_metric is None:
        validation_metric = eval_metric
    if not isinstance(eval_metric, metric.EvalMetric):
        eval_metric = metric.create(eval_metric)

    annealing_steps = 0  # number of current annealing steps in current epoch
    redo_training = 0  # Flag to redo training / resample
    val_list = []  # list of validation results per annealing step
    cur_val = 0
    target_prec = 50
    #Note: we want to identify the best cluster of images / training sets with a low percentage

    ################################################################################
    # training loop
    ################################################################################
    for epoch in range(begin_epoch, num_epoch):
        tic = time.time()
        eval_metric.reset()
        nbatch = 0
        if redo_training:
            annealing_steps = annealing_steps + 1
            self.logger.info('Redoing training to meet criteria = %d',
                             annealing_steps)
            #sroidb = train_data.roidb #passthrough test
            atick = time.time()
            iterdiff = 1.0
            # Check if we've stagnated: compare the mean of the last three
            # validation values against the most recent one.
            if len(val_list) > 2:
                itermean = (val_list[-1] + val_list[-2] + val_list[-3]) / 3
                iterdiff = abs(itermean - val_list[-1])
                self.logger.info('Last 3 samples have diff of: %f', iterdiff)
            if iterdiff < 0.01:
                self.logger.info(
                    'Reached a stagnated annealing criteria, dumping current samples'
                )
                # Do something drastic
                # Lets try to instantly use the original db
                sroidb = ogdb
                # Try to read in another random subset
                #sroidb = sample_roidb(ogdb, 25) # Sample with removal
            else:
                # Continue as usual
                # Select a new random subset
                newroidb = sample_roidb(ogdb, 15)  # Without removal, this is 10%
                # Append old with new
                sroidb = append_roidb(train_data.roidb, newroidb)
            # Create new training data instance by passing most of previous
            # arguments and new random db
            train_data2 = AnchorLoader(
                train_data.feat_sym,
                sroidb,
                train_data.batch_size,
                train_data.shuffle,
                train_data.ctx,
                train_data.work_load_list,
                train_data.feat_stride,
                train_data.anchor_scales,
                train_data.anchor_ratios,
                train_data.aspect_grouping,
                nThreads=default.prefetch_thread_num)
            # Overwrite old train_data with the new one
            train_data = train_data2
            data_iter = iter(train_data)
            atock = time.time()
            self.logger.info('Annealing[%d] Time cost=%.3f', annealing_steps,
                             (atock - atick))
        else:
            # Normal epoch: fresh iterator, annealing state reset and the
            # precision target raised by 5.
            data_iter = iter(train_data)
            annealing_steps = 0
            val_list = []
            #target_prec=cur_val+5
            target_prec = target_prec + 5

        end_of_batch = False
        next_data_batch = next(data_iter)
        while not end_of_batch:
            data_batch = next_data_batch
            if monitor is not None:
                monitor.tic()
            # self.forward_backward(data_batch)
            self.forward(data_batch, is_train=True, grad_req='add')
            self.backward()
            if nbatch % iter_size == 0:  # update every iter_size batches
                self.update()
                # Zero accumulated gradients after the optimizer step
                # (required because grad_req='add').
                for g in self._curr_module._exec_group.grad_arrays:
                    for g1 in g:
                        if g1 is not None:
                            g1[:] = 0.

            try:
                # pre fetch next batch
                next_data_batch = next(data_iter)
                self.prepare(next_data_batch)
            except StopIteration:
                end_of_batch = True

            self.update_metric(eval_metric, data_batch.label)

            if monitor is not None:
                monitor.toc_print()

            if batch_end_callback is not None:
                batch_end_params = BatchEndParam(epoch=epoch,
                                                 nbatch=nbatch,
                                                 eval_metric=eval_metric,
                                                 locals=locals())
                for callback in _as_list(batch_end_callback):
                    callback(batch_end_params)
            nbatch += 1

        # one epoch of training is finished
        for name, val in eval_metric.get_name_value():
            self.logger.info('Epoch[%d] Train-%s=%f', epoch, name, val)
        toc = time.time()
        self.logger.info('Epoch[%d] Time cost=%.3f', epoch, (toc - tic))
        #print('Epoch[%d] Time cost=%.3f', epoch, (toc-tic))

        # sync aux params across devices
        arg_params, aux_params = self.get_params()
        self.set_params(arg_params, aux_params)

        if epoch_end_callback is not None:
            for callback in _as_list(epoch_end_callback):
                cur_val = callback(epoch, self.symbol, arg_params, aux_params)
                # NOTE(review): logs/appends `val` — the last *training*
                # metric value from the loop above — although `cur_val`
                # (the callback's returned validation value) was just
                # computed. The annealing stagnation check then averages
                # these values. Looks like `cur_val` was intended; confirm.
                self.logger.info('Returned Validation=%f', val)
                val_list.append(val)

        #----------------------------------------
        # evaluation on validation set
        if eval_data:
            self.logger.info('Evaluating data')
            res = self.score(eval_data,
                             validation_metric,
                             score_end_callback=eval_end_callback,
                             batch_end_callback=eval_batch_end_callback,
                             epoch=epoch)
            #TODO: pull this into default
            for name, val in res:
                self.logger.info('Epoch[%d] Validation-%s=%f', epoch, name,
                                 val)

        #----------
        # Check epoch if it falls within the validation threshold
        if cur_val < target_prec:
            # Evaluate list of precision/validation results first
            #val_list
            # NOTE(review): bare print of the DataIter — looks like a
            # debugging leftover; confirm.
            print(eval_data)
            #else
            redo_training = 1
            self.logger.info('Retraining data=%f', val)
        else:
            redo_training = 0

        self.logger.info('Annealing steps=%f', annealing_steps)

        # end of 1 epoch, reset the data-iter for another epoch
        train_data.reset()
def fit(self, train_data_list, optimizer_params, batch_end_callback=None,
        kvstore='local', initializer=Uniform(0.01), arg_params=None,
        aux_params=None, allow_missing=False, force_rebind=False,
        force_init=False, begin_epoch=0, num_epoch=None):
    """Train a multi-head (partial-FC style) module over several datasets.

    One DataIter per head is advanced in lockstep; the per-head batches
    are merged with ``self.combine`` and trained as a single batch.
    A head's iterator is transparently reset when it is exhausted, so
    shorter datasets cycle while longer ones finish their pass.

    Parameters
    ----------
    train_data_list : list of DataIter
        One training iterator per head (length must be ``self.head_num``).
    optimizer_params : dict
        Passed to ``self.init_optimizer``.
    batch_end_callback : callable, optional
        Called each batch with a ``self.batch_end_param`` namedtuple
        carrying ``loss_list``, ``epoch``, ``num_update`` and
        ``num_epoch_list``.
    arg_params, aux_params :
        Must both be ``None`` here; parameters come from the initializer
        and the rank-0 broadcast below.
    """
    assert num_epoch is not None, 'please specify number of epochs'
    assert arg_params is None and aux_params is None

    # Collect per-head data/label shapes for the multi-shape bind.
    provide_data_list = []
    provide_label_list = []
    for td in train_data_list:
        provide_data_list.append(td.provide_data)
        provide_label_list.append(td.provide_label)
    self.bind(data_shapes_list=provide_data_list,
              label_shapes_list=provide_label_list,
              for_training=True)
    self.init_params(initializer=initializer,
                     arg_params=arg_params,
                     aux_params=aux_params,
                     allow_missing=allow_missing,
                     force_init=force_init)
    self.init_optimizer(optimizer_params=optimizer_params)

    # Broadcast rank-0 backbone parameters so every worker starts from
    # identical weights.
    _arg_params, _aux_params = self.backbone_module.get_params()
    _arg_params_rank_0 = self.broadcast_parameters(_arg_params)
    _aux_params_rank_0 = self.broadcast_parameters(_aux_params)
    self.backbone_module.set_params(_arg_params_rank_0, _aux_params_rank_0)

    data_end_id = 0  # running count of dataset-exhaustion events

    ################################################################################
    # training loop
    ################################################################################
    # Tracks how many full passes each head's dataset has completed.
    num_epoch_list = [0] * self.head_num

    for epoch in range(begin_epoch, num_epoch):
        nbatch = 0
        end_of_batch = False
        # Fresh iterator per head at the top of each outer epoch.
        data_iter_list = []
        for i in range(self.head_num):
            train_data_list[i].reset()
            data_iter_list.append(iter(train_data_list[i]))
        next_data_batch_list = []
        for i in range(self.head_num):
            next_data_batch_list.append(next(data_iter_list[i]))

        # NOTE(review): `end_of_batch` is never set to True anywhere in
        # this loop — exhausted heads are reset below instead — so this
        # `while` only terminates via an exception or external stop.
        # Confirm whether that is the intended training-forever behavior.
        while not end_of_batch:
            data_batch_list = next_data_batch_list
            data_batch = self.combine(data_batch_list)
            self.forward_backward(data_batch)
            self.update()
            assert not isinstance(data_batch, list)

            # Prefetch the next batch for every head; cycle a head's
            # dataset when it runs out.
            for i in range(self.head_num):
                try:
                    next_data_batch_list[i] = next(data_iter_list[i])
                    self.prepare(next_data_batch_list[i],
                                 sparse_row_id_fn=None)
                except StopIteration:
                    num_epoch_list[i] += 1
                    data_end_id += 1
                    # NOTE(review): once data_end_id passes head_num this
                    # condition is true again and resetting resumes —
                    # verify the != comparison is what was intended.
                    if data_end_id != self.head_num:
                        train_data_list[i].reset()
                        data_iter_list[i] = iter(train_data_list[i])
                        next_data_batch_list[i] = next(data_iter_list[i])
                        logging.info('reset dataset_%d' % i)

            if batch_end_callback is not None:
                batch_end_params = self.batch_end_param(
                    loss_list=self.loss_cache,
                    epoch=epoch,
                    num_update=self.num_update,
                    num_epoch_list=num_epoch_list
                )
                batch_end_callback(batch_end_params)

            nbatch += 1
def mlperf_fit(self, train_data, eval_data=None, eval_metric='acc',
               epoch_end_callback=None, batch_end_callback=None,
               kvstore='local', optimizer='sgd',
               optimizer_params=(('learning_rate', 0.01), ),
               eval_end_callback=None, eval_batch_end_callback=None,
               initializer=Uniform(0.01), arg_params=None, aux_params=None,
               allow_missing=False, force_rebind=False, force_init=False,
               begin_epoch=0, num_epoch=None, validation_metric=None,
               monitor=None, sparse_row_id_fn=None, eval_offset=0,
               eval_period=1, accuracy_threshold=1.0):
    """Train with MLPerf-compliant logging and early stop on accuracy.

    Works like the standard ``fit`` loop but emits ``mlperf_log`` markers,
    runs validation on epochs where ``epoch % eval_period == eval_offset``,
    and returns the current epoch number as soon as the all-reduced
    validation accuracy exceeds ``accuracy_threshold``. Returns
    ``num_epoch`` when the threshold is never reached.
    """
    assert num_epoch is not None, 'please specify number of epochs'

    self.bind(data_shapes=train_data.provide_data,
              label_shapes=train_data.provide_label,
              for_training=True,
              force_rebind=force_rebind)
    if monitor is not None:
        self.install_monitor(monitor)
    self.init_params(initializer=initializer,
                     arg_params=arg_params,
                     aux_params=aux_params,
                     allow_missing=allow_missing,
                     force_init=force_init)
    self.init_optimizer(kvstore=kvstore,
                        optimizer=optimizer,
                        optimizer_params=optimizer_params)

    if validation_metric is None:
        validation_metric = eval_metric

    # Normalize the validation metric into a CompositeEvalMetric and
    # attach correct/total counters so accuracy can be all-reduced.
    if not isinstance(validation_metric, list):
        validation_metric = [validation_metric]
    validation_metric = mx.metric.create(validation_metric)
    if not isinstance(validation_metric, mx.metric.CompositeEvalMetric):
        composite = mx.metric.CompositeEvalMetric()
        composite.append(validation_metric)
        validation_metric = composite
    validation_metric.metrics.append(CorrectCount())
    validation_metric.metrics.append(TotalCount())

    if not isinstance(eval_metric, mx.metric.EvalMetric):
        eval_metric = mx.metric.create(eval_metric)

    mx_resnet_print(key=mlperf_log.TRAIN_LOOP)

    ################################################################################
    # training loop
    ################################################################################
    for epoch in range(begin_epoch, num_epoch):
        mx_resnet_print(key=mlperf_log.TRAIN_EPOCH, val=epoch)
        tic = time.time()
        eval_metric.reset()
        nbatch = 0
        data_iter = iter(train_data)
        end_of_batch = False
        next_data_batch = next(data_iter)

        while not end_of_batch:
            data_batch = next_data_batch
            if monitor is not None:
                monitor.tic()
            self.forward_backward(data_batch)
            self.update()

            # Pre-sliced (list) batches carry one label set per slice.
            if isinstance(data_batch, list):
                self.update_metric(eval_metric,
                                   [db.label for db in data_batch],
                                   pre_sliced=True)
            else:
                self.update_metric(eval_metric, data_batch.label)

            try:
                # Prefetch so data loading overlaps with computation.
                next_data_batch = next(data_iter)
                self.prepare(next_data_batch,
                             sparse_row_id_fn=sparse_row_id_fn)
            except StopIteration:
                end_of_batch = True

            if monitor is not None:
                monitor.toc_print()
            if batch_end_callback is not None:
                batch_end_params = BatchEndParam(epoch=epoch,
                                                 nbatch=nbatch,
                                                 eval_metric=eval_metric,
                                                 locals=locals())
                for callback in _as_list(batch_end_callback):
                    callback(batch_end_params)
            nbatch += 1

        # one epoch of training is finished
        toc = time.time()
        # Only rank 0 logs under a kvstore; without one, always log.
        if not kvstore or kvstore.rank == 0:
            self.logger.info('Epoch[%d] Time cost=%.3f', epoch, (toc - tic))

        # sync aux params across devices
        arg_params, aux_params = self.get_params()
        self.set_params(arg_params, aux_params)

        if epoch_end_callback is not None:
            for callback in _as_list(epoch_end_callback):
                callback(epoch, self.symbol, arg_params, aux_params)

        #----------------------------------------
        # evaluation on validation set
        if eval_data and epoch % eval_period == eval_offset:
            mx_resnet_print(key=mlperf_log.EVAL_START)
            res = self.score(eval_data,
                             validation_metric,
                             score_end_callback=eval_end_callback,
                             batch_end_callback=eval_batch_end_callback,
                             epoch=epoch)
            #TODO: pull this into default
            if not kvstore or kvstore.rank == 0:
                for name, val in res:
                    self.logger.info('Epoch[%d] Validation-%s=%f', epoch,
                                     name, val)
            res = dict(res)
            # Exact distributed accuracy: all-reduce the raw counts first,
            # divide once at the end.
            acc = all_reduce([res['correct-count'], res['total-count']])
            acc = acc[0] / acc[1]
            mx_resnet_print(key=mlperf_log.EVAL_ACCURACY,
                            val={
                                "epoch": epoch,
                                "value": acc
                            })
            mx_resnet_print(key=mlperf_log.EVAL_STOP)
            if acc > accuracy_threshold:
                return epoch

        # end of 1 epoch, reset the data-iter for another epoch
        train_data.reset()

    return num_epoch