def init_params(self,
                 initializer=Uniform(0.01),
                 arg_params=None,
                 aux_params=None,
                 allow_missing=False,
                 force_init=False,
                 allow_extra=False):
     if self.params_initialized and not force_init:
         return
     assert self.binded, 'call bind before initializing the parameters'
      # TODO: initialize the same weights on all worker nodes
     self._curr_module.init_params(initializer=initializer,
                                   arg_params=None,
                                   aux_params=None,
                                   allow_missing=allow_missing,
                                   force_init=force_init,
                                   allow_extra=allow_extra)
      for _module in self._arcface_modules:
          # the ArcFace FC shards always use a Normal initializer,
          # regardless of the `initializer` argument passed in
          _initializer = mx.init.Normal(0.01)
         _module.init_params(initializer=_initializer,
                             arg_params=None,
                             aux_params=None,
                             allow_missing=allow_missing,
                             force_init=force_init,
                             allow_extra=allow_extra)
     self.params_initialized = True
Example #2
    def init_params(self, initializer=Uniform(0.01), arg_params=None, aux_params=None,
                    allow_missing=False, force_init=False):
        """Initialize the parameters and auxiliary states.

        Parameters
        ----------
        initializer : Initializer
            Called to initialize parameters if needed.
        arg_params : dict
            If not None, should be a dictionary of existing arg_params. Initialization
            will be copied from that.
        aux_params : dict
            If not None, should be a dictionary of existing aux_params. Initialization
            will be copied from that.
        allow_missing : bool
            If true, params could contain missing values, and the initializer will be
            called to fill those missing params.
        force_init : bool
            If true, will force re-initialize even if already initialized.
        """
        if self.params_initialized and not force_init:
            warnings.warn("Parameters already initialized and force_init=False. "
                          "init_params call ignored.", stacklevel=2)
            return
        assert self.binded, 'call bind before initializing the parameters'

        def _impl(name, arr, cache):
            """Internal helper for parameter initialization"""
            if cache is not None:
                if name in cache:
                    cache_arr = cache[name]

                    # just in case the cached array is just the target itself
                    if cache_arr is not arr:
                        cache_arr.copyto(arr)
                else:
                    if not allow_missing:
                        raise RuntimeError("%s is not presented" % name)
                    if initializer is not None:
                        initializer(name, arr)
            else:
                initializer(name, arr)

        attrs = self._symbol.attr_dict()

        for name, arr in self._arg_params.items():
            desc = InitDesc(name, attrs.get(name, None))
            _impl(desc, arr, arg_params)

        for name, arr in self._aux_params.items():
            desc = InitDesc(name, attrs.get(name, None))
            _impl(desc, arr, aux_params)

        self.params_initialized = True
        self._params_dirty = False

        # copy the initialized parameters to devices
        self._exec_group.set_params(self._arg_params, self._aux_params)
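
A usage sketch (not from the source; the checkpoint prefix, epoch, and data shape are hypothetical) showing how this `init_params` is typically combined with a loaded checkpoint, with `allow_missing=True` letting the initializer fill any parameters the checkpoint lacks:

import mxnet as mx

sym, arg_params, aux_params = mx.model.load_checkpoint('my-model', 10)
mod = mx.mod.Module(symbol=sym, context=mx.cpu())
mod.bind(data_shapes=[('data', (1, 3, 224, 224))], for_training=False)
# copy checkpoint weights; parameters absent from the checkpoint fall back
# to the Uniform initializer because allow_missing=True
mod.init_params(initializer=mx.init.Uniform(0.01),
                arg_params=arg_params, aux_params=aux_params,
                allow_missing=True)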
Example #3
 def init_params(self, initializer=Uniform(0.01), arg_params=None, aux_params=None,
                 allow_missing=False, force_init=False, allow_extra=False):
     if self.params_initialized and not force_init:
         return
     assert self.binded, 'call bind before initializing the parameters'
      self._curr_module.init_params(initializer=initializer, arg_params=arg_params,
                                    aux_params=aux_params, allow_missing=allow_missing,
                                    force_init=force_init, allow_extra=allow_extra)
     self.params_initialized = True
Example #4
    def init_params(self,
                    initializer=Uniform(0.01),
                    arg_params=None,
                    aux_params=None,
                    allow_missing=False,
                    force_init=False,
                    allow_extra=False):
        """Initializes parameters.

        Parameters
        ----------
        initializer : Initializer
        arg_params : dict
            Default ``None``. Existing parameters. This has higher priority
            than `initializer`.
        aux_params : dict
            Default ``None``. Existing auxiliary states. This has higher priority
            than `initializer`.
        allow_missing : bool
            Allow missing values in `arg_params` and `aux_params` (if not ``None``).
            In this case, missing values will be filled with `initializer`.
        force_init : bool
            Default ``False``.
        allow_extra : bool, optional
            Whether to allow extra parameters that are not needed by the symbol.
            If this is True, no error will be thrown when arg_params or aux_params
            contain extra parameters that are not needed by the executor.
        """
        if self.params_initialized and not force_init:
            return
        assert self.binded, 'call bind before initializing the parameters'

        for module in self._modules:
            module.init_params(initializer=initializer, arg_params=arg_params,
                               aux_params=aux_params, allow_missing=allow_missing,
                               force_init=force_init, allow_extra=allow_extra)

        # make sure we do not have duplicated parameter names
        def _check_name(known_names, new_names, modules, i):
            """Internal function to help checking duplicated names."""
            for name in new_names:
                assert name not in known_names, "Duplicated parameter names: " + \
                    ('name "%s" in layer %d (%s) is already ' % (name, i, type(modules[i]))) + \
                    ('used in layer %d (%s).' % (known_names[name],
                                                 type(modules[known_names[name]])))
                known_names[name] = i

        arg_names = dict()
        aux_names = dict()
        for i_layer, module in enumerate(self._modules):
            arg_params, aux_params = module.get_params()
            _check_name(arg_names, arg_params.keys(), self._modules, i_layer)
            _check_name(aux_names, aux_params.keys(), self._modules, i_layer)

        self.params_initialized = True
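
A small sketch of the priority rules this docstring describes (symbol, names, and shapes are illustrative): entries in `arg_params` override the initializer, `allow_missing` lets the initializer fill the rest, and `allow_extra` tolerates unused keys:

import mxnet as mx

data = mx.sym.Variable('data')
net = mx.sym.FullyConnected(data, num_hidden=2, name='fc')
mod = mx.mod.Module(net, label_names=None, context=mx.cpu())
mod.bind(data_shapes=[('data', (1, 4))], for_training=False)

arg_params = {
    'fc_weight': mx.nd.ones((2, 4)),      # used verbatim, overrides the initializer
    'unused_weight': mx.nd.ones((3, 3)),  # tolerated only because allow_extra=True
}
mod.init_params(initializer=mx.init.Uniform(0.01),  # fills the missing fc_bias
                arg_params=arg_params, allow_missing=True, allow_extra=True)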
Example #5
    def fit(self,
            train_data,
            eval_data=None,
            eval_metric='acc',
            epoch_end_callback=None,
            batch_end_callback=None,
            kvstore='local',
            optimizer='sgd',
            optimizer_params=(('learning_rate', 0.01), ),
            eval_end_callback=None,
            eval_batch_end_callback=None,
            initializer=Uniform(0.01),
            arg_params=None,
            aux_params=None,
            allow_missing=False,
            force_rebind=False,
            force_init=False,
            begin_epoch=0,
            num_epoch=None,
            validation_metric=None,
            monitor=None,
            sparse_row_id_fn=None,
            profile=False):

        assert num_epoch is not None, 'please specify number of epochs'

        self.bind(data_shapes=train_data.provide_data,
                  label_shapes=train_data.provide_label +
                  self.teacher_label_shapes,
                  for_training=True,
                  force_rebind=force_rebind)
        super().fit(force_rebind=False,
                    train_data=train_data,
                    eval_data=eval_data,
                    eval_metric=eval_metric,
                    epoch_end_callback=epoch_end_callback,
                    batch_end_callback=batch_end_callback,
                    kvstore=kvstore,
                    optimizer=optimizer,
                    optimizer_params=optimizer_params,
                    eval_end_callback=eval_end_callback,
                    eval_batch_end_callback=eval_batch_end_callback,
                    initializer=initializer,
                    arg_params=arg_params,
                    aux_params=aux_params,
                    allow_missing=allow_missing,
                    force_init=force_init,
                    begin_epoch=begin_epoch,
                    num_epoch=num_epoch,
                    validation_metric=validation_metric,
                    monitor=monitor,
                    sparse_row_id_fn=sparse_row_id_fn,
                    profile=profile)
Example #6
    def __init__(self, num_dim, **kwargs):
        super(Model, self).__init__(**kwargs)
        wi1 = Uniform(0.25)
        wi2 = Uniform(0.1)
        with self.name_scope():
            self.encoder1 = nn.Dense(num_dim//4, in_units=num_dim, weight_initializer=wi1)
            self.encoder2 = nn.Dense(num_dim//16, in_units=num_dim//4, weight_initializer=wi1)
            self.encoder3 = nn.Dense(num_dim//64, in_units=num_dim//16, weight_initializer=wi2)
            self.encoder4 = nn.Dense(num_dim//256, in_units=num_dim//64, weight_initializer=wi2)
            self.decoder4 = nn.Dense(num_dim//64, in_units=num_dim//256, weight_initializer=wi2)
            self.decoder3 = nn.Dense(num_dim//16, in_units=num_dim//64, weight_initializer=wi2)
            self.decoder2 = nn.Dense(num_dim//4, in_units=num_dim//16, weight_initializer=wi1)
            self.decoder1 = nn.Dense(num_dim, in_units=num_dim//4, weight_initializer=wi1)
        self.layers = [(self.encoder1, self.decoder1),
                       (self.encoder2, self.decoder2),
                       (self.encoder3, self.decoder3),
                       (self.encoder4, self.decoder4)]

        for layer in self.layers:
            self.register_child(layer[0])
            self.register_child(layer[1])
Example #7
def build_model(A, X):
    model = HybridSequential()

    with model.name_scope():
        features, out_units = build_features(A, X)
        model.add(features)
        logger.info("GCN Summary: \n{}".format(model))

        classifier = LogisticRegressor(out_units)
        model.add(classifier)
        logger.info("GCN + LR Summary: \n{}".format(model))

    model.hybridize()
    model.initialize(Uniform(1))

    return model, features
Example #8
def build_model(A, X):
    model = HybridSequential()
    hidden_layer_specs = [(4, 'tanh'), (2, 'tanh')]
    in_units = X.shape[1]

    with model.name_scope():
        features, out_units = build_features(A, X)
        model.add(features)

        classifier = LogisticRegressor(out_units)
        model.add(classifier)

    model.hybridize()
    model.initialize(Uniform(1))

    return model, features
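
A sketch of how these builders would typically be driven; `build_features`, `LogisticRegressor`, and the shapes of `A` and `X` are defined elsewhere in the repo and assumed here:

# A: normalized adjacency matrix, X: node feature NDArray (both assumed)
model, features = build_model(A, X)
preds = model(X)          # predictions from the GCN features + logistic head
embeddings = features(X)  # intermediate node embeddings, useful for inspection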
Example #9
    def init_params(self, initializer=Uniform(0.01), arg_params=None, aux_params=None,
                    allow_missing=False, force_init=False, allow_extra=False):
        assert self.binded
        # backbone
        self.backbone_module.init_params(
            initializer=initializer, arg_params=arg_params,
            aux_params=aux_params, allow_missing=allow_missing,
            force_init=force_init, allow_extra=allow_extra)

        # self.bn_module.init_params(
        #     initializer=initializer, arg_params=arg_params,
        #     aux_params=aux_params, allow_missing=allow_missing,
        #     force_init=force_init, allow_extra=allow_extra)
        self.params_initialized = True
Example #10
    def __init__(self, symbol, ctx=None,
                 num_epoch=None, epoch_size=None, optimizer='sgd',
                 initializer=Uniform(0.01),
                 numpy_batch_size=128,
                 arg_params=None, aux_params=None,
                 allow_extra_params=False,
                 begin_epoch=0,
                 **kwargs):

        if isinstance(symbol, sym.Symbol):
            self.symbol = symbol
            self.sym_gen = None
        else:
            assert callable(symbol)
            self.symbol = None
            self.sym_gen = symbol

        # model parameters
        self.arg_params = arg_params
        self.aux_params = aux_params
        self.allow_extra_params = allow_extra_params

        self.argument_checked = False
        if self.sym_gen is None:
            self._check_arguments()

        # basic configuration
        if ctx is None:
            ctx = [cpu()]
        elif isinstance(ctx, Context):
            ctx = [ctx]
        self.ctx = ctx
        # training parameters
        self.num_epoch = num_epoch
        self.epoch_size = epoch_size
        self.kwargs = kwargs.copy()
        self.optimizer = optimizer
        self.initializer = initializer
        self.numpy_batch_size = numpy_batch_size
        # internal helper state
        self._pred_exec = None
        self.begin_epoch = begin_epoch
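
A construction sketch for this (long-deprecated) FeedForward-style API with a toy softmax symbol; note that extra keyword arguments are kept in `self.kwargs` and forwarded to the optimizer:

import mxnet as mx

data = mx.sym.Variable('data')
net = mx.sym.FullyConnected(data, num_hidden=10)
net = mx.sym.SoftmaxOutput(net, name='softmax')
model = mx.model.FeedForward(net, ctx=mx.cpu(), num_epoch=5,
                             optimizer='sgd',
                             initializer=mx.init.Uniform(0.01),
                             learning_rate=0.1)  # extra kwarg, passed to the optimizer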
Example #11
 def create(symbol,
            X,
            marks,
            e_marks=None,
            y=None,
            ctx=None,
            num_epoch=None,
            epoch_size=None,
            optimizer='sgd',
            initializer=Uniform(0.01),
            eval_data=None,
            eval_metric='acc',
            epoch_end_callback=None,
            batch_end_callback=None,
            time_step_callback=None,
            kvstore='local',
            logger=None,
            work_load_list=None,
            eval_batch_end_callback=None,
            **kwargs):
     """Overwrite"""
     model = Feed(symbol,
                  ctx=ctx,
                  num_epoch=num_epoch,
                  epoch_size=epoch_size,
                  optimizer=optimizer,
                  initializer=initializer,
                  **kwargs)
     model.fit(X,
               y,
               marks,
               e_marks=e_marks,
               eval_data=eval_data,
               eval_metric=eval_metric,
               epoch_end_callback=epoch_end_callback,
               batch_end_callback=batch_end_callback,
               kvstore=kvstore,
               logger=logger,
               work_load_list=work_load_list,
               eval_batch_end_callback=eval_batch_end_callback)
     return model
Example #12
    def init_params(self, initializer=Uniform(0.01), arg_params=None, aux_params=None,
                    allow_missing=False, force_init=False, allow_extra=False):
        if self.params_initialized and not force_init:
            return
        assert self.binded, 'call bind before initializing the parameters'
        # TODO: initialize the same weights on all worker nodes
        self._curr_module.init_params(initializer=initializer, arg_params=arg_params,
                                      aux_params=aux_params, allow_missing=allow_missing,
                                      force_init=force_init, allow_extra=allow_extra)
        # Instead of random init (e.g. mx.init.Normal(0.01)), each ArcFace
        # sub-module loads its FC-weight shard from a pickled file and is
        # initialized with that constant.
        idx = 0
        for _module in self._arcface_modules:
            file_name = '/data/insightface/recognition/models/fc_weights/v5/sec-fcw_1_{}.pkl'.format(str(idx))
            with open(file_name, 'rb') as f:
                tmp_fcw = pickle.load(f)
            print('for debug, local key is', tmp_fcw[0])
            local_fcw = mx.nd.array(tmp_fcw[1])
            assert self._ctx_num_classes == local_fcw.shape[0]
            _initializer = mx.init.Constant(local_fcw)
            idx += 1
            tmp_arg_params = {tmp_fcw[0]: local_fcw}
            _module.init_params(initializer=_initializer, arg_params=tmp_arg_params,
                                aux_params=None, allow_missing=allow_missing,
                                force_init=force_init, allow_extra=allow_extra)
            arg_w, aux_w = _module.get_params()  # fetched for debugging
        self.params_initialized = True
Example #13
    def fit(self, train_data, eval_data=None, eval_metric='acc',
            epoch_end_callback=None, batch_end_callback=None, kvstore='local',
            optimizer='sgd', optimizer_params=(('learning_rate', 0.01),),
            eval_end_callback=None,
            eval_batch_end_callback=None, initializer=Uniform(0.01),
            arg_params=None, aux_params=None, allow_missing=False,
            force_rebind=False, force_init=False, begin_epoch=0, num_epoch=None,
            validation_metric=None, monitor=None, summary_writer = None):

        assert num_epoch is not None, 'please specify number of epochs'
        self.num_batch = 0
        self.writer = summary_writer

        self.bind(data_shapes=train_data.provide_data, label_shapes=train_data.provide_label,
                  for_training=True, force_rebind=force_rebind)

        if monitor is not None:
            self.install_monitor(monitor)
        self.init_params(initializer=initializer, arg_params=arg_params, aux_params=aux_params,
                         allow_missing=allow_missing, force_init=force_init)
        self.init_optimizer(kvstore=kvstore, optimizer=optimizer,
                            optimizer_params=optimizer_params)

        acc_metric = IgnoreAccuracy(output_names=['softmax_output'], label_names=['softmax_label'])
        # acc_metric = metric.Accuracy(output_names=['softmax_output'], label_names=['softmax_label'])
        lmnn_metric = metric.Loss(output_names=['lmnn_output'], label_names=['softmax_label'])

        if validation_metric is None:
            validation_metric = lmnn_metric

        ################################################################################
        # training loop
        ################################################################################
        for epoch in range(begin_epoch, num_epoch):
            tic = time.time()
            acc_metric.reset()
            lmnn_metric.reset()
            # eval_metric.reset()
            for nbatch, data_batch in enumerate(train_data):
                if monitor is not None:
                    monitor.tic()

                self.forward_backward(data_batch)

                self.update()

                self.update_metric(acc_metric, data_batch.label)
                self.update_metric(lmnn_metric, data_batch.label)

                if monitor is not None:
                    monitor.toc_print()

                if batch_end_callback is not None:
                    batch_end_params = BatchEndParam(epoch=epoch, nbatch=nbatch,
                                                     eval_metric=eval_metric,
                                                     locals=locals())
                    for callback in _as_list(batch_end_callback):
                        callback(batch_end_params)


                if self.num_batch % 10 == 0:
                    # print acc_metric.sum_metric, acc_metric.num_inst
                    self.writer.add_scalar('{}/cls_acc'.format('Train'), acc_metric.sum_metric / acc_metric.num_inst,
                                                       self.num_batch)
                    self.writer.add_scalar('{}/lmnn_loss'.format('Train'), lmnn_metric.sum_metric / lmnn_metric.num_inst,
                                                       self.num_batch)

                self.num_batch += 1

            toc = time.time()
            # one epoch of training is finished
            for name, val in acc_metric.get_name_value():
                self.logger.info('Epoch[%d] Accuracy Train-%s=%f', epoch, name, val)
            for name, val in lmnn_metric.get_name_value():
                self.logger.info('Epoch[%d] Lmnn Train-%s=%f', epoch, name, val)
            self.logger.info('Epoch[%d] Time cost=%.3f', epoch, (toc-tic))

            # sync aux params across devices
            arg_params, aux_params = self.get_params()
            self.set_params(arg_params, aux_params)

            if epoch_end_callback is not None:
                for callback in _as_list(epoch_end_callback):
                    callback(epoch, self.symbol, arg_params, aux_params)

            #----------------------------------------
            # evaluation on validation set
            if eval_data:
                res = self.score(eval_data, validation_metric,
                                 score_end_callback=eval_end_callback,
                                 batch_end_callback=eval_batch_end_callback, epoch=epoch)
                #TODO: pull this into default
                for name, val in res:
                    self.logger.info('Epoch[%d] Validation-%s=%f', epoch, name, val)

            # end of 1 epoch, reset the data-iter for another epoch
            train_data.reset()
Example #14
    def fit(self,
            train_data,
            eval_data=None,
            eval_metric='acc',
            epoch_end_callback=None,
            batch_end_callback=None,
            kvstore='local',
            optimizer='sgd',
            optimizer_params=(('learning_rate', 0.01), ),
            eval_end_callback=None,
            eval_batch_end_callback=None,
            initializer=Uniform(0.01),
            arg_params=None,
            aux_params=None,
            allow_missing=False,
            force_rebind=False,
            force_init=False,
            begin_epoch=0,
            num_epoch=None,
            validation_metric=None,
            monitor=None):
        assert num_epoch is not None, 'please specify number of epochs'

        self.bind(data_shapes=train_data.provide_data,
                  label_shapes=train_data.provide_label,
                  for_training=True,
                  force_rebind=force_rebind)
        if monitor is not None:
            self.install_monitor(monitor)
        self.init_params(initializer=initializer,
                         arg_params=arg_params,
                         aux_params=aux_params,
                         allow_missing=allow_missing,
                         force_init=force_init)
        self.init_optimizer(kvstore=kvstore,
                            optimizer=optimizer,
                            optimizer_params=optimizer_params)

        if validation_metric is None:
            validation_metric = eval_metric
        if not isinstance(eval_metric, metric.EvalMetric):
            eval_metric = metric.create(eval_metric)
        #### chris_arg
        # TASK_LIMIT: 0 = no per-task bandwidth limit; 1 = per-task limit,
        # updated every round; 2 = per-task limit, fixed
        if int(os.getenv("TASK_LIMIT", 0)) != 0:
            get_task_cmd = "sh /home/ubuntu/tc.sh -l 1"
        else:
            self.logger.info("no_task_bandwidth_limit")
            get_task_cmd = "sh /home/ubuntu/tc.sh -l 0"
        os.system(get_task_cmd)
        delay_time = float(os.getenv("DELAY_TIME", 0.8))
        ps_upload_bandwidth_part1 = int(os.getenv("PS_UPLOAD_BANDWIDTH1",
                                                  2000))
        worker_upload_bandwidth_part1 = int(
            os.getenv("WORKER_UPLOAD_BANDWIDTH1", 2000))
        ps_upload_bandwidth_part2 = int(os.getenv("PS_UPLOAD_BANDWIDTH2",
                                                  2000))
        worker_upload_bandwidth_part2 = int(
            os.getenv("WORKER_UPLOAD_BANDWIDTH2", 2000))
        tc_command = "sudo tc class change dev {} parent 1: classid 1:3 htb rate {}mbit ceil {}mbit  && sudo tc class change dev {} parent 1: classid 1:4 htb rate {}mbit ceil {}mbit"
        ################################################################################
        # training loop
        ################################################################################
        for epoch in range(begin_epoch, num_epoch):
            tic = time.time()
            eval_metric.reset()
            nbatch = 0
            data_iter = iter(train_data)
            end_of_batch = False
            next_data_batch = next(data_iter)
            while not end_of_batch:
                data_batch = next_data_batch
                if monitor is not None:
                    monitor.tic()
                self.forward(data_batch, is_train=True)
                if int(os.getenv("TASK_LIMIT", 0)) == 1:
                    ##first part bandwidth allocation
                    ndarray.waitall()
                    # self.logger.info("change bandwidth part1:, "+str(time.time()))
                    x = str(ps_upload_bandwidth_part1)
                    y = str(worker_upload_bandwidth_part1)
                    cmd_up = tc_command.format("ens3", x, x, "ens3", y, y)
                    cmd_down = tc_command.format("ifb0", y, y, "ifb0", x, x)
                    os.system(cmd_up)
                    # os.system(cmd_down)
                # self.logger.info("after forward, "+str(time.time()))
                self.backward()
                # self.logger.info("before update: "+str(time.time()))
                self.update()  # executed asynchronously
                if int(os.getenv("TASK_LIMIT", 0)) == 1:
                    x = str(ps_upload_bandwidth_part2)
                    y = str(worker_upload_bandwidth_part2)
                    cmd_up = tc_command.format("ens3", x, x, "ens3", y, y)
                    cmd_down = tc_command.format("ifb0", y, y, "ifb0", x, x)
                    time.sleep(delay_time)
                    ##second part bandwidth allocation
                    # self.logger.info("change bandwidth part2:, "+str(time.time()))
                    os.system(cmd_up)
                    # os.system(cmd_down)
                try:
                    # pre fetch next batch
                    next_data_batch = next(data_iter)
                    self.prepare(next_data_batch)
                except StopIteration:
                    end_of_batch = True

                self.update_metric(eval_metric, data_batch.label)

                if monitor is not None:
                    monitor.toc_print()

                if batch_end_callback is not None:
                    batch_end_params = BatchEndParam(epoch=epoch,
                                                     nbatch=nbatch,
                                                     eval_metric=eval_metric,
                                                     locals=locals())
                    for callback in _as_list(batch_end_callback):
                        callback(batch_end_params)
                nbatch += 1

            # one epoch of training is finished
            for name, val in eval_metric.get_name_value():
                self.logger.info('Epoch[%d] Train-%s=%f', epoch, name, val)
            toc = time.time()
            self.logger.info('Epoch[%d] Time cost=%.3f', epoch, (toc - tic))

            # sync aux params across devices
            arg_params, aux_params = self.get_params()
            self.set_params(arg_params, aux_params)

            if epoch_end_callback is not None:
                for callback in _as_list(epoch_end_callback):
                    callback(epoch, self.symbol, arg_params, aux_params)

            #----------------------------------------
            # evaluation on validation set
            if eval_data:
                res = self.score(eval_data,
                                 validation_metric,
                                 score_end_callback=eval_end_callback,
                                 batch_end_callback=eval_batch_end_callback,
                                 epoch=epoch)
                #TODO: pull this into default
                for name, val in res:
                    self.logger.info('Epoch[%d] Validation-%s=%f', epoch, name,
                                     val)

            # end of 1 epoch, reset the data-iter for another epoch
            train_data.reset()
Example #15
    def create(symbol, X, y=None, ctx=None,
               num_epoch=None, epoch_size=None, optimizer='sgd', initializer=Uniform(0.01),
               eval_data=None, eval_metric='acc',
               epoch_end_callback=None, batch_end_callback=None,
               kvstore='local', logger=None, work_load_list=None,
               eval_batch_end_callback=None, **kwargs):
        """Functional style to create a model.
        This function will be more consistent with functional
        languages such as R, where mutation is not allowed.

        Parameters
        ----------
        symbol : Symbol
            The symbol configuration of computation network.
        X : DataIter
            Training data
        y : numpy.ndarray, optional
            Required if X is a numpy.ndarray.
        ctx : Context or list of Context, optional
            The device context of training and prediction.
            To use multi GPU training, pass in a list of gpu contexts.
        num_epoch : int, optional
            Training parameter, number of training epochs.
        epoch_size : int, optional
            Number of batches in an epoch. By default, it is set to
            ceil(num_train_examples / batch_size).
        optimizer : str or Optimizer, optional
            Training parameter, name or optimizer object for training.
        initializer : initializer function, optional
            Training parameter, the initialization scheme used.
        eval_data : DataIter or numpy.ndarray pair
            If eval_data is a numpy.ndarray pair, it should be (valid_data, valid_label).
        eval_metric : metric.EvalMetric or str or callable
            The evaluation metric: the name of an evaluation metric,
            or a custom evaluation function that returns statistics
            based on a minibatch.
        epoch_end_callback : callable(epoch, symbol, arg_params, aux_states)
            A callback that is invoked at end of each epoch.
            This can be used to checkpoint model each epoch.
        batch_end_callback: callable(epoch)
            A callback that is invoked at the end of each batch,
            typically for printing progress.
        kvstore: KVStore or str, optional
           The KVStore or a string kvstore type: 'local', 'dist_sync', 'dist_async'.
           Defaults to 'local'; usually no need to change for a single machine.
        logger : logging logger, optional
            When not specified, default logger will be used.
        work_load_list : list of float or int, optional
            The list of work load for different devices,
            in the same order as ctx
        """
        model = FeedForward(symbol, ctx=ctx, num_epoch=num_epoch,
                            epoch_size=epoch_size,
                            optimizer=optimizer, initializer=initializer, **kwargs)
        model.fit(X, y, eval_data=eval_data, eval_metric=eval_metric,
                  epoch_end_callback=epoch_end_callback,
                  batch_end_callback=batch_end_callback,
                  kvstore=kvstore,
                  logger=logger,
                  work_load_list=work_load_list,
                  eval_batch_end_callback=eval_batch_end_callback)
        return model
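
A sketch of calling this functional entry point with synthetic data (shapes illustrative; `NDArrayIter`'s default label name `softmax_label` matches the `softmax` output):

import mxnet as mx
import numpy as np

data = mx.sym.Variable('data')
net = mx.sym.SoftmaxOutput(mx.sym.FullyConnected(data, num_hidden=2), name='softmax')
train_iter = mx.io.NDArrayIter(np.random.rand(100, 20).astype('float32'),
                               np.random.randint(0, 2, (100,)), batch_size=10)
model = mx.model.FeedForward.create(net, X=train_iter, ctx=mx.cpu(),
                                    num_epoch=2, optimizer='sgd',
                                    initializer=mx.init.Uniform(0.01))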
Example #16
 def train(self,
           train_data,
           epochs=1,
           batch_size=32,
           validation_data=None,
           train_resize_batch_num=None):
     """Train the model and update the model parameters."""
     stats = dict()
     if self.is_worker:
         from zoo.orca.data.shard import RayPartition
         if isinstance(train_data, RayPartition):
             from zoo.orca.data.utils import ray_partition_get_data_label
             data, label = ray_partition_get_data_label(
                 train_data.get_data(), allow_tuple=False, allow_list=False)
             train_data_iter = mx.io.NDArrayIter(data=data,
                                                 label=label,
                                                 batch_size=batch_size,
                                                 shuffle=True)
             if train_resize_batch_num is not None:
                 train_data_iter = mx.io.ResizeIter(train_data_iter,
                                                    train_resize_batch_num)
             if validation_data:
                 data_val, label_val = ray_partition_get_data_label(
                     validation_data.get_data(),
                     allow_tuple=False,
                     allow_list=False)
                 val_data_iter = mx.io.NDArrayIter(data=data_val,
                                                   label=label_val,
                                                   batch_size=batch_size,
                                                   shuffle=True)
             else:
                 val_data_iter = None
         else:  # data_creator functions; should return Iter or DataLoader
             config = self.config
             if "batch_size" not in config:
                 config["batch_size"] = batch_size
             train_data_iter = train_data(config, self.kv)
             val_data_iter = validation_data(
                 config, self.kv) if validation_data else None
         start_time = time.time()
         if self.trainer:  # Imperative API
             for epoch in range(epochs):
                 train_data_iter.reset()
                  if self.eval_metrics:
                      self.eval_metrics.reset()  # metrics accumulate over one epoch
                 batch_start_time = time.time()
                 epoch_start_time = time.time()
                 for i, batch in enumerate(train_data_iter):
                     data = gluon.utils.split_and_load(
                         batch.data[0].astype("float32"),
                         ctx_list=[mx.cpu()],
                         batch_axis=0)
                     label = gluon.utils.split_and_load(
                         batch.label[0].astype("float32"),
                         ctx_list=[mx.cpu()],
                         batch_axis=0)
                     outputs = []
                     Ls = []
                     from mxnet import autograd as ag
                     with ag.record():
                         for x, y in zip(data, label):
                             z = self.model(x)  # forward
                             L = self.loss(z, y)
                             # store the loss and do backward on a batch for better speed
                             Ls.append(L)
                             outputs.append(z)
                         ag.backward(Ls)
                     self.trainer.step(batch.data[0].shape[0])
                     if self.eval_metrics:
                         self.eval_metrics.update(label, outputs)
                     if not (i + 1) % self.config["log_interval"]:
                         # This would be logged on driver for each worker process.
                         iteration_log = \
                             "Epoch[%d] Batch[%d]  Speed: %f samples/sec  %s=%f" \
                             % (epoch, i,
                                batch_size / (time.time() - batch_start_time),
                                "loss", Ls[0].asnumpy().mean())
                         if self.eval_metrics:
                             names, accs = self.eval_metrics.get()
                             names, accs = to_list(names), to_list(accs)
                             for name, acc in zip(names, accs):
                                 iteration_log += "  %s=%f" % (name, acc)
                         self.logger.info(iteration_log)
                     batch_start_time = time.time()
                 # Epoch time log
                 self.logger.info("[Epoch %d] time cost: %f" %
                                  (epoch, time.time() - epoch_start_time))
                 # Epoch metrics log on train data
                 if self.eval_metrics:
                     epoch_train_log = "[Epoch %d] training: " % epoch
                     names, accs = self.eval_metrics.get()
                     names, accs = to_list(names), to_list(accs)
                     for name, acc in zip(names, accs):
                         epoch_train_log += "%s=%f  " % (name, acc)
                     self.logger.info(epoch_train_log)
                 # Epoch metrics log on validation data if any:
                 if val_data_iter:
                     self.val_metrics.reset()
                     val_data_iter.reset()
                     for batch in val_data_iter:
                         data = gluon.utils.split_and_load(
                             batch.data[0].astype("float32", copy=False),
                             ctx_list=[mx.cpu()],
                             batch_axis=0)
                         label = gluon.utils.split_and_load(
                             batch.label[0].astype("float32", copy=False),
                             ctx_list=[mx.cpu()],
                             batch_axis=0)
                         outputs = [self.model(X) for X in data]
                         self.val_metrics.update(label, outputs)
                     epoch_val_log = "[Epoch %d] validation: " % epoch
                     names, accs = self.val_metrics.get()
                     names, accs = to_list(names), to_list(accs)
                     for name, acc in zip(names, accs):
                         epoch_val_log += "%s=%f  " % (name, acc)
                     self.logger.info(epoch_val_log)
                 # TODO: save checkpoints
             if self.eval_metrics:
                 names, accs = self.eval_metrics.get()
                 names, accs = to_list(names), to_list(accs)
                 for name, acc in zip(names, accs):
                     stats[name] = acc
         else:  # Symbolic API
             # TODO: seems no history (i.e. validation accuracy) returned by fit?
             if "init" not in self.config:
                 from mxnet.initializer import Uniform
                 self.config["init"] = Uniform(
                     0.01)  # This is the default value for MXNet
             if self.eval_metrics is None:
                 self.eval_metrics = 'acc'
             self.model.fit(
                 train_data=train_data_iter,
                 num_epoch=epochs,
                 initializer=self.config["init"],
                 kvstore=self.kv,
                 optimizer=self.config["optimizer"],
                 optimizer_params=self.config["optimizer_params"],
                 eval_data=val_data_iter,
                 eval_metric=self.eval_metrics,
                 validation_metric=self.val_metrics,
                 batch_end_callback=mx.callback.Speedometer(
                     batch_size, self.config["log_interval"]),
                 epoch_end_callback=None if "model" not in self.config else
                 mx.callback.do_checkpoint(self.config["model"]))
         epoch_time = time.time() - start_time
         stats["epoch_time"] = epoch_time
         if isinstance(train_data, RayPartition):
             del train_data
         if validation_data and isinstance(validation_data, RayPartition):
             del validation_data
     return stats
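
The imperative branch above follows the standard Gluon training step; distilled to its core with a toy model and random data (purely illustrative):

import mxnet as mx
from mxnet import gluon, autograd

net = gluon.nn.Dense(1)
net.initialize()
loss_fn = gluon.loss.L2Loss()
trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': 0.01})

x = mx.nd.random.uniform(shape=(8, 4))
y = mx.nd.random.uniform(shape=(8, 1))
with autograd.record():       # record the forward pass for autodiff
    loss = loss_fn(net(x), y)
loss.backward()               # compute gradients
trainer.step(8)               # update, scaling gradients by 1/batch_size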
Example #17
    def __init__(self, **kwargs):
        super(QANet, self).__init__(**kwargs)
        with self.name_scope():
            self.flatten = gluon.nn.Flatten()
            self.dropout = gluon.nn.Dropout(opt.layers_dropout)
            self.char_conv = ConvolutionalEncoder(
                embed_size=opt.char_emb_dim,
                num_filters=opt.char_conv_filters,
                ngram_filter_sizes=opt.char_conv_ngrams,
                conv_layer_activation=None,
                num_highway=0)

        self.highway = gluon.nn.HybridSequential()
        with self.highway.name_scope():
            self.highway.add(
                gluon.nn.Dense(units=opt.emb_encoder_conv_channels,
                               flatten=False,
                               use_bias=False,
                               weight_initializer=Xavier()))
            self.highway.add(
                Highway(input_size=opt.emb_encoder_conv_channels,
                        num_layers=opt.highway_layers,
                        activation='relu',
                        highway_bias=HighwayBias(nonlinear_transform_bias=0.0,
                                                 transform_gate_bias=0.0)))

        self.word_emb = gluon.nn.HybridSequential()
        with self.word_emb.name_scope():
            self.word_emb.add(
                gluon.nn.Embedding(input_dim=opt.word_corpus,
                                   output_dim=opt.word_emb_dim))
            self.word_emb.add(gluon.nn.Dropout(rate=opt.word_emb_dropout))
        self.char_emb = gluon.nn.HybridSequential()
        with self.char_emb.name_scope():
            self.char_emb.add(
                gluon.nn.Embedding(input_dim=opt.character_corpus,
                                   output_dim=opt.char_emb_dim,
                                   weight_initializer=Normal(sigma=0.1)))
            self.char_emb.add(gluon.nn.Dropout(rate=opt.char_emb_dropout))

        with self.name_scope():
            self.emb_encoder = Encoder(
                kernel_size=opt.emb_encoder_conv_kernerl_size,
                num_filters=opt.emb_encoder_conv_channels,
                conv_layers=opt.emb_encoder_num_conv_layers,
                num_heads=opt.emb_encoder_num_head,
                num_blocks=opt.emb_encoder_num_block)

            self.project = gluon.nn.Dense(units=opt.emb_encoder_conv_channels,
                                          flatten=False,
                                          use_bias=False,
                                          weight_initializer=Xavier())

        with self.name_scope():
            self.co_attention = CoAttention()

        with self.name_scope():
            self.model_encoder = Encoder(
                kernel_size=opt.model_encoder_conv_kernel_size,
                num_filters=opt.model_encoder_conv_channels,
                conv_layers=opt.model_encoder_conv_layers,
                num_heads=opt.model_encoder_num_head,
                num_blocks=opt.model_encoder_num_block)

        with self.name_scope():
            self.predict_begin = gluon.nn.Dense(
                units=1,
                use_bias=True,
                flatten=False,
                weight_initializer=Xavier(rnd_type='uniform',
                                          factor_type='in',
                                          magnitude=1),
                bias_initializer=Uniform(1.0 /
                                         opt.model_encoder_conv_channels))
            self.predict_end = gluon.nn.Dense(
                units=1,
                use_bias=True,
                flatten=False,
                weight_initializer=Xavier(rnd_type='uniform',
                                          factor_type='in',
                                          magnitude=1),
                bias_initializer=Uniform(1.0 /
                                         opt.model_encoder_conv_channels))
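
The `Uniform(1.0 / opt.model_encoder_conv_channels)` bias initializer above draws from a symmetric range: mx.init.Uniform(scale) samples uniformly from [-scale, scale]. A quick check (128 stands in for the channel count, which is an assumption here):

import mxnet as mx
from mxnet import gluon
from mxnet.initializer import Uniform

dense = gluon.nn.Dense(1, in_units=4, bias_initializer=Uniform(1.0 / 128))
dense.initialize()
print(dense.bias.data())  # a value drawn uniformly from [-1/128, 1/128]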
Example #18
 def train(self, nb_epoch=1):
     """Train the model and update the model parameters."""
     stats = dict()
     if self.is_worker:
         start_time = time.time()
         if self.trainer:  # Imperative API
             for epoch in range(nb_epoch):
                 self.train_data.reset()
                 if self.metrics:
                      self.metrics.reset()  # metrics accumulate over one epoch
                 batch_start_time = time.time()
                 epoch_start_time = time.time()
                 for i, batch in enumerate(self.train_data):
                     data = gluon.utils.split_and_load(
                         batch.data[0].astype("float32"), ctx_list=[mx.cpu()], batch_axis=0)
                     label = gluon.utils.split_and_load(
                         batch.label[0].astype("float32"), ctx_list=[mx.cpu()], batch_axis=0)
                     outputs = []
                     Ls = []
                     from mxnet import autograd as ag
                     with ag.record():
                         for x, y in zip(data, label):
                             z = self.model(x)  # forward
                             L = self.loss(z, y)
                             # store the loss and do backward on a batch for better speed
                             Ls.append(L)
                             outputs.append(z)
                         ag.backward(Ls)
                     self.trainer.step(batch.data[0].shape[0])
                     if self.metrics:
                         self.metrics.update(label, outputs)
                     if not (i + 1) % self.config["log_interval"]:
                         # This would be logged on driver for each worker process.
                         iteration_log = \
                             "Epoch[%d] Batch[%d]  Speed: %f samples/sec  %s=%f" \
                             % (epoch, i,
                                self.config["batch_size"] / (time.time() - batch_start_time),
                                "loss", Ls[0].asnumpy().mean())
                         if self.metrics:
                             names, accs = self.metrics.get()
                             names, accs = to_list(names), to_list(accs)
                             for name, acc in zip(names, accs):
                                 iteration_log += "  %s=%f" % (name, acc)
                         self.logger.info(iteration_log)
                     batch_start_time = time.time()
                 # Epoch time log
                 self.logger.info("[Epoch %d] time cost: %f" %
                                  (epoch, time.time() - epoch_start_time))
                 # Epoch metrics log on train data
                 if self.metrics:
                     epoch_train_log = "[Epoch %d] training: " % epoch
                     names, accs = self.metrics.get()
                     names, accs = to_list(names), to_list(accs)
                     for name, acc in zip(names, accs):
                         epoch_train_log += "%s=%f  " % (name, acc)
                     self.logger.info(epoch_train_log)
                 # Epoch metrics log on validation data if any:
                 if self.val_data:
                     self.metrics.reset()
                     self.val_data.reset()
                     for batch in self.val_data:
                         data = gluon.utils.split_and_load(
                             batch.data[0].astype("float32", copy=False),
                             ctx_list=[mx.cpu()], batch_axis=0)
                         label = gluon.utils.split_and_load(
                             batch.label[0].astype("float32", copy=False),
                             ctx_list=[mx.cpu()], batch_axis=0)
                         outputs = [self.model(X) for X in data]
                         self.metrics.update(label, outputs)
                     epoch_val_log = "[Epoch %d] validation: " % epoch
                     names, accs = self.metrics.get()
                     names, accs = to_list(names), to_list(accs)
                     for name, acc in zip(names, accs):
                         epoch_val_log += "%s=%f  " % (name, acc)
                     self.logger.info(epoch_val_log)
                 # TODO: save checkpoints
             if self.metrics:
                 names, accs = self.metrics.get()
                 names, accs = to_list(names), to_list(accs)
                 for name, acc in zip(names, accs):
                     stats[name] = acc
         else:  # Symbolic API
             # TODO: seems no history (i.e. validation accuracy) returned by fit?
             if "init" not in self.config:
                 from mxnet.initializer import Uniform
                 self.config["init"] = Uniform(0.01)  # This is the default value for MXNet
             self.model.fit(train_data=self.train_data,
                            num_epoch=nb_epoch,
                            initializer=self.config["init"],
                            kvstore=self.kv,
                            optimizer=self.config["optimizer"],
                            optimizer_params=self.config["optimizer_params"],
                            eval_data=self.val_data,
                            # TODO: eval and validation metrics could be different
                            eval_metric=self.metrics,
                            validation_metric=self.metrics,
                            batch_end_callback=mx.callback.Speedometer(
                                self.config["batch_size"], self.config["log_interval"]),
                            epoch_end_callback=None if "model" not in self.config
                            else mx.callback.do_checkpoint(self.config["model"]))
         epoch_time = time.time() - start_time
         stats["epoch_time"] = epoch_time
     return stats
Example #19
    def fit(self,
            train_data,
            eval_data=None,
            eval_metric='acc',
            epoch_end_callback=None,
            batch_end_callback=None,
            kvstore='local',
            optimizer='sgd',
            optimizer_params=(('learning_rate', 0.01), ),
            eval_end_callback=None,
            eval_batch_end_callback=None,
            initializer=Uniform(0.01),
            arg_params=None,
            aux_params=None,
            allow_missing=False,
            force_rebind=False,
            force_init=False,
            begin_epoch=0,
            num_epoch=None,
            validation_metric=None,
            monitor=None,
            sparse_row_id_fn=None,
            profile=False):
        """Trains the module parameters.
        Checkout `Module Tutorial <http://mxnet.io/tutorials/basic/module.html>`_ to see
        a end-to-end use-case.
        Parameters
        ----------
        train_data : DataIter
            Train DataIter.
        eval_data : DataIter
            If not ``None``, will be used as validation set and the performance
            after each epoch will be evaluated.
        eval_metric : str or EvalMetric
            Defaults to 'accuracy'. The performance measure used to display during training.
            Other possible predefined metrics are:
            'ce' (CrossEntropy), 'f1', 'mae', 'mse', 'rmse', 'top_k_accuracy'.
        epoch_end_callback : function or list of functions
            Each callback will be called with the current `epoch`, `symbol`, `arg_params`
            and `aux_params`.
        batch_end_callback : function or list of function
            Each callback will be called with a `BatchEndParam`.
        kvstore : str or KVStore
            Defaults to 'local'.
        optimizer : str or Optimizer
            Defaults to 'sgd'.
        optimizer_params : dict
            Defaults to ``(('learning_rate', 0.01),)``. The parameters for
            the optimizer constructor.
            The default value is not a dict, just to avoid pylint warning on dangerous
            default values.
        eval_end_callback : function or list of function
            These will be called at the end of each full evaluation, with the metrics over
            the entire evaluation set.
        eval_batch_end_callback : function or list of function
            These will be called at the end of each mini-batch during evaluation.
        initializer : Initializer
            The initializer is called to initialize the module parameters when they are
            not already initialized.
        arg_params : dict
            Defaults to ``None``, if not ``None``, should be existing parameters from a trained
            model or loaded from a checkpoint (previously saved model). In this case,
            the value here will be used to initialize the module parameters, unless they
            are already initialized by the user via a call to `init_params` or `fit`.
            `arg_params` has a higher priority than `initializer`.
        aux_params : dict
            Defaults to ``None``. Similar to `arg_params`, except for auxiliary states.
        allow_missing : bool
            Defaults to ``False``. Indicates whether to allow missing parameters when `arg_params`
            and `aux_params` are not ``None``. If this is ``True``, then the missing parameters
            will be initialized via the `initializer`.
        force_rebind : bool
            Defaults to ``False``. Whether to force rebinding the executors if already bound.
        force_init : bool
            Defaults to ``False``. Indicates whether to force initialization even if the
            parameters are already initialized.
        begin_epoch : int
            Defaults to 0. Indicates the starting epoch. Usually, if resumed from a
            checkpoint saved at a previous training phase at epoch N, then this value should be
            N+1.
        num_epoch : int
            Number of epochs for training.
        sparse_row_id_fn : A callback function
            The function takes `data_batch` as an input and returns a dict of
            str -> NDArray. The resulting dict is used for pulling row_sparse
            parameters from the kvstore, where the str key is the name of the param,
            and the value is the row id of the param to pull.
        """
        assert num_epoch is not None, 'please specify number of epochs'

        self.bind(data_shapes=train_data.provide_data,
                  label_shapes=train_data.provide_label,
                  for_training=True,
                  force_rebind=force_rebind)

        if monitor is not None:
            self.install_monitor(monitor)
        self.init_params(initializer=initializer,
                         arg_params=arg_params,
                         aux_params=aux_params,
                         allow_missing=allow_missing,
                         force_init=force_init)
        self.init_optimizer(kvstore=kvstore,
                            optimizer=optimizer,
                            optimizer_params=optimizer_params)

        if validation_metric is None:
            validation_metric = eval_metric
        if not isinstance(eval_metric, metric.EvalMetric):
            eval_metric = metric.create(eval_metric)

        ################################################################################
        # training loop
        ################################################################################
        for epoch in range(begin_epoch, num_epoch):
            tic = time.time()
            eval_metric.reset()
            nbatch = 0
            data_iter = iter(train_data)
            end_of_batch = False
            next_data_batch = next(data_iter)
            while not end_of_batch:
                data_batch = next_data_batch
                if monitor is not None:
                    monitor.tic()
                self.forward_backward(data_batch)
                self.update()

                if isinstance(data_batch, list):
                    self.update_metric(eval_metric,
                                       [db.label for db in data_batch],
                                       pre_sliced=True)
                else:
                    self.update_metric(eval_metric, data_batch.label)

                try:
                    # pre fetch next batch
                    next_data_batch = next(data_iter)
                    self.prepare(next_data_batch,
                                 sparse_row_id_fn=sparse_row_id_fn)
                except StopIteration:
                    end_of_batch = True

                if monitor is not None:
                    monitor.toc_print()

                if end_of_batch:
                    eval_name_vals = eval_metric.get_name_value()

                if batch_end_callback is not None:
                    batch_end_params = BatchEndParam(epoch=epoch,
                                                     nbatch=nbatch,
                                                     eval_metric=eval_metric,
                                                     locals=locals())
                    for callback in _as_list(batch_end_callback):
                        callback(batch_end_params)
                nbatch += 1

                if profile is True and nbatch == 10:
                    self.logger.info("Profiling ends")
                    import mxnet as mx
                    mx.profiler.dump()

            # one epoch of training is finished
            for name, val in eval_name_vals:
                self.logger.info('Epoch[%d] Train-%s=%f', epoch, name, val)
            toc = time.time()
            self.logger.info('Epoch[%d] Time cost=%.3f', epoch, (toc - tic))

            # sync aux params across devices
            arg_params, aux_params = self.get_params()
            self.set_params(arg_params, aux_params)

            if epoch_end_callback is not None and self._kvstore.rank == 0:
                for callback in _as_list(epoch_end_callback):
                    callback(epoch, self.symbol, arg_params, aux_params)

            # end of 1 epoch, reset the data-iter for another epoch
            train_data.reset()
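A note on the loop above: it always holds the current batch while prefetching the next one, so `prepare` can run before the batch is consumed, and `StopIteration` marks the final iteration. A minimal, framework-free sketch of the same control flow (the names `run_epoch` and `process` are illustrative, not part of the module API):

def run_epoch(batches, process):
    """Iterate with one-batch lookahead, mirroring the training loop above."""
    data_iter = iter(batches)
    end_of_batch = False
    next_batch = next(data_iter)  # assumes at least one batch
    nbatch = 0
    while not end_of_batch:
        batch = next_batch
        process(batch)  # stands in for forward_backward/update
        try:
            next_batch = next(data_iter)  # prefetch; the real loop also calls prepare()
        except StopIteration:
            end_of_batch = True
        nbatch += 1
    return nbatch

assert run_epoch([1, 2, 3], print) == 3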
Example #20
def net_initialize(net,
                   model_ctx,
                   initializer: (str, Initializer, dict,
                                 list) = mx.init.Xavier(),
                   select=None,
                   logger=logging,
                   verbose=False,
                   force_reinit=False):
    """
    初始化网络参数

    Parameters
    ----------
    net
    model_ctx: mx.cpu or mx.gpu
    initializer: str, Initializer, dict or list, tuple
    select
    logger
    verbose : bool, default False
        Whether to verbosely print out details on initialization.
    force_reinit : bool, default False
        Whether to force re-initialization if parameter is already initialized.
    Notes
    ------
    The developer who modify this document should simultaneously modify the related function in glue

    Examples
    --------
    >>> import mxnet as mx
    >>> from mxnet import gluon
    >>> emb = gluon.nn.Embedding(2, 3)
    >>> net_initialize(emb, mx.cpu())
    >>> emb.weight.data()
    <BLANKLINE>
    [[0.10694504 0.2034123  0.4714563 ]
     [0.7542485  0.2251432  0.7842196 ]]
    <NDArray 2x3 @cpu(0)>
    >>> emb1 = gluon.nn.Embedding(2, 3)
    >>> net_initialize(emb1, mx.cpu(), initializer=mx.init.Xavier())
    >>> emb1.weight.data()
    <BLANKLINE>
    [[ 0.09833419  0.76079047 -0.16726398]
     [ 0.27071452  0.319638   -0.25330698]]
    <NDArray 2x3 @cpu(0)>
    >>> class EmbNet(gluon.nn.HybridBlock):
    ...     def __init__(self, prefix=None, params=None):
    ...         super(EmbNet, self).__init__(prefix, params)
    ...         with self.name_scope():
    ...             self.emb = gluon.nn.Embedding(2, 3)
    ...             self.linear = gluon.nn.Dense(4)
    ...     def hybrid_forward(self, F, x, *args, **kwargs):
    ...         return self.linear(self.emb(x))
    >>> net = EmbNet()
    >>> from longling.ML.DL import BLOCK_EMBEDDING
    >>> net_initialize(net, mx.cpu(), initializer={BLOCK_EMBEDDING: "xaiver", ".*embedding": "uniform"})
    >>> net(mx.nd.array([0, 1]))
    <BLANKLINE>
    [[ 0.03268543 -0.00860071  0.04774952  0.00056277]
     [-0.00648303 -0.03121923 -0.04578817 -0.08059631]]
    <NDArray 2x4 @cpu(0)>
    >>> net1 = EmbNet()
    >>> net_initialize(net1, mx.cpu(), initializer=["xaiver", "uniform"], select=[BLOCK_EMBEDDING, ".*embedding"])
    >>> net1(mx.nd.array([0, 1]))  # doctest: +ELLIPSIS
    <BLANKLINE>
    [[-0.0896... -0.0179... -0.0156... -0.0136...]
     [ 0.0033...  0.0255...  0.0111...  0.0446...]]
    <NDArray 2x4 @cpu(0)>
    >>> net_initialize(net1, mx.cpu(), initializer=[(BLOCK_EMBEDDING, "xaiver"), (".*embedding", "uniform")],
    ...     force_reinit=True)
    >>> net1(mx.nd.array([0, 1]))  # doctest: +ELLIPSIS
    <BLANKLINE>
    [[ 0.0153...  0.0266... -0.0466...  0.0291...]
     [-0.0362...  0.0063...  0.0227... -0.0212...]]
    <NDArray 2x4 @cpu(0)>
    """
    if isinstance(initializer, str):
        initializer = {
            "xaiver": Xavier(),
            "uniform": Uniform(),
            "normal": Normal()
        }[initializer]
    elif isinstance(initializer, dict):
        for _select, _initializer in initializer.items():
            net_initialize(net,
                           model_ctx=model_ctx,
                           initializer=_initializer,
                           select=_select,
                           logger=logger,
                           verbose=verbose,
                           force_reinit=force_reinit)
        return
    elif isinstance(initializer, (list, tuple)):
        if select is not None:
            assert len(select) == len(initializer)
            for _select, _initializer in zip(select, initializer):
                net_initialize(net,
                               model_ctx=model_ctx,
                               initializer=_initializer,
                               select=_select,
                               logger=logger,
                               verbose=verbose,
                               force_reinit=force_reinit)
        else:
            for _select, _initializer in initializer:
                net_initialize(net,
                               model_ctx=model_ctx,
                               initializer=_initializer,
                               select=_select,
                               logger=logger,
                               verbose=verbose,
                               force_reinit=force_reinit)
        return
    elif initializer is None or isinstance(initializer, Initializer):
        pass
    else:
        raise TypeError(
            "initializer should be a str, Initializer, dict, list or tuple, "
            "not %s" % type(initializer))

    logger.info("initializer: %s, select: %s, ctx: %s" %
                (initializer, select, model_ctx))
    net.collect_params(select).initialize(initializer,
                                          ctx=model_ctx,
                                          verbose=verbose,
                                          force_reinit=force_reinit)
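For comparison, each dispatch branch above bottoms out in the same plain Gluon call; a minimal sketch of that final step (the `net` and the regex here are illustrative):

import mxnet as mx
from mxnet import gluon

net = gluon.nn.Dense(4)
# equivalent of net_initialize(net, mx.cpu(), initializer=mx.init.Xavier(), select=".*weight")
net.collect_params(".*weight").initialize(mx.init.Xavier(), ctx=mx.cpu(),
                                          verbose=False, force_reinit=False)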
Example #21
    def fit(self, train_data, eval_data=None, eval_metric='acc',
            epoch_end_callback=None, batch_end_callback=None, kvstore='local',
            optimizer='sgd', optimizer_params=(('learning_rate', 0.01),),
            eval_end_callback=None,
            eval_batch_end_callback=None, initializer=Uniform(0.01),
            arg_params=None, aux_params=None, allow_missing=False,
            force_rebind=False, force_init=False, begin_epoch=0, num_epoch=None,
            validation_metric=None, monitor=None, prefix=None,
            batches_checkpoint=None, num_batches_save_ckpt=2000):
        """Train the module parameters.

        Parameters
        ----------
        train_data : DataIter
        eval_data : DataIter
            If not `None`, will be used as validation set and evaluate the performance
            after each epoch.
        eval_metric : str or EvalMetric
            Default `'acc'`. The performance measure used to display during training.
        epoch_end_callback : function or list of function
            Each callback will be called with the current `epoch`, `symbol`, `arg_params`
            and `aux_params`.
        batch_end_callback : function or list of function
            Each callback will be called with a `BatchEndParam`.
        kvstore : str or KVStore
            Default `'local'`.
        optimizer : str or Optimizer
            Default `'sgd'`
        optimizer_params : dict
            Default `(('learning_rate', 0.01),)`. The parameters for the optimizer constructor.
            The default value is not a `dict`, just to avoid pylint warning on dangerous
            default values.
        eval_end_callback : function or list of function
            These will be called at the end of each full evaluation, with the metrics over
            the entire evaluation set.
        eval_batch_end_callback : function or list of function
            These will be called at the end of each minibatch during evaluation.
        initializer : Initializer
            Will be called to initialize the module parameters if not already initialized.
        arg_params : dict
            Default `None`, if not `None`, should be existing parameters from a trained
            model or loaded from a checkpoint (previously saved model). In this case,
            the value here will be used to initialize the module parameters, unless they
            are already initialized by the user via a call to `init_params` or `fit`.
            `arg_params` takes precedence over `initializer`.
        aux_params : dict
            Default `None`. Similar to `arg_params`, except for auxiliary states.
        allow_missing : bool
            Default `False`. Indicate whether we allow missing parameters when `arg_params`
            and `aux_params` are not `None`. If this is `True`, then the missing parameters
            will be initialized via the `initializer`.
        force_rebind : bool
            Default `False`. Whether to force rebinding the executors if already bound.
        force_init : bool
            Default `False`. Indicate whether we should force initialization even if the
            parameters are already initialized.
        begin_epoch : int
            Default `0`. Indicate the starting epoch. Usually, if we are resuming from a
            checkpoint saved at a previous training phase at epoch N, then we should specify
            this value as N+1.
        num_epoch : int
            Number of epochs to run training.

        Examples
        --------
        An example of using fit for training::
            >>> #Assume training dataIter and validation dataIter are ready
            >>> mod.fit(train_data=train_dataiter, eval_data=val_dataiter,
                        optimizer_params={'learning_rate':0.01, 'momentum': 0.9},
                        num_epoch=10)
        """
        assert num_epoch is not None, 'please specify number of epochs'

        self.bind(data_shapes=train_data.provide_data, label_shapes=train_data.provide_label,
                  for_training=True, force_rebind=force_rebind)
        if monitor is not None:
            self.install_monitor(monitor)
        self.init_params(initializer=initializer, arg_params=arg_params, aux_params=aux_params,
                         allow_missing=allow_missing, force_init=force_init)
        self.init_optimizer(kvstore=kvstore, optimizer=optimizer,
                            optimizer_params=optimizer_params)

        if validation_metric is None:
            validation_metric = eval_metric
        if not isinstance(eval_metric, metric.EvalMetric):
            eval_metric = metric.create(eval_metric)

        ################################################################################
        # training loop
        ################################################################################
        for epoch in range(begin_epoch, num_epoch):
            tic = time.time()
            eval_metric.reset()
            for nbatch, data_batch in enumerate(train_data):
                if monitor is not None:
                    monitor.tic()
                self.forward_backward(data_batch)
                self.update()
                self.update_metric(eval_metric, data_batch.label)
                if monitor is not None:
                    monitor.toc_print()

                if batch_end_callback is not None:
                    batch_end_params = BatchEndParam(epoch=epoch, nbatch=nbatch,
                                                     eval_metric=eval_metric,
                                                     locals=locals())
                    for callback in _as_list(batch_end_callback):
                        callback(batch_end_params)

                if batches_checkpoint is not None and nbatch != 0 and nbatch % num_batches_save_ckpt == 0:
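                    # mid-epoch checkpoint: every num_batches_save_ckpt batches,
                    # the epoch-end callbacks are reused to save an intermediate
                    # snapshot (arg_params/aux_params are the values passed to
                    # fit until the first epoch completes)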
                    for callback in _as_list(epoch_end_callback):
                        callback(epoch, self.symbol, arg_params, aux_params)

            # one epoch of training is finished
            for name, val in eval_metric.get_name_value():
                self.logger.info('Epoch[%d] Train-%s=%f', epoch, name, val)
            toc = time.time()
            self.logger.info('Epoch[%d] Time cost=%.3f', epoch, (toc-tic))

            # sync aux params across devices
            arg_params, aux_params = self.get_params()
            self.set_params(arg_params, aux_params)

            if epoch_end_callback is not None:
                for callback in _as_list(epoch_end_callback):
                    callback(epoch, self.symbol, arg_params, aux_params)
            if prefix is not None:
                self._curr_module.save_checkpoint(prefix, epoch + 1, save_optimizer_states=True)

            #----------------------------------------
            # evaluation on validation set
            if eval_data:
                res = self.score(eval_data, validation_metric,
                                 score_end_callback=eval_end_callback,
                                 batch_end_callback=eval_batch_end_callback, epoch=epoch)
                #TODO: pull this into default
                for name, val in res:
                    self.logger.info('Epoch[%d] Validation-%s=%f', epoch, name, val)

            # end of 1 epoch, reset the data-iter for another epoch
            train_data.reset()
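The `prefix` and mid-epoch checkpoint hooks above pair naturally with MXNet's stock callbacks. A hedged end-to-end sketch against the standard `mx.mod.Module.fit` (the toy network, the file prefix 'toy', and the shapes are all illustrative):

import mxnet as mx

# toy data and a 2-class softmax net, just to exercise the fit plumbing
data = mx.nd.random.uniform(shape=(64, 10))
label = mx.nd.array([i % 2 for i in range(64)])
train_iter = mx.io.NDArrayIter(data, label, batch_size=8, label_name='softmax_label')

x = mx.sym.Variable('data')
net = mx.sym.FullyConnected(x, num_hidden=2)
net = mx.sym.SoftmaxOutput(net, name='softmax')

mod = mx.mod.Module(net, data_names=['data'], label_names=['softmax_label'])
mod.fit(train_iter,
        num_epoch=2,
        optimizer_params={'learning_rate': 0.01},
        epoch_end_callback=mx.callback.do_checkpoint('toy'),   # writes toy-000N.params
        batch_end_callback=mx.callback.Speedometer(8, frequent=4))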
Example #22
def acc_fit(mod, update_batch_size,\
            train_data, eval_data=None, eval_metric='acc',
            epoch_end_callback=None, batch_end_callback=None, kvstore='local',
            optimizer='sgd', optimizer_params=(('learning_rate', 0.01),),
            eval_end_callback=None,
            eval_batch_end_callback=None, initializer=Uniform(0.01),
            arg_params=None, aux_params=None, allow_missing=False,
            force_rebind=False, force_init=False, begin_epoch=0, num_epoch=None,
            validation_metric=None, monitor=None):
    """
    this function aims to support training in larger input size by
                allocating additive space to store auxiliary grads
        mod:
            mx.mod.Module
        update_batch:
            int, specifying how many batches between two updates
        **arg_keys:
            same as mod.fit
        """
    assert num_epoch is not None, 'please specify number of epochs'
    it_batch_size = train_data.batch_size

    mod.bind(data_shapes=train_data.provide_data,
             label_shapes=train_data.provide_label,
             for_training=True,
             force_rebind=force_rebind)
    if monitor is not None:
        mod.install_monitor(monitor)
    mod.init_params(initializer=initializer,
                    arg_params=arg_params,
                    aux_params=aux_params,
                    allow_missing=allow_missing,
                    force_init=force_init)
    mod.init_optimizer(kvstore=kvstore,
                       optimizer=optimizer,
                       optimizer_params=optimizer_params)

    if validation_metric is None:
        validation_metric = eval_metric
    if not isinstance(eval_metric, metric.EvalMetric):
        eval_metric = metric.create(eval_metric)

    ################################################################################
    # training loop
    ################################################################################
    arg_acc_grad_arrays = None  # to store auxiliary grad_arrays
    for epoch in range(begin_epoch, num_epoch):
        tic = time.time()
        eval_metric.reset()
        nbatch = 0
        data_iter = iter(train_data)
        end_of_batch = False
        next_data_batch = next(data_iter)
        while not end_of_batch:
            data_batch = next_data_batch
            if monitor is not None:
                monitor.tic()
            mod.forward_backward(data_batch)
            arg_acc_grad_arrays = acc_grad_arrays(mod, arg_acc_grad_arrays)
            if nbatch > 0 and (nbatch * it_batch_size) % update_batch_size == 0:
                # rescale the accumulated grads by the number of accumulated
                # batches (normsize=1 by default, i.e. softmax norm)
                set_grad_arrays(mod, arg_acc_grad_arrays,
                                update_batch_size // it_batch_size)
                mod.update()
                arg_acc_grad_arrays = None
            try:
                # pre fetch next batch
                next_data_batch = next(data_iter)
                mod.prepare(next_data_batch)
            except StopIteration:
                end_of_batch = True

            mod.update_metric(eval_metric, data_batch.label)

            if monitor is not None:
                monitor.toc_print()

            if batch_end_callback is not None:
                batch_end_params = BatchEndParam(epoch=epoch,
                                                 nbatch=nbatch,
                                                 eval_metric=eval_metric,
                                                 locals=locals())
                for callback in _as_list(batch_end_callback):
                    callback(batch_end_params)
            nbatch += 1
        if arg_acc_grad_arrays is not None:  # flush the remaining accumulated grads
            set_grad_arrays(mod, arg_acc_grad_arrays,
                            update_batch_size // it_batch_size)
            mod.update()
            arg_acc_grad_arrays = None

        # one epoch of training is finished
        for name, val in eval_metric.get_name_value():
            mod.logger.info('Epoch[%d] Train-%s=%f', epoch, name, val)
        toc = time.time()
        mod.logger.info('Epoch[%d] Time cost=%.3f', epoch, (toc - tic))

        # sync aux params across devices
        arg_params, aux_params = mod.get_params()
        mod.set_params(arg_params, aux_params)

        if epoch_end_callback is not None:
            for callback in _as_list(epoch_end_callback):
                callback(epoch, mod.symbol, arg_params, aux_params)

        #----------------------------------------
        # evaluation on validation set
        if eval_data:
            res = mod.score(eval_data,
                            validation_metric,
                            score_end_callback=eval_end_callback,
                            batch_end_callback=eval_batch_end_callback,
                            epoch=epoch)
            #TODO: pull this into default
            for name, val in res:
                mod.logger.info('Epoch[%d] Validation-%s=%f', epoch, name, val)

        # end of 1 epoch, reset the data-iter for another epoch
        train_data.reset()
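The accumulation bookkeeping reduces to: sum per-batch gradients, and once `update_batch_size / it_batch_size` batches have been seen, rescale and apply one update. A framework-free sketch of that arithmetic (scalar "gradients" stand in for the NDArray grad arrays):

it_batch_size = 8           # iterator batch size
update_batch_size = 32      # effective batch size
accum_steps = update_batch_size // it_batch_size

acc = 0.0
for step, g in enumerate([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0], start=1):
    acc += g                        # accumulate, as acc_grad_arrays does above
    if step % accum_steps == 0:
        update = acc / accum_steps  # rescale, as set_grad_arrays does above
        print('apply update with grad', update)
        acc = 0.0                   # reset the accumulator after the update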
Example #23
    def fit(
        self,
        train_data,
        eval_data=None,
        eval_metric="acc",
        epoch_end_callback=None,
        batch_end_callback=None,
        kvstore="local",
        optimizer="sgd",
        optimizer_params=(("learning_rate", 0.01),),
        eval_end_callback=None,
        eval_batch_end_callback=None,
        initializer=Uniform(0.01),
        arg_params=None,
        aux_params=None,
        allow_missing=False,
        force_rebind=False,
        force_init=False,
        begin_epoch=0,
        num_epoch=None,
        validation_metric=None,
        monitor=None,
        prefix=None,
    ):
        """Train the module parameters.

        Parameters
        ----------
        train_data : DataIter
        eval_data : DataIter
            If not `None`, will be used as validation set and evaluate the performance
            after each epoch.
        eval_metric : str or EvalMetric
            Default `'acc'`. The performance measure used to display during training.
        epoch_end_callback : function or list of function
            Each callback will be called with the current `epoch`, `symbol`, `arg_params`
            and `aux_params`.
        batch_end_callback : function or list of function
            Each callback will be called with a `BatchEndParam`.
        kvstore : str or KVStore
            Default `'local'`.
        optimizer : str or Optimizer
            Default `'sgd'`
        optimizer_params : dict
            Default `(('learning_rate', 0.01),)`. The parameters for the optimizer constructor.
            The default value is not a `dict`, just to avoid pylint warning on dangerous
            default values.
        eval_end_callback : function or list of function
            These will be called at the end of each full evaluation, with the metrics over
            the entire evaluation set.
        eval_batch_end_callback : function or list of function
            These will be called at the end of each minibatch during evaluation.
        initializer : Initializer
            Will be called to initialize the module parameters if not already initialized.
        arg_params : dict
            Default `None`, if not `None`, should be existing parameters from a trained
            model or loaded from a checkpoint (previously saved model). In this case,
            the value here will be used to initialize the module parameters, unless they
            are already initialized by the user via a call to `init_params` or `fit`.
            `arg_params` takes precedence over `initializer`.
        aux_params : dict
            Default `None`. Similar to `arg_params`, except for auxiliary states.
        allow_missing : bool
            Default `False`. Indicate whether we allow missing parameters when `arg_params`
            and `aux_params` are not `None`. If this is `True`, then the missing parameters
            will be initialized via the `initializer`.
        force_rebind : bool
            Default `False`. Whether to force rebinding the executors if already bound.
        force_init : bool
            Default `False`. Indicate whether we should force initialization even if the
            parameters are already initialized.
        begin_epoch : int
            Default `0`. Indicate the starting epoch. Usually, if we are resuming from a
            checkpoint saved at a previous training phase at epoch N, then we should specify
            this value as N+1.
        num_epoch : int
            Number of epochs to run training.

        Examples
        --------
        An example of using fit for training::
            >>> #Assume training dataIter and validation dataIter are ready
            >>> mod.fit(train_data=train_dataiter, eval_data=val_dataiter,
                        optimizer_params={'learning_rate':0.01, 'momentum': 0.9},
                        num_epoch=10)
        """
        assert num_epoch is not None, "please specify number of epochs"

        self.bind(
            data_shapes=train_data.provide_data,
            label_shapes=train_data.provide_label,
            for_training=True,
            force_rebind=force_rebind,
        )
        if monitor is not None:
            self.install_monitor(monitor)
        self.init_params(
            initializer=initializer,
            arg_params=arg_params,
            aux_params=aux_params,
            allow_missing=allow_missing,
            force_init=force_init,
        )
        self.init_optimizer(
            kvstore=kvstore, optimizer=optimizer, optimizer_params=optimizer_params
        )

        if validation_metric is None:
            validation_metric = eval_metric
        if not isinstance(eval_metric, metric.EvalMetric):
            eval_metric = metric.create(eval_metric)

        ################################################################################
        # training loop
        ################################################################################
        # epoch 0: report the initial parameters via the epoch-end callbacks (as epoch -1)
        if epoch_end_callback is not None:
            arg_params, aux_params = self.get_params()
            self.set_params(arg_params, aux_params)
            for callback in _as_list(epoch_end_callback):
                callback(-1, self.symbol, arg_params, aux_params)

        from lib.pair_matching.batch_updater_py_multi import batchUpdaterPyMulti

        config = self.config
        if config.TRAIN.TENSORBOARD_LOG:
            from mxboard import SummaryWriter

            tf_log_dir = os.path.join(
                os.path.dirname(prefix),
                "logs/{}".format(time.strftime("%Y-%m-%d-%H-%M")),
            )
            summ_writer = SummaryWriter(logdir=tf_log_dir)

        interBatchUpdater = batchUpdaterPyMulti(config, 480, 640)
        last_lr = 0
        cur_step = 0
        for epoch in range(begin_epoch, num_epoch):
            tic = time.time()
            eval_metric.reset()
            for nbatch, data_batch in enumerate(train_data):
                if monitor is not None:
                    monitor.tic()
                # disp weights L2 norm
                cur_lr = self._curr_module._optimizer._get_lr(0)
                if nbatch % (4000 // train_data.batch_size) == 0:
                    all_params = self._curr_module.get_params()[0]
                    all_param_names = all_params.keys()
                    all_param_names = sorted(all_param_names)
                    print_and_log(prefix, self.logger)
                    weight_str = ""
                    for view_name in all_param_names:
                        weight_str += "{}: {} ".format(
                            view_name, nd.norm(all_params[view_name]).asnumpy()
                        )
                    print_and_log(weight_str, self.logger)
                    print_and_log(
                        "batch {}: lr: {}".format(nbatch, cur_lr), self.logger
                    )
                    if config.TRAIN.TENSORBOARD_LOG:
                        summ_writer.add_scalar(
                            tag="learning_rate", value=cur_lr, global_step=cur_step
                        )
                if cur_lr != last_lr:
                    print_and_log(
                        "batch {}: lr: {}".format(nbatch, cur_lr), self.logger
                    )
                    last_lr = cur_lr
                    if config.TRAIN.TENSORBOARD_LOG:
                        summ_writer.add_scalar(
                            tag="learning_rate", value=cur_lr, global_step=cur_step
                        )

                train_iter_size = config.network.TRAIN_ITER_SIZE
                for iter_idx in range(train_iter_size):
                    self.forward_backward(data_batch)
                    preds = self._curr_module.get_outputs(False)
                    self.update()
                    if iter_idx != train_iter_size - 1:
                        data_batch = interBatchUpdater.forward(
                            data_batch, preds, config
                        )
                cur_step += 1
                self.update_metric(eval_metric, data_batch.label)

                if monitor is not None:
                    monitor.toc_print()

                if batch_end_callback is not None:
                    batch_end_params = BatchEndParam(
                        epoch=epoch,
                        nbatch=nbatch,
                        eval_metric=eval_metric,
                        locals=locals(),
                    )
                    for callback in _as_list(batch_end_callback):
                        callback(batch_end_params)
                if config.TRAIN.TENSORBOARD_LOG:
                    for name, val in eval_metric.get_name_value():
                        summ_writer.add_scalar(
                            tag="BatchTrain-{}".format(name),
                            value=val,
                            global_step=cur_step,
                        )

            # one epoch of training is finished
            for name, val in eval_metric.get_name_value():
                self.logger.info("Epoch[%d] Train-%s=%f", epoch, name, val)
                if config.TRAIN.TENSORBOARD_LOG:
                    summ_writer.add_scalar(
                        tag="EpochTrain-{}".format(name), value=val, global_step=epoch
                    )

            toc = time.time()
            self.logger.info("Epoch[%d] Time cost=%.3f", epoch, (toc - tic))

            # sync aux params across devices
            arg_params, aux_params = self.get_params()
            self.set_params(arg_params, aux_params)

            if epoch_end_callback is not None:
                for callback in _as_list(epoch_end_callback):
                    callback(epoch, self.symbol, arg_params, aux_params)

            # ----------------------------------------
            # evaluation on validation set
            if eval_data:
                res = self.score(
                    eval_data,
                    validation_metric,
                    score_end_callback=eval_end_callback,
                    batch_end_callback=eval_batch_end_callback,
                    epoch=epoch,
                )
                # TODO: pull this into default
                for name, val in res:
                    self.logger.info("Epoch[%d] Validation-%s=%f", epoch, name, val)

            # end of 1 epoch, reset the data-iter for another epoch
            train_data.reset()
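The TensorBoard hooks above rely only on mxboard's `SummaryWriter.add_scalar`, which the example itself imports. A minimal standalone sketch (assumes the optional `mxboard` package is installed; the log directory is illustrative):

from mxboard import SummaryWriter

sw = SummaryWriter(logdir='./logs/demo')
for step, lr in enumerate([0.1, 0.05, 0.025]):
    sw.add_scalar(tag='learning_rate', value=lr, global_step=step)
sw.close()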
Example #24
 def __init__(self, bert, prefix=None, params=None,
              n_rnn_layers=0, rnn_hidden_size=600, num_rnn_layers=1,
              n_dense_layers=0, units_dense=600,
              add_query=False,
              apply_coattention=False, bert_out_dim=768,
              apply_self_attention=False, self_attention_dimension=None,
              n_attention_heads=4,
              apply_transformer=False,
              qanet_style_out=False,
              bidaf_style_out=False,
              remove_special_token=False):
     super(BertForQA, self).__init__(prefix=prefix, params=params)
     self.add_query = add_query
     self.apply_coattention = apply_coattention
     self.apply_self_attention = apply_self_attention
     self.apply_transformer = apply_transformer
     self.qanet_style_out = qanet_style_out
     self.bidaf_style_out = bidaf_style_out
     self.remove_special_token = remove_special_token
     self.bert = bert
     if self.apply_coattention:
         with self.name_scope():
             #self.co_attention_ = CoAttention("co-attention_", bert_out_dim) # try multiple layers
             self.co_attention = CoAttention("co-attention", bert_out_dim)
             if self.qanet_style_out:
                 self.project = gluon.nn.Dense(
                     units=bert_out_dim,
                     flatten=False,
                     use_bias=False,
                     weight_initializer=Xavier(),
                     prefix='projection_'
                 )
                 self.dropout = gluon.nn.Dropout(0.1)
                 self.model_encoder = TransformerEncoder(units=bert_out_dim)
                 self.predict_begin = gluon.nn.Dense(
                     units=1,
                     use_bias=True,
                     flatten=False,
                     weight_initializer=Xavier(
                         rnd_type='uniform', factor_type='in', magnitude=1),
                     bias_initializer=Uniform(1.0/bert_out_dim),
                     prefix='predict_start_'
                 )
                 self.predict_end = gluon.nn.Dense(
                     units=1,
                     use_bias=True,
                     flatten=False,
                     weight_initializer=Xavier(
                         rnd_type='uniform', factor_type='in', magnitude=1),
                     bias_initializer=Uniform(1.0/bert_out_dim),
                     prefix='predict_end_'
                 )
                 self.flatten = gluon.nn.Flatten()
             elif self.bidaf_style_out:
                 # BiDAF mode
                 self.modeling_layer = rnn.LSTM( hidden_size=int(bert_out_dim / 2), 
                                                 num_layers=2, 
                                                 dropout=0.0, 
                                                 bidirectional=True,
                                                 input_size=int(bert_out_dim * 4))
                 self.output_layer = BiDAFOutputLayer(span_start_input_dim=int(bert_out_dim / 2),
                                                     nlayers=1,
                                                     dropout=0.2)
             # for the CLS token's encoding; used in version 2.0
             self.cls_mapping = nn.Dense(
                 units=2,
                 flatten=False,
                 weight_initializer=Xavier(),
                 prefix='cls_mapping_'
             )
     if self.apply_self_attention:
         if self_attention_dimension is None:
             self_attention_dimension = bert_out_dim
         with self.name_scope():
             self.multi_head_attention = MultiHeadAttentionCell(
                 DotProductAttentionCell(),
                 self_attention_dimension, self_attention_dimension,
                 self_attention_dimension, n_attention_heads)
     if self.apply_transformer:
         with self.name_scope():
             self.transformer = TransformerEncoder(units=bert_out_dim)
     if self.apply_coattention and (self.qanet_style_out or self.bidaf_style_out):
         self.span_classifier = None
     else:
         self.span_classifier = nn.HybridSequential()
         with self.span_classifier.name_scope():
             for i in range(n_rnn_layers):
                 self.span_classifier.add(rnn.LSTM( hidden_size=rnn_hidden_size, 
                                                     num_layers=num_rnn_layers, 
                                                     dropout=0.0, 
                                                     bidirectional=True))
             for i in range(n_dense_layers):
                 self.span_classifier.add(nn.Dense(units=units_dense, flatten=False, activation='relu'))
             self.span_classifier.add(nn.Dense(units=2, flatten=False))
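Assuming the defaults above (`bert_out_dim=768`, `rnn_hidden_size=600`, `units_dense=600`), the fallback span head is just stacked BiLSTM/Dense layers ending in a 2-unit Dense that produces start/end logits. A standalone sketch of that branch (`layout='NTC'` is set here so the toy input can be batch-major; the original relies on the LSTM default):

import mxnet as mx
from mxnet.gluon import nn, rnn

span_classifier = nn.HybridSequential()
with span_classifier.name_scope():
    span_classifier.add(rnn.LSTM(hidden_size=600, num_layers=1,
                                 bidirectional=True, layout='NTC'))
    span_classifier.add(nn.Dense(units=600, flatten=False, activation='relu'))
    span_classifier.add(nn.Dense(units=2, flatten=False))  # start/end logits
span_classifier.initialize()
out = span_classifier(mx.nd.ones((2, 16, 768)))  # (batch, seq_len, bert_out_dim)
print(out.shape)  # (2, 16, 2)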
Example #25
    def train(self,
              train_data,
              epochs=1,
              batch_size=32,
              validation_data=None,
              train_resize_batch_num=None):
        """Train the model and update the model parameters."""
        stats = dict()
        if self.is_worker:
            config = self.config.copy()
            if "batch_size" not in config:
                config["batch_size"] = batch_size

            if train_resize_batch_num is not None:
                config["train_resize_batch_num"] = train_resize_batch_num
            train_data_iter = train_data(config, self.kv)
            val_data_iter = validation_data(
                config, self.kv) if validation_data else None

            start_time = time.time()
            if self.trainer:  # Imperative API

                def cpu_context(target_data):
                    if isinstance(target_data, list):
                        return [cpu_context(d) for d in target_data]
                    else:
                        return target_data.as_in_context(mx.cpu())

                for epoch in range(epochs):
                    # DataLoader doesn't need to be reset.
                    if isinstance(train_data_iter, mx.io.DataIter):
                        train_data_iter.reset()
                    if self.eval_metrics:
                        self.eval_metrics.reset(
                        )  # metrics will accumulate for one batch.
                    batch_start_time = time.time()
                    epoch_start_time = time.time()
                    for i, batch in enumerate(train_data_iter):
                        data = cpu_context(batch.data)
                        label = cpu_context(batch.label)
                        if not isinstance(data, list):
                            data = [data]
                        if not isinstance(label, list):
                            label = [label]
                        from mxnet import autograd as ag
                        with ag.record():
                            output = self.model(*data)  # forward
                            if not isinstance(output, list):
                                output = [output]
                            Ls = self.loss(*output, *label)
                            ag.backward(Ls)
                        self.trainer.step(batch_size)
                        if self.eval_metrics:
                            self.eval_metrics.update(label, output)
                        if not (i + 1) % self.config["log_interval"]:
                            # This would be logged on driver for each worker process.
                            iteration_log = \
                                "Epoch[%d] Batch[%d]  Speed: %f samples/sec  %s=%f" \
                                % (epoch, i,
                                   batch_size / (time.time() - batch_start_time),
                                   "loss", Ls.asnumpy().mean())
                            if self.eval_metrics:
                                names, accs = self.eval_metrics.get()
                                names, accs = to_list(names), to_list(accs)
                                for name, acc in zip(names, accs):
                                    iteration_log += "  %s=%f" % (name, acc)
                            self.logger.info(iteration_log)
                        batch_start_time = time.time()
                    # Epoch time log.
                    self.logger.info("[Epoch %d] time cost: %f" %
                                     (epoch, time.time() - epoch_start_time))
                    # Epoch metrics log on train data.
                    if self.eval_metrics:
                        epoch_train_log = "[Epoch %d] training: " % epoch
                        names, accs = self.eval_metrics.get()
                        names, accs = to_list(names), to_list(accs)
                        for name, acc in zip(names, accs):
                            epoch_train_log += "%s=%f  " % (name, acc)
                        self.logger.info(epoch_train_log)
                    # Epoch metrics log on validation data if any.
                    if val_data_iter:
                        if isinstance(val_data_iter, mx.io.DataIter):
                            val_data_iter.reset()
                        self.val_metrics.reset()
                        for batch in val_data_iter:
                            data = cpu_context(batch.data)
                            label = cpu_context(batch.label)
                            if not isinstance(data, list):
                                data = [data]
                            if not isinstance(label, list):
                                label = [label]
                            output = self.model(*data)
                            if not isinstance(output, list):
                                output = [output]
                            self.val_metrics.update(label, output)
                        epoch_val_log = "[Epoch %d] validation: " % epoch
                        names, accs = self.val_metrics.get()
                        names, accs = to_list(names), to_list(accs)
                        for name, acc in zip(names, accs):
                            epoch_val_log += "%s=%f  " % (name, acc)
                        self.logger.info(epoch_val_log)
                    # TODO: save checkpoints
                if self.eval_metrics:
                    names, accs = self.eval_metrics.get()
                    names, accs = to_list(names), to_list(accs)
                    for name, acc in zip(names, accs):
                        stats[name] = acc
            else:  # Symbolic API
                # TODO: seems no history (i.e. validation accuracy) returned by fit?
                if "init" not in self.config:
                    from mxnet.initializer import Uniform
                    self.config["init"] = Uniform(
                        0.01)  # This is the default value for MXNet.
                if self.eval_metrics is None:
                    self.eval_metrics = 'acc'  # This is the default value for MXNet.
                self.model.fit(
                    train_data=train_data_iter,
                    num_epoch=epochs,
                    initializer=self.config["init"],
                    kvstore=self.kv,
                    optimizer=self.config["optimizer"],
                    optimizer_params=self.config["optimizer_params"],
                    eval_data=val_data_iter,
                    eval_metric=self.eval_metrics,
                    validation_metric=self.val_metrics,
                    batch_end_callback=mx.callback.Speedometer(
                        batch_size, self.config["log_interval"]),
                    epoch_end_callback=None if "model" not in self.config else
                    mx.callback.do_checkpoint(self.config["model"]))
            epoch_time = time.time() - start_time
            stats["epoch_time"] = epoch_time
        return [stats]
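The imperative branch above is the standard Gluon training step: run the forward pass under `autograd.record()`, backpropagate the loss, then `trainer.step(batch_size)`. A compact self-contained sketch (toy network and data, not the estimator's actual model):

import mxnet as mx
from mxnet import autograd, gluon

net = gluon.nn.Dense(1)
net.initialize()
loss_fn = gluon.loss.L2Loss()
trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': 0.01})

data = mx.nd.random.uniform(shape=(32, 4))
label = mx.nd.random.uniform(shape=(32, 1))
with autograd.record():
    output = net(data)            # forward
    loss = loss_fn(output, label)
loss.backward()                   # backward
trainer.step(32)                  # update, normalized by the batch size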
Example #26
def mlperf_fit(self,
               args,
               data_loader,
               epoch_size,
               eval_metric='acc',
               epoch_end_callback=None,
               batch_end_callback=None,
               kvstore='local',
               optimizer='sgd',
               optimizer_params=(('learning_rate', 0.01), ),
               explorer='linear',
               explorer_params=None,
               eval_end_callback=None,
               eval_batch_end_callback=None,
               initializer=Uniform(0.01),
               arg_params=None,
               aux_params=None,
               allow_missing=False,
               force_rebind=False,
               force_init=False,
               begin_epoch=0,
               num_epoch=None,
               validation_metric=None,
               monitor=None,
               sparse_row_id_fn=None,
               eval_offset=0,
               eval_period=1,
               accuracy_threshold=1.0):

    assert num_epoch is not None, 'please specify number of epochs'

    if monitor is not None:
        self.install_monitor(monitor)

    self.init_optimizer(kvstore=kvstore,
                        optimizer=optimizer,
                        optimizer_params=optimizer_params)

    explorer = Explorer.create_explorer(name=explorer,
                                        optimizer=self._optimizer,
                                        explorer_params=explorer_params)
    # this MXNet cannot use optimizers other than the SGD series here
    explorer.set_best_coeff(0)
    explorer.set_best_wd_coeff(0)
    explorer.set_best_cg(0)
    exp_freq = explorer_params['explore_freq']
    exp_start_epoch = explorer_params['explore_start_epoch']

    if validation_metric is None:
        validation_metric = eval_metric
    ###########################################################################
    # Adding Correct and Total Count metrics
    ###########################################################################
    if not isinstance(validation_metric, list):
        validation_metric = [validation_metric]

    validation_metric = mx.metric.create(validation_metric)

    if not isinstance(validation_metric, mx.metric.CompositeEvalMetric):
        vm = mx.metric.CompositeEvalMetric()
        vm.append(validation_metric)
        validation_metric = vm

    for m in [CorrectCount(), TotalCount()]:
        validation_metric.metrics.append(m)
    ###########################################################################

    if not isinstance(eval_metric, mx.metric.EvalMetric):
        eval_metric = mx.metric.create(eval_metric)

    try:
        world_rank = int(os.environ['OMPI_COMM_WORLD_RANK'])
        world_size = int(os.environ['OMPI_COMM_WORLD_SIZE'])
    except (KeyError, ValueError):
        # not launched via MPI; fall back to single-process defaults
        world_rank = 0
        world_size = 1

    use_cval_data = (explorer_params['add_one_fwd_epoch'] < num_epoch
                     or explorer_params['no_augument_epoch'] < num_epoch)

    best_rank = 0
    self.prepare_states()

    mx_resnet_print(key=mlperf_constants.INIT_STOP, sync=True)
    mx_resnet_print(key=mlperf_constants.RUN_START, sync=True)

    # data iterators
    (train_data, eval_data, cval_data) = data_loader(args, kvstore)
    if 'dist' in args.kv_store and 'async' not in args.kv_store:
        logging.info('Resizing training data to %d batches per machine',
                     epoch_size)
        # resize train iter to ensure each machine has the same number of batches
        # per epoch; if not, dist_sync can hang at the end with one machine
        # waiting for the others
        if not args.use_dali:
            train_data = mx.io.ResizeIter(train_data, epoch_size)

    block_epoch_start = begin_epoch
    block_epoch_count = eval_offset + 1 - (begin_epoch % eval_period)
    if block_epoch_count < 0:
        block_epoch_count += eval_period
    mx_resnet_print(key=mlperf_constants.BLOCK_START,
                    metadata={
                        'first_epoch_num': block_epoch_start + 1,
                        'epoch_count': block_epoch_count
                    })
    ################################################################################
    # training loop
    ################################################################################

    for epoch in range(begin_epoch, num_epoch):
        mx_resnet_print(key=mlperf_constants.EPOCH_START,
                        metadata={'epoch_num': epoch + 1})
        tic = time.time()
        eval_metric.reset()
        nbatch = 0

        use_normal_data_batch = epoch < explorer_params['no_augument_epoch']
        if not use_normal_data_batch:
            if world_rank == 0:
                self.logger.info('use non-augmented batch')

        end_of_batch = False

        if use_normal_data_batch:
            data_iter = iter(train_data)
            next_data_batch = next(data_iter)
        else:
            cval_iter = iter(cval_data)
            next_cval_batch = next(cval_iter)

        smooth_decay = explorer_params['smooth_decay']

        if not smooth_decay:
            explorer.apply_lr_decay_epoch(epoch)
            explorer.apply_wd_decay_epoch(epoch)
        explorer.set_mom(epoch)

        while not end_of_batch:
            if use_normal_data_batch:
                data_batch = next_data_batch
            else:
                cval_batch = next_cval_batch
            if monitor is not None:
                monitor.tic()

            if use_normal_data_batch:
                self.forward_backward(data_batch)
            else:
                self.forward_backward(cval_batch)

            if smooth_decay:
                explorer.apply_lr_decay_iter()
                explorer.apply_wd_decay_iter()
            explorer.apply_wd_warmup()
            explorer.apply_burn_in()

            use_explorer = (epoch == 0
                            and nbatch == 0) or (epoch >= exp_start_epoch
                                                 and nbatch % exp_freq == 0)
            if use_explorer:
                explorer.set_tmp_coeff(world_rank)
                explorer.set_tmp_wd_coeff(world_rank)
                explorer.set_tmp_cg(world_rank)

            explorer.set_best_coeff(0)
            explorer.set_best_wd_coeff(0)
            explorer.set_best_cg(world_rank)
            self.update()

            if use_normal_data_batch:
                if isinstance(data_batch, list):
                    self.update_metric(eval_metric,
                                       [db.label for db in data_batch],
                                       pre_sliced=True)
                else:
                    self.update_metric(eval_metric, data_batch.label)
            else:
                if isinstance(cval_batch, list):
                    self.update_metric(eval_metric,
                                       [db.label for db in cval_batch],
                                       pre_sliced=True)
                else:
                    self.update_metric(eval_metric, cval_batch.label)

            if use_normal_data_batch:
                try:
                    # pre fetch next batch
                    next_data_batch = next(data_iter)
                except StopIteration:
                    end_of_batch = True
            else:
                try:
                    # pre fetch next cval batch
                    next_cval_batch = next(cval_iter)
                except StopIteration:
                    end_of_batch = True

            if use_normal_data_batch:
                if not end_of_batch:
                    self.prepare(next_data_batch,
                                 sparse_row_id_fn=sparse_row_id_fn)
            else:
                if not end_of_batch:
                    self.prepare(next_cval_batch,
                                 sparse_row_id_fn=sparse_row_id_fn)

            if monitor is not None:
                monitor.toc_print()

            if batch_end_callback is not None:
                batch_end_params = BatchEndParam(epoch=epoch,
                                                 nbatch=nbatch,
                                                 eval_metric=eval_metric,
                                                 locals=locals())
                for callback in _as_list(batch_end_callback):
                    callback(batch_end_params)
            nbatch += 1

        mx_resnet_print(key=mlperf_constants.EPOCH_STOP,
                        metadata={"epoch_num": epoch + 1})
        # one epoch of training is finished
        toc = time.time()
        if kvstore:
            if kvstore.rank == 0:
                self.logger.info('Epoch[%d] Time cost=%.3f', epoch,
                                 (toc - tic))
        else:
            self.logger.info('Epoch[%d] Time cost=%.3f', epoch, (toc - tic))

        # sync aux params across devices
        #arg_params, aux_params = self.get_params()
        #self.set_params(arg_params, aux_params)

        if epoch_end_callback is not None:
            for callback in _as_list(epoch_end_callback):
                callback(epoch, self.symbol, arg_params, aux_params)

        #----------------------------------------
        # evaluation on validation set
        if eval_data and epoch >= eval_offset and (
                epoch - eval_offset) % eval_period == 0:
            mx_resnet_print(key=mlperf_constants.EVAL_START,
                            metadata={'epoch_num': epoch + 1})
            res = self.score(eval_data,
                             validation_metric,
                             score_end_callback=eval_end_callback,
                             batch_end_callback=eval_batch_end_callback,
                             epoch=epoch)
            #TODO: pull this into default
            if kvstore:
                if kvstore.rank == 0:
                    for name, val in res:
                        self.logger.info('Epoch[%d] Validation-%s=%f', epoch,
                                         name, val)
            else:
                for name, val in res:
                    self.logger.info('Epoch[%d] Validation-%s=%f', epoch, name,
                                     val)
            res = dict(res)

            acc = [res['correct-count'], res['total-count']]
            acc = all_reduce(acc)
            acc = acc[0] / acc[1]
            mx_resnet_print(key=mlperf_constants.EVAL_STOP,
                            metadata={'epoch_num': epoch + 1})

            mx_resnet_print(key=mlperf_constants.EVAL_ACCURACY,
                            val=acc,
                            metadata={'epoch_num': epoch + 1})

            mx_resnet_print(
                key=mlperf_constants.BLOCK_STOP,
                metadata={'first_epoch_num': block_epoch_start + 1})
            if acc > accuracy_threshold:
                mx_resnet_print(key=mlperf_constants.RUN_STOP,
                                metadata={'status': 'success'})

                return epoch

            if epoch < (num_epoch - 1):
                block_epoch_start = epoch + 1
                block_epoch_count = num_epoch - epoch - 1
                if block_epoch_count > eval_period:
                    block_epoch_count = eval_period
                mx_resnet_print(key=mlperf_constants.BLOCK_START,
                                metadata={
                                    'first_epoch_num': block_epoch_start + 1,
                                    'epoch_count': block_epoch_count
                                })

        # end of 1 epoch, reset the data-iter for another epoch
        if use_normal_data_batch:
            train_data.reset()
        else:
            cval_data.reset()

    mx_resnet_print(key=mlperf_constants.RUN_STOP,
                    metadata={'status': 'aborted'})
    return num_epoch
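The `eval_offset`/`eval_period` pair above schedules which epochs run a validation pass; the selection rule in isolation (pure Python, names match the arguments above):

def eval_epochs(num_epoch, eval_offset=0, eval_period=1):
    # epochs at which mlperf_fit runs its validation block
    return [e for e in range(num_epoch)
            if e >= eval_offset and (e - eval_offset) % eval_period == 0]

assert eval_epochs(10, eval_offset=2, eval_period=4) == [2, 6]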
Example #27
    def fit(self,
            train_data,
            eval_data=None,
            eval_metric='acc',
            epoch_end_callback=None,
            batch_end_callback=None,
            kvstore='local',
            optimizer='sgd',
            optimizer_params=(('learning_rate', 0.01), ),
            eval_end_callback=None,
            eval_batch_end_callback=None,
            initializer=Uniform(0.01),
            arg_params=None,
            aux_params=None,
            allow_missing=False,
            force_rebind=False,
            force_init=False,
            begin_epoch=0,
            num_epoch=None,
            best_model_callbacks=None,
            eval_interval=None,
            validation_metric=None,
            monitor=None):

        assert num_epoch is not None, 'please specify number of epochs'

        self.bind(data_shapes=train_data.provide_data,
                  label_shapes=train_data.provide_label,
                  for_training=True,
                  force_rebind=force_rebind)

        if monitor is not None:
            self.install_monitor(monitor)

        self.init_params(initializer=initializer,
                         arg_params=arg_params,
                         aux_params=aux_params,
                         allow_missing=allow_missing,
                         force_init=force_init)

        self.init_optimizer(kvstore=kvstore,
                            optimizer=optimizer,
                            optimizer_params=optimizer_params)

        if not isinstance(eval_metric, metric.EvalMetric):
            eval_metric = metric.create(eval_metric)

        if validation_metric is None:
            validation_metric = copy.deepcopy(eval_metric)

        epoch_metric = copy.deepcopy(eval_metric)

        swa_arg_params = None
        swa_aux_params = None
        swa_cnt = 0

        ################################################################################
        # training loop
        ################################################################################
        for epoch in range(begin_epoch, num_epoch):
            tic_epoch = time.time()
            eval_metric.reset()

            nbatch = 0
            end_of_batch = False
            data_iter = iter(train_data)
            next_data_batch = next(data_iter)
            name_values = []

            while not end_of_batch:
                data_batch = next_data_batch

                if monitor is not None:
                    monitor.tic()

                self.forward_backward(data_batch)
                self.update()

                try:
                    # pre fetch next batch
                    next_data_batch = next(data_iter)
                    self.prepare(next_data_batch)
                except StopIteration:
                    end_of_batch = True

                self.update_metric(eval_metric, data_batch.label)
                if end_of_batch:
                    name_values = eval_metric.get_name_value()

                if monitor is not None:
                    monitor.toc_print()

                nbatch += 1

                if batch_end_callback is not None:
                    batch_end_params = BatchEndParam(epoch=epoch,
                                                     nbatch=nbatch,
                                                     eval_metric=eval_metric,
                                                     locals=locals())
                    for callback in _as_list(batch_end_callback):
                        callback(batch_end_params)

                    eval_metric.reset()

                # ----------------------------------------
                # evaluation on validation set
                to_go = eval_interval is not None and nbatch % eval_interval == 0
                if to_go and eval_data:
                    res = self.score(
                        eval_data,
                        validation_metric,
                        score_end_callback=eval_end_callback,
                        batch_end_callback=eval_batch_end_callback,
                        epoch=epoch)
                    for name, val in res:
                        self.logger.info(
                            'Epoch[%d] Batch[%d] Validation-%s=%f', epoch,
                            nbatch, name, val)

                    if best_model_callbacks is not None:
                        for callback in _as_list(best_model_callbacks):
                            if callback.is_best(validation_metric):
                                # sync params across devices before checkpointing
                                arg_params, aux_params = self.get_params()
                                callback.checkpoint_if_only_best(
                                    validation_metric, self.symbol, arg_params,
                                    aux_params)
                                break

            # one epoch of training is finished
            for name, val in name_values:
                self.logger.info('Epoch[%d] Train-%s=%f', epoch + 1, name, val)
            toc_epoch = time.time()
            elapsed = toc_epoch - tic_epoch
            avg_speed = float(len(train_data)) / elapsed
            self.logger.info('Epoch[%d] Time cost=%.3f', epoch + 1, elapsed)
            self.logger.info('Epoch[%d] Average speed=%.3f samples/sec',
                             epoch + 1, avg_speed)

            # sync aux params across devices
            arg_params, aux_params = self.get_params()
            self.set_params(arg_params, aux_params)

            if epoch_end_callback is not None:
                for callback in _as_list(epoch_end_callback):
                    callback(epoch, self.symbol, arg_params, aux_params)

            # evaluation on validation set
            if eval_data:
                res = self.score(eval_data,
                                 validation_metric,
                                 score_end_callback=eval_end_callback,
                                 batch_end_callback=eval_batch_end_callback,
                                 epoch=epoch + 1)
                for name, val in res:
                    self.logger.info('Epoch[%d] Validation-%s=%f', epoch + 1,
                                     name, val)

                if best_model_callbacks is not None:
                    for callback in _as_list(best_model_callbacks):
                        callback.checkpoint_if_only_best(
                            validation_metric, self.symbol, arg_params,
                            aux_params)

            # end of epoch, reset the data-iter for another epoch
            train_data.reset()
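The best-model hooks above only assume that each callback exposes is_best(metric) and checkpoint_if_only_best(metric, symbol, arg_params, aux_params). Below is a minimal sketch of that interface for a simple higher-is-better metric; the class name and tracking logic are illustrative, not the original implementation, while mx.model.save_checkpoint is the standard MXNet helper.

import mxnet as mx

class BestModelCheckpoint(object):
    """Checkpoints only when the tracked metric improves (illustrative sketch)."""
    def __init__(self, prefix, metric_name='acc'):
        self.prefix = prefix
        self.metric_name = metric_name
        self.best = float('-inf')

    def is_best(self, eval_metric):
        # EvalMetric.get_name_value() returns a list of (name, value) pairs
        current = dict(eval_metric.get_name_value()).get(self.metric_name)
        return current is not None and current > self.best

    def checkpoint_if_only_best(self, eval_metric, symbol, arg_params, aux_params):
        current = dict(eval_metric.get_name_value()).get(self.metric_name)
        if current is not None and current > self.best:
            self.best = current
            mx.model.save_checkpoint(self.prefix, 0, symbol, arg_params, aux_params)

A callback like this would be passed as best_model_callbacks=BestModelCheckpoint('model-best'), alongside eval_interval, to checkpoint mid-epoch whenever validation improves.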
Example #28
    def fit(
            self,
            train_data,
            ogdb,
            eval_data=None,
            eval_metric='acc',
            epoch_end_callback=None,
            batch_end_callback=None,
            kvstore='local',
            optimizer='sgd',
            optimizer_params=(('learning_rate', 0.01),),
            # a previous 8-GPU attempt also passed ('rescale_grad', 1.0 / 8.0)
            eval_end_callback=None,
            iter_size=1,
            eval_batch_end_callback=None,
            initializer=Uniform(0.01),
            arg_params=None,
            aux_params=None,
            allow_missing=False,
            force_rebind=False,
            force_init=False,
            begin_epoch=0,
            num_epoch=None,
            validation_metric=None,
            monitor=None):
        """Ke's revision: add iter_size. Trains the module parameters.

        Checkout `Module Tutorial <http://mxnet.io/tutorials/basic/module.html>`_ to see
        a end-to-end use-case.

        Parameters
        ----------
        train_data : DataIter
            Train DataIter.
        ogdb : list
            The original roidb. When the annealing check triggers retraining,
            extra samples are drawn from it (via `sample_roidb`) or it replaces
            the current training set outright.
        eval_data : DataIter
            If not ``None``, will be used as validation set and the performance
            after each epoch will be evaluated.
        eval_metric : str or EvalMetric
            Defaults to 'acc'. The performance measure used to display during training.
            Other possible predefined metrics are:
            'ce' (CrossEntropy), 'f1', 'mae', 'mse', 'rmse', 'top_k_accuracy'.
        epoch_end_callback : function or list of functions
            Each callback will be called with the current `epoch`, `symbol`, `arg_params`
            and `aux_params`.
        batch_end_callback : function or list of function
            Each callback will be called with a `BatchEndParam`.
        kvstore : str or KVStore
            Defaults to 'local'.
        optimizer : str or Optimizer
            Defaults to 'sgd'.
        optimizer_params : dict
            Defaults to ``(('learning_rate', 0.01),)``. The parameters for
            the optimizer constructor.
            The default value is not a dict, just to avoid pylint warning on dangerous
            default values.
        eval_end_callback : function or list of function
            These will be called at the end of each full evaluation, with the metrics over
            the entire evaluation set.
        eval_batch_end_callback : function or list of function
            These will be called at the end of each mini-batch during evaluation.
        initializer : Initializer
            The initializer is called to initialize the module parameters when they are
            not already initialized.
        arg_params : dict
            Defaults to ``None``, if not ``None``, should be existing parameters from a trained
            model or loaded from a checkpoint (previously saved model). In this case,
            the value here will be used to initialize the module parameters, unless they
            are already initialized by the user via a call to `init_params` or `fit`.
            `arg_params` has a higher priority than `initializer`.
        aux_params : dict
            Defaults to ``None``. Similar to `arg_params`, except for auxiliary states.
        allow_missing : bool
            Defaults to ``False``. Indicates whether to allow missing parameters when `arg_params`
            and `aux_params` are not ``None``. If this is ``True``, then the missing parameters
            will be initialized via the `initializer`.
        force_rebind : bool
            Defaults to ``False``. Whether to force rebinding the executors if already bound.
        force_init : bool
            Defaults to ``False``. Indicates whether to force initialization even if the
            parameters are already initialized.
        begin_epoch : int
            Defaults to 0. Indicates the starting epoch. Usually, if resumed from a
            checkpoint saved at a previous training phase at epoch N, then this value should be
            N+1.
        num_epoch : int
            Number of epochs for training.

        Examples
        --------
        >>> # An example of using fit for training.
        >>> # Assume training dataIter and validation dataIter are ready
        >>> # Assume loading a previously checkpointed model
        >>> sym, arg_params, aux_params = mx.model.load_checkpoint(model_prefix, 3)
        >>> mod.fit(train_data=train_dataiter, eval_data=val_dataiter, optimizer='sgd',
        ...     optimizer_params={'learning_rate':0.01, 'momentum': 0.9},
        ...     arg_params=arg_params, aux_params=aux_params,
        ...     eval_metric='acc', num_epoch=10, begin_epoch=3)
        """
        assert num_epoch is not None, 'please specify number of epochs'

        self.bind(data_shapes=train_data.provide_data,
                  label_shapes=train_data.provide_label,
                  for_training=True,
                  force_rebind=force_rebind,
                  grad_req='add')
        if monitor is not None:
            self.install_monitor(monitor)
        self.init_params(initializer=initializer,
                         arg_params=arg_params,
                         aux_params=aux_params,
                         allow_missing=allow_missing,
                         force_init=force_init)
        self.init_optimizer(kvstore=kvstore,
                            optimizer=optimizer,
                            optimizer_params=optimizer_params)

        if validation_metric is None:
            validation_metric = eval_metric
        if not isinstance(eval_metric, metric.EvalMetric):
            eval_metric = metric.create(eval_metric)

        annealing_steps = 0  # number of annealing steps taken in the current epoch
        redo_training = 0  # flag: resample the training set and redo training
        val_list = []  # validation results per annealing step
        cur_val = 0
        target_prec = 50
        # Note: we want to identify the best cluster of images / training sets with a low percentage
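        # Annealing scheme, as implemented below: if an epoch's validation stays
        # below target_prec, resample extra data from ogdb and retrain; if the
        # last three annealing steps moved validation by less than 0.01, fall
        # back to the full ogdb; each fresh (non-redo) epoch raises target_prec
        # by 5.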
        ################################################################################
        # training loop
        ################################################################################
        for epoch in range(begin_epoch, num_epoch):
            tic = time.time()
            eval_metric.reset()
            nbatch = 0
            if redo_training:
                annealing_steps += 1
                self.logger.info('Redoing training to meet criteria; annealing step = %d',
                                 annealing_steps)
                # sroidb = train_data.roidb  # passthrough test

                atick = time.time()

                iterdiff = 1.0
                # Check if we've stagnated
                if len(val_list) > 2:
                    itermean = (val_list[-1] + val_list[-2] + val_list[-3]) / 3
                    iterdiff = abs(itermean - val_list[-1])
                    self.logger.info('Last 3 samples have diff of: %f',
                                     iterdiff)

                if iterdiff < 0.01:
                    self.logger.info(
                        'Annealing has stagnated; falling back to the original training set'
                    )
                    # Do something drastic: use the original db directly
                    sroidb = ogdb

                    # Alternatively, read in another random subset:
                    # sroidb = sample_roidb(ogdb, 25)  # sample with removal
                else:
                    # Continue as usual: select a new random subset
                    newroidb = sample_roidb(ogdb, 15)  # without removal, this is 10%

                    # Append the new subset to the existing roidb
                    sroidb = append_roidb(train_data.roidb, newroidb)

                # Create new training data instance by passing most of previous arguments and new random db
                train_data2 = AnchorLoader(
                    train_data.feat_sym,
                    sroidb,
                    train_data.batch_size,
                    train_data.shuffle,
                    train_data.ctx,
                    train_data.work_load_list,
                    train_data.feat_stride,
                    train_data.anchor_scales,
                    train_data.anchor_ratios,
                    train_data.aspect_grouping,
                    nThreads=default.prefetch_thread_num)

                # Overwrite old train_data with the new one
                train_data = train_data2
                data_iter = iter(train_data)

                atock = time.time()
                self.logger.info('Annealing[%d] Time cost=%.3f',
                                 annealing_steps, (atock - atick))
            else:
                data_iter = iter(train_data)
                annealing_steps = 0
                val_list = []
                #target_prec=cur_val+5
                target_prec = target_prec + 5
            end_of_batch = False
            next_data_batch = next(data_iter)

            while not end_of_batch:
                data_batch = next_data_batch
                if monitor is not None:
                    monitor.tic()
                # self.forward_backward(data_batch)
                self.forward(data_batch, is_train=True)  # gradients accumulate: bind() used grad_req='add'
                self.backward()
                if nbatch % iter_size == 0:  # step the optimizer once every iter_size batches
                    self.update()
                    # grad_req='add' does not clear gradients, so zero them after the update
                    for g in self._curr_module._exec_group.grad_arrays:
                        for g1 in g:
                            if g1 is not None:
                                g1[:] = 0.

                try:
                    # pre fetch next batch
                    next_data_batch = next(data_iter)
                    self.prepare(next_data_batch)
                except StopIteration:
                    end_of_batch = True

                self.update_metric(eval_metric, data_batch.label)

                if monitor is not None:
                    monitor.toc_print()

                if batch_end_callback is not None:
                    batch_end_params = BatchEndParam(epoch=epoch,
                                                     nbatch=nbatch,
                                                     eval_metric=eval_metric,
                                                     locals=locals())
                    for callback in _as_list(batch_end_callback):
                        callback(batch_end_params)
                nbatch += 1

            # one epoch of training is finished
            for name, val in eval_metric.get_name_value():
                self.logger.info('Epoch[%d] Train-%s=%f', epoch, name, val)
            toc = time.time()
            self.logger.info('Epoch[%d] Time cost=%.3f', epoch, (toc - tic))
            # sync aux params across devices
            arg_params, aux_params = self.get_params()
            self.set_params(arg_params, aux_params)

            if epoch_end_callback is not None:
                for callback in _as_list(epoch_end_callback):
                    cur_val = callback(epoch, self.symbol, arg_params,
                                       aux_params)

            self.logger.info('Returned Validation=%f', cur_val)
            val_list.append(cur_val)
            #----------------------------------------
            # evaluation on validation set
            if eval_data:
                self.logger.info('Evaluating data')
                res = self.score(eval_data,
                                 validation_metric,
                                 score_end_callback=eval_end_callback,
                                 batch_end_callback=eval_batch_end_callback,
                                 epoch=epoch)
                #TODO: pull this into default
                for name, val in res:
                    self.logger.info('Epoch[%d] Validation-%s=%f', epoch, name,
                                     val)

            #----------
            # Check whether this epoch's validation meets the target precision
            if cur_val < target_prec:
                redo_training = 1
                self.logger.info('Below target precision, retraining; validation=%f',
                                 cur_val)
            else:
                redo_training = 0

            self.logger.info('Annealing steps=%d', annealing_steps)

            # end of 1 epoch, reset the data-iter for another epoch
            train_data.reset()
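The heart of this revision is the iter_size gradient-accumulation pattern: bind with grad_req='add' so backward passes accumulate into the gradient buffers, step the optimizer every iter_size batches, then zero the buffers by hand. A standalone sketch against the standard mx.mod.Module API (mod and train_iter are placeholders; _exec_group is the same private attribute the code above touches; rescale_grad is optional and mirrors the commented-out 8-GPU attempt):

import mxnet as mx

def fit_with_iter_size(mod, train_iter, iter_size=4):
    """Accumulate gradients over `iter_size` mini-batches per optimizer step."""
    mod.bind(data_shapes=train_iter.provide_data,
             label_shapes=train_iter.provide_label,
             for_training=True,
             grad_req='add')   # 'add' accumulates gradients instead of overwriting
    mod.init_params()
    # rescaling by 1/iter_size keeps the effective step comparable to iter_size=1
    mod.init_optimizer(optimizer='sgd',
                       optimizer_params=(('learning_rate', 0.01),
                                         ('rescale_grad', 1.0 / iter_size)))
    for nbatch, batch in enumerate(train_iter):
        mod.forward(batch, is_train=True)
        mod.backward()
        if (nbatch + 1) % iter_size == 0:
            mod.update()
            # grad_req='add' never clears gradients, so zero them after each update
            for grads in mod._exec_group.grad_arrays:
                for g in grads:
                    if g is not None:
                        g[:] = 0.0

Note that the loop above updates when nbatch % iter_size == 0, which also fires on the very first batch; the sketch uses (nbatch + 1) so a full iter_size batches accumulate before each step.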
Example #29
    def fit(self, train_data_list, optimizer_params, batch_end_callback=None, kvstore='local',
            initializer=Uniform(0.01),
            arg_params=None, aux_params=None, allow_missing=False,
            force_rebind=False, force_init=False, begin_epoch=0, num_epoch=None):

        assert num_epoch is not None, 'please specify number of epochs'
        assert arg_params is None and aux_params is None

        provide_data_list = []
        provide_label_list = []
        for td in train_data_list:
            provide_data_list.append(td.provide_data)
            provide_label_list.append(td.provide_label)

        self.bind(data_shapes_list=provide_data_list, label_shapes_list=provide_label_list,
                  for_training=True)

        self.init_params(initializer=initializer, arg_params=arg_params, aux_params=aux_params,
                         allow_missing=allow_missing, force_init=force_init)
        self.init_optimizer(optimizer_params=optimizer_params)

        # Synchronize initialization across workers: broadcast rank 0's backbone
        # parameters so every rank starts from identical weights
        _arg_params, _aux_params = self.backbone_module.get_params()
        _arg_params_rank_0 = self.broadcast_parameters(_arg_params)
        _aux_params_rank_0 = self.broadcast_parameters(_aux_params)
        self.backbone_module.set_params(_arg_params_rank_0, _aux_params_rank_0)
        data_end_id = 0
        ################################################################################
        # training loop
        ################################################################################
        num_epoch_list = [0] * self.head_num
        for epoch in range(begin_epoch, num_epoch):
            nbatch = 0
            end_of_batch = False
            data_iter_list = []
            for i in range(self.head_num):
                train_data_list[i].reset()
                data_iter_list.append(iter(train_data_list[i]))
            next_data_batch_list = []
            for i in range(self.head_num):
                next_data_batch_list.append(next(data_iter_list[i]))
            while not end_of_batch:
                data_batch_list = next_data_batch_list
                data_batch = self.combine(data_batch_list)

                self.forward_backward(data_batch)
                self.update()
                assert not isinstance(data_batch, list)

                for i in range(self.head_num):
                    try:
                        next_data_batch_list[i] = next(data_iter_list[i])
                        self.prepare(next_data_batch_list[i], sparse_row_id_fn=None)
                    except StopIteration:
                        num_epoch_list[i] += 1
                        data_end_id += 1
                        if data_end_id != self.head_num:
                            train_data_list[i].reset()
                            data_iter_list[i] = iter(train_data_list[i])
                            next_data_batch_list[i] = next(data_iter_list[i])
                            logging.info('reset dataset_%d', i)
                        else:
                            # every dataset has completed a pass; end the epoch
                            end_of_batch = True

                if batch_end_callback is not None:
                    batch_end_params = self.batch_end_param(
                        loss_list=self.loss_cache,
                        epoch=epoch,
                        num_update=self.num_update,
                        num_epoch_list=num_epoch_list
                    )
                    batch_end_callback(batch_end_params)

                nbatch += 1
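The distinctive part of this example is the lockstep iteration over several datasets: every head contributes one batch per step, and any iterator that runs dry is restarted until each dataset has completed at least one pass. A pure-Python stand-in for that pattern (no MXNet needed; names are illustrative):

def lockstep_batches(datasets):
    """Yield one batch per dataset per step, restarting exhausted iterators
    until every dataset has completed at least one full pass."""
    iters = [iter(d) for d in datasets]
    finished = [False] * len(datasets)
    while not all(finished):
        batch_list = []
        for i, it in enumerate(iters):
            try:
                batch_list.append(next(it))
            except StopIteration:
                finished[i] = True
                iters[i] = iter(datasets[i])  # restart so other heads keep training
                batch_list.append(next(iters[i]))
        yield batch_list

# toy usage: the shorter dataset wraps around until the longer one finishes
for step in lockstep_batches([[1, 2, 3, 4], ['a', 'b']]):
    print(step)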
Example #30
def mlperf_fit(self,
               train_data,
               eval_data=None,
               eval_metric='acc',
               epoch_end_callback=None,
               batch_end_callback=None,
               kvstore='local',
               optimizer='sgd',
               optimizer_params=(('learning_rate', 0.01), ),
               eval_end_callback=None,
               eval_batch_end_callback=None,
               initializer=Uniform(0.01),
               arg_params=None,
               aux_params=None,
               allow_missing=False,
               force_rebind=False,
               force_init=False,
               begin_epoch=0,
               num_epoch=None,
               validation_metric=None,
               monitor=None,
               sparse_row_id_fn=None,
               eval_offset=0,
               eval_period=1,
               accuracy_threshold=1.0):

    assert num_epoch is not None, 'please specify number of epochs'

    self.bind(data_shapes=train_data.provide_data,
              label_shapes=train_data.provide_label,
              for_training=True,
              force_rebind=force_rebind)

    if monitor is not None:
        self.install_monitor(monitor)

    self.init_params(initializer=initializer,
                     arg_params=arg_params,
                     aux_params=aux_params,
                     allow_missing=allow_missing,
                     force_init=force_init)
    self.init_optimizer(kvstore=kvstore,
                        optimizer=optimizer,
                        optimizer_params=optimizer_params)

    if validation_metric is None:
        validation_metric = eval_metric
    ###########################################################################
    # Adding Correct and Total Count metrics
    ###########################################################################
    if not isinstance(validation_metric, list):
        validation_metric = [validation_metric]

    validation_metric = mx.metric.create(validation_metric)

    if not isinstance(validation_metric, mx.metric.CompositeEvalMetric):
        vm = mx.metric.CompositeEvalMetric()
        vm.append(validation_metric)
        validation_metric = vm

    for m in [CorrectCount(), TotalCount()]:
        validation_metric.metrics.append(m)
    ###########################################################################

    if not isinstance(eval_metric, mx.metric.EvalMetric):
        eval_metric = mx.metric.create(eval_metric)

    mx_resnet_print(key=mlperf_log.TRAIN_LOOP)
    ################################################################################
    # training loop
    ################################################################################
    for epoch in range(begin_epoch, num_epoch):
        mx_resnet_print(key=mlperf_log.TRAIN_EPOCH, val=epoch)
        tic = time.time()
        eval_metric.reset()
        nbatch = 0
        data_iter = iter(train_data)
        end_of_batch = False
        next_data_batch = next(data_iter)
        while not end_of_batch:
            data_batch = next_data_batch
            if monitor is not None:
                monitor.tic()
            self.forward_backward(data_batch)
            self.update()

            if isinstance(data_batch, list):
                self.update_metric(eval_metric,
                                   [db.label for db in data_batch],
                                   pre_sliced=True)
            else:
                self.update_metric(eval_metric, data_batch.label)

            try:
                # pre fetch next batch
                next_data_batch = next(data_iter)
                self.prepare(next_data_batch,
                             sparse_row_id_fn=sparse_row_id_fn)
            except StopIteration:
                end_of_batch = True

            if monitor is not None:
                monitor.toc_print()

            if batch_end_callback is not None:
                batch_end_params = BatchEndParam(epoch=epoch,
                                                 nbatch=nbatch,
                                                 eval_metric=eval_metric,
                                                 locals=locals())
                for callback in _as_list(batch_end_callback):
                    callback(batch_end_params)
            nbatch += 1

        # one epoch of training is finished
        toc = time.time()
        if kvstore:
            # kvstore must be a KVStore instance here; a plain string has no .rank
            if kvstore.rank == 0:
                self.logger.info('Epoch[%d] Time cost=%.3f', epoch,
                                 (toc - tic))
        else:
            self.logger.info('Epoch[%d] Time cost=%.3f', epoch, (toc - tic))

        # sync aux params across devices
        arg_params, aux_params = self.get_params()
        self.set_params(arg_params, aux_params)

        if epoch_end_callback is not None:
            for callback in _as_list(epoch_end_callback):
                callback(epoch, self.symbol, arg_params, aux_params)

        #----------------------------------------
        # evaluation on validation set
        if eval_data and epoch % eval_period == eval_offset:
            mx_resnet_print(key=mlperf_log.EVAL_START)
            res = self.score(eval_data,
                             validation_metric,
                             score_end_callback=eval_end_callback,
                             batch_end_callback=eval_batch_end_callback,
                             epoch=epoch)
            #TODO: pull this into default
            if kvstore:
                if kvstore.rank == 0:
                    for name, val in res:
                        self.logger.info('Epoch[%d] Validation-%s=%f', epoch,
                                         name, val)
            else:
                for name, val in res:
                    self.logger.info('Epoch[%d] Validation-%s=%f', epoch, name,
                                     val)
            res = dict(res)

            acc = [res['correct-count'], res['total-count']]
            acc = all_reduce(acc)
            acc = acc[0] / acc[1]
            mx_resnet_print(key=mlperf_log.EVAL_ACCURACY,
                            val={
                                "epoch": epoch,
                                "value": acc
                            })
            mx_resnet_print(key=mlperf_log.EVAL_STOP)
            if acc > accuracy_threshold:
                return epoch

        # end of 1 epoch, reset the data-iter for another epoch
        train_data.reset()

    return num_epoch
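The CorrectCount/TotalCount metrics exist so accuracy can be computed globally after the cross-worker reduction: summing raw counts and dividing once is exact, whereas averaging per-worker accuracies is biased when shards differ in size. A self-contained illustration (the all_reduce above is assumed to sum elementwise across workers):

def global_accuracy(per_worker_counts):
    """per_worker_counts: list of (correct, total) pairs, one per worker."""
    correct = float(sum(c for c, _ in per_worker_counts))
    total = float(sum(t for _, t in per_worker_counts))
    return correct / total

# worker 0: 90/100 correct, worker 1: 45/60 correct
print(global_accuracy([(90, 100), (45, 60)]))  # 0.84375
# a naive mean of per-worker accuracies would give (0.90 + 0.75) / 2 = 0.825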