예제 #1
0
    def init_learning_rate(self):
        """Apply the configured initial learning rate and log the LR settings.

        Hands ``ctx.lr_initial`` to ``set_lr_val_mp`` and then reports the
        initial value, ``hvd.size()``, the minimal value and the curve via
        ``dt.info`` on the TRAIN channel.
        """
        # NOTE(review): presumably set_lr_val_mp applies the per-worker
        # scaling that the log line prints as "initial * size" - confirm.
        self.set_lr_val_mp(self.ctx.lr_initial)

        ctx = self.ctx
        template = 'Initialize learning rate: initial {} * {}, minimal {}, curve {}'
        summary = template.format(ctx.lr_initial, hvd.size(), ctx.lr_minimal,
                                  ctx.lr_curve)
        dt.info(dt.DC.TRAIN, summary)
예제 #2
0
 def post_model(self):
     """Log a forward-pass summary of ``self._model`` on the chief worker.

     Non-chief workers return immediately. The chief patches the model,
     runs the summary for a ``(3, 32, 32)`` input on the CPU, logs the
     result, and then patches the model again with ``patch_clear_dt``.
     """
     if not dt.train.is_chief():
         return

     model = self._model
     dt.summary.summary_model_patch(model)
     report = dt.summary.summary_model_fwd(model, (3, 32, 32), device='cpu')
     dt.info(dt.DC.TRAIN, "\n{}".format(report))
     # NOTE(review): presumably this second patch undoes the first one
     # (patch_fn is named patch_clear_dt) - confirm in dt.summary.
     dt.summary.summary_model_patch(model, patch_fn=dt.summary.patch_clear_dt)
예제 #3
0
    def post_epoch(self, **kwargs):
        """Collect the epoch's validation statistics, publish and log them.

        Args:
            **kwargs: must contain ``step`` and ``epoch``. Both are only used
                in the summary log line (``epoch`` is printed as
                ``epoch + 1``).

        Returns:
            None.
        """
        step = kwargs['step']
        epoch = kwargs['epoch']

        finished_at = time.time()

        stats = self._ctx.stats
        chief = dt.train.is_chief()

        if chief:
            # The progress bar only exists on the chief worker.
            self._tqdm.close()
            self._tqdm = None

        # The chief multiplies its local throughput by the worker count;
        # every other rank stores its own local throughput.
        local_rate = self._num_total / (finished_at - self._epoch_start)
        stats.valid_speed = local_rate * hvd.size() if chief else local_rate

        # NOTE(review): mp_average presumably averages the value across
        # workers under the given name - confirm in dt.train.
        stats.valid_loss = dt.train.mp_average(self._loss_total / self._num_total,
                                               'epoch_valid_loss')

        # In validation-only runs there are no training figures to report.
        train_avg_loss = 0
        train_metric = 0
        if not self._ctx.valid_only:
            train_avg_loss = dt.train.mp_average(stats.avg_loss, 'epoch_avg_loss')
            train_metric = dt.train.mp_average(stats.train_metric, 'epoch_train_metric')

        scalars = (
            ('valid/image/s', stats.valid_speed),
            ('valid/avg_loss', train_avg_loss),
            ('valid/avg_{}'.format(stats.train_metric_name), train_metric),
            ('valid/loss', stats.valid_loss),
        )
        for tag, value in scalars:
            dt.vis.add_scalar(tag, value)

        # Turn each accumulated metric into a per-sample value, combine it
        # across workers with hvd.allreduce, and publish it.
        for idx, metric_name in enumerate(self._metric_name):
            if metric_name is None:
                continue
            self._metric_total[idx].div_(self._num_total)
            self._metric_total[idx] = hvd.allreduce(self._metric_total[idx],
                                                    name='epoch_metric_total_%d' % idx)
            dt.vis.summary_tensor('metric/{}'.format(metric_name),
                                  self._metric_total[idx])

        # The first metric is the one recorded as "the" validation metric.
        stats.valid_metric_name = self._metric_name[0]
        stats.valid_metric = self._metric_total[0].item()

        # Horovod: print output only on first rank.
        if chief:
            elapsed = time.strftime("%H:%M:%S",
                                    time.gmtime(finished_at - self._train_start))
            six_places = "{:.6f}".format
            # The log line reports the first two metrics, so both must exist.
            dt.info(dt.DC.TRAIN, '%s Epoch[%03d:lr=%.6f:gs=%06d] train (loss %s, %s %s), valid (loss %s, %s %s, %s %s), %.3f img/s' % (
                elapsed,
                epoch + 1,
                self._trainer.get_lr_val(),
                step,
                six_places(train_avg_loss),
                stats.train_metric_name,
                six_places(train_metric),
                six_places(stats.valid_loss),
                self._metric_name[0],
                six_places(self._metric_total[0].item()),
                self._metric_name[1],
                six_places(self._metric_total[1].item()),
                stats.train_speed,
            ))
        return None
예제 #4
0
    def post_config(self):
        """Apply the debug configuration and log the run's key settings.

        Configures ``dt`` debugging from ``self._opt.debug``, raises the
        level to TRACE when ``args.trace`` is set and TRACE is not already
        active, then logs name/tag, command, argv, model directory and the
        full option set together with this worker's Horovod rank.
        """
        debug_opt = self._opt.debug
        dt.dbg_cfg(level=debug_opt.level, channel=debug_opt.channel)

        # Only raise the level when tracing was requested and is not on yet.
        if self._opt.args.trace:
            if not dt.dbg_lvl(dt.DL.TRACE):
                dt.dbg_cfg(level=dt.DL.TRACE)

        # Dump the important information.
        run_args = self._opt.args
        dt.info(dt.DC.STD, "{} - {}".format(run_args.name, run_args.tag))

        rank = hvd.rank()
        command_line = "Rank {}: command [{}], argv [{}]".format(
            rank, self._command, self._argv)
        dt.info(dt.DC.STD, command_line)

        model_dir_line = "Rank {}: model_dir [{}]".format(rank, self._model_dir)
        dt.info(dt.DC.STD, model_dir_line)

        opt_line = "Rank {}: opt [{}]".format(rank, self._opt)
        dt.info(dt.DC.STD, opt_line)
예제 #5
0
    def build_model(self):
        """Create ``self._model`` from ``args.model_name`` and ``args.model_type``.

        Supported families and variants:
            * ``efficientnet``    - ``b0``..``b7`` via ``dt.model.efficientnet``
            * ``efficientnet_lm`` - ``b0``..``b7`` via ``EfficientNetLM``
            * ``efficientnet_rw`` - ``b0``..``b7`` via ``dt.model.timm``
            * ``fairnas``         - ``a``
            * ``resnet_rw``       - ``34`` or ``50`` via ``dt.model.timm``
            * any other name is looked up in ``torchvision.models``
              (an unknown name raises ``KeyError`` from that lookup).

        Returns:
            True once the model has been created.

        Raises:
            ValueError: if ``model_name`` is one of the families above but
                ``model_type`` is not a supported variant. Previously this
                case left ``self._model`` unassigned and still returned True.
        """
        dt.trace(dt.DC.MODEL, "[{}] ({}) build model".format(self.tag, type(self).__name__))
        args = self._ctx.args
        pretrained = (args.pretrained > 0)
        family = args.model_name
        variant = args.model_type

        efficientnet_variants = ('b0', 'b1', 'b2', 'b3', 'b4', 'b5', 'b6', 'b7')
        resnet_variants = ('34', '50')

        # Stays None when a known family is asked for an unknown variant.
        model = None
        if family == 'efficientnet':
            if variant in efficientnet_variants:
                # Factories are named efficientnet_b0 .. efficientnet_b7.
                factory = getattr(dt.model.efficientnet,
                                  'efficientnet_{}'.format(variant))
                model = factory(pretrained=pretrained)
        elif family == 'efficientnet_lm':
            if variant in efficientnet_variants:
                model_arch = "efficientnet-{}".format(variant)
                if pretrained:
                    model = dt.model.efficientnet.EfficientNetLM.from_pretrained(model_arch)
                else:
                    model = dt.model.efficientnet.EfficientNetLM.from_name(model_arch)
        elif family == 'efficientnet_rw':
            if variant in efficientnet_variants:
                model_arch = "efficientnet_{}".format(variant)
                model = dt.model.timm.create_model(model_arch, pretrained=pretrained)
        elif family == 'fairnas':
            if variant == 'a':
                model = dt.model.fairnas.FairNasA()
        elif family == 'resnet_rw':
            if variant in resnet_variants:
                model = dt.model.timm.create_model('resnet{}'.format(variant),
                                                   pretrained=pretrained)
        else:
            # Fall back to the torchvision constructor with the same name.
            model = torchvision.models.__dict__[family](pretrained=pretrained)

        if model is None:
            # Fail loudly instead of reporting success without a model.
            raise ValueError(
                "unsupported model_type {!r} for model_name {!r}".format(variant, family))

        self._model = model

        dt.info(dt.DC.TRAIN, "model {}, type {}, pretrained {}".format(args.model_name, args.model_type, args.pretrained))

        return True
예제 #6
0
 def post_model(self):
     """Log a forward-pass summary of ``self._model`` on the chief worker.

     The summary is run on the CPU for an input of shape
     ``(3, args.out_size, args.out_size)``. Non-chief workers do nothing
     beyond reading ``self._ctx.args``.
     """
     args = self._ctx.args
     if not dt.train.is_chief():
         return

     model = self._model
     input_shape = (3, args.out_size, args.out_size)

     dt.summary.summary_model_patch(model)
     report = dt.summary.summary_model_fwd(model, input_shape, device='cpu')
     dt.info(dt.DC.TRAIN, "\n{}".format(report))
     # NOTE(review): presumably this second patch undoes the first one
     # (patch_fn is named patch_clear_dt) - confirm in dt.summary.
     dt.summary.summary_model_patch(model, patch_fn=dt.summary.patch_clear_dt)
예제 #7
0
 def pre_train(self):
     """Log this object's tag and the trainer's device before training.

     Returns:
         None.
     """
     message = 'pre train [{}] device: {}'.format(self.tag, self.trainer.device)
     dt.info(dt.DC.TRAIN, message)
     return None
예제 #8
0
    def init(self, **kwargs):
        """Assemble the training context and initialise the trainer state.

        Args:
            **kwargs: options wrapped in ``dt.Opt`` and combined with
                ``dt.get_ctx()`` and the current ``self._ctx`` before the
                default layers below are added.

        Returns:
            ``self``.
        """
        merged = dt.Opt(kwargs) + dt.get_ctx() + self._ctx

        # Default layers, added in this order.
        # NOTE(review): these are labelled as defaults, so Opt addition
        # presumably keeps values that are already set - confirm in dt.Opt.
        default_layers = (
            # Device settings
            dt.Opt(gpu0=0),
            # Train mode
            dt.Opt(is_training=True, is_eval=False, is_pred=False),
            # Learning rate
            dt.Opt(lr_initial=0.001,
                   lr_minimal=1e-6,
                   lr_curve=[['*', 0.1, 10, 1]]),
            # Training options
            dt.Opt(optim='SGD',
                   alpha=0.9,
                   beta1=0.9,
                   beta2=0.99,
                   opt_eps=1e-6,
                   momentum=0.9,
                   weight_decay=5e-4,
                   model_dir='asset/train',
                   random_seed=0,
                   max_ep=100000,
                   save_interval=1,
                   validate_ep=1,
                   data_format=dt.dformat.DEFAULT),
            # Horovod options
            dt.Opt(fp16_allreduce=False),
            # Stats
            dt.Opt(stats=dt.Opt(avg_loss=None,
                                train_metric_name=None,
                                train_metric=None,
                                valid_loss=0,
                                valid_metric_name='',
                                valid_metric=0,
                                valid_metric_max=None,
                                train_speed=0,
                                valid_speed=0)),
            # Saver
            dt.Opt(epoch_done=-1),
        )
        for layer in default_layers:
            merged += layer

        # Publish the assembled context before anything reads it.
        self._ctx = merged

        # Device setup runs first; the log lines below read its results.
        self.init_device()

        horovod_msg = '[HOROVOD] rank {}/{}, local {}'.format(
            hvd.rank(), hvd.size(), hvd.local_rank())
        dt.info(dt.DC.TRAIN, horovod_msg)

        device_msg = '[DEVICE] use_cuda {}, device {}, gpu {}/{}, random_seed {}'.format(
            self.use_cuda, self.device, self.device_index, self.device_count,
            self._ctx.random_seed)
        dt.info(dt.DC.TRAIN, device_msg)

        # Only the chief dumps the full context.
        if is_chief():
            dt.info(dt.DC.TRAIN, '[TRAIN] ctx')
            dt.print_pp(dt.opt_to_dict(self._ctx))

        # Initialise the training variables, in this order.
        for setup in (self.init_global_step,
                      self.init_learning_rate,
                      self.init_summary,
                      self.init_saver):
            setup()

        # A seed of 0 means "do not seed".
        seed = self._ctx.random_seed
        if seed != 0:
            self.set_random_seed(seed)

        if self.use_cuda:
            # Horovod: pin GPU to local rank.
            torch.cuda.set_device(self.device_index)

        # Horovod: limit # of CPU threads to be used per worker.
        torch.set_num_threads(1)

        return self