def init_learning_rate(self):
    """Install the initial learning rate and log the LR schedule settings."""
    # Horovod: scale learning rate by the number of GPUs.
    self.set_lr_val_mp(self.ctx.lr_initial)
    message = 'Initialize learning rate: initial {} * {}, minimal {}, curve {}'.format(
        self.ctx.lr_initial, hvd.size(), self.ctx.lr_minimal, self.ctx.lr_curve)
    dt.info(dt.DC.TRAIN, message)
def post_model(self):
    """On the chief rank, patch the model for summarization, log a
    forward-pass summary, then clear the patch."""
    if not dt.train.is_chief():
        return
    dt.summary.summary_model_patch(self._model)
    # assumes a 3x32x32 input (CIFAR-sized) — summary runs on CPU only
    fwd_summary = dt.summary.summary_model_fwd(self._model, (3, 32, 32), device='cpu')
    dt.info(dt.DC.TRAIN, "\n{}".format(fwd_summary))
    # Re-patch with the clearing hook so later passes are unaffected.
    dt.summary.summary_model_patch(self._model, patch_fn=dt.summary.patch_clear_dt)
def post_epoch(self, **kwargs):
    """End-of-epoch validation hook: compute per-rank and averaged stats,
    publish scalars/tensors to the visualizer, and log a summary line on
    the chief rank.

    Expects ``kwargs['step']`` and ``kwargs['epoch']`` (raises KeyError if
    absent). Returns None.
    """
    step = kwargs['step']
    epoch = kwargs['epoch']
    now_time = time.time()
    if dt.train.is_chief():
        # Chief owns the progress bar; close it for this epoch.
        self._tqdm.close()
        self._tqdm = None
        # NOTE(review): chief multiplies per-rank throughput by hvd.size()
        # (cluster-wide img/s?) while non-chief ranks keep the local rate —
        # confirm this asymmetry is intentional.
        self._ctx.stats.valid_speed = self._num_total / (now_time - self._epoch_start) * hvd.size()
    else:
        self._ctx.stats.valid_speed = self._num_total / (now_time - self._epoch_start)
    # Average the per-rank mean validation loss across all ranks.
    self._ctx.stats.valid_loss = dt.train.mp_average(self._loss_total/self._num_total, 'epoch_valid_loss')
    #dt.trace(dt.DC.TRAIN, '[EPOCH {}] local {}, avg {}'.format(epoch, self._loss_total/self._num_total, self._ctx.stats.valid_loss))
    if self._ctx.valid_only:
        # No training happened this epoch; report zeros for train stats.
        train_avg_loss = 0
        train_metric = 0
    else:
        train_avg_loss = dt.train.mp_average(self._ctx.stats.avg_loss, 'epoch_avg_loss')
        train_metric = dt.train.mp_average(self._ctx.stats.train_metric, 'epoch_train_metric')
    dt.vis.add_scalar('valid/image/s', self._ctx.stats.valid_speed)
    # NOTE(review): train-side averages are charted under the 'valid/' tag —
    # presumably so train/valid curves share one dashboard; verify.
    dt.vis.add_scalar('valid/avg_loss', train_avg_loss)
    dt.vis.add_scalar('valid/avg_{}'.format(self._ctx.stats.train_metric_name), train_metric)
    dt.vis.add_scalar('valid/loss', self._ctx.stats.valid_loss)
    for i, val in enumerate(self._metric_name):
        if self._metric_name[i] is not None:
            # Convert the running sum to a per-sample mean in place, then
            # average it across ranks before publishing.
            self._metric_total[i].div_(self._num_total)
            self._metric_total[i] = hvd.allreduce(self._metric_total[i], name='epoch_metric_total_%d' % i)
            dt.vis.summary_tensor('metric/{}'.format(self._metric_name[i]), self._metric_total[i])
    # The first metric is treated as the primary validation metric.
    self._ctx.stats.valid_metric_name = self._metric_name[0]
    self._ctx.stats.valid_metric = self._metric_total[0].item()
    # Horovod: print output only on first rank.
    if dt.train.is_chief():
        # NOTE(review): the trailing img/s field logs train_speed, not the
        # valid_speed computed above — confirm that is intended.
        dt.info(dt.DC.TRAIN,
                '%s Epoch[%03d:lr=%.6f:gs=%06d] train (loss %s, %s %s), valid (loss %s, %s %s, %s %s), %.3f img/s' %
                (time.strftime("%H:%M:%S", time.gmtime(now_time - self._train_start)),
                 (epoch+1), self._trainer.get_lr_val(), step,
                 "{:.6f}".format(train_avg_loss),
                 self._ctx.stats.train_metric_name,
                 "{:.6f}".format(train_metric),
                 "{:.6f}".format(self._ctx.stats.valid_loss),
                 self._metric_name[0],
                 "{:.6f}".format(self._metric_total[0].item()),
                 self._metric_name[1],
                 "{:.6f}".format(self._metric_total[1].item()),
                 self._ctx.stats.train_speed))
    return None
def post_config(self):
    """Apply debug-channel configuration and dump run metadata per rank."""
    # Configure debugging from options; force TRACE level when requested.
    dt.dbg_cfg(level=self._opt.debug.level, channel=self._opt.debug.channel)
    if self._opt.args.trace and not dt.dbg_lvl(dt.DL.TRACE):
        dt.dbg_cfg(level=dt.DL.TRACE)

    # Dump important information (rank-tagged so multi-process logs interleave legibly).
    rank = hvd.rank()
    dt.info(dt.DC.STD, "{} - {}".format(self._opt.args.name, self._opt.args.tag))
    dt.info(dt.DC.STD,
            "Rank {}: command [{}], argv [{}]".format(rank, self._command, self._argv))
    dt.info(dt.DC.STD, "Rank {}: model_dir [{}]".format(rank, self._model_dir))
    dt.info(dt.DC.STD, "Rank {}: opt [{}]".format(rank, self._opt))
def build_model(self):
    """Instantiate the network selected by ``ctx.args`` into ``self._model``.

    Model is chosen by ``args.model_name`` ('efficientnet',
    'efficientnet_lm', 'efficientnet_rw', 'fairnas', 'resnet_rw', or any
    torchvision.models name) and refined by ``args.model_type``.
    ``args.pretrained > 0`` requests pretrained weights.

    Returns:
        True always. As in the original, an unknown ``model_type`` for a
        known ``model_name`` silently leaves ``self._model`` unset.
    """
    dt.trace(dt.DC.MODEL, "[{}] ({}) build model".format(self.tag, type(self).__name__))
    args = self._ctx.args
    pretrained = (args.pretrained > 0)
    # Valid EfficientNet variants; replaces three duplicated b0..b7 chains.
    efficientnet_types = ('b0', 'b1', 'b2', 'b3', 'b4', 'b5', 'b6', 'b7')

    if args.model_name == 'efficientnet':
        if args.model_type in efficientnet_types:
            # getattr dispatch replaces the original 8-branch elif chain;
            # resolves e.g. dt.model.efficientnet.efficientnet_b0.
            builder = getattr(dt.model.efficientnet, 'efficientnet_' + args.model_type)
            self._model = builder(pretrained=pretrained)
    elif args.model_name == 'efficientnet_lm':
        if args.model_type in efficientnet_types:
            model_arch = "efficientnet-{}".format(args.model_type)
            if pretrained:
                self._model = dt.model.efficientnet.EfficientNetLM.from_pretrained(model_arch)
            else:
                self._model = dt.model.efficientnet.EfficientNetLM.from_name(model_arch)
    elif args.model_name == 'efficientnet_rw':
        if args.model_type in efficientnet_types:
            model_arch = "efficientnet_{}".format(args.model_type)
            self._model = dt.model.timm.create_model(model_arch, pretrained=pretrained)
    elif args.model_name == 'fairnas':
        if args.model_type == 'a':
            self._model = dt.model.fairnas.FairNasA()  # 8-gpu
    elif args.model_name == 'resnet_rw':
        #if dt.train.is_chief():
        #    dt.print_pp(dt.model.timm.list_models())
        if args.model_type == '34':
            self._model = dt.model.timm.create_model('resnet34', pretrained=pretrained)
        elif args.model_type == '50':
            self._model = dt.model.timm.create_model('resnet50', pretrained=pretrained)
    else:
        # Fall back to any torchvision model constructor by name.
        #if dt.train.is_chief():
        #    dt.print_pp(torchvision.models.__dict__)
        self._model = torchvision.models.__dict__[args.model_name](pretrained=pretrained)

    dt.info(dt.DC.TRAIN,
            "model {}, type {}, pretrained {}".format(args.model_name, args.model_type, args.pretrained))
    return True
def post_model(self):
    """On the chief rank, log a CPU forward-pass summary of the model
    using the configured square input size, then clear the summary patch."""
    args = self._ctx.args
    if not dt.train.is_chief():
        return
    dt.summary.summary_model_patch(self._model)
    input_shape = (3, args.out_size, args.out_size)
    fwd_summary = dt.summary.summary_model_fwd(self._model, input_shape, device='cpu')
    dt.info(dt.DC.TRAIN, "\n{}".format(fwd_summary))
    # Re-patch with the clearing hook so later passes are unaffected.
    dt.summary.summary_model_patch(self._model, patch_fn=dt.summary.patch_clear_dt)
def pre_train(self):
    """Log the training device just before the training loop starts."""
    message = 'pre train [{}] device: {}'.format(self.tag, self.trainer.device)
    dt.info(dt.DC.TRAIN, message)
    return None
def init(self, **kwargs): opt = dt.Opt(kwargs) + dt.get_ctx() + self._ctx # Set default device settings opt += dt.Opt(gpu0=0) # Set default train mode opt += dt.Opt(is_training=True, is_eval=False, is_pred=False) # Learning rate opt += dt.Opt(lr_initial=0.001, lr_minimal=1e-6, lr_curve=[['*', 0.1, 10, 1]]) # Default training options opt += dt.Opt(optim='SGD', alpha=0.9, beta1=0.9, beta2=0.99, opt_eps=1e-6, momentum=0.9, weight_decay=5e-4, model_dir='asset/train', random_seed=0, max_ep=100000, save_interval=1, validate_ep=1, data_format=dt.dformat.DEFAULT) # Default horovod options opt += dt.Opt(fp16_allreduce=False) # Stats opt += dt.Opt(stats=dt.Opt(avg_loss=None, train_metric_name=None, train_metric=None, valid_loss=0, valid_metric_name='', valid_metric=0, valid_metric_max=None, train_speed=0, valid_speed=0)) # Saver opt += dt.Opt(epoch_done=-1) # Update ctx self._ctx = opt # Initialize device self.init_device() dt.info( dt.DC.TRAIN, '[HOROVOD] rank {}/{}, local {}'.format(hvd.rank(), hvd.size(), hvd.local_rank())) dt.info( dt.DC.TRAIN, '[DEVICE] use_cuda {}, device {}, gpu {}/{}, random_seed {}'. format(self.use_cuda, self.device, self.device_index, self.device_count, self._ctx.random_seed)) if is_chief(): dt.info(dt.DC.TRAIN, '[TRAIN] ctx') dt.print_pp(dt.opt_to_dict(self._ctx)) # Initialize training variables self.init_global_step() self.init_learning_rate() self.init_summary() self.init_saver() if self._ctx.random_seed != 0: self.set_random_seed(self._ctx.random_seed) if self.use_cuda: # Horovod: pin GPU to local rank. torch.cuda.set_device(self.device_index) # Horovod: limit # of CPU threads to be used per worker. torch.set_num_threads(1) return self