def update_data():
    global opt_buf
    global tb_fill_idx

    # Swap the double buffer: drain the buffer that has been filling while
    # incoming packets are appended to the other one (see datalink_recv below).
    buf = opt_buf[tb_fill_idx]
    tb_fill_idx = (tb_fill_idx + 1) % 2
    print("opt_buf, {}, {}".format(len(opt_buf[0]), len(opt_buf[1])))

    new_tb = dt.Opt(time=[], display_time=[], lr=[], lr_log=[],
                    loss=[], acc=[], ep_step=[])
    new_tm = dt.Opt(time=[], display_time=[], loss_val=[],
                    top1_val=[], top5_val=[], ep_idx=[])

    for opt in buf:
        if opt.t == 'tb':
            new_tb.time.append(opt.ts)
            new_tb.display_time.append("{}".format(opt.ts))
            new_tb.lr.append(opt.lr)
            new_tb.lr_log.append(math.log(opt.lr, 10))
            new_tb.loss.append(opt.loss)
            new_tb.acc.append(opt.acc)
            new_tb.ep_step.append("{} ({})".format(opt.ep, opt.s))
        elif opt.t == 'tm':
            new_tm.time.append(opt.ts)
            new_tm.display_time.append("{}".format(opt.ts))
            new_tm.loss_val.append(opt.vals[0])
            new_tm.top1_val.append(opt.vals[1])
            new_tm.top5_val.append(opt.vals[2])
            new_tm.ep_idx.append("{} ({})".format(opt.ep, opt.idx))
    buf.clear()

    # Stream the new points, keeping at most the given number of rows (rollover).
    data_tb.stream(
        dict(time=new_tb.time, display_time=new_tb.display_time, lr=new_tb.lr,
             lr_log=new_tb.lr_log, loss=new_tb.loss, acc=new_tb.acc,
             ep_step=new_tb.ep_step), 20000)
    data_tm.stream(
        dict(time=new_tm.time, display_time=new_tm.display_time,
             loss_val=new_tm.loss_val, top1_val=new_tm.top1_val,
             top5_val=new_tm.top5_val, ep_idx=new_tm.ep_idx), 10000)
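# `data_tb` and `data_tm` are used like Bokeh ColumnDataSource objects (the
# stream(new_data, rollover) call matches Bokeh's API), so a minimal sketch of
# how they might be created, with the column names taken from the dicts above:
from bokeh.models import ColumnDataSource

data_tb = ColumnDataSource(dict(time=[], display_time=[], lr=[], lr_log=[],
                                loss=[], acc=[], ep_step=[]))
data_tm = ColumnDataSource(dict(time=[], display_time=[], loss_val=[],
                                top1_val=[], top5_val=[], ep_idx=[]))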
def metric(self, logits, labels, is_training):
    if is_training:
        correct = self.correct(logits, labels, is_training)
        acc = correct.float().sum().div_(len(labels))
        return [dt.Opt(name='top1', tensor=acc)]
    else:
        acc = dt.metric.accuracy(logits, labels, topk=(1, 5))
        return [dt.Opt(name='top1', tensor=acc[0]),
                dt.Opt(name='top5', tensor=acc[1])]
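# A sketch of the top-k accuracy helper that metric() relies on, following the
# common PyTorch recipe; the real dt.metric.accuracy may differ in detail.
def accuracy(logits, labels, topk=(1,)):
    maxk = max(topk)
    _, pred = logits.topk(maxk, dim=1)       # (N, maxk) predicted class ids
    correct = pred.eq(labels.view(-1, 1))    # broadcast against true labels
    # fraction of samples whose true class appears in the top-k predictions
    return [correct[:, :k].any(dim=1).float().mean() for k in topk]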
def summary_model_patch(model, patch_fn=patch_add_dt, **kwargs):
    class_name = model.__class__.__name__

    state = dt.Opt()
    cl = dt.create_ctx_list(args=dt.Opt(kwargs), patch_fn=patch_fn, state=state)
    with dt.ctx_cl(cl, None, level=0, key=None, path=''):
        _walk_module(cl, model)
    return True
def wrapper(tensor, **kwargs):
    # kwargs parsing
    opt = dt.Opt(kwargs) + dt.get_ctx()

    # set default train mode
    opt += dt.Opt(is_training=True, reuse=None)

    # set default data format
    opt += dt.Opt(data_format=dt.dformat.DEFAULT)

    # call sugar function
    out = func(tensor, opt)

    return out
def wrapper(**kwargs):
    # kwargs parsing
    _opt = dt.Opt(kwargs) + get_ctx()
    _out = func(_opt)
    return _out
def init_saver(self):
    # checkpoint
    self._saver = dt.Opt(
        model_latest=self.ctx.args.inst_dir + '/model_latest.pt',
        optimizer_latest=self.ctx.args.inst_dir + '/optimizer_latest.pt',
        model_best=self.ctx.args.inst_dir + '/model_best.pt',
        optimizer_best=self.ctx.args.inst_dir + '/optimizer_best.pt')
def wrapper(ctx_list, **kwargs):
    # kwargs parsing
    _opt = dt.Opt(kwargs) + get_ctx_cl(ctx_list)
    _out = func(ctx_list, _opt)
    return _out
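# The `wrapper` variants above all close over a free variable `func`, i.e.
# they are the bodies of decorators. A hedged sketch of such an enclosing
# decorator (the name `dec_sugar` is hypothetical; only the merge-then-call
# pattern is taken from the wrappers themselves):
import functools

def dec_sugar(func):
    @functools.wraps(func)
    def wrapper(tensor, **kwargs):
        opt = dt.Opt(kwargs) + dt.get_ctx()
        opt += dt.Opt(is_training=True, reuse=None)
        return func(tensor, opt)
    return wrapper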
def set_lr(lr):
    global g_lr
    # NOTE: the `True or` makes this branch unconditional, so the new value is
    # re-sent even when it equals the cached one.
    if True or g_lr != lr:
        g_lr = lr
        dt.util.datalink().send_opt(
            dt.Opt(t='cmd', a='set', key='lr', val=g_lr))
        lr_plot.title.text = "Training lr={}".format(g_lr)
def dict_to_opt(d):
    opt = dt.Opt()
    for k, v in d.items():
        if type(v) is dict:
            opt[k] = dict_to_opt(v)
        else:
            opt[k] = v
    return opt
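# Usage sketch for dict_to_opt, relying on dt.Opt's item assignment and
# attribute-style access used throughout these snippets:
cfg = dict_to_opt({'train': {'lr': 0.1, 'max_ep': 90}, 'tag': 'resnet50'})
print(cfg.train.lr)  # 0.1
print(cfg.tag)       # 'resnet50'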
def save(optimizer, fname, **kwargs):
    params = dt.Opt(kwargs)
    torch.save({
        'optimizer_state_dict': optimizer.state_dict(),
        'optimizer_params': params.to_dict(),
    }, fname)
def save(model, fname, **kwargs):
    params = dt.Opt(kwargs)
    torch.save({
        'model_state_dict': model.state_dict(),
        'model_params': params.to_dict(),
    }, fname)
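# A matching load counterpart (not in the source; the name and return value
# are assumptions), restoring the checkpoint layout written by save() above:
def load(model, fname, map_location='cpu'):
    checkpoint = torch.load(fname, map_location=map_location)
    model.load_state_dict(checkpoint['model_state_dict'])
    return checkpoint.get('model_params', {})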
def init_data(self):
    dt.trace(dt.DC.DATA, "[{}] init data".format(self.tag))

    self.train, self.valid, self.test = dt.Opt(), dt.Opt(), dt.Opt()

    self.train.batch_size = self._batch_size
    self.valid.batch_size = self._valid_size
    self.test.batch_size = self._test_size

    self.train.num_total = ImageNet.TRAIN_NUM_PER_EPOCH
    self.valid.num_total = ImageNet.VALID_NUM_PER_EPOCH
    self.test.num_total = ImageNet.TEST_NUM_PER_EPOCH

    # batches per Horovod rank: ceil(total examples / batch size / world size)
    self.train.num_batch = int(math.ceil(
        ImageNet.TRAIN_NUM_PER_EPOCH / self._batch_size / hvd.size()))
    self.valid.num_batch = int(math.ceil(
        ImageNet.VALID_NUM_PER_EPOCH / self._valid_size / hvd.size()))
    self.test.num_batch = int(math.ceil(
        ImageNet.TEST_NUM_PER_EPOCH / self._test_size / hvd.size()))

    return self
def get_ctx():
    global __global_ctx_list

    # merge current context: iterate the stack in reverse so the most recently
    # entered context is applied first and therefore takes precedence
    res = dt.Opt()
    for c in reversed(__global_ctx_list):
        res += c
    return res
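# A minimal, standalone sketch of the Opt merge semantics this module assumes:
# attribute-style access, and `a + b` keeping a's keys while b only fills in
# the missing ones (which is why `opt += dt.Opt(...)` is used for defaults
# elsewhere). Illustration only, not the real dt.Opt implementation.
class MiniOpt(dict):
    def __getattr__(self, key):
        return self.get(key)

    def __add__(self, other):
        merged = MiniOpt(other)  # start from the right operand...
        merged.update(self)      # ...and let the left operand overwrite
        return merged

a = MiniOpt(lr=0.1)
b = MiniOpt(lr=0.001, momentum=0.9)
print((a + b).lr)        # 0.1 -- left operand wins
print((a + b).momentum)  # 0.9 -- right operand fills the gap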
def patch_add_dt(module, gc):
    class_name = module.__class__.__name__
    #dt.trace(dt.DC.TRAIN, "[PATCH] level {}, path {}, key {}, class {}".format(
    #    gc.level, gc.path, gc.key, class_name))

    if not hasattr(module, '_dt_'):
        module._dt_ = dt.Opt()

    module._dt_.level = gc.level
    module._dt_.path = gc.path
    module._dt_.key = gc.key
    module._dt_.class_name = class_name
def __init__(self, ctx, **kwargs):
    self._ctx = dt.Opt(kwargs) + dt.get_ctx() + ctx

    self._use_cuda = False
    self._device = torch.device('cpu')
    self._device_index = 0
    self._device_count = 0

    self._global_step = None
    self._learning_rate = None
@contextmanager
def ctx(**kwargs):
    global __global_ctx_list

    # append current context when enter
    _cur_ctx = dt.Opt(kwargs)
    __global_ctx_list += [_cur_ctx]

    yield

    # clear current context when exit
    del __global_ctx_list[-1]
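# Usage sketch: contexts nest, and get_ctx() merges the stack with the most
# recently entered scope taking precedence (it iterates the stack in reverse,
# and `+=` only fills keys that are still missing):
with ctx(optim='SGD', lr_initial=0.1):
    with ctx(lr_initial=0.01):
        merged = get_ctx()
        # merged.lr_initial == 0.01 (inner scope wins), merged.optim == 'SGD'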
def init_data(self):
    dt.trace(dt.DC.DATA, "[{}] init data".format(self.tag))

    self.train, self.valid, self.test = dt.Opt(), dt.Opt(), dt.Opt()

    self.train.batch_size = self._batch_size
    self.valid.batch_size = self._valid_size
    self.test.batch_size = self._test_size

    self.train.num_total = Cifar10.TRAIN_NUM_PER_EPOCH
    self.valid.num_total = Cifar10.VALID_NUM_PER_EPOCH
    self.test.num_total = Cifar10.TEST_NUM_PER_EPOCH

    # batches per Horovod rank, e.g. 50,000 CIFAR-10 train images with batch
    # size 128 on 4 ranks: ceil(50000 / 128 / 4) = ceil(97.66) = 98
    self.train.num_batch = int(math.ceil(
        Cifar10.TRAIN_NUM_PER_EPOCH / self._batch_size / hvd.size()))
    self.valid.num_batch = int(math.ceil(
        Cifar10.VALID_NUM_PER_EPOCH / self._valid_size / hvd.size()))
    self.test.num_batch = int(math.ceil(
        Cifar10.TEST_NUM_PER_EPOCH / self._test_size / hvd.size()))

    self.classes = ('plane', 'car', 'bird', 'cat', 'deer',
                    'dog', 'frog', 'horse', 'ship', 'truck')

    return self
@contextmanager
def ctx_cl(ctx_list, opt, **kwargs):
    global __global_ctx_list

    # append current context when enter
    if opt is None:
        _cur_ctx = dt.Opt(kwargs)
    else:
        _cur_ctx = opt + dt.Opt(kwargs)

    if ctx_list is None:
        __global_ctx_list += [_cur_ctx]
    else:
        ctx_list += [_cur_ctx]

    yield

    # clear current context when exit
    if ctx_list is None:
        del __global_ctx_list[-1]
    else:
        del ctx_list[-1]
def get_ctx_cl(ctx_list):
    global __global_ctx_list

    # merge current context
    res = dt.Opt()
    if ctx_list is None:
        for c in reversed(__global_ctx_list):
            res += c
    else:
        for c in reversed(ctx_list):
            res += c
    return res
def datalink_recv(socket, packet):
    opt = dt.Opt().loads(packet._data.decode())
    opt_buf[tb_fill_idx].append(opt)
    print(opt)
def create_ctx_list(**kwargs):
    _cur_ctx = dt.Opt(kwargs)
    return [_cur_ctx]
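# Usage sketch for the explicit context-list variants: a local list acts as a
# private context stack, so separate walks (e.g. summary_model_patch above)
# do not touch the global context:
cl = create_ctx_list(state=dt.Opt())
with ctx_cl(cl, None, level=0, key=None, path=''):
    merged = get_ctx_cl(cl)   # merges only the entries pushed onto `cl`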
def build_data(self):
    dt.trace(dt.DC.MODEL, "[{}] ({}) build data".format(self.tag, type(self).__name__))

    args = self._ctx.args
    data = dt.data.Mnist(batch_size=args.batch_size, valid_size=args.valid_size,
                         num_workers=1, pin_memory=self.use_cuda)
    data.init_data()
    data.load_data()

    self._data = data
    return True

def build_model(self):
    dt.trace(dt.DC.MODEL, "[{}] ({}) build model".format(self.tag, type(self).__name__))

    self._model = MnistNet()
    #model = torchvision.models.resnet50(False)
    # Have ResNet model take in grayscale rather than RGB
    #model.conv1 = torch.nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3, bias=False)
    return True

# Train
with dt.ctx(optim=ARGS.optim, lr_initial=ARGS.lr_initial,
            lr_minimal=ARGS.lr_minimal, lr_curve=ARGS.lr_curve):
    dt.train.train(args=ARGS, est_class=MnistEstimator, est_cfg=dt.Opt(),
                   batch_size=ARGS.batch_size, valid_size=ARGS.valid_size,
                   summary_freq=ARGS.summary_freq,
                   validate_ep=ARGS.validate_ep, max_ep=ARGS.max_ep,
                   model_dir=ARGS.model_dir, save_interval=ARGS.save_interval,
                   beta1=ARGS.beta1, beta2=ARGS.beta2,
                   momentum=ARGS.momentum, weight_decay=ARGS.weight_decay,
                   random_seed=1 * (hvd.rank() + 1),
                   deferred=ARGS.deferred)
class DbgLvl(IntEnum):
    # lower values are more verbose; NONE/NOTSET and MAX/TRACE are aliases
    NONE = 0
    NOTSET = 0
    MAX = 5
    TRACE = 5
    DEBUG = 10
    MED = 15
    INFO = 20
    WARNING = 30
    MIN = 35
    ERROR = 40
    CRITICAL = 50

_dbg_cfg = dt.Opt()
_dbg_cfg += dt.Opt(level=DbgLvl.MAX, channel=DbgChn.ALL)

def dbg_cfg_val():
    global _dbg_cfg
    return _dbg_cfg

def dbg_cfg(**kwargs):
    global _dbg_cfg
    # apply the given overrides (`*=`, in contrast to the default-filling `+=`)
    _dbg_cfg *= dt.Opt(kwargs)
    if dbg_vld(DbgChn.STD, DbgLvl.DEBUG):
        dt.print_pp(_dbg_cfg)
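# Usage sketch, assuming `*=` overwrites existing keys (unlike the
# default-filling `+=` used at module load): switch the debug level and
# restrict output to the standard channel.
dbg_cfg(level=DbgLvl.INFO, channel=DbgChn.STD)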
def __init__(self, ctx, **kwargs):
    self._ctx = dt.Opt(kwargs) + ctx
def __init__(self, ctx, **kwargs):
    self._ctx = dt.Opt(kwargs) + ctx
    self._callbacks = []
def init(self, **kwargs):
    opt = dt.Opt(kwargs) + dt.get_ctx() + self._ctx

    # Set default device settings
    opt += dt.Opt(gpu0=0)

    # Set default train mode
    opt += dt.Opt(is_training=True, is_eval=False, is_pred=False)

    # Learning rate
    opt += dt.Opt(lr_initial=0.001, lr_minimal=1e-6, lr_curve=[['*', 0.1, 10, 1]])

    # Default training options
    opt += dt.Opt(optim='SGD', alpha=0.9, beta1=0.9, beta2=0.99, opt_eps=1e-6,
                  momentum=0.9, weight_decay=5e-4,
                  model_dir='asset/train', random_seed=0, max_ep=100000,
                  save_interval=1, validate_ep=1,
                  data_format=dt.dformat.DEFAULT)

    # Default horovod options
    opt += dt.Opt(fp16_allreduce=False)

    # Stats
    opt += dt.Opt(stats=dt.Opt(avg_loss=None, train_metric_name=None,
                               train_metric=None, valid_loss=0,
                               valid_metric_name='', valid_metric=0,
                               valid_metric_max=None,
                               train_speed=0, valid_speed=0))

    # Saver
    opt += dt.Opt(epoch_done=-1)

    # Update ctx
    self._ctx = opt

    # Initialize device
    self.init_device()

    dt.info(dt.DC.TRAIN, '[HOROVOD] rank {}/{}, local {}'.format(
        hvd.rank(), hvd.size(), hvd.local_rank()))
    dt.info(dt.DC.TRAIN, '[DEVICE] use_cuda {}, device {}, gpu {}/{}, random_seed {}'.format(
        self.use_cuda, self.device, self.device_index, self.device_count,
        self._ctx.random_seed))

    if is_chief():
        dt.info(dt.DC.TRAIN, '[TRAIN] ctx')
        dt.print_pp(dt.opt_to_dict(self._ctx))

    # Initialize training variables
    self.init_global_step()
    self.init_learning_rate()
    self.init_summary()
    self.init_saver()

    if self._ctx.random_seed != 0:
        self.set_random_seed(self._ctx.random_seed)

    if self.use_cuda:
        # Horovod: pin GPU to local rank.
        torch.cuda.set_device(self.device_index)

    # Horovod: limit # of CPU threads to be used per worker.
    torch.set_num_threads(1)

    return self
            momentum=self._ctx.momentum,
            weight_decay=self._ctx.weight_decay,
            warmup=0)
    elif self._ctx.optim == 'SGD':
        self._optimizer = optim.SGD(self._model.parameters(),
                                    lr=self.trainer.get_lr_val(),
                                    momentum=self._ctx.momentum,
                                    weight_decay=self._ctx.weight_decay)
    else:
        self._optimizer = None

    return True

# Train
ctx = dt.Opt(args=ARGS, optim=ARGS.optim, data_format=ARGS.data_format,
             lr_initial=ARGS.lr_initial, lr_minimal=ARGS.lr_minimal,
             lr_curve=ARGS.lr_curve,
             batch_size=ARGS.batch_size, valid_size=ARGS.valid_size,
             validate_ep=ARGS.validate_ep, max_ep=ARGS.max_ep,
             model_dir=ARGS.model_dir, save_interval=ARGS.save_interval,
             alpha=ARGS.alpha, beta1=ARGS.beta1, beta2=ARGS.beta2,
             opt_eps=ARGS.opt_eps,
             momentum=ARGS.momentum, weight_decay=ARGS.weight_decay,
             random_seed=dt.util.random_int(1, 999999),
             gpu0=ARGS.gpu0, valid_only=ARGS.valid_only)

est = ImageNetEstimator(ctx)
est.build_flow()

trainer = dt.train.Trainer(ctx).init()
trainer.bind_estimator(est)
trainer.train_setup()
trainer.train_begin()
trainer.train()
def init_config(self):
    self._config = configparser.ConfigParser(
        interpolation=configparser.ExtendedInterpolation())
    self._config.read(self._args.c)

    if 'args' not in self._config:
        self._config['args'] = {}
    if 'debug' not in self._config:
        self._config['debug'] = {}

    self._opt = dt.Opt()

    # load config to opt
    for section in self._config.sections():
        opt = dt.Opt()
        for key in self._config[section]:
            val_str = self._config[section][key]
            val = json.loads(val_str)
            opt[key] = val
        self._opt[section] = opt

    # override with command line args
    for arg in vars(self._args):
        val = getattr(self._args, arg)
        if val is None:
            continue
        val_str = str(val)
        if arg in self._opt.args or arg in self._default_config['args']:
            if arg in self._opt.args:
                opt_val = self._opt.args[arg]
            else:
                opt_val = self._default_config['args'][arg]
            if isinstance(opt_val, str):
                val_str = '"' + val_str + '"'
            if type(opt_val) is not type(val):
                dt.log(log.DC.STD, "[Convert Arg] {}: {}, {} => {}".format(
                    arg, val, type(val), type(opt_val)))
                self._opt.args[arg] = json.loads(val_str)
            else:
                self._opt.args[arg] = val
                self._config['args'][arg] = val_str
        else:
            self._opt.args[arg] = val
            self._config['args'][arg] = val_str

    # add default settings
    for section in self._default_config:
        opt = dt.Opt()
        for key in self._default_config[section]:
            if key not in self._opt[section]:
                opt[key] = self._default_config[section][key]
                self._config[section][key] = json.dumps(opt[key])
        self._opt[section] += opt

    # additional post process
    if self._opt.args.add is not None and type(self._opt.args.add) is dict:
        self._opt.args.add = dt.util.dict_to_opt(self._opt.args.add)
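# A hypothetical config file for init_config: each value is a JSON literal
# (json.loads is applied per key), and configparser.ExtendedInterpolation
# allows ${section:key} references. Section and key names are illustrative:
#
#   [args]
#   batch_size = 128
#   model_dir = "asset/train"
#   lr_initial = 0.001
#
#   [debug]
#   level = 10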