def test(ctx=mx.cpu()):
    """Train a small two-block network and log its L1 loss to mxboard.

    The network is two ``base_net`` blocks built with ``special_conv``
    followed by a 3-unit dense layer. It is optimised with SGD for 10000
    iterations. ``img`` and ``target`` are not defined in this function and
    must be provided by the enclosing module.

    Parameters
    ----------
    ctx : mxnet context
        Context passed to ``init_s`` when initialising the network.
    """
    from mxboard import SummaryWriter
    from mxnet import gluon, autograd

    sw = SummaryWriter(logdir='sphere_dynamic', flush_secs=5)
    # try/finally guarantees the event file is closed even if training fails.
    try:
        net = nn.Sequential()
        b1 = base_net(48, 3, fun=special_conv, kernel_size=(3, 3), same_shape=False)
        b2 = base_net(1, 48, fun=special_conv, kernel_size=(3, 3), same_shape=False)
        fc = nn.Dense(3, in_units=9)
        net.add(b1, b2, fc)
        init_s(net, ctx)

        trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': 0.01})
        for i in range(10000):
            with autograd.record():
                out = net(img)
                loss = nd.sum(nd.abs(out - target))
            loss.backward()
            trainer.step(2)

            # Fetch the scalar once; each asscalar() call blocks on the device.
            loss_value = loss.asscalar()
            sw.add_scalar(tag='loss', value=loss_value, global_step=i)
            if i % 100 == 0:
                # %-formatting prints "<step> <loss>" on both Python 2 and 3.
                print("%s %s" % (i, loss_value))
    finally:
        sw.close()
def test_add_scalar():
    """Write one random scalar at step 0, close the writer, then run the log-dir check."""
    writer = SummaryWriter(logdir=_LOGDIR)
    sample = np.random.uniform()
    writer.add_scalar(tag='test_add_scalar', value=sample, global_step=0)
    # Close first so the event file is complete before the helper inspects it.
    writer.close()
    check_event_file_and_remove_logdir()
class CallBackLogging(object):
    """Batch callback that records loss and throughput with a SummaryWriter.

    Parameters
    ----------
    rank : int
        Rank of this worker; only rank 0 emits the textual log line.
    size : int
        Multiplier applied to the local speed to obtain the total speed.
    prefix_dir : str
        Base directory; events go to
        ``<prefix_dir>/log_tensorboard/<month>_<day>_<hour>``.
    """

    def __init__(self, rank, size, prefix_dir):
        self.rank = rank
        self.size = size
        self.prefix_dir = prefix_dir
        # Batch size and reporting period come from the module-level config.
        self.batch_size = config.batch_size
        self.frequent = config.frequent
        self.init = False
        self.tic = 0
        self.last_count = 0
        self.loss_metric = MetricNdarray()

        now = time.localtime()
        run_name = "%s_%s_%s" % (str(now.tm_mon), str(now.tm_mday), str(now.tm_hour))
        self.summary_writer = SummaryWriter(
            logdir=os.path.join(self.prefix_dir, "log_tensorboard", run_name),
            verbose=False)

    def __call__(self, param):
        """Accumulate the batch loss and, every ``frequent`` updates, report speed and loss.

        ``param`` must expose ``num_update``, ``num_epoch`` and ``loss``
        (only ``loss[0]`` is used).
        """
        step = param.num_update

        # An update counter that moved backwards restarts the timing window.
        if self.last_count > step:
            self.init = False
        self.last_count = step

        self.loss_metric.update(param.loss[0])

        if not self.init:
            # First call of a window: just arm the timer.
            self.init = True
            self.tic = time.time()
            return

        if step % self.frequent != 0:
            return

        nd.waitall()
        try:
            speed = self.frequent * self.batch_size / (time.time() - self.tic)
            speed_total = speed * self.size
        except ZeroDivisionError:
            speed = float('inf')
            speed_total = float('inf')

        # summary loss
        loss_scalar = self.loss_metric.get()
        self.summary_writer.add_scalar(tag="loss", value=loss_scalar, global_step=step)
        loss_str_format = "[%d][%s]:%.2f " % (param.num_epoch, "loss", loss_scalar)
        self.loss_metric.reset()

        # summary speed
        self.summary_writer.add_scalar(tag="speed", value=speed, global_step=step)
        self.summary_writer.flush()

        if self.rank == 0:
            logging.info(
                "Iter:%d Rank:%.2f it/sec Total:%.2f it/sec %s",
                step, speed, speed_total, loss_str_format)
        self.tic = time.time()
def train(transformer, data_iter, lr, num_epochs, vocab, ctx):
    """Train ``transformer`` with Adam and log losses through a SummaryWriter.

    Each batch is split across ``ctx`` with ``gutils.split_and_load``, the
    per-device losses from ``batch_loss`` are back-propagated, and the summed
    loss is accumulated. After each epoch the mean batch loss is computed;
    when it improves on the best seen so far, the parameters are saved to
    ``../model/transformer<epoch>.params``.

    Parameters
    ----------
    transformer : gluon block
        Model whose parameters are optimised and saved.
    data_iter : iterable with ``len()``
        Yields ``(X, Y, label, X_valid_len, Y_valid_len)`` tuples.
    lr : float
        Learning rate for Adam.
    num_epochs : int
        Number of passes over ``data_iter``.
    vocab : object
        Forwarded unchanged to ``batch_loss``.
    ctx : list of contexts
        Forwarded to ``split_and_load``. Presumably a list of devices; confirm
        against the caller.
    """
    print('start training')
    print('ctx:', ctx)
    trainer = gluon.Trainer(transformer.collect_params(), 'adam', {'learning_rate': lr})
    loss = gloss.SoftmaxCrossEntropyLoss()
    best_epoch = 0
    best_loss = float('Inf')
    num_batches = len(data_iter)

    sw = SummaryWriter(logdir='../logs', flush_secs=5)
    # try/finally closes the writer even when training aborts.
    try:
        for epoch in range(num_epochs):
            l_sum = 0.0
            for i, data in enumerate(data_iter):
                # The two valid-length entries are unpacked but not used here.
                X, Y, label, _, _ = data
                gpu_Xs = gutils.split_and_load(X, ctx, even_split=False)
                gpu_Ys = gutils.split_and_load(Y, ctx, even_split=False)
                gpu_labels = gutils.split_and_load(label, ctx, even_split=False)

                with autograd.record():
                    ls = [batch_loss(transformer, gpu_X, gpu_Y, gpu_label, vocab, loss)
                          for gpu_X, gpu_Y, gpu_label in zip(gpu_Xs, gpu_Ys, gpu_labels)]

                b_loss = 0.0
                for l in ls:
                    l.backward()
                    b_loss += l.asscalar()
                trainer.step(X.shape[0])
                nd.waitall()

                l_sum += b_loss
                if i % 100 == 0:
                    info = "epoch %d, batch %d, batch_loss %.3f" % (epoch, i, b_loss)
                    print(info)
                    # Use a step that keeps increasing across epochs; a bare
                    # batch index would plot every epoch over the previous one.
                    sw.add_scalar(tag='batch_loss', value=b_loss,
                                  global_step=epoch * num_batches + i)

            cur_loss = l_sum / num_batches

            # Save the model whenever the epoch loss improves.
            if cur_loss < best_loss:
                best_loss = cur_loss
                best_epoch = epoch
                if not os.path.exists('../model'):
                    os.mkdir('../model')
                transformer.save_parameters('../model/transformer' + str(epoch) + '.params')

            info = "epoch %d, loss %.3f, best_loss %.3f, best_epoch %d" % (
                epoch, cur_loss, best_loss, best_epoch)
            print(info)
            sw.add_scalar(tag='loss', value=cur_loss, global_step=epoch)
    finally:
        sw.close()
class TensorboardStatsWriter(StatsWriter):
    """StatsWriter that also sends every written value to an mxboard SummaryWriter."""

    def __init__(self, save_path=None, **kwargs):
        """Create the event directory and the writer, then initialise the base class.

        Parameters
        ----------
        save_path : str
            Directory for the event files. The base class receives
            ``<save_path>/save`` as its own ``save_path``.
        **kwargs
            Forwarded unchanged to ``StatsWriter.__init__``.

        Raises
        ------
        ValueError
            If ``save_path`` is empty or ``None``.
        """
        if not save_path:
            raise ValueError('save_path not specified')
        from mxboard import SummaryWriter

        event_dir = save_path
        os.makedirs(event_dir, exist_ok=True)
        self.sw = SummaryWriter(logdir=event_dir)
        StatsWriter.__init__(self, save_path=os.path.join(event_dir, 'save'), **kwargs)

    def _write(self, idx, key, value):
        """Log ``value`` under tag ``key`` with ``idx`` as the global step."""
        self.sw.add_scalar(key, value, idx)
class TensorboardCallback(object):
    """Log metrics periodically in TensorBoard.

    This callback writes TensorBoard event files (through ``mxboard``) so that
    metrics can be visualised. If ``mxboard`` cannot be imported an error is
    logged and the callback becomes a no-op instead of failing later.

    Parameters
    ----------
    logging_dir : str
        TensorBoard event file directory.
        Use `tensorboard --logdir=path/to/logs` to launch the visualization.
    total_step : int
        Initial value of the internal step counter used as ``global_step``.
    prefix : str
        Prefix for a metric name of `scalar` value. Metrics are logged as
        ``<prefix>-<name>``, which lets train and eval curves that share a
        metric name be told apart.
    """

    def __init__(self, logging_dir, total_step=0, prefix=None):
        self.prefix = prefix
        self.step = total_step
        # Stays None when mxboard is missing so later calls can skip logging.
        self.summary_writer = None
        try:
            from mxboard import SummaryWriter
        except ImportError:
            logging.error('You can install mxboard via `pip install mxboard`.')
        else:
            self.summary_writer = SummaryWriter(logging_dir)

    def __call__(self, param, name_value=None):
        """Callback to log training speed and metrics in TensorBoard.

        ``name_value`` (an iterable of ``(name, value)`` pairs), if given, is
        logged at the current step. Then, when ``param.eval_metric`` is not
        ``None``, the step counter is advanced and the metric's name/value
        pairs are logged at the new step.
        """
        if name_value:
            self._add_scalar(name_value)
        if param.eval_metric is None:
            return
        # The step advances on every call that carries an eval metric.
        self.step += 1
        name_value = param.eval_metric.get_name_value()
        if name_value:
            self._add_scalar(name_value)

    def _add_scalar(self, name_value):
        """Write each ``(name, value)`` pair as a scalar at the current step."""
        if self.summary_writer is None:
            # mxboard was unavailable at construction time; nothing to write to.
            return
        for name, value in name_value:
            if self.prefix is not None:
                name = '%s-%s' % (self.prefix, name)
            self.summary_writer.add_scalar(tag=name, value=value, global_step=self.step)
class LogMetricsCallback(object):
    """Log metrics periodically in TensorBoard.

    This callback writes TensorBoard event files (through ``mxboard``) so that
    metrics can be visualised. If ``mxboard`` cannot be imported an error is
    logged and the callback becomes a no-op instead of failing later.

    Parameters
    ----------
    logging_dir : str
        TensorBoard event file directory.
        Use `tensorboard --logdir=path/to/logs` to launch the visualization.
    prefix : str
        Prefix for a metric name of `scalar` value. Metrics are logged as
        ``<prefix>-<name>``; TensorBoard plots curves with the same name in
        one graph, so a prefix keeps train and eval metrics apart.

    Examples
    --------
    >>> # log train and eval metrics under different directories.
    >>> training_log = 'logs/train'
    >>> evaluation_log = 'logs/eval'
    >>> # in this case, each training and evaluation metric pair has the same name;
    >>> # you can add a prefix to keep them separate.
    >>> batch_end_callbacks = [mx.contrib.tensorboard.LogMetricsCallback(training_log)]
    >>> eval_end_callbacks = [mx.contrib.tensorboard.LogMetricsCallback(evaluation_log)]
    >>> # run
    >>> model.fit(train,
    >>>     ...
    >>>     batch_end_callback = batch_end_callbacks,
    >>>     eval_end_callback  = eval_end_callbacks)
    >>> # Then use `tensorboard --logdir=logs/` to launch TensorBoard visualization.
    """

    def __init__(self, logging_dir, prefix=None):
        self.prefix = prefix
        # Stays None when mxboard is missing so __call__ can skip logging.
        self.summary_writer = None
        try:
            from mxboard import SummaryWriter
        except ImportError:
            logging.error('You can install mxboard via `pip install mxboard`.')
        else:
            self.summary_writer = SummaryWriter(logging_dir)

    def __call__(self, param):
        """Callback to log training speed and metrics in TensorBoard.

        Writes every ``(name, value)`` pair of ``param.eval_metric`` as a
        scalar, using ``param.epoch`` as the global step. Does nothing when
        the metric is ``None`` or no writer could be created.
        """
        if self.summary_writer is None or param.eval_metric is None:
            return
        for name, value in param.eval_metric.get_name_value():
            if self.prefix is not None:
                name = '%s-%s' % (self.prefix, name)
            self.summary_writer.add_scalar(name, value, global_step=param.epoch)
def test_add_multiple_scalars():
    """Log a plain, a tuple, a list and a dict scalar under one tag and check the layout.

    Expected layout under ``_LOGDIR``: one event file plus a directory named
    after the tag, which holds ``scalar1`` .. ``scalar4`` sub-directories with
    exactly one event file each.
    """
    tag = 'test_multiple_scalars'
    sw = SummaryWriter(logdir=_LOGDIR)
    payloads = [
        np.random.uniform(),
        ('scalar1', np.random.uniform()),
        ['scalar2', np.random.uniform()],
        {
            'scalar3': np.random.uniform(),
            'scalar4': np.random.uniform()
        },
    ]
    for payload in payloads:
        sw.add_scalar(tag=tag, value=payload, global_step=0)

    entries = os.listdir(_LOGDIR)
    assert len(entries) == 2
    assert tag in entries
    entries.remove(tag)
    # The only remaining entry must be the top-level event file.
    event_file = entries[0]
    assert event_file.startswith(_EVENT_FILE_PREFIX)
    print(event_file)
    assert file_exists(os.path.join(_LOGDIR, event_file))

    named_scalar_dir = os.path.join(_LOGDIR, tag)
    assert dir_exists(named_scalar_dir)
    for idx in range(1, 5):
        sub_dir = os.path.join(named_scalar_dir, 'scalar%d' % idx)
        assert dir_exists(sub_dir)
        sub_entries = os.listdir(sub_dir)
        assert len(sub_entries) == 1
        assert sub_entries[0].startswith(_EVENT_FILE_PREFIX)
def main(lstm, train_data, train_label):
    """Train ``lstm`` for 2000 epochs and log losses through a SummaryWriter.

    Each epoch resets the hidden and cell state to zeros, runs two batches
    (``train_data[0..1]`` / ``train_label[0..1]``), applies ``lstm.SGD`` after
    each batch and tracks an exponential moving average of the mean loss. The
    learning rate starts at 0.1 and is halved every 50 epochs. After every
    epoch the moving loss is logged and two samples are printed.

    Parameters
    ----------
    lstm : object
        Must provide ``lstm_rnn(inputs, h, c)``, ``average_ce_loss(outputs,
        labels)`` and ``SGD(learning_rate)``.
    train_data, train_label : indexable
        Per-batch NDArrays; only indices 0 and 1 are read.
    """
    batch_size = 30
    epochs = 2000
    num_batches = 2
    moving_loss = 0.
    learning_rate = 0.1
    global_step = 0

    sw = SummaryWriter(logdir='./logs', flush_secs=5)
    # try/finally closes the writer even when training is interrupted.
    try:
        for e in range(epochs):
            if (e + 1) % 50 == 0:
                learning_rate = learning_rate / 2.0
            h = nd.zeros(shape=(batch_size, num_hidden))
            c = nd.zeros(shape=(batch_size, num_hidden))

            for i in range(num_batches):
                data_one_hot = train_data[i]
                label_one_hot = train_label[i]
                data_one_hot.attach_grad()
                label_one_hot.attach_grad()
                with autograd.record():
                    outputs, h, c = lstm.lstm_rnn(inputs=data_one_hot, h=h, c=c)
                    loss = lstm.average_ce_loss(outputs, label_one_hot)
                    # Sync the mean loss to the host once and reuse it below.
                    batch_loss = loss.mean().asscalar()
                    sw.add_scalar(tag='cross_entropy', value=batch_loss,
                                  global_step=global_step)
                    loss.backward()
                global_step = global_step + 1
                lstm.SGD(learning_rate)
                # NOTE: a former `learning_rate % 20 == 0` branch was dropped.
                # For a rate in (0, 20) it is never true, and for 0 it scales
                # 0 by 0.1, so it could never change the value.

                if i == 0 and e == 0:
                    moving_loss = batch_loss
                else:
                    moving_loss = .99 * moving_loss + .01 * batch_loss

            sw.add_scalar(tag='Loss', value=moving_loss, global_step=e)
            print("Epoch %s. Loss: %s" % (e, moving_loss))
            print(sample("1 2 3 ", 10, h, c, temperature=.1))
            print(sample("This eBook is for the use of anyone ", 10, h, c, temperature=.1))
    finally:
        sw.close()
class HistoryKeeper():
    """Record selected per-epoch values to a SummaryWriter and a ``;``-separated CSV.

    Parameters
    ----------
    logdir : str
        Base directory. Events go to ``<logdir>/tb`` and the CSV to
        ``<logdir>/history.csv`` (truncated and given a header on creation).
    keys : list or tuple of str
        Names of the ``params`` entries to record, in column order.

    Raises
    ------
    ValueError
        If ``keys`` is neither a list nor a tuple.
    """

    def __init__(self, logdir, keys=('val_acc', 'val_loss')):
        if not isinstance(keys, (list, tuple)):
            raise ValueError("Keys should be a list or a tuple.")
        # Copy so the instance never shares a list with the caller or the default.
        self.keys = list(keys)
        self.sw = SummaryWriter(logdir=os.path.join(logdir, 'tb'))
        self.csv_path = os.path.join(logdir, 'history.csv')
        with open(self.csv_path, 'w') as f:
            f.write(";".join(self.keys) + "\n")

    # Return True to interrupt training
    def __call__(self, model, params):
        """Log the tracked keys found in ``params`` for ``params['epoch']``.

        Each key present is written as a scalar and as a CSV cell. A key that
        is absent produces an empty cell so that every row stays aligned with
        the header. ``model`` is not used. Always returns ``None``.
        """
        epoch = params['epoch']
        row = []
        for key in self.keys:
            if key in params:
                value = params[key]
                self.sw.add_scalar(key, value, epoch)
                row.append(str(value))
            else:
                # Keep the column position; otherwise later values shift left.
                row.append("")
        with open(self.csv_path, 'a') as f:
            f.write(";".join(row) + "\n")
def do_training(args, module, data_train, data_val, begin_epoch=0):
    """Run the train/validation loop, logging CER and loss through a SummaryWriter.

    All hyper-parameters are read from ``args.config``. In bucketing ``load``
    mode a ``STTBucketingModule`` is created from ``module`` (used as the
    symbol generator) and restored from a checkpoint; otherwise ``module`` is
    bound directly. Each epoch trains on ``data_train``, evaluates on
    ``data_val``, logs metrics, and saves checkpoints at the configured
    batch and epoch intervals.

    Parameters
    ----------
    args : object
        Must expose ``config`` with ``get``/``getint``/``getfloat``/
        ``getboolean`` accessors.
    module : module or symbol generator
        The model to train (see above for bucketing ``load`` mode).
    data_train, data_val : data iterators
        Training and validation data; ``data_train`` must provide
        ``provide_data``, ``provide_label`` and ``reset()``.
    begin_epoch : int
        Epoch index to start from; parameters are initialised only when it
        is 0 and the mode is ``train``.
    """
    from log_util import LogUtil

    log = LogUtil().getlogger()

    # Create the checkpoint directory, including parents, if it is missing.
    # os.makedirs replaces distutils' mkpath, which newer Pythons no longer ship.
    checkpoint_dir = os.path.dirname(get_checkpoint_path(args))
    if checkpoint_dir and not os.path.isdir(checkpoint_dir):
        os.makedirs(checkpoint_dir)

    batch_size = args.config.getint('common', 'batch_size')
    save_checkpoint_every_n_epoch = args.config.getint(
        'common', 'save_checkpoint_every_n_epoch')
    save_checkpoint_every_n_batch = args.config.getint(
        'common', 'save_checkpoint_every_n_batch')
    enable_logging_train_metric = args.config.getboolean(
        'train', 'enable_logging_train_metric')
    enable_logging_validation_metric = args.config.getboolean(
        'train', 'enable_logging_validation_metric')

    contexts = parse_contexts(args)
    num_gpu = len(contexts)
    eval_metric = STTMetric(batch_size=batch_size, num_gpu=num_gpu,
                            is_logging=enable_logging_validation_metric,
                            is_epoch_end=True)
    # Metric accumulated during training and reported to mxboard per epoch.
    loss_metric = STTMetric(batch_size=batch_size, num_gpu=num_gpu,
                            is_logging=enable_logging_train_metric,
                            is_epoch_end=False)

    optimizer = args.config.get('optimizer', 'optimizer')
    learning_rate = args.config.getfloat('train', 'learning_rate')
    learning_rate_annealing = args.config.getfloat('train', 'learning_rate_annealing')

    mode = args.config.get('common', 'mode')
    num_epoch = args.config.getint('train', 'num_epoch')
    clip_gradient = args.config.getfloat('optimizer', 'clip_gradient')
    weight_decay = args.config.getfloat('optimizer', 'weight_decay')
    save_optimizer_states = args.config.getboolean('train', 'save_optimizer_states')
    show_every = args.config.getint('train', 'show_every')
    optimizer_params_dictionary = json.loads(
        args.config.get('optimizer', 'optimizer_params_dictionary'))
    kvstore_option = args.config.get('common', 'kvstore_option')
    n_epoch = begin_epoch
    is_bucketing = args.config.getboolean('arch', 'is_bucketing')

    # A configured value of 0 means "no gradient clipping".
    if clip_gradient == 0:
        clip_gradient = None

    if is_bucketing and mode == 'load':
        model_file = args.config.get('common', 'model_file')
        model_name = os.path.splitext(model_file)[0]
        # The last four characters of the name are the epoch number; the
        # character before them is a separator dropped from the prefix.
        model_num_epoch = int(model_name[-4:])

        model_path = 'checkpoints/' + str(model_name[:-5])
        symbol, data_names, label_names = module(1600)
        model = STTBucketingModule(
            sym_gen=module,
            default_bucket_key=data_train.default_bucket_key,
            context=contexts)
        data_train.reset()

        model.bind(data_shapes=data_train.provide_data,
                   label_shapes=data_train.provide_label,
                   for_training=True)
        _, arg_params, aux_params = mx.model.load_checkpoint(
            model_path, model_num_epoch)
        model.set_params(arg_params, aux_params)
        module = model
    else:
        module.bind(data_shapes=data_train.provide_data,
                    label_shapes=data_train.provide_label,
                    for_training=True)

    if begin_epoch == 0 and mode == 'train':
        module.init_params(initializer=get_initializer(args))

    lr_scheduler = SimpleLRScheduler(learning_rate=learning_rate)

    def reset_optimizer(force_init=False):
        """(Re)initialise the optimizer; config-supplied params override the defaults."""
        optimizer_params = {
            'lr_scheduler': lr_scheduler,
            'clip_gradient': clip_gradient,
            'wd': weight_decay
        }
        optimizer_params.update(optimizer_params_dictionary)
        module.init_optimizer(kvstore=kvstore_option,
                              optimizer=optimizer,
                              optimizer_params=optimizer_params,
                              force_init=force_init)

    reset_optimizer(force_init=(mode == "train"))
    data_train.reset()
    data_train.is_first_epoch = True

    # mxboard setting
    mxlog_dir = args.config.get('common', 'mxboard_log_dir')
    summary_writer = SummaryWriter(mxlog_dir)

    # try/finally closes the event file even if training raises.
    try:
        while n_epoch < num_epoch:
            loss_metric.reset()
            log.info('---------train---------')
            for nbatch, data_batch in enumerate(data_train):
                module.forward_backward(data_batch)
                module.update()
                # The training metric is only updated every `show_every` batches.
                if (nbatch + 1) % show_every == 0:
                    module.update_metric(loss_metric, data_batch.label)
                if (nbatch + 1) % save_checkpoint_every_n_batch == 0:
                    log.info('Epoch[%d] Batch[%d] SAVE CHECKPOINT', n_epoch, nbatch)
                    module.save_checkpoint(
                        prefix=get_checkpoint_path(args) + "n_epoch" + str(n_epoch) + "n_batch",
                        epoch=(int((nbatch + 1) / save_checkpoint_every_n_batch) - 1),
                        save_optimizer_states=save_optimizer_states)

            log.info('---------validation---------')
            data_val.reset()
            eval_metric.reset()
            for nbatch, data_batch in enumerate(data_val):
                # is_train=True is deliberate: with is_train=False batch norm
                # leads to a high CER.
                module.forward(data_batch, is_train=True)
                module.update_metric(eval_metric, data_batch.label)

            # mxboard setting
            val_cer, val_n_label, val_l_dist, _ = eval_metric.get_name_value()
            log.info("Epoch[%d] val cer=%f (%d / %d)", n_epoch, val_cer,
                     int(val_n_label - val_l_dist), val_n_label)
            curr_acc = val_cer
            summary_writer.add_scalar('CER validation', val_cer, n_epoch)
            assert curr_acc is not None, 'cannot find Acc_exclude_padding in eval metric'

            data_train.reset()
            data_train.is_first_epoch = False

            # mxboard setting
            train_cer, train_n_label, train_l_dist, train_ctc_loss = \
                loss_metric.get_name_value()
            summary_writer.add_scalar('loss epoch', train_ctc_loss, n_epoch)
            summary_writer.add_scalar('CER train', train_cer, n_epoch)

            # save checkpoints
            if n_epoch % save_checkpoint_every_n_epoch == 0:
                log.info('Epoch[%d] SAVE CHECKPOINT', n_epoch)
                module.save_checkpoint(prefix=get_checkpoint_path(args),
                                       epoch=n_epoch,
                                       save_optimizer_states=save_optimizer_states)

            n_epoch += 1

            # NOTE(review): this always divides the *initial* learning rate, so
            # the rate is the same after every epoch rather than compounding.
            # Confirm whether per-epoch annealing was intended.
            lr_scheduler.learning_rate = learning_rate / learning_rate_annealing
    finally:
        summary_writer.close()

    log.info('FINISH')
def run(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225], offset_alloc_size=(64, 64), anchors={"shallow": [(10, 13), (16, 30), (33, 23)], "middle": [(30, 61), (62, 45), (59, 119)], "deep": [(116, 90), (156, 198), (373, 326)]}, graphviz=False, epoch=100, input_size=[416, 416], batch_log=100, batch_size=16, batch_interval=10, subdivision=4, train_dataset_path="Dataset/train", valid_dataset_path="Dataset/valid", multiscale=False, factor_scale=[13, 5], ignore_threshold=0.5, dynamic=False, data_augmentation=True, num_workers=4, optimizer="ADAM", save_period=5, load_period=10, learning_rate=0.001, decay_lr=0.999, decay_step=10, GPU_COUNT=0, Darknetlayer=53, pretrained_base=True, pretrained_path="modelparam", AMP=True, valid_size=8, eval_period=5, tensorboard=True, valid_graph_path="valid_Graph", using_mlflow=True, multiperclass=True, nms_thresh=0.5, nms_topk=500, iou_thresh=0.5, except_class_thresh=0.05, plot_class_thresh=0.5): if GPU_COUNT == 0: ctx = mx.cpu(0) AMP = False elif GPU_COUNT == 1: ctx = mx.gpu(0) else: ctx = [mx.gpu(i) for i in range(GPU_COUNT)] # 운영체제 확인 if platform.system() == "Linux": logging.info(f"{platform.system()} OS") elif platform.system() == "Windows": logging.info(f"{platform.system()} OS") else: logging.info(f"{platform.system()} OS") if isinstance(ctx, (list, tuple)): for i, c in enumerate(ctx): free_memory, total_memory = mx.context.gpu_memory_info(i) free_memory = round(free_memory / (1024 * 1024 * 1024), 2) total_memory = round(total_memory / (1024 * 1024 * 1024), 2) logging.info(f'Running on {c} / free memory : {free_memory}GB / total memory {total_memory}GB') else: if GPU_COUNT == 1: free_memory, total_memory = mx.context.gpu_memory_info(0) free_memory = round(free_memory / (1024 * 1024 * 1024), 2) total_memory = round(total_memory / (1024 * 1024 * 1024), 2) logging.info(f'Running on {ctx} / free memory : {free_memory}GB / total memory {total_memory}GB') else: logging.info(f'Running on {ctx}') # 입력 사이즈를 32의 배수로 지정해 버리기 - stride가 
일그러지는 것을 막기 위함 if input_size[0] % 32 != 0 and input_size[1] % 32 != 0: logging.info("The input size must be a multiple of 32") exit(0) if GPU_COUNT > 0 and batch_size < GPU_COUNT: logging.info("batch size must be greater than gpu number") exit(0) if AMP: amp.init() if multiscale: logging.info("Using MultiScale") if data_augmentation: logging.info("Using Data Augmentation") logging.info("training YoloV3 Detector") input_shape = (1, 3) + tuple(input_size) try: net = Yolov3(Darknetlayer=Darknetlayer, anchors=anchors, pretrained=False, ctx=mx.cpu()) train_dataloader, train_dataset = traindataloader(multiscale=multiscale, factor_scale=factor_scale, augmentation=data_augmentation, path=train_dataset_path, input_size=input_size, batch_size=batch_size, batch_interval=batch_interval, num_workers=num_workers, shuffle=True, mean=mean, std=std, net=net, ignore_threshold=ignore_threshold, dynamic=dynamic, from_sigmoid=False, make_target=True) valid_dataloader, valid_dataset = validdataloader(path=valid_dataset_path, input_size=input_size, batch_size=valid_size, num_workers=num_workers, shuffle=True, mean=mean, std=std, net=net, ignore_threshold=ignore_threshold, dynamic=dynamic, from_sigmoid=False, make_target=True) except Exception: logging.info("dataset 없음") exit(0) train_update_number_per_epoch = len(train_dataloader) if train_update_number_per_epoch < 1: logging.warning("train batch size가 데이터 수보다 큼") exit(0) valid_list = glob.glob(os.path.join(valid_dataset_path, "*")) if valid_list: valid_update_number_per_epoch = len(valid_dataloader) if valid_update_number_per_epoch < 1: logging.warning("valid batch size가 데이터 수보다 큼") exit(0) num_classes = train_dataset.num_class # 클래스 수 name_classes = train_dataset.classes optimizer = optimizer.upper() if pretrained_base: model = str(input_size[0]) + "_" + str(input_size[1]) + "_" + optimizer + "_P" + "Dark_" + str(Darknetlayer) else: model = str(input_size[0]) + "_" + str(input_size[1]) + "_" + optimizer + "_Dark_" + str(Darknetlayer) 
weight_path = f"weights/{model}" sym_path = os.path.join(weight_path, f'{model}-symbol.json') param_path = os.path.join(weight_path, f'{model}-{load_period:04d}.params') if os.path.exists(param_path) and os.path.exists(sym_path): start_epoch = load_period logging.info(f"loading {os.path.basename(param_path)} weights\n") net = gluon.SymbolBlock.imports(sym_path, ['data'], param_path, ctx=ctx) else: start_epoch = 0 ''' mxnet c++에서 arbitrary input image 를 받기 위한 전략 alloc_size : tuple of int, default is (128, 128) For advanced users. Define `alloc_size` to generate large enough offset maps, which will later saved in parameters. During inference, we support arbitrary input image by cropping corresponding area of the anchor map. This allow us to export to symbol so we can run it in c++, Scalar, etc. ''' net = Yolov3(Darknetlayer=Darknetlayer, input_size=input_size, anchors=anchors, num_classes=num_classes, # foreground만 pretrained=pretrained_base, pretrained_path=pretrained_path, alloc_size=offset_alloc_size, ctx=ctx) if isinstance(ctx, (list, tuple)): net.summary(mx.nd.ones(shape=input_shape, ctx=ctx[0])) else: net.summary(mx.nd.ones(shape=input_shape, ctx=ctx)) ''' active (bool, default True) – Whether to turn hybrid on or off. static_alloc (bool, default False) – Statically allocate memory to improve speed. Memory usage may increase. static_shape (bool, default False) – Optimize for invariant input shapes between iterations. Must also set static_alloc to True. Change of input shapes is still allowed but slower. 
''' if multiscale: net.hybridize(active=True, static_alloc=True, static_shape=False) else: net.hybridize(active=True, static_alloc=True, static_shape=True) if start_epoch + 1 >= epoch + 1: logging.info("this model has already been optimized") exit(0) if tensorboard: summary = SummaryWriter(logdir=os.path.join("mxboard", model), max_queue=10, flush_secs=10, verbose=False) if isinstance(ctx, (list, tuple)): net.forward(mx.nd.ones(shape=input_shape, ctx=ctx[0])) else: net.forward(mx.nd.ones(shape=input_shape, ctx=ctx)) summary.add_graph(net) if graphviz: gluoncv.utils.viz.plot_network(net, shape=input_shape, save_prefix=model) # optimizer unit = 1 if (len(train_dataset) // batch_size) < 1 else len(train_dataset) // batch_size step = unit * decay_step lr_sch = mx.lr_scheduler.FactorScheduler(step=step, factor=decay_lr, stop_factor_lr=1e-12, base_lr=learning_rate) for p in net.collect_params().values(): if p.grad_req != "null": p.grad_req = 'add' if AMP: ''' update_on_kvstore : bool, default None Whether to perform parameter updates on kvstore. If None, then trainer will choose the more suitable option depending on the type of kvstore. If the `update_on_kvstore` argument is provided, environment variable `MXNET_UPDATE_ON_KVSTORE` will be ignored. 
''' if optimizer.upper() == "ADAM": trainer = gluon.Trainer(net.collect_params(), optimizer, optimizer_params={"learning_rate": learning_rate, "lr_scheduler": lr_sch, "beta1": 0.9, "beta2": 0.999, 'multi_precision': False}, update_on_kvstore=False) # for Dynamic loss scaling elif optimizer.upper() == "RMSPROP": trainer = gluon.Trainer(net.collect_params(), optimizer, optimizer_params={"learning_rate": learning_rate, "lr_scheduler": lr_sch, "gamma1": 0.9, "gamma2": 0.999, 'multi_precision': False}, update_on_kvstore=False) # for Dynamic loss scaling elif optimizer.upper() == "SGD": trainer = gluon.Trainer(net.collect_params(), optimizer, optimizer_params={"learning_rate": learning_rate, "lr_scheduler": lr_sch, "wd": 0.0005, "momentum": 0.9, 'multi_precision': False}, update_on_kvstore=False) # for Dynamic loss scaling else: logging.error("optimizer not selected") exit(0) amp.init_trainer(trainer) else: if optimizer.upper() == "ADAM": trainer = gluon.Trainer(net.collect_params(), optimizer, optimizer_params={"learning_rate": learning_rate, "lr_scheduler": lr_sch, "beta1": 0.9, "beta2": 0.999, 'multi_precision': False}) elif optimizer.upper() == "RMSPROP": trainer = gluon.Trainer(net.collect_params(), optimizer, optimizer_params={"learning_rate": learning_rate, "lr_scheduler": lr_sch, "gamma1": 0.9, "gamma2": 0.999, 'multi_precision': False}) elif optimizer.upper() == "SGD": trainer = gluon.Trainer(net.collect_params(), optimizer, optimizer_params={"learning_rate": learning_rate, "lr_scheduler": lr_sch, "wd": 0.0005, "momentum": 0.9, 'multi_precision': False}) else: logging.error("optimizer not selected") exit(0) loss = Yolov3Loss(sparse_label=True, from_sigmoid=False, batch_axis=None, num_classes=num_classes, reduction="sum", exclude=False) prediction = Prediction( from_sigmoid=False, num_classes=num_classes, nms_thresh=nms_thresh, nms_topk=nms_topk, except_class_thresh=except_class_thresh, multiperclass=multiperclass) precision_recall = 
Voc_2007_AP(iou_thresh=iou_thresh, class_names=name_classes) start_time = time.time() for i in tqdm(range(start_epoch + 1, epoch + 1, 1), initial=start_epoch + 1, total=epoch): xcyc_loss_sum = 0 wh_loss_sum = 0 object_loss_sum = 0 class_loss_sum = 0 time_stamp = time.time() for batch_count, (image, _, xcyc_all, wh_all, objectness_all, class_all, weights_all, _) in enumerate( train_dataloader, start=1): td_batch_size = image.shape[0] image = mx.nd.split(data=image, num_outputs=subdivision, axis=0) xcyc_all = mx.nd.split(data=xcyc_all, num_outputs=subdivision, axis=0) wh_all = mx.nd.split(data=wh_all, num_outputs=subdivision, axis=0) objectness_all = mx.nd.split(data=objectness_all, num_outputs=subdivision, axis=0) class_all = mx.nd.split(data=class_all, num_outputs=subdivision, axis=0) weights_all = mx.nd.split(data=weights_all, num_outputs=subdivision, axis=0) if subdivision == 1: image = [image] xcyc_all = [xcyc_all] wh_all = [wh_all] objectness_all = [objectness_all] class_all = [class_all] weights_all = [weights_all] ''' autograd 설명 https://mxnet.apache.org/api/python/docs/tutorials/getting-started/crash-course/3-autograd.html ''' with autograd.record(train_mode=True): xcyc_all_losses = [] wh_all_losses = [] object_all_losses = [] class_all_losses = [] for image_split, xcyc_split, wh_split, objectness_split, class_split, weights_split in zip(image, xcyc_all, wh_all, objectness_all, class_all, weights_all): if GPU_COUNT <= 1: image_split = gluon.utils.split_and_load(image_split, [ctx], even_split=False) xcyc_split = gluon.utils.split_and_load(xcyc_split, [ctx], even_split=False) wh_split = gluon.utils.split_and_load(wh_split, [ctx], even_split=False) objectness_split = gluon.utils.split_and_load(objectness_split, [ctx], even_split=False) class_split = gluon.utils.split_and_load(class_split, [ctx], even_split=False) weights_split = gluon.utils.split_and_load(weights_split, [ctx], even_split=False) else: image_split = gluon.utils.split_and_load(image_split, ctx, 
even_split=False) xcyc_split = gluon.utils.split_and_load(xcyc_split, ctx, even_split=False) wh_split = gluon.utils.split_and_load(wh_split, ctx, even_split=False) objectness_split = gluon.utils.split_and_load(objectness_split, ctx, even_split=False) class_split = gluon.utils.split_and_load(class_split, ctx, even_split=False) weights_split = gluon.utils.split_and_load(weights_split, ctx, even_split=False) xcyc_losses = [] wh_losses = [] object_losses = [] class_losses = [] total_loss = [] # gpu N 개를 대비한 코드 (Data Parallelism) for img, xcyc_target, wh_target, objectness, class_target, weights in zip(image_split, xcyc_split, wh_split, objectness_split, class_split, weights_split): output1, output2, output3, anchor1, anchor2, anchor3, offset1, offset2, offset3, stride1, stride2, stride3 = net( img) xcyc_loss, wh_loss, object_loss, class_loss = loss(output1, output2, output3, xcyc_target, wh_target, objectness, class_target, weights) xcyc_losses.append(xcyc_loss.asscalar()) wh_losses.append(wh_loss.asscalar()) object_losses.append(object_loss.asscalar()) class_losses.append(class_loss.asscalar()) total_loss.append(xcyc_loss + wh_loss + object_loss + class_loss) if AMP: with amp.scale_loss(total_loss, trainer) as scaled_loss: autograd.backward(scaled_loss) else: autograd.backward(total_loss) xcyc_all_losses.append(sum(xcyc_losses)) wh_all_losses.append(sum(wh_losses)) object_all_losses.append(sum(object_losses)) class_all_losses.append(sum(class_losses)) trainer.step(batch_size=td_batch_size, ignore_stale_grad=False) # 비우기 for p in net.collect_params().values(): p.zero_grad() xcyc_loss_sum += sum(xcyc_all_losses) / td_batch_size wh_loss_sum += sum(wh_all_losses) / td_batch_size object_loss_sum += sum(object_all_losses) / td_batch_size class_loss_sum += sum(class_all_losses) / td_batch_size if batch_count % batch_log == 0: logging.info(f'[Epoch {i}][Batch {batch_count}/{train_update_number_per_epoch}],' f'[Speed {td_batch_size / (time.time() - time_stamp):.3f} 
samples/sec],' f'[Lr = {trainer.learning_rate}]' f'[xcyc loss = {sum(xcyc_all_losses) / td_batch_size:.3f}]' f'[wh loss = {sum(wh_all_losses) / td_batch_size:.3f}]' f'[obj loss = {sum(object_all_losses) / td_batch_size:.3f}]' f'[class loss = {sum(class_all_losses) / td_batch_size:.3f}]') time_stamp = time.time() train_xcyc_loss_mean = np.divide(xcyc_loss_sum, train_update_number_per_epoch) train_wh_loss_mean = np.divide(wh_loss_sum, train_update_number_per_epoch) train_object_loss_mean = np.divide(object_loss_sum, train_update_number_per_epoch) train_class_loss_mean = np.divide(class_loss_sum, train_update_number_per_epoch) train_total_loss_mean = train_xcyc_loss_mean + train_wh_loss_mean + train_object_loss_mean + train_class_loss_mean logging.info( f"train xcyc loss : {train_xcyc_loss_mean} / " f"train wh loss : {train_wh_loss_mean} / " f"train object loss : {train_object_loss_mean} / " f"train class loss : {train_class_loss_mean} / " f"train total loss : {train_total_loss_mean}" ) if i % eval_period == 0 and valid_list: xcyc_loss_sum = 0 wh_loss_sum = 0 object_loss_sum = 0 class_loss_sum = 0 # loss 구하기 for image, label, xcyc_all, wh_all, objectness_all, class_all, weights_all, _ in valid_dataloader: vd_batch_size, _, height, width = image.shape if GPU_COUNT <= 1: image = gluon.utils.split_and_load(image, [ctx], even_split=False) label = gluon.utils.split_and_load(label, [ctx], even_split=False) xcyc_all = gluon.utils.split_and_load(xcyc_all, [ctx], even_split=False) wh_all = gluon.utils.split_and_load(wh_all, [ctx], even_split=False) objectness_all = gluon.utils.split_and_load(objectness_all, [ctx], even_split=False) class_all = gluon.utils.split_and_load(class_all, [ctx], even_split=False) weights_all = gluon.utils.split_and_load(weights_all, [ctx], even_split=False) else: image = gluon.utils.split_and_load(image, ctx, even_split=False) label = gluon.utils.split_and_load(label, ctx, even_split=False) xcyc_all = gluon.utils.split_and_load(xcyc_all, ctx, 
even_split=False) wh_all = gluon.utils.split_and_load(wh_all, ctx, even_split=False) objectness_all = gluon.utils.split_and_load(objectness_all, ctx, even_split=False) class_all = gluon.utils.split_and_load(class_all, ctx, even_split=False) weights_all = gluon.utils.split_and_load(weights_all, ctx, even_split=False) xcyc_losses = [] wh_losses = [] object_losses = [] class_losses = [] total_loss = [] # gpu N 개를 대비한 코드 (Data Parallelism) for img, lb, xcyc_target, wh_target, objectness, class_target, weights in zip(image, label, xcyc_all, wh_all, objectness_all, class_all, weights_all): gt_box = lb[:, :, :4] gt_id = lb[:, :, 4:5] output1, output2, output3, anchor1, anchor2, anchor3, offset1, offset2, offset3, stride1, stride2, stride3 = net( img) id, score, bbox = prediction(output1, output2, output3, anchor1, anchor2, anchor3, offset1, offset2, offset3, stride1, stride2, stride3) precision_recall.update(pred_bboxes=bbox, pred_labels=id, pred_scores=score, gt_boxes=gt_box, gt_labels=gt_id) xcyc_loss, wh_loss, object_loss, class_loss = loss(output1, output2, output3, xcyc_target, wh_target, objectness, class_target, weights) xcyc_losses.append(xcyc_loss.asscalar()) wh_losses.append(wh_loss.asscalar()) object_losses.append(object_loss.asscalar()) class_losses.append(class_loss.asscalar()) total_loss.append(xcyc_losses + wh_losses + object_losses + class_losses) xcyc_loss_sum += sum(xcyc_losses) / vd_batch_size wh_loss_sum += sum(wh_losses) / vd_batch_size object_loss_sum += sum(object_losses) / vd_batch_size class_loss_sum += sum(class_losses) / vd_batch_size valid_xcyc_loss_mean = np.divide(xcyc_loss_sum, valid_update_number_per_epoch) valid_wh_loss_mean = np.divide(wh_loss_sum, valid_update_number_per_epoch) valid_object_loss_mean = np.divide(object_loss_sum, valid_update_number_per_epoch) valid_class_loss_mean = np.divide(class_loss_sum, valid_update_number_per_epoch) valid_total_loss_mean = valid_xcyc_loss_mean + valid_wh_loss_mean + valid_object_loss_mean + 
valid_class_loss_mean logging.info( f"valid xcyc loss : {valid_xcyc_loss_mean} / " f"valid wh loss : {valid_wh_loss_mean} / " f"valid object loss : {valid_object_loss_mean} / " f"valid class loss : {valid_class_loss_mean} / " f"valid total loss : {valid_total_loss_mean}" ) AP_appender = [] round_position = 2 class_name, precision, recall, true_positive, false_positive, threshold = precision_recall.get_PR_list() for j, c, p, r in zip(range(len(recall)), class_name, precision, recall): name, AP = precision_recall.get_AP(c, p, r) logging.info(f"class {j}'s {name} AP : {round(AP * 100, round_position)}%") AP_appender.append(AP) mAP_result = np.mean(AP_appender) logging.info(f"mAP : {round(mAP_result * 100, round_position)}%") precision_recall.get_PR_curve(name=class_name, precision=precision, recall=recall, threshold=threshold, AP=AP_appender, mAP=mAP_result, folder_name=valid_graph_path, epoch=i) precision_recall.reset() if tensorboard: # gpu N 개를 대비한 코드 (Data Parallelism) dataloader_iter = iter(valid_dataloader) image, label, _, _, _, _, _, _ = next(dataloader_iter) if GPU_COUNT <= 1: image = gluon.utils.split_and_load(image, [ctx], even_split=False) label = gluon.utils.split_and_load(label, [ctx], even_split=False) else: image = gluon.utils.split_and_load(image, ctx, even_split=False) label = gluon.utils.split_and_load(label, ctx, even_split=False) ground_truth_colors = {} for k in range(num_classes): ground_truth_colors[k] = (0, 0, 1) batch_image = [] for img, lb in zip(image, label): gt_boxes = lb[:, :, :4] gt_ids = lb[:, :, 4:5] output1, output2, output3, anchor1, anchor2, anchor3, offset1, offset2, offset3, stride1, stride2, stride3 = net( img) ids, scores, bboxes = prediction(output1, output2, output3, anchor1, anchor2, anchor3, offset1, offset2, offset3, stride1, stride2, stride3) for ig, gt_id, gt_box, id, score, bbox in zip(img, gt_ids, gt_boxes, ids, scores, bboxes): ig = ig.transpose( (1, 2, 0)) * mx.nd.array(std, ctx=ig.context) + mx.nd.array(mean, 
ctx=ig.context) ig = (ig * 255).clip(0, 255) # ground truth box 그리기 ground_truth = plot_bbox(ig, gt_box, scores=None, labels=gt_id, thresh=None, reverse_rgb=True, class_names=valid_dataset.classes, absolute_coordinates=True, colors=ground_truth_colors) # prediction box 그리기 prediction_box = plot_bbox(ground_truth, bbox, scores=score, labels=id, thresh=plot_class_thresh, reverse_rgb=False, class_names=valid_dataset.classes, absolute_coordinates=True) # Tensorboard에 그리기 위해 BGR -> RGB / (height, width, channel) -> (channel, height, width) 를한다. prediction_box = cv2.cvtColor(prediction_box, cv2.COLOR_BGR2RGB) prediction_box = np.transpose(prediction_box, axes=(2, 0, 1)) batch_image.append(prediction_box) # (batch, channel, height, width) summary.add_image(tag="valid_result", image=np.array(batch_image), global_step=i) summary.add_scalar(tag="xy_loss", value={"train_xcyc_loss": train_xcyc_loss_mean, "valid_xcyc_loss": valid_xcyc_loss_mean}, global_step=i) summary.add_scalar(tag="wh_loss", value={"train_wh_loss": train_wh_loss_mean, "valid_wh_loss": valid_wh_loss_mean}, global_step=i) summary.add_scalar(tag="object_loss", value={"train_object_loss": train_object_loss_mean, "valid_object_loss": valid_object_loss_mean}, global_step=i) summary.add_scalar(tag="class_loss", value={"train_class_loss": train_class_loss_mean, "valid_class_loss": valid_class_loss_mean}, global_step=i) summary.add_scalar(tag="total_loss", value={ "train_total_loss": train_total_loss_mean, "valid_total_loss": valid_total_loss_mean}, global_step=i) params = net.collect_params().values() if GPU_COUNT > 1: for c in ctx: for p in params: summary.add_histogram(tag=p.name, values=p.data(ctx=c), global_step=i, bins='default') else: for p in params: summary.add_histogram(tag=p.name, values=p.data(), global_step=i, bins='default') if i % save_period == 0: weight_epoch_path = os.path.join(weight_path, str(i)) if not os.path.exists(weight_epoch_path): os.makedirs(weight_epoch_path) ''' Hybrid models can be 
serialized as JSON files using the export function Export HybridBlock to json format that can be loaded by SymbolBlock.imports, mxnet.mod.Module or the C++ interface. When there are only one input, it will have name data. When there Are more than one inputs, they will be named as data0, data1, etc. ''' if GPU_COUNT >= 1: context = mx.gpu(0) else: context = mx.cpu(0) postnet = PostNet(net=net, auxnet=prediction) try: net.export(os.path.join(weight_path, f"{model}"), epoch=i, remove_amp_cast=True) # for onnx net.save_parameters(os.path.join(weight_path, f"{i}.params")) # onnx 추출용 # network inference, decoder, nms까지 처리됨 - mxnet c++에서 편리함 / onnx로는 추출 못함. export_block_for_cplusplus(path=os.path.join(weight_epoch_path, f"{model}_prepost"), block=postnet, data_shape=tuple(input_size) + tuple((3,)), epoch=i, preprocess=True, # c++ 에서 inference시 opencv에서 읽은 이미지 그대로 넣으면 됨 layout='HWC', ctx=context, remove_amp_cast=True) except Exception as E: logging.error(f"json, param model export 예외 발생 : {E}") else: logging.info("json, param model export 성공") net.collect_params().reset_ctx(ctx) end_time = time.time() learning_time = end_time - start_time logging.info(f"learning time : 약, {learning_time / 3600:0.2f}H") logging.info("optimization completed") if using_mlflow: ml.log_metric("learning time", round(learning_time / 3600, 2))
def do_training(num_epoch, optimizer, kvstore, learning_rate, model_prefix, decay):
    """Perform CapsNet training.

    Trains the module-level ``module`` on ``train_iter``, evaluates it on
    ``val_iter`` after every epoch, writes the per-epoch metrics to an mxboard
    ``SummaryWriter`` (log directory ``args.tblog_dir``) and saves a
    checkpoint at the end of every epoch.

    Parameters
    ----------
    num_epoch
        Number of epochs to run; training stops once the epoch counter
        reaches this value.
    optimizer
        Forwarded to ``module.init_optimizer``.
    kvstore
        Forwarded to ``module.init_optimizer``.
    learning_rate
        Initial learning rate given to ``SimpleLRScheduler``. After epoch
        ``n`` (0-based) the rate is set to ``learning_rate * decay ** (n + 1)``.
    model_prefix
        Prefix forwarded to ``module.save_checkpoint``.
    decay
        Multiplicative learning-rate decay applied once per epoch.
    """
    summary_writer = SummaryWriter(args.tblog_dir)
    lr_scheduler = SimpleLRScheduler(learning_rate)
    optimizer_params = {'lr_scheduler': lr_scheduler}
    module.init_params()
    module.init_optimizer(kvstore=kvstore,
                          optimizer=optimizer,
                          optimizer_params=optimizer_params)
    n_epoch = 0
    try:
        while n_epoch < num_epoch:
            train_iter.reset()
            val_iter.reset()

            # Training pass.
            loss_metric.reset()
            for n_batch, data_batch in enumerate(train_iter):
                module.forward_backward(data_batch)
                module.update()
                module.update_metric(loss_metric, data_batch.label)
                loss_metric.get_batch_log(n_batch)
            train_acc, train_loss, train_recon_err = loss_metric.get_name_value()

            # Validation pass (forward only).
            loss_metric.reset()
            for n_batch, data_batch in enumerate(val_iter):
                module.forward(data_batch)
                module.update_metric(loss_metric, data_batch.label)
                loss_metric.get_batch_log(n_batch)
            val_acc, val_loss, val_recon_err = loss_metric.get_name_value()

            summary_writer.add_scalar('train_acc', train_acc, n_epoch)
            summary_writer.add_scalar('train_loss', train_loss, n_epoch)
            summary_writer.add_scalar('train_recon_err', train_recon_err, n_epoch)
            summary_writer.add_scalar('val_acc', val_acc, n_epoch)
            summary_writer.add_scalar('val_loss', val_loss, n_epoch)
            summary_writer.add_scalar('val_recon_err', val_recon_err, n_epoch)

            print('Epoch[%d] train acc: %.4f loss: %.6f recon_err: %.6f' %
                  (n_epoch, train_acc, train_loss, train_recon_err))
            print('Epoch[%d] val acc: %.4f loss: %.6f recon_err: %.6f' %
                  (n_epoch, val_acc, val_loss, val_recon_err))

            print('SAVE CHECKPOINT')
            module.save_checkpoint(prefix=model_prefix, epoch=n_epoch)

            n_epoch += 1
            lr_scheduler.learning_rate = learning_rate * (decay ** n_epoch)
    finally:
        # Close the writer even when training is interrupted so the events
        # written so far are flushed and the event file is released.
        summary_writer.close()
def train(epochs, ctx):
    """Train the module-level ``net`` and log the run with mxboard.

    Per batch the mean cross-entropy is logged; per epoch the first
    mini-batch of images, a gradient histogram for every parameter and the
    training/validation accuracy are logged. The network graph is logged
    once, after the first epoch. On completion all scalars are exported to
    ``scalar_dict.json`` and the writer is closed.

    Parameters
    ----------
    epochs
        Number of passes over ``train_data``.
    ctx
        Context on which the parameters are initialized and to which every
        batch is copied.
    """
    # Initialize every parameter of net and its children.
    net.initialize(mx.init.Xavier(magnitude=2.24), ctx=ctx)
    net.hybridize()

    # The trainer applies the gradient updates.
    sgd_settings = {'learning_rate': opt.lr, 'momentum': opt.momentum}
    trainer = gluon.Trainer(net.collect_params(), 'sgd', sgd_settings)
    metric = mx.metric.Accuracy()
    loss = gluon.loss.SoftmaxCrossEntropyLoss()

    # Parameter names double as the histogram tags of the gradients.
    params = net.collect_params()
    param_names = params.keys()

    # Summary writer that flushes its events to disk every 5 seconds.
    sw = SummaryWriter(logdir='./logs', flush_secs=5)

    global_step = 0
    for epoch in range(epochs):
        # Start every epoch with a clean metric.
        metric.reset()
        for batch_idx, (data, label) in enumerate(train_data):
            # Move the batch to ctx if it is not already there.
            data = data.as_in_context(ctx)
            label = label.as_in_context(ctx)

            # Record the forward pass so backward() can differentiate it.
            with autograd.record():
                output = net(data)
                batch_loss = loss(output, label)
            sw.add_scalar(tag='cross_entropy',
                          value=batch_loss.mean().asscalar(),
                          global_step=global_step)
            global_step += 1
            batch_loss.backward()

            # Gradient step normalized by the actual batch size.
            trainer.step(data.shape[0])
            metric.update([label], [output])

            if batch_idx > 0 and batch_idx % opt.log_interval == 0:
                name, train_acc = metric.get()
                print('[Epoch %d Batch %d] Training: %s=%f' % (epoch, batch_idx, name, train_acc))

            # Log the first batch of images of each epoch.
            if batch_idx == 0:
                first_images = data.reshape((opt.batch_size, 1, 28, 28))
                sw.add_image('minist_first_minibatch', first_images, epoch)

        if epoch == 0:
            sw.add_graph(net)

        # Gradient histograms, used to check convergence.
        grads = [param.grad() for param in net.collect_params().values()]
        assert len(grads) == len(param_names)
        for name, grad in zip(param_names, grads):
            sw.add_histogram(tag=name, values=grad, global_step=epoch, bins=1000)

        # Training accuracy of the finished epoch.
        name, train_acc = metric.get()
        print('[Epoch %d] Training: %s=%f' % (epoch, name, train_acc))
        sw.add_scalar(tag='accuracy_curves', value=('train_acc', train_acc), global_step=epoch)

        # Validation accuracy of the finished epoch.
        name, val_acc = test(ctx)
        print('[Epoch %d] Validation: %s=%f' % (epoch, name, val_acc))
        sw.add_scalar(tag='accuracy_curves', value=('valid_acc', val_acc), global_step=epoch)

    sw.export_scalars('scalar_dict.json')
    sw.close()
class ModuleLearner:
    """Trains and runs a hybridized Gluon model through ``mx.mod.Module``."""

    def __init__(self, model, run_id, gpu_idxs=None, tensorboard_logging=False):
        """
        Parameters
        ----------
        model: HybridBlock
        run_id:
            Name of the folder, below ``../logs/tensorboard`` relative to this
            file, that receives the TensorBoard event files.
        gpu_idxs: None or list of ints
            If None will set context to CPU.
            If list of ints, will set context to given GPUs.
        tensorboard_logging: bool
            When true, a mxboard ``SummaryWriter`` is created and used by ``fit``.
        """
        logging.info("Using Module Learner.")
        model.hybridize()
        logging.info("Hybridized model.")

        # Build the symbolic graph: data -> model -> softmax output.
        data_sym = mx.sym.var('data')
        softmax_sym = mx.sym.SoftmaxOutput(model(data_sym), name='softmax')
        self.module = mx.mod.Module(symbol=softmax_sym,
                                    context=get_context(gpu_idxs),
                                    data_names=['data'],
                                    label_names=['softmax_label'])

        self.tensorboard_logging = tensorboard_logging
        if self.tensorboard_logging:
            from mxboard import SummaryWriter
            here = os.path.dirname(os.path.realpath(__file__))
            log_dir = os.path.join(here, "..", "logs", "tensorboard", run_id)
            self.writer = SummaryWriter(logdir=log_dir)

    @staticmethod
    def _crosses_log_boundary(samples_processed, batch_size, log_frequency):
        """Return True if this batch contains a multiple of ``log_frequency`` samples."""
        before = samples_processed // log_frequency
        after = (samples_processed + batch_size) // log_frequency
        return before != after

    def fit(self,
            train_data,
            valid_data,
            epochs=300,
            lr=None,
            lr_schedule=None,
            initializer=mx.init.Xavier(),
            optimizer=None,
            kvstore='device',
            log_frequency=10000,
            early_stopping_criteria=None):
        """
        Uses accuracy as training and validation metric.

        Parameters
        ----------
        train_data : DataIter
            Contains training data
        valid_data : DataIter
            Contains validation data
        epochs: int
            Number of epochs to run, unless stopped early by early_stopping_criteria.
        lr: float or int
            Learning rate. Mutually exclusive with lr_schedule.
        lr_schedule : dict
            Contains change points of learning rate.
            Key is the epoch and value is the learning rate.
            Must contain epoch 0.
        initializer : mxnet.initializer.Initializer
        optimizer: mxnet.optimizer.Optimizer
            Defaults to be `mx.optimizer.SGD(learning_rate=lr_schedule[0], rescale_grad=1.0/batch_size, momentum=0.9)`
        kvstore : str, optional
        log_frequency : int, optional
            Number of samples between batch-speed logs.
        early_stopping_criteria: function (float -> boolean)
            Given validation accuracy, should return True if training should be stopped early.

        Returns
        -------
        None
            Output is logged to file.
        """
        if lr_schedule is None:
            assert lr is not None, "lr must be defined if not using lr_schedule"
            lr_schedule = {0: lr}
        else:
            assert lr is None, "lr should not be defined if using lr_schedule"
            assert 0 in lr_schedule, "lr for epoch 0 must be defined in lr_schedule"

        module = self.module
        batch_size = train_data.provide_data[0].shape[0]
        module.bind(data_shapes=train_data.provide_data,
                    label_shapes=train_data.provide_label)
        module.init_params(initializer=initializer)
        if optimizer is None:
            optimizer = mx.optimizer.SGD(learning_rate=lr_schedule[0],
                                         rescale_grad=1.0 / batch_size,
                                         momentum=0.9)
        module.init_optimizer(kvstore=kvstore, optimizer=optimizer)

        train_metric = mx.metric.create('acc')
        validation_metric = mx.metric.create('acc')
        best = {'val_acc': 0, 'trn_acc': 0, 'epoch': 0}

        for epoch in range(epochs):
            epoch_start = time.time()
            board_step = epoch + 1  # TensorBoard steps are 1-based

            # Apply the scheduled learning rate, if this epoch is a change point.
            if epoch in lr_schedule:
                module._optimizer.lr = lr_schedule[epoch]
                logging.info("Epoch {}, Changed learning rate.".format(epoch))
            logging.info('Epoch {}, Learning rate={}'.format(
                epoch, module._optimizer.lr))
            if self.tensorboard_logging:
                self.writer.add_scalar(tag='learning_rate',
                                       value=module._optimizer.lr,
                                       global_step=board_step)

            train_data.reset()
            train_metric.reset()
            samples_processed = 0
            for batch_idx, batch in enumerate(train_data):
                batch_start = time.time()
                module.forward(batch, is_train=True)            # predictions
                module.update_metric(train_metric, batch.label)  # accuracy
                module.backward()                               # gradients
                module.update()                                 # parameters

                # The first batch of every epoch is sent to TensorBoard.
                if self.tensorboard_logging and batch_idx == 0:
                    self.writer.add_image(tag="batch",
                                          image=batch.data[0],
                                          global_step=board_step)

                # Log the speed of the batch that crosses a log_frequency boundary.
                log_batch = self._crosses_log_boundary(
                    samples_processed, batch_size, log_frequency)
                if log_batch and batch_idx >= 1:
                    # Single-batch estimate, not averaged over several batches.
                    speed = batch_size / (time.time() - batch_start)
                    logging.info(
                        'Epoch {}, Batch {}, Speed={:.2f} images/second'.
                        format(epoch, batch_idx, speed))
                samples_processed += batch_size

            # Training accuracy.
            _, trn_acc = train_metric.get()
            logging.info('Epoch {}, Training accuracy={}'.format(epoch, trn_acc))
            if self.tensorboard_logging:
                self.writer.add_scalar(tag='accuracy/training',
                                       value=trn_acc * 100,
                                       global_step=board_step)

            # Validation accuracy.
            scores = module.score(valid_data, validation_metric)
            _, val_acc = scores[0]
            logging.info('Epoch {}, Validation accuracy={}'.format(epoch, val_acc))
            if self.tensorboard_logging:
                self.writer.add_scalar(tag='accuracy/validation',
                                       value=val_acc * 100,
                                       global_step=board_step)

            # Best validation accuracy seen so far.
            if val_acc > best['val_acc']:
                best = {'val_acc': val_acc, 'trn_acc': trn_acc, 'epoch': epoch}
            logging.info(("Epoch {}, Max validation accuracy={} @ "
                          "Epoch {} (with training accuracy={})").format(
                              epoch, best['val_acc'], best['epoch'],
                              best['trn_acc']))

            # Epoch duration.
            logging.info('Epoch {}, Duration={}'.format(
                epoch, time.time() - epoch_start))

            if early_stopping_criteria and early_stopping_criteria(val_acc):
                logging.info(
                    "Epoch {}, Reached early stopping target, stopping training."
                    .format(epoch))
                break

    def predict(self, test_data, log_frequency=10000):
        """Run inference over ``test_data`` and log per-batch latency and speed.

        Parameters
        ----------
        test_data : DataIter
            Contains the data to predict on.
        log_frequency : int, optional
            Number of samples between latency/speed logs. Nothing is logged
            for the first 5 batches (warm-up).
        """
        logging.info('Starting inference.')

        module = self.module
        batch_size = test_data.provide_data[0].shape[0]
        module.bind(data_shapes=test_data.provide_data,
                    label_shapes=test_data.provide_label)

        warm_up_period = 5
        samples_processed = 0
        batch_start = time.time()
        for pred, batch_idx, batch in module.iter_predict(test_data):
            # Block until the prediction is computed so the timing is real.
            pred[0].wait_to_read()
            batch_end = time.time()

            log_batch = self._crosses_log_boundary(
                samples_processed, batch_size, log_frequency)
            if log_batch and batch_idx >= warm_up_period:
                # Single-batch estimate, not averaged over several batches.
                latency = batch_end - batch_start  # seconds
                speed = batch_size / latency
                logging.info(
                    'Inference. Batch {}, Latency={:.5f} ms, Speed={:.2f} images/second'
                    .format(batch_idx, latency * 1000, speed))
            samples_processed += batch_size
            batch_start = time.time()

        logging.info('Completed inference.')
# Train the model: one optimizer step per batch, then per-epoch gradient
# histograms and a validation pass. `global_step` counts optimizer steps
# (1-based) and is the x-axis of the 'training_loss' scalar.
global_step = 1
for epoch in range(1, epochs + 1):

    for train_w, train_d, train_r, train_t in train_loader:

        start_time = time()

        with autograd.record():
            output = net([train_w, train_d, train_r])
            l = loss_function(output, train_t)
        l.backward()
        trainer.step(train_t.shape[0])
        training_loss = l.mean().asscalar()

        sw.add_scalar(tag='training_loss',
                      value=training_loss,
                      global_step=global_step)

        print('global step: %s, training loss: %.2f, time: %.2fs'
              % (global_step, training_loss, time() - start_time))
        global_step += 1

    # Log the gradients of the parameters to check convergence.
    for name, param in net.collect_params().items():
        try:
            sw.add_histogram(tag=name + "_grad",
                             values=param.grad(),
                             global_step=global_step,
                             bins=1000)
        except Exception:
            # Best effort: a parameter whose histogram cannot be written is
            # reported and skipped. `Exception` (not a bare except) so that
            # Ctrl-C and interpreter shutdown still stop the run.
            print(name)
            print(param.grad())

    # Compute the validation loss of this epoch.
    compute_val_loss(net, val_loader, loss_function, sw, epoch)
class FIFOScheduler(TaskScheduler):
    r"""Simple scheduler that just runs trials in submission order.

    Parameters
    ----------
    train_fn : callable
        A task launch function for training. Note: please add the `@autogluon_method`
        decorator to the original function.
    args : object (optional)
        Default arguments for launching train_fn. Falls back to `train_fn.args`.
    resource : dict
        Computation resources. For example, `{'num_cpus':2, 'num_gpus':1}`.
        Defaults to `{'num_cpus': 1, 'num_gpus': 0}`.
    searcher : str or object
        Either a searcher name that is handed to `searcher_factory`
        (default: `'random'`) or a `BaseSearcher` instance.
    search_options : dict (optional)
        Extra keyword arguments for `searcher_factory`; only used when
        `searcher` is given by name.
    checkpoint : str
        Path of the checkpoint file. The mxboard logs, if enabled, are written
        to `<checkpoint without extension>/logs`.
    resume : bool
        If True, the scheduler state is restored from `checkpoint`.
    num_trials : int
        Number of trials `run` schedules in total (finished ones included).
    time_out : float
        Time budget of `run` in seconds; no new trial is scheduled after it.
    max_reward : float
        `run` stops scheduling once the best reward reaches this value.
    time_attr : str
        A training result attr to use for comparing time.
        Note that you can pass in something non-temporal such as
        `training_epoch` as a measure of progress, the only requirement
        is that the attribute should increase monotonically.
    reward_attr : str
        The training result objective value attribute. As with `time_attr`, this may refer to
        any objective value. Stopping procedures will use this attribute.
    visualizer : str
        `'tensorboard'` or `'mxboard'` (case-insensitive) mirrors the reported
        results to mxboard; any other value disables it.
    dist_ip_addrs : list of str
        IP addresses of remote machines.

    Examples
    --------
    >>> import numpy as np
    >>> import autogluon as ag
    >>> @ag.args(
    ...     lr=ag.space.Real(1e-3, 1e-2, log=True),
    ...     wd=ag.space.Real(1e-3, 1e-2))
    >>> def train_fn(args, reporter):
    ...     print('lr: {}, wd: {}'.format(args.lr, args.wd))
    ...     for e in range(10):
    ...         dummy_accuracy = 1 - np.power(1.8, -np.random.uniform(e, 2*e))
    ...         reporter(epoch=e, accuracy=dummy_accuracy, lr=args.lr, wd=args.wd)
    >>> scheduler = ag.scheduler.FIFOScheduler(train_fn,
    ...                                        resource={'num_cpus': 2, 'num_gpus': 0},
    ...                                        num_trials=20,
    ...                                        reward_attr='accuracy',
    ...                                        time_attr='epoch')
    >>> scheduler.run()
    >>> scheduler.join_jobs()
    >>> scheduler.get_training_curves(plot=True)
    """

    def __init__(self, train_fn, args=None, resource=None,
                 searcher=None, search_options=None,
                 checkpoint='./exp/checkpoint.ag',
                 resume=False, num_trials=None,
                 time_out=None, max_reward=1.0, time_attr='epoch',
                 reward_attr='accuracy',
                 visualizer='none', dist_ip_addrs=None):
        super().__init__(dist_ip_addrs)
        if resource is None:
            resource = {'num_cpus': 1, 'num_gpus': 0}
        self.resource = resource
        if searcher is None:
            searcher = 'random'  # Default: Random searcher
        if isinstance(searcher, str):
            kwargs = search_options.copy() if search_options else dict()
            kwargs['configspace'] = train_fn.cs
            self.searcher: BaseSearcher = searcher_factory(searcher, **kwargs)
        else:
            assert isinstance(searcher, BaseSearcher)
            self.searcher: BaseSearcher = searcher
        assert isinstance(train_fn, _autogluon_method)
        self.train_fn = train_fn
        self.args = args if args else train_fn.args
        # meta data
        self.metadata = {
            'search_space': train_fn.kwspaces,
            'search_strategy': searcher,
            'stop_criterion': {
                'time_limits': time_out,
                'max_reward': max_reward
            },
            'resources_per_trial': resource
        }
        self.num_trials = num_trials
        self.time_out = time_out
        self.max_reward = max_reward
        self._checkpoint = checkpoint
        self._time_attr = time_attr
        self._reward_attr = reward_attr
        self.visualizer = visualizer.lower()
        if self.visualizer in ('tensorboard', 'mxboard'):
            try_import_mxboard()
            from mxboard import SummaryWriter
            self.mxboard = SummaryWriter(
                logdir=os.path.join(os.path.splitext(checkpoint)[0], 'logs'),
                flush_secs=3,
                verbose=False)
        self.log_lock = mp.Lock()
        self.training_history = OrderedDict()
        self.config_history = OrderedDict()

        if resume:
            if os.path.isfile(checkpoint):
                self.load_state_dict(load(checkpoint))
            else:
                msg = f'checkpoint path {checkpoint} is not available for resume.'
                # logger.error, not logger.exception: no exception is being
                # handled here, so there is no traceback to attach.
                logger.error(msg)
                # NOTE(review): the file is *missing*, so FileNotFoundError
                # would be the accurate type; FileExistsError is kept because
                # callers may already catch it.
                raise FileExistsError(msg)

    def run(self, **kwargs):
        """Run multiple number of trials

        Accepts the optional keywords `num_trials` and `time_out`, which
        override the values given to the constructor. `num_trials` must be an
        int by the time this is called.
        """
        start_time = time.time()
        num_trials = kwargs.get('num_trials', self.num_trials)
        time_out = kwargs.get('time_out', self.time_out)
        logger.info('Starting Experiments')
        logger.info(f'Num of Finished Tasks is {self.num_finished_tasks}')
        logger.info(
            f'Num of Pending Tasks is {num_trials - self.num_finished_tasks}')
        tbar = tqdm(range(self.num_finished_tasks, num_trials))
        for _ in tbar:
            out_of_time = time_out and time.time() - start_time >= time_out
            # The reward is only queried when the time budget is not exhausted.
            if out_of_time or (self.max_reward and
                               self.get_best_reward() >= self.max_reward):
                break
            self.schedule_next()

    def save(self, checkpoint=None):
        """Save Checkpoint

        Writes `state_dict()` to `checkpoint`, or to the path given to the
        constructor when `checkpoint` is None. Only logs a warning if both
        are None.
        """
        if checkpoint is None:
            if self._checkpoint is None:
                logger.warning("Checkpointing is disabled")
            else:
                checkpoint = self._checkpoint
        if checkpoint is not None:
            with self.LOCK:
                mkdir(os.path.dirname(checkpoint))
                save(self.state_dict(), checkpoint)

    def schedule_next(self):
        """Schedule next searcher suggested task
        """
        # Allow for the promotion of a previously chosen config. Also,
        # extra_kwargs contains extra info passed to both add_job and to
        # get_config (if no config is promoted)
        config, extra_kwargs = self._promote_config()
        if config is None:
            # No config to promote: Query next config to evaluate from searcher
            config = self.searcher.get_config(**extra_kwargs)
            extra_kwargs['new_config'] = True
        else:
            # This is not a new config, but a paused one which is now promoted
            extra_kwargs['new_config'] = False
        task = Task(self.train_fn, {'args': self.args, 'config': config},
                    DistributedResource(**self.resource))
        self.add_job(task, **extra_kwargs)

    def run_with_config(self, config):
        """Run with config for final fit.
        It launches a single training trial under any fixed values of the hyperparameters.
        For example, after HPO has identified the best hyperparameter values based on a hold-out dataset,
        one can use this function to retrain a model with the same hyperparameters on all the available labeled data
        (including the hold out set). It can also return other objects or states.
        """
        task = Task(self.train_fn, {'args': self.args, 'config': config},
                    DistributedResource(**self.resource))
        reporter = FakeReporter()
        task.args['reporter'] = reporter
        return self.run_job(task)

    def _dict_from_task(self, task):
        """Return `{'TASK_ID', 'Config'}` for a `Task` or an equivalent dict."""
        if isinstance(task, Task):
            return {'TASK_ID': task.task_id, 'Config': task.args['config']}
        else:
            assert isinstance(task, dict)
            return {'TASK_ID': task['TASK_ID'], 'Config': task['Config']}

    def add_job(self, task, **kwargs):
        """Adding a training task to the scheduler.

        Args:
            task (:class:`autogluon.scheduler.Task`): a new training task

        Relevant entries in kwargs:
        - bracket: HB bracket to be used. Has been sampled in _promote_config
        - new_config: If True, task starts new config eval, otherwise it promotes
          a config (only if type == 'promotion')
        Only if new_config == False:
        - config_key: Internal key for config
        - resume_from: config promoted from this milestone
        - milestone: config promoted to this milestone (next from resume_from)
        """
        cls = FIFOScheduler
        cls.RESOURCE_MANAGER._request(task.resources)
        # reporter
        reporter = DistStatusReporter(remote=task.resources.node)
        task.args['reporter'] = reporter
        # Register pending evaluation
        self.searcher.register_pending(task.args['config'])
        # main process
        job = cls._start_distributed_job(task, cls.RESOURCE_MANAGER)
        # reporter thread
        rp = threading.Thread(target=self._run_reporter,
                              args=(task, job, reporter, self.searcher),
                              daemon=False)
        rp.start()
        task_dict = self._dict_from_task(task)
        task_dict.update({'Task': task, 'Job': job, 'ReporterThread': rp})
        # checkpoint thread
        if self._checkpoint is not None:
            def _save_checkpoint_callback(fut):
                self._cleaning_tasks()
                self.save()
            job.add_done_callback(_save_checkpoint_callback)

        with self.LOCK:
            self.scheduled_tasks.append(task_dict)

    def _clean_task_internal(self, task_dict):
        """Wait for the reporter thread of a finished task."""
        task_dict['ReporterThread'].join()

    def _run_reporter(self, task, task_job, reporter, searcher):
        """Reporter-thread body: consume results of `task` until it is done.

        Every intermediate result is recorded via `_add_training_result`; the
        last one is marked done and passed to `searcher.update`.
        """
        last_result = None
        while not task_job.done():
            reported_result = reporter.fetch()
            if 'traceback' in reported_result:
                # The traceback arrives as data from the trial; no exception is
                # active in this thread, hence logger.error rather than
                # logger.exception.
                logger.error(reported_result['traceback'])
                break

            if reported_result.get('done', False):
                reporter.move_on()
                break
            self._add_training_result(
                task.task_id, reported_result, config=task.args['config'])
            reporter.move_on()
            last_result = reported_result
        if last_result is not None:
            last_result['done'] = True
            searcher.update(
                config=task.args['config'],
                reward=last_result[self._reward_attr], **last_result)

    def _promote_config(self):
        """
        Provides a hook in schedule_next, which allows to promote a config
        which has been selected and partially evaluated previously.

        :return: config, extra_args
        """
        config = None
        extra_args = dict()
        return config, extra_args

    def get_best_config(self):
        """Get the best configuration from the finished jobs.
        """
        return self.searcher.get_best_config()

    def get_best_reward(self):
        """Get the best reward from the finished jobs.
        """
        return self.searcher.get_best_reward()

    def _add_training_result(self, task_id, reported_result, config=None):
        """Mirror one reported result to mxboard (if enabled) and store it.

        The whole `reported_result` is appended to
        `training_history[task_id]`; `config`, if given, is stored in
        `config_history[task_id]`.
        """
        if self.visualizer in ('mxboard', 'tensorboard'):
            # The x-axis is the progress attribute, not the reward: a reward
            # (e.g. an accuracy in [0, 1]) is not a usable step index.
            step = reported_result.get(self._time_attr)
            if step is None:
                # No time attribute reported: use the position of this result
                # in the task's history, as get_training_curves does.
                with self.log_lock:
                    step = len(self.training_history.get(task_id, ()))
            if 'loss' in reported_result:
                self.mxboard.add_scalar(
                    tag='loss',
                    value=(f'task {task_id} valid_loss',
                           reported_result['loss']),
                    global_step=step)
            self.mxboard.add_scalar(
                tag=self._reward_attr,
                value=(f'task {task_id} {self._reward_attr}',
                       reported_result[self._reward_attr]),
                global_step=step)
        with self.log_lock:
            # Note: We store all of reported_result in training_history[task_id],
            # not just the reward value.
            if task_id in self.training_history:
                self.training_history[task_id].append(reported_result)
            else:
                self.training_history[task_id] = [reported_result]
                if config:
                    self.config_history[task_id] = config

    def get_training_curves(self, filename=None, plot=False, use_legend=True):
        """Get Training Curves

        Parameters
        ----------
            filename : str
                If given, the figure is saved to this path.
            plot : bool
                If True, the figure is shown.
            use_legend : bool
                If True, a legend with one entry per task is added.

        Examples
        --------
        >>> scheduler.run()
        >>> scheduler.join_jobs()
        >>> scheduler.get_training_curves(plot=True)

            .. image:: https://github.com/zhanghang1989/AutoGluonWebdata/blob/master/doc/api/autogluon.1.png?raw=true
        """
        if filename is None and not plot:
            logger.warning(
                'Please either provide filename or allow plot in get_training_curves')
        import matplotlib.pyplot as plt
        plt.ylabel(self._reward_attr)
        plt.xlabel(self._time_attr)
        plt.title("Performance vs Training-Time in each HPO Trial")
        with self.log_lock:
            for task_id, task_res in self.training_history.items():
                rewards = [x[self._reward_attr] for x in task_res]
                x = list(range(len(task_res)))
                plt.plot(x, rewards, label=f'task {task_id}')
        if use_legend:
            plt.legend(loc='best')
        if filename:
            logger.info(f'Saving Training Curve in {filename}')
            plt.savefig(filename)
        if plot:
            plt.show()

    def state_dict(self, destination=None):
        """Returns a dictionary containing a whole state of the Scheduler

        Examples
        --------
        >>> ag.save(scheduler.state_dict(), 'checkpoint.ag')
        """
        destination = super().state_dict(destination)
        destination['searcher'] = pickle.dumps(self.searcher)
        with self.log_lock:
            destination['training_history'] = json.dumps(self.training_history)
        if self.visualizer in ('mxboard', 'tensorboard'):
            destination['visualizer'] = json.dumps(self.mxboard._scalar_dict)
        return destination

    def load_state_dict(self, state_dict):
        """Load from the saved state dict.

        Examples
        --------
        >>> scheduler.load_state_dict(ag.load('checkpoint.ag'))
        """
        super().load_state_dict(state_dict)
        # SECURITY NOTE: unpickling executes arbitrary code; only load
        # checkpoints that come from a trusted source.
        self.searcher = pickle.loads(state_dict['searcher'])
        with self.log_lock:
            self.training_history = json.loads(state_dict['training_history'])
        if self.visualizer in ('mxboard', 'tensorboard'):
            self.mxboard._scalar_dict = json.loads(state_dict['visualizer'])
        logger.debug(f'Loading Searcher State {self.searcher}')
class Trainer(object):
    """Trains and validates a PSPNet (ResNet-50 backbone) on COCO semantic
    segmentation, logging the loss and the validation metrics to mxboard."""

    def __init__(self, args):
        """Build the data loaders, model, loss, optimizer and metric.

        Parameters
        ----------
        args
            Run configuration. The attributes read here are ``base_size``,
            ``crop_size``, ``batch_size``, ``test_batch_size``, ``workers``,
            ``dtype``, ``ctx``, ``syncbn``, ``resume``, ``aux``,
            ``aux_weight``, ``lr``, ``epochs``, ``kvstore``,
            ``weight_decay``, ``momentum`` and ``no_wd``.

        Raises
        ------
        RuntimeError
            If ``args.resume`` is set but is not an existing file.
        """
        self.args = args
        self.sw = SummaryWriter(logdir='logs', flush_secs=5)
        # image transform
        input_transform = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize([.485, .456, .406], [.229, .224, .225]),
        ])
        # dataset and dataloader
        data_kwargs = {'transform': input_transform, 'base_size': args.base_size,
                       'crop_size': args.crop_size}
        trainset = COCOSemantic(split='train', mode='train', **data_kwargs)
        valset = COCOSemantic(split='val', mode='val', **data_kwargs)
        self.train_data = gluon.data.DataLoader(
            trainset, args.batch_size, shuffle=True, last_batch='rollover',
            num_workers=args.workers)
        self.eval_data = gluon.data.DataLoader(
            valset, args.test_batch_size,
            last_batch='rollover', num_workers=args.workers)
        model = model_zoo.PSPNet(nclass=trainset.NUM_CLASS, backbone='resnet50',
                                 aux=True, pretrained_base=True)
        model.cast(args.dtype)
        self.net = DataParallelModel(model, args.ctx, args.syncbn)
        self.evaluator = DataParallelModel(SegEvalModel(model), args.ctx)
        # resume checkpoint if needed
        if args.resume is not None:
            if os.path.isfile(args.resume):
                model.load_parameters(args.resume, ctx=args.ctx)
            else:
                raise RuntimeError("=> no checkpoint found at '{}'"
                                   .format(args.resume))
        # create criterion
        criterion = MixSoftmaxCrossEntropyLoss(args.aux, aux_weight=args.aux_weight)
        self.criterion = DataParallelCriterion(criterion, args.ctx, args.syncbn)
        # optimizer and lr scheduling
        self.lr_scheduler = LRScheduler(mode='poly', baselr=args.lr,
                                        niters=len(self.train_data),
                                        nepochs=args.epochs)
        kv = mx.kv.create(args.kvstore)
        optimizer_params = {'lr_scheduler': self.lr_scheduler,
                            'wd': args.weight_decay,
                            'momentum': args.momentum}
        if args.dtype == 'float16':
            optimizer_params['multi_precision'] = True

        if args.no_wd:
            # Disable weight decay on beta, gamma and bias parameters.
            for k, v in self.net.module.collect_params('.*beta|.*gamma|.*bias').items():
                v.wd_mult = 0.0

        self.optimizer = gluon.Trainer(self.net.module.collect_params(), 'sgd',
                                       optimizer_params, kvstore=kv)
        # evaluation metrics
        self.metric = gluoncv.utils.metrics.SegmentationMetric(trainset.num_class)

    def training(self, epoch):
        """Run one training epoch and save a checkpoint afterwards.

        Parameters
        ----------
        epoch : int
            Epoch index; passed to the LR scheduler, shown in the progress
            bar and used as the mxboard global step.
        """
        # Use the configuration this trainer was built with rather than a
        # module-level `args` that may not exist or may differ.
        args = self.args
        tbar = tqdm(self.train_data)
        train_loss = 0.0
        for i, (data, target) in enumerate(tbar):
            self.lr_scheduler.update(i, epoch)
            with autograd.record(True):
                outputs = self.net(data.astype(args.dtype, copy=False))
                losses = self.criterion(outputs, target)
                mx.nd.waitall()
                autograd.backward(losses)
            self.optimizer.step(args.batch_size)
            # Accumulate the mean loss of this batch over the losses returned
            # by the data-parallel criterion.
            for loss in losses:
                train_loss += loss.asnumpy()[0] / len(losses)
            # NOTE(review): this logs the running *sum* of the epoch at the
            # same global_step for every batch, while the progress bar shows
            # the running mean - confirm which one is intended.
            self.sw.add_scalar(tag='Training MixSoftmaxCrossEntropyLoss',
                               value=train_loss, global_step=epoch)
            tbar.set_description('Epoch %d, training loss %.3f' %
                                 (epoch, train_loss / (i + 1)))
            mx.nd.waitall()

        # save every epoch
        save_checkpoint(self.net.module, args, False)

    def validation(self, epoch):
        """Evaluate on the validation loader and log pixAcc / mIoU.

        Parameters
        ----------
        epoch : int
            Epoch index; shown in the progress bar and used as the mxboard
            global step.
        """
        args = self.args  # same configuration as in __init__, see training()
        self.metric.reset()
        tbar = tqdm(self.eval_data)
        for i, (data, target) in enumerate(tbar):
            outputs = self.evaluator(data.astype(args.dtype, copy=False))
            outputs = [x[0] for x in outputs]
            targets = mx.gluon.utils.split_and_load(target, args.ctx, even_split=False)
            self.metric.update(targets, outputs)
            # The metric is cumulative, so these are the running values over
            # the batches seen so far.
            pixAcc, mIoU = self.metric.get()
            tbar.set_description('Epoch %d, validation pixAcc: %.3f, mIoU: %.3f' %
                                 (epoch, pixAcc, mIoU))
            self.sw.add_scalar(tag='Pixel Accuray (on valset)', value=pixAcc,
                               global_step=epoch)
            self.sw.add_scalar(tag='mIoU (on valset)', value=mIoU,
                               global_step=epoch)
            mx.nd.waitall()
class TabularNeuralNetModel(AbstractNeuralNetworkModel):
    """ Class for neural network models that operate on tabular data.
        These networks use different types of input layers to process different types of data in various columns.

        Attributes:
            _types_of_features (dict): keys = 'continuous', 'skewed', 'onehot', 'embed', 'language'; values = column-names of Dataframe corresponding to the features of this type
            feature_arraycol_map (OrderedDict): maps feature-name -> list of column-indices in df corresponding to this feature
            feature_type_map (OrderedDict): maps feature-name -> feature_type string (options: 'vector', 'embed', 'language')
            processor (sklearn.ColumnTransformer): scikit-learn preprocessor object.

        Note: This model always assumes higher values of self.eval_metric indicate better performance.
    """

    # Constants used throughout this class:
    # model_internals_file_name = 'model-internals.pkl' # store model internals here
    unique_category_str = '!missing!'  # string used to represent missing values and unknown categories for categorical features. Should not appear in the dataset
    rescale_losses = {gluon.loss.L1Loss: 'std', gluon.loss.HuberLoss: 'std', gluon.loss.L2Loss: 'var'}  # dict of loss names where we should rescale loss, value indicates how to rescale. Call self.loss_func.name
    params_file_name = 'net.params'  # Stores parameters of final network
    temp_file_name = 'temp_net.params'  # Stores temporary network parameters (eg. during the course of training)

    def __init__(self, **kwargs):
        """ TabularNeuralNetModel object.

        Parameters
        ----------
        path (str): file-path to directory where to save files associated with this model
        name (str): name used to refer to this model
        problem_type (str): what type of prediction problem is this model used for
        eval_metric (func): function used to evaluate performance (Note: we assume higher = better)
        hyperparameters (dict): various hyperparameters for neural network and the NN-specific data processing
        features (list): List of predictive features to use, other features are ignored by the model.
        """
        super().__init__(**kwargs)
        self.feature_arraycol_map = None
        self.feature_type_map = None
        self.features_to_drop = []  # may change between different bagging folds. TODO: consider just removing these from self.features if it works with bagging
        self.processor = None  # data processor
        self.summary_writer = None
        self.ctx = mx.cpu()
        self.batch_size = None
        self.num_dataloading_workers = None
        self.num_dataloading_workers_inference = 0
        self.params_post_fit = None
        self.num_net_outputs = None
        self._architecture_desc = None
        self.optimizer = None
        self.verbosity = None
        if self.stopping_metric is not None and self.eval_metric == roc_auc and self.stopping_metric == log_loss:
            self.stopping_metric = roc_auc  # NN is overconfident so early stopping with logloss can halt training too quick

        self.eval_metric_name = self.stopping_metric.name

    def _set_default_params(self):
        """ Specifies hyperparameter values to use by default """
        default_params = get_default_param(self.problem_type)
        for param, val in default_params.items():
            self._set_default_param_value(param, val)

    def _get_default_auxiliary_params(self) -> dict:
        """ Returns the parent's auxiliary params, updated with the feature type groups this model ignores """
        default_auxiliary_params = super()._get_default_auxiliary_params()
        extra_auxiliary_params = dict(
            ignored_type_group_raw=[R_OBJECT],
            ignored_type_group_special=[S_TEXT_NGRAM, S_TEXT_AS_CATEGORY],
        )
        default_auxiliary_params.update(extra_auxiliary_params)
        return default_auxiliary_params

    def _get_default_searchspace(self):
        """ Returns the default hyperparameter search space for self.problem_type """
        return get_default_searchspace(self.problem_type, num_classes=None)

    def set_net_defaults(self, train_dataset, params):
        """ Sets dataset-adaptive default values to use for our neural network.

            Sets self.num_net_outputs and fills in (in place) the entries of `params` that are still None:
            'y_range' (regression only), 'layers' and 'numeric_embed_dim'.

            Raises:
                ValueError: if self.problem_type is not one of the handled problem types.
        """
        if (self.problem_type == MULTICLASS) or (self.problem_type == SOFTCLASS):
            self.num_net_outputs = train_dataset.num_classes
        elif self.problem_type == REGRESSION:
            self.num_net_outputs = 1
            if params['y_range'] is None:  # Infer default y-range
                y_vals = train_dataset.dataset._data[train_dataset.label_index].asnumpy()
                # numpy reductions instead of the builtin min/max, which iterate element by element
                min_y = float(np.min(y_vals))
                max_y = float(np.max(y_vals))
                std_y = np.std(y_vals)
                y_ext = params['y_range_extend'] * std_y
                if min_y >= 0:  # infer y must be nonnegative
                    min_y = max(0, min_y-y_ext)
                else:
                    min_y = min_y-y_ext
                if max_y <= 0:  # infer y must be non-positive
                    max_y = min(0, max_y+y_ext)
                else:
                    max_y = max_y+y_ext
                params['y_range'] = (min_y, max_y)
        elif self.problem_type == BINARY:
            self.num_net_outputs = 2
        else:
            raise ValueError("unknown problem_type specified: %s" % self.problem_type)

        if params['layers'] is None:  # Use default choices for MLP architecture
            if self.problem_type == REGRESSION:
                default_layer_sizes = [256, 128]  # overall network will have 4 layers. Input layer, 256-unit hidden layer, 128-unit hidden layer, output layer.
            else:
                default_sizes = [256, 128]  # will be scaled adaptively
                # base_size = max(1, min(self.num_net_outputs, 20)/2.0) # scale layer width based on number of classes
                base_size = max(1, min(self.num_net_outputs, 100) / 50)  # TODO: Updated because it improved model quality and made training far faster
                default_layer_sizes = [defaultsize*base_size for defaultsize in default_sizes]
            layer_expansion_factor = 1  # TODO: consider scaling based on num_rows, eg: layer_expansion_factor = 2-np.exp(-max(0,train_dataset.num_examples-10000))
            max_layer_width = params['max_layer_width']
            params['layers'] = [int(min(max_layer_width, layer_expansion_factor*defaultsize)) for defaultsize in default_layer_sizes]

        if train_dataset.has_vector_features() and params['numeric_embed_dim'] is None:  # Use default choices for numeric embedding size
            vector_dim = train_dataset.dataset._data[train_dataset.vectordata_index].shape[1]  # total dimensionality of vector features
            prop_vector_features = train_dataset.num_vector_features() / float(train_dataset.num_features)  # Fraction of features that are numeric
            min_numeric_embed_dim = 32
            max_numeric_embed_dim = params['max_layer_width']
            params['numeric_embed_dim'] = int(min(max_numeric_embed_dim, max(min_numeric_embed_dim,
                                                  params['layers'][0]*prop_vector_features*np.log10(vector_dim+10))))
        return

    def _fit(self, X_train, y_train, X_val=None, y_val=None, time_limit=None, reporter=None, **kwargs):
        """ Builds the datasets and the network, then trains it.

            X_train (pd.DataFrame): training data features (not necessarily preprocessed yet)
            X_val (pd.DataFrame): test data features (should have same column names as Xtrain)
            y_train (pd.Series):
            y_val (pd.Series): are pandas Series
            time_limit: if truthy, the time already spent in this method is subtracted before it is passed to train_net()
            reporter: forwarded unchanged to train_net()
            kwargs: Can specify amount of compute resources to utilize (num_cpus, num_gpus) and 'verbosity'.

            Raises:
                ValueError: if self.feature_metadata is None.
        """
        start_time = time.time()
        params = self.params.copy()
        self.verbosity = kwargs.get('verbosity', 2)
        params = fixedvals_from_searchspaces(params)
        if self.feature_metadata is None:
            raise ValueError("Trainer class must set feature_metadata for this model")
        if 'num_cpus' in kwargs:
            self.num_dataloading_workers = max(1, int(kwargs['num_cpus']/2.0))
        else:
            self.num_dataloading_workers = 1
        if self.num_dataloading_workers == 1:
            self.num_dataloading_workers = 0  # 0 is always faster and uses less memory than 1
        self.batch_size = params['batch_size']
        train_dataset, val_dataset = self.generate_datasets(X_train=X_train, y_train=y_train, params=params, X_val=X_val, y_val=y_val)
        logger.log(15, "Training data for neural network has: %d examples, %d features (%d vector, %d embedding, %d language)" %
                   (train_dataset.num_examples, train_dataset.num_features,
                    len(train_dataset.feature_groups['vector']), len(train_dataset.feature_groups['embed']),
                    len(train_dataset.feature_groups['language'])))
        # self._save_preprocessor() # TODO: should save these things for hyperparam tunning. Need one HP tuner for network-specific HPs, another for preprocessing HPs.

        if 'num_gpus' in kwargs and kwargs['num_gpus'] >= 1:
            self.ctx = mx.gpu()  # Currently cannot use more than 1 GPU
        else:
            self.ctx = mx.cpu()
        self.get_net(train_dataset, params=params)

        if time_limit:
            time_elapsed = time.time() - start_time
            time_limit = time_limit - time_elapsed

        self.train_net(train_dataset=train_dataset, params=params, val_dataset=val_dataset, initialize=True, setup_trainer=True, time_limit=time_limit, reporter=reporter)
        self.params_post_fit = params
        # TODO: if we don't want to save intermediate network parameters, need to do something like saving in temp directory to clean up after training:
        # with make_temp_directory() as temp_dir:
        #     save_callback = SaveModelCallback(self.model, monitor=self.metric, mode=save_callback_mode, name=self.name)
        #     with progress_disabled_ctx(self.model) as model:
        #         original_path = model.path
        #         model.path = Path(temp_dir)
        #         model.fit_one_cycle(self.epochs, self.lr, callbacks=save_callback)
        #         # Load the best one and export it
        #         model.load(self.name)
        #         print(f'Model validation metrics: {model.validate()}')
        #         model.path = original_path

    def get_net(self, train_dataset, params):
        """ Creates a Gluon neural net for this dataset on self.ctx and stores it in self.model.
            Also fills in dataset-adaptive defaults in `params` (via set_net_defaults) and makes sure self.path exists.
        """
        self.set_net_defaults(train_dataset, params)
        self.model = EmbedNet(train_dataset=train_dataset, params=params, num_net_outputs=self.num_net_outputs, ctx=self.ctx)

        # TODO: Below should not occur until at time of saving
        os.makedirs(self.path, exist_ok=True)  # no check-then-create race

    def train_net(self, train_dataset, params, val_dataset=None, initialize=True, setup_trainer=True, time_limit=None, reporter=None):
        """ Trains neural net on given train dataset, early stops based on val_dataset.

            Args:
                train_dataset (TabularNNDataset): training data used to learn network weights
                params (dict): hyperparameters; 'loss_function' is filled in here when it is None
                val_dataset (TabularNNDataset): validation data used for hyperparameter tuning
                initialize (bool): set = False to continue training of a previously trained model, otherwise initializes network weights randomly
                setup_trainer (bool): set = False to reuse the same trainer from a previous training run, otherwise creates new trainer from scratch
                time_limit: if truthy, training stops after the first epoch that ends past this many seconds
                reporter: if not None, called after each epoch with a non-NaN validation score

            Raises:
                ValueError: if self.rescale_losses maps the loss to an unknown rescaling type.
        """
        start_time = time.time()
        logger.log(15, "Training neural network for up to %s epochs..." % params['num_epochs'])
        seed_value = params.get('seed_value')
        if seed_value is not None:  # Set seed
            random.seed(seed_value)
            np.random.seed(seed_value)
            mx.random.seed(seed_value)
        if initialize:  # Initialize the weights of network
            logger.debug("initializing neural network...")
            self.model.collect_params().initialize(ctx=self.ctx)
            self.model.hybridize()
            logger.debug("initialized")
        if setup_trainer:
            # Also setup mxboard to monitor training if visualizer has been specified:
            visualizer = params.get('visualizer', 'none')
            if visualizer in ('tensorboard', 'mxboard'):
                try_import_mxboard()
                from mxboard import SummaryWriter
                self.summary_writer = SummaryWriter(logdir=self.path, flush_secs=5, verbose=False)
            self.optimizer = self.setup_trainer(params=params, train_dataset=train_dataset)
        best_val_metric = -np.inf  # higher = better
        val_metric = None
        best_val_epoch = 0
        val_improve_epoch = 0  # most recent epoch where validation-score strictly improved
        num_epochs = params['num_epochs']
        if val_dataset is not None:
            y_val = val_dataset.get_labels()
        else:
            y_val = None

        if params['loss_function'] is None:
            if self.problem_type == REGRESSION:
                params['loss_function'] = gluon.loss.L1Loss()
            elif self.problem_type == SOFTCLASS:
                params['loss_function'] = gluon.loss.SoftmaxCrossEntropyLoss(sparse_label=False, from_logits=self.model.from_logits)
            else:
                params['loss_function'] = gluon.loss.SoftmaxCrossEntropyLoss(from_logits=self.model.from_logits)

        loss_func = params['loss_function']
        epochs_wo_improve = params['epochs_wo_improve']
        loss_scaling_factor = 1.0  # we divide loss by this quantity to stabilize gradients
        loss_torescale = [key for key in self.rescale_losses if isinstance(loss_func, key)]
        if loss_torescale:
            loss_torescale = loss_torescale[0]
            if self.rescale_losses[loss_torescale] == 'std':
                loss_scaling_factor = np.std(train_dataset.get_labels())/5.0 + EPS  # std-dev of labels
            elif self.rescale_losses[loss_torescale] == 'var':
                loss_scaling_factor = np.var(train_dataset.get_labels())/5.0 + EPS  # variance of labels
            else:
                raise ValueError("Unknown loss-rescaling type %s specified for loss_func==%s" % (self.rescale_losses[loss_torescale], loss_func))

        if self.verbosity <= 1:
            verbose_eval = -1  # Print losses every verbose epochs, Never if -1
        elif self.verbosity == 2:
            verbose_eval = 50
        elif self.verbosity == 3:
            verbose_eval = 10
        else:
            verbose_eval = 1

        net_filename = self.path + self.temp_file_name
        if num_epochs == 0:  # use dummy training loop that stops immediately (useful for using NN just for data preprocessing / debugging)
            logger.log(20, "Not training Neural Net since num_epochs == 0. Neural network architecture is:")
            # Take a single optimisation step on the first batch, then save and return.
            # If the dataloader yields nothing, control falls through to the (empty) epoch loop below.
            for data_batch in train_dataset.dataloader:
                data_batch = train_dataset.format_batch_data(data_batch, self.ctx)
                with autograd.record():
                    output = self.model(data_batch)
                    labels = data_batch['label']
                    loss = loss_func(output, labels) / loss_scaling_factor
                    # print(str(nd.mean(loss).asscalar()), end="\r") # prints per-batch losses
                loss.backward()
                self.optimizer.step(labels.shape[0])
                self.model.save_parameters(net_filename)
                logger.log(15, "untrained Neural Net saved to file")
                return

        # Training Loop:
        for e in range(num_epochs):
            if e == 0:  # special actions during first epoch:
                logger.log(15, "Neural network architecture:")
                logger.log(15, str(self.model))
            cumulative_loss = 0
            for data_batch in train_dataset.dataloader:
                data_batch = train_dataset.format_batch_data(data_batch, self.ctx)
                with autograd.record():
                    output = self.model(data_batch)
                    labels = data_batch['label']
                    loss = loss_func(output, labels) / loss_scaling_factor
                    # print(str(nd.mean(loss).asscalar()), end="\r") # prints per-batch losses
                loss.backward()
                self.optimizer.step(labels.shape[0])
                cumulative_loss += loss.sum()
            train_loss = cumulative_loss/float(train_dataset.num_examples)  # training loss this epoch
            if val_dataset is not None:
                val_metric = self.score(X=val_dataset, y=y_val, eval_metric=self.stopping_metric, metric_needs_y_pred=self.stopping_metric_needs_y_pred)
            if (val_dataset is None) or (val_metric >= best_val_metric) or (e == 0):  # keep training if score has improved
                if val_dataset is not None:
                    if not np.isnan(val_metric):
                        if val_metric > best_val_metric:
                            val_improve_epoch = e
                        best_val_metric = val_metric
                best_val_epoch = e
                # Until functionality is added to restart training from a particular epoch, there is no point in saving params without test_dataset
                if val_dataset is not None:
                    self.model.save_parameters(net_filename)
            if val_dataset is not None:
                if verbose_eval > 0 and e % verbose_eval == 0:
                    logger.log(15, "Epoch %s. Train loss: %s, Val %s: %s" %
                               (e, train_loss.asscalar(), self.eval_metric_name, val_metric))
                if self.summary_writer is not None:
                    self.summary_writer.add_scalar(tag='val_'+self.eval_metric_name,
                                                   value=val_metric, global_step=e)
            else:
                if verbose_eval > 0 and e % verbose_eval == 0:
                    logger.log(15, "Epoch %s. Train loss: %s" % (e, train_loss.asscalar()))
            if self.summary_writer is not None:
                self.summary_writer.add_scalar(tag='train_loss', value=train_loss.asscalar(), global_step=e)  # TODO: do we want to keep mxboard support?
            if reporter is not None:
                # TODO: Ensure reporter/scheduler properly handle None/nan values after refactor
                if val_dataset is not None and (not np.isnan(val_metric)):  # TODO: This might work without the if statement
                    # epoch must be number of epochs done (starting at 1)
                    reporter(epoch=e+1, validation_performance=val_metric, train_loss=float(train_loss.asscalar()))  # Higher val_metric = better
            if e - val_improve_epoch > epochs_wo_improve:
                break  # early-stop if validation-score hasn't strictly improved in `epochs_wo_improve` consecutive epochs
            if time_limit:
                time_elapsed = time.time() - start_time
                time_left = time_limit - time_elapsed
                if time_left <= 0:
                    logger.log(20, "\tRan out of time, stopping training early.")
                    break

        if val_dataset is not None:
            self.model.load_parameters(net_filename)  # Revert back to best model
            try:
                os.remove(net_filename)
            except FileNotFoundError:
                pass
        if val_dataset is None:
            logger.log(15, "Best model found in epoch %d" % best_val_epoch)
        else:  # evaluate one final time:
            final_val_metric = self.score(X=val_dataset, y=y_val, eval_metric=self.stopping_metric, metric_needs_y_pred=self.stopping_metric_needs_y_pred)
            if np.isnan(final_val_metric):
                final_val_metric = -np.inf
            logger.log(15, "Best model found in epoch %d. Val %s: %s" %
                       (best_val_epoch, self.eval_metric_name, final_val_metric))
        self.params_trained['num_epochs'] = best_val_epoch + 1
        return

    def _predict_proba(self, X, **kwargs):
        """ To align predict with abstract_model API.
            Preprocess here only refers to feature processing steps done by all AbstractModel objects,
            not tabularNN-specific preprocessing steps.
            If X is not DataFrame but instead TabularNNDataset object, we can still produce predictions,
            but cannot use preprocess in this case (needs to be already processed).

            Raises:
                ValueError: if X is neither a pd.DataFrame nor a TabularNNDataset.
        """
        if isinstance(X, TabularNNDataset):
            return self._predict_tabular_data(new_data=X, process=False, predict_proba=True)
        elif isinstance(X, pd.DataFrame):
            X = self.preprocess(X, **kwargs)
            return self._predict_tabular_data(new_data=X, process=True, predict_proba=True)
        else:
            raise ValueError("X must be of type pd.DataFrame or TabularNNDataset, not type: %s" % type(X))

    def _predict_tabular_data(self, new_data, process=True, predict_proba=True):  # TODO ensure API lines up with tabular.Model class.
        """ Specific TabularNN method to produce predictions on new (unprocessed) data.
            Returns 1D numpy array unless predict_proba=True and task is multi-class classification (not binary).

            Args:
                new_data (pd.Dataframe or TabularNNDataset): new data to make predictions on.
                    If you want to make prediction for just a single row of new_data, pass in: new_data.iloc[[row_index]]
                process (bool): should new data be processed (if False, new_data must be TabularNNDataset)
                predict_proba (bool): should we output class-probabilities (not used for regression)

            Raises:
                ValueError: if new_data is not a TabularNNDataset after the optional processing step.
        """
        if process:
            new_data = self.process_test_data(new_data, batch_size=self.batch_size, num_dataloading_workers=self.num_dataloading_workers_inference, labels=None)
        if not isinstance(new_data, TabularNNDataset):
            raise ValueError("new_data must be of type TabularNNDataset if process=False")
        if self.problem_type == REGRESSION or not predict_proba:
            preds = nd.zeros((new_data.num_examples, 1))
        else:
            preds = nd.zeros((new_data.num_examples, self.num_net_outputs))
        i = 0  # row offset of the current batch inside preds
        for data_batch in new_data.dataloader:
            data_batch = new_data.format_batch_data(data_batch, self.ctx)
            preds_batch = self.model(data_batch)
            batch_size = len(preds_batch)
            if self.problem_type != REGRESSION:
                if not predict_proba:  # need to take argmax
                    preds_batch = nd.argmax(preds_batch, axis=1, keepdims=True)
                else:  # need to take softmax
                    preds_batch = nd.softmax(preds_batch, axis=1)
            preds[i:(i+batch_size)] = preds_batch
            i = i+batch_size
        if self.problem_type == REGRESSION or not predict_proba:
            return preds.asnumpy().flatten()  # return 1D numpy array
        elif self.problem_type == BINARY and predict_proba:
            return preds[:, 1].asnumpy()  # for binary problems, only return P(Y==+1)
        return preds.asnumpy()  # return 2D numpy array

    def generate_datasets(self, X_train, y_train, params, X_val=None, y_val=None):
        """ Turns the training (and optional validation) data into TabularNNDataset objects.
            Inputs that already are TabularNNDataset objects are passed through unchanged.
            Sets self.features from the preprocessed training columns when it is still None.

            Returns:
                (train_dataset, val_dataset); val_dataset is None when X_val is None.
        """
        impute_strategy = params['proc.impute_strategy']
        max_category_levels = params['proc.max_category_levels']
        skew_threshold = params['proc.skew_threshold']
        embed_min_categories = params['proc.embed_min_categories']
        use_ngram_features = params['use_ngram_features']

        if isinstance(X_train, TabularNNDataset):
            train_dataset = X_train
        else:
            X_train = self.preprocess(X_train)
            if self.features is None:
                self.features = list(X_train.columns)
            train_dataset = self.process_train_data(
                df=X_train, labels=y_train, batch_size=self.batch_size, num_dataloading_workers=self.num_dataloading_workers,
                impute_strategy=impute_strategy, max_category_levels=max_category_levels, skew_threshold=skew_threshold,
                embed_min_categories=embed_min_categories, use_ngram_features=use_ngram_features,
            )
        if X_val is not None:
            if isinstance(X_val, TabularNNDataset):
                val_dataset = X_val
            else:
                X_val = self.preprocess(X_val)
                val_dataset = self.process_test_data(df=X_val, labels=y_val, batch_size=self.batch_size, num_dataloading_workers=self.num_dataloading_workers_inference)
        else:
            val_dataset = None
        return train_dataset, val_dataset

    def process_test_data(self, df, batch_size, num_dataloading_workers, labels=None):
        """ Process a test/validation DataFrame into a form fit for neural network models,
            using the preprocessor that was previously fit on the training data.

            Args:
                df (pd.DataFrame): Data to be processed (X)
                batch_size: forwarded to TabularNNDataset
                num_dataloading_workers: forwarded to TabularNNDataset
                labels (pd.Series): labels to be processed (y)
            Returns:
                Dataset object
            Raises:
                ValueError: if labels and df differ in length, or if training data has not been processed yet.
        """
        warnings.filterwarnings("ignore", module='sklearn.preprocessing')  # sklearn processing n_quantiles warning
        if labels is not None and len(labels) != len(df):
            raise ValueError("Number of examples in Dataframe does not match number of labels")
        if (self.processor is None or self._types_of_features is None
                or self.feature_arraycol_map is None or self.feature_type_map is None):
            raise ValueError("Need to process training data before test data")
        if self.features_to_drop:
            drop_cols = [col for col in df.columns if col in self.features_to_drop]
            if drop_cols:
                df = df.drop(columns=drop_cols)

        df = self.processor.transform(df)  # 2D numpy array. self.feature_arraycol_map, self.feature_type_map have been previously set while processing training data.
        return TabularNNDataset(df, self.feature_arraycol_map, self.feature_type_map,
                                batch_size=batch_size, num_dataloading_workers=num_dataloading_workers,
                                problem_type=self.problem_type, labels=labels, is_test=True)

    def process_train_data(self, df, batch_size, num_dataloading_workers, impute_strategy, max_category_levels, skew_threshold, embed_min_categories, use_ngram_features, labels):
        """ Preprocess training data and create self.processor object that can be used to process future data.
            This method should only be used once per TabularNeuralNetModel object, otherwise will produce Warning.

            Raises:
                ValueError: if df's columns differ from self.features, if labels is None or has the wrong length,
                    or if the processed array's column count does not match feature_arraycol_map.

        # TODO no label processing for now
        # TODO: language features are ignored for now
        # TODO: add time/ngram features
        # TODO: no filtering of data-frame columns based on statistics, e.g. categorical columns with all unique variables or zero-variance features.
                This should be done in default_learner class for all models not just TabularNeuralNetModel...
        """
        warnings.filterwarnings("ignore", module='sklearn.preprocessing')  # sklearn processing n_quantiles warning
        if set(df.columns) != set(self.features):
            raise ValueError("Column names in provided Dataframe do not match self.features")
        if labels is None:
            raise ValueError("Attempting process training data without labels")
        if len(labels) != len(df):
            raise ValueError("Number of examples in Dataframe does not match number of labels")

        self._types_of_features, df = self._get_types_of_features(df, skew_threshold=skew_threshold, embed_min_categories=embed_min_categories, use_ngram_features=use_ngram_features)  # dict with keys: : 'continuous', 'skewed', 'onehot', 'embed', 'language', values = column-names of df
        logger.log(15, "AutoGluon Neural Network infers features are of the following types:")
        logger.log(15, json.dumps(self._types_of_features, indent=4))
        logger.log(15, "\n")
        self.processor = self._create_preprocessor(impute_strategy=impute_strategy, max_category_levels=max_category_levels)
        df = self.processor.fit_transform(df)  # 2D numpy array
        self.feature_arraycol_map = self._get_feature_arraycol_map(max_category_levels=max_category_levels)  # OrderedDict of feature-name -> list of column-indices in df corresponding to this feature
        num_array_cols = np.sum([len(self.feature_arraycol_map[key]) for key in self.feature_arraycol_map])  # should match number of columns in processed array
        if num_array_cols != df.shape[1]:
            raise ValueError("Error during one-hot encoding data processing for neural network. Number of columns in df array does not match feature_arraycol_map.")

        self.feature_type_map = self._get_feature_type_map()  # OrderedDict of feature-name -> feature_type string (options: 'vector', 'embed', 'language')
        return TabularNNDataset(df, self.feature_arraycol_map, self.feature_type_map,
                                batch_size=batch_size, num_dataloading_workers=num_dataloading_workers,
                                problem_type=self.problem_type, labels=labels, is_test=False)

    def setup_trainer(self, params, train_dataset=None):
        """ Set up optimizer needed for training.
            Network must first be initialized before this.

            Returns:
                gluon.Trainer over self.model's parameters.
            Raises:
                ValueError: if an lr_scheduler is requested without train_dataset, or if params['optimizer'] is unknown.
        """
        optimizer_opts = {'learning_rate': params['learning_rate'], 'wd': params['weight_decay'], 'clip_gradient': params['clip_gradient']}
        if params.get('lr_scheduler') is not None:
            if train_dataset is None:
                raise ValueError("train_dataset cannot be None when lr_scheduler is specified.")
            base_lr = params.get('base_lr', 1e-6)
            target_lr = params.get('target_lr', 1.0)
            warmup_epochs = params.get('warmup_epochs', 10)
            lr_decay = params.get('lr_decay', 0.1)
            lr_mode = params['lr_scheduler']
            num_batches = train_dataset.num_examples // params['batch_size']
            lr_decay_epoch = [max(warmup_epochs, int(params['num_epochs']/3)), max(warmup_epochs+1, int(params['num_epochs']/2)),
                              max(warmup_epochs+2, int(2*params['num_epochs']/3))]
            # Linear warm-up from base_lr to target_lr, followed by the requested schedule back down to base_lr.
            lr_scheduler = LRSequential([
                LRScheduler('linear', base_lr=base_lr, target_lr=target_lr, nepochs=warmup_epochs, iters_per_epoch=num_batches),
                LRScheduler(lr_mode, base_lr=target_lr, target_lr=base_lr, nepochs=params['num_epochs'] - warmup_epochs,
                            iters_per_epoch=num_batches, step_epoch=lr_decay_epoch, step_factor=lr_decay, power=2)
            ])
            optimizer_opts['lr_scheduler'] = lr_scheduler
        if params['optimizer'] == 'sgd':
            if 'momentum' in params:
                optimizer_opts['momentum'] = params['momentum']
            optimizer = gluon.Trainer(self.model.collect_params(), 'sgd', optimizer_opts)
        elif params['optimizer'] == 'adam':  # TODO: Can we try AdamW?
            optimizer = gluon.Trainer(self.model.collect_params(), 'adam', optimizer_opts)
        else:
            raise ValueError("Unknown optimizer specified: %s" % params['optimizer'])
        return optimizer

    @staticmethod
    def convert_df_dtype_to_str(df):
        """ Returns df with every column cast to str """
        return df.astype(str)

    def _get_feature_arraycol_map(self, max_category_levels):
        """ Returns OrderedDict of feature-name -> list of column-indices in processed data array corresponding to this feature

            Raises:
                ValueError: if a feature is handled by two transformers or a transformer name is unknown.
        """
        feature_preserving_transforms = {'continuous', 'skewed', 'ordinal', 'language'}  # these transforms do not alter dimensionality of feature
        feature_arraycol_map = OrderedDict()
        current_colindex = 0
        for transformer in self.processor.transformers_:
            transformer_name = transformer[0]
            transformed_features = transformer[2]
            if transformer_name in feature_preserving_transforms:
                for feature in transformed_features:
                    if feature in feature_arraycol_map:
                        raise ValueError("same feature is processed by two different column transformers: %s" % feature)
                    feature_arraycol_map[feature] = [current_colindex]
                    current_colindex += 1
            elif transformer_name == 'onehot':
                oh_encoder = [step for (name, step) in transformer[1].steps if name == 'onehot'][0]
                for i, feature in enumerate(transformed_features):
                    if feature in feature_arraycol_map:
                        raise ValueError("same feature is processed by two different column transformers: %s" % feature)
                    # A one-hot feature occupies one column per category, capped at max_category_levels+1.
                    oh_dimensionality = min(len(oh_encoder.categories_[i]), max_category_levels+1)
                    feature_arraycol_map[feature] = list(range(current_colindex, current_colindex+oh_dimensionality))
                    current_colindex += oh_dimensionality
            else:
                raise ValueError("unknown transformer encountered: %s" % transformer_name)
        return feature_arraycol_map

    def _get_feature_type_map(self):
        """ Returns OrderedDict of feature-name -> feature_type string (options: 'vector', 'embed', 'language')

            Raises:
                ValueError: if self.feature_arraycol_map is not set yet or a feature has no known type.
        """
        if self.feature_arraycol_map is None:
            raise ValueError("must first call _get_feature_arraycol_map() before _get_feature_type_map()")
        vector_features = self._types_of_features['continuous'] + self._types_of_features['skewed'] + self._types_of_features['onehot']
        feature_type_map = OrderedDict()
        for feature_name in self.feature_arraycol_map:
            if feature_name in vector_features:
                feature_type_map[feature_name] = 'vector'
            elif feature_name in self._types_of_features['embed']:
                feature_type_map[feature_name] = 'embed'
            elif feature_name in self._types_of_features['language']:
                feature_type_map[feature_name] = 'language'
            else:
                raise ValueError("unknown feature type encountered")
        return feature_type_map

    def _create_preprocessor(self, impute_strategy, max_category_levels):
        """ Defines data encoders used to preprocess different data types and returns an sklearn ColumnTransformer object.
            Emits a warning when self.processor has already been set.

            Raises:
                NotImplementedError: if any language features are present.
        """
        if self.processor is not None:
            # warnings.warn actually emits the message; a bare Warning(...) only builds an exception object and drops it.
            warnings.warn("Attempting to process training data for TabularNeuralNetModel, but previously already did this.")

        continuous_features = self._types_of_features['continuous']
        skewed_features = self._types_of_features['skewed']
        onehot_features = self._types_of_features['onehot']
        embed_features = self._types_of_features['embed']
        language_features = self._types_of_features['language']
        transformers = []  # order of various column transformers in this list is important!
        if continuous_features:
            continuous_transformer = Pipeline(steps=[
                ('imputer', SimpleImputer(strategy=impute_strategy)),
                ('scaler', StandardScaler())])
            transformers.append(('continuous', continuous_transformer, continuous_features))
        if skewed_features:
            power_transformer = Pipeline(steps=[
                ('imputer', SimpleImputer(strategy=impute_strategy)),
                ('quantile', QuantileTransformer(output_distribution='normal'))])  # Or output_distribution = 'uniform'
            transformers.append(('skewed', power_transformer, skewed_features))
        if onehot_features:
            onehot_transformer = Pipeline(steps=[
                # TODO: Consider avoiding converting to string for improved memory efficiency
                ('to_str', FunctionTransformer(self.convert_df_dtype_to_str)),
                ('imputer', SimpleImputer(strategy='constant', fill_value=self.unique_category_str)),
                ('onehot', OneHotMergeRaresHandleUnknownEncoder(max_levels=max_category_levels, sparse=False))])  # test-time unknown values will be encoded as all zeros vector
            transformers.append(('onehot', onehot_transformer, onehot_features))
        if embed_features:  # Ordinal transformer applied to convert to-be-embedded categorical features to integer levels
            ordinal_transformer = Pipeline(steps=[
                ('to_str', FunctionTransformer(self.convert_df_dtype_to_str)),
                ('imputer', SimpleImputer(strategy='constant', fill_value=self.unique_category_str)),
                ('ordinal', OrdinalMergeRaresHandleUnknownEncoder(max_levels=max_category_levels))])  # returns 0-n when max_category_levels = n-1. category n is reserved for unknown test-time categories.
            transformers.append(('ordinal', ordinal_transformer, embed_features))
        if language_features:
            raise NotImplementedError("language_features cannot be used at the moment")
        return ColumnTransformer(transformers=transformers)  # numeric features are processed in the same order as in numeric_features vector, so feature-names remain the same.

def save(self, path: str = None, verbose=True) -> str:
    """Save the model object and, separately, the network parameters.

    ``self.model`` and ``self.summary_writer`` are detached while the parent
    ``save`` runs and re-attached afterwards; the network's
    ``architecture_desc`` is stored in ``self._architecture_desc`` for the
    duration of that call so ``load`` can rebuild the network.

    Args:
        path: forwarded to the parent ``save``.
        verbose: forwarded to the parent ``save``.

    Returns:
        The path returned by the parent ``save``.
    """
    if self.model is not None:
        self._architecture_desc = self.model.architecture_desc
    temp_model = self.model
    temp_sw = self.summary_writer
    self.model = None
    self.summary_writer = None
    try:
        path_final = super().save(path=path, verbose=verbose)
    finally:
        # Restore the live objects even if the parent save fails, so the
        # in-memory model is never left without its network.
        self.model = temp_model
        self.summary_writer = temp_sw
        self._architecture_desc = None
    # Export model
    if self.model is not None:
        params_filepath = path_final + self.params_file_name
        # TODO: Don't use os.makedirs here, have save_parameters function in tabular_nn_model that checks if local path or S3 path
        os.makedirs(os.path.dirname(path_final), exist_ok=True)
        self.model.save_parameters(params_filepath)
    return path_final

@classmethod
def load(cls, path: str, reset_paths=True, verbose=True):
    """Load a saved model and rebuild its network from the stored description.

    Args:
        path: forwarded to the parent ``load``.
        reset_paths: forwarded to the parent ``load``.
        verbose: forwarded to the parent ``load``.

    Returns:
        The loaded model. When an architecture description was stored, its
        ``model`` attribute is a fresh ``EmbedNet`` with parameters loaded from
        ``model.path + model.params_file_name``.
    """
    model: TabularNeuralNetModel = super().load(path=path, reset_paths=reset_paths, verbose=verbose)
    if model._architecture_desc is not None:
        model.model = EmbedNet(architecture_desc=model._architecture_desc, ctx=model.ctx)  # recreate network from architecture description
        model._architecture_desc = None
        # TODO: maybe need to initialize/hybridize?
        model.model.load_parameters(model.path + model.params_file_name, ctx=model.ctx)
        # NOTE(review): nesting reconstructed from collapsed source; `save` always
        # stores summary_writer as None, so this reset is a no-op either way.
        model.summary_writer = None
    return model

def hyperparameter_tune(self, X_train, y_train, X_val, y_val, scheduler_options, **kwargs):
    """Performs HPO and sets self.params to best hyperparameter values.

    Args:
        X_train, y_train: training data passed to ``generate_datasets``.
        X_val, y_val: validation data passed to ``generate_datasets``.
        scheduler_options: pair ``(scheduler_func, scheduler_options_dict)``;
            the dict must provide ``['resource']['num_cpus']`` and ``'time_out'``.
        **kwargs: only ``verbosity`` (default 2) is read here.

    Returns:
        Whatever ``self._get_hpo_results`` returns for the finished scheduler.

    Raises:
        ValueError: if ``feature_metadata`` is unset, or either element of
            ``scheduler_options`` is None.
    """
    time_start = time.time()
    self.verbosity = kwargs.get('verbosity', 2)
    logger.log(15, "Beginning hyperparameter tuning for Neural Network...")
    self._set_default_searchspace()  # changes non-specified default hyperparams from fixed values to search-spaces.
    if self.feature_metadata is None:
        raise ValueError("Trainer class must set feature_metadata for this model")
    scheduler_func = scheduler_options[0]
    scheduler_options = scheduler_options[1]
    if scheduler_func is None or scheduler_options is None:
        raise ValueError("scheduler_func and scheduler_options cannot be None for hyperparameter tuning")
    num_cpus = scheduler_options['resource']['num_cpus']
    # num_gpus = scheduler_options['resource']['num_gpus']  # TODO: Currently unused

    params_copy = self.params.copy()

    self.num_dataloading_workers = max(1, int(num_cpus/2.0))
    self.batch_size = params_copy['batch_size']
    train_dataset, val_dataset = self.generate_datasets(X_train=X_train, y_train=y_train, params=params_copy, X_val=X_val, y_val=y_val)
    train_path = self.path + "train"
    val_path = self.path + "validation"
    train_dataset.save(file_prefix=train_path)
    val_dataset.save(file_prefix=val_path)

    if not any(isinstance(params_copy[hyperparam], Space) for hyperparam in params_copy):
        logger.warning("Warning: Attempting to do hyperparameter optimization without any search space (all hyperparameters are already fixed values)")
    else:
        logger.log(15, "Hyperparameter search space for Neural Network: ")
        for hyperparam in params_copy:
            if isinstance(params_copy[hyperparam], Space):
                logger.log(15, str(hyperparam)+ ": "+str(params_copy[hyperparam]))

    util_args = dict(
        train_path=train_path,
        val_path=val_path,
        model=self,
        time_start=time_start,
        time_limit=scheduler_options['time_out']
    )
    tabular_nn_trial.register_args(util_args=util_args, **params_copy)
    scheduler = scheduler_func(tabular_nn_trial, **scheduler_options)
    if ('dist_ip_addrs' in scheduler_options) and (len(scheduler_options['dist_ip_addrs']) > 0):
        # TODO: Ensure proper working directory setup on remote machines
        # This is multi-machine setting, so need to copy dataset to workers:
        logger.log(15, "Uploading preprocessed data to remote workers...")
        scheduler.upload_files([train_path+TabularNNDataset.DATAOBJ_SUFFIX,
                                train_path+TabularNNDataset.DATAVALUES_SUFFIX,
                                val_path+TabularNNDataset.DATAOBJ_SUFFIX,
                                val_path+TabularNNDataset.DATAVALUES_SUFFIX])  # TODO: currently does not work.
        logger.log(15, "uploaded")

    scheduler.run()
    scheduler.join_jobs()
    scheduler.get_training_curves(plot=False, use_legend=False)

    return self._get_hpo_results(scheduler=scheduler, scheduler_options=scheduler_options, time_start=time_start)

def get_info(self):
    """Return the parent's info dict extended with ``hyperparameters_post_fit``."""
    info = super().get_info()
    info['hyperparameters_post_fit'] = self.params_post_fit
    return info

def reduce_memory_size(self, remove_fit=True, requires_save=True, **kwargs):
    """Delegate to the parent, then drop the optimizer when both flags are set."""
    super().reduce_memory_size(remove_fit=remove_fit, requires_save=requires_save, **kwargs)
    if remove_fit and requires_save:
        self.optimizer = None
def do_training(args, module, data_train, data_val, begin_epoch=0):
    """Train ``module`` on ``data_train``, validating on ``data_val`` each epoch.

    All hyper-parameters are read from ``args.config`` (sections 'common',
    'train', 'optimizer', 'arch'). Per-epoch CER and loss values are written
    to an mxboard ``SummaryWriter`` rooted at ``common.mxboard_log_dir``;
    checkpoints are written every ``save_checkpoint_every_n_batch`` batches
    and every ``save_checkpoint_every_n_epoch`` epochs.

    :param args: object exposing a ``config`` with ConfigParser-style getters.
    :param module: the module to train; when bucketing is enabled and
        ``mode == 'load'`` it is used as the ``sym_gen`` of a new
        ``STTBucketingModule`` that replaces it.
    :param data_train: training data iterator.
    :param data_val: validation data iterator.
    :param begin_epoch: epoch number to start counting from.
    """
    from distutils.dir_util import mkpath
    from log_util import LogUtil

    log = LogUtil.getInstance().getlogger()
    mkpath(os.path.dirname(get_checkpoint_path(args)))

    #seq_len = args.config.get('arch', 'max_t_count')
    batch_size = args.config.getint('common', 'batch_size')
    save_checkpoint_every_n_epoch = args.config.getint('common', 'save_checkpoint_every_n_epoch')
    save_checkpoint_every_n_batch = args.config.getint('common', 'save_checkpoint_every_n_batch')
    enable_logging_train_metric = args.config.getboolean('train', 'enable_logging_train_metric')
    enable_logging_validation_metric = args.config.getboolean('train', 'enable_logging_validation_metric')

    contexts = parse_contexts(args)
    num_gpu = len(contexts)
    eval_metric = STTMetric(batch_size=batch_size, num_gpu=num_gpu, is_logging=enable_logging_validation_metric,is_epoch_end=True)
    # mxboard setting
    loss_metric = STTMetric(batch_size=batch_size, num_gpu=num_gpu, is_logging=enable_logging_train_metric,is_epoch_end=False)

    optimizer = args.config.get('optimizer', 'optimizer')
    learning_rate = args.config.getfloat('train', 'learning_rate')
    learning_rate_annealing = args.config.getfloat('train', 'learning_rate_annealing')

    mode = args.config.get('common', 'mode')
    num_epoch = args.config.getint('train', 'num_epoch')
    clip_gradient = args.config.getfloat('optimizer', 'clip_gradient')
    weight_decay = args.config.getfloat('optimizer', 'weight_decay')
    save_optimizer_states = args.config.getboolean('train', 'save_optimizer_states')
    show_every = args.config.getint('train', 'show_every')
    optimizer_params_dictionary = json.loads(args.config.get('optimizer', 'optimizer_params_dictionary'))
    kvstore_option = args.config.get('common', 'kvstore_option')
    n_epoch=begin_epoch
    is_bucketing = args.config.getboolean('arch', 'is_bucketing')

    # A configured value of 0 means "no gradient clipping".
    if clip_gradient == 0:
        clip_gradient = None
    if is_bucketing and mode == 'load':
        model_file = args.config.get('common', 'model_file')
        model_name = os.path.splitext(model_file)[0]
        # The last four characters of the file stem are parsed as the epoch number.
        model_num_epoch = int(model_name[-4:])

        model_path = 'checkpoints/' + str(model_name[:-5])
        # NOTE(review): the returned values are not used below; the call is kept
        # in case the symbol generator has side effects - confirm.
        symbol, data_names, label_names = module(1600)
        model = STTBucketingModule(
            sym_gen=module,
            default_bucket_key=data_train.default_bucket_key,
            context=contexts)
        data_train.reset()

        model.bind(data_shapes=data_train.provide_data,
                   label_shapes=data_train.provide_label,
                   for_training=True)
        _, arg_params, aux_params = mx.model.load_checkpoint(model_path, model_num_epoch)
        model.set_params(arg_params, aux_params)
        module = model
    else:
        module.bind(data_shapes=data_train.provide_data,
                    label_shapes=data_train.provide_label,
                    for_training=True)

    if begin_epoch == 0 and mode == 'train':
        module.init_params(initializer=get_initializer(args))

    lr_scheduler = SimpleLRScheduler(learning_rate=learning_rate)

    def reset_optimizer(force_init=False):
        """(Re)initialise the module optimizer from the configured settings."""
        optimizer_params = {'lr_scheduler': lr_scheduler,
                            'clip_gradient': clip_gradient,
                            'wd': weight_decay}
        optimizer_params.update(optimizer_params_dictionary)
        module.init_optimizer(kvstore=kvstore_option,
                              optimizer=optimizer,
                              optimizer_params=optimizer_params,
                              force_init=force_init)

    # Force a fresh optimizer only when training from scratch.
    reset_optimizer(force_init=(mode == "train"))
    data_train.reset()
    data_train.is_first_epoch = True

    #mxboard setting
    mxlog_dir = args.config.get('common', 'mxboard_log_dir')
    summary_writer = SummaryWriter(mxlog_dir)

    try:
        while n_epoch < num_epoch:
            loss_metric.reset()
            log.info('---------train---------')
            for nbatch, data_batch in enumerate(data_train):
                module.forward_backward(data_batch)
                module.update()
                # mxboard setting
                if (nbatch + 1) % show_every == 0:
                    module.update_metric(loss_metric, data_batch.label)
                #summary_writer.add_scalar('loss batch', loss_metric.get_batch_loss(), nbatch)
                if (nbatch+1) % save_checkpoint_every_n_batch == 0:
                    log.info('Epoch[%d] Batch[%d] SAVE CHECKPOINT', n_epoch, nbatch)
                    module.save_checkpoint(prefix=get_checkpoint_path(args)+"n_epoch"+str(n_epoch)+"n_batch",
                                           epoch=(int((nbatch+1)/save_checkpoint_every_n_batch)-1),
                                           save_optimizer_states=save_optimizer_states)
            # commented for Libri_sample data set to see only train cer
            log.info('---------validation---------')
            data_val.reset()
            eval_metric.reset()
            for nbatch, data_batch in enumerate(data_val):
                # when is_train = False it leads to high cer when batch_norm
                module.forward(data_batch, is_train=True)
                module.update_metric(eval_metric, data_batch.label)

            # mxboard setting
            val_cer, val_n_label, val_l_dist, _ = eval_metric.get_name_value()
            log.info("Epoch[%d] val cer=%f (%d / %d)", n_epoch, val_cer, int(val_n_label - val_l_dist), val_n_label)
            curr_acc = val_cer
            summary_writer.add_scalar('CER validation', val_cer, n_epoch)
            assert curr_acc is not None, 'cannot find Acc_exclude_padding in eval metric'

            data_train.reset()
            data_train.is_first_epoch = False

            # mxboard setting
            train_cer, train_n_label, train_l_dist, train_ctc_loss = loss_metric.get_name_value()
            summary_writer.add_scalar('loss epoch', train_ctc_loss, n_epoch)
            summary_writer.add_scalar('CER train', train_cer, n_epoch)

            # save checkpoints
            if n_epoch % save_checkpoint_every_n_epoch == 0:
                log.info('Epoch[%d] SAVE CHECKPOINT', n_epoch)
                module.save_checkpoint(prefix=get_checkpoint_path(args), epoch=n_epoch, save_optimizer_states=save_optimizer_states)

            n_epoch += 1

            # NOTE(review): this assigns the same value every epoch (base rate /
            # annealing), it does not compound - confirm that is intended.
            lr_scheduler.learning_rate=learning_rate/learning_rate_annealing
    finally:
        # Flush and release the event file even when training aborts.
        summary_writer.close()

    log.info('FINISH')
# in case our last batch was the tail batch of the dataloader, # make sure we feed a full batch of noise noise = mx.ndarray.random.normal(shape=(opt.batchSize, nz, 1, 1), ctx=context) with autograd.record(): fake = netG(noise) errG = netD(fake) errG.backward() trainerG.step(1) gen_iterations += 1 print('[%d/%d][%d/%d][%d] Loss_D: %f Loss_G: %f Loss_D_real: %f Loss_D_fake %f' % (epoch, opt.niter, i, len(dataloader), gen_iterations, errD.asnumpy()[0], errG.asnumpy()[0], errD_real.asnumpy()[0], errD_fake.asnumpy()[0])) sw.add_scalar( tag='loss_D', value=-errD.asnumpy()[0], global_step=gen_iterations ) if gen_iterations % 500 == 0: real_cpu = data * 0.5 + 0.5 save_images(real_cpu.asnumpy().transpose(0, 2, 3, 1), '{0}/real_samples.png'.format(opt.experiment)) fake = netG(fixed_noise.as_in_context(context)) fake = fake * 0.5 + 0.5 save_images(fake.asnumpy().transpose(0, 2, 3, 1), '{0}/fake_samples_{1}.png'.format(opt.experiment, gen_iterations)) # do checkpointing netG.save_params('{0}/netG_epoch_{1}.pth'.format(opt.experiment, epoch)) netD.save_params('{0}/netD_epoch_{1}.pth'.format(opt.experiment, epoch))
def model_train(blocks, args, dataset, cheb_polys, ctx, logdir='./logdir'):
    '''
    Train an STGCN model, logging training/validation loss to mxboard and
    saving parameters under ``params/`` every ``args.save`` epochs.

    Parameters
    ----------
    blocks: list[list], model structure, e.g. [[1, 32, 64], [64, 32, 128]]

    args: argparse.Namespace

    dataset: Dataset

    cheb_polys: mx.ndarray,
                shape is (num_of_vertices, order_of_cheb * num_of_vertices)

    ctx: mx.context.Context

    logdir: str, path of mxboard logdir

    '''

    num_of_vertices = args.num_of_vertices
    n_his, n_pred = args.n_his, args.n_pred
    order_of_cheb, Kt = args.order_of_cheb, args.kt
    batch_size, epochs = args.batch_size, args.epochs
    keep_prob = args.keep_prob

    # data
    train = dataset['train'].transpose((0, 3, 1, 2))
    val = dataset['val'].transpose((0, 3, 1, 2))
    test = dataset['test'].transpose((0, 3, 1, 2))

    # Along axis 2 the first n_his steps are the input, the rest are targets.
    train_x, train_y = train[:, :, :n_his, :], train[:, :, n_his:, :]
    val_x, val_y = val[:, :, :n_his, :], val[:, :, n_his:, :]
    test_x, test_y = test[:, :, :n_his, :], test[:, :, n_his:, :]

    print(train_x.shape, train_y.shape,
          val_x.shape, val_y.shape,
          test_x.shape, test_y.shape)

    train_loader = gluon.data.DataLoader(
        gluon.data.ArrayDataset(nd.array(train_x), nd.array(train_y)),
        batch_size=batch_size, shuffle=False)
    val_loader = gluon.data.DataLoader(
        gluon.data.ArrayDataset(nd.array(val_x), nd.array(val_y)),
        batch_size=batch_size, shuffle=False)
    test_loader = gluon.data.DataLoader(
        gluon.data.ArrayDataset(nd.array(test_x), nd.array(test_y)),
        batch_size=batch_size, shuffle=False)

    # Test targets rescaled with the dataset's std/mean for evaluation.
    ground_truth = (
        np.concatenate([y.asnumpy() for x, y in test_loader], axis=0) *
        dataset.std + dataset.mean)

    # model
    model = hybrid_model.STGCN(n_his=n_his, order_of_cheb=order_of_cheb,
                               Kt=Kt, blocks=blocks, keep_prob=keep_prob,
                               num_of_vertices=num_of_vertices,
                               cheb_polys=cheb_polys)
    model.initialize(ctx=ctx, init=mx.init.Xavier())
    model.hybridize()

    # loss function
    loss = gluon.loss.L2Loss()

    # trainer
    trainer = gluon.Trainer(model.collect_params(), args.opt)
    trainer.set_learning_rate(args.lr)

    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs('params', exist_ok=True)

    sw = SummaryWriter(logdir=logdir, flush_secs=5)
    train_step = 0
    val_step = 0

    try:
        for epoch in range(epochs):
            for x, y in train_loader:
                tmp = nd.concat(x, y, dim=2)
                # Slide an n_his-long window over the concatenated sequence and
                # train on predicting the single step that follows it.
                for pred_idx in range(n_pred):
                    end_idx = pred_idx + n_his
                    x_ = tmp[:, :, pred_idx:end_idx, :]
                    y_ = tmp[:, :, end_idx:end_idx + 1, :]
                    with autograd.record():
                        l = loss(model(x_.as_in_context(ctx)),
                                 y_.as_in_context(ctx))
                    l.backward()
                    sw.add_scalar(tag='training_loss',
                                  value=l.mean().asscalar(),
                                  global_step=train_step)
                    trainer.step(x.shape[0])
                    train_step += 1

            val_loss_list = []
            for x, y in val_loader:
                pred = predict_batch(model, ctx, x, n_pred)
                val_loss_list.append(loss(pred, y).mean().asscalar())
            # An empty validation loader would otherwise divide by zero.
            if val_loss_list:
                sw.add_scalar(tag='val_loss',
                              value=sum(val_loss_list) / len(val_loss_list),
                              global_step=val_step)

            evaluate(model, ctx, ground_truth, test_loader, n_pred,
                     dataset.mean, dataset.std, sw, val_step)
            val_step += 1

            if (epoch + 1) % args.save == 0:
                model.save_parameters('params/{}.params'.format(epoch + 1))
    finally:
        # Always flush and close the event file, even if training fails.
        sw.close()
class TrainerAgentMXNET:  # Probably needs refactoring
    """Main training loop"""

    def __init__(self, model, symbol, val_iter, train_config: TrainConfig, train_objects: TrainObjects, use_rtpt: bool,
                 augment=False):
        """
        Class for training the neural network.
        :param model: The model loaded with the MXNet Module functionalities.
        :param symbol: The architecture of the neural network.
        :param val_iter: Iterable object over the validation data.
        :param train_config: An instance of the TrainConfig data class.
        :param train_objects: An instance of the TrainObjects data class.
        :param use_rtpt: If True, an RTPT object will be created and modified within this class.
        :param augment: If True, every part id is listed twice in the ordering and
         parts may be passed through augment() while training.
        """
        # Too many instance attributes (29/7) - Too many arguments (24/5) - Too many local variables (25/15)
        # Too few public methods (1/2)
        self.tc = train_config
        self.to = train_objects
        if self.to.metrics is None:
            self.to.metrics = {}
        self._model = model
        self._symbol = symbol
        self._val_iter = val_iter
        self.x_train = self.yv_train = self.yp_train = None
        self._ctx = get_context(train_config.context, train_config.device_id)
        self._augment = augment

        # Always define the attribute so code paths that run with tensorboard
        # logging disabled never hit an AttributeError.
        self.sum_writer = None
        # define a summary writer that logs data and flushes to the file every 5 seconds
        if self.tc.log_metrics_to_tensorboard:
            self.sum_writer = SummaryWriter(logdir=self.tc.export_dir + "logs", flush_secs=5, verbose=False)
        # Define the optimizer
        if self.tc.optimizer_name == "adam":
            self.optimizer = mx.optimizer.Adam(
                learning_rate=0.001,
                beta1=0.9,
                beta2=0.999,
                epsilon=1e-8,
                lazy_update=True,
                rescale_grad=(1.0 / self.tc.batch_size))
        elif self.tc.optimizer_name == "nag":
            self.optimizer = mx.optimizer.NAG(
                momentum=self.to.momentum_schedule(0),
                wd=self.tc.wd,
                rescale_grad=(1.0 / self.tc.batch_size))
        else:
            raise Exception("%s is currently not supported as an optimizer." % self.tc.optimizer_name)
        self.ordering = list(range(self.tc.nb_parts))  # define a list which describes the order of the processed batches
        # if we augment the data set each part is loaded twice
        if self._augment:
            self.ordering += self.ordering
        # decides if the policy indices shall be selected directly from spatial feature maps without dense layer
        self.batch_end_callbacks = [self.batch_callback]

        # few variables which are internally used
        self.val_loss_best = self.val_p_acc_best = self.k_steps_best = \
            self.old_label = self.value_out = self.t_s = None
        self.patience_cnt = self.batch_proc_tmp = None
        # calculate how many log states will be processed
        self.k_steps_end = round(self.tc.total_it / self.tc.batch_steps)
        if self.k_steps_end == 0:
            self.k_steps_end = 1
        self.k_steps = self.cur_it = self.nb_spikes = self.old_val_loss = self.continue_training = self.t_s_steps = None
        self._train_iter = self.graph_exported = self.val_metric_values = self.val_loss = self.val_p_acc = None
        self.val_metric_values_best = None

        self.use_rtpt = use_rtpt

        if use_rtpt:
            # we use k-steps instead of epochs here
            self.rtpt = RTPT(name_initials=self.tc.name_initials, experiment_name='crazyara',
                             max_iterations=self.k_steps_end - self.tc.k_steps_initial)

    def _log_metrics(self, metric_values, global_step, prefix="train_"):
        """
        Logs a dictionary object of metric value to the console and to tensorboard
        if _log_metrics_to_tensorboard is set to true
        :param metric_values: Dictionary object storing the current metrics
        :param global_step: X-Position point of all metric entries
        :param prefix: Used for labelling the metrics
        :return:
        """
        for name in metric_values.keys():  # show the metric stats
            print(" - %s%s: %.4f" % (prefix, name, metric_values[name]), end="")
            # add the metrics to the tensorboard event file
            if self.tc.log_metrics_to_tensorboard:
                self.sum_writer.add_scalar(name, [prefix.replace("_", ""), metric_values[name]], global_step)

    def train(self, cur_it=None):  # Probably needs refactoring
        """
        Training model
        :param cur_it: Current iteration which is used for the learning rate and momentum schedule.
         If set to None it will be initialized
        :return: return_metrics_and_stop_training()
        """
        # Too many local variables (44/15) - Too many branches (18/12) - Too many statements (108/50)
        # set a custom seed for reproducibility
        if self.tc.seed is not None:
            random.seed(self.tc.seed)
        # define and initialize the variables which will be used
        self.t_s = time()
        # track on how many batches have been processed in this epoch
        self.patience_cnt = epoch = self.batch_proc_tmp = 0
        self.k_steps = self.tc.k_steps_initial  # counter for thousands steps

        if cur_it is None:
            self.cur_it = self.tc.k_steps_initial * 1000
        else:
            self.cur_it = cur_it
        self.nb_spikes = 0  # count the number of spikes that have been detected
        # initialize the loss to compare with, with a very high value
        self.old_val_loss = 9000
        self.graph_exported = False  # create a state variable to check if the net architecture has been reported yet
        self.continue_training = True
        self.optimizer.lr = self.to.lr_schedule(self.cur_it)
        if self.tc.optimizer_name == "nag":
            self.optimizer.momentum = self.to.momentum_schedule(self.cur_it)

        if not self.ordering:  # safety check to prevent eternal loop
            raise Exception("You must have at least one part file in your planes-dataset directory!")

        if self.use_rtpt:
            # Start the RTPT tracking
            self.rtpt.start()

        while self.continue_training:  # Too many nested blocks (7/5)
            # reshuffle the ordering of the training game batches (shuffle works in place)
            random.shuffle(self.ordering)

            epoch += 1
            logging.info("EPOCH %d", epoch)
            logging.info("=========================")
            self.t_s_steps = time()
            self._model.init_optimizer(optimizer=self.optimizer)

            if self._augment:
                # stores part ids that were not augmented yet
                parts_not_augmented = list(set(self.ordering))
                # stores part ids that were loaded before but not augmented
                parts_to_augment = []

            for part_id in tqdm_notebook(self.ordering):

                if MODE == MODE_XIANGQI:
                    _, self.x_train, self.yv_train, self.yp_train, _ = load_xiangqi_dataset(
                        dataset_type="train",
                        part_id=part_id,
                        normalize=self.tc.normalize,
                        verbose=False)
                    if self._augment:
                        # check whether the current part should be augmented
                        if part_id in parts_to_augment:
                            augment(self.x_train, self.yp_train)
                            logging.debug("Using augmented part with id {}".format(part_id))
                        elif part_id in parts_not_augmented:
                            # coin flip: augment on this visit, or defer it to the
                            # second visit of the same part id
                            if random.randint(0, 1):
                                augment(self.x_train, self.yp_train)
                                parts_not_augmented.remove(part_id)
                                logging.debug("Using augmented part with id {}".format(part_id))
                            else:
                                parts_to_augment.append(part_id)
                                logging.debug("Using unaugmented part with id {}".format(part_id))
                else:
                    # load one chunk of the dataset from memory
                    _, self.x_train, self.yv_train, self.yp_train, plys_to_end, _ = load_pgn_dataset(
                        dataset_type="train",
                        part_id=part_id,
                        normalize=self.tc.normalize,
                        verbose=False,
                        q_value_ratio=self.tc.q_value_ratio)

                # fill_up_batch if there aren't enough games
                if len(self.yv_train) < self.tc.batch_size:
                    logging.info("filling up batch with too few samples %d", len(self.yv_train))
                    self.x_train = fill_up_batch(self.x_train, self.tc.batch_size)
                    self.yv_train = fill_up_batch(self.yv_train, self.tc.batch_size)
                    self.yp_train = fill_up_batch(self.yp_train, self.tc.batch_size)
                    if MODE != MODE_XIANGQI:
                        if plys_to_end is not None:
                            plys_to_end = fill_up_batch(plys_to_end, self.tc.batch_size)

                if MODE != MODE_XIANGQI:
                    if self.tc.discount != 1:
                        self.yv_train *= self.tc.discount**plys_to_end
                self.yp_train = prepare_policy(self.yp_train, self.tc.select_policy_from_plane,
                                               self.tc.sparse_policy_label, self.tc.is_policy_from_plane_data)

                # NOTE(review): plys_to_end is only bound in the non-xiangqi branch
                # above; this path assumes use_plys_to_end is off for xiangqi - confirm.
                if self.tc.use_wdl and self.tc.use_plys_to_end:
                    self._train_iter = mx.io.NDArrayIter(
                        {'data': self.x_train},
                        {
                            'value_label': self.yv_train,
                            'policy_label': self.yp_train,
                            'wdl_label': value_to_wdl_label(self.yv_train),
                            'plys_to_end_label': prepare_plys_label(plys_to_end)
                        },
                        self.tc.batch_size,
                        shuffle=True)
                else:
                    self._train_iter = mx.io.NDArrayIter(
                        {'data': self.x_train},
                        {
                            'value_label': self.yv_train,
                            'policy_label': self.yp_train
                        },
                        self.tc.batch_size,
                        shuffle=True)

                # avoid memory leaks by adding synchronization
                mx.nd.waitall()

                reset_metrics(self.to.metrics)
                for batch in self._train_iter:
                    self._model.forward(batch, is_train=True)  # compute predictions
                    for metric in self.to.metrics:  # update the metrics
                        self._model.update_metric(metric, batch.label)

                    self._model.backward()
                    # compute gradients
                    self._model.update()  # update parameters
                    self.batch_callback()

                    if not self.continue_training:
                        logging.info(
                            'Elapsed time for training(hh:mm:ss): ' +
                            str(datetime.timedelta(seconds=round(time() - self.t_s))))

                        return return_metrics_and_stop_training(self.k_steps, self.val_metric_values,
                                                                self.k_steps_best, self.val_metric_values_best)

                # add the graph representation of the network to the tensorboard log file
                if not self.graph_exported and self.tc.log_metrics_to_tensorboard:
                    # self.sum_writer.add_graph(self._symbol)
                    self.graph_exported = True

    def _fill_train_metrics(self):
        """
        Fills in the training metrics
        :return:
        """
        self.train_metric_values = {}
        for metric in self.to.metrics:
            name, value = metric.get()
            self.train_metric_values[name] = value

        # combined loss: fixed 0.01 / 0.99 weighting of value and policy loss
        self.train_metric_values["loss"] = 0.01 * self.train_metric_values["value_loss"] + \
            0.99 * self.train_metric_values["policy_loss"]

    def recompute_eval(self):
        """
        Recomputes the score on the validation data
        :return:
        """
        ms_step = ((time() - self.t_s_steps) / self.tc.batch_steps) * 1000
        logging.info("Step %dK/%dK - %dms/step", self.k_steps, self.k_steps_end, ms_step)
        logging.info("-------------------------")
        logging.debug("Iteration %d/%d", self.cur_it, self.tc.total_it)
        if self.tc.optimizer_name == "nag":
            logging.debug("lr: %.7f - momentum: %.7f", self.optimizer.lr, self.optimizer.momentum)
        else:
            logging.debug("lr: %.7f - momentum: -", self.optimizer.lr)

        # the metric values have already been computed during training for the train set
        self._fill_train_metrics()

        self.val_metric_values = evaluate_metrics(
            self.to.metrics,
            self._val_iter,
            self._model,
        )
        if self.use_rtpt:
            # update process title according to loss
            self.rtpt.step(subtitle=f"loss={self.val_metric_values['loss']:2.2f}")
        if self.tc.use_spike_recovery and (
                self.old_val_loss * self.tc.spike_thresh < self.val_metric_values["loss"]
                or np.isnan(self.val_metric_values["loss"])):  # check for spikes
            self.handle_spike()
        else:
            self.update_eval()

    def handle_spike(self):
        """
        Handles the occurrence of a spike during training, in the case validation loss increased dramatically.
        :return: self._return_metrics_and_stop_training()
        """
        self.nb_spikes += 1
        logging.warning(
            "Spike %d/%d occurred - val_loss: %.3f",
            self.nb_spikes,
            self.tc.max_spikes,
            self.val_metric_values["loss"],
        )
        if self.nb_spikes >= self.tc.max_spikes:
            # finally stop training because the number of lr drops has been achieved
            logging.debug("The maximum number of spikes has been reached. Stop training.")
            self.continue_training = False

            if self.tc.log_metrics_to_tensorboard:
                self.sum_writer.close()
            return return_metrics_and_stop_training(self.k_steps, self.val_metric_values, self.k_steps_best,
                                                    self.val_metric_values_best)

        logging.debug("Recover to latest checkpoint")
        # Load the best model once again
        prefix = self.tc.export_dir + "weights/model-%.5f-%.3f" % (self.val_loss_best, self.val_p_acc_best)
        logging.debug("load current best model:%s", prefix)
        # NOTE(review): the return value of load() is discarded; if this is
        # mx.mod.Module.load (which builds a new module), the weights of
        # self._model are not actually replaced - confirm.
        self._model.load(prefix, epoch=self.k_steps_best)
        self.k_steps = self.k_steps_best
        logging.debug("k_step is back at %d", self.k_steps_best)
        # print the elapsed time
        t_delta = time() - self.t_s_steps
        print(" - %.ds" % t_delta)
        self.t_s_steps = time()

    def update_eval(self):
        """
        Updates the evaluation metrics, exports a checkpoint when the validation loss
        improved and stops training once the configured number of iterations is reached.
        :return:
        """
        # update the val_loss_value to compare with using spike recovery
        self.old_val_loss = self.val_metric_values["loss"]
        # log the metric values to tensorboard
        self._log_metrics(self.train_metric_values, global_step=self.k_steps, prefix="train_")
        self._log_metrics(self.val_metric_values, global_step=self.k_steps, prefix="val_")

        # check if a new checkpoint shall be created
        if self.val_loss_best is None or self.val_metric_values["loss"] < self.val_loss_best:
            # update val_loss_best
            self.val_loss_best = self.val_metric_values["loss"]
            self.val_p_acc_best = self.val_metric_values["policy_acc"]
            self.val_metric_values_best = self.val_metric_values
            self.k_steps_best = self.k_steps

            if self.tc.export_weights:
                prefix = self.tc.export_dir + "weights/model-%.5f-%.3f" % (self.val_loss_best, self.val_p_acc_best)
                # the export function saves both the architecture and the weights
                print()
                self._model.save_checkpoint(prefix, epoch=self.k_steps_best)

            self.patience_cnt = 0  # reset the patience counter
        # print the elapsed time
        t_delta = time() - self.t_s_steps
        print(" - %.ds" % t_delta)
        self.t_s_steps = time()

        # The summary writer only exists when tensorboard logging is enabled, so
        # every write to it must sit behind the same flag.
        if self.tc.log_metrics_to_tensorboard:
            # log the samples per second metric to tensorboard
            self.sum_writer.add_scalar(
                tag="samples_per_second",
                value={"hybrid_sync": self.tc.batch_size * self.tc.batch_steps / t_delta},
                global_step=self.k_steps,
            )

            # log the current learning rate
            self.sum_writer.add_scalar(tag="lr", value=self.to.lr_schedule(self.cur_it), global_step=self.k_steps)
            if self.tc.optimizer_name == "nag":
                # log the current momentum value
                self.sum_writer.add_scalar(
                    tag="momentum", value=self.to.momentum_schedule(self.cur_it), global_step=self.k_steps)

        if self.cur_it >= self.tc.total_it:
            self.continue_training = False

            self.val_loss = self.val_metric_values["loss"]
            self.val_p_acc = self.val_metric_values["policy_acc"]
            # finally stop training because the number of lr drops has been achieved
            logging.debug("The number of given iterations has been reached")

            if self.tc.log_metrics_to_tensorboard:
                self.sum_writer.close()

    def batch_callback(self):
        """
        Callback which is executed after every batch to update the momentum and learning rate
        :return:
        """
        # update the learning rate and momentum
        self.optimizer.lr = self.to.lr_schedule(self.cur_it)
        if self.tc.optimizer_name == "nag":
            self.optimizer.momentum = self.to.momentum_schedule(self.cur_it)

        self.cur_it += 1
        self.batch_proc_tmp += 1

        if self.batch_proc_tmp >= self.tc.batch_steps:  # show metrics every thousands steps
            self.batch_proc_tmp = self.batch_proc_tmp - self.tc.batch_steps
            # update the counters
            self.k_steps += 1
            self.patience_cnt += 1
            self.recompute_eval()
            self.custom_metric_eval()

    def custom_metric_eval(self):
        """
        Evaluates the model based on the validation set of different variants
        """

        if self.to.variant_metrics is None:
            return

        for part_id, variant_name in enumerate(self.to.variant_metrics):
            # load one chunk of the dataset from memory
            _, x_val, yv_val, yp_val, _, _ = load_pgn_dataset(
                dataset_type="val",
                part_id=part_id,
                normalize=self.tc.normalize,
                verbose=False,
                q_value_ratio=self.tc.q_value_ratio)

            if self.tc.select_policy_from_plane:
                val_iter = mx.io.NDArrayIter(
                    {'data': x_val},
                    {
                        'value_label': yv_val,
                        'policy_label': np.array(FLAT_PLANE_IDX)[yp_val.argmax(axis=1)]
                    },
                    self.tc.batch_size)
            else:
                val_iter = mx.io.NDArrayIter(
                    {'data': x_val},
                    {
                        'value_label': yv_val,
                        'policy_label': yp_val.argmax(axis=1)
                    },
                    self.tc.batch_size)

            results = self._model.score(val_iter, self.to.metrics)
            prefix = "val_"

            for entry in results:
                name = variant_name + "_" + entry[0]
                value = entry[1]
                print(" - %s%s: %.4f" % (prefix, name, value), end="")
                # add the metrics to the tensorboard event file
                if self.tc.log_metrics_to_tensorboard:
                    self.sum_writer.add_scalar(name, [prefix.replace("_", ""), value], self.k_steps)
        # NOTE(review): placement of this trailing newline print (after all variants)
        # was reconstructed from whitespace-collapsed source - confirm.
        print()
def run(mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225],
        graphviz=True,
        epoch=100,
        input_size=[512, 512],
        batch_size=16,
        batch_log=100,
        batch_interval=10,
        subdivision=4,
        train_dataset_path="Dataset/train",
        valid_dataset_path="Dataset/valid",
        multiscale=True,
        factor_scale=[8, 5],
        data_augmentation=True,
        num_workers=4,
        optimizer="ADAM",
        lambda_off=1,
        lambda_size=0.1,
        save_period=5,
        load_period=10,
        learning_rate=0.001,
        decay_lr=0.999,
        decay_step=10,
        GPU_COUNT=0,
        base=18,
        pretrained_base=True,
        pretrained_path="modelparam",
        AMP=True,
        valid_size=8,
        eval_period=5,
        tensorboard=True,
        valid_graph_path="valid_Graph",
        using_mlflow=True,
        topk=100,
        plot_class_thresh=0.5):
    """Train a CenterNet detector, periodically validating, logging and exporting.

    The function builds the data loaders, restores or creates the network,
    creates the trainer and then runs epochs ``start_epoch + 1 .. epoch``.
    Every ``eval_period`` epochs (when the validation folder is not empty) it
    computes validation losses and VOC-2007 AP, and optionally writes images,
    scalars and histograms to the mxboard ``SummaryWriter``.  Every
    ``save_period`` epochs it exports the symbol/params files.

    Notes:
        * AMP does not support every operator (modulated convolution is not
          supported); AMP is forced off when ``GPU_COUNT == 0``.
        * The list defaults (``mean``, ``std``, ``input_size``,
          ``factor_scale``) are never mutated here, so sharing them between
          calls is harmless.
        * Configuration errors terminate the process through ``exit(0)``,
          exactly as before.
        * The summary writer is now closed once training finishes so queued
          events are flushed to disk.
    """
    if GPU_COUNT == 0:
        ctx = mx.cpu(0)
        AMP = False
    elif GPU_COUNT == 1:
        ctx = mx.gpu(0)
    else:
        ctx = [mx.gpu(i) for i in range(GPU_COUNT)]

    # Uniform list view of the context(s); split_and_load always needs a list.
    ctx_list = ctx if isinstance(ctx, (list, tuple)) else [ctx]

    # Report the operating system.
    logging.info(f"{platform.system()} OS")

    if GPU_COUNT == 0:
        logging.info(f'Running on {ctx}')
    else:
        for i, c in enumerate(ctx_list):
            free_memory, total_memory = mx.context.gpu_memory_info(i)
            free_memory = round(free_memory / (1024 * 1024 * 1024), 2)
            total_memory = round(total_memory / (1024 * 1024 * 1024), 2)
            logging.info(
                f'Running on {c} / free memory : {free_memory}GB / total memory {total_memory}GB'
            )

    if GPU_COUNT > 0 and batch_size < GPU_COUNT:
        logging.info("batch size must be greater than gpu number")
        exit(0)

    if AMP:
        amp.init()

    if multiscale:
        logging.info("Using MultiScale")

    if data_augmentation:
        logging.info("Using Data Augmentation")

    logging.info("training Center Detector")
    input_shape = (1, 3) + tuple(input_size)

    scale_factor = 4  # fixed
    logging.info(f"scale factor {scale_factor}")

    try:
        train_dataloader, train_dataset = traindataloader(
            multiscale=multiscale,
            factor_scale=factor_scale,
            augmentation=data_augmentation,
            path=train_dataset_path,
            input_size=input_size,
            batch_size=batch_size,
            batch_interval=batch_interval,
            num_workers=num_workers,
            shuffle=True,
            mean=mean,
            std=std,
            scale_factor=scale_factor,
            make_target=True)
        valid_dataloader, valid_dataset = validdataloader(
            path=valid_dataset_path,
            input_size=input_size,
            batch_size=valid_size,
            num_workers=num_workers,
            shuffle=True,
            mean=mean,
            std=std,
            scale_factor=scale_factor,
            make_target=True)
    except Exception as E:
        logging.info(E)
        exit(0)

    train_update_number_per_epoch = len(train_dataloader)
    if train_update_number_per_epoch < 1:
        logging.warning("train batch size가 데이터 수보다 큼")
        exit(0)

    valid_list = glob.glob(os.path.join(valid_dataset_path, "*"))
    if valid_list:
        valid_update_number_per_epoch = len(valid_dataloader)
        if valid_update_number_per_epoch < 1:
            logging.warning("valid batch size가 데이터 수보다 큼")
            exit(0)

    num_classes = train_dataset.num_class  # number of classes
    name_classes = train_dataset.classes

    optimizer = optimizer.upper()
    if pretrained_base:
        model = f"{input_size[0]}_{input_size[1]}_{optimizer}_PCENTER_RES{base}"
    else:
        model = f"{input_size[0]}_{input_size[1]}_{optimizer}_CENTER_RES{base}"

    weight_path = f"weights/{model}"
    sym_path = os.path.join(weight_path, f'{model}-symbol.json')
    param_path = os.path.join(weight_path, f'{model}-{load_period:04d}.params')

    if os.path.exists(param_path) and os.path.exists(sym_path):
        start_epoch = load_period
        logging.info(f"loading {os.path.basename(param_path)} weights\n")
        net = gluon.SymbolBlock.imports(sym_path, ['data'], param_path, ctx=ctx)
    else:
        start_epoch = 0
        net = CenterNet(base=base,
                        heads=OrderedDict([('heatmap', {
                            'num_output': num_classes,
                            'bias': -2.19
                        }), ('offset', {
                            'num_output': 2
                        }), ('wh', {
                            'num_output': 2
                        })]),
                        head_conv_channel=64,
                        pretrained=pretrained_base,
                        root=pretrained_path,
                        use_dcnv2=False,
                        ctx=ctx)
        net.summary(mx.nd.ones(shape=input_shape, ctx=ctx_list[0]))

    # hybridize() arguments:
    #   active       - whether to turn hybrid on or off.
    #   static_alloc - statically allocate memory to improve speed
    #                  (memory usage may increase).
    #   static_shape - optimize for invariant input shapes between iterations;
    #                  requires static_alloc=True.  Changing input shapes is
    #                  still allowed but slower, hence disabled for multiscale.
    if multiscale:
        net.hybridize(active=True, static_alloc=True, static_shape=False)
    else:
        net.hybridize(active=True, static_alloc=True, static_shape=True)

    if start_epoch + 1 >= epoch + 1:
        logging.info("this model has already been optimized")
        exit(0)

    if tensorboard:
        summary = SummaryWriter(logdir=os.path.join("mxboard", model),
                                max_queue=10,
                                flush_secs=10,
                                verbose=False)
        # One forward pass so the hybridized graph exists before add_graph.
        net.forward(mx.nd.ones(shape=input_shape, ctx=ctx_list[0]))
        summary.add_graph(net)

    if graphviz:
        gluoncv.utils.viz.plot_network(net,
                                       shape=input_shape,
                                       save_prefix=model)

    # optimizer
    unit = max(1, len(train_dataset) // batch_size)
    step = unit * decay_step
    lr_sch = mx.lr_scheduler.FactorScheduler(step=step,
                                             factor=decay_lr,
                                             stop_factor_lr=1e-12,
                                             base_lr=learning_rate)

    # Gradients are accumulated over the `subdivision` sub-batches.
    for p in net.collect_params().values():
        if p.grad_req != "null":
            p.grad_req = 'add'

    # Optimizer-specific hyper-parameters; the common ones are added below.
    extra_optimizer_params = {
        "ADAM": {"beta1": 0.9, "beta2": 0.999},
        "RMSPROP": {"gamma1": 0.9, "gamma2": 0.999},
        "SGD": {"wd": 0.0001, "momentum": 0.9},
    }
    if optimizer not in extra_optimizer_params:
        logging.error("optimizer not selected")
        exit(0)

    optimizer_params = {
        "learning_rate": learning_rate,
        "lr_scheduler": lr_sch,
        'multi_precision': False
    }
    optimizer_params.update(extra_optimizer_params[optimizer])

    # update_on_kvstore: whether to perform parameter updates on kvstore.
    # None lets the trainer choose; AMP dynamic loss scaling needs False.
    trainer = gluon.Trainer(net.collect_params(),
                            optimizer,
                            optimizer_params=optimizer_params,
                            update_on_kvstore=False if AMP else None)
    if AMP:
        amp.init_trainer(trainer)

    heatmapfocalloss = HeatmapFocalLoss(from_sigmoid=True, alpha=2, beta=4)
    normedl1loss = NormedL1Loss()
    prediction = Prediction(batch_size=valid_size,
                            topk=topk,
                            scale=scale_factor)
    precision_recall = Voc_2007_AP(iou_thresh=0.5, class_names=name_classes)

    start_time = time.time()
    for i in tqdm(range(start_epoch + 1, epoch + 1, 1),
                  initial=start_epoch + 1,
                  total=epoch):

        heatmap_loss_sum = 0
        offset_loss_sum = 0
        wh_loss_sum = 0
        time_stamp = time.time()

        # Generating the targets inside train_dataloader makes training
        # considerably faster.
        for batch_count, (image, _, heatmap, offset_target, wh_target,
                          mask_target, _) in enumerate(train_dataloader,
                                                       start=1):
            td_batch_size = image.shape[0]

            image_split = mx.nd.split(data=image,
                                      num_outputs=subdivision,
                                      axis=0)
            heatmap_split = mx.nd.split(data=heatmap,
                                        num_outputs=subdivision,
                                        axis=0)
            offset_target_split = mx.nd.split(data=offset_target,
                                              num_outputs=subdivision,
                                              axis=0)
            wh_target_split = mx.nd.split(data=wh_target,
                                          num_outputs=subdivision,
                                          axis=0)
            mask_target_split = mx.nd.split(data=mask_target,
                                            num_outputs=subdivision,
                                            axis=0)

            # With a single output mx.nd.split returns an NDArray, not a list.
            if subdivision == 1:
                image_split = [image_split]
                heatmap_split = [heatmap_split]
                offset_target_split = [offset_target_split]
                wh_target_split = [wh_target_split]
                mask_target_split = [mask_target_split]

            # autograd reference:
            # https://mxnet.apache.org/api/python/docs/tutorials/getting-started/crash-course/3-autograd.html
            with autograd.record(train_mode=True):

                heatmap_all_losses = []
                offset_all_losses = []
                wh_all_losses = []

                for image_part, heatmap_part, offset_target_part, wh_target_part, mask_target_part in zip(
                        image_split, heatmap_split, offset_target_split,
                        wh_target_split, mask_target_split):

                    image_part = gluon.utils.split_and_load(
                        image_part, ctx_list, even_split=False)
                    heatmap_part = gluon.utils.split_and_load(
                        heatmap_part, ctx_list, even_split=False)
                    offset_target_part = gluon.utils.split_and_load(
                        offset_target_part, ctx_list, even_split=False)
                    wh_target_part = gluon.utils.split_and_load(
                        wh_target_part, ctx_list, even_split=False)
                    mask_target_part = gluon.utils.split_and_load(
                        mask_target_part, ctx_list, even_split=False)

                    # prediction, target space for Data Parallelism
                    heatmap_losses = []
                    offset_losses = []
                    wh_losses = []
                    total_loss = []

                    # Written for N GPUs (Data Parallelism).
                    for img, heatmap_target, offset_target, wh_target, mask_target in zip(
                            image_part, heatmap_part, offset_target_part,
                            wh_target_part, mask_target_part):
                        heatmap_pred, offset_pred, wh_pred = net(img)
                        heatmap_loss = heatmapfocalloss(
                            heatmap_pred, heatmap_target)
                        offset_loss = normedl1loss(offset_pred, offset_target,
                                                   mask_target) * lambda_off
                        wh_loss = normedl1loss(wh_pred, wh_target,
                                               mask_target) * lambda_size

                        heatmap_losses.append(heatmap_loss.asscalar())
                        offset_losses.append(offset_loss.asscalar())
                        wh_losses.append(wh_loss.asscalar())

                        total_loss.append(heatmap_loss + offset_loss + wh_loss)

                    if AMP:
                        with amp.scale_loss(total_loss,
                                            trainer) as scaled_loss:
                            autograd.backward(scaled_loss)
                    else:
                        autograd.backward(total_loss)

                    heatmap_all_losses.append(sum(heatmap_losses))
                    offset_all_losses.append(sum(offset_losses))
                    wh_all_losses.append(sum(wh_losses))

            trainer.step(batch_size=td_batch_size, ignore_stale_grad=False)

            # Clear the accumulated gradients (grad_req is 'add').
            for p in net.collect_params().values():
                p.zero_grad()

            heatmap_loss_sum += sum(heatmap_all_losses) / td_batch_size
            offset_loss_sum += sum(offset_all_losses) / td_batch_size
            wh_loss_sum += sum(wh_all_losses) / td_batch_size

            if batch_count % batch_log == 0:
                logging.info(
                    f'[Epoch {i}][Batch {batch_count}/{train_update_number_per_epoch}],'
                    f'[Speed {td_batch_size / (time.time() - time_stamp):.3f} samples/sec],'
                    f'[Lr = {trainer.learning_rate}]'
                    f'[heatmap loss = {sum(heatmap_all_losses) / td_batch_size:.3f}]'
                    f'[offset loss = {sum(offset_all_losses) / td_batch_size:.3f}]'
                    f'[wh loss = {sum(wh_all_losses) / td_batch_size:.3f}]')
            time_stamp = time.time()

        train_heatmap_loss_mean = np.divide(heatmap_loss_sum,
                                            train_update_number_per_epoch)
        train_offset_loss_mean = np.divide(offset_loss_sum,
                                           train_update_number_per_epoch)
        train_wh_loss_mean = np.divide(wh_loss_sum,
                                       train_update_number_per_epoch)
        train_total_loss_mean = train_heatmap_loss_mean + train_offset_loss_mean + train_wh_loss_mean

        logging.info(
            f"train heatmap loss : {train_heatmap_loss_mean} / train offset loss : {train_offset_loss_mean} / train wh loss : {train_wh_loss_mean} / train total loss : {train_total_loss_mean}"
        )

        if i % eval_period == 0 and valid_list:

            heatmap_loss_sum = 0
            offset_loss_sum = 0
            wh_loss_sum = 0

            # Compute the validation losses.
            for image, label, heatmap_all, offset_target_all, wh_target_all, mask_target_all, _ in valid_dataloader:
                vd_batch_size = image.shape[0]

                image = gluon.utils.split_and_load(image,
                                                   ctx_list,
                                                   even_split=False)
                label = gluon.utils.split_and_load(label,
                                                   ctx_list,
                                                   even_split=False)
                heatmap_split = gluon.utils.split_and_load(heatmap_all,
                                                           ctx_list,
                                                           even_split=False)
                offset_target_split = gluon.utils.split_and_load(
                    offset_target_all, ctx_list, even_split=False)
                wh_target_split = gluon.utils.split_and_load(
                    wh_target_all, ctx_list, even_split=False)
                mask_target_split = gluon.utils.split_and_load(
                    mask_target_all, ctx_list, even_split=False)

                # prediction, target space for Data Parallelism
                heatmap_losses = []
                offset_losses = []
                wh_losses = []

                # Written for N GPUs (Data Parallelism).
                for img, lb, heatmap_target, offset_target, wh_target, mask_target in zip(
                        image, label, heatmap_split, offset_target_split,
                        wh_target_split, mask_target_split):
                    gt_box = lb[:, :, :4]
                    gt_id = lb[:, :, 4:5]
                    heatmap_pred, offset_pred, wh_pred = net(img)
                    pred_id, score, bbox = prediction(heatmap_pred,
                                                      offset_pred, wh_pred)
                    precision_recall.update(pred_bboxes=bbox,
                                            pred_labels=pred_id,
                                            pred_scores=score,
                                            gt_boxes=gt_box * scale_factor,
                                            gt_labels=gt_id)

                    heatmap_loss = heatmapfocalloss(heatmap_pred,
                                                    heatmap_target)
                    offset_loss = normedl1loss(offset_pred, offset_target,
                                               mask_target) * lambda_off
                    wh_loss = normedl1loss(wh_pred, wh_target,
                                           mask_target) * lambda_size

                    heatmap_losses.append(heatmap_loss.asscalar())
                    offset_losses.append(offset_loss.asscalar())
                    wh_losses.append(wh_loss.asscalar())

                heatmap_loss_sum += sum(heatmap_losses) / vd_batch_size
                offset_loss_sum += sum(offset_losses) / vd_batch_size
                wh_loss_sum += sum(wh_losses) / vd_batch_size

            valid_heatmap_loss_mean = np.divide(heatmap_loss_sum,
                                                valid_update_number_per_epoch)
            valid_offset_loss_mean = np.divide(offset_loss_sum,
                                               valid_update_number_per_epoch)
            valid_wh_loss_mean = np.divide(wh_loss_sum,
                                           valid_update_number_per_epoch)
            valid_total_loss_mean = valid_heatmap_loss_mean + valid_offset_loss_mean + valid_wh_loss_mean

            logging.info(
                f"valid heatmap loss : {valid_heatmap_loss_mean} / valid offset loss : {valid_offset_loss_mean} / valid wh loss : {valid_wh_loss_mean} / valid total loss : {valid_total_loss_mean}"
            )

            AP_appender = []
            round_position = 2
            class_name, precision, recall, true_positive, false_positive, threshold = precision_recall.get_PR_list(
            )
            for j, c, p, r in zip(range(len(recall)), class_name, precision,
                                  recall):
                name, AP = precision_recall.get_AP(c, p, r)
                logging.info(
                    f"class {j}'s {name} AP : {round(AP * 100, round_position)}%"
                )
                AP_appender.append(AP)
            mAP_result = np.mean(AP_appender)

            logging.info(f"mAP : {round(mAP_result * 100, round_position)}%")
            precision_recall.get_PR_curve(name=class_name,
                                          precision=precision,
                                          recall=recall,
                                          threshold=threshold,
                                          AP=AP_appender,
                                          mAP=mAP_result,
                                          folder_name=valid_graph_path,
                                          epoch=i)
            precision_recall.reset()

            if tensorboard:
                # Written for N GPUs (Data Parallelism).
                dataloader_iter = iter(valid_dataloader)
                image, label, _, _, _, _, _ = next(dataloader_iter)

                image = gluon.utils.split_and_load(image,
                                                   ctx_list,
                                                   even_split=False)
                label = gluon.utils.split_and_load(label,
                                                   ctx_list,
                                                   even_split=False)

                ground_truth_colors = {
                    k: (0, 0, 1) for k in range(num_classes)
                }

                batch_image = []
                heatmap_image = []
                for img, lb in zip(image, label):
                    gt_boxes = lb[:, :, :4]
                    gt_ids = lb[:, :, 4:5]
                    heatmap_pred, offset_pred, wh_pred = net(img)
                    ids, scores, bboxes = prediction(heatmap_pred,
                                                     offset_pred, wh_pred)

                    for ig, gt_id, gt_box, heatmap, pred_id, score, bbox in zip(
                            img, gt_ids, gt_boxes, heatmap_pred, ids, scores,
                            bboxes):
                        # Undo the input normalisation.
                        ig = ig.transpose((1, 2, 0)) * mx.nd.array(
                            std, ctx=ig.context) + mx.nd.array(
                                mean, ctx=ig.context)
                        ig = (ig * 255).clip(0, 255)

                        # Draw the heatmap.
                        heatmap = mx.nd.multiply(heatmap,
                                                 255.0)  # to the 0 ~ 255 range
                        heatmap = mx.nd.max(
                            heatmap, axis=0,
                            keepdims=True)  # max over the channel axis
                        heatmap = mx.nd.transpose(
                            heatmap,
                            axes=(1, 2, 0))  # (height, width, channel=1)
                        heatmap = mx.nd.repeat(
                            heatmap, repeats=3,
                            axis=-1)  # (height, width, channel=3)
                        heatmap = heatmap.asnumpy(
                        )  # mxnet.ndarray -> numpy.ndarray
                        heatmap = cv2.resize(
                            heatmap,
                            dsize=(input_size[1],
                                   input_size[0]))  # back to the input size
                        heatmap = heatmap.astype("uint8")  # float32 -> uint8
                        heatmap = cv2.applyColorMap(heatmap, cv2.COLORMAP_JET)
                        heatmap[:, :, (0, 1, 2)] = heatmap[:, :, (
                            2, 1, 0)]  # BGR -> RGB
                        heatmap = np.transpose(
                            heatmap,
                            axes=(2, 0, 1))  # (channel=3, height, width)

                        # Draw the ground-truth boxes.
                        ground_truth = plot_bbox(
                            ig,
                            gt_box * scale_factor,
                            scores=None,
                            labels=gt_id,
                            thresh=None,
                            reverse_rgb=True,
                            class_names=valid_dataset.classes,
                            absolute_coordinates=True,
                            colors=ground_truth_colors)
                        # Draw the predicted boxes.
                        prediction_box = plot_bbox(
                            ground_truth,
                            bbox,
                            scores=score,
                            labels=pred_id,
                            thresh=plot_class_thresh,
                            reverse_rgb=False,
                            class_names=valid_dataset.classes,
                            absolute_coordinates=True)

                        # For Tensorboard: BGR -> RGB and
                        # (height, width, channel) -> (channel, height, width).
                        prediction_box = cv2.cvtColor(prediction_box,
                                                      cv2.COLOR_BGR2RGB)
                        prediction_box = np.transpose(prediction_box,
                                                      axes=(2, 0, 1))
                        batch_image.append(
                            prediction_box
                        )  # (batch, channel, height, width)
                        heatmap_image.append(heatmap)

                all_image = np.concatenate(
                    [np.array(batch_image),
                     np.array(heatmap_image)], axis=-1)
                summary.add_image(tag="valid_result",
                                  image=all_image,
                                  global_step=i)
                summary.add_scalar(tag="heatmap_loss",
                                   value={
                                       "train_heatmap_loss_mean":
                                       train_heatmap_loss_mean,
                                       "valid_heatmap_loss_mean":
                                       valid_heatmap_loss_mean
                                   },
                                   global_step=i)
                summary.add_scalar(tag="offset_loss",
                                   value={
                                       "train_offset_loss_mean":
                                       train_offset_loss_mean,
                                       "valid_offset_loss_mean":
                                       valid_offset_loss_mean
                                   },
                                   global_step=i)
                summary.add_scalar(tag="wh_loss",
                                   value={
                                       "train_wh_loss_mean":
                                       train_wh_loss_mean,
                                       "valid_wh_loss_mean":
                                       valid_wh_loss_mean
                                   },
                                   global_step=i)
                summary.add_scalar(tag="total_loss",
                                   value={
                                       "train_total_loss":
                                       train_total_loss_mean,
                                       "valid_total_loss":
                                       valid_total_loss_mean
                                   },
                                   global_step=i)

                params = net.collect_params().values()
                if GPU_COUNT > 1:
                    for c in ctx:
                        for p in params:
                            summary.add_histogram(tag=p.name,
                                                  values=p.data(ctx=c),
                                                  global_step=i,
                                                  bins='default')
                else:
                    for p in params:
                        summary.add_histogram(tag=p.name,
                                              values=p.data(),
                                              global_step=i,
                                              bins='default')

        if i % save_period == 0:

            if not os.path.exists(weight_path):
                os.makedirs(weight_path)

            # Hybrid models can be serialized as JSON files using export().
            # The exported HybridBlock can be loaded by SymbolBlock.imports,
            # mxnet.mod.Module or the C++ interface.  With a single input it
            # is named "data"; with several inputs "data0", "data1", ...
            if GPU_COUNT >= 1:
                context = mx.gpu(0)
            else:
                context = mx.cpu(0)

            postnet = PostNet(net=net, auxnet=prediction)  # creates a new object

            try:
                net.export(os.path.join(weight_path, f"{model}"),
                           epoch=i,
                           remove_amp_cast=True)
                net.save_parameters(os.path.join(weight_path,
                                                 f"{i}.params"))  # for ONNX export

                # Bundles network inference, decoder and NMS - convenient
                # for the MXNet C++ API.
                export_block_for_cplusplus(
                    path=os.path.join(weight_path, f"{model}_prepost"),
                    block=postnet,
                    data_shape=tuple(input_size) + tuple((3, )),
                    epoch=i,
                    preprocess=
                    True,  # C++ inference can feed the raw OpenCV image
                    layout='HWC',
                    ctx=context,
                    remove_amp_cast=True)
            except Exception as E:
                logging.error(f"json, param model export 예외 발생 : {E}")
            else:
                logging.info("json, param model export 성공")
            # Always put the parameters back on the training context(s), even
            # if the export failed, so the next epoch runs where it expects.
            net.collect_params().reset_ctx(ctx)

    # Flush and release the event file now that training is over.
    if tensorboard:
        summary.close()

    end_time = time.time()
    learning_time = end_time - start_time
    logging.info(f"learning time : 약, {learning_time / 3600:0.2f}H")
    logging.info("optimization completed")

    if using_mlflow:
        ml.log_metric("learning time", round(learning_time / 3600, 2))
def train_net(net, train_iter, valid_iter, batch_size, trainer, ctx,
              num_epochs, lr_sch, save_prefix):
    """Train ``net`` as a classifier and checkpoint the best validation model.

    Args:
        net: Gluon block to train.
        train_iter: iterable of training batches; either ``mx.io.DataBatch``
            objects or ``(data, label)`` pairs.  ``MXDataIter`` instances are
            reset at the start of every epoch.
        valid_iter: validation data passed to ``test_net``.
        batch_size: value handed to ``trainer.step`` and used to normalise
            the reported epoch loss.
        trainer: ``gluon.Trainer`` driving the parameter updates.
        ctx: context the batches are copied to.
        num_epochs: number of epochs to run.
        lr_sch: callable mapping the epoch index to a learning rate.
        save_prefix: prefix of the exported / saved parameter files.

    When the module-level ``use_mxboard`` flag is set, the loss, accuracy and
    gradient histograms are written to ``logs`` every 100 iterations, and the
    test metrics every time the model is evaluated.
    """
    logger.info("===================START TRAINING====================")
    if use_mxboard:
        sw = SummaryWriter(logdir='logs', flush_secs=5)
    cls_loss = gluon.loss.SoftmaxCrossEntropyLoss()
    cls_acc = mx.metric.Accuracy(name="train acc")
    top_acc = 0
    iter_num = 0
    for epoch in range(num_epochs):
        epoch_losses = []
        t0 = time.time()
        if isinstance(train_iter, mx.io.MXDataIter):
            train_iter.reset()
        trainer.set_learning_rate(lr_sch(epoch))
        for batch in train_iter:
            iter_num += 1
            if isinstance(batch, mx.io.DataBatch):
                X, Y = batch.data[0], batch.label[0]
            else:
                X, Y = batch
            X = X.as_in_context(ctx)
            Y = Y.as_in_context(ctx)
            with autograd.record(True):
                out = net(X)
                loss = cls_loss(out, Y)
            loss.backward()
            epoch_losses.append(loss.sum().asscalar())
            trainer.step(batch_size)
            cls_acc.update(Y, out)
            nd.waitall()
            # All board logging is guarded by use_mxboard because `sw` only
            # exists in that case.
            if use_mxboard and iter_num % 100 == 0:
                sw.add_scalar(tag='train_loss',
                              value=loss.mean().asscalar(),
                              global_step=iter_num)
                sw.add_scalar(tag='train_acc',
                              value=cls_acc.get(),
                              global_step=iter_num)
                # Collect the parameters once instead of once per parameter.
                for name, param in net.collect_params().items():
                    if param.grad_req != "null":
                        sw.add_histogram(tag=name,
                                         values=param.grad(),
                                         global_step=iter_num,
                                         bins=1000)
        logger.info("epoch {} lr {} {}sec".format(epoch,
                                                   trainer.learning_rate,
                                                   time.time() - t0))
        # NOTE: cls_acc is never reset, so the accuracy is cumulative over
        # all epochs; cls_acc.get() returns a (name, value) pair.
        train_loss, train_acc = np.mean(
            epoch_losses) / batch_size, cls_acc.get()
        logger.info("\ttrain loss {} {}".format(train_loss, train_acc))
        # Evaluate every 10th epoch (skipping epoch 0).
        if epoch > 0 and (epoch % 10) == 0:
            test_acc, test_loss = test_net(net, valid_iter, ctx)
            if use_mxboard:
                sw.add_scalar(tag='test_acc',
                              value=test_acc,
                              global_step=epoch)
                sw.add_scalar(tag='test_loss',
                              value=test_loss,
                              global_step=epoch)
            if top_acc < test_acc:
                top_acc = test_acc
                logger.info('\ttop valid acc {}'.format(test_acc))
                # Hybrid networks can be exported (symbol + params); others
                # only have their parameters saved.
                if isinstance(net, (mx.gluon.nn.HybridSequential,
                                    mx.gluon.nn.HybridBlock)):
                    pf = '{}_{:.3f}.params'.format(save_prefix, top_acc)
                    net.export(pf, epoch)
                else:
                    net_path = '{}top_acc_{}_{:.3f}.params'.format(
                        save_prefix, epoch, top_acc)
                    net.save_parameters(net_path)
    if use_mxboard:
        sw.close()
def mytrain(net,num_classes,train_data,valid_data,ctx,start_epoch, end_epoch, \
            arm_cls_loss=arm_cls_loss,cls_loss=cls_loss,box_loss=box_loss,trainer=None):
    """Train a two-stage (ARM + ODM) detection network and plot the curves.

    For every epoch in ``range(start_epoch, end_epoch)`` the function runs
    the training iterator, builds ARM and ODM targets with
    ``MultiBoxTarget``, back-propagates the combined loss, evaluates
    train/validation AP, and logs scalars, the graph and one gradient
    histogram to a ``SummaryWriter`` under ``./logs``.  After the last epoch
    it saves the AP and loss curves to ``loss_curve.png``.

    Args:
        net: feature-extraction network; its output feeds ``multibox_layer``.
        num_classes: forwarded to ``multibox_layer``.
        train_data, valid_data: data iterators exposing ``reset()`` and
            yielding batches with ``.data[0]`` / ``.label[0]``.
        ctx: context the batches are copied to.
        start_epoch, end_epoch: epoch range (end exclusive).
        arm_cls_loss, cls_loss, box_loss: loss callables; the defaults are
            the module-level objects of the same names.
        trainer: optional ``gluon.Trainer``; an Adam trainer is created when
            it is ``None``.

    The body also relies on names that are not defined here (``sizes``,
    ``ratios``, ``normalizations``, ``info``, ``plot``, ``evaluate_MAP``,
    ``evaluate_acc``) and are expected to exist at module level.
    """
    if trainer is None:
        # trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': 0.01,'momentum':0.9, 'wd':50.0})
        trainer = gluon.Trainer(net.collect_params(), 'adam', {
            'learning_rate': 0.001,
            'clip_gradient': 2.0
        })
        # trainer = gluon.Trainer(net.collect_params(), 'adam', {'learning_rate': 0.003})
    box_metric = metric.MAE()

    ## visualisation (mxboard)
    # collect parameters (currently unused below)
    params = net.collect_params()
    # param_names = params.keys()
    # define a summary writer that logs data and flushes to the file every 5 seconds
    sw = SummaryWriter(logdir='./logs', flush_secs=5)
    # counts training batches across all epochs; x-axis of the 'loss' scalar
    global_step = 0

    for e in range(start_epoch, end_epoch):
        # print(e)
        train_data.reset()
        valid_data.reset()
        box_metric.reset()
        tic = time.time()
        # running sums over the epoch: [classification loss, localisation loss]
        _loss = [0, 0]
        arm_loss = [0, 0]
        # if e == 6 or e == 100:
        #     trainer.set_learning_rate(trainer.learning_rate * 0.2)

        # detections and labels of the whole epoch, concatenated batch by batch
        outs, labels = None, None
        for i, batch in enumerate(train_data):
            # print('----- batch {} start ----'.format(i))
            data = batch.data[0].as_in_context(ctx)
            label = batch.label[0].as_in_context(ctx)
            # print('label shape: ',label.shape)
            with autograd.record():
                # 1. generate results according to extract network
                ssd_layers = net(data)
                arm_loc_preds, arm_cls_preds, arm_anchor_boxes, odm_loc_preds, odm_cls_preds = multibox_layer(ssd_layers,\
                                                                            num_classes,sizes,ratios,normalizations)
                # arm_loc_preds, arm_cls_preds, arm_anchor_boxes, odm_loc_preds, odm_cls_preds = net(data)
                # print('---------1111-----------')

                # 2. ARM predict
                ## 2.1 modify label as [-1,0,..]
                label_arm = nd.Custom(label, op_type='modify_label')
                arm_tmp = MultiBoxTarget(arm_anchor_boxes,label_arm,arm_cls_preds,overlap_threshold=.5,\
                                         negative_mining_ratio=3,negative_mining_thresh=.5)
                arm_loc_target = arm_tmp[0]  # box offset
                arm_loc_target_mask = arm_tmp[1]  # box mask (only 0,1)
                arm_cls_target = arm_tmp[2]  # every anchor' idx
                # print(sum(arm_cls_target[0]))
                # print('---------2222-----------')

                # 3. ODM predict
                ## 3.1 refine anchor generator originate in ARM
                odm_anchor_boxes = refine_anchor_generator(
                    arm_anchor_boxes,
                    arm_loc_preds)  #(batch,h*w*num_anchors[:layers],4)
                # ### debug backward err
                # odm_anchor_boxes = arm_anchor_boxes
                odm_anchor_boxes_bs = nd.split(
                    data=odm_anchor_boxes, axis=0,
                    num_outputs=label.shape[0])  # list
                # print('---3 : odm_anchor_boxes_bs shape : {}'.format(odm_anchor_boxes_bs[0].shape))
                # print('---------3333-----------')

                ## 3.2 compute the targets for every sample of the current batch (used with multiple GPUs)
                odm_loc_target = []
                odm_loc_target_mask = []
                odm_cls_target = []
                label_bs = nd.split(data=label,
                                    axis=0,
                                    num_outputs=label.shape[0])
                odm_cls_preds_bs = nd.split(data=odm_cls_preds,
                                            axis=0,
                                            num_outputs=label.shape[0])
                # print('---4 : odm_cls_preds_bs shape: {}'.format(odm_cls_preds_bs[0].shape))
                # print('---4 : label_bs shape: {}'.format(label_bs[0].shape))
                for j in range(label.shape[0]):
                    # NOTE(review): with a single sample the split results are
                    # indexed and the batch axis re-added via expand_dims;
                    # this branch also uses negative_mining_ratio=2 instead
                    # of 3 - confirm that the difference is intended.
                    if label.shape[0] == 1:
                        odm_tmp = MultiBoxTarget(odm_anchor_boxes_bs[j].expand_dims(axis=0),label_bs[j].expand_dims(axis=0),\
                                                 odm_cls_preds_bs[j].expand_dims(axis=0),overlap_threshold=.5,negative_mining_ratio=2,negative_mining_thresh=.5)
                    ## batch with several samples
                    else:
                        odm_tmp = MultiBoxTarget(odm_anchor_boxes_bs[j],label_bs[j],\
                                                 odm_cls_preds_bs[j],overlap_threshold=.5,negative_mining_ratio=3,negative_mining_thresh=.5)
                    odm_loc_target.append(odm_tmp[0])
                    odm_loc_target_mask.append(odm_tmp[1])
                    odm_cls_target.append(odm_tmp[2])
                ### concat: the targets were computed per image above because
                ### the ODM anchors carry a batch axis and had to be split
                odm_loc_target = nd.concat(*odm_loc_target, dim=0)
                odm_loc_target_mask = nd.concat(*odm_loc_target_mask, dim=0)
                odm_cls_target = nd.concat(*odm_cls_target, dim=0)

                # 4. negative filter
                group = nd.Custom(arm_cls_preds,
                                  odm_cls_target,
                                  odm_loc_target_mask,
                                  op_type='negative_filtering')
                odm_cls_target = group[0]  # ODM class targets after filtering with the ARM class predictions
                odm_loc_target_mask = group[1]  # mask is 0 for the filtered-out anchors
                # print('---------4444-----------')

                # 5. calc loss
                # TODO:add 1/N_arm, 1/N_odm (num of positive anchors)
                # arm_cls_loss = gluon.loss.SoftmaxCrossEntropyLoss()
                arm_loss_cls = arm_cls_loss(arm_cls_preds.transpose((0, 2, 1)),
                                            arm_cls_target)
                arm_loss_loc = box_loss(arm_loc_preds, arm_loc_target,
                                        arm_loc_target_mask)
                # print('55555 loss->  arm_loss_cls : {} arm_loss_loc {}'.format(arm_loss_cls.shape,arm_loss_loc.shape))
                # print('arm_loss_cls loss : {}'.format(arm_loss_cls))
                # odm_cls_prob = nd.softmax(odm_cls_preds,axis=2)
                # NOTE(review): `tmp` is never read afterwards.
                tmp = odm_cls_preds.transpose((0, 2, 1))
                odm_loss_cls = cls_loss(odm_cls_preds.transpose((0, 2, 1)),
                                        odm_cls_target)
                odm_loss_loc = box_loss(odm_loc_preds, odm_loc_target,
                                        odm_loc_target_mask)
                # print('66666 loss->  odm_loss_cls : {} odm_loss_loc {}'.format(odm_loss_cls.shape,odm_loss_loc.shape))
                # print('odm_loss_cls loss :{} '.format(odm_loss_cls))
                # print('odm_loss_loc loss :{} '.format(odm_loss_loc))
                # print('N_arm: {} ; N_odm: {} '.format(nd.sum(arm_loc_target_mask,axis=1)/4.0,nd.sum(odm_loc_target_mask,axis=1)/4.0))
                # loss = arm_loss_cls+arm_loss_loc+odm_loss_cls+odm_loss_loc
                # Each stage's loss is divided by (sum of its location mask / 4);
                # presumably the number of positive anchors - TODO confirm.
                # NOTE(review): this divides by zero when a mask sums to 0.
                loss = 1/(nd.sum(arm_loc_target_mask,axis=1)/4.0) *(arm_loss_cls+arm_loss_loc) + \
                        1/(nd.sum(odm_loc_target_mask,axis=1)/4.0)*(odm_loss_cls+odm_loss_loc)

            sw.add_scalar(tag='loss',
                          value=loss.mean().asscalar(),
                          global_step=global_step)
            global_step += 1
            loss.backward(retain_graph=False)
            # autograd.backward(loss)
            # print(net.collect_params().get('conv4_3_weight').data())
            # print(net.collect_params().get('vgg0_conv9_weight').grad())
            ### test the gradients separately
            # arm_loss_cls.backward(retain_graph=False)
            # arm_loss_loc.backward(retain_graph=False)
            # odm_loss_cls.backward(retain_graph=False)
            # odm_loss_loc.backward(retain_graph=False)

            trainer.step(data.shape[0])

            _loss[0] += nd.mean(odm_loss_cls).asscalar()
            _loss[1] += nd.mean(odm_loss_loc).asscalar()
            arm_loss[0] += nd.mean(arm_loss_cls).asscalar()
            arm_loss[1] += nd.mean(arm_loss_loc).asscalar()
            # print(arm_loss)
            # NOTE(review): arm_cls_prob is computed but not used below.
            arm_cls_prob = nd.SoftmaxActivation(arm_cls_preds, mode='channel')
            odm_cls_prob = nd.SoftmaxActivation(odm_cls_preds, mode='channel')
            out = MultiBoxDetection(odm_cls_prob,odm_loc_preds,odm_anchor_boxes,\
                                    force_suppress=True,clip=False,nms_threshold=.5,nms_topk=400)
            # print('out shape: {}'.format(out.shape))
            # accumulate the detections/labels of the epoch for evaluate_MAP
            if outs is None:
                outs = out
                labels = label
            else:
                outs = nd.concat(outs, out, dim=0)
                labels = nd.concat(labels, label, dim=0)
            box_metric.update([odm_loc_target],
                              [odm_loc_preds * odm_loc_target_mask])

        print('-------{} epoch end ------'.format(e))
        train_AP = evaluate_MAP(outs, labels)
        valid_AP, val_box_metric = evaluate_acc(net, valid_data, ctx)
        info["train_ap"].append(train_AP)
        info["valid_ap"].append(valid_AP)
        info["loss"].append(_loss)
        print('odm loss: ', _loss)
        print('arm loss: ', arm_loss)

        # the network graph only needs to be written once
        if e == 0:
            sw.add_graph(net)
        # grads = [i.grad() for i in net.collect_params().values()]
        # grads_4_3 = net.collect_params().get('vgg0_conv9_weight').grad()
        # sw.add_histogram(tag ='vgg0_conv9_weight',values=grads_4_3,global_step=e, bins=1000 )
        grads_4_2 = net.collect_params().get('vgg0_conv5_weight').grad()
        sw.add_histogram(tag='vgg0_conv5_weight',
                         values=grads_4_2,
                         global_step=e,
                         bins=1000)
        # assert len(grads) == len(param_names)
        # logging the gradients of parameters for checking convergence
        # for i, name in enumerate(param_names):
        #     sw.add_histogram(tag=name, values=grads[i], global_step=e, bins=1000)
        # net.export('./Model/RefineDet_MeterDetect')
        # net
        if (e + 1) % 5 == 0:
            print(
                "epoch: %d time: %.2f cls loss: %.4f,reg loss: %.4f lr: %.5f" %
                (e, time.time() - tic, _loss[0], _loss[1],
                 trainer.learning_rate))
            print("train mae: %.4f AP: %.4f" %
                  (box_metric.get()[1], train_AP))
            print("valid mae: %.4f AP: %.4f" %
                  (val_box_metric.get()[1], valid_AP))
        sw.add_scalar(tag='train_AP', value=train_AP, global_step=e)
        sw.add_scalar(tag='valid_AP', value=valid_AP, global_step=e)

    sw.close()

    # plot the AP curves (left) and the ODM loss curves (right)
    if True:
        info["loss"] = np.array(info["loss"])
        info["cls_loss"] = info["loss"][:, 0]
        info["box_loss"] = info["loss"][:, 1]
        plt.figure(figsize=(12, 4))
        plt.subplot(121)
        plot("train_ap")
        plot("valid_ap")
        plt.legend(loc="upper right")
        plt.subplot(122)
        plot("cls_loss")
        plot("box_loss")
        plt.legend(loc="upper right")
        plt.savefig('loss_curve.png')
def train():
    """Train the attention network on CIFAR-10, logging to file and mxboard.

    All settings come from module-level names: ``args`` (batch size, GPU
    count, learning-rate schedule, log directory, ...) and ``use_mix_up``.
    Per epoch the function trains over the whole training loader, runs
    ``validate`` on the validation loader, writes loss/metric scalars to the
    summary writer, and saves parameters every ``args.save_period`` epochs
    as well as whenever validation accuracy exceeds the best seen so far
    (starting from a floor of 0.9).
    """
    # load_data
    batch_size = args.batch_size * max(args.num_gpus, 1)
    train_set = gluon.data.vision.CIFAR10(train=True,
                                          transform=transform_train)
    train_data = DataLoader(train_set,
                            batch_size=batch_size,
                            shuffle=True,
                            num_workers=args.num_workers,
                            last_batch='discard')
    val_set = gluon.data.vision.CIFAR10(train=False, transform=transform_val)
    val_data = DataLoader(val_set,
                          batch_size=batch_size,
                          shuffle=False,
                          num_workers=args.num_workers)

    # set the network and trainer
    if args.num_gpus > 0:
        ctx = [mx.gpu(i) for i in range(args.num_gpus)]
    else:
        ctx = [mx.cpu()]
    net = get_attention_cifar(10, num_layers=args.num_layers)
    net.initialize(init=mx.initializer.MSRAPrelu(), ctx=ctx)
    net.hybridize()

    trainer = gluon.Trainer(net.collect_params(), 'sgd', {
        'learning_rate': args.lr,
        'momentum': args.momentum,
        'wd': args.wd
    })
    cross_entropy = gluon.loss.SoftmaxCrossEntropyLoss(
        sparse_label=not use_mix_up)
    train_metric = mtc.Accuracy() if not use_mix_up else mx.metric.RMSE()

    # set log output
    train_mode = 'MixUP' if use_mix_up else 'Vanilla'
    # Take the timestamp once so the text log and the board directory of a
    # run always share the same suffix (two separate now() calls can straddle
    # a minute boundary).
    run_name = 'cifar10_attention%d_%s_%s' % (
        args.num_layers, train_mode,
        datetime.strftime(datetime.now(), '%Y%m%d%H%M'))
    logger = logging.getLogger('TRAIN')
    logger.setLevel("INFO")
    logger.addHandler(logging.StreamHandler())
    logger.addHandler(
        logging.FileHandler(
            os.path.join(args.log_dir, 'text/%s.log' % run_name)))
    sw = SummaryWriter(logdir=os.path.join(args.log_dir,
                                           'board/%s' % run_name),
                       verbose=False)

    # record the training hyper parameters
    logger.info(args)
    lr_counter = 0
    lr_steps = [int(s) for s in args.lr_steps.strip().split(',')]
    num_batch = len(train_data)
    epochs = args.epochs + 1
    alpha = args.alpha
    max_accuracy = 0.9

    try:
        for epoch in range(epochs):
            # Decay the learning rate by 10x at each configured epoch.
            if epoch == lr_steps[lr_counter]:
                trainer.set_learning_rate(trainer.learning_rate * 0.1)
                if lr_counter + 1 < len(lr_steps):
                    lr_counter += 1

            train_loss = 0
            train_metric.reset()
            tic = time.time()

            for i, batch in enumerate(train_data):
                data = gluon.utils.split_and_load(batch[0],
                                                  ctx_list=ctx,
                                                  batch_axis=0,
                                                  even_split=False)
                labels = gluon.utils.split_and_load(batch[1],
                                                    ctx_list=ctx,
                                                    batch_axis=0,
                                                    even_split=False)
                # Mix-up is switched off for the last 20 epochs.
                if use_mix_up and epoch < epochs - 20:
                    lam = np.random.beta(alpha, alpha)
                    data = [lam * X + (1 - lam) * X[::-1] for X in data]
                    labels = [lam * Y + (1 - lam) * Y[::-1] for Y in labels]

                with ag.record():
                    outputs = [net(X) for X in data]
                    losses = [
                        cross_entropy(yhat, y)
                        for yhat, y in zip(outputs, labels)
                    ]
                for l in losses:
                    ag.backward(l)
                trainer.step(batch_size)

                train_metric.update(labels, outputs)
                train_loss += sum(l.mean().asscalar()
                                  for l in losses) / len(losses)

            _, train_acc = train_metric.get()
            train_loss /= num_batch
            val_acc, val_loss = validate(net, val_data, ctx)

            sw.add_scalar("AttentionNet/Loss", {
                'train': train_loss,
                'val': val_loss
            }, epoch)
            sw.add_scalar("AttentionNet/Metric", {
                'train': train_acc,
                'val': val_acc
            }, epoch)
            logger.info('[Epoch %d] train metric: %.6f, train loss: %.6f | '
                        'val accuracy: %.6f, val loss: %.6f, time: %.1f' %
                        (epoch, train_acc, train_loss, val_acc, val_loss,
                         time.time() - tic))

            if (epoch % args.save_period) == 0 and epoch != 0:
                net.save_parameters(
                    "./models/attention%d-cifar10-epoch-%d-%s.params" %
                    (args.num_layers, epoch, train_mode))
            if val_acc > max_accuracy:
                net.save_parameters(
                    "./models/best-%f-attention%d-cifar10-epoch-%d-%s.params"
                    % (val_acc, args.num_layers, epoch, train_mode))
                max_accuracy = val_acc
    finally:
        # Close the writer even when training aborts so buffered events
        # reach disk.
        sw.close()
    logger.info("Train End.")
loss = loss_fun(y_hat, label_i) loss.backward() trainer.step(batch_size=batch_size) train_loss += loss.mean().asscalar() f_train_acc += acc(y_hat, label_i) for batch_data in test_data: data = batch_data[0].as_in_context(ctx) label = batch_data[1].as_in_context(ctx) f_val_acc += acc(mnist_net(data), label) f_train_acc = f_train_acc * ninv_train f_val_acc = f_val_acc * ninv_test sw.add_scalar(tag="RMNIST-07_training_accuracy", value=f_train_acc, global_step=a_epoch) sw.add_scalar(tag="MNIST-07_validation_accuracy", value=f_val_acc, global_step=a_epoch) # print( "Epoch %d: Loss: %.3f, Train acc %.3f, Test acc %.3f, Time %.1f sec" % (a_epoch, train_loss / len(train_data), f_train_acc, f_val_acc, time() - tic)) # requires either full layer specification or we pass one batch of data through the net # so that deferred initialization can work it's magic no_in_channel = 50 no_out_channel = 100
def train():
    """Run the adversarial training loop for ``netG`` / ``netD``.

    Uses module-level state: the networks and their trainers, the
    ``train_data`` / ``val_data`` iterators, the loss functions ``GAN_loss``
    and ``loss_2nd``, and configuration such as ``epochs``, ``lambda1``,
    ``ctx``, ``batch_size`` and the output directories.

    Each batch performs one discriminator update followed by one generator
    update. Losses are written to the summary writer per step; images,
    validation loss and the discriminator accuracy are written per epoch.
    Checkpoints are saved every ``check_point_interval`` epochs and after the
    final epoch.
    """
    image_pool = ImagePool(pool_size)
    metric = mx.metric.CustomMetric(facc)
    logging.basicConfig(level=logging.DEBUG)

    # define a summary writer that logs data and flushes to the file every 5 seconds
    sw = SummaryWriter(logdir='%s' % dir_out_sw, flush_secs=5, verbose=False)
    global_step = 0

    try:
        for epoch in range(epochs):
            if epoch == 0:
                netG.hybridize()
                netD.hybridize()
            tic = time.time()
            btic = time.time()
            train_data.reset()
            val_data.reset()

            for local_step, batch in enumerate(train_data):
                ############################
                # (1) Update D network: maximize log(D(x, y)) + log(1 - D(x, G(x, z)))
                ###########################
                tmp = mx.nd.concat(batch.data[0],
                                   batch.data[1],
                                   batch.data[2],
                                   dim=1)
                tmp = augmenter(tmp,
                                patch_size=128,
                                offset=offset,
                                aug_type=1,
                                aug_methods=aug_methods,
                                random_crop=False)
                # Channel 0 is the input, channel 1 the target, channel 2
                # the mask.
                real_in = tmp[:, :1].as_in_context(ctx)
                real_out = tmp[:, 1:2].as_in_context(ctx)
                m = tmp[:, 2:3].as_in_context(ctx)  # mask
                fake_out = netG(real_in) * m

                # loss weight based on mask, applied on L1 loss
                if no_loss_weights:
                    loss_weight = m
                else:
                    # Entries where the mask is zero get weight 0.1.
                    loss_weight = m.asnumpy()
                    loss_weight[loss_weight == 0] = .1
                    loss_weight = mx.nd.array(loss_weight, ctx=m.context)

                fake_concat = image_pool.query(
                    nd.concat(real_in, fake_out, dim=1))
                with autograd.record():
                    # Train with fake image
                    # Use image pooling to utilize history images
                    output = netD(fake_concat)
                    fake_label = nd.zeros(output.shape, ctx=ctx)
                    errD_fake = GAN_loss(output, fake_label)
                    metric.update([fake_label], [output])

                    # Train with real image
                    real_concat = nd.concat(real_in, real_out, dim=1)
                    output = netD(real_concat)
                    real_label = nd.ones(output.shape, ctx=ctx)
                    errD_real = GAN_loss(output, real_label)
                    errD = (errD_real + errD_fake) * 0.5
                    errD.backward()
                    metric.update([real_label], [output])

                trainerD.step(batch.data[0].shape[0])

                ############################
                # (2) Update G network: maximize log(D(x, G(x, z))) - lambda1 * L1(y, G(x, z))
                ###########################
                with autograd.record():
                    fake_out = netG(real_in)
                    fake_concat = nd.concat(real_in, fake_out, dim=1)
                    output = netD(fake_concat)
                    real_label = nd.ones(output.shape, ctx=ctx)
                    errG = GAN_loss(output, real_label) + loss_2nd(
                        real_out, fake_out, loss_weight) * lambda1
                    errG.backward()

                trainerG.step(batch.data[0].shape[0])

                sw.add_scalar(tag='loss',
                              value=('d_loss', errD.mean().asscalar()),
                              global_step=global_step)
                sw.add_scalar(tag='loss',
                              value=('g_loss', errG.mean().asscalar()),
                              global_step=global_step)
                global_step += 1

                if epoch + local_step == 0:
                    # Very first batch of the run: record the graph and the
                    # reference (real) images, and fetch the validation batch
                    # that is reused for the per-epoch image logs below.
                    sw.add_graph((netG))
                    img_in_list, img_out_list, m_val = val_data.next().data
                    m_val = m_val.as_in_context(ctx)
                    sw.add_image('first_minibatch_train_real',
                                 norm3(real_out))
                    sw.add_image('first_minibatch_val_real',
                                 norm3(img_out_list.as_in_context(ctx)))
                    netG.export('%snetG' % dir_out_checkpoints)

                if local_step == 0:
                    # Log the first batch of images of each epoch (training)
                    sw.add_image('first_minibatch_train_fake',
                                 norm3(fake_out * m) * m, epoch)
                    sw.add_image(
                        'first_minibatch_val_fake',
                        norm3(netG(img_in_list.as_in_context(ctx)) * m_val) *
                        m_val, epoch)

                if (local_step + 1) % 10 == 0:
                    name, acc = metric.get()
                    logging.info('speed: {} samples/s'.format(
                        batch_size / (time.time() - btic)))
                    logging.info(
                        'discriminator loss = %f, generator loss = %f, binary training acc = %f at iter %d epoch %d'
                        % (nd.mean(errD).asscalar(), nd.mean(errG).asscalar(),
                           acc, local_step, epoch))
                btic = time.time()

            # Read the metric BEFORE writing it: the value logged for the
            # epoch must be the end-of-epoch accuracy, and `acc` must exist
            # even when the epoch had fewer than 10 batches.
            name, acc = metric.get()
            metric.reset()
            sw.add_scalar(tag='binary_training_acc',
                          value=('acc', acc),
                          global_step=epoch)

            fake_val = netG(val_data.data[0][1].as_in_context(ctx))
            loss_val = loss_2nd(val_data.data[1][1].as_in_context(ctx),
                                fake_val,
                                val_data.data[2][1].as_in_context(ctx)) * lambda1
            sw.add_scalar(tag='loss_val',
                          value=('g_loss', loss_val.mean().asscalar()),
                          global_step=epoch)

            if epoch % check_point_interval == 0 or epoch == epochs - 1:
                netD.save_params('%snetD-%04d' % (dir_out_checkpoints, epoch))
                netG.save_params('%snetG-%04d' % (dir_out_checkpoints, epoch))

            logging.info('\nbinary training acc at epoch %d: %s=%f' %
                         (epoch, name, acc))
            logging.info('time: %f' % (time.time() - tic))

        sw.export_scalars('scalar_dict.json')
    finally:
        # Always release the writer, even if training aborts midway.
        sw.close()
class Segmentation(object):
    """Driver that wires a ``Segmentator`` network to its datasets.

    The constructor builds the network wrapper and the dataset objects; the
    remaining methods run training, validation, testing or an interactive
    shell on top of them. Settings come from the ``args`` object passed in
    and from the module-level ``config`` mapping.
    """

    def __init__(self, args):
        """Build the network wrapper and dataset objects.

        Parameters
        ----------
        args : object
            Parsed options. This class reads ``train``, ``decoder``,
            ``dataset`` and ``pretrain`` from it.
        """
        self.args = args
        self.lr = 0.0005
        # Training and inference are both pinned to the first GPU.
        self.device = [mx.gpu(0)]
        self.test_device = [mx.gpu(0)]
        self.lr_decay_step = 5000
        self.lr_decay_rate_epoch = 0.9
        self.lr_decay_rate = 0.9
        self.save_freq = 1  # epoch
        self.save_freq_step = 5000
        self.arch_name = config['arch_name']
        self.arch_path = os.path.join('model', self.arch_name)
        self.mode = 'train' if self.args.train else 'test'
        self.batch_size = config[self.mode]['batch_size']
        self.config = config
        self.segmentator = Segmentator(self.args, self.config)
        self.cgeddata = CGEDData(self.segmentator.tokenizer,
                                 self.segmentator.transformer,
                                 self.config,
                                 self.mode,
                                 useDecoder=self.args.decoder,
                                 args=args)
        self.csc15data = SighanCSC15Data(self.segmentator.tokenizer,
                                         self.segmentator.transformer,
                                         self.config,
                                         self.mode,
                                         args=args,
                                         vocab_tgt=self.segmentator.vocab_tgt)
        self.csc14data = SighanCSC14Data(self.segmentator.tokenizer,
                                         self.segmentator.transformer,
                                         self.config,
                                         self.mode,
                                         args=args,
                                         vocab_tgt=self.segmentator.vocab_tgt)
        # The review datasets are only created for non-CGED16 runs.
        if args.dataset != 'CGED16':
            self.reviewdata = ReviewData(self.segmentator.tokenizer,
                                         self.segmentator.transformer,
                                         self.segmentator.vocab_tgt,
                                         self.config, self.mode)
            self.reviewdata_val = ReviewData(self.segmentator.tokenizer,
                                             self.segmentator.transformer,
                                             self.segmentator.vocab_tgt,
                                             self.config, 'test')

    def init_network(self):
        """Initialise parameters and load a checkpoint.

        For non-CGED16 datasets the first ``len(vocab_tgt)`` rows of the
        encoder word embedding are copied into the target embedding before
        the checkpoint is loaded.

        Returns
        -------
        Whatever the checkpoint loader returns; callers in this class unpack
        it as a 2-tuple whose first element is the start step/epoch.
        """
        self.segmentator.initialize(mx.init.Xavier(), ctx=self.device)
        if self.args.dataset != 'CGED16':
            params = self.segmentator._collect_params_with_prefix()
            emb_pretrained = params['encoder.word_embed.0.weight'].data(
            )[:len(self.segmentator.vocab_tgt)]
            self.segmentator.collect_params().reset_ctx(self.device)
            self.segmentator._collect_params_with_prefix(
            )['emb_tgt.0.weight']._load_init(emb_pretrained, self.device)
        else:
            self.segmentator.collect_params().reset_ctx(self.device)

        if self.args.pretrain:
            return load_pretrained_model_only_same_shape(
                self.segmentator,
                'model/bert_multi_full/0000140000-0.params', self.device)
        return load_latest_checkpoint(self.segmentator, self.arch_path,
                                      self.device)

    def init_trainer(self, step_start):
        """Create the LAMB trainer, resuming the decayed learning rate.

        The initial learning rate is ``self.lr`` multiplied by
        ``lr_decay_rate_epoch`` once for every full ``lr_decay_step`` already
        completed before ``step_start``.
        """
        self.lr_scheduler = mx.lr_scheduler.FactorScheduler(
            self.lr_decay_step, self.lr_decay_rate)
        self.optimizer = 'lamb'
        completed_decays = int(step_start / float(self.lr_decay_step))
        self.options = {
            'learning_rate':
            self.lr * (self.lr_decay_rate_epoch**completed_decays),
            'lr_scheduler': self.lr_scheduler,
            'clip_gradient': 0.1,
            'wd': 0.0001
        }
        self.trainer = mx.gluon.Trainer(self.segmentator.collect_params(),
                                        self.optimizer, self.options)

    # ------------------------------------------------------------------
    # private helpers shared by the training / testing loops
    # ------------------------------------------------------------------
    def _usable_batch_size(self, array):
        """Return the largest multiple of the device count <= batch length."""
        n_devices = len(self.device)
        return array.shape[0] // n_devices * n_devices

    def _split_all(self, arrays, batch_size):
        """Trim each array to ``batch_size`` rows and scatter it to devices."""
        return [
            gluon.utils.split_and_load(array[:batch_size], self.device)
            for array in arrays
        ]

    def _fetch(self, name, data):
        """Return ``(batch, restarted)`` for the dataset called ``name``.

        Loader state is kept on the instance as ``<name>loader``,
        ``<name>_enumerator`` and ``<name>_epoch``. When the enumerator is
        missing or exhausted a fresh loader is requested from ``data``, the
        epoch counter is advanced (starting at 0 if it did not exist yet) and
        ``restarted`` is True.

        Raises
        ------
        RuntimeError
            If a freshly created loader yields no batch at all.
        """
        enumerator = getattr(self, name + '_enumerator', None)
        batch = None
        if enumerator is not None:
            batch = next(enumerator, (-1, None))[1]
        restarted = batch is None
        if restarted:
            loader = data.get_loader()
            setattr(self, name + 'loader', loader)
            setattr(self, name + '_epoch',
                    getattr(self, name + '_epoch', -1) + 1)
            enumerator = enumerate(loader)
            setattr(self, name + '_enumerator', enumerator)
            batch = next(enumerator, (-1, None))[1]
            if batch is None:
                raise RuntimeError(
                    'loader for {!r} produced no batches'.format(name))
        return batch, restarted

    @staticmethod
    def _reset_progress(progress, name, data):
        """Replace the progress bar stored under ``name`` with a fresh one."""
        previous = progress.get(name)
        if previous is not None:
            previous.close()
        progress[name] = tqdm(total=len(data.data), desc=name)

    def _step_cged(self, _step, progress):
        """Run one training step on a CGED batch."""
        batch, restarted = self._fetch('cged', self.cgeddata)
        if restarted:
            self._reset_progress(progress, 'cged', self.cgeddata)
        (nd_input_word_idx, nd_input_valid_len, nd_input_segment,
         nd_error_idx, _, _) = batch
        _batch_size = self._usable_batch_size(nd_input_word_idx)
        (nd_input_word_idx, nd_input_valid_len, nd_input_segment,
         nd_error_idx) = self._split_all(
             (nd_input_word_idx, nd_input_valid_len, nd_input_segment,
              nd_error_idx), _batch_size)
        loss = self.segmentator.train_CGED(nd_input_word_idx,
                                           nd_input_valid_len,
                                           nd_input_segment, nd_error_idx,
                                           self.device, _batch_size,
                                           self.trainer)
        progress['cged'].update(_batch_size)
        if _step % 100 == 0:
            print('CGED Loss => {}, Epoch => {}, Lr => {}'.format(
                loss, self.cged_epoch, self.trainer.learning_rate))

    def _step_review(self, _step, progress):
        """Run one training step on a review batch and log every 100 steps."""
        batch, restarted = self._fetch('review', self.reviewdata)
        if restarted:
            self._reset_progress(progress, 'review', self.reviewdata)
        (nd_input_word_idx, nd_input_valid_len, nd_input_segment,
         nd_target_word_idx, nd_target_valid_len, nd_target_segment,
         nd_pm_error_idx, nd_pm_add_idx, nd_pm_remove_idx, list_input_texts,
         list_target_texts) = batch
        _batch_size = self._usable_batch_size(nd_input_word_idx)
        # Arrays are listed in the positional order segmentator.train expects.
        call_args = self._split_all(
            (nd_input_word_idx, nd_input_valid_len, nd_input_segment,
             nd_target_word_idx, nd_target_valid_len, nd_target_segment,
             nd_pm_error_idx, nd_pm_add_idx, nd_pm_remove_idx), _batch_size)
        call_args += [
            list_input_texts[:_batch_size], list_target_texts[:_batch_size],
            self.device, _batch_size, self.trainer
        ]
        loss, loss_pm = self.segmentator.train(*call_args)
        if _step % 100 == 0:
            print('Review Loss => {}, Epoch => {}, Lr => {}'.format(
                loss, self.review_epoch, self.trainer.learning_rate))
            self.sw.add_scalar(tag='review_loss',
                               value=loss,
                               global_step=_step)
            if self.config['use_encoder_constraint']:
                self.sw.add_scalar(tag='pm_loss',
                                   value=loss_pm,
                                   global_step=_step)
        progress['review'].update(_batch_size)

    def _step_csc(self, name, data, _step, progress):
        """Run one training step on a SIGHAN CSC batch (``csc14``/``csc15``).

        NOTE(review): the non-fixed path calls ``segmentator.train`` with
        fewer positional arguments than the review path does and expects a
        single return value - confirm against the current ``Segmentator``
        before enabling these datasets.
        """
        batch, restarted = self._fetch(name, data)
        if restarted:
            self._reset_progress(progress, name, data)
        fixed = self.config['csc_fixed']
        # Fixed-mode batches carry 6 fields, otherwise 8.
        if fixed:
            (nd_input_word_idx, nd_input_valid_len, nd_input_segment,
             nd_target_word_idx, list_input_texts, list_target_texts) = batch
        else:
            (nd_input_word_idx, nd_input_valid_len, nd_input_segment,
             nd_target_word_idx, nd_target_valid_len, nd_target_segment,
             list_input_texts, list_target_texts) = batch
        _batch_size = self._usable_batch_size(nd_input_word_idx)
        (nd_input_word_idx, nd_input_valid_len, nd_input_segment,
         nd_target_word_idx) = self._split_all(
             (nd_input_word_idx, nd_input_valid_len, nd_input_segment,
              nd_target_word_idx), _batch_size)
        if fixed:
            loss = self.segmentator.train_csc_fixed(
                nd_input_word_idx, nd_input_valid_len, nd_input_segment,
                nd_target_word_idx, self.device, _batch_size, self.trainer)
        else:
            nd_target_valid_len, nd_target_segment = self._split_all(
                (nd_target_valid_len, nd_target_segment), _batch_size)
            loss = self.segmentator.train(
                nd_input_word_idx, nd_input_valid_len, nd_input_segment,
                nd_target_word_idx, nd_target_valid_len, nd_target_segment,
                list_input_texts[:_batch_size],
                list_target_texts[:_batch_size], self.device, _batch_size,
                self.trainer)
        if _step % 100 == 0:
            print('{} Loss => {}, Epoch => {}, Lr => {}'.format(
                name.upper(), loss, getattr(self, name + '_epoch'),
                self.trainer.learning_rate))
        progress[name].update(_batch_size)

    # ------------------------------------------------------------------
    # public entry points
    # ------------------------------------------------------------------
    def train(self):
        """Multi-dataset training loop (30001 steps from the loaded step).

        A dataset is drawn per step; with the current probabilities only
        ``'review'`` is ever selected. The model is saved every
        ``save_freq_step`` steps and validated every ``config['val_freq']``
        steps, with the BLEU average written to the summary writer.
        """
        step_start, _ = self.init_network()
        self.init_trainer(step_start)
        self.sw = SummaryWriter(logdir='logs/' + self.config['arch_name'],
                                flush_secs=5)
        self.reviewloader = self.reviewdata.get_loader()
        self.reviewloader_val = self.reviewdata_val.get_loader()
        self.int_review_samples = len(self.reviewdata.data)
        self.segmentator.hybridize()
        self.review_epoch = 0
        self.review_enumerator = enumerate(self.reviewloader)
        self.review_enumerator_val = enumerate(self.reviewloader_val)
        progress = {
            'review': tqdm(total=self.int_review_samples, desc='review')
        }
        dataset_names = ['review', 'cged', 'csc15', 'csc14']

        try:
            for _step in range(step_start, step_start + 30001):
                # A try/except used to wrap this body to survive batches with
                # a single leftover sample; it was removed because it made
                # debugging hard.
                _dataset = np.random.choice(dataset_names, p=[1, 0, 0, 0])
                # Logging and validation steps always use the main dataset.
                if (_step % self.config['val_freq'] == 0
                        or _step % 100 == 0) and _step != 0:
                    _dataset = 'review'

                if _dataset == 'cged':
                    self._step_cged(_step, progress)
                elif _dataset == 'review':
                    self._step_review(_step, progress)
                elif _dataset == 'csc15':
                    self._step_csc('csc15', self.csc15data, _step, progress)
                elif _dataset == 'csc14':
                    self._step_csc('csc14', self.csc14data, _step, progress)

                if _step % self.save_freq_step == 0:
                    save_gluon_model(self.segmentator, self.arch_path, _step,
                                     0)  # use main dataset
                if _step % self.config['val_freq'] == 0 and _step != 0:
                    avg_val_bleu, _ = self.val_using_testset()
                    self.sw.add_scalar(tag='avg_val_bleu',
                                       value=avg_val_bleu,
                                       global_step=_step)
        finally:
            # Make sure buffered summaries are written out even on failure.
            self.sw.close()

    def test(self):
        """Debugging stub: score one validation batch, then abort.

        NOTE(review): the bare ``raise`` below has no active exception, so
        it surfaces as ``RuntimeError``; it looks like a deliberate stop
        left in for debugging - confirm before relying on this method.
        """
        self.segmentator.hybridize()
        self.loader = self.reviewdata_val.get_loader()
        for i, batch in enumerate(self.loader):
            str_input_text, str_target_text = batch
            str_predict_text = self.segmentator.run(str_input_text,
                                                    self.test_device)
            score = sentence_bleu([[t for t in str_predict_text]],
                                  [t for t in str_target_text])
            raise

    def val_using_testset(self):
        """Score up to ``config['int_val_set']`` validation batches.

        Returns
        -------
        tuple
            ``(mean_score, last_prediction)``. When no batch was scored the
            mean is NaN and the prediction is ``None``.

        NOTE(review): the prediction is passed as the first argument of
        ``sentence_bleu`` and the target as the second; if this is NLTK's
        ``sentence_bleu(references, hypothesis)`` the roles are swapped -
        confirm.
        """
        limit = self.config['int_val_set']
        progress_val = tqdm(total=limit, desc='cged')
        self.loader = self.reviewdata_val.get_loader()
        scores = []
        str_predict_text = None  # stays None if the loader yields nothing
        for i, batch in enumerate(self.loader):
            if i == limit:
                break
            str_input_text, str_target_text = batch
            str_predict_text = self.segmentator.run(str_input_text,
                                                    self.test_device)
            score = sentence_bleu([[t for t in str_predict_text]],
                                  [t for t in str_target_text])
            scores.append(score)
            progress_val.update(1)
        progress_val.close()
        mean_score = np.mean(scores) if scores else float('nan')
        return mean_score, str_predict_text

    def run(self, text):
        """Load the network and return its prediction for ``text``."""
        self.init_network()
        self.segmentator.hybridize()
        return self.segmentator.run(text, self.device)

    def interact(self):
        """Load the network and start the interactive command loop."""
        self.init_network()
        self.segmentator.hybridize()
        shell(self.segmentator, self.arch_path, self.device).cmdloop()

    def train_CGED(self):
        """Train on the CGED dataset for 10 epochs from the loaded epoch.

        After every epoch the model is saved and the trainer is rebuilt with
        the learning rate scaled by ``lr_decay_rate_epoch``.
        """
        epoch_start, _ = self.init_network()
        epoch_start += 1
        self.init_trainer(epoch_start)
        # ``self.data`` is never assigned by this class; fall back to the
        # CGED dataset unless a caller injected one.
        data = getattr(self, 'data', self.cgeddata)
        self.loader = data.get_loader()
        self.num_samples = len(data.data)
        self.segmentator.hybridize()

        for e in range(epoch_start, epoch_start + 10):
            progress = tqdm(total=self.num_samples)
            for i, batch in enumerate(self.loader):
                (nd_input_word_idx, nd_input_valid_len, nd_input_segment,
                 nd_error_idx, list_input_texts, list_target_texts) = batch
                _batch_size = self._usable_batch_size(nd_input_word_idx)
                (nd_input_word_idx, nd_input_valid_len, nd_input_segment,
                 nd_error_idx) = self._split_all(
                     (nd_input_word_idx, nd_input_valid_len,
                      nd_input_segment, nd_error_idx), _batch_size)
                if self.args.decoder:
                    train_step = self.segmentator.train_CGEDDecoder
                else:
                    train_step = self.segmentator.train_CGED
                loss = train_step(nd_input_word_idx, nd_input_valid_len,
                                  nd_input_segment, nd_error_idx, self.device,
                                  _batch_size, self.trainer)
                progress.update(self.batch_size)
                if i % 10 == 0:
                    loss = sum(_loss.asnumpy().mean() for _loss in loss)
                    print('[*] Loss : {}, Lr : {}'.format(
                        loss, self.trainer.learning_rate))

            if e % self.save_freq == 0:
                save_gluon_model(self.segmentator, self.arch_path, e,
                                 0)  # use main dataset
            elif i % self.save_freq_step == 0:
                save_gluon_model(self.segmentator, self.arch_path, e - 1,
                                 i)  # use main dataset

            # Per-epoch decay: rebuild the trainer with the reduced rate.
            self.options['learning_rate'] = (self.trainer.learning_rate *
                                             self.lr_decay_rate_epoch)
            self.trainer = mx.gluon.Trainer(self.segmentator.collect_params(),
                                            self.optimizer, self.options)

    def test_CGED(self):
        """Predict error positions for the CGED set and write the report."""
        self.init_network()
        self.loader = self.cgeddata.get_loader()
        self.num_samples = len(self.cgeddata.data)
        self.segmentator.hybridize()
        progress = tqdm(total=self.num_samples)
        list_result = []
        for i, batch in enumerate(self.loader):
            (nd_input_word_idx, nd_input_valid_len, nd_input_segment,
             list_ids, list_input_texts, list_target_texts) = batch
            _batch_size = self._usable_batch_size(nd_input_word_idx)
            (nd_input_word_idx, nd_input_valid_len,
             nd_input_segment) = self._split_all(
                 (nd_input_word_idx, nd_input_valid_len, nd_input_segment),
                 _batch_size)
            # Use the options given to this instance, not a module global.
            if self.args.decoder:
                predict = self.segmentator.test_CGEDDecoder
            else:
                predict = self.segmentator.test_CGED
            _predict_error_idx = predict(nd_input_word_idx,
                                         nd_input_valid_len,
                                         nd_input_segment, self.device)
            list_result.extend(
                self.cgeddata.convert_to_report_format(
                    list_ids, _predict_error_idx))
            progress.update(self.batch_size)
        self.cgeddata.gen_eval_report(list_result)

    def train_paper(self):
        """Unfinished: prepares network, trainer and loader, then returns.

        NOTE(review): relies on ``self.data``, which this class never
        assigns; the intended dataset is not evident from this file.
        """
        epoch_start, _ = self.init_network()
        epoch_start += 1
        self.init_trainer(epoch_start)
        self.loader = self.data.get_loader()
        self.num_samples = len(self.data.data)
        self.segmentator.hybridize()

    def test_CSC(self):
        """Run the SIGHAN CSC14 or CSC15 test set and write its report.

        ``args.dataset == 'CSC14'`` selects CSC14; anything else selects
        CSC15. Only the first sample of each batch is evaluated.
        """
        self.init_network()
        self.segmentator.hybridize()
        if self.args.dataset == 'CSC14':
            self.cscdata = self.csc14data
        else:
            self.cscdata = self.csc15data
        self.cscloader = self.cscdata.get_loader()
        self.int_samples = len(self.cscdata.data)
        self.num_samples = len(self.cscdata.data)
        progress = tqdm(total=self.num_samples)
        reports = []
        for i, batch in enumerate(self.cscloader):
            (nd_input_word_idx, nd_input_valid_len, nd_input_segment,
             list_input_texts, list_target_text, list_ids) = batch
            print('input => ', list_input_texts[0])
            print('target => ', list_target_text[0])
            prediction = self.segmentator.run(list_input_texts[0],
                                              self.device)
            report = self.cscdata.compare_input_prediction(
                list_ids[0], prediction, list_input_texts[0])
            reports.append(report)
            progress.update(1)
        # Report through the dataset that was actually evaluated.
        self.cscdata.gen_eval_report(reports)
def train(opt, ctx):
    """Train the module-level ``net`` and export it when finished.

    Parameters
    ----------
    opt : object
        Parsed options. Fields read here include ``kvstore``,
        ``resume_states``, ``mode``, ``plot_network``, ``dry_run``,
        ``write_summary``, ``dataset``, ``start_epoch``, ``epochs``,
        ``log_interval``, ``test_run``, ``interrupt_at``, ``prefix`` and
        ``bits``.
    ctx : mx.Context or list of mx.Context
        Device(s) to train on; a single context is wrapped in a list.

    Notes
    -----
    Returns early (before any training) when ``opt.dry_run`` is set, and
    calls ``sys.exit(3)`` after the epoch named by ``opt.interrupt_at``.
    The summary writer, when enabled, is closed on every exit path.
    """
    if isinstance(ctx, mx.Context):
        ctx = [ctx]
    kv = mx.kv.create(opt.kvstore)
    train_data, val_data, batch_fn = get_data_iters(opt)
    net.collect_params().reset_ctx(ctx)
    trainer = gluon.Trainer(net.collect_params(),
                            *get_optimizer(opt),
                            kvstore=kv)
    if opt.resume_states != '':
        trainer.load_states(opt.resume_states)
    loss = gluon.loss.SoftmaxCrossEntropyLoss()

    # dummy forward pass to initialize binary layers
    data, _ = get_dummy_data(opt, ctx[0])
    _ = net(data)

    if opt.mode == 'hybrid':
        net.hybridize()

    # set batch norm wd to zero
    params = net.collect_params('.*batchnorm.*')
    for key in params:
        params[key].wd_mult = 0.0

    if opt.plot_network is not None:
        plot_network()

    if opt.dry_run:
        return

    summary_writer = None
    if opt.write_summary:
        from mxboard import SummaryWriter
        summary_writer = SummaryWriter(logdir=opt.write_summary,
                                       flush_secs=60)

    try:
        write_net_summaries(summary_writer, ctx[0], write_grads=False)
        track_lr = LRTracker(trainer, summary_writer)

        total_time = 0
        num_epochs = 0
        best_acc = 0
        epoch_time = -1
        num_examples = get_num_examples(opt.dataset)

        for epoch in range(opt.start_epoch, opt.epochs):
            # The global step counts samples, not batches.
            global_step = epoch * num_examples
            track_lr(epoch, global_step)
            tic = time.time()
            if hasattr(train_data, "reset"):
                train_data.reset()
            metric.reset()
            btic = time.time()

            for i, batch in enumerate(train_data):
                data, label = batch_fn(batch, ctx)
                outputs = []
                Ls = []
                with autograd.record():
                    for x, y in zip(data, label):
                        z = net(x)
                        L = loss(z, y)
                        # store the loss and do backward after we have done
                        # forward on all GPUs for better speed on multiple
                        # GPUs.
                        Ls.append(L)
                        outputs.append(z)
                    autograd.backward(Ls)
                trainer.step(batch_size)
                metric.update(label, outputs)

                if opt.log_interval and not (i + 1) % opt.log_interval:
                    name, acc = metric.get()
                    log_metrics("batch",
                                name,
                                acc,
                                epoch,
                                summary_writer,
                                global_step,
                                sep=" [%d]\tSpeed: %f samples/sec\t" %
                                (i, batch_size / (time.time() - btic)))
                    log_progress(num_examples, opt, epoch, i,
                                 time.time() - tic, epoch_time)
                    track_lr(epoch, global_step)

                btic = time.time()
                global_step += batch_size
                if opt.test_run:
                    break

            epoch_time = time.time() - tic
            write_net_summaries(summary_writer,
                                ctx[0],
                                global_step=global_step)

            # First epoch will usually be much slower than the subsequent
            # epochs, so don't factor it into the average
            if num_epochs > 0:
                total_time = total_time + epoch_time
            num_epochs = num_epochs + 1

            logger.info('[Epoch %d] time cost: %f' % (epoch, epoch_time))
            if summary_writer:
                summary_writer.add_scalar("training/epoch",
                                          epoch,
                                          global_step=global_step)
                summary_writer.add_scalar("training/epoch-time",
                                          epoch_time,
                                          global_step=global_step)

            # train
            name, acc = metric.get()
            log_metrics("training", name, acc, epoch, summary_writer,
                        global_step)

            # test
            name, val_acc = test(ctx, val_data, batch_fn, opt.test_run)
            log_metrics("validation", name, val_acc, epoch, summary_writer,
                        global_step)

            if opt.interrupt_at is not None and epoch + 1 == opt.interrupt_at:
                logging.info(
                    "[Epoch %d] Interrupting run now because 'interrupt-at' was set to %d..."
                    % (epoch, opt.interrupt_at))
                save_checkpoint(trainer,
                                epoch,
                                val_acc[0],
                                best_acc,
                                force_save=True)
                sys.exit(3)

            # save model if meet requirements
            save_checkpoint(trainer, epoch, val_acc[0], best_acc)
            best_acc = max(best_acc, val_acc[0])

        if num_epochs > 1:
            print('Average epoch time: {}'.format(
                float(total_time) / (num_epochs - 1)))

        if opt.mode != 'hybrid':
            net.hybridize()
        # dummy forward pass to save model
        data, _ = get_dummy_data(opt, ctx[0])
        _ = net(data)
        net.export(os.path.join(opt.prefix,
                                "image-classifier-{}bit".format(opt.bits)),
                   epoch=0)
    finally:
        # Close the writer on every exit path (normal end, exception, or the
        # sys.exit above) so buffered events are not lost.
        if summary_writer is not None:
            summary_writer.close()
class FIFOScheduler(TaskScheduler):
    r"""Simple scheduler that just runs trials in submission order.

    Parameters
    ----------
    train_fn: callable
        A task launch function for training.
    args: object (optional)
        Default arguments for launching train_fn.
    resource: dict
        Computation resources. For example, `{'num_cpus':2, 'num_gpus':1}`
    searcher: str or BaseSearcher
        Searcher (get_config decisions). If str, this is passed to
        searcher_factory along with search_options.
    search_options: dict
        If searcher is str, these arguments are passed to searcher_factory.
    checkpoint: str
        If filename given here, a checkpoint of scheduler (and searcher) state
        is written to file every time a job finishes.
        Note: May not be fully supported by all searchers.
    resume: bool
        If True, scheduler state is loaded from checkpoint, and experiment
        starts from there.
        Note: May not be fully supported by all searchers.
    num_trials: int
        Maximum number of jobs run in experiment.
    time_out: float
        If given, jobs are started only until this time_out (wall clock time)
    max_reward: float
        If given, no further jobs are started once the best reward reported by
        the searcher is at least this value.
    reward_attr: str
        Name of reward (i.e., metric to maximize) attribute in data obtained
        from reporter
    time_attr: str
        Name of resource (or time) attribute in data obtained from reporter.
        This attribute is optional for FIFO scheduling, but becomes mandatory
        in multi-fidelity scheduling (e.g., Hyperband).
        Note: The type of resource must be int.
    visualizer: str
        If 'tensorboard' or 'mxboard' (case-insensitive), reported results are
        additionally written with an mxboard SummaryWriter; this requires
        `checkpoint` to be set. Any other value disables it.
    dist_ip_addrs: list of str
        IP addresses of remote machines.
    training_history_callback: callable
        Callback function func called every time a job finishes, if at least
        training_history_callback_delta_secs seconds passed since the last
        recent call. The call has the form:
            func(self.training_history, self._start_time)
        Here, self._start_time is time stamp for when experiment started.
        Use this callback to serialize self.training_history after regular
        intervals.
    training_history_callback_delta_secs: float
        See training_history_callback.
    delay_get_config: bool
        If True, the call to searcher.get_config is delayed until a worker
        resource for evaluation is available. Otherwise, get_config is called
        just after a job has been started.
        For searchers which adapt to past data, True should be preferred.
        Otherwise, it does not matter.

    Examples
    --------
    >>> import numpy as np
    >>> import autogluon as ag
    >>> @ag.args(
    ...     lr=ag.space.Real(1e-3, 1e-2, log=True),
    ...     wd=ag.space.Real(1e-3, 1e-2))
    >>> def train_fn(args, reporter):
    ...     print('lr: {}, wd: {}'.format(args.lr, args.wd))
    ...     for e in range(10):
    ...         dummy_accuracy = 1 - np.power(1.8, -np.random.uniform(e, 2*e))
    ...         reporter(epoch=e, accuracy=dummy_accuracy, lr=args.lr, wd=args.wd)
    >>> scheduler = ag.scheduler.FIFOScheduler(train_fn,
    ...                                        resource={'num_cpus': 2, 'num_gpus': 0},
    ...                                        num_trials=20,
    ...                                        reward_attr='accuracy',
    ...                                        time_attr='epoch')
    >>> scheduler.run()
    >>> scheduler.join_jobs()
    >>> scheduler.get_training_curves(plot=True)
    """

    def __init__(self, train_fn, args=None, resource=None,
                 searcher=None, search_options=None,
                 checkpoint=None,
                 resume=False, num_trials=None,
                 time_out=None, max_reward=None, reward_attr='accuracy',
                 time_attr='epoch',
                 visualizer='none', dist_ip_addrs=None,
                 training_history_callback=None,
                 training_history_callback_delta_secs=60,
                 delay_get_config=True):
        super().__init__(dist_ip_addrs)
        if resource is None:
            resource = {'num_cpus': 1, 'num_gpus': 0}
        self.resource = resource
        if searcher is None:
            searcher = 'random'  # RandomSearcher
        if isinstance(searcher, str):
            if search_options is None:
                search_options = dict()
            # Work on a copy so the caller's dict is not modified
            _search_options = search_options.copy()
            _search_options['configspace'] = train_fn.cs
            _search_options['reward_attribute'] = reward_attr
            # Adjoin scheduler info to search_options, if not already done by
            # subclass
            if 'scheduler' not in _search_options:
                _search_options['scheduler'] = 'fifo'
            self.searcher: BaseSearcher = searcher_factory(
                searcher, **_search_options)
        else:
            assert isinstance(searcher, BaseSearcher)
            self.searcher: BaseSearcher = searcher
        assert isinstance(train_fn, _autogluon_method)
        self.train_fn = train_fn
        self.args = args if args else train_fn.args
        if num_trials is None:
            assert time_out is not None, \
                "Need stopping criterion: Either num_trials or time_out"
            num_trials = 100000  # time_out is what matters
        self.num_trials = num_trials
        self.time_out = time_out
        self.max_reward = max_reward
        # meta data
        self.metadata = {
            'search_space': train_fn.kwspaces,
            'search_strategy': searcher,
            'stop_criterion': {
                'time_limits': time_out,
                'max_reward': max_reward},
            'resources_per_trial': resource}

        self._checkpoint = checkpoint
        self._reward_attr = reward_attr
        self._time_attr = time_attr
        self.visualizer = visualizer.lower()
        if self._mxboard_enabled():
            assert checkpoint is not None, \
                "Need checkpoint to be set"
            try_import_mxboard()
            from mxboard import SummaryWriter
            self.mxboard = SummaryWriter(
                logdir=os.path.join(os.path.splitext(checkpoint)[0], 'logs'),
                flush_secs=3,
                verbose=False
            )

        self._fifo_lock = mp.Lock()
        # training_history maintains the complete history of the experiment,
        # in terms of all results obtained from the reporter. Keys are
        # str(task.task_id)
        self.training_history = OrderedDict()
        self.config_history = OrderedDict()
        # Resume experiment from checkpoint?
        if resume:
            assert checkpoint is not None, \
                "Need checkpoint to be set if resume = True"
            if os.path.isfile(checkpoint):
                self.load_state_dict(load(checkpoint))
            else:
                msg = f'checkpoint path {checkpoint} is not available for resume.'
                # logger.error, not logger.exception: no exception is being
                # handled here, so there is no traceback to attach.
                logger.error(msg)
                # NOTE(review): FileNotFoundError would describe this better;
                # the type is kept because callers may already catch it.
                raise FileExistsError(msg)
        # Needed for training_history callback mechanism, which is used to
        # serialize training_history after each
        # training_history_call_delta_secs seconds
        self._start_time = None
        self._training_history_callback_last_block = None
        self._training_history_callback_last_len = None
        self.training_history_callback = training_history_callback
        self.training_history_callback_delta_secs = \
            training_history_callback_delta_secs
        self._delay_get_config = delay_get_config

    def _mxboard_enabled(self):
        """Return True if results are mirrored to an mxboard SummaryWriter."""
        return self.visualizer in ('tensorboard', 'mxboard')

    def run(self, **kwargs):
        """Run multiple number of trials

        Relevant entries in kwargs (both default to the constructor values):
        - num_trials: maximum number of jobs
        - time_out: wall clock limit in seconds for starting jobs
        """
        # Make sure that this scheduler is configured at the searcher
        self.searcher.configure_scheduler(self)
        start_time = time.time()
        self._start_time = start_time
        num_trials = kwargs.get('num_trials', self.num_trials)
        time_out = kwargs.get('time_out', self.time_out)
        # For training_history callback mechanism:
        self._training_history_callback_last_block = -1
        self._training_history_callback_last_len = len(self.training_history)

        logger.info('Starting Experiments')
        logger.info(f'Num of Finished Tasks is {self.num_finished_tasks}')
        logger.info(f'Num of Pending Tasks is {num_trials - self.num_finished_tasks}')
        if time_out is not None:
            logger.info(f'Time out (secs) is {time_out}')
        # TODO: This bar is misleading if num_trials not set
        tbar = tqdm(range(self.num_finished_tasks, num_trials))
        for _ in tbar:
            # Compare against None rather than relying on truthiness, so that
            # a limit of 0 (e.g. max_reward=0.0 for negated losses) is honored.
            if (time_out is not None
                    and time.time() - start_time >= time_out) or \
                    (self.max_reward is not None
                     and self.get_best_reward() >= self.max_reward):
                break
            self.schedule_next()

    def save(self, checkpoint=None):
        """Save Checkpoint

        Does nothing if neither `checkpoint` nor the constructor argument of
        the same name is set.
        """
        if checkpoint is None:
            checkpoint = self._checkpoint
        if checkpoint is not None:
            # A bare filename has an empty dirname; there is nothing to create
            checkpoint_dir = os.path.dirname(checkpoint)
            if checkpoint_dir:
                mkdir(checkpoint_dir)
            save(self.state_dict(), checkpoint)

    def _create_new_task(self, config, resources=None):
        """Wrap `config` into a Task for train_fn, using default resources
        if none are given."""
        if resources is None:
            resources = DistributedResource(**self.resource)
        return Task(
            self.train_fn, {'args': self.args, 'config': config},
            resources=resources)

    def schedule_next(self):
        """Schedule next searcher suggested task
        """
        resources = DistributedResource(**self.resource)
        if self._delay_get_config:
            # Wait for available resource here, instead of in add_job. This
            # delays the get_config call until a resource is available
            FIFOScheduler.RESOURCE_MANAGER._request(resources)

        # Allow for the promotion of a previously chosen config. Also,
        # extra_kwargs contains extra info passed to both add_job and to
        # get_config (if no config is promoted)
        config, extra_kwargs = self._promote_config()
        # Time stamp to be used in get_config, and maybe in add_job
        extra_kwargs['elapsed_time'] = self._elapsed_time()
        if config is None:
            # No config to promote: Query next config to evaluate from searcher
            config = self.searcher.get_config(**extra_kwargs)
            extra_kwargs['new_config'] = True
        else:
            # This is not a new config, but a paused one which is now promoted
            extra_kwargs['new_config'] = False
        task = self._create_new_task(config, resources=resources)
        self.add_job(task, **extra_kwargs)

    def run_with_config(self, config):
        """Run with config for final fit.
        It launches a single training trial under any fixed values of the
        hyperparameters.
        For example, after HPO has identified the best hyperparameter values
        based on a hold-out dataset, one can use this function to retrain a
        model with the same hyperparameters on all the available labeled data
        (including the hold out set). It can also returns other objects or
        states.
        """
        task = self._create_new_task(config)
        reporter = FakeReporter()
        task.args['reporter'] = reporter
        return self.run_job(task)

    def _dict_from_task(self, task):
        """Return the {'TASK_ID', 'Config'} summary for a Task or task dict."""
        if isinstance(task, Task):
            return {'TASK_ID': task.task_id, 'Config': task.args['config']}
        else:
            assert isinstance(task, dict)
            return {'TASK_ID': task['TASK_ID'], 'Config': task['Config']}

    def add_job(self, task, **kwargs):
        """Adding a training task to the scheduler.

        Args:
            task (:class:`autogluon.scheduler.Task`): a new training task

        Relevant entries in kwargs:
        - bracket: HB bracket to be used. Has been sampled in _promote_config
        - new_config: If True, task starts new config eval, otherwise it
          promotes a config (only if type == 'promotion')
        Only if new_config == False:
        - config_key: Internal key for config
        - resume_from: config promoted from this milestone
        - milestone: config promoted to this milestone (next from resume_from)
        """
        cls = FIFOScheduler
        if not self._delay_get_config:
            # Wait for resource to become available here, as this has not
            # happened in schedule_next before
            cls.RESOURCE_MANAGER._request(task.resources)
        # reporter
        reporter = DistStatusReporter(remote=task.resources.node)
        task.args['reporter'] = reporter
        # Register pending evaluation
        self.searcher.register_pending(task.args['config'])
        # main process
        job = cls._start_distributed_job(task, cls.RESOURCE_MANAGER)
        # reporter thread
        rp = threading.Thread(
            target=self._run_reporter,
            args=(task, job, reporter),
            daemon=False)
        rp.start()
        task_dict = self._dict_from_task(task)
        task_dict.update({'Task': task, 'Job': job, 'ReporterThread': rp})
        # Checkpoint thread. This is also used for training_history
        # callback
        if self._checkpoint is not None or \
                self.training_history_callback is not None:
            self._add_checkpointing_to_job(job)
        with self.LOCK:
            self.scheduled_tasks.append(task_dict)

    def _clean_task_internal(self, task_dict):
        # Wait for the reporter thread started in add_job to finish
        task_dict['ReporterThread'].join()

    def _add_checkpointing_to_job(self, job):
        """Attach a done-callback to `job` which cleans up tasks, saves a
        checkpoint and possibly triggers the training_history callback."""
        def _save_checkpoint_callback(fut):
            self._cleaning_tasks()
            self.save()
            # training_history callback
            with self._fifo_lock:
                if self._trigger_training_history_callback():
                    logger.debug("Execute training_history callback")
                    self.training_history_callback(
                        self.training_history, self._start_time)

        job.add_done_callback(_save_checkpoint_callback)

    def _trigger_training_history_callback(self):
        """Decide whether training_history_callback is due.

        True only if a new block of training_history_callback_delta_secs
        seconds has started since the last trigger AND training_history has
        gained keys since then. Updates the bookkeeping when returning True.
        """
        if self.training_history_callback is None:
            return False
        assert self._training_history_callback_last_block is not None
        current_block = int(np.floor(
            self._elapsed_time() / self.training_history_callback_delta_secs))
        current_len = len(self.training_history)
        ret_val = (current_block >
                   self._training_history_callback_last_block) and \
            current_len > self._training_history_callback_last_len
        if ret_val:
            self._training_history_callback_last_block = current_block
            self._training_history_callback_last_len = current_len
        return ret_val

    def _run_reporter(self, task, task_job, reporter):
        """Reporter thread body: consume results of `task` until its job is
        done, fails or reports 'done', then pass the last result to the
        searcher."""
        last_result = None
        while not task_job.done():
            reported_result = reporter.fetch()
            # Time since start of experiment
            elapsed_time = self._elapsed_time()
            reported_result['time_since_start'] = elapsed_time
            if 'traceback' in reported_result:
                # Evaluation has failed. The traceback text comes from the
                # worker; no exception is active here, hence logger.error.
                logger.error(reported_result['traceback'])
                self.searcher.evaluation_failed(
                    config=task.args['config'], **reported_result)
                reporter.move_on()
                break
            if reported_result.get('done', False):
                reporter.move_on()
                break
            # Extra information from searcher (optional)
            dataset_size = self.searcher.dataset_size()
            if dataset_size > 0:
                reported_result['searcher_data_size'] = dataset_size
            for k, v in self.searcher.cumulative_profile_record().items():
                reported_result['searcher_profile_' + k] = v
            for k, v in self.searcher.model_parameters().items():
                reported_result['searcher_params_' + k] = v
            self._add_training_result(
                task.task_id, reported_result, config=task.args['config'])
            reporter.move_on()
            last_result = reported_result
        # Pass all of last_result to searcher
        if last_result is not None:
            self.searcher.update(config=task.args['config'], **last_result)

    def _promote_config(self):
        """
        Provides a hook in schedule_next, which allows to promote a config
        which has been selected and partially evaluated previously.

        :return: config, extra_args
        """
        config = None
        extra_args = dict()
        return config, extra_args

    def _elapsed_time(self):
        """
        :return: Time elapsed since start of experiment (see 'run')
        """
        assert self._start_time is not None, \
            "Experiment has not been started yet"
        return time.time() - self._start_time

    def get_best_config(self):
        """Get the best configuration from the finished jobs.
        """
        return self.searcher.get_best_config()

    def get_best_reward(self):
        """Get the best reward from the finished jobs.
        """
        return self.searcher.get_best_reward()

    def _mxboard_step(self, task_id, reported_result):
        """Return the global_step under which a result is logged to mxboard.

        Uses the value of time_attr if the result carries it. time_attr is
        optional for FIFO scheduling, so otherwise the number of results
        already recorded for the task is used.
        """
        step = reported_result.get(self._time_attr)
        if step is None:
            with self._fifo_lock:
                step = len(self.training_history.get(str(task_id), ()))
        return step

    def _add_training_result(self, task_id, reported_result, config=None):
        """Record `reported_result` for `task_id` in training_history (and
        `config` in config_history on first sight of the task); mirror loss
        and reward to mxboard if enabled."""
        if self._mxboard_enabled():
            # The step has to be the resource/time value: previously the
            # reward itself was passed as global_step.
            step = self._mxboard_step(task_id, reported_result)
            if 'loss' in reported_result:
                self.mxboard.add_scalar(
                    tag='loss',
                    value=(
                        f'task {task_id} valid_loss',
                        reported_result['loss']
                    ),
                    global_step=step
                )
            self.mxboard.add_scalar(
                tag=self._reward_attr,
                value=(
                    f'task {task_id} {self._reward_attr}',
                    reported_result[self._reward_attr]
                ),
                global_step=step
            )
        with self._fifo_lock:
            # Note: We store all of reported_result in training_history[task_id],
            # not just the reward value.
            task_key = str(task_id)
            new_entry = copy.copy(reported_result)
            if task_key in self.training_history:
                self.training_history[task_key].append(new_entry)
            else:
                self.training_history[task_key] = [new_entry]
                # NOTE(review): nesting of this branch was reconstructed from
                # whitespace-collapsed source - confirm against history.
                if config:
                    self.config_history[task_key] = config

    def get_training_curves(self, filename=None, plot=False, use_legend=True):
        """Get Training Curves

        Parameters
        ----------
            filename : str
                If given, the figure is saved to this file.
            plot : bool
                If True, the figure is shown.
            use_legend : bool
                If True, a legend with one entry per task is added.

        Examples
        --------
        >>> scheduler.run()
        >>> scheduler.join_jobs()
        >>> scheduler.get_training_curves(plot=True)

            .. image:: https://github.com/zhanghang1989/AutoGluonWebdata/blob/master/doc/api/autogluon.1.png?raw=true
        """
        if filename is None and not plot:
            logger.warning('Please either provide filename or allow plot in get_training_curves')
        import matplotlib.pyplot as plt
        plt.ylabel(self._reward_attr)
        plt.xlabel(self._time_attr)
        plt.title("Performance vs Training-Time in each HPO Trial")
        with self._fifo_lock:
            for task_id, task_res in self.training_history.items():
                rewards = [x[self._reward_attr] for x in task_res]
                # x axis is the index of the report within the task
                x = list(range(len(task_res)))
                plt.plot(x, rewards, label=f'task {task_id}')
        if use_legend:
            plt.legend(loc='best')
        if filename:
            logger.info(f'Saving Training Curve in {filename}')
            plt.savefig(filename)
        if plot:
            plt.show()

    def state_dict(self, destination=None):
        """
        Returns a dictionary containing a whole state of the Scheduler. This is
        used for checkpointing.

        Note that the checkpoint only contains information which has been
        registered at scheduler and searcher. It does not contain information
        about currently running jobs, except what they reported before the
        checkpoint.
        Therefore, resuming an experiment from a checkpoint is slightly
        different from continuing the experiment past the checkpoint. The
        former behaves as if all currently running jobs are terminated at
        the checkpoint, and new jobs are scheduled from there, starting from
        scheduler and searcher state according to all information recorded
        until the checkpoint.

        Examples
        --------
        >>> ag.save(scheduler.state_dict(), 'checkpoint.ag')
        """
        destination = super().state_dict(destination)
        with self._fifo_lock:
            # The result of searcher.get_state can always be pickled
            destination['searcher'] = pickle.dumps(self.searcher.get_state())
            destination['training_history'] = json.dumps(self.training_history)
            destination['config_history'] = json.dumps(self.config_history)
        if self._mxboard_enabled():
            destination['visualizer'] = json.dumps(self.mxboard._scalar_dict)
        return destination

    def load_state_dict(self, state_dict):
        """
        Load from the saved state dict. This can be used to resume an
        experiment from a checkpoint (see 'state_dict' for caveats).

        This method must only be called as part of scheduler construction.
        Calling it in the middle of an experiment can lead to an undefined
        inner state of scheduler or searcher.

        Examples
        --------
        >>> scheduler.load_state_dict(ag.load('checkpoint.ag'))
        """
        super().load_state_dict(state_dict)
        with self._fifo_lock:
            # SECURITY NOTE(review): pickle.loads executes arbitrary code from
            # its input - only load checkpoints from trusted locations.
            self.searcher = self.searcher.clone_from_state(
                pickle.loads(state_dict['searcher']))
            self.training_history = json.loads(state_dict['training_history'])
            self.config_history = json.loads(state_dict['config_history'])
        if self._mxboard_enabled():
            self.mxboard._scalar_dict = json.loads(state_dict['visualizer'])
        logger.debug(f'Loading Searcher State {self.searcher}')
def run(mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225],
        anchor_alloc_size=[256, 256],
        anchor_sizes=[32, 64, 128, 256, 512],
        anchor_size_ratios=[1, pow(2, 1 / 3), pow(2, 2 / 3)],
        anchor_aspect_ratios=[0.5, 1, 2],
        anchor_box_clip=True,
        graphviz=True,
        epoch=100,
        input_size=[512, 512],
        batch_log=100,
        batch_size=16,
        batch_interval=10,
        subdivision=4,
        train_dataset_path="Dataset/train",
        valid_dataset_path="Dataset/valid",
        multiscale=True,
        factor_scale=[8, 5],
        foreground_iou_thresh=0.5,
        background_iou_thresh=0.4,
        data_augmentation=True,
        num_workers=4,
        optimizer="ADAM",
        weight_decay=0.000001,
        save_period=5,
        load_period=10,
        learning_rate=0.001,
        decay_lr=0.999,
        decay_step=10,
        GPU_COUNT=0,
        base=0,
        AMP=True,
        valid_size=8,
        eval_period=5,
        tensorboard=True,
        valid_graph_path="valid_Graph",
        valid_html_auto_open=True,
        using_mlflow=True,
        decode_number=5000,
        multiperclass=True,
        nms_thresh=0.5,
        nms_topk=500,
        iou_thresh=0.5,
        except_class_thresh=0.05,
        plot_class_thresh=0.5):
    """Train the Efficient detector, periodically exporting and validating it.

    Stages, in order: device selection, data loaders, network creation (or
    reload from ``weights/<model>`` at ``load_period``), optimizer/trainer
    setup, then one loop over epochs ``start_epoch + 1 .. epoch`` that trains
    with gradient accumulation over ``subdivision`` sub-batches, exports
    weights every ``save_period`` epochs and, every ``eval_period`` epochs (if
    the validation folder is non-empty), computes validation losses and AP and
    optionally writes mxboard summaries.

    The list defaults are never mutated inside this function.

    Notes
    -----
    * ``AMP`` is forced to False when ``GPU_COUNT == 0``.
    * Several precondition failures terminate the process with ``exit(0)``.
    * The mxboard writer, if created, is closed when training finishes.
    * NOTE(review): block nesting was reconstructed from a
      whitespace-collapsed source; the spots where it was ambiguous are
      marked below.
    """
    if GPU_COUNT == 0:
        ctx = mx.cpu(0)
        AMP = False
    elif GPU_COUNT == 1:
        ctx = mx.gpu(0)
    else:
        ctx = [mx.gpu(i) for i in range(GPU_COUNT)]

    # Log the operating system (every branch of the former Linux/Windows/other
    # check emitted exactly this message)
    logging.info(f"{platform.system()} OS")

    # Report free/total memory for each GPU in use
    if isinstance(ctx, (list, tuple)):
        gpu_devices = list(ctx)
    elif GPU_COUNT == 1:
        gpu_devices = [ctx]
    else:
        gpu_devices = []
        logging.info(f'Running on {ctx}')
    bytes_per_gb = 1024 * 1024 * 1024
    for device_index, device in enumerate(gpu_devices):
        free_memory, total_memory = mx.context.gpu_memory_info(device_index)
        free_memory = round(free_memory / bytes_per_gb, 2)
        total_memory = round(total_memory / bytes_per_gb, 2)
        logging.info(
            f'Running on {device} / free memory : {free_memory}GB / total memory {total_memory}GB'
        )

    if GPU_COUNT > 0 and batch_size < GPU_COUNT:
        logging.info("batch size must be greater than gpu number")
        exit(0)

    if AMP:
        amp.init()

    if multiscale:
        logging.info("Using MultiScale")

    if data_augmentation:
        logging.info("Using Data Augmentation")

    logging.info("training Efficient Detector")
    input_shape = (1, 3) + tuple(input_size)

    # CPU-side network instance that is handed to the data loaders
    net = Efficient(version=base,
                    anchor_sizes=anchor_sizes,
                    anchor_size_ratios=anchor_size_ratios,
                    anchor_aspect_ratios=anchor_aspect_ratios,
                    anchor_box_clip=anchor_box_clip,
                    alloc_size=anchor_alloc_size,
                    ctx=mx.cpu())

    train_dataloader, train_dataset = traindataloader(
        multiscale=multiscale,
        factor_scale=factor_scale,
        augmentation=data_augmentation,
        path=train_dataset_path,
        input_size=input_size,
        batch_size=batch_size,
        batch_interval=batch_interval,
        num_workers=num_workers,
        shuffle=True,
        mean=mean,
        std=std,
        net=net,
        foreground_iou_thresh=foreground_iou_thresh,
        background_iou_thresh=background_iou_thresh,
        make_target=True)

    train_update_number_per_epoch = len(train_dataloader)
    if train_update_number_per_epoch < 1:
        logging.warning("train batch size가 데이터 수보다 큼")
        exit(0)

    valid_list = glob.glob(os.path.join(valid_dataset_path, "*"))
    if valid_list:
        valid_dataloader, valid_dataset = validdataloader(
            path=valid_dataset_path,
            input_size=input_size,
            batch_size=valid_size,
            num_workers=num_workers,
            shuffle=True,
            mean=mean,
            std=std,
            net=net,
            foreground_iou_thresh=foreground_iou_thresh,
            background_iou_thresh=background_iou_thresh,
            make_target=True)
        valid_update_number_per_epoch = len(valid_dataloader)
        if valid_update_number_per_epoch < 1:
            logging.warning("valid batch size가 데이터 수보다 큼")
            exit(0)

    num_classes = train_dataset.num_class  # number of classes
    name_classes = train_dataset.classes

    optimizer = optimizer.upper()
    model = f"{input_size[0]}_{input_size[1]}_{optimizer}_EFF_{base}"

    weight_path = os.path.join("weights", model)
    sym_path = os.path.join(weight_path, f'{model}-symbol.json')
    param_path = os.path.join(weight_path, f'{model}-{load_period:04d}.params')
    optimizer_path = os.path.join(weight_path, f'{model}-{load_period:04d}.opt')

    if os.path.exists(param_path) and os.path.exists(sym_path):
        start_epoch = load_period
        logging.info(f"loading {os.path.basename(param_path)}\n")
        net = gluon.SymbolBlock.imports(sym_path, ['data'], param_path, ctx=ctx)
    else:
        start_epoch = 0
        net = Efficient(
            version=base,
            input_size=input_size,
            anchor_sizes=anchor_sizes,
            anchor_size_ratios=anchor_size_ratios,
            anchor_aspect_ratios=anchor_aspect_ratios,
            num_classes=num_classes,  # foreground classes only
            anchor_box_clip=anchor_box_clip,
            alloc_size=anchor_alloc_size,
            ctx=ctx)

        # NOTE(review): summary + hybridize are placed inside this branch
        # (fresh network only); the original nesting was not recoverable.
        if isinstance(ctx, (list, tuple)):
            net.summary(mx.nd.ones(shape=input_shape, ctx=ctx[0]))
        else:
            net.summary(mx.nd.ones(shape=input_shape, ctx=ctx))

        # hybridize arguments:
        #   active (bool, default True) - Whether to turn hybrid on or off.
        #   static_alloc (bool, default False) - Statically allocate memory to
        #       improve speed. Memory usage may increase.
        #   static_shape (bool, default False) - Optimize for invariant input
        #       shapes between iterations. Must also set static_alloc to True.
        #       Change of input shapes is still allowed but slower.
        # Multiscale training changes the input shape, so static_shape is
        # only enabled without it.
        net.hybridize(active=True, static_alloc=True,
                      static_shape=not multiscale)

    if start_epoch + 1 >= epoch + 1:
        logging.info("this model has already been optimized")
        exit(0)

    if tensorboard:
        summary = SummaryWriter(logdir=os.path.join("mxboard", model),
                                max_queue=10,
                                flush_secs=10,
                                verbose=False)
        if isinstance(ctx, (list, tuple)):
            net.forward(mx.nd.ones(shape=input_shape, ctx=ctx[0]))
        else:
            net.forward(mx.nd.ones(shape=input_shape, ctx=ctx))
        summary.add_graph(net)

    if graphviz:
        gluoncv.utils.viz.plot_network(net, shape=input_shape,
                                       save_prefix=model)

    # optimizer
    unit = max(1, len(train_dataset) // batch_size)
    step = unit * decay_step
    lr_sch = mx.lr_scheduler.FactorScheduler(step=step,
                                             factor=decay_lr,
                                             stop_factor_lr=1e-12,
                                             base_lr=learning_rate)

    # Accumulate gradients across the sub-batches of one update
    for p in net.collect_params().values():
        if p.grad_req != "null":
            p.grad_req = 'add'

    # Optimizer-specific hyper-parameters; everything else is shared
    optimizer_extras = {
        "ADAM": {"beta1": 0.9, "beta2": 0.999},
        "RMSPROP": {"gamma1": 0.9, "gamma2": 0.999},
        "SGD": {"momentum": 0.9},
    }
    if optimizer not in optimizer_extras:
        logging.error("optimizer not selected")
        exit(0)

    optimizer_params = {
        "learning_rate": learning_rate,
        "lr_scheduler": lr_sch,
        "wd": weight_decay,
    }
    optimizer_params.update(optimizer_extras[optimizer])
    optimizer_params['multi_precision'] = False

    # update_on_kvstore : bool, default None
    #   Whether to perform parameter updates on kvstore. If None, then trainer
    #   will choose the more suitable option depending on the type of kvstore.
    #   If the `update_on_kvstore` argument is provided, environment variable
    #   `MXNET_UPDATE_ON_KVSTORE` will be ignored.
    # It is forced to False under AMP for dynamic loss scaling.
    trainer = gluon.Trainer(net.collect_params(),
                            optimizer,
                            optimizer_params=optimizer_params,
                            update_on_kvstore=False if AMP else None)

    if AMP:
        amp.init_trainer(trainer)

    # load optimizer states
    if os.path.exists(optimizer_path):
        try:
            trainer.load_states(optimizer_path)
        except Exception as E:
            logging.info(E)
        else:
            logging.info(f"loading {os.path.basename(optimizer_path)}\n")

    # localization loss -> Smooth L1 loss
    # confidence loss -> Focal
    confidence_loss = FocalLoss(alpha=0.25,
                                gamma=2,
                                sparse_label=True,
                                from_sigmoid=False,
                                batch_axis=None,
                                num_class=num_classes,
                                reduction="sum",
                                exclude=False)

    localization_loss = HuberLoss(rho=1,
                                  batch_axis=None,
                                  reduction="sum",
                                  exclude=False)

    prediction = Prediction(batch_size=batch_size,
                            from_sigmoid=False,
                            num_classes=num_classes,
                            decode_number=decode_number,
                            nms_thresh=nms_thresh,
                            nms_topk=nms_topk,
                            except_class_thresh=except_class_thresh,
                            multiperclass=multiperclass)

    precision_recall = Voc_2007_AP(iou_thresh=iou_thresh,
                                   class_names=name_classes)

    ctx_list = ctx if isinstance(ctx, (list, tuple)) else [ctx]
    start_time = time.time()

    for i in tqdm(range(start_epoch + 1, epoch + 1, 1),
                  initial=start_epoch + 1,
                  total=epoch):

        conf_loss_sum = 0
        loc_loss_sum = 0
        time_stamp = time.time()

        for batch_count, (image, _, cls_all, box_all, _) in enumerate(
                train_dataloader, start=1):
            td_batch_size = image.shape[0]

            image = mx.nd.split(data=image, num_outputs=subdivision, axis=0)
            cls_all = mx.nd.split(data=cls_all, num_outputs=subdivision, axis=0)
            box_all = mx.nd.split(data=box_all, num_outputs=subdivision, axis=0)

            # split returns a bare NDArray instead of a list for one output
            if subdivision == 1:
                image = [image]
                cls_all = [cls_all]
                box_all = [box_all]

            # autograd reference:
            # https://mxnet.apache.org/api/python/docs/tutorials/getting-started/crash-course/3-autograd.html
            with autograd.record(train_mode=True):

                cls_all_losses = []
                box_all_losses = []

                for image_split, cls_split, box_split in zip(
                        image, cls_all, box_all):

                    image_split = gluon.utils.split_and_load(
                        image_split, ctx_list, even_split=False)
                    cls_split = gluon.utils.split_and_load(
                        cls_split, ctx_list, even_split=False)
                    box_split = gluon.utils.split_and_load(
                        box_split, ctx_list, even_split=False)

                    # prediction, target space for Data Parallelism
                    cls_losses = []
                    box_losses = []
                    total_loss = []

                    # one iteration per device (Data Parallelism)
                    for img, cls_target, box_target in zip(
                            image_split, cls_split, box_split):
                        cls_pred, box_pred, anchor = net(img)
                        except_ignore_samples = cls_target > -1
                        positive_samples = cls_target > 0
                        positive_numbers = positive_samples.sum()

                        conf_loss = confidence_loss(
                            cls_pred, cls_target,
                            except_ignore_samples.expand_dims(axis=-1))
                        # +1 keeps the divisor non-zero without positives
                        conf_loss = mx.nd.divide(conf_loss,
                                                 positive_numbers + 1)
                        cls_losses.append(conf_loss.asscalar())

                        loc_loss = localization_loss(
                            box_pred, box_target,
                            positive_samples.expand_dims(axis=-1))
                        box_losses.append(loc_loss.asscalar())

                        total_loss.append(conf_loss + loc_loss)

                    # NOTE(review): backward is taken per sub-batch, inside
                    # the record scope - nesting reconstructed, confirm.
                    if AMP:
                        with amp.scale_loss(total_loss, trainer) as scaled_loss:
                            autograd.backward(scaled_loss)
                    else:
                        autograd.backward(total_loss)

                    cls_all_losses.append(sum(cls_losses))
                    box_all_losses.append(sum(box_losses))

            trainer.step(batch_size=td_batch_size, ignore_stale_grad=False)
            # clear the accumulated gradients (grad_req is 'add')
            for p in net.collect_params().values():
                p.zero_grad()

            conf_loss_sum += sum(cls_all_losses) / td_batch_size
            loc_loss_sum += sum(box_all_losses) / td_batch_size

            if batch_count % batch_log == 0:
                logging.info(
                    f'[Epoch {i}][Batch {batch_count}/{train_update_number_per_epoch}],'
                    f'[Speed {td_batch_size / (time.time() - time_stamp):.3f} samples/sec],'
                    f'[Lr = {trainer.learning_rate}]'
                    f'[confidence loss = {sum(cls_all_losses) / td_batch_size:.3f}]'
                    f'[localization loss = {sum(box_all_losses) / td_batch_size:.3f}]'
                )
            # NOTE(review): reset per batch (not per log line) - confirm.
            time_stamp = time.time()

        train_conf_loss_mean = np.divide(conf_loss_sum,
                                         train_update_number_per_epoch)
        train_loc_loss_mean = np.divide(loc_loss_sum,
                                        train_update_number_per_epoch)
        train_total_loss_mean = train_conf_loss_mean + train_loc_loss_mean

        logging.info(
            f"train confidence loss : {train_conf_loss_mean} / train localization loss : {train_loc_loss_mean} / train total loss : {train_total_loss_mean}"
        )

        if i % save_period == 0:

            weight_epoch_path = os.path.join(weight_path, str(i))
            os.makedirs(weight_epoch_path, exist_ok=True)

            # save optimizer states
            try:
                trainer.save_states(
                    os.path.join(weight_path, f'{model}-{i:04d}.opt'))
            except Exception as E:
                logging.error(f"optimizer weight export 예외 발생 : {E}")
            else:
                logging.info("optimizer weight export 성공")

            # Hybrid models can be serialized as JSON files using the export
            # function. Export HybridBlock to json format that can be loaded
            # by SymbolBlock.imports, mxnet.mod.Module or the C++ interface.
            # When there is only one input, it will have name data. When there
            # are more than one inputs, they will be named data0, data1, etc.
            if GPU_COUNT >= 1:
                context = mx.gpu(0)
            else:
                context = mx.cpu(0)

            # With AMP on mxnet 1.6.0, reusing the `prediction` block declared
            # above can cause problems (seen with YOLOv3 / Gaussian YOLOv3);
            # on mxnet 1.5.x it works without re-declaring. Blocks are passed
            # by reference, so when export_block_for_cplusplus hybridizes the
            # block it receives, the earlier `prediction` would be hybridized
            # too. Hence a fresh instance is created for export.
            auxnet = Prediction(from_sigmoid=False,
                                num_classes=num_classes,
                                decode_number=decode_number,
                                nms_thresh=nms_thresh,
                                nms_topk=nms_topk,
                                except_class_thresh=except_class_thresh,
                                multiperclass=multiperclass)
            postnet = PostNet(net=net, auxnet=auxnet)

            try:
                net.export(os.path.join(weight_path, f"{model}"),
                           epoch=i,
                           remove_amp_cast=True)
                # plain parameter file, for ONNX extraction
                net.save_parameters(os.path.join(weight_path, f"{i}.params"))

                # bundles network inference, decoder and nms - convenient for
                # the mxnet C++ API
                export_block_for_cplusplus(
                    path=os.path.join(weight_epoch_path, f"{model}_prepost"),
                    block=postnet,
                    data_shape=tuple(input_size) + tuple((3, )),
                    epoch=i,
                    # with preprocess, C++ inference can feed the image
                    # exactly as read by OpenCV
                    preprocess=True,
                    layout='HWC',
                    ctx=context,
                    remove_amp_cast=True)
            except Exception as E:
                logging.error(f"json, param model export 예외 발생 : {E}")
            else:
                logging.info("json, param model export 성공")
                net.collect_params().reset_ctx(ctx)

        if i % eval_period == 0 and valid_list:

            conf_loss_sum = 0
            loc_loss_sum = 0

            # compute validation losses
            for image, label, cls_all, box_all, _ in valid_dataloader:

                vd_batch_size = image.shape[0]

                image = gluon.utils.split_and_load(image, ctx_list,
                                                   even_split=False)
                label = gluon.utils.split_and_load(label, ctx_list,
                                                   even_split=False)
                cls_all = gluon.utils.split_and_load(cls_all, ctx_list,
                                                     even_split=False)
                box_all = gluon.utils.split_and_load(box_all, ctx_list,
                                                     even_split=False)

                # prediction, target space for Data Parallelism
                cls_losses = []
                box_losses = []

                # one iteration per device (Data Parallelism)
                for img, lb, cls_target, box_target in zip(
                        image, label, cls_all, box_all):
                    gt_box = lb[:, :, :4]
                    gt_id = lb[:, :, 4:5]
                    cls_pred, box_pred, anchor = net(img)
                    pred_id, pred_score, pred_bbox = prediction(
                        cls_pred, box_pred, anchor)

                    precision_recall.update(pred_bboxes=pred_bbox,
                                            pred_labels=pred_id,
                                            pred_scores=pred_score,
                                            gt_boxes=gt_box,
                                            gt_labels=gt_id)

                    except_ignore_samples = cls_target > -1
                    positive_samples = cls_target > 0
                    positive_numbers = positive_samples.sum()

                    conf_loss = confidence_loss(
                        cls_pred, cls_target,
                        except_ignore_samples.expand_dims(axis=-1))
                    conf_loss = mx.nd.divide(conf_loss, positive_numbers + 1)
                    cls_losses.append(conf_loss.asscalar())

                    loc_loss = localization_loss(
                        box_pred, box_target,
                        positive_samples.expand_dims(axis=-1))
                    box_losses.append(loc_loss.asscalar())

                conf_loss_sum += sum(cls_losses) / vd_batch_size
                loc_loss_sum += sum(box_losses) / vd_batch_size

            valid_conf_loss_mean = np.divide(conf_loss_sum,
                                             valid_update_number_per_epoch)
            valid_loc_loss_mean = np.divide(loc_loss_sum,
                                            valid_update_number_per_epoch)
            valid_total_loss_mean = valid_conf_loss_mean + valid_loc_loss_mean

            logging.info(
                f"valid confidence loss : {valid_conf_loss_mean} / valid localization loss : {valid_loc_loss_mean} / valid total loss : {valid_total_loss_mean}"
            )

            AP_appender = []
            round_position = 2
            class_name, precision, recall, true_positive, false_positive, threshold = \
                precision_recall.get_PR_list()
            for j, (c, p, r) in enumerate(zip(class_name, precision, recall)):
                name, AP = precision_recall.get_AP(c, p, r)
                logging.info(
                    f"class {j}'s {name} AP : {round(AP * 100, round_position)}%"
                )
                AP_appender.append(AP)

            # NaN APs count as 0 towards the mean
            AP_appender = np.nan_to_num(AP_appender)
            mAP_result = np.mean(AP_appender)

            logging.info(f"mAP : {round(mAP_result * 100, round_position)}%")
            precision_recall.get_PR_curve(name=class_name,
                                          precision=precision,
                                          recall=recall,
                                          threshold=threshold,
                                          AP=AP_appender,
                                          mAP=mAP_result,
                                          folder_name=valid_graph_path,
                                          epoch=i,
                                          auto_open=valid_html_auto_open)
            precision_recall.reset()

            if tensorboard:
                # draw the first validation batch, one slice per device
                # (Data Parallelism)
                dataloader_iter = iter(valid_dataloader)
                image, label, _, _, _ = next(dataloader_iter)

                image = gluon.utils.split_and_load(image, ctx_list,
                                                   even_split=False)
                label = gluon.utils.split_and_load(label, ctx_list,
                                                   even_split=False)

                # all ground-truth boxes share one color
                ground_truth_colors = {k: (0, 1, 0) for k in range(num_classes)}

                batch_image = []
                for img, lb in zip(image, label):
                    gt_boxes = lb[:, :, :4]
                    gt_ids = lb[:, :, 4:5]
                    cls_pred, box_pred, anchor = net(img)
                    ids, scores, bboxes = prediction(cls_pred, box_pred, anchor)

                    for ig, gt_id, gt_box, pred_id, pred_score, pred_bbox in zip(
                            img, gt_ids, gt_boxes, ids, scores, bboxes):
                        # undo normalization: CHW -> HWC, * std + mean
                        ig = ig.transpose((1, 2, 0)) * mx.nd.array(
                            std, ctx=ig.context) + mx.nd.array(mean,
                                                               ctx=ig.context)
                        ig = (ig * 255).clip(0, 255)
                        ig = ig.astype(np.uint8)

                        # draw ground truth boxes
                        ground_truth = plot_bbox(
                            ig,
                            gt_box,
                            scores=None,
                            labels=gt_id,
                            thresh=None,
                            reverse_rgb=False,
                            class_names=valid_dataset.classes,
                            absolute_coordinates=True,
                            colors=ground_truth_colors)
                        # draw predicted boxes on top
                        prediction_box = plot_bbox(
                            ground_truth,
                            pred_bbox,
                            scores=pred_score,
                            labels=pred_id,
                            thresh=plot_class_thresh,
                            reverse_rgb=False,
                            class_names=valid_dataset.classes,
                            absolute_coordinates=True)

                        # (height, width, channel) -> (channel, height, width)
                        # as passed to add_image below
                        prediction_box = np.transpose(prediction_box,
                                                      axes=(2, 0, 1))
                        # collected as (batch, channel, height, width)
                        batch_image.append(prediction_box)

                summary.add_image(tag="valid_result",
                                  image=np.array(batch_image),
                                  global_step=i)
                summary.add_scalar(tag="conf_loss",
                                   value={
                                       "train_conf_loss": train_conf_loss_mean,
                                       "valid_conf_loss": valid_conf_loss_mean
                                   },
                                   global_step=i)
                summary.add_scalar(tag="loc_loss",
                                   value={
                                       "train_loc_loss": train_loc_loss_mean,
                                       "valid_loc_loss": valid_loc_loss_mean
                                   },
                                   global_step=i)
                summary.add_scalar(tag="total_loss",
                                   value={
                                       "train_total_loss": train_total_loss_mean,
                                       "valid_total_loss": valid_total_loss_mean
                                   },
                                   global_step=i)

                for p in net.collect_params().values():
                    summary.add_histogram(tag=p.name,
                                          values=p.data(ctx=ctx_list[0]),
                                          global_step=i,
                                          bins='default')

    end_time = time.time()
    learning_time = end_time - start_time
    logging.info(f"learning time : 약, {learning_time / 3600:0.2f}H")
    logging.info("optimization completed")

    # Flush and release the event file; it was previously left open
    if tensorboard:
        summary.close()

    if using_mlflow:
        ml.log_metric("learning time", round(learning_time / 3600, 2))