def worker(max_err):
    net = MnistNet(has_bn=True)
    net.load_state_dict(checkpoint["net_init"])
    lr = checkpoint["sgd_lr"]
    opt = SGD(net.parameters(), lr=lr)

    gm = ad.GradManager().attach(
        net.parameters(), callbacks=[dist.make_allreduce_cb("MEAN", dist.WORLD)]
    )

    # use the same data and label on every GPU
    # so that the result does not depend on the number of GPUs
    data_train = Tensor(data)
    label_train = Tensor(label)

    loss = train(data_train, label_train, net, opt, gm)
    np.testing.assert_allclose(loss.numpy(), checkpoint["loss"], atol=max_err)

    if dist.get_rank():
        return

    for param, param_ref in zip(
        net.state_dict().items(), checkpoint["net_updated"].items()
    ):
        assert param[0] == param_ref[0]
        if "bn" in param[0]:
            ref = param_ref[1].reshape(param[1].shape)
            np.testing.assert_allclose(param[1], ref, atol=max_err)
        else:
            np.testing.assert_allclose(param[1], param_ref[1], atol=max_err)
def train_and_evaluate(model, manager):
    rank = dist.get_rank()

    # reload weights from restore_file if specified
    if args.restore_file is not None:
        manager.load_checkpoints()

    world_size = dist.get_world_size()
    if world_size > 1:
        dist.bcast_list_(model.parameters())
        dist.bcast_list_(model.buffers())

    gm = GradManager().attach(
        model.parameters(),
        callbacks=dist.make_allreduce_cb("SUM") if world_size > 1 else None,
    )

    for epoch in range(manager.params.num_epochs):
        # train for one epoch (one full pass over the training set)
        train(model, manager, gm)

        # evaluate for one epoch on the validation set
        evaluate(model, manager)

        # save the best model weights according to params.major_metric
        if rank == 0:
            manager.check_best_save_last_checkpoints(latest_freq=5)
def worker(args):
    current_network = import_from_file(args.file)

    model = current_network.Net(current_network.Cfg())
    model.train()

    if dist.get_rank() == 0:
        logger.info(get_config_info(model.cfg))
        logger.info(repr(model))

    params_with_grad = []
    for name, param in model.named_parameters():
        if "bottom_up.conv1" in name and model.cfg.backbone_freeze_at >= 1:
            continue
        if "bottom_up.layer1" in name and model.cfg.backbone_freeze_at >= 2:
            continue
        params_with_grad.append(param)

    opt = SGD(
        params_with_grad,
        lr=model.cfg.basic_lr * args.batch_size,
        momentum=model.cfg.momentum,
        weight_decay=model.cfg.weight_decay * dist.get_world_size(),
    )

    gm = GradManager()
    if dist.get_world_size() > 1:
        gm.attach(
            params_with_grad,
            callbacks=[dist.make_allreduce_cb("SUM", dist.WORLD)],
        )
    else:
        gm.attach(params_with_grad)

    if args.weight_file is not None:
        # model.backbone.bottom_up.load_state_dict(weights, strict=False)
        logger.info("Loading Base-Pretrain weights...")
        weights = mge.load(args.weight_file)
        weight_new = {k: v for k, v in weights.items() if "pred_" not in k}
        model.load_state_dict(weight_new, strict=False)

    if dist.get_world_size() > 1:
        dist.bcast_list_(model.parameters(), dist.WORLD)  # sync parameters

    if dist.get_rank() == 0:
        logger.info("Prepare dataset")
    train_loader = iter(build_dataloader(args.batch_size, args.dataset_dir, model.cfg))

    for epoch in range(model.cfg.max_epoch):
        train_one_epoch(model, train_loader, opt, gm, epoch, args)
        if dist.get_rank() == 0:
            save_path = "logs/{}/epoch_{}.pkl".format(
                os.path.basename(args.file).split(".")[0], epoch
            )
            mge.save(
                {"epoch": epoch, "state_dict": model.state_dict()},
                save_path,
            )
            logger.info("dump weights to %s", save_path)
def run_syncbn(trace_mode):
    x = F.ones([2, 16, 4, 4], dtype="float32")

    net = Sequential(
        Conv2d(16, 16, 1),
        SyncBatchNorm(16),
        Conv2d(16, 16, 1),
        SyncBatchNorm(16),
    )

    gm = ad.GradManager().attach(
        net.parameters(), callbacks=dist.make_allreduce_cb("MEAN")
    )
    opt = optimizer.SGD(net.parameters(), 1e-3)

    def train_func(x):
        with gm:
            y = net(x)
            loss = y.mean()
            gm.backward(loss)
            opt.step().clear_grad()
        return loss

    if trace_mode is not None:
        train_func = trace(train_func, symbolic=trace_mode)

    for _ in range(3):
        loss = train_func(x)
        loss.numpy()
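# Hedged sketch (not part of the original snippets): per-process functions such as
# run_syncbn above are meant to be executed once in every worker process. One common
# way to spawn them in MegEngine is the dist.launcher decorator; the n_gpus value and
# the main() wrapper here are assumptions for illustration only.
import megengine.distributed as dist


@dist.launcher(n_gpus=2)
def main():
    # each spawned process runs the same body with its own rank and device
    run_syncbn(trace_mode=None)


if __name__ == "__main__":
    main()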
def worker():
    net = Simple()
    opt = SGD(net.parameters(), lr=0.1)

    gm = ad.GradManager().attach(
        net.parameters(), callbacks=[dist.make_allreduce_cb("MEAN", dist.WORLD)]
    )

    opt.clear_grad()
    with gm:
        x = tensor(data)
        loss = net(x)
        loss = loss.sum()
        gm.backward(loss)

    for p in net.params:
        np.testing.assert_equal(p.grad.numpy(), 1)
def update_model(model_path):
    """
    Update the dumped model with test cases for new reference values.

    The model with pre-trained weights is trained for one iteration with the
    attached test data, and the loss and updated net state dict are dumped.

    .. code-block:: python

        from test_dp_correctness import update_model
        update_model('mnist_model_with_test.mge')      # for gpu
        update_model('mnist_model_with_test_cpu.mge')  # for cpu
    """
    net = MnistNet(has_bn=True)

    checkpoint = mge.load(model_path)
    net.load_state_dict(checkpoint["net_init"])
    lr = checkpoint["sgd_lr"]
    opt = SGD(net.parameters(), lr=lr)

    gm = ad.GradManager().attach(
        net.parameters(), callbacks=[dist.make_allreduce_cb("MEAN", dist.WORLD)]
    )

    data = Tensor(checkpoint["data"], dtype=np.float32)
    label = Tensor(checkpoint["label"], dtype=np.int32)

    opt.clear_grad()
    loss = train(data, label, net=net, opt=opt)
    opt.step()

    xpu_name = get_xpu_name()

    checkpoint.update(
        {"net_updated": net.state_dict(), "loss": loss.numpy(), "xpu": xpu_name}
    )
    mge.serialization.save(checkpoint, model_path)
def worker():
    net = Simple(param_shape)
    opt = SGD(net.parameters(), lr=0.1)

    allreduce_cb = dist.make_allreduce_cb("MEAN", dist.WORLD)
    if threshold is not None:
        allreduce_cb._param_pack_thd = threshold
    gm = ad.GradManager().attach(net.parameters(), callbacks=[allreduce_cb])

    def run():
        opt.clear_grad()
        with gm:
            x = tensor(data)
            loss = net(x)
            loss = loss.sum()
            gm.backward(loss)

    for i in range(n_iters):
        run()

    for p in net.params:
        np.testing.assert_equal(p.grad.numpy(), np.ones_like(p.grad.numpy()))
def worker(args):
    current_network = import_from_file(args.file)

    model = current_network.Net(current_network.Cfg())
    model.train()

    if dist.get_rank() == 0:
        logger.info(get_config_info(model.cfg))
        logger.info(repr(model))

    backbone_params = []
    head_params = []
    for name, param in model.named_parameters():
        if "backbone" in name:
            backbone_params.append(param)
        else:
            head_params.append(param)

    opt = SGD(
        [
            {"params": backbone_params, "lr": model.cfg.learning_rate * 0.1},
            {"params": head_params},
        ],
        lr=model.cfg.learning_rate,
        momentum=model.cfg.momentum,
        weight_decay=model.cfg.weight_decay * dist.get_world_size(),
    )

    gm = GradManager()
    if dist.get_world_size() > 1:
        gm.attach(
            model.parameters(),
            callbacks=[dist.make_allreduce_cb("SUM", dist.WORLD)],
        )
    else:
        gm.attach(model.parameters())

    cur_epoch = 0
    if args.resume is not None:
        pretrained = mge.load(args.resume)
        cur_epoch = pretrained["epoch"] + 1
        model.load_state_dict(pretrained["state_dict"])
        opt.load_state_dict(pretrained["opt"])
        if dist.get_rank() == 0:
            logger.info("load success: epoch %d", cur_epoch)

    if dist.get_world_size() > 1:
        dist.bcast_list_(model.parameters(), dist.WORLD)  # sync parameters

    if dist.get_rank() == 0:
        logger.info("Prepare dataset")
    train_loader = iter(build_dataloader(model.cfg.batch_size, args.dataset_dir, model.cfg))

    for epoch in range(cur_epoch, model.cfg.max_epoch):
        train_one_epoch(model, train_loader, opt, gm, epoch)
        if dist.get_rank() == 0:
            save_path = "log-of-{}/epoch_{}.pkl".format(
                os.path.basename(args.file).split(".")[0], epoch
            )
            mge.save(
                {
                    "epoch": epoch,
                    "state_dict": model.state_dict(),
                    "opt": opt.state_dict(),
                },
                save_path,
            )
            logger.info("dump weights to %s", save_path)
def build_gradmanager(module):
    world_size = dist.get_world_size()
    gm = GradManager().attach(
        module.parameters(),
        callbacks=dist.make_allreduce_cb("SUM") if world_size > 1 else None,
    )
    return gm
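# Hedged usage sketch for build_gradmanager: `module`, `data`, `label` and the
# optimizer settings are placeholders, not taken from the surrounding code.
import megengine.functional as F
from megengine.optimizer import SGD

gm = build_gradmanager(module)   # attaches the allreduce callback only when world_size > 1
opt = SGD(module.parameters(), lr=0.1)

with gm:                         # record the forward pass for autodiff
    loss = F.nn.cross_entropy(module(data), label)
    gm.backward(loss)            # per-rank gradients are summed ("SUM") across workers here
opt.step().clear_grad()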
def worker(rank, world_size, ngpus_per_node, args):
    # pylint: disable=too-many-statements
    if rank == 0:
        os.makedirs(os.path.join(args.save, args.arch), exist_ok=True)
        megengine.logger.set_log_file(os.path.join(args.save, args.arch, "log.txt"))

    # init process group
    if world_size > 1:
        dist.init_process_group(
            master_ip=args.dist_addr,
            port=args.dist_port,
            world_size=world_size,
            rank=rank,
            device=rank % ngpus_per_node,
            backend="nccl",
        )
        logging.info(
            "init process group rank %d / %d", dist.get_rank(), dist.get_world_size()
        )

    # build dataset
    train_dataloader, valid_dataloader = build_dataset(args)
    train_queue = iter(train_dataloader)  # infinite
    steps_per_epoch = 1280000 // (world_size * args.batch_size)

    # build model
    model = resnet_model.__dict__[args.arch]()

    # Sync parameters
    if world_size > 1:
        dist.bcast_list_(model.parameters(), dist.WORLD)

    # Autodiff gradient manager
    gm = autodiff.GradManager().attach(
        model.parameters(),
        callbacks=dist.make_allreduce_cb("SUM") if world_size > 1 else None,
    )

    # Optimizer
    opt = optim.SGD(
        model.parameters(),
        lr=args.lr,
        momentum=args.momentum,
        weight_decay=args.weight_decay * world_size,  # scale weight decay in "SUM" mode
    )

    # train and valid func
    def train_step(image, label):
        with gm:
            logits = model(image)
            loss = F.nn.cross_entropy(logits, label)
            acc1, acc5 = F.topk_accuracy(logits, label, topk=(1, 5))
            gm.backward(loss)
            opt.step().clear_grad()
        return loss, acc1, acc5

    def valid_step(image, label):
        logits = model(image)
        loss = F.nn.cross_entropy(logits, label)
        acc1, acc5 = F.topk_accuracy(logits, label, topk=(1, 5))
        # calculate mean values
        if world_size > 1:
            loss = F.distributed.all_reduce_sum(loss) / world_size
            acc1 = F.distributed.all_reduce_sum(acc1) / world_size
            acc5 = F.distributed.all_reduce_sum(acc5) / world_size
        return loss, acc1, acc5

    # multi-step learning rate scheduler with warmup
    def adjust_learning_rate(step):
        lr = args.lr * 0.1 ** bisect.bisect_right(
            [30 * steps_per_epoch, 60 * steps_per_epoch, 80 * steps_per_epoch], step
        )
        if step < 5 * steps_per_epoch:  # warmup
            lr = args.lr * (step / (5 * steps_per_epoch))
        for param_group in opt.param_groups:
            param_group["lr"] = lr
        return lr

    # start training
    objs = AverageMeter("Loss")
    top1 = AverageMeter("Acc@1")
    top5 = AverageMeter("Acc@5")
    clck = AverageMeter("Time")

    for step in range(0, args.epochs * steps_per_epoch):
        lr = adjust_learning_rate(step)

        t = time.time()

        image, label = next(train_queue)
        image = megengine.tensor(image, dtype="float32")
        label = megengine.tensor(label, dtype="int32")

        loss, acc1, acc5 = train_step(image, label)

        objs.update(loss.item())
        top1.update(100 * acc1.item())
        top5.update(100 * acc5.item())
        clck.update(time.time() - t)

        if step % args.print_freq == 0 and dist.get_rank() == 0:
            logging.info(
                "Epoch %d Step %d, LR %.4f, %s %s %s %s",
                step // steps_per_epoch,
                step,
                lr,
                objs,
                top1,
                top5,
                clck,
            )
            objs.reset()
            top1.reset()
            top5.reset()
            clck.reset()

        if (step + 1) % steps_per_epoch == 0:
            model.eval()
            _, valid_acc1, valid_acc5 = valid(valid_step, valid_dataloader, args)
            model.train()
            logging.info(
                "Epoch %d Test Acc@1 %.3f, Acc@5 %.3f",
                (step + 1) // steps_per_epoch,
                valid_acc1,
                valid_acc5,
            )
            megengine.save(
                {
                    "epoch": (step + 1) // steps_per_epoch,
                    "state_dict": model.state_dict(),
                },
                os.path.join(args.save, args.arch, "checkpoint.pkl"),
            )
def worker(master_ip, port, rank, world_size, args):
    if world_size > 1:
        # Initialize distributed process group
        logger.info("init distributed process group {} / {}".format(rank, world_size))
        dist.init_process_group(
            master_ip=master_ip,
            port=port,
            world_size=world_size,
            rank=rank,
            device=rank,
        )

    model_name = "{}_{}x{}".format(args.arch, cfg.input_shape[0], cfg.input_shape[1])
    save_dir = os.path.join(args.save, model_name)

    model = getattr(kpm, args.arch)()
    model.train()
    start_epoch = 0
    if args.resume is not None:
        file = mge.load(args.resume)
        model.load_state_dict(file["state_dict"])
        start_epoch = file["epoch"]

    optimizer = optim.Adam(
        model.parameters(), lr=cfg.initial_lr, weight_decay=cfg.weight_decay
    )

    gm = GradManager()
    if dist.get_world_size() > 1:
        gm.attach(
            model.parameters(),
            callbacks=[dist.make_allreduce_cb("SUM", dist.WORLD)],
        )
    else:
        gm.attach(model.parameters())

    if dist.get_world_size() > 1:
        dist.bcast_list_(model.parameters(), dist.WORLD)  # sync parameters

    # Build train datasets
    logger.info("preparing dataset..")
    ann_file = os.path.join(
        cfg.data_root, "annotations", "person_keypoints_train2017.json"
    )
    train_dataset = COCOJoints(
        cfg.data_root,
        ann_file,
        image_set="train2017",
        order=("image", "keypoints", "boxes", "info"),
    )
    logger.info("Num of Samples: {}".format(len(train_dataset)))
    train_sampler = data.RandomSampler(
        train_dataset, batch_size=cfg.batch_size, drop_last=True
    )

    transforms = [
        T.Normalize(mean=cfg.img_mean, std=cfg.img_std),
        RandomHorizontalFlip(0.5, keypoint_flip_order=cfg.keypoint_flip_order),
    ]
    if cfg.half_body_transform:
        transforms.append(
            HalfBodyTransform(
                cfg.upper_body_ids, cfg.lower_body_ids, cfg.prob_half_body
            )
        )
    if cfg.extend_boxes:
        transforms.append(
            ExtendBoxes(cfg.x_ext, cfg.y_ext, cfg.input_shape[1] / cfg.input_shape[0])
        )
    transforms += [
        RandomBoxAffine(
            degrees=cfg.rotate_range,
            scale=cfg.scale_range,
            output_shape=cfg.input_shape,
            rotate_prob=cfg.rotation_prob,
            scale_prob=cfg.scale_prob,
        )
    ]
    transforms += [T.ToMode()]

    train_queue = data.DataLoader(
        train_dataset,
        sampler=train_sampler,
        num_workers=args.workers,
        transform=T.Compose(transforms=transforms, order=train_dataset.order),
        collator=HeatmapCollator(
            cfg.input_shape,
            cfg.output_shape,
            cfg.keypoint_num,
            cfg.heat_thr,
            cfg.heat_kernels if args.multi_scale_supervision else cfg.heat_kernels[-1:],
            cfg.heat_range,
        ),
    )

    # Start training
    for epoch in range(start_epoch, cfg.epochs):
        loss = train(model, train_queue, optimizer, gm, epoch=epoch)
        logger.info("Epoch %d Train %.6f ", epoch, loss)

        if rank == 0 and epoch % cfg.save_freq == 0:
            # save checkpoint
            mge.save(
                {"epoch": epoch + 1, "state_dict": model.state_dict()},
                os.path.join(save_dir, "epoch_{}.pkl".format(epoch)),
            )
def worker(master_ip, port, world_size, rank, configs):
    if world_size > 1:
        dist.init_process_group(
            master_ip=master_ip,
            port=port,
            world_size=world_size,
            rank=rank,
            device=rank,
        )
        logger.info("init process group for gpu{} done".format(rank))

    # set up logger
    os.makedirs(configs["base_dir"], exist_ok=True)
    worklog_path = os.path.join(configs["base_dir"], "worklog.txt")
    mge.set_log_file(worklog_path)

    # prepare model-related components
    model = FaceRecognitionModel(configs)

    # prepare data-related components
    preprocess = T.Compose([T.Normalize(mean=127.5, std=128), T.ToMode("CHW")])
    augment = T.Compose([T.RandomHorizontalFlip()])

    train_dataset = get_train_dataset(configs["dataset"], dataset_dir=configs["dataset_dir"])
    train_sampler = data.RandomSampler(
        train_dataset, batch_size=configs["batch_size"], drop_last=True
    )
    train_queue = data.DataLoader(
        train_dataset, sampler=train_sampler, transform=T.Compose([augment, preprocess])
    )

    # prepare optimize-related components
    configs["learning_rate"] = configs["learning_rate"] * dist.get_world_size()
    if dist.get_world_size() > 1:
        dist.bcast_list_(model.parameters())
        gm = ad.GradManager().attach(
            model.parameters(), callbacks=[dist.make_allreduce_cb("mean")]
        )
    else:
        gm = ad.GradManager().attach(model.parameters())
    opt = optim.SGD(
        model.parameters(),
        lr=configs["learning_rate"],
        momentum=configs["momentum"],
        weight_decay=configs["weight_decay"],
    )

    # try to load checkpoint
    model, start_epoch = try_load_latest_checkpoint(model, configs["base_dir"])

    # do training
    def train_one_epoch():
        def train_func(images, labels):
            opt.clear_grad()
            with gm:
                loss, accuracy, _ = model(images, labels)
                gm.backward(loss)
                if dist.is_distributed():
                    # all_reduce_mean
                    loss = dist.functional.all_reduce_sum(loss) / dist.get_world_size()
                    accuracy = dist.functional.all_reduce_sum(accuracy) / dist.get_world_size()
            opt.step()
            return loss, accuracy

        model.train()
        average_loss = AverageMeter("loss")
        average_accuracy = AverageMeter("accuracy")
        data_time = AverageMeter("data_time")
        train_time = AverageMeter("train_time")

        total_step = len(train_queue)
        data_iter = iter(train_queue)
        for step in range(total_step):
            # get next batch of data
            data_tic = time.time()
            images, labels = next(data_iter)
            data_toc = time.time()

            # forward pass & backward pass
            train_tic = time.time()
            images = mge.tensor(images, dtype="float32")
            labels = mge.tensor(labels, dtype="int32")
            loss, accuracy = train_func(images, labels)
            train_toc = time.time()

            # do the statistics and logging
            n = images.shape[0]
            average_loss.update(loss.item(), n)
            average_accuracy.update(accuracy.item() * 100, n)
            data_time.update(data_toc - data_tic)
            train_time.update(train_toc - train_tic)
            if step % configs["log_interval"] == 0 and dist.get_rank() == 0:
                logger.info(
                    "epoch: %d, step: %d, %s, %s, %s, %s",
                    epoch,
                    step,
                    average_loss,
                    average_accuracy,
                    data_time,
                    train_time,
                )

    for epoch in range(start_epoch, configs["num_epoch"]):
        adjust_learning_rate(opt, epoch, configs)
        train_one_epoch()

        if dist.get_rank() == 0:
            checkpoint_path = os.path.join(
                configs["base_dir"], f"epoch-{epoch+1}-checkpoint.pkl"
            )
            mge.save(
                {"epoch": epoch + 1, "state_dict": model.state_dict()},
                checkpoint_path,
            )
def worker(world_size, args):
    # pylint: disable=too-many-statements
    rank = dist.get_rank()
    if world_size > 1:
        logger.info("init distributed process group {} / {}".format(rank, world_size))

    save_dir = os.path.join(args.save, args.arch + "." + args.mode)
    if not os.path.exists(save_dir):
        os.makedirs(save_dir, exist_ok=True)
    mge.set_log_file(os.path.join(save_dir, "log.txt"))

    model = models.__dict__[args.arch]()
    cfg = config.get_config(args.arch)
    cfg.LEARNING_RATE *= world_size  # scale learning rate in distributed training

    total_batch_size = cfg.BATCH_SIZE * world_size
    steps_per_epoch = 1280000 // total_batch_size
    total_steps = steps_per_epoch * cfg.EPOCHS

    if args.mode != "normal":
        quantize_qat(model, qconfig=Q.ema_fakequant_qconfig)

    if world_size > 1:
        # Sync parameters
        dist.bcast_list_(model.parameters(), dist.WORLD)

    # Autodiff gradient manager
    gm = autodiff.GradManager().attach(
        model.parameters(),
        callbacks=dist.make_allreduce_cb("MEAN") if world_size > 1 else None,
    )

    optimizer = optim.SGD(
        get_parameters(model, cfg),
        lr=cfg.LEARNING_RATE,
        momentum=cfg.MOMENTUM,
    )

    # Define train and valid graph
    def train_func(image, label):
        with gm:
            model.train()
            logits = model(image)
            loss = F.loss.cross_entropy(logits, label, label_smooth=0.1)
            acc1, acc5 = F.topk_accuracy(logits, label, (1, 5))
            gm.backward(loss)
            optimizer.step().clear_grad()
        return loss, acc1, acc5

    def valid_func(image, label):
        model.eval()
        logits = model(image)
        loss = F.loss.cross_entropy(logits, label, label_smooth=0.1)
        acc1, acc5 = F.topk_accuracy(logits, label, (1, 5))
        return loss, acc1, acc5

    # Build train and valid datasets
    logger.info("preparing dataset..")
    train_dataset = data.dataset.ImageNet(args.data, train=True)
    train_sampler = data.Infinite(
        data.RandomSampler(train_dataset, batch_size=cfg.BATCH_SIZE, drop_last=True)
    )
    train_queue = data.DataLoader(
        train_dataset,
        sampler=train_sampler,
        transform=T.Compose([
            T.RandomResizedCrop(224),
            T.RandomHorizontalFlip(),
            cfg.COLOR_JITTOR,
            T.Normalize(mean=128),
            T.ToMode("CHW"),
        ]),
        num_workers=args.workers,
    )
    train_queue = iter(train_queue)

    valid_dataset = data.dataset.ImageNet(args.data, train=False)
    valid_sampler = data.SequentialSampler(valid_dataset, batch_size=100, drop_last=False)
    valid_queue = data.DataLoader(
        valid_dataset,
        sampler=valid_sampler,
        transform=T.Compose([
            T.Resize(256),
            T.CenterCrop(224),
            T.Normalize(mean=128),
            T.ToMode("CHW"),
        ]),
        num_workers=args.workers,
    )

    def adjust_learning_rate(step, epoch):
        learning_rate = cfg.LEARNING_RATE
        if cfg.SCHEDULER == "Linear":
            learning_rate *= 1 - float(step) / total_steps
        elif cfg.SCHEDULER == "Multistep":
            learning_rate *= cfg.SCHEDULER_GAMMA ** bisect.bisect_right(
                cfg.SCHEDULER_STEPS, epoch
            )
        else:
            raise ValueError(cfg.SCHEDULER)
        for param_group in optimizer.param_groups:
            param_group["lr"] = learning_rate
        return learning_rate

    # Start training
    objs = AverageMeter("Loss")
    top1 = AverageMeter("Acc@1")
    top5 = AverageMeter("Acc@5")
    total_time = AverageMeter("Time")

    t = time.time()
    for step in range(0, total_steps):
        # Linear learning rate decay
        epoch = step // steps_per_epoch
        learning_rate = adjust_learning_rate(step, epoch)

        image, label = next(train_queue)
        image = mge.tensor(image, dtype="float32")
        label = mge.tensor(label, dtype="int32")

        n = image.shape[0]

        loss, acc1, acc5 = train_func(image, label)

        top1.update(100 * acc1.numpy()[0], n)
        top5.update(100 * acc5.numpy()[0], n)
        objs.update(loss.numpy()[0], n)
        total_time.update(time.time() - t)
        t = time.time()

        if step % args.report_freq == 0 and rank == 0:
            logger.info(
                "TRAIN e%d %06d %f %s %s %s %s",
                epoch,
                step,
                learning_rate,
                objs,
                top1,
                top5,
                total_time,
            )
            objs.reset()
            top1.reset()
            top5.reset()
            total_time.reset()

        if step != 0 and step % 10000 == 0 and rank == 0:
            logger.info("SAVING %06d", step)
            mge.save(
                {"step": step, "state_dict": model.state_dict()},
                os.path.join(save_dir, "checkpoint.pkl"),
            )
        if step % 10000 == 0 and step != 0:
            _, valid_acc, valid_acc5 = infer(valid_func, valid_queue, args)
            logger.info("TEST %06d %f, %f", step, valid_acc, valid_acc5)

    mge.save(
        {"step": step, "state_dict": model.state_dict()},
        os.path.join(save_dir, "checkpoint-final.pkl"),
    )
    _, valid_acc, valid_acc5 = infer(valid_func, valid_queue, args)
    logger.info("TEST %06d %f, %f", step, valid_acc, valid_acc5)
def worker(args):
    # pylint: disable=too-many-statements
    rank = dist.get_rank()
    world_size = dist.get_world_size()
    if rank == 0:
        os.makedirs(os.path.join(args.save, args.arch), exist_ok=True)
        megengine.logger.set_log_file(os.path.join(args.save, args.arch, "log.txt"))

    # init process group

    # build dataset
    train_dataloader, valid_dataloader = build_dataset(args)
    train_queue = iter(train_dataloader)  # infinite
    steps_per_epoch = args.steps_per_epoch

    # build model
    model = UNetD(3)

    # Sync parameters
    if world_size > 1:
        dist.bcast_list_(model.parameters(), dist.WORLD)

    # Autodiff gradient manager
    gm = autodiff.GradManager().attach(
        model.parameters(),
        callbacks=dist.make_allreduce_cb("SUM") if world_size > 1 else None,
    )

    # Optimizer
    opt = optim.Adam(
        model.parameters(),
        lr=args.lr,
        weight_decay=args.weight_decay * world_size,  # scale weight decay in "SUM" mode
    )

    # mixup
    def preprocess(image, label):
        if args.dnd:
            image, label = MixUp_AUG(image, label)
        return image, label

    # train and valid func
    def train_step(image, label):
        with gm:
            logits = model(image)
            logits = image - logits
            loss = F.nn.l1_loss(logits, label)
            gm.backward(loss)
            opt.step().clear_grad()
        return loss

    def valid_step(image, label):
        pred = model(image)
        pred = image - pred
        mae_iter = F.nn.l1_loss(pred, label)
        psnr_it = batch_PSNR(pred, label)
        if world_size > 1:
            mae_iter = F.distributed.all_reduce_sum(mae_iter) / world_size
            psnr_it = F.distributed.all_reduce_sum(psnr_it) / world_size
        return mae_iter, psnr_it

    # cosine learning rate scheduler
    def adjust_learning_rate(step):
        # lr = 1e-6 + 0.5 * (args.lr - 1e-6) * (1 + np.cos(step / (args.epochs * steps_per_epoch) * np.pi))
        lr = args.lr * (np.cos(step / (steps_per_epoch * args.epochs) * np.pi) + 1) / 2
        for param_group in opt.param_groups:
            param_group["lr"] = lr
        return lr

    # start training
    for step in range(0, int(args.epochs * steps_per_epoch)):
        lr = adjust_learning_rate(step)

        t_step = time.time()

        image, label = next(train_queue)
        if step > steps_per_epoch:
            image, label = preprocess(image, label)
        image = megengine.tensor(image)
        label = megengine.tensor(label)

        t_data = time.time() - t_step

        loss = train_step(image, label)
        t_train = time.time() - t_step
        speed = 1.0 / t_train
        if step % args.print_freq == 0 and dist.get_rank() == 0:
            logging.info(
                "Epoch {} Step {}, Speed={:.2g} mb/s, dp_cost={:.2g}, Loss={:5.2e}, lr={:.2e}".format(
                    step // int(steps_per_epoch),
                    step,
                    speed,
                    t_data / t_train,
                    loss.item(),
                    lr,
                )
            )

        if (step + 1) % steps_per_epoch == 0:
            model.eval()
            loss, psnr_v = valid(valid_step, valid_dataloader)
            model.train()
            logging.info(
                "Epoch {} Test mae {:.3f}, psnr {:.3f}".format(
                    (step + 1) // steps_per_epoch,
                    loss.item(),
                    psnr_v.item(),
                )
            )
            if rank == 0:
                megengine.save(
                    {
                        "epoch": (step + 1) // steps_per_epoch,
                        "state_dict": model.state_dict(),
                    },
                    os.path.join(args.save, args.arch, "checkpoint.pkl"),
                )
def worker(master_ip, port, world_size, rank, args):
    if world_size > 1:
        dist.init_process_group(
            master_ip=master_ip,
            port=port,
            world_size=world_size,
            rank=rank,
            device=rank,
        )
        logger.info("Init process group for gpu{} done".format(rank))

    current_network = import_from_file(args.file)

    model = current_network.Net(current_network.Cfg())
    model.train()

    if dist.get_rank() == 0:
        logger.info(get_config_info(model.cfg))
        logger.info(repr(model))

    params_with_grad = []
    for name, param in model.named_parameters():
        if "bottom_up.conv1" in name and model.cfg.backbone_freeze_at >= 1:
            continue
        if "bottom_up.layer1" in name and model.cfg.backbone_freeze_at >= 2:
            continue
        params_with_grad.append(param)

    opt = SGD(
        params_with_grad,
        lr=model.cfg.basic_lr * args.batch_size,
        momentum=model.cfg.momentum,
        weight_decay=model.cfg.weight_decay * dist.get_world_size(),
    )

    gm = GradManager()
    if dist.get_world_size() > 1:
        gm.attach(
            params_with_grad,
            callbacks=[dist.make_allreduce_cb("SUM", dist.WORLD)],
        )
    else:
        gm.attach(params_with_grad)

    if args.weight_file is not None:
        weights = mge.load(args.weight_file)
        model.backbone.bottom_up.load_state_dict(weights, strict=False)

    if dist.get_world_size() > 1:
        dist.bcast_list_(model.parameters(), dist.WORLD)  # sync parameters

    if dist.get_rank() == 0:
        logger.info("Prepare dataset")
    train_loader = iter(build_dataloader(args.batch_size, args.dataset_dir, model.cfg))

    for epoch in range(model.cfg.max_epoch):
        train_one_epoch(model, train_loader, opt, gm, epoch, args)
        if dist.get_rank() == 0:
            # save_path = "log-of-{}/epoch_{}.pkl".format(
            #     os.path.basename(args.file).split(".")[0], epoch
            # )
            save_path = os.path.join(args.log_dir, "epoch_{}.pkl".format(epoch))
            mge.save(
                {"epoch": epoch, "state_dict": model.state_dict()},
                save_path,
            )
            logger.info("dump weights to %s", save_path)
def worker(args):
    # pylint: disable=too-many-statements
    if dist.get_rank() == 0:
        os.makedirs(os.path.join(args.save, args.arch), exist_ok=True)
        megengine.logger.set_log_file(os.path.join(args.save, args.arch, "log.txt"))

    # build dataset
    train_dataloader, valid_dataloader = build_dataset(args)
    train_queue = iter(train_dataloader)  # infinite
    steps_per_epoch = 1280000 // (dist.get_world_size() * args.batch_size)

    # build model
    model = snet_model.__dict__[args.arch]()

    # Sync parameters and buffers
    if dist.get_world_size() > 1:
        dist.bcast_list_(model.parameters())
        dist.bcast_list_(model.buffers())

    # Autodiff gradient manager
    gm = autodiff.GradManager().attach(
        model.parameters(),
        callbacks=dist.make_allreduce_cb("mean") if dist.get_world_size() > 1 else None,
    )

    # Optimizer
    params_wd = []
    params_nwd = []
    for n, p in model.named_parameters():
        if n.find("weight") >= 0 and len(p.shape) > 1:
            print("include ", n, p.shape)
            params_wd.append(p)
        else:
            print("NOT include ", n, p.shape)
            params_nwd.append(p)
    opt = optim.SGD(
        [
            {"params": params_wd},
            {"params": params_nwd, "weight_decay": 0},
        ],
        lr=args.lr * dist.get_world_size(),
        momentum=args.momentum,
        weight_decay=args.weight_decay,
    )

    # train and valid func
    def train_step(image, label):
        with gm:
            logits = model(image)
            loss = F.nn.cross_entropy(logits, label, label_smooth=0.1)
            acc1, acc5 = F.topk_accuracy(logits, label, topk=(1, 5))
            gm.backward(loss)
            opt.step().clear_grad()
        return loss, acc1, acc5

    def valid_step(image, label):
        logits = model(image)
        loss = F.nn.cross_entropy(logits, label, label_smooth=0.1)
        acc1, acc5 = F.topk_accuracy(logits, label, topk=(1, 5))
        # calculate mean values
        if dist.get_world_size() > 1:
            loss = F.distributed.all_reduce_sum(loss) / dist.get_world_size()
            acc1 = F.distributed.all_reduce_sum(acc1) / dist.get_world_size()
            acc5 = F.distributed.all_reduce_sum(acc5) / dist.get_world_size()
        return loss, acc1, acc5

    # linear learning rate scheduler
    def adjust_learning_rate(step):
        lr = args.lr * (1 - step / (args.epochs * steps_per_epoch))
        for param_group in opt.param_groups:
            param_group["lr"] = lr
        return lr

    # start training
    objs = AverageMeter("Loss")
    top1 = AverageMeter("Acc@1")
    top5 = AverageMeter("Acc@5")
    clck = AverageMeter("Time")

    for step in range(0, args.epochs * steps_per_epoch):
        lr = adjust_learning_rate(step)

        t = time.time()

        image, label = next(train_queue)
        image = megengine.tensor(image, dtype="float32")
        label = megengine.tensor(label, dtype="int32")

        loss, acc1, acc5 = train_step(image, label)

        objs.update(loss.item())
        top1.update(100 * acc1.item())
        top5.update(100 * acc5.item())
        clck.update(time.time() - t)

        if step % args.print_freq == 0 and dist.get_rank() == 0:
            logging.info(
                "Epoch %d Step %d, LR %.4f, %s %s %s %s",
                step // steps_per_epoch,
                step,
                lr,
                objs,
                top1,
                top5,
                clck,
            )
            objs.reset()
            top1.reset()
            top5.reset()
            clck.reset()

        if (step + 1) % steps_per_epoch == 0:
            model.eval()
            _, valid_acc1, valid_acc5 = valid(valid_step, valid_dataloader, args)
            model.train()
            logging.info(
                "Epoch %d Test Acc@1 %.3f, Acc@5 %.3f",
                (step + 1) // steps_per_epoch,
                valid_acc1,
                valid_acc5,
            )
            if dist.get_rank() == 0:
                megengine.save(
                    {
                        "epoch": (step + 1) // steps_per_epoch,
                        "state_dict": model.state_dict(),
                    },
                    os.path.join(args.save, args.arch, "checkpoint.pkl"),
                )