def worker(args):
    current_network = import_from_file(args.file)

    model = current_network.Net(current_network.Cfg())
    model.train()

    if dist.get_rank() == 0:
        logger.info(get_config_info(model.cfg))
        logger.info(repr(model))

    params_with_grad = []
    for name, param in model.named_parameters():
        if "bottom_up.conv1" in name and model.cfg.backbone_freeze_at >= 1:
            continue
        if "bottom_up.layer1" in name and model.cfg.backbone_freeze_at >= 2:
            continue
        params_with_grad.append(param)

    opt = SGD(
        params_with_grad,
        lr=model.cfg.basic_lr * args.batch_size,
        momentum=model.cfg.momentum,
        weight_decay=model.cfg.weight_decay * dist.get_world_size(),
    )

    gm = GradManager()
    if dist.get_world_size() > 1:
        gm.attach(
            params_with_grad,
            callbacks=[dist.make_allreduce_cb("SUM", dist.WORLD)],
        )
    else:
        gm.attach(params_with_grad)

    if args.weight_file is not None:
        # model.backbone.bottom_up.load_state_dict(weights, strict=False)
        logger.info("Loading Base-Pretrain weights...")
        weights = mge.load(args.weight_file)
        weight_new = {k: v for k, v in weights.items() if 'pred_' not in k}
        model.load_state_dict(weight_new, strict=False)

    if dist.get_world_size() > 1:
        dist.bcast_list_(model.parameters(), dist.WORLD)  # sync parameters

    if dist.get_rank() == 0:
        logger.info("Prepare dataset")
    train_loader = iter(build_dataloader(args.batch_size, args.dataset_dir, model.cfg))

    for epoch in range(model.cfg.max_epoch):
        train_one_epoch(model, train_loader, opt, gm, epoch, args)
        if dist.get_rank() == 0:
            save_path = "logs/{}/epoch_{}.pkl".format(
                os.path.basename(args.file).split(".")[0], epoch
            )
            mge.save(
                {"epoch": epoch, "state_dict": model.state_dict()},
                save_path,
            )
            logger.info("dump weights to %s", save_path)

def valid_func(image, label):
    model.eval()
    logits = model(image)
    loss = F.loss.cross_entropy(logits, label, label_smooth=0.1)
    acc1, acc5 = F.topk_accuracy(logits, label, (1, 5))
    if dist.is_distributed():  # all_reduce_mean
        loss = dist.functional.all_reduce_sum(loss) / dist.get_world_size()
        acc1 = dist.functional.all_reduce_sum(acc1) / dist.get_world_size()
        acc5 = dist.functional.all_reduce_sum(acc5) / dist.get_world_size()
    return loss, acc1, acc5

def valid_step(image, label):
    logits = model(image)
    loss = F.nn.cross_entropy(logits, label)
    acc1, acc5 = F.topk_accuracy(logits, label, topk=(1, 5))
    # calculate mean values
    if dist.get_world_size() > 1:
        loss = F.distributed.all_reduce_sum(loss) / dist.get_world_size()
        acc1 = F.distributed.all_reduce_sum(acc1) / dist.get_world_size()
        acc5 = F.distributed.all_reduce_sum(acc5) / dist.get_world_size()
    return loss, acc1, acc5

def valid_func(image, label):
    model.eval()
    logits = model(image)
    loss = F.cross_entropy_with_softmax(logits, label, label_smooth=0.)
    acc1, acc5 = F.accuracy(logits, label, (1, 5))
    if dist.is_distributed():  # all_reduce_mean
        loss = dist.all_reduce_sum(loss, "valid_loss") / dist.get_world_size()
        acc1 = dist.all_reduce_sum(acc1, "valid_acc1") / dist.get_world_size()
        acc5 = dist.all_reduce_sum(acc5, "valid_acc5") / dist.get_world_size()
    return loss, acc1, acc5

def train_func(image, label):
    model.train()
    logits = model(image)
    loss = F.cross_entropy_with_softmax(logits, label, label_smooth=0.1)
    acc1, acc5 = F.accuracy(logits, label, (1, 5))
    optimizer.backward(loss)  # compute gradients
    if dist.is_distributed():  # all_reduce_mean
        loss = dist.all_reduce_sum(loss) / dist.get_world_size()
        acc1 = dist.all_reduce_sum(acc1) / dist.get_world_size()
        acc5 = dist.all_reduce_sum(acc5) / dist.get_world_size()
    return loss, acc1, acc5

def train_func(images, labels):
    opt.clear_grad()
    with gm:
        loss, accuracy, _ = model(images, labels)
        gm.backward(loss)
        if dist.is_distributed():  # all_reduce_mean
            loss = dist.functional.all_reduce_sum(loss) / dist.get_world_size()
            accuracy = dist.functional.all_reduce_sum(accuracy) / dist.get_world_size()
    opt.step()
    return loss, accuracy

def worker(
    current_network,
    weight_file,
    dataset_dir,
    result_list,
    master_ip=None,
    port=None,
    world_size=1,
    rank=0,
):
    if world_size > 1:
        dist.init_process_group(
            master_ip=master_ip,
            port=port,
            world_size=world_size,
            rank=rank,
            device=rank,
        )

    cfg = current_network.Cfg()
    cfg.backbone_pretrained = False
    model = current_network.Net(cfg)
    model.eval()

    state_dict = mge.load(weight_file)
    if "state_dict" in state_dict:
        state_dict = state_dict["state_dict"]
    model.load_state_dict(state_dict)

    evaluator = DetEvaluator(model)

    test_loader = build_dataloader(dataset_dir, model.cfg)
    if dist.get_world_size() == 1:
        test_loader = tqdm(test_loader)

    for data in test_loader:
        image, im_info = DetEvaluator.process_inputs(
            data[0][0],
            model.cfg.test_image_short_size,
            model.cfg.test_image_max_size,
        )
        pred_res = evaluator.predict(
            image=mge.tensor(image), im_info=mge.tensor(im_info)
        )
        result = {
            "det_res": pred_res,
            "image_id": int(data[1][2][0]),
        }
        if dist.get_world_size() > 1:
            result_list.put_nowait(result)
        else:
            result_list.append(result)

def train_and_evaluate(model, manager):
    rank = dist.get_rank()

    # reload weights from restore_file if specified
    if args.restore_file is not None:
        manager.load_checkpoints()

    world_size = dist.get_world_size()
    if world_size > 1:
        dist.bcast_list_(model.parameters())
        dist.bcast_list_(model.buffers())

    gm = GradManager().attach(
        model.parameters(),
        callbacks=dist.make_allreduce_cb("SUM") if world_size > 1 else None,
    )

    for epoch in range(manager.params.num_epochs):
        # compute number of batches in one epoch (one full pass over the training set)
        train(model, manager, gm)

        # Evaluate for one epoch on validation set
        evaluate(model, manager)

        # Save best model weights according to params.major_metric
        if rank == 0:
            manager.check_best_save_last_checkpoints(latest_freq=5)

def worker():
    rank = dist.get_rank()
    size = dist.get_world_size()
    x = mge.tensor(np.random.randn(1, rank * 2 + 2), dtype=np.float32)
    m = M.Linear(rank * 2 + 2, rank * 2 + 4)
    gm = GradManager().attach(m.parameters())
    opt = optim.SGD(m.parameters(), 1e-3, momentum=0.9)

    def train_func(x):
        with gm:
            if rank != 0:
                x = dist.functional.remote_recv(
                    rank - 1, shape=(1, rank * 2 + 2), dtype=np.float32
                )
            y = m(x)
            if rank != size - 1:
                dist.functional.remote_send(y, dest_rank=rank + 1)
                gm.backward()
            else:
                y = y.mean()
                gm.backward(y)
            opt.step().clear_grad()

    train_funcs = [
        train_func,
        trace(symbolic=False)(train_func),
        trace(symbolic=True)(train_func),
    ]

    for func in train_funcs:
        for i in range(3):
            func(x)

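For completeness, a hedged sketch of how a no-argument worker like the one above is typically launched in MegEngine 1.x: `dist.launcher` spawns one process per visible GPU and initializes the process group, which is what makes the `remote_send`/`remote_recv` calls between ranks work. The exact invocation is an assumption, not part of the original snippet.

import megengine.distributed as dist

if __name__ == "__main__":
    # dist.launcher wraps a callable, forks one process per GPU by default,
    # and sets up the process group before calling worker() in each process.
    dist.launcher(worker)()
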
def train_generator_batch(image, label, *, gm, netG, netloss):
    B, T, _, h, w = image.shape
    biup = get_bilinear(image)
    netG.train()
    with gm:
        forward_hiddens = []
        backward_hiddens = []
        res = []
        hidden = F.zeros((2 * B, netG.hidden_channels, h, w))
        for i in range(T):
            now_frame = F.concat([image[:, i, ...], image[:, T - i - 1, ...]], axis=0)
            if i == 0:
                flow = netG.flownet(now_frame, now_frame)
            else:
                ref = F.concat([image[:, i - 1, ...], image[:, T - i, ...]], axis=0)
                flow = netG.flownet(now_frame, ref)
            hidden = netG(hidden, flow, now_frame)
            forward_hiddens.append(hidden[0:B, ...])
            backward_hiddens.append(hidden[B:2 * B, ...])
        for i in range(T):
            res.append(netG.do_upsample(forward_hiddens[i], backward_hiddens[T - i - 1]))
        res = F.stack(res, axis=1)  # [B,T,3,H,W]
        loss = netloss(res + biup, label)
        gm.backward(loss)
        if dist.is_distributed():
            loss = dist.functional.all_reduce_sum(loss) / dist.get_world_size()
    return loss

def train_func():
    loss = model.calc_loss()
    optimizer.backward(loss)  # compute gradients
    if dist.is_distributed():  # all_reduce_mean
        loss = dist.all_reduce_sum(loss, "train_loss") / dist.get_world_size()
    return loss

def worker(
    arch,
    model_file,
    data_root,
    ann_file,
):
    """
    :param arch: name of the network architecture to build
    :param model_file: file of dumped weights
    :param data_root: the dataset directory
    :param ann_file: the annotation file
    """
    model = getattr(kpm, arch)()
    model.eval()
    weight = mge.load(model_file)
    weight = weight["state_dict"] if "state_dict" in weight.keys() else weight
    model.load_state_dict(weight)

    loader = build_dataloader(dist.get_rank(), dist.get_world_size(), data_root, ann_file)
    if dist.get_rank() == 0:
        loader = tqdm(loader)

    result_list = []
    for data_dict in loader:
        img, bbox, info = data_dict
        fliped_img = img[:, :, :, ::-1] - np.zeros_like(img)
        data = np.concatenate([img, fliped_img], 0)
        data = np.ascontiguousarray(data).astype(np.float32)
        outs = model.predict(mge.tensor(data)).numpy()
        preds = outs[:img.shape[0]]
        preds_fliped = outs[img.shape[0]:, cfg.keypoint_flip_order, :, ::-1]
        preds = (preds + preds_fliped) / 2

        for i in range(preds.shape[0]):
            results = find_keypoints(preds[i], bbox[i, 0])
            final_score = float(results[:, -1].mean() * info[-1][i])
            image_id = int(info[-2][i])
            keypoints = results.copy()
            keypoints[:, -1] = 1
            keypoints = keypoints.reshape(-1).tolist()
            instance = {
                "image_id": image_id,
                "category_id": 1,
                "score": final_score,
                "keypoints": keypoints,
            }
            result_list.append(instance)
    return result_list

def worker(rank, backend, q):
    if not mge.is_cuda_available():
        return
    _init_process_group_wrapper(world_size, rank, rank, backend, q)
    assert dist.is_distributed() == True
    assert dist.get_master_ip() == _LOCALHOST
    assert dist.get_master_port() > 0
    assert dist.get_world_size() == world_size
    assert dist.get_rank() == rank
    assert dist.get_backend() == backend

def worker(
    current_network,
    weight_file,
    dataset_dir,
    result_list,
    master_ip=None,
    port=None,
    world_size=None,
    rank=None,
):
    if world_size > 1:
        dist.init_process_group(
            master_ip=master_ip,
            port=port,
            world_size=world_size,
            rank=rank,
            device=rank,
        )

    cfg = current_network.Cfg()
    cfg.backbone_pretrained = False
    model = current_network.Net(cfg)
    model.eval()

    state_dict = mge.load(weight_file)
    if "state_dict" in state_dict:
        state_dict = state_dict["state_dict"]
    model.load_state_dict(state_dict)

    def pred_func(data):
        pred = model(data)
        return pred

    test_loader = build_dataloader(dataset_dir, model.cfg)
    if dist.get_world_size() == 1:
        test_loader = tqdm(test_loader)

    for data in test_loader:
        img = data[0].squeeze()
        label = data[1].squeeze()
        im_info = data[2]
        pred = evaluate(pred_func, img, model.cfg)
        result = {"pred": pred, "gt": label, "name": im_info[2]}
        if dist.get_world_size() > 1:
            result_list.put_nowait(result)
        else:
            result_list.append(result)

def __init__(
    self,
    dataset,
    batch_size=1,
    drop_last=False,
    num_samples=None,
    world_size=None,
    rank=None,
    seed=None,
):
    if (
        not isinstance(batch_size, int)
        or isinstance(batch_size, bool)
        or batch_size <= 0
    ):
        raise ValueError(
            "batch_size should be a positive integer value, "
            "but got batch_size={}".format(batch_size)
        )
    if not isinstance(drop_last, bool):
        raise ValueError(
            "drop_last should be a boolean value, but got "
            "drop_last={}".format(drop_last)
        )
    if num_samples is not None and (
        not isinstance(num_samples, int)
        or isinstance(num_samples, bool)
        or num_samples <= 0
    ):
        raise ValueError(
            "num_samples should be a positive integer "
            "value, but got num_samples={}".format(num_samples)
        )

    self.batch_size = batch_size
    self.dataset = dataset
    self.drop_last = drop_last

    if world_size is None:
        world_size = dist.get_world_size() if dist.is_distributed() else 1
    self.world_size = world_size
    if rank is None:
        rank = dist.get_rank() if dist.is_distributed() else 0
    self.rank = rank

    if num_samples is None:
        num_samples = len(self.dataset)
    self.num_samples = int(math.ceil(num_samples / self.world_size))

    # Make sure seeds are the same at each rank
    if seed is None and self.world_size > 1:
        seed = 0
    self.rng = np.random.RandomState(seed)

def train_generator_batch(image, label, *, gm, netG, netloss):
    B, T, _, h, w = image.shape
    biup = get_bilinear(image)
    # np_weight = [0,-1,0,-1,4,-1,0,-1,0]  # (1,1,3,3)
    # conv_weight = mge.tensor(np.array(np_weight).astype(np.float32)).reshape(1,1,3,3)
    # HR_mask = F.mean(label, axis=2, keepdims=False)  # [B,T,H,W], depthwise over T
    # HR_mask = HR_mask.reshape(B*T, 1, 4*h, 4*w)
    # HR_mask = F.conv2d(HR_mask, conv_weight, padding=1)
    # # HR_mask = (F.abs(HR_mask) > 0.1).astype("float32")  # [B*T, 1, H, W]
    # HR_mask = HR_mask.reshape(B, T, 1, 4*h, 4*w)
    # HR_mask = 1 + HR_mask * 0.1
    HR_mask = 1
    netG.train()
    with gm:
        forward_hiddens = []
        backward_hiddens = []
        res = []
        # extract features from all frames
        image = image.reshape(B * T, 3, h, w)
        image = netG.rgb(image).reshape(B, T, -1, h, w)
        # T = 0
        now_frame = image[:, 0, ...]
        hidden = now_frame
        forward_hiddens.append(now_frame)
        for i in range(1, T):
            now_frame = image[:, i, ...]
            hidden = netG.aggr(F.concat([hidden, now_frame], axis=1))
            forward_hiddens.append(hidden)
        # T = -1
        now_frame = image[:, T - 1, ...]
        hidden = now_frame
        backward_hiddens.append(now_frame)
        for i in range(T - 2, -1, -1):
            now_frame = image[:, i, ...]
            hidden = netG.aggr(F.concat([hidden, now_frame], axis=1))
            backward_hiddens.append(hidden)
        # do upsample for all frames
        for i in range(T):
            res.append(
                netG.upsample(
                    F.concat([forward_hiddens[i], backward_hiddens[T - i - 1]], axis=1)
                )
            )
        res = F.stack(res, axis=1)  # [B,T,3,H,W]
        res = res + biup
        loss = netloss(res, label, HR_mask)
        # add an edge loss
        # detect the edge map of the label
        gm.backward(loss)
        if dist.is_distributed():
            loss = dist.functional.all_reduce_sum(loss) / dist.get_world_size()
    return loss

def train_one_epoch(model, data_queue, opt, gm, epoch, args):
    def train_func(image, im_info, gt_boxes):
        with gm:
            loss_dict = model(image=image, im_info=im_info, gt_boxes=gt_boxes)
            gm.backward(loss_dict["total_loss"])
            loss_list = list(loss_dict.values())
        opt.step().clear_grad()
        return loss_list

    meter = AverageMeter(record_len=model.cfg.num_losses)
    time_meter = AverageMeter(record_len=2)
    log_interval = model.cfg.log_interval
    tot_step = model.cfg.nr_images_epoch // (args.batch_size * dist.get_world_size())
    for step in range(tot_step):
        adjust_learning_rate(opt, epoch, step, model.cfg, args)

        data_tik = time.time()
        mini_batch = next(data_queue)
        data_tok = time.time()

        tik = time.time()
        loss_list = train_func(
            image=mge.tensor(mini_batch["data"]),
            im_info=mge.tensor(mini_batch["im_info"]),
            gt_boxes=mge.tensor(mini_batch["gt_boxes"]),
        )
        tok = time.time()

        time_meter.update([tok - tik, data_tok - data_tik])

        if dist.get_rank() == 0:
            info_str = "e%d, %d/%d, lr:%f, "
            loss_str = ", ".join(
                ["{}:%f".format(loss) for loss in model.cfg.losses_keys]
            )
            time_str = ", train_time:%.3fs, data_time:%.3fs"
            log_info_str = info_str + loss_str + time_str
            meter.update([loss.numpy() for loss in loss_list])
            if step % log_interval == 0:
                logger.info(
                    log_info_str,
                    epoch,
                    step,
                    tot_step,
                    opt.param_groups[0]["lr"],
                    *meter.average(),
                    *time_meter.average(),
                )
                meter.reset()
                time_meter.reset()

def worker(rank):
    dist.init_process_group("localhost", port, world_size, rank, rank, backend)
    assert dist.is_distributed() == True
    assert dist.get_rank() == rank
    assert dist.get_world_size() == world_size
    assert dist.get_backend() == backend

    py_server_addr = dist.get_py_server_addr()
    assert py_server_addr[0] == "localhost"
    assert py_server_addr[1] == port

    mm_server_addr = dist.get_mm_server_addr()
    assert mm_server_addr[0] == "localhost"
    assert mm_server_addr[1] > 0

    assert isinstance(dist.get_client(), dist.Client)

def evaluate(model, manager):
    rank = dist.get_rank()
    world_size = dist.get_world_size()
    # set model to evaluation mode
    model.eval()

    # compute metrics over the dataset
    if manager.dataloaders["val"] is not None:
        # initialize loss and val metric status
        manager.reset_loss_status()
        manager.reset_metric_status("val")
        for data_batch in manager.dataloaders["val"]:
            # compute the real batch size
            bs = data_batch["img1"].shape[0]
            # move to GPU if available
            data_batch = utils.tensor_mge(data_batch)
            data_batch["imgs"] = F.concat(
                [data_batch["img1"] / 255.0, data_batch["img2"] / 255.0], 1
            )
            # compute model output
            output_batch = model(data_batch)
            # compute all losses on this batch
            # loss = compute_losses(data_batch, output_batch, manager.params)
            metrics = {}
            metrics["EPE"] = compute_metrics(data_batch, output_batch)
            if world_size > 1:
                # loss["total"] = F.distributed.all_reduce_sum(loss["total"]) / world_size
                metrics["EPE"] = F.distributed.all_reduce_sum(metrics["EPE"]) / world_size
            # manager.update_loss_status(loss, "val", bs)
            # compute all metrics on this batch
            manager.update_metric_status(metrics, "val", bs)
            # manager.print_metrics("val", title="Val", color="green")

        # update data to tensorboard
        if rank == 0:
            # manager.writer.add_scalar("Loss/val", manager.loss_status["total"].avg, manager.epoch)
            # manager.logger.info("Loss/valid epoch {}: {}".format(manager.epoch, manager.loss_status["total"].avg))
            for k, v in manager.val_status.items():
                manager.writer.add_scalar("Metric/val/{}".format(k), v.avg, manager.epoch)
                # manager.logger.info("Metric/valid epoch {}: {}".format(manager.epoch, v.avg))
            # For each epoch, print the metrics
            manager.print_metrics("val", title="Val", color="green")

def worker(rank, world_size, ngpus_per_node, args): if world_size > 1: # init process group dist.init_process_group( master_ip=args.dist_addr, port=args.dist_port, world_size=world_size, rank=rank, device=rank % ngpus_per_node, backend="nccl", ) logging.info("init process group rank %d / %d", dist.get_rank(), dist.get_world_size()) # build dataset _, valid_dataloader = build_dataset(args) # build model model = resnet_model.__dict__[args.arch](pretrained=args.model is None) if args.model is not None: logging.info("load from checkpoint %s", args.model) checkpoint = megengine.load(args.model) if "state_dict" in checkpoint: state_dict = checkpoint["state_dict"] model.load_state_dict(state_dict) def valid_step(image, label): logits = model(image) loss = F.nn.cross_entropy(logits, label) acc1, acc5 = F.topk_accuracy(logits, label, topk=(1, 5)) # calculate mean values if world_size > 1: loss = F.distributed.all_reduce_sum(loss) / world_size acc1 = F.distributed.all_reduce_sum(acc1) / world_size acc5 = F.distributed.all_reduce_sum(acc5) / world_size return loss, acc1, acc5 model.eval() _, valid_acc1, valid_acc5 = valid(valid_step, valid_dataloader, args) logging.info( "Test Acc@1 %.3f, Acc@5 %.3f", valid_acc1, valid_acc5, )
def train_one_epoch(model, data_queue, opt, gm, epoch):
    # @trace(symbolic=True)
    def train_func(data, label):
        with gm:
            pred = model(data)
            loss = cross_entropy(pred, label, ignore_label=model.cfg.ignore_label)
            gm.backward(loss)
        opt.step().clear_grad()
        return loss

    meter = AverageMeter(record_len=1)
    time_meter = AverageMeter(record_len=2)
    log_interval = model.cfg.log_interval
    tot_step = model.cfg.nr_images_epoch // (model.cfg.batch_size * dist.get_world_size())
    for step in range(tot_step):
        adjust_learning_rate(opt, epoch, step, tot_step, model.cfg)

        data_tik = time.time()
        inputs, labels = next(data_queue)
        labels = np.squeeze(labels, axis=1).astype(np.int32)
        data_tok = time.time()

        tik = time.time()
        loss = train_func(mge.tensor(inputs), mge.tensor(labels))
        tok = time.time()

        time_meter.update([tok - tik, data_tok - data_tik])

        if dist.get_rank() == 0:
            info_str = "e%d, %d/%d, lr:%f, "
            loss_str = ", ".join(["{}:%f".format(loss) for loss in ["loss"]])
            time_str = ", train_time:%.3fs, data_time:%.3fs"
            log_info_str = info_str + loss_str + time_str
            meter.update([loss.numpy() for loss in [loss]])
            if step % log_interval == 0:
                logger.info(
                    log_info_str,
                    epoch,
                    step,
                    tot_step,
                    opt.param_groups[1]["lr"],
                    *meter.average(),
                    *time_meter.average(),
                )
                meter.reset()
                time_meter.reset()

def __init__(self, dataloader, **eval_kwargs):
    if not isinstance(dataloader, DataLoader):
        raise TypeError(
            'dataloader must be a mge DataLoader, but got {}'.format(type(dataloader))
        )
    self.dataloader = dataloader
    self.eval_kwargs = eval_kwargs
    self.interval = self.eval_kwargs.pop('interval', 10000)
    self.save_image = self.eval_kwargs.pop('save_image', False)
    self.save_path = self.eval_kwargs.pop('save_path', None)
    self.log_path = self.eval_kwargs.pop('log_path', None)
    self.multi_process = self.eval_kwargs.pop('multi_process', False)
    self.ensemble = self.eval_kwargs.pop('ensemble', False)
    mkdir_or_exist(self.save_path)
    self.logger = get_logger(name="EvalIterHook", log_file=self.log_path)  # only for rank0

    if is_distributed():
        self.local_rank = dist.get_rank()
        self.nranks = dist.get_world_size()
    else:
        self.local_rank = 0
        self.nranks = 1

def worker(args):
    current_network = import_from_file(args.file)

    model = current_network.Net(current_network.Cfg())
    model.train()

    if dist.get_rank() == 0:
        logger.info(get_config_info(model.cfg))
        logger.info(repr(model))

    backbone_params = []
    head_params = []
    for name, param in model.named_parameters():
        if "backbone" in name:
            backbone_params.append(param)
        else:
            head_params.append(param)

    opt = SGD(
        [
            {"params": backbone_params, "lr": model.cfg.learning_rate * 0.1},
            {"params": head_params},
        ],
        lr=model.cfg.learning_rate,
        momentum=model.cfg.momentum,
        weight_decay=model.cfg.weight_decay * dist.get_world_size(),
    )

    gm = GradManager()
    if dist.get_world_size() > 1:
        gm.attach(
            model.parameters(),
            callbacks=[dist.make_allreduce_cb("SUM", dist.WORLD)],
        )
    else:
        gm.attach(model.parameters())

    cur_epoch = 0
    if args.resume is not None:
        pretrained = mge.load(args.resume)
        cur_epoch = pretrained["epoch"] + 1
        model.load_state_dict(pretrained["state_dict"])
        opt.load_state_dict(pretrained["opt"])
        if dist.get_rank() == 0:
            logger.info("load success: epoch %d", cur_epoch)

    if dist.get_world_size() > 1:
        dist.bcast_list_(model.parameters(), dist.WORLD)  # sync parameters

    if dist.get_rank() == 0:
        logger.info("Prepare dataset")
    train_loader = iter(build_dataloader(model.cfg.batch_size, args.dataset_dir, model.cfg))

    for epoch in range(cur_epoch, model.cfg.max_epoch):
        train_one_epoch(model, train_loader, opt, gm, epoch)
        if dist.get_rank() == 0:
            save_path = "log-of-{}/epoch_{}.pkl".format(
                os.path.basename(args.file).split(".")[0], epoch
            )
            mge.save(
                {
                    "epoch": epoch,
                    "state_dict": model.state_dict(),
                    "opt": opt.state_dict(),
                },
                save_path,
            )
            logger.info("dump weights to %s", save_path)

def build_gradmanager(module):
    world_size = dist.get_world_size()
    gm = GradManager().attach(
        module.parameters(),
        callbacks=dist.make_allreduce_cb("SUM") if world_size > 1 else None,
    )
    return gm

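A hypothetical usage sketch for `build_gradmanager`: record a forward pass under the returned manager, backprop, then step the optimizer. `model`, `opt`, `image`, and `label` are stand-ins for objects defined elsewhere, as in the training snippets above.

import megengine.functional as F

gm = build_gradmanager(model)  # all-reduce callback attached when world_size > 1
with gm:
    loss = F.nn.cross_entropy(model(image), label)
    gm.backward(loss)  # gradients are summed across ranks by the callback
opt.step().clear_grad()
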
def all_reduce_mean(array: Tensor) -> Tensor:
    if dist.get_world_size() > 1:
        array = dist.functional.all_reduce_sum(array) / dist.get_world_size()
    return array

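A short usage sketch: `all_reduce_mean` folds the repeated `all_reduce_sum(...) / world_size` pattern from the train/valid steps above into one call, and is a no-op in single-process runs. The `loss` and `acc1` tensors are assumed to come from a step function like the ones shown earlier.

loss = all_reduce_mean(loss)  # every rank now holds the cross-rank mean
acc1 = all_reduce_mean(acc1)
if dist.get_rank() == 0:
    print("mean loss:", loss.item(), "mean acc1:", acc1.item())
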
def worker(rank, world_size, ngpus_per_node, args):  # pylint: disable=too-many-statements
    if rank == 0:
        os.makedirs(os.path.join(args.save, args.arch), exist_ok=True)
        megengine.logger.set_log_file(os.path.join(args.save, args.arch, "log.txt"))
    # init process group
    if world_size > 1:
        dist.init_process_group(
            master_ip=args.dist_addr,
            port=args.dist_port,
            world_size=world_size,
            rank=rank,
            device=rank % ngpus_per_node,
            backend="nccl",
        )
        logging.info(
            "init process group rank %d / %d", dist.get_rank(), dist.get_world_size()
        )

    # build dataset
    train_dataloader, valid_dataloader = build_dataset(args)
    train_queue = iter(train_dataloader)  # infinite
    steps_per_epoch = 1280000 // (world_size * args.batch_size)

    # build model
    model = resnet_model.__dict__[args.arch]()

    # Sync parameters
    if world_size > 1:
        dist.bcast_list_(model.parameters(), dist.WORLD)

    # Autodiff gradient manager
    gm = autodiff.GradManager().attach(
        model.parameters(),
        callbacks=dist.make_allreduce_cb("SUM") if world_size > 1 else None,
    )

    # Optimizer
    opt = optim.SGD(
        model.parameters(),
        lr=args.lr,
        momentum=args.momentum,
        weight_decay=args.weight_decay * world_size,  # scale weight decay in "SUM" mode
    )

    # train and valid func
    def train_step(image, label):
        with gm:
            logits = model(image)
            loss = F.nn.cross_entropy(logits, label)
            acc1, acc5 = F.topk_accuracy(logits, label, topk=(1, 5))
            gm.backward(loss)
            opt.step().clear_grad()
        return loss, acc1, acc5

    def valid_step(image, label):
        logits = model(image)
        loss = F.nn.cross_entropy(logits, label)
        acc1, acc5 = F.topk_accuracy(logits, label, topk=(1, 5))
        # calculate mean values
        if world_size > 1:
            loss = F.distributed.all_reduce_sum(loss) / world_size
            acc1 = F.distributed.all_reduce_sum(acc1) / world_size
            acc5 = F.distributed.all_reduce_sum(acc5) / world_size
        return loss, acc1, acc5

    # multi-step learning rate scheduler with warmup
    def adjust_learning_rate(step):
        lr = args.lr * 0.1 ** bisect.bisect_right(
            [30 * steps_per_epoch, 60 * steps_per_epoch, 80 * steps_per_epoch], step
        )
        if step < 5 * steps_per_epoch:  # warmup
            lr = args.lr * (step / (5 * steps_per_epoch))
        for param_group in opt.param_groups:
            param_group["lr"] = lr
        return lr

    # start training
    objs = AverageMeter("Loss")
    top1 = AverageMeter("Acc@1")
    top5 = AverageMeter("Acc@5")
    clck = AverageMeter("Time")
    for step in range(0, args.epochs * steps_per_epoch):
        lr = adjust_learning_rate(step)

        t = time.time()
        image, label = next(train_queue)
        image = megengine.tensor(image, dtype="float32")
        label = megengine.tensor(label, dtype="int32")

        loss, acc1, acc5 = train_step(image, label)

        objs.update(loss.item())
        top1.update(100 * acc1.item())
        top5.update(100 * acc5.item())
        clck.update(time.time() - t)

        if step % args.print_freq == 0 and dist.get_rank() == 0:
            logging.info(
                "Epoch %d Step %d, LR %.4f, %s %s %s %s",
                step // steps_per_epoch,
                step,
                lr,
                objs,
                top1,
                top5,
                clck,
            )
            objs.reset()
            top1.reset()
            top5.reset()
            clck.reset()

        if (step + 1) % steps_per_epoch == 0:
            model.eval()
            _, valid_acc1, valid_acc5 = valid(valid_step, valid_dataloader, args)
            model.train()
            logging.info(
                "Epoch %d Test Acc@1 %.3f, Acc@5 %.3f",
                (step + 1) // steps_per_epoch,
                valid_acc1,
                valid_acc5,
            )
            megengine.save(
                {
                    "epoch": (step + 1) // steps_per_epoch,
                    "state_dict": model.state_dict(),
                },
                os.path.join(args.save, args.arch, "checkpoint.pkl"),
            )

def worker(args):  # pylint: disable=too-many-statements
    rank = dist.get_rank()
    world_size = dist.get_world_size()
    if rank == 0:
        os.makedirs(os.path.join(args.save, args.arch), exist_ok=True)
        megengine.logger.set_log_file(os.path.join(args.save, args.arch, "log.txt"))

    # build dataset
    train_dataloader, valid_dataloader = build_dataset(args)
    train_queue = iter(train_dataloader)  # infinite
    steps_per_epoch = args.steps_per_epoch

    # build model
    model = UNetD(3)

    # Sync parameters
    if world_size > 1:
        dist.bcast_list_(model.parameters(), dist.WORLD)

    # Autodiff gradient manager
    gm = autodiff.GradManager().attach(
        model.parameters(),
        callbacks=dist.make_allreduce_cb("SUM") if world_size > 1 else None,
    )

    # Optimizer
    opt = optim.Adam(
        model.parameters(),
        lr=args.lr,
        weight_decay=args.weight_decay * world_size,  # scale weight decay in "SUM" mode
    )

    # mixup
    def preprocess(image, label):
        if args.dnd:
            image, label = MixUp_AUG(image, label)
        return image, label

    # train and valid func
    def train_step(image, label):
        with gm:
            logits = model(image)
            logits = image - logits
            loss = F.nn.l1_loss(logits, label)
            gm.backward(loss)
            opt.step().clear_grad()
        return loss

    def valid_step(image, label):
        pred = model(image)
        pred = image - pred
        mae_iter = F.nn.l1_loss(pred, label)
        psnr_it = batch_PSNR(pred, label)
        # print(psnr_it.item())
        if world_size > 1:
            mae_iter = F.distributed.all_reduce_sum(mae_iter) / world_size
            psnr_it = F.distributed.all_reduce_sum(psnr_it) / world_size
        return mae_iter, psnr_it

    # cosine learning rate scheduler
    def adjust_learning_rate(step):
        # lr = 1e-6 + 0.5 * (args.lr - 1e-6) * (1 + np.cos(step / (args.epochs * steps_per_epoch) * np.pi))
        lr = args.lr * (np.cos(step / (steps_per_epoch * args.epochs) * np.pi) + 1) / 2
        for param_group in opt.param_groups:
            param_group["lr"] = lr
        return lr

    # start training
    for step in range(0, int(args.epochs * steps_per_epoch)):
        lr = adjust_learning_rate(step)

        t_step = time.time()
        image, label = next(train_queue)
        if step > steps_per_epoch:
            image, label = preprocess(image, label)
        image = megengine.tensor(image)
        label = megengine.tensor(label)
        t_data = time.time() - t_step

        loss = train_step(image, label)
        t_train = time.time() - t_step
        speed = 1.0 / t_train

        if step % args.print_freq == 0 and dist.get_rank() == 0:
            logging.info(
                "Epoch {} Step {}, Speed={:.2g} mb/s, dp_cost={:.2g}, Loss={:5.2e}, lr={:.2e}".format(
                    step // int(steps_per_epoch), step, speed, t_data / t_train, loss.item(), lr
                )
            )

        if (step + 1) % steps_per_epoch == 0:
            model.eval()
            loss, psnr_v = valid(valid_step, valid_dataloader)
            model.train()
            logging.info(
                "Epoch {} Test mae {:.3f}, psnr {:.3f}".format(
                    (step + 1) // steps_per_epoch, loss.item(), psnr_v.item()
                )
            )
            if rank == 0:
                megengine.save(
                    {
                        "epoch": (step + 1) // steps_per_epoch,
                        "state_dict": model.state_dict(),
                    },
                    os.path.join(args.save, args.arch, "checkpoint.pkl"),
                )

def worker(master_ip, port, rank, world_size, args):
    if world_size > 1:
        # Initialize distributed process group
        logger.info("init distributed process group {} / {}".format(rank, world_size))
        dist.init_process_group(
            master_ip=master_ip,
            port=port,
            world_size=world_size,
            rank=rank,
            device=rank,
        )

    model_name = "{}_{}x{}".format(args.arch, cfg.input_shape[0], cfg.input_shape[1])
    save_dir = os.path.join(args.save, model_name)

    model = getattr(kpm, args.arch)()
    model.train()
    start_epoch = 0
    if args.resume is not None:
        file = mge.load(args.resume)
        model.load_state_dict(file["state_dict"])
        start_epoch = file["epoch"]

    optimizer = optim.Adam(
        model.parameters(), lr=cfg.initial_lr, weight_decay=cfg.weight_decay
    )

    gm = GradManager()
    if dist.get_world_size() > 1:
        gm.attach(
            model.parameters(),
            callbacks=[dist.make_allreduce_cb("SUM", dist.WORLD)],
        )
    else:
        gm.attach(model.parameters())

    if dist.get_world_size() > 1:
        dist.bcast_list_(model.parameters(), dist.WORLD)  # sync parameters

    # Build train datasets
    logger.info("preparing dataset..")
    ann_file = os.path.join(
        cfg.data_root, "annotations", "person_keypoints_train2017.json"
    )
    train_dataset = COCOJoints(
        cfg.data_root,
        ann_file,
        image_set="train2017",
        order=("image", "keypoints", "boxes", "info"),
    )
    logger.info("Num of Samples: {}".format(len(train_dataset)))
    train_sampler = data.RandomSampler(
        train_dataset, batch_size=cfg.batch_size, drop_last=True
    )

    transforms = [
        T.Normalize(mean=cfg.img_mean, std=cfg.img_std),
        RandomHorizontalFlip(0.5, keypoint_flip_order=cfg.keypoint_flip_order),
    ]
    if cfg.half_body_transform:
        transforms.append(
            HalfBodyTransform(
                cfg.upper_body_ids, cfg.lower_body_ids, cfg.prob_half_body
            )
        )
    if cfg.extend_boxes:
        transforms.append(
            ExtendBoxes(cfg.x_ext, cfg.y_ext, cfg.input_shape[1] / cfg.input_shape[0])
        )
    transforms += [
        RandomBoxAffine(
            degrees=cfg.rotate_range,
            scale=cfg.scale_range,
            output_shape=cfg.input_shape,
            rotate_prob=cfg.rotation_prob,
            scale_prob=cfg.scale_prob,
        )
    ]
    transforms += [T.ToMode()]

    train_queue = data.DataLoader(
        train_dataset,
        sampler=train_sampler,
        num_workers=args.workers,
        transform=T.Compose(transforms=transforms, order=train_dataset.order),
        collator=HeatmapCollator(
            cfg.input_shape,
            cfg.output_shape,
            cfg.keypoint_num,
            cfg.heat_thr,
            cfg.heat_kernels if args.multi_scale_supervision else cfg.heat_kernels[-1:],
            cfg.heat_range,
        ),
    )

    # Start training
    for epoch in range(start_epoch, cfg.epochs):
        loss = train(model, train_queue, optimizer, gm, epoch=epoch)
        logger.info("Epoch %d Train %.6f", epoch, loss)

        if rank == 0 and epoch % cfg.save_freq == 0:
            # save checkpoint
            mge.save(
                {"epoch": epoch + 1, "state_dict": model.state_dict()},
                os.path.join(save_dir, "epoch_{}.pkl".format(epoch)),
            )

def worker(master_ip, port, world_size, rank, configs):
    if world_size > 1:
        dist.init_process_group(
            master_ip=master_ip,
            port=port,
            world_size=world_size,
            rank=rank,
            device=rank,
        )
        logger.info("init process group for gpu{} done".format(rank))

    # set up logger
    os.makedirs(configs["base_dir"], exist_ok=True)
    worklog_path = os.path.join(configs["base_dir"], "worklog.txt")
    mge.set_log_file(worklog_path)

    # prepare model-related components
    model = FaceRecognitionModel(configs)

    # prepare data-related components
    preprocess = T.Compose([T.Normalize(mean=127.5, std=128), T.ToMode("CHW")])
    augment = T.Compose([T.RandomHorizontalFlip()])

    train_dataset = get_train_dataset(configs["dataset"], dataset_dir=configs["dataset_dir"])
    train_sampler = data.RandomSampler(
        train_dataset, batch_size=configs["batch_size"], drop_last=True
    )
    train_queue = data.DataLoader(
        train_dataset, sampler=train_sampler, transform=T.Compose([augment, preprocess])
    )

    # prepare optimize-related components
    configs["learning_rate"] = configs["learning_rate"] * dist.get_world_size()
    if dist.get_world_size() > 1:
        dist.bcast_list_(model.parameters())
        gm = ad.GradManager().attach(
            model.parameters(), callbacks=[dist.make_allreduce_cb("mean")]
        )
    else:
        gm = ad.GradManager().attach(model.parameters())
    opt = optim.SGD(
        model.parameters(),
        lr=configs["learning_rate"],
        momentum=configs["momentum"],
        weight_decay=configs["weight_decay"],
    )

    # try to load checkpoint
    model, start_epoch = try_load_latest_checkpoint(model, configs["base_dir"])

    # do training
    def train_one_epoch():
        def train_func(images, labels):
            opt.clear_grad()
            with gm:
                loss, accuracy, _ = model(images, labels)
                gm.backward(loss)
                if dist.is_distributed():  # all_reduce_mean
                    loss = dist.functional.all_reduce_sum(loss) / dist.get_world_size()
                    accuracy = dist.functional.all_reduce_sum(accuracy) / dist.get_world_size()
            opt.step()
            return loss, accuracy

        model.train()

        average_loss = AverageMeter("loss")
        average_accuracy = AverageMeter("accuracy")
        data_time = AverageMeter("data_time")
        train_time = AverageMeter("train_time")

        total_step = len(train_queue)
        data_iter = iter(train_queue)
        for step in range(total_step):
            # get next batch of data
            data_tic = time.time()
            images, labels = next(data_iter)
            data_toc = time.time()

            # forward pass & backward pass
            train_tic = time.time()
            images = mge.tensor(images, dtype="float32")
            labels = mge.tensor(labels, dtype="int32")
            loss, accuracy = train_func(images, labels)
            train_toc = time.time()

            # do the statistics and logging
            n = images.shape[0]
            average_loss.update(loss.item(), n)
            average_accuracy.update(accuracy.item() * 100, n)
            data_time.update(data_toc - data_tic)
            train_time.update(train_toc - train_tic)
            if step % configs["log_interval"] == 0 and dist.get_rank() == 0:
                logger.info(
                    "epoch: %d, step: %d, %s, %s, %s, %s",
                    epoch,
                    step,
                    average_loss,
                    average_accuracy,
                    data_time,
                    train_time,
                )

    for epoch in range(start_epoch, configs["num_epoch"]):
        adjust_learning_rate(opt, epoch, configs)
        train_one_epoch()

        if dist.get_rank() == 0:
            checkpoint_path = os.path.join(
                configs["base_dir"], f"epoch-{epoch+1}-checkpoint.pkl"
            )
            mge.save(
                {"epoch": epoch + 1, "state_dict": model.state_dict()},
                checkpoint_path,
            )

def __init__( self, dataset, batch_size=1, drop_last=False, num_samples=None, world_size=None, rank=None, seed=None, ): r""" An abstract class for all sampler. :type dataset: `dataset` :param dataset: dataset to sample from. :type batch_size: positive integer :param batch_size: batch size for batch method. :type drop_last: bool :param drop_last: set ``True`` to drop the last incomplete batch, if the dataset size is not divisible by the batch size. If ``False`` and the size of dataset is not divisible by the batch_size, then the last batch will be smaller. Default: False :type num_samples: positive integer :param num_samples: number of samples assigned to one rank. :type world_size: positive integer :param world_size: number of ranks. :type rank: non-negative integer within 0 and world_size :param rank: rank id, non-negative interger within 0 and ``world_size``. :type seed: non-negative integer :param seed: seed for random operators. """ if (not isinstance(batch_size, int) or isinstance(batch_size, bool) or batch_size <= 0): raise ValueError("batch_size should be a positive integer value, " "but got batch_size={}".format(batch_size)) if not isinstance(drop_last, bool): raise ValueError("drop_last should be a boolean value, but got " "drop_last={}".format(drop_last)) if num_samples is not None and (not isinstance(num_samples, int) or isinstance(num_samples, bool) or num_samples <= 0): raise ValueError( "num_samples should be a positive integer " "value, but got num_samples={}".format(num_samples)) self.batch_size = batch_size self.dataset = dataset self.drop_last = drop_last if world_size is None: world_size = dist.get_world_size() if dist.is_distributed() else 1 self.world_size = world_size if rank is None: rank = dist.get_rank() if dist.is_distributed() else 0 self.rank = rank if num_samples is None: num_samples = len(self.dataset) self.num_samples = int(math.ceil(num_samples / self.world_size)) # Make sure seeds are the same at each rank if seed is None and self.world_size > 1: seed = 0 self.rng = np.random.RandomState(seed)