def test_reduce_sum():
    world_size = 2
    port = dist.get_free_ports(1)[0]
    server = dist.Server(port)

    def worker(rank, data, expect, port):
        if mge.get_device_count("gpu") < world_size:
            return
        dist.init_process_group("localhost", port, world_size, rank, rank)
        inp = tensor(data)
        output = reduce_sum(inp)
        if rank == 0:
            assert np.allclose(output.numpy(), expect)
        else:
            assert np.allclose(output.numpy(), 0)

    def check(shape):
        x = np.random.rand(*shape).astype("float32")
        y = np.random.rand(*shape).astype("float32")
        z = x + y
        p0 = mp.Process(target=worker, args=(0, x, z, port))
        p1 = mp.Process(target=worker, args=(1, y, None, port))
        p0.start()
        p1.start()
        p0.join(10)
        p1.join(10)
        assert p0.exitcode == 0 and p1.exitcode == 0

    for shape in [(2, 3), (8, 10), (99, 77)]:
        check(shape)
def test_all_to_all():
    world_size = 2
    port = dist.get_free_ports(1)[0]
    server = dist.Server(port)

    def worker(rank, data, expect, port):
        if mge.get_device_count("gpu") < world_size:
            return
        dist.init_process_group("localhost", port, world_size, rank, rank)
        inp = tensor(data)
        output = all_to_all(inp)
        assert np.allclose(output.numpy(), expect)

    def check(shape):
        x = np.random.rand(*shape).astype("float32")
        y = np.random.rand(*shape).astype("float32")
        a = np.concatenate((x[: shape[0] // 2], y[: shape[0] // 2]))
        b = np.concatenate((x[shape[0] // 2 :], y[shape[0] // 2 :]))
        p0 = mp.Process(target=worker, args=(0, x, a, port))
        p1 = mp.Process(target=worker, args=(1, y, b, port))
        p0.start()
        p1.start()
        p0.join(10)
        p1.join(10)
        assert p0.exitcode == 0 and p1.exitcode == 0

    for shape in [(2, 3), (8, 10), (100, 77)]:
        check(shape)
def test_broadcast():
    world_size = 2
    server = dist.Server()
    port = server.py_server_port

    def worker(rank, data, expect, port):
        if mge.get_device_count("gpu") < world_size:
            return
        dist.init_process_group("localhost", port, world_size, rank, rank)
        inp = tensor(data)
        output = broadcast(inp)
        assert np.allclose(output.numpy(), expect)

    def check(shape):
        x = np.random.rand(*shape).astype("float32")
        y = x + 1
        p0 = mp.Process(target=worker, args=(0, x, x, port))
        p1 = mp.Process(target=worker, args=(1, y, x, port))
        p0.start()
        p1.start()
        p0.join(10)
        p1.join(10)
        assert p0.exitcode == 0 and p1.exitcode == 0

    for shape in [(2, 3), (8, 10), (99, 77)]:
        check(shape)
def run_test(
    model_path, use_jit, use_symbolic, sublinear_memory_config=None, max_err=None,
):
    """
    Load the model with test cases and run the training for one iteration.
    The loss and updated weights are compared with reference values to verify
    correctness.

    If you believe the test fails because of numerical rounding errors rather
    than a bug, dump a new reference file by calling update_model. Please think
    twice before you do so.
    """
    checkpoint = mge.load(model_path)
    data = checkpoint["data"]
    label = checkpoint["label"]

    port = dist.get_free_ports(1)[0]
    server = dist.Server(port)

    def worker(rank, max_err):
        dist.init_process_group("localhost", port, p_num, rank, rank)
        net = MnistNet(has_bn=True)
        net.load_state_dict(checkpoint["net_init"])
        lr = checkpoint["sgd_lr"]
        opt = SGD(net.parameters(), lr=lr)

        gm = ad.GradManager().attach(
            net.parameters(), callbacks=[dist.make_allreduce_cb("MEAN", dist.WORLD)]
        )

        # use the same data and label on every GPU
        # so that the result does not depend on the number of GPUs
        data_train = Tensor(data)
        label_train = Tensor(label)

        loss = train(data_train, label_train, net, opt, gm)
        np.testing.assert_allclose(loss.numpy(), checkpoint["loss"], atol=max_err)

        if dist.get_rank():
            return

        for param, param_ref in zip(
            net.state_dict().items(), checkpoint["net_updated"].items()
        ):
            assert param[0] == param_ref[0]
            if "bn" in param[0]:
                ref = param_ref[1].reshape(param[1].shape)
                np.testing.assert_allclose(param[1], ref, atol=max_err)
            else:
                np.testing.assert_allclose(param[1], param_ref[1], atol=max_err)

    procs = []
    for rank in range(p_num):
        p = mp.Process(target=worker, args=(rank, max_err,))
        p.start()
        procs.append(p)

    for p in procs:
        p.join(20)
        assert p.exitcode == 0
def test_synchronized():
    world_size = 2
    port = dist.get_free_ports(1)[0]
    server = dist.Server(port)

    @dist.synchronized
    def func(rank, q):
        q.put(rank)

    def worker(rank, q):
        dist.init_process_group("localhost", port, world_size, rank, rank)
        dist.group_barrier()
        if rank == 0:
            func(0, q)  # q.put(0)
            q.put(2)
        else:
            _assert_q_val(q, 0)  # func executed in rank 0
            _assert_q_empty(q)  # q.put(2) is not executed
            func(1, q)
            _assert_q_val(q, 1)  # func in rank 1 executed earlier than q.put(2) in rank 0
            _assert_q_val(q, 2)  # q.put(2) executed in rank 0

    Q = mp.Queue()
    procs = []
    for rank in range(world_size):
        p = mp.Process(target=worker, args=(rank, Q))
        p.start()
        procs.append(p)

    for p in procs:
        p.join(20)
        assert p.exitcode == 0
def test_init_process_group():
    world_size = 2
    port = dist.get_free_ports(1)[0]
    server = dist.Server(port)

    def worker(rank, backend):
        dist.init_process_group("localhost", port, world_size, rank, rank, backend)
        assert dist.is_distributed() == True
        assert dist.get_rank() == rank
        assert dist.get_world_size() == world_size
        assert dist.get_backend() == backend

        py_server_addr = dist.get_py_server_addr()
        assert py_server_addr[0] == "localhost"
        assert py_server_addr[1] == port

        mm_server_addr = dist.get_mm_server_addr()
        assert mm_server_addr[0] == "localhost"
        assert mm_server_addr[1] > 0

        assert isinstance(dist.get_client(), dist.Client)

    def check(backend):
        procs = []
        for rank in range(world_size):
            p = mp.Process(target=worker, args=(rank, backend))
            p.start()
            procs.append(p)
        for p in procs:
            p.join(20)
            assert p.exitcode == 0

    check("nccl")
def main():
    parser = make_parser()
    args = parser.parse_args()

    # ------------------------ begin training -------------------------- #
    logger.info("Device Count = %d", args.ngpus)

    log_dir = "log-of-{}".format(os.path.basename(args.file).split(".")[0])
    if not os.path.isdir(log_dir):
        os.makedirs(log_dir)

    if args.ngpus > 1:
        master_ip = "localhost"
        port = dist.get_free_ports(1)[0]
        dist.Server(port)

        processes = list()
        for rank in range(args.ngpus):
            process = mp.Process(
                target=worker, args=(master_ip, port, args.ngpus, rank, args)
            )
            process.start()
            processes.append(process)

        for p in processes:
            p.join()
    else:
        worker(None, None, 1, 0, args)
def test_io_remote():
    world_size = 2
    server = dist.Server()
    port = server.py_server_port
    val = np.random.rand(4, 5).astype(np.float32)

    def worker(rank):
        if mge.get_device_count("gpu") < world_size:
            return
        if rank == 0:  # remote send
            dist.init_process_group("localhost", port, world_size, rank, rank)
            x = Tensor(val, device="gpu0")
            y = remote_send(x, 1)
            assert y.numpy()[0] == 0
        else:  # remote recv
            dist.init_process_group("localhost", port, world_size, rank, rank)
            y = remote_recv(0, val.shape, val.dtype)
            assert y.device == "gpu1"
            np.testing.assert_almost_equal(val, y.numpy())

    procs = []
    for rank in range(world_size):
        p = mp.Process(target=worker, args=(rank,))
        p.start()
        procs.append(p)

    for p in procs:
        p.join(10)
        assert p.exitcode == 0
def test_group_barrier():
    world_size = 2
    port = dist.get_free_ports(1)[0]
    server = dist.Server(port)

    def worker(rank, q):
        dist.init_process_group("localhost", port, world_size, rank, rank)
        dist.group_barrier()
        if rank == 0:
            dist.group_barrier()
            q.put(0)  # to be observed in rank 1
        else:
            _assert_q_empty(q)  # q.put(0) is not executed in rank 0
            dist.group_barrier()
            _assert_q_val(q, 0)  # q.put(0) executed in rank 0

    Q = mp.Queue()
    procs = []
    for rank in range(world_size):
        p = mp.Process(target=worker, args=(rank, Q))
        p.start()
        procs.append(p)

    for p in procs:
        p.join(20)
        assert p.exitcode == 0
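# The queue assertions used by test_synchronized and test_group_barrier above are
# not defined in this section. Below is a minimal sketch of what they might look
# like, assuming they simply poll the multiprocessing queue with a short timeout
# (the helper names come from the tests; the timeout values are assumptions):
import queue


def _assert_q_empty(q, timeout=1):
    # expect that nothing arrives within the timeout window
    try:
        q.get(timeout=timeout)
    except queue.Empty:
        return
    assert False, "queue should be empty"


def _assert_q_val(q, val, timeout=5):
    # expect the next queued item to equal `val`
    assert q.get(timeout=timeout) == val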
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-f", "--file", default="net.py", type=str, help="net description file"
    )
    parser.add_argument(
        "-w", "--weight_file", default=None, type=str, help="weights file",
    )
    parser.add_argument(
        "-n", "--devices", default=1, type=int, help="total number of gpus for testing",
    )
    parser.add_argument(
        "-d", "--dataset_dir", default="/data/datasets", type=str,
    )
    args = parser.parse_args()

    current_network = import_from_file(args.file)
    cfg = current_network.Cfg()

    result_list = []
    if args.devices > 1:
        result_queue = Queue(500)

        master_ip = "localhost"
        server = dist.Server()
        port = server.py_server_port

        procs = []
        for i in range(args.devices):
            proc = Process(
                target=worker,
                args=(
                    current_network,
                    args.weight_file,
                    args.dataset_dir,
                    result_queue,
                    master_ip,
                    port,
                    args.devices,
                    i,
                ),
            )
            proc.start()
            procs.append(proc)

        num_imgs = dict(VOC2012=1449, Cityscapes=500)
        for _ in tqdm(range(num_imgs[cfg.dataset])):
            result_list.append(result_queue.get())

        for p in procs:
            p.join()
    else:
        worker(current_network, args.weight_file, args.dataset_dir, result_list)

    if cfg.val_save_path is not None:
        save_results(result_list, cfg.val_save_path, cfg)
    logger.info("Start evaluation!")
    compute_metric(result_list, cfg)
def main():
    timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime())
    args = parse_args()
    cfg = Config.fromfile(args.config)
    cfg.dynamic = args.dynamic
    cfg.ensemble = args.ensemble

    if args.work_dir is not None:
        cfg.work_dir = args.work_dir
    else:
        assert cfg.get('work_dir', None) is not None, \
            'work_dir is not given on the command line, so it must be set in the config file'
    cfg.work_dir = os.path.join(cfg.work_dir, timestamp)
    mkdir_or_exist(os.path.abspath(cfg.work_dir))

    log_file = os.path.join(cfg.work_dir, 'root.log')
    logger = get_root_logger(log_file=log_file, log_level=cfg.log_level)
    logger.info('Config:\n{}'.format(cfg.text))

    gpu_list = [item.strip() for item in args.gpuids.split(",")]
    if gpu_list[0] == "-1":
        world_size = 0  # use cpu
        logger.info('test uses cpu only')
    else:
        world_size = len(gpu_list)
        logger.info('test gpus num: {}'.format(world_size))
        # assert world_size <= mge.get_device_count("gpu")

    if world_size == 0:  # use cpu
        mge.set_default_device(device='cpux')
    elif world_size == 1:
        mge.set_default_device(device='gpu' + gpu_list[0])
    else:
        pass

    if world_size > 1:
        port = dist.util.get_free_ports(1)[0]
        server = dist.Server(port)

        processes = []
        for rank in range(world_size):
            logger.info("init distributed process group {} / {}".format(rank, world_size))
            p = mp.Process(
                target=worker,
                args=(rank, world_size, cfg, gpu_list[rank], port),
            )
            p.start()
            processes.append(p)

        for rank in range(world_size):
            processes[rank].join()
            code = processes[rank].exitcode
            assert code == 0, "subprocess {} exit with code {}".format(rank, code)
    else:
        worker(0, 1, cfg)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-a",
        "--arch",
        default="simplebaseline_res50",
        type=str,
        choices=cfg.model_choices,
    )
    parser.add_argument("-s", "--save", default="/data/models", type=str)
    parser.add_argument("-b", "--batch_size", default=32, type=int)
    parser.add_argument("-lr", "--initial_lr", default=3e-4, type=float)
    parser.add_argument("--resume", default=None, type=str)
    parser.add_argument("--multi_scale_supervision", action="store_true")
    parser.add_argument("-n", "--ngpus", default=8, type=int)
    parser.add_argument("-w", "--workers", default=8, type=int)
    args = parser.parse_args()

    model_name = "{}_{}x{}".format(args.arch, cfg.input_shape[0], cfg.input_shape[1])
    save_dir = os.path.join(args.save, model_name)
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    mge.set_log_file(os.path.join(save_dir, "log.txt"))

    if args.batch_size != cfg.batch_size:
        cfg.batch_size = args.batch_size
    if args.initial_lr != cfg.initial_lr:
        cfg.initial_lr = args.initial_lr

    world_size = mge.get_device_count("gpu") if args.ngpus is None else args.ngpus

    if world_size > 1:
        # scale learning rate by number of gpus
        master_ip = "localhost"
        port = dist.get_free_ports(1)[0]
        dist.Server(port)
        cfg.weight_decay *= world_size

        # start distributed training, dispatch sub-processes
        processes = []
        for rank in range(world_size):
            p = mp.Process(
                target=worker, args=(master_ip, port, rank, world_size, args)
            )
            p.start()
            processes.append(p)

        for p in processes:
            p.join()
    else:
        worker(None, None, 0, 1, args)
def test_syncbn(enable_amp):
    nr_chan = 8
    data_shape = (3, nr_chan, 4, 16)
    momentum = 0.9
    eps = 1e-5
    running_mean = np.zeros((1, nr_chan, 1, 1), dtype=np.float32)
    running_var = np.ones((1, nr_chan, 1, 1), dtype=np.float32)
    steps = 4
    nr_ranks = 2
    server = dist.Server()
    port = server.py_server_port

    @dist.launcher(n_gpus=2)
    def worker(data, yv_expect, running_mean, running_var):
        with amp.autocast(enabled=enable_amp):
            rank = dist.get_rank()
            bn = SyncBatchNorm(nr_chan, momentum=momentum, eps=eps)
            for i in range(steps):
                yv = bn(Tensor(data[rank][i]))
            if enable_amp:
                np.testing.assert_allclose(
                    yv.numpy(), yv_expect[rank], atol=5e-4, rtol=5e-4
                )
            else:
                _assert_allclose(yv.numpy(), yv_expect[rank])
            _assert_allclose(bn.running_mean.numpy(), running_mean)
            _assert_allclose(bn.running_var.numpy(), running_var)

    xv = []
    for i in range(steps):
        xv.append(np.random.normal(loc=2.3, size=data_shape).astype(np.float32))
        xv_transposed = np.transpose(xv[i], [0, 2, 3, 1]).reshape(
            (data_shape[0] * data_shape[2] * data_shape[3], nr_chan)
        )

        mean = np.mean(xv_transposed, axis=0).reshape(1, nr_chan, 1, 1)

        var_biased = np.var(xv_transposed, axis=0).reshape((1, nr_chan, 1, 1))
        sd = np.sqrt(var_biased + eps)

        var_unbiased = np.var(xv_transposed, axis=0, ddof=1).reshape((1, nr_chan, 1, 1))

        running_mean = running_mean * momentum + mean * (1 - momentum)
        running_var = running_var * momentum + var_unbiased * (1 - momentum)

        yv_expect = (xv[i] - mean) / sd

    data = []
    for i in range(nr_ranks):
        data.append([])
        for j in range(steps):
            data[i].append(xv[j][:, :, :, i * 8 : i * 8 + 8])

    yv_expect = [yv_expect[:, :, :, i * 8 : i * 8 + 8] for i in range(nr_ranks)]

    worker(data, yv_expect, running_mean, running_var)
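# Both SyncBatchNorm tests in this section rely on an _assert_allclose helper that
# is not defined here. A minimal sketch, assuming it is just a thin wrapper around
# np.testing.assert_allclose with fixed float32-friendly tolerances (the exact
# atol/rtol values are assumptions):
import numpy as np


def _assert_allclose(actual, desired, *, atol=5e-6, rtol=5e-6):
    # element-wise comparison within absolute/relative tolerance
    np.testing.assert_allclose(actual, desired, atol=atol, rtol=rtol)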
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-f", "--file", default="net.py", type=str, help="net description file"
    )
    parser.add_argument(
        "-n", "--ngpus", type=int, default=8, help="number of gpus for training"
    )
    parser.add_argument(
        "-d", "--dataset_dir", type=str, default="/data/datasets",
    )
    parser.add_argument(
        "-r", "--resume", type=str, default=None, help="resume model file"
    )
    args = parser.parse_args()

    # ------------------------ begin training -------------------------- #
    logger.info("Device Count = %d", args.ngpus)

    log_dir = "log-of-{}".format(os.path.basename(args.file).split(".")[0])
    if not os.path.isdir(log_dir):
        os.makedirs(log_dir)

    if args.ngpus > 1:
        master_ip = "localhost"
        port = dist.get_free_ports(1)[0]
        dist.Server(port)

        processes = list()
        for rank in range(args.ngpus):
            process = mp.Process(
                target=worker, args=(master_ip, port, args.ngpus, rank, args)
            )
            process.start()
            processes.append(process)

        for p in processes:
            p.join()
    else:
        worker(None, None, 1, 0, args)
def test_dist_grad():
    world_size = 2
    x_np = np.random.rand(10).astype("float32")
    port = dist.get_free_ports(1)[0]
    server = dist.Server(port)

    def worker0():
        dist.init_process_group("localhost", port, world_size, 0, 0)
        mge.device.set_default_device("gpu0")
        grad = Grad()

        x = as_tensor(x_np)
        grad.wrt(x, callback=save_to(x))
        # need a placeholder to trace operator
        send_x = remote_send(x, 1)
        recv_x = remote_recv(1, x_np.shape, x_np.dtype, "gpu0")
        y = recv_x * recv_x

        grad([y], [as_tensor(np.ones_like(x_np))])
        np.testing.assert_almost_equal(x.grad.numpy(), x.numpy() * 2)

    def worker1():
        dist.init_process_group("localhost", port, world_size, 1, 1)
        mge.device.set_default_device("gpu1")
        grad = Grad()

        recv_x = remote_recv(0, x_np.shape, x_np.dtype, "gpu1")
        send_x = remote_send(recv_x, 0)

        grad([], [])

        # sync because grad has a send operator
        sync()
        send_x.device._cn._sync_all()

    import multiprocessing as mp

    p0 = mp.Process(target=worker0)
    p1 = mp.Process(target=worker1)
    p0.start()
    p1.start()
    p0.join(10)
    p1.join(10)
    assert p0.exitcode == 0 and p1.exitcode == 0
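# test_dist_grad builds its gradient callback with a save_to helper that is not
# defined in this section. A plausible minimal sketch, assuming it simply records
# the incoming gradient on the tensor object so the test can later read x.grad
# (the attribute name "grad" matches that access; everything else is an assumption):
def save_to(self, name="grad"):
    def callback(grad):
        # store the computed gradient as an attribute on the wrapped tensor
        setattr(self, name, grad)

    return callback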
def test_user_set_get():
    world_size = 2
    port = dist.get_free_ports(1)[0]
    server = dist.Server(port)

    def worker(rank):
        dist.init_process_group("localhost", port, world_size, rank, rank)
        # set in race condition
        dist.get_client().user_set("foo", 1)
        # get in race condition
        ret = dist.get_client().user_get("foo")
        assert ret == 1

    procs = []
    for rank in range(world_size):
        p = mp.Process(target=worker, args=(rank,))
        p.start()
        procs.append(p)

    for p in procs:
        p.join(20)
        assert p.exitcode == 0
def main(args):
    configs = load_config_from_path(args.config_file)

    num_devices = dist.helper.get_device_count_by_fork("gpu")
    if num_devices > 1:
        # distributed training
        master_ip = "localhost"
        port = dist.get_free_ports(1)[0]
        dist.Server(port)

        processes = []
        for rank in range(num_devices):
            process = mp.Process(
                target=worker, args=(master_ip, port, num_devices, rank, configs)
            )
            process.start()
            processes.append(process)

        for p in processes:
            p.join()
    else:
        # non-distributed training
        worker(None, None, 1, 0, configs)
def test_sync_min_max_observer():
    x = np.random.rand(6, 3, 3, 3).astype("float32")
    np_min, np_max = x.min(), x.max()
    world_size = 2
    port = dist.get_free_ports(1)[0]
    server = dist.Server(port)

    def worker(rank, slc):
        dist.init_process_group("localhost", port, world_size, rank, rank)
        m = ob.SyncMinMaxObserver()
        y = mge.tensor(x[slc])
        m(y)
        assert m.min_val == np_min and m.max_val == np_max

    procs = []
    for rank in range(world_size):
        slc = slice(rank * 3, (rank + 1) * 3)
        p = mp.Process(target=worker, args=(rank, slc,), daemon=True)
        p.start()
        procs.append(p)

    for p in procs:
        p.join(20)
        assert p.exitcode == 0
def test_new_group():
    world_size = 3
    ranks = [2, 0]
    port = dist.get_free_ports(1)[0]
    server = dist.Server(port)

    def worker(rank):
        dist.init_process_group("localhost", port, world_size, rank, rank)
        if rank in ranks:
            group = dist.new_group(ranks)
            assert group.size == 2
            assert group.key == "2,0"
            assert group.rank == ranks.index(rank)
            assert group.comp_node == "gpu{}:2".format(rank)

    procs = []
    for rank in range(world_size):
        p = mp.Process(target=worker, args=(rank,))
        p.start()
        procs.append(p)

    for p in procs:
        p.join(20)
        assert p.exitcode == 0
def main():
    # pylint: disable=import-outside-toplevel,too-many-branches,too-many-statements
    from pycocotools.coco import COCO
    from pycocotools.cocoeval import COCOeval

    parser = make_parser()
    args = parser.parse_args()

    current_network = import_from_file(args.file)
    cfg = current_network.Cfg()

    if args.weight_file:
        args.start_epoch = args.end_epoch = -1
    else:
        if args.start_epoch == -1:
            args.start_epoch = cfg.max_epoch - 1
        if args.end_epoch == -1:
            args.end_epoch = args.start_epoch
        assert 0 <= args.start_epoch <= args.end_epoch < cfg.max_epoch

    for epoch_num in range(args.start_epoch, args.end_epoch + 1):
        if args.weight_file:
            weight_file = args.weight_file
        else:
            weight_file = "log-of-{}/epoch_{}.pkl".format(
                os.path.basename(args.file).split(".")[0], epoch_num
            )

        result_list = []
        if args.devices > 1:
            result_queue = Queue(2000)

            master_ip = "localhost"
            server = dist.Server()
            port = server.py_server_port

            procs = []
            for i in range(args.devices):
                proc = Process(
                    target=worker,
                    args=(
                        current_network,
                        weight_file,
                        args.dataset_dir,
                        result_queue,
                        master_ip,
                        port,
                        args.devices,
                        i,
                    ),
                )
                proc.start()
                procs.append(proc)

            num_imgs = dict(coco=5000, objects365=30000)
            for _ in tqdm(range(num_imgs[cfg.test_dataset["name"]])):
                result_list.append(result_queue.get())

            for p in procs:
                p.join()
        else:
            worker(current_network, weight_file, args.dataset_dir, result_list)

        all_results = DetEvaluator.format(result_list, cfg)
        json_path = "log-of-{}/epoch_{}.json".format(
            os.path.basename(args.file).split(".")[0], epoch_num
        )
        all_results = json.dumps(all_results)

        with open(json_path, "w") as fo:
            fo.write(all_results)
        logger.info("Save to %s finished, start evaluation!", json_path)

        eval_gt = COCO(
            os.path.join(
                args.dataset_dir, cfg.test_dataset["name"], cfg.test_dataset["ann_file"]
            )
        )
        eval_dt = eval_gt.loadRes(json_path)
        cocoEval = COCOeval(eval_gt, eval_dt, iouType="bbox")
        cocoEval.evaluate()
        cocoEval.accumulate()
        cocoEval.summarize()
        metrics = [
            "AP",
            "AP@0.5",
            "AP@0.75",
            "APs",
            "APm",
            "APl",
            "AR@1",
            "AR@10",
            "AR@100",
            "ARs",
            "ARm",
            "ARl",
        ]
        logger.info("mmAP".center(32, "-"))
        for i, m in enumerate(metrics):
            logger.info("|\t%s\t|\t%.03f\t|", m, cocoEval.stats[i])
        logger.info("-" * 32)
def worker(rank, gpu_num, args):
    # use sublinear memory optimization
    os.environ["MGB_COMP_GRAPH_OPT"] = (
        "enable_sublinear_memory_opt=1;seq_opt.enable_seq_comp_node_opt=0"
    )
    os.environ["MGB_SUBLINEAR_MEMORY_GENETIC_NR_ITER"] = '10'
    os.environ['MGB_CUDA_RESERVE_MEMORY'] = '1'

    # establish the server if this is the master process
    dist_port = args.port
    if rank == 0:
        dist.Server(port=dist_port)
    if gpu_num > 1:
        dist.init_process_group(
            master_ip="localhost",
            port=dist_port,
            world_size=gpu_num,
            rank=rank,
            device=rank,
        )
        logger.info("Init process group for gpu%d done", rank)

    model = network.Network()
    params = model.parameters(requires_grad=True)
    model.train()

    # Autodiff gradient manager
    gm = autodiff.GradManager().attach(
        model.parameters(), callbacks=allreduce_cb,
    )

    opt = optim.SGD(
        params,
        lr=cfg.basic_lr * gpu_num * cfg.batch_per_gpu,
        momentum=cfg.momentum,
        weight_decay=cfg.weight_decay,
    )

    if cfg.pretrain_weight is not None:
        weights = mge.load(cfg.pretrain_weight)
        del weights['fc.weight']
        del weights['fc.bias']
        model.resnet50.load_state_dict(weights)

    start_epoch = 0
    if args.resume_weights is not None:
        assert osp.exists(args.resume_weights)
        model_file = args.resume_weights
        print('Loading {} to initialize FPN...'.format(model_file))
        model_dict = mge.load(model_file)
        start_epoch, weights = model_dict['epoch'] + 1, model_dict['state_dict']
        model.load_state_dict(weights, strict=False)

    logger.info("Prepare dataset")
    # train_loader = dataset.train_dataset(rank)
    train_dataset = CrowdHuman(cfg, if_train=True)
    train_sampler = data.Infinite(
        data.RandomSampler(
            train_dataset,
            batch_size=cfg.batch_per_gpu,
            drop_last=True,
            world_size=gpu_num,
            rank=rank,
        )
    )
    train_loader = data.DataLoader(
        train_dataset,
        sampler=train_sampler,
        collator=train_dataset,
        num_workers=4,
    )
    train_loader = iter(train_loader)

    logger.info("Training...")
    for epoch_id in range(start_epoch, cfg.max_epoch):
        for param_group in opt.param_groups:
            param_group["lr"] = (
                cfg.basic_lr
                * gpu_num
                * cfg.batch_per_gpu
                * (cfg.lr_decay_rate ** bisect.bisect_right(cfg.lr_decay_sates, epoch_id))
            )

        max_steps = cfg.nr_images_epoch // (cfg.batch_per_gpu * gpu_num)
        train_one_epoch(model, gm, train_loader, opt, max_steps, rank, epoch_id, gpu_num)

        if rank == 0:
            save_path = osp.join(cfg.model_dir, 'epoch-{}.pkl'.format(epoch_id + 1))
            state_dict = model.state_dict()
            names = [k for k, _ in state_dict.items()]
            for name in names:
                if name.startswith('inputs.'):
                    del state_dict[name]
            mge.save(
                {"epoch": epoch_id, "state_dict": state_dict}, save_path,
            )
            logger.info("dump weights to %s", save_path)
def main():
    from pycocotools.coco import COCO
    from pycocotools.cocoeval import COCOeval

    parser = make_parser()
    args = parser.parse_args()

    model_name = "{}_{}x{}".format(args.arch, cfg.input_shape[0], cfg.input_shape[1])
    save_dir = os.path.join(args.save_dir, model_name)
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    mge.set_log_file(os.path.join(save_dir, "log.txt"))

    args.ngpus = (
        dist.helper.get_device_count_by_fork("gpu")
        if args.ngpus is None
        else args.ngpus
    )
    cfg.batch_size = cfg.batch_size if args.batch_size is None else args.batch_size

    dt_path = os.path.join(cfg.data_root, "person_detection_results", args.dt_file)
    dets = json.load(open(dt_path, "r"))

    gt_path = os.path.join(cfg.data_root, "annotations", "person_keypoints_val2017.json")
    eval_gt = COCO(gt_path)
    gt = eval_gt.dataset

    dets = [
        i for i in dets if (i["image_id"] in eval_gt.imgs and i["category_id"] == 1)
    ]
    ann_file = {"images": gt["images"], "annotations": dets}

    if args.end_epoch == -1:
        args.end_epoch = args.start_epoch

    for epoch_num in range(args.start_epoch, args.end_epoch + 1, args.test_freq):
        if args.model:
            model_file = args.model
        else:
            model_file = "{}/epoch_{}.pkl".format(args.model_dir, epoch_num)
        logger.info("Load Model : %s completed", model_file)

        all_results = list()
        result_queue = Queue(5000)

        # one server and one port shared by all evaluation workers
        master_ip = "localhost"
        port = dist.get_free_ports(1)[0]
        dist.Server(port)

        procs = []
        for i in range(args.ngpus):
            proc = Process(
                target=worker,
                args=(
                    args.arch,
                    model_file,
                    cfg.data_root,
                    ann_file,
                    master_ip,
                    port,
                    i,
                    args.ngpus,
                    result_queue,
                ),
            )
            proc.start()
            procs.append(proc)

        for _ in tqdm(range(len(dets))):
            all_results.append(result_queue.get())

        for p in procs:
            p.join()

        json_name = "log-of-{}_epoch_{}.json".format(args.arch, epoch_num)
        json_path = os.path.join(save_dir, json_name)
        all_results = json.dumps(all_results)
        with open(json_path, "w") as fo:
            fo.write(all_results)
        logger.info("Save to %s finished, start evaluation!", json_path)

        eval_dt = eval_gt.loadRes(json_path)
        cocoEval = COCOeval(eval_gt, eval_dt, iouType="keypoints")
        cocoEval.evaluate()
        cocoEval.accumulate()
        cocoEval.summarize()

        metrics = [
            "AP",
            "AP@0.5",
            "AP@0.75",
            "APm",
            "APl",
            "AR",
            "AR@0.5",
            "AR@0.75",
            "ARm",
            "ARl",
        ]
        logger.info("mmAP".center(32, "-"))
        for i, m in enumerate(metrics):
            logger.info("|\t%s\t|\t%.03f\t|", m, cocoEval.stats[i])
        logger.info("-" * 32)
def main():
    # pylint: disable=import-outside-toplevel,too-many-branches,too-many-statements
    from pycocotools.coco import COCO
    from pycocotools.cocoeval import COCOeval

    parser = make_parser()
    args = parser.parse_args()

    current_network = import_from_file(args.file)
    cfg = current_network.Cfg()

    # if args.weight_file:
    if not args.weight_dir:
        args.start_epoch = args.end_epoch = -1
    else:
        if args.start_epoch == -1:
            args.start_epoch = cfg.max_epoch - 1
        if args.end_epoch == -1:
            args.end_epoch = args.start_epoch
        assert 0 <= args.start_epoch <= args.end_epoch < cfg.max_epoch

    for epoch_num in range(args.start_epoch, args.end_epoch + 1):
        # if args.weight_file:
        #     weight_file = args.weight_file
        # else:
        #     weight_file = "log-of-{}/epoch_{}.pkl".format(
        #         os.path.basename(args.file).split(".")[0], epoch_num
        #     )
        if args.weight_dir:
            weight_dir = args.weight_dir
        else:
            weight_dir = "train_log/baseline"
        weight_file = os.path.join(weight_dir, "epoch_{}.pkl".format(epoch_num))

        if args.ngpus > 1:
            master_ip = "localhost"
            port = dist.get_free_ports(1)[0]
            dist.Server(port)

            result_list = []
            result_queue = Queue(2000)
            procs = []
            for i in range(args.ngpus):
                proc = Process(
                    target=worker,
                    args=(
                        current_network,
                        weight_file,
                        args.dataset_dir,
                        master_ip,
                        port,
                        args.ngpus,
                        i,
                        result_queue,
                    ),
                )
                proc.start()
                procs.append(proc)

            num_imgs = dict(coco=5000, cocomini=5000, objects365=30000)
            for _ in tqdm(range(num_imgs[cfg.test_dataset["name"]])):
                result_list.append(result_queue.get())

            for p in procs:
                p.join()
        else:
            result_list = []
            worker(
                current_network, weight_file, args.dataset_dir, None, None, 1, 0, result_list
            )

        total_time = sum([x["perf_time"] for x in result_list])
        average_time = total_time / len(result_list)
        fps = 1.0 / average_time
        logger.info(
            "average inference speed: {:.4}s / iter, fps:{:.3}".format(average_time, fps)
        )

        all_results = DetEvaluator.format(result_list, cfg)
        # json_path = "log-of-{}/epoch_{}.json".format(
        #     os.path.basename(args.file).split(".")[0], epoch_num
        # )
        json_path = os.path.join(weight_dir, "epoch_{}.json".format(epoch_num))
        all_results = json.dumps(all_results)

        with open(json_path, "w") as fo:
            fo.write(all_results)
        logger.info("Save to %s finished, start evaluation!", json_path)

        eval_gt = COCO(
            os.path.join(
                args.dataset_dir, cfg.test_dataset["name"], cfg.test_dataset["ann_file"]
            )
        )
        eval_dt = eval_gt.loadRes(json_path)
        cocoEval = COCOeval(eval_gt, eval_dt, iouType="bbox")
        cocoEval.evaluate()
        cocoEval.accumulate()
        cocoEval.summarize()
        metrics = [
            "AP",
            "AP@0.5",
            "AP@0.75",
            "APs",
            "APm",
            "APl",
            "AR@1",
            "AR@10",
            "AR@100",
            "ARs",
            "ARm",
            "ARl",
        ]
        logger.info("mmAP".center(32, "-"))
        for i, m in enumerate(metrics):
            logger.info("|\t%s\t|\t%.03f\t|", m, cocoEval.stats[i])
        logger.info("-" * 32)
def test_syncbn():
    nr_chan = 8
    data_shape = (3, nr_chan, 4, 16)
    momentum = 0.9
    eps = 1e-5
    running_mean = np.zeros((1, nr_chan, 1, 1), dtype=np.float32)
    running_var = np.ones((1, nr_chan, 1, 1), dtype=np.float32)
    steps = 4
    nr_ranks = 2
    server = dist.Server(0)
    port = server.py_server_port

    def worker(rank, data, yv_expect, running_mean, running_var):
        if mge.get_device_count("gpu") < nr_ranks:
            return
        dist.init_process_group("localhost", port, nr_ranks, rank, rank)
        bn = SyncBatchNorm(nr_chan, momentum=momentum, eps=eps)
        for i in range(steps):
            yv = bn(Tensor(data[i]))
        _assert_allclose(yv.numpy(), yv_expect)
        _assert_allclose(bn.running_mean.numpy(), running_mean)
        _assert_allclose(bn.running_var.numpy(), running_var)

    xv = []
    for i in range(steps):
        xv.append(np.random.normal(loc=2.3, size=data_shape).astype(np.float32))
        xv_transposed = np.transpose(xv[i], [0, 2, 3, 1]).reshape(
            (data_shape[0] * data_shape[2] * data_shape[3], nr_chan)
        )

        mean = np.mean(xv_transposed, axis=0).reshape(1, nr_chan, 1, 1)

        var_biased = np.var(xv_transposed, axis=0).reshape((1, nr_chan, 1, 1))
        sd = np.sqrt(var_biased + eps)

        var_unbiased = np.var(xv_transposed, axis=0, ddof=1).reshape((1, nr_chan, 1, 1))

        running_mean = running_mean * momentum + mean * (1 - momentum)
        running_var = running_var * momentum + var_unbiased * (1 - momentum)

        yv_expect = (xv[i] - mean) / sd

    data = []
    for i in range(nr_ranks):
        data.append([])
        for j in range(steps):
            data[i].append(xv[j][:, :, :, i * 8 : i * 8 + 8])

    procs = []
    for rank in range(nr_ranks):
        p = mp.Process(
            target=worker,
            args=(
                rank,
                data[rank],
                yv_expect[:, :, :, rank * 8 : rank * 8 + 8],
                running_mean,
                running_var,
            ),
        )
        p.start()
        procs.append(p)

    for p in procs:
        p.join(10)
        assert p.exitcode == 0
def main():
    parser = argparse.ArgumentParser(description="MegEngine ImageNet Training")
    parser.add_argument("-d", "--data", metavar="DIR", help="path to imagenet dataset")
    parser.add_argument(
        "-a",
        "--arch",
        default="resnet50",
        help="model architecture (default: resnet50)",
    )
    parser.add_argument(
        "-n",
        "--ngpus",
        default=None,
        type=int,
        help="number of GPUs per node (default: None, use all available GPUs)",
    )
    parser.add_argument(
        "--save",
        metavar="DIR",
        default="output",
        help="path to save checkpoint and log",
    )
    parser.add_argument(
        "--epochs",
        default=90,
        type=int,
        help="number of total epochs to run (default: 90)",
    )
    parser.add_argument(
        "-b",
        "--batch-size",
        metavar="SIZE",
        default=64,
        type=int,
        help="batch size for single GPU (default: 64)",
    )
    parser.add_argument(
        "--lr",
        "--learning-rate",
        metavar="LR",
        default=0.025,
        type=float,
        help="learning rate for single GPU (default: 0.025)",
    )
    parser.add_argument(
        "--momentum", default=0.9, type=float, help="momentum (default: 0.9)"
    )
    parser.add_argument(
        "--weight-decay", default=1e-4, type=float, help="weight decay (default: 1e-4)"
    )
    parser.add_argument("-j", "--workers", default=2, type=int)
    parser.add_argument(
        "-p",
        "--print-freq",
        default=20,
        type=int,
        metavar="N",
        help="print frequency (default: 20)",
    )
    parser.add_argument("--dist-addr", default="localhost")
    parser.add_argument("--dist-port", default=23456, type=int)
    parser.add_argument("--world-size", default=1, type=int)
    parser.add_argument("--rank", default=0, type=int)
    args = parser.parse_args()

    # create server if this is the master process
    if args.rank <= 0:
        server = dist.Server(port=args.dist_port)  # pylint: disable=unused-variable  # noqa: F841

    # get device count
    with multiprocessing.Pool(1) as pool:
        ngpus_per_node, _ = pool.map(megengine.get_device_count, ["gpu", "cpu"])
    if args.ngpus:
        ngpus_per_node = args.ngpus

    # launch processes
    procs = []
    for local_rank in range(ngpus_per_node):
        p = multiprocessing.Process(
            target=worker,
            kwargs=dict(
                rank=args.rank * ngpus_per_node + local_rank,
                world_size=args.world_size * ngpus_per_node,
                ngpus_per_node=ngpus_per_node,
                args=args,
            ),
        )
        p.start()
        procs.append(p)

    # join processes
    for p in procs:
        p.join()
def main():
    parser = argparse.ArgumentParser(description="MegEngine ImageNet Training")
    parser.add_argument("-d", "--data", metavar="DIR", help="path to imagenet dataset")
    parser.add_argument(
        "-a",
        "--arch",
        default="resnet50",
        help="model architecture (default: resnet50)",
    )
    parser.add_argument(
        "-n",
        "--ngpus",
        default=None,
        type=int,
        help="number of GPUs per node (default: None, use all available GPUs)",
    )
    parser.add_argument(
        "-m", "--model", metavar="PKL", default=None, help="path to model checkpoint"
    )
    parser.add_argument("-j", "--workers", default=2, type=int)
    parser.add_argument(
        "-p",
        "--print-freq",
        default=20,
        type=int,
        metavar="N",
        help="print frequency (default: 20)",
    )
    parser.add_argument("--dist-addr", default="localhost")
    parser.add_argument("--dist-port", default=23456, type=int)
    parser.add_argument("--world-size", default=1, type=int)
    parser.add_argument("--rank", default=0, type=int)
    args = parser.parse_args()

    # create server if this is the master process
    if args.rank <= 0:
        server = dist.Server(port=args.dist_port)  # pylint: disable=unused-variable  # noqa: F841

    # get device count
    with multiprocessing.Pool(1) as pool:
        ngpus_per_node, _ = pool.map(megengine.get_device_count, ["gpu", "cpu"])
    if args.ngpus:
        ngpus_per_node = args.ngpus

    # launch processes
    procs = []
    for local_rank in range(ngpus_per_node):
        p = multiprocessing.Process(
            target=worker,
            kwargs=dict(
                rank=args.rank * ngpus_per_node + local_rank,
                world_size=args.world_size * ngpus_per_node,
                ngpus_per_node=ngpus_per_node,
                args=args,
            ),
        )
        p.start()
        procs.append(p)

    # join processes
    for p in procs:
        p.join()