def test_broadcast():
    world_size = 2
    port = dist.get_free_ports(1)[0]
    server = dist.Server(port)

    def worker(rank, data, expect, port):
        if mge.get_device_count("gpu") < world_size:
            return
        dist.init_process_group("localhost", port, world_size, rank, rank)
        inp = tensor(data)
        output = broadcast(inp)
        assert np.allclose(output.numpy(), expect)

    def check(shape):
        x = np.random.rand(*shape).astype("float32")
        # rank 1 starts from different data (y); broadcast must overwrite it
        # with rank 0's tensor, so both ranks expect x
        y = x + 1
        p0 = mp.Process(target=worker, args=(0, x, x, port))
        p1 = mp.Process(target=worker, args=(1, y, x, port))
        p0.start()
        p1.start()
        p0.join(10)
        p1.join(10)
        assert p0.exitcode == 0 and p1.exitcode == 0

    for shape in [(2, 3), (8, 10), (99, 77)]:
        check(shape)

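# The snippets in this section rely on imports that are not shown. The
# prologue below is a plausible reconstruction, assuming a MegEngine 1.x
# module layout; exact paths may differ between versions, and test-specific
# helpers (Grad, observers, network definitions) are omitted.
import multiprocessing as mp

import numpy as np

import megengine as mge
import megengine.distributed as dist
from megengine import Tensor, tensor
from megengine.functional.distributed import broadcast, remote_recv, remote_send
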
def test_io_remote():
    world_size = 2
    port = dist.get_free_ports(1)[0]
    server = dist.Server(port)
    val = np.random.rand(4, 5).astype(np.float32)

    def worker(rank):
        if mge.get_device_count("gpu") < world_size:
            return
        if rank == 0:  # remote send
            dist.init_process_group("localhost", port, world_size, rank, rank)
            x = Tensor(val, device="gpu0")
            y = remote_send(x, 1)
            assert y.numpy()[0] == 0
        else:  # remote recv
            dist.init_process_group("localhost", port, world_size, rank, rank)
            y = remote_recv(0, val.shape, val.dtype)
            assert y.device == "gpu1"
            np.testing.assert_almost_equal(val, y.numpy())

    procs = []
    for rank in range(world_size):
        p = mp.Process(target=worker, args=(rank,))
        p.start()
        procs.append(p)

    for p in procs:
        p.join(10)
        assert p.exitcode == 0

def main():
    parser = make_parser()
    args = parser.parse_args()

    # ------------------------ begin training -------------------------- #
    logger.info("Device Count = %d", args.ngpus)

    log_dir = "log-of-{}".format(os.path.basename(args.file).split(".")[0])
    if not os.path.isdir(log_dir):
        os.makedirs(log_dir)

    if args.ngpus > 1:
        master_ip = "localhost"
        port = dist.get_free_ports(1)[0]
        dist.Server(port)

        processes = list()
        for rank in range(args.ngpus):
            process = mp.Process(
                target=worker, args=(master_ip, port, args.ngpus, rank, args)
            )
            process.start()
            processes.append(process)

        for p in processes:
            p.join()
    else:
        worker(None, None, 1, 0, args)

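# The launchers in this section all dispatch to a `worker` callable defined
# elsewhere. The sketch below only illustrates the signature implied by the
# call sites above; the body is an assumption, not an actual implementation.
# Note that argument order varies between launchers: most pass
# (master_ip, port, world_size, rank, args), but one passes rank before
# world_size.
def worker(master_ip, port, world_size, rank, args):
    if world_size > 1:
        # join the process group served by dist.Server(port) in the parent
        dist.init_process_group(master_ip, port, world_size, rank, rank)
    ...  # build the model and data pipeline, then run the train/test loop
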
def run_test(
    model_path, use_jit, use_symbolic, sublinear_memory_config=None, max_err=None,
):
    """
    Load the model with test cases and run the training for one iter.
    The loss and updated weights are compared with reference values to verify
    the correctness.

    Dump a new file with updated results by calling update_model if you think
    the test fails due to numerical rounding errors instead of bugs.
    Please think twice before you do so.
    """
    checkpoint = mge.load(model_path)
    data = checkpoint["data"]
    label = checkpoint["label"]
    port = dist.get_free_ports(1)[0]
    server = dist.Server(port)

    def worker(rank, max_err):
        # p_num (the number of worker processes) is assumed to be a
        # module-level constant defined alongside this function
        dist.init_process_group("localhost", port, p_num, rank, rank)
        net = MnistNet(has_bn=True)
        net.load_state_dict(checkpoint["net_init"])
        lr = checkpoint["sgd_lr"]
        opt = SGD(net.parameters(), lr=lr)

        gm = ad.GradManager().attach(
            net.parameters(), callbacks=[dist.make_allreduce_cb("MEAN", dist.WORLD)]
        )

        # use the same data and label on every gpu
        # so that the result does not depend on the number of gpus
        data_train = Tensor(data)
        label_train = Tensor(label)

        loss = train(data_train, label_train, net, opt, gm)
        np.testing.assert_allclose(loss.numpy(), checkpoint["loss"], atol=max_err)

        # only rank 0 checks the updated weights against the reference
        if dist.get_rank():
            return
        for param, param_ref in zip(
            net.state_dict().items(), checkpoint["net_updated"].items()
        ):
            assert param[0] == param_ref[0]
            if "bn" in param[0]:
                ref = param_ref[1].reshape(param[1].shape)
                np.testing.assert_allclose(param[1], ref, atol=max_err)
            else:
                np.testing.assert_allclose(param[1], param_ref[1], atol=max_err)

    procs = []
    for rank in range(p_num):
        p = mp.Process(target=worker, args=(rank, max_err))
        p.start()
        procs.append(p)

    for p in procs:
        p.join(20)
        assert p.exitcode == 0

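# The `train` helper called above is not shown. Below is a minimal
# one-iteration sketch under the GradManager/SGD API these tests use; the loss
# function path and the exact body are assumptions, not the actual helper.
import megengine.functional as F


def train(data, label, net, opt, gm):
    with gm:  # record the forward pass so gradients can be computed
        pred = net(data)
        # assumed loss; the module path may vary across MegEngine versions
        loss = F.loss.cross_entropy(pred, label)
        gm.backward(loss)  # triggers the allreduce callbacks attached above
    opt.step().clear_grad()
    return loss
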
def test_group_barrier():
    world_size = 2
    port = dist.get_free_ports(1)[0]
    server = dist.Server(port)

    def worker(rank, q):
        dist.init_process_group("localhost", port, world_size, rank, rank)
        dist.group_barrier()
        if rank == 0:
            dist.group_barrier()
            q.put(0)  # to be observed in rank 1
        else:
            _assert_q_empty(q)  # q.put(0) is not executed in rank 0
            dist.group_barrier()
            _assert_q_val(q, 0)  # q.put(0) executed in rank 0

    Q = mp.Queue()
    procs = []
    for rank in range(world_size):
        p = mp.Process(target=worker, args=(rank, Q))
        p.start()
        procs.append(p)

    for p in procs:
        p.join(20)
        assert p.exitcode == 0

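# _assert_q_empty and _assert_q_val are used by the barrier and synchronized
# tests but are not defined in this section. The sketch below is a minimal
# reconstruction consistent with how they are called; the timeout value is an
# assumption.
import queue


def _assert_q_empty(q):
    # the queue must stay empty for the observation window
    try:
        q.get(timeout=1)
    except queue.Empty:
        return
    assert False, "queue is not empty"


def _assert_q_val(q, val):
    # the next item in the queue must equal the expected value
    ret = q.get()
    assert ret == val
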
def test_init_process_group():
    world_size = 2
    port = dist.get_free_ports(1)[0]
    server = dist.Server(port)

    def worker(rank, backend):
        dist.init_process_group("localhost", port, world_size, rank, rank, backend)
        assert dist.is_distributed()
        assert dist.get_rank() == rank
        assert dist.get_world_size() == world_size
        assert dist.get_backend() == backend

        py_server_addr = dist.get_py_server_addr()
        assert py_server_addr[0] == "localhost"
        assert py_server_addr[1] == port

        mm_server_addr = dist.get_mm_server_addr()
        assert mm_server_addr[0] == "localhost"
        assert mm_server_addr[1] > 0

        assert isinstance(dist.get_client(), dist.Client)

    def check(backend):
        procs = []
        for rank in range(world_size):
            p = mp.Process(target=worker, args=(rank, backend))
            p.start()
            procs.append(p)

        for p in procs:
            p.join(20)
            assert p.exitcode == 0

    check("nccl")

def test_synchronized():
    world_size = 2
    port = dist.get_free_ports(1)[0]
    server = dist.Server(port)

    @dist.synchronized
    def func(rank, q):
        q.put(rank)

    def worker(rank, q):
        dist.init_process_group("localhost", port, world_size, rank, rank)
        dist.group_barrier()
        if rank == 0:
            func(0, q)  # q.put(0)
            q.put(2)
        else:
            _assert_q_val(q, 0)  # func executed in rank 0
            _assert_q_empty(q)  # q.put(2) is not executed
            func(1, q)
            _assert_q_val(q, 1)  # func in rank 1 executed earlier than q.put(2) in rank 0
            _assert_q_val(q, 2)  # q.put(2) executed in rank 0

    Q = mp.Queue()
    procs = []
    for rank in range(world_size):
        p = mp.Process(target=worker, args=(rank, Q))
        p.start()
        procs.append(p)

    for p in procs:
        p.join(20)
        assert p.exitcode == 0

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-a",
        "--arch",
        default="simplebaseline_res50",
        type=str,
        choices=cfg.model_choices,
    )
    parser.add_argument("-s", "--save", default="/data/models", type=str)
    parser.add_argument("-b", "--batch_size", default=32, type=int)
    parser.add_argument("-lr", "--initial_lr", default=3e-4, type=float)
    parser.add_argument("--resume", default=None, type=str)
    parser.add_argument("--multi_scale_supervision", action="store_true")
    parser.add_argument("-n", "--ngpus", default=8, type=int)
    parser.add_argument("-w", "--workers", default=8, type=int)
    args = parser.parse_args()

    model_name = "{}_{}x{}".format(args.arch, cfg.input_shape[0], cfg.input_shape[1])
    save_dir = os.path.join(args.save, model_name)
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    mge.set_log_file(os.path.join(save_dir, "log.txt"))

    if args.batch_size != cfg.batch_size:
        cfg.batch_size = args.batch_size
    if args.initial_lr != cfg.initial_lr:
        cfg.initial_lr = args.initial_lr

    world_size = mge.get_device_count("gpu") if args.ngpus is None else args.ngpus

    if world_size > 1:
        master_ip = "localhost"
        port = dist.get_free_ports(1)[0]
        dist.Server(port)

        # scale weight decay by the number of gpus
        cfg.weight_decay *= world_size

        # start distributed training, dispatch sub-processes
        processes = []
        for rank in range(world_size):
            p = mp.Process(
                target=worker, args=(master_ip, port, rank, world_size, args)
            )
            p.start()
            processes.append(p)

        for p in processes:
            p.join()
    else:
        worker(None, None, 0, 1, args)

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-f", "--file", default="net.py", type=str, help="net description file"
    )
    parser.add_argument(
        "-n", "--ngpus", type=int, default=8, help="number of gpus for training"
    )
    parser.add_argument(
        "-d", "--dataset_dir", type=str, default="/data/datasets",
    )
    parser.add_argument(
        "-r", "--resume", type=str, default=None, help="resume model file"
    )
    args = parser.parse_args()

    # ------------------------ begin training -------------------------- #
    logger.info("Device Count = %d", args.ngpus)

    log_dir = "log-of-{}".format(os.path.basename(args.file).split(".")[0])
    if not os.path.isdir(log_dir):
        os.makedirs(log_dir)

    if args.ngpus > 1:
        master_ip = "localhost"
        port = dist.get_free_ports(1)[0]
        dist.Server(port)

        processes = list()
        for rank in range(args.ngpus):
            process = mp.Process(
                target=worker, args=(master_ip, port, args.ngpus, rank, args)
            )
            process.start()
            processes.append(process)

        for p in processes:
            p.join()
    else:
        worker(None, None, 1, 0, args)

def test_dist_grad():
    import multiprocessing as mp

    world_size = 2
    x_np = np.random.rand(10).astype("float32")
    port = dist.get_free_ports(1)[0]
    server = dist.Server(port)

    def worker0():
        dist.init_process_group("localhost", port, world_size, 0, 0)
        mge.device.set_default_device("gpu0")
        grad = Grad()

        x = as_tensor(x_np)
        grad.wrt(x, callback=save_to(x))
        # need a placeholder to trace operator
        send_x = remote_send(x, 1)
        recv_x = remote_recv(1, x_np.shape, x_np.dtype, "gpu0")
        y = recv_x * recv_x

        grad([y], [as_tensor(np.ones_like(x_np))])
        # the round trip computes y = x * x, so dy/dx = 2x
        np.testing.assert_almost_equal(x.grad.numpy(), x.numpy() * 2)

    def worker1():
        dist.init_process_group("localhost", port, world_size, 1, 1)
        mge.device.set_default_device("gpu1")
        grad = Grad()

        recv_x = remote_recv(0, x_np.shape, x_np.dtype, "gpu1")
        send_x = remote_send(recv_x, 0)

        grad([], [])

        # sync because grad has a send operator
        sync()
        send_x.device._cn._sync_all()

    p0 = mp.Process(target=worker0)
    p1 = mp.Process(target=worker1)
    p0.start()
    p1.start()
    p0.join(10)
    p1.join(10)
    assert p0.exitcode == 0 and p1.exitcode == 0

def test_user_set_get():
    world_size = 2
    port = dist.get_free_ports(1)[0]
    server = dist.Server(port)

    def worker(rank):
        dist.init_process_group("localhost", port, world_size, rank, rank)
        # both ranks set and get the same key concurrently
        dist.get_client().user_set("foo", 1)
        ret = dist.get_client().user_get("foo")
        assert ret == 1

    procs = []
    for rank in range(world_size):
        p = mp.Process(target=worker, args=(rank,))
        p.start()
        procs.append(p)

    for p in procs:
        p.join(20)
        assert p.exitcode == 0

def main(args):
    configs = load_config_from_path(args.config_file)

    num_devices = dist.helper.get_device_count_by_fork("gpu")
    if num_devices > 1:
        # distributed training
        master_ip = "localhost"
        port = dist.get_free_ports(1)[0]
        dist.Server(port)

        processes = []
        for rank in range(num_devices):
            process = mp.Process(
                target=worker,
                args=(master_ip, port, num_devices, rank, configs),
            )
            process.start()
            processes.append(process)

        for p in processes:
            p.join()
    else:
        # non-distributed training
        worker(None, None, 1, 0, configs)

def test_sync_min_max_observer():
    x = np.random.rand(6, 3, 3, 3).astype("float32")
    np_min, np_max = x.min(), x.max()
    world_size = 2
    port = dist.get_free_ports(1)[0]
    server = dist.Server(port)

    def worker(rank, slc):
        dist.init_process_group("localhost", port, world_size, rank, rank)
        m = ob.SyncMinMaxObserver()
        # each rank only sees its own slice, but the synced observer
        # must report the global min/max over both slices
        y = mge.tensor(x[slc])
        m(y)
        assert m.min_val == np_min and m.max_val == np_max

    procs = []
    for rank in range(world_size):
        slc = slice(rank * 3, (rank + 1) * 3)
        p = mp.Process(target=worker, args=(rank, slc), daemon=True)
        p.start()
        procs.append(p)

    for p in procs:
        p.join(20)
        assert p.exitcode == 0

def test_new_group():
    world_size = 3
    ranks = [2, 0]
    port = dist.get_free_ports(1)[0]
    server = dist.Server(port)

    def worker(rank):
        dist.init_process_group("localhost", port, world_size, rank, rank)
        if rank in ranks:
            group = dist.new_group(ranks)
            assert group.size == 2
            assert group.key == "2,0"
            assert group.rank == ranks.index(rank)
            assert group.comp_node == "gpu{}:2".format(rank)

    procs = []
    for rank in range(world_size):
        p = mp.Process(target=worker, args=(rank,))
        p.start()
        procs.append(p)

    for p in procs:
        p.join(20)
        assert p.exitcode == 0

def main():
    from pycocotools.coco import COCO
    from pycocotools.cocoeval import COCOeval

    parser = make_parser()
    args = parser.parse_args()

    model_name = "{}_{}x{}".format(args.arch, cfg.input_shape[0], cfg.input_shape[1])
    save_dir = os.path.join(args.save_dir, model_name)
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    mge.set_log_file(os.path.join(save_dir, "log.txt"))

    args.ngpus = (
        dist.helper.get_device_count_by_fork("gpu")
        if args.ngpus is None
        else args.ngpus
    )
    cfg.batch_size = cfg.batch_size if args.batch_size is None else args.batch_size

    dt_path = os.path.join(cfg.data_root, "person_detection_results", args.dt_file)
    dets = json.load(open(dt_path, "r"))

    gt_path = os.path.join(
        cfg.data_root, "annotations", "person_keypoints_val2017.json"
    )
    eval_gt = COCO(gt_path)
    gt = eval_gt.dataset

    dets = [
        i for i in dets if (i["image_id"] in eval_gt.imgs and i["category_id"] == 1)
    ]
    ann_file = {"images": gt["images"], "annotations": dets}

    if args.end_epoch == -1:
        args.end_epoch = args.start_epoch

    for epoch_num in range(args.start_epoch, args.end_epoch + 1, args.test_freq):
        if args.model:
            model_file = args.model
        else:
            model_file = "{}/epoch_{}.pkl".format(args.model_dir, epoch_num)
        logger.info("Load Model: %s completed", model_file)

        all_results = list()
        result_queue = Queue(5000)
        procs = []
        # one server shared by all ranks; creating a server (and a fresh port)
        # per rank would leave the workers unable to form a single group
        master_ip = "localhost"
        port = dist.get_free_ports(1)[0]
        dist.Server(port)
        for i in range(args.ngpus):
            proc = Process(
                target=worker,
                args=(
                    args.arch,
                    model_file,
                    cfg.data_root,
                    ann_file,
                    master_ip,
                    port,
                    i,
                    args.ngpus,
                    result_queue,
                ),
            )
            proc.start()
            procs.append(proc)

        for _ in tqdm(range(len(dets))):
            all_results.append(result_queue.get())
        for p in procs:
            p.join()

        json_name = "log-of-{}_epoch_{}.json".format(args.arch, epoch_num)
        json_path = os.path.join(save_dir, json_name)
        all_results = json.dumps(all_results)
        with open(json_path, "w") as fo:
            fo.write(all_results)
        logger.info("Save to %s finished, start evaluation!", json_path)

        eval_dt = eval_gt.loadRes(json_path)
        cocoEval = COCOeval(eval_gt, eval_dt, iouType="keypoints")
        cocoEval.evaluate()
        cocoEval.accumulate()
        cocoEval.summarize()
        metrics = [
            "AP",
            "AP@0.5",
            "AP@0.75",
            "APm",
            "APl",
            "AR",
            "AR@0.5",
            "AR@0.75",
            "ARm",
            "ARl",
        ]
        logger.info("mmAP".center(32, "-"))
        for i, m in enumerate(metrics):
            logger.info("|\t%s\t|\t%.03f\t|", m, cocoEval.stats[i])
        logger.info("-" * 32)

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-f", "--file", default="net.py", type=str, help="net description file"
    )
    parser.add_argument(
        "-w", "--weight_file", default=None, type=str, help="weights file",
    )
    parser.add_argument(
        "-n", "--ngpus", default=1, type=int, help="total number of gpus for testing",
    )
    parser.add_argument(
        "-d", "--dataset_dir", type=str, default="/data/datasets",
    )
    args = parser.parse_args()

    current_network = import_from_file(args.file)
    cfg = current_network.Cfg()

    if args.ngpus > 1:
        master_ip = "localhost"
        port = dist.get_free_ports(1)[0]
        dist.Server(port)

        result_list = []
        result_queue = Queue(500)
        procs = []
        for i in range(args.ngpus):
            proc = Process(
                target=worker,
                args=(
                    current_network,
                    args.weight_file,
                    args.dataset_dir,
                    master_ip,
                    port,
                    args.ngpus,
                    i,
                    result_queue,
                ),
            )
            proc.start()
            procs.append(proc)

        num_imgs = dict(VOC2012=1449, Cityscapes=500)
        for _ in tqdm(range(num_imgs[cfg.dataset])):
            result_list.append(result_queue.get())
        for p in procs:
            p.join()
    else:
        result_list = []
        worker(
            current_network,
            args.weight_file,
            args.dataset_dir,
            None,
            None,
            1,
            0,
            result_list,
        )

    if cfg.val_save_path is not None:
        save_results(result_list, cfg.val_save_path, cfg)
    logger.info("Start evaluation!")
    compute_metric(result_list, cfg)

def main():
    # pylint: disable=import-outside-toplevel,too-many-branches,too-many-statements
    from pycocotools.coco import COCO
    from pycocotools.cocoeval import COCOeval

    parser = make_parser()
    args = parser.parse_args()

    current_network = import_from_file(args.file)
    cfg = current_network.Cfg()

    if not args.weight_dir:
        args.start_epoch = args.end_epoch = -1
    else:
        if args.start_epoch == -1:
            args.start_epoch = cfg.max_epoch - 1
        if args.end_epoch == -1:
            args.end_epoch = args.start_epoch
        assert 0 <= args.start_epoch <= args.end_epoch < cfg.max_epoch

    for epoch_num in range(args.start_epoch, args.end_epoch + 1):
        if args.weight_dir:
            weight_dir = args.weight_dir
        else:
            weight_dir = "train_log/baseline"
        weight_file = os.path.join(weight_dir, "epoch_{}.pkl".format(epoch_num))

        if args.ngpus > 1:
            master_ip = "localhost"
            port = dist.get_free_ports(1)[0]
            dist.Server(port)

            result_list = []
            result_queue = Queue(2000)
            procs = []
            for i in range(args.ngpus):
                proc = Process(
                    target=worker,
                    args=(
                        current_network,
                        weight_file,
                        args.dataset_dir,
                        master_ip,
                        port,
                        args.ngpus,
                        i,
                        result_queue,
                    ),
                )
                proc.start()
                procs.append(proc)

            num_imgs = dict(coco=5000, cocomini=5000, objects365=30000)
            for _ in tqdm(range(num_imgs[cfg.test_dataset["name"]])):
                result_list.append(result_queue.get())
            for p in procs:
                p.join()
        else:
            result_list = []
            worker(
                current_network,
                weight_file,
                args.dataset_dir,
                None,
                None,
                1,
                0,
                result_list,
            )

        total_time = sum([x["perf_time"] for x in result_list])
        average_time = total_time / len(result_list)
        fps = 1.0 / average_time
        logger.info(
            "average inference speed: {:.4}s / iter, fps: {:.3}".format(
                average_time, fps
            )
        )

        all_results = DetEvaluator.format(result_list, cfg)
        json_path = os.path.join(weight_dir, "epoch_{}.json".format(epoch_num))
        all_results = json.dumps(all_results)
        with open(json_path, "w") as fo:
            fo.write(all_results)
        logger.info("Save to %s finished, start evaluation!", json_path)

        eval_gt = COCO(
            os.path.join(
                args.dataset_dir, cfg.test_dataset["name"], cfg.test_dataset["ann_file"]
            )
        )
        eval_dt = eval_gt.loadRes(json_path)
        cocoEval = COCOeval(eval_gt, eval_dt, iouType="bbox")
        cocoEval.evaluate()
        cocoEval.accumulate()
        cocoEval.summarize()
        metrics = [
            "AP",
            "AP@0.5",
            "AP@0.75",
            "APs",
            "APm",
            "APl",
            "AR@1",
            "AR@10",
            "AR@100",
            "ARs",
            "ARm",
            "ARl",
        ]
        logger.info("mmAP".center(32, "-"))
        for i, m in enumerate(metrics):
            logger.info("|\t%s\t|\t%.03f\t|", m, cocoEval.stats[i])
        logger.info("-" * 32)