def after_train_iter(self, runner):
    if not self.every_n_iters(runner, self.interval):
        return
    self.logger.info("start to eval for iter: {}".format(runner.iter + 1))
    save_path = os.path.join(self.save_path, "iter_{}".format(runner.iter + 1))
    mkdir_or_exist(save_path)
    results = []  # list of dict
    if self.multi_process:
        assert is_distributed(), "multi-process eval requires multi-process training"
        raise NotImplementedError("multi-process eval is not supported yet")
    elif self.local_rank == 0:  # hand all evaluation over to rank 0
        for data in self.dataloader:
            outputs = runner.model.test_step(
                data,
                save_image=self.save_image,
                save_path=save_path,
                ensemble=self.ensemble,
            )
            result = runner.model.cal_for_eval(outputs, data)
            assert isinstance(result, list)
            results += result
        self.evaluate(results, runner.iter + 1)
    else:
        pass  # non-zero ranks skip evaluation
    if is_distributed():
        dist.group_barrier()
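# A hypothetical usage sketch (not from the original repo), assuming an
# mmcv-style runner. The hook class name and constructor arguments below are
# illustrative assumptions, not the actual API of this codebase:
#
#     runner.register_hook(
#         EvalIterHook(dataloader=val_loader, interval=5000,
#                      save_path=work_dir, save_image=True))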
def worker(rank, q):
    dist.init_process_group("localhost", port, world_size, rank, rank)
    dist.group_barrier()
    if rank == 0:
        func(0, q)  # q.put(0)
        q.put(2)
    else:
        _assert_q_val(q, 0)  # func executed in rank 0
        _assert_q_empty(q)  # q.put(2) is not executed
        func(1, q)
        _assert_q_val(q, 1)  # func in rank 1 executed earlier than q.put(2) in rank 0
        _assert_q_val(q, 2)  # q.put(2) executed in rank 0
def worker(rank, q):
    if not mge.is_cuda_available():
        return
    _init_process_group_wrapper(world_size, rank, rank, backend, q)
    dist.group_barrier()
    if rank == 0:
        func(0, q)  # q.put(0)
        q.put(2)
    else:
        _assert_q_val(q, 0)  # func executed in rank 0
        _assert_q_empty(q)  # q.put(2) is not executed
        func(1, q)
        _assert_q_val(q, 1)  # func in rank 1 executed earlier than q.put(2) in rank 0
        _assert_q_val(q, 2)  # q.put(2) executed in rank 0
def worker(rank, q):
    dist.init_process_group("localhost", port, world_size, rank, rank)
    dist.group_barrier()
    if rank == 0:
        dist.group_barrier()
        q.put(0)  # to be observed in rank 1
    else:
        _assert_q_empty(q)  # q.put(0) is not executed in rank 0
        dist.group_barrier()
        _assert_q_val(q, 0)  # q.put(0) executed in rank 0
def worker(rank, q):
    if not mge.is_cuda_available():
        return
    _init_process_group_wrapper(world_size, rank, rank, backend, q)
    dist.group_barrier()
    if rank == 0:
        dist.group_barrier()
        q.put(0)  # to be observed in rank 1
    else:
        _assert_q_empty(q)  # q.put(0) is not executed in rank 0
        dist.group_barrier()
        _assert_q_val(q, 0)  # q.put(0) executed in rank 0
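# A minimal driver sketch (not part of the original tests) showing how workers
# like the ones above are typically launched: one subprocess per rank, sharing
# a multiprocessing.Queue. `world_size=2` matches the two-rank logic in the
# workers; the 60-second timeout is an illustrative assumption.
import multiprocessing as mp

def run_barrier_test(worker, world_size=2):
    q = mp.Queue()
    procs = [mp.Process(target=worker, args=(rank, q)) for rank in range(world_size)]
    for p in procs:
        p.start()
    for p in procs:
        # a deadlocked barrier should fail the test instead of hanging it
        p.join(timeout=60)
        assert p.exitcode == 0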