Example #1
    def init_horovod_if_needed(self):
        self._set_horovod_env()
        for _ in range(DEFAULT_MAX_ALLREDUCE_RETRY_NUM):
            rank_response = self._master_client.get_comm_rank()
            if rank_response.rank_id < 0:
                logger.warning("The master has not added the worker host into "
                               "rendezvous yet. Retrying to get rank")
                time.sleep(RETRY_ALLREDUCE_INTERVAL_SECS)
            else:
                break
        if rank_response.rank_id < 0:
            raise ValueError("Invalid rank {}".format(rank_response.rank_id))

        # If the rendezvous id from the master differs from
        # self._rendezvous_id, the worker should re-initialize Horovod
        # because the master has updated the communication group.
        if rank_response.rendezvous_id != self._rendezvous_id:
            logger.info(
                "Initialize Horovod with rank = {} and size = {}".format(
                    rank_response.rank_id, rank_response.world_size))
            os.environ[HorovodEnv.RENDEZVOUS_PORT] = str(
                rank_response.rendezvous_port)
            os.environ[HorovodEnv.RANK] = str(rank_response.rank_id)
            os.environ[HorovodEnv.SIZE] = str(rank_response.world_size)
            # Disable the Horovod elastic feature during init, then restore
            # it so that allreduce still calls the allreduce op when size=1.
            os.environ[HorovodEnv.ELASTIC] = str(0)
            hvd.shutdown()
            hvd.init()
            os.environ[HorovodEnv.ELASTIC] = str(1)
            self._rendezvous_id = rank_response.rendezvous_id
            self.need_broadcast = True
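
Example #1's retry loop can be isolated into a small, self-contained sketch. Everything below (MasterClientStub, the constants, wait_for_rank) is an illustrative stand-in, not part of the original project:

import time
from collections import namedtuple

# Hypothetical stand-ins for the project's constants and master client.
DEFAULT_MAX_ALLREDUCE_RETRY_NUM = 5
RETRY_ALLREDUCE_INTERVAL_SECS = 1

RankResponse = namedtuple(
    "RankResponse", ["rank_id", "world_size", "rendezvous_id", "rendezvous_port"])

class MasterClientStub:
    """Returns an invalid rank twice, then a valid one, to exercise the retry loop."""
    def __init__(self):
        self._calls = 0

    def get_comm_rank(self):
        self._calls += 1
        if self._calls < 3:
            return RankResponse(-1, 0, 0, 0)
        return RankResponse(0, 2, 1, 2222)

def wait_for_rank(master_client):
    # Poll the master until it assigns a rank, mirroring init_horovod_if_needed.
    for _ in range(DEFAULT_MAX_ALLREDUCE_RETRY_NUM):
        rank_response = master_client.get_comm_rank()
        if rank_response.rank_id >= 0:
            break
        time.sleep(RETRY_ALLREDUCE_INTERVAL_SECS)
    if rank_response.rank_id < 0:
        raise ValueError("Invalid rank {}".format(rank_response.rank_id))
    return rank_response

print(wait_for_rank(MasterClientStub()))
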
Example #2
 def _init_rendezvous_server(self):
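     # Promote the pending host list, rebuild the rendezvous server with a
     # new host allocation plan, and bump the rendezvous id so workers
     # detect that they must re-initialize Horovod.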
     logger.info("Initialize rendezvous server with hosts {}".format(
         self._next_rendezvous_hosts))
     self._cur_rendezvous_hosts = self._next_rendezvous_hosts
     self._next_rendezvous_hosts = None
     host_alloc_plan = self._get_host_plan()
     self._rendezvous_server.init(host_alloc_plan)
     self._rendezvous_id += 1
     self._cur_rendezvous_completed = False
Example #3
 def remove_worker(self, worker_host):
     with self._lock:
         logger.info(
             "Remove worker host {} from rendenzvous.".format(worker_host))
         if worker_host in self._cur_rendezvous_hosts:
             if self._next_rendezvous_hosts is None:
                 self._next_rendezvous_hosts = copy.deepcopy(
                     self._cur_rendezvous_hosts)
             self._next_rendezvous_hosts.pop(
                 self._next_rendezvous_hosts.index(worker_host))
Example #4
 def add_worker(self, worker_host):
     with self._lock:
         logger.info(
             "Add worker host {} into rendenzvous and cur hosts {}.".format(
                 worker_host, self._cur_rendezvous_hosts))
         if worker_host:
             if self._next_rendezvous_hosts is None:
                 self._next_rendezvous_hosts = copy.deepcopy(
                     self._cur_rendezvous_hosts)
             if worker_host not in self._next_rendezvous_hosts:
                 self._next_rendezvous_hosts.append(worker_host)
Example #5
 def add_worker(self, worker_host):
     with self._lock:
         logger.info(
             "Add worker host {} into rendenzvous and cur hosts {}.".format(
                 worker_host, self._cur_rendezvous_hosts))
         if worker_host:
             if self._next_rendezvous_hosts is None:
                 self._next_rendezvous_hosts = copy.deepcopy(
                     self._cur_rendezvous_hosts)
             # Master will not add any worker if the current rendezvous
             # hosts become empty after starting training.
             if self._rendezvous_id > 0 and not self._next_rendezvous_hosts:
                 return
             if worker_host not in self._next_rendezvous_hosts:
                 self._next_rendezvous_hosts.append(worker_host)
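
Examples #2 through #5 share one pattern: the first pending membership change copies the current host list into a staged list, and _init_rendezvous_server later promotes that staged list and bumps the rendezvous id. A minimal sketch of that pattern, using a hypothetical RendezvousHosts class rather than the original manager (no host allocation plan or rendezvous server):

import copy
import threading

class RendezvousHosts:
    """Hypothetical sketch of the master's copy-on-write host bookkeeping."""

    def __init__(self):
        self._lock = threading.Lock()
        self._cur_hosts = []
        self._next_hosts = None
        self._rendezvous_id = 0

    def add(self, host):
        with self._lock:
            if self._next_hosts is None:
                self._next_hosts = copy.deepcopy(self._cur_hosts)
            # As in Example #5: after training starts, stop adding workers
            # once the staged host list has drained to empty.
            if self._rendezvous_id > 0 and not self._next_hosts:
                return
            if host not in self._next_hosts:
                self._next_hosts.append(host)

    def remove(self, host):
        with self._lock:
            if host in self._cur_hosts:
                if self._next_hosts is None:
                    self._next_hosts = copy.deepcopy(self._cur_hosts)
                self._next_hosts.remove(host)

    def init_rendezvous(self):
        # Mirrors _init_rendezvous_server: promote the staged list and
        # bump the rendezvous id so workers re-initialize.
        with self._lock:
            if self._next_hosts is not None:
                self._cur_hosts = self._next_hosts
                self._next_hosts = None
            self._rendezvous_id += 1
            return list(self._cur_hosts)

hosts = RendezvousHosts()
hosts.add("worker-0")
hosts.add("worker-1")
print(hosts.init_rendezvous())  # ['worker-0', 'worker-1']
hosts.remove("worker-0")
print(hosts.init_rendezvous())  # ['worker-1']
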
Example #6
 def reset_backward_passes_per_step(self):
     # Only reset backward_passes_per_step when using the optimizer
     # with fixed_global_batch_size
     if (hasattr(self._optimizer, "fixed_global_batch_size")
             and self._optimizer.fixed_global_batch_size):
         world_size = hvd.size()
         rank = hvd.rank()
         self.backward_passes_per_step = (self.global_batch_num_per_step //
                                          world_size)
         if rank < self.global_batch_num_per_step % world_size:
             self.backward_passes_per_step += 1
         if (self.backward_passes_per_step !=
                 self._optimizer.backward_passes_per_step):
             self._optimizer.set_backward_passes_per_step(
                 self.backward_passes_per_step)
             logger.info("Backward passes per step = {}".format(
                 self._optimizer.backward_passes_per_step))
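
Example #6 splits global_batch_num_per_step backward passes across ranks, giving one extra pass to the first global_batch_num_per_step % world_size ranks. A small worked example of the same arithmetic, with illustrative numbers:

def backward_passes_for_rank(global_batch_num_per_step, world_size, rank):
    # Same arithmetic as reset_backward_passes_per_step in Example #6.
    passes = global_batch_num_per_step // world_size
    if rank < global_batch_num_per_step % world_size:
        passes += 1
    return passes

world_size = 3
global_batch_num_per_step = 8
per_rank = [backward_passes_for_rank(global_batch_num_per_step, world_size, r)
            for r in range(world_size)]
print(per_rank, sum(per_rank))  # [3, 3, 2] 8
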
Example #7
    def init_horovod_if_needed(self):
        self._set_horovod_env()
        for _ in range(DEFAULT_MAX_ALLREDUCE_RETRY_NUM):
            rank_response = self._master_client.get_comm_rank()
            if rank_response.rank_id < 0:
                logger.warning("The master has not added the worker host into "
                               "rendezvous yet. Retrying to get rank")
                time.sleep(RETRY_ALLREDUCE_INTERVAL_SECS)
            else:
                break
        if rank_response.rank_id < 0:
            raise ValueError("Invalid rank {}".format(rank_response.rank_id))

        # If the rendezvous id from the master differs from
        # self._rendezvous_id, the worker should re-initialize Horovod
        # because the master has updated the communication group.
        if rank_response.rendezvous_id != self._rendezvous_id:
            logger.info(
                "Initialize Horovod with rank = {} and size = {}".format(
                    rank_response.rank_id, rank_response.world_size))
            self._restart_hvd(rank_response)
Example #8
 def _broadcast_if_needed(self):
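     # need_broadcast is set when Horovod is re-initialized (see Example #1);
     # a single broadcast then restores consistent model state across workers.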
     if self._rendezvous_manager.need_broadcast:
         logger.info("Broadcast models")
         self.broadcast()
         self._rendezvous_manager.need_broadcast = False