Example No. 1
    def init_horovod_if_needed(self):
        self._set_horovod_env()
        for _ in range(DEFAULT_MAX_ALLREDUCE_RETRY_NUM):
            rank_response = self._master_client.get_comm_rank()
            if rank_response.rank_id < 0:
                logger.warning("The master has not added the worker host into "
                               "rendezvous yet. Retrying to get rank")
                time.sleep(RETRY_ALLREDUCE_INTERVAL_SECS)
            else:
                break
        if rank_response.rank_id < 0:
            raise ValueError("Invalid rank {}".format(rank_response.rank_id))

        # If the rendezvous id from the master differs from
        # self._rendezvous_id, the worker should rebuild the communication
        # because the master has updated the communication group.
        if rank_response.rendezvous_id != self._rendezvous_id:
            logger.info(
                "Initialize Horovod with rank = {} and size = {}".format(
                    rank_response.rank_id, rank_response.world_size))
            os.environ[HorovodEnv.RENDEZVOUS_PORT] = str(
                rank_response.rendezvous_port)
            os.environ[HorovodEnv.RANK] = str(rank_response.rank_id)
            os.environ[HorovodEnv.SIZE] = str(rank_response.world_size)
            # Disable the Horovod elastic feature during init so that
            # allreduce still runs the real allreduce op when size = 1;
            # it is re-enabled right after init.
            os.environ[HorovodEnv.ELASTIC] = str(0)
            hvd.shutdown()
            hvd.init()
            os.environ[HorovodEnv.ELASTIC] = str(1)
            self._rendezvous_id = rank_response.rendezvous_id
            self.need_broadcast = True
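The snippet above relies on constants and a master client supplied by the surrounding project. A minimal, self-contained sketch of the same poll-until-ranked pattern, with hypothetical stand-ins for those pieces, could look like this:

import logging
import time

logger = logging.getLogger(__name__)

# Hypothetical stand-ins for the project's constants.
MAX_RANK_RETRIES = 5
RETRY_INTERVAL_SECS = 20


def wait_for_rank(master_client):
    """Poll the master until this worker gets a non-negative rank."""
    for _ in range(MAX_RANK_RETRIES):
        rank_response = master_client.get_comm_rank()
        if rank_response.rank_id >= 0:
            return rank_response
        logger.warning("The master has not added the worker host into "
                       "rendezvous yet. Retrying to get rank")
        time.sleep(RETRY_INTERVAL_SECS)
    raise RuntimeError("The master never assigned a valid rank")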
Example No. 2
    def train_one_batch_with_retries(self, func, *args, **kwargs):
        # result stays None if every retry fails.
        result = None
        for _ in range(DEFAULT_MAX_ALLREDUCE_RETRY_NUM):
            try:
                self._broadcast_if_needed()
                result = func(*args, **kwargs)
                break
            except UnknownError as e:
                logger.warning("Failed to perform allreduce operation on "
                               "the gradients. Retrying...")
                # These error messages show that the communication used to
                # merge gradients failed, so we rebuild the communication
                # group and retry.
                if ("HorovodAllreduce" in e.message
                        or "HorovodAllgather" in e.message
                        or "HorovodBroadcast" in e.message):
                    time.sleep(3)
                    self._rendezvous_manager.init_horovod_if_needed()
        return result
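A hypothetical call site for this wrapper; trainer, training_step, features, and labels are illustrative names rather than part of the snippet:

# Run one training step; a transient allreduce failure triggers a
# rendezvous rebuild before the next attempt.
loss = trainer.train_one_batch_with_retries(training_step, features, labels)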
Example No. 3
    def init_horovod_if_needed(self):
        self._set_horovod_env()
        for _ in range(DEFAULT_MAX_ALLREDUCE_RETRY_NUM):
            rank_response = self._master_client.get_comm_rank()
            if rank_response.rank_id < 0:
                logger.warning("The master has not added the worker host into "
                               "rendezvous yet. Retrying to get rank")
                time.sleep(RETRY_ALLREDUCE_INTERVAL_SECS)
            else:
                break
        if rank_response.rank_id < 0:
            raise ValueError("Invalid rank {}".format(rank_response.rank_id))

        # If the rendezvous id from the master differs from
        # self._rendezvous_id, the worker should rebuild the communication
        # because the master has updated the communication group.
        if rank_response.rendezvous_id != self._rendezvous_id:
            logger.info(
                "Initialize Horovod with rank = {} and size = {}".format(
                    rank_response.rank_id, rank_response.world_size))
            self._restart_hvd(rank_response)
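The helper _restart_hvd is not shown in this snippet. A sketch reconstructed from the inline restart logic in Example No. 1 (an assumption, not the project's verbatim code) might be:

    def _restart_hvd(self, rank_response):
        # Point Horovod at the rendezvous advertised by the master.
        os.environ[HorovodEnv.RENDEZVOUS_PORT] = str(
            rank_response.rendezvous_port)
        os.environ[HorovodEnv.RANK] = str(rank_response.rank_id)
        os.environ[HorovodEnv.SIZE] = str(rank_response.world_size)
        # Disable the elastic feature during init so that allreduce still
        # runs the real op when size = 1, then re-enable it.
        os.environ[HorovodEnv.ELASTIC] = str(0)
        hvd.shutdown()
        hvd.init()
        os.environ[HorovodEnv.ELASTIC] = str(1)
        self._rendezvous_id = rank_response.rendezvous_id
        self.need_broadcast = True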
Example No. 4
    def train_one_batch_with_retries(self, func, *args, **kwargs):
        self.reset_backward_passes_per_step()
        allreduce_success = False
        for _ in range(DEFAULT_MAX_ALLREDUCE_RETRY_NUM):
            try:
                self._broadcast_if_needed()
                result = func(*args, **kwargs)
                allreduce_success = True
                break
            except HorovodInternalError:
                logger.warning("Failed to perform allreduce operation on "
                               "the gradients. Retrying...")
                # A HorovodInternalError means the communication used to
                # merge gradients failed, so we restore state, rebuild the
                # communication, and retry.
                self.restore()
            except RuntimeError:
                traceback.print_exc()
                self.restore()
        if not allreduce_success:
            raise RuntimeError("Failed to perform allreduce.")
        self._update_completed_minibatches()
        return result
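This variant assumes traceback from the standard library and HorovodInternalError, which Horovod exposes in horovod.common.exceptions. Its restore hook is not shown; the sketch below (written at module level for brevity) is a hypothetical recovery hook modeled on the retry path in Example No. 2, not the project's actual implementation:

import time
import traceback

from horovod.common.exceptions import HorovodInternalError


def restore(self):
    # Hypothetical recovery: rebuild the Horovod communication group
    # through the rendezvous manager, as Example No. 2 does inline.
    time.sleep(3)
    self._rendezvous_manager.init_horovod_if_needed()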