def _all_gather(obj, timeout=UNSET_RPC_TIMEOUT):
    r"""
    This is similar to torch.distributed.all_gather(), but uses RPC. It picks
    the worker with the smallest name (alphabetical order) as the leader. Then
    all followers send their data ``obj`` to the leader. After the leader has
    received all objects, it broadcasts the results back to all followers. This
    function blocks until all workers have received the gathered results.
    """
    assert (
        _ALL_WORKER_NAMES is not None
    ), "`_ALL_WORKER_NAMES` is not initialized for `def _all_gather`."
    leader_name = sorted(_ALL_WORKER_NAMES)[0]

    self_name = _get_current_rpc_agent().get_worker_info().name

    global _all_gather_sequence_id
    with _all_gather_dict_lock:
        sequence_id = _all_gather_sequence_id
        _all_gather_sequence_id += 1

    is_leader = leader_name == self_name
    if timeout == UNSET_RPC_TIMEOUT:
        timeout = get_rpc_timeout()

    # Phase 1: Followers send their objects to the leader
    if is_leader:
        _gather_to_leader(sequence_id, self_name, obj)
    else:
        rpc_sync(
            leader_name,
            _gather_to_leader,
            args=(sequence_id, self_name, obj),
            timeout=timeout,
        )

    with _all_gather_dict_lock:
        states = _all_gather_sequence_id_to_states[sequence_id]
    states.proceed_signal.wait()

    # Phase 2: Leader broadcasts gathered results to all followers
    # Leader's signal is the first to be unblocked, after receiving all
    # followers' data objects.
    if is_leader:
        worker_name_to_response_future_dict = dict()
        for follower_name in _ALL_WORKER_NAMES - {leader_name}:
            fut = rpc_async(
                follower_name,
                _broadcast_to_followers,
                args=(sequence_id, states.gathered_objects),
                timeout=timeout,
            )
            worker_name_to_response_future_dict[follower_name] = fut

        errors = []
        for follower_name, fut in worker_name_to_response_future_dict.items():
            try:
                fut.wait()
            except RuntimeError as ex:
                errors.append((follower_name, ex))

        if errors:
            raise RuntimeError(
                f"Followers {[e[0] for e in errors]} timed out in _all_gather "
                f"after {timeout:.2f} seconds. The first exception is {errors[0][1]}"
            )

    return states.gathered_objects
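
# A minimal sketch of the helpers `_all_gather` relies on, inferred from the
# call sites above; the real implementations inside torch.distributed.rpc may
# differ in detail. `AllGatherStates` pairs the per-worker results with a
# threading.Event that gates the two phases. The optional ``worker_names``
# parameter on `_gather_to_leader` covers both call sites: the original one
# that passes three arguments and the variant below that also passes the
# worker group.

import threading
from collections import defaultdict


class AllGatherStates(object):
    def __init__(self):
        # Maps worker name to the object it contributed.
        self.gathered_objects = {}
        # Set on the leader once every expected worker has reported, and on
        # each follower once the leader's broadcast arrives.
        self.proceed_signal = threading.Event()


_all_gather_dict_lock = threading.RLock()
_all_gather_sequence_id_to_states = defaultdict(AllGatherStates)


def _gather_to_leader(sequence_id, worker_name, obj, worker_names=None):
    # Runs on the leader (directly, or via rpc_sync from a follower): record
    # the caller's object and fire the signal once all workers have reported.
    with _all_gather_dict_lock:
        if not worker_names:
            worker_names = _ALL_WORKER_NAMES
        assert (
            worker_name in worker_names
        ), f"{worker_name} is not expected by leader."
        states = _all_gather_sequence_id_to_states[sequence_id]
        assert (
            worker_name not in states.gathered_objects
        ), f"{worker_name} reported results for sequence id {sequence_id} twice."
        states.gathered_objects[worker_name] = obj
        if worker_names == set(states.gathered_objects.keys()):
            states.proceed_signal.set()


def _broadcast_to_followers(sequence_id, objects_map):
    # Runs on each follower (via rpc_async from the leader): store the full
    # result map and unblock the follower's wait in `_all_gather`.
    with _all_gather_dict_lock:
        states = _all_gather_sequence_id_to_states[sequence_id]
    assert (
        not states.proceed_signal.is_set()
    ), f"Termination signal for sequence id {sequence_id} got set twice."
    states.gathered_objects = objects_map
    states.proceed_signal.set()
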
# Variant accepting an explicit ``worker_names`` set; sequence ids are scoped
# per worker group so concurrent gathers over different groups do not collide.
def _all_gather(obj, worker_names=None, timeout=UNSET_RPC_TIMEOUT):
    r"""
    This is similar to torch.distributed.all_gather(), but uses RPC. It picks
    the worker with the smallest name (alphabetical order) as the leader. Then
    all followers send their data ``obj`` to the leader. After the leader has
    received all objects, it broadcasts the results back to all followers. This
    function blocks until all workers have received the gathered results.
    """
    if not worker_names:
        assert (
            _ALL_WORKER_NAMES is not None
        ), "`_ALL_WORKER_NAMES` is not initialized for `def _all_gather`."
        worker_names = _ALL_WORKER_NAMES
    leader_name = sorted(worker_names)[0]

    self_name = _get_current_rpc_agent().get_worker_info().name

    with _all_gather_dict_lock:
        concat_names = "".join(sorted(worker_names))
        sequence_num = _all_gather_sequence_id.get(concat_names, 0)
        _all_gather_sequence_id[concat_names] = sequence_num + 1
        sequence_id = concat_names + str(sequence_num)

    is_leader = leader_name == self_name

    if timeout == UNSET_RPC_TIMEOUT:
        # Timeout is specified by agent for RPC calls
        rpc_timeout = get_rpc_timeout()
        # No timeout for signal
        signal_timeout = None
    elif timeout == DEFAULT_SHUTDOWN_TIMEOUT:
        # No timeout for RPC
        rpc_timeout = timeout
        # No timeout for signal
        signal_timeout = None
    else:
        # Signal and RPC timeout use the same timeout
        signal_timeout = rpc_timeout = timeout

    # Phase 1: Followers send their objects to the leader
    if is_leader:
        _gather_to_leader(sequence_id, self_name, obj, worker_names)
    else:
        rpc_sync(
            leader_name,
            _gather_to_leader,
            args=(sequence_id, self_name, obj, worker_names),
            timeout=rpc_timeout,
        )

    with _all_gather_dict_lock:
        states = _all_gather_sequence_id_to_states[sequence_id]

    # Timeout is either set by function parameter or None (which is indefinite)
    states.proceed_signal.wait(timeout=signal_timeout)

    # Phase 2: Leader broadcasts gathered results to all followers
    # Leader's signal is the first to be unblocked, after receiving all
    # followers' data objects.
    if is_leader:
        worker_name_to_response_future_dict = dict()
        for follower_name in worker_names - {leader_name}:
            fut = rpc_async(
                follower_name,
                _broadcast_to_followers,
                args=(sequence_id, states.gathered_objects),
                timeout=rpc_timeout,
            )
            worker_name_to_response_future_dict[follower_name] = fut

        errors = []
        for follower_name, fut in worker_name_to_response_future_dict.items():
            try:
                fut.wait()
            except RuntimeError as ex:
                errors.append((follower_name, ex))

        if errors:
            raise RuntimeError(
                f"Followers {[e[0] for e in errors]} timed out in _all_gather "
                f"after {rpc_timeout:.2f} seconds. The first exception is {errors[0][1]}"
            )

    # Clean up the states for this sequence_id
    with _all_gather_dict_lock:
        states = _all_gather_sequence_id_to_states.pop(sequence_id)

    return states.gathered_objects
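
# A hedged usage sketch: `_all_gather` is a private helper (the graceful
# barrier in `rpc.shutdown()` is built on it), but it can be exercised
# directly once RPC is initialized. The `_demo` function, worker names,
# address, and port below are assumptions made for this example.

import os

import torch.distributed.rpc as rpc
import torch.multiprocessing as mp


def _demo(rank, world_size):
    os.environ["MASTER_ADDR"] = "localhost"
    os.environ["MASTER_PORT"] = "29500"
    rpc.init_rpc(f"worker{rank}", rank=rank, world_size=world_size)
    # Every worker contributes its rank; each one gets back the full mapping
    # {"worker0": 0, "worker1": 1} once the leader's broadcast lands.
    gathered = rpc.api._all_gather(rank)
    print(gathered)
    rpc.shutdown()


if __name__ == "__main__":
    world_size = 2
    mp.spawn(_demo, args=(world_size,), nprocs=world_size)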