Example #1
 def _get_test_objects(self,
                       task_type,
                       task_id,
                       num_gpus=0,
                       local_mode=False):
     collective_keys = cross_tower_utils.CollectiveKeys(
         group_key_start=10 * num_gpus +
         MultiWorkerCollectiveAllReduceTest.collective_key_base,
         instance_key_start=num_gpus * 100 +
         MultiWorkerCollectiveAllReduceTest.collective_key_base,
         instance_key_with_id_start=num_gpus * 10000 +
         MultiWorkerCollectiveAllReduceTest.collective_key_base)
     if local_mode:
         collective_all_reduce_ops = cross_tower_ops_lib.CollectiveAllReduce(
             1, num_gpus, collective_keys=collective_keys)
         if num_gpus:
             devices = ["/device:GPU:%d" % i for i in range(num_gpus)]
         else:
             devices = ["/device:CPU:0"]
         return collective_all_reduce_ops, devices, "local"
     else:
         collective_all_reduce_ops = cross_tower_ops_lib.CollectiveAllReduce(
             3, num_gpus, collective_keys=collective_keys)
         if num_gpus:
             devices = [
                 "/job:%s/task:%d/device:GPU:%d" % (task_type, task_id, i)
                 for i in range(num_gpus)
             ]
         else:
             devices = ["/job:%s/task:%d" % (task_type, task_id)]
         return collective_all_reduce_ops, devices, self._workers[
             task_id].target
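
The three key offsets above (group_key_start, instance_key_start and instance_key_with_id_start) are shifted by a per-test-class collective_key_base so that collective ops launched by different test cases never reuse the same keys. A minimal local-mode sketch along the same lines; the import paths and the concrete base value are assumptions for illustration, not part of the snippet above:

# Assumed contrib-era import paths for the module aliases used on this page.
from tensorflow.contrib.distribute.python import cross_tower_ops as cross_tower_ops_lib
from tensorflow.contrib.distribute.python import cross_tower_utils

num_gpus = 2
collective_key_base = 100000  # arbitrary per-test offset, not a library constant

collective_keys = cross_tower_utils.CollectiveKeys(
    group_key_start=10 * num_gpus + collective_key_base,
    instance_key_start=num_gpus * 100 + collective_key_base,
    instance_key_with_id_start=num_gpus * 10000 + collective_key_base)

# Local mode: a single worker reducing across its own GPUs.
local_ops = cross_tower_ops_lib.CollectiveAllReduce(
    1, num_gpus, collective_keys=collective_keys)
devices = ["/device:GPU:%d" % i for i in range(num_gpus)]
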
Example #2
    def _initialize_local_worker(self, container_strategy,
                                 num_gpus_per_worker):
        """Initializes the object for local training."""
        self._is_chief = True
        self._num_workers = 1

        if num_gpus_per_worker:
            local_devices = [
                "/device:GPU:%d" % i for i in range(num_gpus_per_worker)
            ]
        else:
            local_devices = ["/device:CPU:0"]

        self._collective_keys = cross_tower_utils.CollectiveKeys()
        super(CollectiveAllReduceExtended, self).__init__(
            container_strategy,
            devices=local_devices,
            cross_device_ops=cross_tower_ops_lib.CollectiveAllReduce(
                num_workers=1,
                num_gpus_per_worker=num_gpus_per_worker,
                collective_keys=self._collective_keys))

        self._cluster_spec = None
        self._task_type = None
        self._task_id = None

        logging.info("CollectiveAllReduceStrategy with local_devices = %r",
                     local_devices)
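
From the outside, this local path is reached simply by constructing the strategy without any cluster configuration. A hedged sketch, assuming the contrib-era import path:

from tensorflow.contrib.distribute.python import collective_all_reduce_strategy

# With no cluster_spec and no configure() call, the local initializer above is
# used and the replicas live on the local GPUs (or on /device:CPU:0 when
# num_gpus_per_worker is 0).
strategy = collective_all_reduce_strategy.CollectiveAllReduceStrategy(
    num_gpus_per_worker=2)
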
Example #3
    def _initialize_multi_worker(self, container_strategy, num_gpus_per_worker,
                                 cluster_spec, task_type, task_id):
        """Initializes the object for multi-worker training."""
        if task_type is None or task_id is None:
            raise ValueError(
                "When `cluster_spec` is given, you must also specify "
                "`task_type` and `task_id`")
        if task_type not in ["chief", "worker"]:
            raise ValueError(
                "Unrecognized task_type: %r, valid task types are: \"chief\", "
                "\"worker\"." % task_type)
        cluster_spec = multi_worker_util.normalize_cluster_spec(cluster_spec)
        self._num_workers = multi_worker_util.worker_count(
            cluster_spec, task_type)
        if not self._num_workers:
            raise ValueError("No `worker` or `chief` tasks can be found in "
                             "`cluster_spec`.")

        self._is_chief = multi_worker_util.is_chief(cluster_spec, task_type,
                                                    task_id)

        worker_device = "/job:%s/task:%d" % (task_type, task_id)
        if num_gpus_per_worker:
            local_devices = [
                "%s/device:GPU:%d" % (worker_device, i)
                for i in range(num_gpus_per_worker)
            ]
        else:
            local_devices = [worker_device]

        self._collective_keys = cross_tower_utils.CollectiveKeys()
        super(CollectiveAllReduceExtended, self).__init__(
            container_strategy,
            devices=local_devices,
            cross_device_ops=cross_tower_ops_lib.CollectiveAllReduce(
                num_workers=self._num_workers,
                num_gpus_per_worker=num_gpus_per_worker,
                collective_keys=self._collective_keys))

        # Add a default device so that ops without specified devices will not end up
        # on other workers.
        self._default_device = "/job:%s/task:%d" % (task_type, task_id)

        self._cluster_spec = multi_worker_util.normalize_cluster_spec(
            cluster_spec)
        self._task_type = task_type
        self._task_id = task_id

        logging.info(
            "Multi-worker CollectiveAllReduceStrategy with "
            "cluster_spec = %r, task_type = %r, task_id = %r, "
            "num_workers = %r, local_devices = %r", cluster_spec.as_dict(),
            task_type, task_id, self._num_workers, local_devices)
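
For reference, the cluster_spec handed to multi_worker_util.normalize_cluster_spec can be a plain dict (or a tf.train.ClusterSpec). With the chief/worker layout the code above expects, it might look like the sketch below; the host addresses are placeholders:

# One chief plus two workers; any reachable host:port pairs would do.
cluster_spec = {
    "chief": ["host0:2222"],
    "worker": ["host1:2222", "host2:2222"],
}
# Counting both chief and worker tasks, as these initializers do, gives
# num_workers == 3; the chief task (task_type="chief", task_id=0) is the one
# for which _is_chief ends up True.
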
Example #4
    def _initialize(self, cluster_spec, task_type, task_id):
        if cluster_spec:
            if task_type is None or task_id is None:
                raise ValueError(
                    "When `cluster_spec` is given, you must also specify "
                    "`task_type` and `task_id`")
            if task_type not in ["chief", "worker"]:
                raise ValueError(
                    "Unrecognized task_type: %r, valid task types are: \"chief\", "
                    "\"worker\"." % task_type)
            self._cluster_spec = multi_worker_util.normalize_cluster_spec(
                cluster_spec)
            worker_device = "/job:%s/task:%d" % (task_type, task_id)
            num_workers = (
                len(self._cluster_spec.as_dict().get("worker", [])) +
                len(self._cluster_spec.as_dict().get("chief", [])))
            if not num_workers:
                raise ValueError(
                    "No `worker` or `chief` tasks can be found in "
                    "`cluster_spec`.")

            self._is_chief = multi_worker_util.is_chief(
                cluster_spec, task_type, task_id)
        else:
            self._cluster_spec = None
            self._is_chief = True
            worker_device = ""
            num_workers = 1
        self._num_workers = num_workers

        if self._num_gpus_per_worker:
            local_devices = [
                "%s/device:GPU:%d" % (worker_device, i)
                for i in range(self._num_gpus_per_worker)
            ]
        else:
            local_devices = [worker_device]

        self._collective_keys = cross_tower_utils.CollectiveKeys()
        super(CollectiveAllReduceStrategy, self).__init__(
            devices=local_devices,
            cross_tower_ops=cross_tower_ops_lib.CollectiveAllReduce(
                num_workers=num_workers,
                num_gpus_per_worker=self._num_gpus_per_worker,
                collective_keys=self._collective_keys))

        # Add a default device so that ops without specified devices will not end up
        # on other workers.
        if cluster_spec:
            self._default_device = "/job:%s/replica:0/task:%d" % (task_type,
                                                                  task_id)
Example #5
    def _initialize(self, cluster_spec, task_type, task_id):
        if task_type not in ["chief", "worker"]:
            raise ValueError(
                "Unrecognized task_type: %r, valid task types are: \"chief\", "
                "\"worker\"." % task_type)
        if cluster_spec:
            self._cluster_spec = _normalize_cluster_spec(cluster_spec)
            worker_device = "/job:%s/task:%d" % (task_type, task_id)
            num_workers = len(self._cluster_spec.as_dict().get(task_type, []))
            if "chief" in self._cluster_spec.as_dict():
                num_workers += 1
            if not num_workers:
                raise ValueError("`task_type` shoud be in `cluster_spec`.")

            # TODO(yuefengz): create a utility to infer chief.
            if "chief" in self._cluster_spec.as_dict(
            ) and task_type == "chief":
                assert task_id == 0
                self._is_chief = True
            else:
                assert task_type == "worker"
                self._is_chief = task_id == 0
        else:
            self._cluster_spec = None
            self._is_chief = True
            worker_device = ""
            num_workers = 1
        self._num_workers = num_workers

        if self._num_gpus_per_worker:
            local_devices = [
                "%s/device:GPU:%d" % (worker_device, i)
                for i in range(self._num_gpus_per_worker)
            ]
        else:
            local_devices = [worker_device]

        self._collective_keys = cross_tower_utils.CollectiveKeys()
        super(CollectiveAllReduceStrategy, self).__init__(
            devices=local_devices,
            cross_tower_ops=cross_tower_ops_lib.CollectiveAllReduce(
                num_workers=num_workers,
                num_gpus_per_worker=self._num_gpus_per_worker,
                collective_keys=self._collective_keys))

        # Add a default device so that ops without specified devices will not end up
        # on other workers.
        if cluster_spec:
            self._default_device = "/job:%s/replica:0/task:%d" % (task_type,
                                                                  task_id)
Example #6
 def _get_test_object(self, task_type, task_id, num_gpus=0):
     distribution = collective_all_reduce_strategy.CollectiveAllReduceStrategy(
         num_gpus_per_worker=num_gpus,
         cluster_spec=self._cluster_spec,
         task_type=task_type,
         task_id=task_id)
     collective_keys = cross_tower_utils.CollectiveKeys(
         group_key_start=10 * num_gpus +
         DistributedCollectiveAllReduceStrategyTest.collective_key_base,
         instance_key_start=num_gpus * 100 +
         DistributedCollectiveAllReduceStrategyTest.collective_key_base,
         instance_key_with_id_start=num_gpus * 10000 +
         DistributedCollectiveAllReduceStrategyTest.collective_key_base)
     distribution._collective_keys = collective_keys
     distribution._cross_tower_ops._collective_keys = collective_keys
     return distribution, self._workers[task_id].target
Example #7
  def __init__(self,
               num_workers=1,
               num_gpus_per_worker=0,
               all_reduce_merge_scope=32,
               collective_keys=None):
    """Initializes the object.

    Args:
      num_workers: number of workers in the between-graph replicated training.
      num_gpus_per_worker: number of GPUs per worker.
      all_reduce_merge_scope: size of groups into which to partition consecutive
        gradients grouped under a common 'allreduce' name scope. This is useful
        for some optimization of collective ops.
      collective_keys: an optional CollectiveKeys object.
    """
    self._num_workers = num_workers
    self._num_gpus_per_worker = num_gpus_per_worker
    self._all_reduce_merge_scope = all_reduce_merge_scope
    self._collective_keys = (collective_keys or
                             cross_tower_utils.CollectiveKeys())
    super(CollectiveAllReduce, self).__init__()
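
Given the constructor above, a hedged usage sketch for a three-worker, two-GPU-per-worker setup; the explicit CollectiveKeys argument is optional and all_reduce_merge_scope is just the default from the signature, spelled out for clarity:

from tensorflow.contrib.distribute.python import cross_tower_ops as cross_tower_ops_lib
from tensorflow.contrib.distribute.python import cross_tower_utils

collective_ops = cross_tower_ops_lib.CollectiveAllReduce(
    num_workers=3,
    num_gpus_per_worker=2,
    all_reduce_merge_scope=32,  # default group size for the 'allreduce' scope
    collective_keys=cross_tower_utils.CollectiveKeys())
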
Example #8
 def _get_test_object(self, task_type, task_id, num_gpus=0):
     distribution = collective_all_reduce_strategy.CollectiveAllReduceStrategy(
         num_gpus_per_worker=num_gpus)
     if task_type and task_id is not None:
         distribution.configure(cluster_spec=self._cluster_spec,
                                task_type=task_type,
                                task_id=task_id)
     collective_keys = cross_tower_utils.CollectiveKeys(
         group_key_start=10 * num_gpus +
         CollectiveAllReduceStrategyTestBase.collective_key_base,
         instance_key_start=num_gpus * 100 +
         CollectiveAllReduceStrategyTestBase.collective_key_base,
         instance_key_with_id_start=num_gpus * 10000 +
         CollectiveAllReduceStrategyTestBase.collective_key_base)
     distribution._collective_keys = collective_keys
     distribution._cross_tower_ops._collective_keys = collective_keys
     if task_type and task_id is not None:
         return distribution, 'grpc://' + self._cluster_spec[task_type][
             task_id]
     else:
         return distribution, ''
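
The same construction works outside the test harness: build the strategy, configure it with the cluster spec and this task's identity, and point the session at the matching worker. A hedged sketch with placeholder hosts, assuming the contrib-era import path:

from tensorflow.contrib.distribute.python import collective_all_reduce_strategy

cluster_spec = {"worker": ["host1:2222", "host2:2222"]}

strategy = collective_all_reduce_strategy.CollectiveAllReduceStrategy(
    num_gpus_per_worker=2)
strategy.configure(cluster_spec=cluster_spec, task_type="worker", task_id=0)

# The helper above derives the session target the same way.
master_target = "grpc://" + cluster_spec["worker"][0]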