def _get_test_objects(self, task_type, task_id, num_gpus=0, local_mode=False):
  """Returns collective all-reduce ops, their devices and a session target."""
  collective_keys = cross_tower_utils.CollectiveKeys(
      group_key_start=10 * num_gpus +
      MultiWorkerCollectiveAllReduceTest.collective_key_base,
      instance_key_start=num_gpus * 100 +
      MultiWorkerCollectiveAllReduceTest.collective_key_base,
      instance_key_with_id_start=num_gpus * 10000 +
      MultiWorkerCollectiveAllReduceTest.collective_key_base)
  if local_mode:
    collective_all_reduce_ops = cross_tower_ops_lib.CollectiveAllReduce(
        1, num_gpus, collective_keys=collective_keys)
    if num_gpus:
      devices = ["/device:GPU:%d" % i for i in range(num_gpus)]
    else:
      devices = ["/device:CPU:0"]
    return collective_all_reduce_ops, devices, "local"
  else:
    collective_all_reduce_ops = cross_tower_ops_lib.CollectiveAllReduce(
        3, num_gpus, collective_keys=collective_keys)
    if num_gpus:
      devices = [
          "/job:%s/task:%d/device:GPU:%d" % (task_type, task_id, i)
          for i in range(num_gpus)
      ]
    else:
      devices = ["/job:%s/task:%d" % (task_type, task_id)]
    return (collective_all_reduce_ops, devices,
            self._workers[task_id].target)
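
# A minimal sketch of the key spacing used above. The helper and the concrete
# key base are hypothetical (the real base comes from
# MultiWorkerCollectiveAllReduceTest.collective_key_base); the point is that
# each GPU count starts its group/instance keys at distinct offsets, so
# CollectiveKeys built for different test configurations do not start from
# the same keys.
def _example_key_starts(num_gpus, collective_key_base=1000):
  return {
      "group_key_start": 10 * num_gpus + collective_key_base,
      "instance_key_start": num_gpus * 100 + collective_key_base,
      "instance_key_with_id_start": num_gpus * 10000 + collective_key_base,
  }

# _example_key_starts(2) ->
#   {"group_key_start": 1020, "instance_key_start": 1200,
#    "instance_key_with_id_start": 21000}
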
def _initialize_local_worker(self, container_strategy, num_gpus_per_worker):
  """Initializes the object for local training."""
  self._is_chief = True
  self._num_workers = 1

  if num_gpus_per_worker:
    local_devices = [
        "/device:GPU:%d" % i for i in range(num_gpus_per_worker)
    ]
  else:
    local_devices = ["/device:CPU:0"]

  self._collective_keys = cross_tower_utils.CollectiveKeys()
  super(CollectiveAllReduceExtended, self).__init__(
      container_strategy,
      devices=local_devices,
      cross_device_ops=cross_tower_ops_lib.CollectiveAllReduce(
          num_workers=1,
          num_gpus_per_worker=num_gpus_per_worker,
          collective_keys=self._collective_keys))

  self._cluster_spec = None
  self._task_type = None
  self._task_id = None

  logging.info("CollectiveAllReduceStrategy with local_devices = %r",
               local_devices)
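
# For reference, the two device layouts the local initializer above produces
# (values follow directly from the code): with num_gpus_per_worker=2 the
# replica devices are ["/device:GPU:0", "/device:GPU:1"]; with no GPUs the
# single replica falls back to the CPU.
_example_local_devices = ["/device:GPU:%d" % i for i in range(2)]
# -> ["/device:GPU:0", "/device:GPU:1"]; num_gpus_per_worker=0 gives
# ["/device:CPU:0"] instead.
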
def _initialize_multi_worker(self, container_strategy, num_gpus_per_worker,
                             cluster_spec, task_type, task_id):
  """Initializes the object for multi-worker training."""
  if task_type is None or task_id is None:
    raise ValueError("When `cluster_spec` is given, you must also specify "
                     "`task_type` and `task_id`")
  if task_type not in ["chief", "worker"]:
    raise ValueError(
        "Unrecognized task_type: %r, valid task types are: \"chief\", "
        "\"worker\"." % task_type)
  cluster_spec = multi_worker_util.normalize_cluster_spec(cluster_spec)
  self._num_workers = multi_worker_util.worker_count(cluster_spec, task_type)
  if not self._num_workers:
    raise ValueError("No `worker` or `chief` tasks can be found in "
                     "`cluster_spec`.")

  self._is_chief = multi_worker_util.is_chief(cluster_spec, task_type,
                                              task_id)

  worker_device = "/job:%s/task:%d" % (task_type, task_id)
  if num_gpus_per_worker:
    local_devices = [
        "%s/device:GPU:%d" % (worker_device, i)
        for i in range(num_gpus_per_worker)
    ]
  else:
    local_devices = [worker_device]

  self._collective_keys = cross_tower_utils.CollectiveKeys()
  super(CollectiveAllReduceExtended, self).__init__(
      container_strategy,
      devices=local_devices,
      cross_device_ops=cross_tower_ops_lib.CollectiveAllReduce(
          num_workers=self._num_workers,
          num_gpus_per_worker=num_gpus_per_worker,
          collective_keys=self._collective_keys))

  # Add a default device so that ops without specified devices will not end
  # up on other workers.
  self._default_device = "/job:%s/task:%d" % (task_type, task_id)

  # `cluster_spec` was already normalized above.
  self._cluster_spec = cluster_spec
  self._task_type = task_type
  self._task_id = task_id

  logging.info(
      "Multi-worker CollectiveAllReduceStrategy with "
      "cluster_spec = %r, task_type = %r, task_id = %r, "
      "num_workers = %r, local_devices = %r", cluster_spec.as_dict(),
      task_type, task_id, self._num_workers, local_devices)
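
# A minimal sketch (hostnames hypothetical) of the cluster_spec shape the
# multi-worker initializer above accepts: at most one "chief" task plus a
# list of "worker" tasks. Per the code above, task_type="worker", task_id=1
# with two GPUs yields local_devices of "/job:worker/task:1/device:GPU:0" and
# "/job:worker/task:1/device:GPU:1", and the chief is counted toward
# self._num_workers alongside the workers.
_example_cluster_spec = {
    "chief": ["host0:2222"],
    "worker": ["host1:2222", "host2:2222"],
}
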
def _initialize(self, cluster_spec, task_type, task_id):
  """Initializes the object, either for local or multi-worker training."""
  if cluster_spec:
    if task_type is None or task_id is None:
      raise ValueError("When `cluster_spec` is given, you must also specify "
                       "`task_type` and `task_id`")
    if task_type not in ["chief", "worker"]:
      raise ValueError(
          "Unrecognized task_type: %r, valid task types are: \"chief\", "
          "\"worker\"." % task_type)
    self._cluster_spec = multi_worker_util.normalize_cluster_spec(
        cluster_spec)
    worker_device = "/job:%s/task:%d" % (task_type, task_id)
    num_workers = len(self._cluster_spec.as_dict().get("worker", [])) + len(
        self._cluster_spec.as_dict().get("chief", []))
    if not num_workers:
      raise ValueError("No `worker` or `chief` tasks can be found in "
                       "`cluster_spec`.")
    self._is_chief = multi_worker_util.is_chief(cluster_spec, task_type,
                                                task_id)
  else:
    self._cluster_spec = None
    self._is_chief = True
    worker_device = ""
    num_workers = 1
  self._num_workers = num_workers

  if self._num_gpus_per_worker:
    local_devices = [
        "%s/device:GPU:%d" % (worker_device, i)
        for i in range(self._num_gpus_per_worker)
    ]
  else:
    local_devices = [worker_device]

  self._collective_keys = cross_tower_utils.CollectiveKeys()
  super(CollectiveAllReduceStrategy, self).__init__(
      devices=local_devices,
      cross_tower_ops=cross_tower_ops_lib.CollectiveAllReduce(
          num_workers=num_workers,
          num_gpus_per_worker=self._num_gpus_per_worker,
          collective_keys=self._collective_keys))

  # Add a default device so that ops without specified devices will not end
  # up on other workers.
  if cluster_spec:
    self._default_device = "/job:%s/replica:0/task:%d" % (task_type, task_id)
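
# For illustration (task values hypothetical): with task_type="worker" and
# task_id=1, the default device set above is "/job:worker/replica:0/task:1",
# so ops created without an explicit device are pinned to this task instead
# of ending up on a peer worker.
_example_default_device = "/job:%s/replica:0/task:%d" % ("worker", 1)
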
def _initialize(self, cluster_spec, task_type, task_id):
  """Initializes the object, either for local or multi-worker training."""
  if task_type not in ["chief", "worker"]:
    raise ValueError(
        "Unrecognized task_type: %r, valid task types are: \"chief\", "
        "\"worker\"." % task_type)
  if cluster_spec:
    self._cluster_spec = _normalize_cluster_spec(cluster_spec)
    worker_device = "/job:%s/task:%d" % (task_type, task_id)
    # Count all participating tasks: every "worker" plus the "chief", if one
    # is declared.
    num_workers = len(self._cluster_spec.as_dict().get("worker", []))
    if "chief" in self._cluster_spec.as_dict():
      num_workers += 1
    if not num_workers:
      raise ValueError("`task_type` should be in `cluster_spec`.")

    # TODO(yuefengz): create a utility to infer chief.
    if "chief" in self._cluster_spec.as_dict() and task_type == "chief":
      assert task_id == 0
      self._is_chief = True
    else:
      assert task_type == "worker"
      self._is_chief = task_id == 0
  else:
    self._cluster_spec = None
    self._is_chief = True
    worker_device = ""
    num_workers = 1
  self._num_workers = num_workers

  if self._num_gpus_per_worker:
    local_devices = [
        "%s/device:GPU:%d" % (worker_device, i)
        for i in range(self._num_gpus_per_worker)
    ]
  else:
    local_devices = [worker_device]

  self._collective_keys = cross_tower_utils.CollectiveKeys()
  super(CollectiveAllReduceStrategy, self).__init__(
      devices=local_devices,
      cross_tower_ops=cross_tower_ops_lib.CollectiveAllReduce(
          num_workers=num_workers,
          num_gpus_per_worker=self._num_gpus_per_worker,
          collective_keys=self._collective_keys))

  # Add a default device so that ops without specified devices will not end
  # up on other workers.
  if cluster_spec:
    self._default_device = "/job:%s/replica:0/task:%d" % (task_type, task_id)
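
# A standalone sketch of the chief-inference rule above (the TODO notes it
# should become a utility). The helper is hypothetical; `cluster_dict` stands
# for self._cluster_spec.as_dict(). The rule: the "chief" task is chief when
# one is declared; otherwise worker 0 acts as chief.
def _example_is_chief(cluster_dict, task_type, task_id):
  if "chief" in cluster_dict and task_type == "chief":
    return True
  return task_type == "worker" and task_id == 0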