Example #1
# Imports this snippet relies on (TensorFlow 1.x).
import six

from tensorflow.core.framework import node_def_pb2
from tensorflow.python.framework import device as pydev
from tensorflow.python.training import device_setter


def local_device_setter(num_devices=1,
                        ps_device_type='cpu',
                        worker_device='/cpu:0',
                        ps_ops=None,
                        ps_strategy=None):
  if ps_ops is None:
    ps_ops = ['Variable', 'VariableV2', 'VarHandleOp']

  if ps_strategy is None:
    ps_strategy = device_setter._RoundRobinStrategy(num_devices)
  if not six.callable(ps_strategy):
    raise TypeError("ps_strategy must be callable")

  def _local_device_chooser(op):
    current_device = pydev.DeviceSpec.from_string(op.device or "")

    node_def = op if isinstance(op, node_def_pb2.NodeDef) else op.node_def
    if node_def.op in ps_ops:
      ps_device_spec = pydev.DeviceSpec.from_string(
          '/{}:{}'.format(ps_device_type, ps_strategy(op)))

      ps_device_spec.merge_from(current_device)
      return ps_device_spec.to_string()
    else:
      worker_device_spec = pydev.DeviceSpec.from_string(worker_device or "")
      worker_device_spec.merge_from(current_device)
      return worker_device_spec.to_string()
  return _local_device_chooser
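A minimal usage sketch for the setter above (assuming TensorFlow 1.x graph mode and the imports shown in the snippet): the returned chooser is passed to tf.device, so variable-creating ops land on the parameter-server device while every other op stays on the worker device.

import tensorflow as tf

setter = local_device_setter(num_devices=1,
                             ps_device_type='cpu',
                             worker_device='/gpu:0')
with tf.device(setter):
    w = tf.Variable(tf.zeros([10, 10]))  # variable op type is in ps_ops -> /cpu:0
    y = tf.matmul(w, w)                  # regular op -> stays on /gpu:0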
Example #2
    def device_setter(self, worker_device, ps_ops=None):
        if ps_ops is None:
            ps_ops = ['Variable', 'VariableV2', 'VarHandleOp']

        if self.ps_type == 'CPU':
            ps_strategy = device_setter._RoundRobinStrategy(
                self.num_gpu)
        else:
            ps_strategy = tf.contrib.training.GreedyLoadBalancingStrategy(
                self.num_gpu, tf.contrib.training.byte_size_load_fn)

        def device_chooser(op):
            current_device = pydev.DeviceSpec.from_string(op.device or "")
            node_def = op if isinstance(
                op, node_def_pb2.NodeDef) else op.node_def
            if node_def.op in ps_ops:
                ps_device_spec = pydev.DeviceSpec.from_string(
                    '/{}:{}'.format(self.ps_type.lower(), ps_strategy(op)))

                ps_device_spec.merge_from(current_device)
                return ps_device_spec.to_string()
            else:
                worker_device_spec = pydev.DeviceSpec.from_string(
                    worker_device or "")
                worker_device_spec.merge_from(current_device)
                return worker_device_spec.to_string()
        return device_chooser
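Example #2 is a method lifted from a class that is not shown; it assumes the instance exposes ps_type ('CPU' or 'GPU') and num_gpu attributes. A minimal sketch of such a host class and its use (the class name and constructor are assumptions, not part of the original code):

import tensorflow as tf

class ReplicatedTrainer(object):
    # Hypothetical host class; the device_setter method from the snippet
    # above is assumed to be defined on this class.
    def __init__(self, num_gpu, ps_type='CPU'):
        self.num_gpu = num_gpu  # worker GPUs to balance variables over
        self.ps_type = ps_type  # 'CPU' or 'GPU' parameter-server placement

trainer = ReplicatedTrainer(num_gpu=2)
with tf.device(trainer.device_setter(worker_device='/gpu:0')):
    v = tf.Variable(0.0)  # routed by the round-robin CPU strategy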
Example #3
  def test_round_robin_placement(self):
    ps_devices = [
        '/device:GPU:0', '/device:GPU:1', '/device:GPU:3', '/device:GPU:4'
    ]
    round_robin = device_setter._RoundRobinStrategy(num_tasks=len(ps_devices))

    local_device_setter = replicate_model_fn._local_device_setter(
        ps_devices=ps_devices,
        ps_strategy=round_robin,
        worker_device='/device:GPU:2')

    with ops_lib.device(local_device_setter):
      a = variables.Variable(0.01)
      self.assertEqual('/device:GPU:0', a.device)

      b = variables.Variable(0.02)
      self.assertEqual('/device:GPU:1', b.device)

      c = variables.Variable(0.03)
      self.assertEqual('/device:GPU:3', c.device)

      a_op = array_ops.concat(a, axis=0)
      self.assertEqual('/device:GPU:2', a_op.device)

      b_op = array_ops.concat(b, axis=0)
      self.assertEqual('/device:GPU:2', b_op.device)

      c = variables.Variable(0.03)
      self.assertEqual('/device:GPU:4', c.device)

      d = variables.Variable(0.03)
      self.assertEqual('/device:GPU:0', d.device)

      c_op = array_ops.concat(c, axis=0)
      self.assertEqual('/device:GPU:2', c_op.device)
Example #4
    def test_vars_are_on_ps_but_ops_are_on_workers(self):
        ps_devices = ['/device:GPU:3']
        round_robin = device_setter._RoundRobinStrategy(
            num_tasks=len(ps_devices))

        local_device_setter = replicate_model_fn._local_device_setter(
            ps_devices=ps_devices,
            ps_strategy=round_robin,
            worker_device='/device:GPU:2')

        with ops_lib.device(local_device_setter):
            a = variables.Variable(0.01)
            self.assertEqual('/device:GPU:3', a.device)

            b = variables.Variable(0.02)
            self.assertEqual('/device:GPU:3', b.device)

            c = variables.Variable(0.03)
            self.assertEqual('/device:GPU:3', c.device)

            a_op = array_ops.concat(a, axis=0)
            self.assertEqual('/device:GPU:2', a_op.device)

            b_op = array_ops.concat(b, axis=0)
            self.assertEqual('/device:GPU:2', b_op.device)
Example #5
def local_device_setter(num_devices=1,
                        ps_device_type='cpu',
                        worker_device='/cpu:0',
                        ps_ops=None,
                        ps_strategy=None):
    if ps_ops is None:
        ps_ops = ['Variable', 'VariableV2', 'VarHandleOp']

    if ps_strategy is None:
        ps_strategy = device_setter._RoundRobinStrategy(num_devices)
    if not six.callable(ps_strategy):
        raise TypeError("ps_strategy must be callable")

    def _local_device_chooser(op):
        current_device = pydev.DeviceSpec.from_string(op.device or "")

        node_def = op if isinstance(op, node_def_pb2.NodeDef) else op.node_def
        if node_def.op in ps_ops:
            ps_device_spec = pydev.DeviceSpec.from_string('/{}:{}'.format(
                ps_device_type, ps_strategy(op)))

            ps_device_spec.merge_from(current_device)
            return ps_device_spec.to_string()
        else:
            worker_device_spec = pydev.DeviceSpec.from_string(worker_device
                                                              or "")
            worker_device_spec.merge_from(current_device)
            return worker_device_spec.to_string()

    return _local_device_chooser
Example #6
def local_device_setter(num_devices=1,
                        ps_device_type='cpu',
                        worker_device='/cpu:0',
                        ps_ops=None,
                        ps_strategy=None):
    #  cluster = tf.train.ClusterSpec({"worker":["147.46.15.21:123", "147.46.15.21:124", "147.46.15.23:123", "147.46.15.23:124"], "ps":["147.46.15.21:456", "147.46.15.23:678"]})

    if ps_ops is None:
        ps_ops = ['Variable', 'VariableV2', 'VarHandleOp']

    ps_strategy = device_setter._RoundRobinStrategy(num_devices)

    def _local_device_chooser(op):
        current_device = pydev.DeviceSpec.from_string(op.device or "")
        node_def = op if isinstance(op, node_def_pb2.NodeDef) else op.node_def

        # Note: node_def.op here is the op's type name (a string), so the
        # isinstance checks below can never be true and every op currently
        # falls through to the worker branch.
        if node_def.op in ps_ops and (
                isinstance(node_def.op, ops.IndexedSlices)
                or isinstance(node_def.op, sparse_tensor.SparseTensor)):
            #    if node_def.op in ps_ops:
            ps_device_spec = pydev.DeviceSpec.from_string('/{}:{}'.format(
                ps_device_type, ps_strategy(op)))

            ps_device_spec.merge_from(current_device)
            #      print("PS DEVICE STRING: ", ps_device_spec.to_string())
            return ps_device_spec.to_string()
        else:
            worker_device_spec = pydev.DeviceSpec.from_string(worker_device
                                                              or "")
            worker_device_spec.merge_from(current_device)
            #      print("WORKER DEVICE STRING: ", worker_device_spec.to_string())
            return worker_device_spec.to_string()

    return _local_device_chooser
Example #7
def get_device_setter(device_category: DeviceCategory, device):
    if device_category == DeviceCategory.GPU:
        ps_strategy = tf.contrib.training.GreedyLoadBalancingStrategy(
            len(get_gpu_devices()[0]), tf.contrib.training.byte_size_load_fn)
    else:
        ps_strategy = _RoundRobinStrategy(len(get_cpu_devices()[0]))

    ps_ops = ['Variable', 'VariableV2', 'VarHandleOp']

    def _local_device_chooser(op):
        current_device = pydev.DeviceSpec.from_string(op.device or "")

        node_def = op if isinstance(op, node_def_pb2.NodeDef) else op.node_def
        if node_def.op in ps_ops:
            ps_device_spec = pydev.DeviceSpec.from_string('/{}:{}'.format(
                device_category.name, ps_strategy(op)))

            ps_device_spec.merge_from(current_device)
            return ps_device_spec.to_string()
        else:
            worker_device_spec = pydev.DeviceSpec.from_string(device or "")
            worker_device_spec.merge_from(current_device)
            return worker_device_spec.to_string()

    return _local_device_chooser
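Example #7 relies on a DeviceCategory enum and get_gpu_devices/get_cpu_devices helpers that are not part of the snippet. A minimal sketch of what they could look like, inferred from how the snippet indexes their results (these names and shapes are assumptions):

from enum import Enum

from tensorflow.python.client import device_lib

class DeviceCategory(Enum):
    CPU = 0
    GPU = 1

def _local_device_names(device_type):
    # Names of the visible local devices of the given type.
    return [d.name for d in device_lib.list_local_devices()
            if d.device_type == device_type]

def get_gpu_devices():
    # Wrapped in a 1-tuple because the snippet uses get_gpu_devices()[0].
    return (_local_device_names('GPU'),)

def get_cpu_devices():
    return (_local_device_names('CPU'),)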
Example #8
def local_device_setter(num_devices: int = 1,
                        ps_device_type: str = 'cpu',
                        worker_device: str = '/cpu:0',
                        ps_ops: Optional[List[str]] = None,
                        ps_strategy: Optional[Callable] = None) -> Callable:
    """
    Setter for variable placement

    Parameters
    ----------
    num_devices
        number of devices
    ps_device_type
        device type for setting of the variables, e.g. cpu or gpu
    worker_device
        name of worker device
    ps_ops
        names of parameter server operations
    ps_strategy
        strategy of parameter server

    Returns
    -------
    local_device_chooser
        callable to pass to tf.device

    References
    ----------
    source
        https://github.com/tensorflow/models/blob/master/tutorials/image/
        cifar10_estimator/cifar10_utils.py
    """

    if ps_ops is None:
        ps_ops = ['Variable', 'VariableV2', 'VarHandleOp']

    if ps_strategy is None:
        ps_strategy = device_setter._RoundRobinStrategy(num_devices)
    if not six.callable(ps_strategy):
        raise TypeError("ps_strategy must be callable")

    def _local_device_chooser(op):
        current_device = pydev.DeviceSpec.from_string(op.device or "")

        node_def = op if isinstance(op, node_def_pb2.NodeDef) else op.node_def
        if node_def.op in ps_ops:
            ps_device_spec = pydev.DeviceSpec.from_string('/{}:{}'.format(
                ps_device_type, ps_strategy(op)))

            ps_device_spec.merge_from(current_device)
            return ps_device_spec.to_string()
        else:
            worker_device_spec = pydev.DeviceSpec.from_string(worker_device
                                                              or "")
            worker_device_spec.merge_from(current_device)
            return worker_device_spec.to_string()

    return _local_device_chooser
Example #9
def _get_loss_towers(model_fn,
                     mode,
                     features,
                     labels,
                     params,
                     config,
                     devices,
                     local_ps_devices,
                     loss_reduction=losses.Reduction.SUM,
                     name_scope_pattern=_DEFAULT_NAME_SCOPE_PATTERN):
    """Replicate the loss computation across devices."""
    tower_specs = []

    model_fn_args = util.fn_args(model_fn)
    optional_params = {}
    if 'params' in model_fn_args:
        optional_params['params'] = copy.deepcopy(params)
    if 'config' in model_fn_args:
        optional_params['config'] = copy.deepcopy(config)

    # pylint: disable=protected-access
    round_robin_strategy = device_setter_lib._RoundRobinStrategy(
        num_tasks=len(local_ps_devices))
    # pylint: enable=protected-access

    for i, device in enumerate(devices):
        is_the_first_tower = (i == 0)

        device_setter = _local_device_setter(worker_device=device,
                                             ps_devices=local_ps_devices,
                                             ps_strategy=round_robin_strategy)

        # We would like to preserve the names of the variables and ops that the user
        # might be relying on. Names without a prefix are going to resolve to
        # variables and ops of the first tower.
        name_scope = name_scope_pattern
        if is_the_first_tower:
            name_scope = ''

        with variable_scope.variable_scope('', reuse=not is_the_first_tower):
            with ops_lib.name_scope(name_scope.format(i)):
                with ops_lib.device(device_setter):
                    labels_shard = None
                    if labels:
                        labels_shard = labels[i]

                    tower_spec = model_fn(mode=mode,
                                          features=features[i],
                                          labels=labels_shard,
                                          **optional_params)
                    if loss_reduction != losses.Reduction.SUM:
                        tower_spec = _scale_tower_loss(
                            tower_spec, number_of_towers=len(devices))
                    tower_specs.append(tower_spec)
    return tower_specs
Example #10
def local_device_setter(cluster, worker_device='/cpu:0'):
  ps_ops = ['Variable', 'VariableV2', 'VarHandleOp', 'AutoReloadVariable']

  cluster_spec = cluster.as_dict()
  ps_device="/job:ps/task:0/cpu:0"
  ps_job_name = pydev.DeviceSpec.from_string(ps_device).job
  ps_tasks = len(cluster_spec[ps_job_name])
  ps_strategy = device_setter._RoundRobinStrategy(ps_tasks)
  merge_devices=True  
  print("LOCAL DEVICE SETTER: ")
  print("ps_job_name: ", ps_job_name)
  print("ps_strategy: ", ps_strategy)
  print("ps_device: ", ps_device, "ps_tasks num:", ps_tasks)
  print("worker_device: ", worker_device)
  chooser = _ReplicaDeviceChooser(ps_tasks, ps_device, worker_device,
                                  merge_devices, ps_ops, ps_strategy)
  return chooser.device_function
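A usage sketch for the cluster-based setter above (the ClusterSpec addresses are placeholders; placement is done by the device_function of the _ReplicaDeviceChooser the snippet returns):

import tensorflow as tf

cluster = tf.train.ClusterSpec({
    'worker': ['localhost:2222', 'localhost:2223'],
    'ps': ['localhost:2224'],
})
setter = local_device_setter(cluster, worker_device='/job:worker/task:0')
with tf.device(setter):
    v = tf.Variable(0.0)  # variable ops go to /job:ps/task:0/cpu:0
    loss = v * 2.0        # other ops stay on /job:worker/task:0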
Example #11
def _get_loss_towers(model_fn,
                     mode,
                     features,
                     labels,
                     params,
                     config,
                     devices,
                     local_ps_devices,
                     loss_reduction,
                     name_scope_pattern=_DEFAULT_NAME_SCOPE_PATTERN):
  """Replicate the loss computation across devices."""
  tower_specs = []

  model_fn_args = function_utils.fn_args(model_fn)
  optional_params = {}
  if 'params' in model_fn_args:
    optional_params['params'] = copy.deepcopy(params)
  if 'config' in model_fn_args:
    optional_params['config'] = copy.deepcopy(config)

  # pylint: disable=protected-access
  round_robin_strategy = device_setter_lib._RoundRobinStrategy(
      num_tasks=len(local_ps_devices))
  TowerOptimizer._graph_state().set_reduction_across_towers(
      loss_reduction, len(devices))

  for i, device in enumerate(devices):
    is_the_first_tower = (i == 0)

    device_setter = _local_device_setter(
        worker_device=device,
        ps_devices=local_ps_devices,
        ps_strategy=round_robin_strategy)

    # We would like to preserve the names of the variables and ops that the user
    # might be relying on. Names without a prefix are going to resolve to
    # variables and ops of the first tower.
    name_scope = name_scope_pattern
    if is_the_first_tower:
      name_scope = ''

    with variable_scope.variable_scope(
        '', reuse=not is_the_first_tower) as var_scope:
      with ops_lib.name_scope(name_scope.format(i)) as name_scope:
        with TowerOptimizer._graph_state().tower(
            tower_id=i, var_scope=var_scope, name_scope=name_scope):
          with ops_lib.device(device_setter):
            labels_shard = None
            if labels:
              labels_shard = labels[i]

            tower_spec = model_fn(
                mode=mode,
                features=features[i],
                labels=labels_shard,
                **optional_params)

            if (tower_spec.train_op is not None and len(devices) > 1 and
                not TowerOptimizer.has_been_used()):
              raise ValueError('Please wrap optimizers with TowerOptimizer'
                               ' in order to use replicate_model_fn with'
                               ' multiple `devices`.')

            # Scaling the loss here doesn't actually affect gradients.  Another
            # instance of scaling happens inside the TowerOptimizer.
            tower_spec = _scale_tower_loss(
                tower_spec, loss_reduction, number_of_towers=len(devices))
            tower_specs.append(tower_spec)

  if not TowerOptimizer._did_towers_have_same_optimizer_calls():
    raise ValueError('Each invocation of model_fn was supposed to make the same'
                     ' optimizer calls.')
  TowerOptimizer._clear_graph_state()
  # pylint: enable=protected-access
  return tower_specs
Example #12
def _get_loss_towers(model_fn,
                     mode,
                     features,
                     labels,
                     params,
                     config,
                     devices,
                     local_ps_devices,
                     loss_reduction,
                     name_scope_pattern=_DEFAULT_NAME_SCOPE_PATTERN):
  """Replicate the loss computation across devices."""
  tower_specs = []

  model_fn_args = util.fn_args(model_fn)
  optional_params = {}
  if 'params' in model_fn_args:
    optional_params['params'] = copy.deepcopy(params)
  if 'config' in model_fn_args:
    optional_params['config'] = copy.deepcopy(config)

  # pylint: disable=protected-access
  round_robin_strategy = device_setter_lib._RoundRobinStrategy(
      num_tasks=len(local_ps_devices))
  TowerOptimizer._graph_state().set_reduction_across_towers(
      loss_reduction, len(devices))

  for i, device in enumerate(devices):
    is_the_first_tower = (i == 0)

    device_setter = _local_device_setter(
        worker_device=device,
        ps_devices=local_ps_devices,
        ps_strategy=round_robin_strategy)

    # We would like to preserve the names of the variables and ops that the user
    # might be relying on. Names without a prefix are going to resolve to
    # variables and ops of the first tower.
    name_scope = name_scope_pattern
    if is_the_first_tower:
      name_scope = ''

    with variable_scope.variable_scope(
        '', reuse=not is_the_first_tower) as var_scope:
      with ops_lib.name_scope(name_scope.format(i)) as name_scope:
        with TowerOptimizer._graph_state().tower(
            tower_id=i, var_scope=var_scope, name_scope=name_scope):
          with ops_lib.device(device_setter):
            labels_shard = None
            if labels:
              labels_shard = labels[i]

            tower_spec = model_fn(
                mode=mode,
                features=features[i],
                labels=labels_shard,
                **optional_params)

            if (tower_spec.train_op is not None and len(devices) > 1 and
                not TowerOptimizer.has_been_used()):
              raise ValueError('Please wrap optimizers with TowerOptimizer'
                               ' in order to use replicate_model_fn with'
                               ' multiple `devices`.')

            # Scaling the loss here doesn't actually affect gradients.  Another
            # instance of scaling happens inside the TowerOptimizer.
            tower_spec = _scale_tower_loss(
                tower_spec, loss_reduction, number_of_towers=len(devices))
            tower_specs.append(tower_spec)

  if not TowerOptimizer._did_towers_have_same_optimizer_calls():
    raise ValueError('Each invocation of model_fn was supposed to make the same'
                     ' optimizer calls.')
  TowerOptimizer._clear_graph_state()
  # pylint: enable=protected-access
  return tower_specs