def default_strategy(in_cross_check=False):
    """Return the current ``tf.distribute`` strategy.

    Args:
        in_cross_check: When true, the strategy is returned only if the
            caller is in a cross-replica context; otherwise ``None`` is
            returned.

    Returns:
        The value of ``tensorflow.distribute.get_strategy()``, or ``None``
        when ``in_cross_check`` is true and the call is made outside a
        cross-replica context.
    """
    # Kept as a function-scope import, as in the original code.
    from tensorflow.distribute import get_strategy, in_cross_replica_context

    if in_cross_check:
        if in_cross_replica_context():
            return get_strategy()
        return None

    # NOTE(review): this branch used to call get_strategy() and discard the
    # result, so the function always returned None here. Confirm that callers
    # expect the strategy object.
    return get_strategy()
def apply_gradients(self, grads_and_vars, global_step=None, name=None):
    """Post-process the gradients and hand them to the wrapped optimizer.

    Pairs whose gradient is ``None`` are forwarded untouched. Every other
    gradient is, under ``ops.colocate_with`` of the incoming gradient:

    * divided by ``self._gradients_to_accumulate`` and passed through
      ``ipu_stateful_gradient_accumulate`` when more than one mini-batch is
      accumulated and pipelining is off;
    * cross-replica summed and passed through ``ipu_replication_normalise``
      when ``self._replicas > 1``;
    * divided by ``num_replicas_in_sync`` of the active distribution
      strategy when ``distribute.has_strategy()`` is true;
    * cast to the dtype of its variable.

    Args:
        grads_and_vars: Iterable of ``(gradient, variable)`` pairs.
        global_step: Forwarded to the wrapped optimizer.
        name: Forwarded to the wrapped optimizer.

    Returns:
        Whatever ``self._optimizer.apply_gradients`` returns.
    """
    processed = []
    for gradient, variable in grads_and_vars:
        if gradient is None:
            # Nothing to transform; keep the pair as it is.
            processed.append((gradient, variable))
            continue

        with ops.colocate_with(gradient):
            # gradient accumulation
            if self._gradients_to_accumulate > 1 and not self._pipelining:
                gradient = gen_poputil_ops.ipu_stateful_gradient_accumulate(
                    gradient / self._gradients_to_accumulate,
                    num_mini_batches=self._gradients_to_accumulate)

            # replication
            if self._replicas > 1:
                replica_total = cross_replica_ops.cross_replica_sum(gradient)
                gradient = gen_poputil_ops.ipu_replication_normalise(
                    replica_total)

            # distribution
            if distribute.has_strategy():
                gradient /= distribute.get_strategy().num_replicas_in_sync

            gradient = math_ops.cast(gradient, variable.dtype)
            processed.append((gradient, variable))

    if self._pipelining:
        # can do weight decay here as apply_gradients is only called on
        # the last accumulation step
        processed = self.add_WD(processed)

    result = self._optimizer.apply_gradients(processed, global_step, name)

    if self._sharded:
        sharding.propagate_sharding(ops.get_default_graph())

    return result
def CreateInstances(cls, *args, **kwargs):
    """Build one ``EmbeddingVariable`` per device of the current strategy.

    Without an active distribution strategy a single ``EmbeddingVariable``
    with ``local_replica_id=0`` is returned. Otherwise one variable is
    created on every device listed in ``strategy.extended._devices`` and the
    set is wrapped in a ``DistributedVariable``.

    Args:
        *args: Positional arguments forwarded to ``EmbeddingVariable``.
        **kwargs: Keyword arguments forwarded to ``EmbeddingVariable``. For
            every replica after the first, ``name`` is overwritten with a
            name derived from the first replica.

    Returns:
        An ``EmbeddingVariable`` or a ``DistributedVariable`` holding one
        ``EmbeddingVariable`` per device.
    """
    if not has_strategy():
        return EmbeddingVariable(*args, local_replica_id=0, **kwargs)

    current_strategy = get_strategy()
    device_list = current_strategy.extended._devices

    replicas = []
    for replica_id, device in enumerate(device_list):
        with ops.device(device):
            if replica_id > 0:
                # Later replicas are named after the first one, with the
                # ":<index>" suffix of its name removed.
                base_name = replicas[0].name.split(":")[0]
                kwargs["name"] = "%s/replica_%d/" % (base_name, replica_id)
            with context.device_policy(context.DEVICE_PLACEMENT_SILENT), \
                    tape.stop_recording():
                replica_var = EmbeddingVariable(
                    *args, local_replica_id=replica_id, **kwargs)
            replicas.append(replica_var)

    # TODO: check whether it will impact the performance due to the
    # aggregation or synchronization setting.
    return DistributedVariable(
        strategy=current_strategy,
        values=replicas,
        aggregation=VariableAggregation.ONLY_FIRST_REPLICA,
        var_policy=VariableSynchronization.NONE)
def apply_gradients(self, grads_and_vars, global_step=None, name=None):
    """Post-process the gradients and hand them to the wrapped optimizer.

    Pairs whose gradient is ``None`` are forwarded untouched by every step
    below. Every other gradient is, under ``ops.colocate_with`` of the
    incoming gradient:

    * passed through ``ipu_stateful_gradient_accumulate`` when
      ``self._gradient_accumulation_count > 1`` and pipelining is off;
    * cross-replica summed and passed through ``ipu_replication_normalise``
      when ``self._replicas > 1``;
    * divided by ``num_replicas_in_sync`` when the current strategy is an
      ``IPUMultiWorkerStrategy``;
    * cast to the dtype of its variable.

    Afterwards ``self.add_WD`` is applied when pipelining is on, and the
    gradients are divided by ``self._grad_scale`` when it differs from 1.0,
    except for variables whose name contains ``'batch_norm/moving_'``.

    Args:
        grads_and_vars: Iterable of ``(gradient, variable)`` pairs.
        global_step: Forwarded to the wrapped optimizer.
        name: Forwarded to the wrapped optimizer.

    Returns:
        Whatever ``self._optimizer.apply_gradients`` returns.
    """
    summed_grads_and_vars = []
    for (grad, var) in grads_and_vars:
        if grad is None:
            summed_grads_and_vars.append((grad, var))
        else:
            with ops.colocate_with(grad):
                # gradient accumulation
                if self._gradient_accumulation_count > 1 and not self._pipelining:
                    grad = gen_poputil_ops.ipu_stateful_gradient_accumulate(
                        grad,
                        num_mini_batches=self._gradient_accumulation_count)

                # replication
                if self._replicas > 1:
                    grad = gen_poputil_ops.ipu_replication_normalise(
                        cross_replica_ops.cross_replica_sum(grad))

                # distribution with IPUMultiWorkerStrategy needs additional
                # normalisation by the number of workers
                if isinstance(
                        distribute.get_strategy(),
                        ipu_multi_worker_strategy.IPUMultiWorkerStrategy):
                    grad /= distribute.get_strategy().num_replicas_in_sync

                grad = math_ops.cast(grad, var.dtype)
                summed_grads_and_vars.append((grad, var))

    if self._pipelining:
        # can do weight decay here as apply_gradients is only called on
        # the last accumulation step
        summed_grads_and_vars = self.add_WD(summed_grads_and_vars)

    if self._grad_scale != 1.0:
        # don't rescale batch norm moving average statistics as they are
        # not affected by loss scaling.
        # None gradients are passed through unchanged: dividing None by the
        # scale would raise TypeError.
        summed_grads_and_vars = [
            (grad, var)
            if grad is None or 'batch_norm/moving_' in var.name
            else (grad / self._grad_scale, var)
            for grad, var in summed_grads_and_vars
        ]

    ret = self._optimizer.apply_gradients(summed_grads_and_vars, global_step,
                                          name)

    if self._sharded:
        sharding.propagate_sharding(ops.get_default_graph())

    return ret
def Init(**kwargs):
    """
    Abbreviated as ``sok.Init(**kwargs)``.

    This function is used to do the initialization of SparseOperationKit (SOK).

    SOK will leverage all available GPUs for current CPU process. Please set
    `CUDA_VISIBLE_DEVICES` or `tf.config.set_visible_devices` to specify which
    GPU(s) are used in this process before launching tensorflow runtime
    and calling this function.

    In **TensorFlow 2.x**, SOK can be used with **tf.distribute.Strategy**
    or **Horovod**. When it's used with tf.distribute.Strategy, it must be
    called under `strategy.scope()`. For example,

    .. code-block:: python

        with strategy.scope():
            sok.Init(**kwargs)

    When it's used with Horovod, it must be called at each process. For example,

    .. code-block:: python

        import horovod.tensorflow as hvd

        hvd.init()

        sok.Init(**kwargs)

    In **TensorFlow 1.15**, SOK can only work with **Horovod**. The returned
    status must be evaluated with `sess.run`, and it must be the first step
    before evaluating any other SOK APIs.

    .. code-block:: python

        sok_init = sok.Init(global_batch_size=args.global_batch_size)
        with tf.Session() as sess:
            sess.run(sok_init)
            ...

    Parameters
    ----------
    kwargs: dictionary
            keyword arguments for this function.
            Currently, it must contain `global_batch_size` used in all GPUs.
            An optional `seed` is used as the random seed; when it is absent
            (or falsy) a seed is generated with `kit_lib.gen_random_seed()`.

    Returns
    -------
    status: string
            a string will be returned if this function executed successfully.
            And its contents will be 'OK'.
    """

    def _get_visible_devices():
        """Return the indices of the visible GPUs as an int32 constant."""
        gpus = config.get_visible_devices('GPU')
        assert (len(gpus) > 0)
        # The index is the text after the last ':' of each device name.
        visible_devices = [int(gpu.name.split(':')[-1]) for gpu in gpus]
        return array_ops.constant(visible_devices, dtype=int32)

    @function
    def _single_worker_init(**kwargs):
        """Initialize SOK from inside a MirroredStrategy replica."""
        replica_ctx = get_replica_context()
        replica_ctx.merge_call(lambda strategy: tf_print(
            "You are using the plugin with MirroredStrategy."))
        nccl_unique_id = replica_ctx.merge_call(
            lambda strategy: kit_lib.get_nccl_unique_id())
        global_random_seed = kwargs.get(
            "seed", None) or replica_ctx.merge_call(
                lambda strategy: kit_lib.gen_random_seed())

        global_id = replica_ctx.replica_id_in_sync_group
        visible_devices = _get_visible_devices()
        status = kit_lib.plugin_init(
            global_id,
            replica_ctx.num_replicas_in_sync,
            nccl_unique_id,
            global_random_seed,
            visible_devices,
            global_batch_size=kwargs['global_batch_size'])
        return status

    def _multi_worker_init(**kwargs):
        """Initialize SOK from inside a MultiWorkerMirroredStrategy replica.

        Replica 0 sends the nccl unique id and the random seed to the other
        replicas with collective broadcast ops.
        """
        replica_ctx = get_replica_context()
        global_id = replica_ctx.replica_id_in_sync_group
        if global_id == 0:
            unique_id = kit_lib.get_nccl_unique_id()
            re = collective_ops.broadcast_send(
                unique_id,
                TensorShape([
                    32,
                ]),
                int32,
                group_size=replica_ctx.num_replicas_in_sync,
                group_key=1,
                instance_key=2)
        else:
            re = collective_ops.broadcast_recv(
                TensorShape([
                    32,
                ]),
                int32,
                group_size=replica_ctx.num_replicas_in_sync,
                group_key=1,
                instance_key=2)
        if global_id == 0:
            global_seed = kwargs.get("seed", None) or kit_lib.gen_random_seed()
            re_seed = collective_ops.broadcast_send(
                global_seed,
                TensorShape([
                    1,
                ]),
                int64,
                group_size=replica_ctx.num_replicas_in_sync,
                group_key=1,
                instance_key=3)
        else:
            global_seed = kwargs.get("seed", None)
            re_seed = collective_ops.broadcast_recv(
                TensorShape([
                    1,
                ]),
                int64,
                group_size=replica_ctx.num_replicas_in_sync,
                group_key=1,
                instance_key=3)

            if (global_seed and global_seed != re_seed):
                logging.warning(
                    "The seed: {} is not consistent with that from cheif-node: {}, "
                    "and the seed from cheif-node will be used.".format(
                        global_seed, re_seed))
        visible_devices = _get_visible_devices()
        status = kit_lib.plugin_init(
            global_id,
            replica_ctx.num_replicas_in_sync,
            re,
            re_seed,
            visible_devices,
            global_batch_size=kwargs['global_batch_size'])
        return status

    # @function
    def _horovod_init(**kwargs):
        r"""
        This function uses horovod to broadcast nccl-id and random-seed which is used by sparse_operation_kit.
        Please note that the nccl-comm mentioned here is not the same one as the nccl-comm of horovod itself.
        After broadcasting, this function uses kit_lib.plugin_init and "nccl-id", "random-seed" to initialize sparse_operation_kit.
        """
        local_rank = hvd.local_rank()
        unique_id = kit_lib.get_nccl_unique_id(
        ) if local_rank == 0 else array_ops.zeros([
            32,
        ], dtype=int32)
        unique_id = hvd.broadcast(unique_id,
                                  root_rank=0,
                                  name="nccl_unique_id")
        seed = kwargs.get("seed", None)
        if 0 == local_rank:
            global_seed = seed or kit_lib.gen_random_seed()
        else:
            # Placeholder that is overwritten by the broadcast below.
            global_seed = array_ops.zeros([
                1,
            ], dtype=int64)
        re_seed = hvd.broadcast(global_seed, root_rank=0, name="random_seed")
        if (seed and seed != re_seed):
            # Report the user-supplied seed. `global_seed` is only a zero
            # placeholder on ranks other than local rank 0.
            logging.warning(
                "The seed: {} is not consistent with that from cheif-node: {}, "
                "and the seed from cheif-node will be used.".format(
                    seed, re_seed))
        visible_devices = _get_visible_devices()
        status = kit_lib.plugin_init(
            local_rank,
            hvd.size(),
            unique_id,
            re_seed,
            visible_devices,
            global_batch_size=kwargs["global_batch_size"])
        return status

    def _one_device_init(**kwargs):
        """
        This function is used to initialize only one GPU for SOK.
        """
        local_rank = 0
        unique_id = kit_lib.get_nccl_unique_id()
        global_seed = kwargs.get("seed", None) or kit_lib.gen_random_seed()
        visible_devices = _get_visible_devices()
        status = kit_lib.plugin_init(
            local_rank,
            1,
            unique_id,
            global_seed,
            visible_devices,
            global_batch_size=kwargs["global_batch_size"])
        return status

    if has_strategy():
        strategy = get_strategy()

        @function
        def _init_wrapper(run_fn, init_fn, **kwargs):
            return run_fn(init_fn, kwargs=kwargs)

        if isinstance(strategy, MirroredStrategy):
            _init_fn = _single_worker_init
        elif isinstance(strategy, MultiWorkerMirroredStrategy):
            _init_fn = _multi_worker_init
        else:
            raise RuntimeError("This strategy type is not supported yet.")

        if not kit_lib.in_tensorflow2():
            _init_results = _init_wrapper(strategy.experimental_run_v2,
                                          _init_fn, **kwargs)
            # Unwrap the per-replica container when the result has one.
            if hasattr(_init_results, "values"):
                _init_results = _init_results.values
            return _init_results
        else:
            return _init_wrapper(strategy.run, _init_fn, **kwargs)

    elif "horovod.tensorflow" in sys.modules:
        # imported horovod; `hvd` is read by _horovod_init through the
        # enclosing scope.
        import horovod.tensorflow as hvd

        if not kit_lib.in_tensorflow2():

            @function
            def _init_wrapper(**kwargs):
                return _horovod_init(**kwargs)

            return _init_wrapper(**kwargs)
        else:
            return _horovod_init(**kwargs)
    else:
        # horovod not imported
        return _one_device_init(**kwargs)
def __init__(self,
             shape,
             local_replica_id,
             initializer=None,
             trainable=True,
             use_hashtable=True,
             name="EmbeddingVariable",
             dtype=None,
             key_dtype=None,
             *args,
             **kwargs):
    """Create the embedding variable handles and its initializer op.

    Only graph mode is supported; a ``RuntimeError`` is raised in eager mode.

    Args:
        shape: A list of two entries,
            ``[vocabulary_size_per_gpu, embedding_vector_size]``.
        local_replica_id: Stored and forwarded to
            ``kit_lib.assign_embedding_variable``.
        initializer: An ``InPlaceInitializer`` or anything accepted by
            ``tf_initializers.get``. Defaults to
            ``InPlaceInitializer(name="random_uniform")``.
        trainable: Whether the variable is also added to the
            ``TRAINABLE_VARIABLES`` collection.
        use_hashtable: Forwarded to ``kit_lib.assign_embedding_variable``.
        name: Name scope used to derive the variable name.
        dtype: Value dtype; defaults to ``dtypes.float32``.
        key_dtype: Key dtype; defaults to ``dtypes.int64``.
        *args: Forwarded to the base class constructor.
        **kwargs: Forwarded to the base class constructor.

    Raises:
        ValueError: If ``shape`` is not a list of length 2, or if a tensor
            initial value is incompatible with ``shape``.
        RuntimeError: If called while executing eagerly.
    """
    if (not isinstance(shape, list)) or (len(shape) != 2):
        raise ValueError("shape_per_gpu must be a list which represents: "
                         "[vocabulary_size_per_gpu, embedding_vector_size].")
    self.m_shape_per_gpu = TensorShape(shape)
    self.m_local_replica_id = local_replica_id
    self.m_initializer = initializer or InPlaceInitializer(
        name="random_uniform")
    if not isinstance(self.m_initializer, InPlaceInitializer):
        self.m_initializer = tf_initializers.get(self.m_initializer)
    self.m_trainable = trainable
    self.m_use_hashtable = use_hashtable
    self.m_embedding_layer = None
    self.m_dtype = dtype or dtypes.float32
    self.m_key_dtype = key_dtype or dtypes.int64

    # produce initial_value
    if isinstance(self.m_initializer, InPlaceInitializer):
        # TODO: serialize it
        self.m_initial_value = self.m_initializer.name
    else:
        self.m_initial_value = self.m_initializer(
            shape=self.m_shape_per_gpu, dtype=self.m_dtype)

    collections = [ops.GraphKeys.GLOBAL_VARIABLES]
    if trainable and ops.GraphKeys.TRAINABLE_VARIABLES not in collections:
        collections.append(ops.GraphKeys.TRAINABLE_VARIABLES)

    with ops.init_scope():
        self._in_graph_mode = not context.executing_eagerly()

        with ops.name_scope(name) as var_name_scope:
            # Drop every trailing "/" of the scope name. rstrip also copes
            # with an empty scope name, where indexing [-1] would raise.
            var_name = var_name_scope.rstrip("/")
            self.m_var_name = var_name
            self.m_unique_id = "%s_%d" % (var_name, ops.uid())

            # attr = resource_variable_ops.attr_value_pb2.AttrValue(
            #     list=resource_variable_ops.attr_value_pb2.AttrValue.ListValue(
            #         s=[resource_variable_ops.compat.as_bytes("loc:@%s" % self.m_var_name)]))
            # with ops.get_default_graph()._attr_scope({"_class": attr}):
            with ops.NullContextmanager():
                # m_handle is the handle to EmbeddingVariable, tf_handle is
                # the handle to TF Var.
                self.m_handle, self.tf_handle = kit_lib.create_var(
                    var_name=var_name,
                    dtype=self.m_dtype,
                    shape=self.m_shape_per_gpu)

                if self._in_graph_mode:
                    with ops.name_scope("IsInitialized"):
                        # TODO: should not hard-writing???
                        self._is_initialized_op = ops.convert_to_tensor(True)

                        if (isinstance(self.m_initial_value, ops.Tensor)
                                and not self.m_initial_value.shape.
                                is_compatible_with(self.m_shape_per_gpu)):
                            # Uses self.m_initial_value: the bare name
                            # `initial_value` does not exist in this scope.
                            raise ValueError(
                                "The initial value's shape (%s) is not compatible with "
                                "the explicitly supplied `shape` argument (%s)."
                                % (self.m_initial_value.shape,
                                   self.m_shape_per_gpu))

                        _init_op = kit_lib.assign_embedding_variable(
                            emb_var_handle=self.m_handle,
                            tf_var_handle=self.tf_handle,
                            var_name=var_name,
                            initial_value=self.m_initial_value,
                            local_replica_id=self.m_local_replica_id,
                            trainable=self.m_trainable,
                            shape=self.m_shape_per_gpu,
                            use_hashtable=self.m_use_hashtable,
                            dtype=self.m_dtype,
                            key_dtype=self.m_key_dtype)
                        self._initializer_op = control_flow_ops.group(
                            (_init_op))
                else:
                    raise RuntimeError(
                        "Currently, EmbeddingVariable does not support Eager mode."
                    )

            if not context.executing_eagerly():
                ops.add_to_collections(collections, self)

        super(EmbeddingVariable, self).__init__(
            trainable=self.m_trainable,
            shape=self.m_shape_per_gpu,
            dtype=self.m_dtype,
            handle=self.m_handle,
            handle_name=var_name,
            distribute_strategy=get_strategy() if has_strategy() else None,
            synchronization=VariableSynchronization.NONE,
            aggregation=VariableAggregation.ONLY_FIRST_REPLICA,
            unique_id=self.m_unique_id,
            initializer_op=self._initializer_op,
            is_initialized_op=self._is_initialized_op,
            *args,
            **kwargs)

        # Attach shape and dtype information to both resource handles.
        handle_data = resource_variable_ops.cpp_shape_inference_pb2.CppShapeInferenceResult.HandleData(
        )
        handle_data.is_set = True
        handle_data.shape_and_type.append(
            resource_variable_ops.cpp_shape_inference_pb2.
            CppShapeInferenceResult.HandleShapeAndType(
                shape=self.shape.as_proto(),
                dtype=self.dtype.as_datatype_enum))
        resource_variable_ops._set_handle_shapes_and_types(
            self.m_handle,
            handle_data,
            graph_mode=not context.executing_eagerly())
        resource_variable_ops._set_handle_shapes_and_types(
            self.tf_handle,
            handle_data,
            graph_mode=not context.executing_eagerly())
def __init__(self,
             shape,
             local_replica_id,
             initializer=None,
             trainable=True,
             use_hashtable=True,
             name="EmbeddingVariable",
             dtype=None,
             key_dtype=None,
             *args,
             **kwargs):
    """Create the embedding variable handles and its initializer op.

    Args:
        shape: A list of two entries,
            ``[vocabulary_size_per_gpu, embedding_vector_size]``.
        local_replica_id: Stored and forwarded to
            ``kit_lib.assign_embedding_variable``.
        initializer: An ``InPlaceInitializer`` or anything accepted by
            ``tf_initializers.get``. Defaults to
            ``InPlaceInitializer(name="random_uniform")``.
        trainable: Forwarded to the init op and the base class.
        use_hashtable: Forwarded to ``kit_lib.assign_embedding_variable``.
        name: Name scope; also passed to ``self._gen_unique_name``.
        dtype: Value dtype; defaults to ``dtypes.float32``.
        key_dtype: Key dtype; defaults to ``dtypes.int64``.
        *args: Forwarded to the base class constructor.
        **kwargs: Forwarded to the base class constructor.

    Raises:
        ValueError: If ``shape`` is not a list of length 2, or if a tensor
            initial value is incompatible with ``shape``.
    """
    shape_is_valid = isinstance(shape, list) and len(shape) == 2
    if not shape_is_valid:
        raise ValueError("shape_per_gpu must be a list which represents: "
                         "[vocabulary_size_per_gpu, embedding_vector_size].")

    self.m_shape_per_gpu = TensorShape(shape)
    self.m_local_replica_id = local_replica_id

    chosen_initializer = initializer or InPlaceInitializer(
        name="random_uniform")
    if not isinstance(chosen_initializer, InPlaceInitializer):
        chosen_initializer = tf_initializers.get(chosen_initializer)
    self.m_initializer = chosen_initializer

    self.m_trainable = trainable
    self.m_use_hashtable = use_hashtable
    self.m_embedding_layer = None
    self.m_dtype = dtype or dtypes.float32
    self.m_key_dtype = key_dtype or dtypes.int64

    # produce initial_value
    if isinstance(self.m_initializer, InPlaceInitializer):
        # TODO: serialize it
        self.m_initial_value = self.m_initializer.name
    else:
        self.m_initial_value = self.m_initializer(
            shape=self.m_shape_per_gpu, dtype=self.m_dtype)

    with ops.init_scope():
        with ops.name_scope(name):
            self.m_var_name = self._gen_unique_name(name)
            self.m_unique_id = "%s_%d" % (self.m_var_name, ops.uid())

            # m_handle is the handle to EmbeddingVariable, tf_handle is the
            # handle to TF Var.
            handles = kit_lib.create_var(var_name=self.m_var_name,
                                         dtype=self.m_dtype,
                                         shape=self.m_shape_per_gpu)
            self.m_handle, self.tf_handle = handles

            with ops.name_scope("IsInitialized"):
                self._is_initialized_op = ops.convert_to_tensor(True)

                value = self.m_initial_value
                if isinstance(value, ops.Tensor):
                    if not value.shape.is_compatible_with(
                            self.m_shape_per_gpu):
                        raise ValueError(
                            "The initial value's shape (%s) is not compatible with "
                            "the explicitly supplied `shape` argument (%s)."
                            % (self.m_initial_value.shape,
                               self.m_shape_per_gpu))

                assign_op = kit_lib.assign_embedding_variable(
                    emb_var_handle=self.m_handle,
                    tf_var_handle=self.tf_handle,
                    var_name=self.m_var_name,
                    initial_value=self.m_initial_value,
                    local_replica_id=self.m_local_replica_id,
                    trainable=self.m_trainable,
                    shape=self.m_shape_per_gpu,
                    use_hashtable=self.m_use_hashtable,
                    dtype=self.m_dtype,
                    key_dtype=self.m_key_dtype)
                self._initializer_op = control_flow_ops.group(assign_op)

        active_strategy = get_strategy() if has_strategy() else None
        super(EmbeddingVariable, self).__init__(
            trainable=self.m_trainable,
            shape=self.m_shape_per_gpu,
            dtype=self.m_dtype,
            handle=self.m_handle,
            handle_name=self.m_var_name,
            distribute_strategy=active_strategy,
            synchronization=VariableSynchronization.NONE,
            aggregation=VariableAggregation.ONLY_FIRST_REPLICA,
            unique_id=self.m_unique_id,
            *args,
            **kwargs)

        # Attach shape and dtype information to both resource handles.
        inference_pb2 = resource_variable_ops.cpp_shape_inference_pb2
        handle_data = inference_pb2.CppShapeInferenceResult.HandleData()
        handle_data.is_set = True
        shape_and_type = inference_pb2.CppShapeInferenceResult.HandleShapeAndType(
            shape=self.shape.as_proto(),
            dtype=self.dtype.as_datatype_enum)
        handle_data.shape_and_type.append(shape_and_type)

        for handle in (self.m_handle, self.tf_handle):
            resource_variable_ops._set_handle_shapes_and_types(
                handle,
                handle_data,
                graph_mode=False if context.executing_eagerly() else True)