def __init__(
    self,
    args,
    channel=None,
    ps_channels=None,
    max_minibatch_retry_num=DEFAULT_MAX_MINIBATCH_RETRY_NUM,
    max_allreduce_retry_num=DEFAULT_MAX_ALLREDUCE_RETRY_NUM,
    set_parallelism=False,
):
    """
    Arguments:
        channel: The channel for the gRPC master service.
        ps_channels: The gRPC channels for the PS service.
        max_minibatch_retry_num: The maximum number of retries for a
            minibatch whose results (e.g. gradients) are not accepted
            by the master.
        max_allreduce_retry_num: The maximum number of retries for the
            allreduce operation if an allreduce-based distributed
            training strategy is used.
    """
    self._args = args
    self.logger = get_logger("Worker", level=args.log_level.upper())

    if set_parallelism:
        # Explicitly setting the parallelism avoids multi-process hangs,
        # possibly due to an unknown bug in TensorFlow.
        # This must be called before TensorFlow is initialized.
        # set_parallelism is off by default to keep unit tests happy.
        num_threads = os.cpu_count()
        tf.config.threading.set_inter_op_parallelism_threads(num_threads)
        tf.config.threading.set_intra_op_parallelism_threads(num_threads)

    if channel is None:
        self._stub = None
    else:
        self._stub = elasticdl_pb2_grpc.MasterStub(channel)

    self._use_multi_ps = False
    self._ps_vars = {}
    # Default to zero PS instances so _ps_num is defined even when
    # ps_channels is missing or empty.
    self._ps_num = 0
    if isinstance(ps_channels, list) and len(ps_channels) > 0:
        self._use_multi_ps = True
        self._ps_stubs = [
            elasticdl_pb2_grpc.PserverStub(c) for c in ps_channels
        ]
        self._var_to_ps = {}
        self._ps_num = len(self._ps_stubs)
    self._distribution_strategy = args.distribution_strategy
    if (
        self._distribution_strategy == DistributionStrategy.PARAMETER_SERVER
        and self._use_multi_ps is False
    ):
        raise ValueError(
            "PS channels are not set up under the parameter server strategy"
        )

    self._max_minibatch_retry_num = max_minibatch_retry_num
    self._max_allreduce_retry_num = max_allreduce_retry_num
    self._init_from_args(args)
    self._timing = Timing(args.log_level.upper() == "DEBUG", self.logger)
    self._log_loss_count = 0
def __init__(
    self,
    parameters,
    grads_to_wait,
    optimizer,
    lr_staleness_modulation=False,
    sync_version_tolerance=0,
    use_async=False,
    evaluation_steps=0,
    master_channel=None,
    checkpoint_saver=None,
    ps_id=None,
    num_ps_pods=None,
):
    if master_channel is None:
        self._master_stub = None
    else:
        self._master_stub = elasticdl_pb2_grpc.MasterStub(master_channel)

    self._parameters = parameters
    self._grads_to_wait = grads_to_wait
    self._optimizer = optimizer
    self._lr_staleness_modulation = lr_staleness_modulation
    self._sync_version_tolerance = sync_version_tolerance
    self._use_async = use_async
    self._eval_steps = evaluation_steps
    self._checkpoint_saver = checkpoint_saver
    self._ps_id = ps_id
    self._num_ps_pods = num_ps_pods
    self._version_lock = threading.Lock()
    self._lock = threading.Lock()
    self._use_wrap_opt = False

    self._grads_n = 0
    self._grads_buffer = {}
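# A minimal construction sketch for the parameter-server object above.
# The class name PserverServicer, the Parameters container, the master
# address, and the optimizer choice are illustrative assumptions, not
# taken from this snippet.
import grpc
import tensorflow as tf

master_channel = grpc.insecure_channel("elasticdl-master:50001")
pserver = PserverServicer(
    parameters=Parameters(),  # assumed parameter container class
    grads_to_wait=2,  # gradient sets to accumulate per update (assumption)
    optimizer=tf.keras.optimizers.SGD(0.1),
    use_async=True,  # apply gradients as they arrive
    master_channel=master_channel,
    ps_id=0,
    num_ps_pods=2,
)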
def __init__(
    self,
    args,
    channel=None,
    ps_channels=None,
    max_minibatch_retry_num=DEFAULT_MAX_MINIBATCH_RETRY_NUM,
    max_allreduce_retry_num=DEFAULT_MAX_ALLREDUCE_RETRY_NUM,
):
    """
    Arguments:
        channel: The channel for the gRPC master service.
        ps_channels: The gRPC channels for the PS service.
        max_minibatch_retry_num: The maximum number of retries for a
            minibatch whose results (e.g. gradients) are not accepted
            by the master.
        max_allreduce_retry_num: The maximum number of retries for the
            allreduce operation if an allreduce-based distributed
            training strategy is used.
    """
    self._args = args
    if channel is None:
        self._stub = None
    else:
        self._stub = elasticdl_pb2_grpc.MasterStub(channel)

    self._use_multi_ps = False
    if isinstance(ps_channels, list) and len(ps_channels) > 0:
        self._use_multi_ps = True
        self._ps_stubs = [
            elasticdl_pb2_grpc.PserverStub(c) for c in ps_channels
        ]
        self._var_to_ps = {}
    self._max_minibatch_retry_num = max_minibatch_retry_num
    self._max_allreduce_retry_num = max_allreduce_retry_num
    self._init_from_args(args)
def __init__(
    self,
    worker_id,
    job_type,
    minibatch_size,
    model_zoo,
    dataset_fn="dataset_fn",
    loss="loss",
    optimizer="optimizer",
    eval_metrics_fn="eval_metrics_fn",
    channel=None,
    embedding_service_endpoint=None,
    model_def=None,
    model_params="",
    prediction_outputs_processor="PredictionOutputsProcessor",
    max_minibatch_retry_num=DEFAULT_MAX_MINIBATCH_RETRY_NUM,
    get_model_steps=1,
):
    """
    Arguments:
        model_zoo: The directory that contains user-defined model files
            or a specific model file.
        channel: The channel for the gRPC master service.
        max_minibatch_retry_num: The maximum number of retries for a
            minibatch whose results (e.g. gradients) are not accepted
            by the master.
    """
    self._worker_id = worker_id
    self._job_type = job_type
    self._minibatch_size = minibatch_size
    (
        self._model,
        self._dataset_fn,
        self._loss,
        self._opt_fn,
        self._eval_metrics_fn,
        self._prediction_outputs_processor,
    ) = get_model_spec(
        model_zoo=model_zoo,
        model_def=model_def,
        dataset_fn=dataset_fn,
        loss=loss,
        optimizer=optimizer,
        eval_metrics_fn=eval_metrics_fn,
        model_params=model_params,
        prediction_outputs_processor=prediction_outputs_processor,
    )
    self._init_embedding_layer()
    self._var_created = self._model.built
    if channel is None:
        self._stub = None
    else:
        self._stub = elasticdl_pb2_grpc.MasterStub(channel)
    self._embedding_service_endpoint = embedding_service_endpoint
    self._max_minibatch_retry_num = max_minibatch_retry_num
    self._model_version = -1
    self._task_data_service = TaskDataService(
        self, self._job_type == JobType.TRAINING_WITH_EVALUATION
    )
    self._get_model_steps = get_model_steps
def __init__(self, channel, worker_id):
    """Initialize a master client.

    Args:
        channel: grpc.Channel
            The gRPC channel object that connects to the master gRPC
            server.
        worker_id: int
            The unique and ordered worker ID assigned by the elasticdl
            command-line tool.
    """
    self._stub = elasticdl_pb2_grpc.MasterStub(channel)
    self._worker_id = worker_id
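# A minimal usage sketch for the client above, assuming the surrounding
# class is named MasterClient; the master address is an illustrative
# assumption.
import grpc

channel = grpc.insecure_channel("elasticdl-master:50001")
master_client = MasterClient(channel=channel, worker_id=0)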
def __init__(
    self,
    worker_id,
    job_type,
    minibatch_size,
    model_zoo,
    dataset_fn="dataset_fn",
    loss="loss",
    optimizer="optimizer",
    eval_metrics_fn="eval_metrics_fn",
    channel=None,
    embedding_service_endpoint=None,
    model_def=None,
    model_params="",
    prediction_outputs_processor="PredictionOutputsProcessor",
    max_minibatch_retry_num=DEFAULT_MAX_MINIBATCH_RETRY_NUM,
    get_model_steps=1,
):
    """
    Arguments:
        worker_id: The worker ID.
        job_type: The job type.
        minibatch_size: The size of the minibatch used for each
            iteration.
        model_zoo: The directory that contains user-defined model files
            or a specific model file.
        dataset_fn: The name of the dataset function defined in the
            model file.
        loss: The name of the loss function defined in the model file.
        optimizer: The name of the optimizer defined in the model file.
        eval_metrics_fn: The name of the evaluation metrics function
            defined in the model file.
        channel: The channel for the gRPC master service.
        embedding_service_endpoint: The endpoint to the embedding
            service.
        model_def: The import path to the model definition
            function/class in the model zoo, e.g.
            "cifar10_subclass.CustomModel".
        model_params: The dictionary of model parameters in a string
            that will be used to instantiate the model, e.g.
            "param1=1,param2=2".
        prediction_outputs_processor: The name of the prediction output
            processor class defined in the model file.
        get_model_steps: The worker performs `get_model` from the
            parameter server every this many steps.
        max_minibatch_retry_num: The maximum number of retries for a
            minibatch whose results (e.g. gradients) are not accepted
            by the master.
    """
    self._worker_id = worker_id
    self._job_type = job_type
    self._minibatch_size = minibatch_size
    (
        self._model,
        self._dataset_fn,
        self._loss,
        self._opt_fn,
        self._eval_metrics_fn,
        self._prediction_outputs_processor,
    ) = get_model_spec(
        model_zoo=model_zoo,
        model_def=model_def,
        dataset_fn=dataset_fn,
        loss=loss,
        optimizer=optimizer,
        eval_metrics_fn=eval_metrics_fn,
        model_params=model_params,
        prediction_outputs_processor=prediction_outputs_processor,
    )
    self._init_embedding_layer()
    self._var_created = self._model.built
    if channel is None:
        self._stub = None
    else:
        self._stub = elasticdl_pb2_grpc.MasterStub(channel)
    self._embedding_service_endpoint = embedding_service_endpoint
    self._max_minibatch_retry_num = max_minibatch_retry_num
    self._model_version = -1
    self._task_data_service = TaskDataService(
        self, self._job_type == JobType.TRAINING_WITH_EVALUATION
    )
    self._get_model_steps = get_model_steps
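# A minimal construction sketch for the worker above, assuming the
# surrounding class is named Worker; the master address, model zoo path,
# model_def import path, and JobType member used here are illustrative
# assumptions, not taken from this snippet.
import grpc

channel = grpc.insecure_channel("elasticdl-master:50001")
worker = Worker(
    worker_id=0,
    job_type=JobType.TRAINING_ONLY,  # assumed JobType member
    minibatch_size=64,
    model_zoo="model_zoo",  # directory containing the model file
    model_def="mnist_functional_api.custom_model",
    channel=channel,
)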