Example #1
    def __init__(self, args):
        self.logger = get_logger("PS", level=args.log_level.upper())
        self.grads_to_wait = args.grads_to_wait
        self.lr_staleness_modulation = args.lr_staleness_modulation
        self.sync_version_tolerance = args.sync_version_tolerance
        self.use_async = args.use_async
        self.port = args.port
        model_module = load_module(
            get_module_file_path(args.model_zoo, args.model_def)).__dict__
        self.optimizer = model_module[args.optimizer]()
        self._set_lr_scheduler(model_module, args.learning_rate_scheduler)
        self.ps_id = args.ps_id
        self.num_ps_pods = args.num_ps_pods
        self.num_workers = args.num_workers
        # Create Parameters instance
        self.parameters = Parameters()
        if args.master_addr is None:
            raise ValueError("master_addr is missing for parameter servers")
        self.master_channel = build_channel(args.master_addr)
        self.evaluation_steps = args.evaluation_steps

        self.master_name = get_master_pod_name(args.job_name)
        self.namespace = args.namespace
        self._init_checkpoint_saver(args)
        self._restore_params_from_checkpoint(args.checkpoint_dir_for_init)
        self._debug_info_needed = args.log_level.upper() == "DEBUG"
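
A recurring pattern across these examples: resolve the user's model file inside the model zoo, import it, and look up user-defined components (model, optimizer, loss, metrics) by name in the module's `__dict__`. Below is a minimal sketch of the two helpers, assuming `load_module` is a thin wrapper around `importlib` and `model_def` is a dotted path whose last component is the class or function name (the real helpers live in `elasticdl.python.common.model_utils`, as Example #21 shows):

import importlib.util
import os


def get_module_file_path_sketch(model_zoo, model_def):
    # "cifar10_subclass.CustomModel" -> "<model_zoo>/cifar10_subclass.py"
    return os.path.join(model_zoo, *model_def.split(".")[:-1]) + ".py"


def load_module_sketch(module_file):
    # Import a Python source file so that user definitions can be looked
    # up by name in its __dict__, e.g. module.__dict__["optimizer"]().
    spec = importlib.util.spec_from_file_location("user_module", module_file)
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return module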
Example #2
    def __init__(self, args, task_manager, rendezvous_server=None):
        self.logger = get_logger("master", level=args.log_level.upper())

        self.num_ps_pods = args.num_ps_pods
        self.checkpoint_output_path = args.checkpoint_dir

        # Master addr
        master_ip = os.getenv("MY_POD_IP", "localhost")
        self.master_addr = "%s:%d" % (master_ip, args.port)
        self.job_type = get_job_type(args)

        # Initialize the components from the model definition
        model_module = load_module(
            get_module_file_path(args.model_zoo, args.model_def)).__dict__

        self._optimizer = model_module[args.optimizer]()

        # TODO: Remove task manager and rendezvous server after
        # refactoring pod manager.
        self.task_manager = task_manager
        self.rendezvous_server = rendezvous_server

        self.evaluation_service = (
            None if args.eval_metrics_fn not in model_module
            else self._create_evaluation_service(
                model_module[args.eval_metrics_fn], args.evaluation_steps))
Example #3
    def test_odps_data_reader_integration_with_local_keras(self):
        num_records = 2
        model_spec = load_module(
            os.path.join(
                os.path.dirname(os.path.realpath(__file__)),
                "../../../model_zoo",
                "odps_iris_dnn_model/odps_iris_dnn_model.py",
            )).__dict__
        model = model_spec["custom_model"]()
        optimizer = model_spec["optimizer"]()
        loss = model_spec["loss"]
        dataset_fn = model_spec["dataset_fn"]

        def _gen():
            for data in self.reader.read_records(
                    _MockedTask(0, num_records, self.test_table + ":shard_0")):
                if data is not None:
                    yield data

        dataset = tf.data.Dataset.from_generator(_gen, tf.float32)
        dataset = dataset_fn(dataset, None)

        loss_history = []
        grads = None
        for features, labels in dataset:
            with tf.GradientTape() as tape:
                logits = model(features, training=True)
                loss_value = loss(logits, labels)
            loss_history.append(loss_value.numpy())
            grads = tape.gradient(loss_value, model.trainable_variables)
            optimizer.apply_gradients(zip(grads, model.trainable_variables))

        self.assertEqual(len(loss_history), num_records)
        self.assertEqual(len(grads), num_records)
        self.assertEqual(len(model.trainable_variables), num_records)
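
The generator above feeds the reader a stand-in task object. A minimal sketch of what `_MockedTask` would need to provide, assuming the reader only touches the shard name and the start/end offsets (the fourth argument seen in Example #10 would be the task type):

class _MockedTask(object):
    def __init__(self, start, end, shard_name, type=None):
        # Mimics the attributes of a real task proto used by the reader.
        self.start = start
        self.end = end
        self.shard_name = shard_name
        self.type = type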
Example #4
    def _test_correctness(self, optimizer_class, X, Y, seed, **kwargs):
        """Test the correctness of specific TensorFlow optimizer."""
        _model_file = get_module_file_path(
            os.path.dirname(os.path.realpath(__file__)),
            "embedding_test_module.KerasEmbeddingModel",
        )
        model_module = load_module(_model_file).__dict__

        # train model with TensorFlow optimizer
        weights = self._random_init_model_weight([(4, 4), (4, 4), (72, 1),
                                                  (1, )], seed)
        loss_fn = model_module["loss"]
        model1 = model_module["KerasEmbeddingModel"](4, 4, weights)
        opt1 = optimizer_class(**kwargs)
        _train(model1, opt1, X, Y, loss_fn, random_seed=seed)

        model2 = model_module["EdlEmbeddingModel"](4, weights[2:])
        opt2 = optimizer_class(**kwargs)

        layer_names = [layer.name for layer in find_layer(model2, Embedding)]
        embed_dims = dict([(layer_name, 4) for layer_name in layer_names])

        # initialize embedding vectors in kv store
        mock_kv_store = MockKvStore({})
        for layer, embed_table in zip(layer_names, weights[:2]):
            for i, embed_vector in enumerate(embed_table):
                mock_kv_store.update(["%s-%d" % (layer, i)], [embed_vector])

        # train model with optimizer wrapper
        with mock.patch.object(EmbeddingService, "lookup_embedding",
                               mock_kv_store.lookup), mock.patch.object(
                                   EmbeddingService, "update_embedding",
                                   mock_kv_store.update):
            _train_edl_embedding_with_optimizer_wrapper(model2,
                                                        opt2,
                                                        X,
                                                        Y,
                                                        loss_fn,
                                                        embed_dims,
                                                        random_seed=seed)

        # compare trained parameters
        wrong_msg = (
            "The updated parameters of Optimizer Wrapper and TensorFlow "
            "optimizer %s differ." % opt1.get_config()["name"])

        for layer1, layer2 in zip(model1.layers, model2.layers):
            if "embedding" in layer2.name:
                w1 = layer1.weights[0].numpy()
                keys = [Embedding.get_key([layer2.name, i]) for i in range(4)]
                w2 = np.concatenate(mock_kv_store.lookup(keys)[0]).reshape(
                    4, -1)
                self.assertTrue(np.isclose(w1, w2).all(), msg=wrong_msg)
            else:
                for w1, w2 in zip(layer1.weights, layer2.weights):
                    self.assertTrue(np.isclose(w1.numpy(), w2.numpy()).all(),
                                    msg=wrong_msg)
Example #5
    def _test_correctness(self, optimizer_class, X, Y, seed, **opt_kwargs):
        """Test the correctness of specific TensorFlow optimizer."""
        _model_file = get_module_file_path(
            os.path.dirname(os.path.realpath(__file__)),
            "embedding_test_module.KerasEmbeddingModel",
        )
        model_module = load_module(_model_file).__dict__

        # train model with TensorFlow optimizer
        dim = 4
        weights = self._random_init_model_weight([(4, dim), (4, dim), (72, 1),
                                                  (1, )], seed)
        loss_fn = model_module["loss"]
        model1 = model_module["KerasEmbeddingModel"](4, dim, weights)
        opt1 = optimizer_class(**opt_kwargs)
        _train(model1, opt1, X, Y, loss_fn, random_seed=seed)

        model2 = model_module["EdlEmbeddingModel"](dim, weights[2:])
        opt2 = optimizer_class(**opt_kwargs)

        embedding_weight_names = [
            layer.embedding_weight_name
            for layer in find_layer(model2, Embedding)
        ]

        # create Parameters object and initialize embedding vectors
        params = Parameters()
        for weight_name, embed_value in zip(embedding_weight_names,
                                            weights[:2]):
            embed_table = EmbeddingTable(weight_name, dim)
            embed_table.set(range(len(embed_value)), embed_value)
            params.embedding_params[weight_name] = embed_table

        _train_edl_embedding_with_optimizer_wrapper(model2,
                                                    opt2,
                                                    X,
                                                    Y,
                                                    loss_fn,
                                                    params,
                                                    random_seed=seed)

        # compare trained parameters
        wrong_msg = (
            "The updated parameters of Optimizer Wrapper and TensorFlow "
            "optimizer %s differ." % opt1.get_config()["name"])

        for layer1, layer2 in zip(model1.layers, model2.layers):
            if "embedding" in layer2.name:
                w1 = layer1.weights[0].numpy()
                w2 = params.get_embedding_param(layer2.embedding_weight_name,
                                                range(4))
                self.assertTrue(np.isclose(w1, w2).all(), msg=wrong_msg)
            else:
                for w1, w2 in zip(layer1.weights, layer2.weights):
                    self.assertTrue(np.isclose(w1.numpy(), w2.numpy()).all(),
                                    msg=wrong_msg)
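
For reference, the `Parameters`/`EmbeddingTable` calls used above behave as an id-to-vector map keyed by table name; a standalone usage sketch under that assumption (the table name is illustrative):

import numpy as np

dim = 4
params = Parameters()
table = EmbeddingTable("embedding_layer_0", dim)  # illustrative table name
table.set(range(2), np.random.rand(2, dim).astype(np.float32))
params.embedding_params["embedding_layer_0"] = table
# Returns the two vectors just set, as an array of shape (2, dim).
vectors = params.get_embedding_param("embedding_layer_0", range(2))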
Example #6
    def _load_data_reader_fn(self, args):
        self._create_data_reader_fn = create_data_reader

        if args.model_zoo:
            # Initialize the components from the model definition
            model_module = load_module(
                get_module_file_path(args.model_zoo, args.model_def)).__dict__
            if args.custom_data_reader in model_module:
                self._create_data_reader_fn = model_module[
                    args.custom_data_reader]
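
Because the lookup is by name in the model module, a model file can ship its own reader factory. A hedged sketch of what such a definition might look like in the model zoo (`MyCsvReader` is hypothetical; the signature mirrors how `create_data_reader` is called in Example #10):

# In the user's model-zoo file:
def custom_data_reader(data_origin, records_per_task=None, **kwargs):
    # Return any object implementing the data reader interface,
    # e.g. a reader for a bespoke storage backend.
    return MyCsvReader(data_origin, records_per_task, **kwargs)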
Example #7
def main():
    parser = argparse.ArgumentParser(
        description="Spark job to convert training data to RecordIO format")
    parser.add_argument(
        "--training_data_tar_file",
        help="Tar file that contains all training data",
        required=True,
    )
    parser.add_argument("--output_dir",
                        help="Directory of output RecordIO data",
                        required=True)
    parser.add_argument(
        "--model_file",
        required=True,
        help="User-defined model file that contains the data processing logic",
    )
    parser.add_argument("--records_per_file",
                        default=1024,
                        type=int,
                        help="Number of records per file")
    parser.add_argument(
        "--num_workers",
        default=2,
        type=int,
        help="Number of workers of Spark job",
    )

    args = parser.parse_args()

    # Get training data file names from training_data_tar_file
    tar = tarfile.open(args.training_data_tar_file)
    tar_info_list = tar.getmembers()
    filename_list = []
    for tar_info in tar_info_list:
        f = tar.extractfile(tar_info)
        if f is not None and not tar_info.name.split("/")[-1].startswith("."):
            filename_list.append(tar_info.name)

    # Load user-defined model
    model_module = load_module(args.model_file)

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    # Start the Spark job
    sc = SparkContext()
    rdd = sc.parallelize(filename_list, args.num_workers)
    rdd.mapPartitions(
        process_data(
            model_module.prepare_data_for_a_single_file,
            args.training_data_tar_file,
            args.output_dir,
            args.records_per_file,
        )).collect()
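
Note that `rdd.mapPartitions(process_data(...))` implies `process_data` is a higher-order function: it captures the conversion parameters and returns the closure Spark runs over each partition of file names. A sketch of that shape only (the body and parameter names are assumptions, not the actual implementation):

def process_data(single_file_fn, tar_file, output_dir, records_per_file):
    def _process_partition(filename_iter):
        for filename in filename_iter:
            # User-defined conversion of one training-data file into
            # RecordIO shards under output_dir.
            single_file_fn(filename, tar_file, output_dir, records_per_file)
        return []

    return _process_partition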
Example #8
    def __init__(self,
                 *,
                 image_name,
                 namespace,
                 job_name,
                 event_callback=None,
                 cluster_spec="",
                 force_use_kube_config_file=False):
        """
        ElasticDL k8s client.

        Args:
            image_name: Docker image path for ElasticDL pod.
            namespace: The name of the Kubernetes namespace where ElasticDL
                pods will be created.
            job_name: ElasticDL job name, should be unique in the namespace.
                Used as pod name prefix and value for "elasticdl" label.
            event_callback: If not None, an event watcher will be created and
                events passed to the callback.
            cluster_spec: The path to a Python module file that defines a
                `cluster` attribute (loaded via `load_module`).
            force_use_kube_config_file: If True, force loading the cluster
                config from ~/.kube/config. Otherwise, load the in-cluster
                config when running inside a Kubernetes cluster, and fall
                back to the kube config file when not.
        """
        try:
            if (os.getenv("KUBERNETES_SERVICE_HOST")
                    and not force_use_kube_config_file):
                # We are running inside a k8s cluster
                config.load_incluster_config()
                logger.info("Load the incluster config.")
            else:
                # Use user's kube config
                config.load_kube_config()
                logger.info("Load the kube config file.")
        except Exception as ex:
            traceback.print_exc()
            raise Exception(
                "Failed to load configuration for Kubernetes:\n%s" % str(ex))

        self.client = client.CoreV1Api()
        self.namespace = namespace
        self.job_name = job_name
        self._image_name = image_name
        self._event_cb = event_callback
        if self._event_cb:
            threading.Thread(target=self._watch,
                             name="event_watcher",
                             daemon=True).start()
        self.cluster = None
        if cluster_spec:
            cluster_spec_module = load_module(cluster_spec)
            self.cluster = cluster_spec_module.cluster
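
Construction is keyword-only (note the bare `*` in the signature). A usage sketch with placeholder values, assuming the enclosing class is named `Client`:

k8s_client = Client(
    image_name="elasticdl:dev",  # placeholder image path
    namespace="default",
    job_name="test-job",
    event_callback=print,  # any callable that accepts an event
)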
Example #9
    def __init__(self, args):
        self.logger = get_logger("PS", level=args.log_level.upper())

        self.grads_to_wait = args.grads_to_wait
        self.lr_staleness_modulation = args.lr_staleness_modulation
        self.use_async = args.use_async
        self.port = args.port
        model_module = load_module(
            get_module_file_path(args.model_zoo, args.model_def)
        ).__dict__
        self.optimizer = model_module[args.optimizer]()
        # Create Parameters instance
        self.parameters = Parameters()
Example #10
    def test_odps_data_reader_integration_with_local_keras(self):
        num_records = 2
        model_spec = load_module(
            os.path.join(
                os.path.dirname(os.path.realpath(__file__)),
                "../../../model_zoo",
                "odps_iris_dnn_model/odps_iris_dnn_model.py",
            )
        ).__dict__
        model = model_spec["custom_model"]()
        optimizer = model_spec["optimizer"]()
        loss = model_spec["loss"]
        reader = create_data_reader(
            data_origin=self.test_table,
            records_per_task=10,
            **{"columns": IRIS_TABLE_COLUMN_NAMES, "label_col": "class"}
        )
        dataset_fn = reader.default_dataset_fn()

        def _gen():
            for data in self.reader.read_records(
                _MockedTask(
                    0,
                    num_records,
                    self.test_table + ":shard_0",
                    elasticdl_pb2.TRAINING,
                )
            ):
                if data is not None:
                    yield data

        dataset = tf.data.Dataset.from_generator(_gen, tf.string)
        dataset = dataset_fn(
            dataset, None, Metadata(column_names=IRIS_TABLE_COLUMN_NAMES)
        )
        dataset = dataset.batch(1)

        loss_history = []
        grads = None
        for features, labels in dataset:
            with tf.GradientTape() as tape:
                logits = model(features, training=True)
                loss_value = loss(labels, logits)
            loss_history.append(loss_value.numpy())
            grads = tape.gradient(loss_value, model.trainable_variables)
            optimizer.apply_gradients(zip(grads, model.trainable_variables))

        self.assertEqual(len(loss_history), num_records)
        self.assertEqual(len(grads), num_records)
        self.assertEqual(len(model.trainable_variables), num_records)
Example #11
    def __init__(self,
                 *,
                 image_name,
                 namespace,
                 job_name,
                 event_callback=None,
                 cluster_spec=""):
        """
        ElasticDL k8s client.

        Args:
            image_name: Docker image path for ElasticDL pod.
            namespace: The name of the Kubernetes namespace where ElasticDL
                pods will be created.
            job_name: ElasticDL job name, should be unique in the namespace.
                Used as pod name prefix and value for "elasticdl" label.
            event_callback: If not None, an event watcher will be created and
                events passed to the callback.
            cluster_spec: The path to a Python module file that defines a
                `cluster` attribute (loaded via `load_module`).
        """
        if os.getenv("KUBERNETES_SERVICE_HOST"):
            # We are running inside k8s
            config.load_incluster_config()
        else:
            # Use user's kube config
            config.load_kube_config()

        self.client = client.CoreV1Api()
        self.namespace = namespace
        self.job_name = job_name
        self._image_name = image_name
        self._event_cb = event_callback
        if self._event_cb:
            threading.Thread(target=self._watch,
                             name="event_watcher",
                             daemon=True).start()
        self.cluster = None
        if cluster_spec:
            cluster_spec_module = load_module(cluster_spec)
            self.cluster = cluster_spec_module.cluster
Example #12
    def __init__(self, args):
        self.logger = get_logger("PS", level=args.log_level.upper())

        self.grads_to_wait = args.grads_to_wait
        self.lr_staleness_modulation = args.lr_staleness_modulation
        self.use_async = args.use_async
        self.port = args.port
        model_module = load_module(
            get_module_file_path(args.model_zoo, args.model_def)).__dict__
        self.optimizer = model_module[args.optimizer]()
        self.ps_id = args.ps_id
        self.num_ps_pods = args.num_ps_pods
        # Create Parameters instance
        self.parameters = Parameters()
        if args.master_addr is None:
            raise ValueError("master_addr is missing for parameter servers")
        self.master_channel = build_channel(args.master_addr)
        self.evaluation_steps = args.evaluation_steps

        self.master_name = get_master_pod_name(args.job_name)
        self.namespace = args.namespace
        self._init_checkpoint_service(args)
Example #13
def distributed_train_and_evaluate(
    feature_shape,
    model_zoo_path,
    model_def,
    model_params="",
    eval_metrics_fn="eval_metrics_fn",
    training=True,
    dataset_name=DatasetName.IMAGE_DEFAULT,
    callback_classes=[],
    use_async=False,
    get_model_steps=1,
):
    """Runs distributed training and evaluation with a local master. Grpc
    calls are mocked by local master call.

    Args:
        feature_shape: The shape of model input.
        model_zoo_path: The directory that contains user-defined model files
            or a specific model file.
        model_def: The import path to the model definition function/class in
            the model zoo, e.g.  "cifar10_subclass.CustomModel".
        model_params: The dictionary of model parameters in a string that will
            be used to instantiate the model, e.g. "param1=1,param2=2".
        eval_metrics_fn: The name of the evaluation metrics function defined
            in the model file.
        training: True for job type `TRAINING_WITH_EVALUATION`, False for
            job type `EVALUATION_ONLY`.
        dataset_name: A dataset name from `DatasetName`.
        callback_classes: A list of callback classes that will be called at
            given stages of the training procedure.
        use_async: A Python bool. True if using asynchronous updates.
        get_model_steps: Worker will perform `get_model` from the parameter
            server once every `get_model_steps` steps.

    Returns:
        An integer indicating the model version after the distributed training
        and evaluation.
    """
    job_type = (JobType.TRAINING_WITH_EVALUATION
                if training else JobType.EVALUATION_ONLY)
    batch_size = 8 if dataset_name == DatasetName.IMAGENET else 16
    arguments = [
        "--worker_id",
        "1",
        "--job_type",
        job_type,
        "--minibatch_size",
        batch_size,
        "--model_zoo",
        model_zoo_path,
        "--model_def",
        model_def,
        "--model_params",
        model_params,
        "--get_model_steps",
        get_model_steps,
    ]
    args = parse_worker_args(arguments)
    worker = Worker(args)

    if dataset_name in [DatasetName.IMAGENET, DatasetName.FRAPPE]:
        record_num = batch_size
    else:
        record_num = 128
    shards = {
        create_recordio_file(record_num, dataset_name, feature_shape): (
            0,
            record_num,
        )
    }
    if training:
        training_shards = shards
        evaluation_shards = shards
    else:
        training_shards = {}
        evaluation_shards = shards
    task_d = _TaskDispatcher(
        training_shards,
        evaluation_shards,
        {},
        records_per_task=64,
        num_epochs=1,
    )

    model_module = load_module(get_module_file_path(model_zoo_path,
                                                    model_def)).__dict__
    checkpoint_service = CheckpointService("", 0, 0, True)
    if training:
        evaluation_service = EvaluationService(
            checkpoint_service,
            None,
            task_d,
            0,
            0,
            1,
            False,
            model_module[eval_metrics_fn],
        )
    else:
        evaluation_service = EvaluationService(
            checkpoint_service,
            None,
            task_d,
            0,
            0,
            0,
            True,
            model_module[eval_metrics_fn],
        )
    task_d.set_evaluation_service(evaluation_service)
    grads_to_wait = 1 if use_async else 2
    master = MasterServicer(
        grads_to_wait,
        batch_size,
        worker._opt_fn(),
        task_d,
        init_var=[],
        checkpoint_filename_for_init="",
        checkpoint_service=checkpoint_service,
        evaluation_service=evaluation_service,
        use_async=use_async,
    )
    callbacks = [
        callback_class(master, worker) for callback_class in callback_classes
    ]
    worker._stub = InProcessMaster(master, callbacks)

    for var in worker._model.trainable_variables:
        master.set_model_var(var.name, var.numpy())

    worker.run()

    req = elasticdl_pb2.GetTaskRequest()
    req.worker_id = 1
    task = master.GetTask(req, None)
    # No more task.
    if task.shard_name:
        raise RuntimeError(
            "There are some tasks unfinished after worker exits.")
    return master._version
Example #14
    def test_push_model(self):
        opt_func_name = "ftrl_optimizer"
        opt = load_module(_module_file).__dict__[opt_func_name]()
        opt_config = opt.get_config()
        slot_names = ["accumulator", "linear"]
        slot_init_value = {
            "accumulator": opt_config["initial_accumulator_value"],
            "linear": 0.0,
        }

        self.create_default_server_and_stub(optimizer=opt_func_name)
        param0 = {
            "v0": np.random.rand(3, 2).astype(np.float32),
            "v1": np.random.rand(10, 32).astype(np.float32),
        }
        param1 = {
            "v0": np.ones([3, 2], dtype=np.float32),
            "v1": np.ones([10, 32], dtype=np.float32),
        }

        models = [param0, param1]

        for idx, model in enumerate(models):
            req = elasticdl_pb2.Model()
            req.version = idx + 1
            for name in model:
                serialize_ndarray(model[name], req.dense_parameters[name])
            req.embedding_table_infos.append(self._embedding_info)
            res = self._stub.push_model(req)
            self.assertEqual(res, empty_pb2.Empty())
            # self._parameters is initialized with the first push_model call
            # and the second push_model has no effect
            self.assertEqual(self._parameters.version, 1)
            for name in param0:
                self.assertTrue(
                    np.allclose(
                        param0[name],
                        self._parameters.non_embedding_params[name].numpy(),
                    )
                )
            self.assertEqual(
                self._embedding_info.name,
                self._parameters.embedding_params[
                    self._embedding_info.name
                ].name,
            )
            self.assertEqual(
                self._embedding_info.dim,
                self._parameters.embedding_params[
                    self._embedding_info.name
                ].dim,
            )
            self.assertEqual(
                tf.keras.initializers.get(
                    self._embedding_info.initializer
                ).__class__,
                self._parameters.embedding_params[
                    self._embedding_info.name
                ].initializer.__class__,
            )

            for slot_name in slot_names:
                name = get_slot_table_name(
                    self._embedding_info.name, slot_name
                )
                table = self._parameters.embedding_params[name]
                self.assertEqual(name, table.name)
                self.assertEqual(self._embedding_info.dim, table.dim)
                embedding = table.get([2])
                self.assertTrue(
                    (embedding - slot_init_value[slot_name] < 0.0001).all()
                )
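
The slot lookup above assumes `get_slot_table_name` derives a deterministic key from the embedding name and the optimizer slot name. A plausible sketch (the exact separator is an assumption, not confirmed by the source):

def get_slot_table_name(embedding_name, slot_name):
    # e.g. ("embed", "accumulator") -> "embed-accumulator"
    return embedding_name + "-" + slot_name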
Example #15
def main():
    args = parse_master_args()
    logger = get_logger("master", level=args.log_level.upper())

    # Master addr
    master_ip = os.getenv("MY_POD_IP", "localhost")
    master_addr = "%s:%d" % (master_ip, args.port)

    # Start TensorBoard service if requested
    if args.tensorboard_log_dir:
        logger.info(
            "Starting TensorBoard service with log directory %s",
            args.tensorboard_log_dir,
        )
        # Start TensorBoard CLI
        tb_service = TensorboardService(args.tensorboard_log_dir, master_ip)
        tb_service.start()
    else:
        tb_service = None

    # Start task queue
    logger.debug(
        "Starting task queue with training data directory %s, "
        "evaluation data directory %s, "
        "and prediction data directory %s",
        args.training_data_dir,
        args.evaluation_data_dir,
        args.prediction_data_dir,
    )

    records_per_task = args.minibatch_size * args.num_minibatches_per_task
    task_d = _make_task_dispatcher(
        args.training_data_dir,
        args.evaluation_data_dir,
        args.prediction_data_dir,
        records_per_task,
        args.num_epochs,
    )
    model_module = load_module(
        get_module_file_path(args.model_zoo, args.model_def)
    ).__dict__
    model_inst = load_model_from_module(
        args.model_def, model_module, args.model_params
    )
    optimizer = model_module[args.optimizer]()

    if all(
        (
            args.training_data_dir,
            args.evaluation_data_dir,
            args.evaluation_throttle_secs or args.evaluation_steps,
        )
    ):
        job_type = JobType.TRAINING_WITH_EVALUATION
    elif all(
        (
            args.evaluation_data_dir,
            not args.training_data_dir,
            not args.prediction_data_dir,
        )
    ):
        job_type = JobType.EVALUATION_ONLY
    elif all(
        (
            args.prediction_data_dir,
            not args.evaluation_data_dir,
            not args.training_data_dir,
        )
    ):
        job_type = JobType.PREDICTION_ONLY
    else:
        job_type = JobType.TRAINING_ONLY

    # Initialize checkpoint service
    if args.checkpoint_steps or job_type == JobType.TRAINING_WITH_EVALUATION:
        logger.info("Starting checkpoint service")
        checkpoint_service = CheckpointService(
            args.checkpoint_dir,
            args.checkpoint_steps,
            args.keep_checkpoint_max,
            job_type == JobType.TRAINING_WITH_EVALUATION,
        )
    else:
        checkpoint_service = None

    # Initialize evaluation service
    evaluation_service = None
    if (
        job_type == JobType.TRAINING_WITH_EVALUATION
        or job_type == JobType.EVALUATION_ONLY
    ):
        logger.info(
            "Starting evaluation service with throttle seconds %d "
            " and evaluation steps %d",
            args.evaluation_throttle_secs,
            args.evaluation_steps,
        )
        evaluation_service = EvaluationService(
            checkpoint_service,
            tb_service,
            task_d,
            args.evaluation_start_delay_secs,
            args.evaluation_throttle_secs,
            args.evaluation_steps,
            job_type == JobType.EVALUATION_ONLY,
        )
        evaluation_service.start()
        task_d.set_evaluation_service(evaluation_service)

    embedding_service_endpoint = None
    embedding_dims = {}
    # Search for embedding layers in the model,
    # if found, initialize embedding service
    layers = find_layer(model_inst, Embedding)
    if layers:
        embedding_service = EmbeddingService()
        embedding_service_endpoint = embedding_service.start_embedding_service(
            job_name=args.job_name,
            image_name=args.worker_image,
            namespace=args.namespace,
            resource_request=args.master_resource_request,
            resource_limit=args.master_resource_limit,
            pod_priority=args.worker_pod_priority,
            volume=args.volume,
            image_pull_policy=args.image_pull_policy,
            restart_policy=args.restart_policy,
            cluster_spec=args.cluster_spec,
        )
        logger.info(
            "Embedding service start succeeded. The endpoint is %s."
            % str(embedding_service_endpoint)
        )
        embedding_dims = dict(
            [(layer.name, layer.output_dim) for layer in layers]
        )

    # The master service
    logger.info("Starting master service")
    server = grpc.server(
        futures.ThreadPoolExecutor(max_workers=64),
        options=[
            ("grpc.max_send_message_length", GRPC.MAX_SEND_MESSAGE_LENGTH),
            (
                "grpc.max_receive_message_length",
                GRPC.MAX_RECEIVE_MESSAGE_LENGTH,
            ),
        ],
    )
    master_servicer = MasterServicer(
        args.grads_to_wait,
        args.minibatch_size,
        optimizer,
        task_d,
        init_var=model_inst.trainable_variables if model_inst.built else [],
        embedding_dims=embedding_dims,
        checkpoint_filename_for_init=args.checkpoint_filename_for_init,
        checkpoint_service=checkpoint_service,
        evaluation_service=evaluation_service,
        embedding_service_endpoint=embedding_service_endpoint,
        lr_staleness_modulation=args.lr_staleness_modulation,
        use_async=args.use_async,
    )
    elasticdl_pb2_grpc.add_MasterServicer_to_server(master_servicer, server)
    server.add_insecure_port("[::]:{}".format(args.port))
    server.start()
    logger.info("Server started at port: %d", args.port)

    worker_manager = None
    if args.num_workers:
        assert args.worker_image, "Worker image cannot be empty"

        worker_command = ["python"]
        worker_args = [
            "-m",
            "elasticdl.python.worker.main",
            "--master_addr",
            master_addr,
            "--job_type",
            job_type,
            "--embedding_service_endpoint",
            str(embedding_service_endpoint),
        ]
        worker_args.extend(build_arguments_from_parsed_result(args))

        env_dict = parse_envs(args.envs)
        env = []
        for key in env_dict:
            env.append(V1EnvVar(name=key, value=env_dict[key]))

        worker_manager = WorkerManager(
            task_d,
            job_name=args.job_name,
            image_name=args.worker_image,
            command=worker_command,
            args=worker_args,
            namespace=args.namespace,
            num_workers=args.num_workers,
            worker_resource_request=args.worker_resource_request,
            worker_resource_limit=args.worker_resource_limit,
            pod_priority=args.worker_pod_priority,
            volume=args.volume,
            image_pull_policy=args.image_pull_policy,
            restart_policy=args.restart_policy,
            cluster_spec=args.cluster_spec,
            envs=env,
        )
        worker_manager.update_status(WorkerManagerStatus.PENDING)
        logger.info("Launching %d workers", args.num_workers)
        worker_manager.start_workers()
        worker_manager.update_status(WorkerManagerStatus.RUNNING)

    # Start TensorBoard k8s Service if requested
    if tb_service:
        TensorBoardClient(
            job_name=args.job_name,
            image_name=args.worker_image,
            namespace=args.namespace,
        ).start_tensorboard_service()

    try:
        while True:
            if task_d.finished():
                if worker_manager:
                    worker_manager.update_status(WorkerManagerStatus.FINISHED)
                if args.output:
                    master_servicer.save_latest_checkpoint(args.output)
                break
            time.sleep(30)
    except KeyboardInterrupt:
        logger.warning("Server stopping")

    if evaluation_service:
        logger.info("Stopping evaluation service")
        evaluation_service.stop()

    logger.info("Stopping RPC server")
    server.stop(0)

    # Keep TensorBoard running when all the tasks are finished
    if tb_service:
        logger.info(
            "All tasks finished. Keeping TensorBoard service running..."
        )
        while True:
            if tb_service.is_active():
                time.sleep(10)
            else:
                logger.warning(
                    "Unable to keep TensorBoard running. "
                    "It has already terminated"
                )
                break
    logger.info("Master stopped")
Example #16
    def __init__(self, args):
        self.logger = get_logger("master", level=args.log_level.upper())

        self.num_ps_pods = args.num_ps_pods
        self.checkpoint_output_path = args.checkpoint_dir

        # Master addr
        master_ip = os.getenv("MY_POD_IP", "localhost")
        self.master_addr = "%s:%d" % (master_ip, args.port)
        self.job_type = Master._get_job_type(args)

        # Initialize TensorBoard service if requested
        self.tb_service = self._create_tensorboard_service(
            args.tensorboard_log_dir, master_ip)
        if self.tb_service:
            self.tb_client = TensorBoardClient(
                job_name=args.job_name,
                image_name=args.worker_image,
                namespace=args.namespace,
            )

        # Initialize the components from the model definition
        self.model_module = load_module(
            get_module_file_path(args.model_zoo, args.model_def)).__dict__
        self.model_inst = load_model_from_module(args.model_def,
                                                 self.model_module,
                                                 args.model_params)
        model_handler = ModelHandler.get_model_handler(
            args.distribution_strategy, checkpoint_dir=args.checkpoint_dir)
        self.model_inst = model_handler.get_model_to_train(self.model_inst)
        self.optimizer = self.model_module[args.optimizer]()
        self._create_data_reader_fn = create_data_reader
        if args.custom_data_reader in self.model_module:
            self._create_data_reader_fn = self.model_module[
                args.custom_data_reader]

        # Start task queue
        records_per_task = args.minibatch_size * args.num_minibatches_per_task
        self.task_d = _make_task_dispatcher(
            args.training_data,
            args.validation_data,
            args.prediction_data,
            records_per_task,
            args.num_epochs,
            args.data_reader_params,
            self._create_data_reader_fn,
        )

        saved_model_path = args.output
        if saved_model_path is not None and self.job_type in [
                JobType.TRAINING_ONLY,
                JobType.TRAINING_WITH_EVALUATION,
        ]:
            self.task_d.add_deferred_callback_create_save_model_task(
                saved_model_path)

        self.evaluation_service = self._create_evaluation_service(args)

        # Initialize master service
        self.master_servicer, self.server = self._create_master_service(args)

        # Initialize instance manager
        self.instance_manager = self._create_instance_manager(args)

        self._should_stop = False
        self._exit_code = 0
Example #17
def distributed_train_and_evaluate(
    feature_shape,
    model_zoo_path,
    model_def,
    model_params="",
    eval_metrics_fn="eval_metrics_fn",
    loss="loss",
    training=True,
    dataset_name=DatasetName.IMAGE_DEFAULT,
    use_async=False,
    get_model_steps=1,
    ps_channels=None,
    pservers=None,
    distribution_strategy=DistributionStrategy.PARAMETER_SERVER,
):
    """Runs distributed training and evaluation with a local master. Grpc
    calls are mocked by local master call.

    Args:
        feature_shape: The shape of model input.
        model_zoo_path: The directory that contains user-defined model files
            or a specific model file.
        model_def: The import path to the model definition function/class in
            the model zoo, e.g.  "cifar10_subclass.CustomModel".
        model_params: The dictionary of model parameters in a string that will
            be used to instantiate the model, e.g. "param1=1,param2=2".
        eval_metrics_fn: The name of the evaluation metrics function defined
            in the model file.
        loss: The name of the loss function defined in the model file.
        training: True for job type `TRAINING_WITH_EVALUATION`, False for
            job type `EVALUATION_ONLY`.
        dataset_name: A dataset name from `DatasetName`.
        use_async: A bool. True if using asynchronous updates.
        get_model_steps: Worker will perform `get_model` from the parameter
            server once every `get_model_steps` steps.
        ps_channels: A list of channels to all parameter server pods.
        pservers: A list of parameter server pods.
        distribution_strategy: The distribution strategy used by workers,
            e.g. DistributionStrategy.PARAMETER_SERVER or
            DistributionStrategy.ALLREDUCE.

    Returns:
        An integer indicating the model version after the distributed training
        and evaluation.
    """
    job_type = (JobType.TRAINING_WITH_EVALUATION
                if training else JobType.EVALUATION_ONLY)
    evaluation_steps = 1 if job_type == JobType.TRAINING_WITH_EVALUATION else 0
    batch_size = 8 if dataset_name == DatasetName.IMAGENET else 16
    pservers = pservers or []
    ps_channels = ps_channels or []

    model_module = load_module(get_module_file_path(model_zoo_path,
                                                    model_def)).__dict__

    worker_arguments = [
        "--worker_id",
        "1",
        "--job_type",
        job_type,
        "--minibatch_size",
        batch_size,
        "--model_zoo",
        model_zoo_path,
        "--model_def",
        model_def,
        "--model_params",
        model_params,
        "--loss",
        loss,
        "--get_model_steps",
        get_model_steps,
        "--distribution_strategy",
        distribution_strategy,
    ]
    args = parse_worker_args(worker_arguments)

    if dataset_name in [DatasetName.IMAGENET, DatasetName.FRAPPE]:
        record_num = batch_size
    else:
        record_num = 128
    shards = {
        create_recordio_file(record_num, dataset_name, feature_shape): (
            0,
            record_num,
        )
    }
    if training:
        training_shards = shards
        evaluation_shards = shards
    else:
        training_shards = {}
        evaluation_shards = shards
    task_d = _TaskDispatcher(
        training_shards,
        evaluation_shards,
        {},
        records_per_task=64,
        num_epochs=1,
    )

    if training:
        evaluation_service = EvaluationService(
            None,
            task_d,
            0,
            0,
            evaluation_steps,
            False,
            model_module[eval_metrics_fn],
        )
    else:
        evaluation_service = EvaluationService(
            None,
            task_d,
            0,
            0,
            evaluation_steps,
            True,
            model_module[eval_metrics_fn],
        )
    task_d.set_evaluation_service(evaluation_service)

    master = Mock(
        task_d=task_d,
        instance_manager=None,
        distribution_strategy=None,
    )

    def master_creator():
        return MasterServicer(
            batch_size,
            evaluation_service=evaluation_service,
            master=master,
        )

    svc, port = _server(master_creator)
    mc = MasterClient(build_channel("localhost:%d" % port), 1)
    worker = Worker(args, master_client=mc, ps_client=PSClient(ps_channels))

    for pservicer in pservers:
        # FIXME(yancey1989): decouple pserver and master client
        pservicer._master_stub = mc

    worker.run()

    task = mc.get_task()
    # stop the master servicer
    svc.stop(0)
    # No more task.
    if task.shard_name:
        raise RuntimeError(
            "There are some tasks unfinished after worker exits.")
    return task.model_version
Example #18
def _create_model_instance(model_def):
    module_file = get_module_file_path(_get_model_zoo_path(), model_def)
    model_module = load_module(module_file).__dict__
    return load_model_from_module(model_def, model_module, None)
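
Usage is a one-liner; the `model_def` below is illustrative:

model = _create_model_instance("mnist_functional_api.custom_model")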
Example #19
def distributed_train_and_evaluate(
    feature_shape,
    model_zoo_path,
    model_def,
    model_params="",
    eval_metrics_fn="eval_metrics_fn",
    loss="loss",
    training=True,
    dataset_name=DatasetName.IMAGE_DEFAULT,
    callback_classes=[],
    use_async=False,
    get_model_steps=1,
    ps_channels=None,
    pservers=None,
    distribution_strategy=DistributionStrategy.PARAMETER_SERVER,
):
    """Runs distributed training and evaluation with a local master. Grpc
    calls are mocked by local master call.

    Args:
        feature_shape: The shape of model input.
        model_zoo_path: The directory that contains user-defined model files
            or a specific model file.
        model_def: The import path to the model definition function/class in
            the model zoo, e.g.  "cifar10_subclass.CustomModel".
        model_params: The dictionary of model parameters in a string that will
            be used to instantiate the model, e.g. "param1=1,param2=2".
        eval_metrics_fn: The name of the evaluation metrics function defined
            in the model file.
        loss: The name of the loss function defined in the model file.
        training: True for job type `TRAINING_WITH_EVALUATION`, False for
            job type `EVALUATION_ONLY`.
        dataset_name: A dataset name from `DatasetName`.
        callback_classes: A list of callback classes that will be called at
            given stages of the training procedure.
        use_async: A bool. True if using asynchronous updates.
        get_model_steps: Worker will perform `get_model` from the parameter
            server once every `get_model_steps` steps.
        ps_channels: A list of channels to all parameter server pods.
        pservers: A list of parameter server pods.
        distribution_strategy: The distribution strategy used by workers,
            e.g. DistributionStrategy.PARAMETER_SERVER or
            DistributionStrategy.ALLREDUCE.

    Returns:
        An integer indicating the model version after the distributed training
        and evaluation.
    """
    job_type = (JobType.TRAINING_WITH_EVALUATION
                if training else JobType.EVALUATION_ONLY)
    evaluation_steps = 1 if job_type == JobType.TRAINING_WITH_EVALUATION else 0
    batch_size = 8 if dataset_name == DatasetName.IMAGENET else 16
    pservers = pservers or []
    ps_channels = ps_channels or []

    model_module = load_module(get_module_file_path(model_zoo_path,
                                                    model_def)).__dict__

    for channel in ps_channels:
        grpc.channel_ready_future(channel).result()
    worker_arguments = [
        "--worker_id",
        "1",
        "--job_type",
        job_type,
        "--minibatch_size",
        batch_size,
        "--model_zoo",
        model_zoo_path,
        "--model_def",
        model_def,
        "--model_params",
        model_params,
        "--loss",
        loss,
        "--get_model_steps",
        get_model_steps,
        "--distribution_strategy",
        distribution_strategy,
    ]
    args = parse_worker_args(worker_arguments)
    worker = Worker(args, ps_channels=ps_channels)

    if dataset_name in [DatasetName.IMAGENET, DatasetName.FRAPPE]:
        record_num = batch_size
    else:
        record_num = 128
    shards = {
        create_recordio_file(record_num, dataset_name, feature_shape): (
            0,
            record_num,
        )
    }
    if training:
        training_shards = shards
        evaluation_shards = shards
    else:
        training_shards = {}
        evaluation_shards = shards
    task_d = _TaskDispatcher(
        training_shards,
        evaluation_shards,
        {},
        records_per_task=64,
        num_epochs=1,
    )

    if training:
        evaluation_service = EvaluationService(
            None,
            task_d,
            0,
            0,
            evaluation_steps,
            False,
            model_module[eval_metrics_fn],
        )
    else:
        evaluation_service = EvaluationService(
            None,
            task_d,
            0,
            0,
            evaluation_steps,
            True,
            model_module[eval_metrics_fn],
        )
    task_d.set_evaluation_service(evaluation_service)

    master = MasterServicer(
        batch_size,
        task_d,
        evaluation_service=evaluation_service,
    )
    callbacks = [
        callback_class(master, worker) for callback_class in callback_classes
    ]

    in_process_master = InProcessMaster(master, callbacks)
    worker._stub = in_process_master
    for pservicer in pservers:
        pservicer._master_stub = in_process_master

    worker.run()

    req = elasticdl_pb2.GetTaskRequest()
    req.worker_id = 1
    task = master.get_task(req, None)
    # No more task.
    if task.shard_name:
        raise RuntimeError(
            "There are some tasks unfinished after worker exits.")
    return master._version
Example #20
    def __init__(self, args):
        self.logger = get_logger("master", level=args.log_level.upper())

        self.num_ps_pods = args.num_ps_pods
        self.checkpoint_output_path = args.checkpoint_dir
        self.distribution_strategy = args.distribution_strategy

        # Master addr
        master_ip = os.getenv("MY_POD_IP", "localhost")
        self.master_addr = "%s:%d" % (master_ip, args.port)
        self.job_type = Master._get_job_type(args)
        self.rendezvous_server = None
        if self.distribution_strategy == DistributionStrategy.ALLREDUCE:
            self.rendezvous_server = HorovodRendezvousServer(master_ip)

        # Initialize TensorBoard service if requested
        self.tb_service = self._create_tensorboard_service(
            args.tensorboard_log_dir, master_ip
        )
        if self.tb_service:
            self.tb_client = TensorBoardClient(
                job_name=args.job_name,
                image_name=args.worker_image,
                namespace=args.namespace,
            )

        # Initialize the components from the model definition
        self.model_module = load_module(
            get_module_file_path(args.model_zoo, args.model_def)
        ).__dict__
        self.model_inst = load_model_from_module(
            args.model_def, self.model_module, args.model_params
        )
        self.optimizer = self.model_module[args.optimizer]()
        self._create_data_reader_fn = create_data_reader
        if args.custom_data_reader in self.model_module:
            self._create_data_reader_fn = self.model_module[
                args.custom_data_reader
            ]

        # Initialize the callbacks
        self.callbacks_list = load_callbacks_from_module(
            args.callbacks, self.model_module
        )
        self.callbacks_list.set_model(self.model_inst)
        set_callback_parameters(
            self.callbacks_list,
            batch_size=args.minibatch_size,
            saved_model_path=args.output,
            checkpoint_path=args.checkpoint_dir,
        )
        self._set_completed_steps_by_checkpoint(args.checkpoint_dir_for_init)

        # Start task queue
        records_per_task = args.minibatch_size * args.num_minibatches_per_task
        self.task_d = _make_task_dispatcher(
            args.training_data,
            args.validation_data,
            args.prediction_data,
            records_per_task,
            args.num_epochs,
            args.data_reader_params,
            self._create_data_reader_fn,
            self.callbacks_list,
        )

        self.task_d.add_deferred_callback_create_train_end_task()
        self.evaluation_service = self._create_evaluation_service(args)

        # Initialize instance manager
        self.instance_manager = self._create_instance_manager(args)

        # Initialize master service
        self.master_servicer, self.server = self._create_master_service(args)

        self._should_stop = False
        self._exit_code = 0
        threading.Thread(
            target=self._check_timeout_tasks,
            name="check_timeout_tasks",
            daemon=True,
        ).start()
Example #21
import os
import tempfile
import unittest

from elasticdl.proto import elasticdl_pb2
from elasticdl.python.common.model_utils import (
    get_module_file_path,
    load_module,
)
from elasticdl.python.master.checkpoint_service import CheckpointService
from elasticdl.python.master.servicer import MasterServicer

_model_zoo_path = os.path.dirname(os.path.realpath(__file__))
_model_file = get_module_file_path(_model_zoo_path, "test_module.custom_model")
m = load_module(_model_file).__dict__


class CheckpointTest(unittest.TestCase):
    def testNeedToCheckpoint(self):
        checkpointer = CheckpointService("", 0, 5, False)
        self.assertFalse(checkpointer.is_enabled())
        checkpointer._steps = 3
        self.assertTrue(checkpointer.is_enabled())

        self.assertFalse(checkpointer.need_to_checkpoint(1))
        self.assertFalse(checkpointer.need_to_checkpoint(2))
        self.assertTrue(checkpointer.need_to_checkpoint(3))
        self.assertFalse(checkpointer.need_to_checkpoint(4))
        self.assertFalse(checkpointer.need_to_checkpoint(5))
        self.assertTrue(checkpointer.need_to_checkpoint(6))
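
The assertions pin down the contract: with `_steps == 3`, only versions divisible by 3 trigger a checkpoint, and a zero step interval disables the service. A sketch consistent with that behavior (not necessarily the actual implementation):

def need_to_checkpoint(self, version):
    # Checkpoint only when enabled and the model version is a
    # multiple of the configured step interval.
    return self.is_enabled() and version % self._steps == 0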