Example #1
    def test_check_grad(self):
        self.params.reset()
        self.params.init_from_model_pb(self.model_pb)

        grad0 = Tensor("z", None, None)
        with self.assertRaisesRegex(ValueError, "Name error"):
            self.params.check_grad(grad0)

        grad1 = Tensor("x", np.random.uniform(size=(3, 5)), None)
        with self.assertRaisesRegex(ValueError, "Non embedding param error"):
            self.params.check_grad(grad1)

        grad2 = Tensor(
            name="embedding_1",
            values=np.random.uniform(size=(3, 11)),
            indices=np.array([1, 2, 3]),
        )
        with self.assertRaisesRegex(
            ValueError, "ElasticDL embedding param error"
        ):
            self.params.check_grad(grad2)

        grad3 = Tensor(
            name="x",
            values=np.random.uniform(size=(4, 4)),
            indices=np.array([1, 2, 3, 4]),
        )
        with self.assertRaisesRegex(ValueError, "Keras embedding param error"):
            self.params.check_grad(grad3)
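
The four assertions above outline what Parameters.check_grad validates: an unknown gradient name, a shape mismatch against a non-embedding parameter, a width that does not match an ElasticDL embedding table's dim, and out-of-range indices for a Keras embedding parameter. A minimal self-contained sketch of that kind of validation, inferred from the assertions in this test rather than taken from the actual ElasticDL implementation:

    import numpy as np
    from collections import namedtuple

    Tensor = namedtuple("Tensor", ["name", "values", "indices"])

    def check_grad_sketch(non_embedding_params, embedding_dims, grad):
        # Hypothetical checks mirroring the errors asserted above.
        if grad.name in embedding_dims:
            # ElasticDL embedding table: gradient width must equal its dim.
            if grad.values.shape[1] != embedding_dims[grad.name]:
                raise ValueError("ElasticDL embedding param error")
        elif grad.name not in non_embedding_params:
            raise ValueError("Name error")
        elif grad.indices is not None:
            # Keras embedding layer: indexed-slices gradient for a dense param.
            if grad.indices.max() >= non_embedding_params[grad.name].shape[0]:
                raise ValueError("Keras embedding param error")
        elif grad.values.shape != non_embedding_params[grad.name].shape:
            raise ValueError("Non embedding param error")

    params = {"x": np.zeros((3, 4)), "y": np.zeros((4, 5))}
    check_grad_sketch(params, {"embedding_1": 10},
                      Tensor("x", np.zeros((3, 4)), None))  # passes silently
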
Example #2
    def setUp(self):
        self.params = Parameters()

        self.model_pb = Model()
        self.infos_pb = self.model_pb.embedding_table_infos
        self.tensors_pb = self.model_pb.dense_parameters
        self.embedding_tables_pb = self.model_pb.embedding_tables

        self.embedding_table_name = "embedding_1"
        self.embedding_dim = 10
        embedding_pb = self.infos_pb.add()
        embedding_pb.name = self.embedding_table_name
        embedding_pb.dim = self.embedding_dim
        embedding_pb.initializer = "uniform"

        arr1 = np.random.uniform(size=(3, 4))
        serialize_ndarray(arr1, self.tensors_pb["x"])
        arr2 = np.random.uniform(size=(4, 5))
        serialize_ndarray(arr2, self.tensors_pb["y"])

        embedding_vectors = np.random.uniform(size=(2, 10))
        embedding_indices = np.array([0, 8])
        serialize_indexed_slices(
            Tensor(None, embedding_vectors, embedding_indices),
            self.embedding_tables_pb[self.embedding_table_name],
        )
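
The setUp above fills a Model proto with two dense ndarrays and one indexed-slices embedding table. A round-trip sketch using the matching deserialization helpers seen in the later examples (pb_to_ndarray, pb_to_indexed_slices); it assumes the same Model, Tensor, and serialization names are in scope as in the setUp, since the exact import paths depend on the ElasticDL version:

    # Assumes Model, Tensor, serialize_ndarray, serialize_indexed_slices,
    # pb_to_ndarray and pb_to_indexed_slices are imported as in the
    # surrounding examples.
    import numpy as np

    model_pb = Model()
    arr = np.random.uniform(size=(3, 4)).astype(np.float32)
    serialize_ndarray(arr, model_pb.dense_parameters["x"])
    np.testing.assert_array_equal(
        pb_to_ndarray(model_pb.dense_parameters["x"]), arr
    )

    vectors = np.random.uniform(size=(2, 10)).astype(np.float32)
    indices = np.array([0, 8])
    serialize_indexed_slices(
        Tensor(None, vectors, indices),
        model_pb.embedding_tables["embedding_1"],
    )
    restored = pb_to_indexed_slices(model_pb.embedding_tables["embedding_1"])
    np.testing.assert_array_equal(restored.indices, indices)
    np.testing.assert_array_equal(restored.values, vectors)
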
Example #3
    def _get_model(self):
        self._timing.start_record_time("get_model")
        # 1. The worker tries to pull dense parameters from the PS;
        # one or more PS instances may still be uninitialized.
        dense_params, uninit_ps = self._ps_client.pull_dense_parameters(
            [i for i in range(self._ps_client.ps_num)],
            self._model_versions_from_ps,
        )

        # 2. Worker pushes local dense parameters to these PS instances
        # to initialize their partition of parameters.
        if len(uninit_ps) > 0:
            for ps_id in uninit_ps:
                # push variable to ps for initialization
                parameters = [
                    Tensor(name, self._non_embed_vars[name].numpy(), None)
                    for name in self._ps_client.ps_to_parameter[ps_id]
                ]
                self._ps_client.push_dense_parameters(
                    parameters, ps_id, self._model_versions_from_ps[ps_id])

            ps_params, uninit = self._ps_client.pull_dense_parameters(
                uninit_ps, self._model_versions_from_ps)
            if len(uninit) > 0:
                # TODO: support PS fault-tolerance
                raise RuntimeError("PS initialization failed")
            dense_params.update(ps_params)

        # 3. Assign parameters to local model
        for k, v in dense_params.items():
            self._non_embed_vars[k].assign(v)

        self._model_version = max(self._model_versions_from_ps)
        self._timing.end_record_time("get_model")
Example #4
    def report_gradient_to_ps(self, gradients):

        self._timing.start_record_time("report_gradient")

        grads = []
        for i, v in enumerate(self._non_embed_vars.values()):
            if isinstance(gradients[i], tf.IndexedSlices):
                grad = Tensor(
                    v.name,
                    gradients[i].values.numpy(),
                    gradients[i].indices.numpy(),
                )
            else:
                grad = Tensor(v.name, gradients[i].numpy(), None)
            grads.append(grad)

        edl_grads = []
        edl_embedding_name_values = self._collect_edl_embedding_name_values()
        if edl_embedding_name_values:
            non_embed_vars_n = len(self._non_embed_vars)
            edl_embedding_grads = gradients[non_embed_vars_n:]
            bet_number = 0
            for name, embedding_and_ids in edl_embedding_name_values:

                for i in range(len(embedding_and_ids)):
                    grad = Tensor(
                        name,
                        edl_embedding_grads[i + bet_number].values.numpy(),
                        edl_embedding_grads[i + bet_number].indices.numpy(),
                    )
                    edl_grads.append(grad)
                bet_number += len(embedding_and_ids)
            if len(edl_embedding_grads) != bet_number:
                raise ValueError(
                    "elasticdl.layers.embedding related gradient number %d "
                    "does not match the number of its output tensor %d." %
                    (len(edl_embedding_grads), bet_number))
        learning_rate = K.get_value(self._model.optimizer.lr)
        accepted, max_version = self._ps_client.push_gradients(
            grads,
            edl_grads,
            learning_rate,
            self._model_versions_from_ps,
        )
        self._timing.end_record_time("report_gradient")
        return accepted, max_version
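
The isinstance(gradients[i], tf.IndexedSlices) branch exists because TensorFlow returns a sparse IndexedSlices gradient for a variable that is only read through row lookups (e.g. tf.nn.embedding_lookup or tf.gather), while an ordinary dense kernel gets a plain tensor. A small self-contained illustration:

    import tensorflow as tf

    emb = tf.Variable(tf.random.uniform((10, 4)))    # lookup-style parameter
    kernel = tf.Variable(tf.random.uniform((4, 2)))  # ordinary dense parameter

    with tf.GradientTape() as tape:
        rows = tf.nn.embedding_lookup(emb, tf.constant([1, 3, 3]))
        loss = tf.reduce_sum(tf.matmul(rows, kernel))

    g_emb, g_kernel = tape.gradient(loss, [emb, kernel])
    print(isinstance(g_emb, tf.IndexedSlices))     # True: values + indices
    print(isinstance(g_kernel, tf.IndexedSlices))  # False: dense tensor
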
Example #5
    def push_gradient_test_setup(self):
        self.var_names = ["test_1", "test_2"]
        self.var_values = [
            np.array([10.0, 20.0, 30.0], np.float32),
            np.array([20.0, 40.0, 60.0], np.float32),
        ]
        self.grad_values0 = [
            np.array([1.0, 2.0, 3.0], np.float32),
            np.array([2.0, 4.0, 6.0], np.float32),
        ]
        self.grad_values1 = [
            np.array([0.0, 0.0, 7.0], np.float32),
            np.array([9.0, 9.0, 6.0], np.float32),
        ]

        dim = self._embedding_info.dim
        self.embedding_table = (
            np.random.rand(4 * dim).reshape((4, dim)).astype(np.float32)
        )
        self.embedding_grads0 = Tensor(
            None,
            np.random.rand(3 * dim).reshape((3, dim)).astype(np.float32),
            np.asarray([3, 1, 3]),
        )
        self.embedding_grads1 = Tensor(
            None,
            np.random.rand(3 * dim).reshape((3, dim)).astype(np.float32),
            np.asarray([2, 2, 3]),
        )
        push_model_req = elasticdl_pb2.Model()
        push_model_req.version = self._parameters.version
        for name, value in zip(self.var_names, self.var_values):
            serialize_ndarray(value, push_model_req.dense_parameters[name])
        push_model_req.embedding_table_infos.append(self._embedding_info)
        self._stub.push_model(push_model_req)

        for name, var in zip(self.var_names, self.var_values):
            self._parameters.non_embedding_params[name] = tf.Variable(var)

        self._parameters.embedding_params[self._embedding_info.name].set(
            range(len(self.embedding_table)), self.embedding_table
        )
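
Given this setup and the comment in the PS handler of Example #7 that dense gradients are averaged while sparse gradients are summed, the expected value of test_1 after both pushes is easy to compute by hand. A numpy sketch under the assumptions that grads_to_wait is 2 and the optimizer is plain SGD with an illustrative learning rate of 0.1 (both assumptions for this sketch, not values taken from the test):

    import numpy as np

    var = np.array([10.0, 20.0, 30.0], np.float32)    # test_1
    grad0 = np.array([1.0, 2.0, 3.0], np.float32)     # from one push
    grad1 = np.array([0.0, 0.0, 7.0], np.float32)     # from the other push

    lr = 0.1
    averaged = (grad0 + grad1) / 2                    # dense grads are averaged
    expected = var - lr * averaged
    print(expected)  # [ 9.95 19.9  29.5 ]
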
Example #6
    def report_gradient_to_ps(self, grads):
        self._timing.start_record_time("report_gradient")
        reqs = [
            elasticdl_pb2.PushGradientsRequest() for i in range(self._ps_num)
        ]
        ps_grads = {}
        non_embed_vars_n = len(self._non_embed_vars)
        for g, v in zip(
            grads[:non_embed_vars_n], self._non_embed_vars.values()
        ):
            ps_id = self._var_to_ps[v.name]
            if ps_id not in ps_grads:
                ps_grads[ps_id] = {v.name: g}
            else:
                if v.name not in ps_grads[ps_id]:
                    ps_grads[ps_id][v.name] = g
                else:
                    if isinstance(g, tf.IndexedSlices):
                        ps_grads[ps_id][v.name] = merge_indexed_slices(
                            ps_grads[ps_id][v.name], g
                        )
                    else:
                        ps_grads[ps_id][v.name] += g

        for ps_id, pair in ps_grads.items():
            for name, g in pair.items():
                if isinstance(g, tf.IndexedSlices):
                    v, i = deduplicate_indexed_slices(g.values, g.indices)
                    ps_grads[ps_id][name] = tf.IndexedSlices(v, i)

        for ps_id in ps_grads:
            req = reqs[ps_id]
            for name, g in ps_grads[ps_id].items():
                # Keras embedding layer has a dense parameter,
                # but an indexed slices type gradient
                if isinstance(g, tf.IndexedSlices):
                    serialize_indexed_slices(
                        Tensor(None, g.values.numpy(), g.indices.numpy()),
                        req.gradients.embedding_tables[name],
                    )
                else:
                    serialize_ndarray(
                        g.numpy(), req.gradients.dense_parameters[name]
                    )

        edl_embedding_name_values = self._collect_edl_embedding_name_values()

        if edl_embedding_name_values:
            edl_embedding_grads = grads[non_embed_vars_n:]
            bet_number = 0
            for name, embedding_and_ids in edl_embedding_name_values:
                bet_number += len(embedding_and_ids)
            if len(edl_embedding_grads) != bet_number:
                raise ValueError(
                    "elasticdl.layers.embedding related gradient number %d "
                    "does not match the number of its output tensor %d."
                    % (len(edl_embedding_grads), bet_number)
                )

            grad_accum_iter = 0
            for name, embedding_and_ids in edl_embedding_name_values:
                g_values = None
                g_indices = None
                for _, ids in embedding_and_ids:
                    grad = edl_embedding_grads[grad_accum_iter]
                    grad_accum_iter += 1
                    # ElasticDL embedding layer with Sparse Gradients
                    if isinstance(grad, tf.IndexedSlices):
                        grad = grad.values
                    if g_values is not None:
                        g_values = tf.concat([g_values, grad], axis=0)
                        g_indices = tf.concat([g_indices, ids], axis=0)
                    else:
                        g_values = grad
                        g_indices = ids

                # Sum up the values of the duplicated indices in the
                # gradients. It can reduce the gradient payload of the
                # dense embedding.
                g_values, g_indices = deduplicate_indexed_slices(
                    values=g_values, indices=g_indices
                )

                results = scatter_embedding_vector(
                    g_values.numpy(), g_indices.numpy(), self._ps_num
                )

                for ps_id in results:
                    req = reqs[ps_id]
                    gv, gi = results[ps_id]
                    serialize_indexed_slices(
                        Tensor(None, gv, gi),
                        req.gradients.embedding_tables[name],
                    )

        report_futures = []
        for ps_id in range(self._ps_num):
            req = reqs[ps_id]
            req.gradients.version = self._model_versions_from_ps[ps_id]
            req.learning_rate = K.get_value(self._model.optimizer.lr)
            report_future = self._ps_stubs[ps_id].push_gradients.future(req)
            report_futures.append(report_future)

        accepted = False
        max_version = -1
        for report_future in report_futures:
            res = report_future.result()
            if res.accepted:
                accepted = True
            if res.version > max_version:
                max_version = res.version
        self._timing.end_record_time("report_gradient")
        return accepted, max_version
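
deduplicate_indexed_slices sums the value rows that share an index so each embedding id is shipped to the PS at most once. A minimal sketch of such a deduplication with standard TensorFlow ops; this is one plausible implementation, not necessarily ElasticDL's own:

    import tensorflow as tf

    def deduplicate_indexed_slices_sketch(values, indices):
        # Map each occurrence of an index to its slot in the unique set,
        # then sum the corresponding value rows per unique index.
        unique_indices, positions = tf.unique(indices)
        summed = tf.math.unsorted_segment_sum(
            values, positions, tf.shape(unique_indices)[0]
        )
        return summed, unique_indices

    values = tf.constant([[1.0, 1.0], [2.0, 2.0], [3.0, 3.0]])
    indices = tf.constant([3, 1, 3])
    v, i = deduplicate_indexed_slices_sketch(values, indices)
    # i -> [3, 1]; v -> [[4., 4.], [2., 2.]]
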
Example #7
    def push_gradients(self, request, _):
        res = elasticdl_pb2.PushGradientsResponse()
        if self._use_async:
            grad_vars = []

            for name, pb in request.gradients.dense_parameters.items():
                grad = pb_to_ndarray(pb)
                self._parameters.check_grad(Tensor(name, grad, None))
                grad = tf.constant(grad)
                var = self._parameters.get_non_embedding_param(name)
                grad_vars.append((grad, var))

            for name, pb in request.gradients.embedding_tables.items():
                grad = pb_to_indexed_slices(pb)
                self._parameters.check_grad(
                    Tensor(name, grad.values, grad.indices))
                if name in self._parameters.non_embedding_params:
                    var = self._parameters.get_non_embedding_param(name)
                    grad_vars.append((grad, var))
                else:
                    grad_vars.append((grad, name))

            learning_rate = request.learning_rate
            # TODO: if request.learning_rate == 0.0, modulate learning_rate
            #       in self._optimizer with staleness
            if self._lr_staleness_modulation and learning_rate > 0.0:
                staleness = max(
                    1, self._parameters.version - request.gradients.version)
                # Modulate learning rate by staleness
                learning_rate /= staleness

            self._set_optimizer_learning_rate(learning_rate)
            self._optimizer.apply_gradients(grad_vars)
            with self._version_lock:
                self._parameters.version += 1
                self._save_params_to_checkpoint_if_needed()
                version = self._parameters.version
            self._report_version_if_needed(version)

            res.accepted = True
            res.version = self._parameters.version
            return res
        else:
            if (request.gradients.version <
                    self._parameters.version - self._sync_version_tolerance):
                res.accepted = False
                res.version = self._parameters.version
                return res

            with self._lock:
                for name, pb in request.gradients.dense_parameters.items():
                    grad = pb_to_ndarray(pb)
                    self._parameters.check_grad(Tensor(name, grad, None))
                    if name in self._grads_buffer:
                        self._grads_buffer[name] = (self._grads_buffer[name] +
                                                    grad)
                    else:
                        self._grads_buffer[name] = grad

                for name, pb in request.gradients.embedding_tables.items():
                    grad = pb_to_indexed_slices(pb)
                    self._parameters.check_grad(
                        Tensor(name, grad.values, grad.indices))
                    if name in self._grads_buffer:
                        self._grads_buffer[name] = merge_indexed_slices(
                            self._grads_buffer[name], grad)
                    else:
                        self._grads_buffer[name] = grad

                self._grads_n += 1
                res.accepted = True

                updated_version = False
                version = self._parameters.version
                if self._grads_n == self._grads_to_wait:
                    grad_vars = []
                    for name, grad in self._grads_buffer.items():
                        # Dense gradients are averaged,
                        # while sparse gradients are summed
                        if not isinstance(grad, tf.IndexedSlices):
                            grad = grad / self._grads_to_wait
                            grad = tf.constant(grad)
                        var = self._parameters.get_non_embedding_param(name)
                        if var is None:
                            grad_vars.append((grad, name))
                        else:
                            grad_vars.append((grad, var))

                    self._set_optimizer_learning_rate(request.learning_rate)
                    self._optimizer.apply_gradients(grad_vars)
                    self._grads_n = 0
                    self._grads_buffer.clear()
                    self._parameters.version += 1
                    self._save_params_to_checkpoint_if_needed()
                    version = self._parameters.version
                    updated_version = True

            if updated_version:
                self._report_version_if_needed(version)
            res.version = version
            return res
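
In the synchronous branch above, sparse gradients from successive pushes are accumulated with merge_indexed_slices before the optimizer step. A minimal sketch of such a merge as a plain row-wise concatenation of values and indices, which is presumably what the helper does; duplicate indices are simply kept in the merged slices:

    import tensorflow as tf

    def merge_indexed_slices_sketch(a, b):
        # Concatenate two IndexedSlices row-wise; duplicate indices are kept.
        return tf.IndexedSlices(
            tf.concat([a.values, b.values], axis=0),
            tf.concat([a.indices, b.indices], axis=0),
        )

    a = tf.IndexedSlices(tf.ones((2, 4)), tf.constant([0, 8]))
    b = tf.IndexedSlices(tf.ones((1, 4)), tf.constant([8]))
    merged = merge_indexed_slices_sketch(a, b)
    # merged.indices -> [0, 8, 8]; merged.values has shape (3, 4)
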
Example #8
    def push_gradients(
        self, grads, edl_grads, learning_rate, model_versions,
    ):
        """
        Push gradients to PS. There are two kinds of gradients:
         - gradients of normal layers
         - sparse gradients of ElasticDL embedding layers
        """
        reqs = [
            elasticdl_pb2.PushGradientsRequest() for i in range(self.ps_num)
        ]
        ps_grads = {}

        # 1. handle grads
        for grad in grads:
            ps_id = self.parameter_to_ps[grad.name]
            if ps_id not in ps_grads:
                ps_grads[ps_id] = {grad.name: grad}
            else:
                if grad.name not in ps_grads[ps_id]:
                    ps_grads[ps_id][grad.name] = grad
                else:
                    if grad.indices is not None:
                        ps_grads[ps_id][grad.name] = merge_indexed_slices(
                            ps_grads[ps_id][grad.name], grad
                        )
                    else:
                        ps_grads[ps_id][grad.name].values += grad.values

        for ps_id, pair in ps_grads.items():
            for name, grad in pair.items():
                if grad.indices is not None:
                    v, i = deduplicate_indexed_slices(
                        grad.values, grad.indices
                    )
                    ps_grads[ps_id][name] = Tensor(None, v, i)

        for ps_id in ps_grads:
            req = reqs[ps_id]
            for name, grad in ps_grads[ps_id].items():
                # Keras embedding layer has a dense parameter,
                # but an indexed slices type gradient
                if grad.indices is not None:
                    serialize_indexed_slices(
                        Tensor(None, grad.values, grad.indices),
                        req.gradients.embedding_tables[name],
                    )
                else:
                    serialize_ndarray(
                        grad.values, req.gradients.dense_parameters[name]
                    )

        # 2. handle sparse grads of elasticdl embedding layers
        groups = {}
        for grad in edl_grads:
            if grad.name not in groups:
                groups[grad.name] = grad
            else:
                groups[grad.name] = merge_indexed_slices(
                    groups[grad.name], grad
                )

        # Sum up the values of the duplicated indices in the
        # gradients. It can reduce the gradient payload of the
        # dense embedding.
        for name, grad in groups.items():
            v, i = deduplicate_indexed_slices(grad.values, grad.indices)
            groups[name] = Tensor(None, v, i)

            results = scatter_embedding_vector(
                groups[name].values, groups[name].indices, self.ps_num
            )

            for ps_id in results:
                req = reqs[ps_id]
                gv, gi = results[ps_id]
                serialize_indexed_slices(
                    Tensor(None, gv, gi), req.gradients.embedding_tables[name],
                )

        # 3. push gradients to PS
        report_futures = []
        for ps_id in range(self.ps_num):
            req = reqs[ps_id]
            req.gradients.version = model_versions[ps_id]
            req.learning_rate = learning_rate
            report_future = self.ps_stubs[ps_id].push_gradients.future(req)
            report_futures.append(report_future)

        accepted = False
        max_version = -1
        for report_future in report_futures:
            res = report_future.result()
            if res.accepted:
                accepted = True
            if res.version > max_version:
                max_version = res.version
        return accepted, max_version
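
scatter_embedding_vector splits one deduplicated (values, indices) pair into per-PS shards so that each PS only receives the embedding rows it owns. A numpy sketch assuming rows are routed by id modulo ps_num; the routing rule and the sketch's name are assumptions, and ElasticDL may partition differently:

    import numpy as np

    def scatter_embedding_vector_sketch(values, indices, ps_num):
        # Route each embedding row to the PS assumed to own its id.
        results = {}
        for ps_id in range(ps_num):
            mask = indices % ps_num == ps_id
            if np.any(mask):
                results[ps_id] = (values[mask], indices[mask])
        return results

    values = np.arange(8, dtype=np.float32).reshape(4, 2)
    indices = np.array([0, 3, 5, 8])
    shards = scatter_embedding_vector_sketch(values, indices, ps_num=2)
    # ps 0 receives ids [0, 8]; ps 1 receives ids [3, 5]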