Example No. 1
    def _get_aggregated_dense_grad(self, graph_item, grad_name,
                                   reduce_to_device, BFTaggregator):
        grad_op_name = strip_replica_prefix(get_op_name(grad_name))
        output_idx = get_index_from_tensor_name(grad_name)
        grad_ops = [
            graph_item.graph.get_operation_by_name(
                ops.prepend_name_scope(grad_op_name, replica_prefix(i)))
            for i in range(self.num_replicas)
        ]

        # Aggregate gradients on `reduce_to_device` (usually CPU)
        with ops.device(reduce_to_device):
            #print("@@@@@@@@@@@@@@",[grad_op.outputs[output_idx] for grad_op in grad_ops])
            '''
            grad_sum_op_name = ops.prepend_name_scope(grad_op_name, u"%sAdd" % AUTODIST_PREFIX)
            grad_sum = math_ops.add_n([grad_op.outputs[output_idx] for grad_op in grad_ops], name=grad_sum_op_name)
            grad_avg_op_name = ops.prepend_name_scope(grad_op_name, u"%sDiv" % AUTODIST_PREFIX)
            grad_avg = math_ops.realdiv(grad_sum, self.num_replicas, name=grad_avg_op_name)
            '''

            # BFT Aggregator
            gradients = [grad_op.outputs[output_idx] for grad_op in grad_ops]
            grad_avg = BFTaggregator.aggregate(gradients)

            #print("$$$$$$$$$$$$$$",grad_avg)

        return grad_avg
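
The snippet above delegates the actual reduction to `BFTaggregator.aggregate(gradients)`, which is not shown here. As a rough, hypothetical illustration of what a Byzantine-fault-tolerant rule can look like (this is an assumption, not the aggregator used above), the following standalone sketch replaces the plain mean with a coordinate-wise median over the replica gradients:

import tensorflow as tf

class MedianAggregator:
    """Hypothetical coordinate-wise-median aggregator (illustration only)."""

    def aggregate(self, gradients):
        # `gradients` is a list of same-shaped dense tensors, one per replica.
        stacked = tf.stack(gradients, axis=0)   # shape: [num_replicas, ...]
        sorted_vals = tf.sort(stacked, axis=0)  # sort along the replica axis
        n = len(gradients)
        if n % 2 == 1:
            return sorted_vals[n // 2]
        return (sorted_vals[n // 2 - 1] + sorted_vals[n // 2]) / 2.0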
Example No. 2
        def build_and_run_model():
            def my_net(x):
                return gen_sendrecv_ops.ipu_send_to_host(
                    x,
                    tensor_name="test_tensor",
                    send_device="/device:IPU:0",
                    send_device_incarnation=0,
                    recv_device="/device:CPU:0")

            v = array_ops.placeholder(np.float32, shape=())
            with ipu.scopes.ipu_scope("/device:IPU:0"):
                send_op = ipu.ipu_compiler.compile(my_net, inputs=[v])
            with ops.device("/device:CPU:0"):
                recv_op = gen_sendrecv_ops.ipu_recv_at_host(
                    T=np.float32,
                    tensor_name="test_tensor",
                    send_device="/device:IPU:0",
                    send_device_incarnation=0,
                    recv_device="/device:CPU:0")

            with session.Session() as sess:
                report = ReportJSON(self, sess)
                _, received = sess.run([send_op, recv_op], feed_dict={v: 1.0})
                events = report.get_event_trace(sess)
                return received, events
Example No. 3
    def get_all_update_ops(self, grad_apply_finished, worker_device=None):
        """
        Create and return new update ops for proxy vars.

        Args:
            grad_apply_finished (List[Operation]): ops with which to colocate the new ops.
            worker_device (DeviceSpecV2): the device on which to create the ops.

        Returns:
            List[Operation]: the list of update ops for each proxy variable.
        """
        with ops.device(worker_device):
            with ops.control_dependencies(grad_apply_finished):
                updated_value = gen_read_var_op(
                    self._this_op, self._dtype)  # create new read var op
        update_ops = []
        for proxy_var in self._proxy_vars:
            with ops.device(proxy_var.device):
                update_ops.append(proxy_var.assign(updated_value))
        return update_ops
Example No. 4
def _MyNet():
    # `ndims`, `M`, `N` and `K` are module-level constants defined elsewhere in the original test.
    with variable_scope.variable_scope("vs", use_resource=True):
        with ops.device("cpu"):
            inp = array_ops.placeholder(np.float32,
                                        [1] + [24] * ndims + [M * K],
                                        name="input")
            bias = array_ops.placeholder(np.float32, [N * K], name="bias")
        with ops.device("/device:IPU:0"):
            weights = variable_scope.get_variable("weights",
                                                  [8] * ndims + [M, N * K])
            output = nn.convolution(inp,
                                    weights,
                                    strides=[1] + [4] * ndims + [1],
                                    padding="VALID",
                                    name='cnv')
            output = nn.bias_add(output, bias, name='bias_add')
            loss = math_ops.reduce_sum(math_ops.square(output))
            optimizer = gradient_descent.GradientDescentOptimizer(0.0005)
            train = optimizer.minimize(loss)
            return train, loss, inp, bias
Example No. 5
    def get_training_loss_and_op(self, compiled_training_loop):
        with ops.device(_HOST_DEVICE):
            with ops.control_dependencies([compiled_training_loop]):
                loss = self._outfeed_queue.dequeue()

            # Reduce loss over all dimensions (i.e. batch_size, gradient_accumulation_count)
            loss = math_ops.reduce_mean(math_ops.cast(loss, dtypes.float32))

        train_op = compiled_training_loop

        return loss, train_op
Example No. 6
    def begin(self):
        if not self._outfeed.enqueued:
            raise RuntimeError("This logging hook's outfeed was not enqueued. "
                               "Did you forget to call the log function?")

        assert self._dequeue_op is None
        assert self._deleter_op is None

        with ops.device("cpu"):
            self._dequeue_op = self._outfeed.dequeue()
            self._deleter_op = self._outfeed.deleter

        self._iter_count = 0
Example No. 7
    def get_predictions(self, compiled_prediction_loop):
        with ops.device(_HOST_DEVICE):
            with ops.control_dependencies([compiled_prediction_loop]):
                predictions = self._outfeed_queue.dequeue()

        if isinstance(predictions, dict):
            return predictions

        assert isinstance(predictions, list)
        if len(predictions) != 1:
            raise ValueError((
                "The last computational stage must return exactly one prediction "
                "tensor, but got {}").format(len(predictions)))

        return predictions[0]
Example No. 8
    def _aggregate_sparse_gradients(self, var_op, reduce_to_device,
                                    indexed_slices_grads, values_op_name):
        with ops.device(reduce_to_device):
            grad_accum_op_name = ops.prepend_name_scope(
                values_op_name, u"%sAccum" % AUTODIST_PREFIX)
            grad_accum = data_flow_ops.SparseConditionalAccumulator(
                dtype=indexed_slices_grads[0].values.dtype,
                shape=var_op.outputs[0].shape,
                shared_name=grad_accum_op_name,
                name=grad_accum_op_name)
            accum_apply_ops = [
                grad_accum.apply_indexed_slices_grad(
                    indexed_slices_grads[i],
                    MAX_INT64,
                    name=ops.prepend_name_scope(
                        values_op_name, u"%s-Accum-Apply" % replica_prefix(i)))
                for i in range(self.num_replicas)
            ]
            take_grad_op_name = ops.prepend_name_scope(
                values_op_name, u"%sTake-Grad" % AUTODIST_PREFIX)
            with ops.control_dependencies(accum_apply_ops):
                take_grad = grad_accum.take_indexed_slices_grad(
                    self.num_replicas, name=take_grad_op_name)

            new_indices = take_grad.indices
            new_values = take_grad.values
            new_dense_shape = take_grad.dense_shape
            if indexed_slices_grads[0].indices.dtype != new_indices.dtype:
                new_indices = math_ops.cast(
                    new_indices,
                    indexed_slices_grads[0].indices.dtype,
                    name=ops.prepend_name_scope(
                        values_op_name,
                        u"%sTake-Grad-Cast-Indices" % AUTODIST_PREFIX))
            if indexed_slices_grads[0].dense_shape.dtype != new_dense_shape.dtype:
                new_dense_shape = math_ops.cast(
                    new_dense_shape,
                    indexed_slices_grads[0].dense_shape.dtype,
                    name=ops.prepend_name_scope(
                        values_op_name,
                        u"%sTake-Grad-Cast-Shape" % AUTODIST_PREFIX))
        return ops.IndexedSlices(new_values, new_indices, new_dense_shape)
Example No. 9
    def get_evaluation_loss_and_metrics(self, compiled_evaluation_loop):
        with ops.device(_HOST_DEVICE):
            with ops.control_dependencies([compiled_evaluation_loop]):
                inputs = self._outfeed_queue.dequeue()

            args, kwargs = loops._body_arguments(inputs)  # pylint: disable=protected-access
            metrics = self._captured_eval_metrics_fn(*args, **kwargs)

        if not isinstance(metrics, dict):
            raise TypeError(("The `eval_metrics_fn` must return a dict, "
                             "but got {}.").format(type(metrics)))

        if model_fn_lib.LOSS_METRIC_KEY not in metrics:
            raise KeyError(
                ("The dict returned from `eval_metrics_fn` "
                 "must contain '{}'.").format(model_fn_lib.LOSS_METRIC_KEY))

        loss = metrics.pop(model_fn_lib.LOSS_METRIC_KEY)

        return loss, metrics
Example No. 10
    def _build_proxy_on(self, destination_device):
        """
        Build a proxy of the original variable on `destination_device`.

        Args:
            destination_device (DeviceSpecV2): the device on which to place the proxy.
        """
        is_gpu = (destination_device.device_type.upper() == 'GPU'
                  if destination_device.device_type else False)
        prefix = (replica_prefix(destination_device.device_index)
                  if is_gpu else replica_prefix('CPU'))
        with ops.device(destination_device):
            proxy_var = variable_scope.get_variable(
                ops.prepend_name_scope(self._this_op.name, prefix),
                dtype=self._dtype,
                initializer=self._initial_value,
                trainable=False)
        self._graph_item.info.update_variables(
            [proxy_var], replace=False)  # Should we update graph_item.info?
        self._proxy_vars.append(proxy_var)
        self._proxy_var_init_ops.append(
            proxy_var.assign(get_read_var_tensor(self._this_op)))
        self._mirror_all_read_var_ops()
        self._update_all_consumers()
Example No. 11
    def replicate(self, graph_item):
        """
        Replicate the entire graph as many times as num_replica.

        Args:
            graph_item: the original graph item

        Returns: The new graph item
        """
        item = GraphItem(graph=ops.Graph())
        fwd_ctx, bwd_ctx = self._collect_while_context(graph_item.graph)
        with item.graph.as_default():
            gdef = graph_item.graph.as_graph_def()
            for i in range(self._num_local_replicas):
                # Replicate ops
                with ops.device(self._replica_device_placer(replica_id=i)):
                    import_graph_def(gdef, name=replica_prefix(i))

                # Replicate while_loop context (control_flow) if needed.
                # The order matters -- We must replicate bwd context first, then forward context.
                # TODO(Zeya): To handle cases when there are nested while loops, in which we must replicate
                #  parent context first and then child context. See:
                #  https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/ops/control_flow_ops.py#L938
                if bwd_ctx:
                    for ctx in bwd_ctx:
                        _ = WhileContext(context_def=ctx.to_proto(), grad_state=ctx._grad_state,
                                         import_scope=replica_prefix(i))
                if fwd_ctx:
                    for ctx in fwd_ctx:
                        _ = WhileContext(context_def=ctx.to_proto(), grad_state=ctx._grad_state,
                                         import_scope=replica_prefix(i))

            # update saver
            master_replica = 0
            if graph_item.info.savers:
                item.info.update_savers(
                    [Saver.from_proto(proto, import_scope=replica_prefix(master_replica)).to_proto()
                        for proto in graph_item.info.savers],
                    replace=False
                )

            # update gradient info
            for i in range(self._num_local_replicas):
                for g_name, t_name in graph_item.grad_target_name_pairs.items():
                    if isinstance(g_name, tuple):
                        new_g_name = (
                            ops.prepend_name_scope(g_name[0], replica_prefix(i)),
                            ops.prepend_name_scope(g_name[1], replica_prefix(i)),
                            ops.prepend_name_scope(g_name[2], replica_prefix(i)))
                    else:
                        new_g_name = ops.prepend_name_scope(g_name, replica_prefix(i))
                    new_t_name = ops.prepend_name_scope(t_name, replica_prefix(i))
                    item.extend_gradient_info_by_names(
                        grads=[new_g_name],
                        targets=[new_t_name]
                    )
                item.info.update_variables(
                    [_from_proto_fn(proto, import_scope=replica_prefix(i)).to_proto()
                        for proto in graph_item.info.variables],
                    replace=False
                )
                item.info.update_table_initializers(
                    [ops.prepend_name_scope(tb_init, replica_prefix(i))
                        for tb_init in graph_item.info.table_initializers],
                    replace=False
                )
        return item
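
Examples 1, 8, 10 and 11 all build per-replica op names with `ops.prepend_name_scope`. As a quick illustration of the naming convention being manipulated (assuming `replica_prefix(i)` yields a plain string prefix such as "replica_0"; the prefix actually used by the library may differ), prepending a scope simply prefixes the op name:

from tensorflow.python.framework import ops

# prepend_name_scope(name, import_scope) returns `name` prefixed with the scope.
print(ops.prepend_name_scope("gradients/dense/MatMul_grad", "replica_0"))
# -> "replica_0/gradients/dense/MatMul_grad"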
Example No. 12
    def _get_accumulation_ops(graph_item, gradient, target,
                              num_accum_required):
        def _get_accum_apply_and_agg_grad(var_op, grad, indices, dense_shape):
            if indices is None:
                tensor = variable_utils.get_read_var_tensor(var_op)
                grad_accum = data_flow_ops.ConditionalAccumulator(
                    grad.dtype,
                    shape=tensor.get_shape(),
                    shared_name=var_op.name + "/grad_accum")
                # Get a copy of consumers list before creating accum_apply_op
                grad_consumers = list(grad.consumers())
                accum_apply_op = grad_accum.apply_grad(grad,
                                                       local_step=MAX_INT64,
                                                       name=grad.op.name +
                                                       '_accum_apply_grad')
                agg_grad = grad_accum.take_grad(num_accum_required,
                                                name=var_op.name +
                                                '_take_grad')
                update_consumers(grad_consumers, grad, agg_grad)
                update_control_consumers(get_control_consumers(grad.op),
                                         grad.op, agg_grad.op)
            else:
                grad_indexed_slices = ops.IndexedSlices(
                    values=grad, indices=indices, dense_shape=dense_shape)
                grad_accum = data_flow_ops.SparseConditionalAccumulator(
                    grad.dtype,
                    shape=grad.shape,
                    shared_name=var_op.name + "/grad_accum")
                # Get a copy of consumers list before creating accum_apply_op
                indices_consumers = list(indices.consumers())
                grad_consumers = list(grad.consumers())
                accum_apply_op = grad_accum.apply_indexed_slices_grad(
                    grad_indexed_slices,
                    local_step=MAX_INT64,
                    name=grad.op.name + '_accum_apply_grad')
                agg_grad = grad_accum.take_indexed_slices_grad(
                    num_accum_required, name=var_op.name + '_take_grad')
                agg_indices = agg_grad.indices
                if indices.dtype != agg_grad.indices.dtype:
                    agg_indices = math_ops.cast(agg_grad.indices,
                                                indices.dtype)
                agg_grad = ops.IndexedSlices(values=agg_grad.values,
                                             indices=agg_indices,
                                             dense_shape=agg_grad.dense_shape)
                assert isinstance(agg_grad, ops.IndexedSlices)
                update_consumers(indices_consumers, indices, agg_grad.indices)
                update_consumers(grad_consumers, grad, agg_grad.values)
                update_control_consumers(get_control_consumers(indices.op),
                                         indices.op, agg_grad.indices.op)
                update_control_consumers(get_control_consumers(grad.op),
                                         grad.op, agg_grad.values.op)
            return accum_apply_op, agg_grad

        # Aggregate gradients from different workers using ConditionalAccumulator.
        # var_op_to_agg_grad and var_op_to_accum_apply_op are updated.
        var_op_to_agg_grad = {}
        var_op_to_accum_apply_op = {}

        if target.op not in graph_item.trainable_var_op_to_var:
            logging.debug(
                "Gradient for non-trainable variable %s is created, "
                "do not insert accumulator for aggregating this gradient" %
                target.op.name)
            return {}, {}

        var_op = target.op
        if isinstance(gradient, ops.Tensor):
            grad = gradient
            indices = None
            dense_shape = None
        else:
            grad = gradient.values
            indices = gradient.indices
            dense_shape = gradient.dense_shape
        with ops.device(var_op.device), ops.name_scope(""):
            accum_apply_op, agg_grad = _get_accum_apply_and_agg_grad(
                var_op, grad, indices, dense_shape)
        if indices is None:
            var_op_to_agg_grad[var_op] = (None, agg_grad)
        else:
            var_op_to_agg_grad[var_op] = (agg_grad.indices, agg_grad.values)
        var_op_to_accum_apply_op[var_op] = accum_apply_op
        return var_op_to_agg_grad, var_op_to_accum_apply_op
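
Both this example and Example No. 8 rely on TensorFlow's conditional accumulators. As a minimal standalone sketch of the dense `ConditionalAccumulator` semantics used above (TF1 graph mode, with made-up constant "gradients"): `apply_grad` deposits a value, and `take_grad(num_required)` waits until that many values have been applied, then returns their mean.

import tensorflow.compat.v1 as tf

tf.disable_eager_execution()

accum = tf.ConditionalAccumulator(dtype=tf.float32, shape=())
apply_a = accum.apply_grad(tf.constant(1.0), local_step=0)
apply_b = accum.apply_grad(tf.constant(3.0), local_step=0)
take = accum.take_grad(num_required=2)  # mean of the two applied values

with tf.Session() as sess:
    sess.run([apply_a, apply_b])
    print(sess.run(take))  # 2.0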
Example No. 13
    def add_sync_op(self, graph_item, var_update_op, variable_replicator=None):
        """
        Adds additional ops needed for synchronous distributed training into current graph.

        Main purpose of additional ops are:
        1. Initialization
        2. Synchronization
        3. Gradient aggregation

        Args:
            graph_item (graph_item.GraphItem): the graph
            var_update_op: The op
            variable_replicator: The dictionary of master variable op name
                -> list of replicated variables, could be None

        Returns:
            None
        """
        this_worker_cpu = device_spec.DeviceSpecV2.from_string(
            self.worker_device)
        this_worker_cpu = this_worker_cpu.replace(device_type='CPU',
                                                  device_index=0)

        var_op = var_update_op.inputs[UPDATE_OP_VAR_POS].op
        is_trainable = var_op in graph_item.trainable_var_op_to_var
        source_op = self._get_optimizer_source_op(var_update_op)
        cc = get_control_consumers(source_op)

        with ops.device(var_op.device):
            if self._staleness == 0:
                queue_ops = self._get_queue_ops(var_update_op, source_op,
                                                self.is_chief, is_trainable)
            elif self._staleness > 0:
                queue_ops = self._get_queue_ops_stale(var_update_op, source_op,
                                                      self.is_chief,
                                                      is_trainable)
            else:
                raise ValueError(
                    "staleness should be greater than or equal to 0.")

            # Only dense trainable variables are replicated locally
            if variable_replicator:
                mirror_variable_update_ops = variable_replicator.get_all_update_ops(
                    queue_ops, worker_device=this_worker_cpu)
                with ops.device(this_worker_cpu):
                    finish_op = control_flow_ops.group(
                        *mirror_variable_update_ops)
            else:
                finish_op = control_flow_ops.group(*queue_ops)

        # Place computation ops of aggregated gradients on PS
        # Note that even though this is doing a graph traversal, it is called in such a way that it
        # only traverses from a gradient aggregator op to a gradient application op (or vice versa) --
        # these corresponding ops should always be adjacent in the graph.
        self._place_post_grad_agg_ops(
            device_spec.DeviceSpecV2.from_string(self.target_device),
            self._var_op_to_agg_grad,
            {var_op: var_update_op} if is_trainable else {})

        # Replace the control input of train_op to be finish_op
        # Note(Hao): this cc is stale, i.e. cc \subset get_control_consumers(source_op)
        update_control_consumers(cc, source_op, finish_op)
Example No. 14
    def test_pipelining(self):
        gradient_accumulation_count = 4
        local_batch_size = 2

        features = np.ones((1, 20), dtype=np.float32) * hvd.rank()
        labels = np.ones(1, dtype=np.int32) * hvd.rank()
        dataset = dataset_ops.Dataset.from_tensor_slices((features, labels))
        dataset = dataset.repeat().batch(local_batch_size, drop_remainder=True)

        loss_vals = []

        strategy = IPUHorovodStrategy()

        with strategy.scope():

            infeed_queue = ipu_infeed_queue.IPUInfeedQueue(dataset, "infeed")
            outfeed_queue = ipu_outfeed_queue.IPUOutfeedQueue("outfeed")

            def stage1(lr, images, labels):
                partial = keras.layers.Dense(32, activation="relu")(images)
                partial = keras.layers.Dense(16, activation="relu")(partial)
                return lr, partial, labels

            def stage2(lr, partial, labels):
                logits = keras.layers.Dense(10)(partial)
                per_example_loss = keras.losses.sparse_categorical_crossentropy(
                    y_true=labels, y_pred=logits, from_logits=True)
                # In a custom training loop, the optimiser does an allreduce *sum*, not
                # average, of the gradients across the distributed workers. Therefore
                # we want to divide the loss here by the *global* batch size, which is
                # done by the `tf.nn.compute_average_loss()` function.
                loss = nn.compute_average_loss(per_example_loss)
                return lr, loss

            def optimizer_function(lr, loss):
                optimizer = GradientDescentOptimizer(lr)
                return pipelining_ops.OptimizerFunctionOutput(optimizer, loss)

            def model(lr):
                pipeline_op = pipelining_ops.pipeline(
                    computational_stages=[stage1, stage2],
                    device_mapping=[0, 0],
                    gradient_accumulation_count=gradient_accumulation_count,
                    inputs=[lr],
                    infeed_queue=infeed_queue,
                    repeat_count=2,
                    outfeed_queue=outfeed_queue,
                    optimizer_function=optimizer_function,
                    name="Pipeline")
                return pipeline_op

            def compiled_model(lr):
                with ipu_scope("/device:IPU:0"):
                    return ipu_compiler.compile(model, inputs=[lr])

            with ops.device("cpu"):
                lr = array_ops.placeholder(np.float32, [])

            train_op = strategy.experimental_run_v2(compiled_model, args=[lr])

            _, per_worker_losses = outfeed_queue.dequeue()

            # Mean across the local `gradient_accumulation_count` batches:
            per_worker_loss = math_ops.reduce_mean(per_worker_losses)

            # Global mean across the distributed workers (since it is already
            # divided by the global batch size above, we do a sum here):
            global_loss = strategy.reduce(ReduceOp.SUM, per_worker_loss)

            config = ipu_utils.create_ipu_config()
            config = ipu_utils.auto_select_ipus(config, num_ipus=1)
            ipu_utils.configure_ipu_system(config)
            ipu_utils.move_variable_initialization_to_cpu()

            with session.Session() as sess:
                sess.run(infeed_queue.initializer)
                sess.run(variables.global_variables_initializer())

                for _ in range(10):
                    sess.run(train_op, {lr: 0.01})
                    global_loss_val = sess.run(global_loss)

                    if loss_vals:
                        # Check that the loss decreases monotonically.
                        self.assertLess(global_loss_val, loss_vals[-1])
                    loss_vals.append(global_loss_val)

                sess.run(infeed_queue.deleter)
                sess.run(outfeed_queue.deleter)

                # Check all variables are equal across workers.
                for variable in variables.global_variables():
                    self.assertAllRanksEqual(variable.eval(), variable.name)