Python ipu_session示例，tensorflow.compiler.plugin.poplar.tests.test_utils.ipu_session Python示例

示例#1

0

显示文件

    def testSingleFunctionElided(self):
        """A single `ipu.function` call ends up inlined into the entry computation."""
        with tu.ipu_session() as session:

            @ipu.function
            def func(a):
                return nn.relu(a)

            def body(a):
                return func(a)

            with ops.device('cpu'):
                a = array_ops.placeholder(np.float16, [64, 64])

            with ipu.scopes.ipu_scope("/device:IPU:0"):
                compiled = ipu.ipu_compiler.compile(body, inputs=[a])

            tu.move_variable_initialization_to_cpu()
            session.run(variables.global_variables_initializer())

            report = tu.ReportJSON(self, session)
            feed = {a: np.ones(a.shape)}
            outputs = session.run(compiled, feed)
            # relu(1) == 1 for every element.
            expected = np.broadcast_to(1.0, [64, 64])
            self.assertAllClose(outputs[0], expected)

            report.parse_log()

            allowed_compute_sets = ['Relu/relu*/Nonlinearity', '__seed']
            report.assert_all_compute_sets_and_list(allowed_compute_sets)

            # With the function inlined, the entry computation is the only
            # one reported.
            computation_count = len(report.tensor_map.computation_names())
            self.assertEqual(computation_count, 1)

示例#2

0

显示文件

    def testNoGradient(self):
        """Differentiate through an `ipu.function` whose custom gradient is None for one input."""
        with tu.ipu_session() as session:

            @ipu.function
            def func(lhs, rhs):
                @custom_gradient.custom_gradient
                def f(a, b):
                    # No gradient for the first input; `dy - b` for the second.
                    def grad(dy):
                        return [None, dy - b]

                    return a, grad

                return f(lhs, rhs)

            def body(a):
                with variable_scope.variable_scope("vs", use_resource=True):
                    w0 = variable_scope.get_variable(
                        "w0",
                        shape=[64, 64],
                        dtype=np.float32,
                        initializer=init_ops.ones_initializer())
                out = func(a, w0)
                return gradients_impl.gradients(out, [w0])

            with ops.device('cpu'):
                a = array_ops.placeholder(np.float32, [64, 64])

            with ipu.scopes.ipu_scope("/device:IPU:0"):
                res = ipu.ipu_compiler.compile(body, inputs=[a])

            tu.move_variable_initialization_to_cpu()
            session.run(variables.global_variables_initializer())

            feed = {a: np.ones(a.shape)}
            result = session.run(res, feed)
            # The gradient w.r.t. w0 is expected to be all zeros.
            expected = np.broadcast_to(0., [64, 64])
            self.assertAllClose(result[0], expected)

示例#3

0

显示文件

    def testUserOpLoadLibraryWithWrongApiLevel(self):
        """Expect `InternalError` when using `libwrong_api_level_custom.so` as a user op.

        Judging by the test and library names, the shared library declares
        an unsupported API level. The library path is resolved relative to
        the current working directory with `os.path.join` rather than by
        string concatenation.
        """
        with tu.ipu_session() as sess:
            outputs = {
                "output_types": [dtypes.float32],
                "output_shapes": [
                    tensor_shape.TensorShape([20]),
                ],
            }
            lib_path = os.path.join(
                os.getcwd(),
                "tensorflow/python/ipu/libwrong_api_level_custom.so")

            def my_net(x):
                return ipu.custom_ops.precompiled_user_op([x],
                                                          lib_path,
                                                          outs=outputs)

            # Graph construction and execution are both inside the
            # assertion, so the error may be raised at either stage.
            with self.assertRaises(errors_impl.InternalError):
                with ipu.scopes.ipu_scope('/device:IPU:0'):
                    x = array_ops.placeholder(np.float32, shape=[20])
                    model = ipu.ipu_compiler.compile(my_net, inputs=[x])

                sess.run(variables.global_variables_initializer())
                sess.run(model, {
                    x: np.ones([20]),
                })

示例#4

0

显示文件

  def testReplicationNormaliseNotInplace(self):
    """Run `ipu_replication_normalise` where its input is also used afterwards.

    The input `x` is consumed both by the normalise op and by the following
    add; per the test name this is presumably what stops the op from being
    executed in place (confirm against the op implementation).
    """
    with ops.device("/device:IPU:0"):
      x = array_ops.placeholder(np.float32, shape=[1, 4, 4, 2])
      a = gen_poputil_ops.ipu_replication_normalise(x)
      # `x` is reused here, after it has been fed to the normalise op.
      b = a + x

    with tu.ipu_session() as sess:
      report = tu.ReportJSON(self, sess, replicated=True)
      sess.run(variables.global_variables_initializer())

      report.reset()

      res = sess.run(b, {x: np.ones([1, 4, 4, 2])})
      # Feeding ones is expected to give 1.5 everywhere, i.e. the normalised
      # value is 0.5 (presumably a replication factor of 2 - TODO confirm).
      self.assertAllClose(res, np.full([1, 4, 4, 2], 1.5))
      report.parse_log()

      # Compute-set name patterns checked against the report.
      ok = [
          '__seed*',
          'IpuReplicationNormalise/replication-normalise*/replication_normalise/Op/Divide',
          'switchControlBroadcast*/GlobalPre/Copy/OnTileCopy',
          '/OnTileCopy',
          'Copy_XLA_Args*OnTileCopy',
          'add/add*/AddTo',
      ]
      report.assert_all_compute_sets_and_list(ok)

示例#5

0

显示文件

    def testUserOpLoadNonExistentSharedLibrary(self):
        """Expect `NotFoundError` when the user-op shared library path does not exist.

        The path is built with `os.path.join` relative to the current
        working directory rather than by string concatenation.
        """
        with tu.ipu_session() as sess:
            outputs = {
                "output_types": [dtypes.float32],
                "output_shapes": [
                    tensor_shape.TensorShape([20]),
                ],
            }
            # Deliberately bogus file name.
            lib_path = os.path.join(
                os.getcwd(), "and-now-for-something-completely-different.so")

            def my_net(x):
                return ipu.custom_ops.precompiled_user_op([x],
                                                          lib_path,
                                                          outs=outputs)

            # Graph construction and execution are both inside the
            # assertion, so the error may be raised at either stage.
            with self.assertRaises(errors_impl.NotFoundError):
                with ipu.scopes.ipu_scope('/device:IPU:0'):
                    x = array_ops.placeholder(np.float32, shape=[20])
                    model = ipu.ipu_compiler.compile(my_net, inputs=[x])

                sess.run(variables.global_variables_initializer())
                sess.run(model, {
                    x: np.ones([20]),
                })

示例#6

0

显示文件

    def testPipelineIterationsNotMultiple(self):
        """A three-stage grouped pipeline built with the value 10 must fail at run time.

        The expected error message states that the pipeline depth must be a
        multiple of 3.
        """
        dataset = tu.create_single_increasing_dataset(5, shape=[4, 4, 2])
        dataset = dataset.batch(batch_size=2, drop_remainder=True)

        def dataset_parser(value):
            # "a" is the raw value, "b" a shifted and halved copy of it.
            return {"a": value, "b": (value + 10.) / 2.0}

        dataset = dataset.map(dataset_parser)
        infeed_queue = ipu_infeed_queue.IPUInfeedQueue(dataset, "__feed1")
        outfeed_queue = ipu_outfeed_queue.IPUOutfeedQueue("__feed1")

        def stage1(c, **kwargs):
            with variable_scope.variable_scope("vs", use_resource=True):
                conv_layer = layers.Conv2D(
                    2,
                    1,
                    use_bias=True,
                    kernel_initializer=init_ops.ones_initializer(),
                    name='conv1')
                y = conv_layer(kwargs["a"])
                return y + kwargs["b"], c

        def stage2(x, c):
            return math_ops.reduce_sum(x) + c

        def stage3(x):
            return x

        def my_net(c):
            schedule = pipelining_ops.PipelineSchedule.Grouped
            return pipelining_ops.pipeline([stage1, stage2, stage3],
                                           10,
                                           inputs=[c],
                                           infeed_queue=infeed_queue,
                                           outfeed_queue=outfeed_queue,
                                           pipeline_schedule=schedule)

        with ops.device('cpu'):
            c = array_ops.placeholder(np.float32, shape=[])

        with tu.ipu_session() as session:

            with ops.device("/device:IPU:0"):
                compiled = ipu_compiler.compile(my_net, inputs=[c])

            ipu_config = utils.create_ipu_config(profiling=True,
                                                 profile_execution=True)
            ipu_config = utils.auto_select_ipus(ipu_config, 4)
            utils.configure_ipu_system(ipu_config)
            utils.move_variable_initialization_to_cpu()

            session.run(variables.global_variables_initializer())
            session.run(infeed_queue.initializer)

            expected_message = (
                'The pipeline depth of the pipeline must be a multiple of 3')
            with self.assertRaisesRegex(errors.FailedPreconditionError,
                                        expected_message):
                session.run(compiled, {c: 10.01})

示例#7

0

显示文件

  def testResnetLike(self):
    """Check the ML-type classification counts for a small pipelined resnet."""

    def stage1(img, label):
      with variable_scope.variable_scope("stage1", use_resource=True):
        x = conv(img, 7, 2, 16)
        x = nn.relu(x)
        x = max_pool(x, ksize=3, stride=2)
        return x, label

    def stage2(x, label):
      with variable_scope.variable_scope("stage2", use_resource=True):
        x = block("b", 2, 64, 1, x)
        return x, label

    def stage3(x, label):
      with variable_scope.variable_scope("stage3", use_resource=True):
        x = math_ops.reduce_mean(x, axis=[1, 2])
        logits = fc(x, 100)
        xent = nn.sparse_softmax_cross_entropy_with_logits(logits=logits,
                                                           labels=label)
        return math_ops.reduce_mean(xent)

    def optimizer_function(loss):
      sgd = gradient_descent.GradientDescentOptimizer(0.01)
      return pipelining_ops.OptimizerFunctionOutput(sgd, loss)

    outfeed_queue = ipu_outfeed_queue.IPUOutfeedQueue(next_feed_id())

    # NOTE(review): the original comment here read "Run the pipeline twice",
    # but nothing visible in this test runs it twice - confirm the intent.
    # The second argument `lr` receives the label placeholder.
    def model_pipeline(x, lr):
      stages = [stage1, stage2, stage3]
      return pipelining_ops.pipeline(stages,
                                     12,
                                     inputs=[x, lr],
                                     outfeed_queue=outfeed_queue,
                                     optimizer_function=optimizer_function)

    with ops.device('cpu'):
      image_ph = array_ops.placeholder(np.float32, shape=[1, 4, 4, 2])
      label_ph = array_ops.placeholder(np.int32, shape=[1])

    with tu.ipu_session() as session:

      with ops.device("/device:IPU:0"):
        compiled_model_pipeline = ipu_compiler.compile(
            model_pipeline, inputs=[image_ph, label_ph])

      tu.move_variable_initialization_to_cpu()
      # The dequeue op is created but its result is not used.
      outfeed_queue.dequeue()

      report = tu.ReportJSON(self, session, pipelining=True)
      session.run(variables.global_variables_initializer())
      report.reset()
      feed = {image_ph: np.ones(image_ph.shape), label_ph: [1]}
      session.run(compiled_model_pipeline, feed)
      report.parse_log()

      # 1 conv in stage1, 2 conv in stage2, 1 matmul in stage3 = 4
      expected_counts = [0, 4, 3, 4]
      self.assertAllEqual(report.get_ml_type_counts(), expected_counts)

示例#8

0

显示文件

    def testUserOpMetadata(self):
        """Run a three-output precompiled user op and check its outputs and gradients."""
        with tu.ipu_session() as session:
            outputs = {
                "output_types": [dtypes.float32] * 3,
                "output_shapes": [
                    tensor_shape.TensorShape(dims)
                    for dims in ([20], [5, 2], [10])
                ],
            }

            lib_path = os.path.join(
                os.getcwd(),
                "tensorflow/python/ipu/libadd_incrementing_custom_with_metadata.so"
            )

            def my_net(x, y, z):
                output = ipu.custom_ops.precompiled_user_op([x, y, z],
                                                            lib_path,
                                                            outs=outputs)

                sgd = gradient_descent.GradientDescentOptimizer(
                    learning_rate=0.1)

                # Only the third output is differentiated.
                grads_and_vars = sgd.compute_gradients(output[2], [x, y, z])

                return [output, grads_and_vars]

            with ipu.scopes.ipu_scope('/device:IPU:0'):
                x = array_ops.placeholder(np.float32, shape=[20])
                y = array_ops.placeholder(np.float32, shape=[5, 2])
                z = array_ops.placeholder(np.float32, shape=[10])

                model = ipu.ipu_compiler.compile(my_net, inputs=[x, y, z])

            session.run(variables.global_variables_initializer())
            feed = {
                x: np.ones([20]),
                y: np.ones([5, 2]),
                z: np.ones([10]),
            }
            res = session.run(model, feed)

            # Forward results: inputs of ones come back as 2, 3 and 4.
            op_outputs = res[0]

            self.assertAllEqual(np.full([20], 2.0), op_outputs[0])
            self.assertAllEqual(np.full([5, 2], 3.0), op_outputs[1])
            self.assertAllEqual(np.full([10], 4.0), op_outputs[2])

            grads = res[1]

            # The "loss" is just the third output (output[2]), so only the
            # third input is expected to have a non-zero gradient (3); the
            # other two gradients are zero.
            self.assertAllEqual(np.zeros([20]), grads[0][0])
            self.assertAllEqual(np.zeros([5, 2]), grads[1][0])
            self.assertAllEqual(np.full([10], 3.0), grads[2][0])

示例#9

0

显示文件

    def testUserReadWriteOpBackwardsUnusedGradients(self):
        """Train through a user op where only inputs 0 and 2 are marked as having gradients."""
        SIZE = 5

        def scaled_add_op(x, scale, y):
            ipu_dir = os.path.join(os.getcwd(), "tensorflow/python/ipu")
            codelet_path = os.path.join(
                ipu_dir, "tests/add_scaled_vector_add_codelet.cc")
            library_path = os.path.join(ipu_dir,
                                        "libadd_partial_gradients_custom.so")
            op_outputs = {
                "output_types": [dtypes.float32],
                "output_shapes": [tensor_shape.TensorShape([SIZE])],
            }

            # Two extra derived inputs are passed in addition to x, scale, y.
            op_inputs = [x, scale, y, math_ops.cos(x), math_ops.cosh(y)]

            return ipu.custom_ops.precompiled_user_op(
                op_inputs,
                library_path,
                codelet_path,
                outs=op_outputs,
                inputs_with_gradients=[0, 2])

        def model(scale, y, label):
            with variable_scope.variable_scope("vs", use_resource=True):
                x = variable_scope.get_variable(
                    "x",
                    shape=[SIZE],
                    initializer=init_ops.ones_initializer(),
                    dtype=np.float32)
            z = math_ops.reduce_mean(scaled_add_op(x, scale, y), axis=1)
            loss = losses.mean_squared_error(label, z)
            train_op = gradient_descent.GradientDescentOptimizer(
                0.01).minimize(loss)
            return loss, train_op

        with ipu.scopes.ipu_scope('/device:IPU:0'):
            scale_data = array_ops.placeholder(np.float32, [])
            y_data = array_ops.placeholder(np.float32, [SIZE])
            label_data = array_ops.placeholder(np.int32, [1])

            xla_result = ipu.ipu_compiler.compile(
                model, [scale_data, y_data, label_data])

        with tu.ipu_session() as session:
            session.run(variables.global_variables_initializer())

            feed = {
                y_data: np.full([SIZE], 3),
                scale_data: 2,
                label_data: np.ones([1]),
            }
            result = session.run(xla_result, feed_dict=feed)

            # 36 == (7 - 1) ** 2, which is consistent with the op computing
            # x + scale * y = 7 - presumably; confirm against the codelet.
            self.assertEqual(result[0], 36)

示例#10

0

显示文件

    def testFunctionInferenceWithVariableScope(self):
        """Call the same `ipu.function` body twice under different variable scopes.

        Checks the numerical result, the compute sets present in the report,
        the tile memory figures and the number of computations.
        """
        with tu.ipu_session() as sess:

            def func(a, b, name):
                # `name` selects the variable scope, so each call site gets
                # its own weight variable "w".
                @ipu.function
                def outlined_func(a, b):
                    with variable_scope.variable_scope(name,
                                                       use_resource=True):
                        w = variable_scope.get_variable(
                            "w",
                            shape=[64, 64],
                            dtype=np.float32,
                            initializer=init_ops.ones_initializer())
                    x = math_ops.matmul(a, w)
                    x = x + b
                    return math_ops.sigmoid(x)

                return outlined_func(a, b)

            def body(a, b, c):
                a = func(a, b, name="one")
                a = a - func(a, c, name="two")
                return a

            with ops.device('cpu'):
                a = array_ops.placeholder(np.float32, [64, 64])
                b = array_ops.placeholder(np.float32, [64, 64])
                c = array_ops.placeholder(np.float32, [64, 64])

            with ipu.scopes.ipu_scope("/device:IPU:0"):
                res = ipu.ipu_compiler.compile(body, inputs=[a, b, c])

            tu.move_variable_initialization_to_cpu()
            sess.run(variables.global_variables_initializer())

            report = tu.ReportJSON(self, sess)
            # Every placeholder is fed with ones.
            result = sess.run(res, {x: np.ones(x.shape) for x in [a, b, c]})
            self.assertAllClose(result[0], np.broadcast_to(0., [64, 64]))

            report.parse_log()
            # There would be multiple non-linearities if the function was not
            # cached.
            ok = [
                'MatMul/dot*/Conv_1',
                'add/add*/Op/Add',
                'Sigmoid/sigmoid/Nonlinearity',
                'sub/subtract*/Op/Subtract',
                '__seed',
                'Copy_',
            ]
            report.assert_all_compute_sets_and_list(ok)
            # Exact memory figures expected from the report.
            report.assert_total_tile_memory(954492)
            report.assert_max_tile_memory(1690)

            # Entry computation and outlined one.
            self.assertEqual(len(report.tensor_map.computation_names()), 2)

示例#11

0

显示文件

    def testFifo(self):
        """Repeat a depth-5 fifo op three times and expect an all-zero result."""

        def my_net(x):
            def body(z):
                return ipu.internal_ops.fifo(z, 5)

            return ipu.loops.repeat(3, body, [x])

        with ipu.scopes.ipu_scope('/device:IPU:0'):
            x = array_ops.placeholder(np.float32, shape=[2])
            run_loop = ipu.ipu_compiler.compile(my_net, inputs=[x])

        with tu.ipu_session() as session:
            session.run(variables.global_variables_initializer())
            feed = {x: np.ones([2])}
            res = session.run(run_loop, feed)
            # Presumably the fed ones never emerge because the fifo depth (5)
            # exceeds the iteration count (3) - TODO confirm.
            self.assertAllClose(res, np.zeros([1, 2]))

示例#12

0

显示文件

    def testUserOpWithAllocate(self):
        """Run the "AllocTest" user op and check the tile mapping of its two inputs."""
        with tu.ipu_session() as session:
            outputs = {
                "output_types": [dtypes.float32],
                "output_shapes": [tensor_shape.TensorShape([128])],
            }

            lib_path = os.path.join(
                os.getcwd(),
                "tensorflow/python/ipu/libadd_incrementing_custom_with_metadata.so"
            )

            def my_net(x, y):
                return ipu.custom_ops.precompiled_user_op([x, y],
                                                          lib_path,
                                                          op_name="AllocTest",
                                                          outs=outputs)

            with ipu.scopes.ipu_scope('/device:IPU:0'):
                x = array_ops.placeholder(np.float32, shape=[128])
                y = array_ops.placeholder(np.float32, shape=[128])

                model = ipu.ipu_compiler.compile(my_net, inputs=[x, y])

            report = tu.ReportJSON(self, session)
            report.reset()

            session.run(variables.global_variables_initializer())
            feed = {
                x: np.ones([128]),
                y: np.ones([128]),
            }
            res = session.run(model, feed)

            report.parse_log()

            # Count how many of the two argument tensors were seen.
            matched = 0
            for tensor in report.get_tensor_map().all_tensors():
                if tensor.inst == "arg0.1":
                    # Allocator maps all of input 0 to tile 0
                    self.assertAllEqual(tensor.tile_ids(), [0])
                    matched += 1
                if tensor.inst == "arg1.2":
                    # Allocator leaves input 1 to be linearly mapped
                    self.assertAllEqual(tensor.tile_ids(), [0, 1, 2, 3])
                    matched += 1

            self.assertAllEqual(matched, 2)
            self.assertAllEqual(np.full([128], 2.0), res[0])

示例#13

0

显示文件

    def testRandomConstant(self):
        """Smoke test: a matmul plus a random-uniform tensor compiles and runs."""

        def my_net(x, w):
            noise = random_ops.random_uniform([2, 2])
            return math_ops.matmul(x, w) + noise

        with ipu.scopes.ipu_scope('/device:IPU:0'):
            x = array_ops.placeholder(np.float32, shape=[2, 3])
            w = array_ops.placeholder(np.float32, shape=[3, 2])
            run_loop = ipu.ipu_compiler.compile(my_net, inputs=[x, w])

        with tu.ipu_session() as session:
            session.run(variables.global_variables_initializer())
            # The result is ignored; the run only has to complete without
            # raising.
            feed = {x: np.ones([2, 3]), w: np.ones([3, 2])}
            session.run(run_loop, feed)

示例#14

0

显示文件

    def testRecomputeSuggestion(self):
        """Check the compute sets produced when a tensor is marked with `recompute`.

        The report is created with `allow_recompute=True`, and the expected
        list contains `.clone` variants of one of the adds.
        """
        def my_model(a):
            b = array_ops.constant(np.random.rand(5, 5),
                                   dtype=np.float32,
                                   name="W_ih")
            c = array_ops.constant(np.random.rand(5, 5),
                                   dtype=np.float32,
                                   name="W_ho")
            d = a + b
            ipu.internal_ops.print_tensor(d)  # block some optimisation
            e = d + c
            ipu.internal_ops.print_tensor(e)  # block some optimisation
            # `e` is the tensor marked for recomputation; it is used three
            # times below (twice in `g`, once in `output`).
            f = ipu.internal_ops.recompute(e)
            g = f + f
            ipu.internal_ops.print_tensor(g)  # block some optimisation
            output = g + f

            return [output]

        with ops.device("cpu"):
            inp = array_ops.placeholder(np.float32, [5, 5], name="a")

        with ipu.scopes.ipu_scope("/device:IPU:0"):
            out = ipu.ipu_compiler.compile(my_model, inputs=[inp])

        with tu.ipu_session() as sess:
            report = tu.ReportJSON(self,
                                   sess,
                                   replicated=False,
                                   allow_recompute=True)
            sess.run(variables.global_variables_initializer())

            report.reset()
            sess.run(out, {inp: np.ones([5, 5])})
            report.parse_log()

            # 5 adds in a graph that only defined 4
            # NOTE(review): the list below names six Add compute sets
            # (add_1 appears three times) - confirm the count in the comment.
            ok = [
                '__seed*',
                'add_1/add.1/Op/Add',
                'add_2/add.10/Op/Add',
                'add_1/add.1.clone.1/Op/Add',
                'add/add.4/Op/Add',
                'add_1/add.1.clone/Op/Add',
                'add_3/add.12/Op/Add',
            ]
            report.assert_all_compute_sets_and_list(ok)

示例#15

0

显示文件

  def testDuplicateInputsOutputs(self):
    """Pipeline stages that duplicate their outputs still yield the original inputs."""
    outfeed_queue = ipu_outfeed_queue.IPUOutfeedQueue("__feed9")

    def stage1(x, y):
      return x, y, y, x

    # The above should be optimised to a single copy for each duplicate output.
    def stage2(x1, y1, y2, x2):
      return x1, y1, y2, x2

    # Same for this stage
    def stage3(_x1, _y1, y2, x2):
      return x2, y2

    def model_pipeline(x, y):
      schedule = pipelining_ops.PipelineSchedule.Sequential
      return pipelining_ops.pipeline([stage1, stage2, stage3],
                                     12,
                                     inputs=[x, y],
                                     outfeed_queue=outfeed_queue,
                                     pipeline_schedule=schedule)

    with ops.device('cpu'):
      x = array_ops.placeholder(np.float32, shape=[1, 4, 4, 2])
      y = array_ops.placeholder(np.float32, shape=[1, 2])

    with ops.device("/device:IPU:0"):
      compiled_model_pipeline = ipu_compiler.compile(model_pipeline,
                                                     inputs=[x, y])

    ipu_config = utils.create_ipu_config(profiling=True,
                                         profile_execution=True)
    ipu_config = utils.auto_select_ipus(ipu_config, 4)
    utils.configure_ipu_system(ipu_config)
    utils.move_variable_initialization_to_cpu()

    #TODO(T10784) test how many IPU copies are here once we insert IPU copies.
    outfeed_op = outfeed_queue.dequeue()
    with tu.ipu_session() as session:
      feed = {x: np.ones(x.shape), y: np.ones(y.shape)}
      session.run(compiled_model_pipeline, feed)
      output = session.run(outfeed_op)
      # Each of the 12 outfeed entries should carry the fed x and y values.
      for step in range(12):
        self.assertAllClose(output[0][step], np.ones(x.shape))
        self.assertAllClose(output[1][step], np.ones(y.shape))

示例#16

0

显示文件

    def testUserOp(self):
        """Invoke the same precompiled user op twice and check all six outputs.

        The library path is built with `os.path.join` relative to the
        current working directory rather than by string concatenation.
        """
        with tu.ipu_session() as sess:
            outputs = {
                "output_types":
                [dtypes.float32, dtypes.float32, dtypes.float32],
                "output_shapes": [
                    tensor_shape.TensorShape([20]),
                    tensor_shape.TensorShape([5, 2]),
                    tensor_shape.TensorShape([10])
                ],
            }
            lib_path = os.path.join(
                os.getcwd(),
                "tensorflow/python/ipu/libadd_incrementing_custom.so")

            def my_net(x, y, z):
                o1 = ipu.custom_ops.precompiled_user_op([x, y, z],
                                                        lib_path,
                                                        outs=outputs)

                # Second invocation on inputs shifted by one.
                o2 = ipu.custom_ops.precompiled_user_op(
                    [x + 1., y + 1., z + 1.], lib_path, outs=outputs)
                return o1, o2

            with ipu.scopes.ipu_scope('/device:IPU:0'):
                x = array_ops.placeholder(np.float32, shape=[20])
                y = array_ops.placeholder(np.float32, shape=[5, 2])
                z = array_ops.placeholder(np.float32, shape=[10])

                model = ipu.ipu_compiler.compile(my_net, inputs=[x, y, z])

            sess.run(variables.global_variables_initializer())
            res = sess.run(model, {
                x: np.ones([20]),
                y: np.ones([5, 2]),
                z: np.ones([10])
            })

            # First call: inputs of ones come back as 2, 3 and 4.
            self.assertAllEqual(np.full([20], 2.0), res[0][0])
            self.assertAllEqual(np.full([5, 2], 3.0), res[0][1])
            self.assertAllEqual(np.full([10], 4.0), res[0][2])
            # Second call: inputs of twos come back as 3, 4 and 5.
            self.assertAllEqual(np.full([20], 3.0), res[1][0])
            self.assertAllEqual(np.full([5, 2], 4.0), res[1][1])
            self.assertAllEqual(np.full([10], 5.0), res[1][2])

示例#17

0

显示文件

  def testTwoParallelMatMuls(self):
    """Check the ML-type classification counts for two layers of paired `fc` branches."""

    def graph(x, label):
      # First layer: two parallel fc + relu branches, summed.
      left = fc(x, 48)
      left = nn.relu(left)
      right = fc(x, 48)
      right = nn.relu(right)
      x = left + right

      # Second layer: same shape of graph with 100 units per branch.
      left = fc(x, 100)
      left = nn.relu(left)
      right = fc(x, 100)
      right = nn.relu(right)
      x = left + right

      xent = nn.sparse_softmax_cross_entropy_with_logits(logits=x,
                                                         labels=label)
      loss = math_ops.reduce_mean(xent)

      train_op = gradient_descent.GradientDescentOptimizer(0.01).minimize(loss)
      return loss, train_op

    with ops.device('cpu'):
      input_ph = array_ops.placeholder(np.float32, shape=[1, 224])
      label_ph = array_ops.placeholder(np.int32, shape=[1])

    with ops.device("/device:IPU:0"):
      output = ipu_compiler.compile(graph, inputs=[input_ph, label_ph])

    tu.move_variable_initialization_to_cpu()

    with tu.ipu_session() as session:

      report = tu.ReportJSON(self, session)
      session.run(variables.global_variables_initializer())
      report.reset()
      feed = {input_ph: np.ones(input_ph.shape), label_ph: [1]}
      session.run(output, feed)
      report.parse_log()

      # 4x updates, 2x grads
      expected_counts = [0, 4, 2, 4]
      self.assertAllEqual(report.get_ml_type_counts(), expected_counts)

示例#18

0

显示文件

    def testUserOpCPU(self):
        """Run a CPU user operation with mixed float/int tensors and check its three outputs.

        The library path is built with `os.path.join` relative to the
        current working directory rather than by string concatenation.
        """
        with tu.ipu_session() as sess:
            outputs = {
                "output_types": [dtypes.float32, dtypes.int32, dtypes.float32],
                "output_shapes": [
                    tensor_shape.TensorShape([20]),
                    tensor_shape.TensorShape([10, 10, 10]),
                    tensor_shape.TensorShape([1]),
                ],
            }
            lib_path = os.path.join(
                os.getcwd(),
                "tensorflow/python/ipu/libadd_incrementing_custom.so")

            def my_net(x, y):
                output = ipu.custom_ops.cpu_user_operation([x, y],
                                                           lib_path,
                                                           outs=outputs)
                return output

            with ipu.scopes.ipu_scope('/device:IPU:0'):
                x = array_ops.placeholder(np.float32, shape=[20])
                y = array_ops.placeholder(np.int32, shape=[10, 10, 10])

                model = ipu.ipu_compiler.compile(my_net, inputs=[x, y])

            sess.run(variables.global_variables_initializer())
            res = sess.run(
                model, {
                    x: np.ones([20]),
                    y: np.full([10, 10, 10], fill_value=6, dtype=np.int32),
                })

            # The first operation is in[0] + 6
            self.assertAllEqual(np.full([20], 7.0), res[0])

            # The second part is in[1] / 2
            self.assertAllEqual(np.full([10, 10, 10], 3, dtype=np.int32),
                                res[1])

            # The third part is the sum of the last two so 20*7 + 1000*3.
            self.assertAllEqual(np.full([1], 3140.0), res[2])

示例#19

0

显示文件

    def testFunctionsNoMatch(self):
        """Call one `ipu.function` with two fp16 inputs and one fp32 input."""
        with tu.ipu_session() as session:

            @ipu.function
            def func(a):
                return nn.relu(a)

            def body(a, b, c):
                return func(a), func(b), func(c)

            with ops.device('cpu'):
                a = array_ops.placeholder(np.float16, [64, 64])
                b = array_ops.placeholder(np.float16, [64, 64])
                # Deliberately a different dtype from `a` and `b`.
                c = array_ops.placeholder(np.float32, [64, 64])

            with ipu.scopes.ipu_scope("/device:IPU:0"):
                res = ipu.ipu_compiler.compile(body, inputs=[a, b, c])

            tu.move_variable_initialization_to_cpu()
            session.run(variables.global_variables_initializer())

            report = tu.ReportJSON(self, session)
            feed = {tensor: np.ones(tensor.shape) for tensor in (a, b, c)}
            result = session.run(res, feed)
            # relu(1) == 1 for each of the three outputs.
            for index in range(3):
                self.assertAllClose(result[index],
                                    np.broadcast_to(1.0, [64, 64]))

            report.parse_log()
            # Two non-linearities, as one of them has a different type.
            allowed_compute_sets = [
                'Relu/relu/Nonlinearity',
                'Relu/relu.*/Nonlinearity',
                '__seed',
                'Copy_',
            ]
            report.assert_all_compute_sets_and_list(allowed_compute_sets)

            # Main computation (including inlined fp32 one, and the fp16 outlined).
            computation_count = len(report.tensor_map.computation_names())
            self.assertEqual(computation_count, 2)

示例#20

0

显示文件

  def testResnetLike(self):
    """Check the ML-type classification counts for a small resnet-like training graph."""

    def graph(img, label):
      x = conv(img, 7, 2, 16)
      x = nn.relu(x)
      x = max_pool(x, ksize=3, stride=2)

      x = block("b", 2, 64, 1, x)

      x = math_ops.reduce_mean(x, axis=[1, 2])
      logits = fc(x, 100)
      xent = nn.sparse_softmax_cross_entropy_with_logits(logits=logits,
                                                         labels=label)
      loss = math_ops.reduce_mean(xent)

      train_op = gradient_descent.GradientDescentOptimizer(0.01).minimize(loss)
      return loss, train_op

    with ops.device('cpu'):
      image_ph = array_ops.placeholder(np.float32, shape=[1, 4, 4, 2])
      label_ph = array_ops.placeholder(np.int32, shape=[1])

    with ops.device("/device:IPU:0"):
      output = ipu_compiler.compile(graph, inputs=[image_ph, label_ph])

    tu.move_variable_initialization_to_cpu()

    with tu.ipu_session() as session:

      report = tu.ReportJSON(self, session)
      session.run(variables.global_variables_initializer())
      report.reset()
      feed = {image_ph: np.ones(image_ph.shape), label_ph: [1]}
      session.run(output, feed)
      report.parse_log()

      # 3 convs, 1 matmul = 4
      expected_counts = [0, 4, 3, 4]
      self.assertAllEqual(report.get_ml_type_counts(), expected_counts)

示例#21

0

显示文件

    def runCustomUserOpWithUnusedOutput(self, op_name, ok):
        """Build a net whose user-op output is discarded and check the compute sets.

        Args:
            op_name: value passed as `op_name` to `precompiled_user_op`.
            ok: compute-set list handed to
                `report.assert_all_compute_sets_and_list`.
        """
        with tu.ipu_session() as session:
            outputs = {
                "output_types": [dtypes.float32],
                "output_shapes": [tensor_shape.TensorShape([128])],
            }

            lib_path = os.path.join(
                os.getcwd(),
                "tensorflow/python/ipu/libadd_incrementing_custom_with_metadata.so"
            )

            def my_net(x, y):
                # The user op is created but its result is never used; only
                # the plain sum is returned.
                ipu.custom_ops.precompiled_user_op([x, y],
                                                   lib_path,
                                                   op_name=op_name,
                                                   outs=outputs)
                total = x + y
                return [total]

            with ipu.scopes.ipu_scope('/device:IPU:0'):
                x = array_ops.placeholder(np.float32, shape=[128])
                y = array_ops.placeholder(np.float32, shape=[128])

                model = ipu.ipu_compiler.compile(my_net, inputs=[x, y])

            report = tu.ReportJSON(self, session)
            report.reset()

            session.run(variables.global_variables_initializer())
            feed = {
                x: np.ones([128]),
                y: np.ones([128]),
            }
            session.run(model, feed)

            report.parse_log()
            report.assert_all_compute_sets_and_list(ok)

示例#22

0

显示文件

    def testUserReadWriteOpBackwards(self):
        """Check forward output and one gradient of a CPU user operation.

        The op from `libadd_tensors_custom.so` is fed x = 1 and y = 6; the test
        expects an output of 7 everywhere and a gradient of ones for `x`.
        """
        with tu.ipu_session() as sess:
            outputs = {
                "output_types": [dtypes.float32],
                "output_shapes": [tensor_shape.TensorShape([10])],
            }
            # Build the path with os.path.join, as the other custom-op tests in
            # this file do, instead of concatenating strings by hand.
            lib_path = os.path.join(
                os.getcwd(), "tensorflow/python/ipu/libadd_tensors_custom.so")

            def my_net(x, y):
                output = ipu.custom_ops.cpu_user_operation([x, y],
                                                           lib_path,
                                                           outs=outputs)

                # The optimizer is only used to obtain gradients of the first
                # op output with respect to both inputs; nothing is applied.
                opt = gradient_descent.GradientDescentOptimizer(
                    learning_rate=0.1)
                gradients = opt.compute_gradients(output[0], [x, y])

                return [output, gradients]

            with ipu.scopes.ipu_scope('/device:IPU:0'):
                x = array_ops.placeholder(np.float32, shape=[10])
                y = array_ops.placeholder(np.float32, shape=[10])

                model = ipu.ipu_compiler.compile(my_net, inputs=[x, y])

            sess.run(variables.global_variables_initializer())
            res = sess.run(model, {
                x: np.ones([10]),
                y: np.full([10], 6.0),
            })

            # res[0] is the list of op outputs, hence the leading dimension of 1.
            self.assertAllEqual(np.full([1, 10], 7.0), res[0])

            # compute_gradients yields (gradient, variable) pairs; [0][0] is the
            # gradient with respect to x.
            gradients = res[1]
            self.assertAllEqual(np.ones([10]), gradients[0][0])

示例#23

0

显示文件

    def testFunctionTraining(self):
        """Train through an `@ipu.function` that is called twice in one graph.

        `func` computes sigmoid(lhs @ rhs + a) and is invoked with two different
        weight variables. After a single run of the compiled training step the
        test checks the forward result, the list of compute sets, the tile
        memory figures and the number of computations in the report.
        """
        with tu.ipu_session() as sess:

            @ipu.function
            def func(lhs, rhs, a):
                x = math_ops.matmul(lhs, rhs)
                x = x + a
                x = math_ops.sigmoid(x)
                return x

            def body(a, b, c, labels):
                # Two 64x64 resource variables, both initialised to ones.
                with variable_scope.variable_scope("vs", use_resource=True):
                    w0 = variable_scope.get_variable(
                        "w0",
                        shape=[64, 64],
                        dtype=np.float32,
                        initializer=init_ops.ones_initializer())
                    w1 = variable_scope.get_variable(
                        "w1",
                        shape=[64, 64],
                        dtype=np.float32,
                        initializer=init_ops.ones_initializer())
                # Same function, called once per weight.
                a = func(a, w0, b)
                a = a - func(a, w1, c)
                loss = math_ops.reduce_mean(
                    nn.sparse_softmax_cross_entropy_with_logits(logits=a,
                                                                labels=labels))
                train_op = gradient_descent.GradientDescentOptimizer(
                    0.001).minimize(loss)
                return a, train_op

            with ops.device('cpu'):
                a = array_ops.placeholder(np.float32, [64, 64])
                b = array_ops.placeholder(np.float32, [64, 64])
                c = array_ops.placeholder(np.float32, [64, 64])
                labels = array_ops.placeholder(np.int32, [64])

            with ipu.scopes.ipu_scope("/device:IPU:0"):
                res = ipu.ipu_compiler.compile(body, inputs=[a, b, c, labels])

            tu.move_variable_initialization_to_cpu()
            sess.run(variables.global_variables_initializer())

            report = tu.ReportJSON(self, sess)
            # Every placeholder (including the labels) is fed with ones.
            result = sess.run(res,
                              {x: np.ones(x.shape)
                               for x in [a, b, c, labels]})
            self.assertAllClose(result[0], np.broadcast_to(0., [64, 64]))

            report.parse_log()
            # There would be multiple non-linearities(grads) if the function was not
            # cached.
            ok = [
                'MatMul/dot*/Conv_1',
                'add/add*/Op/Add',
                'Sigmoid/sigmoid/Nonlinearity',
                'sub/subtract*/Op/Subtract',
                '__seed',
                'Copy_',
                'SparseSoftmaxCrossEntropyWithLogits',
                'gradients/SparseSoftmaxCrossEntropyWithLogits/SparseSoftmaxCrossEntropyWithLogits_grad/mul',
                'gradients/sub_grad/Neg/negate*/Op/Negate',
                'gradients/Sigmoid_grad/SigmoidGrad/sigmoid-grad*/NonLinearityGrad',
                'gradients/AddN/fusion/scaledAdd/Op/Multiply',
                'gradients/AddN/fusion/AddTo',
                'GradientDescent/update_vs/w*/ResourceApplyGradientDescent/fusion*/AddTo',
                'gradients/AddN/fusion/scaledAdd/Op/Multiply/OnTileCopyPre',
            ]
            report.assert_all_compute_sets_and_list(ok)
            report.assert_total_tile_memory(1167740)
            report.assert_max_tile_memory(3534)

            # Entry computation and 2 outlined ones.
            self.assertEqual(len(report.tensor_map.computation_names()), 3)

示例#24

0

显示文件

    def testTwoMatMuls(self):
        """Check the ML type counts of a three-stage pipeline of dense layers.

        Stages 1 and 2 each apply two `fc` + relu layers, stage 3 computes the
        loss. The report is created with `allow_recompute=True`.
        """

        def stage1(x, label):
            with variable_scope.variable_scope("stage1", use_resource=True):
                hidden = nn.relu(fc(x, 16))
                return nn.relu(fc(hidden, 48)), label

        def stage2(x, label):
            with variable_scope.variable_scope("stage2", use_resource=True):
                hidden = nn.relu(fc(x, 48))
                return nn.relu(fc(hidden, 100)), label

        def stage3(x, label):
            with variable_scope.variable_scope("stage3", use_resource=True):
                xent = nn.sparse_softmax_cross_entropy_with_logits(
                    logits=x, labels=label)
                return math_ops.reduce_mean(xent)

        def optimizer_function(loss):
            sgd = gradient_descent.GradientDescentOptimizer(0.01)
            return pipelining_ops.OptimizerFunctionOutput(sgd, loss)

        outfeed = ipu_outfeed_queue.IPUOutfeedQueue(next_feed_id())

        # Chain the three stages into one pipeline; the second input is the
        # label tensor even though the parameter is called `lr`.
        def model_pipeline(x, lr):
            stages = [stage1, stage2, stage3]
            return pipelining_ops.pipeline(
                stages,
                12,
                inputs=[x, lr],
                outfeed_queue=outfeed,
                optimizer_function=optimizer_function)

        with ops.device('cpu'):
            features = array_ops.placeholder(np.float32, shape=[1, 224])
            labels = array_ops.placeholder(np.int32, shape=[1])

        with tu.ipu_session() as sess:

            with ops.device("/device:IPU:0"):
                compiled = ipu_compiler.compile(model_pipeline,
                                                inputs=[features, labels])
            # The dequeue op is created but its result is not fetched.
            outfeed.dequeue()
            tu.move_variable_initialization_to_cpu()

            report = tu.ReportJSON(self,
                                   sess,
                                   pipelining=True,
                                   allow_recompute=True)
            sess.run(variables.global_variables_initializer())
            report.reset()
            feed = {features: np.ones(features.shape), labels: [1]}
            sess.run(compiled, feed)
            report.parse_log()

            # 2x matmul in 2 stages = 4x fwd x recomputation, 3x grads, 4x updates
            expected_counts = [0, 8, 3, 4]
            self.assertAllEqual(report.get_ml_type_counts(), expected_counts)

示例#25

0

显示文件

    def testFunctionSerializedLookup(self):
        """Serialized embedding lookup built from one reused `@ipu.function`.

        The table of DICT_SIZE rows is cut into NUM_SPLITS slices; `func` is
        called once per slice and the masked partial results are summed. The
        test compares the sum against `np.take` on the full table and then
        checks the compute sets, tile memory and number of computations.
        """
        with tu.ipu_session() as sess:

            @ipu.function
            def func(table, indices, min_idx, max_idx):
                # Do a serialized embedding lookup by adjusting the indices.
                adjusted_indices = indices - min_idx
                x = ipu.embedding_ops.embedding_lookup(table, adjusted_indices)
                # Mask out any outputs which are not in range [min_idx, max_idx).
                mask_max = math_ops.less(indices, max_idx)
                mask_min = math_ops.greater_equal(indices, min_idx)
                mask = math_ops.cast(math_ops.logical_and(mask_max, mask_min),
                                     np.float16)
                # Add a trailing axis so the mask broadcasts over the embedding.
                mask = array_ops.expand_dims(mask, 1)
                return x * mask

            DICT_SIZE = 20000
            EMB_SIZE = 128
            NUM_SPLITS = 10
            SPLIT_SIZE = DICT_SIZE // NUM_SPLITS

            def body(table, indices):
                # First slice is handled outside the loop to seed `output`.
                table_sliced = array_ops.slice(table, [0, 0],
                                               [SPLIT_SIZE, EMB_SIZE])
                output = func(table_sliced, indices, 0, SPLIT_SIZE)

                # Remaining slices are accumulated with named add ops
                # ("slice_1" .. "slice_9"), which the `ok` list below matches.
                for i in range(1, NUM_SPLITS):
                    min_idx = SPLIT_SIZE * i
                    max_idx = SPLIT_SIZE * (i + 1)
                    table_sliced = array_ops.slice(table, [min_idx, 0],
                                                   [SPLIT_SIZE, EMB_SIZE])
                    output = math_ops.add(output,
                                          func(table_sliced, indices, min_idx,
                                               max_idx),
                                          name=f"slice_{i}")
                return output

            with ops.device('cpu'):
                table = array_ops.placeholder(np.float16,
                                              [DICT_SIZE, EMB_SIZE])
                indices = array_ops.placeholder(np.int32, [NUM_SPLITS * 2])

            with ipu.scopes.ipu_scope("/device:IPU:0"):
                res = ipu.ipu_compiler.compile(body, inputs=[table, indices])

            report = tu.ReportJSON(self, sess)
            # Half-split stride gives two indices per slice (NUM_SPLITS * 2 total).
            i_h = np.arange(0, DICT_SIZE, step=SPLIT_SIZE // 2)
            # Every table row is 0..EMB_SIZE-1 (broadcast against a ones matrix).
            w_h = np.arange(EMB_SIZE, dtype=np.float16) * np.ones(
                [DICT_SIZE, EMB_SIZE], dtype=np.float16)
            result = sess.run(res, {table: w_h, indices: i_h})
            self.assertAllClose(result[0], np.take(w_h, i_h, axis=0))

            report.parse_log()
            # There would be multiple multi slices if the function was not cached.
            ok = [
                'Less/fusion*/Op/LessThan',
                'GreaterEqual/fusion*/Op/GreaterThanEqual',
                'sub/fusion/Op/Subtract',
                'embedding_lookup/multi-slice/output/multiSlice',
                'LogicalAnd/and*/Op/LogicalAnd',
                'Cast/convert*/Cast',
                'mul_0/fusion*/Op/Multiply',
                'slice_1*/add.*/Op/Add',
                'slice_2*/add.*/Op/Add',
                'slice_3*/add.*/Op/Add',
                'slice_4*/add.*/Op/Add',
                'slice_5*/add.*/Op/Add',
                'slice_6*/add.*/Op/Add',
                'slice_7*/add.*/Op/Add',
                'slice_8*/add.*/Op/Add',
                'slice_9*/add.*/Op/Add',
                '__seed',
                'Copy_',
            ]
            report.assert_all_compute_sets_and_list(ok)
            report.assert_total_tile_memory(10980622)
            report.assert_max_tile_memory(9888)

            # Main computation and outlined serialized one.
            self.assertEqual(len(report.tensor_map.computation_names()), 2)

示例#26

0

显示文件

    def testNestedFunctionTraining(self):
        """Train through `@ipu.function`s nested inside one another.

        `cached_func` wraps `matmul_with_bias` (itself an `@ipu.function`) and
        adds a sigmoid. `body` calls it twice with variable scopes "1" and "2".
        After one run the test checks the forward result, compute sets, tile
        memory and the number of computations in the report.
        """
        with tu.ipu_session() as sess:

            def matmul_with_bias(x, scope_name):
                @ipu.function
                def func(x):
                    # Weight and bias live in the caller-selected scope and are
                    # both initialised to ones.
                    with variable_scope.variable_scope(scope_name,
                                                       use_resource=True):
                        w = variable_scope.get_variable(
                            "w",
                            shape=[64, 64],
                            dtype=np.float32,
                            initializer=init_ops.ones_initializer())
                    x = x @ w
                    with variable_scope.variable_scope(scope_name,
                                                       use_resource=True):
                        # Bias length follows the last dimension of the product.
                        bias = variable_scope.get_variable(
                            "bias",
                            shape=[x.shape.as_list()[-1]],
                            dtype=np.float32,
                            initializer=init_ops.ones_initializer())
                    return x + bias

                return func(x)

            def cached_func(x, scope_name):
                @ipu.function
                def func(x):
                    x = matmul_with_bias(x, scope_name)
                    x = math_ops.sigmoid(x)
                    return x

                return func(x)

            def body(x, labels):
                x = cached_func(x, "1")
                x = cached_func(x, "2")
                loss = math_ops.reduce_mean(
                    nn.sparse_softmax_cross_entropy_with_logits(logits=x,
                                                                labels=labels))
                train_op = gradient_descent.GradientDescentOptimizer(
                    0.001).minimize(loss)
                return x, train_op

            with ops.device('cpu'):
                a = array_ops.placeholder(np.float32, [64, 64])
                labels = array_ops.placeholder(np.int32, [64])

            with ipu.scopes.ipu_scope("/device:IPU:0"):
                res = ipu.ipu_compiler.compile(body, inputs=[a, labels])

            tu.move_variable_initialization_to_cpu()
            sess.run(variables.global_variables_initializer())

            report = tu.ReportJSON(self, sess)
            # Both placeholders (inputs and labels) are fed with ones.
            result = sess.run(res, {x: np.ones(x.shape) for x in [a, labels]})
            self.assertAllClose(result[0], np.broadcast_to(1., [64, 64]))

            report.parse_log()
            # There would be multiple non-linearities(grads) if the function was not
            # cached.
            ok = [
                '__seed/set/setMasterSeed',
                'matmul/dot*/Conv_1',
                'add_0/fusion/Op/Add',
                'Sigmoid/sigmoid/Nonlinearity',
                'SparseSoftmaxCrossEntropyWithLogits/SparseSoftmaxCrossEntropyWithLogits',
                'gradients/SparseSoftmaxCrossEntropyWithLogits/SparseSoftmaxCrossEntropyWithLogits_grad/',
                'gradients/Sigmoid_grad/SigmoidGrad/sigmoid-grad/NonLinearityGrad',
                'gradients/add_grad/Sum/reduce*/Reduce',
                'GradientDescent/update_1/bias/ResourceApplyGradientDescent/fusion.5/AddTo',
                'GradientDescent/update_1/w/ResourceApplyGradientDescent/fusion.4/AddTo',
                'GradientDescent/update_2/bias/ResourceApplyGradientDescent/fusion.3/AddTo',
                'GradientDescent/update_2/w/ResourceApplyGradientDescent/fusion.2/AddTo',
                'Copy_',
            ]
            report.assert_all_compute_sets_and_list(ok)
            report.assert_total_tile_memory(1129384)
            report.assert_max_tile_memory(3634)

            # Entry computation and 4 outlined ones.
            self.assertEqual(len(report.tensor_map.computation_names()), 5)

示例#27

0

显示文件

    def testUserOpBackwardsSeparateOps(self):
        """Check a three-input user op built with `separate_gradients=True`.

        Verifies the number of gradient ops in the default graph, the three
        forward outputs, and the gradients of the third output with respect to
        each input.
        """
        with tu.ipu_session() as sess:
            so_path = os.path.join(
                os.getcwd(),
                "tensorflow/python/ipu/libadd_incrementing_custom_with_metadata.so"
            )
            op_outs = {
                "output_types": [dtypes.float32] * 3,
                "output_shapes": [
                    tensor_shape.TensorShape([20]),
                    tensor_shape.TensorShape([5, 2]),
                    tensor_shape.TensorShape([10]),
                ],
            }

            def my_net(x, y, z):
                output = ipu.custom_ops.precompiled_user_op(
                    [x, y, z],
                    so_path,
                    op_name="SepGrad",
                    separate_gradients=True,
                    outs=op_outs)
                sgd = gradient_descent.GradientDescentOptimizer(
                    learning_rate=0.1)

                # Only the third output acts as the "loss".
                grads_and_vars = sgd.compute_gradients(output[2], [x, y, z])

                return [output, grads_and_vars]

            with ipu.scopes.ipu_scope('/device:IPU:0'):
                ph_x = array_ops.placeholder(np.float32, shape=[20])
                ph_y = array_ops.placeholder(np.float32, shape=[5, 2])
                ph_z = array_ops.placeholder(np.float32, shape=[10])

                compiled = ipu.ipu_compiler.compile(
                    my_net, inputs=[ph_x, ph_y, ph_z])

            self.assertAllEqual(count_grad_ops(ops.get_default_graph()), 3)

            sess.run(variables.global_variables_initializer())
            feed = {
                ph_x: np.ones([20]),
                ph_y: np.ones([5, 2]),
                ph_z: np.ones([10]),
            }
            run_out = sess.run(compiled, feed)

            # Forward results of the op, one per declared output.
            forward = run_out[0]
            expected_forward = (
                np.full([20], 2.0),
                np.full([5, 2], 3.0),
                np.full([10], 4.0),
            )
            for idx, want in enumerate(expected_forward):
                self.assertAllEqual(want, forward[idx])

            grads = run_out[1]

            # The grad function adds index+1 to the value of the partial derivative
            # index. Since the "loss" is just output[2], input[2] is the only one
            # which will actually have a gradient. (Which will be 1*3 = 3).
            expected_grads = (
                np.zeros([20]),
                np.zeros([5, 2]),
                np.full([10], 3.0),
            )
            for idx, want in enumerate(expected_grads):
                self.assertAllEqual(want, grads[idx][0])

示例#28

0

显示文件

  def testOutlinedFunction(self):
    """Check ML type counts for a pipeline mixing serialized and dense matmuls.

    Stage 1 adds a relu'd `serialized_matmul` (split factor 2) to a relu'd `fc`
    layer, stage 2 adds two relu'd `fc` layers, stage 3 computes the loss.
    """

    def stage1(x, label):
      with variable_scope.variable_scope("stage1", use_resource=True):
        weight = variable_scope.get_variable(
            "w0",
            shape=[224, 48],
            dtype=np.float32,
            initializer=init_ops.ones_initializer())
        serialized = nn.relu(
            ipu_math_ops.serialized_matmul(
                x, weight, 2, serialization_dimension="a_rows_b_columns"))
        dense = nn.relu(fc(x, 48))
        return serialized + dense, label

    def stage2(x, label):
      with variable_scope.variable_scope("stage2", use_resource=True):
        left = nn.relu(fc(x, 100))
        right = nn.relu(fc(x, 100))
        return left + right, label

    def stage3(x, label):
      with variable_scope.variable_scope("stage3", use_resource=True):
        xent = nn.sparse_softmax_cross_entropy_with_logits(logits=x,
                                                           labels=label)
        return math_ops.reduce_mean(xent)

    def optimizer_function(loss):
      sgd = gradient_descent.GradientDescentOptimizer(0.01)
      return pipelining_ops.OptimizerFunctionOutput(sgd, loss)

    outfeed = ipu_outfeed_queue.IPUOutfeedQueue(next_feed_id())

    # Chain the three stages into one pipeline; the second input is the label
    # tensor even though the parameter is called `lr`.
    def model_pipeline(x, lr):
      stages = [stage1, stage2, stage3]
      return pipelining_ops.pipeline(stages,
                                     12,
                                     inputs=[x, lr],
                                     outfeed_queue=outfeed,
                                     optimizer_function=optimizer_function)

    with ops.device('cpu'):
      features = array_ops.placeholder(np.float32, shape=[1, 224])
      labels = array_ops.placeholder(np.int32, shape=[1])

    with tu.ipu_session() as sess:

      with ops.device("/device:IPU:0"):
        compiled = ipu_compiler.compile(model_pipeline,
                                        inputs=[features, labels])

      tu.move_variable_initialization_to_cpu()
      # The dequeue op is created but its result is not fetched.
      outfeed.dequeue()

      report = tu.ReportJSON(self, sess, pipelining=True)
      sess.run(variables.global_variables_initializer())
      report.reset()
      feed = {features: np.ones(features.shape), labels: [1]}
      sess.run(compiled, feed)
      report.parse_log()

      # 3 matmul in stage 1, 2 matmuls in stage 2 = 5 (5x updates, 5x grads)
      expected_counts = [0, 5, 2, 5]
      self.assertAllEqual(report.get_ml_type_counts(), expected_counts)

示例#29

0

显示文件

  def testPipelineWithInfeedsKwargs(self):
    """Run a sequential three-stage pipeline fed by a dict-valued infeed.

    The dataset elements are dicts with keys "a" and "b"; stage 1 reads them
    through `**kwargs` while the compiled input `c` arrives positionally.
    The test compares the dequeued outfeed values with a fixed list and checks
    which IPUs the stages were placed on.
    """
    with tu.ipu_session() as sess:
      dataset = tu.create_single_increasing_dataset(5, shape=[4, 4, 2])
      dataset = dataset.batch(batch_size=2, drop_remainder=True)

      def dataset_parser(value):
        # Turn each element into a dict so the infeed yields named entries.
        a = value
        b = (value + 10.) / 2.0
        return {"a": a, "b": b}

      dataset = dataset.map(dataset_parser)
      # Infeed and outfeed queues are both created with the name "__feed6".
      infeed_queue = ipu_infeed_queue.IPUInfeedQueue(dataset, "__feed6")
      outfeed_queue = ipu_outfeed_queue.IPUOutfeedQueue("__feed6")

      def stage1(c, **kwargs):
        # 1x1 convolution with 2 filters and a ones-initialised kernel on "a",
        # then "b" is added; `c` is passed through to the next stage.
        with variable_scope.variable_scope("vs", use_resource=True):
          y = layers.Conv2D(2,
                            1,
                            use_bias=True,
                            kernel_initializer=init_ops.ones_initializer(),
                            name='conv1')(kwargs["a"])
          return y + kwargs["b"], c

      def stage2(x, c):
        return math_ops.reduce_sum(x) + c

      def stage3(x):
        # Identity stage.
        return x

      def my_net(c):
        return pipelining_ops.pipeline(
            [stage1, stage2, stage3],
            12,
            inputs=[c],
            infeed_queue=infeed_queue,
            outfeed_queue=outfeed_queue,
            pipeline_schedule=pipelining_ops.PipelineSchedule.Sequential)

      with ops.device('cpu'):
        c = array_ops.placeholder(np.float32, shape=[])

      with ops.device("/device:IPU:0"):
        r = ipu_compiler.compile(my_net, inputs=[c])

      # The IPU system is configured by hand here (profiling on, 4 IPUs
      # auto-selected).
      cfg = utils.create_ipu_config(profiling=True, profile_execution=True)
      cfg = utils.auto_select_ipus(cfg, 4)
      utils.configure_ipu_system(cfg)
      utils.move_variable_initialization_to_cpu()

      outfeed_op = outfeed_queue.dequeue()

      # NOTE(review): configure_device=False presumably stops ReportJSON from
      # reconfiguring the device set up above - confirm against test_utils.
      report = tu.ReportJSON(self, sess, configure_device=False)
      report.reset()
      sess.run(variables.global_variables_initializer())
      sess.run(infeed_queue.initializer)
      sess.run(r, {c: 10.01})
      losses_pipeline = sess.run(outfeed_op)
      # Twelve outfeed values are expected, matching the 12 passed to pipeline().
      self.assertAllClose(losses_pipeline, [[
          410.01, 730.01, 650.01, 570.01, 890.01, 410.01, 730.01, 650.01,
          570.01, 890.01, 410.01, 730.01
      ]])
      report.parse_log()
      report.assert_pipeline_stages_on_expected_ipu((0, 1, 3))