示例#1
0
    def test_ipu_horovod_strategy(self):
        """Check replica count, variable placement and reductions of IPUHorovodStrategy.

        Every worker creates the variable `w` with an initializer of
        `hvd.rank() + 1.0` and returns `w * w`. The assertions expect the
        variable to read back as 1.0, the SUM reduction to equal `hvd.size()`
        and the MEAN reduction to equal 1.0.
        """
        world_size = hvd.size()
        rank = hvd.rank()

        strategy = IPUHorovodStrategy()
        self.assertEqual(strategy.num_replicas_in_sync, world_size)

        # Select a single IPU for this process.
        ipu_config = ipu_utils.auto_select_ipus(ipu_utils.create_ipu_config(),
                                                num_ipus=1)
        ipu_utils.configure_ipu_system(ipu_config)

        with strategy.scope():

            def per_replica_fn():
                weight = variable_scope.get_variable(name="w",
                                                     initializer=rank + 1.0)
                self.assertEqual("/replica:0/task:0/device:IPU:0",
                                 weight.device)
                return weight * weight

            squared = strategy.experimental_run_v2(per_replica_fn)
            reduced_sum = strategy.reduce(ReduceOp.SUM, squared)
            reduced_mean = strategy.reduce(ReduceOp.MEAN, squared)

            with session.Session() as sess:
                sess.run(variables.global_variables_initializer())

                # All workers should have the initial value from the first worker.
                variable_values = sess.run(variables.global_variables())
                self.assertEqual([1.0], variable_values)
                self.assertEqual(1.0 * world_size, reduced_sum.eval())
                self.assertEqual(1.0, reduced_mean.eval())
示例#2
0
    def testPrefixPathWithTranspose(self):
        """Add a placeholder to the transposed output of a 1x1 all-ones convolution.

        The convolution output is transposed with permutation [1, 2, 3, 0]
        before `z` (shape [4, 4, 2, 1]) is added; the fetched result is
        compared against hard-coded expected values.
        """
        with self.session() as sess:
            with ops.device("/device:IPU:0"):
                conv_in = array_ops.placeholder(np.float32, shape=[1, 4, 4, 2])
                addend = array_ops.placeholder(np.float32, shape=[4, 4, 2, 1])

                with variable_scope.variable_scope("vs", use_resource=True):
                    conv = layers.Conv2D(
                        2,
                        1,
                        use_bias=True,
                        kernel_initializer=init_ops.ones_initializer())
                    conv_out = conv(conv_in)
                transposed = array_ops.transpose(conv_out, [1, 2, 3, 0])
                output = transposed + addend

            utils.configure_ipu_system(utils.create_ipu_config())

            sess.run(variables.global_variables_initializer())

            feed = {
                conv_in: np.reshape(np.arange(32), [1, 4, 4, 2]),
                addend: np.ones([4, 4, 2, 1]),
            }
            actual = sess.run(output, feed)

            expected = [[[[2.], [2.]], [[6.], [6.]],
                         [[10.], [10.]], [[14.], [14.]]],
                        [[[18.], [18.]], [[22.], [22.]],
                         [[26.], [26.]], [[30.], [30.]]],
                        [[[34.], [34.]], [[38.], [38.]],
                         [[42.], [42.]], [[46.], [46.]]],
                        [[[50.], [50.]], [[54.], [54.]],
                         [[58.], [58.]], [[62.], [62.]]]]
            self.assertAllClose(actual, expected)
示例#3
0
  def testIoTilesAreExcludedFromShard(self):
    """Check that no tensor occupies more tiles than remain after reserving I/O tiles.

    Two matmuls are placed on shards 0 and 1, the system is configured with
    128 I/O tiles through the GCL options, and every tensor in the report's
    tensor map must use at most `tiles_per_ipu - num_io_tiles` tiles.
    """
    num_io_tiles = 128

    def my_net(a, b):
      with ipu_shard(0):
        aa = math_ops.matmul(a, a, transpose_b=True, name="aa")
      with ipu_shard(1):
        bb = math_ops.matmul(b, b, transpose_b=True, name="bb")
      return aa, bb

    first_in = array_ops.placeholder(np.float32, [1216, 1])
    second_in = array_ops.placeholder(np.float32, [1216, 1])

    with ops.device("/device:IPU:0"):
      compiled_net = ipu_compiler.compile(my_net, inputs=[first_in, second_in])

    # Profiling is enabled so that the report can be parsed afterwards.
    ipu_config = ipu_utils.create_ipu_config(profiling=True)
    ipu_config = ipu_utils.set_gcl_options(ipu_config,
                                           num_io_tiles=num_io_tiles)
    ipu_config = ipu_utils.auto_select_ipus(ipu_config, num_ipus=2)
    ipu_utils.configure_ipu_system(ipu_config)

    with session.Session() as sess:
      report = ReportJSON(self, sess, configure_device=False)
      report.reset()

      feed = {
          first_in: np.ones(first_in.shape),
          second_in: np.ones(second_in.shape),
      }
      sess.run(compiled_net, feed)

      report.parse_log()
      max_tiles = report.get_num_tiles_per_ipu() - num_io_tiles
      for tensor in report.get_tensor_map().all_tensors():
        self.assertLessEqual(len(tensor.tiles), max_tiles)
示例#4
0
    def testPrefixPathWithElementwiseInPath(self):
        """Add `z * s` to the output of a 1x1 all-ones convolution.

        `z` has the same shape as the convolution input and `s` is a scalar
        placeholder fed with 2.0; the result is compared against hard-coded
        expected values.
        """
        with self.session() as sess:
            with ops.device("/device:IPU:0"):
                conv_in = array_ops.placeholder(np.float32, shape=[1, 4, 4, 2])
                addend = array_ops.placeholder(np.float32, shape=[1, 4, 4, 2])
                scale = array_ops.placeholder(np.float32, shape=[])

                with variable_scope.variable_scope("vs", use_resource=True):
                    conv = layers.Conv2D(
                        2,
                        1,
                        use_bias=True,
                        kernel_initializer=init_ops.ones_initializer())
                    conv_out = conv(conv_in)
                output = conv_out + addend * scale

            utils.configure_ipu_system(utils.create_ipu_config())

            sess.run(variables.global_variables_initializer())

            feed = {
                conv_in: np.reshape(np.arange(32), [1, 4, 4, 2]),
                addend: np.reshape(np.arange(32), [1, 4, 4, 2]),
                scale: 2.0,
            }
            actual = sess.run(output, feed)

            # Confirmed with values on the CPU.
            expected = [[[[1., 3.], [9., 11.], [17., 19.], [25., 27.]],
                         [[33., 35.], [41., 43.], [49., 51.], [57., 59.]],
                         [[65., 67.], [73., 75.], [81., 83.], [89., 91.]],
                         [[97., 99.], [105., 107.], [113., 115.],
                          [121., 123.]]]]
            self.assertAllClose(actual, expected)
示例#5
0
    def testStatefulGradientAccumulate(self):
        """Run a 10-iteration while loop using ipu_stateful_gradient_accumulate.

        One loop variable adds the op's output (num_mini_batches=5,
        verify_usage=False) on each iteration, the other adds plain ones.
        With an input of [10] the test expects the counter to end at 10 and
        both loop variables to end at [20].
        """
        with self.session() as sess:
            dtype = np.float32

            def my_net(y):
                def cond(step, acc, plain):
                    # Only the counter decides termination.
                    del acc, plain
                    return step < 10

                def body(step, acc, plain):
                    accumulated = (
                        gen_poputil_ops.ipu_stateful_gradient_accumulate(
                            array_ops.ones_like(acc),
                            num_mini_batches=5,
                            verify_usage=False))
                    acc = acc + accumulated
                    plain = plain + array_ops.ones_like(acc)
                    return (step + 1, acc, plain)

                return control_flow_ops.while_loop(cond, body, (0, y, y))

            with ops.device('cpu'):
                y = array_ops.placeholder(dtype, [1])

            utils.configure_ipu_system(utils.create_ipu_config())

            with ops.device("/device:IPU:0"):
                compiled = xla.compile(my_net, inputs=[y])

            outputs = sess.run(compiled, {y: [10]})
            self.assertEqual(outputs[0], 10)
            self.assertAllEqual(outputs[1], [20])
            self.assertAllEqual(outputs[2], [20])
示例#6
0
    def testStatefulGradientAccumulateInvalidUse(self):
        """Expect FailedPreconditionError from ipu_stateful_gradient_accumulate.

        The op is called inside a while loop with only `num_mini_batches=5`
        (no `verify_usage` argument); running the compiled computation must
        raise an error whose message matches
        "The .*IpuStatefulGradientAccumulate op".
        """
        with self.session() as sess:
            dtype = np.float32

            def my_net(y):
                def cond(step, acc, plain):
                    # Only the counter decides termination.
                    del acc, plain
                    return step < 10

                def body(step, acc, plain):
                    accumulated = (
                        gen_poputil_ops.ipu_stateful_gradient_accumulate(
                            array_ops.ones_like(acc), num_mini_batches=5))
                    acc = acc + accumulated
                    plain = plain + array_ops.ones_like(acc)
                    return (step + 1, acc, plain)

                return control_flow_ops.while_loop(cond, body, (0, y, y))

            with ops.device('cpu'):
                y = array_ops.placeholder(dtype, [1])

            utils.configure_ipu_system(utils.create_ipu_config())

            with ops.device("/device:IPU:0"):
                compiled = xla.compile(my_net, inputs=[y])

            expected_message = "The .*IpuStatefulGradientAccumulate op"
            with self.assertRaisesRegex(errors.FailedPreconditionError,
                                        expected_message):
                sess.run(compiled, {y: [10]})
示例#7
0
    def testReportEveryNthExecution_Every1(self):
        """With report_every_nth_execution=1, each run must yield a report.

        The addition is executed five times; the event trace is then expected
        to contain five EXECUTE events and five execution reports.
        """
        num_executions = 5

        with self.session() as sess:
            with ops.device("/device:IPU:0"):
                pa = array_ops.placeholder(np.float32, [2, 2], name="a")
                pb = array_ops.placeholder(np.float32, [2, 2], name="b")
                out = math_ops.add(pa, pb)

            with ops.device('cpu'):
                trace = gen_ipu_ops.ipu_event_trace()

            ipu_config = utils.create_ipu_config(
                profiling=True,
                profile_execution=True,
                report_every_nth_execution=1,
                use_poplar_text_report=False)
            utils.configure_ipu_system(ipu_config)

            feed = {pa: [[1., 1.], [2., 3.]], pb: [[0., 1.], [4., 5.]]}

            # Fetch the trace once before the executions; its result is unused.
            sess.run(trace, feed)

            for _ in range(num_executions):
                sess.run(out, feed)

            raw_events = sess.run(trace, feed)
            parsed = tu.ReportJSON(self)
            event_counts = parsed.parse_events(raw_events)
            self.assertEqual(event_counts[IpuTraceEvent.EXECUTE],
                             num_executions)
            self.assertEqual(len(parsed.get_execution_reports()),
                             num_executions,
                             "Every execution should have generated a report")
示例#8
0
    def testCrossReplicaAndStatefulGradientAccumulate(self):
        """Chain ipu_cross_replica_sum into ipu_stateful_gradient_accumulate.

        Inside a 10-iteration while loop the cross-replica sum of ones is fed
        to the accumulate op (num_mini_batches=5) and added to the loop
        variable. With two IPUs selected and an input of [10], the test
        expects the counter to end at 10 and the loop variable at [30].
        """
        with self.session() as sess:
            dtype = np.float32

            def my_net(y):
                def cond(step, value):
                    # Only the counter decides termination.
                    del value
                    return step < 10

                def body(step, value):
                    summed = gen_popops_ops.ipu_cross_replica_sum(
                        array_ops.ones_like(value))
                    accumulated = (
                        gen_poputil_ops.ipu_stateful_gradient_accumulate(
                            summed, num_mini_batches=5))
                    value = value + accumulated
                    return (step + 1, value)

                return control_flow_ops.while_loop(cond, body, (0, y))

            with ops.device('cpu'):
                y = array_ops.placeholder(dtype, [1])

            ipu_config = utils.auto_select_ipus(utils.create_ipu_config(),
                                                num_ipus=2)
            utils.configure_ipu_system(ipu_config)

            with ops.device("/device:IPU:0"):
                compiled = xla.compile(my_net, inputs=[y])

            outputs = sess.run(compiled, {y: [10]})
            self.assertEqual(outputs[0], 10)
            self.assertAllEqual(outputs[1], [30])
示例#9
0
    def testPrefixPathWithReshape(self):
        """Add a placeholder to the flattened output of a 1x1 all-ones convolution.

        The convolution output is reshaped to [32] before `z` (shape [32]) is
        added; the fetched result is compared against hard-coded expected
        values.
        """
        with self.session() as sess:
            with ops.device("/device:IPU:0"):
                conv_in = array_ops.placeholder(np.float32, shape=[1, 4, 4, 2])
                addend = array_ops.placeholder(np.float32, shape=[32])

                with variable_scope.variable_scope("vs", use_resource=True):
                    conv = layers.Conv2D(
                        2,
                        1,
                        use_bias=True,
                        kernel_initializer=init_ops.ones_initializer())
                    conv_out = conv(conv_in)
                flattened = gen_array_ops.reshape(conv_out, [32])
                output = flattened + addend

            utils.configure_ipu_system(utils.create_ipu_config())

            sess.run(variables.global_variables_initializer())

            feed = {
                conv_in: np.reshape(np.arange(32), [1, 4, 4, 2]),
                addend: np.ones([32]),
            }
            actual = sess.run(output, feed)

            # Confirmed with values on the CPU.
            expected = [
                2., 2., 6., 6., 10., 10., 14., 14., 18., 18., 22., 22., 26.,
                26., 30., 30., 34., 34., 38., 38., 42., 42., 46., 46., 50.,
                50., 54., 54., 58., 58., 62., 62.
            ]
            self.assertAllClose(actual, expected)
示例#10
0
    def testCborReport(self):
        """Check the first byte of the reports when CBOR reporting is enabled.

        After one execution of an addition, the event trace is expected to
        hold four events, and both the compilation report and the execution
        report must start with byte value 217.
        """
        with self.session() as sess:
            with ops.device("/device:IPU:0"):
                pa = array_ops.placeholder(np.float32, [2, 2], name="a")
                pb = array_ops.placeholder(np.float32, [2, 2], name="b")
                out = math_ops.add(pa, pb)

            with ops.device('cpu'):
                trace = gen_ipu_ops.ipu_event_trace()

            ipu_config = utils.create_ipu_config(profiling=True,
                                                 profile_execution=True,
                                                 use_poplar_text_report=False,
                                                 use_poplar_cbor_report=True)
            utils.configure_ipu_system(ipu_config)

            feed = {pa: [[1., 1.], [2., 3.]], pb: [[0., 1.], [4., 5.]]}

            # Fetch the trace once before the execution; its result is unused.
            sess.run(trace, feed)

            sess.run(out, feed)

            raw_events = sess.run(trace, feed)
            events = utils.extract_all_events(raw_events)
            self.assertEqual(len(events), 4)  # engine, begin, end, execute

            # Indexing a bytes object keeps the comparison type consistent
            # with indexing the serialized reports.
            expected_first_byte = bytes(bytearray([217]))[0]
            self.assertEqual(events[1].compile_end.compilation_report[0],
                             expected_first_byte)
            self.assertEqual(events[3].execute.execution_report[0],
                             expected_first_byte)
示例#11
0
    def testIpuModelDeviceWithMultipleReport(self):
        """Run two separate IPU computations and count the resulting trace events.

        `pa + pb` and `pa - pb` are fetched in separate `sess.run` calls; the
        event trace op carries control dependencies on both outputs and is
        expected to return eight events.
        """
        with self.session() as sess:
            with ops.device("/device:IPU:0"):
                pa = array_ops.placeholder(np.float32, [2, 2], name="a")
                pb = array_ops.placeholder(np.float32, [2, 2], name="b")
                total = pa + pb
                difference = pa - pb

            with ops.device('cpu'):
                with ops.control_dependencies([total, difference]):
                    trace = gen_ipu_ops.ipu_event_trace()

            ipu_config = utils.create_ipu_config(profiling=True,
                                                 profile_execution=True)
            utils.configure_ipu_system(ipu_config)

            feed = {pa: [[1., 1.], [2., 3.]], pb: [[0., 1.], [4., 5.]]}

            # Fetch the trace once up front; its result is unused.
            sess.run(trace, feed)

            sum_value = sess.run(total, feed)
            self.assertAllClose(sum_value, [[1., 2.], [6., 8.]])

            diff_value, raw_events = sess.run([difference, trace], feed)
            self.assertAllClose(diff_value, [[1., 0.], [-2., -2.]])

            # 2x engine, 2x compile_begin, 2x compile_end, 2x load engine
            self.assertEqual(len(raw_events), 8)
示例#12
0
    def testIpuEventsWithoutPoplarReporting(self):
        """Check that events are emitted without reports when profiling is off.

        The system is configured with `profiling=False` and
        `enable_ipu_events=True`. After one execution three events are
        expected, and the compile-end and execute events must carry empty
        compilation/execution reports.
        """
        with self.session() as sess:
            with ops.device("/device:IPU:0"):
                pa = array_ops.placeholder(np.float32, [2, 2], name="a")
                pb = array_ops.placeholder(np.float32, [2, 2], name="b")
                out = math_ops.add(pa, pb)

            with ops.device('cpu'):
                trace = gen_ipu_ops.ipu_event_trace()

            ipu_config = utils.create_ipu_config(profiling=False,
                                                 enable_ipu_events=True)
            utils.configure_ipu_system(ipu_config)

            feed = {pa: [[1., 1.], [2., 3.]], pb: [[0., 1.], [4., 5.]]}

            # Fetch the trace once before the execution; its result is unused.
            sess.run(trace, feed)

            sess.run(out, feed)

            raw_events = sess.run(trace, feed)
            events = utils.extract_all_events(raw_events)
            # compile begin, compile end, execute
            self.assertEqual(len(events), 3)

            for event in events:
                if event.type == IpuTraceEvent.COMPILE_END:
                    self.assertFalse(event.compile_end.compilation_report)
                if event.type == IpuTraceEvent.EXECUTE:
                    self.assertFalse(event.execute.execution_report)

            sess.close()
示例#13
0
    def testSendScalar(self, dtype):
        """Send a scalar from the IPU to the host and check what is received.

        Args:
            dtype: Element type used for the placeholder and for the host-side
                receive op.

        The send op is expected to produce no output, and the received value
        must be a rank-0 array of `dtype` equal to 1.
        """
        # Attributes shared by the send op and the matching receive op.
        channel = dict(tensor_name="test_tensor",
                       send_device="/device:IPU:0",
                       send_device_incarnation=0,
                       recv_device="/device:CPU:0")

        with self.session() as sess:

            def device_fn(x):
                return gen_sendrecv_ops.ipu_send_to_host(x, **channel)

            inputs = array_ops.placeholder(dtype=dtype, shape=())

            with ipu_scope("/device:IPU:0"):
                send_op = ipu_compiler.compile(device_fn, inputs=[inputs])

            with ops.device("/device:CPU:0"):
                recv_op = gen_sendrecv_ops.ipu_recv_at_host(T=dtype, **channel)

            utils.configure_ipu_system(utils.create_ipu_config())

            sent, received = sess.run([send_op, recv_op],
                                      feed_dict={inputs: 1})

            self.assertIsNone(sent)  # Send op has no output
            self.assertEqual(dtype, received.dtype)
            self.assertEqual(0, len(received.shape))
            self.assertEqual(1, received)
示例#14
0
    def testVectorInputOutput(self):
        """Round-trip a length-2 vector through an outside-compilation scope.

        The input is doubled on the IPU, cast to float64 and incremented by 2
        inside `outside_compilation_scope`, cast back to float32 and
        incremented by 2 again. For input [1.0, 2.0] the expected output is
        [6.0, 8.0].
        """
        with self.session() as sess:

            def device_fn(x):
                with ipu_scope("/device:IPU:0"):
                    doubled = x + x
                    with outside_compilation_scope():
                        # Use float64 which is not supported on IPU
                        wide = math_ops.cast(doubled, dtype=dtypes.float64)
                        two = constant_op.constant(2.0,
                                                   dtype=dtypes.float64,
                                                   shape=(2, ))
                        wide = wide + two
                        narrowed = math_ops.cast(wide, dtype=dtypes.float32)
                    shifted = narrowed + 2.0
                return shifted

            inputs = array_ops.placeholder(dtype=dtypes.float32, shape=(2, ))
            [device_out] = ipu_compiler.compile(device_fn, inputs=[inputs])

            utils.configure_ipu_system(utils.create_ipu_config())

            actual = sess.run(device_out, feed_dict={inputs: [1.0, 2.0]})
            self.assertEqual((2, ), actual.shape)
            self.assertAllEqual([6.0, 8.0], actual)
示例#15
0
    def testSentTensorIsUsedAfterReceive(self):
        """Use a tensor on the IPU after it was consumed in an outside-compilation scope.

        For an input of 2.0 the squared value (4) is used to compute 5 inside
        `outside_compilation_scope`, and the two are then multiplied; the
        expected result is 20.
        """
        with self.session() as sess:

            def device_fn(x):
                with ipu_scope("/device:IPU:0"):
                    squared = x * x  # 4

                    with outside_compilation_scope():
                        incremented = squared + 1.0  # 5

                    # Use `squared` after receiving `incremented` and make sure
                    # that we still have the correct value of `squared` (i.e. it
                    # is not overwritten by the receive, in which case we would
                    # get 25).
                    product = squared * incremented  # 20

                    return product

            inputs = array_ops.placeholder(dtype=dtypes.float32, shape=())
            [out] = ipu_compiler.compile(device_fn, inputs=[inputs])

            utils.configure_ipu_system(utils.create_ipu_config())

            actual = sess.run(out, feed_dict={inputs: 2.0})
            self.assertEqual(20.0, actual)
示例#16
0
    def testTwoInputsTwoOutputs(self):
        """Pass two scalars through one outside-compilation scope and back.

        Each input is squared, incremented (by 1.0 and 2.0 respectively)
        inside `outside_compilation_scope`, then scaled (by 1.0 and 2.0).
        For inputs 1.0 and 2.0 the expected outputs are 2.0 and 12.0.
        """
        with self.session() as sess:

            def device_fn(x1, x2):
                with ipu_scope("/device:IPU:0"):
                    first = x1 * x1
                    second = x2 * x2
                    with outside_compilation_scope():
                        first = first + 1.0
                        second = second + 2.0
                    first = first * 1.0
                    second = second * 2.0
                    return first, second

            input1 = array_ops.placeholder(dtype=dtypes.float32, shape=())
            input2 = array_ops.placeholder(dtype=dtypes.float32, shape=())
            out1, out2 = ipu_compiler.compile(device_fn,
                                              inputs=[input1, input2])

            ipu_config = utils.set_optimization_options(
                utils.create_ipu_config(), max_send_recv_cluster_size=8)
            utils.configure_ipu_system(ipu_config)

            feed = {input1: 1.0, input2: 2.0}
            res1, res2 = sess.run([out1, out2], feed_dict=feed)
            self.assertEqual(2.0, res1)
            self.assertEqual(12.0, res2)
示例#17
0
    def testPipelineIterationsNotMultiple(self):
        """Expect a three-stage Grouped pipeline with depth 10 to be rejected.

        Running the compiled pipeline must raise FailedPreconditionError with
        a message stating that the pipeline depth must be a multiple of 3.
        """
        def dataset_parser(value):
            return {"a": value, "b": (value + 10.) / 2.0}

        dataset = tu.create_single_increasing_dataset(5, shape=[4, 4, 2])
        dataset = dataset.batch(batch_size=2, drop_remainder=True)
        dataset = dataset.map(dataset_parser)

        infeed_queue = ipu_infeed_queue.IPUInfeedQueue(dataset, "__feed1")
        outfeed_queue = ipu_outfeed_queue.IPUOutfeedQueue("__feed1")

        def stage1(c, **kwargs):
            with variable_scope.variable_scope("vs", use_resource=True):
                conv = layers.Conv2D(
                    2,
                    1,
                    use_bias=True,
                    kernel_initializer=init_ops.ones_initializer(),
                    name='conv1')
                y = conv(kwargs["a"])
                return y + kwargs["b"], c

        def stage2(x, c):
            return math_ops.reduce_sum(x) + c

        def stage3(x):
            return x

        def my_net(c):
            stages = [stage1, stage2, stage3]
            # The second positional argument (10) is the pipeline depth that
            # the expected error message refers to.
            return pipelining_ops.pipeline(
                stages,
                10,
                inputs=[c],
                infeed_queue=infeed_queue,
                outfeed_queue=outfeed_queue,
                pipeline_schedule=pipelining_ops.PipelineSchedule.Grouped)

        with ops.device('cpu'):
            c = array_ops.placeholder(np.float32, shape=[])

        with tu.ipu_session() as sess:

            with ops.device("/device:IPU:0"):
                compiled = ipu_compiler.compile(my_net, inputs=[c])

            ipu_config = utils.create_ipu_config(profiling=True,
                                                 profile_execution=True)
            ipu_config = utils.auto_select_ipus(ipu_config, 4)
            utils.configure_ipu_system(ipu_config)
            utils.move_variable_initialization_to_cpu()

            sess.run(variables.global_variables_initializer())
            sess.run(infeed_queue.initializer)

            expected_message = (
                'The pipeline depth of the pipeline must be a multiple of 3')
            with self.assertRaisesRegex(errors.FailedPreconditionError,
                                        expected_message):
                sess.run(compiled, {c: 10.01})
示例#18
0
def training_graph(opts, training_data):
    """Build the training graph, configure the IPU system and open a session.

    Args:
        opts: Options object. This function reads `batches_per_step`,
            `compiler_report` and `logs_path`, and forwards the object to
            `training_data.get_dataset`, `graph_builder` and
            `util.get_config`.
        training_data: Object whose `get_dataset(opts, is_training=True)`
            returns a three-element tuple; the first element is used as the
            infeed dataset and the third as the placeholder dictionary.

    Returns:
        A `GraphOps` built from the graph, the session, the variable
        initializer, the `[loss, summary, rmse]` fetch list, the
        placeholders, the infeed queue, the saver and the summary writer.
    """
    graph = tf.Graph()

    with graph.as_default():

        dataset, _, placeholders = training_data.get_dataset(
            opts, is_training=True)
        infeed = ipu_infeed_queue.IPUInfeedQueue(dataset,
                                                 "training_dataset_infeed", 0)

        def body(total_loss_, sum_rmse_metric, *args, **kwargs):
            # Only the first positional infeed tensor is consumed.
            step_loss, step_rmse, apply_grads_ = graph_builder(
                opts,
                observed_ratings=args[0],
                learning_rate=placeholders["learning_rate"],
                type='TRAIN')
            # Tie the returned sums to the gradient update so that it runs.
            with tf.control_dependencies([apply_grads_]):
                return total_loss_ + step_loss, sum_rmse_metric + step_rmse

        def comp_fn():
            initial_sums = [tf.constant(0, tf.float32) for _ in range(2)]
            return loops.repeat(opts.batches_per_step, body, initial_sums,
                                infeed)

        with ipu_scope('/device:IPU:0'):
            loss_total, rmse_total = ipu_compiler.compile(comp_fn, [])

        # Average the per-step sums over the number of batches in the loop.
        rmse = rmse_total / opts.batches_per_step
        loss = loss_total / opts.batches_per_step

        tf.summary.scalar("loss", loss)
        tf.summary.scalar("learning_rate", placeholders["learning_rate"])
        tf.summary.scalar("RMSE/train", rmse)

        if opts.compiler_report:
            ipu_ops.ipu_compile_summary('compile_summary', loss)

        merged_summary = tf.summary.merge_all()
        saver = tf.train.Saver()

        ipu_utils.move_variable_initialization_to_cpu()
        init_op = tf.global_variables_initializer()

    writer = tf.summary.FileWriter(opts.logs_path + '/train',
                                   graph=graph,
                                   flush_secs=30)

    ipu_utils.configure_ipu_system(
        util.get_config(opts, profiling=opts.compiler_report))
    sess = tf.Session(graph=graph)

    fetches = [loss, merged_summary, rmse]
    return GraphOps(graph, sess, init_op, fetches, placeholders, infeed,
                    saver, writer)
示例#19
0
def _gradient_accumulation_loop(test_wrapper,
                                fwd_fn,
                                inputs_fn,
                                input_values,
                                repeat_count,
                                num_batches_to_accumulate,
                                dataset_fn,
                                optimizer,
                                num_iterations=None):
  """Run `fwd_fn` in a repeat loop with gradient accumulation; return the outfed losses.

  Args:
    test_wrapper: Object providing `test_session(graph=...)`.
    fwd_fn: Callable producing the loss from the loop inputs and infeed
      elements.
    inputs_fn: Callable returning the list of input tensors that are fed
      with `input_values`.
    input_values: Values fed for the tensors returned by `inputs_fn`, paired
      by position.
    repeat_count: Used with `num_batches_to_accumulate` to derive the
      iteration count when `num_iterations` is None.
    num_batches_to_accumulate: Passed to `GradientAccumulationOptimizerV2`.
    dataset_fn: Callable returning the dataset for the infeed queue.
    optimizer: Optimizer wrapped by `GradientAccumulationOptimizerV2`.
    num_iterations: Loop iteration count; defaults to
      `repeat_count * num_batches_to_accumulate`.

  Returns:
    The result of running the outfeed dequeue op after the loop has run.
  """
  if num_iterations is None:
    num_iterations = repeat_count * num_batches_to_accumulate

  graph = ops.Graph()

  with graph.as_default(), test_wrapper.test_session(graph=graph) as sess:
    dataset = dataset_fn()
    inputs = inputs_fn()
    infeed_queue = ipu_infeed_queue.IPUInfeedQueue(dataset, next_feed_id())
    outfeed_queue = ipu_outfeed_queue.IPUOutfeedQueue(next_feed_id())

    with variable_scope.variable_scope("ipu", use_resource=True, reuse=False):

      def model(*args):
        loss = fwd_fn(*functional_ops._convert_to_list(args))  # pylint: disable=W0212
        enqueue_op = outfeed_queue.enqueue(loss)
        accumulating_opt = (
            gradient_accumulation_optimizer.GradientAccumulationOptimizerV2(
                optimizer, num_batches_to_accumulate))
        # Drop the trailing infeed elements; the remaining leading arguments
        # are passed back as loop outputs.
        num_loop_inputs = len(args) - infeed_queue.number_of_tuple_elements
        outs = list(args[:num_loop_inputs])
        outs.append(enqueue_op)
        outs.append(accumulating_opt.minimize(loss))
        return outs

      def my_net(*args):
        return loops.repeat(num_iterations,
                            model,
                            inputs=args,
                            infeed_queue=infeed_queue)

    with ops.device("/device:IPU:0"):
      compiled_loop = ipu_compiler.compile(my_net, inputs=inputs)

    dequeue_op = outfeed_queue.dequeue()

    # Profiling flags follow whatever `running_on_ipu_model` reports.
    profiling = utils.running_on_ipu_model()

    ipu_config = utils.create_ipu_config(profiling=profiling,
                                         profile_execution=profiling)
    ipu_config = utils.set_ipu_model_options(ipu_config,
                                             compile_ipu_code=True,
                                             tiles_per_ipu=128)
    ipu_config = utils.auto_select_ipus(ipu_config, 1)
    utils.configure_ipu_system(ipu_config)
    utils.move_variable_initialization_to_cpu()

    sess.run(variables.global_variables_initializer())
    sess.run(infeed_queue.initializer)
    feed = dict(zip(inputs, input_values))
    sess.run(compiled_loop, feed_dict=feed)
    return sess.run(dequeue_op)
示例#20
0
def validation_graph(opts, valid_data):
    """Build the validation graph and open a session for it.

    Note that this sets `opts.apply_dropout = False` on the caller's object.
    The TF device ordinal is 0 when `opts.multiprocessing` is truthy and 1
    otherwise, and the IPU system is only configured here when
    `opts.multiprocessing` is truthy.

    Args:
        opts: Options object. This function reads `multiprocessing`,
            `validation_batches_per_step` and `logs_path`, and forwards the
            object to `valid_data.get_dataset`, `graph_builder` and
            `util.get_config`.
        valid_data: Object whose `get_dataset(opts, is_training=False)`
            returns a three-element tuple; only the first element (the
            dataset) is used.

    Returns:
        A `GraphOps` built from the graph, the session, the variable
        initializer, the `[rmse, summary]` fetch list, None in the
        placeholders position, the infeed queue, the saver and the summary
        writer.
    """
    # Do not apply dropout during validation
    opts.apply_dropout = False

    graph = tf.Graph()
    if opts.multiprocessing:
        tf_device_ordinal = 0
    else:
        tf_device_ordinal = 1

    with graph.as_default():
        dataset, _, _ = valid_data.get_dataset(opts, is_training=False)
        infeed = ipu_infeed_queue.IPUInfeedQueue(
            dataset, "validation_dataset_infeed", tf_device_ordinal)

        def body(sum_rmse_metric, *args, **kwargs):
            # The first infeed tensor is split in two along axis 1: observed
            # ratings first, ground truth second.
            observed_ratings, ground_truth = tf.split(
                args[0], num_or_size_splits=2, axis=1)
            step_rmse = graph_builder(opts,
                                      observed_ratings=observed_ratings,
                                      ground_truth=ground_truth,
                                      type='VALID')
            return sum_rmse_metric + step_rmse

        def comp_fn():
            initial_sum = [tf.constant(0, tf.float32)]
            return loops.repeat(opts.validation_batches_per_step, body,
                                initial_sum, infeed)

        with ipu_scope('/device:IPU:{}'.format(tf_device_ordinal)):
            (rmse_total,) = ipu_compiler.compile(comp_fn, [])

        # Accuracy Ops
        rmse = rmse_total / opts.validation_batches_per_step

        summary_op = tf.summary.scalar("RMSE/validation", rmse)
        saver = tf.train.Saver()

        ipu_utils.move_variable_initialization_to_cpu()
        init_op = tf.global_variables_initializer()

    writer = tf.summary.FileWriter(opts.logs_path + '/valid',
                                   graph=graph,
                                   flush_secs=30)

    ipu_options = util.get_config(opts, False)
    if opts.multiprocessing:
        ipu_utils.configure_ipu_system(ipu_options)
    sess = tf.Session(graph=graph)

    fetches = [rmse, summary_op]
    return GraphOps(graph, sess, init_op, fetches, None, infeed, saver,
                    writer)
示例#21
0
 def _configureIPU(self, serialization_folder, verification_options=None):
     """Configure the IPU system with connection type NEVER and serialization.

     Args:
         serialization_folder: Passed to `utils.set_serialization_options`.
         verification_options: When truthy, transfer options are set with
             `True` and these verification options are applied as well.
     """
     # NOTE(review): the trailing 1 is presumably the IPU version - confirm
     # against `set_ipu_connection_type`.
     ipu_config = utils.set_ipu_connection_type(
         utils.create_ipu_config(), utils.DeviceConnectionType.NEVER, 1)
     ipu_config = utils.set_serialization_options(ipu_config,
                                                  serialization_folder)
     if verification_options:
         ipu_config = utils.set_transfer_options(ipu_config, True)
         ipu_config = utils.set_verification_options(ipu_config,
                                                     verification_options)
     utils.configure_ipu_system(ipu_config)
示例#22
0
 def configureIPU(self,
                  serialization_folder=None,
                  offline_compilation=True):
     """Configure the IPU system, optionally offline and/or with serialization.

     Args:
         serialization_folder: When truthy, passed to
             `utils.set_serialization_options`.
         offline_compilation: When truthy, the connection type is set to
             `DeviceConnectionType.NEVER`.
     """
     ipu_config = utils.create_ipu_config()
     if offline_compilation:
         # NOTE(review): the trailing 1 is presumably the IPU version -
         # confirm against `set_ipu_connection_type`.
         never_connect = utils.DeviceConnectionType.NEVER
         ipu_config = utils.set_ipu_connection_type(ipu_config,
                                                    never_connect, 1)
     if serialization_folder:
         ipu_config = utils.set_serialization_options(ipu_config,
                                                      serialization_folder)
     utils.configure_ipu_system(ipu_config)
示例#23
0
    def testResetSeed(self):
        """Check that every dropout output is distinct across replicas, repeats and runs.

        A constant all-ones dataset feeds two dropout ops per loop iteration.
        Their outputs are dequeued from the outfeed after each of the EXECS
        runs, and the raw bytes of every row of length SIZE are collected in
        a set. The test expects `2 * REPLICAS * REPEATS * EXECS` rows in
        total and the same number of distinct rows.
        """
        # The dataset for feeding the graphs
        ds = dataset_ops.Dataset.from_tensors(
            array_ops.constant(1.0, shape=[SIZE]))
        ds = ds.map(lambda x: [x, x])
        ds = ds.repeat()

        # The host side queues
        infeed_queue = ipu_infeed_queue.IPUInfeedQueue(
            ds, feed_name="infeed", replication_factor=REPLICAS)
        outfeed_queue = ipu_outfeed_queue.IPUOutfeedQueue(
            feed_name="outfeed", replication_factor=REPLICAS)

        # The device side
        def body(x1, x2):
            d1 = rand_ops.dropout(x1)
            d2 = rand_ops.dropout(x2)
            outfeed = outfeed_queue.enqueue({'d1': d1, 'd2': d2})
            return outfeed

        def my_net():
            r = loops.repeat(REPEATS, body, [], infeed_queue)
            return r

        with scopes.ipu_scope('/device:IPU:0'):
            res = ipu_compiler.compile(my_net, inputs=[])

        # The outfeed dequeue has to happen after the outfeed enqueue
        dequeue_outfeed = outfeed_queue.dequeue()

        # Configure the hardware
        config = utils.create_ipu_config(profiling=True)
        config = utils.auto_select_ipus(config, REPLICAS)
        config = utils.set_floating_point_behaviour_options(config)
        utils.configure_ipu_system(config)

        with session.Session() as sess:
            res_all = set()
            total = 0

            sess.run(infeed_queue.initializer)

            for _ in range(EXECS):
                sess.run(res)
                outfed_result = sess.run(dequeue_outfeed)
                # Flatten both outfeed entries into rows of length SIZE.
                rows = np.array(list(outfed_result.values())).reshape(
                    [-1, SIZE])
                for r in rows:
                    total += 1
                    # `tobytes` replaces the deprecated `tostring` alias
                    # (removed in NumPy 2.0); both return the same bytes.
                    res_all.add(r.tobytes())

            # 2 dropouts per replica * REPLICAS * REPEATS * EXECS
            expected = 2 * REPLICAS * REPEATS * EXECS
            self.assertEqual(total, expected)
            self.assertEqual(len(res_all), expected)
示例#24
0
def generic_train_graph(opts, is_training):
    """Build the training graph, configure the IPU system and open a session.

    Args:
        opts: Mapping of options. The keys read here are
            ``'use_synthetic_data'``, ``'replicas'``, ``'batches_per_step'``
            and ``'use_ipu_model'``; the mapping is also forwarded to the
            embedding, dataset and graph builder helpers.
        is_training: Forwarded to ``id_embedding``. The real dataset is
            always requested with ``is_training=True``.

    Returns:
        A 4-tuple ``(graph_ops, uid_embedding, mid_embedding,
        cat_embedding)`` where ``graph_ops`` is a ``GraphOps`` bundling the
        session, initializer, averaged training outputs, placeholders,
        infeed queue, outfeed (always ``None`` here) and saver.

    NOTE(review): ``seed`` is not a parameter; it is resolved from an
    enclosing scope that is not visible here -- confirm it is defined.
    """
    graph = tf.Graph()
    with graph.as_default():
        placeholders = {
            "learning_rate": tf.compat.v1.placeholder('float32', shape=[]),
        }
        uid_embedding, mid_embedding, cat_embedding = id_embedding(
            opts, is_training, seed)

        dataset_train = (get_synthetic_dataset(opts)
                         if opts['use_synthetic_data']
                         else get_dataset_embed(opts, is_training=True))

        infeed_train = ipu_infeed_queue.IPUInfeedQueue(
            dataset_train,
            feed_name='DIN_dataset_infeed_train',
            replication_factor=opts['replicas'])

        with ipu_scope('/device:IPU:0'):
            def comp_fn():
                def body(total_loss, total_aux_loss, total_accuracy,
                         uids, mids, cats, mid_his, cat_his, mid_mask,
                         target, seqlen):
                    _, loss, aux_loss, accuracy, grad_op = graph_builder(
                        opts, uid_embedding, mid_embedding, cat_embedding,
                        placeholders['learning_rate'],
                        uids, mids, cats, mid_his, cat_his, mid_mask,
                        target, seqlen,
                        use_negsampling=False)

                    # Tie the running totals to the gradient op so that
                    # evaluating them also runs the update.
                    with tf.control_dependencies([grad_op]):
                        return (total_loss + loss,
                                total_aux_loss + aux_loss,
                                total_accuracy + accuracy)

                # The same zero constant seeds all three accumulators.
                initial_totals = [tf.constant(0, np.float32)] * 3
                return loops.repeat(opts['batches_per_step'], body,
                                    initial_totals, infeed_train)

            outputs_train = ipu_compiler.compile(comp_fn, [])
            avg_loss, avg_aux_loss, avg_accuracy = [
                summed / opts['batches_per_step'] for summed in outputs_train
            ]
            outfeed = None

        saver = tf.compat.v1.train.Saver()
        utils.move_variable_initialization_to_cpu()
        init = tf.compat.v1.global_variables_initializer()

    if opts['use_ipu_model']:
        # NOTE(review): this replaces any flags already set in the variable.
        os.environ["TF_POPLAR_FLAGS"] = "--use_ipu_model"

    ipu_options = utils.create_ipu_config()
    ipu_options = utils.set_optimization_options(
        ipu_options, combine_embedding_lookups=True)
    ipu_options = utils.set_recomputation_options(
        ipu_options, allow_recompute=True)
    ipu_options = utils.auto_select_ipus(ipu_options, [opts['replicas']])
    utils.configure_ipu_system(ipu_options)
    if seed is not None:
        utils.reset_ipu_seed(seed)

    ops_train = [avg_loss, avg_aux_loss, avg_accuracy]
    sess = tf.compat.v1.Session(graph=graph)

    graph_ops = GraphOps(sess, init, ops_train, placeholders,
                         infeed_train, outfeed, saver)
    return graph_ops, uid_embedding, mid_embedding, cat_embedding
示例#25
0
    def setUpClass(cls):
        """Build a DenseNet inference graph once for the whole test class.

        The model is built on a float16 image placeholder, weights are
        restored from ``CHECKPOINT_PATH`` (downloaded first if the
        checkpoint index file is missing), the graph is frozen and
        batch norms are folded, and the optimized graph is re-imported
        under the IPU device. The resulting output tensor is stored on
        ``cls.output``.
        """
        # Input geometry and network layout
        height = width = 224
        channels = 3
        block_layout = (6, 12, 24, 16)
        cls.batch_size = 1
        cls.num_classes = 1000

        # Image input placeholder
        input_shape = (cls.batch_size, height, width, channels)
        cls.placeholder_input = tf.placeholder(dtype=tf.float16,
                                               shape=input_shape,
                                               name="image_input")

        # Compile and device options
        ipu_cfg = utils.create_ipu_config(profiling=False,
                                          use_poplar_text_report=False)
        utils.auto_select_ipus(ipu_cfg, [1])
        utils.configure_ipu_system(ipu_cfg)

        # Build the model graph on top of the placeholder
        cls.densenet_model = DenseNet(blocks=block_layout,
                                      num_classes=cls.num_classes,
                                      image_width=width,
                                      image_height=height,
                                      image_channels=channels)
        cls.densenet_model(cls.placeholder_input)

        # Locate the weights, fetching them when no checkpoint index exists
        ckpt = CHECKPOINT_PATH
        if not Path(ckpt + ".index").exists():
            print('Checkpoint file does not exist, attempting to download pre-trained weights')
            ckpt = get_densenet_weights(Path(ckpt))

        saver = tf.train.Saver()

        with tf.Session() as sess:
            saver.restore(sess, ckpt)
            logging.info('Restored imagenet weights.')

            logging.info('Starting graph optimization.')
            graph_def = tf.get_default_graph().as_graph_def()
            frozen = tf.compat.v1.graph_util.convert_variables_to_constants(
                sess, graph_def, output_node_names=["output-prob"])
            # Remove identity ops in initializers to allow fusing batch norm
            # with conv in the next line
            frozen = tf.compat.v1.graph_util.remove_training_nodes(frozen)
            optimized_graph_def = optimize_for_infer.fold_batch_norms(frozen)

            logging.info('Completed graph optimization.')

        # Start from a clean graph and import the optimized one for the IPU
        tf.reset_default_graph()
        with tf.device('/device:IPU:0'):
            with tf.variable_scope('', use_resource=True):
                imported = tf.import_graph_def(
                    optimized_graph_def,
                    input_map={},
                    name="optimized",
                    return_elements=["output-prob:0"])
                cls.output = imported[0]
示例#26
0
def get_report(loop_op: tf.Operation,
               infeed_queue_initializer: tf.Operation,
               outfeed_op: tf.Operation,
               report_dest: str,
               available_memory_proportion: Optional[float] = 0.6) -> None:
    """Generate report from running model on IPU and save to disk.

    The run is forced onto the IPU model by appending ``--use_ipu_model`` to
    the ``TF_POPLAR_FLAGS`` environment variable, then the infeed
    initializer, the loop op and the outfeed op are each run once and the
    IPU event trace is collected.

    Args:
        loop_op: Inference op to generate report on.
        infeed_queue_initializer: Initializer for the infeed queue
        outfeed_op: Outfeed operator.
        report_dest: Location to store report. Only used when the report
            mode is ``'text'``; otherwise the trace is handed to
            ``save_tf_report``.
        available_memory_proportion: Proportion of tile memory available as
            temporary memory for matmul and convolution execution

    NOTE(review): ``report_mode`` is not a parameter; it is resolved from an
    enclosing scope that is not visible here.
    """
    # Set compile and device options. Use .get() so that an unset variable
    # does not raise KeyError; an existing value is extended exactly as
    # before.
    os.environ["TF_POPLAR_FLAGS"] = (
        os.environ.get("TF_POPLAR_FLAGS", "") + " --use_ipu_model")
    text_report = report_mode == 'text'
    opts = ipu_utils.create_ipu_config(
        profiling=True,
        use_poplar_text_report=text_report,
        profile_execution=True)
    # The same proportion is applied to both matmuls and convolutions.
    memory_option = {
        "availableMemoryProportion": str(available_memory_proportion)
    }
    opts = ipu_utils.set_matmul_options(opts, matmul_options=memory_option)
    opts = ipu_utils.set_convolution_options(
        opts, convolution_options=dict(memory_option))
    ipu_utils.auto_select_ipus(opts, [1])
    ipu_utils.configure_ipu_system(opts)

    with tf.device('cpu'):
        report = gen_ipu_ops.ipu_event_trace()

    run_options = tf.RunOptions(report_tensor_allocations_upon_oom=True)
    # The context manager closes the session even if one of the runs fails.
    with tf.Session() as session:
        session.run(infeed_queue_initializer)
        session.run(loop_op, options=run_options)
        session.run(outfeed_op, options=run_options)
        out = session.run(report)

    if text_report:
        # extract the report
        rep = ipu_utils.extract_all_strings_from_event_trace(out)
        logging.info("Writing profiling report to %s", report_dest)
        with open(report_dest, "w") as f:
            f.write(rep)
    else:
        save_tf_report(out)
示例#27
0
  def testGatherLookupRandomize(self, y_0):
    """Compare an IPU embedding lookup against a gather built under the CPU
    device scope, with gather_simplifier enabled.

    Random indices and a random integer table are generated with NumPy; the
    lookup is compiled for the IPU with `xla.compile` and its first output
    must match `array_ops.gather` on the same values. The execution report
    is then checked against a list of expected compute set names.

    Args:
      y_0: Number of indices to look up. Presumably supplied by a test
        parameterization that is not visible here.
    """
    # Configure argument for targeting the IPU.
    # gather_simplifier is off in a fresh config and is switched on here.
    cfg = utils.create_ipu_config(profiling=True, profile_execution=True)
    self.assertFalse(cfg.enable_gather_simplifier)
    cfg = utils.set_optimization_options(cfg, gather_simplifier=True)
    self.assertTrue(cfg.enable_gather_simplifier)
    utils.configure_ipu_system(cfg)

    # Set test range shape: the lookup table is w_0 rows by w_1 columns.
    w_0 = 5
    w_1 = 10

    def network(w, y):
      g = nn.embedding_lookup(w, y)
      return g

    # Compare cpu gather vs ipu gather_simplifier.
    with self.session() as sess:
      with ops.device('cpu'):
        y = array_ops.placeholder(np.int32, shape=[y_0])
        w = array_ops.placeholder(np.int32, shape=[w_0, w_1])
        # NOTE(review): randint's `high` is exclusive, so indices fall in
        # [0, w_0 - 2] and the last table row is never selected -- confirm
        # this is intended.
        y_i = np.random.randint(low=0, high=w_0 - 1, size=y_0)
        w_i = np.reshape(np.random.randint(low=100, high=200, size=w_0 * w_1),
                         (w_0, w_1))
        # Reference result, built directly from the NumPy values.
        cpu_take = array_ops.gather(w_i, y_i)

        report = tu.ReportJSON(self, sess=sess, configure_device=False)

      with ops.device("/device:IPU:0"):
        r = xla.compile(network, inputs=[w, y])

      sess.run(variables.global_variables_initializer())
      # Reset the report just before the run that is being checked.
      report.reset()
      ipu_gather_simplifier = sess.run(r, {y: y_i, w: w_i})
      self.assertAllClose(ipu_gather_simplifier[0], cpu_take)

      report.parse_log()
      # pylint: disable=line-too-long

      # This tests gather simplifier hlo pass for embedding_lookup case.
      # It checks if "embedding_lookup/gather*/multiSlice" string was
      # replaced by "embedding_lookup/multi-slice/*/multiSlice".
      ok = [
          'embedding_lookup/multi-slice/output/multiSlice/*',
          '__seed/set/setMasterSeed',
          'host-exchange-local-copy-',
      ]
      # With a single index the 'host-exchange-local-copy-' entry is not
      # expected, so it is dropped from the list.
      if y_0 == 1:
        ok = ok[:-1]
      # pylint: enable=line-too-long
      report.assert_all_compute_sets_and_list(ok)
示例#28
0
    def testIpuModelDevice(self):
        """Add two 2x2 float32 placeholders under the IPU device scope and
        check the element-wise sum returned by the session."""
        with self.session() as sess:
            with ops.device("/device:IPU:0"):
                lhs = array_ops.placeholder(np.float32, [2, 2], name="a")
                rhs = array_ops.placeholder(np.float32, [2, 2], name="b")
                summed = lhs + rhs

            # The system is configured after the graph is built but before
            # anything is run.
            utils.configure_ipu_system(
                utils.create_ipu_config(profiling=True))

            feeds = {
                lhs: [[1., 1.], [2., 3.]],
                rhs: [[0., 1.], [4., 5.]],
            }
            self.assertAllClose(sess.run(summed, feeds),
                                [[1., 2.], [6., 8.]])
示例#29
0
def train():
    """Build a one-iteration IPU loop around ``model2``, run it and print
    the dequeued outfeed result.

    The loop body ignores the values dequeued from the infeed and enqueues
    ``model2(time_steps_ph)`` on the outfeed under the key ``'z3'``. The
    IPU system is configured for a single auto-selected IPU and seeded with
    ``SEED`` before the session is created.
    """
    graph = tf.Graph()
    with graph.as_default():
        # NOTE(review): this pipeline is built but never consumed; the
        # infeed queue below is fed from get_data_set() instead. Confirm
        # which of the two is intended.
        dataset = tf.data.Dataset.from_tensors(tf.constant(1, shape=[]))
        dataset = dataset.map(lambda x: [x, x])
        dataset = dataset.batch(BS, drop_remainder=True)
        dataset = dataset.repeat()

        infeed_queue = ipu_infeed_queue.IPUInfeedQueue(get_data_set(),
                                                       feed_name="infeed")
        outfeed_queue = ipu_outfeed_queue.IPUOutfeedQueue(feed_name='outfeed')
        time_steps_ph = tf.placeholder(tf.int32, shape=[])

        def compile_fn():
            def body(x, y):
                z3 = model2(time_steps_ph)
                return outfeed_queue.enqueue({'z3': z3})

            return loops.repeat(1, body, [], infeed_queue)

        # The compile call itself must sit inside the IPU scope so that the
        # generated ops are placed on the IPU device.
        with ipu_scope('/device:IPU:0'):
            outputs = ipu_compiler.compile(compile_fn, [])

        # Both calls only see what already exists in the graph, so they
        # have to come after compilation to cover variables created inside
        # compile_fn.
        utils.move_variable_initialization_to_cpu()
        init = tf.global_variables_initializer()

        dequeue_outfeed = outfeed_queue.dequeue()

    ipu_options = utils.create_ipu_config(
        profiling=False,
        profile_execution=False,
        max_cross_replica_sum_buffer_size=10000000,
        max_inter_ipu_copies_buffer_size=10000000)
    ipu_options = utils.auto_select_ipus(ipu_options, 1)
    utils.configure_ipu_system(ipu_options)
    utils.reset_ipu_seed(SEED)

    # The context manager releases the session when the run finishes or
    # fails.
    with tf.Session(graph=graph) as sess:
        sess.run(init)
        sess.run(infeed_queue.initializer)

        steps = 6
        for _ in range(steps):
            sess.run(outputs, feed_dict={time_steps_ph: 3})
            result = sess.run(dequeue_outfeed)
            print(result)
            # NOTE(review): the loop has always stopped after the first
            # step, so only one of `steps` iterations runs. Kept as-is;
            # drop the break if all steps are meant to execute.
            break
示例#30
0
def run_language_model(opts):
    """Configure the IPU system from ``opts`` and run training and/or testing.

    Attributes read from ``opts``: ``random_seed``, ``pipeline``,
    ``num_shards`` (overwritten with 1 when ``pipeline`` is falsy),
    ``compile_only``, ``compile_only_ipu_version``, ``recompute``,
    ``debug_dense_grad`` and ``mode``.

    Raises:
        AttributeError: If ``opts.compile_only`` is set but
            ``opts.compile_only_ipu_version`` is ``None``.
    """
    if opts.random_seed is not None:
        utils.reset_ipu_seed(opts.random_seed)

    # Setup and acquire an IPU device:
    logging.info("Acquiring devices")
    if not opts.pipeline:
        opts.num_shards = 1  # FIX-ME enable sparse models using multiple shards

    # Make sure that no matter the number of shards/stages required, we always
    # acquire a power of 2 ipus (else attachment will fail): keep doubling
    # from 1 until the shard count is covered.
    num_ipus = 1
    while num_ipus < opts.num_shards:
        num_ipus *= 2
    logger.info(f"Need {opts.num_shards} IPUs, requesting {num_ipus}")
    config = utils.create_ipu_config()

    if opts.compile_only:
        if opts.compile_only_ipu_version is None:
            raise AttributeError(
                "Must provide --compile-only-ipu-version if --compile-only is set."
            )

        # Compile-only runs use the NEVER connection type.
        config = utils.set_ipu_connection_type(
            config,
            utils.DeviceConnectionType.NEVER,
            ipu_version=opts.compile_only_ipu_version,
            enable_remote_buffers=True)

    config = utils.auto_select_ipus(config, num_ipus)
    config = utils.set_recomputation_options(
        config, allow_recompute=opts.recompute)

    # Enable stochastic rounding; every other flag is switched off.
    floating_point_flags = {
        "inv": False,
        "div0": False,
        "oflo": False,
        "esr": True,
        "nanoo": False,
    }
    config = utils.set_floating_point_behaviour_options(
        config, **floating_point_flags)
    config = sparse.set_system_config(
        config, custom_op_debug_printing=opts.debug_dense_grad)
    utils.configure_ipu_system(config)

    transformer = DynsparseTransformer(opts)
    if opts.mode in ("all", "train"):
        run_training(opts, transformer)

    if opts.mode in ("all", "test"):
        run_testing(opts, transformer)