Exemplo n.º 1
0
    def setUpClass(cls):
        """Build a frozen, optimized DenseNet-121 inference graph for the tests.

        Constructs the model on an fp16 image placeholder, restores
        pre-trained weights (downloading them if the checkpoint is missing),
        freezes variables to constants, folds batch norms into convolutions,
        and re-imports the optimized graph onto the IPU device, storing the
        output tensor on ``cls.output``.
        """
        # Set up input to the network
        img_width = img_height = 224
        img_channels = 3
        # Layer counts of the four DenseNet-121 dense blocks.
        densenet_121_blocks = (6, 12, 24, 16)
        cls.batch_size = 1
        cls.num_classes = 1000
        # Set up image input placeholder (fp16, NHWC layout)
        cls.placeholder_input = tf.placeholder(dtype=tf.float16,
                                               shape=(cls.batch_size, img_height, img_width, img_channels),
                                               name="image_input")

        # Set compile and device options
        opts = utils.create_ipu_config(profiling=False, use_poplar_text_report=False)
        utils.auto_select_ipus(opts, [1])
        utils.configure_ipu_system(opts)

        # Construct Densenet model
        cls.densenet_model = DenseNet(blocks=densenet_121_blocks, num_classes=cls.num_classes,
                                      image_width=img_width, image_height=img_height, image_channels=img_channels)

        # Calling the model builds the forward graph on the placeholder.
        cls.densenet_model(cls.placeholder_input)

        # Restore weights
        checkpoint_file = CHECKPOINT_PATH

        # TF checkpoints are multi-file; the ".index" file marks existence.
        if not Path(checkpoint_file + ".index").exists():
            print('Checkpoint file does not exist, attempting to download pre-trained weights')
            checkpoint_file = get_densenet_weights(Path(checkpoint_file))

        # Create test session
        saver = tf.train.Saver()

        with tf.Session() as sess:
            saver.restore(sess, checkpoint_file)
            logging.info('Restored imagenet weights.')

            # Optimize inference graph: freeze variables into constants so
            # batch norms can be folded into the preceding convolutions.
            logging.info('Starting graph optimization.')
            densenet_graph_def = tf.get_default_graph().as_graph_def()
            frozen_graph_def = tf.compat.v1.graph_util.convert_variables_to_constants(sess, densenet_graph_def,
                                                                                      output_node_names=["output-prob"])
            # Remove identity ops in initializers to allow fusing batch norm with conv in the next line
            frozen_graph_def = tf.compat.v1.graph_util.remove_training_nodes(frozen_graph_def)
            optimized_graph_def = optimize_for_infer.fold_batch_norms(frozen_graph_def)

            logging.info('Completed graph optimization.')

        # Import the optimized graph into a fresh default graph on the IPU.
        tf.reset_default_graph()
        with tf.device('/device:IPU:0'):
            with tf.variable_scope('', use_resource=True):
                cls.output = tf.import_graph_def(optimized_graph_def, input_map={}, name="optimized",
                                                 return_elements=["output-prob:0"])[0]
Exemplo n.º 2
0
def get_report(loop_op: tf.Operation,
               infeed_queue_initializer: tf.Operation,
               outfeed_op: tf.Operation,
               report_dest: str,
               available_memory_proportion: Optional[float] = 0.6) -> None:
    """Generate report from running model on IPU and save to disk.

    Args:
        loop_op: Inference op to generate report on.
        infeed_queue_initializer: Initializer for the infeed queue
        outfeed_op: Outfeed operator.
        report_dest: Location to store report.
        available_memory_proportion: Proportion of tile memory available as temporary memory
        for matmul and convolution execution

    """
    # Set compile and device options.
    # Bug fix: append via .get() — `os.environ[key] += ...` raises KeyError
    # when TF_POPLAR_FLAGS is not already set in the environment.
    os.environ["TF_POPLAR_FLAGS"] = (
        os.environ.get("TF_POPLAR_FLAGS", "") + " --use_ipu_model")
    # NOTE(review): `report_mode` is read from module scope — confirm it is
    # defined elsewhere in this file before this function runs.
    use_poplar_text_report = report_mode == 'text'
    opts = ipu_utils.create_ipu_config(
        profiling=True,
        use_poplar_text_report=use_poplar_text_report,
        profile_execution=True)
    opts = ipu_utils.set_matmul_options(opts,
                                        matmul_options={
                                            "availableMemoryProportion":
                                            str(available_memory_proportion)
                                        })
    opts = ipu_utils.set_convolution_options(
        opts,
        convolution_options={
            "availableMemoryProportion": str(available_memory_proportion)
        })
    ipu_utils.auto_select_ipus(opts, [1])
    ipu_utils.configure_ipu_system(opts)

    # The event-trace op must be placed on the host CPU.
    with tf.device('cpu'):
        report = gen_ipu_ops.ipu_event_trace()

    run_options = tf.RunOptions(report_tensor_allocations_upon_oom=True)
    session = tf.Session()
    session.run(infeed_queue_initializer)
    session.run(loop_op, options=run_options)
    session.run(outfeed_op, options=run_options)
    out = session.run(report)
    if report_mode == 'text':
        # extract the report
        rep = ipu_utils.extract_all_strings_from_event_trace(out)
        logging.info("Writing profiling report to %s" % report_dest)
        with open(report_dest, "w") as f:
            f.write(rep)
    else:
        save_tf_report(out)
Exemplo n.º 3
0
    def testTrainReplicated(self):
        """Train a 4-replica IPUEstimator graph and verify the session-run
        count and the per-loop averaged loss values from the events file."""
        if ipu_utils.running_on_ipu_model():
            self.skipTest(
                "Replicated top level graphs are not supported on the "
                "IPU_MODEL target")

        def my_model_fn(features, labels, mode):  # pylint: disable=unused-argument
            # Loss is the cross-replica sum of the features; no weights
            # are trained — the train op is just an identity on the loss.
            self.assertEqual(model_fn_lib.ModeKeys.TRAIN, mode)

            loss = ipu.ops.cross_replica_ops.cross_replica_sum(features,
                                                               name="loss")

            train_op = array_ops.identity(loss)

            return model_fn_lib.EstimatorSpec(mode=mode,
                                              loss=loss,
                                              train_op=train_op)

        def my_input_fn():
            # Increasing-valued dataset, one element per batch.
            dataset = tu.create_dual_increasing_dataset(10,
                                                        data_shape=[1],
                                                        label_shape=[1])
            dataset = dataset.batch(batch_size=1, drop_remainder=True)
            return dataset

        # 4 replicas, 2 training iterations per session run.
        ipu_options = ipu_utils.create_ipu_config()
        ipu_options = ipu_utils.auto_select_ipus(ipu_options, 4)
        config = ipu_run_config.RunConfig(
            ipu_run_config=ipu_run_config.IPURunConfig(
                iterations_per_loop=2, num_replicas=4,
                ipu_options=ipu_options),
            log_step_count_steps=1,
            save_summary_steps=1)

        estimator = ipu_estimator.IPUEstimator(model_fn=my_model_fn,
                                               config=config)

        session_run_counter = _SessionRunCounter()

        num_steps = 6
        estimator.train(input_fn=my_input_fn,
                        steps=num_steps,
                        hooks=[session_run_counter])

        # One session run is expected per iterations_per_loop steps.
        self.assertEqual(
            session_run_counter.num_session_runs,
            num_steps // config.ipu_run_config.iterations_per_loop)

        # Read the recorded loss values back from the single events file.
        model_dir = estimator.model_dir
        events_file = glob.glob(model_dir + "/*tfevents*")
        assert len(events_file) == 1
        events_file = events_file[0]
        loss_output = list()
        for e in summary_iterator.summary_iterator(events_file):
            for v in e.summary.value:
                if "loss" in v.tag:
                    loss_output.append(v.simple_value)

        # loss is averaged across iterations per loop
        self.assertEqual(loss_output, [14.0, 16.0, 18.0])
Exemplo n.º 4
0
    def testCrossReplicaAndStatefulGradientAccumulate(self):
        """Combine a cross-replica sum with stateful gradient accumulation
        inside a 10-iteration while loop running on 2 IPUs."""
        with self.session() as sess:
            dtype = np.float32

            def my_net(y):
                def cond(i, y):
                    del y
                    return i < 10

                def body(i, y):
                    # Sum a tensor of ones across both replicas, then feed
                    # it through the stateful accumulator, which gathers
                    # 5 mini-batches before releasing a value.
                    cr = gen_popops_ops.ipu_cross_replica_sum(
                        array_ops.ones_like(y))
                    ga = gen_poputil_ops.ipu_stateful_gradient_accumulate(
                        cr, num_mini_batches=5)
                    y = y + ga
                    i = i + 1
                    return (i, y)

                i = 0
                return control_flow_ops.while_loop(cond, body, (i, y))

            with ops.device('cpu'):
                y = array_ops.placeholder(dtype, [1])

            opts = utils.create_ipu_config()
            opts = utils.auto_select_ipus(opts, num_ipus=2)
            utils.configure_ipu_system(opts)

            with ops.device("/device:IPU:0"):
                r = xla.compile(my_net, inputs=[y])

            y = sess.run(r, {y: [10]})
            # Final loop counter is 10; accumulated result is 30
            # (presumably 10 + two accumulator releases of 2 replicas * 5
            # mini-batches each — verify against the op's semantics).
            self.assertEqual(y[0], 10)
            self.assertAllEqual(y[1], [30])
Exemplo n.º 5
0
  def testIoTilesAreExcludedFromShard(self):
    """Verify tensors are only mapped to compute tiles when I/O tiles are
    reserved via the GCL options."""
    def my_net(a, b):
      # One matmul per shard so both IPUs are exercised.
      with ipu_shard(0):
        aa = math_ops.matmul(a, a, transpose_b=True, name="aa")
      with ipu_shard(1):
        bb = math_ops.matmul(b, b, transpose_b=True, name="bb")
      return aa, bb

    input_a = array_ops.placeholder(np.float32, [1216, 1])
    input_b = array_ops.placeholder(np.float32, [1216, 1])

    with ops.device("/device:IPU:0"):
      compiled_net = ipu_compiler.compile(my_net, inputs=[input_a, input_b])

    # Reserve 128 tiles per IPU for I/O.
    num_io_tiles = 128
    cfg = ipu_utils.create_ipu_config(profiling=True)
    cfg = ipu_utils.set_gcl_options(cfg, num_io_tiles=num_io_tiles)
    cfg = ipu_utils.auto_select_ipus(cfg, num_ipus=2)
    ipu_utils.configure_ipu_system(cfg)

    with session.Session() as sess:
      report = ReportJSON(self, sess, configure_device=False)
      report.reset()

      sess.run(compiled_net, {
          input_a: np.ones(input_a.shape),
          input_b: np.ones(input_b.shape)
      })

      report.parse_log()
      # No tensor may span more tiles than the compute (non-I/O) set.
      num_compute_tiles = report.get_num_tiles_per_ipu() - num_io_tiles
      for t in report.get_tensor_map().all_tensors():
        self.assertLessEqual(len(t.tiles), num_compute_tiles)
Exemplo n.º 6
0
    def test_ipu_horovod_strategy(self):
        """IPUHorovodStrategy: variables take the first worker's initial
        value and reductions aggregate across all Horovod processes."""
        hvd_size = hvd.size()
        hvd_rank = hvd.rank()

        strategy = IPUHorovodStrategy()
        self.assertEqual(strategy.num_replicas_in_sync, hvd_size)

        cfg = ipu_utils.create_ipu_config()
        cfg = ipu_utils.auto_select_ipus(cfg, num_ipus=1)
        ipu_utils.configure_ipu_system(cfg)

        with strategy.scope():

            def per_replica_fn():
                # Rank-dependent initializer: ranks would start at
                # different values unless the strategy broadcasts rank 0's.
                w = variable_scope.get_variable(name="w",
                                                initializer=hvd_rank + 1.0)
                self.assertEqual("/replica:0/task:0/device:IPU:0", w.device)
                return w * w

            per_replica_val = strategy.experimental_run_v2(per_replica_fn)
            strategy_sum = strategy.reduce(ReduceOp.SUM, per_replica_val)
            strategy_mean = strategy.reduce(ReduceOp.MEAN, per_replica_val)

            with session.Session() as sess:
                sess.run(variables.global_variables_initializer())

                # All workers should have the initial value from the first worker.
                self.assertEqual([1.0], sess.run(variables.global_variables()))
                self.assertEqual(1.0 * hvd_size, strategy_sum.eval())
                self.assertEqual(1.0, strategy_mean.eval())
Exemplo n.º 7
0
  def testNumUniqueDevicesBelowNumShardsRange(self):
    """A pipeline whose device mapping uses fewer unique devices than
    `IPURunConfig.num_shards` must raise a ValueError."""
    def model_fn_with_zero_stages(mode):
      def optimizer_function():
        pass

      # device_mapping [0, 1, 0] uses only 2 unique devices, while the
      # run config below asks for 4 shards.
      return IPUPipelineEstimatorSpec(mode,
                                      computational_stages=[],
                                      gradient_accumulation_count=1,
                                      device_mapping=[0, 1, 0],
                                      optimizer_function=optimizer_function)

    def my_input_fn():
      return dataset_ops.Dataset.from_tensor_slices(([0], [0]))

    ipu_options = ipu_utils.create_ipu_config()
    ipu_options = ipu_utils.auto_select_ipus(ipu_options, num_ipus=4)
    config = ipu_run_config.RunConfig(
        ipu_run_config=ipu_run_config.IPURunConfig(
            num_shards=4, iterations_per_loop=1, ipu_options=ipu_options))

    estimator = IPUPipelineEstimator(model_fn=model_fn_with_zero_stages,
                                     config=config)

    with self.assertRaisesRegex(
        ValueError, r"This pipeline requires 2 devices, but "
        "`IPURunConfig.num_shards` was set to 4"):
      estimator.train(input_fn=my_input_fn, steps=1)
Exemplo n.º 8
0
def get_config(opts):
    """Build and return an IPU device configuration from parsed options.

    Profiling, device selection, per-op Poplar options and half-precision
    partials are all driven by fields of `opts`.
    """
    do_profile = opts.report

    cfg = utils.create_ipu_config(profiling=do_profile,
                                  profile_execution=do_profile,
                                  report_every_nth_execution=1)

    # Pin to an explicit device id, or auto-select enough IPUs to cover
    # every shard of every replica.
    if opts.device_id != -1:
        cfg = utils.select_ipus(cfg, [opts.device_id])
    else:
        cfg = utils.auto_select_ipus(cfg, opts.shards * opts.replicas)

    # Optional per-op Poplar options, supplied as JSON strings.
    if opts.convolution_options:
        cfg = utils.set_convolution_options(cfg,
                                            json.loads(opts.convolution_options))
    if opts.matmul_options:
        cfg = utils.set_matmul_options(cfg, json.loads(opts.matmul_options))

    # Request half-precision partials for both matmuls and convolutions.
    if opts.enable_half_partials:
        cfg = utils.set_matmul_options(cfg, {"partialsType": 'half'})
        cfg = utils.set_convolution_options(cfg, {"partialsType": 'half'})
    return cfg
Exemplo n.º 9
0
def get_config(report_n=1):
    """Build an IPU configuration with profiling disabled.

    Args:
        report_n: Generate a report every n-th execution.

    Returns:
        An ipu_options object targeting one auto-selected IPU.
    """
    cfg = utils.create_ipu_config(profiling=False,
                                  use_poplar_text_report=False,
                                  report_every_nth_execution=report_n)
    return utils.auto_select_ipus(cfg, [1])
Exemplo n.º 10
0
    def testPipelineIterationsNotMultiple(self):
        """Running a 3-stage pipeline for an iteration count that is not a
        multiple of the stage count must fail at execution time."""
        dataset = tu.create_single_increasing_dataset(5, shape=[4, 4, 2])
        dataset = dataset.batch(batch_size=2, drop_remainder=True)

        def dataset_parser(value):
            # Emit a dict so stage1 receives its inputs as keyword args.
            a = value
            b = (value + 10.) / 2.0
            return {"a": a, "b": b}

        dataset = dataset.map(dataset_parser)
        infeed_queue = ipu_infeed_queue.IPUInfeedQueue(dataset, "__feed1")
        outfeed_queue = ipu_outfeed_queue.IPUOutfeedQueue("__feed1")

        def stage1(c, **kwargs):
            with variable_scope.variable_scope("vs", use_resource=True):
                y = layers.Conv2D(
                    2,
                    1,
                    use_bias=True,
                    kernel_initializer=init_ops.ones_initializer(),
                    name='conv1')(kwargs["a"])
                return y + kwargs["b"], c

        def stage2(x, c):
            return math_ops.reduce_sum(x) + c

        def stage3(x):
            return x

        def my_net(c):
            # 10 iterations is deliberately NOT a multiple of the 3 stages.
            return pipelining_ops.pipeline(
                [stage1, stage2, stage3],
                10,
                inputs=[c],
                infeed_queue=infeed_queue,
                outfeed_queue=outfeed_queue,
                pipeline_schedule=pipelining_ops.PipelineSchedule.Grouped)

        with ops.device('cpu'):
            c = array_ops.placeholder(np.float32, shape=[])

        with tu.ipu_session() as sess:

            with ops.device("/device:IPU:0"):
                r = ipu_compiler.compile(my_net, inputs=[c])

            cfg = utils.create_ipu_config(profiling=True,
                                          profile_execution=True)
            cfg = utils.auto_select_ipus(cfg, 4)
            utils.configure_ipu_system(cfg)
            utils.move_variable_initialization_to_cpu()

            sess.run(variables.global_variables_initializer())
            sess.run(infeed_queue.initializer)
            # The invalid depth is only detected when the pipeline runs.
            with self.assertRaisesRegex(
                    errors.FailedPreconditionError,
                    'The pipeline depth of the pipeline must be a multiple of 3'
            ):
                sess.run(r, {c: 10.01})
Exemplo n.º 11
0
def _gradient_accumulation_loop(test_wrapper,
                                fwd_fn,
                                inputs_fn,
                                input_values,
                                repeat_count,
                                num_batches_to_accumulate,
                                dataset_fn,
                                optimizer,
                                num_iterations=None):
  """Run `fwd_fn` in an on-IPU training loop with gradient accumulation.

  Args:
    test_wrapper: Test case providing `test_session()`.
    fwd_fn: Forward function returning the loss to minimize.
    inputs_fn: Callable returning the placeholder inputs for the loop.
    input_values: Values fed for the placeholders from `inputs_fn`.
    repeat_count: Number of weight updates to perform.
    num_batches_to_accumulate: Mini-batches accumulated per weight update.
    dataset_fn: Callable returning the dataset for the infeed queue.
    optimizer: Base optimizer, wrapped in GradientAccumulationOptimizerV2.
    num_iterations: Total loop iterations; defaults to
      repeat_count * num_batches_to_accumulate.

  Returns:
    The dequeued outfeed contents (the enqueued loss values).
  """
  g = ops.Graph()

  if num_iterations is None:
    num_iterations = repeat_count * num_batches_to_accumulate

  with g.as_default(), test_wrapper.test_session(graph=g) as session:
    dataset = dataset_fn()
    inputs = inputs_fn()
    infeed_queue = ipu_infeed_queue.IPUInfeedQueue(dataset, next_feed_id())
    outfeed_queue = ipu_outfeed_queue.IPUOutfeedQueue(next_feed_id())

    with variable_scope.variable_scope("ipu", use_resource=True, reuse=False):

      def model(*args):
        # args = loop-carried inputs followed by the infeed tuple elements.
        loss = fwd_fn(*functional_ops._convert_to_list(args))  # pylint: disable=W0212
        enqueue_op = outfeed_queue.enqueue(loss)
        opt = gradient_accumulation_optimizer.GradientAccumulationOptimizerV2(
            optimizer, num_batches_to_accumulate)
        # Pass the non-infeed args through as loop outputs, plus the
        # outfeed enqueue and the accumulated minimize op.
        outs = list(args[:len(args) - infeed_queue.number_of_tuple_elements])
        outs.append(enqueue_op)
        outs.append(opt.minimize(loss))
        return outs

      def my_net(*args):
        return loops.repeat(num_iterations,
                            model,
                            inputs=args,
                            infeed_queue=infeed_queue)

    with ops.device("/device:IPU:0"):
      loop_ret = ipu_compiler.compile(my_net, inputs=inputs)

    outfeed_op = outfeed_queue.dequeue()

    # Only enable profiling when running on the IPU simulator model.
    profiling = utils.running_on_ipu_model()

    cfg = utils.create_ipu_config(profiling=profiling,
                                  profile_execution=profiling)
    cfg = utils.set_ipu_model_options(cfg,
                                      compile_ipu_code=True,
                                      tiles_per_ipu=128)
    cfg = utils.auto_select_ipus(cfg, 1)
    utils.configure_ipu_system(cfg)
    utils.move_variable_initialization_to_cpu()

    session.run(variables.global_variables_initializer())
    session.run(infeed_queue.initializer)
    session.run(loop_ret, feed_dict=dict(zip(inputs, input_values)))
    return session.run(outfeed_op)
Exemplo n.º 12
0
    def testTrainWithAutomaticSharding(self):
        """Train a linear regression with automatic sharding across 4 IPUs
        and check that the recorded loss decreases."""
        if ipu_utils.running_on_ipu_model():
            self.skipTest(
                "Replicated top level graphs are not supported on the "
                "IPU_MODEL target")

        def my_model_fn(features, labels, mode):
            self.assertEqual(model_fn_lib.ModeKeys.TRAIN, mode)

            with variable_scope.variable_scope("vs", use_resource=True):
                predictions = layers.Dense(units=1)(features)

            loss = losses.mean_squared_error(labels=labels,
                                             predictions=predictions)
            # Wrap SGD so the backward pass respects the sharding.
            sharded_optimizer_obj = sharded_optimizer.ShardedOptimizer(
                gradient_descent.GradientDescentOptimizer(0.1))
            train_op = sharded_optimizer_obj.minimize(loss)

            return model_fn_lib.EstimatorSpec(mode=mode,
                                              loss=loss,
                                              train_op=train_op)

        def my_input_fn():
            dataset = dataset_ops.Dataset.from_tensor_slices(
                _create_regression_dataset(num_samples=1000, num_features=5))
            dataset = dataset.batch(batch_size=2, drop_remainder=True).repeat()
            return dataset

        ipu_options = ipu_utils.create_ipu_config()
        ipu_options = ipu_utils.auto_select_ipus(ipu_options, 4)

        # 4 shards with autosharding enabled; 2 iterations per loop.
        config = ipu_run_config.RunConfig(
            ipu_run_config=ipu_run_config.IPURunConfig(
                iterations_per_loop=2,
                num_shards=4,
                autosharding=True,
                ipu_options=ipu_options),
            log_step_count_steps=1,
            save_summary_steps=1)

        estimator = ipu_estimator.IPUEstimator(model_fn=my_model_fn,
                                               config=config)

        estimator.train(input_fn=my_input_fn, steps=10)

        # Read back the loss values from the single events file.
        model_dir = estimator.model_dir
        events_file = glob.glob(model_dir + "/*tfevents*")
        assert len(events_file) == 1
        events_file = events_file[0]
        loss_output = list()
        for e in summary_iterator.summary_iterator(events_file):
            for v in e.summary.value:
                if "loss" in v.tag:
                    loss_output.append(v.simple_value)

        # Training should reduce the loss from first to last value.
        self.assertTrue(loss_output[0] > loss_output[-1])
Exemplo n.º 13
0
def get_ipu_config(fp_exceptions=True,
                   stochastic_rounding=True,
                   xla_recompute=False,
                   available_memory_proportion=None,
                   disable_graph_outlining=False,
                   num_ipus_required=0,
                   max_cross_replica_sum_buffer_size=0,
                   scheduler_selection='',
                   compile_only=False,
                   partials_type="half"):
    """Assemble and return the IPU system configuration (ipu_options).

    Covers device selection, per-op memory/partials options, norm and
    recomputation settings, an optional compile-only connection mode, and
    floating-point exception / stochastic-rounding behaviour.
    """
    cfg = utils.create_ipu_config(
        max_report_size=3001819596000,
        merge_infeed_io_copies=True,
        always_rearrange_copies_on_the_host=False,
        selection_order=utils.SelectionOrder.AUTO,
        disable_graph_outlining=disable_graph_outlining,
        max_cross_replica_sum_buffer_size=max_cross_replica_sum_buffer_size,
        scheduler_selection=scheduler_selection)

    # Claim the requested number of IPUs and clear any cached pass type.
    cfg = utils.auto_select_ipus(cfg, num_ipus_required)
    cfg = utils.set_matmul_options(cfg, clear_pass_type=True)

    # Apply identical memory/partials settings to convs and matmuls.
    if available_memory_proportion is not None:
        cfg = utils.set_convolution_options(
            cfg, {
                "availableMemoryProportion": str(available_memory_proportion),
                "partialsType": partials_type
            })
        cfg = utils.set_matmul_options(
            cfg, {
                "availableMemoryProportion": str(available_memory_proportion),
                "partialsType": partials_type
            })

    cfg = utils.set_norm_options(cfg, use_stable_statistics=True)
    cfg = utils.set_recomputation_options(cfg, allow_recompute=xla_recompute)

    # Compile without attaching to a physical device.
    if compile_only:
        cfg = utils.set_ipu_connection_type(
            cfg,
            utils.DeviceConnectionType.NEVER,
            ipu_version=2,
            enable_remote_buffers=True)

    cfg = utils.set_floating_point_behaviour_options(
        cfg,
        inv=fp_exceptions,
        div0=fp_exceptions,
        oflo=fp_exceptions,
        esr=stochastic_rounding,
        nanoo=fp_exceptions)
    return cfg
Exemplo n.º 14
0
    def testResetSeed(self):
        """Every dropout mask across replicas, loop repeats and executions
        should be unique — no seed is reused."""
        # The dataset for feeding the graphs
        ds = dataset_ops.Dataset.from_tensors(
            array_ops.constant(1.0, shape=[SIZE]))
        ds = ds.map(lambda x: [x, x])
        ds = ds.repeat()

        # The host side queues
        infeed_queue = ipu_infeed_queue.IPUInfeedQueue(
            ds, feed_name="infeed", replication_factor=REPLICAS)
        outfeed_queue = ipu_outfeed_queue.IPUOutfeedQueue(
            feed_name="outfeed", replication_factor=REPLICAS)

        # The device side: two independent dropouts per replica per repeat.
        def body(x1, x2):
            d1 = rand_ops.dropout(x1)
            d2 = rand_ops.dropout(x2)
            outfeed = outfeed_queue.enqueue({'d1': d1, 'd2': d2})
            return outfeed

        def my_net():
            r = loops.repeat(REPEATS, body, [], infeed_queue)
            return r

        with scopes.ipu_scope('/device:IPU:0'):
            res = ipu_compiler.compile(my_net, inputs=[])

        # The outfeed dequeue has to happen after the outfeed enqueue
        dequeue_outfeed = outfeed_queue.dequeue()

        # Configure the hardware
        config = utils.create_ipu_config(profiling=True)
        config = utils.auto_select_ipus(config, REPLICAS)
        config = utils.set_floating_point_behaviour_options(config)
        utils.configure_ipu_system(config)

        with session.Session() as sess:
            res_all = set()
            total = 0

            sess.run(infeed_queue.initializer)

            for _ in range(EXECS):
                sess.run(res)
                outfed_result = sess.run(dequeue_outfeed)
                # Deduplicate masks via their raw byte representation.
                for r in np.array(list(outfed_result.values())).reshape(
                    [-1, SIZE]):
                    total += 1
                    res_all.add(r.tostring())

            # 2 dropouts per replica * REPLICAS * REPEATS * EXECS
            expected = 2 * REPLICAS * REPEATS * EXECS
            self.assertEqual(total, expected)
            # Uniqueness: the set size equals the number of masks seen.
            self.assertEqual(len(res_all), expected)
Exemplo n.º 15
0
    def testReplicatedEvaluationOnHost(self):
        """Evaluate host-computed metrics for a 4-replica IPUEstimator and
        check the aggregated metric and loss values."""
        if ipu_utils.running_on_ipu_model():
            self.skipTest(
                "Replicated top level graphs are not supported on the "
                "IPU_MODEL target")

        def my_input_fn():
            # 8 examples, batch 2 -> one batch per replica at 4 replicas.
            features = [0, 0, 0, 1, 0, 0, 0, 1]
            labels = [0, 1, 0, 1, 0, 1, 0, 1]
            return dataset_ops.Dataset.from_tensor_slices(
                (features, labels)).batch(2, drop_remainder=True)

        def my_metrics_fn(features, labels):
            # Labels are cast to int64 for the recall_at_k metrics.
            labels64 = math_ops.cast(labels, np.int64)
            return {
                "accuracy": metrics_impl.accuracy(labels, features),
                "precision": metrics_impl.precision(labels, features),
                "recall": metrics_impl.recall(labels, features),
                "recall_at_1": metrics_impl.recall_at_k(labels64,
                                                        features,
                                                        k=1),
                "recall_at_2": metrics_impl.recall_at_k(labels64,
                                                        features,
                                                        k=2),
                "mse": metrics_impl.mean_squared_error(labels, features),
                "rmse": metrics_impl.root_mean_squared_error(labels, features),
            }

        def my_model_fn(features, labels, mode):
            # Loss is the replica index (0..3), so the mean loss is 1.5.
            loss = math_ops.cast(replication_ops.replication_index(),
                                 np.float32)
            eval_metrics = (my_metrics_fn, [features, labels])
            return ipu_estimator.IPUEstimatorSpec(mode,
                                                  loss=loss,
                                                  eval_metrics=eval_metrics)

        ipu_options = ipu_utils.create_ipu_config()
        ipu_options = ipu_utils.auto_select_ipus(ipu_options, num_ipus=4)
        config = ipu_run_config.RunConfig(
            ipu_run_config=ipu_run_config.IPURunConfig(
                iterations_per_loop=1, num_replicas=4,
                ipu_options=ipu_options))

        estimator = ipu_estimator.IPUEstimator(model_fn=my_model_fn,
                                               config=config)
        scores = estimator.evaluate(my_input_fn, steps=1)
        self.assertEqual(0.75, scores["accuracy"])
        self.assertEqual(1.0, scores["precision"])
        self.assertEqual(0.5, scores["recall"])
        self.assertEqual(0.5, scores["recall_at_1"])
        self.assertEqual(1.0, scores["recall_at_2"])
        self.assertEqual(0.25, scores["mse"])
        self.assertEqual(0.5, scores["rmse"])
        self.assertEqual(1.5, scores[model_fn_lib.LOSS_METRIC_KEY])
Exemplo n.º 16
0
def generic_train_graph(opts, is_training):
    """Build the DIN training graph, configure the IPU and create a session.

    Args:
        opts: Option dict; reads 'use_synthetic_data', 'replicas',
            'batches_per_step', 'use_ipu_model', among others.
        is_training: Whether to build the embeddings in training mode.

    Returns:
        A tuple (GraphOps, uid_embedding, mid_embedding, cat_embedding).

    NOTE(review): `seed` is read from module scope — confirm it is defined
    in this file before this function is called.
    """
    data_type = 'float32'
    train_graph = tf.Graph()
    with train_graph.as_default():
        placeholders = {}
        placeholders["learning_rate"] = tf.compat.v1.placeholder(data_type, shape=[])
        uid_embedding, mid_embedding, cat_embedding = id_embedding(opts, is_training, seed)

        # Choose between synthetic data and the real embedded dataset.
        if opts['use_synthetic_data']:
            dataset_train = get_synthetic_dataset(opts)
        else:
            dataset_train = get_dataset_embed(opts, is_training=True)

        infeed_train = ipu_infeed_queue.IPUInfeedQueue(dataset_train, feed_name = 'DIN_dataset_infeed_train', replication_factor = (opts['replicas']))

        with ipu_scope('/device:IPU:0'):
            def comp_fn():
                def body(total_loss, total_aux_loss, total_accuracy, uids, mids, cats, mid_his, cat_his, mid_mask, target, seqlen):
                    prob, loss, aux_loss, accuracy, grad_op = graph_builder(opts, uid_embedding, mid_embedding, cat_embedding, placeholders['learning_rate'], uids, mids, cats, mid_his, cat_his, mid_mask, target, seqlen, use_negsampling=False)

                    # Only accumulate metrics after the gradient op has run.
                    with tf.control_dependencies([grad_op]):
                        return total_loss + loss, total_aux_loss + aux_loss, total_accuracy + accuracy

                # Run `batches_per_step` batches on device per session call,
                # summing loss/aux_loss/accuracy from zero.
                return loops.repeat(opts['batches_per_step'], body, [tf.constant(0, getattr(np, 'float32'))] * 3, infeed_train)

            outputs_train = ipu_compiler.compile(comp_fn, [])
            # Convert the on-device sums into per-batch averages.
            avg_loss, avg_aux_loss, avg_accuracy = [x / opts['batches_per_step'] for x in outputs_train]
            outfeed = None

        saver = tf.compat.v1.train.Saver()
        utils.move_variable_initialization_to_cpu()
        init = tf.compat.v1.global_variables_initializer()

    if opts['use_ipu_model']:
        os.environ["TF_POPLAR_FLAGS"] = "--use_ipu_model"
    ipu_options = utils.create_ipu_config()
    ipu_options = utils.set_optimization_options(ipu_options,
                                                 combine_embedding_lookups=True)
    ipu_options = utils.set_recomputation_options(ipu_options, allow_recompute=True)
    ipu_options = utils.auto_select_ipus(ipu_options, [opts['replicas']])
    utils.configure_ipu_system(ipu_options)
    if seed is not None:
        utils.reset_ipu_seed(seed)

    ops_train = [avg_loss, avg_aux_loss, avg_accuracy]
    sess = tf.compat.v1.Session(graph=train_graph)

    return GraphOps(sess,
                    init,
                    ops_train,
                    placeholders,
                    infeed_train,
                    outfeed,
                    saver), uid_embedding, mid_embedding, cat_embedding
Exemplo n.º 17
0
def get_config(prng=False,
               ipu_id=-1,
               shards=1,
               number_of_replicas=1,
               max_cross_replica_buffer_size=10*1024*1024,
               merge_infeed_io_copies=True,
               fp_exceptions=True,
               xla_recompute=False,
               seed=None,
               profile=None,
               availableMemoryProportion=None,
               stable_norm=False):
    """Builds ipu_options.

    Args:
        prng: Enable the on-device PRNG / stochastic rounding.
        ipu_id: Specific IPU device id, or -1 to auto-select.
        shards: Number of shards per replica.
        number_of_replicas: Number of graph replicas.
        max_cross_replica_buffer_size: Max cross-replica sum buffer (bytes).
        merge_infeed_io_copies: Merge infeed I/O copies.
        fp_exceptions: Trap invalid-op / div0 / overflow FP exceptions.
        xla_recompute: Allow recomputation of activations.
        seed: When set, request deterministic workers.
        profile: Key into the execution-profile modes below, or None.
        availableMemoryProportion: Proportion of tile memory for conv
            temporaries, or None to leave the default.
        stable_norm: Use numerically stable statistics for norms.

    Returns:
        The configured ipu_options object.
    """
    profile_exec_modes = {"NO_PROFILE": ExecutionProfileType.NO_PROFILE,
                          "TILE_PROFILE": ExecutionProfileType.TILE_PROFILE,
                          "DEVICE_PROFILE": ExecutionProfileType.DEVICE_PROFILE,
                          "IPU_PROFILE": ExecutionProfileType.IPU_PROFILE}

    config = utils.create_ipu_config(max_cross_replica_sum_buffer_size=max_cross_replica_buffer_size,
                                     merge_infeed_io_copies=merge_infeed_io_copies,
                                     always_rearrange_copies_on_the_host=False,
                                     profiling=profile is not None,
                                     profile_execution=profile_exec_modes[profile] if profile else None)

    if "GCL_REAL_COLLECTIVES" in os.environ:
        config = utils.set_gcl_options(config, num_io_tiles=128, gcl_options={"useGclCollectives": "true", })

    if ipu_id == -1:
        config = utils.auto_select_ipus(config, number_of_replicas*shards)
    else:
        config = utils.select_ipus(config, [ipu_id])
    config = utils.set_compilation_options(config, {
        "device.clearAtomicFlagAfterExchange": "false",
        "prng.enable": "true" if prng else "false",
        "target.deterministicWorkers": "false" if seed is None else "true",
    })

    if availableMemoryProportion is not None:
        config = utils.set_convolution_options(config, {
            "availableMemoryProportion": str(availableMemoryProportion)
        })

    if stable_norm:
        config = utils.set_norm_options(config, use_stable_statistics=True)

    if xla_recompute:
        # Bug fix: the returned config was previously discarded, silently
        # dropping the recomputation option; assign it like every other
        # option-setting call in this function.
        config = utils.set_recomputation_options(config, allow_recompute=True)

    config = utils.set_floating_point_behaviour_options(config, inv=fp_exceptions, div0=fp_exceptions,
                                                        oflo=fp_exceptions, esr=prng, nanoo=True)

    return config
Exemplo n.º 18
0
def run_inference(batch_size: int,
                  image_dir: Path = Path(IMAGE_DIR),
                  loop: bool = False) -> None:
    """Run inference on pre-trained Densenet model.

    Args:
        batch_size: Batch size for inference
        image_dir: Path to dir of images
        loop: Flag to iterate through the images endlessly

    Raises:
        ValueError if `image_dir` does not contain test images.

    """

    image_filenames = glob.glob(image_dir.as_posix() + "/*.jpg")
    if not image_filenames:
        raise ValueError(
            ('Image directory: %s does not have images,'
             'please run `./get_images.sh` '
             'to download sample imagenet images' % image_dir.as_posix()))

    # Set compile and device options: a single auto-selected IPU, no profiling.
    opts = utils.create_ipu_config(profiling=False,
                                   use_poplar_text_report=False)
    utils.auto_select_ipus(opts, [1])
    utils.configure_ipu_system(opts)

    output_probs = construct_graph(batch_size)

    timings = collections.deque(maxlen=250)  # keep the most recent timings
    with tf.Session() as session:
        if loop:
            image_filenames = itertools.cycle(image_filenames)

        for img_file in image_filenames:
            classify_image(session, img_file, output_probs)
            timings.append(time.time())
            if len(timings) >= 2:
                # len(timings) timestamps span len(timings) - 1 intervals
                # from timings[0] to timings[-1].  The previous code divided
                # by the span starting at timings[1], overstating the rate.
                fps = (len(timings) - 1) / (timings[-1] - timings[0])
                print("Average images per second: {0:.1f}".format(fps))
Exemplo n.º 19
0
def _make_config(iterations_per_loop=1):
  """Build a RunConfig for a two-IPU pipelined estimator."""
  pipeline_ipus = 2

  opts = ipu_utils.create_ipu_config()
  opts = ipu_utils.set_ipu_model_options(
      opts, compile_ipu_code=True, tiles_per_ipu=128)
  opts = ipu_utils.auto_select_ipus(opts, num_ipus=pipeline_ipus)

  ipu_cfg = ipu_run_config.IPURunConfig(
      num_shards=pipeline_ipus,
      iterations_per_loop=iterations_per_loop,
      ipu_options=opts)
  return ipu_run_config.RunConfig(ipu_run_config=ipu_cfg)
Exemplo n.º 20
0
    def testReplicatedPrediction(self):
        """Prediction over 4 replicas yields one per-replica maximum each."""
        if ipu_utils.running_on_ipu_model():
            self.skipTest(
                "Replicated top level graphs are not supported on the "
                "IPU_MODEL target")

        def input_fn():
            # Two examples per replica, four replicas.
            samples = [
                [1.0],  # IPU0
                [3.0],  # IPU0
                [5.0],  # IPU1
                [3.0],  # IPU1
                [7.0],  # IPU2
                [3.0],  # IPU2
                [9.0],  # IPU3
                [3.0],  # IPU3
            ]
            ds = dataset_ops.Dataset.from_tensor_slices(samples)
            return ds.batch(batch_size=2, drop_remainder=True)

        logging_hook = ipu_session_run_hooks.IPULoggingTensorHook(
            every_n_iter=1, replication_factor=4)

        def model_fn(features, mode):
            # Force the logging op to run before the prediction is produced.
            log_op = logging_hook.log({"features": features})
            with ops.control_dependencies([log_op]):
                predictions = math_ops.reduce_max(features)
            return model_fn_lib.EstimatorSpec(mode, predictions=predictions)

        opts = ipu_utils.create_ipu_config()
        opts = ipu_utils.auto_select_ipus(opts, num_ipus=4)
        run_cfg = ipu_run_config.RunConfig(
            ipu_run_config=ipu_run_config.IPURunConfig(
                iterations_per_loop=1, num_replicas=4, ipu_options=opts))
        estimator = ipu_estimator.IPUEstimator(model_fn=model_fn,
                                               config=run_cfg)

        # Single-example mode yields the replica maxima one at a time.
        outputs = estimator.predict(input_fn=input_fn,
                                    yield_single_examples=True)
        self.assertEqual(3.0, next(outputs))
        self.assertEqual(5.0, next(outputs))

        # Batched mode returns all four replica results at once.
        outputs = estimator.predict(input_fn=input_fn,
                                    yield_single_examples=False,
                                    hooks=[logging_hook])
        np.testing.assert_array_equal([3.0, 5.0, 7.0, 9.0], next(outputs))
Exemplo n.º 21
0
def run_language_model(opts):
    """Configure the IPU system from `opts`, then run training and/or testing
    of the dynamic-sparse transformer language model."""
    if opts.random_seed is not None:
        utils.reset_ipu_seed(opts.random_seed)

    # Setup and acquire an IPU device:
    logging.info("Acquiring devices")
    if not opts.pipeline:
        opts.num_shards = 1  # FIX-ME enable sparse models using multiple shards

    # Round the shard count up to the next power of two: device attachment
    # only succeeds when a power-of-two number of IPUs is requested.
    num_ipus = 1
    while num_ipus < opts.num_shards:
        num_ipus *= 2
    logger.info(f"Need {opts.num_shards} IPUs, requesting {num_ipus}")

    config = utils.create_ipu_config()

    if opts.compile_only:
        if opts.compile_only_ipu_version is None:
            raise AttributeError(
                "Must provide --compile-only-ipu-version if --compile-only is set."
            )
        # Compile the executable without attaching to a physical device.
        config = utils.set_ipu_connection_type(
            config,
            utils.DeviceConnectionType.NEVER,
            ipu_version=opts.compile_only_ipu_version,
            enable_remote_buffers=True)

    config = utils.auto_select_ipus(config, num_ipus)
    config = utils.set_recomputation_options(config,
                                             allow_recompute=opts.recompute)
    # Enable stochastic rounding
    config = utils.set_floating_point_behaviour_options(
        config, inv=False, div0=False, oflo=False, esr=True, nanoo=False)
    config = sparse.set_system_config(
        config, custom_op_debug_printing=opts.debug_dense_grad)
    utils.configure_ipu_system(config)

    transformer = DynsparseTransformer(opts)
    if opts.mode in ("all", "train"):
        run_training(opts, transformer)

    if opts.mode in ("all", "test"):
        run_testing(opts, transformer)
Exemplo n.º 22
0
def train():
    """Compile ``model2`` into a single-IPU loop, run it and print the
    dequeued outfeed.  Debug-style harness: the step loop below exits after
    the first iteration via a trailing ``break``."""
    graph = tf.Graph()
    with graph.as_default():
        # NOTE(review): this locally built `dataset` pipeline is never used —
        # the infeed queue below is constructed from get_data_set() instead.
        # Confirm whether these four lines can be deleted.
        dataset = tf.data.Dataset.from_tensors(tf.constant(1, shape=[]))
        #         dataset = tf.data.Dataset.from_tensors(np.array([1,2,3,4,5,6,7,8,9,0]))
        dataset = dataset.map(lambda x: [x, x])
        dataset = dataset.batch(BS, drop_remainder=True)
        dataset = dataset.repeat()
        # Host-side queues that stream data to and from the device.
        infeed_queue = ipu_infeed_queue.IPUInfeedQueue(get_data_set(),
                                                       feed_name="infeed")
        outfeed_queue = ipu_outfeed_queue.IPUOutfeedQueue(feed_name='outfeed')
        # Number of time steps is supplied at session-run time.
        time_steps_ph = tf.placeholder(tf.int32, shape=[])
        with ipu_scope('/device:IPU:0'):

            def compile_fn():
                def body(x, y):
                    # Commented-out alternative path through model1:
                    #                     z1, z2 = model1(x, y, time_steps_ph)
                    #                     outfeed = outfeed_queue.enqueue({'z1':z1, 'z2':z2})
                    z3 = model2(time_steps_ph)
                    outfeed = outfeed_queue.enqueue({'z3': z3})
                    return outfeed

                # Run `body` once per compiled call, pulling from the infeed.
                return loops.repeat(1, body, [], infeed_queue)

        # Variable initialization must run on the host, not the IPU.
        utils.move_variable_initialization_to_cpu()
        init = tf.global_variables_initializer()
        outputs = ipu_compiler.compile(compile_fn, [])

        # The dequeue op must be created after the enqueue has been compiled.
        dequeue_outfeed = outfeed_queue.dequeue()
    # Device configuration: one auto-selected IPU, profiling disabled,
    # 10 MB buffers for cross-replica sums and inter-IPU copies.
    ipu_options = utils.create_ipu_config(
        profiling=False,
        profile_execution=False,
        max_cross_replica_sum_buffer_size=10000000,
        max_inter_ipu_copies_buffer_size=10000000)
    ipu_options = utils.auto_select_ipus(ipu_options, 1)
    utils.configure_ipu_system(ipu_options)
    utils.reset_ipu_seed(SEED)

    sess = tf.Session(graph=graph)
    sess.run(init)
    sess.run(infeed_queue.initializer)

    steps = 6
    i = 0
    while i < steps:
        sess.run(outputs, feed_dict={time_steps_ph: 3})
        result = sess.run(dequeue_outfeed)
        print(result)
        i = i + 1
        # NOTE(review): this break makes the loop run a single step despite
        # steps=6 — presumably a debugging leftover; confirm intent.
        break
Exemplo n.º 23
0
def get_config(opts, training=True, profiling=False):
    """Build the IPU configuration for a training or validation process."""
    config = utils.create_ipu_config(profiling=profiling)

    requested = opts.select_ipus
    if requested[0] == -1:
        # Auto-select devices: one IPU each for training and validation.
        train_ipus = 1  # opts.shards
        valid_ipus = 1  # This might want an option to control
        if opts.multiprocessing:
            count = train_ipus if training else valid_ipus
            config = utils.auto_select_ipus(config, [count])
        else:
            config = utils.auto_select_ipus(config, [train_ipus, valid_ipus])
    else:
        # Explicit device ids were supplied.
        if opts.multiprocessing:
            requested = [requested[0] if training else requested[1]]
        config = utils.select_ipus(config, requested)

    config = utils.set_compilation_options(
        config, {"prng.enable": "true" if opts.prng else "false"})

    return config
Exemplo n.º 24
0
def generate_report(batch_size: int,
                    report_dest: str = "./densenet_report.txt") -> None:
    """Generate report from running model on IPU

    Args:
        batch_size: Batch size for inference
        report_dest: Location to save generated text report

    """
    # Force the IPU simulator and enable text-format profiling.
    os.environ['TF_POPLAR_FORCE_IPU_MODEL'] = "1"
    ipu_opts = utils.create_ipu_config(profiling=True, use_poplar_text_report=True)
    utils.auto_select_ipus(ipu_opts, [1])
    utils.configure_ipu_system(ipu_opts)

    output_probs = construct_graph(batch_size)

    # The event-trace op must live on the host CPU.
    with tf.device('cpu'):
        trace = gen_ipu_ops.ipu_event_trace()

    dummy_batch = np.zeros((batch_size, IMG_HEIGHT, IMG_WIDTH, IMG_CHANNELS),
                           dtype=np.float16)
    run_opts = tf.RunOptions(report_tensor_allocations_upon_oom=True)
    with tf.Session() as session:
        # One forward pass on zeros is enough to populate the trace.
        session.run(output_probs,
                    feed_dict={"optimized/image_input:0": dummy_batch},
                    options=run_opts)
        events = session.run(trace)

    # extract the report
    report_text = utils.extract_all_strings_from_event_trace(events)
    logging.info("Writing densenet profiling report to %s" % report_dest)
    with open(report_dest, "w") as f:
        f.write(report_text)
Exemplo n.º 25
0
def get_config(opts):
    """Build IPU options, with per-execution profiling when requested."""
    profiling_on = opts.cycle_report

    config = utils.create_ipu_config(profiling=profiling_on,
                                     profile_execution=profiling_on,
                                     report_every_nth_execution=1)

    if opts.device_id == -1:
        # Auto-select as many IPUs as the sharding requires.
        config = utils.auto_select_ipus(config, [opts.shards or 1])
    else:
        config = utils.select_ipus(config, [opts.device_id])

    if opts.convolution_options:
        conv_opts = json.loads(opts.convolution_options)
        config = utils.set_convolution_options(config, conv_opts)

    return config
Exemplo n.º 26
0
def get_config(fp_exceptions,
               xla_recompute,
               disable_graph_outlining,
               num_required_ipus,
               enable_stochastic_rounding,
               max_cross_replica_sum_buffer_size,
               scheduler_selection,
               compile_only,
               ipu_id):
    """Build the IPU configuration for this model.

    Args:
        fp_exceptions: Trap floating-point exceptions (inv/div0/oflo/nanoo).
        xla_recompute: Allow recomputation to reduce memory use.
        disable_graph_outlining: Disable graph outlining.
        num_required_ipus: IPU count when auto-selecting.
        enable_stochastic_rounding: Enable stochastic rounding (esr).
        scheduler_selection: Poplar scheduler choice.
        compile_only: Compile without acquiring a device.
        ipu_id: Explicit device id; falsy selects automatically.
            NOTE(review): `if ipu_id:` treats device id 0 the same as
            None/unset — confirm id 0 is never passed explicitly.

    Returns:
        The populated IPU configuration object.
    """
    config = utils.create_ipu_config(
        merge_infeed_io_copies=True,
        always_rearrange_copies_on_the_host=False,
        disable_graph_outlining=disable_graph_outlining,
        selection_order=utils.SelectionOrder.AUTO,
        scheduler_selection=scheduler_selection)

    if ipu_id:
        config = utils.select_ipus(config, [ipu_id])
    else:
        config = utils.auto_select_ipus(config, num_required_ipus)

    config = utils.set_recomputation_options(config,
                                             allow_recompute=xla_recompute)
    # simple way to skip the big `Transpose` operation due to bad allocation
    # config = utils.set_matmul_options(config, clear_pass_type=True)
    config = utils.set_norm_options(config, use_stable_statistics=True)
    config = utils.set_floating_point_behaviour_options(
        config,
        inv=fp_exceptions,
        div0=fp_exceptions,
        oflo=fp_exceptions,
        esr=enable_stochastic_rounding,
        nanoo=fp_exceptions)
    config = utils.set_optimization_options(
        config,
        merge_remote_buffers=True,
        max_cross_replica_sum_buffer_size=max_cross_replica_sum_buffer_size)

    # Do not acquire a device, compile only.
    if compile_only:
        config = utils.set_ipu_connection_type(
            config,
            utils.DeviceConnectionType.NEVER,
            ipu_version=2,
            enable_remote_buffers=True)

    return config
Exemplo n.º 27
0
def main():
    """Parse the requested connection type, build a small IPU graph and run it.

    When the device is configured with DeviceConnectionType.NEVER the
    session run is expected to fail with a "configured for compilation
    only" error, which is reported as a successful compile.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--connection_type",
                        choices=['ALWAYS', 'ON_DEMAND', 'NEVER'],
                        help="Specify connection type")
    parser.set_defaults(connection_type='ALWAYS')
    opts = parser.parse_args()

    # Placeholders live on the host CPU.
    with tf.device("cpu"):
        pa = tf.compat.v1.placeholder(np.float32, [2], name="a")
        pb = tf.compat.v1.placeholder(np.float32, [2], name="b")
        pc = tf.compat.v1.placeholder(np.float32, [2], name="c")

    # Create the IPU section of the graph.
    with scopes.ipu_scope("/device:IPU:0"):
        out = ipu_compiler.compile(my_graph, [pa, pb, pc])

    # Define the feed_dict input data.
    fd = {pa: [1., 1.], pb: [0., 1.], pc: [1., 5.]}

    # Connection type from options.
    connection_type = device_connection_type(opts.connection_type)

    cfg = utils.create_ipu_config()
    cfg = utils.auto_select_ipus(cfg, 1)
    cfg = utils.set_ipu_connection_type(cfg,
                                        connection_type,
                                        1)
    utils.configure_ipu_system(cfg)

    # Run the session.
    # If running with DeviceConnectionType.NEVER then anticipate the
    # specific exception with message "configured for compilation only".
    with tf.compat.v1.Session() as sess:
        try:
            result = sess.run(out, fd)
            print(result)
        except tf.errors.InvalidArgumentError as invalid_arg_exception:
            if (connection_type == utils.DeviceConnectionType.NEVER) and \
               ("configured for compilation only" in invalid_arg_exception.message):
                print("Compiled")
            else:
                print("ERROR: {}".format(invalid_arg_exception.message))
        except Exception as general_exception:
            # Bug fix: previously a bare `except:` that also swallowed
            # SystemExit/KeyboardInterrupt and printed only the exception
            # class via sys.exc_info()[0]; report the exception itself.
            print("ERROR: {}".format(general_exception))
Exemplo n.º 28
0
  def testDuplicateInputsOutputs(self):
    """A pipeline whose stages duplicate inputs/outputs runs correctly."""
    outfeed_queue = ipu_outfeed_queue.IPUOutfeedQueue("__feed9")

    def stage1(x, y):
      return x, y, y, x

    # The above should be optimised to a single copy for each duplicate output.
    def stage2(x1, y1, y2, x2):
      return x1, y1, y2, x2

    # Same for this stage
    def stage3(_x1, _y1, y2, x2):
      return x2, y2

    def model_pipeline(x, y):
      return pipelining_ops.pipeline(
          [stage1, stage2, stage3],
          12,
          inputs=[x, y],
          outfeed_queue=outfeed_queue,
          pipeline_schedule=pipelining_ops.PipelineSchedule.Sequential)

    with ops.device('cpu'):
      px = array_ops.placeholder(np.float32, shape=[1, 4, 4, 2])
      py = array_ops.placeholder(np.float32, shape=[1, 2])

    with ops.device("/device:IPU:0"):
      compiled = ipu_compiler.compile(model_pipeline, inputs=[px, py])

    cfg = utils.create_ipu_config(profiling=True, profile_execution=True)
    cfg = utils.auto_select_ipus(cfg, 4)
    utils.configure_ipu_system(cfg)
    utils.move_variable_initialization_to_cpu()

    #TODO(T10784) test how many IPU copies are here once we insert IPU copies.
    outfeed_op = outfeed_queue.dequeue()
    with tu.ipu_session() as sess:
      sess.run(compiled, {px: np.ones(px.shape), py: np.ones(py.shape)})
      results = sess.run(outfeed_op)
      # All 12 pipeline iterations should echo the all-ones inputs back.
      for step in range(12):
        self.assertAllClose(results[0][step], np.ones(px.shape))
        self.assertAllClose(results[1][step], np.ones(py.shape))
Exemplo n.º 29
0
def get_config(prng=False,
               ipu_id=-1,
               shards=1,
               number_of_replicas=1,
               max_cross_replica_buffer_size=10 * 1024 * 1024,
               merge_infeed_io_copies=True,
               fp_exceptions=True,
               xla_recompute=False,
               seed=None,
               profile=False,
               availableMemoryProportion=None):
    """Builds ipu_options.

    Args:
        prng: Enable the PRNG instructions / stochastic rounding (esr).
        ipu_id: Explicit IPU device id, or -1 to auto-select.
        shards: Number of shards per replica.
        number_of_replicas: Data-parallel replica count.
        max_cross_replica_buffer_size: Merge threshold (bytes) for
            cross-replica sums.
        merge_infeed_io_copies: Merge infeed host copies.
        fp_exceptions: Trap floating-point exceptions (inv/div0/oflo).
        seed: When set, request deterministic workers.
        profile: Enable compilation and execution profiling.
        availableMemoryProportion: Optional convolution memory proportion.

    Returns:
        The populated IPU configuration object.
    """
    config = utils.create_ipu_config(
        max_cross_replica_sum_buffer_size=max_cross_replica_buffer_size,
        merge_infeed_io_copies=merge_infeed_io_copies,
        always_rearrange_copies_on_the_host=False,
        profiling=profile,
        profile_execution=profile)
    if ipu_id == -1:
        config = utils.auto_select_ipus(config, number_of_replicas * shards)
    else:
        config = utils.select_ipus(config, [ipu_id])
    config = utils.set_compilation_options(
        config, {
            "device.clearAtomicFlagAfterExchange": "false",
            "prng.enable": "true" if prng else "false",
            "target.deterministicWorkers": "false" if seed is None else "true",
        })

    if availableMemoryProportion is not None:
        config = utils.set_convolution_options(
            config,
            {"availableMemoryProportion": str(availableMemoryProportion)})

    if xla_recompute:
        # Bug fix: the returned config was previously discarded here,
        # unlike every other utils.set_* call in this function.
        config = utils.set_recomputation_options(config, allow_recompute=True)

    config = utils.set_floating_point_behaviour_options(config,
                                                        inv=fp_exceptions,
                                                        div0=fp_exceptions,
                                                        oflo=fp_exceptions,
                                                        esr=prng,
                                                        nanoo=True)

    return config
Exemplo n.º 30
0
  def testSyntheticDataWithOutfeeds(self):
    """With synthetic data enabled, outfeed dequeues return empty results."""
    flags = os.environ.get("TF_POPLAR_FLAGS", "")
    flags += " --use_ipu_model"
    flags += " --use_synthetic_data"
    flags += " --synthetic_data_initializer=random"

    with test.mock.patch.dict("os.environ", {"TF_POPLAR_FLAGS": flags}):

      # The device side main
      def body(x1, x2):
        outfeed = outfeed_queue.enqueue({'d1': x1 + x2, 'd2': x1 - x2})
        return outfeed

      def my_net():
        return loops.repeat(5, body, [], infeed_queue)

      with ops.device('cpu'):
        # The dataset for feeding the graphs
        ds = tf.data.Dataset.from_tensors(tf.constant(1.0, shape=[10]))
        ds = ds.map(lambda x: [x, x])
        ds = ds.repeat()

        # The host side queues
        infeed_queue = ipu_infeed_queue.IPUInfeedQueue(ds, feed_name="infeed2")
        outfeed_queue = ipu_outfeed_queue.IPUOutfeedQueue(feed_name="outfeed2")

      with scopes.ipu_scope('/device:IPU:0'):
        run_loop = ipu_compiler.compile(my_net, inputs=[])

      # The outfeed dequeue has to happen after the outfeed enqueue
      dequeue_outfeed = outfeed_queue.dequeue()

      # Configure the hardware
      cfg = utils.create_ipu_config()
      cfg = utils.auto_select_ipus(cfg, 1)
      utils.configure_ipu_system(cfg)

      with tf.Session() as sess:
        sess.run(infeed_queue.initializer)
        sess.run(run_loop)
        outfed = sess.run(dequeue_outfeed)
        # Synthetic data never crosses the host boundary, so nothing arrives.
        self.assertAllEqual(len(outfed['d1']), 0)