def testBatchNormalizeFused(self):
  """Runs fused batch norm with `is_training=False` on an all-zero input.

  Asserts that the normalised output is all zeros and that the compute sets
  extracted from the event trace pass
  `tu.check_compute_sets_not_in_blacklist` for the cast pattern below.
  """
  in_shape = [4, 64, 64, 4]
  x = array_ops.placeholder(np.float32, in_shape, name="a")

  with ops.device("/device:IPU:0"):
    with variable_scope.variable_scope("", use_resource=True):
      # Offset initialised to 0 and scale initialised to 1.
      beta = variable_scope.get_variable(
          "x",
          dtype=np.float32,
          shape=[4],
          initializer=init_ops.constant_initializer(0.0))
      gamma = variable_scope.get_variable(
          "y",
          dtype=np.float32,
          shape=[4],
          initializer=init_ops.constant_initializer(1.0))

      batch_mean, batch_var = nn.moments(x, [0, 1, 2], name='moments')
      normed = nn.fused_batch_norm(
          x, gamma, beta, batch_mean, batch_var, is_training=False)

  with ops.device('cpu'):
    report = gen_ipu_ops.ipu_event_trace()

  tu.configure_ipu_system()

  with tu.ipu_session() as sess:
    # Fetch the trace once up front; the result is discarded (presumably this
    # clears events recorded so far - confirm against the trace op).
    sess.run(report)
    sess.run(variables.global_variables_initializer())

    normed_value, _, _ = sess.run(normed, {x: np.zeros(in_shape)})
    self.assertAllClose(normed_value, np.zeros(in_shape))

    trace = sess.run(report)
    trace_strings = tu.extract_all_strings_from_event_trace(trace)
    compute_sets = tu.get_compute_sets_from_report(trace_strings)

    blacklist = ['*convert*/Cast*']
    self.assertTrue(
        tu.check_compute_sets_not_in_blacklist(compute_sets, blacklist))
def testBatchNormalizeInference(self):
  """Builds two conv + fused batch-norm pairs and checks the compute sets.

  The compute sets from the event trace are checked against a whitelist that
  names a single convolution and a single batch-norm entry.
  """
  with ops.device("/device:IPU:0"):
    x = array_ops.placeholder(np.float32, shape=[1, 4, 4, 2])

    with variable_scope.variable_scope("vs", use_resource=True):
      y = x
      # Two identical conv2d -> batch_normalization stages, back to back.
      for _ in range(2):
        y = convolutional.conv2d(
            y,
            2,
            1,
            use_bias=False,
            kernel_initializer=init_ops.ones_initializer())
        y = layers_norm.batch_normalization(y, fused=True)

  with ops.device('cpu'):
    report = gen_ipu_ops.ipu_event_trace()

  tu.configure_ipu_system(True, True, True)

  with tu.ipu_session() as sess:
    sess.run(variables.global_variables_initializer())
    sess.run(report)  # Result discarded; the trace is read again below.

    sess.run(y, {x: np.zeros([1, 4, 4, 2])})

    trace = sess.run(report)
    trace_strings = tu.extract_all_strings_from_event_trace(trace)
    compute_sets = tu.get_compute_sets_from_report(trace_strings)

    # Would fail if there were two batch norms in the graph
    whitelist = [
        '__seed*',
        'host-exchange-local-copy',
        'Copy_',
        'vs/conv2d/Conv2D/convolution.*/Conv_1x1/Convolve',
        'vs/batch_normalization/FusedBatchNorm/batch-norm-inference.*/',
    ]
    self.assertTrue(
        tu.check_all_compute_sets_and_list(compute_sets, whitelist))
def testEngineCompilationOptions(self):
  """Checks that running with the configured engine option raises.

  The system is configured with `engine_opts={"some_option": "some_value"}`
  and running a simple addition is expected to raise
  `errors.InvalidArgumentError`.
  """
  with ops.device("/device:IPU:0"):
    pa = array_ops.placeholder(np.float32, [480], name="a")
    pb = array_ops.placeholder(np.float32, [480], name="b")
    output = pa + pb

  tu.configure_ipu_system(
      True, True, True, engine_opts={"some_option": "some_value"})

  # assertRaises fails the test when nothing is raised, which replaces the
  # manual try / assertTrue(False) / except-pass pattern and reports a
  # meaningful message instead of "False is not true".
  with self.assertRaises(errors.InvalidArgumentError):
    with session_lib.Session() as sess:
      fd = {pa: np.zeros([480]), pb: np.zeros([480])}
      sess.run(output, fd)
def testIpuModelDeviceWithNoReport(self):
  """Checks that the event trace is empty when configured with all-False.

  Adds two 2x2 inputs on the IPU device with
  `tu.configure_ipu_system(False, False, False)` and asserts both the sum
  and that the fetched trace has length zero.
  """
  with ops.device("/device:IPU:0"):
    pa = array_ops.placeholder(np.float32, [2, 2], name="a")
    pb = array_ops.placeholder(np.float32, [2, 2], name="b")
    output = pa + pb

  with ops.device('cpu'):
    # The trace op carries a control dependency on `output`, which is why
    # fetching it below needs the feed dict.
    with ops.control_dependencies([output]):
      report = gen_ipu_ops.ipu_event_trace()

  tu.configure_ipu_system(False, False, False)

  with session_lib.Session() as sess:
    fd = {pa: [[1., 1.], [2., 3.]], pb: [[0., 1.], [4., 5.]]}
    sess.run(report, fd)

    result, rep = sess.run([output, report], fd)
    self.assertAllClose(result, [[1., 2.], [6., 8.]])
    # assertEqual reports the actual length on failure, unlike
    # assertTrue(len(rep) == 0).
    self.assertEqual(len(rep), 0)
def testConv8x8_WithBias(self):
  """Runs an 8x8 stride-4 VALID conv2d plus bias add per data format.

  For every entry of `self.data_formats` the all-zero result is checked and
  the traced compute sets are compared with the whitelist below. Shapes are
  passed through `self._ip_shp(shape, fmt)`.
  """
  for fmt in self.data_formats:
    with ops.device("/device:IPU:0"):
      inp = array_ops.placeholder(
          np.float32, self._ip_shp([1, 84, 84, 4], fmt), name="inp")
      wei = array_ops.placeholder(np.float32, [8, 8, 4, 16], name="wei")
      bia = array_ops.placeholder(np.float32, [16], name="bia")

      conv = nn_ops.conv2d(
          inp,
          wei,
          strides=self._ip_shp([1, 4, 4, 1], fmt),
          padding="VALID",
          data_format=fmt,
          name='cnv4')
      output = nn_ops.bias_add(conv, bia, data_format=fmt, name='ba4')

    with ops.device('cpu'):
      report = gen_ipu_ops.ipu_event_trace()

    tu.configure_ipu_system()

    with tu.ipu_session() as sess:
      sess.run(report)  # Result discarded; the trace is read again below.

      feed = {
          inp: np.zeros(self._ip_shp([1, 84, 84, 4], fmt)),
          wei: np.zeros([8, 8, 4, 16]),
          bia: np.zeros([16]),
      }
      conv_out = sess.run(output, feed)
      self.assertAllClose(conv_out,
                          np.zeros(self._ip_shp([1, 20, 20, 16], fmt)))

      trace = sess.run(report)
      trace_strings = tu.extract_all_strings_from_event_trace(trace)
      compute_sets = tu.get_compute_sets_from_report(trace_strings)

      whitelist = [
          '__seed*',
          'host-exchange-local-copy-',
          'Copy_XLA_Args/arg2.*_weights_to_cnv4*/convolution.*/Conv_8x8_stride4x4/weightsRearranged',
          'cnv4*/convolution.*/Conv_8x8_stride4x4',
          'ba4*/fusion/addToChannel',
      ]
      self.assertTrue(
          tu.check_all_compute_sets_and_list(compute_sets, whitelist))
def testConvolutionsDontMatchDifferentTypes(self):
  """Builds conv -> cast to float16 -> conv and checks the compute sets.

  The whitelist contains separate entries for `conv2d` and `conv2d_1`, i.e.
  both convolutions are expected to appear in the trace.
  """
  def ones_conv():
    # 1x1 convolution with 2 filters, no bias, kernel initialised to ones.
    return layers.Conv2D(
        2,
        1,
        use_bias=False,
        kernel_initializer=init_ops.ones_initializer())

  with ops.device("/device:IPU:0"):
    x = array_ops.placeholder(np.float32, shape=[1, 4, 4, 2])

    with variable_scope.variable_scope("vs", use_resource=True):
      first = ones_conv()(x)
      as_half = math_ops.cast(first, np.float16)
      y = ones_conv()(as_half)

  with ops.device('cpu'):
    report = gen_ipu_ops.ipu_event_trace()

  tu.configure_ipu_system(True, True, True)

  with tu.ipu_session() as sess:
    sess.run(variables.global_variables_initializer())
    sess.run(report)  # Result discarded; the trace is read again below.

    sess.run(y, {x: np.zeros([1, 4, 4, 2])})

    trace = sess.run(report)
    trace_strings = tu.extract_all_strings_from_event_trace(trace)
    compute_sets = tu.get_compute_sets_from_report(trace_strings)

    # Matches two convolutions
    whitelist = [
        '__seed*',
        'Copy_*weightsRearranged',
        'host-exchange-local-copy-',
        'Copy_vs/conv2d_1/Conv2D/convolution.7/Conv_1x1/cast_',
        'vs/conv2d/Conv2D/convolution.*/Conv_1x1',
        'vs/Cast/convert.*/Cast',
        'vs/conv2d_1/Conv2D/convolution.*/Conv_1x1',
    ]
    self.assertTrue(
        tu.check_all_compute_sets_and_list(compute_sets, whitelist))
def testBatchNormalizeLayerFusedTrainingFp16(self):
  """Trains through a float16 fused batch norm built with `training=False`.

  This test checks for the correct behaviour in batch norm grad when
  performing training, but the batch norm attribute `training` is False.
  """
  in_shape = [4, 64, 64, 4]

  with ops.device("/device:IPU:0"):
    with variable_scope.variable_scope("", use_resource=True):
      x = array_ops.placeholder(np.float16, in_shape, name="a")
      normed = layers_norm.batch_normalization(x, fused=True, training=False)

    loss = math_ops.reduce_sum(normed)
    optimizer = gradient_descent.GradientDescentOptimizer(0.1)
    train = optimizer.minimize(loss)

  tu.configure_ipu_system()

  with tu.ipu_session() as sess:
    sess.run(variables.global_variables_initializer())

    # Fetch the forward output together with the training op.
    normed_value, _ = sess.run([normed, train], {x: np.zeros(in_shape)})
    self.assertAllClose(normed_value, np.zeros(in_shape))
def testBiasApplyVariableLR(self):
  """Runs one SGD step with a fed learning rate and checks the conv bias.

  A float16 1x1 convolution with bias (kernel and bias initialised to ones)
  followed by ReLU is trained for a single step with the learning rate
  supplied through a placeholder. The variable named "vs/a/bias:0" must
  then be close to [-0.6, -0.6].
  """
  # Named `input_data` rather than `input` so the builtin is not shadowed.
  input_data = np.ones((1, 4, 4, 2))

  with ops.device("/device:IPU:0"):
    x = array_ops.placeholder(np.float16, shape=[1, 4, 4, 2])
    lr = array_ops.placeholder(np.float16, shape=[])

    with variable_scope.variable_scope("vs", use_resource=True):
      y = layers.Conv2D(
          2,
          1,
          use_bias=True,
          kernel_initializer=init_ops.ones_initializer(),
          bias_initializer=init_ops.ones_initializer(),
          name="a")(x)
      y = nn.relu(y)

    loss = math_ops.reduce_sum(y)
    optimizer = gradient_descent.GradientDescentOptimizer(lr)
    train = optimizer.minimize(loss)

  with ops.device('cpu'):
    report = gen_ipu_ops.ipu_event_trace()

  tu.configure_ipu_system()

  with tu.ipu_session() as sess:
    sess.run(variables.global_variables_initializer())
    sess.run(report)

    fe = {
        x: input_data,
        lr: 0.1,
    }
    # The fetched loss value is not inspected; only the side effect of the
    # training op on the variables matters here.
    sess.run((loss, train), fe)

    tvars = variables.global_variables()
    tvars_vals = sess.run(tvars)

    found = False
    for var, val in zip(tvars, tvars_vals):
      if var.name == "vs/a/bias:0":
        # Value computed using the CPU backend
        self.assertAllClose(val, [-0.6, -0.6], atol=0.001)
        found = True
    # Guard against the check above being skipped because the variable name
    # did not match anything.
    self.assertTrue(found)
def tesInplaceAddCopyWithInplacePeer2(self):
  """Computes (pa + pb * pc) / (transpose(pa) * pb + pc) and checks the trace.

  The result is compared with the same expression evaluated in NumPy, the
  trace must hold three events, and the compute sets are checked against
  the whitelist below.

  NOTE(review): the method name starts with `tes`, not `test`, so standard
  unittest discovery will presumably skip it - confirm whether that is
  intentional before renaming.
  """
  data_a = np.array([[10, -10], [-5, 5]])
  data_b = np.array([[-15, 15], [25, -25]])
  data_c = 2

  with ops.device("/device:IPU:0"):
    pa = array_ops.placeholder(np.float32, [2, 2])
    pb = array_ops.placeholder(np.float32, [2, 2])
    pc = array_ops.placeholder(np.float32, [])
    a = array_ops.transpose(pa)
    b = pa + pb * pc
    c = a * pb + pc
    d = b / c

  with ops.device('cpu'):
    report = gen_ipu_ops.ipu_event_trace()

  tu.configure_ipu_system()

  with tu.ipu_session() as sess:
    sess.run(report)
    fd = {
        pa: data_a,
        pb: data_b,
        pc: data_c,
    }
    np_result = (data_a + data_b * data_c) / (
        np.transpose(data_a) * data_b + data_c)
    result = sess.run(d, fd)
    self.assertAllClose(result, np_result)

    result = sess.run(report)
    self.assertEqual(len(result), 3)  # compile_begin, compile_end, execute

    s = tu.extract_all_strings_from_event_trace(result)
    cs_list = tu.get_compute_sets_from_report(s)

    # A comma was missing after the first 'Copy_XLA_Args...' entry, so Python
    # silently concatenated it with 'mul/multiply.*/Op/Multiply' into a single
    # pattern. They are now two separate entries.
    ok = [
        '__seed*',
        'Copy_XLA_Args/arg0.*_to_transpose/transpose',
        'mul/multiply.*/Op/Multiply',
        'add/add.*/AddTo',
        'mul_1/multiply.*/Op/Multiply',
        'add_1/add.*/AddTo',
        'truediv/divide.*/Op/Divide',
    ]
    self.assertTrue(tu.check_all_compute_sets_and_list(cs_list, ok))
def testConvolutionsDontMatchDifferentDevices(self):
  """Places two identical convolutions in different IPU shards.

  The first convolution is built under `tu.ipu_shard(0)` and the second
  under `tu.ipu_shard(1)`; the whitelist lists both `conv2d` and `conv2d_1`.
  """
  def ones_conv():
    # 1x1 convolution with 2 filters, no bias, kernel initialised to ones.
    return layers.Conv2D(
        2,
        1,
        use_bias=False,
        kernel_initializer=init_ops.ones_initializer())

  with ops.device("/device:IPU:0"):
    x = array_ops.placeholder(np.float32, shape=[1, 4, 4, 2])

    with variable_scope.variable_scope("vs", use_resource=True):
      with tu.ipu_shard(0):
        y = ones_conv()(x)

      with tu.ipu_shard(1):
        y = ones_conv()(y)

  with ops.device('cpu'):
    report = gen_ipu_ops.ipu_event_trace()

  tu.configure_ipu_system(True, True, True, sharded=True)

  with tu.ipu_session() as sess:
    sess.run(variables.global_variables_initializer())
    sess.run(report)  # Result discarded; the trace is read again below.

    sess.run(y, {x: np.zeros([1, 4, 4, 2])})

    trace = sess.run(report)
    trace_strings = tu.extract_all_strings_from_event_trace(trace)
    compute_sets = tu.get_compute_sets_from_report(trace_strings)

    # Note how there are two convolutions
    whitelist = [
        '__seed*',
        '*OnTileCopy*',
        'vs/conv2d/Conv2D/convolution.*',
        'Copy_vs/conv2d/Conv2D/convolution.*',
        'vs/conv2d_1/Conv2D/convolution.*',
    ]
    self.assertTrue(
        tu.check_all_compute_sets_and_list(compute_sets, whitelist))
def executeModel(inputs, expected):
  """Runs the one-hot op described by `inputs` on the CPU or on the IPU.

  NOTE(review): `self` is not a parameter; it is presumably captured from an
  enclosing test method that is outside this view.

  Args:
    inputs: Mapping read for the keys "on", "off", "n_classes", "axis",
      "shape" and "in_values". The "on" entry must expose `.dtype`, which is
      used as the output dtype.
    expected: Reference result, or None. When None the op is placed on the
      CPU and its result is returned; otherwise it is placed on the IPU and
      compared against `expected`.

  Returns:
    The computed result when `expected` is None, otherwise None.
  """
  # The output type follows the type of the "on" value.
  out_dtype = inputs["on"].dtype

  # First call (no reference yet) runs on the CPU to produce the reference;
  # later calls run on the IPU and are compared against it.
  run_on_cpu = expected is None
  if run_on_cpu:
    target_device = "cpu:0"
  else:
    target_device = "/device:IPU:0"

  with ops.device('cpu'):
    pa = array_ops.placeholder(np.int32, inputs["shape"], name="a")
    report = gen_ipu_ops.ipu_event_trace()

  with ops.device(target_device):
    out = array_ops.one_hot(
        pa,
        inputs["n_classes"],
        dtype=out_dtype,
        on_value=inputs["on"],
        off_value=inputs["off"],
        axis=inputs["axis"])

  tu.configure_ipu_system()

  with tu.ipu_session() as sess:
    sess.run(report)

    feed = {pa: np.array(inputs["in_values"])}
    result = sess.run(out, feed)

    if run_on_cpu:
      return result

    self.assertAllClose(result, expected)
def testAvgPoolValidWithBroadcast(self):
  """Runs a 5x5, stride-2 VALID average pool on seeded random data.

  The result is compared with precomputed values, the trace must contain
  four events, and the compute sets are checked against the whitelist.
  """
  np.random.seed(0)
  shape = [1, 10, 10, 1]
  data = np.random.uniform(0, 1, shape)

  # The expected answer was generated using TF on the cpu
  expected = [[
      [[0.52647954], [0.44196457], [0.49284577]],
      [[0.44039682], [0.44067329], [0.44934618]],
      [[0.46444583], [0.45419583], [0.38236427]],
  ]]

  with ops.device("/device:IPU:0"):
    pa = array_ops.placeholder(np.float32, shape, name="a")
    output = nn.avg_pool(
        pa,
        ksize=[1, 5, 5, 1],
        strides=[1, 2, 2, 1],
        data_format='NHWC',
        padding='VALID',
        name="avg")

  with ops.device('cpu'):
    report = gen_ipu_ops.ipu_event_trace()

  tu.configure_ipu_system(True, True, True)

  with tu.ipu_session() as sess:
    sess.run(variables.global_variables_initializer())
    sess.run(report)  # Result discarded; the trace is read again below.

    pooled = sess.run(output, {pa: data})
    self.assertAllClose(pooled, expected)

    trace = sess.run(report)
    self.assertEqual(len(trace), 4)

    trace_strings = tu.extract_all_strings_from_event_trace(trace)
    compute_sets = tu.get_compute_sets_from_report(trace_strings)

    whitelist = ['__seed*', 'avg/custom-call*/avgPool5x5']
    self.assertTrue(
        tu.check_all_compute_sets_and_list(compute_sets, whitelist))
def testInplaceTuple(self):
  """Runs a one-iteration while loop applying tanh to two loop variables.

  The same input is passed as both loop variables; both outputs must equal
  tanh(2). The trace must hold three events and the compute sets are
  checked against the whitelist below.
  """
  def my_net(x):
    def cond(i, x, y):
      # Loop while the counter is below 1, i.e. a single iteration from 0.
      return i < 1

    def body(i, x, y):
      i = i + 1
      x = nn.tanh(x)
      y = nn.tanh(y)
      return (i, x, y)

    i = 0
    # Drop the loop counter and return only the two tensor outputs.
    return control_flow_ops.while_loop(cond, body, (i, x, x))[1:]

  with ops.device('cpu'):
    x = array_ops.placeholder(np.float32, [4])
    report = gen_ipu_ops.ipu_event_trace()

  tu.configure_ipu_system()

  with ops.device("/device:IPU:0"):
    r = xla.compile(my_net, inputs=[x])

  with tu.ipu_session() as sess:
    sess.run(report)

    # Distinct names for the fetched values so the placeholder `x` is not
    # rebound while it is still the feed key.
    x_out, y_out = sess.run(r, {x: np.full([4], 2)})
    self.assertAllClose(x_out, np.full([4], np.tanh(2)))
    self.assertAllClose(y_out, np.full([4], np.tanh(2)))

    result = sess.run(report)
    # assertEqual reports the actual count on failure.
    self.assertEqual(len(result), 3)

    s = tu.extract_all_strings_from_event_trace(result)
    cs_list = tu.get_compute_sets_from_report(s)

    ok = [
        '__seed*',
        'Copy_*_to_*',
        'while/Tanh/tanh*/Op/Tanh',
        'while/Tanh_1/tanh*/Op/Tanh',
    ]
    self.assertTrue(tu.check_all_compute_sets_and_list(cs_list, ok))
def test3DConv8x8x8_WithBias(self):
  """Runs an 8x8x8 stride-4 VALID conv3d plus bias add on zero inputs.

  Checks the all-zero [1, 20, 20, 20, 4] result and compares the traced
  compute sets with the whitelist below.
  """
  in_shape = [1, 84, 84, 84, 2]
  kernel_shape = [8, 8, 8, 2, 4]

  with ops.device("/device:IPU:0"):
    inp = array_ops.placeholder(np.float32, in_shape, name="inp")
    wei = array_ops.placeholder(np.float32, kernel_shape, name="wei")
    bia = array_ops.placeholder(np.float32, [4], name="bia")

    conv = nn_ops.conv3d(
        inp, wei, strides=[1, 4, 4, 4, 1], padding="VALID")
    output = nn_ops.bias_add(conv, bia)

  with ops.device('cpu'):
    report = gen_ipu_ops.ipu_event_trace()

  tu.configure_ipu_system()

  with tu.ipu_session() as sess:
    sess.run(report)  # Result discarded; the trace is read again below.

    feed = {
        inp: np.zeros(in_shape),
        wei: np.zeros(kernel_shape),
        bia: np.zeros([4]),
    }
    conv_out = sess.run(output, feed)
    self.assertAllClose(conv_out, np.zeros([1, 20, 20, 20, 4]))

    trace = sess.run(report)
    trace_strings = tu.extract_all_strings_from_event_trace(trace)
    compute_sets = tu.get_compute_sets_from_report(trace_strings)

    whitelist = [
        '__seed*',
        'host-exchange-local-copy-',
        'Copy_',
        'Conv3D/convolution.*/Conv_8x8x8_stride4x4x4',
        'BiasAdd/fusion/addToChannel',
    ]
    self.assertTrue(
        tu.check_all_compute_sets_and_list(compute_sets, whitelist))
def testConvWithBnAndRelu(self):
  """Builds conv (with bias) -> fused batch norm -> ReLU and checks the trace.

  The trace must contain six events and the compute sets are compared with
  the whitelist below.
  """
  with ops.device("/device:IPU:0"):
    x = array_ops.placeholder(np.float32, shape=[1, 4, 4, 2])

    with variable_scope.variable_scope("vs", use_resource=True):
      conv_layer = layers.Conv2D(
          2,
          1,
          use_bias=True,
          kernel_initializer=init_ops.ones_initializer())
      conv_out = conv_layer(x)
      normed = layers_norm.batch_normalization(conv_out, fused=True)
      y = nn_ops.relu(normed)

  with ops.device('cpu'):
    report = gen_ipu_ops.ipu_event_trace()

  tu.configure_ipu_system(True, True, True)

  with tu.ipu_session() as sess:
    sess.run(variables.global_variables_initializer())
    sess.run(report)  # Result discarded; the trace is read again below.

    sess.run(y, {x: np.zeros([1, 4, 4, 2])})

    trace = sess.run(report)
    # 2xcompile, 1xupload 1xload, 1xdownload, 1xexecute
    self.assertEqual(len(trace), 6)

    trace_strings = tu.extract_all_strings_from_event_trace(trace)
    compute_sets = tu.get_compute_sets_from_report(trace_strings)

    whitelist = [
        '__seed*',
        'host-exchange-local-copy',
        'Copy_',
        'vs/conv2d/Conv2D/convolution.*/Conv_1x1',
        'vs/conv2d/BiasAdd',
        'vs/batch_normalization/FusedBatchNorm/batch-norm-inference.*/',
        'vs/Relu/custom-call/Nonlinearity',
    ]
    self.assertTrue(
        tu.check_all_compute_sets_and_list(compute_sets, whitelist))
def test3DConvBackpropInput(self):
  """Runs `conv3d_backprop_input_v2` with zero filter and zero gradients.

  Checks the all-zero [2, 8, 8, 8, 3] result and compares the traced compute
  sets with the whitelist below.
  """
  filter_shape = [2, 2, 2, 3, 5]
  grad_shape = [2, 8, 8, 8, 5]

  with ops.device("/device:IPU:0"):
    # Shape of the input whose gradient is being computed.
    ins = constant_op.constant([2, 8, 8, 8, 3], np.int32)
    fil = array_ops.placeholder(np.float32, filter_shape, name="inp")
    bck = array_ops.placeholder(np.float32, grad_shape, name="wei")

    output = nn_ops.conv3d_backprop_input_v2(
        ins, fil, bck, strides=[1, 1, 1, 1, 1], padding="SAME")

  with ops.device('cpu'):
    report = gen_ipu_ops.ipu_event_trace()

  tu.configure_ipu_system()

  with tu.ipu_session() as sess:
    sess.run(report)  # Result discarded; the trace is read again below.

    feed = {
        fil: np.zeros(filter_shape),
        bck: np.zeros(grad_shape),
    }
    grad_in = sess.run(output, feed)
    self.assertAllClose(grad_in, np.zeros([2, 8, 8, 8, 3]))

    trace = sess.run(report)
    trace_strings = tu.extract_all_strings_from_event_trace(trace)
    compute_sets = tu.get_compute_sets_from_report(trace_strings)

    whitelist = [
        '__seed*',
        'Copy_',
        'Conv3DBackpropInputV2/fusion*/Conv_2x2x2',
    ]
    self.assertTrue(
        tu.check_all_compute_sets_and_list(compute_sets, whitelist))
def testConvolutionsMatch(self):
  """Chains two identical 1x1 convolutions and checks the compute sets.

  The whitelist names only `vs/conv2d`, so the check is expected to fail if
  a separate `conv2d_1` compute set showed up in the trace.
  """
  with ops.device("/device:IPU:0"):
    x = array_ops.placeholder(np.float32, shape=[1, 4, 4, 2])

    with variable_scope.variable_scope("vs", use_resource=True):
      y = x
      # Two structurally identical layers applied one after the other.
      for _ in range(2):
        y = layers.Conv2D(
            2,
            1,
            use_bias=False,
            kernel_initializer=init_ops.ones_initializer())(y)

  with ops.device('cpu'):
    report = gen_ipu_ops.ipu_event_trace()

  tu.configure_ipu_system(True, True, True)

  with tu.ipu_session() as sess:
    sess.run(variables.global_variables_initializer())
    sess.run(report)  # Result discarded; the trace is read again below.

    sess.run(y, {x: np.zeros([1, 4, 4, 2])})

    trace = sess.run(report)
    trace_strings = tu.extract_all_strings_from_event_trace(trace)
    compute_sets = tu.get_compute_sets_from_report(trace_strings)

    # Would fail if there were two convolutions in the graph as they would be
    # called conv2d and conv2d_1
    whitelist = [
        '__seed*',
        'host-exchange-local-copy-',
        'vs/conv2d/Conv2D/convolution.*/Conv_1x1',
        'Copy_',
    ]
    self.assertTrue(
        tu.check_all_compute_sets_and_list(compute_sets, whitelist))
def testConv3x3_WithBias(self):
  """Runs a 3x3 SAME convolution plus bias add for each data format.

  For every entry of `self.data_formats` the all-zero result is checked and
  the traced compute sets are compared with the whitelist below. Shapes are
  passed through `self._ip_shp(shape, fmt)`.
  """
  for fmt in self.data_formats:
    with ops.device("/device:IPU:0"):
      pa = array_ops.placeholder(
          np.float32, self._ip_shp([1, 14, 14, 64], fmt), name="a")
      pb = array_ops.placeholder(np.float32, [3, 3, 64, 128], name="b")
      bi = array_ops.placeholder(np.float32, [128], name="b")

      conv = nn_ops.convolution(
          pa, pb, padding="SAME", data_format=fmt, name='cnv3')
      output = nn_ops.bias_add(conv, bi, data_format=fmt, name='ba3')

    with ops.device('cpu'):
      report = gen_ipu_ops.ipu_event_trace()

    tu.configure_ipu_system()

    with tu.ipu_session() as sess:
      sess.run(report)  # Result discarded; the trace is read again below.

      feed = {
          pa: np.zeros(self._ip_shp([1, 14, 14, 64], fmt)),
          pb: np.zeros([3, 3, 64, 128]),
          bi: np.zeros([128]),
      }
      conv_out = sess.run(output, feed)
      self.assertAllClose(conv_out,
                          np.zeros(self._ip_shp([1, 14, 14, 128], fmt)))

      trace = sess.run(report)
      trace_strings = tu.extract_all_strings_from_event_trace(trace)
      compute_sets = tu.get_compute_sets_from_report(trace_strings)

      whitelist = [
          '__seed*',
          'Copy_*actsRearranged',
          'host-exchange-local-copy-',
          'cnv3*/convolution.*/Conv_3x3',
          'ba3*/fusion/addToChannel',
      ]
      self.assertTrue(
          tu.check_all_compute_sets_and_list(compute_sets, whitelist))
def testFullyConnectedWithBias(self):
  """Runs `xw_plus_b` on constant 2x2 inputs and checks result and trace.

  With x filled with 3, weights filled with 4 and a bias of ones, every
  output element must be 25. The trace must contain four events and the
  compute sets are checked with `tu.check_compute_sets_in_whitelist_entries`.
  """
  with ops.device("/device:IPU:0"):
    x = array_ops.placeholder(np.float32, shape=[2, 2])
    weights = array_ops.placeholder(np.float32, shape=[2, 2])
    bias = array_ops.placeholder(np.float32, shape=[2])
    x_new = nn.xw_plus_b(x, weights, bias)

  with ops.device('cpu'):
    report = gen_ipu_ops.ipu_event_trace()

  tu.configure_ipu_system(True, True, True)

  with tu.ipu_session() as sess:
    sess.run(report)  # Result discarded; the trace is read again below.

    feed = {
        x: np.full([2, 2], 3),
        weights: np.full([2, 2], 4),
        bias: np.ones([2]),
    }
    out = sess.run(x_new, feed)
    self.assertAllClose(np.full([2, 2], 25), out)

    trace = sess.run(report)
    # 1xcompile, 1xload, 1xdownload, 1xexecute
    self.assertEqual(len(trace), 4)

    trace_strings = tu.extract_all_strings_from_event_trace(trace)
    compute_sets = tu.get_compute_sets_from_report(trace_strings)

    whitelist = [
        '__seed*',
        'host-exchange-local-copy',
        'xw_plus_b/MatMul/dot.*/Conv_1/Convolve',
        'xw_plus_b/fusion/addToChannel',
    ]
    self.assertTrue(
        tu.check_compute_sets_in_whitelist_entries(compute_sets, whitelist))
def testInplaceOpAddCopyWithInplaceParent(self):
  """Computes (pa[0:2] + pb[0:2]) / pc + pa[1:3] and checks the trace.

  With pa=[1, 2, 3], pb=[5, 6, 7] and pc=2 the result must be [5, 7]. The
  trace must hold three events and the compute sets are checked against the
  whitelist below.
  """
  with ops.device("/device:IPU:0"):
    pa = array_ops.placeholder(np.float32, [3])
    pb = array_ops.placeholder(np.float32, [3])
    pc = array_ops.placeholder(np.float32, [])
    c = array_ops.slice(pa, [0], [2])
    d = array_ops.slice(pb, [0], [2])
    e = c + d
    f = e / pc
    # Second, overlapping slice of the same input `pa`.
    g = array_ops.slice(pa, [1], [2])
    h = f + g

  with ops.device('cpu'):
    report = gen_ipu_ops.ipu_event_trace()

  tu.configure_ipu_system()

  with tu.ipu_session() as sess:
    sess.run(report)
    fd = {
        pa: [1, 2, 3],
        pb: [5, 6, 7],
        pc: 2,
    }
    result = sess.run(h, fd)
    self.assertAllClose(result, [5, 7])

    result = sess.run(report)
    # assertEqual reports the actual count on failure, unlike
    # assertTrue(len(result) == 3).
    self.assertEqual(len(result), 3)

    s = tu.extract_all_strings_from_event_trace(result)
    cs_list = tu.get_compute_sets_from_report(s)

    ok = [
        '__seed*',
        'Copy_XLA_Args/arg*_to_Slice*/slice*.clone',
        'add/add.*/AddTo',
        'truediv/divide.*/Op/Divide',
        'add_1/add.*/AddTo',
    ]
    self.assertTrue(tu.check_all_compute_sets_and_list(cs_list, ok))
def testIpuModelDeviceWithReport(self):
  """Adds two 2x2 inputs on the IPU device and inspects the trace events.

  With the default `tu.configure_ipu_system()` the trace must contain three
  events, in order: COMPILE_BEGIN, COMPILE_END, EXECUTE.
  """
  with ops.device("/device:IPU:0"):
    pa = array_ops.placeholder(np.float32, [2, 2], name="a")
    pb = array_ops.placeholder(np.float32, [2, 2], name="b")
    output = pa + pb

  with ops.device('cpu'):
    # The trace op carries a control dependency on `output`, which is why
    # fetching it below needs the feed dict.
    with ops.control_dependencies([output]):
      report = gen_ipu_ops.ipu_event_trace()

  tu.configure_ipu_system()

  with session_lib.Session() as sess:
    feed = {pa: [[1., 1.], [2., 3.]], pb: [[0., 1.], [4., 5.]]}
    sess.run(report, feed)

    total, rep = sess.run([output, report], feed)
    self.assertAllClose(total, [[1., 2.], [6., 8.]])
    self.assertEqual(len(rep), 3)

    evts = tu.extract_all_events(rep)
    expected_types = [
        IpuTraceEvent.COMPILE_BEGIN,
        IpuTraceEvent.COMPILE_END,
        IpuTraceEvent.EXECUTE,
    ]
    for position, expected_type in enumerate(expected_types):
      self.assertEqual(evts[position].type, expected_type)
def testRelu(self):
  """Runs ReLU on [-6, 0, 6] and checks the result and the trace.

  The output must be [0, 0, 6], the trace must hold three events and the
  compute sets are checked against the whitelist below.
  """
  with ops.device("/device:IPU:0"):
    pa = array_ops.placeholder(np.float32, [3], name="a")
    c = nn_ops.relu(pa)

  with ops.device('cpu'):
    report = gen_ipu_ops.ipu_event_trace()

  tu.configure_ipu_system()

  with tu.ipu_session() as sess:
    fd = {pa: [-6.0, 0.0, 6.0]}
    result = sess.run(c, fd)
    self.assertAllClose(result, [0.0, 0.0, 6.0])

    result = sess.run(report)
    # assertEqual reports the actual count on failure, unlike
    # assertTrue(len(result) == 3).
    self.assertEqual(len(result), 3)

    s = tu.extract_all_strings_from_event_trace(result)
    cs_list = tu.get_compute_sets_from_report(s)

    ok = ['__seed*', 'Relu/custom-call/Nonlinearity']
    self.assertTrue(tu.check_all_compute_sets_and_list(cs_list, ok))
def tesInplaceAddCopyWithInplacePeer(self):
  """Computes transpose(pa) / (pa + pb) and checks the result and the trace.

  The result is compared with the same expression evaluated in NumPy, the
  trace must hold three events, and the compute sets are checked against
  the whitelist below.

  NOTE(review): the method name starts with `tes`, not `test`, so standard
  unittest discovery will presumably skip it - confirm whether that is
  intentional before renaming.
  """
  data_a = np.array([[10, -20], [5, 1]])
  data_b = np.array([[-12, 11], [12, -13]])

  with ops.device("/device:IPU:0"):
    pa = array_ops.placeholder(np.float32, [2, 2])
    pb = array_ops.placeholder(np.float32, [2, 2])
    c = array_ops.transpose(pa)
    d = pa + pb
    e = c / d

  with ops.device('cpu'):
    report = gen_ipu_ops.ipu_event_trace()

  tu.configure_ipu_system()

  with tu.ipu_session() as sess:
    sess.run(report)
    fd = {
        pa: data_a,
        pb: data_b,
    }
    result = sess.run(e, fd)
    np_result = np.transpose(data_a) / (data_a + data_b)
    self.assertAllClose(result, np_result)

    result = sess.run(report)
    # assertEqual reports the actual count on failure.
    self.assertEqual(len(result), 3)  # compile_begin, compile_end, execute

    s = tu.extract_all_strings_from_event_trace(result)
    cs_list = tu.get_compute_sets_from_report(s)

    ok = [
        '__seed*',
        'host-exchange-local-copy-',
        'Copy_XLA_Args/arg0.*_to_transpose/transpose',
        'add/add.*/AddTo',
        'truediv/divide.*/Op/Divide',
    ]
    self.assertTrue(tu.check_all_compute_sets_and_list(cs_list, ok))
def testDepthwiseConv3x1(self):
  """Runs a 1x1 depthwise convolution over 3 channels, then adds a vector.

  The numeric result is compared with hand-written expected values and the
  traced compute sets are checked against the whitelist below.
  """
  with ops.device("/device:IPU:0"):
    pa = array_ops.placeholder(np.float32, [1, 2, 2, 3], name="a")
    pb = array_ops.placeholder(np.float32, [1, 1, 3, 1], name="b")
    pc = array_ops.placeholder(np.float32, [3], name="c")

    depthwise = nn.depthwise_conv2d(
        pa, pb, strides=[1, 1, 1, 1], padding="SAME")
    output = depthwise + pc

  with ops.device('cpu'):
    report = gen_ipu_ops.ipu_event_trace()

  tu.configure_ipu_system()

  with tu.ipu_session() as sess:
    sess.run(report)  # Result discarded; the trace is read again below.

    feed = {
        pa: [[[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]]],
        pb: [[[[6], [4], [2]]]],
        pc: [1, 1, 1]
    }
    # Each channel is scaled by its own filter value (6, 4, 2), then 1 is
    # added.
    expected = [[[[7, 9, 7], [25, 21, 13]], [[43, 33, 19], [61, 45, 25]]]]
    conv_out = sess.run(output, feed)
    self.assertAllClose(conv_out, expected)

    trace = sess.run(report)
    trace_strings = tu.extract_all_strings_from_event_trace(trace)
    compute_sets = tu.get_compute_sets_from_report(trace_strings)

    whitelist = [
        '__seed*',
        'host-exchange-local-copy-',
        'Copy_',
        'depthwise/convolution.*/Conv_1x1',
        'Copy_depthwise/convolution.*/Conv_1x1/partials_to_depthwise/convolution.*/Conv_1x1/partials[[]cloned[]]',
        'add/fusion*/addToChannel',
    ]
    self.assertTrue(
        tu.check_all_compute_sets_and_list(compute_sets, whitelist))
def testTruncatedNormalInitalizer(self):
  """Initialises a variable with a truncated normal on the IPU device.

  The variable uses mean 1.0 and stddev 0.01, so its value must be close to
  ones within the 0.2 tolerances. The compute sets produced while running
  are checked against the whitelist below.
  """
  with ops.device('cpu'):
    report = gen_ipu_ops.ipu_event_trace()

  with ops.device("/device:IPU:0"):
    with variable_scope.variable_scope("", use_resource=True):
      initializer = init_ops.truncated_normal_initializer(
          mean=1.0, stddev=0.01)
      z = variable_scope.get_variable(
          "z1", shape=[2, 4], dtype=np.float32, initializer=initializer)

  tu.configure_ipu_system()

  with tu.ipu_session() as sess:
    # Clean existing reports
    sess.run(report)

    sess.run(variables.global_variables_initializer())
    z_value = sess.run(z)
    self.assertAllClose(z_value, np.ones((2, 4)), 0.2, 0.2)

    # Find the names of the compute sets
    trace = sess.run(report)
    trace_strings = tu.extract_all_strings_from_event_trace(trace)
    compute_sets = tu.get_compute_sets_from_report(trace_strings)

    whitelist = [
        '__seed*',
        'z1/Initializer/truncated_normal/TruncatedNormal/custom-call*/truncatedNormal',
        'z1/Initializer/truncated_normal/mul/multiply.*/Op/Multiply',
        'z1/Initializer/truncated_normal/add.*/AddTo',
    ]
    self.assertTrue(
        tu.check_all_compute_sets_and_list(compute_sets, whitelist))
def test3DConv3x3x3_WithBias(self):
  """Runs a 3x3x3 SAME convolution plus bias add on zero inputs.

  Checks the all-zero [1, 14, 14, 14, 32] result and compares the traced
  compute sets with the whitelist below.
  """
  in_shape = [1, 14, 14, 14, 16]
  kernel_shape = [3, 3, 3, 16, 32]

  with ops.device("/device:IPU:0"):
    pa = array_ops.placeholder(np.float32, in_shape, name="a")
    pb = array_ops.placeholder(np.float32, kernel_shape, name="b")
    bi = array_ops.placeholder(np.float32, [32], name="b")

    conv = nn_ops.convolution(pa, pb, padding="SAME")
    output = nn_ops.bias_add(conv, bi)

  with ops.device('cpu'):
    report = gen_ipu_ops.ipu_event_trace()

  tu.configure_ipu_system()

  with tu.ipu_session() as sess:
    sess.run(report)  # Result discarded; the trace is read again below.

    feed = {
        pa: np.zeros(in_shape),
        pb: np.zeros(kernel_shape),
        bi: np.zeros([32]),
    }
    conv_out = sess.run(output, feed)
    self.assertAllClose(conv_out, np.zeros([1, 14, 14, 14, 32]))

    trace = sess.run(report)
    trace_strings = tu.extract_all_strings_from_event_trace(trace)
    compute_sets = tu.get_compute_sets_from_report(trace_strings)

    whitelist = [
        '__seed*',
        'host-exchange-local-copy-',
        'Copy_',
        'convolution/convolution.*/Conv_3x3x3',
        'BiasAdd/fusion/addToChannel',
    ]
    self.assertTrue(
        tu.check_all_compute_sets_and_list(compute_sets, whitelist))
def testInTopK(self):
  """Checks `nn.in_top_k` when the target is each row's arg-max index.

  Random predictions are generated, the index of the largest entry of each
  row is fed as the target, and with k=8 every row must report True. The
  trace must hold three events.
  """
  batchsize = 4
  n_categories = 1200
  topn = 8

  def model(a, b):
    return nn.in_top_k(a, b, topn)

  with ops.device('cpu'):
    pa = array_ops.placeholder(np.float32, [batchsize, n_categories])
    pb = array_ops.placeholder(np.int32, [batchsize])
    report = gen_ipu_ops.ipu_event_trace()

  with ops.device("/device:IPU:0"):
    out = model(pa, pb)

  tu.configure_ipu_system()

  with tu.ipu_session() as sess:
    sess.run(report)

    # Named `predictions` rather than `input` so the builtin is not shadowed.
    predictions = np.random.rand(batchsize, n_categories)
    predictions = predictions / np.sqrt(np.sum(predictions**2))

    # Sorting the negated values ascending puts the largest entry first, so
    # column 0 holds each row's arg-max index.
    ref = (-predictions).argsort(axis=1)[:, :1]
    ref = ref.reshape([batchsize])

    expected = [True] * batchsize

    fd = {pa: predictions, pb: ref}
    result = sess.run(out, fd)
    # Compare against `expected` so the check follows `batchsize` instead of
    # a hard-coded four-element list.
    self.assertAllClose(result, expected)

    result = sess.run(report)
    # assertEqual reports the actual count on failure.
    self.assertEqual(len(result), 3)
def testReportEveryNthExecution(self):
  """Checks which executions carry an execution report.

  The same addition is run five times under three configurations:
    * default: every execute event after the first has an empty report;
    * `report_every_nth_execution=2`: odd-indexed events have empty reports;
    * `report_every_nth_execution=1`: every event has a non-empty report.
  """
  with ops.device("/device:IPU:0"):
    pa = array_ops.placeholder(np.float32, [2, 2], name="a")
    pb = array_ops.placeholder(np.float32, [2, 2], name="b")
    out = math_ops.add(pa, pb)

  with ops.device('cpu'):
    report = gen_ipu_ops.ipu_event_trace()

  def expect_empty_report(evt):
    self.assertEqual(len(evt.execute.execution_report), 0)

  def expect_nonempty_report(evt):
    self.assertGreater(len(evt.execute.execution_report), 0)

  def run_five_times_and_check(check_event):
    """Runs `out` five times in a fresh session and checks each event.

    `check_event(index, event)` is called for every execute event while the
    session is still open, matching where the original assertions ran.
    """
    with tu.ipu_session() as sess:
      fd = {pa: [[1., 1.], [2., 3.]], pb: [[0., 1.], [4., 5.]]}
      sess.run(report, fd)

      for _ in range(5):
        sess.run(out, fd)

      rep = sess.run(report, fd)
      evts = tu.extract_all_execute_events(rep)
      self.assertEqual(len(evts), 5)  # execute x 5

      for i, e in enumerate(evts):
        check_event(i, e)

      # Explicit close kept from the original, ahead of leaving the block.
      sess.close()

  def only_first_may_report(i, e):
    if i > 0:
      expect_empty_report(e)

  def odd_executions_are_empty(i, e):
    if i % 2 != 0:
      expect_empty_report(e)

  def every_execution_reports(i, e):
    expect_nonempty_report(e)

  tu.configure_ipu_system(compilation_trace=False)
  run_five_times_and_check(only_first_may_report)

  tu.configure_ipu_system(compilation_trace=False,
                          report_every_nth_execution=2)
  run_five_times_and_check(odd_executions_are_empty)

  tu.configure_ipu_system(compilation_trace=False,
                          report_every_nth_execution=1)
  run_five_times_and_check(every_execution_reports)
def testResourceCountsAreCorrect(self):
  """Counts host<->device transfer events for a two-layer model.

  After five training steps each weight and bias must have been transferred
  host-to-device exactly once and nothing device-to-host. After explicitly
  fetching `w1` and `b1`, only the two weight outputs are transferred
  device-to-host.
  """
  def float_constant(values):
    # Constant initializer from a float32 array of the given values.
    return init_ops.constant_initializer(np.array(values, dtype=np.float32))

  def split_by_direction(events):
    # Events are indexed as (type, name, ...); split them on the type field.
    to_device = [
        e for e in events if e[0] == IpuTraceEvent.HOST_TO_DEVICE_TRANSFER
    ]
    to_host = [
        e for e in events if e[0] == IpuTraceEvent.DEVICE_TO_HOST_TRANSFER
    ]
    return to_device, to_host

  def occurrences(events, handle):
    # Number of events whose name field equals `handle`.
    return sum(1 for e in events if e[1] == handle)

  with ops.device("/device:IPU:0"):
    with variable_scope.variable_scope("vs", use_resource=True):
      w1 = variable_scope.get_variable(
          "w1",
          shape=[4, 2],
          dtype=np.float32,
          initializer=float_constant([[1, 2], [3, 4], [5, 6], [7, 8]]))
      b1 = variable_scope.get_variable(
          "b1",
          shape=[2],
          dtype=np.float32,
          trainable=False,
          initializer=float_constant([2, 3]))
      w2 = variable_scope.get_variable(
          "w2",
          shape=[2, 2],
          dtype=np.float32,
          initializer=float_constant([[1, 2], [3, 4]]))
      b2 = variable_scope.get_variable(
          "b2",
          shape=[2],
          dtype=np.float32,
          trainable=False,
          initializer=float_constant([2, 3]))

    x = array_ops.placeholder(np.float32, shape=[1, 4])
    y = math_ops.matmul(x, w1) + b1
    y = math_ops.matmul(y, w2) + b2

    loss = math_ops.reduce_sum(y)
    optimizer = gradient_descent.GradientDescentOptimizer(0.1)
    train = optimizer.minimize(loss)

  with ops.device('cpu'):
    report = gen_ipu_ops.ipu_event_trace()

  tu.configure_ipu_system(True, True, True)

  with tu.ipu_session() as sess:
    sess.run(variables.global_variables_initializer())
    sess.run(report)

    # Five training steps alternating between two input batches.
    batch_a = [[7, 3, 5, 9]]
    batch_b = [[1, 2, 3, 4]]
    for batch in (batch_a, batch_b, batch_a, batch_b, batch_a):
      sess.run([train, loss], {x: np.array(batch, dtype=np.float32)})

    d_dl = "0.0"
    w1_dl = "1.0"
    b1_dl = "2.0"
    w2_dl = "3.0"
    b2_dl = "4.0"

    # biases are not outputs of the graph
    d_ul = "out_0.0"
    w1_ul = "out_1.0"
    w2_ul = "out_2.0"

    rep = sess.run(report)
    io_evts = tu.extract_all_io_events(rep)

    # The initialization is constant, so there are no events generated on the
    # IPU.
    host_to_device, device_to_host = split_by_direction(io_evts)
    self.assertEqual(len(list(io_evts)), 4)

    # Weights/biases should be downloaded once, and the input no times
    # because it is streamed
    for handle, count in ((d_dl, 0), (w1_dl, 1), (b1_dl, 1), (w2_dl, 1),
                          (b2_dl, 1)):
      self.assertEqual(occurrences(host_to_device, handle), count)

    # Weights should not be uploaded, and the loss is streamed
    for handle in (d_ul, w1_ul, w2_ul):
      self.assertEqual(occurrences(device_to_host, handle), 0)

    # Explicitly fetch the first set of weights and biases
    vw, vb = sess.run([w1, b1])

    expected_w1 = np.array(
        [[100.00576782, 86.60944366], [57.62784195, 51.23856354],
         [93.45920563, 82.40240479], [155.36032104, 135.74447632]],
        dtype=np.float32)
    self.assertAllClose(expected_w1, vw, rtol=1e-4)
    self.assertAllClose(np.array([2, 3], dtype=np.float32), vb, rtol=1e-4)

    rep = sess.run(report)
    io_evts = tu.extract_all_io_events(rep)

    host_to_device, device_to_host = split_by_direction(io_evts)
    self.assertEqual(len(list(io_evts)), 2)

    # Weights/biases/inputs should not be downloaded at all
    for handle in (d_dl, w1_dl, b1_dl, w2_dl, b2_dl):
      self.assertEqual(occurrences(host_to_device, handle), 0)

    # Weights should be uploaded once (explicitly fetched)
    # Note all weights are fetched as a group
    for handle, count in ((d_ul, 0), (w1_ul, 1), (w2_ul, 1)):
      self.assertEqual(occurrences(device_to_host, handle), count)
def testVariablesRemainResident(self):
  """Counts host<->device transfer events for a single-layer model.

  After five training steps the weight and bias must each have been
  transferred host-to-device exactly once and nothing device-to-host. After
  explicitly fetching both variables, each is transferred device-to-host
  exactly once and nothing is transferred host-to-device.
  """
  def float_constant(values):
    # Constant initializer from a float32 array of the given values.
    return init_ops.constant_initializer(np.array(values, dtype=np.float32))

  def split_by_direction(events):
    # Events are indexed as (type, name, ...); split them on the type field.
    to_device = [
        e for e in events if e[0] == IpuTraceEvent.HOST_TO_DEVICE_TRANSFER
    ]
    to_host = [
        e for e in events if e[0] == IpuTraceEvent.DEVICE_TO_HOST_TRANSFER
    ]
    return to_device, to_host

  def occurrences(events, handle):
    # Number of events whose name field equals `handle`.
    return sum(1 for e in events if e[1] == handle)

  with ops.device("/device:IPU:0"):
    with variable_scope.variable_scope("vs", use_resource=True):
      w = variable_scope.get_variable(
          "w",
          shape=[4, 2],
          dtype=np.float32,
          initializer=float_constant([[1, 2], [3, 4], [5, 6], [7, 8]]))
      b = variable_scope.get_variable(
          "b",
          shape=[2],
          dtype=np.float32,
          initializer=float_constant([2, 3]))

    x = array_ops.placeholder(np.float32, shape=[1, 4])
    y = math_ops.matmul(x, w) + b

    loss = math_ops.reduce_sum(y)
    optimizer = gradient_descent.GradientDescentOptimizer(0.1)
    train = optimizer.minimize(loss)

  with ops.device('cpu'):
    report = gen_ipu_ops.ipu_event_trace()

  tu.configure_ipu_system(True, True, True)

  with tu.ipu_session() as sess:
    sess.run(variables.global_variables_initializer())
    sess.run(report)

    # Five training steps alternating between two input batches.
    batch_a = [[7, 3, 5, 9]]
    batch_b = [[1, 2, 3, 4]]
    for batch in (batch_a, batch_b, batch_a, batch_b, batch_a):
      sess.run([train, loss], {x: np.array(batch, dtype=np.float32)})

    d_dl = "0.0"
    d_ul = "out_0.0"
    w_dl = "1.0"
    w_ul = "out_1.0"
    b_dl = "2.0"
    b_ul = "out_2.0"

    rep = sess.run(report)
    io_evts = tu.extract_all_io_events(rep)
    self.assertEqual(len(list(io_evts)), 2)

    # The initialization is constant, so there are no events generated on the
    # IPU.
    host_to_device, device_to_host = split_by_direction(io_evts)

    # Weights/biases should be downloaded once, and the input no times
    # because it is streamed
    for handle, count in ((d_dl, 0), (w_dl, 1), (b_dl, 1)):
      self.assertEqual(occurrences(host_to_device, handle), count)

    # Weights/biases should not be uploaded, and the loss is streamed
    for handle in (d_ul, w_ul, b_ul):
      self.assertEqual(occurrences(device_to_host, handle), 0)

    # Explicitly fetch the weights
    vw, vb = sess.run([w, b])

    expected_w = np.array(
        [[-1.3, -0.3], [1.7, 2.7], [2.9, 3.9], [3.5, 4.5]],
        dtype=np.float32)
    self.assertAllClose(expected_w, vw, rtol=1e-4)
    self.assertAllClose(np.array([1.5, 2.5], dtype=np.float32), vb, rtol=1e-4)

    rep = sess.run(report)
    io_evts = tu.extract_all_io_events(rep)

    host_to_device, device_to_host = split_by_direction(io_evts)
    self.assertEqual(len(list(io_evts)), 2)

    # Weights/biases/inputs should not be downloaded at all
    for handle in (d_dl, w_dl, b_dl):
      self.assertEqual(occurrences(host_to_device, handle), 0)

    # Weights/biases should be uploaded once (explicitly fetched)
    for handle, count in ((d_ul, 0), (w_ul, 1), (b_ul, 1)):
      self.assertEqual(occurrences(device_to_host, handle), count)