def transformer_factory(request):
    """Activate the transformer named by ``request.param`` and yield its factory.

    The factory is built with ``ngt.make_transformer_factory`` and installed
    with ``ngt.set_transformer_factory`` before being yielded.  Once the
    generator is resumed, the active factory is set back to "numpy".

    NOTE(review): presumably decorated as a pytest fixture; the decorator is
    not visible in this block.
    """
    requested_factory = ngt.make_transformer_factory(request.param)
    ngt.set_transformer_factory(requested_factory)
    yield requested_factory

    # Reset transformer factory to default
    default_factory = ngt.make_transformer_factory("numpy")
    ngt.set_transformer_factory(default_factory)
def make_and_set_transformer_factory(args):
    """Build the transformer factory for ``args.backend`` and make it active.

    When the backend is the flex GPU transformer, the flex-specific options
    are read from ``args`` and forwarded as keyword arguments; any option
    missing from ``args`` is passed as ``False``.  Other backends get no
    extra keyword arguments.

    :param args: parsed command-line namespace providing ``backend`` and,
        optionally, the flex options.
    """
    factory_kwargs = {}
    if args.backend == flex_gpu_transformer_name:
        # Options absent from args fall back to False (they are store_true
        # style flags in add_argument, which is easy to confuse).
        factory_kwargs = {
            option: getattr(args, option, False)
            for option in ('fixed_point', 'flex_verbose', 'collect_flex_data')
        }

    factory = ngt.make_transformer_factory(args.backend, **factory_kwargs)
    ngt.set_transformer_factory(factory)
def test_hetr_graph_passes():
    """Run the hetr graph passes on a small ``x + y`` graph and verify each one.

    The device-assign and communication passes are checked through their
    helper functions; the child-transformer pass is run directly and its
    ``transformer_list`` is compared with the expected transformer names.
    """
    # Build the graph
    with ng.metadata(device_id='1'):
        x = ng.placeholder(())

    y = ng.placeholder(())
    x_plus_y = x + y

    # Build the graph metadata: expected [device, device_id] for each op
    graph_ops = OrderedSet([x_plus_y, x, y])

    graph_op_metadata = {op: list() for op in graph_ops}
    graph_op_metadata[x] = ["numpy", '1']
    graph_op_metadata[y] = ["numpy", '0']
    graph_op_metadata[x_plus_y] = ["numpy", '0']

    transformer_list = ["numpy1", "numpy0"]

    # Run the hetr passes one by one, and verify they did the expected things to the graph
    check_device_assign_pass("numpy", "0", graph_op_metadata, graph_ops)
    check_communication_pass(ops_to_transform=graph_ops,
                             expected_recv_nodes=[x_plus_y])

    # Check if the hetr pass (ChildTransformerPass) generates the expected transformer list
    obj = ChildTransformerPass([])
    transformer = ngt.make_transformer_factory('hetr')()
    try:
        obj.do_pass(graph_ops, transformer)
    finally:
        # Close the transformer even when the pass raises, so it is not leaked.
        transformer.close()

    assert set(transformer_list) == set(obj.transformer_list)
def run(self):
    """Evaluate ``self.y`` on a 'cpu' transformer and publish the result.

    The computed value is put on ``self.results_qs``.  The method then polls
    ``self.exit`` every 0.1 seconds and returns only after it has been set;
    the transformer stays open for that whole time.
    """
    cpu_transformer = ngt.make_transformer_factory('cpu')()
    with closing(cpu_transformer) as t:
        evaluate = t.computation(self.y)
        self.results_qs.put(evaluate())

        # Keep the transformer open until we are told to exit.
        while True:
            if self.exit.is_set():
                break
            time.sleep(0.1)
def time(self, n_iterations, n_skip, computation_name, visualize, subgraph_attr=None,
         preprocess=False):
    """
    Run ``self.computation`` repeatedly and record the time of each iteration
    (nothing model-specific inside, can be reused).

    The first ``n_skip`` iterations are executed but not timed; iterations
    ``n_skip`` .. ``n_iterations - 1`` are timed with ``Benchmark.marker``.

    :param n_iterations: total number of iterations (untimed ones included)
    :param n_skip: number of leading iterations that are not timed
    :param computation_name: key under which the timings are stored
    :param visualize: when true, a VizPass is registered on the transformer
    :param subgraph_attr: forwarded to the VizPass
    :param preprocess: forwarded to ``self.fill_feed_dict``
    :return: mapping ``times[computation_name][iteration]`` -> measured time
    """
    times = DefaultOrderedDict()
    feed_dict = self.fill_feed_dict(self.train_set, self.inputs, preprocess)
    start = Benchmark.marker.init_mark()
    end = Benchmark.marker.init_mark()

    factory = ngt.make_transformer_factory(self.transformer, device=self.device)
    with closing(factory()) as transformer:
        if visualize:
            viz_pass = ngraph.transformers.passes.nviz.VizPass(
                show_axes=True,
                show_all_metadata=True,
                subgraph_attr=subgraph_attr)
            transformer.register_graph_pass(viz_pass)

        model_out_computation = transformer.add_computation(self.computation)

        # Untimed iterations
        for _ in range(n_skip):
            model_out_computation(feed_dict=feed_dict)

        # Timed iterations, indexed by their absolute iteration number
        for iteration in range(n_skip, n_iterations):
            Benchmark.marker.record_mark(start)
            model_out_computation(feed_dict=feed_dict)
            Benchmark.marker.record_mark(end)
            times[computation_name][iteration] = Benchmark.marker.get_time(start, end)

    return times
def test_distributed_dot_parallel_second_axis(hetr_device):
    """Check two chained dots where the parallel axis is the second axis of ``x``.

    ``dot1`` is built under parallel metadata for devices '0' and '1';
    ``dot2`` consumes it outside that metadata block.  Both results are
    compared against NumPy, requesting the outputs in both orders.
    """
    if hetr_device == 'gpu':
        # Implicit string concatenation keeps the reason free of the stray
        # backslash/indentation that a backslash continuation inside the
        # literal would embed.
        pytest.xfail(
            "Axes Layout needs to be fixed for GPUs after changes to make "
            "parallel_axis the least contiguous axis for scatter/gather "
            "communication ops"
        )

    H = ng.make_axis(length=6, name='height')
    N = ng.make_axis(length=8, name='batch')
    W1 = ng.make_axis(length=2, name='W1')
    W2 = ng.make_axis(length=4, name='W2')
    x = ng.placeholder(axes=[H, N])
    w2 = ng.placeholder(axes=[W2, W1])
    with ng.metadata(device=hetr_device, device_id=('0', '1'), parallel=N):
        w1 = ng.placeholder(axes=[W1, H])
        dot1 = ng.dot(w1, x).named("dot1")
    dot2 = ng.dot(w2, dot1).named("dot2")

    np_x = np.random.randint(100, size=[H.length, N.length])
    np_w1 = np.random.randint(100, size=[W1.length, H.length])
    np_w2 = np.random.randint(100, size=[W2.length, W1.length])

    expected_dot1 = np.dot(np_w1, np_x)
    expected_dot2 = np.dot(np_w2, expected_dot1)

    with closing(ngt.make_transformer_factory(
            'hetr', device=hetr_device)()) as transformer:
        computation = transformer.computation([dot2, dot1], x, w1, w2)
        res2, res1 = computation(np_x, np_w1, np_w2)
        np.testing.assert_array_equal(res1, expected_dot1)
        np.testing.assert_array_equal(res2, expected_dot2)

        # Same outputs requested in the opposite order
        computation2 = transformer.computation([dot1, dot2], x, w1, w2)
        res1, res2 = computation2(np_x, np_w1, np_w2)
        np.testing.assert_array_equal(res1, expected_dot1)
        np.testing.assert_array_equal(res2, expected_dot2)
def test_one_dot_bprop_allreduce(config):
    """Check a one-dot backprop update whose gradient carries a reduce function.

    The test is currently marked xfail unconditionally; the remainder
    documents the intended check: ``W - dL/dW`` evaluated through a hetr
    transformer must equal ``config['expected_result']``.
    """
    c = config

    pytest.xfail(
        "GPU child transformers generate errors during AssignLayouts graph pass #1651"
    )

    height = ng.make_axis(length=4, name='height')
    width = ng.make_axis(length=6, name='width')

    with ng.metadata(step='input'):
        X = ng.placeholder(axes=[height, width])
        target = ng.constant(1, axes=[width])

    with ng.metadata(device_id=c['device_id'], parallel=width):
        W = ng.variable(axes=[height], initial_value=UniformInit(1, 1))
        dot = ng.dot(W, X)
        loss = ng.squared_L2(target - dot, out_axes=())
        grad = ng.deriv(loss, W)
        # Reduce function applied to the gradient, as supplied by the config
        grad.metadata['reduce_func'] = c['func']
        update = (W - grad)

    hetr_factory = ngt.make_transformer_factory('hetr')
    with closing(hetr_factory()) as hetr:
        out_comp = hetr.computation([update], X)
        result = out_comp(c['input'])

        np.testing.assert_array_equal(result, c['expected_result'])
def test_allreduce_hint(hetr_device, config):
    """Check ``var_A - var_B / n_devices`` when ``var_B`` is reduced with 'sum'.

    ``var_B`` is initialised to ``config['input']`` and tagged with
    ``reduce_func = 'sum'``; every element of the result must equal
    ``config['expected_result']``.
    """
    if hetr_device == 'gpu' and 'gpu' not in ngt.transformer_choices():
        pytest.skip("GPUTransformer not available")

    fill_value = config['input']
    device_id = config['device_id']
    axis_A = ng.make_axis(length=4, name='axis_A')
    parallel_axis = ng.make_axis(name='axis_parallel', length=16)

    with ng.metadata(device=hetr_device,
                     device_id=device_id,
                     parallel=parallel_axis):
        var_A = ng.variable(axes=[axis_A], initial_value=UniformInit(1, 1))
        var_B = ng.variable(axes=[axis_A],
                            initial_value=UniformInit(fill_value, fill_value))
        var_B.metadata['reduce_func'] = 'sum'
        # Divide the reduced value by the number of devices
        var_B_mean = var_B / len(device_id)
        var_minus = (var_A - var_B_mean)

    factory = ngt.make_transformer_factory('hetr', device=hetr_device)
    with closing(factory()) as hetr:
        out_comp = hetr.computation(var_minus)
        result = out_comp()
        np_result = np.full((axis_A.length), config['expected_result'], np.float32)
        np.testing.assert_array_equal(result, np_result)
def transformer_factory(request):
    """Yield the factory for ``request.param``, honouring ``--enable_flex``.

    * With ``--enable_flex`` set, only the flex GPU transformer is used; every
      other transformer is skipped, and a ``ValueError`` is raised when the
      flex GPU transformer is not among ``ngt.transformer_choices()``.
    * Without ``--enable_flex``, the flex GPU transformer is skipped and all
      other transformers are used.

    After the yield, the active factory is reset to "numpy".

    The option is read through ``request.config`` rather than the
    ``pytest.config`` global, which was removed in pytest 5.

    :raises ValueError: ``--enable_flex`` is set but the flex GPU transformer
        is not available.
    """
    def set_and_get_factory(transformer_name):
        factory = ngt.make_transformer_factory(transformer_name)
        ngt.set_transformer_factory(factory)
        return factory

    transformer_name = request.param
    is_flex = transformer_name == flex_gpu_transformer_name

    if request.config.getoption("--enable_flex"):
        if not is_flex:
            pytest.skip(
                'Skip all other transformers since --enable_flex is set.')
        if flex_gpu_transformer_name not in ngt.transformer_choices():
            # The trailing space was missing before ("--enable_flexflag").
            raise ValueError("GPU not found, should not set --enable_flex "
                             "flag for py.test.")
    elif is_flex:
        pytest.skip('Skip flex test since --enable_flex is not set.')

    yield set_and_get_factory(transformer_name)

    # Reset transformer factory to default
    ngt.set_transformer_factory(ngt.make_transformer_factory("numpy"))
def test_gpu_graph(config):
    """Check ``x + 1 + 1`` where the middle op is parallelised across GPUs.

    The test is currently marked xfail unconditionally; the remainder
    documents the intended check against ``np_x + 2``.
    """
    pytest.xfail("Multi-GPU testing not enabled yet")

    if 'gpu' not in ngt.transformer_choices():
        pytest.skip('GPUTransformer not available!')

    t = config
    axes = t['axes']
    device_ids = t['device_id']

    with ng.metadata(device='gpu'):
        x = ng.placeholder(axes=axes)

    with ng.metadata(device='gpu', device_id=device_ids,
                     parallel=t['parallel_axis']):
        x_plus_one = x + 1

    with ng.metadata(device='gpu'):
        x_plus_two = x_plus_one + 1

    # One GPU per configured device id
    os.environ["HETR_SERVER_GPU_NUM"] = str(len(device_ids))

    np_x = np.random.randint(100, size=axes.full_lengths)
    hetr_factory = ngt.make_transformer_factory('hetr')
    with closing(hetr_factory()) as transformer:
        computation = transformer.computation(x_plus_two, x)
        res = computation(np_x)
        np.testing.assert_array_equal(res, np_x + 2)
def test_reduce_vector(hetr_device):
    """
    A whole vector is produced on each worker and should be reduced
    before being returned, but not along its axes since it does not
    have the parallel axis in its axes
    """
    if hetr_device == 'gpu':
        pytest.xfail("broadcast communication ops not yet supported on gpus")

    height = ng.make_axis(length=4, name='height')
    batch = ng.make_axis(length=8, name='batch')
    weight = ng.make_axis(length=2, name='weight')

    x = ng.placeholder(axes=[batch, height])
    w = ng.placeholder(axes=[height, weight])
    with ng.metadata(device=hetr_device, device_id=('0', '1'), parallel=batch):
        dot = ng.dot(x, w)
        # Summing over the batch (parallel) axis leaves a vector on each worker
        out = ng.sum(dot, batch)

    np_x = np.random.randint(100, size=[batch.length, height.length])
    np_weight = np.random.randint(100, size=[height.length, weight.length])

    factory = ngt.make_transformer_factory('hetr', device=hetr_device)
    with closing(factory()) as transformer:
        computation = transformer.computation(out, x, w)
        res = computation(np_x, np_weight)
        # TODO should the reduce infer a sum or mean?
        expected = np.sum(np.dot(np_x, np_weight), 0) / 2.
        np.testing.assert_array_equal(res, expected)
def test_allreduce_hint_gpu(config):
    """Check ``var_A - var_B`` on GPU when ``var_B`` carries a reduce function.

    The test is currently marked xfail unconditionally; the remainder
    documents the intended check against an array filled with
    ``config['expected_result']``.
    """
    pytest.xfail("Multi-GPU testing not enabled yet")

    if 'gpu' not in ngt.transformer_choices():
        pytest.skip("GPUTransformer not available")

    c = config
    # One GPU per configured device id
    os.environ["HETR_SERVER_GPU_NUM"] = str(len(c['device_id']))

    ax_A_length = 32
    ax_B_length = 16

    np_result = [np.full((ax_A_length, ax_B_length), c['expected_result'], np.float32)]
    parallel_axis = ng.make_axis(name='axis_parallel', length=16)

    with ng.metadata(device_id=c['device_id'], parallel=parallel_axis):
        axis_A = ng.make_axis(length=ax_A_length, name='axis_A')
        axis_B = ng.make_axis(length=ax_B_length, name='axis_B')
        var_A = ng.variable(
            axes=[axis_A],
            initial_value=UniformInit(1, 1),
        ).named('var_A')
        var_B = ng.variable(
            initial_value=UniformInit(c['input'], c['input']),
            axes=[axis_B],
        ).named('var_B')
        var_B.metadata['reduce_func'] = c['func']
        var_minus = (var_A - var_B).named('var_minus')

    gpu_hetr_factory = ngt.make_transformer_factory('hetr', device='gpu')
    with closing(gpu_hetr_factory()) as hetr:
        out_comp = hetr.computation([var_minus]).named('out_comp')
        result = out_comp()
        np.testing.assert_array_equal(result, np_result)
def test_get_layouts(config):
    """Check the axes of the first layout the GPU transformer reports per comm op.

    Five communication ops are built (scatter send, scatter recv, gather
    recv, gather send, all-reduce).  For each, ``get_layouts(op)[0].axes`` is
    collected and the list is compared with ``config['expected_layouts']``.
    The transformer is closed when the test finishes, whether it passes or
    fails.
    """
    test_transformer = ngt.make_transformer_factory('gpu')()

    t = config
    parallel_axis = t['parallel_axis']

    def host_metadata():
        # Fresh metadata dict for an op on the single device 'gpu0'
        return dict(device='gpu', device_id='0', parallel=parallel_axis,
                    transformer='gpu0', host_transformer=None)

    def workers_metadata():
        # Fresh metadata dict for an op spread over 'gpu0' and 'gpu1'
        return dict(device='gpu', device_id=('0', '1'), parallel=parallel_axis,
                    transformer=['gpu0', 'gpu1'], host_transformer=None)

    def host_tensor():
        # TensorValueOp over a new placeholder, tagged with host metadata
        return TensorValueOp(ng.placeholder(t['axes']), metadata=host_metadata())

    def scatter_send():
        # Scatter-send from the host tensor towards the two workers
        return GPUCudaScatterSendOp(host_tensor(),
                                    ng.Op(metadata=workers_metadata()))

    try:
        with ng.metadata(parallel=parallel_axis):
            test_ops = [
                scatter_send(),
                GPUCudaScatterRecvOp(
                    ng.Op(metadata=workers_metadata()),
                    scatter_send()
                ),
                GPUCudaGatherRecvOp(
                    ng.Op(metadata=workers_metadata()),
                    ng.Op(metadata=host_metadata()),
                    scatter_send()
                ),
                GPUCudaGatherSendOp(host_tensor()),
                GPUCudaAllReduceOp(input_node=host_tensor(), func='sum'),
            ]

        test_layouts = [test_transformer.get_layouts(op)[0].axes
                        for op in test_ops]
        np.testing.assert_array_equal(test_layouts, t['expected_layouts'])
    finally:
        # The transformer used to be left open; release it on every path.
        test_transformer.close()
def run(self):
    """Evaluate ``self.y`` on a 'gpu' transformer bound to ``self.device_id``.

    The computed value is put on ``self.queue``.  The method then polls
    ``self.exit`` every 0.1 seconds and returns only after it has been set;
    the transformer stays open for that whole time.
    """
    gpu_factory = ngt.make_transformer_factory('gpu', device_id=self.device_id)
    with closing(gpu_factory()) as t:
        evaluate = t.computation(self.y)
        self.queue.put(evaluate())

        # Keep the transformer open until we are told to exit.
        while True:
            if self.exit.is_set():
                break
            time.sleep(0.1)
def parse_args(self, gen_be=True):
    """Parse the arguments and activate the selected backend's factory.

    After delegating to the parent parser, the transformer factory for
    ``args.backend`` is installed and ``args.progress_bar`` is derived from
    ``args.no_progress_bar``.

    :param gen_be: not used in this method body.
    :return: the parsed arguments namespace, with ``progress_bar`` added.
    """
    parsed = super(NgraphArgparser, self).parse_args()
    ngt.set_transformer_factory(ngt.make_transformer_factory(parsed.backend))

    # invert no_progress_bar meaning and store in args.progress_bar
    parsed.progress_bar = not parsed.no_progress_bar

    return parsed
def transformer_factory(request):
    """Yield the factory for the transformer chosen with ``--transformer``.

    The factory is built and installed as the active one before being
    yielded.  Once the generator is resumed, the active factory is set back
    to "cpu".
    """
    chosen = request.config.getoption("--transformer")
    chosen_factory = ngt.make_transformer_factory(chosen)
    ngt.set_transformer_factory(chosen_factory)
    yield chosen_factory

    # Reset transformer factory to default
    default_factory = ngt.make_transformer_factory("cpu")
    ngt.set_transformer_factory(default_factory)
def test_broadcast_scalar(hetr_device, config):
    """Check that ``x + y`` on scalar placeholders gives 3 for inputs 1 and 2.

    The sum is built under metadata with ``config['device_id']`` and
    ``parallel=ax_A``; the test is skipped for the 'gpu' device.
    """
    if hetr_device == 'gpu':
        pytest.skip('gpu communication broadcast op is not supported.')

    x = ng.placeholder(())
    y = ng.placeholder(())
    with ng.metadata(device_id=config['device_id'], parallel=ax_A):
        x_plus_y = x + y

    factory = ngt.make_transformer_factory('hetr', device=hetr_device)
    with closing(factory()) as transformer:
        add = transformer.computation(x_plus_y, x, y)
        np.testing.assert_array_equal(add(1, 2), 3)
def test_multiple_gather_ops(config):
    """Check two outputs computed under the same parallel metadata block.

    ``x + 1`` and ``(x + 1) + 2`` are requested from one hetr computation and
    compared with ``config['result_one']`` and ``config['result_two']``.
    """
    c = config

    height = ng.make_axis(length=2, name='height')
    width = ng.make_axis(length=4, name='width')
    x = ng.placeholder(axes=[height, width])

    with ng.metadata(device_id=c['device_id'], parallel=width):
        x_plus_one = x + 1
        x_plus_two = x_plus_one + 2

    hetr_factory = ngt.make_transformer_factory('hetr')
    with closing(hetr_factory()) as hetr:
        plus = hetr.computation([x_plus_two, x_plus_one], x)
        # Results come back in the order the ops were requested
        result_two, result_one = plus(c['input'])

        np.testing.assert_array_equal(result_two, c['result_two'])
        np.testing.assert_array_equal(result_one, c['result_one'])
def test_distributed_plus_one(hetr_device, config):
    """Check ``x + 1`` computed under parallel metadata against NumPy.

    Axes, device ids and the parallel axis all come from ``config``.
    """
    axes = config['axes']

    with ng.metadata(device=hetr_device):
        x = ng.placeholder(axes=axes)
        with ng.metadata(device_id=config['device_id'],
                         parallel=config['parallel_axis']):
            x_plus_one = x + 1

    np_x = np.random.randint(100, size=axes.lengths)
    factory = ngt.make_transformer_factory('hetr', device=hetr_device)
    with closing(factory()) as transformer:
        plus_one = transformer.computation(x_plus_one, x)
        res = plus_one(np_x)
        np.testing.assert_array_equal(res, np_x + 1)
def test_allreduce_hint_cpu(config):
    """Check ``var_A - var_B`` when ``var_B`` carries a reduce function.

    ``var_B`` is initialised to ``config['input']`` and tagged with
    ``config['func']``; the result must equal ``config['expected_result']``.
    """
    c = config
    parallel_axis = ng.make_axis(name='axis_parallel', length=16)

    with ng.metadata(device_id=c['device_id'], parallel=parallel_axis):
        axis_A = ng.make_axis(length=4, name='axis_A')
        axis_B = ng.make_axis(length=2, name='axis_B')
        var_A = ng.variable(
            axes=[axis_A],
            initial_value=UniformInit(1, 1),
        ).named('var_A')
        var_B = ng.variable(
            initial_value=UniformInit(c['input'], c['input']),
            axes=[axis_B],
        ).named('var_B')
        var_B.metadata['reduce_func'] = c['func']
        var_minus = (var_A - var_B).named('var_minus')

    hetr_factory = ngt.make_transformer_factory('hetr')
    with closing(hetr_factory()) as hetr:
        out_comp = hetr.computation([var_minus]).named('out_comp')
        result = out_comp()
        np.testing.assert_array_equal(result, c['expected_result'])
def test_to_and_from_device(hetr_device, config):
    """Check ``x + 1`` built under device_id '1' and ``(x + 1) * 2`` built outside it.

    When ``config['axes']`` is falsy, a scalar placeholder and a random
    float input are used instead of a tensor.
    """
    axes = config['axes']

    with ng.metadata(device=hetr_device):
        if axes:
            x = ng.placeholder(axes=axes)
        else:
            x = ng.placeholder(())
        with ng.metadata(device_id='1'):
            x_plus_one = x + 1
        # Built outside the device_id='1' block
        x_plus_two = x_plus_one * 2

    if axes:
        np_x = np.random.randint(100, size=axes.lengths)
    else:
        np_x = random.random()

    factory = ngt.make_transformer_factory('hetr', device=hetr_device)
    with closing(factory()) as transformer:
        computation = transformer.computation([x_plus_one, x_plus_two], x)
        res = computation(np_x)
        np.testing.assert_allclose(res[0], np_x + 1.0)
        np.testing.assert_allclose(res[1], (np_x + 1.0) * 2.0)
def check_result_values(input_vector, result_expected, placeholder, ops=OrderedSet(), *args):
    """
    Check the values returned by a hetr computation against the expected
    values, including the order in which they are returned.

    The hetr transformer created here is always closed, even when the
    computation or one of the assertions fails.

    :param: input_vector: list of the different values to be passed to the
            placeholder; a tuple entry is unpacked into several arguments
    :param: result_expected: list of tuples specifying the expected result
            values from the hetr computation object
    :param: placeholder: placeholder, or tuple of placeholders, to be passed
            to the hetr computation
    :param: ops: list of result handlers to be passed to the hetr computation
    :param: args: accepted but not used
    """
    # Select the transformer
    transformer = ngt.make_transformer_factory('hetr')()
    try:
        # Build the hetr computation object
        if isinstance(placeholder, tuple):
            computation = transformer.computation(ops, *placeholder)
        else:
            computation = transformer.computation(ops, placeholder)

        # Evaluate the computation once per input entry
        result_obtained = [
            computation(*i) if isinstance(i, tuple) else computation(i)
            for i in input_vector
        ]

        if len(result_expected) > 1:
            # Several expected entries: compare the whole list
            np.testing.assert_array_equal(result_expected, result_obtained)
        else:
            # Single expected entry: compare it with everything obtained
            assert (np.array(tuple(result_obtained)) == np.array(
                result_expected[0])).all()
    finally:
        transformer.close()
def run_benchmark(model_out_comp, transformer_type, feed_dict, n_skip, n_iter):
    """
    Run a computation repeatedly with data from feed_dict and record start
    and stop timestamps (nothing model-specific inside, can be reused).

    ``n_skip`` untimed runs are executed first, followed by ``n_iter`` timed
    runs.  Timestamps are ``time.time()`` values scaled by 1000.

    :return: mapping ``times[i]`` -> ``{'start': ..., 'stop': ...}``
    """
    times = DefaultOrderedDict()
    factory = ngt.make_transformer_factory(transformer_type)
    with closing(factory()) as transformer:
        viz_pass = ngraph.transformers.passes.nviz.VizPass(
            show_axes=True,
            show_all_metadata=False)
        transformer.register_graph_pass(viz_pass)

        model_out_computation = transformer.add_computation(model_out_comp)

        # Untimed runs
        for _ in range(n_skip):
            model_out_computation(feed_dict=feed_dict)

        # Timed runs
        for run_index in range(n_iter):
            times[run_index]['start'] = time.time() * 1000.0
            model_out_computation(feed_dict=feed_dict)
            times[run_index]['stop'] = time.time() * 1000.0

    return times
def test_comm_broadcast_op(hetr_device):
    """Check ``dot(x, w)`` where ``w`` is defined outside the parallel block.

    The dot is built under parallel metadata for devices '0' and '1' and
    compared with ``np.dot``; the test is skipped for the 'gpu' device.
    """
    if hetr_device == 'gpu':
        pytest.skip('gpu communication broadcast op is not supported.')

    height = ng.make_axis(length=4, name='height')
    batch = ng.make_axis(length=8, name='batch')
    weight = ng.make_axis(length=2, name='weight')

    x = ng.placeholder(axes=[batch, height])
    # w will be broadcasted to devices
    w = ng.placeholder(axes=[height, weight])
    with ng.metadata(device=hetr_device, device_id=('0', '1'), parallel=batch):
        dot = ng.dot(x, w)

    np_x = np.random.randint(100, size=[batch.length, height.length])
    np_weight = np.random.randint(100, size=[height.length, weight.length])

    factory = ngt.make_transformer_factory('hetr', device=hetr_device)
    with closing(factory()) as transformer:
        computation = transformer.computation(dot, x, w)
        res = computation(np_x, np_weight)
        np.testing.assert_array_equal(res, np.dot(np_x, np_weight))
def check_communication_pass(ops_to_transform, expected_recv_nodes):
    """
    The communication pass should insert send/recv nodes wherever
    the metadata[transformer] differs between nodes.
    This checks that the recv nodes are inserted in the right place, and counts
    that the expected number of send nodes are found.

    The hetr transformer created here is always closed, even when the pass
    or one of the assertions fails.

    :param ops_to_transform: list of ops to do the graph traversal
    :param expected_recv_nodes: list of ops where receive nodes are expected
           to be inserted after the communication pass
    """
    transformer = ngt.make_transformer_factory('hetr')()
    try:
        send_nodes = OrderedSet()
        scatter_shared_queues = list()
        gather_shared_queues = list()
        obj = CommunicationPass(send_nodes, scatter_shared_queues, gather_shared_queues)
        obj.do_pass(ops_to_transform, transformer)

        # Count if the communication pass inserted the expected number of send nodes
        num_expected_sendnodes = len(expected_recv_nodes)
        assert num_expected_sendnodes == len(send_nodes)

        # verify if Recv nodes are inserted in the right place
        for op in expected_recv_nodes:
            arg_types = [type(each_arg) for each_arg in op.args]
            comm = ng.op_graph.communication
            # At least one argument must be one of the receive op types
            assert (comm.Recv in arg_types or
                    comm.Gather_Recv in arg_types or
                    comm.Scatter_Recv in arg_types), \
                "no recv node found among the args of {}".format(op)
    finally:
        transformer.close()
def test_distributed_dot(hetr_device, config):
    """Check ``dot(x, w)`` with an all-ones weight variable against NumPy.

    Axes, device ids and the parallel axis all come from ``config``; the
    test is marked xfail for the 'gpu' device.
    """
    if hetr_device == 'gpu':
        pytest.xfail("Intermittent failure on jenkins for mgpu")

    axes_x = config['axes_x']
    axes_w = config['axes_w']
    np_weight = np.ones(axes_w.lengths)

    with ng.metadata(device=hetr_device):
        x = ng.placeholder(axes=axes_x)
        with ng.metadata(device_id=config['device_id'],
                         parallel=config['parallel_axis']):
            w = ng.variable(axes=axes_w, initial_value=np_weight)
            dot = ng.dot(x, w)

    np_x = np.random.randint(100, size=axes_x.lengths)
    factory = ngt.make_transformer_factory('hetr', device=hetr_device)
    with closing(factory()) as transformer:
        computation = transformer.computation(dot, x)
        res = computation(np_x)
        np.testing.assert_array_equal(res, np.dot(np_x, np_weight))
def test_multi_computations(hetr_device):
    """Check two computations created on the same hetr transformer.

    One computes ``y - mean(x**2)`` and the other ``x**2``; both are
    compared against NumPy.  The test is marked xfail for the 'gpu' device.
    """
    if hetr_device == 'gpu':
        pytest.xfail("enable after gpu exgraph")

    axes_x = ng.make_axes([ax_A, ax_B])
    x = ng.placeholder(axes=axes_x)
    y = ng.placeholder(())
    with ng.metadata(device_id=('0', '1'), parallel=ax_A):
        squared = x ** 2
        out = y - ng.mean(squared, out_axes=())

    np_x = np.random.randint(10, size=axes_x.lengths)
    np_y = np.random.randint(10)

    factory = ngt.make_transformer_factory('hetr', device=hetr_device)
    with closing(factory()) as t:
        # Both computations are created before either one is evaluated
        comp = t.computation(out, x, y)
        another_comp = t.computation(squared, x)

        res_comp = comp(np_x, np_y)
        res_another_comp = another_comp(np_x)

        ref_comp = np_y - np.mean(np_x ** 2)
        np.testing.assert_array_equal(res_comp, ref_comp)
        np.testing.assert_array_equal(res_another_comp, np_x ** 2)
def time(self, n_iterations, n_skip, computation_name, feed_dict):
    """
    Run ``self.computation`` repeatedly with data from feed_dict and record
    the time of each iteration (nothing model-specific inside, can be reused).

    The first ``n_skip`` iterations are executed but not timed; iterations
    ``n_skip`` .. ``n_iterations - 1`` are timed with ``Benchmark.marker``.
    The ``device`` keyword is passed to the factory only for the 'hetr'
    transformer.

    :return: mapping ``times[computation_name][iteration]`` -> measured time
    """
    times = DefaultOrderedDict()
    start = Benchmark.marker.init_mark()
    end = Benchmark.marker.init_mark()

    factory_kwargs = {'device': self.device} if self.transformer == 'hetr' else {}
    factory = ngt.make_transformer_factory(self.transformer, **factory_kwargs)

    with closing(factory()) as transformer:
        model_out_computation = transformer.add_computation(self.computation)

        # Untimed iterations
        for _ in range(n_skip):
            model_out_computation(feed_dict=feed_dict)

        # Timed iterations, indexed by their absolute iteration number
        for iteration in range(n_skip, n_iterations):
            Benchmark.marker.record_mark(start)
            model_out_computation(feed_dict=feed_dict)
            Benchmark.marker.record_mark(end)
            times[computation_name][iteration] = Benchmark.marker.get_time(start, end)

    return times
def check_device_assign_pass(default_device, default_device_id,
                             graph_op_metadata, graph_op=OrderedSet(), *args):
    """
    The Device assign pass should inject the metadata{device_id, device} as
    specified by the user for each op; if not specified, the given defaults
    should be inserted for each op.

    For every op in ``graph_op_metadata`` this checks the 'device',
    'device_id' and 'transformer' metadata, and finally that the set of
    transformers collected by the pass matches the ones seen on the ops.
    The hetr transformer created here is always closed, even when the pass
    or one of the assertions fails.

    :param: default_device: string, the default device for each op,
            if not specified by user ex: "numpy"
    :param: default_device_id: string, the default device number for each op,
            if not specified by user ex: "0"
    :param: graph_op_metadata: dict, dictionary of list specifying the expected
            metadata [device, device_id] for each op
    :param: graph_op: list of ops to do the graph traversal
    :param: args: accepted but not used
    """
    transformer = ngt.make_transformer_factory('hetr')()
    try:
        transformers = set()
        expected_transformers = set()
        obj = DeviceAssignPass(default_device, default_device_id, transformers)

        obj.do_pass(graph_op, transformer)

        for op, expected in graph_op_metadata.items():
            expected_device = expected[0]
            expected_device_id = expected[1]
            assert op.metadata['device'] == expected_device
            assert op.metadata['device_id'] == expected_device_id
            # The transformer name is the device followed by the device id
            assert op.metadata['transformer'] == \
                expected_device + str(expected_device_id)

            expected_transformers.add(op.metadata['transformer'])
        assert transformers == expected_transformers
    finally:
        transformer.close()
def test_multiple_gather_ops(hetr_device):
    """Check two outputs computed under the same parallel metadata block.

    ``x + 1`` and ``(x + 1) * 2`` are requested from one hetr computation and
    compared against NumPy.  For the 'gpu' device the test is skipped when
    no GPU transformer is available and marked xfail otherwise.
    """
    if hetr_device == 'gpu':
        if 'gpu' not in ngt.transformer_choices():
            pytest.skip("GPUTransformer not available")
        pytest.xfail(
            "Failure due to gather recv tensor being returned in wrong shape, "
            " possible mismatch between op layout and op.tensor layout")

    height = ng.make_axis(length=2, name='height')
    width = ng.make_axis(length=4, name='width')
    x = ng.placeholder(axes=[height, width])

    with ng.metadata(device_id=('0', '1'), parallel=width):
        x_plus_one = x + 1
        x_mul_two = x_plus_one * 2

    np_input = np.random.randint(100, size=x.axes.lengths)
    factory = ngt.make_transformer_factory('hetr', device=hetr_device)
    with closing(factory()) as hetr:
        plus = hetr.computation([x_mul_two, x_plus_one], x)
        # Results come back in the order the ops were requested
        result_mul_two, result_plus_one = plus(np_input)

        np.testing.assert_array_equal(result_plus_one, np_input + 1)
        np.testing.assert_array_equal(result_mul_two, (np_input + 1) * 2)