import mxnet as mx
import numpy as np


def test_maximum_minimum_scalar():
    data1 = mx.symbol.Variable('data')
    shape = (3, 4)
    data_tmp1 = np.random.rand(3, 4)
    data_tmp1[:] = 2
    arr_data1 = mx.nd.array(data_tmp1)
    arr_grad1 = mx.nd.empty(shape)

    test = (mx.sym.maximum(data1, 3) + mx.sym.maximum(9, data1)
            + mx.sym.minimum(5, data1) + mx.sym.minimum(data1, 4))
    exe_test = test.bind(mx.cpu(), args=[arr_data1], args_grad=[arr_grad1])
    exe_test.forward()
    out = exe_test.outputs[0].asnumpy()
    npout = (np.maximum(data_tmp1, 3) + np.maximum(9, data_tmp1)
             + np.minimum(5, data_tmp1) + np.minimum(data_tmp1, 4))
    assert reldiff(out, npout) < 1e-6

    out_grad = mx.nd.empty(shape)
    out_grad[:] = 2
    exe_test.backward(out_grad)

    npout_grad = np.ones(shape)
    npout_grad[:] = 2
    mask1 = (data_tmp1 > 3).astype('float')
    mask2 = (9 > data_tmp1).astype('float')
    mask3 = (5 < data_tmp1).astype('float')
    mask4 = (data_tmp1 < 4).astype('float')
    npout_grad1 = (npout_grad * mask1 + (npout_grad - npout_grad * mask2)
                   + (npout_grad - npout_grad * mask3) + npout_grad * mask4)
    assert reldiff(arr_grad1.asnumpy(), npout_grad1) < 1e-6

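# reldiff() and same() are comparison helpers defined elsewhere in this test
# module and are not shown in this section. A minimal sketch consistent with
# how the tests below use them (relative L1 error and exact element-wise
# equality) would be:
def reldiff(a, b):
    diff = np.sum(np.abs(a - b))
    norm = np.sum(np.abs(a))
    if diff == 0:
        return 0
    return diff / norm


def same(a, b):
    return np.array_equal(a, b)
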
def test_rsqrt_cos_sin():
    data = mx.symbol.Variable("data")
    shape = (3, 4)
    data_tmp = np.ones(shape)
    data_tmp[:] = 5
    arr_data = mx.nd.array(data_tmp)
    arr_grad = mx.nd.empty(shape)
    arr_grad[:] = 3

    test = mx.sym.rsqrt(data) + mx.sym.cos(data) + mx.sym.sin(data)
    exe_test = test.bind(mx.cpu(), args=[arr_data], args_grad=[arr_grad])
    exe_test.forward()
    out = exe_test.outputs[0].asnumpy()
    npout = 1 / np.sqrt(data_tmp) + np.cos(data_tmp) + np.sin(data_tmp)
    assert reldiff(out, npout) < 1e-6

    out_grad = mx.nd.empty(shape)
    out_grad[:] = 2
    npout_grad = out_grad.asnumpy()
    npout_grad = (npout_grad * -(1.0 / (2.0 * data_tmp * np.sqrt(data_tmp)))
                  + npout_grad * -1 * np.sin(data_tmp)
                  + npout_grad * np.cos(data_tmp))
    exe_test.backward(out_grad)
    assert reldiff(arr_grad.asnumpy(), npout_grad) < 1e-6

def test_embedding():
    in_dim = 10
    out_dim = 4
    batch = 24

    data = mx.sym.Variable("data")
    embed = mx.sym.Embedding(data=data, input_dim=in_dim, output_dim=out_dim, name="embed")
    exe_test = embed.simple_bind(mx.cpu(), data=(batch,))
    arg_map = dict(zip(embed.list_arguments(), exe_test.arg_arrays))
    grad_map = dict(zip(embed.list_arguments(), exe_test.grad_arrays))
    np_data = np.random.randint(low=0, high=in_dim, size=batch)
    np_weight = np.random.uniform(-0.01, 0.01, arg_map["embed_weight"].shape)
    np_onehot = np.zeros((batch, in_dim))
    np_onehot[np.arange(batch), np_data] = 1.0
    # forward
    arg_map["data"][:] = np_data
    arg_map["embed_weight"][:] = np_weight
    exe_test.forward()
    assert reldiff(exe_test.outputs[0].asnumpy(), np.dot(np_onehot, np_weight)) < 1e-6
    # backward
    np_grad = np.random.uniform(-1, 1, exe_test.outputs[0].shape)
    grad = mx.nd.zeros(np_grad.shape)
    grad[:] = np_grad
    exe_test.backward([grad])
    assert reldiff(grad_map["embed_weight"].asnumpy(), np.dot(np_onehot.T, np_grad)) < 1e-6

def test_maximum_minimum():
    data1 = mx.symbol.Variable("data1")
    data2 = mx.symbol.Variable("data2")
    shape = (3, 4)
    data_tmp1 = np.random.rand(3, 4)
    data_tmp2 = np.random.rand(3, 4)
    data_tmp1[:] = 2
    data_tmp2[:] = 3

    arr_data1 = mx.nd.array(data_tmp1)
    arr_data2 = mx.nd.array(data_tmp2)
    arr_grad1 = mx.nd.empty(shape)
    arr_grad2 = mx.nd.empty(shape)

    test = mx.sym.maximum(data1, data2) + mx.sym.minimum(data1, data2)
    exe_test = test.bind(mx.cpu(), args=[arr_data1, arr_data2], args_grad=[arr_grad1, arr_grad2])
    exe_test.forward()
    out = exe_test.outputs[0].asnumpy()
    npout = np.maximum(data_tmp1, data_tmp2) + np.minimum(data_tmp1, data_tmp2)
    assert reldiff(out, npout) < 1e-6

    out_grad = mx.nd.empty(shape)
    out_grad[:] = 2
    exe_test.backward(out_grad)

    npout_grad = np.ones(shape)
    npout_grad[:] = 2
    mask1 = (data_tmp1 > data_tmp2).astype("float")
    mask2 = (data_tmp1 < data_tmp2).astype("float")
    npout_grad1 = npout_grad * mask1 + npout_grad * mask2
    npout_grad2 = (npout_grad - npout_grad * mask1) + (npout_grad - npout_grad * mask2)
    assert reldiff(arr_grad1.asnumpy(), npout_grad1) < 1e-6
    assert reldiff(arr_grad2.asnumpy(), npout_grad2) < 1e-6

def check_slice_channel(dim, num):
    if dim == 2:
        shape = (2, 2)
    else:
        shape = (2, 2, 2, 3)
    ins = [np.ones(shape) * i for i in range(num)]
    e = np.hstack(ins)

    e_nd = mx.nd.empty(e.shape)
    e_nd[:] = e
    data = mx.sym.Variable('data')
    op = mx.sym.SliceChannel(data=data, num_outputs=num)
    arg_shape, output_shape, aux_shape = op.infer_shape(data=e_nd.shape)
    grad_nd = [mx.nd.empty(shape) for shape in arg_shape]

    exe = op.bind(mx.cpu(), args=[e_nd], args_grad=grad_nd)
    assert len(exe.outputs) == num
    o_nd = [exe.outputs[i] for i in range(num)]
    # test forward
    exe.forward()
    for i in range(num):
        assert reldiff(o_nd[i].asnumpy(), ins[i]) < 1e-5
    # test backward
    for i in range(num):
        o_nd[i] += i
    exe.backward(o_nd)
    assert reldiff(grad_nd[0].asnumpy(), np.hstack([ins[i] + i for i in range(num)])) < 1e-5

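# A plausible driver for the helper above. The dims and output counts are
# assumptions; they only need to exercise the 2-D and 4-D branches inside
# check_slice_channel:
def test_slice_channel():
    check_slice_channel(2, 4)
    check_slice_channel(4, 4)
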
def test_python_op():
    X = mx.symbol.Variable('X')
    op = mx.operator.NumpyOp()
    s = op.get_symbol(X, name='numpy_op')

    x = mx.ndarray.ones((10,)) * 10
    dx = mx.ndarray.zeros((10,))
    dy = mx.ndarray.ones((10,))
    exec1 = s.bind(mx.cpu(), args=[x], args_grad={'X': dx})
    exec1.forward()
    assert reldiff(x.asnumpy(), exec1.outputs[0].asnumpy()) < 1e-5
    exec1.backward(dy)
    assert reldiff(dy.asnumpy(), dx.asnumpy()) < 1e-5

def check_softmax_with_ignore_label(xpu):
    X = mx.symbol.Variable('X')
    L = mx.symbol.Variable('L')
    Y = mx.symbol.SoftmaxOutput(data=X, label=L, ignore_label=0, use_ignore=True)

    shape = (20, 10)
    x = mx.nd.empty(shape, ctx=xpu)
    l = mx.nd.empty((shape[0],), ctx=xpu)
    x_np = np.random.rand(*shape)
    l_np = np.random.randint(0, shape[1] - 1, (shape[0],))
    x[:] = x_np
    l[:] = l_np

    grad = mx.nd.empty(shape, ctx=xpu)
    exec1 = Y.bind(xpu, args=[x, l], args_grad={'X': grad})
    exec1.forward()
    exec1.backward()
    grad0 = grad.asnumpy()

    # Relabel the first half of the batch with the ignored label and rerun.
    for i in range(int(shape[0] / 2)):
        l_np[i] = 0
    l[:] = l_np
    exec1.forward()
    exec1.backward()
    grad1 = grad.asnumpy()

    # Ignored rows must get zero gradient; the remaining rows must be unchanged.
    assert abs(np.sum(grad1[:int(shape[0] / 2)])) < 1e-5
    assert reldiff(grad0[int(shape[0] / 2):], grad1[int(shape[0] / 2):]) < 1e-5

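# A minimal driver for the check above; running it only on the CPU context is
# an assumption (the full suite may also pass a GPU context):
def test_softmax_with_ignore_label():
    check_softmax_with_ignore_label(mx.cpu())
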
def test_broadcasting_ele(sym_bcast):
    # Note: this helper is written to run inside an enclosing test that defines
    # `shape`, `target_shape`, `axis`, and the `_np_reduce` utility.
    dat_npy = np.random.rand(*shape)
    groundtruth = dat_npy
    grad_nd = mx.nd.empty(shape)
    outgrad_npy = np.random.rand(*target_shape)
    grad_groundtruth = _np_reduce(outgrad_npy, axis=axis, keepdims=True,
                                  numpy_reduce_func=np.sum)
    net = sym_bcast.bind(mx.cpu(), args={'a': mx.nd.array(dat_npy)},
                         args_grad={'a': grad_nd})
    net.forward(is_train=True)
    assert (net.outputs[0].shape == target_shape).all()
    err_forward = reldiff(net.outputs[0].asnumpy(), groundtruth)
    assert err_forward < 1E-4
    net.backward(out_grads=mx.nd.array(outgrad_npy))
    err_backward = reldiff(grad_nd.asnumpy(), grad_groundtruth)
    assert err_backward < 1E-4

def check_deconvolution_forward_backward(input_shape, num_filter, kernel, stride, pad):
    """Configure A: input --> conv --> deconv --> output.

    The convolution and deconvolution use matching parameters, chosen so that
    the output shape equals the input shape, and the two layers share the same
    weights. If forward() and backward() are fed the same values, their outputs
    should also be the same.
    """
    assert input_shape[1] == num_filter
    data = mx.sym.Variable(name="data")
    conv = mx.sym.Convolution(
        data=data, kernel=kernel, stride=stride, pad=pad,
        num_filter=num_filter, no_bias="true", name="conv")
    deconv = mx.sym.Deconvolution(
        data=conv, kernel=kernel, stride=stride, pad=pad,
        num_filter=num_filter, no_bias="true", name="deconv")

    arg_names = deconv.list_arguments()
    arg_shapes, out_shapes, _ = deconv.infer_shape(data=input_shape)
    input_data = mx.random.uniform(-5, 5, input_shape)
    out_grad = input_data
    args = {}
    args["data"] = input_data
    args['conv_weight'] = args['deconv_weight'] = mx.random.normal(
        0, 1, (num_filter, input_shape[1]) + kernel)
    args_grad = [mx.nd.empty(s) for s in arg_shapes]

    exe = deconv.bind(mx.cpu(), args=args, args_grad=args_grad)
    exe.forward()
    out = exe.outputs[0].asnumpy()
    exe.backward(out_grad)
    assert reldiff(out, args_grad[0].asnumpy()) < 1e-6

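# A plausible invocation of the check above. The shapes are assumptions; they
# only need kernel = 2*pad + 1 with stride (1, 1) so that conv/deconv preserve
# the spatial size, and input_shape[1] == num_filter as asserted in the helper:
def test_deconvolution_forward_backward():
    check_deconvolution_forward_backward(
        input_shape=(1, 1, 5, 5), num_filter=1,
        kernel=(3, 3), stride=(1, 1), pad=(1, 1))
    check_deconvolution_forward_backward(
        input_shape=(32, 3, 28, 28), num_filter=3,
        kernel=(3, 3), stride=(1, 1), pad=(1, 1))
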
def test_binary_op_duplicate_input():
    data = mx.symbol.Variable('data')
    shape = (3, 4)
    data_tmp = np.ones(shape)
    data_tmp[:] = 5
    arr_data = mx.nd.array(data_tmp)
    arr_grad = mx.nd.empty(shape)
    arr_grad[:] = 3
    out_grad = mx.nd.empty(shape)
    out_grad[:] = 1

    square = data * data
    exe_square = square.bind(mx.cpu(), args=[arr_data], args_grad=[arr_grad])
    exe_square.forward()
    assert reldiff(exe_square.outputs[0].asnumpy(), data_tmp * data_tmp) < 1e-6
    exe_square.backward(out_grad)
    assert reldiff(arr_grad.asnumpy(), 2.0 * data_tmp) < 1e-6

def check_regression(symbol, forward, backward):
    data = mx.symbol.Variable("data")
    label = mx.symbol.Variable("label")
    out = symbol(data, label)
    shape = (3, 1)
    arr_data = mx.random.uniform(-1, 1, shape)
    arr_label = mx.random.uniform(0, 1, shape[0])
    arr_grad = mx.nd.empty(shape)
    exec1 = out.bind(mx.cpu(), args=[arr_data, arr_label], args_grad={"data": arr_grad})
    exec1.forward()
    out1 = exec1.outputs[0].asnumpy()
    npout = forward(arr_data.asnumpy())
    assert reldiff(npout, out1) < 1e-6

    exec1.backward()
    npout = backward(npout, arr_label.asnumpy().reshape(npout.shape))
    assert reldiff(npout, arr_grad.asnumpy()) < 1e-6

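# A plausible driver for check_regression(). The forward/backward lambdas are
# assumptions: identity with a (prediction - label) gradient for linear
# regression, and a sigmoid with the same gradient form for logistic regression:
def test_regression():
    check_regression(mx.symbol.LinearRegressionOutput,
                     lambda x: x,
                     lambda x, y: x - y)
    check_regression(mx.symbol.LogisticRegressionOutput,
                     lambda x: 1.0 / (1.0 + np.exp(-x)),
                     lambda x, y: x - y)
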
def test_reduce_inner(numpy_reduce_func, numpy_reduce_grad_func, mx_reduce_sym):
    # `sample_num` is expected to come from the enclosing (or module) scope.
    for i in range(sample_num):
        # Generate random data that has ndim between 1-7 and all the shape dims between 1-10
        ndim = np.random.randint(1, 8)
        shape = np.random.randint(1, 11, size=(ndim,))
        axis_num = np.random.randint(0, ndim, size=1)
        axis_flags = np.random.randint(0, 2, size=ndim)
        axes = []
        for (axis, flag) in enumerate(axis_flags):
            if flag:
                axes.append(axis)
        if 0 == len(axes):
            axes = None
        elif 1 == len(axes):
            axes = axes[0]
        else:
            axes = tuple(axes)
        keepdims = np.random.randint(0, 2)

        a = mx.symbol.Variable('a')
        if axes is None:
            b = mx_reduce_sym(a, keepdims=keepdims)
        else:
            b = mx_reduce_sym(a, axis=axes, keepdims=keepdims)
        dat_npy = np.random.rand(*shape)
        sum_groundtruth = np.array(numpy_reduce_func(dat_npy, axis=axes, keepdims=keepdims))
        if sum_groundtruth.shape == ():
            sum_groundtruth = np.array([sum_groundtruth])
        grad_nd = mx.nd.empty(shape)
        outgrad_npy = np.array(np.random.rand(*sum_groundtruth.shape))
        grad_groundtruth = numpy_reduce_grad_func(outgrad=outgrad_npy, data=dat_npy,
                                                  axis=axes, keepdims=keepdims)
        net = b.bind(mx.cpu(), args={'a': mx.nd.array(dat_npy)},
                     args_grad={'a': grad_nd})
        net.forward(is_train=True)

        err_forward = reldiff(net.outputs[0].asnumpy(), sum_groundtruth)
        assert err_forward < 1E-4
        net.backward(out_grads=mx.nd.array(outgrad_npy))
        err_backward = reldiff(grad_nd.asnumpy(), grad_groundtruth)
        assert err_backward < 1E-4

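# A minimal sketch of how the generic reducer check above could be driven for a
# sum() reduction. mx.symbol.sum as the symbolic reducer and the module-level
# sample_num value below are assumptions; the gradient of sum() is simply the
# output gradient broadcast back over the reduced axes:
sample_num = 50


def numpy_sum_grad(outgrad, data, axis, keepdims):
    # Reshape the output gradient to the keepdims=True result shape, then
    # broadcast it back to the input shape.
    keep_shape = np.sum(data, axis=axis, keepdims=True).shape
    return np.broadcast_to(outgrad.reshape(keep_shape), data.shape)


def test_sum_reduce():
    test_reduce_inner(np.sum, numpy_sum_grad, mx.symbol.sum)
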
def check_deconvolution_gradient(input_shape, num_filter, pad):
    """Configure A: input --> conv --> output.
    Configure B: input --> deconv --> output.

    The convolution and deconvolution use matching parameters, so the output
    shape equals the input shape. During backward(), if the input of A equals
    the output of B and the output of A equals the input of B, then the weight
    gradients should be the same.
    """
    stride = (1, 1)
    kernel = (2 * pad[0] + 1, 2 * pad[1] + 1)
    data_conv = mx.sym.Variable(name="data_conv")
    conv = mx.sym.Convolution(
        data=data_conv, kernel=kernel, stride=stride, pad=pad,
        num_filter=num_filter, no_bias="true", name="conv")
    data_deconv = mx.sym.Variable(name="data_deconv")
    deconv = mx.sym.Deconvolution(
        data=data_deconv, kernel=kernel, stride=stride, pad=pad,
        num_filter=num_filter, no_bias="true", name="deconv")

    conv_data = mx.random.uniform(-5, 5, input_shape)
    conv_args = {}
    conv_args["data_conv"] = conv_data
    conv_args['conv_weight'] = mx.random.normal(0, 1, (num_filter, input_shape[1]) + kernel)
    conv_args_grad = [mx.nd.zeros(conv_data.shape),
                      mx.nd.zeros((num_filter, input_shape[1]) + kernel)]
    exe_conv = conv.bind(mx.cpu(), args=conv_args, args_grad=conv_args_grad)
    conv_out_grad = mx.random.normal(0, 2, exe_conv.outputs[0].shape)
    exe_conv.backward(conv_out_grad)

    deconv_data = conv_out_grad
    deconv_args = {}
    deconv_args['data_deconv'] = deconv_data
    deconv_args['deconv_weight'] = conv_args['conv_weight']
    deconv_args_grad = [mx.nd.zeros(deconv_data.shape),
                        mx.nd.zeros((num_filter, input_shape[1]) + kernel)]
    exe_deconv = deconv.bind(mx.cpu(), args=deconv_args, args_grad=deconv_args_grad)
    deconv_out_grad = conv_data[:]
    exe_deconv.backward(deconv_out_grad)
    assert reldiff(conv_args_grad[1].asnumpy(), deconv_args_grad[1].asnumpy()) < 1e-6

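# A plausible invocation of the gradient check above; the shapes and filter
# counts are assumptions (the kernel is derived from pad inside the helper):
def test_deconvolution_gradient():
    check_deconvolution_gradient(input_shape=(1, 3, 5, 5), num_filter=3, pad=(1, 1))
    check_deconvolution_gradient(input_shape=(5, 3, 100, 100), num_filter=3, pad=(3, 3))
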
def test_stn():
    num_filter = 2  # conv of loc net
    kernel = (3, 3)  # conv of loc net
    num_hidden = 6  # fc of loc net
    for n in [1, 2, 3, 4]:
        for c in [1, 2, 3, 4]:
            for h in [5, 9, 13, 17]:  # for convenience, the third and fourth input dims should be 4x + 1
                for w in [5, 9, 13, 17]:
                    data_shape = (n, c, h, w)
                    target_shape = (int((data_shape[2] + 1) / 2), int((data_shape[3] + 1) / 2))

                    data = mx.sym.Variable(name="data")
                    loc = mx.sym.Convolution(data=data, kernel=kernel, pad=(1, 1),
                                             num_filter=num_filter, name="loc_conv")
                    loc = mx.sym.Flatten(data=loc)
                    loc = mx.sym.FullyConnected(data=loc, num_hidden=num_hidden, name="loc_fc")
                    stn = mx.sym.SpatialTransformer(data=data, loc=loc, target_shape=target_shape,
                                                    transform_type="affine", sampler_type="bilinear")
                    arg_names = stn.list_arguments()
                    arg_shapes, out_shapes, _ = stn.infer_shape(data=data_shape)
                    # check shape
                    assert out_shapes[0] == (data_shape[0], data_shape[1], target_shape[0], target_shape[1])

                    dev = mx.cpu()
                    #dev = mx.gpu(0)
                    args = {}
                    args['data'] = mx.random.normal(0, 1, data_shape, dev)
                    args['loc_conv_weight'] = mx.nd.zeros((num_filter, data_shape[1], kernel[0], kernel[1]), ctx=dev)
                    args['loc_conv_bias'] = mx.nd.zeros((num_filter,), ctx=dev)
                    args['loc_fc_weight'] = mx.nd.zeros((6, num_filter * data_shape[2] * data_shape[3]), ctx=dev)
                    args['loc_fc_bias'] = mx.nd.array([0.5, 0, 0, 0, 0.5, 0], ctx=dev)
                    grad_grad = [mx.nd.zeros(shape, ctx=dev) for shape in arg_shapes]

                    exe = stn.bind(dev, args=args, args_grad=grad_grad)
                    exe.forward(is_train=True)
                    out = exe.outputs[0].asnumpy()
                    # check forward
                    assert reldiff(out, args['data'].asnumpy()[:, :, h//4:h-h//4, w//4:w-w//4]) < 1e-6
                    out_grad = mx.nd.ones(out.shape, ctx=dev)
                    exe.backward([out_grad])
                    # check backward
                    assert reldiff(out_grad.asnumpy(), grad_grad[0].asnumpy()[:, :, h//4:h-h//4, w//4:w-w//4]) < 1e-6

def test_abs():
    data = mx.symbol.Variable('data')
    shape = (3, 4)
    data_tmp = np.ones(shape)
    data_tmp[:] = 5
    arr_data = mx.nd.array(data_tmp)
    arr_grad = mx.nd.empty(shape)
    arr_grad[:] = 3

    test = mx.sym.abs(data)
    exe_test = test.bind(mx.cpu(), args=[arr_data], args_grad=[arr_grad])
    exe_test.forward()
    out = exe_test.outputs[0].asnumpy()
    npout = abs(data_tmp)
    assert reldiff(out, npout) < 1e-6

    out_grad = mx.nd.empty(shape)
    out_grad[:] = 2
    npout_grad = out_grad.asnumpy()
    npout_grad = npout_grad * np.sign(data_tmp)
    exe_test.backward(out_grad)
    assert reldiff(arr_grad.asnumpy(), npout_grad) < 1e-6

def test_round_ceil_floor():
    data = mx.symbol.Variable('data')
    shape = (3, 4)
    data_tmp = np.ones(shape)
    data_tmp[:] = 5.543
    arr_data = mx.nd.array(data_tmp)
    arr_grad = mx.nd.empty(shape)
    arr_grad[:] = 2

    test = mx.sym.round(data) + mx.sym.ceil(data) + mx.sym.floor(data)
    exe_test = test.bind(mx.cpu(), args=[arr_data])
    exe_test.forward()
    out = exe_test.outputs[0].asnumpy()
    npout = np.round(data_tmp) + np.ceil(data_tmp) + np.floor(data_tmp)
    assert reldiff(out, npout) < 1e-6

def test_swapaxes():
    data = mx.symbol.Variable('data')
    shape = (2, 3, 4)
    data_tmp = np.ones(shape)
    data_tmp[0] = 1
    data_tmp[1] = 2
    arr_data = mx.nd.array(data_tmp)
    swap0 = mx.symbol.SwapAxis(data=data, dim1=0, dim2=2)
    swap = mx.symbol.SwapAxis(data=swap0, dim1=1, dim2=2)
    exe_c = swap.bind(mx.cpu(), args=[arr_data])
    exe_c.forward()
    out = exe_c.outputs[0].asnumpy()

    swap0_ = np.swapaxes(data_tmp, 0, 2)
    swap_ = np.swapaxes(swap0_, 1, 2)
    assert reldiff(out, swap_) < 1e-6

def check_elementwise_sum_with_shape(shape, n):
    # forward
    inputs = [mx.symbol.Variable('arg%d' % i) for i in range(n)]
    out = mx.symbol.ElementWiseSum(*inputs, name='esum')
    arr = [mx.nd.empty(shape) for i in range(n)]
    arr_grad = [mx.nd.empty(shape) for i in range(n)]
    for i in range(n):
        arr[i][:] = np.random.uniform(-10, 10, shape)
    exec1 = out.bind(mx.Context('cpu'), args=arr, args_grad=arr_grad)
    exec1.forward()
    out1 = exec1.outputs[0].asnumpy()
    out = sum(a.asnumpy() for a in arr)
    assert reldiff(out, out1) < 1e-6

    out_grad = mx.nd.empty(shape)
    out_grad[:] = np.random.uniform(-10, 10, shape)
    # backward
    exec1.backward([out_grad])
    for a in arr_grad:
        assert same(a.asnumpy(), out_grad.asnumpy())

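# A plausible driver for the helper above; the shapes and input counts are
# assumptions:
def test_elementwise_sum():
    check_elementwise_sum_with_shape((5, 5, 3), 4)
    check_elementwise_sum_with_shape((1, 2), 2)
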