def _graph_dict():
    '''
    Build a small throw-away graph and return its nodes in a dict.

    The graph has no real meaning; it only provides something to traverse.
    The returned dict maps a label to each node that was created. Note that
    the label used as dict key is not always the name given to the node:
    ``p2`` is created without a name, ``op3a`` and ``op3b`` are both named
    ``'op3'``, and ``first`` is named ``'past'``. ``root`` is an alias for
    the ``first`` entry.

    Returns:
        dict: label -> graph node
    '''
    dynamic_axes = [Axis.default_batch_axis(), Axis('ia')]

    # Leaves: one input, one constant, one named parameter.
    nodes = {
        'i1': input_variable(shape=(2, 3),
                             dynamic_axes=dynamic_axes,
                             name='i1'),
        'c1': constant(shape=(2, 3), value=6, name='c1'),
        'p1': parameter(shape=(3, 2), init=7, name='p1'),
    }

    nodes['op1'] = plus(nodes['i1'], nodes['c1'], name='op1')
    nodes['op2'] = times(nodes['op1'], nodes['p1'], name='op2')

    # A parameter that deliberately carries no name.
    nodes['p2'] = parameter(shape=(2, 2))

    # Two distinct nodes that deliberately share the same name.
    nodes['op3a'] = plus(nodes['op2'], nodes['p2'], name='op3')
    nodes['op3b'] = plus(nodes['op3a'], nodes['p2'], name='op3')

    nodes['first'] = sequence.first(nodes['op3b'], name='past')
    nodes['root'] = nodes['first']

    return nodes
def _test_op_slice_sequence(input_data, slice_params, expected_result, device_id, precision):
    '''
    Run ``C.sequence.slice`` on one sequence and check forward and backward
    values through ``unittest_helper``.

    Args:
        input_data: data of a single sequence; it is converted with ``AA``
            to the dtype selected by ``precision``. Its first dimension is
            treated as the sequence axis (the sample shape is
            ``input_data.shape[1:]``).
        slice_params: pair ``(begin_index, end_index)`` handed to
            ``C.sequence.slice`` and also used to build the expected
            gradient.
        expected_result: expected forward value for the single sequence.
        device_id: forwarded to ``unittest_helper``.
        precision: key into ``PRECISION_TO_TYPE``; also forwarded to
            ``unittest_helper``.
    '''
    np_dtype = PRECISION_TO_TYPE[precision]
    input_data = AA(input_data, dtype=np_dtype)

    t = Axis.new_unique_dynamic_axis('t')
    sample_shape = input_data.shape[1:]
    a = I(shape=sample_shape,
          dtype=sanitize_dtype_cntk(np_dtype),
          needs_gradient=True,
          dynamic_axes=[Axis.default_batch_axis(), t],
          name='a')

    result = C.sequence.slice(a,
                              begin_index=slice_params[0],
                              end_index=slice_params[1])

    def grad_slice(x, beg_index, end_index):
        # Expected gradient: 1 for the steps inside the slice, 0 elsewhere.
        res = np.zeros_like(x)
        res[beg_index:end_index] = 1
        return res

    # Computed before the batch dimension is prepended below, so the slice
    # indices address the sequence axis.
    expected_gradient = grad_slice(np.asarray(input_data), *slice_params)

    expected_forward = AA([expected_result], dtype=np_dtype)
    expected_backward = {
        a: [expected_gradient]
    }

    # create batch
    input_data.shape = (1,) + input_data.shape
    forward_input = {a: input_data}
    unittest_helper(result,
                    forward_input, expected_forward, expected_backward,
                    device_id=device_id, precision=precision)
def test_op_broadcast_as(device_id, precision):
    '''
    Evaluate ``sequence.broadcast_as(a, b)`` on a batch of three entries and
    compare each result with a hard-coded expectation.

    ``a`` is declared with only the default batch axis, ``b`` with the
    default dynamic axes of ``I``. ``device_id`` is not referenced in the
    body.
    '''
    from .. import sequence

    np_dtype = PRECISION_TO_TYPE[precision]

    a_data = [AA(sample, dtype=np_dtype)
              for sample in ([1], [2], [3])]
    b_data = [AA(steps, dtype=np_dtype)
              for steps in ([[2]], [[2], [3]], [[2], [3], [4]])]

    a = I(shape=(1,),
          dtype=sanitize_dtype_cntk(np_dtype),
          name='a',
          dynamic_axes=[Axis.default_batch_axis()])
    b = I(shape=(1,),
          dtype=sanitize_dtype_cntk(np_dtype),
          name='b')

    res = sequence.broadcast_as(a, b).eval({a: a_data, b: b_data})

    # Entry i of `a` is expected once per step of entry i of `b`.
    expected = (
        [[1.]],
        [[2.], [2.]],
        [[3.], [3.], [3.]],
    )
    for idx, want in enumerate(expected):
        assert np.array_equal(res[idx], np.asarray(want))
def test_op_broadcast_as(device_id, precision):
    '''
    Check ``sequence.broadcast_as`` against fixed expected outputs.

    Three batch entries are fed: ``a`` holds one value per entry, ``b`` holds
    sequences with 1, 2 and 3 steps. The assertions require result ``i`` to
    repeat the value of ``a`` entry ``i`` once per step of ``b`` entry ``i``.
    ``device_id`` is not referenced in the body.
    '''
    from .. import sequence

    np_dtype = PRECISION_TO_TYPE[precision]

    def as_array(values):
        # All test data shares the dtype selected by `precision`.
        return AA(values, dtype=np_dtype)

    a_data = [as_array([1]), as_array([2]), as_array([3])]
    b_data = [
        as_array([[2]]),
        as_array([[2], [3]]),
        as_array([[2], [3], [4]]),
    ]

    a = I(shape=(1,), dtype=sanitize_dtype_cntk(np_dtype), name='a',
          dynamic_axes=[Axis.default_batch_axis()])
    b = I(shape=(1,), dtype=sanitize_dtype_cntk(np_dtype), name='b')

    feed = {a: a_data, b: b_data}
    broadcast_op = sequence.broadcast_as(a, b)
    res = broadcast_op.eval(feed)

    first_expected = np.asarray([[1.]])
    second_expected = np.asarray([[2.], [2.]])
    third_expected = np.asarray([[3.], [3.], [3.]])

    assert np.array_equal(res[0], first_expected)
    assert np.array_equal(res[1], second_expected)
    assert np.array_equal(res[2], third_expected)
def Input(shape, dtype=default_override_or(np.float32), needs_gradient=True, is_sparse=False,
          dynamic_axes=Axis.default_input_variable_dynamic_axes(), name=''):
    '''
    Constructs an Input variable.

    The ``dtype`` argument is first resolved through
    ``get_default_override``; every argument is then forwarded by keyword to
    ``input_variable`` and its result is returned.
    '''
    resolved_dtype = get_default_override(Input, dtype=dtype)
    forwarded = dict(
        shape=shape,
        dtype=resolved_dtype,
        needs_gradient=needs_gradient,
        is_sparse=is_sparse,
        dynamic_axes=dynamic_axes,
        name=name,
    )
    return input_variable(**forwarded)
def Input(shape, dtype=default_override_or(np.float32), needs_gradient=True, is_sparse=False,
          dynamic_axes=Axis.default_input_variable_dynamic_axes(), name=''):
    '''
    Input(shape, dtype=np.float32, needs_gradient=True, is_sparse=False, dynamic_axes=Axis.default_input_variable_dynamic_axes(), name='')

    Constructs an Input variable.
    Input variables are used when explicitly constructing a graph.
    In the context of the Layers library, however, the preferred method is to use the @\\ :func:`~cntk.utils.Signature` pattern.
    This is a wrapper around :func:`~cntk.ops.input_variable`.

    Example:
     >>> # an input receptacle for explicit graph building
     >>> x = Input((2,3), is_sparse=True)
     >>> x.is_sparse
     True
     >>> x.shape
     (2, 3)
     >>> y = sigmoid(x)
     >>> y.shape
     (2, 3)

     >>> # but the preferred pattern is to use the @Function/@Signature pattern instead:
     >>> from cntk.ops.functions import Function
     >>> from cntk.layers.typing import *
     >>> @Function
     ... @Signature(x = Tensor[2,3])
     ... def y(x):
     ...     return sigmoid(x)
     >>> y.shape
     (2, 3)

     >>> # type specifications can also be directly passed to Input:
     >>> x = Input(**SparseTensor[2,3])
     >>> x.is_sparse
     True
     >>> x.shape
     (2, 3)
     >>> y = sigmoid(x)
     >>> y.shape
     (2, 3)

    Args:
        shape (`int` or `tuple` of `ints`): vector or tensor dimension of the output of this layer
        dtype (np.dtype, defaults to np.float32): data type
        needs_gradient (bool, defaults to `True`):
        is_sparse (bool, defaults to `False`):
        dynamic_axes (object, `Axis.default_input_variable_dynamic_axes`):
        name (str, defaults to ''): the name of the Function instance in the network

    Returns:
        an input Variable
    '''
    # The docstring above spells the Sphinx escape as '\\ ' so that the
    # non-raw string no longer contains the invalid escape sequence '\ ';
    # the resulting docstring text is unchanged.

    # Resolve the dtype against any default override that is in effect.
    dtype = get_default_override(Input, dtype=dtype)
    # NOTE(review): `input` is expected to be the CNTK op imported at module
    # level (the builtin of the same name does not accept these keywords) --
    # confirm against the file's imports.
    return input(shape=shape, dtype=dtype, needs_gradient=needs_gradient, is_sparse=is_sparse,
                 dynamic_axes=dynamic_axes, name=name)
def test_op_broadcast_as_in_loop(device_id):
    '''
    Evaluate ``sequence.broadcast_as`` when its second operand is built from
    a ``past_value`` of a placeholder, and compare the three results with
    hard-coded expectations.

    The placeholder replacement call is made on the broadcast output with
    that same output as its argument. ``device_id`` is not referenced in the
    body.
    '''
    from .. import sequence, placeholder_variable, past_value

    a_data = [AA([value]) for value in (1, 2, 3)]
    b_data = [AA(steps) for steps in ([[2]], [[2], [3]], [[2], [3], [4]])]

    a = I(shape=(1,),
          name='a',
          dynamic_axes=[Axis.default_batch_axis()])
    b = I(shape=(1,),
          name='b')

    # The operand to broadcast against is (delayed placeholder) + b.
    loop_slot = placeholder_variable()
    delayed = past_value(loop_slot, time_step=5)
    out = sequence.broadcast_as(a, delayed + b)
    out.replace_placeholder(out)

    res = out.eval({a: a_data, b: b_data})

    expected = (
        [[1.]],
        [[2.], [2.]],
        [[3.], [3.], [3.]],
    )
    for idx, want in enumerate(expected):
        assert np.array_equal(res[idx], np.asarray(want))
def Input(shape, dtype=default_override_or(np.float32), needs_gradient=True, is_sparse=False,
          dynamic_axes=Axis.default_input_variable_dynamic_axes(), name=''):
    '''
    Input(shape, dtype=np.float32, needs_gradient=True, is_sparse=False, dynamic_axes=Axis.default_input_variable_dynamic_axes(), name='')

    Constructs an Input variable by forwarding all arguments to
    ``input_variable``, after resolving ``dtype`` through
    ``get_default_override``.

    Args:
        shape (`int` or `tuple` of `ints`): vector or tensor dimension of the output of this layer
        dtype (np.dtype, defaults to np.float32): data type
        needs_gradient (bool, defaults to `True`): forwarded unchanged
        is_sparse (bool, defaults to `False`): forwarded unchanged
        dynamic_axes (object, Axis.default_input_variable_dynamic_axes): forwarded unchanged
        name (str, defaults to ''): the name of the Function instance in the network

    Returns:
        whatever ``input_variable`` returns for these arguments
    '''
    effective_dtype = get_default_override(Input, dtype=dtype)
    return input_variable(
        shape=shape,
        dtype=effective_dtype,
        needs_gradient=needs_gradient,
        is_sparse=is_sparse,
        dynamic_axes=dynamic_axes,
        name=name,
    )
def test_op_broadcast_as_in_loop(device_id):
    '''
    Check ``sequence.broadcast_as`` inside a recurrence.

    The broadcast target is the sum of ``b`` and a ``past_value`` (time step
    5) of a placeholder; ``replace_placeholder`` is then called on the output
    with the output itself. The three evaluated results are compared with
    fixed expected arrays. ``device_id`` is not referenced in the body.
    '''
    from .. import sequence, placeholder_variable, past_value

    a_data = [
        AA([1]),
        AA([2]),
        AA([3]),
    ]
    b_data = [
        AA([[2]]),
        AA([[2], [3]]),
        AA([[2], [3], [4]]),
    ]

    a = I(shape=(1,), name='a', dynamic_axes=[Axis.default_batch_axis()])
    b = I(shape=(1,), name='b')

    recurrent_input = placeholder_variable()
    previous_out = past_value(recurrent_input, time_step=5)
    broadcast_target = previous_out + b

    out = sequence.broadcast_as(a, broadcast_target)
    out.replace_placeholder(out)

    feed = {a: a_data, b: b_data}
    res = out.eval(feed)

    first_expected = np.asarray([[1.]])
    second_expected = np.asarray([[2.], [2.]])
    third_expected = np.asarray([[3.], [3.], [3.]])

    assert np.array_equal(res[0], first_expected)
    assert np.array_equal(res[1], second_expected)
    assert np.array_equal(res[2], third_expected)
# create batch input_tensor.shape = (1, ) + input_tensor.shape forward_input = {a: input_tensor} unittest_helper(input_op, forward_input, expected_forward, expected_backward, device_id=device_id, precision=precision) RESHAPE_SUBSHAPE_TEST_CASES = [ #(input_shape, replacement_shape, begin_axis, end_axis, expected_output_shape) ((2, 3), (3, 2), 0, Axis.new_leading_axis(), (3, 2)), ((2, 3), (1), 0, 0, (1, 2, 3)), ((2, 3), (1, 1), Axis.new_leading_axis(), Axis.new_leading_axis(), (2, 3, 1, 1)), ((2, 3, 5), (C.InferredDimension), 0, Axis(2), (6, 5)), ((2, 3, 5), (C.InferredDimension), Axis(-3), -1, (6, 5)), ((6, 5), (2, C.InferredDimension), 0, 1, (2, 3, 5)), ] @pytest.mark.parametrize( "input_shape, replacement_shape, begin_axis, end_axis, expected_output_shape", RESHAPE_SUBSHAPE_TEST_CASES) def test_op_reshape_subshape(input_shape, replacement_shape, begin_axis, end_axis, expected_output_shape, device_id, precision):
def __init__(self, in_shape, output_shape, device_id=None,
             learning_rate=0.00025, momentum=0.9,
             minibatch_size=32, update_interval=10000,
             n_workers=1, visualizer=None):
    """
    Q Neural Network following the implementation and default options of Mnih et al.

    The network is produced by self._build_model() (not shown here); it is
    described as having the following topology:
        Convolution(32, (8, 8))
        Convolution(64, (4, 4))
        Convolution(64, (2, 2))
        Dense(512)

    Besides the model, the constructor creates a frozen clone of it
    (self._target), an Adam learner and a Trainer that uses a Huber loss
    between the Q-value of the taken action and self._q_targets.

    :param in_shape: Shape of the observations perceived by the learner (the neural net input)
    :param output_shape: Size of the action space (mapped to the number of output neurons)
    :param device_id: Use None to let CNTK select the best available device,
                      -1 for CPU, >= 0 for GPU
                      (default: None)
    :param learning_rate: Learning rate; must be > 0
                          (default: 0.00025, as per Mnih et al.)
    :param momentum: Momentum, provided as momentum value for
                     averaging gradients without unit gain filter;
                     must satisfy 0 <= momentum < 1.
                     Note that CNTK does not currently provide an implementation
                     of Graves' RmsProp with momentum.
                     It uses AdamSGD optimizer instead.
                     (default: 0.9)
    :param minibatch_size: Minibatch size (default: 32, as per Mnih et al.).
                           Not referenced in this constructor.
    :param update_interval: Stored as self._target_update_interval
                            (default: 10000).
                            NOTE(review): presumably the number of steps between
                            target-network refreshes -- confirm where it is used.
    :param n_workers: Number of concurrent worker for distributed training.
                      (default: 1, not distributed)
    :param visualizer: Optional visualizer allowing the model to save summary data
                       (default: None, no visualization)

    :raises AssertionError: if learning_rate or momentum is out of range
    :raises NotImplementedError: if self.distributed_training is true

    Ref: Mnih et al.: "Human-level control through deep reinforcement learning."
    Nature 518.7540 (2015): 529-533.
    """

    assert learning_rate > 0, 'learning_rate should be > 0'
    assert 0. <= momentum < 1, 'momentum should be 0 <= momentum < 1'

    # Both base classes are initialised explicitly.
    QModel.__init__(self, in_shape, output_shape)
    CntkModel.__init__(self, device_id, False, n_workers, visualizer)

    self._nb_actions = output_shape
    self._steps = 0
    self._target_update_interval = update_interval
    # Overwritten below, once the model has been built.
    self._target = None

    # Input vars
    # NOTE: the parentheses around Axis.default_batch_axis() do not create a
    # tuple; a single Axis object is passed as dynamic_axes.
    self._environment = input(in_shape, name='env',
                              dynamic_axes=(Axis.default_batch_axis()))
    self._q_targets = input(1, name='q_targets',
                            dynamic_axes=(Axis.default_batch_axis()))
    self._actions = input(output_shape, name='actions',
                          dynamic_axes=(Axis.default_batch_axis()))

    # Define the neural network graph
    self._model = self._build_model()(self._environment)
    # The target network is a frozen clone wired to the same input variable.
    self._target = self._model.clone(
        CloneMethod.freeze, {self._environment: self._environment})

    # Define the learning rate
    lr_schedule = learning_rate_schedule(learning_rate, UnitType.minibatch)

    # AdamSGD optimizer
    m_schedule = momentum_schedule(momentum)
    vm_schedule = momentum_schedule(0.999)
    l_sgd = adam(self._model.parameters, lr_schedule,
                 momentum=m_schedule,
                 unit_gain=True,
                 variance_momentum=vm_schedule)

    # Distributed training is rejected only after the learner was created.
    if self.distributed_training:
        raise NotImplementedError('ASGD not implemented yet.')

    # _actions is a 1-hot encoding of the actions done by the agent, so the
    # product keeps only the Q-value of the action that was taken.
    # NOTE(review): the original comment calls the encoding sparse, but the
    # input above is not declared with is_sparse -- confirm.
    q_acted = reduce_sum(self._model * self._actions, axis=0)

    # Define the trainer with Huber Loss function
    criterion = huber_loss(q_acted, self._q_targets, 1.0)

    self._learner = l_sgd
    # No separate evaluation metric is supplied (second tuple element is None).
    self._trainer = Trainer(self._model, (criterion, None), l_sgd)
expected_forward = [input_reshaped**2] expected_backward = {a: input_tensor} # create batch input_tensor.shape = (1,) + input_tensor.shape forward_input = {a: input_tensor} unittest_helper(input_op, forward_input, expected_forward, expected_backward, device_id=device_id, precision=precision) RESHAPE_SUBSHAPE_TEST_CASES = [ #(input_shape, replacement_shape, begin_axis, end_axis, expected_output_shape) ((2, 3), (3, 2), 0, Axis.new_leading_axis(), (3, 2)), ((2, 3), (1), 0, 0, (1, 2, 3)), ((2, 3), (1, 1), Axis.new_leading_axis(),Axis.new_leading_axis(), (2, 3, 1, 1)), ((2, 3, 5), (C.InferredDimension), 0, Axis(2), (6, 5)), ((2, 3, 5), (C.InferredDimension), Axis(-3), -1, (6, 5)), ((6, 5), (2, C.InferredDimension), 0, 1, (2, 3, 5)), ] @pytest.mark.parametrize("input_shape, replacement_shape, begin_axis, end_axis, expected_output_shape", RESHAPE_SUBSHAPE_TEST_CASES) def test_op_reshape_subshape(input_shape, replacement_shape, begin_axis, end_axis, expected_output_shape, device_id, precision): # Reshaping is just moving the input values to different indexes of the result tensor. # If we compute the gradients on the unmodified tensor, reshape would get 1 for all inputs # For testing the gradients we want to have different gradients for each input index otherwise we can't # test if they get wrongly permuted during test. To this end we multiply # the reshaping result with itself. dev = cntk_device(device_id)
expected_forward = [[input_reshaped**2]] expected_backward = {a: input_tensor} # create batch input_tensor.shape = (1, 1) + input_tensor.shape forward_input = {a: input_tensor} unittest_helper(input_op, forward_input, expected_forward, expected_backward, device_id=device_id, precision=precision) RESHAPE_SUBSHAPE_TEST_CASES = [ #(input_shape, replacement_shape, begin_axis, end_axis, expected_output_shape) ((2, 3), (3, 2), 0, Axis.end_static_axis(), (3, 2)), ((2, 3), (1), 0, 0, (1, 2, 3)), ((2, 3), (1, 1), Axis.end_static_axis(), Axis.end_static_axis(), (2, 3, 1, 1)), ((2, 3, 5), (C.InferredDimension), 0, Axis(2), (6, 5)), ((2, 3, 5), (C.InferredDimension), Axis(-3), -1, (6, 5)), ((6, 5), (2, C.InferredDimension), 0, 1, (2, 3, 5)), ] @pytest.mark.parametrize("input_shape, replacement_shape, begin_axis, end_axis, expected_output_shape", RESHAPE_SUBSHAPE_TEST_CASES) def test_op_reshape_subshape(input_shape, replacement_shape, begin_axis, end_axis, expected_output_shape, device_id, precision): # Reshaping is just moving the input values to different indexes of the result tensor. # If we compute the gradients on the unmodified tensor, reshape would get 1 for all inputs # For testing the gradients we want to have different gradients for each input index otherwise we can't # test if they get wrongly permuted during test. To this end we multiply # the reshaping result with itself. dev = cntk_device(device_id)
# create batch input_tensor.shape = (1, 1) + input_tensor.shape forward_input = {a: input_tensor} unittest_helper(input_op, forward_input, expected_forward, expected_backward, device_id=device_id, precision=precision) RESHAPE_SUBSHAPE_TEST_CASES = [ #(input_shape, replacement_shape, begin_axis, end_axis, expected_output_shape) ((2, 3), (3, 2), 0, Axis.end_static_axis(), (3, 2)), ((2, 3), (1), 0, 0, (1, 2, 3)), ((2, 3), (1, 1), Axis.end_static_axis(), Axis.end_static_axis(), (2, 3, 1, 1)), ((2, 3, 5), (C.InferredDimension), 0, Axis(2), (6, 5)), ((2, 3, 5), (C.InferredDimension), Axis(-3), -1, (6, 5)), ((6, 5), (2, C.InferredDimension), 0, 1, (2, 3, 5)), ] @pytest.mark.parametrize( "input_shape, replacement_shape, begin_axis, end_axis, expected_output_shape", RESHAPE_SUBSHAPE_TEST_CASES) def test_op_reshape_subshape(input_shape, replacement_shape, begin_axis, end_axis, expected_output_shape, device_id, precision):
def __init__(self, in_shape, output_shape, device_id=None,
             learning_rate=0.00025, momentum=0.9,
             minibatch_size=32, update_interval=10000,
             n_workers=1, visualizer=None):
    """
    Set up a Q-network, its frozen target copy, and the trainer.

    The model itself comes from self._build_model() (not shown here) applied
    to the environment input; the original description lists its topology as
    Convolution(32, (8, 8)), Convolution(64, (4, 4)),
    Convolution(64, (2, 2)), Dense(512), following Mnih et al.

    :param in_shape: Shape of the observations (the neural net input)
    :param output_shape: Size of the action space (number of output neurons)
    :param device_id: Passed to CntkModel.__init__; described as None to let
                      CNTK pick a device, -1 for CPU, >= 0 for GPU
                      (default: None)
    :param learning_rate: Learning rate, must be > 0 (default: 0.00025)
    :param momentum: Momentum for the Adam learner, must satisfy
                     0 <= momentum < 1 (default: 0.9)
    :param minibatch_size: Minibatch size (default: 32); not referenced in
                           this constructor
    :param update_interval: Stored as self._target_update_interval
                            (default: 10000)
    :param n_workers: Number of concurrent workers, passed to
                      CntkModel.__init__ (default: 1)
    :param visualizer: Optional visualizer, passed to CntkModel.__init__
                       (default: None)

    Ref: Mnih et al.: "Human-level control through deep reinforcement learning."
    Nature 518.7540 (2015): 529-533.
    """
    assert learning_rate > 0, 'learning_rate should be > 0'
    assert 0. <= momentum < 1, 'momentum should be 0 <= momentum < 1'

    QModel.__init__(self, in_shape, output_shape)
    CntkModel.__init__(self, device_id, False, n_workers, visualizer)

    # Plain bookkeeping attributes.
    self._nb_actions = output_shape
    self._steps = 0
    self._target_update_interval = update_interval
    self._target = None

    # Graph inputs: observation, regression target, and taken action.
    self._environment = input(
        in_shape, name='env',
        dynamic_axes=(Axis.default_batch_axis()))
    self._q_targets = input(
        1, name='q_targets',
        dynamic_axes=(Axis.default_batch_axis()))
    self._actions = input(
        output_shape, name='actions',
        dynamic_axes=(Axis.default_batch_axis()))

    # Online network, plus a frozen clone bound to the same input.
    self._model = self._build_model()(self._environment)
    input_binding = {self._environment: self._environment}
    self._target = self._model.clone(CloneMethod.freeze, input_binding)

    # Schedules for the Adam learner.
    step_size = learning_rate_schedule(learning_rate, UnitType.minibatch)
    first_moment = momentum_schedule(momentum)
    second_moment = momentum_schedule(0.999)

    optimizer = adam(
        self._model.parameters,
        step_size,
        momentum=first_moment,
        unit_gain=True,
        variance_momentum=second_moment,
    )

    if self.distributed_training:
        raise NotImplementedError('ASGD not implemented yet.')

    # Multiplying by the 1-hot action encoding and summing over axis 0
    # leaves the Q-value of the action that was taken.
    chosen_q = reduce_sum(self._model * self._actions, axis=0)

    # Huber loss between the chosen Q-value and the supplied target.
    loss = huber_loss(chosen_q, self._q_targets, 1.0)

    self._learner = optimizer
    self._trainer = Trainer(self._model, (loss, None), optimizer)