def test_image_scale_transformer(self):
    spec = alf.TensorSpec((3, 16, 16), dtype=torch.uint8)
    transformer = ImageScaleTransformer(spec, min=0.)
    new_spec = transformer.transformed_observation_spec
    self.assertEqual(new_spec.dtype, torch.float32)
    self.assertEqual(new_spec.minimum, 0.)
    self.assertEqual(new_spec.maximum, 1.)
    timestep = DataItem(
        observation=torch.randint(256, (3, 16, 16)).to(torch.uint8))
    transformed = transformer.transform_timestep(timestep, ())[0]
    self.assertLess(
        (transformed.observation * 255 - timestep.observation).abs().max(),
        1e-4)
    transformed = transformer.transform_experience(timestep)
    self.assertLess(
        (transformed.observation * 255 - timestep.observation).abs().max(),
        1e-4)

    spec = dict(
        img=alf.TensorSpec((3, 16, 16), dtype=torch.uint8),
        other=alf.TensorSpec(()))
    self.assertRaises(AssertionError, ImageScaleTransformer, spec, min=0.)
    self.assertRaises(
        AssertionError, ImageScaleTransformer, spec, min=0., fields=['other'])
    transformer = ImageScaleTransformer(spec, min=0., fields=['img'])
def env_info_spec(self):
    return {
        "play0_win": alf.TensorSpec(()),
        "play1_win": alf.TensorSpec(()),
        "draw": alf.TensorSpec(()),
        "invalid_move": alf.TensorSpec(()),
    }
def __init__(self,
             batch_size,
             height=19,
             width=19,
             winning_thresh=7.5,
             allow_suicidal_move=False,
             reward_shaping=False,
             human_player=None):
    """
    Args:
        batch_size (int): the number of parallel boards
        height (int): height of each board
        width (int): width of each board
        winning_thresh (float): player 0 wins if area0 - area1 > winning_thresh,
            loses if area0 - area1 < winning_thresh, otherwise draw.
        allow_suicidal_move (bool): whether suicidal moves are allowed.
        reward_shaping (bool): if True, instead of using +1/-1 as reward, use
            ``alf.math.softsign(area0 - area1 - winning_thresh)`` as reward to
            encourage capturing more area.
        human_player (int|None): 0, 1 or None
    """
    self._batch_size = batch_size
    self._width = width
    self._height = height
    self._max_num_moves = 2 * height * width
    self._winning_thresh = float(winning_thresh)
    self._allow_suicical_move = allow_suicidal_move
    self._reward_shaping = reward_shaping
    self._human_player = human_player
    # width * height means pass;
    # otherwise it is a move at (y=action // width, x=action % width)
    self._action_spec = alf.BoundedTensorSpec((),
                                              minimum=0,
                                              maximum=width * height,
                                              dtype=torch.int64)
    self._observation_spec = OrderedDict(
        board=alf.TensorSpec((1, height, width), torch.int8),
        prev_action=self._action_spec,
        valid_action_mask=alf.TensorSpec([width * height + 1], torch.bool),
        steps=alf.TensorSpec((), torch.int32),
        to_play=alf.TensorSpec((), torch.int8))
    self._B = torch.arange(self._batch_size)
    self._env_ids = torch.arange(batch_size)
    self._pass_action = width * height
    self._board = GoBoard(batch_size, height, width, self._max_num_moves)
    self._previous_board = self._board.get_board()
    self._num_moves = torch.zeros((batch_size, ), dtype=torch.int32)
    self._game_over = torch.zeros((batch_size, ), dtype=torch.bool)
    self._prev_action = torch.full((batch_size, ),
                                   self._pass_action,
                                   dtype=torch.int64)
    self._surface = None
    if human_player is not None:
        logging.info("Use mouse click to place a stone")
        logging.info("Keyboard control:")
        logging.info("P : pass")
        logging.info("SPACE : refresh display")
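# Illustrative sketch (not part of the environment class): how a flat action
# index maps to a board move under the encoding documented above. The helper
# name below is hypothetical.
def _decode_go_action(action, height=19, width=19):
    """Return ('pass', None) or ('move', (y, x)) for a flat action index."""
    if action == width * height:
        return 'pass', None
    return 'move', (action // width, action % width)

# Example: on a 19x19 board, action 42 is the move (y=2, x=4);
# action 361 (= width * height) is a pass.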
def __init__(self,
             env,
             progress_favor=10.0,
             current_score_update_rate=1e-3,
             past_score_update_rate=5e-4,
             warmup_period=100):
    """
    Args:
        env (AlfEnvironment): environment to be wrapped. It needs to be batched.
        progress_favor (float): how much more likely to choose the environment
            with the fastest progress than the ones with no progress. If
            ``progress_favor`` is 1, all tasks are sampled uniformly.
        current_score_update_rate (float): the rate for updating the current
            score.
        past_score_update_rate (float): the rate for updating the past score.
        warmup_period (int): gradually increase ``progress_favor`` from 1 to
            ``progress_favor`` during the first ``num_tasks * warmup_period``
            episodes.
    """
    self._env = env
    assert env.batched, "Only batched env is supported"
    num_tasks = env.num_tasks
    task_names = env.task_names
    batch_size = env.batch_size
    self._episode_rewards = torch.zeros(batch_size)
    assert (len(env.action_spec()) == 2 and 'action' in env.action_spec()
            and 'task_id' in env.action_spec()), (
                "The action_spec in the wrapped environment should have "
                "exactly two keys: 'task_id' and 'action'")
    self._action_spec = env.action_spec()['action']
    self._num_tasks = num_tasks
    self._task_names = task_names
    self._env_info_spec = copy.copy(env.env_info_spec())
    self._env_info_spec.update(
        self._add_task_names({
            'curriculum_task_count': [alf.TensorSpec(())] * num_tasks,
            'curriculum_task_score': [alf.TensorSpec(())] * num_tasks,
            'curriculum_task_prob': [alf.TensorSpec(())] * num_tasks
        }))
    self._zero_curriculum_info = self._add_task_names({
        'curriculum_task_count':
            [torch.zeros(batch_size, device='cpu')] * num_tasks,
        'curriculum_task_score':
            [torch.zeros(batch_size, device='cpu')] * num_tasks,
        'curriculum_task_prob':
            [torch.zeros(batch_size, device='cpu')] * num_tasks
    })
    self._progress_favor = progress_favor
    self._current_score_update_rate = current_score_update_rate
    self._past_score_update_rate = past_score_update_rate
    self._warmup_period = warmup_period * num_tasks
    self._scale = math.log(progress_favor)
    self._total_count = 0
    self._current_scores = torch.zeros(num_tasks, device='cpu')
    self._past_scores = torch.zeros(num_tasks, device='cpu')
    self._task_probs = torch.ones(num_tasks, device='cpu') / num_tasks
    self._task_counts = torch.zeros(num_tasks, device='cpu')
    self._current_task_ids = self._sample_tasks(batch_size)
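# Hedged illustration of why ``self._scale = math.log(progress_favor)``: if
# task sampling probabilities are a softmax of ``scale * progress`` with
# progress normalized to [0, 1] (an assumption for this sketch; the actual
# update lives in the wrapper's internal methods, not shown here), then the
# fastest-progress task is exactly ``progress_favor`` times more likely than a
# zero-progress task.
import math
import torch

progress_favor = 10.0
scale = math.log(progress_favor)
progress = torch.tensor([1.0, 0.0, 0.5])    # normalized progress per task
probs = torch.softmax(scale * progress, dim=0)
print(probs[0] / probs[1])                  # ~10.0 == progress_favor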
def test_curriculum_wrapper(self):
    task_names = ['CartPole-v0', 'CartPole-v1']
    env = create_environment(
        env_name=task_names,
        env_load_fn=suite_gym.load,
        num_parallel_environments=4,
        batched_wrappers=(alf_wrappers.CurriculumWrapper, ))
    self.assertTrue(type(env.action_spec()) == alf.BoundedTensorSpec)
    self.assertEqual(env.num_tasks, 2)
    self.assertEqual(len(env.env_info_spec()['curriculum_task_count']), 2)
    self.assertEqual(len(env.env_info_spec()['curriculum_task_score']), 2)
    self.assertEqual(len(env.env_info_spec()['curriculum_task_prob']), 2)
    for i in task_names:
        self.assertEqual(env.env_info_spec()['curriculum_task_count'][i],
                         alf.TensorSpec(()))
        self.assertEqual(env.env_info_spec()['curriculum_task_score'][i],
                         alf.TensorSpec(()))
        self.assertEqual(env.env_info_spec()['curriculum_task_prob'][i],
                         alf.TensorSpec(()))

    time_step = env.reset()
    self.assertEqual(len(env.env_info_spec()['curriculum_task_count']), 2)
    self.assertEqual(len(env.env_info_spec()['curriculum_task_score']), 2)
    self.assertEqual(len(env.env_info_spec()['curriculum_task_prob']), 2)
    for i in task_names:
        self.assertEqual(
            time_step.env_info['curriculum_task_count'][i].shape, (4, ))
        self.assertEqual(
            time_step.env_info['curriculum_task_score'][i].shape, (4, ))
        self.assertEqual(
            time_step.env_info['curriculum_task_prob'][i].shape, (4, ))

    for j in range(500):
        time_step = env.step(time_step.prev_action)
        self.assertEqual(time_step.env_id, torch.arange(4))
        self.assertEqual(len(env.env_info_spec()['curriculum_task_count']), 2)
        self.assertEqual(len(env.env_info_spec()['curriculum_task_score']), 2)
        self.assertEqual(len(env.env_info_spec()['curriculum_task_prob']), 2)
        for i in task_names:
            self.assertEqual(
                time_step.env_info['curriculum_task_count'][i].shape, (4, ))
            self.assertEqual(
                time_step.env_info['curriculum_task_score'][i].shape, (4, ))
            self.assertEqual(
                time_step.env_info['curriculum_task_prob'][i].shape, (4, ))
        sum_probs = sum(time_step.env_info['curriculum_task_prob'].values())
        self.assertTrue(
            torch.all((sum_probs == 0.) | ((sum_probs - 1.).abs() < 1e-3)))
def __init__(self, dim, size, name="FIFOMemory"):
    """
    Args:
        dim (int): dimension of memory content
        size (int): number of memory slots
    """
    self._built = False
    state_spec = (alf.TensorSpec((size, dim), dtype=torch.float32),
                  alf.TensorSpec((), dtype=torch.int64))
    self._range = torch.arange(size).unsqueeze(0)
    super().__init__(dim, size, state_spec=state_spec, name=name)
def __init__(self, network, n, name=None):
    """
    A parallel network has ``n`` copies of ``network`` with the same structure
    but different, independently initialized parameters.

    ``NaiveParallelNetwork`` creates ``n`` independent networks with the same
    structure as ``network`` and evaluates them separately in a loop during
    ``forward()``.

    Args:
        network (Network): the parallel network will have ``n`` copies of
            ``network``.
        n (int): ``n`` copies of ``network``
        name (str): a string that will be used as the name of the created
            NaiveParallelNetwork instance. If ``None``, ``naive_parallel_``
            followed by the ``network.name`` will be used by default.
    """
    super().__init__(network.input_tensor_spec,
                     name if name else 'naive_parallel_%s' % network.name)
    self._networks = nn.ModuleList(
        [network.copy(name=self.name + '_%d' % i) for i in range(n)])
    self._n = n
    self._state_spec = alf.nest.map_structure(
        lambda spec: alf.TensorSpec((n, ) + spec.shape, spec.dtype),
        network.state_spec)
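# Minimal usage sketch, assuming ``NaiveParallelNetwork`` and
# ``EncodingNetwork`` are importable from ``alf.networks`` and that the
# parallel network accepts inputs carrying a replica dimension of size ``n``
# (both are assumptions, not verified here).
import torch
import alf
from alf.networks import EncodingNetwork, NaiveParallelNetwork

base = EncodingNetwork(
    input_tensor_spec=alf.TensorSpec((4, )), fc_layer_params=(8, ))
pnet = NaiveParallelNetwork(base, n=3)
# Each of the 3 replicas is evaluated separately in a loop during forward().
output, state = pnet(torch.rand(2, 3, 4), ())
print(output.shape)  # expected (2, 3, 8) under the above assumptions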
def test_conversions(self):
    dists = {
        't': torch.tensor([[1., 2., 4.], [3., 3., 1.]]),
        'd': dist_utils.DiagMultivariateNormal(
            torch.tensor([[1., 2.], [2., 2.]]),
            torch.tensor([[2., 3.], [1., 1.]]))
    }
    params = dist_utils.distributions_to_params(dists)
    dists_spec = dist_utils.extract_spec(dists, from_dim=1)
    self.assertEqual(dists_spec['t'],
                     alf.TensorSpec(shape=(3, ), dtype=torch.float32))
    self.assertEqual(type(dists_spec['d']), dist_utils.DistributionSpec)
    self.assertEqual(len(params), 2)
    self.assertEqual(dists['t'], params['t'])
    self.assertEqual(dists['d'].base_dist.mean, params['d']['loc'])
    self.assertEqual(dists['d'].base_dist.stddev, params['d']['scale'])

    dists1 = dist_utils.params_to_distributions(params, dists_spec)
    self.assertEqual(len(dists1), 2)
    self.assertEqual(dists1['t'], dists['t'])
    self.assertEqual(type(dists1['d']), type(dists['d']))

    params_spec = dist_utils.to_distribution_param_spec(dists_spec)
    alf.nest.assert_same_structure(params_spec, params)
    params1_spec = dist_utils.extract_spec(params)
    self.assertEqual(params_spec, params1_spec)
def _create_merlin_algorithm(env,
                             encoder_fc_layers=(3, ),
                             latent_dim=4,
                             lstm_size=(4, ),
                             memory_size=20,
                             learning_rate=1e-3,
                             debug_summaries=True):
    config = TrainerConfig(root_dir="dummy", unroll_length=6)
    observation_spec = env.observation_spec()
    action_spec = env.action_spec()
    algorithm = MerlinAlgorithm(
        observation_spec=observation_spec,
        action_spec=action_spec,
        env=env,
        config=config,
        encoders=alf.networks.EncodingNetwork(
            input_tensor_spec=observation_spec,
            fc_layer_params=encoder_fc_layers,
            activation=math_ops.identity,
            name="ObsEncoder"),
        decoders=DecodingAlgorithm(
            decoder=alf.networks.EncodingNetwork(
                input_tensor_spec=alf.TensorSpec((latent_dim, )),
                fc_layer_params=encoder_fc_layers,
                activation=math_ops.identity,
                name="ObsDecoder"),
            loss_weight=100.),
        latent_dim=latent_dim,
        lstm_size=lstm_size,
        memory_size=memory_size,
        optimizer=alf.optimizers.AdamTF(lr=learning_rate),
        debug_summaries=debug_summaries)
    return algorithm
def __init__(self, observation_spec, action_spec, debug_summaries):
    super().__init__(
        observation_spec,
        action_spec,
        train_state_spec=MCTSState(
            steps=alf.TensorSpec((), dtype=torch.int64)),
        debug_summaries=debug_summaries)
    self._model = None
def _make_stacked_spec(self, spec):
    assert isinstance(spec, alf.TensorSpec), (
        str(type(spec)) + " is not a TensorSpec")
    if spec.ndim > 0:
        stacked_shape = list(copy.copy(spec.shape))
        stacked_shape[self._stack_axis] = (
            stacked_shape[self._stack_axis] * self._stack_size)
        stacked_shape = tuple(stacked_shape)
    else:
        stacked_shape = (self._stack_size, )
    if not spec.is_bounded():
        return alf.TensorSpec(stacked_shape, spec.dtype)
    else:
        if spec.minimum.shape != ():
            assert spec.minimum.shape == spec.shape
            minimum = np.repeat(
                spec.minimum, repeats=self._stack_size, axis=self._stack_axis)
        else:
            minimum = spec.minimum
        if spec.maximum.shape != ():
            assert spec.maximum.shape == spec.shape
            maximum = np.repeat(
                spec.maximum, repeats=self._stack_size, axis=self._stack_axis)
        else:
            maximum = spec.maximum
        return alf.BoundedTensorSpec(
            stacked_shape, minimum=minimum, maximum=maximum, dtype=spec.dtype)
def create_algorithm(env):
    config = TrainerConfig(root_dir="dummy", unroll_length=5)
    obs_spec = alf.TensorSpec((2, ), dtype='float32')
    action_spec = alf.BoundedTensorSpec(
        shape=(), dtype='int32', minimum=0, maximum=2)

    fc_layer_params = (10, 8, 6)

    actor_network = partial(
        ActorDistributionNetwork,
        fc_layer_params=fc_layer_params,
        discrete_projection_net_ctor=alf.networks.CategoricalProjectionNetwork)
    value_network = partial(ValueNetwork, fc_layer_params=(10, 8, 1))

    alg = ActorCriticAlgorithm(
        observation_spec=obs_spec,
        action_spec=action_spec,
        actor_network_ctor=actor_network,
        value_network_ctor=value_network,
        env=env,
        config=config,
        optimizer=alf.optimizers.Adam(lr=1e-2),
        debug_summaries=True,
        name="MyActorCritic")
    return alg
def __init__(self,
             dim,
             size,
             snapshot_only=False,
             normalize=True,
             scale=None,
             usage_decay=None,
             name='MemoryWithUsage'):
    """
    See Methods 2.3 of `Unsupervised Predictive Memory in a Goal-Directed Agent
    <https://arxiv.org/abs/1803.10760>`_

    Args:
        dim (int): dimension of memory content
        size (int): number of memory slots
        snapshot_only (bool): If True, only keeps the last snapshot of the
            memory instead of keeping all the memory snapshots at every step.
            If True, gradient cannot be propagated to the writer.
        normalize (bool): If True, use cosine similarity, otherwise use dot
            product.
        scale (None|float): Scale the similarity by this. If ``scale`` is
            None, a default value is used based on ``normalize``. If
            ``normalize`` is True, ``scale`` defaults to 5.0. If ``normalize``
            is False, ``scale`` defaults to ``1/sqrt(dim)``.
        usage_decay (None|float): The usage will be scaled by this factor at
            every ``write`` call. If None, it defaults to ``1 - 1 / size``.
    """
    self._normalize = normalize
    if scale is None:
        if normalize:
            scale = 5.0
        else:
            scale = 1. / math.sqrt(dim)
    self._scale = scale
    self._built = False
    self._snapshot_only = snapshot_only
    if usage_decay is None:
        usage_decay = 1. - 1. / size
    self._usage_decay = usage_decay
    state_spec = (alf.TensorSpec((size, dim), dtype=torch.float32),
                  alf.TensorSpec((size, ), dtype=torch.float32))
    super(MemoryWithUsage, self).__init__(
        dim, size, state_spec=state_spec, name=name)
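# Standalone sketch of the similarity controlled by ``normalize`` and
# ``scale`` above: attention weights over memory slots come from a (optionally
# cosine-normalized) dot product scaled by ``scale``. This is an illustration
# only, not the class's actual read implementation.
import torch
import torch.nn.functional as F

def _similarity(query, memory, normalize=True, scale=5.0):
    """query: [B, dim], memory: [B, size, dim] -> attention weights [B, size]."""
    if normalize:
        query = F.normalize(query, dim=-1)
        memory = F.normalize(memory, dim=-1)
    sim = scale * torch.einsum('bd,bsd->bs', query, memory)
    return torch.softmax(sim, dim=-1)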
def test_transformer_network(self, centralized_memory=True):
    d_model = 32
    core_size = 2
    memory_size = 128
    num_memory_layers = 8
    input_tensor_spec = [
        alf.TensorSpec((), dtype=torch.int64),
        alf.TensorSpec((3, 7, 7), dtype=torch.float32)
    ]
    input_preprocessors = [
        nn.Sequential(
            nn.Embedding(100, d_model), alf.layers.Reshape((1, d_model))),
        nn.Sequential(
            alf.layers.Conv2D(3, d_model, kernel_size=1),
            alf.layers.Reshape((d_model, 49)), alf.layers.Transpose())
    ]
    transformer = TransformerNetwork(
        input_tensor_spec,
        memory_size=memory_size,
        core_size=core_size,
        num_prememory_layers=2,
        num_memory_layers=num_memory_layers,
        num_attention_heads=8,
        d_ff=d_model,
        centralized_memory=centralized_memory,
        input_preprocessors=input_preprocessors)
    state_spec = transformer.state_spec
    if centralized_memory:
        self.assertEqual(len(state_spec), 1)
        self.assertEqual(state_spec[0][0].shape, (memory_size, d_model))
    else:
        self.assertEqual(len(state_spec), 8)
        for i in range(num_memory_layers):
            self.assertEqual(state_spec[i][0].shape, (memory_size, d_model))

    batch_size = 64
    x = [
        torch.randint(100, size=(batch_size, )),
        torch.rand((batch_size, 3, 7, 7))
    ]
    state = alf.utils.spec_utils.zeros_from_spec(transformer.state_spec,
                                                 batch_size)
    y, state = transformer(x, state)
    self.assertEqual(y.shape, (batch_size, core_size * d_model))
def __init__(self, batch_size, obs_shape=(2, )):
    super().__init__()
    self._batch_size = batch_size
    self._rewards = torch.tensor([0.5, 1.0, -1.])
    self._observation_spec = alf.TensorSpec(obs_shape, dtype='float32')
    self._action_spec = alf.BoundedTensorSpec(
        shape=(), dtype='int64', minimum=0, maximum=2)
    self.reset()
def env_info_spec(self):
    return {
        "player0_win": alf.TensorSpec(()),
        "player1_win": alf.TensorSpec(()),
        "player0_pass": alf.TensorSpec(()),
        "player1_pass": alf.TensorSpec(()),
        "draw": alf.TensorSpec(()),
        "invalid_move": alf.TensorSpec(()),
        "too_long": alf.TensorSpec(()),
        "bad_move": alf.TensorSpec(()),
    }
def reward_spec(self):
    """Defines the reward provided by the environment.

    The reward of most environments is a scalar, so we provide a default
    implementation which returns a scalar spec.

    Returns:
        alf.TensorSpec
    """
    return alf.TensorSpec(())
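# A subclass with a vector-valued reward would override this default; e.g. a
# hypothetical environment emitting a two-dimensional reward (task reward plus
# a shaping term):
def reward_spec(self):
    return alf.TensorSpec((2, ))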
def __init__(self, batch_size):
    self._batch_size = batch_size
    self._observation_spec = alf.TensorSpec((3, 3))
    self._action_spec = alf.BoundedTensorSpec((),
                                              minimum=0,
                                              maximum=8,
                                              dtype=torch.int64)
    self._line_x = torch.tensor([[0, 0, 0], [1, 1, 1], [2, 2, 2], [0, 1, 2],
                                 [0, 1, 2], [0, 1, 2], [0, 1, 2],
                                 [0, 1, 2]]).unsqueeze(0)
    self._line_y = torch.tensor([[0, 1, 2], [0, 1, 2], [0, 1, 2], [0, 0, 0],
                                 [1, 1, 1], [2, 2, 2], [0, 1, 2],
                                 [2, 1, 0]]).unsqueeze(0)
    self._B = torch.arange(self._batch_size)
    self._empty_board = self._observation_spec.zeros()
    self._boards = self._observation_spec.zeros((self._batch_size, ))
    self._env_ids = torch.arange(batch_size)
    self._player_0 = torch.tensor(-1.)
    self._player_1 = torch.tensor(1.)
def __init__(self,
             input_tensor_spec,
             output_tensor_spec=alf.TensorSpec((3, 64, 64)),
             name='ResnetDecodingNetwork'):
    """
    Args:
        input_tensor_spec (TensorSpec): input latent spec.
        output_tensor_spec (TensorSpec): desired output shape. Height and
            width need to be divisible by 8.
    """
    super().__init__(input_tensor_spec, name=name)
    c, h, w = output_tensor_spec.shape
    assert h % 8 == 0
    assert w % 8 == 0
    dec_layers = []
    relu = torch.relu_
    dec_layers.extend([
        alf.layers.FC(input_tensor_spec.shape[0], 500, activation=relu),
        alf.layers.FC(500, h * w, activation=relu),
        alf.layers.Reshape((64, h // 8, w // 8))
    ])
    for stride in reversed([2, 1, 2, 1, 2, 1]):
        dec_layers.append(
            alf.layers.BottleneckBlock(
                in_channels=64,
                kernel_size=3,
                filters=(64, 32, 64),
                stride=stride,
                transpose=True))
    dec_layers.append(
        alf.layers.ConvTranspose2D(
            in_channels=64,
            out_channels=3,
            kernel_size=1,
            activation=torch.sigmoid))
    self._model = nn.Sequential(*dec_layers)
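# Usage sketch (assumptions: ``ResnetDecodingNetwork`` is in scope and follows
# the common ALF calling convention ``output, state = net(input, state)`` with
# a default empty state).
import alf

latent_spec = alf.TensorSpec((8, ))
decoder = ResnetDecodingNetwork(latent_spec)   # default output spec (3, 64, 64)
img, _ = decoder(latent_spec.randn((16, )))    # a batch of 16 latents
print(img.shape)                               # expected (16, 3, 64, 64)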
def __init__(self,
             observation_spec,
             stack_size=4,
             stack_axis=0,
             fields=None):
    """Create a FrameStacker object.

    Args:
        observation_spec (nested TensorSpec): describing the observation in
            timestep
        stack_size (int): stack so many frames
        stack_axis (int): the dimension to stack the observation.
        fields (list[str]): fields to be stacked. A field str is a multi-level
            path denoted by "A.B.C". If None, then the non-nested observation
            is stacked.
    """
    assert stack_size >= 1, (
        "stack_size should be an integer greater than or equal to 1")
    self._stack_axis = stack_axis
    self._stack_size = stack_size
    self._frames = dict()
    self._fields = fields or [None]
    self._exp_fields = []
    prev_frames_spec = []
    stacked_observation_spec = observation_spec
    for field in self._fields:
        if field is not None:
            exp_field = 'observation.' + field
        else:
            exp_field = 'observation'
        self._exp_fields.append(exp_field)
        spec = alf.nest.get_field(observation_spec, field)
        prev_frames_spec.append([spec] * (self._stack_size - 1))
        stacked_observation_spec = alf.nest.transform_nest(
            stacked_observation_spec, field, self._make_stacked_spec)
    super().__init__(
        transformed_observation_spec=stacked_observation_spec,
        state_spec=FrameStackState(
            steps=alf.TensorSpec((), dtype=torch.int64),
            prev_frames=prev_frames_spec))
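# Quick check of how FrameStacker transforms specs (it mirrors the behavior
# exercised by the test further below): with stack_size=3 and stack_axis=0, a
# (5, 6) observation becomes (15, 6) and a scalar becomes (3,).
import alf

stacker = FrameStacker(
    dict(scalar=alf.TensorSpec(()), matrix=alf.TensorSpec((5, 6))),
    stack_size=3,
    stack_axis=0,
    fields=['scalar', 'matrix'])
spec = stacker.transformed_observation_spec
print(spec['scalar'].shape, spec['matrix'].shape)  # (3,) (15, 6)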
def __init__(self, *args):
    super().__init__(*args)
    alf.set_default_device("cpu")  # spawn forking is required to use cuda.
    self.data_spec = DataItem(
        env_id=alf.TensorSpec(shape=(), dtype=torch.int64),
        x=alf.TensorSpec(shape=(self.dim, ), dtype=torch.float32),
        t=alf.TensorSpec(shape=(), dtype=torch.int32),
        o=dict({
            "a": alf.TensorSpec(shape=(), dtype=torch.float32),
            "g": alf.TensorSpec(shape=(), dtype=torch.float32)
        }),
        reward=alf.TensorSpec(shape=(), dtype=torch.float32))
def observation_spec(self):
    return alf.TensorSpec([7])
def observation_spec(self):
    return alf.TensorSpec([self._max_num_collisions, 3])
def observation_spec(self):
    return alf.TensorSpec([len(self._future_indices), 3])
def observation_spec(self):
    return alf.TensorSpec([self._max_num_detections, 4])
def test_frame_stacker(self, stack_axis=0):
    data_spec = DataItem(
        step_type=alf.TensorSpec((), dtype=torch.int32),
        observation=dict(
            scalar=alf.TensorSpec(()),
            vector=alf.TensorSpec((7, )),
            matrix=alf.TensorSpec((5, 6)),
            tensor=alf.TensorSpec((2, 3, 4))))
    replay_buffer = ReplayBuffer(
        data_spec=data_spec,
        num_environments=2,
        max_length=1024,
        num_earliest_frames_ignored=2)
    frame_stacker = FrameStacker(
        data_spec.observation,
        stack_size=3,
        stack_axis=stack_axis,
        fields=['scalar', 'vector', 'matrix', 'tensor'])

    new_spec = frame_stacker.transformed_observation_spec
    self.assertEqual(new_spec['scalar'].shape, (3, ))
    self.assertEqual(new_spec['vector'].shape, (21, ))
    if stack_axis == -1:
        self.assertEqual(new_spec['matrix'].shape, (5, 18))
        self.assertEqual(new_spec['tensor'].shape, (2, 3, 12))
    elif stack_axis == 0:
        self.assertEqual(new_spec['matrix'].shape, (15, 6))
        self.assertEqual(new_spec['tensor'].shape, (6, 3, 4))

    def _step_type(t, period):
        if t % period == 0:
            return StepType.FIRST
        if t % period == period - 1:
            return StepType.LAST
        return StepType.MID

    observation = alf.nest.map_structure(
        lambda spec: spec.randn((1000, 2)), data_spec.observation)
    state = common.zero_tensor_from_nested_spec(frame_stacker.state_spec, 2)

    def _get_stacked_data(t, b):
        if stack_axis == -1:
            return dict(
                scalar=observation['scalar'][t, b],
                vector=observation['vector'][t, b].reshape(-1),
                matrix=observation['matrix'][t, b].transpose(0, 1).reshape(
                    5, 18),
                tensor=observation['tensor'][t, b].permute(1, 2, 0, 3).reshape(
                    2, 3, 12))
        elif stack_axis == 0:
            return dict(
                scalar=observation['scalar'][t, b],
                vector=observation['vector'][t, b].reshape(-1),
                matrix=observation['matrix'][t, b].reshape(15, 6),
                tensor=observation['tensor'][t, b].reshape(6, 3, 4))

    def _check_equal(stacked, expected, b):
        self.assertEqual(stacked['scalar'][b], expected['scalar'])
        self.assertEqual(stacked['vector'][b], expected['vector'])
        self.assertEqual(stacked['matrix'][b], expected['matrix'])
        self.assertEqual(stacked['tensor'][b], expected['tensor'])

    for t in range(1000):
        batch = DataItem(
            step_type=torch.tensor([_step_type(t, 17),
                                    _step_type(t, 22)]),
            observation=alf.nest.map_structure(lambda x: x[t], observation))
        replay_buffer.add_batch(batch)
        timestep, state = frame_stacker.transform_timestep(batch, state)
        if t == 0:
            for b in (0, 1):
                expected = _get_stacked_data([0, 0, 0], b)
                _check_equal(timestep.observation, expected, b)
        if t == 1:
            for b in (0, 1):
                expected = _get_stacked_data([0, 0, 1], b)
                _check_equal(timestep.observation, expected, b)
        if t == 2:
            for b in (0, 1):
                expected = _get_stacked_data([0, 1, 2], b)
                _check_equal(timestep.observation, expected, b)
        if t == 16:
            for b in (0, 1):
                expected = _get_stacked_data([14, 15, 16], b)
                _check_equal(timestep.observation, expected, b)
        if t == 17:
            for b, s in ((0, [17, 17, 17]), (1, [15, 16, 17])):
                expected = _get_stacked_data(s, b)
                _check_equal(timestep.observation, expected, b)
        if t == 18:
            for b, s in ((0, [17, 17, 18]), (1, [16, 17, 18])):
                expected = _get_stacked_data(s, b)
                _check_equal(timestep.observation, expected, b)
        if t == 22:
            for b, s in ((0, [20, 21, 22]), (1, [22, 22, 22])):
                expected = _get_stacked_data(s, b)
                _check_equal(timestep.observation, expected, b)

    batch_info = BatchInfo(
        env_ids=torch.tensor([0, 1, 0, 1], dtype=torch.int64),
        positions=torch.tensor([0, 1, 18, 22], dtype=torch.int64))

    # [4, 2, ...]
    experience = replay_buffer.get_field(
        '', batch_info.env_ids.unsqueeze(-1),
        batch_info.positions.unsqueeze(-1) + torch.arange(2))
    experience = experience._replace(
        batch_info=batch_info, replay_buffer=replay_buffer)
    experience = frame_stacker.transform_experience(experience)
    expected = _get_stacked_data([0, 0, 0], 0)
    _check_equal(experience.observation, expected, (0, 0))
    expected = _get_stacked_data([0, 0, 1], 0)
    _check_equal(experience.observation, expected, (0, 1))
    expected = _get_stacked_data([0, 0, 1], 1)
    _check_equal(experience.observation, expected, (1, 0))
    expected = _get_stacked_data([0, 1, 2], 1)
    _check_equal(experience.observation, expected, (1, 1))
    expected = _get_stacked_data([17, 17, 18], 0)
    _check_equal(experience.observation, expected, (2, 0))
    expected = _get_stacked_data([17, 18, 19], 0)
    _check_equal(experience.observation, expected, (2, 1))
    expected = _get_stacked_data([22, 22, 22], 1)
    _check_equal(experience.observation, expected, (3, 0))
    expected = _get_stacked_data([22, 22, 23], 1)
    _check_equal(experience.observation, expected, (3, 1))
def test_mcts_algorithm(self):
    observation_spec = alf.TensorSpec((3, 3))
    action_spec = alf.BoundedTensorSpec((),
                                        dtype=torch.int64,
                                        minimum=0,
                                        maximum=8)
    model = TicTacToeModel()
    time_step = TimeStep(step_type=torch.tensor([StepType.MID]))

    # board situations and expected actions
    # yapf: disable
    cases = [
        ([[1, -1,  1],
          [1, -1, -1],
          [0,  0,  1]], 6),
        ([[0,  0,  0],
          [0, -1, -1],
          [0,  1,  0]], 3),
        ([[ 1, -1, -1],
          [-1, -1,  0],
          [ 0,  1,  1]], 6),
        ([[-1,  0,  1],
          [ 0, -1, -1],
          [ 0,  0,  1]], 3),
        ([[0, 0,  0],
          [0, 0,  0],
          [0, 0, -1]], 4),
        ([[0,  0, 0],
          [0, -1, 0],
          [0,  0, 0]], (0, 2, 6, 8)),
        ([[0,  0,  0],
          [0,  1, -1],
          [1, -1, -1]], 2),
    ]
    # yapf: enable

    def _create_mcts(observation_spec, action_spec, num_simulations):
        return MCTSAlgorithm(
            observation_spec,
            action_spec,
            discount=1.0,
            root_dirichlet_alpha=100.,
            root_exploration_fraction=0.25,
            num_simulations=num_simulations,
            pb_c_init=1.25,
            pb_c_base=19652,
            visit_softmax_temperature_fn=VisitSoftmaxTemperatureByMoves(
                [(0, 1.0), (10, 0.0001)]),
            known_value_bounds=(-1, 1),
            is_two_player_game=True)

    # test cases serially
    for observation, action in cases:
        observation = torch.tensor([observation], dtype=torch.float32)
        state = MCTSState(steps=(observation != 0).sum(dim=(1, 2)))
        # We use a varying num_simulations instead of a fixed large number
        # such as 2000 to make the test faster.
        num_simulations = int((observation == 0).sum().cpu()) * 200
        mcts = _create_mcts(
            observation_spec, action_spec, num_simulations=num_simulations)
        mcts.set_model(model)
        alg_step = mcts.predict_step(
            time_step._replace(observation=observation), state)
        print(observation, alg_step.output, alg_step.info)
        if type(action) == tuple:
            self.assertTrue(alg_step.output[0] in action)
        else:
            self.assertEqual(alg_step.output[0], action)

    # test batch predict
    observation = torch.tensor([case[0] for case in cases],
                               dtype=torch.float32)
    state = MCTSState(steps=(observation != 0).sum(dim=(1, 2)))
    mcts = _create_mcts(observation_spec, action_spec, num_simulations=2000)
    mcts.set_model(model)
    alg_step = mcts.predict_step(
        time_step._replace(
            step_type=torch.tensor([StepType.MID] * len(cases)),
            observation=observation), state)
    for i, (observation, action) in enumerate(cases):
        if type(action) == tuple:
            self.assertTrue(alg_step.output[i] in action)
        else:
            self.assertEqual(alg_step.output[i], action)
def reward_spec(self):
    return alf.TensorSpec(())
def __init__(self,
             parent_actor,
             sensor_type='sensor.camera.rgb',
             xyz=(1.6, 0., 1.7),
             pyr=(0., 0., 0.),
             attachment_type='rigid',
             fov=90.0,
             fstop=1.4,
             gamma=2.2,
             image_size_x=640,
             image_size_y=480,
             iso=1200.0):
    """
    Args:
        parent_actor (carla.Actor): the parent actor of this sensor
        sensor_type (str): 'sensor.camera.rgb', 'sensor.camera.depth',
            'sensor.camera.semantic_segmentation'
        attachment_type (str): There are two types of attachment.
            'rigid': the object follows its parent position strictly.
            'spring_arm': the object expands or retracts depending on the
            camera situation.
        xyz (tuple[float]): the attachment position (x, y, z) relative to the
            parent_actor.
        pyr (tuple[float]): the attachment rotation (pitch, yaw, roll) in
            degrees.
        fov (float): horizontal field of view in degrees.
        image_size_x (int): image width in pixels.
        image_size_y (int): image height in pixels.
        gamma (float): target gamma value of the camera.
        iso (float): the camera sensor sensitivity.
    """
    super().__init__(parent_actor)
    attachment_type_map = {
        'rigid': carla.AttachmentType.Rigid,
        'spring_arm': carla.AttachmentType.SpringArm,
    }
    assert attachment_type in attachment_type_map, (
        "Unknown attachment_type %s" % attachment_type)
    self._attachment_type = attachment_type_map[attachment_type]
    self._camera_transform = carla.Transform(
        carla.Location(*xyz), carla.Rotation(*pyr))
    self._sensor_type = sensor_type

    sensor_map = {
        'sensor.camera.rgb': (carla.ColorConverter.Raw, 3),
        'sensor.camera.depth': (carla.ColorConverter.LogarithmicDepth, 1),
        'sensor.camera.semantic_segmentation': (carla.ColorConverter.Raw, 1),
    }
    assert sensor_type in sensor_map, "Unknown sensor type %s" % sensor_type
    conversion, num_channels = sensor_map[sensor_type]
    self._conversion = conversion
    self._observation_spec = alf.TensorSpec(
        [num_channels, image_size_y, image_size_x], dtype='uint8')

    world = self._parent.get_world()
    bp = world.get_blueprint_library().find(sensor_type)
    attributes = dict(
        fov=fov,
        fstop=fstop,
        gamma=gamma,
        image_size_x=image_size_x,
        image_size_y=image_size_y,
        iso=iso)
    for name, val in attributes.items():
        if bp.has_attribute(name):
            bp.set_attribute(name, str(val))

    self._sensor = self._parent.get_world().spawn_actor(
        bp,
        self._camera_transform,
        attach_to=self._parent,
        attachment_type=self._attachment_type)
    # We need to pass the lambda a weak reference to self to avoid circular
    # reference.
    weak_self = weakref.ref(self)
    self._sensor.listen(
        lambda image: CameraSensor._parse_image(weak_self, image))
    self._frame = 0
    self._image = np.zeros([num_channels, image_size_y, image_size_x],
                           dtype=np.uint8)
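# Construction sketch (assumes a running CARLA server, an already spawned
# vehicle actor named ``vehicle``, and that the base sensor class exposes an
# ``observation_spec()`` accessor; all three are assumptions here). A depth
# camera yields a single-channel uint8 observation of shape
# [1, image_size_y, image_size_x] per the spec built above.
camera = CameraSensor(
    parent_actor=vehicle,
    sensor_type='sensor.camera.depth',
    xyz=(1.6, 0., 1.7),
    image_size_x=320,
    image_size_y=240)
print(camera.observation_spec())  # expected TensorSpec([1, 240, 320], uint8)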
def forward(self, experience, value, target_value):
    """Calculate the loss.

    The first dimension of all the tensors is the time dimension and the
    second dimension is the batch dimension.

    Args:
        experience (Experience): experience collected from ``unroll()`` or a
            replay buffer. All tensors are time-major.
        value (torch.Tensor): the time-major tensor for the value at each time
            step. The loss is between this and the calculated return.
        target_value (torch.Tensor): the time-major tensor for the value at
            each time step. This is used to calculate the return.
            ``target_value`` can be the same as ``value``.
    Returns:
        LossInfo: with the ``extra`` field same as ``loss``.
    """
    if self._lambda == 1.0:
        returns = value_ops.discounted_return(
            rewards=experience.reward,
            values=target_value,
            step_types=experience.step_type,
            discounts=experience.discount * self._gamma)
    elif self._lambda == 0.0:
        returns = value_ops.one_step_discounted_return(
            rewards=experience.reward,
            values=target_value,
            step_types=experience.step_type,
            discounts=experience.discount * self._gamma)
    else:
        advantages = value_ops.generalized_advantage_estimation(
            rewards=experience.reward,
            values=target_value,
            step_types=experience.step_type,
            discounts=experience.discount * self._gamma,
            td_lambda=self._lambda)
        returns = advantages + target_value[:-1]

    value = value[:-1]
    if self._normalize_target:
        if self._target_normalizer is None:
            self._target_normalizer = AdaptiveNormalizer(
                alf.TensorSpec(value.shape[2:]),
                auto_update=False,
                debug_summaries=self._debug_summaries,
                name=self._name + ".target_normalizer")
        self._target_normalizer.update(returns)
        returns = self._target_normalizer.normalize(returns)
        value = self._target_normalizer.normalize(value)

    if self._debug_summaries and alf.summary.should_record_summaries():
        mask = experience.step_type[:-1] != StepType.LAST
        with alf.summary.scope(self._name):

            def _summarize(v, r, td, suffix):
                alf.summary.scalar(
                    "explained_variance_of_return_by_value" + suffix,
                    tensor_utils.explained_variance(v, r, mask))
                safe_mean_hist_summary('values' + suffix, v, mask)
                safe_mean_hist_summary('returns' + suffix, r, mask)
                safe_mean_hist_summary("td_error" + suffix, td, mask)

            if value.ndim == 2:
                _summarize(value, returns, returns - value, '')
            else:
                td = returns - value
                for i in range(value.shape[2]):
                    suffix = '/' + str(i)
                    _summarize(value[..., i], returns[..., i], td[..., i],
                               suffix)

    loss = self._td_error_loss_fn(returns.detach(), value)

    if loss.ndim == 3:
        # Multidimensional reward. Average the critic loss over all reward
        # dimensions.
        loss = loss.mean(dim=2)

    # The shape of the loss expected by Algorithm.update_with_gradient is
    # [T, B], so we need to augment it with additional zeros.
    loss = tensor_utils.tensor_extend_zero(loss)
    return LossInfo(loss=loss, extra=loss)
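# Hedged numeric sketch of the lambda-return relation used above (indexing
# conventions simplified; ALF's value_ops additionally handles step types and
# time-major batches). With TD errors delta_t = r_{t+1} + gamma * V(s_{t+1})
# - V(s_t) and GAE advantages A_t = sum_l (gamma * lambda)^l * delta_{t+l},
# the training target is returns_t = A_t + V_target(s_t), which is what
# ``returns = advantages + target_value[:-1]`` computes.
import torch

gamma, lam = 0.9, 0.8
r = torch.tensor([0., 1., 0.5, 2.])       # rewards received at steps 1..T
v = torch.tensor([1., 2., 1.5, 0.5, 3.])  # values V(s_0)..V(s_T)
delta = r + gamma * v[1:] - v[:-1]
adv = torch.zeros_like(delta)
gae = 0.
for t in reversed(range(len(delta))):
    gae = delta[t] + gamma * lam * gae
    adv[t] = gae
returns = adv + v[:-1]                    # lambda-returns, one per non-final step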