Example #1
    def __init__(self,
                 states_spec,
                 actions_spec,
                 capacity,
                 random_sampling=True):
        super(Replay, self).__init__(states_spec=states_spec,
                                     actions_spec=actions_spec)
        self.capacity = capacity
        self.states = {
            name: np.zeros((capacity, ) + tuple(state['shape']),
                           dtype=util.np_dtype(state['type']))
            for name, state in states_spec.items()
        }
        self.next_states = {
            name: np.zeros((capacity, ) + tuple(state['shape']),
                           dtype=util.np_dtype(state['type']))
            for name, state in states_spec.items()
        }
        self.internals, self.next_internals = None, None
        self.actions = {
            name: np.zeros((capacity, ) + tuple(action['shape']),
                           dtype=util.np_dtype(action['type']))
            for name, action in actions_spec.items()
        }
        self.terminal = np.zeros((capacity, ), dtype=util.np_dtype('bool'))
        self.reward = np.zeros((capacity, ), dtype=util.np_dtype('float'))

        self.size = 0
        self.index = 0
        self.random_sampling = random_sampling
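
The pattern above preallocates one NumPy array per state and action, with util.np_dtype resolving the spec's type string to a NumPy dtype. A minimal, self-contained sketch (pure NumPy, not the library's actual classes or spec format) of how such preallocated circular buffers are typically filled and sampled:

    import numpy as np

    class MiniReplay:
        def __init__(self, capacity, state_shape):
            self.capacity = capacity
            self.states = np.zeros((capacity,) + state_shape, dtype=np.float32)
            self.rewards = np.zeros((capacity,), dtype=np.float32)
            self.terminals = np.zeros((capacity,), dtype=np.bool_)
            self.size = 0    # number of valid entries
            self.index = 0   # next write position (modulo capacity)

        def add(self, state, reward, terminal):
            # Overwrite the oldest entry once the buffer is full.
            self.states[self.index] = state
            self.rewards[self.index] = reward
            self.terminals[self.index] = terminal
            self.index = (self.index + 1) % self.capacity
            self.size = min(self.size + 1, self.capacity)

        def sample(self, batch_size):
            # Uniform random sampling over the valid entries.
            indices = np.random.randint(self.size, size=batch_size)
            return self.states[indices], self.rewards[indices], self.terminals[indices]
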
Example #2
    def long_unittest(self, horizon):
        agent, environment = self.prepare(min_timesteps=3, reward_estimation=dict(horizon=horizon))

        states = environment.reset()
        actions = agent.act(states=states)
        states, terminal, reward = environment.execute(actions=actions)
        _, horizon_output1 = agent.observe(terminal=terminal, reward=reward, query='horizon')
        self.assertIsInstance(horizon_output1, util.np_dtype(dtype='long'))

        actions = agent.act(states=states)
        states, terminal, reward = environment.execute(actions=actions)
        _, horizon_output2 = agent.observe(terminal=terminal, reward=reward, query='horizon')
        if not isinstance(horizon, dict) or horizon['type'] == 'constant':
            self.assertEqual(horizon_output2, horizon_output1)
        else:
            self.assertNotEqual(horizon_output2, horizon_output1)

        actions = agent.act(states=states)
        _, terminal, reward = environment.execute(actions=actions)
        horizon_input = 3
        _, horizon_output = agent.observe(
            terminal=terminal, reward=reward, query='horizon',
            **{'estimator/horizon': horizon_input}
        )
        self.assertEqual(
            horizon_output, np.asarray(horizon_input, dtype=util.np_dtype(dtype='long'))
        )

        agent.close()
        environment.close()

        self.finished_test()
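
The assertions above rely on the queried 'horizon' value being returned as a NumPy scalar whose type matches util.np_dtype(dtype='long'). A tiny sketch of why the assertIsInstance check works, assuming 'long' maps to np.int64 (the concrete dtype is a library-internal detail):

    import numpy as np

    value = np.asarray(3, dtype=np.int64)[()]   # indexing a 0-d array yields a NumPy scalar
    assert isinstance(value, np.int64)          # the check the test performs via assertIsInstance
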
Example #3
    def tf_initialize(self):
        super().tf_initialize()

        # Value buffers
        self.buffers = OrderedDict()
        for name, spec in self.values_spec.items():
            if util.is_nested(name=name):
                self.buffers[name] = OrderedDict()
                for inner_name, spec in spec.items():
                    shape = (self.capacity, ) + spec['shape']
                    self.buffers[name][inner_name] = self.add_variable(
                        name=(inner_name + '-buffer'),
                        dtype=spec['type'],
                        shape=shape,
                        is_trainable=False)
            else:
                shape = (self.capacity, ) + spec['shape']
                if name == 'terminal':
                    # Terminal initialization has to agree with terminal_indices
                    initializer = np.zeros(shape=(self.capacity, ),
                                           dtype=util.np_dtype(dtype='long'))
                    initializer[-1] = 1
                    self.buffers[name] = self.add_variable(
                        name=(name + '-buffer'),
                        dtype=spec['type'],
                        shape=shape,
                        is_trainable=False,
                        initializer=initializer)
                else:
                    self.buffers[name] = self.add_variable(name=(name +
                                                                 '-buffer'),
                                                           dtype=spec['type'],
                                                           shape=shape,
                                                           is_trainable=False)

        # Buffer index (modulo capacity, next index to write to)
        self.buffer_index = self.add_variable(name='buffer-index',
                                              dtype='long',
                                              shape=(),
                                              is_trainable=False,
                                              initializer='zeros')

        # Terminal indices
        # (oldest episode terminals first, initially the only terminal is last index)
        initializer = np.zeros(shape=(self.capacity + 1, ),
                               dtype=util.np_dtype(dtype='long'))
        initializer[0] = self.capacity - 1
        self.terminal_indices = self.add_variable(name='terminal-indices',
                                                  dtype='long',
                                                  shape=(self.capacity + 1, ),
                                                  is_trainable=False,
                                                  initializer=initializer)

        # Episode count
        self.episode_count = self.add_variable(name='episode-count',
                                               dtype='long',
                                               shape=(),
                                               is_trainable=False,
                                               initializer='zeros')
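
The terminal buffer is initialized with a 1 in its last slot so that it agrees with terminal_indices, whose first entry points at capacity - 1; both encode that, initially, the only terminal is the last buffer index. The two initializers in isolation (plain NumPy sketch, assuming 'long' resolves to np.int64):

    import numpy as np

    capacity = 8

    # Terminal buffer: all zeros except a sentinel terminal at the last index.
    terminal_init = np.zeros(shape=(capacity,), dtype=np.int64)
    terminal_init[-1] = 1

    # Terminal indices: oldest episode terminals first; initially the only
    # terminal is the last buffer index, so the first entry is capacity - 1.
    terminal_indices_init = np.zeros(shape=(capacity + 1,), dtype=np.int64)
    terminal_indices_init[0] = capacity - 1
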
Example #4
    def get_batch(self, batch_size):
        """
        Samples a batch of the specified size according to priority.

        Args:
            batch_size: Number of experiences to sample.

        Returns: A dict containing states, actions, rewards, terminals and internal states.

        """
        assert not self.batch_indices

        states = {
            name: np.zeros((batch_size, ) + tuple(shape), dtype=dtype)
            for name, (shape, dtype) in self.state_spec.items()
        }
        actions = {
            name: np.zeros((batch_size, ), dtype=dtype)
            for name, dtype in self.action_spec.items()
        }
        rewards = np.zeros((batch_size, ), dtype=util.np_dtype('float'))
        terminals = np.zeros((batch_size, ), dtype=util.np_dtype('bool'))
        internals = [
            np.zeros((batch_size, ) + shape, dtype)
            for shape, dtype in self.internal_spec
        ]

        zero_priority_index = self.positive_priority_index + 1
        for n in xrange(batch_size):
            if zero_priority_index < len(self.observations):
                _, observation = self.observations[zero_priority_index]
                index = zero_priority_index
                zero_priority_index += 1
            else:
                while True:
                    sample = random()
                    for index, (priority,
                                observation) in enumerate(self.observations):
                        sample -= priority / self.sum_priorities
                        if sample < 0.0:
                            break
                    if index not in self.batch_indices:
                        break

            for name, state in states.items():
                state[n] = observation[0][name]
            for name, action in actions.items():
                action[n] = observation[1][name]
            rewards[n] = observation[2]
            terminals[n] = observation[3]
            for k, internal in enumerate(internals):
                internal[n] = observation[4][k]
            self.batch_indices.append(index)

        return dict(states=states,
                    actions=actions,
                    rewards=rewards,
                    terminals=terminals,
                    internals=internals)
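
The inner while-loop implements proportional prioritized sampling: it draws a uniform number and walks the observation list, subtracting each normalized priority until the draw goes negative. The same selection step as a standalone sketch (names are illustrative, not the library's API):

    from random import random

    def sample_proportional(priorities, excluded):
        # Pick an index with probability priority / sum(priorities),
        # retrying if the index was already chosen for this batch.
        total = sum(priorities)
        while True:
            draw = random()
            for index, priority in enumerate(priorities):
                draw -= priority / total
                if draw < 0.0:
                    break
            if index not in excluded:
                return index

    # Example: index 2 carries half of the total priority, so it is
    # sampled roughly half of the time.
    counts = [0, 0, 0]
    for _ in range(1000):
        counts[sample_proportional([1.0, 1.0, 2.0], excluded=set())] += 1
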
Example #5
    def __init__(self, capacity, states_config, actions_config, random_sampling=False):
        super(Replay, self).__init__(capacity, states_config, actions_config)
        self.states = {name: np.zeros((capacity,) + tuple(state.shape), dtype=util.np_dtype(state.type)) for name, state in states_config}
        self.actions = {name: np.zeros((capacity,) + tuple(action.shape), dtype=util.np_dtype('float' if action.continuous else 'int')) for name, action in actions_config}
        self.rewards = np.zeros((capacity,), dtype=util.np_dtype('float'))
        self.terminals = np.zeros((capacity,), dtype=util.np_dtype('bool'))
        self.internals = None
        self.size = 0
        self.index = 0
        self.random_sampling = random_sampling
Example #6
    def reset(self):
        """
        Resets all agent buffers and discards unfinished episodes.
        """
        self.buffer_indices = np.zeros(shape=(self.parallel_interactions, ),
                                       dtype=util.np_dtype(dtype='int'))
        self.timestep_completed = np.ones(shape=(self.parallel_interactions, ),
                                          dtype=util.np_dtype(dtype='bool'))

        self.timesteps, self.episodes, self.updates = self.model.reset()
Example #7
    def __init__(self, capacity, states_config, actions_config):
        self.capacity = capacity

        self.states = {name: np.zeros((capacity,) + tuple(state.shape), dtype=util.np_dtype(state.type)) for name, state in states_config}
        self.actions = {name: np.zeros((capacity,), dtype=util.np_dtype('float' if action.continuous else 'int')) for name, action in actions_config}
        self.rewards = np.zeros((capacity,), dtype=util.np_dtype('float'))
        self.terminals = np.zeros((capacity,), dtype=util.np_dtype('bool'))
        self.internals = None

        self.size = 0
        self.index = 0
Example #8
    def __init__(self, capacity, states_config, actions_config, prioritization_weight=1.0):
        self.capacity = capacity
        self.prioritization_weight = prioritization_weight

        self.state_spec = {name: (tuple(state.shape), util.np_dtype(state.type)) for name, state in states_config}
        self.action_spec = {name: util.np_dtype('float' if action.continuous else 'int') for name, action in actions_config}
        self.internal_spec = None
        self.observations = list()

        self.sum_priorities = 0.0
        self.positive_priority_index = -1
        self.batch_indices = list()
Example #9
    def reset(self):
        """
        Resets the agent to start a new episode.
        """
        self.buffer_indices = np.zeros(shape=(self.parallel_interactions, ),
                                       dtype=util.np_dtype(dtype='int'))
        self.timesteps, self.episodes, self.updates = self.model.reset()
Example #10
    def reset(self, independent=False, evaluation=False):
        """
        Resets all agent buffers, or only terminates and resets an episode in
        independent/evaluation mode if the corresponding argument is set.

        Args:
            independent (bool): Whether to terminate an episode in independent mode
                (<span style="color:#00C000"><b>default</b></span>: false).
            evaluation (bool): Whether to terminate an episode in evaluation mode, implies
                independent
                (<span style="color:#00C000"><b>default</b></span>: false).
        """
        if evaluation:
            if independent:
                raise TensorforceError(
                    message="Agent.reset argument independent is implied by and thus should not "
                            "be used together with argument evaluation."
                )
            independent = True

        if not independent:
            self.buffer_indices = np.zeros(
                shape=(self.parallel_interactions,), dtype=util.np_dtype(dtype='int')
            )

        self.timesteps, self.episodes, self.updates = self.model.reset(independent=independent)
Example #11
    def float_unittest(self, exploration):
        agent, environment = self.prepare(min_timesteps=3, exploration=exploration)

        states = environment.reset()
        actions, exploration_output1 = agent.act(states=states, query='exploration')
        self.assertIsInstance(exploration_output1, util.np_dtype(dtype='float'))
        states, terminal, reward = environment.execute(actions=actions)
        agent.observe(terminal=terminal, reward=reward)

        if not isinstance(exploration, dict) or exploration['type'] == 'constant':
            actions, exploration_output2 = agent.act(states=states, query='exploration')
            self.assertEqual(exploration_output2, exploration_output1)
            states, terminal, reward = environment.execute(actions=actions)
            agent.observe(terminal=terminal, reward=reward)

        else:
            actions, exploration_output2 = agent.act(states=states, query='exploration')
            self.assertNotEqual(exploration_output2, exploration_output1)
            states, terminal, reward = environment.execute(actions=actions)
            agent.observe(terminal=terminal, reward=reward)

        exploration_input = 0.5
        actions, exploration_output = agent.act(
            states=states, query='exploration', exploration=exploration_input
        )
        self.assertEqual(exploration_output, exploration_input)

        agent.close()
        environment.close()

        self.finished_test()
Example #12
    def parameter_unittest(self, name, exploration):
        states = dict(type='float', shape=(1,))

        actions = dict(type='int', shape=(), num_values=3)

        agent, environment = self.prepare(
            name=name, states=states, actions=actions, exploration=exploration
        )

        agent.initialize()
        states = environment.reset()

        actions, exploration_output1 = agent.act(states=states, query='exploration')
        self.assertIsInstance(exploration_output1, util.np_dtype(dtype='float'))

        states, terminal, reward = environment.execute(actions=actions)
        agent.observe(terminal=terminal, reward=reward)

        if name != 'constant':
            actions, exploration_output2 = agent.act(states=states, query='exploration')
            self.assertNotEqual(exploration_output2, exploration_output1)

            states, terminal, reward = environment.execute(actions=actions)
            agent.observe(terminal=terminal, reward=reward)

        exploration_input = 0.5
        actions, exploration_output = agent.act(
            states=states, query='exploration', exploration=exploration_input
        )
        self.assertEqual(exploration_output, exploration_input)

        agent.close()
        environment.close()
        sys.stdout.flush()
        self.assertTrue(expr=True)
Example #13
    def reset(self):
        """
        Resets possibly inconsistent internal values, for instance, after saving and restoring an
        agent. Automatically triggered as part of Agent.create/load/initialize/restore.
        """
        # Reset timestep completed
        self.timestep_completed = np.ones(
            shape=(self.parallel_interactions,), dtype=util.np_dtype(dtype='bool')
        )

        # Reset buffers
        for buffer in self.buffers.values():
            for x in buffer:
                x.clear()
        if self.recorder_spec is not None:
            for x in self.recorded.values():
                x.clear()

        # Reset model
        timesteps, episodes, updates = self.model.reset()
        self.timesteps = timesteps.numpy().item()
        self.episodes = episodes.numpy().item()
        self.updates = updates.numpy().item()

        if self.model.saver is not None:
            self.model.save()
Example #14
    def initialize(self):
        """
        Initializes the agent.
        """
        self.is_initialized = True

        # Parallel terminal/reward buffers
        self.terminal_buffers = np.ndarray(shape=(self.parallel_interactions,
                                                  self.buffer_observe),
                                           dtype=util.np_dtype(dtype='long'))
        self.reward_buffers = np.ndarray(shape=(self.parallel_interactions,
                                                self.buffer_observe),
                                         dtype=util.np_dtype(dtype='float'))

        # Parallel buffer indices
        self.buffer_indices = np.zeros(shape=(self.parallel_interactions, ),
                                       dtype=util.np_dtype(dtype='int'))

        self.timesteps = 0
        self.episodes = 0
        self.updates = 0

        if self.recorder_spec is not None:
            self.record_states = OrderedDict(
                ((name, list()) for name in self.states_spec))
            for name, spec in self.actions_spec.items():
                if spec['type'] == 'int':
                    self.record_states[name + '_mask'] = list()
            self.record_actions = OrderedDict(
                ((name, list()) for name in self.actions_spec))
            self.record_terminal = list()
            self.record_reward = list()
            self.num_episodes = 0

        # Setup Model
        if not hasattr(self, 'model'):
            raise TensorforceError.missing(name='Agent', value='model')

        self.model.initialize()
        if self.model.saver_directory is not None:
            file = os.path.join(self.model.saver_directory,
                                self.model.saver_filename + '.json')
            with open(file, 'w') as fp:
                json.dump(obj=self.spec, fp=fp)

        self.reset()
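
Note the two allocation styles above: the terminal/reward buffers use np.ndarray (uninitialized memory, acceptable because every slot is written before it is read), while buffer_indices uses np.zeros because the write positions must start at zero. A small sketch with illustrative sizes and dtypes (the mapping of 'long'/'float'/'int' to concrete NumPy dtypes is library-internal):

    import numpy as np

    parallel_interactions, buffer_observe = 2, 5

    # Uninitialized contents, overwritten before first read.
    terminal_buffers = np.ndarray(shape=(parallel_interactions, buffer_observe), dtype=np.int64)
    reward_buffers = np.ndarray(shape=(parallel_interactions, buffer_observe), dtype=np.float32)

    # Must start at zero: next write position per parallel interaction.
    buffer_indices = np.zeros(shape=(parallel_interactions,), dtype=np.int32)
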
Example #15
    def __init__(self, capacity, states_config, actions_config):
        capacity = int(capacity)
        self.states = dict()
        for name, state in states_config:
            self.states[name] = np.zeros((capacity, ) + tuple(state.shape),
                                         dtype=util.np_dtype(state.type))
        self.actions = dict()
        for name, action in actions_config:
            dtype = util.np_dtype('float' if action.continuous else 'int')
            self.actions[name] = np.zeros((capacity, ), dtype=dtype)
        self.rewards = np.zeros((capacity, ), dtype=util.np_dtype('float'))
        self.terminals = np.zeros((capacity, ), dtype=util.np_dtype('bool'))
        self.internals = None

        self.capacity = capacity
        self.size = 0
        self.index = 0
Example #16
    def get_batch(self, batch_size):
        """
        Samples a batch of the specified size according to priority.

        Args:
            batch_size: Number of experiences to sample.

        Returns: A dict containing states, actions, rewards, terminals and internal states.

        """
        assert not self.batch_indices

        states = {name: np.zeros((batch_size,) + tuple(shape), dtype=dtype) for name, (shape, dtype) in self.state_spec.items()}
        actions = {name: np.zeros((batch_size,), dtype=dtype) for name, dtype in self.action_spec.items()}
        rewards = np.zeros((batch_size,), dtype=util.np_dtype('float'))
        terminals = np.zeros((batch_size,), dtype=util.np_dtype('bool'))
        internals = [np.zeros((batch_size,) + shape, dtype) for shape, dtype in self.internal_spec]

        zero_priority_index = self.positive_priority_index + 1
        for n in xrange(batch_size):
            if zero_priority_index < len(self.observations):
                _, observation = self.observations[zero_priority_index]
                index = zero_priority_index
                zero_priority_index += 1
            else:
                while True:
                    sample = random()
                    for index, (priority, observation) in enumerate(self.observations):
                        sample -= priority / self.sum_priorities
                        if sample < 0.0:
                            break
                    if index not in self.batch_indices:
                        break

            for name, state in states.items():
                state[n] = observation[0][name]
            for name, action in actions.items():
                action[n] = observation[1][name]
            rewards[n] = observation[2]
            terminals[n] = observation[3]
            for k, internal in enumerate(internals):
                internal[n] = observation[4][k]
            self.batch_indices.append(index)

        return dict(states=states, actions=actions, rewards=rewards, terminals=terminals, internals=internals)
Example #17
    def initialize(self):
        # Check whether already initialized
        if self.is_initialized:
            raise TensorforceError(
                message=
                "Agent is already initialized, possibly as part of Agent.create()."
            )
        self.is_initialized = True

        # Act-observe timestep check
        self.timestep_counter = np.zeros(shape=(self.parallel_interactions, ),
                                         dtype=util.np_dtype(dtype='int'))
        self.timestep_completed = np.ones(shape=(self.parallel_interactions, ),
                                          dtype=util.np_dtype(dtype='bool'))

        # Recorder buffers if required
        if self.recorder is not None:
            self.num_episodes = 0

            self.buffers = ListDict()
            self.buffers['terminal'] = [
                list() for _ in range(self.parallel_interactions)
            ]
            self.buffers['reward'] = [
                list() for _ in range(self.parallel_interactions)
            ]

            def function(spec):
                return [list() for _ in range(self.parallel_interactions)]

            self.buffers['states'] = self.states_spec.fmap(function=function,
                                                           cls=ListDict)
            self.buffers['actions'] = self.actions_spec.fmap(function=function,
                                                             cls=ListDict)

            function = (lambda x: list())

            self.recorded = ListDict()
            self.recorded['states'] = self.states_spec.fmap(function=function,
                                                            cls=ListDict)
            self.recorded['actions'] = self.actions_spec.fmap(
                function=function, cls=ListDict)
            self.recorded['terminal'] = list()
            self.recorded['reward'] = list()
Example #18
    def __init__(self, name, capacity, values_spec, device=None, summary_labels=None):
        # Terminal initialization has to agree with terminal_indices
        terminal_initializer = np.zeros(shape=(capacity,), dtype=util.np_dtype(dtype='long'))
        terminal_initializer[-1] = 1
        initializers = OrderedDict(terminal=terminal_initializer)

        super().__init__(
            name=name, capacity=capacity, values_spec=values_spec, initializers=initializers,
            device=device, summary_labels=summary_labels
        )
Example #19
    def internals_init(self):
        internals_init = super().internals_init()

        if self.cell_type == 'gru':
            shape = (self.size, )
        elif self.cell_type == 'lstm':
            shape = (2, self.size)

        stddev = min(0.1, np.sqrt(2.0 / self.size))
        internals_init['state'] = np.random.normal(
            scale=stddev, size=shape).astype(util.np_dtype(dtype='float'))

        return internals_init
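
The initial recurrent state is drawn from a small normal distribution whose shape depends on the cell type: (size,) for a GRU and (2, size) for an LSTM (cell plus hidden state). The same initialization as a standalone sketch:

    import numpy as np

    def initial_rnn_state(cell_type, size):
        shape = (size,) if cell_type == 'gru' else (2, size)   # 'lstm': cell and hidden state
        stddev = min(0.1, np.sqrt(2.0 / size))
        return np.random.normal(scale=stddev, size=shape).astype(np.float32)

    state = initial_rnn_state('lstm', size=64)   # shape (2, 64)
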
Example #20
    def __init__(self,
                 capacity,
                 states_config,
                 actions_config,
                 prioritization_weight=1.0):
        self.capacity = capacity
        self.prioritization_weight = prioritization_weight

        self.state_spec = {
            name: (tuple(state.shape), util.np_dtype(state.type))
            for name, state in states_config
        }
        self.action_spec = {
            name: util.np_dtype('float' if action.continuous else 'int')
            for name, action in actions_config
        }
        self.internal_spec = None
        self.observations = list()

        self.sum_priorities = 0.0
        self.positive_priority_index = -1
        self.batch_indices = list()
Example #21
    def pre_run(self, agent, environment):
        demonstrations = list()

        agent.reset()
        internals = agent.current_internals
        terminal = True

        for n in xrange(50):
            if terminal:
                state = environment.reset()

            actions = dict()
            # Create demonstration actions of the right shape.
            if 'type' in environment.actions:
                if environment.actions['type'] == 'bool':
                    actions = np.full(shape=(),
                                      fill_value=True,
                                      dtype=util.np_dtype(
                                          environment.actions['type']))
                elif environment.actions['type'] == 'int':
                    actions = np.full(shape=(),
                                      fill_value=1,
                                      dtype=util.np_dtype(
                                          environment.actions['type']))
                elif environment.actions['type'] == 'float':
                    actions = np.full(shape=(),
                                      fill_value=1.0,
                                      dtype=util.np_dtype(
                                          environment.actions['type']))
            else:
                for name, action in environment.actions.items():
                    if action['type'] == 'bool':
                        actions[name] = np.full(shape=action['shape'],
                                                fill_value=True,
                                                dtype=util.np_dtype(
                                                    action['type']))
                    elif action['type'] == 'int':
                        actions[name] = np.full(shape=action['shape'],
                                                fill_value=1,
                                                dtype=util.np_dtype(
                                                    action['type']))
                    elif action['type'] == 'float':
                        actions[name] = np.full(shape=action['shape'],
                                                fill_value=1.0,
                                                dtype=util.np_dtype(
                                                    action['type']))

            state, terminal, reward = environment.execute(actions=actions)

            demonstration = dict(states=state,
                                 internals=internals,
                                 actions=actions,
                                 terminal=terminal,
                                 reward=reward)
            demonstrations.append(demonstration)

        agent.import_demonstrations(demonstrations)
        agent.pretrain(steps=1000)
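
Each demonstration action above is just a constant array of the spec's shape and dtype built with np.full. A compact sketch of that per-type construction (assuming a spec dict with 'type' and 'shape' keys, as in the loop above; the dtype mapping here is an assumption):

    import numpy as np

    def constant_action(spec):
        fill = {'bool': True, 'int': 1, 'float': 1.0}[spec['type']]
        dtype = {'bool': np.bool_, 'int': np.int64, 'float': np.float32}[spec['type']]
        return np.full(shape=spec.get('shape', ()), fill_value=fill, dtype=dtype)

    action = constant_action(dict(type='float', shape=(2,)))   # array([1., 1.], dtype=float32)
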
Example #22
    def initialize(self):
        super().initialize()

        # Value buffers
        def function(name, spec):
            spec = TensorSpec(type=spec.type,
                              shape=((self.capacity, ) + spec.shape))
            if name == 'terminal':
                initializer = np.zeros(shape=(self.capacity, ),
                                       dtype=spec.np_type())
                initializer[-1] = 1
            else:
                initializer = 'zeros'
            return self.variable(name=(name.replace('/', '_') + '-buffer'),
                                 spec=spec,
                                 initializer=initializer,
                                 is_trainable=False,
                                 is_saved=True)

        self.buffers = self.values_spec.fmap(function=function,
                                             cls=VariableDict,
                                             with_names=True)

        # Buffer index (modulo capacity, next index to write to)
        self.buffer_index = self.variable(name='buffer-index',
                                          spec=TensorSpec(type='int'),
                                          initializer='zeros',
                                          is_trainable=False,
                                          is_saved=True)

        # Terminal indices
        # (oldest episode terminals first, initially the only terminal is last index)
        initializer = np.zeros(shape=(self.capacity + 1, ),
                               dtype=util.np_dtype(dtype='int'))
        initializer[0] = self.capacity - 1
        self.terminal_indices = self.variable(name='terminal-indices',
                                              spec=TensorSpec(
                                                  type='int',
                                                  shape=(self.capacity + 1, )),
                                              initializer=initializer,
                                              is_trainable=False,
                                              is_saved=True)

        # Episode count
        self.episode_count = self.variable(name='episode-count',
                                           spec=TensorSpec(type='int'),
                                           initializer='zeros',
                                           is_trainable=False,
                                           is_saved=True)
Example #23
    def tf_initialize(self):
        super().tf_initialize()

        # Terminal indices
        # (oldest episode terminals first, initially the only terminal is last index)
        initializer = np.zeros(shape=(self.capacity + 1,), dtype=util.np_dtype(dtype='long'))
        initializer[0] = self.capacity - 1
        self.terminal_indices = self.add_variable(
            name='terminal-indices', dtype='long', shape=(self.capacity + 1,), is_trainable=False,
            initializer=initializer
        )

        # Episode count
        self.episode_count = self.add_variable(
            name='episode-count', dtype='long', shape=(), is_trainable=False, initializer='zeros'
        )
Example #24
    def is_valid_action_function(cls, action_spec):
        dtype = action_spec['type']
        shape = action_spec.get('shape', ())

        if dtype == 'bool':
            return (lambda action, name, states: (
                (isinstance(action, util.py_dtype('bool')) and shape ==
                 ()) or (isinstance(action, np.ndarray) and action.dtype ==
                         util.np_dtype('bool') and action.shape == shape)))

        elif dtype == 'int':
            num_values = action_spec['num_values']
            return (lambda action, name, states: (
                (isinstance(action, util.py_dtype('int')) and shape ==
                 () and 0 <= action and action < num_values and states[
                     name + '_mask'][action]) or
                (isinstance(action, np.ndarray) and action.dtype == util.
                 np_dtype('int') and action.shape == shape and
                 (0 <= action).all() and (action < num_values).all() and np.
                 take_along_axis(states[name + '_mask'],
                                 indices=np.expand_dims(action, axis=-1),
                                 axis=-1).all())))

        elif dtype == 'float':
            if 'min_value' in action_spec:
                min_value = action_spec['min_value']
                max_value = action_spec['max_value']
                return (lambda action, name, states: (
                    (isinstance(action, util.py_dtype('float')) and shape ==
                     () and min_value <= action and action <= max_value) or
                    (isinstance(action, np.ndarray) and action.dtype == util.
                     np_dtype('float') and action.shape == shape and
                     (min_value <= action).all() and
                     (action <= max_value).all())))

            else:
                return (lambda action, name, states: (
                    (isinstance(action, util.py_dtype('float')) and shape ==
                     ()) or
                    (isinstance(action, np.ndarray) and action.dtype == util.
                     np_dtype('float') and action.shape == shape)))
Example #25
    def is_valid_action_function(cls, action_spec):
        dtype = action_spec['type']
        shape = action_spec.get('shape', ())

        if dtype == 'bool':
            return (lambda action: (
                (isinstance(action, util.np_dtype('bool')) and shape ==
                 ()) or (isinstance(action, np.ndarray) and action.dtype ==
                         util.np_dtype('bool') and action.shape == shape)))

        elif dtype == 'int':
            num_values = action_spec['num_values']
            return (lambda action: (
                ((isinstance(action, util.np_dtype('int')) and shape ==
                  ()) or (isinstance(action, np.ndarray) and action.dtype ==
                          util.np_dtype('int') and action.shape == shape)) and
                (0 <= action).all() and (action < num_values).all()))

        elif dtype == 'float':
            if 'min_value' in action_spec:
                min_value = action_spec['min_value']
                max_value = action_spec['max_value']
                return (lambda action: ((
                    (isinstance(action, util.np_dtype('float')) and shape ==
                     ()) or
                    (isinstance(action, np.ndarray) and action.dtype == util.
                     np_dtype('float') and action.shape == shape)) and
                                        (min_value <= action).all() and
                                        (action <= max_value).all()))

            else:
                return (lambda action: (
                    (isinstance(action, util.np_dtype('float')) and shape ==
                     ()) or
                    (isinstance(action, np.ndarray) and action.dtype == util.
                     np_dtype('float') and action.shape == shape)))
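
Both variants return a validator lambda that accepts either a Python scalar (for shape ()) or a NumPy array whose dtype and shape match the spec, plus range or mask constraints for 'int' and bounded 'float' actions. A compact standalone validator in the same spirit (simplified: no action masks, no dtype check):

    import numpy as np

    def is_valid_float_action(action, shape, min_value=None, max_value=None):
        if shape == () and isinstance(action, float):
            action = np.asarray(action, dtype=np.float32)
        if not (isinstance(action, np.ndarray) and action.shape == shape):
            return False
        if min_value is not None and not (min_value <= action).all():
            return False
        if max_value is not None and not (action <= max_value).all():
            return False
        return True

    assert is_valid_float_action(np.zeros((2,), dtype=np.float32), shape=(2,), min_value=-1.0, max_value=1.0)
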
Example #26
    def experience(
        self, states, actions, terminal, reward, internals=None, query=None, **kwargs
    ):
        """
        Feed experience traces.

        Args:
            states (dict[array[state]]): Dictionary containing arrays of states
                (<span style="color:#C00000"><b>required</b></span>).
            actions (dict[array[action]]): Dictionary containing arrays of actions
                (<span style="color:#C00000"><b>required</b></span>).
            terminal (array[bool]): Array of terminals
                (<span style="color:#C00000"><b>required</b></span>).
            reward (array[float]): Array of rewards
                (<span style="color:#C00000"><b>required</b></span>).
            internals (dict[state]): Dictionary containing arrays of internal agent states
                (<span style="color:#00C000"><b>default</b></span>: no internal states).
            query (list[str]): Names of tensors to retrieve
                (<span style="color:#00C000"><b>default</b></span>: none).
            kwargs: Additional input values, for instance, for dynamic hyperparameters.
        """
        assert (self.buffer_indices == 0).all()
        assert util.reduce_all(predicate=util.not_nan_inf, xs=states)
        assert internals is None or util.reduce_all(predicate=util.not_nan_inf, xs=internals)
        assert util.reduce_all(predicate=util.not_nan_inf, xs=actions)
        assert util.reduce_all(predicate=util.not_nan_inf, xs=reward)

        # Auxiliaries
        auxiliaries = OrderedDict()
        if isinstance(states, dict):
            for name, spec in self.actions_spec.items():
                if spec['type'] == 'int' and name + '_mask' in states:
                    auxiliaries[name + '_mask'] = np.asarray(states.pop(name + '_mask'))
        auxiliaries = util.fmap(function=np.asarray, xs=auxiliaries, depth=1)

        # Normalize states/actions dictionaries
        states = util.normalize_values(
            value_type='state', values=states, values_spec=self.states_spec
        )
        actions = util.normalize_values(
            value_type='action', values=actions, values_spec=self.actions_spec
        )
        if internals is None:
            internals = OrderedDict()

        if isinstance(terminal, (bool, int)):
            states = util.fmap(function=(lambda x: [x]), xs=states, depth=1)
            internals = util.fmap(function=(lambda x: [x]), xs=internals, depth=1)
            auxiliaries = util.fmap(function=(lambda x: [x]), xs=auxiliaries, depth=1)
            actions = util.fmap(function=(lambda x: [x]), xs=actions, depth=1)
            terminal = [terminal]
            reward = [reward]

        states = util.fmap(function=np.asarray, xs=states, depth=1)
        internals = util.fmap(function=np.asarray, xs=internals, depth=1)
        auxiliaries = util.fmap(function=np.asarray, xs=auxiliaries, depth=1)
        actions = util.fmap(function=np.asarray, xs=actions, depth=1)

        if isinstance(terminal, np.ndarray):
            if terminal.dtype == util.np_dtype(dtype='bool'):
                zeros = np.zeros_like(terminal, dtype=util.np_dtype(dtype='long'))
                ones = np.ones_like(terminal, dtype=util.np_dtype(dtype='long'))
                terminal = np.where(terminal, ones, zeros)
        else:
            terminal = np.asarray([int(x) if isinstance(x, bool) else x for x in terminal])
        reward = np.asarray(reward)

        # Batch experiences split into episodes and at most size buffer_observe
        last = 0
        for index in range(1, len(terminal) + 1):
            if terminal[index - 1] == 0 and index - last < self.experience_size:
                continue

            # Include terminal in batch if possible
            if index < len(terminal) and terminal[index - 1] == 0 and terminal[index] > 0 and \
                    index - last < self.experience_size:
                index += 1

            function = (lambda x: x[last: index])
            states_batch = util.fmap(function=function, xs=states, depth=1)
            internals_batch = util.fmap(function=function, xs=internals, depth=1)
            auxiliaries_batch = util.fmap(function=function, xs=auxiliaries, depth=1)
            actions_batch = util.fmap(function=function, xs=actions, depth=1)
            terminal_batch = terminal[last: index]
            reward_batch = reward[last: index]
            last = index

            # Model.experience()
            if query is None:
                self.timesteps, self.episodes, self.updates = self.model.experience(
                    states=states_batch, internals=internals_batch,
                    auxiliaries=auxiliaries_batch, actions=actions_batch, terminal=terminal_batch,
                    reward=reward_batch, **kwargs
                )

            else:
                self.timesteps, self.episodes, self.updates, queried = self.model.experience(
                    states=states_batch, internals=internals_batch,
                    auxiliaries=auxiliaries_batch, actions=actions_batch, terminal=terminal_batch,
                    reward=reward_batch, query=query, **kwargs
                )

        if query is not None:
            return queried
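
The batching loop splits the flat experience arrays into chunks that end at a terminal or after at most experience_size timesteps, so each model.experience call receives a bounded, episode-aligned slice. A simplified standalone sketch of that splitting (without the look-ahead that pulls an immediately following terminal into the same chunk):

    import numpy as np

    def episode_slices(terminal, max_size):
        # Yield (start, end) slices that end at a terminal or after max_size steps.
        last = 0
        for index in range(1, len(terminal) + 1):
            if terminal[index - 1] == 0 and index - last < max_size:
                continue
            yield last, index
            last = index
        if last < len(terminal):   # trailing, non-terminal remainder
            yield last, len(terminal)

    terminal = np.asarray([0, 0, 1, 0, 0, 0, 1])
    print(list(episode_slices(terminal, max_size=4)))   # [(0, 3), (3, 7)]
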
Example #27
    def act(self,
            states,
            internals=None,
            parallel=0,
            independent=False,
            deterministic=False,
            evaluation=False,
            query=None,
            **kwargs):
        """
        Returns action(s) for the given state(s); needs to be followed by `observe(...)` unless independent mode is set via `independent`/`evaluation`.

        Args:
            states (dict[state] | iter[dict[state]]): Dictionary containing state(s) to be acted on
                (<span style="color:#C00000"><b>required</b></span>).
            internals (dict[internal] | iter[dict[internal]]): Dictionary containing current
                internal agent state(s)
                (<span style="color:#C00000"><b>required</b></span> if independent mode).
            parallel (int | iter[int]): Parallel execution index
                (<span style="color:#00C000"><b>default</b></span>: 0).
            independent (bool): Whether act is not part of the main agent-environment interaction,
                and this call is thus not followed by observe
                (<span style="color:#00C000"><b>default</b></span>: false).
            deterministic (bool): If independent mode, whether to act deterministically, so no
                exploration or sampling
                (<span style="color:#00C000"><b>default</b></span>: false).
            evaluation (bool): Whether the agent is currently evaluated, implies independent and
                deterministic
                (<span style="color:#00C000"><b>default</b></span>: false).
            query (list[str]): Names of tensors to retrieve
                (<span style="color:#00C000"><b>default</b></span>: none).
            kwargs: Additional input values, for instance, for dynamic hyperparameters.

        Returns:
            dict[action] | iter[dict[action]], if independent mode dict[internal] |
            iter[dict[internal]], plus optional list[str]: Dictionary containing action(s),
            dictionary containing next internal agent state(s) if independent mode, plus queried
            tensor values if requested.
        """
        assert util.reduce_all(predicate=util.not_nan_inf, xs=states)

        if evaluation:
            if deterministic:
                raise TensorforceError.invalid(name='agent.act',
                                               argument='deterministic',
                                               condition='evaluation = true')
            if independent:
                raise TensorforceError.invalid(name='agent.act',
                                               argument='independent',
                                               condition='evaluation = true')
            deterministic = independent = True

        if not independent:
            if internals is not None:
                raise TensorforceError.invalid(name='agent.act',
                                               argument='internals',
                                               condition='independent = false')
            if deterministic:
                raise TensorforceError.invalid(name='agent.act',
                                               argument='deterministic',
                                               condition='independent = false')

        if independent:
            internals_is_none = (internals is None)
            if internals_is_none:
                internals = OrderedDict()

        # Batch states
        batched = (not isinstance(parallel, int))
        if batched:
            if len(parallel) == 0:
                raise TensorforceError.value(name='agent.act',
                                             argument='parallel',
                                             value=parallel,
                                             hint='zero-length')
            parallel = np.asarray(list(parallel))
            if isinstance(states[0], dict):
                states = OrderedDict(
                    ((name,
                      np.asarray(
                          [states[n][name] for n in range(len(parallel))]))
                     for name in states[0]))
            else:
                states = np.asarray(states)
            if independent:
                internals = OrderedDict(
                    ((name,
                      np.asarray(
                          [internals[n][name] for n in range(len(parallel))]))
                     for name in internals[0]))
        else:
            parallel = np.asarray([parallel])
            states = util.fmap(function=(lambda x: np.asarray([x])),
                               xs=states,
                               depth=int(isinstance(states, dict)))
            if independent:
                internals = util.fmap(function=(lambda x: np.asarray([x])),
                                      xs=internals,
                                      depth=1)

        if not independent and not all(self.timestep_completed[n]
                                       for n in parallel):
            raise TensorforceError(
                message="Calling agent.act must be preceded by agent.observe.")

        # Auxiliaries
        auxiliaries = OrderedDict()
        if isinstance(states, dict):
            states = dict(states)
            for name, spec in self.actions_spec.items():
                if spec['type'] == 'int' and name + '_mask' in states:
                    auxiliaries[name + '_mask'] = states.pop(name + '_mask')

        # Normalize states dictionary
        states = util.normalize_values(value_type='state',
                                       values=states,
                                       values_spec=self.states_spec)

        # Model.act()
        if independent:
            if query is None:
                actions, internals = self.model.independent_act(
                    states=states,
                    internals=internals,
                    auxiliaries=auxiliaries,
                    parallel=parallel,
                    deterministic=deterministic,
                    **kwargs)

            else:
                actions, internals, queried = self.model.independent_act(
                    states=states,
                    internals=internals,
                    auxiliaries=auxiliaries,
                    parallel=parallel,
                    deterministic=deterministic,
                    query=query,
                    **kwargs)

        else:
            if query is None:
                actions, self.timesteps = self.model.act(
                    states=states,
                    auxiliaries=auxiliaries,
                    parallel=parallel,
                    **kwargs)

            else:
                actions, self.timesteps, queried = self.model.act(
                    states=states,
                    auxiliaries=auxiliaries,
                    parallel=parallel,
                    query=query,
                    **kwargs)

        if not independent:
            for n in parallel:
                self.timestep_completed[n] = False

        if self.recorder_spec is not None and not independent and \
                self.episodes >= self.recorder_spec.get('start', 0):
            for n in range(len(parallel)):
                index = self.buffer_indices[parallel[n]]
                for name in self.states_spec:
                    self.states_buffers[name][parallel[n],
                                              index] = states[name][n]
                for name, spec in self.actions_spec.items():
                    self.actions_buffers[name][parallel[n],
                                               index] = actions[name][n]
                    if spec['type'] == 'int':
                        name = name + '_mask'
                        if name in auxiliaries:
                            self.states_buffers[name][
                                parallel[n], index] = auxiliaries[name][n]
                        else:
                            shape = (1, ) + spec['shape'] + (
                                spec['num_values'], )
                            self.states_buffers[name][parallel[n],
                                                      index] = np.full(
                                                          shape=shape,
                                                          fill_value=True,
                                                          dtype=util.np_dtype(
                                                              dtype='bool'))

        # Reverse normalized actions dictionary
        actions = util.unpack_values(value_type='action',
                                     values=actions,
                                     values_spec=self.actions_spec)

        # Unbatch actions
        if batched:
            if isinstance(actions, dict):
                actions = [
                    OrderedDict(((name, actions[name][n]) for name in actions))
                    for n in range(len(parallel))
                ]
        else:
            actions = util.fmap(function=(lambda x: x[0]),
                                xs=actions,
                                depth=int(isinstance(actions, dict)))
            if independent:
                internals = util.fmap(function=(lambda x: x[0]),
                                      xs=internals,
                                      depth=1)

        if independent and not internals_is_none:
            if query is None:
                return actions, internals
            else:
                return actions, internals, queried

        else:
            if query is None:
                return actions
            else:
                return actions, queried
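
Before the model call, a single (non-batched) act is wrapped into a batch of one: parallel becomes a length-one array and every state is wrapped with np.asarray([x]); afterwards the leading dimension is stripped again. A minimal sketch of that round trip for a states dictionary:

    import numpy as np

    states = dict(position=np.asarray([0.1, 0.2]), velocity=np.asarray([0.0]))

    # Batch of one before the model call ...
    batched = {name: np.asarray([value]) for name, value in states.items()}
    parallel = np.asarray([0])

    # ... and strip the leading dimension from the (here illustrative) result afterwards.
    unbatched = {name: value[0] for name, value in batched.items()}
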
Example #28
    def initialize(self):
        """
        Initializes the agent, usually done as part of Agent.create/load.
        """
        if self.is_initialized:
            raise TensorforceError(
                message=
                "Agent is already initialized, possibly as part of Agent.create()."
            )

        self.is_initialized = True

        # Parallel terminal/reward buffers
        self.terminal_buffers = np.ndarray(shape=(self.parallel_interactions,
                                                  self.buffer_observe),
                                           dtype=util.np_dtype(dtype='long'))
        self.reward_buffers = np.ndarray(shape=(self.parallel_interactions,
                                                self.buffer_observe),
                                         dtype=util.np_dtype(dtype='float'))

        # Recorder buffers if required
        if self.recorder_spec is not None:
            self.states_buffers = OrderedDict()
            self.actions_buffers = OrderedDict()
            for name, spec in self.states_spec.items():
                shape = (self.parallel_interactions,
                         self.buffer_observe) + spec['shape']
                self.states_buffers[name] = np.ndarray(
                    shape=shape, dtype=util.np_dtype(dtype=spec['type']))
            for name, spec in self.actions_spec.items():
                shape = (self.parallel_interactions,
                         self.buffer_observe) + spec['shape']
                self.actions_buffers[name] = np.ndarray(
                    shape=shape, dtype=util.np_dtype(dtype=spec['type']))
                if spec['type'] == 'int':
                    shape = (self.parallel_interactions, self.buffer_observe) + spec['shape'] + \
                        (spec['num_values'],)
                    self.states_buffers[name + '_mask'] = np.ndarray(
                        shape=shape, dtype=util.np_dtype(dtype='bool'))

            self.num_episodes = 0
            self.record_states = OrderedDict(
                ((name, list()) for name in self.states_spec))
            self.record_actions = OrderedDict(
                ((name, list()) for name in self.actions_spec))
            for name, spec in self.actions_spec.items():
                if spec['type'] == 'int':
                    self.record_states[name + '_mask'] = list()
            self.record_terminal = list()
            self.record_reward = list()

        # Parallel buffer indices
        self.buffer_indices = np.zeros(shape=(self.parallel_interactions, ),
                                       dtype=util.np_dtype(dtype='int'))
        self.timestep_completed = np.ndarray(
            shape=(self.parallel_interactions, ),
            dtype=util.np_dtype(dtype='bool'))

        self.timesteps = 0
        self.episodes = 0
        self.updates = 0

        # Setup Model
        if not hasattr(self, 'model'):
            raise TensorforceError(message="Missing agent attribute model.")

        self.model.initialize()
        if self.model.saver_directory is not None:
            file = os.path.join(self.model.saver_directory,
                                self.model.saver_filename + '.json')
            try:
                with open(file, 'w') as fp:
                    json.dump(obj=self.spec, fp=fp, cls=NumpyJSONEncoder)
            except BaseException:
                os.remove(file)

        self.reset()
Example #29
    def np_type(self):
        return util.np_dtype(dtype=self.type)
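
util.np_dtype maps the library's type strings to NumPy dtypes; the exact mapping is an internal detail, but it is conventionally something like the sketch below (the concrete dtypes here are an assumption, not the library's guaranteed choices):

    import numpy as np

    _NP_DTYPES = dict(bool=np.bool_, int=np.int64, long=np.int64, float=np.float32)

    def np_dtype(dtype):
        # Illustrative stand-in for util.np_dtype.
        return _NP_DTYPES[dtype]
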
Example #30
    def observe(self, reward=0.0, terminal=False, parallel=0):
        # Check whether inputs are batched
        if util.is_iterable(x=reward) or (isinstance(reward, np.ndarray)
                                          and reward.ndim > 0):
            reward = np.asarray(reward)
            num_parallel = reward.shape[0]
            if not isinstance(terminal, np.ndarray) and terminal is False:
                terminal = np.asarray([0 for _ in range(num_parallel)])
            else:
                terminal = np.asarray(terminal)
            if not isinstance(parallel, np.ndarray) and parallel == 0:
                assert num_parallel == self.parallel_interactions
                parallel = np.asarray(list(range(num_parallel)))
            else:
                parallel = np.asarray(parallel)

        elif util.is_iterable(x=terminal) or \
                (isinstance(terminal, np.ndarray) and terminal.ndim > 0):
            terminal = np.asarray(terminal, dtype=util.np_dtype(dtype='int'))
            num_parallel = terminal.shape[0]
            if not isinstance(reward, np.ndarray) and reward == 0.0:
                reward = np.asarray([0.0 for _ in range(num_parallel)])
            else:
                reward = np.asarray(reward)
            if not isinstance(parallel, np.ndarray) and parallel == 0:
                assert num_parallel == self.parallel_interactions
                parallel = np.asarray(list(range(num_parallel)))
            else:
                parallel = np.asarray(parallel)

        elif util.is_iterable(x=parallel) or \
                (isinstance(parallel, np.ndarray) and parallel.ndim > 0):
            parallel = np.asarray(parallel)
            num_parallel = parallel.shape[0]
            if not isinstance(reward, np.ndarray) and reward == 0.0:
                reward = np.asarray([0.0 for _ in range(num_parallel)])
            else:
                reward = np.asarray(reward)
            if not isinstance(terminal, np.ndarray) and terminal is False:
                terminal = np.asarray([0 for _ in range(num_parallel)])
            else:
                terminal = np.asarray(terminal)

        else:
            reward = np.asarray([float(reward)])
            terminal = np.asarray([int(terminal)])
            parallel = np.asarray([int(parallel)])
            num_parallel = 1

        # Check whether shapes/lengths are consistent
        if parallel.shape[0] == 0:
            raise TensorforceError.value(name='Agent.observe',
                                         argument='len(parallel)',
                                         value=parallel.shape[0],
                                         hint='= 0')
        if reward.shape != parallel.shape:
            raise TensorforceError.value(name='Agent.observe',
                                         argument='len(reward)',
                                         value=reward.shape,
                                         hint='!= parallel length')
        if terminal.shape != parallel.shape:
            raise TensorforceError.value(name='Agent.observe',
                                         argument='len(terminal)',
                                         value=terminal.shape,
                                         hint='!= parallel length')

        # Convert terminal to int if necessary
        if terminal.dtype == util.np_dtype(dtype='bool'):
            zeros = np.zeros_like(terminal, dtype=util.np_dtype(dtype='int'))
            ones = np.ones_like(terminal, dtype=util.np_dtype(dtype='int'))
            terminal = np.where(terminal, ones, zeros)

        # Check whether current timesteps are not completed
        if self.timestep_completed[parallel].any():
            raise TensorforceError(
                message="Calling agent.observe must be preceded by agent.act.")
        self.timestep_completed[parallel] = True

        # Check whether episode is too long
        self.timestep_counter[parallel] += 1
        if self.max_episode_timesteps is not None and np.logical_and(
                terminal == 0, self.timestep_counter[parallel] >
                self.max_episode_timesteps).any():
            raise TensorforceError(
                message="Episode longer than max_episode_timesteps.")
        self.timestep_counter[parallel] = np.where(
            terminal > 0, 0, self.timestep_counter[parallel])

        if self.recorder is None:
            pass

        elif self.num_episodes < self.recorder.get('start', 0):
            # Increment num_episodes
            for t in terminal.tolist():
                if t > 0:
                    self.num_episodes += 1

        else:
            # Store values per parallel interaction
            for p, t, r in zip(parallel.tolist(), terminal.tolist(),
                               reward.tolist()):

                # Buffer inputs
                self.buffers['terminal'][p].append(t)
                self.buffers['reward'][p].append(r)

                # Continue if not terminal
                if t == 0:
                    continue
                self.num_episodes += 1

                # Flush buffered inputs to recorded values
                for name in self.states_spec:
                    self.recorded['states'][name].append(
                        np.stack(self.buffers['states'][name][p], axis=0))
                    self.buffers['states'][name][p].clear()
                for name, spec in self.actions_spec.items():
                    self.recorded['actions'][name].append(
                        np.stack(self.buffers['actions'][name][p], axis=0))
                    self.buffers['actions'][name][p].clear()
                self.recorded['terminal'].append(
                    np.array(self.buffers['terminal'][p],
                             dtype=self.terminal_spec.np_type()))
                self.buffers['terminal'][p].clear()
                self.recorded['reward'].append(
                    np.array(self.buffers['reward'][p],
                             dtype=self.reward_spec.np_type()))
                self.buffers['reward'][p].clear()

                # Check whether recording step
                if (self.num_episodes - self.recorder.get('start', 0)) \
                        % self.recorder.get('frequency', 1) != 0:
                    continue

                # Manage recorder directory
                directory = self.recorder['directory']
                if os.path.isdir(directory):
                    files = sorted(
                        f for f in os.listdir(directory)
                        if os.path.isfile(os.path.join(directory, f))
                        and os.path.splitext(f)[1] == '.npz')
                else:
                    os.makedirs(directory)
                    files = list()
                max_traces = self.recorder.get('max-traces')
                if max_traces is not None and len(files) > max_traces - 1:
                    for filename in files[:-max_traces + 1]:
                        filename = os.path.join(directory, filename)
                        os.remove(filename)

                # Write recording file
                filename = os.path.join(
                    directory,
                    'trace-{:09d}.npz'.format(self.num_episodes - 1))
                # time.strftime('%Y%m%d-%H%M%S')
                kwargs = self.recorded.fmap(function=np.concatenate,
                                            cls=ArrayDict).items()
                np.savez_compressed(file=filename, **dict(kwargs))

                # Clear recorded values
                for recorded in self.recorded.values():
                    recorded.clear()

        if self._is_agent:
            return reward, terminal, parallel
        else:
            return 0
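
The recorder branch above flushes each finished episode into per-parallel buffers and, at the configured frequency, writes them out as compressed trace-NNNNNNNNN.npz files. A minimal inspection sketch, not part of the library (the 'traces' directory name is an assumption), using plain numpy:

import os

import numpy as np

directory = 'traces'  # hypothetical recorder directory
for filename in sorted(os.listdir(directory)):
    if os.path.splitext(filename)[1] != '.npz':
        continue
    # Each trace file holds the concatenated states/actions/terminal/reward
    # arrays of the recorded episode(s); the exact keys depend on the specs.
    with np.load(os.path.join(directory, filename)) as trace:
        print(filename, {key: trace[key].shape for key in trace.files})
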
Example #31
    def experience(self, states, actions, terminal, reward, internals=None):
        """
        Feed experience traces.

        See the [act-experience-update script](https://github.com/tensorforce/tensorforce/blob/master/examples/act_experience_update_interface.py)
        for an example application as part of the act-experience-update interface, which is an
        alternative to the act-observe interaction pattern.

        Args:
            states (dict[array[state]]): Dictionary containing arrays of states
                (<span style="color:#C00000"><b>required</b></span>).
            actions (dict[array[action]]): Dictionary containing arrays of actions
                (<span style="color:#C00000"><b>required</b></span>).
            terminal (array[bool]): Array of terminals
                (<span style="color:#C00000"><b>required</b></span>).
            reward (array[float]): Array of rewards
                (<span style="color:#C00000"><b>required</b></span>).
            internals (dict[state]): Dictionary containing arrays of internal agent states
                (<span style="color:#C00000"><b>required</b></span> if agent has internal states).
        """
        if not all(len(buffer) == 0 for buffer in self.terminal_buffer):
            raise TensorforceError(
                message="Calling agent.experience is not possible mid-episode."
            )

        # Process states input and infer batching structure
        states, batched, num_instances, is_iter_of_dicts = self._process_states_input(
            states=states, function_name='Agent.experience')

        if is_iter_of_dicts:
            # Input structure iter[dict[input]]

            # Internals
            if internals is None:
                internals = ArrayDict(self.initial_internals())
                internals = internals.fmap(function=(lambda x: np.repeat(
                    np.expand_dims(x, axis=0), repeats=num_instances, axis=0)))
            elif not isinstance(internals, (tuple, list)):
                raise TensorforceError.type(name='Agent.experience',
                                            argument='internals',
                                            dtype=type(internals),
                                            hint='is not tuple/list')
            else:
                internals = [ArrayDict(internal) for internal in internals]
                internals = internals[0].fmap(
                    function=(lambda *xs: np.stack(xs, axis=0)),
                    zip_values=internals[1:])

            # Actions
            if isinstance(actions, np.ndarray):
                actions = ArrayDict(singleton=actions)
            elif not isinstance(actions, (tuple, list)):
                raise TensorforceError.type(name='Agent.experience',
                                            argument='actions',
                                            dtype=type(actions),
                                            hint='is not tuple/list')
            elif not isinstance(actions[0], dict):
                actions = ArrayDict(singleton=np.asarray(actions))
            else:
                actions = [ArrayDict(action) for action in actions]
                actions = actions[0].fmap(
                    function=(lambda *xs: np.stack(xs, axis=0)),
                    zip_values=actions[1:])

        else:
            # Input structure dict[iter[input]]

            # Internals
            if internals is None:
                internals = ArrayDict(self.initial_internals())
                internals = internals.fmap(function=(lambda x: np.tile(
                    np.expand_dims(x, axis=0), reps=(num_instances, ))))
            elif not isinstance(internals, dict):
                raise TensorforceError.type(name='Agent.experience',
                                            argument='internals',
                                            dtype=type(internals),
                                            hint='is not dict')
            else:
                internals = ArrayDict(internals)

            # Actions
            if isinstance(actions, np.ndarray):
                actions = ArrayDict(singleton=actions)
            elif not isinstance(actions, dict):
                raise TensorforceError.type(name='Agent.experience',
                                            argument='actions',
                                            dtype=type(actions),
                                            hint='is not dict')
            else:
                actions = ArrayDict(actions)

        # Expand inputs if not batched
        if not batched:
            internals = internals.fmap(
                function=(lambda x: np.expand_dims(x, axis=0)))
            actions = actions.fmap(
                function=(lambda x: np.expand_dims(x, axis=0)))
            terminal = np.asarray([terminal])
            reward = np.asarray([reward])
        else:
            terminal = np.asarray(terminal)
            reward = np.asarray(reward)

        # Check number of inputs
        for name, internal in internals.items():
            if internal.shape[0] != num_instances:
                raise TensorforceError.value(
                    name='Agent.experience',
                    argument='len(internals[{}])'.format(name),
                    value=internal.shape[0],
                    hint='!= len(states)')
        for name, action in actions.items():
            if action.shape[0] != num_instances:
                raise TensorforceError.value(
                    name='Agent.experience',
                    argument='len(actions[{}])'.format(name),
                    value=action.shape[0],
                    hint='!= len(states)')
        if terminal.shape[0] != num_instances:
            raise TensorforceError.value(name='Agent.experience',
                                         argument='len(terminal)',
                                         value=terminal.shape[0],
                                         hint='!= len(states)')
        if reward.shape[0] != num_instances:
            raise TensorforceError.value(name='Agent.experience',
                                         argument='len(reward)',
                                         value=reward.shape[0],
                                         hint='!= len(states)')

        def function(name, spec):
            auxiliary = ArrayDict()
            if self.config.enable_int_action_masking and spec.type == 'int' and \
                    spec.num_values is not None:
                if name is None:
                    name = 'action'
                # Mask, either part of states or default all true
                auxiliary['mask'] = states.pop(
                    name + '_mask',
                    np.ones(shape=(num_instances, ) + spec.shape +
                            (spec.num_values, ),
                            dtype=spec.np_type()))
            return auxiliary

        auxiliaries = self.actions_spec.fmap(function=function,
                                             cls=ArrayDict,
                                             with_names=True)
        if self.states_spec.is_singleton() and not states.is_singleton():
            states[None] = states.pop('state')

        # Convert terminal to int if necessary
        if terminal.dtype == util.np_dtype(dtype='bool'):
            zeros = np.zeros_like(terminal, dtype=util.np_dtype(dtype='int'))
            ones = np.ones_like(terminal, dtype=util.np_dtype(dtype='int'))
            terminal = np.where(terminal, ones, zeros)

        if terminal[-1] == 0:
            raise TensorforceError(
                message="Agent.experience() requires full episodes as input.")

        # Batch experiences split into episodes and at most size buffer_observe
        last = 0
        for index in range(1, len(terminal) + 1):
            if terminal[index - 1] == 0:
                continue

            function = (lambda x: x[last:index])
            states_batch = states.fmap(function=function)
            internals_batch = internals.fmap(function=function)
            auxiliaries_batch = auxiliaries.fmap(function=function)
            actions_batch = actions.fmap(function=function)
            terminal_batch = function(terminal)
            reward_batch = function(reward)
            last = index

            # Inputs to tensors
            states_batch = self.states_spec.to_tensor(
                value=states_batch,
                batched=True,
                name='Agent.experience states')
            internals_batch = self.internals_spec.to_tensor(
                value=internals_batch,
                batched=True,
                recover_empty=True,
                name='Agent.experience internals')
            auxiliaries_batch = self.auxiliaries_spec.to_tensor(
                value=auxiliaries_batch,
                batched=True,
                name='Agent.experience auxiliaries')
            actions_batch = self.actions_spec.to_tensor(
                value=actions_batch,
                batched=True,
                name='Agent.experience actions')
            terminal_batch = self.terminal_spec.to_tensor(
                value=terminal_batch,
                batched=True,
                name='Agent.experience terminal')
            reward_batch = self.reward_spec.to_tensor(
                value=reward_batch,
                batched=True,
                name='Agent.experience reward')

            # Model.experience()
            timesteps, episodes = self.model.experience(
                states=states_batch,
                internals=internals_batch,
                auxiliaries=auxiliaries_batch,
                actions=actions_batch,
                terminal=terminal_batch,
                reward=reward_batch)
            self.timesteps = timesteps.numpy().item()
            self.episodes = episodes.numpy().item()

        if self.model.saver is not None:
            self.model.save()
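
As a usage illustration only (the agent construction and the concrete state/action shapes below are assumptions, not taken from this code), Agent.experience expects complete episodes, so the terminal array must end with a non-zero entry; in the act-experience-update interface it is typically followed by an update:

import numpy as np

# One full 10-step episode with a 4-dimensional float state and one int action per step.
states = np.random.random(size=(10, 4)).astype(np.float32)
actions = np.random.randint(0, 2, size=(10,))
terminal = np.zeros(shape=(10,), dtype=bool)
terminal[-1] = True  # experience() rejects inputs whose last step is not terminal
reward = np.random.random(size=(10,))

agent.experience(states=states, actions=actions, terminal=terminal, reward=reward)
agent.update()  # learn from the fed traces
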
Example #32
    def act(
        self, states, parallel=0, deterministic=False, independent=False, evaluation=False,
        query=None, **kwargs
    ):
        """
        Returns action(s) for the given state(s); must be followed by `observe(...)` unless
        `independent` is true.

        Args:
            states (dict[state]): Dictionary containing state(s) to be acted on
                (<span style="color:#C00000"><b>required</b></span>).
            parallel (int): Parallel execution index
                (<span style="color:#00C000"><b>default</b></span>: 0).
            deterministic (bool): Whether to disable exploration and action sampling, i.e. to act
                deterministically
                (<span style="color:#00C000"><b>default</b></span>: false).
            independent (bool): Whether the action is not remembered and the call is consequently
                not followed by observe
                (<span style="color:#00C000"><b>default</b></span>: false).
            evaluation (bool): Whether the agent is currently being evaluated; implies and
                overrides deterministic and independent
                (<span style="color:#00C000"><b>default</b></span>: false).
            query (list[str]): Names of tensors to retrieve
                (<span style="color:#00C000"><b>default</b></span>: none).
            kwargs: Additional input values, for instance, for dynamic hyperparameters.

        Returns:
            (dict[action], plus optional list[str]): Dictionary containing action(s), plus queried
            tensor values if requested.
        """
        assert util.reduce_all(predicate=util.not_nan_inf, xs=states)

        # self.current_internals = self.next_internals
        if evaluation:
            if deterministic or independent:
                raise TensorforceError.unexpected()
            deterministic = independent = True

        # Auxiliaries
        auxiliaries = OrderedDict()
        if isinstance(states, dict):
            states = dict(states)
            for name, spec in self.actions_spec.items():
                if spec['type'] == 'int' and name + '_mask' in states:
                    auxiliaries[name + '_mask'] = states.pop(name + '_mask')

        # Normalize states dictionary
        states = util.normalize_values(
            value_type='state', values=states, values_spec=self.states_spec
        )

        # Batch states
        states = util.fmap(function=(lambda x: np.asarray([x])), xs=states, depth=1)
        auxiliaries = util.fmap(function=(lambda x: np.asarray([x])), xs=auxiliaries, depth=1)

        # Model.act()
        if query is None:
            actions, self.timesteps = self.model.act(
                states=states, auxiliaries=auxiliaries, parallel=parallel,
                deterministic=deterministic, independent=independent, **kwargs
            )

        else:
            actions, self.timesteps, queried = self.model.act(
                states=states, auxiliaries=auxiliaries, parallel=parallel,
                deterministic=deterministic, independent=independent, query=query, **kwargs
            )

        if self.recorder_spec is not None and not independent and \
                self.episodes >= self.recorder_spec.get('start', 0):
            index = self.buffer_indices[parallel]
            for name in self.states_spec:
                self.states_buffers[name][parallel, index] = states[name][0]
            for name, spec in self.actions_spec.items():
                self.actions_buffers[name][parallel, index] = actions[name][0]
                if spec['type'] == 'int':
                    name = name + '_mask'
                    if name in auxiliaries:
                        self.states_buffers[name][parallel, index] = auxiliaries[name][0]
                    else:
                        shape = (1,) + spec['shape'] + (spec['num_values'],)
                        self.states_buffers[name][parallel, index] = np.full(
                            shape=shape, fill_value=True, dtype=util.np_dtype(dtype='bool')
                        )

        # Unbatch actions
        actions = util.fmap(function=(lambda x: x[0]), xs=actions, depth=1)

        # Reverse normalized actions dictionary
        actions = util.unpack_values(
            value_type='action', values=actions, values_spec=self.actions_spec
        )

        # if independent, return processed state as well?

        if query is None:
            return actions
        else:
            return actions, queried
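
For illustration, a hedged sketch of how the '<name>_mask' convention handled above would be used from the caller's side (the environment, the 'decision' action name, and its three values are assumptions):

import numpy as np

states = environment.reset()
# For an int action named 'decision' with num_values=3, an optional boolean
# 'decision_mask' entry in the states dict marks which values are currently
# allowed; act() pops it off as an auxiliary input rather than a state.
states['decision_mask'] = np.asarray([True, False, True])
actions = agent.act(states=states)
states, terminal, reward = environment.execute(actions=actions)
agent.observe(terminal=terminal, reward=reward)
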
Example #33
    def get_batch(self, batch_size, next_states=False):
        """
        Samples a batch of the specified size according to priority.

        Args:
            batch_size: The batch size
            next_states: A boolean flag indicating whether 'next_states' values should be included

        Returns: A dict containing states, actions, rewards, terminals, internal states (and next states)

        """
        if batch_size > len(self.observations):
            raise TensorForceError(
                "Requested batch size is larger than observations in memory: increase config.first_update."
            )

        # Init empty states
        states = {
            name: np.zeros((batch_size, ) + tuple(state.shape),
                           dtype=util.np_dtype(state.type))
            for name, state in self.states_config.items()
        }
        actions = {
            name: np.zeros(
                (batch_size, ) + tuple(action.shape),
                dtype=util.np_dtype('float' if action.continuous else 'int'))
            for name, action in self.actions_config.items()
        }
        rewards = np.zeros((batch_size, ), dtype=util.np_dtype('float'))
        terminals = np.zeros((batch_size, ), dtype=util.np_dtype('bool'))
        internals = [
            np.zeros((batch_size, ) + shape, dtype)
            for shape, dtype in self.internals_config
        ]
        if next_states:
            next_states = {
                name: np.zeros((batch_size, ) + tuple(state.shape),
                               dtype=util.np_dtype(state.type))
                for name, state in self.states_config.items()
            }
            next_internals = [
                np.zeros((batch_size, ) + shape, dtype)
                for shape, dtype in self.internals_config
            ]

        # Start with unseen observations
        unseen_indices = list(
            range(self.none_priority_index + self.observations._capacity - 1,
                  len(self.observations) + self.observations._capacity - 1))
        self.batch_indices = unseen_indices[:batch_size]

        # Get remaining observations using weighted sampling
        remaining = batch_size - len(self.batch_indices)
        if remaining:
            samples = self.observations.sample_minibatch(remaining)
            sample_indices = [i for i, o in samples]
            self.batch_indices += sample_indices

        # Shuffle
        np.random.shuffle(self.batch_indices)

        # Collect observations
        for n, index in enumerate(self.batch_indices):
            observation, _ = self.observations._memory[index]

            for name, state in states.items():
                state[n] = observation[0][name]
            for name, action in actions.items():
                action[n] = observation[1][name]
            rewards[n] = observation[2]
            terminals[n] = observation[3]
            for k, internal in enumerate(internals):
                internal[n] = observation[4][k]
            if next_states:
                for name, next_state in next_states.items():
                    next_state[n] = observation[5][name]
                for k, next_internal in enumerate(next_internals):
                    next_internal[n] = observation[6][k]

        if next_states:
            return dict(states=states,
                        actions=actions,
                        rewards=rewards,
                        terminals=terminals,
                        internals=internals,
                        next_states=next_states,
                        next_internals=next_internals)
        else:
            return dict(states=states,
                        actions=actions,
                        rewards=rewards,
                        terminals=terminals,
                        internals=internals)
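
A brief usage sketch (the memory instance and the batch size are assumptions) of how the returned dictionary would typically be consumed:

batch = memory.get_batch(batch_size=32, next_states=True)

# Per-name state/action arrays share the leading batch dimension.
for name, state_batch in batch['states'].items():
    print(name, state_batch.shape)   # (32,) + state shape
print(batch['rewards'].shape)        # (32,)
print(batch['terminals'].dtype)      # bool
print(batch['next_states'].keys())   # same names as batch['states']
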