def __init__(self, states_spec, actions_spec, capacity, random_sampling=True):
    super(Replay, self).__init__(states_spec=states_spec, actions_spec=actions_spec)
    self.capacity = capacity
    self.states = {
        name: np.zeros((capacity,) + tuple(state['shape']), dtype=util.np_dtype(state['type']))
        for name, state in states_spec.items()
    }
    self.next_states = {
        name: np.zeros((capacity,) + tuple(state['shape']), dtype=util.np_dtype(state['type']))
        for name, state in states_spec.items()
    }
    self.internals, self.next_internals = None, None
    self.actions = {
        name: np.zeros((capacity,) + tuple(action['shape']), dtype=util.np_dtype(action['type']))
        for name, action in actions_spec.items()
    }
    self.terminal = np.zeros((capacity,), dtype=util.np_dtype('bool'))
    self.reward = np.zeros((capacity,), dtype=util.np_dtype('float'))
    self.size = 0
    self.index = 0
    self.random_sampling = random_sampling
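# A minimal standalone sketch (not from the source) of the ring-buffer write
# pattern these preallocated arrays are used with: `index` wraps modulo
# capacity and `size` saturates at capacity.
import numpy as np

capacity = 4
reward = np.zeros((capacity,), dtype=np.float32)
size, index = 0, 0
for step_reward in [1.0, 2.0, 3.0, 4.0, 5.0]:
    reward[index] = step_reward          # overwrite the oldest slot when full
    index = (index + 1) % capacity       # wrap around
    size = min(size + 1, capacity)       # saturate fill level
print(reward, size, index)               # [5. 2. 3. 4.] 4 1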
def long_unittest(self, horizon):
    agent, environment = self.prepare(min_timesteps=3, reward_estimation=dict(horizon=horizon))

    states = environment.reset()
    actions = agent.act(states=states)
    states, terminal, reward = environment.execute(actions=actions)
    _, horizon_output1 = agent.observe(terminal=terminal, reward=reward, query='horizon')
    self.assertIsInstance(horizon_output1, util.np_dtype(dtype='long'))

    actions = agent.act(states=states)
    states, terminal, reward = environment.execute(actions=actions)
    _, horizon_output2 = agent.observe(terminal=terminal, reward=reward, query='horizon')
    if not isinstance(horizon, dict) or horizon['type'] == 'constant':
        self.assertEqual(horizon_output2, horizon_output1)
    else:
        self.assertNotEqual(horizon_output2, horizon_output1)

    actions = agent.act(states=states)
    _, terminal, reward = environment.execute(actions=actions)
    horizon_input = 3
    _, horizon_output = agent.observe(
        terminal=terminal, reward=reward, query='horizon', **{'estimator/horizon': horizon_input}
    )
    self.assertEqual(horizon_output, np.asarray(horizon_input, dtype=util.np_dtype(dtype='long')))

    agent.close()
    environment.close()
    self.finished_test()
def tf_initialize(self):
    super().tf_initialize()

    # Value buffers
    self.buffers = OrderedDict()
    for name, spec in self.values_spec.items():
        if util.is_nested(name=name):
            self.buffers[name] = OrderedDict()
            for inner_name, inner_spec in spec.items():
                shape = (self.capacity,) + inner_spec['shape']
                self.buffers[name][inner_name] = self.add_variable(
                    name=(inner_name + '-buffer'), dtype=inner_spec['type'], shape=shape,
                    is_trainable=False
                )
        else:
            shape = (self.capacity,) + spec['shape']
            if name == 'terminal':
                # Terminal initialization has to agree with terminal_indices
                initializer = np.zeros(shape=(self.capacity,), dtype=util.np_dtype(dtype='long'))
                initializer[-1] = 1
                self.buffers[name] = self.add_variable(
                    name=(name + '-buffer'), dtype=spec['type'], shape=shape,
                    is_trainable=False, initializer=initializer
                )
            else:
                self.buffers[name] = self.add_variable(
                    name=(name + '-buffer'), dtype=spec['type'], shape=shape, is_trainable=False
                )

    # Buffer index (modulo capacity, next index to write to)
    self.buffer_index = self.add_variable(
        name='buffer-index', dtype='long', shape=(), is_trainable=False, initializer='zeros'
    )

    # Terminal indices
    # (oldest episode terminals first, initially the only terminal is the last index)
    initializer = np.zeros(shape=(self.capacity + 1,), dtype=util.np_dtype(dtype='long'))
    initializer[0] = self.capacity - 1
    self.terminal_indices = self.add_variable(
        name='terminal-indices', dtype='long', shape=(self.capacity + 1,), is_trainable=False,
        initializer=initializer
    )

    # Episode count
    self.episode_count = self.add_variable(
        name='episode-count', dtype='long', shape=(), is_trainable=False, initializer='zeros'
    )
def get_batch(self, batch_size):
    """
    Samples a batch of the specified size according to priority.

    Args:
        batch_size: Length of the sampled sequence.

    Returns:
        A dict containing states, actions, rewards, terminals and internal states
    """
    assert not self.batch_indices

    states = {
        name: np.zeros((batch_size,) + tuple(shape), dtype=dtype)
        for name, (shape, dtype) in self.state_spec.items()
    }
    actions = {
        name: np.zeros((batch_size,), dtype=dtype)
        for name, dtype in self.action_spec.items()
    }
    rewards = np.zeros((batch_size,), dtype=util.np_dtype('float'))
    terminals = np.zeros((batch_size,), dtype=util.np_dtype('bool'))
    internals = [np.zeros((batch_size,) + shape, dtype) for shape, dtype in self.internal_spec]

    # Return observations without an assigned priority first, then sample the
    # rest proportionally to priority, skipping already-sampled indices.
    zero_priority_index = self.positive_priority_index + 1
    for n in xrange(batch_size):
        if zero_priority_index < len(self.observations):
            _, observation = self.observations[zero_priority_index]
            index = zero_priority_index
            zero_priority_index += 1
        else:
            while True:
                sample = random()
                for index, (priority, observation) in enumerate(self.observations):
                    sample -= priority / self.sum_priorities
                    if sample < 0.0:
                        break
                if index not in self.batch_indices:
                    break

        for name, state in states.items():
            state[n] = observation[0][name]
        for name, action in actions.items():
            action[n] = observation[1][name]
        rewards[n] = observation[2]
        terminals[n] = observation[3]
        for k, internal in enumerate(internals):
            internal[n] = observation[4][k]
        self.batch_indices.append(index)

    return dict(
        states=states, actions=actions, rewards=rewards, terminals=terminals, internals=internals
    )
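# A standalone sketch (illustrative values, not from the source) of the
# proportional prioritized sampling the loop above performs: index i is drawn
# with probability priority[i] / sum(priorities).
import numpy as np

rng = np.random.default_rng(0)
priorities = np.array([0.1, 0.4, 0.2, 0.3])
probabilities = priorities / priorities.sum()
batch_indices = rng.choice(len(priorities), size=2, replace=False, p=probabilities)
print(batch_indices)  # e.g. [1 3]: higher-priority transitions are drawn more often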
def __init__(self, capacity, states_config, actions_config, random_sampling=False):
    super(Replay, self).__init__(capacity, states_config, actions_config)
    self.states = {
        name: np.zeros((capacity,) + tuple(state.shape), dtype=util.np_dtype(state.type))
        for name, state in states_config
    }
    self.actions = {
        name: np.zeros(
            (capacity,) + tuple(action.shape),
            dtype=util.np_dtype('float' if action.continuous else 'int')
        )
        for name, action in actions_config
    }
    self.rewards = np.zeros((capacity,), dtype=util.np_dtype('float'))
    self.terminals = np.zeros((capacity,), dtype=util.np_dtype('bool'))
    self.internals = None
    self.size = 0
    self.index = 0
    self.random_sampling = random_sampling
def reset(self):
    """
    Resets all agent buffers and discards unfinished episodes.
    """
    self.buffer_indices = np.zeros(
        shape=(self.parallel_interactions,), dtype=util.np_dtype(dtype='int')
    )
    self.timestep_completed = np.ones(
        shape=(self.parallel_interactions,), dtype=util.np_dtype(dtype='bool')
    )
    self.timesteps, self.episodes, self.updates = self.model.reset()
def __init__(self, capacity, states_config, actions_config):
    self.capacity = capacity
    self.states = {
        name: np.zeros((capacity,) + tuple(state.shape), dtype=util.np_dtype(state.type))
        for name, state in states_config
    }
    self.actions = {
        name: np.zeros((capacity,), dtype=util.np_dtype('float' if action.continuous else 'int'))
        for name, action in actions_config
    }
    self.rewards = np.zeros((capacity,), dtype=util.np_dtype('float'))
    self.terminals = np.zeros((capacity,), dtype=util.np_dtype('bool'))
    self.internals = None
    self.size = 0
    self.index = 0
def __init__(self, capacity, states_config, actions_config, prioritization_weight=1.0):
    self.capacity = capacity
    self.prioritization_weight = prioritization_weight
    self.state_spec = {
        name: (tuple(state.shape), util.np_dtype(state.type))
        for name, state in states_config
    }
    self.action_spec = {
        name: util.np_dtype('float' if action.continuous else 'int')
        for name, action in actions_config
    }
    self.internal_spec = None
    self.observations = list()
    self.sum_priorities = 0.0
    self.positive_priority_index = -1
    self.batch_indices = list()
def reset(self):
    """
    Resets the agent to start a new episode.
    """
    self.buffer_indices = np.zeros(
        shape=(self.parallel_interactions,), dtype=util.np_dtype(dtype='int')
    )
    self.timesteps, self.episodes, self.updates = self.model.reset()
def reset(self, independent=False, evaluation=False):
    """
    Resets all agent buffers, or only terminates and resets an episode in
    independent/evaluation mode if the corresponding argument is set.

    Args:
        independent (bool): Whether to terminate an episode in independent mode
            (<span style="color:#00C000"><b>default</b></span>: false).
        evaluation (bool): Whether to terminate an episode in evaluation mode, implies
            independent (<span style="color:#00C000"><b>default</b></span>: false).
    """
    if evaluation:
        if independent:
            raise TensorforceError(
                message="Agent.reset argument independent is implied by and thus should not "
                        "be used together with argument evaluation."
            )
        independent = True

    if not independent:
        self.buffer_indices = np.zeros(
            shape=(self.parallel_interactions,), dtype=util.np_dtype(dtype='int')
        )

    self.timesteps, self.episodes, self.updates = self.model.reset(independent=independent)
def float_unittest(self, exploration):
    agent, environment = self.prepare(min_timesteps=3, exploration=exploration)

    states = environment.reset()
    actions, exploration_output1 = agent.act(states=states, query='exploration')
    self.assertIsInstance(exploration_output1, util.np_dtype(dtype='float'))
    states, terminal, reward = environment.execute(actions=actions)
    agent.observe(terminal=terminal, reward=reward)

    actions, exploration_output2 = agent.act(states=states, query='exploration')
    if not isinstance(exploration, dict) or exploration['type'] == 'constant':
        self.assertEqual(exploration_output2, exploration_output1)
    else:
        self.assertNotEqual(exploration_output2, exploration_output1)
    states, terminal, reward = environment.execute(actions=actions)
    agent.observe(terminal=terminal, reward=reward)

    exploration_input = 0.5
    actions, exploration_output = agent.act(
        states=states, query='exploration', exploration=exploration_input
    )
    self.assertEqual(exploration_output, exploration_input)

    agent.close()
    environment.close()
    self.finished_test()
def parameter_unittest(self, name, exploration):
    states = dict(type='float', shape=(1,))
    actions = dict(type='int', shape=(), num_values=3)

    agent, environment = self.prepare(
        name=name, states=states, actions=actions, exploration=exploration
    )
    agent.initialize()
    states = environment.reset()

    actions, exploration_output1 = agent.act(states=states, query='exploration')
    self.assertIsInstance(exploration_output1, util.np_dtype(dtype='float'))
    states, terminal, reward = environment.execute(actions=actions)
    agent.observe(terminal=terminal, reward=reward)

    if name != 'constant':
        actions, exploration_output2 = agent.act(states=states, query='exploration')
        self.assertNotEqual(exploration_output2, exploration_output1)
        states, terminal, reward = environment.execute(actions=actions)
        agent.observe(terminal=terminal, reward=reward)

    exploration_input = 0.5
    actions, exploration_output = agent.act(
        states=states, query='exploration', exploration=exploration_input
    )
    self.assertEqual(exploration_output, exploration_input)

    agent.close()
    environment.close()

    sys.stdout.flush()
    self.assertTrue(expr=True)
def reset(self):
    """
    Resets possibly inconsistent internal values, for instance, after saving and restoring an
    agent. Automatically triggered as part of Agent.create/load/initialize/restore.
    """
    # Reset timestep completed
    self.timestep_completed = np.ones(
        shape=(self.parallel_interactions,), dtype=util.np_dtype(dtype='bool')
    )

    # Reset buffers
    for buffer in self.buffers.values():
        for x in buffer:
            x.clear()
    if self.recorder_spec is not None:
        for x in self.recorded.values():
            x.clear()

    # Reset model
    timesteps, episodes, updates = self.model.reset()
    self.timesteps = timesteps.numpy().item()
    self.episodes = episodes.numpy().item()
    self.updates = updates.numpy().item()

    if self.model.saver is not None:
        self.model.save()
def initialize(self):
    """
    Initializes the agent.
    """
    self.is_initialized = True

    # Parallel terminal/reward buffers
    self.terminal_buffers = np.ndarray(
        shape=(self.parallel_interactions, self.buffer_observe),
        dtype=util.np_dtype(dtype='long')
    )
    self.reward_buffers = np.ndarray(
        shape=(self.parallel_interactions, self.buffer_observe),
        dtype=util.np_dtype(dtype='float')
    )

    # Parallel buffer indices
    self.buffer_indices = np.zeros(
        shape=(self.parallel_interactions,), dtype=util.np_dtype(dtype='int')
    )

    self.timesteps = 0
    self.episodes = 0
    self.updates = 0

    # Recorder buffers if required
    if self.recorder_spec is not None:
        self.record_states = OrderedDict(((name, list()) for name in self.states_spec))
        for name, spec in self.actions_spec.items():
            if spec['type'] == 'int':
                self.record_states[name + '_mask'] = list()
        self.record_actions = OrderedDict(((name, list()) for name in self.actions_spec))
        self.record_terminal = list()
        self.record_reward = list()
        self.num_episodes = 0

    # Setup Model
    if not hasattr(self, 'model'):
        raise TensorforceError.missing(name='Agent', value='model')
    self.model.initialize()

    if self.model.saver_directory is not None:
        file = os.path.join(self.model.saver_directory, self.model.saver_filename + '.json')
        with open(file, 'w') as fp:
            json.dump(obj=self.spec, fp=fp)

    self.reset()
def __init__(self, capacity, states_config, actions_config):
    capacity = int(capacity)

    self.states = dict()
    for name, state in states_config:
        self.states[name] = np.zeros(
            (capacity,) + tuple(state.shape), dtype=util.np_dtype(state.type)
        )

    self.actions = dict()
    for name, action in actions_config:
        dtype = util.np_dtype('float' if action.continuous else 'int')
        self.actions[name] = np.zeros((capacity,), dtype=dtype)

    self.rewards = np.zeros((capacity,), dtype=util.np_dtype('float'))
    self.terminals = np.zeros((capacity,), dtype=util.np_dtype('bool'))
    self.internals = None
    self.capacity = capacity
    self.size = 0
    self.index = 0
def initialize(self):
    # Check whether already initialized
    if self.is_initialized:
        raise TensorforceError(
            message="Agent is already initialized, possibly as part of Agent.create()."
        )
    self.is_initialized = True

    # Act-observe timestep check
    self.timestep_counter = np.zeros(
        shape=(self.parallel_interactions,), dtype=util.np_dtype(dtype='int')
    )
    self.timestep_completed = np.ones(
        shape=(self.parallel_interactions,), dtype=util.np_dtype(dtype='bool')
    )

    # Recorder buffers if required
    if self.recorder is not None:
        self.num_episodes = 0

        self.buffers = ListDict()
        self.buffers['terminal'] = [list() for _ in range(self.parallel_interactions)]
        self.buffers['reward'] = [list() for _ in range(self.parallel_interactions)]

        def function(spec):
            return [list() for _ in range(self.parallel_interactions)]

        self.buffers['states'] = self.states_spec.fmap(function=function, cls=ListDict)
        self.buffers['actions'] = self.actions_spec.fmap(function=function, cls=ListDict)

        function = (lambda x: list())
        self.recorded = ListDict()
        self.recorded['states'] = self.states_spec.fmap(function=function, cls=ListDict)
        self.recorded['actions'] = self.actions_spec.fmap(function=function, cls=ListDict)
        self.recorded['terminal'] = list()
        self.recorded['reward'] = list()
def __init__(self, name, capacity, values_spec, device=None, summary_labels=None):
    # Terminal initialization has to agree with terminal_indices
    terminal_initializer = np.zeros(shape=(capacity,), dtype=util.np_dtype(dtype='long'))
    terminal_initializer[-1] = 1
    initializers = OrderedDict(terminal=terminal_initializer)

    super().__init__(
        name=name, capacity=capacity, values_spec=values_spec, initializers=initializers,
        device=device, summary_labels=summary_labels
    )
def internals_init(self):
    internals_init = super().internals_init()

    if self.cell_type == 'gru':
        shape = (self.size,)
    elif self.cell_type == 'lstm':
        shape = (2, self.size)

    stddev = min(0.1, np.sqrt(2.0 / self.size))
    internals_init['state'] = np.random.normal(scale=stddev, size=shape).astype(
        util.np_dtype(dtype='float')
    )

    return internals_init
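# An illustrative check (not from the source) of the shapes produced above,
# assuming a hypothetical layer size of 8: a GRU keeps one state vector, an
# LSTM keeps two (cell and hidden state); the scale is a He-style stddev
# capped at 0.1.
import numpy as np

size = 8
stddev = min(0.1, np.sqrt(2.0 / size))
gru_state = np.random.normal(scale=stddev, size=(size,)).astype(np.float32)
lstm_state = np.random.normal(scale=stddev, size=(2, size)).astype(np.float32)
print(gru_state.shape, lstm_state.shape)  # (8,) (2, 8)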
def pre_run(self, agent, environment):
    demonstrations = list()
    agent.reset()
    internals = agent.current_internals
    terminal = True

    for n in xrange(50):
        if terminal:
            state = environment.reset()

        actions = dict()
        # Create demonstration actions of the right shape.
        if 'type' in environment.actions:
            if environment.actions['type'] == 'bool':
                actions = np.full(
                    shape=(), fill_value=True, dtype=util.np_dtype(environment.actions['type'])
                )
            elif environment.actions['type'] == 'int':
                actions = np.full(
                    shape=(), fill_value=1, dtype=util.np_dtype(environment.actions['type'])
                )
            elif environment.actions['type'] == 'float':
                actions = np.full(
                    shape=(), fill_value=1.0, dtype=util.np_dtype(environment.actions['type'])
                )
        else:
            for name, action in environment.actions.items():
                if action['type'] == 'bool':
                    actions[name] = np.full(
                        shape=action['shape'], fill_value=True,
                        dtype=util.np_dtype(action['type'])
                    )
                elif action['type'] == 'int':
                    actions[name] = np.full(
                        shape=action['shape'], fill_value=1, dtype=util.np_dtype(action['type'])
                    )
                elif action['type'] == 'float':
                    actions[name] = np.full(
                        shape=action['shape'], fill_value=1.0,
                        dtype=util.np_dtype(action['type'])
                    )

        state, terminal, reward = environment.execute(actions=actions)
        demonstration = dict(
            states=state, internals=internals, actions=actions, terminal=terminal, reward=reward
        )
        demonstrations.append(demonstration)

    agent.import_demonstrations(demonstrations)
    agent.pretrain(steps=1000)
def initialize(self):
    super().initialize()

    # Value buffers
    def function(name, spec):
        spec = TensorSpec(type=spec.type, shape=((self.capacity,) + spec.shape))
        if name == 'terminal':
            # Terminal initialization has to agree with terminal_indices
            initializer = np.zeros(shape=(self.capacity,), dtype=spec.np_type())
            initializer[-1] = 1
        else:
            initializer = 'zeros'
        return self.variable(
            name=(name.replace('/', '_') + '-buffer'), spec=spec, initializer=initializer,
            is_trainable=False, is_saved=True
        )

    self.buffers = self.values_spec.fmap(function=function, cls=VariableDict, with_names=True)

    # Buffer index (modulo capacity, next index to write to)
    self.buffer_index = self.variable(
        name='buffer-index', spec=TensorSpec(type='int'), initializer='zeros',
        is_trainable=False, is_saved=True
    )

    # Terminal indices
    # (oldest episode terminals first, initially the only terminal is the last index)
    initializer = np.zeros(shape=(self.capacity + 1,), dtype=util.np_dtype(dtype='int'))
    initializer[0] = self.capacity - 1
    self.terminal_indices = self.variable(
        name='terminal-indices', spec=TensorSpec(type='int', shape=(self.capacity + 1,)),
        initializer=initializer, is_trainable=False, is_saved=True
    )

    # Episode count
    self.episode_count = self.variable(
        name='episode-count', spec=TensorSpec(type='int'), initializer='zeros',
        is_trainable=False, is_saved=True
    )
def tf_initialize(self):
    super().tf_initialize()

    # Terminal indices
    # (oldest episode terminals first, initially the only terminal is the last index)
    initializer = np.zeros(shape=(self.capacity + 1,), dtype=util.np_dtype(dtype='long'))
    initializer[0] = self.capacity - 1
    self.terminal_indices = self.add_variable(
        name='terminal-indices', dtype='long', shape=(self.capacity + 1,), is_trainable=False,
        initializer=initializer
    )

    # Episode count
    self.episode_count = self.add_variable(
        name='episode-count', dtype='long', shape=(), is_trainable=False, initializer='zeros'
    )
def is_valid_action_function(cls, action_spec):
    dtype = action_spec['type']
    shape = action_spec.get('shape', ())

    if dtype == 'bool':
        return (lambda action, name, states: (
            (isinstance(action, util.py_dtype('bool')) and shape == ()) or
            (isinstance(action, np.ndarray) and action.dtype == util.np_dtype('bool') and
                action.shape == shape)
        ))

    elif dtype == 'int':
        num_values = action_spec['num_values']
        return (lambda action, name, states: (
            (isinstance(action, util.py_dtype('int')) and shape == () and
                0 <= action and action < num_values and states[name + '_mask'][action]) or
            (isinstance(action, np.ndarray) and action.dtype == util.np_dtype('int') and
                action.shape == shape and (0 <= action).all() and
                (action < num_values).all() and np.take_along_axis(
                    states[name + '_mask'], indices=np.expand_dims(action, axis=-1), axis=-1
                ).all())
        ))

    elif dtype == 'float':
        if 'min_value' in action_spec:
            min_value = action_spec['min_value']
            max_value = action_spec['max_value']
            return (lambda action, name, states: (
                (isinstance(action, util.py_dtype('float')) and shape == () and
                    min_value <= action and action <= max_value) or
                (isinstance(action, np.ndarray) and action.dtype == util.np_dtype('float') and
                    action.shape == shape and (min_value <= action).all() and
                    (action <= max_value).all())
            ))

        else:
            return (lambda action, name, states: (
                (isinstance(action, util.py_dtype('float')) and shape == ()) or
                (isinstance(action, np.ndarray) and action.dtype == util.np_dtype('float') and
                    action.shape == shape)
            ))
def is_valid_action_function(cls, action_spec):
    dtype = action_spec['type']
    shape = action_spec.get('shape', ())

    if dtype == 'bool':
        return (lambda action: (
            (isinstance(action, util.np_dtype('bool')) and shape == ()) or
            (isinstance(action, np.ndarray) and action.dtype == util.np_dtype('bool') and
                action.shape == shape)
        ))

    elif dtype == 'int':
        num_values = action_spec['num_values']
        return (lambda action: (
            ((isinstance(action, util.np_dtype('int')) and shape == ()) or
                (isinstance(action, np.ndarray) and action.dtype == util.np_dtype('int') and
                    action.shape == shape)) and
            (0 <= action).all() and (action < num_values).all()
        ))

    elif dtype == 'float':
        if 'min_value' in action_spec:
            min_value = action_spec['min_value']
            max_value = action_spec['max_value']
            return (lambda action: (
                ((isinstance(action, util.np_dtype('float')) and shape == ()) or
                    (isinstance(action, np.ndarray) and
                        action.dtype == util.np_dtype('float') and
                        action.shape == shape)) and
                (min_value <= action).all() and (action <= max_value).all()
            ))

        else:
            return (lambda action: (
                (isinstance(action, util.np_dtype('float')) and shape == ()) or
                (isinstance(action, np.ndarray) and action.dtype == util.np_dtype('float') and
                    action.shape == shape)
            ))
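# A standalone sketch (not from the source) mirroring the bounded-float branch
# above, assuming util.np_dtype('float') resolves to np.float32; spec values
# are illustrative.
import numpy as np

spec = dict(type='float', shape=(2,), min_value=-1.0, max_value=1.0)
is_valid = lambda action: (
    isinstance(action, np.ndarray) and action.dtype == np.float32 and
    action.shape == spec['shape'] and
    (spec['min_value'] <= action).all() and (action <= spec['max_value']).all()
)
print(is_valid(np.array([0.5, -0.5], dtype=np.float32)))  # True
print(is_valid(np.array([2.0, 0.0], dtype=np.float32)))   # False: exceeds max_value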
def experience(self, states, actions, terminal, reward, internals=None, query=None, **kwargs):
    """
    Feed experience traces.

    Args:
        states (dict[array[state]]): Dictionary containing arrays of states
            (<span style="color:#C00000"><b>required</b></span>).
        actions (dict[array[action]]): Dictionary containing arrays of actions
            (<span style="color:#C00000"><b>required</b></span>).
        terminal (array[bool]): Array of terminals
            (<span style="color:#C00000"><b>required</b></span>).
        reward (array[float]): Array of rewards
            (<span style="color:#C00000"><b>required</b></span>).
        internals (dict[state]): Dictionary containing arrays of internal agent states
            (<span style="color:#00C000"><b>default</b></span>: no internal states).
        query (list[str]): Names of tensors to retrieve
            (<span style="color:#00C000"><b>default</b></span>: none).
        kwargs: Additional input values, for instance, for dynamic hyperparameters.
    """
    assert (self.buffer_indices == 0).all()
    assert util.reduce_all(predicate=util.not_nan_inf, xs=states)
    assert internals is None or util.reduce_all(predicate=util.not_nan_inf, xs=internals)
    assert util.reduce_all(predicate=util.not_nan_inf, xs=actions)
    assert util.reduce_all(predicate=util.not_nan_inf, xs=reward)

    # Auxiliaries
    auxiliaries = OrderedDict()
    if isinstance(states, dict):
        for name, spec in self.actions_spec.items():
            if spec['type'] == 'int' and name + '_mask' in states:
                auxiliaries[name + '_mask'] = np.asarray(states.pop(name + '_mask'))
    auxiliaries = util.fmap(function=np.asarray, xs=auxiliaries, depth=1)

    # Normalize states/actions dictionaries
    states = util.normalize_values(
        value_type='state', values=states, values_spec=self.states_spec
    )
    actions = util.normalize_values(
        value_type='action', values=actions, values_spec=self.actions_spec
    )
    if internals is None:
        internals = OrderedDict()

    # Expand inputs if not batched
    if isinstance(terminal, (bool, int)):
        states = util.fmap(function=(lambda x: [x]), xs=states, depth=1)
        internals = util.fmap(function=(lambda x: [x]), xs=internals, depth=1)
        auxiliaries = util.fmap(function=(lambda x: [x]), xs=auxiliaries, depth=1)
        actions = util.fmap(function=(lambda x: [x]), xs=actions, depth=1)
        terminal = [terminal]
        reward = [reward]

    states = util.fmap(function=np.asarray, xs=states, depth=1)
    internals = util.fmap(function=np.asarray, xs=internals, depth=1)
    auxiliaries = util.fmap(function=np.asarray, xs=auxiliaries, depth=1)
    actions = util.fmap(function=np.asarray, xs=actions, depth=1)

    # Convert terminal to int if necessary
    if isinstance(terminal, np.ndarray):
        if terminal.dtype is util.np_dtype(dtype='bool'):
            zeros = np.zeros_like(terminal, dtype=util.np_dtype(dtype='long'))
            ones = np.ones_like(terminal, dtype=util.np_dtype(dtype='long'))
            terminal = np.where(terminal, ones, zeros)
    else:
        terminal = np.asarray([int(x) if isinstance(x, bool) else x for x in terminal])
    reward = np.asarray(reward)

    # Batch experiences split into episodes and at most size buffer_observe
    last = 0
    for index in range(1, len(terminal) + 1):
        if terminal[index - 1] == 0 and index - last < self.experience_size:
            continue

        # Include terminal in batch if possible
        if index < len(terminal) and terminal[index - 1] == 0 and terminal[index] > 0 and \
                index - last < self.experience_size:
            index += 1

        function = (lambda x: x[last: index])
        states_batch = util.fmap(function=function, xs=states, depth=1)
        internals_batch = util.fmap(function=function, xs=internals, depth=1)
        auxiliaries_batch = util.fmap(function=function, xs=auxiliaries, depth=1)
        actions_batch = util.fmap(function=function, xs=actions, depth=1)
        terminal_batch = terminal[last: index]
        reward_batch = reward[last: index]
        last = index

        # Model.experience()
        if query is None:
            self.timesteps, self.episodes, self.updates = self.model.experience(
                states=states_batch, internals=internals_batch,
                auxiliaries=auxiliaries_batch, actions=actions_batch,
                terminal=terminal_batch, reward=reward_batch, **kwargs
            )
        else:
            self.timesteps, self.episodes, self.updates, queried = self.model.experience(
                states=states_batch, internals=internals_batch,
                auxiliaries=auxiliaries_batch, actions=actions_batch,
                terminal=terminal_batch, reward=reward_batch, query=query, **kwargs
            )

    if query is not None:
        return queried
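# A minimal usage sketch of the experience() API above, assuming a
# hypothetical agent with a single float state named 'state' and an int
# action named 'action'; array values are illustrative. One full episode of
# three timesteps is fed at once, with the last entry terminal.
import numpy as np

agent.experience(
    states=dict(state=np.array([[0.1], [0.2], [0.3]], dtype=np.float32)),
    actions=dict(action=np.array([0, 2, 1])),
    terminal=np.array([False, False, True]),
    reward=np.array([0.0, 0.0, 1.0], dtype=np.float32),
)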
def act(
    self, states, internals=None, parallel=0, independent=False, deterministic=False,
    evaluation=False, query=None, **kwargs
):
    """
    Returns action(s) for the given state(s), needs to be followed by `observe(...)` unless
    independent mode is set via `independent`/`evaluation`.

    Args:
        states (dict[state] | iter[dict[state]]): Dictionary containing state(s) to be acted on
            (<span style="color:#C00000"><b>required</b></span>).
        internals (dict[internal] | iter[dict[internal]]): Dictionary containing current
            internal agent state(s)
            (<span style="color:#C00000"><b>required</b></span> if independent mode).
        parallel (int | iter[int]): Parallel execution index
            (<span style="color:#00C000"><b>default</b></span>: 0).
        independent (bool): Whether act is not part of the main agent-environment interaction,
            and this call is thus not followed by observe
            (<span style="color:#00C000"><b>default</b></span>: false).
        deterministic (bool): For independent mode, whether to act deterministically, so no
            exploration and sampling (<span style="color:#00C000"><b>default</b></span>: false).
        evaluation (bool): Whether the agent is currently evaluated, implies independent and
            deterministic (<span style="color:#00C000"><b>default</b></span>: false).
        query (list[str]): Names of tensors to retrieve
            (<span style="color:#00C000"><b>default</b></span>: none).
        kwargs: Additional input values, for instance, for dynamic hyperparameters.

    Returns:
        dict[action] | iter[dict[action]], if independent mode dict[internal] |
        iter[dict[internal]], plus optional list[str]: Dictionary containing action(s),
        dictionary containing next internal agent state(s) if independent mode, plus queried
        tensor values if requested.
    """
    assert util.reduce_all(predicate=util.not_nan_inf, xs=states)

    if evaluation:
        if deterministic:
            raise TensorforceError.invalid(
                name='agent.act', argument='deterministic', condition='evaluation = true'
            )
        if independent:
            raise TensorforceError.invalid(
                name='agent.act', argument='independent', condition='evaluation = true'
            )
        deterministic = independent = True

    if not independent:
        if internals is not None:
            raise TensorforceError.invalid(
                name='agent.act', argument='internals', condition='independent = false'
            )
        if deterministic:
            raise TensorforceError.invalid(
                name='agent.act', argument='deterministic', condition='independent = false'
            )

    if independent:
        internals_is_none = (internals is None)
        if internals_is_none:
            internals = OrderedDict()

    # Batch states
    batched = (not isinstance(parallel, int))
    if batched:
        if len(parallel) == 0:
            raise TensorforceError.value(
                name='agent.act', argument='parallel', value=parallel, hint='zero-length'
            )
        parallel = np.asarray(list(parallel))
        if isinstance(states[0], dict):
            states = OrderedDict((
                (name, np.asarray([states[n][name] for n in range(len(parallel))]))
                for name in states[0]
            ))
        else:
            states = np.asarray(states)
        if independent:
            internals = OrderedDict((
                (name, np.asarray([internals[n][name] for n in range(len(parallel))]))
                for name in internals[0]
            ))
    else:
        parallel = np.asarray([parallel])
        states = util.fmap(
            function=(lambda x: np.asarray([x])), xs=states, depth=int(isinstance(states, dict))
        )
        if independent:
            internals = util.fmap(function=(lambda x: np.asarray([x])), xs=internals, depth=1)

    if not independent and not all(self.timestep_completed[n] for n in parallel):
        raise TensorforceError(message="Calling agent.act must be preceded by agent.observe.")

    # Auxiliaries
    auxiliaries = OrderedDict()
    if isinstance(states, dict):
        states = dict(states)
        for name, spec in self.actions_spec.items():
            if spec['type'] == 'int' and name + '_mask' in states:
                auxiliaries[name + '_mask'] = states.pop(name + '_mask')

    # Normalize states dictionary
    states = util.normalize_values(
        value_type='state', values=states, values_spec=self.states_spec
    )

    # Model.act()
    if independent:
        if query is None:
            actions, internals = self.model.independent_act(
                states=states, internals=internals, auxiliaries=auxiliaries, parallel=parallel,
                deterministic=deterministic, **kwargs
            )
        else:
            actions, internals, queried = self.model.independent_act(
                states=states, internals=internals, auxiliaries=auxiliaries, parallel=parallel,
                deterministic=deterministic, query=query, **kwargs
            )
    else:
        if query is None:
            actions, self.timesteps = self.model.act(
                states=states, auxiliaries=auxiliaries, parallel=parallel, **kwargs
            )
        else:
            actions, self.timesteps, queried = self.model.act(
                states=states, auxiliaries=auxiliaries, parallel=parallel, query=query, **kwargs
            )

    if not independent:
        for n in parallel:
            self.timestep_completed[n] = False

    if self.recorder_spec is not None and not independent and \
            self.episodes >= self.recorder_spec.get('start', 0):
        for n in range(len(parallel)):
            index = self.buffer_indices[parallel[n]]
            for name in self.states_spec:
                self.states_buffers[name][parallel[n], index] = states[name][n]
            for name, spec in self.actions_spec.items():
                self.actions_buffers[name][parallel[n], index] = actions[name][n]
                if spec['type'] == 'int':
                    name = name + '_mask'
                    if name in auxiliaries:
                        self.states_buffers[name][parallel[n], index] = auxiliaries[name][n]
                    else:
                        shape = (1,) + spec['shape'] + (spec['num_values'],)
                        self.states_buffers[name][parallel[n], index] = np.full(
                            shape=shape, fill_value=True, dtype=util.np_dtype(dtype='bool')
                        )

    # Reverse normalized actions dictionary
    actions = util.unpack_values(
        value_type='action', values=actions, values_spec=self.actions_spec
    )

    # Unbatch actions
    if batched:
        if isinstance(actions, dict):
            actions = [
                OrderedDict(((name, actions[name][n]) for name in actions))
                for n in range(len(parallel))
            ]
    else:
        actions = util.fmap(
            function=(lambda x: x[0]), xs=actions, depth=int(isinstance(actions, dict))
        )
        if independent:
            internals = util.fmap(function=(lambda x: x[0]), xs=internals, depth=1)

    if independent and not internals_is_none:
        if query is None:
            return actions, internals
        else:
            return actions, internals, queried
    else:
        if query is None:
            return actions
        else:
            return actions, queried
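# A hedged sketch of independent-mode usage for the act() variant above,
# assuming the agent exposes initial_internals() (as later Tensorforce
# versions do) and a hypothetical environment: internals are threaded through
# manually and no observe() call follows.
internals = agent.initial_internals()
states = environment.reset()
actions, internals = agent.act(states=states, internals=internals, independent=True)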
def initialize(self):
    """
    Initializes the agent, usually done as part of Agent.create/load.
    """
    if self.is_initialized:
        raise TensorforceError(
            message="Agent is already initialized, possibly as part of Agent.create()."
        )
    self.is_initialized = True

    # Parallel terminal/reward buffers
    self.terminal_buffers = np.ndarray(
        shape=(self.parallel_interactions, self.buffer_observe),
        dtype=util.np_dtype(dtype='long')
    )
    self.reward_buffers = np.ndarray(
        shape=(self.parallel_interactions, self.buffer_observe),
        dtype=util.np_dtype(dtype='float')
    )

    # Recorder buffers if required
    if self.recorder_spec is not None:
        self.states_buffers = OrderedDict()
        self.actions_buffers = OrderedDict()
        for name, spec in self.states_spec.items():
            shape = (self.parallel_interactions, self.buffer_observe) + spec['shape']
            self.states_buffers[name] = np.ndarray(
                shape=shape, dtype=util.np_dtype(dtype=spec['type'])
            )
        for name, spec in self.actions_spec.items():
            shape = (self.parallel_interactions, self.buffer_observe) + spec['shape']
            self.actions_buffers[name] = np.ndarray(
                shape=shape, dtype=util.np_dtype(dtype=spec['type'])
            )
            if spec['type'] == 'int':
                shape = (self.parallel_interactions, self.buffer_observe) + spec['shape'] + \
                    (spec['num_values'],)
                self.states_buffers[name + '_mask'] = np.ndarray(
                    shape=shape, dtype=util.np_dtype(dtype='bool')
                )

        self.num_episodes = 0
        self.record_states = OrderedDict(((name, list()) for name in self.states_spec))
        self.record_actions = OrderedDict(((name, list()) for name in self.actions_spec))
        for name, spec in self.actions_spec.items():
            if spec['type'] == 'int':
                self.record_states[name + '_mask'] = list()
        self.record_terminal = list()
        self.record_reward = list()

    # Parallel buffer indices
    self.buffer_indices = np.zeros(
        shape=(self.parallel_interactions,), dtype=util.np_dtype(dtype='int')
    )
    self.timestep_completed = np.ndarray(
        shape=(self.parallel_interactions,), dtype=util.np_dtype(dtype='bool')
    )

    self.timesteps = 0
    self.episodes = 0
    self.updates = 0

    # Setup Model
    if not hasattr(self, 'model'):
        raise TensorforceError(message="Missing agent attribute model.")
    self.model.initialize()

    if self.model.saver_directory is not None:
        file = os.path.join(self.model.saver_directory, self.model.saver_filename + '.json')
        try:
            with open(file, 'w') as fp:
                json.dump(obj=self.spec, fp=fp, cls=NumpyJSONEncoder)
        except BaseException:
            os.remove(file)

    self.reset()
def np_type(self):
    return util.np_dtype(dtype=self.type)
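# For reference, a minimal sketch of what a np_dtype mapping like the one
# assumed throughout these snippets typically looks like; the concrete dtypes
# here are an assumption, not taken from the source.
import numpy as np

np_dtype_mapping = dict(bool=np.bool_, int=np.int64, long=np.int64, float=np.float32)

def np_dtype(dtype):
    # Return the NumPy dtype registered for a type string such as 'float'.
    if dtype in np_dtype_mapping:
        return np_dtype_mapping[dtype]
    raise KeyError('unknown type string: ' + dtype)

assert np_dtype('float') is np.float32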
def observe(self, reward=0.0, terminal=False, parallel=0):
    # Check whether inputs are batched
    if util.is_iterable(x=reward) or (isinstance(reward, np.ndarray) and reward.ndim > 0):
        reward = np.asarray(reward)
        num_parallel = reward.shape[0]
        if not isinstance(terminal, np.ndarray) and terminal is False:
            terminal = np.asarray([0 for _ in range(num_parallel)])
        else:
            terminal = np.asarray(terminal)
        if not isinstance(parallel, np.ndarray) and parallel == 0:
            assert num_parallel == self.parallel_interactions
            parallel = np.asarray(list(range(num_parallel)))
        else:
            parallel = np.asarray(parallel)

    elif util.is_iterable(x=terminal) or \
            (isinstance(terminal, np.ndarray) and terminal.ndim > 0):
        terminal = np.asarray(terminal, dtype=util.np_dtype(dtype='int'))
        num_parallel = terminal.shape[0]
        if not isinstance(reward, np.ndarray) and reward == 0.0:
            reward = np.asarray([0.0 for _ in range(num_parallel)])
        else:
            reward = np.asarray(reward)
        if not isinstance(parallel, np.ndarray) and parallel == 0:
            assert num_parallel == self.parallel_interactions
            parallel = np.asarray(list(range(num_parallel)))
        else:
            parallel = np.asarray(parallel)

    elif util.is_iterable(x=parallel) or \
            (isinstance(parallel, np.ndarray) and parallel.ndim > 0):
        parallel = np.asarray(parallel)
        num_parallel = parallel.shape[0]
        if not isinstance(reward, np.ndarray) and reward == 0.0:
            reward = np.asarray([0.0 for _ in range(num_parallel)])
        else:
            reward = np.asarray(reward)
        if not isinstance(terminal, np.ndarray) and terminal is False:
            terminal = np.asarray([0 for _ in range(num_parallel)])
        else:
            terminal = np.asarray(terminal)

    else:
        reward = np.asarray([float(reward)])
        terminal = np.asarray([int(terminal)])
        parallel = np.asarray([int(parallel)])
        num_parallel = 1

    # Check whether shapes/lengths are consistent
    if parallel.shape[0] == 0:
        raise TensorforceError.value(
            name='Agent.observe', argument='len(parallel)', value=parallel.shape[0], hint='= 0'
        )
    if reward.shape != parallel.shape:
        raise TensorforceError.value(
            name='Agent.observe', argument='len(reward)', value=reward.shape,
            hint='!= parallel length'
        )
    if terminal.shape != parallel.shape:
        raise TensorforceError.value(
            name='Agent.observe', argument='len(terminal)', value=terminal.shape,
            hint='!= parallel length'
        )

    # Convert terminal to int if necessary
    if terminal.dtype is util.np_dtype(dtype='bool'):
        zeros = np.zeros_like(terminal, dtype=util.np_dtype(dtype='int'))
        ones = np.ones_like(terminal, dtype=util.np_dtype(dtype='int'))
        terminal = np.where(terminal, ones, zeros)

    # Check whether current timesteps are not completed
    if self.timestep_completed[parallel].any():
        raise TensorforceError(message="Calling agent.observe must be preceded by agent.act.")
    self.timestep_completed[parallel] = True

    # Check whether episode is too long
    self.timestep_counter[parallel] += 1
    if self.max_episode_timesteps is not None and np.logical_and(
        terminal == 0, self.timestep_counter[parallel] > self.max_episode_timesteps
    ).any():
        raise TensorforceError(message="Episode longer than max_episode_timesteps.")
    self.timestep_counter[parallel] = np.where(
        terminal > 0, 0, self.timestep_counter[parallel]
    )

    if self.recorder is None:
        pass

    elif self.num_episodes < self.recorder.get('start', 0):
        # Increment num_episodes
        for t in terminal.tolist():
            if t > 0:
                self.num_episodes += 1

    else:
        # Store values per parallel interaction
        for p, t, r in zip(parallel.tolist(), terminal.tolist(), reward.tolist()):

            # Buffer terminal/reward inputs
            self.buffers['terminal'][p].append(t)
            self.buffers['reward'][p].append(r)

            # Continue if not terminal
            if t == 0:
                continue
            self.num_episodes += 1

            # Move buffered episode values into the recorded lists
            for name in self.states_spec:
                self.recorded['states'][name].append(
                    np.stack(self.buffers['states'][name][p], axis=0)
                )
                self.buffers['states'][name][p].clear()
            for name, spec in self.actions_spec.items():
                self.recorded['actions'][name].append(
                    np.stack(self.buffers['actions'][name][p], axis=0)
                )
                self.buffers['actions'][name][p].clear()
            self.recorded['terminal'].append(
                np.array(self.buffers['terminal'][p], dtype=self.terminal_spec.np_type())
            )
            self.buffers['terminal'][p].clear()
            self.recorded['reward'].append(
                np.array(self.buffers['reward'][p], dtype=self.reward_spec.np_type())
            )
            self.buffers['reward'][p].clear()

            # Check whether recording step
            if (self.num_episodes - self.recorder.get('start', 0)) \
                    % self.recorder.get('frequency', 1) != 0:
                continue

            # Manage recorder directory
            directory = self.recorder['directory']
            if os.path.isdir(directory):
                files = sorted(
                    f for f in os.listdir(directory)
                    if os.path.isfile(os.path.join(directory, f)) and
                    os.path.splitext(f)[1] == '.npz'
                )
            else:
                os.makedirs(directory)
                files = list()
            max_traces = self.recorder.get('max-traces')
            if max_traces is not None and len(files) > max_traces - 1:
                for filename in files[:-max_traces + 1]:
                    filename = os.path.join(directory, filename)
                    os.remove(filename)

            # Write recording file
            filename = os.path.join(
                directory, 'trace-{:09d}.npz'.format(self.num_episodes - 1)
            )  # time.strftime('%Y%m%d-%H%M%S')
            kwargs = self.recorded.fmap(function=np.concatenate, cls=ArrayDict).items()
            np.savez_compressed(file=filename, **dict(kwargs))

            # Clear recorded values
            for recorded in self.recorded.values():
                recorded.clear()

    if self._is_agent:
        return reward, terminal, parallel
    else:
        return 0
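# A minimal act-observe loop exercising the method above, assuming a
# hypothetical agent and an environment with the execute() interface used
# throughout these snippets: observe() must follow each act(), and a terminal
# signal ends the episode.
states = environment.reset()
terminal = False
while not terminal:
    actions = agent.act(states=states)
    states, terminal, reward = environment.execute(actions=actions)
    agent.observe(terminal=terminal, reward=reward)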
def experience(self, states, actions, terminal, reward, internals=None):
    """
    Feed experience traces.

    See the [act-experience-update script](https://github.com/tensorforce/tensorforce/blob/master/examples/act_experience_update_interface.py)
    for an example application as part of the act-experience-update interface, which is an
    alternative to the act-observe interaction pattern.

    Args:
        states (dict[array[state]]): Dictionary containing arrays of states
            (<span style="color:#C00000"><b>required</b></span>).
        actions (dict[array[action]]): Dictionary containing arrays of actions
            (<span style="color:#C00000"><b>required</b></span>).
        terminal (array[bool]): Array of terminals
            (<span style="color:#C00000"><b>required</b></span>).
        reward (array[float]): Array of rewards
            (<span style="color:#C00000"><b>required</b></span>).
        internals (dict[state]): Dictionary containing arrays of internal agent states
            (<span style="color:#C00000"><b>required</b></span> if agent has internal states).
    """
    if not all(len(buffer) == 0 for buffer in self.terminal_buffer):
        raise TensorforceError(message="Calling agent.experience is not possible mid-episode.")

    # Process states input and infer batching structure
    states, batched, num_instances, is_iter_of_dicts = self._process_states_input(
        states=states, function_name='Agent.experience'
    )

    if is_iter_of_dicts:
        # Input structure iter[dict[input]]

        # Internals
        if internals is None:
            internals = ArrayDict(self.initial_internals())
            internals = internals.fmap(function=(
                lambda x: np.repeat(np.expand_dims(x, axis=0), repeats=num_instances, axis=0)
            ))
        elif not isinstance(internals, (tuple, list)):
            raise TensorforceError.type(
                name='Agent.experience', argument='internals', dtype=type(internals),
                hint='is not tuple/list'
            )
        else:
            internals = [ArrayDict(internal) for internal in internals]
            internals = internals[0].fmap(
                function=(lambda *xs: np.stack(xs, axis=0)), zip_values=internals[1:]
            )

        # Actions
        if isinstance(actions, np.ndarray):
            actions = ArrayDict(singleton=actions)
        elif not isinstance(actions, (tuple, list)):
            raise TensorforceError.type(
                name='Agent.experience', argument='actions', dtype=type(actions),
                hint='is not tuple/list'
            )
        elif not isinstance(actions[0], dict):
            actions = ArrayDict(singleton=np.asarray(actions))
        else:
            actions = [ArrayDict(action) for action in actions]
            actions = actions[0].fmap(
                function=(lambda *xs: np.stack(xs, axis=0)), zip_values=actions[1:]
            )

    else:
        # Input structure dict[iter[input]]

        # Internals
        if internals is None:
            internals = ArrayDict(self.initial_internals())
            internals = internals.fmap(function=(
                lambda x: np.tile(np.expand_dims(x, axis=0), reps=(num_instances,))
            ))
        elif not isinstance(internals, dict):
            raise TensorforceError.type(
                name='Agent.experience', argument='internals', dtype=type(internals),
                hint='is not dict'
            )
        else:
            internals = ArrayDict(internals)

        # Actions
        if not isinstance(actions, np.ndarray):
            actions = ArrayDict(singleton=actions)
        elif not isinstance(actions, dict):
            raise TensorforceError.type(
                name='Agent.experience', argument='actions', dtype=type(actions),
                hint='is not dict'
            )
        else:
            actions = ArrayDict(actions)

    # Expand inputs if not batched
    if not batched:
        internals = internals.fmap(function=(lambda x: np.expand_dims(x, axis=0)))
        actions = actions.fmap(function=(lambda x: np.expand_dims(x, axis=0)))
        terminal = np.asarray([terminal])
        reward = np.asarray([reward])
    else:
        terminal = np.asarray(terminal)
        reward = np.asarray(reward)

    # Check number of inputs
    for name, internal in internals.items():
        if internal.shape[0] != num_instances:
            raise TensorforceError.value(
                name='Agent.experience', argument='len(internals[{}])'.format(name),
                value=internal.shape[0], hint='!= len(states)'
            )
    for name, action in actions.items():
        if action.shape[0] != num_instances:
            raise TensorforceError.value(
                name='Agent.experience', argument='len(actions[{}])'.format(name),
                value=action.shape[0], hint='!= len(states)'
            )
    if terminal.shape[0] != num_instances:
        raise TensorforceError.value(
            name='Agent.experience', argument='len(terminal)', value=terminal.shape[0],
            hint='!= len(states)'
        )
    if reward.shape[0] != num_instances:
        raise TensorforceError.value(
            name='Agent.experience', argument='len(reward)', value=reward.shape[0],
            hint='!= len(states)'
        )

    def function(name, spec):
        auxiliary = ArrayDict()
        if self.config.enable_int_action_masking and spec.type == 'int' and \
                spec.num_values is not None:
            if name is None:
                name = 'action'
            # Mask, either part of states or default all true
            auxiliary['mask'] = states.pop(name + '_mask', np.ones(
                shape=(num_instances,) + spec.shape + (spec.num_values,), dtype=spec.np_type()
            ))
        return auxiliary

    auxiliaries = self.actions_spec.fmap(function=function, cls=ArrayDict, with_names=True)
    if self.states_spec.is_singleton() and not states.is_singleton():
        states[None] = states.pop('state')

    # Convert terminal to int if necessary
    if terminal.dtype is util.np_dtype(dtype='bool'):
        zeros = np.zeros_like(terminal, dtype=util.np_dtype(dtype='int'))
        ones = np.ones_like(terminal, dtype=util.np_dtype(dtype='int'))
        terminal = np.where(terminal, ones, zeros)

    if terminal[-1] == 0:
        raise TensorforceError(message="Agent.experience() requires full episodes as input.")

    # Batch experiences split into episodes and at most size buffer_observe
    last = 0
    for index in range(1, len(terminal) + 1):
        if terminal[index - 1] == 0:
            continue

        function = (lambda x: x[last:index])
        states_batch = states.fmap(function=function)
        internals_batch = internals.fmap(function=function)
        auxiliaries_batch = auxiliaries.fmap(function=function)
        actions_batch = actions.fmap(function=function)
        terminal_batch = function(terminal)
        reward_batch = function(reward)
        last = index

        # Inputs to tensors
        states_batch = self.states_spec.to_tensor(
            value=states_batch, batched=True, name='Agent.experience states'
        )
        internals_batch = self.internals_spec.to_tensor(
            value=internals_batch, batched=True, recover_empty=True,
            name='Agent.experience internals'
        )
        auxiliaries_batch = self.auxiliaries_spec.to_tensor(
            value=auxiliaries_batch, batched=True, name='Agent.experience auxiliaries'
        )
        actions_batch = self.actions_spec.to_tensor(
            value=actions_batch, batched=True, name='Agent.experience actions'
        )
        terminal_batch = self.terminal_spec.to_tensor(
            value=terminal_batch, batched=True, name='Agent.experience terminal'
        )
        reward_batch = self.reward_spec.to_tensor(
            value=reward_batch, batched=True, name='Agent.experience reward'
        )

        # Model.experience()
        timesteps, episodes = self.model.experience(
            states=states_batch, internals=internals_batch, auxiliaries=auxiliaries_batch,
            actions=actions_batch, terminal=terminal_batch, reward=reward_batch
        )
        self.timesteps = timesteps.numpy().item()
        self.episodes = episodes.numpy().item()

    if self.model.saver is not None:
        self.model.save()
def act(
    self, states, parallel=0, deterministic=False, independent=False, evaluation=False,
    query=None, **kwargs
):
    """
    Returns action(s) for the given state(s), needs to be followed by `observe(...)` unless
    `independent` is true.

    Args:
        states (dict[state]): Dictionary containing state(s) to be acted on
            (<span style="color:#C00000"><b>required</b></span>).
        parallel (int): Parallel execution index
            (<span style="color:#00C000"><b>default</b></span>: 0).
        deterministic (bool): Whether to act deterministically, so not to apply exploration
            and sampling (<span style="color:#00C000"><b>default</b></span>: false).
        independent (bool): Whether action is not remembered, and this call is thus not
            followed by observe (<span style="color:#00C000"><b>default</b></span>: false).
        evaluation (bool): Whether the agent is currently evaluated, implies and overwrites
            deterministic and independent
            (<span style="color:#00C000"><b>default</b></span>: false).
        query (list[str]): Names of tensors to retrieve
            (<span style="color:#00C000"><b>default</b></span>: none).
        kwargs: Additional input values, for instance, for dynamic hyperparameters.

    Returns:
        (dict[action], plus optional list[str]): Dictionary containing action(s), plus queried
        tensor values if requested.
    """
    assert util.reduce_all(predicate=util.not_nan_inf, xs=states)
    # self.current_internals = self.next_internals

    if evaluation:
        if deterministic or independent:
            raise TensorforceError.unexpected()
        deterministic = independent = True

    # Auxiliaries
    auxiliaries = OrderedDict()
    if isinstance(states, dict):
        states = dict(states)
        for name, spec in self.actions_spec.items():
            if spec['type'] == 'int' and name + '_mask' in states:
                auxiliaries[name + '_mask'] = states.pop(name + '_mask')

    # Normalize states dictionary
    states = util.normalize_values(
        value_type='state', values=states, values_spec=self.states_spec
    )

    # Batch states
    states = util.fmap(function=(lambda x: np.asarray([x])), xs=states, depth=1)
    auxiliaries = util.fmap(function=(lambda x: np.asarray([x])), xs=auxiliaries, depth=1)

    # Model.act()
    if query is None:
        actions, self.timesteps = self.model.act(
            states=states, auxiliaries=auxiliaries, parallel=parallel,
            deterministic=deterministic, independent=independent, **kwargs
        )
    else:
        actions, self.timesteps, queried = self.model.act(
            states=states, auxiliaries=auxiliaries, parallel=parallel,
            deterministic=deterministic, independent=independent, query=query, **kwargs
        )

    if self.recorder_spec is not None and not independent and \
            self.episodes >= self.recorder_spec.get('start', 0):
        index = self.buffer_indices[parallel]
        for name in self.states_spec:
            self.states_buffers[name][parallel, index] = states[name][0]
        for name, spec in self.actions_spec.items():
            self.actions_buffers[name][parallel, index] = actions[name][0]
            if spec['type'] == 'int':
                name = name + '_mask'
                if name in auxiliaries:
                    self.states_buffers[name][parallel, index] = auxiliaries[name][0]
                else:
                    shape = (1,) + spec['shape'] + (spec['num_values'],)
                    self.states_buffers[name][parallel, index] = np.full(
                        shape=shape, fill_value=True, dtype=util.np_dtype(dtype='bool')
                    )

    # Unbatch actions
    actions = util.fmap(function=(lambda x: x[0]), xs=actions, depth=1)

    # Reverse normalized actions dictionary
    actions = util.unpack_values(
        value_type='action', values=actions, values_spec=self.actions_spec
    )

    # if independent, return processed state as well?
    if query is None:
        return actions
    else:
        return actions, queried
def get_batch(self, batch_size, next_states=False):
    """
    Samples a batch of the specified size according to priority.

    Args:
        batch_size: The batch size
        next_states: A boolean flag indicating whether 'next_states' values should be included

    Returns:
        A dict containing states, actions, rewards, terminals, internal states
        (and next states)
    """
    if batch_size > len(self.observations):
        raise TensorForceError(
            "Requested batch size is larger than observations in memory: increase "
            "config.first_update."
        )

    # Init empty states
    states = {
        name: np.zeros((batch_size,) + tuple(state.shape), dtype=util.np_dtype(state.type))
        for name, state in self.states_config.items()
    }
    actions = {
        name: np.zeros(
            (batch_size,) + tuple(action.shape),
            dtype=util.np_dtype('float' if action.continuous else 'int')
        )
        for name, action in self.actions_config.items()
    }
    rewards = np.zeros((batch_size,), dtype=util.np_dtype('float'))
    terminals = np.zeros((batch_size,), dtype=util.np_dtype('bool'))
    internals = [
        np.zeros((batch_size,) + shape, dtype) for shape, dtype in self.internals_config
    ]
    if next_states:
        next_states = {
            name: np.zeros((batch_size,) + tuple(state.shape), dtype=util.np_dtype(state.type))
            for name, state in self.states_config.items()
        }
        next_internals = [
            np.zeros((batch_size,) + shape, dtype) for shape, dtype in self.internals_config
        ]

    # Start with unseen observations
    unseen_indices = list(xrange(
        self.none_priority_index + self.observations._capacity - 1,
        len(self.observations) + self.observations._capacity - 1
    ))
    self.batch_indices = unseen_indices[:batch_size]

    # Get remaining observations using weighted sampling
    remaining = batch_size - len(self.batch_indices)
    if remaining:
        samples = self.observations.sample_minibatch(remaining)
        sample_indices = [i for i, o in samples]
        self.batch_indices += sample_indices

    # Shuffle
    np.random.shuffle(self.batch_indices)

    # Collect observations
    for n, index in enumerate(self.batch_indices):
        observation, _ = self.observations._memory[index]

        for name, state in states.items():
            state[n] = observation[0][name]
        for name, action in actions.items():
            action[n] = observation[1][name]
        rewards[n] = observation[2]
        terminals[n] = observation[3]
        for k, internal in enumerate(internals):
            internal[n] = observation[4][k]
        if next_states:
            for name, next_state in next_states.items():
                next_state[n] = observation[5][name]
            for k, next_internal in enumerate(next_internals):
                next_internal[n] = observation[6][k]

    if next_states:
        return dict(
            states=states, actions=actions, rewards=rewards, terminals=terminals,
            internals=internals, next_states=next_states, next_internals=next_internals
        )
    else:
        return dict(
            states=states, actions=actions, rewards=rewards, terminals=terminals,
            internals=internals
        )
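# A standalone sketch (illustrative values, not from the source) of the
# batch-index policy above: take not-yet-sampled ("unseen") indices first,
# then fill the remainder by priority-weighted sampling, then shuffle.
import numpy as np

rng = np.random.default_rng(0)
unseen = [7, 8, 9]                           # indices never returned before
batch_size = 5
batch = unseen[:batch_size]
remaining = batch_size - len(batch)
if remaining:
    priorities = np.array([0.5, 0.1, 0.4])   # priorities of seen indices 0..2
    batch += list(rng.choice(3, size=remaining, p=priorities / priorities.sum()))
batch = np.array(batch)
rng.shuffle(batch)
print(batch)  # e.g. [9 0 7 2 8]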