def experience(self, states, actions, terminal, reward, internals=None):
    """
    Feed experience traces.

    See the [act-experience-update script](https://github.com/tensorforce/tensorforce/blob/master/examples/act_experience_update_interface.py)
    for an example application as part of the act-experience-update interface, which is an
    alternative to the act-observe interaction pattern.

    Args:
        states (dict[array[state]]): Dictionary containing arrays of states
            (<span style="color:#C00000"><b>required</b></span>).
        actions (dict[array[action]]): Dictionary containing arrays of actions
            (<span style="color:#C00000"><b>required</b></span>).
        terminal (array[bool]): Array of terminals
            (<span style="color:#C00000"><b>required</b></span>).
        reward (array[float]): Array of rewards
            (<span style="color:#C00000"><b>required</b></span>).
        internals (dict[state]): Dictionary containing arrays of internal agent states
            (<span style="color:#C00000"><b>required</b></span> if agent has internal states).

    Raises:
        TensorforceError: If called while any terminal buffer is non-empty (mid-episode), if
            an argument has the wrong container type, if the number of
            internals/actions/terminal/reward entries differs from the number of states, or
            if the last terminal entry is zero (incomplete episode).
    """
    if not all(len(buffer) == 0 for buffer in self.terminal_buffer):
        raise TensorforceError(
            message="Calling agent.experience is not possible mid-episode."
        )

    # Process states input and infer batching structure
    states, batched, num_instances, is_iter_of_dicts = self._process_states_input(
        states=states, function_name='Agent.experience')

    def default_internals():
        # Initial internals, repeated along a new leading axis of length num_instances.
        # np.repeat(axis=0) is used in both input structures: np.tile with a 1-element
        # reps would tile the *last* axis of non-scalar internals instead.
        initial = ArrayDict(self.initial_internals())
        return initial.fmap(function=(lambda x: np.repeat(
            np.expand_dims(x, axis=0), repeats=num_instances, axis=0)))

    if is_iter_of_dicts:
        # Input structure iter[dict[input]]

        # Internals
        if internals is None:
            internals = default_internals()
        elif not isinstance(internals, (tuple, list)):
            raise TensorforceError.type(
                name='Agent.experience', argument='internals', dtype=type(internals),
                hint='is not tuple/list')
        else:
            internals = [ArrayDict(internal) for internal in internals]
            internals = internals[0].fmap(
                function=(lambda *xs: np.stack(xs, axis=0)), zip_values=internals[1:])

        # Actions
        if isinstance(actions, np.ndarray):
            actions = ArrayDict(singleton=actions)
        elif not isinstance(actions, (tuple, list)):
            raise TensorforceError.type(
                name='Agent.experience', argument='actions', dtype=type(actions),
                hint='is not tuple/list')
        elif not isinstance(actions[0], dict):
            actions = ArrayDict(singleton=np.asarray(actions))
        else:
            actions = [ArrayDict(action) for action in actions]
            actions = actions[0].fmap(
                function=(lambda *xs: np.stack(xs, axis=0)), zip_values=actions[1:])

    else:
        # Input structure dict[iter[input]]

        # Internals
        if internals is None:
            internals = default_internals()
        elif not isinstance(internals, dict):
            raise TensorforceError.type(
                name='Agent.experience', argument='internals', dtype=type(internals),
                hint='is not dict')
        else:
            internals = ArrayDict(internals)

        # Actions: a bare array is the singleton action, otherwise a dict is required
        if isinstance(actions, np.ndarray):
            actions = ArrayDict(singleton=actions)
        elif not isinstance(actions, dict):
            raise TensorforceError.type(
                name='Agent.experience', argument='actions', dtype=type(actions),
                hint='is not dict')
        else:
            actions = ArrayDict(actions)

    # Expand inputs if not batched
    if not batched:
        # NOTE(review): default internals created above already carry a leading axis of
        # length num_instances -- confirm the extra expansion is intended for them.
        internals = internals.fmap(function=(lambda x: np.expand_dims(x, axis=0)))
        actions = actions.fmap(function=(lambda x: np.expand_dims(x, axis=0)))
        terminal = np.asarray([terminal])
        reward = np.asarray([reward])
    else:
        terminal = np.asarray(terminal)
        reward = np.asarray(reward)

    # Check number of inputs
    for name, internal in internals.items():
        if internal.shape[0] != num_instances:
            raise TensorforceError.value(
                name='Agent.experience', argument='len(internals[{}])'.format(name),
                value=internal.shape[0], hint='!= len(states)')
    for name, action in actions.items():
        if action.shape[0] != num_instances:
            raise TensorforceError.value(
                name='Agent.experience', argument='len(actions[{}])'.format(name),
                value=action.shape[0], hint='!= len(states)')
    if terminal.shape[0] != num_instances:
        raise TensorforceError.value(
            name='Agent.experience', argument='len(terminal)', value=terminal.shape[0],
            hint='!= len(states)')
    if reward.shape[0] != num_instances:
        raise TensorforceError.value(
            name='Agent.experience', argument='len(reward)', value=reward.shape[0],
            hint='!= len(states)')

    def function(name, spec):
        auxiliary = ArrayDict()
        if self.config.enable_int_action_masking and spec.type == 'int' and \
                spec.num_values is not None:
            if name is None:
                name = 'action'
            # Mask, either part of states or default all true
            auxiliary['mask'] = states.pop(name + '_mask', np.ones(
                shape=(num_instances,) + spec.shape + (spec.num_values,),
                dtype=spec.np_type()))
        return auxiliary

    auxiliaries = self.actions_spec.fmap(function=function, cls=ArrayDict, with_names=True)

    if self.states_spec.is_singleton() and not states.is_singleton():
        states[None] = states.pop('state')

    # Convert terminal to int if necessary (dtype equality, not identity, so that the
    # comparison also holds when util.np_dtype returns a scalar type such as np.bool_)
    if terminal.dtype == util.np_dtype(dtype='bool'):
        zeros = np.zeros_like(terminal, dtype=util.np_dtype(dtype='int'))
        ones = np.ones_like(terminal, dtype=util.np_dtype(dtype='int'))
        terminal = np.where(terminal, ones, zeros)

    if terminal[-1] == 0:
        raise TensorforceError(message="Agent.experience() requires full episodes as input.")

    # Feed experiences to the model one episode at a time: every non-zero terminal entry
    # closes the slice [last:index]
    last = 0
    for index in range(1, len(terminal) + 1):
        if terminal[index - 1] == 0:
            continue

        function = (lambda x: x[last:index])
        states_batch = states.fmap(function=function)
        internals_batch = internals.fmap(function=function)
        auxiliaries_batch = auxiliaries.fmap(function=function)
        actions_batch = actions.fmap(function=function)
        terminal_batch = function(terminal)
        reward_batch = function(reward)
        last = index

        # Inputs to tensors
        states_batch = self.states_spec.to_tensor(
            value=states_batch, batched=True, name='Agent.experience states')
        internals_batch = self.internals_spec.to_tensor(
            value=internals_batch, batched=True, recover_empty=True,
            name='Agent.experience internals')
        auxiliaries_batch = self.auxiliaries_spec.to_tensor(
            value=auxiliaries_batch, batched=True, name='Agent.experience auxiliaries')
        actions_batch = self.actions_spec.to_tensor(
            value=actions_batch, batched=True, name='Agent.experience actions')
        terminal_batch = self.terminal_spec.to_tensor(
            value=terminal_batch, batched=True, name='Agent.experience terminal')
        reward_batch = self.reward_spec.to_tensor(
            value=reward_batch, batched=True, name='Agent.experience reward')

        # Model.experience()
        timesteps, episodes = self.model.experience(
            states=states_batch, internals=internals_batch, auxiliaries=auxiliaries_batch,
            actions=actions_batch, terminal=terminal_batch, reward=reward_batch)
        self.timesteps = timesteps.numpy().item()
        self.episodes = episodes.numpy().item()

    if self.model.saver is not None:
        self.model.save()
def act(self, states, internals=None, parallel=0, independent=False, deterministic=True,
        **kwargs):
    """
    Returns action(s) for the given state(s).

    Args:
        states: State(s) to be acted on, normalized via `_process_states_input()`
            (<span style="color:#C00000"><b>required</b></span>).
        internals: Current internal agent state(s), as dict of arrays or tuple/list of
            dicts; only valid in independent mode, and required there if the agent has
            internal states.
        parallel: Parallel execution index/indices; must be left at 0 in independent mode
            (<span style="color:#00C000"><b>default</b></span>: 0). With batched states
            and the default value, one index per parallel interaction is used.
        independent (bool): Whether this call is independent of the act-observe
            interaction, i.e. skips the timestep-completed check and recording
            (<span style="color:#00C000"><b>default</b></span>: false).
        deterministic (bool): Forwarded unchanged to `fn_act()` for agents
            (<span style="color:#00C000"><b>default</b></span>: true).
        kwargs: Accepted but not used by this method.

    Returns:
        Action(s) in the same batching structure as the states input; in independent mode
        with `internals` given, a tuple `(actions, internals)`.

    Raises:
        TensorforceError: On invalid argument combinations, wrong container types,
            mismatching numbers of inputs, or if the previous timestep of a selected
            parallel index has not been completed.
    """
    # Independent and internals
    is_internals_none = (internals is None)
    if independent:
        if parallel != 0:
            raise TensorforceError.invalid(
                name='Agent.act', argument='parallel', condition='independent is true')
        if is_internals_none and len(self.internals_spec) > 0:
            raise TensorforceError.required(
                name='Agent.act', argument='internals', condition='independent is true')
    else:
        if not is_internals_none:
            raise TensorforceError.invalid(
                name='Agent.act', argument='internals', condition='independent is false')

    # Process states input and infer batching structure
    states, batched, num_parallel, is_iter_of_dicts = self._process_states_input(
        states=states, function_name='Agent.act')

    if independent:
        # Independent mode: handle internals argument
        if is_internals_none:
            # Default input internals=None
            pass

        elif is_iter_of_dicts or isinstance(internals, (tuple, list)):
            # Input structure iter[dict[internal]]
            if not isinstance(internals, (tuple, list)):
                raise TensorforceError.type(
                    name='Agent.act', argument='internals', dtype=type(internals),
                    hint='is not tuple/list')
            internals = [ArrayDict(internal) for internal in internals]
            internals = internals[0].fmap(
                function=(lambda *xs: np.stack(xs, axis=0)), zip_values=internals[1:])

        else:
            # Input structure dict[iter[internal]]
            if not isinstance(internals, dict):
                raise TensorforceError.type(
                    name='Agent.act', argument='internals', dtype=type(internals),
                    hint='is not dict')
            internals = ArrayDict(internals)

        if not is_internals_none:
            # Expand inputs if not batched
            if not batched:
                internals = internals.fmap(function=(lambda x: np.expand_dims(x, axis=0)))

            # Check number of inputs
            for name, internal in internals.items():
                if internal.shape[0] != num_parallel:
                    raise TensorforceError.value(
                        name='Agent.act', argument='len(internals[{}])'.format(name),
                        value=internal.shape[0], hint='!= len(states)')

    else:
        # Non-independent mode: handle parallel input
        if not batched:
            # Expand input if not batched
            parallel = np.asarray([parallel])

        elif np.ndim(parallel) == 0 and parallel == 0:
            # Default input parallel=0 with batched states: one index per parallel
            # interaction. (Checked before converting, since a scalar would otherwise
            # become a 0-d array and fail the length check below with an IndexError.)
            if num_parallel != self.parallel_interactions:
                raise TensorforceError.value(
                    name='Agent.act', argument='len(states)', value=num_parallel,
                    hint='!= parallel_interactions')
            parallel = np.asarray(list(range(num_parallel)))

        else:
            # Batched input
            parallel = np.asarray(parallel)

        # Check number of inputs
        if parallel.shape[0] != num_parallel:
            raise TensorforceError.value(
                name='Agent.act', argument='len(parallel)', value=len(parallel),
                hint='!= len(states)')

    # If not independent, check whether previous timesteps were completed
    if not independent:
        if not self.timestep_completed[parallel].all():
            raise TensorforceError(
                message="Calling agent.act must be preceded by agent.observe for training, or "
                        "agent.act argument 'independent' must be passed as True.")
        self.timestep_completed[parallel] = False

    # Buffer inputs for recording
    if self.recorder is not None and not independent and \
            self.num_episodes >= self.recorder.get('start', 0):
        for n in range(num_parallel):
            for name in self.states_spec:
                self.buffers['states'][name][parallel[n]].append(states[name][n])

    # fn_act()
    if self._is_agent:
        actions, internals = self.fn_act(
            states=states, internals=internals, parallel=parallel, independent=independent,
            deterministic=deterministic, is_internals_none=is_internals_none,
            num_parallel=num_parallel)
    else:
        if batched:
            # Batched input is not handled on this path
            assert False
        else:
            # fn_act is called with the unbatched states as keyword arguments
            states = states.fmap(function=(
                lambda x: x[0].item() if x.shape == (1,) else x[0]))
            actions = self.fn_act(states.to_kwargs())
            if self.actions_spec.is_singleton():
                actions = ArrayDict(singleton=np.asarray([actions]))
            else:
                actions = ArrayDict(actions)
                actions = actions.fmap(function=(lambda x: np.asarray([x])))

    # Buffer outputs for recording
    if self.recorder is not None and not independent and \
            self.num_episodes >= self.recorder.get('start', 0):
        for n in range(num_parallel):
            for name in self.actions_spec:
                self.buffers['actions'][name][parallel[n]].append(actions[name][n])

    # Unbatch actions
    if batched:
        # If inputs were batched, turn dict of lists into list of dicts
        function = (lambda x: x.item() if x.shape == () else x)

        # TODO: recursive
        if self.actions_spec.is_singleton():
            actions = actions.singleton()
            if is_iter_of_dicts:
                actions = [function(actions[n]) for n in range(num_parallel)]
        else:
            if is_iter_of_dicts:
                actions = [
                    OrderedDict(((name, function(x[n])) for name, x in actions.items()))
                    for n in range(num_parallel)
                ]
            else:
                actions = OrderedDict(actions.items())

        if independent and not is_internals_none:
            if is_iter_of_dicts:
                # TODO: recursive
                internals = [
                    OrderedDict(((name, function(x[n])) for name, x in internals.items()))
                    for n in range(num_parallel)
                ]
            else:
                internals = OrderedDict(internals.items())

    else:
        # If inputs were not batched, unbatch outputs
        function = (lambda x: x.item() if x.shape == (1,) else x[0])
        if self.actions_spec.is_singleton():
            actions = function(actions.singleton())
        else:
            actions = actions.fmap(function=function, cls=OrderedDict)
        if independent and not is_internals_none:
            internals = internals.fmap(function=function, cls=OrderedDict)

    if independent and not is_internals_none:
        return actions, internals
    else:
        return actions
def act(
    self, states, internals=None, parallel=0, independent=False,
    # Deprecated
    deterministic=None, evaluation=None
):
    """
    Returns action(s) for the given state(s), needs to be followed by `observe()` unless
    independent mode.

    Args:
        states (dict[state] | iter[dict[state]]): Dictionary containing state(s) to be acted
            on
            (<span style="color:#C00000"><b>required</b></span>).
        internals (dict[internal] | iter[dict[internal]]): Dictionary containing current
            internal agent state(s), either given by `initial_internals()` at the beginning
            of an episode or as return value of the preceding `act()` call
            (<span style="color:#C00000"><b>required</b></span> if independent mode and
            agent has internal states).
        parallel (int | iter[int]): Parallel execution index
            (<span style="color:#00C000"><b>default</b></span>: 0).
        independent (bool): Whether act is not part of the main agent-environment
            interaction, and this call is thus not followed by observe
            (<span style="color:#00C000"><b>default</b></span>: false).
        deterministic: Deprecated, any value other than None raises an error.
        evaluation: Deprecated, any value other than None raises an error.

    Returns:
        dict[action] | iter[dict[action]], dict[internal] | iter[dict[internal]] if
        `internals` argument given: Dictionary containing action(s), dictionary containing
        next internal agent state(s) if independent mode.

    Raises:
        TensorforceError: On deprecated or invalid argument combinations, wrong container
            types, mismatching numbers of inputs, or if the previous timestep of a selected
            parallel index has not been completed by `observe()`.
    """
    if deterministic is not None:
        raise TensorforceError.deprecated(
            name='Agent.act', argument='deterministic', replacement='independent'
        )
    if evaluation is not None:
        raise TensorforceError.deprecated(
            name='Agent.act', argument='evaluation', replacement='independent'
        )

    # Independent and internals
    # (bound unconditionally so that later checks never depend on short-circuiting)
    is_internals_none = (internals is None)
    if independent:
        if parallel != 0:
            raise TensorforceError.invalid(
                name='Agent.act', argument='parallel', condition='independent is true'
            )
        if is_internals_none and len(self.internals_spec) > 0:
            raise TensorforceError.required(
                name='Agent.act', argument='internals', condition='independent is true'
            )
    else:
        if not is_internals_none:
            raise TensorforceError.invalid(
                name='Agent.act', argument='internals', condition='independent is false'
            )

    # Process states input and infer batching structure
    states, batched, num_parallel, is_iter_of_dicts, input_type = self._process_states_input(
        states=states, function_name='Agent.act'
    )

    if independent:
        # Independent mode: handle internals argument
        if is_internals_none:
            # Default input internals=None
            pass

        elif is_iter_of_dicts:
            # Input structure iter[dict[internal]]
            if not isinstance(internals, (tuple, list)):
                raise TensorforceError.type(
                    name='Agent.act', argument='internals', dtype=type(internals),
                    hint='is not tuple/list'
                )
            internals = [ArrayDict(internal) for internal in internals]
            internals = internals[0].fmap(
                function=(lambda *xs: np.stack(xs, axis=0)), zip_values=internals[1:]
            )

        else:
            # Input structure dict[iter[internal]]
            if not isinstance(internals, dict):
                raise TensorforceError.type(
                    name='Agent.act', argument='internals', dtype=type(internals),
                    hint='is not dict'
                )
            internals = ArrayDict(internals)

        if not is_internals_none:
            # Expand inputs if not batched
            if not batched:
                internals = internals.fmap(function=(lambda x: np.expand_dims(x, axis=0)))

            # Check number of inputs
            for name, internal in internals.items():
                if internal.shape[0] != num_parallel:
                    raise TensorforceError.value(
                        name='Agent.act', argument='len(internals[{}])'.format(name),
                        value=internal.shape[0], hint='!= len(states)'
                    )

    else:
        # Non-independent mode: handle parallel input
        # Only a scalar 0 counts as the default; comparing an array-like with `== 0`
        # directly would be ambiguous (or misread array([0]) as the default).
        if np.ndim(parallel) == 0 and parallel == 0:
            # Default input parallel=0
            if batched:
                if num_parallel != self.parallel_interactions:
                    raise TensorforceError.value(
                        name='Agent.act', argument='len(states)', value=num_parallel,
                        hint='!= parallel_interactions'
                    )
                parallel = np.asarray(list(range(num_parallel)))
            else:
                parallel = np.asarray([parallel])

        elif batched:
            # Batched input
            parallel = np.asarray(parallel)

        else:
            # Expand input if not batched
            parallel = np.asarray([parallel])

        # Check number of inputs
        if parallel.shape[0] != num_parallel:
            raise TensorforceError.value(
                name='Agent.act', argument='len(parallel)', value=len(parallel),
                hint='!= len(states)'
            )

    def function(name, spec):
        auxiliary = ArrayDict()
        if self.config.enable_int_action_masking and spec.type == 'int' and \
                spec.num_values is not None:
            # Mask, either part of states or default all true
            auxiliary['mask'] = states.pop(name + '_mask', np.ones(
                shape=(num_parallel,) + spec.shape + (spec.num_values,), dtype=spec.np_type()
            ))
        return auxiliary

    auxiliaries = self.actions_spec.fmap(function=function, cls=ArrayDict, with_names=True)

    # If not independent, check whether previous timesteps were completed
    if not independent:
        if not self.timestep_completed[parallel].all():
            raise TensorforceError(
                message="Calling agent.act must be preceded by agent.observe."
            )
        self.timestep_completed[parallel] = False

    # Buffer inputs for recording
    if self.recorder_spec is not None and not independent and \
            self.episodes >= self.recorder_spec.get('start', 0):
        for n in range(num_parallel):
            for name in self.states_spec:
                self.buffers['states'][name][parallel[n]].append(states[name][n])
            for name in self.auxiliaries_spec:
                self.buffers['auxiliaries'][name][parallel[n]].append(auxiliaries[name][n])

    # Inputs to tensors
    states = self.states_spec.to_tensor(value=states, batched=True)
    if independent and not is_internals_none:
        internals = self.internals_spec.to_tensor(value=internals, batched=True)
    auxiliaries = self.auxiliaries_spec.to_tensor(value=auxiliaries, batched=True)
    parallel_tensor = self.parallel_spec.to_tensor(value=parallel, batched=True)

    # Model.act()
    if not independent:
        actions, timesteps = self.model.act(
            states=states, auxiliaries=auxiliaries, parallel=parallel_tensor
        )
        self.timesteps = timesteps.numpy().item()

    elif len(self.internals_spec) > 0:
        if len(self.auxiliaries_spec) > 0:
            actions_internals = self.model.independent_act(
                states=states, internals=internals, auxiliaries=auxiliaries
            )
        else:
            assert len(auxiliaries) == 0
            actions_internals = self.model.independent_act(states=states, internals=internals)
        actions_internals = TensorDict(actions_internals)
        actions = actions_internals['actions']
        internals = actions_internals['internals']

    else:
        if len(self.auxiliaries_spec) > 0:
            actions = self.model.independent_act(states=states, auxiliaries=auxiliaries)
        else:
            assert len(auxiliaries) == 0
            actions = self.model.independent_act(states=states)
        actions = TensorDict(actions)

    # Outputs from tensors
    actions = self.actions_spec.from_tensor(tensor=actions, batched=True)

    # Buffer outputs for recording
    if self.recorder_spec is not None and not independent and \
            self.episodes >= self.recorder_spec.get('start', 0):
        for n in range(num_parallel):
            for name in self.actions_spec:
                self.buffers['actions'][name][parallel[n]].append(actions[name][n])

    # Unbatch actions
    if batched:
        # If inputs were batched, turn dict of arrays into a sequence of per-instance
        # values/dicts, built with the container type reported for the states input
        function = (lambda x: x.item() if x.shape == () else x)
        if self.single_action:
            actions = input_type(function(actions['action'][n]) for n in range(num_parallel))
        else:
            # TODO: recursive
            actions = input_type(
                OrderedDict(((name, function(x[n])) for name, x in actions.items()))
                for n in range(num_parallel)
            )

        if independent and not is_internals_none and is_iter_of_dicts:
            # TODO: recursive
            internals = input_type(
                OrderedDict(((name, function(x[n])) for name, x in internals.items()))
                for n in range(num_parallel)
            )

    else:
        # If inputs were not batched, unbatch outputs
        function = (lambda x: x.item() if x.shape == (1,) else x[0])
        if self.single_action:
            actions = function(actions['action'])
        else:
            actions = actions.fmap(function=function, cls=OrderedDict)
        if independent and not is_internals_none:
            internals = internals.fmap(function=function, cls=OrderedDict)

    if self.model.saver is not None:
        self.model.save()

    if independent and not is_internals_none:
        return actions, internals
    else:
        return actions