Example No. 1
        def fn(query=None, **kwargs):
            # Build feed_dict, mapping input placeholder names to argument values
            feed_dict = dict()
            for key, arg in kwargs.items():
                if arg is None:
                    continue
                elif isinstance(arg, dict):
                    # Support single nesting (for states, internals, actions);
                    # distinct loop variables avoid shadowing the outer key/arg
                    for inner_key, inner_arg in arg.items():
                        feed_dict[util.join_scopes(self.name, inner_key) + '-input:0'] = inner_arg
                else:
                    feed_dict[util.join_scopes(self.name, key) + '-input:0'] = arg
            if not all(isinstance(x, str) and x.endswith('-input:0') for x in feed_dict):
                raise TensorforceError.unexpected()

            # Fetches value/tuple
            fetches = util.fmap(function=(lambda x: x.name), xs=results)
            if query is not None:
                # If additional tensors are to be fetched; `name` is captured
                # from the enclosing scope (the wrapped function's name)
                query = util.fmap(
                    function=(lambda x: util.join_scopes(name, x) + '-output:0'), xs=query
                )
                if util.is_iterable(x=fetches):
                    fetches = tuple(fetches) + (query,)
                else:
                    fetches = (fetches, query)
            if not util.reduce_all(
                predicate=(lambda x: isinstance(x, str) and x.endswith('-output:0')), xs=fetches
            ):
                raise TensorforceError.unexpected()

            # TensorFlow session call
            fetched = self.monitored_session.run(fetches=fetches, feed_dict=feed_dict)

            return fetched
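
This closure is Tensorforce-internal plumbing: it turns keyword arguments into a feed_dict keyed by '-input:0' placeholder names and runs the monitored session on '-output:0' fetches. A minimal calling sketch, with all state and tensor names purely illustrative assumptions:

import numpy as np

# Hypothetical call of the closure above; 'observation' and
# 'action-distribution' are assumed names, not actual Tensorforce identifiers.
fetched = fn(
    states=dict(observation=np.zeros(shape=(1, 4))),  # nested dict feeds one placeholder per entry
    query=['action-distribution'],                    # extra '-output:0' tensors to fetch
)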
Example No. 2
    def observe(self, reward, terminal=False, parallel=0, query=None, **kwargs):
        """
        Observes reward and whether a terminal state is reached; needs to be preceded by
        `act(...)`.

        Args:
            reward (float): Reward
                (<span style="color:#C00000"><b>required</b></span>).
            terminal (bool | 0 | 1 | 2): Whether a terminal state is reached or 2 if the
                episode was aborted (<span style="color:#00C000"><b>default</b></span>: false).
            parallel (int): Parallel execution index
                (<span style="color:#00C000"><b>default</b></span>: 0).
            query (list[str]): Names of tensors to retrieve
                (<span style="color:#00C000"><b>default</b></span>: none).
            kwargs: Additional input values, for instance, for dynamic hyperparameters.

        Returns:
            (bool, optional list[str]): Whether an update was performed, plus queried tensor values
            if requested.
        """
        assert util.reduce_all(predicate=util.not_nan_inf, xs=reward)

        if query is not None and self.parallel_interactions > 1:
            raise TensorforceError.unexpected()

        if isinstance(terminal, bool):
            terminal = int(terminal)

        # Update terminal/reward buffer
        index = self.buffer_indices[parallel]
        self.terminal_buffers[parallel, index] = terminal
        self.reward_buffers[parallel, index] = reward
        index += 1

        if self.max_episode_timesteps is not None and index > self.max_episode_timesteps:
            raise TensorforceError.unexpected()

        if terminal > 0 or index == self.buffer_observe or query is not None:
            terminal = self.terminal_buffers[parallel, :index]
            reward = self.reward_buffers[parallel, :index]

            if self.recorder_spec is not None and \
                    self.episodes >= self.recorder_spec.get('start', 0):
                for name in self.states_spec:
                    self.record_states[name].append(
                        np.array(self.states_buffers[name][parallel, :index])
                    )
                for name, spec in self.actions_spec.items():
                    self.record_actions[name].append(
                        np.array(self.actions_buffers[name][parallel, :index])
                    )
                    if spec['type'] == 'int':
                        self.record_states[name + '_mask'].append(
                            np.array(self.states_buffers[name + '_mask'][parallel, :index])
                        )
                self.record_terminal.append(np.array(terminal))
                self.record_reward.append(np.array(reward))

                if terminal[-1] > 0:
                    self.num_episodes += 1

                    if self.num_episodes == self.recorder_spec.get('frequency', 1):
                        directory = self.recorder_spec['directory']
                        if os.path.isdir(directory):
                            files = sorted(
                                f for f in os.listdir(directory)
                                if os.path.isfile(os.path.join(directory, f))
                                and f.startswith('trace-')
                            )
                        else:
                            os.makedirs(directory)
                            files = list()
                        max_traces = self.recorder_spec.get('max-traces')
                        if max_traces is not None and len(files) > max_traces - 1:
                            for filename in files[:-max_traces + 1]:
                                filename = os.path.join(directory, filename)
                                os.remove(filename)

                        filename = 'trace-{}-{}.npz'.format(
                            self.episodes, time.strftime('%Y%m%d-%H%M%S')
                        )
                        filename = os.path.join(directory, filename)
                        self.record_states = util.fmap(
                            function=np.concatenate, xs=self.record_states, depth=1
                        )
                        self.record_actions = util.fmap(
                            function=np.concatenate, xs=self.record_actions, depth=1
                        )
                        self.record_terminal = np.concatenate(self.record_terminal)
                        self.record_reward = np.concatenate(self.record_reward)
                        np.savez_compressed(
                            filename, **self.record_states, **self.record_actions,
                            terminal=self.record_terminal, reward=self.record_reward
                        )
                        self.record_states = util.fmap(
                            function=(lambda x: list()), xs=self.record_states, depth=1
                        )
                        self.record_actions = util.fmap(
                            function=(lambda x: list()), xs=self.record_actions, depth=1
                        )
                        self.record_terminal = list()
                        self.record_reward = list()
                        self.num_episodes = 0

            # Model.observe()
            if query is None:
                updated, self.episodes, self.updates = self.model.observe(
                    terminal=terminal, reward=reward, parallel=parallel, **kwargs
                )

            else:
                updated, self.episodes, self.updates, queried = self.model.observe(
                    terminal=terminal, reward=reward, parallel=parallel, query=query, **kwargs
                )

            # Reset buffer index
            self.buffer_indices[parallel] = 0

        else:
            # Increment buffer index
            self.buffer_indices[parallel] = index
            updated = False

        if query is None:
            return updated
        else:
            return updated, queried
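
A minimal usage sketch of `observe(...)` in the standard interaction loop, assuming a Tensorforce 0.5-style setup via `Agent.create`/`Environment.create` (agent type and hyperparameters are placeholders):

from tensorforce.agents import Agent
from tensorforce.environments import Environment

environment = Environment.create(environment='gym', level='CartPole-v1')
agent = Agent.create(agent='ppo', environment=environment, batch_size=10)

states = environment.reset()
terminal = False
while not terminal:
    actions = agent.act(states=states)
    states, terminal, reward = environment.execute(actions=actions)
    # May trigger an update once the buffer fills or the episode ends
    updated = agent.observe(terminal=terminal, reward=reward)

If a recorder is configured, the block above writes episodes as compressed .npz archives, which can be inspected directly (filename illustrative):

import numpy as np

trace = np.load('traces/trace-0-20200101-000000.npz')  # hypothetical path
print(trace.files)  # per-state/per-action arrays plus 'terminal' and 'reward'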
Example No. 3
    def act(
        self, states, parallel=0, deterministic=False, independent=False, evaluation=False,
        query=None, **kwargs
    ):
        """
        Returns action(s) for the given state(s); needs to be followed by `observe(...)` unless
        `independent` is true.

        Args:
            states (dict[state]): Dictionary containing state(s) to be acted on
                (<span style="color:#C00000"><b>required</b></span>).
            parallel (int): Parallel execution index
                (<span style="color:#00C000"><b>default</b></span>: 0).
            deterministic (bool): Whether to act deterministically, disabling exploration and
                sampling
                (<span style="color:#00C000"><b>default</b></span>: false).
            independent (bool): Whether the action is not remembered, so that this call is not
                followed by observe
                (<span style="color:#00C000"><b>default</b></span>: false).
            evaluation (bool): Whether the agent is currently evaluated, implies and overwrites
                deterministic and independent
                (<span style="color:#00C000"><b>default</b></span>: false).
            query (list[str]): Names of tensors to retrieve
                (<span style="color:#00C000"><b>default</b></span>: none).
            kwargs: Additional input values, for instance, for dynamic hyperparameters.

        Returns:
            (dict[action], plus optional list[str]): Dictionary containing action(s), plus queried
            tensor values if requested.
        """
        assert util.reduce_all(predicate=util.not_nan_inf, xs=states)

        # self.current_internals = self.next_internals
        if evaluation:
            if deterministic or independent:
                raise TensorforceError.unexpected()
            deterministic = independent = True

        # Auxiliaries
        auxiliaries = OrderedDict()
        if isinstance(states, dict):
            states = dict(states)
            for name, spec in self.actions_spec.items():
                if spec['type'] == 'int' and name + '_mask' in states:
                    auxiliaries[name + '_mask'] = states.pop(name + '_mask')

        # Normalize states dictionary
        states = util.normalize_values(
            value_type='state', values=states, values_spec=self.states_spec
        )

        # Batch states
        states = util.fmap(function=(lambda x: np.asarray([x])), xs=states, depth=1)
        auxiliaries = util.fmap(function=(lambda x: np.asarray([x])), xs=auxiliaries, depth=1)

        # Model.act()
        if query is None:
            actions, self.timesteps = self.model.act(
                states=states, auxiliaries=auxiliaries, parallel=parallel,
                deterministic=deterministic, independent=independent, **kwargs
            )

        else:
            actions, self.timesteps, queried = self.model.act(
                states=states, auxiliaries=auxiliaries, parallel=parallel,
                deterministic=deterministic, independent=independent, query=query, **kwargs
            )

        if self.recorder_spec is not None and not independent and \
                self.episodes >= self.recorder_spec.get('start', 0):
            index = self.buffer_indices[parallel]
            for name in self.states_spec:
                self.states_buffers[name][parallel, index] = states[name][0]
            for name, spec in self.actions_spec.items():
                self.actions_buffers[name][parallel, index] = actions[name][0]
                if spec['type'] == 'int':
                    name = name + '_mask'
                    if name in auxiliaries:
                        self.states_buffers[name][parallel, index] = auxiliaries[name][0]
                    else:
                        shape = (1,) + spec['shape'] + (spec['num_values'],)
                        self.states_buffers[name][parallel, index] = np.full(
                            shape=shape, fill_value=True, dtype=util.np_dtype(dtype='bool')
                        )

        # Unbatch actions
        actions = util.fmap(function=(lambda x: x[0]), xs=actions, depth=1)

        # Reverse normalized actions dictionary
        actions = util.unpack_values(
            value_type='action', values=actions, values_spec=self.actions_spec
        )

        # TODO: if independent, return processed state as well?

        if query is None:
            return actions
        else:
            return actions, queried
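
A minimal sketch of calling this `act(...)`, including an action mask passed via the states dictionary as handled in the "Auxiliaries" block above (state/action names and shapes are assumptions):

import numpy as np

# Assuming an int action named 'decision' with num_values=4 (hypothetical spec):
states = dict(
    observation=np.zeros(shape=(4,)),                   # regular state input
    decision_mask=np.array([True, True, False, True]),  # '<action>_mask' auxiliary
)
actions = agent.act(states=states)

# Evaluation implies (and enforces not pre-setting) deterministic and independent:
actions = agent.act(states=states, evaluation=True)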
Example No. 4
    def experience(
        self, states, actions, terminal, reward, internals=None, query=None, **kwargs
    ):
        """
        Feed experience traces.

        Args:
            states (dict[array[state]]): Dictionary containing arrays of states
                (<span style="color:#C00000"><b>required</b></span>).
            actions (dict[array[action]]): Dictionary containing arrays of actions
                (<span style="color:#C00000"><b>required</b></span>).
            terminal (array[bool]): Array of terminals
                (<span style="color:#C00000"><b>required</b></span>).
            reward (array[float]): Array of rewards
                (<span style="color:#C00000"><b>required</b></span>).
            internals (dict[state]): Dictionary containing arrays of internal agent states
                (<span style="color:#00C000"><b>default</b></span>: no internal states).
            query (list[str]): Names of tensors to retrieve
                (<span style="color:#00C000"><b>default</b></span>: none).
            kwargs: Additional input values, for instance, for dynamic hyperparameters.
        """
        assert (self.buffer_indices == 0).all()
        assert util.reduce_all(predicate=util.not_nan_inf, xs=states)
        assert internals is None or util.reduce_all(predicate=util.not_nan_inf, xs=internals)
        assert util.reduce_all(predicate=util.not_nan_inf, xs=actions)
        assert util.reduce_all(predicate=util.not_nan_inf, xs=reward)

        # Auxiliaries
        auxiliaries = OrderedDict()
        if isinstance(states, dict):
            for name, spec in self.actions_spec.items():
                if spec['type'] == 'int' and name + '_mask' in states:
                    auxiliaries[name + '_mask'] = np.asarray(states.pop(name + '_mask'))
        auxiliaries = util.fmap(function=np.asarray, xs=auxiliaries, depth=1)

        # Normalize states/actions dictionaries
        states = util.normalize_values(
            value_type='state', values=states, values_spec=self.states_spec
        )
        actions = util.normalize_values(
            value_type='action', values=actions, values_spec=self.actions_spec
        )
        if internals is None:
            internals = OrderedDict()

        if isinstance(terminal, (bool, int)):
            states = util.fmap(function=(lambda x: [x]), xs=states, depth=1)
            internals = util.fmap(function=(lambda x: [x]), xs=internals, depth=1)
            auxiliaries = util.fmap(function=(lambda x: [x]), xs=auxiliaries, depth=1)
            actions = util.fmap(function=(lambda x: [x]), xs=actions, depth=1)
            terminal = [terminal]
            reward = [reward]

        states = util.fmap(function=np.asarray, xs=states, depth=1)
        internals = util.fmap(function=np.asarray, xs=internals, depth=1)
        auxiliaries = util.fmap(function=np.asarray, xs=auxiliaries, depth=1)
        actions = util.fmap(function=np.asarray, xs=actions, depth=1)

        if isinstance(terminal, np.ndarray):
            if terminal.dtype == util.np_dtype(dtype='bool'):
                zeros = np.zeros_like(terminal, dtype=util.np_dtype(dtype='long'))
                ones = np.ones_like(terminal, dtype=util.np_dtype(dtype='long'))
                terminal = np.where(terminal, ones, zeros)
        else:
            terminal = np.asarray([int(x) if isinstance(x, bool) else x for x in terminal])
        reward = np.asarray(reward)

        # Split experiences into episodes and batches of at most experience_size timesteps
        last = 0
        for index in range(1, len(terminal) + 1):
            if terminal[index - 1] == 0 and index - last < self.experience_size:
                continue

            # If the following timestep is terminal and still fits, include it in this batch
            if index < len(terminal) and terminal[index - 1] == 0 and terminal[index] > 0 and \
                    index - last < self.experience_size:
                index += 1

            function = (lambda x: x[last: index])
            states_batch = util.fmap(function=function, xs=states, depth=1)
            internals_batch = util.fmap(function=function, xs=internals, depth=1)
            auxiliaries_batch = util.fmap(function=function, xs=auxiliaries, depth=1)
            actions_batch = util.fmap(function=function, xs=actions, depth=1)
            terminal_batch = terminal[last: index]
            reward_batch = reward[last: index]
            last = index

            # Model.experience()
            if query is None:
                self.timesteps, self.episodes, self.updates = self.model.experience(
                    states=states_batch, internals=internals_batch,
                    auxiliaries=auxiliaries_batch, actions=actions_batch, terminal=terminal_batch,
                    reward=reward_batch, **kwargs
                )

            else:
                self.timesteps, self.episodes, self.updates, queried = self.model.experience(
                    states=states_batch, internals=internals_batch,
                    auxiliaries=auxiliaries_batch, actions=actions_batch, terminal=terminal_batch,
                    reward=reward_batch, query=query, **kwargs
                )

        if query is not None:
            return queried
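
A minimal sketch of feeding one recorded episode through `experience(...)`; buffers must be empty (see the first assert), and the array contents here are illustrative:

import numpy as np

agent.experience(
    states=dict(observation=np.random.random_sample(size=(100, 4))),
    actions=dict(action=np.random.randint(4, size=(100,))),
    terminal=np.array([0] * 99 + [1]),  # single episode, terminal at the last step
    reward=np.random.random_sample(size=(100,)),
)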
Example No. 5
    def act(self,
            states,
            internals=None,
            parallel=0,
            independent=False,
            deterministic=False,
            evaluation=False,
            query=None,
            **kwargs):
        """
        Returns action(s) for the given state(s); needs to be followed by `observe(...)` unless
        independent mode is set via `independent`/`evaluation`.

        Args:
            states (dict[state] | iter[dict[state]]): Dictionary containing state(s) to be acted on
                (<span style="color:#C00000"><b>required</b></span>).
            internals (dict[internal] | iter[dict[internal]]): Dictionary containing current
                internal agent state(s)
                (<span style="color:#C00000"><b>required</b></span> if independent mode).
            parallel (int | iter[int]): Parallel execution index
                (<span style="color:#00C000"><b>default</b></span>: 0).
            independent (bool): Whether act is not part of the main agent-environment interaction,
                and this call is thus not followed by observe
                (<span style="color:#00C000"><b>default</b></span>: false).
            deterministic (bool): If independent mode, whether to act deterministically, disabling
                exploration and sampling
                (<span style="color:#00C000"><b>default</b></span>: false).
            evaluation (bool): Whether the agent is currently evaluated, implies independent and
                deterministic
                (<span style="color:#00C000"><b>default</b></span>: false).
            query (list[str]): Names of tensors to retrieve
                (<span style="color:#00C000"><b>default</b></span>: none).
            kwargs: Additional input values, for instance, for dynamic hyperparameters.

        Returns:
            dict[action] | iter[dict[action]], plus dict[internal] | iter[dict[internal]] if
            independent mode, plus optional list[str]: Dictionary containing action(s), dictionary
            containing next internal agent state(s) if independent mode, plus queried tensor
            values if requested.
        """
        assert util.reduce_all(predicate=util.not_nan_inf, xs=states)

        if evaluation:
            if deterministic:
                raise TensorforceError.invalid(name='agent.act',
                                               argument='deterministic',
                                               condition='evaluation = true')
            if independent:
                raise TensorforceError.invalid(name='agent.act',
                                               argument='independent',
                                               condition='evaluation = true')
            deterministic = independent = True

        if not independent:
            if internals is not None:
                raise TensorforceError.invalid(name='agent.act',
                                               argument='internals',
                                               condition='independent = false')
            if deterministic:
                raise TensorforceError.invalid(name='agent.act',
                                               argument='deterministic',
                                               condition='independent = false')

        if independent:
            internals_is_none = (internals is None)
            if internals_is_none:
                internals = OrderedDict()

        # Batch states
        batched = (not isinstance(parallel, int))
        if batched:
            if len(parallel) == 0:
                raise TensorforceError.value(name='agent.act',
                                             argument='parallel',
                                             value=parallel,
                                             hint='zero-length')
            parallel = np.asarray(list(parallel))
            if isinstance(states[0], dict):
                states = OrderedDict(
                    ((name,
                      np.asarray(
                          [states[n][name] for n in range(len(parallel))]))
                     for name in states[0]))
            else:
                states = np.asarray(states)
            if independent:
                internals = OrderedDict(
                    ((name,
                      np.asarray(
                          [internals[n][name] for n in range(len(parallel))]))
                     for name in internals[0]))
        else:
            parallel = np.asarray([parallel])
            states = util.fmap(function=(lambda x: np.asarray([x])),
                               xs=states,
                               depth=int(isinstance(states, dict)))
            if independent:
                internals = util.fmap(function=(lambda x: np.asarray([x])),
                                      xs=internals,
                                      depth=1)

        if not independent and not all(self.timestep_completed[n]
                                       for n in parallel):
            raise TensorforceError(
                message="Calling agent.act must be preceded by agent.observe.")

        # Auxiliaries
        auxiliaries = OrderedDict()
        if isinstance(states, dict):
            states = dict(states)
            for name, spec in self.actions_spec.items():
                if spec['type'] == 'int' and name + '_mask' in states:
                    auxiliaries[name + '_mask'] = states.pop(name + '_mask')

        # Normalize states dictionary
        states = util.normalize_values(value_type='state',
                                       values=states,
                                       values_spec=self.states_spec)

        # Model.act()
        if independent:
            if query is None:
                actions, internals = self.model.independent_act(
                    states=states,
                    internals=internals,
                    auxiliaries=auxiliaries,
                    parallel=parallel,
                    deterministic=deterministic,
                    **kwargs)

            else:
                actions, internals, queried = self.model.independent_act(
                    states=states,
                    internals=internals,
                    auxiliaries=auxiliaries,
                    parallel=parallel,
                    deterministic=deterministic,
                    query=query,
                    **kwargs)

        else:
            if query is None:
                actions, self.timesteps = self.model.act(
                    states=states,
                    auxiliaries=auxiliaries,
                    parallel=parallel,
                    **kwargs)

            else:
                actions, self.timesteps, queried = self.model.act(
                    states=states,
                    auxiliaries=auxiliaries,
                    parallel=parallel,
                    query=query,
                    **kwargs)

        if not independent:
            for n in parallel:
                self.timestep_completed[n] = False

        if self.recorder_spec is not None and not independent and \
                self.episodes >= self.recorder_spec.get('start', 0):
            for n in range(len(parallel)):
                index = self.buffer_indices[parallel[n]]
                for name in self.states_spec:
                    self.states_buffers[name][parallel[n],
                                              index] = states[name][n]
                for name, spec in self.actions_spec.items():
                    self.actions_buffers[name][parallel[n],
                                               index] = actions[name][n]
                    if spec['type'] == 'int':
                        name = name + '_mask'
                        if name in auxiliaries:
                            self.states_buffers[name][
                                parallel[n], index] = auxiliaries[name][n]
                        else:
                            shape = (1, ) + spec['shape'] + (
                                spec['num_values'], )
                            self.states_buffers[name][parallel[n],
                                                      index] = np.full(
                                                          shape=shape,
                                                          fill_value=True,
                                                          dtype=util.np_dtype(
                                                              dtype='bool'))

        # Reverse normalized actions dictionary
        actions = util.unpack_values(value_type='action',
                                     values=actions,
                                     values_spec=self.actions_spec)

        # Unbatch actions
        if batched:
            if isinstance(actions, dict):
                actions = [
                    OrderedDict(((name, actions[name][n]) for name in actions))
                    for n in range(len(parallel))
                ]
        else:
            actions = util.fmap(function=(lambda x: x[0]),
                                xs=actions,
                                depth=int(isinstance(actions, dict)))
            if independent:
                internals = util.fmap(function=(lambda x: x[0]),
                                      xs=internals,
                                      depth=1)

        if independent and not internals_is_none:
            if query is None:
                return actions, internals
            else:
                return actions, internals, queried

        else:
            if query is None:
                return actions
            else:
                return actions, queried
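
A minimal sketch of the two calling modes this newer `act(...)` supports, batched parallel acting and independent acting with explicit internals (assuming the agent exposes `initial_internals()` as in recent Tensorforce versions; states are placeholders):

# Batched: one state per entry of the parallel index list
actions = agent.act(states=[states_0, states_1], parallel=[0, 1])

# Independent: internals are passed in and returned explicitly
internals = agent.initial_internals()
actions, internals = agent.act(
    states=states, internals=internals, independent=True, deterministic=True
)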
Example No. 6
    def observe(self,
                reward,
                terminal=False,
                parallel=0,
                query=None,
                **kwargs):
        """
        Observes reward and whether a terminal state is reached; needs to be preceded by
        `act(...)`.

        Args:
            reward (float | iter[float]): Reward
                (<span style="color:#C00000"><b>required</b></span>).
            terminal (bool | 0 | 1 | 2 | iter[...]): Whether a terminal state is reached or 2 if
                the episode was aborted (<span style="color:#00C000"><b>default</b></span>: false).
            parallel (int | iter[int]): Parallel execution index
                (<span style="color:#00C000"><b>default</b></span>: 0).
            query (list[str]): Names of tensors to retrieve
                (<span style="color:#00C000"><b>default</b></span>: none).
            kwargs: Additional input values, for instance, for dynamic hyperparameters.

        Returns:
            (bool | int, optional list[str]): Whether an update was performed, plus queried tensor
            values if requested.
        """
        assert util.reduce_all(predicate=util.not_nan_inf, xs=reward)

        if query is not None and self.parallel_interactions > 1:
            raise TensorforceError.invalid(
                name='agent.observe',
                argument='query',
                condition='parallel_interactions > 1')

        batched = (not isinstance(parallel, int))
        if batched:
            if len(parallel) == 0:
                raise TensorforceError.value(name='agent.observe',
                                             argument='parallel',
                                             value=parallel,
                                             hint='zero-length')
            if query is not None:
                raise TensorforceError.invalid(name='agent.observe',
                                               argument='query',
                                               condition='len(parallel) > 1')
        else:
            terminal = [terminal]
            reward = [reward]
            parallel = [parallel]

        if any(self.timestep_completed[n] for n in parallel):
            raise TensorforceError(
                message="Calling agent.observe must be preceded by agent.act.")

        num_updates = 0
        # TODO: handle differently if not buffer_observe
        # Distinct loop variables avoid shadowing the terminal/reward/parallel iterables
        for terminal_value, reward_value, parallel_index in zip(terminal, reward, parallel):
            # Update terminal/reward buffer
            if isinstance(terminal_value, bool):
                terminal_value = int(terminal_value)
            index = self.buffer_indices[parallel_index]
            self.terminal_buffers[parallel_index, index] = terminal_value
            self.reward_buffers[parallel_index, index] = reward_value
            index += 1
            self.buffer_indices[parallel_index] = index

            if self.max_episode_timesteps is not None and index > self.max_episode_timesteps:
                raise TensorforceError.value(
                    name='agent.observe',
                    argument='index',
                    value=index,
                    condition='> max_episode_timesteps')

            if terminal_value > 0 or index == self.buffer_observe or query is not None:
                self.timestep_completed[parallel_index] = True
                if query is None:
                    updated = self.model_observe(parallel=parallel_index, **kwargs)
                else:
                    updated, queried = self.model_observe(parallel=parallel_index,
                                                          query=query,
                                                          **kwargs)

            else:
                # Buffer not yet full: mark timestep complete without a model call
                self.timestep_completed[parallel_index] = True
                updated = False

            num_updates += int(updated)

        if batched:
            updated = num_updates
        else:
            assert num_updates <= 1
            updated = (num_updates == 1)

        if query is None:
            return updated
        else:
            return updated, queried
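
A minimal sketch of a batched observe following a batched act; in batched mode the return value is the number of updates performed rather than a bool (values are placeholders):

actions = agent.act(states=[states_0, states_1], parallel=[0, 1])
# ... step both environments ...
num_updates = agent.observe(
    terminal=[terminal_0, terminal_1], reward=[reward_0, reward_1], parallel=[0, 1]
)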