Example #1
    def _process_states_input(self, states, function_name):
        if self.states_spec.is_singleton() and not isinstance(
                states, dict) and not (util.is_iterable(x=states)
                                       and isinstance(states[0], dict)):
            # Single state
            input_type = type(states)
            states = np.asarray(states)

            if states.shape == self.states_spec.value().shape:
                # Single state is not batched
                states = ArrayDict(singleton=np.expand_dims(states, axis=0))
                batched = False
                num_instances = 1
                is_iter_of_dicts = None
                input_type = None

            else:
                # Single state is batched, iter[state]
                assert states.shape[1:] == self.states_spec.value().shape
                assert input_type in (tuple, list, np.ndarray)
                num_instances = states.shape[0]
                states = ArrayDict(singleton=states)
                batched = True
                is_iter_of_dicts = True  # Default

        elif util.is_iterable(x=states):
            # States is batched, iter[dict[state]]
            batched = True
            num_instances = len(states)
            is_iter_of_dicts = True
            input_type = type(states)
            assert input_type in (tuple, list)
            if num_instances == 0:
                raise TensorforceError.value(name=function_name,
                                             argument='len(states)',
                                             value=num_instances,
                                             hint='= 0')
            for n, state in enumerate(states):
                if not isinstance(state, dict):
                    raise TensorforceError.type(
                        name=function_name,
                        argument='states[{}]'.format(n),
                        dtype=type(state),
                        hint='is not dict')
            # Turn iter of dicts into dict of arrays
            # (Doesn't use self.states_spec since states also contains auxiliaries)
            states = [ArrayDict(state) for state in states]
            states = states[0].fmap(
                function=(lambda *xs: np.stack(xs, axis=0)),
                zip_values=states[1:])

        elif isinstance(states, dict):
            # States is dict, turn into arrays
            some_state = next(iter(states.values()))
            input_type = type(some_state)

            states = ArrayDict(states)

            name, spec = self.states_spec.item()
            if name is None:
                name = 'state'

            if states[name].shape == spec.shape:
                # States is not batched, dict[state]
                states = states.fmap(
                    function=(lambda state: np.expand_dims(state, axis=0)))
                batched = False
                num_instances = 1
                is_iter_of_dicts = None
                input_type = None

            else:
                # States is batched, dict[iter[state]]
                assert states[name].shape[1:] == spec.shape
                assert input_type in (tuple, list, np.ndarray)
                batched = True
                num_instances = states[name].shape[0]
                is_iter_of_dicts = False
                if num_instances == 0:
                    raise TensorforceError.value(name=function_name,
                                                 argument='len(states)',
                                                 value=num_instances,
                                                 hint='= 0')

        else:
            raise TensorforceError.type(name=function_name,
                                        argument='states',
                                        dtype=type(states),
                                        hint='is not array/tuple/list/dict')

        # Check number of inputs
        if any(state.shape[0] != num_instances for state in states.values()):
            raise TensorforceError.value(
                name=function_name,
                argument='len(states)',
                value=[state.shape[0] for state in states.values()],
                hint='inconsistent')

        return states, batched, num_instances, is_iter_of_dicts, input_type
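
A minimal, self-contained sketch (using plain NumPy instead of Tensorforce's ArrayDict/fmap helpers; the state names are made up) of the iter[dict[state]] -> dict[array] stacking that the batched-iterable branch above performs:

import numpy as np

# Batched input given as a list of per-instance state dictionaries
states = [
    dict(position=np.zeros(3), velocity=np.ones(2)),
    dict(position=np.ones(3), velocity=np.zeros(2)),
]
# Stack each named state along a new leading batch axis
stacked = {
    name: np.stack([state[name] for state in states], axis=0)
    for name in states[0]
}
assert stacked['position'].shape == (2, 3)
assert stacked['velocity'].shape == (2, 2)
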
Example #2
    def get_module_class_and_args(*,
                                  name,
                                  module=None,
                                  modules=None,
                                  default_module=None,
                                  disable_first_arg=False,
                                  **kwargs):
        # name
        if not isinstance(name, str):
            raise TensorforceError.type(name='Module.add_module',
                                        argument='name',
                                        dtype=type(name))
        # modules
        if modules is not None and not isinstance(modules, dict):
            raise TensorforceError.type(name='Module.add_module',
                                        argument='modules',
                                        dtype=type(modules))

        # default_module
        if default_module is not None and default_module not in modules and \
                not issubclass(default_module, Module):
            raise TensorforceError.value(name='Module.add_module',
                                         argument='default_module',
                                         value=default_module)

        # disable_first_arg
        if not isinstance(disable_first_arg, bool):
            raise TensorforceError.type(name='Module.add_module',
                                        argument='disable_first_arg',
                                        dtype=type(disable_first_arg))

        # module
        if isinstance(module, dict):
            # Dictionary module specification (type either given via 'type' or 'default_module')
            util.deep_disjoint_update(target=kwargs, source=module)
            module = kwargs.pop('type', default_module)
            return Module.get_module_class_and_args(
                name=name,
                module=module,
                modules=modules,
                default_module=default_module,
                disable_first_arg=True,
                **kwargs)

        elif isinstance(module, str):
            if os.path.isfile(module):
                # JSON file module specification
                with open(module, 'r') as fp:
                    module = json.load(fp=fp)
                return Module.get_module_class_and_args(
                    name=name,
                    module=module,
                    modules=modules,
                    default_module=default_module,
                    disable_first_arg=True,
                    **kwargs)

            elif '.' in module:
                # Library module specification
                library_name, module_name = module.rsplit('.', 1)
                library = importlib.import_module(name=library_name)
                module = getattr(library, module_name)
                return Module.get_module_class_and_args(
                    name=name,
                    module=module,
                    modules=modules,
                    default_module=default_module,
                    disable_first_arg=True,
                    **kwargs)

            elif modules is not None and module in modules:
                # Keyword module specification
                return Module.get_module_class_and_args(
                    name=name,
                    module=modules[module],
                    modules=modules,
                    default_module=default_module,
                    disable_first_arg=True,
                    **kwargs)

            elif 'default' in modules or default_module is not None:
                # Default module specification
                if '_first_arg' in kwargs:
                    raise TensorforceError.invalid(name='Module.add_module',
                                                   argument='_first_arg')
                if module is not None:
                    if disable_first_arg:
                        raise TensorforceError.value(name='Module.add_module',
                                                     argument='module',
                                                     value=module)
                    kwargs['_first_arg'] = module
                if default_module is None:
                    default_module = modules['default']
                return Module.get_module_class_and_args(name=name,
                                                        module=default_module,
                                                        modules=modules,
                                                        **kwargs)

            else:
                raise TensorforceError.value(name='Module.add_module',
                                             argument='module',
                                             value=module)

        elif not callable(module) and ('default' in modules
                                       or default_module is not None):
            # Default module specification
            if '_first_arg' in kwargs:
                raise TensorforceError.invalid(name='Module.add_module',
                                               argument='_first_arg')
            if module is not None:
                kwargs['_first_arg'] = module
            if default_module is None:
                default_module = modules['default']
            return Module.get_module_class_and_args(name=name,
                                                    module=default_module,
                                                    modules=modules,
                                                    **kwargs)

        elif callable(module):
            if '_first_arg' in kwargs:
                args = (kwargs.pop('_first_arg'), )
            else:
                args = ()
            kwargs['name'] = name
            return module, args, kwargs

        else:
            raise TensorforceError.value(name='Module.add_module',
                                         argument='module',
                                         value=module)
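
A simplified, standalone sketch (not the Tensorforce API; the resolve function, the Dense class and the registry are hypothetical) of the resolution order implemented above: dict specs merge their kwargs and recurse on the 'type' key, string specs are looked up in the registry, and callables terminate the recursion:

def resolve(module, modules, default=None, **kwargs):
    if isinstance(module, dict):
        # Dict spec: merge into kwargs and recurse on the 'type' entry (or the default)
        kwargs.update(module)
        return resolve(kwargs.pop('type', default), modules, default, **kwargs)
    elif isinstance(module, str):
        # Keyword spec: look up the registered class and recurse
        return resolve(modules[module], modules, default, **kwargs)
    elif callable(module):
        # Class or callable: resolution is finished
        return module, kwargs
    raise ValueError('invalid module specification: {}'.format(module))

class Dense:  # hypothetical layer class standing in for a registered module
    def __init__(self, size):
        self.size = size

registry = dict(dense=Dense)
cls, kwargs = resolve(dict(type='dense', size=64), modules=registry)
layer = cls(**kwargs)
assert isinstance(layer, Dense) and layer.size == 64
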
Example #3
    def summary(self, *, label, name, data, step):
        # label
        if not isinstance(label, str):
            raise TensorforceError.type(name='Module.summary',
                                        argument='label',
                                        dtype=type(label))
        # name
        if not isinstance(name, (str, tuple, list)):
            raise TensorforceError.type(name='Module.summary',
                                        argument='name',
                                        dtype=type(name))
        if isinstance(name, str):
            names = None
        else:
            names = name
            name = name[0]
        # data
        if not tf_util.is_tensor(x=data) and not callable(data):
            raise TensorforceError.type(name='Module.summary',
                                        argument='data',
                                        dtype=type(data))
        # step
        if step not in self.root.units:
            raise TensorforceError.value(name='Module.summary',
                                         argument='step',
                                         value=step)

        if self.root.summaries == 'all' or label in self.root.summaries:
            if name not in self.summary_steps:
                raise TensorforceError.value(name='Module.summary',
                                             argument='name',
                                             value=name,
                                             hint='is not registered')

            unit = self.root.units[step]

            def fn_summary():
                if callable(data):
                    value = data()
                else:
                    value = data
                dependencies = list()
                with self.root.summarizer.as_default():
                    if names is None:
                        dependencies.append(
                            tf.summary.scalar(name=name, data=value,
                                              step=unit))
                    else:
                        for n, x in zip(names, value):
                            dependencies.append(
                                tf.summary.scalar(name=n, data=x, step=unit))
                previous = self.summary_steps[name]
                dependencies.append(
                    previous.assign(value=unit, read_value=False))
                return tf.group(*dependencies)

            pred = unit > self.summary_steps[name]
            return [tf.cond(pred=pred, true_fn=fn_summary, false_fn=tf.no_op)]

        else:
            return list()
Example #4
    def receive_execute(self):
        if self._expect_receive == 'reset':
            self._expect_receive = None
            if self._num_parallel is None:
                states = self.reset()
            else:
                parallel, states = self.reset(num_parallel=self._num_parallel)
            if self._reset_output_check:
                self._check_states_output(states=states, function='reset')
                if self._num_parallel is not None:
                    TensorSpec(type='int',
                               shape=(),
                               num_values=self._num_parallel).np_assert(
                                   x=parallel,
                                   batched=True,
                                   message=('Environment.reset: invalid '
                                            '{issue} for parallel.'))
                self._reset_output_check = False
            if self._num_parallel is None:
                return states, -1, None
            else:
                return parallel, states, -1, None

        elif self._expect_receive == 'execute':
            self._expect_receive = None
            assert self._actions is not None
            if self._num_parallel is None:
                states, terminal, reward = self.execute(actions=self._actions)
            else:
                parallel, states, terminal, reward = self.execute(
                    actions=self._actions)
            if self._execute_output_check:
                self._check_states_output(states=states, function='execute')
                if self._num_parallel is None:
                    if isinstance(reward, (np.generic, np.ndarray)):
                        reward = reward.item()
                    if isinstance(terminal, (np.generic, np.ndarray)):
                        terminal = terminal.item()
                    if not isinstance(terminal, bool) and \
                            (not isinstance(terminal, int) or terminal < 0 or terminal > 2):
                        raise TensorforceError(
                            'Environment.execute: invalid value {} for terminal.'
                            .format(terminal))
                    if not isinstance(reward, (float, int)):
                        raise TensorforceError(
                            'Environment.execute: invalid type {} for reward.'.
                            format(type(reward)))
                else:
                    TensorSpec(type='int',
                               shape=(),
                               num_values=self._num_parallel).np_assert(
                                   x=parallel,
                                   batched=True,
                                   message=('Environment.execute: invalid '
                                            '{issue} for parallel.'))
                    TensorSpec(type='bool', shape=()).np_assert(
                        x=terminal,
                        batched=True,
                        message='Environment.execute: invalid {issue} for terminal.')
                    TensorSpec(type='float', shape=()).np_assert(
                        x=reward,
                        batched=True,
                        message='Environment.execute: invalid {issue} for reward.')
                self._execute_output_check = False
            self._actions = None
            if self._num_parallel is None:
                return states, int(terminal), reward
            else:
                return parallel, states, terminal, reward

        else:
            raise TensorforceError.unexpected()
Example #5
    def remote(cls,
               connection,
               environment,
               max_episode_timesteps=None,
               reward_shaping=None,
               **kwargs):
        try:
            env = None
            env = Environment.create(
                environment=environment,
                max_episode_timesteps=max_episode_timesteps,
                reward_shaping=reward_shaping,
                **kwargs)

            while True:
                attribute, kwargs = cls.remote_receive(connection=connection)

                if attribute in ('reset', 'execute'):
                    environment_start = time.time()

                try:
                    result = getattr(env, attribute)
                    if callable(result):
                        if kwargs is None:
                            result = None
                        else:
                            result = result(**kwargs)
                    elif kwargs is None:
                        pass
                    elif len(kwargs) == 1 and 'value' in kwargs:
                        setattr(env, attribute, kwargs['value'])
                        result = None
                    else:
                        raise TensorforceError(
                            message="Invalid remote attribute/function access."
                        )
                except AttributeError:
                    if kwargs is None or len(
                            kwargs) != 1 or 'value' not in kwargs:
                        raise TensorforceError(
                            message="Invalid remote attribute/function access."
                        )
                    setattr(env, attribute, kwargs['value'])
                    result = None

                if attribute in ('reset', 'execute'):
                    seconds = time.time() - environment_start
                    if attribute == 'reset':
                        result = (result, seconds)
                    else:
                        result += (seconds, )

                cls.remote_send(connection=connection,
                                success=True,
                                result=result)

                if attribute == 'close':
                    break

        except BaseException:
            etype, value, traceback = sys.exc_info()
            cls.remote_send(connection=connection,
                            success=False,
                            result=(str(etype), str(value),
                                    format_tb(traceback)))

            try:
                if env is not None:
                    env.close()
            except BaseException:
                pass
            finally:
                etype, value, traceback = sys.exc_info()
                cls.remote_send(connection=connection,
                                success=False,
                                result=(str(etype), str(value),
                                        format_tb(traceback)))

        finally:
            cls.remote_close(connection=connection)
Example #6
    def __init__(
            # Required
            self,
            states,
            actions,
            memory,
            batch_size,
            # Environment
            max_episode_timesteps=None,
            # Network
            network='auto',
            # Optimization
            update_frequency='batch_size',
            start_updating=None,
            learning_rate=1e-3,
            huber_loss=0.0,
            # Reward estimation
            horizon=1,
            discount=0.99,
            predict_terminal_values=False,
            # Target network
            target_sync_frequency=1,
            target_update_weight=1.0,
            # Preprocessing
            preprocessing='linear_normalization',
            # Exploration
            exploration=0.0,
            variable_noise=0.0,
            # Regularization
            l2_regularization=0.0,
            entropy_regularization=0.0,
            # Parallel interactions
            parallel_interactions=1,
            # Config, saver, summarizer, recorder
            config=None,
            saver=None,
            summarizer=None,
            recorder=None,
            # Deprecated
            estimate_terminal=None,
            **kwargs):
        if estimate_terminal is not None:
            raise TensorforceError.deprecated(
                name='DuelingDQN',
                argument='estimate_terminal',
                replacement='predict_terminal_values')

        self.spec = OrderedDict(
            agent='dueling_dqn',
            states=states,
            actions=actions,
            memory=memory,
            batch_size=batch_size,
            max_episode_timesteps=max_episode_timesteps,
            network=network,
            update_frequency=update_frequency,
            start_updating=start_updating,
            learning_rate=learning_rate,
            huber_loss=huber_loss,
            horizon=horizon,
            discount=discount,
            predict_terminal_values=predict_terminal_values,
            target_sync_frequency=target_sync_frequency,
            target_update_weight=target_update_weight,
            preprocessing=preprocessing,
            exploration=exploration,
            variable_noise=variable_noise,
            l2_regularization=l2_regularization,
            entropy_regularization=entropy_regularization,
            parallel_interactions=parallel_interactions,
            config=config,
            saver=saver,
            summarizer=summarizer,
            recorder=recorder)

        distributions = dict(
            int=dict(type='categorical', advantage_based=True))
        policy = dict(network=network,
                      distributions=distributions,
                      temperature=0.0)

        memory = dict(type='replay', capacity=memory)

        update = dict(unit='timesteps', batch_size=batch_size)
        if update_frequency != 'batch_size':
            update['frequency'] = update_frequency
        if start_updating is not None:
            update['start'] = start_updating

        optimizer = dict(type='adam', learning_rate=learning_rate)
        objective = dict(type='value', value='action', huber_loss=huber_loss)

        reward_estimation = dict(
            horizon=horizon,
            discount=discount,
            predict_horizon_values='late',
            estimate_advantage=False,
            predict_action_values=True,
            predict_terminal_values=predict_terminal_values)

        baseline_policy = policy
        baseline_optimizer = dict(type='synchronization',
                                  sync_frequency=target_sync_frequency,
                                  update_weight=target_update_weight)
        baseline_objective = None

        super().__init__(
            # Agent
            states=states,
            actions=actions,
            max_episode_timesteps=max_episode_timesteps,
            parallel_interactions=parallel_interactions,
            config=config,
            recorder=recorder,
            # Model
            preprocessing=preprocessing,
            exploration=exploration,
            variable_noise=variable_noise,
            l2_regularization=l2_regularization,
            saver=saver,
            summarizer=summarizer,
            # TensorforceModel
            policy=policy,
            memory=memory,
            update=update,
            optimizer=optimizer,
            objective=objective,
            reward_estimation=reward_estimation,
            baseline_policy=baseline_policy,
            baseline_optimizer=baseline_optimizer,
            baseline_objective=baseline_objective,
            entropy_regularization=entropy_regularization,
            **kwargs)

        if any(spec['type'] != 'int' for spec in self.actions_spec.values()):
            raise TensorforceError.value(name='DuelingDQN',
                                         argument='actions',
                                         value=actions,
                                         hint='contains non-int action')
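
A hedged usage sketch for the constructor above, going through Agent.create; the state/action specs are illustrative placeholders and the hyperparameters are arbitrary:

from tensorforce import Agent

agent = Agent.create(
    agent='dueling_dqn',
    states=dict(type='float', shape=(8,)),              # example state spec
    actions=dict(type='int', shape=(), num_values=4),   # DuelingDQN requires int actions
    max_episode_timesteps=500,
    memory=10000,                                        # replay memory capacity
    batch_size=32,                                       # timesteps per update
)
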
Example #7
    def start_reset(self, num_parallel=None):
        if self._expect_receive is not None:
            raise TensorforceError.unexpected()
        self._expect_receive = 'reset'
        assert num_parallel is None or self.is_vectorizable()
        self._num_parallel = num_parallel
Example #8
    def act(
        self, states, parallel=0, deterministic=False, independent=False, evaluation=False,
        query=None, **kwargs
    ):
        """
        Returns action(s) for the given state(s), needs to be followed by `observe(...)` unless
        `independent` is true.

        Args:
            states (dict[state]): Dictionary containing state(s) to be acted on
                (<span style="color:#C00000"><b>required</b></span>).
            parallel (int): Parallel execution index
                (<span style="color:#00C000"><b>default</b></span>: 0).
            deterministic (bool): Whether to act deterministically, i.e. without exploration
                and sampling
                (<span style="color:#00C000"><b>default</b></span>: false).
            independent (bool): Whether action is not remembered, and this call is thus not
                followed by observe
                (<span style="color:#00C000"><b>default</b></span>: false).
            evaluation (bool): Whether the agent is currently evaluated, implies and overwrites
                deterministic and independent
                (<span style="color:#00C000"><b>default</b></span>: false).
            query (list[str]): Names of tensors to retrieve
                (<span style="color:#00C000"><b>default</b></span>: none).
            kwargs: Additional input values, for instance, for dynamic hyperparameters.

        Returns:
            (dict[action], plus optional list[str]): Dictionary containing action(s), plus queried
            tensor values if requested.
        """
        assert util.reduce_all(predicate=util.not_nan_inf, xs=states)

        # self.current_internals = self.next_internals
        if evaluation:
            if deterministic or independent:
                raise TensorforceError.unexpected()
            deterministic = independent = True

        # Auxiliaries
        auxiliaries = OrderedDict()
        if isinstance(states, dict):
            states = dict(states)
            for name, spec in self.actions_spec.items():
                if spec['type'] == 'int' and name + '_mask' in states:
                    auxiliaries[name + '_mask'] = states.pop(name + '_mask')

        # Normalize states dictionary
        states = util.normalize_values(
            value_type='state', values=states, values_spec=self.states_spec
        )

        # Batch states
        states = util.fmap(function=(lambda x: np.asarray([x])), xs=states, depth=1)
        auxiliaries = util.fmap(function=(lambda x: np.asarray([x])), xs=auxiliaries, depth=1)

        # Model.act()
        if query is None:
            actions, self.timesteps = self.model.act(
                states=states, auxiliaries=auxiliaries, parallel=[parallel],
                deterministic=deterministic, independent=independent, **kwargs
            )

        else:
            actions, self.timesteps, queried = self.model.act(
                states=states, auxiliaries=auxiliaries, parallel=[parallel],
                deterministic=deterministic, independent=independent, query=query, **kwargs
            )

        if self.recorder_spec is not None and not independent and \
                self.episodes >= self.recorder_spec.get('start', 0):
            index = self.buffer_indices[parallel]
            for name in self.states_spec:
                self.states_buffers[name][parallel, index] = states[name][0]
            for name, spec in self.actions_spec.items():
                self.actions_buffers[name][parallel, index] = actions[name][0]
                if spec['type'] == 'int':
                    name = name + '_mask'
                    if name in auxiliaries:
                        self.states_buffers[name][parallel, index] = auxiliaries[name][0]
                    else:
                        shape = (1,) + spec['shape'] + (spec['num_values'],)
                        self.states_buffers[name][parallel, index] = np.full(
                            shape=shape, fill_value=True, dtype=util.np_dtype(dtype='bool')
                        )

        # Unbatch actions
        actions = util.fmap(function=(lambda x: x[0]), xs=actions, depth=1)

        # Reverse normalized actions dictionary
        actions = util.unpack_values(
            value_type='action', values=actions, values_spec=self.actions_spec
        )

        # if independent, return processed state as well?

        if query is None:
            return actions
        else:
            return actions, queried
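
A hedged sketch of the act/observe cycle the docstring describes. The Gym CartPole environment and the PPO hyperparameters are illustrative assumptions (the gym package must be installed); any created agent/environment pair follows the same loop:

from tensorforce import Agent, Environment

environment = Environment.create(
    environment='gym', level='CartPole-v1', max_episode_timesteps=200)
agent = Agent.create(agent='ppo', environment=environment, batch_size=10)

states = environment.reset()
terminal = False
while not terminal:
    actions = agent.act(states=states)                  # must be followed by observe()
    states, terminal, reward = environment.execute(actions=actions)
    agent.observe(terminal=terminal, reward=reward)     # completes the timestep
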
Example #9
    def observe(self, reward, terminal=False, parallel=0, query=None, **kwargs):
        """
        Observes reward and whether a terminal state is reached, needs to be preceded by
        `act(...)`.

        Args:
            reward (float): Reward
                (<span style="color:#C00000"><b>required</b></span>).
            terminal (bool | 0 | 1 | 2): Whether a terminal state is reached or 2 if the
                episode was aborted (<span style="color:#00C000"><b>default</b></span>: false).
            parallel (int): Parallel execution index
                (<span style="color:#00C000"><b>default</b></span>: 0).
            query (list[str]): Names of tensors to retrieve
                (<span style="color:#00C000"><b>default</b></span>: none).
            kwargs: Additional input values, for instance, for dynamic hyperparameters.

        Returns:
            (bool, optional list[str]): Whether an update was performed, plus queried tensor values
            if requested.
        """
        assert util.reduce_all(predicate=util.not_nan_inf, xs=reward)

        if query is not None and self.parallel_interactions > 1:
            raise TensorforceError.unexpected()

        if isinstance(terminal, bool):
            terminal = int(terminal)

        # Update terminal/reward buffer
        index = self.buffer_indices[parallel]
        self.terminal_buffers[parallel, index] = terminal
        self.reward_buffers[parallel, index] = reward
        index += 1

        if self.max_episode_timesteps is not None and index > self.max_episode_timesteps:
            raise TensorforceError.unexpected()

        if terminal > 0 or index == self.buffer_observe or query is not None:
            terminal = self.terminal_buffers[parallel, :index]
            reward = self.reward_buffers[parallel, :index]

            if self.recorder_spec is not None and \
                    self.episodes >= self.recorder_spec.get('start', 0):
                for name in self.states_spec:
                    self.record_states[name].append(
                        np.array(self.states_buffers[name][parallel, :index])
                    )
                for name, spec in self.actions_spec.items():
                    self.record_actions[name].append(
                        np.array(self.actions_buffers[name][parallel, :index])
                    )
                    if spec['type'] == 'int':
                        self.record_states[name + '_mask'].append(
                            np.array(self.states_buffers[name + '_mask'][parallel, :index])
                        )
                self.record_terminal.append(np.array(terminal))
                self.record_reward.append(np.array(reward))

                if terminal[-1] > 0:
                    self.num_episodes += 1

                    if self.num_episodes == self.recorder_spec.get('frequency', 1):
                        directory = self.recorder_spec['directory']
                        if os.path.isdir(directory):
                            files = sorted(
                                f for f in os.listdir(directory)
                                if os.path.isfile(os.path.join(directory, f))
                                and f.startswith('trace-')
                            )
                        else:
                            os.makedirs(directory)
                            files = list()
                        max_traces = self.recorder_spec.get('max-traces')
                        if max_traces is not None and len(files) > max_traces - 1:
                            for filename in files[:-max_traces + 1]:
                                filename = os.path.join(directory, filename)
                                os.remove(filename)

                        filename = 'trace-{}-{}.npz'.format(
                            self.episodes, time.strftime('%Y%m%d-%H%M%S')
                        )
                        filename = os.path.join(directory, filename)
                        self.record_states = util.fmap(
                            function=np.concatenate, xs=self.record_states, depth=1
                        )
                        self.record_actions = util.fmap(
                            function=np.concatenate, xs=self.record_actions, depth=1
                        )
                        self.record_terminal = np.concatenate(self.record_terminal)
                        self.record_reward = np.concatenate(self.record_reward)
                        np.savez_compressed(
                            filename, **self.record_states, **self.record_actions,
                            terminal=self.record_terminal, reward=self.record_reward
                        )
                        self.record_states = util.fmap(
                            function=(lambda x: list()), xs=self.record_states, depth=1
                        )
                        self.record_actions = util.fmap(
                            function=(lambda x: list()), xs=self.record_actions, depth=1
                        )
                        self.record_terminal = list()
                        self.record_reward = list()
                        self.num_episodes = 0

            # Model.observe()
            if query is None:
                updated, self.episodes, self.updates = self.model.observe(
                    terminal=terminal, reward=reward, parallel=[parallel], **kwargs
                )

            else:
                updated, self.episodes, self.updates, queried = self.model.observe(
                    terminal=terminal, reward=reward, parallel=[parallel], query=query, **kwargs
                )

            # Reset buffer index
            self.buffer_indices[parallel] = 0

        else:
            # Increment buffer index
            self.buffer_indices[parallel] = index
            updated = False

        if query is None:
            return updated
        else:
            return updated, queried
Example #10
    def __init__(
        # Environment
        self, states, actions, max_episode_timesteps=None,
        # TensorFlow etc
        parallel_interactions=1, buffer_observe=True, seed=None, recorder=None
    ):
        assert hasattr(self, 'spec')

        if seed is not None:
            assert isinstance(seed, int)
            random.seed(a=seed)
            np.random.seed(seed=seed)

        # States/actions specification
        self.states_spec = util.valid_values_spec(
            values_spec=states, value_type='state', return_normalized=True
        )
        self.actions_spec = util.valid_values_spec(
            values_spec=actions, value_type='action', return_normalized=True
        )
        self.max_episode_timesteps = max_episode_timesteps

        # Check for name overlap
        for name in self.states_spec:
            if name in self.actions_spec:
                raise TensorforceError.collision(
                    name='name', value=name, group1='states', group2='actions'
                )

        # Parallel episodes
        if isinstance(parallel_interactions, int):
            if parallel_interactions <= 0:
                raise TensorforceError.value(
                    name='parallel_interactions', value=parallel_interactions
                )
            self.parallel_interactions = parallel_interactions
        else:
            raise TensorforceError.type(name='parallel_interactions', value=parallel_interactions)

        # Buffer observe
        if isinstance(buffer_observe, bool):
            if not buffer_observe and self.parallel_interactions > 1:
                raise TensorforceError.unexpected()
            if self.max_episode_timesteps is None and self.parallel_interactions > 1:
                raise TensorforceError.unexpected()
            if not buffer_observe:
                self.buffer_observe = 1
            elif self.max_episode_timesteps is None:
                self.buffer_observe = 100
            else:
                self.buffer_observe = self.max_episode_timesteps
        elif isinstance(buffer_observe, int):
            if buffer_observe <= 0:
                raise TensorforceError.value(name='buffer_observe', value=buffer_observe)
            if self.parallel_interactions > 1:
                raise TensorforceError.unexpected()
            if self.max_episode_timesteps is None:
                self.buffer_observe = buffer_observe
            else:
                self.buffer_observe = min(buffer_observe, self.max_episode_timesteps)
        else:
            raise TensorforceError.type(name='buffer_observe', value=buffer_observe)

        # Recorder
        if recorder is None:
            pass
        elif not all(key in ('directory', 'frequency', 'max-traces', 'start') for key in recorder):
            raise TensorforceError.value(name='recorder', value=list(recorder))
        self.recorder_spec = recorder if recorder is None else dict(recorder)

        self.is_initialized = False
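
A hedged example of a recorder specification that passes the key check above; the directory and numbers are placeholders:

recorder = {
    'directory': 'data/traces',  # where trace-*.npz files are written
    'frequency': 10,             # write traces every 10 recorded episodes
    'max-traces': 5,             # keep at most 5 trace files on disk
    'start': 100,                # only record from episode 100 onwards
}
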
Example #11
    def initialize(self):
        """
        Initializes the agent.
        """
        if self.is_initialized:
            raise TensorforceError.unexpected()

        self.is_initialized = True

        # Parallel terminal/reward buffers
        self.terminal_buffers = np.ndarray(
            shape=(self.parallel_interactions, self.buffer_observe),
            dtype=util.np_dtype(dtype='long')
        )
        self.reward_buffers = np.ndarray(
            shape=(self.parallel_interactions, self.buffer_observe),
            dtype=util.np_dtype(dtype='float')
        )

        # Recorder buffers if required
        if self.recorder_spec is not None:
            self.states_buffers = OrderedDict()
            self.actions_buffers = OrderedDict()
            for name, spec in self.states_spec.items():
                shape = (self.parallel_interactions, self.buffer_observe) + spec['shape']
                self.states_buffers[name] = np.ndarray(
                    shape=shape, dtype=util.np_dtype(dtype=spec['type'])
                )
            for name, spec in self.actions_spec.items():
                shape = (self.parallel_interactions, self.buffer_observe) + spec['shape']
                self.actions_buffers[name] = np.ndarray(
                    shape=shape, dtype=util.np_dtype(dtype=spec['type'])
                )
                if spec['type'] == 'int':
                    shape = (self.parallel_interactions, self.buffer_observe) + spec['shape'] + \
                        (spec['num_values'],)
                    self.states_buffers[name + '_mask'] = np.ndarray(
                        shape=shape, dtype=util.np_dtype(dtype='bool')
                    )

            self.num_episodes = 0
            self.record_states = OrderedDict(((name, list()) for name in self.states_spec))
            self.record_actions = OrderedDict(((name, list()) for name in self.actions_spec))
            for name, spec in self.actions_spec.items():
                if spec['type'] == 'int':
                    self.record_states[name + '_mask'] = list()
            self.record_terminal = list()
            self.record_reward = list()

        # Parallel buffer indices
        self.buffer_indices = np.zeros(
            shape=(self.parallel_interactions,), dtype=util.np_dtype(dtype='int')
        )

        self.timesteps = 0
        self.episodes = 0
        self.updates = 0

        # Setup Model
        if not hasattr(self, 'model'):
            raise TensorforceError.missing(name='Agent', value='model')

        self.model.initialize()
        if self.model.saver_directory is not None:
            file = os.path.join(self.model.saver_directory, self.model.saver_filename + '.json')
            with open(file, 'w') as fp:
                json.dump(obj=self.spec, fp=fp)

        self.reset()
Example #12
    def __init__(
            # Required
            self,
            states,
            actions,
            max_episode_timesteps,
            batch_size,
            # Network
            network='auto',
            use_beta_distribution=False,
            # Memory
            memory='minimum',
            # Optimization
            update_frequency=1.0,
            learning_rate=1e-3,
            multi_step=10,
            subsampling_fraction=0.33,
            # Reward estimation
            likelihood_ratio_clipping=0.25,
            discount=0.99,
            return_processing=None,
            advantage_processing=None,
            predict_terminal_values=False,
            # Baseline
            baseline=None,
            baseline_optimizer=None,
            # Preprocessing
            state_preprocessing='linear_normalization',
            reward_preprocessing=None,
            # Exploration
            exploration=0.0,
            variable_noise=0.0,
            # Regularization
            l2_regularization=0.0,
            entropy_regularization=0.0,
            # Parallel interactions
            parallel_interactions=1,
            # Config, saver, summarizer, tracking, recorder
            config=None,
            saver=None,
            summarizer=None,
            tracking=None,
            recorder=None,
            # Deprecated
            **kwargs):
        if 'optimization_steps' in kwargs:
            raise TensorforceError.deprecated(name='PPO',
                                              argument='optimization_steps',
                                              replacement='multi_step')
        if 'estimate_terminal' in kwargs:
            raise TensorforceError.deprecated(
                name='PPO',
                argument='estimate_terminal',
                replacement='predict_terminal_values')
        if 'critic_network' in kwargs:
            raise TensorforceError.deprecated(name='PPO',
                                              argument='critic_network',
                                              replacement='baseline')
        if 'baseline_network' in kwargs:
            raise TensorforceError.deprecated(name='PPO',
                                              argument='baseline_network',
                                              replacement='baseline')
        if 'critic_optimizer' in kwargs:
            raise TensorforceError.deprecated(name='PPO',
                                              argument='critic_optimizer',
                                              replacement='baseline_optimizer')

        self.spec = OrderedDict(
            agent='ppo',
            states=states,
            actions=actions,
            max_episode_timesteps=max_episode_timesteps,
            batch_size=batch_size,
            network=network,
            use_beta_distribution=use_beta_distribution,
            memory=memory,
            update_frequency=update_frequency,
            learning_rate=learning_rate,
            multi_step=multi_step,
            subsampling_fraction=subsampling_fraction,
            likelihood_ratio_clipping=likelihood_ratio_clipping,
            discount=discount,
            return_processing=return_processing,
            advantage_processing=advantage_processing,
            predict_terminal_values=predict_terminal_values,
            baseline=baseline,
            baseline_optimizer=baseline_optimizer,
            state_preprocessing=state_preprocessing,
            reward_preprocessing=reward_preprocessing,
            exploration=exploration,
            variable_noise=variable_noise,
            l2_regularization=l2_regularization,
            entropy_regularization=entropy_regularization,
            parallel_interactions=parallel_interactions,
            config=config,
            saver=saver,
            summarizer=summarizer,
            tracking=tracking,
            recorder=recorder)

        policy = dict(type='parametrized_distributions',
                      network=network,
                      temperature=1.0,
                      use_beta_distribution=use_beta_distribution)

        if memory == 'minimum':
            memory = dict(type='recent')
        else:
            memory = dict(type='recent', capacity=memory)

        update = dict(unit='episodes',
                      batch_size=batch_size,
                      frequency=update_frequency)

        optimizer = dict(optimizer='adam',
                         learning_rate=learning_rate,
                         multi_step=multi_step,
                         subsampling_fraction=subsampling_fraction)
        objective = dict(type='policy_gradient',
                         importance_sampling=True,
                         clipping_value=likelihood_ratio_clipping)

        if baseline is None:
            assert not predict_terminal_values
            reward_estimation = dict(horizon='episode',
                                     discount=discount,
                                     predict_horizon_values=False,
                                     estimate_advantage=False)
            assert baseline_optimizer is None
            baseline_objective = None

        else:
            reward_estimation = dict(
                horizon='episode',
                discount=discount,
                predict_horizon_values='early',
                estimate_advantage=True,
                predict_action_values=False,
                predict_terminal_values=predict_terminal_values)
            baseline = dict(type='parametrized_state_value', network=baseline)
            assert baseline_optimizer is not None
            baseline_objective = dict(type='state_value')

        super().__init__(
            # Agent
            states=states,
            actions=actions,
            max_episode_timesteps=max_episode_timesteps,
            parallel_interactions=parallel_interactions,
            config=config,
            recorder=recorder,
            # TensorforceModel
            policy=policy,
            memory=memory,
            update=update,
            optimizer=optimizer,
            objective=objective,
            reward_estimation=reward_estimation,
            baseline=baseline,
            baseline_optimizer=baseline_optimizer,
            baseline_objective=baseline_objective,
            l2_regularization=l2_regularization,
            entropy_regularization=entropy_regularization,
            state_preprocessing=state_preprocessing,
            reward_preprocessing=reward_preprocessing,
            exploration=exploration,
            variable_noise=variable_noise,
            saver=saver,
            summarizer=summarizer,
            tracking=tracking,
            **kwargs)
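
A hedged usage sketch for the PPO constructor above, without a baseline (so the predict_horizon_values=False branch applies); the state/action specs and hyperparameters are illustrative:

from tensorforce import Agent

agent = Agent.create(
    agent='ppo',
    states=dict(type='float', shape=(4,)),
    actions=dict(type='int', shape=(), num_values=2),
    max_episode_timesteps=200,    # required, no default in the signature above
    batch_size=10,                # update after every 10 episodes
    learning_rate=1e-3,
    multi_step=10,                # Adam steps per update
)
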
Example #13
    def __init__(
        self,
        # Model
        name, device, parallel_interactions, buffer_observe, seed, execution, saver, summarizer,
        config, states, actions, preprocessing, exploration, variable_noise, l2_regularization,
        # TensorforceModel
        policy, memory, update, optimizer, objective, reward_estimation, baseline_policy,
        baseline_optimizer, baseline_objective, entropy_regularization
    ):
        # Policy internals specification
        policy_cls, first_arg, kwargs = Module.get_module_class_and_kwargs(
            name='policy', module=policy, modules=policy_modules, states_spec=states,
            actions_spec=actions
        )
        if first_arg is None:
            internals = policy_cls.internals_spec(name='policy', **kwargs)
        else:
            internals = policy_cls.internals_spec(first_arg, name='policy', **kwargs)
        if any(name.startswith('baseline-') for name in internals):
            raise TensorforceError.unexpected()

        # Baseline internals specification
        if baseline_policy is None:
            pass
        else:
            baseline_cls, first_arg, kwargs = Module.get_module_class_and_kwargs(
                name='baseline', module=baseline_policy, modules=policy_modules,
                states_spec=states, actions_spec=actions
            )
            if first_arg is None:
                baseline_internals = baseline_cls.internals_spec(name='baseline', **kwargs)
            else:
                baseline_internals = baseline_cls.internals_spec(
                    first_arg, name='baseline', **kwargs
                )
            for name, spec in baseline_internals.items():
                if name in internals:
                    raise TensorforceError(
                        "Name overlap between policy and baseline internals: {}.".format(name)
                    )
                internals[name] = spec

        super().__init__(
            # Model
            name=name, device=device, parallel_interactions=parallel_interactions,
            buffer_observe=buffer_observe, seed=seed, execution=execution, saver=saver,
            summarizer=summarizer, config=config, states=states, internals=internals,
            actions=actions, preprocessing=preprocessing, exploration=exploration,
            variable_noise=variable_noise, l2_regularization=l2_regularization
        )

        # Policy
        self.policy = self.add_module(
            name='policy', module=policy, modules=policy_modules, states_spec=self.states_spec,
            actions_spec=self.actions_spec
        )

        # Memory
        self.memory = self.add_module(
            name='memory', module=memory, modules=memory_modules, is_trainable=False,
            values_spec=self.values_spec
        )

        # Update mode
        if not all(key in ('batch_size', 'frequency', 'start', 'unit') for key in update):
            raise TensorforceError.value(name='update', value=list(update))
        # update: unit
        elif 'unit' not in update:
            raise TensorforceError.required(name='update', value='unit')
        elif update['unit'] not in ('timesteps', 'episodes'):
            raise TensorforceError.value(
                name='update', argument='unit', value=update['unit']
            )
        # update: batch_size
        elif 'batch_size' not in update:
            raise TensorforceError.required(name='update', value='batch_size')

        self.update_unit = update['unit']
        self.update_batch_size = self.add_module(
            name='update-batch-size', module=update['batch_size'], modules=parameter_modules,
            is_trainable=False, dtype='long'
        )
        if 'frequency' in update and update['frequency'] == 'never':
            self.update_frequency = 'never'
        else:
            self.update_frequency = self.add_module(
                name='update-frequency', module=update.get('frequency', update['batch_size']),
                modules=parameter_modules, is_trainable=False, dtype='long'
            )
            self.update_start = self.add_module(
                name='update-start', module=update.get('start', 0), modules=parameter_modules,
                is_trainable=False, dtype='long'
            )

        # Optimizer
        self.optimizer = self.add_module(
            name='optimizer', module=optimizer, modules=optimizer_modules, is_trainable=False
        )

        # Objective
        self.objective = self.add_module(
            name='objective', module=objective, modules=objective_modules, is_trainable=False
        )

        # Estimator
        if not all(key in (
            'capacity', 'discount', 'estimate_actions', 'estimate_advantage', 'estimate_horizon',
            'estimate_terminal', 'horizon'
        ) for key in reward_estimation):
            raise TensorforceError.value(name='reward_estimation', value=list(reward_estimation))
        if baseline_policy is None and baseline_optimizer is None and baseline_objective is None:
            estimate_horizon = False
        else:
            estimate_horizon = 'late'
        self.estimator = self.add_module(
            name='estimator', module=Estimator, is_trainable=False, is_saved=False,
            values_spec=self.values_spec, horizon=reward_estimation['horizon'],
            discount=reward_estimation.get('discount', 1.0),
            estimate_horizon=reward_estimation.get('estimate_horizon', estimate_horizon),
            estimate_actions=reward_estimation.get('estimate_actions', False),
            estimate_terminal=reward_estimation.get('estimate_terminal', False),
            estimate_advantage=reward_estimation.get('estimate_advantage', False),
            capacity=reward_estimation['capacity']
        )

        # Baseline
        if (baseline_policy is not None or baseline_objective is not None) and \
                (baseline_optimizer is None or isinstance(baseline_optimizer, float)):
            # since otherwise not part of training
            assert self.estimator.estimate_advantage or baseline_objective is not None
            is_trainable = True
        else:
            is_trainable = False
        if baseline_policy is None:
            self.baseline_policy = self.policy
        else:
            self.baseline_policy = self.add_module(
                name='baseline', module=baseline_policy, modules=policy_modules,
                is_trainable=is_trainable, is_subscope=True, states_spec=self.states_spec,
                actions_spec=self.actions_spec
            )

        # Baseline optimizer
        if baseline_optimizer is None:
            self.baseline_optimizer = None
            self.baseline_loss_weight = 1.0
        elif isinstance(baseline_optimizer, float):
            self.baseline_optimizer = None
            self.baseline_loss_weight = baseline_optimizer
        else:
            self.baseline_optimizer = self.add_module(
                name='baseline-optimizer', module=baseline_optimizer, modules=optimizer_modules,
                is_trainable=False, is_subscope=True
            )

        # Baseline objective
        if baseline_objective is None:
            self.baseline_objective = None
        else:
            self.baseline_objective = self.add_module(
                name='baseline-objective', module=baseline_objective, modules=objective_modules,
                is_trainable=False, is_subscope=True
            )

        # Entropy regularization
        entropy_regularization = 0.0 if entropy_regularization is None else entropy_regularization
        self.entropy_regularization = self.add_module(
            name='entropy-regularization', module=entropy_regularization,
            modules=parameter_modules, is_trainable=False, dtype='float'
        )

        # Internals initialization
        self.internals_init.update(self.policy.internals_init())
        self.internals_init.update(self.baseline_policy.internals_init())
        if any(internal_init is None for internal_init in self.internals_init.values()):
            raise TensorforceError.unexpected()

        # Register global tensors
        Module.register_tensor(name='update', spec=dict(type='long', shape=()), batched=False)
        Module.register_tensor(
            name='optimization', spec=dict(type='bool', shape=()), batched=False
        )
        Module.register_tensor(
            name='dependency_starts', spec=dict(type='long', shape=()), batched=True
        )
        Module.register_tensor(
            name='dependency_lengths', spec=dict(type='long', shape=()), batched=True
        )
Example #14
    def __init__(
            # Required
            self,
            states,
            actions,
            memory,
            # Environment
            max_episode_timesteps=None,
            # Network
            network='auto',
            # Optimization
            batch_size=32,
            update_frequency=None,
            start_updating=None,
            learning_rate=3e-4,
            # Reward estimation
            horizon=0,
            discount=0.99,
            estimate_terminal=False,
            # Critic
            critic_network='auto',
            critic_optimizer=1.0,
            # Preprocessing
            preprocessing=None,
            # Exploration
            exploration=0.0,
            variable_noise=0.0,
            # Regularization
            l2_regularization=0.0,
            entropy_regularization=0.0,
            # TensorFlow etc
            name='agent',
            device=None,
            parallel_interactions=1,
            seed=None,
            execution=None,
            saver=None,
            summarizer=None,
            recorder=None,
            config=None):
        self.spec = OrderedDict(agent='dpg',
                                states=states,
                                actions=actions,
                                max_episode_timesteps=max_episode_timesteps,
                                network=network,
                                memory=memory,
                                batch_size=batch_size,
                                update_frequency=update_frequency,
                                start_updating=start_updating,
                                learning_rate=learning_rate,
                                horizon=horizon,
                                discount=discount,
                                estimate_terminal=estimate_terminal,
                                critic_network=critic_network,
                                critic_optimizer=critic_optimizer,
                                preprocessing=preprocessing,
                                exploration=exploration,
                                variable_noise=variable_noise,
                                l2_regularization=l2_regularization,
                                entropy_regularization=entropy_regularization,
                                name=name,
                                device=device,
                                parallel_interactions=parallel_interactions,
                                seed=seed,
                                execution=execution,
                                saver=saver,
                                summarizer=summarizer,
                                recorder=recorder,
                                config=config)

        # TODO: action type and shape

        assert max_episode_timesteps is None or \
            memory >= batch_size + max_episode_timesteps + horizon
        policy = dict(network=network, temperature=0.0)
        memory = dict(type='replay', capacity=memory)
        update = dict(unit='timesteps', batch_size=batch_size)
        if update_frequency is not None:
            update['frequency'] = update_frequency
        if start_updating is not None:
            update['start'] = start_updating
        optimizer = dict(type='adam', learning_rate=learning_rate)
        objective = 'det_policy_gradient'
        reward_estimation = dict(horizon=horizon,
                                 discount=discount,
                                 estimate_horizon='late',
                                 estimate_terminal=estimate_terminal,
                                 estimate_actions=True)
        # Action value doesn't exist for Beta
        baseline_policy = dict(network=critic_network,
                               distributions=dict(float='gaussian'))
        baseline_objective = dict(type='value', value='action')

        super().__init__(
            # Agent
            states=states,
            actions=actions,
            max_episode_timesteps=max_episode_timesteps,
            parallel_interactions=parallel_interactions,
            buffer_observe=True,
            seed=seed,
            recorder=recorder,
            config=config,
            # Model
            name=name,
            device=device,
            execution=execution,
            saver=saver,
            summarizer=summarizer,
            preprocessing=preprocessing,
            exploration=exploration,
            variable_noise=variable_noise,
            l2_regularization=l2_regularization,
            # TensorforceModel
            policy=policy,
            memory=memory,
            update=update,
            optimizer=optimizer,
            objective=objective,
            reward_estimation=reward_estimation,
            baseline_policy=baseline_policy,
            baseline_optimizer=critic_optimizer,
            baseline_objective=baseline_objective,
            entropy_regularization=entropy_regularization)

        action_spec = next(iter(self.actions_spec.values()))
        if len(self.actions_spec) > 1 or action_spec['type'] != 'float' or \
                action_spec['shape'] != ():
            raise TensorforceError.unexpected()
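
# Hedged usage sketch (assumes the Tensorforce 0.5-style Agent.create API shown in
# this example; state/action specs and hyperparameters are illustrative, and the
# action must be a single scalar float action as enforced by the check above):
from tensorforce import Agent

agent = Agent.create(
    agent='dpg',
    states=dict(type='float', shape=(4,)),
    actions=dict(type='float', shape=()),   # single scalar float action
    max_episode_timesteps=100,
    memory=10000, batch_size=32, learning_rate=3e-4
)
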
Example #15
    def get_output_spec(self, input_spec):
        if len(self.tensors) == 1:
            return Module.get_tensor_spec(name=self.tensors[0])

        # Get tensor types and shapes
        dtypes = list()
        shapes = list()
        for tensor in self.tensors:
            # Tensor specification
            if tensor == '*':
                spec = input_spec
            else:
                spec = Module.get_tensor_spec(name=tensor)
            dtypes.append(spec['type'])
            shapes.append(spec['shape'])

        # Check tensor types
        if all(dtype == dtypes[0] for dtype in dtypes):
            dtype = dtypes[0]
        else:
            raise TensorforceError.value(name='retrieve',
                                         argument='tensor types',
                                         value=dtypes)

        if self.aggregation == 'concat':
            if any(len(shape) != len(shapes[0]) for shape in shapes):
                raise TensorforceError.value(name='retrieve',
                                             argument='tensor shapes',
                                             value=shapes)
            elif any(shape[n] != shapes[0][n] for shape in shapes
                     for n in range(len(shape)) if n != self.axis):
                raise TensorforceError.value(name='retrieve',
                                             argument='tensor shapes',
                                             value=shapes)
            shape = tuple(
                sum(shape[n]
                    for shape in shapes) if n == self.axis else shapes[0][n]
                for n in range(len(shapes[0])))

        elif self.aggregation == 'stack':
            if any(len(shape) != len(shapes[0]) for shape in shapes):
                raise TensorforceError.value(name='retrieve',
                                             argument='tensor shapes',
                                             value=shapes)
            elif any(shape[n] != shapes[0][n] for shape in shapes
                     for n in range(len(shape))):
                raise TensorforceError.value(name='retrieve',
                                             argument='tensor shapes',
                                             value=shapes)
            shape = tuple(
                len(shapes) if n == self.axis else shapes[0][n - int(
                    n > self.axis)] for n in range(len(shapes[0]) + 1))

        else:
            # Check and unify tensor shapes
            for shape in shapes:
                if len(shape) != len(shapes[0]):
                    raise TensorforceError.value(name='retrieve',
                                                 argument='tensor shapes',
                                                 value=shapes)
                if any(x != y and x != 1 and y != 1
                       for x, y in zip(shape, shapes[0])):
                    raise TensorforceError.value(name='retrieve',
                                                 argument='tensor shapes',
                                                 value=shapes)
            shape = tuple(
                max(shape[n] for shape in shapes)
                for n in range(len(shapes[0])))

        # Missing num_values, min/max_value!!!
        return dict(type=dtype, shape=shape)
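
# Worked shape example (standalone sketch, not part of the layer itself) showing what
# the 'concat' and 'stack' branches above compute for axis == 0:
axis = 0
shapes = [(3, 4), (5, 4)]
concat_shape = tuple(
    sum(shape[n] for shape in shapes) if n == axis else shapes[0][n]
    for n in range(len(shapes[0]))
)                                    # -> (8, 4)
stack_shapes = [(3, 4), (3, 4)]      # 'stack' additionally requires identical shapes
stack_shape = tuple(
    len(stack_shapes) if n == axis else stack_shapes[0][n - int(n > axis)]
    for n in range(len(stack_shapes[0]) + 1)
)                                    # -> (2, 3, 4)
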
Example #16
    def __init__(
        # Required
        self, states, actions, memory, batch_size,
        # Environment
        max_episode_timesteps=None,
        # Network
        network='auto', use_beta_distribution=True,
        # Optimization
        update_frequency=1.0, start_updating=None, learning_rate=1e-3,
        # Reward estimation
        horizon=1, discount=0.99, return_processing=None, predict_terminal_values=False,
        # Critic
        critic='auto', critic_optimizer=1.0,
        # Preprocessing
        state_preprocessing='linear_normalization', reward_preprocessing=None,
        # Exploration
        exploration=0.1, variable_noise=0.0,
        # Regularization
        l2_regularization=0.0, entropy_regularization=0.0,
        # Parallel interactions
        parallel_interactions=1,
        # Config, saver, summarizer, tracking, recorder
        config=None, saver=None, summarizer=None, tracking=None, recorder=None,
        # Deprecated
        **kwargs
    ):
        if 'estimate_terminal' in kwargs:
            raise TensorforceError.deprecated(
                name='DPG', argument='estimate_terminal', replacement='predict_terminal_values'
            )
        if 'critic_network' in kwargs:
            raise TensorforceError.deprecated(
                name='DPG', argument='critic_network', replacement='critic'
            )

        self.spec = OrderedDict(
            agent='dpg',
            states=states, actions=actions, memory=memory, batch_size=batch_size,
            max_episode_timesteps=max_episode_timesteps,
            network=network, use_beta_distribution=use_beta_distribution,
            update_frequency=update_frequency, start_updating=start_updating,
            learning_rate=learning_rate,
            horizon=horizon, discount=discount, return_processing=return_processing,
            predict_terminal_values=predict_terminal_values,
            critic=critic, critic_optimizer=critic_optimizer,
            state_preprocessing=state_preprocessing, reward_preprocessing=reward_preprocessing,
            exploration=exploration, variable_noise=variable_noise,
            l2_regularization=l2_regularization, entropy_regularization=entropy_regularization,
            parallel_interactions=parallel_interactions,
            config=config, saver=saver, summarizer=summarizer, tracking=tracking, recorder=recorder
        )

        policy = dict(
            type='parametrized_distributions', network=network, temperature=0.0,
            use_beta_distribution=use_beta_distribution
        )

        memory = dict(type='replay', capacity=memory)

        update = dict(
            unit='timesteps', batch_size=batch_size, frequency=update_frequency,
            start=start_updating
        )

        optimizer = dict(type='adam', learning_rate=learning_rate)
        objective = 'deterministic_policy_gradient'

        reward_estimation = dict(
            horizon=horizon, discount=discount, predict_horizon_values='late',
            estimate_advantage=False, predict_action_values=True,
            predict_terminal_values=predict_terminal_values
        )

        baseline = dict(type='parametrized_action_value', network=critic)
        baseline_optimizer = critic_optimizer
        baseline_objective = dict(type='value', value='action')

        super().__init__(
            # Agent
            states=states, actions=actions, max_episode_timesteps=max_episode_timesteps,
            parallel_interactions=parallel_interactions, config=config, recorder=recorder,
            # TensorforceModel
            policy=policy, memory=memory, update=update, optimizer=optimizer, objective=objective,
            reward_estimation=reward_estimation,
            baseline=baseline, baseline_optimizer=baseline_optimizer,
            baseline_objective=baseline_objective,
            l2_regularization=l2_regularization, entropy_regularization=entropy_regularization,
            state_preprocessing=state_preprocessing, reward_preprocessing=reward_preprocessing,
            exploration=exploration, variable_noise=variable_noise,
            saver=saver, summarizer=summarizer, tracking=tracking, **kwargs
        )
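
# Hedged usage sketch (assumes the Tensorforce 0.6-style API shown in this example;
# the Gym level and hyperparameter values are illustrative):
from tensorforce import Agent, Environment

environment = Environment.create(
    environment='gym', level='Pendulum-v1', max_episode_timesteps=200
)
agent = Agent.create(
    agent='dpg', environment=environment,
    memory=10000, batch_size=64, horizon=1, critic='auto', critic_optimizer=1.0
)
# Passing a deprecated argument raises TensorforceError.deprecated, e.g.
# critic_network='auto' must now be given as critic='auto'.
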
Example #17
    def __init__(self, *, name=None, action_spec=None, input_spec=None):
        assert action_spec.type == 'float' and action_spec.min_value is not None and \
            action_spec.max_value is not None

        parameters_spec = TensorsSpec(
            alpha=TensorSpec(type='float', shape=action_spec.shape),
            beta=TensorSpec(type='float', shape=action_spec.shape),
            alpha_beta=TensorSpec(type='float', shape=action_spec.shape),
            log_norm=TensorSpec(type='float', shape=action_spec.shape))
        conditions_spec = TensorsSpec()

        super().__init__(name=name,
                         action_spec=action_spec,
                         input_spec=input_spec,
                         parameters_spec=parameters_spec,
                         conditions_spec=conditions_spec)

        if len(self.input_spec.shape) == 1:
            # Single embedding
            action_size = util.product(xs=self.action_spec.shape, empty=0)
            self.alpha = self.submodule(name='alpha',
                                        module='linear',
                                        modules=layer_modules,
                                        size=action_size,
                                        initialization_scale=0.01,
                                        input_spec=self.input_spec)
            self.beta = self.submodule(name='beta',
                                       module='linear',
                                       modules=layer_modules,
                                       size=action_size,
                                       initialization_scale=0.01,
                                       input_spec=self.input_spec)

        else:
            # Embedding per action
            if len(self.input_spec.shape) < 1 or len(
                    self.input_spec.shape) > 3:
                raise TensorforceError.value(name=name,
                                             argument='input_spec.shape',
                                             value=self.input_spec.shape,
                                             hint='invalid rank')
            if self.input_spec.shape[:-1] == self.action_spec.shape[:-1]:
                size = self.action_spec.shape[-1]
            elif self.input_spec.shape[:-1] == self.action_spec.shape:
                size = 0
            else:
                raise TensorforceError.value(
                    name=name,
                    argument='input_spec.shape',
                    value=self.input_spec.shape,
                    hint='not flattened and incompatible with action shape')
            self.alpha = self.submodule(name='alpha',
                                        module='linear',
                                        modules=layer_modules,
                                        size=size,
                                        initialization_scale=0.01,
                                        input_spec=self.input_spec)
            self.beta = self.submodule(name='beta',
                                       module='linear',
                                       modules=layer_modules,
                                       size=size,
                                       initialization_scale=0.01,
                                       input_spec=self.input_spec)
Example #18
    def __init__(
        self, *, stddev_mode='predicted', bounded_transform='tanh', name=None, action_spec=None,
        input_spec=None
    ):
        assert action_spec.type == 'float'

        parameters_spec = TensorsSpec(
            mean=TensorSpec(type='float', shape=action_spec.shape),
            stddev=TensorSpec(type='float', shape=action_spec.shape),
            log_stddev=TensorSpec(type='float', shape=action_spec.shape)
        )
        conditions_spec = TensorsSpec()

        super().__init__(
            name=name, action_spec=action_spec, input_spec=input_spec,
            parameters_spec=parameters_spec, conditions_spec=conditions_spec
        )

        self.stddev_mode = stddev_mode

        if bounded_transform is None:
            bounded_transform = 'tanh'
        if bounded_transform not in ('clipping', 'tanh'):
            raise TensorforceError.value(
                name='Gaussian', argument='bounded_transform', value=bounded_transform,
                hint='not in {clipping,tanh}'
            )
        elif bounded_transform == 'tanh' and (
            (self.action_spec.min_value is not None) is not (self.action_spec.max_value is not None)
        ):
            raise TensorforceError.value(
                name='Gaussian', argument='bounded_transform', value=bounded_transform,
                condition='one-sided bounded action space'
            )
        elif self.action_spec.min_value is None and self.action_spec.max_value is None:
            bounded_transform = None
        self.bounded_transform = bounded_transform

        if self.input_spec.rank == 1:
            # Single embedding
            self.mean = self.submodule(
                name='mean', module='linear', modules=layer_modules, size=self.action_spec.size,
                initialization_scale=0.01, input_spec=self.input_spec
            )
            if self.stddev_mode == 'predicted':
                self.softplus_stddev = self.submodule(
                    name='softplus_stddev', module='linear', modules=layer_modules,
                    size=self.action_spec.size, initialization_scale=0.01,
                    input_spec=self.input_spec
                )

        else:
            # Embedding per action
            if self.input_spec.rank < 1 or self.input_spec.rank > 3:
                raise TensorforceError.value(
                    name=name, argument='input_spec.shape', value=self.input_spec.shape,
                    hint='invalid rank'
                )
            elif self.input_spec.shape[:-1] == self.action_spec.shape[:-1]:
                size = self.action_spec.shape[-1]
            elif self.input_spec.shape[:-1] == self.action_spec.shape:
                size = 0
            else:
                raise TensorforceError.value(
                    name=name, argument='input_spec.shape', value=self.input_spec.shape,
                    hint='not flattened and incompatible with action shape'
                )
            self.mean = self.submodule(
                name='mean', module='linear', modules=layer_modules, size=size,
                initialization_scale=0.01, input_spec=self.input_spec
            )
            if self.stddev_mode == 'predicted':
                self.softplus_stddev = self.submodule(
                    name='softplus_stddev', module='linear', modules=layer_modules, size=size,
                    initialization_scale=0.01, input_spec=self.input_spec
                )
Example #19
    def create(environment=None,
               max_episode_timesteps=None,
               reward_shaping=None,
               remote=None,
               blocking=False,
               host=None,
               port=None,
               **kwargs):
        """
        Creates an environment from a specification. In case of "socket-server" remote mode, runs
        environment in server communication loop until closed.

        Args:
            environment (specification | Environment class/object): JSON file, specification key,
                configuration dictionary, library module, `Environment` class/object, or gym.Env
                (<span style="color:#C00000"><b>required</b></span>, invalid for "socket-client"
                remote mode).
            max_episode_timesteps (int > 0): Maximum number of timesteps per episode, overwrites
                the environment default if defined
                (<span style="color:#00C000"><b>default</b></span>: environment default, invalid
                for "socket-client" remote mode).
            reward_shaping (callable[(s,a,t,r,s') -> r|(r,t)] | str): Reward shaping function
                mapping state, action, terminal, reward and next state to shaped reward and
                terminal, or a string expression with arguments "states", "actions", "terminal",
                "reward" and "next_states", e.g. "-1.0 if terminal else max(reward, 0.0)"
                (<span style="color:#00C000"><b>default</b></span>: no reward shaping).
            remote ("multiprocessing" | "socket-client" | "socket-server"): Communication mode for
                remote or parallelized environment execution, "socket-client"
                mode requires a corresponding "socket-server" running, and "socket-server" mode
                runs environment in server communication loop until closed
                (<span style="color:#00C000"><b>default</b></span>: local execution).
            blocking (bool): Whether remote environment calls should be blocking
                (<span style="color:#00C000"><b>default</b></span>: not blocking, invalid unless
                "multiprocessing" or "socket-client" remote mode).
            host (str): Socket server hostname or IP address
                (<span style="color:#C00000"><b>required</b></span> only for "socket-client" remote
                mode).
            port (int): Socket server port
                (<span style="color:#C00000"><b>required</b></span> only for "socket-client/server"
                remote mode).
            kwargs: Additional arguments.
        """
        if remote not in ('multiprocessing', 'socket-client'):
            if blocking:
                raise TensorforceError.invalid(
                    name='Environment.create',
                    argument='blocking',
                    condition='no multiprocessing/socket-client instance')
        if remote not in ('socket-client', 'socket-server'):
            if host is not None:
                raise TensorforceError.invalid(name='Environment.create',
                                               argument='host',
                                               condition='no socket instance')
            elif port is not None:
                raise TensorforceError.invalid(name='Environment.create',
                                               argument='port',
                                               condition='no socket instance')

        if remote == 'multiprocessing':
            from tensorforce.environments import MultiprocessingEnvironment
            environment = MultiprocessingEnvironment(
                blocking=blocking,
                environment=environment,
                max_episode_timesteps=max_episode_timesteps,
                reward_shaping=reward_shaping,
                **kwargs)
            return environment

        elif remote == 'socket-client':
            if environment is not None:
                raise TensorforceError.invalid(
                    name='Environment.create',
                    argument='environment',
                    condition='socket-client instance')
            elif max_episode_timesteps is not None:
                raise TensorforceError.invalid(
                    name='Environment.create',
                    argument='max_episode_timesteps',
                    condition='socket-client instance')
            elif len(kwargs) > 0:
                raise TensorforceError.invalid(
                    name='Environment.create',
                    argument='kwargs',
                    condition='socket-client instance')
            from tensorforce.environments import SocketEnvironment
            environment = SocketEnvironment(host=host,
                                            port=port,
                                            blocking=blocking)
            return environment

        elif remote == 'socket-server':
            from tensorforce.environments import SocketEnvironment
            SocketEnvironment.remote(
                port=port,
                environment=environment,
                max_episode_timesteps=max_episode_timesteps,
                reward_shaping=reward_shaping,
                **kwargs)

        elif remote is not None:
            raise TensorforceError.value(name='Environment.create',
                                         argument='remote',
                                         value=remote)

        elif isinstance(environment, (EnvironmentWrapper, RemoteEnvironment)):
            if max_episode_timesteps is not None and \
                    max_episode_timesteps != environment.max_episode_timesteps():
                raise TensorforceError(
                    message=
                    'Environment argument max_episode_timesteps has been specified twice '
                    'with different values: {} != {}.'.format(
                        max_episode_timesteps,
                        environment.max_episode_timesteps()))
            if len(kwargs) > 0:
                raise TensorforceError.invalid(
                    name='Environment.create',
                    argument='kwargs',
                    condition='EnvironmentWrapper instance')
            return environment

        elif isinstance(environment, type) and \
                issubclass(environment, (EnvironmentWrapper, RemoteEnvironment)):
            raise TensorforceError.type(name='Environment.create',
                                        argument='environment',
                                        dtype=type(environment))

        elif isinstance(environment, Environment):
            return EnvironmentWrapper(
                environment=environment,
                max_episode_timesteps=max_episode_timesteps,
                reward_shaping=reward_shaping)

        elif isinstance(environment, type) and issubclass(
                environment, Environment):
            environment = environment(**kwargs)
            assert isinstance(environment, Environment)
            return Environment.create(
                environment=environment,
                max_episode_timesteps=max_episode_timesteps,
                reward_shaping=reward_shaping)

        elif isinstance(environment, dict):
            # Dictionary specification
            util.deep_disjoint_update(target=kwargs, source=environment)
            environment = kwargs.pop('environment',
                                     kwargs.pop('type', 'default'))
            assert environment is not None
            if max_episode_timesteps is None:
                max_episode_timesteps = kwargs.pop('max_episode_timesteps',
                                                   None)
            if reward_shaping is None:
                reward_shaping = kwargs.pop('reward_shaping', None)

            return Environment.create(
                environment=environment,
                max_episode_timesteps=max_episode_timesteps,
                reward_shaping=reward_shaping,
                **kwargs)

        elif isinstance(environment, str):
            if os.path.isfile(environment):
                # JSON file specification
                with open(environment, 'r') as fp:
                    environment = json.load(fp=fp)

                util.deep_disjoint_update(target=kwargs, source=environment)
                environment = kwargs.pop('environment',
                                         kwargs.pop('type', 'default'))
                assert environment is not None
                if max_episode_timesteps is None:
                    max_episode_timesteps = kwargs.pop('max_episode_timesteps',
                                                       None)
                if reward_shaping is None:
                    reward_shaping = kwargs.pop('reward_shaping', None)

                return Environment.create(
                    environment=environment,
                    max_episode_timesteps=max_episode_timesteps,
                    reward_shaping=reward_shaping,
                    **kwargs)

            elif environment in tensorforce.environments.environments:
                # Keyword specification
                environment = tensorforce.environments.environments[
                    environment]
                return Environment.create(
                    environment=environment,
                    max_episode_timesteps=max_episode_timesteps,
                    reward_shaping=reward_shaping,
                    **kwargs)

            else:
                # Library specification
                import gym
                _environment = util.try_import_module(
                    module=environment, parent_class=(Environment, gym.Env))
                if _environment is not None:
                    return Environment.create(
                        environment=_environment,
                        max_episode_timesteps=max_episode_timesteps,
                        reward_shaping=reward_shaping,
                        **kwargs)

                # Default: OpenAI Gym
                try:
                    return Environment.create(
                        environment='gym',
                        level=environment,
                        max_episode_timesteps=max_episode_timesteps,
                        reward_shaping=reward_shaping,
                        **kwargs)
                except TensorforceError:
                    raise TensorforceError.value(name='Environment.create',
                                                 argument='environment',
                                                 value=environment)

        else:
            # Default: OpenAI Gym
            import gym
            if isinstance(environment, gym.Env) or \
                    (isinstance(environment, type) and issubclass(environment, gym.Env)):
                return Environment.create(
                    environment='gym',
                    level=environment,
                    max_episode_timesteps=max_episode_timesteps,
                    reward_shaping=reward_shaping,
                    **kwargs)

            else:
                raise TensorforceError.type(name='Environment.create',
                                            argument='environment',
                                            dtype=type(environment))
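
# Hedged usage sketch of the factory above (the Gym level and the reward-shaping
# expression are illustrative values):
from tensorforce import Environment

# Keyword/Gym specification with a string reward-shaping expression
environment = Environment.create(
    environment='gym', level='CartPole-v1', max_episode_timesteps=500,
    reward_shaping='-1.0 if terminal else max(reward, 0.0)'
)

# Dictionary specification, handled by the isinstance(environment, dict) branch
environment = Environment.create(
    environment=dict(environment='gym', level='CartPole-v1'),
    max_episode_timesteps=500
)
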
Example #20
    def __init__(self,
                 agent,
                 environment=None,
                 num_parallel=None,
                 environments=None,
                 max_episode_timesteps=None,
                 evaluation_environment=None,
                 save_best_agent=None):
        self.environments = list()
        if environment is None:
            assert num_parallel is None and environments is not None
            if not util.is_iterable(x=environments):
                raise TensorforceError.type(name='parallel-runner',
                                            argument='environments',
                                            dtype=type(environments))
            elif len(environments) == 0:
                raise TensorforceError.value(name='parallel-runner',
                                             argument='environments',
                                             value=environments)
            num_parallel = len(environments)
            environment = environments[0]
            self.is_environment_external = isinstance(environment, Environment)
            environment = Environment.create(
                environment=environment,
                max_episode_timesteps=max_episode_timesteps)
            states = environment.states()
            actions = environment.actions()
            self.environments.append(environment)
            for environment in environments[1:]:
                assert isinstance(environment,
                                  Environment) == self.is_environment_external
                environment = Environment.create(
                    environment=environment,
                    max_episode_timesteps=max_episode_timesteps)
                assert environment.states() == states
                assert environment.actions() == actions
                self.environments.append(environment)

        else:
            assert num_parallel is not None and environments is None
            assert not isinstance(environment, Environment)
            self.is_environment_external = False
            for _ in range(num_parallel):
                environment = Environment.create(
                    environment=environment,
                    max_episode_timesteps=max_episode_timesteps)
                self.environments.append(environment)

        if evaluation_environment is None:
            self.evaluation_environment = None
        else:
            self.is_eval_environment_external = isinstance(
                evaluation_environment, Environment)
            self.evaluation_environment = Environment.create(
                environment=evaluation_environment,
                max_episode_timesteps=max_episode_timesteps)
            assert self.evaluation_environment.states() == environment.states()
            assert self.evaluation_environment.actions() == environment.actions()

        self.is_agent_external = isinstance(agent, Agent)
        kwargs = dict(parallel_interactions=num_parallel)
        self.agent = Agent.create(agent=agent,
                                  environment=environment,
                                  **kwargs)
        self.save_best_agent = save_best_agent

        self.episode_rewards = list()
        self.episode_timesteps = list()
        self.episode_seconds = list()
        self.episode_agent_seconds = list()
        self.evaluation_rewards = list()
        self.evaluation_timesteps = list()
        self.evaluation_seconds = list()
        self.evaluation_agent_seconds = list()
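
# Hedged usage sketch mirroring the constructor above (the import path
# tensorforce.execution.ParallelRunner is an assumption; all specification values
# are illustrative):
from tensorforce.execution import ParallelRunner

runner = ParallelRunner(
    agent=dict(agent='ppo', batch_size=10),
    environment=dict(environment='gym', level='CartPole-v1'),
    num_parallel=4, max_episode_timesteps=500
)
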
Example #21
 def start_execute(self, actions):
     if self._expect_receive is not None:
         raise TensorforceError.unexpected()
     self._expect_receive = 'execute'
     assert self._actions is None
     self._actions = actions
Example #22
    def load(directory=None, filename=None, format=None, environment=None, **kwargs):
        """
        Restores an agent from a directory/file.

        Args:
            directory (str): Checkpoint directory
                (<span style="color:#C00000"><b>required</b></span>, unless saver is specified).
            filename (str): Checkpoint filename, with or without append and extension
                (<span style="color:#00C000"><b>default</b></span>: "agent").
            format ("checkpoint" | "saved-model" | "numpy" | "hdf5"): File format, "saved-model" loads
                an act-only agent based on a Protobuf model
                (<span style="color:#00C000"><b>default</b></span>: format matching directory and
                filename, required to be unambiguous).
            environment (Environment object): Environment which the agent is supposed to be trained
                on, environment-related arguments like state/action space specifications and
                maximum episode length will be extracted if given
                (<span style="color:#C00000"><b>recommended</b></span>).
            kwargs: Additional agent arguments.
        """
        if directory is not None:
            if filename is None:
                filename = 'agent'
            agent = os.path.join(directory, os.path.splitext(filename)[0] + '.json')
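            # If that exact file is missing but the filename carries a numeric suffix
            # (e.g. 'agent-100.json'), fall back to the un-suffixed 'agent.json'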
            if not os.path.isfile(agent) and agent[agent.rfind('-') + 1: -5].isdigit():
                agent = agent[:agent.rindex('-')] + '.json'
            if os.path.isfile(agent):
                with open(agent, 'r') as fp:
                    agent = json.load(fp=fp)
                if 'agent' in kwargs:
                    if 'agent' in agent and agent['agent'] != kwargs['agent']:
                        raise TensorforceError.value(
                            name='Agent.load', argument='agent', value=kwargs['agent']
                        )
                    agent['agent'] = kwargs.pop('agent')
            else:
                agent = kwargs
                kwargs = dict()
        else:
            agent = kwargs
            kwargs = dict()

        # Overwrite values
        if environment is not None and environment.max_episode_timesteps() is not None:
            if 'max_episode_timesteps' in kwargs:
                assert kwargs['max_episode_timesteps'] >= environment.max_episode_timesteps()
                agent['max_episode_timesteps'] = kwargs['max_episode_timesteps']
            else:
                agent['max_episode_timesteps'] = environment.max_episode_timesteps()
        if 'parallel_interactions' in kwargs and kwargs['parallel_interactions'] > 1:
            agent['parallel_interactions'] = kwargs['parallel_interactions']

        agent.pop('internals', None)
        agent.pop('initial_internals', None)
        saver_restore = False
        if 'saver' in agent and isinstance(agent['saver'], dict):
            if not agent['saver'].get('load', True):
                raise TensorforceError.value(
                    name='Agent.load', argument='saver[load]', value=agent['saver']['load']
                )
            agent['saver'] = dict(agent['saver'])
            agent['saver']['load'] = True
            saver_restore = True
        elif 'saver' in kwargs and isinstance(kwargs['saver'], dict):
            if not kwargs['saver'].get('load', True):
                raise TensorforceError.value(
                    name='Agent.load', argument='saver[load]', value=kwargs['saver']['load']
                )
            kwargs['saver'] = dict(kwargs['saver'])
            kwargs['saver']['load'] = True
            saver_restore = True
        agent = Agent.create(agent=agent, environment=environment, **kwargs)
        if not saver_restore:
            agent.restore(directory=directory, filename=filename, format=format)

        return agent
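
# Hedged usage sketch of Agent.load (directory/filename values are illustrative;
# passing the environment lets state/action specs and max_episode_timesteps be
# filled in, as documented above):
from tensorforce import Agent, Environment

environment = Environment.create(
    environment='gym', level='CartPole-v1', max_episode_timesteps=500
)
agent = Agent.load(
    directory='checkpoints', filename='agent', format='checkpoint',
    environment=environment
)
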
Example #23
 def execute(self, actions):
     if self._timestep is None:
         raise TensorforceError(
             message=
             "An environment episode has to be initialized by calling reset() first."
         )
     assert self._max_episode_timesteps is None or self._timestep < self._max_episode_timesteps
     if self._num_parallel is None:
         states, terminal, reward = self._environment.execute(
             actions=actions)
     else:
         parallel, states, terminal, reward = self._environment.execute(
             actions=actions)
     if self._execute_output_check:
         self._check_states_output(states=states, function='execute')
         if self._num_parallel is None:
             if isinstance(reward, (np.generic, np.ndarray)):
                 reward = reward.item()
             if isinstance(terminal, (np.generic, np.ndarray)):
                 terminal = terminal.item()
             if not isinstance(terminal, bool) and \
                     (not isinstance(terminal, int) or terminal < 0 or terminal > 2):
                 raise TensorforceError(
                     'Environment.execute: invalid value {} for terminal.'.
                     format(terminal))
             if not isinstance(reward, (float, int)):
                 raise TensorforceError(
                     'Environment.execute: invalid type {} for reward.'.
                     format(type(reward)))
         else:
             TensorSpec(type='int', shape=(), num_values=self._num_parallel).np_assert(
                 x=parallel,
                 batched=True,
                 message='Environment.execute: invalid {issue} for parallel.'
             )
             TensorSpec(type='bool', shape=()).np_assert(
                 x=terminal,
                 batched=True,
                 message='Environment.execute: invalid {issue} for terminal.'
             )
             TensorSpec(type='float', shape=()).np_assert(
                 x=reward,
                 batched=True,
                 message='Environment.execute: invalid {issue} for reward.')
         self._execute_output_check = False
     if self._reward_shaping is not None:
         if isinstance(self._reward_shaping, str):
             reward = eval(
                 self._reward_shaping, dict(),
                 dict(states=self._previous_states,
                      actions=actions,
                      terminal=terminal,
                      reward=reward,
                      next_states=states,
                      math=math,
                      np=np,
                      random=random))
         else:
             reward = self._reward_shaping(self._previous_states, actions,
                                           terminal, reward, states)
         if isinstance(reward, tuple):
             reward, terminal = reward
         if isinstance(reward, (np.generic, np.ndarray)):
             reward = reward.item()
         if isinstance(terminal, (np.generic, np.ndarray)):
             terminal = terminal.item()
         self._previous_states = states
     self._timestep += 1
     if self._num_parallel is None:
         terminal = int(terminal)
         if terminal == 0 and self._max_episode_timesteps is not None and \
                 self._timestep >= self._max_episode_timesteps:
             terminal = 2
         if terminal > 0:
             self._timestep = None
         return states, terminal, reward
     else:
         terminal = terminal.astype(util.np_dtype('int'))
         if (terminal == 0).any() and self._max_episode_timesteps is not None and \
                 self._timestep >= self._max_episode_timesteps:
             terminal = np.where(terminal == 0, 2, terminal)
             parallel = parallel[:0]
             states = None
         if (terminal > 0).all():
             self._timestep = None
         return parallel, states, terminal, reward
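
# Standalone sketch of the string reward-shaping path above: the expression is
# evaluated with states/actions/terminal/reward/next_states plus math/np/random in
# scope (illustrative values, not part of the wrapper itself):
import math
import random
import numpy as np

expression = '-1.0 if terminal else max(reward, 0.0)'
shaped = eval(expression, dict(), dict(
    states=None, actions=None, terminal=False, reward=-0.3, next_states=None,
    math=math, np=np, random=random
))   # -> 0.0
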
Example #24
    def create(agent='tensorforce', environment=None, **kwargs):
        """
        Creates an agent from a specification.

        Args:
            agent (specification | Agent class/object | lambda[states -> actions]): JSON file,
                specification key, configuration dictionary, library module, or `Agent`
                class/object. Alternatively, an act-function mapping states to actions which is
                supposed to be recorded.
                (<span style="color:#00C000"><b>default</b></span>: Tensorforce base agent).
            environment (Environment object): Environment which the agent is supposed to be trained
                on, environment-related arguments like state/action space specifications and
                maximum episode length will be extracted if given
                (<span style="color:#C00000"><b>recommended</b></span>).
            kwargs: Additional agent arguments.
        """
        if isinstance(agent, Recorder):
            if environment is not None:
                # TODO:
                # assert agent.spec['states'] == environment.states()
                # assert agent.spec['actions'] == environment.actions()
                # assert environment.max_episode_timesteps() is None or \
                #     agent.spec['max_episode_timesteps'] >= environment.max_episode_timesteps()
                pass

            for key, value in kwargs.items():
                if key == 'parallel_interactions':
                    assert agent.spec[key] >= value
                else:
                    assert agent.spec[key] == value

            if agent.is_initialized:
                agent.reset()
            else:
                agent.initialize()

            return agent

        elif (isinstance(agent, type) and issubclass(agent, Agent)) or callable(agent):
            # Agent class, or act-function to be wrapped in a Recorder
            if environment is not None:
                if 'states' in kwargs:
                    # TODO:
                    # assert kwargs['states'] == environment.states()
                    pass
                else:
                    kwargs['states'] = environment.states()
                if 'actions' in kwargs:
                    # assert kwargs['actions'] == environment.actions()
                    pass
                else:
                    kwargs['actions'] = environment.actions()
                if environment.max_episode_timesteps() is None:
                    pass
                elif 'max_episode_timesteps' in kwargs:
                    # assert kwargs['max_episode_timesteps'] >= environment.max_episode_timesteps()
                    pass
                else:
                    kwargs['max_episode_timesteps'] = environment.max_episode_timesteps()

            if isinstance(agent, type) and issubclass(agent, Agent):
                agent = agent(**kwargs)
                assert isinstance(agent, Agent)
            else:
                if 'recorder' not in kwargs:
                    raise TensorforceError.required(name='Recorder', argument='recorder')
                agent = Recorder(fn_act=agent, **kwargs)
            return Agent.create(agent=agent, environment=environment)

        elif isinstance(agent, dict):
            # Dictionary specification
            agent.update(kwargs)
            kwargs = dict(agent)
            agent = kwargs.pop('agent', kwargs.pop('type', 'default'))

            return Agent.create(agent=agent, environment=environment, **kwargs)

        elif isinstance(agent, str):
            if os.path.isfile(agent):
                # JSON file specification
                with open(agent, 'r') as fp:
                    agent = json.load(fp=fp)
                return Agent.create(agent=agent, environment=environment, **kwargs)

            elif '.' in agent:
                # Library specification
                library_name, module_name = agent.rsplit('.', 1)
                library = importlib.import_module(name=library_name)
                agent = getattr(library, module_name)
                return Agent.create(agent=agent, environment=environment, **kwargs)

            elif agent in tensorforce.agents.agents:
                # Keyword specification
                agent = tensorforce.agents.agents[agent]
                return Agent.create(agent=agent, environment=environment, **kwargs)

            else:
                raise TensorforceError.value(name='Agent.create', argument='agent', value=agent)

        else:
            raise TensorforceError.type(name='Agent.create', argument='agent', dtype=type(agent))
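
# Hedged usage sketch of the dispatch above (agent type and hyperparameters are
# illustrative; a JSON file path or a 'module.Class' string would follow the other
# branches):
from tensorforce import Agent, Environment

environment = Environment.create(
    environment='gym', level='CartPole-v1', max_episode_timesteps=500
)
agent = Agent.create(agent='ppo', environment=environment, batch_size=10)
# agent = Agent.create(agent=dict(agent='ppo', batch_size=10), environment=environment)
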
Example #25
    def specs_from_gym_space(space, ignore_value_bounds):
        import gym

        if isinstance(space, gym.spaces.Discrete):
            return dict(type='int', shape=(), num_values=space.n)

        elif isinstance(space, gym.spaces.MultiBinary):
            return dict(type='bool', shape=space.n)

        elif isinstance(space, gym.spaces.MultiDiscrete):
            num_discrete_space = len(space.nvec)
            if (space.nvec == space.nvec[0]).all():
                return dict(type='int',
                            shape=num_discrete_space,
                            num_values=space.nvec[0])
            else:
                specs = dict()
                for n in range(num_discrete_space):
                    specs['gymmdc{}'.format(n)] = dict(
                        type='int', shape=(), num_values=space.nvec[n])
                return specs

        elif isinstance(space, gym.spaces.Box):
            if ignore_value_bounds:
                return dict(type='float', shape=space.shape)
            elif (space.low == space.low[0]).all() and (space.high == space.high[0]).all():
                return dict(type='float',
                            shape=space.shape,
                            min_value=space.low[0],
                            max_value=space.high[0])
            else:
                specs = dict()
                low = space.low.flatten()
                high = space.high.flatten()
                for n in range(low.shape[0]):
                    specs['gymbox{}'.format(n)] = dict(type='float',
                                                       shape=(),
                                                       min_value=low[n],
                                                       max_value=high[n])
                return specs

        elif isinstance(space, gym.spaces.Tuple):
            specs = dict()
            n = 0
            for n, space in enumerate(space.spaces):
                spec = OpenAIGym.specs_from_gym_space(
                    space=space, ignore_value_bounds=ignore_value_bounds)
                if 'type' in spec:
                    specs['gymtpl{}'.format(n)] = spec
                else:
                    for name, spec in spec.items():
                        specs['gymtpl{}-{}'.format(n, name)] = spec
            return specs

        elif isinstance(space, gym.spaces.Dict):
            specs = dict()
            for space_name, space in space.spaces.items():
                spec = OpenAIGym.specs_from_gym_space(
                    space=space, ignore_value_bounds=ignore_value_bounds)
                if 'type' in spec:
                    specs[space_name] = spec
                else:
                    for name, spec in spec.items():
                        specs['{}-{}'.format(space_name, name)] = spec
            return specs

        else:
            raise TensorforceError('Unknown Gym space.')
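
# Worked examples for the mapping above (assumes gym is installed; the commented
# results are what the corresponding branches return):
import gym
from tensorforce.environments import OpenAIGym

OpenAIGym.specs_from_gym_space(space=gym.spaces.Discrete(4), ignore_value_bounds=False)
# -> dict(type='int', shape=(), num_values=4)
OpenAIGym.specs_from_gym_space(
    space=gym.spaces.Box(low=-1.0, high=1.0, shape=(3,)), ignore_value_bounds=False
)
# -> dict(type='float', shape=(3,), min_value=-1.0, max_value=1.0)
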
Example #26
    def restore(self, directory=None, filename=None, format=None):
        """
        Restores the agent from a checkpoint.

        Args:
            directory (str): Checkpoint directory
                (<span style="color:#C00000"><b>required</b></span>, unless "saved-model" format and
                saver specified).
            filename (str): Checkpoint filename, with or without append and extension
                (<span style="color:#C00000"><b>required</b></span>, unless "saved-model" format and
                saver specified).
            format ("checkpoint" | "numpy" | "hdf5"): File format
                (<span style="color:#00C000"><b>default</b></span>: format matching directory and
                filename, required to be unambiguous).
        """
        if not hasattr(self, 'model'):
            raise TensorforceError(message="Missing agent attribute model.")

        if not self.is_initialized:
            self.initialize()

        # format implicitly given if file exists
        if format is None and os.path.isfile(os.path.join(directory, filename)):
            if '.data-' in filename:
                filename = filename[:filename.index('.data-')]
                format = 'checkpoint'
            elif filename.endswith('.npz'):
                filename = filename[:-4]
                format = 'numpy'
            elif filename.endswith('.hdf5'):
                filename = filename[:-5]
                format = 'hdf5'
            elif filename.endswith('.h5'):
                filename = filename[:-3]
                format = 'hdf5'
            else:
                assert False
        elif format is None and os.path.isfile(os.path.join(directory, filename + '.index')):
            format = 'checkpoint'
        elif format is None and os.path.isfile(os.path.join(directory, filename + '.npz')):
            format = 'numpy'
        elif format is None and (
            os.path.isfile(os.path.join(directory, filename + '.hdf5')) or
            os.path.isfile(os.path.join(directory, filename + '.h5'))
        ):
            format = 'hdf5'

        else:
            # infer format from directory
            found = None
            latest = -1
            for name in os.listdir(directory):
                if format in (None, 'numpy') and name == filename + '.npz':
                    assert found is None
                    found = 'numpy'
                    latest = None
                elif format in (None, 'numpy') and name.startswith(filename) and \
                        name.endswith('.npz'):
                    assert found is None or found == 'numpy'
                    found = 'numpy'
                    n = int(name[len(filename) + 1: -4])
                    if n > latest:
                        latest = n
                elif format in (None, 'hdf5') and \
                        (name == filename + '.hdf5' or name == filename + '.h5'):
                    assert found is None
                    found = 'hdf5'
                    latest = None
                elif format in (None, 'hdf5') and name.startswith(filename) and \
                        (name.endswith('.hdf5') or name.endswith('.h5')):
                    assert found is None or found == 'hdf5'
                    found = 'hdf5'
                    n = int(name[len(filename) + 1: -5])
                    if n > latest:
                        latest = n

            if latest == -1:
                if format is None:
                    format = 'checkpoint'
                else:
                    assert format == 'checkpoint'
                if filename is None or \
                        not os.path.isfile(os.path.join(directory, filename + '.index')):
                    import tensorflow as tf
                    path = tf.train.latest_checkpoint(checkpoint_dir=directory)
                    if not path:
                        raise TensorforceError.exists_not(name='Checkpoint', value=directory)
                    filename = os.path.basename(path)

            else:
                if format is None:
                    format = found
                else:
                    assert format == found
                if latest is not None:
                    filename = filename + '-' + str(latest)

        self.timesteps, self.episodes, self.updates = self.model.restore(
            directory=directory, filename=filename, format=format
        )
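
# Hedged usage sketch of save/restore round-tripping (values illustrative; with
# format=None, restore infers the format from the files found, as implemented above):
from tensorforce import Agent, Environment

environment = Environment.create(
    environment='gym', level='CartPole-v1', max_episode_timesteps=500
)
agent = Agent.create(agent='ppo', environment=environment, batch_size=10)
agent.save(directory='checkpoints', filename='agent', format='numpy')
agent.restore(directory='checkpoints', filename='agent', format='numpy')
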
Example #27
    def variable(self,
                 *,
                 name,
                 spec,
                 initializer,
                 is_trainable,
                 is_saved,
                 initialization_scale=None):
        assert self.is_initialized is False
        # name
        if not isinstance(name, str):
            raise TensorforceError.type(name='variable',
                                        argument='name',
                                        dtype=type(name))
        name = name.replace('/', '_')
        # spec
        if not isinstance(spec, TensorSpec):
            raise TensorforceError.type(name='variable',
                                        argument='spec',
                                        dtype=type(spec))
        if spec.is_underspecified():
            raise TensorforceError.value(name='variable',
                                         argument='spec',
                                         value=spec,
                                         hint='underspecified')
        # initializer
        initializer_names = ('constant', 'normal', 'normal-relu', 'ones',
                             'orthogonal', 'orthogonal-relu', 'zeros')
        if not isinstance(initializer, (spec.py_type(), np.ndarray, tf.Tensor)) and \
                initializer not in initializer_names:
            raise TensorforceError.value(name='variable',
                                         argument='initializer',
                                         value=initializer)
        elif isinstance(initializer,
                        np.ndarray) and initializer.dtype != spec.np_type():
            raise TensorforceError.type(name='variable',
                                        argument='initializer',
                                        dtype=initializer.dtype)
        elif isinstance(
                initializer,
                tf.Tensor) and tf_util.dtype(x=initializer) != spec.tf_type():
            raise TensorforceError.type(name='variable',
                                        argument='initializer',
                                        dtype=tf_util.dtype(x=initializer))
        # initialization_scale
        if initialization_scale is not None:
            if isinstance(initializer, (spec.py_type(), np.ndarray, tf.Tensor)) or \
                    initializer not in ('constant', 'orthogonal', 'orthogonal-relu'):
                raise TensorforceError.invalid(
                    name='variable',
                    argument='initialization_scale',
                    condition='initializer not orthogonal')
            elif not isinstance(initialization_scale, spec.py_type()):
                raise TensorforceError.type(name='variable',
                                            argument='initialization_scale',
                                            dtype=type(initialization_scale),
                                            hint='!= float')
        # is_trainable
        if not isinstance(is_trainable, bool):
            raise TensorforceError.type(name='variable',
                                        argument='is_trainable',
                                        dtype=type(is_trainable))
        elif is_trainable and spec.type != 'float':
            raise TensorforceError.value(name='variable',
                                         argument='is_trainable',
                                         value=is_trainable,
                                         condition='spec.type != float')
        # is_saved
        if not isinstance(is_saved, bool):
            raise TensorforceError.type(name='variable',
                                        argument='is_saved',
                                        dtype=type(is_saved))

        # Variable initializer
        if isinstance(initializer, spec.py_type()):
            initializer = tf_util.constant(value=initializer,
                                           dtype=spec.type,
                                           shape=spec.shape)
        elif isinstance(initializer, np.ndarray):
            if initializer.shape != spec.shape:
                raise TensorforceError.mismatch(name='Module.variable',
                                                value1='shape',
                                                value2='initializer')
            initializer = tf_util.constant(value=initializer, dtype=spec.type)
        elif isinstance(initializer, tf.Tensor):
            if tf_util.shape(x=initializer) != spec.shape:
                raise TensorforceError.mismatch(name='Module.variable',
                                                value1='shape',
                                                value2='initializer')
            # tf.Tensor initializer is used as-is
        elif not isinstance(initializer, str):
            raise TensorforceError(
                "Invalid variable initializer: {}".format(initializer))
        elif initializer.startswith('normal'):
            if spec.type != 'float':
                raise TensorforceError(
                    message=
                    "Invalid variable initializer value for non-float variable: {}."
                    .format(initializer))
            if initializer.endswith('-relu'):
                stddev = min(0.1,
                             np.sqrt(2.0 / util.product(xs=spec.shape[:-1])))
            else:
                stddev = min(
                    0.1,
                    np.sqrt(
                        2.0 /
                        (util.product(xs=spec.shape[:-1]) + spec.shape[-1])))
            initializer = tf.random.normal(shape=spec.shape,
                                           stddev=stddev,
                                           dtype=spec.tf_type())
        elif initializer.startswith('orthogonal'):
            if spec.type != 'float':
                raise TensorforceError(
                    message=
                    "Invalid variable initializer value for non-float variable: {}."
                    .format(initializer))
            if spec.rank < 2:
                raise TensorforceError(
                    message=
                    "Invalid variable initializer value for 0/1-rank variable: {}."
                    .format(initializer))
            normal = np.random.normal(size=(util.product(xs=spec.shape[:-1]),
                                            spec.shape[-1]))
            u, _, v = np.linalg.svd(a=normal, full_matrices=False)
            orthogonal = u if u.shape[1] == spec.shape[-1] else v
            if initializer.endswith('-relu'):
                orthogonal = orthogonal * np.sqrt(2.0)
            if initialization_scale is not None and initialization_scale != 1.0:
                if initialization_scale <= 0.0:
                    raise TensorforceError.value(
                        name='variable',
                        argument='initialization_scale',
                        value=initialization_scale,
                        hint='<= 0.0')
                orthogonal = orthogonal * initialization_scale
            initializer = tf_util.constant(value=orthogonal.reshape(
                spec.shape),
                                           dtype=spec.type)
        elif initializer == 'zeros':
            initializer = tf_util.zeros(shape=spec.shape, dtype=spec.type)
        elif initializer == 'ones':
            initializer = tf_util.ones(shape=spec.shape, dtype=spec.type)
        elif initializer == 'constant':
            initializer = tf.fill(dims=spec.shape,
                                  value=tf_util.constant(
                                      value=initialization_scale,
                                      dtype=spec.type))

        # Variable
        variable = tf.Variable(initial_value=initializer,
                               trainable=is_trainable,
                               validate_shape=True,
                               name=name,
                               dtype=spec.tf_type(),
                               shape=spec.shape)
        variable.is_saved = is_saved

        return variable
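
As an illustrative caller, a module might request trainable parameters through this helper roughly as follows. This is a sketch only: the TensorSpec import path, the shapes, and the surrounding Module subclass method are assumptions.

from tensorforce.core import TensorSpec


def architecture(self):  # hypothetical method of a Module subclass, run before initialization
    weights = self.variable(
        name='weights',
        spec=TensorSpec(type='float', shape=(64, 32)),
        initializer='orthogonal',  # requires float type and rank >= 2, per the checks above
        is_trainable=True,
        is_saved=True)
    bias = self.variable(
        name='bias',
        spec=TensorSpec(type='float', shape=(32,)),
        initializer='zeros',
        is_trainable=True,
        is_saved=True)
    return weights, bias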
Example #28
    def prepare(self,
                environment=None,
                min_timesteps=None,
                states=None,
                actions=None,
                exclude_bool_action=False,
                exclude_int_action=False,
                exclude_float_action=False,
                exclude_bounded_action=False,
                require_observe=False,
                require_all=False,
                **agent):
        """
        Generic unit-test preparation.
        """
        Layer.layers = None

        if environment is None:
            if states is None:
                states = deepcopy(self.__class__.states)

            if actions is None:
                actions = deepcopy(self.__class__.actions)
                if exclude_bool_action or self.__class__.exclude_bool_action:
                    actions.pop('bool_action')
                if exclude_int_action or self.__class__.exclude_int_action:
                    actions.pop('int_action')
                if exclude_float_action or self.__class__.exclude_float_action:
                    actions.pop('float_action')
                if exclude_bounded_action or self.__class__.exclude_bounded_action:
                    actions.pop('bounded_action')

            if min_timesteps is None:
                min_timesteps = self.__class__.min_timesteps

            environment = UnittestEnvironment(states=states,
                                              actions=actions,
                                              min_timesteps=min_timesteps)

        elif min_timesteps is not None:
            raise TensorforceError.unexpected()

        environment = Environment.create(environment=environment,
                                         max_episode_timesteps=5)

        for key, value in self.__class__.agent.items():
            if key not in agent:
                agent[key] = value

        if self.__class__.require_all or require_all:
            config = None
        elif self.__class__.require_observe or require_observe:
            config = dict(api_functions=['reset', 'act', 'observe'])
        else:
            config = dict(api_functions=['reset', 'act'])

        agent = Agent.create(agent=agent,
                             environment=environment,
                             config=config)

        return agent, environment
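
A typical test case built on this helper might call it as follows; this is a sketch, and the extra agent keyword arguments as well as the bounded-action exclusion are placeholders:

# Inside a test method of the shared unit-test base class:
agent, environment = self.prepare(
    exclude_bounded_action=True,
    require_observe=True,
    memory=100,
    update=dict(unit='episodes', batch_size=1))

# Single interaction step to exercise the act/observe API.
states = environment.reset()
actions = agent.act(states=states)
states, terminal, reward = environment.execute(actions=actions)
agent.observe(terminal=terminal, reward=reward)

agent.close()
environment.close()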
Example #29
    def run(
            self,
            # General
            num_episodes=None,
            num_timesteps=None,
            num_updates=None,
            # Parallel
            batch_agent_calls=False,
            sync_timesteps=False,
            sync_episodes=False,
            num_sleep_secs=0.001,
            # Callback
            callback=None,
            callback_episode_frequency=None,
            callback_timestep_frequency=None,
            # Tqdm
            use_tqdm=True,
            mean_horizon=1,
            # Evaluation
            evaluation=False,
            save_best_agent=None,
            evaluation_callback=None):
        """
        Run experiment.

        Args:
            num_episodes (int > 0): Number of episodes to run the experiment for, summed across
                all parallel/vectorized environments / actors in a multi-actor environment
                (<span style="color:#00C000"><b>default</b></span>: no episode limit).
            num_timesteps (int > 0): Number of timesteps to run the experiment for, summed across
                all parallel/vectorized environments / actors in a multi-actor environment
                (<span style="color:#00C000"><b>default</b></span>: no timestep limit).
            num_updates (int > 0): Number of agent updates to run the experiment for
                (<span style="color:#00C000"><b>default</b></span>: no update limit).
            batch_agent_calls (bool): Whether to batch agent calls for parallel environment
                execution
                (<span style="color:#00C000"><b>default</b></span>: false, separate call per
                environment).
            sync_timesteps (bool): Whether to synchronize parallel environment execution at the
                timestep level; implied by batch_agent_calls
                (<span style="color:#00C000"><b>default</b></span>: false, unless
                batch_agent_calls is true).
            sync_episodes (bool): Whether to synchronize parallel environment execution at the
                episode level
                (<span style="color:#00C000"><b>default</b></span>: false).
            num_sleep_secs (float): Sleep duration if no environment is ready
                (<span style="color:#00C000"><b>default</b></span>: one millisecond).
            callback (callable[(Runner, parallel) -> bool]): Callback function taking the runner
                instance plus parallel index and returning a boolean value indicating whether
                execution should continue
                (<span style="color:#00C000"><b>default</b></span>: callback always true).
            callback_episode_frequency (int): Episode interval between callbacks
                (<span style="color:#00C000"><b>default</b></span>: every episode).
            callback_timestep_frequency (int): Timestep interval between callbacks
                (<span style="color:#00C000"><b>default</b></span>: not specified).
            use_tqdm (bool): Whether to display a tqdm progress bar for the experiment run
                (<span style="color:#00C000"><b>default</b></span>: true), with the following
                additional information (averaged over number of episodes given via mean_horizon):
                <ul>
                <li>return &ndash; cumulative episode return</li>
                <li>ts/ep &ndash; timesteps per episode</li>
                <li>sec/ep &ndash; seconds per episode</li>
                <li>ms/ts &ndash; milliseconds per timestep</li>
                <li>agent &ndash; percentage of time spent on agent computation</li>
                <li>comm &ndash; if remote environment execution, percentage of time spent on
                communication</li>
                </ul>
            mean_horizon (int): Number of episodes over which progress-bar values and the
                evaluation score are averaged
                (<span style="color:#00C000"><b>default</b></span>: not averaged).
            evaluation (bool): Whether to run in evaluation mode, only valid if single environment
                (<span style="color:#00C000"><b>default</b></span>: no evaluation).
            save_best_agent (string): Directory to save the best version of the agent according to
                the evaluation score
                (<span style="color:#00C000"><b>default</b></span>: best agent is not saved).
            evaluation_callback (int | callable[Runner -> float]): Callback function taking the
                runner instance and returning an evaluation score
                (<span style="color:#00C000"><b>default</b></span>: cumulative evaluation return
                averaged over mean_horizon episodes).
        """
        # General
        if num_episodes is None:
            self.num_episodes = float('inf')
        else:
            self.num_episodes = num_episodes
        if num_timesteps is None:
            self.num_timesteps = float('inf')
        else:
            self.num_timesteps = num_timesteps
        if num_updates is None:
            self.num_updates = float('inf')
        else:
            self.num_updates = num_updates

        # Parallel
        if len(self.environments) == 1:
            condition = 'single environment'
        elif self.num_vectorized is not None:
            condition = 'vectorized environment'
        else:
            condition = None
        if condition is None:
            pass
        elif batch_agent_calls:
            raise TensorforceError.invalid(name='Runner.run',
                                           argument='batch_agent_calls',
                                           condition=condition)
        elif sync_timesteps:
            raise TensorforceError.invalid(name='Runner.run',
                                           argument='sync_timesteps',
                                           condition=condition)
        elif sync_episodes:
            raise TensorforceError.invalid(name='Runner.run',
                                           argument='sync_episodes',
                                           condition=condition)
        self.batch_agent_calls = batch_agent_calls or (self.num_vectorized
                                                       is not None)
        self.sync_timesteps = sync_timesteps or self.batch_agent_calls
        self.sync_episodes = sync_episodes or (self.num_vectorized is not None)
        self.num_sleep_secs = num_sleep_secs
        if self.num_vectorized is None:
            self.num_environments = len(self.environments)
        else:
            self.num_environments = self.num_vectorized

        # Callback
        assert callback_episode_frequency is None or callback_timestep_frequency is None
        if callback_episode_frequency is None and callback_timestep_frequency is None:
            callback_episode_frequency = 1
        if callback_episode_frequency is None:
            self.callback_episode_frequency = float('inf')
        else:
            self.callback_episode_frequency = callback_episode_frequency
        if callback_timestep_frequency is None:
            self.callback_timestep_frequency = float('inf')
        else:
            self.callback_timestep_frequency = callback_timestep_frequency
        if callback is None:
            self.callback = (lambda r, p: True)
        elif util.is_iterable(x=callback):

            def sequential_callback(runner, parallel):
                result = True
                for fn in callback:
                    x = fn(runner, parallel)
                    if isinstance(x, bool):
                        result = result and x
                return result

            self.callback = sequential_callback
        else:

            def boolean_callback(runner, parallel):
                result = callback(runner, parallel)
                if isinstance(result, bool):
                    return result
                else:
                    return True

            self.callback = boolean_callback

        # Experiment statistics
        self.episode_returns = list()
        self.episode_timesteps = list()
        self.episode_seconds = list()
        self.episode_agent_seconds = list()
        if self.is_environment_remote:
            self.episode_env_seconds = list()
        if self.evaluation or evaluation:
            self.evaluation_returns = list()
            self.evaluation_timesteps = list()
            self.evaluation_seconds = list()
            self.evaluation_agent_seconds = list()
            if self.is_environment_remote:
                self.evaluation_env_seconds = list()
            if self.num_environments == 1:
                # for tqdm
                self.episode_returns = self.evaluation_returns
                self.episode_timesteps = self.evaluation_timesteps
                self.episode_seconds = self.evaluation_seconds
                self.episode_agent_seconds = self.evaluation_agent_seconds
                if self.is_environment_remote:
                    self.episode_env_seconds = self.evaluation_env_seconds
        else:
            # for tqdm
            self.evaluation_returns = self.episode_returns
            self.evaluation_timesteps = self.episode_timesteps
            self.evaluation_seconds = self.episode_seconds
            self.evaluation_agent_seconds = self.episode_agent_seconds
            if self.is_environment_remote:
                self.evaluation_env_seconds = self.episode_env_seconds

        # Timestep/episode/update counter
        self.timesteps = 0
        self.episodes = 0
        self.updates = 0

        # Tqdm
        if use_tqdm:
            if hasattr(self, 'tqdm'):
                self.tqdm.close()

            assert self.num_episodes != float(
                'inf') or self.num_timesteps != float('inf')
            inner_callback = self.callback

            if self.num_episodes != float('inf'):
                # Episode-based tqdm (default option if both num_episodes and num_timesteps set)
                assert self.num_episodes != float('inf')
                bar_format = (
                    '{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}, return={postfix[0]:.2f}, ts/ep='
                    '{postfix[1]}, sec/ep={postfix[2]:.2f}, ms/ts={postfix[3]:.1f}, agent='
                    '{postfix[4]:.1f}%]')
                postfix = [0.0, 0, 0.0, 0.0, 0.0]
                if self.is_environment_remote:
                    bar_format = bar_format[:-1] + ', comm={postfix[5]:.1f}%]'
                    postfix.append(0.0)

                self.tqdm = tqdm(desc='Episodes',
                                 total=self.num_episodes,
                                 bar_format=bar_format,
                                 initial=self.episodes,
                                 postfix=postfix)
                self.tqdm_last_update = self.episodes

                def tqdm_callback(runner, parallel):
                    if len(runner.evaluation_returns) > 0:
                        mean_return = float(
                            np.mean(runner.evaluation_returns[-mean_horizon:]))
                        runner.tqdm.postfix[0] = mean_return
                    if len(runner.episode_timesteps) > 0:
                        mean_ts_per_ep = int(
                            np.mean(runner.episode_timesteps[-mean_horizon:]))
                        mean_sec_per_ep = float(
                            np.mean(runner.episode_seconds[-mean_horizon:]))
                        mean_agent_sec = float(
                            np.mean(
                                runner.episode_agent_seconds[-mean_horizon:]))
                        try:
                            mean_ms_per_ts = mean_sec_per_ep * 1000.0 / mean_ts_per_ep
                        except ZeroDivisionError:
                            mean_ms_per_ts = 0.0
                        try:
                            mean_rel_agent = mean_agent_sec * 100.0 / mean_sec_per_ep
                        except ZeroDivisionError:
                            mean_rel_agent = 0.0
                        runner.tqdm.postfix[1] = mean_ts_per_ep
                        runner.tqdm.postfix[2] = mean_sec_per_ep
                        runner.tqdm.postfix[3] = mean_ms_per_ts
                        runner.tqdm.postfix[4] = mean_rel_agent
                    if runner.is_environment_remote and len(
                            runner.episode_env_seconds) > 0:
                        mean_env_sec = float(
                            np.mean(
                                runner.episode_env_seconds[-mean_horizon:]))
                        mean_rel_comm = (mean_agent_sec + mean_env_sec
                                         ) * 100.0 / mean_sec_per_ep
                        mean_rel_comm = 100.0 - mean_rel_comm
                        runner.tqdm.postfix[5] = mean_rel_comm
                    runner.tqdm.update(n=(runner.episodes -
                                          runner.tqdm_last_update))
                    runner.tqdm_last_update = runner.episodes
                    return inner_callback(runner, parallel)

            else:
                # Timestep-based tqdm
                self.tqdm = tqdm(desc='Timesteps',
                                 total=self.num_timesteps,
                                 initial=self.timesteps,
                                 postfix=dict(mean_return='n/a'))
                self.tqdm_last_update = self.timesteps

                def tqdm_callback(runner, parallel):
                    # sum_timesteps_return = sum(runner.timestep_returns[num_mean_return:])
                    # num_timesteps = min(num_mean_return, runner.evaluation_timestep)
                    # mean_return = sum_timesteps_return / num_episodes
                    runner.tqdm.set_postfix(mean_return='n/a')
                    runner.tqdm.update(n=(runner.timesteps -
                                          runner.tqdm_last_update))
                    runner.tqdm_last_update = runner.timesteps
                    return inner_callback(runner, parallel)

            self.callback = tqdm_callback

        # Evaluation
        if evaluation and self.num_environments > 1:
            raise TensorforceError.invalid(name='Runner.run',
                                           argument='evaluation',
                                           condition='parallel environments')
        self.evaluation_run = self.evaluation or evaluation
        self.save_best_agent = save_best_agent
        if evaluation_callback is None:
            self.evaluation_callback = (lambda r: None)
        else:
            self.evaluation_callback = evaluation_callback
        if self.save_best_agent is not None:
            inner_evaluation_callback = self.evaluation_callback

            def mean_return_callback(runner):
                result = inner_evaluation_callback(runner)
                if result is None:
                    return float(
                        np.mean(runner.evaluation_returns[-mean_horizon:]))
                else:
                    return result

            self.evaluation_callback = mean_return_callback
            self.best_evaluation_score = None

        # Episode statistics
        self.episode_return = [0.0 for _ in range(self.num_environments)]
        self.episode_timestep = [0 for _ in range(self.num_environments)]
        # if self.batch_agent_calls:
        #     self.episode_agent_second = 0.0
        #     self.episode_start = time.time()
        if self.evaluation_run:
            self.episode_agent_second = [
                0.0 for _ in range(self.num_environments - 1)
            ]
            self.episode_start = [
                time.time() for _ in range(self.num_environments - 1)
            ]
        else:
            self.episode_agent_second = [
                0.0 for _ in range(self.num_environments)
            ]
            self.episode_start = [
                time.time() for _ in range(self.num_environments)
            ]
        self.evaluation_agent_second = 0.0
        self.evaluation_start = time.time()

        # Values
        self.terminate = 0
        self.prev_terminals = [-1 for _ in range(self.num_environments)]
        self.states = [None for _ in range(self.num_environments)]
        self.terminals = [None for _ in range(self.num_environments)]
        self.rewards = [None for _ in range(self.num_environments)]
        if self.evaluation_run:
            self.evaluation_internals = self.agent.initial_internals()

        # Required if agent was previously stopped mid-episode
        self.agent.reset()

        # Reset environments
        if self.num_vectorized is None:
            for environment in self.environments:
                environment.start_reset()
        else:
            if self.environments[0].is_vectorizable():
                parallel, states = self.environments[0].reset(
                    num_parallel=self.num_vectorized)
            else:
                parallel, states = self.environments[0].reset()
            for i, n in enumerate(parallel):
                self.states[n] = states[i]
                self.prev_terminals[n] = -2

        # Runner loop
        while any(terminal <= 0 for terminal in self.prev_terminals):
            self.terminals = [None for _ in self.terminals]

            if self.batch_agent_calls:

                if self.num_vectorized is None:
                    # Retrieve observations (only if not already terminated)
                    while any(terminal is None for terminal in self.terminals):
                        for n in range(self.num_environments):
                            if self.terminals[n] is not None:
                                # Already received
                                continue
                            elif self.prev_terminals[n] <= 0:
                                # Receive if not terminal
                                observation = self.environments[
                                    n].receive_execute()
                                if observation is None:
                                    continue
                                self.states[n], self.terminals[
                                    n], self.rewards[n] = observation
                            else:
                                # Terminal
                                self.states[n] = None
                                self.terminals[n] = self.prev_terminals[n]
                                self.rewards[n] = None

                else:
                    # Vectorized environment execute
                    if all(terminal >= -1 for terminal in self.prev_terminals):
                        parallel, states, terminals, rewards = self.environments[
                            0].execute(actions=np.asarray(self.actions))
                        i = 0
                        for n, terminal in enumerate(self.prev_terminals):
                            if terminal <= 0:
                                self.terminals[n] = terminals[i]
                                self.rewards[n] = rewards[i]
                                if terminals[i] > 0:
                                    self.states[n] = None
                                i += 1
                            else:
                                self.states[n] = None
                                self.terminals[n] = self.prev_terminals[n]
                                self.rewards[n] = None
                        for i, n in enumerate(parallel):
                            assert self.terminals[n] <= 0 or self.terminals[
                                n] == 2
                            self.states[n] = states[i]
                    else:
                        for n, terminal in enumerate(self.prev_terminals):
                            if terminal < -1:
                                self.terminals[n] = -1
                            else:
                                self.terminals[n] = self.prev_terminals[n]

                self.handle_observe_joint()
                self.handle_act_joint()

            # Parallel environments loop
            no_environment_ready = True
            for n in range(self.num_environments):

                if self.prev_terminals[n] > 0:
                    # Continue if episode terminated (either sync_episodes or finished)
                    self.terminals[n] = self.prev_terminals[n]
                    continue

                elif self.batch_agent_calls:
                    # Handled before parallel environments loop
                    pass

                elif self.sync_timesteps:
                    # Wait until environment is ready
                    while True:
                        observation = self.environments[n].receive_execute()
                        if observation is not None:
                            break

                else:
                    # Check whether environment is ready, otherwise continue
                    observation = self.environments[n].receive_execute()
                    if observation is None:
                        self.terminals[n] = self.prev_terminals[n]
                        continue

                no_environment_ready = False
                if not self.batch_agent_calls:
                    self.states[n], self.terminals[n], self.rewards[
                        n] = observation

                # Check whether evaluation environment
                if self.evaluation_run and n == self.num_environments - 1:
                    if self.terminals[n] == -1:
                        # Initial act
                        self.handle_act_evaluation()
                    else:
                        # Observe
                        self.handle_observe_evaluation()
                        if self.terminals[n] == 0:
                            # Act
                            self.handle_act_evaluation()
                        else:
                            # Terminal
                            self.handle_terminal_evaluation()

                else:
                    if self.terminals[n] == -1:
                        # Initial act
                        self.handle_act(parallel=n)
                    else:
                        # Observe
                        self.handle_observe(parallel=n)
                        if self.terminals[n] == 0:
                            # Act
                            self.handle_act(parallel=n)
                        else:
                            # Terminal
                            self.handle_terminal(parallel=n)

            self.prev_terminals = list(self.terminals)

            # Sync_episodes: Reset if all episodes terminated
            if self.sync_episodes and all(terminal > 0
                                          for terminal in self.terminals):
                num_episodes_left = self.num_episodes - self.episodes
                if self.num_vectorized is None:
                    num_noneval_environments = self.num_environments - int(
                        self.evaluation_run)
                    for n in range(
                            min(num_noneval_environments, num_episodes_left)):
                        self.prev_terminals[n] = -1
                        self.environments[n].start_reset()
                    if self.evaluation_run and num_episodes_left > 0:
                        self.prev_terminals[-1] = -1
                        self.environments[-1].start_reset()
                elif num_episodes_left > 0:
                    if self.environments[0].is_vectorizable():
                        parallel, states = self.environments[0].reset(
                            num_parallel=min(num_episodes_left,
                                             self.num_vectorized))
                    else:
                        parallel, states = self.environments[0].reset()
                    for i, n in enumerate(parallel):
                        self.states[n] = states[i]
                        self.prev_terminals[n] = -2
                else:
                    self.prev_terminals = list()

            # Sleep if no environment was ready
            if no_environment_ready:
                time.sleep(self.num_sleep_secs)
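
A minimal driver for the run method above might look like the following sketch, assuming the standard Runner constructor; the PPO agent and CartPole environment specifications are placeholders:

from tensorforce import Runner

runner = Runner(
    agent=dict(agent='ppo', batch_size=10),
    environment=dict(environment='gym', level='CartPole-v1'),
    max_episode_timesteps=500)

# Training run with the episode-based tqdm progress bar described in the docstring.
runner.run(num_episodes=200, mean_horizon=10)
# Separate evaluation-only run: single environment, agent acts without being updated.
runner.run(num_episodes=20, evaluation=True)
runner.close()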
Example #30
    def observe(self, reward=0.0, terminal=False, parallel=0):
        # Check whether inputs are batched
        if util.is_iterable(x=reward):
            reward = np.asarray(reward)
            num_parallel = reward.shape[0]
            if terminal is False:
                terminal = np.asarray([0 for _ in range(num_parallel)])
            else:
                terminal = np.asarray(terminal)
            if parallel == 0:
                assert num_parallel == self.parallel_interactions
                parallel = np.asarray(list(range(num_parallel)))
            else:
                parallel = np.asarray(parallel)

        elif util.is_iterable(x=terminal):
            terminal = np.asarray([int(t) for t in terminal])
            num_parallel = terminal.shape[0]
            if reward == 0.0:
                reward = np.asarray([0.0 for _ in range(num_parallel)])
            else:
                reward = np.asarray(reward)
            if parallel == 0:
                assert num_parallel == self.parallel_interactions
                parallel = np.asarray(list(range(num_parallel)))
            else:
                parallel = np.asarray(parallel)

        elif util.is_iterable(x=parallel):
            parallel = np.asarray(parallel)
            num_parallel = parallel.shape[0]
            if reward == 0.0:
                reward = np.asarray([0.0 for _ in range(num_parallel)])
            else:
                reward = np.asarray(reward)
            if terminal is False:
                terminal = np.asarray([0 for _ in range(num_parallel)])
            else:
                terminal = np.asarray(terminal)

        else:
            reward = np.asarray([float(reward)])
            terminal = np.asarray([int(terminal)])
            parallel = np.asarray([int(parallel)])
            num_parallel = 1

        # Check whether shapes/lengths are consistent
        if parallel.shape[0] == 0:
            raise TensorforceError.value(name='Agent.observe',
                                         argument='len(parallel)',
                                         value=parallel.shape[0],
                                         hint='= 0')
        if reward.shape != parallel.shape:
            raise TensorforceError.value(name='Agent.observe',
                                         argument='len(reward)',
                                         value=reward.shape,
                                         hint='!= parallel length')
        if terminal.shape != parallel.shape:
            raise TensorforceError.value(name='Agent.observe',
                                         argument='len(terminal)',
                                         value=terminal.shape,
                                         hint='!= parallel length')

        # Convert terminal to int if necessary
        if terminal.dtype is util.np_dtype(dtype='bool'):
            zeros = np.zeros_like(terminal, dtype=util.np_dtype(dtype='int'))
            ones = np.ones_like(terminal, dtype=util.np_dtype(dtype='int'))
            terminal = np.where(terminal, ones, zeros)

        # Check whether current timesteps are not completed
        if self.timestep_completed[parallel].any():
            raise TensorforceError(
                message="Calling agent.observe must be preceded by agent.act.")
        self.timestep_completed[parallel] = True

        # Check whether episode is too long
        self.timestep_counter[parallel] += 1
        if self.max_episode_timesteps is not None and np.logical_and(
                terminal == 0, self.timestep_counter[parallel] >
                self.max_episode_timesteps).any():
            raise TensorforceError(
                message="Episode longer than max_episode_timesteps.")
        self.timestep_counter[parallel] = np.where(
            terminal > 0, 0, self.timestep_counter[parallel])

        if self.recorder is None:
            pass

        elif self.num_episodes < self.recorder.get('start', 0):
            # Increment num_episodes
            for t in terminal.tolist():
                if t > 0:
                    self.num_episodes += 1

        else:
            # Store values per parallel interaction
            for p, t, r in zip(parallel.tolist(), terminal.tolist(),
                               reward.tolist()):

                # Buffer inputs
                self.buffers['terminal'][p].append(t)
                self.buffers['reward'][p].append(r)

                # Continue if not terminal
                if t == 0:
                    continue
                self.num_episodes += 1

                # Buffered terminal/reward inputs
                for name in self.states_spec:
                    self.recorded['states'][name].append(
                        np.stack(self.buffers['states'][name][p], axis=0))
                    self.buffers['states'][name][p].clear()
                for name, spec in self.actions_spec.items():
                    self.recorded['actions'][name].append(
                        np.stack(self.buffers['actions'][name][p], axis=0))
                    self.buffers['actions'][name][p].clear()
                self.recorded['terminal'].append(
                    np.array(self.buffers['terminal'][p],
                             dtype=self.terminal_spec.np_type()))
                self.buffers['terminal'][p].clear()
                self.recorded['reward'].append(
                    np.array(self.buffers['reward'][p],
                             dtype=self.reward_spec.np_type()))
                self.buffers['reward'][p].clear()

                # Check whether recording step
                if (self.num_episodes - self.recorder.get('start', 0)) \
                        % self.recorder.get('frequency', 1) != 0:
                    continue

                # Manage recorder directory
                directory = self.recorder['directory']
                if os.path.isdir(directory):
                    files = sorted(
                        f for f in os.listdir(directory)
                        if os.path.isfile(os.path.join(directory, f))
                        and os.path.splitext(f)[1] == '.npz')
                else:
                    os.makedirs(directory)
                    files = list()
                max_traces = self.recorder.get('max-traces')
                if max_traces is not None and len(files) > max_traces - 1:
                    for filename in files[:-max_traces + 1]:
                        filename = os.path.join(directory, filename)
                        os.remove(filename)

                # Write recording file
                filename = os.path.join(
                    directory,
                    'trace-{:09d}.npz'.format(self.num_episodes - 1))
                # time.strftime('%Y%m%d-%H%M%S')
                kwargs = self.recorded.fmap(function=np.concatenate,
                                            cls=ArrayDict).items()
                np.savez_compressed(file=filename, **dict(kwargs))

                # Clear recorded values
                for recorded in self.recorded.values():
                    recorded.clear()

        if self._is_agent:
            return reward, terminal, parallel
        else:
            return 0
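
For reference, the non-batched act/observe cycle that feeds this method might look as follows; the agent and environment objects are placeholders created elsewhere:

states = environment.reset()
terminal = False
while not terminal:
    actions = agent.act(states=states)
    states, terminal, reward = environment.execute(actions=actions)
    # Single-environment call: reward, terminal and parallel are wrapped into
    # length-1 arrays by the final else-branch above.
    agent.observe(reward=reward, terminal=terminal)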