Example #1
    def __init__(
        # Required
        self, states, actions, update, objective, reward_estimation,
        # Environment
        max_episode_timesteps=None,
        # Agent
        policy='default', memory=None, optimizer='adam',
        # Baseline
        baseline_policy=None, baseline_optimizer=None, baseline_objective=None,
        # Preprocessing
        preprocessing=None,
        # Exploration
        exploration=0.0, variable_noise=0.0,
        # Regularization
        l2_regularization=0.0, entropy_regularization=0.0,
        # TensorFlow etc
        name='agent', device=None, parallel_interactions=1, buffer_observe=True, seed=None,
        execution=None, saver=None, summarizer=None, recorder=None, config=None
    ):
        if not hasattr(self, 'spec'):
            self.spec = OrderedDict(
                agent='tensorforce',
                states=states, actions=actions, max_episode_timesteps=max_episode_timesteps,
                policy=policy, memory=memory, update=update, optimizer=optimizer,
                objective=objective, reward_estimation=reward_estimation,
                baseline_policy=baseline_policy, baseline_optimizer=baseline_optimizer,
                baseline_objective=baseline_objective,
                preprocessing=preprocessing,
                exploration=exploration, variable_noise=variable_noise,
                l2_regularization=l2_regularization, entropy_regularization=entropy_regularization,
                name=name, device=device, parallel_interactions=parallel_interactions,
                buffer_observe=buffer_observe, seed=seed, execution=execution, saver=saver,
                summarizer=summarizer, recorder=recorder, config=config
            )

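        # Timestep-based updates process observations one at a time, so they are
        # incompatible with parallel interactions, and observe buffering is disabled below.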
        if isinstance(update, int) or update['unit'] == 'timesteps':
            if parallel_interactions > 1:
                raise TensorforceError.value(
                    name='agent', argument='update', value=update,
                    condition='parallel_interactions > 1'
                )
            if buffer_observe is not True:
                raise TensorforceError.invalid(
                    name='agent', argument='buffer_observe', condition='update[unit] = timesteps'
                )
            buffer_observe = False

        if buffer_observe is True and parallel_interactions == 1 and summarizer is not None:
            buffer_observe = False

        super().__init__(
            states=states, actions=actions, max_episode_timesteps=max_episode_timesteps,
            parallel_interactions=parallel_interactions, buffer_observe=buffer_observe, seed=seed,
            recorder=recorder
        )

        if isinstance(update, int):
            update = dict(unit='timesteps', batch_size=update)

        reward_estimation = dict(reward_estimation)
        if reward_estimation['horizon'] == 'episode':
            if max_episode_timesteps is None:
                raise TensorforceError.value(
                    name='agent', argument='reward_estimation[horizon]', value='episode',
                    condition='max_episode_timesteps is None'
                )
            reward_estimation['horizon'] = max_episode_timesteps

        # TEMPORARY TODO: State value doesn't exist for Beta
        if isinstance(baseline_policy, dict) and \
                baseline_policy.get('type') in (None, 'default', 'parametrized_distributions') and \
                'distributions' not in baseline_policy:
            baseline_policy['distributions'] = dict(float='gaussian')
        if (reward_estimation.get('estimate_horizon') is not False or \
                reward_estimation.get('estimate_advantage') is not False or \
                baseline_policy is not None or baseline_optimizer is not None or \
                baseline_objective is not None) and isinstance(policy, dict) and \
                policy.get('type') in (None, 'default', 'parametrized_distributions') and \
                'distributions' not in policy:
            policy['distributions'] = dict(float='gaussian')

        self.model = TensorforceModel(
            # Model
            name=name, device=device, parallel_interactions=self.parallel_interactions,
            buffer_observe=self.buffer_observe, seed=seed, execution=execution, saver=saver,
            summarizer=summarizer, config=config, states=self.states_spec,
            actions=self.actions_spec, preprocessing=preprocessing, exploration=exploration,
            variable_noise=variable_noise, l2_regularization=l2_regularization,
            # TensorforceModel
            policy=policy, memory=memory, update=update, optimizer=optimizer, objective=objective,
            reward_estimation=reward_estimation, baseline_policy=baseline_policy,
            baseline_optimizer=baseline_optimizer, baseline_objective=baseline_objective,
            entropy_regularization=entropy_regularization,
            max_episode_timesteps=max_episode_timesteps
        )

        self.experience_size = self.model.estimator.capacity
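
A minimal usage sketch for this constructor, assuming it is the __init__ of tensorforce.agents.TensorforceAgent (the class definition is not part of the snippet); state/action specs and argument values are illustrative only:

# Usage sketch (assumption: this __init__ belongs to tensorforce.agents.TensorforceAgent;
# all specs and values below are illustrative, not taken from the snippet).
from tensorforce.agents import TensorforceAgent

agent = TensorforceAgent(
    states=dict(type='float', shape=(4,)),
    actions=dict(type='int', num_values=2),
    max_episode_timesteps=500,
    update=64,  # int shorthand, converted above to dict(unit='timesteps', batch_size=64)
    objective='policy_gradient',
    reward_estimation=dict(horizon=20),
    memory=10000,  # explicit capacity, illustrative
)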
Example #2
    def __init__(
            # Required
            self,
            states,
            actions,
            update,
            optimizer,
            objective,
            reward_estimation,
            # Environment
            max_episode_timesteps=None,
            # Agent
            policy='auto',
            memory=None,
            # Baseline
            baseline=None,
            baseline_optimizer=None,
            baseline_objective=None,
            # Regularization
            l2_regularization=0.0,
            entropy_regularization=0.0,
            # Preprocessing
            state_preprocessing='linear_normalization',
            reward_preprocessing=None,
            # Exploration
            exploration=0.0,
            variable_noise=0.0,
            # Parallel interactions
            parallel_interactions=1,
            # Config, saver, summarizer, tracking, recorder
            config=None,
            saver=None,
            summarizer=None,
            tracking=None,
            recorder=None,
            # Deprecated
            baseline_policy=None,
            name=None,
            buffer_observe=None,
            device=None,
            seed=None):
        if 'estimate_actions' in reward_estimation:
            raise TensorforceError.deprecated(
                name='Agent',
                argument='reward_estimation[estimate_actions]',
                replacement='reward_estimation[estimate_action_values]')
        if 'estimate_terminal' in reward_estimation:
            raise TensorforceError.deprecated(
                name='Agent',
                argument='reward_estimation[estimate_terminal]',
                replacement='reward_estimation[estimate_terminals]')
        if summarizer is not None and 'labels' in summarizer:
            raise TensorforceError.deprecated(
                name='Agent',
                argument='summarizer[labels]',
                replacement='summarizer[summaries]')
        if baseline_policy is not None:
            raise TensorforceError.deprecated(name='Agent',
                                              argument='baseline_policy',
                                              replacement='baseline')
        if name is not None:
            raise TensorforceError.deprecated(name='Agent',
                                              argument='name',
                                              replacement='config[name]')
        if buffer_observe is not None:
            raise TensorforceError.deprecated(
                name='Agent',
                argument='buffer_observe',
                replacement='config[buffer_observe]')
        if device is not None:
            raise TensorforceError.deprecated(name='Agent',
                                              argument='device',
                                              replacement='config[device]')
        if seed is not None:
            raise TensorforceError.deprecated(name='Agent',
                                              argument='seed',
                                              replacement='config[seed]')

        if not hasattr(self, 'spec'):
            self.spec = OrderedDict(
                agent='tensorforce',
                # Environment
                states=states,
                actions=actions,
                max_episode_timesteps=max_episode_timesteps,
                # Agent
                policy=policy,
                memory=memory,
                update=update,
                optimizer=optimizer,
                objective=objective,
                reward_estimation=reward_estimation,
                # Baseline
                baseline=baseline,
                baseline_optimizer=baseline_optimizer,
                baseline_objective=baseline_objective,
                # Regularization
                l2_regularization=l2_regularization,
                entropy_regularization=entropy_regularization,
                # Preprocessing
                state_preprocessing=state_preprocessing,
                reward_preprocessing=reward_preprocessing,
                # Exploration
                exploration=exploration,
                variable_noise=variable_noise,
                # Parallel interactions
                parallel_interactions=parallel_interactions,
                # Config, saver, summarizer, recorder
                config=config,
                saver=saver,
                summarizer=summarizer,
                tracking=tracking,
                recorder=recorder)

        if memory is None:
            memory = dict(type='recent')

        if isinstance(update, int):
            update = dict(unit='timesteps', batch_size=update)

        if config is None:
            config = dict()
        else:
            config = dict(config)

        # TODO: should this change if summarizer is specified?
        if parallel_interactions > 1:
            if 'buffer_observe' not in config:
                if max_episode_timesteps is None:
                    raise TensorforceError.required(
                        name='Agent',
                        argument='max_episode_timesteps',
                        condition='parallel_interactions > 1')
                config['buffer_observe'] = 'episode'
            # elif config['buffer_observe'] < max_episode_timesteps:
            #     raise TensorforceError.value(
            #         name='Agent', argument='config[buffer_observe]',
            #         hint='< max_episode_timesteps', condition='parallel_interactions > 1'
            #     )

        elif update['unit'] == 'timesteps':
            update_frequency = update.get('frequency', update['batch_size'])
            if 'buffer_observe' not in config:
                if isinstance(update_frequency, int):
                    config['buffer_observe'] = update_frequency
                else:
                    config['buffer_observe'] = 1
            elif isinstance(
                    update_frequency,
                    int) and (config['buffer_observe'] == 'episode'
                              or config['buffer_observe'] > update_frequency):
                raise TensorforceError.value(
                    name='Agent',
                    argument='config[buffer_observe]',
                    value=config['buffer_observe'],
                    hint='> update[frequency]',
                    condition='update[unit] = "timesteps"')

        elif update['unit'] == 'episodes':
            if 'buffer_observe' not in config:
                config['buffer_observe'] = 'episode'

        # reward_estimation = dict(reward_estimation)
        # if reward_estimation['horizon'] == 'episode':
        #     if max_episode_timesteps is None:
        #         raise TensorforceError.required(
        #             name='Agent', argument='max_episode_timesteps',
        #             condition='reward_estimation[horizon] = "episode"'
        #         )
        #     reward_estimation['horizon'] = max_episode_timesteps

        super().__init__(states=states,
                         actions=actions,
                         max_episode_timesteps=max_episode_timesteps,
                         parallel_interactions=parallel_interactions,
                         config=config,
                         recorder=recorder)

        self.model = TensorforceModel(
            states=self.states_spec,
            actions=self.actions_spec,
            max_episode_timesteps=self.max_episode_timesteps,
            policy=policy,
            memory=memory,
            update=update,
            optimizer=optimizer,
            objective=objective,
            reward_estimation=reward_estimation,
            baseline=baseline,
            baseline_optimizer=baseline_optimizer,
            baseline_objective=baseline_objective,
            l2_regularization=l2_regularization,
            entropy_regularization=entropy_regularization,
            state_preprocessing=state_preprocessing,
            reward_preprocessing=reward_preprocessing,
            exploration=exploration,
            variable_noise=variable_noise,
            parallel_interactions=self.parallel_interactions,
            config=self.config,
            saver=saver,
            summarizer=summarizer,
            tracking=tracking)
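
In this newer variant the deprecated name, device, seed and buffer_observe arguments are expected inside config, and baseline_policy has become baseline. A hedged usage sketch, again assuming the class is tensorforce.agents.TensorforceAgent and using illustrative values:

# Usage sketch for the newer signature (assumption: tensorforce.agents.TensorforceAgent;
# all values are illustrative).
from tensorforce.agents import TensorforceAgent

agent = TensorforceAgent(
    states=dict(type='float', shape=(8,)),
    actions=dict(type='float', shape=(2,), min_value=-1.0, max_value=1.0),
    max_episode_timesteps=1000,
    update=dict(unit='episodes', batch_size=10),
    optimizer=dict(type='adam', learning_rate=1e-3),
    objective='policy_gradient',
    reward_estimation=dict(horizon=50),
    # name, device, seed and buffer_observe now live inside config (see the deprecation checks above)
    config=dict(name='agent', device='CPU', seed=0, buffer_observe='episode'),
)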
Example #3
    def __init__(
            self,
            # --- required ---
            # Environment
            states,
            actions,
            # Agent
            update,
            objective,
            reward_estimation,
            # --- default ---
            # Environment
            max_episode_timesteps=None,
            # Agent
            policy=None,
            network='auto',
            memory=None,
            optimizer='adam',
            # Baseline
            baseline_policy=None,
            baseline_network=None,
            baseline_optimizer=None,
            baseline_objective=None,
            # Preprocessing
            preprocessing=None,
            # Exploration
            exploration=0.0,
            variable_noise=0.0,
            # Regularization
            l2_regularization=0.0,
            entropy_regularization=0.0,
            # TensorFlow etc
            name='agent',
            device=None,
            parallel_interactions=1,
            buffer_observe=True,
            seed=None,
            execution=None,
            saver=None,
            summarizer=None,
            recorder=None,
            config=None):
        if not hasattr(self, 'spec'):
            self.spec = OrderedDict(
                agent='tensorforce',
                states=states,
                actions=actions,
                max_episode_timesteps=max_episode_timesteps,
                policy=policy,
                network=network,
                memory=memory,
                update=update,
                optimizer=optimizer,
                objective=objective,
                reward_estimation=reward_estimation,
                baseline_policy=baseline_policy,
                baseline_network=baseline_network,
                baseline_optimizer=baseline_optimizer,
                baseline_objective=baseline_objective,
                preprocessing=preprocessing,
                exploration=exploration,
                variable_noise=variable_noise,
                l2_regularization=l2_regularization,
                entropy_regularization=entropy_regularization,
                name=name,
                device=device,
                parallel_interactions=parallel_interactions,
                seed=seed,
                execution=execution,
                saver=saver,
                summarizer=summarizer,
                recorder=recorder,
                config=config)

        if buffer_observe is True and parallel_interactions == 1 and summarizer is not None:
            buffer_observe = False

        super().__init__(states=states,
                         actions=actions,
                         max_episode_timesteps=max_episode_timesteps,
                         parallel_interactions=parallel_interactions,
                         buffer_observe=buffer_observe,
                         seed=seed,
                         recorder=recorder)

        if isinstance(update, int):
            update = dict(unit='timesteps', batch_size=update)

        if reward_estimation['horizon'] == 'episode':
            if max_episode_timesteps is None:
                raise TensorforceError.unexpected()
            reward_estimation['horizon'] = max_episode_timesteps

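        # If no memory is specified, derive a sufficient capacity from the update batch
        # size, the maximum episode length and the reward-estimation horizon.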
        if memory is None:
            # predecessor/successor?
            if max_episode_timesteps is None or not isinstance(update['batch_size'], int) \
                    or not isinstance(reward_estimation['horizon'], int):
                raise TensorforceError.unexpected()
            if update['unit'] == 'timesteps':
                memory = update['batch_size'] + max_episode_timesteps + \
                    reward_estimation['horizon']
                # memory = ceil(update['batch_size'] / max_episode_timesteps) * max_episode_timesteps
                # memory += int(update['batch_size'] / max_episode_timesteps >= 1.0)
            elif update['unit'] == 'episodes':
                memory = update['batch_size'] * max_episode_timesteps + \
                    max(reward_estimation['horizon'], max_episode_timesteps)
            memory = max(memory, min(self.buffer_observe,
                                     max_episode_timesteps))

        self.model = TensorforceModel(
            # Model
            name=name,
            device=device,
            parallel_interactions=self.parallel_interactions,
            buffer_observe=self.buffer_observe,
            execution=execution,
            saver=saver,
            summarizer=summarizer,
            config=config,
            states=self.states_spec,
            actions=self.actions_spec,
            preprocessing=preprocessing,
            exploration=exploration,
            variable_noise=variable_noise,
            l2_regularization=l2_regularization,
            # TensorforceModel
            policy=policy,
            network=network,
            memory=memory,
            update=update,
            optimizer=optimizer,
            objective=objective,
            reward_estimation=reward_estimation,
            baseline_policy=baseline_policy,
            baseline_network=baseline_network,
            baseline_optimizer=baseline_optimizer,
            baseline_objective=baseline_objective,
            entropy_regularization=entropy_regularization)

        assert max_episode_timesteps is None or self.model.memory.capacity > max_episode_timesteps
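
A worked sketch of the automatic memory derivation above, using illustrative numbers; the buffer_observe value is assumed, since it is set by the parent constructor:

# Worked example of the memory derivation above (illustrative values).
max_episode_timesteps = 200
batch_size = 10        # update['batch_size'], with update['unit'] == 'episodes'
horizon = 50           # reward_estimation['horizon']
buffer_observe = 100   # self.buffer_observe, assumed here

memory = batch_size * max_episode_timesteps + max(horizon, max_episode_timesteps)  # 2200
memory = max(memory, min(buffer_observe, max_episode_timesteps))                   # still 2200
assert memory > max_episode_timesteps  # mirrors the final assert on model.memory.capacity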
Example #4
    def __init__(
            self,
            # --- required ---
            # Environment
            states,
            actions,
            # Agent
            update,
            objective,
            reward_estimation,
            # --- default ---
            # Environment
            max_episode_timesteps=None,
            # Agent
            policy='default',
            memory=None,
            optimizer='adam',
            # Baseline
            baseline_policy=None,
            baseline_optimizer=None,
            baseline_objective=None,
            # Preprocessing
            preprocessing=None,
            # Exploration
            exploration=0.0,
            variable_noise=0.0,
            # Regularization
            l2_regularization=0.0,
            entropy_regularization=0.0,
            # TensorFlow etc
            name='agent',
            device=None,
            parallel_interactions=1,
            buffer_observe=True,
            seed=None,
            execution=None,
            saver=None,
            summarizer=None,
            recorder=None,
            config=None):
        if not hasattr(self, 'spec'):
            self.spec = OrderedDict(
                agent='tensorforce',
                states=states,
                actions=actions,
                max_episode_timesteps=max_episode_timesteps,
                policy=policy,
                memory=memory,
                update=update,
                optimizer=optimizer,
                objective=objective,
                reward_estimation=reward_estimation,
                baseline_policy=baseline_policy,
                baseline_optimizer=baseline_optimizer,
                baseline_objective=baseline_objective,
                preprocessing=preprocessing,
                exploration=exploration,
                variable_noise=variable_noise,
                l2_regularization=l2_regularization,
                entropy_regularization=entropy_regularization,
                name=name,
                device=device,
                parallel_interactions=parallel_interactions,
                buffer_observe=buffer_observe,
                seed=seed,
                execution=execution,
                saver=saver,
                summarizer=summarizer,
                recorder=recorder,
                config=config)

        if isinstance(update, int) or update['unit'] == 'timesteps':
            if parallel_interactions > 1:
                raise TensorforceError.value(
                    name='agent',
                    argument='update',
                    value=update,
                    condition='parallel_interactions > 1')
            if buffer_observe is not True:
                raise TensorforceError.invalid(
                    name='agent',
                    argument='buffer_observe',
                    condition='update[unit] = timesteps')
            buffer_observe = False

        if buffer_observe is True and parallel_interactions == 1 and summarizer is not None:
            buffer_observe = False

        super().__init__(states=states,
                         actions=actions,
                         max_episode_timesteps=max_episode_timesteps,
                         parallel_interactions=parallel_interactions,
                         buffer_observe=buffer_observe,
                         seed=seed,
                         recorder=recorder)

        if isinstance(update, int):
            update = dict(unit='timesteps', batch_size=update)

        reward_estimation = dict(reward_estimation)
        if reward_estimation['horizon'] == 'episode':
            if max_episode_timesteps is None:
                raise TensorforceError.value(
                    name='agent',
                    argument='reward_estimation[horizon]',
                    value='episode',
                    condition='max_episode_timesteps is None')
            reward_estimation['horizon'] = max_episode_timesteps
        if 'capacity' not in reward_estimation:
            # TODO: Doesn't take network horizon into account, needs to be set internally to max
            # if isinstance(reward_estimation['horizon'], int):
            #     reward_estimation['capacity'] = max(
            #         self.buffer_observe, reward_estimation['horizon'] + 2
            #     )
            if max_episode_timesteps is None:
                raise TensorforceError.required(
                    name='agent',
                    argument='reward_estimation[capacity]',
                    condition='max_episode_timesteps is None')
            if isinstance(reward_estimation['horizon'], int):
                reward_estimation['capacity'] = max(
                    max_episode_timesteps, reward_estimation['horizon'])
            else:
                reward_estimation['capacity'] = max_episode_timesteps
        self.experience_size = reward_estimation['capacity']

        if memory is None or (isinstance(memory, dict)
                              and 'capacity' not in memory):
            # predecessor/successor?
            if max_episode_timesteps is None:
                raise TensorforceError.required(
                    name='agent',
                    argument='memory',
                    condition='max_episode_timesteps is None')
            elif not isinstance(update['batch_size'], int):
                raise TensorforceError.required(
                    name='agent',
                    argument='memory',
                    condition='update[batch_size] not int')
            elif not isinstance(reward_estimation['horizon'], int):
                raise TensorforceError.required(
                    name='agent',
                    argument='memory',
                    condition='reward_estimation[horizon] not int')
            if update['unit'] == 'timesteps':
                capacity = update['batch_size'] + max_episode_timesteps + \
                    reward_estimation['horizon']
                # capacity = ceil(update['batch_size'] / max_episode_timesteps) * max_episode_timesteps
                # capacity += int(update['batch_size'] / max_episode_timesteps >= 1.0)
            elif update['unit'] == 'episodes':
                capacity = update['batch_size'] * max_episode_timesteps + \
                    max(reward_estimation['horizon'], max_episode_timesteps)
            capacity = max(capacity,
                           min(self.buffer_observe, max_episode_timesteps))
            if memory is None:
                memory = capacity
            else:
                memory['capacity'] = capacity

        self.model = TensorforceModel(
            # Model
            name=name,
            device=device,
            parallel_interactions=self.parallel_interactions,
            buffer_observe=self.buffer_observe,
            seed=seed,
            execution=execution,
            saver=saver,
            summarizer=summarizer,
            config=config,
            states=self.states_spec,
            actions=self.actions_spec,
            preprocessing=preprocessing,
            exploration=exploration,
            variable_noise=variable_noise,
            l2_regularization=l2_regularization,
            # TensorforceModel
            policy=policy,
            memory=memory,
            update=update,
            optimizer=optimizer,
            objective=objective,
            reward_estimation=reward_estimation,
            baseline_policy=baseline_policy,
            baseline_optimizer=baseline_optimizer,
            baseline_objective=baseline_objective,
            entropy_regularization=entropy_regularization)

        if max_episode_timesteps is not None and \
                self.model.memory.capacity <= max_episode_timesteps:
            raise TensorforceError.value(name='agent',
                                         argument='memory.capacity',
                                         value=self.model.memory.capacity,
                                         hint='<= max_episode_timesteps')
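
A worked sketch of the capacity derivations above for the timesteps update unit, with illustrative values; the buffer_observe value is assumed, since buffering was disabled above and the parent constructor stores the effective value:

# Worked example of the reward-estimation and memory capacity derivations (illustrative values).
max_episode_timesteps = 500
batch_size = 64        # update['batch_size'], with update['unit'] == 'timesteps'
horizon = 20           # reward_estimation['horizon']
buffer_observe = 1     # self.buffer_observe after buffering was disabled above (assumed)

reward_capacity = max(max_episode_timesteps, horizon)                                # 500
memory_capacity = batch_size + max_episode_timesteps + horizon                       # 584
memory_capacity = max(memory_capacity, min(buffer_observe, max_episode_timesteps))   # 584
assert memory_capacity > max_episode_timesteps  # otherwise TensorforceError.value is raised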