Example No. 1
    def reset(self, **kwargs):
        self.cat_policy = self.policy_net_fn(
            self.env, **self.policy_net_kwargs).to(self.device)
        self.policy_optimizer = optimizer_factory(self.cat_policy.parameters(),
                                                  **self.optimizer_kwargs)

        self.value_net = self.value_net_fn(
            self.env, **self.value_net_kwargs).to(self.device)

        self.value_optimizer = optimizer_factory(self.value_net.parameters(),
                                                 **self.optimizer_kwargs)

        self.cat_policy_old = self.policy_net_fn(
            self.env, **self.policy_net_kwargs).to(self.device)
        self.cat_policy_old.load_state_dict(self.cat_policy.state_dict())

        self.MseLoss = nn.MSELoss()

        self.memory = Memory()

        self.episode = 0

        # useful data
        self._rewards = np.zeros(self.n_episodes)
        self._cumul_rewards = np.zeros(self.n_episodes)

        # default writer
        log_every = 5 * logger.getEffectiveLevel()
        self.writer = PeriodicWriter(self.name, log_every=log_every)
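
The frozen cat_policy_old copy above is what a clipped-surrogate (PPO-style) update relies on: the loss compares action probabilities under the current and the old policy. Below is a minimal sketch of that ratio-and-clip step, assuming the policy networks return a torch.distributions.Categorical and that states, actions and advantages are tensors already gathered from the memory; clip_eps is an illustrative hyperparameter, not part of the snippet.

import torch

def clipped_surrogate_loss(cat_policy, cat_policy_old, states, actions,
                           advantages, clip_eps=0.2):
    # log-probabilities of the taken actions under the new and old policies
    # (assumes each policy maps states to a torch.distributions.Categorical)
    new_logp = cat_policy(states).log_prob(actions)
    with torch.no_grad():
        old_logp = cat_policy_old(states).log_prob(actions)

    # importance ratio pi_new(a|s) / pi_old(a|s)
    ratio = torch.exp(new_logp - old_logp)

    # clipped surrogate objective; negated so it can be minimized
    unclipped = ratio * advantages
    clipped = torch.clamp(ratio, 1.0 - clip_eps, 1.0 + clip_eps) * advantages
    return -torch.min(unclipped, clipped).mean()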
Example No. 2
    def __init__(self,
                 env,
                 n_episodes=1000,
                 horizon=100,
                 gamma=0.99,
                 batch_size=16,
                 percentile=70,
                 learning_rate=0.01,
                 optimizer_type='ADAM',
                 policy_net_fn=None,
                 **kwargs):
        Agent.__init__(self, env, **kwargs)

        # check environment
        assert isinstance(self.env.observation_space, spaces.Box)
        assert isinstance(self.env.action_space, spaces.Discrete)

        # parameters
        self.gamma = gamma
        self.batch_size = batch_size
        self.n_episodes = n_episodes
        self.percentile = percentile
        self.learning_rate = learning_rate
        self.horizon = horizon

        # random number generator
        self.rng = seeding.get_rng()

        #
        self.policy_net_fn = policy_net_fn \
            or (lambda: default_policy_net_fn(self.env))

        self.optimizer_kwargs = {'optimizer_type': optimizer_type,
                                 'lr': learning_rate}

        # policy net
        self.policy_net = self.policy_net_fn().to(device)

        # loss function and optimizer
        self.loss_fn = nn.CrossEntropyLoss()
        self.optimizer = optimizer_factory(
                                    self.policy_net.parameters(),
                                    **self.optimizer_kwargs)

        # memory
        self.memory = CEMMemory(self.batch_size)

        # default writer
        self.writer = PeriodicWriter(self.name,
                                     log_every=5*logger.getEffectiveLevel())
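
The percentile argument is the core of the cross-entropy method: only episodes whose return lies above that percentile are kept, and the policy is then fit to their actions with the CrossEntropyLoss defined above. A minimal sketch of the elite-filtering step, with illustrative names (batch_episodes as a list of (states, actions) pairs, batch_returns as the matching episode returns), not the library's API:

import numpy as np

def select_elites(batch_episodes, batch_returns, percentile=70):
    # return threshold: episodes below this value are discarded
    threshold = np.percentile(batch_returns, percentile)

    elite_states, elite_actions = [], []
    for (states, actions), episode_return in zip(batch_episodes,
                                                 batch_returns):
        if episode_return >= threshold:
            elite_states.extend(states)
            elite_actions.extend(actions)

    # the elite (state, action) pairs serve as supervised targets
    # for the cross-entropy loss used by the agent
    return elite_states, elite_actions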
Example No. 3
    def __init__(self,
                 env,
                 n_episodes=1000,
                 horizon=256,
                 gamma=0.99,
                 loss_function="l2",
                 batch_size=100,
                 device="cuda:best",
                 target_update=1,
                 learning_rate=0.001,
                 optimizer_type='ADAM',
                 qvalue_net_fn=None,
                 double=True,
                 exploration_kwargs=None,
                 memory_kwargs=None,
                 **kwargs):
        # Wrap arguments and initialize base class
        memory_kwargs = memory_kwargs or {}
        memory_kwargs['gamma'] = gamma
        base_args = (env, horizon, exploration_kwargs, memory_kwargs,
                     n_episodes, batch_size, target_update, double)
        AbstractDQNAgent.__init__(self, *base_args, **kwargs)

        # init
        self.optimizer_kwargs = {'optimizer_type': optimizer_type,
                                 'lr': learning_rate}
        self.device = device
        self.loss_function = loss_function
        self.gamma = gamma
        #
        qvalue_net_fn = qvalue_net_fn \
            or (lambda: default_qvalue_net_fn(self.env))
        self.value_net = qvalue_net_fn()
        self.target_net = qvalue_net_fn()
        #
        self.target_net.load_state_dict(self.value_net.state_dict())
        self.target_net.eval()
        logger.debug("Number of trainable parameters: {}"
                     .format(trainable_parameters(self.value_net)))
        self.device = choose_device(self.device)
        self.value_net.to(self.device)
        self.target_net.to(self.device)
        self.loss_function = loss_function_factory(self.loss_function)
        self.optimizer = optimizer_factory(self.value_net.parameters(),
                                           **self.optimizer_kwargs)
        self.steps = 0
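
With double=True, the value_net / target_net pair is typically used to decouple action selection from action evaluation in the Bellman target. A minimal sketch of that double-DQN target, assuming batched tensors and networks that map states to per-action Q-values; this illustrates the idea rather than reproducing the library's exact update:

import torch

def double_dqn_target(value_net, target_net, rewards, next_states, dones,
                      gamma=0.99):
    with torch.no_grad():
        # the online network selects the greedy action...
        next_actions = value_net(next_states).argmax(dim=1, keepdim=True)
        # ...and the target network evaluates it
        next_q = target_net(next_states).gather(1, next_actions).squeeze(1)
    # bootstrapped target, zeroed at terminal transitions
    return rewards + gamma * next_q * (1.0 - dones.float())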
Example No. 4
    def reset(self, **kwargs):
        self.policy_net = self.policy_net_fn(
            self.env,
            **self.policy_net_kwargs,
        ).to(self.device)

        self.policy_optimizer = optimizer_factory(self.policy_net.parameters(),
                                                  **self.optimizer_kwargs)

        self.memory = Memory()

        self.episode = 0

        # useful data
        self._rewards = np.zeros(self.n_episodes)
        self._cumul_rewards = np.zeros(self.n_episodes)

        # default writer
        log_every = 5 * logger.getEffectiveLevel()
        self.writer = PeriodicWriter(self.name, log_every=log_every)
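
The _rewards and _cumul_rewards arrays reset above are per-episode statistics. One possible way they get filled at the end of each episode is sketched below; this is illustrative bookkeeping, not the agent's actual training loop:

def record_episode(self, episode_reward):
    # store the return of the finished episode and the running cumulative sum
    self._rewards[self.episode] = episode_reward
    previous = (self._cumul_rewards[self.episode - 1]
                if self.episode > 0 else 0.0)
    self._cumul_rewards[self.episode] = previous + episode_reward
    self.episode += 1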
Example No. 5
    def reset(self, **kwargs):
        # policy net
        self.policy_net = self.policy_net_fn(
            self.env, **self.policy_net_kwargs).to(self.device)

        # loss function and optimizer
        self.loss_fn = nn.CrossEntropyLoss()
        self.optimizer = optimizer_factory(self.policy_net.parameters(),
                                           **self.optimizer_kwargs)

        # memory
        self.memory = CEMMemory(self.batch_size)

        # default writer
        self.writer = PeriodicWriter(self.name,
                                     log_every=5 * logger.getEffectiveLevel())

        #
        self.episode = 0
        self._rewards = np.zeros(self.n_episodes)
        self._cumul_rewards = np.zeros(self.n_episodes)
Example No. 6
    def __init__(self,
                 env,
                 n_episodes=1000,
                 horizon=256,
                 gamma=0.99,
                 loss_function="l2",
                 batch_size=100,
                 device="cuda:best",
                 target_update=1,
                 learning_rate=0.001,
                 epsilon_init=1.0,
                 epsilon_final=0.1,
                 epsilon_decay=5000,
                 optimizer_type='ADAM',
                 qvalue_net_fn=None,
                 qvalue_net_kwargs=None,
                 double=True,
                 memory_capacity=10000,
                 use_bonus=False,
                 uncertainty_estimator_kwargs=None,
                 prioritized_replay=True,
                 update_frequency=1,
                 **kwargs):
        # Wrap arguments and initialize base class
        memory_kwargs = {
            'capacity': memory_capacity,
            'n_steps': 1,
            'gamma': gamma
        }
        exploration_kwargs = {
            'method': "EpsilonGreedy",
            'temperature': epsilon_init,
            'final_temperature': epsilon_final,
            'tau': epsilon_decay,
        }
        self.use_bonus = use_bonus
        if self.use_bonus:
            env = UncertaintyEstimatorWrapper(env,
                                              **uncertainty_estimator_kwargs)
        IncrementalAgent.__init__(self, env, **kwargs)
        self.horizon = horizon
        self.exploration_kwargs = exploration_kwargs or {}
        self.memory_kwargs = memory_kwargs or {}
        self.n_episodes = n_episodes
        self.batch_size = batch_size
        self.target_update = target_update
        self.double = double

        assert isinstance(env.action_space, spaces.Discrete), \
            "Only compatible with Discrete action spaces."

        self.prioritized_replay = prioritized_replay
        memory_class = PrioritizedReplayMemory if prioritized_replay \
            else TransitionReplayMemory
        self.memory = memory_class(**self.memory_kwargs)
        self.exploration_policy = \
            exploration_factory(self.env.action_space,
                                **self.exploration_kwargs)
        self.training = True
        self.steps = 0
        self.episode = 0
        self.writer = None

        self.optimizer_kwargs = {
            'optimizer_type': optimizer_type,
            'lr': learning_rate
        }
        self.device = choose_device(device)
        self.loss_function = loss_function
        self.gamma = gamma

        qvalue_net_kwargs = qvalue_net_kwargs or {}
        qvalue_net_fn = load(qvalue_net_fn) if isinstance(qvalue_net_fn, str) else \
            qvalue_net_fn or default_qvalue_net_fn
        self.value_net = qvalue_net_fn(self.env, **qvalue_net_kwargs)
        self.target_net = qvalue_net_fn(self.env, **qvalue_net_kwargs)

        self.target_net.load_state_dict(self.value_net.state_dict())
        self.target_net.eval()
        logger.info("Number of trainable parameters: {}".format(
            trainable_parameters(self.value_net)))
        self.value_net.to(self.device)
        self.target_net.to(self.device)
        self.loss_function = loss_function_factory(self.loss_function)
        self.optimizer = optimizer_factory(self.value_net.parameters(),
                                           **self.optimizer_kwargs)
        self.update_frequency = update_frequency
        self.steps = 0
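
The epsilon_init / epsilon_final / epsilon_decay arguments are forwarded to the EpsilonGreedy exploration policy as temperature, final_temperature and tau. One common way to realize such a schedule is an exponential decay toward the final value, sketched below with hypothetical names; exploration_factory may implement it differently:

import numpy as np

def epsilon_at(step, epsilon_init=1.0, epsilon_final=0.1, epsilon_decay=5000):
    # exponential interpolation from epsilon_init down to epsilon_final;
    # epsilon_decay plays the role of the 'tau' time constant
    return epsilon_final + (epsilon_init - epsilon_final) \
        * np.exp(-step / epsilon_decay)

def epsilon_greedy_action(q_values, step):
    # with probability epsilon(step) act uniformly at random, else greedily
    if np.random.uniform() < epsilon_at(step):
        return np.random.randint(len(q_values))
    return int(np.argmax(q_values))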