def reset(self, **kwargs):
    # actor: categorical policy and its optimizer
    self.cat_policy = self.policy_net_fn(
        self.env, **self.policy_net_kwargs).to(self.device)
    self.policy_optimizer = optimizer_factory(self.cat_policy.parameters(),
                                              **self.optimizer_kwargs)

    # critic: value network and its optimizer
    self.value_net = self.value_net_fn(
        self.env, **self.value_net_kwargs).to(self.device)
    self.value_optimizer = optimizer_factory(self.value_net.parameters(),
                                             **self.optimizer_kwargs)

    # frozen snapshot of the policy, synced to the current weights
    self.cat_policy_old = self.policy_net_fn(
        self.env, **self.policy_net_kwargs).to(self.device)
    self.cat_policy_old.load_state_dict(self.cat_policy.state_dict())

    self.MseLoss = nn.MSELoss()
    self.memory = Memory()
    self.episode = 0

    # useful data
    self._rewards = np.zeros(self.n_episodes)
    self._cumul_rewards = np.zeros(self.n_episodes)

    # default writer
    log_every = 5 * logger.getEffectiveLevel()
    self.writer = PeriodicWriter(self.name, log_every=log_every)
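# Illustrative sketch (not part of the source above): reset() keeps a frozen
# snapshot `cat_policy_old` of the current policy, the standard PPO pattern
# for computing probability ratios against the pre-update policy. A minimal,
# self-contained version of that sync with hypothetical network shapes:
import torch
import torch.nn as nn

policy = nn.Sequential(nn.Linear(4, 32), nn.ReLU(), nn.Linear(32, 2))
policy_old = nn.Sequential(nn.Linear(4, 32), nn.ReLU(), nn.Linear(32, 2))

# copy current weights into the snapshot; gradients only flow through `policy`
policy_old.load_state_dict(policy.state_dict())

obs = torch.randn(8, 4)
log_probs = torch.log_softmax(policy(obs), dim=-1)
with torch.no_grad():
    log_probs_old = torch.log_softmax(policy_old(obs), dim=-1)
ratio = torch.exp(log_probs - log_probs_old)  # equals 1 right after the sync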
def __init__(self, env,
             n_episodes=1000,
             horizon=100,
             gamma=0.99,
             batch_size=16,
             percentile=70,
             learning_rate=0.01,
             optimizer_type='ADAM',
             policy_net_fn=None,
             **kwargs):
    Agent.__init__(self, env, **kwargs)

    # check environment
    assert isinstance(self.env.observation_space, spaces.Box)
    assert isinstance(self.env.action_space, spaces.Discrete)

    # parameters
    self.gamma = gamma
    self.batch_size = batch_size
    self.n_episodes = n_episodes
    self.percentile = percentile
    self.learning_rate = learning_rate
    self.horizon = horizon

    # random number generator
    self.rng = seeding.get_rng()

    # policy network constructor: user-provided or default
    self.policy_net_fn = policy_net_fn \
        or (lambda: default_policy_net_fn(self.env))

    self.optimizer_kwargs = {'optimizer_type': optimizer_type,
                             'lr': learning_rate}

    # policy net (`device` is defined at module level)
    self.policy_net = self.policy_net_fn().to(device)

    # loss function and optimizer
    self.loss_fn = nn.CrossEntropyLoss()
    self.optimizer = optimizer_factory(
        self.policy_net.parameters(), **self.optimizer_kwargs)

    # memory
    self.memory = CEMMemory(self.batch_size)

    # default writer
    self.writer = PeriodicWriter(self.name,
                                 log_every=5 * logger.getEffectiveLevel())
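# Illustrative sketch (not part of the source above): the `percentile`
# parameter drives the cross-entropy method's elite selection. Episodes
# whose return clears the given percentile are kept as training targets.
# A minimal NumPy version with hypothetical episode returns:
import numpy as np

episode_returns = np.array([3.0, 10.0, 7.5, 1.0, 9.0, 4.2])
percentile = 70
threshold = np.percentile(episode_returns, percentile)
elite = episode_returns >= threshold        # boolean mask over episodes
print(threshold, np.flatnonzero(elite))     # keep only the top episodes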
def __init__(self, env,
             n_episodes=1000,
             horizon=256,
             gamma=0.99,
             loss_function="l2",
             batch_size=100,
             device="cuda:best",
             target_update=1,
             learning_rate=0.001,
             optimizer_type='ADAM',
             qvalue_net_fn=None,
             double=True,
             exploration_kwargs=None,
             memory_kwargs=None,
             **kwargs):
    # Wrap arguments and initialize base class
    memory_kwargs = memory_kwargs or {}
    memory_kwargs['gamma'] = gamma
    base_args = (env, horizon, exploration_kwargs, memory_kwargs,
                 n_episodes, batch_size, target_update, double)
    AbstractDQNAgent.__init__(self, *base_args, **kwargs)

    # init
    self.optimizer_kwargs = {'optimizer_type': optimizer_type,
                             'lr': learning_rate}
    self.device = device
    self.loss_function = loss_function
    self.gamma = gamma

    # Q-value network constructor: user-provided or default
    qvalue_net_fn = qvalue_net_fn \
        or (lambda: default_qvalue_net_fn(self.env))
    self.value_net = qvalue_net_fn()
    self.target_net = qvalue_net_fn()

    # initialize target network as a frozen copy of the online network
    self.target_net.load_state_dict(self.value_net.state_dict())
    self.target_net.eval()
    logger.debug("Number of trainable parameters: {}"
                 .format(trainable_parameters(self.value_net)))
    self.device = choose_device(self.device)
    self.value_net.to(self.device)
    self.target_net.to(self.device)
    self.loss_function = loss_function_factory(self.loss_function)
    self.optimizer = optimizer_factory(self.value_net.parameters(),
                                       **self.optimizer_kwargs)
    self.steps = 0
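# Illustrative sketch (not part of the source above): with `double=True`
# the agent maintains two networks, and the double-DQN target selects
# actions with the online network but evaluates them with the target
# network. A minimal, self-contained PyTorch version with hypothetical
# shapes (no terminal mask, for brevity):
import torch
import torch.nn as nn

value_net = nn.Linear(4, 3)   # online Q-network: 4 features, 3 actions
target_net = nn.Linear(4, 3)  # frozen copy, synced every `target_update` steps
target_net.load_state_dict(value_net.state_dict())
target_net.eval()

gamma = 0.99
next_obs = torch.randn(8, 4)
rewards = torch.randn(8)
with torch.no_grad():
    best_actions = value_net(next_obs).argmax(dim=1, keepdim=True)   # select
    next_q = target_net(next_obs).gather(1, best_actions).squeeze(1)  # evaluate
    targets = rewards + gamma * next_q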
def reset(self, **kwargs):
    # policy network and its optimizer
    self.policy_net = self.policy_net_fn(
        self.env,
        **self.policy_net_kwargs,
    ).to(self.device)
    self.policy_optimizer = optimizer_factory(self.policy_net.parameters(),
                                              **self.optimizer_kwargs)
    self.memory = Memory()
    self.episode = 0

    # useful data
    self._rewards = np.zeros(self.n_episodes)
    self._cumul_rewards = np.zeros(self.n_episodes)

    # default writer
    log_every = 5 * logger.getEffectiveLevel()
    self.writer = PeriodicWriter(self.name, log_every=log_every)
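# Illustrative sketch (not part of the source above): this reset() looks
# like a REINFORCE-style agent's, so here is a minimal policy-gradient
# update of the kind the rebuilt `policy_net` and `policy_optimizer` would
# support. All data and shapes below are hypothetical:
import torch
import torch.nn as nn

policy_net = nn.Sequential(nn.Linear(4, 32), nn.ReLU(), nn.Linear(32, 2))
optimizer = torch.optim.Adam(policy_net.parameters(), lr=0.01)

obs = torch.randn(16, 4)              # one episode of states
actions = torch.randint(0, 2, (16,))  # actions taken
returns = torch.randn(16)             # discounted returns-to-go

log_probs = torch.log_softmax(policy_net(obs), dim=-1)
chosen = log_probs.gather(1, actions.unsqueeze(1)).squeeze(1)
loss = -(chosen * returns).mean()     # maximize expected return

optimizer.zero_grad()
loss.backward()
optimizer.step()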
def reset(self, **kwargs):
    # policy net
    self.policy_net = self.policy_net_fn(
        self.env, **self.policy_net_kwargs).to(self.device)

    # loss function and optimizer
    self.loss_fn = nn.CrossEntropyLoss()
    self.optimizer = optimizer_factory(self.policy_net.parameters(),
                                       **self.optimizer_kwargs)

    # memory
    self.memory = CEMMemory(self.batch_size)

    # default writer
    self.writer = PeriodicWriter(self.name,
                                 log_every=5 * logger.getEffectiveLevel())

    # episode counter and reward statistics
    self.episode = 0
    self._rewards = np.zeros(self.n_episodes)
    self._cumul_rewards = np.zeros(self.n_episodes)
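# Illustrative sketch (not part of the source above): in the cross-entropy
# method, `nn.CrossEntropyLoss` trains the policy to imitate the actions
# taken in elite episodes. A minimal version with hypothetical batches:
import torch
import torch.nn as nn

policy_net = nn.Linear(4, 2)  # logits over 2 discrete actions
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(policy_net.parameters(), lr=0.01)

elite_obs = torch.randn(32, 4)              # states from elite episodes
elite_actions = torch.randint(0, 2, (32,))  # actions that led to high returns

loss = loss_fn(policy_net(elite_obs), elite_actions)  # behavioral-cloning step
optimizer.zero_grad()
loss.backward()
optimizer.step()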
def __init__(self, env,
             n_episodes=1000,
             horizon=256,
             gamma=0.99,
             loss_function="l2",
             batch_size=100,
             device="cuda:best",
             target_update=1,
             learning_rate=0.001,
             epsilon_init=1.0,
             epsilon_final=0.1,
             epsilon_decay=5000,
             optimizer_type='ADAM',
             qvalue_net_fn=None,
             qvalue_net_kwargs=None,
             double=True,
             memory_capacity=10000,
             use_bonus=False,
             uncertainty_estimator_kwargs=None,
             prioritized_replay=True,
             update_frequency=1,
             **kwargs):
    # Wrap arguments and initialize base class
    memory_kwargs = {
        'capacity': memory_capacity,
        'n_steps': 1,
        'gamma': gamma
    }
    exploration_kwargs = {
        'method': "EpsilonGreedy",
        'temperature': epsilon_init,
        'final_temperature': epsilon_final,
        'tau': epsilon_decay,
    }
    self.use_bonus = use_bonus
    if self.use_bonus:
        env = UncertaintyEstimatorWrapper(env,
                                          **uncertainty_estimator_kwargs)
    IncrementalAgent.__init__(self, env, **kwargs)
    self.horizon = horizon
    self.exploration_kwargs = exploration_kwargs or {}
    self.memory_kwargs = memory_kwargs or {}
    self.n_episodes = n_episodes
    self.batch_size = batch_size
    self.target_update = target_update
    self.double = double

    assert isinstance(env.action_space, spaces.Discrete), \
        "Only compatible with Discrete action spaces."

    # replay memory: prioritized or uniform
    self.prioritized_replay = prioritized_replay
    memory_class = PrioritizedReplayMemory if prioritized_replay \
        else TransitionReplayMemory
    self.memory = memory_class(**self.memory_kwargs)

    self.exploration_policy = \
        exploration_factory(self.env.action_space,
                            **self.exploration_kwargs)
    self.training = True
    self.steps = 0
    self.episode = 0
    self.writer = None

    self.optimizer_kwargs = {
        'optimizer_type': optimizer_type,
        'lr': learning_rate
    }
    self.device = choose_device(device)
    self.loss_function = loss_function
    self.gamma = gamma

    # Q-value network constructor: path string, user-provided callable,
    # or default
    qvalue_net_kwargs = qvalue_net_kwargs or {}
    qvalue_net_fn = load(qvalue_net_fn) if isinstance(qvalue_net_fn, str) \
        else qvalue_net_fn or default_qvalue_net_fn
    self.value_net = qvalue_net_fn(self.env, **qvalue_net_kwargs)
    self.target_net = qvalue_net_fn(self.env, **qvalue_net_kwargs)

    # initialize target network as a frozen copy of the online network
    self.target_net.load_state_dict(self.value_net.state_dict())
    self.target_net.eval()
    logger.info("Number of trainable parameters: {}".format(
        trainable_parameters(self.value_net)))
    self.value_net.to(self.device)
    self.target_net.to(self.device)
    self.loss_function = loss_function_factory(self.loss_function)
    self.optimizer = optimizer_factory(self.value_net.parameters(),
                                       **self.optimizer_kwargs)
    self.update_frequency = update_frequency
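# Illustrative sketch (not part of the source above): the exploration
# settings map epsilon-greedy onto a temperature that decays from
# `epsilon_init` to `epsilon_final` with time constant `tau = epsilon_decay`.
# The exploration_factory implementation is not shown here, so assuming an
# exponential schedule, the shape would be:
import math

def epsilon_at(step, epsilon_init=1.0, epsilon_final=0.1, tau=5000):
    # exponential decay toward the final value; epsilon_at(0) == epsilon_init
    return epsilon_final + (epsilon_init - epsilon_final) * math.exp(-step / tau)

for step in (0, 5000, 20000):
    print(step, round(epsilon_at(step), 3))  # 1.0, ~0.431, ~0.116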