import gym


class AntAgent:
    def __init__(self, render=False, model=None):
        # Create an environment
        self.environment = gym.make('MountainCarContinuous-v0')
        # Reset environment when an agent is initialized
        self.current_observation = self.reset_environment()
        self.render = render
        self.model = model
        self.buffer = ReplayBuffer()

    def reset_environment(self):
        current_observation = self.environment.reset()
        return current_observation

    def get_action(self, current_observation):
        """Fetch an action according to the model policy."""
        if self.model is None:
            # Without a model, fall back to uniform random actions
            action = self.environment.action_space.sample()
        else:
            action = self.model.predict(current_observation)
        return action

    def get_transitions(self, action):
        """Take one step in the environment and return the observations."""
        next_observation, reward, done, _ = self.environment.step(action)
        if self.render:
            self.environment.render()
        return next_observation, reward, done

    def run_episode(self, num_episodes=1):
        """Run `num_episodes` episodes using the `model` policy."""
        for episode in range(num_episodes):
            self.current_observation = self.reset_environment()
            episode_id = self.buffer.create_episode()
            done = False
            while not done:
                # Build a fresh transition dict each step so transitions
                # already stored in the buffer are not mutated later
                transition = dict()
                transition['current_observation'] = self.current_observation
                transition['action'] = self.get_action(self.current_observation)
                transition['next_observation'], transition['reward'], done = \
                    self.get_transitions(transition['action'])
                self.buffer.add_sample(episode_id, transition)
                # Advance to the next state before choosing the next action
                self.current_observation = transition['next_observation']
            self.buffer.add_episode(episode_id)

    def learn(self, step=0, restore=False):
        """Train the SAC model using transitions in the replay buffer."""
        if self.model is None:
            raise Exception("This agent has no brain! Add a model that implements fit() to train.")
        # Sample an array of transitions from the replay buffer.
        transition_matrices = self.buffer.fetch_sample()
        # After the first call, restore the model from its checkpoint.
        if step != 0:
            restore = True
        # Fit the SAC model.
        self.model.fit(transition_matrices, restore=restore, global_step=step)
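# The agent above assumes a ReplayBuffer exposing create_episode(),
# add_sample(), add_episode(), and fetch_sample(). A minimal sketch of one
# compatible implementation follows; the internal layout (per-episode
# transition lists, flattened into NumPy arrays on sampling) is an
# assumption, not the original author's code.
import random

import numpy as np


class ReplayBuffer:
    def __init__(self, max_episodes=1000):
        self.max_episodes = max_episodes
        self.episodes = {}      # finished episodes, keyed by id
        self.in_progress = {}   # episodes still being filled
        self.next_episode_id = 0

    def create_episode(self):
        """Open a new episode and return its id."""
        episode_id = self.next_episode_id
        self.next_episode_id += 1
        self.in_progress[episode_id] = []
        return episode_id

    def add_sample(self, episode_id, transition):
        """Append one transition dict to an in-progress episode."""
        # Copy so later mutation of the caller's dict cannot corrupt the buffer
        self.in_progress[episode_id].append(dict(transition))

    def add_episode(self, episode_id):
        """Mark an episode as finished, evicting the oldest one if full."""
        self.episodes[episode_id] = self.in_progress.pop(episode_id)
        if len(self.episodes) > self.max_episodes:
            del self.episodes[min(self.episodes)]

    def fetch_sample(self, num_samples=256):
        """Return a random minibatch of transitions as stacked arrays."""
        all_transitions = [t for ep in self.episodes.values() for t in ep]
        batch = random.sample(all_transitions,
                              min(num_samples, len(all_transitions)))
        keys = ('current_observation', 'action', 'reward', 'next_observation')
        return tuple(np.array([t[k] for t in batch]) for k in keys)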
# Store transition in replay buffer
replay.store(current_state, action, reward, next_state, end)

# Update current state
current_state = next_state

step += 1
global_step += 1

# Train on every step (step % 1 == 0) once the random warm-up phase is over
if (step % 1 == 0) and (global_step > args.start_steps):
    for epoch in range(args.epochs):
        # Randomly sample a minibatch of transitions from the replay buffer
        current_states, actions, rewards, next_states, ends = replay.fetch_sample(
            num_samples=args.batch_size)

        # Perform a single step of gradient descent on the Q and policy networks
        critic1_loss, critic2_loss, actor_loss, alpha_loss = sac.train(
            current_states, actions, rewards, next_states, ends)

        if args.verbose:
            print(episode, global_step, epoch,
                  critic1_loss.numpy(), critic2_loss.numpy(),
                  actor_loss.numpy(), episode_reward)

        with writer.as_default():
            tf.summary.scalar("actor_loss", actor_loss, sac.epoch_step)
            tf.summary.scalar("critic1_loss", critic1_loss, sac.epoch_step)
            tf.summary.scalar("critic2_loss", critic2_loss, sac.epoch_step)
            tf.summary.scalar("alpha_loss", alpha_loss, sac.epoch_step)
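# The loop above assumes a `writer` created before training starts. A minimal
# setup sketch using TensorFlow 2's summary API; the "logs/sac" directory
# name is an assumption, not from the original code:
import tensorflow as tf

writer = tf.summary.create_file_writer("logs/sac")
# Scalars logged inside `with writer.as_default():` can then be inspected
# with TensorBoard, e.g. `tensorboard --logdir logs/sac`.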