class Agent(BaseAgent):
    def __init__(self, env):
        super(Agent, self).__init__(env)
        self.model = DQN(self.obs_dim, self.action_dim)
        self.replay_buffer = ReplayBuffer(minibatch_size=minibatch_size)
        self.set_gui_flag(False, False)

    def get_action(self, obs, train=True):
        # Epsilon-greedy exploration with a linear decay from eps_max to eps_min
        # over the training steps.
        eps_min = 0.1
        eps_max = 1.0
        eps_decay_steps = self.train_step
        epsilon = max(eps_min,
                      eps_max - (eps_max - eps_min) * self.global_step / eps_decay_steps)
        if train and np.random.rand(1) < epsilon:
            action = self.env.action_space.sample()
        else:
            action = self.model.get_action(obs)
        return action

    def train_model(self, obs, action, reward, obs_next, done):
        self.replay_buffer.add_to_memory((obs, action, reward, obs_next, done))
        # Wait until the buffer holds enough transitions before training.
        if len(self.replay_buffer.replay_memory) < minibatch_size * pre_train_step:
            return None
        minibatch = self.replay_buffer.sample_from_memory()
        s, a, r, s_, done = map(np.array, zip(*minibatch))
        self.model.train_network(s, a, r, s_, done)
        # Periodically sync the target network with the online network.
        if self.global_step % target_update_period == 0:
            self.model.update_target()
        return
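The agents in this section all share a `ReplayBuffer` that exposes `add_to_memory`, `sample_from_memory`, and a `replay_memory` attribute; the class itself is not shown here. The following is a minimal sketch of that interface, assuming a deque-backed buffer with uniform sampling. The `capacity` default and anything beyond the three names used above are assumptions, not the project's actual implementation.

import random
from collections import deque


class ReplayBuffer:
    """Uniform experience replay (minimal sketch; the real class may differ)."""

    def __init__(self, minibatch_size=32, capacity=10000):
        self.minibatch_size = minibatch_size
        # A bounded deque silently drops the oldest transitions once full.
        self.replay_memory = deque(maxlen=capacity)

    def add_to_memory(self, transition):
        # transition = (obs, action, reward, obs_next, done)
        self.replay_memory.append(transition)

    def sample_from_memory(self):
        # Uniformly sample a minibatch of stored transitions.
        return random.sample(self.replay_memory, self.minibatch_size)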
class Agent(AbstractAgent):
    def __init__(self, env):
        super(Agent, self).__init__(env)
        print("Q-network Agent is created")
        self.action_dim = env.action_space.n
        # The grid is (high[0] + 1) x (high[0] + 1); states are one-hot vectors of size N^2.
        self.obs_dim = np.power(int(env.observation_space.high[0] + 1), 2)
        self.model = Q_Network(self.obs_dim, self.action_dim, train_step)
        self.replay_buffer = ReplayBuffer(minibatch_size=minibatch_size)

    def learn(self):
        print("Start train for {} steps".format(train_step))
        global_step = 0
        episode_num = 0
        while global_step < train_step:
            episode_num += 1
            step_in_ep = 0
            obs_v = self.env.reset()
            obs = self.one_hot(obs_v)
            total_reward = 0
            done = False
            while not done and global_step < train_step:
                global_step += 1
                step_in_ep += 1
                action = self.get_action(obs, global_step)

                # For debugging
                if global_step % 1000 == 0:
                    self.draw_current_optimal_actions(global_step)

                obs_v_next, reward, done, _ = self.env.step(action)
                obs_next = self.one_hot(obs_v_next)
                self.train_agent(obs, action, reward, obs_next, done, global_step)

                # GUI
                # self.env.render()

                obs = obs_next
                total_reward += reward

    def test(self, global_step=0):
        print("Start test for {} steps".format(test_step))
        global_step = 0
        episode_num = 0
        self.draw_current_optimal_actions(0)
        while global_step < test_step:
            episode_num += 1
            step_in_ep = 0
            total_reward = 0
            done = False
            obs_v = self.env.reset()  # Reset environment
            obs = self.one_hot(obs_v)
            while not done and global_step < test_step:
                global_step += 1
                step_in_ep += 1
                action = self.get_action(obs, global_step, False)
                obs_v_next, reward, done, _ = self.env.step(action)
                obs_next = self.one_hot(obs_v_next)

                # GUI
                time.sleep(0.05)
                self.env.render()

                obs = obs_next
                total_reward += reward
            print("[ test_ep: {}, total reward: {} ]".format(episode_num, total_reward))

    def get_action(self, obs, global_step, train=True):
        # Epsilon-greedy exploration with a linear decay over the training steps.
        eps_min = 0.1
        eps_max = 1.0
        eps_decay_steps = train_step
        epsilon = max(eps_min,
                      eps_max - (eps_max - eps_min) * global_step / eps_decay_steps)
        if train and np.random.rand(1) < epsilon:
            action = self.env.action_space.sample()
        else:
            action = self.model.get_action(obs)
        return action

    def train_agent(self, obs, action, reward, obs_next, done, global_step):
        self.replay_buffer.add_to_memory((obs, action, reward, obs_next, done))
        # Wait until the buffer holds enough transitions before training.
        if len(self.replay_buffer.replay_memory) < minibatch_size * pre_train_step:
            return None
        minibatch = self.replay_buffer.sample_from_memory()
        s, a, r, s_, done = map(np.array, zip(*minibatch))
        self.model.train_network(s, a, r, s_, done)
        # Periodic target-network update (originally left as a TODO).
        if global_step % target_update_period == 0:
            self.model.update_target()
        return

    def one_hot(self, obs):
        # Flatten the grid coordinate (x, y) into a row-major index, then one-hot encode it.
        idx = int(obs[1] * (self.env.observation_space.high[0] + 1) + obs[0])
        return np.eye(int(pow(self.env.observation_space.high[0] + 1, 2)))[idx]

    def draw_current_optimal_actions(self, step):
        # Print the greedy action for every grid cell as an ASCII table.
        idx = int(np.sqrt(self.obs_dim))
        directions = ["U", "D", "R", "L"]
        print("optimal actions at step {}".format(step))
        for i in range(idx):
            print("----" * idx + "-")
            row = ""
            for j in range(idx):
                row = row + "| {} ".format(directions[self.model.get_action(
                    np.eye(self.obs_dim)[int(idx * i + j)])])  # one-hot state
            row = row + "|"
            print(row)
        print("----" * idx + "-")
        return
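The `one_hot` helper flattens a grid coordinate `(x, y)` into a row-major index and returns a one-hot vector of length N^2, where N = `env.observation_space.high[0] + 1`. The standalone sketch below reproduces the same mapping for an assumed 5x5 grid, just to make the indexing concrete; `one_hot_grid` and the grid size are illustrative, not part of the original code.

import numpy as np


def one_hot_grid(obs, grid_size=5):
    # obs = (x, y); row-major index = y * grid_size + x, as in one_hot() above.
    idx = int(obs[1] * grid_size + obs[0])
    return np.eye(grid_size * grid_size)[idx]


# The cell (x=2, y=1) on a 5x5 grid maps to index 1 * 5 + 2 = 7.
print(np.argmax(one_hot_grid((2, 1))))  # -> 7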
class Agent(AbstractAgent):
    def __init__(self, env):
        super(Agent, self).__init__(env)
        print("DQN Agent")
        self.action_dim = env.action_space.n
        self.obs_dim = observation_dim(env.observation_space)
        self.model = DQN(self.obs_dim, self.action_dim)
        self.replay_buffer = ReplayBuffer(minibatch_size=minibatch_size)

    def learn(self):
        print("Start train for {} steps".format(train_step))
        global_step = 0
        episode_num = 0
        while global_step < train_step:
            episode_num += 1
            obs = self.env.reset()  # Reset environment
            total_reward = 0
            done = False
            while not done and global_step < train_step:
                global_step += 1
                action = self.get_action(obs, global_step)
                obs_next, reward, done, _ = self.env.step(action)
                self.train_agent(obs, action, reward, obs_next, done, global_step)

                # GUI
                self.env.render()

                obs = obs_next
                total_reward += reward

                # Periodically log progress and checkpoint the network.
                if global_step % 10000 == 0:
                    print(global_step)
                    self.model.save_network()

    def test(self, global_step=0):
        print("Start test for {} steps".format(test_step))
        global_step = 0
        episode_num = 0
        total_reward = 0
        while global_step < test_step:
            episode_num += 1
            obs = self.env.reset()  # Reset environment
            done = False
            while not done and global_step < test_step:
                global_step += 1
                action = self.get_action(obs, global_step, False)
                obs_next, reward, done, _ = self.env.step(action)

                # GUI
                self.env.render()

                obs = obs_next
                total_reward += reward
            print("[ test_ep: {}, total reward: {} ]".format(episode_num, total_reward))
            total_reward = 0

    def get_action(self, obs, global_step, train=True):
        # Epsilon-greedy exploration with a linear decay over the training steps.
        eps_min = 0.1
        eps_max = 1.0
        eps_decay_steps = train_step
        epsilon = max(eps_min,
                      eps_max - (eps_max - eps_min) * global_step / eps_decay_steps)
        if train and np.random.rand(1) < epsilon:
            action = self.env.action_space.sample()
        else:
            action = self.model.get_action(obs)
        return action

    def train_agent(self, obs, action, reward, obs_next, done, global_step):
        # Let the model preprocess raw observations before they are stored.
        state = self.model.preprocess_observation(obs)
        state_next = self.model.preprocess_observation(obs_next)
        self.replay_buffer.add_to_memory((state, action, reward, state_next, done))
        # Wait until the buffer holds enough transitions before training.
        if len(self.replay_buffer.replay_memory) < minibatch_size * pre_train_step:
            return None
        # Train the online network only every training_interval steps.
        if global_step % training_interval == 0:
            minibatch = self.replay_buffer.sample_from_memory()
            s, a, r, s_, done = map(np.array, zip(*minibatch))
            self.model.train_network(s, a, r, s_, done, global_step)
        # Periodically sync the target network with the online network.
        if global_step % target_update_period == 0:
            self.model.update_target()
        return
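The DQN agent reads module-level hyperparameters (`train_step`, `test_step`, `minibatch_size`, `pre_train_step`, `training_interval`, `target_update_period`) and an `observation_dim` helper, none of which are defined in this section. The sketch below shows one plausible configuration; the concrete values and the Box-shape assumption inside `observation_dim` are illustrative guesses, not the project's actual settings.

import numpy as np

# Illustrative hyperparameter values (assumptions; the actual config lives elsewhere).
train_step = 100000           # total environment steps used for training
test_step = 10000             # total environment steps used for testing
minibatch_size = 32           # transitions per gradient update
pre_train_step = 10           # start training after minibatch_size * pre_train_step transitions
training_interval = 4         # train the online network every N environment steps
target_update_period = 1000   # sync the target network every N environment steps


def observation_dim(observation_space):
    # Flattened size of a Box observation, e.g. shape (84, 84, 4) -> 28224.
    return int(np.prod(observation_space.shape))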
class Agent(AbstractAgent):
    def __init__(self, env):
        super(Agent, self).__init__(env)
        print("DDPG Agent")
        self.action_dim = action_dim(env.action_space)  ### KH: for continuous action task
        self.obs_dim = observation_dim(env.observation_space)
        self.action_max = env.action_space.high  ### KH: DDPG action bound
        self.action_min = env.action_space.low  ### KH: DDPG action bound
        self.model = self.set_model()
        self.replay_buffer = ReplayBuffer(minibatch_size=minibatch_size)

    def set_model(self):
        # model can be q-table or q-network
        model = DDPG(self.obs_dim, self.action_dim, self.action_max, self.action_min)
        return model

    def learn(self):
        print("Start Learn")
        global_step = 0
        episode_num = 0
        while global_step < train_step:
            episode_num += 1
            step_in_ep = 0
            obs = self.env.reset()  # Reset environment
            total_reward = 0
            done = False
            self.noise = np.zeros(self.action_dim)
            while (not done and step_in_ep < max_step_per_episode
                   and global_step < train_step):  ### KH: reset every 200 steps
                global_step += 1
                step_in_ep += 1
                action = self.get_action(obs, global_step)
                obs_next, reward, done, _ = self.env.step(action)
                self.train_agent(obs, action, reward, obs_next, done, global_step)

                # GUI
                self.env.render()

                obs = obs_next
                total_reward += reward
            print("[ train_ep: {}, total reward: {} ]".format(
                episode_num, total_reward))  ### KH: train result

    def test(self, global_step=0):
        print("Test step: {}".format(global_step))
        global_step = 0
        episode_num = 0
        total_reward = 0
        while global_step < test_step:
            episode_num += 1
            step_in_ep = 0
            obs = self.env.reset()  # Reset environment
            total_reward = 0  ### KH: Added missing
            done = False
            while (not done and step_in_ep < max_step_per_episode
                   and global_step < test_step):  ### KH: reset every 200 steps
                global_step += 1
                step_in_ep += 1
                action = self.get_action(obs, global_step, False)
                obs_next, reward, done, _ = self.env.step(action)

                # GUI
                self.env.render()

                obs = obs_next
                total_reward += reward
            print("[ test_ep: {}, total reward: {} ]".format(
                episode_num, total_reward))  ### KH: test result

    def get_action(self, obs, global_step, train=True):
        # Select the optimal action; during training add exploration noise (OU process).
        action = self.model.choose_action(obs)
        if train:
            # Noise magnitude is scaled down linearly over the course of training.
            scale = 1 - global_step / train_step
            self.noise = self.ou_noise(self.noise)
            action = action + self.noise * (self.action_max - self.action_min) / 2 * scale
            # Clip the noisy action back into the valid action range.
            action = np.maximum(action, self.action_min)
            action = np.minimum(action, self.action_max)
        return action

    def train_agent(self, obs, action, reward, obs_next, done, step):
        self.replay_buffer.add_to_memory((obs, action, reward, obs_next, done))
        # Wait until the buffer holds enough transitions before training.
        if len(self.replay_buffer.replay_memory) < minibatch_size * pre_train_step:
            return None
        minibatch = self.replay_buffer.sample_from_memory()
        s, a, r, ns, d = map(np.array, zip(*minibatch))
        self.model.train_network(s, a, r, ns, d, step)
        return None

    def ou_noise(self, x):
        # One step of an Ornstein-Uhlenbeck process for temporally correlated noise.
        return x + theta * (mu - x) + sigma * np.random.randn(self.action_dim)
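`ou_noise` performs one Euler step of an Ornstein-Uhlenbeck process, x <- x + theta * (mu - x) + sigma * N(0, I), which produces temporally correlated exploration noise for continuous actions, as in the original DDPG setup. The globals `theta`, `mu`, and `sigma` are defined outside this section; the sketch below uses the commonly quoted defaults theta = 0.15, mu = 0, sigma = 0.2, which are assumptions rather than the values used here.

import numpy as np

# Assumed OU parameters; the agent above reads theta, mu, sigma from module scope.
theta = 0.15   # strength of the pull back toward the mean
mu = 0.0       # long-run mean of the noise
sigma = 0.2    # scale of the random perturbation


def ou_step(x, action_dim):
    # One Euler step of the Ornstein-Uhlenbeck process: a mean-reverting random walk.
    return x + theta * (mu - x) + sigma * np.random.randn(action_dim)


# The noise is reset to zeros at each episode start and advanced once per step,
# mirroring self.noise in the agent above.
noise = np.zeros(2)
for _ in range(3):
    noise = ou_step(noise, 2)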