import time
from collections import deque

import gym
import numpy as np
from torch.utils.tensorboard import SummaryWriter

# Project-local helpers (assumed import paths; adjust to the repository layout).
# A sketch of the mkdir helper is included at the end of this file.
from replay_buffer import ReplayBuffer
from utils import mkdir


class Agent():
    def __init__(self, state_size, action_size, config):
        self.action_size = action_size
        self.state_size = state_size
        # Tabular value functions and statistics for the discrete environment.
        self.Q = np.zeros([state_size, action_size])          # expert Q-table
        self.Q_inverse = np.zeros([state_size, action_size])  # Q-table learned from demonstrations
        self.debug_Q = np.zeros([state_size, action_size])    # Q-table trained on the learned reward
        self.Q_shift = np.zeros([state_size, action_size])
        self.r = np.zeros([state_size, action_size])          # learned reward function
        self.counter = np.zeros([state_size, action_size])    # state-action visit counts
        self.gamma = config["gamma"]
        self.epsilon = 1
        self.lr = config["lr"]
        self.lr_iql_q = config["lr_iql_q"]
        self.lr_iql_r = config["lr_iql_r"]
        self.min_epsilon = config["min_epsilon"]
        self.max_epsilon = 1
        self.episode = 15000
        self.decay = config["decay"]
        self.total_reward = 0
        self.eval_frq = 50
        self.render_env = False
        self.env = gym.make(config["env_name"])
        self.memory = ReplayBuffer((1,), (1,), config["buffer_size"], config["device"])
        self.gamma_iql = 0.99
        self.lr_sh = config["lr_q_sh"]
        self.ratio = 1. / action_size
        self.eval_q_inverse = 50000
        self.episodes_qinverse = int(5e6)
        self.update_freq = config["freq_q"]
        self.steps = 0
        self.batch_size = config.get("batch_size", 32)  # assumed config key; only used by learn()
        pathname = "lr_inv_q {} lr_inv_r {} freq {}".format(self.lr_iql_q, self.lr_iql_r, self.update_freq)
        tensorboard_name = str(config["locexp"]) + '/runs/' + pathname
        self.writer = SummaryWriter(tensorboard_name)
        tensorboard_name = str(config["locexp"]) + '/runs/' + "inverse"
        self.writer_inverse = SummaryWriter(tensorboard_name)
        tensorboard_name = str(config["locexp"]) + '/runs/' + "expert"
        self.writer_expert = SummaryWriter(tensorboard_name)
        self.last_100_reward_errors = deque(maxlen=100)
        self.average_same_action = deque(maxlen=100)
        self.expert_buffer_size = config["expert_buffer_size"]

    def act(self, state, epsilon, eval_pi=False, use_debug=False):
        # Epsilon-greedy action selection; eval_pi forces greedy behaviour.
        if np.random.random() > epsilon or eval_pi:
            action = np.argmax(self.Q[state])
            if use_debug:
                action = np.argmax(self.debug_Q[state])
        else:
            action = self.env.action_space.sample()
        return action

    def act_inverse_q(self, state):
        return np.argmax(self.Q_inverse[state])

    def optimize(self, state, action, reward, next_state, debug=False):
        # Tabular Q-learning update: Q(s,a) += lr * (r + gamma * max_a' Q(s',a') - Q(s,a)).
        if debug:
            td_target = reward + self.gamma * np.max(self.debug_Q[next_state])
            self.debug_Q[state, action] += self.lr * (td_target - self.debug_Q[state, action])
            return
        td_target = reward + self.gamma * np.max(self.Q[next_state])
        self.Q[state, action] += self.lr * (td_target - self.Q[state, action])

    def learn(self):
        # Q-learning update over a batch of transitions sampled from the replay buffer.
        states, actions, rewards, next_states, dones = self.memory.sample(self.batch_size)
        for state, action, reward, next_state, done in zip(states, actions, rewards, next_states, dones):
            td_target = reward + self.gamma * np.max(self.Q[next_state])
            self.Q[state, action] += self.lr * (td_target - self.Q[state, action])

    def compute_reward_loss(self, episode=10):
        """Roll out the expert policy in the env and compare the true reward
        to the reward predicted by the learned reward table."""
        self.env.seed(np.random.randint(0, 10))
        reward_list = []
        for epi in range(episode):
            state = self.env.reset()
            done = False
            while not done:
                action = np.argmax(self.trained_Q[state])
                next_state, reward, done, _ = self.env.step(action)
                predict_reward = self.r[state, action]
                reward_list.append((reward, predict_reward))
                state = next_state
        reward_errors = [abs(r[0] - r[1]) for r in reward_list]
        reward_loss = sum(reward_errors) / len(reward_errors)
        self.last_100_reward_errors.append(reward_loss)
        average_loss = np.mean(self.last_100_reward_errors)
        print("average mean loss ", average_loss)
        self.writer.add_scalar('Reward_loss', reward_loss, self.steps)
        self.writer.add_scalar('Average_Reward_loss', average_loss, self.steps)

    def invers_q(self, continue_train=False):
        """Inverse Q-learning from the saved expert replay buffer."""
        self.memory.load_memory("memory")
        self.load_q_table()
        if not continue_train:
            print("clean policy")
            self.Q = np.zeros([self.state_size, self.action_size])
        mkdir("", "inverse_policy")
        for epi in range(1, self.episodes_qinverse + 1):
            self.steps += 1
            if epi % self.eval_q_inverse == 0:
                self.start_reward()
                self.memory.save_memory("inverse_policy")
                self.save_q_table("inverse_Q")
                self.save_r_table()
                self.render_env = False
                self.eval_policy(use_inverse=True, episode=5)
                self.eval_policy(use_expert=True, episode=5)
                self.render_env = False
            state, action, r, next_state, _ = self.memory.sample(1)
            action = action[0][0]
            state = state[0][0]
            next_state = next_state[0][0]
            # Empirical action distribution of the expert in this state.
            self.counter[state, action] += 1
            total_num = np.sum(self.counter[state, :])
            action_prob = self.counter[state] / total_num
            assert np.isclose(np.sum(action_prob), 1)
            # Update Q_shift towards gamma * max_a' Q_inverse(s', a').
            Q_shift_target = self.lr_sh * (self.gamma_iql * np.max(self.Q_inverse[next_state]))
            self.Q_shift[state, action] = ((1 - self.lr_sh) * self.Q_shift[state, action]) + Q_shift_target
            # Compute n(s, a) = log pi_expert(a|s) - Q_shift(s, a).
            if action_prob[action] == 0:
                action_prob[action] = np.finfo(float).eps
            n_a = np.log(action_prob[action]) - self.Q_shift[state, action]
            # Update the reward function, then the inverse Q function.
            self.update_r(state, action, n_a, action_prob)
            self.update_q(state, action, next_state)
            # self.debug_train()
            # self.policy_diff(state, action)

    def update_q(self, state, action, next_state):
        q_old = (1 - self.lr_iql_q) * self.Q_inverse[state, action]
        q_new = self.lr_iql_q * (self.r[state, action] + (self.gamma_iql * np.max(self.Q_inverse[next_state])))
        self.Q_inverse[state, action] = q_old + q_new

    def update_r(self, state, action, n_a, action_prob):
        r_old = (1 - self.lr_iql_r) * self.r[state, action]
        part1 = n_a
        part2 = self.ratio * self.sum_over_action(state, action, action_prob)
        r_new = self.lr_iql_r * (part1 + part2)
        self.r[state, action] = r_old + r_new

    def sum_over_action(self, state, a, action_prob):
        # Sum over all actions b != a of r(s, b) - n(s, b).
        res = 0
        for b in range(self.action_size):
            if b == a:
                continue
            res = res + (self.r[state, b] - self.compute_n_a(state, b, action_prob))
        return res

    def compute_n_a(self, state, action, action_prob):
        if action_prob[action] == 0:
            action_prob[action] = np.finfo(float).eps
        return np.log(action_prob[action]) - self.Q_shift[state, action]

    def start_reward(self):
        # Quick sanity check: compare expert and inverse Q-values for the start state.
        self.env.seed(1)
        state = self.env.reset()
        print(state)
        ns, r, d, _ = self.env.step(0)
        np.set_printoptions(precision=2)
        print(" expert q {}".format(self.trained_Q[state]))
        print("inverse q {}".format(self.Q_inverse[state]))

    def eval_policy(self, random_agent=False, use_expert=False, use_debug=False, use_inverse=False, episode=10):
        if use_expert:
            self.load_q_table()
        total_steps = 0
        total_reward = 0
        total_penalties = 0
        for i_episode in range(1, episode + 1):
            score = 0
            steps = 0
            state = self.env.reset()
            done = False
            penalty = 0
            while not done:
                steps += 1
                if use_expert:
                    action = np.argmax(self.trained_Q[state])
                elif random_agent:
                    action = self.env.action_space.sample()
                elif use_debug:
                    action = np.argmax(self.debug_Q[state])
                elif use_inverse:
                    action = np.argmax(self.Q_inverse[state])
                else:
                    action = self.act(state, 0, True)
                next_state, reward, done, _ = self.env.step(action)
                state = next_state
                if self.render_env:
                    self.env.render()
                    time.sleep(0.1)
                score += reward
                if reward == -10:
                    penalty += 1
                if done:
                    total_steps += steps
                    total_reward += score
                    total_penalties += penalty
                    break
        if self.render_env:
            self.env.close()
        aver_steps = total_steps / episode
        average_reward = total_reward / episode
        aver_penalties = total_penalties / episode
        if use_expert:
            print("Expert avg steps {} average reward {:.2f} average penalty {} ".format(aver_steps, average_reward, aver_penalties))
        elif random_agent:
            print("Random Eval avg steps {} average reward {:.2f} average penalty {} ".format(aver_steps, average_reward, aver_penalties))
        elif use_inverse:
            print("Inverse q Eval avg steps {} average reward {:.2f} average penalty {} ".format(aver_steps, average_reward, aver_penalties))
        else:
            print("Eval avg steps {} average reward {:.2f} average penalty {} ".format(aver_steps, average_reward, aver_penalties))
        self.writer.add_scalar('Eval_Average_steps', aver_steps, self.steps)
        self.writer.add_scalar('Eval_Average_reward', average_reward, self.steps)
        self.writer.add_scalar('Eval_Average_penalties', aver_penalties, self.steps)

    def save_q_table(self, table="Q", filename="policy"):
        mkdir("", filename)
        if table == "Q":
            with open(filename + '/Q.npy', 'wb') as f:
                np.save(f, self.Q)
        if table == "inverse_Q":
            with open(filename + '/Inverse_Q.npy', 'wb') as f:
                np.save(f, self.Q_inverse)

    def load_q_table(self, table="Q", filename="policy"):
        if table == "Q":
            with open(filename + '/Q.npy', 'rb') as f:
                self.Q = np.load(f)
        if table == "inverse_Q":
            with open(filename + '/Inverse_Q.npy', 'rb') as f:
                self.Q_inverse = np.load(f)
        self.trained_Q = self.Q

    def save_r_table(self, filename="reward_function"):
        mkdir("", filename)
        with open(filename + '/r.npy', 'wb') as f:
            np.save(f, self.r)

    def load_r_table(self, filename="reward_function"):
        with open(filename + '/r.npy', 'rb') as f:
            self.r = np.load(f)

    def eval_inverse(self):
        self.load_q_table(table="inverse_Q")
        for i_episode in range(1, 11):
            score = 0
            steps = 0
            penalties = 0
            state = self.env.reset()
            done = False
            while not done:
                steps += 1
                action = np.argmax(self.Q_inverse[state])
                next_state, reward, done, _ = self.env.step(action)
                score += reward
                if reward == -10:
                    penalties += 1
                state = next_state
            print("Inverse steps {} reward {:.2f} penalties {} ".format(steps, score, penalties))

    def create_expert_policy(self):
        """Roll out the trained expert policy and store its transitions in the replay buffer."""
        self.load_q_table()
        self.trained_Q = self.Q
        for i_episode in range(1, self.expert_buffer_size + 1):
            text = "create Buffer {} of {}\r".format(i_episode, self.expert_buffer_size)
            print(text, end=" ")
            state = self.env.reset()
            done = False
            score = 0
            while True:
                action = self.act(state, 0, True)
                next_state, reward, done, _ = self.env.step(action)
                score += reward
                self.memory.add(state, action, reward, next_state, done, done)
                state = next_state
                if done:
                    break
        self.memory.save_memory("memory")

    def policy_diff(self, state, expert_action):
        # Track how often the inverse policy picks the same action as the expert.
        action = np.argmax(self.Q_inverse[state])
        self.average_same_action.append(int(action == expert_action))
    def debug_train(self):
        """Use the learned reward function (instead of the env reward) to train a debug agent."""
        state = self.env.reset()
        done = False
        score = 0
        self.steps += 1
        episode_steps = 0
        while True:
            action = self.act(state, 0, True)
            next_state, _, done, _ = self.env.step(action)
            reward = self.r[state, action]
            self.optimize(state, action, reward, next_state, debug=True)
            score += reward
            episode_steps += 1
            if done:
                break
            state = next_state
        self.total_reward += score
        average_reward = self.total_reward / self.steps
        print("Episode {} Reward {:.2f} Average Reward {:.2f} epi steps {}".format(self.steps, score, average_reward, episode_steps))

    def train(self):
        """Train the expert policy with tabular Q-learning and epsilon-greedy exploration."""
        total_timestep = 0
        for i_episode in range(1, self.episode + 1):
            score = 0
            state = self.env.reset()
            done = False
            steps = 0
            while not done:
                self.steps += 1
                steps += 1
                total_timestep += 1
                action = self.act(state, self.epsilon)
                next_state, reward, done, _ = self.env.step(action)
                score += reward
                self.optimize(state, action, reward, next_state)
                self.epsilon = self.min_epsilon + (self.max_epsilon - self.min_epsilon) * np.exp(-self.decay * i_episode)
                if done:
                    break
                state = next_state
            if i_episode % self.eval_frq == 0:
                self.eval_policy()
            self.total_reward += score
            average_reward = self.total_reward / i_episode
            print("Episode {} Reward {:.2f} Average Reward {:.2f} steps {} epsilon {:.2f}".format(i_episode, score, average_reward, steps, self.epsilon))
            self.writer.add_scalar('Average_reward', average_reward, self.steps)
            self.writer.add_scalar('Train_reward', score, self.steps)
        self.trained_Q = self.Q
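# ---------------------------------------------------------------------------
# Example usage (illustrative sketch only, not the original experiment setup).
# The config keys below mirror the ones read in Agent.__init__; the concrete
# values and the Taxi-v3 environment are assumptions: eval_policy counts
# rewards of -10 as penalties, which matches Taxi's illegal pickup/dropoff.
# ---------------------------------------------------------------------------
config = {
    "env_name": "Taxi-v3",   # assumed discrete-state environment (500 states, 6 actions)
    "gamma": 0.99,
    "lr": 0.1,
    "lr_iql_q": 0.01,
    "lr_iql_r": 0.01,
    "lr_q_sh": 0.01,
    "min_epsilon": 0.01,
    "decay": 0.001,
    "buffer_size": 100000,
    "expert_buffer_size": 1000,
    "freq_q": 1,
    "device": "cpu",
    "locexp": "experiments",
}
iql_agent = Agent(state_size=500, action_size=6, config=config)
iql_agent.train()                 # learn the expert Q-table from env rewards
iql_agent.save_q_table()          # persist it so create_expert_policy can load it
iql_agent.create_expert_policy()  # roll out the expert and fill the replay buffer
iql_agent.invers_q()              # recover the reward table and Q_inverse from the demos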
# Collect expert demonstrations for LunarLander-v2 with a pre-trained DQN agent
# and save them as a replay buffer.
import gym
import torch

# Project-local helpers (assumed import paths; adjust to the repository layout).
from dqn_agent import DQNAgent
from replay_buffer import ReplayBuffer
from utils import mkdir

env = gym.make('LunarLander-v2')
env.seed(0)
print('State shape: ', env.observation_space.shape)
print('Number of actions: ', env.action_space.n)
agent = DQNAgent(state_size=8, action_size=4, seed=0)
agent.qnetwork_local.load_state_dict(torch.load('checkpoint.pth'))
memory = ReplayBuffer((8,), (1,), 20000, 'cuda')
n_episodes = 40
max_t = 500
eps = 0  # fully greedy: act as the expert
for i_episode in range(1, n_episodes + 1):
    state = env.reset()
    score = 0
    for t in range(max_t):
        action = agent.act(state, eps)
        next_state, reward, done, _ = env.step(action)
        score += reward
        memory.add(state, action, reward, next_state, done, done)
        state = next_state
        # env.render()
        if done:
            print("Episode {} Reward {}".format(i_episode, score))
            break
mkdir("", "expert_policy")
print("save memory ...")
memory.save_memory("expert_policy")
print("... memory saved")
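# ---------------------------------------------------------------------------
# Note: the Agent class and the script above both call a project-local
# mkdir(base, name) helper that is not shown here. A minimal sketch of such a
# helper, assuming it only needs to create the target directory if missing:
# ---------------------------------------------------------------------------
import os


def mkdir(base, name):
    """Create base/name if it does not exist yet and return the path."""
    path = os.path.join(base, name)
    if not os.path.exists(path):
        os.makedirs(path)
    return path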