def eval(opt, global_model, num_states, num_actions):
    """Play the level greedily, forever, with the latest shared weights.

    A local ``PPO`` copy is refreshed from ``global_model`` at the start of
    every episode. When ``info["flag_get"]`` is truthy the local weights are
    saved and the process exits.

    Args:
        opt: Options object. Reads ``action_type``, ``world``, ``stage``,
            ``max_actions``, ``num_global_steps``, ``saved_path`` and
            ``saved_episode``.
        global_model: Model whose ``state_dict()`` is copied into the local
            model whenever an episode starts.
        num_states: First argument forwarded to ``PPO``.
        num_actions: Second argument forwarded to ``PPO``.

    Raises:
        SystemExit: After the weights are saved on ``info["flag_get"]``.
    """
    torch.manual_seed(123)
    if opt.action_type == "right":
        actions = RIGHT_ONLY
    elif opt.action_type == "simple":
        actions = SIMPLE_MOVEMENT
    else:
        actions = COMPLEX_MOVEMENT
    env = create_train_env(opt.world, opt.stage, actions)

    use_cuda = torch.cuda.is_available()
    local_model = PPO(num_states, num_actions)
    if use_cuda:
        local_model.cuda()
    local_model.eval()

    state = torch.from_numpy(env.reset())
    if use_cuda:
        state = state.cuda()

    # Rendering is only attempted when an X display is advertised; checked
    # once because it is not expected to change while the process runs.
    have_display = "DISPLAY" in os.environ

    done = True  # forces a weight refresh on the very first step
    curr_step = 0
    # Sliding window of recent actions, used to detect a stuck agent.
    actions = deque(maxlen=opt.max_actions)
    while True:
        curr_step += 1
        if done:
            local_model.load_state_dict(global_model.state_dict())

        # Inference only: do not record an autograd graph.
        with torch.no_grad():
            logits, _ = local_model(state)
        policy = F.softmax(logits, dim=1)
        action = torch.argmax(policy).item()
        state, reward, done, info = env.step(action)

        # Save the weights and stop as soon as the level is completed.
        if info['flag_get']:
            print(
                "############### The model is finished .saving the model ###############"
            )
            torch.save(
                local_model.state_dict(),
                "{}/ppo_full_finished_{}_{}_{}".format(
                    opt.saved_path, opt.world, opt.stage, opt.saved_episode))
            # The site-provided exit() is meant for interactive sessions;
            # raise SystemExit directly instead.
            raise SystemExit

        if have_display:
            env.render()

        actions.append(action)
        # End the episode on the step budget or when the whole window holds
        # the same action.
        if (curr_step > opt.num_global_steps
                or actions.count(actions[0]) == actions.maxlen):
            done = True
        if done:
            curr_step = 0
            actions.clear()
            state = env.reset()
        state = torch.from_numpy(state)
        if use_cuda:
            state = state.cuda()
def eval(args, global_model, num_states, num_actions):
    """Play the game greedily, forever, and keep the best-scoring weights.

    A local ``PPO`` copy is refreshed from ``global_model`` at the start of
    every episode. When an episode's summed reward beats the best seen so
    far, the local weights are written to
    ``<saved_path>/model_best_<game>.pth``.

    Args:
        args: Options object. Reads ``game``, ``show_play`` and
            ``saved_path``.
        global_model: Model whose ``state_dict()`` is copied into the local
            model whenever an episode starts.
        num_states: First argument forwarded to ``PPO``.
        num_actions: Second argument forwarded to ``PPO``.
    """
    # torch.manual_seed seeds the CPU generator and all CUDA devices, so one
    # call covers both the GPU and the CPU-only case.
    torch.manual_seed(123)
    # Create the game environment.
    env = create_train_env(args.game)
    # Build the local network and move it to the GPU when one is available.
    use_cuda = torch.cuda.is_available()
    local_model = PPO(num_states, num_actions)
    if use_cuda:
        local_model.cuda()
    # Switch to evaluation mode.
    local_model.eval()
    # Convert the first observation to a PyTorch tensor.
    state = torch.from_numpy(env.reset())
    # Start with done=True so the weights are synchronised immediately.
    done = True
    curr_step = 0
    max_reward = 0
    while True:
        # Show the game window if requested.
        if args.show_play:
            env.render()
        curr_step += 1
        if use_cuda:
            state = state.cuda()
        # Refresh the weights and the score at the start of each episode.
        if done:
            local_model.load_state_dict(global_model.state_dict())
            total_reward = 0
        # Inference only: do not record an autograd graph.
        with torch.no_grad():
            logits, _ = local_model(state)
        # Pick the most probable action.
        policy = F.softmax(logits, dim=1)
        action = torch.argmax(policy).item()
        # Advance the game.
        state, reward, done, info = env.step(action)
        total_reward += reward
        # Episode finished: report the score, reset, keep the best weights.
        if done:
            print("游戏得分:%f" % total_reward)
            curr_step = 0
            state = env.reset()
            if max_reward < total_reward:
                torch.save(
                    local_model.state_dict(),
                    "{}/model_best_{}.pth".format(args.saved_path, args.game))
                max_reward = total_reward
        # Convert the next observation to a tensor.
        state = torch.from_numpy(state)
def eval(opt, global_model, num_states, num_actions):
    """Play the level greedily, forever, saving weights on each completion.

    A local ``PPO`` copy is refreshed from ``global_model`` at the start of
    every episode. Each time ``info["flag_get"]`` is truthy the local
    weights are saved under a name that includes the current step count;
    evaluation then continues.

    Args:
        opt: Options object. Reads ``action_type``, ``world``, ``stage``,
            ``max_actions``, ``num_global_steps`` and ``saved_path``.
        global_model: Model whose ``state_dict()`` is copied into the local
            model whenever an episode starts.
        num_states: First argument forwarded to ``PPO``.
        num_actions: Second argument forwarded to ``PPO``.
    """
    torch.manual_seed(123)
    if opt.action_type == "right":
        actions = RIGHT_ONLY
    elif opt.action_type == "simple":
        actions = SIMPLE_MOVEMENT
    else:
        actions = COMPLEX_MOVEMENT
    env = create_train_env(opt.world, opt.stage, actions)

    use_cuda = torch.cuda.is_available()
    local_model = PPO(num_states, num_actions)
    if use_cuda:
        local_model.cuda()
    local_model.eval()

    state = torch.from_numpy(env.reset())
    if use_cuda:
        state = state.cuda()

    done = True  # forces a weight refresh on the very first step
    curr_step = 0
    # Sliding window of recent actions, used to detect a stuck agent.
    actions = deque(maxlen=opt.max_actions)
    while True:
        curr_step += 1
        if done:
            local_model.load_state_dict(global_model.state_dict())

        # Inference only: do not record an autograd graph.
        with torch.no_grad():
            logits, _ = local_model(state)
        policy = F.softmax(logits, dim=1)
        action = torch.argmax(policy).item()
        state, reward, done, info = env.step(action)

        # Save the weights whenever the level is completed.
        if info["flag_get"]:
            torch.save(
                local_model.state_dict(),
                "{}/ppo_super_mario_bros_{}_{}_{}".format(
                    opt.saved_path, opt.world, opt.stage, curr_step))

        actions.append(action)
        # End the episode on the step budget or when the whole window holds
        # the same action.
        if (curr_step > opt.num_global_steps
                or actions.count(actions[0]) == actions.maxlen):
            done = True
        if done:
            curr_step = 0
            actions.clear()
            state = env.reset()
        state = torch.from_numpy(state)
        if use_cuda:
            state = state.cuda()
def test(opt, global_model, num_states, num_actions):
    """Play ``opt.level`` greedily, forever, rendering every step.

    A local ``PPO`` copy is refreshed from ``global_model`` at the start of
    every episode. The local weights are saved whenever the episode ends
    with ``info["lives"] != 0`` or ``info["level"] == opt.level``.

    Args:
        opt: Options object. Reads ``level``, ``max_actions``,
            ``num_max_steps`` and ``saved_path``.
        global_model: Model whose ``state_dict()`` is copied into the local
            model whenever an episode starts.
        num_states: First argument forwarded to ``PPO``.
        num_actions: Second argument forwarded to ``PPO``.
    """
    torch.manual_seed(123)
    env = create_train_env(opt.level)

    use_cuda = torch.cuda.is_available()
    local_model = PPO(num_states, num_actions)
    if use_cuda:
        local_model.cuda()
    local_model.eval()

    state = torch.from_numpy(env.reset())
    if use_cuda:
        state = state.cuda()

    done = True  # forces a weight refresh on the very first step
    curr_step = 0
    # Sliding window of recent actions, used to detect a stuck agent.
    actions = deque(maxlen=opt.max_actions)
    while True:
        curr_step += 1
        if done:
            local_model.load_state_dict(global_model.state_dict())

        # Inference only: do not record an autograd graph.
        with torch.no_grad():
            logits, _ = local_model(state)
        policy = F.softmax(logits, dim=1)
        action = torch.argmax(policy).item()
        state, reward, done, info = env.step(action)

        if (done and info["lives"] != 0) or info["level"] == opt.level:
            torch.save(
                local_model.state_dict(),
                "{}/ppo_contra_success_{}".format(opt.saved_path,
                                                  info["lives"]))
        env.render()

        actions.append(action)
        # End the episode on the step budget or when the whole window holds
        # the same action.
        if (curr_step > opt.num_max_steps
                or actions.count(actions[0]) == actions.maxlen):
            done = True
        if done:
            curr_step = 0
            actions.clear()
            state = env.reset()
        state = torch.from_numpy(state)
        if use_cuda:
            state = state.cuda()
def train(opt):
    """Train a ``PPO`` model on parallel environments; never returns.

    Each loop iteration collects ``opt.num_local_steps`` steps from every
    environment, computes GAE-style returns and advantages, and runs
    ``opt.num_epochs`` passes of clipped-surrogate updates.

    Side effects: deletes and recreates ``opt.log_path``, creates
    ``opt.saved_path``, starts a spawned process running ``eval``, saves
    checkpoints every ``opt.save_interval`` iterations, and rewrites
    ``ppo_*_episode_<timestamp>`` PDF/CSV files in the working directory
    on every iteration.

    Args:
        opt: Options object. Reads ``log_path``, ``saved_path``, ``world``,
            ``stage``, ``action_type``, ``num_processes``, ``lr``,
            ``save_interval``, ``num_local_steps``, ``gamma``, ``tau``,
            ``num_epochs``, ``batch_size``, ``epsilon`` and ``beta``.
    """
    # torch.manual_seed seeds the CPU generator and all CUDA devices.
    # Seeding only CUDA would leave CPU-side randomness (torch.randperm
    # below) unseeded on GPU hosts.
    torch.manual_seed(123)
    if os.path.isdir(opt.log_path):
        shutil.rmtree(opt.log_path)
    os.makedirs(opt.log_path)
    if not os.path.isdir(opt.saved_path):
        os.makedirs(opt.saved_path)

    mp = _mp.get_context("spawn")
    envs = MultipleEnvironments(opt.world, opt.stage, opt.action_type,
                                opt.num_processes)
    use_cuda = torch.cuda.is_available()
    model = PPO(envs.num_states, envs.num_actions)
    if use_cuda:
        model.cuda()
    model.share_memory()
    process = mp.Process(target=eval,
                         args=(opt, model, envs.num_states, envs.num_actions))
    process.start()
    optimizer = torch.optim.Adam(model.parameters(), lr=opt.lr)

    for agent_conn in envs.agent_conns:
        agent_conn.send(("reset", None))
    curr_states = [agent_conn.recv() for agent_conn in envs.agent_conns]
    curr_states = torch.from_numpy(np.concatenate(curr_states, 0))
    if use_cuda:
        curr_states = curr_states.cuda()

    curr_episode = 0
    episode_plot = []
    R_plot = []
    ep_reward_plot = []
    start_datetime = datetime.datetime.now().strftime("%m-%d_%H-%M")
    # Number of transitions gathered per iteration, and the (float)
    # minibatch stride used to slice the shuffled indices.
    num_samples = opt.num_local_steps * opt.num_processes
    chunk = num_samples / opt.batch_size
    while True:
        if curr_episode % opt.save_interval == 0 and curr_episode > 0:
            torch.save(
                model.state_dict(),
                "{}/ppo_super_mario_bros_{}_{}_{}".format(
                    opt.saved_path, opt.world, opt.stage, curr_episode))
        curr_episode += 1
        episode_plot.append(int(curr_episode))

        old_log_policies = []
        actions = []
        step_values = []
        states = []
        rewards = []
        dones = []
        # Rollout outputs are used only as constants during the update, so
        # no autograd graph is recorded while collecting them.
        with torch.no_grad():
            for _ in range(opt.num_local_steps):
                states.append(curr_states)
                logits, value = model(curr_states)
                step_values.append(value.squeeze())
                policy = F.softmax(logits, dim=1)
                old_m = Categorical(policy)
                action = old_m.sample()
                actions.append(action)
                old_log_policies.append(old_m.log_prob(action))

                # .cpu() is a no-op for tensors that already live on the CPU.
                for agent_conn, act in zip(envs.agent_conns, action.cpu()):
                    agent_conn.send(("step", act))
                state, reward, done, info = zip(
                    *[agent_conn.recv() for agent_conn in envs.agent_conns])
                state = torch.from_numpy(np.concatenate(state, 0))
                if use_cuda:
                    state = state.cuda()
                    reward = torch.cuda.FloatTensor(reward)
                    done = torch.cuda.FloatTensor(done)
                else:
                    reward = torch.FloatTensor(reward)
                    done = torch.FloatTensor(done)
                rewards.append(reward)
                dones.append(done)
                curr_states = state

            _, next_value = model(curr_states)
            next_value = next_value.squeeze()

        old_log_policies = torch.cat(old_log_policies)
        actions = torch.cat(actions)
        values = torch.cat(step_values)
        states = torch.cat(states)

        # Walk the rollout backwards. Iterate the per-step value tensors,
        # not the flattened `values`: zipping the flat tensor with the
        # per-step reward/done tensors pairs each step with an unrelated
        # scalar.
        # NOTE(review): the running `gae` is not masked by (1 - done), so
        # it carries across episode boundaries - confirm whether intended.
        gae = 0
        R = []
        for value, reward, done in reversed(
                list(zip(step_values, rewards, dones))):
            gae = gae * opt.gamma * opt.tau
            gae = gae + reward + opt.gamma * next_value * (1 - done) - value
            next_value = value
            R.append(gae + value)
        R = R[::-1]
        R = torch.cat(R).detach()
        advantages = R - values

        print("mean big R:", torch.mean(R).item())
        episode_reward_mean = torch.stack(rewards).mean(
            dim=1, keepdim=True).sum().item()
        print("mean reward", episode_reward_mean)
        R_plot.append(torch.mean(R).item())
        ep_reward_plot.append(episode_reward_mean)

        plt.plot(episode_plot, R_plot, "r-")
        plt.xlabel('Episode')
        plt.ylabel('Mean R (PPO)')
        plt.savefig("ppo_R_episode_{}.pdf".format(start_datetime))
        plt.close()
        plt.plot(episode_plot, ep_reward_plot, "r-")
        plt.xlabel('Episode')
        plt.ylabel('Mean Reward (PPO)')
        plt.savefig("ppo_reward_episode_{}.pdf".format(start_datetime))
        plt.close()
        np.savetxt("ppo_R_episode_{}.csv".format(start_datetime),
                   np.array(R_plot), delimiter=",")
        np.savetxt("ppo_reward_episode_{}.csv".format(start_datetime),
                   np.array(ep_reward_plot), delimiter=",")

        for _ in range(opt.num_epochs):
            indice = torch.randperm(num_samples)
            # opt.batch_size is the number of minibatches per epoch.
            for j in range(opt.batch_size):
                batch_indices = indice[int(j * chunk):int((j + 1) * chunk)]
                logits, value = model(states[batch_indices])
                new_policy = F.softmax(logits, dim=1)
                new_m = Categorical(new_policy)
                new_log_policy = new_m.log_prob(actions[batch_indices])
                # Probability ratio between the new and the old policy.
                ratio = torch.exp(new_log_policy -
                                  old_log_policies[batch_indices])
                # Clipped surrogate objective.
                actor_loss = -torch.mean(
                    torch.min(
                        ratio * advantages[batch_indices],
                        torch.clamp(ratio, 1.0 - opt.epsilon,
                                    1.0 + opt.epsilon) *
                        advantages[batch_indices]))
                # smooth_l1_loss takes (input, target).
                critic_loss = F.smooth_l1_loss(value.squeeze(),
                                               R[batch_indices])
                entropy_loss = torch.mean(new_m.entropy())
                total_loss = actor_loss + critic_loss - opt.beta * entropy_loss
                optimizer.zero_grad()
                total_loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
                optimizer.step()
        print("Episode: {}. Total loss: {}".format(curr_episode, total_loss))
def train(opt):
    """Train a ``PPO`` model on in-process environments; never returns.

    Each loop iteration collects ``opt.num_local_steps`` steps from every
    environment in ``envs.envs``, computes GAE-style returns and
    advantages, runs ``opt.num_epochs`` passes of clipped-surrogate
    updates, and appends one row of statistics to ``PPO_train.csv``.

    Side effects: rewrites ``opt.saved_path`` to
    ``<cwd>/baselines/PPO/<saved_path>`` and creates that directory,
    truncates and writes ``PPO_train.csv`` there, optionally starts a
    spawned ``evaluate`` process (when ``TEST_ON_THE_GO`` is truthy), and
    optionally saves checkpoints.

    Args:
        opt: Options object. Reads ``saved_path``, ``world``, ``stage``,
            ``action_type``, ``num_processes``, ``cortex_left``,
            ``cortex_right``, ``retina_resolution``, ``retina``,
            ``save_video``, ``lr``, ``save_with_interval``,
            ``save_interval``, ``num_local_steps``, ``gamma``, ``tau``,
            ``num_epochs``, ``batch_size``, ``epsilon`` and ``beta``.
    """
    # torch.manual_seed seeds the CPU generator and all CUDA devices.
    # Seeding only CUDA would leave CPU-side randomness (torch.randperm
    # below) unseeded on GPU hosts.
    torch.manual_seed(123)

    opt.saved_path = os.getcwd() + '/baselines/PPO/' + opt.saved_path
    if not os.path.isdir(opt.saved_path):
        os.makedirs(opt.saved_path)
    savefile = opt.saved_path + '/PPO_train.csv'
    print(savefile)
    title = ['Loops', 'Steps', 'Time', 'AvgLoss', 'MeanReward', "StdReward",
             "TotalReward", "Flags"]
    with open(savefile, 'w', newline='') as sfile:
        writer = csv.writer(sfile)
        writer.writerow(title)

    # Create environments.
    envs = MultipleEnvironments(opt.world, opt.stage, opt.action_type,
                                opt.num_processes, opt.cortex_left,
                                opt.cortex_right, opt.retina_resolution,
                                opt.retina, opt.save_video)

    # Create model and optimizer.
    use_cuda = torch.cuda.is_available()
    model = PPO(envs.num_states, envs.num_actions)
    if use_cuda:
        model.cuda()
    model.share_memory()
    optimizer = torch.optim.Adam(model.parameters(), lr=opt.lr)

    # Start the evaluation process.
    if TEST_ON_THE_GO:
        mp = _mp.get_context("spawn")
        process = mp.Process(target=evaluate,
                             args=(opt, model, envs.num_states,
                                   envs.num_actions))
        process.start()

    # Reset the environments (stepped directly, in this process).
    curr_states = [env.reset() for env in envs.envs]
    curr_states = torch.from_numpy(np.concatenate(curr_states, 0))
    if use_cuda:
        curr_states = curr_states.cuda()

    tot_loops = 0
    tot_steps = 0
    # Number of transitions gathered per loop, and the (float) minibatch
    # stride used to slice the shuffled indices.
    num_samples = opt.num_local_steps * opt.num_processes
    chunk = num_samples / opt.batch_size

    while True:
        # Periodic checkpoint.
        if opt.save_with_interval:
            if tot_loops % opt.save_interval == 0 and tot_loops > 0:
                torch.save(
                    model.state_dict(),
                    "{}/ppo_super_mario_bros_{}_{}_{}".format(
                        opt.saved_path, opt.world, opt.stage, tot_loops))

        start_time = time.time()

        # Accumulate evidence.
        tot_loops += 1
        old_log_policies = []
        actions = []
        step_values = []
        states = []
        rewards = []
        dones = []
        flags = []
        # Rollout outputs are used only as constants during the update, so
        # no autograd graph is recorded while collecting them.
        with torch.no_grad():
            for _ in range(opt.num_local_steps):
                # From the current states, sample an action per environment.
                states.append(curr_states)
                logits, value = model(curr_states)
                step_values.append(value.squeeze())
                policy = F.softmax(logits, dim=1)
                old_m = Categorical(policy)
                action = old_m.sample()
                actions.append(action)
                old_log_policies.append(old_m.log_prob(action))

                # Apply the sampled actions; .cpu() is a no-op on CPU tensors.
                result = [env.step(act.item())
                          for env, act in zip(envs.envs, action.cpu())]
                state, reward, done, info = zip(*result)
                state = torch.from_numpy(np.concatenate(state, 0))
                if use_cuda:
                    state = state.cuda()
                    reward = torch.cuda.FloatTensor(reward)
                    done = torch.cuda.FloatTensor(done)
                else:
                    reward = torch.FloatTensor(reward)
                    done = torch.FloatTensor(done)
                rewards.append(reward)
                dones.append(done)
                flags.append(check_flag(info) / opt.num_processes)
                curr_states = state

            _, next_value = model(curr_states)
            next_value = next_value.squeeze()

        # Training stage.
        old_log_policies = torch.cat(old_log_policies)
        actions = torch.cat(actions)
        values = torch.cat(step_values)
        states = torch.cat(states)

        # Walk the rollout backwards. Iterate the per-step value tensors,
        # not the flattened `values`: zipping the flat tensor with the
        # per-step reward/done tensors pairs each step with an unrelated
        # scalar.
        # NOTE(review): the running `gae` is not masked by (1 - done), so
        # it carries across episode boundaries - confirm whether intended.
        gae = 0
        R = []
        for value, reward, done in reversed(
                list(zip(step_values, rewards, dones))):
            gae = gae * opt.gamma * opt.tau
            gae = gae + reward + opt.gamma * next_value * (1 - done) - value
            next_value = value
            R.append(gae + value)
        R = R[::-1]
        R = torch.cat(R).detach()
        advantages = R - values

        avg_loss = []
        for _ in range(opt.num_epochs):
            indice = torch.randperm(num_samples)
            # opt.batch_size is the number of minibatches per epoch.
            for j in range(opt.batch_size):
                batch_indices = indice[int(j * chunk):int((j + 1) * chunk)]
                logits, value = model(states[batch_indices])
                new_policy = F.softmax(logits, dim=1)
                new_m = Categorical(new_policy)
                new_log_policy = new_m.log_prob(actions[batch_indices])
                ratio = torch.exp(new_log_policy -
                                  old_log_policies[batch_indices])
                # Clipped surrogate objective.
                actor_loss = -torch.mean(
                    torch.min(
                        ratio * advantages[batch_indices],
                        torch.clamp(ratio, 1.0 - opt.epsilon,
                                    1.0 + opt.epsilon) *
                        advantages[batch_indices]))
                # smooth_l1_loss takes (input, target).
                critic_loss = F.smooth_l1_loss(value.squeeze(),
                                               R[batch_indices])
                entropy_loss = torch.mean(new_m.entropy())
                total_loss = actor_loss + critic_loss - opt.beta * entropy_loss
                optimizer.zero_grad()
                total_loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
                optimizer.step()
                avg_loss.append(total_loss.item())

        avg_loss = np.mean(avg_loss)
        all_rewards = torch.cat(rewards).cpu().numpy()
        tot_steps += num_samples
        sum_reward = np.sum(all_rewards)
        mu_reward = np.mean(all_rewards)
        std_reward = np.std(all_rewards)
        any_flags = np.sum(flags)
        ep_time = time.time() - start_time
        data = [tot_loops, tot_steps, "{:.6f}".format(ep_time),
                "{:.4f}".format(avg_loss), "{:.4f}".format(mu_reward),
                "{:.4f}".format(std_reward), "{:.2f}".format(sum_reward),
                any_flags]
        with open(savefile, 'a', newline='') as sfile:
            writer = csv.writer(sfile)
            writer.writerow(data)
        print("Steps: {}. Total loss: {}".format(tot_steps, total_loss))
def train(args):
    """Train a ``PPO`` model on parallel game environments; never returns.

    Each loop iteration collects ``args.num_local_steps`` steps from every
    environment, computes GAE-style returns and advantages, runs
    ``args.num_epochs`` passes of clipped-surrogate updates, and overwrites
    ``<saved_path>/model_<game>.pth`` with the current weights.

    Side effects: creates ``args.saved_path`` and starts a spawned process
    running ``eval``.

    Args:
        args: Options object. Reads ``saved_path``, ``game``,
            ``num_processes``, ``trained_model``, ``lr``,
            ``num_local_steps``, ``gamma``, ``tau``, ``num_epochs``,
            ``batch_size``, ``epsilon`` and ``beta``.
    """
    # torch.manual_seed seeds the CPU generator and all CUDA devices.
    # Seeding only CUDA would leave CPU-side randomness (torch.randperm
    # below) unseeded on GPU hosts.
    torch.manual_seed(123)
    # Create the directory that receives saved models.
    if not os.path.isdir(args.saved_path):
        os.makedirs(args.saved_path)
    # Create the multi-process game environments.
    envs = MultipleEnvironments(args.game, args.num_processes)
    # Create the model.
    model = PPO(envs.num_states, envs.num_actions)
    # Optionally load pretrained weights. The model is still on the CPU
    # here, so map storages to the CPU; this also lets a checkpoint saved
    # on a GPU load on a CPU-only host.
    if args.trained_model is not None:
        model.load_state_dict(
            torch.load(args.trained_model,
                       map_location=lambda storage, loc: storage))
    # Train on the GPU when available.
    use_cuda = torch.cuda.is_available()
    if use_cuda:
        model.cuda()
    model.share_memory()
    # Run evaluation in its own process.
    mp = _mp.get_context("spawn")
    process = mp.Process(target=eval,
                         args=(args, model, envs.num_states, envs.num_actions))
    process.start()
    # Create the optimizer.
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
    # Reset every environment and fetch the initial frames.
    for agent_conn in envs.agent_conns:
        agent_conn.send(("reset", None))
    curr_states = [agent_conn.recv() for agent_conn in envs.agent_conns]
    curr_states = torch.from_numpy(np.concatenate(curr_states, 0))
    if use_cuda:
        curr_states = curr_states.cuda()

    curr_episode = 0
    # Number of transitions gathered per iteration, and the (float)
    # minibatch stride used to slice the shuffled indices.
    num_samples = args.num_local_steps * args.num_processes
    chunk = num_samples / args.batch_size
    while True:
        curr_episode += 1
        old_log_policies = []
        actions = []
        step_values = []
        states = []
        rewards = []
        dones = []
        # Play the game to collect data. Rollout outputs are used only as
        # constants during the update, so no autograd graph is recorded.
        with torch.no_grad():
            for _ in range(args.num_local_steps):
                states.append(curr_states)
                # Run the prediction.
                logits, value = model(curr_states)
                # Probability of each action.
                policy = F.softmax(logits, dim=1)
                # Sample an action according to those probabilities.
                old_m = Categorical(policy)
                action = old_m.sample()
                # Record the predictions.
                actions.append(action)
                step_values.append(value.squeeze())
                # Log-probability of the sampled action, used by the loss.
                old_log_policies.append(old_m.log_prob(action))
                # Send the actions to the game processes; .cpu() is a no-op
                # for tensors that already live on the CPU.
                for agent_conn, act in zip(envs.agent_conns, action.cpu()):
                    agent_conn.send(("step", act))
                # Gather the results from all game processes.
                state, reward, done, info = zip(
                    *[agent_conn.recv() for agent_conn in envs.agent_conns])
                # Convert to PyTorch tensors.
                state = torch.from_numpy(np.concatenate(state, 0))
                if use_cuda:
                    state = state.cuda()
                    reward = torch.cuda.FloatTensor(reward)
                    done = torch.cuda.FloatTensor(done)
                else:
                    reward = torch.FloatTensor(reward)
                    done = torch.FloatTensor(done)
                rewards.append(reward)
                dones.append(done)
                curr_states = state

            # Value estimate for the final frames.
            _, next_value = model(curr_states)
            next_value = next_value.squeeze()

        old_log_policies = torch.cat(old_log_policies)
        actions = torch.cat(actions)
        values = torch.cat(step_values)
        states = torch.cat(states)

        # Walk the rollout backwards. Iterate the per-step value tensors,
        # not the flattened `values`: zipping the flat tensor with the
        # per-step reward/done tensors pairs each step with an unrelated
        # scalar.
        # NOTE(review): the running `gae` is not masked by (1 - done), so
        # it carries across episode boundaries - confirm whether intended.
        gae = 0
        R = []
        for value, reward, done in reversed(
                list(zip(step_values, rewards, dones))):
            gae = gae * args.gamma * args.tau
            gae = gae + reward + args.gamma * next_value * (1 - done) - value
            next_value = value
            R.append(gae + value)
        R = R[::-1]
        R = torch.cat(R).detach()
        advantages = R - values

        total_losses = []
        for _ in range(args.num_epochs):
            indice = torch.randperm(num_samples)
            # args.batch_size is the number of minibatches per epoch.
            for j in range(args.batch_size):
                batch_indices = indice[int(j * chunk):int((j + 1) * chunk)]
                # Predict on the sampled frames.
                logits, value = model(states[batch_indices])
                # Probability of each action.
                new_policy = F.softmax(logits, dim=1)
                new_m = Categorical(new_policy)
                # Compute the loss (clipped surrogate objective).
                new_log_policy = new_m.log_prob(actions[batch_indices])
                ratio = torch.exp(new_log_policy -
                                  old_log_policies[batch_indices])
                actor_loss = -torch.mean(
                    torch.min(
                        ratio * advantages[batch_indices],
                        torch.clamp(ratio, 1.0 - args.epsilon,
                                    1.0 + args.epsilon) *
                        advantages[batch_indices]))
                # smooth_l1_loss takes (input, target).
                critic_loss = F.smooth_l1_loss(value.squeeze(),
                                               R[batch_indices])
                entropy_loss = torch.mean(new_m.entropy())
                total_loss = (actor_loss + critic_loss -
                              args.beta * entropy_loss)
                # Back-propagate and update.
                optimizer.zero_grad()
                total_loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
                optimizer.step()
                total_losses.append(float(total_loss))
        print("Episode: {}. Total loss: {:.4f}".format(
            curr_episode, np.mean(total_losses)))
        torch.save(model.state_dict(),
                   "{}/model_{}.pth".format(args.saved_path, args.game))
def train(opt):
    """Train a ``PPO`` model on parallel ``opt.level`` environments.

    Never returns. Each loop iteration collects ``opt.num_local_steps``
    steps from every environment, computes GAE-style returns and
    advantages, and runs ``opt.num_epochs`` passes of clipped-surrogate
    updates.

    Side effects: deletes and recreates ``opt.log_path``, creates
    ``opt.saved_path``, starts a spawned process running ``test``, and
    overwrites ``<saved_path>/ppo_contra_level<level>`` every
    ``opt.save_interval`` iterations.

    Args:
        opt: Options object. Reads ``log_path``, ``saved_path``, ``level``,
            ``num_processes``, ``lr``, ``save_interval``,
            ``num_local_steps``, ``gamma``, ``tau``, ``num_epochs``,
            ``batch_size``, ``epsilon`` and ``beta``.
    """
    # torch.manual_seed seeds the CPU generator and all CUDA devices.
    # Seeding only CUDA would leave CPU-side randomness (torch.randperm
    # below) unseeded on GPU hosts.
    torch.manual_seed(123)
    if os.path.isdir(opt.log_path):
        shutil.rmtree(opt.log_path)
    os.makedirs(opt.log_path)
    if not os.path.isdir(opt.saved_path):
        os.makedirs(opt.saved_path)

    mp = _mp.get_context("spawn")
    envs = MultipleEnvironments(opt.level, opt.num_processes)
    use_cuda = torch.cuda.is_available()
    model = PPO(envs.num_states, envs.num_actions)
    if use_cuda:
        model.cuda()
    model.share_memory()
    process = mp.Process(target=test,
                         args=(opt, model, envs.num_states, envs.num_actions))
    process.start()
    optimizer = torch.optim.Adam(model.parameters(), lr=opt.lr)

    for agent_conn in envs.agent_conns:
        agent_conn.send(("reset", None))
    curr_states = [agent_conn.recv() for agent_conn in envs.agent_conns]
    curr_states = torch.from_numpy(np.concatenate(curr_states, 0))
    if use_cuda:
        curr_states = curr_states.cuda()

    curr_episode = 0
    # Number of transitions gathered per iteration, and the (float)
    # minibatch stride used to slice the shuffled indices.
    num_samples = opt.num_local_steps * opt.num_processes
    chunk = num_samples / opt.batch_size
    while True:
        if curr_episode % opt.save_interval == 0 and curr_episode > 0:
            torch.save(
                model.state_dict(),
                "{}/ppo_contra_level{}".format(opt.saved_path, opt.level))
        curr_episode += 1

        old_log_policies = []
        actions = []
        step_values = []
        states = []
        rewards = []
        dones = []
        # Rollout outputs are used only as constants during the update, so
        # no autograd graph is recorded while collecting them.
        with torch.no_grad():
            for _ in range(opt.num_local_steps):
                states.append(curr_states)
                logits, value = model(curr_states)
                step_values.append(value.squeeze())
                policy = F.softmax(logits, dim=1)
                old_m = Categorical(policy)
                action = old_m.sample()
                actions.append(action)
                old_log_policies.append(old_m.log_prob(action))

                # .cpu() is a no-op for tensors that already live on the CPU.
                for agent_conn, act in zip(envs.agent_conns, action.cpu()):
                    agent_conn.send(("step", act))
                state, reward, done, info = zip(
                    *[agent_conn.recv() for agent_conn in envs.agent_conns])
                state = torch.from_numpy(np.concatenate(state, 0))
                if use_cuda:
                    state = state.cuda()
                    reward = torch.cuda.FloatTensor(reward)
                    done = torch.cuda.FloatTensor(done)
                else:
                    reward = torch.FloatTensor(reward)
                    done = torch.FloatTensor(done)
                rewards.append(reward)
                dones.append(done)
                curr_states = state

            _, next_value = model(curr_states)
            next_value = next_value.squeeze()

        old_log_policies = torch.cat(old_log_policies)
        actions = torch.cat(actions)
        values = torch.cat(step_values)
        states = torch.cat(states)

        # Walk the rollout backwards. Iterate the per-step value tensors,
        # not the flattened `values`: zipping the flat tensor with the
        # per-step reward/done tensors pairs each step with an unrelated
        # scalar.
        # NOTE(review): the running `gae` is not masked by (1 - done), so
        # it carries across episode boundaries - confirm whether intended.
        gae = 0
        R = []
        for value, reward, done in reversed(
                list(zip(step_values, rewards, dones))):
            gae = gae * opt.gamma * opt.tau
            gae = gae + reward + opt.gamma * next_value * (1 - done) - value
            next_value = value
            R.append(gae + value)
        R = R[::-1]
        R = torch.cat(R).detach()
        advantages = R - values

        for _ in range(opt.num_epochs):
            indice = torch.randperm(num_samples)
            # opt.batch_size is the number of minibatches per epoch.
            for j in range(opt.batch_size):
                batch_indices = indice[int(j * chunk):int((j + 1) * chunk)]
                logits, value = model(states[batch_indices])
                new_policy = F.softmax(logits, dim=1)
                new_m = Categorical(new_policy)
                new_log_policy = new_m.log_prob(actions[batch_indices])
                ratio = torch.exp(new_log_policy -
                                  old_log_policies[batch_indices])
                # Clipped surrogate objective.
                actor_loss = -torch.mean(
                    torch.min(
                        ratio * advantages[batch_indices],
                        torch.clamp(ratio, 1.0 - opt.epsilon,
                                    1.0 + opt.epsilon) *
                        advantages[batch_indices]))
                # smooth_l1_loss takes (input, target).
                critic_loss = F.smooth_l1_loss(value.squeeze(),
                                               R[batch_indices])
                entropy_loss = torch.mean(new_m.entropy())
                total_loss = actor_loss + critic_loss - opt.beta * entropy_loss
                optimizer.zero_grad()
                total_loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
                optimizer.step()
        print("Episode: {}. Total loss: {}".format(curr_episode, total_loss))
def evaluate(opt, global_model, num_states, num_actions):
    """Play greedily, forever, logging one CSV row per episode.

    A local ``PPO`` copy is refreshed from ``global_model`` at the start of
    every episode. ``<saved_path>/PPO_test.csv`` is truncated on entry and
    then receives, per episode: the running step total, the episode
    wall-clock time, the summed reward, and whether ``flag_get(info)`` was
    truthy. The local weights are saved each time the flag is reached.

    Args:
        opt: Options object. Reads ``action_type``, ``saved_path``,
            ``retina_resolution``, ``cortex_left``, ``cortex_right``,
            ``retina``, ``max_actions`` and ``num_global_steps``.
        global_model: Model whose ``state_dict()`` is copied into the local
            model whenever an episode starts.
        num_states: First argument forwarded to ``PPO``.
        num_actions: Second argument forwarded to ``PPO``.
    """
    torch.manual_seed(123)
    if opt.action_type == "right":
        actions = RIGHT_ONLY
    elif opt.action_type == "simple":
        actions = SIMPLE_MOVEMENT
    else:
        actions = COMPLEX_MOVEMENT

    savefile = opt.saved_path + '/PPO_test.csv'
    print(savefile)
    title = ['Steps', 'Time', 'TotalReward', "Flag"]
    with open(savefile, 'w', newline='') as sfile:
        writer = csv.writer(sfile)
        writer.writerow(title)

    print(opt.retina_resolution)
    env = create_train_env(actions,
                           mp_wrapper=False,
                           cortex_left=opt.cortex_left,
                           cortex_right=opt.cortex_right,
                           retina_resolution=opt.retina_resolution,
                           use_retina=opt.retina)

    use_cuda = torch.cuda.is_available()
    local_model = PPO(num_states, num_actions)
    if use_cuda:
        local_model.cuda()
    local_model.eval()

    state = torch.from_numpy(env.reset())
    if use_cuda:
        state = state.cuda()

    done = True  # forces a weight refresh on the very first step
    curr_step = 0
    tot_step = 0
    # Sliding window of recent actions, used to detect a stuck agent.
    actions = deque(maxlen=opt.max_actions)
    tot_reward = 0
    got_flag = 0
    # Timestamp of the current episode's first step. Resetting it on every
    # step would make the logged time cover only the final step.
    start_time = time.time()
    while True:
        curr_step += 1
        tot_step += 1
        if done:
            local_model.load_state_dict(global_model.state_dict())

        # Inference only: do not record an autograd graph.
        with torch.no_grad():
            logits, _ = local_model(state)
        policy = F.softmax(logits, dim=1)
        # Greedy choice: the most probable action.
        action = torch.argmax(policy).item()
        state, reward, done, info = env.step(action)
        tot_reward += reward

        # Save the weights whenever the level is completed.
        if flag_get(info):
            print("Evaluate: Level Completed!")
            got_flag = 1
            done = True
            torch.save(
                local_model.state_dict(),
                "{}/ppo_super_mario_bros_{}".format(opt.saved_path,
                                                    curr_step))

        actions.append(action)
        # End the episode on the step budget or when the whole window holds
        # the same action.
        if (curr_step > opt.num_global_steps
                or actions.count(actions[0]) == actions.maxlen):
            done = True

        if done:
            ep_time = time.time() - start_time
            data = [
                tot_step, "{:.4f}".format(ep_time),
                "{:.2f}".format(tot_reward), got_flag
            ]
            with open(savefile, 'a', newline='') as sfile:
                writer = csv.writer(sfile)
                writer.writerow(data)
            curr_step = 0
            got_flag = 0
            tot_reward = 0
            actions.clear()
            state = env.reset()
            start_time = time.time()
        state = torch.from_numpy(state)
        if use_cuda:
            state = state.cuda()
def eval(opt, global_model, num_states, num_actions):
    """Play the level greedily, swapping weights by horizontal position.

    The local ``PPO`` starts with the saved "assistance" weights. Once
    ``info['x_pos']`` exceeds 1000 it switches to ``global_model``'s
    weights; when ``x_pos`` drops below 1000 again it reloads the
    assistance weights. When ``info['flag_get']`` is truthy the local
    weights are saved and the process exits.

    Args:
        opt: Options object. Reads ``action_type``, ``world``, ``stage``,
            ``max_actions``, ``num_global_steps``, ``saved_path`` and
            ``saved_episode``.
        global_model: Model whose ``state_dict()`` is copied into the local
            model past the position threshold.
        num_states: First argument forwarded to ``PPO``.
        num_actions: Second argument forwarded to ``PPO``.

    Raises:
        SystemExit: After the weights are saved on ``info['flag_get']``.
    """
    torch.manual_seed(123)
    if opt.action_type == "right":
        actions = RIGHT_ONLY
    elif opt.action_type == "simple":
        actions = SIMPLE_MOVEMENT
    else:
        actions = COMPLEX_MOVEMENT
    env = create_train_env(opt.world, opt.stage, actions)

    use_cuda = torch.cuda.is_available()
    local_model = PPO(num_states, num_actions)
    # True while the local model holds global_model's weights.
    Is_model_2_loaded = False
    if use_cuda:
        local_model.cuda()
    local_model.eval()

    state = torch.from_numpy(env.reset())
    if use_cuda:
        state = state.cuda()

    done = True
    curr_step = 0
    # Sliding window of recent actions, used to detect a stuck agent.
    actions = deque(maxlen=opt.max_actions)
    # The initial load is not guarded: a missing assistance checkpoint
    # raises here, as it did before.
    _load_assistance_weights(local_model, opt)

    # Rendering is only attempted when an X display is advertised.
    have_display = "DISPLAY" in os.environ
    while True:
        curr_step += 1
        # Inference only: do not record an autograd graph.
        with torch.no_grad():
            logits, _ = local_model(state)
        policy = F.softmax(logits, dim=1)
        action = torch.argmax(policy).item()
        state, reward, done, info = env.step(action)

        if info['x_pos'] > 1000 and not Is_model_2_loaded:
            try:
                local_model.load_state_dict(global_model.state_dict())
                Is_model_2_loaded = True
                print('------ testing with model-----------')
            except Exception as exc:  # best effort: keep playing
                print('failed to load secondary training model', exc)
        if info['x_pos'] < 1000 and Is_model_2_loaded:
            try:
                _load_assistance_weights(local_model, opt)
                Is_model_2_loaded = False
                print('assistance model loaded')
            except Exception as exc:  # best effort: keep playing
                print('failed to load assistance model', exc)

        # Save the weights and stop as soon as the level is completed.
        if info['flag_get']:
            print(
                "############### The model is finished .saving the model ###############"
            )
            torch.save(
                local_model.state_dict(),
                "{}/ppo_sendpt_finished_{}_{}_{}".format(
                    opt.saved_path, opt.world, opt.stage, opt.saved_episode))
            # The site-provided exit() is meant for interactive sessions;
            # raise SystemExit directly instead.
            raise SystemExit

        if have_display:
            env.render()

        actions.append(action)
        # End the episode on the step budget or when the whole window holds
        # the same action.
        if (curr_step > opt.num_global_steps
                or actions.count(actions[0]) == actions.maxlen):
            done = True
        if done:
            curr_step = 0
            actions.clear()
            state = env.reset()
        state = torch.from_numpy(state)
        if use_cuda:
            state = state.cuda()


def _load_assistance_weights(local_model, opt):
    """Load ``<saved_path>/ppo_assistance_<world>_<stage>`` into the model."""
    path = "{}/ppo_assistance_{}_{}".format(opt.saved_path, opt.world,
                                            opt.stage)
    if torch.cuda.is_available():
        weights = torch.load(path)
    else:
        # No GPU: remap every storage onto the CPU while unpickling.
        weights = torch.load(path, map_location=lambda storage, loc: storage)
    local_model.load_state_dict(weights)
def train(opt):
    """Train ``model_mast`` on the part of the level past ``x_pos`` 1000.

    Never returns. A frozen "assistance" model (``model_1``) plays greedily
    until ``info[0]['x_pos']`` exceeds 1000; from there ``model_mast``
    collects up to ``opt.num_local_steps`` steps (stopping early when the
    episode ends). Once at least ``opt.num_local_steps`` steps have been
    gathered, clipped-surrogate updates are run and a checkpoint named
    ``ppo_scendpt_<world>_<stage>_<episode>`` replaces the previous one.

    Side effects: deletes and recreates ``opt.log_path``, creates
    ``opt.saved_path``, starts a spawned process running ``eval``, and
    writes ``opt.saved_episode`` after each training iteration.

    NOTE(review): the greedy assistance action is sent to a single
    connection and ``if done`` tests a tensor's truth value, so this
    appears to assume ``opt.num_processes == 1`` - confirm.

    Args:
        opt: Options object. Reads ``log_path``, ``saved_path``, ``world``,
            ``stage``, ``action_type``, ``num_processes``,
            ``saved_episode``, ``lr``, ``num_local_steps``, ``gamma``,
            ``tau``, ``num_epochs``, ``batch_size``, ``epsilon`` and
            ``beta``.
    """
    # torch.manual_seed seeds the CPU generator and all CUDA devices.
    # Seeding only CUDA would leave CPU-side randomness (torch.randperm
    # below) unseeded on GPU hosts.
    torch.manual_seed(123)
    if os.path.isdir(opt.log_path):
        shutil.rmtree(opt.log_path)
    os.makedirs(opt.log_path)
    if not os.path.isdir(opt.saved_path):
        os.makedirs(opt.saved_path)

    mp = _mp.get_context("spawn")
    envs = MultipleEnvironments(opt.world, opt.stage, opt.action_type,
                                opt.num_processes)
    model_mast = PPO(envs.num_states, envs.num_actions)
    model_1 = PPO(envs.num_states, envs.num_actions)
    model_2 = PPO(envs.num_states, envs.num_actions)
    model_1.eval()

    use_cuda = torch.cuda.is_available()
    assistance_path = "{}/ppo_assistance_{}_{}".format(
        opt.saved_path, opt.world, opt.stage)
    # Must match the name used by torch.save at the end of each iteration;
    # the CUDA branch previously asked for "ppo_secndpt_...", which is
    # never written.
    resume_path = "{}/ppo_scendpt_{}_{}_{}".format(
        opt.saved_path, opt.world, opt.stage, opt.saved_episode)

    # Loading is best effort: on failure the model keeps its initial
    # weights and the failure is reported together with its cause.
    if use_cuda:
        try:
            model_1.load_state_dict(torch.load(assistance_path))
            model_1.cuda()
            print('model-1 is loaded cuda version')
        except Exception as exc:
            print('failed to load model-1', exc)
        try:
            model_2.load_state_dict(torch.load(resume_path))
            model_2.cuda()
            print('model-2 is loaded cuda version')
        except Exception as exc:
            print('failed to load model-2', exc)
    else:
        try:
            model_1.load_state_dict(
                torch.load(assistance_path,
                           map_location=lambda storage, loc: storage))
            print('model-1 is loaded non cuda version')
        except Exception as exc:
            print('Failed to load model-1', exc)
        try:
            model_2.load_state_dict(
                torch.load(resume_path,
                           map_location=lambda storage, loc: storage))
            print('model-2 is loaded non cuda version')
        except Exception as exc:
            print('Failed to load non cuda model-2', exc)

    model_mast.load_state_dict(model_2.state_dict())
    if use_cuda:
        model_mast.cuda()
    model_mast.share_memory()
    process = mp.Process(target=eval,
                         args=(opt, model_mast, envs.num_states,
                               envs.num_actions))
    process.start()
    optimizer = torch.optim.Adam(model_mast.parameters(), lr=opt.lr)

    for agent_conn in envs.agent_conns:
        agent_conn.send(("reset", None))
    curr_states = [agent_conn.recv() for agent_conn in envs.agent_conns]
    curr_states = torch.from_numpy(np.concatenate(curr_states, 0))
    if use_cuda:
        curr_states = curr_states.cuda()

    curr_episode = opt.saved_episode
    # Number of transitions used per update, and the (float) minibatch
    # stride used to slice the shuffled indices.
    num_samples = opt.num_local_steps * opt.num_processes
    chunk = num_samples / opt.batch_size
    while True:
        curr_episode += 1
        old_log_policies = []
        actions = []
        values = []
        states = []
        rewards = []
        dones = []
        print(
            '############## restarting the training loop ###################'
        )
        while True:
            # Phase 1: the assistance model plays greedily until x_pos
            # exceeds 1000. Nothing here is trained, so no graph is kept.
            with torch.no_grad():
                while True:
                    logits, _ = model_1(curr_states)
                    policy = F.softmax(logits, dim=1)
                    action = torch.argmax(policy).view(-1).cpu()
                    for agent_conn, act in zip(envs.agent_conns, action):
                        agent_conn.send(("step", act))
                    state, reward, done, info = zip(
                        *[agent_conn.recv()
                          for agent_conn in envs.agent_conns])
                    # Keep the observation on the models' device; without
                    # this the next forward pass fails on a GPU host.
                    curr_states = torch.from_numpy(np.concatenate(state, 0))
                    if use_cuda:
                        curr_states = curr_states.cuda()
                    if info[0]['x_pos'] > 1000:
                        break

            # Phase 2: collect samples with the model being trained. The
            # outputs are used only as constants during the update.
            with torch.no_grad():
                for _ in range(opt.num_local_steps):
                    states.append(curr_states)
                    logits, value = model_mast(curr_states)
                    values.append(value.squeeze())
                    policy = F.softmax(logits, dim=1)
                    old_m = Categorical(policy)
                    action = old_m.sample()
                    actions.append(action)
                    old_log_policies.append(old_m.log_prob(action))

                    # .cpu() is a no-op for tensors already on the CPU.
                    for agent_conn, act in zip(envs.agent_conns,
                                               action.cpu()):
                        agent_conn.send(("step", act))
                    state, reward, done, info = zip(
                        *[agent_conn.recv()
                          for agent_conn in envs.agent_conns])
                    state = torch.from_numpy(np.concatenate(state, 0))
                    if use_cuda:
                        state = state.cuda()
                        reward = torch.cuda.FloatTensor(reward)
                        done = torch.cuda.FloatTensor(done)
                    else:
                        reward = torch.FloatTensor(reward)
                        done = torch.FloatTensor(done)
                    rewards.append(reward)
                    dones.append(done)
                    curr_states = state
                    if done:
                        break

            if len(states) >= opt.num_local_steps:
                with torch.no_grad():
                    _, next_value = model_mast(curr_states)
                next_value = next_value.squeeze()
                old_log_policies = torch.cat(old_log_policies)
                actions = torch.cat(actions)
                # torch.stack keeps dtype and device; torch.Tensor(list of
                # tensors) does not handle GPU tensors.
                values = torch.stack(values)
                states = torch.cat(states)

                gae = 0
                R = []
                for value, reward, done in reversed(
                        list(zip(values, rewards, dones))):
                    gae = gae * opt.gamma * opt.tau
                    gae = (gae + reward +
                           opt.gamma * next_value * (1 - done) - value)
                    next_value = value
                    R.append(gae + value)
                R = R[::-1]
                R = torch.cat(R).detach()
                advantages = R - values

                for _ in range(opt.num_epochs):
                    indice = torch.randperm(num_samples)
                    # opt.batch_size is the number of minibatches per epoch.
                    for j in range(opt.batch_size):
                        batch_indices = indice[int(j * chunk):
                                               int((j + 1) * chunk)]
                        logits, value = model_mast(states[batch_indices])
                        new_policy = F.softmax(logits, dim=1)
                        new_m = Categorical(new_policy)
                        new_log_policy = new_m.log_prob(
                            actions[batch_indices])
                        ratio = torch.exp(new_log_policy -
                                          old_log_policies[batch_indices])
                        # Clipped surrogate objective.
                        actor_loss = -torch.mean(
                            torch.min(
                                ratio * advantages[batch_indices],
                                torch.clamp(ratio, 1.0 - opt.epsilon,
                                            1.0 + opt.epsilon) *
                                advantages[batch_indices]))
                        # smooth_l1_loss takes (input, target).
                        critic_loss = F.smooth_l1_loss(
                            value.squeeze(), R[batch_indices])
                        entropy_loss = torch.mean(new_m.entropy())
                        total_loss = (actor_loss + critic_loss -
                                      opt.beta * entropy_loss)
                        optimizer.zero_grad()
                        total_loss.backward()
                        torch.nn.utils.clip_grad_norm_(
                            model_mast.parameters(), 0.5)
                        optimizer.step()
                print("Episode: {}. Total loss: {}".format(
                    curr_episode, total_loss))

                # Keep only the newest checkpoint on disk.
                previous_path = '{}/ppo_scendpt_{}_{}_{}'.format(
                    opt.saved_path, opt.world, opt.stage,
                    (curr_episode - 1))
                try:
                    if os.path.exists(previous_path):
                        os.remove(previous_path)
                except OSError:
                    print('failed to remove past saved model')
                torch.save(
                    model_mast.state_dict(),
                    "{}/ppo_scendpt_{}_{}_{}".format(
                        opt.saved_path, opt.world, opt.stage, curr_episode))
                break
            else:
                # Too few samples: replay the assistance phase and keep
                # collecting into the same buffers.
                print('reseting training ')
        opt.saved_episode = curr_episode