def test(opt):
    if torch.cuda.is_available():
        torch.cuda.manual_seed(123)
    else:
        torch.manual_seed(123)
    if opt.action_type == "right":
        actions = RIGHT_ONLY
    elif opt.action_type == "simple":
        actions = SIMPLE_MOVEMENT
    else:
        actions = COMPLEX_MOVEMENT
    env = create_train_env(opt.world, opt.stage, actions,
                           "{}/video_{}_{}.mp4".format(opt.output_path, opt.world, opt.stage))
    model = PPO(env.observation_space.shape[0], len(actions))
    if torch.cuda.is_available():
        model.load_state_dict(torch.load("{}/ppo_full_finished_{}_{}_2847".format(opt.saved_path, opt.world, opt.stage)))
        model.cuda()
    else:
        model.load_state_dict(torch.load("{}/ppo_full_finished_{}_{}_2847".format(opt.saved_path, opt.world, opt.stage),
                                         map_location=lambda storage, loc: storage))
    model.eval()
    state = torch.from_numpy(env.reset())
    while True:
        if torch.cuda.is_available():
            state = state.cuda()
        logits, value = model(state)
        policy = F.softmax(logits, dim=1)
        action = torch.argmax(policy).item()
        state, reward, done, info = env.step(action)
        state = torch.from_numpy(state)
        env.render()
        print("x pos is", info["x_pos"])
        if info["flag_get"]:
            print("World {} stage {} completed".format(opt.world, opt.stage))
            break

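The test routine above reads its configuration from an opt namespace (world, stage, action_type, saved_path, output_path). A minimal sketch of how such a namespace might be assembled with argparse is shown below; the flag names mirror the attributes used above, while the parser name and default values are assumptions rather than the authors' settings.

import argparse

def get_args():
    # Hypothetical CLI wrapper for test(opt); defaults are assumptions, not the original configuration.
    parser = argparse.ArgumentParser("PPO Super Mario Bros evaluation")
    parser.add_argument("--world", type=int, default=1)
    parser.add_argument("--stage", type=int, default=1)
    parser.add_argument("--action_type", type=str, default="complex",
                        choices=["right", "simple", "complex"])
    parser.add_argument("--saved_path", type=str, default="trained_models")
    parser.add_argument("--output_path", type=str, default="output")
    return parser.parse_args()

if __name__ == "__main__":
    test(get_args())
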
def eval(opt, global_model, num_states, num_actions):
    torch.manual_seed(123)
    if opt.action_type == "right":
        actions = RIGHT_ONLY
    elif opt.action_type == "simple":
        actions = SIMPLE_MOVEMENT
    else:
        actions = COMPLEX_MOVEMENT
    env = create_train_env(opt.world, opt.stage, actions)
    local_model = PPO(num_states, num_actions)
    if torch.cuda.is_available():
        local_model.cuda()
    local_model.eval()
    state = torch.from_numpy(env.reset())
    if torch.cuda.is_available():
        state = state.cuda()
    done = True
    curr_step = 0
    actions = deque(maxlen=opt.max_actions)
    while True:
        curr_step += 1
        if done:
            local_model.load_state_dict(global_model.state_dict())
        logits, value = local_model(state)
        policy = F.softmax(logits, dim=1)
        action = torch.argmax(policy).item()
        state, reward, done, info = env.step(action)
        # Save the model whenever the level is completed
        if info["flag_get"]:
            print("############### Level completed. Saving the model ###############")
            torch.save(local_model.state_dict(),
                       "{}/ppo_full_finished_{}_{}_{}".format(opt.saved_path, opt.world, opt.stage, opt.saved_episode))
            exit()
        havedisplay = "DISPLAY" in os.environ
        if havedisplay:
            env.render()
        actions.append(action)
        if curr_step > opt.num_global_steps or actions.count(actions[0]) == actions.maxlen:
            done = True
        if done:
            curr_step = 0
            actions.clear()
            state = env.reset()
        state = torch.from_numpy(state)
        if torch.cuda.is_available():
            state = state.cuda()

def eval(args, global_model, num_states, num_actions):
    # Fix the random seed
    if torch.cuda.is_available():
        torch.cuda.manual_seed(123)
    else:
        torch.manual_seed(123)
    # Create the game environment
    env = create_train_env(args.game)
    # Build the network model
    local_model = PPO(num_states, num_actions)
    # Use the GPU if available
    if torch.cuda.is_available():
        local_model.cuda()
    # Switch to evaluation mode
    local_model.eval()
    # Convert the frame to a PyTorch tensor
    state = torch.from_numpy(env.reset())
    # Pull in the model parameters right at the start
    done = True
    curr_step = 0
    max_reward = 0
    while True:
        # Render the game window
        if args.show_play:
            env.render()
        curr_step += 1
        # Compute on the GPU
        if torch.cuda.is_available():
            state = state.cuda()
        # Reload the global model parameters after every episode
        if done:
            local_model.load_state_dict(global_model.state_dict())
            total_reward = 0
        # Predict the action logits and the value estimate
        logits, value = local_model(state)
        # Pick the action index
        policy = F.softmax(logits, dim=1)
        action = torch.argmax(policy).item()
        # Step the game
        state, reward, done, info = env.step(action)
        total_reward += reward
        # Reset the game when the episode ends
        if done:
            print("Game score: %f" % total_reward)
            curr_step = 0
            state = env.reset()
            if max_reward < total_reward:
                torch.save(local_model.state_dict(),
                           "{}/model_best_{}.pth".format(args.saved_path, args.game))
                max_reward = total_reward
        # Convert the game state to a tensor at every step
        state = torch.from_numpy(state)

def eval(opt, global_model, num_states, num_actions):
    torch.manual_seed(123)
    if opt.action_type == "right":
        actions = RIGHT_ONLY
    elif opt.action_type == "simple":
        actions = SIMPLE_MOVEMENT
    else:
        actions = COMPLEX_MOVEMENT
    env = create_train_env(opt.world, opt.stage, actions)
    local_model = PPO(num_states, num_actions)
    if torch.cuda.is_available():
        local_model.cuda()
    local_model.eval()
    state = torch.from_numpy(env.reset())
    if torch.cuda.is_available():
        state = state.cuda()
    done = True
    curr_step = 0
    actions = deque(maxlen=opt.max_actions)
    while True:
        curr_step += 1
        if done:
            local_model.load_state_dict(global_model.state_dict())
        logits, value = local_model(state)
        policy = F.softmax(logits, dim=1)
        action = torch.argmax(policy).item()
        state, reward, done, info = env.step(action)
        # Save the model whenever the level is completed
        if info["flag_get"]:
            # if random.randint(0, 10) % 2 == 0:
            #     print("Finished")
            torch.save(local_model.state_dict(),
                       "{}/ppo_super_mario_bros_{}_{}_{}".format(opt.saved_path, opt.world, opt.stage, curr_step))
            # return
        # env.render()
        actions.append(action)
        if curr_step > opt.num_global_steps or actions.count(actions[0]) == actions.maxlen:
            done = True
        if done:
            curr_step = 0
            actions.clear()
            state = env.reset()
        state = torch.from_numpy(state)
        if torch.cuda.is_available():
            state = state.cuda()

def infer(args):
    # Fix the random seed
    if torch.cuda.is_available():
        torch.cuda.manual_seed(123)
    else:
        torch.manual_seed(123)
    # Create the game environment
    env = create_train_env(args.game)
    # Build the model
    model = PPO(env.observation_space.shape[0], env.action_space.n)
    # Load the trained weights
    if torch.cuda.is_available():
        model.load_state_dict(torch.load("{}/model_best_{}.pth".format(args.saved_path, args.game)))
        model.cuda()
    else:
        model.load_state_dict(torch.load("{}/model_best_{}.pth".format(args.saved_path, args.game),
                                         map_location=lambda storage, loc: storage))
    # Switch to evaluation mode
    model.eval()
    # Get the initial game frame
    state = torch.from_numpy(env.reset())
    total_reward = 0
    while True:
        # Render the game window
        env.render()
        # Compute on the GPU
        if torch.cuda.is_available():
            state = state.cuda()
        # Predict the action logits and the value estimate
        logits, value = model(state)
        # Pick the action index
        policy = F.softmax(logits, dim=1)
        action = torch.argmax(policy).item()
        # Step the game
        state, reward, done, info = env.step(action)
        total_reward += reward
        # Convert the game state to a tensor at every step
        state = torch.from_numpy(state)
        print(info)
        # The game is over
        if done:
            print("Game over, score: %f" % total_reward)
            break
        time.sleep(0.05)
    env.render(close=True)
    env.close()

def test(opt):
    opt.saved_path = os.getcwd() + '/PPO/' + opt.saved_path
    opt.output_path = os.getcwd() + '/PPO/' + opt.output_path
    if torch.cuda.is_available():
        torch.cuda.manual_seed(123)
    else:
        torch.manual_seed(123)
    if opt.action_type == "right":
        actions = RIGHT_ONLY
    elif opt.action_type == "simple":
        actions = SIMPLE_MOVEMENT
    else:
        actions = COMPLEX_MOVEMENT
    env = create_train_env_test(actions)
    rec = VideoRecorder(env,
                        path="{}/mario_video_{}.mp4".format(opt.output_path, opt.step),
                        enabled=True)
    model = PPO(env.observation_space.shape[0], len(actions))
    if torch.cuda.is_available():
        model.load_state_dict(torch.load("{}/ppo_super_mario_bros_{}".format(opt.saved_path, opt.step)))
        model.cuda()
    else:
        model.load_state_dict(torch.load("{}/ppo_super_mario_bros_{}".format(opt.saved_path, opt.step),
                                         map_location=lambda storage, loc: storage))
    model.eval()
    state = torch.from_numpy(env.reset())
    while True:
        if torch.cuda.is_available():
            state = state.cuda()
        logits, value = model(state)
        policy = F.softmax(logits, dim=1)
        action = torch.argmax(policy).item()
        state, reward, done, info = env.step(action)
        state = torch.from_numpy(state)
        # print(info)
        # env.render()
        rec.capture_frame()
        if done:
            print("Died.")
            rec.close()
            break

def test(opt, global_model, num_states, num_actions):
    torch.manual_seed(123)
    env = create_train_env(opt.level)
    local_model = PPO(num_states, num_actions)
    if torch.cuda.is_available():
        local_model.cuda()
    local_model.eval()
    state = torch.from_numpy(env.reset())
    if torch.cuda.is_available():
        state = state.cuda()
    done = True
    curr_step = 0
    actions = deque(maxlen=opt.max_actions)
    while True:
        curr_step += 1
        if done:
            local_model.load_state_dict(global_model.state_dict())
        logits, value = local_model(state)
        policy = F.softmax(logits, dim=1)
        action = torch.argmax(policy).item()
        state, reward, done, info = env.step(action)
        if (done and info["lives"] != 0) or info["level"] == opt.level:
            torch.save(local_model.state_dict(),
                       "{}/ppo_contra_success_{}".format(opt.saved_path, info["lives"]))
        env.render()
        actions.append(action)
        if curr_step > opt.num_max_steps or actions.count(actions[0]) == actions.maxlen:
            done = True
        if done:
            curr_step = 0
            actions.clear()
            state = env.reset()
        state = torch.from_numpy(state)
        if torch.cuda.is_available():
            state = state.cuda()

def test(opt):
    if torch.cuda.is_available():
        torch.cuda.manual_seed(123)
    else:
        torch.manual_seed(123)
    env = create_train_env(opt.zone, opt.act,
                           output_path="{}/video_{}.mp4".format(opt.output_path,
                                                                STATES["{}-{}".format(opt.zone, opt.act)]))
    model = PPO(env.observation_space.shape[0], len(ACTION_MAPPING))
    if torch.cuda.is_available():
        model.load_state_dict(torch.load("{}/PPO_SonicTheHedgehog_{}".format(
            opt.saved_path, STATES["{}-{}".format(opt.zone, opt.act)])))
        model.cuda()
    else:
        model.load_state_dict(torch.load("{}/PPO_SonicTheHedgehog_{}".format(
            opt.saved_path, STATES["{}-{}".format(opt.zone, opt.act)]),
            map_location=lambda storage, loc: storage))
    model.eval()
    state = torch.from_numpy(env.reset())
    while True:
        if torch.cuda.is_available():
            state = state.cuda()
        logits, value = model(state)
        policy = F.softmax(logits, dim=1)
        action = torch.argmax(policy).item()
        state, reward, done, info = env.step(action)
        state = torch.from_numpy(state)
        env.render()
        if done and info["act"] == opt.act:
            print("Map {} is completed".format(STATES["{}-{}".format(opt.zone, opt.act)]))
            break

def train(args):
    # Fix the random seed
    if torch.cuda.is_available():
        torch.cuda.manual_seed(123)
    else:
        torch.manual_seed(123)
    # Create the folder for saved models
    if not os.path.isdir(args.saved_path):
        os.makedirs(args.saved_path)
    # Create the multi-process game environments
    envs = MultipleEnvironments(args.game, args.num_processes)
    # Build the model
    model = PPO(envs.num_states, envs.num_actions)
    # Load a pretrained model if one is given
    if args.trained_model is not None:
        model.load_state_dict(torch.load(args.trained_model))
    # Train on the GPU if available
    if torch.cuda.is_available():
        model.cuda()
    model.share_memory()
    # Start a separate process for evaluation
    mp = _mp.get_context("spawn")
    process = mp.Process(target=eval, args=(args, model, envs.num_states, envs.num_actions))
    process.start()
    # Create the optimizer
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
    # Reset the game in every worker process at the start
    [agent_conn.send(("reset", None)) for agent_conn in envs.agent_conns]
    # Get the initial game frames
    curr_states = [agent_conn.recv() for agent_conn in envs.agent_conns]
    curr_states = torch.from_numpy(np.concatenate(curr_states, 0))
    if torch.cuda.is_available():
        curr_states = curr_states.cuda()
    curr_episode = 0
    while True:
        curr_episode += 1
        old_log_policies = []
        actions = []
        values = []
        states = []
        rewards = []
        dones = []
        # Play the game to collect rollout data
        for _ in range(args.num_local_steps):
            states.append(curr_states)
            # Run the forward pass
            logits, value = model(curr_states)
            # Compute the probability of every action
            policy = F.softmax(logits, dim=1)
            # Sample an action according to those probabilities
            old_m = Categorical(policy)
            action = old_m.sample()
            # Record the predictions
            actions.append(action)
            values.append(value.squeeze())
            # Needed later for the loss
            old_log_policy = old_m.log_prob(action)
            old_log_policies.append(old_log_policy)
            # Send the actions to the worker processes
            if torch.cuda.is_available():
                [agent_conn.send(("step", act)) for agent_conn, act in zip(envs.agent_conns, action.cpu())]
            else:
                [agent_conn.send(("step", act)) for agent_conn, act in zip(envs.agent_conns, action)]
            # Gather the results from every process
            state, reward, done, info = zip(*[agent_conn.recv() for agent_conn in envs.agent_conns])
            # Convert the data
            state = torch.from_numpy(np.concatenate(state, 0))
            # Convert to PyTorch tensors
            if torch.cuda.is_available():
                state = state.cuda()
                reward = torch.cuda.FloatTensor(reward)
                done = torch.cuda.FloatTensor(done)
            else:
                reward = torch.FloatTensor(reward)
                done = torch.FloatTensor(done)
            # Record the rollout data
            rewards.append(reward)
            dones.append(done)
            curr_states = state
        # Bootstrap the value from the last observed frames
        _, next_value, = model(curr_states)
        next_value = next_value.squeeze()
        old_log_policies = torch.cat(old_log_policies).detach()
        actions = torch.cat(actions)
        values = torch.cat(values).detach()
        states = torch.cat(states)
        gae = 0
        R = []
        for value, reward, done in list(zip(values, rewards, dones))[::-1]:
            gae = gae * args.gamma * args.tau
            gae = gae + reward + args.gamma * next_value.detach() * (1 - done) - value.detach()
            next_value = value
            R.append(gae + value)
        R = R[::-1]
        R = torch.cat(R).detach()
        advantages = R - values
        total_losses = []
        for i in range(args.num_epochs):
            indice = torch.randperm(args.num_local_steps * args.num_processes)
            for j in range(args.batch_size):
                batch_indices = indice[
                    int(j * (args.num_local_steps * args.num_processes / args.batch_size)):
                    int((j + 1) * (args.num_local_steps * args.num_processes / args.batch_size))]
                # Forward pass on the sampled frames
                logits, value = model(states[batch_indices])
                # Compute the probability of every action
                new_policy = F.softmax(logits, dim=1)
                new_m = Categorical(new_policy)
                # Compute the losses
                new_log_policy = new_m.log_prob(actions[batch_indices])
                ratio = torch.exp(new_log_policy - old_log_policies[batch_indices])
                actor_loss = -torch.mean(torch.min(ratio * advantages[batch_indices],
                                                   torch.clamp(ratio, 1.0 - args.epsilon, 1.0 + args.epsilon) *
                                                   advantages[batch_indices]))
                critic_loss = F.smooth_l1_loss(R[batch_indices], value.squeeze())
                entropy_loss = torch.mean(new_m.entropy())
                total_loss = actor_loss + critic_loss - args.beta * entropy_loss
                # Compute the gradients
                optimizer.zero_grad()
                total_loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
                optimizer.step()
                total_losses.append(float(total_loss))
        print("Episode: {}. Total loss: {:.4f}".format(curr_episode, np.mean(total_losses)))
        torch.save(model.state_dict(), "{}/model_{}.pth".format(args.saved_path, args.game))

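The return computation inside train walks the rollout backwards and accumulates a generalized advantage estimate: gae_t = gamma * tau * gae_{t+1} + r_t + gamma * V(s_{t+1}) * (1 - done_t) - V(s_t), with return_t = gae_t + V(s_t). Below is a small self-contained restatement of that backward pass under the same formulas; the function name and the default gamma/tau values are assumptions for illustration only.

import torch

def compute_gae_returns(rewards, values, dones, next_value, gamma=0.9, tau=1.0):
    # Hypothetical helper mirroring the backward GAE loop in train():
    # gae_t = gamma * tau * gae_{t+1} + r_t + gamma * V(s_{t+1}) * (1 - done_t) - V(s_t)
    # return_t = gae_t + V(s_t)
    gae = torch.zeros_like(next_value)
    returns = []
    for value, reward, done in list(zip(values, rewards, dones))[::-1]:
        gae = gae * gamma * tau
        gae = gae + reward + gamma * next_value * (1 - done) - value
        next_value = value
        returns.append(gae + value)
    return returns[::-1]
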
def evaluate(opt, global_model, num_states, num_actions):
    torch.manual_seed(123)
    if opt.action_type == "right":
        actions = RIGHT_ONLY
    elif opt.action_type == "simple":
        actions = SIMPLE_MOVEMENT
    else:
        actions = COMPLEX_MOVEMENT
    savefile = opt.saved_path + '/PPO_test.csv'
    print(savefile)
    title = ['Steps', 'Time', 'TotalReward', "Flag"]
    with open(savefile, 'w', newline='') as sfile:
        writer = csv.writer(sfile)
        writer.writerow(title)
    print(opt.retina_resolution)
    env = create_train_env(actions,
                           mp_wrapper=False,
                           cortex_left=opt.cortex_left,
                           cortex_right=opt.cortex_right,
                           retina_resolution=opt.retina_resolution,
                           use_retina=opt.retina)
    local_model = PPO(num_states, num_actions)
    if torch.cuda.is_available():
        local_model.cuda()
    local_model.eval()
    state = torch.from_numpy(env.reset())
    if torch.cuda.is_available():
        state = state.cuda()
    done = True
    curr_step = 0
    tot_step = 0
    actions = deque(maxlen=opt.max_actions)
    tot_reward = 0
    got_flag = 0
    index = 0
    while True:
        start_time = time.time()
        curr_step += 1
        tot_step += 1
        if done:
            local_model.load_state_dict(global_model.state_dict())
        logits, value = local_model(state)
        policy = F.softmax(logits, dim=1)
        action = torch.argmax(policy).item()  # This selects the best action to take
        state, reward, done, info = env.step(action)
        # im1 = state[0, 0, :, :]
        # im2 = state[0, 1, :, :]
        # im3 = state[0, 2, :, :]
        # im4 = state[0, 3, :, :]
        # res1 = cv2.resize(im1, dsize=(370, 370), interpolation=cv2.INTER_CUBIC)
        # res2 = cv2.resize(im2, dsize=(370, 370), interpolation=cv2.INTER_CUBIC)
        # res3 = cv2.resize(im3, dsize=(370, 370), interpolation=cv2.INTER_CUBIC)
        # res4 = cv2.resize(im4, dsize=(370, 370), interpolation=cv2.INTER_CUBIC)
        # fig = plt.figure(figsize=(8, 8))
        # columns = 2
        # rows = 2
        # fig.add_subplot(rows, columns, 1)
        # plt.imshow(im1)
        # fig.add_subplot(rows, columns, 2)
        # plt.imshow(im2)
        # fig.add_subplot(rows, columns, 3)
        # plt.imshow(im3)
        # fig.add_subplot(rows, columns, 4)
        # plt.imshow(im4)
        # plt.show()
        index += 1
        tot_reward += reward
        # Save the model whenever the level is completed
        if flag_get(info):
            print("Evaluate: Level Completed!")
            got_flag = 1
            done = True
            torch.save(local_model.state_dict(),
                       "{}/ppo_super_mario_bros_{}".format(opt.saved_path, curr_step))
        # env.render()
        actions.append(action)
        if curr_step > opt.num_global_steps or actions.count(actions[0]) == actions.maxlen:
            # print("Evaluate: Time's up!")
            done = True
        if done:
            # print("Evaluate: Done!")
            ep_time = time.time() - start_time
            data = [tot_step, "{:.4f}".format(ep_time), "{:.2f}".format(tot_reward), got_flag]
            with open(savefile, 'a', newline='') as sfile:
                writer = csv.writer(sfile)
                writer.writerows([data])
            curr_step = 0
            got_flag = 0
            tot_reward = 0
            actions.clear()
            # time.sleep(10)  # Sleep for 10 secs
            state = env.reset()
        state = torch.from_numpy(state)
        if torch.cuda.is_available():
            state = state.cuda()

def train(opt):
    if torch.cuda.is_available():
        torch.cuda.manual_seed(123)
    else:
        torch.manual_seed(123)
    if os.path.isdir(opt.log_path):
        shutil.rmtree(opt.log_path)
    os.makedirs(opt.log_path)
    if not os.path.isdir(opt.saved_path):
        os.makedirs(opt.saved_path)
    mp = _mp.get_context("spawn")
    envs = MultipleEnvironments(opt.world, opt.stage, opt.action_type, opt.num_processes)
    model = PPO(envs.num_states, envs.num_actions)
    if torch.cuda.is_available():
        model.cuda()
    try:
        if torch.cuda.is_available():
            model.load_state_dict(torch.load("{}/ppo_full_{}_{}_{}".format(
                opt.saved_path, opt.world, opt.stage, opt.saved_episode)))
            model.cuda()
        else:
            model.load_state_dict(torch.load("{}/ppo_full_{}_{}_{}".format(
                opt.saved_path, opt.world, opt.stage, opt.saved_episode),
                map_location=lambda storage, loc: storage))
        print('model is loaded with saved episode', opt.saved_episode)
    except:
        print('No model is loaded')
    model.share_memory()
    process = mp.Process(target=eval, args=(opt, model, envs.num_states, envs.num_actions))
    process.start()
    optimizer = torch.optim.Adam(model.parameters(), lr=opt.lr)
    [agent_conn.send(("reset", None)) for agent_conn in envs.agent_conns]
    curr_states = [agent_conn.recv() for agent_conn in envs.agent_conns]
    curr_states = torch.from_numpy(np.concatenate(curr_states, 0))
    if torch.cuda.is_available():
        curr_states = curr_states.cuda()
    curr_episode = opt.saved_episode
    while True:
        # if curr_episode % opt.save_interval == 0 and curr_episode > 0:
        #     torch.save(model.state_dict(),
        #                "{}/ppo_super_mario_bros_{}_{}".format(opt.saved_path, opt.world, opt.stage))
        if os.path.exists('{}/ppo_full_{}_{}_{}'.format(opt.saved_path, opt.world, opt.stage, (curr_episode - 1))):
            print('removing past saved data of episode', curr_episode)
            os.remove('{}/ppo_full_{}_{}_{}'.format(opt.saved_path, opt.world, opt.stage, (curr_episode - 1)))
        else:
            print('failed to remove past saved model')
            input()
        torch.save(model.state_dict(),
                   "{}/ppo_full_{}_{}_{}".format(opt.saved_path, opt.world, opt.stage, curr_episode))
        curr_episode += 1
        old_log_policies = []
        actions = []
        values = []
        states = []
        rewards = []
        dones = []
        for _ in range(opt.num_local_steps):
            states.append(curr_states)
            logits, value = model(curr_states)
            values.append(value.squeeze())
            policy = F.softmax(logits, dim=1)
            old_m = Categorical(policy)
            action = old_m.sample()
            actions.append(action)
            old_log_policy = old_m.log_prob(action)
            old_log_policies.append(old_log_policy)
            if torch.cuda.is_available():
                [agent_conn.send(("step", act)) for agent_conn, act in zip(envs.agent_conns, action.cpu())]
            else:
                [agent_conn.send(("step", act)) for agent_conn, act in zip(envs.agent_conns, action)]
            state, reward, done, info = zip(*[agent_conn.recv() for agent_conn in envs.agent_conns])
            state = torch.from_numpy(np.concatenate(state, 0))
            if torch.cuda.is_available():
                state = state.cuda()
                reward = torch.cuda.FloatTensor(reward)
                done = torch.cuda.FloatTensor(done)
            else:
                reward = torch.FloatTensor(reward)
                done = torch.FloatTensor(done)
            rewards.append(reward)
            dones.append(done)
            curr_states = state
        _, next_value, = model(curr_states)
        next_value = next_value.squeeze()
        old_log_policies = torch.cat(old_log_policies).detach()
        actions = torch.cat(actions)
        values = torch.Tensor(values).detach()
        states = torch.cat(states)
        gae = 0
        R = []
        for value, reward, done in list(zip(values, rewards, dones))[::-1]:
            gae = gae * opt.gamma * opt.tau
            gae = gae + reward + opt.gamma * next_value.detach() * (1 - done) - value.detach()
            next_value = value
            R.append(gae + value)
        R = R[::-1]
        R = torch.cat(R).detach()
        advantages = R - values
        for i in range(opt.num_epochs):
            indice = torch.randperm(opt.num_local_steps * opt.num_processes)
            for j in range(opt.batch_size):
                batch_indices = indice[
                    int(j * (opt.num_local_steps * opt.num_processes / opt.batch_size)):
                    int((j + 1) * (opt.num_local_steps * opt.num_processes / opt.batch_size))]
                logits, value = model(states[batch_indices])
                new_policy = F.softmax(logits, dim=1)
                new_m = Categorical(new_policy)
                new_log_policy = new_m.log_prob(actions[batch_indices])
                ratio = torch.exp(new_log_policy - old_log_policies[batch_indices])
                actor_loss = -torch.mean(torch.min(ratio * advantages[batch_indices],
                                                   torch.clamp(ratio, 1.0 - opt.epsilon, 1.0 + opt.epsilon) *
                                                   advantages[batch_indices]))
                # critic_loss = torch.mean((R[batch_indices] - value) ** 2) / 2
                critic_loss = F.smooth_l1_loss(R[batch_indices], value.squeeze())
                entropy_loss = torch.mean(new_m.entropy())
                total_loss = actor_loss + critic_loss - opt.beta * entropy_loss
                optimizer.zero_grad()
                total_loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
                optimizer.step()
        print("Episode: {}. Total loss: {}".format(curr_episode, total_loss))
        opt.saved_episode = curr_episode

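The inner update in the training loops above combines the clipped PPO surrogate with a smooth-L1 critic loss and an entropy bonus. For reference, the per-batch loss can be restated as a standalone helper; the function name, argument layout, and the epsilon/beta defaults are assumptions for illustration, while the expressions themselves reproduce those used in the epoch loop above.

import torch
import torch.nn.functional as F

def ppo_loss(new_log_policy, old_log_policy, advantages, returns, value, entropy,
             epsilon=0.2, beta=0.01):
    # Hypothetical restatement of the batch loss used above; epsilon and beta defaults are assumptions.
    ratio = torch.exp(new_log_policy - old_log_policy)
    actor_loss = -torch.mean(torch.min(ratio * advantages,
                                       torch.clamp(ratio, 1.0 - epsilon, 1.0 + epsilon) * advantages))
    critic_loss = F.smooth_l1_loss(returns, value.squeeze())
    entropy_loss = torch.mean(entropy)
    return actor_loss + critic_loss - beta * entropy_loss
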
def eval(opt, global_model, num_states, num_actions):
    torch.manual_seed(123)
    if opt.action_type == "right":
        actions = RIGHT_ONLY
    elif opt.action_type == "simple":
        actions = SIMPLE_MOVEMENT
    else:
        actions = COMPLEX_MOVEMENT
    env = create_train_env(opt.world, opt.stage, actions)
    local_model = PPO(num_states, num_actions)
    Is_model_2_loaded = False
    if torch.cuda.is_available():
        local_model.cuda()
    local_model.eval()
    state = torch.from_numpy(env.reset())
    if torch.cuda.is_available():
        state = state.cuda()
    done = True
    curr_step = 0
    actions = deque(maxlen=opt.max_actions)
    if done:
        if torch.cuda.is_available():
            local_model.load_state_dict(torch.load("{}/ppo_assistance_{}_{}".format(
                opt.saved_path, opt.world, opt.stage)))
        else:
            local_model.load_state_dict(torch.load("{}/ppo_assistance_{}_{}".format(
                opt.saved_path, opt.world, opt.stage),
                map_location=lambda storage, loc: storage))
    while True:
        curr_step += 1
        logits, value = local_model(state)
        policy = F.softmax(logits, dim=1)
        action = torch.argmax(policy).item()
        state, reward, done, info = env.step(action)
        if info['x_pos'] > 1000 and not Is_model_2_loaded:
            try:
                local_model.load_state_dict(global_model.state_dict())
                Is_model_2_loaded = True
                print('------ testing with model -----------')
            except:
                print('failed to load secondary training model')
        if info['x_pos'] < 1000 and Is_model_2_loaded:
            try:
                if torch.cuda.is_available():
                    local_model.load_state_dict(torch.load("{}/ppo_assistance_{}_{}".format(
                        opt.saved_path, opt.world, opt.stage)))
                else:
                    local_model.load_state_dict(torch.load("{}/ppo_assistance_{}_{}".format(
                        opt.saved_path, opt.world, opt.stage),
                        map_location=lambda storage, loc: storage))
                Is_model_2_loaded = False
                print('assistance model loaded')
            except:
                print('failed to load secondary training model')
        # Save the model whenever the level is completed
        if info['flag_get']:
            print("############### Level completed. Saving the model ###############")
            torch.save(local_model.state_dict(),
                       "{}/ppo_sendpt_finished_{}_{}_{}".format(
                           opt.saved_path, opt.world, opt.stage, opt.saved_episode))
            exit()
        havedisplay = "DISPLAY" in os.environ
        if havedisplay:
            env.render()
        actions.append(action)
        if curr_step > opt.num_global_steps or actions.count(actions[0]) == actions.maxlen:
            done = True
        if done:
            curr_step = 0
            actions.clear()
            state = env.reset()
        state = torch.from_numpy(state)
        if torch.cuda.is_available():
            state = state.cuda()

def train(opt):
    if torch.cuda.is_available():
        torch.cuda.manual_seed(123)
    else:
        torch.manual_seed(123)
    if os.path.isdir(opt.log_path):
        shutil.rmtree(opt.log_path)
    os.makedirs(opt.log_path)
    if not os.path.isdir(opt.saved_path):
        os.makedirs(opt.saved_path)
    mp = _mp.get_context("spawn")
    envs = MultipleEnvironments(opt.world, opt.stage, opt.action_type, opt.num_processes)
    model_mast = PPO(envs.num_states, envs.num_actions)
    model_1 = PPO(envs.num_states, envs.num_actions)
    model_2 = PPO(envs.num_states, envs.num_actions)
    model_1.eval()
    if torch.cuda.is_available():
        try:
            model_1.load_state_dict(torch.load("{}/ppo_assistance_{}_{}".format(
                opt.saved_path, opt.world, opt.stage)))
            model_1.cuda()
            print('model-1 is loaded cuda version')
        except:
            print('failed to load model-1')
        try:
            model_2.load_state_dict(torch.load("{}/ppo_secndpt_{}_{}_{}".format(
                opt.saved_path, opt.world, opt.stage, opt.saved_episode)))
            model_2.cuda()
            print('model-2 is loaded cuda version')
        except:
            print('failed to load model-2')
    else:
        try:
            model_1.load_state_dict(torch.load("{}/ppo_assistance_{}_{}".format(
                opt.saved_path, opt.world, opt.stage),
                map_location=lambda storage, loc: storage))
            print('model-1 is loaded non cuda version')
        except:
            print('Failed to load model-1')
        try:
            model_2.load_state_dict(torch.load("{}/ppo_scendpt_{}_{}_{}".format(
                opt.saved_path, opt.world, opt.stage, opt.saved_episode),
                map_location=lambda storage, loc: storage))
            print('model-2 is loaded non cuda version')
        except:
            print('Failed to load non cuda model-2')
    model_mast.load_state_dict(model_2.state_dict())
    if torch.cuda.is_available():
        model_mast.cuda()
    model_mast.share_memory()
    process = mp.Process(target=eval, args=(opt, model_mast, envs.num_states, envs.num_actions))
    process.start()
    optimizer = torch.optim.Adam(model_mast.parameters(), lr=opt.lr)
    [agent_conn.send(("reset", None)) for agent_conn in envs.agent_conns]
    curr_states = [agent_conn.recv() for agent_conn in envs.agent_conns]
    curr_states = torch.from_numpy(np.concatenate(curr_states, 0))
    if torch.cuda.is_available():
        curr_states = curr_states.cuda()
    curr_episode = opt.saved_episode
    while True:
        curr_episode += 1
        old_log_policies = []
        actions = []
        values = []
        states = []
        rewards = []
        dones = []
        print('############## restarting the training loop ###################')
        while True:
            while True:
                logits, value = model_1(curr_states)
                policy = F.softmax(logits, dim=1)
                action = torch.argmax(policy).item()
                action = torch.tensor(action)
                action = action.view(-1)
                if torch.cuda.is_available():
                    [agent_conn.send(("step", act)) for agent_conn, act in zip(envs.agent_conns, action.cpu())]
                else:
                    [agent_conn.send(("step", act)) for agent_conn, act in zip(envs.agent_conns, action)]
                state, reward, done, info = zip(*[agent_conn.recv() for agent_conn in envs.agent_conns])
                # print('position is', info[0]['x_pos'])
                if info[0]['x_pos'] > 1000:
                    # print('starting sample collection')
                    break
                else:
                    state = torch.from_numpy(np.concatenate(state, 0))
                    curr_states = state
            state = torch.from_numpy(np.concatenate(state, 0))
            curr_states = state
            for _ in range(opt.num_local_steps):
                states.append(curr_states)
                logits, value = model_mast(curr_states)
                values.append(value.squeeze())
                policy = F.softmax(logits, dim=1)
                old_m = Categorical(policy)
                action = old_m.sample()
                actions.append(action)
                old_log_policy = old_m.log_prob(action)
                old_log_policies.append(old_log_policy)
                if torch.cuda.is_available():
                    [agent_conn.send(("step", act)) for agent_conn, act in zip(envs.agent_conns, action.cpu())]
                else:
                    [agent_conn.send(("step", act)) for agent_conn, act in zip(envs.agent_conns, action)]
                state, reward, done, info = zip(*[agent_conn.recv() for agent_conn in envs.agent_conns])
                state = torch.from_numpy(np.concatenate(state, 0))
                if torch.cuda.is_available():
                    state = state.cuda()
                    reward = torch.cuda.FloatTensor(reward)
                    done = torch.cuda.FloatTensor(done)
                else:
                    reward = torch.FloatTensor(reward)
                    done = torch.FloatTensor(done)
                rewards.append(reward)
                dones.append(done)
                curr_states = state
                if done:
                    # print('samples collected', len(states))
                    break
            if len(states) >= opt.num_local_steps:
                # print('entering training loop. states list size is', len(states))
                _, next_value, = model_mast(curr_states)
                next_value = next_value.squeeze()
                old_log_policies = torch.cat(old_log_policies).detach()
                actions = torch.cat(actions)
                values = torch.Tensor(values).detach()
                # values = torch.cat(values).detach()
                states = torch.cat(states)
                gae = 0
                R = []
                for value, reward, done in list(zip(values, rewards, dones))[::-1]:
                    gae = gae * opt.gamma * opt.tau
                    gae = gae + reward + opt.gamma * next_value.detach() * (1 - done) - value.detach()
                    next_value = value
                    R.append(gae + value)
                R = R[::-1]
                R = torch.cat(R).detach()
                advantages = R - values
                for i in range(opt.num_epochs):
                    indice = torch.randperm(opt.num_local_steps * opt.num_processes)
                    for j in range(opt.batch_size):
                        batch_indices = indice[
                            int(j * (opt.num_local_steps * opt.num_processes / opt.batch_size)):
                            int((j + 1) * (opt.num_local_steps * opt.num_processes / opt.batch_size))]
                        logits, value = model_mast(states[batch_indices])
                        new_policy = F.softmax(logits, dim=1)
                        new_m = Categorical(new_policy)
                        new_log_policy = new_m.log_prob(actions[batch_indices])
                        ratio = torch.exp(new_log_policy - old_log_policies[batch_indices])
                        actor_loss = -torch.mean(torch.min(ratio * advantages[batch_indices],
                                                           torch.clamp(ratio, 1.0 - opt.epsilon, 1.0 + opt.epsilon) *
                                                           advantages[batch_indices]))
                        # critic_loss = torch.mean((R[batch_indices] - value) ** 2) / 2
                        critic_loss = F.smooth_l1_loss(R[batch_indices], value.squeeze())
                        entropy_loss = torch.mean(new_m.entropy())
                        total_loss = actor_loss + critic_loss - opt.beta * entropy_loss
                        optimizer.zero_grad()
                        total_loss.backward()
                        torch.nn.utils.clip_grad_norm_(model_mast.parameters(), 0.5)
                        optimizer.step()
                print("Episode: {}. Total loss: {}".format(curr_episode, total_loss))
                try:
                    if os.path.exists('{}/ppo_scendpt_{}_{}_{}'.format(
                            opt.saved_path, opt.world, opt.stage, (curr_episode - 1))):
                        # print('removing past saved data of episode', curr_episode)
                        os.remove('{}/ppo_scendpt_{}_{}_{}'.format(
                            opt.saved_path, opt.world, opt.stage, (curr_episode - 1)))
                except:
                    print('failed to remove past saved model')
                torch.save(model_mast.state_dict(),
                           "{}/ppo_scendpt_{}_{}_{}".format(opt.saved_path, opt.world, opt.stage, curr_episode))
                break
            else:
                print('resetting training')
        opt.saved_episode = curr_episode