def game(mode):
    torch.set_num_threads(1)
    env = EnvFindGoals()
    max_epi_iter = 1500
    max_MC_iter = 200
    agent = COMA(N_action=5)
    train_curve = []
    warm_up_run = 200
    obs1_norm = RunningMeanStd()
    obs2_norm = RunningMeanStd()
    reward_rms1 = RunningMeanStd()
    reward_rms2 = RunningMeanStd()
    discounted_reward1 = RewardForwardFilter(gamma=0.999)
    discounted_reward2 = RewardForwardFilter(gamma=0.999)
    env.reset()
    o1_list = []
    o2_list = []

    # Warm-up: collect observations to initialise the per-agent observation normalizers
    for i in range(warm_up_run):
        obs1 = env.get_agt1_obs()
        obs2 = env.get_agt2_obs()
        o1_list.append(obs1)
        o2_list.append(obs2)
    obs1_norm.update(o1_list)
    obs2_norm.update(o2_list)

    for epi_iter in range(max_epi_iter):
        env.reset()
        o1_list = []
        a1_list = []
        pi_a1_list = []
        o2_list = []
        a2_list = []
        pi_a2_list = []
        r_list = []
        r1_int_list = []
        r2_int_list = []
        total_int_rwd1_list = []
        total_int_rwd2_list = []
        adv1_int = []
        adv2_int = []
        dones_list = []
        v1_list, v2_list = [], []
        acc_r = 0

        for MC_iter in range(max_MC_iter):
            # env.render()
            obs1 = env.get_agt1_obs()
            obs2 = env.get_agt2_obs()
            o1_list.append(obs1)
            o2_list.append(obs2)
            # print("obs1 =", obs1)
            # print("intrinsic reward =", agent.compute_intrinsic_reward1(obs1))

            # Actions, policies, and values from normalized (and clipped) observations
            action1, pi_a1, action2, pi_a2, v1, v2 = agent.get_action(
                ((obs1 - obs1_norm.mean) / obs1_norm.var).clip(-5, 5),
                ((obs2 - obs2_norm.mean) / obs2_norm.var).clip(-5, 5))
            int_rwd1 = agent.compute_intrinsic_reward1(
                ((obs1 - obs1_norm.mean) / obs1_norm.var).clip(-5, 5))
            int_rwd2 = agent.compute_intrinsic_reward2(
                ((obs2 - obs2_norm.mean) / obs2_norm.var).clip(-5, 5))
            # print("int_rwd1 =", int_rwd1)
            # print("int_rwd2 =", int_rwd2)
            v1_list.append(v1.data.numpy()[0])
            v2_list.append(v2.data.numpy()[0])
            # print(v1.data.numpy()[0])
            r1_int_list.append(int_rwd1)
            r2_int_list.append(int_rwd2)
            a1_list.append(action1)
            pi_a1_list.append(pi_a1)
            a2_list.append(action2)
            pi_a2_list.append(pi_a2)

            [reward_1, reward_2], done = env.step([action1, action2])
            # print(reward_1, reward_2)
            dones_list.append(1 if done else 0)
            acc_r = acc_r + reward_1
            r_list.append(reward_1)
            # Test using reward_2 instead:
            # acc_r = acc_r + reward_2
            # r_list.append(reward_2)
            if done:
                break

        obs1_norm.update(o1_list)
        obs2_norm.update(o2_list)
        """
        for i in reversed(r1_int_list):
            r1_int_temp = discounted_reward1.update(i)
        for j in reversed(r2_int_list):
            r2_int_temp = discounted_reward2.update(j)
        """
        # print("mean =", reward_rms1.mean)
        mean1, std1, count1 = np.mean(r1_int_list), np.std(r1_int_list), len(r1_int_list)
        mean2, std2, count2 = np.mean(r2_int_list), np.std(r2_int_list), len(r2_int_list)
        # mean1, std1, count1 = np.mean(r1_int_temp), np.std(r1_int_temp), len(r1_int_temp)
        # mean2, std2, count2 = np.mean(r2_int_temp), np.std(r2_int_temp), len(r2_int_temp)
        reward_rms1.update_from_moments(mean1, std1 ** 2, count1)
        reward_rms2.update_from_moments(mean2, std2 ** 2, count2)
        # print("var =", reward_rms1.var)
        # adv1_int = (r1_int_list - reward_rms1.mean) / np.sqrt(reward_rms1.var)
        # adv2_int = (r2_int_list - reward_rms2.mean) / np.sqrt(reward_rms2.var)

        # Scale intrinsic rewards by the running std (may not be the correct way to approximate the advantages)
        total_int_rwd1_list = r1_int_list / np.sqrt(reward_rms1.var)
        total_int_rwd2_list = r2_int_list / np.sqrt(reward_rms2.var)
        dones_list = np.stack(np.expand_dims(dones_list, axis=1)).transpose()
        total_int_rwd1_list = np.stack(total_int_rwd1_list).transpose()
        total_int_rwd2_list = np.stack(total_int_rwd2_list).transpose()
        v1_list = np.stack(v1_list).transpose()
        v2_list = np.stack(v2_list).transpose()
        # print(len(total_int_rwd1_list), len(dones_list), len(v1_list))
        # target_int1, adv1_int = make_train_data_me(total_int_rwd1_list, dones_list, v1_list, 0.999, MC_iter)
        # target_int2, adv2_int = make_train_data_me(total_int_rwd2_list, dones_list, v2_list, 0.999, MC_iter)
        # print(target_int1, adv1_int)

        if mode == 'intrinsic':
            # Equivalent to (r_int - running mean) / running std, since total_int_rwd* is already divided by the std
            adv1_int = total_int_rwd1_list - reward_rms1.mean / np.sqrt(reward_rms1.var)
            adv2_int = total_int_rwd2_list - reward_rms2.mean / np.sqrt(reward_rms2.var)
        else:
            adv1_int = 0
            adv2_int = 0
        # print("adv1_int=", adv1_int)

        if epi_iter % 1 == 0:
            train_curve.append(acc_r / MC_iter)
            print('Episode', epi_iter, 'reward', acc_r / MC_iter)
        agent.train(o1_list, a1_list, pi_a1_list, o2_list, a2_list, pi_a2_list,
                    r_list, adv1_int, adv2_int)

    # plt.plot(train_curve, linewidth=1, label='COMA')
    # plt.show()
    env.reset()
    return train_curve
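# The training loop above relies on RunningMeanStd and RewardForwardFilter helpers whose
# definitions are not shown in this snippet. The sketch below is a minimal, hedged
# reconstruction of the interface the loop assumes (update, update_from_moments, .mean,
# .var), in the style of the RND reference implementation; the project's actual classes
# may differ.
import numpy as np


class RunningMeanStd:
    """Tracks a running mean/variance via batched (parallel) moment updates."""

    def __init__(self, epsilon=1e-4, shape=()):
        self.mean = np.zeros(shape, dtype=np.float64)
        self.var = np.ones(shape, dtype=np.float64)
        self.count = epsilon

    def update(self, x):
        x = np.asarray(x, dtype=np.float64)
        self.update_from_moments(x.mean(axis=0), x.var(axis=0), x.shape[0])

    def update_from_moments(self, batch_mean, batch_var, batch_count):
        # Combine the stored moments with the batch moments (parallel variance formula)
        delta = batch_mean - self.mean
        tot_count = self.count + batch_count
        new_mean = self.mean + delta * batch_count / tot_count
        m_a = self.var * self.count
        m_b = batch_var * batch_count
        m2 = m_a + m_b + np.square(delta) * self.count * batch_count / tot_count
        self.mean, self.var, self.count = new_mean, m2 / tot_count, tot_count


class RewardForwardFilter:
    """Keeps a running discounted sum of (intrinsic) rewards."""

    def __init__(self, gamma):
        self.rewems = None
        self.gamma = gamma

    def update(self, rews):
        if self.rewems is None:
            self.rewems = rews
        else:
            self.rewems = self.rewems * self.gamma + rews
        return self.rewems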
from env_FindGoals import EnvFindGoals
import random

if __name__ == '__main__':
    env = EnvFindGoals()
    max_iter = 10000
    for i in range(max_iter):
        print("iter= ", i)
        action_list = [random.randint(0, 4), random.randint(0, 4)]
        reward_list, done = env.step(action_list)
        print(env.agt1_pos, env.agt2_pos)
        env.render()
# Tail of the COMA training step: per-agent actor (policy-gradient) updates, where
# A2_list holds the advantage estimates and pi_a2_list the per-step action distributions.
a1_loss = -a1_loss / T
a1_optimizer.zero_grad()
a1_loss.backward()
a1_optimizer.step()

a2_loss = torch.FloatTensor([0.0])
for t in range(T):
    a2_loss = a2_loss + A2_list[t].item() * torch.log(pi_a2_list[t][0, a2_list[t]])
a2_loss = -a2_loss / T
a2_optimizer.zero_grad()
a2_loss.backward()
a2_optimizer.step()


if __name__ == '__main__':
    torch.set_num_threads(1)
    env = EnvFindGoals()
    max_epi_iter = 1000
    max_MC_iter = 200
    agent = COMA(N_action=5)
    train_curve = []
    for epi_iter in range(max_epi_iter):
        env.reset()
        o1_list = []
        a1_list = []
        pi_a1_list = []
        o2_list = []
        a2_list = []
        pi_a2_list = []
        r_list = []
        acc_r = 0
        for MC_iter in range(max_MC_iter):
def game(mode):
    torch.set_num_threads(1)
    env = EnvFindGoals()
    max_epi_iter = 1101
    max_MC_iter = 200
    N_action = 5
    agent = COMA(N_action=5)
    train_curve = []
    warm_up_run = 100
    obs1_norm = RunningMeanStd()
    obs2_norm = RunningMeanStd()
    reward_rms1 = RunningMeanStd()
    reward_rms2 = RunningMeanStd()
    discounted_reward1 = RewardForwardFilter(gamma=0.999)
    discounted_reward2 = RewardForwardFilter(gamma=0.999)
    env.reset()
    o1_list = []
    o2_list = []
    intrinsic_surprise_loss = nn.MSELoss()
    loss_intrinsic_rms = RunningMeanStd()

    # Warm-up: pre-train the single-agent forward models and initialise the observation normalizers
    for i in range(warm_up_run):
        obs1 = env.get_agt1_obs()
        obs2 = env.get_agt2_obs()

        # Agent 1 moves randomly while agent 2 stands still
        env.reset()
        for _ in range(1000):
            action1 = np.random.randint(0, N_action)
            action2 = 4  # np.random.randint(0, N_action)
            [reward_1, reward_2], done = env.step([action1, action2])
            obs_all = env.get_full_obs()
            next_obs = agent.get_next_state(obs_all, action1, "1")
            # print("next_obs=", next_obs.shape)
            # print("_", img_to_tensor(obs_all).unsqueeze(0).detach().shape)
            loss1 = agent.pretrain_loss1(
                img_to_tensor(obs_all).unsqueeze(0).detach(), next_obs)
            agent.pretrain_train(loss1, "1")
            # print("loss1", loss1)

        # Agent 2 moves randomly while agent 1 stands still
        env.reset()
        for _ in range(1000):
            action1 = 4  # np.random.randint(0, N_action)
            action2 = np.random.randint(0, N_action)
            [reward_1, reward_2], done = env.step([action1, action2])
            obs_all = env.get_full_obs()
            next_obs = agent.get_next_state(obs_all, action2, "2")
            # print("next_obs=", next_obs.shape)
            # print("_", img_to_tensor(obs_all).unsqueeze(0).detach().shape)
            loss2 = agent.pretrain_loss2(
                img_to_tensor(obs_all).unsqueeze(0).detach(), next_obs)
            agent.pretrain_train(loss2, "2")
            # print("loss2", loss2)

        print("Pre-training is ", i * 100 / warm_up_run, "percent complete")
        o1_list.append(obs1)
        o2_list.append(obs2)
    obs1_norm.update(o1_list)
    obs2_norm.update(o2_list)
    # print("obs1=", obs1)
    # print("obs2=", obs2)

    for epi_iter in range(max_epi_iter):
        env.reset()
        o1_list = []
        a1_list = []
        pi_a1_list = []
        o2_list = []
        a2_list = []
        pi_a2_list = []
        r_list = []
        r1_int_list = []
        r2_int_list = []
        total_int_rwd1_list = []
        total_int_rwd2_list = []
        adv1_int = []
        adv2_int = []
        dones_list = []
        v1_list, v2_list = [], []
        acc_r = 0
        obs_all_s = 0
        pred_s_list = []
        pred_s_dat_list = []
        pre_train_state_list = []
        loss_intrinsic_surprise_list = []

        for MC_iter in range(max_MC_iter):
            # env.render()
            obs1 = env.get_agt1_obs()
            obs2 = env.get_agt2_obs()
            o1_list.append(obs1)
            o2_list.append(obs2)
            obs_all_s = env.get_full_obs()
            # print("obs_all_s =", obs_all_s.shape)
            # print("intrinsic reward =", agent.compute_intrinsic_reward1(obs1))

            action1, pi_a1, action2, pi_a2, v1, v2 = agent.get_action(
                ((obs1 - obs1_norm.mean) / obs1_norm.var).clip(-5, 5),
                ((obs2 - obs2_norm.mean) / obs2_norm.var).clip(-5, 5))

            # Joint forward-model prediction of the next full observation
            pred_next_state = agent.get_next_all_state(obs_all_s, action1, action2)
            # Pre-trained single-agent forward models applied sequentially
            pre_train_next_state = agent.get_next_state(obs_all_s, action1, "1")  # agent.pretrain_agent1.forward1(obs_all_s, action1)
            # print("pre_train_next_state", pre_train_next_state.shape)
            pre_train_next_state2 = agent.pretrain_agent2.forward2(pre_train_next_state, action2)
            pre_train_state_list.append(pre_train_next_state2)
            # print("pred_next_state =", pred_next_state.shape)
            pred_s_list.append(pred_next_state)

            int_rwd1 = agent.compute_intrinsic_reward1(
                ((obs1 - obs1_norm.mean) / obs1_norm.var).clip(-5, 5))
            int_rwd2 = agent.compute_intrinsic_reward2(
                ((obs2 - obs2_norm.mean) / obs2_norm.var).clip(-5, 5))

            # Surprise: disagreement between the joint model and the composed single-agent models
            intrinsic_surprise = intrinsic_surprise_loss(
                pred_next_state, pre_train_next_state2).data.numpy()
            loss_intrinsic_surprise_list.append([intrinsic_surprise])
            # print("intrinsic_surprise=", intrinsic_surprise)
            # print("int_rwd1 =", int_rwd1)
            # print("int_rwd2 =", int_rwd2)
            v1_list.append(v1.data.numpy()[0])
            v2_list.append(v2.data.numpy()[0])
            # print(v1.data.numpy()[0])
            r1_int_list.append(int_rwd1)
            r2_int_list.append(int_rwd2)
            a1_list.append(action1)
            pi_a1_list.append(pi_a1)
            a2_list.append(action2)
            pi_a2_list.append(pi_a2)

            [reward_1, reward_2], done = env.step([action1, action2])
            obs_all_s = env.get_full_obs()
            obs_all_s = img_to_tensor(obs_all_s).unsqueeze(0).detach()
            # print("obs_all_s =", obs_all_s.shape)
            pred_s_dat_list.append(obs_all_s)
            # print(reward_1, reward_2)
            dones_list.append(1 if done else 0)

            sum_r = reward_1 + reward_2
            if reward_2 > 0 or reward_1 > 0:
                print("reward_1=", reward_1)
                print("reward_2=", reward_2)
                print("MC_iter", MC_iter)
            r_list.append(sum_r)
            # Test using reward_2 instead:
            # acc_r = acc_r + reward_2
            # r_list.append(reward_2)
            acc_r = acc_r + sum_r
            if done:
                break

        print("acc_r=", acc_r)
        obs1_norm.update(o1_list)
        obs2_norm.update(o2_list)
        loss_intrinsic_rms.update(loss_intrinsic_surprise_list)
        """
        for i in reversed(r1_int_list):
            r1_int_temp = discounted_reward1.update(i)
        for j in reversed(r2_int_list):
            r2_int_temp = discounted_reward2.update(j)
        """
        # print("mean =", reward_rms1.mean)
        mean1, std1, count1 = np.mean(r1_int_list), np.std(r1_int_list), len(r1_int_list)
        mean2, std2, count2 = np.mean(r2_int_list), np.std(r2_int_list), len(r2_int_list)
        # mean1, std1, count1 = np.mean(r1_int_temp), np.std(r1_int_temp), len(r1_int_temp)
        # mean2, std2, count2 = np.mean(r2_int_temp), np.std(r2_int_temp), len(r2_int_temp)
        reward_rms1.update_from_moments(mean1, std1 ** 2, count1)
        reward_rms2.update_from_moments(mean2, std2 ** 2, count2)
        # print("var =", reward_rms1.var)
        # adv1_int = (r1_int_list - reward_rms1.mean) / np.sqrt(reward_rms1.var)
        # adv2_int = (r2_int_list - reward_rms2.mean) / np.sqrt(reward_rms2.var)

        # Scale intrinsic rewards by the running std (may not be the correct way to approximate the advantages)
        total_int_rwd1_list = r1_int_list / np.sqrt(reward_rms1.var)
        total_int_rwd2_list = r2_int_list / np.sqrt(reward_rms2.var)
        dones_list = np.stack(np.expand_dims(dones_list, axis=1)).transpose()
        total_int_rwd1_list = np.stack(total_int_rwd1_list).transpose()
        total_int_rwd2_list = np.stack(total_int_rwd2_list).transpose()
        v1_list = np.stack(v1_list).transpose()
        v2_list = np.stack(v2_list).transpose()
        # print(len(total_int_rwd1_list), len(dones_list), len(v1_list))
        # target_int1, adv1_int = make_train_data_me(total_int_rwd1_list, dones_list, v1_list, 0.999, MC_iter)
        # target_int2, adv2_int = make_train_data_me(total_int_rwd2_list, dones_list, v2_list, 0.999, MC_iter)
        # print(target_int1, adv1_int)

        if mode == 'intrinsic':
            # Per-agent intrinsic advantages are disabled here; only the surprise advantage is used
            adv1_int = np.zeros((len(r1_int_list), 1))  # (r1_int_list)/np.sqrt(reward_rms1.var)  # total_int_rwd1_list - reward_rms1.mean/np.sqrt(reward_rms1.var)
            adv2_int = np.zeros((len(r1_int_list), 1))  # (r2_int_list)/np.sqrt(reward_rms2.var)  # total_int_rwd2_list - reward_rms2.mean/np.sqrt(reward_rms2.var)
            adv_sur_int = loss_intrinsic_surprise_list / np.sqrt(loss_intrinsic_rms.var)
        else:
            adv1_int = np.zeros((len(r1_int_list), 1))
            adv2_int = np.zeros((len(r2_int_list), 1))
            adv_sur_int = np.zeros((len(r2_int_list), 1))
        # print("adv1_int=", adv1_int)

        if epi_iter % 1 == 0:
            train_curve.append(acc_r)
            print("####################################")
            print('Episode', epi_iter, 'reward', acc_r)
            print("####################################")
        agent.train(o1_list, a1_list, pi_a1_list, o2_list, a2_list, pi_a2_list,
                    r_list, adv1_int, adv2_int, pred_s_list,
                    pred_s_dat_list, adv_sur_int)

    # plt.plot(train_curve, linewidth=1, label='COMA')
    # plt.show()
    env.reset()
    return train_curve
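# A minimal, hedged usage sketch for the surprise-based `game` variant above. The
# 'baseline' mode name is an assumption (any string other than 'intrinsic' zeroes the
# intrinsic/surprise advantages); the plotting mirrors the commented-out matplotlib calls
# in the training loop.
import matplotlib.pyplot as plt

if __name__ == '__main__':
    curve_intrinsic = game('intrinsic')   # surprise advantage enabled
    curve_baseline = game('baseline')     # intrinsic/surprise advantages zeroed out

    plt.plot(curve_intrinsic, linewidth=1, label='COMA + intrinsic surprise')
    plt.plot(curve_baseline, linewidth=1, label='COMA baseline')
    plt.xlabel('episode')
    plt.ylabel('accumulated reward')
    plt.legend()
    plt.show()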
# Soft Q-learning TD update (fragment of the SoftQ agent's training step):
# regress Q(s, a) toward r + gamma * V(s') for the actions taken in the batch.
q = self.soft_q_net.get_Q(state).squeeze(1)
est_q = q.clone()
next_q = self.soft_q_net.get_Q(next_state).squeeze(1)
next_v = self.soft_q_net.get_V(next_q)
for i in range(len(action)):
    est_q[i][action[i]] = reward[i] + self.gamma * next_v[i]
q_loss = F.mse_loss(q, est_q.detach())
self.soft_q_optimizer.zero_grad()
q_loss.backward()
self.soft_q_optimizer.step()


if __name__ == '__main__':
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    env = EnvFindGoals()
    action_dim = 5
    state_dim = 2
    agent = SoftQ(state_dim=2, action_dim=5)
    max_MC_iter = 200
    max_epi_iter = 500
    batch_size = 64
    replay_buffer = ReplayBuffer(10000)
    train_curve = []
    for epi in range(max_epi_iter):
        env.reset()
        acc_reward = 0
        for MC_iter in range(max_MC_iter):
            # print("MC= ", MC_iter)
            env.render()
            state = np.array(env.agt1_pos).reshape((1, 2))
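# In soft Q-learning, get_V is typically the soft (log-sum-exp) state value of a Q-row:
#     V(s) = alpha * log sum_a exp(Q(s, a) / alpha)
# The sketch below shows that computation under the assumption that `q` has shape
# [batch, action_dim] and `alpha` is the temperature; the project's SoftQ network may
# implement get_V differently.
import torch


def soft_value(q, alpha=1.0):
    """Soft state value V(s) = alpha * logsumexp(Q(s, .) / alpha), computed per batch row."""
    return alpha * torch.logsumexp(q / alpha, dim=1)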
if __name__ == '__main__':
    # running GDICE
    alpha = 0.05
    N_agent = 2
    N_s = 20
    N_b = 5  # no bigger than N_s
    N_action = 5
    N_node_list = [10, 10]
    max_opt_iter = 100
    max_MC_iter = 300

    # get obs_dict
    obs_list1 = []
    obs_list2 = []
    env = EnvFindGoals()
    print('start')
    for j in range(100):
        print(j)
        env.reset()
        for i in range(100):
            reward_list, done = env.step([random.randint(0, 4), random.randint(0, 4)])
            obs1 = env.get_obs()[0]
            obs2 = env.get_obs()[1]
            if not is_obs_in_list(obs1, obs_list1):
                obs_list1.append(obs1)
            if not is_obs_in_list(obs2, obs_list2):
                obs_list2.append(obs2)
    print(len(obs_list1))
    print(len(obs_list2))
    obs_dict = [obs_list1, obs_list2]
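# `is_obs_in_list` is referenced above but not defined in this snippet. A minimal sketch,
# assuming observations are numpy arrays that can be compared element-wise:
import numpy as np


def is_obs_in_list(obs, obs_list):
    """Return True if an identical observation already exists in obs_list."""
    return any(np.array_equal(obs, o) for o in obs_list)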