Example #1
import numpy as np
import torch

from env_FindGoals import EnvFindGoals
# COMA, RunningMeanStd and RewardForwardFilter are assumed to be provided by
# the accompanying modules of this repo; they are not defined in this example.


def game(mode):
    torch.set_num_threads(1)
    env = EnvFindGoals()
    max_epi_iter = 1500
    max_MC_iter = 200
    agent = COMA(N_action=5)
    train_curve = []
    warm_up_run = 200
    obs1_norm = RunningMeanStd()
    obs2_norm = RunningMeanStd()
    reward_rms1 = RunningMeanStd()
    reward_rms2 = RunningMeanStd()
    discounted_reward1 = RewardForwardFilter(gamma=0.999)
    discounted_reward2 = RewardForwardFilter(gamma=0.999)
    env.reset()
    o1_list = []
    o2_list = []
    #Normalize Obs for each agent
    for i in range(warm_up_run):
        obs1 = env.get_agt1_obs()
        obs2 = env.get_agt2_obs()
        o1_list.append(obs1)
        o2_list.append(obs2)
    obs1_norm.update(o1_list)
    obs2_norm.update(o2_list)

    for epi_iter in range(max_epi_iter):
        env.reset()
        o1_list = []
        a1_list = []
        pi_a1_list = []
        o2_list = []
        a2_list = []
        pi_a2_list = []
        r_list = []
        r1_int_list = []
        r2_int_list = []
        total_int_rwd1_list = []
        total_int_rwd2_list = []
        adv1_int = []
        adv2_int = []
        dones_list = []
        v1_list, v2_list = [], []
        acc_r = 0

        for MC_iter in range(max_MC_iter):
            #env.render()
            obs1 = env.get_agt1_obs()
            obs2 = env.get_agt2_obs()
            o1_list.append(obs1)
            o2_list.append(obs2)
            #print("obs1 =",obs1)
            #print("intrinsic reward =",agent.compute_intrinsic_reward1(obs1))
            action1, pi_a1, action2, pi_a2, v1, v2 = agent.get_action(
                ((obs1 - obs1_norm.mean) / obs1_norm.var).clip(-5, 5),
                ((obs2 - obs2_norm.mean) / obs2_norm.var).clip(-5, 5))
            int_rwd1 = agent.compute_intrinsic_reward1(
                ((obs1 - obs1_norm.mean) / obs1_norm.var).clip(-5, 5))
            int_rwd2 = agent.compute_intrinsic_reward2(
                ((obs2 - obs2_norm.mean) / obs2_norm.var).clip(-5, 5))
            #print("int_rwd1 =",int_rwd1)
            #print("int_rwd2 =",int_rwd2)
            v1_list.append(v1.data.numpy()[0])
            v2_list.append(v2.data.numpy()[0])
            #print(v1.data.numpy()[0])
            r1_int_list.append(int_rwd1)
            r2_int_list.append(int_rwd2)
            a1_list.append(action1)
            pi_a1_list.append(pi_a1)
            a2_list.append(action2)
            pi_a2_list.append(pi_a2)
            [reward_1, reward_2], done = env.step([action1, action2])
            #print(reward_1, reward_2)
            dones_list.append(int(done))
            acc_r = acc_r + reward_1
            r_list.append(reward_1)
            ###Test using reward_2 instead
            #acc_r = acc_r + reward_2
            #r_list.append(reward_2)

            if done:
                break
        obs1_norm.update(o1_list)
        obs2_norm.update(o2_list)
        """
        for i in reversed(r1_int_list):
            r1_int_temp = discounted_reward1.update(i)
        for j in reversed(r2_int_list):
            r2_int_temp = discounted_reward2.update(j)
        """
        #print("mean =",reward_rms1.mean)
        mean1, std1, count1 = np.mean(r1_int_list), np.std(r1_int_list), len(
            r1_int_list)
        mean2, std2, count2 = np.mean(r2_int_list), np.std(r2_int_list), len(
            r2_int_list)
        #mean1, std1, count1 = np.mean(r1_int_temp), np.std(r1_int_temp), len(r1_int_temp)
        #mean2, std2, count2 = np.mean(r2_int_temp), np.std(r2_int_temp), len(r2_int_temp)
        reward_rms1.update_from_moments(mean1, std1**2, count1)
        reward_rms2.update_from_moments(mean2, std2**2, count2)
        #print("var =",reward_rms1.var)
        #adv1_int = (r1_int_list-reward_rms1.mean)/np.sqrt(reward_rms1.var)
        #adv2_int = (r2_int_list-reward_rms2.mean)/np.sqrt(reward_rms2.var)
        #May be not correct way of approx advantages

        total_int_rwd1_list = r1_int_list / np.sqrt(reward_rms1.var)
        total_int_rwd2_list = r2_int_list / np.sqrt(reward_rms2.var)

        dones_list = np.stack(np.expand_dims(dones_list, axis=1)).transpose()
        total_int_rwd1_list = np.stack(total_int_rwd1_list).transpose()
        total_int_rwd2_list = np.stack(total_int_rwd2_list).transpose()

        v1_list = np.stack(v1_list).transpose()
        v2_list = np.stack(v2_list).transpose()
        #print(len(total_int_rwd1_list),len(dones_list),len(v1_list))
        #target_int1, adv1_int = make_train_data_me(total_int_rwd1_list,dones_list,v1_list,0.999,MC_iter)
        #target_int2, adv2_int = make_train_data_me(total_int_rwd2_list,dones_list,v2_list,0.999,MC_iter)
        #print(target_int1,adv1_int)
        if mode == 'intrinsic':
            adv1_int = total_int_rwd1_list - reward_rms1.mean / np.sqrt(
                reward_rms1.var)
            adv2_int = total_int_rwd2_list - reward_rms2.mean / np.sqrt(
                reward_rms2.var)
        else:
            adv1_int = 0
            adv2_int = 0

        #print("adv1_int=",adv1_int)

        if epi_iter % 1 == 0:
            train_curve.append(acc_r / MC_iter)
        print('Episode', epi_iter, 'reward', acc_r / MC_iter)
        agent.train(o1_list, a1_list, pi_a1_list, o2_list, a2_list, pi_a2_list,
                    r_list, adv1_int, adv2_int)
    #plt.plot(train_curve, linewidth=1, label='COMA')
    #plt.show()
    env.reset()
    return train_curve
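
The game() above relies on RunningMeanStd and RewardForwardFilter, which are not defined in this example. A minimal sketch in the spirit of the OpenAI RND utilities, assuming the repo's own versions behave the same way:

import numpy as np


class RunningMeanStd:
    """Tracks a running mean and variance by merging batch moments."""
    def __init__(self, epsilon=1e-4):
        self.mean = np.zeros(())
        self.var = np.ones(())
        self.count = epsilon

    def update(self, x):
        x = np.asarray(x, dtype=np.float64)
        self.update_from_moments(x.mean(axis=0), x.var(axis=0), x.shape[0])

    def update_from_moments(self, batch_mean, batch_var, batch_count):
        delta = batch_mean - self.mean
        tot_count = self.count + batch_count
        new_mean = self.mean + delta * batch_count / tot_count
        m_a = self.var * self.count
        m_b = batch_var * batch_count
        m_2 = m_a + m_b + np.square(delta) * self.count * batch_count / tot_count
        self.mean, self.var, self.count = new_mean, m_2 / tot_count, tot_count


class RewardForwardFilter:
    """Keeps a discounted running sum of intrinsic rewards."""
    def __init__(self, gamma):
        self.rewems = None
        self.gamma = gamma

    def update(self, rews):
        self.rewems = rews if self.rewems is None else self.rewems * self.gamma + rews
        return self.rewems
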
Example #2
from env_FindGoals import EnvFindGoals
import random

if __name__ == '__main__':
    env = EnvFindGoals()
    max_iter = 10000
    for i in range(max_iter):
        print("iter= ", i)
        action_list = [random.randint(0, 4), random.randint(0, 4)]
        reward_list, done = env.step(action_list)
        print(env.agt1_pos, env.agt2_pos)
        env.render()
        if done:
            env.reset()  # start a fresh episode once the goal is reached
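
For orientation, this is the EnvFindGoals surface that the examples on this page rely on, inferred purely from the call sites shown here; the real implementation lives in env_FindGoals.py.

class EnvFindGoalsInterface:
    """Stub of the environment API as used in these examples (not the real class)."""
    agt1_pos: list  # grid position of agent 1
    agt2_pos: list  # grid position of agent 2

    def reset(self): ...              # restart an episode
    def step(self, action_list): ...  # [a1, a2], actions in 0..4 -> ([reward_1, reward_2], done)
    def get_agt1_obs(self): ...       # local observation of agent 1
    def get_agt2_obs(self): ...       # local observation of agent 2
    def get_full_obs(self): ...       # global observation (fed to the forward models in Example #4)
    def get_obs(self): ...            # [obs1, obs2], used by the GDICE warm-up at the end of Example #4
    def render(self): ...             # draw the grid world
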
Example #3
        a1_loss = -a1_loss / T
        a1_optimizer.zero_grad()
        a1_loss.backward()
        a1_optimizer.step()

        a2_loss = torch.FloatTensor([0.0])
        for t in range(T):
            a2_loss = a2_loss + A2_list[t].item() * torch.log(pi_a2_list[t][0, a2_list[t]])
        a2_loss = -a2_loss / T
        a2_optimizer.zero_grad()
        a2_loss.backward()
        a2_optimizer.step()

if __name__ == '__main__':
    torch.set_num_threads(1)
    env = EnvFindGoals()
    max_epi_iter = 1000
    max_MC_iter = 200
    agent = COMA(N_action=5)
    train_curve = []
    for epi_iter in range(max_epi_iter):
        env.reset()
        o1_list = []
        a1_list = []
        pi_a1_list = []
        o2_list = []
        a2_list = []
        pi_a2_list = []
        r_list = []
        acc_r = 0
        for MC_iter in range(max_MC_iter):
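
The two per-timestep loops above implement the COMA actor update -(1/T) * sum_t A_t * log pi(a_t | o_t) for each agent. The same loss can be written as one vectorized expression; a minimal sketch using the snippet's own list names (the helper name actor_loss is hypothetical):

import torch

def actor_loss(pi_list, a_list, A_list):
    # log-probability of the action actually taken at each timestep t
    log_probs = torch.stack(
        [torch.log(pi_list[t][0, a_list[t]]) for t in range(len(a_list))])
    # counterfactual advantages are treated as constants (no gradient flows through them)
    advantages = torch.tensor([float(A_list[t]) for t in range(len(A_list))])
    # minimise the negative mean of A_t * log pi, i.e. the same -(1/T) * sum as above
    return -(advantages * log_probs).mean()

# e.g. a2_loss = actor_loss(pi_a2_list, a2_list, A2_list)
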
Example #4
import numpy as np
import torch
import torch.nn as nn

from env_FindGoals import EnvFindGoals
# COMA, RunningMeanStd, RewardForwardFilter and img_to_tensor are assumed to be
# provided by the accompanying modules of this repo; they are not defined here.


def game(mode):
    torch.set_num_threads(1)
    env = EnvFindGoals()
    max_epi_iter = 1101
    max_MC_iter = 200
    N_action = 5
    agent = COMA(N_action=N_action)
    train_curve = []
    warm_up_run = 100
    obs1_norm = RunningMeanStd()
    obs2_norm = RunningMeanStd()
    reward_rms1 = RunningMeanStd()
    reward_rms2 = RunningMeanStd()
    discounted_reward1 = RewardForwardFilter(gamma=0.999)
    discounted_reward2 = RewardForwardFilter(gamma=0.999)
    env.reset()
    o1_list = []
    o2_list = []
    intrisic_surprise_loss = nn.MSELoss()
    loss_intrisic_rms = RunningMeanStd()

    #Normalize Obs for each agent
    for i in range(warm_up_run):
        # pretrain agent 1's forward model: agent 1 moves while agent 2 stands still (action 4)
        obs1 = env.get_agt1_obs()
        obs2 = env.get_agt2_obs()
        env.reset()
        for _ in range(1000):
            action1 = np.random.randint(0, N_action)
            action2 = 4  #np.random.randint(0,N_action)
            [reward_1, reward_2], done = env.step([action1, action2])
            obs_all = env.get_full_obs()
            next_obs = agent.get_next_state(obs_all, action1, "1")
            #print("next_obs=",next_obs.shape)
            #print("_",img_to_tensor(obs_all).unsqueeze(0).detach().shape)
            loss1 = agent.pretrain_loss1(
                img_to_tensor(obs_all).unsqueeze(0).detach(), next_obs)
            agent.pretrain_train(loss1, "1")
            #print("loss1",loss1)
        env.reset()
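        # second pretraining pass: agent 2 moves while agent 1 stands still (action 4)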
        for _ in range(1000):
            action1 = 4  #np.random.randint(0,N_action)
            action2 = np.random.randint(0, N_action)
            [reward_1, reward_2], done = env.step([action1, action2])
            obs_all = env.get_full_obs()
            next_obs = agent.get_next_state(obs_all, action2, "2")
            #print("next_obs=",next_obs.shape)
            #print("_",img_to_tensor(obs_all).unsqueeze(0).detach().shape)
            loss2 = agent.pretrain_loss2(
                img_to_tensor(obs_all).unsqueeze(0).detach(), next_obs)
            agent.pretrain_train(loss2, "2")
            #print("loss2",loss2)
        print("Pre-training is ", i * 100 / (warm_up_run), "percent complete")
        o1_list.append(obs1)
        o2_list.append(obs2)

    obs1_norm.update(o1_list)
    obs2_norm.update(o2_list)
    #print("obs1=",obs1)
    #print("obs2=",obs2)

    for epi_iter in range(max_epi_iter):
        env.reset()
        o1_list = []
        a1_list = []
        pi_a1_list = []
        o2_list = []
        a2_list = []
        pi_a2_list = []
        r_list = []
        r1_int_list = []
        r2_int_list = []
        total_int_rwd1_list = []
        total_int_rwd2_list = []
        adv1_int = []
        adv2_int = []
        dones_list = []
        v1_list, v2_list = [], []
        acc_r = 0
        obs_all_s = 0
        pred_s_list = []
        pred_s_dat_list = []
        pre_train_state_list = []
        loss_intrisic_surprise_list = []

        for MC_iter in range(max_MC_iter):
            #env.render()
            obs1 = env.get_agt1_obs()
            obs2 = env.get_agt2_obs()
            o1_list.append(obs1)
            o2_list.append(obs2)
            obs_all_s = env.get_full_obs()
            #print("obs_all_s =",obs_all_s.shape)
            #print("intrinsic reward =",agent.compute_intrinsic_reward1(obs1))
            action1, pi_a1, action2, pi_a2, v1, v2 = agent.get_action(
                ((obs1 - obs1_norm.mean) / obs1_norm.var).clip(-5, 5),
                ((obs2 - obs2_norm.mean) / obs2_norm.var).clip(-5, 5))
            pred_next_state = agent.get_next_all_state(obs_all_s, action1,
                                                       action2)

            pre_train_next_state = agent.get_next_state(
                obs_all_s, action1,
                "1")  #agent.pretrain_agent1.forward1(obs_all_s,action1)
            #print("pre_train_next_state",pre_train_next_state.shape)
            pre_train_next_state2 = agent.pretrain_agent2.forward2(
                pre_train_next_state, action2)
            pre_train_state_list.append(pre_train_next_state2)
            #print("pred_next_state =",pred_next_state.shape)
            pred_s_list.append(pred_next_state)
            int_rwd1 = agent.compute_intrinsic_reward1(
                ((obs1 - obs1_norm.mean) / obs1_norm.var).clip(-5, 5))
            int_rwd2 = agent.compute_intrinsic_reward2(
                ((obs2 - obs2_norm.mean) / obs2_norm.var).clip(-5, 5))
            intrisic_surprise = intrisic_surprise_loss(
                pred_next_state, pre_train_next_state2).data.numpy()
            loss_intrisic_surprise_list.append([intrisic_surprise])
            #print("intrisic_surprise=",intrisic_surprise)
            #print("int_rwd1 =",int_rwd1)
            #print("int_rwd2 =",int_rwd2)
            v1_list.append(v1.data.numpy()[0])
            v2_list.append(v2.data.numpy()[0])
            #print(v1.data.numpy()[0])
            r1_int_list.append(int_rwd1)
            r2_int_list.append(int_rwd2)
            a1_list.append(action1)
            pi_a1_list.append(pi_a1)
            a2_list.append(action2)
            pi_a2_list.append(pi_a2)
            [reward_1, reward_2], done = env.step([action1, action2])
            obs_all_s = env.get_full_obs()
            obs_all_s = img_to_tensor(obs_all_s).unsqueeze(0).detach()
            #print("obs_all_s =",obs_all_s.shape)
            pred_s_dat_list.append(obs_all_s)
            #print(reward_1, reward_2)

            dones_list.append(int(done))
            sum_r = reward_1 + reward_2
            if reward_2 > 0 or reward_1 > 0:
                print("reward_1=", reward_1)
                print("reward_2=", reward_2)
                print("MC_iter", MC_iter)
            r_list.append(sum_r)
            ###Test using reward_2 instead
            #acc_r = acc_r + reward_2
            #r_list.append(reward_2)
            acc_r = acc_r + sum_r
            if done:
                break

        print("acc_r=", acc_r)
        obs1_norm.update(o1_list)
        obs2_norm.update(o2_list)
        loss_intrisic_rms.update(loss_intrisic_surprise_list)
        """
        for i in reversed(r1_int_list):
            r1_int_temp = discounted_reward1.update(i)
        for j in reversed(r2_int_list):
            r2_int_temp = discounted_reward2.update(j)
        """
        #print("mean =",reward_rms1.mean)
        mean1, std1, count1 = np.mean(r1_int_list), np.std(r1_int_list), len(
            r1_int_list)
        mean2, std2, count2 = np.mean(r2_int_list), np.std(r2_int_list), len(
            r2_int_list)
        #mean1, std1, count1 = np.mean(r1_int_temp), np.std(r1_int_temp), len(r1_int_temp)
        #mean2, std2, count2 = np.mean(r2_int_temp), np.std(r2_int_temp), len(r2_int_temp)
        reward_rms1.update_from_moments(mean1, std1**2, count1)
        reward_rms2.update_from_moments(mean2, std2**2, count2)
        #print("var =",reward_rms1.var)
        #adv1_int = (r1_int_list-reward_rms1.mean)/np.sqrt(reward_rms1.var)
        #adv2_int = (r2_int_list-reward_rms2.mean)/np.sqrt(reward_rms2.var)
        #May be not correct way of approx advantages

        total_int_rwd1_list = r1_int_list / np.sqrt(reward_rms1.var)
        total_int_rwd2_list = r2_int_list / np.sqrt(reward_rms2.var)

        dones_list = np.stack(np.expand_dims(dones_list, axis=1)).transpose()
        total_int_rwd1_list = np.stack(total_int_rwd1_list).transpose()
        total_int_rwd2_list = np.stack(total_int_rwd2_list).transpose()

        v1_list = np.stack(v1_list).transpose()
        v2_list = np.stack(v2_list).transpose()
        #print(len(total_int_rwd1_list),len(dones_list),len(v1_list))
        #target_int1, adv1_int = make_train_data_me(total_int_rwd1_list,dones_list,v1_list,0.999,MC_iter)
        #target_int2, adv2_int = make_train_data_me(total_int_rwd2_list,dones_list,v2_list,0.999,MC_iter)
        #print(target_int1,adv1_int)
        if mode == 'intrinsic':
            adv1_int = np.zeros(
                (len(r1_int_list), 1)
            )  #(r1_int_list)/np.sqrt(reward_rms1.var)#total_int_rwd1_list - reward_rms1.mean/np.sqrt(reward_rms1.var)
            adv2_int = np.zeros(
                (len(r2_int_list), 1)
            )  #(r2_int_list)/np.sqrt(reward_rms2.var)#total_int_rwd2_list - reward_rms2.mean/np.sqrt(reward_rms2.var)
            adv_sur_int = loss_intrisic_surprise_list / np.sqrt(
                loss_intrisic_rms.var)
        else:
            adv1_int = np.zeros((len(r1_int_list), 1))
            adv2_int = np.zeros((len(r2_int_list), 1))
            adv_sur_int = np.zeros((len(r2_int_list), 1))

        #print("adv1_int=",adv1_int)

        if epi_iter % 1 == 0:
            train_curve.append(acc_r)
        print("####################################")
        print('Episode', epi_iter, 'reward', acc_r)
        print("####################################")
        agent.train(o1_list, a1_list, pi_a1_list, o2_list, a2_list, pi_a2_list,
                    r_list, adv1_int, adv2_int, pred_s_list, pred_s_dat_list,
                    adv_sur_int)
    #plt.plot(train_curve, linewidth=1, label='COMA')
    #plt.show()
    env.reset()
    return train_curve
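
The warm-up above calls agent.get_next_state, agent.pretrain_loss1/2 and agent.pretrain_train, which are not shown in this example. A minimal sketch of the forward-model pretraining step those calls suggest, assuming a flattened observation vector and hypothetical names (the real methods live in the repo's COMA class):

import torch
import torch.nn as nn


class ForwardModel(nn.Module):
    """Predicts the next full observation from the current one plus one agent's action."""
    def __init__(self, obs_dim, n_action):
        super().__init__()
        self.n_action = n_action
        self.net = nn.Sequential(
            nn.Linear(obs_dim + n_action, 128), nn.ReLU(),
            nn.Linear(128, obs_dim))

    def forward(self, obs, action):
        # one-hot encode the discrete action and concatenate it with the observation
        one_hot = torch.zeros(obs.shape[0], self.n_action)
        one_hot[:, action] = 1.0
        return self.net(torch.cat([obs, one_hot], dim=1))


def pretrain_step(model, optimizer, obs, action, next_obs):
    """One supervised update: regress the predicted next observation onto the observed one."""
    pred = model(obs, action)
    loss = nn.functional.mse_loss(pred, next_obs)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss.item()

The "intrinsic surprise" in the main loop is then the MSE between the jointly trained model's prediction and the composition of the two single-agent pretrained models.
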
        q = self.soft_q_net.get_Q(state).squeeze(1)
        est_q = q.clone()
        next_q = self.soft_q_net.get_Q(next_state).squeeze(1)
        next_v = self.soft_q_net.get_V(next_q)
        for i in range(len(action)):
            est_q[i][action[i]] = reward[i] + self.gamma * next_v[i]
        q_loss = F.mse_loss(q, est_q.detach())
        self.soft_q_optimizer.zero_grad()
        q_loss.backward()
        self.soft_q_optimizer.step()


if __name__ == '__main__':
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    env = EnvFindGoals()
    action_dim = 5
    state_dim = 2
    agent = SoftQ(state_dim=state_dim, action_dim=action_dim)
    max_MC_iter = 200
    max_epi_iter = 500
    batch_size = 64
    replay_buffer = ReplayBuffer(10000)
    train_curve = []
    for epi in range(max_epi_iter):
        env.reset()
        acc_reward = 0
        for MC_iter in range(max_MC_iter):
            # print("MC= ", MC_iter)
            env.render()
            state = np.array(env.agt1_pos).reshape((1, 2))
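
get_Q and get_V above belong to the repo's SoftQ implementation, which is not shown here. In standard soft Q-learning the state value is the log-sum-exp of the Q-values; a minimal sketch of what such a get_V typically computes (an assumption, not the repo's code):

import torch

def soft_value(q_values, alpha=1.0):
    """Soft state value V(s) = alpha * log sum_a exp(Q(s, a) / alpha), per row of q_values."""
    return alpha * torch.logsumexp(q_values / alpha, dim=-1)
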
if __name__ == '__main__':
    # running GDICE
    alpha = 0.05
    N_agent = 2
    N_s = 20
    N_b = 5     # no bigger than N_s
    N_action = 5
    N_node_list = [10, 10]
    max_opt_iter = 100
    max_MC_iter = 300

    # get obs_dict
    obs_list1 = []
    obs_list2 = []
    env = EnvFindGoals()
    print('start')
    for j in range(100):
        print(j)
        env.reset()
        for i in range(100):
            reward_list, done = env.step([random.randint(0, 4), random.randint(0, 4)])
            obs1 = env.get_obs()[0]
            obs2 = env.get_obs()[1]
            if not is_obs_in_list(obs1, obs_list1):
                obs_list1.append(obs1)
            if not is_obs_in_list(obs2, obs_list2):
                obs_list2.append(obs2)
        print(len(obs_list1))
        print(len(obs_list2))
    obs_dict = [obs_list1, obs_list2]
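
is_obs_in_list is used in the warm-up above but not defined in the snippet; a minimal version consistent with how it is called here (an assumed helper, not necessarily the repo's own):

import numpy as np

def is_obs_in_list(obs, obs_list):
    """True if an observation equal to obs is already stored in obs_list."""
    return any(np.array_equal(obs, seen) for seen in obs_list)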