Example #1
				r_p = p_z[signal[i]]
				ep_rewards[i].append(r_p)

		# Every T steps: convert the buffered rollout to arrays and update each
		# agent's critic and policy for its current signal.
		if steps % T == 0:
			for i in range(n_agent):
				meta_rewards[i].append(utili[i] / (0.1 + abs(rat[i])))
				ep_actions[i] = np.array(ep_actions[i])
				ep_rewards[i] = np.array(ep_rewards[i], dtype=np.float64)
				ep_states[i] = np.array(ep_states[i])
				if LAMBDA < -0.1:
					# A negative LAMBDA selects plain discounted Monte-Carlo returns.
					targets = discount_rewards(ep_rewards[i], GAMMA)
					V[i][signal[i]].update(ep_states[i], targets)
					vs = V[i][signal[i]].get(ep_states[i])
				else:
					# Otherwise use lambda-return targets bootstrapped from the next observation.
					vs = V[i][signal[i]].get(ep_states[i])
					targets = eligibility_traces(ep_rewards[i], vs, V[i][signal[i]].get(copy.deepcopy([obs[i]])), GAMMA, LAMBDA)
					V[i][signal[i]].update(ep_states[i], targets)
				# Standardized advantages for the policy-gradient update.
				ep_advantages = targets - vs
				ep_advantages = (ep_advantages - np.mean(ep_advantages)) / (np.std(ep_advantages) + 1e-10)
				Pi[i][signal[i]].update(ep_states[i], ep_actions[i], ep_advantages)

			# Reset the rollout buffers.
			ep_actions = [[] for _ in range(n_agent)]
			ep_rewards = [[] for _ in range(n_agent)]
			ep_states = [[] for _ in range(n_agent)]
		
		if render:
			env.render()

	for i in range(n_agent):
		if len(meta_rewards[i]) == 0:
			continue
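
The helpers discount_rewards and eligibility_traces are not included in this excerpt. Below is a minimal sketch of what they are assumed to compute, matching the call signatures above (rewards, per-step values, a bootstrap value, GAMMA, LAMBDA): discounted Monte-Carlo returns and forward-view lambda-return targets, respectively.

# Sketch of the assumed behaviour; the original project may implement these differently.
import numpy as np

def discount_rewards(rewards, gamma):
    # Discounted Monte-Carlo returns: G_t = r_t + gamma * G_{t+1}.
    returns = np.zeros(len(rewards))
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        returns[t] = running
    return returns

def eligibility_traces(rewards, values, bootstrap_value, gamma, lam):
    # Forward-view TD(lambda) targets (lambda-returns), bootstrapped from the
    # value of the observation that follows the T-step rollout:
    #     G_t = r_t + gamma * ((1 - lam) * V(s_{t+1}) + lam * G_{t+1})
    values = np.asarray(values, dtype=np.float64).reshape(-1)
    v_boot = float(np.asarray(bootstrap_value).reshape(-1)[0])
    n = len(rewards)
    targets = np.zeros(n)
    g = v_boot
    for t in reversed(range(n)):
        v_next = values[t + 1] if t + 1 < n else v_boot
        g = rewards[t] + gamma * ((1.0 - lam) * v_next + lam * g)
        targets[t] = g
    return targets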
Example #2
                # Per-agent input to the centralized meta_Q: the joint state,
                # a one-hot agent index, and the other agents' one-hot actions.
                meta_state_s = np.concatenate(
                    (meta_state, to_categorical([i] * T,
                                                n_agent), others_action),
                    axis=1)
                amstate.append(meta_state_s)
                qsall = meta_Q.get(meta_state_s)  # Q-values over agent i's actions
                allqsa.append(qsall)
                qsa = (qsall * ep_actions[i]).sum(axis=-1)  # Q of the taken action

                # The bootstrap input for the next step, built the same way.
                nothers_action = next_action[rmmyindex, :].reshape(-1)
                next_meta_state_s = np.concatenate(
                    (mobs, to_categorical(i, n_agent), nothers_action))
                next_qsall = meta_Q.get([next_meta_state_s])
                next_qsa = (next_qsall * next_action[i]).sum(axis=-1)

                # Lambda-return targets for agent i's Q-values.
                ltarget = eligibility_traces(ep_rewards[i], qsa, next_qsa,
                                             GAMMA, LAMBDA)
                targets.append(ltarget)

            # Stack the per-agent rollouts into one (T * n_agent) batch and fit meta_Q.
            targets = np.array(targets).transpose()
            amstate = np.array(amstate)
            s, a, t = [], [], []
            for i in range(n_agent):
                s.append(amstate[i])
                a.append(ep_actions[i])
                t.append(targets[:, i])
            s = np.array(s).reshape((T * n_agent, -1))
            a = np.array(a).reshape((T * n_agent, -1))
            t = np.array(t).reshape((T * n_agent, -1))
            meta_Q.update(s, a, t[:, 0])

            # compute counterfactual
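
For reference, the shape bookkeeping behind the concatenation above. The sizes below are hypothetical placeholders; only the layout (joint state, one-hot agent index, other agents' one-hot actions) reflects the snippet.

import numpy as np
from tensorflow.keras.utils import to_categorical

# Hypothetical sizes, purely to illustrate the concatenated layout.
T, n_agent, state_dim, n_action = 4, 3, 6, 5
meta_state = np.random.rand(T, state_dim)                    # joint state, one row per step
others_action = np.random.rand(T, (n_agent - 1) * n_action)  # other agents' one-hot actions, flattened

i = 0
agent_id = to_categorical([i] * T, n_agent)                  # (T, n_agent) one-hot agent index
meta_state_s = np.concatenate((meta_state, agent_id, others_action), axis=1)
# meta_state_s.shape == (T, state_dim + n_agent + (n_agent - 1) * n_action)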
Example #3
        for i in range(n_agent):
            ep_rewards[i].append(rewards[i])

        if steps % T == 0:
            # Flush the T-step rollout buffers and update each agent's single
            # critic V[i] and policy Pi[i] (no per-signal index here).
            for i in range(n_agent):
                ep_actions[i] = np.array(ep_actions[i])
                ep_rewards[i] = np.array(ep_rewards[i], dtype=np.float64)
                ep_states[i] = np.array(ep_states[i])
                if LAMBDA < -0.1:
                    targets = discount_rewards(ep_rewards[i], GAMMA)
                    V[i].update(ep_states[i], targets)
                    vs = V[i].get(ep_states[i])
                else:
                    vs = V[i].get(ep_states[i])
                    targets = eligibility_traces(
                        ep_rewards[i], vs, V[i].get(copy.deepcopy([obs[i]])),
                        GAMMA, LAMBDA)
                    V[i].update(ep_states[i], targets)
                ep_advantages = targets - vs
                ep_advantages = (ep_advantages - np.mean(ep_advantages)) / (
                    np.std(ep_advantages) + 1e-10)
                Pi[i].update(ep_states[i], ep_actions[i], ep_advantages)

            # Reset the rollout buffers.
            ep_actions = [[] for _ in range(n_agent)]
            ep_rewards = [[] for _ in range(n_agent)]
            ep_states = [[] for _ in range(n_agent)]

        if render:
            env.render()

    print(i_episode)
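
The advantage step shared by these rollout updates can be read as one small function; standardized_advantages below is a hypothetical name, not part of the original code.

import numpy as np

def standardized_advantages(targets, values, eps=1e-10):
    # Advantage = return target minus the critic baseline, standardized per
    # rollout so the policy-gradient update has roughly unit scale.
    adv = np.asarray(targets, dtype=np.float64) - np.asarray(values, dtype=np.float64)
    return (adv - adv.mean()) / (adv.std() + eps)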
Example #4
                        vs = V[i].get(ep_states[i])
                    else:
                        gV[i].update(ep_states[i], targets)
                        vs = gV[i].get(ep_states[i])
                else:
                    # Bootstrapped lambda-return targets: build the next state,
                    # augmented with extra features for non-greedy agents.
                    next_s = copy.deepcopy(obs[i])
                    if not greedy[i]:
                        vs = V[i].get(ep_states[i])
                        more_obs = gPi[i].get_dist(np.array([obs[i]]))[0]
                        next_s.extend(more_obs)
                        more_obs = get_more_obs_com(True, neighbors,
                                                    average_jpi, i,
                                                    more_obs_size)
                        next_s.extend(more_obs)
                        targets = eligibility_traces(ep_rewards[i], vs,
                                                     V[i].get([next_s]), GAMMA,
                                                     LAMBDA)
                        V[i].update(ep_states[i], targets)
                    else:
                        # Greedy agents use the separate critic gV[i].
                        vs = gV[i].get(ep_states[i])
                        targets = eligibility_traces(ep_rewards[i], vs,
                                                     gV[i].get([next_s]),
                                                     GAMMA, LAMBDA)
                        gV[i].update(ep_states[i], targets)

                # Standardized advantages, collected across all agents.
                ep_advantages = targets - vs
                ep_advantages = (ep_advantages - np.mean(ep_advantages)) / (
                    np.std(ep_advantages) + 1e-10)
                all_ep_advantages.append(ep_advantages)
            all_ep_advantages = np.array(all_ep_advantages)
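
gPi[i].get_dist and get_more_obs_com are project-specific helpers not shown in this excerpt, so the sketch below only illustrates how the bootstrap input for the non-greedy branch is assembled; the feature vectors are placeholders.

import copy

obs_i = [0.2, 0.4, 0.6]          # agent i's next observation (placeholder values)
policy_features = [0.5, 0.5]     # stands in for gPi[i].get_dist(np.array([obs[i]]))[0]
comm_features = [1.0, 0.0, 0.0]  # stands in for get_more_obs_com(...)

next_s = copy.deepcopy(obs_i)
next_s.extend(policy_features)   # append the extra policy features
next_s.extend(comm_features)     # append the communicated neighbor features
bootstrap_batch = [next_s]       # the [next_s] list passed to V[i].get(...) above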
Example #5
                meta_state_s = np.concatenate(
                    (meta_state, to_categorical([i] * T,
                                                n_agent), others_action),
                    axis=1)
                amstate.append(meta_state_s)
                qsall = meta_Q.get(meta_state_s)
                allqsa.append(qsall)
                qsa = (qsall * ep_actions[i]).sum(axis=-1)

                nothers_action = next_action[rmmyindex, :].reshape(-1)
                next_meta_state_s = np.concatenate(
                    (mobs, to_categorical(i, n_agent), nothers_action))
                next_qsall = meta_Q.get([next_meta_state_s])
                next_qsa = (next_qsall * next_action[i]).sum(axis=-1)

                # Same construction as in the previous example, but the targets
                # are computed from the shared meta reward rather than the
                # per-agent rollout rewards.
                ltarget = eligibility_traces(meta_rewards, qsa, next_qsa,
                                             GAMMA, LAMBDA)
                targets.append(ltarget)

            targets = np.array(targets).transpose()
            amstate = np.array(amstate)
            s, a, t = [], [], []
            for i in range(n_agent):
                s.append(amstate[i])
                a.append(ep_actions[i])
                t.append(targets[:, i])
            s = np.array(s).reshape((T * n_agent, -1))
            a = np.array(a).reshape((T * n_agent, -1))
            t = np.array(t).reshape((T * n_agent, -1))
            meta_Q.update(s, a, t[:, 0])

            # compute counterfactual
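
The excerpt ends at the counterfactual step. One common way to fill it in (an assumption, not necessarily what the original code does) is a COMA-style baseline that marginalizes the per-action Q-values already collected in allqsa over agent i's own policy:

import numpy as np

def counterfactual_advantage(qsall, action_onehot, pi_probs):
    # qsall:         (T, n_action) Q-values for each of agent i's actions,
    #                with the other agents' actions held fixed (cf. allqsa above)
    # action_onehot: (T, n_action) one-hot action actually taken
    # pi_probs:      (T, n_action) agent i's policy probabilities (assumed available)
    qsa = (qsall * action_onehot).sum(axis=-1)   # Q of the taken action
    baseline = (qsall * pi_probs).sum(axis=-1)   # expected Q under agent i's policy
    return qsa - baseline                        # counterfactual advantage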