Example No. 1
    ep_len += 1
    buff.add(obs, p, reward, next_obs, terminated)  # store the transition in the replay buffer
    obs = next_obs

    # start a new episode when the environment terminates or the length cap is reached
    if terminated or ep_len == max_ep_len:
        obs = env.reset()
        terminated = False
        ep_len = 0

    # periodically evaluate the current policy
    if setps % 10000 == 0:
        print(test_agent())

    # skip training until the buffer has warmed up, then only train every 50 environment steps
    if setps < 1000 or setps % 50 != 0:
        continue

    for e in range(50):
        # sample a minibatch and unpack it into observation, next-observation and per-agent action arrays
        batch = buff.getBatch(batch_size)
        for j in range(batch_size):
            X[j] = batch[j][0]       # observation
            next_X[j] = batch[j][3]  # next observation
            for i in range(n_ant):
                A[i][j] = batch[j][1][i]  # action of agent i

        # one-step TD target from the target joint Q-network: r + gamma * Q_tot_tar(s') * (1 - done)
        Q_target = agents.Q_tot_tar.predict(next_X, batch_size=batch_size)
        for j in range(batch_size):
            Q_target[j] = batch[j][2] + Q_target[j] * gamma * (1 - batch[j][4])

        agents.train_critic(X, A, Q_target)  # fit the critic to the TD targets
        agents.train_actors(X)               # policy improvement step
        agents.update()                      # update the target networks
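
This fragment assumes a replay buffer object `buff` whose definition is not shown. A minimal sketch of a buffer consistent with how Example No. 1 indexes each sampled transition (`batch[j][0]` observation, `[1]` joint action, `[2]` reward, `[3]` next observation, `[4]` done flag) could look like the following; the class name, capacity, and uniform sampling are assumptions, not code from the source.

import random
from collections import deque

class ReplayBuffer:
    """Minimal FIFO transition store matching the buff.add / buff.getBatch calls above."""
    def __init__(self, capacity=100000):
        self.data = deque(maxlen=capacity)

    def add(self, obs, actions, reward, next_obs, done):
        # store one transition as a tuple, in the same order the training loop indexes it
        self.data.append((obs, actions, reward, next_obs, done))

    def getBatch(self, batch_size):
        # uniform random minibatch; returns a list of (obs, actions, reward, next_obs, done) tuples
        return random.sample(self.data, min(batch_size, len(self.data)))

Note that the buffer in Example No. 2 behaves differently: it also exposes `pointer` and `reset()`, and its `getBatch` returns already-unpacked arrays including per-step discount factors (`gammas`), so this sketch only matches the first example.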
Example No. 2
	sum_reward += reward
	setps += 1
	buff.add(obs, p, reward, next_obs, terminated)  # store the transition in the buffer
	obs = next_obs

	if terminated:
		obs = env.reset()
		terminated = False
		reward_list.append(sum_reward)  # record the finished episode's return
		sum_reward = 0

		# train only once the buffer has been filled
		if buff.pointer > buffer_size:
			print(np.mean(reward_list))  # mean episode return since the last update
			reward_list = []

			# critic updates on sampled minibatches with one-step TD targets
			for k in range(num_ite):
				states, actions, returns, next_states, dones, gammas = buff.getBatch(mini_batch)
				Q_target = agents.compute_target([next_states])[0]
				Q_target = returns + Q_target * gammas * (1 - dones)
				agents.train_critic(states, actions, Q_target)
				agents.update()  # update the target networks

			# actor update on a large batch using (optionally normalized) advantages
			states, actions, returns, next_states, dones, gammas = buff.getBatch(2000)
			advantages = agents.compute_advantage([states] + [actions[i] for i in range(n_ant)])
			if advantage_norm:
				for i in range(n_ant):
					# standardize each agent's advantages for more stable policy gradients
					advantages[i] = (advantages[i] - advantages[i].mean()) / (advantages[i].std() + 1e-8)
			agents.train_actors(states, actions, advantages)

			buff.reset()  # discard the collected data after the actor update
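
Both examples are loop bodies: the enclosing interaction loop that samples an action, steps the environment, and supplies `obs`, `p`, `reward`, `next_obs`, and `terminated` is not shown. A hedged sketch of such an outer loop for Example No. 2, assuming a classic Gym-style `env.step` that returns a 4-tuple and a hypothetical `agents.get_action(obs)` helper (neither appears in the source):

obs = env.reset()
terminated = False
sum_reward = 0
setps = 0
reward_list = []

for _ in range(max_steps):  # max_steps is an assumed training budget
    p = agents.get_action(obs)  # hypothetical helper: joint action for all n_ant agents
    next_obs, reward, terminated, info = env.step(p)  # classic Gym API (4-tuple) assumed

    # ... body of Example No. 2 goes here ...

The exact action-selection call and the environment API depend on the surrounding project, so treat this only as a skeleton showing where the fragment fits.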