# Single-agent training loop: roll out `num_runs` episodes, accumulate the
# per-episode return, store transitions in the agent's replay buffer, and
# run a learning step once the buffer is warm.
# Fixes: removed the unused `vel = np.zeros(num_steps)` allocation that was
# rebuilt every episode and never read, and deleted commented-out dead code.
for i in range(num_runs):
    print('episode is:', i)
    ret = 0        # cumulative (undiscounted) return for this episode
    ret_list = []  # per-step rewards, in order
    state = env.reset()
    aset = []      # actions taken this episode
    for j in range(num_steps):
        # Single controller: query the agent for one action on the current state.
        a = agent.choose_action([state])
        aset.append(a)
        next_state, reward, done, _ = env.step(a)
        ret += reward
        ret_list.append(reward)
        # Warm-up gate: only learn once the replay buffer holds enough samples.
        if agent.num_experience > 2000:
            ploss, qloss, reg_loss, Q = agent.learn(batch_size=32)
        if done:
            # Terminal transition is stored with done-flag 1.0, then the
            # episode ends.
            agent.remember(state, a, reward, next_state, 1.)
            break
        agent.remember(state, a, reward, next_state, 0.)
        # Shallow copy so the stored `state` cannot alias a buffer the env
        # mutates on the next step.
        state = next_state[:]
# Multi-agent step: run the (TensorFlow) policy once for the joint state,
# fan the resulting action batch out to a per-agent dict, step the env,
# and store the joint transition in the replay buffer.
# NOTE(review): this fragment reads `state_`, `state`, `adj`, `vec`, `ret`,
# `ret_list`, and `aset` prepared earlier in the enclosing loop — confirm
# against the surrounding code.
action_dict = {}
a = sess.run(agent.action,
             feed_dict={
                 agent.adj: [adj],
                 agent.state_holder: state_,
                 agent.vecholder: np.asarray([vec])
             })
# Map the flat action batch back onto agent keys; relies on `state.items()`
# iterating in the same order the state vector was built from.
k = 0
for key, value in state.items():
    action_dict[key] = a[k]
    k += 1
aset.append(a)
next_state, reward, done, _ = env.step(action_dict)
# Flatten the joint next observation into a single 1-row feature list.
next_state_ = np.array(list(next_state.values())).reshape(
    1, -1).tolist()
rewards = list(reward.values())
# Team return: average the per-agent rewards for this step.
ret += np.average(
    rewards)
ret_list.append(rewards)
agent.remember(state_, a, rewards, next_state_, 0., adj)
# Warm-up gate (threshold 200) before learning from the replay buffer.
if agent.num_experience > 200:
    ploss, qloss = agent.learn(batch_size=32)