def play(episodes, is_render, is_testing, checkpoint_interval,
         weights_filename_prefix, csv_filename_prefix, batch_size):
    """Run DQN episodes on the multi-agent env and collect statistics.

    Fix: the original ignored its `episodes`, `is_render`, and `is_testing`
    parameters and read the module-global `args` instead; the parameters are
    now honored (the caller passes the same `args` values, so behavior for
    the existing call site is unchanged).

    Args:
        episodes: number of episodes to run.
        is_render: render the env each step when truthy.
        is_testing: skip memory writes and learning when truthy.
        checkpoint_interval: dump stats / save weights every N episodes.
        weights_filename_prefix: prefix for periodic weight files.
        csv_filename_prefix: prefix for periodic statistics CSV dumps.
        batch_size: replay-sample size for learning.

    Returns:
        The populated Time_Series_Statistics_Store.
    """
    # init statistics. NOTE: simple tag specific!
    statistics_header = ["episode"]
    statistics_header.append("steps")
    statistics_header.extend(["reward_{}".format(i) for i in range(env.n)])
    statistics_header.extend(["loss_{}".format(i) for i in range(env.n)])
    statistics_header.extend(["eps_greedy_{}".format(i) for i in range(env.n)])
    statistics_header.extend(["collisions_{}".format(i) for i in range(env.n)])
    print("Collecting statistics {}:".format(" ".join(statistics_header)))
    statistics = general_utilities.Time_Series_Statistics_Store(
        statistics_header)

    for episode in range(episodes):
        states = env.reset()
        episode_losses = np.zeros(env.n)
        episode_rewards = np.zeros(env.n)
        collision_count = np.zeros(env.n)
        steps = 0

        while True:
            steps += 1

            # render
            if is_render:
                env.render()
                time.sleep(0.1)

            # act: discrete DQN action -> one-hot with per-agent speed scaling
            actions = []
            actions_onehot = []
            for i in range(env.n):
                action = dqns[i].choose_action(states[i])
                # adversaries move slightly slower than good agents
                speed = 0.9 if env.agents[i].adversary else 1
                onehot_action = np.zeros(n_actions[i])
                onehot_action[action] = speed
                actions_onehot.append(onehot_action)
                actions.append(action)

            # step
            states_next, rewards, done, info = env.step(actions_onehot)

            # learn
            if not is_testing:
                size = memories[0].pointer
                # sample whatever is available until the buffer fills up
                batch = random.sample(range(size), size) if size < batch_size \
                    else random.sample(range(size), batch_size)

                for i in range(env.n):
                    if done[i]:
                        rewards[i] -= 50  # terminal penalty

                    memories[i].remember(states[i], actions[i],
                                         rewards[i], states_next[i], done[i])

                    # warm-up: only learn once enough experience is stored
                    if memories[i].pointer > batch_size * 10:
                        history = dqns[i].learn(*memories[i].sample(batch))
                        episode_losses[i] += history.history["loss"][0]
                    else:
                        episode_losses[i] = -1  # sentinel: not learning yet

            states = states_next
            episode_rewards += rewards
            collision_count += np.array(
                simple_tag_utilities.count_agent_collisions(env))

            # reset states if done
            if any(done):
                # report per-step averages
                episode_rewards = episode_rewards / steps
                episode_losses = episode_losses / steps

                statistic = [episode]
                statistic.append(steps)
                statistic.extend([episode_rewards[i] for i in range(env.n)])
                statistic.extend([episode_losses[i] for i in range(env.n)])
                statistic.extend([dqns[i].eps_greedy for i in range(env.n)])
                statistic.extend(collision_count.tolist())
                statistics.add_statistics(statistic)
                if episode % 25 == 0:
                    print(statistics.summarize_last())
                break

        # periodic checkpoint: dump stats, save weights, prune previous CSV
        if episode % checkpoint_interval == 0:
            statistics.dump("{}_{}.csv".format(csv_filename_prefix, episode))
            general_utilities.save_dqn_weights(
                dqns, "{}_{}_".format(weights_filename_prefix, episode))
            if episode >= checkpoint_interval:
                os.remove("{}_{}.csv".format(csv_filename_prefix,
                                             episode - checkpoint_interval))

    return statistics
def play(episodes, is_render, is_testing, checkpoint_interval,
         weights_filename_prefix, csv_filename_prefix, batch_size):
    """Run DQN episodes (num_agents/energy variant) and collect statistics.

    Fixes over the original:
      * honors its `episodes` and `is_testing` parameters instead of reading
        the module-global `args` (the caller passes the same values);
      * `myfile.write(each)` wrote a non-string state object, which raises
        TypeError — states are now stringified before writing;
      * removed the redundant `myfile.close()` inside the `with` block and
        the always-true `episode % 1 == 0` guard.

    Args:
        episodes: number of episodes to run.
        is_render: unused here (render calls are commented out upstream).
        is_testing: skip memory writes and learning when truthy.
        checkpoint_interval: dump stats / save weights every N episodes.
        weights_filename_prefix: prefix for periodic weight files.
        csv_filename_prefix: prefix for periodic statistics CSV dumps.
        batch_size: replay-sample size for learning.

    Returns:
        The populated Time_Series_Statistics_Store.
    """
    # init statistics. NOTE: simple tag specific!
    statistics_header = ["episode"]
    statistics_header.append("steps")
    statistics_header.append("done")
    statistics_header.append("reward")
    statistics_header.extend(
        ["loss_{}".format(i) for i in range(env.num_agents)])
    statistics_header.extend(
        ["eps_greedy_{}".format(i) for i in range(env.num_agents)])
    statistics_header.extend(
        ["Agent Energy Left_{}".format(i) for i in range(env.num_agents)])
    statistics_header.extend(
        ["Task Energy Left_{}".format(i) for i in range(env.num_agents)])
    print("Collecting statistics {}:".format(" ".join(statistics_header)))
    statistics = general_utilities.Time_Series_Statistics_Store(
        statistics_header)

    for episode in range(episodes):
        states = env.reset()
        episode_losses = np.zeros(env.num_agents)
        episode_rewards = 0  # scalar: env returns one shared reward
        steps = 0
        all_states = [states]  # full trajectory, dumped on success

        # hard cap of 600 steps per episode
        while steps <= 600:
            steps += 1

            # act: every agent observes the full (shared) state
            actions = []
            for i in range(env.num_agents):
                action = dqns[i].choose_action(states)
                actions.append(action)

            # step (single shared reward / done flag)
            states_next, rewards, done, info = env.step(actions)
            all_states.append(states_next)

            # learn
            if not is_testing:
                size = memories[0].pointer
                # sample whatever is available until the buffer fills up
                batch = random.sample(range(size), size) if size < batch_size \
                    else random.sample(range(size), batch_size)
                for i in range(env.num_agents):
                    memories[i].remember(states, actions[i], rewards,
                                         states_next, done)
                    # warm-up: only learn once enough experience is stored
                    if memories[i].pointer > batch_size * 10:
                        history = dqns[i].learn(*memories[i].sample(batch))
                        episode_losses[i] += history.history["loss"][0]
            else:
                for i in range(env.num_agents):
                    episode_losses[i] = -1  # sentinel: no learning in testing

            states = states_next
            episode_rewards += rewards

            # reset states if done (or the step cap was reached)
            if done or steps >= 600:
                episode_losses = episode_losses / steps

                statistic = [episode]
                statistic.append(steps)
                statistic.append(done)
                statistic.append(episode_rewards)
                statistic.extend(
                    [episode_losses[i] for i in range(env.num_agents)])
                statistic.extend(
                    [dqns[i].eps_greedy for i in range(env.num_agents)])
                statistic.extend([env.B_k[i] for i in range(env.num_agents)])
                statistic.extend([env.T_i[i] for i in range(env.num_agents)])
                statistics.add_statistics(statistic)
                print(statistics.summarize_last())  # log every episode

                if done:
                    # NOTE(review): absolute path '/save/states' — looks like
                    # a container path; confirm it exists on this machine.
                    with open('/save/states/episode{}_states.txt'.format(
                            episode), mode='w') as myfile:
                        for each in all_states:
                            # states are env objects/arrays, not str
                            myfile.write(str(each))
                            myfile.write('\n')
                break

        # periodic checkpoint: dump stats, save weights, prune previous CSV
        if episode % checkpoint_interval == 0:
            statistics.dump("{}_{}.csv".format(csv_filename_prefix, episode))
            general_utilities.save_dqn_weights(
                dqns, "{}_{}_".format(weights_filename_prefix, episode))
            if episode >= checkpoint_interval:
                os.remove("{}_{}.csv".format(csv_filename_prefix,
                                             episode - checkpoint_interval))

    return statistics
tf.set_random_seed(args.random_seed) # init DQNs n_actions = [env.action_space[i].n for i in range(env.n)] state_sizes = [env.observation_space[i].shape[0] for i in range(env.n)] memories = [Memory(args.memory_size) for i in range(env.n)] dqns = [ DQN(n_actions[i], state_sizes[i], eps_greedy=epsilon_greedy[i]) for i in range(env.n) ] general_utilities.load_dqn_weights_if_exist( dqns, args.experiment_prefix + args.weights_filename_prefix) start_time = time.time() # play statistics = play(args.episodes, args.render, args.testing, args.checkpoint_frequency, args.experiment_prefix + args.weights_filename_prefix, args.experiment_prefix + args.csv_filename_prefix, args.batch_size) # bookkeeping print("Finished {} episodes in {} seconds".format(args.episodes, time.time() - start_time)) general_utilities.save_dqn_weights( dqns, args.experiment_prefix + args.weights_filename_prefix) statistics.dump(args.experiment_prefix + args.csv_filename_prefix + ".csv")
def play(episodes, is_render, is_testing, checkpoint_interval,
         weights_filename_prefix, csv_filename_prefix, batch_size):
    """Run mixed DQN/DDPG episodes: agents < h are DQN, the rest are DDPG.

    Fixes over the original:
      * the replay-sample unpack re-bound `done`, clobbering the env's
        per-agent done flags used by the `done[i]` penalty for later agents
        and by the `any(done)` episode-termination check — the sampled
        terminals now go to a separate name;
      * the eps-greedy filler used `range(h, env.n - h + 1)`, producing the
        wrong number of `-1` placeholders for the env.n-wide header — now
        `range(h, env.n)`;
      * the checkpoint branch tested the stale inner-loop index `i`, so what
        got saved depended on the last agent processed — both the DQN
        weights and the DDPG TF session are now saved every checkpoint;
      * honors its `episodes`, `is_render`, `is_testing` parameters instead
        of the module-global `args`.

    Returns:
        The populated Time_Series_Statistics_Store.
    """
    # init statistics. NOTE: simple tag specific!
    statistics_header = ["episode"]
    statistics_header.append("steps")
    statistics_header.extend(["reward_{}".format(i) for i in range(env.n)])
    statistics_header.extend(["loss_{}".format(i) for i in range(env.n)])
    statistics_header.extend(["eps_greedy_{}".format(i) for i in range(env.n)])
    statistics_header.extend(["collisions_{}".format(i) for i in range(env.n)])
    print("Collecting statistics {}:".format(" ".join(statistics_header)))
    statistics = general_utilities.Time_Series_Statistics_Store(
        statistics_header)

    for episode in range(episodes):
        states = env.reset()
        episode_losses = np.zeros(env.n)
        episode_rewards = np.zeros(env.n)
        collision_count = np.zeros(env.n)
        steps = 0

        while True:
            steps += 1

            # render
            if is_render:
                env.render()

            # act: DQN agents emit a scaled one-hot, DDPG agents a clipped
            # continuous action with exploration noise
            actions = []
            for i in range(env.n):
                if i < h:
                    action = dqns[i].choose_action(states[i])
                    # adversaries move slightly slower than good agents
                    speed = 0.9 if env.agents[i].adversary else 1
                    onehot_action = np.zeros(n_actions[i])
                    onehot_action[action] = speed
                    actions.append(onehot_action)
                else:
                    action = np.clip(
                        actors[i].choose_action(states[i]) +
                        actors_noise[i](), -2, 2)
                    actions.append(action)

            # step
            states_next, rewards, done, info = env.step(actions)

            # learn
            if not is_testing:
                size = memories[0].pointer
                # sample whatever is available until the buffer fills up
                batch = random.sample(range(size), size) if size < batch_size \
                    else random.sample(range(size), batch_size)

                for i in range(env.n):
                    if done[i]:
                        rewards[i] -= 50  # terminal penalty

                    if i < h:
                        # DQN memory: local state + discrete action index
                        memories[i].remember(states[i], np.argmax(actions[i]),
                                             rewards[i], states_next[i],
                                             done[i])
                    else:
                        # DDPG memory: joint states + joint actions
                        memories[i].remember(states, actions, rewards[i],
                                             states_next, done[i])

                    if i < h:
                        if memories[i].pointer > batch_size * 10:
                            # use a distinct name: re-binding `done` here
                            # clobbered the env flags (original bug)
                            s, a, r, sn, sampled_done = \
                                memories[i].sample(batch)
                            history = dqns[i].learn(s, a, r, sn, sampled_done)
                            episode_losses[i] += history.history["loss"][0]
                        else:
                            episode_losses[i] = -1  # warm-up sentinel
                    else:
                        if memories[i].pointer > batch_size * 10:
                            s, a, r, sn, _ = memories[i].sample(batch, env.n)
                            r = np.reshape(r, (batch_size, 1))
                            loss = critics[i].learn(s, a, r, sn)
                            actors[i].learn(actors, s)
                            episode_losses[i] += loss
                        else:
                            episode_losses[i] = -1  # warm-up sentinel

            states = states_next
            episode_rewards += rewards
            collision_count += np.array(
                simple_tag_utilities.count_agent_collisions(env))

            # reset states if done
            if any(done):
                # report per-step averages
                episode_rewards = episode_rewards / steps
                episode_losses = episode_losses / steps

                statistic = [episode]
                statistic.append(steps)
                statistic.extend([episode_rewards[i] for i in range(env.n)])
                statistic.extend([episode_losses[i] for i in range(env.n)])
                # DQN agents report eps_greedy; DDPG agents get -1 filler so
                # the row matches the env.n-wide header
                statistic.extend([
                    *[dqns[i].eps_greedy for i in range(h)],
                    *[-1 for i in range(h, env.n)]
                ])
                statistic.extend(collision_count.tolist())
                statistics.add_statistics(statistic)
                if episode % 25 == 0:
                    print(statistics.summarize_last())
                break

        # periodic checkpoint: dump stats once, save BOTH agent families,
        # prune the previous CSV
        if episode % checkpoint_interval == 0:
            statistics.dump("{}_{}.csv".format(csv_filename_prefix, episode))
            # DQN weights (agents 0..h-1)
            general_utilities.save_dqn_weights(
                dqns, "{}_{}_".format(weights_filename_prefix, episode))
            # DDPG actors/critics live in the TF session graph
            if not os.path.exists(weights_filename_prefix):
                os.makedirs(weights_filename_prefix)
            save_path = saver.save(session,
                                   os.path.join(weights_filename_prefix,
                                                "models"),
                                   global_step=episode)
            print("saving model to {}".format(save_path))
            if episode >= checkpoint_interval:
                os.remove("{}_{}.csv".format(csv_filename_prefix,
                                             episode - checkpoint_interval))

    return statistics