def play(episodes, is_render, is_testing, checkpoint_interval,
         weights_filename_prefix, csv_filename_prefix, batch_size):
    """Run MADDPG episodes on the environment, optionally dumping agent coordinates.

    Args:
        episodes: number of episodes to run.
        is_render: render the environment each step when True.
        is_testing: skip memory writes and learning updates when True.
        checkpoint_interval: episodes between CSV/model checkpoints.
        weights_filename_prefix: directory prefix for saved TF models.
        csv_filename_prefix: path prefix for the rolling statistics CSV.
        batch_size: replay-memory minibatch size.

    Returns:
        The populated Time_Series_Statistics_Store.
    """
    # init statistics. NOTE: simple tag specific!
    statistics_header = ["episode"]
    statistics_header.append("steps")
    statistics_header.extend(["reward_{}".format(i) for i in range(env.n)])
    statistics_header.extend(["loss_{}".format(i) for i in range(env.n)])
    statistics_header.extend(["collisions_{}".format(i) for i in range(env.n)])
    statistics_header.extend(["ou_theta_{}".format(i) for i in range(env.n)])
    statistics_header.extend(["ou_mu_{}".format(i) for i in range(env.n)])
    statistics_header.extend(["ou_sigma_{}".format(i) for i in range(env.n)])
    statistics_header.extend(["ou_dt_{}".format(i) for i in range(env.n)])
    statistics_header.extend(["ou_x0_{}".format(i) for i in range(env.n)])
    print("Collecting statistics {}:".format(" ".join(statistics_header)))
    statistics = general_utilities.Time_Series_Statistics_Store(
        statistics_header)

    # FIX: iterate over the `episodes` parameter instead of the global
    # `args.episodes`, which made the parameter dead.
    for episode in range(episodes):
        states = env.reset()
        episode_losses = np.zeros(env.n)
        episode_rewards = np.zeros(env.n)
        collision_count = np.zeros(env.n)
        steps = 0
        coords = []  # per-episode dumped frames (only used with args.dump_file)
        fullyBreak = False  # set once the coordinate dump has been written

        # Episodes are capped at 300 steps in this variant.
        while steps < 300:
            steps += 1
            # render (FIX: honour the is_render parameter, not args.render)
            if is_render:
                env.render()
                time.sleep(0.05)
            if args.dump_file:
                # Not ours... (translated comment)
                frame = env.dump_file()
                coords.append(frame)
                print("Radim...\n")
            # act: deterministic actor output plus OU exploration noise,
            # clipped to the action range [-2, 2]
            actions = []
            for i in range(env.n):
                action = np.clip(
                    actors[i].choose_action(states[i]) + actors_noise[i](), -2, 2)
                actions.append(action)
            # step
            states_next, rewards, done, info = env.step(actions)
            # learn (FIX: honour the is_testing parameter, not args.testing)
            if not is_testing:
                size = memories[0].pointer
                # sample the whole memory while it is still smaller than a batch
                batch = random.sample(
                    range(size), size) if size < batch_size else random.sample(
                    range(size), batch_size)
                for i in range(env.n):
                    if done[i]:
                        rewards[i] -= 500  # large terminal penalty
                    # joint observations/actions are stored (centralized critic)
                    memories[i].remember(states, actions, rewards[i],
                                         states_next, done[i])
                    # warm-up: only learn once the memory is 10 batches deep
                    if memories[i].pointer > batch_size * 10:
                        s, a, r, sn, _ = memories[i].sample(batch, env.n)
                        r = np.reshape(r, (batch_size, 1))
                        loss = critics[i].learn(s, a, r, sn)
                        actors[i].learn(actors, s)
                        episode_losses[i] += loss
                    else:
                        episode_losses[i] = -1  # sentinel: not learning yet

            states = states_next
            episode_rewards += rewards
            collision_count += np.array(
                simple_tag_utilities.count_agent_collisions(env))

            # reset states if done
            if any(done):
                episode_rewards = episode_rewards / steps
                episode_losses = episode_losses / steps
                statistic = [episode]
                statistic.append(steps)
                statistic.extend([episode_rewards[i] for i in range(env.n)])
                statistic.extend([episode_losses[i] for i in range(env.n)])
                statistic.extend(collision_count.tolist())
                statistic.extend([actors_noise[i].theta for i in range(env.n)])
                statistic.extend([actors_noise[i].mu for i in range(env.n)])
                statistic.extend([actors_noise[i].sigma for i in range(env.n)])
                statistic.extend([actors_noise[i].dt for i in range(env.n)])
                statistic.extend([actors_noise[i].x0 for i in range(env.n)])
                statistics.add_statistics(statistic)
                # Not ours... (translated comment)
                if args.dump_file:
                    print("Pravim...\n")
                    with open("results/coords.txt", "w+") as f:
                        # NOTE(review): len(coords[0]) / 2 is a float in Py3;
                        # presumably the agent count — confirm integer division
                        # was intended.
                        f.write(str(len(coords[0]) / 2))
                        f.write("\n")
                        for fr in coords:
                            print("Pisem...\n")
                            f.write(" ".join(str(i) for i in fr))
                            f.write("\n")
                    # dump written: stop the episode loop entirely
                    fullyBreak = True
                    break
                if episode % 25 == 0:
                    print(statistics.summarize_last())
                break

        if episode % checkpoint_interval == 0:
            statistics.dump("{}_{}.csv".format(csv_filename_prefix, episode))
            if not os.path.exists(weights_filename_prefix):
                os.makedirs(weights_filename_prefix)
            save_path = saver.save(session,
                                   os.path.join(weights_filename_prefix,
                                                "models"),
                                   global_step=episode)
            print("saving model to {}".format(save_path))
            # keep only the most recent rolling CSV checkpoint
            if episode >= checkpoint_interval:
                os.remove("{}_{}.csv".format(csv_filename_prefix,
                                             episode - checkpoint_interval))
        if fullyBreak:
            break
    return statistics
def play(episodes, is_render, is_testing, checkpoint_interval,
         weights_filename_prefix, csv_filename_prefix, batch_size):
    """Run independent-DQN episodes on the environment and collect statistics.

    Args:
        episodes: number of episodes to run.
        is_render: render the environment each step when True.
        is_testing: skip memory writes and learning updates when True.
        checkpoint_interval: episodes between CSV/weight checkpoints.
        weights_filename_prefix: path prefix for saved DQN weights.
        csv_filename_prefix: path prefix for the rolling statistics CSV.
        batch_size: replay-memory minibatch size.

    Returns:
        The populated Time_Series_Statistics_Store.
    """
    # init statistics. NOTE: simple tag specific!
    statistics_header = ["episode"]
    statistics_header.append("steps")
    statistics_header.extend(["reward_{}".format(i) for i in range(env.n)])
    statistics_header.extend(["loss_{}".format(i) for i in range(env.n)])
    statistics_header.extend(["eps_greedy_{}".format(i) for i in range(env.n)])
    statistics_header.extend(["collisions_{}".format(i) for i in range(env.n)])
    print("Collecting statistics {}:".format(" ".join(statistics_header)))
    statistics = general_utilities.Time_Series_Statistics_Store(
        statistics_header)

    # FIX: iterate over the `episodes` parameter instead of the global
    # `args.episodes`, which made the parameter dead.
    for episode in range(episodes):
        states = env.reset()
        episode_losses = np.zeros(env.n)
        episode_rewards = np.zeros(env.n)
        collision_count = np.zeros(env.n)
        steps = 0

        while True:
            steps += 1
            # render (FIX: honour the is_render parameter, not args.render)
            if is_render:
                env.render()
                time.sleep(0.1)
            # act: discrete action encoded one-hot; adversaries move at 0.9 speed
            actions = []
            actions_onehot = []
            for i in range(env.n):
                action = dqns[i].choose_action(states[i])
                speed = 0.9 if env.agents[i].adversary else 1
                onehot_action = np.zeros(n_actions[i])
                onehot_action[action] = speed
                actions_onehot.append(onehot_action)
                actions.append(action)
            # step
            states_next, rewards, done, info = env.step(actions_onehot)
            # learn (FIX: honour the is_testing parameter, not args.testing)
            if not is_testing:
                size = memories[0].pointer
                # sample the whole memory while it is still smaller than a batch
                batch = random.sample(
                    range(size), size) if size < batch_size else random.sample(
                    range(size), batch_size)
                for i in range(env.n):
                    if done[i]:
                        rewards[i] -= 50  # terminal penalty
                    memories[i].remember(states[i], actions[i], rewards[i],
                                         states_next[i], done[i])
                    # warm-up: only learn once the memory is 10 batches deep
                    if memories[i].pointer > batch_size * 10:
                        history = dqns[i].learn(*memories[i].sample(batch))
                        episode_losses[i] += history.history["loss"][0]
                    else:
                        episode_losses[i] = -1  # sentinel: not learning yet

            states = states_next
            episode_rewards += rewards
            collision_count += np.array(
                simple_tag_utilities.count_agent_collisions(env))

            # reset states if done
            if any(done):
                episode_rewards = episode_rewards / steps
                episode_losses = episode_losses / steps
                statistic = [episode]
                statistic.append(steps)
                statistic.extend([episode_rewards[i] for i in range(env.n)])
                statistic.extend([episode_losses[i] for i in range(env.n)])
                statistic.extend([dqns[i].eps_greedy for i in range(env.n)])
                statistic.extend(collision_count.tolist())
                statistics.add_statistics(statistic)
                if episode % 25 == 0:
                    print(statistics.summarize_last())
                break

        if episode % checkpoint_interval == 0:
            statistics.dump("{}_{}.csv".format(csv_filename_prefix, episode))
            general_utilities.save_dqn_weights(
                dqns, "{}_{}_".format(weights_filename_prefix, episode))
            # keep only the most recent rolling CSV checkpoint
            if episode >= checkpoint_interval:
                os.remove("{}_{}.csv".format(csv_filename_prefix,
                                             episode - checkpoint_interval))
    return statistics
def play(episodes, is_render, is_testing, checkpoint_interval,
         weights_filename_prefix, csv_filename_prefix, batch_size):
    """Run DDPG episodes on the environment and collect statistics.

    Args:
        episodes: number of episodes to run.
        is_render: render the environment each step when True.
        is_testing: skip memory writes and learning updates when True.
        checkpoint_interval: episodes between CSV/model checkpoints.
        weights_filename_prefix: directory prefix for saved TF models.
        csv_filename_prefix: path prefix for the rolling statistics CSV.
        batch_size: replay-memory minibatch size.

    Returns:
        The populated Time_Series_Statistics_Store.
    """
    # init statistics. NOTE: simple tag specific!
    statistics_header = ["episode"]
    statistics_header.append("steps")
    statistics_header.extend(["reward_{}".format(i) for i in range(env.n)])
    statistics_header.extend(["loss_{}".format(i) for i in range(env.n)])
    statistics_header.extend(["collisions_{}".format(i) for i in range(env.n)])
    statistics_header.extend(["ou_theta_{}".format(i) for i in range(env.n)])
    statistics_header.extend(["ou_mu_{}".format(i) for i in range(env.n)])
    statistics_header.extend(["ou_sigma_{}".format(i) for i in range(env.n)])
    statistics_header.extend(["ou_dt_{}".format(i) for i in range(env.n)])
    statistics_header.extend(["ou_x0_{}".format(i) for i in range(env.n)])
    print("Collecting statistics {}:".format(" ".join(statistics_header)))
    statistics = general_utilities.Time_Series_Statistics_Store(
        statistics_header)

    # FIX: iterate over the `episodes` parameter instead of the global
    # `args.episodes`, which made the parameter dead.
    for episode in range(episodes):
        states = env.reset()
        episode_losses = np.zeros(env.n)
        episode_rewards = np.zeros(env.n)
        collision_count = np.zeros(env.n)
        steps = 0

        while True:
            steps += 1
            # render (FIX: honour the is_render parameter, not args.render)
            if is_render:
                env.render()
            # act: deterministic actor output plus OU exploration noise,
            # clipped to the action range [-2, 2]
            actions = []
            for i in range(env.n):
                action = np.clip(
                    actors[i].choose_action(states[i]) + actors_noise[i](), -2, 2)
                actions.append(action)
            # step
            states_next, rewards, done, info = env.step(actions)
            # learn (FIX: honour the is_testing parameter, not args.testing)
            if not is_testing:
                size = memories[0].pointer
                # sample the whole memory while it is still smaller than a batch
                batch = random.sample(
                    range(size), size) if size < batch_size else random.sample(
                    range(size), batch_size)
                for i in range(env.n):
                    if done[i]:
                        rewards[i] -= 50  # terminal penalty
                    # per-agent observations/actions are stored here
                    memories[i].remember(states[i], actions[i], rewards[i],
                                         states_next[i], done[i])
                    # warm-up: only learn once the memory is 10 batches deep
                    if memories[i].pointer > batch_size * 10:
                        s, a, r, sn, _ = memories[i].sample(batch)
                        r = np.reshape(r, (batch_size, 1))
                        loss = critics[i].learn(s, a, r, sn)
                        actors[i].learn(s)
                        episode_losses[i] += loss
                    else:
                        episode_losses[i] = -1  # sentinel: not learning yet

            states = states_next
            episode_rewards += rewards
            collision_count += np.array(
                simple_tag_utilities.count_agent_collisions(env))

            # reset states if done
            if any(done):
                episode_rewards = episode_rewards / steps
                episode_losses = episode_losses / steps
                statistic = [episode]
                statistic.append(steps)
                statistic.extend([episode_rewards[i] for i in range(env.n)])
                statistic.extend([episode_losses[i] for i in range(env.n)])
                statistic.extend(collision_count.tolist())
                statistic.extend([actors_noise[i].theta for i in range(env.n)])
                statistic.extend([actors_noise[i].mu for i in range(env.n)])
                statistic.extend([actors_noise[i].sigma for i in range(env.n)])
                statistic.extend([actors_noise[i].dt for i in range(env.n)])
                statistic.extend([actors_noise[i].x0 for i in range(env.n)])
                statistics.add_statistics(statistic)
                if episode % 25 == 0:
                    print(statistics.summarize_last())
                break

        if episode % checkpoint_interval == 0:
            statistics.dump("{}_{}.csv".format(csv_filename_prefix, episode))
            if not os.path.exists(weights_filename_prefix):
                os.makedirs(weights_filename_prefix)
            save_path = saver.save(session,
                                   os.path.join(weights_filename_prefix,
                                                "models"),
                                   global_step=episode)
            print("saving model to {}".format(save_path))
            # keep only the most recent rolling CSV checkpoint
            if episode >= checkpoint_interval:
                os.remove("{}_{}.csv".format(csv_filename_prefix,
                                             episode - checkpoint_interval))
    return statistics
def play(checkpoint_interval, weights_filename_prefix, csv_filename_prefix,
         batch_size, stats_df):
    """Run MADDPG episodes, logging per-episode rows into `stats_df`.

    Episode count, rendering, and testing mode are read from the
    module-level `args`. Statistics are periodically persisted as HDF5
    checkpoints and a final HDF5 file is written at the end.

    Args:
        checkpoint_interval: episodes between HDF5/model checkpoints.
        weights_filename_prefix: directory prefix for saved TF models.
        csv_filename_prefix: path prefix for the HDF5 statistics files
            (name kept for interface compatibility).
        batch_size: replay-memory minibatch size.
        stats_df: pandas DataFrame mutated in place via write_stats_row().
    """
    for episode in range(args.episodes):
        states = env.reset()
        episode_losses = np.zeros(env.n)
        episode_rewards = np.zeros(env.n)
        collision_count = np.zeros(env.n)
        steps = 0

        while True:
            steps += 1
            # render
            if args.render:
                env.render()
            # act: deterministic actor output plus OU exploration noise,
            # clipped to the action range [-2, 2]
            actions = []
            for i in range(env.n):
                action = np.clip(
                    actors[i].choose_action(states[i]) + actors_noise[i](), -2, 2)
                actions.append(action)
            # step
            states_next, rewards, done, info = env.step(actions)
            # learn
            if not args.testing:
                size = memories[0].pointer
                # sample the whole memory while it is still smaller than a batch
                batch = random.sample(range(size),
                                      size) if size < batch_size else random.sample(
                    range(size), batch_size)
                for i in range(env.n):
                    if done[i]:
                        rewards[i] -= 500  # large terminal penalty
                    # joint observations/actions are stored (centralized critic)
                    memories[i].remember(states, actions, rewards[i],
                                         states_next, done[i])
                    # warm-up: only learn once the memory is 10 batches deep
                    if memories[i].pointer > batch_size * 10:
                        s, a, r, sn, _ = memories[i].sample(batch, env.n)
                        r = np.reshape(r, (batch_size, 1))
                        loss = critics[i].learn(s, a, r, sn)
                        actors[i].learn(actors, s)
                        episode_losses[i] += loss
                    else:
                        episode_losses[i] = -1  # sentinel: not learning yet

            states = states_next
            episode_rewards += rewards
            collision_count += np.array(
                simple_tag_utilities.count_agent_collisions(env))

            # reset states if done
            if any(done):
                episode_rewards = episode_rewards / steps
                episode_losses = episode_losses / steps
                write_stats_row(env, stats_df, episode, steps,
                                episode_rewards, episode_losses,
                                collision_count)
                if episode % 25 == 0:
                    print(stats_df.iloc[episode])
                break

        if episode % checkpoint_interval == 0:
            stats_file = f"{csv_filename_prefix}_{episode}.h5"
            # FIX: the HDFStore was never closed, leaking the file handle and
            # risking an unflushed HDF5 file — use a context manager.
            with pd.HDFStore(stats_file) as store:
                store['stats_df'] = stats_df
            print(f"stats_df saved to {stats_file}")
            if not os.path.exists(weights_filename_prefix):
                os.makedirs(weights_filename_prefix)
            save_path = saver.save(session, os.path.join(
                weights_filename_prefix, "models"), global_step=episode)

    # final statistics dump after all episodes
    stats_file = f"{csv_filename_prefix}_{args.episodes}.h5"
    # FIX: close this store too (previously leaked).
    with pd.HDFStore(stats_file) as store:
        store['stats_df'] = stats_df