def main(env_id, policy_file, record, stochastic, extra_kwargs):
    """Load an ESAtariPolicy checkpoint and roll it out in `env_id`.

    Episodes render and repeat forever; when `record` is truthy, gameplay is
    captured by a gym Monitor into a random /tmp directory and the function
    returns after the first episode instead of looping.
    """
    import gym
    from gym import wrappers
    import tensorflow as tf
    from es_distributed.policies import ESAtariPolicy
    from es_distributed.atari_wrappers import ScaledFloatFrame, wrap_deepmind
    from es_distributed.es import get_ref_batch
    import numpy as np

    atari = "NoFrameskip" in env_id

    env = gym.make(env_id)
    if atari:
        env = wrap_deepmind(env)
    if record:
        import uuid
        # Unique output dir per run; force=True overwrites stale monitor files.
        env = wrappers.Monitor(env, '/tmp/' + str(uuid.uuid4()), force=True)

    if extra_kwargs:
        import json
        extra_kwargs = json.loads(extra_kwargs)

    with tf.Session():
        pi = ESAtariPolicy.Load(policy_file, extra_kwargs=extra_kwargs)
        # The policy normalizes observations against a reference batch.
        pi.set_ref_batch(get_ref_batch(env, batch_size=128))
        while True:
            if atari:
                stream = np.random if stochastic else None
                rews, t, novelty_vector = pi.rollout(
                    env, render=True, random_stream=stream)
            print('return={:.4f} len={}'.format(rews, t))
            if record:
                env.close()
                return
def main(env_ids, policy_directory, record, stochastic, extra_kwargs):
    """Fork one child process per checkpoint in `policy_directory`; each
    child loads its policy and rolls it out (rendering, forever) across the
    space-separated environment ids in `env_ids`.

    The parent blocks in `os.wait()` once per forked child.  NOTE(review):
    `record` is accepted for interface parity but unused here, and the
    child's rollout loop is deliberately infinite.
    """
    import gym
    import tensorflow as tf
    from es_distributed.policies import MujocoPolicy, GAAtariPolicy
    from es_distributed.atari_wrappers import wrap_deepmind
    from es_distributed.es import get_ref_batch
    import numpy as np
    import os

    env_ids = env_ids.split(' ')
    # Atari vs. Mujoco is decided once, from the first env id.
    is_atari_policy = "NoFrameskip" in env_ids[0]

    files = 0
    for policy_name in os.listdir(policy_directory):
        files += 1
        policy_file = os.path.join(policy_directory, policy_name)
        pid = os.fork()
        if pid == 0:
            # --- child: build every env, load one policy, roll out forever.
            envs = []
            for env_id in env_ids:
                env = gym.make(env_id)
                if env_id.endswith('NoFrameskip-v4'):
                    env = wrap_deepmind(env)
                envs.append(env)

            if extra_kwargs:
                import json
                extra_kwargs = json.loads(extra_kwargs)

            with tf.Session():
                if is_atari_policy:
                    pi = GAAtariPolicy.Load(policy_file,
                                            extra_kwargs=extra_kwargs)
                    if pi.needs_ref_batch:
                        pi.set_ref_batch(get_ref_batch(envs[0],
                                                       batch_size=128))
                else:
                    pi = MujocoPolicy.Load(policy_file,
                                           extra_kwargs=extra_kwargs)
                while True:
                    if is_atari_policy:
                        rews, t, novelty_vector = pi.rollout(
                            envs, render=True,
                            random_stream=np.random if stochastic else None)
            # Bug fix: a forked child must never fall through into the
            # parent's listdir iteration or the wait loop below — exit hard
            # without running atexit/TF teardown belonging to the parent.
            os._exit(0)

    # --- parent: reap every child so no zombies are left behind.
    for _ in range(files):
        os.wait()
def evaluate_policy_on_levels(policy_file, ids, n_rep, record=False):
    """Evaluate a single policy checkpoint on each level id in `ids`.

    Each level is played `n_rep` times without rendering; per-level means of
    in-game score, completion percentage, episode length and summed reward
    are collected.  `policy_file == 'baseline'` selects the hand-coded
    BaselinePolicy instead of loading a checkpoint.

    Returns:
        (scores, percs, lengths, rewards) — plain scalars when exactly one
        id was given, otherwise four parallel lists (one mean per level).
    """
    import tensorflow as tf
    from es_distributed.policies import GAGenesisPolicy
    from es_distributed.es import get_ref_batch
    import numpy as np

    all_scores = []
    all_lengths = []
    all_percs = []
    all_rewards = []

    tf.reset_default_graph()
    with tf.Session():
        # Load the policy exactly once and reuse it across all levels.
        if policy_file == 'baseline':
            pi = BaselinePolicy()
        else:
            pi = GAGenesisPolicy.Load(policy_file)

        for level_id in ids:
            env = make_env(level_id, record=record)
            level_scores = []
            level_lengths = []
            level_percs = []
            level_rewards = []
            for _ in range(n_rep):
                if pi.needs_ref_batch:
                    pi.set_ref_batch(get_ref_batch(env, batch_size=128))
                rews, t, res_dict = pi.rollout(env, render=False)
                level_lengths.append(t)
                # Stats are optional in res_dict; missing entries count as 0.
                level_percs.append(res_dict.get('max_perc', 0))
                level_scores.append(res_dict.get('max_score', 0))
                level_rewards.append(rews.sum())
            del env
            print(level_percs)
            all_scores.append(np.mean(level_scores))
            all_rewards.append(np.mean(level_rewards))
            all_lengths.append(np.mean(level_lengths))
            all_percs.append(np.mean(level_percs))

    # Preserve the historical interface: scalars for a single level,
    # parallel lists otherwise.
    if len(all_scores) == 1:
        return all_scores[0], all_percs[0], all_lengths[0], all_rewards[0]
    return all_scores, all_percs, all_lengths, all_rewards