def main(env_id, policy_file, record, stochastic, extra_kwargs):
    import lab
    from lab import wrappers
    import tensorflow as tf
    from es_distributed.policies import MujocoPolicy
    import numpy as np

    env = lab.make(env_id)
    if record:
        import uuid
        env = wrappers.Monitor(env, '/tmp/' + str(uuid.uuid4()), force=True)
    if extra_kwargs:
        import json
        extra_kwargs = json.loads(extra_kwargs)

    with tf.Session():
        pi = MujocoPolicy.Load(policy_file, extra_kwargs=extra_kwargs)
        while True:
            rews, t = pi.rollout(env, render=True, random_stream=np.random if stochastic else None)
            print('return={:.4f} len={}'.format(rews.sum(), t))
            if record:
                env.close()
                return
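# A minimal sketch of how a viz entry point like the one above can be wired to
# the command line. The use of click and the exact option names below are
# assumptions that mirror main's signature, not taken from this file.
import click

@click.command()
@click.argument('env_id')
@click.argument('policy_file')
@click.option('--record', is_flag=True, help='Record video of the rollouts.')
@click.option('--stochastic', is_flag=True, help='Sample stochastic actions.')
@click.option('--extra_kwargs', default=None, help='JSON dict of extra policy kwargs.')
def cli(env_id, policy_file, record, stochastic, extra_kwargs):
    # Delegate to the viz routine defined above.
    main(env_id, policy_file, record, stochastic, extra_kwargs)

if __name__ == '__main__':
    cli()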
def setup_policy(cls, policy_file, thisData, noise_stdev, path):
    from es_distributed.policies import MujocoPolicy

    pi = MujocoPolicy.Load(policy_file, extra_kwargs=None)
    if not thisData.parentOrNot:
        # Reconstruct the child's parameters from its (noise index, sign) pair.
        # gs is assumed to be a module-level handle whose .noise attribute is
        # the shared noise table.
        noiseIdx, noiseSign = thisData.child_op_data[1:3].astype(int)
        theta = pi.get_trainable_flat() + noiseSign * noise_stdev * gs.noise.get(noiseIdx, pi.num_params)
        pi.set_trainable_flat(theta)
    return pi
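# A self-contained sketch (plain numpy, hypothetical names) of the parameter
# reconstruction setup_policy performs: a child's weights are the parent's
# weights plus a signed, scaled block read from a shared noise table, so a
# child can be rebuilt from just (noise_idx, noise_sign) instead of shipping
# the full parameter vector.
import numpy as np

def reconstruct_child(theta_parent, noise_table, noise_idx, noise_sign, noise_stdev):
    # Slice the same noise block the worker used, then apply the signed,
    # scaled perturbation.
    eps = noise_table[noise_idx:noise_idx + theta_parent.size]
    return theta_parent + noise_sign * noise_stdev * eps

# Example: a 5-parameter parent perturbed with sign -1 at table index 3.
table = np.random.RandomState(0).randn(100).astype(np.float32)
theta = np.zeros(5, dtype=np.float32)
print(reconstruct_child(theta, table, noise_idx=3, noise_sign=-1, noise_stdev=0.01))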
def main():
    import gym
    import roboschool  # noqa: F401 -- importing registers the Roboschool envs with gym
    import numpy as np
    import tensorflow as tf
    from es_distributed.policies import MujocoPolicy
    from es_distributed.es import RunningStat  # assumed location of RunningStat

    sess = tf.InteractiveSession()
    env = gym.make("RoboschoolAnt-v1")
    ac_space = env.action_space
    ob_space = env.observation_space
    policy = MujocoPolicy(ob_space, ac_space, "uniform:10", 0.01, 'tanh', [256, 256], 'ff')
    ob_stat = RunningStat(
        env.observation_space.shape,
        eps=1e-2  # eps prevents dividing by zero at the beginning when computing mean/stdev
    )
    policy.set_ob_stat(ob_stat.mean, ob_stat.std)
    ob = np.ones(ob_space.shape)  # env.reset()
    sess.run(tf.initialize_variables(tf.all_variables()))
    a = policy.act(ob[None])
    print(a)
def main(): """ Tests if the outputs of the Keras model is equal (or similar) to the original implementation which used plain TensorFlow operations. The code in this main function starts 1000 iterations of the version with the TensorFlow Operations. The code in the function jupyter_cell can be started in the training Jupyter Notebook. :return: """ env = gym.make("RoboschoolAnt-v1") ac_space = env.action_space ob_space = env.observation_space ob_stat = RunningStat( env.observation_space.shape, eps= 1e-2 # eps to prevent dividing by zero at the beginning when computing mean/stdev ) rews, lens = [], [] for _ in range(1000): sess = tf.InteractiveSession() policy = MujocoPolicy(ob_space, ac_space, "continuous:", 0.01, 'tanh', [256, 256], 'ff') policy.set_ob_stat(ob_stat.mean, ob_stat.std) sess.run(tf.initialize_variables(tf.all_variables())) r, t = policy.rollout(env) rews.append(r.sum()) lens.append(t) tf.reset_default_graph() sess.close() print("RewMean", np.mean(rews)) print("RewStd", np.std(rews)) print("LenMean", np.mean(lens))
def main(env_id, policy_file, record, stochastic, extra_kwargs):
    import gym
    from gym import wrappers
    import tensorflow as tf
    from es_distributed.policies import MujocoPolicy, ESAtariPolicy, GAAtariPolicy, GAGenesisPolicy
    from es_distributed.atari_wrappers import ScaledFloatFrame, wrap_deepmind
    from es_distributed.es import get_ref_batch
    import numpy as np

    # Register retro games with a cap on the steps played out per episode.
    # register_all and sonic_util are assumed to be imported at module level.
    max_episode_steps = 4500
    register_all(max_episode_steps=max_episode_steps)

    is_atari_policy = True

    env = gym.make(env_id)
    env = sonic_util.sonicize_env(env)
    # if is_atari_policy:
    #     env = wrap_deepmind(env)
    if record:
        import uuid
        env = wrappers.Monitor(env, '/tmp/' + str(uuid.uuid4()), force=True)
    if extra_kwargs:
        import json
        extra_kwargs = json.loads(extra_kwargs)

    with tf.Session():
        if is_atari_policy:
            pi = GAGenesisPolicy.Load(policy_file, extra_kwargs=extra_kwargs)
            # pi.set_ref_batch(get_ref_batch(env, batch_size=128))
        else:
            pi = MujocoPolicy.Load(policy_file, extra_kwargs=extra_kwargs)
        while True:
            if is_atari_policy:
                rews, t, score = pi.rollout(
                    env, render=True, random_stream=np.random if stochastic else None)
            print('return={:.4f} len={}'.format(rews.sum(), t))
            print(score)
            if record:
                env.close()
                return
def main(env_ids, policy_directory, record, stochastic, extra_kwargs):
    import gym
    from gym import wrappers
    import tensorflow as tf
    from es_distributed.policies import MujocoPolicy, ESAtariPolicy, GAAtariPolicy
    from es_distributed.atari_wrappers import ScaledFloatFrame, wrap_deepmind
    from es_distributed.es import get_ref_batch
    import es_distributed.ns as ns
    import numpy as np
    import os

    env_ids = env_ids.split(' ')
    is_atari_policy = "NoFrameskip" in env_ids[0]

    # Fork one child process per policy file in the directory; each child
    # builds its own environments and renders rollouts of its policy.
    files = 0
    for policy_name in os.listdir(policy_directory):
        files += 1
        policy_file = "%s/%s" % (policy_directory, policy_name)
        pid = os.fork()
        if pid == 0:  # child process
            env = []
            for i in range(0, len(env_ids)):
                env.append(gym.make(env_ids[i]))
                if env_ids[i].endswith('NoFrameskip-v4'):
                    env[i] = wrap_deepmind(env[i])
            if extra_kwargs:
                import json
                extra_kwargs = json.loads(extra_kwargs)
            with tf.Session():
                if is_atari_policy:
                    pi = GAAtariPolicy.Load(policy_file, extra_kwargs=extra_kwargs)
                    if pi.needs_ref_batch:
                        pi.set_ref_batch(get_ref_batch(env[0], batch_size=128))
                else:
                    pi = MujocoPolicy.Load(policy_file, extra_kwargs=extra_kwargs)
                while True:  # render rollouts until the child is killed
                    if is_atari_policy:
                        rews, t, novelty_vector = pi.rollout(
                            env, render=True, random_stream=np.random if stochastic else None)

    # Parent process: wait for every forked child to terminate.
    for i in range(0, files):
        os.wait()
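# The function above relies on the classic Unix fork/wait pattern. A minimal,
# self-contained sketch: the parent forks one child per unit of work and then
# reaps them all. Note that a child should terminate explicitly via os._exit,
# so it never falls through into the parent's code path.
import os

child_count = 3
for i in range(child_count):
    pid = os.fork()
    if pid == 0:  # child
        print('child %d (pid %d) doing work' % (i, os.getpid()))
        os._exit(0)  # exit the child explicitly; do not return into parent code
for _ in range(child_count):
    os.wait()  # parent blocks until each child has terminated
print('all children reaped')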
def main(env_id, policy_file, record, stochastic, extra_kwargs):
    import gym
    from gym import wrappers
    import tensorflow as tf
    from es_distributed.policies import MujocoPolicy, ESAtariPolicy, GAAtariPolicy
    from es_distributed.atari_wrappers import ScaledFloatFrame, wrap_deepmind
    from es_distributed.es import get_ref_batch
    import numpy as np

    is_atari_policy = "NoFrameskip" in env_id

    env = gym.make(env_id)
    if is_atari_policy:
        env = wrap_deepmind(env)
    if record:
        import uuid
        env = wrappers.Monitor(env, '/tmp/' + str(uuid.uuid4()), force=True)
    if extra_kwargs:
        import json
        extra_kwargs = json.loads(extra_kwargs)

    with tf.Session():
        if is_atari_policy:
            pi = GAAtariPolicy.Load(policy_file, extra_kwargs=extra_kwargs)
            # Commented out the policy below, as it is not the policy that we use.
            # pi = ESAtariPolicy.Load(policy_file, extra_kwargs=extra_kwargs)
            # pi.set_ref_batch(get_ref_batch(env, batch_size=128))
        else:
            pi = MujocoPolicy.Load(policy_file, extra_kwargs=extra_kwargs)
        while True:
            if is_atari_policy:
                rews, t, novelty_vector = pi.rollout(
                    env, render=True, random_stream=np.random if stochastic else None)
            print('return={:.4f} len={}'.format(rews.sum(), t))
            if record:
                env.close()
                return