def run_experiment(variant):
    if variant['env_name'] == 'humanoid-rllab':
        env = normalize(HumanoidEnv())
    elif variant['env_name'] == 'swimmer-rllab':
        env = normalize(SwimmerEnv())
    else:
        env = normalize(GymEnv(variant['env_name']))

    pool = SimpleReplayBuffer(
        env_spec=env.spec,
        max_replay_buffer_size=variant['max_pool_size'],
    )

    base_kwargs = dict(
        min_pool_size=variant['max_path_length'],
        epoch_length=variant['epoch_length'],
        n_epochs=variant['n_epochs'],
        max_path_length=variant['max_path_length'],
        batch_size=variant['batch_size'],
        n_train_repeat=variant['n_train_repeat'],
        eval_render=False,
        eval_n_episodes=1,
    )

    M = variant['layer_size']
    qf = NNQFunction(
        env_spec=env.spec,
        hidden_layer_sizes=(M, M),
    )

    # Discriminator network; its input is the actions.
    df = DFunction(env_spec=env.spec, hidden_layer_sizes=[M, M])

    vf = VFunction(env_spec=env.spec, hidden_layer_sizes=[M, M])

    policy = StochasticNNPolicy(env_spec=env.spec, hidden_layer_sizes=(M, M))

    algorithm = SQL(
        base_kwargs=base_kwargs,
        env=env,
        pool=pool,
        qf=qf,
        policy=policy,
        kernel_fn=adaptive_isotropic_gaussian_kernel,
        kernel_n_particles=16,
        kernel_update_ratio=0.5,
        value_n_particles=16,
        td_target_update_interval=1000,
        qf_lr=variant['qf_lr'],
        policy_lr=variant['policy_lr'],
        discount=variant['discount'],
        reward_scale=variant['reward_scale'],
        save_full_state=False,
        df=df,
        vf=vf,
        df_lr=1e-3,
        dist=variant['dist'],
    )

    algorithm.train()
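# Illustrative only: a variant dict with the keys that the run_experiment
# above reads. The values below are placeholders, not settings taken from any
# reported experiment, and the expected values for 'dist' are not shown in
# this excerpt.
example_variant = {
    'env_name': 'swimmer-rllab',
    'max_pool_size': 1e6,
    'max_path_length': 1000,
    'epoch_length': 1000,
    'n_epochs': 500,
    'batch_size': 128,
    'n_train_repeat': 1,
    'layer_size': 128,
    'qf_lr': 3e-4,
    'policy_lr': 3e-4,
    'discount': 0.99,
    'reward_scale': 1,
    'dist': 'normal',  # placeholder
}
# run_experiment(example_variant)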
def run_experiment(variant):
    if variant['env_name'] == 'pusher':
        # TODO: assumes `pusher.xml` is located in `rllab/models/` when
        # running on EC2.
        env = normalize(PusherEnv(goal=variant.get('goal')))
    else:
        raise ValueError("Unsupported env_name: {}".format(variant['env_name']))

    pool = SimpleReplayBuffer(
        env_spec=env.spec,
        max_replay_buffer_size=variant['max_pool_size'])

    sampler = SimpleSampler(
        max_path_length=variant['max_path_length'],
        min_pool_size=variant['max_path_length'],
        batch_size=variant['batch_size'])

    base_kwargs = dict(
        epoch_length=variant['epoch_length'],
        n_epochs=variant['n_epochs'],
        n_train_repeat=variant['n_train_repeat'],
        eval_render=False,
        eval_n_episodes=1,
        sampler=sampler)

    # Unique id used to give this variant's networks distinct names.
    task_id = abs(hash(pickle.dumps(variant)))

    M = variant['layer_size']
    qf = NNQFunction(
        env_spec=env.spec,
        hidden_layer_sizes=(M, M),
        name='qf_{i}'.format(i=task_id))

    policy = StochasticNNPolicy(
        env_spec=env.spec,
        hidden_layer_sizes=(M, M),
        name='policy_{i}'.format(i=task_id))

    algorithm = SQL(
        base_kwargs=base_kwargs,
        env=env,
        pool=pool,
        qf=qf,
        policy=policy,
        kernel_fn=adaptive_isotropic_gaussian_kernel,
        kernel_n_particles=variant['kernel_particles'],
        kernel_update_ratio=variant['kernel_update_ratio'],
        value_n_particles=variant['value_n_particles'],
        td_target_update_interval=variant['td_target_update_interval'],
        qf_lr=variant['qf_lr'],
        policy_lr=variant['policy_lr'],
        discount=variant['discount'],
        reward_scale=variant['reward_scale'],
        save_full_state=variant['save_full_state'])

    algorithm.train()
def run_experiment(variant):
    if variant['env_name'] == 'pusher':
        # TODO: assumes `pusher.xml` is located in `rllab/models/` when
        # running on EC2.
        env = normalize(PusherEnv(goal=variant.get('goal')))
    else:
        raise ValueError("Unsupported env_name: {}".format(variant['env_name']))

    pool = SimpleReplayBuffer(
        env_spec=spec(env),
        max_replay_buffer_size=variant['max_pool_size'])

    sampler = SimpleSampler(
        max_path_length=variant['max_path_length'],
        min_pool_size=variant['max_path_length'],
        batch_size=variant['batch_size'])

    base_kwargs = dict(
        epoch_length=variant['epoch_length'],
        n_epochs=variant['n_epochs'],
        n_train_repeat=variant['n_train_repeat'],
        eval_render=False,
        eval_n_episodes=1,
        sampler=sampler)

    # Unique id used to give this variant's networks distinct names.
    task_id = abs(hash(pickle.dumps(variant)))

    M = variant['layer_size']
    qf = NNQFunction(
        env_spec=spec(env),
        hidden_layer_sizes=(M, M),
        name='qf_{i}'.format(i=task_id))

    policy = StochasticNNPolicy(
        env_spec=spec(env),
        hidden_layer_sizes=(M, M),
        name='policy_{i}'.format(i=task_id))

    algorithm = SQL(
        base_kwargs=base_kwargs,
        env=env,
        pool=pool,
        qf=qf,
        policy=policy,
        kernel_fn=adaptive_isotropic_gaussian_kernel,
        kernel_n_particles=variant['kernel_particles'],
        kernel_update_ratio=variant['kernel_update_ratio'],
        value_n_particles=variant['value_n_particles'],
        td_target_update_interval=variant['td_target_update_interval'],
        qf_lr=variant['qf_lr'],
        policy_lr=variant['policy_lr'],
        discount=variant['discount'],
        reward_scale=variant['reward_scale'],
        save_full_state=variant['save_full_state'])

    algorithm.train()
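# Note: some of the scripts in this excerpt call `spec(env)` where others use
# `env.spec`. A minimal sketch of such a helper, assuming it only needs to
# return the (possibly wrapped) environment's spec object; the real helper in
# the codebase may do more.
def spec(env):
    return env.spec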
def test():
    env = normalize(MultiGoalEnv())

    pool = SimpleReplayBuffer(
        env_spec=env.spec,
        max_replay_buffer_size=1e6,
    )

    base_kwargs = dict(
        min_pool_size=100,
        epoch_length=100,
        n_epochs=1000,
        max_path_length=30,
        batch_size=64,
        n_train_repeat=1,
        eval_render=True,
        eval_n_episodes=10,
    )

    M = 128
    policy = StochasticNNPolicy(
        env.spec, hidden_layer_sizes=(M, M), squash=True)

    qf = NNQFunction(env_spec=env.spec, hidden_layer_sizes=[M, M])

    plotter = QFPolicyPlotter(
        qf=qf,
        policy=policy,
        obs_lst=np.array([[-2.5, 0.0], [0.0, 0.0], [2.5, 2.5]]),
        default_action=[np.nan, np.nan],
        n_samples=100)

    algorithm = SQL(
        base_kwargs=base_kwargs,
        env=env,
        pool=pool,
        qf=qf,
        policy=policy,
        plotter=plotter,
        policy_lr=3e-4,
        qf_lr=3e-4,
        value_n_particles=16,
        td_target_update_interval=1000,
        kernel_fn=adaptive_isotropic_gaussian_kernel,
        kernel_n_particles=32,
        kernel_update_ratio=0.5,
        discount=0.99,
        reward_scale=0.1,
        save_full_state=False,
    )

    algorithm.train()
def run_experiment(variant):
    if variant['env_name'] == 'humanoid-rllab':
        env = normalize(HumanoidEnv())
    elif variant['env_name'] == 'swimmer-rllab':
        env = normalize(SwimmerEnv())
    elif variant['env_name'] == 'ant-rllab':
        env = normalize(AntEnv())
    elif variant['env_name'] == 'sawyer-rllab':
        env = normalize(SawyerTestEnv())
    elif variant['env_name'] == 'arm3Ddisc-rllab':
        env = normalize(Arm3dDiscEnv())
    else:
        env = normalize(GymEnv(variant['env_name']))

    pool = SimpleReplayBuffer(
        env_spec=env.spec,
        max_replay_buffer_size=variant['max_pool_size'])

    sampler = SimpleSampler(
        max_path_length=variant['max_path_length'],
        min_pool_size=variant['max_path_length'],
        batch_size=variant['batch_size'])

    base_kwargs = dict(
        epoch_length=variant['epoch_length'],
        n_epochs=variant['n_epochs'],
        n_train_repeat=variant['n_train_repeat'],
        eval_render=False,
        eval_n_episodes=1,
        sampler=sampler)

    M = variant['layer_size']
    qf = NNQFunction(env_spec=env.spec, hidden_layer_sizes=(M, M))

    policy = StochasticNNPolicy(env_spec=env.spec, hidden_layer_sizes=(M, M))

    algorithm = SQL(
        base_kwargs=base_kwargs,
        env=env,
        pool=pool,
        qf=qf,
        policy=policy,
        kernel_fn=adaptive_isotropic_gaussian_kernel,
        kernel_n_particles=variant['kernel_particles'],
        kernel_update_ratio=variant['kernel_update_ratio'],
        value_n_particles=variant['value_n_particles'],
        td_target_update_interval=variant['td_target_update_interval'],
        qf_lr=variant['qf_lr'],
        policy_lr=variant['policy_lr'],
        discount=variant['discount'],
        reward_scale=variant['reward_scale'],
        save_full_state=False)

    algorithm.train()
def run_experiment(variant):
    if variant['env_name'] == 'humanoid-rllab':
        env = normalize(HumanoidEnv())
    elif variant['env_name'] == 'swimmer-rllab':
        env = normalize(SwimmerEnv())
    else:
        env = normalize(GymEnv(variant['env_name']))

    pool = SimpleReplayBuffer(
        env_spec=env.spec,
        max_replay_buffer_size=variant['max_pool_size'],
    )

    base_kwargs = dict(
        min_pool_size=variant['max_path_length'],
        epoch_length=variant['epoch_length'],
        n_epochs=variant['n_epochs'],
        max_path_length=variant['max_path_length'],
        batch_size=variant['batch_size'],
        n_train_repeat=variant['n_train_repeat'],
        eval_render=False,
        eval_n_episodes=1,
    )

    M = variant['layer_size']
    qf = NNQFunction(
        env_spec=env.spec,
        hidden_layer_sizes=(M, M),
    )

    policy = StochasticNNPolicy(env_spec=env.spec, hidden_layer_sizes=(M, M))

    algorithm = SQL(
        base_kwargs=base_kwargs,
        env=env,
        pool=pool,
        qf=qf,
        policy=policy,
        kernel_fn=adaptive_isotropic_gaussian_kernel,
        kernel_n_particles=32,
        kernel_update_ratio=0.5,
        value_n_particles=16,
        td_target_update_interval=1000,
        qf_lr=variant['qf_lr'],
        policy_lr=variant['policy_lr'],
        discount=variant['discount'],
        reward_scale=variant['reward_scale'],
        save_full_state=False,
    )

    algorithm.train()
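# SQL's SVGD-based policy update takes a `kernel_fn`; the scripts above pass
# `adaptive_isotropic_gaussian_kernel` from softqlearning.misc.kernel. Below is
# a NumPy sketch of the contract assumed here (a Gaussian kernel with a
# median-heuristic bandwidth, returning kernel values and their gradients).
# It is not the library implementation, which operates on batched TensorFlow
# tensors; the function name and the 'output'/'gradient' keys are assumptions
# for illustration.
import numpy as np

def isotropic_gaussian_kernel_np(xs, ys):
    """Gaussian kernel between two particle sets, SVGD-style.

    xs: (Kx, D) particles, ys: (Ky, D) particles.
    """
    diffs = xs[:, None, :] - ys[None, :, :]            # (Kx, Ky, D)
    sq_dists = np.sum(diffs ** 2, axis=-1)             # (Kx, Ky)
    # Median heuristic: the bandwidth adapts to the current particle spread.
    h = np.median(sq_dists) / np.log(xs.shape[0] + 1) + 1e-6
    kappa = np.exp(-sq_dists / h)                      # kernel values
    kappa_grad = -2.0 / h * diffs * kappa[..., None]   # d kappa / d xs
    return {'output': kappa, 'gradient': kappa_grad}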
def run_experiment(variant):
    env = normalize(SwimmerEnv())

    pool = SimpleReplayBuffer(
        env_spec=spec(env),
        max_replay_buffer_size=1e6)

    sampler = SimpleSampler(
        max_path_length=1000,
        min_pool_size=1000,
        batch_size=128)

    base_kwargs = dict(
        epoch_length=1000,
        n_epochs=500,
        n_train_repeat=1,
        eval_render=False,
        eval_n_episodes=1,
        sampler=sampler)

    with tf.Session().as_default():
        # Resume from a saved snapshot: either a full algorithm object or
        # separately pickled networks.
        data = joblib.load(variant['file'])
        if 'algo' in data:
            saved_qf = data['algo'].qf
            saved_policy = data['algo'].policy
        else:
            saved_qf = data['qf']
            saved_policy = data['policy']

        algorithm = SQL(
            base_kwargs=base_kwargs,
            env=env,
            pool=pool,
            qf=saved_qf,
            policy=saved_policy,
            kernel_fn=adaptive_isotropic_gaussian_kernel,
            kernel_n_particles=16,
            kernel_update_ratio=0.5,
            value_n_particles=16,
            td_target_update_interval=1000,
            qf_lr=3e-4,
            policy_lr=3e-4,
            discount=0.99,
            reward_scale=30,
            use_saved_qf=True,
            use_saved_policy=True,
            save_full_state=False)

        algorithm.train()
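# Illustrative only: the resume scripts above and below expect
# `variant['file']` to point at a joblib snapshot saved by an earlier run
# (either a full 'algo' object or separately pickled 'qf'/'policy' entries).
# The path below is a placeholder, not a real file in this repository.
example_resume_variant = {'file': '/path/to/previous/run/itr_499.pkl'}
# run_experiment(example_resume_variant)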
def run_experiment(variant):
    env = normalize(SawyerTestEnv())

    pool = SimpleReplayBuffer(
        env_spec=env.spec,
        max_replay_buffer_size=1e6)

    sampler = SimpleSampler(
        max_path_length=1000,
        min_pool_size=1000,
        batch_size=128)

    base_kwargs = dict(
        epoch_length=1000,
        n_epochs=500,
        n_train_repeat=1,
        eval_render=False,
        eval_n_episodes=1,
        sampler=sampler)

    with tf.Session().as_default():
        # Resume from the snapshot given in `variant['file']`.
        data = joblib.load(variant['file'])
        if 'algo' in data:
            saved_qf = data['algo'].qf
            saved_policy = data['algo'].policy
        else:
            saved_qf = data['qf']
            saved_policy = data['policy']

        algorithm = SQL(
            base_kwargs=base_kwargs,
            env=env,
            pool=pool,
            qf=saved_qf,
            policy=saved_policy,
            kernel_fn=adaptive_isotropic_gaussian_kernel,
            kernel_n_particles=16,
            kernel_update_ratio=0.5,
            value_n_particles=16,
            td_target_update_interval=1000,
            qf_lr=3e-4,
            policy_lr=3e-4,
            discount=0.99,
            reward_scale=30,
            use_saved_qf=True,
            use_saved_policy=True,
            save_full_state=False)

        algorithm.train()
def run_experiment(variant):
    env = normalize(PusherEnv(goal=variant.get('goal')))

    # Combine two pretrained tasks: merge their replay buffers, sum their
    # Q-functions, and train only a new policy against the combined Q.
    buffer1, qf1 = load_buffer_and_qf(variant['snapshot1'])
    buffer2, qf2 = load_buffer_and_qf(variant['snapshot2'])

    sampler = DummySampler(
        batch_size=variant['batch_size'],
        max_path_length=variant['max_path_length'])

    buffer = UnionBuffer(buffers=(buffer1, buffer2))

    qf = SumQFunction(spec(env), q_functions=(qf1, qf2))

    M = variant['layer_size']
    policy = StochasticNNPolicy(
        env_spec=spec(env),
        hidden_layer_sizes=(M, M),
        name='policy{i}'.format(i=0))

    base_kwargs = dict(
        epoch_length=variant['epoch_length'],
        n_epochs=variant['n_epochs'],
        n_train_repeat=1,
        eval_render=False,
        eval_n_episodes=1,
        sampler=sampler)

    algorithm = SQL(
        base_kwargs=base_kwargs,
        env=env,
        pool=buffer,
        qf=qf,
        policy=policy,
        kernel_fn=adaptive_isotropic_gaussian_kernel,
        kernel_n_particles=variant['kernel_particles'],
        kernel_update_ratio=variant['kernel_update_ratio'],
        policy_lr=variant['policy_lr'],
        save_full_state=False,
        train_policy=True,
        train_qf=False,
        use_saved_qf=True)

    algorithm.train()
def run_experiment(variant):
    env = normalize(PusherEnv(goal=variant.get('goal')))

    buffer1, qf1 = load_buffer_and_qf(variant['snapshot1'])
    buffer2, qf2 = load_buffer_and_qf(variant['snapshot2'])

    sampler = DummySampler(
        batch_size=variant['batch_size'],
        max_path_length=variant['max_path_length'])

    buffer = UnionBuffer(buffers=(buffer1, buffer2))

    qf = SumQFunction(env.spec, q_functions=(qf1, qf2))

    M = variant['layer_size']
    policy = StochasticNNPolicy(
        env_spec=env.spec,
        hidden_layer_sizes=(M, M),
        name='policy{i}'.format(i=0))

    base_kwargs = dict(
        epoch_length=variant['epoch_length'],
        n_epochs=variant['n_epochs'],
        n_train_repeat=1,
        eval_render=False,
        eval_n_episodes=1,
        sampler=sampler)

    algorithm = SQL(
        base_kwargs=base_kwargs,
        env=env,
        pool=buffer,
        qf=qf,
        policy=policy,
        kernel_fn=adaptive_isotropic_gaussian_kernel,
        kernel_n_particles=variant['kernel_particles'],
        kernel_update_ratio=variant['kernel_update_ratio'],
        policy_lr=variant['policy_lr'],
        save_full_state=False,
        train_policy=True,
        train_qf=False,
        use_saved_qf=True)

    algorithm.train()
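# The two combination scripts above rely on `load_buffer_and_qf`, which is not
# shown in this excerpt. A minimal sketch under the assumption that each
# snapshot is a joblib pickle (as in the resume scripts above) storing the
# replay buffer and Q-function under 'replay_buffer' and 'qf' keys; the actual
# key names and snapshot format may differ.
import joblib

def load_buffer_and_qf(filename):
    data = joblib.load(filename)
    return data['replay_buffer'], data['qf']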
def train(self, env_fn, num_timesteps, noise_type, layer_norm, folder,
          load_policy, video_width, video_height, plot_rewards,
          save_every=50, seed=1234, episode_length=1000,
          pi_hid_size=150, pi_num_hid_layers=3,
          render_frames=_render_frames, **kwargs):
    num_cpu = self.workers
    if sys.platform == 'darwin':
        num_cpu //= 2

    config = tf.ConfigProto(
        allow_soft_placement=True,
        intra_op_parallelism_threads=num_cpu,
        inter_op_parallelism_threads=num_cpu)

    if self.gpu_usage is None or self.gpu_usage <= 0.:
        os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
    else:
        config.gpu_options.allow_growth = True  # pylint: disable=E1101
        config.gpu_options.per_process_gpu_memory_fraction = self.gpu_usage / self.workers
    tf.Session(config=config).__enter__()

    worker_seed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
    set_global_seeds(worker_seed)
    tf.set_random_seed(worker_seed)
    np.random.seed(worker_seed)

    save_every = max(1, save_every)

    env = env_fn()
    env.seed(worker_seed)

    rank = MPI.COMM_WORLD.Get_rank()
    logger.info('rank {}: seed={}, logdir={}'.format(
        rank, worker_seed, logger.get_dir()))

    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(
            name=name,
            ob_space=ob_space,
            ac_space=ac_space,
            hid_size=pi_hid_size,
            num_hid_layers=pi_num_hid_layers)

    env = bench.Monitor(
        env,
        logger.get_dir() and osp.join(logger.get_dir(), str(rank)),
        allow_early_resets=True)
    gym.logger.setLevel(logging.INFO)

    that = self

    # SQL reports progress per epoch; the other methods per iteration.
    iter_name = 'iters_so_far'
    if self.method == 'sql':
        iter_name = 'epoch'

    # TODO replace with utils.create_callback(...)
    def callback(locals, globals):
        if that.method != "ddpg":
            if load_policy is not None and locals[iter_name] == 0:
                # noinspection PyBroadException
                try:
                    utils.load_state(load_policy)
                    if MPI.COMM_WORLD.Get_rank() == 0:
                        logger.info("Loaded policy network weights from %s." % load_policy)
                except:
                    logger.error("Failed to load policy network weights from %s." % load_policy)
            if MPI.COMM_WORLD.Get_rank() == 0 and locals[iter_name] == 0:
                # Save a TensorFlow summary (contains at least the graph definition).
                _ = tf.summary.FileWriter(folder, tf.get_default_graph())

        if MPI.COMM_WORLD.Get_rank() == 0 and locals[iter_name] % save_every == 0:
            print('Saving video and checkpoint for policy at iteration %i...' %
                  locals[iter_name])
            ob = env.reset()
            images = []
            rewards = []
            max_reward = 1.  # if any reward > 1, we have to rescale
            lower_part = video_height // 5
            for i in range(episode_length):
                if that.method == "ddpg":
                    ac, _ = locals['agent'].pi(ob, apply_noise=False, compute_Q=False)
                elif that.method == "sql":
                    ac, _ = locals['policy'].get_action(ob)
                elif isinstance(locals['pi'], GaussianMlpPolicy):
                    ac, _, _ = locals['pi'].act(np.concatenate((ob, ob)))
                else:
                    ac, _ = locals['pi'].act(False, ob)
                ob, rew, new, _ = env.step(ac)
                images.append(render_frames(env))
                if plot_rewards:
                    rewards.append(rew)
                    max_reward = max(rew, max_reward)
                if new:
                    break

            # Overlay a per-step reward plot in the lower part of each frame.
            orange = np.array([255, 163, 0])
            red = np.array([255, 0, 0])
            video = []
            width_factor = 1. / episode_length * video_width
            for i, imgs in enumerate(images):
                for img in imgs:
                    img[-lower_part, :10] = orange
                    img[-lower_part, -10:] = orange
                    if episode_length < video_width:
                        p_rew_x = 0
                        for j, r in enumerate(rewards[:i]):
                            rew_x = int(j * width_factor)
                            if r < 0:
                                img[-1:, p_rew_x:rew_x] = red
                            else:
                                rew_y = int(r / max_reward * lower_part)
                                img[-rew_y - 1:, p_rew_x:rew_x] = orange
                            p_rew_x = rew_x
                    else:
                        for j, r in enumerate(rewards[:i]):
                            rew_x = int(j * width_factor)
                            if r < 0:
                                img[-1:, rew_x] = red
                            else:
                                rew_y = int(r / max_reward * lower_part)
                                img[-rew_y - 1:, rew_x] = orange
                video.append(np.hstack(imgs))

            imageio.mimsave(
                os.path.join(folder, "videos", "%s_%s_iteration_%i.mp4" %
                             (that.environment, that.method, locals[iter_name])),
                video,
                fps=60)
            env.reset()

            if that.method != "ddpg":
                utils.save_state(os.path.join(
                    that.folder, "checkpoints",
                    "%s_%i" % (that.environment, locals[iter_name])))

    if self.method == "ppo":
        pposgd_simple.learn(
            env,
            policy_fn,
            max_timesteps=int(num_timesteps),
            timesteps_per_actorbatch=1024,  # 256
            clip_param=0.2,
            entcoeff=0.01,
            optim_epochs=4,
            optim_stepsize=1e-3,  # 1e-3
            optim_batchsize=64,
            gamma=0.99,
            lam=0.95,
            schedule='linear',  # 'linear'
            callback=callback)
    elif self.method == "trpo":
        trpo_mpi.learn(
            env,
            policy_fn,
            max_timesteps=int(num_timesteps),
            timesteps_per_batch=1024,
            max_kl=0.1,  # 0.01
            cg_iters=10,
            cg_damping=0.1,
            gamma=0.99,
            lam=0.98,
            vf_iters=5,
            vf_stepsize=1e-3,
            callback=callback)
    elif self.method == "acktr":
        from algos.acktr import acktr
        with tf.Session(config=tf.ConfigProto()):
            ob_dim = env.observation_space.shape[0]
            ac_dim = env.action_space.shape[0]
            with tf.variable_scope("vf"):
                vf = NeuralNetValueFunction(ob_dim, ac_dim)
            with tf.variable_scope("pi"):
                policy = GaussianMlpPolicy(ob_dim, ac_dim)
            acktr.learn(
                env,
                pi=policy,
                vf=vf,
                gamma=0.99,
                lam=0.97,
                timesteps_per_batch=1024,
                desired_kl=0.01,  # 0.002
                num_timesteps=num_timesteps,
                animate=False,
                callback=callback)
    elif self.method == "ddpg":
        from algos.ddpg import ddpg

        # Parse noise_type.
        action_noise = None
        param_noise = None
        nb_actions = env.action_space.shape[-1]
        for current_noise_type in noise_type.split(','):
            current_noise_type = current_noise_type.strip()
            if current_noise_type == 'none':
                pass
            elif 'adaptive-param' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                from baselines.ddpg.noise import AdaptiveParamNoiseSpec
                param_noise = AdaptiveParamNoiseSpec(
                    initial_stddev=float(stddev),
                    desired_action_stddev=float(stddev))
            elif 'normal' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                from baselines.ddpg.noise import NormalActionNoise
                action_noise = NormalActionNoise(
                    mu=np.zeros(nb_actions),
                    sigma=float(stddev) * np.ones(nb_actions))
            elif 'ou' in current_noise_type:
                from baselines.ddpg.noise import OrnsteinUhlenbeckActionNoise
                _, stddev = current_noise_type.split('_')
                action_noise = OrnsteinUhlenbeckActionNoise(
                    mu=np.zeros(nb_actions),
                    sigma=float(stddev) * np.ones(nb_actions))
            else:
                raise RuntimeError(
                    'unknown noise type "{}"'.format(current_noise_type))

        # Configure components.
        memory = Memory(
            limit=int(1e6),
            action_shape=env.action_space.shape,
            observation_shape=env.observation_space.shape)
        critic = Critic(layer_norm=layer_norm)
        actor = Actor(nb_actions, layer_norm=layer_norm)

        ddpg.train(
            env=env,
            eval_env=None,
            param_noise=param_noise,
            render=False,
            render_eval=False,
            action_noise=action_noise,
            actor=actor,
            critic=critic,
            memory=memory,
            callback=callback,
            **kwargs)
    elif self.method == "sql":
        from softqlearning.algorithms import SQL
        from softqlearning.misc.kernel import adaptive_isotropic_gaussian_kernel
        from softqlearning.misc.utils import timestamp
        from softqlearning.replay_buffers import SimpleReplayBuffer
        from softqlearning.value_functions import NNQFunction
        from softqlearning.policies import StochasticNNPolicy
        from rllab.envs.gym_env import GymEnv

        env = GymEnv(env)

        variant = {
            'seed': [1, 2, 3],
            'policy_lr': 3E-4,
            'qf_lr': 3E-4,
            'discount': 0.99,
            'layer_size': 128,
            'batch_size': 128,
            'max_pool_size': 1E6,
            'n_train_repeat': 1,
            'epoch_length': 1000,
            'snapshot_mode': 'last',
            'snapshot_gap': 100,
        }

        pool = SimpleReplayBuffer(
            env_spec=env.spec,
            max_replay_buffer_size=variant['max_pool_size'],
        )

        base_kwargs = dict(
            min_pool_size=episode_length,
            epoch_length=episode_length,
            n_epochs=num_timesteps,
            max_path_length=episode_length,
            batch_size=variant['batch_size'],
            n_train_repeat=variant['n_train_repeat'],
            eval_render=False,
            eval_n_episodes=1,
            iter_callback=callback,
        )

        qf = NNQFunction(
            env_spec=env.spec,
            hidden_layer_sizes=tuple([pi_hid_size] * pi_num_hid_layers),
        )

        pi_layers = tuple([pi_hid_size] * pi_num_hid_layers)
        policy = StochasticNNPolicy(env_spec=env.spec, hidden_layer_sizes=pi_layers)

        algorithm = SQL(
            base_kwargs=base_kwargs,
            env=env,
            pool=pool,
            qf=qf,
            policy=policy,
            kernel_fn=adaptive_isotropic_gaussian_kernel,
            kernel_n_particles=32,
            kernel_update_ratio=0.5,
            value_n_particles=16,
            td_target_update_interval=1000,
            qf_lr=variant['qf_lr'],
            policy_lr=variant['policy_lr'],
            discount=variant['discount'],
            reward_scale=1,
            save_full_state=False,
        )

        algorithm.train()
    else:
        print('ERROR: Invalid "method" argument provided.', file=sys.stderr)

    env.close()
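# Illustrative noise_type values for the DDPG branch of train() above. The
# '<kind>_<stddev>' comma-separated format is inferred from the parsing loop;
# the stddev numbers are placeholders.
example_noise_types = [
    'none',
    'adaptive-param_0.2',
    'normal_0.1',
    'ou_0.2',
    'adaptive-param_0.2,ou_0.1',  # multiple entries may be combined
]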