def train(env_id, num_timesteps, seed):
    """Run the module-level ``learn`` routine on a monitored Gym environment.

    Args:
        env_id: Identifier handed to ``gym.make``.
        num_timesteps: Forwarded to ``learn`` as ``num_timesteps``.
        seed: Applied through ``set_global_seeds`` and ``env.seed``.
    """
    env = gym.make(env_id)
    env = bench.Monitor(env, logger.get_dir())
    set_global_seeds(seed)
    env.seed(seed)
    gym.logger.setLevel(logging.WARN)

    # Fixed hyperparameters; only the step budget comes from the caller.
    hyperparams = dict(
        gamma=0.99,
        lam=0.97,
        timesteps_per_batch=2500,
        desired_kl=0.002,
        num_timesteps=num_timesteps,
        animate=False,
    )

    session_config = tf.ConfigProto()
    with tf.Session(config=session_config):
        # Network sizes are taken from the first axis of each space's shape.
        obs_size = env.observation_space.shape[0]
        act_size = env.action_space.shape[0]

        with tf.variable_scope("vf"):
            value_fn = NeuralNetValueFunction(obs_size, act_size)
        with tf.variable_scope("pi"):
            pi = GaussianMlpPolicy(obs_size, act_size)

        learn(env, policy=pi, vf=value_fn, **hyperparams)
        env.close()
def train(env_id, num_timesteps, seed):
    """Run the module-level ``learn`` routine on a ``Lynx`` environment.

    Args:
        env_id: Not used; the environment is always ``Lynx()``.
        num_timesteps: Forwarded to ``learn`` as ``num_timesteps``.
        seed: Applied through ``set_global_seeds`` only.
    """
    env = Lynx()
    # NOTE: wrapping the env in bench.Monitor and calling env.seed(seed) are
    # both disabled here (they were commented out), so only the global RNGs
    # are seeded.
    set_global_seeds(seed)
    gym.logger.setLevel(logging.WARN)

    with tf.Session(config=tf.ConfigProto()) as session:
        obs_size = env.observation_space.shape[0]
        act_size = env.action_space.shape[0]

        with tf.variable_scope("vf"):
            value_fn = NeuralNetValueFunction(obs_size, act_size)

        with tf.variable_scope("pi"):
            # The module-level MLP flag picks the policy class.
            pi = (
                MlpPolicy(session, ob_space=env.observation_space, ac_space=env.action_space)
                if MLP
                else GaussianMlpPolicy(obs_size, act_size)
            )

        learn(
            env,
            policy=pi,
            vf=value_fn,
            gamma=0.99,
            lam=0.97,
            timesteps_per_batch=50,
            desired_kl=0.002,
            num_timesteps=num_timesteps,
            animate=False,
        )
def train(env_id, num_timesteps, seed, alg, lr, momentum):
    """Train on a MuJoCo environment with one of three ACKTR learner variants.

    Args:
        env_id: Environment identifier passed to ``make_mujoco_env``.
        num_timesteps: Forwarded to ``learn`` as ``num_timesteps``.
        seed: Passed to ``make_mujoco_env``.
        alg: Learner variant: ``'sgd'`` (``acktr_cont``), ``'mid'``
            (``acktr_cont_midpoint``) or ``'geo'`` (``acktr_cont_geo``).
        lr: Forwarded to ``learn`` as ``lr``.
        momentum: Forwarded to ``learn`` as ``momentum``.

    Raises:
        ValueError: If ``alg`` is not one of the supported variants.
    """
    # Resolve the learner before building anything, so an unknown ``alg``
    # fails immediately instead of after an environment has been created.
    if alg == 'sgd':
        from baselines.acktr.acktr_cont import learn
    elif alg == 'mid':
        from baselines.acktr.acktr_cont_midpoint import learn
    elif alg == 'geo':
        from baselines.acktr.acktr_cont_geo import learn
    else:
        raise ValueError(
            "unknown alg {!r}; expected 'sgd', 'mid' or 'geo'".format(alg))

    env = make_mujoco_env(env_id, seed)

    # Thread count used for both intra-op and inter-op parallelism.
    nprocs = 4
    session_config = tf.ConfigProto(
        allow_soft_placement=True,
        intra_op_parallelism_threads=nprocs,
        inter_op_parallelism_threads=nprocs)
    with tf.Session(config=session_config):
        ob_dim = env.observation_space.shape[0]
        ac_dim = env.action_space.shape[0]
        with tf.variable_scope("vf"):
            vf = NeuralNetValueFunction(ob_dim, ac_dim)
        # This policy variant receives its scope name as an argument.
        policy = GaussianMlpPolicy(ob_dim, ac_dim, 'pi')

        learn(env, policy=policy, vf=vf,
              gamma=0.99, lam=0.97, timesteps_per_batch=2500,
              desired_kl=0.002,
              num_timesteps=num_timesteps, animate=False,
              lr=lr, momentum=momentum)

        env.close()
def run_train_task(vv):
    """Run one training task described by the variant mapping ``vv``.

    Keys read from ``vv``: ``'env'`` (a callable that builds the
    environment), ``'path_length'``, ``'discount'``, ``'batch_size'`` and
    ``'num_timesteps'``.
    """
    # Build the environment from the factory stored in the variant.
    env_factory = vv['env']
    env = env_factory(log_scale_limit=0.0, max_path_length=vv['path_length'])

    with tf.Session(config=tf.ConfigProto()):
        obs_size = env.observation_space.shape[0]
        act_size = env.action_space.shape[0]

        with tf.variable_scope("vf"):
            value_fn = NeuralNetValueFunction(obs_size, act_size)
        with tf.variable_scope("pi"):
            pi = GaussianMlpPolicy(obs_size, act_size)

        # lam and desired_kl are fixed; the rest comes from the variant.
        settings = dict(
            gamma=vv['discount'],
            lam=0.97,
            timesteps_per_batch=vv['batch_size'],
            desired_kl=0.002,
            num_timesteps=vv['num_timesteps'],
            max_path_length=vv['path_length'],
            animate=False,
        )
        learn(env, policy=pi, vf=value_fn, **settings)
        env.close()
def train(args, num_timesteps, seed):
    """Train ACKTR (``baselines.acktr.acktr_cont.learn``) on an env built from ``args``.

    Args:
        args: Passed unchanged to ``common.make_env``.
        num_timesteps: Forwarded to ``learn`` as ``num_timesteps``.
        seed: Not used inside this function.
    """
    # Heavy dependencies are imported lazily, at call time. The cmd_util
    # names are not referenced below but the import is kept as-is.
    import tensorflow as tf
    from baselines.common.cmd_util import make_mujoco_env, mujoco_arg_parser
    from baselines.acktr.acktr_cont import learn
    from baselines.acktr.policies import GaussianMlpPolicy
    from baselines.acktr.value_functions import NeuralNetValueFunction

    env = common.make_env(args)
    env.reward_scale = 0.01

    session_config = tf.ConfigProto()
    with tf.Session(config=session_config):
        obs_size = env.observation_space.shape[0]
        act_size = env.action_space.shape[0]

        with tf.variable_scope("vf"):
            value_fn = NeuralNetValueFunction(obs_size, act_size)
        with tf.variable_scope("pi"):
            pi = GaussianMlpPolicy(obs_size, act_size)

        learn(
            env,
            policy=pi,
            vf=value_fn,
            gamma=0.99,
            lam=0.97,
            timesteps_per_batch=2500,
            desired_kl=0.002,
            num_timesteps=num_timesteps,
            animate=False,
        )
        env.close()
def train(env_id, num_timesteps, seed):
    """Run ``learn`` on a Gym environment, taking extra options from ``args``.

    Besides its parameters, this function reads a module-level ``args``
    object for ``timesteps_per_batch``, ``save_path``, ``save_after``,
    ``load_path``, ``save_rollouts`` and ``animate``.

    Args:
        env_id: Identifier handed to ``gym.make``.
        num_timesteps: Forwarded to ``learn`` as ``num_timesteps``.
        seed: Applied through ``set_global_seeds`` and ``env.seed``.
    """
    env = gym.make(env_id)
    # The Monitor wrapper is added only when a logger directory is configured.
    if logger.get_dir():
        monitor_file = os.path.join(logger.get_dir(), "monitor.json")
        env = bench.Monitor(env, monitor_file)
    set_global_seeds(seed)
    env.seed(seed)
    gym.logger.setLevel(logging.WARN)

    with tf.Session(config=tf.ConfigProto()) as session:
        obs_size = env.observation_space.shape[0]
        act_size = env.action_space.shape[0]

        with tf.variable_scope("vf"):
            value_fn = NeuralNetValueFunction(obs_size, act_size)
        with tf.variable_scope("pi"):
            pi = GaussianMlpPolicy(obs_size, act_size)

        learn(
            env,
            policy=pi,
            vf=value_fn,
            gamma=0.99,
            lam=0.97,
            timesteps_per_batch=args.timesteps_per_batch,
            desired_kl=0.002,
            num_timesteps=num_timesteps,
            save_path=args.save_path,
            save_after=args.save_after,
            load_path=args.load_path,
            save_rollouts=args.save_rollouts,
            animate=args.animate,
        )
        env.close()
def train(env, num_timesteps, timesteps_per_batch, seed, num_cpu, resume,
          hid_size, num_hid_layers, logdir, agentName, desired_kl, gamma, lam,
          portnum, num_parallel):
    """Run ``learn`` on a single Gym env or a ``CustomParallelEnv``.

    Args:
        env: Gym environment id; only used when ``num_parallel <= 1``.
        num_timesteps: Forwarded to ``learn`` as ``num_timesteps``.
        timesteps_per_batch: Forwarded to ``learn``.
        seed: Applied through ``set_global_seeds``; also passed to
            ``env.seed`` for the single-env case.
        num_cpu: Not used inside this function.
        resume: Forwarded to ``learn``.
        hid_size: Currently not used (see NOTE below).
        num_hid_layers: Currently not used (see NOTE below).
        logdir: Forwarded to ``learn``.
        agentName: Forwarded to ``learn``.
        desired_kl: Forwarded to ``learn``.
        gamma: Forwarded to ``learn``.
        lam: Forwarded to ``learn``.
        portnum: Not used inside this function.
        num_parallel: When greater than 1, a ``CustomParallelEnv`` with this
            many members replaces the Gym environment.
    """
    if num_parallel > 1:
        env = CustomParallelEnv(num_parallel)
    else:
        env = gym.make(env)
        env.seed(seed)  # TODO: seed the parallel env as well
    if logger.get_dir():
        env = bench.Monitor(env, os.path.join(logger.get_dir(), "monitor.json"))
    set_global_seeds(seed)
    gym.logger.setLevel(logging.WARN)

    with tf.Session(config=tf.ConfigProto()):
        ob_dim = env.observation_space.shape[0]
        ac_dim = env.action_space.shape[0]
        # NOTE(review): hid_size / num_hid_layers are accepted but the
        # networks are built with fixed 128x2 layers. Left unchanged because
        # honouring the arguments would presumably make checkpoints used via
        # ``resume`` incompatible; confirm before wiring them through.
        with tf.variable_scope("vf"):
            vf = NeuralNetValueFunction(ob_dim, ac_dim, hid_size=128, num_hid_layers=2)
        with tf.variable_scope("pi"):
            policy = GaussianMlpPolicy(ob_dim, ac_dim, hid_size=128, num_hid_layers=2)

        # ``lam`` used to be hard-coded to 0.97 here, silently discarding the
        # caller's value while gamma and desired_kl were honoured.
        learn(env, policy=policy, vf=vf,
              gamma=gamma, lam=lam, timesteps_per_batch=timesteps_per_batch,
              desired_kl=desired_kl, resume=resume, logdir=logdir,
              agentName=agentName, num_timesteps=num_timesteps, animate=True)

        env.close()
def train(env_id, num_timesteps, seed):
    """Run ``learn`` on an environment built by ``make_gym_control_env``.

    Args:
        env_id: Environment identifier passed to ``make_gym_control_env``.
        num_timesteps: Forwarded to ``learn`` as ``num_timesteps``.
        seed: Passed to ``make_gym_control_env``.
    """
    env = make_gym_control_env(env_id, seed)

    fixed_hyperparams = {
        "gamma": 0.99,
        "lam": 0.97,
        "timesteps_per_batch": 2500,
        "desired_kl": 0.002,
        "animate": False,
    }

    with tf.Session(config=tf.ConfigProto()):
        obs_size = env.observation_space.shape[0]
        act_size = env.action_space.shape[0]

        with tf.variable_scope("vf"):
            value_fn = NeuralNetValueFunction(obs_size, act_size)
        with tf.variable_scope("pi"):
            pi = GaussianMlpPolicy(obs_size, act_size)

        learn(env, policy=pi, vf=value_fn, num_timesteps=num_timesteps,
              **fixed_hyperparams)
        env.close()
def train(num_timesteps, seed, env_name, fname):
    """Run ``learn`` on ``env_name`` and checkpoint the session on Ctrl-C.

    Args:
        num_timesteps: Forwarded to ``learn`` as ``num_timesteps``.
        seed: Applied through ``set_global_seeds`` and ``env.seed``.
        env_name: Passed to ``make_env``; if that returns ``None`` the
            function logs a message and returns without training.
        fname: Forwarded to ``learn``. When training is interrupted with
            ``KeyboardInterrupt`` and ``fname`` is not ``None``, the default
            TensorFlow session is saved to this path.
    """
    env = make_env(env_name)
    if env is None:
        logger.log("Empty environment")
        return

    env = bench.Monitor(env, logger.get_dir(), allow_early_resets=True)
    set_global_seeds(seed)
    env.seed(seed)

    with tf.Session(config=tf.ConfigProto()):
        ob_dim = env.observation_space.shape[0]
        ac_dim = env.action_space.shape[0]
        with tf.variable_scope("vf"):
            vf = NeuralNetValueFunction(ob_dim, ac_dim)
        with tf.variable_scope("pi"):
            policy = GaussianMlpPolicy(ob_dim, ac_dim)

        try:
            learn(
                env,
                policy=policy, vf=vf,
                gamma=0.99, lam=0.97,
                timesteps_per_batch=1,  # 4500
                desired_kl=0.002,
                num_timesteps=num_timesteps,
                animate=visualization,
                fname=fname)
        except KeyboardInterrupt:
            # An interrupt means "stop and keep what was learned so far",
            # so it is absorbed here rather than propagated.
            if fname is not None:
                # A bare file name has an empty dirname, and os.makedirs('')
                # raises; only create the directory when there is one.
                target_dir = os.path.dirname(fname)
                if target_dir:
                    os.makedirs(target_dir, exist_ok=True)
                saver = tf.train.Saver()
                saver.save(tf.get_default_session(), fname)
                logger.log("Model saved to file {}".format(fname))

        env.close()
def train(env_id, num_timesteps, seed, render):
    """Run ``learn`` on a monitored ``LearningEnvironment``.

    Args:
        env_id: Not used; the environment is always a ``LearningEnvironment``
            with ``PARTICLES`` particles.
        num_timesteps: Forwarded to ``learn`` as ``num_timesteps``.
        seed: Applied through ``set_global_seeds``.
        render: When false, the environment is created with rendering
            disabled.
    """
    env = LearningEnvironment(num_particles=PARTICLES, disable_render=not render)

    # logger.get_dir() can be None when no log directory is configured, and
    # os.path.join(None, ...) raises TypeError. Pass None to the Monitor in
    # that case so the env is still wrapped.
    log_dir = logger.get_dir()
    monitor_path = os.path.join(log_dir, "monitor.json") if log_dir else None
    env = bench.Monitor(env, monitor_path)

    set_global_seeds(seed)
    gym.logger.setLevel(logging.WARN)

    with tf.Session(config=tf.ConfigProto()):
        ob_dim = env.observation_space.shape[0]
        ac_dim = env.action_space.shape[0]
        with tf.variable_scope("vf"):
            vf = NeuralNetValueFunction(ob_dim, ac_dim)
        with tf.variable_scope("pi"):
            policy = GaussianMlpPolicy(ob_dim, ac_dim)

        learn(env, policy=policy, vf=vf,
              gamma=0.99, lam=0.97, timesteps_per_batch=8000,
              desired_kl=0.0002,
              num_timesteps=num_timesteps, animate=False)

        env.close()
def train(env_id, num_timesteps, seed):
    """
    Train an ACKTR model on an environment built by ``make_mujoco_env``.

    :param env_id: (str) Environment ID
    :param num_timesteps: (int) The total number of samples
    :param seed: (int) The initial seed for training
    """
    env = make_mujoco_env(env_id, seed)

    session_config = tf.ConfigProto()
    with tf.Session(config=session_config):
        obs_size = env.observation_space.shape[0]
        act_size = env.action_space.shape[0]

        with tf.variable_scope("vf"):
            critic = NeuralNetValueFunction(obs_size, act_size)
        with tf.variable_scope("pi"):
            actor = GaussianMlpPolicy(obs_size, act_size)

        learn(
            env,
            policy=actor,
            value_fn=critic,
            gamma=0.99,
            lam=0.97,
            timesteps_per_batch=2500,
            desired_kl=0.002,
            num_timesteps=num_timesteps,
            animate=False,
        )
        env.close()
def train(env_id, num_timesteps, seed, save, gamma, lam, desired_kl):
    """Run ``learn`` on a MuJoCo environment and write its return value to ``save``.

    Args:
        env_id: Environment identifier passed to ``make_mujoco_env``.
        num_timesteps: Forwarded to ``learn`` as ``num_timesteps``.
        seed: Passed to ``make_mujoco_env``.
        save: Target handed to ``np.savetxt``.
        gamma: Forwarded to ``learn``.
        lam: Forwarded to ``learn``.
        desired_kl: Forwarded to ``learn``.
    """
    env = make_mujoco_env(env_id, seed)

    with tf.Session(config=tf.ConfigProto()):
        obs_size = env.observation_space.shape[0]
        act_size = env.action_space.shape[0]

        with tf.variable_scope("vf"):
            value_fn = NeuralNetValueFunction(obs_size, act_size)
        with tf.variable_scope("pi"):
            pi = GaussianMlpPolicy(obs_size, act_size)

        outcome = learn(
            env,
            policy=pi,
            vf=value_fn,
            gamma=gamma,
            lam=lam,
            desired_kl=desired_kl,
            timesteps_per_batch=2500,
            num_timesteps=num_timesteps,
            animate=False,
        )
        env.close()

    # The value returned by learn is stored as a one-element array.
    wrapped = np.array([outcome])
    np.savetxt(save, wrapped)
def train(env_id, num_timesteps, seed):
    """Run ``learn`` on a Gym environment with a per-MPI-rank monitor file.

    Args:
        env_id: Identifier handed to ``gym.make``.
        num_timesteps: Forwarded to ``learn`` as ``num_timesteps``.
        seed: Applied through ``set_global_seeds`` and ``env.seed``.
    """
    env = gym.make(env_id)

    # Each MPI rank logs under its own name inside the logger directory; if
    # no directory is set, the falsy value is passed to the Monitor as-is.
    rank = MPI.COMM_WORLD.Get_rank()
    monitor_path = logger.get_dir() and os.path.join(logger.get_dir(), str(rank))
    env = bench.Monitor(env, monitor_path)

    set_global_seeds(seed)
    env.seed(seed)
    gym.logger.setLevel(logging.WARN)

    session_config = tf.ConfigProto()
    session_config.gpu_options.allow_growth = True

    with tf.Session(config=session_config):
        obs_size = env.observation_space.shape[0]
        act_size = env.action_space.shape[0]

        with tf.variable_scope("vf"):
            value_fn = NeuralNetValueFunction(obs_size, act_size)
        with tf.variable_scope("pi"):
            pi = GaussianMlpPolicy(obs_size, act_size)

        learn(
            env,
            policy=pi,
            vf=value_fn,
            gamma=0.99,
            lam=0.97,
            timesteps_per_batch=2500,
            desired_kl=0.002,
            num_timesteps=num_timesteps,
            animate=False,
        )
        env.close()
from baselines.acktr.value_functions import NeuralNetValueFunction
from baselines.common import set_global_seeds

# Create the environment registered as 'GazeboModularScara3DOF-v0', reset it
# once and print the observation that the reset returned.
env = gym.make('GazeboModularScara3DOF-v0')
initial_observation = env.reset()
print("Initial observation: ", initial_observation)
env.render()

# Seed the global RNGs and the environment with one fixed value. Note that
# this happens after the first reset/render above.
seed=0
set_global_seeds(seed)
env.seed(seed)

with tf.Session(config=tf.ConfigProto()) as session:
    # Network sizes are taken from the first axis of each space's shape.
    ob_dim = env.observation_space.shape[0]
    ac_dim = env.action_space.shape[0]
    # The value function and the policy live in separate variable scopes.
    with tf.variable_scope("vf"):
        vf = NeuralNetValueFunction(ob_dim, ac_dim)
    with tf.variable_scope("pi"):
        policy = GaussianMlpPolicy(ob_dim, ac_dim)
    # Both model-file arguments are passed as empty strings; presumably this
    # disables saving/restoring. TODO confirm against learn().
    learn(env, policy=policy, vf=vf, gamma=0.99, lam=0.97, timesteps_per_batch=2500, desired_kl=0.02, num_timesteps=1e6, animate=False, save_model_with_prefix='', restore_model_from_file='')
def train(self, env_fn, num_timesteps, noise_type, layer_norm, folder, load_policy, video_width, video_height,
          plot_rewards, save_every=50, seed=1234, episode_length=1000, pi_hid_size=150, pi_num_hid_layers=3,
          render_frames=_render_frames, **kwargs):
    """Train a policy with the algorithm named by ``self.method``.

    Supported values of ``self.method`` are ``"ppo"``, ``"trpo"``,
    ``"acktr"``, ``"ddpg"`` and ``"sql"``. Any other value only prints an
    error to stderr. In every case the environment is closed at the end.

    A callback is handed to the learner. On rank 0, every ``save_every``
    iterations, it rolls out one episode, writes an MP4 of the rendered
    frames (optionally with a reward bar chart drawn into them) and, for all
    methods except ddpg, saves a checkpoint.

    Args:
        env_fn: Zero-argument callable that returns the environment.
        num_timesteps: Training budget. Passed as ``max_timesteps`` (ppo,
            trpo), ``num_timesteps`` (acktr) or ``n_epochs`` (sql). Not
            referenced in the ddpg branch.
        noise_type: Comma-separated noise specification, ddpg only. Each
            entry is ``none`` or contains ``adaptive-param``, ``normal`` or
            ``ou`` followed by ``_<stddev>``.
        layer_norm: Passed to the ddpg ``Critic`` and ``Actor``.
        folder: Directory for the TensorFlow summary and ``videos/`` output.
            Checkpoints are written under ``self.folder`` instead.
        load_policy: Path given to ``utils.load_state`` at iteration 0 for
            non-ddpg methods, or ``None`` to skip loading.
        video_width: Used to scale the reward chart horizontally.
        video_height: One fifth of it is the maximum reward bar height.
        plot_rewards: If true, rollout rewards are recorded and drawn.
        save_every: Iteration interval for video/checkpoint; clamped to >= 1.
        seed: Base seed; each worker uses ``seed + 10000 * rank``.
        episode_length: Maximum steps of the evaluation rollout; also used
            for several length settings of the sql algorithm.
        pi_hid_size: Hidden layer width for the ppo/trpo policy and the sql
            networks.
        pi_num_hid_layers: Hidden layer count for the same networks.
        render_frames: Callable taking the env and returning an iterable of
            image arrays for the current step.
        **kwargs: Forwarded to ``ddpg.train`` only.

    Raises:
        RuntimeError: If ``noise_type`` contains an unknown entry (ddpg).
    """
    # Halve the thread count on macOS.
    num_cpu = self.workers
    if sys.platform == 'darwin':
        num_cpu //= 2
    config = tf.ConfigProto(
        allow_soft_placement=True,
        intra_op_parallelism_threads=num_cpu,
        inter_op_parallelism_threads=num_cpu)

    if self.gpu_usage is None or self.gpu_usage <= 0.:
        # No GPU share requested: hide all CUDA devices from this process.
        os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
    else:
        config.gpu_options.allow_growth = True  # pylint: disable=E1101
        # Split the requested GPU memory fraction evenly across workers.
        config.gpu_options.per_process_gpu_memory_fraction = self.gpu_usage / self.workers
    # The session is entered here and never exited within this method.
    tf.Session(config=config).__enter__()

    # Give every MPI rank its own seed, derived from the base seed.
    worker_seed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
    set_global_seeds(worker_seed)
    tf.set_random_seed(worker_seed)
    np.random.seed(worker_seed)

    save_every = max(1, save_every)

    env = env_fn()
    env.seed(worker_seed)

    rank = MPI.COMM_WORLD.Get_rank()
    logger.info('rank {}: seed={}, logdir={}'.format(rank, worker_seed, logger.get_dir()))

    def policy_fn(name, ob_space, ac_space):
        """Build the MLP policy used by ppo and trpo with the configured layer sizes."""
        return mlp_policy.MlpPolicy(
            name=name, ob_space=ob_space, ac_space=ac_space,
            hid_size=pi_hid_size, num_hid_layers=pi_num_hid_layers)

    # Per-rank monitor file inside the logger directory (if one is set).
    env = bench.Monitor(
        env, logger.get_dir() and osp.join(logger.get_dir(), str(rank)),
        allow_early_resets=True)
    gym.logger.setLevel(logging.INFO)

    # Alias for ``self`` that the nested callback closes over.
    that = self

    # Key in the learner's locals dict holding the current iteration number.
    iter_name = 'iters_so_far'
    if self.method == 'sql':
        iter_name = 'epoch'

    # TODO replace with utils.create_callback(...)
    def callback(locals, globals):
        """Per-iteration hook: load weights once, then periodically save video and checkpoint.

        ``locals`` is indexed as a dict of the learner's local variables;
        ``globals`` is not used.
        """
        # NOTE(review): the nesting of the FileWriter check under the
        # non-ddpg branch is reconstructed from whitespace-collapsed source;
        # confirm against the original layout.
        if that.method != "ddpg":
            if load_policy is not None and locals[iter_name] == 0:
                # Best effort: a failed load is logged and training continues.
                # noinspection PyBroadException
                try:
                    utils.load_state(load_policy)
                    if MPI.COMM_WORLD.Get_rank() == 0:
                        logger.info("Loaded policy network weights from %s." % load_policy)
                except:
                    logger.error("Failed to load policy network weights from %s." % load_policy)
            # save TensorFlow summary (contains at least the graph definition)
            if MPI.COMM_WORLD.Get_rank() == 0 and locals[iter_name] == 0:
                _ = tf.summary.FileWriter(folder, tf.get_default_graph())

        if MPI.COMM_WORLD.Get_rank() == 0 and locals[iter_name] % save_every == 0:
            print('Saving video and checkpoint for policy at iteration %i...' % locals[iter_name])
            ob = env.reset()
            images = []
            rewards = []
            max_reward = 1.  # if any reward > 1, we have to rescale
            # Height in pixels reserved for the reward bars.
            lower_part = video_height // 5

            # Roll out one episode, collecting rendered frames and rewards.
            for i in range(episode_length):
                # Each method exposes its policy under a different local
                # name and with a different action interface.
                if that.method == "ddpg":
                    ac, _ = locals['agent'].pi(ob, apply_noise=False, compute_Q=False)
                elif that.method == "sql":
                    ac, _ = locals['policy'].get_action(ob)
                elif isinstance(locals['pi'], GaussianMlpPolicy):
                    # The observation is concatenated with itself here;
                    # presumably this policy expects a doubled input. TODO confirm.
                    ac, _, _ = locals['pi'].act(np.concatenate((ob, ob)))
                else:
                    ac, _ = locals['pi'].act(False, ob)
                ob, rew, new, _ = env.step(ac)
                images.append(render_frames(env))
                if plot_rewards:
                    rewards.append(rew)
                    max_reward = max(rew, max_reward)
                if new:
                    break

            orange = np.array([255, 163, 0])
            red = np.array([255, 0, 0])
            video = []
            # Horizontal pixels per rollout step.
            width_factor = 1. / episode_length * video_width
            for i, imgs in enumerate(images):
                for img in imgs:
                    # Tick marks at the left and right edge, on the row that
                    # corresponds to the maximum bar height.
                    img[-lower_part, :10] = orange
                    img[-lower_part, -10:] = orange
                    # Draw the rewards seen before frame i. ``rewards`` is
                    # empty when plot_rewards is false, so nothing is drawn.
                    if episode_length < video_width:
                        # A step is wider than one pixel: fill the span from
                        # the previous step's x position to this one.
                        p_rew_x = 0
                        for j, r in enumerate(rewards[:i]):
                            rew_x = int(j * width_factor)
                            if r < 0:
                                # Negative rewards: bottom row only, in red.
                                img[-1:, p_rew_x:rew_x] = red
                                img[-1:, p_rew_x:rew_x] = red
                            else:
                                # Bar height is scaled against the largest reward.
                                rew_y = int(r / max_reward * lower_part)
                                img[-rew_y - 1:, p_rew_x:rew_x] = orange
                                img[-rew_y - 1:, p_rew_x:rew_x] = orange
                            p_rew_x = rew_x
                    else:
                        # At most one pixel column per step.
                        for j, r in enumerate(rewards[:i]):
                            rew_x = int(j * width_factor)
                            if r < 0:
                                img[-1:, rew_x] = red
                                img[-1:, rew_x] = red
                            else:
                                rew_y = int(r / max_reward * lower_part)
                                img[-rew_y - 1:, rew_x] = orange
                                img[-rew_y - 1:, rew_x] = orange
                # All images of one step are placed side by side in one frame.
                video.append(np.hstack(imgs))

            imageio.mimsave(
                os.path.join(folder, "videos", "%s_%s_iteration_%i.mp4" %
                             (that.environment, that.method, locals[iter_name])),
                video,
                fps=60)
            # Reset again after the evaluation rollout, before control
            # returns to the learner.
            env.reset()

            if that.method != "ddpg":
                utils.save_state(os.path.join(that.folder, "checkpoints", "%s_%i" %
                                              (that.environment, locals[iter_name])))

    if self.method == "ppo":
        pposgd_simple.learn(
            env, policy_fn,
            max_timesteps=int(num_timesteps),
            timesteps_per_actorbatch=1024,  # 256
            clip_param=0.2, entcoeff=0.01,
            optim_epochs=4, optim_stepsize=1e-3,  # 1e-3
            optim_batchsize=64,
            gamma=0.99, lam=0.95,
            schedule='linear',  # 'linear'
            callback=callback)
    elif self.method == "trpo":
        trpo_mpi.learn(
            env, policy_fn,
            max_timesteps=int(num_timesteps),
            timesteps_per_batch=1024,
            max_kl=0.1,  # 0.01
            cg_iters=10, cg_damping=0.1,
            gamma=0.99, lam=0.98,
            vf_iters=5, vf_stepsize=1e-3,
            callback=callback)
    elif self.method == "acktr":
        from algos.acktr import acktr
        # A second session with a default config is opened here, nested
        # inside the one entered at the top of this method.
        with tf.Session(config=tf.ConfigProto()):
            ob_dim = env.observation_space.shape[0]
            ac_dim = env.action_space.shape[0]
            with tf.variable_scope("vf"):
                vf = NeuralNetValueFunction(ob_dim, ac_dim)
            with tf.variable_scope("pi"):
                policy = GaussianMlpPolicy(ob_dim, ac_dim)
            acktr.learn(
                env, pi=policy, vf=vf,
                gamma=0.99, lam=0.97,
                timesteps_per_batch=1024,
                desired_kl=0.01,  # 0.002
                num_timesteps=num_timesteps,
                animate=False,
                callback=callback)
    elif self.method == "ddpg":
        from algos.ddpg import ddpg
        # Parse noise_type
        action_noise = None
        param_noise = None
        nb_actions = env.action_space.shape[-1]
        for current_noise_type in noise_type.split(','):
            current_noise_type = current_noise_type.strip()
            # Entries are matched by substring, in this order. Every entry
            # other than 'none' must contain exactly one underscore, which
            # separates the kind from the standard deviation.
            if current_noise_type == 'none':
                pass
            elif 'adaptive-param' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                from baselines.ddpg.noise import AdaptiveParamNoiseSpec
                param_noise = AdaptiveParamNoiseSpec(
                    initial_stddev=float(stddev),
                    desired_action_stddev=float(stddev))
            elif 'normal' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                from baselines.ddpg.noise import NormalActionNoise
                action_noise = NormalActionNoise(
                    mu=np.zeros(nb_actions),
                    sigma=float(stddev) * np.ones(nb_actions))
            elif 'ou' in current_noise_type:
                from baselines.ddpg.noise import OrnsteinUhlenbeckActionNoise
                _, stddev = current_noise_type.split('_')
                action_noise = OrnsteinUhlenbeckActionNoise(
                    mu=np.zeros(nb_actions),
                    sigma=float(stddev) * np.ones(nb_actions))
            else:
                raise RuntimeError(
                    'unknown noise type "{}"'.format(current_noise_type))

        # Configure components.
        memory = Memory(
            limit=int(1e6),
            action_shape=env.action_space.shape,
            observation_shape=env.observation_space.shape)
        critic = Critic(layer_norm=layer_norm)
        actor = Actor(nb_actions, layer_norm=layer_norm)

        ddpg.train(
            env=env, eval_env=None, param_noise=param_noise,
            render=False, render_eval=False,
            action_noise=action_noise, actor=actor, critic=critic, memory=memory,
            callback=callback, **kwargs)
    elif self.method == "sql":
        from softqlearning.algorithms import SQL
        from softqlearning.misc.kernel import adaptive_isotropic_gaussian_kernel
        from softqlearning.misc.utils import timestamp
        from softqlearning.replay_buffers import SimpleReplayBuffer
        from softqlearning.value_functions import NNQFunction
        from softqlearning.policies import StochasticNNPolicy
        from rllab.envs.gym_env import GymEnv

        # Wrap the monitored env for rllab; the callback defined above
        # keeps using the unwrapped ``env`` name it closed over, which now
        # refers to this wrapper.
        env = GymEnv(env)

        # Only some of these entries are read below (max_pool_size,
        # batch_size, n_train_repeat, qf_lr, policy_lr, discount).
        variant = {
            'seed': [1, 2, 3],
            'policy_lr': 3E-4,
            'qf_lr': 3E-4,
            'discount': 0.99,
            'layer_size': 128,
            'batch_size': 128,
            'max_pool_size': 1E6,
            'n_train_repeat': 1,
            'epoch_length': 1000,
            'snapshot_mode': 'last',
            'snapshot_gap': 100,
        }

        pool = SimpleReplayBuffer(
            env_spec=env.spec,
            max_replay_buffer_size=variant['max_pool_size'],
        )

        # One epoch corresponds to one episode of ``episode_length`` steps,
        # and ``num_timesteps`` is used as the number of epochs.
        base_kwargs = dict(
            min_pool_size=episode_length,
            epoch_length=episode_length,
            n_epochs=num_timesteps,
            max_path_length=episode_length,
            batch_size=variant['batch_size'],
            n_train_repeat=variant['n_train_repeat'],
            eval_render=False,
            eval_n_episodes=1,
            iter_callback=callback
        )

        qf = NNQFunction(
            env_spec=env.spec,
            hidden_layer_sizes=tuple([pi_hid_size] * pi_num_hid_layers),
        )

        pi_layers = tuple([pi_hid_size] * pi_num_hid_layers)
        policy = StochasticNNPolicy(env_spec=env.spec, hidden_layer_sizes=pi_layers)

        algorithm = SQL(
            base_kwargs=base_kwargs,
            env=env,
            pool=pool,
            qf=qf,
            policy=policy,
            kernel_fn=adaptive_isotropic_gaussian_kernel,
            kernel_n_particles=32,
            kernel_update_ratio=0.5,
            value_n_particles=16,
            td_target_update_interval=1000,
            qf_lr=variant['qf_lr'],
            policy_lr=variant['policy_lr'],
            discount=variant['discount'],
            reward_scale=1,
            save_full_state=False,
        )

        algorithm.train()
    else:
        # Unknown method: report it, but still fall through to env.close().
        print('ERROR: Invalid "method" argument provided.', file=sys.stderr)

    env.close()