def train(env_id, num_episodes, seed):
    # make_halide_env and the tf_util alias U are assumed to be defined at module scope.
    from baselines.ppo1 import mlp_policy, pposgd_simple
    U.make_session(num_cpu=1).__enter__()

    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                    hid_size=64, num_hid_layers=2)

    env = make_halide_env(env_id, seed)
    pposgd_simple.learn(
        env, policy_fn,
        max_episodes=num_episodes,
        timesteps_per_actorbatch=256,
        clip_param=0.2,
        entcoeff=0.03,
        optim_epochs=4,
        optim_stepsize=2.5e-3,
        optim_batchsize=64,
        gamma=0.99,
        lam=0.95,
        schedule='linear',
    )
    env.close()
def train(env_id, num_timesteps, timesteps_per_actor_batch, seed, entropy_coeff, filepath):
    # U (baselines.common.tf_util), tf, and make_mujoco_env are assumed to be
    # imported at module scope.
    from baselines.ppo1 import mlp_policy, pposgd_simple
    sess = U.make_session(num_cpu=1)
    sess.__enter__()

    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                    hid_size=64, num_hid_layers=2)

    env = make_mujoco_env(env_id, seed)
    pposgd_simple.learn(env, policy_fn,
                        max_timesteps=num_timesteps,
                        timesteps_per_actorbatch=timesteps_per_actor_batch,
                        clip_param=0.2, entcoeff=entropy_coeff,
                        optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64,
                        gamma=0.99, lam=0.95, schedule='linear')
    env.close()

    # Save the final policy.
    saver = tf.train.Saver()
    saver.save(sess, filepath + "_final")
def train_ppo1(env_id, num_timesteps, sfs, seed):
    # policy_fn, get_model_dir, and ModelSaverWrapper are assumed to be
    # defined at module scope.
    from baselines.ppo1 import pposgd_simple
    sess = U.make_session(num_cpu=4)
    sess.__enter__()
    set_global_seeds(seed)
    env = gym.make(env_id)
    model_dir = get_model_dir(env_id, 'ppo')

    # Monitor for TensorBoard logging.
    log_dir = osp.join(logger.get_dir(), 'log_ppo')
    logger.log("log_dir: %s" % log_dir)
    env = bench.Monitor(env, log_dir)
    env = ModelSaverWrapper(env, model_dir, sfs)
    env.seed(seed)
    # env.render()
    gym.logger.setLevel(logging.WARN)

    pposgd_simple.learn(
        env, policy_fn,
        max_timesteps=num_timesteps,
        timesteps_per_actorbatch=2048,
        clip_param=0.2,  # TODO 0.2
        entcoeff=0.0,
        optim_epochs=10,
        optim_stepsize=3e-4,
        optim_batchsize=64,
        gamma=0.99,
        lam=0.95,
        schedule='linear',  # TODO linear
    )
    env.close()
def train(env_id, num_frames, seed, max_ts, logdir):
    """Train an Atari PPO1 agent under MPI."""
    from baselines.ppo1 import pposgd_simple, cnn_policy
    import baselines.common.tf_util as U
    rank = MPI.COMM_WORLD.Get_rank()
    sess = U.single_threaded_session()
    sess.__enter__()
    logger.configure(osp.join(logdir, "%i.log.json" % rank))
    workerseed = seed + 10000 * rank
    set_global_seeds(workerseed)
    env = gym.make(env_id)

    def policy_fn(name, ob_space, ac_space):
        """Construct the CNN policy for the given observation/action spaces."""
        return cnn_policy.CnnPolicy(name=name, ob_space=ob_space, ac_space=ac_space)

    env = bench.Monitor(env, logger.get_dir() and
                        osp.join(logger.get_dir(), "%i.monitor.json" % rank))
    env.seed(workerseed)
    gym.logger.setLevel(logging.WARN)
    env = atari_env.wrap_train(env)
    # Convert the frame budget to timesteps (frameskip of 4), with a 10% margin.
    num_timesteps = max_ts or int(num_frames / 4 * 1.1)
    env.seed(workerseed)
    pposgd_simple.learn(env, policy_fn,
                        max_timesteps=num_timesteps,
                        timesteps_per_batch=256,
                        clip_param=0.2, entcoeff=0.01,
                        optim_epochs=4, optim_stepsize=1e-3, optim_batchsize=64,
                        gamma=0.99, lam=0.95, schedule='linear')
    env.close()
def train(env_id, num_timesteps, seed):
    from baselines.ppo1 import pposgd_simple, cnn_policy
    sess = U.make_session(num_cpu=1)
    sess.__enter__()
    logger.configure()
    set_global_seeds(seed)
    env = gym.make(env_id)

    def policy_fn(name, ob_space, ac_space):
        return cnn_policy.CnnPolicy(name=name, ob_space=ob_space, ac_space=ac_space)

    env = bench.Monitor(env, osp.join(logger.get_dir(), "monitor.json"))
    env.seed(seed)
    gym.logger.setLevel(logging.WARN)
    pposgd_simple.learn(env, policy_fn,
                        max_timesteps=int(num_timesteps * 1.1),
                        timesteps_per_actorbatch=256,
                        clip_param=0.2, entcoeff=0.01,
                        optim_epochs=4, optim_stepsize=1e-3, optim_batchsize=64,
                        gamma=1, lam=0.95, schedule='linear')
    env.close()
    saver = tf.train.Saver()
    saver.save(sess, '/tmp/model')
def train(env_id, num_timesteps, seed, save_interval, output_prefix):
    from baselines.ppo1 import mlp_policy, pposgd_simple
    sess = U.make_session(num_cpu=1)
    sess.__enter__()

    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                    hid_size=128, num_hid_layers=2)

    def callback_fn(local_vars, global_vars):
        iters = local_vars["iters_so_far"]
        if iters % save_interval == 0:
            # Build the Saver only when actually saving, so new graph ops are
            # not added on every iteration.
            saver = tf.train.Saver()
            saver.save(sess, output_prefix + str(iters))

    env = make_dart_env(env_id, seed)
    pposgd_simple.learn(env, policy_fn,
                        max_timesteps=num_timesteps,
                        timesteps_per_actorbatch=2048,
                        clip_param=0.2, entcoeff=0.0,
                        optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64,
                        gamma=0.99, lam=0.95, schedule='linear',
                        callback=callback_fn)
    env.close()
def train(env_id, num_timesteps, seed):
    from baselines.ppo1 import mlp_policy, pposgd_simple
    sess = U.make_session(num_cpu=1).__enter__()
    set_global_seeds(seed)
    # NOTE: env_id is ignored; the environment is hard-coded.
    env = gym.make("FenceEscape-v0")

    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                    hid_size=64, num_hid_layers=2)

    env = bench.Monitor(env, logger.get_dir())
    env.seed(seed)
    gym.logger.setLevel(logging.WARN)
    pposgd_simple.learn(
        env, policy_fn,
        max_timesteps=num_timesteps,
        timesteps_per_actorbatch=2048,
        clip_param=0.2,
        entcoeff=0.0,
        optim_epochs=10,
        optim_stepsize=3e-4,
        optim_batchsize=64,
        gamma=0.99,
        lam=0.95,
        schedule='linear',
    )
    env.close()
def train(env, num_timesteps, seed):
    # mlp_policy is required below, so import it alongside pposgd_simple.
    from baselines.ppo1 import mlp_policy, pposgd_simple
    # from baselines.ppo1 import cnn_policy
    sess = U.make_session(num_cpu=1)
    sess.__enter__()
    set_global_seeds(seed)

    def policy_fn(name, ob_space, ac_space):
        # return cnn_policy.CnnPolicy(name=name, ob_space=ob_space, ac_space=ac_space)
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                    hid_size=64, num_hid_layers=2)

    env.seed(seed)
    pposgd_simple.learn(env, policy_fn,
                        max_timesteps=num_timesteps,
                        timesteps_per_batch=20,
                        clip_param=0.2, entcoeff=0.0,
                        optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=9,
                        gamma=0.99, lam=0.95, schedule='linear',
                        sess=sess, old_model=False)
def train(env_id, num_timesteps, seed, save_model, load_model, model_dir,
          timesteps_per_actorbatch, clip_param, ent_coeff, epochs, learning_rate,
          batch_size, gamma, lambd, exploration_rate, filename):
    from baselines.ppo1 import kick_policy, pposgd_simple, reward_scaler
    rank = MPI.COMM_WORLD.Get_rank()
    U.make_session(num_cpu=1).__enter__()
    workerseed = seed + 10000 * rank
    set_global_seeds(workerseed)
    env = SoccerEnv(rank)

    def policy_fn(name, ob_space, ac_space):
        return kick_policy.KickPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                      hid_size=64, num_hid_layers=2,
                                      exploration_rate=exploration_rate)

    env = bench.Monitor(env, logger.get_dir())
    env.seed(workerseed)
    gym.logger.setLevel(logging.WARN)
    rw_scaler = reward_scaler.RewardScaler("rw_scaler")
    pposgd_simple.learn(env, policy_fn,
                        max_timesteps=num_timesteps,
                        timesteps_per_actorbatch=timesteps_per_actorbatch,
                        clip_param=clip_param, entcoeff=ent_coeff,
                        optim_epochs=epochs, optim_stepsize=learning_rate,
                        optim_batchsize=batch_size,
                        gamma=gamma, lam=lambd, schedule='linear',
                        save_model=save_model, load_model=load_model,
                        model_dir=model_dir, rw_scaler=rw_scaler,
                        filename=filename)
    env.close()
def train(env_id, num_timesteps, seed, beta, theta, decay):
    from baselines.ppo1 import pposgd_simple, cnn_policy
    import baselines.common.tf_util as U
    rank = MPI.COMM_WORLD.Get_rank()
    sess = U.single_threaded_session()
    sess.__enter__()
    if rank == 0:
        logger.configure()
    else:
        logger.configure(format_strs=[])
    workerseed = seed + 10000 * rank if seed is not None else None
    set_global_seeds(workerseed)
    print(env_id)
    env = make_atari(env_id)

    def policy_fn(name, ob_space, ac_space):  # pylint: disable=W0613
        return cnn_policy.CnnPolicy(name=name, ob_space=ob_space, ac_space=ac_space)

    env = bench.Monitor(env, logger.get_dir() and osp.join(logger.get_dir(), str(rank)))
    env.seed(workerseed)
    env = wrap_deepmind(env)
    env.seed(workerseed)
    pposgd_simple.learn(env, policy_fn,
                        max_timesteps=int(num_timesteps * 1.1),
                        timesteps_per_actorbatch=256,
                        clip_param=0.2, entcoeff=0.01,
                        optim_epochs=4, optim_stepsize=1e-3, optim_batchsize=64,
                        gamma=0.99, lam=0.95, schedule='linear',
                        beta=beta, theta=theta, decay=decay)
    env.close()
def main():
    # Create an asynchronous simulation of the InvertedDoublePendulum-v2
    # MuJoCo environment.
    env = DoubleInvertedPendulumEnv(
        agent_dt=0.005,
        sensor_dt=[0.01, 0.0033333],
    )
    # Start the environment processes.
    env.start()

    # Create the baselines PPO policy function.
    sess = U.single_threaded_session()
    sess.__enter__()

    def policy_fn(name, ob_space, ac_space):
        return MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                         hid_size=64, num_hid_layers=2)

    # Create and start the plotting process.
    plot_running = Value('i', 1)
    shared_returns = Manager().dict({
        "write_lock": False,
        "episodic_returns": [],
        "episodic_lengths": [],
    })
    pp = Process(target=plot_returns, args=(env, 2048, shared_returns, plot_running))
    pp.start()

    # Callback for logging data from baselines PPO learn.
    kindred_callback = create_callback(shared_returns)

    # Train with baselines PPO.
    learn(
        env, policy_fn,
        max_timesteps=1e6,
        timesteps_per_actorbatch=2048,
        clip_param=0.2,
        entcoeff=0.0,
        optim_epochs=10,
        optim_stepsize=0.0001,
        optim_batchsize=64,
        gamma=0.995,
        lam=0.995,
        schedule="linear",
        callback=kindred_callback,
    )

    # Safely terminate the plotting process.
    plot_running.value = 0
    time.sleep(2)
    pp.join()

    # Shut down the environment.
    env.close()
def train(env_id, num_timesteps, history_len, seed, render):
    # PARTICLES, VAR_REDUCTION, DIRECTORY, and policy_fn are assumed to be
    # defined at module scope.
    U.make_session(num_cpu=1).__enter__()

    # Make sure the seed is different in each MPI worker.
    rank = MPI.COMM_WORLD.Get_rank()
    workerseed = seed + 10000 * rank
    set_global_seeds(workerseed)

    train_env = LearningEnvironment(num_particles=PARTICLES, disable_render=not render)
    train_env = StackedEnvWrapper(train_env, state_history_len=history_len)
    eval_env = LearningEnvironment(num_particles=PARTICLES, disable_render=not render)
    eval_env = StackedEnvWrapper(eval_env, state_history_len=history_len)
    eval_env = bench.Monitor(eval_env, os.path.join(logger.get_dir(), "monitor.json"))

    pposgd_simple.learn(train_env, eval_env, policy_fn,
                        directory=DIRECTORY.format(history_len),
                        max_timesteps=num_timesteps,
                        timesteps_per_batch=1024 * VAR_REDUCTION,
                        clip_param=0.2, entcoeff=0.0001,
                        optim_epochs=10, optim_stepsize=2e-4, optim_batchsize=64,
                        gamma=0.99, lam=0.95, schedule='linear',
                        render=render)
    train_env.close()
    eval_env.close()
def train(env_id, seed):
    # This function also reads the module-level `args` namespace
    # (num_timesteps, entcoeff, checkpoint_dir, ...).
    from baselines.ppo1 import pposgd_simple, cnn_policy
    import baselines.common.tf_util as U
    rank = MPI.COMM_WORLD.Get_rank()
    sess = U.single_threaded_session()
    sess.__enter__()
    if rank != 0:
        logger.set_level(logger.DISABLED)
    workerseed = seed + 10000 * rank
    set_global_seeds(workerseed)
    env = gym.make(env_id)

    def policy_fn(name, ob_space, ac_space):  # pylint: disable=W0613
        return cnn_policy.CnnPolicy(name=name, ob_space=ob_space, ac_space=ac_space)

    env = bench.Monitor(env, logger.get_dir() and
                        osp.join(logger.get_dir(), "%i.monitor.json" % rank))
    env.seed(workerseed)
    gym.logger.setLevel(logging.WARN)
    env = wrap_train(env)
    env.seed(workerseed)

    task_name = "ppo." + args.env.split("-")[0] + "." + ("%.2f" % args.entcoeff)
    args.checkpoint_dir = osp.join(args.checkpoint_dir, task_name)
    pposgd_simple.learn(env, policy_fn,
                        max_timesteps=args.num_timesteps,
                        timesteps_per_batch=256,
                        clip_param=0.2, entcoeff=args.entcoeff,
                        optim_epochs=4, optim_stepsize=1e-3, optim_batchsize=64,
                        gamma=0.99, lam=0.95,
                        sample_stochastic=args.sample_stochastic,
                        task_name=task_name,
                        save_per_iter=args.save_per_iter,
                        ckpt_dir=args.checkpoint_dir,
                        load_model_path=args.load_model_path,
                        task=args.task)
    env.close()
def train(env_id, num_timesteps, seed): """ Train PPO1 model for the Mujoco environment, for testing purposes :param env_id: (str) Environment ID :param num_timesteps: (int) The total number of samples :param seed: (int) The initial seed for training """ def policy_fn(name, ob_space, ac_space, sess=None, placeholders=None): return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space, hid_size=64, num_hid_layers=2, sess=sess, placeholders=placeholders) env = make_mujoco_env(env_id, seed) pposgd_simple.learn(env, policy_fn, max_timesteps=num_timesteps, timesteps_per_actorbatch=2048, clip_param=0.2, entcoeff=0.0, optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64, gamma=0.99, lam=0.95, schedule='linear') env.close()
def train(env_id, num_timesteps, seed):
    from baselines.ppo1 import mlp_policy, pposgd_simple
    U.make_session(num_cpu=1).__enter__()

    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                    hid_size=64, num_hid_layers=2)

    # NOTE: env_id is ignored; the environment is hard-coded.
    # env = make_mujoco_env(env_id, seed)
    env = gym.make("CartPole-v0")
    pposgd_simple.learn(
        env, policy_fn,
        max_timesteps=num_timesteps,
        timesteps_per_actorbatch=2048,
        clip_param=0.2,
        entcoeff=0.0,
        optim_epochs=10,
        optim_stepsize=3e-4,
        optim_batchsize=64,
        gamma=0.99,
        lam=0.95,
        schedule='linear',
    )
    env.close()
def train(env_id, num_timesteps, seed, tb_dir=None):
    from baselines.ppo1 import mlp_policy, pposgd_simple
    U.make_session(num_cpu=1).__enter__()
    set_global_seeds(seed)
    # NOTE: env_id is ignored; the gridworld environment is hard-coded.
    env = ContinuousGridworld('gridworld', visualize=False)

    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                    hid_size=64, num_hid_layers=2)

    env = bench.Monitor(env, logger.get_dir() and
                        osp.join(logger.get_dir(), "monitor.json"))
    env.seed(seed)
    gym.logger.setLevel(logging.WARN)
    pposgd_simple.learn(env, policy_fn,
                        max_timesteps=num_timesteps,
                        timesteps_per_batch=256,
                        clip_param=0.2, entcoeff=0.0,
                        optim_epochs=4, optim_stepsize=1e-3, optim_batchsize=64,
                        gamma=0.99, lam=0.95, schedule='linear',
                        tb_dir=tb_dir)
    env.close()
def train(exp_name, env_id, max_iters, save_step, seed):
    from baselines.ppo1 import mlp_policy, pposgd_simple
    sess = U.make_session(num_cpu=1)
    sess.__enter__()
    # logger.session().__enter__()
    set_global_seeds(seed)
    env = gym.make(env_id)

    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                    hid_size=64, num_hid_layers=2)

    # env = bench.Monitor(env, osp.join(logger.get_dir(), "monitor.json"))
    env.seed(seed)
    gym.logger.setLevel(logging.WARN)

    # Create the experiment directory next to this script.
    scriptpath = os.path.dirname(os.path.abspath(__file__))
    directory = os.path.join(scriptpath, exp_name)
    if not os.path.exists(directory):
        os.makedirs(directory)
    filepath = os.path.join(directory, "")  # directory path with a trailing separator

    pposgd_simple.learn(env, policy_fn,
                        max_iters=max_iters,
                        filepath=filepath,
                        save_step=save_step,
                        timesteps_per_actorbatch=4000,
                        clip_param=0.2, entcoeff=0.0,
                        optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64,
                        gamma=0.99, lam=0.95)
    env.close()
def train(num_timesteps, seed):
    from baselines.ppo1 import mlp_policy, pposgd_simple
    U.make_session(num_cpu=1).__enter__()

    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                    hid_size=128, num_hid_layers=2)

    env = TwoDofArmEnv(ActiveMuscles='antagonistic',
                       actionParameterization=True,
                       sim_length=0.005,
                       traj_track=True,
                       exo=True,
                       exo_gain=70.,
                       delay=0.020)
    pposgd_simple.learn(
        env, policy_fn,
        max_timesteps=num_timesteps,
        timesteps_per_actorbatch=1048,
        clip_param=0.2,
        entcoeff=0.0,
        optim_epochs=10,
        optim_stepsize=3e-4,
        optim_batchsize=64,
        gamma=0.99,
        lam=0.95,
        schedule='linear',
    )
    env.close()
def train(env_id, num_frames, seed):
    from baselines.ppo1 import pposgd_simple, cnn_policy
    import baselines.common.tf_util as U
    rank = MPI.COMM_WORLD.Get_rank()
    sess = U.single_threaded_session()
    sess.__enter__()
    if rank != 0:
        logger.set_level(logger.DISABLED)
    workerseed = seed + 10000 * rank
    set_global_seeds(workerseed)
    env = gym.make(env_id)
    env.seed(seed)

    def policy_fn(name, ob_space, ac_space):  # pylint: disable=W0613
        return cnn_policy.CnnPolicy(name=name, ob_space=ob_space, ac_space=ac_space)

    env = bench.Monitor(env, osp.join(logger.get_dir(), "%i.monitor.json" % rank))
    env.seed(workerseed)
    gym.logger.setLevel(logging.WARN)
    env = wrap_train(env)
    # Convert the frame budget to timesteps (frameskip of 4), with a 10% margin.
    num_timesteps = int(num_frames / 4 * 1.1)
    env.seed(workerseed)
    pposgd_simple.learn(env, policy_fn,
                        max_timesteps=num_timesteps,
                        timesteps_per_batch=256,
                        clip_param=0.2, entcoeff=0.01,
                        optim_epochs=4, optim_stepsize=1e-3, optim_batchsize=64,
                        gamma=0.99, lam=0.95, schedule='linear')
    env.close()
def train(env, num_timesteps, seed, ckpt_dir=None, render=False, ckpt_freq=0,
          restore_dir=None, optim_stepsize=3e-4, schedule="linear", gamma=0.99,
          optim_epochs=10, optim_batchsize=64, horizon=2048):
    from baselines.common.fc_learning_utils import FlightLog
    from mpi4py import MPI
    from baselines import logger
    from baselines.ppo1.mlp_policy import MlpPolicy
    from baselines.common import set_global_seeds
    from baselines.ppo1 import pposgd_simple
    import baselines.common.tf_util as U

    sess = U.single_threaded_session()
    sess.__enter__()
    rank = MPI.COMM_WORLD.Get_rank()
    if rank == 0:
        logger.configure()
    else:
        logger.configure(format_strs=[])
        logger.set_level(logger.DISABLED)
    workerseed = seed + 1000000 * rank

    def policy_fn(name, ob_space, ac_space):
        return MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                         hid_size=32, num_hid_layers=2)

    if render:
        env.render()
    env.seed(workerseed)
    set_global_seeds(workerseed)
    pposgd_simple.learn(env, policy_fn,
                        max_timesteps=num_timesteps,
                        timesteps_per_actorbatch=horizon,
                        clip_param=0.2, entcoeff=0.0,
                        optim_epochs=optim_epochs,
                        optim_stepsize=optim_stepsize,
                        optim_batchsize=optim_batchsize,
                        gamma=gamma,  # pass the gamma argument through, not a hard-coded 0.99
                        lam=0.95,
                        schedule=schedule,
                        flight_log=None,
                        ckpt_dir=ckpt_dir,
                        restore_dir=restore_dir,
                        save_timestep_period=ckpt_freq)
    env.close()
def train(env_id, num_timesteps, seed, save_model, load_model, model_dir):
    from baselines.ppo1 import mlp_policy, pposgd_simple
    rank = MPI.COMM_WORLD.Get_rank()
    U.make_session(num_cpu=1).__enter__()
    set_global_seeds(seed)
    env = SoccerEnv(rank)

    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                    hid_size=64, num_hid_layers=2)

    env = bench.Monitor(env, logger.get_dir())
    env.seed(seed)
    gym.logger.setLevel(logging.WARN)
    pposgd_simple.learn(env, policy_fn,
                        max_timesteps=num_timesteps,
                        timesteps_per_actorbatch=2048,
                        clip_param=0.2, entcoeff=0.0,
                        optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64,
                        gamma=0.99, lam=0.95, schedule='linear',
                        save_model=save_model, load_model=load_model,
                        model_dir=model_dir)
    env.close()
def train(env_id, backend, num_timesteps, seed, stdev=0., collision_detector='bullet'):
    from baselines.ppo1 import mlp_policy, pposgd_simple
    U.make_session(num_cpu=1).__enter__()

    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                    hid_size=64, num_hid_layers=2)

    env_dist = dr.dist.Normal(env_id, backend, stdev=stdev)
    env_dist.seed(seed)
    set_global_seeds(seed)
    pposgd_simple.learn(
        env_dist, collision_detector, policy_fn,
        max_timesteps=num_timesteps,
        timesteps_per_actorbatch=2048,
        clip_param=0.2,
        entcoeff=0.0,
        optim_epochs=10,
        optim_stepsize=3e-4,
        optim_batchsize=64,
        gamma=0.99,
        lam=0.95,
        schedule='linear',
    )
def train(env_id, num_timesteps, seed):
    from baselines.ppo1 import mlp_policy, pposgd_simple
    import baselines.common.tf_util as U
    rank = MPI.COMM_WORLD.Get_rank()
    sess = U.single_threaded_session()
    sess.__enter__()
    mujoco_py.ignore_mujoco_warnings().__enter__()
    workerseed = seed + 10000 * rank
    set_global_seeds(workerseed)
    env = make_robotics_env(env_id, workerseed, rank=rank)

    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                    hid_size=256, num_hid_layers=3)

    pposgd_simple.learn(
        env, policy_fn,
        max_timesteps=num_timesteps,
        timesteps_per_actorbatch=2048,
        clip_param=0.2,
        entcoeff=0.0,
        optim_epochs=5,
        optim_stepsize=3e-4,
        optim_batchsize=256,
        gamma=0.99,
        lam=0.95,
        schedule='linear',
    )
    env.close()
def train(env_id, num_frames, seed):
    from baselines.ppo1 import pposgd_simple, cnn_policy
    import baselines.common.tf_util as U
    rank = MPI.COMM_WORLD.Get_rank()
    sess = U.single_threaded_session()
    sess.__enter__()
    if rank != 0:
        logger.set_level(logger.DISABLED)
    workerseed = seed + 10000 * rank
    set_global_seeds(workerseed)
    env = gym.make(env_id)

    def policy_fn(name, ob_space, ac_space):  # pylint: disable=W0613
        return cnn_policy.CnnPolicy(name=name, ob_space=ob_space, ac_space=ac_space)

    env = bench.Monitor(env, logger.get_dir() and
                        osp.join(logger.get_dir(), "%i.monitor.json" % rank))
    env.seed(workerseed)
    gym.logger.setLevel(logging.WARN)
    env = wrap_train(env)
    num_timesteps = int(num_frames / 4 * 1.1)
    env.seed(workerseed)
    pposgd_simple.learn(env, policy_fn,
                        max_timesteps=num_timesteps,
                        timesteps_per_batch=256,
                        clip_param=0.2, entcoeff=0.01,
                        optim_epochs=4, optim_stepsize=1e-3, optim_batchsize=64,
                        gamma=0.99, lam=0.95, schedule='linear')
    env.close()
def train(env_id, num_timesteps, seed):
    # `callback` is assumed to be defined at module scope.
    from baselines.ppo1 import mlp_policy, pposgd_simple
    U.make_session(num_cpu=1).__enter__()
    set_global_seeds(seed)
    env = gym.make(env_id)

    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                    hid_size=64, num_hid_layers=3, gmm_comp=1)

    env = bench.Monitor(env, logger.get_dir() and
                        osp.join(logger.get_dir(), "monitor.json"))
    env.seed(seed)
    gym.logger.setLevel(logging.WARN)
    pposgd_simple.learn(env, policy_fn,
                        max_timesteps=num_timesteps,
                        timesteps_per_batch=5000,
                        clip_param=0.2, entcoeff=0.0,
                        optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64,
                        gamma=0.99, lam=0.95, schedule='linear',
                        callback=callback)
    env.close()
def train(env_id, num_timesteps, seed): """ Train PPO1 model for Robotics environment, for testing purposes :param env_id: (str) Environment ID :param num_timesteps: (int) The total number of samples :param seed: (int) The initial seed for training """ rank = MPI.COMM_WORLD.Get_rank() with mujoco_py.ignore_mujoco_warnings(): workerseed = seed + 10000 * rank set_global_seeds(workerseed) env = make_robotics_env(env_id, workerseed, rank=rank) def policy_fn(name, ob_space, ac_space, sess=None, placeholders=None): return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space, hid_size=256, num_hid_layers=3, sess=sess, placeholders=placeholders) pposgd_simple.learn(env, policy_fn, max_timesteps=num_timesteps, timesteps_per_actorbatch=2048, clip_param=0.2, entcoeff=0.0, optim_epochs=5, optim_stepsize=3e-4, optim_batchsize=256, gamma=0.99, lam=0.95, schedule='linear') env.close()
def train(env_id, num_timesteps, seed):
    from baselines.ppo1 import pposgd_simple, cnn_policy
    import baselines.common.tf_util as U
    rank = MPI.COMM_WORLD.Get_rank()
    sess = U.single_threaded_session()
    sess.__enter__()
    if rank == 0:
        logger.configure()
    else:
        logger.configure(format_strs=[])
    workerseed = seed + 10000 * rank if seed is not None else None
    set_global_seeds(workerseed)
    env = make_atari(env_id)

    def policy_fn(name, ob_space, ac_space):  # pylint: disable=W0613
        return cnn_policy.CnnPolicy(name=name, ob_space=ob_space, ac_space=ac_space)

    env = bench.Monitor(env, logger.get_dir() and osp.join(logger.get_dir(), str(rank)))
    env.seed(workerseed)
    env = wrap_deepmind(env)
    env.seed(workerseed)
    pposgd_simple.learn(env, policy_fn,
                        max_timesteps=int(num_timesteps * 1.1),
                        timesteps_per_actorbatch=256,
                        clip_param=0.2, entcoeff=0.01,
                        optim_epochs=4, optim_stepsize=1e-3, optim_batchsize=64,
                        gamma=0.99, lam=0.95, schedule='linear')
    env.close()
def train(env_id, num_timesteps, seed):
    from baselines.ppo1 import mlp_policy, pposgd_simple
    U.make_session(num_cpu=1).__enter__()
    set_global_seeds(seed)
    from gym.envs.registration import register

    # Custom pendulum environment
    if env_id == 'Pendulumnf-v0':
        register(
            id='Pendulumnf-v0',
            entry_point='nfunk.envs_nf.pendulum_nf:PendulumEnv',
            max_episode_steps=400,
            # kwargs=vars(args),
        )
        env = gym.make('Pendulumnf-v0')
    # Custom scalar environment
    elif env_id == 'Scalarnf-v0':
        register(
            id='Scalarnf-v0',
            entry_point='nfunk.envs_nf.gym_scalar_nf:GymScalarEnv',
            max_episode_steps=400,
            # kwargs=vars(args),
        )
        env = gym.make('Scalarnf-v0')
    # Custom CartPole environment (continuous actions)
    elif env_id == 'CartPole-v9':
        register(
            id='CartPole-v9',
            entry_point='nfunk.envs_nf.cartpole:CartPoleEnv',
            max_episode_steps=200,
            # kwargs=vars(args),
        )
        env = gym.make('CartPole-v9')
    else:
        env = gym.make(env_id)

    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                    hid_size=64, num_hid_layers=2)

    env = bench.Monitor(env, logger.get_dir())
    env.seed(seed)
    gym.logger.setLevel(logging.WARN)
    pposgd_simple.learn(env, policy_fn,
                        max_timesteps=num_timesteps,
                        timesteps_per_actorbatch=2048,
                        clip_param=0.2, entcoeff=0.0,
                        optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64,
                        gamma=0.99, lam=0.95, schedule='linear',
                        seed=seed)
    env.close()
def train(env_id, num_timesteps, seed, num_options, app, saves, wsaves, epoch, dc):
    from baselines.ppo1 import mlp_policy, pposgd_simple
    U.make_session(num_cpu=1).__enter__()
    set_global_seeds(seed)
    from gym.envs.registration import register

    # Custom pendulum environment
    if env_id == 'Pendulumnf-v0':
        register(
            id='Pendulumnf-v0',
            entry_point='nfunk.envs_nf.pendulum_nf:PendulumEnv',
            max_episode_steps=400,
            # kwargs=vars(args),
        )
        env = gym.make('Pendulumnf-v0')
    # Custom scalar environment
    elif env_id == 'Scalarnf-v0':
        register(
            id='Scalarnf-v0',
            entry_point='nfunk.envs_nf.gym_scalar_nf:GymScalarEnv',
            max_episode_steps=400,
            # kwargs=vars(args),
        )
        env = gym.make('Scalarnf-v0')
    # Custom CartPole environment (continuous actions)
    elif env_id == 'CartPole-v9':
        register(
            id='CartPole-v9',
            entry_point='nfunk.envs_nf.cartpole:CartPoleEnv',
            max_episode_steps=200,
            # kwargs=vars(args),
        )
        env = gym.make('CartPole-v9')
    else:
        env = gym.make(env_id)

    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                    hid_size=64, num_hid_layers=2,
                                    num_options=num_options, dc=dc)

    env = bench.Monitor(env, logger.get_dir() and
                        osp.join(logger.get_dir(), "monitor.json"))
    env.seed(seed)
    gym.logger.setLevel(logging.WARN)

    if num_options == 1:
        optimsize = 64
    elif num_options == 2:
        optimsize = 32
    else:
        print("Only primitive actions (one option) or two options are currently supported.")
        sys.exit()

    pposgd_simple.learn(env, policy_fn,
                        max_timesteps=num_timesteps,
                        timesteps_per_batch=2048,
                        clip_param=0.2, entcoeff=0.0,
                        optim_epochs=10, optim_stepsize=3e-5, optim_batchsize=optimsize,
                        gamma=0.99, lam=0.95, schedule='constant',
                        num_options=num_options,
                        app=app, saves=saves, wsaves=wsaves,
                        epoch=epoch, seed=seed, dc=dc)
    env.close()
def train(env_id, num_timesteps, seed):
    from baselines.ppo1 import pposgd_simple, cnn_policy
    import baselines.common.tf_util as U
    rank = MPI.COMM_WORLD.Get_rank()
    sess = U.single_threaded_session()
    sess.__enter__()
    if rank == 0:
        logger.configure()
    else:
        logger.configure(format_strs=[])
    workerseed = seed + 10000 * rank
    set_global_seeds(workerseed)

    # NOTE: env_id is ignored; the Super Mario Bros environment is hard-coded.
    # env = make_atari(env_id)
    env = gym_super_mario_bros.make('SuperMarioBros-v1')
    # env = gym_super_mario_bros.make('SuperMarioBrosNoFrameskip-v3')
    env = BinarySpaceToDiscreteSpaceEnv(env, SIMPLE_MOVEMENT)
    env = ProcessFrame84(env)
    env = FrameMemoryWrapper(env)

    def policy_fn(name, ob_space, ac_space):  # pylint: disable=W0613
        return cnn_policy.CnnPolicy(name=name, ob_space=ob_space, ac_space=ac_space)

    env = bench.Monitor(env, logger.get_dir() and osp.join(logger.get_dir(), str(rank)))
    env.seed(workerseed)
    # env = wrap_deepmind(env)
    env.seed(workerseed)

    def render_callback(lcl, _glb):
        # print(lcl['episode_rewards'])
        total_steps = lcl['env'].total_steps
        # if total_steps % 1000 == 0:
        #     print("Saving model to mario_model.pkl")
        #     act.save("../models/mario_model_{}.pkl".format(modelname))
        env.render()

    pposgd_simple.learn(env, policy_fn,
                        max_timesteps=int(num_timesteps * 1.1),
                        timesteps_per_actorbatch=2048,
                        clip_param=0.2, entcoeff=0.01,
                        optim_epochs=4,
                        optim_stepsize=1e-3,  # 3e-4
                        optim_batchsize=64,  # 256
                        gamma=0.99, lam=0.95, schedule='linear',
                        callback=render_callback)
    env.close()
def train(num_timesteps, seed, save_model_with_prefix, restore_model_from_file,
          save_after, load_after_iters, viz=False, stochastic=True):
    from baselines.ppo1 import pposgd_simple
    import baselines.common.tf_util as U
    rank = MPI.COMM_WORLD.Get_rank()
    sess = U.single_threaded_session()
    sess.__enter__()
    if rank == 0:
        logger.configure()
    else:
        logger.configure(format_strs=[])
    workerseed = seed + 10000 * rank
    set_global_seeds(workerseed)

    g = tf.get_default_graph()
    with g.as_default():
        tf.set_random_seed(workerseed)

    env = ProstheticsEnv_R2_multiclip(visualize=viz)
    # Derive a save prefix from the environment's repr.
    env_string = str(env).split('<')[1]

    def policy_fn(name, ob_space, ac_space):
        return MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                         hid_size=312, num_hid_layers=2)

    env.seed(workerseed)
    pposgd_simple.learn(env, workerseed, policy_fn,
                        max_timesteps=int(num_timesteps * 1.1),
                        timesteps_per_actorbatch=1536,
                        clip_param=0.2, entcoeff=0.01,
                        optim_epochs=4, optim_stepsize=1e-3, optim_batchsize=512,
                        gamma=0.999, lam=0.9, schedule='linear',
                        save_model_with_prefix=save_model_with_prefix,
                        save_prefix=env_string,
                        restore_model_from_file=restore_model_from_file,
                        load_after_iters=load_after_iters,
                        save_after=save_after,
                        stochastic=stochastic)
    env.close()
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('gym_id')
    parser.add_argument('-l', '--load')
    parser.add_argument('-e', '--episodes-per-batch', type=int, default=2500)
    args = parser.parse_args()

    rank = MPI.COMM_WORLD.Get_rank()
    size = MPI.COMM_WORLD.Get_size()
    U.single_threaded_session().__enter__()

    log_dir = 'logs/{}_{}'.format(
        args.gym_id, datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S'))
    logger.configure(dir=log_dir, format_strs=None if rank == 0 else [])
    env = bench.Monitor(gym.make(args.gym_id),
                        os.path.join(logger.get_dir(), str(rank)))

    # Episodes are split across MPI workers; the batch size assumes
    # 200-timestep episodes.
    episodes_per_actorbatch = args.episodes_per_batch // size
    timesteps_per_actorbatch = episodes_per_actorbatch * 200

    callbacks = []
    if rank == 0:
        video_episodes_monitor_callback = VideoEpisodesMonitorCallback()
        callbacks.append(video_episodes_monitor_callback)
        monitor_path = os.path.join(log_dir, 'monitor')
        env = VideoMonitor(
            env, monitor_path,
            video_callable=video_episodes_monitor_callback.should_monitor)
    callbacks += [
        ReloadCallback(model_path=args.load),
        # HardEnvCallback(env=env, switch_iterations=10000, linear_schedule=True),
        # VersusCallback(env=env, start_iterations=20, threshold_iterations=20,
        #                default_ai_weight=2, latest_models_proportion=0.5,
        #                load_first_model=False),
    ]
    if rank == 0:
        callbacks += [
            SaveCallback(log_dir=log_dir),
        ]

    pposgd_simple.learn(
        env, env.unwrapped.policy_class,
        max_iters=1000000,
        timesteps_per_actorbatch=timesteps_per_actorbatch,
        clip_param=0.2,
        entcoeff=0.0,
        optim_epochs=6,
        optim_stepsize=1e-3,
        optim_batchsize=4096,
        gamma=0.995,
        lam=0.95,
        schedule='constant',
        callback=lambda lv, gv: [cb(lv, gv) for cb in callbacks],
    )
    env.close()
def train(env_id, num_timesteps, seed):
    from baselines.ppo1 import mlp_policy, pposgd_simple
    U.make_session(num_cpu=1).__enter__()

    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                    hid_size=64, num_hid_layers=2)

    env = make_mujoco_env(env_id, seed)
    pposgd_simple.learn(env, policy_fn,
                        max_timesteps=num_timesteps,
                        timesteps_per_actorbatch=2048,
                        clip_param=0.2, entcoeff=0.0,
                        optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64,
                        gamma=0.99, lam=0.95, schedule='linear')
    env.close()
def train(num_timesteps, seed, model_path=None):
    env_id = 'Humanoid-v2'
    from baselines.ppo1 import mlp_policy, pposgd_simple
    U.make_session(num_cpu=1).__enter__()

    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                    hid_size=64, num_hid_layers=2)

    env = make_mujoco_env(env_id, seed)

    # The parameters below were the best found in a simple random search;
    # they are good enough to make the humanoid walk, but whether they are
    # an absolute best is not certain.
    env = RewScale(env, 0.1)
    pi = pposgd_simple.learn(env, policy_fn,
                             max_timesteps=num_timesteps,
                             timesteps_per_actorbatch=2048,
                             clip_param=0.2, entcoeff=0.0,
                             optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64,
                             gamma=0.99, lam=0.95, schedule='linear')
    env.close()
    if model_path:
        U.save_state(model_path)
    return pi
def train(env_id, num_timesteps, seed):
    from baselines.ppo1 import mlp_policy, pposgd_simple
    U.make_session(num_cpu=1).__enter__()
    set_global_seeds(seed)
    env = gym.make(env_id)

    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                    hid_size=64, num_hid_layers=2)

    env = bench.Monitor(env, logger.get_dir() and
                        osp.join(logger.get_dir(), "monitor.json"))
    env.seed(seed)
    gym.logger.setLevel(logging.WARN)
    pposgd_simple.learn(env, policy_fn,
                        max_timesteps=num_timesteps,
                        timesteps_per_batch=2048,
                        clip_param=0.2, entcoeff=0.0,
                        optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64,
                        gamma=0.99, lam=0.95, schedule='linear')
    env.close()
import gym
from baselines.ppo1 import mlp_policy, pposgd_simple
import tensorflow as tf

env = gym.make("MountainCarContinuous-v0")

g = tf.Graph()
with g.as_default():
    # tf.reset_default_graph()

    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                    hid_size=64, num_hid_layers=2)

    pposgd_simple.learn(env, policy_fn,
                        max_timesteps=10000,
                        timesteps_per_actorbatch=2048,  # timesteps per actor per update
                        # timesteps_per_actorbatch=128,
                        clip_param=0.2, entcoeff=0.0,
                        optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64,
                        gamma=0.99, lam=0.95, schedule='linear',
                        save_model_with_prefix=str(env.__class__.__name__),  # typically, the env
                        outdir="/tmp/experiments/continuous/PPO/")  # path for log files (TensorBoard) and models
    # act.save("models/mountaincar_continuous_model_PPO_" + str(m) + ".pkl")
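# ---------------------------------------------------------------------------
# All of the snippets above share one skeleton: enter a TF session, define a
# policy_fn closure, build/wrap the environment, call pposgd_simple.learn,
# and close the environment. The sketch below distills that common pattern.
# It is a minimal illustration, not any single author's version: it assumes
# the stock OpenAI Baselines ppo1 package (whose learn() accepts
# timesteps_per_actorbatch) and uses the hyperparameters most snippets above
# share; 'CartPole-v0' is only an example environment.
import gym
import baselines.common.tf_util as U
from baselines.common import set_global_seeds
from baselines.ppo1 import mlp_policy, pposgd_simple


def train_minimal(env_id, num_timesteps, seed):
    # One TF session shared by policy construction and optimization.
    U.make_session(num_cpu=1).__enter__()
    set_global_seeds(seed)
    env = gym.make(env_id)
    env.seed(seed)

    def policy_fn(name, ob_space, ac_space):
        # Two hidden layers of 64 units: the most common choice above.
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                    hid_size=64, num_hid_layers=2)

    pposgd_simple.learn(env, policy_fn,
                        max_timesteps=num_timesteps,
                        timesteps_per_actorbatch=2048,
                        clip_param=0.2, entcoeff=0.0,
                        optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64,
                        gamma=0.99, lam=0.95, schedule='linear')
    env.close()


if __name__ == '__main__':
    train_minimal('CartPole-v0', num_timesteps=100000, seed=0)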