def train(args, env):
    config = tf.ConfigProto(allow_soft_placement=True,
                            intra_op_parallelism_threads=1,
                            inter_op_parallelism_threads=1)
    config.gpu_options.allow_growth = True
    get_session(config=config)
    network = args.network
    logger.configure()
    if os.path.exists(args.load_path):
        model = ppo2.learn(network=network, env=env, load_path=args.load_path,
                           total_timesteps=args.total_timesteps,
                           nsteps=args.nsteps,
                           save_interval=args.save_interval,
                           lr=args.lr,
                           num_layers=args.num_layers)
    else:
        print('Warning: path %s does not exist; training from scratch.' % args.load_path)
        model = ppo2.learn(network=network, env=env,
                           total_timesteps=args.total_timesteps,
                           nsteps=args.nsteps,
                           save_interval=args.save_interval,
                           lr=args.lr,
                           num_layers=args.num_layers)
    model.save(args.save_path)
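# Illustrative usage sketch (not from the original source): the train()
# helper above expects an argparse-style namespace. The field names below
# mirror the attributes it reads; the paths and values are hypothetical.
import argparse

def example_train_invocation(env):
    args = argparse.Namespace(
        network='mlp',
        load_path='./checkpoints/latest',  # hypothetical checkpoint path
        save_path='./checkpoints/final',   # hypothetical output path
        total_timesteps=int(1e6),
        nsteps=2048,
        save_interval=10,
        lr=3e-4,
        num_layers=2,
    )
    train(args, env)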
def train(env_id, num_timesteps, seed, policy):
    """
    Train PPO2 model for atari environment, for testing purposes

    :param env_id: (str) the environment id string
    :param num_timesteps: (int) the number of timesteps to run
    :param seed: (int) Used to seed the random generator.
    :param policy: (Object) The policy model to use (MLP, CNN, LSTM, ...)
    """
    env = VecFrameStack(make_atari_env(env_id, 8, seed), 4)
    policy = {'cnn': CnnPolicy, 'lstm': LstmPolicy, 'lnlstm': LnLstmPolicy,
              'mlp': MlpPolicy}[policy]
    ppo2.learn(policy=policy, env=env, n_steps=128, nminibatches=4,
               lam=0.95, gamma=0.99, noptepochs=4, log_interval=1,
               ent_coef=.01,
               learning_rate=lambda f: f * 2.5e-4,
               cliprange=lambda f: f * 0.1,
               total_timesteps=int(num_timesteps * 1.1))
def train():
    config = tf.ConfigProto(allow_soft_placement=True,
                            intra_op_parallelism_threads=args.num_cpus,
                            inter_op_parallelism_threads=args.num_cpus)
    tf.Session(config=config).__enter__()

    env = RemoteVecEnv([create_env] * args.num_cpus)
    env = VecNormalize(env, ret=True, gamma=args.gamma)

    ppo2.learn(policy=policies.MlpPolicy, env=env,
               total_timesteps=args.num_timesteps,
               nminibatches=args.num_minibatches,
               nsteps=args.num_steps,
               noptepochs=args.num_epochs,
               lr=args.learning_rate,
               gamma=args.gamma,
               lam=args.lam,
               ent_coef=args.ent_coef,
               vf_coef=args.vf_coef,
               cliprange=args.clip_range,
               log_interval=args.log_interval,
               save_interval=args.save_interval,
               load_path=args.checkpoint_path,
               num_casks=args.num_casks)
def train(num_timesteps, seed):
    num_cpus = 1
    num_casks = 1
    num_cpus += num_casks
    config = tf.ConfigProto(allow_soft_placement=True,
                            intra_op_parallelism_threads=num_cpus,
                            inter_op_parallelism_threads=num_cpus)
    tf.Session(config=config).__enter__()

    gamma = 0.995
    env = RemoteVecEnv([make_env] * num_cpus)
    env = VecNormalize(env, ret=True, gamma=gamma)
    set_global_seeds(seed)
    policy = policies.MlpPolicy
    ppo2.learn(policy=policy, env=env, nsteps=128,
               nminibatches=num_cpus - num_casks,
               lam=0.95, gamma=gamma, noptepochs=4, log_interval=1,
               vf_coef=0.5, ent_coef=0.0, lr=3e-4, cliprange=0.2,
               save_interval=2, load_path="./logs/course_6/00244",
               total_timesteps=num_timesteps, num_casks=num_casks)
def main():
    """Run PPO until the environment throws an exception."""
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # pylint: disable=E1101
    with tf.Session(config=config):
        # env = make_env
        # env = lambda: make_training_env('SonicTheHedgehog-Genesis',
        #                                 'GreenHillZone.Act1',
        #                                 stack=True, scale_rew=True)
        env = MultigameEnvWrapper
        # load_path = '/root/compo/trained_on_images_nature_cnn.joblib'
        load_path = './saved_weights.joblib'
        logger.configure(dir='./logs', format_strs=['stdout', 'tensorboard'])
        # Take more timesteps than we need to be sure that
        # we stop due to an exception.
        ppo2.learn(policy=CustomCnnPolicy,
                   env=DummyVecEnv([env]),
                   nsteps=4096, nminibatches=8,
                   lam=0.95, gamma=0.99, noptepochs=3, log_interval=1,
                   ent_coef=0.01,
                   lr=lambda _: 2e-4,
                   cliprange=lambda _: 0.1,
                   total_timesteps=int(1e8),
                   load_path=load_path,
                   save_interval=20)
def train(env_id, num_timesteps, seed, render):
    from baselines.common import set_global_seeds
    from baselines.ppo2 import ppo2
    from baselines.ppo2.policies import DynamicLstmPolicy
    from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv

    ncpu = multiprocessing.cpu_count()
    if sys.platform == 'darwin':
        ncpu //= 2
    config = tf.ConfigProto(allow_soft_placement=True,
                            intra_op_parallelism_threads=ncpu,
                            inter_op_parallelism_threads=ncpu)
    config.gpu_options.allow_growth = True  # pylint: disable=E1101
    tf.Session(config=config).__enter__()

    def make_env(rank):
        def env_fn():
            env = LearningEnvironment(num_particles=PARTICLES,
                                      disable_render=not render)
            env = bench.Monitor(env, logger.get_dir())
            return env
        return env_fn

    env = SubprocVecEnv([make_env(i) for i in range(ENVIRONMENTS)])
    set_global_seeds(seed)
    policy = DynamicLstmPolicy
    ppo2.learn(policy=policy, env=env, nsteps=1000, nminibatches=3,
               lam=0.95, gamma=0.99, noptepochs=10, log_interval=1,
               save_interval=10, ent_coef=0.002, lr=1e-4, cliprange=0.2,
               total_timesteps=num_timesteps)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--game', default='Airstriker-Genesis')
    parser.add_argument('--state', default=retro.State.DEFAULT)
    parser.add_argument('--scenario', default=None)
    args = parser.parse_args()

    def make_env():
        env = make_retro(game=args.game, state=args.state,
                         scenario=args.scenario)
        env = wrap_deepmind_retro(env)
        return env

    venv = SubprocVecEnv([make_env] * 8)
    ppo2.learn(
        network='cnn',
        env=venv,
        total_timesteps=int(100e6),
        nsteps=128,
        nminibatches=4,
        lam=0.95,
        gamma=0.99,
        noptepochs=4,
        log_interval=1,
        ent_coef=.01,
        lr=lambda f: f * 2.5e-4,
        cliprange=0.1,
    )
def run():
    """Runs a PPO agent on a given environment."""

    def _load_env():
        """Loads environment."""
        raw_env = rwrl.load(
            domain_name=FLAGS.domain_name,
            task_name=FLAGS.task_name,
            safety_spec=dict(enable=True),
            delay_spec=dict(enable=True, actions=20),
            log_output=os.path.join(FLAGS.save_path, 'log.npz'),
            environment_kwargs=dict(
                log_safety_vars=True, log_every=20, flat_observation=True))
        env = GymEnv(raw_env)
        env = bench.Monitor(env, FLAGS.save_path)
        return env

    env = dummy_vec_env.DummyVecEnv([_load_env])
    ppo2.learn(
        env=env,
        network=FLAGS.network,
        lr=FLAGS.learning_rate,
        total_timesteps=FLAGS.total_timesteps,  # make sure to run enough steps
        nsteps=FLAGS.nsteps,
        gamma=FLAGS.agent_discount,
    )
def train(env_id, num_timesteps, seed, policy):
    ncpu = multiprocessing.cpu_count()
    if sys.platform == 'darwin':
        ncpu //= 2
    config = tf.ConfigProto(allow_soft_placement=True,
                            intra_op_parallelism_threads=ncpu,
                            inter_op_parallelism_threads=ncpu)
    config.gpu_options.allow_growth = True  # pylint: disable=E1101
    gym.logger.setLevel(logging.WARN)
    tf.Session(config=config).__enter__()

    nenvs = 8

    def make_env(rank):
        def env_fn():
            print(rank)
            if nenvs == 1:
                env = MarioEnv(num_steering_dir=11, jump=True)
            else:
                env = MarioEnv(num_steering_dir=11, num_env=rank, jump=True)
            env.seed(seed + rank)
            env = bench.Monitor(env, logger.get_dir() and
                                osp.join(logger.get_dir(), str(rank)))
            gym.logger.setLevel(logging.WARN)
            return env
        return env_fn

    env = SubprocVecEnv([make_env(i) for i in range(nenvs)])
    set_global_seeds(seed)
    env = VecFrameStack(env, 4)
    policy = {'cont': ContCnnPolicy, 'cnn': OurCNN2,
              'lstm': LstmPolicy, 'lnlstm': LnLstmPolicy}[policy]
    ppo2.learn(policy=policy, env=env, nsteps=128, nminibatches=4,
               lam=0.95, gamma=0.99, noptepochs=4, log_interval=1,
               ent_coef=.01,
               lr=lambda f: f * 1e-3,
               cliprange=lambda f: f * 0.1,
               total_timesteps=int(num_timesteps * 1.1),
               save_interval=10)
def train():
    """Trains a PPO2 policy."""
    ncpu = multiprocessing.cpu_count()
    config = tf.ConfigProto(allow_soft_placement=True,
                            intra_op_parallelism_threads=ncpu,
                            inter_op_parallelism_threads=ncpu)
    config.gpu_options.allow_growth = True
    tf.Session(config=config).__enter__()

    vec_env = SubprocVecEnv([(lambda _i=i: create_single_football_env(_i))
                             for i in range(FLAGS.num_envs)],
                            context=None)
    ppo2.learn(network=FLAGS.policy,
               total_timesteps=FLAGS.num_timesteps,
               env=vec_env,
               seed=FLAGS.seed,
               nsteps=FLAGS.nsteps,
               nminibatches=FLAGS.nminibatches,
               noptepochs=FLAGS.noptepochs,
               gamma=FLAGS.gamma,
               ent_coef=FLAGS.ent_coef,
               lr=FLAGS.lr,
               log_interval=1,
               save_interval=FLAGS.save_interval,
               cliprange=FLAGS.cliprange)
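# Aside (illustrative, not from the original source): the `_i=i` default
# argument in the SubprocVecEnv list above is the standard way to bind the
# loop variable by value. A plain closure captures `i` by reference, so
# every thunk would end up seeing the final index:
fns_late_binding = [lambda: i for i in range(3)]
fns_bound = [lambda _i=i: _i for i in range(3)]
assert [f() for f in fns_late_binding] == [2, 2, 2]
assert [f() for f in fns_bound] == [0, 1, 2]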
def train(angle, num_timesteps, seed):
    from baselines.common import set_global_seeds
    from baselines.ppo2 import ppo2
    from baselines.ppo2.policies import MlpPolicy
    import gym
    import tensorflow as tf
    from baselines.common.vec_env.vec_normalize import VecNormalize
    from baselines.common.vec_env.dummy_vec_env import DummyVecEnv

    with tf.Session() as sess:
        def make_env():
            return ant_env(angle)
            # env = gym.make('Ant-v1')
            # return env

        env = DummyVecEnv([make_env])
        env = VecNormalize(env)
        # env = ant_env(angle)
        set_global_seeds(seed)
        policy = MlpPolicy
        ppo2.learn(policy=policy, env=env, nsteps=2048, nminibatches=32,
                   lam=0.95, gamma=0.99, noptepochs=10, log_interval=10,
                   ent_coef=0.0, lr=3e-4, cliprange=0.2,
                   total_timesteps=num_timesteps)
def main(unused_argv):
    rs = FLAGS.random_seed
    if FLAGS.random_seed is None:
        rs = int((time.time() % 1) * 1000000)
    logger.configure(dir=FLAGS.train_log_dir, format_strs=['log'])

    players = []
    players.append(sc2_env.Agent(races[FLAGS.agent_race]))
    players.append(sc2_env.Agent(races[FLAGS.oppo_race]))

    screen_res = (int(FLAGS.screen_ratio * FLAGS.screen_resolution) // 4 * 4,
                  FLAGS.screen_resolution)
    if FLAGS.agent_interface_format == 'feature':
        agent_interface_format = sc2_env.AgentInterfaceFormat(
            feature_dimensions=sc2_env.Dimensions(
                screen=screen_res, minimap=FLAGS.minimap_resolution))
    elif FLAGS.agent_interface_format == 'rgb':
        agent_interface_format = sc2_env.AgentInterfaceFormat(
            rgb_dimensions=sc2_env.Dimensions(
                screen=screen_res, minimap=FLAGS.minimap_resolution))
    else:
        raise NotImplementedError

    ncpu = multiprocessing.cpu_count()
    if sys.platform == 'darwin':
        ncpu //= 2
    config = tf.ConfigProto(allow_soft_placement=True,
                            intra_op_parallelism_threads=ncpu,
                            inter_op_parallelism_threads=ncpu)
    config.gpu_options.allow_growth = True  # pylint: disable=E1101
    tf.Session(config=config).__enter__()

    # flags.DEFINE_float('param_tstep', 100000, 'the parameter total step')
    param_lam = FLAGS.param_lam
    param_gamma = FLAGS.param_gamma
    param_concurrent = FLAGS.param_concurrent
    param_lr = FLAGS.param_lr
    param_cr = FLAGS.param_cr
    param_tstep = FLAGS.param_tstep
    print('params, lam={} gamma={} concurrent={} lr={} tstep={}'.format(
        param_lam, param_gamma, param_concurrent, param_lr, param_tstep))

    env = make_sc2_dis_env(num_env=param_concurrent, seed=rs, players=players,
                           agent_interface_format=agent_interface_format)
    ppo2.learn(policy=CnnPolicy, env=env, nsteps=128, nminibatches=1,
               lam=param_lam, gamma=param_gamma, noptepochs=4, log_interval=1,
               ent_coef=0.01,
               lr=lambda f: f * param_lr,
               cliprange=lambda f: f * param_cr,
               total_timesteps=param_tstep,
               save_interval=10)
def train(_):
    """Trains a PPO2 policy."""
    vec_env = SubprocVecEnv([(lambda _i=i: create_single_football_env(_i))
                             for i in range(FLAGS.num_envs)],
                            context=None)

    # Import tensorflow after we create environments. TF is not fork-safe,
    # and we could be using TF as part of the environment if one of the
    # players is controlled by an already trained model.
    import tensorflow.compat.v1 as tf

    ncpu = multiprocessing.cpu_count()
    config = tf.ConfigProto(allow_soft_placement=True,
                            intra_op_parallelism_threads=ncpu,
                            inter_op_parallelism_threads=ncpu)
    config.gpu_options.allow_growth = True
    tf.Session(config=config).__enter__()

    ppo2.learn(network=FLAGS.policy,
               total_timesteps=FLAGS.num_timesteps,
               env=vec_env,
               seed=FLAGS.seed,
               nsteps=FLAGS.nsteps,
               nminibatches=FLAGS.nminibatches,
               noptepochs=FLAGS.noptepochs,
               max_grad_norm=FLAGS.max_grad_norm,
               gamma=FLAGS.gamma,
               ent_coef=FLAGS.ent_coef,
               lr=FLAGS.lr,
               log_interval=1,
               save_interval=FLAGS.save_interval,
               cliprange=FLAGS.cliprange,
               load_path=FLAGS.load_path)
def train(env_id, num_timesteps, seed, policy):
    ncpu = multiprocessing.cpu_count()
    if sys.platform == 'darwin':
        ncpu //= 2
    config = tf.ConfigProto(allow_soft_placement=True,
                            intra_op_parallelism_threads=ncpu,
                            inter_op_parallelism_threads=ncpu)
    config.gpu_options.allow_growth = True  # pylint: disable=E1101
    tf.Session(config=config).__enter__()

    env = VecFrameStack(make_atari_env(env_id, 8, seed), 4)
    policy = {'cnn': CnnPolicy, 'lstm': LstmPolicy, 'lnlstm': LnLstmPolicy,
              'mlp': MlpPolicy}[policy]
    ppo2.learn(policy=policy, env=env, nsteps=128, nminibatches=4,
               lam=0.95, gamma=0.99, noptepochs=4, log_interval=1,
               ent_coef=.01,
               lr=lambda f: f * 2.5e-4,
               cliprange=lambda f: f * 0.1,
               total_timesteps=int(num_timesteps * 1.1))
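# Note (illustrative, not from the original source): baselines' ppo2 calls
# callable hyperparameters with the fraction of training remaining, which
# anneals from 1.0 toward 0.0. So `lambda f: f * 2.5e-4` above decays the
# learning rate linearly to zero, and `lambda f: f * 0.1` does the same for
# the clip range. A quick sketch of the resulting schedule:
lr_schedule = lambda f: f * 2.5e-4
assert lr_schedule(1.0) == 2.5e-4   # start of training
assert lr_schedule(0.5) == 1.25e-4  # halfway through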
def run(bsuite_id: Text) -> Text:
    """Runs a PPO agent on a given bsuite environment, logging to CSV."""

    def _load_env():
        raw_env = bsuite.load_and_record(
            bsuite_id=bsuite_id,
            save_path=FLAGS.save_path,
            logging_mode=FLAGS.logging_mode,
            overwrite=FLAGS.overwrite,
        )
        if FLAGS.verbose:
            raw_env = terminal_logging.wrap_environment(raw_env, log_every=True)
        return gym_wrapper.GymFromDMEnv(raw_env)

    env = dummy_vec_env.DummyVecEnv([_load_env])
    ppo2.learn(
        env=env,
        network=FLAGS.network,
        lr=FLAGS.learning_rate,
        total_timesteps=FLAGS.total_timesteps,  # make sure to run enough steps
        nsteps=FLAGS.nsteps,
        gamma=FLAGS.agent_discount,
    )
    return bsuite_id
def train(env_id, num_timesteps, seed):
    from baselines.common import set_global_seeds
    from baselines.common.vec_env.vec_normalize import VecNormalize
    from baselines.ppo2 import ppo2
    from baselines.ppo2.policies import MlpPolicy
    import gym
    import tensorflow as tf
    from baselines.common.vec_env.dummy_vec_env import DummyVecEnv

    ncpu = 1
    config = tf.ConfigProto(allow_soft_placement=True,
                            intra_op_parallelism_threads=ncpu,
                            inter_op_parallelism_threads=ncpu)
    tf.Session(config=config).__enter__()

    def make_env():
        env = gym.make(env_id)
        env = bench.Monitor(env, logger.get_dir())
        return env

    env = DummyVecEnv([make_env])
    env = VecNormalize(env)
    set_global_seeds(seed)
    policy = MlpPolicy
    ppo2.learn(policy=policy, env=env, nsteps=2048, nminibatches=32,
               lam=0.95, gamma=0.99, noptepochs=10, log_interval=1,
               ent_coef=0.0, lr=3e-4, cliprange=0.2,
               total_timesteps=num_timesteps)
def main(policy, env, params):
    """Run PPO until the environment throws an exception."""
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # pylint: disable=E1101
    with tf.Session(config=config):
        # Take more timesteps than we need to be sure that
        # we stop due to an exception.
        ppo2.learn(policy=policy, env=env,
                   nsteps=params['n_steps'],
                   nminibatches=(params['n_steps'] * env.num_envs)
                                // params['batch_size'],
                   lam=params['lam'],
                   gamma=params['gamma'],
                   noptepochs=params['n_opt_epochs'],
                   log_interval=params['log_interval'],
                   ent_coef=params['ent_coef'],
                   vf_coef=params['vf_coef'],
                   lr=lambda _: params['lr'],
                   cliprange=lambda _: params['cliprange'],
                   max_grad_norm=params['max_grad_norm'],
                   total_timesteps=params['max_steps'],
                   save_interval=params['save_interval'],
                   weights_path=params['weights_path'],
                   adam_stats=params['adam_stats'],
                   nmixup=params['nmixup'],
                   weights_choose_eps=params['weights_choose_eps'],
                   cnn=params['cnn'])
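# Worked example of the minibatch arithmetic above (assumed values, not from
# the original source): PPO2 gathers n_steps * num_envs transitions per
# update and splits them into nminibatches slices, so deriving nminibatches
# from a desired batch_size keeps each slice at exactly that size.
n_steps, num_envs, batch_size = 256, 8, 512
nminibatches = (n_steps * num_envs) // batch_size   # 2048 // 512 == 4
assert (n_steps * num_envs) // nminibatches == batch_size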
def train(env_id, num_timesteps, seed, d_targ, load, point):
    from baselines.common import set_global_seeds
    from baselines.common.vec_env.vec_normalize import VecNormalize
    from baselines.ppo2 import ppo2
    from baselines.ppo2.policies import LstmMlpPolicy, MlpPolicy
    import gym
    # import roboschool
    import multiprocessing
    import tensorflow as tf
    from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv
    from baselines.common.vec_env.dummy_vec_env import DummyVecEnv

    def make_env(rank):
        def _thunk():
            env = gym.make(env_id)
            env.seed(seed + rank)
            env = bench.Monitor(
                env,
                logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
            return env
        return _thunk

    set_global_seeds(seed)
    ncpu = multiprocessing.cpu_count()
    config = tf.ConfigProto(allow_soft_placement=True,
                            intra_op_parallelism_threads=ncpu,
                            inter_op_parallelism_threads=ncpu)
    tf.Session(config=config).__enter__()

    nenvs = 32
    env = SubprocVecEnv([make_env(i) for i in range(nenvs)])
    env = VecNormalize(env)
    policy = MlpPolicy

    def adaptive_lr(lr, kl, d_targ):
        if kl < (d_targ / 1.5):
            lr *= 2.
        elif kl > (d_targ * 1.5):
            lr *= .5
        return lr

    ppo2.learn(policy=policy, env=env, nsteps=512, nminibatches=4,
               lam=0.95, gamma=0.99, noptepochs=15, log_interval=1,
               ent_coef=0.00, lr=adaptive_lr, cliprange=0.2,
               total_timesteps=num_timesteps, load=load, point=point,
               init_targ=d_targ)
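# Standalone sketch (not from the original source) of the KL-adaptive
# learning-rate rule defined inside train() above, copied out so its
# behavior can be checked in isolation: the rate doubles when the observed
# KL divergence falls well below the target and halves when it overshoots,
# as in the adaptive-KL variant of PPO.
def adaptive_lr_demo(lr, kl, d_targ):
    if kl < (d_targ / 1.5):
        lr *= 2.
    elif kl > (d_targ * 1.5):
        lr *= .5
    return lr

assert adaptive_lr_demo(3e-4, kl=0.001, d_targ=0.01) == 6e-4   # far below target: double
assert adaptive_lr_demo(3e-4, kl=0.02, d_targ=0.01) == 1.5e-4  # far above target: halve
assert adaptive_lr_demo(3e-4, kl=0.01, d_targ=0.01) == 3e-4    # within band: unchanged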
def main(environment, nr_episodes):
    env = make_unity_env(environment, 1, True)
    ppo2.learn(
        network="mlp",
        env=env,
        total_timesteps=nr_episodes,  # ppo2 counts timesteps, not episodes
        lr=1e-3,
    )
def train(env_id, num_timesteps, seed, policy):
    from baselines.common import set_global_seeds
    from baselines.common.atari_wrappers import make_atari, wrap_deepmind
    from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv
    from baselines.common.vec_env.vec_frame_stack import VecFrameStack
    from baselines.ppo2 import ppo2
    from baselines.ppo2.policies import CnnPolicy, LstmPolicy, LnLstmPolicy, MlpPolicy
    import gym
    import logging
    import multiprocessing
    import os.path as osp
    import tensorflow as tf

    ncpu = multiprocessing.cpu_count()
    if sys.platform == 'darwin':
        ncpu //= 2
    config = tf.ConfigProto(allow_soft_placement=True,
                            intra_op_parallelism_threads=ncpu,
                            inter_op_parallelism_threads=ncpu)
    config.gpu_options.allow_growth = True  # pylint: disable=E1101
    gym.logger.setLevel(logging.WARN)
    tf.Session(config=config).__enter__()

    def make_env(rank):
        def env_fn():
            env = make_atari(env_id)
            env.seed(seed + rank)
            env = bench.Monitor(
                env,
                logger.get_dir() and osp.join(logger.get_dir(), str(rank)))
            return env  # wrap_deepmind(env)
        return env_fn

    nenvs = 8
    env = SubprocVecEnv([make_env(i) for i in range(nenvs)])
    set_global_seeds(seed)
    env = VecFrameStack(env, 4)
    policy = {'cnn': CnnPolicy, 'lstm': LstmPolicy, 'lnlstm': LnLstmPolicy,
              'mlp': MlpPolicy, 'capsules': CapsulesPolicy}[policy]
    ppo2.learn(policy=policy, env=env, nsteps=128, nminibatches=4,
               lam=0.95, gamma=0.99, noptepochs=4, log_interval=1,
               ent_coef=.01,
               lr=lambda f: f * 2.5e-4,
               cliprange=lambda f: f * 0.1,
               total_timesteps=int(num_timesteps * 1.1))
def train(env_id, num_timesteps, seed, policy, lrschedule, num_env):
    if policy == 'cnn':
        policy_fn = CnnPolicy
    elif policy == 'lstm':
        policy_fn = LstmPolicy
    elif policy == 'lnlstm':
        policy_fn = LnLstmPolicy
    env = VecFrameStack(make_atari_env(env_id, num_env, seed), 4)
    learn(policy_fn, env, seed, total_timesteps=int(num_timesteps * 1.1),
          lrschedule=lrschedule)
    env.close()
def main():
    arenas_configurations = ArenaConfig(
        "configurations/arena_configurations/train_ml_agents_arenas.yml")
    env = make_aai_env("env/AnimalAI", 2, arenas_configurations)
    ppo2.learn(
        network="cnn",
        env=env,
        total_timesteps=100000,
        lr=1e-3,
    )
def main():
    """Run PPO until the environment throws an exception."""
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # pylint: disable=E1101

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--num_env', default=1, type=int)
    parser.add_argument('--seed', default=None, type=int)
    parser.add_argument('--game', default='ContraIII-Snes')
    parser.add_argument('--state',
                        default='level1.1player.easy.100lives')  # retro.State.DEFAULT
    parser.add_argument('--scenario', default='scenario')
    parser.add_argument('--discrete_actions', default=0, type=int)
    parser.add_argument('--bk2dir', default='videos')
    parser.add_argument('--monitordir', default='logs')
    parser.add_argument('--sonic_discretizer', default=1, type=int)
    parser.add_argument('--clip_rewards', default=0, type=int)
    parser.add_argument('--stack', default=4, type=int)
    parser.add_argument('--time_limit', default=8000, type=int)
    parser.add_argument('--scale_reward', default=0.01, type=float)
    parser.add_argument('--warp_frame', default=1, type=int)
    parser.add_argument('--stochastic_frame_skip', default=4, type=int)
    parser.add_argument('--skip_prob', default=0.0, type=float)
    parser.add_argument('--network', default='cnn')
    parser.add_argument('--scenario_number', default=1, type=int)
    parser.add_argument('--load_path', default=None)
    args = parser.parse_args()

    time_int = int(time.time())
    env_vec = make_vec_env(args, time_int)
    logger.configure(dir='./log/{}'.format(time_int),
                     format_strs=['stdout', 'log', 'csv', 'tensorboard'])

    with tf.Session(config=config):
        ppo2.learn(
            network='impala_cnn',  # alternatives tried: args.network, 'contra_net', 'cnn'
            env=env_vec,
            nsteps=1024,
            nminibatches=128,  # tried: 16, 256, 512, 64
            lam=0.95,
            gamma=0.997,  # tried: 0.99
            noptepochs=3,
            log_interval=100,
            ent_coef=0.003,  # tried: 0.001, 0.005 (many actions), 0.01
            lr=lambda _: 5e-5,  # tried: 2e-4, 1e-4
            cliprange=0.1,
            save_interval=100,
            seed=args.seed,
            vf_coef=0.5,
            max_grad_norm=0.5,
            save_path='ppo_save/{}'.format(time_int),
            # load_path=args.load_path,
            total_timesteps=int(2e10))
def train(env_id, num_timesteps, seed, nsteps, batch_size, epoch, method,
          net_size, i_trial, load_path, use_entr, ncpu):
    # rank = MPI.COMM_WORLD.Get_rank()
    # if rank != 0:
    #     logger.set_level(logger.DISABLED)
    config = tf.ConfigProto(allow_soft_placement=True,
                            intra_op_parallelism_threads=ncpu,
                            inter_op_parallelism_threads=ncpu)
    config.gpu_options.allow_growth = True
    # workerseed = seed + 10000 * rank
    tf.reset_default_graph()
    set_global_seeds(seed)

    def make_env(rank):
        def _thunk():
            env = gym.make(env_id)
            if logger.get_dir():
                env = bench.Monitor(
                    env,
                    os.path.join(logger.get_dir(),
                                 'train-{}.monitor.json'.format(rank)))
            return env
        return _thunk

    # def make_env():
    #     env = gym.make(env_id)
    #     env = bench.Monitor(env, logger.get_dir(), allow_early_resets=True)
    #     return env

    env = SubprocVecEnv([make_env(i) for i in range(ncpu)])
    # env = DummyVecEnv([make_env])
    env = VecNormalize(env)

    with tf.Session(config=config) as sess:
        policy = MlpPolicy
        ppo2.learn(policy=policy, env=env, nsteps=nsteps,
                   nminibatches=batch_size,
                   lam=0.95, gamma=0.99, noptepochs=epoch, log_interval=1,
                   ent_coef=0.01, lr=3e-4, cliprange=0.2,
                   total_timesteps=num_timesteps, useentr=use_entr,
                   net_size=net_size, i_trial=i_trial, load_path=load_path,
                   method=method)
def main():
    print('Executable environment', OBS_TOWER_ENVPATH)
    env = make_unity_env(OBS_TOWER_ENVPATH, 1)
    # ppo2.learn returns the trained model; the module itself has no save(),
    # and total_timesteps must be an int.
    model = ppo2.learn(
        network='mlp',
        env=env,
        total_timesteps=int(1e5),
        lr=1e-3,
    )
    model.save('obs_tower_chall_model.pkl')
def main():
    if 1:
        env = gym.make("Pendulum-v0")
        env.num_envs = 1
        act = ppo2.learn(env=env, network='mlp', total_timesteps=0,
                         load_path="pendulum_model_ppo2.pkl")
    else:
        env_id = "pendulum-legacy-v0"
        env_type = "gym_poine"
        num_env = 1
        seed = 1234
        reward_scale = 1.
        flatten_dict_observations = False
        env = make_vec_env(env_id, env_type, num_env, seed, reward_scale,
                           flatten_dict_observations)
        act = ppo2.learn(
            env=env,
            network='mlp',
            total_timesteps=0,
            eval_env=None,
            seed=None,
            nsteps=2048,
            ent_coef=0.0,
            # lr=lambda f: f * 2.5e-4,
            lr=3e-4,
            vf_coef=0.5,
            max_grad_norm=0.5,
            gamma=0.9,  # default 0.99
            lam=0.95,
            log_interval=10,
            nminibatches=32,  # default 4
            noptepochs=10,
            cliprange=0.2,
            save_interval=0,
            load_path="pendulum_model_ppo2.pkl",
            model_fn=None,
            update_fn=None,
            init_fn=None,
            mpi_rank_weight=1,
            comm=None,
        )

    while True:
        obs, done = env.reset(), False
        episode_rew = 0
        while not done:
            env.render()
            # ppo2.learn returns a model, not a callable policy; step()
            # returns (actions, values, states, neglogpacs), so take the
            # actions batch and then its first entry.
            obs, rew, done, _ = env.step(act.step(obs[None])[0][0])
            episode_rew += rew
        print("Episode reward", episode_rew)
def train(env_id, num_timesteps, seed, pol, cur, vis, model):
    from baselines.common import set_global_seeds
    from baselines.ppo2 import ppo2
    from baselines.ppo2.policies import HierPolicy, HierPolicy2, MlpPolicy, RandomWalkPolicy
    import gym
    import gym_program
    import tensorflow as tf
    from baselines.common.vec_env.vec_normalize import VecNormalize
    from baselines.common.vec_env.dummy_vec_env import DummyVecEnv

    ncpu = 1
    config = tf.ConfigProto(allow_soft_placement=True,
                            intra_op_parallelism_threads=ncpu,
                            inter_op_parallelism_threads=ncpu)
    tf.Session(config=config).__enter__()

    hier = pol in ('hier1', 'hier2')

    def make_env():
        set_global_seeds(seed)
        env = gym.make(env_id)
        env.set_curiosity(cur, model)
        env.set_hier(hier)
        env.set_visualize(vis)
        env = bench.Monitor(env, logger.get_dir())
        env.seed(seed)
        return env

    env = DummyVecEnv([make_env])
    env = VecNormalize(env)
    set_global_seeds(seed)

    if pol == 'hier1':
        policy = HierPolicy
    elif pol == 'hier2':
        policy = HierPolicy2
    elif pol == 'mlp':  # was `policy == 'mlp'`, which referenced an unbound name
        policy = MlpPolicy
    elif pol == 'random_walk':
        pol = RandomWalkPolicy
        pol(env)
        return

    ppo2.learn(policy=policy, env=env, pol=pol, nsteps=2048, nminibatches=32,
               lam=0.95, gamma=0.99, noptepochs=10, log_interval=1,
               ent_coef=0.0, lr=1e-4, cliprange=0.2,
               total_timesteps=num_timesteps)
def run_baselines(env, seed, log_dir):
    """Create baselines model and training.

    Args:
        env (dict): Environment of the task.
        seed (int): Random positive integer for the trial.
        log_dir (str): Log dir path.

    Returns:
        str: Path to output csv file
    """
    ncpu = max(multiprocessing.cpu_count() // 2, 1)
    config = tf.ConfigProto(allow_soft_placement=True,
                            intra_op_parallelism_threads=ncpu,
                            inter_op_parallelism_threads=ncpu)
    tf.compat.v1.Session(config=config).__enter__()

    # Set up logger for baselines
    configure(dir=log_dir, format_strs=['stdout', 'log', 'csv', 'tensorboard'])
    baselines_logger.info('rank {}: seed={}, logdir={}'.format(
        0, seed, baselines_logger.get_dir()))

    env = DummyVecEnv([
        lambda: bench.Monitor(
            env, baselines_logger.get_dir(), allow_early_resets=True)
    ])
    env = VecNormalize(env)

    set_global_seeds(seed)
    policy = MlpPolicy

    nbatch = env.num_envs * hyper_parameters['batch_size']
    training_batch_number = nbatch // hyper_parameters['training_batch_size']

    # Use Adam as the optimizer; the value function shares the same network
    # settings as the policy.
    ppo2.learn(policy=policy,
               env=env,
               nsteps=hyper_parameters['batch_size'],
               lam=hyper_parameters['gae_lambda'],
               gamma=hyper_parameters['discount'],
               ent_coef=hyper_parameters['policy_ent_coeff'],
               nminibatches=training_batch_number,
               noptepochs=hyper_parameters['training_epochs'],
               max_grad_norm=None,
               lr=hyper_parameters['learning_rate'],
               cliprange=hyper_parameters['lr_clip_range'],
               total_timesteps=hyper_parameters['batch_size'] *
               hyper_parameters['n_epochs'])  # yapf: disable  # noqa: E501

    return osp.join(log_dir, 'progress.csv')
def train(env_id, num_timesteps, seed):
    from baselines.common import set_global_seeds
    from baselines.common.vec_env.vec_normalize import VecNormalize
    from baselines.ppo2 import ppo2
    from baselines.ppo2.policies import MlpPolicy
    import gym
    import tensorflow as tf
    from baselines.common.vec_env.dummy_vec_env import DummyVecEnv

    ncpu = 1
    config = tf.ConfigProto(allow_soft_placement=True,
                            intra_op_parallelism_threads=ncpu,
                            inter_op_parallelism_threads=ncpu)
    tf.Session(config=config).__enter__()

    def make_env():
        if env_id == 'toy':
            # env = continuous_gridworld.ContinuousGridworld(
            #     '', max_steps=1000,
            #     obstacle_mode=continuous_gridworld.NO_OBJECTS)
            from toy_environment import room_obstacle_list
            env = gridworld.Gridworld(
                obstacle_list_generator=room_obstacle_list.obstacle_list)
        elif env_id == 'navigate':
            env = NavigateEnv(use_camera=False, continuous_actions=True,
                              neg_reward=True, max_steps=500)
        elif env_id == 'arm2pos':
            # env = Arm2PosEnv(continuous=False, max_steps=500)
            # This branch is disabled; fail fast instead of falling through
            # to an unbound `env` below.
            raise NotImplementedError('arm2pos environment is disabled')
        else:
            env = gym.make(env_id)
        env = bench.Monitor(env, logger.get_dir())
        return env

    env = DummyVecEnv([make_env])
    env = VecNormalize(env)
    set_global_seeds(seed)
    policy = MlpPolicy
    ppo2.learn(policy=policy, env=env, nsteps=2048, nminibatches=32,
               lam=0.95, gamma=0.99, noptepochs=10, log_interval=1,
               ent_coef=0.0, lr=3e-4, cliprange=0.2,
               total_timesteps=num_timesteps)
def run_baselines(env, seed, log_dir):
    '''Create baselines model and training.

    Replace the ppo and its training with the algorithm you want to run.

    :param env: Environment of the task.
    :param seed: Random seed for the trial.
    :param log_dir: Log dir path.
    :return: Path to output csv file.
    '''
    ncpu = max(multiprocessing.cpu_count() // 2, 1)
    config = tf.ConfigProto(allow_soft_placement=True,
                            intra_op_parallelism_threads=ncpu,
                            inter_op_parallelism_threads=ncpu)
    tf.Session(config=config).__enter__()

    # Set up logger for baselines
    configure(dir=log_dir, format_strs=['stdout', 'log', 'csv', 'tensorboard'])
    baselines_logger.info('rank {}: seed={}, logdir={}'.format(
        0, seed, baselines_logger.get_dir()))

    def make_env():
        monitor = bench.Monitor(
            env, baselines_logger.get_dir(), allow_early_resets=True)
        return monitor

    env = DummyVecEnv([make_env])
    env = VecNormalize(env)
    set_global_seeds(seed)
    policy = MlpPolicy
    ppo2.learn(policy=policy,
               env=env,
               nsteps=2048,
               nminibatches=32,
               lam=0.95,
               gamma=0.99,
               noptepochs=10,
               log_interval=1,
               ent_coef=0.0,
               lr=1e-3,
               vf_coef=0.5,
               max_grad_norm=None,
               cliprange=0.2,
               total_timesteps=int(1e6))
    return osp.join(log_dir, 'progress.csv')
def train(args):
    logger.configure(args.main_path)
    if args.diff_frames:
        assert "stack_frames" in args

    seed = int.from_bytes(os.urandom(4), byteorder='big')
    set_global_seeds(seed)
    env = ImVecNormalize(
        make_multiple_mujoco_env(args.env_id, args.number_of_agents, seed))

    ncpu = multiprocessing.cpu_count()
    if sys.platform == 'darwin':
        ncpu //= 2
    ncpu = 1
    config = tf.ConfigProto(allow_soft_placement=True,
                            intra_op_parallelism_threads=ncpu,
                            inter_op_parallelism_threads=ncpu)
    config.gpu_options.allow_growth = True
    tf.Session(config=config).__enter__()

    with tf.device("/device:GPU:0"):
        if args.policy == "cnn":
            policy = MImVecPolicy
        elif args.policy == "lstm_cnn":
            policy = MImVecLstmPolicy
        elif args.policy == "lnlstm_cnn":
            policy = MImVecLnLstmPolicy
        else:
            raise ValueError
        ppo2.learn(policy=policy, env=env,
                   nsteps=args.nsteps,
                   nminibatches=args.nminibatches,
                   lam=args.lam,
                   gamma=args.gamma,
                   noptepochs=args.noptepochs,
                   log_interval=1,
                   ent_coef=0.0,
                   lr=args.learning_rate,
                   cliprange=args.cliprange,
                   total_timesteps=int(args.num_timesteps * 1.01),
                   add_flownet=args.add_flownet,
                   flownet_path=args.flownet_path,
                   flow_key=args.flow_key,
                   train_from_scratch=args.train_from_scratch,
                   large_cnn=args.large_cnn,
                   add_predicted_flow_to_vec=args.add_predicted_flow_to_vec,
                   diff_frames=args.diff_frames)
def train(env_id, num_timesteps, seed, policy):
    ncpu = multiprocessing.cpu_count()
    if sys.platform == 'darwin':
        ncpu //= 2
    config = tf.ConfigProto(allow_soft_placement=True,
                            intra_op_parallelism_threads=ncpu,
                            inter_op_parallelism_threads=ncpu)
    config.gpu_options.allow_growth = True  # pylint: disable=E1101
    tf.Session(config=config).__enter__()

    env = VecFrameStack(make_atari_env(env_id, 8, seed), 4)
    policy = {'cnn': CnnPolicy, 'lstm': LstmPolicy, 'lnlstm': LnLstmPolicy,
              'mlp': MlpPolicy}[policy]
    ppo2.learn(policy=policy, env=env, nsteps=128, nminibatches=4,
               lam=0.95, gamma=0.99, noptepochs=4, log_interval=1,
               ent_coef=.01,
               lr=lambda f: f * 2.5e-4,
               cliprange=lambda f: f * 0.1,
               total_timesteps=int(num_timesteps * 1.1))
import pytest
import tensorflow as tf
import random
import numpy as np
from gym.spaces import np_random

from baselines.a2c import a2c
from baselines.ppo2 import ppo2
from baselines.common.identity_env import IdentityEnv
from baselines.common.vec_env.dummy_vec_env import DummyVecEnv
from baselines.ppo2.policies import MlpPolicy

learn_func_list = [
    lambda e: a2c.learn(policy=MlpPolicy, env=e, seed=0,
                        total_timesteps=50000),
    lambda e: ppo2.learn(policy=MlpPolicy, env=e, total_timesteps=50000,
                         lr=1e-3, nsteps=128, ent_coef=0.01)
]


@pytest.mark.slow
@pytest.mark.parametrize("learn_func", learn_func_list)
def test_identity(learn_func):
    '''
    Test if the algorithm (with a given policy)
    can learn an identity transformation (i.e. return observation as an action)
    '''
    np.random.seed(0)
    np_random.seed(0)
    random.seed(0)

    env = DummyVecEnv([lambda: IdentityEnv(10)])
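    # The test body is truncated in this excerpt. A minimal completion
    # sketch (assumed, not from the original source): run the learner in a
    # fresh graph, then check that the trained policy earns reward (1 for a
    # correct echo of the observation, 0 otherwise) on most steps.
    with tf.Graph().as_default(), tf.Session().as_default():
        model = learn_func(env)
        n_trials = 1000
        sum_rew = 0
        obs = env.reset()
        for _ in range(n_trials):
            obs, rew, _, _ = env.step(model.step(obs)[0])
            sum_rew += rew[0]
        assert sum_rew > 0.9 * n_trials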