def make_vec_envs(id, num_processes, gamma, return_env_vector=False, **kwargs):
    start_port = kwargs['port']
    ports = range(start_port, start_port + num_processes)
    env_vector = []
    for i in range(num_processes):
        kwargs['port'] = ports[i]
        env_vector.append(_make_env_fn(id, i, **kwargs))
    if len(env_vector) > 1:
        envs = SubprocVecEnv(env_vector)
    else:
        envs = DummyVecEnv(env_vector)
    if len(envs.observation_space.shape) == 1:
        use_tf = True
        if gamma is None:
            envs = VecNormalize(envs, ret=False, use_tf=use_tf)
        else:
            envs = VecNormalize(envs, gamma=gamma, use_tf=use_tf)
        import tensorflow as tf
        from baselines.common.tf_util import get_session
        config = tf.ConfigProto(allow_soft_placement=True,
                                intra_op_parallelism_threads=1,
                                inter_op_parallelism_threads=1)
        config.gpu_options.allow_growth = True
        get_session(config=config)
    if return_env_vector:
        return envs, env_vector
    return envs

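# Illustrative only: a minimal sketch of calling make_vec_envs above, assuming
# a hypothetical environment id 'SimEnv-v0' and a simulator that accepts a
# distinct port per worker process (ports 9000..9003 here are made up).
def _example_make_vec_envs():
    envs, env_vector = make_vec_envs('SimEnv-v0', num_processes=4, gamma=0.99,
                                     return_env_vector=True, port=9000)
    obs = envs.reset()  # one stacked observation row per worker process
    return obs
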
def make_vec_envs_pytorch(id, num_processes, gamma, device, return_env_vector=False, **kwargs):
    from a2c_ppo_acktr.envs import VecPyTorch
    start_port = kwargs['port']
    ports = range(start_port, start_port + num_processes)
    env_vector = []
    for i in range(num_processes):
        kwargs['port'] = ports[i]
        env_vector.append(_make_env_fn(id, i, **kwargs))
    if len(env_vector) > 1:
        envs = SubprocVecEnv(env_vector)
    else:
        envs = DummyVecEnv(env_vector)
    if len(envs.observation_space.shape) == 1:
        if gamma is None:
            envs = VecNormalize(envs, ret=False)
        else:
            envs = VecNormalize(envs, gamma=gamma)
    envs = VecPyTorch(envs, device)
    if return_env_vector:
        return envs, env_vector
    return envs

def build_env(args):
    ncpu = multiprocessing.cpu_count()
    if sys.platform == 'darwin':
        ncpu //= 2
    nenv = args.num_env or ncpu
    alg = args.alg
    rank = MPI.COMM_WORLD.Get_rank() if MPI else 0
    seed = args.seed

    env_type, env_id = get_env_type(args.env)
    if env_type == 'atari':
        if alg == 'acer':
            env = make_vec_env(env_id, env_type, nenv, seed)
        elif alg == 'deepq':
            env = atari_wrappers.make_atari(env_id)
            env.seed(seed)
            env = bench.Monitor(env, logger.get_dir())
            env = atari_wrappers.wrap_deepmind(env, frame_stack=True)
        elif alg == 'trpo_mpi':
            env = atari_wrappers.make_atari(env_id)
            env.seed(seed)
            env = bench.Monitor(env, logger.get_dir() and osp.join(logger.get_dir(), str(rank)))
            env = atari_wrappers.wrap_deepmind(env)
            # TODO: check if the second seeding is necessary, and eventually remove
            env.seed(seed)
        else:
            frame_stack_size = 4
            env = VecFrameStack(make_vec_env(env_id, env_type, nenv, seed), frame_stack_size)
    elif env_type == 'retro':
        import retro
        gamestate = args.gamestate or retro.State.DEFAULT
        env = retro_wrappers.make_retro(game=args.env, state=gamestate,
                                        max_episode_steps=10000,
                                        use_restricted_actions=retro.Actions.DISCRETE)
        env.seed(args.seed)
        env = bench.Monitor(env, logger.get_dir())
        env = retro_wrappers.wrap_deepmind_retro(env)
    elif env_type == 'unity':
        get_session(tf.ConfigProto(allow_soft_placement=True,
                                   intra_op_parallelism_threads=1,
                                   inter_op_parallelism_threads=1))
        env = make_vec_env(env_id, env_type, args.num_env or 1, seed,
                           reward_scale=args.reward_scale)
        env = VecNormalize(env)
    else:
        get_session(tf.ConfigProto(allow_soft_placement=True,
                                   intra_op_parallelism_threads=1,
                                   inter_op_parallelism_threads=1))
        env = make_vec_env(env_id, env_type, args.num_env or 1, seed,
                           reward_scale=args.reward_scale)
        if env_type == 'mujoco' or env_type == 'unity':
            env = VecNormalize(env)
    return env

def wrap_env_pytorch(env, gamma, device):
    from a2c_ppo_acktr.envs import VecPyTorch
    envs = DummyVecEnv([env])
    if len(envs.observation_space.shape) == 1:
        if gamma is None:
            envs = VecNormalize(envs, ret=False)
        else:
            envs = VecNormalize(envs, gamma=gamma)
    envs = VecPyTorch(envs, device)
    return envs

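# A minimal usage sketch for wrap_env_pytorch, assuming a standard gym task;
# note that DummyVecEnv expects a list of callables, so the `env` argument
# must be a maker function rather than an environment instance.
def _example_wrap_env_pytorch():
    import gym
    import torch
    envs = wrap_env_pytorch(lambda: gym.make('Pendulum-v0'), gamma=0.99,
                            device=torch.device('cpu'))
    return envs.reset()  # observations come back as a torch.Tensor
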
def Test():
    env = DummyVecEnv([EnvFunc(i) for i in range(1)])
    env = VecNormalize(env, ob=True, ret=True)
    act = ppo.learn(
        network="mlp",
        env=env,
        # lr=3e-4,
        nsteps=256,
        nminibatches=8,
        # lam=0.94,
        total_timesteps=0,
        log_interval=100,
        epsilon_start=0.9,
        epsilon_final=0.002,
        epsilon_decay=140,
        # save_interval=500,
        load_path="/home/duoyi/MyGit/simple_baselines/300",
        num_layers=3,
        num_hidden=256,
        value_network="copy"
    )
    obs = env.reset()
    iFrame = 0
    while True:
        action, _, _, _ = act.step(obs)
        obs, reward, done, info = env.step(action)
        iFrame += 1
        if done[0]:
            print("total Frame", iFrame)
            iFrame = 0
        env.render()

def run_train_task(vv):
    ncpu = multiprocessing.cpu_count()
    config = tf.ConfigProto(allow_soft_placement=True,
                            intra_op_parallelism_threads=ncpu,
                            inter_op_parallelism_threads=ncpu)
    tf.Session(config=config).__enter__()

    def make_env():
        env = vv['env'](log_scale_limit=0.0, max_path_length=vv['path_length'])
        env = bench.Monitor(env, logger.get_dir(), allow_early_resets=True)
        return env

    n_envs = vv['batch_size'] // vv['path_length']
    env = DummyVecEnv([make_env for i in range(n_envs)])
    env = VecNormalize(env)
    set_global_seeds(vv['seed'])
    policy = MlpPolicy
    model = ppo2.learn(policy=policy, env=env, nsteps=vv['path_length'],
                       nminibatches=25, lam=0.95, gamma=vv['discount'],
                       noptepochs=10, log_interval=1, ent_coef=0.0, lr=3e-4,
                       cliprange=0.2, total_timesteps=vv['total_timesteps'])

def __call__(self, env_maker, seed=None, monitor_file=None):
    """
    :param env_maker: instance of roam_learning.robot_env.EnvMaker
    :param seed: int that is used to generate seeds for vectorized envs
    :param monitor_file: path to a .csv file to log episode rewards, lengths, etc., of the vectorized envs
    :return: instance of either DummyVecEnv, SubprocVecEnv or ShmemVecEnv
    """
    # Create a list of env makers
    if seed is not None:
        assert isinstance(seed, int)
    env_makers = []
    for i in range(self.nenvs):
        env_makers += [deepcopy(env_maker)]
        if seed is not None:
            seed = hash_seed(seed)
            env_makers[i].set_seed(seed + i)
    # Create the vectorized envs
    envs = self.vec_env_wrapper(env_makers)
    # Monitor the envs before normalization
    if monitor_file is not None:
        envs = VecMonitor(envs, filename=monitor_file)
    if self.normalize_obs or self.normalize_ret:
        envs = VecNormalize(envs, ob=self.normalize_obs, ret=self.normalize_ret, use_tf=True)
    return envs

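# Illustrative only: how an object exposing the __call__ above might be used.
# The factory and env_maker arguments are hypothetical stand-ins for an
# instance of the enclosing class and a roam_learning EnvMaker, respectively.
def _example_vec_env_factory(factory, env_maker):
    envs = factory(env_maker, seed=0, monitor_file='logs/monitor.csv')
    return envs.reset()
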
def __init__(self):
    # Create an instance of the network itself, as well as the memory.
    # Here is also a good place to set environment parameters,
    # as well as training parameters - number of episodes / iterations, etc.
    args.log_dir = args.log_dir + args.env_name + '_' + args.algo
    try:
        os.makedirs(args.log_dir)
    except OSError:
        files = glob.glob(os.path.join(args.log_dir, '*.monitor.csv'))
        for f in files:
            os.remove(f)

    envs = [make_env(args.env_name, args.seed, i, args.log_dir)
            for i in range(args.num_processes)]
    if args.num_processes > 1:
        envs = SubprocVecEnv(envs)
    else:
        envs = DummyVecEnv(envs)
    if len(envs.observation_space.shape) == 1:
        envs = VecNormalize(envs)

    self.environment_name = args.env_name
    self.agent = VecEnvAgent(envs, args)

def main():
    print("#######")
    print("WARNING: All rewards are clipped or normalized so you need to use a monitor (see envs.py) or visdom plot to get true rewards")
    print("#######")
    os.environ['OMP_NUM_THREADS'] = '1'

    if args.vis:
        from visdom import Visdom
        viz = Visdom(port=args.port)
        win = None

    envs = [make_env(args.env_name, args.seed, i, args.log_dir)
            for i in range(args.num_processes)]
    if args.num_processes > 1:
        envs = SubprocVecEnv(envs)
    else:
        envs = DummyVecEnv(envs)
    if len(envs.observation_space.shape) == 1:
        envs = VecNormalize(envs)

    agent = VecEnvAgent(envs, args)
    agent.train_maml(num_updates)

def train(num_timesteps, seed):
    ncpu = 1
    config = tf.ConfigProto(allow_soft_placement=True,
                            intra_op_parallelism_threads=ncpu,
                            inter_op_parallelism_threads=ncpu)
    tf.Session(config=config).__enter__()

    def make_env():
        env = PointEnv()
        env = bench.Monitor(env, logger.get_dir(), allow_early_resets=True)
        return env

    env = DummyVecEnv([make_env])
    env = VecNormalize(env, ret=False, cliprew=200)
    set_global_seeds(seed)
    policy = MlpPolicy
    model = ppo2.learn(policy=policy, env=env, nsteps=100, nminibatches=25,
                       lam=0.95, gamma=0.99, noptepochs=10, log_interval=1,
                       ent_coef=0.0, lr=3e-4, cliprange=0.2,
                       total_timesteps=num_timesteps)
    return model, env

def train():
    config = tf.ConfigProto(allow_soft_placement=True,
                            intra_op_parallelism_threads=args.num_cpus,
                            inter_op_parallelism_threads=args.num_cpus)
    tf.Session(config=config).__enter__()

    env = RemoteVecEnv([create_env] * args.num_cpus)
    env = VecNormalize(env, ret=True, gamma=args.gamma)

    ppo2.learn(policy=policies.MlpPolicy,
               env=env,
               total_timesteps=args.num_timesteps,
               nminibatches=args.num_minibatches,
               nsteps=args.num_steps,
               noptepochs=args.num_epochs,
               lr=args.learning_rate,
               gamma=args.gamma,
               lam=args.lam,
               ent_coef=args.ent_coef,
               vf_coef=args.vf_coef,
               cliprange=args.clip_range,
               log_interval=args.log_interval,
               save_interval=args.save_interval,
               load_path=args.checkpoint_path,
               num_casks=args.num_casks)

def train(angle, num_timesteps, seed):
    from baselines.common import set_global_seeds
    from baselines.ppo2 import ppo2
    from baselines.ppo2.policies import MlpPolicy
    import gym
    import tensorflow as tf
    from baselines.common.vec_env.vec_normalize import VecNormalize
    from baselines.common.vec_env.dummy_vec_env import DummyVecEnv

    with tf.Session() as sess:
        def make_env():
            return ant_env(angle)
            # env = gym.make('Ant-v1')
            # return env

        env = DummyVecEnv([make_env])
        env = VecNormalize(env)
        # env = ant_env(angle)
        set_global_seeds(seed)
        policy = MlpPolicy
        ppo2.learn(policy=policy, env=env, nsteps=2048, nminibatches=32,
                   lam=0.95, gamma=0.99, noptepochs=10, log_interval=10,
                   ent_coef=0.0, lr=3e-4, cliprange=0.2,
                   total_timesteps=num_timesteps)

def build_env(args):
    ncpu = multiprocessing.cpu_count()
    if sys.platform == 'darwin':
        ncpu //= 2
    nenv = args.num_env or ncpu
    alg = args.alg
    seed = args.seed

    env_type, env_id = get_env_type(args.env)
    if env_type in {'atari', 'retro'}:
        if alg == 'deepq':
            env = make_env(env_id, env_type, seed=seed, wrapper_kwargs={'frame_stack': True})
        elif alg == 'trpo_mpi':
            env = make_env(env_id, env_type, seed=seed)
        else:
            frame_stack_size = 4
            env = make_vec_env(env_id, env_type, nenv, seed,
                               gamestate=args.gamestate,
                               reward_scale=args.reward_scale)
            env = VecFrameStack(env, frame_stack_size)
    else:
        config = tf.ConfigProto(allow_soft_placement=True,
                                intra_op_parallelism_threads=1,
                                inter_op_parallelism_threads=1)
        config.gpu_options.allow_growth = True
        get_session(config=config)

        env = make_vec_env(env_id, env_type, args.num_env or 1, seed,
                           reward_scale=args.reward_scale)
        if env_type == 'mujoco':
            env = VecNormalize(env)
    return env

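# A usage sketch for the build_env above, assuming an argparse-style namespace
# carrying the fields the function reads; the concrete values are made up.
def _example_build_env():
    import argparse
    args = argparse.Namespace(env='Ant-v2', alg='ppo2', num_env=8, seed=0,
                              gamestate=None, reward_scale=1.0)
    return build_env(args)
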
def train(env_id, num_timesteps, seed):
    """
    Train PPO2 model for Mujoco environment, for testing purposes

    :param env_id: (str) the environment id string
    :param num_timesteps: (int) the number of timesteps to run
    :param seed: (int) Used to seed the random generator.
    """
    def make_env():
        env_out = gym.make(env_id)
        env_out = bench.Monitor(env_out, logger.get_dir(), allow_early_resets=True)
        return env_out

    env = DummyVecEnv([make_env])
    env = VecNormalize(env)
    set_global_seeds(seed)
    policy = MlpPolicy
    model = ppo2.learn(policy=policy, env=env, n_steps=2048, nminibatches=32,
                       lam=0.95, gamma=0.99, noptepochs=10, log_interval=1,
                       ent_coef=0.0, learning_rate=3e-4, cliprange=0.2,
                       total_timesteps=num_timesteps)
    return model, env

def build_env(args):
    ncpu = multiprocessing.cpu_count()
    if sys.platform == 'darwin':
        ncpu //= 2
    nenv = args.num_env or ncpu
    alg = args.alg
    seed = args.seed

    env_type, env_id = get_env_type(args.env)
    if env_type in {'atari', 'retro'}:
        if alg == 'deepq':
            env = make_env(env_id, env_type, seed=seed, wrapper_kwargs={'frame_stack': True})
        elif alg == 'trpo_mpi':
            env = make_env(env_id, env_type, seed=seed)
        else:
            frame_stack_size = 4
            env = make_vec_env(env_id, env_type, nenv, seed,
                               gamestate=args.gamestate,
                               reward_scale=args.reward_scale)
            env = VecFrameStack(env, frame_stack_size)
    else:
        config = tf.ConfigProto(allow_soft_placement=True,
                                intra_op_parallelism_threads=1,
                                inter_op_parallelism_threads=1,
                                gpu_options=tf.GPUOptions(per_process_gpu_memory_fraction=0.20))
        config.gpu_options.allow_growth = True
        get_session(config=config)

        flatten_dict_observations = alg not in {'her'}
        env = make_vec_env(env_id, env_type, args.num_env or 1, seed,
                           reward_scale=args.reward_scale,
                           flatten_dict_observations=flatten_dict_observations)
        normalize_value = args.normalize_value
        if (env_type == 'mujoco' or env_type == 'roboschool') and normalize_value:
            env = VecNormalize(env)
    return env

def train(env_id, num_timesteps, seed):
    from baselines.common import set_global_seeds
    from baselines.common.vec_env.vec_normalize import VecNormalize
    from baselines.ppo2 import ppo2
    from baselines.ppo2.policies import MlpPolicy
    import gym
    import tensorflow as tf
    from baselines.common.vec_env.dummy_vec_env import DummyVecEnv

    ncpu = 1
    config = tf.ConfigProto(allow_soft_placement=True,
                            intra_op_parallelism_threads=ncpu,
                            inter_op_parallelism_threads=ncpu)
    tf.Session(config=config).__enter__()

    def make_env():
        env = gym.make(env_id)
        env = bench.Monitor(env, logger.get_dir())
        return env

    env = DummyVecEnv([make_env])
    env = VecNormalize(env)
    set_global_seeds(seed)
    policy = MlpPolicy
    ppo2.learn(policy=policy, env=env, nsteps=2048, nminibatches=32,
               lam=0.95, gamma=0.99, noptepochs=10, log_interval=1,
               ent_coef=0.0, lr=3e-4, cliprange=0.2,
               total_timesteps=num_timesteps)

def train(env_id, num_timesteps, seed, d_targ, load, point):
    from baselines.common import set_global_seeds
    from baselines.common.vec_env.vec_normalize import VecNormalize
    from baselines.ppo2 import ppo2
    from baselines.ppo2.policies import LstmMlpPolicy, MlpPolicy
    import gym
    # import roboschool
    import multiprocessing
    import tensorflow as tf
    from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv
    from baselines.common.vec_env.dummy_vec_env import DummyVecEnv

    def make_env(rank):
        def _thunk():
            env = gym.make(env_id)
            env.seed(seed + rank)
            env = bench.Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
            return env
        return _thunk

    set_global_seeds(seed)
    ncpu = multiprocessing.cpu_count()
    config = tf.ConfigProto(allow_soft_placement=True,
                            intra_op_parallelism_threads=ncpu,
                            inter_op_parallelism_threads=ncpu)
    tf.Session(config=config).__enter__()

    nenvs = 32
    env = SubprocVecEnv([make_env(i) for i in range(nenvs)])
    env = VecNormalize(env)
    policy = MlpPolicy

    def adaptive_lr(lr, kl, d_targ):
        if kl < (d_targ / 1.5):
            lr *= 2.
        elif kl > (d_targ * 1.5):
            lr *= .5
        return lr

    ppo2.learn(policy=policy, env=env, nsteps=512, nminibatches=4,
               lam=0.95, gamma=0.99, noptepochs=15, log_interval=1,
               ent_coef=0.00, lr=adaptive_lr, cliprange=0.2,
               total_timesteps=num_timesteps, load=load, point=point,
               init_targ=d_targ)

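# A small worked example of the adaptive learning-rate rule used above: the
# rate doubles when the observed KL divergence drops below d_targ / 1.5,
# halves when it exceeds d_targ * 1.5, and is otherwise left unchanged.
def _example_adaptive_lr():
    def adaptive_lr(lr, kl, d_targ):
        if kl < (d_targ / 1.5):
            lr *= 2.
        elif kl > (d_targ * 1.5):
            lr *= .5
        return lr

    assert adaptive_lr(3e-4, kl=0.001, d_targ=0.01) == 6e-4    # KL too small -> double
    assert adaptive_lr(3e-4, kl=0.02, d_targ=0.01) == 1.5e-4   # KL too large -> halve
    assert adaptive_lr(3e-4, kl=0.01, d_targ=0.01) == 3e-4     # within band -> unchanged
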
def build_env(args, silent_monitor, prio_args=None):
    ncpu = multiprocessing.cpu_count()
    if sys.platform == 'darwin':
        ncpu //= 2
    nenv = args.num_env or ncpu
    alg = args.alg
    seed = args.seed

    env_type, env_id = get_env_type(args.env)
    if env_type in {'atari', 'retro'}:
        if alg == 'deepq':
            env = make_env(env_id, env_type, seed=seed, wrapper_kwargs={'frame_stack': True})
        elif alg == 'trpo_mpi':
            env = make_env(env_id, env_type, seed=seed)
        else:
            frame_stack_size = 4
            env = make_vec_env(env_id, env_type, nenv, seed,
                               gamestate=args.gamestate,
                               reward_scale=args.reward_scale,
                               prio_args=prio_args,
                               silent_monitor=silent_monitor)
            if prio_args is None:
                env = VecFrameStack(env, frame_stack_size)
            else:
                env = PrioVecFrameStack(env, frame_stack_size)  # TODO: prio vec frame stack
    else:
        config = tf.ConfigProto(allow_soft_placement=True,
                                intra_op_parallelism_threads=1,
                                inter_op_parallelism_threads=1)
        config.gpu_options.allow_growth = True
        get_session(config=config)

        num_env = args.n_active_envs if prio_args is None else args.num_env
        flatten_dict_observations = alg not in {'her'}
        env = make_vec_env(env_id, env_type, num_env or 1, seed,
                           reward_scale=args.reward_scale,
                           flatten_dict_observations=flatten_dict_observations,
                           prio_args=prio_args,
                           silent_monitor=silent_monitor)
        if env_type == 'mujoco':
            if prio_args is None:
                env = VecNormalize(env)
            else:
                env = PrioVecNormalize(env)
    return env

def Eval():
    def EnvFunc(iSeed):
        def InnerFunc():
            oEnv = Env()
            return oEnv
        return InnerFunc

    def linear_schedule(initial_value):
        def func(progress):
            return progress * initial_value
        return func

    learning_rate = linear_schedule(5e-4)
    clip_range = linear_schedule(0.2)
    n_timesteps = int(0)
    hyperparams = {'nsteps': 256, 'noptepochs': 8, 'nminibatches': 4,
                   'lr': learning_rate, 'cliprange': clip_range,
                   'vf_coef': 0.5, 'ent_coef': 0.01}
    num_env = 1
    env = SubprocVecEnv([EnvFunc(i) for i in range(num_env)])
    env = VecNormalize(env, ob=True, ret=False)
    env = VecMonitor(env)
    act = ppo2.learn(
        network="mlp",
        env=env,
        total_timesteps=n_timesteps,
        save_interval=100,
        load_path="baselineLog/ppobaseliens-2019-06-05-17-38-15-168854/checkpoints/00300",
        **hyperparams,
        value_network="copy"
    )
    obs = env.reset()
    print("obs", obs.shape)
    bDone = False
    iFrame = 0
    iReward = 0
    reward_list = deque(maxlen=100)
    while not bDone:
        action = act.step(obs)[0]
        obs, reward, done, _ = env.step(action)
        iReward += reward[0]
        # time.sleep(0.01)
        # print("reward", reward)
        iFrame += 1
        # env.render()
        if done[0]:
            obs = env.reset()
            reward_list.append(iReward)
            print("done.................", iFrame, iReward, sum(reward_list) / len(reward_list))
            iFrame = 0
            iReward = 0

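# A short sketch of how the linear_schedule above behaves, assuming baselines'
# ppo2 invokes the schedule with the remaining-progress fraction (1.0 at the
# start of training, decaying toward 0.0 at the end).
def _example_linear_schedule():
    def linear_schedule(initial_value):
        def func(progress):
            return progress * initial_value
        return func

    lr = linear_schedule(5e-4)
    assert lr(1.0) == 5e-4    # start of training: full learning rate
    assert lr(0.5) == 2.5e-4  # halfway through: half the learning rate
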
def build_env(args):
    ncpu = multiprocessing.cpu_count()
    if sys.platform == 'darwin':
        ncpu //= 2
    nenv = args.num_env or ncpu
    alg = args.alg
    seed = args.seed

    env_type, env_id = get_env_type(args.env)
    print(env_id)

    # Extract the agc_env_name
    noskip_idx = env_id.find("NoFrameskip")
    env_name = env_id[:noskip_idx].lower()
    print("Env Name for Masking:", env_name)

    if env_type in {'atari', 'retro'}:
        if alg == 'deepq':
            env = make_env(env_id, env_type, seed=seed, wrapper_kwargs={'frame_stack': True})
        elif alg == 'trpo_mpi':
            env = make_env(env_id, env_type, seed=seed)
        else:
            frame_stack_size = 4
            env = make_vec_env(env_id, env_type, nenv, seed,
                               gamestate=args.gamestate,
                               reward_scale=args.reward_scale)
            env = VecFrameStack(env, frame_stack_size)
    else:
        config = tf.ConfigProto(allow_soft_placement=True,
                                intra_op_parallelism_threads=1,
                                inter_op_parallelism_threads=1)
        config.gpu_options.allow_growth = True
        get_session(config=config)
        env = make_vec_env(env_id, env_type, args.num_env or 1, seed,
                           reward_scale=args.reward_scale)

    if args.custom_reward != '':
        from baselines.common.vec_env import VecEnv, VecEnvWrapper
        import baselines.common.custom_reward_wrapper as W
        assert isinstance(env, VecEnv) or isinstance(env, VecEnvWrapper)

        custom_reward_kwargs = eval(args.custom_reward_kwargs)

        if args.custom_reward == 'pytorch':
            if args.custom_reward_path == '':
                assert False, 'no path for reward model'
            else:
                env = W.VecPyTorchAtariReward(env, args.custom_reward_path, env_name)
        else:
            assert False, 'no such wrapper exists'

    if env_type == 'mujoco':
        env = VecNormalize(env)
    # if env_type == 'atari':
    #     input("Normalizing for Atari game: okay? [Enter]")
    #     # normalize rewards but not observations for atari
    #     env = VecNormalizeRewards(env)
    return env

def make(seed):
    def make_env():
        env = gym.make(env_id)
        env.seed(seed)
        env = bench.Monitor(env, logger.get_dir(), allow_early_resets=True)
        return env

    env = DummyVecEnv([make_env])
    env = VecNormalize(env)
    return env

def build_env(args, seed):
    nenv = 1
    alg = args.alg
    # seed = args.seed
    seed = int(np.random.rand(1) * 101000)
    print(seed)
    env_type, env_id = get_env_type(args.env)
    set_global_seeds(seed)

    if env_type in {'atari', 'retro'}:
        if alg == 'deepq':
            env = make_env(env_id, env_type, seed=seed, wrapper_kwargs={'frame_stack': True})
        elif alg == 'trpo_mpi':
            env = make_env(env_id, env_type, seed=seed)
        else:
            frame_stack_size = 4
            env = make_vec_env(env_id, env_type, nenv, seed,
                               gamestate=args.gamestate,
                               reward_scale=args.reward_scale)
            env = VecFrameStack(env, frame_stack_size)
    else:
        # config = tf.ConfigProto(allow_soft_placement=True,
        #                         intra_op_parallelism_threads=1,
        #                         inter_op_parallelism_threads=1)
        # config.gpu_options.allow_growth = True
        # get_session(config=config)
        sess = tf.InteractiveSession()
        # env = VecNormalize(make_vec_env(env_id, env_type, 1, seed, reward_scale=args.reward_scale))
        env = make_vec_env(env_id, env_type, args.numenv, seed,
                           reward_scale=args.reward_scale)
        evalenv = make_vec_env(env_id, env_type, args.numenv, seed,
                               reward_scale=args.reward_scale)
        if env_type == 'mujoco':
            env = VecNormalize(env)
            evalenv = VecNormalizeEval(evalenv)
            evalenv.ob_rms = env.ob_rms
            evalenv.ret_rms = env.ret_rms
    return env, sess, evalenv

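# Illustrative sketch of the statistics sharing done above: the evaluation
# wrapper reuses the training wrapper's running moments (ob_rms, ret_rms) so
# both normalize observations and returns identically. _sync_normalization is
# a hypothetical helper, assuming VecNormalizeEval freezes rather than updates
# the shared statistics during evaluation.
def _sync_normalization(train_env, eval_env):
    eval_env.ob_rms = train_env.ob_rms
    eval_env.ret_rms = train_env.ret_rms
    return eval_env
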
def train(env_id, num_timesteps, seed, nsteps, batch_size, epoch, method,
          net_size, i_trial, load_path, use_entr, ncpu):
    # rank = MPI.COMM_WORLD.Get_rank()
    # if rank != 0:
    #     logger.set_level(logger.DISABLED)
    config = tf.ConfigProto(allow_soft_placement=True,
                            intra_op_parallelism_threads=ncpu,
                            inter_op_parallelism_threads=ncpu)
    config.gpu_options.allow_growth = True
    # workerseed = seed + 10000 * rank
    tf.reset_default_graph()
    set_global_seeds(seed)

    def make_env(rank):
        def _thunk():
            env = gym.make(env_id)
            if logger.get_dir():
                env = bench.Monitor(env, os.path.join(logger.get_dir(), 'train-{}.monitor.json'.format(rank)))
            return env
        return _thunk

    # def make_env():
    #     env = gym.make(env_id)
    #     env = bench.Monitor(env, logger.get_dir(), allow_early_resets=True)
    #     return env

    env = SubprocVecEnv([make_env(i) for i in range(ncpu)])
    # env = DummyVecEnv([make_env])
    env = VecNormalize(env)

    with tf.Session(config=config) as sess:
        policy = MlpPolicy
        ppo2.learn(policy=policy, env=env, nsteps=nsteps, nminibatches=batch_size,
                   lam=0.95, gamma=0.99, noptepochs=epoch, log_interval=1,
                   ent_coef=0.01, lr=3e-4, cliprange=0.2,
                   total_timesteps=num_timesteps, useentr=use_entr,
                   net_size=net_size, i_trial=i_trial, load_path=load_path,
                   method=method)

def train(env_id, num_timesteps, seed, pol, cur, vis, model):
    from baselines.common import set_global_seeds
    from baselines.ppo2 import ppo2
    from baselines.ppo2.policies import HierPolicy, HierPolicy2, MlpPolicy, RandomWalkPolicy
    import gym
    import gym_program
    import tensorflow as tf
    from baselines.common.vec_env.vec_normalize import VecNormalize
    from baselines.common.vec_env.dummy_vec_env import DummyVecEnv

    ncpu = 1
    config = tf.ConfigProto(allow_soft_placement=True,
                            intra_op_parallelism_threads=ncpu,
                            inter_op_parallelism_threads=ncpu)
    tf.Session(config=config).__enter__()

    hier = pol in ('hier1', 'hier2')

    def make_env():
        set_global_seeds(seed)
        env = gym.make(env_id)
        env.set_curiosity(cur, model)
        env.set_hier(hier)
        env.set_visualize(vis)
        env = bench.Monitor(env, logger.get_dir())
        env.seed(seed)
        return env

    env = DummyVecEnv([make_env])
    env = VecNormalize(env)
    set_global_seeds(seed)

    if pol == 'hier1':
        policy = HierPolicy
    elif pol == 'hier2':
        policy = HierPolicy2
    elif pol == 'mlp':
        policy = MlpPolicy
    elif pol == 'random_walk':
        pol = RandomWalkPolicy
        pol(env)

    return ppo2.learn(policy=policy, env=env, pol=pol, nsteps=2048,
                      nminibatches=32, lam=0.95, gamma=0.99, noptepochs=10,
                      log_interval=1, ent_coef=0.0, lr=1e-4, cliprange=0.2,
                      total_timesteps=num_timesteps)

def get_env(env_name, no_normalize=False, out_dir="results", vector=8,
            reward_wrapper=lambda env: env):
    trained_agent = utils.get_trained_agent(env_name)

    ### ENV SETUP ###
    # TODO: upgrade Gym so this monkey-patch isn't needed
    gym.spaces.Dict = type(None)

    def make_env(id):
        # TODO: seed (not currently supported)
        # TODO: VecNormalize? (typically good for MuJoCo)
        # TODO: baselines logger?
        # TODO: we're loading identical policy weights into different
        # variables, this is to work-around design choice of Agent's
        # having state stored inside of them.
        sess = utils.make_session()
        with sess.as_default():
            multi_env, policy_type = utils.get_env_and_policy_type(env_name)
            multi_env = ShapeWeightHack(multi_env)
            single_env = MultiToSingle(
                DelayedLoadEnv(multi_env, trained_agent, policy_type,
                               "zoo_{}_policy_{}".format(env_name, id), 0, sess))
            if env_name == 'kick-and-defend':
                # attacked_agent = utils.load_agent(trained_agent, policy_type,
                #     "zoo_{}_policy_{}".format(env_name, id), multi_env, 0)
                # single_env = MultiToSingle(CurryEnv(multi_env, attacked_agent))
                single_env = HackyFixForGoalie(single_env)
            single_env = reward_wrapper(single_env)
            single_env = Gymify(single_env)
            single_env.spec = gym.envs.registration.EnvSpec('Dummy-v0')
            # TODO: upgrade Gym so don't have to do this
            single_env.observation_space.dtype = np.dtype(np.float32)
        single_env = Monitor(single_env, osp.join(out_dir, 'mon', 'log{}'.format(id)))
        return single_env
        # TODO: close session?

    venv = SubprocVecEnv([functools.partial(make_env, i) for i in range(vector)])
    if not no_normalize:
        venv = VecNormalize(venv)
    return venv

def create_environment(self):
    envs = [make_env(i, args, True, self.gan_file)
            for i in range(self.num_processes)]
    envs = DummyVecEnv(envs)
    if len(envs.observation_space.shape) == 1:
        envs = VecNormalize(envs, gamma=args.gamma)

    obs_shape = envs.observation_space.shape
    obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])
    return envs, obs_shape

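# A worked example of the obs_shape arithmetic above, assuming image
# observations of shape (C, H, W): stacking num_stack frames multiplies only
# the channel dimension, leaving height and width untouched.
def _example_stacked_shape():
    obs_shape = (4, 84, 84)   # (C, H, W)
    num_stack = 4
    stacked = (obs_shape[0] * num_stack, *obs_shape[1:])
    assert stacked == (16, 84, 84)
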
def run_baselines(env, seed, log_dir):
    """Create baselines model and training.

    Args:
        env (dict): Environment of the task.
        seed (int): Random positive integer for the trial.
        log_dir (str): Log dir path.

    Returns:
        str: Path to output csv file
    """
    ncpu = max(multiprocessing.cpu_count() // 2, 1)
    config = tf.ConfigProto(allow_soft_placement=True,
                            intra_op_parallelism_threads=ncpu,
                            inter_op_parallelism_threads=ncpu)
    tf.compat.v1.Session(config=config).__enter__()

    # Set up logger for baselines
    configure(dir=log_dir, format_strs=['stdout', 'log', 'csv', 'tensorboard'])
    baselines_logger.info('rank {}: seed={}, logdir={}'.format(
        0, seed, baselines_logger.get_dir()))

    env = DummyVecEnv([
        lambda: bench.Monitor(env, baselines_logger.get_dir(),
                              allow_early_resets=True)
    ])
    env = VecNormalize(env)

    set_global_seeds(seed)
    policy = MlpPolicy
    nbatch = env.num_envs * hyper_parameters['batch_size']
    training_batch_number = nbatch // hyper_parameters['training_batch_size']

    # import pdb; pdb.set_trace()
    # Use AdamOptimizer as the optimizer and share the value network with the policy
    ppo2.learn(policy=policy,
               env=env,
               nsteps=hyper_parameters['batch_size'],
               lam=hyper_parameters['gae_lambda'],
               gamma=hyper_parameters['discount'],
               ent_coef=hyper_parameters['policy_ent_coeff'],
               nminibatches=training_batch_number,
               noptepochs=hyper_parameters['training_epochs'],
               max_grad_norm=None,
               lr=hyper_parameters['learning_rate'],
               cliprange=hyper_parameters['lr_clip_range'],
               total_timesteps=hyper_parameters['batch_size'] *
               hyper_parameters['n_epochs'])  # yapf: disable  # noqa: E501

    return osp.join(log_dir, 'progress.csv')

def ppo():
    def make_env():
        env = SawyerEnvWrapper(DownEnv(for_her=False))
        return env

    tf.Session().__enter__()
    env = VecNormalize(DummyVecEnv([make_env]))
    policy = MlpPolicy
    model = ppo2.learn(policy=policy, env=env, nsteps=4000, nminibatches=1,
                       lam=0.95, gamma=0.99, noptepochs=10, log_interval=1,
                       ent_coef=0.0, lr=3e-4, cliprange=0.2,
                       total_timesteps=1e8)
    return model

def train(env_id, num_timesteps, seed):
    from baselines.common import set_global_seeds
    from baselines.common.vec_env.vec_normalize import VecNormalize
    from baselines.ppo2 import ppo2
    from baselines.ppo2.policies import MlpPolicy
    import gym
    import tensorflow as tf
    from baselines.common.vec_env.dummy_vec_env import DummyVecEnv

    ncpu = 1
    config = tf.ConfigProto(allow_soft_placement=True,
                            intra_op_parallelism_threads=ncpu,
                            inter_op_parallelism_threads=ncpu)
    tf.Session(config=config).__enter__()

    def make_env():
        if env_id == 'toy':
            # env = continuous_gridworld.ContinuousGridworld('', max_steps=1000,
            #     obstacle_mode=continuous_gridworld.NO_OBJECTS)
            from toy_environment import room_obstacle_list
            env = gridworld.Gridworld(obstacle_list_generator=room_obstacle_list.obstacle_list)
        elif env_id == 'navigate':
            env = NavigateEnv(use_camera=False, continuous_actions=True,
                              neg_reward=True, max_steps=500)
        elif env_id == 'arm2pos':
            # env = Arm2PosEnv(continuous=False, max_steps=500)
            pass
        else:
            env = gym.make(env_id)
        env = bench.Monitor(env, logger.get_dir())
        return env

    env = DummyVecEnv([make_env])
    env = VecNormalize(env)
    set_global_seeds(seed)
    policy = MlpPolicy
    ppo2.learn(policy=policy, env=env, nsteps=2048, nminibatches=32,
               lam=0.95, gamma=0.99, noptepochs=10, log_interval=1,
               ent_coef=0.0, lr=3e-4, cliprange=0.2,
               total_timesteps=num_timesteps)

def train(env_id, num_timesteps, seed, lrschedule, num_env):
    def make_env():
        env = gym.make(env_id)
        env = bench.Monitor(env, logger.get_dir(), allow_early_resets=True)
        return env

    env = DummyVecEnv([make_env])
    env = VecNormalize(env)
    set_global_seeds(seed)
    policy_fn = MlpPolicy
    learn(policy_fn, env, seed, total_timesteps=int(num_timesteps * 1.1),
          lrschedule=lrschedule)
    env.close()