def train(env_id, num_timesteps, seed, policy, lrschedule, num_env):
    if policy == 'cnn':
        policy_fn = CnnPolicy
    elif policy == 'lstm':
        policy_fn = LstmPolicy
    elif policy == 'lnlstm':
        policy_fn = LnLstmPolicy
    env = VecFrameStack(make_atari_env(env_id, num_env, seed), 4)
    learn(policy_fn, env, seed, total_timesteps=int(num_timesteps * 1.1),
          lrschedule=lrschedule)
    env.close()
def main():
    parser, args_default = arg_parser_common()
    args = parser.parse_args()
    import json
    from dotmap import DotMap
    from copy import copy, deepcopy

    keys_exclude = [
        'coef_predict_task', 'is_multiprocess', 'n_envs', 'eval_interval',
        'n_steps', 'n_minibatches', 'play', 'n_eval_epsiodes', 'force_write',
        'kl2clip_sharelogstd', 'policy_variance_state_dependent',
        'kl2clip_clip_clipratio', 'kl2clip_decay', 'lr', 'num_timesteps',
        'gradient_rectify', 'rectify_scale', 'kl2clip_clipcontroltype',
        'reward_scale', 'coef_predict_task', 'explore_additive_rate',
        'explore_additive_threshold', 'explore_timesteps', 'debug_halfcheetah',
        'name_project', 'n_opt_epochs', 'coef_entropy', 'log_interval',
        'save_interval', 'save_debug', 'isatari', 'env_full', 'envtype'
    ]
    keys_exclude.extend([
        'logstd', 'lam', 'hidden_sizes', 'num_layers', 'num_sharing_layers',
        'ac_fn', 'lam_decay', 'policy_type'
    ])
    # TODO: These args should only be used as part of the dir name if they are specified.
    # TODO: Split args into group_keys and run_keys.

    # -------------------- prepare args
    args.env_full = args.env
    args.env = args.env_full.split('-v')[0]
    if not args.isatari:
        args.envtype = MUJOCO
        if '-v' not in args.env_full:
            args.env_full = f'{args.env}-v2'
    else:
        keys_exclude.append('logstd')
        args.envtype = ATARI
        # if 'NoFrameskip' not in args.env:
        #     args.env = f''
        if '-v' not in args.env_full:
            args.env_full = f'{args.env}-v4'
    tools.warn_(f'Run with setting for {args.envtype} task!!!!!')

    assert bool(args.alg) != bool(args.cliptype), \
        'Either alg or cliptype should be specified'
    if args.alg:  # For release
        args.cliptype = alg2cliptype[args.alg]
        keys_exclude.append('cliptype')
        if len(args.keys_group) == 0:
            args.keys_group = ['alg']
        if args.name_group is None:
            args.name_group = ''
    else:  # For debug
        keys_exclude.append('alg')
        if len(args.keys_group) == 0:
            args.keys_group = ['cliptype', 'clipargs']
        if args.name_group is None:
            args.name_group = 'tmp'

    # ------ Set the values of args
    def update_dict(dictmain, dictnew):
        for key_arg in dictnew:
            if key_arg.startswith('__'):
                # A '__'-prefixed key holds values customized for a specific setting.
                key_interest = key_arg[2:]  # e.g., __cliptype
                # Look up the current value in dictmain, e.g., kl_klrollback_constant_withratio
                value_interest = dictmain[key_interest]
                if value_interest in dictnew[key_arg].keys():
                    dictmain = update_dict(dictmain, dictnew[key_arg][value_interest])
            else:
                if isinstance(dictnew[key_arg], dict) and key_arg in dictmain.keys():
                    dictmain[key_arg].update(dictnew[key_arg])
                else:
                    dictmain[key_arg] = copy(dictnew[key_arg])
        return dictmain

    def reform_specific_dict(d):
        dictmain = dict((k, v) for k, v in d.items() if not k.startswith('__'))
        dictspecific = dict((k, v) for k, v in d.items() if k.startswith('__'))
        return update_dict(dictmain, dictspecific)

    # If any of the following args is None, it is set from the built-in defaults below.
    keys_del = []
    args = vars(args)
    keys = list(args.keys())
    for key in keys:
        if args[key] is None:
            del args[key]  # delete the unset arg
            keys_del.append(key)
    if len(keys_del) > 0:
        print('The following args were not given values on the command line; '
              'built-in values will be used.\n', ', '.join(keys_del))
    # args__ = update_dict(copy(args_default), args)  # We need to update the basic args, e.g., env, cliptype
    # args__ = reform_specific_dict(args__)
    # The following operations may seem strange. A clearer version may be provided in the future.
    args__ = update_dict(deepcopy(args), args_default)  # generate the default values from args_default
    args = update_dict(args__, args)  # customized values take the highest priority
    for key in keys_del:  # make sure that keys_del are within args.keys()
        assert key in args.keys(), key
    # print(json.dumps(args, indent=True))
    # exit()
    # TODO prepare_dir: rename .finish_indicator to finish_indicator, which is clearer.

    # --- prepare dir
    import baselines
    root_dir = tools_logger.get_logger_dir('baselines', 'results', baselines)
    args = tools_logger.prepare_dirs(args, key_first='env', keys_exclude=keys_exclude,
                                     dirs_type=['log'], root_dir=root_dir)

    # --- prepare args for use
    args.cliptype = ClipType[args.cliptype]
    args.zip_dirs = ['model', 'monitor']
    for d in args.zip_dirs:
        args[f'{d}_dir'] = osp.join(args.log_dir, d)
        os.mkdir(args[f'{d}_dir'])

    from baselines.common import set_global_seeds
    from baselines.common.vec_env.vec_normalize import VecNormalize
    from baselines.ppo2_AdaClip import ppo2
    # from baselines.ppo2_AdaClip import ppo2_kl2clip_conservative as ppo2
    import baselines.ppo2_AdaClip.policies as plcs
    import gym
    import tensorflow as tf
    from baselines.common.vec_env.dummy_vec_env import DummyVecEnv

    ncpu = 1
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
    config = tf.ConfigProto(allow_soft_placement=True,
                            intra_op_parallelism_threads=ncpu,
                            inter_op_parallelism_threads=ncpu)
    config.gpu_options.allow_growth = True
    tf.Session(config=config).__enter__()
    set_global_seeds(args.seed)
    policy = getattr(plcs, args.policy_type)

    # ------ prepare env
    # args.eval_model = args.n_eval_epsiodes > 0
    if args.envtype == MUJOCO:
        def make_mujoco_env(rank=0):
            def _thunk():
                env = gym.make(args.env_full)
                env.seed(args.seed + rank)
                env = bench.Monitor(env,
                                    os.path.join(args.log_dir, 'monitor', str(rank)),
                                    allow_early_resets=True)
                return env
            return _thunk

        if args.n_envs == 1:
            env = DummyVecEnv([make_mujoco_env()])
        else:
            from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv
            env = SubprocVecEnv([make_mujoco_env(i) for i in range(args.n_envs)])
        env = VecNormalize(env, reward_scale=args.reward_scale)

        env_test = None
        if args.n_eval_epsiodes > 0:
            if args.n_eval_epsiodes == 1:
                env_test = DummyVecEnv([make_mujoco_env()])
            else:
                from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv
                env_test = SubprocVecEnv(
                    [make_mujoco_env(i) for i in range(args.n_eval_epsiodes)])
            env_test = VecNormalize(env_test, ret=False, update=False)  # no need to normalize returns
    else:
        from baselines.common.vec_env.vec_frame_stack import VecFrameStack
        from baselines.common.cmd_util import make_atari_env
        env = VecFrameStack(
            make_atari_env(args.env_full, num_env=args.n_envs, seed=args.seed), 4)
        env_test = None
        # TODO: debug VecFrameStack
        if args.n_eval_epsiodes > 0:
            env_test = VecFrameStack(
                make_atari_env(args.env_full, num_env=args.n_eval_epsiodes, seed=args.seed), 4)
        # env_test.reset()
        # env_test.render()

    # ----------- learn
    if args.envtype == MUJOCO:
        lr = args.lr
        # cliprange = args.clipargs.cliprange
    elif args.envtype == ATARI:
        lr = lambda f: f * args.lr
        # cliprange = lambda f: f * args.clipargs.cliprange if args.clipargs.cliprange is not None else None
    # print('action_space', env.action_space)
    ppo2.learn(policy=policy, env=env, env_eval=env_test, n_steps=args.n_steps,
               nminibatches=args.n_minibatches, lam=args.lam, gamma=0.99,
               n_opt_epochs=args.n_opt_epochs, log_interval=args.log_interval,
               ent_coef=args.coef_entropy, lr=lr, total_timesteps=args.num_timesteps,
               cliptype=args.cliptype, save_interval=args.save_interval, args=args)
    tools_logger.finish_dir(args.log_dir)
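# A minimal, self-contained sketch of the '__'-prefixed override convention used by
# update_dict in main() above. The keys and values here ('cliptype', 'lr', 'kl2clip')
# are hypothetical and only illustrate the merge behaviour: a '__<key>' entry supplies
# extra defaults that apply only when dictmain[<key>] matches one of its sub-keys.
from copy import copy


def update_dict(dictmain, dictnew):
    for key_arg in dictnew:
        if key_arg.startswith('__'):
            key_interest = key_arg[2:]
            value_interest = dictmain[key_interest]
            if value_interest in dictnew[key_arg].keys():
                dictmain = update_dict(dictmain, dictnew[key_arg][value_interest])
        else:
            if isinstance(dictnew[key_arg], dict) and key_arg in dictmain.keys():
                dictmain[key_arg].update(dictnew[key_arg])
            else:
                dictmain[key_arg] = copy(dictnew[key_arg])
    return dictmain


defaults = {
    'lr': 3e-4,
    '__cliptype': {'kl2clip': {'lr': 1e-4}},  # applied only when cliptype == 'kl2clip'
}
args = {'cliptype': 'kl2clip'}
print(update_dict(dict(args), defaults))  # -> {'cliptype': 'kl2clip', 'lr': 0.0001}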
def learn(network, env, seed=None, nsteps=20, total_timesteps=int(80e6), q_coef=0.5,
          ent_coef=0.01, max_grad_norm=10, lr=7e-4, lrschedule='linear',
          rprop_epsilon=1e-5, rprop_alpha=0.99, gamma=0.99, log_interval=100,
          buffer_size=50000, replay_ratio=4, replay_start=10000, c=10.0,
          trust_region=True, alpha=0.99, delta=1, load_path=None, **network_kwargs):
    '''
    Main entrypoint for ACER (Actor-Critic with Experience Replay)
    (https://arxiv.org/pdf/1611.01224.pdf).
    Trains an agent with the given network architecture on the given environment using ACER.

    Parameters:
    ----------

    network:          policy network architecture. Either a string (mlp, lstm, lnlstm, cnn_lstm,
                      cnn, cnn_small, conv_only - see baselines.common/models.py for the full list)
                      specifying a standard architecture, or a function that takes a tensorflow
                      tensor as input and returns a tuple (output_tensor, extra_feed), where
                      output_tensor is the last network layer output and extra_feed is None for
                      feed-forward nets or a dictionary describing how to feed state into the
                      network for recurrent nets. See baselines.common/policies.py/lstm for more
                      details on using recurrent nets in policies.

    env:              environment. Needs to be vectorized for parallel environment simulation.
                      Environments produced by gym.make can be wrapped with the
                      baselines.common.vec_env.DummyVecEnv class.

    nsteps:           int, number of steps of the vectorized environment per update (i.e. batch
                      size is nsteps * nenv, where nenv is the number of environment copies
                      simulated in parallel) (default: 20)

    nstack:           int, size of the frame stack, i.e. number of frames passed to the step model.
                      Frames are stacked along the channel dimension (last image dimension)
                      (default: 4)

    total_timesteps:  int, number of timesteps (i.e. number of actions taken in the environment)
                      (default: 80M)

    q_coef:           float, value function loss coefficient in the optimization objective
                      (analog of vf_coef for other actor-critic methods)

    ent_coef:         float, policy entropy coefficient in the optimization objective
                      (default: 0.01)

    max_grad_norm:    float, gradient norm clipping coefficient. If set to None, no clipping
                      (default: 10)

    lr:               float, learning rate for RMSProp (the current implementation has RMSProp
                      hardcoded in) (default: 7e-4)

    lrschedule:       schedule of the learning rate. Can be 'linear', 'constant', or a function
                      [0..1] -> [0..1] that takes the fraction of training progress as input and
                      returns the fraction of the learning rate (specified as lr) as output.

    rprop_epsilon:    float, RMSProp epsilon (stabilizes the square root computation in the
                      denominator of the RMSProp update) (default: 1e-5)

    rprop_alpha:      float, RMSProp decay parameter (default: 0.99)

    gamma:            float, reward discount factor (default: 0.99)

    log_interval:     int, number of updates between logging events (default: 100)

    buffer_size:      int, size of the replay buffer (default: 50k)

    replay_ratio:     int, how many (on average) batches of data to sample from the replay buffer
                      after each batch collected from the environment (default: 4)

    replay_start:     int, sampling from the replay buffer does not start until the buffer contains
                      at least this many samples (default: 10k)

    c:                float, importance weight clipping factor (default: 10)

    trust_region:     bool, whether the algorithm estimates the KL divergence between the old and
                      updated policy and uses it to determine the step size (default: True)

    delta:            float, max KL divergence between the old policy and the updated policy
                      (default: 1)

    alpha:            float, momentum factor in the Polyak (exponential moving average) averaging
                      of the model parameters (default: 0.99)

    load_path:        str, path to load the model from (default: None)

    **network_kwargs: keyword arguments to the policy / network builder. See
                      baselines.common/policies.py/build_policy and the arguments of the particular
                      network type. For instance, the 'mlp' architecture has num_hidden and
                      num_layers arguments.
    '''

    print("Running Acer Simple")
    print(locals())
    set_global_seeds(seed)
    if not isinstance(env, VecFrameStack):
        env = VecFrameStack(env, 1)

    policy = build_policy(env, network, estimate_q=True, **network_kwargs)
    nenvs = env.num_envs
    ob_space = env.observation_space
    ac_space = env.action_space
    nstack = env.nstack
    model = Model(policy=policy, ob_space=ob_space, ac_space=ac_space, nenvs=nenvs,
                  nsteps=nsteps, ent_coef=ent_coef, q_coef=q_coef, gamma=gamma,
                  max_grad_norm=max_grad_norm, lr=lr, rprop_alpha=rprop_alpha,
                  rprop_epsilon=rprop_epsilon, total_timesteps=total_timesteps,
                  lrschedule=lrschedule, c=c, trust_region=trust_region, alpha=alpha,
                  delta=delta)

    runner = Runner(env=env, model=model, nsteps=nsteps)
    if replay_ratio > 0:
        buffer = Buffer(env=env, nsteps=nsteps, size=buffer_size)
    else:
        buffer = None
    nbatch = nenvs * nsteps
    acer = Acer(runner, model, buffer, log_interval)
    acer.tstart = time.time()

    # nbatch samples per iteration: one on-policy call and multiple off-policy calls
    for acer.steps in range(0, total_timesteps, nbatch):
        acer.call(on_policy=True)
        if replay_ratio > 0 and buffer.has_atleast(replay_start):
            n = np.random.poisson(replay_ratio)
            for _ in range(n):
                acer.call(on_policy=False)  # no simulation steps in this

    return model
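# A minimal usage sketch for the ACER learn() entrypoint above, assuming this is the
# refactored baselines version whose build_policy accepts a network name string, and
# assuming the standard helpers make_atari_env and VecFrameStack are importable.
# The environment id and hyperparameter values are illustrative, not prescriptive.
from baselines.common.cmd_util import make_atari_env
from baselines.common.vec_env.vec_frame_stack import VecFrameStack

env = VecFrameStack(make_atari_env('BreakoutNoFrameskip-v4', num_env=4, seed=0), 4)
model = learn(network='cnn', env=env, seed=0, nsteps=20, total_timesteps=int(1e6),
              lrschedule='linear')
env.close()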
def train(env_id, num_timesteps, seed, num_cpu):
    env = VecFrameStack(make_atari_env(env_id, num_cpu, seed), 4)
    policy_fn = CnnPolicy
    learn(policy_fn, env, seed, total_timesteps=int(num_timesteps * 1.1), nprocs=num_cpu)
    env.close()
# with tf.variable_scope("global"):
#     network = LSTMPolicy([42, 42, 1], 6)
# saver = tf.train.Saver()
# sess = tf.Session()
# saver.restore(sess, "/tmp/pong/train/model.ckpt-1894351")

tf.reset_default_graph()
sess = tf.Session()


class Actor():
    def __init__(self, ob_space, ac_space, n_batch, n_steps):
        self.network = CnnPolicy(sess, ob_space, ac_space, n_batch, n_steps)
        saver = tf.train.Saver()
        saver.restore(sess, "./checkpoints/model.ckpt")

    def act(self, state):
        stuff = self.network.step(state)
        action, value_, _ = stuff[0], stuff[1], stuff[2:]
        return action, value_


env = VecFrameStack(make_atari_env('PongNoFrameskip-v4', 1, 123), 4)
ob_space = env.observation_space
ac_space = env.action_space
actor = Actor(ob_space, ac_space, 32, 1)
with sess:
    print(actor.act(np.ones((32, 84, 84, 4))))
from baselines.common.cmd_util import make_atari_env
from baselines.common.vec_env.vec_frame_stack import VecFrameStack
from arguments import get_args
from a2c_agent import a2c_agent
from baselines import logger

if __name__ == '__main__':
    args = get_args()
    logger.configure(dir=args.log_dir)
    # create environments
    envs = VecFrameStack(make_atari_env(args.env_name, args.num_processes, args.seed), 4)
    trainer = a2c_agent(envs, args)
    trainer.learn()
    envs.close()
from arguments import get_args
from ppo_agent import ppo_agent
from baselines.common.cmd_util import make_atari_env
from baselines.common.vec_env.vec_frame_stack import VecFrameStack
from models import CNN_Net
from baselines import logger
import os

if __name__ == '__main__':
    args = get_args()
    if not os.path.exists('logs/'):
        os.mkdir('logs/')
    log_path = 'logs/' + args.env_name + '/'
    if not os.path.exists(log_path):
        os.mkdir(log_path)
    # write log information
    logger.configure(dir=log_path)
    envs = VecFrameStack(make_atari_env(args.env_name, args.num_workers, args.seed), 4)
    network = CNN_Net(envs.action_space.n)
    ppo_trainer = ppo_agent(envs, args, network, 'atari')
    ppo_trainer.learn()
def main():
    parser, clipargs_default_all, args_default_all = arg_parser_common()
    args = parser.parse_args()
    import json
    from dotmap import DotMap

    keys_exclude = [
        'coef_predict_task', 'is_multiprocess', 'n_envs', 'eval_interval', 'n_steps',
        'n_minibatches', 'play', 'n_eval_epsiodes', 'force_write', 'kl2clip_sharelogstd',
        'policy_variance_state_dependent', 'kl2clip_clip_clipratio', 'kl2clip_decay', 'lr',
        'num_timesteps', 'gradient_rectify', 'rectify_scale', 'kl2clip_clipcontroltype',
        'reward_scale', 'coef_predict_task', 'explore_additive_rate',
        'explore_additive_threshold', 'explore_timesteps', 'debug_halfcheetah',
        'name_project', 'env_pure', 'n_opt_epochs', 'coef_entropy', 'log_interval',
        'save_interval', 'save_debug', 'is_atari'
    ]  # 'is_atari'

    # -------------------- prepare args
    args.env_pure = args.env.split('-v')[0]
    # env_mujocos = 'InvertedPendulum,InvertedDoublePendulum,HalfCheetah,Hopper,Walker2d,Ant,Reacher,Swimmer,Humanoid'
    # env_mujocos = tools.str2list(env_mujocos)
    if not args.is_atari:
        env_type = MUJOCO
        if '-v' not in args.env:
            args.env = f'{args.env}-v2'
    else:
        env_type = ATARI
        if '-v' not in args.env:
            args.env = f'{args.env}-v4'
    tools.warn_(f'Run with setting for {env_type} task!!!!!')

    # --- set value of clipargs
    clipargs_default = clipargs_default_all[env_type]
    clipargs = clipargs_default[args.cliptype].copy()
    clipargs.update(args.clipargs)
    args.clipargs = clipargs

    # --- prepare other args
    # If any of the following args is None, it is set from the built-in defaults below.
    args_default = args_default_all[env_type]
    args = DotMap(vars(args))
    print("The following arg values are None, so they are set to built-in values:")
    for argname in args_default.keys():
        if args[argname] is None:
            if args.env_pure in args_default[argname].keys():
                args[argname] = args_default[argname][args.env_pure]
            else:
                args[argname] = args_default[argname]['_default']
            print(f"{argname}={args[argname]}")
    # print(json.dumps(args.toDict(), indent='\t'))
    # exit()
    # TODO prepare_dir: rename .finish_indicator to finish_indicator, which is clearer.

    # --- prepare dir
    import baselines
    root_dir = tools_logger.get_logger_dir('baselines', baselines, 'results')
    args = tools_logger.prepare_dirs(args, key_first='env', keys_exclude=keys_exclude,
                                     dirs_type=['log'], root_dir=root_dir)

    # --- prepare args for use
    args.cliptype = ClipType[args.cliptype]
    args.zip_dirs = ['model', 'monitor']
    for d in args.zip_dirs:
        args[f'{d}_dir'] = osp.join(args.log_dir, d)
        os.mkdir(args[f'{d}_dir'])

    from baselines.common import set_global_seeds
    from baselines.common.vec_env.vec_normalize import VecNormalize
    from baselines.ppo2_AdaClip import ppo2
    # from baselines.ppo2_AdaClip import ppo2_kl2clip_conservative as ppo2
    import baselines.ppo2_AdaClip.policies as plcs
    import gym
    import tensorflow as tf
    from baselines.common.vec_env.dummy_vec_env import DummyVecEnv

    ncpu = 1
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
    config = tf.ConfigProto(allow_soft_placement=True,
                            intra_op_parallelism_threads=ncpu,
                            inter_op_parallelism_threads=ncpu)
    config.gpu_options.allow_growth = True
    tf.Session(config=config).__enter__()
    set_global_seeds(args.seed)
    policy = getattr(plcs, args.policy_type)

    # ------ prepare env
    # args.eval_model = args.n_eval_epsiodes > 0
    if env_type == MUJOCO:
        def make_mujoco_env(rank=0):
            def _thunk():
                env = gym.make(args.env)
                env.seed(args.seed + rank)
                env = bench.Monitor(env,
                                    os.path.join(args.log_dir, 'monitor', str(rank)),
                                    allow_early_resets=True)
                return env
            return _thunk

        if args.n_envs == 1:
            env = DummyVecEnv([make_mujoco_env()])
        else:
            from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv
            env = SubprocVecEnv([make_mujoco_env(i) for i in range(args.n_envs)])
        env = VecNormalize(env, reward_scale=args.reward_scale)

        env_test = None
        if args.n_eval_epsiodes > 0:
            if args.n_eval_epsiodes == 1:
                env_test = DummyVecEnv([make_mujoco_env()])
            else:
                from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv
                env_test = SubprocVecEnv(
                    [make_mujoco_env(i) for i in range(args.n_eval_epsiodes)])
            env_test = VecNormalize(env_test, ret=False, update=False)  # no need to normalize returns
    else:
        from baselines.common.vec_env.vec_frame_stack import VecFrameStack
        from baselines.common.cmd_util import make_atari_env
        env = VecFrameStack(make_atari_env(args.env, num_env=args.n_envs, seed=args.seed), 4)
        env_test = None
        # TODO: debug VecFrameStack
        if args.n_eval_epsiodes > 0:
            env_test = VecFrameStack(
                make_atari_env(args.env, num_env=args.n_eval_epsiodes, seed=args.seed), 4)
        # env_test.reset()
        # env_test.render()

    # ----------- learn
    if env_type == MUJOCO:
        lr = args.lr
        # cliprange = args.clipargs.cliprange
    elif env_type == ATARI:
        lr = lambda f: f * args.lr
        # cliprange = lambda f: f * args.clipargs.cliprange if args.clipargs.cliprange is not None else None
    args.env_type = env_type
    ppo2.learn(policy=policy, env=env, env_eval=env_test, n_steps=args.n_steps,
               nminibatches=args.n_minibatches, lam=args.lam, gamma=0.99,
               n_opt_epochs=args.n_opt_epochs, log_interval=args.log_interval,
               ent_coef=args.coef_entropy, lr=lr, total_timesteps=args.num_timesteps,
               cliptype=args.cliptype, save_interval=args.save_interval, args=args)
    tools_logger.finish_dir(args.log_dir)
def build_env(args):
    ncpu = multiprocessing.cpu_count()
    if sys.platform == 'darwin':
        ncpu //= 2
    nenv = args.num_env or ncpu
    alg = args.alg
    rank = MPI.COMM_WORLD.Get_rank() if MPI else 0
    seed = args.seed

    env_type, env_id = get_env_type(args.env)
    if env_type == 'atari':
        if alg == 'acer':
            env = make_vec_env(env_id, env_type, nenv, seed)
        elif alg == 'deepq':
            env = atari_wrappers.make_atari(env_id)
            env.seed(seed)
            env = bench.Monitor(env, logger.get_dir())
            env = atari_wrappers.wrap_deepmind(env, frame_stack=True, scale=True)
        elif alg == 'trpo_mpi':
            env = atari_wrappers.make_atari(env_id)
            env.seed(seed)
            env = bench.Monitor(env, logger.get_dir() and osp.join(logger.get_dir(), str(rank)))
            env = atari_wrappers.wrap_deepmind(env)
            # TODO check if the second seeding is necessary, and eventually remove
            env.seed(seed)
        else:
            frame_stack_size = 4
            env = VecFrameStack(make_vec_env(env_id, env_type, nenv, seed), frame_stack_size)
    elif env_type == 'retro':
        import retro
        gamestate = args.gamestate or 'Level1-1'
        env = retro_wrappers.make_retro(game=args.env, state=gamestate, max_episode_steps=10000,
                                        use_restricted_actions=retro.Actions.DISCRETE)
        env.seed(args.seed)
        env = bench.Monitor(env, logger.get_dir())
        env = retro_wrappers.wrap_deepmind_retro(env)
    elif env_type == 'AirHockey':
        from gym_airhockey.configuration import configure_env
        from baselines.common.vec_env.dummy_vec_env import DummyVecEnv
        version_list = [x for x in args.versions if x is not None]
        # Each rank gets its own version
        version = version_list[MPI.COMM_WORLD.Get_rank() % len(version_list)]
        # set up the environment
        env = gym.make(env_id)
        env.seed(args.seed)
        configure_env(env, version=version)
        # wrap the environment
        env = bench.Monitor(env, logger.get_dir(), allow_early_resets=True)
        env = DummyVecEnv([lambda: env])
        env.render()
    else:
        get_session(tf.ConfigProto(allow_soft_placement=True,
                                   intra_op_parallelism_threads=1,
                                   inter_op_parallelism_threads=1))
        env = make_vec_env(env_id, env_type, args.num_env or 1, seed,
                           reward_scale=args.reward_scale)
        if env_type == 'mujoco':
            env = VecNormalize(env)
    return env
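# A minimal usage sketch for the build_env helper above, assuming a baselines-style run
# script. The Namespace fields mirror the attributes the build_env variants read
# (env, alg, num_env, seed, reward_scale, gamestate); the values are illustrative only.
from argparse import Namespace

args = Namespace(env='PongNoFrameskip-v4', alg='ppo2', num_env=8, seed=0,
                 reward_scale=1.0, gamestate=None)
env = build_env(args)  # Atari with a non-acer alg -> VecFrameStack of 4 frames
print(env.observation_space, env.action_space)
env.close()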
def train(env, policy, policy_init, seed, njobs=1, **alg_args):
    if env.startswith('rllab.'):
        # Get env name and class
        env_name = re.match(r'rllab.(\S+)', env).group(1)
        env_rllab_class = rllab_env_from_name(env_name)

        # Define env maker
        def make_env(seed=0):
            def _thunk():
                env_rllab = Rllab2GymWrapper(env_rllab_class())
                env_rllab.seed(seed)
                return env_rllab
            return _thunk

        parallel_env = SubprocVecEnv([make_env(seed + i * 100) for i in range(njobs)])
        # Used later
        env_type = 'rllab'
    else:
        # Normal gym env; determine whether it is Atari or not.
        env_type = get_env_type(env)
        assert env_type is not None, "Env not recognized."
        # Define the correct env maker
        if env_type == 'atari':
            # Atari, custom env creation
            def make_env(seed=0):
                def _thunk():
                    _env = make_atari(env)
                    _env.seed(seed)
                    return wrap_deepmind(_env)
                return _thunk

            parallel_env = VecFrameStack(
                SubprocVecEnv([make_env(seed + i * 100) for i in range(njobs)]), 4)
        else:
            # Not Atari, standard env creation
            def make_env(seed=0):
                def _thunk():
                    _env = gym.make(env)
                    _env.seed(seed)
                    return _env
                return _thunk

            parallel_env = SubprocVecEnv([make_env(seed + i * 100) for i in range(njobs)])

    if policy == 'linear':
        hid_size = num_hid_layers = 0
        use_bias = False
    elif policy == 'simple-nn':
        hid_size = [16]
        num_hid_layers = 1
        use_bias = True
    elif policy == 'nn':
        hid_size = [100, 50, 25]
        num_hid_layers = 3
        use_bias = True

    if policy_init == 'xavier':
        policy_initializer = tf.contrib.layers.xavier_initializer()
    elif policy_init == 'zeros':
        policy_initializer = U.normc_initializer(0.0)
    elif policy_init == 'small-weights':
        policy_initializer = U.normc_initializer(0.1)
    else:
        raise Exception('Unrecognized policy initializer.')

    if policy == 'linear' or policy == 'nn' or policy == 'simple-nn':
        def make_policy(name, ob_space, ac_space):
            return MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                             hid_size=hid_size, num_hid_layers=num_hid_layers,
                             gaussian_fixed_var=True, use_bias=use_bias, use_critic=False,
                             hidden_W_init=policy_initializer,
                             output_W_init=policy_initializer)
    elif policy == 'cnn':
        def make_policy(name, ob_space, ac_space):
            return CnnPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                             gaussian_fixed_var=True, use_bias=False, use_critic=False,
                             hidden_W_init=policy_initializer,
                             output_W_init=policy_initializer)
    else:
        raise Exception('Unrecognized policy type.')

    try:
        affinity = len(os.sched_getaffinity(0))
    except Exception:
        affinity = njobs
    sess = U.make_session(affinity)
    sess.__enter__()

    set_global_seeds(seed)
    gym.logger.setLevel(logging.WARN)

    pois2.learn(parallel_env, make_policy, **alg_args)
def build_env(args):
    ncpu = multiprocessing.cpu_count()
    if sys.platform == 'darwin':
        ncpu //= 2
    nenv = args.num_env or ncpu
    alg = args.alg
    seed = args.seed

    env_type, env_id = get_env_type(args.env)
    print("In the build_env function with alg ::", alg)
    if env_type in {'atari', 'retro'}:
        if alg == 'deepq':
            env = make_env(env_id, env_type, seed=seed, wrapper_kwargs={'frame_stack': True})
        elif alg == 'trpo_mpi':
            env = make_env(env_id, env_type, seed=seed)
        else:
            frame_stack_size = 4
            print("make_vec_env arguments env_id {}, env_type {}, nenv {}, seed {}, "
                  "gamestate {}, reward_scale {}".format(env_id, env_type, nenv, seed,
                                                         args.gamestate, args.reward_scale))
            # >
            # print("Called environment for mean and std")
            # env = make_vec_env(env_id, env_type, 1, seed, gamestate=args.gamestate, reward_scale=args.reward_scale)
            # # env = VecFrameStack(env, frame_stack_size)  # no need for frame stacking while computing mean and std
            # ob_mean, ob_std = random_agent_ob_mean_std(env)
            # print("environment complete with mean {} and std {}".format(ob_mean, ob_std))
            # del env
            # >
            env = make_vec_env(env_id, env_type, nenv, seed, gamestate=args.gamestate,
                               reward_scale=args.reward_scale)
            # print("Received env from make_vec_env of type {}: {}".format(type(env), env))
            print("ob_space {} and ac_space {}".format(env.observation_space, env.action_space))
            env = VecFrameStack(env, frame_stack_size)
            print("After frame stacking, env is now a VecFrameStack")
    else:
        config = tf.ConfigProto(allow_soft_placement=True,
                                intra_op_parallelism_threads=1,
                                inter_op_parallelism_threads=1)
        config.gpu_options.allow_growth = True
        get_session(config=config)
        env = make_vec_env(env_id, env_type, args.num_env or 1, seed,
                           reward_scale=args.reward_scale)
        if env_type == 'mujoco':
            env = VecNormalize(env)
    return env  # , ob_mean, ob_std
def main(visualize=False):
    session = tf_util.make_session()
    env_model = EnvNetwork(action_space_size=6, nbatch=num_env * singlestep, K=K,
                           nsteps=singlestep, reuse=False, session=session)
    session.run(tf.global_variables_initializer())
    env_model.restore()

    env = VecFrameStack(make_doom_env(num_env, seed, 'mixed'), 4)
    navi_model = Model(policy=CnnPolicy, ob_space=env.observation_space,
                       ac_space=Discrete(3), nenvs=num_env, nsteps=nsteps, ent_coef=0.01,
                       vf_coef=0.5, max_grad_norm=0.5, lr=7e-4, alpha=0.99, epsilon=1e-5,
                       total_timesteps=total_timesteps, lrschedule='linear',
                       model_name='navi')
    navi_model.load("O:\\Doom\\baselinemodel\\navigate_flat2.dat")
    fire_model = Model(policy=CnnPolicy, ob_space=env.observation_space,
                       ac_space=Discrete(3), nenvs=num_env, nsteps=nsteps, ent_coef=0.01,
                       vf_coef=0.5, max_grad_norm=0.5, lr=7e-4, alpha=0.99, epsilon=1e-5,
                       total_timesteps=total_timesteps, lrschedule='linear',
                       model_name='fire')
    fire_model.load("O:\\Doom\\baselinemodel\\fire_flat2.dat")
    policy_model = MixedModel(navi_model, fire_model, check_enemy_leave, check_enemy_enter,
                              [0, 1, 4], [0, 1, 5])
    runner = Runner(env, policy_model, nsteps=nsteps, gamma=0.99)
    nh, nw, nc = env.observation_space.shape

    while True:
        total_loss = 0
        for _ in tqdm(range(save_freq)):
            obs1, _, _, mask1, actions1, _ = runner.run()
            obs1 = np.reshape(obs1, [num_env, nsteps, nh, nw, nc])
            obs1 = obs1[:, :, :, :, -1:]
            actions1 = np.reshape(actions1, [num_env, nsteps])
            mask1 = np.reshape(mask1, [num_env, nsteps])
            hidden_states = env_model.initial_state
            for s in range(0, nsteps - K - singlestep, singlestep):
                input_frames = obs1[:, s:s + singlestep, :, :, :] // norm_factor
                input_frames = np.reshape(input_frames, [num_env * singlestep, nh, nw])
                input_frames = np.eye(9)[input_frames]
                actions, masks, expected_observations = [], [], []
                for t in range(K):
                    expected_observation = obs1[:, s + t + 1:s + singlestep + t + 1, :, :, :]
                    expected_observation = np.reshape(expected_observation,
                                                      [num_env * singlestep, nh, nw, 1])
                    expected_observations.append(expected_observation)
                    action = actions1[:, s + t:s + singlestep + t]
                    action = np.reshape(action, [num_env * singlestep])
                    actions.append(action)
                    mask = mask1[:, s + t:s + singlestep + t]
                    mask = np.reshape(mask, [num_env * singlestep])
                    masks.append(mask)
                if s > 0:
                    loss, prediction, hidden_states = env_model.train_and_predict(
                        input_frames, actions, masks, expected_observations, hidden_states)
                    total_loss += loss
                else:
                    # warm up
                    prediction, hidden_states = env_model.predict(
                        input_frames, actions, masks, hidden_states)
                if visualize and s == 3 * singlestep:
                    for batch_idx in range(num_env * singlestep):
                        expected_t = expected_observations[0]
                        if np.sum(expected_t[batch_idx, :, :, :] > 0.0):
                            input_frame = input_frames[batch_idx, :, :, :]
                            cv2.imshow('input', input_frame)
                            for i in range(K):
                                time_t_expectation = expected_observations[i]
                                exp_obs = time_t_expectation[batch_idx, :, :, :]
                                cv2.imshow('expected for t+{}'.format(i + 1), exp_obs)
                            for i in range(K):
                                time_t_prediction = prediction[i]
                                cv2.imshow('prediction for t+{}'.format(i + 1),
                                           time_t_prediction[batch_idx, :, :, 7])
                            cv2.waitKey(0)
        print("avg_loss = {}".format(total_loss / K / save_freq / valid_batch_size))
        env_model.save()
def build_env(args):
    ncpu = multiprocessing.cpu_count()
    if sys.platform == 'darwin':
        ncpu //= 2
    nenv = args.num_env or ncpu
    alg = args.alg
    rank = MPI.COMM_WORLD.Get_rank() if MPI else 0
    seed = args.seed

    env_type, env_id = get_env_type(args.env)
    if env_type == 'mujoco':
        # todo (copied from Akhil): create a session instead of getting one
        get_session(tf.ConfigProto(allow_soft_placement=True,
                                   intra_op_parallelism_threads=1,
                                   inter_op_parallelism_threads=1))
        # always using a dummy environment should allow running saved models without any further changes!
        # env = DummyVecEnv([lambda: make_mujoco_env(env_id, seed, args.reward_scale)])
        if args.num_env:
            # bind i at definition time so each worker gets a distinct seed
            env = SubprocVecEnv([
                lambda i=i: make_mujoco_env(env_id, seed + i if seed is not None else None,
                                            args.reward_scale)
                for i in range(args.num_env)
            ])
        else:
            env = DummyVecEnv([lambda: make_mujoco_env(env_id, seed, args.reward_scale)])
        # uncommented on Akhil's advice, as it is no longer necessary because I'm normalizing the data in my environment!
        env = VecNormalize(env)
    elif env_type == 'atari':
        if alg == 'acer':
            env = make_atari_env(env_id, nenv, seed)
        elif alg == 'deepq':
            env = atari_wrappers.make_atari(env_id)
            env.seed(seed)
            env = bench.Monitor(env, logger.get_dir())
            env = atari_wrappers.wrap_deepmind(env, frame_stack=True, scale=True)
        elif alg == 'trpo_mpi':
            env = atari_wrappers.make_atari(env_id)
            env.seed(seed)
            env = bench.Monitor(env, logger.get_dir() and osp.join(logger.get_dir(), str(rank)))
            env = atari_wrappers.wrap_deepmind(env)
            # TODO check if the second seeding is necessary, and eventually remove
            env.seed(seed)
        else:
            frame_stack_size = 4
            env = VecFrameStack(make_atari_env(env_id, nenv, seed), frame_stack_size)
    elif env_type == 'retro':
        import retro
        gamestate = args.gamestate or 'Level1-1'
        env = retro_wrappers.make_retro(game=args.env, state=gamestate, max_episode_steps=10000,
                                        use_restricted_actions=retro.Actions.DISCRETE)
        env.seed(args.seed)
        env = bench.Monitor(env, logger.get_dir())
        env = retro_wrappers.wrap_deepmind_retro(env)
    elif env_type == 'classic_control':
        def make_env():
            e = gym.make(env_id)
            e = bench.Monitor(e, logger.get_dir(), allow_early_resets=True)
            e.seed(seed)
            return e
        env = DummyVecEnv([make_env])
    else:
        raise ValueError('Unknown env_type {}'.format(env_type))
    return env
def build_env(args):
    ncpu = multiprocessing.cpu_count()
    if sys.platform == 'darwin':
        ncpu //= 2
    nenv = args.num_env or ncpu
    alg = args.alg
    seed = args.seed

    env_type, env_id = get_env_type(args.env)
    print(env_id)

    # extract the agc_env_name
    noskip_idx = env_id.find("NoFrameskip")
    env_name = env_id[:noskip_idx].lower()
    print("Env Name for Masking:", env_name)

    if env_type in {'atari', 'retro'}:
        if alg == 'deepq':
            env = make_env(env_id, env_type, seed=seed, wrapper_kwargs={'frame_stack': True})
        elif alg == 'trpo_mpi':
            env = make_env(env_id, env_type, seed=seed)
        else:
            frame_stack_size = 4
            env = make_vec_env(env_id, env_type, nenv, seed, gamestate=args.gamestate,
                               reward_scale=args.reward_scale)
            env = VecFrameStack(env, frame_stack_size)
    else:
        print("preconfig")
        config = tf.ConfigProto(allow_soft_placement=True,
                                intra_op_parallelism_threads=1,
                                inter_op_parallelism_threads=1)
        print("post config")
        config.gpu_options.allow_growth = True
        get_session(config=config)
        print("got session")
        env = make_vec_env(env_id, env_type, args.num_env or 1, seed,
                           reward_scale=args.reward_scale)
        print("made env")

    if args.custom_reward != '':
        from baselines.common.vec_env import VecEnv, VecEnvWrapper
        import baselines.common.custom_reward_wrapper as W
        assert isinstance(env, VecEnv) or isinstance(env, VecEnvWrapper)

        custom_reward_kwargs = eval(args.custom_reward_kwargs)

        if args.custom_reward == 'live_long':
            env = W.VecLiveLongReward(env, **custom_reward_kwargs)
        elif args.custom_reward == 'random_tf':
            env = W.VecTFRandomReward(env, **custom_reward_kwargs)
        elif args.custom_reward == 'preference':
            env = W.VecTFPreferenceReward(env, **custom_reward_kwargs)
        elif args.custom_reward == 'rl_irl':
            if args.custom_reward_path == '':
                assert False, 'no path for reward model'
            else:
                if args.custom_reward_lambda == '':
                    assert False, 'no combination parameter lambda'
                else:
                    env = W.VecRLplusIRLAtariReward(env, args.custom_reward_path,
                                                    args.custom_reward_lambda)
        elif args.custom_reward == 'pytorch':
            if args.custom_reward_path == '':
                assert False, 'no path for reward model'
            else:
                if env_type == "atari":
                    env = W.VecPyTorchAtariReward(env, args.custom_reward_path, env_name)
                elif env_type == "mujoco":
                    env = W.VecPyTorchMujocoReward(env, args.custom_reward_path, env_name)
        elif args.custom_reward == "mcmc_mean":
            if args.custom_reward_path == '' or args.mcmc_chain_path == '':
                assert False, 'no path for reward model and/or chain_path'
            else:
                env = W.VecMCMCMeanAtariReward(env, args.custom_reward_path,
                                               args.mcmc_chain_path, args.embedding_dim,
                                               env_name)
        elif args.custom_reward == "mcmc_map":
            if args.custom_reward_path == '':
                assert False, 'no path for reward model and/or chain_path'
            else:
                env = W.VecMCMCMAPAtariReward(env, args.custom_reward_path,
                                              args.embedding_dim, env_name)
        else:
            assert False, 'no such wrapper exists'

    if env_type == 'mujoco':
        print("normalized environment")
        env = VecNormalize(env)
    # if env_type == 'atari':
    #     input("Normalizing for Atari game: okay? [Enter]")
    #     # normalize rewards but not observations for atari
    #     env = VecNormalizeRewards(env)
    return env
extra_checkpoint_info = "bc_degredation"  # for finding the checkpoint again
hist_length = 4

# env id, env type, num envs, and seed
env = make_vec_env(env_id, 'atari', 1, seed,
                   wrapper_kwargs={
                       'clip_rewards': False,
                       'episode_life': False,
                   })

stochastic = True
env = VecFrameStack(env, 4)
demonstrator = PPO2Agent(env, env_type, stochastic)

# generate demonstrations for use in BC
demonstrations, learning_returns = generate_demos(env, env_name, demonstrator,
                                                  args.checkpoint_path, args.num_demos)

# Run BC on demos
dataset_size = sum([len(d) for d in demonstrations])
print("Data set size = ", dataset_size)

episode_index_counter = 0
num_data = 0
action_set = set()
action_cnt_dict = {}
data = []
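# A hedged sketch of turning the demonstrations above into a supervised (obs, action)
# dataset for behavioral cloning. It ASSUMES each demonstration is a sequence of
# (observation, action) pairs -- an assumption about generate_demos' output, not
# something established by the snippet above.
import numpy as np

obs_data, act_data = [], []
for episode in demonstrations:
    for obs, action in episode:
        obs_data.append(obs)
        act_data.append(action)
        action_set.add(int(action))
        action_cnt_dict[int(action)] = action_cnt_dict.get(int(action), 0) + 1

obs_data = np.array(obs_data)
act_data = np.array(act_data)
print("BC dataset:", obs_data.shape, act_data.shape, "actions seen:", sorted(action_set))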
def build_env(args):
    ncpu = multiprocessing.cpu_count()
    if sys.platform == 'darwin':
        ncpu //= 2
    nenv = args.num_env or ncpu
    alg = args.alg
    rank = MPI.COMM_WORLD.Get_rank() if MPI else 0
    seed = args.seed

    env_type, env_id = get_env_type(args.env)
    if env_type == 'atari':
        if alg == 'acer':
            env = make_vec_env(env_id, env_type, nenv, seed)
        elif alg == 'deepq':
            env = atari_wrappers.make_atari(env_id)
            env.seed(seed)
            env = bench.Monitor(env, logger.get_dir())
            env = atari_wrappers.wrap_deepmind(env, frame_stack=True)
        elif alg == 'trpo_mpi':
            env = atari_wrappers.make_atari(env_id)
            env.seed(seed)
            env = bench.Monitor(env, logger.get_dir() and osp.join(logger.get_dir(), str(rank)))
            env = atari_wrappers.wrap_deepmind(env)
            # TODO check if the second seeding is necessary, and eventually remove
            env.seed(seed)
        else:
            frame_stack_size = 4
            env = VecFrameStack(make_vec_env(env_id, env_type, nenv, seed), frame_stack_size)
    elif env_type == 'retro':
        import retro
        gamestate = args.gamestate or retro.State.DEFAULT
        env = retro_wrappers.make_retro(game=args.env, state=gamestate, max_episode_steps=10000,
                                        use_restricted_actions=retro.Actions.DISCRETE)
        env.seed(args.seed)
        env = bench.Monitor(env, logger.get_dir())
        env = retro_wrappers.wrap_deepmind_retro(env)
    elif env_type == 'unity':
        get_session(tf.ConfigProto(allow_soft_placement=True))
        # get_session(tf.ConfigProto(allow_soft_placement=True,
        #                            intra_op_parallelism_threads=1,
        #                            inter_op_parallelism_threads=1))
        env = make_multi_unity_vec_env(env_id, env_type, args.num_env or 1, seed,
                                       reward_scale=args.reward_scale)
        env = VecNormalize(env)
    else:
        get_session(tf.ConfigProto(allow_soft_placement=True,
                                   intra_op_parallelism_threads=1,
                                   inter_op_parallelism_threads=1))
        env = make_vec_env(env_id, env_type, args.num_env or 1, seed,
                           reward_scale=args.reward_scale)
        if env_type == 'mujoco' or env_type == 'unity':
            env = VecNormalize(env)
    return env
def build_env(args):
    ncpu = multiprocessing.cpu_count()
    if sys.platform == 'darwin':
        ncpu //= 2
    nenv = args.num_env or ncpu
    alg = args.alg
    rank = MPI.COMM_WORLD.Get_rank() if MPI else 0
    seed = args.seed

    env_type, env_id = get_env_type(args.env)
    if env_type == 'mujoco':
        get_session(tf.ConfigProto(allow_soft_placement=True,
                                   intra_op_parallelism_threads=1,
                                   inter_op_parallelism_threads=1))
        if args.num_env:
            env = make_vec_env(env_id, env_type, nenv, seed, reward_scale=args.reward_scale)
        else:
            env = make_vec_env(env_id, env_type, 1, seed, reward_scale=args.reward_scale)
        env = VecNormalize(env)
    elif env_type == 'atari':
        if alg == 'acer':
            env = make_vec_env(env_id, env_type, nenv, seed)
        elif alg == 'deepq':
            env = atari_wrappers.make_atari(env_id)
            env.seed(seed)
            env = bench.Monitor(env, logger.get_dir())
            env = atari_wrappers.wrap_deepmind(env, frame_stack=True, scale=True)
        elif alg == 'trpo_mpi':
            env = atari_wrappers.make_atari(env_id)
            env.seed(seed)
            env = bench.Monitor(env, logger.get_dir() and osp.join(logger.get_dir(), str(rank)))
            env = atari_wrappers.wrap_deepmind(env)
            # TODO check if the second seeding is necessary, and eventually remove
            env.seed(seed)
        else:
            frame_stack_size = 4
            env = VecFrameStack(make_vec_env(env_id, env_type, nenv, seed), frame_stack_size)
    elif env_type == 'retro':
        import retro
        gamestate = args.gamestate or 'Level1-1'
        env = retro_wrappers.make_retro(game=args.env, state=gamestate, max_episode_steps=10000,
                                        use_restricted_actions=retro.Actions.DISCRETE)
        env.seed(args.seed)
        env = bench.Monitor(env, logger.get_dir())
        env = retro_wrappers.wrap_deepmind_retro(env)
    elif env_type == 'classic_control':
        def make_env():
            e = gym.make(env_id)
            e = bench.Monitor(e, logger.get_dir(), allow_early_resets=True)
            e.seed(seed)
            return e
        env = DummyVecEnv([make_env])
    else:
        raise ValueError('Unknown env_type {}'.format(env_type))
    return env
def build_env(args, selector=None):
    global store
    ncpu = multiprocessing.cpu_count()
    if sys.platform == 'darwin':
        ncpu //= 2
    nenv = args.num_env or ncpu
    alg = args.alg
    rank = MPI.COMM_WORLD.Get_rank() if MPI else 0
    seed = args.seed

    env_type, env_id = get_env_type(args.env)
    print(env_type, env_id, nenv, args.num_env)
    if env_type == 'mujoco':
        get_session(tf.ConfigProto(allow_soft_placement=True,
                                   intra_op_parallelism_threads=1,
                                   inter_op_parallelism_threads=1))
        if args.num_env:
            # bind i at definition time so each worker gets a distinct seed
            env = SubprocVecEnv([
                lambda i=i: make_mujoco_env(env_id, seed + i if seed is not None else None,
                                            args.reward_scale)
                for i in range(args.num_env)
            ])
        else:
            env = DummyVecEnv([lambda: make_mujoco_env(env_id, seed, args.reward_scale)])
        env = VecNormalize(env)
    elif env_type == 'atari':
        if alg == 'acer':
            env = make_atari_env(env_id, nenv, seed)  # , wrapper_kwargs={'clip_rewards': False})
        elif alg == 'deepq':
            env = atari_wrappers.make_atari(env_id)
            env.seed(seed)
            env = bench.Monitor(env, logger.get_dir())
            env = atari_wrappers.wrap_deepmind(env, frame_stack=True, scale=True)
        elif alg == 'trpo_mpi':
            env = atari_wrappers.make_atari(env_id)
            env.seed(seed)
            env = bench.Monitor(env, logger.get_dir() and osp.join(logger.get_dir(), str(rank)))
            env = atari_wrappers.wrap_deepmind(env)
            # TODO check if the second seeding is necessary, and eventually remove
            env.seed(seed)
        elif "Zelda" in env_id:
            sys.path.append("/home/jupyter/Notebooks/Chang/HardRLWithYoutube/nnrunner/a2c_gvgai")
            import nnrunner.a2c_gvgai.env as gvgai_env
            frame_stack_size = 4
            print("run zelda")
            env = VecFrameStack(
                gvgai_env.make_gvgai_env(env_id, nenv, seed, level_selector=selector,
                                         experiment="PE", dataset="zelda"),
                frame_stack_size)
            # env.reset()
            # store = env
        else:
            frame_stack_size = 4
            env = VecFrameStack(make_atari_env(env_id, nenv, seed), frame_stack_size)
    elif env_type == 'retro':
        import retro
        gamestate = args.gamestate or 'Level1-1'
        env = retro_wrappers.make_retro(game=args.env, state=gamestate, max_episode_steps=10000,
                                        use_restricted_actions=retro.Actions.DISCRETE)
        env.seed(args.seed)
        env = bench.Monitor(env, logger.get_dir())
        env = retro_wrappers.wrap_deepmind_retro(env)
    elif env_type == 'classic_control':
        def make_env():
            e = gym.make(env_id)
            e = bench.Monitor(e, logger.get_dir(), allow_early_resets=True)
            e.seed(seed)
            return e
        env = DummyVecEnv([make_env])
    else:
        raise ValueError('Unknown env_type {}'.format(env_type))
    # env.reset()
    print("build env")
    # store.reset()
    # store.reset()
    return env