def train(env_id, seed, policy, load_path, num_episodes, frame_skip, no_render):
    # Despite its name, this function runs evaluation: it loads a trained
    # PPO2-style model and plays `num_episodes` episodes in the Neyboy env.
    # make_neyboy_env, VecFrameStack, build_policy and Model come from the
    # surrounding project / baselines; their imports are omitted in this snippet.
    env = make_neyboy_env(env_id, 1, seed, allow_early_resets=True,
                          frame_skip=frame_skip, save_video=True)
    env = VecFrameStack(env, 4)
    policy = build_policy(env, policy)
    ob_space = env.observation_space
    ac_space = env.action_space
    ent_coef = .01
    vf_coef = 0.5
    max_grad_norm = 0.5
    model = Model(policy=policy, ob_space=ob_space, ac_space=ac_space,
                  nbatch_act=env.num_envs, nbatch_train=0, nsteps=0,
                  ent_coef=ent_coef, vf_coef=vf_coef, max_grad_norm=max_grad_norm)
    model.load(load_path)

    for _ in range(num_episodes):
        if not no_render:
            env.render()
        observation, done = env.reset(), False
        if not no_render:
            env.render()
        episode_rew = 0
        score = 0
        while not done:
            if not no_render:
                env.render()
            action, _, _, _ = model.step(observation)
            observation, reward, done, info = env.step(action)
            episode_rew += reward
            score = info[0]
        print('Episode reward={}, info={}'.format(episode_rew, score))
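# --- Hedged usage sketch (not part of the original snippet) ---
# A minimal command-line wrapper around the evaluation loop above. The flag
# names simply mirror the train() parameters, and the default env id is a
# hypothetical placeholder; the real project may expose a different CLI.
import argparse

def _main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--env', default='neyboy-v0', help='hypothetical env id')
    parser.add_argument('--seed', type=int, default=0)
    parser.add_argument('--policy', default='cnn')
    parser.add_argument('--load-path', required=True)
    parser.add_argument('--num-episodes', type=int, default=5)
    parser.add_argument('--frame-skip', type=int, default=4)
    parser.add_argument('--no-render', action='store_true')
    args = parser.parse_args()
    train(args.env, args.seed, args.policy, args.load_path,
          args.num_episodes, args.frame_skip, args.no_render)

if __name__ == '__main__':
    _main()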
from baselines.common.vec_env.vec_frame_stack import VecFrameStack
from baselines.common.cmd_util import make_atari_env  # assumed baselines import path
from models import CNN_Net
import numpy as np
import torch
import os

# get_args is the project-specific CLI helper; its import is omitted in the
# original snippet (see the stand-in sketch below).

# convert stacked HWC observations into an NCHW float tensor for the network
def get_tensors(obs):
    return torch.tensor(np.transpose(obs, (0, 3, 1, 2)), dtype=torch.float32)

if __name__ == '__main__':
    args = get_args()
    # create the environment
    env = VecFrameStack(make_atari_env(args.env_name, 1, args.seed), 4)
    # load the trained model
    model_path = args.save_dir + args.env_name + '/model.pt'
    network = CNN_Net(env.action_space.n)
    network.load_state_dict(
        torch.load(model_path, map_location=lambda storage, loc: storage))
    # run the evaluation loop
    obs = env.reset()
    for _ in range(10000):
        env.render()
        obs_tensor = get_tensors(obs)
        with torch.no_grad():
            _, pi = network(obs_tensor)
        actions = torch.argmax(pi, dim=1).item()
        obs, reward, done, _ = env.step([actions])
    env.close()
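# --- Hedged stand-in sketch (assumption) ---
# The script above calls get_args() without defining it; in the source project
# it presumably lives in a separate arguments module. A minimal stand-in that
# covers only the attributes this script reads (env_name, seed, save_dir) could
# look like this; the flag names and defaults are assumptions.
import argparse

def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('--env-name', default='PongNoFrameskip-v4')
    parser.add_argument('--seed', type=int, default=123)
    parser.add_argument('--save-dir', default='saved_models/')
    return parser.parse_args()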
def learn(network, env, seed=None, nsteps=20, total_timesteps=int(80e6),
          q_coef=0.5, ent_coef=0.01, max_grad_norm=10, lr=7e-4,
          lrschedule='linear', rprop_epsilon=1e-5, rprop_alpha=0.99,
          gamma=0.99, log_interval=100, buffer_size=50000, replay_ratio=4,
          replay_start=10000, c=10.0, trust_region=True, alpha=0.99, delta=1,
          load_path=None, **network_kwargs):
    '''
    Main entrypoint for ACER (Actor-Critic with Experience Replay) algorithm
    (https://arxiv.org/pdf/1611.01224.pdf)
    Train an agent with the given network architecture on a given environment using ACER.

    Parameters:
    ----------

    network:           policy network architecture. Either a string (mlp, lstm, lnlstm, cnn_lstm, cnn,
                       cnn_small, conv_only - see baselines.common/models.py for the full list) specifying
                       a standard architecture, or a function that takes a tensorflow tensor as input and
                       returns a tuple (output_tensor, extra_feed), where output_tensor is the last network
                       layer output, extra_feed is None for feed-forward nets, and extra_feed is a dictionary
                       describing how to feed state into the network for recurrent nets.
                       See baselines.common/policies.py/lstm for more details on recurrent policies.

    env:               environment. Needs to be vectorized for parallel environment simulation.
                       Environments produced by gym.make can be wrapped using the
                       baselines.common.vec_env.DummyVecEnv class.

    nsteps:            int, number of steps of the vectorized environment per update (i.e. batch size is
                       nsteps * nenv, where nenv is the number of environment copies run in parallel) (default: 20)

    nstack:            int, size of the frame stack, i.e. number of frames passed to the step model.
                       Frames are stacked along the channel (last image) dimension (default: 4)

    total_timesteps:   int, number of timesteps (i.e. number of actions taken in the environment) (default: 80M)

    q_coef:            float, value function loss coefficient in the optimization objective
                       (analog of vf_coef for other actor-critic methods)

    ent_coef:          float, policy entropy coefficient in the optimization objective (default: 0.01)

    max_grad_norm:     float, gradient norm clipping coefficient. If set to None, no clipping. (default: 10)

    lr:                float, learning rate for RMSProp (the current implementation has RMSProp hardcoded in) (default: 7e-4)

    lrschedule:        schedule of the learning rate. Can be 'linear', 'constant', or a function [0..1] -> [0..1]
                       that takes the fraction of training progress as input and returns the fraction of the
                       learning rate (specified as lr) as output

    rprop_epsilon:     float, RMSProp epsilon (stabilizes the square root computation in the denominator of the
                       RMSProp update) (default: 1e-5)

    rprop_alpha:       float, RMSProp decay parameter (default: 0.99)

    gamma:             float, reward discounting factor (default: 0.99)

    log_interval:      int, number of updates between logging events (default: 100)

    buffer_size:       int, size of the replay buffer (default: 50k)

    replay_ratio:      int, how many (on average) batches of data to sample from the replay buffer after each
                       batch collected from the environment (default: 4)

    replay_start:      int, sampling from the replay buffer does not start until the buffer has at least this
                       many samples (default: 10k)

    c:                 float, importance weight clipping factor (default: 10)

    trust_region:      bool, whether or not the algorithm estimates the KL divergence between the old and
                       updated policy and uses it to determine the step size (default: True)

    delta:             float, max KL divergence between the old policy and the updated policy (default: 1)

    alpha:             float, momentum factor in the Polyak (exponential moving average) averaging of the model
                       parameters (default: 0.99)

    load_path:         str, path to load the model from (default: None)

    **network_kwargs:  keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy
                       and the arguments to the particular type of network. For instance, the 'mlp' architecture
                       has arguments num_hidden and num_layers.
    '''

    print("Running Acer Simple")
    print(locals())
    set_global_seeds(seed)
    if not isinstance(env, VecFrameStack):
        env = VecFrameStack(env, 1)

    policy = build_policy(env, network, estimate_q=True, **network_kwargs)
    nenvs = env.num_envs
    ob_space = env.observation_space
    ac_space = env.action_space

    nstack = env.nstack
    model = Model(policy=policy, ob_space=ob_space, ac_space=ac_space, nenvs=nenvs, nsteps=nsteps,
                  ent_coef=ent_coef, q_coef=q_coef, gamma=gamma,
                  max_grad_norm=max_grad_norm, lr=lr, rprop_alpha=rprop_alpha, rprop_epsilon=rprop_epsilon,
                  total_timesteps=total_timesteps, lrschedule=lrschedule, c=c,
                  trust_region=trust_region, alpha=alpha, delta=delta)
    if load_path is not None:
        model.load(load_path)

    runner = Runner(env=env, model=model, nsteps=nsteps)
    if replay_ratio > 0:
        buffer = Buffer(env=env, nsteps=nsteps, size=buffer_size)
    else:
        buffer = None
    nbatch = nenvs * nsteps
    acer = Acer(runner, model, buffer, log_interval)
    acer.tstart = time.time()

    # nbatch samples per iteration: one on_policy call and multiple off-policy calls
    for acer.steps in range(0, total_timesteps, nbatch):
        env.render()
        acer.call(on_policy=True)
        if replay_ratio > 0 and buffer.has_atleast(replay_start):
            n = np.random.poisson(replay_ratio)
            for _ in range(n):
                acer.call(on_policy=False)  # no simulation steps in this

    return model
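# --- Hedged usage sketch (not part of the original file) ---
# The docstring above says env must be vectorized; a single gym environment can
# be wrapped with DummyVecEnv as shown here. The import paths follow the OpenAI
# baselines layout; the env id, network name and hyperparameters below are
# assumptions chosen only to illustrate the call.
import gym
from baselines.common.vec_env.dummy_vec_env import DummyVecEnv

if __name__ == '__main__':
    venv = DummyVecEnv([lambda: gym.make('CartPole-v1')])
    # replay_ratio=0 disables the replay buffer so this run stays purely on-policy
    trained_model = learn(network='mlp', env=venv, seed=0,
                          total_timesteps=100000, replay_ratio=0)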
def build_env(args):
    ncpu = multiprocessing.cpu_count()
    if sys.platform == 'darwin':
        ncpu //= 2
    nenv = args.num_env or ncpu
    alg = args.alg
    rank = MPI.COMM_WORLD.Get_rank() if MPI else 0
    seed = args.seed

    env_type, env_id = get_env_type(args.env)

    if env_type == 'atari':
        if alg == 'acer':
            env = make_vec_env(env_id, env_type, nenv, seed)
        elif alg == 'deepq':
            env = atari_wrappers.make_atari(env_id)
            env.seed(seed)
            env = bench.Monitor(env, logger.get_dir())
            env = atari_wrappers.wrap_deepmind(env, frame_stack=True, scale=True)
        elif alg == 'trpo_mpi':
            env = atari_wrappers.make_atari(env_id)
            env.seed(seed)
            env = bench.Monitor(
                env, logger.get_dir() and osp.join(logger.get_dir(), str(rank)))
            env = atari_wrappers.wrap_deepmind(env)
            # TODO check if the second seeding is necessary, and eventually remove
            env.seed(seed)
        else:
            frame_stack_size = 4
            env = VecFrameStack(make_vec_env(env_id, env_type, nenv, seed), frame_stack_size)

    elif env_type == 'retro':
        import retro
        gamestate = args.gamestate or 'Level1-1'
        env = retro_wrappers.make_retro(
            game=args.env, state=gamestate, max_episode_steps=10000,
            use_restricted_actions=retro.Actions.DISCRETE)
        env.seed(args.seed)
        env = bench.Monitor(env, logger.get_dir())
        env = retro_wrappers.wrap_deepmind_retro(env)

    elif env_type == 'AirHockey':
        from gym_airhockey.configuration import configure_env
        from baselines.common.vec_env.dummy_vec_env import DummyVecEnv
        version_list = [x for x in args.versions if x is not None]
        version = version_list[MPI.COMM_WORLD.Get_rank() % len(version_list)]  # Each rank gets its own version
        # setup the environment
        env = gym.make(env_id)
        env.seed(args.seed)
        configure_env(env, version=version)
        # wrap the environment
        env = bench.Monitor(env, logger.get_dir(), allow_early_resets=True)
        env = DummyVecEnv([lambda: env])
        env.render()

    else:
        get_session(tf.ConfigProto(allow_soft_placement=True,
                                   intra_op_parallelism_threads=1,
                                   inter_op_parallelism_threads=1))
        env = make_vec_env(env_id, env_type, args.num_env or 1, seed,
                           reward_scale=args.reward_scale)
        if env_type == 'mujoco':
            env = VecNormalize(env)

    return env
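# --- Hedged usage sketch (not part of the original file) ---
# build_env() only reads a handful of attributes from args, so a plain
# argparse.Namespace is enough to exercise, e.g., the Atari branch. The
# attribute set below is inferred from what the function accesses and is an
# assumption about the real CLI; the retro, AirHockey and mujoco branches read
# additional attributes (gamestate, versions, reward_scale).
import argparse

if __name__ == '__main__':
    args = argparse.Namespace(env='PongNoFrameskip-v4', alg='ppo2',
                              num_env=4, seed=0)
    env = build_env(args)
    print(env.observation_space, env.action_space)
    env.close()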