def denormalize(x, stats):
    if stats is None:
        return x
    state_mean = stats.mean()
    state_std = stats.std()
    state_mean = np.array(state_mean, dtype=np.float32) if not isinstance(state_mean, np.ndarray) else state_mean.astype(np.float32)
    state_std = np.array(state_std, dtype=np.float32) if not isinstance(state_std, np.ndarray) else state_std.astype(np.float32)
    return x * torch_utils.toTensor(state_std) + torch_utils.toTensor(state_mean)
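# Minimal, hedged sketch of how `denormalize` undoes return normalization. `_StatsStub`
# below is a stand-in for the running-statistics object; only the mean()/std() interface
# used above is assumed, not this repo's actual stats class.
class _StatsStub:
    def mean(self):
        return np.array([10.0], dtype=np.float32)

    def std(self):
        return np.array([2.0], dtype=np.float32)

# A normalized value of 1.5 maps back to 1.5 * 2.0 + 10.0 = 13.0:
# denormalize(torch_utils.toTensor(np.array([1.5], dtype=np.float32)), _StatsStub())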
def run(self):
    # Here, we init the lists that will contain the mb of experiences
    mb_obs, mb_rewards, mb_actions, mb_values, mb_dones, mb_neglogpacs = [], [], [], [], [], []
    mb_states = self.states
    epinfos = []
    # For n in range number of steps
    for _ in range(self.nsteps):
        # Given observations, get action, value and neglogpacs
        # We already have self.obs because the Runner superclass runs self.obs[:] = env.reset() on init
        outputs = self.model.step(torch_utils.toTensor(self.obs).float(),
                                  S=self.states, M=self.dones)
        actions, values, self.states, neglogpacs = torch_utils.toNumpy(outputs)
        mb_obs.append(self.obs.copy())
        mb_actions.append(actions)
        mb_values.append(values)
        mb_neglogpacs.append(neglogpacs)
        mb_dones.append(self.dones)

        # Take actions in env and look at the results
        # Infos contains a ton of useful information
        self.obs[:], rewards, self.dones, infos = self.env.step(actions)
        for info in infos:
            maybeepinfo = info.get('episode')
            if maybeepinfo:
                epinfos.append(maybeepinfo)
        mb_rewards.append(rewards)

    # Batch of steps to batch of rollouts
    mb_obs = np.asarray(mb_obs, dtype=self.obs.dtype)
    mb_rewards = np.asarray(mb_rewards, dtype=np.float32)
    mb_actions = np.asarray(mb_actions)
    mb_values = np.asarray(mb_values, dtype=np.float32)
    mb_neglogpacs = np.asarray(mb_neglogpacs, dtype=np.float32)
    mb_dones = np.asarray(mb_dones, dtype=bool)
    last_values = self.model.value(torch_utils.toTensor(self.obs).float(),
                                   S=self.states, M=self.dones)
    last_values = torch_utils.toNumpy(last_values)

    # Discount / bootstrap off value fn (GAE)
    mb_returns = np.zeros_like(mb_rewards)
    mb_advs = np.zeros_like(mb_rewards)
    lastgaelam = 0
    for t in reversed(range(self.nsteps)):
        if t == self.nsteps - 1:
            nextnonterminal = 1.0 - self.dones
            nextvalues = last_values
        else:
            nextnonterminal = 1.0 - mb_dones[t + 1]
            nextvalues = mb_values[t + 1]
        delta = mb_rewards[t] + self.gamma * nextvalues * nextnonterminal - mb_values[t]
        mb_advs[t] = lastgaelam = delta + self.gamma * self.lam * nextnonterminal * lastgaelam
    mb_returns = mb_advs + mb_values
    return (*map(sf01, (mb_obs, mb_returns, mb_dones, mb_actions, mb_values, mb_neglogpacs)),
            mb_states, epinfos)
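# Hedged, standalone sketch of the GAE recursion used above, on toy numbers
# (the gamma/lam values and the reward/value arrays in the commented call are made up):
#   delta_t = r_t + gamma * V(s_{t+1}) * (1 - done_{t+1}) - V(s_t)
#   A_t     = delta_t + gamma * lam * (1 - done_{t+1}) * A_{t+1}
def gae_sketch(rewards, values, dones, last_value, last_done, gamma=0.99, lam=0.95):
    nsteps = len(rewards)
    advs = np.zeros(nsteps, dtype=np.float32)
    lastgaelam = 0.0
    for t in reversed(range(nsteps)):
        if t == nsteps - 1:
            nextnonterminal = 1.0 - last_done
            nextvalues = last_value
        else:
            nextnonterminal = 1.0 - dones[t + 1]
            nextvalues = values[t + 1]
        delta = rewards[t] + gamma * nextvalues * nextnonterminal - values[t]
        advs[t] = lastgaelam = delta + gamma * lam * nextnonterminal * lastgaelam
    # Returns are advantages plus the value baseline, as in the runner above.
    return advs, advs + values

# advs, rets = gae_sketch(np.array([1.0, 0.0, 1.0]), np.array([0.5, 0.4, 0.6]),
#                         np.array([0.0, 0.0, 0.0]), last_value=0.3, last_done=0.0)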
def main(args):
    # Configure logger; disable logging in child MPI processes (with rank > 0)
    arg_parser = common_arg_parser()
    args, unknown_args = arg_parser.parse_known_args(args)
    extra_args = parse_cmdline_kwargs(unknown_args)
    torch_utils.device = args.device

    if MPI is None or MPI.COMM_WORLD.Get_rank() == 0:
        rank = 0
        configure_logger(args.log_path, viz_server=args.viz_server, viz_port=args.viz_port)
    else:
        rank = MPI.COMM_WORLD.Get_rank()
        configure_logger(args.log_path, format_strs=[],
                         viz_server=args.viz_server, viz_port=args.viz_port)

    model, env = train(args, extra_args)

    if args.save_path is not None and rank == 0:
        os.makedirs(osp.expanduser(args.save_path), exist_ok=True)
        save_path = osp.join(osp.expanduser(args.save_path), 'model.pth')
        model.save(save_path)

    if args.play:
        logger.log("Running trained model")
        from ptbaselines.algos.common.torch_utils import toNumpy, toTensor
        obs = env.reset()
        state = None
        dones = np.zeros((1,))
        episode_rew = np.zeros(env.num_envs) if isinstance(env, VecEnv) else np.zeros(1)
        while True:
            if state is not None:
                actions, _, state, _ = toNumpy(model.step(toTensor(obs).float(), S=state, M=dones))
            else:
                actions, _, _, _ = toNumpy(model.step(toTensor(obs).float()))

            obs, rew, done, _ = env.step(actions)
            # print('rewards: {}'.format(rew))
            episode_rew += rew
            env.render()
            done_any = done.any() if isinstance(done, np.ndarray) else done
            if done_any:
                for i in np.nonzero(done)[0]:
                    print('episode_rew={}'.format(episode_rew[i]))
                    episode_rew[i] = 0

    env.close()
    return model
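# Hedged usage sketch: `main` above is the run-script entry point. A typical invocation
# might look like the line below; the module name `ptbaselines.run` and the exact flag
# names are assumptions modeled on the OpenAI Baselines CLI, not verified against this repo.
#
#   python -m ptbaselines.run --alg=ppo2 --env=CartPole-v0 --num_timesteps=1e5 \
#       --save_path=~/models/cartpole_ppo2 --play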
def adapt_param_noise(self):
    try:
        from mpi4py import MPI
    except ImportError:
        MPI = None

    if self.param_noise is None:
        return 0.

    # Perturb a separate copy of the policy to adjust the scale for the next "real" perturbation.
    batch = self.memory.sample(batch_size=self.batch_size)
    obs0 = torch_utils.toTensor(batch['obs0']).float()
    normalize_obs0 = self.normalize_obs(obs0)

    self.perturb_params(self.actor, self.adaptive_actor, self.param_noise.current_stddev)
    with torch.no_grad():
        actions = self.actor(normalize_obs0)
        adaptive_actions = self.adaptive_actor(normalize_obs0)
        distance = torch.sqrt(torch.pow(actions - adaptive_actions, 2.0).mean())

    if MPI is not None:
        mean_distance = MPI.COMM_WORLD.allreduce(distance, op=MPI.SUM) / MPI.COMM_WORLD.Get_size()
    else:
        mean_distance = distance

    self.param_noise.adapt(mean_distance.data.cpu().item())
    return mean_distance
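# Hedged sketch of the stddev adaptation rule that `self.param_noise.adapt(...)` above is
# expected to apply, mirroring the AdaptiveParamNoiseSpec from Plappert et al. / OpenAI
# Baselines. The class and attribute names below are illustrative assumptions, not this
# repo's exact API.
class AdaptiveParamNoiseSketch:
    def __init__(self, initial_stddev=0.1, desired_action_stddev=0.2, adaptation_coefficient=1.01):
        self.current_stddev = initial_stddev
        self.desired_action_stddev = desired_action_stddev
        self.adaptation_coefficient = adaptation_coefficient

    def adapt(self, distance):
        # If the perturbed policy drifted too far from the unperturbed one in action space,
        # shrink the parameter noise; otherwise grow it, so the measured distance tracks
        # desired_action_stddev.
        if distance > self.desired_action_stddev:
            self.current_stddev /= self.adaptation_coefficient
        else:
            self.current_stddev *= self.adaptation_coefficient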
def train(self):
    # Get a batch.
    batch = self.memory.sample(batch_size=self.batch_size)
    obs0 = torch_utils.toTensor(batch['obs0'])
    obs1 = torch_utils.toTensor(batch['obs1'])
    rewards = torch_utils.toTensor(batch['rewards'])
    terminals1 = torch_utils.toTensor(batch['terminals1'].astype('float32'))
    actions = torch_utils.toTensor(batch['actions'])

    normalize_obs0 = self.normalize_obs(obs0)
    normalize_obs1 = self.normalize_obs(obs1)

    # Compute target
    Q_obs1 = denormalize(self.target_critic(normalize_obs1, self.target_actor(normalize_obs1)),
                         self.ret_rms)
    target_Q = rewards + (1. - terminals1) * self.gamma * Q_obs1
    critic_target = torch.clamp(normalize(target_Q, self.ret_rms),
                                self.return_range[0], self.return_range[1]).detach()

    if self.normalize_returns and self.enable_popart:
        old_mean = self.ret_rms.mean()
        old_std = self.ret_rms.std()
        self.ret_rms.update(torch_utils.toNumpy(target_Q.view(-1)))
        self.popart(old_mean, old_std)

    # Compute critic loss
    Q_obs0 = self.critic(normalize_obs0, actions)
    critic_loss = F.mse_loss(Q_obs0, critic_target)

    # Update critic
    self.critic_optimizer.zero_grad()
    critic_loss.backward()
    mpi_util.average_gradients(self.critic_optimizer.param_groups)
    self.critic_optimizer.step()

    # Compute actor loss
    actor_actions = self.actor(normalize_obs0)
    critic_with_actor = denormalize(torch.clamp(self.critic(normalize_obs0, actor_actions),
                                                self.return_range[0], self.return_range[1]),
                                    self.ret_rms)
    actor_loss = -critic_with_actor.mean()

    # Update actor
    self.actor_optimizer.zero_grad()
    actor_loss.backward()
    mpi_util.average_gradients(self.actor_optimizer.param_groups)
    self.actor_optimizer.step()

    return critic_loss.data.cpu().item(), actor_loss.data.cpu().item()
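# Worked illustration (toy numbers) of the TD target computed above, ignoring return
# normalization: target_Q = r + (1 - terminal) * gamma * Q'(s1, mu'(s1)).
# With r = 1.0, terminal = 0, gamma = 0.99 and a target-critic value of 2.0,
# target_Q = 1.0 + 0.99 * 2.0 = 2.98; for a terminal transition the bootstrap term drops out
# and target_Q reduces to the immediate reward.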
def step(self, obs, apply_noise=True, compute_Q=True):
    if isinstance(obs, np.ndarray):
        obs = torch_utils.toTensor(obs).float()
    norm_obs = self.normalize_obs(obs)

    if self.param_noise is not None and apply_noise:
        action = self.pertubed_actor(norm_obs)
    else:
        action = self.actor(norm_obs)

    if compute_Q:
        normalize_value = self.critic(norm_obs, action)
        q = denormalize(torch.clamp(normalize_value, self.return_range[0], self.return_range[1]),
                        self.ret_rms)
    else:
        q = None

    if self.action_noise is not None and apply_noise:
        noise = self.action_noise()
        assert noise.shape == action[0].shape
        action += torch_utils.toTensor(noise[np.newaxis]).float()
    action = torch.clamp(action, self.action_range[0], self.action_range[1])

    # Guard against compute_Q=False, in which case q is None and has no .item().
    return (torch_utils.toNumpy(action),
            q.data.cpu().item() if q is not None else None,
            None, None)
def main():
    env = gym.make("CartPole-v0")
    act = deepq.learn(env, network='mlp', total_timesteps=0, load_path="cartpole_model.pth")

    while True:
        obs, done = env.reset(), False
        episode_rew = 0
        while not done:
            env.render()
            action = act.actions(torch_utils.toTensor(obs[None]).float(), stochastic=False)[0].item()
            obs, rew, done, _ = env.step(action)
            episode_rew += rew
        print("Episode reward", episode_rew)
def main():
    env = gym.make("MountainCar-v0")
    act = deepq.learn(env,
                      network=models.mlp(env.observation_space.shape, num_layers=1, num_hidden=64),
                      total_timesteps=0,
                      load_path='mountaincar_model.pth')

    while True:
        obs, done = env.reset(), False
        episode_rew = 0
        while not done:
            env.render()
            action = act.actions(torch_utils.toTensor(obs[None]).float(), stochastic=False)[0].item()
            obs, rew, done, _ = env.step(action)
            episode_rew += rew
        print("Episode reward", episode_rew)
def main():
    env = make_atari("PongNoFrameskip-v4")
    env = deepq.wrap_atari_dqn(env)
    act = deepq.learn(env,
                      "conv_only",
                      convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)],
                      hiddens=[256],
                      dueling=True,
                      total_timesteps=0,
                      load_path='pong_model.pth')

    while True:
        obs, done = env.reset(), False
        episode_rew = 0
        while not done:
            env.render()
            action = act.actions(torch_utils.toTensor(obs[None]).float(), stochastic=False)[0].item()
            obs, rew, done, _ = env.step(action)
            episode_rew += rew
        print("Episode reward", episode_rew)
def learn(network,
          env,
          seed=None,
          nsteps=5,
          total_timesteps=int(80e6),
          vf_coef=0.5,
          ent_coef=0.01,
          max_grad_norm=0.5,
          lr=7e-4,
          lrschedule='linear',
          epsilon=1e-5,
          alpha=0.99,
          gamma=0.99,
          log_interval=100,
          load_path=None,
          **network_kwargs):
    '''
    Main entrypoint for the A2C algorithm. Trains a policy with the given network architecture
    on a given environment using A2C.

    Parameters:
    -----------

    network:            policy network architecture. Either a string (mlp, lstm, lnlstm, cnn_lstm, cnn,
                        cnn_small, conv_only - see baselines.common/models.py for the full list) specifying
                        a standard network architecture, or a function that takes an input tensor and
                        returns a tuple (output_tensor, extra_feed), where output_tensor is the last network
                        layer output, and extra_feed is None for feed-forward nets or a dictionary describing
                        how to feed state into the network for recurrent nets. See
                        baselines.common/policies.py/lstm for more details on using recurrent nets in policies.

    env:                RL environment. Should implement an interface similar to VecEnv
                        (baselines.common/vec_env) or be wrapped with DummyVecEnv
                        (baselines.common/vec_env/dummy_vec_env.py).

    seed:               seed to make the random number sequence in the algorithm reproducible. Defaults to
                        None, which means seeding from the system noise generator (not reproducible).

    nsteps:             int, number of steps of the vectorized environment per update (i.e. batch size is
                        nsteps * nenv where nenv is the number of environment copies simulated in parallel)

    total_timesteps:    int, total number of timesteps to train on (default: 80M)

    vf_coef:            float, coefficient in front of the value function loss in the total loss function
                        (default: 0.5)

    ent_coef:           float, coefficient in front of the policy entropy in the total loss function
                        (default: 0.01)

    max_grad_norm:      float, gradient is clipped to have a global L2 norm no larger than this value
                        (default: 0.5)

    lr:                 float, learning rate for RMSProp (the current implementation has RMSProp hardcoded in)
                        (default: 7e-4)

    lrschedule:         schedule of learning rate. Can be 'linear', 'constant', or a function
                        [0..1] -> [0..1] that takes the fraction of training progress as input and returns
                        the fraction of the learning rate (specified as lr) as output.

    epsilon:            float, RMSProp epsilon (stabilizes the square root computation in the denominator
                        of the RMSProp update) (default: 1e-5)

    alpha:              float, RMSProp decay parameter (default: 0.99)

    gamma:              float, reward discounting parameter (default: 0.99)

    log_interval:       int, specifies how frequently the logs are printed out (default: 100)

    **network_kwargs:   keyword arguments to the policy / network builder. See
                        baselines.common/policies.py/build_policy and arguments to a particular type of
                        network. For instance, the 'mlp' network architecture has arguments num_hidden
                        and num_layers.
    '''
    set_global_seeds(seed)

    # Get the nb of env
    nenvs = env.num_envs
    policy = build_policy(env, network, **network_kwargs)

    # Instantiate the model object (that creates step_model and train_model)
    model = Model(policy=policy, env=env, nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef,
                  max_grad_norm=max_grad_norm, lr=lr, alpha=alpha, epsilon=epsilon,
                  total_timesteps=total_timesteps, lrschedule=lrschedule)
    if load_path is not None:
        model.load(load_path)

    # Instantiate the runner object
    runner = Runner(env, model, nsteps=nsteps, gamma=gamma)
    epinfobuf = deque(maxlen=100)

    # Calculate the batch_size
    nbatch = nenvs * nsteps

    # Start total timer
    tstart = time.time()

    for update in range(1, total_timesteps // nbatch + 1):
        # Get mini batch of experiences
        obs, states, rewards, masks, actions, values, epinfos = runner.run()
        obs, rewards, masks, actions, values = torch_utils.toTensor(
            (obs, rewards, masks, actions, values))
        epinfobuf.extend(epinfos)

        model_outputs = model.train(obs, states, rewards, masks, actions, values)
        policy_loss, value_loss, policy_entropy = torch_utils.toNumpy(model_outputs)

        nseconds = time.time() - tstart
        # Calculate the fps (frames per second)
        fps = int((update * nbatch) / nseconds)

        if update % log_interval == 0 or update == 1:
            # Check whether the value function is a good predictor of the returns (ev > 1)
            # or if it's just worse than predicting nothing (ev =< 0)
            ev = explained_variance(*torch_utils.toNumpy((values, rewards)))
            logger.record_tabular("nupdates", update)
            logger.record_tabular("total_timesteps", update * nbatch)
            logger.record_tabular("fps", fps)
            logger.record_tabular("policy_entropy", float(policy_entropy))
            logger.record_tabular("value_loss", float(value_loss))
            logger.record_tabular("explained_variance", float(ev))
            logger.record_tabular("eprewmean", safemean([epinfo['r'] for epinfo in epinfobuf]))
            logger.record_tabular("eplenmean", safemean([epinfo['l'] for epinfo in epinfobuf]))
            logger.dump_tabular()

            # Plot using visdom
            timesteps = update * nbatch
            logger.vizkv('eprewmean', timesteps, safemean([epinfo['r'] for epinfo in epinfobuf]))
            logger.vizkv('eplenmean', timesteps, safemean([epinfo['l'] for epinfo in epinfobuf]))
            logger.vizkv('policy_loss', timesteps, policy_loss)
            logger.vizkv('value_loss', timesteps, value_loss)
            logger.vizkv('policy_entropy', timesteps, policy_entropy)

    return model
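# Hedged usage sketch for the A2C `learn` above. The import paths are assumptions based on
# module names appearing elsewhere in this repo (ptbaselines.algos.*); everything is
# illustrative and left commented out.
#
# import gym
# from ptbaselines.common.vec_env.dummy_vec_env import DummyVecEnv   # assumed path
# from ptbaselines.algos.a2c.a2c import learn                        # assumed path
#
# venv = DummyVecEnv([lambda: gym.make("CartPole-v0") for _ in range(4)])
# model = learn(network='mlp', env=venv, seed=0, nsteps=5, total_timesteps=50000)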
def learn(env,
          network,
          seed=None,
          lr=5e-4,
          total_timesteps=100000,
          buffer_size=50000,
          exploration_fraction=0.1,
          exploration_final_eps=0.02,
          train_freq=1,
          batch_size=32,
          print_freq=100,
          checkpoint_freq=10000,
          checkpoint_path=None,
          learning_starts=1000,
          gamma=1.0,
          target_network_update_freq=500,
          prioritized_replay=False,
          prioritized_replay_alpha=0.6,
          prioritized_replay_beta0=0.4,
          prioritized_replay_beta_iters=None,
          prioritized_replay_eps=1e-6,
          param_noise=False,
          callback=None,
          load_path=None,
          **network_kwargs):
    """Train a deepq model.

    Parameters
    -------
    env: gym.Env
        environment to train on
    network: string or a function
        neural network to use as a Q-function approximator. If string, has to be one of the names
        of registered models in baselines.common.models (mlp, cnn, conv_only). If a function,
        should take an observation tensor and return a latent variable tensor, which will be
        mapped to the Q-function heads (see build_q_func in baselines.deepq.models for details).
    seed: int or None
        PRNG seed. Runs with the same seed "should" give the same results. If None, no seeding is used.
    lr: float
        learning rate for the Adam optimizer
    total_timesteps: int
        number of env steps to optimize for
    buffer_size: int
        size of the replay buffer
    exploration_fraction: float
        fraction of the entire training period over which the exploration rate is annealed
    exploration_final_eps: float
        final value of the random action probability
    train_freq: int
        update the model every `train_freq` steps
    batch_size: int
        size of a batch sampled from the replay buffer for training
    print_freq: int
        how often to print out training progress; set to None to disable printing
    checkpoint_freq: int
        how often to save the model. This is so that the best version is restored at the end of
        training. If you do not wish to restore the best version at the end of training,
        set this variable to None.
    learning_starts: int
        how many steps of the model to collect transitions for before learning starts
    gamma: float
        discount factor
    target_network_update_freq: int
        update the target network every `target_network_update_freq` steps
    prioritized_replay: bool
        if True, a prioritized replay buffer will be used
    prioritized_replay_alpha: float
        alpha parameter for the prioritized replay buffer
    prioritized_replay_beta0: float
        initial value of beta for the prioritized replay buffer
    prioritized_replay_beta_iters: int
        number of iterations over which beta will be annealed from its initial value to 1.0.
        If set to None, equals total_timesteps.
    prioritized_replay_eps: float
        epsilon to add to the TD errors when updating priorities
    param_noise: bool
        whether or not to use parameter space noise (https://arxiv.org/abs/1706.01905)
    callback: (locals, globals) -> None
        function called at every step with the state of the algorithm. If the callback returns
        true, training stops.
    load_path: str
        path to load the model from (default: None)
    **network_kwargs
        additional keyword arguments to pass to the network builder

    Returns
    -------
    act: ActWrapper
        Wrapper over the act function. Adds the ability to save and load it.
        See the header of baselines/deepq/categorical.py for details on the act function.
    """
    # Create all the functions necessary to train the model
    set_global_seeds(seed)

    if checkpoint_path is not None:
        save_path = osp.join(checkpoint_path, 'model.pth')
    else:
        checkdir = osp.join(logger.get_dir(), 'checkpoints')
        os.makedirs(checkdir, exist_ok=True)
        save_path = osp.join(checkdir, 'model.pth')

    q_net = QNet(env, network, **network_kwargs)
    model = Model(qnet=q_net, lr=lr, grad_norm_clipping=10, gamma=gamma, param_noise=param_noise)

    model_saved = False
    if load_path is not None:
        logger.log('Loaded model from {}'.format(load_path))
        model.load(load_path)
        # model_save = True

    # Create the replay buffer
    if prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha)
        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = total_timesteps
        beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                       initial_p=prioritized_replay_beta0,
                                       final_p=1.0)
    else:
        replay_buffer = ReplayBuffer(buffer_size)
        beta_schedule = None

    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * total_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)

    episode_rewards = [0.0]
    saved_mean_reward = None
    obs = env.reset()
    reset = True

    for t in range(total_timesteps):
        if callback is not None:
            if callback(locals(), globals()):
                break
        # Take action and update exploration to the newest value
        if not param_noise:
            update_eps = exploration.value(t)
            update_param_noise_threshold = 0.
            action = model.actions(torch_utils.toTensor(np.array(obs, dtype=np.float32)[None]),
                                   eps=update_eps)
        else:
            update_eps = 0.
            # Compute the threshold such that the KL divergence between perturbed and non-perturbed
            # policy is comparable to eps-greedy exploration with eps = exploration.value(t).
            # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017
            # for a detailed explanation.
            update_param_noise_threshold = -np.log(
                1. - exploration.value(t) + exploration.value(t) / float(env.action_space.n))
            model.update_noise_scale(
                torch_utils.toTensor(np.array(obs, dtype=np.float32)[None]),
                update_param_noise_threshold)
            action = model.actions_with_param_noise(
                torch_utils.toTensor(np.array(obs, dtype=np.float32)[None]),
                eps=update_eps, reset=reset)
        action = torch_utils.toNumpy(action)[0]
        env_action = action
        reset = False
        new_obs, rew, done, _ = env.step(env_action)

        # Store transition in the replay buffer.
        replay_buffer.add(obs, action, rew, new_obs, float(done))
        obs = new_obs

        episode_rewards[-1] += rew
        if done:
            obs = env.reset()
            episode_rewards.append(0.0)
            reset = True

        if t > learning_starts and t % train_freq == 0:
            # Minimize the error in Bellman's equation on a batch sampled from the replay buffer.
            if prioritized_replay:
                experience = replay_buffer.sample(batch_size, beta=beta_schedule.value(t))
                (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience
            else:
                obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(batch_size)
                weights, batch_idxes = np.ones_like(rewards), None
            td_errors, debug = model.train(
                *torch_utils.toTensor((obses_t, actions, rewards, obses_tp1, dones, weights)))
            if prioritized_replay:
                new_priorities = np.abs(td_errors) + prioritized_replay_eps
                replay_buffer.update_priorities(batch_idxes, new_priorities)

        if t > learning_starts and t % target_network_update_freq == 0:
            # Update target network periodically.
            model.update_target()

        mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
        num_episodes = len(episode_rewards)
        if done and print_freq is not None and len(episode_rewards) % print_freq == 0:
            logger.record_tabular("steps", t)
            logger.record_tabular("episodes", num_episodes)
            logger.record_tabular("mean 100 episode reward", mean_100ep_reward)
            logger.record_tabular("% time spent exploring", int(100 * exploration.value(t)))
            logger.dump_tabular()
            # Plot using visdom
            logger.vizkv('eprewmean', t, mean_100ep_reward)

        if (checkpoint_freq is not None and t > learning_starts
                and num_episodes > 100 and t % checkpoint_freq == 0):
            if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward:
                if print_freq is not None:
                    logger.log("Saving model due to mean reward increase: {} -> {}".format(
                        saved_mean_reward, mean_100ep_reward))
                model.save(save_path)
                model_saved = True
                saved_mean_reward = mean_100ep_reward

    if model_saved:
        if print_freq is not None:
            logger.log("Restored model with mean reward: {}".format(saved_mean_reward))
        model.load(save_path)

    return model
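# Hedged usage sketch: training a small DQN agent with the deepq `learn` above. The module
# path `ptbaselines.algos.deepq` is an assumption based on imports used elsewhere in this
# repo; the snippet is illustrative and left commented out.
#
# import gym
# from ptbaselines.algos import deepq   # assumed path
#
# env = gym.make("CartPole-v0")
# model = deepq.learn(env, network='mlp', lr=1e-3, total_timesteps=100000,
#                     buffer_size=50000, exploration_fraction=0.1,
#                     exploration_final_eps=0.02, print_freq=10)
# model.save("cartpole_model.pth")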
def learn(*,
          network,
          env,
          total_timesteps,
          eval_env=None,
          seed=None,
          nsteps=2048,
          ent_coef=0.0,
          lr=3e-4,
          vf_coef=0.5,
          max_grad_norm=0.5,
          gamma=0.99,
          lam=0.95,
          log_interval=10,
          nminibatches=4,
          noptepochs=4,
          cliprange=0.2,
          save_interval=0,
          load_path=None,
          model_fn=None,
          update_fn=None,
          init_fn=None,
          mpi_rank_weight=1,
          comm=None,
          **network_kwargs):
    '''
    Learn a policy using the PPO algorithm (https://arxiv.org/abs/1707.06347)

    Parameters:
    ----------

    network:                        policy network architecture. Either a string (mlp, lstm, lnlstm, cnn_lstm,
                                    cnn, cnn_small, conv_only - see baselines.common/models.py for the full list)
                                    specifying a standard network architecture, or a function that takes an input
                                    tensor and returns a tuple (output_tensor, extra_feed), where output_tensor is
                                    the last network layer output, and extra_feed is None for feed-forward nets or
                                    a dictionary describing how to feed state into the network for recurrent nets.
                                    See common/models.py/lstm for more details on using recurrent nets in policies.

    env: baselines.common.vec_env.VecEnv
                                    environment. Needs to be vectorized for parallel environment simulation.
                                    The environments produced by gym.make can be wrapped using the
                                    baselines.common.vec_env.DummyVecEnv class.

    nsteps: int                     number of steps of the vectorized environment per update (i.e. batch size is
                                    nsteps * nenv where nenv is the number of environment copies simulated in parallel)

    total_timesteps: int            number of timesteps (i.e. number of actions taken in the environment)

    ent_coef: float                 policy entropy coefficient in the optimization objective

    lr: float or function           learning rate, constant or a schedule function [0,1] -> R+ where 1 is the
                                    beginning of training and 0 is the end of training.

    vf_coef: float                  value function loss coefficient in the optimization objective

    max_grad_norm: float or None    gradient norm clipping coefficient

    gamma: float                    discounting factor

    lam: float                      advantage estimation discounting factor (lambda in the paper)

    log_interval: int               number of timesteps between logging events

    nminibatches: int               number of training minibatches per update. For recurrent policies, should be
                                    smaller than or equal to the number of environments run in parallel.

    noptepochs: int                 number of training epochs per update

    cliprange: float or function    clipping range, constant or schedule function [0,1] -> R+ where 1 is the
                                    beginning of training and 0 is the end of training

    save_interval: int              number of timesteps between saving events

    load_path: str                  path to load the model from

    **network_kwargs:               keyword arguments to the policy / network builder. See
                                    baselines.common/policies.py/build_policy and arguments to a particular type of
                                    network. For instance, the 'mlp' network architecture has arguments num_hidden
                                    and num_layers.
    '''
    set_global_seeds(seed)

    if isinstance(lr, float):
        lr = constfn(lr)
    else:
        assert callable(lr)
    if isinstance(cliprange, float):
        cliprange = constfn(cliprange)
    else:
        assert callable(cliprange)
    total_timesteps = int(total_timesteps)

    policy = build_policy(env, network, **network_kwargs)

    # Get the nb of env
    nenvs = env.num_envs

    # Get state_space and action_space
    ob_space = env.observation_space
    ac_space = env.action_space

    # Calculate the batch_size
    nbatch = nenvs * nsteps
    nbatch_train = nbatch // nminibatches
    is_mpi_root = (MPI is None or MPI.COMM_WORLD.Get_rank() == 0)

    # Instantiate the model object (that creates act_model and train_model)
    if model_fn is None:
        from ptbaselines.algos.ppo2.model import Model
        model_fn = Model

    model = model_fn(policy=policy, ob_space=ob_space, ac_space=ac_space, nbatch_act=nenvs,
                     nbatch_train=nbatch_train, nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef,
                     max_grad_norm=max_grad_norm, comm=comm, mpi_rank_weight=mpi_rank_weight)

    if load_path is not None:
        model.load(load_path)

    # Instantiate the runner object
    runner = Runner(env=env, model=model, nsteps=nsteps, gamma=gamma, lam=lam)
    if eval_env is not None:
        eval_runner = Runner(env=eval_env, model=model, nsteps=nsteps, gamma=gamma, lam=lam)

    epinfobuf = deque(maxlen=100)
    if eval_env is not None:
        eval_epinfobuf = deque(maxlen=100)

    if init_fn is not None:
        init_fn()

    # Start total timer
    tfirststart = time.perf_counter()

    nupdates = total_timesteps // nbatch
    for update in range(1, nupdates + 1):
        assert nbatch % nminibatches == 0
        # Start timer
        tstart = time.perf_counter()
        frac = 1.0 - (update - 1.0) / nupdates
        # Calculate the learning rate
        lrnow = lr(frac)
        # Calculate the cliprange
        cliprangenow = cliprange(frac)

        if update % log_interval == 0 and is_mpi_root:
            logger.info('Stepping environment...')

        # Get minibatch
        obs, returns, masks, actions, values, neglogpacs, states, epinfos = runner.run()  #pylint: disable=E0632
        if eval_env is not None:
            eval_obs, eval_returns, eval_masks, eval_actions, eval_values, eval_neglogpacs, eval_states, eval_epinfos = eval_runner.run()  #pylint: disable=E0632

        if update % log_interval == 0 and is_mpi_root:
            logger.info('Done.')

        epinfobuf.extend(epinfos)
        if eval_env is not None:
            eval_epinfobuf.extend(eval_epinfos)

        # Here what we're going to do is, for each minibatch, calculate the loss and append it.
        mblossvals = []
        if states is None:  # nonrecurrent version
            # Index of each element of batch_size
            # Create the indices array
            inds = np.arange(nbatch)
            for _ in range(noptepochs):
                # Randomize the indexes
                np.random.shuffle(inds)
                # 0 to batch_size with batch_train_size step
                for start in range(0, nbatch, nbatch_train):
                    end = start + nbatch_train
                    mbinds = inds[start:end]
                    slices = [arr[mbinds] for arr in (obs, returns, masks, actions, values, neglogpacs)]
                    slices = torch_utils.toTensor(slices)
                    mblossvals.append(
                        torch_utils.toNumpy(model.train(lrnow, cliprangenow, *slices)))
        else:  # recurrent version
            assert nenvs % nminibatches == 0
            envsperbatch = nenvs // nminibatches
            envinds = np.arange(nenvs)
            flatinds = np.arange(nenvs * nsteps).reshape(nenvs, nsteps)
            for _ in range(noptepochs):
                np.random.shuffle(envinds)
                for start in range(0, nenvs, envsperbatch):
                    end = start + envsperbatch
                    mbenvinds = envinds[start:end]
                    mbflatinds = flatinds[mbenvinds].ravel()
                    # Use a list here (a generator expression has no `append`)
                    slices = [arr[mbflatinds] for arr in (obs, returns, masks, actions, values, neglogpacs)]
                    slices.append(states[mbenvinds])
                    slices = torch_utils.toTensor(slices)
                    mblossvals.append(
                        torch_utils.toNumpy(model.train(lrnow, cliprangenow, *slices)))

        # Feedforward --> get losses --> update
        lossvals = np.mean(mblossvals, axis=0)
        # End timer
        tnow = time.perf_counter()
        # Calculate the fps (frames per second)
        fps = int(nbatch / (tnow - tstart))

        if update_fn is not None:
            update_fn(update)

        if update % log_interval == 0 or update == 1:
            # Check whether the value function is a good predictor of the returns (ev > 1)
            # or if it's just worse than predicting nothing (ev =< 0)
            ev = explained_variance(values, returns)
            logger.logkv("misc/serial_timesteps", update * nsteps)
            logger.logkv("misc/nupdates", update)
            logger.logkv("misc/total_timesteps", update * nbatch)
            logger.logkv("fps", fps)
            logger.logkv("batchsize", nbatch)
            logger.logkv("misc/explained_variance", float(ev))
            logger.logkv('eprewmean', safemean([epinfo['r'] for epinfo in epinfobuf]))
            logger.logkv('eplenmean', safemean([epinfo['l'] for epinfo in epinfobuf]))
            if eval_env is not None:
                logger.logkv('eval_eprewmean',
                             safemean([epinfo['r'] for epinfo in eval_epinfobuf]))
                logger.logkv('eval_eplenmean',
                             safemean([epinfo['l'] for epinfo in eval_epinfobuf]))
            logger.logkv('misc/time_elapsed', tnow - tfirststart)
            for (lossval, lossname) in zip(lossvals, model.loss_names):
                logger.logkv('loss/' + lossname, lossval)
            logger.dumpkvs()

            # Plot using visdom
            timesteps = update * nbatch
            logger.vizkv('eprewmean', timesteps, safemean([epinfo['r'] for epinfo in epinfobuf]))
            logger.vizkv('eplenmean', timesteps, safemean([epinfo['l'] for epinfo in epinfobuf]))
            for (lossval, lossname) in zip(lossvals, model.loss_names):
                logger.vizkv('loss/' + lossname, timesteps, lossval)

        if save_interval and (update % save_interval == 0 or update == 1) \
                and logger.get_dir() and is_mpi_root:
            checkdir = osp.join(logger.get_dir(), 'checkpoints')
            os.makedirs(checkdir, exist_ok=True)
            savepath = osp.join(checkdir, '%.5i' % update) + '.pth'
            print('Saving to', savepath)
            model.save(savepath)

    return model
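# Hedged usage sketch for the PPO `learn` above. The import paths are assumptions based on
# the `ptbaselines.algos.ppo2` module referenced in the function body; the snippet is
# illustrative and left commented out.
#
# import gym
# from ptbaselines.common.vec_env.dummy_vec_env import DummyVecEnv   # assumed path
# from ptbaselines.algos.ppo2.ppo2 import learn                      # assumed path
#
# venv = DummyVecEnv([lambda: gym.make("CartPole-v0") for _ in range(8)])
# model = learn(network='mlp', env=venv, total_timesteps=100000, nsteps=128,
#               nminibatches=4, noptepochs=4, gamma=0.99, lam=0.95, cliprange=0.2, lr=3e-4)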
def run(self):
    # We initialize the lists that will contain the mb of experiences
    mb_obs, mb_rewards, mb_actions, mb_values, mb_dones = [], [], [], [], []
    mb_states = self.states
    epinfos = []
    for n in range(self.nsteps):
        # Given observations, take action and value (V(s))
        # We already have self.obs because the Runner superclass runs self.obs[:] = env.reset() on init
        outputs = self.model.step(torch_utils.toTensor(self.obs).float(),
                                  S=self.states, M=self.dones)
        actions, values, states, _ = torch_utils.toNumpy(outputs)

        # Append the experiences
        mb_obs.append(np.copy(self.obs))
        mb_actions.append(actions)
        mb_values.append(values)
        mb_dones.append(self.dones)

        # Take actions in env and look at the results
        obs, rewards, dones, infos = self.env.step(actions)
        for info in infos:
            maybeepinfo = info.get('episode')
            if maybeepinfo:
                epinfos.append(maybeepinfo)
        self.states = states
        self.dones = dones
        self.obs = obs
        mb_rewards.append(rewards)
    mb_dones.append(self.dones)

    # Batch of steps to batch of rollouts
    mb_obs = np.asarray(mb_obs, dtype=np.float32).swapaxes(1, 0).reshape(self.batch_ob_shape)
    mb_rewards = np.asarray(mb_rewards, dtype=np.float32).swapaxes(1, 0)
    mb_actions = np.asarray(mb_actions).swapaxes(1, 0)
    mb_values = np.asarray(mb_values, dtype=np.float32).swapaxes(1, 0)
    mb_dones = np.asarray(mb_dones, dtype=bool).swapaxes(1, 0)
    mb_masks = mb_dones[:, :-1]
    mb_dones = mb_dones[:, 1:]

    if self.gamma > 0.0:
        # Discount / bootstrap off value fn
        last_values = self.model.value(torch_utils.toTensor(self.obs).float(),
                                       S=self.states, M=self.dones)
        last_values = torch_utils.toNumpy(last_values).tolist()
        for n, (rewards, dones, value) in enumerate(zip(mb_rewards, mb_dones, last_values)):
            rewards = rewards.tolist()
            dones = dones.tolist()
            if dones[-1] == 0:
                rewards = discount_with_dones(rewards + [value], dones + [0], self.gamma)[:-1]
            else:
                rewards = discount_with_dones(rewards, dones, self.gamma)
            mb_rewards[n] = rewards

    mb_actions = mb_actions.reshape(mb_actions.shape[0] * mb_actions.shape[1], *mb_actions.shape[2:])
    mb_rewards = mb_rewards.flatten()
    mb_values = mb_values.flatten()
    mb_masks = mb_masks.flatten()
    return mb_obs, mb_states, mb_rewards, mb_masks, mb_actions, mb_values, epinfos
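# Hedged sketch of the n-step return computed by `discount_with_dones` above (the helper is
# assumed to follow the OpenAI Baselines implementation): walk the rewards backwards and
# zero the running return whenever an episode ended, so discounting never crosses an
# episode boundary.
def discount_with_dones_sketch(rewards, dones, gamma):
    discounted = []
    running = 0.0
    for reward, done in zip(rewards[::-1], dones[::-1]):
        running = reward + gamma * running * (1.0 - done)
        discounted.append(running)
    return discounted[::-1]

# Example: rewards [1, 1, 1] with a terminal at the second step and gamma = 0.9 give
# [1 + 0.9 * 1, 1, 1] = [1.9, 1.0, 1.0]; the terminal stops discounting across episodes.
# discount_with_dones_sketch([1.0, 1.0, 1.0], [0.0, 1.0, 0.0], 0.9)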