def train(env_id, num_timesteps, seed): from deephyper.search.nas.baselines.ppo1 import mlp_policy, pposgd_simple import deephyper.search.nas.baselines.common.tf_util as U rank = MPI.COMM_WORLD.Get_rank() sess = U.single_threaded_session() sess.__enter__() mujoco_py.ignore_mujoco_warnings().__enter__() workerseed = seed + 10000 * rank set_global_seeds(workerseed) env = make_robotics_env(env_id, workerseed, rank=rank) def policy_fn(name, ob_space, ac_space): return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space, hid_size=256, num_hid_layers=3) pposgd_simple.learn( env, policy_fn, max_timesteps=num_timesteps, timesteps_per_actorbatch=2048, clip_param=0.2, entcoeff=0.0, optim_epochs=5, optim_stepsize=3e-4, optim_batchsize=256, gamma=0.99, lam=0.95, schedule='linear', ) env.close()
def make_vec_env(env_id, env_type, num_env, seed, wrapper_kwargs=None, start_index=0, reward_scale=1.0, flatten_dict_observations=True, gamestate=None): """ Create a wrapped, monitored SubprocVecEnv for Atari and MuJoCo. """ wrapper_kwargs = wrapper_kwargs or {} mpi_rank = MPI.COMM_WORLD.Get_rank() if MPI else 0 seed = seed + 10000 * mpi_rank if seed is not None else None logger_dir = logger.get_dir() def make_thunk(rank): return lambda: make_env( env_id=env_id, env_type=env_type, mpi_rank=mpi_rank, subrank=rank, seed=seed, reward_scale=reward_scale, gamestate=gamestate, flatten_dict_observations=flatten_dict_observations, wrapper_kwargs=wrapper_kwargs, logger_dir=logger_dir ) set_global_seeds(seed) if num_env > 1: return SubprocVecEnv([make_thunk(i + start_index) for i in range(num_env)]) else: return DummyVecEnv([make_thunk(start_index)])
def train(env, seed, policy_fn, reward_giver, dataset, algo, g_step, d_step, policy_entcoeff, num_timesteps, save_per_iter, checkpoint_dir, log_dir, pretrained, BC_max_iter, task_name=None): pretrained_weight = None if pretrained and (BC_max_iter > 0): # Pretrain with behavior cloning from deephyper.search.nas.baselines.gail import behavior_clone pretrained_weight = behavior_clone.learn(env, policy_fn, dataset, max_iters=BC_max_iter) if algo == 'trpo': from deephyper.search.nas.baselines.gail import trpo_mpi # Set up for MPI seed rank = MPI.COMM_WORLD.Get_rank() if rank != 0: logger.set_level(logger.DISABLED) workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank() set_global_seeds(workerseed) env.seed(workerseed) trpo_mpi.learn(env, policy_fn, reward_giver, dataset, rank, pretrained=pretrained, pretrained_weight=pretrained_weight, g_step=g_step, d_step=d_step, entcoeff=policy_entcoeff, max_timesteps=num_timesteps, ckpt_dir=checkpoint_dir, log_dir=log_dir, save_per_iter=save_per_iter, timesteps_per_batch=1024, max_kl=0.01, cg_iters=10, cg_damping=0.1, gamma=0.995, lam=0.97, vf_iters=5, vf_stepsize=1e-3, task_name=task_name) else: raise NotImplementedError
def main(args): U.make_session(num_cpu=1).__enter__() set_global_seeds(args.seed) env = gym.make(args.env_id) def policy_fn(name, ob_space, ac_space, reuse=False): return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space, reuse=reuse, hid_size=args.policy_hidden_size, num_hid_layers=2) env = bench.Monitor(env, logger.get_dir() and osp.join(logger.get_dir(), "monitor.json")) env.seed(args.seed) gym.logger.setLevel(logging.WARN) task_name = get_task_name(args) args.checkpoint_dir = osp.join(args.checkpoint_dir, task_name) args.log_dir = osp.join(args.log_dir, task_name) dataset = Mujoco_Dset(expert_path=args.expert_path, traj_limitation=args.traj_limitation) savedir_fname = learn(env, policy_fn, dataset, max_iters=args.BC_max_iter, ckpt_dir=args.checkpoint_dir, log_dir=args.log_dir, task_name=task_name, verbose=True) avg_len, avg_ret = runner(env, policy_fn, savedir_fname, timesteps_per_batch=1024, number_trajs=10, stochastic_policy=args.stochastic_policy, save=args.save_sample, reuse=True)
def make_robotics_env(env_id, seed, rank=0): """ Create a wrapped, monitored gym.Env for MuJoCo. """ set_global_seeds(seed) env = gym.make(env_id) env = FlattenDictWrapper(env, ['observation', 'desired_goal']) env = Monitor( env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)), info_keywords=('is_success',)) env.seed(seed) return env
def main(args): U.make_session(num_cpu=1).__enter__() set_global_seeds(args.seed) print('Evaluating {}'.format(args.env)) bc_log = evaluate_env(args.env, args.seed, args.policy_hidden_size, args.stochastic_policy, False, 'BC') print('Evaluation for {}'.format(args.env)) print(bc_log) gail_log = evaluate_env(args.env, args.seed, args.policy_hidden_size, args.stochastic_policy, True, 'gail') print('Evaluation for {}'.format(args.env)) print(gail_log) plot(args.env, bc_log, gail_log, args.stochastic_policy)
def make_mujoco_env(env_id, seed, reward_scale=1.0): """ Create a wrapped, monitored gym.Env for MuJoCo. """ rank = MPI.COMM_WORLD.Get_rank() myseed = seed + 1000 * rank if seed is not None else None set_global_seeds(myseed) env = gym.make(env_id) logger_path = None if logger.get_dir() is None else os.path.join( logger.get_dir(), str(rank)) env = Monitor(env, logger_path, allow_early_resets=True) env.seed(seed) if reward_scale != 1.0: from deephyper.search.nas.baselines.common.retro_wrappers import RewardScaler env = RewardScaler(env, reward_scale) return env
def main(policy_file, seed, n_test_rollouts, render): set_global_seeds(seed) # Load policy. with open(policy_file, 'rb') as f: policy = pickle.load(f) env_name = policy.info['env_name'] # Prepare params. params = config.DEFAULT_PARAMS if env_name in config.DEFAULT_ENV_PARAMS: # merge env-specific parameters in params.update(config.DEFAULT_ENV_PARAMS[env_name]) params['env_name'] = env_name params = config.prepare_params(params) config.log_params(params, logger=logger) dims = config.configure_dims(params) eval_params = { 'exploit': True, 'use_target_net': params['test_with_polyak'], 'compute_Q': True, 'rollout_batch_size': 1, 'render': bool(render), } for name in ['T', 'gamma', 'noise_eps', 'random_eps']: eval_params[name] = params[name] evaluator = RolloutWorker(params['make_env'], policy, dims, logger, **eval_params) evaluator.seed(seed) # Run evaluation. evaluator.clear_history() for _ in range(n_test_rollouts): evaluator.generate_rollouts() # record logs for key, val in evaluator.logs('test'): logger.record_tabular(key, np.mean(val)) logger.dump_tabular()
def main(args): U.make_session(num_cpu=1).__enter__() set_global_seeds(args.seed) env = gym.make(args.env_id) def policy_fn(name, ob_space, ac_space, reuse=False): return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space, reuse=reuse, hid_size=args.policy_hidden_size, num_hid_layers=2) env = bench.Monitor( env, logger.get_dir() and osp.join(logger.get_dir(), "monitor.json")) env.seed(args.seed) gym.logger.setLevel(logging.WARN) task_name = get_task_name(args) args.checkpoint_dir = osp.join(args.checkpoint_dir, task_name) args.log_dir = osp.join(args.log_dir, task_name) if args.task == 'train': dataset = Mujoco_Dset(expert_path=args.expert_path, traj_limitation=args.traj_limitation) reward_giver = TransitionClassifier(env, args.adversary_hidden_size, entcoeff=args.adversary_entcoeff) train(env, args.seed, policy_fn, reward_giver, dataset, args.algo, args.g_step, args.d_step, args.policy_entcoeff, args.num_timesteps, args.save_per_iter, args.checkpoint_dir, args.log_dir, args.pretrained, args.BC_max_iter, task_name) elif args.task == 'evaluate': runner(env, policy_fn, args.load_model_path, timesteps_per_batch=1024, number_trajs=10, stochastic_policy=args.stochastic_policy, save=args.save_sample) else: raise NotImplementedError env.close()
def train(env_id, num_timesteps, seed): from deephyper.search.nas.baselines.ppo1 import pposgd_simple, cnn_policy import deephyper.search.nas.baselines.common.tf_util as U rank = MPI.COMM_WORLD.Get_rank() sess = U.single_threaded_session() sess.__enter__() if rank == 0: logger.configure() else: logger.configure(format_strs=[]) workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank( ) if seed is not None else None set_global_seeds(workerseed) env = make_atari(env_id) def policy_fn(name, ob_space, ac_space): # pylint: disable=W0613 return cnn_policy.CnnPolicy(name=name, ob_space=ob_space, ac_space=ac_space) env = bench.Monitor( env, logger.get_dir() and osp.join(logger.get_dir(), str(rank))) env.seed(workerseed) env = wrap_deepmind(env) env.seed(workerseed) pposgd_simple.learn(env, policy_fn, max_timesteps=int(num_timesteps * 1.1), timesteps_per_actorbatch=256, clip_param=0.2, entcoeff=0.01, optim_epochs=4, optim_stepsize=1e-3, optim_batchsize=64, gamma=0.99, lam=0.95, schedule='linear') env.close()
def learn(*, network, env, total_timesteps, eval_env=None, seed=None, nsteps=128, ent_coef=0.0, lr=3e-4, vf_coef=0.5, max_grad_norm=0.5, gamma=0.99, lam=0.95, log_interval=10, nminibatches=1, noptepochs=4, cliprange=0.2, save_interval=10, load_path=None, model_fn=None, **network_kwargs): """ Learn policy using PPO algorithm (https://arxiv.org/abs/1707.06347) Parameters: ---------- network: policy network search_space. Either string (mlp, lstm, lnlstm, cnn_lstm, cnn, cnn_small, conv_only - see baselines.common/models.py for full list) specifying the standard network search_space, or a function that takes tensorflow tensor as input and returns tuple (output_tensor, extra_feed) where output tensor is the last network layer output, extra_feed is None for feed-forward neural nets, and extra_feed is a dictionary describing how to feed state into the network for recurrent neural nets. See common/models.py/lstm for more details on using recurrent nets in policies env: baselines.common.vec_env.VecEnv environment. Needs to be vectorized for parallel environment simulation. The environments produced by gym.make can be wrapped using baselines.common.vec_env.DummyVecEnv class. nsteps: int number of steps of the vectorized environment per update (i.e. batch size is nsteps * nenv where nenv is number of environment copies simulated in parallel) total_timesteps: int number of timesteps (i.e. number of actions taken in the environment) ent_coef: float policy entropy coefficient in the optimization objective lr: float or function learning rate, constant or a schedule function [0,1] -> R+ where 1 is beginning of the training and 0 is the end of the training. vf_coef: float value function loss coefficient in the optimization objective max_grad_norm: float or None gradient norm clipping coefficient gamma: float discounting factor for rewards lam: float advantage estimation discounting factor (lambda in the paper) log_interval: int number of timesteps between logging events nminibatches: int number of training minibatches per update. For recurrent policies, should be smaller or equal than number of environments run in parallel. noptepochs: int number of training epochs per update cliprange: float or function clipping range, constant or schedule function [0,1] -> R+ where 1 is beginning of the training and 0 is the end of the training save_interval: int number of timesteps between saving events load_path: str path to load the model from **network_kwargs: keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network For instance, 'mlp' network search_space has arguments num_hidden and num_layers. """ set_global_seeds(seed) if isinstance(lr, float): lr = constfn(lr) else: assert callable(lr) if isinstance(cliprange, float): cliprange = constfn(cliprange) else: assert callable(cliprange) #total_timesteps = int(total_timesteps) policy = build_ppo_policy(env, network, **network_kwargs) # Get the nb of env nenvs = env.num_envs # Get state_space and action_space ob_space = env.observation_space ac_space = env.action_space # Calculate the batch_size nbatch = nenvs * nsteps nbatch_train = nbatch // nminibatches # Instantiate the model object (that creates act_model and train_model) if model_fn is None: from deephyper.search.nas.baselines.ppo2.model import Model model_fn = Model model = model_fn(policy=policy, ob_space=ob_space, ac_space=ac_space, nbatch_act=nenvs, nbatch_train=nbatch_train, nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef, max_grad_norm=max_grad_norm) if load_path is not None: model.load(load_path) allvars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=model.name) display_var_info(allvars) # Instantiate the runner object runner = Runner(env=env, model=model, nsteps=nsteps, gamma=gamma, ob_space=ob_space, lam=lam) if eval_env is not None: eval_runner = Runner(env=eval_env, model=model, nsteps=nsteps, gamma=gamma, ob_space=ob_space, lam=lam) epinfobuf = deque(maxlen=100) if eval_env is not None: eval_epinfobuf = deque(maxlen=100) # Start total timer tfirststart = time.perf_counter() nupdates = total_timesteps // nbatch # for update in range(1, nupdates + 1): update = 1 while True: if not math.isnan(nupdates) and update >= nupdates: break assert nbatch % nminibatches == 0 # Start timer tstart = time.perf_counter() frac = 1.0 - (update - 1.0) / nupdates # Calculate the learning rate lrnow = lr(frac) # Calculate the cliprange cliprangenow = cliprange(frac) # Get minibatch minibatch = runner.run() if eval_env is not None: eval_minibatch = eval_runner.run() _eval_obs = eval_minibatch['observations'] # noqa: F841 _eval_returns = eval_minibatch['returns'] # noqa: F841 _eval_masks = eval_minibatch['masks'] # noqa: F841 _eval_actions = eval_minibatch['actions'] # noqa: F841 _eval_values = eval_minibatch['values'] # noqa: F841 _eval_neglogpacs = eval_minibatch['neglogpacs'] # noqa: F841 _eval_states = eval_minibatch['state'] # noqa: F841 eval_epinfos = eval_minibatch['epinfos'] epinfobuf.extend(minibatch.pop('epinfos')) if eval_env is not None: eval_epinfobuf.extend(eval_epinfos) # Here what we're going to do is for each minibatch calculate the loss and append it. mblossvals = [] # Index of each element of batch_size # Create the indices array inds = np.arange(nbatch) for _ in range(noptepochs): # Randomize the indexes np.random.shuffle(inds) # 0 to batch_size with batch_train_size step for start in range(0, nbatch, nbatch_train): end = start + nbatch_train mbinds = inds[start:end] slices = {key: minibatch[key][mbinds] for key in minibatch} mblossvals.append(model.train(lrnow, cliprangenow, **slices)) # Feedforward --> get losses --> update lossvals = np.mean(mblossvals, axis=0) # End timer tnow = time.perf_counter() # Calculate the fps (frame per second) fps = int(nbatch / (tnow - tstart)) if update % log_interval == 0 or update == 1: # Calculates if value function is a good predicator of the returns (ev > 1) # or if it's just worse than predicting nothing (ev =< 0) ev = explained_variance(minibatch['values'], minibatch['returns']) logger.logkv("serial_timesteps", update * nsteps) logger.logkv("nupdates", update) logger.logkv("total_timesteps", update * nbatch) logger.logkv("fps", fps) logger.logkv("explained_variance", float(ev)) logger.logkv('eprewmean', safemean([epinfo['r'] for epinfo in epinfobuf])) logger.logkv('eplenmean', safemean([epinfo['l'] for epinfo in epinfobuf])) logger.logkv('rewards_per_step', safemean(minibatch['rewards'])) logger.logkv('advantages_per_step', safemean(minibatch['advs'])) if eval_env is not None: logger.logkv( 'eval_eprewmean', safemean([epinfo['r'] for epinfo in eval_epinfobuf])) logger.logkv( 'eval_eplenmean', safemean([epinfo['l'] for epinfo in eval_epinfobuf])) logger.logkv('time_elapsed', tnow - tfirststart) for (lossval, lossname) in zip(lossvals, model.loss_names): logger.logkv(lossname, lossval) if MPI is None or MPI.COMM_WORLD.Get_rank() == 0: logger.dumpkvs() if save_interval and (update % save_interval == 0 or update == 1) and logger.get_dir() and ( MPI is None or MPI.COMM_WORLD.Get_rank() == 0): checkdir = osp.join(logger.get_dir(), 'checkpoints') os.makedirs(checkdir, exist_ok=True) savepath = osp.join(checkdir, '%.5i' % update) print('Saving to', savepath) model.save(savepath) del minibatch update += 1 return model
def learn(network, env, seed=None, nsteps=5, total_timesteps=int(80e6), vf_coef=0.5, ent_coef=0.01, max_grad_norm=0.5, lr=7e-4, lrschedule='linear', epsilon=1e-5, alpha=0.99, gamma=0.99, log_interval=100, load_path=None, **network_kwargs): ''' Main entrypoint for A2C algorithm. Train a policy with given network search_space on a given environment using a2c algorithm. Parameters: ----------- network: policy network search_space. Either string (mlp, lstm, lnlstm, cnn_lstm, cnn, cnn_small, conv_only - see baselines.common/models.py for full list) specifying the standard network search_space, or a function that takes tensorflow tensor as input and returns tuple (output_tensor, extra_feed) where output tensor is the last network layer output, extra_feed is None for feed-forward neural nets, and extra_feed is a dictionary describing how to feed state into the network for recurrent neural nets. See baselines.common/policies.py/lstm for more details on using recurrent nets in policies env: RL environment. Should implement interface similar to VecEnv (baselines.common/vec_env) or be wrapped with DummyVecEnv (baselines.common/vec_env/dummy_vec_env.py) seed: seed to make random number sequence in the alorightm reproducible. By default is None which means seed from system noise generator (not reproducible) nsteps: int, number of steps of the vectorized environment per update (i.e. batch size is nsteps * nenv where nenv is number of environment copies simulated in parallel) total_timesteps: int, total number of timesteps to train on (default: 80M) vf_coef: float, coefficient in front of value function loss in the total loss function (default: 0.5) ent_coef: float, coeffictiant in front of the policy entropy in the total loss function (default: 0.01) max_gradient_norm: float, gradient is clipped to have global L2 norm no more than this value (default: 0.5) lr: float, learning rate for RMSProp (current implementation has RMSProp hardcoded in) (default: 7e-4) lrschedule: schedule of learning rate. Can be 'linear', 'constant', or a function [0..1] -> [0..1] that takes fraction of the training progress as input and returns fraction of the learning rate (specified as lr) as output epsilon: float, RMSProp epsilon (stabilizes square root computation in denominator of RMSProp update) (default: 1e-5) alpha: float, RMSProp decay parameter (default: 0.99) gamma: float, reward discounting parameter (default: 0.99) log_interval: int, specifies how frequently the logs are printed out (default: 100) **network_kwargs: keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network For instance, 'mlp' network search_space has arguments num_hidden and num_layers. ''' set_global_seeds(seed) # Get the nb of env nenvs = env.num_envs policy = build_policy(env, network, **network_kwargs) # Instantiate the model object (that creates step_model and train_model) model = Model(policy=policy, env=env, nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef, max_grad_norm=max_grad_norm, lr=lr, alpha=alpha, epsilon=epsilon, total_timesteps=total_timesteps, lrschedule=lrschedule) if load_path is not None: model.load(load_path) # Instantiate the runner object runner = Runner(env, model, nsteps=nsteps, gamma=gamma) epinfobuf = deque(maxlen=100) # Calculate the batch_size nbatch = nenvs * nsteps # Start total timer tstart = time.time() for update in range(1, total_timesteps // nbatch + 1): # Get mini batch of experiences obs, states, rewards, masks, actions, values, epinfos = runner.run() epinfobuf.extend(epinfos) policy_loss, value_loss, policy_entropy = model.train( obs, states, rewards, masks, actions, values) nseconds = time.time() - tstart # Calculate the fps (frame per second) fps = int((update * nbatch) / nseconds) if update % log_interval == 0 or update == 1: # Calculates if value function is a good predicator of the returns (ev > 1) # or if it's just worse than predicting nothing (ev =< 0) ev = explained_variance(values, rewards) logger.record_tabular("nupdates", update) logger.record_tabular("total_timesteps", update * nbatch) logger.record_tabular("fps", fps) logger.record_tabular("policy_entropy", float(policy_entropy)) logger.record_tabular("value_loss", float(value_loss)) logger.record_tabular("explained_variance", float(ev)) logger.record_tabular( "eprewmean", safemean([epinfo['r'] for epinfo in epinfobuf])) logger.record_tabular( "eplenmean", safemean([epinfo['l'] for epinfo in epinfobuf])) logger.dump_tabular() return model
def learn( *, network, env, total_timesteps, timesteps_per_batch=1024, # what to train on max_kl=0.001, cg_iters=10, gamma=0.99, lam=1.0, # advantage estimation seed=None, ent_coef=0.0, cg_damping=1e-2, vf_stepsize=3e-4, vf_iters=3, max_episodes=0, max_iters=0, # time constraint callback=None, load_path=None, **network_kwargs): ''' learn a policy function with TRPO algorithm Parameters: ---------- network neural network to learn. Can be either string ('mlp', 'cnn', 'lstm', 'lnlstm' for basic types) or function that takes input placeholder and returns tuple (output, None) for feedforward nets or (output, (state_placeholder, state_output, mask_placeholder)) for recurrent nets env environment (one of the gym environments or wrapped via baselines.common.vec_env.VecEnv-type class timesteps_per_batch timesteps per gradient estimation batch max_kl max KL divergence between old policy and new policy ( KL(pi_old || pi) ) ent_coef coefficient of policy entropy term in the optimization objective cg_iters number of iterations of conjugate gradient algorithm cg_damping conjugate gradient damping vf_stepsize learning rate for adam optimizer used to optimie value function loss vf_iters number of iterations of value function optimization iterations per each policy optimization step total_timesteps max number of timesteps max_episodes max number of episodes max_iters maximum number of policy optimization iterations callback function to be called with (locals(), globals()) each policy optimization step load_path str, path to load the model from (default: None, i.e. no model is loaded) **network_kwargs keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network Returns: ------- learnt model ''' if MPI is not None: nworkers = MPI.COMM_WORLD.Get_size() rank = MPI.COMM_WORLD.Get_rank() else: nworkers = 1 rank = 0 cpus_per_worker = 1 U.get_session( config=tf.ConfigProto(allow_soft_placement=True, inter_op_parallelism_threads=cpus_per_worker, intra_op_parallelism_threads=cpus_per_worker)) policy = build_policy(env, network, value_network='copy', **network_kwargs) set_global_seeds(seed) np.set_printoptions(precision=3) # Setup losses and stuff # ---------------------------------------- ob_space = env.observation_space ac_space = env.action_space ob = observation_placeholder(ob_space) with tf.variable_scope("pi"): pi = policy(observ_placeholder=ob) with tf.variable_scope("oldpi"): oldpi = policy(observ_placeholder=ob) atarg = tf.placeholder( dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return ac = pi.pdtype.sample_placeholder([None]) kloldnew = oldpi.pd.kl(pi.pd) ent = pi.pd.entropy() meankl = tf.reduce_mean(kloldnew) meanent = tf.reduce_mean(ent) entbonus = ent_coef * meanent vferr = tf.reduce_mean(tf.square(pi.vf - ret)) ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # advantage * pnew / pold surrgain = tf.reduce_mean(ratio * atarg) optimgain = surrgain + entbonus losses = [optimgain, meankl, entbonus, surrgain, meanent] loss_names = ["optimgain", "meankl", "entloss", "surrgain", "entropy"] dist = meankl all_var_list = get_trainable_variables("pi") # var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("pol")] # vf_var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("vf")] var_list = get_pi_trainable_variables("pi") vf_var_list = get_vf_trainable_variables("pi") vfadam = MpiAdam(vf_var_list) get_flat = U.GetFlat(var_list) set_from_flat = U.SetFromFlat(var_list) klgrads = tf.gradients(dist, var_list) flat_tangent = tf.placeholder(dtype=tf.float32, shape=[None], name="flat_tan") shapes = [var.get_shape().as_list() for var in var_list] start = 0 tangents = [] for shape in shapes: sz = U.intprod(shape) tangents.append(tf.reshape(flat_tangent[start:start + sz], shape)) start += sz gvp = tf.add_n([ tf.reduce_sum(g * tangent) for (g, tangent) in zipsame(klgrads, tangents) ]) #pylint: disable=E1111 fvp = U.flatgrad(gvp, var_list) assign_old_eq_new = U.function( [], [], updates=[ tf.assign(oldv, newv) for (oldv, newv) in zipsame(get_variables("oldpi"), get_variables("pi")) ]) compute_losses = U.function([ob, ac, atarg], losses) compute_lossandgrad = U.function([ob, ac, atarg], losses + [U.flatgrad(optimgain, var_list)]) compute_fvp = U.function([flat_tangent, ob, ac, atarg], fvp) compute_vflossandgrad = U.function([ob, ret], U.flatgrad(vferr, vf_var_list)) @contextmanager def timed(msg): if rank == 0: print(colorize(msg, color='magenta')) tstart = time.time() yield print( colorize("done in %.3f seconds" % (time.time() - tstart), color='magenta')) else: yield def allmean(x): assert isinstance(x, np.ndarray) if MPI is not None: out = np.empty_like(x) MPI.COMM_WORLD.Allreduce(x, out, op=MPI.SUM) out /= nworkers else: out = np.copy(x) return out U.initialize() if load_path is not None: pi.load(load_path) th_init = get_flat() if MPI is not None: MPI.COMM_WORLD.Bcast(th_init, root=0) set_from_flat(th_init) vfadam.sync() print("Init param sum", th_init.sum(), flush=True) # Prepare for rollouts # ---------------------------------------- seg_gen = traj_segment_generator(pi, env, timesteps_per_batch, stochastic=True) episodes_so_far = 0 timesteps_so_far = 0 iters_so_far = 0 tstart = time.time() lenbuffer = deque(maxlen=40) # rolling buffer for episode lengths rewbuffer = deque(maxlen=40) # rolling buffer for episode rewards if sum([max_iters > 0, total_timesteps > 0, max_episodes > 0]) == 0: # noththing to be done return pi assert sum([max_iters>0, total_timesteps>0, max_episodes>0]) < 2, \ 'out of max_iters, total_timesteps, and max_episodes only one should be specified' while True: if callback: callback(locals(), globals()) if total_timesteps and timesteps_so_far >= total_timesteps: break elif max_episodes and episodes_so_far >= max_episodes: break elif max_iters and iters_so_far >= max_iters: break logger.log("********** Iteration %i ************" % iters_so_far) with timed("sampling"): seg = seg_gen.__next__() add_vtarg_and_adv(seg, gamma, lam) # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets)) ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg[ "tdlamret"] vpredbefore = seg["vpred"] # predicted value function before udpate atarg = (atarg - atarg.mean() ) / atarg.std() # standardized advantage function estimate if hasattr(pi, "ret_rms"): pi.ret_rms.update(tdlamret) if hasattr(pi, "ob_rms"): pi.ob_rms.update(ob) # update running mean/std for policy args = seg["ob"], seg["ac"], atarg fvpargs = [arr[::5] for arr in args] def fisher_vector_product(p): return allmean(compute_fvp(p, *fvpargs)) + cg_damping * p assign_old_eq_new() # set old parameter values to new parameter values with timed("computegrad"): *lossbefore, g = compute_lossandgrad(*args) lossbefore = allmean(np.array(lossbefore)) g = allmean(g) if np.allclose(g, 0): logger.log("Got zero gradient. not updating") else: with timed("cg"): stepdir = cg(fisher_vector_product, g, cg_iters=cg_iters, verbose=rank == 0) assert np.isfinite(stepdir).all() shs = .5 * stepdir.dot(fisher_vector_product(stepdir)) lm = np.sqrt(shs / max_kl) # logger.log("lagrange multiplier:", lm, "gnorm:", np.linalg.norm(g)) fullstep = stepdir / lm expectedimprove = g.dot(fullstep) surrbefore = lossbefore[0] stepsize = 1.0 thbefore = get_flat() for _ in range(10): thnew = thbefore + fullstep * stepsize set_from_flat(thnew) meanlosses = surr, kl, *_ = allmean( np.array(compute_losses(*args))) improve = surr - surrbefore logger.log("Expected: %.3f Actual: %.3f" % (expectedimprove, improve)) if not np.isfinite(meanlosses).all(): logger.log("Got non-finite value of losses -- bad!") elif kl > max_kl * 1.5: logger.log("violated KL constraint. shrinking step.") elif improve < 0: logger.log("surrogate didn't improve. shrinking step.") else: logger.log("Stepsize OK!") break stepsize *= .5 else: logger.log("couldn't compute a good step") set_from_flat(thbefore) if nworkers > 1 and iters_so_far % 20 == 0: paramsums = MPI.COMM_WORLD.allgather( (thnew.sum(), vfadam.getflat().sum())) # list of tuples assert all( np.allclose(ps, paramsums[0]) for ps in paramsums[1:]) for (lossname, lossval) in zip(loss_names, meanlosses): logger.record_tabular(lossname, lossval) with timed("vf"): for _ in range(vf_iters): for (mbob, mbret) in dataset.iterbatches( (seg["ob"], seg["tdlamret"]), include_final_partial_batch=False, batch_size=64): g = allmean(compute_vflossandgrad(mbob, mbret)) vfadam.update(g, vf_stepsize) logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret)) lrlocal = (seg["ep_lens"], seg["ep_rets"]) # local values if MPI is not None: listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples else: listoflrpairs = [lrlocal] lens, rews = map(flatten_lists, zip(*listoflrpairs)) lenbuffer.extend(lens) rewbuffer.extend(rews) logger.record_tabular("EpLenMean", np.mean(lenbuffer)) logger.record_tabular("EpRewMean", np.mean(rewbuffer)) logger.record_tabular("EpThisIter", len(lens)) episodes_so_far += len(lens) timesteps_so_far += sum(lens) iters_so_far += 1 logger.record_tabular("EpisodesSoFar", episodes_so_far) logger.record_tabular("TimestepsSoFar", timesteps_so_far) logger.record_tabular("TimeElapsed", time.time() - tstart) if rank == 0: logger.dump_tabular() return pi
def learn(network, env, seed, total_timesteps=int(40e6), gamma=0.99, log_interval=1, nprocs=32, nsteps=20, ent_coef=0.01, vf_coef=0.5, vf_fisher_coef=1.0, lr=0.25, max_grad_norm=0.5, kfac_clip=0.001, save_interval=None, lrschedule='linear', load_path=None, is_async=True, **network_kwargs): set_global_seeds(seed) if network == 'cnn': network_kwargs['one_dim_bias'] = True policy = build_policy(env, network, **network_kwargs) nenvs = env.num_envs ob_space = env.observation_space ac_space = env.action_space make_model = lambda: Model(policy, ob_space, ac_space, nenvs, total_timesteps, nprocs=nprocs, nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef, vf_fisher_coef=vf_fisher_coef, lr=lr, max_grad_norm=max_grad_norm, kfac_clip=kfac_clip, lrschedule=lrschedule, is_async=is_async) if save_interval and logger.get_dir(): import cloudpickle with open(osp.join(logger.get_dir(), 'make_model.pkl'), 'wb') as fh: fh.write(cloudpickle.dumps(make_model)) model = make_model() if load_path is not None: model.load(load_path) runner = Runner(env, model, nsteps=nsteps, gamma=gamma) epinfobuf = deque(maxlen=100) nbatch = nenvs * nsteps tstart = time.time() coord = tf.train.Coordinator() if is_async: enqueue_threads = model.q_runner.create_threads(model.sess, coord=coord, start=True) else: enqueue_threads = [] for update in range(1, total_timesteps // nbatch + 1): obs, states, rewards, masks, actions, values, epinfos = runner.run() epinfobuf.extend(epinfos) policy_loss, value_loss, policy_entropy = model.train( obs, states, rewards, masks, actions, values) model.old_obs = obs nseconds = time.time() - tstart fps = int((update * nbatch) / nseconds) if update % log_interval == 0 or update == 1: ev = explained_variance(values, rewards) logger.record_tabular("nupdates", update) logger.record_tabular("total_timesteps", update * nbatch) logger.record_tabular("fps", fps) logger.record_tabular("policy_entropy", float(policy_entropy)) logger.record_tabular("policy_loss", float(policy_loss)) logger.record_tabular("value_loss", float(value_loss)) logger.record_tabular("explained_variance", float(ev)) logger.record_tabular( "eprewmean", safemean([epinfo['r'] for epinfo in epinfobuf])) logger.record_tabular( "eplenmean", safemean([epinfo['l'] for epinfo in epinfobuf])) logger.dump_tabular() if save_interval and (update % save_interval == 0 or update == 1) and logger.get_dir(): savepath = osp.join(logger.get_dir(), 'checkpoint%.5i' % update) print('Saving to', savepath) model.save(savepath) coord.request_stop() coord.join(enqueue_threads) return model
def learn(env, network, seed=None, lr=5e-4, total_timesteps=100000, buffer_size=50000, exploration_fraction=0.1, exploration_final_eps=0.02, train_freq=1, batch_size=32, print_freq=100, checkpoint_freq=10000, checkpoint_path=None, learning_starts=1000, gamma=1.0, target_network_update_freq=500, prioritized_replay=False, prioritized_replay_alpha=0.6, prioritized_replay_beta0=0.4, prioritized_replay_beta_iters=None, prioritized_replay_eps=1e-6, param_noise=False, callback=None, load_path=None, **network_kwargs): """Train a deepq model. Parameters ------- env: gym.Env environment to train on network: string or a function neural network to use as a q function approximator. If string, has to be one of the names of registered models in baselines.common.models (mlp, cnn, conv_only). If a function, should take an observation tensor and return a latent variable tensor, which will be mapped to the Q function heads (see build_q_func in baselines.deepq.models for details on that) seed: int or None prng seed. The runs with the same seed "should" give the same results. If None, no seeding is used. lr: float learning rate for adam optimizer total_timesteps: int number of env steps to optimizer for buffer_size: int size of the replay buffer exploration_fraction: float fraction of entire training period over which the exploration rate is annealed exploration_final_eps: float final value of random action probability train_freq: int update the model every `train_freq` steps. set to None to disable printing batch_size: int size of a batched sampled from replay buffer for training print_freq: int how often to print out training progress set to None to disable printing checkpoint_freq: int how often to save the model. This is so that the best version is restored at the end of the training. If you do not wish to restore the best version at the end of the training set this variable to None. learning_starts: int how many steps of the model to collect transitions for before learning starts gamma: float discount factor target_network_update_freq: int update the target network every `target_network_update_freq` steps. prioritized_replay: True if True prioritized replay buffer will be used. prioritized_replay_alpha: float alpha parameter for prioritized replay buffer prioritized_replay_beta0: float initial value of beta for prioritized replay buffer prioritized_replay_beta_iters: int number of iterations over which beta will be annealed from initial value to 1.0. If set to None equals to total_timesteps. prioritized_replay_eps: float epsilon to add to the TD errors when updating priorities. param_noise: bool whether or not to use parameter space noise (https://arxiv.org/abs/1706.01905) callback: (locals, globals) -> None function called at every steps with state of the algorithm. If callback returns true training stops. load_path: str path to load the model from. (default: None) **network_kwargs additional keyword arguments to pass to the network builder. Returns ------- act: ActWrapper Wrapper over act function. Adds ability to save it and load it. See header of baselines/deepq/categorical.py for details on the act function. """ # Create all the functions necessary to train the model sess = get_session() set_global_seeds(seed) q_func = build_q_func(network, **network_kwargs) # capture the shape outside the closure so that the env object is not serialized # by cloudpickle when serializing make_obs_ph observation_space = env.observation_space def make_obs_ph(name): return ObservationInput(observation_space, name=name) act, train, update_target, debug = deepq.build_train( make_obs_ph=make_obs_ph, q_func=q_func, num_actions=env.action_space.n, optimizer=tf.train.AdamOptimizer(learning_rate=lr), gamma=gamma, grad_norm_clipping=10, param_noise=param_noise) act_params = { 'make_obs_ph': make_obs_ph, 'q_func': q_func, 'num_actions': env.action_space.n, } act = ActWrapper(act, act_params) # Create the replay buffer if prioritized_replay: replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha) if prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = total_timesteps beta_schedule = LinearSchedule(prioritized_replay_beta_iters, initial_p=prioritized_replay_beta0, final_p=1.0) else: replay_buffer = ReplayBuffer(buffer_size) beta_schedule = None # Create the schedule for exploration starting from 1. exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * total_timesteps), initial_p=1.0, final_p=exploration_final_eps) # Initialize the parameters and copy them to the target network. U.initialize() update_target() episode_rewards = [0.0] saved_mean_reward = None obs = env.reset() reset = True with tempfile.TemporaryDirectory() as td: td = checkpoint_path or td model_file = os.path.join(td, "model") model_saved = False if tf.train.latest_checkpoint(td) is not None: load_variables(model_file) logger.log('Loaded model from {}'.format(model_file)) model_saved = True elif load_path is not None: load_variables(load_path) logger.log('Loaded model from {}'.format(load_path)) for t in range(total_timesteps): if callback is not None: if callback(locals(), globals()): break # Take action and update exploration to the newest value kwargs = {} if not param_noise: update_eps = exploration.value(t) update_param_noise_threshold = 0. else: update_eps = 0. # Compute the threshold such that the KL divergence between perturbed and non-perturbed # policy is comparable to eps-greedy exploration with eps = exploration.value(t). # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017 # for detailed explanation. update_param_noise_threshold = -np.log(1. - exploration.value( t) + exploration.value(t) / float(env.action_space.n)) kwargs['reset'] = reset kwargs[ 'update_param_noise_threshold'] = update_param_noise_threshold kwargs['update_param_noise_scale'] = True action = act(np.array(obs)[None], update_eps=update_eps, **kwargs)[0] env_action = action reset = False new_obs, rew, done, _ = env.step(env_action) # Store transition in the replay buffer. replay_buffer.add(obs, action, rew, new_obs, float(done)) obs = new_obs episode_rewards[-1] += rew if done: obs = env.reset() episode_rewards.append(0.0) reset = True if t > learning_starts and t % train_freq == 0: # Minimize the error in Bellman's equation on a batch sampled from replay buffer. if prioritized_replay: experience = replay_buffer.sample( batch_size, beta=beta_schedule.value(t)) (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience else: obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample( batch_size) weights, batch_idxes = np.ones_like(rewards), None td_errors = train(obses_t, actions, rewards, obses_tp1, dones, weights) if prioritized_replay: new_priorities = np.abs(td_errors) + prioritized_replay_eps replay_buffer.update_priorities(batch_idxes, new_priorities) if t > learning_starts and t % target_network_update_freq == 0: # Update target network periodically. update_target() mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1) num_episodes = len(episode_rewards) if done and print_freq is not None and len( episode_rewards) % print_freq == 0: logger.record_tabular("steps", t) logger.record_tabular("episodes", num_episodes) logger.record_tabular("mean 100 episode reward", mean_100ep_reward) logger.record_tabular("% time spent exploring", int(100 * exploration.value(t))) logger.dump_tabular() if (checkpoint_freq is not None and t > learning_starts and num_episodes > 100 and t % checkpoint_freq == 0): if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward: if print_freq is not None: logger.log( "Saving model due to mean reward increase: {} -> {}" .format(saved_mean_reward, mean_100ep_reward)) save_variables(model_file) model_saved = True saved_mean_reward = mean_100ep_reward if model_saved: if print_freq is not None: logger.log("Restored model with mean reward: {}".format( saved_mean_reward)) load_variables(model_file) return act
def learn(network, env, seed=None, nsteps=20, total_timesteps=int(80e6), q_coef=0.5, ent_coef=0.01, max_grad_norm=10, lr=7e-4, lrschedule='linear', rprop_epsilon=1e-5, rprop_alpha=0.99, gamma=0.99, log_interval=100, buffer_size=50000, replay_ratio=4, replay_start=10000, c=10.0, trust_region=True, alpha=0.99, delta=1, load_path=None, **network_kwargs): ''' Main entrypoint for ACER (Actor-Critic with Experience Replay) algorithm (https://arxiv.org/pdf/1611.01224.pdf) Train an agent with given network search_space on a given environment using ACER. Parameters: ---------- network: policy network search_space. Either string (mlp, lstm, lnlstm, cnn_lstm, cnn, cnn_small, conv_only - see baselines.common/models.py for full list) specifying the standard network search_space, or a function that takes tensorflow tensor as input and returns tuple (output_tensor, extra_feed) where output tensor is the last network layer output, extra_feed is None for feed-forward neural nets, and extra_feed is a dictionary describing how to feed state into the network for recurrent neural nets. See baselines.common/policies.py/lstm for more details on using recurrent nets in policies env: environment. Needs to be vectorized for parallel environment simulation. The environments produced by gym.make can be wrapped using baselines.common.vec_env.DummyVecEnv class. nsteps: int, number of steps of the vectorized environment per update (i.e. batch size is nsteps * nenv where nenv is number of environment copies simulated in parallel) (default: 20) nstack: int, size of the frame stack, i.e. number of the frames passed to the step model. Frames are stacked along channel dimension (last image dimension) (default: 4) total_timesteps: int, number of timesteps (i.e. number of actions taken in the environment) (default: 80M) q_coef: float, value function loss coefficient in the optimization objective (analog of vf_coef for other actor-critic methods) ent_coef: float, policy entropy coefficient in the optimization objective (default: 0.01) max_grad_norm: float, gradient norm clipping coefficient. If set to None, no clipping. (default: 10), lr: float, learning rate for RMSProp (current implementation has RMSProp hardcoded in) (default: 7e-4) lrschedule: schedule of learning rate. Can be 'linear', 'constant', or a function [0..1] -> [0..1] that takes fraction of the training progress as input and returns fraction of the learning rate (specified as lr) as output rprop_epsilon: float, RMSProp epsilon (stabilizes square root computation in denominator of RMSProp update) (default: 1e-5) rprop_alpha: float, RMSProp decay parameter (default: 0.99) gamma: float, reward discounting factor (default: 0.99) log_interval: int, number of updates between logging events (default: 100) buffer_size: int, size of the replay buffer (default: 50k) replay_ratio: int, now many (on average) batches of data to sample from the replay buffer take after batch from the environment (default: 4) replay_start: int, the sampling from the replay buffer does not start until replay buffer has at least that many samples (default: 10k) c: float, importance weight clipping factor (default: 10) trust_region bool, whether or not algorithms estimates the gradient KL divergence between the old and updated policy and uses it to determine step size (default: True) delta: float, max KL divergence between the old policy and updated policy (default: 1) alpha: float, momentum factor in the Polyak (exponential moving average) averaging of the model parameters (default: 0.99) load_path: str, path to load the model from (default: None) **network_kwargs: keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network For instance, 'mlp' network search_space has arguments num_hidden and num_layers. ''' print("Running Acer Simple") print(locals()) set_global_seeds(seed) if not isinstance(env, VecFrameStack): env = VecFrameStack(env, 1) policy = build_policy(env, network, estimate_q=True, **network_kwargs) nenvs = env.num_envs ob_space = env.observation_space ac_space = env.action_space nstack = env.nstack model = Model(policy=policy, ob_space=ob_space, ac_space=ac_space, nenvs=nenvs, nsteps=nsteps, ent_coef=ent_coef, q_coef=q_coef, gamma=gamma, max_grad_norm=max_grad_norm, lr=lr, rprop_alpha=rprop_alpha, rprop_epsilon=rprop_epsilon, total_timesteps=total_timesteps, lrschedule=lrschedule, c=c, trust_region=trust_region, alpha=alpha, delta=delta) runner = Runner(env=env, model=model, nsteps=nsteps) if replay_ratio > 0: buffer = Buffer(env=env, nsteps=nsteps, size=buffer_size) else: buffer = None nbatch = nenvs * nsteps acer = Acer(runner, model, buffer, log_interval) acer.tstart = time.time() for acer.steps in range( 0, total_timesteps, nbatch ): #nbatch samples, 1 on_policy call and multiple off-policy calls acer.call(on_policy=True) if replay_ratio > 0 and buffer.has_atleast(replay_start): n = np.random.poisson(replay_ratio) for _ in range(n): acer.call(on_policy=False) # no simulation steps in this return model
def learn(network, env, seed=None, total_timesteps=None, nb_epochs=None, # with default settings, perform 1M steps total nb_epoch_cycles=20, nb_rollout_steps=100, reward_scale=1.0, render=False, render_eval=False, noise_type='adaptive-param_0.2', normalize_returns=False, normalize_observations=True, critic_l2_reg=1e-2, actor_lr=1e-4, critic_lr=1e-3, popart=False, gamma=0.99, clip_norm=None, nb_train_steps=50, # per epoch cycle and MPI worker, nb_eval_steps=100, batch_size=64, # per MPI worker tau=0.01, eval_env=None, param_noise_adaption_interval=50, **network_kwargs): set_global_seeds(seed) if total_timesteps is not None: assert nb_epochs is None nb_epochs = int(total_timesteps) // (nb_epoch_cycles * nb_rollout_steps) else: nb_epochs = 500 if MPI is not None: rank = MPI.COMM_WORLD.Get_rank() else: rank = 0 nb_actions = env.action_space.shape[-1] assert (np.abs(env.action_space.low) == env.action_space.high).all() # we assume symmetric actions. memory = Memory(limit=int(1e6), action_shape=env.action_space.shape, observation_shape=env.observation_space.shape) critic = Critic(network=network, **network_kwargs) actor = Actor(nb_actions, network=network, **network_kwargs) action_noise = None param_noise = None if noise_type is not None: for current_noise_type in noise_type.split(','): current_noise_type = current_noise_type.strip() if current_noise_type == 'none': pass elif 'adaptive-param' in current_noise_type: _, stddev = current_noise_type.split('_') param_noise = AdaptiveParamNoiseSpec(initial_stddev=float(stddev), desired_action_stddev=float(stddev)) elif 'normal' in current_noise_type: _, stddev = current_noise_type.split('_') action_noise = NormalActionNoise(mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions)) elif 'ou' in current_noise_type: _, stddev = current_noise_type.split('_') action_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions)) else: raise RuntimeError('unknown noise type "{}"'.format(current_noise_type)) max_action = env.action_space.high logger.info('scaling actions by {} before executing in env'.format(max_action)) agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space.shape, gamma=gamma, tau=tau, normalize_returns=normalize_returns, normalize_observations=normalize_observations, batch_size=batch_size, action_noise=action_noise, param_noise=param_noise, critic_l2_reg=critic_l2_reg, actor_lr=actor_lr, critic_lr=critic_lr, enable_popart=popart, clip_norm=clip_norm, reward_scale=reward_scale) logger.info('Using agent with the following configuration:') logger.info(str(agent.__dict__.items())) eval_episode_rewards_history = deque(maxlen=100) episode_rewards_history = deque(maxlen=100) sess = U.get_session() # Prepare everything. agent.initialize(sess) sess.graph.finalize() agent.reset() obs = env.reset() if eval_env is not None: eval_obs = eval_env.reset() nenvs = obs.shape[0] episode_reward = np.zeros(nenvs, dtype = np.float32) #vector episode_step = np.zeros(nenvs, dtype = int) # vector episodes = 0 #scalar t = 0 # scalar epoch = 0 start_time = time.time() epoch_episode_rewards = [] epoch_episode_steps = [] epoch_actions = [] epoch_qs = [] epoch_episodes = 0 for epoch in range(nb_epochs): for cycle in range(nb_epoch_cycles): # Perform rollouts. if nenvs > 1: # if simulating multiple envs in parallel, impossible to reset agent at the end of the episode in each # of the environments, so resetting here instead agent.reset() for t_rollout in range(nb_rollout_steps): # Predict next action. action, q, _, _ = agent.step(obs, apply_noise=True, compute_Q=True) # Execute next action. if rank == 0 and render: env.render() # max_action is of dimension A, whereas action is dimension (nenvs, A) - the multiplication gets broadcasted to the batch new_obs, r, done, info = env.step(max_action * action) # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1]) # note these outputs are batched from vecenv t += 1 if rank == 0 and render: env.render() episode_reward += r episode_step += 1 # Book-keeping. epoch_actions.append(action) epoch_qs.append(q) agent.store_transition(obs, action, r, new_obs, done) #the batched data will be unrolled in memory.py's append. obs = new_obs for d in range(len(done)): if done[d]: # Episode done. epoch_episode_rewards.append(episode_reward[d]) episode_rewards_history.append(episode_reward[d]) epoch_episode_steps.append(episode_step[d]) episode_reward[d] = 0. episode_step[d] = 0 epoch_episodes += 1 episodes += 1 if nenvs == 1: agent.reset() # Train. epoch_actor_losses = [] epoch_critic_losses = [] epoch_adaptive_distances = [] for t_train in range(nb_train_steps): # Adapt param noise, if necessary. if memory.nb_entries >= batch_size and t_train % param_noise_adaption_interval == 0: distance = agent.adapt_param_noise() epoch_adaptive_distances.append(distance) cl, al = agent.train() epoch_critic_losses.append(cl) epoch_actor_losses.append(al) agent.update_target_net() # Evaluate. eval_episode_rewards = [] eval_qs = [] if eval_env is not None: nenvs_eval = eval_obs.shape[0] eval_episode_reward = np.zeros(nenvs_eval, dtype = np.float32) for t_rollout in range(nb_eval_steps): eval_action, eval_q, _, _ = agent.step(eval_obs, apply_noise=False, compute_Q=True) eval_obs, eval_r, eval_done, eval_info = eval_env.step(max_action * eval_action) # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1]) if render_eval: eval_env.render() eval_episode_reward += eval_r eval_qs.append(eval_q) for d in range(len(eval_done)): if eval_done[d]: eval_episode_rewards.append(eval_episode_reward[d]) eval_episode_rewards_history.append(eval_episode_reward[d]) eval_episode_reward[d] = 0.0 if MPI is not None: mpi_size = MPI.COMM_WORLD.Get_size() else: mpi_size = 1 # Log stats. # XXX shouldn't call np.mean on variable length lists duration = time.time() - start_time stats = agent.get_stats() combined_stats = stats.copy() combined_stats['rollout/return'] = np.mean(epoch_episode_rewards) combined_stats['rollout/return_history'] = np.mean(episode_rewards_history) combined_stats['rollout/episode_steps'] = np.mean(epoch_episode_steps) combined_stats['rollout/actions_mean'] = np.mean(epoch_actions) combined_stats['rollout/Q_mean'] = np.mean(epoch_qs) combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses) combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses) combined_stats['train/param_noise_distance'] = np.mean(epoch_adaptive_distances) combined_stats['total/duration'] = duration combined_stats['total/steps_per_second'] = float(t) / float(duration) combined_stats['total/episodes'] = episodes combined_stats['rollout/episodes'] = epoch_episodes combined_stats['rollout/actions_std'] = np.std(epoch_actions) # Evaluation statistics. if eval_env is not None: combined_stats['eval/return'] = eval_episode_rewards combined_stats['eval/return_history'] = np.mean(eval_episode_rewards_history) combined_stats['eval/Q'] = eval_qs combined_stats['eval/episodes'] = len(eval_episode_rewards) def as_scalar(x): if isinstance(x, np.ndarray): assert x.size == 1 return x[0] elif np.isscalar(x): return x else: raise ValueError('expected scalar, got %s'%x) combined_stats_sums = np.array([ np.array(x).flatten()[0] for x in combined_stats.values()]) if MPI is not None: combined_stats_sums = MPI.COMM_WORLD.allreduce(combined_stats_sums) combined_stats = {k : v / mpi_size for (k,v) in zip(combined_stats.keys(), combined_stats_sums)} # Total statistics. combined_stats['total/epochs'] = epoch + 1 combined_stats['total/steps'] = t for key in sorted(combined_stats.keys()): logger.record_tabular(key, combined_stats[key]) if rank == 0: logger.dump_tabular() logger.info('') logdir = logger.get_dir() if rank == 0 and logdir: if hasattr(env, 'get_state'): with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as f: pickle.dump(env.get_state(), f) if eval_env and hasattr(eval_env, 'get_state'): with open(os.path.join(logdir, 'eval_env_state.pkl'), 'wb') as f: pickle.dump(eval_env.get_state(), f) return agent
def learn(*, network, env, total_timesteps, seed=None, eval_env=None, replay_strategy='future', policy_save_interval=5, clip_return=True, demo_file=None, override_params=None, load_path=None, save_path=None, **kwargs ): override_params = override_params or {} if MPI is not None: rank = MPI.COMM_WORLD.Get_rank() num_cpu = MPI.COMM_WORLD.Get_size() # Seed everything. rank_seed = seed + 1000000 * rank if seed is not None else None set_global_seeds(rank_seed) # Prepare params. params = config.DEFAULT_PARAMS env_name = env.spec.id params['env_name'] = env_name params['replay_strategy'] = replay_strategy if env_name in config.DEFAULT_ENV_PARAMS: params.update(config.DEFAULT_ENV_PARAMS[env_name]) # merge env-specific parameters in params.update(**override_params) # makes it possible to override any parameter with open(os.path.join(logger.get_dir(), 'params.json'), 'w') as f: json.dump(params, f) params = config.prepare_params(params) params['rollout_batch_size'] = env.num_envs if demo_file is not None: params['bc_loss'] = 1 params.update(kwargs) config.log_params(params, logger=logger) if num_cpu == 1: logger.warn() logger.warn('*** Warning ***') logger.warn( 'You are running HER with just a single MPI worker. This will work, but the ' + 'experiments that we report in Plappert et al. (2018, https://arxiv.org/abs/1802.09464) ' + 'were obtained with --num_cpu 19. This makes a significant difference and if you ' + 'are looking to reproduce those results, be aware of this. Please also refer to ' + 'https://github.com/openai/baselines/issues/314 for further details.') logger.warn('****************') logger.warn() dims = config.configure_dims(params) policy = config.configure_ddpg(dims=dims, params=params, clip_return=clip_return) if load_path is not None: tf_util.load_variables(load_path) rollout_params = { 'exploit': False, 'use_target_net': False, 'use_demo_states': True, 'compute_Q': False, 'T': params['T'], } eval_params = { 'exploit': True, 'use_target_net': params['test_with_polyak'], 'use_demo_states': False, 'compute_Q': True, 'T': params['T'], } for name in ['T', 'rollout_batch_size', 'gamma', 'noise_eps', 'random_eps']: rollout_params[name] = params[name] eval_params[name] = params[name] eval_env = eval_env or env rollout_worker = RolloutWorker(env, policy, dims, logger, monitor=True, **rollout_params) evaluator = RolloutWorker(eval_env, policy, dims, logger, **eval_params) n_cycles = params['n_cycles'] n_epochs = total_timesteps // n_cycles // rollout_worker.T // rollout_worker.rollout_batch_size return train( save_path=save_path, policy=policy, rollout_worker=rollout_worker, evaluator=evaluator, n_epochs=n_epochs, n_test_rollouts=params['n_test_rollouts'], n_cycles=params['n_cycles'], n_batches=params['n_batches'], policy_save_interval=policy_save_interval, demo_file=demo_file)