import numpy as np
from procgen import ProcgenEnv


def test_multi_speed(env_name, num_envs, benchmark):
    venv = ProcgenEnv(num_envs=num_envs, env_name=env_name)
    venv.reset()
    # One no-op action per parallel environment.
    actions = np.zeros([venv.num_envs])

    def rollout(max_steps):
        step_count = 0
        while step_count < max_steps:
            _obs, _rews, _dones, _infos = venv.step(actions)
            step_count += 1

    # pytest-benchmark fixture: time 1000 vectorized steps.
    benchmark(lambda: rollout(1000))
    venv.close()
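# test_multi_speed takes `benchmark` from the pytest-benchmark plugin, and
# env_name / num_envs presumably come from pytest parametrization. A minimal
# sketch of that wiring (the parameter values and file name below are
# assumptions, not taken from the original test file):
import pytest

@pytest.mark.parametrize("num_envs", [1, 16, 64])
@pytest.mark.parametrize("env_name", ["coinrun", "starpilot"])
def test_multi_speed_sketch(env_name, num_envs, benchmark):
    venv = ProcgenEnv(num_envs=num_envs, env_name=env_name)
    venv.reset()
    actions = np.zeros([venv.num_envs])
    benchmark(lambda: [venv.step(actions) for _ in range(1000)])
    venv.close()

# Run with, e.g.: pytest test_procgen.py -k multi_speed --benchmark-only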
def collect_observations():
    # env_name is expected to be bound in the enclosing scope
    # (e.g. a pytest parameter of the surrounding test).
    rng = np.random.RandomState(0)
    venv = ProcgenEnv(num_envs=2, env_name=env_name, rand_seed=23)
    obs = venv.reset()
    obses = [obs["rgb"]]
    for _ in range(128):
        obs, _rew, _done, _info = venv.step(
            rng.randint(
                low=0,
                high=venv.action_space.n,
                size=(venv.num_envs,),
                dtype=np.int32,
            )
        )
        obses.append(obs["rgb"])
    return np.array(obses)
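# collect_observations fixes both the numpy RNG (RandomState(0)) and the
# environment seed (rand_seed=23), so two runs should produce byte-identical
# frame stacks. A minimal determinism check built on top of it (a sketch,
# assuming collect_observations and env_name are in scope):
def test_determinism_sketch():
    first = collect_observations()
    second = collect_observations()
    assert np.array_equal(first, second)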
agent = Agent(envs).to(device)
optimizer = optim.Adam(agent.parameters(), lr=args.learning_rate, eps=1e-5)

# ALGO Logic: Storage setup
obs = torch.zeros((args.num_steps, args.num_envs) + envs.single_observation_space.shape).to(device)
actions = torch.zeros((args.num_steps, args.num_envs) + envs.single_action_space.shape).to(device)
logprobs = torch.zeros((args.num_steps, args.num_envs)).to(device)
rewards = torch.zeros((args.num_steps, args.num_envs)).to(device)
dones = torch.zeros((args.num_steps, args.num_envs)).to(device)
values = torch.zeros((args.num_steps, args.num_envs)).to(device)

# TRY NOT TO MODIFY: start the game
global_step = 0
start_time = time.time()
next_obs = torch.Tensor(envs.reset()).to(device)
next_done = torch.zeros(args.num_envs).to(device)
num_updates = args.total_timesteps // args.batch_size

for update in range(1, num_updates + 1):
    # Annealing the rate if instructed to do so.
    if args.anneal_lr:
        frac = 1.0 - (update - 1.0) / num_updates
        lrnow = frac * args.learning_rate
        optimizer.param_groups[0]["lr"] = lrnow

    for step in range(0, args.num_steps):
        global_step += 1 * args.num_envs
        obs[step] = next_obs
        dones[step] = next_done
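# In CleanRL's PPO, args.batch_size is derived rather than set directly: one
# rollout gathers num_envs * num_steps transitions, and the linear annealing
# above walks the learning rate from its full value down toward zero over
# num_updates. A sketch of that bookkeeping (the concrete values here are
# assumptions for illustration):
num_envs, num_steps, total_timesteps = 8, 256, 1_000_000
batch_size = num_envs * num_steps            # 2048 transitions per update
num_updates = total_timesteps // batch_size  # 488 updates in this example
for update in range(1, num_updates + 1):
    frac = 1.0 - (update - 1.0) / num_updates  # goes from 1.0 down to ~0.0
    lrnow = frac * 2.5e-4                      # annealed learning rate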
import numpy as np
import gym
from procgen import ProcgenEnv

# Single (non-vectorized) gym interface, kept for reference:
# env = gym.make('procgen:procgen-coinrun-v0')
# obs = env.reset()
#
# while True:
#     obs, rew, done, info = env.step(env.action_space.sample())
#     env.render()
#     if done:
#         break

env = ProcgenEnv(num_envs=2, env_name="coinrun", num_levels=12, start_level=34)
obs = env.reset()
print(obs['rgb'].shape)

# ProcgenEnv expects one integer action per sub-environment, so broadcast a
# single sampled action to both copies and cast to an integer dtype.
action = (np.ones(2) * env.action_space.sample()).astype(np.int32)
obs, rew, done, info = env.step(action)
print(obs)
print(rew)
print(done)
print(info)
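# ProcgenEnv returns a dict observation keyed by "rgb". Training code usually
# unwraps it; a sketch using the baselines wrappers that also appear in the
# training loop further below:
from baselines.common.vec_env import VecExtractDictObs, VecMonitor, VecNormalize

venv = ProcgenEnv(num_envs=2, env_name="coinrun", num_levels=12, start_level=34)
venv = VecExtractDictObs(venv, "rgb")              # obs is now a plain uint8 array
venv = VecMonitor(venv=venv, filename=None, keep_buf=100)
venv = VecNormalize(venv=venv, ob=False)           # normalize rewards only
print(venv.reset().shape)                          # (2, 64, 64, 3)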
def rollout(*, network, env, total_timesteps, eval_env=None, seed=None, nsteps=2048,
            ent_coef=0.0, lr=3e-4, vf_coef=0.5, max_grad_norm=0.5, gamma=0.99, lam=0.95,
            log_interval=10, nminibatches=4, noptepochs=4, cliprange=0.2, save_interval=0,
            load_path=None, model_fn=None, update_fn=None, init_fn=None, mpi_rank_weight=1,
            comm=None, num_steps, num_envs, env_name, num_levels, start_level,
            distribution_mode, **network_kwargs):
    '''
    Learn a policy using the PPO algorithm (https://arxiv.org/abs/1707.06347).

    Parameters:
    ----------

    network:          policy network architecture. Either a string (mlp, lstm, lnlstm, cnn_lstm, cnn,
                      cnn_small, conv_only - see baselines.common/models.py for the full list) specifying
                      a standard network architecture, or a function that takes a tensorflow tensor as
                      input and returns a tuple (output_tensor, extra_feed), where output_tensor is the
                      last network layer output and extra_feed is None for feed-forward nets or a
                      dictionary describing how to feed state into the network for recurrent nets.
                      See common/models.py/lstm for more details on using recurrent nets in policies.

    env:              baselines.common.vec_env.VecEnv environment. Needs to be vectorized for parallel
                      environment simulation. Environments produced by gym.make can be wrapped using the
                      baselines.common.vec_env.DummyVecEnv class.

    nsteps:           int: number of steps of the vectorized environment per update (i.e. the batch size
                      is nsteps * nenv, where nenv is the number of environment copies simulated in parallel)

    total_timesteps:  int: number of timesteps (i.e. number of actions taken in the environment)

    ent_coef:         float: policy entropy coefficient in the optimization objective

    lr:               float or function: learning rate, constant or a schedule function [0,1] -> R+,
                      where 1 is the beginning of training and 0 is the end

    vf_coef:          float: value function loss coefficient in the optimization objective

    max_grad_norm:    float or None: gradient norm clipping coefficient

    gamma:            float: discount factor

    lam:              float: advantage estimation discount factor (lambda in the paper)

    log_interval:     int: number of updates between logging events

    nminibatches:     int: number of training minibatches per update. For recurrent policies, this should
                      be less than or equal to the number of environments run in parallel.

    noptepochs:       int: number of training epochs per update

    cliprange:        float or function: clipping range, constant or a schedule function [0,1] -> R+,
                      where 1 is the beginning of training and 0 is the end

    save_interval:    int: number of updates between saving events

    load_path:        str: path to load the model from

    **network_kwargs: keyword arguments to the policy / network builder. See
                      baselines.common/policies.py/build_policy and the arguments of the particular
                      network type; for instance, the 'mlp' architecture has num_hidden and num_layers.
    '''
    set_global_seeds(seed)

    # In the original baselines implementation, lr and cliprange may be
    # floats or schedule functions:
    # if isinstance(lr, float): lr = constfn(lr)
    # else: assert callable(lr)
    # if isinstance(cliprange, float): cliprange = constfn(cliprange)
    # else: assert callable(cliprange)
    total_timesteps = int(total_timesteps)

    policy = build_policy(env, network, **network_kwargs)

    # Get the number of environments
    nenvs = env.num_envs

    # Get state_space and action_space
    ob_space = env.observation_space
    ac_space = env.action_space

    # Calculate the batch_size
    nbatch = nenvs * nsteps
    nbatch_train = nbatch // nminibatches
    is_mpi_root = (MPI is None or MPI.COMM_WORLD.Get_rank() == 0)

    # Instantiate the model object (that creates act_model and train_model)
    if model_fn is None:
        from baselines.ppo2.model import Model
        model_fn = Model

    model = model_fn(policy=policy, ob_space=ob_space, ac_space=ac_space, nbatch_act=nenvs,
                     nbatch_train=nbatch_train, nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef,
                     max_grad_norm=max_grad_norm, comm=comm, mpi_rank_weight=mpi_rank_weight)

    if load_path is not None:
        model.load(load_path)

    # Instantiate the runner object (disabled in this evaluation-only variant)
    # runner = Runner(env=env, model=model, nsteps=nsteps, gamma=gamma, lam=lam)
    # if eval_env is not None:
    #     eval_runner = Runner(env=eval_env, model=model, nsteps=nsteps, gamma=gamma, lam=lam)

    # epinfobuf = deque(maxlen=100)
    # if eval_env is not None:
    #     eval_epinfobuf = deque(maxlen=100)

    # if init_fn is not None:
    #     init_fn()

    # # Start total timer
    # tfirststart = time.perf_counter()

    # nupdates = total_timesteps // nbatch
    # for update in range(1, nupdates+1):
    #     assert nbatch % nminibatches == 0
    #     # Start timer
    #     tstart = time.perf_counter()
    #     frac = 1.0 - (update - 1.0) / nupdates
    #     # Calculate the learning rate
    #     lrnow = lr(frac)
    #     # Calculate the cliprange
    #     cliprangenow = cliprange(frac)
    #     if update % log_interval == 0 and is_mpi_root: logger.info('Stepping environment...')

    #     # Get minibatch
    #     obs, returns, masks, actions, values, neglogpacs, states, epinfos = runner.run()  # pylint: disable=E0632
    #     if eval_env is not None:
    #         eval_obs, eval_returns, eval_masks, eval_actions, eval_values, eval_neglogpacs, eval_states, eval_epinfos = eval_runner.run()  # pylint: disable=E0632

    #     if update % log_interval == 0 and is_mpi_root: logger.info('Done.')

    #     epinfobuf.extend(epinfos)
    #     if eval_env is not None:
    #         eval_epinfobuf.extend(eval_epinfos)

    #     # For each minibatch, calculate the loss and append it.
    #     mblossvals = []
    #     if states is None:  # nonrecurrent version
    #         # Index of each element of batch_size
    #         # Create the indices array
    #         inds = np.arange(nbatch)
    #         for _ in range(noptepochs):
    #             # Randomize the indexes
    #             np.random.shuffle(inds)
    #             # 0 to batch_size with batch_train_size step
    #             for start in range(0, nbatch, nbatch_train):
    #                 end = start + nbatch_train
    #                 mbinds = inds[start:end]
    #                 slices = (arr[mbinds] for arr in (obs, returns, masks, actions, values, neglogpacs))
    #                 mblossvals.append(model.train(lrnow, cliprangenow, *slices))
    #     else:  # recurrent version
    #         assert nenvs % nminibatches == 0
    #         envsperbatch = nenvs // nminibatches
    #         envinds = np.arange(nenvs)
    #         flatinds = np.arange(nenvs * nsteps).reshape(nenvs, nsteps)
    #         for _ in range(noptepochs):
    #             np.random.shuffle(envinds)
    #             for start in range(0, nenvs, envsperbatch):
    #                 end = start + envsperbatch
    #                 mbenvinds = envinds[start:end]
    #                 mbflatinds = flatinds[mbenvinds].ravel()
    #                 slices = (arr[mbflatinds] for arr in (obs, returns, masks, actions, values, neglogpacs))
    #                 mbstates = states[mbenvinds]
    #                 mblossvals.append(model.train(lrnow, cliprangenow, *slices, mbstates))

    #     # Feedforward --> get losses --> update
    #     lossvals = np.mean(mblossvals, axis=0)
    #     # End timer
    #     tnow = time.perf_counter()
    #     # Calculate the fps (frames per second)
    #     fps = int(nbatch / (tnow - tstart))

    # Note: `update` is only defined if the commented training loop above is re-enabled.
    if update_fn is not None:
        update_fn(update)

    rewards = []
    for i in range(num_steps):
        # Build a fresh evaluation environment for each episode.
        env = ProcgenEnv(num_envs=num_envs, env_name=env_name, num_levels=num_levels,
                         start_level=start_level, distribution_mode=distribution_mode)
        env = VecExtractDictObs(env, "rgb")
        env = VecMonitor(venv=env, filename=None, keep_buf=100)
        env = VecNormalize(venv=env, ob=False)

        obs = env.reset()
        done = False
        reward = 0.0
        timesteps = 0
        while not done:
            # Debug helpers kept from development:
            # action = env.action_space.sample()
            # print("example of an action: ", action)
            actions, _, _, _ = model.step(obs)
            # print(actions.shape)
            # print("obs shape: ", obs.shape)
            # print(actions[0])
            # Step every sub-environment with its own action; the episode
            # ends once all copies are done.
            obs, r, done, _ = env.step(actions)
            done = done.all()
            reward += r
            timesteps += 1
        rewards.append(reward)
        env.close()

        # Logging reward, timesteps, and numsteps
        logger.logkv("numsteps", i)
        logger.logkv("timesteps", timesteps)
        logger.logkv("episode_reward_mean", safemean(reward))
        logger.dumpkvs()

    # if update % log_interval == 0 or update == 1:
    #     # Calculates whether the value function is a good predictor of the returns (ev > 1)
    #     # or if it's just worse than predicting nothing (ev =< 0)
    #     ev = explained_variance(values, returns)
    #     logger.logkv("misc/serial_timesteps", update*nsteps)
    #     logger.logkv("misc/nupdates", update)
    #     logger.logkv("misc/total_timesteps", update*nbatch)
    #     logger.logkv("fps", fps)
    #     logger.logkv("misc/explained_variance", float(ev))
    #     logger.logkv('eprewmean', safemean([epinfo['r'] for epinfo in epinfobuf]))
    #     logger.logkv('eplenmean', safemean([epinfo['l'] for epinfo in epinfobuf]))
    #     if eval_env is not None:
    #         logger.logkv('eval_eprewmean', safemean([epinfo['r'] for epinfo in eval_epinfobuf]))
    #         logger.logkv('eval_eplenmean', safemean([epinfo['l'] for epinfo in eval_epinfobuf]))
    #     logger.logkv('misc/time_elapsed', tnow - tfirststart)
    #     for (lossval, lossname) in zip(lossvals, model.loss_names):
    #         logger.logkv('loss/' + lossname, lossval)
    #     logger.dumpkvs()
    # if save_interval and (update % save_interval == 0 or update == 1) and logger.get_dir() and is_mpi_root:
    #     checkdir = osp.join(logger.get_dir(), 'checkpoints')
    #     os.makedirs(checkdir, exist_ok=True)
    #     savepath = osp.join(checkdir, '%.5i' % update)
    #     print('Saving to', savepath)
    #     model.save(savepath)

    return model
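# A sketch of how this evaluation-only variant of rollout might be invoked.
# All values below are placeholders (the checkpoint path, Procgen settings,
# and `venv` are assumptions, not taken from the original code):
venv = ProcgenEnv(num_envs=1, env_name="coinrun", num_levels=0, start_level=0,
                  distribution_mode="hard")
venv = VecExtractDictObs(venv, "rgb")

model = rollout(
    network="cnn",
    env=venv,                  # used only to size the policy/model
    total_timesteps=0,         # no gradient updates happen in this variant
    load_path=None,            # or a path to a pretrained checkpoint
    num_steps=10,              # number of evaluation episodes to run
    num_envs=1,
    env_name="coinrun",
    num_levels=0,
    start_level=0,
    distribution_mode="hard",
)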
import numpy as np

model = model_fn(policy=policy, ob_space=ob_space, ac_space=ac_space, nbatch_act=nenvs,
                 nbatch_train=nbatch_train, nsteps=n_steps, ent_coef=ent_coef, vf_coef=vf_coef,
                 max_grad_norm=max_grad_norm, comm=comm, mpi_rank_weight=mpi_rank_weight)

load_path = '000002400.ckpt'
if load_path is not None:
    model.load(load_path)

obs = training_env.reset()
dones = [False]
states = model.initial_state

step = 0
rew = []
for _ in range(video_interval + video_length + 1):
    # S carries the recurrent state, M the done mask for recurrent policies.
    actions, values, states, _ = model.step(obs, S=states, M=dones)
    obs[:], rewards, dones, infos = training_env.step(actions)
    rew.append(rewards)
    step += 1
    print(f"Steps: {step}")
    training_env.render()
    if dones[0]:
        break
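# The video_interval / video_length names above suggest this rollout feeds a
# video recorder. A sketch of wrapping the env with baselines'
# VecVideoRecorder (the output directory and trigger are assumptions):
from baselines.common.vec_env import VecVideoRecorder

training_env = VecVideoRecorder(
    training_env,
    "videos",  # hypothetical output directory
    record_video_trigger=lambda step: step % video_interval == 0,
    video_length=video_length,
)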