from stable_baselines import ACER


def test_action_mask_run_acer(vec_env, policy, env_class):
    env = vec_env([env_class])
    model = ACER(policy, env, verbose=0)

    obs, done, action_masks = env.reset(), [False], []
    while not done[0]:
        # Pass the masks gathered on the previous step to the policy.
        action, _states = model.predict(obs, action_mask=action_masks)
        obs, _, done, infos = env.step(action)

        # Collect the fresh action masks reported by each sub-environment.
        action_masks.clear()
        for info in infos:
            env_action_mask = info.get('action_mask')
            action_masks.append(env_action_mask)

    env.close()
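# NOTE: the test above assumes each sub-environment publishes its valid-action
# mask under the 'action_mask' key of the step info dict. A minimal sketch of
# such an environment follows, using the classic gym API that stable-baselines
# targets; the class name and the masking rule are hypothetical, for
# illustration only.
import gym
import numpy as np
from gym import spaces


class MaskedDiscreteEnv(gym.Env):
    """Hypothetical env that reports a valid-action mask via the info dict."""

    def __init__(self, n_actions=4):
        self.action_space = spaces.Discrete(n_actions)
        self.observation_space = spaces.Box(low=0, high=1, shape=(1,),
                                            dtype=np.float32)
        self._steps = 0

    def reset(self):
        self._steps = 0
        return np.zeros(1, dtype=np.float32)

    def step(self, action):
        self._steps += 1
        done = self._steps >= 10
        # Report which actions are valid on the next step
        # (here, arbitrarily, everything except action 0).
        mask = np.ones(self.action_space.n, dtype=bool)
        mask[0] = False
        return np.zeros(1, dtype=np.float32), 0.0, done, {'action_mask': mask}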
from stable_baselines.common.cmd_util import make_atari_env
from stable_baselines.common.vec_env import VecFrameStack
from stable_baselines import ACER

# There already exists an environment generator
# that will make and wrap atari environments correctly.
# Here we are also multiprocessing training (num_env=4 => 4 processes)
env = make_atari_env('PongNoFrameskip-v4', num_env=4, seed=0)
# Frame-stacking with 4 frames
env = VecFrameStack(env, n_stack=4)

model = ACER('CnnPolicy', env, verbose=1)
model.learn(total_timesteps=25000)
# Save the trained agent
model.save("cnn_pong")

obs = env.reset()
while True:
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
    env.render()
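# NOTE: to reuse the agent saved above without retraining, it can be loaded
# back with the standard stable-baselines load API; a minimal sketch,
# assuming "cnn_pong.zip" exists from the run above.
from stable_baselines import ACER

model = ACER.load("cnn_pong")
# Re-attach an environment before predicting or continuing training.
model.set_env(env)
obs = env.reset()
action, _states = model.predict(obs)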
from stable_baselines.common.evaluation import evaluate_policy

mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=20,
                                          deterministic=False)
print('mean_reward = %s +/- %s' % (mean_reward, std_reward))

# Enjoy trained agent
env.rendermode = 'on'  # custom flag on this environment that enables rendering
obs = env.reset()
cumreward = 0
results = list()
minable = list()
while True:
    action, _states = model.predict(obs, deterministic=False)
    obs, rewards, dones, info = env.step(action)
    cumreward += rewards
    print(action, rewards, dones, cumreward)
    results.append(info[0])
    # Translate sequence errors to be positive; zero means no error.
    a = abs(info[1] - 1)
    minable.append(a)
    if dones:
        break
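# NOTE: once the episode ends, the per-step lists collected above can be
# reduced to summary statistics. A minimal sketch, assuming `minable` holds
# the absolute sequence errors and `cumreward` the running reward from the
# loop above.
import numpy as np

errors = np.asarray(minable, dtype=float)
print('steps taken: %d' % len(errors))
print('mean abs sequence error: %.4f' % errors.mean())
print('final cumulative reward: %s' % cumreward)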