def evaluate(step, policy_net, device, env, n_actions, eps=0.05, num_episode=5):
    env = wrap_deepmind(env, clip_rewards=True)
    sa = m.ActionSelector(eps, eps, policy_net, EPS_DECAY, n_actions, device)
    e_rewards = []
    q = deque(maxlen=5)
    for i in range(num_episode):
        env.reset()
        e_reward = 0
        for _ in range(10):  # no-op steps to fill the frame queue
            n_frame, _, done, _ = env.step(0)
            n_frame = m.fp(n_frame)
            q.append(n_frame)

        while not done:
            state = torch.cat(list(q))[1:].unsqueeze(0)
            action, eps = sa.select_action(state, False)  # evaluation mode, as in demo()
            n_frame, reward, done, info = env.step(action)
            n_frame = m.fp(n_frame)
            q.append(n_frame)
            e_reward += reward
        e_rewards.append(e_reward)

    with open("file.txt", 'a') as f:
        f.write("%f, %d, %d\n" % (float(sum(e_rewards)) / float(num_episode), step, num_episode))
def demo(num_episode=1):
    eps = 0.01
    env_raw = make_atari(args.env_name)
    env = wrap_deepmind(env_raw)
    c, h, w = m.fp(env.reset()).shape
    n_actions = env.action_space.n
    policy_net = m.DQN(h, w, n_actions, device).to(device)
    model_path = "models/" + args.env_name.replace("NoFrameskip-v4", "") + "_best.pth"
    if device.type == "cuda":
        policy_net.load_state_dict(torch.load(model_path))
    else:
        policy_net.load_state_dict(torch.load(model_path, map_location=torch.device('cpu')))
    policy_net.eval()
    sa = m.ActionSelector(eps, eps, policy_net, 100, n_actions, device)
    q = deque(maxlen=5)
    e_rewards = []
    for eee in range(num_episode):
        print("Demo episode %d/%d..." % (eee + 1, num_episode))
        env.reset()
        e_reward = 0
        for _ in range(5):  # no-op steps to fill the frame queue
            n_frame, _, done, _ = env.step(0)
            n_frame = m.fp(n_frame)
            q.append(n_frame)

        while not done:
            if num_episode <= 1:  # only render when watching a single episode
                env.render()
                time.sleep(0.02)
            state = torch.cat(list(q))[1:].unsqueeze(0)
            action, eps = sa.select_action(state, False)
            n_frame, reward, done, _ = env.step(action)
            n_frame = m.fp(n_frame)
            q.append(n_frame)
            e_reward += reward
        e_rewards.append(e_reward)

    avg_reward = float(sum(e_rewards)) / float(num_episode)
    env.close()
    print("Average reward of " + args.env_name + " is %.1f" % avg_reward)
    print("Reward std of " + args.env_name + " is %.1f" % np.std(e_rewards))
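# demo() relies on a module-level `args` produced by argparse; the actual parser is
# not part of this snippet, so the minimal reconstruction below is only an assumption
# used to show how the script might be launched
# (e.g. `python demo.py --env_name BreakoutNoFrameskip-v4`).
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--env_name", type=str, default="BreakoutNoFrameskip-v4")
args = parser.parse_args()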
def evaluate(step, policy_net, device, env, n_actions, eps=0.01, num_episode=5):
    global best_reward
    if not os.path.exists("models"):
        os.makedirs("models")
    env = wrap_deepmind(env)
    sa = m.ActionSelector(eps, eps, policy_net, EPS_DECAY, n_actions, device)
    e_rewards = []
    q = deque(maxlen=5)
    for _ in range(num_episode):
        env.reset()
        e_reward = 0
        for _ in range(5):  # no-op steps to fill the frame queue
            n_frame, _, done, _ = env.step(0)
            n_frame = m.fp(n_frame)
            q.append(n_frame)

        while not done:
            # The deque holds 5 frames; the newest 4 form the network input
            # (see the sketch after this function for why 5 frames are kept).
            state = torch.cat(list(q))[1:].unsqueeze(0)
            action, eps = sa.select_action(state, False)  # evaluation mode, as in demo()
            n_frame, reward, done, _ = env.step(action)
            n_frame = m.fp(n_frame)
            q.append(n_frame)
            e_reward += reward
        e_rewards.append(e_reward)

    avg_reward = float(sum(e_rewards)) / float(num_episode)
    std = np.array(e_rewards).std()
    print("The average reward is: %.5f" % avg_reward)
    if avg_reward > best_reward:
        print("New best reward, saving model to disk!")
        torch.save(policy_net.state_dict(),
                   "models/" + env_name + "_" + str(int(avg_reward)) + ".pth")
        best_reward = avg_reward
    with open(env_name + ".csv", 'a') as f:
        f.write("%f, %f, %d, %d\n" % (avg_reward, std, step, num_episode))
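# Why keep 5 frames in the deque but feed only the last 4 to the network?
# A plausible reading (the training loop is not shown in this snippet): with a
# 5-frame window both the current state (frames 0-3) and the next state
# (frames 1-4) can be sliced from the same buffer, while at evaluation time only
# the newest 4 frames are needed as input. A minimal shape sketch, assuming
# m.fp returns a (1, 84, 84) tensor per frame:
frames_demo = torch.cat([torch.zeros(1, 84, 84) for _ in range(5)])  # (5, 84, 84)
state_demo = frames_demo[:4].unsqueeze(0)       # (1, 4, 84, 84) -- s  during training
next_state_demo = frames_demo[1:].unsqueeze(0)  # (1, 4, 84, 84) -- s' during training / eval input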
# 2. Seed and best value
torch.manual_seed(114514)
best_reward = 0.0

# 3. Environment reset
env_name = args.env_id.replace("NoFrameskip-v4", "") if "NoFrameskip-v4" in args.env_id \
    else args.env_id.replace("-ramNoFrameskip-v4", "")
env_raw = make_atari(args.env_id)
env = wrap_deepmind(env_raw, frame_stack=False, episode_life=True, clip_rewards=True)
c, h, w = m.fp(env.reset()).shape
n_actions = env.action_space.n

# 4. Network reset
policy_net = m.DQN(h, w, n_actions, device).to(device)
target_net = m.DQN(h, w, n_actions, device).to(device)
# apply() runs init_weights on every submodule, so the initialization code does
# not need to change when the model architecture changes -- that is the benefit
# of apply(); a hypothetical example follows this block.
policy_net.apply(policy_net.init_weights)
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()

# 5. DQN hyperparameters
BATCH_SIZE = 32
GAMMA = 0.99
EPS_START = 1.
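# m.DQN.init_weights lives in the model module and is not shown in this snippet;
# the function below is only a hypothetical example of the kind of callback that
# apply() expects. apply() invokes it once for every submodule, so new layers are
# picked up automatically without touching the initialization code.
import torch.nn as nn  # needed only for this example

def init_weights_example(module):
    if isinstance(module, (nn.Conv2d, nn.Linear)):
        nn.init.kaiming_normal_(module.weight, nonlinearity='relu')
        if module.bias is not None:
            nn.init.zeros_(module.bias)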
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # if gpu is to be used

# 3. Environment reset
# env_name = 'Breakout'
env_name = 'SpaceInvaders'
# env_name = 'Riverraid'
# env_name = 'Seaquest'
# env_name = 'MontezumaRevenge'
env_raw = make_atari('{}NoFrameskip-v4'.format(env_name))
env = wrap_deepmind(env_raw, frame_stack=False, episode_life=True, clip_rewards=True)
c, h, w = m.fp(env.reset()).shape
n_actions = env.action_space.n
print(n_actions)

# 4. Network reset
policy_net = m.DQN(h, w, n_actions, device).to(device)
target_net = m.DQN(h, w, n_actions, device).to(device)
policy_net.apply(policy_net.init_weights)
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()

# 5. DQN hyperparameters
BATCH_SIZE = 32
GAMMA = 0.99
EPS_START = 1.
EPS_END = 0.1
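# EPS_START/EPS_END (and the EPS_DECAY constant used by ActionSelector above) suggest
# an epsilon-greedy schedule; its exact form lives in m.ActionSelector and is not shown
# here. A common linear decay, given purely as a hypothetical sketch:
def linear_epsilon(step, eps_start=EPS_START, eps_end=EPS_END, eps_decay=1000000):
    # eps_decay=1000000 is an assumed value, not taken from this snippet
    fraction = min(float(step) / eps_decay, 1.0)
    return eps_start + fraction * (eps_end - eps_start)

# e.g. linear_epsilon(0) == 1.0, linear_epsilon(500000) == 0.55, linear_epsilon(2000000) == 0.1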