import torch

# Env, Agent, DDQN, Frstack and the upper-case constants below come from the
# project's own modules and config
mod_action_space = [2, 3, 4, 5]
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
env = Env(device)
# epsilon arguments are dummies here: exploration is switched off for testing
agent = Agent(eps=dum_val, eps_min=dum_val, eps_max=dum_val, eps_decay=dum_val,
              num_actions=len(mod_action_space), device=device)
agent.turn_eps_off()
stack = Frstack(initial_frame=env.state)

# create policy net and load saved weights
policy_net = DDQN(NUM_FRAMES, len(mod_action_space))
if USE_GPU:
    policy_net.cuda()

def test():
    policy_net.load_state_dict(torch.load(POLICY_NET_PATH))
    policy_net.eval()
    print("testing...")

    all_rewards = []
    all_images = []

    for episode in range(NUM_TEST_EPISODES):
        env.reset()
        episode_reward = 0
        stack.push(env.state, True)
        curr_state = stack.get_stack()
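        # --- hedged sketch: one way the rollout loop might continue ---
        # Assumes hypothetical signatures not confirmed by the source:
        # agent.select_action(state) returns a greedy action index (epsilon is
        # off), env.step(action) returns (next_state, reward, done), and the
        # second argument to stack.push flags the start of a new episode.
        done = False
        while not done:
            action = mod_action_space[agent.select_action(curr_state)]
            next_state, reward, done = env.step(action)
            episode_reward += reward
            stack.push(next_state, False)
            curr_state = stack.get_stack()
        all_rewards.append(episode_reward)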
import numpy as np
import torch
import torch.optim as optim

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
env = Env(device)
agent = Agent(eps=EPS_MAX, eps_min=EPS_MIN, eps_max=EPS_MAX, eps_decay=EPS_DECAY,
              num_actions=len(mod_action_space), device=device)
memory = PriorityReplayBuffer(MEMORY_SIZE)
stack = Frstack(initial_frame=env.state)

# initialize policy and target networks
policy_net = DDQN(NUM_FRAMES, len(mod_action_space))
target_net = DDQN(NUM_FRAMES, len(mod_action_space))
if USE_GPU:
    policy_net.cuda()
    target_net.cuda()
# the target net starts as a frozen copy of the policy net
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()

# TODO: consider RMSProp vs Adam - the DeepMind paper uses RMSProp
optimizer = optim.Adam(params=policy_net.parameters(), lr=ALPHA)

def experience_replay():
    # experience tuple - (state, action, next_state, reward, done)
    batch, idxs, is_weights = memory.sample(BATCH_SIZE)
    batch = list(zip(*batch))

    # convert experiences from numpy to CUDA (if available) tensors
    state_tensors = torch.from_numpy(np.stack(batch[0])).type(dtype)
    action_tensors = torch.from_numpy(np.stack(batch[1])).type(dlongtype)
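    # --- hedged sketch: how the rest of experience_replay might proceed ---
    # Assumes a GAMMA discount constant and a memory.update(idx, td_error)
    # method for refreshing priorities; both are hypothetical here, as is the
    # layout of the remaining batch columns.
    next_state_tensors = torch.from_numpy(np.stack(batch[2])).type(dtype)
    reward_tensors = torch.from_numpy(np.stack(batch[3])).type(dtype)
    done_tensors = torch.from_numpy(np.stack(batch[4]).astype(np.float32)).type(dtype)

    # Q(s, a) for the actions actually taken
    q_values = policy_net(state_tensors).gather(1, action_tensors.unsqueeze(1)).squeeze(1)
    with torch.no_grad():
        # double DQN: the policy net selects the next action, the target net evaluates it
        best_actions = policy_net(next_state_tensors).argmax(dim=1, keepdim=True)
        next_q = target_net(next_state_tensors).gather(1, best_actions).squeeze(1)
        targets = reward_tensors + GAMMA * next_q * (1 - done_tensors)

    # refresh priorities with the new TD errors (hypothetical buffer API)
    td_errors = (q_values - targets).detach().abs()
    for idx, err in zip(idxs, td_errors.cpu().numpy()):
        memory.update(idx, err)

    # importance-sampling weights correct the bias introduced by PER sampling
    weights = torch.from_numpy(np.asarray(is_weights)).type(dtype)
    loss = (weights * (q_values - targets) ** 2).mean()
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()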