def test_parallel(eps_decay, gamma, lr, network, seed, n_test_episodes, render, device):
    env_id = 'LunarLander-v2'
    env = gym.make(env_id).unwrapped
    n_actions = env.action_space.n
    n_states = env.observation_space.shape[0]
    print('Start evaluating the agent with parameters: {pr}'.format(
        pr=[eps_decay, gamma, lr, network]))
    model_path = 'model_{lr}_{eps_decay}_{gamma}_{network}.pt'.format(
        lr=lr, eps_decay=eps_decay, gamma=gamma, network=network)
    if network not in NETWORK:
        raise ValueError('Network key does not exist!')
    fc1_unit, fc2_unit = NETWORK[network]
    # rebuild the trained policy net from disk and evaluate it
    policy_net = load_model(path=model_path, fc1_unit=fc1_unit, fc2_unit=fc2_unit,
                            state_size=n_states, action_size=n_actions)
    rewards = test(n_test_episodes, policy_net, env,
                   device=device, seed=seed, render=render)
    rewards_path = 'test_rewards_{lr}_{eps_decay}_{gamma}_{network}.pkl'.format(
        lr=lr, eps_decay=eps_decay, gamma=gamma, network=network)
    save_rewards(rewards=rewards, path=rewards_path, option='test_rewards')
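# `test_parallel` depends on a `load_model` helper that is not shown in this listing.
# Below is a minimal sketch of what such a helper could look like, assuming the same
# DQN class used in train() and that save_model() stored the policy net's state_dict
# with torch.save; the seed value and map_location choice are assumptions, not the
# project's actual implementation.
import torch

def load_model(path, state_size, action_size, fc1_unit, fc2_unit):
    """Rebuild a DQN with the given layout and load its saved weights (sketch)."""
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    net = DQN(state_size=state_size, action_size=action_size,
              fc1_unit=fc1_unit, fc2_unit=fc2_unit, seed=0).to(device)
    net.load_state_dict(torch.load(path, map_location=device))
    net.eval()  # evaluation mode: no gradient-dependent layers active during testing
    return net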
def get_rewards(size, nb_episodes, n_playout, Agent):
    """Play nb_episodes games of Snake with a playout-based agent and record the scores."""
    env = SnakeGame(size)
    total_rewards = []
    for n in range(nb_episodes):
        print('episode: {}'.format(n))
        sum_rewards = 0
        env.reset()
        done = False
        while not done:
            action = Agent.BestMove(env, n_playout)
            _, reward, done = env.step(action)
            sum_rewards += reward
        total_rewards.append(sum_rewards)
    records = {
        'n_playout': n_playout,
        'size': size,
        'rewards': total_rewards
    }
    # `folder` is expected to be defined at module level
    save_rewards(records, folder=folder,
                 filename=r'\rewards {} {}'.format(size, n_playout))
    return total_rewards
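# Hypothetical usage of get_rewards(): sweep a few playout budgets on a 10x10 board and
# compare the mean score. `MCTSAgent`, the board size, the episode count and the
# `folder` value are placeholder assumptions, not names taken from this project.
if __name__ == '__main__':
    folder = 'results'
    agent = MCTSAgent()
    for n_playout in (10, 50, 100):
        scores = get_rewards(size=10, nb_episodes=20, n_playout=n_playout, Agent=agent)
        print('n_playout={}: mean reward {:.2f}'.format(n_playout, sum(scores) / len(scores)))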
            if done:
                break

        rewards.append(cumulative_reward)
        rate = EPS_END + (EPS_START - EPS_END) * math.exp(
            -1. * steps_done / EPS_DECAY)
        print('Cumulative reward for episode {n_ep} is {cum_reward}; with epsilon: {eps}'.format(
            n_ep=i_episode, cum_reward=cumulative_reward, eps=rate))

        # update the target net after a while
        if i_episode % TARGET_UPDATE == 0:
            # for a soft update of the weights use:
            # soft_update(local_model=policy_net, target_model=target_net, tau=TAU)
            target_net.load_state_dict(policy_net.state_dict())
            print("target net weights updated")

        # stop once the last five episodes all score at least 200
        if np.min(rewards[-5:]) >= 200:
            break

    # save the rewards
    # rewards_path = 'training_rewards_{lr}_{eps_decay}_{network}.pkl'.format(lr=LR, eps_decay=EPS_DECAY, network='simple')
    rewards_path = 'demo_training_rewards.pkl'
    save_rewards(rewards=rewards, path=rewards_path, option='training_rewards')

    # save the policy net
    # model_path = 'model_{lr}_{eps_decay}_{network}.pt'.format(lr=LR, eps_decay=EPS_DECAY, network='simple')
    model_path = 'demo_model.pt'
    save_model(model=policy_net, path=model_path)
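# The commented-out soft_update() call above refers to the standard Polyak-averaging
# alternative to a hard target-network copy. A minimal sketch is given below, assuming
# TAU is a small interpolation factor (e.g. 1e-3); this is the common formulation
#     theta_target <- tau * theta_local + (1 - tau) * theta_target
# and not necessarily the exact helper used in this project.
def soft_update(local_model, target_model, tau):
    """Blend the local network's weights into the target network (sketch)."""
    for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
        target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)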
def train(eps_decay, gamma, lr, network, seed=131):
    env_id = 'LunarLander-v2'
    env = gym.make(env_id).unwrapped
    n_actions = env.action_space.n
    n_states = env.observation_space.shape[0]

    # set seeds for reproducibility
    random.seed(seed)
    env.seed(seed)

    # initialise the networks
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    if network not in NETWORK:
        raise ValueError('Network key does not exist!')
    fc1_unit, fc2_unit = NETWORK[network]
    policy_net = DQN(state_size=n_states, action_size=n_actions,
                     fc1_unit=fc1_unit, fc2_unit=fc2_unit, seed=131).to(device)
    target_net = DQN(state_size=n_states, action_size=n_actions,
                     fc1_unit=fc1_unit, fc2_unit=fc2_unit, seed=1).to(device)
    target_net.load_state_dict(policy_net.state_dict())

    # initialise the replay memory and the optimizer
    memory = ReplayMemory(MEMORY_CAPACITY)
    # optimizer = optim.RMSprop(policy_net.parameters())
    optimizer = optim.Adam(policy_net.parameters(), lr=lr)

    # initiate the global step counter used for epsilon decay
    steps_done = 0

    # Here my watch started
    rewards = []
    for i_episode in range(N_EPISODES):
        cumulative_reward = 0
        state = env.reset()
        state = torch.tensor([state], device=device)
        for t in count():
            if t > N_STEPS_TIMEOUT:
                break
            action, steps_done = select_action(state=state, policy_net=policy_net,
                                               n_actions=n_actions, steps_done=steps_done,
                                               device=device, eps_end=EPS_END,
                                               eps_start=EPS_START, eps_decay=eps_decay)
            state_next, reward, done, _ = env.step(action.item())
            # env.render()
            cumulative_reward = cumulative_reward + reward

            # convert to tensors before storing the transition
            state_next = torch.tensor([state_next], device=device)
            reward = torch.tensor([reward], device=device, dtype=torch.float32)
            memory.push(state, action, state_next, reward)
            state = state_next

            # update the weights in the policy net at every step
            optimize_model(memory=memory, batch_size=BATCH_SIZE, device=device,
                           policy_net=policy_net, target_net=target_net,
                           optimizer=optimizer, gamma=gamma)
            if done:
                break

        rewards.append(cumulative_reward)

        # update the target net after a while
        if i_episode % TARGET_UPDATE == 0:
            # for a soft update of the weights use:
            # soft_update(local_model=policy_net, target_model=target_net, tau=TAU)
            target_net.load_state_dict(policy_net.state_dict())

        # stop once the last five episodes all score at least 200
        if np.min(rewards[-5:]) >= 200:
            break

    # save the rewards
    rewards_path = 'training_rewards_{lr}_{eps_decay}_{gamma}_{network}.pkl'.format(
        lr=lr, eps_decay=eps_decay, gamma=gamma, network=network)
    save_rewards(rewards=rewards, path=rewards_path, option='training_rewards')

    # save the policy net
    model_path = 'model_{lr}_{eps_decay}_{gamma}_{network}.pt'.format(
        lr=lr, eps_decay=eps_decay, gamma=gamma, network=network)
    save_model(model=policy_net, path=model_path)

    print("Finished parameter combo: {params}".format(
        params=[eps_decay, gamma, lr, network]))
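# train() relies on select_action(), optimize_model() and ReplayMemory(), which are
# defined elsewhere in the project. The sketch below shows one way select_action() could
# implement the epsilon-greedy policy implied by the decay formula printed in the demo
# loop above (eps = eps_end + (eps_start - eps_end) * exp(-steps_done / eps_decay)). The
# signature matches the call site, but the body is an assumption, not the project's code.
import math
import random
import torch

def select_action(state, policy_net, n_actions, steps_done, device,
                  eps_end, eps_start, eps_decay):
    """Epsilon-greedy action selection with exponentially decaying epsilon (sketch)."""
    eps_threshold = eps_end + (eps_start - eps_end) * math.exp(-1.0 * steps_done / eps_decay)
    steps_done += 1
    if random.random() > eps_threshold:
        # exploit: pick the action with the highest predicted Q-value
        with torch.no_grad():
            action = policy_net(state.to(device)).max(1)[1].view(1, 1)
    else:
        # explore: pick a random action
        action = torch.tensor([[random.randrange(n_actions)]], device=device, dtype=torch.long)
    return action, steps_done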
def run(self, verbose=False, show=False, epochs=10, train=True, scheduling=True):
    if verbose and show:
        raise ValueError('Experiment can either be run in verbose or show mode')
    end_states = []
    try:
        epoch_iter = tqdm(range(epochs)) if verbose else range(epochs)
        if config.LOAD_MODEL:
            self.exp_bot.Q, self.exp_bot.epsilon = load_model(self.exp_bot.Q)
        for epoch in epoch_iter:
            self.exp_bot.epoch = epoch
            self.exp_bot.train = train
            self.exp_game.start()
            if show:
                self.exp_display.start()
            if verbose:
                pbar = tqdm()
            save_model(self.exp_bot.Q, epoch, self.exp_bot.epsilon)
            if epoch % config.TARGET_UPDATE == 0:
                self.exp_bot.Q_target.load_state_dict(self.exp_bot.Q.state_dict())
            count = 0
            while True:
                count += 1
                self.num_iters += 1
                # anneal epsilon every 50000 iterations, down to a floor of 0.1
                if self.num_iters % 50000 == 0 and scheduling:
                    self.exp_bot.epsilon = max(
                        0.1, self.exp_bot.epsilon - (self.num_iters // 50000) * 0.1)
                start_time = time.time()
                if show:
                    game_screen = self.exp_game.get_screen()
                    status = self.exp_bot.get_status()
                    self.exp_display.update(game_screen, status)
                    if not self.exp_display.running:
                        self.exp_game.quit()
                        break
                    if self.exp_display.paused:
                        continue
                state = self.exp_game.get_state()
                if not self.exp_game.running:
                    break
                if count == 15000:
                    self.exp_game.quit()
                    break
                act = self.exp_bot.choose_action(state, self.replay_buffer)
                # act = self.exp_bot.choose_action(state)
                if act == -1:
                    self.exp_game.quit()
                    break
                self.exp_game.do_action(act)
                new_state = self.exp_bot.parse_state(
                    self.exp_game.get_state(), update=False)
                if verbose:
                    pbar.update(1)
                # throttle the loop so each move takes at least MOVE_DELAY seconds
                remaining = config.MOVE_DELAY - (time.time() - start_time)
                if remaining > 0:
                    time.sleep(remaining)
                # if self.exp_bot.prev_state != new_state:
                #     print(self.exp_bot.prev_state, new_state)
                if self.exp_bot.prev_state and new_state:
                    self.replay_buffer.add(
                        self.exp_bot.prev_state,
                        action.map_act_int[self.exp_bot.prev_act],
                        self.exp_bot.prev_reward, new_state, False)
            # the episode ended: store the terminal transition
            if self.exp_bot.prev_state and new_state:
                self.replay_buffer.add(
                    self.exp_bot.prev_state,
                    action.map_act_int[self.exp_bot.prev_act],
                    self.exp_bot.prev_reward, new_state, True)
            if verbose:
                tqdm.write('Epoch {}: {}'.format(epoch, state['score']))
                pbar.close()
            self.history.append(state['score'])
            save_rewards(self.history)
            end_states.append(state)
    except Exception as e:
        raise e
    finally:
        if show:
            self.exp_display.stop()
    return end_states
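# run() pushes transitions into self.replay_buffer through
# add(state, action, reward, next_state, done). A minimal fixed-capacity buffer matching
# that interface is sketched below; the default capacity and batch size, and the layout
# returned by sample(), are assumptions rather than the project's actual replay buffer.
import random
from collections import deque

class ReplayBuffer:
    """Fixed-size FIFO store of (state, action, reward, next_state, done) tuples (sketch)."""

    def __init__(self, capacity=100000):
        self.buffer = deque(maxlen=capacity)

    def add(self, state, action, reward, next_state, done):
        self.buffer.append((state, action, reward, next_state, done))

    def sample(self, batch_size=64):
        # uniform random minibatch, returned as per-field tuples
        batch = random.sample(self.buffer, min(batch_size, len(self.buffer)))
        return tuple(zip(*batch))

    def __len__(self):
        return len(self.buffer)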
    path = 'demo_model.pt'
    policy_net = load_model(path=path, state_size=n_states, action_size=n_actions,
                            fc1_unit=16, fc2_unit=8)
    rewards = test(n_test_episodes, policy_net, env,
                   device=device, seed=seed, render=render)
    print("Rewards list for {n} episodes is: {r}".format(n=n_test_episodes, r=rewards))
    save_rewards(rewards=rewards, path='demo_test_rewards.pkl', option='test_rewards')
elif test_option == 'hyperparam':
    # evaluate every saved hyperparameter combination in parallel
    hyper_params = [(eps_decay, gamma, lr, network, seed, n_test_episodes, render, device)
                    for eps_decay in EPS_DECAY
                    for gamma in GAMMA
                    for lr in LR
                    for network in NETWORK.keys()]
    pool = Pool(6)
    pool.starmap(test_parallel, hyper_params)
    pool.close()
else:
    raise ValueError('Option not available!')
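# The 'hyperparam' branch above fans test_parallel() out over a multiprocessing Pool.
# On start methods that spawn fresh interpreters (Windows, and macOS by default), pool
# creation has to sit under an `if __name__ == '__main__':` guard, and a context manager
# (or an explicit pool.join()) makes sure every worker finishes before the script exits.
# A minimal sketch of such a driver, reusing the hyper_params list built above:
from multiprocessing import Pool

if __name__ == '__main__':
    with Pool(processes=6) as pool:
        pool.starmap(test_parallel, hyper_params)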