def test_parallel(eps_decay, gamma, lr, network, seed, n_test_episodes, render,
                  device):
    id = 'LunarLander-v2'
    env = gym.make(id).unwrapped
    n_actions = env.action_space.n
    n_states = env.observation_space.shape[0]
    print('Start evaluating the agent with parameters: {pr}'.format(
        pr=[eps_decay, gamma, lr, network]))
    model_path = 'model_{lr}_{eps_decay}_{gamma}_{network}.pt'.format(
        lr=lr, eps_decay=eps_decay, gamma=gamma, network=network)
    if network not in NETWORK.keys():
        raise ValueError('Network key does not exist!')

    fc1_unit, fc2_unit = NETWORK.get(network)
    policy_net = load_model(path=model_path,
                            fc1_unit=fc1_unit,
                            fc2_unit=fc2_unit,
                            state_size=n_states,
                            action_size=n_actions)
    rewards = test(n_test_episodes,
                   policy_net,
                   env,
                   device=device,
                   seed=seed,
                   render=render)

    rewards_path = 'test_rewards_{lr}_{eps_decay}_{gamma}_{network}.pkl'.format(
        lr=lr, eps_decay=eps_decay, gamma=gamma, network=network)
    save_rewards(rewards=rewards, path=rewards_path, option='test_rewards')
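# A minimal usage sketch (not part of the original example): evaluate a single
# hyperparameter combination directly. EPS_DECAY, GAMMA, LR and NETWORK are
# assumed to be the module-level grids of candidate values that the parallel
# sweep elsewhere in these examples iterates over, and torch is assumed to be
# imported in this module.
if __name__ == '__main__':
    test_parallel(eps_decay=EPS_DECAY[0],
                  gamma=GAMMA[0],
                  lr=LR[0],
                  network=list(NETWORK.keys())[0],
                  seed=131,
                  n_test_episodes=10,
                  render=False,
                  device=torch.device("cuda" if torch.cuda.is_available() else "cpu"))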
def get_rewards(size, nb_episodes, n_playout, Agent):
    """Run nb_episodes Snake games and return the per-episode cumulative rewards."""
    env = SnakeGame(size)
    total_rewards = []
    for n in range(nb_episodes):
        print('episode : {}'.format(n))
        sum_rewards = 0
        env.reset()
        done = False
        while not done:
            action = Agent.BestMove(env, n_playout)
            _, reward, done = env.step(action)
            sum_rewards += reward
        total_rewards.append(sum_rewards)
        records = {
            'n_playout': n_playout,
            'size': size,
            'rewards': total_rewards
        }
        save_rewards(records,
                     folder=folder,
                     filename=r'\rewards {} {}'.format(size, n_playout))
    return total_rewards
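# A hypothetical usage sketch (not part of the original example): the board size,
# episode counts and `mcts_agent` below are placeholders, and `folder` inside
# get_rewards is assumed to be defined at module level in the original source.
#
#   rewards = get_rewards(size=10, nb_episodes=100, n_playout=50, Agent=mcts_agent)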
            if done:
                break

        rewards.append(cumulative_reward)
        rate = EPS_END + (EPS_START - EPS_END) * math.exp(
            -1. * steps_done / EPS_DECAY)
        print('cumulative reward for episode {n_ep} is {cum_reward}; epsilon: {eps}'.format(
            n_ep=i_episode,
            cum_reward=cumulative_reward,
            eps=rate))

        # update the target net after a while
        if i_episode % TARGET_UPDATE == 0:
            # To soft-update the target network weights instead:
            #         soft_update(local_model=policy_net, target_model=target_net, tau=TAU)
            target_net.load_state_dict(policy_net.state_dict())
            print("target net weights updated")

        if len(rewards) >= 5 and np.min(rewards[-5:]) >= 200:
            break

    # save the rewards
    # rewards_path = 'training_rewards_{lr}_{eps_decay}_{network}.pkl'.format(lr=LR,eps_decay=EPS_DECAY,network='simple' )
    rewards_path = 'demo_training_rewards.pkl'
    save_rewards(rewards=rewards, path=rewards_path, option='training_rewards')

    # save the policy net
    # model_path = 'model_{lr}_{eps_decay}_{network}.pt'.format(lr=LR,eps_decay=EPS_DECAY,network='simple' )
    model_path = 'demo_model.pt'
    save_model(model=policy_net, path=model_path)
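# The epsilon schedule printed in the loop above, factored into a small helper for
# readability. This is an illustrative sketch (not part of the original example);
# it assumes the same module-level EPS_START, EPS_END and EPS_DECAY constants and
# that math is imported.
def epsilon_by_step(steps_done, eps_start=EPS_START, eps_end=EPS_END, eps_decay=EPS_DECAY):
    """Exponentially anneal epsilon from eps_start toward eps_end as steps accumulate."""
    return eps_end + (eps_start - eps_end) * math.exp(-1. * steps_done / eps_decay)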
Example #4
def train(eps_decay, gamma, lr, network, seed=131):
    id = 'LunarLander-v2'
    env = gym.make(id).unwrapped
    n_actions = env.action_space.n
    n_states = env.observation_space.shape[0]
    # set seed
    random.seed(seed)
    env.seed(seed)

    # initiate the network
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    if network not in NETWORK.keys():
        raise ValueError('Network key does not exist!')

    fc1_unit, fc2_unit = NETWORK.get(network)
    policy_net = DQN(state_size=n_states,
                     action_size=n_actions,
                     fc1_unit=fc1_unit,
                     fc2_unit=fc2_unit,
                     seed=131).to(device)
    target_net = DQN(state_size=n_states,
                     action_size=n_actions,
                     fc1_unit=fc1_unit,
                     fc2_unit=fc2_unit,
                     seed=1).to(device)
    target_net.load_state_dict(policy_net.state_dict())

    # initiate the memory replayer and optimizer
    memory = ReplayMemory(MEMORY_CAPACITY)
    # optimizer = optim.RMSprop(policy_net.parameters())
    optimizer = optim.Adam(policy_net.parameters(), lr=lr)

    # initiate the global steps
    steps_done = 0
    # training loop: track the cumulative reward of each episode
    rewards = []
    for i_episode in range(N_EPISODES):
        cumulative_reward = 0
        state = env.reset()
        state = torch.tensor([state], device=device)
        for t in count():
            if t > N_STEPS_TIMEOUT:
                break
            action, steps_done = select_action(state=state,
                                               policy_net=policy_net,
                                               n_actions=n_actions,
                                               steps_done=steps_done,
                                               device=device,
                                               eps_end=EPS_END,
                                               eps_start=EPS_START,
                                               eps_decay=eps_decay)

            state_next, reward, done, _ = env.step(action.item())
            # env.render()
            cumulative_reward = cumulative_reward + reward
            # convert it to tensor
            state_next = torch.tensor([state_next], device=device)
            reward = torch.tensor([reward], device=device, dtype=torch.float32)
            memory.push(state, action, state_next, reward)
            state = state_next

            # every step update the weights in the policy net
            optimize_model(memory=memory,
                           batch_size=BATCH_SIZE,
                           device=device,
                           policy_net=policy_net,
                           target_net=target_net,
                           optimizer=optimizer,
                           gamma=gamma)

            if done:
                break

        rewards.append(cumulative_reward)

        # update the target net after a while
        if i_episode % TARGET_UPDATE == 0:
            # To soft-update the target network weights instead:
            #         soft_update(local_model=policy_net, target_model=target_net, tau=TAU)
            target_net.load_state_dict(policy_net.state_dict())

        if len(rewards) >= 5 and np.min(rewards[-5:]) >= 200:
            break

    # save the rewards
    rewards_path = 'training_rewards_{lr}_{eps_decay}_{gamma}_{network}.pkl'.format(
        lr=lr, eps_decay=eps_decay, gamma=gamma, network=network)
    save_rewards(rewards=rewards, path=rewards_path, option='training_rewards')

    # save the policy net
    model_path = 'model_{lr}_{eps_decay}_{gamma}_{network}.pt'.format(
        lr=lr, eps_decay=eps_decay, gamma=gamma, network=network)
    save_model(model=policy_net, path=model_path)
    print("Finished parameter combo: {params}".format(
        params=[eps_decay, gamma, lr, network]))
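# A minimal sketch (not part of the original example) of sweeping the same
# hyperparameter grid for training, mirroring the Pool-based test sweep shown in a
# later example. EPS_DECAY, GAMMA and LR are assumed to be module-level sequences
# of candidate values and NETWORK the dict of architectures used above.
if __name__ == '__main__':
    from multiprocessing import Pool
    train_grid = [(eps_decay, gamma, lr, network)
                  for eps_decay in EPS_DECAY
                  for gamma in GAMMA
                  for lr in LR
                  for network in NETWORK.keys()]
    with Pool(6) as pool:
        pool.starmap(train, train_grid)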
Example #5
    def run(self,
            verbose=False,
            show=False,
            epochs=10,
            train=True,
            scheduling=True):
        if verbose and show:
            raise ValueError(
                'Experiment can be run in either verbose or show mode, not both')

        end_states = []
        try:
            epoch_iter = tqdm(range(epochs)) if verbose else range(epochs)
            if config.LOAD_MODEL:
                self.exp_bot.Q, self.exp_bot.epsilon = load_model(
                    self.exp_bot.Q)
            for epoch in epoch_iter:
                self.exp_bot.epoch = epoch
                self.exp_bot.train = train
                self.exp_game.start()
                if show:
                    self.exp_display.start()
                if verbose:
                    pbar = tqdm()
                save_model(self.exp_bot.Q, epoch, self.exp_bot.epsilon)
                if epoch % config.TARGET_UPDATE == 0:
                    self.exp_bot.Q_target.load_state_dict(
                        self.exp_bot.Q.state_dict())
                count = 0
                new_state = None  # guard: the loop may exit before any transition is observed
                while True:
                    count += 1
                    self.num_iters += 1
                    if self.num_iters % 50000 == 0 and scheduling:
                        self.exp_bot.epsilon = max(
                            0.1, self.exp_bot.epsilon -
                            (self.num_iters // 50000) * 0.1)
                    start_time = time.time()
                    if show:
                        game_screen = self.exp_game.get_screen()
                        status = self.exp_bot.get_status()
                        self.exp_display.update(game_screen, status)
                        if not self.exp_display.running:
                            self.exp_game.quit()
                            break
                        if self.exp_display.paused:
                            continue
                    state = self.exp_game.get_state()
                    if not self.exp_game.running:
                        break
                    if count == 15000:
                        self.exp_game.quit()
                        break
                    act = self.exp_bot.choose_action(state, self.replay_buffer)
                    # act = self.exp_bot.choose_action(state)
                    if act == -1:
                        self.exp_game.quit()
                        break
                    self.exp_game.do_action(act)
                    new_state = self.exp_bot.parse_state(
                        self.exp_game.get_state(), update=False)
                    if verbose:
                        pbar.update(1)
                    remaining = config.MOVE_DELAY - (time.time() - start_time)
                    if remaining > 0:
                        time.sleep(remaining)
                    # if self.exp_bot.prev_state != new_state:
                    #     print(self.exp_bot.prev_state, new_state)
                    if self.exp_bot.prev_state and new_state:
                        self.replay_buffer.add(
                            self.exp_bot.prev_state,
                            action.map_act_int[self.exp_bot.prev_act],
                            self.exp_bot.prev_reward, new_state, False)
                if self.exp_bot.prev_state and new_state:
                    self.replay_buffer.add(
                        self.exp_bot.prev_state,
                        action.map_act_int[self.exp_bot.prev_act],
                        self.exp_bot.prev_reward, new_state, True)
                if verbose:
                    tqdm.write('Epoch {}: {}'.format(epoch, state['score']))
                    pbar.close()
                self.history.append(state['score'])
                save_rewards(self.history)
                end_states.append(state)
        finally:
            if show:
                self.exp_display.stop()
        return end_states
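    # A hypothetical usage sketch (the enclosing class is not shown in this excerpt;
    # "Experiment" is an assumed name for it):
    #
    #   exp = Experiment()
    #   end_states = exp.run(verbose=True, epochs=50, train=True, scheduling=True)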
        path = 'demo_model.pt'
        policy_net = load_model(path=path,
                                state_size=n_states,
                                action_size=n_actions,
                                fc1_unit=16,
                                fc2_unit=8)
        rewards = test(n_test_episodes,
                       policy_net,
                       env,
                       device=device,
                       seed=seed,
                       render=render)
        print("Rewards list for {n} episode is: {r}".format(n=n_test_episodes,
                                                            r=rewards))
        save_rewards(rewards=rewards,
                     path='demo_test_rewards.pkl',
                     option='test_rewards')

    elif test_option == 'hyperparam':

        hyper_params = [(eps_decay, gamma, lr, network, seed, n_test_episodes,
                         render, device) for eps_decay in EPS_DECAY
                        for gamma in GAMMA for lr in LR
                        for network in NETWORK.keys()]

        pool = Pool(6)
        pool.starmap(test_parallel, hyper_params)
        pool.close()
    else:
        raise ValueError('Option not available!')