Example #1
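Training a DDQN agent with an experience replay buffer on a Plants vs. Zombies Gym environment, then saving the trained network and its training data under a user-supplied name.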
import gym
from agents import experienceReplayBuffer, DDQNAgent, QNetwork
import torch
from agents import evaluate
from copy import deepcopy

if __name__ == "__main__":
    n_iter = 100000
    env = gym.make('gym_pvz:pvz-env-v2')
    nn_name = input("Save name: ")
    # Replay buffer: hold up to 100k transitions and collect 10k before
    # learning starts.
    buffer = experienceReplayBuffer(memory_size=100000, burn_in=10000)
    net = QNetwork(env, device='cpu', use_zombienet=False, use_gridnet=False)
    # old_agent = torch.load("agents/benchmark/dfq5_znet_epslinear")
    # net.zombienet.load_state_dict(old_agent.zombienet.state_dict())
    # for p in net.zombienet.parameters():
    #     p.requires_grad = False
    # net.optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, net.parameters()),
    #                                       lr=net.learning_rate)
    agent = DDQNAgent(env, net, buffer, n_iter=n_iter, batch_size=200)
    # Train, running an evaluation pass every 5000 episodes.
    agent.train(max_episodes=n_iter,
                evaluate_frequency=5000,
                evaluate_n_iter=1000)
    # Save the trained network and its training statistics under the given name.
    torch.save(agent.network, nn_name)
    agent._save_training_data(nn_name)
Example #2
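A TensorFlow 1.x training loop: actions are chosen epsilon-greedily with epsilon annealed linearly after a random pretraining phase, per-episode rewards and losses are tracked, and the model checkpoint and live charts are refreshed at configurable intervals.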
import os

import numpy as np
import tensorflow as tf  # TensorFlow 1.x API (tf.Session, tf.ConfigProto)


def train(config, env):
    all_rewards = []
    steps_taken = []
    all_losses = []

    epsilon = 1.0
    tf.reset_default_graph()
    annealing_rate = (epsilon - config.epsilon_min) / config.total_episodes

    tfConfig = tf.ConfigProto()
    tfConfig.gpu_options.per_process_gpu_memory_fraction = config.gpu
    with tf.Session(config=tfConfig) as sess:
        #       agent = QAgent(sess, config)
        agent = DDQNAgent(sess, config)
        sess.run(tf.global_variables_initializer())
        # fig is assumed to be a matplotlib figure defined at module level;
        # it is redrawn periodically by refresh_chart() below.
        fig.show()
        fig.canvas.draw()

        # Create the folder to store the model in, if it doesn't exist
        # (path is assumed to be defined at module level).
        if config.save_model and not os.path.exists(path):
            os.makedirs(path)

        total_step_count = 0

        if config.load_model:
            print('Loading latest saved model...')
            agent.load_agent_state()

        for episode_count in range(1, config.total_episodes + 1):
            step_count = 0
            episode_buffer = []
            running_reward = 0
            episode_loss = []
            done = False

            s = env.reset()
            while step_count < config.max_episode_length and not done:
                if config.render_env:
                    env.render()

                step_count += 1
                total_step_count += 1

                # Epsilon-greedy: act randomly with probability epsilon, and
                # always act randomly during the pretraining phase.
                if np.random.rand() < epsilon or \
                        total_step_count < config.pretrain_steps:
                    action = np.random.randint(0, config.a_size)
                else:
                    action = agent.take_action(s)

                next_state, reward, done, _ = env.step(action)
                if config.verbose:
                    print("Post Action", action, " on step count", step_count,
                          "total_step_count", total_step_count, "next_state",
                          next_state, "reward", reward, "done", done)
                d_int = 1 if done else 0
                running_reward += reward
                episode_buffer.append([s, action, reward, next_state, d_int])
                s = next_state

                if total_step_count > config.pretrain_steps and \
                total_step_count % config.update_freq == 0:
                    episode_loss.append(np.mean(agent.update_agent()))

            agent.add_experiences(episode_buffer)
            all_rewards.append(running_reward)
            if len(episode_loss) != 0:
                all_losses.append(np.mean(episode_loss))
            steps_taken.append(step_count)

            # Anneal epsilon linearly, but never below the configured floor.
            if total_step_count > config.pretrain_steps:
                epsilon = max(epsilon - annealing_rate, config.epsilon_min)

            # Save model.
            if config.save_model and total_step_count > config.pretrain_steps and \
                    episode_count % config.save_model_episode_interval == 0:
                print('Saving model...')
                agent.save_agent_state(
                    path + '/model-' + str(episode_count) + '.ckpt',
                    total_step_count)

            # Refresh charts
            if total_step_count > config.pretrain_steps and \
            episode_count % config.chart_refresh_interval == 0:
                refresh_chart(all_rewards, all_losses)
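refresh_chart() and fig are referenced above but not defined in the snippet; a minimal sketch of what they might look like, assuming a module-level matplotlib figure with one axis per curve (names and layout here are assumptions, not the original implementation):

import matplotlib.pyplot as plt

fig, (reward_ax, loss_ax) = plt.subplots(1, 2)

def refresh_chart(all_rewards, all_losses):
    # Redraw the reward and loss curves in place on the live figure.
    reward_ax.clear()
    reward_ax.plot(all_rewards)
    reward_ax.set_title('Episode reward')
    loss_ax.clear()
    loss_ax.plot(all_losses)
    loss_ax.set_title('Mean episode loss')
    fig.canvas.draw()
    plt.pause(0.001)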
Example #3
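DDQN training on Kaggle's hungry_geese environment against fixed opponents, with a decaying epsilon-greedy schedule, target-network updates every 10 episodes, a greedy evaluation pass every 1000 episodes, and periodic weight checkpoints.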
import numpy as np
import matplotlib.pyplot as plt
from kaggle_environments import make


def ddqn_train(model_name,
               load_model=False,
               model_filename=None,
               optimizer_filename=None):
    print("DDQN -- Training")

    env = make('hungry_geese')
    trainer = env.train(
        ['greedy', None, 'agents/boilergoose.py', 'agents/handy_rl.py'])

    agent = DDQNAgent(rows=11, columns=11, num_actions=3)
    buffer = ReplayBuffer()
    strategy = EpsilonGreedyStrategy(start=0.5, end=0.0, decay=0.00001)

    if load_model:
        agent.load_model_weights(model_filename)
        agent.load_optimizer_weights(optimizer_filename)

    start_episode = 0
    end_episode = 50000
    epochs = 32
    batch_size = 128

    training_rewards = []
    evaluation_rewards = []
    last_1000_ep_reward = []

    for episode in range(start_episode + 1, end_episode + 1):
        obs_dict = trainer.reset()
        epsilon = strategy.get_epsilon(episode - start_episode)
        ep_reward, ep_steps, done = 0, 0, False
        prev_direction = 0

        while not done:
            ep_steps += 1

            state = preprocess_state(obs_dict, prev_direction)
            action = agent.select_epsilon_greedy_action(state, epsilon)
            direction = get_direction(prev_direction, action)
            next_obs_dict, _, done, _ = trainer.step(
                env.specification.action.enum[direction])
            reward = calculate_reward(obs_dict, next_obs_dict)
            next_state = preprocess_state(next_obs_dict, direction)
            buffer.add(state, action, reward, next_state, done)

            obs_dict = next_obs_dict
            prev_direction = direction

            ep_reward += reward

        if len(buffer) >= batch_size:
            for _ in range(epochs):
                states, actions, rewards, next_states, dones = buffer.get_samples(
                    batch_size)
                agent.fit(states, actions, rewards, next_states, dones)

        print("EPISODE " + str(episode) + " - REWARD: " + str(ep_reward) +
              " - STEPS: " + str(ep_steps))

        if len(last_1000_ep_reward) == 1000:
            last_1000_ep_reward = last_1000_ep_reward[1:]
        last_1000_ep_reward.append(ep_reward)

        if episode % 10 == 0:
            agent.update_target_network()

        if episode % 1000 == 0:
            print('Episode ' + str(episode) + '/' + str(end_episode))
            print('Epsilon: ' + str(round(epsilon, 3)))

            last_1000_ep_reward_mean = np.mean(last_1000_ep_reward).round(3)
            training_rewards.append(last_1000_ep_reward_mean)
            print('Average reward in last 1000 episodes: ' +
                  str(last_1000_ep_reward_mean))
            print()

        if episode % 1000 == 0:  # greedy evaluation pass over 100 episodes
            eval_reward = 0
            for i in range(100):
                obs_dict = trainer.reset()
                epsilon = 0
                done = False
                prev_direction = 0
                while not done:
                    state = preprocess_state(obs_dict, prev_direction)
                    action = agent.select_epsilon_greedy_action(state, epsilon)
                    direction = get_direction(prev_direction, action)
                    next_obs_dict, _, done, _ = trainer.step(
                        env.specification.action.enum[direction])
                    reward = calculate_reward(obs_dict, next_obs_dict)
                    obs_dict = next_obs_dict
                    prev_direction = direction
                    eval_reward += reward
            eval_reward /= 100
            evaluation_rewards.append(eval_reward)
            print("Evaluation reward: " + str(eval_reward))
            print()

        if episode % 5000 == 0:
            agent.save_model_weights('models/ddqn_' + model_name + '_' +
                                     str(episode) + '.h5')
            agent.save_optimizer_weights('models/ddqn_' + model_name + '_' +
                                         str(episode) + '_optimizer.npy')

    agent.save_model_weights('models/ddqn_' + model_name + '_' +
                             str(end_episode) + '.h5')
    agent.save_optimizer_weights('models/ddqn_' + model_name + '_' +
                                 str(end_episode) + '_optimizer.npy')

    plt.plot(range(start_episode + 1000, end_episode + 1, 1000),
             training_rewards)
    plt.title('Training rewards')
    plt.show()

    plt.plot(range(start_episode + 1000, end_episode + 1, 1000),
             evaluation_rewards)
    plt.title('Evaluation rewards')
    plt.show()
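EpsilonGreedyStrategy is used above but not defined in the snippet; a minimal sketch consistent with its constructor arguments (start, end, decay), assuming the common exponentially decaying schedule (the formula is an assumption, not the original implementation):

import math

class EpsilonGreedyStrategy:
    def __init__(self, start, end, decay):
        self.start = start  # initial exploration rate
        self.end = end      # exploration floor
        self.decay = decay  # per-episode decay rate

    def get_epsilon(self, current_step):
        # Decay exponentially from start toward end as steps accumulate.
        return self.end + (self.start - self.end) * \
            math.exp(-self.decay * current_step)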
Example #4
Instantiating a set of agents for comparison (DDQN with prioritized and with plain experience replay, plain DDQN, deep Q-learning, PPO, and two baselines), all sized from a shared game-state object gs:
    agent0 = DDQNAgentWithPER(action_space_size=gs.get_action_space_size(),
                              neurons_per_hidden_layer=128,
                              hidden_layers=5)
    agent0.alpha = 0.1  # learning rate
    agent0.epsilon = 0.005  # exploration rate

    agent1 = RandomAgent()

    agent2 = DDQNAgentWithER(action_space_size=gs.get_action_space_size(),
                             neurons_per_hidden_layer=128,
                             hidden_layers=5)
    agent2.alpha = 0.1
    agent2.epsilon = 0.005

    agent3 = DDQNAgent(action_space_size=gs.get_action_space_size(),
                       neurons_per_hidden_layer=128,
                       hidden_layers=5)
    agent3.alpha = 0.1
    agent3.epsilon = 0.005

    agent4 = DeepQLearningAgent(action_space_size=gs.get_action_space_size(),
                                neurons_per_hidden_layer=128,
                                hidden_layers=5)
    agent4.alpha = 0.1
    agent4.epsilon = 0.005

    agent5 = PPOAgent(state_space_size=gs.get_vectorized_state().shape[0],
                      action_space_size=gs.get_action_space_size())

    agent6 = RandomRolloutAgent(100, False)
Example #5
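Selecting one of four agent variants (DQN or double DQN, each with a plain or dueling Q-network) from command-line flags.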
    # argparse delivers these flags as strings; convert them to booleans.
    args.double_dqn = args.double_dqn == 'True'
    args.duelling = args.duelling == 'True'

    print("Double DQN {}, Duelling Architecture {}".format(
        args.double_dqn, args.duelling))

    # instantiate appropriate agent
    if args.double_dqn and args.duelling:
        agent = DDQNAgent(state_size=37,
                          action_size=4,
                          model=DuelingQNetwork,
                          seed=0)
        agent_name = 'duel_ddqn'

    elif args.double_dqn and not args.duelling:
        agent = DDQNAgent(state_size=37, action_size=4, model=QNetwork, seed=0)
        agent_name = 'ddqn'

    elif not args.double_dqn and args.duelling:
        agent = DQNAgent(state_size=37,
                         action_size=4,
                         model=DuelingQNetwork,
                         seed=0)
        agent_name = 'duel_dqn'

    else:
        agent = DQNAgent(state_size=37, action_size=4, model=QNetwork, seed=0)
        agent_name = 'dqn'
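Comparing flag strings against 'True' by hand is fragile; a more idiomatic sketch using the standard argparse API (the parser setup below is an assumption, not part of the original script) registers the flags as booleans up front:

import argparse

parser = argparse.ArgumentParser()
# store_true yields a real boolean: flag absent -> False, flag present -> True.
parser.add_argument('--double_dqn', action='store_true')
parser.add_argument('--duelling', action='store_true')
args = parser.parse_args()
print("Double DQN {}, Duelling Architecture {}".format(
    args.double_dqn, args.duelling))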