Example #1
def main(config, max_num_of_steps, max_num_of_episodes, load_model, save_model,
         load_memory, save_memory, log_path):
    agent = DQNAgent(config)

    with agent.graph.as_default():
        if load_model:
            step = agent.load_model(load_model)
            screen_log.info("Load model: {}".format(load_model))
            screen_log.info("Start from step {}".format(step))
        else:
            step = 0

        if load_memory:
            agent.load_memory(load_memory)
            n_frames = len(agent.memory)
            screen_log.info("Load memory: {}".format(load_memory))
            screen_log.info("Memory size: {}".format(n_frames))

        # timestamp (MMDDhhmmss) used to name this run's log directory
        log_name = '{:02}{:02}{:02}{:02}{:02}'.format(*time.localtime()[1:6])
        summary_writer = tf.summary.FileWriter(
            logdir=os.path.join(log_path, log_name),
            graph=agent.graph)

        episode = 0
        rewards_per_episode = []
        sum_Qs = 0.0
        sum_losses = 0.0

        try:
            while (step < max_num_of_steps and episode < max_num_of_episodes):
                episode += 1
                episode_done = False

                # reset_random_env() and the `env` used below are assumed to be
                # module-level helpers defined elsewhere in the project.
                next_observation = reset_random_env()
                next_observation = preprocess_observation(next_observation)

                rewards_per_episode.append(0)

                while not episode_done:
                    observation = next_observation

                    if len(agent.memory) < config['replay_start_size']:
                        # init replay memory
                        action = env.action_space.sample()

                        next_observation, reward, episode_done, info = env.step(
                            action)
                        next_observation = preprocess_observation(
                            next_observation)
                        agent.memory.append(
                            MemoryItem(observation, action, reward,
                                       episode_done, info))

                        continue

                    state = agent.get_recent_state(observation)
                    # get_Q_values returns a batch of Q-value rows; keep the
                    # single row for this state
                    Qs = agent.get_Q_values(state)[0]

                    # epsilon-greedy action selection
                    epsilon = get_epsilon(config, step)
                    if np.random.RandomState().rand() < epsilon:
                        action = env.action_space.sample()
                    else:
                        action = agent.get_action_from_Q(Qs)

                    next_observation, reward, episode_done, info = env.step(
                        action)
                    next_observation = preprocess_observation(next_observation)
                    agent.memory.append(
                        MemoryItem(observation, action, reward, episode_done,
                                   info))

                    step += 1
                    rewards_per_episode[-1] += reward
                    sum_Qs += Qs[action]

                    # train step
                    loss, loss_summary_str = agent.optimize_Q()
                    summary_writer.add_summary(loss_summary_str, step)
                    sum_losses += loss

                    if step % 1000 == 0:
                        ave_loss = sum_losses / step
                        ave_reward = np.mean(rewards_per_episode)
                        ave_Q = sum_Qs / step

                        Q_summary_str, reward_summary_str = agent.evaluate(
                            ave_reward, ave_Q)

                        summary_writer.add_summary(Q_summary_str, step)
                        summary_writer.add_summary(reward_summary_str, step)

                        screen_log.info(
                            'step: {}, ave. loss: {:g}, '
                            'ave. reward: {:g}, ave. Q: {:g}'.format(
                                step,
                                ave_loss,
                                ave_reward,
                                ave_Q,
                            ))
                    if step % 10000 == 0:
                        agent.save_model(save_model, step)
                    if step % 1000000 == 0:
                        agent.save_memory(save_memory, step)

        except KeyboardInterrupt:
            print("\nUser interrupted training...")
        finally:
            summary_writer.close()

            agent.save_model(save_model, step)
            agent.save_memory(save_memory, step)

        screen_log.info(
            'Finished: the number of steps {}, the number of episodes {}.'
            .format(step, episode))
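
Both examples call module-level helpers that this listing does not show (along with the usual imports: numpy as np, tensorflow as tf, gym, os, time, PIL.Image). The sketch below is one plausible shape for those pieces, assuming DQN-paper-style config keys; it is illustrative, not the project's actual implementation.

import collections

import numpy as np
from PIL import Image

# Transition record stored in the agent's replay memory (assumed field names).
MemoryItem = collections.namedtuple(
    'MemoryItem', ['observation', 'action', 'reward', 'done', 'info'])


def preprocess_observation(frame, size=(84, 84)):
    # Convert an RGB Atari frame to a small grayscale array, as in the DQN setup.
    image = Image.fromarray(frame).convert('L').resize(size)
    return np.asarray(image, dtype=np.uint8)


def get_epsilon(config, step):
    # Linearly anneal epsilon from an initial to a final value over
    # 'final_exploration_frame' steps, then hold it constant
    # (these config keys are assumptions).
    start = config['initial_exploration']
    end = config['final_exploration']
    fraction = min(step / config['final_exploration_frame'], 1.0)
    return start + fraction * (end - start)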
Example #2
def main(config, screen_log, frame_output, max_episodes, load_model):
    game_name = 'BreakoutDeterministic-v4'
    env = gym.make(game_name)
    agent = DQNAgent(config)

    with agent.graph.as_default():
        if load_model:
            _ = agent.load_model(load_model)
            screen_log.info("Load model: {}".format(load_model))

        rewards_per_episode = []
        play_images = []

        try:
            for episode in range(max_episodes):
                init_frame = env.reset()
                play_images.append(Image.fromarray(init_frame))

                next_observation = preprocess_observation(init_frame)
                env.render()

                episode_done = False
                rewards_per_episode.append(0)

                while not episode_done:
                    # sleep for the duration of the frame so we can see what happens
                    sleep(1. / 30)

                    observation = next_observation

                    if len(agent.memory) < config['agent_history_length']:
                        # not enough frames yet to build a full state; act randomly
                        action = env.action_space.sample()

                        next_observation, reward, episode_done, info = env.step(
                            action)
                        next_observation = preprocess_observation(
                            next_observation)
                        agent.memory.append(
                            MemoryItem(observation, action, reward,
                                       episode_done, info))

                        continue

                    state = agent.get_recent_state(observation)
                    Qs = agent.get_Q_values(state)
                    Qs = Qs[0]
                    # epsilon-greedy action selection
                    epsilon = config['evaluation_exploration']
                    if np.random.RandomState().rand() < epsilon:
                        action = env.action_space.sample()
                    else:
                        action = agent.get_action_from_Q(Qs)

                    next_observation, reward, episode_done, info = env.step(
                        action)
                    play_images.append(Image.fromarray(next_observation))

                    next_observation = preprocess_observation(next_observation)
                    agent.memory.append(
                        MemoryItem(observation, action, reward, episode_done,
                                   info))

                    rewards_per_episode[-1] += reward

                    env.render()

                screen_log.info(
                    'episode: {}, reward: {:g}, ave. reward: {:g}'.format(
                        episode + 1,
                        rewards_per_episode[-1],
                        np.mean(rewards_per_episode),
                    ))

            # write the collected frames out as an animated GIF
            # (duration is the per-frame display time in milliseconds, ~33 fps)
            play_images[0].save(
                frame_output,
                save_all=True,
                append_images=play_images[1:],
                duration=30,
            )
        except KeyboardInterrupt:
            print("\nUser interrupted playinging...")
        finally:
            env.close()

        screen_log.info(
            'Finished: the best reward {:g}, the ave. reward {:g}.'.format(
                np.max(rewards_per_episode),
                np.mean(rewards_per_episode),
            ))
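
A hypothetical way to run Example #2, assuming the config keys it reads; the logger setup, checkpoint path, and values below are placeholders for illustration, not the project's actual entry point.

import logging

logging.basicConfig(level=logging.INFO)
screen_log = logging.getLogger('screen')

config = {
    'agent_history_length': 4,       # frames stacked into one state
    'evaluation_exploration': 0.05,  # epsilon used while playing
    # ... plus whatever else DQNAgent(config) expects
}

main(config,
     screen_log=screen_log,
     frame_output='breakout_play.gif',  # animated GIF written at the end
     max_episodes=5,
     load_model='checkpoints/model')    # placeholder checkpoint path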