Example #1
def accumulate_experience(teacher,
                          exp_replay: Supervised_ExperienceReplay,
                          config=student_config):
    """
    the teacher feeds the experience replay with new experiences
    :param teacher: teacher net, knows how to solve the problem
    :param exp_replay: the experience replay where the teacher saves its experiences
    :param config: holds custom variables such as OBSERVE
    :return: an experience replay filled with new experiences
    """

    env = gym.make("PongNoFrameskip-v4")
    env = wrap_deepmind(env, frame_stack=True)
    steps = 0
    while True:
        state = env.reset()
        state = np.asarray(state)
        done = False
        while not done:
            steps += 1
            teacher_q_value = teacher.get_q(
                state=np.reshape(state, (1, state.shape[0], state.shape[1],
                                         state.shape[2])))
            action = teacher.select_action(teacher_q_value)
            next_state, reward, done, _ = env.step(action + 1)
            next_state = np.asarray(next_state)
            exp_replay.add_memory(state, teacher_q_value,
                                  action)  # feeding the experience replay
            state = next_state
        if steps > config.OBSERVE:  # we now have at least OBSERVE experiences in exp_replay
            env.close()  # release the environment before returning
            break
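
A minimal sketch of the buffer interface used above, assuming Supervised_ExperienceReplay simply stores (state, teacher_q_value, action) tuples for later supervised training of a student; the actual class and constructor in the source project may differ.

import random
from collections import deque


class MinimalSupervisedReplay:
    """Illustrative stand-in for Supervised_ExperienceReplay (assumed interface)."""

    def __init__(self, size=50000):
        self.memory = deque(maxlen=size)  # oldest experiences are dropped first

    def add_memory(self, state, teacher_q_value, action):
        # one distillation sample: what the teacher saw and what it predicted
        self.memory.append((state, teacher_q_value, action))

    def sample(self, batch_size=32):
        # uniform random mini-batch for supervised training of the student
        return random.sample(self.memory, min(batch_size, len(self.memory)))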
Example #2
def evaluate(agent: DQNAgent, n_epoch=10, render=False):
    """
    evaluate the agent
    :param agent: agent to be evaluated
    :param n_epoch: number of episodes to evaluate; the more episodes, the more accurate the evaluation
    :param render: set to True to visualize the evaluation
    :return: score of the evaluation
    """
    env = gym.make("PongNoFrameskip-v4")
    env = wrap_deepmind(env, frame_stack=True)
    final_score = []
    for e in range(n_epoch):
        state = env.reset()
        state = np.asarray(state)
        done = False
        epoch_reward = 0.0
        while not done:
            if render:
                env.render()
            q_values = agent.get_q(
                state=np.reshape(state, (1, state.shape[0], state.shape[1],
                                         state.shape[2])))
            action = agent.select_action(qValues=q_values, explore=False)
            next_state, reward, done, _ = env.step(action + 1)
            # env expects 1 (up), 2 (stay), 3 (down); action is in [0, 2], so add an offset of 1
            next_state = np.asarray(next_state)
            state = next_state
            epoch_reward += reward
        print("Episode ", e,
              " / {} finished with reward {}".format(n_epoch, epoch_reward))
        final_score.append(epoch_reward)
    final_score = np.mean(final_score)
    env.close()
    return final_score
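
The reshape passed to get_q above only adds a batch dimension of size 1; a quick self-contained check of that equivalence, assuming an 84x84x4 stacked Atari observation:

import numpy as np

state = np.zeros((84, 84, 4), dtype=np.uint8)           # a stacked Atari frame
batched = np.reshape(state, (1, *state.shape))          # what the call above does
assert batched.shape == (1, 84, 84, 4)
assert np.array_equal(batched, np.expand_dims(state, axis=0))  # same result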
Example #3
from agents.DQN_NSTEP import Model
from utils.hyperparameters import Config
from utils.plot import plot, save_plot
from utils.wrappers import wrap_pytorch, make_atari, wrap_deepmind
from timeit import default_timer as timer  # assumed import: timer() is used below

# agent name
agent_name = "DQN"
# load the configuration
config = Config()
# record the start time
start = timer()
# declare the environment (Pong)
env_id = "PongNoFrameskip-v4"
env = make_atari(env_id)
env = wrap_deepmind(env, frame_stack=False)
env = wrap_pytorch(env)
# build the model
model = Model(env=env, config=config)
# reward accumulated over the current episode
episode_reward = 0
# get the initial state of the episode
observation = env.reset()
# max_frames = int(config.MAX_FRAMES / 50)
# maximum number of frames
max_frames = config.MAX_FRAMES
# process_count = int(max_frames / 40)
# progress reporting interval: print once this many frames have elapsed
process_count = int(max_frames / 2000)
# time of the last progress report
process_time = 0
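
A hedged sketch of the frame loop that would typically follow this setup; the Model methods used here (get_action, update, finish_nstep, save_reward) and config.epsilon_by_frame are assumptions about the surrounding project, not confirmed API.

for frame_idx in range(1, max_frames + 1):
    epsilon = config.epsilon_by_frame(frame_idx)      # annealed exploration rate (assumed helper)
    action = model.get_action(observation, epsilon)   # epsilon-greedy action (assumed method)
    prev_observation = observation
    observation, reward, done, _ = env.step(action)
    observation = None if done else observation       # a None next-state marks a terminal transition
    model.update(prev_observation, action, reward, observation, frame_idx)  # n-step update (assumed method)
    episode_reward += reward

    if done:
        model.finish_nstep()                # flush the partial n-step buffer (assumed method)
        model.save_reward(episode_reward)   # record the finished episode's return (assumed method)
        observation = env.reset()
        episode_reward = 0

    if frame_idx % process_count == 0:      # periodic progress report
        print("frame {} / {}, elapsed {:.1f} s".format(
            frame_idx, max_frames, timer() - start))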
Example #4
def fit(logger, agent, target_agent, n_epoch, update=True):
    logger.info("Start :  training agent ")
    env = gym.make("PongNoFrameskip-v4")
    env = wrap_deepmind(env, frame_stack=True)
    if USE_PER:
        exp_replay = MultiStepPrioritizedExperienceReplay(
            size=dense_config.memory_size,
            gamma=agent.gamma,
            alpha=dense_config.ALPHA_PER)
    else:
        exp_replay = ExperienceReplayMultistep(size=dense_config.memory_size,
                                               gamma=agent.gamma)
    degradation = dense_config.steps_per_train / dense_config.EXPLORE
    agent.set_degradation(degradation)
    last_100_epochs_reward = np.zeros(100)
    total_steps = 0
    best_reward = -21.0
    i = 0
    for e in range(n_epoch):
        state = env.reset()
        state = np.asarray(state)
        print("agent epsilon : {}".format(agent.epsilon))
        done = False
        epoch_reward = 0.0
        while not done:
            total_steps += 1
            q_values = agent.get_q(
                state=np.reshape(state, (1, state.shape[0], state.shape[1],
                                         state.shape[2])))
            action = agent.select_action(qValues=q_values)
            next_state, reward, done, _ = env.step(action + 1)
            # env expects 1 (up), 2 (stay), 3 (down); action is in [0, 2], so add an offset of 1
            next_state = np.asarray(next_state)
            exp_replay.add_memory(
                state, action, reward, next_state, done,
                total_steps % dense_config.steps_per_train ==
                0)  # transitions are flushed into memory every steps_per_train steps
            state = next_state
            epoch_reward += reward
            if total_steps % dense_config.steps_per_train == 0:
                agent.lower_epsilon()
            if total_steps < dense_config.OBSERVE:
                continue
            if total_steps % dense_config.steps_per_train == 0:
                train_on_batch(agent, target_agent, exp_replay, e)
            if update and total_steps % dense_config.UPDTATE_FREQ == 0:
                agent.save_model()
                target_agent.sync(agent_path=agent.model_path)
                print("Update target DQN")
        last_100_epochs_reward[e % 100] = epoch_reward
        if e < 100:
            if best_reward < epoch_reward:
                logger.info("Best Reward : episode {} / {}, reward {}".format(
                    e, n_epoch, epoch_reward))
                best_reward = epoch_reward
            print("Episode {} / {} finished with reward {}".format(
                e, n_epoch, epoch_reward))
        else:
            mean_100_reward = sum(last_100_epochs_reward) / 100
            if best_reward < mean_100_reward:
                print(
                    "Best Reward : episode {} to {}, with average reward of {}"
                    .format(e - 100, e, mean_100_reward))
                best_reward = mean_100_reward
            print(
                "Episode {} / {} finished with reward of {} and the last 100 average reward is {}"
                .format(e, n_epoch, epoch_reward, mean_100_reward))
            logger.info(
                "Episode {} / {} finished with reward of {} and the last 100 average reward is {} "
                .format(e, n_epoch, epoch_reward, mean_100_reward))

            if mean_100_reward > 20.0:
                agent.save_model()
                logger.info(
                    "Goal achieved! Episodes {} to {} have an average reward of {}"
                    .format(e - 100, e, mean_100_reward))
                i += 1
                if i >= 5:  # stop once the goal has held for 5 consecutive episodes
                    break
            else:
                i = 0
    env.close()
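
A minimal sketch of the linear epsilon annealing implied by set_degradation and lower_epsilon above, assuming epsilon drops by `degradation` per training step down to a floor; the real agent's implementation may differ.

class EpsilonSchedule:
    def __init__(self, start=1.0, floor=0.01, steps_per_train=4, explore=1000000):
        self.epsilon = start
        self.floor = floor
        # decrement applied once per training step, mirroring
        # degradation = steps_per_train / EXPLORE in fit()
        self.degradation = steps_per_train / explore

    def lower_epsilon(self):
        # linear decay towards the exploration floor
        self.epsilon = max(self.floor, self.epsilon - self.degradation)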
Example #5
def main(argv):

    env = gym.make(config.game_name)
    env = wrap_deepmind(env, config.episode_life, config.preprocess,
                        config.max_and_skip, config.clip_rewards,
                        config.no_op_reset, config.scale)

    num_actions = env.action_space.n

    sess = tf.Session()

    agent = DQNAgent(sess=sess, num_actions=num_actions)

    sess.run(tf.global_variables_initializer())

    rewards = tf.placeholder(dtype=tf.float32, shape=(None), name='reward')

    saver = tf.train.Saver()
    tf.summary.scalar('avg.reward/ep', tf.reduce_mean(rewards))
    tf.summary.scalar('max.reward/ep', tf.reduce_max(rewards))

    writer = tf.summary.FileWriter('logs_12_v4_allwrap_constant_lr',
                                   sess.graph)
    summary_merged = tf.summary.merge_all()

    episode_rewards = []
    batch_loss = []

    replay_buffer = ReplayBuffer()
    time_step = 0
    episode = 0
    total_reward_list = []

    # epsilon and learning-rate schedulers
    e = e_scheduler()
    lr = lr_scheduler()

    while time_step < config.MAX_TIME_STEPS:

        done = False
        total_reward = 0
        '''
        frame --> 84 x 84 x 1
        state --> 84 x 84 x 4
        '''

        frame = env.reset()

        frame_scale = np.array(frame).astype(np.float32) / 255.0

        # there are no past_frames when the very first frame arrives,
        # so initialize them with zero matrices of shape (84, 84)
        past_frames = np.zeros(
            (config.height, config.width, agent.history_length - 1),
            dtype=np.uint8)  # uint8 copy, used for storage
        past_frames_scale = np.zeros(
            (config.height, config.width, agent.history_length - 1),
            dtype=np.float32)  # float32 copy, used for training

        state = agent.process_state_into_stacked_frames(frame,
                                                        past_frames,
                                                        past_state=None)
        state_scale = np.array(state).astype(np.float32) / 255.0

        while not done:

            if np.random.rand() < e.get(
            ) or time_step < config.REPLAY_START_SIZE:
                action = env.action_space.sample()
            else:
                action = agent.predict_action(state_scale)
            time_step += 1

            frame_after, reward, done, info = env.step(action)

            frame_after_scale = np.array(frame_after).astype(
                np.float32) / 255.0

            replay_buffer.add_experience(state, action, reward, done)

            if not done:  # the episode ends at a score of +21 or -21

                # append the newly received frame to the previous state
                state_after = agent.process_state_into_stacked_frames(
                    frame_after, past_frames, past_state=state)

                state_after_scale = np.array(state_after).astype(
                    np.float32) / 255.0

                past_frames = np.concatenate((past_frames, frame_after),
                                             axis=2)
                past_frames = past_frames[:, :, 1:]

                past_frames_scale = np.array(past_frames).astype(
                    np.float32) / 255.0

                #print(past_frames.shape)
                state = state_after
                state_scale = state_after_scale

            total_reward += reward

            #training
            if time_step > config.REPLAY_START_SIZE and time_step % config.LEARNING_FREQ == 0:
                e.update(time_step)
                lr.update(time_step)

                b_state, b_action, b_reward, b_state_after, b_done = replay_buffer.sample_batch(
                    config.BATCH_SIZE)

                Q_of_state_after = agent.sess.run(
                    agent.target_Q,
                    feed_dict={agent.target_state: b_state_after})

                target_Q_p = []
                for i in range(config.BATCH_SIZE):
                    if b_done[i]:
                        target_Q_p.append(b_reward[i])
                    else:
                        target_Q_p.append(b_reward[i] +
                                          config.DISCOUNT_FACTOR *
                                          np.max(Q_of_state_after[i]))

                agent.sess.run(
                    [agent.train_step, agent.Q, agent.loss], {
                        agent.target_Q_p: target_Q_p,
                        agent.action: b_action,
                        agent.state: b_state,
                        agent.lr: lr.get()
                    })

            if time_step % config.target_UPDATE_FREQ == 0:
                agent.sess.run(agent.update_fn)

            if time_step % config.REWARD_RECORD_FREQ == 0 and len(
                    total_reward_list) != 0:
                summary = sess.run(summary_merged,
                                   feed_dict={rewards: total_reward_list})
                writer.add_summary(summary, time_step)
                total_reward_list = []

            if time_step % config.MODEL_RECORD_FREQ == 0:
                saver.save(sess,
                           'model_12_v4_allwrap_constant_lr/dqn.ckpt',
                           global_step=time_step)

        # bookkeeping, not related to training
        episode += 1
        # For debugging
        if episode % 100 == 0:
            print('episode : %d score: %d' % (episode, total_reward))

        total_reward_list.append(total_reward)
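
The per-sample loop that builds target_Q_p above computes the standard one-step Q-learning target: y_i = r_i for terminal transitions, and y_i = r_i + gamma * max_a Q_target(s'_i, a) otherwise. A vectorized NumPy equivalent, with gamma standing in for config.DISCOUNT_FACTOR:

import numpy as np


def q_learning_targets(rewards, q_next, dones, gamma=0.99):
    """Vectorized one-step TD targets; equivalent to the per-sample loop above."""
    rewards = np.asarray(rewards, dtype=np.float32)
    dones = np.asarray(dones, dtype=np.float32)           # 1.0 for terminal transitions
    max_q_next = np.asarray(q_next, dtype=np.float32).max(axis=1)
    return rewards + gamma * max_q_next * (1.0 - dones)   # bootstrap only if not done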