Example #1
def main():
    env = gym.make("PongNoFrameskip-v4")
    env = ScaledFloatFrame(wrap_dqn(env))
    model = distdeepq.models.cnn_to_dist_mlp(convs=[(32, 8, 4), (64, 4, 2),
                                                    (64, 3, 1)],
                                             hiddens=[256],
                                             dueling=False)
    act = distdeepq.learn(
        env,
        p_dist_func=model,
        lr=1e-4,
        max_timesteps=2000000,
        # max_timesteps=100000,
        buffer_size=10000,
        exploration_fraction=0.1,
        exploration_final_eps=0.01,
        train_freq=4,
        learning_starts=10000,
        target_network_update_freq=1000,
        gamma=0.99,
        prioritized_replay=False,
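        # C51-style distributional RL support: a categorical distribution over
        # nb_atoms atoms spaced evenly across [Vmin, Vmax].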
        dist_params={
            'Vmin': -10,
            'Vmax': 10,
            'nb_atoms': 51
        })
    act.save("pong_model.pkl")
    env.close()
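main() above is only the training entry point. A minimal sketch of the module-level boilerplate it assumes (the exact import path of distdeepq is an assumption, inferred from how it is called; the wrapper import matches Example #8):

import gym

import distdeepq
from baselines.common.atari_wrappers_deprecated import wrap_dqn, ScaledFloatFrame


if __name__ == '__main__':
    main()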
Example #2
def main():
    env = gym.make("PongNoFrameskip-v4")
    env = ScaledFloatFrame(wrap_dqn(env))
    act = deepq.load("pong_model.pkl")

    while True:
        obs, done = env.reset(), False
        episode_rew = 0
        while not done:
            env.render()
            obs, rew, done, _ = env.step(act(obs[None])[0])
            episode_rew += rew
        print("Episode reward", episode_rew)
Example #3
def main():
    env = gym.make("PongNoFrameskip-v4")
    env = ScaledFloatFrame(wrap_dqn(env))
    model = deepq.models.cnn_to_mlp(convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)],
                                    hiddens=[256],
                                    dueling=True)
    act = simple.learn(env,
                       q_func=model,
                       lr=1e-4,
                       max_timesteps=200000,
                       buffer_size=10000,
                       exploration_fraction=0.1,
                       exploration_final_eps=0.01,
                       train_freq=4,
                       learning_starts=10000,
                       target_network_update_freq=1000,
                       gamma=0.99,
                       prioritized_replay=True,
                       tf_log_dir='./log')
    act.save("pong_model.pkl")
    env.close()
Example #4
def main():
    env = gym.make("PongNoFrameskip-v4")
    env = ScaledFloatFrame(wrap_dqn(env))
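    # cnn_to_mlp: convs is a list of (num_filters, kernel_size, stride) tuples,
    # hiddens lists the sizes of the fully connected layers on top of the conv
    # stack, and dueling=True builds the dueling Q-network head.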
    model = deepq.models.cnn_to_mlp(
        convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)],
        hiddens=[256],
        dueling=True
    )
    act = deepq.learn(
        env,
        q_func=model,
        lr=1e-4,
        max_timesteps=2000000,
        buffer_size=10000,
        exploration_fraction=0.1,
        exploration_final_eps=0.01,
        train_freq=4,
        learning_starts=10000,
        target_network_update_freq=1000,
        gamma=0.99,
        prioritized_replay=True
    )
    act.save("pong_model.pkl")
    env.close()
Example #5
def main():
    env = gym.make("PongNoFrameskip-v4")
    env = ScaledFloatFrame(wrap_dqn(env))
    act = DeepqWithGaze.load("pong_model.pkl")

    while True:
        obs, done = env.reset(), False
        episode_rew = 0
        while not done:
            env.render()
            obs, rew, done, _ = env.step(act(obs[None])[0])
            episode_rew += rew
        print("Episode reward", episode_rew)
Example #6
def main():
    env = gym.make("BreakoutNoFrameskip-v4")
    env = ScaledFloatFrame(wrap_dqn(env))
    agent = DQN(env)
    agent.update_target()
    episodes_rewards = [0] * 100
    avg_rewards = []
    skip_rewards = []
    step_num = 0
    for episode in range(EPISODE):
        goal = 0
        img_buf = deque()
        state = env.reset()
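        # Roll out one episode: pick an epsilon-greedy action, hand the
        # transition to agent.perceive(), and accumulate the episode reward
        # in `goal`.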
        while True:
            action = agent.egreedy_action(state)
            next_state, reward, done, _ = env.step(action)
            # env.render()
            # time.sleep(0.01)
            agent.perceive(state, action, reward, next_state, done, step_num)
            goal += reward
            step_num += 1
            state = next_state
            if done:
                episodes_rewards.pop(0)
                episodes_rewards.append(goal)
                break
                # print "Current reward:", goal," Step number:", step_num
        print("Episode: ", episode, " Last 100 episode average reward: ", np.average(episodes_rewards), " Toal step number: ", step_num, " eps: ", agent.epsilon)

        if step_num > 2000000:
            break

        if episode % 50 == 0:
            skip_rewards.append(goal)

        if episode % 100 == 0:
            avg_rewards.append(np.average(episodes_rewards))
            out_file = open("avg_rewards.pkl",'wb')
            out_file1 = open("skip_rewards.pkl",'wb')
            pickle.dump(avg_rewards, out_file)
            pickle.dump(skip_rewards, out_file1)
            out_file.close()
            out_file1.close()
            agent.saver.save(agent.session, 'saved_networks/' + 'network' + '-dqn', global_step=episode)

    env.close()
Example #7
def play():
    env = gym.make("BreakoutNoFrameskip-v4")
    env = ScaledFloatFrame(wrap_dqn(env))
    agent = DQN(env)
    for episode in range(TEST):
        goal = 0
        step_num = 0
        state = env.reset()
        while True:
            action = agent.action(state)
            next_state, reward, done, _ = env.step(action)
            step_num += 1
            env.render()
            time.sleep(0.01)
            goal += reward
            state = next_state
            if done or step_num > MAX_STEP_PER_EPISODE:
                print("Episode: ", episode, " Total reward: ", goal)
                break
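Examples #6 and #7 rely on a project-specific DQN agent class. A hedged sketch of the interface those calls imply (method and attribute names come from the code above; everything else, including the docstring, is an assumption) is:

class DQN:
    """Hypothetical agent interface; the real class lives in the source project."""
    def __init__(self, env): ...
    def update_target(self): ...            # sync the target network with the online network
    def egreedy_action(self, state): ...    # epsilon-greedy action used during training
    def action(self, state): ...            # greedy action used during evaluation
    def perceive(self, state, action, reward, next_state, done, step_num): ...  # store (and learn from) a transition
    # attributes referenced above: epsilon, saver, session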
Example #8
#import sys
#sys.path.append("../gym")
#import gym

import gym

from baselines import deepq
from baselines.common.atari_wrappers_deprecated import wrap_dqn, ScaledFloatFrame


# Create the Baseline game environment

# TRAIN
env = gym.make("PongNoFrameskip-v4")
env = ScaledFloatFrame(wrap_dqn(env))
model = deepq.models.cnn_to_mlp(
    convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)],
    hiddens=[256],
    dueling=True
)
act = deepq.learn(
    env,
    q_func=model,
    lr=1e-4,
    max_timesteps=2000000,
    buffer_size=10000,
    exploration_fraction=0.1,
    exploration_final_eps=0.01,
    train_freq=4,
    learning_starts=10000,
    target_network_update_freq=1000,
    gamma=0.99,
    prioritized_replay=True
)
act.save("pong_model.pkl")
env.close()
Example #9
def train():
    from linear_schedule import Linear

    ledger = defaultdict(lambda: MovingAverage(Reporting.reward_average))

    M.config(file=os.path.join(RUN.log_directory, RUN.log_file))
    M.diff()

    with U.make_session(
            RUN.num_cpu), Logger(RUN.log_directory) as logger, contextify(
                gym.make(G.env_name)) as env:
        env = ScaledFloatFrame(wrap_dqn(env))

        if G.seed is not None:
            env.seed(G.seed)
        logger.log_params(G=vars(G), RUN=vars(RUN), Reporting=vars(Reporting))
        inputs = TrainInputs(action_space=env.action_space,
                             observation_space=env.observation_space)
        trainer = QTrainer(inputs=inputs,
                           action_space=env.action_space,
                           observation_space=env.observation_space)
        if G.prioritized_replay:
            replay_buffer = PrioritizedReplayBuffer(size=G.buffer_size,
                                                    alpha=G.alpha)
        else:
            replay_buffer = ReplayBuffer(size=G.buffer_size)

        class schedules:
            # note: it is important to have this start from the beginning.
            eps = Linear(G.n_timesteps * G.exploration_fraction, 1,
                         G.final_eps)
            if G.prioritized_replay:
                beta = Linear(G.n_timesteps - G.learning_start, G.beta_start,
                              G.beta_end)

        U.initialize()
        trainer.update_target()
        x = np.array(env.reset())
        ep_ind = 0
        M.tic('episode')
        for t_step in range(G.n_timesteps):
            # schedules
            eps = 0 if G.param_noise else schedules.eps[t_step]
            if G.prioritized_replay:
                beta = schedules.beta[t_step - G.learning_start]

            x0 = x
            M.tic('sample', silent=True)
            (action, *_), action_q, q = trainer.runner.act([x], eps)
            x, rew, done, info = env.step(action)
            ledger['action_q_value'].append(action_q.max())
            ledger['action_q_value/mean'].append(action_q.mean())
            ledger['action_q_value/var'].append(action_q.var())
            ledger['q_value'].append(q.max())
            ledger['q_value/mean'].append(q.mean())
            ledger['q_value/var'].append(q.var())
            ledger['timing/sample'].append(M.toc('sample', silent=True))
            # note: adding sample to the buffer is identical between the prioritized and the standard replay strategy.
            replay_buffer.add(s0=x0,
                              action=action,
                              reward=rew,
                              s1=x,
                              done=float(done))

            logger.log(
                t_step, {
                    'q_value': ledger['q_value'].latest,
                    'q_value/mean': ledger['q_value/mean'].latest,
                    'q_value/var': ledger['q_value/var'].latest,
                    'q_value/action': ledger['action_q_value'].latest,
                    'q_value/action/mean':
                    ledger['action_q_value/mean'].latest,
                    'q_value/action/var': ledger['action_q_value/var'].latest
                },
                action=action,
                eps=eps,
                silent=True)

            if G.prioritized_replay:
                logger.log(t_step, beta=beta, silent=True)

            if done:
                ledger['timing/episode'].append(M.split('episode',
                                                        silent=True))
                ep_ind += 1
                x = np.array(env.reset())
                ledger['rewards'].append(info['total_reward'])

                silent = (ep_ind % Reporting.print_interval != 0)
                logger.log(t_step,
                           timestep=t_step,
                           episode=green(ep_ind),
                           total_reward=ledger['rewards'].latest,
                           episode_length=info['timesteps'],
                           silent=silent)
                logger.log(t_step, {
                    'total_reward/mean':
                    yellow(ledger['rewards'].mean, lambda v: f"{v:.1f}"),
                    'total_reward/max':
                    yellow(ledger['rewards'].max, lambda v: f"{v:.1f}"),
                    "time_spent_exploring":
                    default(eps, percent),
                    "timing/episode":
                    green(ledger['timing/episode'].latest, sec),
                    "timing/episode/mean":
                    green(ledger['timing/episode'].mean, sec),
                },
                           silent=silent)
                try:
                    logger.log(t_step, {
                        "timing/sample":
                        default(ledger['timing/sample'].latest, sec),
                        "timing/sample/mean":
                        default(ledger['timing/sample'].mean, sec),
                        "timing/train":
                        default(ledger['timing/train'].latest, sec),
                        "timing/train/mean":
                        green(ledger['timing/train'].mean, sec),
                        "timing/log_histogram":
                        default(ledger['timing/log_histogram'].latest, sec),
                        "timing/log_histogram/mean":
                        default(ledger['timing/log_histogram'].mean, sec)
                    },
                               silent=silent)
                    if G.prioritized_replay:
                        logger.log(t_step, {
                            "timing/update_priorities":
                            default(ledger['timing/update_priorities'].latest,
                                    sec),
                            "timing/update_priorities/mean":
                            default(ledger['timing/update_priorities'].mean,
                                    sec)
                        },
                                   silent=silent)
                except Exception as e:
                    pass
                if G.prioritized_replay:
                    logger.log(
                        t_step,
                        {"replay_beta": default(beta, lambda v: f"{v:.2f}")},
                        silent=silent)

            # note: learn here.
            if t_step >= G.learning_start and t_step % G.learn_interval == 0:
                if G.prioritized_replay:
                    experiences, weights, indices = replay_buffer.sample(
                        G.replay_batch_size, beta)
                    logger.log_histogram(t_step, weights=weights)
                else:
                    experiences, weights = replay_buffer.sample(
                        G.replay_batch_size), None
                M.tic('train', silent=True)
                x0s, actions, rewards, x1s, dones = zip(*experiences)
                td_error_val, loss_val = trainer.train(s0s=x0s,
                                                       actions=actions,
                                                       rewards=rewards,
                                                       s1s=x1s,
                                                       dones=dones,
                                                       sample_weights=weights)
                ledger['timing/train'].append(M.toc('train', silent=True))
                M.tic('log_histogram', silent=True)
                logger.log_histogram(t_step, td_error=td_error_val)
                ledger['timing/log_histogram'].append(
                    M.toc('log_histogram', silent=True))
                if G.prioritized_replay:
                    M.tic('update_priorities', silent=True)
                    new_priorities = np.abs(td_error_val) + eps
                    replay_buffer.update_priorities(indices, new_priorities)
                    ledger['timing/update_priorities'].append(
                        M.toc('update_priorities', silent=True))

            if t_step % G.target_network_update_interval == 0:
                trainer.update_target()

            if t_step % Reporting.checkpoint_interval == 0:
                U.save_state(os.path.join(RUN.log_directory, RUN.checkpoint))
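The train() routine above pulls its settings from project-specific G, RUN, and Reporting namespaces. A hedged sketch of the fields G must at least provide, inferred solely from the attribute accesses in the code (the values below are illustrative, mostly mirrored from the earlier deepq.learn examples, not the project's actual defaults):

class G:
    env_name = "PongNoFrameskip-v4"
    seed = None
    param_noise = False
    prioritized_replay = True
    buffer_size = 10000
    alpha = 0.6                  # priority exponent for PrioritizedReplayBuffer
    n_timesteps = 2000000
    exploration_fraction = 0.1
    final_eps = 0.01
    learning_start = 10000
    learn_interval = 4
    replay_batch_size = 32
    target_network_update_interval = 1000
    beta_start = 0.4             # importance-sampling exponent schedule
    beta_end = 1.0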