Example #1
File: run.py  Project: uusama/RL
import copy

import numpy as np


def run(env,
        batch_size,
        agent,
        memory,
        discount,
        steps=300,
        episode_i=0,
        eps=.9,
        render=False,
        normalize=False):
    state = env.reset()
    done = False
    acc_reward = 0.0
    loss = 0.0
    for i in range(steps):
        if done:
            break
        # epsilon should decay over time, so pass the eps argument through
        # instead of hard-coding 0.9
        action = agent.move(state, eps=eps)
        # print("state:",state.shape,state)
        if normalize:
            # featurize_state is assumed to be defined elsewhere in run.py
            state = featurize_state(state)

        next_state, reward, done, _ = env.step(action)
        acc_reward += reward
        memory.add((state, action, next_state, reward, done))
        if render:
            env.render()

        if len(memory.memory) > batch_size:
            state_m, action_m, next_state_m, reward_m, done_m = zip(
                *memory.sample(batch_size))
            state_m = np.array(state_m)
            action_m = np.array(action_m)
            next_state_m = np.array(next_state_m)
            reward_m = np.array(reward_m)
            done_m = np.array(done_m)

            q_m = agent.predict(next_state_m)

            actual_target_m = reward_m + (1. - done_m) * discount * np.amax(
                q_m, axis=1)

            targets = agent.predict(state_m)

            # assign the computed target to the action that was taken
            # (use j so the step counter i is not shadowed)
            for j, action in enumerate(action_m):
                targets[j, action] = actual_target_m[j]
            loss = agent.train(states=state_m, targets=targets)

        # advance to the next state on every step, not only after a training update
        state = copy.copy(next_state)

    # print("acc_reward:", acc_reward)
    return acc_reward, i, loss
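For context, here is a minimal sketch of how this run() helper might be driven from an outer training loop, with epsilon decayed between episodes as the comment suggests. The decay schedule and the pre-built env, agent and memory objects are assumptions, not part of the original project.

# Hypothetical driver loop: decay epsilon between episodes and pass it in.
eps = 1.0
for episode in range(500):
    acc_reward, steps_taken, loss = run(env, batch_size=32, agent=agent,
                                        memory=memory, discount=0.99,
                                        episode_i=episode, eps=eps)
    eps = max(0.05, eps * 0.995)  # simple exponential decay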
Example #2
parser.add_argument('--model')
# number of training episodes
parser.add_argument('--eps')

parser.add_argument('--render')

args = parser.parse_args()


writer = None  # only created when TensorBoard logging is requested
if args.tensorboard:
    writer = SummaryWriter()

    write_proc = subprocess.Popen(['tensorboard', '--logdir', args.tensorboard])

env = env.Environment(args.env)

if args.alg == 'DQN':
    agent = agent.DQNAgent(env, args.mode, args.model, writer)

try:
    if args.mode == 'train':
        agent.train(int(args.eps), args.render)
    elif args.mode == 'play':
        agent.play(int(args.eps))
except KeyboardInterrupt:
    print('PROCESS KILLED BY USER')
finally:
    env.close()
    if args.tensorboard:
        write_proc.terminate()
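This fragment starts after the parser has already been created. Below is a sketch of the preamble it appears to assume, inferred only from the args.* attributes referenced above; the SummaryWriter import and the help comments are guesses, not taken from the original project.

import argparse
import subprocess

from torch.utils.tensorboard import SummaryWriter  # assumption: could also be tensorboardX

parser = argparse.ArgumentParser()
parser.add_argument('--env')          # Gym environment id
parser.add_argument('--alg')          # e.g. 'DQN'
parser.add_argument('--mode')         # 'train' or 'play'
parser.add_argument('--tensorboard')  # log directory; enables TensorBoard when set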
Example #3
import agent
import environment
import replay

env = environment.Environment('Breakout-v0')
replay = replay.ExperienceReplay(env)
agent = agent.Agent(env, replay)
# agent.restore()
agent.train()
Example #4
def run(
    agent_type="dqn",
    hidden_layer_size=32,
    gamma=1.0,
    min_epsilon=0.001,
    learning_rate=2.5e-4,
    env_name="CartPole-v0",
    num_episodes=3000,
    log_interval=100,
    replay_buffer_capacity=10**5,
    use_prioritized_experience_buffer=False,
    max_steps_per_episode=10000,
    batch_size=32,
    use_soft_update=False,
    online_update_period=1,
    target_update_tau=1,
    target_sync_period=100,
):
    env = gym.make(env_name)

    cfg = {
        "type": agent_type,
        "network": {
            "type": "dense",
            "hidden_layers": (hidden_layer_size, hidden_layer_size),
        },
        "gamma": gamma,
        "min_epsilon": min_epsilon
    }
    agent = DQN(
        cfg, 
        env.observation_space.shape, 
        env.action_space.n,
        optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
        loss_function=tf.keras.losses.MeanSquaredError(),
    )

    if use_prioritized_experience_buffer:
        buffer = PrioritizedReplayBuffer(
            size=replay_buffer_capacity, 
            alpha=0.6, 
            anneal_alpha_rate=1e-5, 
            anneal_beta_rate=1e-5
        )
    else:
        buffer = UniformReplayBuffer(size=replay_buffer_capacity)

    observer = [
        AverageObserver(log_interval), 
        MaximumObserver(log_interval)
    ]

    train(
        env, agent, buffer,
        num_episodes=num_episodes, 
        max_steps_per_episode=max_steps_per_episode,
        batch_size=batch_size,
        online_update_period=online_update_period,
        target_sync_period=target_sync_period,
        log_interval=log_interval,
        use_soft_update=use_soft_update,
        target_update_tau=target_update_tau,
        observer=observer
    )
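A hypothetical invocation of this entry point, overriding a few of the defaults; the values are illustrative only.

# Train a CartPole DQN with a prioritized buffer and soft target updates.
run(
    num_episodes=1000,
    use_prioritized_experience_buffer=True,
    use_soft_update=True,
    target_update_tau=0.01,
)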
Example #5
def main(args):
    if args.seed is not None:
        print("Setting random seed: %d" % args.seed)
        np.random.seed(args.seed)
        tf.random.set_seed(args.seed)

    job_dir = args.job_dir if args.job_dir.startswith('gs') else os.path.join(
        args.job_dir,
        datetime.now().strftime('%Y%m%d%H%M%S'))
    if not tf.io.gfile.exists(job_dir):
        tf.io.gfile.makedirs(job_dir)
    print('Job dir: %s' % job_dir)

    board = Board()
    agent = Agent(board.size,
                  hidden_size=args.agent_net_size,
                  num_conv=args.agent_net_conv)

    lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
        args.lr,
        int((args.epoch_games * 60 * args.lr_decay_epochs) / args.batch_size),
        args.lr_decay)

    optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule)

    metrics_step = tf.Variable(1, dtype=tf.int64)
    checkpoint = tf.train.Checkpoint(step=tf.Variable(1, dtype=tf.int64),
                                     optimizer=optimizer,
                                     net=agent)
    checkpoint_manager = tf.train.CheckpointManager(checkpoint,
                                                    os.path.join(
                                                        job_dir,
                                                        'checkpoints'),
                                                    max_to_keep=None)
    if args.contest_to_update:
        temp_checkpoint_manager = tf.train.CheckpointManager(
            checkpoint,
            os.path.join(job_dir, 'temp_checkpoint'),
            max_to_keep=1)
    metrics_writer = tf.summary.create_file_writer(
        os.path.join(job_dir, 'metrics'))

    try:
        ray.init(num_cpus=args.num_cpus)
        checkpoint_manager.save()

        with metrics_writer.as_default():
            for e in range(args.epochs):
                if args.contest_to_update:
                    # Restore the last accepted agent parameters
                    checkpoint.restore(checkpoint_manager.latest_checkpoint)
                # Benchmark
                if e % 5 == 0:
                    t = time.time()
                    for name, (wins, losses) in benchmark_agent(
                            checkpoint_manager.latest_checkpoint, board.size,
                            args).items():
                        tf.summary.scalar('benchmark/%s/wins' % name,
                                          wins / args.benchmark_games,
                                          step=metrics_step)
                        tf.summary.scalar('benchmark/%s/losses' % name,
                                          losses / args.benchmark_games,
                                          step=metrics_step)
                        tf.summary.scalar(
                            'benchmark/%s/draws' % name,
                            (args.benchmark_games - wins - losses) /
                            args.benchmark_games,
                            step=metrics_step)

                    ttrb = float(time.time() - t)
                    tf.summary.scalar('perf/time_to_run_benchmarks',
                                      ttrb,
                                      step=metrics_step)
                    print('Time to run benchmarks: %.4f' % ttrb)

                # Collect epoch samples
                print('Epoch: %d' % e)
                t = time.time()
                samples, stats = collect_samples(
                    checkpoint_manager.checkpoints, board.size, args)
                ttcs = float(time.time() - t)
                for key, val in stats.items():
                    tf.summary.scalar('game_metrics/%s' % key,
                                      val,
                                      step=metrics_step)
                tf.summary.scalar('perf/time_to_collect_samples',
                                  ttcs,
                                  step=metrics_step)
                print('Time to collect samples: %.4f' % ttcs)

                for (states, action_probabilities, action_indices,
                     state_values, rewards) in batches(samples,
                                                       args.batch_size):
                    if np.any(np.isnan(action_probabilities)):
                        raise ValueError('NaN Action P')

                    loss = train(
                        agent,
                        optimizer,
                        tf.convert_to_tensor(states, dtype=tf.float32),
                        tf.convert_to_tensor(action_probabilities,
                                             dtype=tf.float32),
                        tf.convert_to_tensor(action_indices, dtype=tf.int32),
                        tf.convert_to_tensor(state_values, dtype=tf.float32),
                        tf.convert_to_tensor(rewards, dtype=tf.float32),
                    )
                    tf.summary.scalar('train/loss', loss, step=metrics_step)
                    tf.summary.scalar(
                        'train/mean_advantage',
                        tf.reduce_mean(
                            tf.convert_to_tensor(rewards, dtype=tf.float32) -
                            tf.convert_to_tensor(state_values,
                                                 dtype=tf.float32)),
                        step=metrics_step)
                    metrics_step.assign_add(1)
                    checkpoint.step.assign_add(1)

                if args.contest_to_update:
                    # Update parameters only if the new agent beats the old one.
                    temp_checkpoint_manager.save()
                    t = time.time()
                    new_wins, old_wins = compare_agents(
                        checkpoint_manager.latest_checkpoint,
                        temp_checkpoint_manager.latest_checkpoint, args)
                    tf.summary.scalar('perf/time_to_compare_agents',
                                      float(time.time() - t),
                                      step=metrics_step)
                    tf.summary.scalar('train/new_agent_win_rate',
                                      new_wins / (new_wins + old_wins),
                                      step=metrics_step)
                    if ((new_wins + old_wins) > 0) and (
                            new_wins /
                        (new_wins + old_wins) >= args.win_rate_threshold):
                        checkpoint_manager.save()
                else:
                    checkpoint_manager.save()

    finally:
        ray.shutdown()
Example #6
import agent
from environment import GymEnvironment
import tensorflow as tf

env_agent = GymEnvironment()
agent = agent.DQNAgent(environment=env_agent)

with tf.Session() as sess:
    agent.build_dqn(sess)
    sess.run(tf.global_variables_initializer())

    agent.train(episodes=50000)
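This example uses the TensorFlow 1.x session API (tf.Session, tf.global_variables_initializer). Under TensorFlow 2.x those symbols are only reachable through the compatibility layer, for example:

import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()  # restore graph-mode sessions so the snippet above runs unchanged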
Example #7
import gym
import model
import agent

env = gym.make('CartPole-v0')
model = model.Model(num_actions=env.action_space.n)

obs = env.reset()

# instantiate the agent
agent = agent.Agent(model)

#train agent
rewards_history = agent.train(env)
print("Finished training, testing...")
#test fully trained agent
print("%d out of 200" % agent.test(env))  # score out of 200
Example #8
    env = AtariWrapper(env, **config["env"]["wrapper"])

agent = DQN(
    config["agent"],
    env.observation_space.shape,
    env.action_space.n,
)

if config["buffer"]["use_per"]:
    buffer = PrioritizedReplayBuffer(
        size=config["buffer"]["size"],
        alpha=config["buffer"]["alpha"],
        beta=config["buffer"]["beta"],
        anneal_alpha_rate=config["buffer"]["anneal_alpha_rate"],
        anneal_beta_rate=config["buffer"]["anneal_beta_rate"]
    )
else:
    buffer = UniformReplayBuffer(config["buffer"]["size"])

observer = []
if config["train"]["display_average_reward"]:
    observer.append(AverageObserver(config["train"]["log_interval"]))
if config["train"]["display_max_reward"]:
    observer.append(MaximumObserver(config["train"]["log_interval"]))

c = config["train"]
c.update(config["misc"])
c["observer"] = observer
history = train(env, agent, buffer, **c)
logging.info(history)
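A minimal sketch of a config dict exposing the keys this fragment reads; all values are placeholders, not taken from the original project.

config = {
    "env": {"wrapper": {}},  # kwargs forwarded to AtariWrapper
    "agent": {"type": "dqn", "gamma": 0.99, "min_epsilon": 0.1},
    "buffer": {
        "use_per": False,
        "size": 10**5,
        # the remaining buffer keys are only read when use_per is True
        "alpha": 0.6,
        "beta": 0.4,
        "anneal_alpha_rate": 1e-5,
        "anneal_beta_rate": 1e-5,
    },
    "train": {
        "display_average_reward": True,
        "display_max_reward": True,
        "log_interval": 100,
        # any further keys here are forwarded to train(...) as keyword arguments
    },
    "misc": {},
}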
Example #9
        print("result", result)


if __name__ == '__main__':
    train(
        Checkers,
        'checkers-4',
        model_width=64,
        #alpha_steps={0: 0.1, 5: 0.01, 10: 0.001, 20: 0.0001},
        alpha_steps={0: 0.001},
        #discount_steps={0:0.7, 30: 0.9, 60: 0.99},
        discount_steps={0: 0.99},
        epsilon_steps={
            0: 1,
            5: 0.7,
            20: 0.5,
            50: 0.3
        },
        #epsilon_steps={0: 1},
        num_models=2,
        epoch_size=1000,
        num_epochs=1000,
        sample_size=1000,
        num_samples=100,
        play_at_end=False,
        saveDir='/home/sam/scratch/tflow',
        loadModels=False)
    """
    train(TicTacToe, 'tictactoe-small',
            model_width=64,
            alpha_steps={0: 0.01, 10: 0.001, 30: 0.0001},
Example #10
def run(
    agent_type="dqn",
    gamma=1.0,
    min_epsilon=0.1,
    learning_rate=2.5e-4,
    env_name="MsPacman-v0",
    use_wrapper=True,
    num_episodes=1000,
    log_interval=100,
    replay_buffer_capacity=10**5,
    use_prioritized_experience_buffer=False,
    max_steps_per_episode=10000,
    batch_size=32,
    use_soft_update=False,
    online_update_period=1,
    target_update_tau=1,
    target_sync_period=100,
    decay_rate=1e-5,
    num_saves=0,
    saved_model_dir=None,
    warm_up=10000
):
    env = gym.make(env_name)
    if use_wrapper:
        # convert (210, 160, 3) to (84, 84, 1)
        env = AtariWrapper(env)

    cfg = {
        "type": agent_type,
        "network": {
            "type": "conv2d",
            "structure": None,
        },
        "gamma": gamma,
        "min_epsilon": min_epsilon
    }
    agent = DQN(
        cfg, 
        env.observation_space.shape, 
        env.action_space.n,
        optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
        loss_function=tf.keras.losses.MeanSquaredError(),
    )

    if use_prioritized_experience_buffer:
        buffer = PrioritizedReplayBuffer(
            size=replay_buffer_capacity, 
            alpha=0.6, 
            anneal_alpha_rate=1e-5, 
            anneal_beta_rate=1e-5
        )
    else:
        buffer = UniformReplayBuffer(size=replay_buffer_capacity)

    observer = [
        AverageObserver(log_interval), 
        MaximumObserver(log_interval)
    ]

    train(
        env, agent, buffer,
        num_episodes=num_episodes, 
        max_steps_per_episode=max_steps_per_episode,
        batch_size=batch_size,
        online_update_period=online_update_period,
        target_sync_period=target_sync_period,
        log_interval=log_interval,
        use_soft_update=use_soft_update,
        target_update_tau=target_update_tau,
        observer=observer,
        decay_rate=decay_rate,
        num_saves=num_saves,
        saved_model_dir=saved_model_dir,
        warm_up=warm_up
    )
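A hypothetical call to this Atari variant with periodic model saving enabled; the directory and counts are illustrative only.

run(
    env_name="MsPacman-v0",
    num_episodes=2000,
    num_saves=4,                      # forwarded to train() together with saved_model_dir
    saved_model_dir="./checkpoints",
)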