def init():
    global RENDER, S, R, S_, env, state_dim, action_dim, action_bound, actor, critic
    env = Env(np.ones([39]) * 1e-5)

    state_dim = env.observation_space()
    action_dim = env.action_space()
    action_bound = env.action_bound()

    with tf.name_scope('S'):
        S = tf.placeholder(tf.float32, shape=[None, state_dim], name='s')
    with tf.name_scope('R'):
        R = tf.placeholder(tf.float32, [None, 1], name='r')
    with tf.name_scope('S_'):
        S_ = tf.placeholder(tf.float32, shape=[None, state_dim], name='s_')

    sess = tf.Session()

    actor = Actor(sess, action_dim, action_bound, LR_A, REPLACE_ITER_A)
    critic = Critic(sess, state_dim, action_dim, LR_C, GAMMA, REPLACE_ITER_C,
                    actor.a, actor.a_)
    actor.add_grad_to_graph(critic.a_grads)

    sess.run(tf.global_variables_initializer())
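
The Env class used above is not shown on this page; init() only relies on its constructor accepting a length-39 array plus three introspection methods. A minimal stand-in with that interface might look like the sketch below (StubEnv and its return values are illustrative placeholders, not the project's actual environment):

import numpy as np

class StubEnv:
    """Illustrative placeholder exposing the interface init() expects."""
    def __init__(self, init_state):
        self._state = np.asarray(init_state, dtype=np.float32)

    def observation_space(self):
        return self._state.shape[0]  # state dimensionality (39 in init() above)

    def action_space(self):
        return 4  # arbitrary action dimensionality for the sketch

    def action_bound(self):
        return 1.0  # actions assumed to lie in [-1, 1]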
Example No. 2
def main():
    global args
    args = parse_args()
    # global logger
    logger = logging.getLogger(__name__)
    logger.setLevel(logging.DEBUG)
    formatter = logging.Formatter("[%(asctime)s] %(levelname)s:%(name)s:%(message)s")
    # file logger (ensure the save directory exists before the log file is opened)
    os.makedirs(args.save, exist_ok=True)
    fh = logging.FileHandler(os.path.join(args.save, args.expname) + '.log', mode='w')
    fh.setLevel(logging.INFO)
    fh.setFormatter(formatter)
    logger.addHandler(fh)
    # console logger
    ch = logging.StreamHandler()
    ch.setLevel(logging.DEBUG)
    ch.setFormatter(formatter)
    logger.addHandler(ch)
    # argument validation
    args.cuda = args.cuda and torch.cuda.is_available()
    args.device = torch.device("cuda" if args.cuda else "cpu")
    logger.debug(args)
    random.seed(args.seed)  # Seed the Python RNG first so the derived torch seed is reproducible
    torch.manual_seed(random.randint(1, 10000))
    if args.cuda:
        torch.cuda.manual_seed(args.seed)
        torch.backends.cudnn.benchmark = True  # Note: has no effect while cuDNN is disabled below
        torch.backends.cudnn.enabled = False  # Disable nondeterministic ops (not sure if critical but better safe than sorry)
    if not os.path.exists(args.save):
        os.makedirs(args.save)

    split_files = os.path.join(args.data, args.split_file)
    dataset_file = os.path.join(args.data, 'dataset.pth')
    if os.path.isfile(dataset_file):
        train_dataset = torch.load(dataset_file)
    else:
        train_dataset = Dataset(split_files, args.in_dim)
        torch.save(train_dataset, dataset_file)
    logger.debug('==> Size of train data   : %d ' % len(train_dataset))

    # initialize environment, agent, memory
    env = Env(args, train_dataset)
    action_space = env.action_space()
    dqn = Agent(args, action_space)
    mem = ReplayMemory(args, args.memory_capacity)

    # create trainer object for training and testing
    trainer = Trainer(args, env, dqn, mem)

    if args.evaluate:
        # Evaluate step
        dqn.eval()
        for _ in range(args.evaluation_episodes):
            trainer.evaluate_one_step(train_dataset)
    else:
        # Training step
        trainer.train(train_dataset, logger)
        logger.debug('==> Checkpointing everything now...')
        dqn.save(os.path.join(args.save, args.expname))
Example No. 3
# NOTE: the opening CUDA guard below is assumed (this snippet begins mid-script);
# it mirrors the device setup used in the other examples on this page.
if torch.cuda.is_available() and not args.disable_cuda:
    args.device = torch.device('cuda')
    torch.cuda.set_device(args.device)
    torch.cuda.manual_seed(random.randint(1, 10000))
    torch.backends.cudnn.enabled = False  # Disable nondeterministic ops (not sure if critical but better safe than sorry)
else:
    args.device = torch.device('cpu')


# Simple ISO 8601 timestamped logger
def log(s):
    print('[' + str(datetime.now().strftime('%Y-%m-%dT%H:%M:%S')) + '] ' + s)
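
# Example use of the helper above (timestamp shown is illustrative):
#   log('T = 1000 | Avg. reward: 12.5')
#   prints: [2024-01-01T12:00:00] T = 1000 | Avg. reward: 12.5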


# Environment
env = Env(args)
env.train()
action_space = env.action_space()

# Agent
dqn = Agent(args, env)
mem = ReplayMemory(args, args.memory_capacity)
priority_weight_increase = (1 - args.priority_weight) / (args.T_max - args.learn_start)

# Construct validation memory
val_mem = ReplayMemory(args, args.evaluation_size)
T, done = 0, True
while T < args.evaluation_size:
    if done:
        state, done = env.reset(), False

    next_state, _, done = env.step(random.randint(0, action_space - 1))
    # Remaining loop body as in Example No. 4 below; without it T is never incremented
    val_mem.append(state, None, None, done)
    state = next_state
    T += 1
Example No. 4
def main():
    args = parse_arguments()

    results_dir = os.path.join('results', args.id)
    os.makedirs(results_dir, exist_ok=True)
    logger = Logger(results_dir)

    metrics = {
        'steps': [],
        'rewards': [],
        'Qs': [],
        'best_avg_reward': -float('inf')
    }
    np.random.seed(args.seed)
    torch.manual_seed(np.random.randint(1, 10000))
    if torch.cuda.is_available() and not args.disable_cuda:
        args.device = torch.device('cuda')
        torch.cuda.manual_seed(np.random.randint(1, 10000))
        torch.backends.cudnn.enabled = args.enable_cudnn
    else:
        args.device = torch.device('cpu')

    if args.tensorboard_dir is None:
        writer = SummaryWriter(
            os.path.join(results_dir, 'tensorboard', args.game,
                         args.architecture))
    else:
        writer = SummaryWriter(
            os.path.join(args.tensorboard_dir, args.game, args.architecture))

    # Environment
    env = Env(args)
    env.train()
    action_space = env.action_space()

    # Agent
    dqn = Agent(args, env)

    # If a model is provided and evaluate is false, presumably we want to resume, so try to load memory
    if args.model is not None and not args.evaluate:
        if not args.memory:
            raise ValueError(
                'Cannot resume training without memory save path. Aborting...')
        elif not os.path.exists(args.memory):
            raise ValueError(
                'Could not find memory file at {path}. Aborting...'.format(
                    path=args.memory))

        mem = load_memory(args.memory, args.disable_bzip_memory)

    else:
        mem = ReplayMemory(args, args.memory_capacity)

    priority_weight_increase = (1 - args.priority_weight) / (args.T_max - args.learn_start)
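    # Worked example with illustrative values (not necessarily this script's defaults):
    # priority_weight = 0.4, T_max = 50_000_000 and learn_start = 20_000 give an increase of
    # 0.6 / 49_980_000 ≈ 1.2e-8 per step, i.e. the importance-sampling weight is annealed
    # linearly from 0.4 to 1 over the remainder of training (see the min(..., 1) further down).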

    # Construct validation memory
    val_mem = ReplayMemory(args, args.evaluation_size)
    T, done = 0, True
    while T < args.evaluation_size:
        if done:
            state, done = env.reset(), False

        next_state, _, done = env.step(np.random.randint(0, action_space))
        val_mem.append(state, None, None, done)
        state = next_state
        T += 1

    if args.evaluate:
        dqn.eval()  # Set DQN (online network) to evaluation mode
        avg_reward, avg_Q = test(args,
                                 0,
                                 dqn,
                                 val_mem,
                                 metrics,
                                 results_dir,
                                 evaluate=True)  # Test
        logger.info('Avg. reward: ' + str(avg_reward) + ' | Avg. Q: ' +
                    str(avg_Q))
    else:
        # Training loop
        dqn.train()
        T, done = 0, True
        accumulate_reward = 0
        for T in trange(1, args.T_max + 1):
            if done:
                state, done = env.reset(), False
                writer.add_scalar('Train/Reward', accumulate_reward, T)
                accumulate_reward = 0

            if T % args.replay_frequency == 0:
                dqn.reset_noise()  # Draw a new set of noisy weights

            action = dqn.act(state)  # Choose an action greedily (with noisy weights)
            next_state, reward, done = env.step(action)  # Step
            accumulate_reward += reward
            if args.reward_clip > 0:
                reward = max(min(reward, args.reward_clip),
                             -args.reward_clip)  # Clip rewards
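                # e.g. with args.reward_clip = 1, a raw reward of +3 becomes +1 and -5 becomes -1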
            mem.append(state, action, reward, done)  # Append transition to memory

            # Train and test
            if T >= args.learn_start:
                # Anneal importance sampling weight β to 1
                mem.priority_weight = min(mem.priority_weight + priority_weight_increase, 1)

                if T % args.replay_frequency == 0:
                    dqn.learn(mem)  # Train with n-step distributional double-Q learning

                if T % args.evaluation_interval == 0:
                    dqn.eval()  # Set DQN (online network) to evaluation mode
                    avg_reward, avg_Q = test(args, T, dqn, val_mem, metrics,
                                             results_dir)  # Test
                    writer.add_scalar('Eval/Reward', avg_reward, T)
                    writer.add_scalar('Eval/Q', avg_Q, T)
                    logger.info('T = ' + str(T) + ' / ' + str(args.T_max) +
                                ' | Avg. reward: ' + str(avg_reward) +
                                ' | Avg. Q: ' + str(avg_Q))
                    dqn.train()  # Set DQN (online network) back to training mode

                    # If memory path provided, save it
                    if args.memory is not None:
                        save_memory(mem, args.memory, args.disable_bzip_memory)

                # Update target network
                if T % args.target_update == 0:
                    dqn.update_target_net()

                # Checkpoint the network
                if args.checkpoint_interval != 0 and T % args.checkpoint_interval == 0:
                    dqn.save(results_dir, 'checkpoint.pth')

            state = next_state

    env.close()
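
# Presumed entry point (not shown in the snippet above):
if __name__ == '__main__':
    main()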
Example No. 5
import sys
import random

import numpy as np

from env import Env

games = ['atlantis', 'breakout', 'pong', 'space_invaders', 'kung_fu_master', 'boxing', 'seaquest', 'chopper_command']

for game in games:
    env = Env(game, 1234, 'cuda', 600, 4, False)
    acts = env.action_space() - 1

    all_rews = []
    for i in range(100):
        env.reset()
        done = False
        rew = 0
        while not done:
            _, r, done, _ = env.step(random.randint(0, acts))
            rew += r
        all_rews.append(rew)
        print('Ep: ', i, end='\r')
    print(game, np.mean(all_rews))
Example No. 6
def ensemble_test(args,
                  T,
                  dqn,
                  val_mem,
                  metrics,
                  results_dir,
                  num_ensemble,
                  evaluate=False):
    env = Env(args)
    env.eval()
    metrics['steps'].append(T)
    T_rewards, T_Qs = [], []
    action_space = env.action_space()

    # Test performance over several episodes
    done = True
    for _ in range(args.evaluation_episodes):
        while True:
            if done:
                state, reward_sum, done = env.reset(), 0, False
            q_tot = 0
            for en_index in range(num_ensemble):
                if en_index == 0:
                    q_tot = dqn[en_index].ensemble_q(state)
                else:
                    q_tot += dqn[en_index].ensemble_q(state)
            action = q_tot.argmax(1).item()

            state, reward, done = env.step(action)  # Step
            reward_sum += reward
            if args.render:
                env.render()
            if done:
                T_rewards.append(reward_sum)
                break
    env.close()

    # Test Q-values over validation memory
    for state in val_mem:  # Iterate over valid states
        for en_index in range(num_ensemble):
            T_Qs.append(dqn[en_index].evaluate_q(state))

    avg_reward, avg_Q = sum(T_rewards) / len(T_rewards), sum(T_Qs) / len(T_Qs)
    if not evaluate:
        # Save model parameters if improved
        if avg_reward > metrics['best_avg_reward']:
            metrics['best_avg_reward'] = avg_reward
            for en_index in range(num_ensemble):
                dqn[en_index].save(results_dir,
                                   name='%dth_model.pth' % (en_index))

        # Append to results and save metrics
        metrics['rewards'].append(T_rewards)
        metrics['Qs'].append(T_Qs)

        torch.save(metrics, os.path.join(results_dir, 'metrics.pth'))
        # Plot
        _plot_line(metrics['steps'],
                   metrics['rewards'],
                   'Reward',
                   path=results_dir)
        _plot_line(metrics['steps'], metrics['Qs'], 'Q', path=results_dir)

    # Return average reward and Q-value
    return avg_reward, avg_Q
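
A possible invocation, sketched under the assumption that the ensemble is simply a list of independently constructed agents (Agent, env, args, T, val_mem, metrics and results_dir are taken from the earlier examples on this page, not from this function's own file):

num_ensemble = 3
dqn = [Agent(args, env) for _ in range(num_ensemble)]  # one member network per ensemble index
avg_reward, avg_Q = ensemble_test(args, T, dqn, val_mem, metrics, results_dir,
                                  num_ensemble, evaluate=True)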