Example #1
    def __init__(self, env_config, agent_config, use_cuda = True):
        self.env = GymEnvironment(name = env_config["name"])
        self.action_size = self.env.action_size[0]
        self.state_size = self.env.obs_size[0]
        # initialize mimic agent
        self.agent = MimicAgent(action_size = self.action_size,
                                state_size = self.state_size,
                                **agent_config, use_cuda = use_cuda)

        # if train_config.get('load_path')
            # self.agent.load_models(train_config.get('load_path'))

        # initialize expert
        # self.expert = LunarLanderExpert()
        self.expert = SmallReactivePolicy(self.env.observation_space, self.env.action_space)
Example #2
    def __init__(self, env_config, agent_config):
        self.n = 10
        self.noise_dim = 2
        self.env = GymEnvironment(name = env_config["name"])
        self.critic = SimpleCritic(self.n, self.env.obs_size, self.env.action_size)
        self.hallucinator = SimpleHallucinator(self.n, self.env.obs_size, self.noise_dim)
        self.policy_buffer = PolicyBuffer()
        self.policy_c = Policy
        self.trainer = SimpleTrainer(self.env, self.critic, self.hallucinator, self.policy_buffer, self.policy_c, self.noise_dim)
Example #3
def test_game_data():
    g = GameBatchData(get_timestamp(True))
    env = GymEnvironment('Atlantis-v0')
    env.reset()
    for _ in range(5):
        i = g.new_game(get_timestamp(True))
        print("Game: %s" % i)
        for _ in range(5):
            observation, reward, done, info = env.step(0)
            d = g.add_step(
                timestamp=get_timestamp(True),
                observation=observation,
                concatenated_observation=observation,
                reward="rew%s" % (random.randint(1, 10)),
                action_value=[
                    "av%s" % (random.randint(1, 10)),
                    "av%s" % (random.randint(1, 10))
                ],
                action="a%s" % (random.randint(1, 10)),
            )
            print("Step %s" % d)
    g.save_progress()
Example #4
def test_graph():
    env = GymEnvironment('MsPacman-v0')
    graph = Graph(actions=10)
    env.reset()
    screenshots = []
    g = graph.get_graph()
    with tf.Session() as sess:
        init = tf.global_variables_initializer()
        sess.run(init)

        for x in range(1, 7):
            env.render()
            observation, reward, done, info = env.step(0)
            screenshots.append(observation)
            if x % 4 == 0:
                concat_image = np.concatenate((screenshots[0], screenshots[1], screenshots[2], screenshots[3]), axis=1)
                im = Image.fromarray(concat_image).convert('LA')
                # im.show()
                grayscale_im = np.array(im)

                graph.run_graph(sess, grayscale_im.reshape([-1, 25600]))
Example #5
def main(args):
    env = GymEnvironment(args, gamma)
    env.env = env.env.unwrapped

    actor_critic = Policy(obs_shape,
                          env.action_size,
                          base_kwargs={'recurrent': False})
    actor_critic.load_state_dict(torch.load('log/model.pt'))
    actor_critic.to(device)

    agent = PPO(actor_critic, clip_param, ppo_epoch, num_mini_batch,
                value_loss_coef, entropy_coef, lr, eps, max_grad_norm)
    rollouts = RolloutStorage(num_steps, num_processes, obs_shape,
                              env.action_space,
                              actor_critic.recurrent_hidden_state_size)
    current_obs = torch.zeros(num_processes, *obs_shape)

    obs, _, _, _ = env.new_expt()
    obs = obs[np.newaxis, ...]

    current_obs[:, -1] = torch.from_numpy(obs)
    rollouts.obs[0].copy_(current_obs)

    current_obs = current_obs.to(device)
    rollouts.to(device)

    num_updates = math.ceil(args.max_timesteps / (num_processes * num_steps))
    n_goal_reached = 0
    n_episodes = 0
    for j in range(num_updates):
        for step in range(num_steps):
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            cpu_actions = action.squeeze(1).cpu().numpy()

            (obs, reward, done), goal_reached = env.act(action)
            reward = torch.from_numpy(np.expand_dims(np.stack([reward]),
                                                     1)).float()

            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in [done]])

            masks = masks.to(device)

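            # Shift the stacked observations one slot to the left; the newest frame is written into the last slot below (after zeroing on episode end)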
            current_obs[:, :-1] = current_obs[:, 1:]
            if done:
                current_obs[:] = 0
            current_obs[:, -1] = torch.from_numpy(obs)
            rollouts.insert(current_obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks)

            if done:
                n_episodes += 1
                env.new_expt()
                if goal_reached:
                    n_goal_reached += 1

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                rollouts.masks[step]).detach()

        rollouts.compute_returns(next_value, use_gae, gamma, tau, step)
        value_loss, action_loss, dist_entropy = agent.update(rollouts, step)
        rollouts.after_update()

        if j % log_interval == 0:
            total_num_steps = (j + 1) * num_processes * num_steps

            try:
                success = float(n_goal_reached) / n_episodes
            except ZeroDivisionError:
                success = 0.
            print(
                "Timesteps: {}, Goal reached : {} / {}, Success %: {}".format(
                    total_num_steps, n_goal_reached, n_episodes, success))

    if args.lang_coeff > 0:
        av_list = np.array(env.action_vectors_list)
        for k in range(len(spearman_corr_coeff_actions)):
            sr, _ = spearmanr(env.rewards_list, av_list[:, k])
            print(k, sr)
Example #6
mainarg.add_argument(
    "--save_weights_prefix",
    help="Save network to given file. Epoch and extension will be appended.")

comarg = parser.add_argument_group('Common')
comarg.add_argument("output_folder", help="Where to write results to.")
comarg.add_argument("--num_episodes",
                    type=int,
                    default=100,
                    help="Number of episodes to test.")
comarg.add_argument("--random_seed",
                    type=int,
                    help="Random seed for repeatable experiments.")
args = parser.parse_args()

if args.random_seed:
    random.seed(args.random_seed)

env = GymEnvironment(args.env_id, args)
net = DeepQNetwork(env.numActions(), args)
mem = None
agent = Agent(env, mem, net, args)

if args.load_weights:
    print "Loading weights from %s" % args.load_weights
    net.load_weights(args.load_weights)

env.gym.monitor.start(args.output_folder, force=True)
agent.play(args.num_episodes)
env.gym.monitor.close()
Example #7
antarg.add_argument("--random_starts", type=int, default=30, help="Perform max this number of dummy actions after game restart, to produce more random game dynamics.")

mainarg = parser.add_argument_group('Main loop')
mainarg.add_argument("--load_weights", help="Load network from file.")
mainarg.add_argument("--save_weights_prefix", help="Save network to given file. Epoch and extension will be appended.")

comarg = parser.add_argument_group('Common')
comarg.add_argument("output_folder", help="Where to write results to.")
comarg.add_argument("--num_episodes", type=int, default=10, help="Number of episodes to test.")
comarg.add_argument("--random_seed", type=int, help="Random seed for repeatable experiments.")
args = parser.parse_args()

if args.random_seed:
  random.seed(args.random_seed)

env = GymEnvironment(args.env_id, args)
net = DeepQNetwork(env.numActions(), args)
buf = MemoryBuffer(args)

if args.load_weights:
  print "Loading weights from %s" % args.load_weights
  net.load_weights(args.load_weights)

env.gym.monitor.start(args.output_folder, force=True)
avg_reward = 0
num_episodes = args.num_episodes
for i_episode in xrange(num_episodes):
    env.restart()
    observation = env.getScreen()
    buf.reset()
    i_total_reward = 0
Example #8
                    choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"],
                    default="INFO",
                    help="Log level.")
args = parser.parse_args()

logger = logging.getLogger()
logger.setLevel(args.log_level)
# bug with double logging
if args.environment == 'gym':
    logger.handlers.pop()

if args.random_seed:
    random.seed(args.random_seed)

# instantiate classes
env = GymEnvironment(args.rom_file,
                     args) if args.environment == 'gym' else ALEEnvironment(
                         args.rom_file, args)
mem = ReplayMemory(args.replay_size, args)
net = DeepQNetwork(env.numActions(), args)
agent = Agent(env, mem, net, args)
stats = Statistics(agent, net, mem, env, args)

if args.load_weights:
    logger.info("Loading weights from %s" % args.load_weights)
    net.load_weights(args.load_weights)

if args.play_games:
    logger.info("Playing for %d game(s)" % args.play_games)
    stats.reset()
    agent.play(args.play_games)
    stats.write(0, "play")
Example #9
def main():
    # Process arguments
    args = utils.parse_args()

    # Use random seed from argument
    if args.random_seed:
        random.seed(args.random_seed)

    # Instantiate environment class
    if args.environment == "ale":
        env = ALEEnvironment(args.game, args)
    elif args.environment == "gym":
        env = GymEnvironment(args.game, args)
    elif args.environment == "robot":
        env = RobotEnvironment(args.game, args)
    else:
        assert False, "Unknown environment" + args.environment

    # Instantiate DQN
    action_dim = env.action_dim()
    state_dim = env.state_dim()
    net = DQN(state_dim, action_dim, args)

    # Load weights before starting training
    if args.load_weights:
        filepath = args.load_weights
        net.load(filepath)

    # Instantiate agent
    agent = Agent(env, net, args)

    # Start statistics
    stats = Statistics(agent, agent.net, agent.net.memory, env, args)

    # Play game with two players (user and agent)
    if args.two_player:
        player_b = PlayerTwo(args)
        env.set_mode('test')
        stats.reset()
        agent.play_two_players(player_b)
        stats.write(0, "2player")
        sys.exit()

    # Play agent
    if args.play_games > 0:
        env.set_mode('test')
        stats.reset()
        for _ in range(args.play_games):
            agent.play()
        stats.write(0, "play")
        sys.exit()

    # Populate replay memory with random steps
    if args.random_steps:
        env.set_mode('test')
        stats.reset()
        agent.play_random(args.random_steps)
        stats.write(0, "random")

    for epoch in range(args.start_epoch, args.epochs):
        # Train agent
        if args.train_steps:
            env.set_mode('train')
            stats.reset()
            agent.train(args.train_steps)
            stats.write(epoch + 1, "train")

            # Save weights after every epoch
            if args.save_weights_prefix:
                filepath = args.save_weights_prefix + "_%d.h5" % (epoch + 1)
                net.save(filepath)

        # Test agent
        if args.test_steps:
            env.set_mode('test')
            stats.reset()
            agent.test(args.test_steps)
            stats.write(epoch + 1, "test")

    # Stop statistics
    stats.close()
Example #10
File: main.py Project: fizz-ml/hDQN
    def __init__(self, env_config, subroutine_configs):
        self.env = GymEnvironment(name = env_config["name"])
        self.controllers = []
        for config in subroutine_configs:
            c = DQNController(config, self.env)
            self.controllers.append(c)
Example #11
import agent
from environment import GymEnvironment
import tensorflow as tf

env_agent = GymEnvironment()
agent = agent.DQNAgent(environment=env_agent)

with tf.Session() as sess:
    agent.build_dqn(sess)
    sess.run(tf.global_variables_initializer())

    agent.train(episodes=50000)
Example #12
    def __init__(self, env_config, agent_config, use_cuda=True):
        self.env = GymEnvironment(name=env_config["name"])
        self.agent = DDPGAgent(action_size=self.env.action_size[0],
                               state_size=self.env.obs_size[0],
                               **agent_config,
                               use_cuda=use_cuda)
Example #13
class Runner:
    def __init__(self, env_config, agent_config, use_cuda=True):
        self.env = GymEnvironment(name=env_config["name"])
        self.agent = DDPGAgent(action_size=self.env.action_size[0],
                               state_size=self.env.obs_size[0],
                               **agent_config,
                               use_cuda=use_cuda)

    def train(self, train_config):
        # Load model
        if train_config.get('load_path'):
            self.agent.load_models(train_config.get('load_path'))

        # Fill experience replay
        self.env.new_episode()
        ma_reward = 0

        prefill = train_config['prefill']
        if prefill > 0:
            temp_reward = 0
            temp_done = False
            for step in range(prefill):
                cur_obs = self.env.cur_obs
                _ = self.agent.get_next_action(cur_obs)
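                # Prefill the replay buffer with a uniform random action in [-1, 1], repeated across all action dimensions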
                cur_action = np.asarray([random.random() * 2.0 - 1.0] *
                                        self.env.action_size[0])
                next_state, reward, done = self.env.next_obs(cur_action,
                                                             render=(step %
                                                                     8 == 0))

                temp_reward = reward
                temp_done = done
                self.agent.log_reward(temp_reward, temp_done)
                ma_reward = ma_reward * 0.99 + reward * 0.01

        # Start training
        train_steps = train_config['steps']

        temp_reward = 0
        temp_done = True
        for step in range(train_steps):
            cur_obs = self.env.cur_obs
            # TODO: This step probably belongs somewhere else
            cur_action = np.squeeze(self.agent.get_next_action(cur_obs),
                                    axis=0)
            if (any(np.isnan(cur_obs))):
                pdb.set_trace()
            next_state, reward, done = self.env.next_obs(cur_action,
                                                         render=(step %
                                                                 8 == 0))

            temp_reward = reward
            temp_done = done

            self.agent.log_reward(temp_reward, temp_done)

            self.agent.train()
            ma_reward = ma_reward * 0.995 + reward * 0.005
            if (step % 500 == 0):
                print(cur_obs, ' ', cur_action, 'Reward:', ma_reward)
                print('Eps', self.agent.epsilon)
            if (step % 5000 == 0):
                print('Saving weights')
                self.agent.save_models(train_config['save_path'])

    def test(self, test_config):
        if test_config.get('load_path'):
            self.agent.load_models(test_config.get('load_path'))
        else:
            print(
                'Warning: did not parse load path. Running random init model')

        test_steps = test_config['steps']

        self.env.new_episode()

        temp_reward = 0
        temp_done = False
        for step in range(test_steps):
            cur_obs = self.env.cur_obs
            cur_action = np.squeeze(self.agent.get_next_action(cur_obs,
                                                               is_test=True),
                                    axis=0)
            cur_action = np.clip(cur_action, -1, 1)
            next_state, reward, done = self.env.next_obs(cur_action,
                                                         render=True)
Example #14
class Runner:
    def __init__(self, env_config, agent_config, fd_config, use_cuda = True):
        self.env = GymEnvironment(name = env_config["name"])
        self.action_size = self.env.action_size[0]
        self.state_size = self.env.obs_size[0]
        # initialize mimic agent
        self.agent = MimicAgent(action_size = self.action_size,
                                state_size = self.state_size,
                                **agent_config, use_cuda = use_cuda)

        self.fd = FDModel(action_size = self.action_size,
                            state_size = self.state_size,
                            **fd_config, use_cuda = use_cuda)
        # if train_config.get('load_path')
            # self.agent.load_models(train_config.get('load_path'))

        # initialize expert
        # self.expert = LunarLanderExpert()
        self.expert = SmallReactivePolicy(self.env.observation_space, self.env.action_space)

    def reset(self):
        self.agent.reset()
        self.env.new_episode()
    
    def sample_expert(self, num_tuples, do_render = False):
        '''
            Accumulates experience tuples from the expert for num tuples.
            Returns states, action, rewards and done flags as np arrays.
        '''
        state_size = self.state_size
        action_size = self.action_size
        capacity = num_tuples

        actions = np.empty((capacity, action_size), dtype = np.float16)
        states = np.empty((capacity, state_size), dtype = np.float16)
        next_states = np.empty((capacity, state_size), dtype = np.float16)
        rewards = np.empty(capacity, dtype = np.float16)

        self.env.new_episode()

        transition = 0
        while transition < num_tuples:
            print('{} / {}'.format(transition+1, num_tuples))
            cur_obs = self.env.cur_obs
            cur_action = self.expert.get_next_action(cur_obs)

            next_state, reward, done = self.env.next_obs(cur_action, render = ((transition % 8 == 0) and do_render))
            
            # don't confuse the fd model with terminal states
            if not done:
                actions[transition] = cur_action
                states[transition] = cur_obs
                next_states[transition] = next_state 
                rewards[transition] = reward
                transition += 1

        print('Ave expert reward: ', np.mean(rewards))
        return states, actions, next_states, rewards

    def train_mimic(self, states, actions, train_config, num_epochs = 4000):
        # Load model
        #train_config.get('num_epochs')
        self.agent.train_epochs(states, actions, num_epochs, states.shape[0])

    def train_mimic_fd(self, train_config, num_tuples, num_epochs, do_render = False):
        # Collects DAgger-style rollouts (mixing expert and mimic actions) and periodically trains the mimic agent; mirrors train_dagger below.
        state_size = self.state_size
        action_size = self.action_size
        capacity = num_tuples

        actions = np.empty((capacity, action_size), dtype = np.float16)
        states = np.empty((capacity, state_size), dtype = np.float16)
        rewards = np.empty(capacity, dtype = np.float16)
        dones = np.empty(capacity, dtype = np.bool)

        self.env.new_episode()

        beta = 1.0
        tuples_per_epoch = int(num_tuples/num_epochs)
        epochs = 0
        for i in range(num_tuples):
            print('{} / {}'.format(i+1, num_tuples))
            cur_obs = self.env.cur_obs
            cur_action = None
            expert_action = None
            if beta > np.random.rand():
                cur_action = self.expert.get_next_action(cur_obs)
                expert_action = cur_action
            else:
                expert_action = self.expert.get_next_action(cur_obs)
                cur_action = np.squeeze(self.agent.get_next_action(cur_obs), axis=0)
                # cur_action = np.clip(cur_action, -1, 1)

            next_state, reward, done = self.env.next_obs(cur_action, render = ((i % 8 == 0) and do_render))

            actions[i] = expert_action
            states[i] = cur_obs
            rewards[i] = reward
            dones[i] = done

            beta = 1.0-float(i)/num_tuples

            if ((i+1)%tuples_per_epoch) == 0 and i != 0:
                self.agent.train_epochs(states[:i], actions[:i], 1, 500)
                epochs += 1



    def train_fd(self, states, actions, next_states, train_config, num_epochs):
        self.fd.train_epochs(states, actions, next_states, num_epochs, states.shape[0])

    def train_dagger(self, train_config, num_tuples, num_epochs, do_render = False):
        # Load model
        #train_config.get('num_epochs')
        state_size = self.state_size
        action_size = self.action_size
        capacity = num_tuples

        actions = np.empty((capacity, action_size), dtype = np.float16)
        states = np.empty((capacity, state_size), dtype = np.float16)
        rewards = np.empty(capacity, dtype = np.float16)
        dones = np.empty(capacity, dtype = np.bool)

        self.env.new_episode()

        beta = 1.0
        tuples_per_epoch = int(num_tuples/num_epochs)
        epochs = 0
        for i in range(num_tuples):
            print('{} / {}'.format(i+1, num_tuples))
            cur_obs = self.env.cur_obs
            cur_action = None
            expert_action = None
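            # DAgger-style schedule: with probability beta take the expert action, otherwise the mimic's; the expert action is always stored as the supervision target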
            if beta > np.random.rand():
                cur_action = self.expert.get_next_action(cur_obs)
                expert_action = cur_action
            else:
                expert_action = self.expert.get_next_action(cur_obs)
                cur_action = np.squeeze(self.agent.get_next_action(cur_obs), axis=0)
                # cur_action = np.clip(cur_action, -1, 1)

            next_state, reward, done = self.env.next_obs(cur_action, render = ((i % 8 == 0) and do_render))

            actions[i] = expert_action
            states[i] = cur_obs
            rewards[i] = reward
            dones[i] = done

            beta = 1.0-float(i)/num_tuples

            if ((i+1)%tuples_per_epoch) == 0 and i != 0:
                self.agent.train_epochs(states[:i], actions[:i], 1, 500)
                epochs += 1

        # print('Ave expert reward: ', np.mean(rewards))


    def test_mimic(self, test_config, do_render = False):
        test_steps = test_config['steps']

        self.env.new_episode()

        tot_reward = 0
        for step in range(test_steps):
            cur_obs = self.env.cur_obs
            cur_action = np.squeeze(self.agent.get_next_action(cur_obs), axis=0)
            # cur_action = np.clip(cur_action, -1, 1)
            _, reward, _ = self.env.next_obs(cur_action, render = (step%1==0 and do_render))
            tot_reward += reward

        print('Ave test reward: {}'.format(tot_reward/test_steps))
        return tot_reward/test_steps
Example #15
def train(params):
    
    # Load Atari rom and prepare ALE environment 
    atari = GymEnvironment(params.random_start_wait, params.show_game)

    # Initialize two Q-Value Networks one for training and one for target prediction
    dqn_train  = DeepQNetwork(
        params=params,
        num_actions=atari.num_actions,
        network_name="qnetwork-train",
        trainable=True
    )

    # Q-Network for predicting target Q-values
    dqn_target= DeepQNetwork(
        params=params,
        num_actions=atari.num_actions,
        network_name="qnetwork-target",
        trainable=False
    )
    
    # Initialize replay memory for storing experience to sample batches from
    replay_mem = ReplayMemory(params.replay_capacity, params.batch_size)

    # Small structure for storing the last four screens
    history = ScreenHistory(params)

    # Checkpoint directory. Tensorflow assumes this directory already exists so we need to create it
    replay_mem_dump   = os.path.abspath(os.path.join(params.output_dir, "replay_memory.hdf5"))
    checkpoint_dir    = os.path.abspath(os.path.join(params.output_dir, "checkpoints"))
    checkpoint_prefix = os.path.join(checkpoint_dir, "model")
    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)

    train_step         = 0
    count_actions      = np.zeros(atari.num_actions)   # Count per action (only greedy)
    count_act_random   = 0  # Count of random actions
    count_act_greedy   = 0  # Count of greedy actions

    # Histories of qvalues and loss for running average
    qvalues_hist = collections.deque([0]*params.interval_summary,  maxlen=params.interval_summary)
    loss_hist    = collections.deque([10]*params.interval_summary, maxlen=params.interval_summary)

    # Time measurements
    dt_batch_gen    = collections.deque([0]*10, maxlen=10)
    dt_optimization = collections.deque([0]*10, maxlen=10)
    dt_train_total  = collections.deque([0]*10, maxlen=10)

    # Optionally load pre-initialized replay memory from disk
    if params.replay_mem_dump is not None and params.is_train:
        print("Loading pre-initialized replay memory from HDF5 file.")
        replay_mem.load(params.replay_mem_dump)


    # Initialize a new game and store the screens in the history
    reward, screen, is_terminal = atari.new_random_game()
    for _ in xrange(params.history_length):
        history.add(screen)

    # Initialize the TensorFlow session
    gpu_options = tf.GPUOptions(
       per_process_gpu_memory_fraction=0.4
    )

    with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:

        # Initialize the TensorFlow session
        init = tf.initialize_all_variables()
        sess.run(init)

        # Only save trainable variables and the global step to disk
        tf_vars_to_save = tf.trainable_variables() + [dqn_train.global_step]
        saver = tf.train.Saver(tf_vars_to_save, max_to_keep=40)

        if params.model_file is not None:
            # Load pre-trained model from disk
            saver.restore(sess, params.model_file)
            train_step, learning_rate = sess.run([dqn_train.global_step, dqn_train.learning_rate])
            print("Restarted training from model file. Step = %06i, Learning Rate = %.5f" % (train_step, learning_rate))


        # Initialize summary writer
        dqn_train.build_summary_writer(sess)

        # Initialize the target Q-Network fixed with the same weights
        update_target_network(sess, "qnetwork-train", "qnetwork-target")


        for step in xrange(params.num_steps):

            replay_mem_size = replay_mem.num_examples()
            if params.is_train and replay_mem_size < params.train_start and step % 1000 == 0:
                print("Initializing replay memory %i/%i" % (step, params.train_start))

            # Epsilon Greedy Exploration: with the probability of epsilon
            # choose a random action, otherwise go greedy with the action
            # having the maximal Q-value. Note the minimum epsilon of 0.1
            if params.is_train:
                epsilon = max(0.1, 1.0-float(train_step*params.train_freq) / float(params.epsilon_step))
            else:
                epsilon = 0.05


            ################################################################
            ####################### SELECT A MOVE ##########################
            ################################################################

            # Either choose a random action or predict the action using the Q-network
            do_random_action = (random.random() < epsilon)
            if do_random_action or (replay_mem_size < params.train_start and params.is_train):
                action_id = random.randrange(atari.num_actions)
                count_act_random += 1
            else:

                # Get the last screens from the history and perform
                # feed-forward through the network to compute Q-values
                feed_dict  = { dqn_train.pl_screens: history.get() }
                qvalues    = sess.run(dqn_train.qvalues, feed_dict=feed_dict)

                # Choose the best action based on the approximated Q-values
                qvalue_max = np.max(qvalues[0])
                action_id  = np.argmax(qvalues[0])

                count_act_greedy += 1
                count_actions[action_id] += 1
                qvalues_hist.append(qvalue_max)


            ################################################################
            ####################### PLAY THE MOVE ##########################
            ################################################################

            # Play the selected action (either random or predicted) on the Atari game
            # Note that the action is performed for k = 4 frames (frame skipping)
            cumulative_reward, screen, is_terminal = atari.act(action_id)

            # Perform reward clipping and add the example to the replay memory
            cumulative_reward = min(+1.0, max(-1.0, cumulative_reward))

            # Add the screen to short term history and replay memory
            history.add(screen)

            # Add experience to replay memory
            if params.is_train:
                replay_mem.add(action_id, cumulative_reward, screen, is_terminal)

            # Check if we are game over, and if yes, initialize a new game
            if is_terminal:
                reward, screen, is_terminal = atari.new_random_game()
                replay_mem.add(0, reward, screen, is_terminal)
                history.add(screen)


            ################################################################
            ###################### TRAINING MODEL ##########################
            ################################################################


            if params.is_train and step > params.train_start and step % params.train_freq == 0:

                t1 = time.time()

                # Prepare batch and train the network
                # TODO: set actions with terminal == 1 to reward = -1 ??
                screens_in, actions, rewards, screens_out, terminals = replay_mem.sample_batch()

                dt_batch_gen.append(time.time() - t1)
                t2 = time.time()

                # Compute the target rewards from the previously fixed network
                # Note that the forward run is performed on the output screens.
                qvalues_target = sess.run(
                    dqn_target.qvalues,
                    feed_dict={ dqn_target.pl_screens: screens_out }
                )

                # Inputs for trainable Q-network
                feed_dict = {
                    dqn_train.pl_screens   : screens_in,
                    dqn_train.pl_actions   : actions,
                    dqn_train.pl_rewards   : rewards,
                    dqn_train.pl_terminals : terminals,
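                    # Max target-network Q-value per sample; presumably combined with rewards, discount and terminal masks inside the training graph to form the Bellman target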
                    dqn_train.pl_qtargets  : np.max(qvalues_target, axis=1),

                }

                # Actual training operation
                _, loss, train_step = sess.run([dqn_train.train_op,
                                                dqn_train.loss,
                                                dqn_train.global_step],
                                                feed_dict=feed_dict)

                t3 = time.time()
                dt_optimization.append(t3 - t2)
                dt_train_total.append(t3 - t1)

                # Running average of the loss
                loss_hist.append(loss)

                 # Check if the returned loss is not NaN
                if np.isnan(loss):
                    print("[%s] Training failed with loss = NaN." %
                          datetime.now().strftime("%Y-%m-%d %H:%M"))


                # Once every n = 10000 frames update the Q-network for predicting targets
                if train_step % params.network_update_rate == 0:
                    print("[%s] Updating target network." % datetime.now().strftime("%Y-%m-%d %H:%M"))
                    update_target_network(sess, "qnetwork-train", "qnetwork-target")


                ################################################################
                ####################### MODEL EVALUATION #######################
                ################################################################

                if params.is_train and train_step % params.eval_frequency == 0:

                    eval_total_reward = 0
                    eval_num_episodes = 0
                    eval_num_rewards = 0
                    eval_episode_max_reward = 0
                    eval_episode_reward = 0
                    eval_actions = np.zeros(atari.num_actions)

                    # Initialize new game without random start moves
                    reward, screen, terminal = atari.new_game()
                    for _ in range(4):
                        history.add(screen)

                    for eval_step in range(params.eval_steps):

                        if random.random() < params.eval_epsilon:
                            # Random action
                            action_id = random.randrange(atari.num_actions)
                        else:
                            # Greedy action
                            # Get the last screens from the history and perform
                            # feed-forward through the network to compute Q-values
                            feed_dict_eval  = { dqn_train.pl_screens: history.get() }
                            qvalues = sess.run(dqn_train.qvalues, feed_dict=feed_dict_eval)

                            # Choose the best action based on the approximated Q-values
                            qvalue_max = np.max(qvalues[0])
                            action_id  = np.argmax(qvalues[0])

                        # Keep track of how many of each action is performed
                        eval_actions[action_id] += 1

                        # Perform the action
                        reward, screen, terminal = atari.act(action_id)
                        history.add(screen)

                        eval_episode_reward += reward
                        if reward > 0:
                            eval_num_rewards += 1

                        if terminal:
                            eval_total_reward += eval_episode_reward
                            eval_episode_max_reward = max(eval_episode_reward, eval_episode_max_reward)
                            eval_episode_reward = 0
                            eval_num_episodes += 1

                            reward, screen, terminal = atari.new_game()
                            for _ in range(4):
                                history.add(screen)

                    # Send statistics about the environment to TensorBoard
                    eval_update_ops = [
                        dqn_train.eval_rewards.assign(eval_total_reward),
                        dqn_train.eval_num_rewards.assign(eval_num_rewards),
                        dqn_train.eval_max_reward.assign(eval_episode_max_reward),
                        dqn_train.eval_num_episodes.assign(eval_num_episodes),
                        dqn_train.eval_actions.assign(eval_actions / np.sum(eval_actions))

                    ]
                    sess.run(eval_update_ops)
                    summaries = sess.run(dqn_train.eval_summary_op, feed_dict=feed_dict)
                    dqn_train.train_summary_writer.add_summary(summaries, train_step)

                    print("[%s] Evaluation Summary" % datetime.now().strftime("%Y-%m-%d %H:%M"))
                    print("  Total Reward: %i" % eval_total_reward)
                    print("  Max Reward per Episode: %i" % eval_episode_max_reward)
                    print("  Num Episodes: %i" % eval_num_episodes)
                    print("  Num Rewards: %i" % eval_num_rewards)


                ################################################################
                ###################### PRINTING / SAVING #######################
                ################################################################

                # Write a training summary to disk
                if params.is_train and train_step % params.interval_summary == 0:

                    avg_dt_batch_gen    = sum(dt_batch_gen)    / float(len(dt_batch_gen))
                    avg_dt_optimization = sum(dt_optimization) / float(len(dt_optimization))
                    avg_dt_total        = sum(dt_train_total)  / float(len(dt_train_total))
                    # print("Avg. Time Batch Preparation: %.3f seconds" % avg_dt_batch_gen)
                    # print("Avg. Time Train Operation:   %.3f seconds" % avg_dt_train_op)
                    # print("Avg. Time Total per Batch:   %.3f seconds (%.2f samples/second)" %
                    #       (avg_dt_total, (1.0/avg_dt_total)*params.batch_size))

                    # Send statistics about the environment to TensorBoard
                    update_game_stats_ops = [
                        dqn_train.avg_reward_per_game.assign(atari.avg_reward_per_episode()),
                        dqn_train.max_reward_per_game.assign(atari.max_reward_per_episode),
                        dqn_train.avg_moves_per_game.assign(atari.avg_steps_per_episode()),
                        dqn_train.total_reward_replay.assign(replay_mem.total_reward()),
                        dqn_train.num_games_played.assign(atari.episode_number),
                        dqn_train.actions_random.assign(count_act_random),
                        dqn_train.actions_greedy.assign(count_act_greedy),
                        dqn_train.runtime_batch.assign(avg_dt_batch_gen),
                        dqn_train.runtime_train.assign(avg_dt_optimization),
                        dqn_train.runtime_total.assign(avg_dt_total),
                        dqn_train.samples_per_second.assign((1.0/avg_dt_total)*params.batch_size)
                    ]
                    sess.run(update_game_stats_ops)

                    # Build and save summaries
                    summaries = sess.run(dqn_train.train_summary_op, feed_dict=feed_dict)
                    dqn_train.train_summary_writer.add_summary(summaries, train_step)

                    avg_qvalue = avg_loss = 0
                    for i in xrange(len(qvalues_hist)):
                        avg_qvalue += qvalues_hist[i]
                        avg_loss   += loss_hist[i]

                    avg_qvalue /= float(len(qvalues_hist))
                    avg_loss   /= float(len(loss_hist))

                    format_str = "[%s] Step %06i, ReplayMemory = %i, Epsilon = %.4f, "\
                                 "Episodes = %i, Avg.Reward = %.2f, Max.Reward = %.2f, Avg.QValue = %.4f, Avg.Loss = %.6f"
                    print(format_str % (datetime.now().strftime("%Y-%m-%d %H:%M"), train_step,
                                        replay_mem.num_examples(), epsilon, atari.episode_number,
                                        atari.avg_reward_per_episode(), atari.max_reward_per_episode,
                                        avg_qvalue, avg_loss))

                    # For debugging purposes, dump the batch to disk
                    #print("[%s] Writing batch images to file (debugging)" %
                    #      datetime.now().strftime("%Y-%m-%d %H:%M"))
                    #batch_output_dir = os.path.join(params.output_dir, "batches/%06i/" % train_step)
                    #replay_mem.write_batch_to_disk(batch_output_dir, screens_in, actions, rewards, screens_out)

                # Write model checkpoint to disk
                if params.is_train and train_step % params.interval_checkpoint == 0:
                    path = saver.save(sess, checkpoint_prefix, global_step=train_step)
                    print("[%s] Saving TensorFlow model checkpoint to disk." %
                          datetime.now().strftime("%Y-%m-%d %H:%M"))

                    # Dump the replay memory to disk
                    # TODO: fix this!
                    # print("[%s] Saving replay memory to disk." %
                    #       datetime.now().strftime("%Y-%m-%d %H:%M"))
                    # replay_mem.save(replay_mem_dump)

                    sum_actions = float(reduce(lambda x, y: x+y, count_actions))
                    action_str = ""
                    for action_id, action_count in enumerate(count_actions):
                        action_perc = action_count/sum_actions if not sum_actions == 0 else 0
                        action_str += "<%i, %s, %i, %.2f> " % \
                                      (action_id, atari.action_to_string(action_id),
                                       action_count, action_perc)

                    format_str = "[%s] Q-Network Actions Summary: NumRandom: %i, NumGreedy: %i, %s"
                    print(format_str % (datetime.now().strftime("%Y-%m-%d %H:%M"),
                                        count_act_random, count_act_greedy, action_str))

        print("Finished training Q-network.")
Example #16
def create_emulator(args):
    if args.environment == "ale":
        return AtariEnvironment(args)
    else:
        return GymEnvironment(args)
Example #17
args = parser.parse_args()

logger = logging.getLogger()
logger.setLevel(args.log_level)

if args.random_seed:
    random.seed(args.random_seed)

# instantiate classes
if args.environment == 'ale':
    env = ALEEnvironment(args.game, args)
    logger.info("Using ALE Environment")
elif args.environment == 'gym':
    # logger does not work with this line
    #logger.handlers.pop()
    env = GymEnvironment(args.game, args)
    logger.info("Using Gym Environment")
else:
    assert False, "Unknown environment" + args.environment

mem = ReplayMemory(args.replay_size, args)
net = DeepQNetwork(env.numActions(), args)
agent = Agent(env, mem, net, args)
stats = Statistics(agent, net, mem, env, args)

if args.load_weights:
    logger.info("Loading weights from %s" % args.load_weights)
    net.load_weights(args.load_weights)

if args.play_games:
    logger.info("Playing for %d game(s)" % args.play_games)
Example #18
comarg = parser.add_argument_group('Common')
comarg.add_argument("output_folder", help="Where to write results to.")
comarg.add_argument("--num_episodes",
                    type=int,
                    default=10,
                    help="Number of episodes to test.")
comarg.add_argument("--random_seed",
                    type=int,
                    help="Random seed for repeatable experiments.")
args = parser.parse_args()

if args.random_seed:
    random.seed(args.random_seed)

env = GymEnvironment(args.env_id, args)
net = DeepQNetwork(env.numActions(), args)
buf = MemoryBuffer(args)

if args.load_weights:
    print "Loading weights from %s" % args.load_weights
    net.load_weights(args.load_weights)

env.gym.monitor.start(args.output_folder, force=True)
avg_reward = 0
num_episodes = args.num_episodes
for i_episode in xrange(num_episodes):
    env.restart()
    observation = env.getScreen()
    buf.reset()
    i_total_reward = 0
Example #19
antarg.add_argument("--exploration_decay_steps", type=float, default=1000000, help="How many steps to decay the exploration rate.")
antarg.add_argument("--exploration_rate_test", type=float, default=0.05, help="Exploration rate used during testing.")
antarg.add_argument("--train_frequency", type=int, default=4, help="Perform training after this many game steps.")
antarg.add_argument("--train_repeat", type=int, default=1, help="Number of times to sample minibatch during training.")
antarg.add_argument("--random_starts", type=int, default=30, help="Perform max this number of dummy actions after game restart, to produce more random game dynamics.")

mainarg = parser.add_argument_group('Main loop')
mainarg.add_argument("--load_weights", help="Load network from file.")
mainarg.add_argument("--save_weights_prefix", help="Save network to given file. Epoch and extension will be appended.")

comarg = parser.add_argument_group('Common')
comarg.add_argument("output_folder", help="Where to write results to.")
comarg.add_argument("--num_episodes", type=int, default=100, help="Number of episodes to test.")
comarg.add_argument("--random_seed", type=int, help="Random seed for repeatable experiments.")
args = parser.parse_args()

if args.random_seed:
  random.seed(args.random_seed)

env = GymEnvironment(args.env_id, args)
net = DeepQNetwork(env.numActions(), args)
mem = None
agent = Agent(env, mem, net, args)

if args.load_weights:
  print "Loading weights from %s" % args.load_weights
  net.load_weights(args.load_weights)

env.gym.monitor.start(args.output_folder, force=True)
agent.play(args.num_episodes)
env.gym.monitor.close()
Example #20
import agent
import tensorflow as tf
import argparse
from environment import GymEnvironment

env_agent = GymEnvironment(display=True)
agent = agent.DQNAgent(environment=env_agent, display=True)

with tf.Session() as sess:
    agent.build_dqn(sess)
    sess.run(tf.global_variables_initializer())
    agent.load_model()
    agent.play(10)