Example #1
    def __init__(self, main_network, thread_id):
        self.main_network = main_network
        self.thread_id = thread_id

        # Create environment.
        self.env = make_atari(ENV_NAME)
        self.obs_space = self.env.observation_space
        self.action_space = self.env.action_space

        # Local network.
        self.local_network = QValueNetwork(self.obs_space,
                                           self.action_space,
                                           name="local_network_" +
                                           str(thread_id),
                                           auxiliary_network=main_network)
        # Target network.
        self.target_network = QValueNetwork(self.obs_space,
                                            self.action_space,
                                            name="target_network_" +
                                            str(thread_id),
                                            auxiliary_network=main_network)
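
This constructor only wires up the per-thread networks around a shared main network; the enclosing class is not shown. A minimal launch sketch, assuming the class is called Worker and exposes a run(sess) method, and that a NUM_THREADS constant exists (all three are assumptions for illustration only):

# Launch sketch only: `Worker`, `Worker.run(sess)` and NUM_THREADS are assumed names.
import threading

import tensorflow as tf

env = make_atari(ENV_NAME)
main_network = QValueNetwork(env.observation_space, env.action_space,
                             name="main_network")
workers = [Worker(main_network, thread_id=i) for i in range(NUM_THREADS)]

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    # One Python thread per worker, all sharing the same session and main network.
    threads = [threading.Thread(target=w.run, args=(sess,)) for w in workers]
    for t in threads:
        t.start()
    for t in threads:
        t.join()
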
Example #2
def visualize(file_name):
    # Create folders.
    if not os.path.isdir(FIGURE_VISUALIZATION_DIR):
        os.makedirs(FIGURE_VISUALIZATION_DIR)

    # Obtain environment parameters.
    env = make_atari(ENV_NAME)
    obs_space = env.observation_space
    action_space = env.action_space

    # Only build main network for visualization.
    main_network = QValueNetwork(obs_space, action_space, name="main_network")

    obs = env.reset()
    list_obs = []

    with tf.Session() as sess:
        # Load network parameters.
        saver = tf.train.Saver(var_list=main_network.variables)
        saver.restore(sess, SAVE_DIR + file_name)

        while True:
            # Get the raw observation.
            raw_obs = env.render(mode="rgb_array")
            list_obs.append(raw_obs)

            env.render()
            # Get action.
            q = sess.run(main_network.q,
                         feed_dict={
                             main_network.Obs:
                             np.expand_dims(np.array(obs) / 255.0, 0)
                         })
            action = np.argmax(q[0])
            # Interact with the environment.
            obs_next, reward, done, _ = env.step(action)
            if done:
                # Get the last raw observation.
                raw_obs = env.render(mode="rgb_array")
                list_obs.append(raw_obs)
                break
            # Update the observation.
            obs = obs_next

    env.close()

    # Record the gameplay.
    imageio.mimsave(FIGURE_VISUALIZATION_DIR + "gameplay.gif",
                    [plot_obs(obs) for obs in list_obs],
                    fps=30)
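
A possible invocation, assuming the checkpoint file name matches whatever the training script saved (the name below is illustrative, not taken from the source):

# Illustrative call only; "dqn_checkpoint" is an assumed checkpoint name.
if __name__ == "__main__":
    visualize("dqn_checkpoint")
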
Example #3
def worker_process(job_name, task_index, cluster_dict, file_name):
    import tensorflow as tf
    # GPU training.
    if USE_GPU:
        os.environ["CUDA_VISIBLE_DEVICES"] = CUDA_VISIBLE_DEVICES
        gpu_options = tf.GPUOptions(
            per_process_gpu_memory_fraction=PER_PROCESS_GPU_MEMORY_FRACTION)
        config = tf.ConfigProto(gpu_options=gpu_options)
    else:
        config = None

    # Create and start a server for the local task.
    cluster = tf.train.ClusterSpec(cluster_dict)
    server = tf.train.Server(cluster,
                             job_name=job_name,
                             task_index=task_index,
                             config=config)

    if job_name == "ps":
        # Parameter server.
        with tf.device("/job:" + job_name + "/task:" + str(task_index)):
            queue = tf.FIFOQueue(cluster.num_tasks("worker"),
                                 tf.int32,
                                 shared_name="done_queue" + str(task_index))
        # Shut down the parameter server once every worker has pushed a completion token into its queue.
        with tf.Session(server.target) as sess:
            for i in range(cluster.num_tasks("worker")):
                sess.run(queue.dequeue())
        return []

    elif job_name == "worker":
        # Obtain environment parameters.
        env = make_atari(ENV_NAME)
        obs_space = env.observation_space
        action_space = env.action_space

        # Worker.
        with tf.device(
                tf.train.replica_device_setter(worker_device="/job:" +
                                               job_name + "/task:" +
                                               str(task_index),
                                               cluster=cluster)):
            # Build networks.
            main_network = QValueNetwork(obs_space,
                                         action_space,
                                         name="main_network")
            target_network = QValueNetwork(obs_space,
                                           action_space,
                                           name="target_network",
                                           auxiliary_network=main_network)

        replay_buffer = ReplayBuffer(buffer_size=BUFFER_SIZE)
        start_time = time.time()
        list_episodic_reward = []
        episodic_reward = 0
        obs = env.reset()

        # Additional settings for the first worker (task_index = 0).
        if task_index == 0:
            saver = tf.train.Saver(var_list=main_network.variables,
                                   max_to_keep=1)
            next_target_network_update_step = 0
            next_autosave_step = 0

        with tf.train.MonitoredTrainingSession(
                master=server.target,
                is_chief=(task_index == 0),
                config=config,
                save_summaries_steps=None,
                save_summaries_secs=None,
                save_checkpoint_steps=None,
                save_checkpoint_secs=None) as sess:

            # Initialize buffers.
            for _ in range(INITIAL_BUFFER_SIZE):
                # Sample random action.
                action = np.random.randint(action_space.n)
                # Interact with the environment.
                obs_next, reward, done, _ = env.step(action)
                episodic_reward += reward
                if done:
                    obs_next = env.reset()
                    episodic_reward = 0
                # Store data.
                data = [obs, action, reward, done, obs_next]
                replay_buffer.append(data)
                # Update observation.
                obs = obs_next

            # Run until reaching maximum training steps.
            while sess.run(main_network.global_step) < TOTAL_STEP:
                global_step = sess.run(main_network.global_step)
                if task_index == 0:
                    # Synchronize the target network periodically (target network <- main network).
                    if global_step >= next_target_network_update_step:
                        sess.run(target_network.sync_op)
                        next_target_network_update_step += TARGET_NETWORK_UPDATE_STEP

                # Sample action with epsilon-greedy policy.
                epsilon = EPSILON_MAX - (
                    EPSILON_MAX - EPSILON_MIN) * np.minimum(
                        global_step / EPSILON_DECAY_STEP, 1)
                if np.random.uniform() < epsilon:
                    action = np.random.randint(action_space.n)
                else:
                    q = sess.run(target_network.q,
                                 feed_dict={
                                     target_network.Obs:
                                     np.expand_dims(np.array(obs) / 255.0, 0)
                                 })
                    action = np.argmax(q[0])
                # Interact with the environment.
                obs_next, reward, done, _ = env.step(action)
                episodic_reward += reward
                if done:
                    obs_next = env.reset()
                    list_episodic_reward.append((global_step, episodic_reward))
                    delta_time = int(time.time() - start_time)
                    print("Step ",
                          global_step,
                          "/",
                          TOTAL_STEP,
                          ": Time spent = ",
                          delta_time,
                          " s , Episodic reward = ",
                          episodic_reward,
                          sep="")
                    episodic_reward = 0
                # Store data.
                data = [obs, action, reward, done, obs_next]
                replay_buffer.append(data)
                # Update observation.
                obs = obs_next

                # Learning rate.
                lr = LEARNING_RATE[-1]
                for i in range(len(LR_ANNEAL_STEP)):
                    if global_step < LR_ANNEAL_STEP[i]:
                        lr = LEARNING_RATE[i]
                        break

                # Sample training data from the replay buffer.
                batch_data = replay_buffer.sample(BATCH_SIZE)
                batch_obs, batch_action, batch_reward, batch_done, batch_obs_next = \
                  [np.array([batch_data[j][i] for j in range(BATCH_SIZE)]) for i in range(len(batch_data[0]))]

                # Compute the target Q value:
                #   target_q = r + (1 - done) * REWARD_DISCOUNT * max[q(s', a)]
                q_next = sess.run(
                    target_network.q,
                    feed_dict={target_network.Obs: batch_obs_next / 255.0})
                max_qnext = np.amax(q_next, axis=1)
                target_q = batch_reward + (
                    1 - batch_done) * REWARD_DISCOUNT * max_qnext

                # Update the main network with the gradients computed by this worker.
                sess.run(main_network.train_op,
                         feed_dict={
                             main_network.Obs: batch_obs / 255.0,
                             main_network.Action: batch_action,
                             main_network.TargetQ: target_q,
                             main_network.LR: lr
                         })

                if task_index == 0:
                    # Save the main network periodically.
                    if global_step >= next_autosave_step:
                        saver.save(sess._sess._sess._sess._sess,
                                   SAVE_DIR + file_name)
                        next_autosave_step += AUTOSAVE_STEP

            if task_index == 0:
                # Save the main network.
                saver.save(sess._sess._sess._sess._sess, SAVE_DIR + file_name)

        tf.contrib.keras.backend.clear_session()
        # Close the environment.
        env.close()

        queues = []
        # Create a shared queue on the worker which is visible on the parameter server.
        for i in range(cluster.num_tasks("ps")):
            with tf.device("/job:ps/task:" + str(i)):
                queue = tf.FIFOQueue(cluster.num_tasks("worker"),
                                     tf.int32,
                                     shared_name="done_queue" + str(i))
                queues.append(queue)
        # Notify all parameter servers that the current worker has finished the task.
        with tf.Session(server.target) as sess:
            for i in range(cluster.num_tasks("ps")):
                sess.run(queues[i].enqueue(task_index))
        # Release memory when a worker is finished.
        tf.contrib.keras.backend.clear_session()

        return list_episodic_reward
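
A minimal launch sketch for this distributed setup, assuming one parameter server and two workers on localhost (the addresses, ports, and checkpoint name are illustrative, not from the source):

# Sketch only: cluster addresses, ports, and the checkpoint name are assumptions.
import multiprocessing as mp

if __name__ == "__main__":
    cluster_dict = {
        "ps": ["localhost:2222"],
        "worker": ["localhost:2223", "localhost:2224"],
    }
    processes = []
    for job_name in ("ps", "worker"):
        for task_index in range(len(cluster_dict[job_name])):
            # Each task runs in its own process; the workers' returned episodic
            # rewards are discarded in this simple sketch.
            p = mp.Process(target=worker_process,
                           args=(job_name, task_index, cluster_dict,
                                 "dqn_checkpoint"))
            p.start()
            processes.append(p)
    for p in processes:
        p.join()
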
Example #4
    def __init__(self,
                 obs_shape,
                 num_action,
                 network_type,
                 gamma=1.0,
                 grad_max_norm=10):
        # Build networks.
        self.Obs = tf.placeholder(tf.float32, (None, *obs_shape))
        self.ObsTarget = tf.placeholder(tf.float32, (None, *obs_shape))
        main_network = QValueNetwork(self.Obs,
                                     num_action,
                                     network_type,
                                     name="main_network")
        target_network = QValueNetwork(self.ObsTarget,
                                       num_action,
                                       network_type,
                                       name="target_network")

        # Update target network.
        self.update_target_network_op = [
            tf.assign(ref, value) for ref, value in zip(
                target_network.variables, main_network.variables)
        ]

        # Sample actions.
        self.Epsilon = tf.placeholder(tf.float32, ())
        batch_size = tf.shape(self.Obs)[0]
        random_action = tf.random_uniform(tf.stack([batch_size]),
                                          minval=0,
                                          maxval=num_action,
                                          dtype=tf.int64)
        self.greedy_action = tf.argmax(main_network.q, axis=1)
        random_value = tf.random.uniform(tf.stack([batch_size]),
                                         minval=0,
                                         maxval=1,
                                         dtype=tf.float32)
        self.epsilon_action = tf.where(random_value < self.Epsilon,
                                       random_action, self.greedy_action)

        # Train.
        self.Action = tf.placeholder(tf.int32, (None, ))
        self.Reward = tf.placeholder(tf.float32, (None, ))
        self.Done = tf.placeholder(tf.float32, (None, ))
        self.Weights = tf.placeholder(tf.float32, (None, ))

        # Double Q-learning.
        #   a_next = argmax(q_main_network(obs_next))
        #   q_target = reward + (1 - done) * gamma * q_target_network(obs_next, a_next)
        main_network_for_target_evaluation = QValueNetwork(self.ObsTarget,
                                                           num_action,
                                                           network_type,
                                                           name="main_network")
        action_next = tf.argmax(main_network_for_target_evaluation.q, axis=1)
        q_next = tf.reduce_sum(target_network.q *
                               tf.one_hot(action_next, num_action),
                               axis=1)
        q_target = self.Reward + (1 - self.Done) * gamma * q_next

        # Loss function.
        q_a = tf.reduce_sum(main_network.q *
                            tf.one_hot(self.Action, num_action),
                            axis=1)
        self.td_error = q_target - q_a
        loss = tf.reduce_mean(self.Weights *
                              huber_loss(self.td_error, delta=1.0))

        # Optimization.
        self.LearningRate = tf.placeholder(tf.float32, ())
        optimizer = tf.train.AdamOptimizer(self.LearningRate)
        gradients = optimizer.compute_gradients(
            loss, var_list=main_network.trainable_variables)
        gradients = [(tf.clip_by_norm(grad, grad_max_norm), var)
                     for grad, var in gradients]
        self.train_op = optimizer.apply_gradients(gradients)
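
A usage sketch for this graph, assuming the constructor belongs to a class named DoubleDQN; the class name, the obs_shape, the network_type string, and all hyperparameter values below are assumptions for illustration:

# Usage sketch only: `DoubleDQN`, "cnn", the shapes, and the batch values are assumed.
import numpy as np
import tensorflow as tf

agent = DoubleDQN(obs_shape=(84, 84, 4), num_action=4, network_type="cnn",
                  gamma=0.99)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())

    # Epsilon-greedy action for a single observation.
    obs = np.zeros((1, 84, 84, 4), dtype=np.float32)
    action = sess.run(agent.epsilon_action,
                      feed_dict={agent.Obs: obs, agent.Epsilon: 0.1})

    # One training step on a toy batch; Weights would hold importance-sampling
    # weights from prioritized replay, or all ones for uniform replay.
    batch = 32
    sess.run(agent.train_op,
             feed_dict={
                 agent.Obs: np.zeros((batch, 84, 84, 4)),
                 agent.ObsTarget: np.zeros((batch, 84, 84, 4)),
                 agent.Action: np.zeros(batch, dtype=np.int32),
                 agent.Reward: np.zeros(batch, dtype=np.float32),
                 agent.Done: np.zeros(batch, dtype=np.float32),
                 agent.Weights: np.ones(batch, dtype=np.float32),
                 agent.LearningRate: 1e-4,
             })

    # Periodically copy the main-network weights into the target network.
    sess.run(agent.update_target_network_op)
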
Example #5
def train(file_name):
  # Create folders.
  if not os.path.isdir(SAVE_DIR):
    os.makedirs(SAVE_DIR)
  if not os.path.isdir(FIGURE_TRAINING_DIR):
    os.makedirs(FIGURE_TRAINING_DIR)
  
  # Obtain environment parameters.
  env = make_atari(ENV_NAME)
  obs_space = env.observation_space
  action_space = env.action_space
  
  # Build networks.
  main_network = QValueNetwork(obs_space, action_space, name = "main_network")
  target_network = QValueNetwork(obs_space, action_space, name = "target_network", auxiliary_network = main_network)
  variables_initializer = tf.global_variables_initializer()
  
  replay_buffer = ReplayBuffer(buffer_size = BUFFER_SIZE)
  start_time = time.time()
  list_episodic_reward = []
  episodic_reward = 0
  
  obs = env.reset()
  
  with tf.Session() as sess:
    # Initialize all variables.
    sess.run(variables_initializer)
    # Only save the main network.
    saver = tf.train.Saver(var_list = main_network.variables)
    
    # Initialize buffers.
    for _ in range(INITIAL_BUFFER_SIZE):
      # Sample random action.
      action = np.random.randint(action_space.n)
      # Interact with the environment.
      obs_next, reward, done, _ = env.step(action)
      episodic_reward += reward
      if done:
        obs_next = env.reset()
        episodic_reward = 0
      # Store data.
      data = [obs, action, reward, done, obs_next]
      replay_buffer.append(data)
      # Update observation.
      obs = obs_next
    
    for step in range(TOTAL_STEP):
      # Synchronize the target network periodically (target network <- main network).
      if step % TARGET_NETWORK_UPDATE_STEP == 0:
        sess.run(target_network.sync_op)
      
      # Sample action with epsilon-greedy policy.
      epsilon = EPSILON_MAX - (EPSILON_MAX - EPSILON_MIN) * np.minimum(step / EPSILON_DECAY_STEP, 1)
      if np.random.uniform() < epsilon:
        action = np.random.randint(action_space.n)
      else:
        q = sess.run(target_network.q, feed_dict = {target_network.Obs: np.expand_dims(np.array(obs) / 255.0, 0)})
        action = np.argmax(q[0])
      # Interact with the environment.
      obs_next, reward, done, _ = env.step(action)
      episodic_reward += reward
      if done:
        obs_next = env.reset()
        list_episodic_reward.append((step, episodic_reward))
        delta_time = int(time.time() - start_time)
        print("Step ", step, "/", TOTAL_STEP, ": Time spent = ", delta_time, " s , Episodic reward = ", episodic_reward, sep = "")
        episodic_reward = 0
      # Store data.
      data = [obs, action, reward, done, obs_next]
      replay_buffer.append(data)
      # Update observation.
      obs = obs_next
      
      # Learning rate.
      lr = LEARNING_RATE[-1]
      for i in range(len(LR_ANNEAL_STEP)):
        if step < LR_ANNEAL_STEP[i]:
          lr = LEARNING_RATE[i]
          break
      
      # Sample training data from the replay buffer.
      batch_data = replay_buffer.sample(BATCH_SIZE)
      batch_obs, batch_action, batch_reward, batch_done, batch_obs_next = \
        [np.array([batch_data[j][i] for j in range(BATCH_SIZE)]) for i in range(len(batch_data[0]))]
      
      # Compute the target Q value:
      #   target_q = r + (1 - done) * REWARD_DISCOUNT * max[q(s', a)]
      q_next = sess.run(target_network.q, feed_dict = {target_network.Obs: batch_obs_next / 255.0})
      max_qnext = np.amax(q_next, axis = 1)
      target_q = batch_reward + (1 - batch_done) * REWARD_DISCOUNT * max_qnext
      
      # Update the main network.
      sess.run(main_network.train_op, feed_dict = {
        main_network.Obs: batch_obs / 255.0, main_network.Action: batch_action, main_network.TargetQ: target_q, main_network.LR: lr
        })
      
      # Save the main network periodically.
      if step % AUTOSAVE_STEP == 0:
        saver.save(sess, SAVE_DIR + file_name)
    
    # Save the main network.
    saver = tf.train.Saver(var_list = main_network.variables)
    saver.save(sess, SAVE_DIR + file_name)
  
  total_time = int(time.time() - start_time)
  print("Training finished in ", total_time, " s.", sep = "")
  
  # Close the environment.
  env.close()
  
  # Plot the episodic reward against training step curve.
  plot_episodic_reward(list_episodic_reward, file_name)
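
train() (like the other examples) relies on module-level constants that are not part of this snippet. The values below are a hedged sketch with common DQN defaults, purely for illustration; the source project's actual settings may differ.

# Illustrative configuration only; none of these values come from the source.
ENV_NAME = "BreakoutNoFrameskip-v4"
SAVE_DIR = "./save/"
FIGURE_TRAINING_DIR = "./figure/training/"
FIGURE_VISUALIZATION_DIR = "./figure/visualization/"
TOTAL_STEP = 2000000
BUFFER_SIZE = 100000
INITIAL_BUFFER_SIZE = 10000
BATCH_SIZE = 32
REWARD_DISCOUNT = 0.99
TARGET_NETWORK_UPDATE_STEP = 10000
AUTOSAVE_STEP = 100000
EPSILON_MAX = 1.0
EPSILON_MIN = 0.1
EPSILON_DECAY_STEP = 1000000
# LEARNING_RATE has one more entry than LR_ANNEAL_STEP; the last entry is used
# once training passes the final anneal step.
LEARNING_RATE = [1e-4, 5e-5, 2.5e-5]
LR_ANNEAL_STEP = [500000, 1000000]
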
Example #6
def train(file_name):
    # Create folders.
    if not os.path.isdir(SAVE_DIR):
        os.makedirs(SAVE_DIR)
    if not os.path.isdir(FIGURE_TRAINING_DIR):
        os.makedirs(FIGURE_TRAINING_DIR)

    # Obtain environment parameters.
    env = make_atari(ENV_NAME)
    obs_space = env.observation_space
    action_space = env.action_space
    env.close()

    # Build networks.
    main_network = QValueNetwork(obs_space, action_space, name="main_network")
    target_network = QValueNetwork(obs_space,
                                   action_space,
                                   name="target_network",
                                   auxiliary_network=main_network)
    variables_initializer = tf.global_variables_initializer()

    # Create parallel environments.
    par_env = ParallelEnvironment(
        [make_atari(ENV_NAME) for _ in range(NUM_ENV)])

    replay_buffer = ReplayBuffer(buffer_size=BUFFER_SIZE)
    start_time = time.time()
    list_episodic_reward = []
    episodic_reward = np.zeros(NUM_ENV)

    obs = par_env.reset()

    with tf.Session() as sess:
        # Initialize all variables.
        sess.run(variables_initializer)
        # Only save the main network.
        saver = tf.train.Saver(var_list=main_network.variables)

        # Initialize buffers.
        while replay_buffer.get_size() < INITIAL_BUFFER_SIZE:
            # Sample random action.
            action = np.random.randint(action_space.n, size=NUM_ENV)
            # Interact with the environment.
            obs_next, reward, done, _ = par_env.step(action)
            episodic_reward += reward
            for i in range(NUM_ENV):
                if done[i]:
                    episodic_reward[i] = 0
            # Store data.
            for i in range(NUM_ENV):
                data = [obs[i], action[i], reward[i], done[i], obs_next[i]]
                replay_buffer.append(data)
            # Update observation.
            obs = obs_next

        step = 0
        next_target_network_update_step = 0
        next_autosave_step = 0
        while step < TOTAL_STEP:
            # Synchronize the target network periodically (target network <- main network).
            if step >= next_target_network_update_step:
                sess.run(target_network.sync_op)
                next_target_network_update_step += TARGET_NETWORK_UPDATE_STEP

            # Sample action with epsilon-greedy policy.
            epsilon = EPSILON_MAX - (EPSILON_MAX - EPSILON_MIN) * np.minimum(
                step / EPSILON_DECAY_STEP, 1)
            random_uniform = np.random.uniform(size=NUM_ENV)
            action = np.zeros(NUM_ENV, dtype=np.int32)
            random_action_index = np.argwhere(random_uniform < epsilon)
            if np.shape(random_action_index)[0] > 0:
                action[tuple(
                    np.transpose(random_action_index))] = np.random.randint(
                        action_space.n, size=np.shape(random_action_index)[0])
            greedy_action_index = np.argwhere(random_uniform >= epsilon)
            if np.shape(greedy_action_index)[0] > 0:
                q = sess.run(target_network.q,
                             feed_dict={
                                 target_network.Obs:
                                 np.array(obs)[tuple(
                                     np.transpose(greedy_action_index))] /
                                 255.0
                             })
                action[tuple(np.transpose(greedy_action_index))] = np.argmax(
                    q, axis=1)
            # Interact with the environment.
            obs_next, reward, done, _ = par_env.step(action)
            episodic_reward += reward
            for i in range(NUM_ENV):
                if done[i]:
                    list_episodic_reward.append((step, episodic_reward[i]))
                    delta_time = int(time.time() - start_time)
                    print("Step ",
                          step,
                          "/",
                          TOTAL_STEP,
                          ": Time spent = ",
                          delta_time,
                          " s , Episodic reward = ",
                          episodic_reward[i],
                          sep="")
                    episodic_reward[i] = 0
            # Store data.
            for i in range(NUM_ENV):
                data = [obs[i], action[i], reward[i], done[i], obs_next[i]]
                replay_buffer.append(data)
            # Update observation.
            obs = obs_next

            # Learning rate.
            lr = LEARNING_RATE[-1]
            for i in range(len(LR_ANNEAL_STEP)):
                if step < LR_ANNEAL_STEP[i]:
                    lr = LEARNING_RATE[i]
                    break

            for _ in range(NUM_ENV):
                # Sample training data from the replay buffer.
                batch_data = replay_buffer.sample(BATCH_SIZE)
                batch_obs, batch_action, batch_reward, batch_done, batch_obs_next = \
                  [np.array([batch_data[j][i] for j in range(BATCH_SIZE)]) for i in range(len(batch_data[0]))]

                # Compute the target Q value:
                #   target_q = r + (1 - done) * REWARD_DISCOUNT * max[q(s', a)]
                q_next = sess.run(
                    target_network.q,
                    feed_dict={target_network.Obs: batch_obs_next / 255.0})
                max_qnext = np.amax(q_next, axis=1)
                target_q = batch_reward + (
                    1 - batch_done) * REWARD_DISCOUNT * max_qnext

                # Update the main network.
                sess.run(main_network.train_op,
                         feed_dict={
                             main_network.Obs: batch_obs / 255.0,
                             main_network.Action: batch_action,
                             main_network.TargetQ: target_q,
                             main_network.LR: lr
                         })

            # Save the main network periodically.
            if step >= next_autosave_step:
                saver.save(sess, SAVE_DIR + file_name)
                next_autosave_step += AUTOSAVE_STEP

            # Update step.
            step += NUM_ENV

        # Save the main network.
        saver = tf.train.Saver(var_list=main_network.variables)
        saver.save(sess, SAVE_DIR + file_name)

    total_time = int(time.time() - start_time)
    print("Training finished in ", total_time, " s.", sep="")

    # Close the environment.
    par_env.close()

    # Plot the episodic reward against training step curve.
    plot_episodic_reward(list_episodic_reward, file_name)
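
The loop above assumes a ParallelEnvironment wrapper whose definition is not shown. Below is a minimal sequential stand-in inferred from how it is used (the project's real class may step environments in subprocesses instead). Note that step() must reset an environment internally when its episode ends, since the training loop never calls reset() after the first time.

# Sketch only: interface inferred from usage; not the source project's implementation.
import numpy as np

class ParallelEnvironment:
    def __init__(self, envs):
        self.envs = envs

    def reset(self):
        # Per-environment initial observations.
        return [env.reset() for env in self.envs]

    def step(self, actions):
        obs_next, rewards, dones, infos = [], [], [], []
        for env, action in zip(self.envs, actions):
            obs, reward, done, info = env.step(action)
            if done:
                obs = env.reset()  # Auto-reset so the caller never has to.
            obs_next.append(obs)
            rewards.append(reward)
            dones.append(done)
            infos.append(info)
        return obs_next, np.array(rewards), np.array(dones), infos

    def close(self):
        for env in self.envs:
            env.close()
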