Example #1
def main(args):

    with tf.Session() as sess:

        target_pos = np.array([10., 10., 10.])
        init_pose = np.array([0., 0., 0.1, 0., 0., 0.])  # initial pose

        env = Takeoff_Task(init_pose, target_pos=target_pos)
        np.random.seed(int(args['random_seed']))
        tf.set_random_seed(int(args['random_seed']))

        state_dim = env.state_size
        action_dim = env.action_size
        action_bound = env.action_high

        actor = ActorNetwork(sess, state_dim, action_dim, action_bound,
                             float(args['actor_lr']), float(args['tau']),
                             int(args['minibatch_size']))

        critic = CriticNetwork(sess, state_dim, action_dim,
                               float(args['critic_lr']), float(args['tau']),
                               float(args['gamma']))

        actor_noise = OUActionNoise(mu=np.zeros(action_dim), sigma=0.2)

        train(sess, env, args, actor, critic, actor_noise)
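
Every example on this page constructs an OUActionNoise object to generate temporally correlated exploration noise for DDPG, but the class itself is not reproduced here. Below is a minimal sketch of a typical Ornstein-Uhlenbeck noise class with this constructor signature; the theta, dt and x0 parameters and the get_noise alias are illustrative assumptions, not code taken from the repositories quoted on this page.

import numpy as np

class OUActionNoise:
    # Sketch only: theta, dt and x0 defaults are assumptions for illustration.
    def __init__(self, mu, sigma=0.15, theta=0.2, dt=1e-2, x0=None):
        self.mu = mu
        self.sigma = sigma
        self.theta = theta
        self.dt = dt
        self.x0 = x0
        self.reset()

    def __call__(self):
        # OU step: x_{t+1} = x_t + theta*(mu - x_t)*dt + sigma*sqrt(dt)*N(0, 1)
        x = (self.x_prev
             + self.theta * (self.mu - self.x_prev) * self.dt
             + self.sigma * np.sqrt(self.dt) * np.random.normal(size=self.mu.shape))
        self.x_prev = x
        return x

    # Example #7 calls get_noise() and reset(); expose get_noise as an alias.
    get_noise = __call__

    def reset(self):
        # Restart the process at x0 (or at the origin) at the start of each episode.
        self.x_prev = self.x0 if self.x0 is not None else np.zeros_like(self.mu)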
Example #2
    def __init__(self,
                 alpha,
                 beta,
                 input_dims,
                 tau,
                 n_actions,
                 gamma=0.99,
                 max_size=1000000,
                 fc1_dims=400,
                 fc2_dims=300,
                 batch_size=64):
        self.gamma = gamma
        self.tau = tau
        self.batch_size = batch_size
        self.alpha = alpha
        self.beta = beta

        self.memory = ReplayBuffer(max_size, input_dims, n_actions)

        self.noise = OUActionNoise(mu=np.zeros(n_actions))

        self.actor = ActorNetwork(alpha,
                                  input_dims,
                                  fc1_dims,
                                  fc2_dims,
                                  n_actions=n_actions,
                                  name='actor')
        self.critic = CriticNetwork(beta,
                                    input_dims,
                                    fc1_dims,
                                    fc2_dims,
                                    n_actions=n_actions,
                                    name='critic')

        self.target_actor = ActorNetwork(alpha,
                                         input_dims,
                                         fc1_dims,
                                         fc2_dims,
                                         n_actions=n_actions,
                                         name='target_actor')

        self.target_critic = CriticNetwork(beta,
                                           input_dims,
                                           fc1_dims,
                                           fc2_dims,
                                           n_actions=n_actions,
                                           name='target_critic')

        self.update_network_parameters(tau=1)
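
Examples #2 and #5 finish their constructors with update_network_parameters(tau=1), which hard-copies the online weights into the target networks (a smaller tau gives the usual Polyak soft update). The method itself is not shown on this page; assuming the networks are torch.nn.Module instances, a common way to write it is the following sketch, not the quoted repository's code:

    def update_network_parameters(self, tau=None):
        # Sketch assuming torch.nn.Module networks.
        # Polyak averaging: target <- tau * online + (1 - tau) * target.
        if tau is None:
            tau = self.tau

        for target_param, param in zip(self.target_actor.parameters(),
                                       self.actor.parameters()):
            target_param.data.copy_(tau * param.data +
                                    (1.0 - tau) * target_param.data)

        for target_param, param in zip(self.target_critic.parameters(),
                                       self.critic.parameters()):
            target_param.data.copy_(tau * param.data +
                                    (1.0 - tau) * target_param.data)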
Example #3
    def __init__(self,
                 alpha,
                 beta,
                 input_dims,
                 tau,
                 env,
                 gamma=0.99,
                 n_actions=2,
                 buffer_size=1e6,
                 batch_size=64):
        self.gamma = gamma
        self.tau = tau
        self.batch_size = batch_size

        self.replay_buffer = ReplayBuffer(buffer_size)
        self.sess = tf.Session()

        self.actor = Actor(alpha, input_dims, n_actions, 'Actor', self.sess,
                           env.action_space.high)
        self.critic = Critic(beta, input_dims, n_actions, 'Critic', self.sess)
        self.target_actor = Actor(alpha, input_dims, n_actions, 'TargetActor',
                                  self.sess, env.action_space.high)
        self.target_critic = Critic(beta, input_dims, n_actions,
                                    'TargetCritic', self.sess)

        self.noise = OUActionNoise(mu=np.zeros(n_actions))

        self.update_critic = [
            self.target_critic.params[i].assign(
                tf.multiply(self.critic.params[i], self.tau) +
                tf.multiply(self.target_critic.params[i], 1. - self.tau))
            for i in range(len(self.target_critic.params))
        ]

        self.update_actor = [
            self.target_actor.params[i].assign(
                tf.multiply(self.actor.params[i], self.tau) +
                tf.multiply(self.target_actor.params[i], 1. - self.tau))
            for i in range(len(self.target_actor.params))
        ]

        self.sess.run(tf.global_variables_initializer())

        self.update_network_parameters(first=True)
Example #4
    def __init__(self, alpha, beta, input_dims, tau, env, brain_name, gamma=.99,
                 n_actions=2, mem_capacity=1e6, layer1_size=400,
                 layer2_size=300, batch_size=64, multiagent=False,
                 n_agents=None, game_name='Rollerball'):

        # Initialize memory
        self.batch_size = batch_size
        self.memory = ReplayBuffer(mem_capacity)
        
        # Initialize noise
        self.noise = OUActionNoise(np.zeros(n_actions))

        # Setup device used for torch computations
        self.device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

        # Create actor critic and target networks
        self.actor = ActorNet(
            alpha, input_dims, layer1_size, layer2_size, n_actions,
            name='actor_' + game_name + '_ddpg_model').to(self.device)
        self.target_actor = ActorNet(alpha, input_dims, layer1_size,
                                     layer2_size, n_actions).to(self.device)

        self.critic = CriticNet(
            beta, input_dims, layer1_size, layer2_size, n_actions,
            name='critic_' + game_name + '_ddpg_model').to(self.device)
        self.target_critic = CriticNet(beta, input_dims, layer1_size,
                                       layer2_size, n_actions).to(self.device)
        
        # Initialize target nets to be identical to actor and critic networks
        self.init_networks()

        # Target networks set to eval, since they are not 
        # trained but simply updated with the target_network_update function
        self.target_actor.eval()
        self.target_critic.eval()

        # Set global parameters
        self.gamma = gamma
        self.env = env
        self.tau = tau
        self.state_space = input_dims
        self.action_space = n_actions
        self.multiagent = multiagent
        self.brain_name = brain_name
        if self.multiagent:
            self.n_agents = n_agents

        # Plotter object for showing live training graphs and saving them
        self.plotter = RLPlots('ddpg_training')
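
Examples #4 and #6 rely on an init_networks() helper to synchronize the target networks with the freshly created actor and critic before training. That helper is not part of the excerpts; for torch.nn.Module networks, a minimal sketch of such a method would be the following (an assumption, not the original code):

    def init_networks(self):
        # Sketch: hard-copy the online weights so the targets start out identical.
        self.target_actor.load_state_dict(self.actor.state_dict())
        self.target_critic.load_state_dict(self.critic.state_dict())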
Example #5
    def __init__(self, n_states, n_actions, lr_actor, lr_critic, tau, gamma,
                 mem_size, actor_l1_size, actor_l2_size, critic_l1_size,
                 critic_l2_size, batch_size):

        self.gamma = gamma
        self.tau = tau
        self.memory = ReplayBuffer(mem_size, n_states, n_actions)
        self.batch_size = batch_size

        self.actor = Actor(lr_actor, n_states, n_actions, actor_l1_size,
                           actor_l2_size)
        self.critic = Critic(lr_critic, n_states, n_actions, critic_l1_size,
                             critic_l2_size)

        self.target_actor = Actor(lr_actor, n_states, n_actions, actor_l1_size,
                                  actor_l2_size)
        self.target_critic = Critic(lr_critic, n_states, n_actions,
                                    critic_l1_size, critic_l2_size)

        self.noise = OUActionNoise(mu=np.zeros(n_actions), sigma=0.005)

        self.update_network_parameters(tau=1)
Example #6
    def __init__(self,
                 alpha,
                 beta,
                 tau,
                 gamma,
                 state_space,
                 l1_size,
                 l2_size,
                 l3_size,
                 l4_size,
                 action_space,
                 env,
                 brain_name,
                 multibrain,
                 version,
                 mem_capacity=1e6,
                 batch_size=128,
                 multiagent=False,
                 n_agents=None,
                 eval=False):

        # Initialize memory
        self.batch_size = batch_size
        self.memory = ReplayBuffer(mem_capacity)

        # Initialize noise
        # In case of a multiagent environment, create a separate noise object for each agent
        self.noise = ([OUActionNoise(np.zeros(action_space))
                       for _ in range(n_agents)] if multiagent
                      else OUActionNoise(np.zeros(action_space)))

        # Setup device used for torch computations
        self.device = torch.device(
            'cuda:0' if torch.cuda.is_available() else 'cpu')

        # Create actor critic and target networks
        self.actor = ActorNet(alpha,
                              state_space,
                              l1_size,
                              l2_size,
                              l3_size,
                              l4_size,
                              action_space,
                              name='actor_' + version + '_ddpg_model').to(
                                  self.device)
        self.target_actor = ActorNet(alpha, state_space, l1_size, l2_size,
                                     l3_size, l4_size,
                                     action_space).to(self.device)

        self.critic = CriticNet(beta,
                                state_space,
                                l1_size,
                                l2_size,
                                l3_size,
                                l4_size,
                                action_space,
                                name='critic_' + version + '_ddpg_model').to(
                                    self.device)
        self.target_critic = CriticNet(beta, state_space, l1_size, l2_size,
                                       l3_size, l4_size,
                                       action_space).to(self.device)

        # Initialize target nets to be identical to actor and critic networks
        self.init_networks()

        # Target networks set to eval, since they are not
        # trained but simply updated with the target_network_update function
        self.target_actor.eval()
        self.target_critic.eval()

        # Set global parameters
        self.gamma = gamma
        self.env = env
        self.tau = tau
        self.eval = eval
        self.state_space = state_space
        self.action_space = action_space
        self.multiagent = multiagent
        self.multibrain = multibrain
        self.brain_name = brain_name
        self.n_agents = n_agents if self.multiagent else None

        # Initialize plotter for showing live training graphs and saving them
        self.plotter = RLPlots('ddpg_training')
Example #7
def training(file_name):
    # Create folders.
    if not os.path.isdir(SAVE_DIR):
        os.makedirs(SAVE_DIR)
    if not os.path.isdir(CSV_DIR):
        os.makedirs(CSV_DIR)
    if not os.path.isdir(FIGURE_TRAINING_DIR):
        os.makedirs(FIGURE_TRAINING_DIR)

    # Load models.
    actor = Actor(name="actor")
    actor_target = Actor(name="actor_target")
    actor_initial_update_op = target_update_op(
        actor.trainable_variables, actor_target.trainable_variables, 1.0)
    actor_target_update_op = target_update_op(actor.trainable_variables,
                                              actor_target.trainable_variables,
                                              TARGET_UPDATE_RATE)

    critic = Critic(name="critic")
    critic.build_training()
    critic_target = Critic(name="critic_target")
    critic_initial_update_op = target_update_op(
        critic.trainable_variables, critic_target.trainable_variables, 1.0)
    critic_target_update_op = target_update_op(
        critic.trainable_variables, critic_target.trainable_variables,
        TARGET_UPDATE_RATE)

    critic_with_actor = Critic(name="critic", A=actor.pi)
    actor.build_training(critic_with_actor.actor_loss)

    env = PendulumEnv()
    replay_buffer = ReplayBuffer(BUFFER_SIZE)
    action_noise = OUActionNoise(np.zeros(A_LENGTH))

    with tf.Session() as sess:
        # Initialize actor and critic networks.
        sess.run(tf.global_variables_initializer())
        sess.run([actor_initial_update_op, critic_initial_update_op])

        list_final_reward = []

        additional_episode = int(np.ceil(MIN_BUFFER_SIZE / MAX_FRAME))
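        # Episodes with a negative index only fill the replay buffer (the
        # "episode >= 0" check below skips the updates), so at least
        # MIN_BUFFER_SIZE transitions are stored before training begins.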
        for episode in range(-additional_episode, MAX_EPISODE):
            list_actor_loss = []
            list_critic_loss = []

            # Reset the environment and noise.
            s = env.reset()
            action_noise.reset()

            for step in range(MAX_FRAME):
                env.render()

                # Get action.
                a = sess.run(actor.pi,
                             feed_dict={actor.S: np.reshape(s, (1, -1))})
                noise = action_noise.get_noise()
                a = a[0] + ACTION_SCALING * noise
                a = np.clip(a, -ACTION_SCALING, ACTION_SCALING)

                # Interact with the game engine.
                s1, r, _, _ = env.step(a)

                # Add data to the replay buffer.
                data = [s, a, [r], s1]
                replay_buffer.append(data)

                if episode >= 0:
                    for _ in range(BATCHES_PER_STEP):
                        # Sample data from the replay buffer.
                        batch_data = replay_buffer.sample(BATCH_SIZE)
                        batch_s, batch_a, batch_r, batch_s1 = [
                            np.array(
                                [batch_data[j][i] for j in range(BATCH_SIZE)])
                            for i in range(len(batch_data[0]))
                        ]

                        # Compute the next action.
                        a1 = sess.run(actor_target.pi,
                                      feed_dict={actor_target.S: batch_s1})

                        # Compute the target Q.
                        q1 = sess.run(critic_target.q,
                                      feed_dict={
                                          critic_target.S: batch_s1,
                                          critic_target.A: a1
                                      })
                        q_target = batch_r + DISCOUNT * q1

                        # Update actor and critic.
                        _, _, actor_loss, critic_loss = sess.run(
                            [
                                actor.train_op, critic.train_op,
                                actor.actor_loss, critic.critic_loss
                            ],
                            feed_dict={
                                actor.S: batch_s,
                                critic_with_actor.S: batch_s,
                                actor.LR: LR_ACTOR,
                                critic.S: batch_s,
                                critic.A: batch_a,
                                critic.QTarget: q_target,
                                critic.LR: LR_CRITIC
                            })
                        list_actor_loss.append(actor_loss)
                        list_critic_loss.append(critic_loss)

                        # Update target networks.
                        sess.run(
                            [actor_target_update_op, critic_target_update_op])

                s = s1

            # Postprocessing after each episode.
            if episode >= 0:
                list_final_reward.append(r)
                avg_actor_loss = np.mean(list_actor_loss)
                avg_critic_loss = np.mean(list_critic_loss)
                print("Episode ", format(episode, "03d"), ":", sep="")
                print("  Final Reward = ",
                      format(r, ".6f"),
                      ", Actor Loss = ",
                      format(avg_actor_loss, ".6f"),
                      ", Critic Loss = ",
                      format(avg_critic_loss, ".6f"),
                      sep="")

        # Testing.
        avg_reward = 0
        for i in range(TEST_EPISODE):
            # Reset the environment and noise.
            s = env.reset()
            action_noise.reset()

            for step in range(MAX_FRAME):
                env.render()

                # Get action.
                a = sess.run(actor.pi,
                             feed_dict={actor.S: np.reshape(s, (1, -1))})
                a = a[0]

                # Interact with the game engine.
                s, r, _, _ = env.step(a)

            # Postprocessing after each episode.
            avg_reward += r
        avg_reward /= TEST_EPISODE

        # Save the parameters.
        saver = tf.train.Saver(
            [*actor.trainable_variables, *critic.trainable_variables])
        saver.save(sess, SAVE_DIR + file_name)
    tf.contrib.keras.backend.clear_session()
    env.close()

    # Store data in the csv file.
    with open(CSV_DIR + file_name + ".csv", "w") as f:
        fieldnames = ["Episode", "Final Reward", "Average Reward"]
        writer = csv.DictWriter(f, fieldnames=fieldnames, lineterminator="\n")
        writer.writeheader()
        for episode in range(MAX_EPISODE):
            content = {
                "Episode": episode,
                "Final Reward": list_final_reward[episode]
            }
            if episode == MAX_EPISODE - 1:
                content.update({"Average Reward": avg_reward})
            writer.writerow(content)

    # Plot the training process.
    list_episode = list(range(MAX_EPISODE))
    f, ax = plt.subplots(nrows=1, ncols=1, figsize=(5, 5))
    ax.plot(list_episode, list_final_reward, "r-", label="Final Reward")
    ax.plot([MAX_EPISODE - 1], [avg_reward], "b.", label="Average Reward")
    ax.set_title("Final Reward")
    ax.set_xlabel("Episode")
    ax.set_ylabel("Reward")
    ax.legend(loc="lower right")
    ax.grid()

    f.savefig(FIGURE_TRAINING_DIR + file_name + ".png")
    plt.close(f)
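
Example #7 builds its target-network updates with a target_update_op(online_vars, target_vars, rate) helper that is not included in the excerpt. A TF1-style sketch that matches the calls above follows; only the signature is taken from the example, the body is an assumption:

def target_update_op(online_vars, target_vars, rate):
    # Sketch: returns one assign op per variable pair,
    #   target <- rate * online + (1 - rate) * target.
    # rate=1.0 gives the initial hard copy, rate=TARGET_UPDATE_RATE the soft update.
    return [
        target.assign(rate * online + (1.0 - rate) * target)
        for online, target in zip(online_vars, target_vars)
    ]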