Example #1
    def main(self):
        np.random.seed(0)
        replay_memory = deque(maxlen=REPLAY_MEM_CAPACITY)

        def add_to_memory(experience):
            replay_memory.append(experience)

        def sample_from_memory(minibatch_size):
            return random.sample(replay_memory, minibatch_size)

        tf.reset_default_graph()

        # placeholders
        state_placeholder = tf.placeholder(dtype=tf.float32,
                                           shape=[None, STATE_DIM])
        action_placeholder = tf.placeholder(dtype=tf.float32,
                                            shape=[None, ACTION_DIM])
        reward_placeholder = tf.placeholder(dtype=tf.float32, shape=[None])
        next_state_placeholder = tf.placeholder(dtype=tf.float32,
                                                shape=[None, STATE_DIM])
        # indicators (go into target computation)
        is_not_terminal_placeholder = tf.placeholder(dtype=tf.float32,
                                                     shape=[None])
        is_training_placeholder = tf.placeholder(dtype=tf.bool,
                                                 shape=())  # for dropout

        # episode counter
        episodes = tf.Variable(0.0, trainable=False, name='episodes')
        episode_incr_op = episodes.assign_add(1)
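        # 'episodes' is read by the optimizers below to exponentially decay
        # the learning rates (LEARNING_RATE_* * LR_DECAY**episodes)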

        # actor network
        with tf.variable_scope('actor'):
            actor = Actor(STATE_DIM,
                          ACTION_DIM,
                          HIDDEN_1_ACTOR,
                          HIDDEN_2_ACTOR,
                          HIDDEN_3_ACTOR,
                          trainable=True)
            '''
            Policy's output action for each state fed through
            state_placeholder (used for generating actions and for training
            the critic)
            '''
            actions_unscaled = actor.call(state_placeholder)
            actions = MIN_BANDWIDTH + tf.nn.sigmoid(actions_unscaled) * (
                MAX_BANDWIDTH - MIN_BANDWIDTH)

        # slow target actor network
        with tf.variable_scope('target_actor', reuse=False):
            target_actor = Actor(STATE_DIM,
                                 ACTION_DIM,
                                 HIDDEN_1_ACTOR,
                                 HIDDEN_2_ACTOR,
                                 HIDDEN_3_ACTOR,
                                 trainable=True)
            '''
            Slow target policy's output action for each state fed through
            next_state_placeholder (used for training the critic).
            stop_gradient is applied so these values are treated as constant
            targets during backprop.
            '''
            target_next_actions_unscaled = target_actor.call(
                next_state_placeholder)
            target_next_actions_1 = MIN_BANDWIDTH + tf.nn.sigmoid(
                target_next_actions_unscaled) * (MAX_BANDWIDTH - MIN_BANDWIDTH)
            target_next_actions = tf.stop_gradient(target_next_actions_1)

        with tf.variable_scope('critic') as scope:
            critic = Critic(STATE_DIM,
                            ACTION_DIM,
                            HIDDEN_1_CRITIC,
                            HIDDEN_2_CRITIC,
                            HIDDEN_3_CRITIC,
                            trainable=True)
            # Critic applied to state_placeholder and a given action
            # (for training the critic)
            q_values_of_given_actions = critic.call(state_placeholder,
                                                    action_placeholder)
            '''
            Critic applied to state_placeholder and the current policy's
            output actions for those states (for training the actor via the
            deterministic policy gradient)
            '''
            q_values_of_suggested_actions = critic.call(
                state_placeholder, actions)
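            # NOTE: critic.call is invoked twice inside the same 'critic'
            # scope (for given and for suggested actions), so the Critic
            # class is assumed to reuse its variables on the second call
            # (e.g. via tf.AUTO_REUSE); otherwise two sets of weights would
            # be created.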

        # slow target critic network
        with tf.variable_scope('target_critic', reuse=False):
            target_critic = Critic(STATE_DIM,
                                   ACTION_DIM,
                                   HIDDEN_1_CRITIC,
                                   HIDDEN_2_CRITIC,
                                   HIDDEN_3_CRITIC,
                                   trainable=True)
            '''
            Slow target critic applied to the slow target actor's output
            actions for next_state_placeholder (for training the critic)
            '''
            q_values_next = tf.stop_gradient(
                target_critic.call(next_state_placeholder,
                                   target_next_actions))

        # isolate vars for each network
        actor_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                       scope='actor')
        target_actor_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                              scope='target_actor')
        critic_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                        scope='critic')
        target_critic_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                               scope='target_critic')

        # update slowly-changing targets towards current actor and critic
        update_target_ops = []
        for i, target_actor_var in enumerate(target_actor_vars):
            update_target_actor_op = target_actor_var.assign(
                TAU * actor_vars[i] + (1 - TAU) * target_actor_var)
            update_target_ops.append(update_target_actor_op)

        for i, target_var in enumerate(target_critic_vars):
            target_critic_op = target_var.assign(TAU * critic_vars[i] +
                                                 (1 - TAU) * target_var)
            update_target_ops.append(target_critic_op)

        update_targets_op = tf.group(*update_target_ops,
                                     name='update_slow_targets')
        '''
        One-step TD targets y_i for (s, a) sampled from experience replay:
          y_i = r_i + GAMMA * Q_slow(s', mu_slow(s'))   if s' is not terminal
          y_i = r_i                                     if s' is terminal
        '''
        targets = tf.expand_dims(reward_placeholder, 1) + \
            tf.expand_dims(is_not_terminal_placeholder, 1) * GAMMA * \
            q_values_next

        # 1-step temporal difference errors
        td_errors = targets - q_values_of_given_actions

        # critic loss function (mean-square value error with regularization)
        critic_loss = tf.reduce_mean(tf.square(td_errors))
        for var in critic_vars:
            if 'bias' not in var.name:
                critic_loss += L2_REG_CRITIC * 0.5 * tf.nn.l2_loss(var)

        # critic optimizer
        critic_train_op = tf.train.AdamOptimizer(
            LEARNING_RATE_CRITIC * LR_DECAY**episodes).minimize(critic_loss)

        # actor loss function (mean Q-values under current policy with
        # regularization)
        actor_loss = -1 * tf.reduce_mean(q_values_of_suggested_actions)
        for var in actor_vars:
            if 'bias' not in var.name:
                actor_loss += L2_REG_ACTOR * 0.5 * tf.nn.l2_loss(var)
        '''
        actor optimizer
        the gradient of the mean Q-values wrt actor params is the 
        deterministic policy gradient (keeping critic params fixed)
        '''

        actor_train_op = tf.train.AdamOptimizer(
            LEARNING_RATE_ACTOR * LR_DECAY**episodes).minimize(
                actor_loss, var_list=actor_vars)

        # initialize session
        sess = tf.Session()
        sess.run(tf.global_variables_initializer())
        # print(sess.run(tf.report_uninitialized_variables()))

        ## Training

        num_steps = 0
        for episode in range(NUM_EPISODES):
            total_reward = 0
            num_steps_in_episode = 0

            # Create noise
            noise = np.zeros(ACTION_DIM)
            noise_scale = (INITIAL_NOISE_SCALE * NOISE_DECAY ** episode) * \
                (MAX_BANDWIDTH - MIN_BANDWIDTH)  # TODO: uses env

            # Initial state
            self.reset()  # TODO: uses env
            state = self.input_state

            for t in range(MAX_STEPS_PER_EPISODE):
                # choose action based on deterministic policy
                # keep the state as a flat vector so that states stacked from
                # the replay memory have shape [batch, STATE_DIM]
                state = np.asarray(state).reshape(-1)
                action, = sess.run(actions,
                                   feed_dict={state_placeholder: state[None],
                                              is_training_placeholder: False})

                # add temporally-correlated exploration noise to the action
                # (Ornstein-Uhlenbeck-style update:
                #  noise <- THETA * (MU - noise) + SIGMA * N(0, I))
                noise = EXPLORATION_THETA * \
                    (EXPLORATION_MU - noise) + \
                    EXPLORATION_SIGMA*np.random.randn(ACTION_DIM)
                action += noise_scale * noise

                # take step
                next_state, reward, done = self.step(action)
                total_reward += reward

                add_to_memory((
                    state,
                    action,
                    reward,
                    next_state,
                    #    is next_observation a terminal state?
                    #    0.0 if done and not env.env._past_limit() else
                    #    1.0))
                    0.0 if done else 1.0))

                # update network weights to fit a minibatch of experience
                if num_steps % TRAIN_EVERY == 0 and \
                    len(replay_memory) >= MINI_BATCH_SIZE:

                    minibatch = sample_from_memory(MINI_BATCH_SIZE)
                    '''
                    update the critic and actor params using mean-square value 
                    error and deterministic policy gradient, respectively
                    '''
                    _, _ = sess.run(
                        [critic_train_op, actor_train_op],
                        feed_dict={
                            state_placeholder: np.asarray(
                                [elem[0] for elem in minibatch]),
                            action_placeholder: np.asarray(
                                [elem[1] for elem in minibatch]),
                            reward_placeholder: np.asarray(
                                [elem[2] for elem in minibatch]),
                            next_state_placeholder: np.asarray(
                                [elem[3] for elem in minibatch]),
                            is_not_terminal_placeholder: np.asarray(
                                [elem[4] for elem in minibatch]),
                            is_training_placeholder: True})
                    '''
                    update slow actor and critic targets towards current actor 
                    and critic
                    '''
                    _ = sess.run(update_targets_op)

                state = next_state
                num_steps += 1
                num_steps_in_episode += 1

                if done:
                    # Increment episode counter
                    _ = sess.run(episode_incr_op)
                    break

            print('Episode %2i, Reward: %7.3f, Steps: %i, '
                  'Final noise scale: %7.3f' %
                  (episode, total_reward, num_steps_in_episode, noise_scale))
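Note: the Actor and Critic classes used above are not shown on this page. The sketch below is a minimal, hypothetical version of the interface the example appears to assume (a constructor taking the layer sizes and a call(...) method returning the network output), built from plain fully connected ReLU layers with tf.layers.dense. The layer names, the use of tf.AUTO_REUSE, and the omission of the dropout controlled by is_training_placeholder are illustrative assumptions, not the original implementation.

import tensorflow as tf


class Actor:
    """Hypothetical actor MLP: state -> unscaled action."""

    def __init__(self, state_dim, action_dim, hidden_1, hidden_2, hidden_3,
                 trainable=True):
        self.action_dim = action_dim
        self.hidden_sizes = (hidden_1, hidden_2, hidden_3)
        self.trainable = trainable

    def call(self, states):
        out = states
        for i, size in enumerate(self.hidden_sizes):
            out = tf.layers.dense(out, size, activation=tf.nn.relu,
                                  trainable=self.trainable,
                                  name='hidden_%d' % i, reuse=tf.AUTO_REUSE)
        # unscaled output; the caller applies sigmoid scaling to the
        # [MIN_BANDWIDTH, MAX_BANDWIDTH] range
        return tf.layers.dense(out, self.action_dim, activation=None,
                               trainable=self.trainable,
                               name='output', reuse=tf.AUTO_REUSE)


class Critic:
    """Hypothetical critic MLP: (state, action) -> Q-value."""

    def __init__(self, state_dim, action_dim, hidden_1, hidden_2, hidden_3,
                 trainable=True):
        self.hidden_sizes = (hidden_1, hidden_2, hidden_3)
        self.trainable = trainable

    def call(self, states, actions):
        out = tf.concat([states, actions], axis=1)
        for i, size in enumerate(self.hidden_sizes):
            out = tf.layers.dense(out, size, activation=tf.nn.relu,
                                  trainable=self.trainable,
                                  name='hidden_%d' % i, reuse=tf.AUTO_REUSE)
        # single Q-value per (state, action) pair
        return tf.layers.dense(out, 1, activation=None,
                               trainable=self.trainable,
                               name='q_value', reuse=tf.AUTO_REUSE)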
Example #2
def main():
    ''' Create the environment
    '''
    env = gym.make(ENV_NAME)

    # For tensorboard
    writer = tf.summary.FileWriter("./tensorboard")
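    # (inspect the graph later with: tensorboard --logdir ./tensorboard)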

    assert STATE_DIM == np.prod(np.array(env.observation_space.shape))
    assert ACTION_DIM == np.prod(np.array(env.action_space.shape))

    env.seed(0)
    np.random.seed(0)
    ''' Create the replay memory
    '''
    replay_memory = Memory(REPLAY_MEM_CAPACITY)

    # Tensorflow part starts here!
    tf.reset_default_graph()
    ''' Create placeholders 
    '''
    # Placeholders
    state_placeholder = tf.placeholder(dtype=tf.float32,
                                       shape=[None, STATE_DIM],
                                       name='state_placeholder')
    action_placeholder = tf.placeholder(dtype=tf.float32,
                                        shape=[None, ACTION_DIM],
                                        name='action_placeholder')
    reward_placeholder = tf.placeholder(dtype=tf.float32,
                                        shape=[None],
                                        name='reward_placeholder')
    next_state_placeholder = tf.placeholder(dtype=tf.float32,
                                            shape=[None, STATE_DIM],
                                            name='next_state_placeholder')
    is_not_terminal_placeholder = tf.placeholder(
        dtype=tf.float32, shape=[None], name='is_not_terminal_placeholder')

    # boolean training flag (e.g. for dropout)
    is_training_placeholder = tf.placeholder(dtype=tf.bool,
                                             shape=(),
                                             name='is_training_placeholder')
    ''' A counter for the number of episodes completed
    '''
    episodes = tf.Variable(0.0, trainable=False, name='episodes')
    episode_incr_op = episodes.assign_add(1)
    ''' Create the actor network inside the actor scope and calculate actions
    '''
    with tf.variable_scope('actor'):
        actor = ActorNetwork(STATE_DIM,
                             ACTION_DIM,
                             HIDDEN_1_ACTOR,
                             HIDDEN_2_ACTOR,
                             HIDDEN_3_ACTOR,
                             trainable=True)
        unscaled_actions = actor.call(state_placeholder)
        ''' Scale the actions to fit within the bounds provided by the 
        environment
        '''
        actions = scale_actions(unscaled_actions, env.action_space.low,
                                env.action_space.high)
    ''' Create the target actor network inside the target_actor scope and 
    calculate the target actions. Apply stop_gradient to the target actions so 
    that their gradients are never computed during backprop.
    '''
    with tf.variable_scope('target_actor', reuse=False):
        target_actor = ActorNetwork(STATE_DIM,
                                    ACTION_DIM,
                                    HIDDEN_1_ACTOR,
                                    HIDDEN_2_ACTOR,
                                    HIDDEN_3_ACTOR,
                                    trainable=True)

        unscaled_target_actions = target_actor.call(next_state_placeholder)
        ''' Scale the actions to fit within the bounds provided by the 
        environment
        '''
        target_actions_temp = scale_actions(unscaled_target_actions,
                                            env.action_space.low,
                                            env.action_space.high)
        target_actions = tf.stop_gradient(target_actions_temp)
    ''' Create the critic network inside the critic variable scope. Get the 
    Q-values of given actions and Q-values of actions suggested by the actor 
    network.
    '''
    with tf.variable_scope('critic'):
        critic = CriticNetwork(STATE_DIM,
                               ACTION_DIM,
                               HIDDEN_1_CRITIC,
                               HIDDEN_2_CRITIC,
                               HIDDEN_3_CRITIC,
                               trainable=True)

        q_values_of_given_actions = critic.call(state_placeholder,
                                                action_placeholder)
        q_values_of_suggested_actions = critic.call(state_placeholder, actions)
    ''' Create the target critic network inside the target_critic variable 
    scope. Calculate the target Q-values and apply stop_gradient to it.
    '''
    with tf.variable_scope('target_critic', reuse=False):
        target_critic = CriticNetwork(STATE_DIM,
                                      ACTION_DIM,
                                      HIDDEN_1_CRITIC,
                                      HIDDEN_2_CRITIC,
                                      HIDDEN_3_CRITIC,
                                      trainable=True)

        target_q_values_temp = target_critic.call(next_state_placeholder,
                                                  target_actions)
        target_q_values = tf.stop_gradient(target_q_values_temp)
    ''' Collect
    - the trainable variables of the actor (weights of the actor network),
    - the weights of the target actor network,
    - the trainable variables of the critic (weights of the critic network),
    - the weights of the target critic network.
    '''
    actor_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                   scope='actor')

    target_actor_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                          scope='target_actor')

    critic_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                    scope='critic')

    target_critic_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                           scope='target_critic')
    ''' Get the operations for updating the target networks. The 
    update_target_networks function defined in utils returns a list of 
    operations to be run from the tf session in order to update the target 
    networks using a soft update.
    '''
    update_targets_op = update_target_networks(TAU,
                                               target_actor_vars,
                                               actor_vars,
                                               target_critic_vars,
                                               critic_vars)
    ''' Create the tf operation to train the critic network:
    - calculate the TD-target
    - calculate the TD-error = TD-target - q_values_of_given_actions
    - calculate the critic network's loss (mean squared TD-error)
    - add L2 regularization on the critic weights
    - create a tf operation to train the critic network
    '''
    targets = tf.expand_dims(reward_placeholder, 1) + \
        tf.expand_dims(is_not_terminal_placeholder, 1) * GAMMA * \
        target_q_values
    td_errors = targets - q_values_of_given_actions
    critic_loss = tf.reduce_mean(tf.square(td_errors))

    # Add L2 regularization on the critic weights (biases excluded)
    for var in critic_vars:
        if 'bias' not in var.name:
            critic_loss += L2_REG_CRITIC * 0.5 * tf.nn.l2_loss(var)

    # optimize critic
    critic_train_op = tf.train.AdamOptimizer(
        LEARNING_RATE_CRITIC * LR_DECAY**episodes).minimize(critic_loss)
    ''' Create a tf operation to train the actor network:
    - calculate the actor network's loss (with L2 regularization)
    - create the tf operation to train the actor network
    '''
    # Actor's loss
    actor_loss = -1 * tf.reduce_mean(q_values_of_suggested_actions)
    for var in actor_vars:
        if 'bias' not in var.name:
            actor_loss += L2_REG_ACTOR * 0.5 * tf.nn.l2_loss(var)

    # Optimize actor
    actor_train_op = tf.train.AdamOptimizer(
        LEARNING_RATE_ACTOR * LR_DECAY**episodes).minimize(actor_loss,
                                                           var_list=actor_vars)

    # Init session
    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    writer.add_graph(sess.graph)

    # Training
    num_steps = 0
    for episode in range(NUM_EPISODES):
        total_reward = 0
        num_steps_in_episode = 0

        # Create noise
        noise = np.zeros(ACTION_DIM)
        noise_scale = (INITIAL_NOISE_SCALE * NOISE_DECAY ** episode) * \
            (env.action_space.high - env.action_space.low)

        # Initial state
        state = env.reset()

        for _ in range(MAX_STEPS_PER_EPISODE):

            action = sess.run(actions,
                              feed_dict={state_placeholder: state[None],
                                         is_training_placeholder: False})

            # Add Noise to actions
            noise = EXPLORATION_THETA * (EXPLORATION_MU - noise) + \
                EXPLORATION_SIGMA * np.random.randn(ACTION_DIM)

            action += noise_scale * noise

            # Take action on env
            next_state, reward, done, _info = env.step(action)
            next_state = np.squeeze(next_state)
            reward = np.squeeze(reward)
            action = action[0]

            total_reward += reward

            replay_memory.add_to_memory(
                (state, action, reward, next_state, 0.0 if done else 1.0))

            if num_steps % TRAIN_EVERY == 0 and \
                    replay_memory.size() >= MINI_BATCH_SIZE:
                batch = replay_memory.sample_from_memory(MINI_BATCH_SIZE)
                _, _ = sess.run([critic_train_op, actor_train_op],
                    feed_dict={
                        state_placeholder: np.asarray( \
                            [elem[0] for elem in batch]),
                        action_placeholder: np.asarray( \
                            [elem[1] for elem in batch]),
                        reward_placeholder: np.asarray( \
                            [elem[2] for elem in batch]),
                        next_state_placeholder: np.asarray( \
                            [elem[3] for elem in batch]),
                        is_not_terminal_placeholder: np.asarray( \
                            [elem[4] for elem in batch]),
                        is_training_placeholder: True
                })

                _ = sess.run(update_targets_op)

            state = next_state
            num_steps += 1
            num_steps_in_episode += 1

            if done:
                _ = sess.run(episode_incr_op)
                break

        print('Episode: {}, Reward: {}, Steps: {}, Final noise scale: {}'
              .format(episode, total_reward, num_steps_in_episode, noise_scale))

    env.close()
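
Note: Example #2 relies on several helpers (Memory, scale_actions, update_target_networks) imported from elsewhere and not shown on this page. The sketch below is a minimal, hypothetical version consistent with how they are called above; the sigmoid-based action scaling mirrors Example #1 and, like every name here, is an assumption rather than the original code.

import random
from collections import deque

import tensorflow as tf


class Memory:
    """Hypothetical replay buffer with the interface used in Example #2."""

    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)

    def add_to_memory(self, experience):
        self.buffer.append(experience)

    def sample_from_memory(self, minibatch_size):
        return random.sample(self.buffer, minibatch_size)

    def size(self):
        return len(self.buffer)


def scale_actions(unscaled_actions, low, high):
    """Map raw network outputs into [low, high].

    Assumes a sigmoid squashing of the unscaled outputs, as in Example #1.
    """
    return low + tf.nn.sigmoid(unscaled_actions) * (high - low)


def update_target_networks(tau, target_actor_vars, actor_vars,
                           target_critic_vars, critic_vars):
    """Soft update: theta_target <- tau * theta + (1 - tau) * theta_target.

    Returns a list of assign ops to be run in the tf session; assumes the
    variable lists are ordered consistently, as in Example #1.
    """
    update_ops = []
    for target_var, var in zip(target_actor_vars, actor_vars):
        update_ops.append(
            target_var.assign(tau * var + (1 - tau) * target_var))
    for target_var, var in zip(target_critic_vars, critic_vars):
        update_ops.append(
            target_var.assign(tau * var + (1 - tau) * target_var))
    return update_ops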