Example #1
File: model.py  Project: Enhjin/sendit
    def __init__(self, policy, ob_space, action_space, nenvs, nsteps, ent_coef,
                 vf_coef, max_grad_norm):

        sess = tf.get_default_session()

        # CREATE THE PLACEHOLDERS
        actions_ = tf.placeholder(tf.int32, [None], name="actions_")
        advantages_ = tf.placeholder(tf.float32, [None], name="advantages_")
        rewards_ = tf.placeholder(tf.float32, [None], name="rewards_")
        lr_ = tf.placeholder(tf.float32, name="learning_rate_")
        # Keep track of old actor
        oldneglopac_ = tf.placeholder(tf.float32, [None], name="oldneglopac_")
        # Keep track of old critic
        oldvpred_ = tf.placeholder(tf.float32, [None], name="oldvpred_")
        # Cliprange
        cliprange_ = tf.placeholder(tf.float32, [])

        # CREATE OUR TWO MODELS
        # Step_model that is used for sampling
        step_model = policy(sess,
                            ob_space,
                            action_space,
                            nenvs,
                            1,
                            reuse=False)

        # Test model for testing our agent
        #test_model = policy(sess, ob_space, action_space, 1, 1, reuse=False)

        # Train model for training
        train_model = policy(sess,
                             ob_space,
                             action_space,
                             nenvs * nsteps,
                             nsteps,
                             reuse=True)

        # CALCULATE THE LOSS
        # Total loss = Policy gradient loss - entropy * entropy coefficient + Value coefficient * value loss

        # Clip the value
        # Get the value predicted
        value_prediction = train_model.vf

        # Clip the value = Oldvalue + clip(value - oldvalue, min = - cliprange, max = cliprange)
        value_prediction_clipped = oldvpred_ + tf.clip_by_value(
            train_model.vf - oldvpred_, -cliprange_, cliprange_)

        # Unclipped value
        value_loss_unclipped = tf.square(value_prediction - rewards_)

        # Clipped value
        value_loss_clipped = tf.square(value_prediction_clipped - rewards_)

        # Value loss = 0.5 * mean(max(unclipped, clipped))
        vf_loss = 0.5 * tf.reduce_mean(
            tf.maximum(value_loss_unclipped, value_loss_clipped))

        # Clip the policy
        # Output -log(pi) (new -log(pi))
        neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=train_model.pi, labels=actions_)

        # Remember we want the ratio (pi current policy / pi old policy)
        # But neglogpac gives us -log(policy)
        # So we want to transform it into a ratio:
        # e^(-log old - (-log new)) == e^(log new - log old) == e^(log(new / old))
        # = new/old (since the exponential cancels the log)
        # Wish we could use LaTeX in comments
        ratio = tf.exp(oldneglopac_ - neglogpac)  # ratio = pi new / pi old

        # Remember also that we're doing gradient ascent, i.e. we want to MAXIMIZE the objective function,
        # which is the same as minimizing Loss = -J
        # To turn the objective into a loss we put a negation on the multiplication: -(pi new / pi old) * Advantages
        pg_loss_unclipped = -advantages_ * ratio

        # value, min [1 - e] , max [1 + e]
        pg_loss_clipped = -advantages_ * tf.clip_by_value(
            ratio, 1.0 - cliprange_, 1.0 + cliprange_)

        # Final PG loss
        # Why maximum? Because pg_loss_unclipped and pg_loss_clipped are negative; getting the min of the positive elements = getting
        # the max of the negative elements
        pg_loss = tf.reduce_mean(tf.maximum(pg_loss_unclipped,
                                            pg_loss_clipped))

        # Calculate the entropy
        # Entropy is used to improve exploration by limiting the premature convergence to suboptimal policy.
        entropy = tf.reduce_mean(train_model.pd.entropy())

        # Total loss (remember that L = -J, since minimizing L is the same as maximizing J)
        loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef

        # UPDATE THE PARAMETERS USING LOSS
        # 1. Get the model parameters
        params = find_trainable_variables("model")

        # 2. Calculate the gradients
        grads = tf.gradients(loss, params)
        if max_grad_norm is not None:
            # Clip the gradients (normalize)
            grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
        grads = list(zip(grads, params))
        # zip pairs each gradient with its associated parameter
        # For instance zip(ABCD, xyza) => Ax, By, Cz, Da

        # 3. Build our trainer
        trainer = tf.train.RMSPropOptimizer(learning_rate=lr_, epsilon=1e-5)

        # 4. Backpropagation
        _train = trainer.apply_gradients(grads)

        # Train function
        def train(states_in, actions, returns, values, neglogpacs, lr,
                  cliprange):

            # Here we calculate advantage A(s,a) = R + yV(s') - V(s)
            # Returns = R + yV(s')
            advantages = returns - values

            # Normalize the advantages (taken from aborghi implementation)
            advantages = (advantages - advantages.mean()) / (advantages.std() +
                                                             1e-8)

            # We create the feed dictionary
            td_map = {
                train_model.inputs_: states_in,
                actions_: actions,
                advantages_: advantages,  # Used to calculate our policy loss
                rewards_: returns,  # Used as a bootstrap for the real value
                lr_: lr,
                cliprange_: cliprange,
                oldneglopac_: neglogpacs,
                oldvpred_: values
            }

            policy_loss, value_loss, policy_entropy, _ = sess.run(
                [pg_loss, vf_loss, entropy, _train], td_map)

            return policy_loss, value_loss, policy_entropy

        def save(save_path):
            """
            Save the model
            """
            saver = tf.train.Saver()
            saver.save(sess, save_path)

        def load(load_path):
            """
            Load the model
            """
            saver = tf.train.Saver()
            print('Loading ' + load_path)
            saver.restore(sess, load_path)

        self.train = train
        self.train_model = train_model
        self.step_model = step_model
        self.step = step_model.step
        self.value = step_model.value
        self.initial_state = step_model.initial_state
        self.save = save
        self.load = load
        tf.global_variables_initializer().run(session=sess)
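
The comment block around `ratio` and `pg_loss` above is easier to follow with concrete numbers. Below is a minimal standalone NumPy sketch of the clipped surrogate (illustrative values only, not part of the project's code); it assumes a clip range of 0.2.

import numpy as np

# ratio = exp(old_neglogp - new_neglogp) = pi_new / pi_old
old_neglogp = np.array([1.20, 0.90, 2.10])
new_neglogp = np.array([0.80, 1.10, 1.00])
advantages = np.array([1.0, -0.5, 2.0])
cliprange = 0.2

ratio = np.exp(old_neglogp - new_neglogp)
pg_loss_unclipped = -advantages * ratio
pg_loss_clipped = -advantages * np.clip(ratio, 1.0 - cliprange, 1.0 + cliprange)

# The max of the negated terms equals -(min of the positive objectives),
# so overly large policy updates stop improving the surrogate.
pg_loss = np.mean(np.maximum(pg_loss_unclipped, pg_loss_clipped))
print(ratio, pg_loss)
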
Example #2
    def __init__(self, policy, ob_space, action_space, nenvs, nsteps, ent_coef,
                 vf_coef, max_grad_norm):

        sess = tf.get_default_session()

        # Here we create the placeholders
        actions_ = tf.placeholder(tf.int32, [None], name="actions_")
        advantages_ = tf.placeholder(tf.float32, [None], name="advantages_")
        rewards_ = tf.placeholder(tf.float32, [None], name="rewards_")
        lr_ = tf.placeholder(tf.float32, name="learning_rate_")

        # Here we create our two models:
        # Step_model that is used for sampling
        step_model = policy(sess,
                            ob_space,
                            action_space,
                            nenvs,
                            1,
                            reuse=False)

        # Train model for training
        train_model = policy(sess,
                             ob_space,
                             action_space,
                             nenvs * nsteps,
                             nsteps,
                             reuse=True)
        """
        Calculate the loss
        Total loss = Policy gradient loss - entropy * entropy coefficient + Value coefficient * value loss
        """
        # Policy loss
        # Output -log(pi)
        neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=train_model.pi, labels=actions_)

        # 1/n * sum A(si,ai) * -logpi(ai|si)
        pg_loss = tf.reduce_mean(advantages_ * neglogpac)

        # Value loss 1/2 SUM [R - V(s)]^2
        vf_loss = tf.reduce_mean(
            tf.losses.mean_squared_error(tf.squeeze(train_model.vf), rewards_))

        # Entropy is used to improve exploration by limiting the premature convergence to suboptimal policy.
        entropy = tf.reduce_mean(train_model.pd.entropy())

        loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef

        # Update parameters using loss
        # 1. Get the model parameters
        params = find_trainable_variables("model")

        # 2. Calculate the gradients
        grads = tf.gradients(loss, params)
        if max_grad_norm is not None:
            # Clip the gradients (normalize)
            grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
        grads = list(zip(grads, params))
        # zip pairs each gradient with its associated parameter
        # For instance zip(ABCD, xyza) => Ax, By, Cz, Da

        # 3. Build our trainer
        trainer = tf.train.RMSPropOptimizer(learning_rate=lr_,
                                            decay=0.99,
                                            epsilon=1e-5)

        # 4. Backpropagation
        _train = trainer.apply_gradients(grads)

        def train(states_in, actions, returns, values, lr):
            # Here we calculate advantage A(s,a) = R + yV(s') - V(s)
            # Returns = R + yV(s')
            advantages = returns - values

            # We create the feed dictionary
            td_map = {
                train_model.inputs_: states_in,
                actions_: actions,
                advantages_: advantages,  # Used to calculate our policy loss
                rewards_: returns,  # Used as a bootstrap for the real value
                lr_: lr
            }

            policy_loss, value_loss, policy_entropy, _ = sess.run(
                [pg_loss, vf_loss, entropy, _train], td_map)

            return policy_loss, value_loss, policy_entropy

        def save(save_path):
            """
            Save the model
            """
            saver = tf.train.Saver()
            saver.save(sess, save_path)

        def load(load_path):
            """
            Load the model
            """
            saver = tf.train.Saver()
            print('Loading ' + load_path)
            saver.restore(sess, load_path)

        self.train = train
        self.train_model = train_model
        self.step_model = step_model
        self.step = step_model.step
        self.value = step_model.value
        self.initial_state = step_model.initial_state
        self.save = save
        self.load = load
        tf.global_variables_initializer().run(session=sess)
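
As the `train` function above notes, the advantage is computed from bootstrapped returns: A(s,a) = R + yV(s') - V(s), so `advantages = returns - values` once the runner has already folded V(s') into `returns`. A tiny standalone NumPy sketch with made-up numbers:

import numpy as np

gamma = 0.99
rewards = np.array([1.0, 0.0, -1.0])      # R collected at each step
next_values = np.array([0.5, 0.4, 0.0])   # V(s') predicted by the critic
values = np.array([0.8, 0.3, 0.2])        # V(s) predicted by the critic

returns = rewards + gamma * next_values   # bootstrapped targets, R + yV(s')
advantages = returns - values             # A(s,a) = R + yV(s') - V(s)
print(returns, advantages)
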
Example #3
    def __init__(self, policy, ob_space, action_space, nenvs, nsteps, ent_coef,
                 vf_coef, max_grad_norm):

        sess = tf.get_default_session()


        # CREATE THE PLACEHOLDERS
        actions_ = tf.placeholder(tf.int32, [None], name="actions_")
        advantages_ = tf.placeholder(tf.float32, [None], name="advantages_")
        rewards_ = tf.placeholder(tf.float32, [None], name="rewards_")
        lr_ = tf.placeholder(tf.float32, name="learning_rate_")
        # Keep track of old actor
        oldneglopac_ = tf.placeholder(tf.float32, [None], name="oldneglopac_")
        # Keep track of old critic 
        oldvpred_ = tf.placeholder(tf.float32, [None], name="oldvpred_")
        # Cliprange
        cliprange_ = tf.placeholder(tf.float32, [])


        # CREATE OUR TWO MODELS
        # Step_model that is used for sampling
        step_model = policy(sess, ob_space, action_space, nenvs, 1, reuse=False)

        # Test model for testing our agent
        #test_model = policy(sess, ob_space, action_space, 1, 1, reuse=False)

        # Train model for training
        train_model = policy(sess, ob_space, action_space, nenvs*nsteps, nsteps, reuse=True)



        # CALCULATE THE LOSS
        # Total loss = Policy gradient loss - entropy * entropy coefficient + Value coefficient * value loss
       
        # Clip the value
        # Get the value predicted
        value_prediction = train_model.vf

        # Clip the value = Oldvalue + clip(value - oldvalue, min = - cliprange, max = cliprange)
        value_prediction_clipped = oldvpred_ + tf.clip_by_value(train_model.vf - oldvpred_,  - cliprange_, cliprange_)

        # Unclipped value
        value_loss_unclipped = tf.square(value_prediction - rewards_)

        # Clipped value
        value_loss_clipped = tf.square(value_prediction_clipped - rewards_)

        # Value loss = 0.5 * mean(max(unclipped, clipped))
        vf_loss = 0.5 * tf.reduce_mean(tf.maximum(value_loss_unclipped, value_loss_clipped))


        # Clip the policy
        # Output -log(pi) (new -log(pi))
        neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi, labels=actions_)
        
        # Remember we want the ratio (pi current policy / pi old policy)
        # But neglogpac gives us -log(policy)
        # So we want to transform it into a ratio:
        # e^(-log old - (-log new)) == e^(log new - log old) == e^(log(new / old))
        # = new/old (since the exponential cancels the log)
        # Wish we could use LaTeX in comments
        ratio = tf.exp(oldneglopac_ - neglogpac) # ratio = pi new / pi old

        # Remember also that we're doing gradient ascent, i.e. we want to MAXIMIZE the objective function,
        # which is the same as minimizing Loss = -J
        # To turn the objective into a loss we put a negation on the multiplication: -(pi new / pi old) * Advantages
        pg_loss_unclipped = -advantages_ * ratio 

        # value, min [1 - e] , max [1 + e]
        pg_loss_clipped = -advantages_ * tf.clip_by_value(ratio, 1.0 - cliprange_, 1.0 + cliprange_)

        # Final PG loss
        # Why maximum? Because pg_loss_unclipped and pg_loss_clipped are negative; getting the min of the positive elements = getting
        # the max of the negative elements
        pg_loss = tf.reduce_mean(tf.maximum(pg_loss_unclipped, pg_loss_clipped))

        # Calculate the entropy
        # Entropy is used to improve exploration by limiting the premature convergence to suboptimal policy.
        entropy = tf.reduce_mean(train_model.pd.entropy())

        # Total loss (remember that L = -J, since minimizing L is the same as maximizing J)
        loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef


        # UPDATE THE PARAMETERS USING LOSS
        # 1. Get the model parameters
        params = find_trainable_variables("model")

        # 2. Calculate the gradients
        grads = tf.gradients(loss, params)
        if max_grad_norm is not None:
            # Clip the gradients (normalize)
            grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
        grads = list(zip(grads, params))
        # zip pairs each gradient with its associated parameter
        # For instance zip(ABCD, xyza) => Ax, By, Cz, Da

        # 3. Build our trainer
        trainer = tf.train.RMSPropOptimizer(learning_rate=lr_, epsilon=1e-5)

        # 4. Backpropagation
        _train = trainer.apply_gradients(grads)


        # Train function
        def train(states_in, actions, returns, values, neglogpacs, lr, cliprange):
            
            # Here we calculate advantage A(s,a) = R + yV(s') - V(s)
            # Returns = R + yV(s')
            advantages = returns - values

            # Normalize the advantages (taken from aborghi implementation)
            advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)

            # We create the feed dictionary
            td_map = {train_model.inputs_: states_in,
                      actions_: actions,
                      advantages_: advantages,  # Used to calculate our policy loss
                      rewards_: returns,  # Used as a bootstrap for the real value
                      lr_: lr,
                      cliprange_: cliprange,
                      oldneglopac_: neglogpacs,
                      oldvpred_: values}

            policy_loss, value_loss, policy_entropy, _ = sess.run([pg_loss, vf_loss, entropy, _train], td_map)
            
            return policy_loss, value_loss, policy_entropy


        def save(save_path):
            """
            Save the model
            """
            saver = tf.train.Saver()
            saver.save(sess, save_path)

        def load(load_path):
            """
            Load the model
            """
            saver = tf.train.Saver()
            print('Loading ' + load_path)
            saver.restore(sess, load_path)

        self.train = train
        self.train_model = train_model
        self.step_model = step_model
        self.step = step_model.step
        self.value = step_model.value
        self.initial_state = step_model.initial_state
        self.save = save
        self.load = load
        tf.global_variables_initializer().run(session=sess)
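
The value clipping in this example mirrors the policy clipping: the new value prediction is only trusted within `cliprange` of the old one, and the pessimistic (larger) squared error is kept. A standalone NumPy sketch with made-up numbers, not the project's tensors:

import numpy as np

cliprange = 0.2
old_vpred = np.array([1.0, 0.5])    # critic predictions from the previous update
vpred = np.array([1.6, 0.45])       # new critic predictions
returns = np.array([1.2, 0.7])      # targets fed through the rewards_ placeholder

vpred_clipped = old_vpred + np.clip(vpred - old_vpred, -cliprange, cliprange)
vf_loss_unclipped = np.square(vpred - returns)
vf_loss_clipped = np.square(vpred_clipped - returns)

# 0.5 * mean of the element-wise maximum, exactly how vf_loss is built above
vf_loss = 0.5 * np.mean(np.maximum(vf_loss_unclipped, vf_loss_clipped))
print(vpred_clipped, vf_loss)
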
Example #4
    def __init__(self, policy, ob_space, action_space, nenvs, nsteps, ent_coef,
                 vf_coef, max_grad_norm):
        sess = tf.get_default_session()
        # sess = tf_debug.LocalCLIDebugWrapperSession(sess)
        # Here we create the placeholders

        timestr = time.strftime("%Y%m%d-%H%M%S")
        dirname = "./" + timestr + "log"
        logger.configure(dir=dirname)
        actions_ = tf.placeholder(tf.int32, [None], name="actions_")
        advantages_ = tf.placeholder(tf.float32, [None], name="advantages_")
        rewards_ = tf.placeholder(tf.float32, [None], name="rewards_")
        lr_ = tf.placeholder(tf.float32, name="learning_rate_")

        # Here we create our two models:
        # Step_model that is used for sampling
        step_model = policy(sess,
                            ob_space,
                            action_space,
                            nenvs,
                            1,
                            reuse=False)  #reuse why?

        # Train model for training
        train_model = policy(sess,
                             ob_space,
                             action_space,
                             nenvs * nsteps,
                             nsteps,
                             reuse=True)
        """
        Calculate the loss
        Total loss = Policy gradient loss - entropy * entropy coefficient + Value coefficient * value loss
        """
        # Policy loss
        # Output -log(pi)
        l1 = []
        # print(actions_.shape)
        #
        # actions_copy=tf.identity(actions_)
        #
        # for i in range(actions_copy.shape):
        #     actions_copy[i]=train_model.softmax_layer[actions_copy[i]]
        #
        #
        #
        #     result = recursive_map(actions_copy)

        # neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi, labels=actions_)

        if flag.LAST_LAYER_IMPL:
            neglogpac = (-1) * tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=train_model.p_layer, labels=actions_)
        else:
            neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=train_model.pi, labels=actions_)

        #neglogpac=train_model.pd.neglogp(actions_)

        # 1/n * sum A(si,ai) * -logpi(ai|si)
        # pg_loss = tf.reduce_mean(advantages_ * neglogpac)
        pg_loss = tf.reduce_mean(advantages_ * neglogpac)

        # Value loss 1/2 SUM [R - V(s)]^2
        vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), rewards_))

        # Entropy is used to improve exploration by limiting the premature convergence to suboptimal policy.
        # entropy = tf.reduce_mean(train_model.pd.entropy())
        if flag.LAST_LAYER_IMPL:
            entropy = tf.reduce_mean(train_model.dist.entropy(name="ent"))
        else:
            entropy = tf.reduce_mean(train_model.pd.entropy())
        # vf_loss=tf.zeros(vf_loss.shape,dtype=tf.float32)

        loss = pg_loss - (entropy * ent_coef) + (vf_loss * vf_coef)

        # Update parameters using loss
        # 1. Get the model parameters
        params = find_trainable_variables("model")

        # 2. Calculate the gradients
        grads = tf.gradients(loss, params)
        if max_grad_norm is not None:
            # Clip the gradients (normalize)
            grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
        grads = list(zip(grads, params))

        # zip pairs each gradient with its associated parameter
        # For instance zip(ABCD, xyza) => Ax, By, Cz, Da

        # 3. Build our trainer
        trainer = tf.train.RMSPropOptimizer(learning_rate=lr_,
                                            decay=0.99,
                                            epsilon=1e-5)

        # 4. Backpropagation

        _train = trainer.apply_gradients(grads)

        def train(states_in, actions, returns, values, lr):
            # Here we calculate advantage A(s,a) = R + yV(s') - V(s)
            # Returns = R + yV(s')

            advantages = returns - values

            # print(advantages.shape)
            # print(actions_.shape)
            # exit

            # We create the feed dictionary
            td_map = {
                train_model.inputs_: states_in,
                actions_: actions,
                advantages_: advantages,  # Used to calculate our policy loss
                rewards_: returns,  # Used as a bootstrap for the real value
                lr_: lr
            }
            if flag.LAST_LAYER_IMPL:
                pi1, policy_loss, neglogpac1, value_loss, policy_entropy, _ = sess.run(
                    [
                        train_model.softmax_layer, pg_loss, neglogpac, vf_loss,
                        entropy, _train
                    ], td_map)
            else:
                pi1, policy_loss, neglogpac1, value_loss, policy_entropy, _ = sess.run(
                    [
                        train_model.pi, pg_loss, neglogpac, vf_loss, entropy,
                        _train
                    ], td_map)
            if flag.DEBUG:
                print("pd", pi1)
            #logger.record_tabular("neglog", neglogpac1)
            #logger.record_tabular("adv", advantages)

            return policy_loss, value_loss, policy_entropy

        def save(save_path):
            """
            Save the model
            """
            saver = tf.train.Saver()
            saver.save(sess, save_path)

        def load(load_path):
            """
            Load the model
            """

            saver = tf.train.Saver()
            print('Loading ' + load_path)
            saver.restore(sess, load_path)

        self.train = train
        self.train_model = train_model
        # self.step_model = step_model

        self.step_model = train_model
        self.step = step_model.step
        self.value = step_model.value
        self.initial_state = step_model.initial_state

        self.save = save
        self.load = load
        tf.global_variables_initializer().run(session=sess)
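
The recurring comment "Output -log(pi)" relies on the fact that sparse softmax cross-entropy, with the taken action as the label, equals minus the log of that action's softmax probability. A standalone NumPy check with illustrative logits (the TF call in the examples computes the same quantity):

import numpy as np

logits = np.array([[2.0, 0.5, -1.0],
                   [0.1, 0.2, 0.3]])
actions = np.array([0, 2])

# Numerically stable log-softmax
shifted = logits - logits.max(axis=-1, keepdims=True)
log_probs = shifted - np.log(np.exp(shifted).sum(axis=-1, keepdims=True))

# -log pi(a|s) for the chosen actions
neglogpac = -log_probs[np.arange(len(actions)), actions]
print(neglogpac)
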
Example #5
    def __init__(self, policy, ob_space, action_space, nenvs, nsteps, ent_coef,
                 vf_coef, max_grad_norm):
        sess = tf.get_default_session()
        K.set_session(sess)
        K.set_learning_phase(1)

        # Create the placeholders
        actions_ = tf.placeholder(tf.int32, [None], name='actions_')
        advantages_ = tf.placeholder(tf.float32, [None], name='advantages_')
        rewards_ = tf.placeholder(tf.float32, [None], name='rewards_')
        lr_ = tf.placeholder(tf.float32, name='learning_rate_')
        # keep track of old actor
        oldneglopac_ = tf.placeholder(tf.float32, [None], name='oldneglopac_')
        # Keep track of old critic
        oldvpred_ = tf.placeholder(tf.float32, [None], name='oldvpred_')
        # Cliprange
        cliprange_ = tf.placeholder(tf.float32, [])

        # Create our two models
        # Step model that is used for sampling
        step_model = policy(sess,
                            ob_space,
                            action_space,
                            nenvs,
                            1,
                            reuse=False)

        # Test model for testing our agent
        #test_model = policy(sess, ob_space, action_space, 1, 1, reuse=False)

        # Train model for training
        train_model = policy(sess,
                             ob_space,
                             action_space,
                             nenvs * nsteps,
                             nsteps,
                             reuse=True)

        print('availPi', train_model.availPi)
        tf.print(train_model.availPi, [train_model.availPi],
                 'train_model.availPi')
        l0 = train_model.availPi - tf.reduce_max(
            train_model.availPi, axis=-1, keep_dims=True)
        el0 = tf.exp(l0)
        z0 = tf.reduce_sum(el0, axis=-1, keep_dims=True)
        p0 = el0 / z0
        entropy = -tf.reduce_sum((p0 + 1e-8) * tf.log(p0 + 1e-8), axis=-1)
        oneHotActions = tf.one_hot(actions_,
                                   train_model.pi.get_shape().as_list()[-1])
        neglogpac = -tf.log(
            tf.reduce_sum(tf.multiply(p0, oneHotActions), axis=-1))

        def neglogp(state, valid_ins, actions):
            return sess.run(
                neglogpac, {
                    network.X: state,
                    network.available_moves: valid_ins,
                    actions_: actions
                })

        self.neglogp = neglogp

        # CALCULATE THE LOSS
        # Total loss = Policy gradient loss - entropy * entropy coefficient + Value coefficient * value loss

        # Clip the value
        # Get the value predicted
        value_prediction = train_model.vf

        # Clip the value = Oldvalue + clip(value - oldvalue, min = -cliprange, max = cliprange)
        value_prediction_clipped = oldvpred_ + tf.clip_by_value(
            train_model.vf - oldvpred_, -cliprange_, cliprange_)

        # Unclipped value
        value_loss_unclipped = tf.square(value_prediction - rewards_)

        # Clipped value
        value_loss_clipped = tf.square(value_prediction_clipped - rewards_)

        # Value loss = 0.5 * mean(max(unclipped, clipped))
        vf_loss = 0.5 * tf.reduce_mean(
            tf.maximum(value_loss_unclipped, value_loss_clipped))

        # Clip the policy
        # Output -log(pi) (the new -log(pi))
        #		neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi, labels=actions_)

        # Remember we want the ratio (pi current policy / pi old policy)
        # But neglogpac gives us -log(policy)
        # So we want to transform it into a ratio
        # e^(-log old - (-log new)) == e^(log new - log old) == e^(log(new / old))
        # = new/old (since the exponential cancels the log)
        # wish we could use LaTeX in comments
        ratio = tf.exp(oldneglopac_ - neglogpac)  # ratio = pi new / pi old

        # remember also that we're doing gradient ascent, aka we want to maximize the objective function,
        # which means Loss = -J
        # To turn the objective into a loss we put a negation on the multiplication: -(pi new / pi old) * Advantages
        pg_loss_unclipped = -advantages_ * ratio

        # value, min [1-e], max[1+e]
        pg_loss_clipped = -advantages_ * tf.clip_by_value(
            ratio, 1.0 - cliprange_, 1.0 + cliprange_)

        # Final PG Loss
        # Why maximum? Because pg_loss_unclipped and pg_loss_clipped are negative; getting the min of the positive elements = getting
        # the max of the negative elements
        pg_loss = tf.reduce_mean(tf.maximum(pg_loss_unclipped,
                                            pg_loss_clipped))

        # Calculate the entropy
        # Entropy is used to improve exploration by limiting the premature convergence to a suboptimal policy.
        entropy_loss = tf.reduce_mean(entropy)

        # Total loss (remember that L = -J, since minimizing L is the same as maximizing J)
        loss = pg_loss - entropy_loss * ent_coef + vf_loss * vf_coef

        # UPDATE THE PARAMETERS USING LOSS
        # 1. Get the model parameters
        params = find_trainable_variables('model')

        # 2. Calculate the gradients
        grads = tf.gradients(loss, params)
        if max_grad_norm is not None:
            # Clip the gradients (normalize)
            grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
        grads = list(zip(grads, params))
        # zip pairs each gradient with its associated parameter
        # For instance zip(ABCD, xyza) => Ax, By, Cz, Da

        # 3. Build our trainer
        trainer = tf.train.RMSPropOptimizer(learning_rate=lr_, epsilon=1e-5)

        # 4. Backpropagation
        _train = trainer.apply_gradients(grads)

        # Train function
        def train(states_in, valid_ins, text_ins, actions, returns, values,
                  neglogpacs, lr, cliprange):

            for ob_text in text_ins:
                #				print('ob_text', ob_text)
                train_model.tokenizer.fit_on_texts([ob_text.decode("utf-8")])

            # preprocess text. maybe do inside of env later?
            ob_text_input = []
            for ob_text in text_ins:
                #				print('ob_text utf8', ob_text.decode("utf-8"))
                token = train_model.tokenizer.texts_to_sequences(
                    [ob_text.decode("utf-8")])
                token = sequence.pad_sequences(
                    token, maxlen=200)  # pre_padding with 0
                ob_text_input.append(token)


            #				print('token', token)
            #				print('token shape', token.shape)
            ob_text_input = np.array(ob_text_input)
            shape = ob_text_input.shape
            #			print('ob_text_input shape', shape)
            ob_text_input = ob_text_input.reshape(shape[0], shape[2])

            # Here we calculate advantage A(s,a) = R + yV(s') - V(s)
            # Returns = R + yV(s')
            advantages = returns - values

            #Normalize the advantages (taken from aborghi implementation)
            advantages = (advantages - advantages.mean()) / (advantages.std() +
                                                             1e-8)

            td_map = {
                train_model.text_inputs_: ob_text_input,
                train_model.available_moves: valid_ins,
                actions_: actions,
                advantages_: advantages,  # Used to calculate our policy loss
                rewards_: returns,  # Used as a bootstrap for the real value
                lr_: lr,
                cliprange_: cliprange,
                oldneglopac_: neglogpacs,
                oldvpred_: values
            }
            td_map.update(train_model.split_categories_from_state(states_in))

            policy_loss, value_loss, policy_entropy, _ = sess.run(
                [pg_loss, vf_loss, entropy_loss, _train], td_map)

            return policy_loss, value_loss, policy_entropy

        def save(save_path):
            """
			Save the model
			"""
            saver = tf.train.Saver()
            saver.save(sess, save_path)

        def load(load_path):
            """
			Load the model
			"""
            saver = tf.train.Saver()
            print('Loading ' + load_path)
            saver.restore(sess, load_path)

        self.train = train
        self.train_model = train_model
        self.step_model = step_model
        self.step = step_model.step
        self.value = step_model.value
        self.initial_state = step_model.initial_state
        self.save = save
        self.load = load
        tf.global_variables_initializer().run(session=sess)
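
Example #5 builds the entropy and -log(pi) by hand from the `availPi` logits, using the max-shift trick for numerical stability (presumably so that unavailable actions, masked with very negative logits, get effectively zero probability). A standalone NumPy sketch of the same computation with illustrative logits:

import numpy as np

avail_logits = np.array([[1.5, -1e9, 0.2],   # -1e9 ~ a masked-out (unavailable) action
                         [0.3, 0.1, -0.4]])
actions = np.array([0, 1])

l0 = avail_logits - avail_logits.max(axis=-1, keepdims=True)  # shift for stability
el0 = np.exp(l0)
p0 = el0 / el0.sum(axis=-1, keepdims=True)                    # softmax over available actions

entropy = -np.sum((p0 + 1e-8) * np.log(p0 + 1e-8), axis=-1)
one_hot = np.eye(avail_logits.shape[-1])[actions]
neglogpac = -np.log(np.sum(p0 * one_hot, axis=-1))
print(entropy, neglogpac)
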
Example #6
    def __init__(self, policy, ob_space, action_space, nenvs, nsteps,
                 ent_coeff, vf_coeff, max_grad_norm):
        sess = tf.get_default_session()

        #Define placeholders
        actions_ = tf.placeholder(tf.int32, [None], name="actions_")
        advantages_ = tf.placeholder(tf.float32, [None], name="advantages_")
        rewards_ = tf.placeholder(tf.float32, [None], name="rewards_")
        lr_ = tf.placeholder(tf.float32, name="learning_rate_")

        #Create our two models here
        #take one step for each environment
        step_model = policy(sess,
                            ob_space,
                            action_space,
                            nenvs,
                            1,
                            reuse=False)
        #take number of steps * number of environments for total steps
        train_model = policy(sess,
                             ob_space,
                             action_space,
                             nenvs * nsteps,
                             nsteps,
                             reuse=True)

        #calculate the loss
        #Note: in the future we can add a clipped loss to control the step size of our parameter updates.
        #This can lead to better convergence (this is what PPO does).
        #Recall that Total Loss = PolicyGradientLoss - Entropy*EntropyCoeff + ValueLoss*ValueCoeff

        #output loss: -log(policy)
        neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=train_model.pi,
            labels=actions_,
        )

        #1/n * sum(A(s,a) * -logpi(a|s))
        pg_loss = tf.reduce_mean(advantages_ * neglogpac)

        #value loss
        vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), rewards_))

        #entropy
        entropy = tf.reduce_mean(train_model.pd.entropy())

        #total loss
        loss = pg_loss - (entropy * ent_coeff) + (vf_loss * vf_coeff)

        #Update the parameters using the loss we've just calculated
        #Grab model params
        params = find_trainable_variables("model")

        #Calculate gradients. We'll want to zip our parameters with our gradients
        grads = tf.gradients(loss, params)

        if max_grad_norm is not None:
            #Clip the gradients (normalize)
            grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
        grads = list(zip(grads, params))

        #build our trainer
        trainer = tf.train.RMSPropOptimizer(learning_rate=lr_,
                                            decay=0.99,
                                            epsilon=1e-5)
        #Backprop
        _train = trainer.apply_gradients(grads)

        def train(states_in, actions, returns, values, lr):
            #here we calculate advantage A(s, a) = R+yV(s') - V(s)
            #Returns = R+yV(S')
            advantages = returns - values

            td_map = {
                train_model.inputs_: states_in,
                actions_: actions,
                advantages_: advantages,
                rewards_: returns,  #Recall we bootstrap the "real" value since we're learning one step at a time (not full episodes)
                lr_: lr
            }

            policy_loss, value_loss, policy_entropy, _ = sess.run(
                [pg_loss, vf_loss, entropy, _train], td_map)
            return policy_loss, value_loss, policy_entropy

        def save(save_path):
            saver = tf.train.Saver()
            saver.save(sess, save_path)

        def load(load_path):
            saver = tf.train.Saver()
            saver.restore(sess, load_path)

        self.train = train
        self.train_model = train_model
        self.step_model = step_model
        self.step = step_model.step
        self.value = step_model.value
        self.initial_state = step_model.initial_state
        self.save = save
        self.load = load
        tf.global_variables_initializer().run(session=sess)
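
The entropy bonus that appears in every example rewards policies that stay stochastic: since the total loss subtracts `entropy * ent_coeff`, a policy that collapses onto one action too early is penalized. A standalone NumPy comparison of two illustrative action distributions:

import numpy as np

def categorical_entropy(p):
    # H(pi) = -sum_a pi(a) * log pi(a)
    return -np.sum(p * np.log(p))

near_deterministic = np.array([0.97, 0.01, 0.01, 0.01])
uniform = np.array([0.25, 0.25, 0.25, 0.25])

# The uniform policy has much higher entropy, so the "- entropy * ent_coeff"
# term in the loss pushes against premature convergence.
print(categorical_entropy(near_deterministic), categorical_entropy(uniform))
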
Example #7
    def __init__(self, policy, ob_space, action_space, nenvs, nsteps, ent_coef,
                 vf_coef, max_grad_norm):

        sess = tf.get_default_session()

        # Here we create the placeholders
        actions_ = tf.placeholder(tf.int32, [None], name="actions_")
        advantages_ = tf.placeholder(tf.float32, [None], name="advantages_")
        rewards_ = tf.placeholder(tf.float32, [None], name="rewards_")
        lr_ = tf.placeholder(tf.float32, name="learning_rate_")

        # Here we create our two models:
        # Step_model that is used for sampling
        step_model = policy(sess, ob_space, action_space, nenvs, 1, reuse=False)

        # Train model for training
        train_model = policy(sess, ob_space, action_space, nenvs*nsteps, nsteps, reuse=True)

        """
        Calculate the loss
        Total loss = Policy gradient loss - entropy * entropy coefficient + Value coefficient * value loss
        """
        # Policy loss
        # Output -log(pi)
        neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi, labels=actions_)

        # 1/n * sum A(si,ai) * -logpi(ai|si)
        pg_loss = tf.reduce_mean(advantages_ * neglogpac)

        # Value loss 1/2 SUM [R - V(s)]^2
        vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf),rewards_))

        # Entropy is used to improve exploration by limiting the premature convergence to suboptimal policy.
        entropy = tf.reduce_mean(train_model.pd.entropy())


        loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef

        # Update parameters using loss
        # 1. Get the model parameters
        params = find_trainable_variables("model")

        # 2. Calculate the gradients
        grads = tf.gradients(loss, params)
        if max_grad_norm is not None:
            # Clip the gradients (normalize)
            grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
        grads = list(zip(grads, params))
        # zip pairs each gradient with its associated parameter
        # For instance zip(ABCD, xyza) => Ax, By, Cz, Da

        # 3. Build our trainer
        trainer = tf.train.RMSPropOptimizer(learning_rate=lr_, decay=0.99, epsilon=1e-5)

        # 4. Backpropagation
        _train = trainer.apply_gradients(grads)

        def train(states_in, actions, returns, values, lr):
            # Here we calculate advantage A(s,a) = R + yV(s') - V(s)
            # Returns = R + yV(s')
            advantages = returns - values

            # We create the feed dictionary
            td_map = {train_model.inputs_: states_in,
                      actions_: actions,
                      advantages_: advantages,  # Used to calculate our policy loss
                      rewards_: returns,  # Used as a bootstrap for the real value
                      lr_: lr}

            policy_loss, value_loss, policy_entropy, _ = sess.run([pg_loss, vf_loss, entropy, _train], td_map)
            
            return policy_loss, value_loss, policy_entropy


        def save(save_path):
            """
            Save the model
            """
            saver = tf.train.Saver()
            saver.save(sess, save_path)

        def load(load_path):
            """
            Load the model
            """
            saver = tf.train.Saver()
            print('Loading ' + load_path)
            saver.restore(sess, load_path)

        self.train = train
        self.train_model = train_model
        self.step_model = step_model
        self.step = step_model.step
        self.value = step_model.value
        self.initial_state = step_model.initial_state
        self.save = save
        self.load = load
        tf.global_variables_initializer().run(session=sess)
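
Finally, the "zip pairs each gradient with its associated parameter" step in all of these examples produces the (gradient, variable) pairs that `apply_gradients` expects, after the gradients have been jointly rescaled by `tf.clip_by_global_norm`. A standalone NumPy sketch of that rescaling and pairing (plain arrays and strings stand in for TF tensors and variables):

import numpy as np

max_grad_norm = 0.5
grads = [np.array([3.0, 4.0]), np.array([0.0, 1.0])]
params = ["conv_w", "fc_w"]   # stand-ins for tf.Variable objects

# Global norm = sqrt of the sum of squared norms of all gradients
global_norm = np.sqrt(sum(np.sum(g ** 2) for g in grads))
if global_norm > max_grad_norm:
    grads = [g * (max_grad_norm / global_norm) for g in grads]

grads_and_vars = list(zip(grads, params))  # [(grad, var), ...] as apply_gradients expects
print(global_norm, grads_and_vars)
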