Example #1
    def loss(self, x, t, u_hat):
        """
        Returns the sum of MSE terms enforcing the 1D heat equation, its initial condition and its boundary conditions.
        """

        # Partial derivatives
        u = u_hat(x, t)
        u_x = tf.gradients(u, x)[0]
        u_xx = tf.gradients(u_x, x)[0]
        u_t = tf.gradients(u, t)[0]

        #Initial and boundary conditions
        n = tf.size(x)
        zeros, ones = tf.zeros([n, 1],
                               dtype=tf.float64), tf.ones([n, 1],
                                                          dtype=tf.float64)
        input_values = tf.cast(tf.reshape(tf.linspace(0.0, 1.0, n), [-1, 1]),
                               dtype=tf.float64)

        u_t0 = u_hat(input_values, zeros)  #t=0
        u_x0 = u_hat(zeros, input_values)  #x=0
        u_x1 = u_hat(ones, input_values)  #x=1

        return (mean_squared_error(u_t, u_xx) +
                mean_squared_error(u_t0, tf.sin(np.pi * input_values)) +
                mean_squared_error(u_x0, zeros) +
                mean_squared_error(u_x1, zeros))
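
Reading the four MSE terms above, the loss enforces the 1D heat equation together with its initial and boundary data (a summary inferred from the code, not stated in the original docstring):

$$
u_t = u_{xx}, \qquad u(x, 0) = \sin(\pi x), \qquad u(0, t) = u(1, t) = 0,
$$

so u_hat is trained until each residual has (approximately) zero mean squared error over the sampled points.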
Example #2
def eigen_ODE_loss(x0, t, x_hat, l=n * m, seed=seed):
    """
    Custom loss function to be used in HeatLearner class.
    ( HeatLearner(custom_loss=eigen_ODE_loss) )

    Uses a loop to compute f(x(t)) because of shape requirements in the f transform.
    x(t) output is shape (n*m, 1).
    x(t=t_m) is shape (n, 1).
    """

    np.random.seed(seed)
    A = np.random.normal(0, 1, (n, n))
    A = (A.T + A) / 2
    A = tf.convert_to_tensor(A, dtype=tf.float64)  # (n, n) symmetric matrix

    x = x_hat(x0, t)  #First forward pass/prediction

    #Loops over each (n, 1) at each time m in (n*m, 1)
    Fx = []
    for i in range(int(l / n)):  #l/n = m
        x_vec = x[n * i:n * (i + 1), 0]  #x vector at time dictated by rows
        x_vec = tf.reshape(x_vec, [-1, 1])  #Shape (n, 1)

        #f(x(t))
        xT = tf.transpose(x_vec)
        m1 = tf.matmul(xT, x_vec) * A
        m2 = (1 - tf.matmul(xT, tf.matmul(A, x_vec)))
        fx = tf.matmul(m1 + m2, x_vec)

        Fx.append(fx)

    fx = tf.reshape(Fx, [-1, 1])  #Reshape to (n*m, 1)
    x_t = tf.gradients(x_hat(x0, t), t)[0]  #Gradient (n*m, 1)

    return mean_squared_error(x_t, fx - x)
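
Equivalently, the returned value is the mean squared residual of the ODE that x_hat is being trained to satisfy, read off the final line above:

$$
\frac{dx}{dt} = f(x(t)) - x(t),
$$

where f is the quadratic transform built from the symmetric matrix A inside the loop.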
Example #3
    def model_fn(features, labels, mode):
        tf.logging.set_verbosity(tf.logging.WARN)
        model = hub.Module(IMG_ENCODER, trainable=True)
        tf.logging.set_verbosity(tf.logging.INFO)
        model = model(features['x'])
        regularizer = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
        # Three stacked projection layers on top of the image encoder
        output = tf.layers.dense(model,
                                 VEC_SPACE_DIMENSIONS,
                                 activation=tf.nn.relu)
        output = tf.layers.dense(output,
                                 VEC_SPACE_DIMENSIONS,
                                 activation=tf.nn.relu)
        output = tf.layers.dense(output,
                                 VEC_SPACE_DIMENSIONS,
                                 activation=tf.nn.tanh)

        if mode == ModeKeys.TRAIN or mode == ModeKeys.EVAL:
            loss = mean_squared_error(labels, output)
            regularizer = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
            loss = loss + 0.25 * sum(regularizer)
        if mode == ModeKeys.TRAIN:
            train_op = AdamOptimizer(learning_rate=0.00001).minimize(
                loss=loss, global_step=get_global_step())
            return EstimatorSpec(mode=mode, loss=loss, train_op=train_op)
        elif mode == ModeKeys.EVAL:
            eval_metric_ops = {
                'accuracy': tf.metrics.mean_cosine_distance(labels, output, 0)
            }
            return EstimatorSpec(mode=mode,
                                 loss=loss,
                                 eval_metric_ops=eval_metric_ops)
        elif mode == ModeKeys.PREDICT:
            return tf.estimator.EstimatorSpec(mode=mode, predictions=output)
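
For context, a model_fn like this is normally handed to a TF1 Estimator. The sketch below shows the typical wiring; the input key 'x' matches features['x'] above, while the image shape, target dimension and model_dir are illustrative placeholders, not values from the original project:

import numpy as np
import tensorflow as tf

estimator = tf.estimator.Estimator(model_fn=model_fn, model_dir='/tmp/img_encoder')

# The model_fn reads its input from features['x'], so the input_fn must provide that key.
train_input_fn = tf.estimator.inputs.numpy_input_fn(
    x={'x': np.random.rand(32, 224, 224, 3).astype(np.float32)},  # placeholder images
    y=np.random.rand(32, 128).astype(np.float32),                 # placeholder target embeddings
    batch_size=8,
    num_epochs=None,
    shuffle=True)

estimator.train(input_fn=train_input_fn, steps=100)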
Example #4
train_ds = Dataset.from_tensor_slices(
    (x_train, y_train)).shuffle(n_train_samples).batch(batch_size).repeat()
test_ds = Dataset.from_tensor_slices((x_test, y_test))

for b, item in enumerate(train_ds):
    print(b, item)
    if b + 1 == math.ceil(n_train_samples / batch_size):
        break


def create_model():
    model = Sequential()
    model.add(Dense(4, activation='tanh', input_dim=1))
    model.add(Dense(1))
    return model


eager_model = create_model()
optimizer = GradientDescentOptimizer(0.1)

for e in range(n_epochs):
    for b, (x, y) in enumerate(train_ds):
        with tf.GradientTape() as tape:
            pred = eager_model(x)  # call the model directly so the tape records the forward pass
            loss_value = mean_squared_error(tf.reshape(y, (-1, 1)), pred)
        grads = tape.gradient(loss_value, eager_model.variables)
        optimizer.apply_gradients(zip(grads, eager_model.variables))
        print(loss_value)

    print(e)
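
For comparison, the same network can be trained without a manual loop through the standard Keras API; this is a minimal sketch assuming the same train_ds / n_epochs / batch_size setup, not part of the original snippet:

keras_model = create_model()
keras_model.compile(optimizer='sgd', loss='mse')

# steps_per_epoch mirrors the manual break condition used on the repeated dataset above.
keras_model.fit(train_ds,
                epochs=n_epochs,
                steps_per_epoch=math.ceil(n_train_samples / batch_size))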
Example #5
File: a2c.py Project: yoniosin/A2C
    def __init__(self,
                 policy,
                 env,
                 nsteps,
                 ent_coef=0.01,
                 vf_coef=0.5,
                 max_grad_norm=0.5,
                 lr=7e-4,
                 alpha=0.99,
                 epsilon=1e-5,
                 total_timesteps=int(80e6),
                 lrschedule='linear',
                 network='cnn',
                 prio_args=None):

        self.prio_args = prio_args
        sess = tf_util.get_session()
        nenvs = self.get_active_envs(env)

        nbatch = nenvs * nsteps

        with tf.variable_scope('a2c_model', reuse=tf.AUTO_REUSE):
            # step_model is used for sampling
            step_model = policy(nenvs, 1, sess)

            # train_model is used to train our network
            train_model = policy(nbatch, nsteps, sess)
            # our TD evaluating network

        A = tf.placeholder(train_model.action.dtype, train_model.action.shape)
        ADV = tf.placeholder(tf.float32, [nbatch])
        R = tf.placeholder(tf.float32, [nbatch])
        LR = tf.placeholder(tf.float32, [])

        # Calculate the loss
        # Total loss = Policy gradient loss - entropy * entropy coefficient + Value coefficient * value loss

        # Policy loss
        neglogpac = train_model.pd.neglogp(A)
        # L = A(s,a) * -logpi(a|s)
        pg_loss = tf.reduce_mean(ADV * neglogpac)

        # Entropy is used to improve exploration by limiting the premature convergence to suboptimal policy.
        entropy = tf.reduce_mean(train_model.pd.entropy())

        # Value loss
        vf_loss = losses.mean_squared_error(tf.squeeze(train_model.vf), R)

        # TD loss
        # td_loss = losses.mean_squared_error(tf.squeeze(train_model.dt), TD)

        loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef
        """prio model"""
        with tf.variable_scope('a2c_model_prio', reuse=tf.AUTO_REUSE):
            # prio_model = policy(nbatch, nsteps, sess)
            prio_model = MyNN(env, nbatch, network)

        P_R = tf.placeholder(tf.float32, [nbatch])
        PRIO = tf.placeholder(tf.float32, [nbatch])
        P_LR = tf.placeholder(tf.float32, [])

        # prio_model_loss = losses.mean_squared_error(tf.squeeze(prio_model.out), P_R) # Reward
        prio_model_loss = losses.mean_squared_error(tf.squeeze(prio_model.out),
                                                    PRIO)  # TD Error
        # Update parameters using loss
        # 1. Get the model parameters
        params = find_trainable_variables("a2c_model")
        params_prio = find_trainable_variables("a2c_model_prio")

        # 2. Calculate the gradients
        grads = tf.gradients(loss, params)
        prio_grads = tf.gradients(prio_model_loss, params_prio)
        if max_grad_norm is not None:
            # Clip the gradients (normalize)
            grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
            prio_grads, prio_grad_norm = tf.clip_by_global_norm(
                prio_grads, max_grad_norm)
        grads = list(zip(grads, params))
        prio_grads = list(zip(prio_grads, params_prio))
        # zip aggregates each gradient with its associated parameter
        # For instance zip(ABCD, xyza) => Ax, By, Cz, Da

        # 3. Make op for one policy and value update step of A2C
        trainer = tf.train.RMSPropOptimizer(learning_rate=LR,
                                            decay=alpha,
                                            epsilon=epsilon)
        prio_trainer = tf.train.RMSPropOptimizer(learning_rate=P_LR,
                                                 decay=alpha,
                                                 epsilon=epsilon)

        _train = trainer.apply_gradients(grads)
        _prio_train = prio_trainer.apply_gradients(prio_grads)

        lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)

        def train(obs, states, rewards, masks, actions, values):
            # Here we calculate advantage A(s,a) = R + yV(s') - V(s)
            # rewards = R + yV(s')
            advs = rewards - values
            for step in range(len(obs)):
                cur_lr = lr.value()

            td_map = {
                train_model.X: obs,
                A: actions,
                ADV: advs,
                R: rewards,
                LR: cur_lr
            }
            if states is not None:
                td_map[train_model.S] = states
                td_map[train_model.M] = masks

            policy_loss, value_loss, policy_entropy, _ = sess.run(
                [pg_loss, vf_loss, entropy, _train], td_map)

            prio_loss = 0
            if self.prio_args is not None:
                prio_values = GetValuesForPrio(self.prio_args['prio_type'],
                                               self.prio_args['prio_param'],
                                               advs, rewards)
                prio_td_map = {
                    prio_model.X: obs,
                    P_R: rewards,
                    P_LR: cur_lr,
                    PRIO: prio_values
                }

                prio_loss, _, p_td = sess.run(
                    [prio_model_loss, _prio_train, PRIO], prio_td_map)
                # mb arranged as a 1D vector = [[env_1: n1, ..., n_nstep], ..., [env_n_active]]
                # need to take last value of each env's buffer
                self.prio_score = prio_values[list(
                    filter(lambda x: x % nsteps == (nsteps - 1),
                           range(len(prio_values))))]
            return policy_loss, value_loss, policy_entropy, prio_loss

        self.train = train
        self.train_model = train_model
        self.step_model = step_model
        self.prio_model = prio_model
        self.step = step_model.step
        self.value = step_model.value
        self.initial_state = step_model.initial_state
        self.save = functools.partial(tf_util.save_variables, sess=sess)
        self.load = functools.partial(tf_util.load_variables, sess=sess)
        tf.global_variables_initializer().run(session=sess)
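
For reference, the scalar minimized here (and in the A2C variants in the following examples) is the objective spelled out in the comments above:

$$
\mathcal{L} = \mathbb{E}\big[A(s,a)\,(-\log \pi(a \mid s))\big]
\;-\; c_{\mathrm{ent}}\,\mathbb{E}\big[\mathcal{H}(\pi(\cdot \mid s))\big]
\;+\; c_{\mathrm{vf}}\,\mathbb{E}\big[(V(s) - R)^2\big],
$$

where c_ent = ent_coef, c_vf = vf_coef, and the advantage is estimated inside train() as advs = rewards - values.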
Example #6
    def __init__(self,
                 policy,
                 env,
                 nsteps,
                 ent_coef=0.01,
                 vf_coef=0.5,
                 r0_coef=0.05,
                 max_grad_norm=0.5,
                 lr=7e-4,
                 alpha=0.99,
                 epsilon=1e-5,
                 total_timesteps=int(80e6),
                 lrschedule='linear',
                 head=-1,
                 step_placeholder=None,
                 train_placeholder=None,
                 encoded_x_1=None,
                 encoded_x_2=None):

        sess = tf_util.get_session()
        nenvs = env.num_envs
        nbatch = nenvs * nsteps
        self.step_placeholder = step_placeholder
        self.train_placeholder = train_placeholder
        self.encoded_x_1 = encoded_x_1
        self.encoded_x_2 = encoded_x_2
        with tf.variable_scope('aux_model' + str(head), reuse=tf.AUTO_REUSE):
            step_model, self.step_placeholder, self.encoded_x_1 = policy(
                nenvs,
                1,
                sess,
                observ_placeholder=self.step_placeholder,
                encoded_x=self.encoded_x_1)
            train_model, self.train_placeholder, self.encoded_x_2 = policy(
                nbatch,
                nsteps,
                sess,
                observ_placeholder=self.train_placeholder,
                encoded_x=self.encoded_x_2)

            A = tf.placeholder(train_model.action.dtype,
                               train_model.action.shape)
            ADV = tf.placeholder(tf.float32, [nbatch])
            R = tf.placeholder(tf.float32, [nbatch])
            LR = tf.placeholder(tf.float32, [])

            neglogpac = train_model.pd.neglogp(A)
            entropy = tf.reduce_mean(train_model.pd.entropy())

            pg_loss = tf.reduce_mean(ADV * neglogpac)
            print(train_model.vf)
            print(R)
            vf_loss = losses.mean_squared_error(tf.squeeze(train_model.vf), R)
            #r0_loss = losses.mean_squared_error(tf.squeeze(train_model.r), R)

            loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef

            params = find_trainable_variables('aux_model' + str(head))
            grads = tf.gradients(loss, params)
            if max_grad_norm is not None:
                grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
            grads = list(zip(grads, params))
            #print("gradiants to update: ", grads)
            trainer = tf.train.RMSPropOptimizer(learning_rate=LR,
                                                decay=alpha,
                                                epsilon=epsilon)
            _train = trainer.apply_gradients(grads)

            with tf.name_scope('summaries'):
                a_r = tf.summary.scalar('avg_reward', tf.reduce_mean(R))
                #a_p_l = tf.summary.scalar('avg_pg_loss', tf.reduce_mean(pg_loss))
                #a_v_l = tf.summary.scalar('avg_vf_loss', tf.reduce_mean(vf_loss))
                #a_l = tf.summary.scalar('avg_loss', tf.reduce_mean(loss))
                #merged = tf.summary.merge([a_r, a_p_l, a_v_l, a_l])
                merged = tf.summary.merge([a_r])

        lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)

        def train(obs, states, rewards, masks, actions, values):
            # The policy has already been sampled; we now update the critic.
            advs = rewards - values  # For a set of (s, a) pairs we get (r0 - v0, r1 - v1, ...)
            #print("advs: ", advs)
            for step in range(len(obs)):
                cur_lr = lr.value()

            td_map = {
                train_model.X: obs,
                A: actions,
                ADV: advs,
                R: rewards,
                LR: cur_lr
            }
            #print(td_map)
            # we train our model with the observed actions, advs, rewards and cur_lr
            # how are values and rewards calculated though? These cannot be sampled from a single state.
            if states is not None:
                td_map[train_model.S] = states
                td_map[train_model.M] = masks

            policy_loss, value_loss, policy_entropy, summary, _ = sess.run(
                [pg_loss, vf_loss, entropy, merged, _train], td_map)

            return policy_loss, value_loss, policy_entropy, summary

        self.train = train
        self.train_model = train_model
        self.step_model = step_model
        self.step = step_model.step
        self.value = step_model.value
        self.initial_state = step_model.initial_state
        self.save = functools.partial(tf_util.save_variables, sess=sess)
        self.load = functools.partial(tf_util.load_variables, sess=sess)

        self.train_writer = tf.summary.FileWriter('logs/aux/' + str(head),
                                                  sess.graph)

        tf.global_variables_initializer().run(session=sess)
Example #7
    def __init__(self, policy, env, nsteps, icm, idf,
                 ent_coef=0.01, vf_coef=0.5, max_grad_norm=0.5, lr=7e-4,
                 alpha=0.99, epsilon=1e-5, total_timesteps=int(80e6), lrschedule='linear'):

        sess = tf_util.get_session()
        nenvs = env.num_envs
        nbatch = nenvs * nsteps

        self.idf = idf

        print("This is Icm in Model Init function ", type(icm))

        with tf.variable_scope('a2c_model', reuse=tf.AUTO_REUSE):
            # step_model is used for sampling
            step_model = policy(nenvs, 1, sess)

            # train_model is used to train our network
            train_model = policy(nbatch, nsteps, sess)

        A = tf.placeholder(train_model.action.dtype, train_model.action.shape)
        ADV = tf.placeholder(tf.float32, [nbatch])
        R = tf.placeholder(tf.float32, [nbatch])
        LR = tf.placeholder(tf.float32, [])

        # Calculate the loss
        # Total loss = Policy gradient loss - entropy * entropy coefficient + Value coefficient * value loss

        # Policy loss
        neglogpac = train_model.pd.neglogp(A)
        # L = A(s,a) * -logpi(a|s)
        pg_loss = tf.reduce_mean(ADV * neglogpac)

        # Entropy is used to improve exploration by limiting the premature convergence to suboptimal policy.
        entropy = tf.reduce_mean(train_model.pd.entropy())

        # Value loss
        vf_loss = losses.mean_squared_error(tf.squeeze(train_model.vf), R)

        loss = pg_loss - entropy*ent_coef + vf_loss * vf_coef

        # Update parameters using loss
        # 1. Get the model parameters
        params = find_trainable_variables("a2c_model")

        # 2. Calculate the gradients
        grads = tf.gradients(loss, params)
        if max_grad_norm is not None:
            # Clip the gradients (normalize)
            grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
        grads = list(zip(grads, params))

        if icm is not None:
            grads = grads + icm.pred_grads_and_vars
            # print("Gradients added")
            # print("independently their shapes were a2c: {} icm: {} and together: {}".format(
            #     np.shape(grads), np.shape(icm.pred_grads_and_vars), np.shape(grads_and_vars)))
        # zip aggregates each gradient with its associated parameter
        # For instance zip(ABCD, xyza) => Ax, By, Cz, Da

        # 3. Make op for one policy and value update step of A2C
        trainer = tf.train.RMSPropOptimizer(learning_rate=LR, decay=alpha, epsilon=epsilon)

        _train = trainer.apply_gradients(grads)

        lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)

        def train(obs, states, rewards, masks, actions, values, next_obs):
            # (other arguments considered: icm_rewards, cumulative_discounted_icm, new_rew)
            # Here we calculate advantage A(s,a) = R + yV(s') - V(s)
            # rewards = R + yV(s')
            # print("icm called in train function", type(icm))
            advs = rewards - values
            # print("Now the advantage", advs)

            # Alternative: advantage from ICM rewards
            # icm_adv = icm_rewards - values
            # m, s = get_mean_and_std(icm_adv)
            # advs = (icm_adv - m) / (s + 1e-7)
            # icm_adv = (icm_adv - icm_adv.mean()) / (  + 1e-7)
            # print("icm advantage", icm_adv)

            # Alternative: advantage normalization
            # m, s = get_mean_and_std(advs)
            # advs = (advs - m) / (s + 1e-7)

            # Alternative: advantage from reshaped rewards
            # advs = new_rew - values
            # print("Advantage:", advs)

            # print("On train shapes are")
            # print("obs {} states {} rewards {} masks {} actions {} values {}".format(
            #     np.shape(obs), np.shape(states), np.shape(rewards), np.shape(masks),
            #     np.shape(actions), np.shape(values)))
            # print("Received Advantage {} rewards {} values {}".format(advs, rewards, values))
            # print("advs {}, rewards shape {}, values {}".format(
            #     np.shape(advs), np.shape(rewards), np.shape(values)))

            for step in range(len(obs)):
                cur_lr = lr.value()

            if icm is None:
                td_map = {train_model.X: obs, A: actions, ADV: advs, R: rewards, LR: cur_lr}
            else:
                # print("curiosity td_map")
                # print("obs {}, next obs {}, actions {}".format(
                #     np.shape(obs), np.shape(next_obs), np.shape(actions)))
                td_map = {train_model.X: obs, A: actions, ADV: advs, R: rewards, LR: cur_lr,
                          icm.state_: obs, icm.next_state_: next_obs,
                          icm.action_: actions}  # , icm.R: rewards

            if icm is None:
                if states is not None:
                    td_map[train_model.S] = states
                    td_map[train_model.M] = masks

                policy_loss, value_loss, policy_entropy, _ = sess.run(
                    [pg_loss, vf_loss, entropy, _train], td_map)
                return policy_loss, value_loss, policy_entropy
            else:
                if states is not None:
                    td_map[train_model.S] = states
                    td_map[train_model.M] = masks
                if self.idf:
                    policy_loss, value_loss, policy_entropy, forward_loss, inverse_loss, icm_loss, _ = sess.run(
                        [pg_loss, vf_loss, entropy, icm.forw_loss, icm.inv_loss, icm.icm_loss, _train],
                        td_map)
                    return policy_loss, value_loss, policy_entropy, forward_loss, inverse_loss, icm_loss, advs
                else:
                    policy_loss, value_loss, policy_entropy, forward_loss, icm_loss, _ = sess.run(
                        [pg_loss, vf_loss, entropy, icm.forw_loss, icm.icm_loss, _train],
                        td_map)
                    return policy_loss, value_loss, policy_entropy, forward_loss, 0.0, icm_loss, advs

        self.train = train
        self.train_model = train_model
        self.step_model = step_model
        self.step = step_model.step
        self.value = step_model.value
        self.initial_state = step_model.initial_state
        self.save = functools.partial(tf_util.save_variables, sess=sess)
        self.load = functools.partial(tf_util.load_variables, sess=sess)
        tf.global_variables_initializer().run(session=sess)
Example #8
# +
from tensorflow.contrib.layers import xavier_initializer
from tensorflow.losses import mean_squared_error
from tensorflow.train import AdamOptimizer

tf.reset_default_graph()

X_data = tf.placeholder(tf.float32, shape=[None, x_vals.shape[1]])
y_target = tf.placeholder(tf.float32, shape=[None, 1])

W = tf.get_variable(shape=[x_vals.shape[1], 1], name="W", initializer=xavier_initializer())
b = tf.get_variable(shape=[1, 1], name="b", initializer=xavier_initializer())

output = tf.matmul(X_data, W) - b
l2_norm = mean_squared_error(output, y_target)
# -

# $$ Loss = \max(0, 1 - \hat{y}^{(i)} \cdot y^{(i)}) + \alpha \, ||X \cdot W - b||^2 $$

loss = tf.reduce_mean(tf.maximum(0., 1. - output * y_target)) + 0.01 * l2_norm
optimizer = AdamOptimizer(0.01).minimize(loss)

# +
batch_size = 1024

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for i in range(20000):
        rand_index = np.random.choice(len(X_train), size=batch_size)
        rand_x = X_train[rand_index]
Example #9
    def __init__(self,
                 policy,
                 env,
                 nsteps,
                 ent_coef=0.01,
                 vf_coef=0.5,
                 max_grad_norm=0.5,
                 lr=7e-4,
                 alpha=0.99,
                 epsilon=1e-5,
                 total_timesteps=int(80e6),
                 lrschedule='linear'):

        sess = tf_util.get_session()
        nenvs = env.num_envs
        nbatch = nenvs * nsteps

        with tf.variable_scope('a2c_model', reuse=tf.AUTO_REUSE):
            step_model = policy(nenvs, 1, sess)
            train_model = policy(nbatch, nsteps, sess)

        A = tf.placeholder(train_model.action.dtype, train_model.action.shape)
        ADV = tf.placeholder(tf.float32, [nbatch])
        R = tf.placeholder(tf.float32, [nbatch])
        LR = tf.placeholder(tf.float32, [])

        neglogpac = train_model.pd.neglogp(A)
        entropy = tf.reduce_mean(train_model.pd.entropy())

        pg_loss = tf.reduce_mean(ADV * neglogpac)
        vf_loss = losses.mean_squared_error(tf.squeeze(train_model.vf), R)

        loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef

        params = find_trainable_variables("a2c_model")
        grads = tf.gradients(loss, params)
        if max_grad_norm is not None:
            grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
        grads = list(zip(grads, params))
        trainer = tf.train.RMSPropOptimizer(learning_rate=LR,
                                            decay=alpha,
                                            epsilon=epsilon)
        _train = trainer.apply_gradients(grads)

        lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)

        def train(obs, states, rewards, masks, actions, values):
            advs = rewards - values
            for step in range(len(obs)):
                cur_lr = lr.value()

            td_map = {
                train_model.X: obs,
                A: actions,
                ADV: advs,
                R: rewards,
                LR: cur_lr
            }
            if states is not None:
                td_map[train_model.S] = states
                td_map[train_model.M] = masks
            policy_loss, value_loss, policy_entropy, _ = sess.run(
                [pg_loss, vf_loss, entropy, _train], td_map)
            return policy_loss, value_loss, policy_entropy

        self.train = train
        self.train_model = train_model
        self.step_model = step_model
        self.step = step_model.step
        self.value = step_model.value
        self.initial_state = step_model.initial_state
        self.save = functools.partial(tf_util.save_variables, sess=sess)
        self.load = functools.partial(tf_util.load_variables, sess=sess)
        tf.global_variables_initializer().run(session=sess)
Example #10
def _mse(y_true: Tensor, y_pred: Tensor) -> Tensor:
    """均方误差损失(losses.mse())

    :param y_true: shape = (N, In), float32
    :param y_pred: shape = (N, In), float32
    :return: shape = (N,), float32"""

    return tf.reduce_mean((y_true - y_pred)**2, axis=-1)


tf.random.set_seed(0)
y_true = tf.random.normal((16, 10))
y_pred = tf.random.normal((16, 10))
print(losses.mse(y_true, y_pred))
print(losses.mean_squared_error(y_true, y_pred))  # same result as above
print(_mse(y_true, y_pred))


# tf.Tensor(
# [1.1952267 3.4243941 2.1024227 2.3010921 2.3643446 1.8302895 1.6360563
#  3.5714912 2.3740485 4.2296114 1.4224513 4.019039  0.7188259 1.5340036
#  1.5875269 2.435854 ], shape=(16,), dtype=float32)
# tf.Tensor(
# [1.1952267 3.4243941 2.1024227 2.3010921 2.3643446 1.8302895 1.6360563
#  3.5714912 2.3740485 4.2296114 1.4224513 4.019039  0.7188259 1.5340036
#  1.5875269 2.435854 ], shape=(16,), dtype=float32)
# tf.Tensor(
# [1.1952267 3.4243941 2.1024227 2.301092  2.3643446 1.8302895 1.6360562
#  3.5714912 2.3740482 4.2296114 1.4224513 4.019039  0.7188258 1.5340036
#  1.587527  2.435854 ], shape=(16,), dtype=float32)
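
The same per-sample values can be reproduced outside TensorFlow; a quick NumPy cross-check of the definition (an illustrative addition, not part of the original snippet):

import numpy as np

# Mean of the squared difference over the last axis, matching _mse above.
np_mse = np.mean((y_true.numpy() - y_pred.numpy()) ** 2, axis=-1)
print(np_mse.shape)  # (16,)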
Example #11
    def __init__(self,
                 policy,
                 env,
                 nsteps,
                 ent_coef=0.01,
                 vf_coef=0.5,
                 max_grad_norm=0.5,
                 lr=7e-4,
                 alpha=0.99,
                 epsilon=1e-5,
                 diverse_r_coef=0.1,
                 gamma=0.99,
                 total_timesteps=int(80e6),
                 lrschedule='linear'):
        sess = tf_util.get_session()
        nenvs = env.num_envs
        nbatch = nenvs * nsteps

        with tf.variable_scope('vfo_model', reuse=tf.AUTO_REUSE):
            step_model = policy(nbatch=nenvs, nsteps=1, sess=sess)
            train_model = policy(nbatch=nbatch, nsteps=nsteps, sess=sess)

        A = tf.placeholder(train_model.action.dtype, train_model.action.shape)
        ADV = tf.placeholder(tf.float32, [nbatch])
        R = tf.placeholder(tf.float32, [nbatch])
        LR = tf.placeholder(tf.float32, [])
        params = find_trainable_variables('vfo_model')
        print(params)

        # ==============================
        # model-free actor-critic loss
        # ==============================
        with tf.variable_scope('mf_loss'):
            neglogpac = train_model.pd.neglogp(A)
            entropy = tf.reduce_mean(train_model.pd.entropy())

            pg_loss = tf.reduce_mean(ADV * neglogpac)
            vf_loss = losses.mean_squared_error(tf.squeeze(train_model.vf), R)

            loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef

            grads = tf.gradients(loss, params)
            if max_grad_norm is not None:
                grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
            grads = list(zip(grads, params))

        trainer = tf.train.RMSPropOptimizer(learning_rate=LR,
                                            decay=alpha,
                                            epsilon=epsilon)
        _train = trainer.apply_gradients(grads)

        # ==============================
        # diverse options policy loss
        # ==============================
        option_train_ops = []
        option_losses = []
        option_losses_names = []
        option_distil_train_op = None
        with tf.variable_scope('options_loss'):
            diversity_reward = -1 * tf.nn.softmax_cross_entropy_with_logits_v2(
                labels=train_model.op_z,
                logits=train_model.option_discriminator)
            diversity_reward = tf.check_numerics(
                diversity_reward, 'Check numerics (1): diversity_reward')
            diversity_reward -= tf.log(
                tf.reduce_sum(train_model.prior_op_z * train_model.op_z) +
                1e-6)
            print('d_reward:', diversity_reward.get_shape().as_list())

            intrinsic_reward = tf.multiply(
                train_model.next_pvfs - train_model.pvfs, train_model.op_z)
            intrinsic_reward = tf.reduce_sum(intrinsic_reward, 1)
            print('i_reward:', intrinsic_reward.get_shape().as_list())
            reward = diverse_r_coef * diversity_reward + intrinsic_reward

            with tf.variable_scope('critic'):
                next_vf = tf.reduce_sum(
                    tf.multiply(train_model.next_pvfs, train_model.op_z), 1)
                print('next_vf:', next_vf.get_shape().as_list())
                option_q_y = tf.stop_gradient(reward +
                                              (1 - train_model.dones) * gamma *
                                              next_vf)
                option_q = tf.squeeze(train_model.option_q, 1)
                print('option_q_y:', option_q_y.get_shape().as_list())
                print('option_q:', option_q.get_shape().as_list())

                option_q_loss = 0.5 * tf.reduce_mean(
                    (option_q_y - option_q)**2)

            with tf.variable_scope('actor'):
                log_op_pi_t = train_model.option_pd.logp(A)
                log_target_t = tf.squeeze(train_model.option_q, 1)
                pvf = tf.reduce_sum(
                    tf.multiply(train_model.pvfs, train_model.op_z), 1)
                print('op_pi:', log_op_pi_t.get_shape().as_list())
                print('op_t:', log_target_t.get_shape().as_list())
                print('pvf:', pvf.get_shape().as_list())
                kl_surrogate_loss = tf.reduce_mean(
                    log_op_pi_t *
                    tf.stop_gradient(log_op_pi_t - log_target_t - pvf))

            with tf.variable_scope('discriminator'):
                print('op_z:', train_model.op_z.get_shape().as_list())
                print('op_dis:',
                      train_model.option_discriminator.get_shape().as_list())
                discriminator_loss = tf.reduce_mean(
                    tf.nn.softmax_cross_entropy_with_logits_v2(
                        labels=train_model.op_z,
                        logits=train_model.option_discriminator_logits))

            with tf.variable_scope('distillation'):
                # NOTE: to train distillation, op_z should be fed with q(z|s)
                print('mf_pi:', train_model.pi.get_shape().as_list())
                print('op_pi:', train_model.option_pi.get_shape().as_list())
                distillation_loss = losses.mean_squared_error(
                    tf.stop_gradient(train_model.pi), train_model.option_pi)

        _train_option_q = tf.train.AdamOptimizer(lr).minimize(
            loss=option_q_loss, var_list=params)
        option_train_ops.append(_train_option_q)
        option_losses.append(option_q_loss)
        option_losses_names.append('option_critic')

        _train_option_policy = tf.train.AdamOptimizer(lr).minimize(
            loss=kl_surrogate_loss, var_list=params)
        option_train_ops.append(_train_option_policy)
        option_losses.append(kl_surrogate_loss)
        option_losses_names.append('option_actor')

        _train_option_disc = tf.train.AdamOptimizer(lr).minimize(
            loss=discriminator_loss, var_list=params)
        option_train_ops.append(_train_option_disc)
        option_losses.append(discriminator_loss)
        option_losses_names.append('option_discriminator')

        option_distil_train_op = tf.train.AdamOptimizer(lr).minimize(
            loss=distillation_loss, var_list=params)

        tf.summary.FileWriter(logger.get_dir(), sess.graph)

        lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)

        def train(obs, states, rewards, masks, actions, values):
            advs = rewards - values
            for step in range(len(obs)):
                cur_lr = lr.value()

            td_map = {
                train_model.X: obs,
                A: actions,
                ADV: advs,
                R: rewards,
                LR: cur_lr
            }
            if states is not None:
                td_map[train_model.S] = states
                td_map[train_model.M] = masks
            policy_loss, value_loss, policy_entropy, _ = sess.run(
                [pg_loss, vf_loss, entropy, _train], td_map)
            return policy_loss, value_loss, policy_entropy

        def train_options(obs, next_obs, states, next_states, masks,
                          next_masks, actions, actions_full, dones, options_z):
            feed = {
                train_model.X: obs,
                train_model.X_next: next_obs,
                A: actions,
                train_model.ac: actions_full,
                train_model.dones: dones,
                train_model.op_z: options_z
            }
            if states is not None:
                feed[train_model.S] = states
                feed[train_model.next_S] = next_states
                feed[train_model.M] = masks
                feed[train_model.next_M] = next_masks

            record_loss_values = []
            for name, loss, train_op in zip(option_losses_names, option_losses,
                                            option_train_ops):
                loss_value, _ = sess.run([loss, train_op], feed)
                record_loss_values.append((name + '_loss', loss_value))

            return record_loss_values

        def distill_mf_to_options(obs, states, masks):
            feed = {train_model.X: obs}
            if states is not None:
                feed[train_model.S] = states
                feed[train_model.M] = masks

            option_ensembles = sess.run(train_model.option_discriminator, feed)
            feed[train_model.op_z] = option_ensembles
            distillation_loss_value, _ = sess.run(
                [distillation_loss, option_distil_train_op], feed)

            return distillation_loss_value

        self.train = train
        self.train_options = train_options
        self.distill_mf_to_options = distill_mf_to_options
        self.train_model = train_model
        self.prior_op_z = train_model.prior_op_z
        self.step_model = step_model
        self.step = step_model.step
        self.option_step = step_model.option_step
        self.option_select = step_model.option_select
        self.selective_option_step = step_model.selective_option_step
        self.value = step_model.value
        self.proto_value = step_model.proto_value
        self.initial_state = step_model.initial_state
        self.save = functools.partial(tf_util.save_variables, sess=sess)
        self.load = functools.partial(tf_util.load_variables, sess=sess)
        tf.global_variables_initializer().run(session=sess)
Example #12
    def __init__(
            self,
            ob_size,  # dimension of observation vector
            act_size,  # dimension of action vector
            latents,  # network hidden layer sizes
            learning_rate=1e-5,  # learning rate
            activation='relu',  # activation function
            optimizer='adam',  # optimization function
            vf_coef=0.1,  # vf_loss weight
            ent_coef=0.01,  # ent_loss weight
                 max_grad_norm=0.5):  # gradient norm clipping threshold

        sess = tf_util.get_session()

        activation = tf_util.get_activation(activation)
        optimizer = tf_util.get_optimizer(optimizer)

        # learning_rate = tf.train.polynomial_decay(
        #     learning_rate=learning_rate,
        #     global_step=tf.train.get_or_create_global_step(),
        #     decay_steps=total_epoches,
        #     end_learning_rate=learning_rate / 10,
        # )

        # placeholders for use
        X = tf.placeholder(tf.float32, [None, None, ob_size], 'observation')
        A = tf.placeholder(tf.int32, [None], 'action')
        ADV = tf.placeholder(tf.float32, [None], 'advantage')
        R = tf.placeholder(tf.float32, [None], 'reward')

        with tf.variable_scope('a2c'):
            policy = build_policy(
                observations=X,
                act_size=act_size,
                latents=latents,
                vf_latents=latents,
                activation=activation
            )

        # Calculate the loss
        # Total loss = Policy gradient loss - entropy * entropy coefficient + Value coefficient * value loss

        # Policy loss
        neglogpac = policy.neglogp(A)
        # L = A(s,a) * -logpi(a|s)
        pg_loss = tf.reduce_mean(ADV * neglogpac)

        # Entropy is used to improve exploration by limiting the premature convergence to suboptimal policy.
        entropy = tf.reduce_mean(policy.entropy())

        # Value loss
        vf_loss = losses.mean_squared_error(R, policy.vf)

        loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef

        # gradients and optimizer
        params = tf.trainable_variables('a2c')
        grads = tf.gradients(loss, params)
        if max_grad_norm is not None:
            # Clip the gradients (normalize)
            grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
        grads = list(zip(grads, params))

        # 3. Make op for one policy and value update step of A2C
        trainer = optimizer(learning_rate=learning_rate)

        _train = trainer.apply_gradients(grads)

        # Add ops to save and restore all the variables.
        saver = tf.train.Saver(tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='a2c'))

        def step(obs):
            action, value = sess.run([policy.action, policy.vf], feed_dict={
                X: obs
            })
            return action, value

        def value(obs):
            return sess.run(policy.vf, feed_dict={
                X: obs
            })

        def debug_output(obs):
            """
            This function is only for debugging
            """
            return sess.run([policy.logits, policy.latent, policy.vf_latent], feed_dict={
                X: obs
            })

        def train(obs, actions, rewards, values):
            # Here we calculate advantage A(s,a) = R + yV(s') - V(s)
            # rewards = R + yV(s')
            advs = rewards - values

            td_map = {X:obs, A:actions, ADV:advs, R:rewards}
            policy_loss, value_loss, policy_entropy, _ = sess.run(
                [pg_loss, vf_loss, entropy, _train],
                td_map
            )

            return policy_loss, value_loss, policy_entropy

        def save(save_path):
            saver.save(sess, save_path)
            print(f'Model saved to {save_path}')

        def load(load_path):
            saver.restore(sess, load_path)
            print(f'Model restored from {load_path}')


        self.train = train
        self.step = step
        self.value = value
        self.save = save
        self.load = load

        # for debug
        self.debug_output = debug_output

        tf.global_variables_initializer().run(session=sess)
Example #13
    def __init__(
            self,
            policy,
            env,
            nsteps,
            ent_coef=0.01,  # entropy coefficient
            vf_coef=0.5,  # value coefficient
            max_grad_norm=0.5,
            lr=7e-4,
            alpha=0.99,  # decay rate
            epsilon=1e-5,
            total_timesteps=int(80e6),
            lrschedule='linear'):

        sess = tf_util.get_session()
        nenvs = env.num_envs
        nbatch = nenvs * nsteps

        with tf.variable_scope('a2c_model', reuse=tf.AUTO_REUSE):
            # step_model is used for sampling
            step_model = policy(nenvs, 1, sess)

            # train_model is used to train our network
            train_model = policy(nbatch, nsteps, sess)

        A = tf.placeholder(train_model.action.dtype, train_model.action.shape)
        ADV = tf.placeholder(tf.float32, [nbatch])
        R = tf.placeholder(tf.float32, [nbatch])
        LR = tf.placeholder(tf.float32, [])

        # Calculate the loss
        # Total loss = Policy gradient loss - entropy coefficient * entropy + Value coefficient * value loss

        # Policy loss
        neglogpac = train_model.pd.neglogp(A)
        # L = A(s,a) * -logpi(a|s)
        pg_loss = tf.reduce_mean(ADV * neglogpac)

        # Entropy is used to improve exploration by limiting premature convergence to a suboptimal policy.
        entropy = tf.reduce_mean(train_model.pd.entropy())

        # Value loss
        vf_loss = losses.mean_squared_error(tf.squeeze(train_model.vf), R)

        loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef

        # Update the parameters (weights and biases) using the loss
        # 1. Get the model parameters
        params = find_trainable_variables("a2c_model")

        # 2. Calculate the gradients
        grads = tf.gradients(loss, params)

        if max_grad_norm is not None:
            # Clip the gradients (normalize)
            grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)

        # zip aggregates each gradient with its associated parameter
        # For instance zip(ABCD, xyza) => Ax, By, Cz, Da
        grads = list(zip(grads, params))

        # 3. Make op for one policy and value update step of A2C
        trainer = tf.train.RMSPropOptimizer(learning_rate=LR,
                                            decay=alpha,
                                            epsilon=epsilon)

        _train = trainer.apply_gradients(grads)

        lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)

        def train(obs, states, rewards, masks, actions, values):
            # Here we calculate the advantage A(s,a) = R + yV(s') - V(s)
            # rewards = R + yV(s')
            advs = rewards - values

            for step in range(len(obs)):
                cur_lr = lr.value()

            td_map = {
                train_model.X: obs,
                A: actions,
                ADV: advs,
                R: rewards,
                LR: cur_lr
            }

            if states is not None:
                td_map[train_model.S] = states
                td_map[train_model.M] = masks

            policy_loss, value_loss, policy_entropy, _ = sess.run(
                [pg_loss, vf_loss, entropy, _train], td_map)

            return policy_loss, value_loss, policy_entropy

        self.train = train
        self.train_model = train_model
        self.step_model = step_model
        self.step = step_model.step
        self.value = step_model.value
        self.initial_state = step_model.initial_state
        self.save = functools.partial(tf_util.save_variables, sess=sess)
        self.load = functools.partial(tf_util.load_variables, sess=sess)
        tf.global_variables_initializer().run(session=sess)
Example #14
    def _build_net(self, s, h, scope, trainable, ent_coef=0.01, vf_coef=0.5):
        # s is the state of the current market
        # h is the hand number (0-11)
        with tf.variable_scope(scope):
            init_w = tf.random_normal_initializer(0., 0.3)
            init_b = tf.constant_initializer(0.1)
            net = tf.layers.dense(s,
                                  30,
                                  activation=tf.nn.relu,
                                  kernel_initializer=init_w,
                                  bias_initializer=init_b,
                                  name='l1',
                                  trainable=trainable)

            with tf.variable_scope('tcn'):
                tcndropout = tf.placeholder_with_default(0., shape=())
                value_map = build_tcn(s,
                                      tcndropout,
                                      kernel_size=3,
                                      num_channels=[256, 64, 32, 10])
                if (modelDebug):
                    print("value_map shape",
                          value_map.shape)  #value_map shape (?, 20, 10)
            with tf.variable_scope('vin'):
                v = value_map[:, -1, tf.newaxis, :]  # get the values of the last time step
                vi_w = tf.get_variable('vi_w', [3, 1, 3],
                                       initializer=init_w,
                                       trainable=trainable)
                for i in range(-2, -5, -1):
                    q = tf.pad(v, tf.constant([[0, 0], [0, 0], [1, 1]]))
                    q = tfnn.conv1d(q, vi_w, 1, "VALID", data_format="NCW")
                    #v: [?,1,1,12] vi_w:[1,3,1,3]
                    if (modelDebug):
                        print("q shape", q.shape)  # q shape (?, 3, 10)
                    v = tf.reduce_max(q, axis=1, keepdims=True, name="v%d" % i)
                    v = v + value_map[:, i, tf.newaxis, :]
                # print(v.shape)
            with tf.variable_scope('a'):
                v = v[:, 0, :]  # reshape v into rank2
                paddings = tf.constant([[0, 0], [3, 3]])
                v = tf.pad(v, paddings, "SYMMETRIC")
                h_pos = tf.one_hot(h, depth=10)
                # att_v = v[:,0,h:h+7]# the attentioned value function
                att_v = tf.concat([v, h_pos], 1)  # concat the onehot position
                if (modelDebug):
                    print("att_v", att_v.shape)  #att_v (?, 26)
                action = tf.layers.dense(att_v,
                                         self.a_dim,
                                         kernel_initializer=init_w,
                                         bias_initializer=init_b,
                                         name='a',
                                         trainable=trainable)
                action = tf.nn.softmax(action)  #action (?, 3)
                if (modelDebug):
                    print("action", action.shape)

                value = tf.layers.dense(att_v,
                                        1,
                                        kernel_initializer=init_w,
                                        bias_initializer=init_b,
                                        name="v",
                                        trainable=trainable)
                if (modelDebug):
                    print("value :", value.shape)

                a = tf.argmax(action, axis=1)  # the optimal action selected by the algorithm for inference
                if (modelDebug):
                    print("a:", a.shape)
                a_hot = tf.one_hot(A, depth=3)  # one-hot vector from A (placeholder of the explored action) for training
                prob = tf.reduce_sum(tf.multiply(action, a_hot),
                                     reduction_indices=[1])
                eligibility = tf.log(prob) * (R - value)
                loss = -tf.reduce_sum(eligibility)

                entropy = tf.reduce_mean(tf.multiply(
                    tf.log(action),
                    action))  # the entropy term promotes exploration
                if (modelDebug):
                    print(" tf.multiply( tf.log(action), action  )",
                          tf.multiply(tf.log(action), action).shape)
                    print("entropy", entropy.shape)
                loss += entropy * ent_coef

                vf_loss = losses.mean_squared_error(value, R)
                loss -= vf_loss * vf_coef

                optimizer = tf.train.AdamOptimizer(0.01).minimize(loss)

        return a, optimizer, value
Example #15
    def __init__(self,
                 policy,
                 ob_space,
                 ac_space,
                 nenvs,
                 total_timesteps,
                 nprocs=32,
                 nscripts=16,
                 nsteps=20,
                 nstack=4,
                 ent_coef=0.1,
                 vf_coef=0.5,
                 vf_fisher_coef=1.0,
                 lr=0.25,
                 max_grad_norm=0.001,
                 kfac_clip=0.001,
                 lrschedule='linear',
                 alpha=0.99,
                 epsilon=1e-5):
        config = tf.ConfigProto(allow_soft_placement=True,
                                intra_op_parallelism_threads=nprocs,
                                inter_op_parallelism_threads=nprocs)
        config.gpu_options.allow_growth = True
        self.sess = sess = tf.Session(config=config)
        nsml.bind(sess=sess)
        #nact = ac_space.n
        nbatch = nenvs * nsteps
        A = tf.placeholder(tf.int32, [nbatch])

        XY0 = tf.placeholder(tf.int32, [nbatch])
        XY1 = tf.placeholder(tf.int32, [nbatch])

        # ADV == TD_TARGET - values
        ADV = tf.placeholder(tf.float32, [nbatch])
        TD_TARGET = tf.placeholder(tf.float32, [nbatch])
        PG_LR = tf.placeholder(tf.float32, [])
        VF_LR = tf.placeholder(tf.float32, [])

        self.model = step_model = policy(sess,
                                         ob_space,
                                         ac_space,
                                         nenvs,
                                         1,
                                         nstack,
                                         reuse=False)
        self.model2 = train_model = policy(sess,
                                           ob_space,
                                           ac_space,
                                           nenvs,
                                           nsteps,
                                           nstack,
                                           reuse=True)

        # Policy 1 : Base Action : train_model.pi label = A

        script_mask = tf.concat([
            tf.zeros([nscripts * nsteps, 1]),
            tf.ones([(nprocs - nscripts) * nsteps, 1])
        ],
                                axis=0)

        pi = train_model.pi
        pac_weight = script_mask * (tf.nn.softmax(pi) - 1.0) + 1.0
        pac_weight = tf.reduce_sum(pac_weight * tf.one_hot(A, depth=3), axis=1)
        neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=pi,
                                                                   labels=A)
        neglogpac *= tf.stop_gradient(pac_weight)

        inv_A = 1.0 - tf.cast(A, tf.float32)

        xy0_mask = tf.cast(A, tf.float32)
        xy1_mask = tf.cast(A, tf.float32)

        condition0 = tf.equal(xy0_mask, 2)
        xy0_mask = tf.where(condition0, tf.ones(tf.shape(xy0_mask)), xy0_mask)
        xy0_mask = 1.0 - xy0_mask

        condition1 = tf.equal(xy1_mask, 2)
        xy1_mask = tf.where(condition1, tf.zeros(tf.shape(xy1_mask)), xy1_mask)

        # One hot representation of chosen marine.
        # [batch_size, 2]
        pi_xy0 = train_model.pi_xy0
        pac_weight = script_mask * (tf.nn.softmax(pi_xy0) - 1.0) + 1.0
        pac_weight = tf.reduce_sum(pac_weight * tf.one_hot(XY0, depth=1024),
                                   axis=1)

        logpac_xy0 = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=pi_xy0, labels=XY0)
        logpac_xy0 *= tf.stop_gradient(pac_weight)
        logpac_xy0 *= tf.cast(xy0_mask, tf.float32)

        pi_xy1 = train_model.pi_xy1
        pac_weight = script_mask * (tf.nn.softmax(pi_xy1) - 1.0) + 1.0
        pac_weight = tf.reduce_sum(pac_weight * tf.one_hot(XY0, depth=1024),
                                   axis=1)

        # 1D? 2D?
        logpac_xy1 = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=pi_xy1, labels=XY1)
        logpac_xy1 *= tf.stop_gradient(pac_weight)
        logpac_xy1 *= tf.cast(xy1_mask, tf.float32)

        pg_loss = tf.reduce_mean(ADV * neglogpac)
        pg_loss_xy0 = tf.reduce_mean(ADV * logpac_xy0)
        pg_loss_xy1 = tf.reduce_mean(ADV * logpac_xy1)

        vf_ = tf.squeeze(train_model.vf)

        vf_r = tf.concat([
            tf.ones([nscripts * nsteps]),
            tf.zeros([(nprocs - nscripts) * nsteps])
        ],
                         axis=0) * TD_TARGET
        vf_masked = vf_ * tf.squeeze(script_mask) + vf_r

        #vf_mask[0:nscripts * nsteps] = R[0:nscripts * nsteps]

        vf_loss = losses.mean_squared_error(vf_masked, TD_TARGET)
        entropy_a = tf.reduce_mean(cat_entropy(train_model.pi))
        entropy_xy0 = tf.reduce_mean(cat_entropy(train_model.pi_xy0))
        entropy_xy1 = tf.reduce_mean(cat_entropy(train_model.pi_xy1))
        entropy = entropy_a + entropy_xy0 + entropy_xy1

        loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef

        params = find_trainable_variables("model")
        grads = tf.gradients(loss, params)
        if max_grad_norm is not None:
            grads, _ = tf.clip_by_global_norm(grads, max_grad_norm)
        grads = list(zip(grads, params))
        trainer = tf.train.RMSPropOptimizer(learning_rate=lr,
                                            decay=alpha,
                                            epsilon=epsilon)
        _train = trainer.apply_gradients(grads)

        self.logits = logits = train_model.pi

        # xy0

        self.params_common = params_common = tf.get_collection(
            tf.GraphKeys.TRAINABLE_VARIABLES, scope='model/common')
        self.params_xy0 = params_xy0 = tf.get_collection(
            tf.GraphKeys.TRAINABLE_VARIABLES,
            scope='model/xy0') + params_common

        train_loss_xy0 = pg_loss_xy0 - entropy * ent_coef + vf_coef * vf_loss

        self.grads_check_xy0 = grads_xy0 = tf.gradients(
            train_loss_xy0, params_xy0)
        if max_grad_norm is not None:
            grads_xy0, _ = tf.clip_by_global_norm(grads_xy0, max_grad_norm)

        grads_xy0 = list(zip(grads_xy0, params_xy0))
        trainer_xy0 = tf.train.RMSPropOptimizer(learning_rate=lr,
                                                decay=alpha,
                                                epsilon=epsilon)
        _train_xy0 = trainer_xy0.apply_gradients(grads_xy0)

        # xy1

        self.params_xy1 = params_xy1 = tf.get_collection(
            tf.GraphKeys.TRAINABLE_VARIABLES,
            scope='model/xy1') + params_common

        train_loss_xy1 = pg_loss_xy1 - entropy * ent_coef + vf_coef * vf_loss

        self.grads_check_xy1 = grads_xy1 = tf.gradients(
            train_loss_xy1, params_xy1)
        if max_grad_norm is not None:
            grads_xy1, _ = tf.clip_by_global_norm(grads_xy1, max_grad_norm)

        grads_xy1 = list(zip(grads_xy1, params_xy1))
        trainer_xy1 = tf.train.RMSPropOptimizer(learning_rate=lr,
                                                decay=alpha,
                                                epsilon=epsilon)
        _train_xy1 = trainer_xy1.apply_gradients(grads_xy1)

        self.lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)

        def train(obs, states, td_targets, masks, actions, xy0, xy1, values):
            advs = td_targets - values
            for step in range(len(obs)):
                cur_lr = self.lr.value()
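            # Assuming the baselines-style Scheduler, each .value() call
            # advances an internal step counter, so this loop anneals the
            # learning rate by len(obs) timesteps per update and cur_lr keeps
            # the last value returned.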

            td_map = {
                train_model.X: obs,
                A: actions,
                XY0: xy0,
                XY1: xy1,
                ADV: advs,
                TD_TARGET: td_targets,
                PG_LR: cur_lr
            }
            if states != []:
                td_map[train_model.S] = states
                td_map[train_model.M] = masks

            policy_loss, value_loss, policy_entropy, _, \
            policy_loss_xy0, policy_entropy_xy0, _, \
            policy_loss_xy1, policy_entropy_xy1, _ = sess.run(
                [pg_loss, vf_loss, entropy, _train,
                 pg_loss_xy0, entropy_xy0, _train_xy0,
                 pg_loss_xy1, entropy_xy1, _train_xy1],
                td_map)
            return policy_loss, value_loss, policy_entropy, \
                   policy_loss_xy0, policy_entropy_xy0, \
                   policy_loss_xy1, policy_entropy_xy1

        def save(save_path):
            ps = sess.run(params)
            joblib.dump(ps, save_path)

        def load(load_path):
            loaded_params = joblib.load(load_path)
            restores = []
            for p, loaded_p in zip(params, loaded_params):
                restores.append(p.assign(loaded_p))
            sess.run(restores)

        self.train = train
        self.save = save
        self.load = load
        self.train_model = train_model
        self.step_model = step_model
        self.step = step_model.step
        self.value = step_model.value
        self.initial_state = step_model.initial_state
        print("global_variables_initializer start")
        tf.global_variables_initializer().run(session=sess)
        print("global_variables_initializer complete")
Exemplo n.º 16
0
    def initialize(self) -> None:
        keras.backend.set_session(self._session)

        ##############################################################################
        #                                 Q-network                                  #
        ##############################################################################
        self._sensor_input_tensor = tf.placeholder(
            dtype=tf.float32,
            shape=(None, *self._sensor_state_shape),
            name="sensor_input",
        )
        self._heat_input_tensor = tf.placeholder(
            dtype=tf.float32,
            shape=(None, *self._heat_state_shape),
            name="heat_input")
        # self._position_input_tensor = tf.placeholder(
        #     dtype=tf.float32,
        #     shape=(None, *self._position_state_shape),
        #     name="position_input"
        # )

        # conv1 = Conv2D(
        #     filters=self._n_sensors // 2,
        #     kernel_size=(1, self._window_size),
        #     data_format="channels_last",
        #     activation="relu",
        #     name="conv1",
        # )
        # conv_output = conv1(self._sensor_input_tensor)
        #
        # conv2 = Conv2D(
        #     filters=self._n_sensors // 4,
        #     kernel_size=(1, self._window_size),
        #     data_format="channels_last",
        #     activation="relu",
        #     name="conv2",
        # )
        # conv_output = conv2(conv_output)
        #
        # flatten_conv = Flatten(name="flatten_conv")
        # flattened_conv = flatten_conv(conv_output)

        concat = Concatenate(name="concat")
        concatenated_input = concat([
            self._sensor_input_tensor,
            self._heat_input_tensor,
            # self._position_input_tensor,
        ])

        hidden_dense0 = Dense(units=self._n_sensor_inputs * 8,
                              activation="relu",
                              name="hidden_dense0")
        x = hidden_dense0(concatenated_input)

        hidden_dense1 = Dense(units=self._n_sensor_inputs * 4,
                              activation="relu",
                              name="hidden_dense1")
        x = hidden_dense1(x)

        hidden_dense2 = Dense(units=self._n_sensor_inputs * 2,
                              activation="relu",
                              name="hidden_dense2")
        x = hidden_dense2(x)

        hidden_dense3 = Dense(units=self._n_sensor_inputs,
                              activation="relu",
                              name="hidden_dense3")
        x = hidden_dense3(x)

        output_layer = Dense(units=self._n_output_angles + 1,
                             name="action_quality")
        self._actions_qualities_tensor = output_layer(x)
        self._action_index_tensor = tf.argmax(self._actions_qualities_tensor,
                                              axis=1,
                                              name="output")

        ################################################################################
        #                             Updating Q-network                               #
        ################################################################################
        self._chosen_actions_tensor = tf.placeholder(dtype=tf.int32,
                                                     shape=(None, ),
                                                     name="chosen_actions")
        self._rewards_tensor = tf.placeholder(dtype=tf.float32,
                                              shape=(None, ),
                                              name="discounted_rewards")
        self._terminates_tensor = tf.placeholder(dtype=tf.float32,
                                                 shape=(None, ),
                                                 name="episode_terminated")
        self._replay_next_states_qualities_tensor = tf.placeholder(
            dtype=tf.float32,
            shape=self._actions_qualities_tensor.shape,
            name="replay_next_states_qualities")

        next_state_indices = tf.stack(
            (tf.range(0, tf.shape(self._rewards_tensor)[0]),
             self._chosen_actions_tensor),
            axis=1)
        responsible_qualities = tf.gather_nd(
            self._replay_next_states_qualities_tensor, next_state_indices)
        # noinspection PyTypeChecker
        target_quality = (self._rewards_tensor +
                          self._terminates_tensor * responsible_qualities *
                          self._process_config.reward_discount_coef)
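        # target_quality is the one-step TD target
        #     q_target = r + gamma * q(s', a')
        # where _terminates_tensor is fed as a continuation mask (presumably
        # 0.0 on terminal transitions, despite its name), zeroing the
        # bootstrap term at episode boundaries.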

        tf_range = tf.range(0,
                            tf.shape(self._rewards_tensor)[0],
                            dtype=tf.int32)
        state_indices = tf.stack((tf_range, self._chosen_actions_tensor),
                                 axis=1)
        current_quality = tf.gather_nd(self._actions_qualities_tensor,
                                       state_indices)

        loss = losses.mean_squared_error(target_quality,
                                         current_quality,
                                         reduction=losses.Reduction.MEAN)
        optimizer = tf.train.AdamOptimizer(learning_rate=0.0001)

        self._update_model = optimizer.minimize(loss)
        self._session.run(tf.global_variables_initializer())
        self._saver = tf.train.Saver()
Exemplo n.º 17
0
    def __init__(self,
                 optimiser,
                 policy,
                 env,
                 nsteps,
                 vf_coef=0.5,
                 max_grad_norm=0.5,
                 alpha=0.99,
                 epsilon=1e-5,
                 total_timesteps=int(80e6)):

        sess = tf_util.get_session()
        nenvs = env.num_envs
        nbatch = nenvs * nsteps

        with tf.variable_scope('a2c_model', reuse=tf.AUTO_REUSE):
            # step_model is used for sampling
            step_model = policy(nenvs, 1, sess)

            # train_model is used to train our network
            train_model = policy(nbatch, nsteps, sess)

        A = tf.placeholder(train_model.action.dtype, train_model.action.shape)
        ADV = tf.placeholder(tf.float32, [nbatch])
        R = tf.placeholder(tf.float32, [nbatch])
        LR = tf.placeholder(tf.float32, [])
        Ent_Coeff = tf.placeholder(tf.float32, [])  # for Entropy

        # Calculate the loss
        # Total loss = Policy gradient loss - entropy * entropy coefficient + Value coefficient * value loss

        # Policy loss
        neglogpac = train_model.pd.neglogp(A)
        # L = A(s,a) * -logpi(a|s)
        pg_loss = tf.reduce_mean(ADV * neglogpac)

        # Entropy is used to improve exploration by limiting the premature convergence to suboptimal policy.
        entropy = tf.reduce_mean(train_model.pd.entropy())

        # Value loss
        vf_loss = losses.mean_squared_error(tf.squeeze(train_model.vf), R)

        loss = pg_loss - entropy * Ent_Coeff + vf_loss * vf_coef

        # Update parameters using loss
        # 1. Get the model parameters
        params = find_trainable_variables("a2c_model")

        # 2. Calculate the gradients
        grads = tf.gradients(loss, params)
        if max_grad_norm is not None:
            # Clip the gradients (normalize)
            grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
        grads = list(zip(grads, params))
        # zip aggregate each gradient with parameters associated
        # For instance zip(ABCD, xyza) => Ax, By, Cz, Da

        # 3. Make op for one policy and value update step of A2C
        if optimiser == 'RMSProp':
            trainer = tf.train.RMSPropOptimizer(learning_rate=LR,
                                                decay=alpha,
                                                epsilon=epsilon)
        elif optimiser == 'SGD':
            trainer = tf.train.GradientDescentOptimizer(learning_rate=LR)

        _train = trainer.apply_gradients(grads)

        #https://stackoverflow.com/a/45624533
        _slot_vars = [
            trainer.get_slot(var, name) for name in trainer.get_slot_names()
            for var in params
        ]
        SLOTS = [tf.placeholder(tf.float32, slot.shape) for slot in _slot_vars]
        _set_slots = [var.assign(SLOTS[i]) for i, var in enumerate(_slot_vars)]

        def get_opt_state():
            return sess.run(_slot_vars)

        def set_opt_state(state):
            feed = {k: v for k, v in zip(SLOTS, state)}
            return sess.run(_set_slots, feed)
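        # A minimal usage sketch (not part of the original code), where
        # `model` is an instance of this class:
        #
        #   opt_state = model.get_opt_state()   # list of RMSProp slot arrays
        #   ...                                  # e.g. save/load model weights
        #   model.set_opt_state(opt_state)       # restore optimizer statistics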

        def train(obs, states, rewards, masks, actions, values, ent_coeff):
            # Here we calculate advantage A(s,a) = R + yV(s') - V(s)
            # rewards = R + yV(s')
            advs = rewards - values

            td_map = {
                train_model.X: obs,
                A: actions,
                ADV: advs,
                R: rewards,
                Ent_Coeff: ent_coeff,
                LR: 1.0
            }
            if states is not None:
                td_map[train_model.S] = states
                td_map[train_model.M] = masks
            policy_loss, value_loss, policy_entropy, _ = sess.run(
                [pg_loss, vf_loss, entropy, _train], td_map)
            return policy_loss, value_loss, policy_entropy

        # Only this bit added
        def get_mean_std_neg_ll(obs, actions):
            td_map = {train_model.X: obs, A: actions}
            vals = sess.run(
                [train_model.pd.mean, train_model.pd.std, neglogpac], td_map)
            return vals

        self.get_mean_std_neg_ll = get_mean_std_neg_ll
        self.train = train
        self.train_model = train_model
        self.step_model = step_model
        self.step = step_model.step
        self.value = step_model.value
        self.initial_state = step_model.initial_state
        self.save = functools.partial(tf_util.save_variables, sess=sess)
        self.load = functools.partial(tf_util.load_variables, sess=sess)
        self.get_opt_state = get_opt_state
        self.set_opt_state = set_opt_state
        tf.global_variables_initializer().run(session=sess)
Exemplo n.º 18
0
    def __init__(self,
                 policy,
                 env,
                 nsteps,
                 ent_coef=0.01,
                 vf_coef=0.5,
                 lr=1e-3,
                 alpha=0.99,
                 epsilon=1e-5):
        sess = tf.Session()
        nenvs = env.num_envs
        nbatch = nenvs * nsteps
        with tf.variable_scope('a2c_model', reuse=tf.AUTO_REUSE):
            # step_model for sampling
            step_model = policy(nenvs, 1, sess)
            # train_model to train our network
            train_model = policy(nbatch, nsteps, sess)

        A = tf.placeholder(train_model.action.dtype, train_model.action.shape)
        ADV = tf.placeholder(tf.float32, [nbatch])
        R = tf.placeholder(tf.float32, [nbatch])
        LR = tf.placeholder(tf.float32, [])

        # total_loss = Policy gradient loss - entropy * entropy coeff + value coeff * value loss

        # policy loss
        neglogpac = train_model.pd.neglogp(A)
        # L = -log(pi(a|s)) * Adv(s, a)
        pg_loss = tf.reduce_mean(ADV * neglogpac)

        #entropy is used to improve exploration by limiting the premature
        # convergence to suboptimal policy
        entropy = tf.reduce_mean(train_model.pd.entropy())

        # value loss
        vf_loss = losses.mean_squared_error(tf.squeeze(train_model.vf), R)

        loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef

        # update params using loss
        # 1. get model params
        params = find_trainable_variables("a2c_model")

        # 2. calculate the gradients
        grads = tf.gradients(loss, params)
        grads = list(zip(grads, params))

        # 3. make op for one policy and value update step of A2C
        trainer = tf.train.RMSPropOptimizer(learning_rate=LR,
                                            decay=alpha,
                                            epsilon=epsilon)
        _train = trainer.apply_gradients(grads)

        def train(obs, states, rewards, masks, actions, values):
            # we calculate advantage A(s, a) = R + yV(s') - V(s)
            # rewards = R + yV(s')
            advs = rewards - values
            #for step in range(len(obs)):
            td_map = {
                train_model.X: obs,
                A: actions,
                ADV: advs,
                R: rewards,
                LR: lr
            }
            if states is not None:
                td_map[train_model.S] = states
                td_map[train_model.M] = masks
            policy_loss, value_loss, policy_entropy, _ = sess.run(
                [pg_loss, vf_loss, entropy, _train], td_map)
            return policy_loss, value_loss, policy_entropy

        self.train = train
        self.train_model = train_model
        self.step_model = step_model
        self.step = step_model.step
        self.value = step_model.value
        self.initial_state = step_model.initial_state
        tf.global_variables_initializer().run(session=sess)
Exemplo n.º 19
0
    def __init__(self,
                 action_dim,
                 state_dim,
                 lr=0.001,
                 ent_coef=0.01,
                 value_coef=0.5,
                 reward_decay=0.95,
                 output_graph=False):
        self.action_dim = action_dim
        self.state_dim = state_dim
        self.lr = lr
        self.ent_coef = ent_coef
        self.value_coef = value_coef
        # self.actor_lr = actor_lr
        # self.critic_lr = critic_lr
        self.gamma = reward_decay
        self.output_graph = output_graph

        tf.reset_default_graph()

        self.sess = tf.Session()

        if output_graph:
            tf.summary.FileWriter("logs/", self.sess.graph)

        with tf.name_scope("inputs"):
            self.tf_obs = tf.placeholder(tf.float32, [None, self.state_dim],
                                         name="observations")
            self.tf_ac = tf.placeholder(tf.float32, [None, self.action_dim],
                                        name="actions")
            self.advantage = tf.placeholder(tf.float32, [None],
                                            name="advantage")
            self.R = tf.placeholder(tf.float32, [None], name="return")
        # fc1
        layer = tf.layers.dense(
            inputs=self.tf_obs,
            units=10,
            activation=tf.nn.tanh,
            kernel_initializer=tf.random_normal_initializer(mean=0,
                                                            stddev=0.3),
            bias_initializer=tf.constant_initializer(0.1),
            name='actor_fc1')

        # fc2
        mean_all_act = tf.layers.dense(
            inputs=layer,
            units=self.action_dim,
            activation=None,
            kernel_initializer=tf.random_normal_initializer(mean=0,
                                                            stddev=0.3),
            bias_initializer=tf.constant_initializer(0.1),
            name='actor_fc2')

        logstd_act = tf.get_variable(name="logstd",
                                     shape=[1, self.action_dim],
                                     initializer=tf.zeros_initializer())
        pdparam = tf.concat([mean_all_act, mean_all_act * 0.0 + logstd_act],
                            axis=1)
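        # mean_all_act * 0.0 + logstd_act broadcasts the single learned
        # log-std row across the batch, so pdparam stacks per-sample means
        # with the shared log-stds along the last axis, which is the layout
        # DiagGaussianPd expects.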
        self.pd = DiagGaussianPd(pdparam)

        self.action = self.pd.sample()
        self.neglogp = self.pd.neglogp(self.action)

        # for critic network, we share the first layer with actor
        self.value = tf.layers.dense(
            inputs=layer,
            units=1,
            activation=None,
            kernel_initializer=tf.random_normal_initializer(mean=0,
                                                            stddev=0.3),
            bias_initializer=tf.constant_initializer(0.1),
            name="critic_fc2")

        with tf.name_scope("Loss"):
            # Total loss = Policy loss - entropy * ent_coef + value loss * value_coef
            pg_loss = tf.reduce_mean(self.advantage * self.neglogp)
            vf_loss = losses.mean_squared_error(tf.squeeze(self.value),
                                                self.R)
            entropy = tf.reduce_mean(self.pd.entropy())

            # total loss
            loss = pg_loss - entropy * self.ent_coef + vf_loss * self.value_coef

        with tf.name_scope("Train"):
            train_op = tf.train.AdamOptimizer(self.lr).minimize(loss)

        def step(observation):
            actions, values, neglogp = self.sess.run(
                [self.action, self.value, self.neglogp],
                feed_dict={self.tf_obs: observation[np.newaxis, :]})
            return actions, values, neglogp

        def learn(obs, actions, rewards, values):
            # calculate adv = reward - V(s)
            # reward = r + yV(s')
            advs = rewards - values
            # value = self.sess.run(self.value, feed_dict={self.obs: state})

            td_map = {
                self.tf_obs: obs,
                self.tf_ac: actions,
                self.advantage: advs,
                self.R: rewards
            }

            policy_loss, value_loss, policy_entropy, _ = self.sess.run(
                [pg_loss, vf_loss, entropy, train_op], feed_dict=td_map)
            return policy_loss, value_loss, policy_entropy

        self.step = step
        self.learn = learn

        # Initialize variables only after the whole graph has been built.
        self.sess.run(tf.global_variables_initializer())
Exemplo n.º 20
0
    def __init__(self,
                 network,
                 env,
                 *,
                 seed=None,
                 nsteps=5,
                 total_timesteps=int(80e6),
                 vf_coef=0.5,
                 ent_coef=0.5,
                 max_grad_norm=0.5,
                 lr=1e-5,
                 lrschedule='constant',
                 gamma=0.99,
                 alpha=0.99,
                 epsilon=1e-5,
                 model_save_path=None,
                 tb_log_path=None):
        """
        Main entrypoint for the A2C algorithm. Trains a policy with the given network architecture on the given environment using A2C.

        Parameters:
        -----------

        :param network: policy network architecture. Either a string (mlp, lstm, lnlstm, cnn_lstm, cnn, cnn_small, conv_only - see policies.py for the full list)
                specifying a standard network architecture, or a function that takes a tensorflow tensor as input and returns a
                tuple (output_tensor, extra_feed), where output_tensor is the last network layer output and extra_feed is None for feed-forward
                neural nets or a dictionary describing how to feed state into the network for recurrent neural nets.
                See policies.py for more details on using recurrent nets in policies

        :param env: RL environment. Should implement interface similar to VecEnv (baselines.common/vec_env) or be wrapped with DummyVecEnv (baselines.common/vec_env/dummy_vec_env.py)

        :param seed: seed to make the random number sequence in the algorithm reproducible. Defaults to None, which seeds from the system noise generator (not reproducible)

        :param nsteps: int, number of steps of the vectorized environment per update (i.e. batch size is nsteps * nenv where
                nenv is number of environment copies simulated in parallel)

        :param total_timesteps: int, total number of timesteps to train on (default: 80M)

        :param vf_coef: float, coefficient in front of value function loss in the total loss function (default: 0.5)

        :param ent_coef: float, coefficient in front of the policy entropy in the total loss function (default: 0.5)

        :param max_grad_norm: float, gradient is clipped to have global L2 norm no more than this value (default: 0.5)

        :param lr: float, learning rate for RMSProp (the current implementation has RMSProp hardcoded in) (default: 1e-5)

        :param lrschedule: schedule of learning rate. Can be 'linear', 'constant', or a function [0..1] -> [0..1] that takes fraction of
                the training progress as input and returns fraction of the learning rate (specified as lr) as output

        :param epsilon: float, RMSProp epsilon (stabilizes square root computation in denominator of RMSProp update) (default: 1e-5)

        :param alpha: float, RMSProp decay parameter (default: 0.99)

        :param gamma: float, reward discounting parameter (default: 0.99)

        :param model_save_path: str, the location to save model parameters (if None, a default path is chosen automatically)

        :param tb_log_path: str, the log location for tensorboard (if None, no logging)

        """

        self.policy = build_policy(network)
        self.env = env
        self.nenvs = env.num_envs
        self.nsteps = nsteps
        nbatch = self.nenvs * nsteps
        self.seed = seed
        self.ent_coef = ent_coef
        self.vf_coef = vf_coef
        self.max_grad_norm = max_grad_norm
        self.lr = lr
        self.gamma = gamma
        self.alpha = alpha
        self.epsilon = epsilon
        self.total_timesteps = total_timesteps
        self.lrschedule = lrschedule
        self.model_save_path = model_save_path
        self.tb_log_path = None  # tb_log_path
        self.sess = get_session()
        self.graph = self.sess.graph
        self.episode_reward = np.zeros((self.nenvs, ))

        with tf.variable_scope('a2c_model', reuse=tf.AUTO_REUSE):
            # step_model is used for sampling
            self.step_model = self.policy(self.sess,
                                          env.observation_space,
                                          env.action_space,
                                          self.nenvs,
                                          1,
                                          self.nenvs,
                                          reuse=False)

            # train_model is used to train our network
            self.train_model = self.policy(self.sess,
                                           env.observation_space,
                                           env.action_space,
                                           self.nenvs,
                                           self.nsteps,
                                           nbatch,
                                           reuse=True)

        with tf.variable_scope('loss', reuse=False):
            self.action_ph = tf.placeholder(self.train_model.action.dtype,
                                            self.train_model.action.shape)
            self.adv_ph = tf.placeholder(tf.float32, [nbatch])
            self.reward_ph = tf.placeholder(tf.float32, [nbatch])
            self.lr_ph = tf.placeholder(tf.float32, [])

            # Calculate the loss
            # Total loss = Policy gradient loss - entropy * entropy coefficient + Value coefficient * value loss

            # Policy loss
            neglogpac = self.train_model.proba_distribution.neglogp(
                self.action_ph)
            # L = A(s,a) * -logpi(a|s)
            self.pg_loss = tf.reduce_mean(self.adv_ph * neglogpac)

            # Entropy is used to improve exploration by limiting the premature convergence to suboptimal policy.
            self.entropy = tf.reduce_mean(
                self.train_model.proba_distribution.entropy())

            # Value loss
            self.vf_loss = losses.mean_squared_error(
                tf.squeeze(self.train_model.value_fn), self.reward_ph)

            self.reg_loss = tf.contrib.layers.apply_regularization(
                tf.contrib.layers.l2_regularizer(0.8),
                tf.trainable_variables())
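            # Note: this applies an L2 penalty with scale 0.8 to all trainable
            # variables in the graph and adds it to the total loss below.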

            self.loss = self.pg_loss - self.entropy * ent_coef + self.vf_loss * vf_coef + self.reg_loss

            tf.summary.scalar('lr', self.lr_ph)
            tf.summary.scalar('pg_loss', self.pg_loss)
            tf.summary.scalar('entropy', self.entropy)
            tf.summary.scalar('vf_loss', self.vf_loss)
            tf.summary.scalar('loss', self.loss)
            tf.summary.histogram('obs', self.train_model.obs_ph)

            # Update parameters using loss
            # 1. Get the model parameters
            params = tf.trainable_variables("a2c_model")

            # 2. Calculate the gradients
            self.grads = grads = tf.gradients(self.loss, params)
            if max_grad_norm is not None:
                # Clip the gradients (normalize)
                grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
            grads = list(zip(grads, params))

        # 3. Make op for one policy and value update step of A2C
        trainer = tf.train.RMSPropOptimizer(learning_rate=self.lr_ph,
                                            decay=alpha,
                                            epsilon=epsilon)

        self.apply_backprop = trainer.apply_gradients(grads)

        self.lr_schedule = Scheduler(initial_value=lr,
                                     n_values=total_timesteps,
                                     schedule=lrschedule)
        self.step = self.step_model.step
        self.value = self.step_model.value
        self.initial_state = self.step_model.initial_state
        self.def_path_pre = os.path.dirname(
            os.path.abspath(__file__)) + '/tmp/'  # default path prefix
        self.summary = tf.summary.merge_all()
        tf.global_variables_initializer().run(session=self.sess)
Exemplo n.º 21
0
    def __init__(self, policy, env, nsteps,
            ent_coef=0.01, vf_coef=0.5, max_grad_norm=0.5, lr=7e-4,
            alpha=0.99, epsilon=1e-5, total_timesteps=int(80e6), lrschedule='linear'):

        sess = tf_util.get_session()
        nenvs = env.num_envs
        nbatch = nenvs*nsteps


        with tf.variable_scope('a2c_model', reuse=tf.AUTO_REUSE):
            # step_model is used for sampling
            step_model = policy(nenvs, 1, sess)

            # train_model is used to train our network
            train_model = policy(nbatch, nsteps, sess)

        A = tf.placeholder(train_model.action.dtype, train_model.action.shape)
        ADV = tf.placeholder(tf.float32, [nbatch])
        R = tf.placeholder(tf.float32, [nbatch])
        LR = tf.placeholder(tf.float32, [])

        # Calculate the loss
        # Total loss = Policy gradient loss - entropy * entropy coefficient + Value coefficient * value loss

        # Policy loss
        neglogpac = train_model.pd.neglogp(A)
        # L = A(s,a) * -logpi(a|s)
        pg_loss = tf.reduce_mean(ADV * neglogpac)

        # Entropy is used to improve exploration by limiting the premature convergence to suboptimal policy.
        entropy = tf.reduce_mean(train_model.pd.entropy())

        # Value loss
        vf_loss = losses.mean_squared_error(tf.squeeze(train_model.vf), R)

        loss = pg_loss - entropy*ent_coef + vf_loss * vf_coef

        # Update parameters using loss
        # 1. Get the model parameters
        params = find_trainable_variables("a2c_model")

        # 2. Calculate the gradients
        grads = tf.gradients(loss, params)
        if max_grad_norm is not None:
            # Clip the gradients (normalize)
            grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
        grads = list(zip(grads, params))
        # zip aggregate each gradient with parameters associated
        # For instance zip(ABCD, xyza) => Ax, By, Cz, Da

        # 3. Make op for one policy and value update step of A2C
        trainer = tf.train.RMSPropOptimizer(learning_rate=LR, decay=alpha, epsilon=epsilon)

        _train = trainer.apply_gradients(grads)

        lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)

        def train(obs, states, rewards, masks, actions, values):
            # Here we calculate advantage A(s,a) = R + yV(s') - V(s)
            # rewards = R + yV(s')
            advs = rewards - values
            for step in range(len(obs)):
                cur_lr = lr.value()

            td_map = {train_model.X:obs, A:actions, ADV:advs, R:rewards, LR:cur_lr}
            if states is not None:
                td_map[train_model.S] = states
                td_map[train_model.M] = masks
            policy_loss, value_loss, policy_entropy, _ = sess.run(
                [pg_loss, vf_loss, entropy, _train],
                td_map
            )
            return policy_loss, value_loss, policy_entropy


        self.train = train
        self.train_model = train_model
        self.step_model = step_model
        self.step = step_model.step
        self.value = step_model.value
        self.initial_state = step_model.initial_state
        self.save = functools.partial(tf_util.save_variables, sess=sess)
        self.load = functools.partial(tf_util.load_variables, sess=sess)
        tf.global_variables_initializer().run(session=sess)
Exemplo n.º 22
0
    def __init__(self,
                 policy,
                 env,
                 nsteps,
                 dropoutpi_keep_prob,
                 dropoutpi_keep_prob_value,
                 dropoutvf_keep_prob,
                 dropoutvf_keep_prob_value,
                 isbnpitrainmode,
                 isbnvftrainmode,
                 l1regpi,
                 l2regpi,
                 l1regvf,
                 l2regvf,
                 wclippi,
                 wclipvf,
                 ent_coef=0.01,
                 vf_coef=0.5,
                 max_grad_norm=0.5,
                 lr=7e-4,
                 alpha=0.99,
                 epsilon=1e-5,
                 total_timesteps=int(80e6),
                 lrschedule='linear',
                 regnologstd=False,
                 regonlylogstd=False):

        sess = tf_util.get_session()
        nenvs = env.num_envs
        nbatch = nenvs * nsteps

        with tf.variable_scope('a2c_model', reuse=tf.AUTO_REUSE):
            # step_model is used for sampling
            step_model = policy(nenvs, 1, sess)

            # train_model is used to train our network
            train_model = policy(nbatch, nsteps, sess)

        A = tf.placeholder(train_model.action.dtype, train_model.action.shape)
        ADV = tf.placeholder(tf.float32, [nbatch])
        R = tf.placeholder(tf.float32, [nbatch])
        LR = tf.placeholder(tf.float32, [])

        self.dropoutpi_keep_prob = dropoutpi_keep_prob
        self.dropoutpi_keep_prob_value = dropoutpi_keep_prob_value
        self.dropoutvf_keep_prob = dropoutvf_keep_prob
        self.dropoutvf_keep_prob_value = dropoutvf_keep_prob_value
        self.isbnpitrainmode = isbnpitrainmode
        self.isbnvftrainmode = isbnvftrainmode

        #REGULARIZATION
        self.toregularizepi = l1regpi > 0 or l2regpi > 0
        self.toregularizevf = l1regvf > 0 or l2regvf > 0
        self.toweightclippi = wclippi > 0
        self.toweightclipvf = wclipvf > 0

        # Calculate the loss
        # Total loss = Policy gradient loss - entropy * entropy coefficient + Value coefficient * value loss

        # Policy loss
        neglogpac = train_model.pd.neglogp(A)
        # L = A(s,a) * -logpi(a|s)
        pg_loss = tf.reduce_mean(ADV * neglogpac)

        # Entropy is used to improve exploration by limiting the premature convergence to suboptimal policy.
        entropy = tf.reduce_mean(train_model.pd.entropy())

        # Value loss
        vf_loss = losses.mean_squared_error(tf.squeeze(train_model.vf), R)

        loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef

        if self.toregularizepi:
            print("Regularizing policy network: L1 = {}, L2 = {}".format(
                l1regpi, l2regpi))
            regularizerpi = tf.contrib.layers.l1_l2_regularizer(
                scale_l1=l1regpi, scale_l2=l2regpi, scope='a2c_model/pi')
            all_trainable_weights_pi = find_trainable_variables('a2c_model/pi')
            regularization_penalty_pi = tf.contrib.layers.apply_regularization(
                regularizerpi, all_trainable_weights_pi)
            loss = loss + regularization_penalty_pi
        if self.toregularizevf:
            print("Regularizing value network: L1 = {}, L2 = {}".format(
                l1regvf, l2regvf))
            regularizervf = tf.contrib.layers.l1_l2_regularizer(
                scale_l1=l1regvf, scale_l2=l2regvf, scope='a2c_model/vf')
            all_trainable_weights_vf = find_trainable_variables('a2c_model/vf')
            regularization_penalty_vf = tf.contrib.layers.apply_regularization(
                regularizervf, all_trainable_weights_vf)
            loss = loss + regularization_penalty_vf

        # Update parameters using loss
        # 1. Get the model parameters
        params = find_trainable_variables("a2c_model")

        # 2. Calculate the gradients
        grads = tf.gradients(loss, params)
        if max_grad_norm is not None:
            # Clip the gradients (normalize)
            grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
        grads = list(zip(grads, params))
        # zip aggregate each gradient with parameters associated
        # For instance zip(ABCD, xyza) => Ax, By, Cz, Da

        # 3. Make op for one policy and value update step of A2C
        trainer = tf.train.RMSPropOptimizer(learning_rate=LR,
                                            decay=alpha,
                                            epsilon=epsilon)

        _train = trainer.apply_gradients(grads)

        if self.toweightclippi:
            print("Weight clipping policy network = {}".format(wclippi))
            policyparams = find_trainable_variables('a2c_model/pi')
            self._wclip_ops_pi = []
            self.wclip_bounds_pi = [-wclippi, wclippi]
            for toclipvar in policyparams:
                if 'logstd' in toclipvar.name:
                    continue
                self._wclip_ops_pi.append(
                    tf.assign(
                        toclipvar,
                        tf.clip_by_value(toclipvar, self.wclip_bounds_pi[0],
                                         self.wclip_bounds_pi[1])))
            self._wclip_op_pi = tf.group(*self._wclip_ops_pi)
        if self.toweightclipvf:
            print("Weight clipping value network = {}".format(wclipvf))
            valueparams = find_trainable_variables('a2c_model/vf')
            self._wclip_ops_vf = []
            self.wclip_bounds_vf = [-wclipvf, wclipvf]
            for toclipvar in valueparams:
                self._wclip_ops_vf.append(
                    tf.assign(
                        toclipvar,
                        tf.clip_by_value(toclipvar, self.wclip_bounds_vf[0],
                                         self.wclip_bounds_vf[1])))
            self._wclip_op_vf = tf.group(*self._wclip_ops_vf)
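        # These grouped clip ops are appended to the fetches of each training
        # step (see train below), so after every gradient update the selected
        # weights are projected back into [-wclip, wclip]; log-std parameters
        # are skipped for the policy network.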

        lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)

        def train(obs, states, rewards, masks, actions, values):
            # Here we calculate advantage A(s,a) = R + yV(s') - V(s)
            # rewards = R + yV(s')
            advs = rewards - values
            for step in range(len(obs)):
                cur_lr = lr.value()

            td_map = {
                train_model.X: obs,
                A: actions,
                ADV: advs,
                R: rewards,
                LR: cur_lr
            }
            if states is not None:
                td_map[train_model.S] = states
                td_map[train_model.M] = masks
            if self.dropoutpi_keep_prob is not None:
                td_map[
                    self.dropoutpi_keep_prob] = self.dropoutpi_keep_prob_value
            if self.dropoutvf_keep_prob is not None:
                td_map[
                    self.dropoutvf_keep_prob] = self.dropoutvf_keep_prob_value
            if self.isbnpitrainmode is not None:
                td_map[self.isbnpitrainmode] = True
            if self.isbnvftrainmode is not None:
                td_map[self.isbnvftrainmode] = True
            train_tensors = [pg_loss, vf_loss, entropy, _train]
            if self.toweightclippi:
                train_tensors.append(self._wclip_op_pi)
            if self.toweightclipvf:
                train_tensors.append(self._wclip_op_vf)
            policy_loss, value_loss, policy_entropy, _ = sess.run(
                train_tensors, td_map)[:4]
            return policy_loss, value_loss, policy_entropy

        self.train = train
        self.train_model = train_model
        self.step_model = step_model
        self.step = step_model.step
        self.value = step_model.value
        self.initial_state = step_model.initial_state
        self.save = functools.partial(tf_util.save_variables, sess=sess)
        self.load = functools.partial(tf_util.load_variables, sess=sess)
        tf.global_variables_initializer().run(session=sess)