Example #1
        def a2c_loss(pi, vf):
            neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=train_model.pi, labels=A)
            pg_loss = tf.reduce_mean(ADV * neglogpac)
            vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), R))
            entropy = tf.reduce_mean(cat_entropy(train_model.pi))

            # ent_coef_mode = hparams.get('ent_coef_mode', 'default')
            # ent_coef_val = hparams.get('ent_coef_val', ent_coef)

            # if ent_coef_mode == 'default':
            #     actual_ent_coef = ent_coef_val
            # elif ent_coef_mode == 'linear_teacher':
            #     actual_ent_coef = ent_coef_val * TEACHER_C + ent_coef * (1 - TEACHER_C)
            # elif ent_coef_mode == 'additive_teacher':
            #     actual_ent_coef = ent_coef_val + ent_coef_val * TEACHER_C
            # else:
            #     raise Exception('unrecognized ent_coef_mode: {}'.format(ent_coef_mode))

            loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef
            return loss, pg_loss, vf_loss, entropy
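This loss relies on `cat_entropy` and `mse` helpers imported from a baselines-style `a2c.utils` module. Below is a minimal sketch of how such helpers are commonly defined; the exact module these examples import may differ slightly.

    # Sketch of the helpers assumed above; definitions follow common
    # baselines-style conventions and are illustrative, not authoritative.
    import tensorflow as tf

    def cat_entropy(logits):
        # Entropy of the categorical distribution given by `logits`,
        # computed via the numerically stable log-softmax.
        p = tf.nn.softmax(logits)
        return -tf.reduce_sum(p * tf.nn.log_softmax(logits), axis=-1)

    def mse(pred, target):
        # Per-element squared error; callers apply their own reduction.
        return tf.square(pred - target) / 2.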
Example #2
    def __init__(self,
                 sess,
                 ob_space,
                 ac_space,
                 nbatch,
                 nsteps,
                 reuse=False,
                 **conv_kwargs):  #pylint: disable=W0613
        self.pdtype = make_pdtype(ac_space)
        #X, processed_x = observation_input(ob_space, nbatch)
        X, processed_x = observation_input(ob_space, None)
        with tf.variable_scope("model", reuse=reuse):
            h = nature_cnn(processed_x, **conv_kwargs)
            vf = fc(h, 'v', 1)[:, 0]
            self.pd, self.pi = self.pdtype.pdfromlatent(h, init_scale=0.01)

        a0 = self.pd.sample()
        neglogp0 = self.pd.neglogp(a0)
        self.initial_state = None

        self.entropy = cat_entropy(self.pi)

        def step(ob, *_args, **_kwargs):
            a, v, neglogp = sess.run([a0, vf, neglogp0], {X: ob})
            return a, v, self.initial_state, neglogp

        def value(ob, *_args, **_kwargs):
            return sess.run(vf, {X: ob})

        def neg_log_prob(actions):
            return tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=self.pi, labels=actions)

        self.X = X
        self.vf = vf
        self.step = step
        self.value = value
        self.neg_log_prob = neg_log_prob
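For context, a policy object like this is usually driven by a rollout loop that calls `step` on batched observations and `value` to bootstrap the last state. A hedged sketch follows; `env` and `model` are hypothetical stand-ins and not part of the example above.

    # Hypothetical rollout loop over the step/value interface defined above.
    # `env` is assumed to be a vectorized environment returning batched obs.
    obs = env.reset()
    for _ in range(nsteps):
        actions, values, _, neglogps = model.step(obs)
        obs, rewards, dones, _ = env.step(actions)
    last_values = model.value(obs)   # bootstrap value for the final state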
Example #3
    def __init__(self, policy, ob_space, ac_space, nenvs, nsteps, nstack, num_procs,
            ent_coef=0.01, vf_coef=0.5, max_grad_norm=0.5, lr=7e-4,
            alpha=0.99, epsilon=1e-5, total_timesteps=int(80e6), lrschedule='linear', logdir=None):
        config = tf.ConfigProto(allow_soft_placement=True,
                                intra_op_parallelism_threads=num_procs,
                                inter_op_parallelism_threads=num_procs)
        config.gpu_options.allow_growth = True
        sess = tf.Session(config=config)
        nact = ac_space.n
        nbatch = nenvs*nsteps

        ADV = tf.placeholder(tf.float32, [None])
        R = tf.placeholder(tf.float32, [None])
        LR = tf.placeholder(tf.float32, [])

        step_model = policy(sess, ob_space, ac_space, nenvs, 1, nstack, reuse=False)
        train_model = policy(sess, ob_space, ac_space, nenvs, nsteps, nstack, reuse=True)

        neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi, labels=train_model.a0)
        entropy = tf.reduce_sum(cat_entropy(train_model.pi))
        params = find_trainable_variables("model")
        tf.summary.histogram("vf", train_model.vf)
        tf.summary.histogram("R", R)

        if train_model.relaxed:
            pg_loss = tf.constant(0.0)
            oh_A = tf.one_hot(train_model.a0, ac_space.n)

            params = find_trainable_variables("model")
            policy_params = [v for v in params if "pi" in v.name]
            vf_params = [v for v in params if "vf" in v.name]
            entropy_grads = tf.gradients(entropy, policy_params)

            ddiff_loss = tf.reduce_sum(train_model.vf - train_model.vf_t)
            ddiff_grads = tf.gradients(ddiff_loss, policy_params)

            sm = tf.nn.softmax(train_model.pi)
            dlogp_dpi = oh_A * (1. - sm) + (1. - oh_A) * (-sm)
            pi_grads = -((tf.expand_dims(R, 1) - train_model.vf_t) * dlogp_dpi)
            pg_grads = tf.gradients(train_model.pi, policy_params, grad_ys=pi_grads)
            pg_grads = [pg - dg for pg, dg in zip(pg_grads, ddiff_grads)]

            pi_param_grads = tf.gradients(train_model.pi, policy_params, grad_ys=pi_grads)

            cv_grads = tf.concat([tf.reshape(p, [-1]) for p in pg_grads], 0)
            cv_grad_splits = tf.reduce_sum(tf.square(cv_grads))
            vf_loss = cv_grad_splits * vf_coef

            cv_grads = tf.gradients(vf_loss, vf_params)

            policy_grads = []
            for e_grad, p_grad, param in zip(entropy_grads, pg_grads, policy_params):
                grad = -e_grad * ent_coef + p_grad
                policy_grads.append(grad)
            grad_dict = {}

            for g, v in list(zip(policy_grads, policy_params))+list(zip(cv_grads, vf_params)):
                grad_dict[v] = g

            grads = [grad_dict[v] for v in params]
            print(grads)


        else:
            pg_loss = tf.reduce_sum((tf.stop_gradient(R) - tf.stop_gradient(train_model.vf)) * neglogpac)
            policy_params = [v for v in params if "pi" in v.name]
            pg_grads = tf.gradients(pg_loss, policy_params)

            vf_loss = tf.reduce_sum(mse(tf.squeeze(train_model.vf), R))
            loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef
            grads = tf.gradients(loss, params)

        grads = list(zip(grads, params))

        ema = tf.train.ExponentialMovingAverage(.99)
        all_policy_grads = tf.concat([tf.reshape(g, [-1]) for g in pg_grads], 0)
        all_policy_grads_sq = tf.square(all_policy_grads)
        apply_mean_op = ema.apply([all_policy_grads, all_policy_grads_sq])
        em_mean = ema.average(all_policy_grads)
        em_mean_sq = ema.average(all_policy_grads_sq)
        em_var = em_mean_sq - tf.square(em_mean)
        em_log_var = tf.log(em_var + 1e-20)
        mlgv = tf.reduce_mean(em_log_var)

        for g, v in grads:
            print(v.name, g)
            tf.summary.histogram(v.name, v)
            tf.summary.histogram(v.name+"_grad", g)

        self.sum_op = tf.summary.merge_all()
        self.writer = tf.summary.FileWriter(logdir)

        trainer = tf.train.AdamOptimizer(learning_rate=LR, beta2=.99999)
        with tf.control_dependencies([apply_mean_op]):
            _train = trainer.apply_gradients(grads)

        lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)
        self._step = 0
        def train(obs, states, rewards, masks, u1, u2, values, summary=False):
            advs = rewards - values
            for step in range(len(obs)):
                cur_lr = lr.value()
            td_map = {
                train_model.X:obs, train_model.U1:u1, train_model.U2:u2,
                ADV:advs, R:rewards, LR:cur_lr
            }
            if states != []:
                td_map[train_model.S] = states
                td_map[train_model.M] = masks
            if summary:
                sum_str, policy_loss, value_loss, policy_entropy, lv, _ = sess.run(
                    [self.sum_op, pg_loss, vf_loss, entropy, mlgv, _train],
                    td_map
                )
                self.writer.add_summary(sum_str, self._step)
            else:
                policy_loss, value_loss, policy_entropy, lv, _ = sess.run(
                    [pg_loss, vf_loss, entropy, mlgv, _train],
                    td_map
                )
            self._step += 1
            return policy_loss, value_loss, policy_entropy, lv

        def save(save_path):
            ps = sess.run(params)
            make_path(save_path)
            joblib.dump(ps, save_path)

        def load(load_path):
            loaded_params = joblib.load(load_path)
            restores = []
            for p, loaded_p in zip(params, loaded_params):
                restores.append(p.assign(loaded_p))
            ps = sess.run(restores)

        self.train = train
        self.train_model = train_model
        self.step_model = step_model
        self.step = step_model.step
        self.value = step_model.value
        self.initial_state = step_model.initial_state
        self.save = save
        self.load = load
        tf.global_variables_initializer().run(session=sess)
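In the relaxed branch, `dlogp_dpi = oh_A * (1. - sm) + (1. - oh_A) * (-sm)` is the hand-written gradient of the chosen action's log-probability with respect to the logits, i.e. `one_hot(a) - softmax(logits)`. A small NumPy check of that identity, for illustration only:

    # Numerical check that oh*(1-sm) + (1-oh)*(-sm) == one_hot(a) - softmax,
    # which is d/d(logits) of log softmax(logits)[a].
    import numpy as np

    logits = np.array([1.0, 2.0, 0.5])
    a = 1
    sm = np.exp(logits) / np.exp(logits).sum()
    oh = np.eye(len(logits))[a]
    lhs = oh * (1. - sm) + (1. - oh) * (-sm)
    rhs = oh - sm
    assert np.allclose(lhs, rhs)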
Example #4
    def __init__(self, policy, ob_space, ac_space, nenvs, nsteps,
            ent_coef=0.01, vf_coef=0.5, max_grad_norm=0.5, lr=7e-4,
            alpha=0.99, epsilon=1e-5, total_timesteps=int(80e6),
                 lrschedule='linear', replay_lambda=1, ss_rate=1,
                 replay_loss=None):

        sess = tf_util.make_session()
        nact = ac_space.n
        nbatch = nenvs*nsteps

        # If a replay loss is specified, create a replay buffer of past data
        # and use it to keep the replay loss low.
        if replay_loss is not None:
            self.replay_buffer = [] # holds all past data

        A = tf.placeholder(tf.int32, [nbatch])
        ADV = tf.placeholder(tf.float32, [nbatch])
        R = tf.placeholder(tf.float32, [nbatch])
        LR = tf.placeholder(tf.float32, [])

        step_model = policy(sess, ob_space, ac_space, nenvs, 1, reuse=False)
        train_model = policy(sess, ob_space, ac_space, nenvs*nsteps, nsteps, reuse=True)

        neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi, labels=A)
        pg_loss = tf.reduce_mean(ADV * neglogpac)
        vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), R))
        entropy = tf.reduce_mean(cat_entropy(train_model.pi))

        # Introduce replay_loss if given
        if replay_loss == "L2":
            # Replace train_model.pi with whatever is predicted label
            # Replace A with whatever is recorded label
            re_loss = tf.nn.l2_loss(tf.nn.softmax(train_model.pi) - A) / nbatch
        elif replay_loss == "Distillation":
            # Replace y_donor with whatever is recorded label
            # Replace y_acceptor with whatever is predicted label
            re_loss = tf.reduce_mean( - tf.reduce_sum(tf.stop_gradient(y_donor)
                                                      * tf.log(y_acceptor),
                                                      reduction_indices=1))
        loss = pg_loss - entropy*ent_coef + vf_loss * vf_coef
        if replay_loss is not None:
            loss = loss + replay_lambda*re_loss
        params = find_trainable_variables("model")
        grads = tf.gradients(loss, params)
        if max_grad_norm is not None:
            grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
        grads = list(zip(grads, params))
        trainer = tf.train.RMSPropOptimizer(learning_rate=LR, decay=alpha, epsilon=epsilon)
        _train = trainer.apply_gradients(grads)

        lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)

        def train(obs, states, rewards, masks, actions, values):
            advs = rewards - values
            for step in range(len(obs)):
                cur_lr = lr.value()
            td_map = {train_model.X:obs, A:actions, ADV:advs, R:rewards, LR:cur_lr}
            if states is not None:
                td_map[train_model.S] = states
                td_map[train_model.M] = masks
            policy_loss, value_loss, policy_entropy, _ = sess.run(
                [pg_loss, vf_loss, entropy, _train],
                td_map
            )
            return policy_loss, value_loss, policy_entropy

        def save(save_path):
            ps = sess.run(params)
            make_path(save_path)
            joblib.dump(ps, save_path)

        def load(load_path):
            loaded_params = joblib.load(load_path)
            restores = []
            for p, loaded_p in zip(params, loaded_params):
                restores.append(p.assign(loaded_p))
            ps = sess.run(restores)

        self.train = train
        self.train_model = train_model
        self.step_model = step_model
        self.step = step_model.step
        self.value = step_model.value
        self.initial_state = step_model.initial_state
        self.save = save
        self.load = load
        tf.global_variables_initializer().run(session=sess)
Example #5
    def __init__(self,
                 *,
                 policy,
                 ob_space,
                 ac_space,
                 nbatch_act,
                 nbatch_train,
                 nsteps,
                 ent_coef,
                 vf_coef,
                 max_grad_norm,
                 cell=256,
                 sv_M=32,
                 algo='regular',
                 ib_alpha=1e-3):
        sess = tf_util.make_session()

        act_model = policy(sess,
                           ob_space,
                           ac_space,
                           nbatch_act,
                           1,
                           1,
                           cell=cell,
                           M=sv_M,
                           model='step_model',
                           algo=algo)
        train_model = policy(sess,
                             ob_space,
                             ac_space,
                             nbatch_train,
                             1,
                             nsteps,
                             cell=cell,
                             M=sv_M,
                             model='train_model',
                             algo=algo)

        A = train_model.wpdtype.sample_placeholder([None])
        ADV = tf.placeholder(tf.float32, [None])
        R = tf.placeholder(tf.float32, [None])
        OLDNEGLOGPAC = tf.placeholder(tf.float32, [None])
        OLDNEGLOGPAC_expand = tf.placeholder(tf.float32, [None, sv_M])
        OLDVPRED = tf.placeholder(tf.float32, [None])
        OLDVPRED_expand = tf.placeholder(tf.float32, [None, sv_M])
        LR = tf.placeholder(tf.float32, [])
        CLIPRANGE = tf.placeholder(tf.float32, [])

        if algo == 'use_svib_uniform' or algo == 'use_svib_gaussian':

            def expand_placeholder(X, M=sv_M):
                return tf.tile(tf.expand_dims(X, axis=-1), [1, M])

            A_expand, R_expand = expand_placeholder(A), expand_placeholder(R)
            neglogpac_expand = tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=train_model.wpi_expand,
                labels=A_expand)  #shape=[nbatch, sv_M]
            entropy_expand = tf.reduce_mean(cat_entropy(
                train_model.wpi_expand),
                                            axis=-1)  #shape=[nbatch]
            vpred_expand = train_model.wvf_expand[:, :, 0]
            vpredclipped_expand = OLDVPRED_expand + tf.clip_by_value(
                train_model.wvf_expand[:, :, 0] - OLDVPRED_expand, -CLIPRANGE,
                CLIPRANGE)
            vf_loss1_expand = tf.square(vpred_expand - R_expand)
            vf_loss2_expand = tf.square(vpredclipped_expand - R_expand)
            vf_loss_expand = .5 * tf.reduce_mean(tf.maximum(
                vf_loss1_expand, vf_loss2_expand),
                                                 axis=-1)  #shape = [nbatch]
            ratio_expand = tf.exp(OLDNEGLOGPAC_expand - neglogpac_expand)
            ADV_expand = R_expand - OLDVPRED_expand
            # ADV_expand_mean, ADV_expand_var = tf.nn.moments(ADV_expand, axes=0, keep_dims=True)#shape = [1,sv_M]
            ADV_expand_mean, ADV_expand_var = tf.nn.moments(
                ADV_expand, axes=[0, 1])  #shape = [1,sv_M]
            ADV_expand_normal = (ADV_expand - ADV_expand_mean) / (
                tf.sqrt(ADV_expand_var) + 1e-8)
            pg_losses_expand = -ADV_expand_normal * ratio_expand
            pg_losses2_expand = -ADV_expand_normal * tf.clip_by_value(
                ratio_expand, 1. - CLIPRANGE, 1. + CLIPRANGE)
            pg_loss_expand = tf.reduce_mean(tf.maximum(pg_losses_expand,
                                                       pg_losses2_expand),
                                            axis=-1)
            J_theta = -(pg_loss_expand + vf_coef * vf_loss_expand -
                        ent_coef * entropy_expand)

            loss_expand = -J_theta / float(nbatch_train)
            pg_loss_expand_ = tf.reduce_mean(pg_loss_expand)
            vf_loss_expand_ = tf.reduce_mean(vf_loss_expand)
            entropy_expand_ = tf.reduce_mean(entropy_expand)

            log_p_grads = tf.gradients(
                J_theta / np.sqrt(ib_alpha),
                [train_model.wh_expand])[0]  #shape=[nbatch, sv_M, cell]
            if algo == 'use_svib_gaussian':
                mean, var = tf.nn.moments(
                    train_model.wh_expand, axes=1,
                    keep_dims=True)  #shape=[nbatch, 1,cell]
                gaussian_grad = -(train_model.wh_expand - mean) / (
                    float(sv_M) * (var + 1e-3))
                log_p_grads += 5e-3 * (
                    tf_l2norm(log_p_grads, axis=-1, keep_dims=True) /
                    tf_l2norm(gaussian_grad, axis=-1,
                              keep_dims=True)) * gaussian_grad
            sv_grads = tf.constant(0.,
                                   tf.float32,
                                   shape=[nbatch_train, 0, cell])
            exploit_total_norm_square = 0
            explore_total_norm_square = 0
            explore_coef = 1.
            if env_name == 'SeaquestNoFrameskip-v4':
                explore_coef = 0.01
            elif env_name in [
                    'AirRaidNoFrameskip-v4',
                    'BreakoutNoFrameskip-v4', 'AtlantisNoFrameskip-v4',
                    'StarGunnerNoFrameskip-v4', 'AsteroidsNoFrameskip-v4',
                    'YarsRevengeNoFrameskip-v4'
            ]:
                explore_coef = 0.
            print('env_name:', env_name, 'explore_coef: ', explore_coef)
            for i in range(sv_M):
                exploit = tf.reduce_sum(train_model.rpf_matrix[:, :, i:i + 1] *
                                        log_p_grads,
                                        axis=1)
                explore = np.sqrt(
                    ib_alpha) * explore_coef * train_model.rpf_grads[:, i, :]
                exploit_total_norm_square += tf.square(
                    tf_l2norm(exploit, axis=-1, keep_dims=False))
                explore_total_norm_square += tf.square(
                    tf_l2norm(explore, axis=-1, keep_dims=False))
                sv_grad = exploit + explore  #shape=[nbatch, cell]
                sv_grads = tf.concat(
                    [sv_grads, tf.expand_dims(sv_grad, axis=1)], axis=1)
            SV_GRADS = tf.placeholder(tf.float32, [nbatch_train, sv_M, cell])
            repr_loss = tf.reduce_mean(SV_GRADS * train_model.wh_expand,
                                       axis=1)  #shape=[nbatch,cell]
            repr_loss = -tf.reduce_mean(tf.reduce_sum(
                repr_loss,
                axis=-1))  #max optimization problem to minimization problem

            #op for debugging and visualization
            exploit_explore_ratio = tf.sqrt(
                exploit_total_norm_square /
                tf.maximum(explore_total_norm_square, 0.01))[0]
            # rpf_mat = tf.expand_dims(train_model.rpf_matrix, axis=-1)
            # log_p_grads_tile = tf.tile(tf.expand_dims(log_p_grads, axis=2), [1,1,sv_M,1])
            # exploit = tf.reduce_sum(rpf_mat*log_p_grads_tile, axis=1)
            # explore = np.sqrt(ib_alpha) * train_model.rpf_grads
            # sv_grads = exploit + explore
            # ind = 1
            # exploit = tf.reduce_sum(train_model.rpf_matrix[:, :, i:i + 1] * log_p_grads, axis=1)
            # explore = train_model.rpf_grads[:, i, :]
            # clip_coef = tf_l2norm(exploit, axis=-1, keep_dims=True)
            # explore_norm = tf_l2norm(explore, axis=-1, keep_dims=True)
            # explore = explore * 1e-2 * clip_coef / tf.maximum(explore_norm, clip_coef)
            # sv_grad = exploit + np.sqrt(ib_alpha) * explore  # shape=[nbatch, cell]

            grads_expand, grad_norm_expand = grad_clip(loss_expand,
                                                       max_grad_norm,
                                                       ['model/worker_module'])
            trainer_expand = tf.train.AdamOptimizer(learning_rate=LR,
                                                    epsilon=1e-5)
            _train_expand = trainer_expand.apply_gradients(grads_expand)
            repr_grads, repr_global_norm = grad_clip(
                repr_loss, max_grad_norm, ['model/ordinary_encoder'])
            repr_trainer = tf.train.AdamOptimizer(learning_rate=LR,
                                                  epsilon=1e-5)
            _repr_train = repr_trainer.apply_gradients(repr_grads)
        else:
            print('env_name:', env_name)
            neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=train_model.wpi, labels=A)
            entropy = tf.reduce_mean(cat_entropy(train_model.wpi))
            vpred = train_model.wvf[:, 0]
            vpredclipped = OLDVPRED + tf.clip_by_value(
                train_model.wvf[:, 0] - OLDVPRED, -CLIPRANGE, CLIPRANGE)
            vf_losses1 = tf.square(vpred - R)
            vf_losses2 = tf.square(vpredclipped - R)
            vf_loss = .5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2))
            ratio = tf.exp(OLDNEGLOGPAC - neglogpac)
            pg_losses = -ADV * ratio
            pg_losses2 = -ADV * tf.clip_by_value(ratio, 1.0 - CLIPRANGE,
                                                 1.0 + CLIPRANGE)
            pg_loss = tf.reduce_mean(tf.maximum(pg_losses, pg_losses2))
            loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef

            grads, grad_norm = grad_clip(
                loss, max_grad_norm,
                ['model/worker_module', 'model/ordinary_encoder'])
            trainer = tf.train.AdamOptimizer(learning_rate=LR, epsilon=1e-5)
            _train = trainer.apply_gradients(grads)

        with tf.variable_scope('model'):
            params = tf.trainable_variables()

        def generate_old_expand_data(obs, noises, masks, actions, states=None):
            noises_expand = sess.run(train_model.noise_expand)
            repr_td_map = {
                train_model.wX: obs,
                train_model.istraining: False,
                A: actions,
                train_model.noise_expand: noises_expand,
                train_model.NOISE_KEEP: noises
            }
            if states is not None:
                repr_td_map[train_model.wS] = states
                repr_td_map[train_model.wM] = masks
            neglogpacs_expand, vpreds_expand = \
                sess.run([neglogpac_expand, vpred_expand], feed_dict=repr_td_map)
            shape = noises_expand.shape
            noises_expand = noises_expand.reshape(nbatch_train, sv_M - 1,
                                                  *shape[1:])
            return [noises_expand, neglogpacs_expand, vpreds_expand]

        def train(lr,
                  cliprange,
                  obs,
                  noises,
                  returns,
                  masks,
                  actions,
                  values,
                  neglogpacs,
                  noises_expand=None,
                  neglogpacs_expand=None,
                  vpreds_expand=None,
                  states=None):
            advs = returns - values
            advs = (advs - advs.mean()) / (advs.std() + 1e-8)
            if algo == 'use_svib_uniform' or algo == 'use_svib_gaussian':
                shape = noises_expand.shape
                noises_expand_ = noises_expand.reshape(
                    nbatch_train * (sv_M - 1), *shape[2:])
                # print(noises_expand_.shape)
                repr_td_map = {
                    train_model.wX: obs,
                    train_model.istraining: True,
                    A: actions,
                    R: returns,
                    LR: lr,
                    CLIPRANGE: cliprange,
                    train_model.noise_expand: noises_expand_,
                    train_model.NOISE_KEEP: noises,
                    OLDNEGLOGPAC_expand: neglogpacs_expand,
                    OLDVPRED_expand: vpreds_expand
                }
            rl_td_map = {
                train_model.istraining: True,
                A: actions,
                R: returns,
                LR: lr,
                CLIPRANGE: cliprange
            }
            if states is not None:
                if algo == 'use_svib_uniform' or algo == 'use_svib_gaussian':
                    repr_td_map[train_model.wS] = states
                    repr_td_map[train_model.wM] = masks
                rl_td_map[train_model.wS] = states
                rl_td_map[train_model.wM] = masks

            if algo == 'use_svib_uniform' or algo == 'use_svib_gaussian':
                sv_gradients, whs_expand, ir_ratio = sess.run(
                    [sv_grads, train_model.wh_expand, exploit_explore_ratio],
                    feed_dict=repr_td_map)
                rl_td_map[OLDNEGLOGPAC_expand], rl_td_map[
                    OLDVPRED_expand], rl_td_map[
                        train_model.
                        wh_expand] = neglogpacs_expand, vpreds_expand, whs_expand
                value_loss, policy_loss, policy_entropy, _, rl_grad_norm = sess.run(
                    [
                        vf_loss_expand_, pg_loss_expand_, entropy_expand_,
                        _train_expand, grad_norm_expand
                    ],
                    feed_dict=rl_td_map)
                repr_td_map[SV_GRADS] = sv_gradients
                repr_grad_norm, represent_loss, __ = sess.run(
                    [repr_global_norm, repr_loss, _repr_train],
                    feed_dict=repr_td_map)
            else:
                rl_td_map[train_model.wX], rl_td_map[
                    train_model.
                    noise] = obs, noises  #noise won't be used when algo is 'regular'
                rl_td_map[OLDNEGLOGPAC], rl_td_map[OLDVPRED], rl_td_map[
                    ADV] = neglogpacs, values, advs
                value_loss, policy_loss, policy_entropy, _, rl_grad_norm = sess.run(
                    [vf_loss, pg_loss, entropy, _train, grad_norm],
                    feed_dict=rl_td_map)
                represent_loss, rpf_norm_, rpf_grad_norm_, sv_gradients, ir_ratio, repr_grad_norm = 0., 0., 0., 0., 0, 0.
            return policy_loss, value_loss, policy_entropy, represent_loss, ir_ratio, rl_grad_norm, repr_grad_norm

        self.loss_names = [
            'policy_loss', 'value_loss', 'policy_entropy', 'represent_loss',
            'exploit_explore_ratio', 'rl_grad_norm', 'repr_grad_norm'
        ]

        def save(save_path):
            ps = sess.run(params)
            make_path(osp.dirname(save_path))
            joblib.dump(ps, save_path)

        def load(load_path):
            loaded_params = joblib.load(load_path)
            restores = []
            for p, loaded_p in zip(params, loaded_params):
                restores.append(p.assign(loaded_p))
            sess.run(restores)
            # If you want to load weights, also save/load observation scaling inside VecNormalize

        self.generate_old_expand_data = generate_old_expand_data
        self.train = train
        self.train_model = train_model
        self.act_model = act_model
        self.step = act_model.step
        self.value = act_model.wvalue
        self.initial_state = act_model.w_initial_state
        self.save = save
        self.load = load
        tf.global_variables_initializer().run(session=sess)  #pylint: disable=E1101
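The `exploit`/`explore` split above follows the Stein variational gradient pattern: a kernel-weighted sum of log-density gradients plus the kernel's own gradient, with `train_model.rpf_matrix` and `train_model.rpf_grads` appearing to supply the kernel matrix and kernel gradients. Below is a standalone NumPy sketch of one such update with an RBF kernel, for illustration only; the bandwidth and step size are arbitrary.

    # One Stein variational gradient step with an RBF kernel.
    import numpy as np

    def rbf_kernel(particles, bandwidth=1.0):
        # particles: [M, d] -> kernel matrix K [M, M] and, for each particle i,
        # sum_j grad_{x_j} k(x_j, x_i), shape [M, d].
        diffs = particles[:, None, :] - particles[None, :, :]   # diffs[j, i] = x_j - x_i
        K = np.exp(-np.sum(diffs ** 2, axis=-1) / (2. * bandwidth ** 2))
        grad_K = -np.einsum('jid,ji->id', diffs, K) / bandwidth ** 2
        return K, grad_K

    def svgd_step(particles, log_p_grad, step_size=1e-2):
        # log_p_grad: [M, d], gradient of the log-density at each particle.
        K, grad_K = rbf_kernel(particles)
        M = particles.shape[0]
        phi = (K @ log_p_grad + grad_K) / M   # "exploit" + "explore"
        return particles + step_size * phi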
Example #6
    def __init__(self,
                 policy,
                 ob_space,
                 ac_space,
                 nenvs,
                 nsteps,
                 ent_coef=0.01,
                 vf_coef=0.5,
                 max_grad_norm=0.5,
                 lr=7e-4,
                 alpha=0.99,
                 epsilon=1e-5,
                 lambda_dist=0.01,
                 total_timesteps=None,
                 lrschedule='linear'):

        sess = tf.get_default_session()
        nact = ac_space.n
        nbatch = nenvs * nsteps

        A = tf.placeholder(tf.int32, [nbatch])
        ADV = tf.placeholder(tf.float32, [nbatch])
        R = tf.placeholder(tf.float32, [nbatch])
        LR = tf.placeholder(tf.float32, [])

        config = Config()

        act_model = policy(config)
        config.reuse = True
        train_model = policy(config)

        neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=train_model.logits, labels=A)
        pg_loss = tf.reduce_mean(ADV * neglogpac)
        vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), R))
        entropy = tf.reduce_mean(cat_entropy(train_model.logits))

        # Auxiliary loss; reduce to a scalar so it matches the other
        # (mean-reduced) loss terms before weighting by lambda_dist.
        aux_loss = tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=train_model.rp_logits, labels=A))
        loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef + aux_loss * lambda_dist

        params = find_trainable_variables("model")
        grads = tf.gradients(loss, params)
        if max_grad_norm is not None:
            grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
        grads = list(zip(grads, params))
        trainer = tf.train.RMSPropOptimizer(learning_rate=LR,
                                            decay=alpha,
                                            epsilon=epsilon)
        _train = trainer.apply_gradients(grads)

        lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)

        saver = tf.train.Saver()

        def train(obs, rs, rr, rewards, masks, actions, values):
            advs = rewards - values
            for step in range(len(obs)):
                cur_lr = lr.value()
            td_map = {
                train_model.X: obs,
                A: actions,
                ADV: advs,
                R: rewards,
                LR: cur_lr,
                train_model.inputs_s: rs,
                train_model.inputs_r: rr
            }

            policy_loss, value_loss, policy_entropy, _ = sess.run(
                [pg_loss, vf_loss, entropy, _train], td_map)
            return policy_loss, value_loss, policy_entropy

        def save(save_path):
            saver.save(sess, save_path + 'model.ckpt')

        def load(load_path):
            saver.restore(sess, load_path + 'model.ckpt')

        self.train = train
        self.train_model = train_model
        self.act_model = act_model
        self.act = act_model.act
        self.value = act_model.value
        self.save = save
        self.load = load
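The `lr.value()` calls inside the training loops of these examples come from a baselines-style `Scheduler` that anneals the learning rate over `total_timesteps`. A hedged sketch of such a scheduler with a linear schedule is shown below; the class actually imported here supports more schedule types and may differ in detail.

    # Minimal sketch of a baselines-style Scheduler with a 'linear' schedule.
    class Scheduler(object):
        def __init__(self, v, nvalues, schedule='linear'):
            self.v = v                # initial value (e.g. learning rate)
            self.nvalues = nvalues    # total number of steps to anneal over
            self.n = 0.
            self.schedule = schedule

        def value(self):
            frac = self.n / self.nvalues
            current = self.v if self.schedule == 'constant' else self.v * (1. - frac)
            self.n += 1.
            return current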
Example #7
    def __init__(self,
                 policy,
                 ob_space,
                 ac_space,
                 nenvs,
                 nsteps,
                 nstack,
                 num_procs,
                 ent_coef=0.01,
                 vf_coef=0.5,
                 max_grad_norm=0.5,
                 lr=7e-4,
                 alpha=0.99,
                 epsilon=1e-5,
                 total_timesteps=int(80e6),
                 lrschedule='linear',
                 continuous_actions=False,
                 debug=False,
                 numAgents=2,
                 itr=1,
                 particleEnv=False,
                 communication=False):
        self.continuous_actions = continuous_actions
        self.nenvs = nenvs
        print('vf_coef', vf_coef)
        config = tf.ConfigProto(allow_soft_placement=True,
                                intra_op_parallelism_threads=num_procs,
                                inter_op_parallelism_threads=num_procs)
        config.gpu_options.allow_growth = True
        sess = tf.Session(config=config)
        # print('action space: ', ac_space)
        if particleEnv == False:
            nact = ac_space.n
        elif communication == False:
            nact = ac_space[itr].n
        else:
            nact = ac_space[itr].high - ac_space[itr].low  # modified
        self.nact = nact
        # print('nact: ', nact)
        # print(nact)
        nbatch = nenvs * nsteps
        # print(nbatch)
        # print('batch size: ', nbatch)
        if self.continuous_actions:
            A = tf.placeholder(tf.float32, [nbatch])
        elif particleEnv == False or communication == False:
            A = tf.placeholder(tf.int32, [nbatch])
        else:
            actions_per_agent = 2
            A = tf.placeholder(tf.int32, [actions_per_agent, nbatch])
        ADV = tf.placeholder(tf.float32, [nbatch])
        R = tf.placeholder(tf.float32, [nbatch])
        LR = tf.placeholder(tf.float32, [])
        if particleEnv == False:
            step_model = policy(
                sess,
                ob_space,
                ac_space,
                nenvs,
                1,
                nstack,
                reuse=tf.AUTO_REUSE,
                continuous_actions=continuous_actions)  #, itr=itr)
            train_model = policy(
                sess,
                ob_space,
                ac_space,
                nenvs,
                nsteps,
                nstack,
                reuse=tf.AUTO_REUSE,
                continuous_actions=continuous_actions)  #, itr=itr)
        elif communication == False:
            # print('step model')
            step_model = policy(sess,
                                ob_space,
                                ac_space,
                                nenvs,
                                1,
                                nstack,
                                reuse=False,
                                continuous_actions=continuous_actions,
                                itr=itr,
                                communication=communication)
            # print('train model')
            train_model = policy(sess,
                                 ob_space,
                                 ac_space,
                                 nenvs,
                                 nsteps,
                                 nstack,
                                 reuse=tf.AUTO_REUSE,
                                 continuous_actions=continuous_actions,
                                 itr=itr,
                                 communication=communication)
        else:
            step_model = policy(sess,
                                ob_space,
                                ac_space,
                                nenvs,
                                1,
                                nstack,
                                reuse=tf.AUTO_REUSE,
                                continuous_actions=continuous_actions,
                                itr=itr,
                                communication=communication)
            train_model = policy(sess,
                                 ob_space,
                                 ac_space,
                                 nenvs,
                                 nsteps,
                                 nstack,
                                 reuse=tf.AUTO_REUSE,
                                 continuous_actions=continuous_actions,
                                 itr=itr,
                                 communication=communication)
        # else:
        # else:
        #     step_model = []
        #     train_model = []
        #     for i in range(numAgents):
        #         step_model.append(policy(sess, ob_space, ac_space, nenvs, 1, nstack, reuse=tf.AUTO_REUSE, continuous_actions=continuous_actions))
        #         train_model.append(policy(sess, ob_space, ac_space, nenvs, nsteps, nstack, reuse=True, continuous_actions=continuous_actions))

        # print(train_model)
        if self.continuous_actions:
            neglogpac = tf.log(mse(train_model.mu, A))
        elif particleEnv == False or communication == False:
            # print('A: ', A)
            neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=train_model.pi, labels=A)
            vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), R))
            entropy = tf.reduce_mean(cat_entropy(train_model.pi))
            pg_loss = tf.reduce_mean(ADV * neglogpac)
            loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef
        else:
            neglogpac = []
            entropy = []
            pg_loss = []
            loss = []
            vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), R))
            neglogpac_ = tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=train_model.pi_c, labels=A[0])
            entropy_ = tf.reduce_mean(cat_entropy(train_model.pi_c))
            pg_loss_ = tf.reduce_mean(ADV * neglogpac_)
            entropy.append(entropy_)
            pg_loss.append(pg_loss_)
            loss.append(pg_loss_ - entropy_ * ent_coef + vf_loss * vf_coef)
            neglogpac_ = tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=train_model.pi_u, labels=A[1])
            entropy_ = tf.reduce_mean(cat_entropy(train_model.pi_u))
            pg_loss_ = tf.reduce_mean(ADV * neglogpac_)
            entropy.append(entropy_)
            pg_loss.append(pg_loss_)
            loss.append(pg_loss_ - entropy_ * ent_coef + vf_loss * vf_coef)

        params = find_trainable_variables("model")
        grads = tf.gradients(loss, params)
        if max_grad_norm is not None:
            grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
        grads = list(zip(grads, params))
        # if itr == 0:
        #     trainer = tf.train.RMSPropOptimizer(learning_rate=LR, decay=alpha, epsilon=epsilon)
        _train = tf.train.AdamOptimizer(
            learning_rate=LR, name=str(itr)).apply_gradients(grads)
        # _train = tf.train.RMSPropOptimizer(learning_rate=LR, decay=alpha, epsilon=epsilon, name=str(itr)).apply_gradients(grads)  # Error here

        lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)

        def train(obs,
                  states,
                  rewards,
                  masks,
                  actions,
                  values,
                  debug=False,
                  numAgents=2):
            # print('train rewards and values')
            # print(actions[0])
            # print(actions[1])
            # print(rewards)
            # print(values)
            advs = rewards - values
            for step in range(len(obs)):
                cur_lr = lr.value()
            td_map = {
                train_model.X: obs,
                A: actions,
                ADV: advs,
                R: rewards,
                LR: cur_lr
            }
            # if states != []:
            if train_model.initial_state != []:
                # print(states)
                td_map[train_model.S] = states
                td_map[train_model.M] = masks
            if debug == True:
                policy_loss, value_loss, policy_entropy, all_grad_vals, _ = sess.run(
                    [pg_loss, vf_loss, entropy, grads, _train], td_map)
                # grad_vals = [(np.min(grad_vals), np.max(grad_vals), np.sum(grad_vals)) for grad_vals in all_grad_vals]
                # print('Policy Gradients: ')
                # print(all_grad_vals[9])
                # print('Value Gradients: ')
                # print(all_grad_vals[11])
                print('Gradient Values: ')
                print(all_grad_vals)
            else:
                policy_loss, value_loss, policy_entropy, _ = sess.run(
                    [pg_loss, vf_loss, entropy, _train], td_map)
            # else:
            # td_map = []
            #     print('Train Model in train')
            #     print(train_model)
            #     for i in range(numAgents):
            #         td_map = {train_model[i].X:obs, A:actions, ADV:advs, R:rewards, LR:cur_lr}
            #         if train_model[i].initial_state != []:
            #             print('states')
            #            print(states)
            #            td_map[train_model[i].S] = states
            #            td_map[train_model[i].M] = masks
            #        if debug:
            #            print('point1')
            #            policy_loss, value_loss, policy_entropy, all_grad_vals, _ = sess.run(
            #                [pg_loss, vf_loss, entropy, grads, _train],
            #                td_map
            #            )
            #            print('point2')
            #            grad_vals = [(np.min(grad_vals), np.max(grad_vals), np.sum(grad_vals)) for grad_vals in all_grad_vals]
            #            print('Policy Gradients: ')
            #            print(all_grad_vals[9])
            #            print('Value Gradients: ')
            #            print(all_grad_vals[11])
            #        else:
            #            policy_loss, value_loss, policy_entropy, _ = sess.run(
            #                [pg_loss, vf_loss, entropy, _train],
            #                td_map
            #            )
            # print('Policy Loss: ')
            # print(policy_loss)
            # print('Value Loss: ')
            # print(value_loss)

            return policy_loss, value_loss, policy_entropy

        def save(save_path):
            ps = sess.run(params)
            #make_path(save_path)
            joblib.dump(ps, save_path)

        def load(load_path):
            loaded_params = joblib.load(load_path)
            restores = []
            for p, loaded_p in zip(params, loaded_params):
                restores.append(p.assign(loaded_p))
            ps = sess.run(restores)

        self.train = train
        self.train_model = train_model
        self.step_model = step_model
        # if numAgents == 1:
        self.step = step_model.step
        self.value = step_model.value
        self.initial_state = step_model.initial_state
        # else:
        #     self.step = []
        #     self.value = []
        #     self.initial_state = []
        #     for i in range(numAgents):
        #         self.step.append(step_model[i].step)
        #         self.value.append(step_model[i].value)
        #         self.initial_state.append(step_model[i].initial_state)
        self.save = save
        self.load = load
        tf.global_variables_initializer().run(session=sess)
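In the communication branch above, `loss` ends up being a Python list of two per-head losses, and `tf.gradients(loss, params)` is still valid because TensorFlow sums the gradients of every tensor in the list. A small TF1-style check of that behavior, with arbitrary illustrative values:

    # Check that tf.gradients over a list of losses equals the gradient of
    # their sum, which is why the two-element `loss` list works above.
    import tensorflow as tf

    w = tf.Variable([1.0, 2.0])
    l1 = tf.reduce_sum(3.0 * w)
    l2 = tf.reduce_sum(tf.square(w))
    g_list = tf.gradients([l1, l2], [w])[0]   # gradients summed over the list
    g_sum = tf.gradients(l1 + l2, [w])[0]
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        print(sess.run([g_list, g_sum]))      # both evaluate to [5., 7.]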
Example #8
    def __init__(self,
                 policy,
                 ob_space,
                 ac_space,
                 nenvs,
                 total_timesteps,
                 nprocs=32,
                 nsteps=20,
                 ent_coef=0.01,
                 vf_coef=0.5,
                 vf_fisher_coef=1.0,
                 lr=0.25,
                 max_grad_norm=0.5,
                 kfac_clip=0.001,
                 lrschedule='linear'):
        config = tf.ConfigProto(allow_soft_placement=True,
                                intra_op_parallelism_threads=nprocs,
                                inter_op_parallelism_threads=nprocs)
        config.gpu_options.allow_growth = True
        self.sess = sess = tf.Session(config=config)
        nact = ac_space.n
        nbatch = nenvs * nsteps
        A = tf.placeholder(tf.int32, [nbatch])
        ADV = tf.placeholder(tf.float32, [nbatch])
        R = tf.placeholder(tf.float32, [nbatch])
        PG_LR = tf.placeholder(tf.float32, [])
        VF_LR = tf.placeholder(tf.float32, [])

        self.model = step_model = policy(sess,
                                         ob_space,
                                         ac_space,
                                         nenvs,
                                         1,
                                         reuse=False)
        self.model2 = train_model = policy(sess,
                                           ob_space,
                                           ac_space,
                                           nenvs * nsteps,
                                           nsteps,
                                           reuse=True)

        logpac = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=train_model.pi, labels=A)
        self.logits = logits = train_model.pi

        ##training loss
        pg_loss = tf.reduce_mean(ADV * logpac)
        entropy = tf.reduce_mean(cat_entropy(train_model.pi))
        pg_loss = pg_loss - ent_coef * entropy
        vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), R))
        train_loss = pg_loss + vf_coef * vf_loss

        ##Fisher loss construction
        self.pg_fisher = pg_fisher_loss = -tf.reduce_mean(logpac)
        sample_net = train_model.vf + tf.random_normal(tf.shape(
            train_model.vf))
        self.vf_fisher = vf_fisher_loss = -vf_fisher_coef * tf.reduce_mean(
            tf.pow(train_model.vf - tf.stop_gradient(sample_net), 2))
        self.joint_fisher = joint_fisher_loss = pg_fisher_loss + vf_fisher_loss

        self.params = params = find_trainable_variables("model")

        self.grads_check = grads = tf.gradients(train_loss, params)

        with tf.device('/gpu:0'):
            self.optim = optim = kfac.KfacOptimizer(learning_rate=PG_LR, clip_kl=kfac_clip,\
                momentum=0.9, kfac_update=1, epsilon=0.01,\
                stats_decay=0.99, async=1, cold_iter=10, max_grad_norm=max_grad_norm)

            update_stats_op = optim.compute_and_apply_stats(joint_fisher_loss,
                                                            var_list=params)
            train_op, q_runner = optim.apply_gradients(list(zip(grads,
                                                                params)))
        self.q_runner = q_runner
        self.lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)

        def train(obs, states, rewards, masks, actions, values):
            advs = rewards - values
            for step in range(len(obs)):
                cur_lr = self.lr.value()

            td_map = {
                train_model.X: obs,
                A: actions,
                ADV: advs,
                R: rewards,
                PG_LR: cur_lr
            }
            if states is not None:
                td_map[train_model.S] = states
                td_map[train_model.M] = masks

            policy_loss, value_loss, policy_entropy, _ = sess.run(
                [pg_loss, vf_loss, entropy, train_op], td_map)
            return policy_loss, value_loss, policy_entropy

        def save(save_path):
            ps = sess.run(params)
            joblib.dump(ps, save_path)

        def load(load_path):
            loaded_params = joblib.load(load_path)
            restores = []
            for p, loaded_p in zip(params, loaded_params):
                restores.append(p.assign(loaded_p))
            sess.run(restores)

        self.train = train
        self.save = save
        self.load = load
        self.train_model = train_model
        self.step_model = step_model
        self.step = step_model.step
        self.value = step_model.value
        self.initial_state = step_model.initial_state
        tf.global_variables_initializer().run(session=sess)
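The value-function Fisher term above uses the usual K-FAC sampling trick: regress the value head toward a stop-gradient sample `vf + noise`, so the gradient of that squared error is proportional to the score of a unit-variance Gaussian centered at `vf`, which is what the optimizer's curvature statistics need. A tiny TF1-style illustration of that gradient, with arbitrary values:

    # Gradient of -(v - stop_gradient(v + eps))**2 w.r.t. v is 2*eps, i.e.
    # proportional to the score d/dv log N(sample; v, 1) = (sample - v) = eps.
    import tensorflow as tf

    v = tf.Variable(2.0)
    eps = tf.constant(0.7)
    sample = tf.stop_gradient(v + eps)
    fisher_loss = -tf.square(v - sample)
    grad = tf.gradients(fisher_loss, v)[0]
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        print(sess.run(grad))   # 1.4 == 2 * eps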
Example #9
    def __init__(self,
                 policy,
                 ob_space,
                 ac_space,
                 nenvs,
                 nsteps,
                 ent_coef=0.01,
                 v_mix_coef=0.5,
                 max_grad_norm=0.5,
                 lr_alpha=7e-4,
                 lr_beta=7e-4,
                 alpha=0.99,
                 epsilon=1e-5,
                 total_timesteps=int(80e6),
                 lrschedule='linear',
                 r_ex_coef=1.0,
                 r_in_coef=0.0,
                 v_ex_coef=1.0):

        sess = tf_util.make_session()
        nact = ac_space.n
        nbatch = nenvs * nsteps

        A = tf.placeholder(tf.int32, [nbatch], 'A')
        R_EX = tf.placeholder(tf.float32, [nbatch], 'R_EX')
        ADV_EX = tf.placeholder(tf.float32, [nbatch], 'ADV_EX')
        RET_EX = tf.placeholder(tf.float32, [nbatch], 'RET_EX')
        V_MIX = tf.placeholder(tf.float32, [nbatch], 'V_MIX')
        DIS_V_MIX_LAST = tf.placeholder(tf.float32, [nbatch], 'DIS_V_MIX_LAST')
        COEF_MAT = tf.placeholder(tf.float32, [nbatch, nbatch], 'COEF_MAT')
        LR_ALPHA = tf.placeholder(tf.float32, [], 'LR_ALPHA')
        LR_BETA = tf.placeholder(tf.float32, [], 'LR_BETA')

        step_model = policy(sess, ob_space, ac_space, nenvs, 1, reuse=False)
        train_model = policy(sess,
                             ob_space,
                             ac_space,
                             nenvs * nsteps,
                             nsteps,
                             reuse=True)

        r_mix = r_ex_coef * R_EX + r_in_coef * tf.reduce_sum(
            train_model.r_in * tf.one_hot(A, nact), axis=1)
        ret_mix = tf.squeeze(
            tf.matmul(COEF_MAT, tf.reshape(r_mix, [nbatch, 1])),
            [1]) + DIS_V_MIX_LAST
        adv_mix = ret_mix - V_MIX

        neglogpac = train_model.pd.neglogp(A)
        pg_mix_loss = tf.reduce_mean(adv_mix * neglogpac)
        v_mix_loss = tf.reduce_mean(mse(tf.squeeze(train_model.v_mix),
                                        ret_mix))
        entropy = tf.reduce_mean(cat_entropy(train_model.pi))
        policy_loss = pg_mix_loss - ent_coef * entropy + v_mix_coef * v_mix_loss

        policy_params = train_model.policy_params
        policy_grads = tf.gradients(policy_loss, policy_params)
        if max_grad_norm is not None:
            policy_grads, policy_grad_norm = tf.clip_by_global_norm(
                policy_grads, max_grad_norm)
        policy_grads_and_vars = list(zip(policy_grads, policy_params))
        policy_trainer = tf.train.RMSPropOptimizer(learning_rate=LR_ALPHA,
                                                   decay=alpha,
                                                   epsilon=epsilon)
        policy_train = policy_trainer.apply_gradients(policy_grads_and_vars)

        rmss = [policy_trainer.get_slot(var, 'rms') for var in policy_params]
        policy_params_new = {}
        for grad, rms, var in zip(policy_grads, rmss, policy_params):
            ms = rms + (tf.square(grad) - rms) * (1 - alpha)
            policy_params_new[
                var.name] = var - LR_ALPHA * grad / tf.sqrt(ms + epsilon)
        policy_new = train_model.policy_new_fn(policy_params_new, ob_space,
                                               ac_space, nbatch, nsteps)

        neglogpac_new = policy_new.pd.neglogp(A)
        ratio_new = tf.exp(tf.stop_gradient(neglogpac) - neglogpac_new)
        pg_ex_loss = tf.reduce_mean(-ADV_EX * ratio_new)
        v_ex_loss = tf.reduce_mean(mse(tf.squeeze(train_model.v_ex), RET_EX))
        intrinsic_loss = pg_ex_loss + v_ex_coef * v_ex_loss

        intrinsic_params = train_model.intrinsic_params
        intrinsic_grads = tf.gradients(intrinsic_loss, intrinsic_params)
        if max_grad_norm is not None:
            intrinsic_grads, intrinsic_grad_norm = tf.clip_by_global_norm(
                intrinsic_grads, max_grad_norm)
        intrinsic_grads_and_vars = list(zip(intrinsic_grads, intrinsic_params))
        intrinsic_trainer = tf.train.RMSPropOptimizer(learning_rate=LR_BETA,
                                                      decay=alpha,
                                                      epsilon=epsilon)
        intrinsic_train = intrinsic_trainer.apply_gradients(
            intrinsic_grads_and_vars)

        lr_alpha = Scheduler(v=lr_alpha,
                             nvalues=total_timesteps,
                             schedule=lrschedule)
        lr_beta = Scheduler(v=lr_beta,
                            nvalues=total_timesteps,
                            schedule=lrschedule)

        all_params = tf.global_variables()

        def train(obs, policy_states, masks, actions, r_ex, ret_ex, v_ex,
                  v_mix, dis_v_mix_last, coef_mat):
            advs_ex = ret_ex - v_ex
            for step in range(len(obs)):
                cur_lr_alpha = lr_alpha.value()
                cur_lr_beta = lr_beta.value()
            td_map = {
                train_model.X: obs,
                policy_new.X: obs,
                A: actions,
                R_EX: r_ex,
                ADV_EX: advs_ex,
                RET_EX: ret_ex,
                V_MIX: v_mix,
                DIS_V_MIX_LAST: dis_v_mix_last,
                COEF_MAT: coef_mat,
                LR_ALPHA: cur_lr_alpha,
                LR_BETA: cur_lr_beta
            }
            if policy_states is not None:
                td_map[train_model.PS] = policy_states
                td_map[train_model.M] = masks
            return sess.run([entropy, policy_train, intrinsic_train],
                            td_map)[0]

        def save(save_path):
            ps = sess.run(all_params)
            make_path(osp.dirname(save_path))
            joblib.dump(ps, save_path)

        def load(load_path):
            loaded_params = joblib.load(load_path)
            restores = []
            for p, loaded_p in zip(all_params, loaded_params):
                restores.append(p.assign(loaded_p))
            ps = sess.run(restores)

        self.train = train
        self.train_model = train_model
        self.step_model = step_model
        self.step = step_model.step
        self.value = step_model.value
        self.intrinsic_reward = step_model.intrinsic_reward
        self.init_policy_state = step_model.init_policy_state
        self.save = save
        self.load = load
        tf.global_variables_initializer().run(session=sess)
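`ret_mix` above is computed as a matrix product `COEF_MAT @ r_mix` plus a discounted bootstrap term `DIS_V_MIX_LAST`, so the coefficient matrix must hold the discount factors gamma^(j-i). A hedged NumPy sketch of building such inputs for a single environment with no episode boundaries; `gamma`, `nsteps`, and `last_value` are illustrative only.

    # coef_mat @ rewards + dis_v_mix_last gives discounted returns
    # R_t = sum_k gamma^k r_{t+k} + gamma^(T-t) V(s_T).
    import numpy as np

    def make_coef_mat(nsteps, gamma=0.99):
        coef = np.zeros((nsteps, nsteps), dtype=np.float32)
        for i in range(nsteps):
            for j in range(i, nsteps):
                coef[i, j] = gamma ** (j - i)
        return coef

    nsteps, gamma, last_value = 5, 0.99, 1.0
    rewards = np.random.randn(nsteps).astype(np.float32)
    coef_mat = make_coef_mat(nsteps, gamma)
    dis_v_mix_last = gamma ** (nsteps - np.arange(nsteps)) * last_value
    returns = coef_mat @ rewards + dis_v_mix_last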
Example #10
    def __init__(self,
                 policy,
                 ob_space,
                 ac_space,
                 nenvs,
                 nsteps,
                 ent_coef=0.01,
                 vf_coef=0.5,
                 max_grad_norm=0.5,
                 lr=7e-4,
                 alpha=0.99,
                 epsilon=1e-5,
                 total_timesteps=int(80e6),
                 lrschedule='linear',
                 hparams=None):
        assert hparams is not None
        hparams['_vf_coef'] = vf_coef

        # Create the session.
        sess = tf_util.make_session(
            per_process_gpu_memory_fraction=hparams.get('gpu_fraction', 0.25))
        self.sess = sess

        # Copy hparams.
        self.hparams = hparams
        self.nenvs = nenvs
        self.nsteps = nsteps

        self.hparams['batch_size'] = nenvs * nsteps

        # Setup constants.
        nact = ac_space.n
        nbatch = nenvs * nsteps
        self.nbatch = nbatch
        nh, nw, nc = ob_space.shape
        ob_shape_train = (nbatch, nh, nw, nc)
        ob_shape_step = (nenvs, nh, nw, nc)

        # Setup placeholders.
        A = tf.placeholder(tf.int32, [nbatch])
        ADV = tf.placeholder(tf.float32, [nbatch])
        R = tf.placeholder(tf.float32, [nbatch])
        LR = tf.placeholder(tf.float32, [])
        TEACHER_C = tf.placeholder(tf.float32, [])
        DROPOUT_STRENGTH = tf.placeholder(tf.float32, [],
                                          name='DROPOUT_STRENGTH')
        self.DROPOUT_STRENGTH = DROPOUT_STRENGTH
        X_train = tf.placeholder(tf.float32, ob_shape_train,
                                 name='Ob_train')  #obs
        X_step = tf.placeholder(tf.float32, ob_shape_step,
                                name='Ob_step')  #obs
        attention_truth = None

        step_hparams = copy.deepcopy(hparams)
        train_hparams = copy.deepcopy(hparams)

        # if self.hparams.get('fixed_dropout_noise'):
        #     self.step_env_random = tf.get_variable(
        #         shape=[nenvs, 7, 7, 1],
        #         name='env_random',
        #         initializer=tf.truncated_normal_initializer(),
        #         trainable=False,
        #     )

        #     self.train_env_random = tf.tile(tf.expand_dims(self.step_env_random, axis=0), multiples=[nsteps, 1, 1, 1, 1])
        #     self.train_env_random = tf.reshape(
        #         tf.transpose(self.train_env_random, perm=[1, 0, 2, 3, 4]),
        #         [nbatch, 7, 7, 1])

        #     step_hparams['_env_random'] = self.step_env_random
        #     train_hparams['_env_random'] = self.train_env_random

        # train_hparams['_dropout_strength'] = DROPOUT_STRENGTH
        # step_hparams['_dropout_strength'] = DROPOUT_STRENGTH

        # Create the models.
        step_model = policy(sess,
                            X_step,
                            ob_space,
                            ac_space,
                            nenvs,
                            1,
                            reuse=False,
                            hparams=step_hparams)
        train_model = policy(sess,
                             X_train,
                             ob_space,
                             ac_space,
                             nenvs * nsteps,
                             nsteps,
                             reuse=True,
                             hparams=train_hparams)

        if hparams.get('teacher_ckpt'):
            assert hparams.get('use_fixed_attention') or hparams.get(
                'learn_attention_from_teacher') or hparams.get(
                    'do_joint_training')

            # Create the teacher, so that way we can use its attention weights
            # instead of learning how to do attention on our own.
            # step_teacher = self._create_sfmnet(X_step, reuse=False, is_step_model=True)

            train_teacher = self._create_object_segmentation_net(
                X_train,
                reuse=False,
                is_step_model=False,
                embedding=train_model.original_h
                if hparams['do_joint_training'] else None,
            )
            train_attention_truth, train_attention_mask = self._get_attention_truth(
                train_teacher, is_step_model=False)

            # step_attention_truth = self._get_attention_truth(step_teacher, is_step_model=True)

            # if hparams.get('use_fixed_attention'):
            #     step_hparams['_attention_truth'] = step_attention_truth
            #     train_hparams['_attention_truth'] = train_attention_truth

            # if hparams.get('do_joint_training'):
            #     step_hparams['_teacher_h3'] = step_teacher.conv3
            #     step_hparams['_teacher_h'] = step_teacher.embedding

            #     train_hparams['_teacher_h3'] = train_teacher.conv3
            #     train_hparams['_teacher_h'] = train_teacher.embedding

        # if hparams.get('use_target_model'):
        #     assert not hparams.get('do_joint_training')

        #     target_hparams = copy.copy(train_hparams)
        #     target_hparams['_policy_scope'] = 'target_model'
        #     target_hparams['_src_scope'] = 'model'
        #     target_model = policy(sess, X_step, ob_space, ac_space, nenvs, 1, reuse=False, hparams=target_hparams)
        #     target_model.setup_copy_weights()
        #     self.target_model = target_model

        scaled_images = tf.cast(train_model.X, tf.float32) / 255.
        print('scaled_images shape: {}'.format(scaled_images.get_shape()))

        sfm_base = object_segmentation.ObjectSegmentationBase(
            frames=scaled_images, embedding=train_model.h)
        sfm_hparams = copy.deepcopy(hparams)
        sfm_hparams['batch_size'] = nenvs * nsteps

        tf.summary.image('frame0',
                         tf.expand_dims(train_model.X[..., -2], axis=-1),
                         max_outputs=1)
        tf.summary.image('frame1',
                         tf.expand_dims(train_model.X[..., -1], axis=-1),
                         max_outputs=1)

        # Create the loss function.
        def a2c_loss(pi, vf):
            # Use the pi/vf arguments rather than train_model.pi/vf directly, so the loss can
            # also be evaluated on alternate heads (e.g. the dropout-noise heads commented out below).
            neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=pi, labels=A)
            pg_loss = tf.reduce_mean(ADV * neglogpac)
            vf_loss = tf.reduce_mean(mse(tf.squeeze(vf), R))
            entropy = tf.reduce_mean(cat_entropy(pi))

            # ent_coef_mode = hparams.get('ent_coef_mode', 'default')
            # ent_coef_val = hparams.get('ent_coef_val', ent_coef)

            # if ent_coef_mode == 'default':
            #     actual_ent_coef = ent_coef_val
            # elif ent_coef_mode == 'linear_teacher':
            #     actual_ent_coef = ent_coef_val * TEACHER_C + ent_coef * (1 - TEACHER_C)
            # elif ent_coef_mode == 'additive_teacher':
            #     actual_ent_coef = ent_coef_val + ent_coef_val * TEACHER_C
            # else:
            #     raise Exception('unrecognized ent_coef_mode: {}'.format(ent_coef_mode))

            loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef
            return loss, pg_loss, vf_loss, entropy

        loss, pg_loss, vf_loss, entropy = a2c_loss(train_model.pi,
                                                   train_model.vf)
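        # The resulting objective is the standard A2C loss over the flattened batch of
        # nenvs * nsteps transitions:
        #   E[ADV * -log pi(a|s)] - ent_coef * H(pi) + vf_coef * MSE(V(s), R)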

        # if hparams.get('dropout_data_aug_c'):
        #     logged_augs = False
        #     loss_c = 1.0 - hparams['num_dropout_models'] * hparams['dropout_data_aug_c']
        #     assert loss_c >= hparams['dropout_data_aug_c'] - 1e-5
        #     loss = loss_c * loss

        #     for pi_noise, vf_noise in zip(train_model.pi_noises, train_model.vf_noises):
        #         l2, pg2, vf2, entropy2 = a2c_loss(pi_noise, vf_noise)
        #         loss += l2 * hparams['dropout_data_aug_c']

        #         if not logged_augs:
        #             logged_augs = True
        #             tf.summary.scalar('aug_loss', tf.reduce_mean(l2))
        #             tf.summary.scalar('aug_pgloss', tf.reduce_mean(pg2))
        #             tf.summary.scalar('aug_vfloss', tf.reduce_mean(vf2))
        #             tf.summary.scalar('aug_entropyloss', tf.reduce_mean(entropy2))

        #     print("ADDING DROPOUT DATA AUG")

        # if hasattr(train_model, 'noise_loss') and hparams.get('noise_loss_c'):
        #     loss += train_model.noise_loss
        #     print("ADDING NOISE LOSS")

        # tf.summary.image('frame0', tf.expand_dims(train_model.X[..., -2],-1), max_outputs=1)
        # tf.summary.image('frame1', tf.expand_dims(train_model.X[..., -1],-1),  max_outputs=1)

        teacher_loss = 0.0

        if hparams.get('teacher_ckpt') and hparams.get(
                'learn_attention_from_teacher'):
            assert hparams.get('attention_20') or hparams.get(
                'inverted_attention_20')
            # Load in the teacher.
            # teacher = sfmnet.SfmNet(hparams=sfm_hparams, sfm_base=sfm_base, is_teacher_network=True)

            # attention_loss = tf.nn.softmax_cross_entropy_with_logits(
            #     labels=train_attention_truth,
            #     logits=tf.reshape(train_model.attention_logits, [nbatch,-1])
            # )
            # print('attention_loss: {}'.format(attention_loss.get_shape()))
            # print('train_attention_mask: {}'.format(train_attention_mask.get_shape()))
            # attention_loss = attention_loss * train_attention_mask
            # attention_loss = tf.reduce_mean(attention_loss)

            # # for t in [5., 10., 20., 40., 75., 100., 200., 500., 1000.]:
            # #     truth = tf.nn.softmax(coarse_masks / t)
            # #     tf.summary.image('attention_truth_{}'.format(t), tf.reshape(truth, [nbatch, 7, 7, 1]), max_outputs=1)
            # tf.summary.scalar('attention_loss', attention_loss)
            # tf.summary.scalar('attention_teaching', tf.reduce_mean(train_attention_mask))

            # teacher_loss = TEACHER_C * attention_loss

            tf.summary.scalar('teacher_c', TEACHER_C)
            truth, mask = self._get_attention_truth_20(train_teacher,
                                                       is_step_model=False)
            tf.summary.image('attention_20_truth',
                             tf.reshape(truth, [80, 20, 20, 1]),
                             max_outputs=1)
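            # Note: the reshape hard-codes a batch of 80, presumably nenvs * nsteps for this
            # particular experiment; using -1 for the batch dimension would be more general.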

            if hparams.get('attention_20'):
                attention_loss_20 = tf.nn.softmax_cross_entropy_with_logits(
                    labels=truth,
                    logits=tf.reshape(train_model.attention_logits_20,
                                      [-1, 400]))
                attention_loss_20 = tf.reduce_mean(attention_loss_20 * mask)

                tf.summary.scalar('attention_loss_20', attention_loss_20)
                tf.summary.scalar('attention_teaching_20',
                                  tf.reduce_mean(mask))
                teacher_loss += TEACHER_C * attention_loss_20

            if hparams.get('extrapath_attention_20'):
                print("EXTRAPATH ATTENTION!!!")
                attention_loss_20 = tf.nn.softmax_cross_entropy_with_logits(
                    labels=truth,
                    logits=tf.reshape(
                        train_model.extrapath_attention_logits_20, [-1, 400]))
                attention_loss_20 = tf.reduce_mean(attention_loss_20 * mask)

                tf.summary.scalar('attention_loss_20', attention_loss_20)
                tf.summary.scalar('attention_teaching_20',
                                  tf.reduce_mean(mask))
                teacher_loss += (-TEACHER_C) * attention_loss_20

        # if hparams.get('learn_attention_from_pg'):
        #     attention_logits = tf.reshape(train_model.attention_logits, [nbatch, 49])
        #     attention_actions = sample(attention_logits)
        #     attention_actions = tf.stop_gradient(attention_actions)

        #     attention_neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=attention_logits, labels=attention_actions)
        #     attention_pg_loss = tf.reduce_mean(ADV * attention_neglogpac)

        #     tf.summary.scalar('attention_pg_loss', attention_pg_loss)

        #     loss += attention_pg_loss * hparams['learn_attention_from_pg']

        # if hparams.get('teacher_ckpt') and hparams.get('learn_translation_from_teacher'):
        #     with tf.variable_scope("model"):
        #         with tf.variable_scope('object_translation'):
        #             pred_translation = fc(train_model.h, 'obj_t', nh=2*self.hparams['k_obj'], init_scale=1.0)
        #             pred_translation = tf.reshape(pred_translation, (-1, self.hparams['k_obj'], 2))

        #     teacher_translation = tf.stop_gradient(train_teacher.object_translation)
        #     translation_loss = mse(pred_translation, teacher_translation)
        #     translation_loss = tf.reduce_mean(translation_loss)
        #     teacher_loss += TEACHER_C * translation_loss
        #     tf.summary.scalar('translation_loss', translation_loss)

        if hparams['do_joint_training']:
            teacher_loss += tf.reduce_mean(
                train_teacher.transform_loss +
                train_teacher.mask_reg_loss) * TEACHER_C

        if hasattr(train_model, 'attention_logits_20'):
            # Want a low entropy distribution, so that we are focused on only a small part of the image per frame.
            reshaped_logits = tf.reshape(train_model.attention_logits_20,
                                         [-1, 400])
            attention_entropy = tf.reduce_mean(cat_entropy(reshaped_logits))
            teacher_loss -= hparams[
                'attention_entropy_c'] * attention_entropy * TEACHER_C

            tf.summary.scalar('attention_entropy', attention_entropy)

        if hasattr(train_model, 'extrapath_attention_logits_20'):
            # Want a low entropy distribution, so that we are focused on only a small part of the image per frame.
            reshaped_logits = tf.reshape(
                train_model.extrapath_attention_logits_20, [-1, 400])
            attention_entropy = tf.reduce_mean(cat_entropy(reshaped_logits))
            teacher_loss -= hparams[
                'attention_entropy_c'] * attention_entropy * TEACHER_C

            tf.summary.scalar('extrapath_attention_entropy', attention_entropy)

        # if hasattr(train_model, 'attention_weights_20'):
        #     # Want this to be high entropy, so we are looking at different parts of the image on different images.
        #     batch_logits = tf.reshape(tf.reduce_sum(train_model.attention_weights_20, axis=0), [1, 400])
        #     attention_entropy = tf.reduce_mean(cat_entropy_softmax(batch_logits))
        #     loss -= hparams['batch_entropy_c'] * attention_entropy
        #     tf.summary.scalar('batch_entropy', attention_entropy)

        # if hparams['do_joint_training'] and False:
        #     assert hparams.get('teacher_ckpt')
        #     teacher_loss += TEACHER_C * train_teacher.total_loss
        # else:
        #     sfm_loss = None

        # if hparams['do_flow_prediction']:
        #     assert hparams.get('teacher_ckpt')
        #     flow_truth_x, flow_truth_y = self._get_flow_truth(train_teacher)
        #     predicted_flow = conv(train_model.flow_base, 'pred_flow', nf=4, rf=1, stride=1, trainable=True)

        #     flow_pred_x = tf.reshape(predicted_flow[..., :2], [-1, 2])
        #     flow_pred_y = tf.reshape(predicted_flow[..., 2:], [-1, 2])

        #     flow_x_loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(labels=flow_truth_x, logits=flow_pred_x))
        #     flow_y_loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(labels=flow_truth_y, logits=flow_pred_y))
        #     flow_loss = flow_x_loss + flow_y_loss

        #     # flow_error = tf.reduce_mean(mse(flow_truth, predicted_flow))
        #     teacher_loss += TEACHER_C * flow_loss * hparams['flow_error_c']

        #     flow_x_acc = tf.reduce_mean(tf.cast(tf.argmax(flow_pred_x, axis=-1) == flow_truth_x, tf.int32))
        #     flow_y_acc = tf.reduce_mean(tf.cast(tf.argmax(flow_pred_y, axis=-1) == flow_truth_y, tf.int32))

        #     # tf.summary.scalar('flow_error_if_predict_zeros', tf.reduce_mean(0.5 * tf.square(flow_truth)))
        #     tf.summary.scalar('flow_x_loss', flow_x_loss)
        #     tf.summary.scalar('flow_y_loss', flow_y_loss)
        #     tf.summary.scalar('flow_x_acc', flow_x_acc)
        #     tf.summary.scalar('flow_y_acc', flow_y_acc)
        #     # tf.summary.image('predicted_flow_x', tf.expand_dims(predicted_flow[..., 0], axis=-1), max_outputs=1)
        #     # tf.summary.image('predicted_flow_y', tf.expand_dims(predicted_flow[..., 1], axis=-1), max_outputs=1)

        self.train_writer = tf.summary.FileWriter(
            os.path.join(hparams['base_dir'], 'logs',
                         hparams['experiment_name']), sess.graph)
        # TODO(vikgoel): when the teacher is not needed, avoid merging its summaries so that
        #                that part of the graph never has to be executed.
        merged_summaries = tf.summary.merge_all()

        trainer = tf.train.RMSPropOptimizer(learning_rate=LR,
                                            decay=alpha,
                                            epsilon=epsilon)

        def get_train_op(loss_op):
            params = find_trainable_variables("model")

            # Switch from GATE_NONE to GATE_GRAPH to enhance reproducibility.
            #grads = tf.gradients(loss, params)
            grads_and_params = trainer.compute_gradients(
                loss=loss_op,
                var_list=params,
                gate_gradients=tf.train.RMSPropOptimizer.GATE_GRAPH)
            grads = [x[0] for x in grads_and_params]
            params = [x[1] for x in grads_and_params]

            if max_grad_norm is not None:
                grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
            grads = list(zip(grads, params))

            return trainer.apply_gradients(grads)

        _fast_train = get_train_op(loss)
        _teacher_train = get_train_op(loss + teacher_loss)
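        # Both train ops share the same RMSProp optimizer; _teacher_train adds the teacher /
        # distillation terms, while _fast_train is selected in train() once teacher_c has
        # decayed to zero.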

        params = find_trainable_variables("model")
        print('*' * 20)
        print('chosen trainable variables')
        for p in params:
            print(p.name)
        print('*' * 20)

        lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)
        self.lr = lr

        write_counter = 0

        def train(obs, states, rewards, masks, actions, values):
            nonlocal write_counter

            if lr.n % hparams['target_model_update_frequency'] == 0 and hasattr(
                    self, 'target_model'):
                print('COPYING WEIGHTS INTO TARGET MODEL')
                self.target_model.copy_weights()

            advs = rewards - values
            for step in range(len(obs)):
                cur_lr = lr.value()
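            # lr.value() returns the current rate and advances the schedule by one step, so this
            # walks the scheduler forward by the batch size and keeps the last value as cur_lr.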

            # Smooth approximation:
            #teacher_decay_c = hparams['teacher_decay_c']#9.9e-6 # 2.5e-5
            #teacher_c = 1.0 / (teacher_decay_c * lr.n + 1)
            #teacher_c = min(hparams['max_teacher_c'], teacher_c)

            if not hparams['use_extra_path']:
                lerp = float(lr.n) / 1e7
                lerp = min(lerp, 1)
                teacher_c = hparams['max_teacher_c'] * (1. - lerp)
            else:
                teacher_c = 1
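            # For example, with max_teacher_c = 1.0 and use_extra_path disabled, teacher_c starts
            # at 1.0, is 0.5 after 5e6 scheduler steps, and reaches 0 at 1e7 steps, after which the
            # cheaper _fast_train op is selected below.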

            # Linear decay schedule
            # teacher_c = (hparams['teacher_cutoff_step'] - lr.n) / hparams['teacher_cutoff_step']
            # teacher_c = max(teacher_c, 0)

            # # Lower bound on the decay
            # teacher_c = (1 - hparams['teacher_loss_c']) * teacher_c + hparams['teacher_loss_c']

            _train = _fast_train if teacher_c == 0 else _teacher_train

            td_map = {
                train_model.X: obs,
                A: actions,
                ADV: advs,
                R: rewards,
                LR: cur_lr,
                TEACHER_C: teacher_c
            }
            # td_map[DROPOUT_STRENGTH] = get_dropout_strength(hparams, lr.n)

            if self.hparams['teacher_ckpt'] and self.hparams[
                    'do_joint_training']:
                td_map[train_teacher.mask_reg_c] = 1

            #if states is not None:
            #    td_map[train_model.S] = states
            #    td_map[train_model.M] = masks

            ops = [pg_loss, vf_loss, entropy, _train]

            # if hparams.get('no_train_a2c'):
            #     ops = ops[:-1]

            if 'attention' in hparams['policy']:
                ops.append(train_model.attention_weights_20)

            write_summaries = hparams.get(
                'teacher_ckpt') or 'attention' in hparams['policy']

            if write_summaries:
                if write_counter % 10 != 0:
                    write_summaries = False
                write_counter += 1

            if write_summaries:
                ops.append(merged_summaries)

            sess_results = sess.run(ops, td_map)

            policy_loss = sess_results[0]
            value_loss = sess_results[1]
            policy_entropy = sess_results[2]

            if write_summaries:
                summary = sess_results[-1]
                self.train_writer.add_summary(summary, lr.n)

            if 'attention' in hparams['policy']:
                attention_output = sess_results[-2 if write_summaries else -1]
                publish_attention_weights(attention_output[:5, ...])

            return policy_loss, value_loss, policy_entropy

        def save(save_path):
            ps = sess.run(params)
            make_path(save_path)
            joblib.dump(ps, save_path)

        def load(load_path):
            loaded_params = joblib.load(load_path)
            restores = []
            for p, loaded_p in zip(params, loaded_params):
                restores.append(p.assign(loaded_p))
            ps = sess.run(restores)

        self.train = train
        self.train_model = train_model
        self.step_model = step_model
        self.step = step_model.step
        self.value = step_model.value
        self.initial_state = step_model.initial_state
        self.save = save
        self.load = load

        # Initialize all of the variables in a deterministic order so that each experiment is reproducible.
        global_vars = tf.global_variables()
        global_vars = sorted(global_vars, key=lambda x: x.name)
        for var in global_vars:
            tf.variables_initializer([var]).run(session=sess)
        #tf.global_variables_initializer().run(session=sess)

        if hparams.get('teacher_ckpt'):
            # Load in the teacher AFTER doing the init so we don't overwrite the weights.
            restore_teacher_from_checkpoint(sess, hparams['teacher_ckpt'])
Exemplo n.º 11
0
    def __init__(self, policy, ob_space, ac_space, nenvs, master_ts = 1, worker_ts = 8,
                 ent_coef=0.01, vf_coef=0.5, max_grad_norm=2.5, lr=7e-4, cell=256,
                 ib_alpha=0.04, sv_M=32, algo='use_svib_uniform',
                 alpha=0.99, epsilon=1e-5, total_timesteps=int(80e6), lrschedule='linear'):

        sess = tf_util.make_session()
        nact = ac_space.n
        nbatch = nenvs*master_ts*worker_ts  # what does 'master' mean here?

        # A:action, ADV:advantage, R:reward, LR:Learning Rate
        A = tf.placeholder(tf.int32, [nbatch])
        ADV = tf.placeholder(tf.float32, [nbatch])
        R = tf.placeholder(tf.float32, [nbatch])
        LR = tf.placeholder(tf.float32, [])

        step_model = policy(sess, ob_space, ac_space, nenvs, 1, 1, cell=cell, M=sv_M, model='step_model', algo=algo)
        train_model = policy(sess, ob_space, ac_space, nbatch, master_ts, worker_ts, cell = cell, M=sv_M, model='train_model', algo=algo)
        print('model_setting_done, algorithm:', str(algo))

        '''
        Visualize the mutual information; skipped for now.
        '''
        ib_loss = train_model.mi_xh_loss
        T = train_model.T_value
        t_grads, t_global_norm = grad_clip(-vf_coef*ib_loss, max_grad_norm, ['model/T/update_params'])
        t_trainer = tf.train.RMSPropOptimizer(learning_rate=LR, decay=alpha, epsilon=epsilon)
        _t_train = t_trainer.apply_gradients(t_grads)
        T_update_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='model/T/update_params')
        T_orig_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='model/T/orig_params')
        reset_update_params = [update_param.assign(orig_param) for update_param, orig_param in zip(T_update_params, T_orig_params)]
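        # T appears to be the statistics network of a MINE-style mutual-information estimator:
        # ib_loss (train_model.mi_xh_loss) is the MI estimate, and applying gradients of
        # -vf_coef * ib_loss trains the 'update_params' copy of T to tighten the bound, while
        # reset_update_params restores that copy from 'orig_params' before each estimation run
        # (see train_mine below).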

        # rpf_matrix, rpf_grads = rpf_kernel(vf_loss_sv, rpf_h)

        if algo == 'use_svib_uniform' or algo == 'use_svib_gaussian':
            def expand_placeholder(X, M=sv_M):
                return tf.tile(tf.expand_dims(X, axis=-1), [1, M])
            A_expand, R_expand = expand_placeholder(A), expand_placeholder(R)
            neglogpac_expand = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.wpi_expand, labels=A_expand)#shape=[nbatch, sv_M]
            # pg_loss_expand = tf.reduce_mean(ADV_expand * neglogpac_expand, axis=-1)
            pg_loss_expand = tf.reduce_mean(tf.stop_gradient(R_expand-train_model.wvf_expand[:,:,0]) * neglogpac_expand, axis=-1)
            vf_loss_expand = tf.reduce_mean(mse(tf.squeeze(train_model.wvf_expand), R_expand), axis=-1)
            entropy_expand = tf.reduce_mean(cat_entropy(train_model.wpi_expand), axis=-1)#shape=[nbatch]
            J_theta = -(pg_loss_expand + vf_coef*vf_loss_expand - ent_coef*entropy_expand)

            loss_expand = -J_theta / float(nbatch)
            pg_loss_expand_ = tf.reduce_mean(pg_loss_expand)
            vf_loss_expand_ = tf.reduce_mean(vf_loss_expand)
            entropy_expand_ = tf.reduce_mean(entropy_expand)
            loss_expand_ = -tf.reduce_mean(J_theta)

            print('ib_alpha: ', ib_alpha)
            log_p_grads = tf.gradients(J_theta/np.sqrt(ib_alpha), [train_model.wh_expand])[0]#shape=[nbatch, sv_M, cell]
            if algo == 'use_svib_gaussian':
                mean, var = tf.nn.moments(train_model.wh_expand, axes=1, keep_dims=True)#shape=[nbatch, 1,cell]
                gaussian_grad = -(train_model.wh_expand - mean)/(float(sv_M) * (var+1e-3))
                log_p_grads += 5e-3*(tf_l2norm(log_p_grads, axis=-1, keep_dims=True)/tf_l2norm(gaussian_grad, axis=-1, keep_dims=True))*gaussian_grad
            sv_grads = tf.constant(0., tf.float32, shape=[nbatch, 0, cell])
            for i in range(sv_M):
                sv_grad = tf.reduce_sum(train_model.rpf_matrix[:, :, i:i+1] * log_p_grads, axis=1) + np.sqrt(ib_alpha)*train_model.rpf_grads[:, i, :]#shape=[nbatch, cell]
                sv_grads = tf.concat([sv_grads, tf.expand_dims(sv_grad, axis=1)], axis=1)
                
            SV_GRADS = tf.placeholder(tf.float32, [nbatch, sv_M, cell])
            repr_loss = tf.reduce_mean(SV_GRADS * train_model.wh_expand, axis=1)#shape=[nbatch,cell]
            repr_loss = -tf.reduce_mean(tf.reduce_sum(repr_loss, axis=-1))#max optimization problem to minimization problem
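            # sv_grads appears to implement the Stein variational gradient for each of the sv_M particles:
            #   phi(h_i) = sum_j k(h_j, h_i) * grad_{h_j} log p(h_j) + sqrt(ib_alpha) * grad_{h_j} k(h_j, h_i),
            # assuming rpf_matrix / rpf_grads are the kernel matrix and its gradients. Because SV_GRADS is
            # fed in as a constant, d(repr_loss)/d(wh_expand) is proportional to -SV_GRADS, so minimizing
            # repr_loss moves the particle embeddings along the SVGD direction.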
            # repr_loss = -tf.reduce_mean(repr_loss, axis=0)

            # sv_grad_ = tf.reduce_sum(train_model.rpf_matrix[:, :, 2:3] * log_p_grads, axis=1) + train_model.rpf_grads[:, 2, :]
            # exploit_term = tf.reduce_sum(train_model.rpf_matrix[:, :, 2:3] * log_p_grads, axis=1)
            # explore_term = train_model.rpf_grads[:, 2, :]
            grads_expand, global_norm_expand = grad_clip(loss_expand, max_grad_norm, ['model/worker_module'])
            trainer_expand = tf.train.RMSPropOptimizer(learning_rate=LR, decay=alpha, epsilon=epsilon)
            _train_expand = trainer_expand.apply_gradients(grads_expand)

            repr_grads, repr_global_norm = grad_clip(repr_loss, max_grad_norm, ['model/ordinary_encoder'])
            repr_trainer = tf.train.RMSPropOptimizer(learning_rate=LR, decay=alpha, epsilon=epsilon)
            _repr_train = repr_trainer.apply_gradients(repr_grads)

        elif algo == 'sv_a2c':
            def expand_placeholder(X, M=sv_M):
                return tf.tile(tf.expand_dims(X, axis=-1), [1, M])
            A_expand, R_expand = expand_placeholder(A), expand_placeholder(R) # [40, 32]
            sigma = tf.constant(1e-5)
            neglogpac_expand = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.wpi_expand, labels=A_expand) + sigma # [40, 32]
            pg_loss_expand = tf.reduce_mean(tf.stop_gradient(R_expand - train_model.wvf_expand[:, :, 0]) * neglogpac_expand, axis=-1) # [40, ]
            vf_loss_sv = tf.expand_dims(mse(tf.squeeze(train_model.wvf_expand), R_expand), axis=-1) # [40, 32, 1]
            vf_loss_expand = tf.reduce_mean(mse(tf.squeeze(train_model.wvf_expand), R_expand), axis=-1) # [40, ]
            entropy_expand = tf.reduce_mean(cat_entropy(train_model.wpi_expand), axis=-1)  # shape=[nbatch]

            J_theta = pg_loss_expand + vf_coef * vf_loss_expand - ent_coef * entropy_expand # [40, ]
            # Why divide by nbatch?
            loss_expand = J_theta / float(nbatch) # [40, ]

            pg_loss_expand_ = tf.reduce_mean(pg_loss_expand)
            vf_loss_expand_ = tf.reduce_mean(vf_loss_expand) # [1]
            entropy_expand_ = tf.reduce_mean(entropy_expand)
            loss_expand_ = tf.reduce_mean(J_theta)

            print('ib_alpha: ', ib_alpha)
            # mean, var = tf.constant(0., tf.float32, [nbatch, 1, 1]), tf.constant(1, tf.float32, [nbatch, 1, 1])
            mean, var = tf.nn.moments(vf_loss_sv, axes=1, keep_dims=True) # [40, 1, 1]
            # Problem 1: Gaussian gradient calculation issue
            log_p_grads = -(vf_loss_sv - mean) / (float(sv_M) * (var))
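            # Score of a Gaussian: for x ~ N(mean, var), grad_x log p(x) = -(x - mean) / var; the
            # extra 1/sv_M factor averages the contribution over the sv_M particles. Unlike the
            # 'use_svib_gaussian' branch above, no epsilon is added to var here, so this can become
            # numerically unstable when the particle losses collapse to the same value.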

            sv_grads = tf.constant(0., tf.float32, shape=[nbatch, 0, 1]) # [nbatch, m, 1]

            rpf_h = self.h_coef(vf_loss_sv, sv_M)
            rpf_matrix, rpf_grads = self.rpf_kernel(vf_loss_sv, rpf_h, sv_M)
            for i in range(sv_M):
                # sv_grad = tf.reduce_sum(train_model.rpf_matrix[:, :, i:i+1] * log_p_grads, axis=1) + sqrt(ib_alpha) * train_model.rpf_grads[:, i, :] #shape=[nbatch, cell]
                sv_grad = tf.reduce_sum(rpf_matrix[:, :, i:i + 1] * log_p_grads, axis=1) + rpf_grads[:, i, :]
                sv_grads = tf.concat([sv_grads, tf.expand_dims(sv_grad, axis=1)], axis=1)

            SV_GRADS = tf.placeholder(tf.float32, [nbatch, sv_M, 1])
            sv_loss = tf.reduce_mean(SV_GRADS * vf_loss_sv, axis=1)

            loss_expand -=  ib_alpha * (tf_l2norm(loss_expand, axis=-1, keep_dims=True)/tf_l2norm(sv_loss, axis=-1, keep_dims=True)) * sv_loss

            grads_expand, global_norm_expand = grad_clip(loss_expand, max_grad_norm, ['model/worker_module', 'model/ordinary_encoder'])
            trainer_expand = tf.train.RMSPropOptimizer(learning_rate=LR, decay=alpha, epsilon=epsilon)
            _train_expand = trainer_expand.apply_gradients(grads_expand)

            # sv_loss_grads, sv_global_norm = grad_clip(sv_loss, max_grad_norm, ['model/worker_module/comm', 'model/worker_module/w_value'])
            # sv_trainer = tf.train.RMSPropOptimizer(learning_rate=LR, decay=alpha, epsilon=epsilon)
            # _sv_train = sv_trainer.apply_gradients(sv_loss_grads)


        elif algo == 'anchor':
            neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.wpi, labels=A)
            pg_loss = tf.reduce_mean(ADV * neglogpac)
            entropy = tf.reduce_mean(cat_entropy(train_model.wpi))
            vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.wvf), R))

            # anchor method
            param_list = []
            for scope in ['model/worker_module']:
                List = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope)
                print(len(List))
                param_list += List

            param_value_layer1_w = param_list[0]
            param_value_layer1_b = param_list[1]
            param_policy_layer2_w = param_list[2]
            param_value_layer2_w = param_list[4]
            param_value_layer2_b = param_list[5]

            init_stddev = 5.0 # 7.0
            init_stddev_2 = 0.18 / np.sqrt(cell)  # normal scaling
            lambda_anchor = [0.000001,0.1]

            layer1_w_init = tf.random_normal(mean=0., stddev=init_stddev, shape=param_value_layer1_w.get_shape())
            layer1_b_init = tf.random_normal(mean=0., stddev=init_stddev, shape=param_value_layer1_b.get_shape())
            layer2_w_init = tf.random_normal(mean=0, stddev=init_stddev_2, shape=param_value_layer2_w.get_shape())
            layer2_b_init = tf.random_normal(mean=0, stddev=init_stddev_2, shape=param_value_layer2_b.get_shape())

            loss_anchor = lambda_anchor[0] / nbatch * tf.reduce_sum(tf.square(layer1_w_init - param_value_layer1_w))
            loss_anchor += lambda_anchor[0] / nbatch * tf.reduce_sum(tf.square(layer1_b_init - param_value_layer1_b))
            loss_anchor += lambda_anchor[1] / nbatch * tf.reduce_sum(tf.square(layer2_w_init - param_value_layer2_w))
            loss_anchor += lambda_anchor[1] / nbatch * tf.reduce_sum(tf.square(layer2_b_init - param_value_layer2_b))
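            # This looks like an anchored-ensemble style regularizer: the value-head weights are pulled
            # toward randomly drawn "anchor" values at the initialization scale. Note that
            # tf.random_normal re-samples on every sess.run, so the anchor changes each update; a fixed
            # anchor would normally be drawn once and stored in a non-trainable variable.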

            loss = pg_loss + vf_coef * vf_loss - ent_coef * entropy + loss_anchor

            grads, global_norm = grad_clip(loss, max_grad_norm, ['model/worker_module', 'model/ordinary_encoder'])
            trainer = tf.train.RMSPropOptimizer(learning_rate=LR, decay=alpha, epsilon=epsilon)
            _train = trainer.apply_gradients(grads)

        else: # regular algorithm
            neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.wpi, labels=A)
            pg_loss = tf.reduce_mean(ADV * neglogpac)
            entropy = tf.reduce_mean(cat_entropy(train_model.wpi))
            vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.wvf), R))

            loss = pg_loss + vf_coef * vf_loss - ent_coef * entropy

            grads, global_norm = grad_clip(loss, max_grad_norm, ['model/worker_module', 'model/ordinary_encoder'])
            trainer = tf.train.RMSPropOptimizer(learning_rate=LR, decay=alpha, epsilon=epsilon)
            _train = trainer.apply_gradients(grads)

        params = find_trainable_variables("model")
        lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)


        def train(wobs, whs, states, rewards, masks, actions, values, noises):
            advs = rewards - values
            # adv_mu, adv_var = np.mean(advs), np.var(advs)+1e-3
            # advs = (advs - adv_mu) / adv_var

            for step in range(len(whs)):
                cur_lr = lr.value()
            sv_td_map = {train_model.wX : wobs, train_model.istraining:True, A:actions, R:rewards, LR:cur_lr}

            # Sess Graph
            # writer = tf.summary.FileWriter('./', sess.graph)

            repr_td_map = {train_model.wX: wobs, train_model.istraining: True, A: actions, R: rewards, LR: cur_lr}
            rl_td_map = {train_model.wX : wobs, train_model.istraining: True, A:actions, ADV:advs, R:rewards, LR:cur_lr}
            if states is not None:
                rl_td_map[train_model.wS] = states
                rl_td_map[train_model.wM] = masks
            repr_grad_norm = 0.
            # print(str(np.sum(whs-sess.run(train_model.wh, feed_dict={train_model.wX : wobs, train_model.istraining:True, train_model.noise:noises}))))
            if algo == 'use_svib_uniform' or algo == 'use_svib_gaussian':
                repr_td_map[train_model.noise_expand], repr_td_map[train_model.NOISE_KEEP] = sess.run(train_model.noise_expand), noises
                wh_expands, sv_gradients = sess.run([train_model.wh_expand, sv_grads], feed_dict=repr_td_map)
                rl_td_map[train_model.wh_expand] = wh_expands
                tloss, value_loss, policy_loss, policy_entropy, rl_grad_norm, _ = sess.run(
                    [loss_expand_, vf_loss_expand_, pg_loss_expand_, entropy_expand_, global_norm_expand, _train_expand],
                    feed_dict=rl_td_map
                )
                repr_td_map[SV_GRADS] = sv_gradients
                # if algo == 'use_svib_gaussian':
                #     gaussian_gradients, repr_grad_norm, __ =\
                #         sess.run([gaussian_grad, repr_global_norm, _repr_train], feed_dict=repr_td_map)
                #     return tloss, value_loss, policy_loss, policy_entropy, rl_grad_norm, gaussian_gradients, repr_grad_norm  # represnet_loss, SV_GRAD, EXPLOIT, LOG_P_GRADS, EXPLORE
                repr_grad_norm, represent_loss, __ = sess.run([repr_global_norm, repr_loss, _repr_train], feed_dict=repr_td_map)

            elif algo == 'anchor':
                rl_td_map[train_model.wX] = wobs
                tloss, value_loss, policy_loss, policy_entropy, rl_grad_norm, anchor_loss, _ = sess.run(
                    [loss, vf_loss, pg_loss, entropy, global_norm, loss_anchor ,_train],
                    feed_dict=rl_td_map
                )

                represent_loss = 0.
                sv_loss_ = 0
            elif algo == 'sv_a2c':
                sv_td_map[train_model.noise_expand], sv_td_map[train_model.NOISE_KEEP] = sess.run(
                    train_model.noise_expand), noises
                wvf_expands, sv_gradients = sess.run([train_model.wvf_expand, sv_grads], feed_dict=sv_td_map)
                rl_td_map[train_model.wvf_expand] = wvf_expands
                rl_td_map[train_model.noise_expand], rl_td_map[train_model.NOISE_KEEP] = sess.run(
                    train_model.noise_expand), noises
                rl_td_map[SV_GRADS] = sv_gradients
                tloss, value_loss, policy_loss, policy_entropy, rl_grad_norm, sv_loss_, _ = sess.run(
                    [loss_expand_, vf_loss_expand_, pg_loss_expand_, entropy_expand_, global_norm_expand,
                     sv_loss, _train_expand],
                    feed_dict=rl_td_map
                )
                sv_td_map[SV_GRADS] = sv_gradients
                anchor_loss = 0.
                represent_loss = 0.

            else:
                rl_td_map[train_model.wX], rl_td_map[train_model.noise] = wobs, noises  # noise won't be used when algo is 'regular'
                tloss, value_loss, policy_loss, policy_entropy, rl_grad_norm, _ = sess.run(
                    [loss, vf_loss, pg_loss, entropy, global_norm, _train],
                    feed_dict=rl_td_map
                )
                # repr_td_map[WH_GRADS] = wh_gradients
                # repr_grad_norm, __ = sess.run([ordin_repr_global_norm, _ordin_repr_train], feed_dict=repr_td_map)
                repr_grad_norm = 0.
                represent_loss = 0.
                anchor_loss = 0
                sv_loss_ = 0
            return tloss, value_loss, policy_loss, policy_entropy, rl_grad_norm, repr_grad_norm, represent_loss, anchor_loss, sv_loss_#SV_GRAD, EXPLOIT, LOG_P_GRADS, EXPLORE

        def train_mine(wobs, whs, steps=256, lr=7e-4):
            # whs_std = (whs-np.mean(whs,axis=0,keepdims=True))/(1e-8 + np.std(whs,axis=0,keepdims=True))
            idx = np.arange(len(whs))
            ___ = sess.run(reset_update_params)
            for i in range(int(steps)):
                np.random.shuffle(idx)
                mi, T_value, __ = sess.run([ib_loss, T, _t_train],
                                           feed_dict={train_model.wX: wobs[idx], train_model.wh: whs[idx],
                                                      LR: lr, train_model.istraining: True})
            logger.record_tabular('mutual_info_loss', float(mi))
            logger.record_tabular('T_value', float(T_value))
            logger.dump_tabular()

        def save(save_path):
            ps = sess.run(params)
            make_path(osp.dirname(save_path))
            joblib.dump(ps, save_path)

        def load(load_path):
            loaded_params = joblib.load(load_path)
            restores = []
            for p, loaded_p in zip(params, loaded_params):
                restores.append(p.assign(loaded_p))
            ps = sess.run(restores)

        self.train = train
        self.train_mine = train_mine
        self.train_model = train_model
        self.step_model = step_model
        self.get_wh = step_model.get_wh
        self.get_noise = step_model.get_noise
        self.value = step_model.wvalue
        self.step = step_model.step
        self.initial_state = step_model.w_initial_state
        self.save = save
        self.load = load
        self.sv_M = sv_M
        # self.rpf_h = rpf_h
        # self.rpf_matrix = rpf_matrix
        # self.rpf_grads = rpf_grads
        tf.global_variables_initializer().run(session=sess)
Exemplo n.º 12
0
    def __init__(self,
                 policy,
                 ob_space,
                 ac_space,
                 nenvs,
                 nsteps,
                 ent_coef=0.01,
                 vf_coef=0.5,
                 max_grad_norm=0.5,
                 lr=7e-4,
                 alpha=0.99,
                 epsilon=1e-5,
                 total_timesteps=int(80e6),
                 lrschedule='linear'):

        config = tf.ConfigProto(allow_soft_placement=True,
                                intra_op_parallelism_threads=nenvs,
                                inter_op_parallelism_threads=nenvs)
        config.gpu_options.allow_growth = True
        sess = tf.Session(config=config)
        nbatch = nenvs * nsteps

        A = tf.placeholder(tf.int32, [nbatch])
        ADV = tf.placeholder(tf.float32, [nbatch])
        R = tf.placeholder(tf.float32, [nbatch])
        LR = tf.placeholder(tf.float32, [])

        step_model = policy(sess, ob_space, ac_space, nenvs, 1, reuse=False)
        train_model = policy(sess,
                             ob_space,
                             ac_space,
                             nenvs * nsteps,
                             nsteps,
                             reuse=True)

        neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=train_model.pi, labels=A)
        pg_loss = tf.reduce_mean(ADV * neglogpac)
        vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), R))
        entropy = tf.reduce_mean(cat_entropy(train_model.pi))
        loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef

        params = find_trainable_variables("model")
        grads = tf.gradients(loss, params)
        if max_grad_norm is not None:
            grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
        grads = list(zip(grads, params))
        trainer = tf.train.RMSPropOptimizer(learning_rate=LR,
                                            decay=alpha,
                                            epsilon=epsilon)
        _train = trainer.apply_gradients(grads)

        lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)

        self.saver = tf.train.Saver(max_to_keep=1000)

        def train(obs, states, rewards, masks, actions, values):
            advs = rewards - values
            for step in range(len(obs)):
                cur_lr = lr.value()
            td_map = {
                train_model.X: obs,
                A: actions,
                ADV: advs,
                R: rewards,
                LR: cur_lr
            }
            if states is not None:
                td_map[train_model.S] = states
                td_map[train_model.M] = masks
            policy_loss, value_loss, policy_entropy, _ = sess.run(
                [pg_loss, vf_loss, entropy, _train], td_map)
            return policy_loss, value_loss, policy_entropy

        def save(path, steps):
            make_path(path)
            self.saver.save(sess, path + 'model', global_step=steps)

        def load(path, steps):
            self.saver = tf.train.import_meta_graph(path + 'model' + '-' +
                                                    str(steps) + '.meta')
            self.saver.restore(sess, tf.train.latest_checkpoint(path))

        self.train = train
        self.train_model = train_model
        self.step_model = step_model
        self.step = step_model.step
        self.value = step_model.value
        self.initial_state = step_model.initial_state
        self.save = save
        self.load = load
        tf.global_variables_initializer().run(session=sess)
Exemplo n.º 13
0
    def __init__(self, policy, ob_space, ac_space, nenvs, nsteps, nstack, num_procs,
            ent_coef=0.01, vf_coef=0.5, max_grad_norm=0.5, lr=7e-4,
            alpha=0.99, epsilon=1e-5, total_timesteps=int(80e6), lrschedule='linear'):
        config = tf.ConfigProto(allow_soft_placement=True,
                                intra_op_parallelism_threads=num_procs,
                                inter_op_parallelism_threads=num_procs)
        config.gpu_options.allow_growth = True
        sess = tf.Session(config=config)
        nact = ac_space.n
        nbatch = nenvs*nsteps

        A = tf.placeholder(tf.int32, [nbatch])
        ADV = tf.placeholder(tf.float32, [nbatch])
        R = tf.placeholder(tf.float32, [nbatch])
        LR = tf.placeholder(tf.float32, [])

        step_model = policy(sess, ob_space, ac_space, nenvs, 1, nstack, reuse=False)
        train_model = policy(sess, ob_space, ac_space, nenvs, nsteps, nstack, reuse=True)

        neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi, labels=A)
        pg_loss = tf.reduce_mean(ADV * neglogpac)
        vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), R))
        entropy = tf.reduce_mean(cat_entropy(train_model.pi))
        loss = pg_loss - entropy*ent_coef + vf_loss * vf_coef

        params = find_trainable_variables("model")
        grads = tf.gradients(loss, params)
        if max_grad_norm is not None:
            grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
        grads = list(zip(grads, params))
        trainer = tf.train.RMSPropOptimizer(learning_rate=LR, decay=alpha, epsilon=epsilon)
        _train = trainer.apply_gradients(grads)

        lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)

        def train(obs, states, rewards, masks, actions, values):
            advs = rewards - values
            for step in range(len(obs)):
                cur_lr = lr.value()
            td_map = {train_model.X:obs, A:actions, ADV:advs, R:rewards, LR:cur_lr}
            if states != []:
                td_map[train_model.S] = states
                td_map[train_model.M] = masks
            policy_loss, value_loss, policy_entropy, _ = sess.run(
                [pg_loss, vf_loss, entropy, _train],
                td_map
            )
            return policy_loss, value_loss, policy_entropy

        def save(save_path):
            ps = sess.run(params)
            make_path(save_path)
            joblib.dump(ps, save_path)

        def load(load_path):
            loaded_params = joblib.load(load_path)
            restores = []
            for p, loaded_p in zip(params, loaded_params):
                restores.append(p.assign(loaded_p))
            ps = sess.run(restores)

        self.train = train
        self.train_model = train_model
        self.step_model = step_model
        self.step = step_model.step
        self.value = step_model.value
        self.initial_state = step_model.initial_state
        self.save = save
        self.load = load
        tf.global_variables_initializer().run(session=sess)
Exemplo n.º 14
0
    def __init__(self, policy, ob_space, ac_space, nenvs, nsteps,
            ent_coef=0.01, vf_coef=0.5, max_grad_norm=0.5, lr=7e-4,
            alpha=0.99, epsilon=1e-5, total_timesteps=int(80e6), lrschedule='linear'):

        sess = tf_util.make_session()
        nact = ac_space.n
        nbatch = nenvs*nsteps

        A = tf.placeholder(tf.int32, [nbatch])
        ADV = tf.placeholder(tf.float32, [nbatch])
        R = tf.placeholder(tf.float32, [nbatch])
        LR = tf.placeholder(tf.float32, [])

        # Build the step_model and train_model policies
        # Both models are given the same 'sess'
        print("Constructing model... STEP_MODEL & TRAIN_MODEL: constructing step_model policy | " + str(policy))
        step_model = policy(sess, ob_space, ac_space, nenvs, 1, reuse=False)

        # train_model consumes the mini-batch produced by nsteps rollout steps of step_model, NOTE: reuse=True (weights are shared)
        train_model = policy(sess, ob_space, ac_space, nenvs*nsteps, nsteps, reuse=True)

        # neglogpac is the negative log-probability of the actions taken, computed via
        # sparse softmax cross-entropy over the training model's policy logits
        neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi, labels=A)
        print("MAIN: neglogpac = sparse_softmax_cross_entropy_with_logits() inputs: ")
        print("MAIN: train_model_pi: " + str(train_model.pi))
        print("MAIN: labels: " + str(A))

        # var init: policy gradient loss determined by average of all advantage * neglogpac
        pg_loss = tf.reduce_mean(ADV * neglogpac)

        # value function loss is mse(tf.squeeze(train_model.vf), R)
        # ^ in English: mse(model value prediction, actual reward)
        # mse == mean squared error, defined in a2c/utils.py
        vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), R))

        # entropy of policy
        entropy = tf.reduce_mean(cat_entropy(train_model.pi))

        # total A2C loss: policy gradient term minus the entropy bonus plus the weighted value loss
        # (this matches the objective in the A3C paper, Mnih et al. 2016, applied synchronously)
        loss = pg_loss - entropy*ent_coef + vf_loss * vf_coef


        # params collects the trainable variables under the 'model' scope (the network weights)
        params = find_trainable_variables("model")

        # computes the gradients (the direction in which to change the weights) of 'loss' w.r.t. 'params'
        # i.e. the symbolic derivatives of 'loss' with respect to each tensor in 'params'
        # from the TensorFlow docs: gradients() adds ops to the graph that output the derivatives of 'loss' w.r.t. 'params'
        grads = tf.gradients(loss, params)
        if max_grad_norm is not None:
            grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)

        # TODO: how many gradients are computed here, should be 16
        grads = list(zip(grads, params))
        # RMSProp adapts the effective step size per parameter using a moving average of squared gradients; see thesis notes
        trainer = tf.train.RMSPropOptimizer(learning_rate=LR, decay=alpha, epsilon=epsilon)
        # apply_gradients applies the (clipped) gradients to the trainable variables to update the weights
        _train = trainer.apply_gradients(grads)

        lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)


        writer = tf.summary.FileWriter("/tmp/helloTensorBoard.txt")
        writer.add_graph(sess.graph)

        # Trains the model,
        # TODO: What is 'masks' input param
        # TODO: How often does train_model (steps thru train_model) get run vs. step_model
        #   A: I think it does a 'train_model' for each mini-batch, which is currently 5 steps
        # Does a sess.run with train_model
        def train(obs, states, rewards, masks, actions, values):
            advs = rewards - values
            for step in range(len(obs)):
                cur_lr = lr.value()
            # td_map feeds the batch inputs into the train_model placeholders
            td_map = {train_model.X:obs, A:actions, ADV:advs, R:rewards, LR:cur_lr}

            if states is not None:
                td_map[train_model.S] = states
                td_map[train_model.M] = masks

            # Run one training update: evaluate the policy loss, value loss, and policy entropy,
            # and apply the RMSProp step (_train performs the backward pass and weight update)
            policy_loss, value_loss, policy_entropy, _ = sess.run(
                [pg_loss, vf_loss, entropy, _train],
                td_map
            )
            return policy_loss, value_loss, policy_entropy

        def save(save_path):
            # Note: save_path is ignored here; the model is always written into the logger directory.
            path = logger.get_dir() + "/model.pkl"

            print("Logger dir: " + logger.get_dir())
            print("MODEL SAVED TO : " + str(path))

            ps = sess.run(params)
            #make_path(osp.dirname(save_path))
            joblib.dump(ps, path)

        def load(load_path):
            loaded_params = joblib.load(load_path)
            restores = []
            for p, loaded_p in zip(params, loaded_params):
                restores.append(p.assign(loaded_p))
            ps = sess.run(restores)

        self.train = train
        self.train_model = train_model
        self.step_model = step_model
        self.step = step_model.step
        self.value = step_model.value
        self.initial_state = step_model.initial_state
        self.save = save
        self.load = load
        tf.global_variables_initializer().run(session=sess)
Exemplo n.º 15
0
    def __init__(self,
                 policy,
                 ob_space,
                 ac_space,
                 nenvs,
                 nsteps,
                 nstack,
                 num_procs,
                 ent_coef=0.01,
                 vf_coef=0.5,
                 max_grad_norm=0.5,
                 lr=7e-4,
                 alpha=0.99,
                 epsilon=1e-5,
                 total_timesteps=int(80e6),
                 lrschedule='linear',
                 nModelsToKeep=5):
        config = tf.ConfigProto(allow_soft_placement=True,
                                intra_op_parallelism_threads=num_procs,
                                inter_op_parallelism_threads=num_procs)
        config.gpu_options.allow_growth = True
        sess = tf.Session(config=config)
        nact = ac_space.n
        nbatch = nenvs * nsteps

        A = tf.placeholder(tf.int32, [nbatch])
        ADV = tf.placeholder(tf.float32, [nbatch])
        R = tf.placeholder(tf.float32, [nbatch])
        LR = tf.placeholder(tf.float32, [])

        step_model = policy(sess,
                            ob_space,
                            ac_space,
                            nenvs,
                            1,
                            nstack,
                            reuse=False)
        train_model = policy(sess,
                             ob_space,
                             ac_space,
                             nenvs,
                             nsteps,
                             nstack,
                             reuse=True)

        neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=train_model.pi, labels=A)
        pg_loss = tf.reduce_mean(ADV * neglogpac)
        vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), R))
        entropy = tf.reduce_mean(cat_entropy(train_model.pi))
        loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef

        params = find_trainable_variables("model")
        grads = tf.gradients(loss, params)
        if max_grad_norm is not None:
            grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
        grads = list(zip(grads, params))
        trainer = tf.train.RMSPropOptimizer(learning_rate=LR,
                                            decay=alpha,
                                            epsilon=epsilon)
        _train = trainer.apply_gradients(grads)

        lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)

        def train(obs, states, rewards, masks, actions, values):
            advs = rewards - values
            for step in range(len(obs)):
                cur_lr = lr.value()
            td_map = {
                train_model.X: obs,
                A: actions,
                ADV: advs,
                R: rewards,
                LR: cur_lr
            }
            if states != []:
                td_map[train_model.S] = states
                td_map[train_model.M] = masks
            policy_loss, value_loss, policy_entropy, _ = sess.run(
                [pg_loss, vf_loss, entropy, _train], td_map)
            return policy_loss, value_loss, policy_entropy

        def save():
            modelfile = os.path.join(
                logger.get_dir(),
                datetime.datetime.now().strftime("model-%Y-%m-%d-%H-%M-%S-%f"))

            ps = sess.run(params)
            joblib.dump(ps, modelfile)
            logger.log('Model saved to %s' % modelfile)

            model_files = sorted(
                fnmatch.filter(os.listdir(logger.get_dir()), "model-*"))
            if len(model_files) > nModelsToKeep:
                for old_file in model_files[0:-nModelsToKeep]:
                    os.remove(os.path.join(logger.get_dir(), old_file))
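            # The timestamped filenames sort lexicographically in chronological order, so only the
            # nModelsToKeep most recent checkpoints are kept on disk.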

        def load(load_path):
            loaded_params = joblib.load(load_path)
            restores = []
            for p, loaded_p in zip(params, loaded_params):
                restores.append(p.assign(loaded_p))
            ps = sess.run(restores)
            logger.log('Model loaded from %s' % load_path)

        self.train = train
        self.train_model = train_model
        self.step_model = step_model
        self.step = step_model.step
        self.value = step_model.value
        self.initial_state = step_model.initial_state
        self.save = save
        self.load = load
        tf.global_variables_initializer().run(session=sess)
Exemplo n.º 16
0
    def __init__(self, policy, ob_space, ac_space, nenvs,total_timesteps, nprocs=32, nsteps=20,
                 ent_coef=0.01, vf_coef=0.5, vf_fisher_coef=1.0, lr=0.25, max_grad_norm=0.5,
                 kfac_clip=0.001, lrschedule='linear'):
        config = tf.ConfigProto(allow_soft_placement=True,
                                intra_op_parallelism_threads=nprocs,
                                inter_op_parallelism_threads=nprocs)
        config.gpu_options.allow_growth = True
        self.sess = sess = tf.Session(config=config)
        nact = ac_space.n
        nbatch = nenvs * nsteps
        A = tf.placeholder(tf.int32, [nbatch])
        ADV = tf.placeholder(tf.float32, [nbatch])
        R = tf.placeholder(tf.float32, [nbatch])
        PG_LR = tf.placeholder(tf.float32, [])
        VF_LR = tf.placeholder(tf.float32, [])

        self.model = step_model = policy(sess, ob_space, ac_space, nenvs, 1, reuse=False)
        self.model2 = train_model = policy(sess, ob_space, ac_space, nenvs*nsteps, nsteps, reuse=True)

        logpac = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi, labels=A)
        self.logits = logits = train_model.pi

        ##training loss
        pg_loss = tf.reduce_mean(ADV*logpac)
        entropy = tf.reduce_mean(cat_entropy(train_model.pi))
        pg_loss = pg_loss - ent_coef * entropy
        vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), R))
        train_loss = pg_loss + vf_coef * vf_loss


        ##Fisher loss construction
        self.pg_fisher = pg_fisher_loss = -tf.reduce_mean(logpac)
        sample_net = train_model.vf + tf.random_normal(tf.shape(train_model.vf))
        self.vf_fisher = vf_fisher_loss = - vf_fisher_coef*tf.reduce_mean(tf.pow(train_model.vf - tf.stop_gradient(sample_net), 2))
        self.joint_fisher = joint_fisher_loss = pg_fisher_loss + vf_fisher_loss
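        # Fisher terms used by K-FAC (as in ACKTR): the policy part is the mean log-likelihood of the
        # sampled actions, and the value part treats V(s) as the mean of a unit-variance Gaussian whose
        # "sample" is V(s) plus noise, so the squared error against the stopped sample yields the
        # output-distribution Fisher for the value head rather than the empirical gradient covariance.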

        self.params=params = find_trainable_variables("model")

        self.grads_check = grads = tf.gradients(train_loss,params)

        with tf.device('/gpu:0'):
            self.optim = optim = kfac.KfacOptimizer(learning_rate=PG_LR, clip_kl=kfac_clip,\
                momentum=0.9, kfac_update=1, epsilon=0.01,\
                stats_decay=0.99, async=1, cold_iter=10, max_grad_norm=max_grad_norm)
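            # Note: `async` became a reserved keyword in Python 3.7, so this call only parses on
            # Python <= 3.6; the kfac argument needs to be renamed for newer interpreters.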

            update_stats_op = optim.compute_and_apply_stats(joint_fisher_loss, var_list=params)
            train_op, q_runner = optim.apply_gradients(list(zip(grads,params)))
        self.q_runner = q_runner
        self.lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)

        def train(obs, states, rewards, masks, actions, values):
            advs = rewards - values
            for step in range(len(obs)):
                cur_lr = self.lr.value()

            td_map = {train_model.X:obs, A:actions, ADV:advs, R:rewards, PG_LR:cur_lr}
            if states is not None:
                td_map[train_model.S] = states
                td_map[train_model.M] = masks

            policy_loss, value_loss, policy_entropy, _ = sess.run(
                [pg_loss, vf_loss, entropy, train_op],
                td_map
            )
            return policy_loss, value_loss, policy_entropy

        def save(save_path):
            ps = sess.run(params)
            joblib.dump(ps, save_path)

        def load(load_path):
            loaded_params = joblib.load(load_path)
            restores = []
            for p, loaded_p in zip(params, loaded_params):
                restores.append(p.assign(loaded_p))
            sess.run(restores)



        self.train = train
        self.save = save
        self.load = load
        self.train_model = train_model
        self.step_model = step_model
        self.step = step_model.step
        self.value = step_model.value
        self.initial_state = step_model.initial_state
        tf.global_variables_initializer().run(session=sess)
Exemplo n.º 17
0
    def __init__(self,
                 policy,
                 ob_space,
                 ac_space,
                 nenvs,
                 nsteps,
                 nstack,
                 num_procs,
                 ent_coef=0.01,
                 vf_coef=0.5,
                 max_grad_norm=0.5,
                 lr=7e-4,
                 alpha=0.99,
                 epsilon=1e-5,
                 total_timesteps=int(80e6),
                 lrschedule='linear'):
        config = tf.ConfigProto(allow_soft_placement=True,
                                intra_op_parallelism_threads=num_procs,
                                inter_op_parallelism_threads=num_procs)
        config.gpu_options.allow_growth = True
        sess = tf.Session(config=config)
        nact = ac_space.n
        nbatch = nenvs * nsteps

        writer = tf.summary.FileWriter(
            "/tmp/a2c_demo/1")  # Change for SAT: this is to use TensorBoard

        A = tf.placeholder(tf.int32, [nbatch])      # Comments by Fei: this must be the action
        ADV = tf.placeholder(tf.float32, [nbatch])  # Comments by Fei: this must be the advantage
        R = tf.placeholder(tf.float32, [nbatch])    # Comments by Fei: this must be the reward
        LR = tf.placeholder(tf.float32, [])         # Comments by Fei: this must be the learning rate

        step_model = policy(sess,
                            ob_space,
                            ac_space,
                            nenvs,
                            1,
                            nstack,
                            reuse=False)
        train_model = policy(sess,
                             ob_space,
                             ac_space,
                             nenvs,
                             nsteps,
                             nstack,
                             reuse=True)

        neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=train_model.pi,
            labels=A)  # Comments by Fei: pi is nbatch * nact
        pg_loss = tf.reduce_mean(ADV * neglogpac)
        vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), R))
        entropy = tf.reduce_mean(cat_entropy(train_model.pi))
        loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef

        params = find_trainable_variables("model")
        grads = tf.gradients(loss, params)
        if max_grad_norm is not None:
            grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
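            # clip_by_global_norm rescales all gradients by max_grad_norm / global_norm
            # when their joint norm exceeds max_grad_norm, preserving their direction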
        grads = list(zip(grads, params))
        trainer = tf.train.RMSPropOptimizer(learning_rate=LR,
                                            decay=alpha,
                                            epsilon=epsilon)
        _train = trainer.apply_gradients(grads)

        lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)

        def train(obs, states, rewards, masks, actions, values):
            advs = rewards - values
            for step in range(len(obs)):
                cur_lr = lr.value()
            td_map = {
                train_model.X: obs,
                A: actions,
                ADV: advs,
                R: rewards,
                LR: cur_lr
            }
            if states != []:
                td_map[train_model.S] = states
                td_map[train_model.M] = masks
            policy_loss, value_loss, policy_entropy, _ = sess.run(
                [pg_loss, vf_loss, entropy, _train], td_map)
            # writer.add_graph(sess.graph)
            return policy_loss, value_loss, policy_entropy

        def save(save_path):
            ps = sess.run(params)
            make_path(save_path)
            joblib.dump(ps, save_path)

        def load(load_path):
            loaded_params = joblib.load(load_path)
            restores = []
            for p, loaded_p in zip(params, loaded_params):
                restores.append(p.assign(loaded_p))
            ps = sess.run(restores)

        self.train = train
        self.train_model = train_model
        self.step_model = step_model
        self.step = step_model.step
        self.value = step_model.value
        self.initial_state = step_model.initial_state
        self.save = save
        self.load = load
        tf.global_variables_initializer().run(session=sess)
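
As a quick illustration of the objective assembled above, here is a minimal, self-contained NumPy sketch (function and variable names are illustrative, not taken from the example) of what pg_loss, vf_loss and the entropy bonus compute for a toy batch:

import numpy as np

def a2c_loss_sketch(logits, actions, advantages, returns, values,
                    ent_coef=0.01, vf_coef=0.5):
    # softmax over the action logits
    z = logits - logits.max(axis=1, keepdims=True)
    probs = np.exp(z) / np.exp(z).sum(axis=1, keepdims=True)
    # negative log-probability of the actions actually taken (the neglogpac term)
    neglogpac = -np.log(probs[np.arange(len(actions)), actions])
    pg_loss = np.mean(advantages * neglogpac)            # policy-gradient surrogate
    vf_loss = np.mean((values - returns) ** 2)           # value error (some mse helpers add a 1/2 factor)
    entropy = np.mean(-(probs * np.log(probs)).sum(1))   # mean policy entropy
    return pg_loss - ent_coef * entropy + vf_coef * vf_loss

print(a2c_loss_sketch(np.array([[2.0, 0.5, -1.0], [0.1, 0.2, 0.3]]),
                      actions=np.array([0, 2]),
                      advantages=np.array([1.5, -0.3]),
                      returns=np.array([2.0, 0.5]),
                      values=np.array([1.0, 0.8])))
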
Exemplo n.º 18
0
    def __init__(self, policy, ob_space, ac_space, nenvs, master_ts = 1, worker_ts = 30,
            ent_coef=0.01, vf_coef=0.5, max_grad_norm=0.5, lr=7e-4, cell = 256,
            alpha=0.99, epsilon=1e-5, total_timesteps=int(80e6), lrschedule='linear',
            algo='regular', beta=1e-3):

        print('Create Session')
        gpu_options = tf.GPUOptions(allow_growth=True)
        sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
        nact = ac_space.n
        nbatch = nenvs*master_ts*worker_ts

        A = tf.placeholder(tf.int32, [nbatch])
        ADV = tf.placeholder(tf.float32, [nbatch])
        R = tf.placeholder(tf.float32, [nbatch])
        LR = tf.placeholder(tf.float32, [])

        step_model = policy(sess, ob_space, ac_space, nenvs, 1, 1, cell = cell, model='step_model', algo=algo)
        train_model = policy(sess, ob_space, ac_space, nbatch, master_ts, worker_ts, model='train_model', algo=algo)
        print('model_setting_done')

        #loss construction
        neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.wpi, labels=A)
        pg_loss = tf.reduce_mean(ADV * neglogpac)
        vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.wvf), R))
        entropy = tf.reduce_mean(cat_entropy(train_model.wpi))
        pg_loss = pg_loss - entropy * ent_coef
        print('algo: ', algo, 'max_grad_norm: ', str(max_grad_norm))
        try:
            if algo == 'regular':
                loss = pg_loss + vf_coef * vf_loss
            elif algo == 'VIB':
                '''
                VIB adds a third loss on top of pg_loss and vf_loss:
                kl_loss = ds.kl_divergence(model.encoding, prior), where the prior is a
                Gaussian with mu=0, std=1. The final loss is
                pg_loss + vf_coef * vf_loss + beta * kl_loss.
                '''
                prior = ds.Normal(0.0, 1.0)
                kl_loss = tf.reduce_mean(ds.kl_divergence(train_model.encoding, prior))
                loss = pg_loss + vf_coef * vf_loss + beta*kl_loss
                # pass
            else:
                raise Exception('unrecognized algo: {}'.format(algo))
        except Exception as e:
            print(e)
            raise  # re-raise so an undefined `loss` is never used below

        grads, global_norm = grad_clip(loss, max_grad_norm, ['model'])
        trainer = tf.train.RMSPropOptimizer(learning_rate=LR, decay=alpha, epsilon=epsilon)
        _train = trainer.apply_gradients(grads)

        lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)

        def train(wobs, whs, states, rewards, masks, actions, values):
            advs = rewards - values
            for step in range(len(whs)):
                cur_lr = lr.value()

            td_map = {train_model.wX:wobs, A:actions, ADV:advs, R:rewards, LR:cur_lr}
            if states is not None:
                td_map[train_model.wS] = states
                td_map[train_model.wM] = masks

            '''
            additional VIB losses (e.g. kl_loss) can be fetched here for debugging
            '''
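            # e.g. (debugging sketch, only valid when algo == 'VIB' so kl_loss exists):
            # kl_val = sess.run(kl_loss, feed_dict=td_map)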
            tloss, value_loss, policy_loss, policy_entropy, _ = sess.run(
                [loss, vf_loss, pg_loss, entropy, _train],
                feed_dict=td_map
            )
            return tloss, value_loss, policy_loss, policy_entropy

        params = find_trainable_variables("model")
        def save(save_path):
            ps = sess.run(params)
            make_path(osp.dirname(save_path))
            joblib.dump(ps, save_path)

        def load(load_path):
            loaded_params = joblib.load(load_path)
            restores = []
            for p, loaded_p in zip(params, loaded_params):
                restores.append(p.assign(loaded_p))
            ps = sess.run(restores)

        self.train_model = train_model
        self.step_model = step_model
        self.step = step_model.step
        self.value = step_model.wvalue
        self.get_wh = step_model.get_wh
        self.initial_state = step_model.w_initial_state
        self.train = train
        self.save = save
        self.load = load
        tf.global_variables_initializer().run(session=sess)
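
For the VIB branch above, the KL term has a simple closed form when the encoding is a diagonal Gaussian and the prior is a standard normal. A small NumPy sketch (illustrative names, not from the example) of the per-dimension KL that ds.kl_divergence computes in that case:

import numpy as np

def kl_diag_gaussian_vs_standard_normal(mu, sigma):
    # KL( N(mu, sigma^2) || N(0, 1) ), element-wise, then summed per sample;
    # the example reduces the corresponding tensor with a mean to obtain kl_loss
    kl = 0.5 * (sigma ** 2 + mu ** 2 - 1.0 - 2.0 * np.log(sigma))
    return kl.sum(axis=-1)

mu = np.array([[0.0, 0.5], [1.0, -0.2]])
sigma = np.array([[1.0, 0.8], [0.5, 1.2]])
print(kl_diag_gaussian_vs_standard_normal(mu, sigma))
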
Exemplo n.º 19
0
  def __init__(self,
               policy,
               ob_space,
               ac_space,
               nenvs,
               total_timesteps,
               nprocs=32,
               nscripts=16,
               nsteps=20,
               nstack=4,
               ent_coef=0.1,
               vf_coef=0.5,
               vf_fisher_coef=1.0,
               lr=0.25,
               max_grad_norm=0.001,
               kfac_clip=0.001,
               lrschedule='linear',
               alpha=0.99,
               epsilon=1e-5):
    config = tf.ConfigProto(
        allow_soft_placement=True,
        intra_op_parallelism_threads=nprocs,
        inter_op_parallelism_threads=nprocs)
    config.gpu_options.allow_growth = True
    self.sess = sess = tf.Session(config=config)
    nsml.bind(sess=sess)
    #nact = ac_space.n
    nbatch = nenvs * nsteps
    A = tf.placeholder(tf.int32, [nbatch])

    XY0 = tf.placeholder(tf.int32, [nbatch])
    XY1 = tf.placeholder(tf.int32, [nbatch])

    # ADV == TD_TARGET - values
    ADV = tf.placeholder(tf.float32, [nbatch])
    TD_TARGET = tf.placeholder(tf.float32, [nbatch])
    PG_LR = tf.placeholder(tf.float32, [])
    VF_LR = tf.placeholder(tf.float32, [])

    self.model = step_model = policy(
        sess, ob_space, ac_space, nenvs, 1, nstack, reuse=False)
    self.model2 = train_model = policy(
        sess, ob_space, ac_space, nenvs, nsteps, nstack, reuse=True)

    # Policy 1 : Base Action : train_model.pi label = A

    script_mask = tf.concat(
        [
            tf.zeros([nscripts * nsteps, 1]),
            tf.ones([(nprocs - nscripts) * nsteps, 1])
        ],
        axis=0)

    pi = train_model.pi
    pac_weight = script_mask * (tf.nn.softmax(pi) - 1.0) + 1.0
    pac_weight = tf.reduce_sum(pac_weight * tf.one_hot(A, depth=3), axis=1)
    neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=pi, labels=A)
    neglogpac *= tf.stop_gradient(pac_weight)
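    # Descriptive note: rows where script_mask is 0 keep pac_weight == 1, while rows
    # where it is 1 use softmax(pi)[A], i.e. the taken action's probability, as a
    # stop-gradient weight on the cross-entropy term.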

    inv_A = 1.0 - tf.cast(A, tf.float32)

    xy0_mask = tf.cast(A, tf.float32)
    xy1_mask = tf.cast(A, tf.float32)

    condition0 = tf.equal(xy0_mask, 2)
    xy0_mask = tf.where(condition0, tf.ones(tf.shape(xy0_mask)), xy0_mask)
    xy0_mask = 1.0 - xy0_mask

    condition1 = tf.equal(xy1_mask, 2)
    xy1_mask = tf.where(condition1, tf.zeros(tf.shape(xy1_mask)), xy1_mask)

    # One hot representation of chosen marine.
    # [batch_size, 2]
    pi_xy0 = train_model.pi_xy0
    pac_weight = script_mask * (tf.nn.softmax(pi_xy0) - 1.0) + 1.0
    pac_weight = tf.reduce_sum(
        pac_weight * tf.one_hot(XY0, depth=1024), axis=1)

    logpac_xy0 = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=pi_xy0, labels=XY0)
    logpac_xy0 *= tf.stop_gradient(pac_weight)
    logpac_xy0 *= tf.cast(xy0_mask, tf.float32)

    pi_xy1 = train_model.pi_xy1
    pac_weight = script_mask * (tf.nn.softmax(pi_xy1) - 1.0) + 1.0
    # the original code reused XY0 here; XY1 matches the labels fed to logpac_xy1 below
    pac_weight = tf.reduce_sum(
        pac_weight * tf.one_hot(XY1, depth=1024), axis=1)

    # 1D? 2D?
    logpac_xy1 = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=pi_xy1, labels=XY1)
    logpac_xy1 *= tf.stop_gradient(pac_weight)
    logpac_xy1 *= tf.cast(xy1_mask, tf.float32)

    pg_loss = tf.reduce_mean(ADV * neglogpac)
    pg_loss_xy0 = tf.reduce_mean(ADV * logpac_xy0)
    pg_loss_xy1 = tf.reduce_mean(ADV * logpac_xy1)

    vf_ = tf.squeeze(train_model.vf)

    vf_r = tf.concat(
        [
            tf.ones([nscripts * nsteps, 1]),
            tf.zeros([(nprocs - nscripts) * nsteps, 1])
        ],
        axis=0) * TD_TARGET
    vf_masked = vf_ * script_mask + vf_r
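    # Intent: for the first nscripts * nsteps rows the prediction is replaced by
    # TD_TARGET (so they contribute zero value loss); the remaining rows keep the
    # predicted value. Note that vf_ is rank-1 while script_mask is [nbatch, 1],
    # so the broadcast shapes are worth double-checking here.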

    #vf_mask[0:nscripts * nsteps] = R[0:nscripts * nsteps]

    vf_loss = tf.reduce_mean(mse(vf_masked, TD_TARGET))
    entropy_a = tf.reduce_mean(cat_entropy(train_model.pi))
    entropy_xy0 = tf.reduce_mean(cat_entropy(train_model.pi_xy0))
    entropy_xy1 = tf.reduce_mean(cat_entropy(train_model.pi_xy1))
    entropy = entropy_a + entropy_xy0 + entropy_xy1

    loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef

    params = find_trainable_variables("model")
    grads = tf.gradients(loss, params)
    if max_grad_norm is not None:
      grads, _ = tf.clip_by_global_norm(grads, max_grad_norm)
    grads = list(zip(grads, params))
    trainer = tf.train.RMSPropOptimizer(
        learning_rate=lr, decay=alpha, epsilon=epsilon)
    _train = trainer.apply_gradients(grads)

    self.logits = logits = train_model.pi

    # xy0

    self.params_common = params_common = tf.get_collection(
        tf.GraphKeys.TRAINABLE_VARIABLES, scope='model/common')
    self.params_xy0 = params_xy0 = tf.get_collection(
        tf.GraphKeys.TRAINABLE_VARIABLES,
        scope='model/xy0') + params_common

    train_loss_xy0 = pg_loss_xy0 - entropy * ent_coef + vf_coef * vf_loss

    self.grads_check_xy0 = grads_xy0 = tf.gradients(
        train_loss_xy0, params_xy0)
    if max_grad_norm is not None:
      grads_xy0, _ = tf.clip_by_global_norm(grads_xy0, max_grad_norm)

    grads_xy0 = list(zip(grads_xy0, params_xy0))
    trainer_xy0 = tf.train.RMSPropOptimizer(
        learning_rate=lr, decay=alpha, epsilon=epsilon)
    _train_xy0 = trainer_xy0.apply_gradients(grads_xy0)

    # xy1

    self.params_xy1 = params_xy1 = tf.get_collection(
        tf.GraphKeys.TRAINABLE_VARIABLES,
        scope='model/xy1') + params_common

    train_loss_xy1 = pg_loss_xy1 - entropy * ent_coef + vf_coef * vf_loss

    self.grads_check_xy1 = grads_xy1 = tf.gradients(
        train_loss_xy1, params_xy1)
    if max_grad_norm is not None:
      grads_xy1, _ = tf.clip_by_global_norm(grads_xy1, max_grad_norm)

    grads_xy1 = list(zip(grads_xy1, params_xy1))
    trainer_xy1 = tf.train.RMSPropOptimizer(
        learning_rate=lr, decay=alpha, epsilon=epsilon)
    _train_xy1 = trainer_xy1.apply_gradients(grads_xy1)

    self.lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)

    def train(obs, states, td_targets, masks, actions, xy0, xy1, values):
      advs = td_targets - values
      for step in range(len(obs)):
        cur_lr = self.lr.value()

      td_map = {
          train_model.X: obs,
          A: actions,
          XY0: xy0,
          XY1: xy1,
          ADV: advs,
          TD_TARGET: td_targets,
          PG_LR: cur_lr
      }
      if states != []:
        td_map[train_model.S] = states
        td_map[train_model.M] = masks

      policy_loss, value_loss, policy_entropy, _, \
      policy_loss_xy0, policy_entropy_xy0, _, \
      policy_loss_xy1, policy_entropy_xy1, _ = sess.run(
          [pg_loss, vf_loss, entropy, _train,
           pg_loss_xy0, entropy_xy0, _train_xy0,
           pg_loss_xy1, entropy_xy1, _train_xy1],
          td_map)
      return policy_loss, value_loss, policy_entropy, \
             policy_loss_xy0, policy_entropy_xy0, \
             policy_loss_xy1, policy_entropy_xy1

    def save(save_path):
      ps = sess.run(params)
      joblib.dump(ps, save_path)

    def load(load_path):
      loaded_params = joblib.load(load_path)
      restores = []
      for p, loaded_p in zip(params, loaded_params):
        restores.append(p.assign(loaded_p))
      sess.run(restores)

    self.train = train
    self.save = save
    self.load = load
    self.train_model = train_model
    self.step_model = step_model
    self.step = step_model.step
    self.value = step_model.value
    self.initial_state = step_model.initial_state
    print("global_variables_initializer start")
    tf.global_variables_initializer().run(session=sess)
    print("global_variables_initializer complete")
Exemplo n.º 20
0
    def __init__(self, policy, p, has_state):
        """
        policy : Internal policy model, such as SnakeModel.CNNPolicy
        p : Hyperparameters required for training
        """
        sess = tf_util.make_session()
        # Tensorflow model initialization
        step_model = policy(sess=sess,
                            p=p,
                            train_phase=False,
                            has_state=has_state)  # Deploy model settings
        train_model = policy(sess=sess,
                             p=p,
                             train_phase=True,
                             has_state=has_state)  # Training model settings
        saver = tf.train.Saver()

        #Step 2 : Initialize the training parameters
        A = tf.placeholder(tf.int32, [p.N_BATCH])
        ADV = tf.placeholder(tf.float32, [p.N_BATCH])
        R = tf.placeholder(tf.float32, [p.N_BATCH])
        LR = tf.placeholder(tf.float32, [])

        #Step 3 : Define the loss Function
        neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=train_model.pi, labels=A)  #
        pg_loss = tf.reduce_mean(ADV * neglogpac)
        vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), R))
        entropy = tf.reduce_mean(cat_entropy(train_model.pi))
        loss = pg_loss - entropy * p.ENTROPY_COEFF + vf_loss * p.VALUE_FUNC_COEFF

        #Step 4 : Define the loss optimizer
        params = find_trainable_variables("model")
        grads = tf.gradients(loss, params)
        if p.MAX_GRAD_NORM is not None:
            grads, grad_norm = tf.clip_by_global_norm(
                grads, p.MAX_GRAD_NORM
            )  # Clipping the gradients to protect learned weights
        grads = list(zip(grads, params))
        trainer = tf.train.RMSPropOptimizer(learning_rate=LR,
                                            decay=p.RMS_DECAY,
                                            epsilon=p.EPSILON)
        _train = trainer.apply_gradients(
            grads)  # op that applies the gradient update at each training step
        lr = Scheduler(v=p.LEARNING_RATE,
                       nvalues=p.N_TIMESTEPS,
                       schedule=p.LEARNING_RATE_SCHEDULE
                       )  # Learning rate changes linearly or as per arguments

        # Step 5 : Write down the summary parameters to be used
        writer = tf.summary.FileWriter(p.LOG_PATH)  #summary writer

        def train(obs, rewards, masks, actions, values, states):
            """
            obs     : batch x n x m x 1 snake matrix
            rewards : batch x 1 rewards corrosponding to action 
            actions : batch x 1 discrete action taken
            values  : batch x 1 output of value function during the training process  
            """
            advs = rewards - values
            for step in range(len(obs)):
                cur_lr = lr.value()
            td_map = {
                train_model.X: obs,
                train_model.S: states,
                A: actions,
                ADV: advs,
                R: rewards,
                LR: cur_lr
            }
            policy_loss, value_loss, policy_entropy, _ = sess.run(
                [pg_loss, vf_loss, entropy, _train], td_map)
            return policy_loss, value_loss, policy_entropy

        def save(save_path):
            #ps = sess.run(params)
            #make_path(save_path)
            #joblib.dump(ps, save_path)
            saver.save(sess, save_path)

        def load(load_path):
            #loaded_params = joblib.load(load_path)
            #restores = []
            #for p, loaded_p in zip(params, loaded_params):
            #    restores.append(p.assign(loaded_p))
            #ps = sess.run(restores)
            saver.restore(sess, load_path)

        def add_scalar_summary(tag, value, step):
            summary = tf.Summary(
                value=[tf.Summary.Value(tag=tag, simple_value=value)])
            writer.add_summary(summary, step)
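
        # Usage sketch: add_scalar_summary('policy_loss', float(policy_loss), update_step)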

        # Expose the user to closure functions
        self.train = train
        self.train_model = train_model
        self.step_model = step_model
        self.step = step_model.step
        self.value = step_model.value
        self.hidden_value = step_model.hidden_value
        self.initial_state = step_model.initial_state
        self.add_scalar_summary = add_scalar_summary
        self.save = save
        self.load = load
        # Initialize global variables and add tf graph
        tf.global_variables_initializer().run(session=sess)
        writer.add_graph(tf.get_default_graph())  #write graph
Exemplo n.º 21
0
    def __init__(self,
                 policy,
                 ob_space,
                 ac_space,
                 nenvs,
                 nsteps,
                 ent_coef=0.01,
                 vf_coef=0.5,
                 max_grad_norm=0.5,
                 lr=7e-4,
                 alpha=0.99,
                 epsilon=1e-5,
                 total_timesteps=int(80e6),
                 lrschedule='linear',
                 param=None):

        sess = tf_util.make_session()
        nact = ac_space.n
        nbatch = nenvs * nsteps

        A = tf.placeholder(tf.int32, [nbatch])
        ADV = tf.placeholder(tf.float32, [nbatch])
        R = tf.placeholder(tf.float32, [nbatch])
        LR = tf.placeholder(tf.float32, [])

        step_model = policy(sess,
                            ob_space,
                            ac_space,
                            nenvs,
                            1,
                            reuse=False,
                            param=param)
        train_model = policy(sess,
                             ob_space,
                             ac_space,
                             nenvs * nsteps,
                             nsteps,
                             reuse=True,
                             param=param)

        neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=train_model.pi, labels=A)
        pg_loss = tf.reduce_mean(ADV * neglogpac)
        vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), R))
        entropy = tf.reduce_mean(cat_entropy(train_model.pi))
        loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef

        params = find_trainable_variables("model")
        grads = tf.gradients(loss, params)
        if max_grad_norm is not None:
            grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
        grads = list(zip(grads, params))
        trainer = tf.train.RMSPropOptimizer(learning_rate=LR,
                                            decay=alpha,
                                            epsilon=epsilon)
        _train = trainer.apply_gradients(grads)

        lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)
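        # with schedule='linear', lr.value() decays the learning rate linearly towards
        # zero over total_timesteps calls; 'constant' would keep it fixed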

        def train(obs, states, rewards, masks, actions, values):
            advs = rewards - values
            for step in range(len(obs)):
                cur_lr = lr.value()
            td_map = {
                train_model.X: obs,
                A: actions,
                ADV: advs,
                R: rewards,
                LR: cur_lr
            }
            if states is not None:
                td_map[train_model.S] = states
                td_map[train_model.M] = masks
            policy_loss, value_loss, policy_entropy, _ = sess.run(
                [pg_loss, vf_loss, entropy, _train], td_map)
            return policy_loss, value_loss, policy_entropy

        def save(save_path):
            ps = sess.run(params)
            make_path(save_path)
            joblib.dump(ps, save_path)

        def load(load_path):
            loaded_params = joblib.load(load_path)
            restores = []
            for p, loaded_p in zip(params, loaded_params):
                restores.append(p.assign(loaded_p))
            ps = sess.run(restores)

        self.train = train
        self.train_model = train_model
        self.step_model = step_model
        self.step = step_model.step
        self.value = step_model.value
        self.initial_state = step_model.initial_state
        self.save = save
        self.load = load
        tf.global_variables_initializer().run(session=sess)
Exemplo n.º 22
0
  def __init__(self,
               policy,
               ob_space,
               ac_space,
               nenvs,
               total_timesteps,
               nprocs=32,
               nscripts=16,
               nsteps=20,
               nstack=4,
               ent_coef=0.1,
               vf_coef=0.5,
               vf_fisher_coef=1.0,
               lr=0.25,
               max_grad_norm=0.001,
               kfac_clip=0.001,
               lrschedule='linear',
               alpha=0.99,
               epsilon=1e-5):
    config = tf.ConfigProto(
        allow_soft_placement=True,
        intra_op_parallelism_threads=nprocs,
        inter_op_parallelism_threads=nprocs)
    config.gpu_options.allow_growth = True
    self.sess = sess = tf.Session(config=config)
    nsml.bind(sess=sess)
    #nact = ac_space.n
    nbatch = nenvs * nsteps
    A = tf.placeholder(tf.int32, [nbatch])

    XY0 = tf.placeholder(tf.int32, [nbatch])
    XY1 = tf.placeholder(tf.int32, [nbatch])

    # ADV == TD_TARGET - values
    ADV = tf.placeholder(tf.float32, [nbatch])
    TD_TARGET = tf.placeholder(tf.float32, [nbatch])
    PG_LR = tf.placeholder(tf.float32, [])
    VF_LR = tf.placeholder(tf.float32, [])

    self.model = step_model = policy(
        sess, ob_space, ac_space, nenvs, 1, nstack, reuse=False)
    self.model2 = train_model = policy(
        sess, ob_space, ac_space, nenvs, nsteps, nstack, reuse=True)

    # Policy 1 : Base Action : train_model.pi label = A

    script_mask = tf.concat(
        [
            tf.zeros([nscripts * nsteps, 1]),
            tf.ones([(nprocs - nscripts) * nsteps, 1])
        ],
        axis=0)

    pi = train_model.pi
    pac_weight = script_mask * (tf.nn.softmax(pi) - 1.0) + 1.0
    pac_weight = tf.reduce_sum(pac_weight * tf.one_hot(A, depth=3), axis=1)
    neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=pi, labels=A)
    neglogpac *= tf.stop_gradient(pac_weight)

    inv_A = 1.0 - tf.cast(A, tf.float32)

    xy0_mask = tf.cast(A, tf.float32)
    xy1_mask = tf.cast(A, tf.float32)

    condition0 = tf.equal(xy0_mask, 2)
    xy0_mask = tf.where(condition0, tf.ones(tf.shape(xy0_mask)), xy0_mask)
    xy0_mask = 1.0 - xy0_mask

    condition1 = tf.equal(xy1_mask, 2)
    xy1_mask = tf.where(condition1, tf.zeros(tf.shape(xy1_mask)), xy1_mask)

    # One hot representation of chosen marine.
    # [batch_size, 2]
    pi_xy0 = train_model.pi_xy0
    pac_weight = script_mask * (tf.nn.softmax(pi_xy0) - 1.0) + 1.0
    pac_weight = tf.reduce_sum(
        pac_weight * tf.one_hot(XY0, depth=1024), axis=1)

    logpac_xy0 = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=pi_xy0, labels=XY0)
    logpac_xy0 *= tf.stop_gradient(pac_weight)
    logpac_xy0 *= tf.cast(xy0_mask, tf.float32)

    pi_xy1 = train_model.pi_xy1
    pac_weight = script_mask * (tf.nn.softmax(pi_xy1) - 1.0) + 1.0
    # the original code reused XY0 here; XY1 matches the labels fed to logpac_xy1 below
    pac_weight = tf.reduce_sum(
        pac_weight * tf.one_hot(XY1, depth=1024), axis=1)

    # 1D? 2D?
    logpac_xy1 = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=pi_xy1, labels=XY1)
    logpac_xy1 *= tf.stop_gradient(pac_weight)
    logpac_xy1 *= tf.cast(xy1_mask, tf.float32)

    pg_loss = tf.reduce_mean(ADV * neglogpac)
    pg_loss_xy0 = tf.reduce_mean(ADV * logpac_xy0)
    pg_loss_xy1 = tf.reduce_mean(ADV * logpac_xy1)

    vf_ = tf.squeeze(train_model.vf)

    vf_r = tf.concat(
        [
            tf.ones([nscripts * nsteps, 1]),
            tf.zeros([(nprocs - nscripts) * nsteps, 1])
        ],
        axis=0) * TD_TARGET
    vf_masked = vf_ * script_mask + vf_r

    #vf_mask[0:nscripts * nsteps] = R[0:nscripts * nsteps]

    vf_loss = tf.reduce_mean(mse(vf_masked, TD_TARGET))
    entropy_a = tf.reduce_mean(cat_entropy(train_model.pi))
    entropy_xy0 = tf.reduce_mean(cat_entropy(train_model.pi_xy0))
    entropy_xy1 = tf.reduce_mean(cat_entropy(train_model.pi_xy1))
    entropy = entropy_a + entropy_xy0 + entropy_xy1

    loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef

    params = find_trainable_variables("model")
    grads = tf.gradients(loss, params)
    if max_grad_norm is not None:
      grads, _ = tf.clip_by_global_norm(grads, max_grad_norm)
    grads = list(zip(grads, params))
    trainer = tf.train.RMSPropOptimizer(
        learning_rate=lr, decay=alpha, epsilon=epsilon)
    _train = trainer.apply_gradients(grads)

    self.logits = logits = train_model.pi

    # xy0

    self.params_common = params_common = tf.get_collection(
        tf.GraphKeys.TRAINABLE_VARIABLES, scope='model/common')
    self.params_xy0 = params_xy0 = tf.get_collection(
        tf.GraphKeys.TRAINABLE_VARIABLES,
        scope='model/xy0') + params_common

    train_loss_xy0 = pg_loss_xy0 - entropy * ent_coef + vf_coef * vf_loss

    self.grads_check_xy0 = grads_xy0 = tf.gradients(
        train_loss_xy0, params_xy0)
    if max_grad_norm is not None:
      grads_xy0, _ = tf.clip_by_global_norm(grads_xy0, max_grad_norm)

    grads_xy0 = list(zip(grads_xy0, params_xy0))
    trainer_xy0 = tf.train.RMSPropOptimizer(
        learning_rate=lr, decay=alpha, epsilon=epsilon)
    _train_xy0 = trainer_xy0.apply_gradients(grads_xy0)

    # xy1

    self.params_xy1 = params_xy1 = tf.get_collection(
        tf.GraphKeys.TRAINABLE_VARIABLES,
        scope='model/xy1') + params_common

    train_loss_xy1 = pg_loss_xy1 - entropy * ent_coef + vf_coef * vf_loss

    self.grads_check_xy1 = grads_xy1 = tf.gradients(
        train_loss_xy1, params_xy1)
    if max_grad_norm is not None:
      grads_xy1, _ = tf.clip_by_global_norm(grads_xy1, max_grad_norm)

    grads_xy1 = list(zip(grads_xy1, params_xy1))
    trainer_xy1 = tf.train.RMSPropOptimizer(
        learning_rate=lr, decay=alpha, epsilon=epsilon)
    _train_xy1 = trainer_xy1.apply_gradients(grads_xy1)

    self.lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)

    def train(obs, states, td_targets, masks, actions, xy0, xy1, values):
      advs = td_targets - values
      for step in range(len(obs)):
        cur_lr = self.lr.value()

      td_map = {
          train_model.X: obs,
          A: actions,
          XY0: xy0,
          XY1: xy1,
          ADV: advs,
          TD_TARGET: td_targets,
          PG_LR: cur_lr
      }
      if states != []:
        td_map[train_model.S] = states
        td_map[train_model.M] = masks

      policy_loss, value_loss, policy_entropy, _, \
      policy_loss_xy0, policy_entropy_xy0, _, \
      policy_loss_xy1, policy_entropy_xy1, _ = sess.run(
          [pg_loss, vf_loss, entropy, _train,
           pg_loss_xy0, entropy_xy0, _train_xy0,
           pg_loss_xy1, entropy_xy1, _train_xy1],
          td_map)
      return policy_loss, value_loss, policy_entropy, \
             policy_loss_xy0, policy_entropy_xy0, \
             policy_loss_xy1, policy_entropy_xy1

    def save(save_path):
      ps = sess.run(params)
      joblib.dump(ps, save_path)

    def load(load_path):
      loaded_params = joblib.load(load_path)
      restores = []
      for p, loaded_p in zip(params, loaded_params):
        restores.append(p.assign(loaded_p))
      sess.run(restores)

    self.train = train
    self.save = save
    self.load = load
    self.train_model = train_model
    self.step_model = step_model
    self.step = step_model.step
    self.value = step_model.value
    self.initial_state = step_model.initial_state
    print("global_variables_initializer start")
    tf.global_variables_initializer().run(session=sess)
    print("global_variables_initializer complete")