def _init(self,
              ob_space,
              ac_space,
              hid_size_phi,
              num_hid_layers_phi,
              dim_phi,
              gaussian_fixed_var=True):
        """

        input: ob, T_ac established as placeholder

        output: ac, ob_next

        """
        assert isinstance(ob_space, gym.spaces.Box)
        self._ob_space = ob_space

        self.pdtype = pdtype = make_pdtype(ac_space)
        sequence_length = None

        ob = U.get_placeholder(name="ob",
                               dtype=tf.float32,
                               shape=[sequence_length] + list(ob_space.shape))

        # normalize obs and clip them
        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(shape=ob_space.shape)
        obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0,
                               5.0)
        last_out = obz

        # define phi: shared hidden layers for a common representation,
        # with a final projection to dim_phi
        hid_size_list = [hid_size_phi] * num_hid_layers_phi + [dim_phi]
        for i, hid_size in enumerate(hid_size_list):
            last_out = tf.nn.tanh(
                dense(last_out,
                      hid_size,
                      "phi%i" % (i + 1),
                      weight_init=U.normc_initializer(1.0)))

        self.phi = phi = last_out
        self._featurize = U.function([ob], [phi])

        # define v^pi
        self.vpred = dense(phi,
                           1,
                           "vf_final",
                           weight_init=U.normc_initializer(1.0))[:, 0]

        # define pi(a|s)
        if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
            # continuous action
            mean = dense(phi,
                         pdtype.param_shape()[0] // 2, "pi_final_mu",
                         U.normc_initializer(0.01))
            logstd = tf.get_variable(name="pi_final_logstd",
                                     shape=[1, pdtype.param_shape()[0] // 2],
                                     initializer=tf.zeros_initializer())
            pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
        else:
            # discrete action
            pdparam = dense(phi,
                            pdtype.param_shape()[0], "pi_final",
                            U.normc_initializer(0.01))
        self.pi = pi = pdtype.pdfromflat(pdparam)

        pi_stochastic = U.get_placeholder(name="pi_stochastic",
                                          dtype=tf.bool,
                                          shape=())
        self.ac = ac = U.switch(pi_stochastic, pi.sample(), pi.mode())
        self._act = U.function([pi_stochastic, ob],
                               [ac, self.vpred, pi.logits])

        # define T(s'|s, a) and a~pi(a|s)
        self.ob_next_pdtype = ob_next_pdtype = make_pdtype(ob_space)

        if isinstance(ac_space, gym.spaces.Box):
            # if continuous action
            T_ac = U.get_placeholder(name="T_ac",
                                     dtype=tf.float32,
                                     shape=[sequence_length] +
                                     [pdtype.param_shape()[0] // 2])
        else:
            # if discrete action
            T_ac = U.get_placeholder(name="T_ac",
                                     dtype=tf.float32,
                                     shape=[sequence_length] +
                                     list(ac_space.shape))

        T_input = tf.concat([phi, tf.expand_dims(T_ac, 1)], axis=1)
        T_mean = dense(T_input, ob_space.shape[0], "T_final_mu",
                       U.normc_initializer(0.01))
        T_logstd = tf.get_variable(name="T_final_logstd",
                                   shape=[1, ob_space.shape[0]],
                                   initializer=tf.zeros_initializer())
        T_pdparam = tf.concat([T_mean, T_mean * 0.0 + T_logstd], axis=1)

        self.T = T = ob_next_pdtype.pdfromflat(T_pdparam)
        T_stochastic = U.get_placeholder(name="T_stochastic",
                                         dtype=tf.bool,
                                         shape=())
        self.ob_next = ob_next = U.switch(T_stochastic, T.sample(), T.mode())
        self._predict_ob_next = U.function([T_stochastic, ob, T_ac], [ob_next])
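A minimal usage sketch of the functions this _init wires up, assuming it is a method of a policy/transition-network class; the instance net, its constructor, and ob_space (a gym Box space) are illustrative and not defined in the code above:

import numpy as np

# net is a hypothetical instance of the class that defines this _init
ob = ob_space.sample()[None].astype(np.float32)   # batch of one observation

ac, vpred, logits = net._act(True, ob)            # True: sample an action from pi(a|s)
phi, = net._featurize(ob)                         # shared representation phi(s)
ob_next, = net._predict_ob_next(True, ob, ac)     # sample s' ~ T(s'|s, a)
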
    def _init(self,
              ob_space,
              ac_space,
              hid_size,
              num_hid_layers,
              gaussian_fixed_var=True):
        assert isinstance(ob_space, gym.spaces.Box)

        self.pdtype = pdtype = make_pdtype(ac_space)
        sequence_length = None

        ob = U.get_placeholder(name="ob",
                               dtype=tf.float32,
                               shape=[sequence_length] + list(ob_space.shape))

        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(shape=ob_space.shape)

        obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0,
                               5.0)
        last_out = obz
        for i in range(num_hid_layers):
            last_out = tf.nn.tanh(
                dense(last_out,
                      hid_size,
                      "vffc%i" % (i + 1),
                      weight_init=U.normc_initializer(1.0)))
        self.vpred = dense(last_out,
                           1,
                           "vffinal",
                           weight_init=U.normc_initializer(1.0))[:, 0]

        last_out = obz
        for i in range(num_hid_layers):
            last_out = tf.nn.tanh(
                dense(last_out,
                      hid_size,
                      "polfc%i" % (i + 1),
                      weight_init=U.normc_initializer(1.0)))

        if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
            mean = dense(last_out,
                         pdtype.param_shape()[0] // 2, "polfinal",
                         U.normc_initializer(0.01))
            logstd = tf.get_variable(name="logstd",
                                     shape=[1, pdtype.param_shape()[0] // 2],
                                     initializer=tf.zeros_initializer())
            pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
        else:
            pdparam = dense(last_out,
                            pdtype.param_shape()[0], "polfinal",
                            U.normc_initializer(0.01))

        self.pd = pdtype.pdfromflat(pdparam)

        self.state_in = []
        self.state_out = []

        # stochastic/deterministic action switch (added for behavior cloning)
        stochastic = U.get_placeholder(name="stochastic",
                                       dtype=tf.bool,
                                       shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
        self.ac = ac
        self._act = U.function([stochastic, ob],
                               [ac, self.vpred, self.pd.logits])
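
The stochastic switch above is what behavior cloning toggles between training and evaluation; a tiny illustrative call, where the policy instance pi and observation ob are placeholders:

ac_sampled, vpred, _ = pi._act(True, ob[None])   # sample an action from pi(a|s)
ac_mode, vpred, _ = pi._act(False, ob[None])     # deterministic: distribution mode
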
def learn_original(pi,
                   dataset,
                   env_name,
                   n_action,
                   prefix,
                   traj_lim,
                   seed,
                   optim_batch_size=128,
                   max_iters=5e3,
                   adam_epsilon=1e-4,
                   optim_stepsize=1e-4,
                   ckpt_dir=None,
                   plot_dir=None,
                   task_name=None,
                   verbose=False):
    """
    learn without regularization
    """
    # custom hyperparams (note: these override the seed and max_iters arguments)
    seed = 0
    max_iters = 5e4

    val_per_iter = int(max_iters / 10)
    # placeholder
    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([None])
    stochastic = U.get_placeholder_cached(name="stochastic")
    loss = tf.reduce_mean(tf.square(tf.to_float(ac - pi.ac)))
    var_list = pi.get_trainable_variables()
    adam = MpiAdam(var_list, epsilon=adam_epsilon)
    lossandgrad = U.function([ob, ac, stochastic],
                             [loss] + [U.flatgrad(loss, var_list)])

    U.initialize()
    adam.sync()
    logger.log("Training a policy with Behavior Cloning")
    logger.log("with {} trajs, {} steps".format(dataset.num_traj,
                                                dataset.num_transition))

    loss_history = {}
    loss_history["train_action_loss"] = []
    loss_history["val_action_loss"] = []

    for iter_so_far in tqdm(range(int(max_iters))):
        ob_expert, ac_expert, _, _ = dataset.get_next_batch(
            optim_batch_size, 'train')
        train_loss, g = lossandgrad(ob_expert, ac_expert, True)
        adam.update(g, optim_stepsize)
        if verbose and iter_so_far % val_per_iter == 0:
            ob_expert, ac_expert, _, _ = dataset.get_next_batch(-1, 'val')
            val_loss, _ = lossandgrad(ob_expert, ac_expert, True)
            logger.log("Training loss: {}, Validation loss: {}".format(
                train_loss, val_loss))

            loss_history["train_action_loss"].append(train_loss)
            loss_history["val_action_loss"].append(val_loss)

    plot(env_name, loss_history, traj_lim, plot_dir)

    if ckpt_dir is None:
        savedir_fname = tempfile.TemporaryDirectory().name
    else:
        os.makedirs(ckpt_dir, exist_ok=True)
        ckpt_fname = "ckpt.bc.{}.{}".format(traj_lim, seed)
        savedir_fname = osp.join(ckpt_dir, ckpt_fname)
    U.save_state(savedir_fname, var_list=pi.get_variables())
    return savedir_fname
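
A sketch of how learn_original might be wired up; the policy class, dataset object, and environment name below are stand-ins and are not defined in the code above:

# hypothetical setup: MlpPolicy is whatever class wraps the second _init above,
# and dataset must expose num_traj, num_transition, and get_next_batch()
pi = MlpPolicy(name="pi", ob_space=env.observation_space,
               ac_space=env.action_space, hid_size=64, num_hid_layers=2)

ckpt_path = learn_original(pi, dataset, env_name="Hopper-v2", n_action=None,
                           prefix="bc", traj_lim=traj_lim, seed=0,
                           ckpt_dir="./checkpoints", plot_dir="./plots",
                           verbose=True)
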
def build_train(make_obs_ph,
                make_acs_ph,
                optimizer,
                mu_func,
                phi_sa_dim,
                scope,
                reuse,
                grad_norm_clipping=None,
                gamma=1.0,
                double_q=True,
                mu_stochastic=True,
                param_noise=False,
                param_noise_filter_func=None):
    with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):

        obs_t_input = make_obs_ph("obs_t")
        act_t_input = make_acs_ph("act_t")
        phi_sa_t_ph = tf.placeholder(tf.float32, [None, phi_sa_dim],
                                     name="phi_sa")
        obs_tp1_input = make_obs_ph("obs_tp1")
        act_tp1_input = make_acs_ph("act_tp1")
        done_mask_ph = tf.placeholder(tf.float32, [None], name="done")
        importance_weights_ph = tf.placeholder(tf.float32, [None],
                                               name="weight")
        mu_stochastic_ph = U.get_placeholder(name="mu_stochastic",
                                             dtype=tf.bool,
                                             shape=())

        mu_t_est = mu_func(mu_stochastic_ph, obs_t_input.get(),
                           act_t_input.get())

        mu_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                         scope=tf.get_variable_scope().name)

        mu_tp1_est = mu_func(mu_stochastic_ph, obs_tp1_input.get(),
                             act_tp1_input.get())

        target_mu_func_vars = tf.get_collection(
            tf.GraphKeys.GLOBAL_VARIABLES, scope=tf.get_variable_scope().name)

        mask = tf.expand_dims(1.0 - done_mask_ph, axis=1)
        mu_tp1_est_masked = tf.multiply(mu_tp1_est, mask)

        mu_t_target = phi_sa_t_ph + gamma * mu_tp1_est_masked

        td_error = mu_t_est - tf.stop_gradient(mu_t_target)
        td_error = tf.reduce_sum(tf.square(td_error), 1)

        errors = td_error
        weighted_error = tf.reduce_mean(importance_weights_ph * errors)

        if grad_norm_clipping is not None:
            gradients = optimizer.compute_gradients(weighted_error,
                                                    var_list=mu_func_vars)
            for i, (grad, var) in enumerate(gradients):
                if grad is not None:
                    gradients[i] = (tf.clip_by_norm(grad,
                                                    grad_norm_clipping), var)
            optimize_expr = optimizer.apply_gradients(gradients)
        else:
            optimize_expr = optimizer.minimize(weighted_error,
                                               var_list=mu_func_vars)

        update_target_expr = []
        for var, var_target in zip(
                sorted(mu_func_vars, key=lambda v: v.name),
                sorted(target_mu_func_vars, key=lambda v: v.name)):
            update_target_expr.append(var_target.assign(var))
        update_target_expr = tf.group(*update_target_expr)

        train = U.function(inputs=[
            mu_stochastic_ph, obs_t_input, act_t_input, phi_sa_t_ph,
            obs_tp1_input, act_tp1_input, done_mask_ph, importance_weights_ph
        ],
                           outputs=td_error,
                           updates=[optimize_expr])

        update_target = U.function([], [], updates=[update_target_expr])

        mu_estimator = U.function([mu_stochastic_ph, obs_t_input, act_t_input],
                                  mu_t_est)

        return mu_estimator, train, update_target
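
The quantity regressed in build_train is a feature-expectation (successor-feature-style) Bellman target, mu(s,a) = phi(s,a) + gamma * (1 - done) * mu(s',a'). A small NumPy sketch of the same target and loss, with illustrative shapes:

import numpy as np

batch, phi_dim, gamma = 4, 8, 0.99
phi_sa_t   = np.random.randn(batch, phi_dim)    # phi(s_t, a_t)
mu_t_est   = np.random.randn(batch, phi_dim)    # network estimate mu(s_t, a_t)
mu_tp1_est = np.random.randn(batch, phi_dim)    # bootstrap estimate mu(s_{t+1}, a_{t+1})
done       = np.array([0., 0., 1., 0.])         # episode-termination flags

mask = (1.0 - done)[:, None]                    # zero the bootstrap term at terminals
mu_t_target = phi_sa_t + gamma * mu_tp1_est * mask

td_error = mu_t_est - mu_t_target               # target treated as a constant (stop_gradient)
per_sample_loss = np.sum(td_error ** 2, axis=1) # matches tf.reduce_sum(tf.square(td_error), 1)
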
def learn(network,
          dataset,
          env_name,
          n_action,
          prefix,
          traj_lim,
          seed,
          optim_batch_size=32,
          max_iters=1e4,
          adam_epsilon=1e-4,
          optim_stepsize=3e-4,
          ckpt_dir=None,
          plot_dir=None,
          task_name=None,
          verbose=False):
    """
    learn with regularization
    """
    # custom hyperparams (seed overrides the argument)
    seed = 0
    alpha = 0.7  # weight on the action (cross-entropy) loss
    beta = 1.0   # weight on the transition (regularization) loss

    pi = network.pi
    T = network.T

    val_per_iter = int(max_iters / 20)

    ob = U.get_placeholder_cached(name="ob")
    T_ac = U.get_placeholder_cached(name="T_ac")
    pi_stochastic = U.get_placeholder_cached(name="pi_stochastic")
    T_stochastic = U.get_placeholder_cached(name="T_stochastic")

    ac = network.pdtype.sample_placeholder([None])
    ob_next = network.ob_next_pdtype.sample_placeholder([None])

    onehot_ac = tf.one_hot(ac, depth=n_action)
    ce_loss = tf.losses.softmax_cross_entropy(logits=pi.logits,
                                              onehot_labels=onehot_ac)

    ce_loss = tf.reduce_mean(ce_loss)

    reg_loss = tf.reduce_mean(tf.square(tf.to_float(ob_next -
                                                    network.ob_next)))

    losses = [ce_loss, reg_loss]

    total_loss = alpha * ce_loss + beta * reg_loss

    var_list = network.get_trainable_variables()
    adam = MpiAdam(var_list, epsilon=adam_epsilon)
    lossandgrad = U.function(
        [ob, ac, T_ac, ob_next, pi_stochastic, T_stochastic],
        losses + [U.flatgrad(total_loss, var_list)])

    U.initialize()
    adam.sync()
    logger.log("Training a policy with Behavior Cloning")
    logger.log("with {} trajs, {} steps".format(dataset.num_traj,
                                                dataset.num_transition))

    loss_history = {}
    loss_history["train_action_loss"] = []
    loss_history["train_transition_loss"] = []
    loss_history["val_action_loss"] = []
    loss_history["val_transition_loss"] = []

    for iter_so_far in tqdm(range(int(max_iters))):
        #ob_expert, ac_expert = dataset.get_next_batch(optim_batch_size, 'train')
        ob_expert, ac_expert, ob_next_expert, info = dataset.get_next_batch(
            optim_batch_size, 'train')
        train_loss_ce, train_loss_reg, g = lossandgrad(ob_expert, ac_expert,
                                                       ac_expert,
                                                       ob_next_expert, True,
                                                       True)
        adam.update(g, optim_stepsize)
        if verbose and iter_so_far % val_per_iter == 0:
            #ob_expert, ac_expert = dataset.get_next_batch(-1, 'val')
            ob_expert, ac_expert, ob_next_expert, info = dataset.get_next_batch(
                -1, 'val')

            val_loss_ce, val_loss_reg, _ = lossandgrad(ob_expert, ac_expert,
                                                       ac_expert,
                                                       ob_next_expert, True,
                                                       True)
            items = [train_loss_ce, train_loss_reg, val_loss_ce, val_loss_reg]
            logger.log("Training Action loss: {}\n" \
                       "Training Transition loss: {}\n" \
                       "Validation Action loss: {}\n" \
                       "Validation Transition Loss:{}\n".format(*items))
            loss_history["train_action_loss"].append(train_loss_ce)
            loss_history["train_transition_loss"].append(train_loss_reg)
            loss_history["val_action_loss"].append(val_loss_ce)
            loss_history["val_transition_loss"].append(val_loss_reg)

            #if len(loss_history["val_action_loss"]) > 1:
            #    val_loss_ce_delta = loss_history["val_action_loss"][-1] - val_loss_ce
            #    if np.abs(val_loss_ce_delta) < val_stop_threshold:
            #        logger.log("validation error seems to have converged.")
            #        break

    plot(env_name, loss_history, traj_lim, plot_dir)

    if ckpt_dir is None:
        savedir_fname = tempfile.TemporaryDirectory().name
    else:
        os.makedirs(ckpt_dir, exist_ok=True)
        ckpt_fname = "ckpt.bc.{}.{}".format(traj_lim, seed)
        savedir_fname = osp.join(ckpt_dir, ckpt_fname)
    U.save_state(savedir_fname, var_list=network.get_variables())
    return savedir_fname
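
A sketch of how learn pairs with the phi/transition network built by the first _init; the wrapper class PhiNetwork and the env/dataset objects are illustrative, and n_action implies a discrete action space since the loss one-hot encodes actions:

# hypothetical setup: PhiNetwork is whatever class wraps the first _init above
network = PhiNetwork(name="bc_reg", ob_space=env.observation_space,
                     ac_space=env.action_space, hid_size_phi=64,
                     num_hid_layers_phi=2, dim_phi=32)

ckpt_path = learn(network, dataset, env_name="CartPole-v1",
                  n_action=env.action_space.n, prefix="bc_reg",
                  traj_lim=traj_lim, seed=0, ckpt_dir="./checkpoints",
                  plot_dir="./plots", verbose=True)
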