Example No. 1
    def fit(self, D, pi):
        """estimate xi to compute mu

        assuminng action-value function mu(s, a)
        is linearly parametrized by xi
        such that mu(s, a) = Q_phi(s, a) = xi^T psi(s)

        Parameters
        ----------
        pi : Policy
            policy to evaluate

        Returns
        -------
        xi_hat = xi_hat

        TODO
        - vectorize this
        - phi(s, a) or phi(s) when to use
        - what phi or psi to use?
        - check dimensionality of everytthing


        """
        self._D = D

        s_next = self._D["s_next"]
        absorb = self._D["done"]
        phi_sa = self._D["phi_sa"]

        psi_sa = self._D["psi_sa"]
        self._psi = self._D["psi_fn"]

        a_next = [
            pi.act(self._stochastic, s[np.newaxis, ...])[0] for s in s_next
        ]

        psi_sa_next = self._psi(s_next, a_next)
        # zero out successor features at absorbing (terminal) states
        psi_sa_next[absorb.flatten().astype(bool), :] = 0

        # LSTD normal equations:
        #   A_hat = Psi^T (Psi - gamma * Psi') + eps * I
        #   b_hat = Psi^T Phi
        psi_delta = psi_sa - self._gamma * psi_sa_next
        A_hat = psi_sa.T.dot(psi_delta) + self._lstd_eps * np.identity(self._q)
        b_hat = psi_sa.T.dot(phi_sa)

        # A_hat is (q, q); it is invertible iff it has full rank q
        rank = matrix_rank(A_hat)
        if rank == self._q:
            xi_hat = solve(A_hat, b_hat)
        else:
            logger.log("condition number of A_hat\n{}".format(cond(A_hat)))
            logger.log("A_hat is not full rank: {} < {}".format(rank, self._q))
            xi_hat = lstsq(A_hat, b_hat)[0]

        self._xi_hat = xi_hat
        return xi_hat
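
As a reference for the linear system solved above, here is a minimal, self-contained sketch on synthetic data (the dimensions and random inputs are purely illustrative, not taken from the original code):

import numpy as np

# Illustrative dimensions: n transitions, q = dim(psi), p = dim(phi).
rng = np.random.default_rng(0)
n, q, p, gamma, eps = 200, 8, 8, 0.99, 1e-3

psi_sa = rng.normal(size=(n, q))        # psi(s, a)
psi_sa_next = rng.normal(size=(n, q))   # psi(s', pi(s')), zeroed at absorbing states
phi_sa = rng.normal(size=(n, p))        # phi(s, a)

# A xi = b with A = Psi^T (Psi - gamma * Psi') + eps * I and b = Psi^T Phi
A_hat = psi_sa.T @ (psi_sa - gamma * psi_sa_next) + eps * np.eye(q)
b_hat = psi_sa.T @ phi_sa
xi_hat = np.linalg.solve(A_hat, b_hat)  # (q, p) parameter matrix
mu_hat = psi_sa @ xi_hat                # estimated mu(s, a) for the batch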
Example No. 2
    def fit(self, phi_sa, phi_sa_next, r):
        """TODO: Docstring for learn.

        assuminng action-value function Q(s,a)
        is linearly parametrized by W
        such that Q = W^T phi(s)

        Parameters
        ----------

        Returns
        -------
        TODO
        this is LSTD_Q
        check dimensionality of everything

        """
        gamma = self._gamma
        A_hat, b_hat = fast_solve(phi_sa, phi_sa_next, r, gamma)

        rank = matrix_rank(A_hat)
        if rank == self._p:
            W_hat = solve(A_hat, b_hat)
        else:
            logger.log("condition number of A_hat\n{}".format(cond(A_hat)))
            logger.log("A_hat is not full rank {} < {}".format(rank, self._p))
            W_hat = lstsq(A_hat, b_hat)[0]

        self._W_hat = W_hat
        return W_hat
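
fast_solve is not shown here; as a point of reference only, a standard LSTD-Q system built from the same inputs would look roughly like the sketch below (the helper name, the ridge term, and the shapes are assumptions, not the original implementation):

import numpy as np

def lstdq_system(phi_sa, phi_sa_next, r, gamma, eps=1e-3):
    # Hypothetical helper: assemble the LSTD-Q system A w = b with
    # A = Phi^T (Phi - gamma * Phi') + eps * I and b = Phi^T r.
    p = phi_sa.shape[1]
    A_hat = phi_sa.T @ (phi_sa - gamma * phi_sa_next) + eps * np.eye(p)
    b_hat = phi_sa.T @ r
    return A_hat, b_hat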
Example No. 3
def train_bc(task, params, ob_space, ac_space, args, env):
    task_path = os.path.join(root_path, "task", args.task)
    plot_path = os.path.join(task_path, "result")

    dataset = GymDataset(expert_path=args.expert_path,
                         traj_limitation=args.traj_limitation)

    U.make_session(num_cpu=1).__enter__()
    set_global_seeds(args.seed)

    def policy_fn(name, ob_space, ac_space, reuse=False):
        return mlp_policy.MlpPolicy(name=name,
                                    ob_space=ob_space,
                                    ac_space=ac_space,
                                    reuse=reuse,
                                    hid_size_phi=args.policy_hidden_size,
                                    num_hid_layers_phi=2,
                                    dim_phi=args.dim_phi)

    env_name = task["env_id"]
    name = "pi.{}.{}".format(env_name.lower().split("-")[0],
                             args.traj_limitation)
    pi = policy_fn(name, ob_space, ac_space)
    n_action = env.action_space.n

    fname = "ckpt.bc.{}.{}".format(args.traj_limitation, args.seed)
    savedir_fname = osp.join(args.checkpoint_dir, fname, fname)

    if not os.path.exists(savedir_fname + ".index"):
        savedir_fname = learn(pi,
                              dataset,
                              env_name,
                              n_action,
                              prefix="bc",
                              seed=args.seed,
                              traj_lim=args.traj_limitation,
                              max_iters=args.BC_max_iter,
                              ckpt_dir=osp.join(args.checkpoint_dir, fname),
                              plot_dir=plot_path,
                              task_name=task["env_id"],
                              verbose=True)
        logger.log(savedir_fname + " saved")


#    avg_len, avg_ret = run_gym(env,
#                               policy_fn,
#                               savedir_fname,
#                               timesteps_per_batch=args.horizon,
#                               number_trajs=10,
#                               stochastic_policy=args.stochastic_policy,
#                               save=args.save_sample,
#                               reuse=True)
#
#
    return savedir_fname
Example No. 4
    def log_info(self):
        logger.log("Total trajectories: %d" % self.num_traj)
        logger.log("Total transitions: %d" % self.num_transition)
        logger.log("Average returns: %f" % self.avg_ret)
        logger.log("Std of returns: %f" % self.std_ret)
Example No. 5
    def _train(self):

        self._buffer_list = []
        self._beta_schedule_list = []
        if self._prioritized_replay:
            self._rb = PrioritizedReplayBufferNextAction(
                self._n_train, alpha=self._prioritized_replay_alpha)
            if self._prioritized_replay_beta_iters is None:
                self._prioritized_replay_beta_iters = self._max_timesteps
            self._bs = LinearSchedule(self._prioritized_replay_beta_iters,
                                      initial_p=self._prioritized_replay_beta0,
                                      final_p=1.0)
        else:
            self._rb = ReplayBufferNextAction(self._n_train)

        D_train_zipped = zip(self._D_train["s"], self._D_train["a"],
                             self._D_train["phi_sa"], self._D_train["s_next"],
                             self._D_train["done"])
        for (s, a, phi_sa, s_next, done) in D_train_zipped:

            a_next = self._pi.act(self._mu_stochastic, s_next[np.newaxis,
                                                              ...])[0]
            self._rb.add(s, a, phi_sa.flatten(), s_next, a_next, float(done))

        phi_sa_val = self._D_val["phi_sa"]
        s_val = self._D_val["s"]
        a_val = self._D_val["a"]
        s_next_val = self._D_val["s_next"]

        a_next_val = self._pi.act(self._mu_stochastic, s_next_val)[0]
        a_next_val = a_next_val[..., np.newaxis]

        sess = tf.Session()
        sess.__enter__()

        def make_obs_ph(name):
            return BatchInput(self._obs_shape, name=name)

        def make_acs_ph(name):
            return BatchInput(self._acs_shape, name=name)

        tools = build_train(
            make_obs_ph=make_obs_ph,
            make_acs_ph=make_acs_ph,
            optimizer=tf.train.AdamOptimizer(learning_rate=self._lr),
            mu_func=self._model,
            phi_sa_dim=self._mu_dim,
            grad_norm_clipping=self._grad_norm_clipping,
            gamma=self._gamma,
            scope=self._scope_name,
            reuse=True)

        mu_estimator, train, update_target = tools

        self._timestep = int(self._exploration_fraction * self._max_timesteps)

        U.initialize()
        update_target()

        for t in itertools.count():
            if self._prioritized_replay:
                experience = self._rb.sample(self._buffer_batch_size,
                                             beta=self._bs.value(t + 1))
                (s, a, phi_sa, s_next, a_next, dones, weights,
                 batch_idxes) = experience
            else:
                s, a, phi_sa, s_next, a_next, dones = self._rb.sample(
                    self._buffer_batch_size)
                weights, batch_idxes = np.ones(self._buffer_batch_size), None

            if len(a_next.shape) == 1:
                a_next = np.expand_dims(a_next, axis=1)

            td_errors = train(self._mu_stochastic, s, a, phi_sa, s_next,
                              a_next, dones, weights)

            if self._prioritized_replay:
                new_priorities = np.abs(
                    td_errors) + self._prioritized_replay_eps
                self._rb.update_priorities(batch_idxes, new_priorities)

            if t % self._target_network_update_freq == 0:
                #sys.stdout.flush()
                #sys.stdout.write("average training td_errors: {}".format(td_errors.mean()))
                logger.log("average training td_errors: {}".format(
                    td_errors.mean()))
                update_target()

            if t % self._evaluation_freq == 0:
                logger.log("been trained {} steps".format(t))

                mu_est_val = mu_estimator(self._mu_stochastic, s_val, a_val)
                mu_target_val = phi_sa_val + self._gamma * mu_estimator(
                    self._mu_stochastic, s_next_val, a_next_val)
                # average over rows and cols
                td_errors_val = np.mean((mu_est_val - mu_target_val)**2)

                if td_errors_val < self._delta:
                    logger.log(
                        "mean validation td_errors: {}".format(td_errors_val))
                    break

            if t > self._max_timesteps:
                break

        self._mu_estimator = mu_estimator
        return mu_estimator
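
The validation error above is the mean squared TD error for successor features, with target phi(s, a) + gamma * mu(s', a'); a small NumPy illustration of that computation (shapes and inputs are made up):

import numpy as np

rng = np.random.default_rng(0)
n, d, gamma = 64, 16, 0.99

phi_sa = rng.normal(size=(n, d))    # phi(s, a) for the validation batch
mu_next = rng.normal(size=(n, d))   # mu_estimator output for (s', a')
mu_est = rng.normal(size=(n, d))    # mu_estimator output for (s, a)

# TD target for successor features: mu(s, a) ~= phi(s, a) + gamma * mu(s', a')
mu_target = phi_sa + gamma * mu_next
td_error_val = np.mean((mu_est - mu_target) ** 2)  # scalar validation error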
Example No. 6
def learn_original(pi,
                   dataset,
                   env_name,
                   n_action,
                   prefix,
                   traj_lim,
                   seed,
                   optim_batch_size=128,
                   max_iters=5e3,
                   adam_epsilon=1e-4,
                   optim_stepsize=1e-4,
                   ckpt_dir=None,
                   plot_dir=None,
                   task_name=None,
                   verbose=False):
    """
    learn without regularization
    """
    # custom hyperparams
    seed = 0
    max_iters = 5e4

    val_per_iter = int(max_iters / 10)
    # placeholder
    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([None])
    stochastic = U.get_placeholder_cached(name="stochastic")
    loss = tf.reduce_mean(tf.square(tf.to_float(ac - pi.ac)))
    var_list = pi.get_trainable_variables()
    adam = MpiAdam(var_list, epsilon=adam_epsilon)
    lossandgrad = U.function([ob, ac, stochastic],
                             [loss] + [U.flatgrad(loss, var_list)])

    U.initialize()
    adam.sync()
    logger.log("Training a policy with Behavior Cloning")
    logger.log("with {} trajs, {} steps".format(dataset.num_traj,
                                                dataset.num_transition))

    loss_history = {}
    loss_history["train_action_loss"] = []
    loss_history["val_action_loss"] = []

    for iter_so_far in tqdm(range(int(max_iters))):
        ob_expert, ac_expert, _, _ = dataset.get_next_batch(
            optim_batch_size, 'train')
        train_loss, g = lossandgrad(ob_expert, ac_expert, True)
        adam.update(g, optim_stepsize)
        if verbose and iter_so_far % val_per_iter == 0:
            ob_expert, ac_expert, _, _ = dataset.get_next_batch(-1, 'val')
            val_loss, _ = lossandgrad(ob_expert, ac_expert, True)
            logger.log("Training loss: {}, Validation loss: {}".format(
                train_loss, val_loss))

            loss_history["train_action_loss"].append(train_loss)
            loss_history["val_action_loss"].append(val_loss)

    plot(env_name, loss_history, traj_lim, plot_dir)

    if ckpt_dir is None:
        savedir_fname = tempfile.TemporaryDirectory().name
    else:
        os.makedirs(ckpt_dir, exist_ok=True)
        ckpt_fname = "ckpt.bc.{}.{}".format(traj_lim, seed)
        savedir_fname = osp.join(ckpt_dir, ckpt_fname)
    U.save_state(savedir_fname, var_list=pi.get_variables())
    return savedir_fname
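
The objective minimized above is the mean squared error between expert actions and the policy's actions; the sketch below reproduces that loss and its gradient step with a plain linear model in NumPy (purely illustrative, not the MlpPolicy or MpiAdam used above):

import numpy as np

rng = np.random.default_rng(0)
n, obs_dim, act_dim, lr = 512, 10, 2, 1e-2

ob_expert = rng.normal(size=(n, obs_dim))
W_true = rng.normal(size=(obs_dim, act_dim))
ac_expert = ob_expert @ W_true              # stand-in for expert actions

W = np.zeros((obs_dim, act_dim))            # linear "policy" parameters
for _ in range(500):
    err = ob_expert @ W - ac_expert
    loss = np.mean(err ** 2)                # same form as tf.reduce_mean(tf.square(ac - pi.ac))
    grad = 2.0 * ob_expert.T @ err / err.size
    W -= lr * grad                          # plain gradient step instead of MpiAdam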
Example No. 7
def learn(network,
          dataset,
          env_name,
          n_action,
          prefix,
          traj_lim,
          seed,
          optim_batch_size=32,
          max_iters=1e4,
          adam_epsilon=1e-4,
          optim_stepsize=3e-4,
          ckpt_dir=None,
          plot_dir=None,
          task_name=None,
          verbose=False):
    """
    learn with regularization
    """
    seed = 0
    alpha = 0.7
    beta = 1.0

    pi = network.pi
    T = network.T

    val_per_iter = int(max_iters / 20)

    ob = U.get_placeholder_cached(name="ob")
    T_ac = U.get_placeholder_cached(name="T_ac")
    pi_stochastic = U.get_placeholder_cached(name="pi_stochastic")
    T_stochastic = U.get_placeholder_cached(name="T_stochastic")

    ac = network.pdtype.sample_placeholder([None])
    ob_next = network.ob_next_pdtype.sample_placeholder([None])

    onehot_ac = tf.one_hot(ac, depth=n_action)
    ce_loss = tf.losses.softmax_cross_entropy(logits=pi.logits,
                                              onehot_labels=onehot_ac)

    ce_loss = tf.reduce_mean(ce_loss)

    reg_loss = tf.reduce_mean(tf.square(tf.to_float(ob_next -
                                                    network.ob_next)))

    losses = [ce_loss, reg_loss]

    total_loss = alpha * ce_loss + beta * reg_loss

    var_list = network.get_trainable_variables()
    adam = MpiAdam(var_list, epsilon=adam_epsilon)
    lossandgrad = U.function(
        [ob, ac, T_ac, ob_next, pi_stochastic, T_stochastic],
        losses + [U.flatgrad(total_loss, var_list)])

    U.initialize()
    adam.sync()
    logger.log("Training a policy with Behavior Cloning")
    logger.log("with {} trajs, {} steps".format(dataset.num_traj,
                                                dataset.num_transition))

    loss_history = {}
    loss_history["train_action_loss"] = []
    loss_history["train_transition_loss"] = []
    loss_history["val_action_loss"] = []
    loss_history["val_transition_loss"] = []

    for iter_so_far in tqdm(range(int(max_iters))):
        #ob_expert, ac_expert = dataset.get_next_batch(optim_batch_size, 'train')
        ob_expert, ac_expert, ob_next_expert, info = dataset.get_next_batch(
            optim_batch_size, 'train')
        train_loss_ce, train_loss_reg, g = lossandgrad(ob_expert, ac_expert,
                                                       ac_expert,
                                                       ob_next_expert, True,
                                                       True)
        adam.update(g, optim_stepsize)
        if verbose and iter_so_far % val_per_iter == 0:
            #ob_expert, ac_expert = dataset.get_next_batch(-1, 'val')
            ob_expert, ac_expert, ob_next_expert, info = dataset.get_next_batch(
                -1, 'val')

            val_loss_ce, val_loss_reg, _ = lossandgrad(ob_expert, ac_expert,
                                                       ac_expert,
                                                       ob_next_expert, True,
                                                       True)
            items = [train_loss_ce, train_loss_reg, val_loss_ce, val_loss_reg]
            logger.log("Training Action loss: {}\n" \
                       "Training Transition loss: {}\n" \
                       "Validation Action loss: {}\n" \
                       "Validation Transition Loss:{}\n".format(*items))
            loss_history["train_action_loss"].append(train_loss_ce)
            loss_history["train_transition_loss"].append(train_loss_reg)
            loss_history["val_action_loss"].append(val_loss_ce)
            loss_history["val_transition_loss"].append(val_loss_reg)

            #if len(loss_history["val_action_loss"]) > 1:
            #    val_loss_ce_delta = loss_history["val_action_loss"][-1] - val_loss_ce
            #    if np.abs(val_loss_ce_delta) < val_stop_threshold:
            #        logger.log("validation error seems to have converged.")
            #        break

    plot(env_name, loss_history, traj_lim, plot_dir)

    if ckpt_dir is None:
        savedir_fname = tempfile.TemporaryDirectory().name
    else:
        os.makedirs(ckpt_dir, exist_ok=True)
        ckpt_fname = "ckpt.bc.{}.{}".format(traj_lim, seed)
        savedir_fname = osp.join(ckpt_dir, ckpt_fname)
    U.save_state(savedir_fname, var_list=network.get_variables())
    return savedir_fname
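
The regularized objective combines a softmax cross-entropy term on the expert action with a squared-error term on the predicted next observation, total_loss = alpha * ce_loss + beta * reg_loss; a small NumPy illustration of how the two terms are formed (all shapes and values are made up):

import numpy as np

rng = np.random.default_rng(0)
n, n_action, obs_dim = 32, 5, 46
alpha, beta = 0.7, 1.0

logits = rng.normal(size=(n, n_action))       # stand-in for pi.logits
ac = rng.integers(0, n_action, size=n)        # expert actions (integer labels)
ob_next_true = rng.normal(size=(n, obs_dim))  # observed next states
ob_next_pred = rng.normal(size=(n, obs_dim))  # stand-in for network.ob_next

# softmax cross-entropy against one-hot expert actions
log_probs = logits - np.log(np.exp(logits).sum(axis=1, keepdims=True))
ce_loss = -np.mean(log_probs[np.arange(n), ac])

# squared error on the predicted next observation
reg_loss = np.mean((ob_next_pred - ob_next_true) ** 2)

total_loss = alpha * ce_loss + beta * reg_loss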
Example No. 8
def train_mma(pi_0, phi_sa_dim, task_desc, params, D, evaluator, ob_space=None, ac_space=None):
    gym.logger.setLevel(logging.WARN)

    gamma = task_desc["gamma"]
    horizon = task_desc["horizon"]
    eps = params["eps"]
    p = q = phi_sa_dim  # state-action feature dimension
    phi = D["phi_fn"]
    phi_s = D["phi_fn_s"]
    stochastic = True
    mu_estimator_type = params["mu_estimator"]
    n_action = task_desc["n_action"]
    assert isinstance(n_action, int)
    action_list = range(n_action)
    precision = params["precision"]

    mu_exp_estimator = EmpiricalMuEstimator(phi, gamma)
    mu_exp_estimator.fit(D, stochastic, return_s_init=True)
    mu_exp, s_init_list = mu_exp_estimator.estimate()


    logger.log("fitting {}".format(mu_estimator_type))
    if task_desc["type"] == "gym":
        env = gym.make(task_desc["env_id"])
        ac_space = env.action_space
        ob_space = env.observation_space
        mu_dim = p # only for discrete action
    elif task_desc["type"] == "sepsis":
        if ac_space is None:
            ac_space = (5, )
        if ob_space is None:
            ob_space = (46, )
        mu_dim = p

    stochastic = True

    s = D["s"]
    a = D["a"]
    if len(a.shape) == 1:
        a = np.expand_dims(a, axis=1)
    s_next = D["s_next"]
    done = D["done"]
    if len(done.shape) == 1:
        done = np.expand_dims(done, axis=1)
    phi_sa = D["phi_sa"]

    n_transition = D["s"].shape[0]
    idx = int(n_transition * 0.7)  # 70/30 train/validation split

    D_train = {"s": s[:idx, :],
               "a": a[:idx, :],
               "phi_sa": phi_sa[:idx, :],
               "s_next": s_next[:idx, :],
               "done": done[:idx, :]}

    D_val = {"s": s[idx:, :],
             "a": a[idx:, :],
             "phi_sa": phi_sa[idx:, :],
             "s_next": s_next[idx:, :],
             "done": done[idx:, :]}


    if mu_estimator_type == "lstd":
        mu_estimator = LSTDMuEstimator(phi, gamma, D, p, q, eps, s_init_list)
    elif mu_estimator_type == "dsfn":
        mu_estimator = DeepMuEstimator(phi, gamma, D_train, D_val, s_init_list, ob_space,
                ac_space, mu_dim, horizon)
    else:
        raise NotImplementedError

    if params["mdp_solver"] == "lspi":
        W_0 = np.random.normal(loc=0, scale=0.1, size=p)
        lspi = LSPI(D=D,
                    action_list=range(task_desc["n_action"]),
                    p=p,
                    gamma=gamma,
                    precision=precision,
                    lstd_eps=params["eps"],
                    W_0=W_0,
                    reward_fn=None,
                    stochastic=True,
                    max_iter=10)
        mdp_solver = lspi

    elif params["mdp_solver"] == "dqn":
        mdp_solver = DQNSepsis(D=D_train)
    else:
        raise NotImplementedError


    mma = MaxMarginAbbeel(pi_init=pi_0,
                          p=p,
                          phi=phi,
                          mu_exp=mu_exp,
                          mdp_solver=mdp_solver,
                          evaluator=evaluator,
                          irl_precision=params["precision"],
                          method=params["method"],
                          mu_estimator=mu_estimator,
                          stochastic=stochastic,
                          D_val=D_val)

    results = mma.run(n_iteration=params["n_iteration"])
    return results