def train(sess, env, args, actor, critic, actor_noise, reward_result, lambda_mix):

    # Set up summary Ops
    summary_ops, summary_vars = build_summaries()

    sess.run(tf.global_variables_initializer())

    # Get dynamics and initialize prior controller
    [A, B] = get_linear_dynamics()
    prior = BasePrior(A, B)

    # Initialize target network weights
    actor.update_target_network()
    critic.update_target_network()

    # Initialize replay memory
    replay_buffer = ReplayBuffer(int(args['buffer_size']), int(args['random_seed']))

    paths = list()

    for i in range(int(args['max_episodes'])):

        s = env.reset()

        ep_reward = 0.
        ep_ave_max_q = 0

        obs, action, rewards = [], [], []

        # Get baseline reward using the (optimal) prior controller
        s0 = np.copy(s)
        ep_reward_opt = 0.
        for kk in range(int(args['max_episode_len'])):
            a_prior = prior.getControl_h(s0)
            a_prior = np.squeeze(np.asarray(a_prior))
            a = a_prior
            s0, r, stop_c, _ = env.step(a)
            ep_reward_opt += r
            if stop_c:
                break

        # Get baseline reward using the LQR controller, restarted from the same state
        env.reset()
        sp = env.unwrapped.reset(s)
        reward_lqr = 0.
        while True:
            a_lqr = prior.getControl(sp)
            a_lqr = np.squeeze(np.asarray(a_lqr))
            sp, reward_p, done_p, _ = env.step(a_lqr)
            reward_lqr += reward_p
            if done_p:
                break

        # Get reward using the regularized RL (regRL) algorithm
        env.reset()
        s = env.unwrapped.reset(s)

        for j in range(int(args['max_episode_len'])):

            # Control prior regularization weight lambda_mix is passed in as an argument
            # lambda_mix = 5.

            # Prior control
            a_prior = prior.getControl_h(s)
            a_prior = np.squeeze(np.asarray(a_prior))

            # RL control with exploration noise
            a = actor.predict(np.reshape(s, (1, actor.s_dim))) + actor_noise()
            #a = actor.predict(np.reshape(s, (1, actor.s_dim))) + (1. / (1. + i))

            # Mix the actions (RL controller + control prior)
            act = a[0] / (1 + lambda_mix) + (lambda_mix / (1 + lambda_mix)) * a_prior

            # Take action and observe next state/reward
            s2, r, terminal, info = env.step(act)

            # Add info from this time step to the replay buffer
            replay_buffer.add(np.reshape(s, (actor.s_dim,)),
                              np.reshape(a, (actor.a_dim,)),
                              r, terminal,
                              np.reshape(s2, (actor.s_dim,)),
                              np.reshape((lambda_mix / (1 + lambda_mix)) * a_prior, (actor.a_dim,)))

            # Keep adding experience to the memory until
            # there are at least minibatch size samples
            if replay_buffer.size() > int(args['minibatch_size']):

                # Sample a batch from the replay buffer
                s_batch, a_batch_0, r_batch, t_batch, s2_batch, a_prior_batch = \
                    replay_buffer.sample_batch(int(args['minibatch_size']))

                a_batch = a_batch_0

                # Calculate targets
                target_q = critic.predict_target(
                    s2_batch, actor.predict_target(s2_batch))

                y_i = []
                for k in range(int(args['minibatch_size'])):
                    if t_batch[k]:
                        y_i.append(r_batch[k])
                    else:
                        y_i.append(r_batch[k] + critic.gamma * target_q[k])

                # Update the critic given the targets
                predicted_q_value, _ = critic.train(
                    s_batch, a_batch, np.reshape(y_i, (int(args['minibatch_size']), 1)))

                ep_ave_max_q += np.amax(predicted_q_value)

                # Update the actor policy using the sampled gradient
                a_outs = actor.predict(s_batch)
                grads = critic.action_gradients(s_batch, a_outs)
                actor.train(s_batch, grads[0])

                # Update target networks
                actor.update_target_network()
                critic.update_target_network()

                # Calculate TD-Error for each state
                base_q = critic.predict_target(s_batch, actor.predict_target(s_batch))
                target_q = critic.predict_target(
                    s2_batch, actor.predict_target(s2_batch))

            s = s2
            ep_reward += r

            obs.append(s)
            rewards.append(r)
            action.append(a[0])

            # Collect results at end of episode
            if terminal:
                for ii in range(len(obs)):
                    obs[ii] = obs[ii].reshape((4, 1))
                print('| Reward: {:d} | Episode: {:d} | Qmax: {:.4f}'.format(
                    int(ep_reward - ep_reward_opt), i, (ep_ave_max_q / float(j))))
                reward_result[0, i] = ep_reward
                reward_result[1, i] = ep_reward_opt
                reward_result[2, i] = reward_lqr
                path = {"Observation": np.concatenate(obs).reshape((-1, 4)),
                        "Action": np.concatenate(action),
                        "Reward": np.asarray(rewards)}
                paths.append(path)
                break

    return [summary_ops, summary_vars, paths]
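
# --------------------------------------------------------------------------
# Illustrative sketch (not part of the original training loop): the action
# mixing performed in train() is a convex combination of the RL action and
# the control prior, weighted by lambda_mix. lambda_mix = 0 gives pure RL
# control; large lambda_mix defers almost entirely to the prior. The helper
# name below is hypothetical and only restates the formula used above.
def mix_actions(a_rl, a_prior, lambda_mix):
    """Return a_rl / (1 + lambda_mix) + (lambda_mix / (1 + lambda_mix)) * a_prior."""
    return a_rl / (1. + lambda_mix) + (lambda_mix / (1. + lambda_mix)) * a_prior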
sp = np.copy(s0)
reward_prior = 0.
while True:
    a_prior = prior.getControl_h(sp)
    a_prior = np.squeeze(np.asarray(a_prior))
    sp, reward_p, done_p, _ = env.step(a_prior)
    reward_prior += reward_p
    if done_p:
        break

env.reset()
sp = env.unwrapped.reset(s0)
reward_lqr = 0.
while True:
    a_lqr = prior.getControl(sp)
    a_lqr = np.squeeze(np.asarray(a_lqr))
    sp, reward_p, done_p, _ = env.step(a_lqr)
    reward_lqr += reward_p
    if done_p:
        break

env.reset()
s = env.unwrapped.reset(s0)
#s = env.reset()

ep_r, ep_t, ep_a = 0, 0, []

while True:
    a, v = ppo.evaluate_state(s)
    a = np.squeeze(a)
    s = np.squeeze(s)[np.newaxis, :]
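
# --------------------------------------------------------------------------
# Illustrative sketch (not part of the original file): the two baseline loops
# above follow the same pattern -- roll a fixed controller to episode
# termination and accumulate reward. The helper below is hypothetical; it
# assumes numpy is imported as np at the top of this module and that the
# unwrapped environment's reset() accepts a start state, as used above.
def rollout_controller(env, controller, s0):
    """Run `controller` from start state `s0` until done; return total reward."""
    env.reset()
    s = env.unwrapped.reset(s0)
    total_reward = 0.
    while True:
        a = np.squeeze(np.asarray(controller(s)))
        s, r, done, _ = env.step(a)
        total_reward += r
        if done:
            break
    return total_reward

# Example (hypothetical usage):
#     reward_lqr = rollout_controller(env, prior.getControl, s0)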