class DDPG:
    def __init__(self, sess, params):
        self.sess = sess
        self.__dict__.update(params)
        # create placeholders
        self.create_input_placeholders()
        # create actor/critic models
        self.actor = Actor(self.sess, self.inputs, **self.actor_params)
        self.critic = Critic(self.sess, self.inputs, **self.critic_params)
        self.noise_params = {k: np.array(list(map(float, v.split(","))))
                             for k, v in self.noise_params.items()}
        self.noise = Noise(**self.noise_params)
        self.ou_level = np.zeros(self.dimensions["u"])
        self.memory = Memory(self.n_mem_objects, self.memory_size)

    def create_input_placeholders(self):
        self.inputs = {}
        with tf.name_scope("inputs"):
            for ip_name, dim in self.dimensions.items():
                self.inputs[ip_name] = tf.placeholder(tf.float32,
                                                      shape=(None, dim),
                                                      name=ip_name)
            self.inputs["g"] = tf.placeholder(tf.float32,
                                              shape=self.inputs["u"].shape,
                                              name="a_grad")
            self.inputs["p"] = tf.placeholder(tf.float32,
                                              shape=(None, 1),
                                              name="pred_q")

    def step(self, x, is_u_discrete, explore=True):
        x = x.reshape(-1, self.dimensions["x"])
        u = self.actor.predict(x)
        if explore:
            self.ou_level = self.noise.ornstein_uhlenbeck_level(self.ou_level)
            u = u + self.ou_level
        q = self.critic.predict(x, u)
        if is_u_discrete:
            return [np.argmax(u), u[0], q[0]]
        return [u[0], u, q[0]]

    def remember(self, experience):
        self.memory.add(experience)

    def train(self):
        # check if the memory contains enough experiences
        if self.memory.size < 3*self.b_size:
            return
        x, g, ag, u, r, nx, ng, t = self.get_batch()
        # for HER transitions
        her_idxs = np.where(np.random.random(self.b_size) < 0.80)[0]
        # print("{} of {} selected for HER transitions".
        #       format(len(her_idxs), self.b_size))
        g[her_idxs] = ag[her_idxs]
        r[her_idxs] = 1
        t[her_idxs] = 1
        x = np.hstack([x, g])
        nx = np.hstack([nx, ng])
        nu = self.actor.predict_target(nx)
        tq = r + self.gamma*self.critic.predict_target(nx, nu)*(1-t)
        self.critic.train(x, u, tq)
        grad = self.critic.get_action_grads(x, u)
        # print("Grads:\n", grad)
        self.actor.train(x, grad)
        self.update_targets()

    def get_batch(self):
        return self.memory.sample(self.b_size)

    def update_targets(self):
        self.critic.update_target()
        self.actor.update_target()
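# The Noise and Memory helpers used above are external to this file. For reference, below is a
# minimal, self-contained numpy sketch of an Ornstein-Uhlenbeck exploration process of the kind
# `Noise.ornstein_uhlenbeck_level` presumably implements. The class name and the theta/sigma/dt
# defaults are illustrative assumptions for this sketch, not values taken from the code above.
import numpy as np

class OUNoiseSketch:
    def __init__(self, dim, mu=0.0, theta=0.15, sigma=0.2, dt=1e-2):
        self.dim = dim
        self.mu, self.theta, self.sigma, self.dt = mu, theta, sigma, dt

    def ornstein_uhlenbeck_level(self, level):
        # one Euler step of dX = theta*(mu - X)*dt + sigma*sqrt(dt)*N(0, 1)
        return (level + self.theta * (self.mu - level) * self.dt
                + self.sigma * np.sqrt(self.dt) * np.random.randn(self.dim))

# Usage mirrors DDPG.step() above: keep a running ou_level and add it to the actor's action.
# noise = OUNoiseSketch(dim=action_dim)
# ou_level = np.zeros(action_dim)
# ou_level = noise.ornstein_uhlenbeck_level(ou_level)
# u = u + ou_level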
class Agent(object):
    def __init__(self, alpha, beta, input_dims, tau, env, gamma=0.99,
                 max_size=10000, layer1_size=400, layer2_size=300,
                 batch_size=64):
        n_actions = env.action_space.shape[0]
        self.gamma = gamma
        self.tau = tau
        self.memory = ReplayBuffer(max_size, input_dims, n_actions)
        self.batch_size = batch_size
        self.sess = tf.Session()
        self.actor = Actor(alpha, n_actions, 'Actor', input_dims, self.sess,
                           layer1_size, layer2_size, env.action_space.high,
                           self.batch_size, ckpt_dir='tmp/ddpg/actor')
        self.critic = Critic(beta, n_actions, 'Critic', input_dims, self.sess,
                             layer1_size, layer2_size, self.batch_size,
                             ckpt_dir='tmp/ddpg/critic')
        self.target_actor = Actor(alpha, n_actions, 'TargetActor', input_dims,
                                  self.sess, layer1_size, layer2_size,
                                  env.action_space.high, self.batch_size,
                                  ckpt_dir='tmp/ddpg/target_actor')
        self.target_critic = Critic(beta, n_actions, 'TargetCritic', input_dims,
                                    self.sess, layer1_size, layer2_size,
                                    self.batch_size,
                                    ckpt_dir='tmp/ddpg/target_critic')
        self.noise = OUActionNoise(mu=np.zeros(n_actions))

        # soft-update ops: target <- tau*online + (1 - tau)*target
        self.update_actor = [
            self.target_actor.params[i].assign(
                tf.multiply(self.actor.params[i], self.tau)
                + tf.multiply(self.target_actor.params[i], 1. - self.tau))
            for i in range(len(self.target_actor.params))
        ]
        self.update_critic = [
            self.target_critic.params[i].assign(
                tf.multiply(self.critic.params[i], self.tau)
                + tf.multiply(self.target_critic.params[i], 1. - self.tau))
            for i in range(len(self.target_critic.params))
        ]

        self.sess.run(tf.global_variables_initializer())
        self.update_target_network_parameters(first=True)

    def update_target_network_parameters(self, first=False):
        for _, d in enumerate(["/device:GPU:0", "/device:GPU:1"]):
            with tf.device(d):
                if first:
                    # hard copy on the first call so target networks start equal to online ones
                    old_tau = self.tau
                    self.tau = 1.0
                    self.target_actor.sess.run(self.update_actor)
                    self.target_critic.sess.run(self.update_critic)
                    self.tau = old_tau
                else:
                    self.target_critic.sess.run(self.update_critic)
                    self.target_actor.sess.run(self.update_actor)

    def remember(self, state, action, reward, new_state, done):
        self.memory.store_transition(state, action, reward, new_state, done)

    def choose_action(self, state):
        # print("State[0]: ", state[0].shape)
        # print("State[1]: ", state[1].shape)
        state1 = state[0][np.newaxis, :]
        state2 = state[1][np.newaxis, :]
        state = [state1, state2]
        for _, d in enumerate(["/device:GPU:0", "/device:GPU:1"]):
            with tf.device(d):
                mu = self.actor.predict(state)
                noise = self.noise()
                mu_prime = mu + noise
        return mu_prime[0]

    def learn(self):
        if self.memory.mem_cntr < self.batch_size:
            return
        for _, d in enumerate(["/device:GPU:0", "/device:GPU:1"]):
            with tf.device(d):
                state, action, reward, new_state, done = \
                    self.memory.sample_buffer(self.batch_size)

                # target q-value(new_state) with actor's bounded action forward pass
                critic_value_ = self.target_critic.predict(
                    new_state, self.target_actor.predict(new_state))

                # done is expected to hold 1 - terminal_flag (as stored by the replay
                # buffer), so bootstrapping is masked at episode ends
                target = []
                for j in range(self.batch_size):
                    target.append(reward[j] + self.gamma*critic_value_[j]*done[j])
                target = np.reshape(target, (self.batch_size, 1))

                _ = self.critic.train(state, action, target)  # s_i, a_i and y_i

                # a = mu(s_i)
                a_outs = self.actor.predict(state)
                # gradients of Q w.r.t. actions
                grads = self.critic.get_action_gradients(state, a_outs)
                self.actor.train(state, grads[0])

        # soft-update the target networks after each learning step
        self.update_target_network_parameters()

    def save_models(self):
        self.actor.save_checkpoint()
        self.target_actor.save_checkpoint()
        self.critic.save_checkpoint()
        self.target_critic.save_checkpoint()

    def load_models(self):
        self.actor.load_checkpoint()
        self.target_actor.load_checkpoint()
        self.critic.load_checkpoint()
        self.target_critic.load_checkpoint()
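# The assign ops built in __init__ above implement Polyak averaging: after each learning step
# the target parameters move a small fraction tau toward the online parameters. Below is a
# minimal numpy illustration of the same rule; the function name and toy weights are mine,
# not part of the class's API.
import numpy as np

def soft_update(online_params, target_params, tau=0.001):
    """target <- tau * online + (1 - tau) * target, applied parameter-wise."""
    return [tau * w + (1.0 - tau) * wt for w, wt in zip(online_params, target_params)]

# example: a single fake "layer" of weights
online = [np.ones((2, 2))]
target = [np.zeros((2, 2))]
target = soft_update(online, target, tau=0.1)  # every entry becomes 0.1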
class Agent:
    # Warning! policy.py and critic.py are still work in progress and contain many global
    # variables that should be converted to class member variables. Before that is done, all
    # instances of Agent must use the same values for the following:
    # PPOepsilon, nHidden, nUnitsPerLayer, activation, H, entropyLossWeight, sdLowLimit
    def __init__(self, stateDim: int, actionDim: int, actionMin: np.array, actionMax: np.array,
                 learningRate=0.0005, gamma=0.99, GAElambda=0.95, PPOepsilon=0.2,
                 PPOentropyLossWeight=0, nHidden: int = 2, nUnitsPerLayer: int = 128,
                 mode="PPO-CMA-m", activation="lrelu", H: int = 9,
                 entropyLossWeight: float = 0, sdLowLimit=0.01, useScaler: bool = True,
                 criticTimestepScale=0.001):
        # Create policy network
        print("Creating policy")
        self.actionMin = actionMin.copy()
        self.actionMax = actionMax.copy()
        self.actionDim = actionDim
        self.stateDim = stateDim
        self.useScaler = useScaler
        if useScaler:
            self.scaler = Scaler(stateDim)
        self.scalerInitialized = False
        self.normalizeAdvantages = True
        self.gamma = gamma
        self.GAElambda = GAElambda
        # with gamma == 0, no need for the time-step feature
        self.criticTimestepScale = 0 if gamma == 0 else criticTimestepScale
        piEpsilon = None
        nHistory = 1
        negativeAdvantageAvoidanceSigma = 0
        if mode == "PPO-CMA" or mode == "PPO-CMA-m":
            # if True, we use PPO's clipped surrogate loss function instead of the
            # standard -A_i * log(pi(a_i | s_i))
            usePPOLoss = False
            separateVarAdapt = True
            self.reluAdvantages = True if mode == "PPO-CMA" else False
            # policy mean adapts immediately, policy covariance as an aggregate of this
            # many past iterations
            nHistory = H
            useSigmaSoftClip = True
            negativeAdvantageAvoidanceSigma = 1 if mode == "PPO-CMA-m" else 0
        elif mode == "PPO":
            # if True, we use PPO's clipped surrogate loss function instead of the
            # standard -A_i * log(pi(a_i | s_i))
            usePPOLoss = True
            separateVarAdapt = False
            # separateSigmaAdapt = False
            self.reluAdvantages = False
            useSigmaSoftClip = True
            piEpsilon = 0
        else:
            raise ValueError("Unknown mode {}".format(mode))
        self.policy = Policy(stateDim, actionDim, actionMin, actionMax,
                             entropyLossWeight=PPOentropyLossWeight,
                             networkActivation=activation,
                             networkDepth=nHidden,
                             networkUnits=nUnitsPerLayer,
                             networkSkips=False,
                             learningRate=learningRate,
                             minSigma=sdLowLimit,
                             PPOepsilon=PPOepsilon,
                             usePPOLoss=usePPOLoss,
                             separateVarAdapt=separateVarAdapt,
                             nHistory=nHistory,
                             useSigmaSoftClip=useSigmaSoftClip,
                             piEpsilon=piEpsilon,
                             negativeAdvantageAvoidanceSigma=negativeAdvantageAvoidanceSigma)
        # Create critic network, +1 stateDim because, at least in OpenAI Gym, episodes are
        # time-limited and the value estimates thus depend on simulation time.
        # Thus, we use the time step as an additional feature for the critic.
        # Note that this does not mess up generalization, as the feature is not used for the
        # policy during training or at runtime.
        print("Creating critic network")
        self.critic = Critic(stateDim=stateDim + 1,
                             learningRate=learningRate,
                             nHidden=nHidden,
                             networkUnits=nUnitsPerLayer,
                             networkActivation=activation,
                             useSkips=False,
                             lossType="L1")
        # Experience trajectory buffers for the memorize() and updateWithMemorized() methods
        self.experienceTrajectories = []
        self.currentTrajectory = []

    # call this after TensorFlow's global variables initializer
    def init(self, sess: tf.Session, verbose=False):
        # Pretrain the policy to output the initial Gaussian for all states
        self.policy.init(
            sess, 0, 1,
            0.5 * (self.actionMin + self.actionMax) * np.ones(self.actionDim),
            0.5 * (self.actionMax - self.actionMin) * np.ones(self.actionDim),
            256, 2000, verbose)

    # stateObs is an n-by-m tensor, where n = number of observations,
    # m = number of observation variables
    def act(self, sess: tf.Session, stateObs: np.array, deterministic=False,
            clipActionToLimits=True):
        # Expand a single 1d observation into a batch of 1 vectors
        if len(stateObs.shape) == 1:
            stateObs = np.reshape(stateObs, [1, stateObs.shape[0]])
        # Query the policy for the action, except for the first iteration, where we sample
        # directly from the initial exploration Gaussian that covers the whole action space.
        # This is done because we don't know the scale of state observations a priori; thus,
        # we can only init the state scaler in update(), after we have collected some experience.
        if self.useScaler and (not self.scalerInitialized):
            actions = np.random.normal(
                0.5 * (self.actionMin + self.actionMax) * np.ones(self.actionDim),
                0.5 * (self.actionMax - self.actionMin) * np.ones(self.actionDim),
                size=[stateObs.shape[0], self.actionDim])
            if clipActionToLimits:
                actions = np.clip(actions,
                                  np.reshape(self.actionMin, [1, self.actionDim]),
                                  np.reshape(self.actionMax, [1, self.actionDim]))
            return actions
        else:
            if self.useScaler:
                scaledObs = self.scaler.process(stateObs)
            else:
                scaledObs = stateObs
            if deterministic:
                actions = self.policy.getExpectation(sess, scaledObs)
            else:
                actions = self.policy.sample(sess, scaledObs)
            if clipActionToLimits:
                actions = np.clip(actions, self.actionMin, self.actionMax)
            return actions

    def memorize(self, observation: np.array, action: np.array, reward: float,
                 nextObservation: np.array, done: bool):
        e = Experience(observation, action, reward, nextObservation, done)
        self.currentTrajectory.append(e)
        if done:
            self.experienceTrajectories.append(self.currentTrajectory)
            self.currentTrajectory = []

    def getAverageActionStdev(self):
        if self.useScaler and (not self.scalerInitialized):
            return np.mean(0.5 * (self.actionMax - self.actionMin))
        else:
            return self.policy.usedSigmaSum / (1e-20 + self.policy.usedSigmaSumCounter)

    # If you call memorize() after each action, you can update the agent with this method.
    # If you handle the experience buffers yourself, e.g., due to a multithreaded
    # implementation, use the update() method instead.
    def updateWithMemorized(self, sess: tf.Session, batchSize: int = 512, nBatches: int = 100,
                            verbose=True, valuesValid=False, timestepsValid=False):
        self.update(sess, experienceTrajectories=self.experienceTrajectories,
                    batchSize=batchSize, nBatches=nBatches, verbose=verbose,
                    valuesValid=valuesValid, timestepsValid=timestepsValid)
        averageEpisodeReturn = 0
        for t in self.experienceTrajectories:
            episodeReturn = 0
            for e in t:
                episodeReturn += e.r
            averageEpisodeReturn += episodeReturn
        averageEpisodeReturn /= len(self.experienceTrajectories)
        self.experienceTrajectories = []
        self.currentTrajectory = []
        return averageEpisodeReturn

    # experienceTrajectories is a list of lists of Experience instances such that each of the
    # contained lists corresponds to an episode simulation trajectory
    def update(self, sess: tf.Session, experienceTrajectories, batchSize: int = 512,
               nBatches: int = 100, verbose=True, valuesValid=False, timestepsValid=False):
        trajectories = experienceTrajectories  # shorthand
        # Collect all data into linear arrays for training.
        nTrajectories = len(trajectories)
        nData = 0
        for trajectory in trajectories:
            nData += len(trajectory)
            # propagate values backwards along the trajectory if not already done
            if not valuesValid:
                for i in reversed(range(len(trajectory) - 1)):
                    # value estimates, used for training the critic and estimating advantages
                    trajectory[i].V = trajectory[i].r + self.gamma * trajectory[i + 1].V
            # update time steps if not updated
            if not timestepsValid:
                for i in range(len(trajectory)):
                    trajectory[i].timeStep = i
        allStates = np.zeros([nData, self.stateDim])
        allActions = np.zeros([nData, self.actionDim])
        allValues = np.zeros([nData])
        allTimes = np.zeros([nData, 1])
        k = 0
        for trajectory in trajectories:
            for e in trajectory:
                allStates[k, :] = e.s
                allValues[k] = e.V
                allActions[k, :] = e.a
                allTimes[k, 0] = e.timeStep * self.criticTimestepScale
                k += 1
        # Update scalers
        if self.useScaler:
            self.scaler.update(allStates)
            scale, offset = self.scaler.get()
            self.scalerInitialized = True
        else:
            offset = 0
            scale = 1
        # Scale the observations for training the critic
        scaledStates = self.scaler.process(allStates) if self.useScaler else allStates

        # Train critic
        def augmentCriticObs(obs: np.array, timeSteps: np.array):
            return np.concatenate([obs, timeSteps], axis=1)

        self.critic.train(sess, augmentCriticObs(scaledStates, allTimes), allValues,
                          batchSize, nEpochs=0, nBatches=nBatches, verbose=verbose)

        # Policy training needs advantages, which depend on the critic we just trained.
        # We use Generalized Advantage Estimation by Schulman et al.
        if verbose:
            print("Estimating advantages...")
        for t in trajectories:
            # query the critic values of all states of this trajectory in one big batch
            nSteps = len(t)
            states = np.zeros([nSteps + 1, self.stateDim])
            timeSteps = np.zeros([nSteps + 1, 1])
            for i in range(nSteps):
                states[i, :] = t[i].s
                timeSteps[i, 0] = t[i].timeStep * self.criticTimestepScale
            states[nSteps, :] = t[nSteps - 1].s_next
            states = (states - offset) * scale
            values = self.critic.predict(sess, augmentCriticObs(states, timeSteps))
            # GAE loop, i.e., take the instantaneous advantage (how much value a single action
            # brings, assuming that the values given by the critic are unbiased), and smooth
            # it along the trajectory using a 1st-order IIR filter.
            for step in reversed(range(nSteps - 1)):
                delta_t = t[step].r + self.gamma * values[step + 1] - values[step]
                t[step].advantage = delta_t + self.GAElambda * self.gamma * t[step + 1].advantage
        # Gather the advantages into a linear array and apply ReLU and normalization if needed
        allAdvantages = np.zeros([nData])
        k = 0
        for trajectory in trajectories:
            for e in trajectory:
                allAdvantages[k] = e.advantage
                k += 1
        if self.reluAdvantages:
            allAdvantages = np.clip(allAdvantages, 0, np.inf)
        if self.normalizeAdvantages:
            aMean = np.mean(allAdvantages)
            aSd = np.std(allAdvantages)
            if verbose:
                print("Advantage mean {}, sd {}".format(aMean, aSd))
            allAdvantages /= 1e-10 + aSd
        # Train policy. Note that this uses the original unscaled states, because the PPO-CMA
        # variance training needs a history of states in the same scale.
        self.policy.train(sess, allStates, allActions, allAdvantages, batchSize,
                          nEpochs=0, nBatches=nBatches,
                          stateOffset=offset, stateScale=scale, verbose=verbose)
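# For reference, below is a stand-alone numpy sketch of the same first-order GAE recursion used
# in the loop above, written over a single trajectory of rewards r_0..r_{T-1} and critic values
# V(s_0)..V(s_T). The function name is mine; the gamma/lam defaults simply mirror the class
# defaults above and are not part of its API.
import numpy as np

def gae_advantages(rewards, values, gamma=0.99, lam=0.95):
    """Generalized Advantage Estimation; `values` has length len(rewards) + 1
    (the last entry is the bootstrap value of the final state)."""
    nSteps = len(rewards)
    advantages = np.zeros(nSteps)
    running = 0.0
    for step in reversed(range(nSteps)):
        # instantaneous advantage (TD residual), then exponential smoothing along the trajectory
        delta = rewards[step] + gamma * values[step + 1] - values[step]
        running = delta + gamma * lam * running
        advantages[step] = running
    return advantages

# example: gae_advantages(np.array([1.0, 0.0, 1.0]), np.array([0.5, 0.4, 0.6, 0.0]))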