Example #1
def play():
    print("play")

    env = gym.make('LunarLander-v2')

    state = env.reset()

    actor = Actor(env.action_space, env.observation_space)

    actor.load()

    #critic = Critic(env.action_space, env.observation_space)

    #replayMemory = ReplayMemory()

    #summary_ops, summary_vars = build_summaries()

    #writer = tf.summary.FileWriter("./log", tf.Session().graph)

    #episode_reward = 0

    #step = 1

    while True:

        env.render()

        state1 = state[np.newaxis, :]

        action, action_matrix, prob = actor.predict(state1)

        next_state, reward, done, info = env.step(action)

        #replayMemory.add(state, action_matrix, reward, done, next_state, prob)

        state = next_state

        if done:
            #summary_str = tf.Session().run(summary_ops, feed_dict={summary_vars[0]: episode_reward})
            #writer.add_summary(summary_str, step)
            #writer.flush()
            state = env.reset()

    return 0
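The snippets on this page are excerpts and omit their import headers. A minimal header that Example #1 appears to rely on could look like the sketch below; the project-local module names are assumptions, only the imported class and function names actually appear in the code above.

import gym
import numpy as np
import tensorflow as tf  # TF 1.x API (tf.Session, tf.summary.FileWriter)

# project-local modules; module names are assumed, only the imported names appear in the snippets
from actor import Actor
from critic import Critic
from replay_memory import ReplayMemory
from summaries import build_summaries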
Example #2
actor = Actor(sess, state_size, action_size)
critic = Critic(sess, state_size, action_size)
buffer = ReplayBuffer(BUFFER_SIZE)


# env and sess are assumed to be created earlier in the script (e.g. env = gym.make('Pendulum-v0'))
env.monitor.start('experiments/' + 'Pendulum-v0', force=True)

for ep in range(10000):
    state = env.reset()
    total = 0  # cumulative episode reward
    # what if the action is beyond the scope?
    for iteration in range(100):
        # select the action with actor model.
        env.render()

        action = actor.predict([state])[0] + (np.random.randn(1)/(ep + iteration + 1))

        newState, reward, terminated, _ = env.step(action)
        total += reward
        buffer.add(state, action, reward, newState, terminated) #state, action, reward, new_state, done


        # update critic
        batch = buffer.getBatch(batch_size=BATCH_SIZE)
        states = np.array([e[0] for e in batch])
        actions = np.array([e[1] for e in batch])
        rewards = np.array([e[2] for e in batch])
        newStates = np.array([e[3] for e in batch])
        notTerminated = np.array([1.-e[4] for e in batch])

        newStatesScores = critic.target_predict_method(newStates, actor.target_predict_method(newStates))
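Example #2 breaks off right after the target-network Q-values for the next states are computed. In a standard DDPG update those values are folded into Bellman targets, the critic is fit to the targets, and the actor is nudged along the critic's action gradients, followed by soft target updates. A rough sketch of that continuation; the discount factor and every method name below are assumptions, since the snippet does not show the real Actor/Critic interfaces:

        # Bellman targets: y = r + gamma * Q_target(s', mu_target(s')), masked at terminal transitions
        GAMMA = 0.99  # assumed discount factor
        targets = rewards + GAMMA * notTerminated * newStatesScores.reshape(-1)

        # hypothetical method names below
        critic.train_method(states, actions, targets.reshape(-1, 1))   # fit Q(s, a) to the targets
        policyActions = actor.predict(states)                          # a = mu(s) under the current policy
        actionGrads = critic.gradients_method(states, policyActions)   # dQ/da from the critic
        actor.train_method(states, actionGrads)                        # policy-gradient step
        actor.target_update_method()                                   # soft target updates (rate tau)
        critic.target_update_method()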
Example #3
    replayMemory = ReplayMemory()

    summary_ops, summary_vars = build_summaries()

    writer = tf.summary.FileWriter("./log", tf.Session().graph)

    episode_reward = 0

    step = 0

    while True:
        #env.render()

        state1 = state[np.newaxis, :]

        action, action_matrix = actor.predict(state1)

        next_state, reward, done, info = env.step(action)

        replayMemory.add(state, action_matrix, reward, done, next_state)

        state = next_state

        episode_reward += reward

        #train
        if replayMemory.size() % 128 == 0 or done:

            state_b, action_matrix_b, reward_b, done_b, next_state_b = replayMemory.miniAll()

            reward_b = reward_b[:, np.newaxis]
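Examples #3 and #7 call a build_summaries() helper without showing it. In the common TF 1.x logging pattern this code follows (a scalar summary fed from the training loop), it would look roughly like the sketch below; the exact variable names are assumptions.

def build_summaries():
    # a variable that the training loop overwrites (via feed_dict) with the latest episode reward
    episode_reward = tf.Variable(0., name="episode_reward")
    tf.summary.scalar("Reward", episode_reward)
    summary_vars = [episode_reward]
    summary_ops = tf.summary.merge_all()
    return summary_ops, summary_vars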
Example #4
        state1 = state[np.newaxis, :]

        action, action_matrix, prob = actor.act(state1)

        next_state, reward, done, info = env.step(action)

        replayMemory.add(state, action, reward, done, next_state, prob)

        state = next_state

        episode_reward += reward
        ##############################train######################
        if replayMemory.size() >= 128:
            state_b, action_b, reward_b, done_b, next_state_b, prob_b = replayMemory.miniBatch(64)
            next_state_b_value = actor.predict(next_state_b)
            state_b_value = actor.predict(state_b)
            length = state_b.shape[0]

            # Q-learning target: r + gamma * max_a' Q(s', a'), with gamma = 0.7
            for i in range(length):
                target_next = reward_b[i]
                if not done_b[i]:
                    action_values = next_state_b_value[i]
                    target_next = reward_b[i] + 0.7 * np.amax(action_values)
                state_b_value[i][action_b[i]] = target_next
            actor.train(state_b, state_b_value)

        if done:
            summary_str = tf.Session().run(
                summary_ops, feed_dict={summary_vars[0]: episode_reward})
            writer.add_summary(summary_str, step)
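The loop above builds the standard Q-learning target r + gamma * max_a' Q(s', a') with gamma = 0.7. Assuming reward_b, done_b and action_b are 1-D arrays of batch length, the same targets can be computed in one vectorized step:

# equivalent vectorized form of the target loop in Example #4 (gamma = 0.7)
targets = reward_b + 0.7 * np.amax(next_state_b_value, axis=1) * (1.0 - done_b)
state_b_value[np.arange(length), action_b] = targets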
Example #5
class Agent:
    def __init__(self, env, sess, LEARNING_RATE_ACTOR, LEARNING_RATE_CRITIC,
                 NET_SIZE, MEMORY_LEN, REWARD_DISCOUNT, BATCH_SIZE, TAU,
                 EXPLORATION_STEPS, VERBOSE, LOG_DIR_TF):
        self.env = env
        self.sess = sess
        self.observation_space = self.env.observation_space.shape[0]
        self.action_space = self.env.action_space.shape[0]
        self.REWARD_DISCOUNT = REWARD_DISCOUNT
        self.TAU = TAU
        self.BATCH_SIZE = BATCH_SIZE
        self.noise_state = np.zeros(self.action_space)
        self.EXPLORATION_STEPS = EXPLORATION_STEPS
        self.VERBOSE = VERBOSE
        self.LOG_DIR_TF = LOG_DIR_TF
        #check if action_space is symmetric
        if all(env.action_space.high == abs(env.action_space.low)):
            action_scale = env.action_space.high
        else:
            raise ActionSpaceNotSymmetricException
        self.actor = Actor(self.sess, self.observation_space,
                           self.action_space, LEARNING_RATE_ACTOR, NET_SIZE,
                           TAU, action_scale)
        self.critic = Critic(self.sess, self.observation_space,
                             self.action_space, LEARNING_RATE_CRITIC, NET_SIZE,
                             TAU)
        actor_network_variables = self.actor.network.get_variables()
        critic_q_net_variables = self.critic.q_net.get_variables()
        self.actor_target_update = self.actor.target_network.update_variables(
            actor_network_variables)
        self.critic_target_update = self.critic.target_q_net.update_variables(
            critic_q_net_variables)
        self.reward_pl = tf.placeholder(tf.float32, [None, 1],
                                        name='Reward_PL')
        self.done_pl = tf.placeholder(tf.bool, [None, 1], name='Done_PL')
        self.labels = tf.where(
            self.done_pl, self.reward_pl, self.reward_pl +
            tf.multiply(self.REWARD_DISCOUNT, self.critic.target_prediction))
        #self.replay_memory = ReplayMemory(MEMORY_LEN, BATCH_SIZE)
        self.replay_memory = ReplayMemory(MEMORY_LEN, BATCH_SIZE,
                                          self.observation_space,
                                          self.action_space)
        self.log_reward_pl = tf.placeholder(tf.float32, name='Reward_log_pl')
        self.reward_f = tf.add(0.0, self.log_reward_pl)
        tf.summary.scalar('reward', self.reward_f)
        init = tf.global_variables_initializer()
        self.sess.run(init)
        self.sess.run(self.actor.network.copy_to(self.actor.target_network))
        self.sess.run(self.critic.q_net.copy_to(self.critic.target_q_net))
        self.writer = tf.summary.FileWriter(self.LOG_DIR_TF, self.sess.graph)
        self.merged = tf.summary.merge_all()

    def select_action(self, observation, current_step):
        action = self.actor.predict(observation, self.actor.prediction)
        if current_step <= self.EXPLORATION_STEPS:
            noise = self.noise()
        else:
            noise = 0
        return action + noise

    def noise(self):
        # Ornstein-Uhlenbeck process: dx = theta * (mu - x) + sigma * N(0, 1), with theta = 0.15, sigma = 0.2
        x = self.noise_state
        dx = 0.15 * (0 - x) + 0.2 * np.random.randn(len(x))
        self.noise_state = x + dx
        return self.noise_state

    def calcError(self, observation, new_observation, reward, action):
        """
         Calculates the error that determines the usefulness of a memory.
         Transitions with high error are more useful for training.
        Args:
         observation: the old state
         new_observation: the current state
         reward: the reward received
         action: the action that was taken
        Returns:
         error: the difference between prediction and label
        """
        prediction = self.critic.predict(observation, action,
                                         self.critic.prediction)
        label = reward + self.REWARD_DISCOUNT * self.critic.predict(
            new_observation, action, self.critic.target_prediction)
        error = abs(label - prediction)
        return error

    def summarize(self, episode, episode_reward, observation, new_observation,
                  reward, done):
        next_action = self.actor.predict(new_observation,
                                         self.actor.target_prediction)
        feed_dict = {
            self.critic.input_pl: new_observation,
            self.critic.actions_pl: next_action,
            self.reward_pl: [[reward]],
            self.done_pl: [[done]]
        }
        label = self.sess.run(self.labels, feed_dict=feed_dict)
        feed_dict[self.critic.labels_pl] = label
        #sometimes the reward is an array and sometimes a scalar
        if isinstance(episode_reward, np.ndarray):
            episode_reward = max(episode_reward)
        feed_dict[self.log_reward_pl] = episode_reward
        summary = self.sess.run(self.merged, feed_dict=feed_dict)
        self.writer.add_summary(summary, episode)

    def train_with_batch(self, current_step):
        """
         Call train_step with a sample batch from the replay memory
        Args:
         current_step: the current training step, used to decide when to print debug output
        """
        observations, actions, rewards, new_observations, dones = self.replay_memory.sample()
        #all of this requires ~3 seconds of computational time
        #improve the Q-Network
        next_actions = self.actor.predict(new_observations,
                                          self.actor.prediction)
        feed_dict = {
            self.critic.input_pl: new_observations,
            self.critic.actions_pl: next_actions,
            self.reward_pl: rewards,
            self.done_pl: dones
        }
        labels = self.sess.run(self.labels, feed_dict=feed_dict)
        self.critic.train(observations, actions, labels)
        actions = self.actor.predict(observations, self.actor.prediction)
        gradients = self.critic.get_gradients(observations, actions)
        #improve the policy with the calculated gradients
        self.actor.train(observations, gradients)
        #Update both target networks
        #requires ~1 second of time
        self.sess.run(self.actor_target_update)
        self.sess.run(self.critic_target_update)
        #Print debug information if verbose
        if current_step % 500 == 0 and self.VERBOSE:
            print("Observations: ", observations)
            print("Predicted Best-Actions: ", actions)
            print("Labels: ", labels)
            print("Gradients: ", gradients)
Example #6
class DDPGAgent:
    def __init__(self,
                 state_size=28,
                 action_size=2,
                 gamma=0.9,
                 learning_rate_actor=0.0001,
                 learning_rate_critic=0.01,
                 tau=0.001,
                 action_max=[1000, 2],
                 batch_size=32):
        self.state_size = state_size
        self.action_size = action_size
        self.action_max = action_max
        self.batch_size = batch_size
        self.memory = deque(maxlen=5000)
        self.gamma = gamma  # discount rate
        self.learning_rate_actor = learning_rate_actor  # learning rate
        self.learning_rate_critic = learning_rate_critic
        self.tau = tau  # target transfer factor
        self.gpu_options = tf.GPUOptions()
        self.config = tf.ConfigProto(gpu_options=self.gpu_options)
        self.config.gpu_options.allow_growth = True
        self.sess = tf.Session(config=self.config)
        K.set_session(self.sess)
        self.actor = Actor(state_size=self.state_size,
                           action_size=self.action_size,
                           learning_rate=self.learning_rate_actor,
                           tau=self.tau,
                           sess=self.sess,
                           batch_size=self.batch_size,
                           action_max=self.action_max)
        self.critic = Critic(state_size=self.state_size,
                             action_size=self.action_size,
                             learning_rate=self.learning_rate_critic,
                             gamma=self.gamma,
                             tau=self.tau,
                             sess=self.sess,
                             batch_size=self.batch_size)
        self.grad_avg = 0
        self.grad_a = []
        self.critic_loss_a = []
        #self.critic_2 = Critic_2(self.state_size, self.action_size, self.learning_rate_critic, self.gamma, self.tau, self.sess)

    def policy_action(self, state):
        '''
        Actor predicts new action
        :param state:
        :return: action
        '''
        return self.actor.predict(state)[0]

    def replay(self, batch_size):
        minibatch = random.sample(self.memory, batch_size)
        states = np.asarray([e[0] for e in minibatch])
        actions = np.asarray([e[1] for e in minibatch])
        rewards = np.asarray([e[2] for e in minibatch])
        next_states = np.asarray([e[3] for e in minibatch])

        states = np.asarray(states).reshape(batch_size, self.state_size)
        actions = np.asarray(actions).reshape(batch_size, self.action_size)
        rewards = np.asarray(rewards).reshape(batch_size, 1)
        next_states = np.asarray(next_states).reshape(batch_size,
                                                      self.state_size)
        tar_pre = self.actor.target_predict(next_states)
        Qvals = self.critic.target_predict(next_states, tar_pre)
        Q_primes = rewards + (self.gamma * Qvals)  # Bellman equation
        self.update_models(states, actions, Q_primes)

    def update_models(self, states, actions, critic_target):
        '''
        Update actor and critic networks from sampled experience
        :param states:
        :param actions:
        :param critic_target:
        :return:
        '''
        loss = self.critic.train_on_batch(states, actions,
                                          critic_target)  # Train Critic
        self.critic_loss_a.append(loss)
        # loss = np.sum(-np.log10(loss), axis=0)
        act = self.actor.predict(
            states)  # Q Value Gradient under Current Policy
        grads = self.critic.gradients(states, act)  # actor loss

        self.grad_avg += np.sum(np.log10(np.absolute(grads)),
                                axis=0) / self.batch_size
        self.grad_a = np.append(self.grad_a,
                                np.sum(np.absolute(grads), axis=0) /
                                self.batch_size,
                                axis=0)
        # print('grad_a:', self.grad_a)

        self.actor.train_2(states, grads.reshape(
            (-1, self.action_size)))  # Train actor

        self.actor.transfer_to_actor_model(
        )  # Transfer weights to target networks at rate tau
        self.critic.transfer_to_critic_model()

    def remember(self, state, action, reward, next_state):
        self.memory.append((state, action, reward, next_state))

    def save_weights(self, directory, params):
        path_actor = directory + 'Weights' + params + '_LR{}'.format(
            self.learning_rate_actor)
        path_critic = directory + 'Weights' + params + '_LR{}'.format(
            self.learning_rate_critic)
        self.actor.save(path_actor)
        self.critic.save(path_critic)

    def load_weights(self, path_actor, path_critic):
        self.actor.load_weigths(path_actor)
        self.critic.load_weights(path_critic)

    def load_model(self, path_actor, path_critic):
        self.actor.model.load_model(path_actor)
        self.critic.model.load_model(path_critic)
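One caveat in Example #6: remember() does not store a done flag, so replay() bootstraps rewards + gamma * Q_target(s', mu_target(s')) even through terminal transitions. If the environment has real terminal states, the usual fix is to store the flag and mask the bootstrap term. A sketch of that variant (the extended signatures are an assumption, not the original API):

    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def replay(self, batch_size):
        minibatch = random.sample(self.memory, batch_size)
        states = np.asarray([e[0] for e in minibatch]).reshape(batch_size, self.state_size)
        actions = np.asarray([e[1] for e in minibatch]).reshape(batch_size, self.action_size)
        rewards = np.asarray([e[2] for e in minibatch]).reshape(batch_size, 1)
        next_states = np.asarray([e[3] for e in minibatch]).reshape(batch_size, self.state_size)
        not_done = 1.0 - np.asarray([e[4] for e in minibatch], dtype=np.float32).reshape(batch_size, 1)

        tar_pre = self.actor.target_predict(next_states)
        Qvals = self.critic.target_predict(next_states, tar_pre)
        Q_primes = rewards + self.gamma * Qvals * not_done  # no bootstrap at terminal states
        self.update_models(states, actions, Q_primes)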
Example #7
def train():

    env = gym.make('LunarLander-v2')

    state = env.reset()

    actor = Actor(env.action_space, env.observation_space)

    critic = Critic(env.action_space, env.observation_space)

    actor.load()
    critic.load()

    replayMemory = ReplayMemory()

    summary_ops, summary_vars = build_summaries()

    writer = tf.summary.FileWriter("./log", tf.Session().graph)

    episode_reward = 0

    step = 1

    while True:

        #env.render()

        state1 = state[np.newaxis, :]

        action, action_matrix, prob = actor.predict(state1)

        next_state, reward, done, info = env.step(action)

        replayMemory.add(state, action_matrix, reward, done, next_state, prob)

        state = next_state

        episode_reward += reward

        #train
        if replayMemory.size() % 128 == 0 or done:

            state_b, action_matrix_b, reward_b, done_b, next_state_b, prob_b = replayMemory.miniAll()

            reward_b = reward_b[:, np.newaxis]

            c_pre = critic.predict(next_state_b)

            # bootstrapped value target: r + gamma * V(s'), with gamma = 0.7
            state_pre_value = reward_b + c_pre * 0.7

            state_value = critic.predict(state_b)

            count = 5000 // step

            if count > 500:
                count = 500

            if count < 1:
                count = 1

            count = 10  # NOTE: overrides the clamped value above; both networks always get 10 update passes

            for _ in range(count):
                critic.train(state_b, state_pre_value)

            for _ in range(count):
                actor.train(state_b, state_value, state_pre_value,
                            action_matrix_b, prob_b)

            replayMemory.clear()
        ########################

        if done:

            # NOTE: tf.Session() opens a fresh session each episode; reusing a single session would be preferable
            summary_str = tf.Session().run(
                summary_ops, feed_dict={summary_vars[0]: episode_reward})
            writer.add_summary(summary_str, step)
            writer.flush()

            ##print("step = ", step, "episode_reward = ", episode_reward)

            state = env.reset()

            episode_reward = 0

            step += 1

            if step % 25 == 0:
                actor.save()
                critic.save()
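Like Example #4, the target state_pre_value = reward_b + 0.7 * c_pre above bootstraps through terminal states even though done_b is available in the batch. A small variant that masks the bootstrap term, assuming done_b is a 1-D boolean array (an assumption; miniAll() is not shown):

# variant of the target computation above that stops bootstrapping at terminal states
not_done = (1.0 - done_b.astype(np.float32))[:, np.newaxis]
state_pre_value = reward_b + 0.7 * c_pre * not_done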