Example #1
def play():
    print("play")

    env = gym.make('LunarLander-v2')

    state = env.reset()

    actor = Actor(env.action_space, env.observation_space)

    actor.load()

    #critic = Critic(env.action_space, env.observation_space)

    #replayMemory = ReplayMemory()

    #summary_ops, summary_vars = build_summaries()

    #writer = tf.summary.FileWriter("./log", tf.Session().graph)

    #episode_reward = 0

    #step = 1

    while True:

        env.render()

        state1 = state[np.newaxis, :]

        action, action_matrix, prob = actor.predict(state1)

        next_state, reward, done, info = env.step(action)

        #replayMemory.add(state, action_matrix, reward, done, next_state, prob)

        state = next_state

        if done:
            #summary_str = tf.Session().run(summary_ops, feed_dict={summary_vars[0]: episode_reward})
            #writer.add_summary(summary_str, step)
            #writer.flush()
            state = env.reset()

    return 0
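The snippets on this page are excerpts and omit their import headers. A minimal header that Example #1 appears to rely on could look like the sketch below; the project-local module names are assumptions, only the imported class and function names actually appear in the code above.

import gym
import numpy as np
import tensorflow as tf  # TF 1.x API (tf.Session, tf.summary.FileWriter)

# project-local modules; module names are assumed, only the imported names appear in the snippets
from actor import Actor
from critic import Critic
from replay_memory import ReplayMemory
from summaries import build_summaries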
Example #2
actor = Actor(sess, state_size, action_size)
critic = Critic(sess, state_size, action_size)
buffer = ReplayBuffer(BUFFER_SIZE)


# env and sess are assumed to be created earlier in the script (e.g. env = gym.make('Pendulum-v0'))
env.monitor.start('experiments/' + 'Pendulum-v0', force=True)

for ep in range(10000):
    state = env.reset()
    total = 0  # cumulative episode reward
    # what if the action is beyond the scope?
    for iteration in range(100):
        # select the action with actor model.
        env.render()

        action = actor.predict([state])[0] + (np.random.randn(1)/(ep + iteration + 1))

        newState, reward, terminated, _ = env.step(action)
        total += reward
        buffer.add(state, action, reward, newState, terminated) #state, action, reward, new_state, done


        # update critic
        batch = buffer.getBatch(batch_size=BATCH_SIZE)
        states = np.array([e[0] for e in batch])
        actions = np.array([e[1] for e in batch])
        rewards = np.array([e[2] for e in batch])
        newStates = np.array([e[3] for e in batch])
        notTerminated = np.array([1.-e[4] for e in batch])

        newStatesScores = critic.target_predict_method(newStates, actor.target_predict_method(newStates))
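Example #2 breaks off right after the target-network Q-values for the next states are computed. In a standard DDPG update those values are folded into Bellman targets, the critic is fit to the targets, and the actor is nudged along the critic's action gradients, followed by soft target updates. A rough sketch of that continuation; the discount factor and every method name below are assumptions, since the snippet does not show the real Actor/Critic interfaces:

        # Bellman targets: y = r + gamma * Q_target(s', mu_target(s')), masked at terminal transitions
        GAMMA = 0.99  # assumed discount factor
        targets = rewards + GAMMA * notTerminated * newStatesScores.reshape(-1)

        # hypothetical method names below
        critic.train_method(states, actions, targets.reshape(-1, 1))   # fit Q(s, a) to the targets
        policyActions = actor.predict(states)                          # a = mu(s) under the current policy
        actionGrads = critic.gradients_method(states, policyActions)   # dQ/da from the critic
        actor.train_method(states, actionGrads)                        # policy-gradient step
        actor.target_update_method()                                   # soft target updates (rate tau)
        critic.target_update_method()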
Example #3
    replayMemory = ReplayMemory()

    summary_ops, summary_vars = build_summaries()

    writer = tf.summary.FileWriter("./log", tf.Session().graph)

    episode_reward = 0

    step = 0

    while True:
        #env.render()

        state1 = state[np.newaxis, :]

        action, action_matrix = actor.predict(state1)

        next_state, reward, done, info = env.step(action)

        replayMemory.add(state, action_matrix, reward, done, next_state)

        state = next_state

        episode_reward += reward

        #train
        if replayMemory.size() % 128 == 0 or done:

            state_b, action_matrix_b, reward_b, done_b, next_state_b = replayMemory.miniAll()

            reward_b = reward_b[:, np.newaxis]
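Examples #3 and #7 call a build_summaries() helper without showing it. In the common TF 1.x logging pattern this code follows (a scalar summary fed from the training loop), it would look roughly like the sketch below; the exact variable names are assumptions.

def build_summaries():
    # a variable that the training loop overwrites (via feed_dict) with the latest episode reward
    episode_reward = tf.Variable(0., name="episode_reward")
    tf.summary.scalar("Reward", episode_reward)
    summary_vars = [episode_reward]
    summary_ops = tf.summary.merge_all()
    return summary_ops, summary_vars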
Example #4
        state1 = state[np.newaxis, :]

        action, action_matrix, prob = actor.act(state1)

        next_state, reward, done, info = env.step(action)

        replayMemory.add(state, action, reward, done, next_state, prob)

        state = next_state

        episode_reward += reward
        ##############################train######################
        if replayMemory.size() >= 128:
            state_b, action_b, reward_b, done_b, next_state_b, prob_b = replayMemory.miniBatch(64)
            next_state_b_value = actor.predict(next_state_b)
            state_b_value = actor.predict(state_b)
            length = state_b.shape[0]

            # Q-learning target: r + gamma * max_a' Q(s', a'), with gamma = 0.7
            for i in range(length):
                target_next = reward_b[i]
                if not done_b[i]:
                    action_values = next_state_b_value[i]
                    target_next = reward_b[i] + 0.7 * np.amax(action_values)
                state_b_value[i][action_b[i]] = target_next
            actor.train(state_b, state_b_value)

        if done:
            summary_str = tf.Session().run(
                summary_ops, feed_dict={summary_vars[0]: episode_reward})
            writer.add_summary(summary_str, step)
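The loop above builds the standard Q-learning target r + gamma * max_a' Q(s', a') with gamma = 0.7. Assuming reward_b, done_b and action_b are 1-D arrays of batch length, the same targets can be computed in one vectorized step:

# equivalent vectorized form of the target loop in Example #4 (gamma = 0.7)
targets = reward_b + 0.7 * np.amax(next_state_b_value, axis=1) * (1.0 - done_b)
state_b_value[np.arange(length), action_b] = targets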
Example #5
class Agent:
    def __init__(self, env, sess, LEARNING_RATE_ACTOR, LEARNING_RATE_CRITIC,
                 NET_SIZE, MEMORY_LEN, REWARD_DISCOUNT, BATCH_SIZE, TAU,
                 EXPLORATION_STEPS, VERBOSE, LOG_DIR_TF):
        self.env = env
        self.sess = sess
        self.observation_space = self.env.observation_space.shape[0]
        self.action_space = self.env.action_space.shape[0]
        self.REWARD_DISCOUNT = REWARD_DISCOUNT
        self.TAU = TAU
        self.BATCH_SIZE = BATCH_SIZE
        self.noise_state = np.zeros(self.action_space)
        self.EXPLORATION_STEPS = EXPLORATION_STEPS
        self.VERBOSE = VERBOSE
        self.LOG_DIR_TF = LOG_DIR_TF
        #check if action_space is symmetric
        if all(env.action_space.high == abs(env.action_space.low)):
            action_scale = env.action_space.high
        else:
            raise ActionSpaceNotSymmetricException
        self.actor = Actor(self.sess, self.observation_space,
                           self.action_space, LEARNING_RATE_ACTOR, NET_SIZE,
                           TAU, action_scale)
        self.critic = Critic(self.sess, self.observation_space,
                             self.action_space, LEARNING_RATE_CRITIC, NET_SIZE,
                             TAU)
        actor_network_variables = self.actor.network.get_variables()
        critic_q_net_variables = self.critic.q_net.get_variables()
        self.actor_target_update = self.actor.target_network.update_variables(
            actor_network_variables)
        self.critic_target_update = self.critic.target_q_net.update_variables(
            critic_q_net_variables)
        self.reward_pl = tf.placeholder(tf.float32, [None, 1],
                                        name='Reward_PL')
        self.done_pl = tf.placeholder(tf.bool, [None, 1], name='Done_PL')
        self.labels = tf.where(
            self.done_pl, self.reward_pl, self.reward_pl +
            tf.multiply(self.REWARD_DISCOUNT, self.critic.target_prediction))
        #self.replay_memory = ReplayMemory(MEMORY_LEN, BATCH_SIZE)
        self.replay_memory = ReplayMemory(MEMORY_LEN, BATCH_SIZE,
                                          self.observation_space,
                                          self.action_space)
        self.log_reward_pl = tf.placeholder(tf.float32, name='Reward_log_pl')
        self.reward_f = tf.add(0.0, self.log_reward_pl)
        tf.summary.scalar('reward', self.reward_f)
        init = tf.global_variables_initializer()
        self.sess.run(init)
        self.sess.run(self.actor.network.copy_to(self.actor.target_network))
        self.sess.run(self.critic.q_net.copy_to(self.critic.target_q_net))
        self.writer = tf.summary.FileWriter(self.LOG_DIR_TF, self.sess.graph)
        self.merged = tf.summary.merge_all()

    def select_action(self, observation, current_step):
        action = self.actor.predict(observation, self.actor.prediction)
        if current_step <= self.EXPLORATION_STEPS:
            noise = self.noise()
        else:
            noise = 0
        return action + noise

    def noise(self):
        # Ornstein-Uhlenbeck process: dx = theta * (mu - x) + sigma * N(0, 1), with theta = 0.15, sigma = 0.2
        x = self.noise_state
        dx = 0.15 * (0 - x) + 0.2 * np.random.randn(len(x))
        self.noise_state = x + dx
        return self.noise_state

    def calcError(self, observation, new_observation, reward, action):
        """
         Calculates the error that determines the usefulness of a memory.
         Transitions with high error are more useful for training.
        Args:
         observation: the old state
         new_observation: the current state
         reward: the reward received
         action: the action that was taken
        Returns:
         error: the difference between prediction and label
        """
        prediction = self.critic.predict(observation, action,
                                         self.critic.prediction)
        label = reward + self.REWARD_DISCOUNT * self.critic.predict(
            new_observation, action, self.critic.target_prediction)
        error = abs(label - prediction)
        return error

    def summarize(self, episode, episode_reward, observation, new_observation,
                  reward, done):
        next_action = self.actor.predict(new_observation,
                                         self.actor.target_prediction)
        feed_dict = {
            self.critic.input_pl: new_observation,
            self.critic.actions_pl: next_action,
            self.reward_pl: [[reward]],
            self.done_pl: [[done]]
        }
        label = self.sess.run(self.labels, feed_dict=feed_dict)
        feed_dict[self.critic.labels_pl] = label
        #sometimes the reward is an array and sometimes a scalar
        if isinstance(episode_reward, np.ndarray):
            episode_reward = max(episode_reward)
        feed_dict[self.log_reward_pl] = episode_reward
        summary = self.sess.run(self.merged, feed_dict=feed_dict)
        self.writer.add_summary(summary, episode)

    def train_with_batch(self, current_step):
        """
         Call train_step with a sample batch from the replay memory
        Args:
         current_step: the current training step, used to decide when to print debug output
        """
        observations, actions, rewards, new_observations, dones = self.replay_memory.sample()
        #all of this requires ~3 seconds of computational time
        #improve the Q-Network
        next_actions = self.actor.predict(new_observations,
                                          self.actor.prediction)
        feed_dict = {
            self.critic.input_pl: new_observations,
            self.critic.actions_pl: next_actions,
            self.reward_pl: rewards,
            self.done_pl: dones
        }
        labels = self.sess.run(self.labels, feed_dict=feed_dict)
        self.critic.train(observations, actions, labels)
        actions = self.actor.predict(observations, self.actor.prediction)
        gradients = self.critic.get_gradients(observations, actions)
        #improve the policy with the calculated gradients
        self.actor.train(observations, gradients)
        #Update both target networks
        #requires ~1 second of time
        self.sess.run(self.actor_target_update)
        self.sess.run(self.critic_target_update)
        #Print debug information if verbose
        if current_step % 500 == 0 and self.VERBOSE:
            print("Observations: ", observations)
            print("Predicted Best-Actions: ", actions)
            print("Labels: ", labels)
            print("Gradients: ", gradients)
Example #6
class DDPGAgent:
    def __init__(self,
                 state_size=28,
                 action_size=2,
                 gamma=0.9,
                 learning_rate_actor=0.0001,
                 learning_rate_critic=0.01,
                 tau=0.001,
                 action_max=[1000, 2],
                 batch_size=32):
        self.state_size = state_size
        self.action_size = action_size
        self.action_max = action_max
        self.batch_size = batch_size
        self.memory = deque(maxlen=5000)
        self.gamma = gamma  # discount rate
        self.learning_rate_actor = learning_rate_actor  # learning rate
        self.learning_rate_critic = learning_rate_critic
        self.tau = tau  # target transfer factor
        self.gpu_options = tf.GPUOptions()
        self.config = tf.ConfigProto(gpu_options=self.gpu_options)
        self.config.gpu_options.allow_growth = True
        self.sess = tf.Session(config=self.config)
        K.set_session(self.sess)
        self.actor = Actor(state_size=self.state_size,
                           action_size=self.action_size,
                           learning_rate=self.learning_rate_actor,
                           tau=self.tau,
                           sess=self.sess,
                           batch_size=self.batch_size,
                           action_max=self.action_max)
        self.critic = Critic(state_size=self.state_size,
                             action_size=self.action_size,
                             learning_rate=self.learning_rate_critic,
                             gamma=self.gamma,
                             tau=self.tau,
                             sess=self.sess,
                             batch_size=self.batch_size)
        self.grad_avg = 0
        self.grad_a = []
        self.critic_loss_a = []
        #self.critic_2 = Critic_2(self.state_size, self.action_size, self.learning_rate_critic, self.gamma, self.tau, self.sess)

    def policy_action(self, state):
        '''
        Actor predicts new action
        :param state:
        :return: action
        '''
        return self.actor.predict(state)[0]

    def replay(self, batch_size):
        minibatch = random.sample(self.memory, batch_size)
        states = np.asarray([e[0] for e in minibatch])
        actions = np.asarray([e[1] for e in minibatch])
        rewards = np.asarray([e[2] for e in minibatch])
        next_states = np.asarray([e[3] for e in minibatch])

        states = np.asarray(states).reshape(batch_size, self.state_size)
        actions = np.asarray(actions).reshape(batch_size, self.action_size)
        rewards = np.asarray(rewards).reshape(batch_size, 1)
        next_states = np.asarray(next_states).reshape(batch_size,
                                                      self.state_size)
        tar_pre = self.actor.target_predict(next_states)
        Qvals = self.critic.target_predict(next_states, tar_pre)
        Q_primes = rewards + (self.gamma * Qvals)  # Bellman equation
        self.update_models(states, actions, Q_primes)

    def update_models(self, states, actions, critic_target):
        '''
        Update actor and critic networks from sampled experience
        :param states:
        :param actions:
        :param critic_target:
        :return:
        '''
        loss = self.critic.train_on_batch(states, actions,
                                          critic_target)  # Train Critic
        self.critic_loss_a.append(loss)
        # loss = np.sum(-np.log10(loss), axis=0)
        act = self.actor.predict(
            states)  # Q Value Gradient under Current Policy
        grads = self.critic.gradients(states, act)  # actor loss

        self.grad_avg += np.sum(np.log10(np.absolute(grads)),
                                axis=0) / self.batch_size
        self.grad_a = np.append(self.grad_a,
                                np.sum(np.absolute(grads), axis=0) /
                                self.batch_size,
                                axis=0)
        # print('grad_a:', self.grad_a)

        self.actor.train_2(states, grads.reshape(
            (-1, self.action_size)))  # Train actor

        self.actor.transfer_to_actor_model(
        )  # Transfer weights to target networks at rate tau
        self.critic.transfer_to_critic_model()

    def remember(self, state, action, reward, next_state):
        self.memory.append((state, action, reward, next_state))

    def save_weights(self, directory, params):
        path_actor = directory + 'Weights' + params + '_LR{}'.format(
            self.learning_rate_actor)
        path_critic = directory + 'Weights' + params + '_LR{}'.format(
            self.learning_rate_critic)
        self.actor.save(path_actor)
        self.critic.save(path_critic)

    def load_weights(self, path_actor, path_critic):
        self.actor.load_weigths(path_actor)
        self.critic.load_weights(path_critic)

    def load_model(self, path_actor, path_critic):
        self.actor.model.load_model(path_actor)
        self.critic.model.load_model(path_critic)
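One caveat in Example #6: remember() does not store a done flag, so replay() bootstraps rewards + gamma * Q_target(s', mu_target(s')) even through terminal transitions. If the environment has real terminal states, the usual fix is to store the flag and mask the bootstrap term. A sketch of that variant (the extended signatures are an assumption, not the original API):

    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def replay(self, batch_size):
        minibatch = random.sample(self.memory, batch_size)
        states = np.asarray([e[0] for e in minibatch]).reshape(batch_size, self.state_size)
        actions = np.asarray([e[1] for e in minibatch]).reshape(batch_size, self.action_size)
        rewards = np.asarray([e[2] for e in minibatch]).reshape(batch_size, 1)
        next_states = np.asarray([e[3] for e in minibatch]).reshape(batch_size, self.state_size)
        not_done = 1.0 - np.asarray([e[4] for e in minibatch], dtype=np.float32).reshape(batch_size, 1)

        tar_pre = self.actor.target_predict(next_states)
        Qvals = self.critic.target_predict(next_states, tar_pre)
        Q_primes = rewards + self.gamma * Qvals * not_done  # no bootstrap at terminal states
        self.update_models(states, actions, Q_primes)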
Example #7
def train():

    env = gym.make('LunarLander-v2')

    state = env.reset()

    actor = Actor(env.action_space, env.observation_space)

    critic = Critic(env.action_space, env.observation_space)

    actor.load()
    critic.load()

    replayMemory = ReplayMemory()

    summary_ops, summary_vars = build_summaries()

    writer = tf.summary.FileWriter("./log", tf.Session().graph)

    episode_reward = 0

    step = 1

    while True:

        #env.render()

        state1 = state[np.newaxis, :]

        action, action_matrix, prob = actor.predict(state1)

        next_state, reward, done, info = env.step(action)

        replayMemory.add(state, action_matrix, reward, done, next_state, prob)

        state = next_state

        episode_reward += reward

        #train
        if replayMemory.size() % 128 == 0 or done:

            state_b, action_matrix_b, reward_b, done_b, next_state_b, prob_b = replayMemory.miniAll()

            reward_b = reward_b[:, np.newaxis]

            c_pre = critic.predict(next_state_b)

            # bootstrapped value target: r + gamma * V(s'), with gamma = 0.7
            state_pre_value = reward_b + c_pre * 0.7

            state_value = critic.predict(state_b)

            count = 5000 // step

            if count > 500:
                count = 500

            if count < 1:
                count = 1

            count = 10  # NOTE: overrides the clamped value above; both networks always get 10 update passes

            for _ in range(count):
                critic.train(state_b, state_pre_value)

            for _ in range(count):
                actor.train(state_b, state_value, state_pre_value,
                            action_matrix_b, prob_b)

            replayMemory.clear()
        ########################

        if done:

            # NOTE: tf.Session() opens a fresh session each episode; reusing a single session would be preferable
            summary_str = tf.Session().run(
                summary_ops, feed_dict={summary_vars[0]: episode_reward})
            writer.add_summary(summary_str, step)
            writer.flush()

            ##print("step = ", step, "episode_reward = ", episode_reward)

            state = env.reset()

            episode_reward = 0

            step += 1

            if step % 25 == 0:
                actor.save()
                critic.save()
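Like Example #4, the target state_pre_value = reward_b + 0.7 * c_pre above bootstraps through terminal states even though done_b is available in the batch. A small variant that masks the bootstrap term, assuming done_b is a 1-D boolean array (an assumption; miniAll() is not shown):

# variant of the target computation above that stops bootstrapping at terminal states
not_done = (1.0 - done_b.astype(np.float32))[:, np.newaxis]
state_pre_value = reward_b + 0.7 * c_pre * not_done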