Example #1
    def __init__(self, load_model=False, testing=False):
        self.critic = self.build_critic()
        if CONTINUOUS is False:
            self.actor = self.build_actor()
        else:
            self.actor = self.build_actor_continuous()

        self.env = PongEnvironment()

        self.episode = 0
        self.testing = testing
        if not self.testing:
            self.NUM_EPISODE = EPISODES
        else:
            self.NUM_EPISODE = 100
        self.observation = self.env.reset()
        self.val = False
        self.reward = []
        self.reward_over_time = []
        self.gradient_steps = 0
        self.action_noise = NOISE
        self.load_model = load_model

        if self.load_model:
            self.actor.load_weights("./weights/actor_weights.h5")
            self.critic.load_weights("./weights/critic_weights.h5")
Example #2
    def __init__(self):
        self.critic = self.build_critic()
        if CONTINUOUS:
            self.actor = self.build_actor_continuous()
        else:
            self.actor = self.build_actor()

        # self.env = gym.make(ENV)
        self.env = PongEnvironment()
        print(self.env.action_space, 'action_space',
              self.env.observation_space, 'observation_space')
        self.episode = 0
        self.observation = self.env.reset()
        self.val = False
        self.reward = []
        self.reward_over_time = []
        self.name = self.get_name()
        self.writer = SummaryWriter(self.name)
        self.gradient_steps = 0
Example #3
        # Get predictions from the actor
        predicts = self.actor.predict([obs, advantage_values, predictions])

        return np.argmax(predicts[0])

    def play(self, episode):
        obs = self.environment.reset()

        while True:
            action = self.get_action(obs)
            next_obs, _, is_done = self.environment.step(action)

            if episode == TEST_EPISODE_COUNT - 1:
                self.environment.render()

            obs = next_obs

            if is_done:
                break

    def test(self):
        for episode in range(TEST_EPISODE_COUNT):
            self.play(episode)


if __name__ == '__main__':
    agent = Agent(PongEnvironment(True))

    for test in range(TEST_COUNT):
        agent.test()
Example #4
class Agent:
    def __init__(self):
        self.critic = self.build_critic()
        if CONTINUOUS:
            self.actor = self.build_actor_continuous()
        else:
            self.actor = self.build_actor()

        # self.env = gym.make(ENV)
        self.env = PongEnvironment()
        print(self.env.action_space, 'action_space',
              self.env.observation_space, 'observation_space')
        self.episode = 0
        self.observation = self.env.reset()
        self.val = False
        self.reward = []
        self.reward_over_time = []
        self.name = self.get_name()
        self.writer = SummaryWriter(self.name)
        self.gradient_steps = 0

    def get_name(self):
        name = 'AllRuns/'
        if CONTINUOUS:
            name += 'continuous/'
        else:
            name += 'discrete/'
        name += ENV
        return name

    def build_actor(self):
        state_input = Input(shape=(NUM_STATE, ))
        advantage = Input(shape=(1, ))
        old_prediction = Input(shape=(NUM_ACTIONS, ))

        x = Dense(HIDDEN_SIZE, activation='tanh')(state_input)
        for _ in range(NUM_LAYERS - 1):
            x = Dense(HIDDEN_SIZE, activation='tanh')(x)

        out_actions = Dense(NUM_ACTIONS, activation='softmax',
                            name='output')(x)

        model = Model(inputs=[state_input, advantage, old_prediction],
                      outputs=[out_actions])
        model.compile(optimizer=Adam(lr=LR),
                      loss=[
                          proximal_policy_optimization_loss(
                              advantage=advantage,
                              old_prediction=old_prediction)
                      ])
        model.summary()

        return model

    def build_actor_continuous(self):
        state_input = Input(shape=(NUM_STATE, ))
        advantage = Input(shape=(1, ))
        old_prediction = Input(shape=(NUM_ACTIONS, ))

        x = Dense(HIDDEN_SIZE, activation='tanh')(state_input)
        for _ in range(NUM_LAYERS - 1):
            x = Dense(HIDDEN_SIZE, activation='tanh')(x)

        out_actions = Dense(NUM_ACTIONS, name='output', activation='tanh')(x)

        model = Model(inputs=[state_input, advantage, old_prediction],
                      outputs=[out_actions])
        model.compile(optimizer=Adam(lr=LR),
                      loss=[
                          proximal_policy_optimization_loss_continuous(
                              advantage=advantage,
                              old_prediction=old_prediction)
                      ])
        model.summary()

        return model

    def build_critic(self):

        state_input = Input(shape=(NUM_STATE, ))
        x = Dense(HIDDEN_SIZE, activation='tanh')(state_input)
        for _ in range(NUM_LAYERS - 1):
            x = Dense(HIDDEN_SIZE, activation='tanh')(x)

        out_value = Dense(1)(x)

        model = Model(inputs=[state_input], outputs=[out_value])
        model.compile(optimizer=Adam(lr=LR), loss='mse')

        return model

    def reset_env(self):
        self.episode += 1
        if self.episode % 100 == 0:
            self.val = True
        else:
            self.val = False
        self.observation = self.env.reset()
        self.reward = []

    def get_action(self):
        p = self.actor.predict([
            self.observation.reshape(1, NUM_STATE), DUMMY_VALUE, DUMMY_ACTION
        ])
        if self.val is False:
            action = np.random.choice(NUM_ACTIONS, p=np.nan_to_num(p[0]))
        else:
            action = np.argmax(p[0])
        action_matrix = np.zeros(NUM_ACTIONS)
        action_matrix[action] = 1
        return action, action_matrix, p

    def get_action_continuous(self):
        p = self.actor.predict([
            self.observation.reshape(1, NUM_STATE), DUMMY_VALUE, DUMMY_ACTION
        ])
        if self.val is False:
            action = action_matrix = p[0] + np.random.normal(
                loc=0, scale=NOISE, size=p[0].shape)
        else:
            action = action_matrix = p[0]
        return action, action_matrix, p

    def transform_reward(self):
        if self.val is True:
            self.writer.add_scalar('Val episode reward',
                                   np.array(self.reward).sum(), self.episode)
        else:
            self.writer.add_scalar('Episode reward',
                                   np.array(self.reward).sum(), self.episode)
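        # Discount rewards backwards in time: reward[j] <- r_j + GAMMA * reward[j+1],
        # so each entry becomes the discounted return from that step onward.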
        for j in range(len(self.reward) - 2, -1, -1):
            self.reward[j] += self.reward[j + 1] * GAMMA

    def get_batch(self):
        batch = [[], [], [], []]

        tmp_batch = [[], [], []]
        untransformed_reward = list()
        while len(batch[0]) < BUFFER_SIZE:
            if CONTINUOUS:
                action, action_matrix, predicted_action = self.get_action_continuous()
            else:
                action, action_matrix, predicted_action = self.get_action()
            if self.gradient_steps % RENDER_EACH == 0:
                self.env.render()
            observation, reward, done = self.env.step(action)
            # observation, reward, done, info = self.env.step(action)
            self.reward.append(reward)
            untransformed_reward.append(reward)

            tmp_batch[0].append(self.observation)
            tmp_batch[1].append(action_matrix)
            tmp_batch[2].append(predicted_action)
            self.observation = observation

            if done:
                self.transform_reward()
                if self.val is False:
                    for i in range(len(tmp_batch[0])):
                        obs, action, pred = (tmp_batch[0][i], tmp_batch[1][i],
                                             tmp_batch[2][i])
                        r = self.reward[i]
                        batch[0].append(obs)
                        batch[1].append(action)
                        batch[2].append(pred)
                        batch[3].append(r)
                tmp_batch = [[], [], []]
                self.reset_env()

        obs, action, pred = np.array(batch[0]), np.array(batch[1]), np.array(batch[2])
        reward = np.reshape(np.array(batch[3]), (len(batch[3]), 1))
        pred = np.reshape(pred, (pred.shape[0], pred.shape[2]))
        return obs, action, pred, reward, untransformed_reward

    def run(self):
        while self.episode < EPISODES:
            obs, action, pred, reward, untransformed_reward = self.get_batch()
            obs, action, pred, reward = (obs[:BUFFER_SIZE], action[:BUFFER_SIZE],
                                         pred[:BUFFER_SIZE], reward[:BUFFER_SIZE])
            old_prediction = pred
            pred_values = self.critic.predict(obs)

            advantage = reward - pred_values
            # advantage = (advantage - advantage.mean()) / advantage.std()
            actor_loss = self.actor.fit([obs, advantage, old_prediction],
                                        [action],
                                        batch_size=BATCH_SIZE,
                                        shuffle=True,
                                        epochs=EPOCHS,
                                        verbose=False)
            critic_loss = self.critic.fit([obs], [reward],
                                          batch_size=BATCH_SIZE,
                                          shuffle=True,
                                          epochs=EPOCHS,
                                          verbose=False)
            self.writer.add_scalar('Actor loss',
                                   actor_loss.history['loss'][-1],
                                   self.gradient_steps)
            self.writer.add_scalar('Critic loss',
                                   critic_loss.history['loss'][-1],
                                   self.gradient_steps)
            # print("Gradient Update:", self.gradient_steps, " Reward: ",sum(reward))
            print(
                f"E: {self.episode}\tgrad. update: {self.gradient_steps}\tReward: {sum(untransformed_reward)}"
            )
            self.gradient_steps += 1
Example #5
import sys
import time
import pickle

sys.path.append("Environment")
from Environment import PongEnvironment

from PPO import *

if __name__ == '__main__':
    env = PongEnvironment(False)
    ppo = PPO(env, num_states=len(env.observe()), actions=np.arange(3))
    ppo.saver.restore(ppo.sess, "model_res/Pong_model.ckpt")
    actions = set()
    all_scores = list()
    for trial in range(10):
        score = 0
        for i in range(100):
            ep_actions = list()
            done = False
            s = env.reset()
            while not done:
                a = ppo.sess.run(
                    ppo.action,
                    {ppo.in_state: s.reshape(-1, ppo.state_space)})[0]
                ep_actions.append(a)
                #env.render()
                #time.sleep(1e-3)
                try:
                    s, r, done = env.step(a)
                except ValueError:
Example #6
class Agent:
    def __init__(self, load_model=False, testing=False):
        self.critic = self.build_critic()
        if CONTINUOUS is False:
            self.actor = self.build_actor()
        else:
            self.actor = self.build_actor_continuous()

        self.env = PongEnvironment()

        self.episode = 0
        self.testing = testing
        if not self.testing:
            self.NUM_EPISODE = EPISODES
        else:
            self.NUM_EPISODE = 100
        self.observation = self.env.reset()
        self.val = False
        self.reward = []
        self.reward_over_time = []
        self.gradient_steps = 0
        self.action_noise = NOISE
        self.load_model = load_model

        if self.load_model:
            self.actor.load_weights("./weights/actor_weights.h5")
            self.critic.load_weights("./weights/critic_weights.h5")

    def build_actor(self):
        state_input = Input(shape=(NUM_STATE, ))
        advantage = Input(shape=(1, ))
        old_prediction = Input(shape=(NUM_ACTIONS, ))

        x = Dense(HIDDEN_SIZE, activation='tanh')(state_input)
        for _ in range(NUM_LAYERS - 1):
            x = Dense(HIDDEN_SIZE, activation='tanh')(x)

        out_actions = Dense(NUM_ACTIONS, activation='softmax',
                            name='output')(x)

        model = Model(inputs=[state_input, advantage, old_prediction],
                      outputs=[out_actions])
        model.compile(optimizer=Adam(lr=LR),
                      loss=[
                          self.proximal_policy_optimization_loss(
                              advantage=advantage,
                              old_prediction=old_prediction)
                      ])
        model.summary()

        return model

    def build_actor_continuous(self):
        state_input = Input(shape=(NUM_STATE, ))
        advantage = Input(shape=(1, ))
        old_prediction = Input(shape=(NUM_ACTIONS, ))

        x = Dense(HIDDEN_SIZE, activation='tanh')(state_input)
        for _ in range(NUM_LAYERS - 1):
            x = Dense(HIDDEN_SIZE, activation='tanh')(x)

        out_actions = Dense(NUM_ACTIONS, name='output', activation='tanh')(x)

        model = Model(inputs=[state_input, advantage, old_prediction],
                      outputs=[out_actions])
        model.compile(optimizer=Adam(lr=LR),
                      loss=[
                          self.proximal_policy_optimization_loss_continuous(
                              advantage=advantage,
                              old_prediction=old_prediction)
                      ])
        model.summary()

        return model

    def build_critic(self):

        state_input = Input(shape=(NUM_STATE, ))
        x = Dense(HIDDEN_SIZE, activation='tanh')(state_input)
        for _ in range(NUM_LAYERS - 1):
            x = Dense(HIDDEN_SIZE, activation='tanh')(x)

        out_value = Dense(1)(x)

        model = Model(inputs=[state_input], outputs=[out_value])
        model.compile(optimizer=Adam(lr=LR), loss='mse')

        return model

    def reset_env(self):
        self.episode += 1
        if self.episode % VALIDATION_EACH == 0:
            self.val = True
        else:
            self.val = False
        self.observation = self.env.reset()
        self.reward = []

    def get_action(self):
        p = self.actor.predict([
            self.observation.reshape(1, NUM_STATE), DUMMY_VALUE, DUMMY_ACTION
        ])
        if self.val is False:
            action = np.random.choice(NUM_ACTIONS, p=np.nan_to_num(p[0]))
        else:
            action = np.argmax(p[0])
        action_matrix = np.zeros(NUM_ACTIONS)
        action_matrix[action] = 1
        return action, action_matrix, p

    def get_action_continuous(self):
        p = self.actor.predict([
            self.observation.reshape(1, NUM_STATE), DUMMY_VALUE, DUMMY_ACTION
        ])
        if self.val is False:
            action = action_matrix = p[0] + np.random.normal(
                loc=0, scale=NOISE, size=p[0].shape)
        else:
            action = action_matrix = p[0]
        return action, action_matrix, p

    def transform_reward(self):
        for j in range(len(self.reward) - 2, -1, -1):
            self.reward[j] += self.reward[j + 1] * GAMMA

    def get_batch(self):
        """
        Sometimes this rollout exceeds buffer size and thats normal. For example,
        buffer size is 250 but we don't observe any done's until 250. This rollout
        continues until we see a done(either a goal reach or time exceed done)
        This can be altered by counting a variable and checking that variable with
        buffer size.
        """
        batch = [[], [], [], []]

        tmp_batch = [[], [], []]
        done = False
        untransformed_reward = []
        while len(batch[0]) < BUFFER_SIZE:
            if CONTINUOUS is False:
                action, action_matrix, predicted_action = self.get_action()
            else:
                action, action_matrix, predicted_action = self.get_action_continuous()
            observation, reward, done = self.env.step(action)
            untransformed_reward.append(reward)
            if self.gradient_steps % RENDER_EACH == 0:
                self.env.render()
            self.reward.append(reward)
            tmp_batch[0].append(self.observation)
            tmp_batch[1].append(action_matrix)
            tmp_batch[2].append(predicted_action)
            self.observation = observation

            if done:
                self.transform_reward()
                if self.val is False:
                    for i in range(len(tmp_batch[0])):
                        obs, action, pred = (tmp_batch[0][i], tmp_batch[1][i],
                                             tmp_batch[2][i])
                        r = self.reward[i]
                        batch[0].append(obs)
                        batch[1].append(action)
                        batch[2].append(pred)
                        batch[3].append(r)
                tmp_batch = [[], [], []]
                self.reset_env()

        obs, action, pred = np.array(batch[0]), np.array(batch[1]), np.array(batch[2])
        reward = np.reshape(np.array(batch[3]), (len(batch[3]), 1))
        pred = np.reshape(pred, (pred.shape[0], pred.shape[2]))
        return obs, action, pred, reward, untransformed_reward

    def run(self):
        # Note: in PPO we do not train per episode; instead we collect a rollout of K steps and learn from it.
        while self.episode < self.NUM_EPISODE:
            """
            In the original code, these arrays are clipped to BUFFER_SIZE number of elements
            but I found out that this way it performs better so I updated this -Emir
            """
            obs, action, pred, reward, untransformed_reward = self.get_batch()
            old_prediction = pred
            pred_values = self.critic.predict(obs)

            advantage = reward - pred_values

            if not self.testing:
                # advantage = (advantage - advantage.mean()) / advantage.std()
                actor_loss = self.actor.fit([obs, advantage, old_prediction],
                                            [action],
                                            batch_size=BATCH_SIZE,
                                            shuffle=True,
                                            epochs=EPOCHS,
                                            verbose=False)
                critic_loss = self.critic.fit([obs], [reward],
                                              batch_size=BATCH_SIZE,
                                              shuffle=True,
                                              epochs=EPOCHS,
                                              verbose=False)
                print("Gradient Update:", self.gradient_steps, " Reward: ",
                      sum(untransformed_reward))
                self.gradient_steps += 1

        if not self.testing:
            self.save_weights("./weights")

    def save_weights(self, fpath):
        self.actor.save_weights(
            filepath=os.path.join(fpath, "actor_weights.h5"))
        self.critic.save_weights(
            filepath=os.path.join(fpath, "critic_weights.h5"))

    @staticmethod
    def proximal_policy_optimization_loss(advantage, old_prediction):
        def loss(y_true, y_pred):
            prob = y_true * y_pred
            old_prob = y_true * old_prediction
            r = prob / (old_prob + 1e-10)
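            # Clipped surrogate objective: maximize
            #   E[min(r * A, clip(r, 1 - eps, 1 + eps) * A)] + c * entropy,
            # with eps = LOSS_CLIPPING and c = ENTROPY_LOSS; it is negated below
            # because Keras minimizes the loss.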
            return -K.mean(
                K.minimum(
                    r * advantage,
                    K.clip(r,
                           min_value=1 - LOSS_CLIPPING,
                           max_value=1 + LOSS_CLIPPING) * advantage) +
                ENTROPY_LOSS * -(prob * K.log(prob + 1e-10)))

        return loss

    @staticmethod
    def proximal_policy_optimization_loss_continuous(advantage,
                                                     old_prediction):
        def loss(y_true, y_pred):
            var = K.square(NOISE)
            pi = 3.1415926
            denom = K.sqrt(2 * pi * var)
            prob_num = K.exp(-K.square(y_true - y_pred) / (2 * var))
            old_prob_num = K.exp(-K.square(y_true - old_prediction) /
                                 (2 * var))

            prob = prob_num / denom
            old_prob = old_prob_num / denom
            r = prob / (old_prob + 1e-10)
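            # Likelihood ratio of a fixed-variance Gaussian policy (std = NOISE),
            # plugged into the same clipped surrogate as the discrete case but
            # without an entropy bonus.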

            return -K.mean(
                K.minimum(
                    r * advantage,
                    K.clip(r,
                           min_value=1 - LOSS_CLIPPING,
                           max_value=1 + LOSS_CLIPPING) * advantage))

        return loss
Example #7
    os.makedirs("model")
except FileExistsError:
    pass


def update_plot(x, y):
    plt.cla()
    ax.plot(x, y)
    plt.pause(1e-4)
    fig.tight_layout()


EPOCHS = 2500  # maximum number of updates
ENVIRONMENT = "Pong"
if __name__ == "__main__":
    env = PongEnvironment(False)
    num_states = len(env.observe())
    ppo = PPO(env, num_states=num_states, actions=np.arange(3))
    rewards = list()
    steps_count = 0
    eps = 0
    for e in tqdm(range(1, EPOCHS + 1)):
        actions_set, avg_rews, steps, ep_count = ppo.update()
        steps_count += steps
        eps += ep_count
        rewards.append(avg_rews)

        if e % 10 == 0:
            x = range(0, len(rewards), 10)
            update_plot(x, [rewards[i] for i in x])
            print(
Example #8
    def __init__(self, wid):
        self.wid = wid
        self.env = PongEnvironment()
        self.ppo = GLOBAL_PPO
Example #9
class Worker(object):
    def __init__(self, wid):
        self.wid = wid
        self.env = PongEnvironment()
        self.ppo = GLOBAL_PPO

    def work(self):
        global GLOBAL_EP, GLOBAL_RUNNING_R, GLOBAL_UPDATE_COUNTER
        while not COORD.should_stop():
            s = self.env.reset()
            ep_r = 0
            buffer_s, buffer_a, buffer_r = [], [], []
            for t in range(EP_LEN):
                if not ROLLING_EVENT.is_set():  # while global PPO is updating
                    ROLLING_EVENT.wait()  # wait until PPO is updated
                    # clear history buffer, use new policy to collect data
                    buffer_s, buffer_a, buffer_r = [], [], []
                a = self.ppo.choose_action(s)
                s_, r, done = self.env.step(a)
                # if done: r = -10
                buffer_s.append(s)
                buffer_a.append(a)
                buffer_r.append(r)
                s = s_
                ep_r += r

                GLOBAL_UPDATE_COUNTER += 1  # count to minimum batch size, no need to wait other workers
                if t == EP_LEN - 1 or GLOBAL_UPDATE_COUNTER >= MIN_BATCH_SIZE or done:
                    if done:
                        v_s_ = 0  # end of episode
                    else:
                        v_s_ = self.ppo.get_v(s_)

                    discounted_r = []  # compute discounted reward
                    for r in buffer_r[::-1]:
                        v_s_ = r + GAMMA * v_s_
                        discounted_r.append(v_s_)
                    discounted_r.reverse()

                    bs, ba, br = (np.vstack(buffer_s), np.vstack(buffer_a),
                                  np.array(discounted_r)[:, None])
                    buffer_s, buffer_a, buffer_r = [], [], []
                    QUEUE.put(np.hstack((bs, ba, br)))  # put data in the queue
                    if GLOBAL_UPDATE_COUNTER >= MIN_BATCH_SIZE:
                        ROLLING_EVENT.clear()  # stop collecting data
                        UPDATE_EVENT.set()  # globalPPO update

                    if GLOBAL_EP >= EP_MAX:  # stop training
                        COORD.request_stop()
                        break

                    if done: break

            # record reward changes, plot later
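            # GLOBAL_RUNNING_R keeps an exponentially smoothed episode reward
            # (90% previous value, 10% new episode).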
            if len(GLOBAL_RUNNING_R) == 0:
                GLOBAL_RUNNING_R.append(ep_r)
            else:
                GLOBAL_RUNNING_R.append(GLOBAL_RUNNING_R[-1] * 0.9 + ep_r * 0.1)
            GLOBAL_EP += 1
            print(
                '{0:.1f}%'.format(GLOBAL_EP / EP_MAX * 100),
                '|W%i' % self.wid,
                '|Ep_r: %.2f' % ep_r,
            )
Example #10
import matplotlib.pyplot as plt
import gym, threading, queue
import tensorflow as tf  # needed below for tf.Session, tf.placeholder and tf.layers
from Environment import PongEnvironment

EP_MAX = 10000
EP_LEN = 10000
N_WORKER = 8  # parallel workers
GAMMA = 0.99  # reward discount factor
A_LR = 0.0001  # learning rate for actor
C_LR = 0.0001  # learning rate for critic
MIN_BATCH_SIZE = 512  # minimum batch size for updating PPO
UPDATE_STEP = 100  # loop update operation n-steps
EPSILON = 0.2  # for clipping surrogate objective
GAME = 'MountainCar-v0'

env = PongEnvironment()
S_DIM = env.observation_space
A_DIM = env.action_space


class PPONet(object):
    def __init__(self):
        self.sess = tf.Session()
        self.tfs = tf.placeholder(tf.float32, [None, S_DIM], 'state')

        # critic
        w_init = tf.random_normal_initializer(0., .1)
        lc1 = tf.layers.dense(self.tfs,
                              200,
                              tf.nn.relu,
                              kernel_initializer=w_init,