def start(GAME_NAME, MAX_EPISODE):
    env = gym.make(GAME_NAME)  # create environment
    actor = Actor(env.observation_space, env.action_space)  # create actor
    critic = Critic(env.observation_space, env.action_space)  # create critic
    reward_per_epi = []
    durations_per_epi = []
    l_A = []
    l_C = []
    RENDER = False
    MAX_EP_STEPS = 1000
    #DISPLAY_REWARD_THRESHOLD=200

    #print ("begin.\n\n")
    for i_episode in range(MAX_EPISODE):
        s = env.reset()
        critic.reset()
        actor.reset()
        track_r = []
        for t in count():
            if RENDER: env.render()

            a = actor.choose_action(s)

            s_, r, done, info = env.step(a)
            #if done: r = -20             # Penalty if die
            track_r.append(r)

            td_error, abs_error = critic.learn(s, r, s_)  # Critic Learn
            actor.learn(s, a, td_error)  # Actor Learn

            s = s_

            #print ("... in episode (%d) step (%d)" % (i_episode+1,t))
            if is_ipython:
                display.clear_output(wait=True)
                display.display(plt.gcf())

            #env.render()

            if done or t >= MAX_EP_STEPS:  # Episode finished, print results
                ep_rs_sum = sum(track_r)
                #if 'running_reward' not in globals():
                #    running_reward = ep_rs_sum
                #else:
                #    running_reward = running_reward * 0.95 + ep_rs_sum * 0.05
                #if running_reward > DISPLAY_REWARD_THRESHOLD: RENDER = True   # rendering
                running_reward_avg = ep_rs_sum / float(t)
                reward_per_epi.append(ep_rs_sum)
                durations_per_epi.append(t)
                l_A.append(np.mean(actor._loss_))
                l_C.append(np.mean(critic._loss_))
                #print("episode:", i_episode, "  reward:", ep_rs_sum)
                #plot(reward_per_epi, durations_per_epi, l_A, l_C)

                break

    return reward_per_epi, durations_per_epi, l_A, l_C
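The Actor and Critic classes used by start() are not part of this listing (in the original they are small TensorFlow networks exposing reset(), choose_action() and learn()). As a point of reference only, here is a toy illustration of the one-step TD error that the critic's learn(s, r, s_) call is built around; the discount factor and the value table are assumptions, not the original implementation:

import numpy as np

GAMMA = 0.9  # assumed discount factor

def td_error(v, s, r, s_):
    """One-step TD error for a state-value table v: delta = r + GAMMA * V(s') - V(s)."""
    return r + GAMMA * v[s_] - v[s]

# toy usage on a 3-state chain
v = np.zeros(3)
delta = td_error(v, s=0, r=1.0, s_=1)
v[0] += 0.1 * delta  # the critic nudges V(s) toward the TD target r + GAMMA * V(s')
print(delta, v)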
def run():
    # build environment using openai gym
    env = gym.make('MountainCar-v0')
    env = env.unwrapped
    sess = tf.Session()
    # create an actor and critic
    actor = Actor(sess, n_actions=n_actions, n_features=n_features, lr=lr_actor)
    critic = Critic(sess, n_features=n_features, lr=lr_critic)
    # build the two networks
    actor.build_net()
    critic.build_net()

    sess.run(tf.global_variables_initializer())

    # tf.summary.FileWriter("",sess.graph)
    # count steps
    step = 0
    # env.render()
    for episode in range(n_episodes):
        s = env.reset()
        # comment the render() to speed up
        # env.render()
        # gym returns the state as a 1-D vector; add a batch dimension so it becomes a 1 x n_features matrix
        s = s[np.newaxis, :]
        a = actor.choose_action(s)
        while True:
            step += 1
            # a new transition
            s_, r, done, info = env.step(a)
            # add a batch dimension to the next state as well
            s_ = s_[np.newaxis, :]
            a_ = actor.choose_action(s_)
            # critic TD error: r + gamma * V(s_) - V(s)
            td_error = critic.learn(s, r, s_)
            actor.learn(s, a, td_error)
            s = s_
            a = a_  # carry the freshly chosen action forward so env.step() uses it next iteration

            if step % 500 == 0:
                print(step, s_)

            if done:
                print('arrive')
                print(s_)
                break
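run() depends on module-level names (n_actions, n_features, lr_actor, lr_critic, n_episodes) that are defined elsewhere in its original source. A plausible set of values for MountainCar-v0, whose observation is 2-dimensional and whose action space has 3 discrete actions, could look like this; the learning rates and episode budget are assumptions:

import gym

env = gym.make('MountainCar-v0')
n_features = env.observation_space.shape[0]  # 2: car position and velocity
n_actions = env.action_space.n               # 3: push left, no push, push right
lr_actor = 0.001    # assumed actor learning rate
lr_critic = 0.01    # assumed critic learning rate (typically larger than the actor's)
n_episodes = 1000   # assumed episode budget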
Example #3
class NetworkAC(object):
    """docstring for NetworkAC."""
    def __init__(self):
        tf.reset_default_graph()
        self.sess = tf.Session()
        self.actor = Actor(self.sess, \
                        n_features=Config.PLAYER_DIMENSION*(Config.DEFENDER_COUNT+Config.INTRUDER_COUNT), \
                        lr=Config.LEARNING_RATE_START, action_bound=[-math.pi, math.pi])
        self.critic = Critic(self.sess, \
                        n_features=Config.PLAYER_DIMENSION*(Config.DEFENDER_COUNT+Config.INTRUDER_COUNT), \
                        lr=Config.LEARNING_RATE_START)
        self.sess.run(tf.global_variables_initializer())

    def train(self, x, a, y, r):
        td_error = self.critic.learn(x, r, y)  # gradient = grad[r + gamma * V(y_) - V(x_)]
        self.actor.learn(x, a, td_error)  # true_gradient = grad[logPi(s,a) * td_error]

    def predict(self, state):
        action = self.actor.choose_action(state)
        value = self.critic.predict(state)
        return action, value
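A usage sketch for NetworkAC; the environment object, the episode/step budgets, and the gym-style reset()/step() interface are assumptions, while Config and the Actor/Critic classes come from the original project:

net = NetworkAC()

for episode in range(100):
    s = env.reset()[np.newaxis, :]      # add a batch dimension of 1
    for step in range(200):
        a, v = net.predict(s)           # actor samples an action, critic scores the state
        s_, r, done, _ = env.step(a)
        s_ = s_[np.newaxis, :]
        net.train(s, a, s_, r)          # critic TD update, then actor policy-gradient step
        s = s_
        if done:
            break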
Example #4
def start_p(GAME_NAME, BATCH_SIZE=32, MEMORY_CAPACITY=50000):
    env = gym.make(GAME_NAME)
    actor = Actor(env.observation_space, env.action_space)
    critic = Critic(env.observation_space, env.action_space)
    reward_per_epi = []
    durations_per_epi = []
    l_A = []
    l_C = []

    MAX_EPISODE = 200
    RENDER = False
    MAX_EP_STEPS = 1000
    DISPLAY_REWARD_THRESHOLD = 200
    replay_memory = SumTreeMemoryBuffer(MEMORY_CAPACITY)

    #print "begin.\n\n"
    for i_episode in range(MAX_EPISODE):
        s = env.reset()
        track_r = []
        critic._v_ = []  # reset the critic's internal buffer
        actor._loss_ = []  # reset the actor's loss buffer
        for t in count():
            if RENDER: env.render()

            a = actor.choose_action(s)

            s_, r, done, info = env.step(a)

            ##if done: r = -20    #  Penalty if die

            track_r.append(r)

            # ACER: Actor-Critic with Experience Replay
            if not done:
                transition = np.hstack((s, a, r, s_))
                replay_memory.save(transition)  # Save non-final transition

            #print len(replay_memory)
            #print replay_memory.data
            #print replay_memory.gettree
            if len(replay_memory) >= BATCH_SIZE:  # enough stored transitions to fill a batch
                tree_idx, batch, ISWeights = replay_memory.sample(BATCH_SIZE)  # sample from memory
                # note: only the last sampled row is used here (indices assume an 8-dimensional state)
                s_b = np.asarray(batch[-1, 0:8])      # state
                s_b_n = np.asarray(batch[-1, 10:18])  # next state
                a_b = np.asarray(batch[-1, 8])        # action
                r_b = np.asarray(batch[-1, 9])        # reward

                # print("tree_idx:   " + str(tree_idx))
                #print(ISWeights)

                td_error, abs_error = critic.learn(s_b, r_b, s_b_n, ISWeights)  # Critic learns with importance-sampling weights
                replay_memory.batch_update(tree_idx, abs_error)  # update the tree priorities with the new |TD error|
                actor.learn(s_b, a_b, td_error)  # Actor learns from the TD error
                # print("td_error:     " + str(td_error))
                print("abs_error:   " + str(abs_error))

            s = s_

            # print "... in episode (%d) step (%d)" % (i_episode+1,t)
            if is_ipython:
                display.clear_output(wait=True)
                display.display(plt.gcf())
            #env.render()

            if done or t >= MAX_EP_STEPS:
                ep_rs_sum = sum(track_r)
                # if 'running_reward' not in globals():
                #     running_reward = ep_rs_sum
                # else:
                #     running_reward = running_reward * 0.95 + ep_rs_sum * 0.05
                #  if running_reward > DISPLAY_REWARD_THRESHOLD: RENDER = True  # rendering
                running_reward_avg = ep_rs_sum / float(t)
                reward_per_epi.append(ep_rs_sum)
                durations_per_epi.append(running_reward_avg)  # store the per-step average reward here instead of the duration
                l_A.append(np.mean(actor._loss_))
                l_C.append(np.mean(critic._loss_))
                # print("episode:", i_episode, "  reward:", ep_rs_sum)
                #plot(reward_per_epi, durations_per_epi, l_A, l_C)

                break

    return reward_per_epi, durations_per_epi, l_A, l_C
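start_p assumes a SumTreeMemoryBuffer exposing save(), sample() and batch_update(), i.e. proportional prioritized experience replay. The sketch below is a simplified stand-in with the same interface: it keeps priorities in a flat array instead of a sum tree (O(n) sampling rather than O(log n)), and the alpha/beta/epsilon values are assumptions:

import numpy as np

class SimplePrioritizedMemory:
    """Stand-in for SumTreeMemoryBuffer: proportional prioritized replay with a flat priority array."""
    def __init__(self, capacity, alpha=0.6, beta=0.4, eps=1e-4):
        self.capacity = capacity
        self.data = np.zeros(capacity, dtype=object)
        self.priorities = np.zeros(capacity)
        self.alpha, self.beta, self.eps = alpha, beta, eps
        self.size = 0
        self.ptr = 0

    def __len__(self):
        return self.size

    def save(self, transition):
        # new transitions get the current maximum priority so they are replayed at least once
        max_p = self.priorities[:self.size].max() if self.size else 1.0
        self.data[self.ptr] = transition
        self.priorities[self.ptr] = max_p
        self.ptr = (self.ptr + 1) % self.capacity
        self.size = min(self.size + 1, self.capacity)

    def sample(self, batch_size):
        # sample indices with probability proportional to priority ** alpha
        p = self.priorities[:self.size] ** self.alpha
        p /= p.sum()
        idx = np.random.choice(self.size, batch_size, p=p)
        batch = np.vstack([self.data[i] for i in idx])
        # importance-sampling weights, normalised by their maximum
        w = (self.size * p[idx]) ** (-self.beta)
        return idx, batch, (w / w.max()).reshape(-1, 1)

    def batch_update(self, idx, abs_errors):
        # priority follows the absolute TD error (plus a small epsilon to keep it non-zero)
        self.priorities[idx] = np.ravel(np.abs(abs_errors)) + self.eps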
Example #5
def start_er(GAME_NAME, BATCH_SIZE=32, MEMORY_CAPACITY=50000):
    #print ("make enviornment")
    env = gym.make(GAME_NAME)
    #print ("create actor, critic")
    actor = Actor(env.observation_space, env.action_space)
    critic = Critic(env.observation_space, env.action_space)
    reward_per_epi=[]
    durations_per_epi=[]
    l_A=[]
    l_C=[]

    MAX_EPISODE = 500
    RENDER = False
    MAX_EP_STEPS = 1000
    DISPLAY_REWARD_THRESHOLD = 200
    replay_memory = ReplayMemory(MEMORY_CAPACITY)

    #print ("begin.\n")
    for i_episode in range(MAX_EPISODE):
        s = env.reset()
        track_r = []
        critic._v_ = []  # reset the critic's internal buffer
        actor._loss_ = []  # reset the actor's loss buffer
        for t in count():
            if RENDER: env.render()

            a = actor.choose_action(s)

            s_, r, done, info = env.step(a)

            ##if done: r = -20    #  Penalty if die

            track_r.append(r)

            # ACER: learn from replayed experience
            if not done:
                replay_memory.save(s, a, r, s_)   # Save non-final transition into memory

            if len(replay_memory) >= BATCH_SIZE:

                transitions = replay_memory.sample(BATCH_SIZE)   # Sample from memory for training
                batch = Transition(*zip(*transitions))

                s_b = np.asarray(batch.state)
                s_b_n = np.asarray(batch.next_state)
                a_b = np.asarray(batch.action).reshape(BATCH_SIZE, 1)
                r_b = np.asarray(batch.reward).reshape(BATCH_SIZE, 1)

                td_error, abs_error  = critic.learn(s_b, r_b, s_b_n)    # Critic Learn
                actor.learn(s_b, a_b, td_error)       # Actor Learn

            s = s_

            ##print ("... in episode (%d) step (%d)" % (i_episode+1,t))
            if is_ipython:
                display.clear_output(wait=True)
                display.display(plt.gcf())
            #env.render()

            if done or t >= MAX_EP_STEPS:   # Episode finished, print results
                ep_rs_sum = sum(track_r)
                #if 'running_reward' not in globals():
                #    running_reward = ep_rs_sum
                #else:
                #    running_reward = running_reward * 0.95 + ep_rs_sum * 0.05
                #if running_reward > DISPLAY_REWARD_THRESHOLD: RENDER = True   # rendering
                running_reward_avg = ep_rs_sum/float(t)
                reward_per_epi.append(ep_rs_sum)
                durations_per_epi.append(t)
                l_A.append(np.mean(actor._loss_))
                l_C.append(np.mean(critic._loss_))
                #print("episode:", i_episode, "  reward:", ep_rs_sum)
                #plot(reward_per_epi, durations_per_epi, l_A, l_C)

                break

    return reward_per_epi, durations_per_epi, l_A, l_C
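start_er relies on a ReplayMemory class and a Transition namedtuple that are not shown here; a minimal sketch consistent with how they are used above (a fixed-size buffer with uniform random sampling):

import random
from collections import namedtuple, deque

Transition = namedtuple('Transition', ('state', 'action', 'reward', 'next_state'))

class ReplayMemory:
    """Fixed-size ring buffer of transitions with uniform random sampling."""
    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)

    def save(self, state, action, reward, next_state):
        self.buffer.append(Transition(state, action, reward, next_state))

    def sample(self, batch_size):
        return random.sample(self.buffer, batch_size)

    def __len__(self):
        return len(self.buffer)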
Example #6
def start_er(GAME_NAME, BATCH_SIZE=32, MEMORY_CAPACITY=50000):
    #print ("make environment")
    env = gym.make(GAME_NAME)
    #print ("create actor, critic")
    actor = Actor(env.observation_space, env.action_space)
    critic = Critic(env.observation_space, env.action_space)
    reward_per_epi = []
    durations_per_epi = []
    l_A = []
    l_C = []

    MAX_EPISODE = 200
    RENDER = False
    MAX_EP_STEPS = 1000
    DISPLAY_REWARD_THRESHOLD = 200
    replay_memory_1 = ReplayMemory(MEMORY_CAPACITY)
    replay_memory_2 = ReplayMemory(MEMORY_CAPACITY)
    f_1 = BATCH_SIZE // 2  # split the batch evenly between the two buckets (integer division)
    f_2 = BATCH_SIZE // 2

    #print ("begin.\n")
    for i_episode in range(MAX_EPISODE):
        s = env.reset()
        track_r = []
        critic._v_ = []
        actor._loss_ = []
        for t in count():
            if RENDER: env.render()

            a = actor.choose_action(s)

            s_, r, done, info = env.step(a)

            track_r.append(r)

            if not done:
                # Save the non-final transition into the bucket that matches its reward sign
                if r > 0:
                    replay_memory_1.save(s, a, r, s_)
                else:
                    replay_memory_2.save(s, a, r, s_)

            # learn from memory
            if len(replay_memory_1) >= f_1 and len(replay_memory_2) >= f_2:  # both buckets hold at least half a batch
                transitions_1 = replay_memory_1.sample(f_1)  # sample half a batch from each bucket
                batch1 = Transition(*zip(*transitions_1))
                transitions_2 = replay_memory_2.sample(f_2)
                batch2 = Transition(*zip(*transitions_2))

                s_b = np.append(np.asarray(batch1.state),
                                np.asarray(batch2.state),
                                axis=0)
                s_b_n = np.append(np.asarray(batch1.next_state),
                                  np.asarray(batch2.next_state),
                                  axis=0)
                a_b = np.append(np.asarray(batch1.action).reshape(f_1, 1),
                                np.asarray(batch2.action).reshape(f_2, 1),
                                axis=0)
                r_b = np.append(np.asarray(batch1.reward).reshape(f_1, 1),
                                np.asarray(batch2.reward).reshape(f_2, 1),
                                axis=0)

                td_error, abs_error = critic.learn(s_b, r_b, s_b_n)  # Critic Learn
                actor.learn(s_b, a_b, td_error)  # Actor Learn

            s = s_

            ##print ("... in episode (%d) step (%d)" % (i_episode+1,t))
            if is_ipython:
                display.clear_output(wait=True)
                display.display(plt.gcf())
            #env.render()

            if done or t >= MAX_EP_STEPS:  # Episode finished, print results
                ep_rs_sum = sum(track_r)
                #if 'running_reward' not in globals():
                #    running_reward = ep_rs_sum
                #else:
                #    running_reward = running_reward * 0.95 + ep_rs_sum * 0.05
                #if running_reward > DISPLAY_REWARD_THRESHOLD: RENDER = True   # rendering
                running_reward_avg = ep_rs_sum / float(t)
                reward_per_epi.append(ep_rs_sum)
                durations_per_epi.append(t)
                l_A.append(np.mean(actor._loss_))
                l_C.append(np.mean(critic._loss_))
                #print("episode:", i_episode, "  reward:", ep_rs_sum)
                #plot(reward_per_epi, durations_per_epi, l_A, l_C)

                break

    return reward_per_epi, durations_per_epi, l_A, l_C
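A possible way to run the two-bucket variant and plot its curves; the environment name and the plotting layout are illustrative only:

import matplotlib.pyplot as plt

rewards, durations, actor_loss, critic_loss = start_er('CartPole-v0', BATCH_SIZE=32, MEMORY_CAPACITY=50000)

fig, axes = plt.subplots(2, 2, figsize=(10, 6))
for ax, curve, title in zip(axes.flat,
                            (rewards, durations, actor_loss, critic_loss),
                            ('reward per episode', 'episode duration', 'actor loss', 'critic loss')):
    ax.plot(curve)
    ax.set_title(title)
plt.tight_layout()
plt.show()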
Example #7
        track_r = []

        while True:
            if RENDER: env.render()

            a = actor.choose_action(s)

            s_, r, done, info = env.step(a)

            if done: r = -20  # extra penalty when the episode terminates

            track_r.append(r)

            td_error = critic.learn(s, r, s_)

            actor.learn(s, a, td_error)

            s = s_
            t += 1

            if done or t >= MAX_EP_STEPS:
                ep_rs_sum = sum(track_r)

                if 'running_reward' not in globals():
                    running_reward = ep_rs_sum
                else:
                    running_reward = running_reward * 0.95 + ep_rs_sum * 0.05
                if running_reward > DISPLAY_REWARD_THRESHOLD:
                    RENDER = True  # rendering
                print("episode:", epoch, "  reward:", int(ep_rs_sum))
                break
Example #8
    s = env.reset()
    t = 0
    track_r = []
    while True:
        if RENDER: env.render()

        a = actor.choose_action(s)
        s_,r,done,info = env.step(a)

        if done: r = -20  # extra penalty when the episode terminates

        track_r.append(r)

        td_error = critic.learn(s,r,s_)
        actor.learn(s,a,td_error)

        s = s_
        t += 1

        if done or t >= MAX_EP_STEPS:
            ep_rs_sum = sum(track_r)

            if 'running_reward' not in globals():
                running_reward = ep_rs_sum
            else:
                running_reward = running_reward * 0.95 + ep_rs_sum * 0.05
            if running_reward > DISPLAY_REWARD_THRESHOLD: RENDER = True  # rendering
            print("episode:", i_episode, "  reward:", int(running_reward))
            break
Example #9
class DDPG(object):
    def __init__(self, a_dim, s_dim):
        self.sess = tf.Session()
        self.a_dim, self.s_dim = a_dim, s_dim
        self.LR_A = 0.001
        self.LR_C = 0.001
        self.CAPACITY = 10000
        self.BATCH_SIZE = 32
        self.BATCH_SIZE_g = 24
        self.SETTING = {
            'GAMMA': 0.9,   # discount factor
            'TAU': 0.01,    # soft target-network update rate
            'N_D_MAX': 1 / np.sqrt(self.s_dim),   # presumably weight-init bounds for hidden layers
            'N_D_MIN': -1 / np.sqrt(self.s_dim),
            'F_N_D_MAX': 3e-3,                    # presumably weight-init bounds for the output layer
            'F_N_D_MIN': -3e-3,
            'L2_DECAY': 0.01,                     # L2 regularisation strength
        }

        self.S = tf.placeholder(tf.float32,
                                shape=[None, self.s_dim],
                                name='State')
        self.S_ = tf.placeholder(tf.float32,
                                 shape=[None, self.s_dim],
                                 name='State_')
        self.R = tf.placeholder(tf.float32, shape=[None, 1], name='Reward')

        self.actor = Actor(self.sess, self.a_dim, self.s_dim, self.LR_A,
                           self.SETTING, self.S, self.S_)
        self.critic = Critic(self.sess, self.a_dim, self.s_dim, self.LR_C,
                             self.SETTING, self.S, self.S_, self.R,
                             self.actor.action, self.actor.action_)
        self.actor.add_grad_to_graph(self.critic.a_grads)

        self.memory = Memory(self.CAPACITY, s_dim * 2 + a_dim + 1,
                             self.BATCH_SIZE)
        # self.memory_g = Memory(self.CAPACITY, s_dim * 2 + a_dim + 1, self.BATCH_SIZE_g)

        self.sess.run(tf.global_variables_initializer())

        tf.summary.FileWriter('logs/', self.sess.graph)

    def store_transition(self, state, action, reward, state_):
        self.memory.store_transition(state, action, reward, state_)

    # def store_transition_g(self, state, action, reward, state_):
    #     self.memory_g.store_transition(state, action, reward, state_)

    def learn(self):
        if self.memory.pointer > self.memory.capacity:  # start learning only after the buffer has filled once
            bt = self.memory.sample()
            bs = bt[:, :self.s_dim]                           # states
            ba = bt[:, self.s_dim:self.s_dim + self.a_dim]    # actions
            br = bt[:, -self.s_dim - 1:-self.s_dim]           # rewards
            bs_ = bt[:, -self.s_dim:]                         # next states
            self.critic.learn(bs, ba, br, bs_)
            self.actor.learn(bs)

            # bt = self.memory_g.sample()
            # bs = bt[:, :self.s_dim]
            # ba = bt[:, self.s_dim:self.s_dim + self.a_dim]
            # br = bt[:, -self.s_dim - 1:-self.s_dim]
            # bs_ = bt[:, -self.s_dim:]
            # self.critic.learn(bs, ba, br, bs_)
            # self.actor.learn(bs)

    def save(self):
        saver = tf.train.Saver()
        saver.save(self.sess, './params/params', write_meta_graph=False)

    def load(self):
        saver = tf.train.Saver()
        saver.restore(self.sess, './params/params')
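A hedged training-loop sketch for the DDPG class; Pendulum-v0 is just an example of a continuous-control task, and the actor.choose_action() call and reward scaling are assumptions borrowed from the style of the other examples in this listing:

import gym

env = gym.make('Pendulum-v0')
s_dim = env.observation_space.shape[0]   # 3 for Pendulum-v0
a_dim = env.action_space.shape[0]        # 1 continuous action

agent = DDPG(a_dim, s_dim)

for episode in range(200):
    s = env.reset()
    for step in range(200):
        a = agent.actor.choose_action(s)          # assumed Actor API
        s_, r, done, _ = env.step(a)
        agent.store_transition(s, a, r / 10, s_)  # reward scaling is a common Pendulum trick, not from the source
        agent.learn()                             # only trains once the replay buffer has filled
        s = s_
        if done:
            break

agent.save()  # persist the learned parameters to ./params/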